#!/usr/bin/env ruby
require 'rubygems'
require 'monkeyshines'
require 'monkeyshines/runner'
require 'pathname'
require 'yaml'      # YAML is used below; do not rely on a transitive require
require 'fileutils' # used to create the log directory on demand

require 'wuclan/twitter'
# un-namespace request classes.
include Wuclan::Twitter::Scrape

Monkeyshines::WORK_DIR = '/tmp'
# Resolve symlinks so every derived path (logs, output chunks) is canonical.
WORK_DIR = Pathname.new(Monkeyshines::WORK_DIR).realpath.to_s

# ===========================================================================
#
# Scrape driver: reads scrape requests from a flat file and hands them to a
# Monkeyshines::Runner, which writes results to chunked flat files under
# --dest-dir, gated by a cache store at --cache-uri.
#
# To scrape from a list of requests:
#
#   <this_script> --from=request_urls.tsv
#
# To also redirect console output to a log file under WORK_DIR/log:
#
#   <this_script> --from=request_urls.tsv --log
#
# NOTE(review): Trollop is not required in this file; presumably it is loaded
# by monkeyshines/runner -- confirm before running standalone.
#
opts = Trollop::options do
  opt :log,          "Log to file instead of STDERR"
  # input from file
  opt :from,         "URI for scrape store to load from", :type => String
  opt :skip,         "Initial lines to skip",             :type => Integer
  # output storage
  opt :cache_uri,    "URI for cache server",              :type => String,  :default => ':1978'
  opt :chunk_time,   "Frequency to rotate chunk files (in seconds)", :type => Integer, :default => 60*60*4
  opt :dest_dir,     "Filename base to store output. default #{WORK_DIR}/ripd", :default => WORK_DIR+'/ripd'
  opt :dest_pattern, "Pattern for dump file output",      :default => ":dest_dir/:date/:handle+:timestamp-:pid.tsv"
  opt :into,         "URI for scrape store into",         :type => String
end
opts[:handle] ||= 'com.twitter'

# Load the per-user settings. YAML.load_file closes the file handle itself;
# an empty file parses to a non-Hash value, so fall back to an empty Hash.
# Keys present in the config file override same-named command-line options.
scrape_config = YAML.load_file(File.join(ENV['HOME'], '.monkeyshines')) || {}
opts.merge! scrape_config

# ******************** Log ********************
if opts[:log]
  # Name the log after the input file; fall back to the handle when --from
  # was not given, so that --log alone does not raise a TypeError.
  log_source = opts[:from] || opts[:handle]
  opts[:log] = File.join(WORK_DIR, 'log', File.basename(log_source, '.tsv'))
  # Opening in append mode fails if the directory is missing, so create it.
  FileUtils.mkdir_p(File.dirname(opts[:log]))
  # Deliberately left open: it replaces stdout/stderr for the process lifetime.
  $stdout = $stderr = File.open(opts[:log]+"-console.log", "a")
end

#
# Execute the scrape
#
scraper = Monkeyshines::Runner.new(
  :dest_store => {
    :type  => :conditional_store,
    :cache => { :type => :tyrant_rdb_key_store, :uri => opts[:cache_uri] },
    :store => opts.merge({ :type => :chunked_flat_file_store }),
  },
  # Alternative single-file destination, kept for reference:
  # :store => { :type => :flat_file_store, :filename => opts[:into] }, },
  :request_stream => {
    :type  => :base,
    :klass => Monkeyshines::ScrapeRequest,
    :store => { :type => :flat_file_store, :filemode => 'r', :filename => opts[:from] },
  }
)
scraper.run