Sha256: 2864e6ac2beb12fa6e21118fe6f0231e9180737198d98a0d5e37c728061dcb47

Size: 1.44 KB

Versions: 5

Stored size: 1.44 KB

Contents

#!/usr/bin/env ruby
require 'rubygems'
require 'monkeyshines'
require 'monkeyshines/runner'
require 'monkeyshines/recursive_runner'
require 'feedzirra'

WORK_DIR = Subdir[__FILE__,'work'].expand_path
puts WORK_DIR

#
# Set up the scrape:
#
# * Jobs stream from an edamame job queue.
# * Many jobs generate paginated requests, stopping when a response overlaps
#   the prev_max item (see the sketch below).
# * Each request is fetched with the standard HTTP fetcher.
# * Low-generation jobs are rescheduled based on the observed item rate.
# * Jobs can spawn recursive requests; these have their request_generation
#   incremented.
# * Results are sent to a ChunkedFlatFileStore.
#
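
# A minimal sketch of the pagination rule above, with hypothetical names
# (fetch_page, item[:id], job.prev_max) standing in for the real
# Monkeyshines request classes: pull pages until a response overlaps the
# prev_max high-water mark recorded on the previous run.
def each_item_since_prev_max(job)
  page = 1
  loop do
    items = fetch_page(job, page)                     # hypothetical fetcher
    fresh = items.take_while{|item| item[:id] > job.prev_max }
    fresh.each{|item| yield item }
    # An empty or partially-fresh page means we have reached seen territory.
    break if fresh.empty? || (fresh.length < items.length)
    page += 1
  end
end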

#
# Create the runner: :source pulls requests from the edamame queue
# (BeanstalkQueue on localhost:11210, with job state in a TyrantStore on
# port 11211); :dest is a conditional store, where a cache on port 11212
# screens out already-seen results before they reach the flat-file store
# under WORK_DIR. A sketch of that conditional check follows the
# configuration hash.
#
scraper = Monkeyshines::Runner.new({
    :log     => { :iters => 100, :dest => Monkeyshines::CONFIG[:handle] },
    :source  => { :type  => Monkeyshines::RequestStream::KlassHashRequestStream,
      :store => { :type => Monkeyshines::RequestStream::EdamameQueue,
        :queue => { :uris => ['localhost:11210'], :type => 'BeanstalkQueue', },
        :store => { :uri =>            ':11211',  :type => 'TyrantStore',    }, }, },
    :dest    => { :type  => :conditional_store,
      :cache => { :uri =>              ':11212', },
      :store => { :rootdir => WORK_DIR },},
    # :fetcher => { :type => :fake_fetcher },
    :force_fetch => false,
    :sleep_time  => 0.2,
  })
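
# A rough illustration of the :conditional_store destination configured
# above, using hypothetical method names (get/set/save) and assuming a
# memcache-style cache API: a result reaches the backing store only when
# its key has not been seen before.
def store_if_unseen(cache, store, key, result)
  return false if cache.get(key)   # key already cached: skip the write
  cache.set(key, 1)                # mark the key as seen
  store.save(result)               # persist only novel results
  true
end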

# Execute the scrape
loop do
  puts Time.now
  scraper.run
end
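
# Note: judging from the configuration above, running this example expects
# a beanstalkd server on localhost:11210 and key-value stores answering on
# ports 11211 (the TyrantStore) and 11212 (the dedup cache).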

Version data entries

5 entries across 5 versions & 1 rubygem

Version             Path
monkeyshines-0.2.3  examples/rss_feeds/scrape_rss_feeds.rb
monkeyshines-0.2.2  examples/rss_feeds/scrape_rss_feeds.rb
monkeyshines-0.2.1  examples/rss_feeds/scrape_rss_feeds.rb
monkeyshines-0.2.0  examples/rss_feeds/scrape_rss_feeds.rb
monkeyshines-0.0.2  examples/rss_feeds/scrape_rss_feeds.rb