lib/spidey/abstract_spider.rb in spidey-0.0.4 vs lib/spidey/abstract_spider.rb in spidey-0.1.0

- old
+ new

@@ -1,29 +1,27 @@ # encoding: utf-8 require 'mechanize' module Spidey class AbstractSpider - attr_accessor :urls, :handlers, :results, :request_interval, :verbose, :errors + attr_accessor :urls, :handlers, :results, :request_interval, :errors DEFAULT_REQUEST_INTERVAL = 3 # seconds def self.handle(url, handler, default_data = {}) start_urls << url handlers[url] = [handler, default_data] end # Accepts: # request_interval: number of seconds to wait between requests (default: 3) - # verbose: prints debugging and progress information if true def initialize(attrs = {}) @urls = [] @handlers = {} @results = [] self.class.start_urls.each { |url| handle url, *self.class.handlers[url] } @request_interval = attrs[:request_interval] || DEFAULT_REQUEST_INTERVAL - @verbose = !!attrs[:verbose] end # Iterates through URLs queued for handling, including any that are added in the course of crawling. Accepts: # max_urls: maximum number of URLs to crawl before returning (optional) def crawl(options = {}) @@ -31,11 +29,11 @@ i = 0 each_url do |url, handler, default_data| break if options[:max_urls] && i >= options[:max_urls] begin page = agent.get(url) - $stderr.puts "Handling #{url.inspect}" if verbose + Spidey.logger.info "Handling #{url.inspect}" send handler, page, default_data rescue => ex add_error url: url, handler: handler, error: ex end sleep request_interval if request_interval > 0 @@ -58,21 +56,21 @@ def each_url(&block) urls.each do |url| yield url, handlers[url].first, handlers[url].last end end - + # Override this for custom result storage. def record(data) results << data - $stderr.puts "Recording #{data.inspect}" if verbose + Spidey.logger.info "Recording #{data.inspect}" end - + # Override this for custom error-handling. def add_error(attrs) @errors << attrs - $stderr.puts "Error on #{attrs[:url]}. #{attrs[:error].class}: #{attrs[:error].message}" if verbose + Spidey.logger.error "Error on #{attrs[:url]}. #{attrs[:error].class}: #{attrs[:error].message}" end def resolve_url(href, page) agent.agent.resolve(href, page).to_s end @@ -96,7 +94,7 @@ def self.handlers @handlers ||= {} end end - + end