lib/spidey/abstract_spider.rb in spidey-0.0.4 vs lib/spidey/abstract_spider.rb in spidey-0.1.0
- old
+ new
@@ -1,29 +1,27 @@
# encoding: utf-8
require 'mechanize'
module Spidey
class AbstractSpider
- attr_accessor :urls, :handlers, :results, :request_interval, :verbose, :errors
+ attr_accessor :urls, :handlers, :results, :request_interval, :errors
DEFAULT_REQUEST_INTERVAL = 3 # seconds
# Class-level registration of a start URL and the handler that processes it.
#   url:          the URL to queue; appended to the class's start_urls list
#   handler:      stored as the first element of the pair kept in handlers[url]
#   default_data: stored as the second element of that pair (default: {})
# Registering the same url again appends it to start_urls a second time and
# overwrites the existing handlers[url] entry.
def self.handle(url, handler, default_data = {})
start_urls << url
handlers[url] = [handler, default_data]
end
# Accepts:
# request_interval: number of seconds to wait between requests (default: 3)
- # verbose: prints debugging and progress information if true
def initialize(attrs = {})
# Each instance starts with its own empty URL list, handler table, and result list.
@urls = []
@handlers = {}
@results = []
# Pass every class-level start URL, along with its registered [handler, default_data]
# pair, to the instance-level `handle` (defined outside this view).
self.class.start_urls.each { |url| handle url, *self.class.handlers[url] }
# Falls back to DEFAULT_REQUEST_INTERVAL when :request_interval is absent (nil) or false.
@request_interval = attrs[:request_interval] || DEFAULT_REQUEST_INTERVAL
- @verbose = !!attrs[:verbose]
end
# Iterates through URLs queued for handling, including any that are added in the course of crawling. Accepts:
# max_urls: maximum number of URLs to crawl before returning (optional)
def crawl(options = {})
@@ -31,11 +29,11 @@
i = 0
each_url do |url, handler, default_data|
break if options[:max_urls] && i >= options[:max_urls]
begin
page = agent.get(url)
- $stderr.puts "Handling #{url.inspect}" if verbose
+ Spidey.logger.info "Handling #{url.inspect}"
send handler, page, default_data
rescue => ex
add_error url: url, handler: handler, error: ex
end
sleep request_interval if request_interval > 0
@@ -58,21 +56,21 @@
# Yields (url, handler, default_data) for every entry in `urls`.
# The handler and default data are read as the first and last elements of handlers[url].
# The &block parameter is declared but not referenced; the method uses `yield` instead.
def each_url(&block)
urls.each do |url|
yield url, handlers[url].first, handlers[url].last
end
end
-
+
# Override this for custom result storage.
def record(data)
# Append the item to the in-memory results list, then log its inspected form.
results << data
- $stderr.puts "Recording #{data.inspect}" if verbose
+ Spidey.logger.info "Recording #{data.inspect}"
end
-
+
# Override this for custom error-handling.
def add_error(attrs)
# attrs is a Hash; :url and :error are read below for the log message
# (crawl passes url:, handler:, and error:).
# NOTE(review): @errors is not assigned in initialize; presumably crawl resets it
# to an array before iterating — confirm, otherwise this append fails on nil.
@errors << attrs
- $stderr.puts "Error on #{attrs[:url]}. #{attrs[:error].class}: #{attrs[:error].message}" if verbose
+ Spidey.logger.error "Error on #{attrs[:url]}. #{attrs[:error].class}: #{attrs[:error].message}"
end
# Resolves href relative to page by delegating to `agent.agent.resolve`, and returns
# the result converted with `to_s`.
# NOTE(review): `agent` is defined outside this view — presumably the Mechanize
# instance used by crawl; confirm.
def resolve_url(href, page)
agent.agent.resolve(href, page).to_s
end
@@ -96,7 +94,7 @@
# Class-level handler table, created as an empty Hash on first access.
# Entries are written by the class-level `handle` as url => [handler, default_data].
# This @handlers belongs to the class object and is separate from the per-instance
# @handlers set in initialize.
def self.handlers
@handlers ||= {}
end
end
-
+
end