lib/spidr/agent.rb in spidr-0.4.1 vs lib/spidr/agent.rb in spidr-0.5.0
- old
+ new
@@ -1,61 +1,85 @@
-require 'spidr/sanitizers'
-require 'spidr/filters'
-require 'spidr/events'
-require 'spidr/actions'
+require 'spidr/agent/sanitizers'
+require 'spidr/agent/filters'
+require 'spidr/agent/events'
+require 'spidr/agent/actions'
require 'spidr/page'
require 'spidr/session_cache'
require 'spidr/cookie_jar'
require 'spidr/auth_store'
require 'spidr/spidr'
require 'openssl'
require 'net/http'
require 'set'
+begin
+ require 'robots'
+rescue LoadError
+end
+
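Robots.txt support in 0.5.0 relies on the optional `robots` gem, loaded here only if it is available; a missing gem is tolerated until the `:robots` option is actually requested. A minimal opt-in sketch for an application's Gemfile (version constraints are illustrative):

  gem 'spidr',  '~> 0.5'
  gem 'robots'  # optional: enables the :robots option introduced below
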
module Spidr
class Agent
- include Sanitizers
- include Filters
- include Events
- include Actions
-
# HTTP Host Header to use
+ #
+ # @return [String]
attr_accessor :host_header
# HTTP Host Headers to use for specific hosts
+ #
+ # @return [Hash{String,Regexp => String}]
attr_reader :host_headers
# User-Agent to use
+ #
+ # @return [String]
attr_accessor :user_agent
# HTTP Authentication credentials
+ #
+ # @return [AuthStore]
attr_accessor :authorized
# Referer to use
+ #
+ # @return [String]
attr_accessor :referer
# Delay in between fetching pages
+ #
+ # @return [Integer]
attr_accessor :delay
# History containing visited URLs
+ #
+ # @return [Set<URI::HTTP>]
attr_reader :history
# List of unreachable URLs
+ #
+ # @return [Set<URI::HTTP>]
attr_reader :failures
# Queue of URLs to visit
+ #
+ # @return [Array<URI::HTTP>]
attr_reader :queue
# Cached cookies
+ #
+ # @return [CookieJar]
attr_reader :cookies
# Maximum depth
+ #
+ # @return [Integer]
attr_reader :max_depth
# The visited URLs and their depth within a site
+ #
+ # @return [Hash{URI::HTTP => Integer}]
attr_reader :levels
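The readers above expose the spider's state, so a crawl can be inspected after (or while) it runs. A small sketch, with example.com as a placeholder URL:

  agent = Spidr::Agent.start_at('http://example.com/')

  agent.history.each { |url| puts url }  # Set of visited URI::HTTP objects
  agent.failures.size                    # count of unreachable URLs
  agent.queue                            # URLs still waiting to be visited
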
#
# Creates a new Agent object.
#
@@ -99,41 +123,57 @@
# The initial list of visited URLs.
#
# @option options [Integer] :max_depth
# The maximum link depth to follow.
#
+ # @option options [Boolean] :robots (Spidr.robots?)
+ # Specifies whether `robots.txt` should be honored.
+ #
# @yield [agent]
# If a block is given, it will be passed the newly created agent
# for further configuration.
#
# @yieldparam [Agent] agent
# The newly created agent.
#
+ # @see #initialize_sanitizers
+ # @see #initialize_filters
+ # @see #initialize_actions
+ # @see #initialize_events
+ #
def initialize(options={})
- @host_header = options[:host_header]
+ @host_header  = options[:host_header]
@host_headers = {}
if options[:host_headers]
@host_headers.merge!(options[:host_headers])
end
@user_agent = options.fetch(:user_agent,Spidr.user_agent)
- @referer = options[:referer]
+ @referer    = options[:referer]
- @sessions = SessionCache.new(options.fetch(:proxy,Spidr.proxy))
- @cookies = CookieJar.new
+ @sessions   = SessionCache.new(options.fetch(:proxy,Spidr.proxy))
+ @cookies    = CookieJar.new
@authorized = AuthStore.new
- @running = false
- @delay = options.fetch(:delay,0)
- @history = Set[]
+ @running  = false
+ @delay    = options.fetch(:delay,0)
+ @history  = Set[]
@failures = Set[]
- @queue = []
+ @queue    = []
- @levels = Hash.new(0)
+ @levels    = Hash.new(0)
@max_depth = options[:max_depth]
+ if options.fetch(:robots,Spidr.robots?)
+ unless Object.const_defined?(:Robots)
+ raise(ArgumentError,":robots option given but unable to require 'robots' gem")
+ end
+
+ @robots = Robots.new(@user_agent)
+ end
+
initialize_sanitizers(options)
initialize_filters(options)
initialize_actions(options)
initialize_events(options)
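The new `:robots` option defaults to `Spidr.robots?` and, when enabled, wraps the configured user agent in a `Robots` instance; if the gem cannot be loaded, the constructor raises the `ArgumentError` shown above. A usage sketch (the URL is a placeholder):

  # Honor robots.txt while spidering; assumes the robots gem is installed.
  agent = Spidr::Agent.new(robots: true, user_agent: 'MyCrawler/1.0')
  agent.start_at('http://example.com/')

  # Without the robots gem the same construction raises:
  #   ArgumentError: :robots option given but unable to require 'robots' gem
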
@@ -154,10 +194,13 @@
# before it begins spidering.
#
# @yieldparam [Agent] agent
# The newly created agent.
#
+ # @see #initialize
+ # @see #start_at
+ #
def self.start_at(url,options={},&block)
agent = new(options,&block)
agent.start_at(url)
end
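`Agent.start_at` yields the agent before spidering begins, so handlers and filters can be attached in the block. A sketch using the `every_page` event (URL is a placeholder):

  Spidr::Agent.start_at('http://example.com/index.html') do |agent|
    agent.every_page do |page|
      puts page.url
    end
  end
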
@@ -175,21 +218,23 @@
# before it begins spidering.
#
# @yieldparam [Agent] agent
# The newly created agent.
#
+ # @see #initialize
+ #
def self.site(url,options={},&block)
url = URI(url.to_s) unless url.kind_of?(URI)
- agent = new(options.merge(:host => url.host),&block)
+ agent = new(options.merge(host: url.host),&block)
agent.start_at(url)
end
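`Agent.site` now merges the `:host` constraint using keyword-style hash syntax; the behavior is unchanged and the spider stays on the starting URL's host. A sketch (URL is a placeholder):

  Spidr::Agent.site('http://example.com/') do |agent|
    agent.every_url { |url| puts url }
  end
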
#
# Creates a new agent and spiders the given host.
#
- # @param [String]
+ # @param [String] name
# The host-name to spider.
#
# @param [Hash] options
# Additional options. See {Agent#initialize}.
#
@@ -198,12 +243,15 @@
# before it begins spidering.
#
# @yieldparam [Agent] agent
# The newly created agent.
#
+ # @see #initialize
+ #
def self.host(name,options={},&block)
- site(URI::HTTP.build(:host => name, :path => '/'),options,&block)
+ agent = new(options.merge(host: name),&block)
+ agent.start_at(URI::HTTP.build(host: name, path: '/'))
end
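`Agent.host` no longer delegates to `site`: it builds the agent with the `host:` option directly and starts at the root path of the given host. A sketch (host name is a placeholder):

  Spidr::Agent.host('example.com') do |agent|
    agent.every_page do |page|
      puts "#{page.code} #{page.url}"
    end
  end
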
#
# Clears the history of the agent.
#
@@ -313,15 +361,13 @@
#
def history=(new_history)
@history.clear
new_history.each do |url|
- @history << unless url.kind_of?(URI)
- URI(url.to_s)
- else
- url
- end
+ url = URI(url.to_s) unless url.kind_of?(URI)
+
+ @history << url
end
return @history
end
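The rewritten coercion assigns the converted URI back to the local variable before appending, replacing the harder-to-read trailing `unless` expression. Seeding the history accepts Strings or URIs either way; a sketch with placeholder URLs:

  agent = Spidr::Agent.new
  agent.history = ['http://example.com/old-page', URI('http://example.com/other')]
  agent.visited?('http://example.com/old-page')  # => true
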
@@ -361,13 +407,26 @@
return @history.include?(url)
end
#
+ # Determines whether a URL is allowed by the robot policy.
+ #
+ # @param [URI::HTTP, String] url
+ # The URL to check.
+ #
+ # @return [Boolean]
+ # Specifies whether a URL is allowed by the robot policy.
+ #
+ def robot_allowed?(url)
+ @robots ? @robots.allowed?(url) : true
+ end
+
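`robot_allowed?` is a new public predicate: it returns `true` whenever robots support is disabled, and otherwise defers to the `Robots` instance built in the constructor. A sketch (URL is a placeholder):

  agent = Spidr::Agent.new                           # assumes robots support is not globally enabled
  agent.robot_allowed?('http://example.com/secret')  # => true
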
+ #
# Sets the list of failed URLs.
#
- # @param [#each]
+ # @param [#each] new_failures
# The new list of failed URLs.
#
# @return [Array<URI::HTTP>]
# The list of failed URLs.
#
@@ -376,15 +435,13 @@
#
def failures=(new_failures)
@failures.clear
new_failures.each do |url|
- @failures << unless url.kind_of?(URI)
- URI(url.to_s)
- else
- url
- end
+ url = URI(url.to_s) unless url.kind_of?(URI)
+
+ @failures << url
end
return @failures
end
@@ -406,11 +463,11 @@
alias pending_urls queue
#
# Sets the queue of URLs to visit.
#
- # @param [#each]
+ # @param [#each] new_queue
# The new list of URLs to visit.
#
# @return [Array<URI::HTTP>]
# The list of URLs to visit.
#
@@ -419,15 +476,13 @@
#
def queue=(new_queue)
@queue.clear
new_queue.each do |url|
- @queue << unless url.kind_of?(URI)
- URI(url.to_s)
- else
- url
- end
+ url = URI(url.to_s) unless url.kind_of?(URI)
+
+ @queue << url
end
return @queue
end
@@ -540,11 +595,11 @@
# The page for the response, or `nil` if the request failed.
#
# @since 0.2.2
#
def post_page(url,post_data='')
- url = URI(url.to_s)
+ url = URI(url.to_s) unless url.kind_of?(URI)
prepare_request(url) do |session,path,headers|
new_page = Page.new(url,session.post(path,post_data,headers))
# save any new cookies
@@ -614,11 +669,11 @@
# @return [Hash]
# The agent represented as a Hash containing the `history` and
# the `queue` of the agent.
#
def to_hash
- {:history => @history, :queue => @queue}
+ {history: @history, queue: @queue}
end
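Because `history=` and `queue=` are public, the Hash returned by `to_hash` can serve as a checkpoint that a fresh agent resumes from later. A sketch (URL is a placeholder):

  agent = Spidr::Agent.site('http://example.com/')
  state = agent.to_hash  # => {history: Set[...], queue: [...]}

  resumed = Spidr::Agent.new
  resumed.history = state[:history]
  resumed.queue   = state[:queue]
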
protected
#
@@ -664,13 +719,13 @@
break
end
end
end
- headers['Host'] ||= @host_header if @host_header
+ headers['Host'] ||= @host_header if @host_header
headers['User-Agent'] = @user_agent if @user_agent
- headers['Referer'] = @referer if @referer
+ headers['Referer'] = @referer if @referer
if (authorization = @authorized.for_url(url))
headers['Authorization'] = "Basic #{authorization}"
end
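The headers assembled here all come from public configuration: the `:host_header`, `:user_agent`, and `:referer` options plus credentials registered in the `authorized` store. A sketch with placeholder values (the `AuthStore#add(url, user, password)` signature is assumed):

  agent = Spidr::Agent.new(
    host_header: 'www.example.com',
    user_agent:  'MyCrawler/1.0',
    referer:     'http://example.com/'
  )
  agent.authorized.add('http://example.com/members/', 'user', 'secret')
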
@@ -685,11 +740,12 @@
rescue SystemCallError,
Timeout::Error,
SocketError,
IOError,
OpenSSL::SSL::SSLError,
- Net::HTTPBadResponse
+ Net::HTTPBadResponse,
+ Zlib::Error
@sessions.kill!(url)
failed(url)
return nil
@@ -720,10 +776,11 @@
visit_scheme?(url.scheme) &&
visit_host?(url.host) &&
visit_port?(url.port) &&
visit_link?(url.to_s) &&
visit_url?(url) &&
- visit_ext?(url.path)
+ visit_ext?(url.path) &&
+ robot_allowed?(url.to_s)
end
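`visit?` now also consults the robot policy, so disallowed paths are filtered out alongside the existing scheme, host, port, link, and extension rules. The underlying check is simply the `robots` gem's API, mirroring `Robots.new(@user_agent)` from the constructor; a standalone sketch with placeholder values:

  robots = Robots.new('MyCrawler/1.0')
  robots.allowed?('http://example.com/private/')  # false when robots.txt disallows it
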
#
# Adds a given URL to the failures list.
#