lib/spidr/agent.rb in spidr-0.5.0 vs lib/spidr/agent.rb in spidr-0.6.0
- old
+ new
@@ -1,41 +1,42 @@
+require 'spidr/settings/user_agent'
require 'spidr/agent/sanitizers'
require 'spidr/agent/filters'
require 'spidr/agent/events'
require 'spidr/agent/actions'
+require 'spidr/agent/robots'
require 'spidr/page'
require 'spidr/session_cache'
require 'spidr/cookie_jar'
require 'spidr/auth_store'
require 'spidr/spidr'
require 'openssl'
require 'net/http'
require 'set'
-begin
- require 'robots'
-rescue LoadError
-end
-
module Spidr
class Agent
+ include Settings::UserAgent
+
# HTTP Host Header to use
#
# @return [String]
attr_accessor :host_header
# HTTP Host Headers to use for specific hosts
#
# @return [Hash{String,Regexp => String}]
attr_reader :host_headers
- # User-Agent to use
+ # HTTP Headers to use for every request
#
- # @return [String]
- attr_accessor :user_agent
+ # @return [Hash{String => String}]
+ #
+ # @since 0.6.0
+ attr_reader :default_headers
# HTTP Authentication credentials
#
# @return [AuthStore]
attr_accessor :authorized
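The attr_accessor :user_agent removed above is replaced by the Settings::UserAgent mixin included at the top of the class. A minimal sketch of what that mixin plausibly provides, inferred from the @user_agent reads and writes later in this diff rather than copied from the gem:

# Hypothetical sketch of Spidr::Settings::UserAgent (assumed, not the gem's source).
module Spidr
  module Settings
    module UserAgent
      # The Agent code still assigns and reads @user_agent, so the mixin
      # must at least expose this accessor.
      attr_accessor :user_agent
    end
  end
end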
@@ -63,15 +64,27 @@
# Queue of URLs to visit
#
# @return [Array<URI::HTTP>]
attr_reader :queue
+ # The session cache
+ #
+ # @return [SessionCache]
+ #
+ # @since 0.6.0
+ attr_reader :sessions
+
# Cached cookies
#
# @return [CookieJar]
attr_reader :cookies
-
+
+ # Maximum number of pages to visit.
+ #
+ # @return [Integer]
+ attr_reader :limit
+
# Maximum depth
#
# @return [Integer]
attr_reader :max_depth
@@ -84,10 +97,25 @@
# Creates a new Agent object.
#
# @param [Hash] options
# Additional options
#
+ # @option options [Integer] :open_timeout (Spidr.open_timeout)
+ # Optional open timeout.
+ #
+ # @option options [Integer] :read_timeout (Spidr.read_timeout)
+ # Optional read timeout.
+ #
+ # @option options [Integer] :ssl_timeout (Spidr.ssl_timeout)
+ # Optional ssl timeout.
+ #
+ # @option options [Integer] :continue_timeout (Spidr.continue_timeout)
+ # Optional continue timeout.
+ #
+ # @option options [Integer] :keep_alive_timeout (Spidr.keep_alive_timeout)
+ # Optional keep_alive timeout.
+ #
# @option options [Hash] :proxy (Spidr.proxy)
# The proxy information to use.
#
# @option :proxy [String] :host
# The host the proxy is running on.
@@ -99,10 +127,13 @@
# The user to authenticate as with the proxy.
#
# @option :proxy [String] :password
# The password to authenticate with.
#
+ # @option options [Hash{String => String}] :default_headers
+ # Default headers to set for every request.
+ #
# @option options [String] :host_header
# The HTTP Host header to use with each request.
#
# @option options [Hash{String,Regexp => String}] :host_headers
# The HTTP Host headers to use for specific hosts.
@@ -120,10 +151,13 @@
# The initial queue of URLs to visit.
#
# @option options [Set, Array] :history
# The initial list of visited URLs.
#
+ # @option options [Integer] :limit
+ # The maximum number of pages to visit.
+ #
# @option options [Integer] :max_depth
# The maximum link depth to follow.
#
# @option options [Boolean] :robots (Spidr.robots?)
# Specifies whether `robots.txt` should be honored.
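Taken together, the new constructor options cover per-request default headers, the Net::HTTP timeouts, and a page limit. A usage sketch with illustrative values (robots: true still needs the separate robots gem, per the error message removed below):

require 'spidr'

# All values here are placeholders; every option may be omitted.
agent = Spidr::Agent.new(
  default_headers: {'Accept-Language' => 'en'},  # sent with every request
  open_timeout:    5,                            # Net::HTTP timeouts, passed to the sessions
  read_timeout:    10,
  limit:           100,                          # stop after 100 visited pages
  robots:          true                          # honor robots.txt
)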
@@ -146,39 +180,50 @@
if options[:host_headers]
@host_headers.merge!(options[:host_headers])
end
+ @default_headers = {}
+
+ if options[:default_headers]
+ @default_headers.merge!(options[:default_headers])
+ end
+
@user_agent = options.fetch(:user_agent,Spidr.user_agent)
@referer = options[:referer]
- @sessions = SessionCache.new(options.fetch(:proxy,Spidr.proxy))
+ @sessions = SessionCache.new(options)
@cookies = CookieJar.new
@authorized = AuthStore.new
@running = false
@delay = options.fetch(:delay,0)
@history = Set[]
@failures = Set[]
@queue = []
+ @limit = options[:limit]
@levels = Hash.new(0)
@max_depth = options[:max_depth]
- if options.fetch(:robots,Spidr.robots?)
- unless Object.const_defined?(:Robots)
- raise(ArgumentError,":robots option given but unable to require 'robots' gem")
- end
+ if options[:queue]
+ self.queue = options[:queue]
+ end
- @robots = Robots.new(@user_agent)
+ if options[:history]
+ self.history = options[:history]
end
initialize_sanitizers(options)
initialize_filters(options)
initialize_actions(options)
initialize_events(options)
+ if options.fetch(:robots,Spidr.robots?)
+ initialize_robots
+ end
+
yield self if block_given?
end
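Because the constructor now feeds :queue and :history through the existing queue= and history= setters, a crawl can be seeded or resumed from a previous run. A sketch with placeholder URLs:

agent = Spidr::Agent.new(
  history: ['http://example.com/already-seen'],             # treated as visited
  queue:   ['http://example.com/a', 'http://example.com/b'] # visited in order
)
agent.run { |page| puts page.url }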
#
# Creates a new agent and begin spidering at the given URL.
@@ -251,10 +296,41 @@
agent = new(options.merge(host: name),&block)
agent.start_at(URI::HTTP.build(host: name, path: '/'))
end
#
+ # The proxy information the agent uses.
+ #
+ # @return [Proxy]
+ # The proxy information.
+ #
+ # @see SessionCache#proxy
+ #
+ # @since 0.2.2
+ #
+ def proxy
+ @sessions.proxy
+ end
+
+ #
+ # Sets the proxy information that the agent uses.
+ #
+ # @param [Proxy] new_proxy
+ # The new proxy information.
+ #
+  #   @return [Proxy]
+ # The new proxy information.
+ #
+ # @see SessionCache#proxy=
+ #
+ # @since 0.2.2
+ #
+ def proxy=(new_proxy)
+ @sessions.proxy = new_proxy
+ end
+
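The proxy reader and writer were moved up in the file but still only delegate to the session cache. A sketch of switching the proxy on an existing agent, assuming the hash form documented in 0.5.0 is still accepted:

agent = Spidr::Agent.new
agent.proxy = {host: 'proxy.example.com', port: 8080}  # forwarded to @sessions.proxy=
agent.proxy                                            # read back from the session cache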
+ #
# Clears the history of the agent.
#
def clear
@queue.clear
@history.clear
@@ -290,11 +366,11 @@
# A page which has been visited.
#
def run(&block)
@running = true
- until (@queue.empty? || paused?)
+ until (@queue.empty? || paused? || limit_reached?)
begin
visit_page(dequeue,&block)
rescue Actions::Paused
return self
rescue Actions::Action
@@ -315,41 +391,10 @@
def running?
@running == true
end
#
- # The proxy information the agent uses.
- #
- # @return [Hash]
- # The proxy information.
- #
- # @see SessionCache#proxy
- #
- # @since 0.2.2
- #
- def proxy
- @sessions.proxy
- end
-
- #
- # Sets the proxy information that the agent uses.
- #
- # @param [Hash] new_proxy
- # The new proxy information.
- #
- # @return [Hash]
- # The new proxy information.
- #
- # @see SessionCache#proxy=
- #
- # @since 0.2.2
- #
- def proxy=(new_proxy)
- @sessions.proxy = new_proxy
- end
-
- #
# Sets the history of URLs that were previously visited.
#
# @param [#each] new_history
# A list of URLs to populate the history with.
#
@@ -407,23 +452,10 @@
return @history.include?(url)
end
#
- # Determines whether a URL is allowed by the robot policy.
- #
- # @param [URI::HTTP, String] url
- # The URL to check.
- #
- # @return [Boolean]
- # Specifies whether a URL is allowed by the robot policy.
- #
- def robot_allowed?(url)
- @robots ? @robots.allowed?(url) : true
- end
-
- #
# Sets the list of failed URLs.
#
# @param [#each] new_failures
# The new list of failed URLs.
#
@@ -534,19 +566,19 @@
raise(action)
rescue Actions::SkipLink
return false
rescue Actions::Action
end
-
+
@queue << url
@levels[url] = level
return true
end
return false
end
-
+
#
# Requests and creates a new Page object from a given URL.
#
# @param [URI::HTTP] url
# The URL to request.
@@ -675,10 +707,49 @@
end
protected
#
+ # Prepares request headers for the given URL.
+ #
+ # @param [URI::HTTP] url
+ # The URL to prepare the request headers for.
+ #
+ # @return [Hash{String => String}]
+ # The prepared headers.
+ #
+ # @since 0.6.0
+ #
+ def prepare_request_headers(url)
+ # set any additional HTTP headers
+ headers = @default_headers.dup
+
+ unless @host_headers.empty?
+ @host_headers.each do |name,header|
+  if url.host.match(name)
+ headers['Host'] = header
+ break
+ end
+ end
+ end
+
+ headers['Host'] ||= @host_header if @host_header
+ headers['User-Agent'] = @user_agent if @user_agent
+ headers['Referer'] = @referer if @referer
+
+ if (authorization = @authorized.for_url(url))
+ headers['Authorization'] = "Basic #{authorization}"
+ end
+
+ if (header_cookies = @cookies.for_host(url.host))
+ headers['Cookie'] = header_cookies
+ end
+
+ return headers
+ end
+
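Since @default_headers is duplicated first and the per-request values are written on top, a plain assignment such as the one for 'User-Agent' overrides a default header of the same name, while the ||= for 'Host' only fills it in when nothing else set it. A rough illustration with made-up values:

agent = Spidr::Agent.new(
  default_headers: {'Accept-Language' => 'en', 'User-Agent' => 'default UA'},
  user_agent:      'MyCrawler/1.0',
  host_header:     'www.example.com'
)

# prepare_request_headers (protected) would then build roughly:
#   {'Accept-Language' => 'en',
#    'Host'            => 'www.example.com',  # filled in, no other Host was set
#    'User-Agent'      => 'MyCrawler/1.0'}    # overrides the default header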
+ #
# Normalizes the request path and grabs a session to handle page
# get and post requests.
#
# @param [URI::HTTP] url
# The URL to request.
@@ -707,34 +778,12 @@
end
# append the URL query to the path
path += "?#{url.query}" if url.query
- # set any additional HTTP headers
- headers = {}
+ headers = prepare_request_headers(url)
- unless @host_headers.empty?
- @host_headers.each do |name,header|
- if host.match(name)
- headers['Host'] = header
- break
- end
- end
- end
-
- headers['Host'] ||= @host_header if @host_header
- headers['User-Agent'] = @user_agent if @user_agent
- headers['Referer'] = @referer if @referer
-
- if (authorization = @authorized.for_url(url))
- headers['Authorization'] = "Basic #{authorization}"
- end
-
- if (header_cookies = @cookies.for_host(url.host))
- headers['Cookie'] = header_cookies
- end
-
begin
sleep(@delay) if @delay > 0
yield @sessions[url], path, headers
rescue SystemCallError,
@@ -758,9 +807,20 @@
# @return [URI::HTTP]
# The URL that was at the front of the queue.
#
def dequeue
@queue.shift
+ end
+
+ #
+ # Determines if the maximum limit has been reached.
+ #
+ # @return [Boolean]
+ #
+ # @since 0.6.0
+ #
+ def limit_reached?
+ @limit && @history.length >= @limit
end
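Combined with the limit_reached? check added to #run above, the crawl stops once the history reaches the configured limit. A usage sketch with a placeholder URL:

Spidr.site('http://example.com/', limit: 50) do |agent|
  agent.every_page { |page| puts page.url }
end
# run exits once 50 pages have been added to the agent's history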
#
# Determines if a given URL should be visited.
#