lib/spidr/agent.rb in spidr-0.5.0 vs lib/spidr/agent.rb in spidr-0.6.0
- old
+ new
@@ -1,41 +1,42 @@
+require 'spidr/settings/user_agent'
require 'spidr/agent/sanitizers'
require 'spidr/agent/filters'
require 'spidr/agent/events'
require 'spidr/agent/actions'
+require 'spidr/agent/robots'
require 'spidr/page'
require 'spidr/session_cache'
require 'spidr/cookie_jar'
require 'spidr/auth_store'
require 'spidr/spidr'
require 'openssl'
require 'net/http'
require 'set'
-begin
- require 'robots'
-rescue LoadError
-end
-
module Spidr
class Agent
+ include Settings::UserAgent
+
# HTTP Host Header to use
#
# @return [String]
attr_accessor :host_header
# HTTP Host Headers to use for specific hosts
#
# @return [Hash{String,Regexp => String}]
attr_reader :host_headers
- # User-Agent to use
+ # HTTP Headers to use for every request
#
- # @return [String]
- attr_accessor :user_agent
+ # @return [Hash{String => String}]
+ #
+ # @since 0.6.0
+ attr_reader :default_headers
# HTTP Authentication credentials
#
# @return [AuthStore]
attr_accessor :authorized
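The attr_accessor :user_agent removed above is replaced by the Settings::UserAgent mixin included at the top of the class. A minimal sketch of what that mixin plausibly provides, inferred from the @user_agent reads and writes later in this diff rather than copied from the gem:

# Hypothetical sketch of Spidr::Settings::UserAgent (assumed, not the gem's source).
module Spidr
  module Settings
    module UserAgent
      # The Agent code still assigns and reads @user_agent, so the mixin
      # must at least expose this accessor.
      attr_accessor :user_agent
    end
  end
end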
@@ -63,15 +64,27 @@
# Queue of URLs to visit
#
# @return [Array<URI::HTTP>]
attr_reader :queue
+ # The session cache
+ #
+ # @return [SessionCache]
+ #
+ # @since 0.6.0
+ attr_reader :sessions
+
# Cached cookies
#
# @return [CookieJar]
attr_reader :cookies
-
+
+ # Maximum number of pages to visit.
+ #
+ # @return [Integer]
+ attr_reader :limit
+
# Maximum depth
#
# @return [Integer]
attr_reader :max_depth
@@ -84,10 +97,25 @@
# Creates a new Agent object.
#
# @param [Hash] options
# Additional options
#
+ # @option options [Integer] :open_timeout (Spidr.open_timeout)
+ # Optional open timeout.
+ #
+ # @option options [Integer] :read_timeout (Spidr.read_timeout)
+ # Optional read timeout.
+ #
+ # @option options [Integer] :ssl_timeout (Spidr.ssl_timeout)
+ # Optional ssl timeout.
+ #
+ # @option options [Integer] :continue_timeout (Spidr.continue_timeout)
+ # Optional continue timeout.
+ #
+ # @option options [Integer] :keep_alive_timeout (Spidr.keep_alive_timeout)
+ # Optional keep_alive timeout.
+ #
# @option options [Hash] :proxy (Spidr.proxy)
# The proxy information to use.
#
# @option :proxy [String] :host
# The host the proxy is running on.
@@ -99,10 +127,13 @@
# The user to authenticate as with the proxy.
#
# @option :proxy [String] :password
# The password to authenticate with.
#
+ # @option options [Hash{String => String}] :default_headers
+ # Default headers to set for every request.
+ #
# @option options [String] :host_header
# The HTTP Host header to use with each request.
#
# @option options [Hash{String,Regexp => String}] :host_headers
# The HTTP Host headers to use for specific hosts.
@@ -120,10 +151,13 @@
# The initial queue of URLs to visit.
#
# @option options [Set, Array] :history
# The initial list of visited URLs.
#
+ # @option options [Integer] :limit
+ # The maximum number of pages to visit.
+ #
# @option options [Integer] :max_depth
# The maximum link depth to follow.
#
# @option options [Boolean] :robots (Spidr.robots?)
# Specifies whether `robots.txt` should be honored.
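Taken together, the new constructor options cover per-request default headers, the Net::HTTP timeouts, and a page limit. A usage sketch with illustrative values (robots: true still needs the separate robots gem, per the error message removed below):

require 'spidr'

# All values here are placeholders; every option may be omitted.
agent = Spidr::Agent.new(
  default_headers: {'Accept-Language' => 'en'},  # sent with every request
  open_timeout:    5,                            # Net::HTTP timeouts, passed to the sessions
  read_timeout:    10,
  limit:           100,                          # stop after 100 visited pages
  robots:          true                          # honor robots.txt
)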
@@ -146,39 +180,50 @@
if options[:host_headers]
@host_headers.merge!(options[:host_headers])
end
+ @default_headers = {}
+
+ if options[:default_headers]
+ @default_headers.merge!(options[:default_headers])
+ end
+
@user_agent = options.fetch(:user_agent,Spidr.user_agent)
@referer = options[:referer]
- @sessions = SessionCache.new(options.fetch(:proxy,Spidr.proxy))
+ @sessions = SessionCache.new(options)
@cookies = CookieJar.new
@authorized = AuthStore.new
@running = false
@delay = options.fetch(:delay,0)
@history = Set[]
@failures = Set[]
@queue = []
+ @limit = options[:limit]
@levels = Hash.new(0)
@max_depth = options[:max_depth]
- if options.fetch(:robots,Spidr.robots?)
- unless Object.const_defined?(:Robots)
- raise(ArgumentError,":robots option given but unable to require 'robots' gem")
- end
+ if options[:queue]
+ self.queue = options[:queue]
+ end
- @robots = Robots.new(@user_agent)
+ if options[:history]
+ self.history = options[:history]
end
initialize_sanitizers(options)
initialize_filters(options)
initialize_actions(options)
initialize_events(options)
+ if options.fetch(:robots,Spidr.robots?)
+ initialize_robots
+ end
+
yield self if block_given?
end
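Because the constructor now feeds :queue and :history through the existing queue= and history= setters, a crawl can be seeded or resumed from a previous run. A sketch with placeholder URLs:

agent = Spidr::Agent.new(
  history: ['http://example.com/already-seen'],             # treated as visited
  queue:   ['http://example.com/a', 'http://example.com/b'] # visited in order
)
agent.run { |page| puts page.url }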
#
# Creates a new agent and begin spidering at the given URL.
@@ -251,10 +296,41 @@
agent = new(options.merge(host: name),&block)
agent.start_at(URI::HTTP.build(host: name, path: '/'))
end
#
+ # The proxy information the agent uses.
+ #
+ # @return [Proxy]
+ # The proxy information.
+ #
+ # @see SessionCache#proxy
+ #
+ # @since 0.2.2
+ #
+ def proxy
+ @sessions.proxy
+ end
+
+ #
+ # Sets the proxy information that the agent uses.
+ #
+ # @param [Proxy] new_proxy
+ # The new proxy information.
+ #
+  #   @return [Proxy]
+ # The new proxy information.
+ #
+ # @see SessionCache#proxy=
+ #
+ # @since 0.2.2
+ #
+ def proxy=(new_proxy)
+ @sessions.proxy = new_proxy
+ end
+
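The proxy reader and writer were moved up in the file but still only delegate to the session cache. A sketch of switching the proxy on an existing agent, assuming the hash form documented in 0.5.0 is still accepted:

agent = Spidr::Agent.new
agent.proxy = {host: 'proxy.example.com', port: 8080}  # forwarded to @sessions.proxy=
agent.proxy                                            # read back from the session cache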
+ #
# Clears the history of the agent.
#
def clear
@queue.clear
@history.clear
@@ -290,11 +366,11 @@
# A page which has been visited.
#
def run(&block)
@running = true
- until (@queue.empty? || paused?)
+ until (@queue.empty? || paused? || limit_reached?)
begin
visit_page(dequeue,&block)
rescue Actions::Paused
return self
rescue Actions::Action
@@ -315,41 +391,10 @@
def running?
@running == true
end
#
- # The proxy information the agent uses.
- #
- # @return [Hash]
- # The proxy information.
- #
- # @see SessionCache#proxy
- #
- # @since 0.2.2
- #
- def proxy
- @sessions.proxy
- end
-
- #
- # Sets the proxy information that the agent uses.
- #
- # @param [Hash] new_proxy
- # The new proxy information.
- #
- # @return [Hash]
- # The new proxy information.
- #
- # @see SessionCache#proxy=
- #
- # @since 0.2.2
- #
- def proxy=(new_proxy)
- @sessions.proxy = new_proxy
- end
-
- #
# Sets the history of URLs that were previously visited.
#
# @param [#each] new_history
# A list of URLs to populate the history with.
#
@@ -407,23 +452,10 @@
return @history.include?(url)
end
#
- # Determines whether a URL is allowed by the robot policy.
- #
- # @param [URI::HTTP, String] url
- # The URL to check.
- #
- # @return [Boolean]
- # Specifies whether a URL is allowed by the robot policy.
- #
- def robot_allowed?(url)
- @robots ? @robots.allowed?(url) : true
- end
-
- #
# Sets the list of failed URLs.
#
# @param [#each] new_failures
# The new list of failed URLs.
#
@@ -534,19 +566,19 @@
raise(action)
rescue Actions::SkipLink
return false
rescue Actions::Action
end
-
+
@queue << url
@levels[url] = level
return true
end
return false
end
-
+
#
# Requests and creates a new Page object from a given URL.
#
# @param [URI::HTTP] url
# The URL to request.
@@ -675,10 +707,49 @@
end
protected
#
+ # Prepares request headers for the given URL.
+ #
+ # @param [URI::HTTP] url
+ # The URL to prepare the request headers for.
+ #
+ # @return [Hash{String => String}]
+ # The prepared headers.
+ #
+ # @since 0.6.0
+ #
+ def prepare_request_headers(url)
+ # set any additional HTTP headers
+ headers = @default_headers.dup
+
+ unless @host_headers.empty?
+ @host_headers.each do |name,header|
+  if url.host.match(name)
+ headers['Host'] = header
+ break
+ end
+ end
+ end
+
+ headers['Host'] ||= @host_header if @host_header
+ headers['User-Agent'] = @user_agent if @user_agent
+ headers['Referer'] = @referer if @referer
+
+ if (authorization = @authorized.for_url(url))
+ headers['Authorization'] = "Basic #{authorization}"
+ end
+
+ if (header_cookies = @cookies.for_host(url.host))
+ headers['Cookie'] = header_cookies
+ end
+
+ return headers
+ end
+
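Since @default_headers is duplicated first and the per-request values are written on top, a plain assignment such as the one for 'User-Agent' overrides a default header of the same name, while the ||= for 'Host' only fills it in when nothing else set it. A rough illustration with made-up values:

agent = Spidr::Agent.new(
  default_headers: {'Accept-Language' => 'en', 'User-Agent' => 'default UA'},
  user_agent:      'MyCrawler/1.0',
  host_header:     'www.example.com'
)

# prepare_request_headers (protected) would then build roughly:
#   {'Accept-Language' => 'en',
#    'Host'            => 'www.example.com',  # filled in, no other Host was set
#    'User-Agent'      => 'MyCrawler/1.0'}    # overrides the default header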
+ #
# Normalizes the request path and grabs a session to handle page
# get and post requests.
#
# @param [URI::HTTP] url
# The URL to request.
@@ -707,34 +778,12 @@
end
# append the URL query to the path
path += "?#{url.query}" if url.query
- # set any additional HTTP headers
- headers = {}
+ headers = prepare_request_headers(url)
- unless @host_headers.empty?
- @host_headers.each do |name,header|
- if host.match(name)
- headers['Host'] = header
- break
- end
- end
- end
-
- headers['Host'] ||= @host_header if @host_header
- headers['User-Agent'] = @user_agent if @user_agent
- headers['Referer'] = @referer if @referer
-
- if (authorization = @authorized.for_url(url))
- headers['Authorization'] = "Basic #{authorization}"
- end
-
- if (header_cookies = @cookies.for_host(url.host))
- headers['Cookie'] = header_cookies
- end
-
begin
sleep(@delay) if @delay > 0
yield @sessions[url], path, headers
rescue SystemCallError,
@@ -758,9 +807,20 @@
# @return [URI::HTTP]
# The URL that was at the front of the queue.
#
def dequeue
@queue.shift
+ end
+
+ #
+ # Determines if the maximum limit has been reached.
+ #
+ # @return [Boolean]
+ #
+ # @since 0.6.0
+ #
+ def limit_reached?
+ @limit && @history.length >= @limit
end
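Combined with the limit_reached? check added to #run above, the crawl stops once the history reaches the configured limit. A usage sketch with a placeholder URL:

Spidr.site('http://example.com/', limit: 50) do |agent|
  agent.every_page { |page| puts page.url }
end
# run exits once 50 pages have been added to the agent's history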
#
# Determines if a given URL should be visited.
#