lib/spidr/agent.rb in spidr-0.4.1 vs lib/spidr/agent.rb in spidr-0.5.0
- old
+ new
@@ -1,61 +1,85 @@
-require 'spidr/sanitizers'
-require 'spidr/filters'
-require 'spidr/events'
-require 'spidr/actions'
+require 'spidr/agent/sanitizers'
+require 'spidr/agent/filters'
+require 'spidr/agent/events'
+require 'spidr/agent/actions'
require 'spidr/page'
require 'spidr/session_cache'
require 'spidr/cookie_jar'
require 'spidr/auth_store'
require 'spidr/spidr'
require 'openssl'
require 'net/http'
require 'set'
+begin
+ require 'robots'
+rescue LoadError
+end
+
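Robots.txt support in 0.5.0 relies on the optional `robots` gem, loaded here only if it is available; a missing gem is tolerated until the `:robots` option is actually requested. A minimal opt-in sketch for an application's Gemfile (version constraints are illustrative):

  gem 'spidr',  '~> 0.5'
  gem 'robots'  # optional: enables the :robots option introduced below
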
module Spidr
class Agent
- include Sanitizers
- include Filters
- include Events
- include Actions
-
# HTTP Host Header to use
+ #
+ # @return [String]
attr_accessor :host_header
# HTTP Host Headers to use for specific hosts
+ #
+ # @return [Hash{String,Regexp => String}]
attr_reader :host_headers
# User-Agent to use
+ #
+ # @return [String]
attr_accessor :user_agent
# HTTP Authentication credentials
+ #
+ # @return [AuthStore]
attr_accessor :authorized
# Referer to use
+ #
+ # @return [String]
attr_accessor :referer
# Delay in between fetching pages
+ #
+ # @return [Integer]
attr_accessor :delay
# History containing visited URLs
+ #
+ # @return [Set<URI::HTTP>]
attr_reader :history
# List of unreachable URLs
+ #
+ # @return [Set<URI::HTTP>]
attr_reader :failures
# Queue of URLs to visit
+ #
+ # @return [Array<URI::HTTP>]
attr_reader :queue
# Cached cookies
+ #
+ # @return [CookieJar]
attr_reader :cookies
# Maximum depth
+ #
+ # @return [Integer]
attr_reader :max_depth
# The visited URLs and their depth within a site
+ #
+ # @return [Hash{URI::HTTP => Integer}]
attr_reader :levels
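The readers above expose the spider's state, so a crawl can be inspected after (or while) it runs. A small sketch, with example.com as a placeholder URL:

  agent = Spidr::Agent.start_at('http://example.com/')

  agent.history.each { |url| puts url }  # Set of visited URI::HTTP objects
  agent.failures.size                    # count of unreachable URLs
  agent.queue                            # URLs still waiting to be visited
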
#
# Creates a new Agent object.
#
@@ -99,41 +123,57 @@
# The initial list of visited URLs.
#
# @option options [Integer] :max_depth
# The maximum link depth to follow.
#
+ # @option options [Boolean] :robots (Spidr.robots?)
+ # Specifies whether `robots.txt` should be honored.
+ #
# @yield [agent]
# If a block is given, it will be passed the newly created agent
# for further configuration.
#
# @yieldparam [Agent] agent
# The newly created agent.
#
+ # @see #initialize_sanitizers
+ # @see #initialize_filters
+ # @see #initialize_actions
+ # @see #initialize_events
+ #
def initialize(options={})
- @host_header = options[:host_header]
+ @host_header  = options[:host_header]
@host_headers = {}
if options[:host_headers]
@host_headers.merge!(options[:host_headers])
end
@user_agent = options.fetch(:user_agent,Spidr.user_agent)
- @referer = options[:referer]
+ @referer    = options[:referer]
- @sessions = SessionCache.new(options.fetch(:proxy,Spidr.proxy))
- @cookies = CookieJar.new
+ @sessions   = SessionCache.new(options.fetch(:proxy,Spidr.proxy))
+ @cookies    = CookieJar.new
@authorized = AuthStore.new
- @running = false
- @delay = options.fetch(:delay,0)
- @history = Set[]
+ @running  = false
+ @delay    = options.fetch(:delay,0)
+ @history  = Set[]
@failures = Set[]
- @queue = []
+ @queue    = []
- @levels = Hash.new(0)
+ @levels    = Hash.new(0)
@max_depth = options[:max_depth]
+ if options.fetch(:robots,Spidr.robots?)
+ unless Object.const_defined?(:Robots)
+ raise(ArgumentError,":robots option given but unable to require 'robots' gem")
+ end
+
+ @robots = Robots.new(@user_agent)
+ end
+
initialize_sanitizers(options)
initialize_filters(options)
initialize_actions(options)
initialize_events(options)
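The new `:robots` option defaults to `Spidr.robots?` and, when enabled, wraps the configured user agent in a `Robots` instance; if the gem cannot be loaded, the constructor raises the `ArgumentError` shown above. A usage sketch (the URL is a placeholder):

  # Honor robots.txt while spidering; assumes the robots gem is installed.
  agent = Spidr::Agent.new(robots: true, user_agent: 'MyCrawler/1.0')
  agent.start_at('http://example.com/')

  # Without the robots gem the same construction raises:
  #   ArgumentError: :robots option given but unable to require 'robots' gem
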
@@ -154,10 +194,13 @@
# before it begins spidering.
#
# @yieldparam [Agent] agent
# The newly created agent.
#
+ # @see #initialize
+ # @see #start_at
+ #
def self.start_at(url,options={},&block)
agent = new(options,&block)
agent.start_at(url)
end
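`Agent.start_at` yields the agent before spidering begins, so handlers and filters can be attached in the block. A sketch using the `every_page` event (URL is a placeholder):

  Spidr::Agent.start_at('http://example.com/index.html') do |agent|
    agent.every_page do |page|
      puts page.url
    end
  end
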
@@ -175,21 +218,23 @@
# before it begins spidering.
#
# @yieldparam [Agent] agent
# The newly created agent.
#
+ # @see #initialize
+ #
def self.site(url,options={},&block)
url = URI(url.to_s) unless url.kind_of?(URI)
- agent = new(options.merge(:host => url.host),&block)
+ agent = new(options.merge(host: url.host),&block)
agent.start_at(url)
end
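`Agent.site` now merges the `:host` constraint using keyword-style hash syntax; the behavior is unchanged and the spider stays on the starting URL's host. A sketch (URL is a placeholder):

  Spidr::Agent.site('http://example.com/') do |agent|
    agent.every_url { |url| puts url }
  end
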
#
# Creates a new agent and spiders the given host.
#
- # @param [String]
+ # @param [String] name
# The host-name to spider.
#
# @param [Hash] options
# Additional options. See {Agent#initialize}.
#
@@ -198,12 +243,15 @@
# before it begins spidering.
#
# @yieldparam [Agent] agent
# The newly created agent.
#
+ # @see #initialize
+ #
def self.host(name,options={},&block)
- site(URI::HTTP.build(:host => name, :path => '/'),options,&block)
+ agent = new(options.merge(host: name),&block)
+ agent.start_at(URI::HTTP.build(host: name, path: '/'))
end
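`Agent.host` no longer delegates to `site`: it builds the agent with the `host:` option directly and starts at the root path of the given host. A sketch (host name is a placeholder):

  Spidr::Agent.host('example.com') do |agent|
    agent.every_page do |page|
      puts "#{page.code} #{page.url}"
    end
  end
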
#
# Clears the history of the agent.
#
@@ -313,15 +361,13 @@
#
def history=(new_history)
@history.clear
new_history.each do |url|
- @history << unless url.kind_of?(URI)
- URI(url.to_s)
- else
- url
- end
+ url = URI(url.to_s) unless url.kind_of?(URI)
+
+ @history << url
end
return @history
end
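The rewritten coercion assigns the converted URI back to the local variable before appending, replacing the harder-to-read trailing `unless` expression. Seeding the history accepts Strings or URIs either way; a sketch with placeholder URLs:

  agent = Spidr::Agent.new
  agent.history = ['http://example.com/old-page', URI('http://example.com/other')]
  agent.visited?('http://example.com/old-page')  # => true
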
@@ -361,13 +407,26 @@
return @history.include?(url)
end
#
+ # Determines whether a URL is allowed by the robot policy.
+ #
+ # @param [URI::HTTP, String] url
+ # The URL to check.
+ #
+ # @return [Boolean]
+ # Specifies whether a URL is allowed by the robot policy.
+ #
+ def robot_allowed?(url)
+ @robots ? @robots.allowed?(url) : true
+ end
+
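`robot_allowed?` is a new public predicate: it returns `true` whenever robots support is disabled, and otherwise defers to the `Robots` instance built in the constructor. A sketch (URL is a placeholder):

  agent = Spidr::Agent.new                           # assumes robots support is not globally enabled
  agent.robot_allowed?('http://example.com/secret')  # => true
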
+ #
# Sets the list of failed URLs.
#
- # @param [#each]
+ # @param [#each] new_failures
# The new list of failed URLs.
#
# @return [Array<URI::HTTP>]
# The list of failed URLs.
#
@@ -376,15 +435,13 @@
#
def failures=(new_failures)
@failures.clear
new_failures.each do |url|
- @failures << unless url.kind_of?(URI)
- URI(url.to_s)
- else
- url
- end
+ url = URI(url.to_s) unless url.kind_of?(URI)
+
+ @failures << url
end
return @failures
end
@@ -406,11 +463,11 @@
alias pending_urls queue
#
# Sets the queue of URLs to visit.
#
- # @param [#each]
+ # @param [#each] new_queue
# The new list of URLs to visit.
#
# @return [Array<URI::HTTP>]
# The list of URLs to visit.
#
@@ -419,15 +476,13 @@
#
def queue=(new_queue)
@queue.clear
new_queue.each do |url|
- @queue << unless url.kind_of?(URI)
- URI(url.to_s)
- else
- url
- end
+ url = URI(url.to_s) unless url.kind_of?(URI)
+
+ @queue << url
end
return @queue
end
@@ -540,11 +595,11 @@
# The page for the response, or `nil` if the request failed.
#
# @since 0.2.2
#
def post_page(url,post_data='')
- url = URI(url.to_s)
+ url = URI(url.to_s) unless url.kind_of?(URI)
prepare_request(url) do |session,path,headers|
new_page = Page.new(url,session.post(path,post_data,headers))
# save any new cookies
@@ -614,11 +669,11 @@
# @return [Hash]
# The agent represented as a Hash containing the `history` and
# the `queue` of the agent.
#
def to_hash
- {:history => @history, :queue => @queue}
+ {history: @history, queue: @queue}
end
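Because `history=` and `queue=` are public, the Hash returned by `to_hash` can serve as a checkpoint that a fresh agent resumes from later. A sketch (URL is a placeholder):

  agent = Spidr::Agent.site('http://example.com/')
  state = agent.to_hash  # => {history: Set[...], queue: [...]}

  resumed = Spidr::Agent.new
  resumed.history = state[:history]
  resumed.queue   = state[:queue]
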
protected
#
@@ -664,13 +719,13 @@
break
end
end
end
- headers['Host'] ||= @host_header if @host_header
+ headers['Host'] ||= @host_header if @host_header
headers['User-Agent'] = @user_agent if @user_agent
- headers['Referer'] = @referer if @referer
+ headers['Referer'] = @referer if @referer
if (authorization = @authorized.for_url(url))
headers['Authorization'] = "Basic #{authorization}"
end
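The headers assembled here all come from public configuration: the `:host_header`, `:user_agent`, and `:referer` options plus credentials registered in the `authorized` store. A sketch with placeholder values (the `AuthStore#add(url, user, password)` signature is assumed):

  agent = Spidr::Agent.new(
    host_header: 'www.example.com',
    user_agent:  'MyCrawler/1.0',
    referer:     'http://example.com/'
  )
  agent.authorized.add('http://example.com/members/', 'user', 'secret')
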
@@ -685,11 +740,12 @@
rescue SystemCallError,
Timeout::Error,
SocketError,
IOError,
OpenSSL::SSL::SSLError,
- Net::HTTPBadResponse
+ Net::HTTPBadResponse,
+ Zlib::Error
@sessions.kill!(url)
failed(url)
return nil
@@ -720,10 +776,11 @@
visit_scheme?(url.scheme) &&
visit_host?(url.host) &&
visit_port?(url.port) &&
visit_link?(url.to_s) &&
visit_url?(url) &&
- visit_ext?(url.path)
+ visit_ext?(url.path) &&
+ robot_allowed?(url.to_s)
end
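`visit?` now also consults the robot policy, so disallowed paths are filtered out alongside the existing scheme, host, port, link, and extension rules. The underlying check is simply the `robots` gem's API, mirroring `Robots.new(@user_agent)` from the constructor; a standalone sketch with placeholder values:

  robots = Robots.new('MyCrawler/1.0')
  robots.allowed?('http://example.com/private/')  # false when robots.txt disallows it
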
#
# Adds a given URL to the failures list.
#