lib/spidr/agent.rb in spidr-0.4.1 vs lib/spidr/agent.rb in spidr-0.5.0

- old
+ new

@@ -1,61 +1,85 @@
-require 'spidr/sanitizers'
-require 'spidr/filters'
-require 'spidr/events'
-require 'spidr/actions'
+require 'spidr/agent/sanitizers'
+require 'spidr/agent/filters'
+require 'spidr/agent/events'
+require 'spidr/agent/actions'
 require 'spidr/page'
 require 'spidr/session_cache'
 require 'spidr/cookie_jar'
 require 'spidr/auth_store'
 require 'spidr/spidr'
 
 require 'openssl'
 require 'net/http'
 require 'set'
 
+begin
+  require 'robots'
+rescue LoadError
+end
+
 module Spidr
   class Agent
 
-    include Sanitizers
-    include Filters
-    include Events
-    include Actions
-
     # HTTP Host Header to use
+    #
+    # @return [String]
     attr_accessor :host_header
 
     # HTTP Host Headers to use for specific hosts
+    #
+    # @return [Hash{String,Regexp => String}]
     attr_reader :host_headers
 
     # User-Agent to use
+    #
+    # @return [String]
     attr_accessor :user_agent
 
     # HTTP Authentication credentials
+    #
+    # @return [AuthStore]
     attr_accessor :authorized
 
     # Referer to use
+    #
+    # @return [String]
     attr_accessor :referer
 
     # Delay in between fetching pages
+    #
+    # @return [Integer]
     attr_accessor :delay
 
     # History containing visited URLs
+    #
+    # @return [Set<URI::HTTP>]
     attr_reader :history
 
     # List of unreachable URLs
+    #
+    # @return [Set<URI::HTTP>]
     attr_reader :failures
 
     # Queue of URLs to visit
+    #
+    # @return [Array<URI::HTTP>]
     attr_reader :queue
 
     # Cached cookies
+    #
+    # @return [CookieJar]
     attr_reader :cookies
 
     # Maximum depth
+    #
+    # @return [Integer]
     attr_reader :max_depth
 
     # The visited URLs and their depth within a site
+    #
+    # @return [Hash{URI::HTTP => Integer}]
     attr_reader :levels
 
     #
     # Creates a new Agent object.
     #
@@ -99,41 +123,57 @@
     #   The initial list of visited URLs.
     #
     # @option options [Integer] :max_depth
     #   The maximum link depth to follow.
     #
+    # @option options [Boolean] :robots (Spidr.robots?)
+    #   Specifies whether `robots.txt` should be honored.
+    #
     # @yield [agent]
     #   If a block is given, it will be passed the newly created agent
     #   for further configuration.
     #
     # @yieldparam [Agent] agent
     #   The newly created agent.
     #
+    # @see #initialize_sanitizers
+    # @see #initialize_filters
+    # @see #initialize_actions
+    # @see #initialize_events
+    #
     def initialize(options={})
-      @host_header = options[:host_header]
+      @host_header  = options[:host_header]
       @host_headers = {}
 
       if options[:host_headers]
         @host_headers.merge!(options[:host_headers])
       end
 
       @user_agent = options.fetch(:user_agent,Spidr.user_agent)
-      @referer = options[:referer]
+      @referer    = options[:referer]
 
-      @sessions = SessionCache.new(options.fetch(:proxy,Spidr.proxy))
-      @cookies = CookieJar.new
+      @sessions   = SessionCache.new(options.fetch(:proxy,Spidr.proxy))
+      @cookies    = CookieJar.new
       @authorized = AuthStore.new
 
-      @running = false
-      @delay = options.fetch(:delay,0)
-      @history = Set[]
+      @running  = false
+      @delay    = options.fetch(:delay,0)
+      @history  = Set[]
       @failures = Set[]
-      @queue = []
+      @queue    = []
 
-      @levels = Hash.new(0)
+      @levels    = Hash.new(0)
       @max_depth = options[:max_depth]
 
+      if options.fetch(:robots,Spidr.robots?)
+        unless Object.const_defined?(:Robots)
+          raise(ArgumentError,":robots option given but unable to require 'robots' gem")
+        end
+
+        @robots = Robots.new(@user_agent)
+      end
+
       initialize_sanitizers(options)
       initialize_filters(options)
       initialize_actions(options)
       initialize_events(options)
@@ -154,10 +194,13 @@
     #   before it begins spidering.
     #
     # @yieldparam [Agent] agent
     #   The newly created agent.
     #
+    # @see #initialize
+    # @see #start_at
+    #
     def self.start_at(url,options={},&block)
       agent = new(options,&block)
       agent.start_at(url)
     end
 
@@ -175,21 +218,23 @@
     #   before it begins spidering.
     #
     # @yieldparam [Agent] agent
     #   The newly created agent.
     #
+    # @see #initialize
+    #
     def self.site(url,options={},&block)
       url = URI(url.to_s) unless url.kind_of?(URI)
 
-      agent = new(options.merge(:host => url.host),&block)
+      agent = new(options.merge(host: url.host),&block)
       agent.start_at(url)
     end
 
     #
     # Creates a new agent and spiders the given host.
     #
-    # @param [String]
+    # @param [String] name
     #   The host-name to spider.
     #
     # @param [Hash] options
     #   Additional options. See {Agent#initialize}.
     #
@@ -198,12 +243,15 @@
     #   before it begins spidering.
     #
     # @yieldparam [Agent] agent
     #   The newly created agent.
     #
+    # @see #initialize
+    #
     def self.host(name,options={},&block)
-      site(URI::HTTP.build(:host => name, :path => '/'),options,&block)
+      agent = new(options.merge(host: name),&block)
+      agent.start_at(URI::HTTP.build(host: name, path: '/'))
     end
 
     #
     # Clears the history of the agent.
     #
@@ -313,15 +361,13 @@
     #
     def history=(new_history)
       @history.clear
 
       new_history.each do |url|
-        @history << unless url.kind_of?(URI)
-                      URI(url.to_s)
-                    else
-                      url
-                    end
+        url = URI(url.to_s) unless url.kind_of?(URI)
+
+        @history << url
       end
 
       return @history
     end
 
@@ -361,13 +407,26 @@
       return @history.include?(url)
     end
 
     #
+    # Determines whether a URL is allowed by the robot policy.
+    #
+    # @param [URI::HTTP, String] url
+    #   The URL to check.
+    #
+    # @return [Boolean]
+    #   Specifies whether a URL is allowed by the robot policy.
+    #
+    def robot_allowed?(url)
+      @robots ? @robots.allowed?(url) : true
+    end
+
+    #
     # Sets the list of failed URLs.
     #
-    # @param [#each]
+    # @param [#each] new_failures
     #   The new list of failed URLs.
     #
     # @return [Array<URI::HTTP>]
     #   The list of failed URLs.
     #
@@ -376,15 +435,13 @@
     #
     def failures=(new_failures)
      @failures.clear
 
       new_failures.each do |url|
-        @failures << unless url.kind_of?(URI)
-                       URI(url.to_s)
-                     else
-                       url
-                     end
+        url = URI(url.to_s) unless url.kind_of?(URI)
+
+        @failures << url
      end
 
       return @failures
     end
 
@@ -406,11 +463,11 @@
     alias pending_urls queue
 
     #
     # Sets the queue of URLs to visit.
     #
-    # @param [#each]
+    # @param [#each] new_queue
     #   The new list of URLs to visit.
     #
     # @return [Array<URI::HTTP>]
     #   The list of URLs to visit.
     #
@@ -419,15 +476,13 @@
     #
     def queue=(new_queue)
       @queue.clear
 
       new_queue.each do |url|
-        @queue << unless url.kind_of?(URI)
-                    URI(url.to_s)
-                  else
-                    url
-                  end
+        url = URI(url.to_s) unless url.kind_of?(URI)
+
+        @queue << url
       end
 
       return @queue
     end
 
@@ -540,11 +595,11 @@
     #   The page for the response, or `nil` if the request failed.
     #
     # @since 0.2.2
     #
     def post_page(url,post_data='')
-      url = URI(url.to_s)
+      url = URI(url.to_s) unless url.kind_of?(URI)
 
       prepare_request(url) do |session,path,headers|
         new_page = Page.new(url,session.post(path,post_data,headers))
 
         # save any new cookies
@@ -614,11 +669,11 @@
     # @return [Hash]
     #   The agent represented as a Hash containing the `history` and
     #   the `queue` of the agent.
     #
     def to_hash
-      {:history => @history, :queue => @queue}
+      {history: @history, queue: @queue}
     end
 
     protected
 
     #
@@ -664,13 +719,13 @@
            break
          end
        end
      end
 
-      headers['Host'] ||= @host_header if @host_header
+      headers['Host']     ||= @host_header if @host_header
       headers['User-Agent'] = @user_agent if @user_agent
-      headers['Referer'] = @referer if @referer
+      headers['Referer']    = @referer    if @referer
 
      if (authorization = @authorized.for_url(url))
        headers['Authorization'] = "Basic #{authorization}"
      end
@@ -685,11 +740,12 @@
     rescue SystemCallError,
            Timeout::Error,
            SocketError,
            IOError,
            OpenSSL::SSL::SSLError,
-           Net::HTTPBadResponse
+           Net::HTTPBadResponse,
+           Zlib::Error
 
       @sessions.kill!(url)
 
       failed(url)
       return nil
@@ -720,10 +776,11 @@
       visit_scheme?(url.scheme) &&
       visit_host?(url.host) &&
       visit_port?(url.port) &&
       visit_link?(url.to_s) &&
       visit_url?(url) &&
-      visit_ext?(url.path)
+      visit_ext?(url.path) &&
+      robot_allowed?(url.to_s)
     end
 
     #
     # Adds a given URL to the failures list.
    #
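
The headline change in this diff is optional robots.txt support: Agent#initialize now accepts a :robots option (defaulting to Spidr.robots?), builds a Robots object from the 'robots' gem when it is enabled, and visit? consults the new robot_allowed? predicate before following a URL. A minimal usage sketch, assuming the 'robots' gem is installed; the host name is a placeholder, and every_page comes from the agent's Events module, which is not part of this diff:

    require 'spidr'

    # Spider a single host while honoring its robots.txt.
    # Agent#initialize raises ArgumentError if :robots is requested
    # but the 'robots' gem cannot be loaded.
    Spidr::Agent.host('example.com', robots: true) do |agent|
      agent.every_page do |page|
        puts page.url
      end
    end

Also visible above: Agent.host no longer delegates to Agent.site and instead builds the root URL itself while passing host: name to the agent, Zlib::Error is now rescued alongside the other network errors, and the old :key => value hashes have been rewritten in Ruby 1.9 keyword style.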