lib/spidr/agent.rb in spidr-0.5.0 vs lib/spidr/agent.rb in spidr-0.6.0

- old (spidr-0.5.0)
+ new (spidr-0.6.0)

@@ -1,41 +1,42 @@
+require 'spidr/settings/user_agent'
 require 'spidr/agent/sanitizers'
 require 'spidr/agent/filters'
 require 'spidr/agent/events'
 require 'spidr/agent/actions'
+require 'spidr/agent/robots'
 require 'spidr/page'
 require 'spidr/session_cache'
 require 'spidr/cookie_jar'
 require 'spidr/auth_store'
 require 'spidr/spidr'

 require 'openssl'
 require 'net/http'
 require 'set'

-begin
-  require 'robots'
-rescue LoadError
-end
-
 module Spidr
   class Agent

+    include Settings::UserAgent
+
     # HTTP Host Header to use
     #
     # @return [String]
     attr_accessor :host_header

     # HTTP Host Headers to use for specific hosts
     #
     # @return [Hash{String,Regexp => String}]
     attr_reader :host_headers

-    # User-Agent to use
+    # HTTP Headers to use for every request
     #
-    # @return [String]
-    attr_accessor :user_agent
+    # @return [Hash{String => String}]
+    #
+    # @since 0.6.0
+    attr_reader :default_headers

     # HTTP Authentication credentials
     #
     # @return [AuthStore]
     attr_accessor :authorized
@@ -63,15 +64,27 @@
     # Queue of URLs to visit
     #
     # @return [Array<URI::HTTP>]
     attr_reader :queue

+    # The session cache
+    #
+    # @return [SessionCache]
+    #
+    # @since 0.6.0
+    attr_reader :sessions
+
     # Cached cookies
     #
     # @return [CookieJar]
     attr_reader :cookies
-
+
+    # Maximum number of pages to visit.
+    #
+    # @return [Integer]
+    attr_reader :limit
+
     # Maximum depth
     #
     # @return [Integer]
     attr_reader :max_depth
@@ -84,10 +97,25 @@
    # Creates a new Agent object.
    #
    # @param [Hash] options
    #   Additional options
    #
+   # @option options [Integer] :open_timeout (Spidr.open_timeout)
+   #   Optional open timeout.
+   #
+   # @option options [Integer] :read_timeout (Spidr.read_timeout)
+   #   Optional read timeout.
+   #
+   # @option options [Integer] :ssl_timeout (Spidr.ssl_timeout)
+   #   Optional ssl timeout.
+   #
+   # @option options [Integer] :continue_timeout (Spidr.continue_timeout)
+   #   Optional continue timeout.
+   #
+   # @option options [Integer] :keep_alive_timeout (Spidr.keep_alive_timeout)
+   #   Optional keep_alive timeout.
+   #
    # @option options [Hash] :proxy (Spidr.proxy)
    #   The proxy information to use.
    #
    # @option :proxy [String] :host
    #   The host the proxy is running on.
@@ -99,10 +127,13 @@
    #   The user to authenticate as with the proxy.
    #
    # @option :proxy [String] :password
    #   The password to authenticate with.
    #
+   # @option options [Hash{String => String}] :default_headers
+   #   Default headers to set for every request.
+
    # @option options [String] :host_header
    #   The HTTP Host header to use with each request.
    #
    # @option options [Hash{String,Regexp => String}] :host_headers
    #   The HTTP Host headers to use for specific hosts.
@@ -120,10 +151,13 @@
    #   The initial queue of URLs to visit.
    #
    # @option options [Set, Array] :history
    #   The initial list of visited URLs.
    #
+   # @option options [Integer] :limit
+   #   The maximum number of pages to visit.
+   #
    # @option options [Integer] :max_depth
    #   The maximum link depth to follow.
    #
    # @option options [Boolean] :robots (Spidr.robots?)
    #   Specifies whether `robots.txt` should be honored.
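The hunks above add five network timeout options, a :default_headers hash, and a :limit on the number of pages visited. A minimal usage sketch of these new 0.6.0 options (the header value, limit, timeouts, and URL are illustrative placeholders, not taken from the diff):

    require 'spidr'

    agent = Spidr::Agent.new(
      default_headers: {'Accept-Language' => 'en'}, # merged into every request
      limit:           100,                         # stop after 100 pages
      open_timeout:    5,                           # overrides Spidr.open_timeout
      read_timeout:    10                           # overrides Spidr.read_timeout
    )

    agent.start_at('http://example.com/')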
@@ -146,39 +180,50 @@
      if options[:host_headers]
        @host_headers.merge!(options[:host_headers])
      end

+     @default_headers = {}
+
+     if options[:default_headers]
+       @default_headers.merge!(options[:default_headers])
+     end
+
      @user_agent = options.fetch(:user_agent,Spidr.user_agent)
      @referer    = options[:referer]

-     @sessions   = SessionCache.new(options.fetch(:proxy,Spidr.proxy))
+     @sessions   = SessionCache.new(options)
      @cookies    = CookieJar.new
      @authorized = AuthStore.new

      @running = false
      @delay   = options.fetch(:delay,0)

      @history  = Set[]
      @failures = Set[]
      @queue    = []

+     @limit     = options[:limit]
      @levels    = Hash.new(0)
      @max_depth = options[:max_depth]

-     if options.fetch(:robots,Spidr.robots?)
-       unless Object.const_defined?(:Robots)
-         raise(ArgumentError,":robots option given but unable to require 'robots' gem")
-       end
+     if options[:queue]
+       self.queue = options[:queue]
+     end

-       @robots = Robots.new(@user_agent)
+     if options[:history]
+       self.history = options[:history]
      end

      initialize_sanitizers(options)
      initialize_filters(options)
      initialize_actions(options)
      initialize_events(options)

+     if options.fetch(:robots,Spidr.robots?)
+       initialize_robots
+     end
+
      yield self if block_given?
    end

    #
    # Creates a new agent and begin spidering at the given URL.
@@ -251,10 +296,41 @@
      agent = new(options.merge(host: name),&block)

      agent.start_at(URI::HTTP.build(host: name, path: '/'))
    end

    #
+   # The proxy information the agent uses.
+   #
+   # @return [Proxy]
+   #   The proxy information.
+   #
+   # @see SessionCache#proxy
+   #
+   # @since 0.2.2
+   #
+   def proxy
+     @sessions.proxy
+   end
+
+   #
+   # Sets the proxy information that the agent uses.
+   #
+   # @param [Proxy] new_proxy
+   #   The new proxy information.
+   #
+   # @return [Hash]
+   #   The new proxy information.
+   #
+   # @see SessionCache#proxy=
+   #
+   # @since 0.2.2
+   #
+   def proxy=(new_proxy)
+     @sessions.proxy = new_proxy
+   end
+
+   #
    # Clears the history of the agent.
    #
    def clear
      @queue.clear
      @history.clear
@@ -290,11 +366,11 @@
    #   A page which has been visited.
    #
    def run(&block)
      @running = true

-     until (@queue.empty? || paused?)
+     until (@queue.empty? || paused? || limit_reached?)
        begin
          visit_page(dequeue,&block)
        rescue Actions::Paused
          return self
        rescue Actions::Action
@@ -315,41 +391,10 @@
    def running?
      @running == true
    end

    #
-   # The proxy information the agent uses.
-   #
-   # @return [Hash]
-   #   The proxy information.
-   #
-   # @see SessionCache#proxy
-   #
-   # @since 0.2.2
-   #
-   def proxy
-     @sessions.proxy
-   end
-
-   #
-   # Sets the proxy information that the agent uses.
-   #
-   # @param [Hash] new_proxy
-   #   The new proxy information.
-   #
-   # @return [Hash]
-   #   The new proxy information.
-   #
-   # @see SessionCache#proxy=
-   #
-   # @since 0.2.2
-   #
-   def proxy=(new_proxy)
-     @sessions.proxy = new_proxy
-   end
-
-   #
    # Sets the history of URLs that were previously visited.
    #
    # @param [#each] new_history
    #   A list of URLs to populate the history with.
    #
@@ -407,23 +452,10 @@
      return @history.include?(url)
    end

    #
-   # Determines whether a URL is allowed by the robot policy.
-   #
-   # @param [URI::HTTP, String] url
-   #   The URL to check.
-   #
-   # @return [Boolean]
-   #   Specifies whether a URL is allowed by the robot policy.
-   #
-   def robot_allowed?(url)
-     @robots ? @robots.allowed?(url) : true
-   end
-
-   #
    # Sets the list of failed URLs.
    #
    # @param [#each] new_failures
    #   The new list of failed URLs.
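Note that proxy and proxy= were not removed, only moved up among the public instance methods; both still delegate to the SessionCache, whose documented return type changed from Hash to the new Proxy class. A sketch of how a caller might set it (host and port are made up; passing a Hash here assumes the setter still coerces one, as the pre-0.6.0 API did, whereas the updated docs type the parameter as Proxy):

    agent = Spidr::Agent.new
    agent.proxy = {host: 'proxy.example.com', port: 8080} # assumption: Hash accepted
    agent.proxy  # delegates to @sessions.proxy

The other behavioral change in this span is in run: with a :limit set, the crawl loop now also stops once limit_reached? returns true, even if URLs remain in the queue.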
    #
@@ -534,19 +566,19 @@
          raise(action)
        rescue Actions::SkipLink
          return false
        rescue Actions::Action
        end
-
+
        @queue << url
        @levels[url] = level

        return true
      end

      return false
    end
-
+
    #
    # Requests and creates a new Page object from a given URL.
    #
    # @param [URI::HTTP] url
    #   The URL to request.
@@ -675,10 +707,49 @@
    end

    protected

    #
+   # Prepares request headers for the given URL.
+   #
+   # @param [URI::HTTP] url
+   #   The URL to prepare the request headers for.
+   #
+   # @return [Hash{String => String}]
+   #   The prepared headers.
+   #
+   # @since 0.6.0
+   #
+   def prepare_request_headers(url)
+     # set any additional HTTP headers
+     headers = @default_headers.dup
+
+     unless @host_headers.empty?
+       @host_headers.each do |name,header|
+         if host.match(name)
+           headers['Host'] = header
+           break
+         end
+       end
+     end
+
+     headers['Host'] ||= @host_header if @host_header
+     headers['User-Agent'] = @user_agent if @user_agent
+     headers['Referer']    = @referer    if @referer
+
+     if (authorization = @authorized.for_url(url))
+       headers['Authorization'] = "Basic #{authorization}"
+     end
+
+     if (header_cookies = @cookies.for_host(url.host))
+       headers['Cookie'] = header_cookies
+     end
+
+     return headers
+   end
+
+   #
    # Normalizes the request path and grabs a session to handle page
    # get and post requests.
    #
    # @param [URI::HTTP] url
    #   The URL to request.
@@ -707,34 +778,12 @@
      end

      # append the URL query to the path
      path += "?#{url.query}" if url.query

-     # set any additional HTTP headers
-     headers = {}
+     headers = prepare_request_headers(url)

-     unless @host_headers.empty?
-       @host_headers.each do |name,header|
-         if host.match(name)
-           headers['Host'] = header
-           break
-         end
-       end
-     end
-
-     headers['Host'] ||= @host_header if @host_header
-     headers['User-Agent'] = @user_agent if @user_agent
-     headers['Referer']    = @referer    if @referer
-
-     if (authorization = @authorized.for_url(url))
-       headers['Authorization'] = "Basic #{authorization}"
-     end
-
-     if (header_cookies = @cookies.for_host(url.host))
-       headers['Cookie'] = header_cookies
-     end
-
      begin
        sleep(@delay) if @delay > 0

        yield @sessions[url], path, headers
      rescue SystemCallError,
@@ -758,9 +807,20 @@
    # @return [URI::HTTP]
    #   The URL that was at the front of the queue.
    #
    def dequeue
      @queue.shift
+   end
+
+   #
+   # Determines if the maximum limit has been reached.
+   #
+   # @return [Boolean]
+   #
+   # @since 0.6.0
+   #
+   def limit_reached?
+     @limit && @history.length >= @limit
    end

    #
    # Determines if a given URL should be visited.
    #
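Because prepare_request_headers starts from @default_headers.dup and only afterwards assigns Host, User-Agent, Referer, Authorization, and Cookie, the agent-computed headers always win over same-named defaults. An illustrative sketch of that precedence (header values are invented):

    agent = Spidr::Agent.new(
      default_headers: {'Accept' => 'text/html', 'User-Agent' => 'overridden'},
      user_agent:      'MyCrawler/1.0'
    )

    # For each request, prepare_request_headers would produce:
    #   'Accept'     => 'text/html'      # copied from default_headers
    #   'User-Agent' => 'MyCrawler/1.0'  # assigned after the dup, so it wins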