lib/spidr/agent.rb in spidr-0.6.1 vs lib/spidr/agent.rb in spidr-0.7.0

- old
+ new

@@ -17,16 +17,16 @@ module Spidr class Agent include Settings::UserAgent - # HTTP Host Header to use + # HTTP `Host` Header to use # # @return [String] attr_accessor :host_header - # HTTP Host Headers to use for specific hosts + # HTTP `Host` Headers to use for specific hosts # # @return [Hash{String,Regexp => String}] attr_reader :host_headers # HTTP Headers to use for every request @@ -94,212 +94,334 @@ attr_reader :levels # # Creates a new Agent object. # - # @param [Hash] options - # Additional options + # @param [String, nil] host_header + # The HTTP `Host` header to use with each request. # - # @option options [Integer] :open_timeout (Spidr.open_timeout) - # Optional open timeout. + # @param [Hash{String,Regexp => String}] host_headers + # The HTTP `Host` headers to use for specific hosts. # - # @option options [Integer] :read_timeout (Spidr.read_timeout) + # @param [Hash{String => String}] default_headers + # Default headers to set for every request. + # + # @param [String, nil] user_agent + # The `User-Agent` string to send with each request. + # + # @param [String, nil] referer + # The `Referer` URL to send with each request. + # + # @param [Integer, nil] open_timeout + # Optional open connection timeout. + # + # @param [Integer, nil] read_timeout # Optional read timeout. # - # @option options [Integer] :ssl_timeout (Spidr.ssl_timeout) - # Optional ssl timeout. + # @param [Integer, nil] ssl_timeout + # Optional SSL connection timeout. # - # @option options [Integer] :continue_timeout (Spidr.continue_timeout) + # @param [Integer, nil] continue_timeout # Optional continue timeout. # - # @option options [Integer] :keep_alive_timeout (Spidr.keep_alive_timeout) - # Optional keep_alive timeout. + # @param [Integer, nil] keep_alive_timeout + # Optional `Keep-Alive` timeout. # - # @option options [Hash] :proxy (Spidr.proxy) + # @param [Spidr::Proxy, Hash, URI::HTTP, String, nil] proxy # The proxy information to use. 
# - # @option :proxy [String] :host + # @option proxy [String] :host # The host the proxy is running on. # - # @option :proxy [Integer] :port + # @option proxy [Integer] :port (8080) # The port the proxy is running on. # - # @option :proxy [String] :user + # @option proxy [String, nil] :user # The user to authenticate as with the proxy. # - # @option :proxy [String] :password + # @option proxy [String, nil] :password # The password to authenticate with. # - # @option options [Hash{String => String}] :default_headers - # Default headers to set for every request. + # @param [Integer] delay + # The number of seconds to pause between each request. # - # @option options [String] :host_header - # The HTTP Host header to use with each request. + # @param [Integer, nil] limit + # The maximum number of pages to visit. # - # @option options [Hash{String,Regexp => String}] :host_headers - # The HTTP Host headers to use for specific hosts. + # @param [Integer, nil] max_depth + # The maximum link depth to follow. # - # @option options [String] :user_agent (Spidr.user_agent) - # The User-Agent string to send with each requests. + # @param [Set, Array, nil] queue + # The initial queue of URLs to visit. # - # @option options [String] :referer - # The Referer URL to send with each request. + # @param [Set, Array, nil] history + # The initial list of visited URLs. # - # @option options [Integer] :delay (0) - # The number of seconds to pause between each request. + # @param [Boolean] strip_fragments + # Controls whether to strip the fragment components from the URLs. # - # @option options [Set, Array] :queue - # The initial queue of URLs to visit. + # @param [Boolean] strip_query + # Controls whether to strip the query components from the URLs. # - # @option options [Set, Array] :history - # The initial list of visited URLs. + # @param [Array<String>] schemes + # The list of acceptable URI schemes to visit. + # The `https` scheme will be ignored if `net/https` cannot be loaded. 
# - # @option options [Integer] :limit - # The maximum number of pages to visit. + # @param [String] host + # The host-name to visit. # - # @option options [Integer] :max_depth - # The maximum link depth to follow. + # @param [Array<String, Regexp, Proc>] hosts + # The patterns which match the host-names to visit. # - # @option options [Boolean] :robots (Spidr.robots?) + # @param [Array<String, Regexp, Proc>] ignore_hosts + # The patterns which match the host-names to not visit. + # + # @param [Array<Integer, Regexp, Proc>] ports + # The patterns which match the ports to visit. + # + # @param [Array<Integer, Regexp, Proc>] ignore_ports + # The patterns which match the ports to not visit. + # + # @param [Array<String, Regexp, Proc>] links + # The patterns which match the links to visit. + # + # @param [Array<String, Regexp, Proc>] ignore_links + # The patterns which match the links to not visit. + # + # @param [Array<String, Regexp, Proc>] urls + # The patterns which match the URLs to visit. + # + # @param [Array<String, Regexp, Proc>] ignore_urls + # The patterns which match the URLs to not visit. + # + # @param [Array<String, Regexp, Proc>] exts + # The patterns which match the URI path extensions to visit. + # + # @param [Array<String, Regexp, Proc>] ignore_exts + # The patterns which match the URI path extensions to not visit. + # + # @param [Boolean] robots # Specifies whether `robots.txt` should be honored. # # @yield [agent] # If a block is given, it will be passed the newly created agent # for further configuration. # # @yieldparam [Agent] agent # The newly created agent. 
# - # @see #initialize_sanitizers - # @see #initialize_filters - # @see #initialize_actions - # @see #initialize_events - # - def initialize(options={}) - @host_header = options[:host_header] - @host_headers = {} + def initialize(# header keyword arguments + host_header: nil, + host_headers: {}, + default_headers: {}, + user_agent: Spidr.user_agent, + referer: nil, + # session cache keyword arguments + proxy: Spidr.proxy, + open_timeout: Spidr.open_timeout, + ssl_timeout: Spidr.ssl_timeout, + read_timeout: Spidr.read_timeout, + continue_timeout: Spidr.continue_timeout, + keep_alive_timeout: Spidr.keep_alive_timeout, + # spidering controls keyword arguments + delay: 0, + limit: nil, + max_depth: nil, + # history keyword arguments + queue: nil, + history: nil, + # sanitizer keyword arguments + strip_fragments: true, + strip_query: false, + # filtering keyword arguments + schemes: self.class.default_schemes, + host: nil, + hosts: nil, + ignore_hosts: nil, + ports: nil, + ignore_ports: nil, + links: nil, + ignore_links: nil, + urls: nil, + ignore_urls: nil, + exts: nil, + ignore_exts: nil, + # robots keyword arguments + robots: Spidr.robots?) 
+ @host_header = host_header + @host_headers = host_headers - if options[:host_headers] - @host_headers.merge!(options[:host_headers]) - end + @default_headers = default_headers - @default_headers = {} + @user_agent = user_agent + @referer = referer - if options[:default_headers] - @default_headers.merge!(options[:default_headers]) - end - - @user_agent = options.fetch(:user_agent,Spidr.user_agent) - @referer = options[:referer] - - @sessions = SessionCache.new(options) + @sessions = SessionCache.new( + proxy: proxy, + open_timeout: open_timeout, + ssl_timeout: ssl_timeout, + read_timeout: read_timeout, + continue_timeout: continue_timeout, + keep_alive_timeout: keep_alive_timeout + ) @cookies = CookieJar.new @authorized = AuthStore.new @running = false - @delay = options.fetch(:delay,0) + @delay = delay @history = Set[] @failures = Set[] @queue = [] - @limit = options[:limit] + @limit = limit @levels = Hash.new(0) - @max_depth = options[:max_depth] + @max_depth = max_depth - if options[:queue] - self.queue = options[:queue] - end + self.queue = queue if queue + self.history = history if history - if options[:history] - self.history = options[:history] - end + initialize_sanitizers( + strip_fragments: strip_fragments, + strip_query: strip_query + ) - initialize_sanitizers(options) - initialize_filters(options) - initialize_actions(options) - initialize_events(options) + initialize_filters( + schemes: schemes, + host: host, + hosts: hosts, + ignore_hosts: ignore_hosts, + ports: ports, + ignore_ports: ignore_ports, + links: links, + ignore_links: ignore_links, + urls: urls, + ignore_urls: ignore_urls, + exts: exts, + ignore_exts: ignore_exts + ) + initialize_actions + initialize_events - if options.fetch(:robots,Spidr.robots?) - initialize_robots - end + initialize_robots if robots yield self if block_given? end # # Creates a new agent and begin spidering at the given URL. # # @param [URI::HTTP, String] url # The URL to start spidering at. 
# - # @param [Hash] options - # Additional options. See {Agent#initialize}. + # @param [Hash{Symbol => Object}] kwargs + # Additional keyword arguments. See {Agent#initialize}. # # @yield [agent] # If a block is given, it will be passed the newly created agent # before it begins spidering. # # @yieldparam [Agent] agent # The newly created agent. # + # @return [Agent] + # The created agent object. + # # @see #initialize # @see #start_at # - def self.start_at(url,options={},&block) - agent = new(options,&block) + def self.start_at(url,**kwargs,&block) + agent = new(**kwargs,&block) agent.start_at(url) + return agent end # # Creates a new agent and spiders the web-site located at the given URL. # # @param [URI::HTTP, String] url # The web-site to spider. # - # @param [Hash] options - # Additional options. See {Agent#initialize}. + # @param [Hash{Symbol => Object}] kwargs + # Additional keyword arguments. See {Agent#initialize}. # # @yield [agent] # If a block is given, it will be passed the newly created agent # before it begins spidering. # # @yieldparam [Agent] agent # The newly created agent. # + # @return [Agent] + # The created agent object. + # # @see #initialize # - def self.site(url,options={},&block) + def self.site(url,**kwargs,&block) url = URI(url) - agent = new(options.merge(host: url.host),&block) + agent = new(host: url.host, **kwargs, &block) agent.start_at(url) + return agent end # # Creates a new agent and spiders the given host. # # @param [String] name # The host-name to spider. # - # @param [Hash] options - # Additional options. See {Agent#initialize}. + # @param [Hash{Symbol => Object}] kwargs + # Additional keyword arguments. See {Agent#initialize}. # # @yield [agent] # If a block is given, it will be passed the newly created agent # before it begins spidering. # # @yieldparam [Agent] agent # The newly created agent. # + # @return [Agent] + # The created agent object. 
+ # # @see #initialize # - def self.host(name,options={},&block) - agent = new(options.merge(host: name),&block) + def self.host(name,**kwargs,&block) + agent = new(host: name, **kwargs, &block) agent.start_at(URI::HTTP.build(host: name, path: '/')) + return agent end # + # Creates a new agent and spiders the entire domain. + # + # @param [String] name + # The top-level domain to spider. + # + # @param [Hash{Symbol => Object}] kwargs + # Additional keyword arguments. See {Agent#initialize}. + # + # @yield [agent] + # If a block is given, it will be passed the newly created agent + # before it begins spidering. + # + # @yieldparam [Agent] agent + # The newly created agent. + # + # @return [Agent] + # The created agent object. + # + # @see #initialize + # + # @since 0.7.0 + # + def self.domain(name,**kwargs,&block) + agent = new(host: /(^|\.)#{Regexp.escape(name)}$/, **kwargs, &block) + agent.start_at(URI::HTTP.build(host: name, path: '/')) + return agent + end + + # # The proxy information the agent uses. # # @return [Proxy] # The proxy information. # @@ -312,14 +434,14 @@ end # # Sets the proxy information that the agent uses. # - # @param [Proxy] new_proxy + # @param [Proxy, Hash, URI::HTTP, String, nil] new_proxy # The new proxy information. # - # @return [Hash] + # @return [Proxy] # The new proxy information. # # @see SessionCache#proxy= # # @since 0.2.2 @@ -532,11 +654,11 @@ # Specifies whether the URL was enqueued, or ignored. # def enqueue(url,level=0) url = sanitize_url(url) - if (!(queued?(url)) && visit?(url)) + if (!queued?(url) && visit?(url)) link = url.to_s begin @every_url_blocks.each { |url_block| url_block.call(url) } @@ -631,10 +753,10 @@ return new_page end end # - # Visits a given URL, and enqueus the links recovered from the URL + # Visits a given URL, and enqueues the links recovered from the URL # to be visited later. # # @param [URI::HTTP, String] url # The URL to visit. #