lib/spidr/agent.rb in spidr-0.6.1 vs lib/spidr/agent.rb in spidr-0.7.0

- old
+ new

@@ -17,16 +17,16 @@ module Spidr class Agent include Settings::UserAgent - # HTTP Host Header to use + # HTTP `Host` Header to use # # @return [String] attr_accessor :host_header - # HTTP Host Headers to use for specific hosts + # HTTP `Host` Headers to use for specific hosts # # @return [Hash{String,Regexp => String}] attr_reader :host_headers # HTTP Headers to use for every request @@ -94,212 +94,334 @@ attr_reader :levels # # Creates a new Agent object. # - # @param [Hash] options - # Additional options + # @param [String, nil] host_header + # The HTTP `Host` header to use with each request. # - # @option options [Integer] :open_timeout (Spidr.open_timeout) - # Optional open timeout. + # @param [Hash{String,Regexp => String}] host_headers + # The HTTP `Host` headers to use for specific hosts. # - # @option options [Integer] :read_timeout (Spidr.read_timeout) + # @param [Hash{String => String}] default_headers + # Default headers to set for every request. + # + # @param [String, nil] user_agent + # The `User-Agent` string to send with each request. + # + # @param [String, nil] referer + # The `Referer` URL to send with each request. + # + # @param [Integer, nil] open_timeout + # Optional open connection timeout. + # + # @param [Integer, nil] read_timeout # Optional read timeout. # - # @option options [Integer] :ssl_timeout (Spidr.ssl_timeout) - # Optional ssl timeout. + # @param [Integer, nil] ssl_timeout + # Optional SSL connection timeout. # - # @option options [Integer] :continue_timeout (Spidr.continue_timeout) + # @param [Integer, nil] continue_timeout # Optional continue timeout. # - # @option options [Integer] :keep_alive_timeout (Spidr.keep_alive_timeout) - # Optional keep_alive timeout. + # @param [Integer, nil] keep_alive_timeout + # Optional `Keep-Alive` timeout. # - # @option options [Hash] :proxy (Spidr.proxy) + # @param [Spidr::Proxy, Hash, URI::HTTP, String, nil] proxy # The proxy information to use. 
# - # @option :proxy [String] :host + # @option proxy [String] :host # The host the proxy is running on. # - # @option :proxy [Integer] :port + # @option proxy [Integer] :port (8080) # The port the proxy is running on. # - # @option :proxy [String] :user + # @option proxy [String, nil] :user # The user to authenticate as with the proxy. # - # @option :proxy [String] :password + # @option proxy [String, nil] :password # The password to authenticate with. # - # @option options [Hash{String => String}] :default_headers - # Default headers to set for every request. + # @param [Integer] delay + # The number of seconds to pause between each request. # - # @option options [String] :host_header - # The HTTP Host header to use with each request. + # @param [Integer, nil] limit + # The maximum number of pages to visit. # - # @option options [Hash{String,Regexp => String}] :host_headers - # The HTTP Host headers to use for specific hosts. + # @param [Integer, nil] max_depth + # The maximum link depth to follow. # - # @option options [String] :user_agent (Spidr.user_agent) - # The User-Agent string to send with each requests. + # @param [Set, Array, nil] queue + # The initial queue of URLs to visit. # - # @option options [String] :referer - # The Referer URL to send with each request. + # @param [Set, Array, nil] history + # The initial list of visited URLs. # - # @option options [Integer] :delay (0) - # The number of seconds to pause between each request. + # @param [Boolean] strip_fragments + # Controls whether to strip the fragment components from the URLs. # - # @option options [Set, Array] :queue - # The initial queue of URLs to visit. + # @param [Boolean] strip_query + # Controls whether to strip the query components from the URLs. # - # @option options [Set, Array] :history - # The initial list of visited URLs. + # @param [Array<String>] schemes + # The list of acceptable URI schemes to visit. + # The `https` scheme will be ignored if `net/https` cannot be loaded. 
# - # @option options [Integer] :limit - # The maximum number of pages to visit. + # @param [String] host + # The host-name to visit. # - # @option options [Integer] :max_depth - # The maximum link depth to follow. + # @param [Array<String, Regexp, Proc>] hosts + # The patterns which match the host-names to visit. # - # @option options [Boolean] :robots (Spidr.robots?) + # @param [Array<String, Regexp, Proc>] ignore_hosts + # The patterns which match the host-names to not visit. + # + # @param [Array<Integer, Regexp, Proc>] ports + # The patterns which match the ports to visit. + # + # @param [Array<Integer, Regexp, Proc>] ignore_ports + # The patterns which match the ports to not visit. + # + # @param [Array<String, Regexp, Proc>] links + # The patterns which match the links to visit. + # + # @param [Array<String, Regexp, Proc>] ignore_links + # The patterns which match the links to not visit. + # + # @param [Array<String, Regexp, Proc>] urls + # The patterns which match the URLs to visit. + # + # @param [Array<String, Regexp, Proc>] ignore_urls + # The patterns which match the URLs to not visit. + # + # @param [Array<String, Regexp, Proc>] exts + # The patterns which match the URI path extensions to visit. + # + # @param [Array<String, Regexp, Proc>] ignore_exts + # The patterns which match the URI path extensions to not visit. + # + # @param [Boolean] robots # Specifies whether `robots.txt` should be honored. # # @yield [agent] # If a block is given, it will be passed the newly created agent # for further configuration. # # @yieldparam [Agent] agent # The newly created agent. 
# - # @see #initialize_sanitizers - # @see #initialize_filters - # @see #initialize_actions - # @see #initialize_events - # - def initialize(options={}) - @host_header = options[:host_header] - @host_headers = {} + def initialize(# header keyword arguments + host_header: nil, + host_headers: {}, + default_headers: {}, + user_agent: Spidr.user_agent, + referer: nil, + # session cache keyword arguments + proxy: Spidr.proxy, + open_timeout: Spidr.open_timeout, + ssl_timeout: Spidr.ssl_timeout, + read_timeout: Spidr.read_timeout, + continue_timeout: Spidr.continue_timeout, + keep_alive_timeout: Spidr.keep_alive_timeout, + # spidering controls keyword arguments + delay: 0, + limit: nil, + max_depth: nil, + # history keyword arguments + queue: nil, + history: nil, + # sanitizer keyword arguments + strip_fragments: true, + strip_query: false, + # filtering keyword arguments + schemes: self.class.default_schemes, + host: nil, + hosts: nil, + ignore_hosts: nil, + ports: nil, + ignore_ports: nil, + links: nil, + ignore_links: nil, + urls: nil, + ignore_urls: nil, + exts: nil, + ignore_exts: nil, + # robots keyword arguments + robots: Spidr.robots?) 
+ @host_header = host_header + @host_headers = host_headers - if options[:host_headers] - @host_headers.merge!(options[:host_headers]) - end + @default_headers = default_headers - @default_headers = {} + @user_agent = user_agent + @referer = referer - if options[:default_headers] - @default_headers.merge!(options[:default_headers]) - end - - @user_agent = options.fetch(:user_agent,Spidr.user_agent) - @referer = options[:referer] - - @sessions = SessionCache.new(options) + @sessions = SessionCache.new( + proxy: proxy, + open_timeout: open_timeout, + ssl_timeout: ssl_timeout, + read_timeout: read_timeout, + continue_timeout: continue_timeout, + keep_alive_timeout: keep_alive_timeout + ) @cookies = CookieJar.new @authorized = AuthStore.new @running = false - @delay = options.fetch(:delay,0) + @delay = delay @history = Set[] @failures = Set[] @queue = [] - @limit = options[:limit] + @limit = limit @levels = Hash.new(0) - @max_depth = options[:max_depth] + @max_depth = max_depth - if options[:queue] - self.queue = options[:queue] - end + self.queue = queue if queue + self.history = history if history - if options[:history] - self.history = options[:history] - end + initialize_sanitizers( + strip_fragments: strip_fragments, + strip_query: strip_query + ) - initialize_sanitizers(options) - initialize_filters(options) - initialize_actions(options) - initialize_events(options) + initialize_filters( + schemes: schemes, + host: host, + hosts: hosts, + ignore_hosts: ignore_hosts, + ports: ports, + ignore_ports: ignore_ports, + links: links, + ignore_links: ignore_links, + urls: urls, + ignore_urls: ignore_urls, + exts: exts, + ignore_exts: ignore_exts + ) + initialize_actions + initialize_events - if options.fetch(:robots,Spidr.robots?) - initialize_robots - end + initialize_robots if robots yield self if block_given? end # # Creates a new agent and begin spidering at the given URL. # # @param [URI::HTTP, String] url # The URL to start spidering at. 
# - # @param [Hash] options - # Additional options. See {Agent#initialize}. + # @param [Hash{Symbol => Object}] kwargs + # Additional keyword arguments. See {Agent#initialize}. # # @yield [agent] # If a block is given, it will be passed the newly created agent # before it begins spidering. # # @yieldparam [Agent] agent # The newly created agent. # + # @return [Agent] + # The created agent object. + # # @see #initialize # @see #start_at # - def self.start_at(url,options={},&block) - agent = new(options,&block) + def self.start_at(url,**kwargs,&block) + agent = new(**kwargs,&block) agent.start_at(url) + return agent end # # Creates a new agent and spiders the web-site located at the given URL. # # @param [URI::HTTP, String] url # The web-site to spider. # - # @param [Hash] options - # Additional options. See {Agent#initialize}. + # @param [Hash{Symbol => Object}] kwargs + # Additional keyword arguments. See {Agent#initialize}. # # @yield [agent] # If a block is given, it will be passed the newly created agent # before it begins spidering. # # @yieldparam [Agent] agent # The newly created agent. # + # @return [Agent] + # The created agent object. + # # @see #initialize # - def self.site(url,options={},&block) + def self.site(url,**kwargs,&block) url = URI(url) - agent = new(options.merge(host: url.host),&block) + agent = new(host: url.host, **kwargs, &block) agent.start_at(url) + return agent end # # Creates a new agent and spiders the given host. # # @param [String] name # The host-name to spider. # - # @param [Hash] options - # Additional options. See {Agent#initialize}. + # @param [Hash{Symbol => Object}] kwargs + # Additional keyword arguments. See {Agent#initialize}. # # @yield [agent] # If a block is given, it will be passed the newly created agent # before it begins spidering. # # @yieldparam [Agent] agent # The newly created agent. # + # @return [Agent] + # The created agent object. 
+ # # @see #initialize # - def self.host(name,options={},&block) - agent = new(options.merge(host: name),&block) + def self.host(name,**kwargs,&block) + agent = new(host: name, **kwargs, &block) agent.start_at(URI::HTTP.build(host: name, path: '/')) + return agent end # + # Creates a new agent and spiders the entire domain. + # + # @param [String] name + # The top-level domain to spider. + # + # @param [Hash{Symbol => Object}] kwargs + # Additional keyword arguments. See {Agent#initialize}. + # + # @yield [agent] + # If a block is given, it will be passed the newly created agent + # before it begins spidering. + # + # @yieldparam [Agent] agent + # The newly created agent. + # + # @return [Agent] + # The created agent object. + # + # @see #initialize + # + # @since 0.7.0 + # + def self.domain(name,**kwargs,&block) + agent = new(host: /(^|\.)#{Regexp.escape(name)}$/, **kwargs, &block) + agent.start_at(URI::HTTP.build(host: name, path: '/')) + return agent + end + + # # The proxy information the agent uses. # # @return [Proxy] # The proxy information. # @@ -312,14 +434,14 @@ end # # Sets the proxy information that the agent uses. # - # @param [Proxy] new_proxy + # @param [Proxy, Hash, URI::HTTP, String, nil] new_proxy # The new proxy information. # - # @return [Hash] + # @return [Proxy] # The new proxy information. # # @see SessionCache#proxy= # # @since 0.2.2 @@ -532,11 +654,11 @@ # Specifies whether the URL was enqueued, or ignored. # def enqueue(url,level=0) url = sanitize_url(url) - if (!(queued?(url)) && visit?(url)) + if (!queued?(url) && visit?(url)) link = url.to_s begin @every_url_blocks.each { |url_block| url_block.call(url) } @@ -631,10 +753,10 @@ return new_page end end # - # Visits a given URL, and enqueus the links recovered from the URL + # Visits a given URL, and enqueues the links recovered from the URL # to be visited later. # # @param [URI::HTTP, String] url # The URL to visit. #