lib/spidr/agent.rb in spidr-0.6.1 vs lib/spidr/agent.rb in spidr-0.7.0
- old
+ new
@@ -17,16 +17,16 @@
module Spidr
class Agent
include Settings::UserAgent
- # HTTP Host Header to use
+ # HTTP `Host` header to use
#
# @return [String]
attr_accessor :host_header
- # HTTP Host Headers to use for specific hosts
+ # HTTP `Host` Headers to use for specific hosts
#
# @return [Hash{String,Regexp => String}]
attr_reader :host_headers
# HTTP Headers to use for every request
@@ -94,212 +94,334 @@
attr_reader :levels
#
# Creates a new Agent object.
#
- # @param [Hash] options
- # Additional options
+ # @param [String, nil] host_header
+ # The HTTP `Host` header to use with each request.
#
- # @option options [Integer] :open_timeout (Spidr.open_timeout)
- # Optional open timeout.
+ # @param [Hash{String,Regexp => String}] host_headers
+ # The HTTP `Host` headers to use for specific hosts.
#
- # @option options [Integer] :read_timeout (Spidr.read_timeout)
+ # @param [Hash{String => String}] default_headers
+ # Default headers to set for every request.
+ #
+ # @param [String, nil] user_agent
+ #   The `User-Agent` string to send with each request.
+ #
+ # @param [String, nil] referer
+ # The `Referer` URL to send with each request.
+ #
+ # @param [Integer, nil] open_timeout
+ # Optional open connection timeout.
+ #
+ # @param [Integer, nil] read_timeout
# Optional read timeout.
#
- # @option options [Integer] :ssl_timeout (Spidr.ssl_timeout)
- # Optional ssl timeout.
+ # @param [Integer, nil] ssl_timeout
+ # Optional SSL connection timeout.
#
- # @option options [Integer] :continue_timeout (Spidr.continue_timeout)
+ # @param [Integer, nil] continue_timeout
# Optional continue timeout.
#
- # @option options [Integer] :keep_alive_timeout (Spidr.keep_alive_timeout)
- # Optional keep_alive timeout.
+ # @param [Integer, nil] keep_alive_timeout
+ # Optional `Keep-Alive` timeout.
#
- # @option options [Hash] :proxy (Spidr.proxy)
+ # @param [Spidr::Proxy, Hash, URI::HTTP, String, nil] proxy
# The proxy information to use.
#
- # @option :proxy [String] :host
+ # @option proxy [String] :host
# The host the proxy is running on.
#
- # @option :proxy [Integer] :port
+ # @option proxy [Integer] :port (8080)
# The port the proxy is running on.
#
- # @option :proxy [String] :user
+ # @option proxy [String, nil] :user
# The user to authenticate as with the proxy.
#
- # @option :proxy [String] :password
+ # @option proxy [String, nil] :password
# The password to authenticate with.
#
- # @option options [Hash{String => String}] :default_headers
- # Default headers to set for every request.
+ # @param [Integer] delay
+ # The number of seconds to pause between each request.
#
- # @option options [String] :host_header
- # The HTTP Host header to use with each request.
+ # @param [Integer, nil] limit
+ # The maximum number of pages to visit.
#
- # @option options [Hash{String,Regexp => String}] :host_headers
- # The HTTP Host headers to use for specific hosts.
+ # @param [Integer, nil] max_depth
+ # The maximum link depth to follow.
#
- # @option options [String] :user_agent (Spidr.user_agent)
- # The User-Agent string to send with each requests.
+ # @param [Set, Array, nil] queue
+ # The initial queue of URLs to visit.
#
- # @option options [String] :referer
- # The Referer URL to send with each request.
+ # @param [Set, Array, nil] history
+ # The initial list of visited URLs.
#
- # @option options [Integer] :delay (0)
- # The number of seconds to pause between each request.
+ # @param [Boolean] strip_fragments
+ # Controls whether to strip the fragment components from the URLs.
#
- # @option options [Set, Array] :queue
- # The initial queue of URLs to visit.
+ # @param [Boolean] strip_query
+ # Controls whether to strip the query components from the URLs.
#
- # @option options [Set, Array] :history
- # The initial list of visited URLs.
+ # @param [Array<String>] schemes
+ # The list of acceptable URI schemes to visit.
+ # The `https` scheme will be ignored if `net/https` cannot be loaded.
#
- # @option options [Integer] :limit
- # The maximum number of pages to visit.
+ # @param [String] host
+ # The host-name to visit.
#
- # @option options [Integer] :max_depth
- # The maximum link depth to follow.
+ # @param [Array<String, Regexp, Proc>] hosts
+ # The patterns which match the host-names to visit.
#
- # @option options [Boolean] :robots (Spidr.robots?)
+ # @param [Array<String, Regexp, Proc>] ignore_hosts
+ # The patterns which match the host-names to not visit.
+ #
+ # @param [Array<Integer, Regexp, Proc>] ports
+ # The patterns which match the ports to visit.
+ #
+ # @param [Array<Integer, Regexp, Proc>] ignore_ports
+ # The patterns which match the ports to not visit.
+ #
+ # @param [Array<String, Regexp, Proc>] links
+ # The patterns which match the links to visit.
+ #
+ # @param [Array<String, Regexp, Proc>] ignore_links
+ # The patterns which match the links to not visit.
+ #
+ # @param [Array<String, Regexp, Proc>] urls
+ # The patterns which match the URLs to visit.
+ #
+ # @param [Array<String, Regexp, Proc>] ignore_urls
+ # The patterns which match the URLs to not visit.
+ #
+ # @param [Array<String, Regexp, Proc>] exts
+ # The patterns which match the URI path extensions to visit.
+ #
+ # @param [Array<String, Regexp, Proc>] ignore_exts
+ # The patterns which match the URI path extensions to not visit.
+ #
+ # @param [Boolean] robots
# Specifies whether `robots.txt` should be honored.
#
# @yield [agent]
# If a block is given, it will be passed the newly created agent
# for further configuration.
#
# @yieldparam [Agent] agent
# The newly created agent.
#
- # @see #initialize_sanitizers
- # @see #initialize_filters
- # @see #initialize_actions
- # @see #initialize_events
- #
- def initialize(options={})
- @host_header = options[:host_header]
- @host_headers = {}
+ def initialize(# header keyword arguments
+ host_header: nil,
+ host_headers: {},
+ default_headers: {},
+ user_agent: Spidr.user_agent,
+ referer: nil,
+ # session cache keyword arguments
+ proxy: Spidr.proxy,
+ open_timeout: Spidr.open_timeout,
+ ssl_timeout: Spidr.ssl_timeout,
+ read_timeout: Spidr.read_timeout,
+ continue_timeout: Spidr.continue_timeout,
+ keep_alive_timeout: Spidr.keep_alive_timeout,
+ # spidering controls keyword arguments
+ delay: 0,
+ limit: nil,
+ max_depth: nil,
+ # history keyword arguments
+ queue: nil,
+ history: nil,
+ # sanitizer keyword arguments
+ strip_fragments: true,
+ strip_query: false,
+ # filtering keyword arguments
+ schemes: self.class.default_schemes,
+ host: nil,
+ hosts: nil,
+ ignore_hosts: nil,
+ ports: nil,
+ ignore_ports: nil,
+ links: nil,
+ ignore_links: nil,
+ urls: nil,
+ ignore_urls: nil,
+ exts: nil,
+ ignore_exts: nil,
+ # robots keyword arguments
+ robots: Spidr.robots?)
+ @host_header = host_header
+ @host_headers = host_headers
- if options[:host_headers]
- @host_headers.merge!(options[:host_headers])
- end
+ @default_headers = default_headers
- @default_headers = {}
+ @user_agent = user_agent
+ @referer = referer
- if options[:default_headers]
- @default_headers.merge!(options[:default_headers])
- end
-
- @user_agent = options.fetch(:user_agent,Spidr.user_agent)
- @referer = options[:referer]
-
- @sessions = SessionCache.new(options)
+ @sessions = SessionCache.new(
+ proxy: proxy,
+ open_timeout: open_timeout,
+ ssl_timeout: ssl_timeout,
+ read_timeout: read_timeout,
+ continue_timeout: continue_timeout,
+ keep_alive_timeout: keep_alive_timeout
+ )
@cookies = CookieJar.new
@authorized = AuthStore.new
@running = false
- @delay = options.fetch(:delay,0)
+ @delay = delay
@history = Set[]
@failures = Set[]
@queue = []
- @limit = options[:limit]
+ @limit = limit
@levels = Hash.new(0)
- @max_depth = options[:max_depth]
+ @max_depth = max_depth
- if options[:queue]
- self.queue = options[:queue]
- end
+ self.queue = queue if queue
+ self.history = history if history
- if options[:history]
- self.history = options[:history]
- end
+ initialize_sanitizers(
+ strip_fragments: strip_fragments,
+ strip_query: strip_query
+ )
- initialize_sanitizers(options)
- initialize_filters(options)
- initialize_actions(options)
- initialize_events(options)
+ initialize_filters(
+ schemes: schemes,
+ host: host,
+ hosts: hosts,
+ ignore_hosts: ignore_hosts,
+ ports: ports,
+ ignore_ports: ignore_ports,
+ links: links,
+ ignore_links: ignore_links,
+ urls: urls,
+ ignore_urls: ignore_urls,
+ exts: exts,
+ ignore_exts: ignore_exts
+ )
+ initialize_actions
+ initialize_events
- if options.fetch(:robots,Spidr.robots?)
- initialize_robots
- end
+ initialize_robots if robots
yield self if block_given?
end
#
# Creates a new agent and begin spidering at the given URL.
#
# @param [URI::HTTP, String] url
# The URL to start spidering at.
#
- # @param [Hash] options
- # Additional options. See {Agent#initialize}.
+ # @param [Hash{Symbol => Object}] kwargs
+ # Additional keyword arguments. See {Agent#initialize}.
#
# @yield [agent]
# If a block is given, it will be passed the newly created agent
# before it begins spidering.
#
# @yieldparam [Agent] agent
# The newly created agent.
#
+ # @return [Agent]
+ # The created agent object.
+ #
# @see #initialize
# @see #start_at
#
- def self.start_at(url,options={},&block)
- agent = new(options,&block)
+ def self.start_at(url,**kwargs,&block)
+ agent = new(**kwargs,&block)
agent.start_at(url)
+ return agent
end
#
# Creates a new agent and spiders the web-site located at the given URL.
#
# @param [URI::HTTP, String] url
# The web-site to spider.
#
- # @param [Hash] options
- # Additional options. See {Agent#initialize}.
+ # @param [Hash{Symbol => Object}] kwargs
+ # Additional keyword arguments. See {Agent#initialize}.
#
# @yield [agent]
# If a block is given, it will be passed the newly created agent
# before it begins spidering.
#
# @yieldparam [Agent] agent
# The newly created agent.
#
+ # @return [Agent]
+ # The created agent object.
+ #
# @see #initialize
#
- def self.site(url,options={},&block)
+ def self.site(url,**kwargs,&block)
url = URI(url)
- agent = new(options.merge(host: url.host),&block)
+ agent = new(host: url.host, **kwargs, &block)
agent.start_at(url)
+ return agent
end
#
# Creates a new agent and spiders the given host.
#
# @param [String] name
# The host-name to spider.
#
- # @param [Hash] options
- # Additional options. See {Agent#initialize}.
+ # @param [Hash{Symbol => Object}] kwargs
+ # Additional keyword arguments. See {Agent#initialize}.
#
# @yield [agent]
# If a block is given, it will be passed the newly created agent
# before it begins spidering.
#
# @yieldparam [Agent] agent
# The newly created agent.
#
+ # @return [Agent]
+ # The created agent object.
+ #
# @see #initialize
#
- def self.host(name,options={},&block)
- agent = new(options.merge(host: name),&block)
+ def self.host(name,**kwargs,&block)
+ agent = new(host: name, **kwargs, &block)
agent.start_at(URI::HTTP.build(host: name, path: '/'))
+ return agent
end
#
+ # Creates a new agent and spiders the entire domain.
+ #
+ # @param [String] name
+ # The top-level domain to spider.
+ #
+ # @param [Hash{Symbol => Object}] kwargs
+ # Additional keyword arguments. See {Agent#initialize}.
+ #
+ # @yield [agent]
+ # If a block is given, it will be passed the newly created agent
+ # before it begins spidering.
+ #
+ # @yieldparam [Agent] agent
+ # The newly created agent.
+ #
+ # @return [Agent]
+ # The created agent object.
+ #
+ # @see #initialize
+ #
+ # @since 0.7.0
+ #
+ def self.domain(name,**kwargs,&block)
+ agent = new(host: /(^|\.)#{Regexp.escape(name)}$/, **kwargs, &block)
+ agent.start_at(URI::HTTP.build(host: name, path: '/'))
+ return agent
+ end
+
+ #
# The proxy information the agent uses.
#
# @return [Proxy]
# The proxy information.
#
@@ -312,14 +434,14 @@
end
#
# Sets the proxy information that the agent uses.
#
- # @param [Proxy] new_proxy
+ # @param [Proxy, Hash, URI::HTTP, String, nil] new_proxy
# The new proxy information.
#
- # @return [Hash]
+ # @return [Proxy]
# The new proxy information.
#
# @see SessionCache#proxy=
#
# @since 0.2.2
@@ -532,11 +654,11 @@
# Specifies whether the URL was enqueued, or ignored.
#
def enqueue(url,level=0)
url = sanitize_url(url)
- if (!(queued?(url)) && visit?(url))
+ if (!queued?(url) && visit?(url))
link = url.to_s
begin
@every_url_blocks.each { |url_block| url_block.call(url) }
@@ -631,10 +753,10 @@
return new_page
end
end
#
- # Visits a given URL, and enqueus the links recovered from the URL
+ # Visits a given URL, and enqueues the links recovered from the URL
# to be visited later.
#
# @param [URI::HTTP, String] url
# The URL to visit.
#