lib/spidr/agent.rb in spidr-0.1.9 vs lib/spidr/agent.rb in spidr-0.2.0
- old
+ new
@@ -1,14 +1,21 @@
-require 'spidr/rules'
+require 'spidr/filters'
+require 'spidr/events'
+require 'spidr/actions'
require 'spidr/page'
require 'spidr/spidr'
require 'net/http'
+require 'set'
module Spidr
class Agent
+ include Filters
+ include Events
+ include Actions
+
# Proxy to use
attr_accessor :proxy
# User-Agent to use
attr_accessor :user_agent
@@ -17,151 +24,144 @@
attr_accessor :referer
# Delay in between fetching pages
attr_accessor :delay
- # List of acceptable URL schemes to follow
- attr_reader :schemes
-
# History containing visited URLs
attr_reader :history
# List of unreachable URLs
attr_reader :failures
# Queue of URLs to visit
attr_reader :queue
#
- # Creates a new Agent object with the given _options_ and _block_.
- # If a _block_ is given, it will be passed the newly created
- # Agent object.
+ # Creates a new Agent object.
#
- # _options_ may contain the following keys:
- # <tt>:proxy</tt>:: The proxy to use while spidering.
- # <tt>:user_agent</tt>:: The User-Agent string to send.
- # <tt>:referer</tt>:: The referer URL to send.
- # <tt>:delay</tt>:: Duration in seconds to pause between spidering each
- # link. Defaults to 0.
- # <tt>:schemes</tt>:: The list of acceptable URL schemes to follow.
- # Defaults to +http+ and +https+. +https+ URL
- # schemes will be ignored if <tt>net/http</tt>
- # cannot be loaded.
- # <tt>:host</tt>:: The host-name to visit.
- # <tt>:hosts</tt>:: An +Array+ of host patterns to visit.
- # <tt>:ignore_hosts</tt>:: An +Array+ of host patterns to not visit.
- # <tt>:ports</tt>:: An +Array+ of port patterns to visit.
- # <tt>:ignore_ports</tt>:: An +Array+ of port patterns to not visit.
- # <tt>:links</tt>:: An +Array+ of link patterns to visit.
- # <tt>:ignore_links</tt>:: An +Array+ of link patterns to not visit.
- # <tt>:exts</tt>:: An +Array+ of File extension patterns to visit.
- # <tt>:ignore_exts</tt>:: An +Array+ of File extension patterns to not
- # visit.
- # <tt>:queue</tt>:: An initial queue of URLs to visit.
- # <tt>:history</tt>:: An initial list of visited URLs.
+ # @param [Hash] options
+ # Additional options
#
+ # @option options [Hash] :proxy (Spidr.proxy)
+ # The proxy information to use.
+ #
+ # @option :proxy [String] :host
+ # The host the proxy is running on.
+ #
+ # @option :proxy [Integer] :port
+ # The port the proxy is running on.
+ #
+ # @option :proxy [String] :user
+ # The user to authenticate as with the proxy.
+ #
+ # @option :proxy [String] :password
+ # The password to authenticate with.
+ #
+ # @option options [String] :user_agent (Spidr.user_agent)
+ # The User-Agent string to send with each request.
+ #
+ # @option options [String] :referer
+ # The Referer URL to send with each request.
+ #
+ # @option options [Integer] :delay (0)
+ # The number of seconds to pause between each request.
+ #
+ # @option options [Set, Array] :queue
+ # The initial queue of URLs to visit.
+ #
+ # @option options [Set, Array] :history
+ # The initial list of visited URLs.
+ #
+ # @yield [agent]
+ # If a block is given, it will be passed the newly created agent
+ # for further configuration.
+ #
+ # @yieldparam [Agent] agent
+ # The newly created agent.
+ #
def initialize(options={},&block)
@proxy = (options[:proxy] || Spidr.proxy)
@user_agent = (options[:user_agent] || Spidr.user_agent)
@referer = options[:referer]
- @schemes = []
-
- if options[:schemes]
- @schemes += options[:schemes]
- else
- @schemes << 'http'
-
- begin
- require 'net/https'
-
- @schemes << 'https'
- rescue Gem::LoadError => e
- raise(e)
- rescue ::LoadError
- STDERR.puts "Warning: cannot load 'net/https', https support disabled"
- end
- end
-
- @host_rules = Rules.new(
- :accept => options[:hosts],
- :reject => options[:ignore_hosts]
- )
- @port_rules = Rules.new(
- :accept => options[:ports],
- :reject => options[:ignore_ports]
- )
- @link_rules = Rules.new(
- :accept => options[:links],
- :reject => options[:ignore_links]
- )
- @ext_rules = Rules.new(
- :accept => options[:exts],
- :reject => options[:ignore_exts]
- )
-
- @every_url_blocks = []
- @every_failed_url_blocks = []
- @urls_like_blocks = Hash.new { |hash,key| hash[key] = [] }
-
- @every_page_blocks = []
-
+ @running = false
@delay = (options[:delay] || 0)
- @history = []
- @failures = []
+ @history = Set[]
+ @failures = Set[]
@queue = []
- @paused = true
- if options[:host]
- visit_hosts_like(options[:host])
- end
+ @sessions = {}
- if options[:queue]
- self.queue = options[:queue]
- end
+ super(options)
- if options[:history]
- self.history = options[:history]
- end
-
block.call(self) if block
end
#
- # Creates a new Agent object with the given _options_ and will begin
- # spidering at the specified _url_. If a _block_ is given it will be
- # passed the newly created Agent object, before the agent begins
- # spidering.
+ # Creates a new agent and begins spidering at the given URL.
#
+ # @param [URI::HTTP, String] url
+ # The URL to start spidering at.
+ #
+ # @param [Hash] options
+ # Additional options. See {Agent#initialize}.
+ #
+ # @yield [agent]
+ # If a block is given, it will be passed the newly created agent
+ # before it begins spidering.
+ #
+ # @yieldparam [Agent] agent
+ # The newly created agent.
+ #
def self.start_at(url,options={},&block)
self.new(options) do |spider|
block.call(spider) if block
spider.start_at(url)
end
end
#
- # Creates a new Agent object with the given _options_ and will begin
- # spidering the specified host _name_. If a _block_ is given it will be
- # passed the newly created Agent object, before the agent begins
- # spidering.
+ # Creates a new agent and spiders the given host.
#
+ # @param [String] name
+ # The host-name to spider.
+ #
+ # @param [Hash] options
+ # Additional options. See {Agent#initialize}.
+ #
+ # @yield [agent]
+ # If a block is given, it will be passed the newly created agent
+ # before it begins spidering.
+ #
+ # @yieldparam [Agent] agent
+ # The newly created agent.
+ #
def self.host(name,options={},&block)
self.new(options.merge(:host => name)) do |spider|
block.call(spider) if block
spider.start_at("http://#{name}/")
end
end
#
- # Creates a new Agent object with the given _options_ and will begin
- # spidering the host of the specified _url_. If a _block_ is given it
- # will be passed the newly created Agent object, before the agent
- # begins spidering.
+ # Creates a new agent and spiders the web-site located at the given URL.
#
+ # @param [URI::HTTP, String] url
+ # The web-site to spider.
+ #
+ # @param [Hash] options
+ # Additional options. See {Agent#initialize}.
+ #
+ # @yield [agent]
+ # If a block is given, it will be passed the newly created agent
+ # before it begins spidering.
+ #
+ # @yieldparam [Agent] agent
+ # The newly created agent.
+ #
def self.site(url,options={},&block)
url = URI(url.to_s)
return self.new(options.merge(:host => url.host)) do |spider|
block.call(spider) if block
@@ -169,528 +169,469 @@
spider.start_at(url)
end
end
#
- # Returns the +Array+ of host patterns to visit.
+ # Clears the queue, history and failures of the agent.
#
- def visit_hosts
- @host_rules.accept
- end
-
- #
- # Adds the given _pattern_ to the visit_hosts. If a _block_ is given,
- # it will be added to the visit_hosts.
- #
- def visit_hosts_like(pattern=nil,&block)
- if pattern
- visit_hosts << pattern
- elsif block
- visit_hosts << block
- end
-
+ def clear
+ @queue.clear
+ @history.clear
+ @failures.clear
return self
end
#
- # Returns the +Array+ of URL host patterns to not visit.
+ # Start spidering at a given URL.
#
- def ignore_hosts
- @host_rules.reject
- end
-
+ # @param [URI::HTTP, String] url
+ # The URL to start spidering at.
#
- # Adds the given _pattern_ to the ignore_hosts. If a _block_ is given,
- # it will be added to the ignore_hosts.
+ # @yield [page]
+ # If a block is given, it will be passed every page visited.
#
- def ignore_hosts_like(pattern=nil,&block)
- if pattern
- ignore_hosts << pattern
- elsif block
- ignore_hosts << block
- end
+ # @yieldparam [Page] page
+ # A page which has been visited.
+ #
+ def start_at(url,&block)
+ enqueue(url)
- return self
+ return run(&block)
end
#
- # Returns the +Array+ of URL port patterns to visit.
+ # Start spidering until the queue becomes empty or the agent is
+ # paused.
#
- def visit_ports
- @port_rules.accept
- end
-
+ # @yield [page]
+ # If a block is given, it will be passed every page visited.
#
- # Adds the given _pattern_ to the visit_ports. If a _block_ is given,
- # it will be added to the visit_ports.
+ # @yieldparam [Page] page
+ # A page which has been visited.
#
- def visit_ports_like(pattern=nil,&block)
- if pattern
- visit_ports << pattern
- elsif block
- visit_ports << block
+ def run(&block)
+ @running = true
+
+ until (@queue.empty? || paused?)
+ begin
+ visit_page(dequeue,&block)
+ rescue Actions::Paused
+ return self
+ rescue Actions::Action
+ end
end
- return self
- end
+ @running = false
- #
- # Returns the +Array+ of URL port patterns to not visit.
- #
- def ignore_ports
- @port_rules.reject
- end
-
- #
- # Adds the given _pattern_ to the ignore_hosts. If a _block_ is given,
- # it will be added to the ignore_hosts.
- #
- def ignore_ports_like(pattern=nil,&block)
- if pattern
- ignore_ports << pattern
- elsif block
- ignore_ports << block
+ @sessions.each_value do |sess|
+ begin
+ sess.finish
+ rescue IOError
+ nil
+ end
end
+ @sessions.clear
return self
end
#
- # Returns the +Array+ of link patterns to visit.
+ # Determines if the agent is running.
#
- def visit_links
- @link_rules.accept
+ # @return [Boolean]
+ # Specifies whether the agent is running or stopped.
+ #
+ def running?
+ @running == true
end
#
- # Adds the given _pattern_ to the visit_links. If a _block_ is given,
- # it will be added to the visit_links.
+ # Sets the history of URLs that were previously visited.
#
- def visit_links_like(pattern=nil,&block)
- if pattern
- visit_links << pattern
- elsif block
- visit_links << block
+ # @param [#each] new_history
+ # A list of URLs to populate the history with.
+ #
+ # @return [Set<URI::HTTP>]
+ # The history of the agent.
+ #
+ # @example
+ # agent.history = ['http://tenderlovemaking.com/2009/05/06/ann-nokogiri-130rc1-has-been-released/']
+ #
+ def history=(new_history)
+ @history.clear
+
+ new_history.each do |url|
+ @history << unless url.kind_of?(URI)
+ URI(url.to_s)
+ else
+ url
+ end
end
- return self
+ return @history
end
+ alias visited_urls history
+
#
- # Returns the +Array+ of link patterns to not visit.
+ # Specifies the links which have been visited.
#
- def ignore_links
- @link_rules.reject
+ # @return [Array<String>]
+ # The links which have been visited.
+ #
+ def visited_links
+ @history.map { |url| url.to_s }
end
#
- # Adds the given _pattern_ to the ignore_links. If a _block_ is given,
- # it will be added to the ignore_links.
+ # Specifies all hosts that were visited.
#
- def ignore_links_like(pattern=nil,&block)
- if pattern
- ignore_links << pattern
- elsif block
- ignore_links << block
- end
-
- return self
+ # @return [Array<String>]
+ # The hosts which have been visited.
+ #
+ def visited_hosts
+ visited_urls.map { |uri| uri.host }.uniq
end
#
- # Returns the +Array+ of URL extension patterns to visit.
+ # Determines whether a URL was visited or not.
#
- def visit_exts
- @ext_rules.accept
- end
-
+ # @param [URI::HTTP, String] url
+ # The URL to search for.
#
- # Adds the given _pattern_ to the visit_exts. If a _block_ is given,
- # it will be added to the visit_exts.
+ # @return [Boolean]
+ # Specifies whether a URL was visited.
#
- def visit_exts_like(pattern=nil,&block)
- if pattern
- visit_exts << pattern
- elsif block
- visit_exts << block
- end
+ def visited?(url)
+ url = URI(url.to_s) unless url.kind_of?(URI)
- return self
+ return @history.include?(url)
end
#
- # Returns the +Array+ of URL extension patterns to not visit.
+ # Sets the list of failed URLs.
#
- def ignore_exts
- @ext_rules.reject
- end
-
+ # @param [#each] new_failures
+ # The new list of failed URLs.
#
- # Adds the given _pattern_ to the ignore_exts. If a _block_ is given,
- # it will be added to the ignore_exts.
+ # @return [Set<URI::HTTP>]
+ # The list of failed URLs.
#
- def ignore_exts_like(pattern=nil,&block)
- if pattern
- ignore_exts << pattern
- elsif block
- ignore_exts << block
+ # @example
+ # agent.failures = ['http://localhost/']
+ #
+ def failures=(new_failures)
+ @failures.clear
+
+ new_failures.each do |url|
+ @failures << unless url.kind_of?(URI)
+ URI(url.to_s)
+ else
+ url
+ end
end
- return self
+ return @failures
end
#
- # For every URL that the agent visits it will be passed to the
- # specified _block_.
+ # Determines whether a given URL could not be visited.
#
- def every_url(&block)
- @every_url_blocks << block
- return self
- end
-
+ # @param [URI::HTTP, String] url
+ # The URL to check for failures.
#
- # For every URL that the agent is unable to visit, it will be passed
- # to the specified _block_.
+ # @return [Boolean]
+ # Specifies whether the given URL was unable to be visited.
#
- def every_failed_url(&block)
- @every_failed_url_blocks << block
- return self
- end
+ def failed?(url)
+ url = URI(url.to_s) unless url.kind_of?(URI)
- #
- # For every URL that the agent visits and matches the specified
- # _pattern_, it will be passed to the specified _block_.
- #
- def urls_like(pattern,&block)
- @urls_like_blocks[pattern] << block
- return self
+ return @failures.include?(url)
end
- #
- # For every Page that the agent visits, pass the page to the
- # specified _block_.
- #
- def every_page(&block)
- @every_page_blocks << block
- return self
- end
+ alias pending_urls queue
#
- # For every Page that the agent visits, pass the headers to the given
- # _block_.
+ # Sets the queue of URLs to visit.
#
- def all_headers(&block)
- every_page { |page| block.call(page.headers) }
- end
-
+ # @param [#each] new_queue
+ # The new list of URLs to visit.
#
- # Clears the history of the agent.
+ # @return [Array<URI::HTTP>]
+ # The list of URLs to visit.
#
- def clear
+ # @example
+ # agent.queue = ['http://www.vimeo.com/', 'http://www.reddit.com/']
+ #
+ def queue=(new_queue)
@queue.clear
- @history.clear
- @failures.clear
- return self
- end
- #
- # Start spidering at the specified _url_.
- #
- def start_at(url)
- enqueue(url)
-
- return continue!
- end
-
- #
- # Start spidering until the queue becomes empty or the agent is
- # paused.
- #
- def run
- until (@queue.empty? || @paused == true)
- visit_page(dequeue)
+ new_queue.each do |url|
+ @queue << unless url.kind_of?(URI)
+ URI(url.to_s)
+ else
+ url
+ end
end
- return self
+ return @queue
end
#
- # Continue spidering.
+ # Determines whether a given URL has been enqueued.
#
- def continue!
- @paused = false
- return run
- end
-
+ # @param [URI::HTTP] url
+ # The URL to search for in the queue.
#
- # Returns +true+ if the agent is still spidering, returns +false+
- # otherwise.
+ # @return [Boolean]
+ # Specifies whether the given URL has been queued for visiting.
#
- def running?
- @paused == false
+ def queued?(url)
+ @queue.include?(url)
end
#
- # Returns +true+ if the agent is paused, returns +false+ otherwise.
+ # Enqueues a given URL for visiting, only if it passes all of the
+ # agent's rules for visiting a given URL.
#
- def paused?
- @paused == true
- end
-
+ # @param [URI::HTTP, String] url
+ # The URL to enqueue for visiting.
#
- # Pauses the agent, causing spidering to temporarily stop.
+ # @return [Boolean]
+ # Specifies whether the URL was enqueued, or ignored.
#
- def pause!
- @paused = true
- return self
- end
+ def enqueue(url)
+ link = url.to_s
+ url = URI(link) unless url.kind_of?(URI)
- #
- # Sets the list of acceptable URL schemes to follow to the
- # _new_schemes_.
- #
- # agent.schemes = ['http']
- #
- def schemes=(new_schemes)
- @schemes = new_schemes.map { |scheme| scheme.to_s }
- end
+ if (!(queued?(url)) && visit?(url))
+ begin
+ @every_url_blocks.each { |block| block.call(url) }
- #
- # Sets the history of links that were previously visited to the
- # specified _new_history_.
- #
- # agent.history = ['http://tenderlovemaking.com/2009/05/06/ann-nokogiri-130rc1-has-been-released/']
- #
- def history=(new_history)
- @history = new_history.map do |url|
- unless url.kind_of?(URI)
- URI(url.to_s)
- else
- url
+ @urls_like_blocks.each do |pattern,blocks|
+ if ((pattern.kind_of?(Regexp) && link =~ pattern) || pattern == link || pattern == url)
+ blocks.each { |url_block| url_block.call(url) }
+ end
+ end
+ rescue Actions::Paused => action
+ raise(action)
+ rescue Actions::SkipLink
+ return false
+ rescue Actions::Action
end
+
+ @queue << url
+ return true
end
- end
- alias visited_urls history
-
- #
- # Returns the +Array+ of visited URLs.
- #
- def visited_links
- @history.map { |uri| uri.to_s }
+ return false
end
#
- # Return the +Array+ of hosts that were visited.
+ # Requests and creates a new Page object from a given URL.
#
- def visited_hosts
- @history.map { |uri| uri.host }.uniq
- end
-
+ # @param [URI::HTTP] url
+ # The URL to request.
#
- # Returns +true+ if the specified _url_ was visited, returns +false+
- # otherwise.
+ # @yield [page]
+ # If a block is given, it will be passed the page that represents the
+ # response.
#
- def visited?(url)
- url = URI(url) unless url.kind_of?(URI)
-
- return @history.include?(url)
- end
-
+ # @yieldparam [Page] page
+ # The page for the response.
#
- # Returns +true+ if the specified _url_ was unable to be visited,
- # returns +false+ otherwise.
+ # @return [Page, nil]
+ # The page for the response, or +nil+ if the request failed.
#
- def failed?(url)
- url = URI(url) unless url.kind_of?(URI)
-
- return @failures.include?(url)
- end
-
- alias pending_urls queue
-
- #
- # Creates a new Page object from the specified _url_. If a _block_ is
- # given, it will be passed the newly created Page object.
- #
def get_page(url,&block)
+ url = URI(url.to_s) unless url.kind_of?(URI)
+
host = url.host
port = url.port
unless url.path.empty?
path = url.path
else
path = '/'
end
- proxy_host = @proxy[:host]
- proxy_port = @proxy[:port]
- proxy_user = @proxy[:user]
- proxy_password = @proxy[:password]
+ # append the URL query to the path
+ path += "?#{url.query}" if url.query
begin
- Net::HTTP::Proxy(proxy_host,proxy_port,proxy_user,proxy_password).start(host,port) do |sess|
+ get_session(url.scheme,host,port) do |sess|
headers = {}
-
headers['User-Agent'] = @user_agent if @user_agent
headers['Referer'] = @referer if @referer
new_page = Page.new(url,sess.get(path,headers))
block.call(new_page) if block
return new_page
end
- rescue SystemCallError, Net::HTTPBadResponse
+ rescue SystemCallError, Timeout::Error, Net::HTTPBadResponse, IOError
failed(url)
+ kill_session(url.scheme,host,port)
return nil
end
end
#
- # Returns the agent represented as a Hash containing the agents
- # +history+ and +queue+ information.
+ # Visits a given URL, and enqueues the links recovered from the URL
+ # to be visited later.
#
- def to_hash
- {:history => @history, :queue => @queue}
- end
-
+ # @param [URI::HTTP, String] url
+ # The URL to visit.
#
- # Sets the queue of links to visit to the specified _new_queue_.
+ # @yield [page]
+ # If a block is given, it will be passed the page which was visited.
#
- # agent.queue = ['http://www.vimeo.com/', 'http://www.reddit.com/']
+ # @yieldparam [Page] page
+ # The page which was visited.
#
- def queue=(new_queue)
- @queue = new_queue.map do |url|
- unless url.kind_of?(URI)
- URI(url.to_s)
- else
- url
+ # @return [Page, nil]
+ # The page that was visited. If +nil+ is returned, either the request
+ # for the page failed, or the page was skipped.
+ #
+ def visit_page(url,&block)
+ url = URI(url.to_s) unless url.kind_of?(URI)
+
+ get_page(url) do |page|
+ @history << page.url
+
+ begin
+ @every_page_blocks.each { |page_block| page_block.call(page) }
+
+ block.call(page) if block
+ rescue Actions::Paused => action
+ raise(action)
+ rescue Actions::SkipPage
+ return nil
+ rescue Actions::Action
end
+
+ page.urls.each { |next_url| enqueue(next_url) }
end
end
#
- # Returns +true+ if the specified _url_ is queued for visiting, returns
- # +false+ otherwise.
+ # Converts the agent into a Hash.
#
- def queued?(url)
- @queue.include?(url)
+ # @return [Hash]
+ # The agent represented as a Hash containing the +history+ and
+ # the +queue+ of the agent.
+ #
+ def to_hash
+ {:history => @history, :queue => @queue}
end
+ protected
+
#
- # Enqueues the specified _url_ for visiting, only if it passes all the
- # agent's rules for visiting a given URL. Returns +true+ if the _url_
- # was successfully enqueued, returns +false+ otherwise.
+ # Provides an active HTTP session for the given scheme, host
+ # and port.
#
- def enqueue(url)
- link = url.to_s
- url = URI(link)
+ # @param [String] scheme
+ # The scheme of the URL, which will be requested later.
+ #
+ # @param [String] host
+ # The host that the session is needed with.
+ #
+ # @param [Integer] port
+ # The port that the session is needed for.
+ #
+ # @yield [session]
+ # If a block is given, it will be passed the active HTTP session.
+ #
+ # @yieldparam [Net::HTTP] session
+ # The active HTTP session object.
+ #
+ def get_session(scheme,host,port,&block)
+ key = [scheme,host,port]
- if (!(queued?(url)) && visit?(url))
- @every_url_blocks.each { |block| block.call(url) }
+ unless @sessions[key]
+ session = Net::HTTP::Proxy(
+ @proxy[:host],
+ @proxy[:port],
+ @proxy[:user],
+ @proxy[:password]
+ ).new(host,port)
- @urls_like_blocks.each do |pattern,blocks|
- if ((pattern.kind_of?(Regexp) && link =~ pattern) || pattern == link || pattern == url)
- blocks.each { |url_block| url_block.call(url) }
- end
+ if scheme == 'https'
+ session.use_ssl = true
+ session.verify_mode = OpenSSL::SSL::VERIFY_NONE
end
- @queue << url
- return true
+ @sessions[key] = session
end
- return false
+ session = @sessions[key]
+ block.call(session) if block
+ return session
end
- protected
-
#
- # Dequeues a URL that will later be visited.
+ # Destroys an HTTP session for the given scheme, host and port.
#
- def dequeue
- @queue.shift
- end
-
+ # @param [String] scheme
+ # The scheme of the URL, which was requested through the session.
#
- # Returns +true+ if the specified _url_ should be visited, based on
- # it's scheme, returns +false+ otherwise.
+ # @param [String] host
+ # The host that the session was connected with.
#
- def visit_scheme?(url)
- if url.scheme
- return @schemes.include?(url.scheme)
- else
- return true
+ # @param [Integer] port
+ # The port that the session was connected to.
+ #
+ def kill_session(scheme,host,port,&block)
+ key = [scheme,host,port]
+ sess = @sessions[key]
+
+ begin
+ sess.finish
+ rescue IOError
+ nil
end
- end
- #
- # Returns +true+ if the specified _url_ should be visited, based on
- # the host of the _url_, returns +false+ otherwise.
- #
- def visit_host?(url)
- @host_rules.accept?(url.host)
+ @sessions.delete(key)
+ block.call if block
+ return nil
end
#
- # Returns +true+ if the specified _url_ should be visited, based on
- # the port of the _url_, returns +false+ otherwise.
+ # Dequeues a URL that will later be visited.
#
- def visit_port?(url)
- @port_rules.accept?(url.port)
- end
-
+ # @return [URI::HTTP]
+ # The URL that was at the front of the queue.
#
- # Returns +true+ if the specified _url_ should be visited, based on
- # the pattern of the _url_, returns +false+ otherwise.
- #
- def visit_link?(url)
- @link_rules.accept?(url.to_s)
+ def dequeue
+ @queue.shift
end
#
- # Returns +true+ if the specified _url_ should be visited, based on
- # the file extension of the _url_, returns +false+ otherwise.
+ # Determines if a given URL should be visited.
#
- def visit_ext?(url)
- @ext_rules.accept?(File.extname(url.path)[1..-1])
- end
-
+ # @param [URI::HTTP] url
+ # The URL in question.
#
- # Returns +true+ if the specified URL should be visited, returns
- # +false+ otherwise.
+ # @return [Boolean]
+ # Specifies whether the given URL should be visited.
#
def visit?(url)
(!(visited?(url)) &&
- visit_scheme?(url) &&
- visit_host?(url) &&
- visit_port?(url) &&
- visit_link?(url) &&
- visit_ext?(url))
+ visit_scheme?(url.scheme) &&
+ visit_host?(url.host) &&
+ visit_port?(url.port) &&
+ visit_link?(url.to_s) &&
+ visit_ext?(url.path))
end
#
- # Visits the spedified _url_ and enqueus it's links for visiting. If a
- # _block_ is given, it will be passed a newly created Page object
- # for the specified _url_.
+ # Adds a given URL to the failures list.
#
- def visit_page(url,&block)
- get_page(url) do |page|
- @history << page.url
-
- page.urls.each { |next_url| enqueue(next_url) }
-
- @every_page_blocks.each { |page_block| page_block.call(page) }
-
- block.call(page) if block
- end
- end
-
+ # @param [URI::HTTP] url
+ # The URL to add to the failures list.
#
- # Adds the specified _url_ to the failures list.
- #
def failed(url)
- url = URI(url.to_s) unless url.kind_of?(URI)
-
@every_failed_url_blocks.each { |block| block.call(url) }
@failures << url
return true
end