lib/spidr/agent.rb in spidr-0.1.9 vs lib/spidr/agent.rb in spidr-0.2.0

- old
+ new

@@ -1,14 +1,21 @@ -require 'spidr/rules' +require 'spidr/filters' +require 'spidr/events' +require 'spidr/actions' require 'spidr/page' require 'spidr/spidr' require 'net/http' +require 'set' module Spidr class Agent + include Filters + include Events + include Actions + # Proxy to use attr_accessor :proxy # User-Agent to use attr_accessor :user_agent @@ -17,151 +24,144 @@ attr_accessor :referer # Delay in between fetching pages attr_accessor :delay - # List of acceptable URL schemes to follow - attr_reader :schemes - # History containing visited URLs attr_reader :history # List of unreachable URLs attr_reader :failures # Queue of URLs to visit attr_reader :queue # - # Creates a new Agent object with the given _options_ and _block_. - # If a _block_ is given, it will be passed the newly created - # Agent object. + # Creates a new Agent object. # - # _options_ may contain the following keys: - # <tt>:proxy</tt>:: The proxy to use while spidering. - # <tt>:user_agent</tt>:: The User-Agent string to send. - # <tt>:referer</tt>:: The referer URL to send. - # <tt>:delay</tt>:: Duration in seconds to pause between spidering each - # link. Defaults to 0. - # <tt>:schemes</tt>:: The list of acceptable URL schemes to follow. - # Defaults to +http+ and +https+. +https+ URL - # schemes will be ignored if <tt>net/http</tt> - # cannot be loaded. - # <tt>:host</tt>:: The host-name to visit. - # <tt>:hosts</tt>:: An +Array+ of host patterns to visit. - # <tt>:ignore_hosts</tt>:: An +Array+ of host patterns to not visit. - # <tt>:ports</tt>:: An +Array+ of port patterns to visit. - # <tt>:ignore_ports</tt>:: An +Array+ of port patterns to not visit. - # <tt>:links</tt>:: An +Array+ of link patterns to visit. - # <tt>:ignore_links</tt>:: An +Array+ of link patterns to not visit. - # <tt>:exts</tt>:: An +Array+ of File extension patterns to visit. - # <tt>:ignore_exts</tt>:: An +Array+ of File extension patterns to not - # visit. - # <tt>:queue</tt>:: An initial queue of URLs to visit. - # <tt>:history</tt>:: An initial list of visited URLs. + # @param [Hash] options + # Additional options # + # @option options [Hash] :proxy (Spidr.proxy) + # The proxy information to use. + # + # @option :proxy [String] :host + # The host the proxy is running on. + # + # @option :proxy [Integer] :port + # The port the proxy is running on. + # + # @option :proxy [String] :user + # The user to authenticate as with the proxy. + # + # @option :proxy [String] :password + # The password to authenticate with. + # + # @option options [String] :user_agent (Spidr.user_agent) + # The User-Agent string to send with each requests. + # + # @option options [String] :referer + # The Referer URL to send with each request. + # + # @option options [Integer] :delay (0) + # The number of seconds to pause between each request. + # + # @option options [Set, Array] :queue + # The initial queue of URLs to visit. + # + # @option options [Set, Array] :history + # The initial list of visited URLs. + # + # @yield [agent] + # If a block is given, it will be passed the newly created agent + # for further configuration. + # + # @yieldparam [Agent] agent + # The newly created agent. + # def initialize(options={},&block) @proxy = (options[:proxy] || Spidr.proxy) @user_agent = (options[:user_agent] || Spidr.user_agent) @referer = options[:referer] - @schemes = [] - - if options[:schemes] - @schemes += options[:schemes] - else - @schemes << 'http' - - begin - require 'net/https' - - @schemes << 'https' - rescue Gem::LoadError => e - raise(e) - rescue ::LoadError - STDERR.puts "Warning: cannot load 'net/https', https support disabled" - end - end - - @host_rules = Rules.new( - :accept => options[:hosts], - :reject => options[:ignore_hosts] - ) - @port_rules = Rules.new( - :accept => options[:ports], - :reject => options[:ignore_ports] - ) - @link_rules = Rules.new( - :accept => options[:links], - :reject => options[:ignore_links] - ) - @ext_rules = Rules.new( - :accept => options[:exts], - :reject => options[:ignore_exts] - ) - - @every_url_blocks = [] - @every_failed_url_blocks = [] - @urls_like_blocks = Hash.new { |hash,key| hash[key] = [] } - - @every_page_blocks = [] - + @running = false @delay = (options[:delay] || 0) - @history = [] - @failures = [] + @history = Set[] + @failures = Set[] @queue = [] - @paused = true - if options[:host] - visit_hosts_like(options[:host]) - end + @sessions = {} - if options[:queue] - self.queue = options[:queue] - end + super(options) - if options[:history] - self.history = options[:history] - end - block.call(self) if block end # - # Creates a new Agent object with the given _options_ and will begin - # spidering at the specified _url_. If a _block_ is given it will be - # passed the newly created Agent object, before the agent begins - # spidering. + # Creates a new agent and begin spidering at the given URL. # + # @param [URI::HTTP, String] url + # The URL to start spidering at. + # + # @param [Hash] options + # Additional options. See {Agent#initialize}. + # + # @yield [agent] + # If a block is given, it will be passed the newly created agent + # before it begins spidering. + # + # @yieldparam [Agent] agent + # The newly created agent. + # def self.start_at(url,options={},&block) self.new(options) do |spider| block.call(spider) if block spider.start_at(url) end end # - # Creates a new Agent object with the given _options_ and will begin - # spidering the specified host _name_. If a _block_ is given it will be - # passed the newly created Agent object, before the agent begins - # spidering. + # Creates a new agent and spiders the given host. # + # @param [String] + # The host-name to spider. + # + # @param [Hash] options + # Additional options. See {Agent#initialize}. + # + # @yield [agent] + # If a block is given, it will be passed the newly created agent + # before it begins spidering. + # + # @yieldparam [Agent] agent + # The newly created agent. + # def self.host(name,options={},&block) self.new(options.merge(:host => name)) do |spider| block.call(spider) if block spider.start_at("http://#{name}/") end end # - # Creates a new Agent object with the given _options_ and will begin - # spidering the host of the specified _url_. If a _block_ is given it - # will be passed the newly created Agent object, before the agent - # begins spidering. + # Creates a new agent and spiders the web-site located at the given URL. # + # @param [URI::HTTP, String] url + # The web-site to spider. + # + # @param [Hash] options + # Additional options. See {Agent#initialize}. + # + # @yield [agent] + # If a block is given, it will be passed the newly created agent + # before it begins spidering. + # + # @yieldparam [Agent] agent + # The newly created agent. + # def self.site(url,options={},&block) url = URI(url.to_s) return self.new(options.merge(:host => url.host)) do |spider| block.call(spider) if block @@ -169,528 +169,469 @@ spider.start_at(url) end end # - # Returns the +Array+ of host patterns to visit. + # Clears the history of the agent. # - def visit_hosts - @host_rules.accept - end - - # - # Adds the given _pattern_ to the visit_hosts. If a _block_ is given, - # it will be added to the visit_hosts. - # - def visit_hosts_like(pattern=nil,&block) - if pattern - visit_hosts << pattern - elsif block - visit_hosts << block - end - + def clear + @queue.clear + @history.clear + @failures.clear return self end # - # Returns the +Array+ of URL host patterns to not visit. + # Start spidering at a given URL. # - def ignore_hosts - @host_rules.reject - end - + # @param [URI::HTTP, String] url + # The URL to start spidering at. # - # Adds the given _pattern_ to the ignore_hosts. If a _block_ is given, - # it will be added to the ignore_hosts. + # @yield [page] + # If a block is given, it will be passed every page visited. # - def ignore_hosts_like(pattern=nil,&block) - if pattern - ignore_hosts << pattern - elsif block - ignore_hosts << block - end + # @yieldparam [Page] page + # A page which has been visited. + # + def start_at(url,&block) + enqueue(url) - return self + return run(&block) end # - # Returns the +Array+ of URL port patterns to visit. + # Start spidering until the queue becomes empty or the agent is + # paused. # - def visit_ports - @port_rules.accept - end - + # @yield [page] + # If a block is given, it will be passed every page visited. # - # Adds the given _pattern_ to the visit_ports. If a _block_ is given, - # it will be added to the visit_ports. + # @yieldparam [Page] page + # A page which has been visited. # - def visit_ports_like(pattern=nil,&block) - if pattern - visit_ports << pattern - elsif block - visit_ports << block + def run(&block) + @running = true + + until (@queue.empty? || paused?) + begin + visit_page(dequeue,&block) + rescue Actions::Paused + return self + rescue Actions::Action + end end - return self - end + @running = false - # - # Returns the +Array+ of URL port patterns to not visit. - # - def ignore_ports - @port_rules.reject - end - - # - # Adds the given _pattern_ to the ignore_hosts. If a _block_ is given, - # it will be added to the ignore_hosts. - # - def ignore_ports_like(pattern=nil,&block) - if pattern - ignore_ports << pattern - elsif block - ignore_ports << block + @sessions.each_value do |sess| + begin + sess.finish + rescue IOError + nil + end end + @sessions.clear return self end # - # Returns the +Array+ of link patterns to visit. + # Determines if the agent is running. # - def visit_links - @link_rules.accept + # @return [Boolean] + # Specifies whether the agent is running or stopped. + # + def running? + @running == true end # - # Adds the given _pattern_ to the visit_links. If a _block_ is given, - # it will be added to the visit_links. + # Sets the history of URLs that were previously visited. # - def visit_links_like(pattern=nil,&block) - if pattern - visit_links << pattern - elsif block - visit_links << block + # @param [#each] new_history + # A list of URLs to populate the history with. + # + # @return [Set<URI::HTTP>] + # The history of the agent. + # + # @example + # agent.history = ['http://tenderlovemaking.com/2009/05/06/ann-nokogiri-130rc1-has-been-released/'] + # + def history=(new_history) + @history.clear + + new_history.each do |url| + @history << unless url.kind_of?(URI) + URI(url.to_s) + else + url + end end - return self + return @history end + alias visited_urls history + # - # Returns the +Array+ of link patterns to not visit. + # Specifies the links which have been visited. # - def ignore_links - @link_rules.reject + # @return [Array<String>] + # The links which have been visited. + # + def visited_links + @history.map { |url| url.to_s } end # - # Adds the given _pattern_ to the ignore_links. If a _block_ is given, - # it will be added to the ignore_links. + # Specifies all hosts that were visited. # - def ignore_links_like(pattern=nil,&block) - if pattern - ignore_links << pattern - elsif block - ignore_links << block - end - - return self + # @return [Array<String>] + # The hosts which have been visited. + # + def visited_hosts + visited_urls.map { |uri| uri.host }.uniq end # - # Returns the +Array+ of URL extension patterns to visit. + # Determines whether a URL was visited or not. # - def visit_exts - @ext_rules.accept - end - + # @param [URI::HTTP, String] url + # The URL to search for. # - # Adds the given _pattern_ to the visit_exts. If a _block_ is given, - # it will be added to the visit_exts. + # @return [Boolean] + # Specifies whether a URL was visited. # - def visit_exts_like(pattern=nil,&block) - if pattern - visit_exts << pattern - elsif block - visit_exts << block - end + def visited?(url) + url = URI(url.to_s) unless url.kind_of?(URI) - return self + return @history.include?(url) end # - # Returns the +Array+ of URL extension patterns to not visit. + # Sets the list of failed URLs. # - def ignore_exts - @ext_rules.reject - end - + # @param [#each] + # The new list of failed URLs. # - # Adds the given _pattern_ to the ignore_exts. If a _block_ is given, - # it will be added to the ignore_exts. + # @return [Array<URI::HTTP>] + # The list of failed URLs. # - def ignore_exts_like(pattern=nil,&block) - if pattern - ignore_exts << pattern - elsif block - ignore_exts << block + # @example + # agent.failures = ['http://localhost/'] + # + def failures=(new_failures) + @failures.clear + + new_failures.each do |url| + @failures << unless url.kind_of?(URI) + URI(url.to_s) + else + url + end end - return self + return @failures end # - # For every URL that the agent visits it will be passed to the - # specified _block_. + # Determines whether a given URL could not be visited. # - def every_url(&block) - @every_url_blocks << block - return self - end - + # @param [URI::HTTP, String] url + # The URL to check for failures. # - # For every URL that the agent is unable to visit, it will be passed - # to the specified _block_. + # @return [Boolean] + # Specifies whether the given URL was unable to be visited. # - def every_failed_url(&block) - @every_failed_url_blocks << block - return self - end + def failed?(url) + url = URI(url.to_s) unless url.kind_of?(URI) - # - # For every URL that the agent visits and matches the specified - # _pattern_, it will be passed to the specified _block_. - # - def urls_like(pattern,&block) - @urls_like_blocks[pattern] << block - return self + return @failures.include?(url) end - # - # For every Page that the agent visits, pass the page to the - # specified _block_. - # - def every_page(&block) - @every_page_blocks << block - return self - end + alias pending_urls queue # - # For every Page that the agent visits, pass the headers to the given - # _block_. + # Sets the queue of URLs to visit. # - def all_headers(&block) - every_page { |page| block.call(page.headers) } - end - + # @param [#each] + # The new list of URLs to visit. # - # Clears the history of the agent. + # @return [Array<URI::HTTP>] + # The list of URLs to visit. # - def clear + # @example + # agent.queue = ['http://www.vimeo.com/', 'http://www.reddit.com/'] + # + def queue=(new_queue) @queue.clear - @history.clear - @failures.clear - return self - end - # - # Start spidering at the specified _url_. - # - def start_at(url) - enqueue(url) - - return continue! - end - - # - # Start spidering until the queue becomes empty or the agent is - # paused. - # - def run - until (@queue.empty? || @paused == true) - visit_page(dequeue) + new_queue.each do |url| + @queue << unless url.kind_of?(URI) + URI(url.to_s) + else + url + end end - return self + return @queue end # - # Continue spidering. + # Determines whether a given URL has been enqueued. # - def continue! - @paused = false - return run - end - + # @param [URI::HTTP] url + # The URL to search for in the queue. # - # Returns +true+ if the agent is still spidering, returns +false+ - # otherwise. + # @return [Boolean] + # Specifies whether the given URL has been queued for visiting. # - def running? - @paused == false + def queued?(url) + @queue.include?(url) end # - # Returns +true+ if the agent is paused, returns +false+ otherwise. + # Enqueues a given URL for visiting, only if it passes all of the + # agent's rules for visiting a given URL. # - def paused? - @paused == true - end - + # @param [URI::HTTP, String] url + # The URL to enqueue for visiting. # - # Pauses the agent, causing spidering to temporarily stop. + # @return [Boolean] + # Specifies whether the URL was enqueued, or ignored. # - def pause! - @paused = true - return self - end + def enqueue(url) + link = url.to_s + url = URI(link) unless url.kind_of?(URI) - # - # Sets the list of acceptable URL schemes to follow to the - # _new_schemes_. - # - # agent.schemes = ['http'] - # - def schemes=(new_schemes) - @schemes = new_schemes.map { |scheme| scheme.to_s } - end + if (!(queued?(url)) && visit?(url)) + begin + @every_url_blocks.each { |block| block.call(url) } - # - # Sets the history of links that were previously visited to the - # specified _new_history_. - # - # agent.history = ['http://tenderlovemaking.com/2009/05/06/ann-nokogiri-130rc1-has-been-released/'] - # - def history=(new_history) - @history = new_history.map do |url| - unless url.kind_of?(URI) - URI(url.to_s) - else - url + @urls_like_blocks.each do |pattern,blocks| + if ((pattern.kind_of?(Regexp) && link =~ pattern) || pattern == link || pattern == url) + blocks.each { |url_block| url_block.call(url) } + end + end + rescue Actions::Paused => action + raise(action) + rescue Actions::SkipLink + return false + rescue Actions::Action end + + @queue << url + return true end - end - alias visited_urls history - - # - # Returns the +Array+ of visited URLs. - # - def visited_links - @history.map { |uri| uri.to_s } + return false end # - # Return the +Array+ of hosts that were visited. + # Requests and creates a new Page object from a given URL. # - def visited_hosts - @history.map { |uri| uri.host }.uniq - end - + # @param [URI::HTTP] url + # The URL to request. # - # Returns +true+ if the specified _url_ was visited, returns +false+ - # otherwise. + # @yield [page] + # If a block is given, it will be passed the page that represents the + # response. # - def visited?(url) - url = URI(url) unless url.kind_of?(URI) - - return @history.include?(url) - end - + # @yieldparam [Page] page + # The page for the response. # - # Returns +true+ if the specified _url_ was unable to be visited, - # returns +false+ otherwise. + # @return [Page, nil] + # The page for the response, or +nil+ if the request failed. # - def failed?(url) - url = URI(url) unless url.kind_of?(URI) - - return @failures.include?(url) - end - - alias pending_urls queue - - # - # Creates a new Page object from the specified _url_. If a _block_ is - # given, it will be passed the newly created Page object. - # def get_page(url,&block) + url = URI(url.to_s) unless url.kind_of?(URI) + host = url.host port = url.port unless url.path.empty? path = url.path else path = '/' end - proxy_host = @proxy[:host] - proxy_port = @proxy[:port] - proxy_user = @proxy[:user] - proxy_password = @proxy[:password] + # append the URL query to the path + path += "?#{url.query}" if url.query begin - Net::HTTP::Proxy(proxy_host,proxy_port,proxy_user,proxy_password).start(host,port) do |sess| + get_session(url.scheme,host,port) do |sess| headers = {} - headers['User-Agent'] = @user_agent if @user_agent headers['Referer'] = @referer if @referer new_page = Page.new(url,sess.get(path,headers)) block.call(new_page) if block return new_page end - rescue SystemCallError, Net::HTTPBadResponse + rescue SystemCallError, Timeout::Error, Net::HTTPBadResponse, IOError failed(url) + kill_session(url.scheme,host,port) return nil end end # - # Returns the agent represented as a Hash containing the agents - # +history+ and +queue+ information. + # Visits a given URL, and enqueus the links recovered from the URL + # to be visited later. # - def to_hash - {:history => @history, :queue => @queue} - end - + # @param [URI::HTTP, String] url + # The URL to visit. # - # Sets the queue of links to visit to the specified _new_queue_. + # @yield [page] + # If a block is given, it will be passed the page which was visited. # - # agent.queue = ['http://www.vimeo.com/', 'http://www.reddit.com/'] + # @yieldparam [Page] page + # The page which was visited. # - def queue=(new_queue) - @queue = new_queue.map do |url| - unless url.kind_of?(URI) - URI(url.to_s) - else - url + # @return [Page, nil] + # The page that was visited. If +nil+ is returned, either the request + # for the page failed, or the page was skipped. + # + def visit_page(url,&block) + url = URI(url.to_s) unless url.kind_of?(URI) + + get_page(url) do |page| + @history << page.url + + begin + @every_page_blocks.each { |page_block| page_block.call(page) } + + block.call(page) if block + rescue Actions::Paused => action + raise(action) + rescue Actions::SkipPage + return nil + rescue Actions::Action end + + page.urls.each { |next_url| enqueue(next_url) } end end # - # Returns +true+ if the specified _url_ is queued for visiting, returns - # +false+ otherwise. + # Converts the agent into a Hash. # - def queued?(url) - @queue.include?(url) + # @return [Hash] + # The agent represented as a Hash containing the +history+ and + # the +queue+ of the agent. + # + def to_hash + {:history => @history, :queue => @queue} end + protected + # - # Enqueues the specified _url_ for visiting, only if it passes all the - # agent's rules for visiting a given URL. Returns +true+ if the _url_ - # was successfully enqueued, returns +false+ otherwise. + # Provides an active HTTP session for the given scheme, host + # and port. # - def enqueue(url) - link = url.to_s - url = URI(link) + # @param [String] scheme + # The scheme of the URL, which will be requested later. + # + # @param [String] host + # The host that the session is needed with. + # + # @param [Integer] port + # The port that the session is needed for. + # + # @yield [session] + # If a block is given, it will be passed the active HTTP session. + # + # @yieldparam [Net::HTTP] session + # The active HTTP session object. + # + def get_session(scheme,host,port,&block) + key = [scheme,host,port] - if (!(queued?(url)) && visit?(url)) - @every_url_blocks.each { |block| block.call(url) } + unless @sessions[key] + session = Net::HTTP::Proxy( + @proxy[:host], + @proxy[:port], + @proxy[:user], + @proxy[:password] + ).new(host,port) - @urls_like_blocks.each do |pattern,blocks| - if ((pattern.kind_of?(Regexp) && link =~ pattern) || pattern == link || pattern == url) - blocks.each { |url_block| url_block.call(url) } - end + if scheme == 'https' + session.use_ssl = true + session.verify_mode = OpenSSL::SSL::VERIFY_NONE end - @queue << url - return true + @sessions[key] = session end - return false + session = @sessions[key] + block.call(session) if block + return session end - protected - # - # Dequeues a URL that will later be visited. + # Destroys an HTTP session for the given scheme, host and port. # - def dequeue - @queue.shift - end - + # @param [String] scheme + # The scheme of the URL, which was requested through the session. # - # Returns +true+ if the specified _url_ should be visited, based on - # it's scheme, returns +false+ otherwise. + # @param [String] host + # The host that the session was connected with. # - def visit_scheme?(url) - if url.scheme - return @schemes.include?(url.scheme) - else - return true + # @param [Integer] port + # The port that the session was connected to. + # + def kill_session(scheme,host,port,&block) + key = [scheme,host,port] + sess = @sessions[key] + + begin + sess.finish + rescue IOError + nil end - end - # - # Returns +true+ if the specified _url_ should be visited, based on - # the host of the _url_, returns +false+ otherwise. - # - def visit_host?(url) - @host_rules.accept?(url.host) + @sessions.delete(key) + block.call if block + return nil end # - # Returns +true+ if the specified _url_ should be visited, based on - # the port of the _url_, returns +false+ otherwise. + # Dequeues a URL that will later be visited. # - def visit_port?(url) - @port_rules.accept?(url.port) - end - + # @return [URI::HTTP] + # The URL that was at the front of the queue. # - # Returns +true+ if the specified _url_ should be visited, based on - # the pattern of the _url_, returns +false+ otherwise. - # - def visit_link?(url) - @link_rules.accept?(url.to_s) + def dequeue + @queue.shift end # - # Returns +true+ if the specified _url_ should be visited, based on - # the file extension of the _url_, returns +false+ otherwise. + # Determines if a given URL should be visited. # - def visit_ext?(url) - @ext_rules.accept?(File.extname(url.path)[1..-1]) - end - + # @param [URI::HTTP] url + # The URL in question. # - # Returns +true+ if the specified URL should be visited, returns - # +false+ otherwise. + # @return [Boolean] + # Specifies whether the given URL should be visited. # def visit?(url) (!(visited?(url)) && - visit_scheme?(url) && - visit_host?(url) && - visit_port?(url) && - visit_link?(url) && - visit_ext?(url)) + visit_scheme?(url.scheme) && + visit_host?(url.host) && + visit_port?(url.port) && + visit_link?(url.to_s) && + visit_ext?(url.path)) end # - # Visits the spedified _url_ and enqueus it's links for visiting. If a - # _block_ is given, it will be passed a newly created Page object - # for the specified _url_. + # Adds a given URL to the failures list. # - def visit_page(url,&block) - get_page(url) do |page| - @history << page.url - - page.urls.each { |next_url| enqueue(next_url) } - - @every_page_blocks.each { |page_block| page_block.call(page) } - - block.call(page) if block - end - end - + # @param [URI::HTTP] url + # The URL to add to the failures list. # - # Adds the specified _url_ to the failures list. - # def failed(url) - url = URI(url.to_s) unless url.kind_of?(URI) - @every_failed_url_blocks.each { |block| block.call(url) } @failures << url return true end