agent.rb in spidr-0.2.0

- old
+ new

@@ -1,14 +1,21 @@
-require 'spidr/rules'
+require 'spidr/filters'
+require 'spidr/events'
+require 'spidr/actions'
 require 'spidr/page'
 require 'spidr/spidr'
 
 require 'net/http'
+require 'set'
 
 module Spidr
   class Agent
 
+    include Filters
+    include Events
+    include Actions
+
     # Proxy to use
     attr_accessor :proxy
 
     # User-Agent to use
     attr_accessor :user_agent
@@ -17,151 +24,144 @@
     attr_accessor :referer
 
     # Delay in between fetching pages
     attr_accessor :delay
 
-    # List of acceptable URL schemes to follow
-    attr_reader :schemes
-
     # History containing visited URLs
     attr_reader :history
 
     # List of unreachable URLs
     attr_reader :failures
 
     # Queue of URLs to visit
     attr_reader :queue
 
     #
-    # Creates a new Agent object with the given _options_ and _block_.
-    # If a _block_ is given, it will be passed the newly created
-    # Agent object.
+    # Creates a new Agent object.
     #
-    # _options_ may contain the following keys:
-    # <tt>:proxy</tt>:: The proxy to use while spidering.
-    # <tt>:user_agent</tt>:: The User-Agent string to send.
-    # <tt>:referer</tt>:: The referer URL to send.
-    # <tt>:delay</tt>:: Duration in seconds to pause between spidering each
-    #                   link. Defaults to 0.
-    # <tt>:schemes</tt>:: The list of acceptable URL schemes to follow.
-    #                     Defaults to +http+ and +https+. +https+ URL
-    #                     schemes will be ignored if <tt>net/http</tt>
-    #                     cannot be loaded.
-    # <tt>:host</tt>:: The host-name to visit.
-    # <tt>:hosts</tt>:: An +Array+ of host patterns to visit.
-    # <tt>:ignore_hosts</tt>:: An +Array+ of host patterns to not visit.
-    # <tt>:ports</tt>:: An +Array+ of port patterns to visit.
-    # <tt>:ignore_ports</tt>:: An +Array+ of port patterns to not visit.
-    # <tt>:links</tt>:: An +Array+ of link patterns to visit.
-    # <tt>:ignore_links</tt>:: An +Array+ of link patterns to not visit.
-    # <tt>:exts</tt>:: An +Array+ of File extension patterns to visit.
-    # <tt>:ignore_exts</tt>:: An +Array+ of File extension patterns to not
-    #                         visit.
-    # <tt>:queue</tt>:: An initial queue of URLs to visit.
-    # <tt>:history</tt>:: An initial list of visited URLs.
+    # @param [Hash] options
+    #   Additional options
     #
+    # @option options [Hash] :proxy (Spidr.proxy)
+    #   The proxy information to use.
+    #
+    # @option :proxy [String] :host
+    #   The host the proxy is running on.
+    #
+    # @option :proxy [Integer] :port
+    #   The port the proxy is running on.
+    #
+    # @option :proxy [String] :user
+    #   The user to authenticate as with the proxy.
+    #
+    # @option :proxy [String] :password
+    #   The password to authenticate with.
+    #
+    # @option options [String] :user_agent (Spidr.user_agent)
+    #   The User-Agent string to send with each request.
+    #
+    # @option options [String] :referer
+    #   The Referer URL to send with each request.
+    #
+    # @option options [Integer] :delay (0)
+    #   The number of seconds to pause between each request.
+    #
+    # @option options [Set, Array] :queue
+    #   The initial queue of URLs to visit.
+    #
+    # @option options [Set, Array] :history
+    #   The initial list of visited URLs.
+    #
+    # @yield [agent]
+    #   If a block is given, it will be passed the newly created agent
+    #   for further configuration.
+    #
+    # @yieldparam [Agent] agent
+    #   The newly created agent.
+    #
     def initialize(options={},&block)
       @proxy = (options[:proxy] || Spidr.proxy)
       @user_agent = (options[:user_agent] || Spidr.user_agent)
       @referer = options[:referer]
 
-      @schemes = []
-
-      if options[:schemes]
-        @schemes += options[:schemes]
-      else
-        @schemes << 'http'
-
-        begin
-          require 'net/https'
-
-          @schemes << 'https'
-        rescue Gem::LoadError => e
-          raise(e)
-        rescue ::LoadError
-          STDERR.puts "Warning: cannot load 'net/https', https support disabled"
-        end
-      end
-
-      @host_rules = Rules.new(
-        :accept => options[:hosts],
-        :reject => options[:ignore_hosts]
-      )
-      @port_rules = Rules.new(
-        :accept => options[:ports],
-        :reject => options[:ignore_ports]
-      )
-      @link_rules = Rules.new(
-        :accept => options[:links],
-        :reject => options[:ignore_links]
-      )
-      @ext_rules = Rules.new(
-        :accept => options[:exts],
-        :reject => options[:ignore_exts]
-      )
-
-      @every_url_blocks = []
-      @every_failed_url_blocks = []
-      @urls_like_blocks = Hash.new { |hash,key| hash[key] = [] }
-
-      @every_page_blocks = []
-
+      @running = false
       @delay = (options[:delay] || 0)
-      @history = []
-      @failures = []
+      @history = Set[]
+      @failures = Set[]
       @queue = []
-      @paused = true
 
-      if options[:host]
-        visit_hosts_like(options[:host])
-      end
+      @sessions = {}
 
-      if options[:queue]
-        self.queue = options[:queue]
-      end
+      super(options)
 
-      if options[:history]
-        self.history = options[:history]
-      end
-
       block.call(self) if block
     end
 
     #
-    # Creates a new Agent object with the given _options_ and will begin
-    # spidering at the specified _url_. If a _block_ is given it will be
-    # passed the newly created Agent object, before the agent begins
-    # spidering.
+    # Creates a new agent and begins spidering at the given URL.
     #
+    # @param [URI::HTTP, String] url
+    #   The URL to start spidering at.
+    #
+    # @param [Hash] options
+    #   Additional options. See {Agent#initialize}.
+    #
+    # @yield [agent]
+    #   If a block is given, it will be passed the newly created agent
+    #   before it begins spidering.
+    #
+    # @yieldparam [Agent] agent
+    #   The newly created agent.
+    #
     def self.start_at(url,options={},&block)
       self.new(options) do |spider|
         block.call(spider) if block
 
         spider.start_at(url)
       end
     end
 
     #
-    # Creates a new Agent object with the given _options_ and will begin
-    # spidering the specified host _name_. If a _block_ is given it will be
-    # passed the newly created Agent object, before the agent begins
-    # spidering.
+    # Creates a new agent and spiders the given host.
     #
+    # @param [String] name
+    #   The host-name to spider.
+    #
+    # @param [Hash] options
+    #   Additional options. See {Agent#initialize}.
+    #
+    # @yield [agent]
+    #   If a block is given, it will be passed the newly created agent
+    #   before it begins spidering.
+    #
+    # @yieldparam [Agent] agent
+    #   The newly created agent.
+    #
     def self.host(name,options={},&block)
       self.new(options.merge(:host => name)) do |spider|
         block.call(spider) if block
 
         spider.start_at("http://#{name}/")
       end
     end
 
     #
-    # Creates a new Agent object with the given _options_ and will begin
-    # spidering the host of the specified _url_. If a _block_ is given it
-    # will be passed the newly created Agent object, before the agent
-    # begins spidering.
+    # Creates a new agent and spiders the web-site located at the given URL.
     #
+    # @param [URI::HTTP, String] url
+    #   The web-site to spider.
+    #
+    # @param [Hash] options
+    #   Additional options. See {Agent#initialize}.
+    #
+    # @yield [agent]
+    #   If a block is given, it will be passed the newly created agent
+    #   before it begins spidering.
+    #
+    # @yieldparam [Agent] agent
+    #   The newly created agent.
+    #
     def self.site(url,options={},&block)
       url = URI(url.to_s)
 
       return self.new(options.merge(:host => url.host)) do |spider|
         block.call(spider) if block
@@ -169,528 +169,469 @@
         spider.start_at(url)
       end
     end
 
     #
-    # Returns the +Array+ of host patterns to visit.
+    # Clears the queue, history and failures of the agent.
     #
-    def visit_hosts
-      @host_rules.accept
-    end
-
-    #
-    # Adds the given _pattern_ to the visit_hosts. If a _block_ is given,
-    # it will be added to the visit_hosts.
-    #
-    def visit_hosts_like(pattern=nil,&block)
-      if pattern
-        visit_hosts << pattern
-      elsif block
-        visit_hosts << block
-      end
-
+    def clear
+      @queue.clear
+      @history.clear
+      @failures.clear
       return self
     end
 
     #
-    # Returns the +Array+ of URL host patterns to not visit.
+    # Start spidering at a given URL.
     #
-    def ignore_hosts
-      @host_rules.reject
-    end
-
+    # @param [URI::HTTP, String] url
+    #   The URL to start spidering at.
     #
-    # Adds the given _pattern_ to the ignore_hosts. If a _block_ is given,
-    # it will be added to the ignore_hosts.
+    # @yield [page]
+    #   If a block is given, it will be passed every page visited.
     #
-    def ignore_hosts_like(pattern=nil,&block)
-      if pattern
-        ignore_hosts << pattern
-      elsif block
-        ignore_hosts << block
-      end
+    # @yieldparam [Page] page
+    #   A page which has been visited.
+    #
+    def start_at(url,&block)
+      enqueue(url)
 
-      return self
+      return run(&block)
     end
 
     #
-    # Returns the +Array+ of URL port patterns to visit.
+    # Start spidering until the queue becomes empty or the agent is
+    # paused.
     #
-    def visit_ports
-      @port_rules.accept
-    end
-
+    # @yield [page]
+    #   If a block is given, it will be passed every page visited.
     #
-    # Adds the given _pattern_ to the visit_ports. If a _block_ is given,
-    # it will be added to the visit_ports.
+    # @yieldparam [Page] page
+    #   A page which has been visited.
     #
-    def visit_ports_like(pattern=nil,&block)
-      if pattern
-        visit_ports << pattern
-      elsif block
-        visit_ports << block
+    def run(&block)
+      @running = true
+
+      until (@queue.empty? || paused?)
+        begin
+          visit_page(dequeue,&block)
+        rescue Actions::Paused
+          return self
+        rescue Actions::Action
+        end
       end
 
-      return self
-    end
+      @running = false
 
-    #
-    # Returns the +Array+ of URL port patterns to not visit.
-    #
-    def ignore_ports
-      @port_rules.reject
-    end
-
-    #
-    # Adds the given _pattern_ to the ignore_hosts. If a _block_ is given,
-    # it will be added to the ignore_hosts.
-    #
-    def ignore_ports_like(pattern=nil,&block)
-      if pattern
-        ignore_ports << pattern
-      elsif block
-        ignore_ports << block
+      @sessions.each_value do |sess|
+        begin
+          sess.finish
+        rescue IOError
+          nil
+        end
       end
 
+      @sessions.clear
       return self
     end
 
     #
-    # Returns the +Array+ of link patterns to visit.
+    # Determines if the agent is running.
     #
-    def visit_links
-      @link_rules.accept
+    # @return [Boolean]
+    #   Specifies whether the agent is running or stopped.
+    #
+    def running?
+      @running == true
     end
 
     #
-    # Adds the given _pattern_ to the visit_links. If a _block_ is given,
-    # it will be added to the visit_links.
+    # Sets the history of URLs that were previously visited.
     #
-    def visit_links_like(pattern=nil,&block)
-      if pattern
-        visit_links << pattern
-      elsif block
-        visit_links << block
+    # @param [#each] new_history
+    #   A list of URLs to populate the history with.
+    #
+    # @return [Set<URI::HTTP>]
+    #   The history of the agent.
+    #
+    # @example
+    #   agent.history = ['http://tenderlovemaking.com/2009/05/06/ann-nokogiri-130rc1-has-been-released/']
+    #
+    def history=(new_history)
+      @history.clear
+
+      new_history.each do |url|
+        @history << unless url.kind_of?(URI)
+                      URI(url.to_s)
+                    else
+                      url
+                    end
       end
 
-      return self
+      return @history
     end
 
+    alias visited_urls history
+
     #
-    # Returns the +Array+ of link patterns to not visit.
+    # Specifies the links which have been visited.
     #
-    def ignore_links
-      @link_rules.reject
+    # @return [Array<String>]
+    #   The links which have been visited.
+    #
+    def visited_links
+      @history.map { |url| url.to_s }
     end
 
     #
-    # Adds the given _pattern_ to the ignore_links. If a _block_ is given,
-    # it will be added to the ignore_links.
+    # Specifies all hosts that were visited.
     #
-    def ignore_links_like(pattern=nil,&block)
-      if pattern
-        ignore_links << pattern
-      elsif block
-        ignore_links << block
-      end
-
-      return self
+    # @return [Array<String>]
+    #   The hosts which have been visited.
+    #
+    def visited_hosts
+      visited_urls.map { |uri| uri.host }.uniq
     end
 
     #
-    # Returns the +Array+ of URL extension patterns to visit.
+    # Determines whether a URL was visited or not.
     #
-    def visit_exts
-      @ext_rules.accept
-    end
-
+    # @param [URI::HTTP, String] url
+    #   The URL to search for.
     #
-    # Adds the given _pattern_ to the visit_exts. If a _block_ is given,
-    # it will be added to the visit_exts.
+    # @return [Boolean]
+    #   Specifies whether a URL was visited.
     #
-    def visit_exts_like(pattern=nil,&block)
-      if pattern
-        visit_exts << pattern
-      elsif block
-        visit_exts << block
-      end
+    def visited?(url)
+      url = URI(url.to_s) unless url.kind_of?(URI)
 
-      return self
+      return @history.include?(url)
     end
 
     #
-    # Returns the +Array+ of URL extension patterns to not visit.
+    # Sets the list of failed URLs.
     #
-    def ignore_exts
-      @ext_rules.reject
-    end
-
+    # @param [#each] new_failures
+    #   The new list of failed URLs.
     #
-    # Adds the given _pattern_ to the ignore_exts. If a _block_ is given,
-    # it will be added to the ignore_exts.
+    # @return [Set<URI::HTTP>]
+    #   The list of failed URLs.
     #
-    def ignore_exts_like(pattern=nil,&block)
-      if pattern
-        ignore_exts << pattern
-      elsif block
-        ignore_exts << block
+    # @example
+    #   agent.failures = ['http://localhost/']
+    #
+    def failures=(new_failures)
+      @failures.clear
+
+      new_failures.each do |url|
+        @failures << unless url.kind_of?(URI)
+                    URI(url.to_s)
+                  else
+                    url
+                  end
       end
 
-      return self
+      return @failures
     end
 
     #
-    # For every URL that the agent visits it will be passed to the
-    # specified _block_.
+    # Determines whether a given URL could not be visited.
     #
-    def every_url(&block)
-      @every_url_blocks << block
-      return self
-    end
-
+    # @param [URI::HTTP, String] url
+    #   The URL to check for failures.
     #
-    # For every URL that the agent is unable to visit, it will be passed
-    # to the specified _block_.
+    # @return [Boolean]
+    #   Specifies whether the given URL was unable to be visited.
     #
-    def every_failed_url(&block)
-      @every_failed_url_blocks << block
-      return self
-    end
+    def failed?(url)
+      url = URI(url.to_s) unless url.kind_of?(URI)
 
-    #
-    # For every URL that the agent visits and matches the specified
-    # _pattern_, it will be passed to the specified _block_.
-    #
-    def urls_like(pattern,&block)
-      @urls_like_blocks[pattern] << block
-      return self
+      return @failures.include?(url)
     end
 
-    #
-    # For every Page that the agent visits, pass the page to the
-    # specified _block_.
-    #
-    def every_page(&block)
-      @every_page_blocks << block
-      return self
-    end
+    alias pending_urls queue
 
     #
-    # For every Page that the agent visits, pass the headers to the given
-    # _block_.
+    # Sets the queue of URLs to visit.
     #
-    def all_headers(&block)
-      every_page { |page| block.call(page.headers) }
-    end
-
+    # @param [#each] new_queue
+    #   The new list of URLs to visit.
     #
-    # Clears the history of the agent.
+    # @return [Array<URI::HTTP>]
+    #   The list of URLs to visit.
     #
-    def clear
+    # @example
+    #   agent.queue = ['http://www.vimeo.com/', 'http://www.reddit.com/']
+    #
+    def queue=(new_queue)
       @queue.clear
-      @history.clear
-      @failures.clear
-      return self
-    end
 
-    #
-    # Start spidering at the specified _url_.
-    #
-    def start_at(url)
-      enqueue(url)
-
-      return continue!
-    end
-
-    #
-    # Start spidering until the queue becomes empty or the agent is
-    # paused.
-    #
-    def run
-      until (@queue.empty? || @paused == true)
-        visit_page(dequeue)
+      new_queue.each do |url|
+        @queue << unless url.kind_of?(URI)
+                    URI(url.to_s)
+                  else
+                    url
+                  end
       end
 
-      return self
+      return @queue
     end
 
     #
-    # Continue spidering.
+    # Determines whether a given URL has been enqueued.
     #
-    def continue!
-      @paused = false
-      return run
-    end
-
+    # @param [URI::HTTP] url
+    #   The URL to search for in the queue.
     #
-    # Returns +true+ if the agent is still spidering, returns +false+
-    # otherwise.
+    # @return [Boolean]
+    #   Specifies whether the given URL has been queued for visiting.
     #
-    def running?
-      @paused == false
+    def queued?(url)
+      @queue.include?(url)
     end
 
     #
-    # Returns +true+ if the agent is paused, returns +false+ otherwise.
+    # Enqueues a given URL for visiting, only if it passes all of the
+    # agent's rules for visiting a given URL.
     #
-    def paused?
-      @paused == true
-    end
-
+    # @param [URI::HTTP, String] url
+    #   The URL to enqueue for visiting.
     #
-    # Pauses the agent, causing spidering to temporarily stop.
+    # @return [Boolean]
+    #   Specifies whether the URL was enqueued, or ignored.
     #
-    def pause!
-      @paused = true
-      return self
-    end
+    def enqueue(url)
+      link = url.to_s
+      url = URI(link) unless url.kind_of?(URI)
 
-    #
-    # Sets the list of acceptable URL schemes to follow to the
-    # _new_schemes_.
-    #
-    #   agent.schemes = ['http']
-    #
-    def schemes=(new_schemes)
-      @schemes = new_schemes.map { |scheme| scheme.to_s }
-    end
+      if (!(queued?(url)) && visit?(url))
+        begin
+          @every_url_blocks.each { |block| block.call(url) }
 
-    #
-    # Sets the history of links that were previously visited to the
-    # specified _new_history_.
-    #
-    #   agent.history = ['http://tenderlovemaking.com/2009/05/06/ann-nokogiri-130rc1-has-been-released/']
-    #
-    def history=(new_history)
-      @history = new_history.map do |url|
-        unless url.kind_of?(URI)
-          URI(url.to_s)
-        else
-          url
+          @urls_like_blocks.each do |pattern,blocks|
+            if ((pattern.kind_of?(Regexp) && link =~ pattern) || pattern == link || pattern == url)
+              blocks.each { |url_block| url_block.call(url) }
+            end
+          end
+        rescue Actions::Paused => action
+          raise(action)
+        rescue Actions::SkipLink
+          return false
+        rescue Actions::Action
         end
+
+        @queue << url
+        return true
       end
-    end
 
-    alias visited_urls history
-
-    #
-    # Returns the +Array+ of visited URLs.
-    #
-    def visited_links
-      @history.map { |uri| uri.to_s }
+      return false
     end
 
     #
-    # Return the +Array+ of hosts that were visited.
+    # Requests and creates a new Page object from a given URL.
     #
-    def visited_hosts
-      @history.map { |uri| uri.host }.uniq
-    end
-
+    # @param [URI::HTTP] url
+    #   The URL to request.
     #
-    # Returns +true+ if the specified _url_ was visited, returns +false+
-    # otherwise.
+    # @yield [page]
+    #   If a block is given, it will be passed the page that represents the
+    #   response.
     #
-    def visited?(url)
-      url = URI(url) unless url.kind_of?(URI)
-
-      return @history.include?(url)
-    end
-
+    # @yieldparam [Page] page
+    #   The page for the response.
     #
-    # Returns +true+ if the specified _url_ was unable to be visited,
-    # returns +false+ otherwise.
+    # @return [Page, nil]
+    #   The page for the response, or +nil+ if the request failed.
     #
-    def failed?(url)
-      url = URI(url) unless url.kind_of?(URI)
-
-      return @failures.include?(url)
-    end
-
-    alias pending_urls queue
-
-    #
-    # Creates a new Page object from the specified _url_. If a _block_ is
-    # given, it will be passed the newly created Page object.
-    #
     def get_page(url,&block)
+      url = URI(url.to_s) unless url.kind_of?(URI)
+
       host = url.host
       port = url.port
 
       unless url.path.empty?
         path = url.path
       else
         path = '/'
       end
 
-      proxy_host = @proxy[:host]
-      proxy_port = @proxy[:port]
-      proxy_user = @proxy[:user]
-      proxy_password = @proxy[:password]
+      # append the URL query to the path
+      path += "?#{url.query}" if url.query
 
       begin
-        Net::HTTP::Proxy(proxy_host,proxy_port,proxy_user,proxy_password).start(host,port) do |sess|
+        get_session(url.scheme,host,port) do |sess|
           headers = {}
-
           headers['User-Agent'] = @user_agent if @user_agent
           headers['Referer'] = @referer if @referer
 
           new_page = Page.new(url,sess.get(path,headers))
 
           block.call(new_page) if block
           return new_page
         end
-      rescue SystemCallError, Net::HTTPBadResponse
+      rescue SystemCallError, Timeout::Error, Net::HTTPBadResponse, IOError
         failed(url)
+        kill_session(url.scheme,host,port)
         return nil
       end
     end
 
     #
-    # Returns the agent represented as a Hash containing the agents
-    # +history+ and +queue+ information.
+    # Visits a given URL, and enqueues the links recovered from the URL
+    # to be visited later.
     #
-    def to_hash
-      {:history => @history, :queue => @queue}
-    end
-
+    # @param [URI::HTTP, String] url
+    #   The URL to visit.
     #
-    # Sets the queue of links to visit to the specified _new_queue_.
+    # @yield [page]
+    #   If a block is given, it will be passed the page which was visited.
     #
-    #   agent.queue = ['http://www.vimeo.com/', 'http://www.reddit.com/']
+    # @yieldparam [Page] page
+    #   The page which was visited.
     #
-    def queue=(new_queue)
-      @queue = new_queue.map do |url|
-        unless url.kind_of?(URI)
-          URI(url.to_s)
-        else
-          url
+    # @return [Page, nil]
+    #   The page that was visited. If +nil+ is returned, either the request
+    #   for the page failed, or the page was skipped.
+    #
+    def visit_page(url,&block)
+      url = URI(url.to_s) unless url.kind_of?(URI)
+
+      get_page(url) do |page|
+        @history << page.url
+
+        begin
+          @every_page_blocks.each { |page_block| page_block.call(page) }
+
+          block.call(page) if block
+        rescue Actions::Paused => action
+          raise(action)
+        rescue Actions::SkipPage
+          return nil
+        rescue Actions::Action
         end
+
+        page.urls.each { |next_url| enqueue(next_url) }
       end
     end
 
     #
-    # Returns +true+ if the specified _url_ is queued for visiting, returns
-    # +false+ otherwise.
+    # Converts the agent into a Hash.
     #
-    def queued?(url)
-      @queue.include?(url)
+    # @return [Hash]
+    #   The agent represented as a Hash containing the +history+ and
+    #   the +queue+ of the agent.
+    #
+    def to_hash
+      {:history => @history, :queue => @queue}
     end
 
+    protected
+
     #
-    # Enqueues the specified _url_ for visiting, only if it passes all the
-    # agent's rules for visiting a given URL. Returns +true+ if the _url_
-    # was successfully enqueued, returns +false+ otherwise.
+    # Provides an active HTTP session for the given scheme, host
+    # and port.
     #
-    def enqueue(url)
-      link = url.to_s
-      url = URI(link)
+    # @param [String] scheme
+    #   The scheme of the URL, which will be requested later.
+    #
+    # @param [String] host
+    #   The host that the session is needed with.
+    #
+    # @param [Integer] port
+    #   The port that the session is needed for.
+    #
+    # @yield [session]
+    #   If a block is given, it will be passed the active HTTP session.
+    #
+    # @yieldparam [Net::HTTP] session
+    #   The active HTTP session object.
+    #
+    def get_session(scheme,host,port,&block)
+      key = [scheme,host,port]
 
-      if (!(queued?(url)) && visit?(url))
-        @every_url_blocks.each { |block| block.call(url) }
+      unless @sessions[key]
+        session = Net::HTTP::Proxy(
+          @proxy[:host],
+          @proxy[:port],
+          @proxy[:user],
+          @proxy[:password]
+        ).new(host,port)
 
-        @urls_like_blocks.each do |pattern,blocks|
-          if ((pattern.kind_of?(Regexp) && link =~ pattern) || pattern == link || pattern == url)
-            blocks.each { |url_block| url_block.call(url) }
-          end
+        if scheme == 'https'
+          session.use_ssl = true
+          session.verify_mode = OpenSSL::SSL::VERIFY_NONE
         end
 
-        @queue << url
-        return true
+        @sessions[key] = session
       end
 
-      return false
+      session = @sessions[key]
+      block.call(session) if block
+      return session
     end
 
-    protected
-
     #
-    # Dequeues a URL that will later be visited.
+    # Destroys an HTTP session for the given scheme, host and port.
     #
-    def dequeue
-      @queue.shift
-    end
-
+    # @param [String] scheme
+    #   The scheme of the URL, which was requested through the session.
     #
-    # Returns +true+ if the specified _url_ should be visited, based on
-    # it's scheme, returns +false+ otherwise.
+    # @param [String] host
+    #   The host that the session was connected with.
     #
-    def visit_scheme?(url)
-      if url.scheme
-        return @schemes.include?(url.scheme)
-      else
-        return true
+    # @param [Integer] port
+    #   The port that the session was connected to.
+    #
+    def kill_session(scheme,host,port,&block)
+      key = [scheme,host,port]
+      sess = @sessions[key]
+
+      begin 
+        sess.finish
+      rescue IOError
+        nil
       end
-    end
 
-    #
-    # Returns +true+ if the specified _url_ should be visited, based on
-    # the host of the _url_, returns +false+ otherwise.
-    #
-    def visit_host?(url)
-      @host_rules.accept?(url.host)
+      @sessions.delete(key)
+      block.call if block
+      return nil
     end
 
     #
-    # Returns +true+ if the specified _url_ should be visited, based on
-    # the port of the _url_, returns +false+ otherwise.
+    # Dequeues a URL that will later be visited.
     #
-    def visit_port?(url)
-      @port_rules.accept?(url.port)
-    end
-
+    # @return [URI::HTTP]
+    #   The URL that was at the front of the queue.
     #
-    # Returns +true+ if the specified _url_ should be visited, based on
-    # the pattern of the _url_, returns +false+ otherwise.
-    #
-    def visit_link?(url)
-      @link_rules.accept?(url.to_s)
+    def dequeue
+      @queue.shift
     end
 
     #
-    # Returns +true+ if the specified _url_ should be visited, based on
-    # the file extension of the _url_, returns +false+ otherwise.
+    # Determines if a given URL should be visited.
     #
-    def visit_ext?(url)
-      @ext_rules.accept?(File.extname(url.path)[1..-1])
-    end
-
+    # @param [URI::HTTP] url
+    #   The URL in question.
     #
-    # Returns +true+ if the specified URL should be visited, returns
-    # +false+ otherwise.
+    # @return [Boolean]
+    #   Specifies whether the given URL should be visited.
     #
     def visit?(url)
       (!(visited?(url)) &&
-       visit_scheme?(url) &&
-       visit_host?(url) &&
-       visit_port?(url) &&
-       visit_link?(url) &&
-       visit_ext?(url))
+       visit_scheme?(url.scheme) &&
+       visit_host?(url.host) &&
+       visit_port?(url.port) &&
+       visit_link?(url.to_s) &&
+       visit_ext?(url.path))
     end
 
     #
-    # Visits the spedified _url_ and enqueus it's links for visiting. If a
-    # _block_ is given, it will be passed a newly created Page object
-    # for the specified _url_.
+    # Adds a given URL to the failures list.
     #
-    def visit_page(url,&block)
-      get_page(url) do |page|
-        @history << page.url
-
-        page.urls.each { |next_url| enqueue(next_url) }
-
-        @every_page_blocks.each { |page_block| page_block.call(page) }
-
-        block.call(page) if block
-      end
-    end
-
+    # @param [URI::HTTP] url
+    #   The URL to add to the failures list.
     #
-    # Adds the specified _url_ to the failures list.
-    #
     def failed(url)
-      url = URI(url.to_s) unless url.kind_of?(URI)
-
       @every_failed_url_blocks.each { |block| block.call(url) }
       @failures << url
       return true
     end