lib/spidr/agent.rb in spidr-0.1.8 vs lib/spidr/agent.rb in spidr-0.1.9

- old
+ new

@@ -5,13 +5,10 @@
 require 'net/http'
 
 module Spidr
   class Agent
 
-    # URL schemes to visit
-    SCHEMES = ['http', 'https']
-
     # Proxy to use
     attr_accessor :proxy
 
     # User-Agent to use
     attr_accessor :user_agent
@@ -20,10 +17,13 @@
     attr_accessor :referer
 
     # Delay in between fetching pages
     attr_accessor :delay
 
+    # List of acceptable URL schemes to follow
+    attr_reader :schemes
+
     # History containing visited URLs
     attr_reader :history
 
     # List of unreachable URLs
     attr_reader :failures
@@ -40,26 +40,50 @@
     # <tt>:proxy</tt>:: The proxy to use while spidering.
     # <tt>:user_agent</tt>:: The User-Agent string to send.
     # <tt>:referer</tt>:: The referer URL to send.
     # <tt>:delay</tt>:: Duration in seconds to pause between spidering each
     #                   link. Defaults to 0.
+    # <tt>:schemes</tt>:: The list of acceptable URL schemes to follow.
+    #                     Defaults to +http+ and +https+. +https+ URL
+    #                     schemes will be ignored if <tt>net/https</tt>
+    #                     cannot be loaded.
     # <tt>:host</tt>:: The host-name to visit.
     # <tt>:hosts</tt>:: An +Array+ of host patterns to visit.
     # <tt>:ignore_hosts</tt>:: An +Array+ of host patterns to not visit.
     # <tt>:ports</tt>:: An +Array+ of port patterns to visit.
     # <tt>:ignore_ports</tt>:: An +Array+ of port patterns to not visit.
     # <tt>:links</tt>:: An +Array+ of link patterns to visit.
     # <tt>:ignore_links</tt>:: An +Array+ of link patterns to not visit.
     # <tt>:exts</tt>:: An +Array+ of File extension patterns to visit.
     # <tt>:ignore_exts</tt>:: An +Array+ of File extension patterns to not
     #                         visit.
+    # <tt>:queue</tt>:: An initial queue of URLs to visit.
+    # <tt>:history</tt>:: An initial list of visited URLs.
     #
     def initialize(options={},&block)
       @proxy = (options[:proxy] || Spidr.proxy)
       @user_agent = (options[:user_agent] || Spidr.user_agent)
       @referer = options[:referer]
 
+      @schemes = []
+
+      if options[:schemes]
+        @schemes += options[:schemes]
+      else
+        @schemes << 'http'
+
+        begin
+          require 'net/https'
+
+          @schemes << 'https'
+        rescue Gem::LoadError => e
+          raise(e)
+        rescue ::LoadError
+          STDERR.puts "Warning: cannot load 'net/https', https support disabled"
+        end
+      end
+
       @host_rules = Rules.new(
         :accept => options[:hosts],
         :reject => options[:ignore_hosts]
       )
 
       @port_rules = Rules.new(
@@ -89,10 +113,18 @@
       if options[:host]
         visit_hosts_like(options[:host])
       end
 
+      if options[:queue]
+        self.queue = options[:queue]
+      end
+
+      if options[:history]
+        self.history = options[:history]
+      end
+
       block.call(self) if block
     end
 
     #
     # Creates a new Agent object with the given _options_ and will begin
@@ -359,14 +391,13 @@
       @failures.clear
       return self
     end
 
     #
-    # Clear the history and start spidering at the specified _url_.
+    # Start spidering at the specified _url_.
     #
     def start_at(url)
-      clear
       enqueue(url)
 
       return continue!
     end
@@ -412,10 +443,20 @@
       @paused = true
       return self
     end
 
     #
+    # Sets the list of acceptable URL schemes to follow to the
+    # _new_schemes_.
+    #
+    #   agent.schemes = ['http']
+    #
+    def schemes=(new_schemes)
+      @schemes = new_schemes.map { |scheme| scheme.to_s }
+    end
+
+    #
     # Sets the history of links that were previously visited to the
     # specified _new_history_.
     #
     #   agent.history = ['http://tenderlovemaking.com/2009/05/06/ann-nokogiri-130rc1-has-been-released/']
     #
@@ -573,10 +614,10 @@
     # Returns +true+ if the specified _url_ should be visited, based on
     # its scheme, returns +false+ otherwise.
     #
     def visit_scheme?(url)
      if url.scheme
-        return SCHEMES.include?(url.scheme)
+        return @schemes.include?(url.scheme)
      else
        return true
      end
    end
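
The hard-coded SCHEMES constant is replaced by a per-agent @schemes list,
configurable at construction time or via the new schemes= setter. A minimal
sketch of how the new API might be used (the option values here are
illustrative, not from the diff):

  require 'spidr'

  # Follow only plain-HTTP links. By default the agent accepts 'http',
  # plus 'https' when net/https can be loaded.
  agent = Spidr::Agent.new(:schemes => ['http'])

  # The schemes= setter maps each value through to_s, so Symbols are
  # normalized to the Strings that visit_scheme? compares against.
  agent.schemes = [:http, :https]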
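
The new :queue and :history options, combined with start_at no longer
calling clear, make it possible to seed an agent with prior state. A sketch
assuming hypothetical example URLs:

  require 'spidr'

  # Seed the agent with pending and already-visited URLs, e.g. saved
  # from a previous run.
  agent = Spidr::Agent.new(
    :queue => ['http://example.com/next'],
    :history => ['http://example.com/seen']
  )

  # start_at no longer clears state, so the seeded queue and history
  # survive; it simply enqueues the URL and continues spidering.
  agent.start_at('http://example.com/')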