lib/spidr/agent.rb in spidr-0.1.8 vs lib/spidr/agent.rb in spidr-0.1.9

- old
+ new

@@ -5,13 +5,10 @@
 require 'net/http'
 
 module Spidr
   class Agent
 
-    # URL schemes to visit
-    SCHEMES = ['http', 'https']
-
     # Proxy to use
     attr_accessor :proxy
 
     # User-Agent to use
     attr_accessor :user_agent
@@ -20,10 +17,13 @@
     attr_accessor :referer
 
     # Delay in between fetching pages
     attr_accessor :delay
 
+    # List of acceptable URL schemes to follow
+    attr_reader :schemes
+
     # History containing visited URLs
     attr_reader :history
 
     # List of unreachable URLs
     attr_reader :failures
@@ -40,26 +40,50 @@
     # <tt>:proxy</tt>:: The proxy to use while spidering.
     # <tt>:user_agent</tt>:: The User-Agent string to send.
     # <tt>:referer</tt>:: The referer URL to send.
     # <tt>:delay</tt>:: Duration in seconds to pause between spidering each
     #                   link. Defaults to 0.
+    # <tt>:schemes</tt>:: The list of acceptable URL schemes to follow.
+    #                     Defaults to +http+ and +https+. +https+ URL
+    #                     schemes will be ignored if <tt>net/https</tt>
+    #                     cannot be loaded.
     # <tt>:host</tt>:: The host-name to visit.
     # <tt>:hosts</tt>:: An +Array+ of host patterns to visit.
     # <tt>:ignore_hosts</tt>:: An +Array+ of host patterns to not visit.
     # <tt>:ports</tt>:: An +Array+ of port patterns to visit.
     # <tt>:ignore_ports</tt>:: An +Array+ of port patterns to not visit.
     # <tt>:links</tt>:: An +Array+ of link patterns to visit.
     # <tt>:ignore_links</tt>:: An +Array+ of link patterns to not visit.
     # <tt>:exts</tt>:: An +Array+ of File extension patterns to visit.
     # <tt>:ignore_exts</tt>:: An +Array+ of File extension patterns to not
     #                         visit.
+    # <tt>:queue</tt>:: An initial queue of URLs to visit.
+    # <tt>:history</tt>:: An initial list of visited URLs.
     #
     def initialize(options={},&block)
       @proxy = (options[:proxy] || Spidr.proxy)
       @user_agent = (options[:user_agent] || Spidr.user_agent)
       @referer = options[:referer]
 
+      @schemes = []
+
+      if options[:schemes]
+        @schemes += options[:schemes]
+      else
+        @schemes << 'http'
+
+        begin
+          require 'net/https'
+
+          @schemes << 'https'
+        rescue Gem::LoadError => e
+          raise(e)
+        rescue ::LoadError
+          STDERR.puts "Warning: cannot load 'net/https', https support disabled"
+        end
+      end
+
       @host_rules = Rules.new(
         :accept => options[:hosts],
         :reject => options[:ignore_hosts]
       )
 
       @port_rules = Rules.new(
@@ -89,10 +113,18 @@
       if options[:host]
         visit_hosts_like(options[:host])
       end
 
+      if options[:queue]
+        self.queue = options[:queue]
+      end
+
+      if options[:history]
+        self.history = options[:history]
+      end
+
       block.call(self) if block
     end
 
     #
     # Creates a new Agent object with the given _options_ and will begin
@@ -359,14 +391,13 @@
       @failures.clear
       return self
     end
 
     #
-    # Clear the history and start spidering at the specified _url_.
+    # Start spidering at the specified _url_.
     #
     def start_at(url)
-      clear
       enqueue(url)
 
       return continue!
     end
@@ -412,10 +443,20 @@
       @paused = true
       return self
     end
 
     #
+    # Sets the list of acceptable URL schemes to follow to the
+    # _new_schemes_.
+    #
+    #   agent.schemes = ['http']
+    #
+    def schemes=(new_schemes)
+      @schemes = new_schemes.map { |scheme| scheme.to_s }
+    end
+
+    #
     # Sets the history of links that were previously visited to the
     # specified _new_history_.
     #
     #   agent.history = ['http://tenderlovemaking.com/2009/05/06/ann-nokogiri-130rc1-has-been-released/']
     #
@@ -573,10 +614,10 @@
     # Returns +true+ if the specified _url_ should be visited, based on
     # its scheme, returns +false+ otherwise.
     #
     def visit_scheme?(url)
      if url.scheme
-        return SCHEMES.include?(url.scheme)
+        return @schemes.include?(url.scheme)
      else
        return true
      end
    end
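
The hard-coded SCHEMES constant is replaced by a per-agent @schemes list,
configurable at construction time or via the new schemes= setter. A minimal
sketch of how the new API might be used (the option values here are
illustrative, not from the diff):

  require 'spidr'

  # Follow only plain-HTTP links. By default the agent accepts 'http',
  # plus 'https' when net/https can be loaded.
  agent = Spidr::Agent.new(:schemes => ['http'])

  # The schemes= setter maps each value through to_s, so Symbols are
  # normalized to the Strings that visit_scheme? compares against.
  agent.schemes = [:http, :https]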
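
The new :queue and :history options, combined with start_at no longer
calling clear, make it possible to seed an agent with prior state. A sketch
assuming hypothetical example URLs:

  require 'spidr'

  # Seed the agent with pending and already-visited URLs, e.g. saved
  # from a previous run.
  agent = Spidr::Agent.new(
    :queue => ['http://example.com/next'],
    :history => ['http://example.com/seen']
  )

  # start_at no longer clears state, so the seeded queue and history
  # survive; it simply enqueues the URL and continues spidering.
  agent.start_at('http://example.com/')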