lib/spidr/agent.rb in spidr-0.1.8 vs lib/spidr/agent.rb in spidr-0.1.9
- removed in 0.1.9
+ added in 0.1.9
@@ -5,13 +5,10 @@
require 'net/http'
module Spidr
class Agent
- # URL schemes to visit
- SCHEMES = ['http', 'https']
-
# Proxy to use
attr_accessor :proxy
# User-Agent to use
attr_accessor :user_agent
@@ -20,10 +17,13 @@
attr_accessor :referer
# Delay in between fetching pages
attr_accessor :delay
+ # List of acceptable URL schemes to follow
+ attr_reader :schemes
+
# History containing visited URLs
attr_reader :history
# List of unreachable URLs
attr_reader :failures
@@ -40,26 +40,50 @@
# <tt>:proxy</tt>:: The proxy to use while spidering.
# <tt>:user_agent</tt>:: The User-Agent string to send.
# <tt>:referer</tt>:: The referer URL to send.
# <tt>:delay</tt>:: Duration in seconds to pause between spidering each
# link. Defaults to 0.
+ # <tt>:schemes</tt>:: The list of acceptable URL schemes to follow.
+ # Defaults to +http+ and +https+. +https+ URL
+ # schemes will be ignored if <tt>net/https</tt>
+ # cannot be loaded.
# <tt>:host</tt>:: The host-name to visit.
# <tt>:hosts</tt>:: An +Array+ of host patterns to visit.
# <tt>:ignore_hosts</tt>:: An +Array+ of host patterns to not visit.
# <tt>:ports</tt>:: An +Array+ of port patterns to visit.
# <tt>:ignore_ports</tt>:: An +Array+ of port patterns to not visit.
# <tt>:links</tt>:: An +Array+ of link patterns to visit.
# <tt>:ignore_links</tt>:: An +Array+ of link patterns to not visit.
# <tt>:exts</tt>:: An +Array+ of file extension patterns to visit.
# <tt>:ignore_exts</tt>:: An +Array+ of file extension patterns to not
# visit.
+ # <tt>:queue</tt>:: An initial queue of URLs to visit.
+ # <tt>:history</tt>:: An initial list of visited URLs.
#
def initialize(options={},&block)
@proxy = (options[:proxy] || Spidr.proxy)
@user_agent = (options[:user_agent] || Spidr.user_agent)
@referer = options[:referer]
+ @schemes = []
+
+ if options[:schemes]
+ @schemes += options[:schemes]
+ else
+ @schemes << 'http'
+
+ begin
+ require 'net/https'
+
+ @schemes << 'https'
+ rescue Gem::LoadError => e
+ raise(e)
+ rescue ::LoadError
+ STDERR.puts "Warning: cannot load 'net/https', https support disabled"
+ end
+ end
+
@host_rules = Rules.new(
:accept => options[:hosts],
:reject => options[:ignore_hosts]
)
@port_rules = Rules.new(
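Schemes are now per-agent state rather than the removed SCHEMES constant. A
minimal usage sketch (the scheme lists are illustrative; Spidr::Agent.new and
the :schemes option come straight from this hunk):

    require 'spidr'

    # Follow only plain-HTTP links.
    http_only = Spidr::Agent.new(:schemes => ['http'])

    # With no :schemes option the agent starts from ['http'] and adds
    # 'https' only when `require 'net/https'` succeeds.
    default_agent = Spidr::Agent.new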
@@ -89,10 +113,18 @@
if options[:host]
visit_hosts_like(options[:host])
end
+ if options[:queue]
+ self.queue = options[:queue]
+ end
+
+ if options[:history]
+ self.history = options[:history]
+ end
+
block.call(self) if block
end
#
# Creates a new Agent object with the given _options_ and will begin
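The new :queue and :history options make a crawl resumable from saved state.
A short sketch (the URLs are placeholders):

    # Seed the pending queue and pre-mark URLs as already visited,
    # e.g. to pick up a crawl that was serialized earlier.
    agent = Spidr::Agent.new(
      :history => ['http://example.com/'],
      :queue   => ['http://example.com/next']
    )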
@@ -359,14 +391,13 @@
@failures.clear
return self
end
#
- # Clear the history and start spidering at the specified _url_.
+ # Start spidering at the specified _url_.
#
def start_at(url)
- clear
enqueue(url)
return continue!
end
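With the clear call removed, start_at now preserves accumulated history and
failures between runs; callers that want the old reset-then-spider behavior
can call clear themselves (its tail is the leading context of this hunk).
A sketch with a placeholder URL:

    agent.start_at('http://example.com/')   # keeps existing history

    agent.clear                             # explicit reset, as 0.1.8 did
    agent.start_at('http://example.com/')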
@@ -412,10 +443,20 @@
@paused = true
return self
end
#
+ # Sets the list of acceptable URL schemes to follow to the
+ # _new_schemes_.
+ #
+ # agent.schemes = ['http']
+ #
+ def schemes=(new_schemes)
+ @schemes = new_schemes.map { |scheme| scheme.to_s }
+ end
+
+ #
# Sets the history of links that were previously visited to the
# specified _new_history_.
#
# agent.history = ['http://tenderlovemaking.com/2009/05/06/ann-nokogiri-130rc1-has-been-released/']
#
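Because the schemes= writer passes every entry through to_s, symbols are
normalized to the strings that visit_scheme? compares against:

    agent.schemes = [:http, :https]
    agent.schemes   #=> ["http", "https"]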
@@ -573,10 +614,10 @@
# Returns +true+ if the specified _url_ should be visited, based on
# its scheme; returns +false+ otherwise.
#
def visit_scheme?(url)
if url.scheme
- return SCHEMES.include?(url.scheme)
+ return @schemes.include?(url.scheme)
else
return true
end
end
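Scheme filtering is thus driven by the per-agent list, and schemeless
(relative) URLs still pass. visit_scheme? is an internal predicate, so this
sketch calls it via send purely for illustration:

    require 'uri'
    require 'spidr'

    agent = Spidr::Agent.new(:schemes => ['http'])

    agent.send(:visit_scheme?, URI('http://example.com/'))   #=> true
    agent.send(:visit_scheme?, URI('https://example.com/'))  #=> false
    agent.send(:visit_scheme?, URI('page.html'))             #=> true (no scheme)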