lib/spidr/agent.rb in spidr-0.3.1 vs lib/spidr/agent.rb in spidr-0.3.2
- old
+ new
@@ -113,27 +113,30 @@
if options[:host_headers]
@host_headers.merge!(options[:host_headers])
end
- @user_agent = (options[:user_agent] || Spidr.user_agent)
+ @user_agent = options.fetch(:user_agent,Spidr.user_agent)
@referer = options[:referer]
- @sessions = SessionCache.new(options[:proxy] || Spidr.proxy)
+ @sessions = SessionCache.new(options.fetch(:proxy,Spidr.proxy))
@cookies = CookieJar.new
@authorized = AuthStore.new
@running = false
- @delay = (options[:delay] || 0)
+ @delay = options.fetch(:delay,0)
@history = Set[]
@failures = Set[]
@queue = []
@levels = Hash.new(0)
@max_depth = options[:max_depth]
- super(options)
+ initialize_sanitizers(options)
+ initialize_filters(options)
+ initialize_actions(options)
+ initialize_events(options)
yield self if block_given?
end
#
@@ -150,23 +153,20 @@
# before it begins spidering.
#
# @yieldparam [Agent] agent
# The newly created agent.
#
- def self.start_at(url,options={})
- self.new(options) do |spider|
- yield spider if block_given?
-
- spider.start_at(url)
- end
+ def self.start_at(url,options={},&block)
# The caller's block is forwarded to the constructor, and spidering is
# started on the returned agent afterwards. The removed version instead
# called spider.start_at(url) from inside the block it handed to new.
# NOTE(review): the result is now whatever the instance-level start_at
# returns rather than the result of new -- confirm callers do not depend
# on the difference.
+ agent = new(options,&block)
+ agent.start_at(url)
end
#
- # Creates a new agent and spiders the given host.
+ # Creates a new agent and spiders the web-site located at the given URL.
#
- # @param [String]
- # The host-name to spider.
+ # @param [URI::HTTP, String] url
+ # The web-site to spider.
#
# @param [Hash] options
# Additional options. See {Agent#initialize}.
#
# @yield [agent]
@@ -174,23 +174,22 @@
# before it begins spidering.
#
# @yieldparam [Agent] agent
# The newly created agent.
#
- def self.host(name,options={})
- self.new(options.merge(:host => name)) do |spider|
- yield spider if block_given?
+ def self.site(url,options={},&block)
# Anything that is not already a URI (e.g. a String) is converted through
# its string form so that url.host can be read below.
+ url = URI(url.to_s) unless url.kind_of?(URI)
- spider.start_at("http://#{name}/")
- end
# The URL's host is passed to the constructor as the :host option
# (presumably limiting which hosts are visited -- see Agent#initialize),
# then spidering begins at the URL itself.
+ agent = new(options.merge(:host => url.host),&block)
+ agent.start_at(url)
end
#
- # Creates a new agent and spiders the web-site located at the given URL.
+ # Creates a new agent and spiders the given host.
#
- # @param [URI::HTTP, String] url
- # The web-site to spider.
+ # @param [String] name
+ # The host-name to spider.
#
# @param [Hash] options
# Additional options. See {Agent#initialize}.
#
# @yield [agent]
@@ -198,18 +197,12 @@
# before it begins spidering.
#
# @yieldparam [Agent] agent
# The newly created agent.
#
- def self.site(url,options={})
- url = URI(url.to_s)
-
- return self.new(options.merge(:host => url.host)) do |spider|
- yield spider if block_given?
-
- spider.start_at(url)
- end
+ def self.host(name,options={},&block)
# Builds an http URI for the given host name with the root path '/' and
# delegates to site, forwarding the options and the caller's block.
+ site(URI::HTTP.build(:host => name, :path => '/'),options,&block)
end
#
# Clears the history of the agent.
#
@@ -232,11 +225,10 @@
# @yieldparam [Page] page
# A page which has been visited.
#
def start_at(url,&block)
# Queue the starting URL, then hand control (and the optional block) to
# run; the value returned by run is returned to the caller.
enqueue(url)
-
return run(&block)
end
#
# Start spidering until the queue becomes empty or the agent is
@@ -259,11 +251,10 @@
rescue Actions::Action
end
end
@running = false
-
@sessions.clear
return self
end
#
@@ -385,14 +376,14 @@
def failures=(new_failures)
# Replaces (rather than extends) the recorded failures: the existing
# collection is emptied before the new entries are added.
@failures.clear
new_failures.each do |url|
# Entries that are not already URI objects are converted through their
# string form; URI objects are stored unchanged.
@failures << unless url.kind_of?(URI)
- URI(url.to_s)
- else
- url
- end
+ URI(url.to_s)
+ else
+ url
+ end
end
# Returns the updated failures collection.
return @failures
end
@@ -469,11 +460,11 @@
link = url.to_s
begin
@every_url_blocks.each { |url_block| url_block.call(url) }
- @urls_like_blocks.each do |pattern,url_blocks|
+ @every_url_like_blocks.each do |pattern,url_blocks|
match = case pattern
when Regexp
link =~ pattern
else
(pattern == link) || (pattern == url)
@@ -651,17 +642,16 @@
# @since 0.2.2
#
def prepare_request(url,&block)
host = url.host
port = url.port
+ path = unless url.path.empty?
+ url.path
+ else
+ '/'
+ end
- unless url.path.empty?
- path = url.path
- else
- path = '/'
- end
-
# append the URL query to the path
path += "?#{url.query}" if url.query
# set any additional HTTP headers
headers = {}
@@ -722,10 +712,10 @@
#
# @return [Boolean]
# Specifies whether the given URL should be visited.
#
def visit?(url)
- !(visited?(url)) &&
+ !visited?(url) &&
visit_scheme?(url.scheme) &&
visit_host?(url.host) &&
visit_port?(url.port) &&
visit_link?(url.to_s) &&
visit_url?(url) &&