lib/spidr/agent.rb in spidr-0.2.7 vs lib/spidr/agent.rb in spidr-0.3.0
- old
+ new
@@ -46,11 +46,17 @@
# Queue of URLs to visit
attr_reader :queue
# Cached cookies
attr_reader :cookies
+
+ # Maximum link depth to follow (nil means no limit)
+ attr_reader :max_depth
+ # The enqueued URLs and their link depth within a site
+ attr_reader :levels
+
#
# Creates a new Agent object.
#
# @param [Hash] options
# Additional options
@@ -89,10 +95,13 @@
# The initial queue of URLs to visit.
#
# @option options [Set, Array] :history
# The initial list of visited URLs.
#
+ # @option options [Integer] :max_depth
+ # The maximum link depth to follow. When omitted, depth is unlimited.
+ #
# @yield [agent]
# If a block is given, it will be passed the newly created agent
# for further configuration.
#
# @yieldparam [Agent] agent
@@ -117,10 +126,13 @@
@delay = (options[:delay] || 0)
@history = Set[]
@failures = Set[]
@queue = []
+ @levels = Hash.new(0)
+ @max_depth = options[:max_depth]
+
super(options)
yield self if block_given?
end
@@ -448,11 +460,11 @@
# The URL to enqueue for visiting.
#
# @return [Boolean]
# Specifies whether the URL was enqueued, or ignored.
#
- def enqueue(url)
+ def enqueue(url,level=0)
url = sanitize_url(url)
if (!(queued?(url)) && visit?(url))
link = url.to_s
@@ -475,18 +487,19 @@
raise(action)
rescue Actions::SkipLink
return false
rescue Actions::Action
end
-
+
@queue << url
+ @levels[url] = level
return true
end
return false
end
-
+
#
# Requests and creates a new Page object from a given URL.
#
# @param [URI::HTTP] url
# The URL to request.
@@ -566,11 +579,11 @@
# @return [Page, nil]
# The page that was visited. If `nil` is returned, either the request
# for the page failed, or the page was skipped.
#
def visit_page(url)
- url = URI(url.to_s) unless url.kind_of?(URI)
+ url = sanitize_url(url)
get_page(url) do |page|
@history << page.url
begin
@@ -582,11 +595,11 @@
rescue Actions::SkipPage
return nil
rescue Actions::Action
end
- page.urls.each do |next_url|
+ page.each_url do |next_url|
begin
@every_link_blocks.each do |link_block|
link_block.call(page.url,next_url)
end
rescue Actions::Paused => action
@@ -594,10 +607,12 @@
rescue Actions::SkipLink
next
rescue Actions::Action
end
- enqueue(next_url)
+ if (@max_depth.nil? || @max_depth > @levels[url])
+ enqueue(next_url,@levels[url] + 1)
+ end
end
end
end
#