lib/spidr/agent.rb in spidr-0.2.7 vs lib/spidr/agent.rb in spidr-0.3.0

- old
+ new

@@ -46,11 +46,17 @@ # Queue of URLs to visit attr_reader :queue # Cached cookies attr_reader :cookies + + # Maximum depth + attr_reader :max_depth + # The visited URLs and their depth within a site + attr_reader :levels + # # Creates a new Agent object. # # @param [Hash] options # Additional options @@ -89,10 +95,13 @@ # The initial queue of URLs to visit. # # @option options [Set, Array] :history # The initial list of visited URLs. # + # @option options [Integer] :max_depth + # The maximum link depth to follow. + # # @yield [agent] # If a block is given, it will be passed the newly created agent # for further configuration. # # @yieldparam [Agent] agent @@ -117,10 +126,13 @@ @delay = (options[:delay] || 0) @history = Set[] @failures = Set[] @queue = [] + @levels = Hash.new(0) + @max_depth = options[:max_depth] + super(options) yield self if block_given? end @@ -448,11 +460,11 @@ # The URL to enqueue for visiting. # # @return [Boolean] # Specifies whether the URL was enqueued, or ignored. # - def enqueue(url) + def enqueue(url,level=0) url = sanitize_url(url) if (!(queued?(url)) && visit?(url)) link = url.to_s @@ -475,18 +487,19 @@ raise(action) rescue Actions::SkipLink return false rescue Actions::Action end - + @queue << url + @levels[url] = level return true end return false end - + # # Requests and creates a new Page object from a given URL. # # @param [URI::HTTP] url # The URL to request. @@ -566,11 +579,11 @@ # @return [Page, nil] # The page that was visited. If `nil` is returned, either the request # for the page failed, or the page was skipped. # def visit_page(url) - url = URI(url.to_s) unless url.kind_of?(URI) + url = sanitize_url(url) get_page(url) do |page| @history << page.url begin @@ -582,11 +595,11 @@ rescue Actions::SkipPage return nil rescue Actions::Action end - page.urls.each do |next_url| + page.each_url do |next_url| begin @every_link_blocks.each do |link_block| link_block.call(page.url,next_url) end rescue Actions::Paused => action @@ -594,10 +607,12 @@ rescue Actions::SkipLink next rescue Actions::Action end - enqueue(next_url) + if (@max_depth.nil? || @max_depth > @levels[url]) + enqueue(next_url,@levels[url] + 1) + end end end end #