lib/relevance/tarantula/crawler.rb in relevance-tarantula-0.1.7 vs lib/relevance/tarantula/crawler.rb in relevance-tarantula-0.1.8

- lines prefixed with "-" show the old code as it appeared in relevance-tarantula-0.1.7
+ lines prefixed with "+" show the new code as it appears in relevance-tarantula-0.1.8

@@ -5,25 +5,29 @@ class Relevance::Tarantula::Crawler extend Forwardable include Relevance::Tarantula + class CrawlTimeout < RuntimeError; end + attr_accessor :proxy, :handlers, :skip_uri_patterns, :log_grabber, :reporters, :links_to_crawl, :links_queued, :forms_to_crawl, :form_signatures_queued, :max_url_length, :response_code_handler, - :times_to_crawl, :fuzzers, :test_name - attr_reader :transform_url_patterns, :referrers, :failures, :successes + :times_to_crawl, :fuzzers, :test_name, :crawl_timeout + attr_reader :transform_url_patterns, :referrers, :failures, :successes, :crawl_start_times, :crawl_end_times def initialize @max_url_length = 1024 @successes = [] @failures = [] @handlers = [@response_code_handler = Result] @links_queued = Set.new @form_signatures_queued = Set.new @links_to_crawl = [] @forms_to_crawl = [] + @crawl_start_times, @crawl_end_times = [], [] + @crawl_timeout = 20.minutes @referrers = {} @skip_uri_patterns = [ /^javascript/, /^mailto/, /^http/, @@ -51,17 +55,22 @@ def crawl(url = "/") orig_links_queued = @links_queued.dup orig_form_signatures_queued = @form_signatures_queued.dup orig_links_to_crawl = @links_to_crawl.dup orig_forms_to_crawl = @forms_to_crawl.dup - @times_to_crawl.times do |i| + @times_to_crawl.times do |num| queue_link url - do_crawl + + begin + do_crawl num + rescue CrawlTimeout => e + puts e.message + end + + puts "#{(num+1).ordinalize} crawl" if @times_to_crawl > 1 - puts "#{(i+1).ordinalize} crawl" if @times_to_crawl > 1 - - if i + 1 < @times_to_crawl + if num + 1 < @times_to_crawl @links_queued = orig_links_queued @form_signatures_queued = orig_form_signatures_queued @links_to_crawl = orig_links_to_crawl @forms_to_crawl = orig_forms_to_crawl @referrers = {} @@ -75,23 +84,25 @@ def finished? @links_to_crawl.empty? && @forms_to_crawl.empty? end - def do_crawl + def do_crawl(number) while (!finished?) 
- crawl_queued_links - crawl_queued_forms + @crawl_start_times << Time.now + crawl_queued_links(number) + crawl_queued_forms(number) + @crawl_end_times << Time.now end end - def crawl_queued_links + def crawl_queued_links(number = 0) while (link = @links_to_crawl.pop) response = proxy.send(link.method, link.href) log "Response #{response.code} for #{link}" handle_link_results(link, response) - blip + blip(number) end end def save_result(result) reporters.each do |reporter| @@ -122,17 +133,21 @@ rescue ActiveRecord::RecordNotFound => e log "Skipping #{form.action}, presumed ok that record is missing" Relevance::Tarantula::Response.new(:code => "404", :body => e.message, :content_type => "text/plain") end - def crawl_queued_forms + def crawl_queued_forms(number = 0) while (form = @forms_to_crawl.pop) response = crawl_form(form) handle_form_results(form, response) - blip + blip(number) end end + + def elasped_time_for_pass(num) + Time.now - crawl_start_times[num] + end def grab_log! @log_grabber && @log_grabber.grab! end @@ -232,11 +247,18 @@ def links_completed_count total_links_count - links_remaining_count end - def blip + def blip(number = 0) unless verbose print "\r #{links_completed_count} of #{total_links_count} links completed " + timeout_if_too_long(number) + end + end + + def timeout_if_too_long(number = 0) + if elasped_time_for_pass(number) > crawl_timeout + raise CrawlTimeout, "Exceeded crawl timeout of #{crawl_timeout} seconds - skipping to the next crawl..." end end end