lib/relevance/tarantula/crawler.rb in relevance-tarantula-0.1.7 vs lib/relevance/tarantula/crawler.rb in relevance-tarantula-0.1.8
- old
+ new
@@ -5,25 +5,29 @@
class Relevance::Tarantula::Crawler
extend Forwardable
include Relevance::Tarantula
# Raised by timeout_if_too_long when a crawl pass has run longer than
# crawl_timeout; rescued in #crawl, which prints the message and carries on.
+ class CrawlTimeout < RuntimeError; end
+
attr_accessor :proxy, :handlers, :skip_uri_patterns, :log_grabber,
:reporters, :links_to_crawl, :links_queued, :forms_to_crawl,
:form_signatures_queued, :max_url_length, :response_code_handler,
- :times_to_crawl, :fuzzers, :test_name
- attr_reader :transform_url_patterns, :referrers, :failures, :successes
+ :times_to_crawl, :fuzzers, :test_name, :crawl_timeout
+ attr_reader :transform_url_patterns, :referrers, :failures, :successes, :crawl_start_times, :crawl_end_times
def initialize
@max_url_length = 1024
@successes = []
@failures = []
@handlers = [@response_code_handler = Result]
@links_queued = Set.new
@form_signatures_queued = Set.new
@links_to_crawl = []
@forms_to_crawl = []
+ @crawl_start_times, @crawl_end_times = [], []
+ @crawl_timeout = 20.minutes
@referrers = {}
@skip_uri_patterns = [
/^javascript/,
/^mailto/,
/^http/,
@@ -51,17 +55,22 @@
def crawl(url = "/")
orig_links_queued = @links_queued.dup
orig_form_signatures_queued = @form_signatures_queued.dup
orig_links_to_crawl = @links_to_crawl.dup
orig_forms_to_crawl = @forms_to_crawl.dup
- @times_to_crawl.times do |i|
+ @times_to_crawl.times do |num|
queue_link url
- do_crawl
+
+ begin
+ do_crawl num
+ rescue CrawlTimeout => e
+ puts e.message
+ end
+
+ puts "#{(num+1).ordinalize} crawl" if @times_to_crawl > 1
- puts "#{(i+1).ordinalize} crawl" if @times_to_crawl > 1
-
- if i + 1 < @times_to_crawl
+ if num + 1 < @times_to_crawl
@links_queued = orig_links_queued
@form_signatures_queued = orig_form_signatures_queued
@links_to_crawl = orig_links_to_crawl
@forms_to_crawl = orig_forms_to_crawl
@referrers = {}
@@ -75,23 +84,25 @@
# True once there is nothing left to crawl: both the pending-link queue and
# the pending-form queue are empty.
def finished?
  [@links_to_crawl, @forms_to_crawl].all?(&:empty?)
end
# Runs a single crawl pass: alternately drains the link queue and the form
# queue until finished? reports that both are empty.
#
# number - zero-based index of this pass; forwarded to the queue drainers and
#          used as the slot for this pass in the start/end time arrays.
#
# Any exception from the drainers (including CrawlTimeout) propagates to the
# caller.
def do_crawl(number)
  # Stamp the pass exactly once, at its own index. elasped_time_for_pass looks
  # the start time up by pass number, so appending a timestamp on every loop
  # iteration would make later passes read a timestamp from an earlier pass.
  @crawl_start_times[number] = Time.now
  until finished?
    crawl_queued_links(number)
    crawl_queued_forms(number)
  end
ensure
  # Record the end even when the pass is aborted, so the start and end arrays
  # stay aligned index-for-index.
  @crawl_end_times[number] = Time.now
end
# Pops links off @links_to_crawl until the queue yields nothing, requesting
# each one through the proxy, logging the response code, handing the response
# to handle_link_results and then ticking progress via blip.
#
# number - crawl pass index passed through to blip (defaults to 0).
def crawl_queued_links(number = 0)
  current = @links_to_crawl.pop
  while current
    reply = proxy.send(current.method, current.href)
    log "Response #{reply.code} for #{current}"
    handle_link_results(current, reply)
    blip(number)
    # Fetch the next link only after blip, so an exception raised there
    # leaves the remaining queue untouched.
    current = @links_to_crawl.pop
  end
end
def save_result(result)
reporters.each do |reporter|
@@ -122,17 +133,21 @@
rescue ActiveRecord::RecordNotFound => e
log "Skipping #{form.action}, presumed ok that record is missing"
Relevance::Tarantula::Response.new(:code => "404", :body => e.message, :content_type => "text/plain")
end
# Pops forms off @forms_to_crawl until the queue yields nothing, submitting
# each one with crawl_form, passing the response to handle_form_results and
# then ticking progress via blip.
#
# number - crawl pass index passed through to blip (defaults to 0).
def crawl_queued_forms(number = 0)
  current = @forms_to_crawl.pop
  while current
    reply = crawl_form(current)
    handle_form_results(current, reply)
    blip(number)
    # Fetch the next form only after blip, so an exception raised there
    # leaves the remaining queue untouched.
    current = @forms_to_crawl.pop
  end
end
+
# Time in seconds since the recorded start of crawl pass +num+.
#
# num - zero-based pass index into crawl_start_times.
#
# Returns a Float. When no start time has been recorded for that pass (for
# example the queues are being drained without do_crawl having run), returns
# 0.0 instead of failing on `Time.now - nil`.
#
# The method name is misspelled ("elasped") but is kept as-is because
# timeout_if_too_long calls it by this name.
def elasped_time_for_pass(num)
  started_at = crawl_start_times[num]
  return 0.0 unless started_at

  Time.now - started_at
end
# Calls grab! on the configured log grabber, if there is one.
#
# Returns whatever grab! returns, or the falsy @log_grabber value itself
# (nil/false) when no grabber is set.
def grab_log!
  grabber = @log_grabber
  return grabber unless grabber

  grabber.grab!
end
@@ -232,11 +247,18 @@
# Number of links already processed: the total link count minus the count
# still remaining.
def links_completed_count
  total = total_links_count
  remaining = links_remaining_count
  total - remaining
end
# Progress tick. When not running verbosely, rewrites the current console
# line with the completed/total link counts and then checks the crawl timeout
# for the given pass.
#
# number - crawl pass index passed through to timeout_if_too_long (default 0).
#
# NOTE(review): the timeout check only happens on the non-verbose path, so a
# verbose run is never timed out here - confirm that is intended.
def blip(number = 0)
  return if verbose

  print "\r #{links_completed_count} of #{total_links_count} links completed "
  timeout_if_too_long(number)
end
+
# Aborts the current crawl pass once it has run longer than crawl_timeout.
#
# number - crawl pass index whose elapsed time is checked (default 0).
#
# Returns nil when the pass is within the limit, or when crawl_timeout is
# nil/false. A falsy crawl_timeout is treated as "no timeout" rather than
# letting the comparison raise.
# Raises CrawlTimeout when the elapsed time exceeds crawl_timeout.
def timeout_if_too_long(number = 0)
  limit = crawl_timeout
  return unless limit
  return unless elasped_time_for_pass(number) > limit

  raise CrawlTimeout, "Exceeded crawl timeout of #{limit} seconds - skipping to the next crawl..."
end
end