lib/relevance/tarantula/crawler.rb in relevance-tarantula-0.2.1 vs lib/relevance/tarantula/crawler.rb in relevance-tarantula-0.3.2

- old
+ new

@@ -8,11 +8,11 @@
   include Relevance::Tarantula
 
   class CrawlTimeout < RuntimeError; end
 
   attr_accessor :proxy, :handlers, :skip_uri_patterns, :log_grabber,
-                :reporters, :links_to_crawl, :links_queued, :forms_to_crawl,
+                :reporters, :crawl_queue, :links_queued, :form_signatures_queued,
                 :max_url_length, :response_code_handler,
                 :times_to_crawl, :fuzzers, :test_name, :crawl_timeout
   attr_reader :transform_url_patterns, :referrers, :failures, :successes,
               :crawl_start_times, :crawl_end_times
 
   def initialize
@@ -20,12 +20,11 @@
     @successes = []
     @failures = []
     @handlers = [@response_code_handler = Result]
     @links_queued = Set.new
     @form_signatures_queued = Set.new
-    @links_to_crawl = []
-    @forms_to_crawl = []
+    @crawl_queue = []
     @crawl_start_times, @crawl_end_times = [], []
     @crawl_timeout = 20.minutes
     @referrers = {}
     @skip_uri_patterns = [
       /^javascript/,
@@ -53,12 +52,11 @@
   end
 
   def crawl(url = "/")
     orig_links_queued = @links_queued.dup
     orig_form_signatures_queued = @form_signatures_queued.dup
-    orig_links_to_crawl = @links_to_crawl.dup
-    orig_forms_to_crawl = @forms_to_crawl.dup
+    orig_crawl_queue = @crawl_queue.dup
     @times_to_crawl.times do |num|
       queue_link url
 
       begin
         do_crawl num
@@ -69,89 +67,79 @@
       puts "#{(num+1).ordinalize} crawl" if @times_to_crawl > 1
 
       if num + 1 < @times_to_crawl
         @links_queued = orig_links_queued
         @form_signatures_queued = orig_form_signatures_queued
-        @links_to_crawl = orig_links_to_crawl
-        @forms_to_crawl = orig_forms_to_crawl
+        @crawl_queue = orig_crawl_queue
         @referrers = {}
       end
     end
   rescue Interrupt
     $stderr.puts "CTRL-C"
   ensure
     report_results
   end
 
   def finished?
-    @links_to_crawl.empty? && @forms_to_crawl.empty?
+    @crawl_queue.empty?
   end
 
   def do_crawl(number)
     while (!finished?)
       @crawl_start_times << Time.now
-      crawl_queued_links(number)
-      crawl_queued_forms(number)
+      crawl_the_queue(number)
       @crawl_end_times << Time.now
     end
   end
 
-  def crawl_queued_links(number = 0)
-    while (link = @links_to_crawl.pop)
-      response = proxy.send(link.method, link.href)
-      log "Response #{response.code} for #{link}"
-      handle_link_results(link, response)
+  def crawl_the_queue(number = 0)
+    while (request = @crawl_queue.pop)
+      request.crawl
       blip(number)
     end
   end
 
   def save_result(result)
     reporters.each do |reporter|
       reporter.report(result)
     end
   end
 
-  def handle_link_results(link, response)
+  def handle_link_results(link, result)
     handlers.each do |h|
       begin
-        save_result h.handle(Result.new(:method => link.method,
-                                        :url => link.href,
-                                        :response => response,
-                                        :log => grab_log!,
-                                        :referrer => referrers[link],
-                                        :test_name => test_name).freeze)
+        save_result h.handle(result)
       rescue Exception => e
        log "error handling #{link} #{e.message}" # TODO: pass to results
      end
    end
  end
 
-  def crawl_form(form)
-    response = proxy.send(form.method, form.action, form.data)
-    log "Response #{response.code} for #{form}"
-    response
-  rescue ActiveRecord::RecordNotFound => e
-    log "Skipping #{form.action}, presumed ok that record is missing"
-    Relevance::Tarantula::Response.new(:code => "404", :body => e.message, :content_type => "text/plain")
+  def follow(method, url, data=nil)
+    proxy.send(method, url, data)
   end
-
-  def crawl_queued_forms(number = 0)
-    while (form = @forms_to_crawl.pop)
-      response = crawl_form(form)
-      handle_form_results(form, response)
-      blip(number)
-    end
-  end
+
+  def submit(method, action, data)
+    proxy.send(method, action, data)
+  end
+
   def elasped_time_for_pass(num)
     Time.now - crawl_start_times[num]
   end
 
   def grab_log!
     @log_grabber && @log_grabber.grab!
   end
+
+  def make_result(options)
+    defaults = {
+      :log => grab_log!,
+      :test_name => test_name
+    }
+    Result.new(defaults.merge(options)).freeze
+  end
 
   def handle_form_results(form, response)
     handlers.each do |h|
       save_result h.handle(Result.new(:method => form.method,
                                       :url => form.action,
@@ -191,27 +179,25 @@
     end
     url
   end
 
   def queue_link(dest, referrer = nil)
-    dest = Link.new(dest)
-    dest.href = transform_url(dest.href)
+    dest = Link.new(dest, self, referrer)
     return if should_skip_link?(dest)
-    @referrers[dest] = referrer if referrer
-    @links_to_crawl << dest
+    @crawl_queue << dest
     @links_queued << dest
     dest
   end
 
   def queue_form(form, referrer = nil)
     fuzzers.each do |fuzzer|
-      fuzzer.mutate(Form.new(form)).each do |fs|
-        # fs = fuzzer.new(Form.new(form))
+      fuzzer.mutate(Form.new(form, self, referrer)).each do |fs|
+        # fs = fuzzer.new(Form.new(form, self, referrer))
        fs.action = transform_url(fs.action)
        return if should_skip_form_submission?(fs)
        @referrers[fs.action] = referrer if referrer
-        @forms_to_crawl << fs
+        @crawl_queue << fs
        @form_signatures_queued << fs.signature
      end
    end
  end
@@ -232,27 +218,28 @@
       raise errors.map(&:message).join("\n")
     end
   end
 
   def report_results
+    puts "Crawled #{total_links_count} links and forms."
     generate_reports
   end
 
   def total_links_count
     @links_queued.size + @form_signatures_queued.size
   end
 
   def links_remaining_count
-    @links_to_crawl.size + @forms_to_crawl.size
+    @crawl_queue.size
   end
 
   def links_completed_count
     total_links_count - links_remaining_count
   end
 
   def blip(number = 0)
     unless verbose
-      print "\r #{links_completed_count} of #{total_links_count} links completed "
+      print "\r #{links_completed_count} of #{total_links_count} links completed " if $stdout.tty?
       timeout_if_too_long(number)
     end
   end
 
   def timeout_if_too_long(number = 0)
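A note on the shape of this change: `@links_to_crawl` and `@forms_to_crawl` collapse into a single `@crawl_queue` of polymorphic requests. `queue_link` and `queue_form` now build objects that carry the crawler (`self`) and their referrer, and `crawl_the_queue` simply pops each one and calls `request.crawl`, which calls back into the crawler's new `follow`/`submit` helpers. The sketch below is a minimal standalone illustration of that pattern; the `Mini*` names are invented here, not the gem's actual `Relevance::Tarantula::Link` and form-submission classes.

    # Each queued request knows how to crawl itself and calls back
    # into the crawler to issue its request.
    class MiniCrawler
      def initialize
        @crawl_queue = []   # one queue holds both links and form submissions
      end

      def queue(request)
        @crawl_queue << request
      end

      def crawl_the_queue
        while (request = @crawl_queue.pop)
          request.crawl     # polymorphic: no separate link/form loops
        end
      end

      # Stand-ins for the real crawler's follow/submit, which delegate
      # to the Rails integration-test proxy.
      def follow(method, url, data = nil)
        puts "#{method.to_s.upcase} #{url}"
      end

      def submit(method, action, data)
        puts "#{method.to_s.upcase} #{action} #{data.inspect}"
      end
    end

    class MiniLink
      def initialize(href, crawler)
        @href, @crawler = href, crawler
      end

      def crawl
        @crawler.follow(:get, @href)
      end
    end

    class MiniFormSubmission
      def initialize(action, data, crawler)
        @action, @data, @crawler = action, data, crawler
      end

      def crawl
        @crawler.submit(:post, @action, @data)
      end
    end

    crawler = MiniCrawler.new
    crawler.queue MiniLink.new("/products", crawler)
    crawler.queue MiniFormSubmission.new("/login", { "user" => "bob" }, crawler)
    crawler.crawl_the_queue

The payoff is visible in `do_crawl`, `finished?`, and `links_remaining_count` above: with one queue there is one loop and one emptiness check, and new request types can be queued without touching the crawl machinery.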
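The other addition is `make_result`, which centralizes the `:log` and `:test_name` options the old code repeated at every `Result.new` call site (compare the removed `handle_link_results` body). The idiom is plain Ruby: merge caller options over defaults, then freeze the result so handlers and reporters can share it safely. A sketch with a stand-in `Result` class; the gem's real `Relevance::Tarantula::Result` takes the same style of options hash.

    # Stand-in Result that just stores the merged options.
    class Result
      attr_reader :data
      def initialize(data)
        @data = data
      end
    end

    def make_result(options)
      defaults = {
        :log => "(grabbed log)",        # stands in for grab_log!
        :test_name => "example crawl"   # stands in for test_name
      }
      Result.new(defaults.merge(options)).freeze
    end

    r = make_result(:method => :get, :url => "/products")
    r.frozen?  # => true; downstream code cannot mutate a shared result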