lib/relevance/tarantula/crawler.rb in relevance-tarantula-0.2.1 vs lib/relevance/tarantula/crawler.rb in relevance-tarantula-0.3.2
- old
+ new
@@ -8,11 +8,11 @@
include Relevance::Tarantula
class CrawlTimeout < RuntimeError; end
attr_accessor :proxy, :handlers, :skip_uri_patterns, :log_grabber,
- :reporters, :links_to_crawl, :links_queued, :forms_to_crawl,
+ :reporters, :crawl_queue, :links_queued,
:form_signatures_queued, :max_url_length, :response_code_handler,
:times_to_crawl, :fuzzers, :test_name, :crawl_timeout
attr_reader :transform_url_patterns, :referrers, :failures, :successes, :crawl_start_times, :crawl_end_times
def initialize
@@ -20,12 +20,11 @@
@successes = []
@failures = []
@handlers = [@response_code_handler = Result]
@links_queued = Set.new
@form_signatures_queued = Set.new
- @links_to_crawl = []
- @forms_to_crawl = []
+ @crawl_queue = []
@crawl_start_times, @crawl_end_times = [], []
@crawl_timeout = 20.minutes
@referrers = {}
@skip_uri_patterns = [
/^javascript/,
@@ -53,12 +52,11 @@
end
def crawl(url = "/")
orig_links_queued = @links_queued.dup
orig_form_signatures_queued = @form_signatures_queued.dup
- orig_links_to_crawl = @links_to_crawl.dup
- orig_forms_to_crawl = @forms_to_crawl.dup
+ orig_crawl_queue = @crawl_queue.dup
@times_to_crawl.times do |num|
queue_link url
begin
do_crawl num
@@ -69,89 +67,79 @@
puts "#{(num+1).ordinalize} crawl" if @times_to_crawl > 1
if num + 1 < @times_to_crawl
@links_queued = orig_links_queued
@form_signatures_queued = orig_form_signatures_queued
- @links_to_crawl = orig_links_to_crawl
- @forms_to_crawl = orig_forms_to_crawl
+ @crawl_queue = orig_crawl_queue
@referrers = {}
end
end
rescue Interrupt
$stderr.puts "CTRL-C"
ensure
report_results
end
def finished?
- @links_to_crawl.empty? && @forms_to_crawl.empty?
+ @crawl_queue.empty?
end
def do_crawl(number)
while (!finished?)
@crawl_start_times << Time.now
- crawl_queued_links(number)
- crawl_queued_forms(number)
+ crawl_the_queue(number)
@crawl_end_times << Time.now
end
end
- def crawl_queued_links(number = 0)
- while (link = @links_to_crawl.pop)
- response = proxy.send(link.method, link.href)
- log "Response #{response.code} for #{link}"
- handle_link_results(link, response)
+ def crawl_the_queue(number = 0)
+ while (request = @crawl_queue.pop)
+ request.crawl
blip(number)
end
end
# Fan a finished crawl Result out to every configured reporter.
# Reporters are expected to respond to #report(result).
def save_result(result)
  reporters.each { |reporter| reporter.report(result) }
end
- def handle_link_results(link, response)
+ def handle_link_results(link, result)
handlers.each do |h|
begin
- save_result h.handle(Result.new(:method => link.method,
- :url => link.href,
- :response => response,
- :log => grab_log!,
- :referrer => referrers[link],
- :test_name => test_name).freeze)
+ save_result h.handle(result)
rescue Exception => e
log "error handling #{link} #{e.message}"
# TODO: pass to results
end
end
end
- def crawl_form(form)
- response = proxy.send(form.method, form.action, form.data)
- log "Response #{response.code} for #{form}"
- response
- rescue ActiveRecord::RecordNotFound => e
- log "Skipping #{form.action}, presumed ok that record is missing"
- Relevance::Tarantula::Response.new(:code => "404", :body => e.message, :content_type => "text/plain")
+ def follow(method, url, data=nil)
+ proxy.send(method, url, data)
end
-
- def crawl_queued_forms(number = 0)
- while (form = @forms_to_crawl.pop)
- response = crawl_form(form)
- handle_form_results(form, response)
- blip(number)
- end
- end
+ def submit(method, action, data)
+ proxy.send(method, action, data)
+ end
+
def elasped_time_for_pass(num)
Time.now - crawl_start_times[num]
end
# Drain and return captured log output via the configured grabber.
# Returns the (falsy) @log_grabber itself — i.e. nil — when none is set,
# matching the short-circuit semantics of the original `&&` expression.
def grab_log!
  return @log_grabber unless @log_grabber
  @log_grabber.grab!
end
+
+ def make_result(options)
+ defaults = {
+ :log => grab_log!,
+ :test_name => test_name
+ }
+ Result.new(defaults.merge(options)).freeze
+ end
def handle_form_results(form, response)
handlers.each do |h|
save_result h.handle(Result.new(:method => form.method,
:url => form.action,
@@ -191,27 +179,25 @@
end
url
end
def queue_link(dest, referrer = nil)
- dest = Link.new(dest)
- dest.href = transform_url(dest.href)
+ dest = Link.new(dest, self, referrer)
return if should_skip_link?(dest)
- @referrers[dest] = referrer if referrer
- @links_to_crawl << dest
+ @crawl_queue << dest
@links_queued << dest
dest
end
def queue_form(form, referrer = nil)
fuzzers.each do |fuzzer|
- fuzzer.mutate(Form.new(form)).each do |fs|
- # fs = fuzzer.new(Form.new(form))
+ fuzzer.mutate(Form.new(form, self, referrer)).each do |fs|
+ # fs = fuzzer.new(Form.new(form, self, referrer))
fs.action = transform_url(fs.action)
return if should_skip_form_submission?(fs)
@referrers[fs.action] = referrer if referrer
- @forms_to_crawl << fs
+ @crawl_queue << fs
@form_signatures_queued << fs.signature
end
end
end
@@ -232,27 +218,28 @@
raise errors.map(&:message).join("\n")
end
end
def report_results
+ puts "Crawled #{total_links_count} links and forms."
generate_reports
end
# Total work ever queued: distinct links plus distinct form signatures,
# accumulated across all crawl passes.
def total_links_count
  counts = [@links_queued.size, @form_signatures_queued.size]
  counts.inject(0) { |sum, n| sum + n }
end
def links_remaining_count
- @links_to_crawl.size + @forms_to_crawl.size
+ @crawl_queue.size
end
# Work finished so far: everything ever queued, less whatever is still
# pending. Reads total_links_count before links_remaining_count, as the
# original did.
def links_completed_count
  total = total_links_count
  total - links_remaining_count
end
def blip(number = 0)
unless verbose
- print "\r #{links_completed_count} of #{total_links_count} links completed "
+ print "\r #{links_completed_count} of #{total_links_count} links completed " if $stdout.tty?
timeout_if_too_long(number)
end
end
def timeout_if_too_long(number = 0)