examples/relevance/tarantula/crawler_example.rb in tarantula-0.1.5 vs examples/relevance/tarantula/crawler_example.rb in tarantula-0.1.8

- old
+ new

@@ -1,206 +1,248 @@ -require File.join(File.dirname(__FILE__), "..", "..", "example_helper.rb") +require File.expand_path(File.join(File.dirname(__FILE__), "..", "..", "example_helper.rb")) -describe 'Relevance::Tarantula::Crawler#transform_url' do - before {@crawler = Relevance::Tarantula::Crawler.new} - it "de-obfuscates unicode obfuscated urls" do - obfuscated_mailto = "&#109;&#97;&#105;&#108;&#116;&#111;&#58;" - @crawler.transform_url(obfuscated_mailto).should == "mailto:" - end +describe Relevance::Tarantula::Crawler do - it "strips the trailing name portion of a link" do - @crawler.transform_url('http://host/path#name').should == 'http://host/path' - end -end + describe "transform_url" do -describe 'Relevance::Tarantula::Crawler log grabbing' do - it "returns nil if no grabber is specified" do - crawler = Relevance::Tarantula::Crawler.new - crawler.grab_log!.should == nil - end + before { @crawler = Relevance::Tarantula::Crawler.new } + + it "de-obfuscates unicode obfuscated urls" do + obfuscated_mailto = "&#109;&#97;&#105;&#108;&#116;&#111;&#58;" + @crawler.transform_url(obfuscated_mailto).should == "mailto:" + end - it "returns grabber.grab if grabber is specified" do - crawler = Relevance::Tarantula::Crawler.new - crawler.log_grabber = stub(:grab! => "fake log entry") - crawler.grab_log!.should == "fake log entry" + it "strips the trailing name portion of a link" do + @crawler.transform_url('http://host/path#name').should == 'http://host/path' + end end -end + + + describe "log grabbing" do -describe 'Relevance::Tarantula::Crawler interruption' do - it 'catches interruption and writes the partial report' do - crawler = Relevance::Tarantula::Crawler.new - crawler.stubs(:queue_link) - crawler.stubs(:do_crawl).raises(Interrupt) - crawler.expects(:report_results) - $stderr.expects(:puts).with("CTRL-C") - crawler.crawl - end -end + it "returns nil if no grabber is specified" do + crawler = Relevance::Tarantula::Crawler.new + crawler.grab_log!.should == nil + end -describe 'Relevance::Tarantula::Crawler handle_form_results' do - it 'captures the result values (bugfix)' do - response = stub_everything - result_args = {:url => :action_stub, - :data => 'nil', - :response => response, - :referrer => :action_stub, - :log => nil, - :method => :stub_method, - :test_name => nil} - result = Relevance::Tarantula::Result.new(result_args) - Relevance::Tarantula::Result.expects(:new).with(result_args).returns(result) - crawler = Relevance::Tarantula::Crawler.new - crawler.handle_form_results(stub_everything(:method => :stub_method, :action => :action_stub), - response) + it "returns grabber.grab if grabber is specified" do + crawler = Relevance::Tarantula::Crawler.new + crawler.log_grabber = stub(:grab! => "fake log entry") + crawler.grab_log!.should == "fake log entry" + end + end -end - -describe 'Relevance::Tarantula::Crawler#crawl' do - it 'queues the first url, does crawl, and then reports results' do - crawler = Relevance::Tarantula::Crawler.new - crawler.expects(:queue_link).with("/foobar") - crawler.expects(:do_crawl) - crawler.expects(:report_results) - crawler.crawl("/foobar") - end - it 'reports results even if the crawl fails' do - crawler = Relevance::Tarantula::Crawler.new - crawler.expects(:do_crawl).raises(RuntimeError) - crawler.expects(:report_results) - lambda {crawler.crawl('/')}.should raise_error(RuntimeError) + describe "interrupt" do + + it 'catches interruption and writes the partial report' do + crawler = Relevance::Tarantula::Crawler.new + crawler.stubs(:queue_link) + crawler.stubs(:do_crawl).raises(Interrupt) + crawler.expects(:report_results) + $stderr.expects(:puts).with("CTRL-C") + crawler.crawl + end + end -end - -describe 'Relevance::Tarantula::Crawler queuing' do - it 'queues and remembers links' do - crawler = Relevance::Tarantula::Crawler.new - crawler.expects(:transform_url).with("/url").returns("/transformed") - crawler.queue_link("/url") - crawler.links_to_crawl.should == [Relevance::Tarantula::Link.new("/transformed")] - crawler.links_queued.should == Set.new([Relevance::Tarantula::Link.new("/transformed")]) - end - it 'queues and remembers forms' do - crawler = Relevance::Tarantula::Crawler.new - form = Hpricot('<form action="/action" method="post"/>').at('form') - signature = Relevance::Tarantula::FormSubmission.new(Relevance::Tarantula::Form.new(form)).signature - crawler.queue_form(form) - crawler.forms_to_crawl.size.should == 1 - crawler.form_signatures_queued.should == Set.new([signature]) + describe 'handle_form_results' do + + it 'captures the result values (bugfix)' do + response = stub_everything + result_args = {:url => :action_stub, + :data => 'nil', + :response => response, + :referrer => :action_stub, + :log => nil, + :method => :stub_method, + :test_name => nil} + result = Relevance::Tarantula::Result.new(result_args) + Relevance::Tarantula::Result.expects(:new).with(result_args).returns(result) + crawler = Relevance::Tarantula::Crawler.new + crawler.handle_form_results(stub_everything(:method => :stub_method, :action => :action_stub), + response) + end + end - it 'remembers link referrer if there is one' do - crawler = Relevance::Tarantula::Crawler.new - crawler.queue_link("/url", "/some-referrer") - crawler.referrers.should == {Relevance::Tarantula::Link.new("/url") => "/some-referrer"} + describe "crawl" do + + it 'queues the first url, does crawl, and then reports results' do + crawler = Relevance::Tarantula::Crawler.new + crawler.expects(:queue_link).with("/foobar") + crawler.expects(:do_crawl) + crawler.expects(:report_results) + crawler.crawl("/foobar") + end + + it 'reports results even if the crawl fails' do + crawler = Relevance::Tarantula::Crawler.new + crawler.expects(:do_crawl).raises(RuntimeError) + crawler.expects(:report_results) + lambda {crawler.crawl('/')}.should raise_error(RuntimeError) + end + end -end + describe "queueing" do -describe 'Relevance::Tarantula::Crawler#report_results' do - it "delegates to generate_reports" do - crawler = Relevance::Tarantula::Crawler.new - crawler.expects(:generate_reports) - crawler.report_results - end -end + it 'queues and remembers links' do + crawler = Relevance::Tarantula::Crawler.new + crawler.expects(:transform_url).with("/url").returns("/transformed") + crawler.queue_link("/url") + crawler.links_to_crawl.should == [Relevance::Tarantula::Link.new("/transformed")] + crawler.links_queued.should == Set.new([Relevance::Tarantula::Link.new("/transformed")]) + end -describe 'Relevance::Tarantula::Crawler#crawling' do + it 'queues and remembers forms' do + crawler = Relevance::Tarantula::Crawler.new + form = Hpricot('<form action="/action" method="post"/>').at('form') + signature = Relevance::Tarantula::FormSubmission.new(Relevance::Tarantula::Form.new(form)).signature + crawler.queue_form(form) + crawler.forms_to_crawl.size.should == 1 + crawler.form_signatures_queued.should == Set.new([signature]) + end - it "converts ActiveRecord::RecordNotFound into a 404" do - (proxy = stub_everything).expects(:send).raises(ActiveRecord::RecordNotFound) - crawler = Relevance::Tarantula::Crawler.new - crawler.proxy = proxy - response = crawler.crawl_form stub_everything(:method => nil) - response.code.should == "404" - response.content_type.should == "text/plain" - response.body.should == "ActiveRecord::RecordNotFound" + it 'remembers link referrer if there is one' do + crawler = Relevance::Tarantula::Crawler.new + crawler.queue_link("/url", "/some-referrer") + crawler.referrers.should == {Relevance::Tarantula::Link.new("/url") => "/some-referrer"} + end + end + + describe "crawling" do + + it "converts ActiveRecord::RecordNotFound into a 404" do + (proxy = stub_everything).expects(:send).raises(ActiveRecord::RecordNotFound) + crawler = Relevance::Tarantula::Crawler.new + crawler.proxy = proxy + response = crawler.crawl_form stub_everything(:method => nil) + response.code.should == "404" + response.content_type.should == "text/plain" + response.body.should == "ActiveRecord::RecordNotFound" + end - it "does four things with each link: get, log, handle, and blip" do - crawler = Relevance::Tarantula::Crawler.new - crawler.proxy = stub - response = stub(:code => "200") - crawler.links_to_crawl = [stub(:href => "/foo1", :method => :get), stub(:href => "/foo2", :method => :get)] - crawler.proxy.expects(:get).returns(response).times(2) - crawler.expects(:log).times(2) - crawler.expects(:handle_link_results).times(2) - crawler.expects(:blip).times(2) - crawler.crawl_queued_links - crawler.links_to_crawl.should == [] - end + it "does four things with each link: get, log, handle, and blip" do + crawler = Relevance::Tarantula::Crawler.new + crawler.proxy = stub + response = stub(:code => "200") + crawler.links_to_crawl = [stub(:href => "/foo1", :method => :get), stub(:href => "/foo2", :method => :get)] + crawler.proxy.expects(:get).returns(response).times(2) + crawler.expects(:log).times(2) + crawler.expects(:handle_link_results).times(2) + crawler.expects(:blip).times(2) + crawler.crawl_queued_links + crawler.links_to_crawl.should == [] + end + + it "invokes queued forms, logs responses, and calls handlers" do + crawler = Relevance::Tarantula::Crawler.new + crawler.forms_to_crawl << stub_everything(:method => "get", + :action => "/foo", + :data => "some data", + :to_s => "stub") + crawler.proxy = stub_everything(:send => stub(:code => "200" )) + crawler.expects(:log).with("Response 200 for stub") + crawler.expects(:blip) + crawler.crawl_queued_forms + end - it "invokes queued forms, logs responses, and calls handlers" do - crawler = Relevance::Tarantula::Crawler.new - crawler.forms_to_crawl << stub_everything(:method => "get", - :action => "/foo", - :data => "some data", - :to_s => "stub") - crawler.proxy = stub_everything(:send => stub(:code => "200" )) - crawler.expects(:log).with("Response 200 for stub") - crawler.expects(:blip) - crawler.crawl_queued_forms + it "breaks out early if a timeout is set" do + crawler = Relevance::Tarantula::Crawler.new + stub_puts_and_print(crawler) + crawler.proxy = stub + response = stub(:code => "200") + crawler.links_to_crawl = [stub(:href => "/foo", :method => :get)] + crawler.proxy.expects(:get).returns(response).times(4) + crawler.forms_to_crawl << stub_everything(:method => "post", + :action => "/foo", + :data => "some data", + :to_s => "stub") + crawler.proxy.expects(:post).returns(response).times(2) + crawler.expects(:links_completed_count).returns(0,1,2,3,4,5).times(6) + crawler.times_to_crawl = 2 + crawler.crawl + + end + + it "resets to the initial links/forms on subsequent crawls when times_to_crawl > 1" do + crawler = Relevance::Tarantula::Crawler.new + stub_puts_and_print(crawler) + crawler.proxy = stub + response = stub(:code => "200") + crawler.links_to_crawl = [stub(:href => "/foo", :method => :get)] + crawler.proxy.expects(:get).returns(response).times(4) # (stub and "/") * 2 + crawler.forms_to_crawl << stub_everything(:method => "post", + :action => "/foo", + :data => "some data", + :to_s => "stub") + crawler.proxy.expects(:post).returns(response).times(2) + crawler.expects(:links_completed_count).returns(0,1,2,3,4,5).times(6) + crawler.times_to_crawl = 2 + crawler.crawl + end + end - it "resets to the initial links/forms on subsequent crawls when times_to_crawl > 1" do - crawler = Relevance::Tarantula::Crawler.new - stub_puts_and_print(crawler) - crawler.proxy = stub - response = stub(:code => "200") - crawler.links_to_crawl = [stub(:href => "/foo", :method => :get)] - crawler.proxy.expects(:get).returns(response).times(4) # (stub and "/") * 2 - crawler.forms_to_crawl << stub_everything(:method => "post", - :action => "/foo", - :data => "some data", - :to_s => "stub") - crawler.proxy.expects(:post).returns(response).times(2) - crawler.expects(:links_completed_count).returns(*(0..6).to_a).times(6) - crawler.times_to_crawl = 2 - crawler.crawl - end -end + describe "report_results" do -describe 'Crawler blip' do - it "blips the current progress if !verbose" do - crawler = Relevance::Tarantula::Crawler.new - crawler.stubs(:verbose).returns false - crawler.expects(:print).with("\r 0 of 0 links completed ") - crawler.blip + it "delegates to generate_reports" do + crawler = Relevance::Tarantula::Crawler.new + crawler.expects(:generate_reports) + crawler.report_results + end + end - it "blips nothing if verbose" do - crawler = Relevance::Tarantula::Crawler.new - crawler.stubs(:verbose).returns true - crawler.expects(:print).never - crawler.blip - end -end + + describe "blip" do -describe 'Relevance::Tarantula::Crawler' do - it "is finished when the links and forms are crawled" do - crawler = Relevance::Tarantula::Crawler.new - crawler.finished?.should == true + it "blips the current progress if !verbose" do + crawler = Relevance::Tarantula::Crawler.new + crawler.stubs(:verbose).returns false + crawler.stubs(:timeout_if_too_long) + crawler.expects(:print).with("\r 0 of 0 links completed ") + crawler.blip + end + + it "blips nothing if verbose" do + crawler = Relevance::Tarantula::Crawler.new + crawler.stubs(:verbose).returns true + crawler.expects(:print).never + crawler.blip + end + end + + describe "finished?" do - it "isn't finished when links remain" do - crawler = Relevance::Tarantula::Crawler.new - crawler.links_to_crawl = [:stub_link] - crawler.finished?.should == false - end + it "is finished when the links and forms are crawled" do + crawler = Relevance::Tarantula::Crawler.new + crawler.finished?.should == true + end - it "isn't finished when links remain" do - crawler = Relevance::Tarantula::Crawler.new - crawler.forms_to_crawl = [:stub_form] - crawler.finished?.should == false + it "isn't finished when links remain" do + crawler = Relevance::Tarantula::Crawler.new + crawler.links_to_crawl = [:stub_link] + crawler.finished?.should == false + end + + it "isn't finished when links remain" do + crawler = Relevance::Tarantula::Crawler.new + crawler.forms_to_crawl = [:stub_form] + crawler.finished?.should == false + end + end - + it "crawls links and forms again and again until finished?==true" do crawler = Relevance::Tarantula::Crawler.new crawler.expects(:finished?).times(3).returns(false, false, true) crawler.expects(:crawl_queued_links).times(2) crawler.expects(:crawl_queued_forms).times(2) - crawler.do_crawl + crawler.do_crawl(1) end it "asks each reporter to write its report in report_dir" do crawler = Relevance::Tarantula::Crawler.new crawler.stubs(:report_dir).returns(test_output_dir) @@ -223,74 +265,122 @@ crawler.should_skip_link?(Relevance::Tarantula::Link.new("/foo")).should == false crawler.queue_link("/foo").should == Relevance::Tarantula::Link.new("/foo") crawler.should_skip_link?(Relevance::Tarantula::Link.new("/foo")).should == true end -end - -describe "Crawler link skipping" do - before do - @crawler = Relevance::Tarantula::Crawler.new - end + describe "link skipping" do + + before { @crawler = Relevance::Tarantula::Crawler.new } + + it "skips links that are too long" do + @crawler.should_skip_link?(Relevance::Tarantula::Link.new("/foo")).should == false + @crawler.max_url_length = 2 + @crawler.expects(:log).with("Skipping long url /foo") + @crawler.should_skip_link?(Relevance::Tarantula::Link.new("/foo")).should == true + end - it "skips links that are too long" do - @crawler.should_skip_link?(Relevance::Tarantula::Link.new("/foo")).should == false - @crawler.max_url_length = 2 - @crawler.expects(:log).with("Skipping long url /foo") - @crawler.should_skip_link?(Relevance::Tarantula::Link.new("/foo")).should == true - end - - it "skips outbound links (those that begin with http)" do - @crawler.expects(:log).with("Skipping http-anything") - @crawler.should_skip_link?(Relevance::Tarantula::Link.new("http-anything")).should == true - end + it "skips outbound links (those that begin with http)" do + @crawler.expects(:log).with("Skipping http-anything") + @crawler.should_skip_link?(Relevance::Tarantula::Link.new("http-anything")).should == true + end - it "skips javascript links (those that begin with javascript)" do - @crawler.expects(:log).with("Skipping javascript-anything") - @crawler.should_skip_link?(Relevance::Tarantula::Link.new("javascript-anything")).should == true - end + it "skips javascript links (those that begin with javascript)" do + @crawler.expects(:log).with("Skipping javascript-anything") + @crawler.should_skip_link?(Relevance::Tarantula::Link.new("javascript-anything")).should == true + end - it "skips mailto links (those that begin with http)" do - @crawler.expects(:log).with("Skipping mailto-anything") - @crawler.should_skip_link?(Relevance::Tarantula::Link.new("mailto-anything")).should == true - end + it "skips mailto links (those that begin with http)" do + @crawler.expects(:log).with("Skipping mailto-anything") + @crawler.should_skip_link?(Relevance::Tarantula::Link.new("mailto-anything")).should == true + end - it 'skips blank links' do - @crawler.queue_link(nil) - @crawler.links_to_crawl.should == [] - @crawler.queue_link("") - @crawler.links_to_crawl.should == [] - end + it 'skips blank links' do + @crawler.queue_link(nil) + @crawler.links_to_crawl.should == [] + @crawler.queue_link("") + @crawler.links_to_crawl.should == [] + end - it "logs and skips links that match a pattern" do - @crawler.expects(:log).with("Skipping /the-red-button") - @crawler.skip_uri_patterns << /red-button/ - @crawler.queue_link("/blue-button").should == Relevance::Tarantula::Link.new("/blue-button") - @crawler.queue_link("/the-red-button").should == nil - end + it "logs and skips links that match a pattern" do + @crawler.expects(:log).with("Skipping /the-red-button") + @crawler.skip_uri_patterns << /red-button/ + @crawler.queue_link("/blue-button").should == Relevance::Tarantula::Link.new("/blue-button") + @crawler.queue_link("/the-red-button").should == nil + end - it "logs and skips form submissions that match a pattern" do - @crawler.expects(:log).with("Skipping /reset-password-form") - @crawler.skip_uri_patterns << /reset-password/ - fs = stub_everything(:action => "/reset-password-form") - @crawler.should_skip_form_submission?(fs).should == true + it "logs and skips form submissions that match a pattern" do + @crawler.expects(:log).with("Skipping /reset-password-form") + @crawler.skip_uri_patterns << /reset-password/ + fs = stub_everything(:action => "/reset-password-form") + @crawler.should_skip_form_submission?(fs).should == true + end end -end + + describe "allow_nnn_for" do -describe "allow_nnn_for" do - it "installs result as a response_code_handler" do - crawler = Relevance::Tarantula::Crawler.new - crawler.response_code_handler.should == Relevance::Tarantula::Result + it "installs result as a response_code_handler" do + crawler = Relevance::Tarantula::Crawler.new + crawler.response_code_handler.should == Relevance::Tarantula::Result + end + + it "delegates to the response_code_handler" do + crawler = Relevance::Tarantula::Crawler.new + (response_code_handler = mock).expects(:allow_404_for).with(:stub) + crawler.response_code_handler = response_code_handler + crawler.allow_404_for(:stub) + end + + it "chains up to super for method_missing" do + crawler = Relevance::Tarantula::Crawler.new + lambda{crawler.foo}.should raise_error(NoMethodError) + end + end - it "delegates to the response_code_handler" do - crawler = Relevance::Tarantula::Crawler.new - (response_code_handler = mock).expects(:allow_404_for).with(:stub) - crawler.response_code_handler = response_code_handler - crawler.allow_404_for(:stub) + describe "timeouts" do + + it "sets start and end times for a single crawl" do + start_time = Time.parse("March 1st, 2008 10:00am") + end_time = Time.parse("March 1st, 2008 10:10am") + Time.stubs(:now).returns(start_time, end_time) + + crawler = Relevance::Tarantula::Crawler.new + stub_puts_and_print(crawler) + crawler.proxy = stub_everything(:get => response = stub(:code => "200")) + crawler.crawl + crawler.crawl_start_times.first.should == start_time + crawler.crawl_end_times.first.should == end_time + end + + it "has elasped time for a crawl" do + start_time = Time.parse("March 1st, 2008 10:00am") + elasped_time_check = Time.parse("March 1st, 2008, 10:10:00am") + Time.stubs(:now).returns(start_time, elasped_time_check) + + crawler = Relevance::Tarantula::Crawler.new + stub_puts_and_print(crawler) + crawler.proxy = stub_everything(:get => response = stub(:code => "200")) + crawler.crawl + crawler.elasped_time_for_pass(0).should == 600.seconds + end + + it "raises out of the crawl if elasped time is greater then the crawl timeout" do + start_time = Time.parse("March 1st, 2008 10:00am") + elasped_time_check = Time.parse("March 1st, 2008, 10:35:00am") + Time.stubs(:now).returns(start_time, elasped_time_check) + + crawler = Relevance::Tarantula::Crawler.new + crawler.crawl_timeout = 5.minutes + + crawler.links_to_crawl = [stub(:href => "/foo1", :method => :get), stub(:href => "/foo2", :method => :get)] + crawler.proxy = stub + crawler.proxy.stubs(:get).returns(response = stub(:code => "200")) + + stub_puts_and_print(crawler) + lambda { + crawler.do_crawl(0) + }.should raise_error + end + end - it "chains up to super for method_missing" do - crawler = Relevance::Tarantula::Crawler.new - lambda{crawler.foo}.should raise_error(NoMethodError) - end -end +end \ No newline at end of file