examples/relevance/tarantula/crawler_example.rb in tarantula-0.2.0 vs examples/relevance/tarantula/crawler_example.rb in tarantula-0.3.3

- old
+ new

@@ -86,128 +86,118 @@ describe "queueing" do it 'queues and remembers links' do crawler = Relevance::Tarantula::Crawler.new - crawler.expects(:transform_url).with("/url").returns("/transformed") + crawler.expects(:transform_url).with("/url").returns("/transformed").at_least_once crawler.queue_link("/url") - crawler.links_to_crawl.should == [Relevance::Tarantula::Link.new("/transformed")] - crawler.links_queued.should == Set.new([Relevance::Tarantula::Link.new("/transformed")]) + # TODO not sure this is the best way to test this anymore; relying on result of transform in both actual and expected + crawler.crawl_queue.should == [make_link("/url", crawler)] + crawler.links_queued.should == Set.new([make_link("/url", crawler)]) end it 'queues and remembers forms' do crawler = Relevance::Tarantula::Crawler.new form = Hpricot('<form action="/action" method="post"/>').at('form') - signature = Relevance::Tarantula::FormSubmission.new(Relevance::Tarantula::Form.new(form)).signature + signature = Relevance::Tarantula::FormSubmission.new(make_form(form)).signature crawler.queue_form(form) - crawler.forms_to_crawl.size.should == 1 + crawler.crawl_queue.size.should == 1 crawler.form_signatures_queued.should == Set.new([signature]) end - it 'remembers link referrer if there is one' do + it "passes link, self, and referrer when creating Link objects" do crawler = Relevance::Tarantula::Crawler.new - crawler.queue_link("/url", "/some-referrer") - crawler.referrers.should == {Relevance::Tarantula::Link.new("/url") => "/some-referrer"} + Relevance::Tarantula::Link.expects(:new).with('/url', crawler, '/some-referrer') + crawler.stubs(:should_skip_link?) + crawler.queue_link('/url', '/some-referrer') end end describe "crawling" do + before do + @form = Hpricot('<form action="/action" method="post"/>').at('form') + end - it "converts ActiveRecord::RecordNotFound into a 404" do - (proxy = stub_everything).expects(:send).raises(ActiveRecord::RecordNotFound) + it "does two things with each link: crawl and blip" do crawler = Relevance::Tarantula::Crawler.new - crawler.proxy = proxy - response = crawler.crawl_form stub_everything(:method => nil) - response.code.should == "404" - response.content_type.should == "text/plain" - response.body.should == "ActiveRecord::RecordNotFound" - end - - it "does four things with each link: get, log, handle, and blip" do - crawler = Relevance::Tarantula::Crawler.new crawler.proxy = stub - response = stub(:code => "200") - crawler.links_to_crawl = [stub(:href => "/foo1", :method => :get), stub(:href => "/foo2", :method => :get)] - crawler.proxy.expects(:get).returns(response).times(2) - crawler.expects(:log).times(2) - crawler.expects(:handle_link_results).times(2) + crawler.crawl_queue = links = [make_link("/foo1", crawler), make_link("/foo2", crawler)] + + links.each{|link| link.expects(:crawl)} crawler.expects(:blip).times(2) - crawler.crawl_queued_links - crawler.links_to_crawl.should == [] + + crawler.crawl_the_queue + crawler.crawl_queue.should == [] end it "invokes queued forms, logs responses, and calls handlers" do crawler = Relevance::Tarantula::Crawler.new - crawler.forms_to_crawl << stub_everything(:method => "get", - :action => "/foo", - :data => "some data", - :to_s => "stub") - crawler.proxy = stub_everything(:send => stub(:code => "200" )) - crawler.expects(:log).with("Response 200 for stub") + crawler.crawl_queue << Relevance::Tarantula::FormSubmission.new(make_form(@form, crawler)) + crawler.expects(:submit).returns(stub(:code => "200")) crawler.expects(:blip) - crawler.crawl_queued_forms + crawler.crawl_the_queue end - it "breaks out early if a timeout is set" do - crawler = Relevance::Tarantula::Crawler.new - stub_puts_and_print(crawler) - crawler.proxy = stub - response = stub(:code => "200") - crawler.links_to_crawl = [stub(:href => "/foo", :method => :get)] - crawler.proxy.expects(:get).returns(response).times(4) - crawler.forms_to_crawl << stub_everything(:method => "post", - :action => "/foo", - :data => "some data", - :to_s => "stub") - crawler.proxy.expects(:post).returns(response).times(2) - crawler.expects(:links_completed_count).returns(0,1,2,3,4,5).times(6) - crawler.times_to_crawl = 2 - crawler.crawl - - end + # TODO this is the same as "resets to the initial links/forms ..." and doesn't appear to test anything related to a timeout. + it "breaks out early if a timeout is set" it "resets to the initial links/forms on subsequent crawls when times_to_crawl > 1" do crawler = Relevance::Tarantula::Crawler.new stub_puts_and_print(crawler) - crawler.proxy = stub response = stub(:code => "200") - crawler.links_to_crawl = [stub(:href => "/foo", :method => :get)] - crawler.proxy.expects(:get).returns(response).times(4) # (stub and "/") * 2 - crawler.forms_to_crawl << stub_everything(:method => "post", - :action => "/foo", - :data => "some data", - :to_s => "stub") - crawler.proxy.expects(:post).returns(response).times(2) - crawler.expects(:links_completed_count).returns(0,1,2,3,4,5).times(6) + crawler.queue_link('/foo') + crawler.expects(:follow).returns(response).times(4) # (stub and "/") * 2 + crawler.queue_form(@form) + crawler.expects(:submit).returns(response).times(2) + crawler.expects(:blip).times(6) crawler.times_to_crawl = 2 crawler.crawl end end describe "report_results" do - + it "prints a final summary line" do + crawler = Relevance::Tarantula::Crawler.new + crawler.stubs(:generate_reports) + crawler.expects(:total_links_count).returns(42) + crawler.expects(:puts).with("Crawled 42 links and forms.") + crawler.report_results + end + it "delegates to generate_reports" do crawler = Relevance::Tarantula::Crawler.new + crawler.stubs(:puts) crawler.expects(:generate_reports) crawler.report_results end end describe "blip" do it "blips the current progress if !verbose" do + $stdout.stubs(:tty?).returns(true) crawler = Relevance::Tarantula::Crawler.new crawler.stubs(:verbose).returns false crawler.stubs(:timeout_if_too_long) crawler.expects(:print).with("\r 0 of 0 links completed ") crawler.blip end + it "suppresses the blip message if not writing to a tty" do + $stdout.stubs(:tty?).returns(false) + crawler = Relevance::Tarantula::Crawler.new + crawler.stubs(:verbose).returns false + crawler.stubs(:timeout_if_too_long) + crawler.expects(:print).never + crawler.blip + end + it "blips nothing if verbose" do + $stdout.stubs(:tty?).returns(true) crawler = Relevance::Tarantula::Crawler.new crawler.stubs(:verbose).returns true crawler.expects(:print).never crawler.blip end @@ -221,27 +211,26 @@ crawler.finished?.should == true end it "isn't finished when links remain" do crawler = Relevance::Tarantula::Crawler.new - crawler.links_to_crawl = [:stub_link] + crawler.crawl_queue = [:stub_link] crawler.finished?.should == false end - it "isn't finished when links remain" do + it "isn't finished when forms remain" do crawler = Relevance::Tarantula::Crawler.new - crawler.forms_to_crawl = [:stub_form] + crawler.crawl_queue = [:stub_form] crawler.finished?.should == false end end it "crawls links and forms again and again until finished?==true" do crawler = Relevance::Tarantula::Crawler.new crawler.expects(:finished?).times(3).returns(false, false, true) - crawler.expects(:crawl_queued_links).times(2) - crawler.expects(:crawl_queued_forms).times(2) + crawler.expects(:crawl_the_queue).times(2) crawler.do_crawl(1) end it "asks each reporter to write its report in report_dir" do crawler = Relevance::Tarantula::Crawler.new @@ -260,52 +249,52 @@ crawler.report_dir.should == "faux_rails_root/tmp/tarantula" end it "skips links that are already queued" do crawler = Relevance::Tarantula::Crawler.new - crawler.should_skip_link?(Relevance::Tarantula::Link.new("/foo")).should == false - crawler.queue_link("/foo").should == Relevance::Tarantula::Link.new("/foo") - crawler.should_skip_link?(Relevance::Tarantula::Link.new("/foo")).should == true + crawler.should_skip_link?(make_link("/foo")).should == false + crawler.queue_link("/foo").should == make_link("/foo") + crawler.should_skip_link?(make_link("/foo")).should == true end describe "link skipping" do before { @crawler = Relevance::Tarantula::Crawler.new } it "skips links that are too long" do - @crawler.should_skip_link?(Relevance::Tarantula::Link.new("/foo")).should == false + @crawler.should_skip_link?(make_link("/foo")).should == false @crawler.max_url_length = 2 @crawler.expects(:log).with("Skipping long url /foo") - @crawler.should_skip_link?(Relevance::Tarantula::Link.new("/foo")).should == true + @crawler.should_skip_link?(make_link("/foo")).should == true end it "skips outbound links (those that begin with http)" do @crawler.expects(:log).with("Skipping http-anything") - @crawler.should_skip_link?(Relevance::Tarantula::Link.new("http-anything")).should == true + @crawler.should_skip_link?(make_link("http-anything")).should == true end it "skips javascript links (those that begin with javascript)" do @crawler.expects(:log).with("Skipping javascript-anything") - @crawler.should_skip_link?(Relevance::Tarantula::Link.new("javascript-anything")).should == true + @crawler.should_skip_link?(make_link("javascript-anything")).should == true end it "skips mailto links (those that begin with http)" do @crawler.expects(:log).with("Skipping mailto-anything") - @crawler.should_skip_link?(Relevance::Tarantula::Link.new("mailto-anything")).should == true + @crawler.should_skip_link?(make_link("mailto-anything")).should == true end it 'skips blank links' do @crawler.queue_link(nil) - @crawler.links_to_crawl.should == [] + @crawler.crawl_queue.should == [] @crawler.queue_link("") - @crawler.links_to_crawl.should == [] + @crawler.crawl_queue.should == [] end it "logs and skips links that match a pattern" do @crawler.expects(:log).with("Skipping /the-red-button") @crawler.skip_uri_patterns << /red-button/ - @crawler.queue_link("/blue-button").should == Relevance::Tarantula::Link.new("/blue-button") + @crawler.queue_link("/blue-button").should == make_link("/blue-button") @crawler.queue_link("/the-red-button").should == nil end it "logs and skips form submissions that match a pattern" do @crawler.expects(:log).with("Skipping /reset-password-form") @@ -369,10 +358,10 @@ Time.stubs(:now).returns(start_time, elasped_time_check) crawler = Relevance::Tarantula::Crawler.new crawler.crawl_timeout = 5.minutes - crawler.links_to_crawl = [stub(:href => "/foo1", :method => :get), stub(:href => "/foo2", :method => :get)] + crawler.crawl_queue = [stub(:href => "/foo1", :method => :get), stub(:href => "/foo2", :method => :get)] crawler.proxy = stub crawler.proxy.stubs(:get).returns(response = stub(:code => "200")) stub_puts_and_print(crawler) lambda { \ No newline at end of file