examples/relevance/tarantula/crawler_example.rb in tarantula-0.2.0 vs examples/relevance/tarantula/crawler_example.rb in tarantula-0.3.3
- old
+ new
@@ -86,128 +86,118 @@
describe "queueing" do
it 'queues and remembers links' do
crawler = Relevance::Tarantula::Crawler.new
- crawler.expects(:transform_url).with("/url").returns("/transformed")
+ crawler.expects(:transform_url).with("/url").returns("/transformed").at_least_once
crawler.queue_link("/url")
- crawler.links_to_crawl.should == [Relevance::Tarantula::Link.new("/transformed")]
- crawler.links_queued.should == Set.new([Relevance::Tarantula::Link.new("/transformed")])
+ # TODO not sure this is the best way to test this anymore; relying on result of transform in both actual and expected
+ crawler.crawl_queue.should == [make_link("/url", crawler)]
+ crawler.links_queued.should == Set.new([make_link("/url", crawler)])
end
it 'queues and remembers forms' do
crawler = Relevance::Tarantula::Crawler.new
form = Hpricot('<form action="/action" method="post"/>').at('form')
- signature = Relevance::Tarantula::FormSubmission.new(Relevance::Tarantula::Form.new(form)).signature
+ signature = Relevance::Tarantula::FormSubmission.new(make_form(form)).signature
crawler.queue_form(form)
- crawler.forms_to_crawl.size.should == 1
+ crawler.crawl_queue.size.should == 1
crawler.form_signatures_queued.should == Set.new([signature])
end
- it 'remembers link referrer if there is one' do
+ it "passes link, self, and referrer when creating Link objects" do
crawler = Relevance::Tarantula::Crawler.new
- crawler.queue_link("/url", "/some-referrer")
- crawler.referrers.should == {Relevance::Tarantula::Link.new("/url") => "/some-referrer"}
+ Relevance::Tarantula::Link.expects(:new).with('/url', crawler, '/some-referrer')
+ crawler.stubs(:should_skip_link?)
+ crawler.queue_link('/url', '/some-referrer')
end
end
describe "crawling" do
+ before do
+ @form = Hpricot('<form action="/action" method="post"/>').at('form')
+ end
- it "converts ActiveRecord::RecordNotFound into a 404" do
- (proxy = stub_everything).expects(:send).raises(ActiveRecord::RecordNotFound)
+ it "does two things with each link: crawl and blip" do
crawler = Relevance::Tarantula::Crawler.new
- crawler.proxy = proxy
- response = crawler.crawl_form stub_everything(:method => nil)
- response.code.should == "404"
- response.content_type.should == "text/plain"
- response.body.should == "ActiveRecord::RecordNotFound"
- end
-
- it "does four things with each link: get, log, handle, and blip" do
- crawler = Relevance::Tarantula::Crawler.new
crawler.proxy = stub
- response = stub(:code => "200")
- crawler.links_to_crawl = [stub(:href => "/foo1", :method => :get), stub(:href => "/foo2", :method => :get)]
- crawler.proxy.expects(:get).returns(response).times(2)
- crawler.expects(:log).times(2)
- crawler.expects(:handle_link_results).times(2)
+ crawler.crawl_queue = links = [make_link("/foo1", crawler), make_link("/foo2", crawler)]
+
+ links.each{|link| link.expects(:crawl)}
crawler.expects(:blip).times(2)
- crawler.crawl_queued_links
- crawler.links_to_crawl.should == []
+
+ crawler.crawl_the_queue
+ crawler.crawl_queue.should == []
end
it "invokes queued forms, logs responses, and calls handlers" do
crawler = Relevance::Tarantula::Crawler.new
- crawler.forms_to_crawl << stub_everything(:method => "get",
- :action => "/foo",
- :data => "some data",
- :to_s => "stub")
- crawler.proxy = stub_everything(:send => stub(:code => "200" ))
- crawler.expects(:log).with("Response 200 for stub")
+ crawler.crawl_queue << Relevance::Tarantula::FormSubmission.new(make_form(@form, crawler))
+ crawler.expects(:submit).returns(stub(:code => "200"))
crawler.expects(:blip)
- crawler.crawl_queued_forms
+ crawler.crawl_the_queue
end
- it "breaks out early if a timeout is set" do
- crawler = Relevance::Tarantula::Crawler.new
- stub_puts_and_print(crawler)
- crawler.proxy = stub
- response = stub(:code => "200")
- crawler.links_to_crawl = [stub(:href => "/foo", :method => :get)]
- crawler.proxy.expects(:get).returns(response).times(4)
- crawler.forms_to_crawl << stub_everything(:method => "post",
- :action => "/foo",
- :data => "some data",
- :to_s => "stub")
- crawler.proxy.expects(:post).returns(response).times(2)
- crawler.expects(:links_completed_count).returns(0,1,2,3,4,5).times(6)
- crawler.times_to_crawl = 2
- crawler.crawl
-
- end
+ # TODO this is the same as "resets to the initial links/forms ..." and doesn't appear to test anything related to a timeout.
+ it "breaks out early if a timeout is set"
it "resets to the initial links/forms on subsequent crawls when times_to_crawl > 1" do
crawler = Relevance::Tarantula::Crawler.new
stub_puts_and_print(crawler)
- crawler.proxy = stub
response = stub(:code => "200")
- crawler.links_to_crawl = [stub(:href => "/foo", :method => :get)]
- crawler.proxy.expects(:get).returns(response).times(4) # (stub and "/") * 2
- crawler.forms_to_crawl << stub_everything(:method => "post",
- :action => "/foo",
- :data => "some data",
- :to_s => "stub")
- crawler.proxy.expects(:post).returns(response).times(2)
- crawler.expects(:links_completed_count).returns(0,1,2,3,4,5).times(6)
+ crawler.queue_link('/foo')
+ crawler.expects(:follow).returns(response).times(4) # (stub and "/") * 2
+ crawler.queue_form(@form)
+ crawler.expects(:submit).returns(response).times(2)
+ crawler.expects(:blip).times(6)
crawler.times_to_crawl = 2
crawler.crawl
end
end
describe "report_results" do
-
+ it "prints a final summary line" do
+ crawler = Relevance::Tarantula::Crawler.new
+ crawler.stubs(:generate_reports)
+ crawler.expects(:total_links_count).returns(42)
+ crawler.expects(:puts).with("Crawled 42 links and forms.")
+ crawler.report_results
+ end
+
it "delegates to generate_reports" do
crawler = Relevance::Tarantula::Crawler.new
+ crawler.stubs(:puts)
crawler.expects(:generate_reports)
crawler.report_results
end
end
describe "blip" do
it "blips the current progress if !verbose" do
+ $stdout.stubs(:tty?).returns(true)
crawler = Relevance::Tarantula::Crawler.new
crawler.stubs(:verbose).returns false
crawler.stubs(:timeout_if_too_long)
crawler.expects(:print).with("\r 0 of 0 links completed ")
crawler.blip
end
+ it "suppresses the blip message if not writing to a tty" do
+ $stdout.stubs(:tty?).returns(false)
+ crawler = Relevance::Tarantula::Crawler.new
+ crawler.stubs(:verbose).returns false
+ crawler.stubs(:timeout_if_too_long)
+ crawler.expects(:print).never
+ crawler.blip
+ end
+
it "blips nothing if verbose" do
+ $stdout.stubs(:tty?).returns(true)
crawler = Relevance::Tarantula::Crawler.new
crawler.stubs(:verbose).returns true
crawler.expects(:print).never
crawler.blip
end
@@ -221,27 +211,26 @@
crawler.finished?.should == true
end
it "isn't finished when links remain" do
crawler = Relevance::Tarantula::Crawler.new
- crawler.links_to_crawl = [:stub_link]
+ crawler.crawl_queue = [:stub_link]
crawler.finished?.should == false
end
- it "isn't finished when links remain" do
+ it "isn't finished when forms remain" do
crawler = Relevance::Tarantula::Crawler.new
- crawler.forms_to_crawl = [:stub_form]
+ crawler.crawl_queue = [:stub_form]
crawler.finished?.should == false
end
end
it "crawls links and forms again and again until finished?==true" do
crawler = Relevance::Tarantula::Crawler.new
crawler.expects(:finished?).times(3).returns(false, false, true)
- crawler.expects(:crawl_queued_links).times(2)
- crawler.expects(:crawl_queued_forms).times(2)
+ crawler.expects(:crawl_the_queue).times(2)
crawler.do_crawl(1)
end
it "asks each reporter to write its report in report_dir" do
crawler = Relevance::Tarantula::Crawler.new
@@ -260,52 +249,52 @@
crawler.report_dir.should == "faux_rails_root/tmp/tarantula"
end
it "skips links that are already queued" do
crawler = Relevance::Tarantula::Crawler.new
- crawler.should_skip_link?(Relevance::Tarantula::Link.new("/foo")).should == false
- crawler.queue_link("/foo").should == Relevance::Tarantula::Link.new("/foo")
- crawler.should_skip_link?(Relevance::Tarantula::Link.new("/foo")).should == true
+ crawler.should_skip_link?(make_link("/foo")).should == false
+ crawler.queue_link("/foo").should == make_link("/foo")
+ crawler.should_skip_link?(make_link("/foo")).should == true
end
describe "link skipping" do
before { @crawler = Relevance::Tarantula::Crawler.new }
it "skips links that are too long" do
- @crawler.should_skip_link?(Relevance::Tarantula::Link.new("/foo")).should == false
+ @crawler.should_skip_link?(make_link("/foo")).should == false
@crawler.max_url_length = 2
@crawler.expects(:log).with("Skipping long url /foo")
- @crawler.should_skip_link?(Relevance::Tarantula::Link.new("/foo")).should == true
+ @crawler.should_skip_link?(make_link("/foo")).should == true
end
it "skips outbound links (those that begin with http)" do
@crawler.expects(:log).with("Skipping http-anything")
- @crawler.should_skip_link?(Relevance::Tarantula::Link.new("http-anything")).should == true
+ @crawler.should_skip_link?(make_link("http-anything")).should == true
end
it "skips javascript links (those that begin with javascript)" do
@crawler.expects(:log).with("Skipping javascript-anything")
- @crawler.should_skip_link?(Relevance::Tarantula::Link.new("javascript-anything")).should == true
+ @crawler.should_skip_link?(make_link("javascript-anything")).should == true
end
it "skips mailto links (those that begin with http)" do
@crawler.expects(:log).with("Skipping mailto-anything")
- @crawler.should_skip_link?(Relevance::Tarantula::Link.new("mailto-anything")).should == true
+ @crawler.should_skip_link?(make_link("mailto-anything")).should == true
end
it 'skips blank links' do
@crawler.queue_link(nil)
- @crawler.links_to_crawl.should == []
+ @crawler.crawl_queue.should == []
@crawler.queue_link("")
- @crawler.links_to_crawl.should == []
+ @crawler.crawl_queue.should == []
end
it "logs and skips links that match a pattern" do
@crawler.expects(:log).with("Skipping /the-red-button")
@crawler.skip_uri_patterns << /red-button/
- @crawler.queue_link("/blue-button").should == Relevance::Tarantula::Link.new("/blue-button")
+ @crawler.queue_link("/blue-button").should == make_link("/blue-button")
@crawler.queue_link("/the-red-button").should == nil
end
it "logs and skips form submissions that match a pattern" do
@crawler.expects(:log).with("Skipping /reset-password-form")
@@ -369,10 +358,10 @@
Time.stubs(:now).returns(start_time, elasped_time_check)
crawler = Relevance::Tarantula::Crawler.new
crawler.crawl_timeout = 5.minutes
- crawler.links_to_crawl = [stub(:href => "/foo1", :method => :get), stub(:href => "/foo2", :method => :get)]
+ crawler.crawl_queue = [stub(:href => "/foo1", :method => :get), stub(:href => "/foo2", :method => :get)]
crawler.proxy = stub
crawler.proxy.stubs(:get).returns(response = stub(:code => "200"))
stub_puts_and_print(crawler)
lambda {
\ No newline at end of file