examples/relevance/tarantula/crawler_example.rb in tarantula-0.1.5 vs examples/relevance/tarantula/crawler_example.rb in tarantula-0.1.8
- old
+ new
@@ -1,206 +1,248 @@
-require File.join(File.dirname(__FILE__), "..", "..", "example_helper.rb")
+require File.expand_path(File.join(File.dirname(__FILE__), "..", "..", "example_helper.rb"))
-describe 'Relevance::Tarantula::Crawler#transform_url' do
- before {@crawler = Relevance::Tarantula::Crawler.new}
- it "de-obfuscates unicode obfuscated urls" do
- obfuscated_mailto = "mailto:"
- @crawler.transform_url(obfuscated_mailto).should == "mailto:"
- end
+describe Relevance::Tarantula::Crawler do
- it "strips the trailing name portion of a link" do
- @crawler.transform_url('http://host/path#name').should == 'http://host/path'
- end
-end
+ describe "transform_url" do
-describe 'Relevance::Tarantula::Crawler log grabbing' do
- it "returns nil if no grabber is specified" do
- crawler = Relevance::Tarantula::Crawler.new
- crawler.grab_log!.should == nil
- end
+ before { @crawler = Relevance::Tarantula::Crawler.new }
+
+ it "de-obfuscates unicode obfuscated urls" do
+ obfuscated_mailto = "&#109;&#97;&#105;&#108;&#116;&#111;&#58;" # "mailto:" spelled as decimal character references, so the input really is obfuscated
+ @crawler.transform_url(obfuscated_mailto).should == "mailto:"
+ end
- it "returns grabber.grab if grabber is specified" do
- crawler = Relevance::Tarantula::Crawler.new
- crawler.log_grabber = stub(:grab! => "fake log entry")
- crawler.grab_log!.should == "fake log entry"
+ it "strips the trailing name portion of a link" do
+ @crawler.transform_url('http://host/path#name').should == 'http://host/path'
+ end
end
-end
+
+
+ describe "log grabbing" do
-describe 'Relevance::Tarantula::Crawler interruption' do
- it 'catches interruption and writes the partial report' do
- crawler = Relevance::Tarantula::Crawler.new
- crawler.stubs(:queue_link)
- crawler.stubs(:do_crawl).raises(Interrupt)
- crawler.expects(:report_results)
- $stderr.expects(:puts).with("CTRL-C")
- crawler.crawl
- end
-end
+ it "returns nil if no grabber is specified" do
+ crawler = Relevance::Tarantula::Crawler.new
+ crawler.grab_log!.should == nil
+ end
-describe 'Relevance::Tarantula::Crawler handle_form_results' do
- it 'captures the result values (bugfix)' do
- response = stub_everything
- result_args = {:url => :action_stub,
- :data => 'nil',
- :response => response,
- :referrer => :action_stub,
- :log => nil,
- :method => :stub_method,
- :test_name => nil}
- result = Relevance::Tarantula::Result.new(result_args)
- Relevance::Tarantula::Result.expects(:new).with(result_args).returns(result)
- crawler = Relevance::Tarantula::Crawler.new
- crawler.handle_form_results(stub_everything(:method => :stub_method, :action => :action_stub),
- response)
+ it "returns grabber.grab if grabber is specified" do
+ crawler = Relevance::Tarantula::Crawler.new
+ crawler.log_grabber = stub(:grab! => "fake log entry")
+ crawler.grab_log!.should == "fake log entry"
+ end
+
end
-end
-
-describe 'Relevance::Tarantula::Crawler#crawl' do
- it 'queues the first url, does crawl, and then reports results' do
- crawler = Relevance::Tarantula::Crawler.new
- crawler.expects(:queue_link).with("/foobar")
- crawler.expects(:do_crawl)
- crawler.expects(:report_results)
- crawler.crawl("/foobar")
- end
- it 'reports results even if the crawl fails' do
- crawler = Relevance::Tarantula::Crawler.new
- crawler.expects(:do_crawl).raises(RuntimeError)
- crawler.expects(:report_results)
- lambda {crawler.crawl('/')}.should raise_error(RuntimeError)
+ describe "interrupt" do
+
+ it 'catches interruption and writes the partial report' do
+ crawler = Relevance::Tarantula::Crawler.new
+ crawler.stubs(:queue_link)
+ crawler.stubs(:do_crawl).raises(Interrupt)
+ crawler.expects(:report_results)
+ $stderr.expects(:puts).with("CTRL-C")
+ crawler.crawl
+ end
+
end
-end
-
-describe 'Relevance::Tarantula::Crawler queuing' do
- it 'queues and remembers links' do
- crawler = Relevance::Tarantula::Crawler.new
- crawler.expects(:transform_url).with("/url").returns("/transformed")
- crawler.queue_link("/url")
- crawler.links_to_crawl.should == [Relevance::Tarantula::Link.new("/transformed")]
- crawler.links_queued.should == Set.new([Relevance::Tarantula::Link.new("/transformed")])
- end
- it 'queues and remembers forms' do
- crawler = Relevance::Tarantula::Crawler.new
- form = Hpricot('<form action="/action" method="post"/>').at('form')
- signature = Relevance::Tarantula::FormSubmission.new(Relevance::Tarantula::Form.new(form)).signature
- crawler.queue_form(form)
- crawler.forms_to_crawl.size.should == 1
- crawler.form_signatures_queued.should == Set.new([signature])
+ describe 'handle_form_results' do
+
+ it 'captures the result values (bugfix)' do
+ response = stub_everything
+ result_args = {:url => :action_stub,
+ :data => 'nil',
+ :response => response,
+ :referrer => :action_stub,
+ :log => nil,
+ :method => :stub_method,
+ :test_name => nil}
+ result = Relevance::Tarantula::Result.new(result_args)
+ Relevance::Tarantula::Result.expects(:new).with(result_args).returns(result)
+ crawler = Relevance::Tarantula::Crawler.new
+ crawler.handle_form_results(stub_everything(:method => :stub_method, :action => :action_stub),
+ response)
+ end
+
end
- it 'remembers link referrer if there is one' do
- crawler = Relevance::Tarantula::Crawler.new
- crawler.queue_link("/url", "/some-referrer")
- crawler.referrers.should == {Relevance::Tarantula::Link.new("/url") => "/some-referrer"}
+ describe "crawl" do
+
+ it 'queues the first url, does crawl, and then reports results' do
+ crawler = Relevance::Tarantula::Crawler.new
+ crawler.expects(:queue_link).with("/foobar")
+ crawler.expects(:do_crawl)
+ crawler.expects(:report_results)
+ crawler.crawl("/foobar")
+ end
+
+ it 'reports results even if the crawl fails' do
+ crawler = Relevance::Tarantula::Crawler.new
+ crawler.expects(:do_crawl).raises(RuntimeError)
+ crawler.expects(:report_results)
+ lambda {crawler.crawl('/')}.should raise_error(RuntimeError)
+ end
+
end
-end
+ describe "queueing" do
-describe 'Relevance::Tarantula::Crawler#report_results' do
- it "delegates to generate_reports" do
- crawler = Relevance::Tarantula::Crawler.new
- crawler.expects(:generate_reports)
- crawler.report_results
- end
-end
+ it 'queues and remembers links' do
+ crawler = Relevance::Tarantula::Crawler.new
+ crawler.expects(:transform_url).with("/url").returns("/transformed")
+ crawler.queue_link("/url")
+ crawler.links_to_crawl.should == [Relevance::Tarantula::Link.new("/transformed")]
+ crawler.links_queued.should == Set.new([Relevance::Tarantula::Link.new("/transformed")])
+ end
-describe 'Relevance::Tarantula::Crawler#crawling' do
+ it 'queues and remembers forms' do
+ crawler = Relevance::Tarantula::Crawler.new
+ form = Hpricot('<form action="/action" method="post"/>').at('form')
+ signature = Relevance::Tarantula::FormSubmission.new(Relevance::Tarantula::Form.new(form)).signature
+ crawler.queue_form(form)
+ crawler.forms_to_crawl.size.should == 1
+ crawler.form_signatures_queued.should == Set.new([signature])
+ end
- it "converts ActiveRecord::RecordNotFound into a 404" do
- (proxy = stub_everything).expects(:send).raises(ActiveRecord::RecordNotFound)
- crawler = Relevance::Tarantula::Crawler.new
- crawler.proxy = proxy
- response = crawler.crawl_form stub_everything(:method => nil)
- response.code.should == "404"
- response.content_type.should == "text/plain"
- response.body.should == "ActiveRecord::RecordNotFound"
+ it 'remembers link referrer if there is one' do
+ crawler = Relevance::Tarantula::Crawler.new
+ crawler.queue_link("/url", "/some-referrer")
+ crawler.referrers.should == {Relevance::Tarantula::Link.new("/url") => "/some-referrer"}
+ end
+
end
+
+ describe "crawling" do
+
+ it "converts ActiveRecord::RecordNotFound into a 404" do
+ (proxy = stub_everything).expects(:send).raises(ActiveRecord::RecordNotFound)
+ crawler = Relevance::Tarantula::Crawler.new
+ crawler.proxy = proxy
+ response = crawler.crawl_form stub_everything(:method => nil)
+ response.code.should == "404"
+ response.content_type.should == "text/plain"
+ response.body.should == "ActiveRecord::RecordNotFound"
+ end
- it "does four things with each link: get, log, handle, and blip" do
- crawler = Relevance::Tarantula::Crawler.new
- crawler.proxy = stub
- response = stub(:code => "200")
- crawler.links_to_crawl = [stub(:href => "/foo1", :method => :get), stub(:href => "/foo2", :method => :get)]
- crawler.proxy.expects(:get).returns(response).times(2)
- crawler.expects(:log).times(2)
- crawler.expects(:handle_link_results).times(2)
- crawler.expects(:blip).times(2)
- crawler.crawl_queued_links
- crawler.links_to_crawl.should == []
- end
+ it "does four things with each link: get, log, handle, and blip" do
+ crawler = Relevance::Tarantula::Crawler.new
+ crawler.proxy = stub
+ response = stub(:code => "200")
+ crawler.links_to_crawl = [stub(:href => "/foo1", :method => :get), stub(:href => "/foo2", :method => :get)]
+ crawler.proxy.expects(:get).returns(response).times(2)
+ crawler.expects(:log).times(2)
+ crawler.expects(:handle_link_results).times(2)
+ crawler.expects(:blip).times(2)
+ crawler.crawl_queued_links
+ crawler.links_to_crawl.should == []
+ end
+
+ it "invokes queued forms, logs responses, and calls handlers" do
+ crawler = Relevance::Tarantula::Crawler.new
+ crawler.forms_to_crawl << stub_everything(:method => "get",
+ :action => "/foo",
+ :data => "some data",
+ :to_s => "stub")
+ crawler.proxy = stub_everything(:send => stub(:code => "200" ))
+ crawler.expects(:log).with("Response 200 for stub")
+ crawler.expects(:blip)
+ crawler.crawl_queued_forms
+ end
- it "invokes queued forms, logs responses, and calls handlers" do
- crawler = Relevance::Tarantula::Crawler.new
- crawler.forms_to_crawl << stub_everything(:method => "get",
- :action => "/foo",
- :data => "some data",
- :to_s => "stub")
- crawler.proxy = stub_everything(:send => stub(:code => "200" ))
- crawler.expects(:log).with("Response 200 for stub")
- crawler.expects(:blip)
- crawler.crawl_queued_forms
+ it "breaks out early if a timeout is set" do
+ crawler = Relevance::Tarantula::Crawler.new
+ stub_puts_and_print(crawler)
+ crawler.proxy = stub
+ response = stub(:code => "200")
+ crawler.links_to_crawl = [stub(:href => "/foo", :method => :get)]
+ crawler.proxy.expects(:get).returns(response).times(4)
+ crawler.forms_to_crawl << stub_everything(:method => "post",
+ :action => "/foo",
+ :data => "some data",
+ :to_s => "stub")
+ crawler.proxy.expects(:post).returns(response).times(2)
+ crawler.expects(:links_completed_count).returns(0,1,2,3,4,5).times(6)
+ crawler.times_to_crawl = 2 # NOTE(review): only the pass count is configured; no crawl_timeout is assigned, so an early break is not exercised - confirm intent
+ crawler.crawl # NOTE(review): setup and expectations match the "resets to the initial links/forms" example in this group
+
+ end
+
+ it "resets to the initial links/forms on subsequent crawls when times_to_crawl > 1" do
+ crawler = Relevance::Tarantula::Crawler.new
+ stub_puts_and_print(crawler)
+ crawler.proxy = stub
+ response = stub(:code => "200")
+ crawler.links_to_crawl = [stub(:href => "/foo", :method => :get)]
+ crawler.proxy.expects(:get).returns(response).times(4) # (stub and "/") * 2
+ crawler.forms_to_crawl << stub_everything(:method => "post",
+ :action => "/foo",
+ :data => "some data",
+ :to_s => "stub")
+ crawler.proxy.expects(:post).returns(response).times(2)
+ crawler.expects(:links_completed_count).returns(0,1,2,3,4,5).times(6)
+ crawler.times_to_crawl = 2
+ crawler.crawl
+ end
+
end
- it "resets to the initial links/forms on subsequent crawls when times_to_crawl > 1" do
- crawler = Relevance::Tarantula::Crawler.new
- stub_puts_and_print(crawler)
- crawler.proxy = stub
- response = stub(:code => "200")
- crawler.links_to_crawl = [stub(:href => "/foo", :method => :get)]
- crawler.proxy.expects(:get).returns(response).times(4) # (stub and "/") * 2
- crawler.forms_to_crawl << stub_everything(:method => "post",
- :action => "/foo",
- :data => "some data",
- :to_s => "stub")
- crawler.proxy.expects(:post).returns(response).times(2)
- crawler.expects(:links_completed_count).returns(*(0..6).to_a).times(6)
- crawler.times_to_crawl = 2
- crawler.crawl
- end
-end
+ describe "report_results" do
-describe 'Crawler blip' do
- it "blips the current progress if !verbose" do
- crawler = Relevance::Tarantula::Crawler.new
- crawler.stubs(:verbose).returns false
- crawler.expects(:print).with("\r 0 of 0 links completed ")
- crawler.blip
+ it "delegates to generate_reports" do
+ crawler = Relevance::Tarantula::Crawler.new
+ crawler.expects(:generate_reports)
+ crawler.report_results
+ end
+
end
- it "blips nothing if verbose" do
- crawler = Relevance::Tarantula::Crawler.new
- crawler.stubs(:verbose).returns true
- crawler.expects(:print).never
- crawler.blip
- end
-end
+
+ describe "blip" do
-describe 'Relevance::Tarantula::Crawler' do
- it "is finished when the links and forms are crawled" do
- crawler = Relevance::Tarantula::Crawler.new
- crawler.finished?.should == true
+ it "blips the current progress if !verbose" do
+ crawler = Relevance::Tarantula::Crawler.new
+ crawler.stubs(:verbose).returns false
+ crawler.stubs(:timeout_if_too_long)
+ crawler.expects(:print).with("\r 0 of 0 links completed ")
+ crawler.blip
+ end
+
+ it "blips nothing if verbose" do
+ crawler = Relevance::Tarantula::Crawler.new
+ crawler.stubs(:verbose).returns true
+ crawler.expects(:print).never
+ crawler.blip
+ end
+
end
+
+ describe "finished?" do
- it "isn't finished when links remain" do
- crawler = Relevance::Tarantula::Crawler.new
- crawler.links_to_crawl = [:stub_link]
- crawler.finished?.should == false
- end
+ it "is finished when the links and forms are crawled" do
+ crawler = Relevance::Tarantula::Crawler.new
+ crawler.finished?.should == true
+ end
- it "isn't finished when links remain" do
- crawler = Relevance::Tarantula::Crawler.new
- crawler.forms_to_crawl = [:stub_form]
- crawler.finished?.should == false
+ it "isn't finished when links remain" do
+ crawler = Relevance::Tarantula::Crawler.new
+ crawler.links_to_crawl = [:stub_link]
+ crawler.finished?.should == false
+ end
+
+ it "isn't finished when forms remain" do
+ crawler = Relevance::Tarantula::Crawler.new
+ crawler.forms_to_crawl = [:stub_form] # only a form is pending; this example queues no links
+ crawler.finished?.should == false
+ end
+
end
-
+
it "crawls links and forms again and again until finished?==true" do
crawler = Relevance::Tarantula::Crawler.new
crawler.expects(:finished?).times(3).returns(false, false, true)
crawler.expects(:crawl_queued_links).times(2)
crawler.expects(:crawl_queued_forms).times(2)
- crawler.do_crawl
+ crawler.do_crawl(1)
end
it "asks each reporter to write its report in report_dir" do
crawler = Relevance::Tarantula::Crawler.new
crawler.stubs(:report_dir).returns(test_output_dir)
@@ -223,74 +265,122 @@
crawler.should_skip_link?(Relevance::Tarantula::Link.new("/foo")).should == false
crawler.queue_link("/foo").should == Relevance::Tarantula::Link.new("/foo")
crawler.should_skip_link?(Relevance::Tarantula::Link.new("/foo")).should == true
end
-end
-
-describe "Crawler link skipping" do
- before do
- @crawler = Relevance::Tarantula::Crawler.new
- end
+ describe "link skipping" do
+
+ before { @crawler = Relevance::Tarantula::Crawler.new }
+
+ it "skips links that are too long" do
+ @crawler.should_skip_link?(Relevance::Tarantula::Link.new("/foo")).should == false
+ @crawler.max_url_length = 2
+ @crawler.expects(:log).with("Skipping long url /foo")
+ @crawler.should_skip_link?(Relevance::Tarantula::Link.new("/foo")).should == true
+ end
- it "skips links that are too long" do
- @crawler.should_skip_link?(Relevance::Tarantula::Link.new("/foo")).should == false
- @crawler.max_url_length = 2
- @crawler.expects(:log).with("Skipping long url /foo")
- @crawler.should_skip_link?(Relevance::Tarantula::Link.new("/foo")).should == true
- end
-
- it "skips outbound links (those that begin with http)" do
- @crawler.expects(:log).with("Skipping http-anything")
- @crawler.should_skip_link?(Relevance::Tarantula::Link.new("http-anything")).should == true
- end
+ it "skips outbound links (those that begin with http)" do
+ @crawler.expects(:log).with("Skipping http-anything")
+ @crawler.should_skip_link?(Relevance::Tarantula::Link.new("http-anything")).should == true
+ end
- it "skips javascript links (those that begin with javascript)" do
- @crawler.expects(:log).with("Skipping javascript-anything")
- @crawler.should_skip_link?(Relevance::Tarantula::Link.new("javascript-anything")).should == true
- end
+ it "skips javascript links (those that begin with javascript)" do
+ @crawler.expects(:log).with("Skipping javascript-anything")
+ @crawler.should_skip_link?(Relevance::Tarantula::Link.new("javascript-anything")).should == true
+ end
- it "skips mailto links (those that begin with http)" do
- @crawler.expects(:log).with("Skipping mailto-anything")
- @crawler.should_skip_link?(Relevance::Tarantula::Link.new("mailto-anything")).should == true
- end
+ it "skips mailto links (those that begin with mailto)" do
+ @crawler.expects(:log).with("Skipping mailto-anything") # the skip has to be logged as well as reported
+ @crawler.should_skip_link?(Relevance::Tarantula::Link.new("mailto-anything")).should == true
+ end
- it 'skips blank links' do
- @crawler.queue_link(nil)
- @crawler.links_to_crawl.should == []
- @crawler.queue_link("")
- @crawler.links_to_crawl.should == []
- end
+ it 'skips blank links' do
+ @crawler.queue_link(nil)
+ @crawler.links_to_crawl.should == []
+ @crawler.queue_link("")
+ @crawler.links_to_crawl.should == []
+ end
- it "logs and skips links that match a pattern" do
- @crawler.expects(:log).with("Skipping /the-red-button")
- @crawler.skip_uri_patterns << /red-button/
- @crawler.queue_link("/blue-button").should == Relevance::Tarantula::Link.new("/blue-button")
- @crawler.queue_link("/the-red-button").should == nil
- end
+ it "logs and skips links that match a pattern" do
+ @crawler.expects(:log).with("Skipping /the-red-button")
+ @crawler.skip_uri_patterns << /red-button/
+ @crawler.queue_link("/blue-button").should == Relevance::Tarantula::Link.new("/blue-button")
+ @crawler.queue_link("/the-red-button").should == nil
+ end
- it "logs and skips form submissions that match a pattern" do
- @crawler.expects(:log).with("Skipping /reset-password-form")
- @crawler.skip_uri_patterns << /reset-password/
- fs = stub_everything(:action => "/reset-password-form")
- @crawler.should_skip_form_submission?(fs).should == true
+ it "logs and skips form submissions that match a pattern" do
+ @crawler.expects(:log).with("Skipping /reset-password-form")
+ @crawler.skip_uri_patterns << /reset-password/
+ fs = stub_everything(:action => "/reset-password-form")
+ @crawler.should_skip_form_submission?(fs).should == true
+ end
end
-end
+
+ describe "allow_nnn_for" do
-describe "allow_nnn_for" do
- it "installs result as a response_code_handler" do
- crawler = Relevance::Tarantula::Crawler.new
- crawler.response_code_handler.should == Relevance::Tarantula::Result
+ it "installs result as a response_code_handler" do
+ crawler = Relevance::Tarantula::Crawler.new
+ crawler.response_code_handler.should == Relevance::Tarantula::Result
+ end
+
+ it "delegates to the response_code_handler" do
+ crawler = Relevance::Tarantula::Crawler.new
+ (response_code_handler = mock).expects(:allow_404_for).with(:stub)
+ crawler.response_code_handler = response_code_handler
+ crawler.allow_404_for(:stub)
+ end
+
+ it "chains up to super for method_missing" do
+ crawler = Relevance::Tarantula::Crawler.new
+ lambda{crawler.foo}.should raise_error(NoMethodError)
+ end
+
end
- it "delegates to the response_code_handler" do
- crawler = Relevance::Tarantula::Crawler.new
- (response_code_handler = mock).expects(:allow_404_for).with(:stub)
- crawler.response_code_handler = response_code_handler
- crawler.allow_404_for(:stub)
+ describe "timeouts" do
+
+ it "sets start and end times for a single crawl" do
+ start_time = Time.parse("March 1st, 2008 10:00am")
+ end_time = Time.parse("March 1st, 2008 10:10am")
+ Time.stubs(:now).returns(start_time, end_time) # consecutive Time.now calls yield start_time, then end_time
+
+ crawler = Relevance::Tarantula::Crawler.new
+ stub_puts_and_print(crawler)
+ crawler.proxy = stub_everything(:get => stub(:code => "200"))
+ crawler.crawl
+ crawler.crawl_start_times.first.should == start_time
+ crawler.crawl_end_times.first.should == end_time
+ end
+
+ it "has elapsed time for a crawl" do
+ start_time = Time.parse("March 1st, 2008 10:00am")
+ elapsed_time_check = Time.parse("March 1st, 2008, 10:10:00am")
+ Time.stubs(:now).returns(start_time, elapsed_time_check)
+
+ crawler = Relevance::Tarantula::Crawler.new
+ stub_puts_and_print(crawler)
+ crawler.proxy = stub_everything(:get => stub(:code => "200"))
+ crawler.crawl
+ crawler.elasped_time_for_pass(0).should == 600.seconds # method name keeps the spelling this spec calls; the stubbed clock advances 10 minutes
+ end
+
+ it "raises out of the crawl if elapsed time is greater than the crawl timeout" do
+ start_time = Time.parse("March 1st, 2008 10:00am")
+ elapsed_time_check = Time.parse("March 1st, 2008, 10:35:00am")
+ Time.stubs(:now).returns(start_time, elapsed_time_check)
+
+ crawler = Relevance::Tarantula::Crawler.new
+ # The stubbed clock jumps 35 minutes, well past this 5-minute limit.
+ crawler.crawl_timeout = 5.minutes
+
+ crawler.links_to_crawl = [stub(:href => "/foo1", :method => :get), stub(:href => "/foo2", :method => :get)]
+ crawler.proxy = stub
+ crawler.proxy.stubs(:get).returns(stub(:code => "200"))
+
+ stub_puts_and_print(crawler)
+ # NOTE(review): bare raise_error passes on any exception; pin the crawler's timeout error class once confirmed.
+ lambda { crawler.do_crawl(0) }.should raise_error
+ end
+
end
- it "chains up to super for method_missing" do
- crawler = Relevance::Tarantula::Crawler.new
- lambda{crawler.foo}.should raise_error(NoMethodError)
- end
-end
+end
\ No newline at end of file