require 'spec_helper'

# Specs for GovukMirrorer::Crawler: construction (start URLs and logger
# configuration), the crawl loop (fetching, pacing, error handling and
# retries) and per-page processing.
describe GovukMirrorer::Crawler do
  # Applied to every example: stub Indexer#process_artefacts on all instances
  # and make every Crawler's #logger return a Logger writing to /dev/null.
  before :each do
    GovukMirrorer::Indexer.any_instance.stub(:process_artefacts)
    GovukMirrorer::Crawler.any_instance.stub(:logger).and_return(Logger.new("/dev/null"))
  end

  it 'should have a version number' do
    GovukMirrorer::VERSION.should_not be_nil
  end

  describe "initializing" do
    # The crawler's #urls is expected to equal the list the indexer's
    # #all_start_urls returns, in the same order.
    it "should handle all urls returned from the indexer" do
      GovukMirrorer::Indexer.any_instance.stub(:all_start_urls).and_return(%w(
        https://www.example.com/
        https://www.example.com/designprinciples
        https://www.example.com/designprinciples/styleguide
        https://www.example.com/designprinciples/performanceframework
      ))
      m = GovukMirrorer::Crawler.new
      m.urls.should == %w(
        https://www.example.com/
        https://www.example.com/designprinciples
        https://www.example.com/designprinciples/styleguide
        https://www.example.com/designprinciples/performanceframework
      )
    end

    describe "setting up the logger" do
      # Remove the suite-wide #logger stub so these examples inspect the
      # logger the crawler itself returns.
      before :each do
        GovukMirrorer::Crawler.any_instance.unstub(:logger)
      end

      it "should log to stdout by default" do
        m = GovukMirrorer::Crawler.new
        # Reads the logger's @logdev instance variable directly to find out
        # where it writes.
        logdev = m.logger.instance_variable_get('@logdev')
        logdev.dev.should == STDOUT
      end

      it "should log to a file if requested" do
        m = GovukMirrorer::Crawler.new(:log_file => "/dev/null")
        logdev = m.logger.instance_variable_get('@logdev')
        logdev.filename.should == "/dev/null"
      end

      # With :syslog => "local4" the logger must be a Syslogger using the
      # LOCAL4 facility, the PID and CONS options, and the 'govuk_mirrorer'
      # ident.
      it "should log to syslog if requested" do
        m = GovukMirrorer::Crawler.new(:syslog => "local4")
        m.logger.should be_a(Syslogger)
        m.logger.facility.should == Syslog::LOG_LOCAL4
        m.logger.options.should == (Syslog::LOG_PID | Syslog::LOG_CONS)
        m.logger.ident.should == 'govuk_mirrorer'
      end

      it "should default to log level INFO" do
        m = GovukMirrorer::Crawler.new
        m.logger.level.should == Logger::INFO
      end

      # The level is passed as the string 'warn' and is expected to come out
      # as Logger::WARN.
      it "should allow overriding the log level" do
        m = GovukMirrorer::Crawler.new(:log_level => 'warn')
        m.logger.level.should == Logger::WARN
      end
    end
  end

  describe "crawl" do
    # Builds a crawler over two start URLs with a 0.01 request interval.
    # By default the page handler and sleep are stubbed out and the agent's
    # #get returns "default"; individual examples override these.
    # The agent is reached via send -- presumably because it is not part of
    # the crawler's public interface (TODO confirm against the class).
    before :each do
      GovukMirrorer::Indexer.any_instance.stub(:all_start_urls).and_return(%w(
        https://www.example.com/1
        https://www.example.com/2
      ))
      @m = GovukMirrorer::Crawler.new(:request_interval => 0.01)
      @m.stub(:process_govuk_page)
      @m.send(:agent).stub(:get).and_return("default")
      @m.stub(:sleep)
    end

    # Each URL must be fetched and its result handed to process_govuk_page
    # (with an empty hash as second argument) before the next URL is fetched.
    it "should fetch each page and pass it to the handler" do
      @m.send(:agent).should_receive(:get).with("https://www.example.com/1").ordered.and_return("page_1")
      @m.should_receive(:process_govuk_page).with("page_1", {}).ordered
      @m.send(:agent).should_receive(:get).with("https://www.example.com/2").ordered.and_return("page_2")
      @m.should_receive(:process_govuk_page).with("page_2", {}).ordered
      @m.crawl
    end

    # Expects the sequence handler, sleep(0.01), handler, sleep(0.01).
    it "should sleep for the configured request_interval between requests" do
      @m.should_receive(:process_govuk_page).ordered
      @m.should_receive(:sleep).with(0.01).ordered # Actually on kernel, but setting the expectation here works
      @m.should_receive(:process_govuk_page).ordered
      @m.should_receive(:sleep).with(0.01).ordered
      @m.crawl
    end

    describe "handling errors" do
      # A StandardError raised while fetching the first URL must be reported
      # to handle_error together with the URL, the handler name and an empty
      # data hash.
      it "should call handle_error with the relevant details" do
        error = StandardError.new("Boom")
        @m.send(:agent).should_receive(:get).with("https://www.example.com/1").and_raise(error)
        @m.should_receive(:handle_error).with(:url => "https://www.example.com/1", :handler => :process_govuk_page, :error => error, :data => {})
        @m.crawl
      end

      # A failure on the first URL must not stop the second URL from being
      # fetched.
      it "should continue with the next URL" do
        @m.send(:agent).stub(:get).with("https://www.example.com/1").and_raise("Boom")
        @m.send(:agent).should_receive(:get).with("https://www.example.com/2").and_return("something")
        @m.crawl
      end

      context "error handling" do
        # Generates one context, holding the same two examples, for each
        # [response code, reason] pair listed here.
        [
          [429, "Too Many Requests"],
          [500, "Internal Server Error"],
          [503, "Boom"],
        ].each do |resp_code, resp_reason|
          context "#{resp_code} #{resp_reason}" do
            # First fetch raises a Mechanize::ResponseCodeError, second fetch
            # succeeds: expect one sleep(1), no handle_error call, and the
            # page passed on to the handler.
            it "should sleep for a second, and then retry" do
              error = Mechanize::ResponseCodeError.new(double("Page", code: resp_code), resp_reason)
              @m.send(:agent).should_receive(:get).with("https://www.example.com/1").ordered.and_raise(error)
              @m.send(:agent).should_receive(:get).with("https://www.example.com/1").ordered.and_return("page_1")
              @m.should_not_receive(:handle_error)
              @m.should_receive(:sleep).with(1) # Actually on kernel, but setting the expectation here works
              @m.should_receive(:process_govuk_page).with("page_1", {})
              @m.crawl
            end

            # Both fetches raise: expect exactly two attempts, one sleep(1),
            # and a single handle_error call carrying the error.
            it "should only retry once" do
              error = Mechanize::ResponseCodeError.new(double("Page", code: resp_code), resp_reason)
              @m.send(:agent).should_receive(:get).with("https://www.example.com/1").twice.and_raise(error)
              @m.should_receive(:sleep).with(1) # Actually on kernel, but setting the expectation here works
              @m.should_receive(:handle_error).with(:url => "https://www.example.com/1", :handler => :process_govuk_page, :error => error, :data => {}).once
              @m.crawl
            end
          end
        end
      end
    end
  end

  describe "process_govuk_page" do
    # Crawler configured with site_root "https://site-under-test"; saving and
    # link extraction are stubbed, and @page is a double whose uri is on that
    # same host.
    before :each do
      @m = GovukMirrorer::Crawler.new({:site_root => "https://site-under-test"})
      @m.stub(:save_to_disk)
      @m.stub(:extract_and_handle_links)
      @page = double("Page", uri: URI.parse("https://site-under-test/something"))
    end

    it "should save the page to disk" do
      @m.should_receive(:save_to_disk).with(@page)
      @m.process_govuk_page(@page)
    end

    it "should extract any links in the page" do
      @m.should_receive(:extract_and_handle_links).with(@page)
      @m.process_govuk_page(@page)
    end

    # The page's uri is switched to a host other than the configured
    # site_root; neither saving nor link extraction may then happen.
    it "should do nothing if the page is a non gov.uk page" do
      @page.stub(:uri).and_return(URI.parse("https://somewhere.else.com/foo"))
      @m.should_not_receive(:save_to_disk)
      @m.should_not_receive(:extract_and_handle_links)
      @m.process_govuk_page(@page)
    end
  end

  describe "extract_and_handle_links" do
    # Default crawler with process_link stubbed out.
    before :each do
      @m = GovukMirrorer::Crawler.new
      @m.stub(:process_link)
    end

    it "should extract all , and
HM Revenue & Customs lists the rates of VAT on different goods and services.