require 'spec_helper'

# Specs for the Wombat::Crawler mixin.
#
# Every example runs against a fresh anonymous class that includes
# Wombat::Crawler (@crawler) and one instance of it (@crawler_instance).
# Most examples stub #parse on the instance and inspect the metadata that
# #crawl hands to it, so no network access happens outside the VCR-backed
# "response code" examples.
describe Wombat::Crawler do
  before(:each) do
    @crawler = Class.new
    @crawler.send(:include, Wombat::Crawler)
    @crawler_instance = @crawler.new
  end

  describe '#crawl' do
    it 'should call the provided block' do
      event_called = false
      @crawler.event { event_called = true }

      expect(event_called).to be(true)
    end

    it 'should provide metadata to yielded block' do
      # Capture the block's receiver and assert outside the block: this way
      # the example fails (instead of passing vacuously) if the block is
      # never evaluated.
      yielded_class = nil
      @crawler.event { yielded_class = self.class }

      expect(yielded_class).to eq(Wombat::DSL::PropertyGroup)
    end

    it 'should store assigned metadata information' do
      time = Time.now

      @crawler.event do |e|
        e.title 'Fulltronic Dezembro'
        # Reuse the captured timestamp; calling Time.now a second time makes
        # the to_s comparison below flaky across a second boundary.
        e.time(time)
      end

      @crawler.venue do |v|
        v.name "Scooba"
      end

      # Parentheses avoid Ruby's "ambiguous first argument" warning for a
      # negative literal.
      @crawler.location { |v| v.latitude(-50.2323) }

      expect(@crawler_instance).to receive(:parse) do |arg|
        expect(arg["event"]["title"].selector).to eq("Fulltronic Dezembro")
        expect(arg["event"]["time"].selector.to_s).to eq(time.to_s)
        expect(arg["venue"]["name"].selector).to eq("Scooba")
        expect(arg["location"]["latitude"].selector).to eq(-50.2323)
      end

      @crawler_instance.crawl
    end

    it 'should isolate metadata between different instances' do
      another_crawler = Class.new
      another_crawler.send(:include, Wombat::Crawler)
      another_crawler_instance = another_crawler.new

      another_crawler.event { |e| e.title 'Ibiza' }
      expect(another_crawler_instance).to receive(:parse) { |arg|
        expect(arg["event"]["title"].selector).to eq("Ibiza")
      }
      another_crawler_instance.crawl

      @crawler.event { |e| e.title 'Fulltronic Dezembro' }
      expect(@crawler_instance).to receive(:parse) { |arg|
        expect(arg["event"]["title"].selector).to eq("Fulltronic Dezembro")
      }
      @crawler_instance.crawl
    end

    it 'should be able to assign arbitrary plain text metadata' do
      @crawler.some_data("/event/list", :html, "geo") { |_p| true }

      expect(@crawler_instance).to receive(:parse) do |arg|
        prop = arg['some_data']
        expect(prop.wombat_property_name).to eq("some_data")
        expect(prop.selector).to eq("/event/list")
        expect(prop.format).to eq(:html)
        expect(prop.namespaces).to eq("geo")
        expect(prop.callback).not_to be_nil
      end

      @crawler_instance.crawl
    end

    it 'should be able to specify arbitrary block structure more than once' do
      @crawler.structure do
        data "xpath=/xyz"
      end

      @crawler.structure do
        another "css=.information"
      end

      expect(@crawler_instance).to receive(:parse) do |arg|
        expect(arg["structure"]["data"].selector).to eq("xpath=/xyz")
        expect(arg["structure"]["another"].selector).to eq("css=.information")
      end

      @crawler_instance.crawl
    end

    it 'should not explode if no block given' do
      expect { @crawler.event }.not_to raise_error
    end

    it 'should assign metadata format' do
      expect(@crawler_instance).to receive(:parse) do |arg|
        expect(arg[:document_format]).to eq(:xml)
      end

      @crawler.document_format :xml
      @crawler_instance.crawl
    end

    it 'should crawl with block' do
      @crawler.base_url "danielnc.com"
      @crawler.path "/itens"

      # The block passed to #crawl overrides the path for this call only.
      expect(@crawler_instance).to receive(:parse) do |arg|
        expect(arg[:base_url]).to eq("danielnc.com")
        expect(arg[:path]).to eq("/itens/1")
      end

      @crawler_instance.crawl do
        path "/itens/1"
      end

      # A second instance of the same class must still see the class-level
      # path, untouched by the override above.
      another_instance = @crawler.new
      expect(another_instance).to receive(:parse) do |arg|
        expect(arg[:base_url]).to eq("danielnc.com")
        expect(arg[:path]).to eq("/itens")
      end

      another_instance.crawl
    end

    it 'should crawl with url and block' do
      url = 'http://danielinc.com/itens'

      expect(@crawler_instance).to receive(:parse).with(anything, url)
      @crawler_instance.crawl(url) do
      end

      another_instance = @crawler.new
      expect(another_instance).to receive(:parse).with(anything, url)
      another_instance.crawl(url)
    end

    it 'should remove created method missing' do
      @crawler.base_url "danielnc.com"
      @crawler.path "/itens"

      expect(@crawler_instance).to receive(:parse) do |arg|
        expect(arg[:base_url]).to eq("danielnc.com")
        expect(arg[:path]).to eq("/itens/1")
      end

      @crawler_instance.crawl do
        path "/itens/1"
      end

      # Must target the real instance: a misspelled instance variable is nil
      # and would raise NoMethodError no matter what #crawl leaves behind.
      expect { @crawler_instance.undefined_method }.to raise_error(NoMethodError)
    end

    it 'should remove created instance variable' do
      @crawler.base_url "danielnc.com"
      @crawler.path "/itens"

      expect(@crawler_instance).to receive(:parse) do |arg|
        expect(arg[:base_url]).to eq("danielnc.com")
        expect(arg[:path]).to eq("/itens/1")
      end

      @crawler_instance.crawl do
        path "/itens/1"
      end

      expect(@crawler_instance.instance_variables).not_to include(:@metadata_dup)
    end

    # These examples replay recorded HTTP traffic through VCR cassettes and
    # exercise the real #parse instead of a stub.
    context "response code" do
      it "should have correct mechanize response code" do
        VCR.use_cassette('basic_crawler_page') do
          @crawler.base_url "http://www.terra.com.br"
          @crawler.path '/portal'
          @crawler.search "css=.btn-search"

          @crawler_instance.crawl

          expect(@crawler_instance.response_code).to eq(200)
        end
      end

      it "should have correct rest client code" do
        VCR.use_cassette('basic_crawler_page') do
          @crawler.base_url "http://www.terra.com.br"
          @crawler.path '/portal'
          @crawler.search "css=.btn-search"
          @crawler.document_format :xml

          @crawler_instance.crawl

          expect(@crawler_instance.response_code).to eq(200)
        end
      end

      it "should have mechanize error response code" do
        VCR.use_cassette('error_page') do
          @crawler.base_url "http://www.terra.com.br"
          @crawler.path '/portal'
          @crawler.search "css=.btn-search"

          expect { @crawler_instance.crawl }.to raise_error(
            "404 => Net::HTTPNotFound for http://www.terra.com.br/portal/ -- unhandled response"
          )
          expect(@crawler_instance.response_code).to eq(404)
        end
      end

      it "should have rest client error response code" do
        VCR.use_cassette('error_page') do
          @crawler.base_url "http://www.terra.com.br"
          @crawler.path '/portal'
          @crawler.search "css=.btn-search"
          @crawler.document_format :xml

          expect { @crawler_instance.crawl }.to raise_error(RestClient::ResourceNotFound)
          expect(@crawler_instance.response_code).to eq(404)
        end
      end
    end
  end

  describe '#scrape' do
    it 'should alias to crawl' do
      expect(@crawler_instance).to receive(:parse)

      @crawler_instance.scrape
    end
  end
end