# coding: utf-8
require 'spec_helper'

# Integration specs for the Wombat crawler DSL.
#
# Every example replays a recorded HTTP interaction through a VCR cassette,
# builds a crawler (via the class-level DSL, a block passed to an instance's
# `crawl`, or the static `Wombat.crawl`), and compares the resulting hash with
# the values expected for that cassette.
describe 'basic crawler setup' do
  # Menu entries expected under "iterator0" when crawling the Terra portal
  # cassette. Shared by every example that iterates over "css=.ctn-links".
  let(:terra_menu) do
    [
      {"menu"=>"Agenda"},
      {"menu"=>"Brasileiro"},
      {"menu"=>"Brasil"},
      {"menu"=>"Bolsas"},
      {"menu"=>"Cinema"},
      {"menu"=>"Galerias de Fotos"},
      {"menu"=>"Beleza"},
      {"menu"=>"Esportes"},
      {"menu"=>"Assine o RSS"}
    ]
  end

  # Asserts the four properties checked by every full Terra portal crawl in
  # this file: search label, menu iterator, rewritten subheader, nested social.
  #
  # @param results [Hash] the hash returned by the crawl under test
  def verify_terra_portal(results)
    results["search"].should == "Buscar"
    results["iterator0"].should == terra_menu
    # The subheader block in each example replaces "London" with "Londres".
    results["subheader"].should == "Londres 2012"
    results["social"]["twitter"].should == "Verão"
  end

  it 'should crawl page' do
    VCR.use_cassette('basic_crawler_page') do
      crawler = Class.new
      crawler.send(:include, Wombat::Crawler)

      crawler.base_url "http://www.terra.com.br"
      crawler.list_page '/portal'

      crawler.search "css=.btn-search"

      crawler.social do |s|
        s.twitter "css=.ctn-bar li.last"
      end

      crawler.for_each "css=.ctn-links" do
        menu "css=a"
      end

      crawler.subheader "css=h2.ttl-dynamic" do |h|
        h.gsub("London", "Londres")
      end

      crawler_instance = crawler.new

      results = crawler_instance.crawl

      verify_terra_portal(results)
    end
  end

  it 'should clear iterators between multiple runs' do
    crawler = Class.new
    crawler.send(:include, Wombat::Crawler)

    crawler.base_url "http://www.terra.com.br"
    crawler.list_page '/portal'

    crawler.for_each "css=.ctn-links" do
      menu "css=a"
    end

    crawler_instance = crawler.new
    results = nil

    # Crawl twice with the same instance; the second run must yield the same
    # iterator contents as the first.
    VCR.use_cassette('basic_crawler_page') do
      results = crawler_instance.crawl
    end

    results["iterator0"].should == terra_menu

    VCR.use_cassette('basic_crawler_page') do
      results = crawler_instance.crawl
    end

    results["iterator0"].should == terra_menu
  end

  it 'should crawl page through block to class instance crawl method' do
    VCR.use_cassette('basic_crawler_page') do
      crawler = Class.new
      crawler.send(:include, Wombat::Crawler)

      crawler_instance = crawler.new

      results = crawler_instance.crawl do
        base_url "http://www.terra.com.br"
        list_page '/portal'

        search "css=.btn-search"

        social do |s|
          s.twitter "css=.ctn-bar li.last"
        end

        for_each "css=.ctn-links" do
          menu "css=a"
        end

        subheader "css=h2.ttl-dynamic" do |h|
          h.gsub("London", "Londres")
        end
      end

      verify_terra_portal(results)
    end
  end

  it 'should crawl page through static crawl method' do
    VCR.use_cassette('basic_crawler_page') do
      results = Wombat.crawl do
        base_url "http://www.terra.com.br"
        list_page '/portal'

        search "css=.btn-search"

        social do |s|
          s.twitter "css=.ctn-bar li.last"
        end

        for_each "css=.ctn-links" do
          menu "css=a"
        end

        subheader "css=h2.ttl-dynamic" do |h|
          h.gsub("London", "Londres")
        end
      end

      verify_terra_portal(results)
    end
  end

  it 'should iterate elements' do
    VCR.use_cassette('for_each_page') do
      crawler = Class.new
      crawler.send(:include, Wombat::Crawler)

      crawler.base_url "https://www.github.com"
      crawler.list_page "/explore"

      crawler.for_each "css=ol.ranked-repositories li" do
        project do |p|
          p.repo 'css=h3'
          # Every "for" is stripped from the description, which is why the
          # expected descriptions below read with a word missing.
          p.description('css=p.description') { |d| d.gsub(/for/, '') }
        end
      end

      crawler_instance = crawler.new
      results = crawler_instance.crawl

      results.should == {
        "iterator0" => [
          {
            "project" => {
              "repo" => "jairajs89 / Touchy.js",
              "description" => "A simple light-weight JavaScript library dealing with touch events"
            }
          },
          {
            "project" => {
              "repo" => "mcavage / node-restify",
              "description" => "node.js REST framework specifically meant web service APIs"
            }
          },
          {
            "project" => {
              "repo" => "notlion / streetview-stereographic",
              "description" => "Shader Toy + Google Map + Panoramic Explorer"
            }
          },
          {
            "project" => {
              "repo" => "twitter / bootstrap",
              "description" => "HTML, CSS, and JS toolkit from Twitter"
            }
          },
          {
            "project" => {
              "repo" => "stolksdorf / Parallaxjs",
              "description" => "a Library Javascript that allows easy page parallaxing"
            }
          }
        ]
      }
    end
  end

  it 'should crawl xml with namespaces' do
    VCR.use_cassette('xml_with_namespace') do
      crawler = Class.new
      crawler.send(:include, Wombat::Crawler)

      crawler.document_format :xml
      crawler.base_url "http://ws.audioscrobbler.com"
      # The location is written pre-escaped ("San%20Francisco") instead of
      # calling URI.escape, which was removed in Ruby 3.0. The resulting URL is
      # the same string URI.escape('San Francisco') used to produce.
      crawler.list_page "/2.0/?method=geo.getevents&location=San%20Francisco&api_key=060decb474b73437d5bbec37f527ae7b"

      crawler.artist "xpath=//title", :list

      crawler.for_each 'xpath=//event' do
        # The third argument maps the "geo" prefix used in the XPath to its
        # namespace URI.
        latitude "xpath=./venue/location/geo:point/geo:lat", :text, { 'geo' => 'http://www.w3.org/2003/01/geo/wgs84_pos#' }
        longitude "xpath=./venue/location/geo:point/geo:long", :text, { 'geo' => 'http://www.w3.org/2003/01/geo/wgs84_pos#' }
      end

      crawler_instance = crawler.new
      results = crawler_instance.crawl
      iterator = results['iterator0']

      iterator.should == [
        {"latitude"=>"37.807775", "longitude"=>"-122.272736"},
        {"latitude"=>"37.807717", "longitude"=>"-122.270059"},
        {"latitude"=>"37.869784", "longitude"=>"-122.267701"},
        {"latitude"=>"37.870873", "longitude"=>"-122.269313"},
        {"latitude"=>"37.782348", "longitude"=>"-122.408059"},
        {"latitude"=>"37.775529", "longitude"=>"-122.437757"},
        {"latitude"=>"37.771079", "longitude"=>"-122.412604"},
        {"latitude"=>"37.771079", "longitude"=>"-122.412604"},
        {"latitude"=>"37.784963", "longitude"=>"-122.418871"},
        {"latitude"=>"37.788978", "longitude"=>"-122.40664"}
      ]

      # `should =~` on an array compares contents irrespective of order.
      results["artist"].should =~ ["Davka", "Digitalism (DJ Set)", "Gary Clark Jr.", "Lenny Kravitz", "Little Muddy", "Michael Schenker Group", "The Asteroids Galaxy Tour", "When Indie Attacks", "When Indie Attacks", "YOB"]
    end
  end
end