require File.expand_path(File.dirname(__FILE__) + "/spec_helper.rb") describe RegexpCrawler::Crawler do describe '#simple html' do it 'should parse data according to regexp' do success_page('/resources/simple.html', 'http://simple.com/') crawl = RegexpCrawler::Crawler.new(:start_page => 'http://simple.com/', :capture_regexp => %r{
(.*?)
.*
(.*?)
.*
(.*?)
}m, :named_captures => ['title', 'date', 'body'], :model => 'post') results = crawl.start results.size.should == 1 results.first[:post][:title].should == 'test' end it 'should redirect' do redirect_page('http://redirect.com/', 'http://simple.com/') success_page('/resources/simple.html', 'http://simple.com/') end end describe '#complex html' do before(:each) do success_page('/resources/complex.html', 'http://complex.com/') success_page('/resources/nested1.html', 'http://complex.com/nested1.html') success_page('/resources/nested2.html', 'http://complex.com/nested2.html') end it 'should parse data according to regexp' do crawl = RegexpCrawler::Crawler.new crawl.start_page = 'http://complex.com/' crawl.continue_regexp = %r{(?:http://complex.com/)?nested\d.html} crawl.capture_regexp = %r{
(.*?)
.*
(.*?)
.*
(.*?)
}m crawl.named_captures = ['title', 'date', 'body'] crawl.model = 'post' results = crawl.start results.size.should == 2 results.first[:post][:title].should == 'nested1' results.last[:post][:title].should == 'nested2' end end def success_page(local_path, remote_path) path = File.expand_path(File.dirname(__FILE__) + local_path) content = File.read(path) http = mock(Net::HTTPSuccess) http.stubs(:is_a?).with(Net::HTTPSuccess).returns(true) http.stubs(:body).returns(content) Net::HTTP.expects(:get_response).times(1).with(URI.parse(remote_path)).returns(http) end def redirect_page(remote_path, redirect_path) http = mock(Net::HTTPRedirection) http.stubs(:is_a?).with(Net::HTTPRedirection).returns(true) Net::HTTP.expects(:get_response).times(1).with(URI.parse(remote_path)).returns(http) end end