require File.expand_path(File.dirname(__FILE__) + "/spec_helper.rb")
describe RegexpCrawler::Crawler do
context '#simple html' do
# Crawls a single stubbed page and extracts named captures into a :post hash.
it 'should parse data according to regexp' do
success_page('/resources/simple.html', 'http://simple.com/')
# NOTE(review): this capture regexp looks like it lost its HTML tag
# delimiters (e.g. <title>(.*?)</title>) somewhere along the way —
# verify against spec/resources/simple.html.
crawl = RegexpCrawler::Crawler.new(:start_page => 'http://simple.com/', :capture_regexp => %r{
(.*?)
.*(.*?)
.*(.*?)
}m, :named_captures => ['title', 'date', 'body'], :model => 'post')
results = crawl.start
results.size.should == 1
results.first[:post][:title].should == 'test'
end
# FIXME(review): this example stubs a redirect from redirect.com to
# simple.com but never builds a crawler or calls #start, so the mocha
# `expects(...).times(1)` expectations set up by the helpers are never
# satisfied (an unmet expectation fails on verification) and nothing
# about redirect-following is actually asserted.
it 'should redirect' do
redirect_page('http://redirect.com/', 'http://simple.com/')
success_page('/resources/simple.html', 'http://simple.com/')
end
end
context '#complex html' do
# Stub the index page plus two directly linked nested pages for every example.
before(:each) do
success_page('/resources/complex.html', 'http://complex.com/')
success_page('/resources/nested1.html', 'http://complex.com/nested1.html')
success_page('/resources/nested2.html', 'http://complex.com/nested2.html')
end
# Follows links matching continue_regexp from the index page and
# captures a :post from each nested page.
it 'should parse data according to regexp' do
crawl = RegexpCrawler::Crawler.new
crawl.start_page = 'http://complex.com/'
crawl.continue_regexp = %r{(?:http://complex.com)?/nested\d.html}
# NOTE(review): the capture regexps in this context appear to have lost
# their HTML tag delimiters (e.g. <title>(.*?)</title>) — verify against
# the fixture files in spec/resources/.
crawl.capture_regexp = %r{(.*?)
.*(.*?)
.*(.*?)
}m
crawl.named_captures = ['title', 'date', 'body']
crawl.model = 'post'
results = crawl.start
results.size.should == 2
results.first[:post][:title].should == 'nested1'
results.last[:post][:title].should == 'nested2'
end
# The looser continue_regexp (nested\d+) lets the crawler follow the
# link from nested2.html on to nested21.html, two levels deep.
it 'should parse nested of nested data' do
success_page('/resources/nested21.html', 'http://complex.com/nested21.html')
crawl = RegexpCrawler::Crawler.new
crawl.start_page = 'http://complex.com/'
crawl.continue_regexp = %r{(?:http://complex.com)?/?nested\d+.html}
crawl.capture_regexp = %r{(.*?)
.*(.*?)
.*(.*?)
}m
crawl.named_captures = ['title', 'date', 'body']
crawl.model = 'post'
results = crawl.start
results.size.should == 3
results.first[:post][:title].should == 'nested1'
results.last[:post][:title].should == 'nested21'
end
# When save_method is provided the crawler hands each result to it
# instead of accumulating results, so #start returns an empty list.
it "should save by myself" do
crawl = RegexpCrawler::Crawler.new
crawl.start_page = 'http://complex.com/'
crawl.continue_regexp = %r{(?:http://complex.com)?/nested\d.html}
crawl.capture_regexp = %r{(.*?)
.*(.*?)
.*(.*?)
}m
crawl.named_captures = ['title', 'date', 'body']
my_results = []
crawl.save_method = Proc.new {|result, page| my_results << result}
results = crawl.start
results.size.should == 0
my_results.size.should == 2
end
# A save_method returning false halts the crawl immediately.
# NOTE(review): expecting parse_pages to stay empty assumes (a) the index
# page itself doesn't match capture_regexp, so save_method first fires for
# nested1.html, and (b) `page` is yielded as a String equal to stop_page
# (a URI object would never == the string) — TODO confirm against the
# crawler implementation.
it "should stop parse" do
crawl = RegexpCrawler::Crawler.new
crawl.start_page = 'http://complex.com/'
crawl.continue_regexp = %r{(?:http://complex.com)?/nested\d.html}
crawl.capture_regexp = %r{(.*?)
.*(.*?)
.*(.*?)
}m
crawl.named_captures = ['title', 'date', 'body']
stop_page = "http://complex.com/nested1.html"
parse_pages = []
crawl.save_method = Proc.new do |result, page|
if page == stop_page
false
else
parse_pages << page
end
end
results = crawl.start
parse_pages.size.should == 0
end
# need_parse lets the caller veto individual pages by inspecting the
# response body; here nested2.html is skipped but nested21.html (reached
# via its link) is still parsed.
it 'should parse skip nested2.html' do
success_page('/resources/nested21.html', 'http://complex.com/nested21.html')
crawl = RegexpCrawler::Crawler.new
crawl.start_page = 'http://complex.com/'
crawl.continue_regexp = %r{(?:http://complex.com)?/?nested\d+.html}
crawl.capture_regexp = %r{(.*?)
.*(.*?)
.*(.*?)
}m
crawl.named_captures = ['title', 'date', 'body']
crawl.model = 'post'
crawl.need_parse = Proc.new do |uri, response_body|
if response_body.index('nested2 test html')
false
else
true
end
end
results = crawl.start
results.size.should == 2
results.first[:post][:title].should == 'nested1'
results.last[:post][:title].should == 'nested21'
end
end
# Stubs Net::HTTP.get_response_with_headers so that exactly one request for
# remote_path (with nil headers) yields a Net::HTTPSuccess-like mock whose
# body is the fixture file at local_path (relative to this spec file).
def success_page(local_path, remote_path)
  fixture = File.expand_path(File.dirname(__FILE__) + local_path)
  body = File.read(fixture)
  response = mock(Net::HTTPSuccess)
  response.stubs(:is_a?).with(Net::HTTPSuccess).returns(true)
  response.stubs(:body).returns(body)
  Net::HTTP.expects(:get_response_with_headers).times(1).with(URI.parse(remote_path), nil).returns(response)
end
# Stubs Net::HTTP.get_response_with_headers so that exactly one request for
# remote_path (with nil headers) yields a Net::HTTPRedirection-like mock.
# NOTE(review): redirect_path is accepted but never used by the stub —
# presumably the real response would carry it in the Location header.
def redirect_page(remote_path, redirect_path)
  response = mock(Net::HTTPRedirection)
  response.stubs(:is_a?).with(Net::HTTPRedirection).returns(true)
  Net::HTTP.expects(:get_response_with_headers).times(1).with(URI.parse(remote_path), nil).returns(response)
end
end