spec/unit/crawler_spec.rb in rawler-0.0.2 vs spec/unit/crawler_spec.rb in rawler-0.0.3
- old
+ new
@@ -1,42 +1,98 @@
require File.dirname(__FILE__) + '/../spec_helper.rb'
describe Rawler::Crawler do
+   let(:url) { 'http://example.com' }
+
+   before(:each) do
+     Rawler.stub!(:url).and_return(url)
+   end
+
  it "should parse all links" do
-     url = 'http://example.com/'
    register(url, site)
    Rawler::Crawler.new(url).links.should == ['http://example.com/foo', 'http://external.com/bar']
  end
+ it "should parse relative links" do
+ url = 'http://example.com/path'
+ register(url, '<a href="/foo">foo</a>')
+
+ Rawler::Crawler.new(url).links.should == ['http://example.com/foo']
+ end
+
+ it "should parse links only if the page is in the same domain as the main url" do
+ url = 'http://external.com/path'
+ register(url, '<a href="/foo">foo</a>')
+
+ Rawler.should_receive(:url).and_return('http://example.com')
+
+ Rawler::Crawler.new(url).links.should == []
+ end
+
it "should return an empty array when raising Errno::ECONNREFUSED" do
- url = 'http://example.com'
register(url, site)
+ crawler = Rawler::Crawler.new(url)
- Net::HTTP.should_receive(:get).and_raise Errno::ECONNREFUSED
+ Rawler::Request.should_receive(:get).and_raise Errno::ECONNREFUSED
- crawler = Rawler::Crawler.new(url).links.should == []
+ crawler.links.should == []
end
- it "should parse relative links" do
- url = 'http://example.com/path'
- register(url, '<a href="/foo">foo</a>')
+ it "should print a message when raising Errno::ECONNREFUSED" do
+ output = double('output')
+ register(url, site)
- Rawler::Crawler.new(url).links.should == ['http://example.com/foo']
+ crawler = Rawler::Crawler.new(url)
+
+ Rawler::Request.should_receive(:get).and_raise Errno::ECONNREFUSED
+ Rawler.should_receive(:output).and_return(output)
+ output.should_receive(:puts).with("Couldn't connect to #{url}")
+
+ crawler.links
end
- # it "should print a message when raising Errno::ECONNREFUSED" do
- # pending "refactor output. Don't use a global variable"
- # url = 'http://example.com'
- # register(url, site)
- #
- # Net::HTTP.should_receive(:get).and_raise Errno::ECONNREFUSED
- #
- # $stdout.should_receive(:puts).with("Couldn't connect to #{url}")
- #
- # Rawler::Crawler.new(url).links
- # end
+ context "should ignore content type other than text/html" do
+
+ ['text/plain', 'text/css', 'image/jpeg'].each do |content_type|
+
+ it "should ignore '#{content_type}'" do
+ register(url, site, 200, :content_type => content_type)
+
+ crawler = Rawler::Crawler.new(url)
+ crawler.links.should == []
+ end
+
+ end
+
+ end
+
+ it "should ignore links other than http or https" do
+ content = <<-content
+ <a href="http://example.com/valid">foo</a>
+ <a href="mailto:info@example.com">invalid</a>
+ <a href="https://foo.com">valid</a>
+ content
+
+ register(url, content)
+
+ crawler = Rawler::Crawler.new(url)
+ crawler.links.should == ['http://example.com/valid', 'https://foo.com']
+ end
+
+ it "should crawl http basic pages" do
+ content = '<a href="http://example.com/secret-path">foo</a>'
+
+ register('http://example.com/secret', '', :status => ["401", "Unauthorized"])
+ register('http://foo:bar@example.com/secret', content)
+
+ Rawler.stub!(:username).and_return('foo')
+ Rawler.stub!(:password).and_return('bar')
+
+ crawler = Rawler::Crawler.new('http://example.com/secret')
+ crawler.links.should == ['http://example.com/secret-path']
+ end
  private
  def site
    <<-site