spec/unit/crawler_spec.rb in rawler-0.0.2 vs spec/unit/crawler_spec.rb in rawler-0.0.3

- old
+ new

@@ -1,42 +1,98 @@
 require File.dirname(__FILE__) + '/../spec_helper.rb'

 describe Rawler::Crawler do

+  let(:url) { 'http://example.com' }
+
+  before(:each) do
+    Rawler.stub!(:url).and_return(url)
+  end
+
   it "should parse all links" do
-    url = 'http://example.com/'
     register(url, site)

     Rawler::Crawler.new(url).links.should == ['http://example.com/foo', 'http://external.com/bar']
   end

+  it "should parse relative links" do
+    url = 'http://example.com/path'
+    register(url, '<a href="/foo">foo</a>')
+
+    Rawler::Crawler.new(url).links.should == ['http://example.com/foo']
+  end
+
+  it "should parse links only if the page is in the same domain as the main url" do
+    url = 'http://external.com/path'
+    register(url, '<a href="/foo">foo</a>')
+
+    Rawler.should_receive(:url).and_return('http://example.com')
+
+    Rawler::Crawler.new(url).links.should == []
+  end
+
   it "should return an empty array when raising Errno::ECONNREFUSED" do
-    url = 'http://example.com'
     register(url, site)
+    crawler = Rawler::Crawler.new(url)

-    Net::HTTP.should_receive(:get).and_raise Errno::ECONNREFUSED
+    Rawler::Request.should_receive(:get).and_raise Errno::ECONNREFUSED

-    crawler = Rawler::Crawler.new(url).links.should == []
+    crawler.links.should == []
   end

-  it "should parse relative links" do
-    url = 'http://example.com/path'
-    register(url, '<a href="/foo">foo</a>')
+  it "should print a message when raising Errno::ECONNREFUSED" do
+    output = double('output')
+    register(url, site)

-    Rawler::Crawler.new(url).links.should == ['http://example.com/foo']
+    crawler = Rawler::Crawler.new(url)
+
+    Rawler::Request.should_receive(:get).and_raise Errno::ECONNREFUSED
+    Rawler.should_receive(:output).and_return(output)
+    output.should_receive(:puts).with("Couldn't connect to #{url}")
+
+    crawler.links
   end

-  # it "should print a message when raising Errno::ECONNREFUSED" do
-  #   pending "refactor output. Don't use a global variable"
-  #   url = 'http://example.com'
-  #   register(url, site)
-  #
-  #   Net::HTTP.should_receive(:get).and_raise Errno::ECONNREFUSED
-  #
-  #   $stdout.should_receive(:puts).with("Couldn't connect to #{url}")
-  #
-  #   Rawler::Crawler.new(url).links
-  # end
+  context "should ignore content type other than text/html" do
+
+    ['text/plain', 'text/css', 'image/jpeg'].each do |content_type|
+
+      it "should ignore '#{content_type}'" do
+        register(url, site, 200, :content_type => content_type)
+
+        crawler = Rawler::Crawler.new(url)
+        crawler.links.should == []
+      end
+
+    end
+
+  end
+
+  it "should ignore links other than http or https" do
+    content = <<-content
+      <a href="http://example.com/valid">foo</a>
+      <a href="mailto:info@example.com">invalid</a>
+      <a href="https://foo.com">valid</a>
+    content
+
+    register(url, content)
+
+    crawler = Rawler::Crawler.new(url)
+    crawler.links.should == ['http://example.com/valid', 'https://foo.com']
+  end
+
+  it "should crawl http basic pages" do
+    content = '<a href="http://example.com/secret-path">foo</a>'
+
+    register('http://example.com/secret', '', :status => ["401", "Unauthorized"])
+    register('http://foo:bar@example.com/secret', content)
+
+    Rawler.stub!(:username).and_return('foo')
+    Rawler.stub!(:password).and_return('bar')
+
+    crawler = Rawler::Crawler.new('http://example.com/secret')
+    crawler.links.should == ['http://example.com/secret-path']
+  end

   private

   def site
     <<-site
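
Every example above stubs HTTP through a `register` helper that lives in spec_helper.rb and is not shown in this diff. A minimal sketch of what it might look like, inferred from the call sites above and assuming the FakeWeb gem for response stubbing (the FakeWeb wiring and the hash-tolerant third argument are assumptions, not the gem's actual helper):

    require 'fakeweb'

    # Hypothetical spec helper, inferred from the call sites above:
    #   register(url, body)                               -- plain 200 text/html page
    #   register(url, body, 200, :content_type => '...')  -- override the content type
    #   register(url, '', :status => ["401", "..."])      -- override the status
    def register(uri, content, status = 200, options = {})
      # Tolerate both call styles: a positional status code, or an
      # options hash passed in the third slot.
      status, options = 200, status if status.is_a?(Hash)
      FakeWeb.register_uri(:get, uri,
        { :body => content, :status => status, :content_type => 'text/html' }.merge(options))
    end

FakeWeb also explains the basic-auth example: a URI registered as http://foo:bar@example.com/secret matches a request to /secret that carries the corresponding Basic credentials, which is why the spec registers the 401 page and the credentialed page separately.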
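
Taken together, the two Errno::ECONNREFUSED examples pin down the refactor this release makes: fetching goes through Rawler::Request.get instead of Net::HTTP.get, failures are reported via an injectable Rawler.output rather than the $stdout global the old pending spec complained about, and links degrades to an empty array. A sketch of crawler code that would satisfy them (the class layout and extract_links step are assumptions; only Rawler::Request.get, Rawler.output, and links come from the specs):

    module Rawler
      class Crawler
        attr_reader :url

        def initialize(url)
          @url = url
        end

        # Sketch only: return the page's links, or [] when the host
        # refuses the connection, reporting through the injectable output.
        def links
          response = Rawler::Request.get(url)
          extract_links(response) # hypothetical parsing step
        rescue Errno::ECONNREFUSED
          Rawler.output.puts "Couldn't connect to #{url}"
          []
        end
      end
    end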