# encoding: UTF-8

require File.dirname(__FILE__) + '/../../spec_helper.rb'

describe Rawler::Crawler do

  let(:url)    { 'http://example.com' }
  let(:output) { double('output', :error => nil) }

  before(:each) do
    Rawler.stub!(:url).and_return(url)
    Rawler.stub!(:output).and_return(output)
  end
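  # `register` is provided by spec_helper.rb and fakes HTTP responses so the
  # crawler never touches the network. A minimal sketch of such a helper,
  # assuming the FakeWeb gem -- the signature is inferred from the calls in
  # this file, not taken from the actual spec_helper:
  #
  #   def register(uri, body, status = 200, options = {})
  #     # allow register(uri, body, :status => [...]) as a shorthand
  #     status, options = 200, status if status.is_a?(Hash)
  #     FakeWeb.register_uri(:get, uri,
  #       { :body => body, :status => status }.merge(options))
  #   end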
  context "basic functionality" do
    let(:url)     { 'http://example.com' }
    let(:crawler) { Rawler::Crawler.new(url) }
    let(:content) {
      content = <<-content
        <a href="http://example.com/foo">foo</a>
        <a href="http://external.com/bar">bar</a>
      content
    }

    before(:each) do
      register(url, content)
    end

    it "should parse all links" do
      crawler.links.should == ['http://example.com/foo', 'http://external.com/bar']
    end
  end

  context "relative paths" do

    context "base URL ends with a slash" do
      let(:url)     { 'http://example.com/dir1/dir2/' }
      let(:crawler) { Rawler::Crawler.new(url) }
      let(:content) { '<a href="/foo">foo</a> <a href="bar">bar</a> <a href="../baz">baz</a>' }

      before(:each) do
        register(url, content)
      end

      it "should parse relative links" do
        crawler.links.should == ['http://example.com/foo', 'http://example.com/dir1/dir2/bar', 'http://example.com/dir1/baz']
      end
    end

    context "base URL doesn't end with a slash" do
      let(:url)     { 'http://example.com/dir1/dir2' }
      let(:crawler) { Rawler::Crawler.new(url) }
      let(:content) { '<a href="/foo">foo</a> <a href="bar">bar</a> <a href="../baz">baz</a>' }

      before(:each) do
        register(url, content)
      end

      it "should parse relative links" do
        crawler.links.should == ['http://example.com/foo', 'http://example.com/dir1/bar', 'http://example.com/baz']
      end
    end

  end
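  # The relative-path expectations above follow RFC 3986 resolution, which
  # Ruby's standard library exposes as URI#merge. A quick illustration of the
  # behaviour the crawler is expected to reproduce (not its actual code):
  #
  #   require 'uri'
  #
  #   URI.parse('http://example.com/dir1/dir2/').merge('bar')   #=> http://example.com/dir1/dir2/bar
  #   URI.parse('http://example.com/dir1/dir2').merge('bar')    #=> http://example.com/dir1/bar
  #   URI.parse('http://example.com/dir1/dir2').merge('../baz') #=> http://example.com/baz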
  context "different domains" do
    let(:url)     { 'http://external.com/path' }
    let(:crawler) { Rawler::Crawler.new(url) }
    let(:content) { '<a href="http://example.com/foo">foo</a>' }

    before(:each) do
      Rawler.stub!(:url).and_return('http://example.com')
      register(url, content)
    end

    it "should not parse links on a different domain" do
      crawler.links.should == []
    end
  end

  context "urls with hash tags" do
    let(:url)     { 'http://example.com/path' }
    let(:crawler) { Rawler::Crawler.new(url) }
    let(:content) { '<a href="http://example.com/foo#bar">foo</a>' }

    before(:each) do
      register(url, content)
    end

    it "should not encode hashtags" do
      crawler.links.should == ['http://example.com/foo#bar']
    end
  end

  context "urls with unicode characters" do
    let(:url)     { 'http://example.com' }
    let(:crawler) { Rawler::Crawler.new(url) }
    let(:content) { '<a href="http://example.com/写程序容易出现的几个不好的地方">foo</a>' }

    before(:each) do
      register(url, content)
    end

    it "should parse unicode links" do
      crawler.links.should == ['http://example.com/%E5%86%99%E7%A8%8B%E5%BA%8F%E5%AE%B9%E6%98%93%E5%87%BA%E7%8E%B0%E7%9A%84%E5%87%A0%E4%B8%AA%E4%B8%8D%E5%A5%BD%E7%9A%84%E5%9C%B0%E6%96%B9']
    end
  end

  context "invalid urls" do

    context "javascript" do
      let(:url)     { 'http://example.com/path' }
      let(:crawler) { Rawler::Crawler.new(url) }
      let(:js_url)  { "javascript:fn('nbjmup;jhfs.esf{fio/dpn');" }
      let(:content) { "<a href=\"#{js_url}\">foo</a>" }

      before(:each) do
        register(url, content)
      end

      it "should return empty links" do
        crawler.links.should == []
      end

      it "should not report the error" do
        crawler.should_not_receive(:write)
        crawler.links
      end
    end

    context "mailto" do
      let(:url)     { 'http://example.com/path' }
      let(:crawler) { Rawler::Crawler.new(url) }
      # the address is illustrative; any mailto: link exercises the same path
      let(:content) { '<a href="mailto:info@example.com">foo</a>' }

      before(:each) do
        register(url, content)
      end

      it "should return empty links" do
        crawler.links.should == []
      end

      it "should not report the error" do
        crawler.should_not_receive(:write)
        crawler.links
      end
    end

  end

  context "content type" do
    ['text/plain', 'text/css', 'image/jpeg'].each do |content_type|
      let(:url)     { 'http://example.com' }
      let(:crawler) { Rawler::Crawler.new(url) }

      before(:each) do
        register(url, '', 200, :content_type => content_type)
      end

      it "should ignore '#{content_type}'" do
        crawler.links.should == []
      end
    end
  end

  context "Exceptions" do
    let(:url)     { 'http://example.com' }
    let(:crawler) { Rawler::Crawler.new(url) }

    before(:each) do
      register(url, '')
    end

    context "Errno::ECONNREFUSED" do
      before(:each) do
        Rawler::Request.stub!(:get).and_raise Errno::ECONNREFUSED
      end

      it "should return an empty array" do
        crawler.links.should == []
      end

      it "should print a message when raising Errno::ECONNREFUSED" do
        output.should_receive(:error).with("Couldn't connect to #{url}")
        crawler.links
      end
    end

    context "Errno::ETIMEDOUT" do
      before(:each) do
        Rawler::Request.stub!(:get).and_raise Errno::ETIMEDOUT
      end

      it "should return an empty array when raising Errno::ETIMEDOUT" do
        crawler.links.should == []
      end

      it "should print a message when raising Errno::ETIMEDOUT" do
        output.should_receive(:error).with("Connection to #{url} timed out")
        crawler.links
      end
    end
  end

  context "http basic" do
    let(:url)     { 'http://example.com' }
    let(:content) { '<a href="http://example.com/secret-path">foo</a>' }
    let(:crawler) { Rawler::Crawler.new('http://example.com/secret') }

    before(:each) do
      register('http://example.com/secret', '', :status => ["401", "Unauthorized"])
      register('http://foo:bar@example.com/secret', content)

      Rawler.stub!(:username).and_return('foo')
      Rawler.stub!(:password).and_return('bar')
    end

    it "should crawl http basic pages" do
      crawler.links.should == ['http://example.com/secret-path']
    end
  end

  context "url domain" do
    let(:content) {
      content = <<-content
        <a href="http://example.com/valid">foo</a>
        <a href="mailto:info@example.com">invalid</a>
        <a href="https://foo.com">valid</a>
        <a href="  http://fooo.com  ">valid with illegal whitespaces</a>
      content
    }
    let(:url)     { 'http://example.com' }
    let(:crawler) { Rawler::Crawler.new(url) }

    before(:each) do
      register(url, content)
    end

    it "should ignore links other than http or https" do
      crawler.links.should == ['http://example.com/valid', 'https://foo.com', 'http://fooo.com']
    end
  end

  context "invalid urls" do
    let(:content) { '<a href="http://foo;bar">foo</a>' }
    let(:url)     { 'http://example.com' }
    let(:crawler) { Rawler::Crawler.new(url) }

    before(:each) do
      register(url, content)
    end

    it "should notify about the invalid url" do
      output.should_receive(:error).with('Invalid url: http://foo;bar - Called from: http://example.com')
      crawler.links.should == []
    end
  end

end