spec/unit/crawler_spec.rb in rawler-0.0.2 vs spec/unit/crawler_spec.rb in rawler-0.0.3

- old
+ new

@@ -1,42 +1,98 @@
 require File.dirname(__FILE__) + '/../spec_helper.rb'

 describe Rawler::Crawler do

+  let(:url) { 'http://example.com' }
+
+  before(:each) do
+    Rawler.stub!(:url).and_return(url)
+  end
+
   it "should parse all links" do
-    url = 'http://example.com/'
     register(url, site)

     Rawler::Crawler.new(url).links.should == ['http://example.com/foo', 'http://external.com/bar']
   end

+  it "should parse relative links" do
+    url = 'http://example.com/path'
+    register(url, '<a href="/foo">foo</a>')
+
+    Rawler::Crawler.new(url).links.should == ['http://example.com/foo']
+  end
+
+  it "should parse links only if the page is in the same domain as the main url" do
+    url = 'http://external.com/path'
+    register(url, '<a href="/foo">foo</a>')
+
+    Rawler.should_receive(:url).and_return('http://example.com')
+
+    Rawler::Crawler.new(url).links.should == []
+  end
+
   it "should return an empty array when raising Errno::ECONNREFUSED" do
-    url = 'http://example.com'
     register(url, site)
+    crawler = Rawler::Crawler.new(url)

-    Net::HTTP.should_receive(:get).and_raise Errno::ECONNREFUSED
+    Rawler::Request.should_receive(:get).and_raise Errno::ECONNREFUSED

-    crawler = Rawler::Crawler.new(url).links.should == []
+    crawler.links.should == []
   end

-  it "should parse relative links" do
-    url = 'http://example.com/path'
-    register(url, '<a href="/foo">foo</a>')
+  it "should print a message when raising Errno::ECONNREFUSED" do
+    output = double('output')
+    register(url, site)

-    Rawler::Crawler.new(url).links.should == ['http://example.com/foo']
+    crawler = Rawler::Crawler.new(url)
+
+    Rawler::Request.should_receive(:get).and_raise Errno::ECONNREFUSED
+    Rawler.should_receive(:output).and_return(output)
+    output.should_receive(:puts).with("Couldn't connect to #{url}")
+
+    crawler.links
   end

-  # it "should print a message when raising Errno::ECONNREFUSED" do
-  #   pending "refactor output. Don't use a global variable"
-  #   url = 'http://example.com'
-  #   register(url, site)
-  #
-  #   Net::HTTP.should_receive(:get).and_raise Errno::ECONNREFUSED
-  #
-  #   $stdout.should_receive(:puts).with("Couldn't connect to #{url}")
-  #
-  #   Rawler::Crawler.new(url).links
-  # end
+  context "should ignore content type other than text/html" do
+
+    ['text/plain', 'text/css', 'image/jpeg'].each do |content_type|
+
+      it "should ignore '#{content_type}'" do
+        register(url, site, 200, :content_type => content_type)
+
+        crawler = Rawler::Crawler.new(url)
+        crawler.links.should == []
+      end
+
+    end
+
+  end
+
+  it "should ignore links other than http or https" do
+    content = <<-content
+      <a href="http://example.com/valid">foo</a>
+      <a href="mailto:info@example.com">invalid</a>
+      <a href="https://foo.com">valid</a>
+    content
+
+    register(url, content)
+
+    crawler = Rawler::Crawler.new(url)
+    crawler.links.should == ['http://example.com/valid', 'https://foo.com']
+  end
+
+  it "should crawl http basic pages" do
+    content = '<a href="http://example.com/secret-path">foo</a>'
+
+    register('http://example.com/secret', '', :status => ["401", "Unauthorized"])
+    register('http://foo:bar@example.com/secret', content)
+
+    Rawler.stub!(:username).and_return('foo')
+    Rawler.stub!(:password).and_return('bar')
+
+    crawler = Rawler::Crawler.new('http://example.com/secret')
+    crawler.links.should == ['http://example.com/secret-path']
+  end

   private

   def site
     <<-site
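
Every example above stubs HTTP through a `register` helper that lives in spec_helper.rb and is not shown in this diff. A minimal sketch of what it might look like, inferred from the call sites above and assuming the FakeWeb gem for response stubbing (the FakeWeb wiring and the hash-tolerant third argument are assumptions, not the gem's actual helper):

    require 'fakeweb'

    # Hypothetical spec helper, inferred from the call sites above:
    #   register(url, body)                               -- plain 200 text/html page
    #   register(url, body, 200, :content_type => '...')  -- override the content type
    #   register(url, '', :status => ["401", "..."])      -- override the status
    def register(uri, content, status = 200, options = {})
      # Tolerate both call styles: a positional status code, or an
      # options hash passed in the third slot.
      status, options = 200, status if status.is_a?(Hash)
      FakeWeb.register_uri(:get, uri,
        { :body => content, :status => status, :content_type => 'text/html' }.merge(options))
    end

FakeWeb also explains the basic-auth example: a URI registered as http://foo:bar@example.com/secret matches a request to /secret that carries the corresponding Basic credentials, which is why the spec registers the 401 page and the credentialed page separately.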
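
Taken together, the two Errno::ECONNREFUSED examples pin down the refactor this release makes: fetching goes through Rawler::Request.get instead of Net::HTTP.get, failures are reported via an injectable Rawler.output rather than the $stdout global the old pending spec complained about, and links degrades to an empty array. A sketch of crawler code that would satisfy them (the class layout and extract_links step are assumptions; only Rawler::Request.get, Rawler.output, and links come from the specs):

    module Rawler
      class Crawler
        attr_reader :url

        def initialize(url)
          @url = url
        end

        # Sketch only: return the page's links, or [] when the host
        # refuses the connection, reporting through the injectable output.
        def links
          response = Rawler::Request.get(url)
          extract_links(response) # hypothetical parsing step
        rescue Errno::ECONNREFUSED
          Rawler.output.puts "Couldn't connect to #{url}"
          []
        end
      end
    end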