require 'spec_helper'
describe MetaInspector::Document do
# The page markup is handed in through the :document option; the examples
# assert the title and the internal links derived from that markup.
describe 'passing the contents of the document as html' do
  # The markup must contain a <title> and a relative <a href> so that the
  # title and the absolutised http://cnn.com/hello link asserted below exist.
  let(:doc) do
    MetaInspector::Document.new(
      'http://cnn.com/',
      :document => "<html><head><title>Hello From Passed Html</title><a href='/hello'>Hello link</a></head><body></body></html>"
    )
  end

  it "should get correct links when the url html is passed as an option" do
    doc.links.internal.should == ["http://cnn.com/hello"]
  end

  it "should get the title" do
    doc.title.should == "Hello From Passed Html"
  end
end
# to_s on a document must produce an object whose class is exactly String.
it "should return a String as to_s" do
  page = MetaInspector::Document.new('http://pagerankalert.com')
  page.to_s.should be_an_instance_of(String)
end
# Compares the full to_hash output for pagerankalert.com against literal
# expected values. The expectation is assembled from named pieces (links,
# meta tags, response) so each section can be read on its own.
it "should return a Hash with all the values set" do
  page = MetaInspector::Document.new('http://pagerankalert.com')

  # Links grouped under the 'internal', 'external' and 'non_http' keys.
  expected_links = {
    'internal' => [
      "http://pagerankalert.com/",
      "http://pagerankalert.com/es?language=es",
      "http://pagerankalert.com/users/sign_up",
      "http://pagerankalert.com/users/sign_in"
    ],
    'external' => [
      "http://pagerankalert.posterous.com/",
      "http://twitter.com/pagerankalert",
      "http://twitter.com/share"
    ],
    'non_http' => ["mailto:pagerankalert@gmail.com"]
  }

  # Meta tags grouped under 'name', 'http-equiv', 'property' and 'charset'.
  expected_meta_tags = {
    "name" => {
      "description" => ["Track your PageRank(TM) changes and receive alerts by email"],
      "keywords"    => ["pagerank, seo, optimization, google"],
      "robots"      => ["all,follow"],
      "csrf-param"  => ["authenticity_token"],
      "csrf-token"  => ["iW1/w+R8zrtDkhOlivkLZ793BN04Kr3X/pS+ixObHsE="]
    },
    "http-equiv" => {},
    "property"   => {},
    "charset"    => ["utf-8"]
  }

  # Status code and response headers expected for this URL.
  expected_response = {
    "status"  => 200,
    "headers" => {
      "server"          => "nginx/0.7.67",
      "date"            => "Mon, 30 May 2011 09:45:42 GMT",
      "content-type"    => "text/html; charset=utf-8",
      "connection"      => "keep-alive",
      "etag"            => "\"d0534cf7ad7d7a7fb737fe4ad99b0fd1\"",
      "x-ua-compatible" => "IE=Edge,chrome=1",
      "x-runtime"       => "0.031274",
      "set-cookie"      => "_session_id=33575f7694b4492af4c4e282d62a7127; path=/; HttpOnly",
      "cache-control"   => "max-age=0, private, must-revalidate",
      "content-length"  => "6690",
      "x-varnish"       => "2167295052",
      "age"             => "0",
      "via"             => "1.1 varnish"
    }
  }

  expected = {
    "url"          => "http://pagerankalert.com/",
    "scheme"       => "http",
    "host"         => "pagerankalert.com",
    "root_url"     => "http://pagerankalert.com/",
    "title"        => "PageRankAlert.com :: Track your PageRank changes & receive alerts",
    "best_title"   => "PageRankAlert.com :: Track your PageRank changes & receive alerts",
    "description"  => "Track your PageRank(TM) changes and receive alerts by email",
    "favicon"      => "http://pagerankalert.com/src/favicon.ico",
    "links"        => expected_links,
    "images"       => ["http://pagerankalert.com/images/pagerank_alert.png?1305794559"],
    "charset"      => "utf-8",
    "feed"         => "http://feeds.feedburner.com/PageRankAlert",
    "content_type" => "text/html",
    "meta_tags"    => expected_meta_tags,
    "response"     => expected_response
  }

  page.to_hash.should == expected
end
# Exercises how a document reacts to non-HTML content depending on the
# html_content_only option, and where the resulting error ends up depending
# on the exception_log / warn_level options.
describe 'exception handling' do
  let(:logger) { MetaInspector::ExceptionLog.new }

  it "should parse images when html_content_only is not specified" do
    logger.should_not_receive(:<<)
    image_url = MetaInspector::Document.new('http://pagerankalert.com/image.png', exception_log: logger)
    image_url.title
  end

  it "should parse images when html_content_only is false" do
    logger.should_not_receive(:<<)
    image_url = MetaInspector::Document.new('http://pagerankalert.com/image.png', html_content_only: false, exception_log: logger)
    image_url.title
  end

  it "should handle errors when content is image/png and html_content_only is true" do
    logger.should_receive(:<<).with(an_instance_of(RuntimeError))
    image_url = MetaInspector::Document.new('http://pagerankalert.com/image.png', html_content_only: true, exception_log: logger)
    image_url.title
  end

  it "should handle errors when content is not text/html and html_content_only is true" do
    logger.should_receive(:<<).with(an_instance_of(RuntimeError))
    tar_url = MetaInspector::Document.new('http://pagerankalert.com/file.tar.gz', html_content_only: true, exception_log: logger)
    tar_url.title
  end

  context 'when a warn_level of :store is passed in' do
    before do
      @bad_request = MetaInspector::Document.new('http://pagerankalert.com/image.png', html_content_only: true, warn_level: :store)
      @bad_request.title
    end

    it 'stores the exceptions' do
      @bad_request.exceptions.should_not be_empty
    end

    it 'makes ok? to return false' do
      @bad_request.should_not be_ok
    end
  end

  context 'when a warn_level of :warn is passed in' do
    before do
      # Remember the stream that was active so it can be put back afterwards;
      # it is not necessarily the STDERR constant if the runner redirected it.
      @original_stderr = $stderr
      $stderr = StringIO.new
    end

    after do
      $stderr = @original_stderr
    end

    it 'warns on STDERR' do
      bad_request = MetaInspector::Document.new('http://pagerankalert.com/image.png', html_content_only: true, warn_level: :warn)
      bad_request.title
      # StringIO#string returns the whole buffer regardless of position,
      # so no rewind is needed before reading it.
      $stderr.string.chomp.should eq("The url provided contains image/png content instead of text/html content")
    end

    it 'does not raise an exception' do
      expect {
        bad_request = MetaInspector::Document.new('http://pagerankalert.com/image.png', html_content_only: true, warn_level: :warn)
        bad_request.title
      }.to_not raise_exception
    end

    it 'does not store exceptions' do
      bad_request = MetaInspector::Document.new('http://pagerankalert.com/image.png', html_content_only: true, warn_level: :warn)
      bad_request.title
      expect( bad_request.exceptions ).to be_empty
    end
  end
end
# Verifies which headers get merged into the Faraday connection's header
# hash: the default User-Agent when none are given, and the caller-supplied
# headers when the :headers option is passed.
describe 'headers' do
  it "should include default headers" do
    url = "http://pagerankalert.com/"
    expected_headers = {'User-Agent' => "MetaInspector/#{MetaInspector::VERSION} (+https://github.com/jaimeiniesta/metainspector)"}

    # Stand-in for the connection's header hash, so the merge! call can be observed.
    connection_headers = {}
    connection_headers.should_receive(:merge!).with(expected_headers)
    Faraday::Connection.any_instance.stub(:headers){connection_headers}

    MetaInspector::Document.new(url)
  end

  it "should include passed headers on the request" do
    url = "http://pagerankalert.com/"
    passed_headers = {'User-Agent' => 'Mozilla', 'Referer' => 'https://github.com/'}

    # Kept separate from passed_headers: reusing one variable for both would
    # discard the custom headers and reduce the expectation to `with({})`.
    connection_headers = {}
    connection_headers.should_receive(:merge!).with(passed_headers)
    Faraday::Connection.any_instance.stub(:headers){connection_headers}

    MetaInspector::Document.new(url, headers: passed_headers)
  end
end
# Checks the url reported for an address with a percent-encoded path, with
# and without the normalize_url option.
describe 'url normalization' do
  it 'should normalize by default' do
    inspector = MetaInspector.new('http://example.com/%EF%BD%9E')
    inspector.url.should eq('http://example.com/~')
  end

  it 'should not normalize if the normalize_url option is false' do
    # With normalization switched off the url must come back exactly as given.
    inspector = MetaInspector.new('http://example.com/%EF%BD%9E', normalize_url: false)
    inspector.url.should eq('http://example.com/%EF%BD%9E')
  end
end
end