require 'spec_helper' describe MetaInspector::Document do describe 'passing the contents of the document as html' do let(:doc) { MetaInspector::Document.new('http://cnn.com/', :document => "Hello From Passed HtmlHello link") } it "should get correct links when the url html is passed as an option" do expect(doc.links.internal).to eq(["http://cnn.com/hello"]) end it "should get the title" do expect(doc.title).to eq("Hello From Passed Html") end end it "should return a String as to_s" do expect(MetaInspector::Document.new('http://pagerankalert.com').to_s.class).to eq(String) end it "should return a Hash with all the values set" do doc = MetaInspector::Document.new('http://pagerankalert.com') expect(doc.to_hash).to eq({ "url" => "http://pagerankalert.com/", "scheme" => "http", "host" => "pagerankalert.com", "root_url" => "http://pagerankalert.com/", "title" => "PageRankAlert.com :: Track your PageRank changes & receive alerts", "best_title" => "PageRankAlert.com :: Track your PageRank changes & receive alerts", "author" => nil, "best_author" => nil, "description" => "Track your PageRank(TM) changes and receive alerts by email", "best_description"=> "Track your PageRank(TM) changes and receive alerts by email", "favicon" => "http://pagerankalert.com/src/favicon.ico", "links" => { 'internal' => ["http://pagerankalert.com/", "http://pagerankalert.com/es?language=es", "http://pagerankalert.com/users/sign_up", "http://pagerankalert.com/users/sign_in"], 'external' => ["http://pagerankalert.posterous.com/", "http://twitter.com/pagerankalert", "http://twitter.com/share"], 'non_http' => ["mailto:pagerankalert@gmail.com"] }, "images" => ["http://pagerankalert.com/images/pagerank_alert.png?1305794559"], "charset" => "utf-8", "feed" => "http://feeds.feedburner.com/PageRankAlert", "content_type" => "text/html", "meta_tags" => { "name" => { "description" => ["Track your PageRank(TM) changes and receive alerts by email"], "keywords" => ["pagerank, seo, optimization, google"], "robots"=>["all,follow"], "csrf-param" => ["authenticity_token"], "csrf-token" => ["iW1/w+R8zrtDkhOlivkLZ793BN04Kr3X/pS+ixObHsE="] }, "http-equiv" => {}, "property" => {}, "charset" => ["utf-8"] }, "response" => { "status" => 200, "headers" => { "server" => "nginx/0.7.67", "date"=>"Mon, 30 May 2011 09:45:42 GMT", "content-type" => "text/html; charset=utf-8", "connection" => "keep-alive", "etag" => "\"d0534cf7ad7d7a7fb737fe4ad99b0fd1\"", "x-ua-compatible" => "IE=Edge,chrome=1", "x-runtime" => "0.031274", "set-cookie" => "_session_id=33575f7694b4492af4c4e282d62a7127; path=/; HttpOnly", "cache-control" => "max-age=0, private, must-revalidate", "content-length" => "6690", "x-varnish" => "2167295052", "age" => "0", "via" => "1.1 varnish" } } }) end describe "allow_non_html_content option" do it "should not allow non-html content type by default" do expect do image_url = MetaInspector::Document.new('http://pagerankalert.com/image.png') image_url.title end.to raise_error(MetaInspector::ParserError) end it "should not allow non-html content type when explicitly disallowed" do expect do image_url = MetaInspector::Document.new('http://pagerankalert.com/image.png', allow_non_html_content: false) image_url.title end.to raise_error(MetaInspector::ParserError) end it "should allow non-html content type when explicitly allowed" do expect do image_url = MetaInspector::Document.new('http://pagerankalert.com/image.png', allow_non_html_content: true) image_url.title end.to_not raise_error end end describe 'headers' do it "should include default headers" do url = "http://pagerankalert.com/" expected_headers = {'User-Agent' => "MetaInspector/#{MetaInspector::VERSION} (+https://github.com/jaimeiniesta/metainspector)", 'Accept-Encoding' => 'identity'} headers = {} expect(headers).to receive(:merge!).with(expected_headers) allow_any_instance_of(Faraday::Connection).to receive(:headers){headers} MetaInspector::Document.new(url) end it "should include passed headers on the request" do url = "http://pagerankalert.com/" headers = {'User-Agent' => 'Mozilla', 'Referer' => 'https://github.com/'} headers = {} expect(headers).to receive(:merge!).with(headers) allow_any_instance_of(Faraday::Connection).to receive(:headers){headers} MetaInspector::Document.new(url, headers: headers) end end describe 'url normalization' do it 'should normalize by default' do expect(MetaInspector.new('http://example.com/%EF%BD%9E').url).to eq('http://example.com/~') end it 'should not normalize if the normalize_url option is false' do expect(MetaInspector.new('http://example.com/%EF%BD%9E', normalize_url: false).url).to eq('http://example.com/%EF%BD%9E') end end describe 'page encoding' do it 'should encode title according to the charset' do expect(MetaInspector.new('http://example-rtl.com/').title).to eq('بالفيديو.. "مصطفى بكري" : انتخابات الائتلاف غير نزيهة وموجهة لفوز أشخاص بعينها') end it 'should encode description according to the charset' do expect(MetaInspector.new('http://example-rtl.com/').description).to eq('أعلن النائب مصطفى بكري انسحابه من ائتلاف دعم مصر بعد اعتراضه على نتيجة الانتخابات الداخلية للائتلاف، وخسارته فيها، وقال إنه سيترشح غدا على منصب الوكيل بالمجلس') end it 'should replace NULL characters' do doc = MetaInspector.new('http://example-rtl.com/').parsed image_src = doc.css('.some_class').first.attribute('src').value expect(image_src).to eq('/path/to/image.jpg') end end end