# encoding: UTF-8

require 'spec_helper'

# Specs for Readability::Document: image discovery, candidate scoring,
# removal of unlikely candidates, content/title extraction and encoding
# handling.
#
# NOTE(review): the inline HTML fixtures in this file were reconstructed from
# the ids, classes and text that the examples query (#body,
# #contains_blockquote, #elem1, #elem2, post1, "sidebar", ...). Confirm them
# against the canonical fixtures before merging.
describe Readability do
  # Absolute path of a file below spec/fixtures, so that lookups do not depend
  # on the current working directory or on $LOAD_PATH.
  def fixture_path(relative_path)
    File.join(File.dirname(__FILE__), "fixtures", relative_path)
  end

  # Contents of a file below spec/fixtures.
  def read_fixture(relative_path)
    File.read(fixture_path(relative_path))
  end

  before do
    @simple_html_fixture = <<-HTML
      <html>
        <head>
          <title>title!</title>
        </head>
        <body class='comment'>
          <div>
            <p class='comment'>a comment</p>
            <div class='comment' id='body'>real content</div>
            <div id="contains_blockquote"><blockquote>something in a table</blockquote></div>
          </div>
        </body>
      </html>
    HTML
  end

  describe "images" do
    before do
      @bbc     = read_fixture("bbc.html")
      @nytimes = read_fixture("nytimes.html")
      @thesun  = read_fixture("thesun.html")

      # Serve the one remote image from disk so no network access is needed.
      FakeWeb::Registry.instance.clean_registry
      FakeWeb.register_uri(:get, "http://img.thesun.co.uk/multimedia/archive/01416/dim_1416768a.jpg",
                           :body => read_fixture("images/dim_1416768a.jpg"))
    end

    it "should show one image, but outside of the best candidate" do
      @doc = Readability::Document.new(@thesun)
      @doc.images.should == ["http://img.thesun.co.uk/multimedia/archive/01416/dim_1416768a.jpg"]
      @doc.best_candidate_has_image.should == false
    end

    it "should show one image inside of the best candidate" do
      @doc = Readability::Document.new(@nytimes)
      @doc.images.should == ["http://graphics8.nytimes.com/images/2011/12/02/opinion/02fixes-freelancersunion/02fixes-freelancersunion-blog427.jpg"]
      @doc.best_candidate_has_image.should == true
    end

    it "should not try to download local images" do
      @doc = Readability::Document.new(<<-HTML)
        <html>
          <head>
            <title>title!</title>
          </head>
          <body class='comment'>
            <div>
              <img src="/something/local.gif" />
            </div>
          </body>
        </html>
      HTML
      do_not_allow(@doc).load_image(anything)
      @doc.images.should == []
    end

    describe "no images" do
      it "shouldn't show images" do
        @doc = Readability::Document.new(@bbc, :min_image_height => 600)
        @doc.images.should == []
        @doc.best_candidate_has_image.should == false
      end
    end

    describe "poll of images" do
      it "should show some images inside of the best candidate" do
        @doc = Readability::Document.new(@bbc)
        # `=~` on an array is the order-insensitive match.
        @doc.images.should =~ ["http://news.bbcimg.co.uk/media/images/57027000/jpg/_57027794_perseus_getty.jpg",
                               "http://news.bbcimg.co.uk/media/images/57027000/jpg/_57027786_john_capes229_rnsm.jpg",
                               "http://news.bbcimg.co.uk/media/images/57060000/gif/_57060487_sub_escapes304x416.gif",
                               "http://news.bbcimg.co.uk/media/images/57055000/jpg/_57055063_perseus_thoctarides.jpg"]
        @doc.best_candidate_has_image.should == true
      end

      it "should show some images inside of the best candidate, include gif format" do
        @doc = Readability::Document.new(@bbc, :ignore_image_format => [])
        @doc.images.should == ["http://news.bbcimg.co.uk/media/images/57027000/jpg/_57027794_perseus_getty.jpg",
                               "http://news.bbcimg.co.uk/media/images/57027000/jpg/_57027786_john_capes229_rnsm.jpg",
                               "http://news.bbcimg.co.uk/media/images/57060000/gif/_57060487_sub_escapes304x416.gif",
                               "http://news.bbcimg.co.uk/media/images/57055000/jpg/_57055063_perseus_thoctarides.jpg"]
        @doc.best_candidate_has_image.should == true
      end

      describe "width, height and format" do
        it "should show some images inside of the best candidate, but with width most equal to 400px" do
          @doc = Readability::Document.new(@bbc, :min_image_width => 400, :ignore_image_format => [])
          @doc.images.should == ["http://news.bbcimg.co.uk/media/images/57027000/jpg/_57027794_perseus_getty.jpg"]
          @doc.best_candidate_has_image.should == true
        end

        it "should show some images inside of the best candidate, but with width most equal to 304px" do
          @doc = Readability::Document.new(@bbc, :min_image_width => 304, :ignore_image_format => [])
          @doc.images.should == ["http://news.bbcimg.co.uk/media/images/57027000/jpg/_57027794_perseus_getty.jpg",
                                 "http://news.bbcimg.co.uk/media/images/57060000/gif/_57060487_sub_escapes304x416.gif",
                                 "http://news.bbcimg.co.uk/media/images/57055000/jpg/_57055063_perseus_thoctarides.jpg"]
          @doc.best_candidate_has_image.should == true
        end

        it "should show some images inside of the best candidate, but with width most equal to 304px and ignoring JPG format" do
          @doc = Readability::Document.new(@bbc, :min_image_width => 304, :ignore_image_format => ["jpg"])
          @doc.images.should == ["http://news.bbcimg.co.uk/media/images/57060000/gif/_57060487_sub_escapes304x416.gif"]
          @doc.best_candidate_has_image.should == true
        end

        it "should show some images inside of the best candidate, but with height most equal to 400px, no ignoring no format" do
          @doc = Readability::Document.new(@bbc, :min_image_height => 400, :ignore_image_format => [])
          @doc.images.should == ["http://news.bbcimg.co.uk/media/images/57060000/gif/_57060487_sub_escapes304x416.gif"]
          @doc.best_candidate_has_image.should == true
        end
      end
    end
  end

  describe "transformMisusedDivsIntoParagraphs" do
    before do
      @doc = Readability::Document.new(@simple_html_fixture)
      @doc.transform_misused_divs_into_paragraphs!
    end

    it "should transform divs containing no block elements into <p>s" do
      @doc.html.css("#body").first.name.should == "p"
    end

    it "should not transform divs that contain block elements" do
      @doc.html.css("#contains_blockquote").first.name.should == "div"
    end
  end

  describe "score_node" do
    before do
      # NOTE(review): the tag of #elem2 is presumed to be <th>; confirm.
      @doc = Readability::Document.new(<<-HTML)
        <html>
          <body>
            <div id='elem1'>
              <p>some content</p>
            </div>
            <th id='elem2'>
              <p>some other content</p>
            </th>
          </body>
        </html>
      HTML
      @elem1 = @doc.html.css("#elem1").first
      @elem2 = @doc.html.css("#elem2").first
    end

    it "should like <div>s more than <th>s" do
      @doc.score_node(@elem1)[:content_score].should > @doc.score_node(@elem2)[:content_score]
    end

    it "should like classes like text more than classes like comment" do
      # Give both nodes the same tag first so only the class names differ.
      @elem2.name = "div"
      @doc.score_node(@elem1)[:content_score].should == @doc.score_node(@elem2)[:content_score]
      @elem1['class'] = "text"
      @elem2['class'] = "comment"
      @doc.score_node(@elem1)[:content_score].should > @doc.score_node(@elem2)[:content_score]
    end
  end

  describe "remove_unlikely_candidates!" do
    before do
      @doc = Readability::Document.new(@simple_html_fixture)
      @doc.remove_unlikely_candidates!
    end

    it "should remove things that have class comment" do
      @doc.html.inner_html.should_not =~ /a comment/
    end

    it "should not remove body tags" do
      @doc.html.inner_html.should =~ /<\/body>/
    end

    it "should not remove things with class comment and id body" do
      @doc.html.inner_html.should =~ /real content/
    end
  end

  describe "score_paragraphs" do
    before(:each) do
      # NOTE(review): the unterminated id attribute on div2 (and the trailing
      # comment that re-balances the quote) is kept on purpose in this
      # reconstruction; the candidate count of 3 asserted below presumably
      # depends on how the parser recovers from it. Confirm before tidying.
      @doc = Readability::Document.new(<<-HTML)
        <html>
          <head>
            <title>title!</title>
          </head>
          <body id="body">
            <div id="div1">
              <div id="div2>
                <p id="some_comment">a comment</p>
              </div>
              <p id="some_text">some text</p>
            </div>
            <div id="div3">
              <p id="some_text2">some more text</p>
            </div>
          </body>
        </html><!-- " -->
      HTML
      @candidates = @doc.score_paragraphs(0)
    end

    it "should score elements in the document" do
      @candidates.values.length.should == 3
    end

    it "should prefer the body in this particular example" do
      @candidates.values.sort { |a, b|
        b[:content_score] <=> a[:content_score]
      }.first[:elem][:id].should == "body"
    end

    context "when two consequent br tags are used instead of p" do
      it "should assign the higher score to the first paragraph in this particular example" do
        @doc = Readability::Document.new(<<-HTML)
          <html>
            <head>
              <title>title!</title>
            </head>
            <body id="body">
              <div id="post1">
                This is the main content!<br/><br/>
                Zebra found killed butcher with the chainsaw.<br/><br/>
                If only I could think of an example, oh, wait.
              </div>
              <div id="post2">
                This is not the content and although it's longer if you meaure it in characters, it's supposed to have lower score than the previous paragraph. And it's only because of the previous paragraph is not one paragraph, it's three subparagraphs
              </div>
            </body>
          </html>
        HTML
        @candidates = @doc.score_paragraphs(0)
        @candidates.values.sort_by { |a| -a[:content_score] }.first[:elem][:id].should == 'post1'
      end
    end
  end

  describe "the cant_read.html fixture" do
    it "should work on the cant_read.html fixture with some allowed tags" do
      allowed_tags = %w[div span table tr td p i strong u h1 h2 h3 h4 pre code br a]
      allowed_attributes = %w[href]
      html = read_fixture("cant_read.html")
      Readability::Document.new(html, :tags => allowed_tags, :attributes => allowed_attributes).content.should match(/Can you talk a little about how you developed the looks for the/)
    end
  end

  describe "general functionality" do
    before do
      @doc = Readability::Document.new("<html><head><title>title!</title></head><body><div><p>Some content</p></div></body>",
                                       :min_text_length => 0, :retry_length => 1)
    end

    it "should return the main page content" do
      @doc.content.should match("Some content")
    end

    it "should return the page title if present" do
      @doc.title.should match("title!")

      doc = Readability::Document.new("<html><head></head><body><div><p>Some content</p></div></body>",
                                      :min_text_length => 0, :retry_length => 1)
      doc.title.should be_nil
    end
  end

  describe "ignoring sidebars" do
    before do
      # NOTE(review): the sidebar block is reconstructed; it carries the word
      # "sidebar" so that the negative assertion below is meaningful. Confirm.
      @doc = Readability::Document.new("<html><head><title>title!</title></head><body><div><p>Some content</p></div><div class='sidebar'><p>sidebar<p></div></body>",
                                       :min_text_length => 0, :retry_length => 1)
    end

    it "should not return the sidebar" do
      @doc.content.should_not match("sidebar")
    end
  end

  describe "inserting space for block elements" do
    before do
      # NOTE(review): "f/p>" looks like a typo for "f</p>"; left untouched
      # because it is part of the fixture the example runs against.
      @doc = Readability::Document.new(<<-HTML, :min_text_length => 0, :retry_length => 1)
        <html><head><title>title!</title></head>
          <body>
            <div>
              <p>a<br>b<hr>c<address>d</address>f/p>
            </div>
          </body>
        </html>
      HTML
    end

    # This example used to be labelled "should not return the sidebar", a
    # leftover from the group above; the fixture has no sidebar.
    # NOTE(review): given the group name, confirm that the negative polarity
    # of this assertion is what was intended.
    it "should not collapse the block-separated text into a single-spaced run" do
      @doc.content.should_not match("a b c d f")
    end
  end

  describe "outputs good stuff for known documents" do
    before do
      # Sorted so the samples are visited in the same order on every platform.
      @html_files = Dir.glob(fixture_path("samples/*.html")).sort
      @samples = @html_files.map { |filename| File.basename(filename, '.html') }
    end

    it "should output expected fragments of text" do
      checks = 0

      @samples.each do |sample|
        content = Readability::Document.new(read_fixture("samples/#{sample}.html")).content

        # Each <sample>-fragments.rb file assigns these two globals. Reset
        # them first so a file that omits one cannot silently inherit the
        # previous sample's list.
        $required_fragments = []
        $excluded_fragments = []
        load fixture_path("samples/#{sample}-fragments.rb")

        $required_fragments.each do |required_text|
          content.should include(required_text)
          checks += 1
        end

        $excluded_fragments.each do |text_to_avoid|
          content.should_not include(text_to_avoid)
          checks += 1
        end
      end

      # Guard against a vacuous pass when no sample (or no fragment) was found.
      checks.should > 0
    end
  end

  describe "encoding guessing" do
    # NOTE(review): these examples are only defined on Ruby 1.9.x and are
    # silently skipped on every other version. The guard is left as is because
    # the library's own version gating is not visible from this file.
    if RUBY_VERSION =~ /^1\.9\./
      context "with ruby 1.9.2" do
        it "should correctly guess and enforce HTML encoding" do
          doc = Readability::Document.new("<html><head><meta http-equiv='content-type' content='text/html; charset=LATIN1'></head><body><div>hi!</div></body></html>")
          content = doc.content
          content.encoding.to_s.should == "ISO-8859-1"
          content.should be_valid_encoding
        end

        it "should allow encoding guessing to be skipped" do
          do_not_allow(GuessHtmlEncoding).encode
          doc = Readability::Document.new(@simple_html_fixture, :do_not_guess_encoding => true)
          doc.content
        end

        it "should allow encoding guessing to be overridden" do
          do_not_allow(GuessHtmlEncoding).encode
          doc = Readability::Document.new(@simple_html_fixture, :encoding => "UTF-8")
          doc.content
        end
      end
    end
  end
end