s" do
expect(@doc.score_node(@elem1)[:content_score]).to be > @doc.score_node(@elem2)[:content_score]
end
it "should like classes like text more than classes like comment" do
  # Looks up the content score the document assigns to a single node.
  content_score = lambda { |node| @doc.score_node(node)[:content_score] }

  # After renaming @elem2 to "div" the two scores are expected to be equal.
  @elem2.name = "div"
  expect(content_score.call(@elem1)).to eq(content_score.call(@elem2))

  # With class "text" on one element and "comment" on the other,
  # the "text" element must score strictly higher.
  @elem1['class'] = "text"
  @elem2['class'] = "comment"
  expect(content_score.call(@elem1)).to be > content_score.call(@elem2)
end
end
describe "remove_unlikely_candidates!" do
  # Every example inspects @simple_html_fixture after remove_unlikely_candidates! has run.
  before(:each) do
    @doc = Readability::Document.new(@simple_html_fixture)
    @doc.remove_unlikely_candidates!
  end

  # Inner HTML of the pruned document, evaluated lazily inside each example.
  let(:pruned_markup) { @doc.html.inner_html }

  it "should remove things that have class comment" do
    expect(pruned_markup).not_to match(/a comment/)
  end

  it "should not remove body tags" do
    expect(pruned_markup).to match(/<\/body>/)
  end

  it "should not remove things with class comment and id body" do
    expect(pruned_markup).to match(/real content/)
  end
end
# Exercises Document#score_paragraphs and, in the later examples, which
# siblings of the best candidate end up in Document#content.
#
# NOTE(review): the heredoc fixtures in this group contain plain text only,
# yet the examples look up element ids ("body", "post1") and strings such as
# "Paragraph 1" / "Too short" that do not all appear in the fixtures. The
# markup appears to be missing from this copy -- confirm against the
# original fixtures before relying on these examples.
describe "score_paragraphs" do
# Builds the shared document and scores it; the argument 0 is presumably the
# minimum text length -- TODO confirm against Document#score_paragraphs.
before(:each) do
@doc = Readability::Document.new(<<-HTML)
title!
HTML
@candidates = @doc.score_paragraphs(0)
end
# The shared fixture is expected to yield exactly three scored candidates.
it "should score elements in the document" do
expect(@candidates.values.length).to eq(3)
end
# Sorts candidates by descending content score and checks the id of the winner.
it "should prefer the body in this particular example" do
expect(@candidates.values.sort { |a, b|
b[:content_score] <=> a[:content_score]
}.first[:elem][:id]).to eq("body")
end
context "when two consequent br tags are used instead of p" do
# Replaces the shared document with a local fixture and expects the element
# with id 'post1' to have the highest content score.
it "should assign the higher score to the first paragraph in this particular example" do
@doc = Readability::Document.new(<<-HTML)
title!
This is the main content!
Zebra found killed butcher with the chainsaw.
If only I could think of an example, oh, wait.
This is not the content and although it's longer if you meaure it in characters,
it's supposed to have lower score than the previous paragraph. And it's only because
of the previous paragraph is not one paragraph, it's three subparagraphs
HTML
@candidates = @doc.score_paragraphs(0)
expect(@candidates.values.sort_by { |a| -a[:content_score] }.first[:elem][:id]).to eq('post1')
end
end
# The remaining examples build their own document with min_text_length: 1 and
# elements_to_score: ["h1", "p"], then assert on the extracted content.
# Each fixture interpolates the sentence 'This link lowers the body score.'
# repeated five times.
it "does not include short paragraphs as related siblings in the output" do
@doc = Readability::Document.new(<<-HTML, min_text_length: 1, elements_to_score: ["h1", "p"])
title!
#{'This link lowers the body score.' * 5}
HTML
expect(@doc.content).to include("Paragraph 1")
expect(@doc.content).to include("Paragraph 2")
expect(@doc.content).not_to include("Too short")
end
# A sibling paragraph longer than 80 characters must appear in the content.
it "includes long paragraphs as related siblings in the output" do
@doc = Readability::Document.new(<<-HTML, min_text_length: 1, elements_to_score: ["h1", "p"])
title!
This paragraph is longer than 80 characters so should be included as a sibling in the output.
#{'This link lowers the body score.' * 5}
HTML
expect(@doc.content).to include("Paragraph 1")
expect(@doc.content).to include("Paragraph 2")
expect(@doc.content).to include("This paragraph is longer")
end
# Long text must still be excluded when the sibling is not a paragraph tag
# (the fixture text refers to a "section" sibling).
it "does not include non-paragraph tags in the output, even when longer than 80 characters" do
@doc = Readability::Document.new(<<-HTML, min_text_length: 1, elements_to_score: ["h1", "p"])
title!
Although this paragraph is longer than 80 characters, the sibling is the section so it should not be included.
#{'This link lowers the body score.' * 5}
HTML
expect(@doc.content).to include("Paragraph 1")
expect(@doc.content).to include("Paragraph 2")
expect(@doc.content).not_to include("Although this paragraph")
end
# A non-paragraph sibling is expected in the content when its own score is high
# enough; the fixture uses comma-heavy text for that purpose.
it "does include non-paragraph tags in the output if their content score is high enough" do
@doc = Readability::Document.new(<<-HTML, min_text_length: 1, elements_to_score: ["h1", "p"])
title!
Paragraph 1
#{'Paragraph 2 ' * 10}
This should be included in the output because the content is score is high enough.
The, inclusion, of, lots, of, commas, increases, the, score, of, an, element.
#{'This link lowers the body score.' * 5}
HTML
expect(@doc.content).to include("Paragraph 1")
expect(@doc.content).to include("Paragraph 2")
expect(@doc.content).to include("This should be included")
end
# Same setup plus the likely_siblings: ["section"] option, after which the
# long sibling text is expected in the content.
it "can optionally include other related siblings in the output if they meet the 80 character threshold" do
@doc = Readability::Document.new(<<-HTML, min_text_length: 1, elements_to_score: ["h1", "p"], likely_siblings: ["section"])
title!
Paragraph 1
#{'Paragraph 2 ' * 10}
This paragraph is longer than 80 characters and inside a section that is a sibling of the best_candidate.
The likely_siblings now include the section tag so it should be included in the output.
#{'This link lowers the body score.' * 5}
HTML
expect(@doc.content).to include("Paragraph 1")
expect(@doc.content).to include("Paragraph 2")
expect(@doc.content).to include("should be included")
end
end
describe "the cant_read.html fixture" do
  it "should work on the cant_read.html fixture with some allowed tags" do
    # Tags and attributes that are passed through as the :tags / :attributes options.
    permitted_tags = %w[div span table tr td p i strong u h1 h2 h3 h4 pre code br a]
    permitted_attributes = %w[href]

    fixture_path = File.dirname(__FILE__) + "/fixtures/cant_read.html"
    document = Readability::Document.new(File.read(fixture_path),
                                         :tags => permitted_tags,
                                         :attributes => permitted_attributes)

    expect(document.content).to match(/Can you talk a little about how you developed the looks for the/)
  end
end
describe "general functionality" do
  # Shared document built with min_text_length 0 and retry_length 1.
  before do
    @doc = Readability::Document.new("title!", min_text_length: 0, retry_length: 1)
  end

  it "should return the main page content" do
    expect(@doc.content).to match("Some content")
  end

  it "should return the page title if present" do
    expect(@doc.title).to match("title!")

    # A document built from an empty string must report no title at all.
    untitled = Readability::Document.new("", min_text_length: 0, retry_length: 1)
    expect(untitled.title).to be_nil
  end
end
describe "ignoring sidebars" do
  # Document under test, built with min_text_length 0 and retry_length 1.
  before do
    @doc = Readability::Document.new("title!", min_text_length: 0, retry_length: 1)
  end

  it "should not return the sidebar" do
    extracted = @doc.content
    expect(extracted).not_to match("sidebar")
  end
end
# Checks how text from adjacent block elements is rendered in the content.
describe "inserting space for block elements" do
  # The heredoc body is kept at column 0 so the fixture string stays unchanged.
  before do
    @doc = Readability::Document.new(<<-HTML, :min_text_length => 0, :retry_length => 1)
title!
    HTML
  end

  # The previous description ("should not return the sidebar") was copied from
  # the sidebar group and did not describe this assertion.
  # NOTE(review): the group title suggests spaces ARE inserted while the
  # assertion rejects "a b c d f" -- confirm the intended expectation.
  it "should not render the content as 'a b c d f'" do
    expect(@doc.content).not_to match("a b c d f")
  end
end
# Runs every sample HTML document through Readability and checks the extracted
# content against the fragment lists that accompany each sample.
describe "outputs good stuff for known documents" do
  before do
    # Absolute path of the samples directory, anchored at this spec file so the
    # lookup does not depend on the working directory or $LOAD_PATH.
    @samples_dir = File.expand_path("fixtures/samples", File.dirname(__FILE__))
    @html_files = Dir.glob(File.join(@samples_dir, "*.html"))
    @samples = @html_files.map { |filename| File.basename(filename, '.html') }
  end

  it "should output expected fragments of text" do
    checks = 0
    @samples.each do |sample|
      html = File.read(File.join(@samples_dir, "#{sample}.html"))
      doc = Readability::Document.new(html).content

      # Load the companion fragments file from the same directory as the HTML.
      # It is expected to set $required_fragments and $excluded_fragments,
      # which are read immediately below.
      load File.join(@samples_dir, "#{sample}-fragments.rb")
      #puts "testing #{sample}..."

      $required_fragments.each do |required_text|
        expect(doc).to include(required_text)
        checks += 1
      end

      $excluded_fragments.each do |text_to_avoid|
        expect(doc).not_to include(text_to_avoid)
        checks += 1
      end
    end
    #puts "Performed #{checks} checks."
  end
end
describe "encoding guessing" do
  # The examples below are only defined when RUBY_VERSION matches 1.9.x.
  if RUBY_VERSION =~ /^1\.9\./
    context "with ruby 1.9.2" do
      it "should correctly guess and enforce HTML encoding" do
        extracted = Readability::Document.new("hi! ").content
        expect(extracted.encoding.to_s).to eq("ISO-8859-1")
        expect(extracted).to be_valid_encoding
      end

      it "should allow encoding guessing to be skipped" do
        # GuessHtmlEncoding.encode must not be called when guessing is disabled.
        expect(GuessHtmlEncoding).not_to receive(:encode)
        document = Readability::Document.new(@simple_html_fixture, :do_not_guess_encoding => true)
        document.content
      end

      it "should allow encoding guessing to be overridden" do
        # GuessHtmlEncoding.encode must not be called when an encoding is given.
        expect(GuessHtmlEncoding).not_to receive(:encode)
        document = Readability::Document.new(@simple_html_fixture, :encoding => "UTF-8")
        document.content
      end
    end
  end
end
describe "#make_html" do
  it "should strip the html comments tag" do
    extracted = Readability::Document.new("hi! ").content
    expect(extracted).to include("hi!")
    expect(extracted).not_to include("bye")
  end

  # An empty input string must produce empty content rather than raising.
  it "should not error with empty content" do
    blank_document = Readability::Document.new('')
    expect(blank_document.content).to eq('')
  end

  it "should not error with a document with no " do
    blank_document = Readability::Document.new('')
    expect(blank_document.content).to eq('')
  end
end
# Verifies that calling content and images on the same document, in any order
# and repeatedly, keeps returning the same results.
describe "No side-effects" do
  # The image list every example expects from the nytimes fixture.
  let(:expected_images) do
    ["http://graphics8.nytimes.com/images/2011/12/02/opinion/02fixes-freelancersunion/02fixes-freelancersunion-blog427.jpg"]
  end

  # Only the nytimes fixture is loaded; the bbc and thesun fixtures that used
  # to be read here were never referenced by any example in this group.
  before do
    @nytimes = File.read(File.dirname(__FILE__) + "/fixtures/nytimes.html")
    @doc = Readability::Document.new(@nytimes,
                                     :tags => %w[div p img a],
                                     :attributes => %w[src href],
                                     :remove_empty_nodes => false,
                                     :do_not_guess_encoding => true)
  end

  it "should not have any side-effects when calling content() and then images()" do
    expect(@doc.images).to eq(expected_images)
    @doc.content
    expect(@doc.images).to eq(expected_images)
  end

  it "should not have any side-effects when calling content() multiple times" do
    expect(@doc.content).to eq(@doc.content)
  end

  it "should not have any side-effects when calling content and images multiple times" do
    expect(@doc.images).to eq(expected_images)
    expect(@doc.content).to eq(@doc.content)
    expect(@doc.images).to eq(expected_images)
  end
end
describe "Code blocks" do
  # Extracts the code.html fixture with an explicit tag/attribute list and
  # re-parses the resulting content with Nokogiri for CSS queries.
  before do
    @code = File.read(File.dirname(__FILE__) + "/fixtures/code.html")
    reader = Readability::Document.new(
      @code,
      :tags => %w[div p img a ul ol li h1 h2 h3 h4 h5 h6 blockquote strong em b code pre],
      :attributes => %w[src href],
      :remove_empty_nodes => false
    )
    @content = reader.content
    @doc = Nokogiri::HTML(@content)
  end

  it "preserve the code blocks" do
    nested_text = @doc.css("code pre").text
    expect(nested_text).to eq("\nroot\n indented\n ")
  end

  it "preserve backwards code blocks" do
    nested_text = @doc.css("pre code").text
    expect(nested_text).to eq("\nsecond\n indented\n ")
  end
end
# With an empty :tags list, content is expected to be the bare text.
describe "remove all tags" do
  it "should work for an incomplete piece of HTML" do
    # The string literal here was unterminated and had lost its :tags option,
    # which made the file unparseable. The closing tag is left unterminated
    # ("</div") on purpose so the input is an incomplete piece of HTML.
    # NOTE(review): fixture reconstructed from the damaged line -- confirm.
    doc = Readability::Document.new('<div>test</div', :tags => [])
    expect(doc.content).to eq('test')
  end

  it "should work for a HTML document" do
    # NOTE(review): the fixture 'title!' does not contain the expected text
    # 'test'; its markup appears to be missing -- confirm against the original.
    doc = Readability::Document.new('title!',
                                    :tags => [])
    expect(doc.content).to eq('test')
  end

  it "should work for a plain text" do
    doc = Readability::Document.new('test', :tags => [])
    expect(doc.content).to eq('test')
  end
end
describe "boing boing" do
  # Raw HTML of the boing_boing.html fixture, read lazily per example.
  let(:boing_boing) do
    File.read(File.dirname(__FILE__) + "/fixtures/boing_boing.html")
  end

  it "contains incorrect data by default" do
    # NOTE: in an ideal world this spec starts failing
    # and readability correctly detects content for the
    # boing boing sample.
    extracted = Readability::Document.new(boing_boing).content
    expect(extracted !~ /Bees and Bombs/).to eq(true)
    expect(extracted).to match(/ADVERTISE/)
  end

  it "should apply whitelist" do
    # Passing the ".post-content" selector as whitelist must surface the post text.
    extracted = Readability::Document.new(boing_boing, whitelist: ".post-content").content
    expect(extracted).to match(/Bees and Bombs/)
  end

  it "should apply blacklist" do
    # Passing the "#sidebar_adblock" selector as blacklist must drop the ad text.
    extracted = Readability::Document.new(boing_boing, blacklist: "#sidebar_adblock").content
    expect(extracted !~ /ADVERTISE/).to eq(true)
  end
end
describe "clean_conditionally_reason?" do
  # Input made of "test " followed by 102 space characters.
  let(:list_fixture) { "test #{' ' * 102}" }

  it "does not raise error" do
    @doc = Readability::Document.new(list_fixture)
    expect { @doc.content }.not_to raise_error
  end
end
end
|