require File.expand_path(File.join(File.dirname(__FILE__), "spec_helper")) describe Readability do before do @simple_html_fixture = Nokogiri::HTML <<-HTML title!

a comment

real content
something in a table
HTML end describe "transformMisusedDivsIntoParagraphs" do before do @doc = Readability::Document.new(@simple_html_fixture, nil, nil) @doc.transform_misused_divs_into_paragraphs! end it "should transform divs containing no block elements into

s" do @doc.document.css("#body").first.name.should == "p" end it "should not transform divs that contain block elements" do @doc.document.css("#contains_blockquote").first.name.should == "div" end end describe "score_node" do before do @html = Nokogiri::HTML <<-HTML

some content

some other content

HTML @doc = Readability::Document.new(@html, nil, nil) @elem1 = @doc.document.css("#elem1").first @elem2 = @doc.document.css("#elem2").first end it "should like
s more than s" do @doc.score_node(@elem1)[:content_score].should > @doc.score_node(@elem2)[:content_score] end it "should like classes like text more than classes like comment" do @elem2.name = "div" @doc.score_node(@elem1)[:content_score].should == @doc.score_node(@elem2)[:content_score] @elem1['class'] = "text" @elem2['class'] = "comment" @doc.score_node(@elem1)[:content_score].should > @doc.score_node(@elem2)[:content_score] end end describe "remove_unlikely_candidates!" do before do @doc = Readability::Document.new(@simple_html_fixture, nil, nil) @doc.remove_unlikely_candidates! end it "should remove things that have class comment" do @doc.document.inner_html.should_not =~ /a comment/ end it "should not remove body tags" do @doc.document.inner_html.should =~ /<\/body>/ end it "should not remove things with class comment and id body" do @doc.document.inner_html.should =~ /real content/ end end describe "score_paragraphs" do before(:each) do @html = Nokogiri::HTML <<-HTML title!

a comment

some text

some more text

HTML @doc = Readability::Document.new(@html, nil, nil) @candidates = @doc.score_paragraphs(0) end it "should score elements in the document" do @candidates.values.length.should == 4 end it "should prefer the body in this particular example" do @candidates.values.sort { |a, b| b[:content_score] <=> a[:content_score] }.first[:elem][:id].should == "body" end end describe "the cant_read.html fixture" do it "should work on the cant_read.html fixture with some allowed tags" do allowed_tags = %w[div span table tr td p i strong u h1 h2 h3 h4 pre code br a] allowed_attributes = %w[href] html = File.read(File.dirname(__FILE__) + "/fixtures/cant_read.html") Readability::Document.new(Nokogiri::HTML(html), nil, nil, :tags => allowed_tags, :attributes => allowed_attributes).content.should match(/Can you talk a little about how you developed the looks for the/) end end describe "general functionality" do before do @doc = Readability::Document.new(Nokogiri::HTML("title!

Some content

"), nil, nil, :min_text_length => 0, :retry_length => 1) end it "should return the main page content" do @doc.content.should match("Some content") end end describe 'dealing with iso-8859-1' do before(:each) do file = File.open('spec/fixtures/folha.html', 'r') @content = file.read end it "should return the main page content" do Readability::Document.new(Nokogiri::HTML(@content, nil, 'ISO-8859'),nil,nil).content.unpack("C*").pack("U*") .should == "
\n

\n COLABORA\303\207\303\203O PARA A FOLHA\n

\n

\n A Anvisa (Ag\303\252ncia Nacional de Vigil\303\242ncia Sanit\303\241ria) interditou o lote do ch\303\241 de erva doce da marca Dr. Oetker. A medida foi publicada no \"Di\303\241rio Oficial da Uni\303\243o\" na quarta-feira (26).\n

\n

\n Segundo a Vigil\303\242ncia Sanit\303\241ria, o lote L160T02 do produto --data de validade 01/12/2011-- apresentou resultado insatisfat\303\263rio no ensaio de pesquisa para mat\303\251rias macrosc\303\263picas e microsc\303\263picas que detectou a presen\303\247a de p\303\252lo de roedor e fragmentos de inseto.\n

\n

\n A interdi\303\247\303\243o cautelar vale pelo per\303\255odo de 90 dias ap\303\263s a data de publica\303\247\303\243o. Durante esse tempo, o produto interditado n\303\243o deve ser consumido e nem comercializado. As pessoas que j\303\241 adquiriram o produto do lote suspenso devem interromper o consumo.\n

\n
" end end describe 'dealing with utf-8' do before do @doc = Readability::Document.new(Nokogiri::HTML("title!

Açougue, espátula, Vovô, çáóéãà

", nil, 'UTF-8'), nil, nil, :min_text_length => 0, :retry_length => 1) end it 'should return the main page content' do @doc.content.should match("Açougue, espátula, Vovô, çáóéãà") end end describe "ignoring sidebars" do before do @doc = Readability::Document.new(Nokogiri::HTML("title!

Some content

"), nil, nil, :min_text_length => 0, :retry_length => 1) end it "should not return the sidebar" do @doc.content.should_not match("sidebar") end end describe "outputs good stuff for known documents" do before do @html_files = Dir.glob(File.dirname(__FILE__) + "/fixtures/samples/*.html") @samples = @html_files.map {|filename| File.basename(filename, '.html') } end it "should output expected fragments of text" do checks = 0 @samples.each do |sample| html = File.read(File.dirname(__FILE__) + "/fixtures/samples/#{sample}.html") doc = Readability::Document.new(Nokogiri::HTML(html), nil, nil).content load "fixtures/samples/#{sample}-fragments.rb" puts "testing #{sample}..." $required_fragments.each do |required_text| doc.should include(required_text) checks += 1 end $excluded_fragments.each do |text_to_avoid| doc.should_not include(text_to_avoid) checks += 1 end end puts "Performed #{checks} checks." end end describe "handles vimeo.com videos" do before(:each) do FakeWeb.register_uri(:get, 'http://vimeo.com/10365005', :response => File.read("spec/fixtures/vimeo.com.html")) @uri = URI.parse("http://vimeo.com/10365005") @content = Readability::Document.new(Nokogiri::HTML(open('http://vimeo.com/10365005')), @uri.host, @uri.request_uri).content end it "should extract the video from the page" do @content.should include("