lib/dq-readability.rb in dq-readability-1.0.2 vs lib/dq-readability.rb in dq-readability-1.0.3

- old
+ new

@@ -94,11 +94,15 @@ # changing img src @html.css("img").each do |elem| begin if elem['src'][0] == '/' - elem['src'] = URI.join(base,elem['src']).to_s if URI.parse(elem['src']).host == nil + if elem['src'][1] == '/' + elem['src'] = 'http:'+elem['src'] + else + elem['src'] = URI.join(base,elem['src']).to_s if URI.parse(elem['src']).host == nil + end else if @url.split('').last == '/' elem['src'] = URI.join(@url,elem['src']).to_s if URI.parse(elem['src']).host == nil else x = @url.split('/') @@ -111,12 +115,21 @@ rescue elem.remove end end - #changing the 'a' href + # changing certain tags to <p> tags + + x = @html.css("ol") + x.each do |t| + t.name = "p" + end + len = @html.css('ol').length + debug("length of ol tag #{len}") + #changing the 'a' href + @html.css("a").each do |elem| begin if elem['href'][0] == '/' elem['href'] = URI.join(base,elem['href']).to_s if URI.parse(elem['href']).host == nil else @@ -523,11 +536,11 @@ s = Nokogiri::XML::Node::SaveOptions save_opts = s::NO_DECLARATION | s::NO_EMPTY_TAGS | s::AS_XHTML html = node.serialize(:save_with => save_opts) # Get rid of duplicate whitespace - return html.gsub(/[\r\n\f]+/, "\n" ) + return "<head><meta http-equiv='Content-Type' content='text/html; charset=utf-8'></head>" + "\n" + html.gsub(/[\r\n\f]+/, "\n" ) end def clean_conditionally(node, candidates, selector) return unless @clean_conditionally node.css(selector).each do |el| @@ -548,13 +561,13 @@ content_length = el.text.strip.length # Count the text length excluding any surrounding whitespace link_density = get_link_density(el) to_remove = false reason = "" - if (counts["img"] > counts["p"]) && (counts["img"] > 1) + if (counts["img"] > counts["p"]+2) reason = "too many images" to_remove = true - elsif counts["li"] > counts["p"] && name != "ul" && name != "ol" + elsif counts["li"] > counts["p"] && name != "ul" && name != "ol" reason = "more <li>s than <p>s" to_remove = true elsif counts["input"] > (counts["p"] / 3).to_i reason = "less than 3x <p>s than <input>s" to_remove = true