lib/dq-readability.rb in dq-readability-1.0.2 vs lib/dq-readability.rb in dq-readability-1.0.3
- old
+ new
@@ -94,11 +94,15 @@
# Rewrite relative img src values into absolute URLs
@html.css("img").each do |elem|
begin
if elem['src'][0] == '/'
- elem['src'] = URI.join(base,elem['src']).to_s if URI.parse(elem['src']).host == nil
+ if elem['src'][1] == '/'
+ elem['src'] = 'http:'+elem['src']
+ else
+ elem['src'] = URI.join(base,elem['src']).to_s if URI.parse(elem['src']).host == nil
+ end
else
if @url.split('').last == '/'
elem['src'] = URI.join(@url,elem['src']).to_s if URI.parse(elem['src']).host == nil
else
x = @url.split('/')
@@ -111,12 +115,21 @@
rescue
elem.remove
end
end
- #changing the 'a' href
+ # changing certain tags to <p> tags
+
+ x = @html.css("ol")
+ x.each do |t|
+ t.name = "p"
+ end
+ len = @html.css('ol').length
+ debug("length of ol tag #{len}")
+ #changing the 'a' href
+
@html.css("a").each do |elem|
begin
if elem['href'][0] == '/'
elem['href'] = URI.join(base,elem['href']).to_s if URI.parse(elem['href']).host == nil
else
@@ -523,11 +536,11 @@
s = Nokogiri::XML::Node::SaveOptions
save_opts = s::NO_DECLARATION | s::NO_EMPTY_TAGS | s::AS_XHTML
html = node.serialize(:save_with => save_opts)
# Collapse each run of line-break characters (\r, \n, \f) into a single newline
- return html.gsub(/[\r\n\f]+/, "\n" )
+ return "<head><meta http-equiv='Content-Type' content='text/html; charset=utf-8'></head>" + "\n" + html.gsub(/[\r\n\f]+/, "\n" )
end
def clean_conditionally(node, candidates, selector)
return unless @clean_conditionally
node.css(selector).each do |el|
@@ -548,13 +561,13 @@
content_length = el.text.strip.length # Count the text length excluding any surrounding whitespace
link_density = get_link_density(el)
to_remove = false
reason = ""
- if (counts["img"] > counts["p"]) && (counts["img"] > 1)
+ if (counts["img"] > counts["p"]+2)
reason = "too many images"
to_remove = true
- elsif counts["li"] > counts["p"] && name != "ul" && name != "ol"
+ elsif counts["li"] > counts["p"] && name != "ul" && name != "ol"
reason = "more <li>s than <p>s"
to_remove = true
elsif counts["input"] > (counts["p"] / 3).to_i
reason = "less than 3x <p>s than <input>s"
to_remove = true