lib/html2rss/auto_source/scraper/html.rb in html2rss-0.15.0 vs lib/html2rss/auto_source/scraper/html.rb in html2rss-0.16.0
- old
+ new
@@ -10,16 +10,18 @@
# Scrapes articles from HTML pages by
# finding similar structures around anchor tags in the parsed_body.
class Html
include Enumerable
+ TAGS_TO_IGNORE = /(nav|footer|header)/i
+
# Builds a scraper for parsed_body with an empty url and asks it, through
# any? (the class includes Enumerable), whether it yields anything.
# @param parsed_body [Object] forwarded unchanged to the constructor
# @return [Boolean]
def self.articles?(parsed_body)
  scraper = new(parsed_body, url: '')
  scraper.any?
end
# Climbs from node through its ancestors and returns the first node for which
# condition.call(node) is truthy; node itself is tested before any ancestor.
# @param node [Nokogiri::XML::Node, nil] starting node (type presumed from the calls made on it; confirm)
# @param condition [#call] callable that receives the node currently being visited
# @return [Nokogiri::XML::Node, nil] the first matching node, or nil when a stop condition is reached first
def self.parent_until_condition(node, condition)
# Stop conditions, checked before the condition is called: no node, or the
# node's parent is named 'html' (so such a node is never handed to condition).
# The + side additionally stops at a node that reports document? --
# presumably so the document's parent is never asked for its name; confirm.
- return nil if !node || node.parent.name == 'html'
+ return nil if !node || node.document? || node.parent.name == 'html'
return node if condition.call(node)
parent_until_condition(node.parent, condition)
end
@@ -30,11 +32,11 @@
end
# Stores the inputs and prepares the per-selector counter.
# @param parsed_body [Object] kept in @parsed_body; frequent_selectors calls at_css on it
# @param url [Object] kept in @url (articles? passes an empty String)
def initialize(parsed_body, url:)
@parsed_body = parsed_body
@url = url
# Hash.new(0) makes unseen keys read as 0, so counts can be bumped with += 1.
# The - / + pair below only renames the instance variable.
- @css_selectors = Hash.new(0)
+ @selectors = Hash.new(0)
end
attr_reader :parsed_body
##
@@ -56,28 +58,35 @@
end
##
# Find all the anchors in root.
# @param root [Nokogiri::XML::Node] The root node to search for anchors
- # @return [Set<String>] The set of CSS selectors which exist at least min_frequency times
+ # @return [Set<String>] The set of XPath selectors which exist at least min_frequency times
def frequent_selectors(root = @parsed_body.at_css('body'), min_frequency: 2)
# Memoized: once @frequent_selectors holds a set, later calls return it as-is
# and their root / min_frequency arguments have no effect on the result.
@frequent_selectors ||= begin
# Count every element node named 'a' met while traversing root, keyed by
# the result of simplify_xpath applied to the node's path.
root.traverse do |node|
next if !node.element? || node.name != 'a'
- @css_selectors[self.class.simplify_xpath(node.path)] += 1
+ @selectors[self.class.simplify_xpath(node.path)] += 1
end
# Keep only the keys that were counted at least min_frequency times.
- @css_selectors.keys
- .select { |selector| (@css_selectors[selector]).to_i >= min_frequency }
- .to_set
+ @selectors.keys
+ .select { |selector| (@selectors[selector]).to_i >= min_frequency }
+ .to_set
end
end
- private
-
# Decides whether node passes as an article container.
# Returns false when node.path matches TAGS_TO_IGNORE, or when node itself or
# one of the ancestors visited by parent_until_condition has a class matching it.
# Otherwise returns true for a node named body or html, or when
# node.parent.css('a') finds more than one match; false in every other case.
# TAGS_TO_IGNORE is unanchored and case-insensitive, so it matches anywhere
# inside the path string or a class name.
# @param node [Nokogiri::XML::Node] node to test (type presumed from the calls made on it; confirm)
# @return [Boolean]
def article_condition(node)
+ # Ignore tags that are below a tag which is in TAGS_TO_IGNORE.
+ return false if node.path.match?(TAGS_TO_IGNORE)
+
+ # Ignore tags that are below a tag which has a class which matches TAGS_TO_IGNORE.
+ return false if self.class.parent_until_condition(node, proc do |current_node|
+ current_node.classes.any? { |klass| klass.match?(TAGS_TO_IGNORE) }
+ end)
+
return true if %w[body html].include?(node.name)
+
return true if node.parent.css('a').size > 1
false
end
end