lib/html2rss/auto_source/scraper/html.rb in html2rss-0.15.0 vs lib/html2rss/auto_source/scraper/html.rb in html2rss-0.16.0

- old
+ new

@@ -10,16 +10,18 @@
 # Scrapes articles from HTML pages by
 # finding similar structures around anchor tags in the parsed_body.
 class Html
   include Enumerable
+  TAGS_TO_IGNORE = /(nav|footer|header)/i
+
   def self.articles?(parsed_body)
     new(parsed_body, url: '').any?
   end
   def self.parent_until_condition(node, condition)
-    return nil if !node || node.parent.name == 'html'
+    return nil if !node || node.document? || node.parent.name == 'html'
     return node if condition.call(node)
     parent_until_condition(node.parent, condition)
   end
@@ -30,11 +32,11 @@
   end
   def initialize(parsed_body, url:)
     @parsed_body = parsed_body
     @url = url
-    @css_selectors = Hash.new(0)
+    @selectors = Hash.new(0)
   end
   attr_reader :parsed_body
   ##
@@ -56,28 +58,35 @@
   end
   ##
   # Find all the anchors in root.
   # @param root [Nokogiri::XML::Node] The root node to search for anchors
-  # @return [Set<String>] The set of CSS selectors which exist at least min_frequency times
+  # @return [Set<String>] The set of XPath selectors which exist at least min_frequency times
   def frequent_selectors(root = @parsed_body.at_css('body'), min_frequency: 2)
     @frequent_selectors ||= begin
       root.traverse do |node|
         next if !node.element? || node.name != 'a'
-        @css_selectors[self.class.simplify_xpath(node.path)] += 1
+        @selectors[self.class.simplify_xpath(node.path)] += 1
       end
-      @css_selectors.keys
-                    .select { |selector| (@css_selectors[selector]).to_i >= min_frequency }
-                    .to_set
+      @selectors.keys
+                .select { |selector| (@selectors[selector]).to_i >= min_frequency }
+                .to_set
     end
   end
-  private
-
   def article_condition(node)
+    # Ignore tags that are below a tag which is in TAGS_TO_IGNORE.
+    return false if node.path.match?(TAGS_TO_IGNORE)
+
+    # Ignore tags that are below a tag which has a class which matches TAGS_TO_IGNORE.
+    return false if self.class.parent_until_condition(node, proc do |current_node|
+      current_node.classes.any? { |klass| klass.match?(TAGS_TO_IGNORE) }
+    end)
+
     return true if %w[body html].include?(node.name)
+
     return true if node.parent.css('a').size > 1
     false
   end
 end