# frozen_string_literal: true
require 'addressable'
require 'parallel'
module Html2rss
class AutoSource
module Scraper
##
# Scrapes articles by looking for common markup tags (article, section, li)
# containing an tag.
#
# See:
# 1. https://developer.mozilla.org/en-US/docs/Web/HTML/Element/article
class SemanticHtml
include Enumerable
##
# Map of parent element names to CSS selectors for finding tags.
ANCHOR_TAG_SELECTORS = {
'section' => ['section :not(section) a[href]'],
'tr' => ['table tr :not(tr) a[href]'],
'article' => [
'article :not(article) a[href]',
'article a[href]'
],
'li' => [
'ul > li :not(li) a[href]',
'ol > li :not(li) a[href]'
]
}.freeze
# Check if the parsed_body contains articles
# @param parsed_body [Nokogiri::HTML::Document] The parsed HTML document
# @return [Boolean] True if articles are found, otherwise false.
def self.articles?(parsed_body)
return false unless parsed_body
ANCHOR_TAG_SELECTORS.each_value do |selectors|
return true if selectors.any? { |selector| parsed_body.at_css(selector) }
end
false
end
# Finds the closest ancestor tag matching the specified tag name
# @param current_tag [Nokogiri::XML::Node] The current tag to start searching from
# @param tag_name [String] The tag name to search for
# @param stop_tag [String] The tag name to stop searching at
# @return [Nokogiri::XML::Node] The found ancestor tag or the current tag if matched
def self.find_tag_in_ancestors(current_tag, tag_name, stop_tag: 'html')
return current_tag if current_tag.name == tag_name
stop_tags = Set[tag_name, stop_tag]
while current_tag.respond_to?(:parent) && !stop_tags.member?(current_tag.name)
current_tag = current_tag.parent
end
current_tag
end
# Finds the closest matching selector upwards in the DOM tree
# @param current_tag [Nokogiri::XML::Node] The current tag to start searching from
# @param selector [String] The CSS selector to search for
# @return [Nokogiri::XML::Node, nil] The closest matching tag or nil if not found
def self.find_closest_selector(current_tag, selector: 'a[href]:not([href=""])')
current_tag.at_css(selector) || find_closest_selector_upwards(current_tag, selector:)
end
# Helper method to find a matching selector upwards
# @param current_tag [Nokogiri::XML::Node] The current tag to start searching from
# @param selector [String] The CSS selector to search for
# @return [Nokogiri::XML::Node, nil] The closest matching tag or nil if not found
def self.find_closest_selector_upwards(current_tag, selector:)
while current_tag
found = current_tag.at_css(selector)
return found if found
return nil unless current_tag.respond_to?(:parent)
current_tag = current_tag.parent
end
end
# Returns an array of [tag_name, selector] pairs
# @return [Array<[String, String]>] Array of tag name and selector pairs
def self.anchor_tag_selector_pairs
ANCHOR_TAG_SELECTORS.flat_map do |tag_name, selectors|
selectors.map { |selector| [tag_name, selector] }
end
end
def initialize(parsed_body, url:)
@parsed_body = parsed_body
@url = url
end
attr_reader :parsed_body
##
# @yieldparam [Hash] The scraped article hash
# @return [Enumerator] Enumerator for the scraped articles
def each
return enum_for(:each) unless block_given?
SemanticHtml.anchor_tag_selector_pairs.each do |tag_name, selector|
parsed_body.css(selector).each do |selected_tag|
article_tag = SemanticHtml.find_tag_in_ancestors(selected_tag, tag_name)
article_hash = Extractor.new(article_tag, url: @url).call
yield article_hash if article_hash
end
end
end
end
end
end
end