# frozen_string_literal: true

require 'set'
require 'nokogiri'
require 'parallel'
require 'addressable'

module Html2rss
  ##
  # The AutoSource class is responsible for extracting a channel and articles
  # from a given URL.
  # It uses a set of ArticleExtractors to extract articles, relying on popular
  # conventions for marking up articles, e.g. schema.org, microdata, Open Graph, etc.
  class AutoSource
    class UnsupportedUrlScheme < Html2rss::Error; end
    class NoArticlesFound < Html2rss::Error; end

    SUPPORTED_URL_SCHEMES = %w[http https].to_set.freeze

    ##
    # @param url [Addressable::URI] The URL to extract articles from.
    # @param body [String] The body of the response.
    # @param headers [Hash] The headers of the response.
    def initialize(url, body:, headers: {})
      raise ArgumentError, 'URL must be an Addressable::URI' unless url.is_a?(Addressable::URI)
      raise ArgumentError, 'URL must be absolute' unless url.absolute?
      raise UnsupportedUrlScheme, "#{url.scheme} not supported" unless SUPPORTED_URL_SCHEMES.include?(url.scheme)

      @url = url
      @body = body
      @headers = headers
    end

    ##
    # Builds the RSS feed: reduces and cleans the extracted articles,
    # assigns them to the channel, and hands both to the RssBuilder.
    #
    # @raise [NoArticlesFound] if no articles could be extracted
    def build
      raise NoArticlesFound if articles.empty?

      Reducer.call(articles, url:)
      Cleanup.call(articles, url:, keep_different_domain: true)

      channel.articles = articles

      Html2rss::AutoSource::RssBuilder.new(
        channel:,
        articles:
      ).call
    end

    ##
    # Runs every applicable scraper against the parsed document and
    # collects the resulting articles.
    #
    # @return [Array<Article>]
    def articles
      @articles ||= Scraper.from(parsed_body).flat_map do |scraper|
        instance = scraper.new(parsed_body, url:)

        articles_in_thread = Parallel.map(instance.each) do |article_hash|
          Log.debug "Scraper: #{scraper} in worker: #{Parallel.worker_number} [#{article_hash[:url]}]"

          Article.new(**article_hash, scraper:)
        end

        Reducer.call(articles_in_thread, url:)

        articles_in_thread
      end
    end

    ##
    # @return [Channel]
    def channel
      @channel ||= Channel.new(parsed_body, headers: @headers, url:)
    end

    private

    attr_reader :url

    # @return [Nokogiri::HTML::Document]
    def parsed_body
      @parsed_body ||= Nokogiri.HTML(@body).tap do |doc|
        # Remove comments from the document
        doc.xpath('//comment()').each(&:remove)
      end.freeze
    end
  end
end
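
# A minimal usage sketch, assuming the collaborating classes referenced above
# (Scraper, Reducer, Cleanup, Channel, Article, RssBuilder) are loaded as part
# of the Html2rss gem, and that the caller has already fetched the HTML body;
# the URL, body, and headers below are illustrative placeholders, not fixtures:
#
#   url  = Addressable::URI.parse('https://example.com/blog')
#   body = '<html><body><article>…</article></body></html>'
#
#   auto_source = Html2rss::AutoSource.new(url, body: body, headers: {})
#   rss = auto_source.build # raises NoArticlesFound if nothing could be extracted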