# frozen_string_literal: true require 'nokogiri' require 'parallel' require 'addressable' module Html2rss ## # The AutoSource class is responsible for extracting channel and articles # from a given URL. # It uses a set of ArticleExtractors to extract articles, utilizing popular ways of # marking articles, e.g. schema, microdata, open graph, etc. class AutoSource class UnsupportedUrlScheme < Html2rss::Error; end class NoArticlesFound < Html2rss::Error; end SUPPORTED_URL_SCHEMES = %w[http https].to_set.freeze def initialize(url) unless url.is_a?(String) || url.is_a?(Addressable::URI) raise ArgumentError, 'URL must be a String or Addressable::URI' end @url = Addressable::URI.parse(url) raise ArgumentError, 'URL must be absolute' unless @url.absolute? raise UnsupportedUrlScheme, "#{@url.scheme} not supported" unless SUPPORTED_URL_SCHEMES.include?(@url.scheme) end def build raise NoArticlesFound if articles.empty? Reducer.call(articles, url:) Cleanup.call(articles, url:, keep_different_domain: true) Html2rss::AutoSource::RssBuilder.new( channel:, articles: ).call end def articles @articles ||= Scraper.from(parsed_body).flat_map do |scraper| instance = scraper.new(parsed_body, url:) articles_in_thread = Parallel.map(instance.each) do |article_hash| Log.debug "Scraper: #{scraper} in worker: #{Parallel.worker_number} [#{article_hash[:url]}]" Article.new(**article_hash, scraper:) end Reducer.call(articles_in_thread, url:) articles_in_thread end end def channel Channel.new(parsed_body, response:, url:, articles:) end private attr_reader :url def response @response ||= Html2rss::Utils.request_url(url) end # Parses the HTML body of the response using Nokogiri. # @return [Nokogiri::HTML::Document] def parsed_body @parsed_body ||= Nokogiri.HTML(response.body).freeze end end end