lib/html2rss/auto_source.rb in html2rss-0.13.0 vs lib/html2rss/auto_source.rb in html2rss-0.14.0

- old
+ new

@@ -14,28 +14,32 @@
     class UnsupportedUrlScheme < Html2rss::Error; end
     class NoArticlesFound < Html2rss::Error; end

     SUPPORTED_URL_SCHEMES = %w[http https].to_set.freeze

-    def initialize(url)
-      unless url.is_a?(String) || url.is_a?(Addressable::URI)
-        raise ArgumentError,
-              'URL must be a String or Addressable::URI'
-      end
+    ##
+    # @param url [Addressable::URI] The URL to extract articles from.
+    # @param body [String] The body of the response.
+    # @param headers [Hash] The headers of the response.
+    def initialize(url, body:, headers: {})
+      raise ArgumentError, 'URL must be a Addressable::URI' unless url.is_a?(Addressable::URI)
+      raise ArgumentError, 'URL must be absolute' unless url.absolute?
+      raise UnsupportedUrlScheme, "#{url.scheme} not supported" unless SUPPORTED_URL_SCHEMES.include?(url.scheme)

-      @url = Addressable::URI.parse(url)
-
-      raise ArgumentError, 'URL must be absolute' unless @url.absolute?
-      raise UnsupportedUrlScheme, "#{@url.scheme} not supported" unless SUPPORTED_URL_SCHEMES.include?(@url.scheme)
+      @url = url
+      @body = body
+      @headers = headers
     end

     def build
       raise NoArticlesFound if articles.empty?

       Reducer.call(articles, url:)
       Cleanup.call(articles, url:, keep_different_domain: true)

+      channel.articles = articles
+
       Html2rss::AutoSource::RssBuilder.new(
         channel:,
         articles:
       ).call
     end
@@ -55,23 +59,22 @@
         articles_in_thread
       end
     end

     def channel
-      Channel.new(parsed_body, response:, url:, articles:)
+      @channel ||= Channel.new(parsed_body, headers: @headers, url:)
     end

     private

     attr_reader :url

-    def response
-      @response ||= Html2rss::Utils.request_url(url)
-    end
-
-    # Parses the HTML body of the response using Nokogiri.
     # @return [Nokogiri::HTML::Document]
     def parsed_body
-      @parsed_body ||= Nokogiri.HTML(response.body).freeze
+      @parsed_body ||= Nokogiri.HTML(@body)
+                               .tap do |doc|
+                                 # Remove comments from the document
+                                 doc.xpath('//comment()').each(&:remove)
+                               end.freeze
     end
   end
 end