lib/html2rss/auto_source.rb in html2rss-0.13.0 vs lib/html2rss/auto_source.rb in html2rss-0.14.0
- old
+ new
@@ -14,28 +14,32 @@
# Raised by the constructor when the URL's scheme is not listed in
# SUPPORTED_URL_SCHEMES.
class UnsupportedUrlScheme < Html2rss::Error
end

# Raised by #build when the list of articles is empty.
class NoArticlesFound < Html2rss::Error
end

# URL schemes the constructor accepts.
SUPPORTED_URL_SCHEMES = Set.new(%w[http https]).freeze
# Constructor, shown as a diff: "-" lines are the 0.13.0 version, "+" lines
# the 0.14.0 version.
# 0.13.0 accepted a String or an Addressable::URI and ran it through
# Addressable::URI.parse before validating it.
# 0.14.0 accepts only an Addressable::URI, takes the response body as a
# required keyword and the response headers as an optional keyword
# (default: empty Hash), and stores all three unchanged.
# Both versions raise ArgumentError for a wrongly typed or non-absolute URL
# and UnsupportedUrlScheme when the scheme is not in SUPPORTED_URL_SCHEMES.
- def initialize(url)
- unless url.is_a?(String) || url.is_a?(Addressable::URI)
- raise ArgumentError,
- 'URL must be a String or Addressable::URI'
- end
+ ##
+ # @param url [Addressable::URI] The URL to extract articles from.
+ # @param body [String] The body of the response.
+ # @param headers [Hash] The headers of the response.
+ def initialize(url, body:, headers: {})
# NOTE(review): the message below reads "a Addressable::URI"; "an" would be
# grammatically correct. Left as-is because it is the released text.
+ raise ArgumentError, 'URL must be a Addressable::URI' unless url.is_a?(Addressable::URI)
+ raise ArgumentError, 'URL must be absolute' unless url.absolute?
+ raise UnsupportedUrlScheme, "#{url.scheme} not supported" unless SUPPORTED_URL_SCHEMES.include?(url.scheme)
- @url = Addressable::URI.parse(url)
-
- raise ArgumentError, 'URL must be absolute' unless @url.absolute?
- raise UnsupportedUrlScheme, "#{@url.scheme} not supported" unless SUPPORTED_URL_SCHEMES.include?(@url.scheme)
# In 0.14.0 validation happens before any assignment, on the argument itself.
+ @url = url
+ @body = body
+ @headers = headers
end
# Builds the feed from the extracted articles.
# Raises NoArticlesFound when `articles` is empty. Otherwise passes the
# articles through Reducer and Cleanup, assigns them to the channel (added in
# 0.14.0) and returns whatever RssBuilder#call returns.
def build
raise NoArticlesFound if articles.empty?
# NOTE(review): the return values of both calls are discarded, so this
# presumably relies on them mutating `articles` in place — confirm against
# Reducer and Cleanup.
Reducer.call(articles, url:)
Cleanup.call(articles, url:, keep_different_domain: true)
# 0.14.0: the channel receives the articles through a writer instead of a
# constructor argument.
+ channel.articles = articles
+
Html2rss::AutoSource::RssBuilder.new(
channel:,
articles:
).call
end
@@ -55,23 +59,22 @@
articles_in_thread
end
end
# Returns the Channel for this source.
# 0.13.0 built a new Channel on every call from the parsed body, the response,
# the URL and the articles.
# 0.14.0 builds it once from the parsed body, the stored headers and the URL,
# and memoizes it in @channel.
def channel
- Channel.new(parsed_body, response:, url:, articles:)
+ @channel ||= Channel.new(parsed_body, headers: @headers, url:)
end
# Methods defined after this point are private.
private
# Private reader for @url, which the constructor assigns after validation.
attr_reader :url
# Removed in 0.14.0. In 0.13.0 this requested the URL through
# Html2rss::Utils.request_url and memoized the result in @response.
- def response
- @response ||= Html2rss::Utils.request_url(url)
- end
-
# Memoized Nokogiri document for the page.
# 0.13.0 parsed response.body and froze the resulting document object.
# 0.14.0 parses the @body given to the constructor, removes every comment
# node matched by the XPath '//comment()', then freezes the document object.
- # Parses the HTML body of the response using Nokogiri.
# @return [Nokogiri::HTML::Document]
def parsed_body
- @parsed_body ||= Nokogiri.HTML(response.body).freeze
+ @parsed_body ||= Nokogiri.HTML(@body)
+ .tap do |doc|
+ # Remove comments from the document
+ doc.xpath('//comment()').each(&:remove)
+ end.freeze
end
end
end