Sha256: c65828238ef6382c8624f69ceae40f28a5c0c4665d640d89d32619f62b5d328b
Contents?: true
Size: 1.86 KB
Versions: 1
Compression:
Stored size: 1.86 KB
Contents
# frozen_string_literal: true

require 'nokogiri'
require 'parallel'
require 'addressable'

module Html2rss
  ##
  # The AutoSource class is responsible for extracting channel and articles
  # from a given URL.
  # It asks Scraper for the scrapers applicable to the parsed document and
  # uses them to extract articles, utilizing popular ways of marking
  # articles, e.g. schema, microdata, open graph, etc.
  class AutoSource
    # Raised by #build when no articles could be extracted.
    class NoArticlesFound < Html2rss::Error; end

    ##
    # @param url [Addressable::URI] The URL to extract articles from.
    # @param body [String] The body of the response.
    # @param headers [Hash] The headers of the response.
    def initialize(url, body:, headers: {})
      @url = url
      @body = body
      @headers = headers
    end

    ##
    # Builds the feed: passes the extracted articles through Reducer and
    # Cleanup, assigns them to the channel and delegates to RssBuilder.
    #
    # @return [Object] the result of RssBuilder#call
    #   (presumably the generated RSS feed -- confirm against RssBuilder)
    # @raise [NoArticlesFound] if no articles were extracted
    def build
      raise NoArticlesFound if articles.empty?

      # NOTE(review): the return value of Reducer.call is not used here, so
      # this only has an effect if Reducer mutates `articles` in place --
      # confirm against Reducer.
      Reducer.call(articles, url:)
      Cleanup.call(articles, url:, keep_different_domain: true)

      channel.articles = articles

      Html2rss::AutoSource::RssBuilder.new(channel:, articles:).call
    end

    ##
    # Collects the articles produced by every scraper that Scraper.from
    # returns for the parsed body. The result is memoized.
    #
    # @return [Array<Article>]
    def articles
      @articles ||= Scraper.from(parsed_body).flat_map do |scraper|
        instance = scraper.new(parsed_body, url:)

        # NOTE(review): Parallel.map is called without explicit
        # in_threads/in_processes options, so the gem's default execution
        # mode applies -- confirm this is intended.
        scraped_articles = Parallel.map(instance.each) do |article_hash|
          Log.debug "Scraper: #{scraper} in worker: #{Parallel.worker_number} [#{article_hash[:url]}]"

          Article.new(**article_hash, scraper:)
        end

        # NOTE(review): as in #build, the return value of Reducer.call is
        # discarded; the unreassigned local is what gets returned.
        Reducer.call(scraped_articles, url:)

        scraped_articles
      end
    end

    ##
    # The channel built from the parsed body, the response headers and the
    # URL. The result is memoized.
    #
    # @return [Channel]
    def channel
      @channel ||= Channel.new(parsed_body, headers: @headers, url:)
    end

    private

    attr_reader :url

    ##
    # Parses the response body once, strips all comment nodes and freezes
    # the resulting document before memoizing it.
    #
    # @return [Nokogiri::HTML::Document]
    def parsed_body
      @parsed_body ||= begin
        doc = Nokogiri.HTML(@body)
        # Remove comments from the document
        doc.xpath('//comment()').each(&:remove)
        doc.freeze
      end
    end
  end
end
Version data entries
1 entry across 1 version & 1 rubygem
Version | Path |
---|---|
html2rss-0.16.0 | lib/html2rss/auto_source.rb |