Sha256: c65828238ef6382c8624f69ceae40f28a5c0c4665d640d89d32619f62b5d328b

Contents?: true

Size: 1.86 KB

Versions: 1

Compression:

Stored size: 1.86 KB

Contents

# frozen_string_literal: true

require 'nokogiri'
require 'parallel'
require 'addressable'

module Html2rss
  ##
  # AutoSource derives a channel and its articles from a single fetched page.
  #
  # Articles are gathered by every scraper that Scraper.from selects for the
  # parsed document; the scrapers rely on popular ways of marking up articles,
  # e.g. schema, microdata, open graph, etc.
  class AutoSource
    ##
    # Raised by #build when the scrapers yielded no articles at all.
    class NoArticlesFound < Html2rss::Error; end

    ##
    # @param url [Addressable::URI] The URL to extract articles from.
    # @param body [String] The body of the response.
    # @param headers [Hash] The headers of the response.
    def initialize(url, body:, headers: {})
      @url = url
      @body = body
      @headers = headers
    end

    ##
    # Runs the extracted articles through Reducer and Cleanup, assigns them to
    # the channel and hands both to RssBuilder.
    #
    # @return whatever RssBuilder#call returns
    # @raise [NoArticlesFound] if no articles were extracted
    def build
      extracted = articles
      # NOTE(review): emptiness is only checked before Cleanup runs; whether
      # Cleanup can empty the list is not visible from here -- confirm.
      raise NoArticlesFound if extracted.empty?

      # NOTE(review): the return values of Reducer.call and Cleanup.call are
      # discarded, so these calls only matter if they mutate the passed array
      # in place -- confirm against their implementations.
      Reducer.call(extracted, url:)
      Cleanup.call(extracted, url:, keep_different_domain: true)

      channel.articles = extracted

      Html2rss::AutoSource::RssBuilder.new(channel:, articles: extracted).call
    end

    ##
    # Collects the articles of all scrapers returned by Scraper.from.
    # The result is memoized.
    #
    # @return [Array<Article>]
    def articles
      @articles ||= Scraper.from(parsed_body).flat_map { |scraper| scrape_with(scraper) }
    end

    ##
    # The channel built from the parsed document, the response headers and the
    # URL. The result is memoized.
    #
    # @return [Channel]
    def channel
      @channel ||= Channel.new(parsed_body, headers: @headers, url:)
    end

    private

    attr_reader :url

    ##
    # Instantiates the given scraper for the parsed document and turns each
    # article hash it yields into an Article, using Parallel.map.
    #
    # @param scraper [Class] a scraper class as returned by Scraper.from
    # @return [Array<Article>]
    def scrape_with(scraper)
      article_hashes = scraper.new(parsed_body, url:).each

      scraped = Parallel.map(article_hashes) do |article_hash|
        Log.debug "Scraper: #{scraper} in worker: #{Parallel.worker_number} [#{article_hash[:url]}]"

        Article.new(**article_hash, scraper:)
      end

      # NOTE(review): the return value is discarded; this is only effective if
      # Reducer.call mutates its argument -- confirm.
      Reducer.call(scraped, url:)

      scraped
    end

    ##
    # Parses the response body once, strips all comment nodes and freezes the
    # resulting document.
    #
    # @return [Nokogiri::HTML::Document]
    def parsed_body
      @parsed_body ||= begin
        document = Nokogiri.HTML(@body)
        # Remove comments from the document
        document.xpath('//comment()').each(&:remove)
        document.freeze
      end
    end
  end
end

Version data entries

1 entry across 1 version & 1 rubygem

Version Path
html2rss-0.16.0 lib/html2rss/auto_source.rb