Sha256: 34e7930fc1d02df4e7caae2cb942864ca02f2b8659e8be9c9fd941c02bb30082

Contents?: true

Size: 1.91 KB

Versions: 1

Compression:

Stored size: 1.91 KB

Contents

# frozen_string_literal: true

module Html2rss
  class AutoSource
    ##
    # Cleanup narrows a list of extracted articles down to the usable ones.
    # Every step filters the given array in place, so the array passed to
    # +call+ is the one that gets mutated and returned.
    # :reek:MissingSafeMethod { enabled: false }
    class Cleanup
      class << self
        ##
        # Runs all cleanup steps, in this order:
        # 1. drop articles that are not +valid?+
        # 2. drop articles without a URL and repeated URLs (first one wins)
        # 3. drop articles whose URL scheme is neither http nor https
        # 4. drop articles hosted elsewhere (skipped if +keep_different_domain+)
        #
        # @param articles [Array<Article>] The articles to clean; mutated in place.
        # @param url [Addressable::URI] The source URL; only its host is read.
        # @param keep_different_domain [Boolean] Whether to keep articles from other hosts.
        # @return [Array<Article>] The same array that was passed in.
        def call(articles, url:, keep_different_domain: false)
          Log.debug "Cleanup: start with #{articles.size} articles"

          articles.select!(&:valid?)
          deduplicate_by!(articles, :url)
          keep_only_http_urls!(articles)

          reject_different_domain!(articles, url) unless keep_different_domain

          articles.tap { |remaining| Log.debug "Cleanup: end with #{remaining.size} articles" }
        end

        private

        ##
        # Removes, in place, every article whose +key+ value is nil or has
        # already been seen on an earlier article.
        #
        # @param articles [Array<Article>] The articles to filter.
        # @param key [Symbol] Name of the public method that yields the identity.
        def deduplicate_by!(articles, key)
          known = {}

          articles.reject! do |article|
            identity = article.public_send(key)
            next true if identity.nil?

            # Remember the identity before reporting whether it was a repeat.
            repeated = known.key?(identity)
            known[identity] = true
            repeated
          end
        end

        ##
        # Retains, in place, only the articles whose URL scheme is http or https.
        # Articles without a URL are dropped, as their scheme resolves to nil.
        #
        # @param articles [Array<Article>] The articles to filter.
        def keep_only_http_urls!(articles)
          web_schemes = %w[http https]

          articles.select! do |article|
            scheme = article.url&.scheme
            web_schemes.include?(scheme)
          end
        end

        ##
        # Retains, in place, only the articles whose URL host equals the host
        # of the source URL.
        #
        # @param articles [Array<Article>] The articles to filter.
        # @param base_url [Addressable::URI] The source URL whose host must match.
        def reject_different_domain!(articles, base_url)
          expected_host = base_url.host

          articles.select! do |article|
            article_host = article.url&.host
            article_host == expected_host
          end
        end
      end
    end
  end
end

Version data entries

1 entries across 1 versions & 1 rubygems

Version Path
html2rss-0.16.0 lib/html2rss/auto_source/cleanup.rb