Sha256: 34e7930fc1d02df4e7caae2cb942864ca02f2b8659e8be9c9fd941c02bb30082
Contents?: true
Size: 1.91 KB
Versions: 1
Compression:
Stored size: 1.91 KB
Contents
# frozen_string_literal: true

module Html2rss
  class AutoSource
    ##
    # Cleanup is responsible for cleaning up the extracted articles.
    # :reek:MissingSafeMethod { enabled: false }
    # It applies various strategies to filter and refine the article list.
    class Cleanup
      # Lowercase URL schemes an article URL must use to be kept.
      # Hoisted to a constant so the list is not re-allocated per article.
      HTTP_SCHEMES = %w[http https].freeze
      private_constant :HTTP_SCHEMES

      class << self
        ##
        # Filters the given list of articles in place and returns it.
        #
        # Steps, in order: drop articles that are not `valid?`, drop
        # articles whose URL is nil or was already seen, drop articles
        # whose URL scheme is not HTTP(S), and (unless
        # +keep_different_domain+ is true) drop articles whose URL host
        # differs from the host of +url+.
        #
        # @param articles [Array<Article>] The list of articles to process (mutated).
        # @param url [Addressable::URI] The source URL; only its host is read.
        # @param keep_different_domain [Boolean] Whether to keep articles from other hosts.
        # @return [Array<Article>] The same array instance that was passed in.
        def call(articles, url:, keep_different_domain: false)
          Log.debug "Cleanup: start with #{articles.size} articles"

          articles.select!(&:valid?)

          deduplicate_by!(articles, :url)

          keep_only_http_urls!(articles)
          reject_different_domain!(articles, url) unless keep_different_domain

          Log.debug "Cleanup: end with #{articles.size} articles"
          articles
        end

        private

        ##
        # Deduplicates articles by a given key. The first article seen for
        # a value is kept; articles whose value is nil are removed.
        #
        # @param articles [Array<Article>] The list of articles to process.
        # @param key [Symbol] The key to deduplicate by.
        def deduplicate_by!(articles, key)
          seen = {}

          articles.reject! do |article|
            value = article.public_send(key)
            next true if value.nil?

            # Remember the value after checking it, so that only later
            # occurrences are reported as duplicates.
            duplicate = seen.key?(value)
            seen[value] = true
            duplicate
          end
        end

        ##
        # Keeps only articles with HTTP or HTTPS URLs.
        # The scheme is matched case-insensitively, because URI schemes
        # are case-insensitive (RFC 3986, section 3.1).
        #
        # @param articles [Array<Article>] The list of articles to process.
        def keep_only_http_urls!(articles)
          articles.select! { |article| HTTP_SCHEMES.include?(article.url&.scheme&.downcase) }
        end

        ##
        # Rejects articles that have a URL not on the same domain as the source.
        # Hosts are compared case-insensitively, because URI hosts are
        # case-insensitive (RFC 3986, section 3.2.2).
        #
        # @param articles [Array<Article>] The list of articles to process.
        # @param base_url [Addressable::URI] The source URL to compare against.
        def reject_different_domain!(articles, base_url)
          base_host = base_url.host&.downcase

          articles.select! { |article| article.url&.host&.downcase == base_host }
        end
      end
    end
  end
end
Version data entries
1 entries across 1 versions & 1 rubygems
Version | Path |
---|---|
html2rss-0.16.0 | lib/html2rss/auto_source/cleanup.rb |