# frozen_string_literal: true
module Html2rss
  class AutoSource
    ##
    # Cleanup is responsible for cleaning up the extracted articles.
    # It applies various strategies to filter and refine the article list.
    #
    # Every step mutates the given array in place; no copy is made.
    # :reek:MissingSafeMethod { enabled: false }
    class Cleanup
      # Lowercase URL schemes an article URL must use in order to be kept.
      VALID_SCHEMES = %w[http https].freeze

      class << self
        ##
        # Runs all cleanup steps on the given articles, in place.
        #
        # Steps, in order: drop invalid articles, drop articles with a short
        # title, deduplicate by URL, deduplicate by title, keep only
        # HTTP(S) URLs and (unless disabled) drop articles from other hosts.
        #
        # @param articles [Array] articles responding to #valid?, #title and #url.
        # @param url [Addressable::URI] the source URL (anything responding to #host).
        # @param keep_different_domain [Boolean] when true, articles hosted on
        #   another host than +url+ are kept.
        # @return [Array] the same (mutated) +articles+ array.
        def call(articles, url:, keep_different_domain: false)
          Log.debug "Cleanup: start with #{articles.size} articles"

          articles.select!(&:valid?)
          remove_short!(articles, :title)
          deduplicate_by!(articles, :url)
          deduplicate_by!(articles, :title)
          keep_only_http_urls!(articles)
          reject_different_domain!(articles, url) unless keep_different_domain

          Log.debug "Cleanup: end with #{articles.size} articles"
          articles
        end

        private

        ##
        # Removes articles with short values for a given key.
        #
        # An article is removed when the value is nil or its string form has
        # fewer than +min_words+ whitespace-separated words.
        #
        # @param articles [Array] The list of articles to process.
        # @param key [Symbol] The key to check for short values.
        # @param min_words [Integer] The minimum number of words required.
        # @return [Array, nil] the array, or nil when nothing was removed.
        def remove_short!(articles, key = :title, min_words: 2)
          articles.reject! do |article|
            value = article.public_send(key)
            value.nil? || value.to_s.split.size < min_words
          end
        end

        ##
        # Deduplicates articles by a given key, keeping the first occurrence.
        # Articles whose value for +key+ is nil are removed as well.
        #
        # @param articles [Array] The list of articles to process.
        # @param key [Symbol] The key to deduplicate by.
        # @return [Array, nil] the array, or nil when nothing was removed.
        def deduplicate_by!(articles, key)
          seen = {}

          articles.reject! do |article|
            value = article.public_send(key)
            next true if value.nil?

            duplicate = seen.key?(value)
            seen[value] = true
            duplicate
          end
        end

        ##
        # Keeps only articles with HTTP or HTTPS URLs.
        # Articles without a URL are removed.
        #
        # @param articles [Array] The list of articles to process.
        # @return [Array, nil] the array, or nil when nothing was removed.
        def keep_only_http_urls!(articles)
          # URI schemes are case-insensitive (RFC 3986, section 3.1).
          articles.select! { |article| VALID_SCHEMES.include?(article.url&.scheme&.downcase) }
        end

        ##
        # Rejects articles that have a URL not on the same domain as the source.
        #
        # @param articles [Array] The list of articles to process.
        # @param base_url [Addressable::URI] The source URL to compare against.
        # @return [Array, nil] the array, or nil when nothing was removed.
        def reject_different_domain!(articles, base_url)
          # Host names are case-insensitive (RFC 3986, section 3.2.2),
          # so both sides are compared in lowercase.
          base_host = base_url.host&.downcase
          articles.select! { |article| article.url&.host&.downcase == base_host }
        end
      end
    end
  end
end