# frozen_string_literal: true

module Html2rss
  class AutoSource
    ##
    # Extracts channel information from
    # 1. the HTML document's <head> section (title, language, meta description, og:image), and
    # 2. the HTTP response headers (last-modified, cache-control).
    class Channel
      ##
      # @param parsed_body [Nokogiri::HTML::Document] The parsed HTML document.
      # @param url [Addressable::URI] The URL of the channel.
      # @param headers [Hash] the HTTP response headers
      # @param articles [Array] The articles.
      # @param stylesheets [Array] stylesheets to reference from the feed
      def initialize(parsed_body, url:, headers:, articles: [], stylesheets: [])
        @parsed_body = parsed_body
        @url = url
        @headers = headers
        @articles = articles
        @stylesheets = stylesheets
      end

      attr_writer :articles
      attr_reader :stylesheets

      def url = extract_url
      def title = extract_title
      def language = extract_language
      def description = extract_description
      def image = extract_image
      def ttl = extract_ttl
      def last_build_date = headers['last-modified']

      # @return [String] generator tag value naming html2rss and the scrapers that produced the articles
      def generator
        "html2rss V. #{::Html2rss::VERSION} (using auto_source scrapers: #{scraper_counts})"
      end

      private

      attr_reader :parsed_body, :headers

      # @return [String] the channel URL, normalized
      def extract_url
        @url.normalize.to_s
      end

      # @return [String, nil] text of the document's <title>, if present
      def extract_title
        parsed_body.at_css('head > title')&.text
      end

      # Prefers a `lang` attribute on the root <html> element, falling back to
      # the first element carrying a `lang` attribute anywhere in the document.
      # @return [String, nil]
      def extract_language
        return parsed_body['lang'] if parsed_body.name == 'html' && parsed_body['lang']

        parsed_body.at_css('[lang]')&.[]('lang')
      end

      # @return [String] the meta description, or '' when absent
      def extract_description
        parsed_body.at_css('meta[name="description"]')&.[]('content') || ''
      end

      # @return [Addressable::URI, nil] sanitized og:image URL, if present
      def extract_image
        url = parsed_body.at_css('meta[property="og:image"]')&.[]('content')
        Html2rss::Utils.sanitize_url(url) if url
      end

      # Derives the feed TTL (minutes) from the Cache-Control max-age (seconds),
      # rounding up so short cache windows still yield at least 1 minute.
      # @return [Integer, nil] nil when no max-age directive is present
      def extract_ttl
        ttl = headers['cache-control']&.match(/max-age=(\d+)/)&.[](1)
        return unless ttl

        ttl.to_i.fdiv(60).ceil
      end

      # Tallies articles per scraper class (in first-encounter order) and
      # renders them as "[ScraperName=count]" segments, with the common
      # scraper namespace prefix stripped for brevity.
      # @return [String]
      def scraper_counts
        @articles.map(&:scraper)
                 .tally
                 .map { |klass, count| "[#{klass.to_s.gsub('Html2rss::AutoSource::Scraper::', '')}=#{count}]" }
                 .join
      end
    end
  end
end