lib/html2rss/item.rb in html2rss-0.9.0 vs lib/html2rss/item.rb in html2rss-0.10.0

- old
+ new

@@ -1,99 +1,175 @@ -require 'faraday' -require 'faraday_middleware' +# frozen_string_literal: true + require 'nokogiri' module Html2rss ## - # Takes the selected Nokogiri::HTML and responds to accessors names + # Takes the selected Nokogiri::HTML and responds to accessor names # defined in the feed config. + # + # Instances can only be created via `.from_url` and + # each represents an internally used "RSS item". + # Such an item provides dynamically defined attributes as methods. class Item + # A context instance is passed to Item Extractors. + Context = Struct.new('Context', :options, :item, :config, keyword_init: true) + # Class to keep an Item's <enclosure>. + Enclosure = Struct.new('Enclosure', :type, :bits_length, :url, keyword_init: true) + + ## + # @param xml [Nokogiri::XML::Element] + # @param config [Html2rss::Config] def initialize(xml, config) @xml = xml @config = config end private_class_method :new + ## + # Checks if the object responds to a method dynamically based on the configuration. + # + # @param method_name [Symbol] + # @param _include_private [true, false] + # @return [true, false] def respond_to_missing?(method_name, _include_private = false) - config.attribute?(method_name) || super + config.selector?(method_name) || super end + ## + # Dynamically extracts data based on the method name. + # + # @param method_name [Symbol] + # @param _args [Array] + # @return [String] extracted value for the selector. def method_missing(method_name, *_args) return super unless respond_to_missing?(method_name) - attribute_options = config.attribute_options(method_name) + extract(method_name) + end - extractor = ItemExtractors.get_extractor(attribute_options[:extractor]) - value = extractor.new(xml, attribute_options).get + ## + # Selects and processes data according to the selector name. + # + # @param tag [Symbol] + # @return [String] the extracted value for the selector. 
+ def extract(tag) + attribute_options = config.selector_attributes_with_channel(tag.to_sym) - post_process(value, attribute_options.fetch(:post_process, false)) + post_process( + ItemExtractors.item_extractor_factory(attribute_options, xml).get, + attribute_options.fetch(:post_process, false) + ) end - def available_attributes - @available_attributes ||= (%i[title link description author comments updated] & - @config.attribute_names) - %i[categories enclosure] + ## + # Checks if the item is valid according to RSS 2.0 spec, + # by ensuring it has at least a title or a description. + # + # @return [true, false] + def valid? + title_or_description.to_s != '' end ## - # At least a title or a description is required to be a valid RSS 2.0 item. - def valid? - title = self.title if config.attribute?(:title) - description = self.description if config.attribute?(:description) - [title, description].join != '' + # Returns either the title or the description, preferring title if available. + # + # @return [String, nil] + def title_or_description + return title if config.selector?(:title) + + description if config.selector?(:description) end ## - # @return [Array] + # + # @return [String] SHA1 hashed GUID. + def guid + content = config.guid_selector_names.flat_map { |method_name| public_send(method_name) }.join + + Digest::SHA1.hexdigest(content) + end + + ## + # Retrieves categories for the item based on configured category selectors. + # + # @return [Array<String>] list of categories. def categories - config.category_selectors.map(&method(:method_missing)) + config.category_selector_names.map { |method_name| public_send(method_name) } end + ## + # Checks if the item has an enclosure based on configuration. + # + # @return [true, false] def enclosure? - config.attribute?(:enclosure) + config.selector?(:enclosure) end - def enclosure_url - enclosure = Html2rss::Utils.sanitize_url(method_missing(:enclosure)) + ## + # Retrieves enclosure details for the item. 
+ # + # @return [Enclosure] enclosure details. + def enclosure + url = enclosure_url - Html2rss::Utils.build_absolute_url_from_relative(enclosure, config.url).to_s if enclosure + raise 'An item.enclosure requires an absolute URL' unless url&.absolute? + + Enclosure.new( + type: Html2rss::Utils.guess_content_type_from_url(url), + bits_length: 0, + url: url.to_s + ) end ## - # @return [Array] + # Fetches items from a given URL using configuration settings. + # + # @param url [String] URL to fetch items from. + # @param config [Html2rss::Config] Configuration object. + # @return [Array<Html2rss::Item>] list of items fetched. def self.from_url(url, config) - body = get_body_from_url(url, config) + body = Utils.request_body_from_url(url, convert_json_to_xml: config.json?, headers: config.headers) - Nokogiri.HTML(body).css(config.selector(:items)) - .map { |xml_item| new xml_item, config } - .keep_if(&:valid?) + Nokogiri.HTML(body) + .css(config.selector_string(Config::Selectors::ITEMS_SELECTOR_NAME)) + .map { |xml| new(xml, config) } + .select(&:valid?) end private - def self.get_body_from_url(url, config) - request = Faraday.new(url: url, headers: config.headers) do |faraday| - faraday.use FaradayMiddleware::FollowRedirects - faraday.adapter Faraday.default_adapter - end + # @return [Nokogiri::XML::Element] XML element representing the item. + attr_reader :xml + # @return [Html2rss::Config] Configuration object for the item. + attr_reader :config - body = request.get.body - - config.json? ? Html2rss::Utils.object_to_xml(JSON.parse(body)) : body - end - private_class_method :get_body_from_url - - attr_reader :xml, :config - + ## + # Processes the extracted value according to post-processing options. + # + # @param value [String] extracted value. + # @param post_process_options [Hash<Symbol, Object>] post-processing options. + # @return [String] processed value. 
def post_process(value, post_process_options) return value unless post_process_options [post_process_options].flatten.each do |options| value = AttributePostProcessors.get_processor(options[:name]) - .new(value, options: options, item: self, config: @config) + .new(value, Context.new(options:, item: self, config:)) .get end value + end + + ## + # Retrieves the URL for the enclosure, sanitizing and ensuring it's absolute. + # + # @return [Addressable::URI, nil] absolute URL of the enclosure. + def enclosure_url + enclosure = Html2rss::Utils.sanitize_url(extract(:enclosure)) + + Html2rss::Utils.build_absolute_url_from_relative(enclosure, config.url) if enclosure end end end