lib/html2rss/item.rb in html2rss-0.9.0 vs lib/html2rss/item.rb in html2rss-0.10.0
- old
+ new
@@ -1,99 +1,175 @@
-require 'faraday'
-require 'faraday_middleware'
+# frozen_string_literal: true
+
require 'nokogiri'
module Html2rss
##
- # Takes the selected Nokogiri::HTML and responds to accessors names
+ # Takes the selected Nokogiri::HTML and responds to accessor names
# defined in the feed config.
+ #
+ # Instances can only be created via `.from_url` and
+ # each represents an internally used "RSS item".
+ # Such an item provides dynamically defined attributes as methods.
class Item
+ # A context instance is passed to Item Extractors.
+ Context = Struct.new('Context', :options, :item, :config, keyword_init: true)
+ # Class to keep an Item's <enclosure>.
+ Enclosure = Struct.new('Enclosure', :type, :bits_length, :url, keyword_init: true)
+
+ ##
+ # @param xml [Nokogiri::XML::Element]
+ # @param config [Html2rss::Config]
def initialize(xml, config)
@xml = xml
@config = config
end
private_class_method :new
+ ##
+ # Checks if the object responds to a method dynamically based on the configuration.
+ #
+ # @param method_name [Symbol]
+ # @param _include_private [true, false]
+ # @return [true, false]
def respond_to_missing?(method_name, _include_private = false)
- config.attribute?(method_name) || super
+ config.selector?(method_name) || super
end
+ ##
+ # Dynamically extracts data based on the method name.
+ #
+ # @param method_name [Symbol]
+ # @param _args [Array]
+ # @return [String] extracted value for the selector.
def method_missing(method_name, *_args)
return super unless respond_to_missing?(method_name)
- attribute_options = config.attribute_options(method_name)
+ extract(method_name)
+ end
- extractor = ItemExtractors.get_extractor(attribute_options[:extractor])
- value = extractor.new(xml, attribute_options).get
+ ##
+ # Selects and processes data according to the selector name.
+ #
+ # @param tag [Symbol]
+ # @return [String] the extracted value for the selector.
+ def extract(tag)
+ attribute_options = config.selector_attributes_with_channel(tag.to_sym)
- post_process(value, attribute_options.fetch(:post_process, false))
+ post_process(
+ ItemExtractors.item_extractor_factory(attribute_options, xml).get,
+ attribute_options.fetch(:post_process, false)
+ )
end
- def available_attributes
- @available_attributes ||= (%i[title link description author comments updated] &
- @config.attribute_names) - %i[categories enclosure]
+ ##
+ # Checks if the item is valid according to the RSS 2.0 spec,
+ # by ensuring it has at least a title or a description.
+ #
+ # @return [true, false]
+ def valid?
+ title_or_description.to_s != ''
end
##
- # At least a title or a description is required to be a valid RSS 2.0 item.
- def valid?
- title = self.title if config.attribute?(:title)
- description = self.description if config.attribute?(:description)
- [title, description].join != ''
+ # Returns either the title or the description, preferring title if available.
+ #
+ # @return [String, nil]
+ def title_or_description
+ return title if config.selector?(:title)
+
+ description if config.selector?(:description)
end
##
- # @return [Array]
+ # Builds the item's GUID by SHA1-hashing the joined values of the configured GUID selectors.
+ # @return [String] SHA1 hashed GUID.
+ def guid
+ content = config.guid_selector_names.flat_map { |method_name| public_send(method_name) }.join
+
+ Digest::SHA1.hexdigest(content)
+ end
+
+ ##
+ # Retrieves categories for the item based on configured category selectors.
+ #
+ # @return [Array<String>] list of categories.
def categories
- config.category_selectors.map(&method(:method_missing))
+ config.category_selector_names.map { |method_name| public_send(method_name) }
end
+ ##
+ # Checks if the item has an enclosure based on configuration.
+ #
+ # @return [true, false]
def enclosure?
- config.attribute?(:enclosure)
+ config.selector?(:enclosure)
end
- def enclosure_url
- enclosure = Html2rss::Utils.sanitize_url(method_missing(:enclosure))
+ ##
+ # Retrieves enclosure details for the item.
+ #
+ # @return [Enclosure] enclosure details.
+ def enclosure
+ url = enclosure_url
- Html2rss::Utils.build_absolute_url_from_relative(enclosure, config.url).to_s if enclosure
+ raise 'An item.enclosure requires an absolute URL' unless url&.absolute?
+
+ Enclosure.new(
+ type: Html2rss::Utils.guess_content_type_from_url(url),
+ bits_length: 0,
+ url: url.to_s
+ )
end
##
- # @return [Array]
+ # Fetches items from a given URL using configuration settings.
+ #
+ # @param url [String] URL to fetch items from.
+ # @param config [Html2rss::Config] Configuration object.
+ # @return [Array<Html2rss::Item>] list of items fetched.
def self.from_url(url, config)
- body = get_body_from_url(url, config)
+ body = Utils.request_body_from_url(url, convert_json_to_xml: config.json?, headers: config.headers)
- Nokogiri.HTML(body).css(config.selector(:items))
- .map { |xml_item| new xml_item, config }
- .keep_if(&:valid?)
+ Nokogiri.HTML(body)
+ .css(config.selector_string(Config::Selectors::ITEMS_SELECTOR_NAME))
+ .map { |xml| new(xml, config) }
+ .select(&:valid?)
end
private
- def self.get_body_from_url(url, config)
- request = Faraday.new(url: url, headers: config.headers) do |faraday|
- faraday.use FaradayMiddleware::FollowRedirects
- faraday.adapter Faraday.default_adapter
- end
+ # @return [Nokogiri::XML::Element] XML element representing the item.
+ attr_reader :xml
+ # @return [Html2rss::Config] Configuration object for the item.
+ attr_reader :config
- body = request.get.body
-
- config.json? ? Html2rss::Utils.object_to_xml(JSON.parse(body)) : body
- end
- private_class_method :get_body_from_url
-
- attr_reader :xml, :config
-
+ ##
+ # Processes the extracted value according to post-processing options.
+ #
+ # @param value [String] extracted value.
+ # @param post_process_options [Hash<Symbol, Object>] post-processing options.
+ # @return [String] processed value.
def post_process(value, post_process_options)
return value unless post_process_options
[post_process_options].flatten.each do |options|
value = AttributePostProcessors.get_processor(options[:name])
- .new(value, options: options, item: self, config: @config)
+ .new(value, Context.new(options:, item: self, config:))
.get
end
value
+ end
+
+ ##
+ # Retrieves the URL for the enclosure, sanitizing and ensuring it's absolute.
+ #
+ # @return [Addressable::URI, nil] absolute URL of the enclosure.
+ def enclosure_url
+ enclosure = Html2rss::Utils.sanitize_url(extract(:enclosure))
+
+ Html2rss::Utils.build_absolute_url_from_relative(enclosure, config.url) if enclosure
end
end
end