lib/html2rss/item_extractors/text.rb in html2rss-0.9.0 vs lib/html2rss/item_extractors/text.rb in html2rss-0.10.0
- old
+ new
@@ -1,12 +1,14 @@
+# frozen_string_literal: true
+
module Html2rss
module ItemExtractors
##
- # Return the text of the attribute. This is the default extractor used,
+ # Return the text content of the attribute. This is the default extractor used,
# when no extractor is explicitly given.
#
- # Imagine this HTML structure:
+ # Example HTML structure:
#
# <p>Lorem <b>ipsum</b> dolor ...</p>
#
# YAML usage example:
#
@@ -16,17 +18,27 @@
# extractor: text
#
# Would return:
# 'Lorem ipsum dolor ...'
class Text
+ # The available options for the text extractor.
+ Options = Struct.new('TextOptions', :selector, keyword_init: true)
+
+ ##
+ # Initializes the Text extractor.
+ #
+ # @param xml [Nokogiri::XML::Element]
+ # @param options [Options]
def initialize(xml, options)
- @element = ItemExtractors.element(xml, options)
+ @element = ItemExtractors.element(xml, options.selector)
end
##
- # @return [String]
+ # Retrieves and returns the text content of the element.
+ #
+ # @return [String] The text content.
def get
- @element.text.to_s.strip.split.join(' ')
+ @element.text.to_s.strip.gsub(/\s+/, ' ')
end
end
end
end