lib/html2rss/item_extractor.rb in html2rss-0.0.1 vs lib/html2rss/item_extractor.rb in html2rss-0.1.0
- old
+ new
@@ -1,25 +1,25 @@
-require 'sanitize'
-
module Html2rss
module ItemExtractor
- TEXT = proc { |xml, options| xml.css(options['selector'])&.text }
- ATTRIBUTE = proc { |xml, options| xml.css(options['selector']).attr(options['attribute']) }
+ TEXT = proc { |xml, options| xml.css(options['selector'])&.text&.strip }
+ ATTRIBUTE = proc { |xml, options| xml.css(options['selector']).attr(options['attribute']).to_s }
HREF = proc { |xml, options|
- uri = URI(options['channel']['url'])
- uri.path = xml.css(options['selector']).attr('href')
- uri
- }
+ href = xml.css(options['selector']).attr('href').to_s
+ path, query = href.split('?')
- HTML = proc { |xml, options|
- html = xml.css(options['selector']).to_s
+ if href.start_with?('http')
+ uri = URI(href)
+ else
+ uri = URI(options['channel']['url'])
+ uri.path = path.start_with?('/') ? path : "/#{path}"
+ uri.query = query
+ end
- Sanitize.fragment(html, Sanitize::Config.merge(
- Sanitize::Config::RELAXED,
- add_attributes: {
- 'a' => { 'rel' => 'nofollow noopener noreferrer' }
- }
- ))
+ uri
}
+
+ HTML = proc { |xml, options| xml.css(options['selector']).to_s }
+ STATIC = proc { |_xml, options| options['static'] }
+ CURRENT_TIME = proc { |_xml, _options| Time.new }
end
end