lib/html2rss/auto_source/scraper/semantic_html/extractor.rb in html2rss-0.14.0 vs lib/html2rss/auto_source/scraper/semantic_html/extractor.rb in html2rss-0.15.0
- old
+ new
@@ -33,18 +33,18 @@
end
def initialize(article_tag, url:)
@article_tag = article_tag
@url = url
- @heading = find_heading
- @extract_url = find_url
end
# @return [Hash, nil] The scraped article or nil.
def call
- return unless heading
+ @heading = find_heading || closest_anchor || return
+ @extract_url = find_url
+
{
title: extract_title,
url: extract_url,
image: extract_image,
description: extract_description,
@@ -69,18 +69,24 @@
end
times.min
end
+ ##
+ # Find the heading of the article.
+ # @return [Nokogiri::XML::Node, nil]
def find_heading
heading_tags = article_tag.css(HEADING_TAGS.join(',')).group_by(&:name)
+
+ return if heading_tags.empty?
+
smallest_heading = heading_tags.keys.min
heading_tags[smallest_heading]&.max_by { |tag| visible_text_from_tag(tag)&.size }
end
def extract_title
- @extract_title ||= if heading.children.empty? && heading.text
+ @extract_title ||= if heading && (heading.children.empty? || heading.text)
visible_text_from_tag(heading)
else
visible_text_from_tag(
article_tag.css(HEADING_TAGS.join(','))
.max_by { |tag| tag.text.size }
@@ -99,23 +105,30 @@
description.gsub!(title_text, '') if title_text
description.strip!
description.empty? ? nil : description
end
+ def closest_anchor
+ SemanticHtml.find_closest_selector(heading || article_tag,
+ selector: 'a[href]:not([href=""])')
+ end
+
def find_url
- closest_anchor = SemanticHtml.find_closest_selector(heading || article_tag,
- selector: 'a[href]:not([href=""])')
href = closest_anchor&.[]('href')&.split('#')&.first&.strip
Utils.build_absolute_url_from_relative(href, url) unless href.to_s.empty?
end
def extract_image
Image.call(article_tag, url:)
end
def generate_id
- [article_tag['id'], article_tag.at_css('[id]')&.attr('id'),
- extract_url&.path].compact.reject(&:empty?).first
+ [
+ article_tag['id'],
+ article_tag.at_css('[id]')&.attr('id'),
+ extract_url&.path,
+ extract_url&.query
+ ].compact.reject(&:empty?).first
end
end
end
end
end