lib/html2rss/auto_source/scraper/semantic_html/extractor.rb in html2rss-0.14.0 vs lib/html2rss/auto_source/scraper/semantic_html/extractor.rb in html2rss-0.15.0

- old
+ new

@@ -33,18 +33,18 @@ end def initialize(article_tag, url:) @article_tag = article_tag @url = url - @heading = find_heading - @extract_url = find_url end # @return [Hash, nil] The scraped article or nil. def call - return unless heading + @heading = find_heading || closest_anchor || return + @extract_url = find_url + { title: extract_title, url: extract_url, image: extract_image, description: extract_description, @@ -69,18 +69,24 @@ end times.min end + ## + # Find the heading of the article. + # @return [Nokogiri::XML::Node, nil] def find_heading heading_tags = article_tag.css(HEADING_TAGS.join(',')).group_by(&:name) + + return if heading_tags.empty? + smallest_heading = heading_tags.keys.min heading_tags[smallest_heading]&.max_by { |tag| visible_text_from_tag(tag)&.size } end def extract_title - @extract_title ||= if heading.children.empty? && heading.text + @extract_title ||= if heading && (heading.children.empty? || heading.text) visible_text_from_tag(heading) else visible_text_from_tag( article_tag.css(HEADING_TAGS.join(',')) .max_by { |tag| tag.text.size } @@ -99,23 +105,30 @@ description.gsub!(title_text, '') if title_text description.strip! description.empty? ? nil : description end + def closest_anchor + SemanticHtml.find_closest_selector(heading || article_tag, + selector: 'a[href]:not([href=""])') + end + def find_url - closest_anchor = SemanticHtml.find_closest_selector(heading || article_tag, - selector: 'a[href]:not([href=""])') href = closest_anchor&.[]('href')&.split('#')&.first&.strip Utils.build_absolute_url_from_relative(href, url) unless href.to_s.empty? end def extract_image Image.call(article_tag, url:) end def generate_id - [article_tag['id'], article_tag.at_css('[id]')&.attr('id'), - extract_url&.path].compact.reject(&:empty?).first + [ + article_tag['id'], + article_tag.at_css('[id]')&.attr('id'), + extract_url&.path, + extract_url&.query + ].compact.reject(&:empty?).first end end end end end