extractor.rb in html2rss-0.15.0

- old
+ new

@@ -33,18 +33,18 @@
           end
 
           def initialize(article_tag, url:)
             @article_tag = article_tag
             @url = url
-            @heading = find_heading
-            @extract_url = find_url
           end
 
           # @return [Hash, nil] The scraped article or nil.
           def call
-            return unless heading
+            @heading = find_heading || closest_anchor || return
 
+            @extract_url = find_url
+
             {
               title: extract_title,
               url: extract_url,
               image: extract_image,
               description: extract_description,
@@ -69,18 +69,24 @@
             end
 
             times.min
           end
 
+          ##
+          # Find the heading of the article.
+          # @return [Nokogiri::XML::Node, nil]
           def find_heading
             heading_tags = article_tag.css(HEADING_TAGS.join(',')).group_by(&:name)
+
+            return if heading_tags.empty?
+
             smallest_heading = heading_tags.keys.min
             heading_tags[smallest_heading]&.max_by { |tag| visible_text_from_tag(tag)&.size }
           end
 
           def extract_title
-            @extract_title ||= if heading.children.empty? && heading.text
+            @extract_title ||= if heading && (heading.children.empty? || heading.text)
                                  visible_text_from_tag(heading)
                                else
                                  visible_text_from_tag(
                                    article_tag.css(HEADING_TAGS.join(','))
                                               .max_by { |tag| tag.text.size }
@@ -99,23 +105,30 @@
             description.gsub!(title_text, '') if title_text
             description.strip!
             description.empty? ? nil : description
           end
 
+          def closest_anchor
+            SemanticHtml.find_closest_selector(heading || article_tag,
+                                               selector: 'a[href]:not([href=""])')
+          end
+
           def find_url
-            closest_anchor = SemanticHtml.find_closest_selector(heading || article_tag,
-                                                                selector: 'a[href]:not([href=""])')
             href = closest_anchor&.[]('href')&.split('#')&.first&.strip
             Utils.build_absolute_url_from_relative(href, url) unless href.to_s.empty?
           end
 
           def extract_image
             Image.call(article_tag, url:)
           end
 
           def generate_id
-            [article_tag['id'], article_tag.at_css('[id]')&.attr('id'),
-             extract_url&.path].compact.reject(&:empty?).first
+            [
+              article_tag['id'],
+              article_tag.at_css('[id]')&.attr('id'),
+              extract_url&.path,
+              extract_url&.query
+            ].compact.reject(&:empty?).first
           end
         end
       end
     end
   end