# frozen_string_literal: true

module Html2rss
  class AutoSource
    module Scraper
      class SemanticHtml
        ##
        # Image is responsible for extracting image URLs from the article_tag.
        #
        # Sources are tried in this order: the largest `srcset` candidate,
        # the first `img[src]`, then a `url(...)` found in an inline style.
        class Image
          ##
          # Matches one image candidate of a srcset attribute value.
          #
          # Splitting the attribute on every comma is wrong because URLs may
          # contain commas themselves (data URIs, CDN transformation paths).
          # A comma only terminates a candidate when it trails the URL directly
          # before whitespace / end of input, or when it follows the descriptor.
          #
          # Capture 1 is the URL, capture 2 the (optional) raw descriptor.
          SRCSET_CANDIDATE = /
            ([^\s,]\S*?)      # URL: starts with a non-comma, non-space character
            (?:
              ,+(?=\s|\z)     # URL directly terminated by comma(s): no descriptor
              | \s+([^,]*)    # whitespace, then descriptor text up to the next comma
              | \z            # URL is the last token of the attribute
            )
          /x.freeze

          ##
          # @param article_tag [#css, #at_css] the node to search for images
          #   (presumably a Nokogiri element; only `css`/`at_css` are used here)
          # @param url [Object] base URL, passed through to
          #   Utils.build_absolute_url_from_relative
          # @return [Object, nil] the result of
          #   Utils.build_absolute_url_from_relative, or nil when no image was found
          def self.call(article_tag, url:)
            img_src = from_source(article_tag) || from_img(article_tag) || from_style(article_tag)

            Utils.build_absolute_url_from_relative(img_src, url) if img_src
          end

          ##
          # @param article_tag [#at_css]
          # @return [String, nil] src of the first img whose src is not a data URI
          def self.from_img(article_tag)
            # Match the `data:` scheme only, so relative paths like "data/photo.jpg" stay eligible.
            article_tag.at_css('img[src]:not([src^="data:"])')&.[]('src')
          end

          ##
          # Extracts the largest image source from the srcset attribute
          # of an img tag or a source tag inside a picture tag.
          #
          # Width (`300w`) and density (`2x`) descriptors are compared by their
          # numeric value; a candidate without descriptor counts as `1x`.
          # When several candidates share the same value, the last one wins.
          #
          # @param article_tag [#css]
          # @return [String, nil] URL of the largest candidate, nil if there is none
          #
          # @see https://developer.mozilla.org/en-US/docs/Web/HTML/Element/img#srcset
          # @see https://developer.mozilla.org/en-US/docs/Web/HTML/Element/picture
          # @see https://html.spec.whatwg.org/multipage/images.html#parse-a-srcset-attribute
          def self.from_source(article_tag)
            by_size = article_tag.css('img[srcset], picture > source[srcset]')
                                 .flat_map { |node| node['srcset'].to_s.scan(SRCSET_CANDIDATE) }
                                 .reject { |src, _descriptor| src.start_with?('data:') }
                                 .to_h { |src, descriptor| [descriptor_value(descriptor), src] }

            # An empty hash yields `by_size[nil]`, which is nil.
            by_size[by_size.keys.max]
          end

          ##
          # @param article_tag [#css]
          # @return [String, nil] the longest non-data URL found in an inline
          #   `style` attribute's `url(...)`, nil if there is none
          def self.from_style(article_tag)
            article_tag.css('[style*="url"]')
                       .filter_map { |tag| tag['style'][/url\(['"]?(.*?)['"]?\)/, 1] }
                       # An empty `url()` must not count as an image: "" is truthy in Ruby.
                       .reject { |src| src.empty? || src.start_with?('data:') }
                       .max_by(&:size)
          end

          ##
          # Converts a raw srcset descriptor into a comparable number.
          #
          # @param descriptor [String, nil] e.g. "300w", "1.5x", "" or nil
          # @return [Float] numeric value; 1.0 for a missing descriptor (implied `1x`),
          #   0.0 for a descriptor without leading number
          def self.descriptor_value(descriptor)
            descriptor = descriptor.to_s.strip
            descriptor.empty? ? 1.0 : descriptor.to_f
          end
          private_class_method :descriptor_value
        end
      end
    end
  end
end