Sha256: 1d4647af5d10a4d305ea9555fdd3c8bcbe71cb56539cc48ed3cfb4a5070f4bbf
Contents?: true
Size: 1.92 KB
Versions: 1
Compression:
Stored size: 1.92 KB
Contents
# frozen_string_literal: true module Html2rss class AutoSource module Scraper class SemanticHtml ## # Image is responsible for extracting image URLs the article_tag. class Image def self.call(article_tag, url:) img_src = from_source(article_tag) || from_img(article_tag) || from_style(article_tag) Utils.build_absolute_url_from_relative(img_src, url) if img_src end def self.from_img(article_tag) article_tag.at_css('img[src]:not([src^="data"])')&.[]('src') end ## # Extracts the largest image source from the srcset attribute # of an img tag or a source tag inside a picture tag. # # @see <https://developer.mozilla.org/en-US/docs/Learn/HTML/Multimedia_and_embedding/Responsive_images> # @see <https://developer.mozilla.org/en-US/docs/Web/HTML/Element/img#srcset> # @see <https://developer.mozilla.org/en-US/docs/Web/HTML/Element/picture> def self.from_source(article_tag) # rubocop:disable Metrics/AbcSize hash = article_tag.css('img[srcset], picture > source[srcset]') .flat_map do |source| source['srcset'].to_s.scan(/(\S+)\s+(\d+w|\d+h)/).map do |url, width| next if url.nil? || url.start_with?('data:') width_value = width.to_i.zero? ? 0 : width.scan(/\d+/).first.to_i [width_value, url.strip] end end.to_h hash[hash.keys.max] end def self.from_style(article_tag) article_tag.css('[style*="url"]') .map { |tag| tag['style'][/url\(['"]?(.*?)['"]?\)/, 1] } .reject { |src| !src || src.start_with?('data:') } .max_by(&:size) end end end end end end
Version data entries
1 entries across 1 versions & 1 rubygems
Version | Path |
---|---|
html2rss-0.17.0 | lib/html2rss/auto_source/scraper/semantic_html/image.rb |