lib/micro_micro/document.rb in micromicro-1.1.0 vs lib/micro_micro/document.rb in micromicro-2.0.0

- old
+ new

@@ -1,31 +1,9 @@ +# frozen_string_literal: true + module MicroMicro class Document - # A map of HTML `srcset` attributes and their associated element names - # - # @see https://html.spec.whatwg.org/#srcset-attributes - # @see https://html.spec.whatwg.org/#attributes-3 - HTML_IMAGE_CANDIDATE_STRINGS_ATTRIBUTES_MAP = { - 'imagesrcset' => %w[link], - 'srcset' => %w[img source] - }.freeze - - # A map of HTML URL attributes and their associated element names - # - # @see https://html.spec.whatwg.org/#attributes-3 - HTML_URL_ATTRIBUTES_MAP = { - 'action' => %w[form], - 'cite' => %w[blockquote del ins q], - 'data' => %w[object], - 'formaction' => %w[button input], - 'href' => %w[a area base link], - 'manifest' => %w[html], - 'ping' => %w[a area], - 'poster' => %w[video], - 'src' => %w[audio embed iframe img input script source track video] - }.freeze - # Parse a string of HTML for microformats2-encoded data. # # MicroMicro::Document.new('<a href="/" class="h-card" rel="me">Jason Garber</a>', 'https://sixtwothree.org') # # Or, pull the source HTML of a page on the Web: @@ -36,26 +14,27 @@ # doc = MicroMicro::Document.new(markup, url) # # @param markup [String] The HTML to parse for microformats2-encoded data. # @param base_url [String] The URL associated with markup. Used for relative URL resolution. def initialize(markup, base_url) - @markup = markup - @base_url = base_url - - resolve_relative_urls + @document = Nokogiri::HTML(markup, base_url).resolve_relative_urls! end + # :nocov: # @return [String] def inspect - format(%(#<#{self.class.name}:%#0x items: #{items.inspect}, relationships: #{relationships.inspect}>), object_id) + "#<#{self.class}:#{format('%#0x', object_id)} " \ + "items: #{items.inspect}, " \ + "relationships: #{relationships.inspect}>" end + # :nocov: # A collection of items parsed from the provided markup. # # @return [MicroMicro::Collections::ItemsCollection] def items - @items ||= Collections::ItemsCollection.new(Item.items_from(document)) + @items ||= Collections::ItemsCollection.new(Item.from_context(document.element_children)) end # A collection of relationships parsed from the provided markup. # # @return [MicroMicro::Collections::RelationshipsCollection] @@ -74,78 +53,11 @@ rels: relationships.group_by_rel, 'rel-urls': relationships.group_by_url } end - # Ignore this node? - # - # @param node [Nokogiri::XML::Element] - # @return [Boolean] - def self.ignore_node?(node) - ignored_node_names.include?(node.name) - end - - # A list of HTML element names the parser should ignore. - # - # @return [Array<String>] - def self.ignored_node_names - %w[script style template] - end - - # @see https://microformats.org/wiki/microformats2-parsing#parse_an_element_for_properties - # @see https://microformats.org/wiki/microformats2-parsing#parsing_for_implied_properties - # - # @param context [Nokogiri::HTML::Document, Nokogiri::XML::NodeSet, Nokogiri::XML::Element] - # @yield [context] - # @return [String] - def self.text_content_from(context) - context.css(*ignored_node_names).unlink - - yield(context) if block_given? - - context.text.strip - end - private - attr_reader :base_url, :markup - - # @return [Nokogiri::XML::Element, nil] - def base_element - @base_element ||= Nokogiri::HTML(markup).at('//base[@href]') - end - # @return [Nokogiri::HTML::Document] - def document - @document ||= Nokogiri::HTML(markup, resolved_base_url) - end - - def resolve_relative_urls - HTML_URL_ATTRIBUTES_MAP.each do |attribute, names| - document.xpath(*names.map { |name| "//#{name}[@#{attribute}]" }).each do |node| - node[attribute] = Addressable::URI.join(resolved_base_url, node[attribute].strip).normalize.to_s - end - end - - HTML_IMAGE_CANDIDATE_STRINGS_ATTRIBUTES_MAP.each do |attribute, names| - document.xpath(*names.map { |name| "//#{name}[@#{attribute}]" }).each do |node| - candidates = node[attribute].split(',').map(&:strip).map { |candidate| candidate.match(/^(?<url>.+?)(?<descriptor>\s+.+)?$/) } - - node[attribute] = candidates.map { |candidate| "#{Addressable::URI.join(resolved_base_url, candidate[:url]).normalize}#{candidate[:descriptor]}" }.join(', ') - end - end - - self - end - - # @return [String] - def resolved_base_url - @resolved_base_url ||= begin - if base_element - Addressable::URI.join(base_url, base_element['href'].strip).normalize.to_s - else - base_url - end - end - end + attr_reader :document end end