lib/micro_micro/document.rb in micromicro-0.1.0 vs lib/micro_micro/document.rb in micromicro-1.0.0
- old
+ new
@@ -1,71 +1,151 @@
module MicroMicro
class Document
- # @param markup [String] the HTML to parse
- # @param base_url [String] the URL associated with the provided markup
+ # Maps each HTML image candidate string attribute (`imagesrcset`, `srcset`) to the element names it is looked up on
+ #
+ # @see https://html.spec.whatwg.org/#srcset-attributes
+ # @see https://html.spec.whatwg.org/#attributes-3
+ HTML_IMAGE_CANDIDATE_STRINGS_ATTRIBUTES_MAP = {
+ 'imagesrcset' => %w[link],
+ 'srcset' => %w[img source]
+ }.freeze # freezes the Hash itself; the nested Arrays are not frozen by this call
+
+ # Maps each URL-valued HTML attribute to the element names it is looked up on
+ #
+ # @see https://html.spec.whatwg.org/#attributes-3
+ HTML_URL_ATTRIBUTES_MAP = {
+ 'action' => %w[form],
+ 'cite' => %w[blockquote del ins q],
+ 'data' => %w[object],
+ 'formaction' => %w[button input],
+ 'href' => %w[a area base link],
+ 'manifest' => %w[html],
+ 'ping' => %w[a area], # NOTE(review): the HTML spec lists `ping` as space-separated URLs, yet the whole value is resolved as one URL - confirm
+ 'poster' => %w[video],
+ 'src' => %w[audio embed iframe img input script source track video]
+ }.freeze # freezes the Hash itself; the nested Arrays are not frozen by this call
+
+ # Parse a string of HTML for microformats2-encoded data.
+ #
+ # MicroMicro::Document.new('<a href="/" class="h-card" rel="me">Jason Garber</a>', 'https://sixtwothree.org')
+ #
+ # Or, pull the source HTML of a page on the Web:
+ #
+ # url = 'https://tantek.com'
+ # markup = Net::HTTP.get(URI.parse(url))
+ #
+ # doc = MicroMicro::Document.new(markup, url)
+ #
+ # @param markup [String] The HTML to parse for microformats2-encoded data.
+ # @param base_url [String] The URL associated with markup. Used for relative URL resolution.
def initialize(markup, base_url)
@markup = markup
@base_url = base_url
+
+ resolve_relative_urls # parses the markup right away and rewrites URL attributes as absolute URLs
end
# @return [String] the class name, the object id formatted as hexadecimal, and the inspected items and relationships
def inspect
- format(%(#<#{self.class.name}:%#0x items: #{items.inspect}, relations: #{relations.inspect}>), object_id)
+ format(%(#<#{self.class.name}:%#0x items: #{items.inspect}, relationships: #{relationships.inspect}>), object_id)
end
+ # A collection of items parsed from the provided markup.
+ #
# @return [MicroMicro::Collections::ItemsCollection] built from the parsed document on first call, then reused
def items
  return @items if @items

  parsed_items = Item.items_from(document)
  @items = Collections::ItemsCollection.new(parsed_items)
end
- # @return [MicroMicro::Collections::RelationsCollection]
- def relations
- @relations ||= Collections::RelationsCollection.new(Relation.relations_from(document))
+ # A collection of relationships parsed from the provided markup, built on first call and reused afterwards.
+ #
+ # @return [MicroMicro::Collections::RelationshipsCollection]
+ def relationships
+ @relationships ||= Collections::RelationshipsCollection.new(Relationship.relationships_from(document))
end
- # @see microformats2 Parsing Specification section 1.1
+ # Return the parsed document as a Hash with the keys :items, :rels, and :'rel-urls'.
+ #
# @see http://microformats.org/wiki/microformats2-parsing#parse_a_document_for_microformats
#
- # @return [Hash]
+ # @return [Hash{Symbol => Array, Hash}]
def to_h
{
items: items.to_a,
- rels: relations.group_by_rel,
- 'rel-urls': relations.group_by_url
+ rels: relationships.group_by_rel,
+ 'rel-urls': relationships.group_by_url
}
end
+ # Ignore this node?
+ #
# @param node [Nokogiri::XML::Element] the node whose name is checked
# @return [Boolean] true when the node's name is one of the ignored element names
def self.ignore_node?(node)
  node_name = node.name

  ignored_node_names.any? { |ignored_name| ignored_name == node_name }
end
+ # A list of HTML element names the parser should ignore.
+ #
# @return [Array<String>] a new Array of the ignored element names on every call
def self.ignored_node_names
  ['script', 'style', 'template']
end
+ # @see http://microformats.org/wiki/microformats2-parsing#parse_an_element_for_properties
+ # @see http://microformats.org/wiki/microformats2-parsing#parsing_for_implied_properties
+ #
+ # @param context [Nokogiri::HTML::Document, Nokogiri::XML::NodeSet, Nokogiri::XML::Element]
+ # @yield [context]
+ # @return [String]
+ def self.text_content_from(context)
+ context.css(*ignored_node_names).unlink
+
+ yield(context) if block_given?
+
+ context.text.strip
+ end
+
private
attr_reader :base_url, :markup
# The first `base` element with an `href` attribute found in the markup.
#
# The markup is parsed here independently of #document. A nil result is not
# cached by `||=`, so the lookup is repeated on each call when no such element exists.
#
# @return [Nokogiri::XML::Element, nil]
def base_element
- @base_element ||= Nokogiri::HTML(markup).at_css('base[href]')
+ @base_element ||= Nokogiri::HTML(markup).at('//base[@href]')
end
# @return [Nokogiri::HTML::Document] the markup parsed with the resolved base URL; parsed once, then reused
def document
  return @document if @document

  @document = Nokogiri::HTML(markup, resolved_base_url)
end
+ def resolve_relative_urls
+ HTML_URL_ATTRIBUTES_MAP.each do |attribute, names|
+ document.xpath(*names.map { |name| "//#{name}[@#{attribute}]" }).each do |node|
+ node[attribute] = Absolutely.to_abs(base: resolved_base_url, relative: node[attribute].strip)
+ end
+ end
+
+ HTML_IMAGE_CANDIDATE_STRINGS_ATTRIBUTES_MAP.each do |attribute, names|
+ document.xpath(*names.map { |name| "//#{name}[@#{attribute}]" }).each do |node|
+ candidates = node[attribute].split(',').map(&:strip).map { |candidate| candidate.match(/^(?<url>.+?)(?<descriptor>\s+.+)?$/) }
+
+ node[attribute] = candidates.map { |candidate| "#{Absolutely.to_abs(base: resolved_base_url, relative: candidate[:url])}#{candidate[:descriptor]}" }.join(', ')
+ end
+ end
+
+ self
+ end
+
# The URL used as the base when resolving relative URLs.
#
# When the markup has a `base` element with an `href`, that `href` (whitespace-stripped
# in the new revision) is resolved against `base_url`; otherwise `base_url` is used as is.
#
# @return [String]
def resolved_base_url
@resolved_base_url ||= begin
- return base_url unless base_element
-
- Absolutely.to_abs(base: base_url, relative: base_element['href'])
+ if base_element
+ Absolutely.to_abs(base: base_url, relative: base_element['href'].strip)
+ else
+ base_url
+ end
end
end
end
end