# frozen_string_literal: true require 'pathname' module Nokogiri module HTML4 class Document < Nokogiri::XML::Document ### # Get the meta tag encoding for this document. If there is no meta tag, # then nil is returned. def meta_encoding case when meta = at('//meta[@charset]') meta[:charset] when meta = meta_content_type meta['content'][/charset\s*=\s*([\w-]+)/i, 1] end end ### # Set the meta tag encoding for this document. # # If an meta encoding tag is already present, its content is # replaced with the given text. # # Otherwise, this method tries to create one at an appropriate # place supplying head and/or html elements as necessary, which # is inside a head element if any, and before any text node or # content element (typically
) if any. # # The result when trying to set an encoding that is different # from the document encoding is undefined. # # Beware in CRuby, that libxml2 automatically inserts a meta tag # into a head element. def meta_encoding= encoding case when meta = meta_content_type meta['content'] = 'text/html; charset=%s' % encoding encoding when meta = at('//meta[@charset]') meta['charset'] = encoding else meta = XML::Node.new('meta', self) if dtd = internal_subset and dtd.html5_dtd? meta['charset'] = encoding else meta['http-equiv'] = 'Content-Type' meta['content'] = 'text/html; charset=%s' % encoding end case when head = at('//head') head.prepend_child(meta) else set_metadata_element(meta) end encoding end end def meta_content_type xpath('//meta[@http-equiv and boolean(@content)]').find { |node| node['http-equiv'] =~ /\AContent-Type\z/i } end private :meta_content_type ### # Get the title string of this document. Return nil if there is # no title tag. def title title = at('//title') and title.inner_text end ### # Set the title string of this document. # # If a title element is already present, its content is replaced # with the given text. # # Otherwise, this method tries to create one at an appropriate # place supplying head and/or html elements as necessary, which # is inside a head element if any, right after a meta # encoding/charset tag if any, and before any text node or # content element (typically ) if any. def title=(text) tnode = XML::Text.new(text, self) if title = at('//title') title.children = tnode return text end title = XML::Node.new('title', self) << tnode case when head = at('//head') head << title when meta = at('//meta[@charset]') || meta_content_type # better put after charset declaration meta.add_next_sibling(title) else set_metadata_element(title) end text end def set_metadata_element(element) case when head = at('//head') head << element when html = at('//html') head = html.prepend_child(XML::Node.new('head', self)) head.prepend_child(element) when first = children.find { |node| case node when XML::Element, XML::Text true end } # We reach here only if the underlying document model # allows / elements to be omitted and does not # automatically supply them. first.add_previous_sibling(element) else html = add_child(XML::Node.new('html', self)) head = html.add_child(XML::Node.new('head', self)) head.prepend_child(element) end end private :set_metadata_element #### # Serialize Node using +options+. Save options can also be set using a # block. See SaveOptions. # # These two statements are equivalent: # # node.serialize(:encoding => 'UTF-8', :save_with => FORMAT | AS_XML) # # or # # node.serialize(:encoding => 'UTF-8') do |config| # config.format.as_xml # end # def serialize options = {} options[:save_with] ||= XML::Node::SaveOptions::DEFAULT_HTML super end #### # Create a Nokogiri::XML::DocumentFragment from +tags+ def fragment tags = nil DocumentFragment.new(self, tags, self.root) end class << self ### # Parse HTML. +string_or_io+ may be a String, or any object that # responds to _read_ and _close_ such as an IO, or StringIO. # +url+ is resource where this document is located. +encoding+ is the # encoding that should be used when processing the document. +options+ # is a number that sets options in the parser, such as # Nokogiri::XML::ParseOptions::RECOVER. See the constants in # Nokogiri::XML::ParseOptions. def parse string_or_io, url = nil, encoding = nil, options = XML::ParseOptions::DEFAULT_HTML options = Nokogiri::XML::ParseOptions.new(options) if Integer === options yield options if block_given? url ||= string_or_io.respond_to?(:path) ? string_or_io.path : nil if string_or_io.respond_to?(:encoding) unless string_or_io.encoding.name == "ASCII-8BIT" encoding ||= string_or_io.encoding.name end end if string_or_io.respond_to?(:read) if string_or_io.is_a?(Pathname) # resolve the Pathname to the file and open it as an IO object, see #2110 string_or_io = string_or_io.expand_path.open url ||= string_or_io.path end unless encoding # Libxml2's parser has poor support for encoding # detection. First, it does not recognize the HTML5 # style meta charset declaration. Secondly, even if it # successfully detects an encoding hint, it does not # re-decode or re-parse the preceding part which may be # garbled. # # EncodingReader aims to perform advanced encoding # detection beyond what Libxml2 does, and to emulate # rewinding of a stream and make Libxml2 redo parsing # from the start when an encoding hint is found. string_or_io = EncodingReader.new(string_or_io) begin return read_io(string_or_io, url, encoding, options.to_i) rescue EncodingFound => e encoding = e.found_encoding end end return read_io(string_or_io, url, encoding, options.to_i) end # read_memory pukes on empty docs if string_or_io.nil? or string_or_io.empty? return encoding ? new.tap { |i| i.encoding = encoding } : new end encoding ||= EncodingReader.detect_encoding(string_or_io) read_memory(string_or_io, url, encoding, options.to_i) end end class EncodingFound < StandardError # :nodoc: attr_reader :found_encoding def initialize(encoding) @found_encoding = encoding super("encoding found: %s" % encoding) end end class EncodingReader # :nodoc: class SAXHandler < Nokogiri::XML::SAX::Document # :nodoc: attr_reader :encoding def initialize @encoding = nil super() end def start_element(name, attrs = []) return unless name == 'meta' attr = Hash[attrs] charset = attr['charset'] and @encoding = charset http_equiv = attr['http-equiv'] and http_equiv.match(/\AContent-Type\z/i) and content = attr['content'] and m = content.match(/;\s*charset\s*=\s*([\w-]+)/) and @encoding = m[1] end end class JumpSAXHandler < SAXHandler def initialize(jumptag) @jumptag = jumptag super() end def start_element(name, attrs = []) super throw @jumptag, @encoding if @encoding throw @jumptag, nil if name =~ /\A(?:div|h1|img|p|br)\z/ end end def self.detect_encoding(chunk) m = chunk.match(/\A(<\?xml[ \t\r\n]+[^>]*>)/) and return Nokogiri.XML(m[1]).encoding if Nokogiri.jruby? m = chunk.match(/( 0 rest = @io.read(len) and ret << rest end if ret.empty? nil else ret end end end end end end