# frozen_string_literal: true

module Nokogiri
  module XML
    module SAX
      ###
      # This parser is a SAX style parser that reads its input as it deems necessary. The parser
      # takes a Nokogiri::XML::SAX::Document, an optional encoding, then given an XML input, sends
      # messages to the Nokogiri::XML::SAX::Document.
      #
      # Here is an example of using this parser:
      #
      #   # Create a subclass of Nokogiri::XML::SAX::Document and implement
      #   # the events we care about:
      #   class MyHandler < Nokogiri::XML::SAX::Document
      #     def start_element name, attrs = []
      #       puts "starting: #{name}"
      #     end
      #
      #     def end_element name
      #       puts "ending: #{name}"
      #     end
      #   end
      #
      #   parser = Nokogiri::XML::SAX::Parser.new(MyHandler.new)
      #
      #   # Hand an IO object to the parser, which will read the XML from the IO.
      #   File.open(path_to_xml) do |f|
      #     parser.parse(f)
      #   end
      #
      # For more information about \SAX parsers, see Nokogiri::XML::SAX.
      #
      # Also see Nokogiri::XML::SAX::Document for the available events.
      #
      # For \HTML documents, use the subclass Nokogiri::HTML4::SAX::Parser.
      #
      class Parser
        # to dynamically resolve ParserContext in inherited methods
        include Nokogiri::ClassResolver

        # Structure used for marshalling attributes for some callbacks in XML::SAX::Document.
        class Attribute < Struct.new(:localname, :prefix, :uri, :value)
        end

        ENCODINGS = { # :nodoc:
          "NONE" => 0, # No char encoding detected
          "UTF-8" => 1, # UTF-8
          "UTF16LE" => 2, # UTF-16 little endian
          "UTF16BE" => 3, # UTF-16 big endian
          "UCS4LE" => 4, # UCS-4 little endian
          "UCS4BE" => 5, # UCS-4 big endian
          "EBCDIC" => 6, # EBCDIC uh!
          "UCS4-2143" => 7, # UCS-4 unusual ordering
          "UCS4-3412" => 8, # UCS-4 unusual ordering
          "UCS2" => 9, # UCS-2
          "ISO-8859-1" => 10, # ISO-8859-1 ISO Latin 1
          "ISO-8859-2" => 11, # ISO-8859-2 ISO Latin 2
          "ISO-8859-3" => 12, # ISO-8859-3
          "ISO-8859-4" => 13, # ISO-8859-4
          "ISO-8859-5" => 14, # ISO-8859-5
          "ISO-8859-6" => 15, # ISO-8859-6
          "ISO-8859-7" => 16, # ISO-8859-7
          "ISO-8859-8" => 17, # ISO-8859-8
          "ISO-8859-9" => 18, # ISO-8859-9
          "ISO-2022-JP" => 19, # ISO-2022-JP
          "SHIFT-JIS" => 20, # Shift_JIS
          "EUC-JP" => 21, # EUC-JP
          "ASCII" => 22, # pure ASCII
        }
        # Built before ENCODINGS is marked deprecated, so this reference does not emit a warning.
        REVERSE_ENCODINGS = ENCODINGS.invert # :nodoc:
        deprecate_constant :ENCODINGS

        # The Nokogiri::XML::SAX::Document where events will be sent.
        attr_accessor :document

        # The encoding being used for this document.
        attr_accessor :encoding

        ###
        # :call-seq:
        #   new ⇒ SAX::Parser
        #   new(handler) ⇒ SAX::Parser
        #   new(handler, encoding) ⇒ SAX::Parser
        #
        # Create a new Parser.
        #
        # [Parameters]
        # - +handler+ (optional Nokogiri::XML::SAX::Document) The document that will receive
        #   events. Will create a new Nokogiri::XML::SAX::Document if not given, which is accessible
        #   through the #document attribute.
        # - +encoding+ (optional Encoding, String, nil) An Encoding or encoding name to use when
        #   parsing the input. (default +nil+ for auto-detection)
        #
        def initialize(doc = Nokogiri::XML::SAX::Document.new, encoding = nil)
          @encoding = encoding
          @document = doc
          @warned = false

          # NOTE(review): initialize_native is defined outside this file (presumably by the
          # native extension); it is skipped when running on JRuby.
          initialize_native unless Nokogiri.jruby?
        end

        ###
        # :call-seq:
        #   parse(input) { |parser_context| ... }
        #
        # Parse the input, sending events to the SAX::Document at #document.
        #
        # [Parameters]
        # - +input+ (String, IO) The input to parse.
        #
        # If +input+ quacks like a readable IO object, this method forwards to Parser.parse_io,
        # otherwise it forwards to Parser.parse_memory.
        #
        # [Yields]
        # If a block is given, the underlying ParserContext object will be yielded. This can be used
        # to set options on the parser context before parsing begins.
        #
        def parse(input, &block)
          # Duck-typed IO check: anything that can be both read and closed is treated as a stream.
          if input.respond_to?(:read) && input.respond_to?(:close)
            parse_io(input, &block)
          else
            parse_memory(input, &block)
          end
        end

        ###
        # :call-seq:
        #   parse_io(io) { |parser_context| ... }
        #   parse_io(io, encoding) { |parser_context| ... }
        #
        # Parse an input stream.
        #
        # [Parameters]
        # - +io+ (IO) The readable IO object from which to read input
        # - +encoding+ (optional Encoding, String, nil) An Encoding or encoding name to use when
        #   parsing the input, or +nil+ for auto-detection. (default #encoding)
        #
        # [Yields]
        # If a block is given, the underlying ParserContext object will be yielded. This can be used
        # to set options on the parser context before parsing begins.
        #
        def parse_io(io, encoding = @encoding)
          ctx = related_class("ParserContext").io(io, encoding)
          yield ctx if block_given?
          ctx.parse_with(self)
        end

        ###
        # :call-seq:
        #   parse_memory(input) { |parser_context| ... }
        #   parse_memory(input, encoding) { |parser_context| ... }
        #
        # Parse an input string.
        #
        # [Parameters]
        # - +input+ (String) The input string to be parsed.
        # - +encoding+ (optional Encoding, String, nil) An Encoding or encoding name to use when
        #   parsing the input, or +nil+ for auto-detection. (default #encoding)
        #
        # [Yields]
        # If a block is given, the underlying ParserContext object will be yielded. This can be used
        # to set options on the parser context before parsing begins.
        #
        def parse_memory(input, encoding = @encoding)
          ctx = related_class("ParserContext").memory(input, encoding)
          yield ctx if block_given?
          ctx.parse_with(self)
        end

        ###
        # :call-seq:
        #   parse_file(filename) { |parser_context| ... }
        #   parse_file(filename, encoding) { |parser_context| ... }
        #
        # Parse a file.
        #
        # [Parameters]
        # - +filename+ (String) The path to the file to be parsed.
        # - +encoding+ (optional Encoding, String, nil) An Encoding or encoding name to use when
        #   parsing the input, or +nil+ for auto-detection. (default #encoding)
        #
        # [Yields]
        # If a block is given, the underlying ParserContext object will be yielded. This can be used
        # to set options on the parser context before parsing begins.
        #
        # [Raises]
        # - ArgumentError if +filename+ is +nil+ or +false+
        # - Errno::ENOENT if +filename+ does not exist
        # - Errno::EISDIR if +filename+ is a directory
        #
        def parse_file(filename, encoding = @encoding)
          raise ArgumentError, "no filename provided" unless filename

          # Name the offending path in the error so the failure is diagnosable. The explicit
          # #to_s is needed because SystemCallError requires a String message, and path objects
          # such as Pathname do not convert implicitly.
          raise Errno::ENOENT, filename.to_s unless File.exist?(filename)
          raise Errno::EISDIR, filename.to_s if File.directory?(filename)

          ctx = related_class("ParserContext").file(filename, encoding)
          yield ctx if block_given?
          ctx.parse_with(self)
        end
      end
    end
  end
end