# frozen_string_literal: true
require_relative '../parseexception'
require_relative '../undefinednamespaceexception'
require_relative '../source'
require 'set'
require "strscan"

module REXML
  module Parsers
    # Older strscan releases lack StringScanner#captures; provide it through a
    # refinement so it is only visible inside this file.
    if StringScanner::Version < "3.0.8"
      module StringScannerCaptures
        refine StringScanner do
          def captures
            values_at(*(1...size))
          end
        end
      end
      using StringScannerCaptures
    end

    # = Using the Pull Parser
    # This API is experimental, and subject to change.
    #  parser = PullParser.new( "<a>text<b att='val'/>txet</a>" )
    #  while parser.has_next?
    #    res = parser.next
    #    puts res[1]['att'] if res.start_tag? and res[0] == 'b'
    #  end
    # See the PullEvent class for information on the content of the results.
    # The data is identical to the arguments passed for the various events to
    # the StreamListener API.
    #
    # Notice that:
    #  parser = PullParser.new( "BAD DOCUMENT" )
    #  while parser.has_next?
    #    res = parser.next
    #    raise res[1] if res.error?
    #  end
    #
    # Nat Price gave me some good ideas for the API.
    class BaseParser
      LETTER = '[:alpha:]'
      DIGIT = '[:digit:]'

      COMBININGCHAR = '' # TODO
      EXTENDER = ''      # TODO

      NCNAME_STR= "[#{LETTER}_][-[:alnum:]._#{COMBININGCHAR}#{EXTENDER}]*"
      QNAME_STR= "(?:(#{NCNAME_STR}):)?(#{NCNAME_STR})"
      QNAME = /(#{QNAME_STR})/

      # Just for backward compatibility. For example, kramdown uses this.
      # It's not used in REXML.
      UNAME_STR= "(?:#{NCNAME_STR}:)?#{NCNAME_STR}"

      NAMECHAR = '[\-\w\.:]'
      NAME = "([\\w:]#{NAMECHAR}*)"
      NMTOKEN = "(?:#{NAMECHAR})+"
      NMTOKENS = "#{NMTOKEN}(\\s+#{NMTOKEN})*"
      REFERENCE = "&(?:#{NAME};|#\\d+;|#x[0-9a-fA-F]+;)"
      REFERENCE_RE = /#{REFERENCE}/

      DOCTYPE_START = /\A\s*<!DOCTYPE\s/um
      DOCTYPE_END = /\A\s*\]\s*>/um
      ATTRIBUTE_PATTERN = /\s*(#{QNAME_STR})\s*=\s*(["'])(.*?)\4/um
      COMMENT_START = /\A<!--/u
      COMMENT_PATTERN = /<!--(.*?)-->/um
      CDATA_START = /\A<!\[CDATA\[/u
      CDATA_END = /\A\s*\]\s*>/um
      CDATA_PATTERN = /<!\[CDATA\[(.*?)\]\]>/um
      XMLDECL_START = /\A<\?xml\s/u;
      XMLDECL_PATTERN = /<\?xml\s+(.*?)\?>/um
      INSTRUCTION_START = /\A<\?/u
      INSTRUCTION_PATTERN = /<\?#{NAME}(\s+.*?)?\?>/um
      TAG_MATCH = /\A<((?>#{QNAME_STR}))/um
      CLOSE_MATCH = /\A\s*<\/(#{QNAME_STR})\s*>/um

      VERSION = /\bversion\s*=\s*["'](.*?)['"]/um
      ENCODING = /\bencoding\s*=\s*["'](.*?)['"]/um
      STANDALONE = /\bstandalone\s*=\s*["'](.*?)['"]/um

      ENTITY_START = /\A\s*<!ENTITY/
      ELEMENTDECL_START = /\A\s*<!ELEMENT/um
      ELEMENTDECL_PATTERN = /\A\s*(<!ELEMENT.*?)>/um
      SYSTEMENTITY = /\A\s*(%.*?;)\s*$/um
      ENUMERATION = "\\(\\s*#{NMTOKEN}(?:\\s*\\|\\s*#{NMTOKEN})*\\s*\\)"
      NOTATIONTYPE = "NOTATION\\s+\\(\\s*#{NAME}(?:\\s*\\|\\s*#{NAME})*\\s*\\)"
      ENUMERATEDTYPE = "(?:(?:#{NOTATIONTYPE})|(?:#{ENUMERATION}))"
      ATTTYPE = "(CDATA|ID|IDREF|IDREFS|ENTITY|ENTITIES|NMTOKEN|NMTOKENS|#{ENUMERATEDTYPE})"
      ATTVALUE = "(?:\"((?:[^<&\"]|#{REFERENCE})*)\")|(?:'((?:[^<&']|#{REFERENCE})*)')"
      DEFAULTDECL = "(#REQUIRED|#IMPLIED|(?:(#FIXED\\s+)?#{ATTVALUE}))"
      ATTDEF = "\\s+#{NAME}\\s+#{ATTTYPE}\\s+#{DEFAULTDECL}"
      ATTDEF_RE = /#{ATTDEF}/
      ATTLISTDECL_START = /\A\s*<!ATTLIST/um
      ATTLISTDECL_PATTERN = /\A\s*<!ATTLIST\s+#{NAME}(?:#{ATTDEF})*\s*>/um

      TEXT_PATTERN = /\A([^<]*)/um

      # Entity constants
      PUBIDCHAR = "\x20\x0D\x0Aa-zA-Z0-9\\-()+,./:=?;!*@$_%#"
      SYSTEMLITERAL = %Q{((?:"[^"]*")|(?:'[^']*'))}
      PUBIDLITERAL = %Q{("[#{PUBIDCHAR}']*"|'[#{PUBIDCHAR}]*')}
      EXTERNALID = "(?:(?:(SYSTEM)\\s+#{SYSTEMLITERAL})|(?:(PUBLIC)\\s+#{PUBIDLITERAL}\\s+#{SYSTEMLITERAL}))"
      NDATADECL = "\\s+NDATA\\s+#{NAME}"
      PEREFERENCE = "%#{NAME};"
      ENTITYVALUE = %Q{((?:"(?:[^%&"]|#{PEREFERENCE}|#{REFERENCE})*")|(?:'([^%&']|#{PEREFERENCE}|#{REFERENCE})*'))}
      PEDEF = "(?:#{ENTITYVALUE}|#{EXTERNALID})"
      ENTITYDEF = "(?:#{ENTITYVALUE}|(?:#{EXTERNALID}(#{NDATADECL})?))"
      PEDECL = "<!ENTITY\\s+(%)\\s+#{NAME}\\s+#{PEDEF}\\s*>"
      GEDECL = "<!ENTITY\\s+#{NAME}\\s+#{ENTITYDEF}\\s*>"
      ENTITYDECL = /\s*(?:#{GEDECL})|\s*(?:#{PEDECL})/um

      NOTATIONDECL_START = /\A\s*<!NOTATION/um
      EXTERNAL_ID_PUBLIC = /\A\s*PUBLIC\s+#{PUBIDLITERAL}\s+#{SYSTEMLITERAL}\s*/um
      EXTERNAL_ID_SYSTEM = /\A\s*SYSTEM\s+#{SYSTEMLITERAL}\s*/um
      PUBLIC_ID = /\A\s*PUBLIC\s+#{PUBIDLITERAL}\s*/um

      # An ampersand that does not start a named entity reference.
      EREFERENCE = /&(?!#{NAME};)/

      # name => [escaped pattern, escaped text, raw character, raw pattern]
      DEFAULT_ENTITIES = {
        'gt' => [/&gt;/, '&gt;', '>', />/],
        'lt' => [/&lt;/, '&lt;', '<', /</],
        'quot' => [/&quot;/, '&quot;', '"', /"/],
        "apos" => [/&apos;/, '&apos;', "'", /'/]
      }

      # Patterns applied after the leading markup characters have already been
      # consumed from the source by #pull_event.
      module Private
        INSTRUCTION_END = /#{NAME}(\s+.*?)?\?>/um
        TAG_PATTERN = /((?>#{QNAME_STR}))\s*/um
        CLOSE_PATTERN = /(#{QNAME_STR})\s*>/um
        ATTLISTDECL_END = /\s+#{NAME}(?:#{ATTDEF})*\s*>/um
        NAME_PATTERN = /\s*#{NAME}/um
        GEDECL_PATTERN = "\\s+#{NAME}\\s+#{ENTITYDEF}\\s*>"
        PEDECL_PATTERN = "\\s+(%)\\s+#{NAME}\\s+#{PEDEF}\\s*>"
        ENTITYDECL_PATTERN = /(?:#{GEDECL_PATTERN})|(?:#{PEDECL_PATTERN})/um
      end
      private_constant :Private
      include Private

      # Creates a parser reading from +source+ (anything accepted by
      # SourceFactory.create_from) with an empty listener list.
      def initialize( source )
        self.stream = source
        @listeners = []
      end

      # Registers +listener+; every event returned by #pull is also passed to
      # its +receive+ method.
      def add_listener( listener )
        @listeners << listener
      end

      attr_reader :source

      # Replaces the input and resets all parsing state (open tags, pushed-back
      # events, namespace stack, document status).
      def stream=( source )
        @source = SourceFactory.create_from( source )
        @closed = nil
        @document_status = nil
        @tags = []
        @stack = []
        @entities = []
        @nsstack = []
      end

      # Returns the source's position when it exposes one, otherwise 0.
      def position
        if @source.respond_to? :position
          @source.position
        else
          # FIXME
          0
        end
      end

      # Returns true if there are no more events
      def empty?
        return (@source.empty? and @stack.empty?)
      end

      # Returns true if there are more events.  Synonymous with !empty?
      def has_next?
        return !(@source.empty? and @stack.empty?)
      end

      # Push an event back on the head of the stream.  This method
      # has (theoretically) infinite depth.
      def unshift token
        @stack.unshift(token)
      end

      # Peek at the +depth+ event in the stack.  The first element on the stack
      # is at depth 0.  If +depth+ is -1, will parse to the end of the input
      # stream and return the last event, which is always :end_document.
      # Be aware that this causes the stream to be parsed up to the +depth+
      # event, so you can effectively pre-parse the entire document (pull the
      # entire thing into memory) using this method.
      def peek depth=0
        raise %Q[Illegal argument "#{depth}"] if depth < -1
        temp = []
        if depth == -1
          temp.push(pull()) until empty?
        else
          while @stack.size+temp.size < depth+1
            temp.push(pull())
          end
        end
        @stack += temp if temp.size > 0
        @stack[depth]
      end

      # Returns the next event.  This is a +PullEvent+ object.
      # Every registered listener receives the event before it is returned.
      def pull
        pull_event.tap do |event|
          @listeners.each do |listener|
            listener.receive event
          end
        end
      end

      # Produces the next raw event array (e.g. [:start_element, tag, attrs]).
      #
      # Order of precedence: a pending end tag for a self-closed element,
      # :end_document when everything is exhausted, pushed-back events, then
      # fresh input. Fresh input is interpreted according to @document_status:
      # nil (prolog), :in_doctype (internal subset), :after_doctype, or
      # :in_element.
      #
      # Raises REXML::ParseException (or UndefinedNamespaceException) on
      # malformed input; other errors raised while parsing element content are
      # wrapped in a ParseException.
      def pull_event
        if @closed
          x, @closed = @closed, nil
          return [ :end_element, x ]
        end
        return [ :end_document ] if empty?
        return @stack.shift if @stack.size > 0
        #STDERR.puts @source.encoding
        #STDERR.puts "BUFFER = #{@source.buffer.inspect}"

        @source.ensure_buffer
        if @document_status == nil
          start_position = @source.position
          if @source.match("<?", true)
            return process_instruction(start_position)
          elsif @source.match("<!", true)
            if @source.match("--", true)
              return [ :comment, @source.match(/(.*?)-->/um, true)[1] ]
            elsif @source.match("DOCTYPE", true)
              base_error_message = "Malformed DOCTYPE"
              unless @source.match(/\s+/um, true)
                if @source.match(">")
                  message = "#{base_error_message}: name is missing"
                else
                  message = "#{base_error_message}: invalid name"
                end
                @source.position = start_position
                raise REXML::ParseException.new(message, @source)
              end
              @nsstack.unshift(curr_ns=Set.new)
              name = parse_name(base_error_message)
              if @source.match(/\s*\[/um, true)
                id = [nil, nil, nil]
                @document_status = :in_doctype
              elsif @source.match(/\s*>/um, true)
                id = [nil, nil, nil]
                @document_status = :after_doctype
                @source.ensure_buffer
              else
                id = parse_id(base_error_message,
                              accept_external_id: true,
                              accept_public_id: false)
                if id[0] == "SYSTEM"
                  # For backward compatibility
                  id[1], id[2] = id[2], nil
                end
                if @source.match(/\s*\[/um, true)
                  @document_status = :in_doctype
                elsif @source.match(/\s*>/um, true)
                  @document_status = :after_doctype
                  @source.ensure_buffer
                else
                  message = "#{base_error_message}: garbage after external ID"
                  raise REXML::ParseException.new(message, @source)
                end
              end
              args = [:start_doctype, name, *id]
              if @document_status == :after_doctype
                # No internal subset: queue the matching :end_doctype now.
                @source.match(/\s*/um, true)
                @stack << [ :end_doctype ]
              end
              return args
            else
              message = "Invalid XML"
              raise REXML::ParseException.new(message, @source)
            end
          end
        end
        if @document_status == :in_doctype
          @source.match(/\s*/um, true) # skip spaces
          start_position = @source.position
          if @source.match("<!", true)
            if @source.match("ELEMENT", true)
              md = @source.match(/(.*?)>/um, true)
              raise REXML::ParseException.new( "Bad ELEMENT declaration!", @source ) if md.nil?
              return [ :elementdecl, "<!ELEMENT" + md[1] ]
            elsif @source.match("ENTITY", true)
              match = [:entitydecl, *@source.match(ENTITYDECL_PATTERN, true).captures.compact]
              ref = false
              if match[1] == '%'
                ref = true
                match.delete_at 1
              end
              # Now we have to sort out what kind of entity reference this is
              if match[2] == 'SYSTEM'
                # External reference
                match[3] = match[3][1..-2] # PUBID
                match.delete_at(4) if match.size > 4 # Chop out NDATA decl
                # match is [ :entity, name, SYSTEM, pubid(, ndata)? ]
              elsif match[2] == 'PUBLIC'
                # External reference
                match[3] = match[3][1..-2] # PUBID
                match[4] = match[4][1..-2] # HREF
                match.delete_at(5) if match.size > 5 # Chop out NDATA decl
                # match is [ :entity, name, PUBLIC, pubid, href(, ndata)? ]
              else
                match[2] = match[2][1..-2]
                match.pop if match.size == 4
                # match is [ :entity, name, value ]
              end
              match << '%' if ref
              return match
            elsif @source.match("ATTLIST", true)
              md = @source.match(ATTLISTDECL_END, true)
              raise REXML::ParseException.new( "Bad ATTLIST declaration!", @source ) if md.nil?
              element = md[1]
              contents = md[0]

              pairs = {}
              values = md[0].scan( ATTDEF_RE )
              values.each do |attdef|
                unless attdef[3] == "#IMPLIED"
                  attdef.compact!
                  val = attdef[3]
                  val = attdef[4] if val == "#FIXED "
                  pairs[attdef[0]] = val
                  if attdef[0] =~ /^xmlns:(.*)/
                    @nsstack[0] << $1
                  end
                end
              end
              return [ :attlistdecl, element, pairs, contents ]
            elsif @source.match("NOTATION", true)
              base_error_message = "Malformed notation declaration"
              unless @source.match(/\s+/um, true)
                if @source.match(">")
                  message = "#{base_error_message}: name is missing"
                else
                  message = "#{base_error_message}: invalid name"
                end
                @source.position = start_position
                raise REXML::ParseException.new(message, @source)
              end
              name = parse_name(base_error_message)
              id = parse_id(base_error_message,
                            accept_external_id: true,
                            accept_public_id: true)
              unless @source.match(/\s*>/um, true)
                message = "#{base_error_message}: garbage before end >"
                raise REXML::ParseException.new(message, @source)
              end
              return [:notationdecl, name, *id]
            elsif md = @source.match(/--(.*?)-->/um, true)
              case md[1]
              when /--/, /-\z/
                raise REXML::ParseException.new("Malformed comment", @source)
              end
              return [ :comment, md[1] ] if md
            end
          elsif match = @source.match(/(%.*?;)\s*/um, true)
            return [ :externalentity, match[1] ]
          elsif @source.match(/\]\s*>/um, true)
            @document_status = :after_doctype
            return [ :end_doctype ]
          end
        end
        if @document_status == :after_doctype
          @source.match(/\s*/um, true)
        end
        begin
          start_position = @source.position
          if @source.match("<", true)
            # :text's read_until may remain only "<" in buffer. In the
            # case, buffer is empty here. So we need to fill buffer
            # here explicitly.
            @source.ensure_buffer
            if @source.match("/", true)
              @nsstack.shift
              last_tag = @tags.pop
              md = @source.match(CLOSE_PATTERN, true)
              if md and !last_tag
                message = "Unexpected top-level end tag (got '#{md[1]}')"
                raise REXML::ParseException.new(message, @source)
              end
              if md.nil? or last_tag != md[1]
                message = "Missing end tag for '#{last_tag}'"
                message += " (got '#{md[1]}')" if md
                @source.position = start_position if md.nil?
                raise REXML::ParseException.new(message, @source)
              end
              return [ :end_element, last_tag ]
            elsif @source.match("!", true)
              md = @source.match(/([^>]*>)/um)
              #STDERR.puts "SOURCE BUFFER = #{source.buffer}, #{source.buffer.size}"
              raise REXML::ParseException.new("Malformed node", @source) unless md
              if md[0][0] == ?-
                md = @source.match(/--(.*?)-->/um, true)

                case md[1]
                when /--/, /-\z/
                  raise REXML::ParseException.new("Malformed comment", @source)
                end

                return [ :comment, md[1] ] if md
              else
                md = @source.match(/\[CDATA\[(.*?)\]\]>/um, true)
                return [ :cdata, md[1] ] if md
              end
              raise REXML::ParseException.new( "Declarations can only occur "+
                "in the doctype declaration.", @source)
            elsif @source.match("?", true)
              return process_instruction(start_position)
            else
              # Get the next tag
              md = @source.match(TAG_PATTERN, true)
              unless md
                @source.position = start_position
                raise REXML::ParseException.new("malformed XML: missing tag start", @source)
              end
              tag = md[1]
              @document_status = :in_element
              prefixes = Set.new
              prefixes << md[2] if md[2]
              @nsstack.unshift(curr_ns=Set.new)
              attributes, closed = parse_attributes(prefixes, curr_ns)
              # Verify that all of the prefixes have been defined
              for prefix in prefixes
                unless @nsstack.find{|k| k.member?(prefix)}
                  raise UndefinedNamespaceException.new(prefix,@source,self)
                end
              end

              if closed
                # Self-closing tag: emit :end_element on the next call.
                @closed = tag
                @nsstack.shift
              else
                @tags.push( tag )
              end
              return [ :start_element, tag, attributes ]
            end
          else
            text = @source.read_until("<")
            if text.chomp!("<")
              # Leave the "<" in the source for the next event.
              @source.position -= "<".bytesize
            end
            return [ :text, text ]
          end
        rescue REXML::UndefinedNamespaceException
          raise
        rescue REXML::ParseException
          raise
        rescue => error
          raise REXML::ParseException.new( "Exception parsing",
            @source, self, (error ? error : $!) )
        end
        return [ :dummy ]
      end
      private :pull_event

      # Looks up +reference+ in +entities+ (when given), falling back to the
      # raw character of a predefined entity. Returns the unnormalized value,
      # or nil when the reference is unknown.
      def entity( reference, entities )
        value = nil
        value = entities[ reference ] if entities
        if not value
          value = DEFAULT_ENTITIES[ reference ]
          value = value[2] if value
        end
        unnormalize( value, entities ) if value
      end

      # Escapes all possible entities
      #
      # +entities+ maps entity names to the values they stand for; each value
      # found in +input+ is replaced by "&name;" unless +entity_filter+
      # includes that name. Returns a new string; +input+ is not modified.
      def normalize( input, entities=nil, entity_filter=nil )
        copy = input.clone
        # Doing it like this rather than in a loop improves the speed
        copy.gsub!( EREFERENCE, '&amp;' )
        entities.each do |key, value|
          # The filter is checked against the entity name being substituted.
          copy.gsub!( value, "&#{key};" ) unless entity_filter and
                                      entity_filter.include?(key)
        end if entities
        copy.gsub!( EREFERENCE, '&amp;' )
        DEFAULT_ENTITIES.each do |key, value|
          copy.gsub!( value[3], value[1] )
        end
        copy
      end

      # Unescapes all possible entities
      #
      # Line endings are normalized to "\n", numeric character references are
      # expanded, then named references are resolved through #entity unless
      # +filter+ includes the name. "&amp;" is expanded last.
      def unnormalize( string, entities=nil, filter=nil )
        rv = string.gsub( /\r\n?/, "\n" )
        matches = rv.scan( REFERENCE_RE )
        return rv if matches.size == 0
        rv.gsub!( /&#0*((?:\d+)|(?:x[a-fA-F0-9]+));/ ) {
          m=$1
          m = "0#{m}" if m[0] == ?x
          [Integer(m)].pack('U*')
        }
        matches.collect!{|x|x[0]}.compact!
        if matches.size > 0
          matches.each do |entity_reference|
            unless filter and filter.include?(entity_reference)
              entity_value = entity( entity_reference, entities )
              if entity_value
                re = /&#{entity_reference};/
                rv.gsub!( re, entity_value )
              else
                er = DEFAULT_ENTITIES[entity_reference]
                rv.gsub!( er[0], er[2] ) if er
              end
            end
          end
          rv.gsub!( /&amp;/, '&' )
        end
        rv
      end

      private

      # True when the XML declaration names an encoding other than UTF-16.
      def need_source_encoding_update?(xml_declaration_encoding)
        return false if xml_declaration_encoding.nil?
        return false if /\AUTF-16\z/i =~ xml_declaration_encoding
        true
      end

      # Consumes and returns a name from the source; raises ParseException
      # prefixed with +base_error_message+ when none is present.
      def parse_name(base_error_message)
        md = @source.match(NAME_PATTERN, true)
        unless md
          if @source.match(/\s*\S/um)
            message = "#{base_error_message}: invalid name"
          else
            message = "#{base_error_message}: name is missing"
          end
          raise REXML::ParseException.new(message, @source)
        end
        md[1]
      end

      # Consumes an external or public identifier and returns
      # [type, public_id, system_id] with the surrounding quotes removed.
      # Raises ParseException when no accepted form matches.
      def parse_id(base_error_message,
                   accept_external_id:,
                   accept_public_id:)
        if accept_external_id and (md = @source.match(EXTERNAL_ID_PUBLIC, true))
          pubid = system = nil
          pubid_literal = md[1]
          pubid = pubid_literal[1..-2] if pubid_literal # Remove quote
          system_literal = md[2]
          system = system_literal[1..-2] if system_literal # Remove quote
          ["PUBLIC", pubid, system]
        elsif accept_public_id and (md = @source.match(PUBLIC_ID, true))
          pubid = system = nil
          pubid_literal = md[1]
          pubid = pubid_literal[1..-2] if pubid_literal # Remove quote
          ["PUBLIC", pubid, nil]
        elsif accept_external_id and (md = @source.match(EXTERNAL_ID_SYSTEM, true))
          system = nil
          system_literal = md[1]
          system = system_literal[1..-2] if system_literal # Remove quote
          ["SYSTEM", nil, system]
        else
          details = parse_id_invalid_details(accept_external_id: accept_external_id,
                                             accept_public_id: accept_public_id)
          message = "#{base_error_message}: #{details}"
          raise REXML::ParseException.new(message, @source)
        end
      end

      # Inspects the source without consuming it and returns a short
      # description of why the identifier could not be parsed.
      def parse_id_invalid_details(accept_external_id:,
                                   accept_public_id:)
        public = /\A\s*PUBLIC/um
        system = /\A\s*SYSTEM/um
        if (accept_external_id or accept_public_id) and @source.match(/#{public}/um)
          if @source.match(/#{public}(?:\s+[^'"]|\s*[\[>])/um)
            return "public ID literal is missing"
          end
          unless @source.match(/#{public}\s+#{PUBIDLITERAL}/um)
            return "invalid public ID literal"
          end
          if accept_public_id
            if @source.match(/#{public}\s+#{PUBIDLITERAL}\s+[^'"]/um)
              return "system ID literal is missing"
            end
            unless @source.match(/#{public}\s+#{PUBIDLITERAL}\s+#{SYSTEMLITERAL}/um)
              return "invalid system literal"
            end
            "garbage after system literal"
          else
            "garbage after public ID literal"
          end
        elsif accept_external_id and @source.match(/#{system}/um)
          if @source.match(/#{system}(?:\s+[^'"]|\s*[\[>])/um)
            return "system literal is missing"
          end
          unless @source.match(/#{system}\s+#{SYSTEMLITERAL}/um)
            return "invalid system literal"
          end
          "garbage after system literal"
        else
          unless @source.match(/\A\s*(?:PUBLIC|SYSTEM)\s/um)
            return "invalid ID type"
          end
          "ID type is missing"
        end
      end

      # Parses the remainder of a processing instruction whose "<?" has been
      # consumed. Returns an :xmldecl event for an "xml" target seen before any
      # other content, otherwise a :processing_instruction event.
      def process_instruction(start_position)
        match_data = @source.match(INSTRUCTION_END, true)
        unless match_data
          message = "Invalid processing instruction node"
          @source.position = start_position
          raise REXML::ParseException.new(message, @source)
        end
        if @document_status.nil? and match_data[1] == "xml"
          content = match_data[2]
          version = VERSION.match(content)
          version = version[1] unless version.nil?
          encoding = ENCODING.match(content)
          encoding = encoding[1] unless encoding.nil?
          if need_source_encoding_update?(encoding)
            @source.encoding = encoding
          end
          if encoding.nil? and /\AUTF-16(?:BE|LE)\z/i =~ @source.encoding
            encoding = "UTF-16"
          end
          standalone = STANDALONE.match(content)
          standalone = standalone[1] unless standalone.nil?
          return [ :xmldecl, version, encoding, standalone ]
        end
        [:processing_instruction, match_data[1], match_data[2]]
      end

      # Parses attributes up to the end of a start tag.
      #
      # Prefixes used by attributes are added to +prefixes+ and prefixes
      # declared via xmlns:* are added to +curr_ns+. Returns
      # [attributes, closed], where +closed+ is true for a "/>" terminator.
      # Raises ParseException on malformed or duplicate attributes.
      def parse_attributes(prefixes, curr_ns)
        attributes = {}
        closed = false
        while true
          if @source.match(">", true)
            return attributes, closed
          elsif @source.match("/>", true)
            closed = true
            return attributes, closed
          elsif match = @source.match(QNAME, true)
            name = match[1]
            prefix = match[2]
            local_part = match[3]

            unless @source.match(/\s*=\s*/um, true)
              message = "Missing attribute equal: <#{name}>"
              raise REXML::ParseException.new(message, @source)
            end
            unless match = @source.match(/(['"])/, true)
              message = "Missing attribute value start quote: <#{name}>"
              raise REXML::ParseException.new(message, @source)
            end
            quote = match[1]
            start_position = @source.position
            value = @source.read_until(quote)
            unless value.chomp!(quote)
              @source.position = start_position
              message = "Missing attribute value end quote: <#{name}>: <#{quote}>"
              raise REXML::ParseException.new(message, @source)
            end
            @source.match(/\s*/um, true)
            if prefix == "xmlns"
              if local_part == "xml"
                if value != "http://www.w3.org/XML/1998/namespace"
                  msg = "The 'xml' prefix must not be bound to any other namespace "+
                    "(http://www.w3.org/TR/REC-xml-names/#ns-decl)"
                  raise REXML::ParseException.new( msg, @source, self )
                end
              elsif local_part == "xmlns"
                msg = "The 'xmlns' prefix must not be declared "+
                  "(http://www.w3.org/TR/REC-xml-names/#ns-decl)"
                raise REXML::ParseException.new( msg, @source, self)
              end
              curr_ns << local_part
            elsif prefix
              prefixes << prefix unless prefix == "xml"
            end

            if attributes[name]
              msg = "Duplicate attribute #{name.inspect}"
              raise REXML::ParseException.new(msg, @source, self)
            end

            attributes[name] = value
          else
            message = "Invalid attribute name: <#{@source.buffer.split(%r{[/>\s]}).first}>"
            raise REXML::ParseException.new(message, @source)
          end
        end
      end
    end
  end
end

=begin
  case event[0]
  when :start_element
  when :text
  when :end_element
  when :processing_instruction
  when :cdata
  when :comment
  when :xmldecl
  when :start_doctype
  when :end_doctype
  when :externalentity
  when :elementdecl
  when :entity
  when :attlistdecl
  when :notationdecl
  when :end_doctype
  end
=end