# frozen_string_literal: true
require_relative '../parseexception'
require_relative '../undefinednamespaceexception'
require_relative '../source'
require 'set'
require "strscan"
module REXML
module Parsers
if StringScanner::Version < "3.0.8"
module StringScannerCaptures
refine StringScanner do
def captures
values_at(*(1...size))
end
end
end
using StringScannerCaptures
end
# = Using the Pull Parser
# This API is experimental, and subject to change.
# parser = PullParser.new( "texttxet" )
# while parser.has_next?
# res = parser.next
# puts res[1]['att'] if res.start_tag? and res[0] == 'b'
# end
# See the PullEvent class for information on the content of the results.
# The data is identical to the arguments passed for the various events to
# the StreamListener API.
#
# Notice that:
# parser = PullParser.new( "BAD DOCUMENT" )
# while parser.has_next?
# res = parser.next
# raise res[1] if res.error?
# end
#
# Nat Price gave me some good ideas for the API.
class BaseParser
LETTER = '[:alpha:]'
DIGIT = '[:digit:]'
COMBININGCHAR = '' # TODO
EXTENDER = '' # TODO
NCNAME_STR= "[#{LETTER}_][-[:alnum:]._#{COMBININGCHAR}#{EXTENDER}]*"
QNAME_STR= "(?:(#{NCNAME_STR}):)?(#{NCNAME_STR})"
QNAME = /(#{QNAME_STR})/
# Just for backward compatibility. For example, kramdown uses this.
# It's not used in REXML.
UNAME_STR= "(?:#{NCNAME_STR}:)?#{NCNAME_STR}"
NAMECHAR = '[\-\w\.:]'
NAME = "([\\w:]#{NAMECHAR}*)"
NMTOKEN = "(?:#{NAMECHAR})+"
NMTOKENS = "#{NMTOKEN}(\\s+#{NMTOKEN})*"
REFERENCE = "&(?:#{NAME};|#\\d+;|#x[0-9a-fA-F]+;)"
REFERENCE_RE = /#{REFERENCE}/
DOCTYPE_START = /\A\s*/um
ATTRIBUTE_PATTERN = /\s*(#{QNAME_STR})\s*=\s*(["'])(.*?)\4/um
COMMENT_START = /\A/um
CDATA_START = /\A/um
CDATA_PATTERN = //um
XMLDECL_START = /\A<\?xml\s/u;
XMLDECL_PATTERN = /<\?xml\s+(.*?)\?>/um
INSTRUCTION_START = /\A<\?/u
INSTRUCTION_PATTERN = /<\?#{NAME}(\s+.*?)?\?>/um
TAG_MATCH = /\A<((?>#{QNAME_STR}))/um
CLOSE_MATCH = /\A\s*<\/(#{QNAME_STR})\s*>/um
VERSION = /\bversion\s*=\s*["'](.*?)['"]/um
ENCODING = /\bencoding\s*=\s*["'](.*?)['"]/um
STANDALONE = /\bstandalone\s*=\s*["'](.*?)['"]/um
ENTITY_START = /\A\s*/um
SYSTEMENTITY = /\A\s*(%.*?;)\s*$/um
ENUMERATION = "\\(\\s*#{NMTOKEN}(?:\\s*\\|\\s*#{NMTOKEN})*\\s*\\)"
NOTATIONTYPE = "NOTATION\\s+\\(\\s*#{NAME}(?:\\s*\\|\\s*#{NAME})*\\s*\\)"
ENUMERATEDTYPE = "(?:(?:#{NOTATIONTYPE})|(?:#{ENUMERATION}))"
ATTTYPE = "(CDATA|ID|IDREF|IDREFS|ENTITY|ENTITIES|NMTOKEN|NMTOKENS|#{ENUMERATEDTYPE})"
ATTVALUE = "(?:\"((?:[^<&\"]|#{REFERENCE})*)\")|(?:'((?:[^<&']|#{REFERENCE})*)')"
DEFAULTDECL = "(#REQUIRED|#IMPLIED|(?:(#FIXED\\s+)?#{ATTVALUE}))"
ATTDEF = "\\s+#{NAME}\\s+#{ATTTYPE}\\s+#{DEFAULTDECL}"
ATTDEF_RE = /#{ATTDEF}/
ATTLISTDECL_START = /\A\s*/um
TEXT_PATTERN = /\A([^<]*)/um
# Entity constants
PUBIDCHAR = "\x20\x0D\x0Aa-zA-Z0-9\\-()+,./:=?;!*@$_%#"
SYSTEMLITERAL = %Q{((?:"[^"]*")|(?:'[^']*'))}
PUBIDLITERAL = %Q{("[#{PUBIDCHAR}']*"|'[#{PUBIDCHAR}]*')}
EXTERNALID = "(?:(?:(SYSTEM)\\s+#{SYSTEMLITERAL})|(?:(PUBLIC)\\s+#{PUBIDLITERAL}\\s+#{SYSTEMLITERAL}))"
NDATADECL = "\\s+NDATA\\s+#{NAME}"
PEREFERENCE = "%#{NAME};"
ENTITYVALUE = %Q{((?:"(?:[^%&"]|#{PEREFERENCE}|#{REFERENCE})*")|(?:'([^%&']|#{PEREFERENCE}|#{REFERENCE})*'))}
PEDEF = "(?:#{ENTITYVALUE}|#{EXTERNALID})"
ENTITYDEF = "(?:#{ENTITYVALUE}|(?:#{EXTERNALID}(#{NDATADECL})?))"
PEDECL = ""
GEDECL = ""
ENTITYDECL = /\s*(?:#{GEDECL})|\s*(?:#{PEDECL})/um
NOTATIONDECL_START = /\A\s* [/>/, '>', '>', />/],
'lt' => [/</, '<', '<', /],
'quot' => [/"/, '"', '"', /"/],
"apos" => [/'/, "'", "'", /'/]
}
module Private
# Terminal requires two or more letters.
INSTRUCTION_TERM = "?>"
COMMENT_TERM = "-->"
CDATA_TERM = "]]>"
DOCTYPE_TERM = "]>"
# Read to the end of DOCTYPE because there is no proper ENTITY termination
ENTITY_TERM = DOCTYPE_TERM
INSTRUCTION_END = /#{NAME}(\s+.*?)?\?>/um
TAG_PATTERN = /((?>#{QNAME_STR}))\s*/um
CLOSE_PATTERN = /(#{QNAME_STR})\s*>/um
ATTLISTDECL_END = /\s+#{NAME}(?:#{ATTDEF})*\s*>/um
NAME_PATTERN = /\s*#{NAME}/um
GEDECL_PATTERN = "\\s+#{NAME}\\s+#{ENTITYDEF}\\s*>"
PEDECL_PATTERN = "\\s+(%)\\s+#{NAME}\\s+#{PEDEF}\\s*>"
ENTITYDECL_PATTERN = /(?:#{GEDECL_PATTERN})|(?:#{PEDECL_PATTERN})/um
CARRIAGE_RETURN_NEWLINE_PATTERN = /\r\n?/
CHARACTER_REFERENCES = /*((?:\d+)|(?:x[a-fA-F0-9]+));/
DEFAULT_ENTITIES_PATTERNS = {}
default_entities = ['gt', 'lt', 'quot', 'apos', 'amp']
default_entities.each do |term|
DEFAULT_ENTITIES_PATTERNS[term] = /{term};/
end
end
private_constant :Private
def initialize( source )
self.stream = source
@listeners = []
@prefixes = Set.new
end
def add_listener( listener )
@listeners << listener
end
attr_reader :source
def stream=( source )
@source = SourceFactory.create_from( source )
@closed = nil
@have_root = false
@document_status = nil
@tags = []
@stack = []
@entities = []
@nsstack = []
end
def position
if @source.respond_to? :position
@source.position
else
# FIXME
0
end
end
# Returns true if there are no more events
def empty?
return (@source.empty? and @stack.empty?)
end
# Returns true if there are more events. Synonymous with !empty?
def has_next?
return !(@source.empty? and @stack.empty?)
end
# Push an event back on the head of the stream. This method
# has (theoretically) infinite depth.
def unshift token
@stack.unshift(token)
end
# Peek at the +depth+ event in the stack. The first element on the stack
# is at depth 0. If +depth+ is -1, will parse to the end of the input
# stream and return the last event, which is always :end_document.
# Be aware that this causes the stream to be parsed up to the +depth+
# event, so you can effectively pre-parse the entire document (pull the
# entire thing into memory) using this method.
def peek depth=0
raise %Q[Illegal argument "#{depth}"] if depth < -1
temp = []
if depth == -1
temp.push(pull()) until empty?
else
while @stack.size+temp.size < depth+1
temp.push(pull())
end
end
@stack += temp if temp.size > 0
@stack[depth]
end
# Returns the next event. This is a +PullEvent+ object.
def pull
@source.drop_parsed_content
pull_event.tap do |event|
@listeners.each do |listener|
listener.receive event
end
end
end
def pull_event
if @closed
x, @closed = @closed, nil
return [ :end_element, x ]
end
if empty?
if @document_status == :in_doctype
raise ParseException.new("Malformed DOCTYPE: unclosed", @source)
end
return [ :end_document ]
end
return @stack.shift if @stack.size > 0
#STDERR.puts @source.encoding
#STDERR.puts "BUFFER = #{@source.buffer.inspect}"
@source.ensure_buffer
if @document_status == nil
start_position = @source.position
if @source.match("", true)
return process_instruction(start_position)
elsif @source.match("/um, true, term: Private::COMMENT_TERM)
if md.nil?
raise REXML::ParseException.new("Unclosed comment", @source)
end
if /--|-\z/.match?(md[1])
raise REXML::ParseException.new("Malformed comment", @source)
end
return [ :comment, md[1] ]
elsif @source.match("DOCTYPE", true)
base_error_message = "Malformed DOCTYPE"
unless @source.match(/\s+/um, true)
if @source.match(">")
message = "#{base_error_message}: name is missing"
else
message = "#{base_error_message}: invalid name"
end
@source.position = start_position
raise REXML::ParseException.new(message, @source)
end
@nsstack.unshift(Set.new)
name = parse_name(base_error_message)
if @source.match(/\s*\[/um, true)
id = [nil, nil, nil]
@document_status = :in_doctype
elsif @source.match(/\s*>/um, true)
id = [nil, nil, nil]
@document_status = :after_doctype
@source.ensure_buffer
else
id = parse_id(base_error_message,
accept_external_id: true,
accept_public_id: false)
if id[0] == "SYSTEM"
# For backward compatibility
id[1], id[2] = id[2], nil
end
if @source.match(/\s*\[/um, true)
@document_status = :in_doctype
elsif @source.match(/\s*>/um, true)
@document_status = :after_doctype
@source.ensure_buffer
else
message = "#{base_error_message}: garbage after external ID"
raise REXML::ParseException.new(message, @source)
end
end
args = [:start_doctype, name, *id]
if @document_status == :after_doctype
@source.match(/\s*/um, true)
@stack << [ :end_doctype ]
end
return args
else
message = "Invalid XML"
raise REXML::ParseException.new(message, @source)
end
end
end
if @document_status == :in_doctype
@source.match(/\s*/um, true) # skip spaces
start_position = @source.position
if @source.match("/um, true)
raise REXML::ParseException.new( "Bad ELEMENT declaration!", @source ) if md.nil?
return [ :elementdecl, " 4 # Chop out NDATA decl
# match is [ :entity, name, SYSTEM, pubid(, ndata)? ]
elsif match[2] == 'PUBLIC'
# External reference
match[3] = match[3][1..-2] # PUBID
match[4] = match[4][1..-2] # HREF
match.delete_at(5) if match.size > 5 # Chop out NDATA decl
# match is [ :entity, name, PUBLIC, pubid, href(, ndata)? ]
else
match[2] = match[2][1..-2]
match.pop if match.size == 4
# match is [ :entity, name, value ]
end
match << '%' if ref
return match
elsif @source.match("ATTLIST", true)
md = @source.match(Private::ATTLISTDECL_END, true)
raise REXML::ParseException.new( "Bad ATTLIST declaration!", @source ) if md.nil?
element = md[1]
contents = md[0]
pairs = {}
values = md[0].strip.scan( ATTDEF_RE )
values.each do |attdef|
unless attdef[3] == "#IMPLIED"
attdef.compact!
val = attdef[3]
val = attdef[4] if val == "#FIXED "
pairs[attdef[0]] = val
if attdef[0] =~ /^xmlns:(.*)/
@nsstack[0] << $1
end
end
end
return [ :attlistdecl, element, pairs, contents ]
elsif @source.match("NOTATION", true)
base_error_message = "Malformed notation declaration"
unless @source.match(/\s+/um, true)
if @source.match(">")
message = "#{base_error_message}: name is missing"
else
message = "#{base_error_message}: invalid name"
end
@source.position = start_position
raise REXML::ParseException.new(message, @source)
end
name = parse_name(base_error_message)
id = parse_id(base_error_message,
accept_external_id: true,
accept_public_id: true)
unless @source.match(/\s*>/um, true)
message = "#{base_error_message}: garbage before end >"
raise REXML::ParseException.new(message, @source)
end
return [:notationdecl, name, *id]
elsif md = @source.match(/--(.*?)-->/um, true, term: Private::COMMENT_TERM)
case md[1]
when /--/, /-\z/
raise REXML::ParseException.new("Malformed comment", @source)
end
return [ :comment, md[1] ] if md
end
elsif match = @source.match(/(%.*?;)\s*/um, true, term: Private::DOCTYPE_TERM)
return [ :externalentity, match[1] ]
elsif @source.match(/\]\s*>/um, true)
@document_status = :after_doctype
return [ :end_doctype ]
end
if @document_status == :in_doctype
raise ParseException.new("Malformed DOCTYPE: invalid declaration", @source)
end
end
if @document_status == :after_doctype
@source.match(/\s*/um, true)
end
begin
start_position = @source.position
if @source.match("<", true)
# :text's read_until may remain only "<" in buffer. In the
# case, buffer is empty here. So we need to fill buffer
# here explicitly.
@source.ensure_buffer
if @source.match("/", true)
@nsstack.shift
last_tag = @tags.pop
md = @source.match(Private::CLOSE_PATTERN, true)
if md and !last_tag
message = "Unexpected top-level end tag (got '#{md[1]}')"
raise REXML::ParseException.new(message, @source)
end
if md.nil? or last_tag != md[1]
message = "Missing end tag for '#{last_tag}'"
message += " (got '#{md[1]}')" if md
@source.position = start_position if md.nil?
raise REXML::ParseException.new(message, @source)
end
return [ :end_element, last_tag ]
elsif @source.match("!", true)
md = @source.match(/([^>]*>)/um)
#STDERR.puts "SOURCE BUFFER = #{source.buffer}, #{source.buffer.size}"
raise REXML::ParseException.new("Malformed node", @source) unless md
if md[0][0] == ?-
md = @source.match(/--(.*?)-->/um, true, term: Private::COMMENT_TERM)
if md.nil? || /--|-\z/.match?(md[1])
raise REXML::ParseException.new("Malformed comment", @source)
end
return [ :comment, md[1] ]
else
md = @source.match(/\[CDATA\[(.*?)\]\]>/um, true, term: Private::CDATA_TERM)
return [ :cdata, md[1] ] if md
end
raise REXML::ParseException.new( "Declarations can only occur "+
"in the doctype declaration.", @source)
elsif @source.match("?", true)
return process_instruction(start_position)
else
# Get the next tag
md = @source.match(Private::TAG_PATTERN, true)
unless md
@source.position = start_position
raise REXML::ParseException.new("malformed XML: missing tag start", @source)
end
tag = md[1]
@document_status = :in_element
@prefixes.clear
@prefixes << md[2] if md[2]
@nsstack.unshift(curr_ns=Set.new)
attributes, closed = parse_attributes(@prefixes, curr_ns)
# Verify that all of the prefixes have been defined
for prefix in @prefixes
unless @nsstack.find{|k| k.member?(prefix)}
raise UndefinedNamespaceException.new(prefix,@source,self)
end
end
if closed
@closed = tag
@nsstack.shift
else
if @tags.empty? and @have_root
raise ParseException.new("Malformed XML: Extra tag at the end of the document (got '<#{tag}')", @source)
end
@tags.push( tag )
end
@have_root = true
return [ :start_element, tag, attributes ]
end
else
text = @source.read_until("<")
if text.chomp!("<")
@source.position -= "<".bytesize
end
if @tags.empty? and @have_root
unless /\A\s*\z/.match?(text)
raise ParseException.new("Malformed XML: Extra content at the end of the document (got '#{text}')", @source)
end
return pull_event
end
return [ :text, text ]
end
rescue REXML::UndefinedNamespaceException
raise
rescue REXML::ParseException
raise
rescue => error
raise REXML::ParseException.new( "Exception parsing",
@source, self, (error ? error : $!) )
end
return [ :dummy ]
end
private :pull_event
def entity( reference, entities )
value = nil
value = entities[ reference ] if entities
if not value
value = DEFAULT_ENTITIES[ reference ]
value = value[2] if value
end
unnormalize( value, entities ) if value
end
# Escapes all possible entities
def normalize( input, entities=nil, entity_filter=nil )
copy = input.clone
# Doing it like this rather than in a loop improves the speed
copy.gsub!( EREFERENCE, '&' )
entities.each do |key, value|
copy.gsub!( value, "{key};" ) unless entity_filter and
entity_filter.include?(entity)
end if entities
copy.gsub!( EREFERENCE, '&' )
DEFAULT_ENTITIES.each do |key, value|
copy.gsub!( value[3], value[1] )
end
copy
end
# Unescapes all possible entities
def unnormalize( string, entities=nil, filter=nil )
if string.include?("\r")
rv = string.gsub( Private::CARRIAGE_RETURN_NEWLINE_PATTERN, "\n" )
else
rv = string.dup
end
matches = rv.scan( REFERENCE_RE )
return rv if matches.size == 0
rv.gsub!( Private::CHARACTER_REFERENCES ) {
m=$1
m = "0#{m}" if m[0] == ?x
[Integer(m)].pack('U*')
}
matches.collect!{|x|x[0]}.compact!
if matches.size > 0
matches.each do |entity_reference|
unless filter and filter.include?(entity_reference)
entity_value = entity( entity_reference, entities )
if entity_value
re = Private::DEFAULT_ENTITIES_PATTERNS[entity_reference] || /{entity_reference};/
rv.gsub!( re, entity_value )
else
er = DEFAULT_ENTITIES[entity_reference]
rv.gsub!( er[0], er[2] ) if er
end
end
end
rv.gsub!( Private::DEFAULT_ENTITIES_PATTERNS['amp'], '&' )
end
rv
end
private
def need_source_encoding_update?(xml_declaration_encoding)
return false if xml_declaration_encoding.nil?
return false if /\AUTF-16\z/i =~ xml_declaration_encoding
true
end
def parse_name(base_error_message)
md = @source.match(Private::NAME_PATTERN, true)
unless md
if @source.match(/\s*\S/um)
message = "#{base_error_message}: invalid name"
else
message = "#{base_error_message}: name is missing"
end
raise REXML::ParseException.new(message, @source)
end
md[1]
end
def parse_id(base_error_message,
accept_external_id:,
accept_public_id:)
if accept_external_id and (md = @source.match(EXTERNAL_ID_PUBLIC, true))
pubid = system = nil
pubid_literal = md[1]
pubid = pubid_literal[1..-2] if pubid_literal # Remove quote
system_literal = md[2]
system = system_literal[1..-2] if system_literal # Remove quote
["PUBLIC", pubid, system]
elsif accept_public_id and (md = @source.match(PUBLIC_ID, true))
pubid = system = nil
pubid_literal = md[1]
pubid = pubid_literal[1..-2] if pubid_literal # Remove quote
["PUBLIC", pubid, nil]
elsif accept_external_id and (md = @source.match(EXTERNAL_ID_SYSTEM, true))
system = nil
system_literal = md[1]
system = system_literal[1..-2] if system_literal # Remove quote
["SYSTEM", nil, system]
else
details = parse_id_invalid_details(accept_external_id: accept_external_id,
accept_public_id: accept_public_id)
message = "#{base_error_message}: #{details}"
raise REXML::ParseException.new(message, @source)
end
end
def parse_id_invalid_details(accept_external_id:,
accept_public_id:)
public = /\A\s*PUBLIC/um
system = /\A\s*SYSTEM/um
if (accept_external_id or accept_public_id) and @source.match(/#{public}/um)
if @source.match(/#{public}(?:\s+[^'"]|\s*[\[>])/um)
return "public ID literal is missing"
end
unless @source.match(/#{public}\s+#{PUBIDLITERAL}/um)
return "invalid public ID literal"
end
if accept_public_id
if @source.match(/#{public}\s+#{PUBIDLITERAL}\s+[^'"]/um)
return "system ID literal is missing"
end
unless @source.match(/#{public}\s+#{PUBIDLITERAL}\s+#{SYSTEMLITERAL}/um)
return "invalid system literal"
end
"garbage after system literal"
else
"garbage after public ID literal"
end
elsif accept_external_id and @source.match(/#{system}/um)
if @source.match(/#{system}(?:\s+[^'"]|\s*[\[>])/um)
return "system literal is missing"
end
unless @source.match(/#{system}\s+#{SYSTEMLITERAL}/um)
return "invalid system literal"
end
"garbage after system literal"
else
unless @source.match(/\A\s*(?:PUBLIC|SYSTEM)\s/um)
return "invalid ID type"
end
"ID type is missing"
end
end
def process_instruction(start_position)
match_data = @source.match(Private::INSTRUCTION_END, true, term: Private::INSTRUCTION_TERM)
unless match_data
message = "Invalid processing instruction node"
@source.position = start_position
raise REXML::ParseException.new(message, @source)
end
if match_data[1] == "xml"
if @document_status
raise ParseException.new("Malformed XML: XML declaration is not at the start", @source)
end
content = match_data[2]
version = VERSION.match(content)
version = version[1] unless version.nil?
encoding = ENCODING.match(content)
encoding = encoding[1] unless encoding.nil?
if need_source_encoding_update?(encoding)
@source.encoding = encoding
end
if encoding.nil? and /\AUTF-16(?:BE|LE)\z/i =~ @source.encoding
encoding = "UTF-16"
end
standalone = STANDALONE.match(content)
standalone = standalone[1] unless standalone.nil?
return [ :xmldecl, version, encoding, standalone ]
end
[:processing_instruction, match_data[1], match_data[2]]
end
def parse_attributes(prefixes, curr_ns)
attributes = {}
closed = false
while true
if @source.match(">", true)
return attributes, closed
elsif @source.match("/>", true)
closed = true
return attributes, closed
elsif match = @source.match(QNAME, true)
name = match[1]
prefix = match[2]
local_part = match[3]
unless @source.match(/\s*=\s*/um, true)
message = "Missing attribute equal: <#{name}>"
raise REXML::ParseException.new(message, @source)
end
unless match = @source.match(/(['"])/, true)
message = "Missing attribute value start quote: <#{name}>"
raise REXML::ParseException.new(message, @source)
end
quote = match[1]
start_position = @source.position
value = @source.read_until(quote)
unless value.chomp!(quote)
@source.position = start_position
message = "Missing attribute value end quote: <#{name}>: <#{quote}>"
raise REXML::ParseException.new(message, @source)
end
@source.match(/\s*/um, true)
if prefix == "xmlns"
if local_part == "xml"
if value != "http://www.w3.org/XML/1998/namespace"
msg = "The 'xml' prefix must not be bound to any other namespace "+
"(http://www.w3.org/TR/REC-xml-names/#ns-decl)"
raise REXML::ParseException.new( msg, @source, self )
end
elsif local_part == "xmlns"
msg = "The 'xmlns' prefix must not be declared "+
"(http://www.w3.org/TR/REC-xml-names/#ns-decl)"
raise REXML::ParseException.new( msg, @source, self)
end
curr_ns << local_part
elsif prefix
prefixes << prefix unless prefix == "xml"
end
if attributes[name]
msg = "Duplicate attribute #{name.inspect}"
raise REXML::ParseException.new(msg, @source, self)
end
attributes[name] = value
else
message = "Invalid attribute name: <#{@source.buffer.split(%r{[/>\s]}).first}>"
raise REXML::ParseException.new(message, @source)
end
end
end
end
end
end
=begin
case event[0]
when :start_element
when :text
when :end_element
when :processing_instruction
when :cdata
when :comment
when :xmldecl
when :start_doctype
when :end_doctype
when :externalentity
when :elementdecl
when :entity
when :attlistdecl
when :notationdecl
when :end_doctype
end
=end