lib/TextParser.rb in taskjuggler-0.0.7 vs lib/TextParser.rb in taskjuggler-0.0.8
- old
+ new
@@ -17,17 +17,15 @@
require 'TjException'
require 'Log'
class TaskJuggler
- # The TextParser implements a regular LALR parser. But it uses a recursive
- # rule traversor instead of the more commonly found state machine generated by
- # yacc-like tools. Since stack depths is not really an issue with a Ruby
- # implementation this approach has one big advantage. The syntax of the parser
- # can be modified during parsing. This allows support for languages that can
- # extend themself. The TaskJuggler syntax is such an beast. Traditional yacc
- # generated parsers would fail with such a syntax.
+ # The TextParser implements a somewhat modified LL(1) parser. It uses a
+ # dynamically compiled state machine. Dynamically means that the syntax can
+ # be extended during the parse process. This allows support for languages
+ # that can extend their own syntax while being parsed. The TaskJuggler
+ # syntax is such a beast.
#
# This class is just a base class. A complete parser would derive from this
# class and implement the rule set and the functions _nextToken()_ and
# _returnToken()_. It also needs to set the array _variables_ to declare all
# variables ($SOMENAME) that the scanner may deliver.
@@ -37,16 +35,25 @@
# parsing, TextParser#updateParserTables must be called to make the changes
# effective. The parser can also document the syntax automatically. To
# document a pattern, the functions TextParser#doc, TextParser#descr,
# TextParser#also and TextParser#arg can be used.
#
+ # In contrast to conventional LL grammars, we use a slightly improved syntax
+ # description. Repeated patterns are not described by recursive calls;
+ # instead we use a repeat flag for syntax rules that consist of repeatable
+ # patterns. This removes the need for recursion elimination when compiling
+ # the state machine and makes the syntax a lot more readable. However, it
+ # adds a bit more complexity to the state machine. Optional patterns are
+ # described by a rule flag, not by adding an empty pattern.
+ #
# To start parsing the input the function TextParser#parse needs to be called
# with the name of the start rule.
class TextParser
# Utility class so that we can distinguish Array results from the Array
- # containing the results of a repeatable rule.
+ # containing the results of a repeatable rule. We define a merging
+ # method with slightly different behaviour.
class TextParserResultArray < Array
def initialize
super
end
@@ -60,11 +67,10 @@
self.concat(arg)
else
super
end
end
-
end
attr_reader :rules, :messageHandler
# Create a new TextParser object.
@@ -74,22 +80,33 @@
# This Hash will store the ruleset that the parser is operating on.
@rules = { }
# Array to hold the token types that the scanner can return.
@variables = []
# A list of token types that are not allowed in the current context.
- @badVariables = []
+ # For performance reasons we use a hash with the token as key. The value
+ # is irrelevant.
+ @blockedVariables = {}
# The currently processed rule.
@cr = nil
+
+ @states = {}
+ # The stack used by the FSM.
+ @stack = nil
end
# Limit the allowed tokens of the scanner to the subset passed by the
# _tokenSet_ Array.
def limitTokenSet(tokenSet)
return unless tokenSet
- @badVariables = @variables.dup
- @badVariables.delete_if { |v| tokenSet.include?(v) }
+ # Create a copy of all supported variables.
+ blockedVariables = @variables.dup
+ # Then delete all that are in the limited set.
+ blockedVariables.delete_if { |v| tokenSet.include?(v) }
+ # And convert the list into a Hash for faster lookups.
+ @blockedVariables = {}
+ blockedVariables.each { |v| @blockedVariables[v] = true }
end
# Call all methods that start with 'rule_' to initialize the rules.
def initRules
methods.each do |m|
@@ -106,10 +123,12 @@
# function also sets the class variable @cr to the new rule. Subsequent
# calls to TextParser#pattern, TextParser#optional or
# TextParser#repeatable will then implicitly operate on the most recently
# added rule.
def newRule(name)
+ # Use a symbol instead of a String.
+ name = name.intern
raise "Fatal Error: Rule #{name} already exists" if @rules.has_key?(name)
if block_given?
saveCr = @cr
@rules[name] = @cr = TextParser::Rule.new(name)
@@ -146,29 +165,41 @@
def repeatable
@cr.setRepeatable
end
# This function needs to be called whenever new rules or patterns have been
- # added and before the next call to TextParser#parse.
+ # added and before the next call to TextParser#parse. It's perfectly ok to
+ # call this function from within a parse() call as long as the states that
+ # are currently on the stack have not been modified.
def updateParserTables
- @rules.each_value { |rule| rule.transitions = {} }
+ saveFsmStack
+ # Invalidate some cached data.
+ @rules.each_value { |rule| rule.flushCache }
+ @states = {}
+ # Generate the parser states for all patterns of all rules.
@rules.each_value do |rule|
- getTransitions(rule)
+ rule.generateStates.each do |s|
+ @states[[ s.rule, s.pattern, s.index ]] = s
+ end
checkRule(rule)
end
+ # Compute the transitions between the generated states.
+ @states.each_value do |state|
+ state.addTransitions(@states, @rules)
+ end
+ restoreFsmStack
end
# To parse the input this function needs to be called with the name of the
# rule to start with. It returns the result of the processing function of
# the top-level parser rule that was specified by _ruleName_. In case of
# an error, the result is false.
def parse(ruleName)
@stack = []
@@expectedTokens = []
- updateParserTables
begin
- result = parseRuleR(@rules[ruleName])
+ result = parseFSM(@rules[ruleName])
rescue TjException => msg
if msg.message && !msg.message.empty?
@messageHandler.critical('parse', msg.message)
end
return false
@@ -179,23 +210,14 @@
# Return the SourceFileInfo of the TextScanner at the beginning of the
# currently processed TextParser::Rule. Or return nil if we don't have a
# current position.
def sourceFileInfo
- return nil if @stack.nil? || @stack.empty?
- @stack.last.sourceFileInfo[0]
+ return @scanner.sourceFileInfo if @stack.nil? || @stack.length <= 1
+ @stack.last.firstSourceFileInfo
end
- def matchingRules(keyword)
- matches = []
- @rules.each do |name, rule|
- patIdx = rule.matchingPatternIndex('_' + keyword)
- matches << [ rule, patIdx ] if patIdx
- end
- matches
- end
-
def error(id, text, sfi = nil, data = nil)
sfi ||= sourceFileInfo
if @scanner
# The scanner has some more context information, so we pass the error
# on to the TextScanner.
@@ -216,410 +238,223 @@
end
end
private
- # getTransitions recursively determines all possible target tokens
- # that the _rule_ matches. A target token can either be a fixed token
- # (prefixed with _), a variable token (prefixed with $) or an end token
- # (just a .). The list of found target tokens is stored in the _transitions_
- # list of the rule. For each rule pattern we store the transitions for this
- # pattern in a token -> rule hash.
- def getTransitions(rule)
- # If we have processed this rule before we can just return a copy
- # of the transitions of this rule. This avoids endless recursions.
- return rule.transitions.dup unless rule.transitions.empty?
-
- rule.transitions = []
- rule.patterns.each do |pat|
- allTokensOptional = true
- transitions = { }
- pat.each do |token|
- tokenId = token[1..-1]
- if token[0] == ?!
- unless @rules.has_key?(tokenId)
- raise "Fatal Error: Unknown reference to #{tokenId} in pattern " +
- "#{pat} + of rule #{rule.name}"
- end
- refRule = @rules[tokenId]
- # If the referenced rule describes optional content, we need to look
- # at the next token as well.
- res = getTransitions(@rules[tokenId])
- allTokensOptional = false unless refRule.optional?(@rules)
- # Combine the hashes for each pattern into a single hash
- res.each do |pat_i|
- pat_i.each { |tok, r| transitions[tok] = r }
- end
- elsif '_$.'.include?(token[0])
- transitions[token] = rule
- allTokensOptional = false
- else
- raise 'Fatal Error: Illegal token type specifier used for token' +
- ": #{tokenId}"
- end
- break unless allTokensOptional
- end
- # Make sure that we only have one possible transition for each
- # target.
- transitions.each do |key, value|
- rule.transitions.each do |trans|
- if trans.has_key?(key)
- rule.dump
- raise "Fatal Error: Rule #{rule.name} has ambiguous " +
- "transitions for target #{key}"
- end
- end
- end
- rule.transitions << transitions
- end
- rule.transitions.dup
- end
-
def checkRule(rule)
if rule.patterns.empty?
raise "Rule #{rule.name} must have at least one pattern"
end
rule.patterns.each do |pat|
- pat.each do |tok|
- type = tok[0]
- token = tok[1..-1]
- if type == ?$
- if @variables.index(token).nil?
+ pat.each do |type, name|
+ if type == :variable
+ if @variables.index(name).nil?
error('unsupported_token',
- "The token #{token} is not supported here.", token[2])
+ "The token #{name} is not supported here.")
end
- elsif type == ?!
- if @rules[token].nil?
- raise "Fatal Error: Reference to unknown rule #{token} in " +
+ elsif type == :reference
+ if @rules[name].nil?
+ raise "Fatal Error: Reference to unknown rule #{name} in " +
"pattern '#{pat}' of rule #{rule.name}"
end
end
end
end
end
- # This function processes the input starting with the syntax description of
- # _rule_. It recursively calls this function whenever the syntax description
- # contains the reference to another rule.
- # This recursive version has cleaner code and is about 8% faster than
- # parseRuleNR.
- def parseRuleR(rule)
- #Log.enter('parseRuleR', "Parsing with rule #{rule.name}")
- #puts "Parsing with rule #{rule.name}"
- result = rule.repeatable ? TextParserResultArray.new : nil
- # Rules can be marked 'repeatable'. This flag will be set to true after
- # the first iteration has been completed.
- repeatMode = false
+ def parseFSM(rule)
+ unless (state = @states[[ rule, nil, 0 ]])
+ error("no_start_state", "No start state for rule #{rule.name} found")
+ end
+ @stack = [ TextParser::StackElement.new(nil, state) ]
+
loop do
- # At the beginning of a rule we need a token from the input to determine
- # which pattern of the rule needs to be processed.
- token = getNextToken
+ if state.transitions.empty?
+ # The final states of each pattern have no pre-compiled transitions.
+ # For such a state, we don't need to get a new token.
+ transition = token = nil
+ else
+ transition = state.transition(token = getNextToken)
+ end
- return result unless (pattern = findPattern(rule, token, repeatMode))
- # The @stack will store the resulting value of each element in the
- # pattern.
- @stack << TextParser::StackElement.new(pattern.function)
+ if transition
+ # Shift: This is for normal state transitions. This may be from one
+ # token of a pattern to the next token of the same pattern, to the
+ # start of a new pattern or a loop-back to the start of a pattern of
+ # the same rule. The transition tells us what state we have to
+ # process next.
+ state = transition.state
- pattern.each do |element|
- # Separate the type and token text for pattern element.
- elType = element[0]
- elToken = element[1..-1]
- if elType == ?!
- # The element is a reference to another rule. Return the token if
- # we still have one and continue with the referenced rule.
- unless token.nil?
- sfi = token[2]
- returnToken(token)
- token = nil
+ # If we have looped-back we need to finish the pattern first. Final
+ # tokens of repeatable rules do have transitions!
+ finishPattern(token) if transition.loopBack
+
+ # Transitions that enter rules generate states which we need to
+ # resume at when a rule has been completely processed. We push this
+ # list of states on the @stack.
+ stackElement = @stack.last
+ first = true
+ transition.stateStack.each do |s|
+ if first && s.pattern == stackElement.state.pattern
+ # The first state in the list may just be another state of the
+ # current pattern. In this case, we already have the
+ # StackElement on the @stack. We only need to update the State
+ # for the current StackElement.
+ stackElement.state = s
else
- sfi = nil
+ # For other patterns, we just push a new StackElement onto the
+ # @stack.
+ @stack.push(TextParser::StackElement.new(nil, s))
end
- @stack.last.store(parseRuleR(@rules[elToken]), sfi)
- #Log << "Resuming rule #{rule.name}"
- #puts "Resuming rule #{rule.name}"
- else
- # In case the element is a keyword or variable we have to get a new
- # token if we don't have one anymore.
- token = getNextToken unless token
+ first = false
+ end
- processNormalElements(elType, elToken, token)
+ if state.index == 0
+ # If we have just started with a new pattern (or looped back) we
+ # need to push a new StackElement onto the @stack. The StackElement
+ # stores the result of the pattern and keeps the State that we
+ # need to return to in case we jump to other patterns from this
+ # pattern.
+ function = state.index == state.pattern.tokens.length - 1 ?
+ state.pattern.function : nil
+ @stack.push(TextParser::StackElement.new(state.pattern.function,
+ state))
+ end
- # The token has been consumed. Reset the variable.
- token = nil
- @@expectedTokens = []
+ # Store the token value in the result Array.
+ @stack.last.insert(state.index, token[1], token[2], false)
+ else
+ # Reduce: We've reached the end of a rule. There is no pre-compiled
+ # transition available. The current token, if we have one, is of no
+ # use to us during this state. We just return it to the scanner. The
+ # next state is determined by the first matching state from the
+ # @stack.
+ if state.noReduce
+ # Only states that finish a rule may trigger a reduce operation.
+ # Other states have the noReduce flag set. If a reduce for such a
+ # state is triggered, we found a token that is not supported by
+ # the syntax rules.
+ error("no_reduce",
+ "Unexpected token '#{token[1]}' found. " +
+ "Expecting one of " +
+ "#{@stack.last.state.expectedTokens.join(', ')}")
end
+ returnToken(token) if token
+ if finishPattern(token)
+ # Accept: We're done with parsing.
+ break
+ end
+ state = @stack.last.state
end
+ end
- # Once the complete pattern has been processed we call the processing
- # function for this pattern to operate on the value array. Then pop the
- # entry for this rule from the stack.
- @val = @stack.last.val
- @sourceFileInfo = @stack.last.sourceFileInfo
- res = nil
- res = @stack.last.function.call unless @stack.last.function.nil?
- @stack.pop
+ @stack[0].val[0]
+ end
- # If the rule is not repeatable we can store the result and break the
- # outer loop to exit the function.
- unless rule.repeatable
- result = res
- break
+ def finishPattern(token)
+ #dumpStack
+ # To finish a pattern we need to pop the StackElement with the token
+ # values from the stack.
+ stackEntry = @stack.pop
+ if stackEntry.nil? || @stack.empty?
+ # Check if we have reached the bottom of the stack.
+ token = getNextToken unless token
+ if token[0] == :endOfText
+ # If the token is the end of the top-level file, we're done. We push
+ # back the StackEntry since it holds the overall result of the
+ # parsing.
+ @stack.push(stackEntry)
+ return true
end
+ # If it's not the EOF token, we found a token that violates the syntax
+ # rules.
+ error('unexpctd_token', "Unexpected token '#{token[1]}' found. " +
+ "Expecting one of " +
+ "#{stackEntry.state.expectedTokens.join(', ')}")
+ end
+ # Memorize if the rule for this pattern was repeatable. Then we will
+ # store the result of the pattern in an Array.
+ ruleIsRepeatable = stackEntry.state.rule.repeatable
- # Otherwise we append the result to the result array and turn repeat
- # mode on.
- result << res
- repeatMode = true
+ state = stackEntry.state
+ result = nil
+ if state.pattern.function
+ # Make the token values and their SourceFileInfo available.
+ @val = stackEntry.val
+ @sourceFileInfo = stackEntry.sourceFileInfo
+ # Now call the pattern action to compute the value of the pattern.
+ result = state.pattern.function.call
end
- #Log.exit('parseRuleR', "Finished rule #{rule.name}")
- #puts "Finished rule #{rule.name}"
- return result
+ # We use the SourceFileInfo of the first token of the pattern to store
+ # it with the result of the pattern.
+ firstSourceFileInfo = stackEntry.firstSourceFileInfo
+ # Store the result at the correct position into the next lower level of
+ # the stack.
+ stackEntry = @stack.last
+ stackEntry.insert(stackEntry.state.index, result,
+ firstSourceFileInfo, ruleIsRepeatable)
+ false
end
- # This function processes the input starting with the syntax description
- # of _rule_. It's implemented as an unrolled recursion. It recursively
- # iterates over the rule tree as controlled by the input file.
- # This version is not limited by the size of the system stack. So far, I'm
- # not aware of any project that is too large for the system stack. Since
- # the recursive version parseRuleR is about 8% faster and has cleaner
- # code, we use that by default.
- def parseRuleNR(rule)
- elementIdx = 0
- recursionResult = nil
- # These flags are used to managed the control flow to and from the
- # recursion point.
- recur = resume = false
- # The stack that holds the context for the recursion levels. It's either
- # just a rule to start a new recursion or an Array of state variables.
- recursionStack = [ rule ]
- begin
- # Pop the top entry from the recursion stack.
- se = recursionStack.pop
- if se.is_a?(Array)
- # We have essentially finished a recursion level and need to get
- # back to the place where we started the recursion. First, we need
- # to restore the state again.
- rule, pattern, elementIdx, result, repeatMode, sfi = se
- #Log << "Recursion loop started in resume mode for rule #{rule.name}"
- # Now jump to the recursion point without doing anything else.
- resume = true
- else
- # Start a new recursion level. The rule tells us how to interpret
- # the input text.
- rule = se
- #Log.enter('parseRuleNR', "Parsing with rule #{rule.name}")
- resume = false
- end
-
- unless resume
- result = rule.repeatable ? TextParserResultArray.new : nil
- # Rules can be marked 'repeatable'. This flag will be set to true
- # after the first iteration has been completed.
- repeatMode = false
- end
-
- loop do
- unless resume
- # At the beginning of a rule we need a token from the input to
- # determine which pattern of the rule needs to be processed.
- token = getNextToken
-
- break unless (pattern = findPattern(rule, token, repeatMode))
- # The @stack will store the resulting value of each element in the
- # pattern.
- @stack << TextParser::StackElement.new(pattern.function)
-
- # Once we've found the right pattern, we need to process each
- # element.
- elementIdx = 0
- end
-
- elementCount = pattern.length
- while elementIdx < elementCount
- element = pattern[elementIdx]
- # Separate the type and token text for pattern element.
- elType = element[0]
- elToken = element[1..-1]
- if elType == ?!
- unless resume
- # The element is a reference to another rule. Return the token
- # if we still have one and continue with the referenced rule.
- if token
- sfi = token[2]
- returnToken(token)
- token = nil
- else
- sfi = nil
- end
- # This is where the recursion would happen. Instead, we push
- # the state variables and then the next rule onto the
- # recursion stack.
- recursionStack.push([ rule, pattern, elementIdx, result,
- repeatMode, sfi ])
- recursionStack.push(@rules[elToken])
- # Now terminate all but the outer loops without doing anything
- # else.
- recur = true
- break
- else
- # We're back right after where the recursion started. Store
- # the result and turn resume mode off again.
- @stack.last.store(recursionResult, sfi)
- resume = false
- end
- else
- # In case the element is a keyword or variable we have to get a
- # new token if we don't have one anymore.
- token = getNextToken unless token
-
- processNormalElements(elType, elToken, token)
-
- # The token has been consumed. Reset the variable.
- token = nil
- @@expectedTokens = []
+ def dumpStack
+ #puts "Stack level #{@stack.length}"
+ @stack.each do |sl|
+ print "#{@stack.index(sl)}: "
+ sl.each do |v|
+ if v.is_a?(Array)
+ begin
+ print "[#{v.join('|')}]|"
+ rescue
+ print "[#{v[0].class}...]|"
end
- elementIdx += 1
- end # of pattern while loop
-
- # Skip the rest of the loop in recur mode.
- break if recur
-
- elementIdx = 0
-
- # Once the complete pattern has been processed we call the
- # processing function for this pattern to operate on the value
- # array. Then pop the entry for this rule from the stack. The
- # called function will use @val and @sourceFileInfo to retrieve
- # data from the parser.
- @val = @stack.last.val
- @sourceFileInfo = @stack.last.sourceFileInfo
- res = @stack.last.function ? @stack.last.function.call : nil
- @stack.pop
-
- # If the rule is not repeatable we can store the result and break
- # the outer loop to exit the function.
- unless rule.repeatable
- result = res
- break
+ else
+ begin
+ print "#{v}|"
+ rescue
+ print v.class
+ end
end
+ end
+ print " -> #{sl.state ? sl.state.to_s(true) : 'nil'} #{sl.function.nil? ? '' : '(Called)'}"
+ puts ""
+ end
+ end
- # Otherwise we append the result to the result array and turn repeat
- # mode on.
- result << res
- # We have completed the first iteration. Set the repeat mode flag to
- # indicate that further iterations are already re-runs.
- repeatMode = true
- end # of rule processing loop
+ # Convert the FSM stack state entries from State objects into [ rule,
+ # pattern, index ] equivalents.
+ def saveFsmStack
+ return unless @stack
- if recur
- recur = false
- else
- #Log.exit('parseRuleNR', "Finished rule #{rule.name}")
- recursionResult = result
- end
- end while !recursionStack.empty?
+ @stack.each do |s|
+ next unless (st = s.state)
+ s.state = [ st.rule, st.pattern, st.index ]
+ end
+ end
- return result
+ # Convert the FSM stack state entries from [ rule, pattern, index ] into
+ # the respective State objects again.
+ def restoreFsmStack
+ return unless @stack
+
+ @stack.each do |s|
+ next unless (state = @states[s.state])
+ raise "Stack restore failed. Cannot find state" unless state
+ s.state = state
+ end
end
def getNextToken
token = nextToken
#Log << "Token: [#{token[0]}][#{token[1]}]"
- if @badVariables.include?(token[0])
+ if @blockedVariables[token[0]]
error('unsupported_token',
"The token #{token[1]} is not supported in this context.",
token[2])
end
token
end
- def findPattern(rule, token, repeatMode)
- # The scanner cannot differentiate between keywords and identifiers. So
- # whenever an identifier is returned we have to see if we have a
- # matching keyword first. If none is found, then look for normal
- # identifiers.
- if token[0] == 'ID'
- if (patIdx = rule.matchingPatternIndex('_' + token[1])).nil?
- patIdx = rule.matchingPatternIndex("$ID")
- end
- elsif token[0] == 'LITERAL'
- patIdx = rule.matchingPatternIndex('_' + token[1])
- elsif token[0] == false
- patIdx = rule.matchingPatternIndex('.')
- else
- patIdx = rule.matchingPatternIndex('$' + token[0])
- end
-
- # If no matching pattern is found for the token we have to check if the
- # rule is optional or we are in repeat mode. If this is the case, return
- # the token back to the scanner. Otherwise, we have found a token we
- # cannot handle at this point.
- if patIdx.nil?
- # Append the list of expected tokens to the @@expectedToken array.
- # This may be used in a later rule to provide more details when an
- # error occured.
- rule.transitions.each do |transition|
- keys = transition.keys
- keys.collect! { |key| key[1..-1] }
- @@expectedTokens += keys
- @@expectedTokens.sort!
- end
-
- unless rule.optional?(@rules) || repeatMode
- error('unexpctd_token',
- (token[0] != false ?
- "Unexpected token '#{token[1]}' of type " +
- "'#{token[0]}'. " :
- "Unexpected end of file in #{@scanner.fileName}. ") +
- (@@expectedTokens.length > 1 ?
- "Expecting one of #{@@expectedTokens.join(', ')}" :
- "Expecting #{@@expectedTokens[0]}"), token[2])
- end
- returnToken(token)
- return nil
- end
-
- rule.pattern(patIdx)
- end
-
- # Handle the elements that don't trigger a recursion.
- def processNormalElements(elType, elToken, token)
- if elType == ?_
- # If the element requires a keyword the token must match this
- # keyword.
- if elToken != token[1]
- text = "'#{elToken}' expected but found " +
- "'#{token[1]}' (#{token[0]})."
- unless @@expectedTokens.empty?
- text = "#{@@expectedTokens.join(', ')} or " + text
- end
- error('spec_keywork_expctd', text, token[2])
- end
- @stack.last.store(elToken, token[2])
- elsif elType == ?.
- if token[0..1] != [ '.', '<END>' ]
- error('end_expected',
- "Found garbage at expected end of text: #{token[1]}\n" +
- "If you see this in the middle of your text, you probably " +
- "have closed your context too early.", token[2])
- end
- else
- # The token must match the expected variable type.
- if token[0] != elToken
- text = "'#{elToken}' expected but found " +
- "'#{token[1]}' (#{token[0]})."
- unless @@expectedTokens.empty?
- text = "#{@@expectedTokens.join(', ')} or " + text
- end
- error('spec_token_expctd', text, token[2])
- end
- # If the element is a variable store the value of the token.
- @stack.last.store(token[1], token[2])
- end
- end
-
end
end
-