lib/TextParser.rb in taskjuggler-0.0.7 vs lib/TextParser.rb in taskjuggler-0.0.8

- old
+ new

@@ -17,17 +17,15 @@
 require 'TjException'
 require 'Log'

 class TaskJuggler

-  # The TextParser implements a regular LALR parser. But it uses a recursive
-  # rule traverser instead of the more commonly found state machine generated
-  # by yacc-like tools. Since stack depth is not really an issue with a Ruby
-  # implementation, this approach has one big advantage: the syntax of the
-  # parser can be modified during parsing. This allows support for languages
-  # that can extend themselves. The TaskJuggler syntax is such a beast.
-  # Traditional yacc-generated parsers would fail with such a syntax.
+  # The TextParser implements a somewhat modified LL(1) parser. It uses a
+  # dynamically compiled state machine. Dynamically means that the syntax
+  # can be extended during the parse process. This allows support for
+  # languages that can extend their own syntax during the parse process.
+  # The TaskJuggler syntax is such a beast.
   #
   # This class is just a base class. A complete parser would derive from
   # this class and implement the rule set and the functions _nextToken()_
   # and _returnToken()_. It also needs to set the array _variables_ to
   # declare all variables ($SOMENAME) that the scanner may deliver.
@@ -37,16 +35,25 @@
   # parsing, TextParser#updateParserTables must be called to make the
   # changes effective. The parser can also document the syntax
   # automatically. To document a pattern, the functions TextParser#doc,
   # TextParser#descr, TextParser#also and TextParser#arg can be used.
   #
+  # In contrast to conventional LL grammars, we use a slightly improved
+  # syntax description. Repeated patterns are not described by recursive
+  # calls; instead, rules that consist of repeatable patterns carry a
+  # repeat flag. This removes the need for recursion elimination when
+  # compiling the state machine and makes the syntax a lot more readable.
+  # However, it adds a bit more complexity to the state machine. Optional
+  # patterns are described by a rule flag as well, not by adding an empty
+  # pattern.
+  #
   # To start parsing the input the function TextParser#parse needs to be
   # called with the name of the start rule.
   class TextParser

     # Utility class so that we can distinguish Array results from the Array
-    # containing the results of a repeatable rule.
+    # containing the results of a repeatable rule. We define some merging
+    # methods with a slightly different behaviour.
     class TextParserResultArray < Array

       def initialize
         super
       end
@@ -60,11 +67,10 @@
           self.concat(arg)
         else
           super
         end
       end
-
     end

     attr_reader :rules, :messageHandler

     # Create a new TextParser object.
@@ -74,22 +80,33 @@
       # This Hash will store the ruleset that the parser is operating on.
       @rules = { }
       # Array to hold the token types that the scanner can return.
       @variables = []
       # A list of token types that are not allowed in the current context.
-      @badVariables = []
+      # For performance reasons we use a hash with the token as key. The
+      # value is irrelevant.
+      @blockedVariables = {}
       # The currently processed rule.
       @cr = nil
+
+      @states = {}
+      # The stack used by the FSM.
+      @stack = nil
     end

     # Limit the allowed tokens of the scanner to the subset passed by the
     # _tokenSet_ Array.
     def limitTokenSet(tokenSet)
       return unless tokenSet

-      @badVariables = @variables.dup
-      @badVariables.delete_if { |v| tokenSet.include?(v) }
+      # Create a copy of all supported variables.
+      blockedVariables = @variables.dup
+      # Then delete all that are in the limited set.
+      blockedVariables.delete_if { |v| tokenSet.include?(v) }
+      # And convert the list into a Hash for faster lookups.
+      @blockedVariables = {}
+      blockedVariables.each { |v| @blockedVariables[v] = true }
     end
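
The switch from the @badVariables Array to the @blockedVariables Hash turns the per-token check in getNextToken (further down in this diff) from a linear Array#include? scan into a constant-time Hash lookup. A minimal sketch of the idiom, with invented token names:

```ruby
# Hash-as-set membership test; the token names are made up for illustration.
variables = %w( ID INTEGER DATE STRING )
allowed   = %w( ID INTEGER )

blocked = {}
(variables - allowed).each { |v| blocked[v] = true }

blocked['DATE']  # => true, O(1) lookup instead of Array#include?
blocked['ID']    # => nil, i.e. not blocked
```
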

     # Call all methods that start with 'rule_' to initialize the rules.
     def initRules
       methods.each do |m|
@@ -106,10 +123,12 @@
     # function also sets the class variable @cr to the new rule. Subsequent
     # calls to TextParser#pattern, TextParser#optional or
     # TextParser#repeatable will then implicitly operate on the most
     # recently added rule.
     def newRule(name)
+      # Use a symbol instead of a String.
+      name = name.intern
       raise "Fatal Error: Rule #{name} already exists" if @rules.has_key?(name)

       if block_given?
         saveCr = @cr
         @rules[name] = @cr = TextParser::Rule.new(name)
@@ -146,29 +165,41 @@
     def repeatable
       @cr.setRepeatable
     end

     # This function needs to be called whenever new rules or patterns have
-    # been added and before the next call to TextParser#parse.
+    # been added and before the next call to TextParser#parse. It's
+    # perfectly ok to call this function from within a parse() call as long
+    # as the states that are currently on the stack have not been modified.
     def updateParserTables
-      @rules.each_value { |rule| rule.transitions = {} }
+      saveFsmStack
+      # Invalidate some cached data.
+      @rules.each_value { |rule| rule.flushCache }
+      @states = {}
+      # Generate the parser states for all patterns of all rules.
       @rules.each_value do |rule|
-        getTransitions(rule)
+        rule.generateStates.each do |s|
+          @states[[ s.rule, s.pattern, s.index ]] = s
+        end
         checkRule(rule)
       end
+      # Compute the transitions between the generated states.
+      @states.each_value do |state|
+        state.addTransitions(@states, @rules)
+      end
+      restoreFsmStack
     end

     # To parse the input this function needs to be called with the name of
     # the rule to start with. It returns the result of the processing
     # function of the top-level parser rule that was specified by
     # _ruleName_. In case of an error, the result is false.
     def parse(ruleName)
       @stack = []
       @@expectedTokens = []
-      updateParserTables
       begin
-        result = parseRuleR(@rules[ruleName])
+        result = parseFSM(@rules[ruleName])
       rescue TjException => msg
         if msg.message && !msg.message.empty?
           @messageHandler.critical('parse', msg.message)
         end
         return false
@@ -179,23 +210,14 @@
     # Return the SourceFileInfo of the TextScanner at the beginning of the
     # currently processed TextParser::Rule. Or return nil if we don't have a
     # current position.
     def sourceFileInfo
-      return nil if @stack.nil? || @stack.empty?
-      @stack.last.sourceFileInfo[0]
+      return @scanner.sourceFileInfo if @stack.nil? || @stack.length <= 1
+      @stack.last.firstSourceFileInfo
     end

-    def matchingRules(keyword)
-      matches = []
-      @rules.each do |name, rule|
-        patIdx = rule.matchingPatternIndex('_' + keyword)
-        matches << [ rule, patIdx ] if patIdx
-      end
-      matches
-    end
-
     def error(id, text, sfi = nil, data = nil)
       sfi ||= sourceFileInfo
       if @scanner
         # The scanner has some more context information, so we pass the
         # error on to the TextScanner.
@@ -216,410 +238,223 @@
       end
     end

     private
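
For orientation, a rule built with these primitives might look as follows in a derived parser. This is a hypothetical example: the rule name and tokens are invented, and it only assumes the conventions documented in this file (keywords prefixed with '_', scanner variables with '$', rule references with '!', plus the TextParser#optional and TextParser#repeatable flags).

```ruby
# Hypothetical rule in a TextParser subclass; initRules picks it up via the
# 'rule_' name prefix.
def rule_allocations
  newRule('allocations') do
    pattern(%w( _allocate $ID ))
    # The whole rule may match any number of times; per the repeat-flag
    # design described above, no recursive helper rule is needed.
    repeatable
    # And it may be absent entirely, instead of adding an empty pattern.
    optional
  end
end
```

If rules are added or extended after parsing has started, updateParserTables must be called again before the new patterns can match, as the comment on that method explains.
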

-    # getTransitions recursively determines all possible target tokens
-    # that the _rule_ matches. A target token can either be a fixed token
-    # (prefixed with _), a variable token (prefixed with $) or an end token
-    # (just a .). The list of found target tokens is stored in the
-    # _transitions_ list of the rule. For each rule pattern we store the
-    # transitions for this pattern in a token -> rule hash.
-    def getTransitions(rule)
-      # If we have processed this rule before we can just return a copy
-      # of the transitions of this rule. This avoids endless recursions.
-      return rule.transitions.dup unless rule.transitions.empty?
-
-      rule.transitions = []
-      rule.patterns.each do |pat|
-        allTokensOptional = true
-        transitions = { }
-        pat.each do |token|
-          tokenId = token[1..-1]
-          if token[0] == ?!
-            unless @rules.has_key?(tokenId)
-              raise "Fatal Error: Unknown reference to #{tokenId} in " +
-                    "pattern #{pat} of rule #{rule.name}"
-            end
-            refRule = @rules[tokenId]
-            # If the referenced rule describes optional content, we need to
-            # look at the next token as well.
-            res = getTransitions(@rules[tokenId])
-            allTokensOptional = false unless refRule.optional?(@rules)
-            # Combine the hashes for each pattern into a single hash.
-            res.each do |pat_i|
-              pat_i.each { |tok, r| transitions[tok] = r }
-            end
-          elsif '_$.'.include?(token[0])
-            transitions[token] = rule
-            allTokensOptional = false
-          else
-            raise 'Fatal Error: Illegal token type specifier used for ' +
-                  "token: #{tokenId}"
-          end
-          break unless allTokensOptional
-        end
-        # Make sure that we only have one possible transition for each
-        # target.
-        transitions.each do |key, value|
-          rule.transitions.each do |trans|
-            if trans.has_key?(key)
-              rule.dump
-              raise "Fatal Error: Rule #{rule.name} has ambiguous " +
-                    "transitions for target #{key}"
-            end
-          end
-        end
-        rule.transitions << transitions
-      end
-      rule.transitions.dup
-    end
-
     def checkRule(rule)
       if rule.patterns.empty?
         raise "Rule #{rule.name} must have at least one pattern"
       end

       rule.patterns.each do |pat|
-        pat.each do |tok|
-          type = tok[0]
-          token = tok[1..-1]
-          if type == ?$
-            if @variables.index(token).nil?
+        pat.each do |type, name|
+          if type == :variable
+            if @variables.index(name).nil?
               error('unsupported_token',
-                    "The token #{token} is not supported here.", token[2])
+                    "The token #{name} is not supported here.")
             end
-          elsif type == ?!
-            if @rules[token].nil?
-              raise "Fatal Error: Reference to unknown rule #{token} in " +
+          elsif type == :reference
+            if @rules[name].nil?
+              raise "Fatal Error: Reference to unknown rule #{name} in " +
                     "pattern '#{pat}' of rule #{rule.name}"
             end
           end
         end
       end
     end
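
The checkRule change reflects a new internal representation for pattern elements: the old code packed the element type into the first character of a String, while the new code iterates over pairs with symbolic types. An assumed before/after sketch, inferred from the two checkRule versions above:

```ruby
# Old encoding: type in the first character of a String.
old_elements = %w( _task $ID !taskAttributes )

# New encoding (assumed): explicit [ type, name ] pairs with Symbol types.
new_elements = [ [ :literal,   'task' ],
                 [ :variable,  'ID' ],
                 [ :reference, 'taskAttributes' ] ]

old_elements.each { |tok| puts "#{tok[0, 1]} -> #{tok[1..-1]}" }
new_elements.each { |type, name| puts "#{type} -> #{name}" }
```
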

-    # This function processes the input starting with the syntax description
-    # of _rule_. It recursively calls this function whenever the syntax
-    # description contains the reference to another rule.
-    # This recursive version has cleaner code and is about 8% faster than
-    # parseRuleNR.
-    def parseRuleR(rule)
-      #Log.enter('parseRuleR', "Parsing with rule #{rule.name}")
-      #puts "Parsing with rule #{rule.name}"
-      result = rule.repeatable ? TextParserResultArray.new : nil
-      # Rules can be marked 'repeatable'. This flag will be set to true
-      # after the first iteration has been completed.
-      repeatMode = false
-
-      loop do
-        # At the beginning of a rule we need a token from the input to
-        # determine which pattern of the rule needs to be processed.
-        token = getNextToken
-
-        return result unless (pattern = findPattern(rule, token, repeatMode))
-        # The @stack will store the resulting value of each element in the
-        # pattern.
-        @stack << TextParser::StackElement.new(pattern.function)
-
-        pattern.each do |element|
-          # Separate the type and token text for pattern element.
-          elType = element[0]
-          elToken = element[1..-1]
-          if elType == ?!
-            # The element is a reference to another rule. Return the token
-            # if we still have one and continue with the referenced rule.
-            unless token.nil?
-              sfi = token[2]
-              returnToken(token)
-              token = nil
-            else
-              sfi = nil
-            end
-            @stack.last.store(parseRuleR(@rules[elToken]), sfi)
-            #Log << "Resuming rule #{rule.name}"
-            #puts "Resuming rule #{rule.name}"
-          else
-            # In case the element is a keyword or variable we have to get a
-            # new token if we don't have one anymore.
-            token = getNextToken unless token
-
-            processNormalElements(elType, elToken, token)
-
-            # The token has been consumed. Reset the variable.
-            token = nil
-            @@expectedTokens = []
-          end
-        end
-
-        # Once the complete pattern has been processed we call the
-        # processing function for this pattern to operate on the value
-        # array. Then pop the entry for this rule from the stack.
-        @val = @stack.last.val
-        @sourceFileInfo = @stack.last.sourceFileInfo
-        res = nil
-        res = @stack.last.function.call unless @stack.last.function.nil?
-        @stack.pop
-
-        # If the rule is not repeatable we can store the result and break
-        # the outer loop to exit the function.
-        unless rule.repeatable
-          result = res
-          break
-        end
-
-        # Otherwise we append the result to the result array and turn repeat
-        # mode on.
-        result << res
-        repeatMode = true
-      end
-
-      #Log.exit('parseRuleR', "Finished rule #{rule.name}")
-      #puts "Finished rule #{rule.name}"
-      return result
-    end
+    def parseFSM(rule)
+      unless (state = @states[[ rule, nil, 0 ]])
+        error("no_start_state", "No start state for rule #{rule.name} found")
+      end
+      @stack = [ TextParser::StackElement.new(nil, state) ]
+
+      loop do
+        if state.transitions.empty?
+          # The final states of each pattern have no pre-compiled
+          # transitions. For such a state, we don't need to get a new token.
+          transition = token = nil
+        else
+          transition = state.transition(token = getNextToken)
+        end
+
+        if transition
+          # Shift: This is for normal state transitions. This may be from
+          # one token of a pattern to the next token of the same pattern,
+          # to the start of a new pattern or a loop-back to the start of a
+          # pattern of the same rule. The transition tells us what state we
+          # have to process next.
+          state = transition.state
+
+          # If we have looped back, we need to finish the pattern first.
+          # Final tokens of repeatable rules do have transitions!
+          finishPattern(token) if transition.loopBack
+
+          # Transitions that enter rules generate states which we need to
+          # resume at when a rule has been completely processed. We push
+          # this list of states on the @stack.
+          stackElement = @stack.last
+          first = true
+          transition.stateStack.each do |s|
+            if first && s.pattern == stackElement.state.pattern
+              # The first state in the list may just be another state of
+              # the current pattern. In this case, we already have the
+              # StackElement on the @stack. We only need to update the
+              # State for the current StackElement.
+              stackElement.state = s
+            else
+              # For other patterns, we just push a new StackElement onto
+              # the @stack.
+              @stack.push(TextParser::StackElement.new(nil, s))
+            end
+            first = false
+          end
+
+          if state.index == 0
+            # If we have just started with a new pattern (or looped back)
+            # we need to push a new StackElement onto the @stack. The
+            # StackElement stores the result of the pattern and keeps the
+            # State that we need to return to in case we jump to other
+            # patterns from this pattern.
+            @stack.push(TextParser::StackElement.new(state.pattern.function,
+                                                     state))
+          end
+
+          # Store the token value in the result Array.
+          @stack.last.insert(state.index, token[1], token[2], false)
+        else
+          # Reduce: We've reached the end of a rule. There is no
+          # pre-compiled transition available. The current token, if we
+          # have one, is of no use to us during this state. We just return
+          # it to the scanner. The next state is determined by the first
+          # matching state from the @stack.
+          if state.noReduce
+            # Only states that finish a rule may trigger a reduce
+            # operation. Other states have the noReduce flag set. If a
+            # reduce for such a state is triggered, we found a token that
+            # is not supported by the syntax rules.
+            error("no_reduce",
+                  "Unexpected token '#{token[1]}' found. " +
+                  "Expecting one of " +
+                  "#{@stack.last.state.expectedTokens.join(', ')}")
+          end
+          returnToken(token) if token
+          if finishPattern(token)
+            # Accept: We're done with parsing.
+            break
+          end
+          state = @stack.last.state
+        end
+      end
+
+      @stack[0].val[0]
+    end
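
The shift/reduce loop above keys its state table by [ rule, pattern, index ] triples and relies on a small set of attributes on the state and transition objects (state, stateStack, loopBack, noReduce, expectedTokens). A toy sketch of these shapes, inferred from their use in parseFSM rather than from the real classes, which are not shown in this diff:

```ruby
# Toy stand-ins for the state-machine objects; the real implementations
# differ in detail.
Transition = Struct.new(:state, :stateStack, :loopBack)
State = Struct.new(:rule, :pattern, :index, :transitions, :noReduce) do
  # Look up the pre-compiled transition for a scanner token.
  def transition(token)
    transitions[token[0]]
  end
end

# The state table is keyed by [ rule, pattern, index ]; a rule's start
# state is registered under a nil pattern at index 0, which is exactly what
# parseFSM looks up first.
states = {}
start = State.new(:someRule, nil, 0, {}, false)
states[[ :someRule, nil, 0 ]] = start
states[[ start.rule, start.pattern, start.index ]]  # => start
```
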
" + + "Expecting one of " + + "#{@stack.last.state.expectedTokens.join(', ')}") end + returnToken(token) if token + if finishPattern(token) + # Accept: We're done with parsing. + break + end + state = @stack.last.state end + end - # Once the complete pattern has been processed we call the processing - # function for this pattern to operate on the value array. Then pop the - # entry for this rule from the stack. - @val = @stack.last.val - @sourceFileInfo = @stack.last.sourceFileInfo - res = nil - res = @stack.last.function.call unless @stack.last.function.nil? - @stack.pop + @stack[0].val[0] + end - # If the rule is not repeatable we can store the result and break the - # outer loop to exit the function. - unless rule.repeatable - result = res - break + def finishPattern(token) + #dumpStack + # To finish a pattern we need to pop the StackElement with the token + # values from the stack. + stackEntry = @stack.pop + if stackEntry.nil? || @stack.empty? + # Check if we have reached the bottom of the stack. + token = getNextToken unless token + if token[0] == :endOfText + # If the token is the end of the top-level file, we're done. We push + # back the StackEntry since it holds the overall result of the + # parsing. + @stack.push(stackEntry) + return true end + # If it's not the EOF token, we found a token that violates the syntax + # rules. + error('unexpctd_token', "Unexpected token '#{token[1]}' found. " + + "Expecting one of " + + "#{stackEntry.state.expectedTokens.join(', ')}") + end + # Memorize if the rule for this pattern was repeatable. Then we will + # store the result of the pattern in an Array. + ruleIsRepeatable = stackEntry.state.rule.repeatable - # Otherwise we append the result to the result array and turn repeat - # mode on. - result << res - repeatMode = true + state = stackEntry.state + result = nil + if state.pattern.function + # Make the token values and their SourceFileInfo available. + @val = stackEntry.val + @sourceFileInfo = stackEntry.sourceFileInfo + # Now call the pattern action to compute the value of the pattern. + result = state.pattern.function.call end - #Log.exit('parseRuleR', "Finished rule #{rule.name}") - #puts "Finished rule #{rule.name}" - return result + # We use the SourceFileInfo of the first token of the pattern to store + # it with the result of the pattern. + firstSourceFileInfo = stackEntry.firstSourceFileInfo + # Store the result at the correct position into the next lower level of + # the stack. + stackEntry = @stack.last + stackEntry.insert(stackEntry.state.index, result, + firstSourceFileInfo, ruleIsRepeatable) + false end - # This function processes the input starting with the syntax description - # of _rule_. It's implemented as an unrolled recursion. It recursively - # iterates over the rule tree as controlled by the input file. - # This version is not limited by the size of the system stack. So far, I'm - # not aware of any project that is too large for the system stack. Since - # the recursive version parseRuleR is about 8% faster and has cleaner - # code, we use that by default. - def parseRuleNR(rule) - elementIdx = 0 - recursionResult = nil - # These flags are used to managed the control flow to and from the - # recursion point. - recur = resume = false - # The stack that holds the context for the recursion levels. It's either - # just a rule to start a new recursion or an Array of state variables. - recursionStack = [ rule ] - begin - # Pop the top entry from the recursion stack. 

-    # This function processes the input starting with the syntax description
-    # of _rule_. It's implemented as an unrolled recursion. It iteratively
-    # traverses the rule tree as controlled by the input file.
-    # This version is not limited by the size of the system stack. So far,
-    # I'm not aware of any project that is too large for the system stack.
-    # Since the recursive version parseRuleR is about 8% faster and has
-    # cleaner code, we use that by default.
-    def parseRuleNR(rule)
-      elementIdx = 0
-      recursionResult = nil
-      # These flags are used to manage the control flow to and from the
-      # recursion point.
-      recur = resume = false
-      # The stack that holds the context for the recursion levels. It's
-      # either just a rule to start a new recursion or an Array of state
-      # variables.
-      recursionStack = [ rule ]
-      begin
-        # Pop the top entry from the recursion stack.
-        se = recursionStack.pop
-        if se.is_a?(Array)
-          # We have essentially finished a recursion level and need to get
-          # back to the place where we started the recursion. First, we
-          # need to restore the state again.
-          rule, pattern, elementIdx, result, repeatMode, sfi = se
-          #Log << "Recursion loop started in resume mode for rule #{rule.name}"
-          # Now jump to the recursion point without doing anything else.
-          resume = true
-        else
-          # Start a new recursion level. The rule tells us how to interpret
-          # the input text.
-          rule = se
-          #Log.enter('parseRuleNR', "Parsing with rule #{rule.name}")
-          resume = false
-        end
-
-        unless resume
-          result = rule.repeatable ? TextParserResultArray.new : nil
-          # Rules can be marked 'repeatable'. This flag will be set to true
-          # after the first iteration has been completed.
-          repeatMode = false
-        end
-
-        loop do
-          unless resume
-            # At the beginning of a rule we need a token from the input to
-            # determine which pattern of the rule needs to be processed.
-            token = getNextToken
-
-            break unless (pattern = findPattern(rule, token, repeatMode))
-            # The @stack will store the resulting value of each element in
-            # the pattern.
-            @stack << TextParser::StackElement.new(pattern.function)
-
-            # Once we've found the right pattern, we need to process each
-            # element.
-            elementIdx = 0
-          end
-
-          elementCount = pattern.length
-          while elementIdx < elementCount
-            element = pattern[elementIdx]
-            # Separate the type and token text for pattern element.
-            elType = element[0]
-            elToken = element[1..-1]
-            if elType == ?!
-              unless resume
-                # The element is a reference to another rule. Return the
-                # token if we still have one and continue with the
-                # referenced rule.
-                if token
-                  sfi = token[2]
-                  returnToken(token)
-                  token = nil
-                else
-                  sfi = nil
-                end
-                # This is where the recursion would happen. Instead, we
-                # push the state variables and then the next rule onto the
-                # recursion stack.
-                recursionStack.push([ rule, pattern, elementIdx, result,
-                                      repeatMode, sfi ])
-                recursionStack.push(@rules[elToken])
-                # Now terminate all but the outer loops without doing
-                # anything else.
-                recur = true
-                break
-              else
-                # We're back right after where the recursion started. Store
-                # the result and turn resume mode off again.
-                @stack.last.store(recursionResult, sfi)
-                resume = false
-              end
-            else
-              # In case the element is a keyword or variable we have to
-              # get a new token if we don't have one anymore.
-              token = getNextToken unless token
-
-              processNormalElements(elType, elToken, token)
-
-              # The token has been consumed. Reset the variable.
-              token = nil
-              @@expectedTokens = []
-            end
-            elementIdx += 1
-          end # of pattern while loop
-
-          # Skip the rest of the loop in recur mode.
-          break if recur
-
-          elementIdx = 0
-
-          # Once the complete pattern has been processed we call the
-          # processing function for this pattern to operate on the value
-          # array. Then pop the entry for this rule from the stack. The
-          # called function will use @val and @sourceFileInfo to retrieve
-          # data from the parser.
-          @val = @stack.last.val
-          @sourceFileInfo = @stack.last.sourceFileInfo
-          res = @stack.last.function ? @stack.last.function.call : nil
-          @stack.pop
-
-          # If the rule is not repeatable we can store the result and break
-          # the outer loop to exit the function.
-          unless rule.repeatable
-            result = res
-            break
-          end
-
-          # Otherwise we append the result to the result array and turn
-          # repeat mode on.
-          result << res
-          # We have completed the first iteration. Set the repeat mode flag
-          # to indicate that further iterations are already re-runs.
-          repeatMode = true
-        end # of rule processing loop
-
-        if recur
-          recur = false
-        else
-          #Log.exit('parseRuleNR', "Finished rule #{rule.name}")
-          recursionResult = result
-        end
-      end while !recursionStack.empty?
-
-      return result
-    end
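
The removed parseRuleNR shows the standard pattern for unrolling recursion with an explicit stack, which the new FSM design makes unnecessary. A minimal, generic Ruby illustration of the same technique, unrelated to the parser's actual data structures:

```ruby
# Generic recursion unrolling: sum a nested array without using the system
# stack. Each stack entry is either a value or more work to expand.
def deep_sum(tree)
  total = 0
  stack = [ tree ]
  until stack.empty?
    node = stack.pop
    if node.is_a?(Array)
      # "Recurse" by pushing the children instead of calling ourselves.
      node.each { |child| stack.push(child) }
    else
      total += node
    end
  end
  total
end

deep_sum([1, [2, [3, 4]], 5])  # => 15
```
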

+    def dumpStack
+      #puts "Stack level #{@stack.length}"
+      @stack.each do |sl|
+        print "#{@stack.index(sl)}: "
+        sl.each do |v|
+          if v.is_a?(Array)
+            begin
+              print "[#{v.join('|')}]|"
+            rescue
+              print "[#{v[0].class}...]|"
+            end
+          else
+            begin
+              print "#{v}|"
+            rescue
+              print v.class
+            end
+          end
+        end
+        print " -> #{sl.state ? sl.state.to_s(true) : 'nil'} " +
+              "#{sl.function.nil? ? '' : '(Called)'}"
+        puts ""
+      end
+    end
+
+    # Convert the FSM stack state entries from State objects into [ rule,
+    # pattern, index ] equivalents.
+    def saveFsmStack
+      return unless @stack
+
+      @stack.each do |s|
+        next unless (st = s.state)
+        s.state = [ st.rule, st.pattern, st.index ]
+      end
+    end
+
+    # Convert the FSM stack state entries from [ rule, pattern, index ]
+    # back into the respective State objects.
+    def restoreFsmStack
+      return unless @stack
+
+      @stack.each do |s|
+        next unless (state = @states[s.state])
+        raise "Stack restore failed. Cannot find state" unless state
+        s.state = state
+      end
+    end

     def getNextToken
       token = nextToken
       #Log << "Token: [#{token[0]}][#{token[1]}]"
-      if @badVariables.include?(token[0])
+      if @blockedVariables[token[0]]
         error('unsupported_token',
               "The token #{token[1]} is not supported in this context.",
               token[2])
       end
       token
     end

-    def findPattern(rule, token, repeatMode)
-      # The scanner cannot differentiate between keywords and identifiers.
-      # So whenever an identifier is returned we have to see if we have a
-      # matching keyword first. If none is found, then look for normal
-      # identifiers.
-      if token[0] == 'ID'
-        if (patIdx = rule.matchingPatternIndex('_' + token[1])).nil?
-          patIdx = rule.matchingPatternIndex("$ID")
-        end
-      elsif token[0] == 'LITERAL'
-        patIdx = rule.matchingPatternIndex('_' + token[1])
-      elsif token[0] == false
-        patIdx = rule.matchingPatternIndex('.')
-      else
-        patIdx = rule.matchingPatternIndex('$' + token[0])
-      end
-
-      # If no matching pattern is found for the token we have to check if
-      # the rule is optional or we are in repeat mode. If this is the case,
-      # return the token back to the scanner. Otherwise, we have found a
-      # token we cannot handle at this point.
-      if patIdx.nil?
-        # Append the list of expected tokens to the @@expectedTokens array.
-        # This may be used in a later rule to provide more details when an
-        # error occurred.
-        rule.transitions.each do |transition|
-          keys = transition.keys
-          keys.collect! { |key| key[1..-1] }
-          @@expectedTokens += keys
-          @@expectedTokens.sort!
-        end
-
-        unless rule.optional?(@rules) || repeatMode
-          error('unexpctd_token',
-                (token[0] != false ?
-                 "Unexpected token '#{token[1]}' of type " +
-                 "'#{token[0]}'. " :
-                 "Unexpected end of file in #{@scanner.fileName}. ") +
-                (@@expectedTokens.length > 1 ?
-                 "Expecting one of #{@@expectedTokens.join(', ')}" :
-                 "Expecting #{@@expectedTokens[0]}"), token[2])
-        end
-        returnToken(token)
-        return nil
-      end
-
-      rule.pattern(patIdx)
-    end
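
The removed findPattern documents a subtlety that the new design has to handle as well: the scanner cannot tell keywords from identifiers, so an ID token is first matched against keyword patterns ('_name') and only then against the generic '$ID' variable. A small illustration with made-up values:

```ruby
# Illustration only: disambiguating an ID token against keywords.
token = [ 'ID', 'task', nil ]     # [ type, value, source_file_info ]

candidates = [ '_task', '$ID' ]   # try the keyword first, then the variable
match = candidates.find do |c|
  c == "_#{token[1]}" || c == "$#{token[0]}"
end
match  # => '_task'
```
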
- text = "#{@@expectedTokens.join(', ')} or " + text - end - error('spec_keywork_expctd', text, token[2]) - end - @stack.last.store(elToken, token[2]) - elsif elType == ?. - if token[0..1] != [ '.', '<END>' ] - error('end_expected', - "Found garbage at expected end of text: #{token[1]}\n" + - "If you see this in the middle of your text, you probably " + - "have closed your context too early.", token[2]) - end - else - # The token must match the expected variable type. - if token[0] != elToken - text = "'#{elToken}' expected but found " + - "'#{token[1]}' (#{token[0]})." - unless @@expectedTokens.empty? - text = "#{@@expectedTokens.join(', ')} or " + text - end - error('spec_token_expctd', text, token[2]) - end - # If the element is a variable store the value of the token. - @stack.last.store(token[1], token[2]) - end - end - end end -