RichTextScanner.rb in taskjuggler-0.0.3

- old
+ new

@@ -37,13 +37,17 @@
       # This flag is set to true whenever we are at the start of a text line.
       @beginOfLine = true
       # This is the position of the start of the currently processed line.
       # It's only used for error reporting.
       @lineStart = 0
-      # Most of the wiki markup interpretation can be turned on/off by using
-      # <nowiki>...</nowiki> in the text. This flag keeps this state.
-      @wikiEnabled = true
+      # This variable stores the mode that the parser is operating in. The
+      # following modes are supported:
+      # :wiki : accept supported MediaWiki subset plus TJ extensions
+      # :nowiki : ignore most markup except for the </nowiki> token
+      # :funcarg : parse name and parameters of a block or inline parser
+      # function.
+      @mode = :wiki
       # Enable to trigger printout instead of exception.
       @debug = false
     end
 
     # This is a wrapper for nextToken only used for debugging.
@@ -60,174 +64,27 @@
         tok = @tokenBuffer
         @tokenBuffer = nil
         return tok
       end
 
-      # Some characters have only a special meaning at the start of the line.
-      # When the last token pushed the cursor into a new line, this flag is set
-      # to true.
-      if @beginOfLine && @wikiEnabled
-        # Reset the flag again.
-        @beginOfLine = false
-
-        # We already know that the last newline was a real linebreak. Further
-        # newlines can safely be ignored.
-        readSequence("\n")
-
-        case (c = nextChar)
-        when '='
-          # Headings start with 2 or more = and must be followed by a space.
-          level = readSequenceMax('=', 4)
-          if level == 1
-            # 1 = does not mean anything. Push it back and process it as normal
-            # text further down.
-            returnChar
-          else
-            # Between the = characters and the title text must be exactly one
-            # space.
-            return [ "TITLE#{level - 1}", '=' * level ] if nextChar == ' '
-            # If that's missing, The = are treated as normal text further down.
-            returnChar(level + 1)
-          end
-        when '-'
-          # Horizontal ruler. Must have exactly 4 -.
-          level = readSequenceMax('-', 4)
-          return [ "HLINE", '-' * 4 ] if level == 4
-          returnChar(level)
-        when '*'
-          # Bullet lists start with one to three * characters.
-          level = readSequenceMax('*')
-          # Between the * characters and the bullet text must be exactly one
-          # space.
-          return [ "BULLET#{level}", '*' * level ] if nextChar == ' '
-          # If that's missing, The # are treated as normal text further down.
-          returnChar(level + 1)
-        when '#'
-          # Numbered list start with one to three # characters.
-          level = readSequenceMax('#')
-          # Between the # characters and the bullet text must be exactly one
-          # space.
-          return [ "NUMBER#{level}", '#' * level ] if nextChar == ' '
-          # If that's missing, The # are treated as normal text further down.
-          returnChar(level + 1)
-        when ' '
-          # Lines that start with a space are treated as verbatim text.
-          return [ "PRE", readCode ] if (c = peek) && c != "\n"
-        else
-          # If the character is not a known control character we push it back
-          # and treat it as normal text further down.
-          returnChar
+      if @mode == :funcarg
+        return nextTokenFuncArg
+      end
+      if @beginOfLine && @mode == :wiki
+        if (res = nextTokenWikiBOL)
+          return res
         end
       end
 
-      # Not all sequences of inline markup characters are control sequences. In
-      # case we detect a sequence that has not the right number of characters,
-      # we push them back and start over with this flag set to true.
-      ignoreInlineMarkup = false
-
+      # Many inline control character sequences consist of multiple characters.
+      # In case of incomplete sequences, we roll back to the start character
+      # and set the ignoreInlineMarkup flag to simply treat them as normal
+      # text.
+      @ignoreInlineMarkup = false
       loop do
-        c = nextChar
-        if c.nil?
-          # We've reached the end of the text.
-          return [ '.', '<END>' ]
-        elsif c == ' ' || c == "\t"
-          # Sequences of tabs or spaces are treated as token boundaries, but
-          # otherwise they are ignored.
-          readSequence(' ', "\t")
-          return [ 'SPACE', ' ' ]
-        elsif c == "'" && !ignoreInlineMarkup && @wikiEnabled
-          # Sequence of 2 ' means italic, 3 ' means bold, 4 ' means monospaced
-          # code, 5 ' means italic and bold. Anything else is just normal text.
-          level = readSequenceMax("'", 5)
-          if level == 2
-            return [ 'ITALIC', "'" * level ]
-          elsif level == 3
-            return [ 'BOLD', "'" * level ]
-          elsif level == 4
-            return [ 'CODE', "'" * level ]
-          elsif level == 5
-            return [ 'BOLDITALIC', "'" * level ]
-          else
-            # We have not found the right syntax. Treat the found characters as
-            # normal text.  Push all ' back and start again but ignoring the '
-            # code for once.
-            returnChar(level)
-            ignoreInlineMarkup = true
-            next
-          end
-        elsif c == '=' && !ignoreInlineMarkup && @wikiEnabled
-          level = readSequenceMax('=', 4)
-          if level > 1
-            return [ "TITLE#{level - 1}END", '=' * level ]
-          else
-            # We have not found the right syntax. Treat found characters as
-            # normal text.  Push all = back and start again but ignoring the =
-            # code for once.
-            returnChar(level)
-            ignoreInlineMarkup = true
-            next
-          end
-        elsif c == '[' && @wikiEnabled
-          level = readSequenceMax('[', 2)
-          return [ level == 1 ? 'HREF' : 'REF', '[' * level ]
-        elsif c == ']' && @wikiEnabled
-          level = readSequenceMax(']', 2)
-          return [ level == 1 ? 'HREFEND' : 'REFEND', ']' * level ]
-        elsif c == "\n"
-          # Newlines are pretty important as they can terminate blocks and turn
-          # the next character into the start of a control sequence.
-          # Hard linebreaks consist of a newline followed by another newline or
-          # any of the begin-of-line control characters.
-          if (c = nextChar) && "\n*# =-".include?(c)
-            returnChar if c != "\n"
-            # The next character may be a control character.
-            @beginOfLine = true
-            return [ 'LINEBREAK', "\n" ]
-          elsif c.nil?
-            # We hit the end of the text.
-            return [ '.', '<END>' ]
-          else
-            # Single line breaks are treated as spaces. Return the char after
-            # the newline and start with this one again.
-            returnChar
-            return [ 'SPACE', ' ' ]
-          end
-        elsif c == '<'
-          if peekMatch('nowiki>')
-            # Turn most wiki markup interpretation off.
-            @pos += 'nowiki>'.length
-            @wikiEnabled = false
-            next
-          elsif peekMatch('/nowiki>')
-            # Turn most wiki markup interpretation on.
-            @pos += '/nowiki>'.length
-            @wikiEnabled = true
-            next
-          else
-            returnChar
-          end
-        else
-          # Reset this flag again.
-          ignoreInlineMarkup = false
-          str = ''
-          str << c
-          # Now we can collect characters of a word until we hit a whitespace.
-          while (c = nextChar) && !" \n\t".include?(c)
-            if @wikiEnabled
-              # Or at least to ' characters in a row.
-              break if c == "'" && peek == "'"
-              # Or a ] or <
-              break if ']<'.include?(c)
-            else
-              # Make sure we find the </nowiki> tag even within a word.
-              break if c == '<'
-            end
-            str << c
-          end
-          # Return the character that indicated the word end.
-          returnChar
-          return [ 'WORD', str ]
+        if res = (@mode == :wiki ? nextTokenWikiInline : nextTokenNoWikiInline)
+          return res
         end
       end
     end
 
     # Return the last issued token to the token buffer.
@@ -260,10 +117,260 @@
       end
     end
 
   private
 
+    # Function arguments have the following formats:
+    #  <[blockfunc par1="value1" par2='value2']>
+    #  <-inlinefunc par1="value1" ... ->
+    def nextTokenFuncArg
+      token = [ '.', '<END>' ]
+      while c = nextChar
+        case c
+        when ' ', "\n", "\t"
+          if (tok = readBlanks(c))
+            token = tok
+            break
+          end
+        when '='
+          return [ '_=', '=' ]
+        when "'"
+          return readString(c)
+        when '"'
+          return readString(c)
+        when 'a'..'z', 'A'..'Z', '_'
+          return readId(c)
+        when ']'
+          if nextChar == '>'
+            @mode = :wiki
+            return [ 'BLOCKFUNCEND', ']>' ]
+          end
+          returnChar
+        when '-'
+          if nextChar == '>'
+            @mode = :wiki
+            return [ 'INLINEFUNCEND', '->' ]
+          end
+          returnChar
+        end
+      end
+    end
+
+    # Scan for markup that is only recognized at the start of a line. The
+    # method clears @beginOfLine, skips further newlines and then inspects the
+    # first character of the line. It returns a [ type, value ] token pair
+    # when begin-of-line markup was found (TITLE<n>, HLINE, BULLET<n>,
+    # NUMBER<n>, BLOCKFUNCSTART or PRE). In all other cases nil is returned so
+    # that the caller continues with the inline scanning.
+    def nextTokenWikiBOL
+      # Some characters have only a special meaning at the start of the line.
+      # When the last token pushed the cursor into a new line, the
+      # @beginOfLine flag is set to true.
+
+      # Reset the flag again.
+      @beginOfLine = false
+
+      # We already know that the last newline was a real linebreak. Further
+      # newlines can safely be ignored.
+      readSequence("\n")
+
+      # All the lead characters of a token here also need to be registered
+      # in the character set that nextTokenNewline checks!
+      case (c = nextChar)
+      when '='
+        # Headings start with 2 or more = and must be followed by a space.
+        level = readSequenceMax('=', 4)
+        if level == 1
+          # 1 = does not mean anything. Push it back and process it as normal
+          # text further down.
+          returnChar
+        else
+          # Between the = characters and the title text must be exactly one
+          # space.
+          return [ "TITLE#{level - 1}", '=' * level ] if nextChar == ' '
+          # If that's missing, the = are treated as normal text further down.
+          # We push back the = characters plus the character read after them.
+          returnChar(level + 1)
+        end
+      when '-'
+        # Horizontal ruler. Must have exactly 4 -.
+        level = readSequenceMax('-', 4)
+        return [ "HLINE", '-' * 4 ] if level == 4
+        returnChar(level)
+      when '*'
+        # Bullet lists start with one to three * characters.
+        level = readSequenceMax('*')
+        # Between the * characters and the bullet text must be exactly one
+        # space.
+        return [ "BULLET#{level}", '*' * level ] if nextChar == ' '
+        # If that's missing, the * are treated as normal text further down.
+        returnChar(level + 1)
+      when '#'
+        # Numbered list start with one to three # characters.
+        level = readSequenceMax('#')
+        # Between the # characters and the bullet text must be exactly one
+        # space.
+        return [ "NUMBER#{level}", '#' * level ] if nextChar == ' '
+        # If that's missing, the # are treated as normal text further down.
+        returnChar(level + 1)
+      when '<'
+        # This may be the start of a block generating function.
+        if nextChar == '['
+          # Switch the parser to block function argument parsing mode.
+          @mode = :funcarg
+          return [ 'BLOCKFUNCSTART', '<[' ]
+        end
+        # Maybe not. Push back the '<' and the character that followed it.
+        returnChar(2)
+      when ' '
+        # Lines that start with a space are treated as verbatim text. When
+        # peek delivers nothing or a newline, no token is generated and the
+        # space is not pushed back.
+        return [ "PRE", readCode ] if (c = peek) && c != "\n"
+      else
+        # If the character is not a known control character we push it back
+        # and treat it as normal text further down.
+        returnChar
+      end
+
+      # No begin-of-line markup was found.
+      return nil
+    end
+
+    def nextTokenWikiInline
+      c = nextChar
+      if c.nil?
+        # We've reached the end of the text.
+        [ '.', '<END>' ]
+      elsif c == ' ' || c == "\t"
+        # Sequences of tabs or spaces are treated as token boundaries, but
+        # otherwise they are ignored.
+        readSequence(' ', "\t")
+        [ 'SPACE', ' ' ]
+      elsif c == "'" && !@ignoreInlineMarkup
+        # Sequence of 2 ' means italic, 3 ' means bold, 4 ' means monospaced
+        # code, 5 ' means italic and bold. Anything else is just normal text.
+        level = readSequenceMax("'", 5)
+        if level == 2
+          [ 'ITALIC', "'" * level ]
+        elsif level == 3
+          [ 'BOLD', "'" * level ]
+        elsif level == 4
+          [ 'CODE', "'" * level ]
+        elsif level == 5
+          [ 'BOLDITALIC', "'" * level ]
+        else
+          # We have not found the right syntax. Treat the found characters as
+          # normal text.  Push all ' back and start again but ignoring the '
+          # code for once.
+          returnChar(level)
+          @ignoreInlineMarkup = true
+          nil
+        end
+      elsif c == '=' && !@ignoreInlineMarkup
+        level = readSequenceMax('=', 4)
+        if level > 1
+          [ "TITLE#{level - 1}END", '=' * level ]
+        else
+          # We have not found the right syntax. Treat found characters as
+          # normal text.  Push all = back and start again but ignoring the =
+          # code for once.
+          returnChar(level)
+          @ignoreInlineMarkup = true
+          nil
+        end
+      elsif c == '['
+        level = readSequenceMax('[', 2)
+        [ level == 1 ? 'HREF' : 'REF', '[' * level ]
+      elsif c == ']'
+        level = readSequenceMax(']', 2)
+        [ level == 1 ? 'HREFEND' : 'REFEND', ']' * level ]
+      elsif c == "\n"
+        nextTokenNewline
+      elsif c == '<' && !@ignoreInlineMarkup
+        nextTokenOpenAngle
+      else
+        nextTokenWord(c)
+      end
+    end
+
+    def nextTokenNoWikiInline
+      c = nextChar
+      if c.nil?
+        # We've reached the end of the text.
+        [ '.', '<END>' ]
+      elsif c == ' ' || c == "\t"
+        # Sequences of tabs or spaces are treated as token boundaries, but
+        # otherwise they are ignored.
+        readSequence(' ', "\t")
+        [ 'SPACE', ' ' ]
+      elsif c == "\n"
+        nextTokenNewline
+      elsif c == '<' && !@ignoreInlineMarkup
+        nextTokenOpenAngle
+      else
+        nextTokenWord(c)
+      end
+    end
+
+    # We've just read a newline. Now we need to figure out whether this is a
+    # LINEBREAK or just a SPACE. This is determined by looking at the next
+    # character.
+    def nextTokenNewline
+      # Newlines are pretty important as they can terminate blocks and turn
+      # the next character into the start of a control sequence.
+      # Hard linebreaks consist of a newline followed by another newline or
+      # any of the begin-of-line control characters.
+      if (c = nextChar) && "\n*#< =-".include?(c)
+        returnChar if c != "\n"
+        # The next character may be a control character.
+        @beginOfLine = true
+        [ 'LINEBREAK', "\n" ]
+      elsif c.nil?
+        # We hit the end of the text.
+        [ '.', '<END>' ]
+      else
+        # Single line breaks are treated as spaces. Return the char after
+        # the newline and start with this one again.
+        returnChar
+        [ 'SPACE', ' ' ]
+      end
+    end
+
+    def nextTokenOpenAngle
+      if peekMatch('nowiki>')
+        # Turn most wiki markup interpretation off.
+        @pos += 'nowiki>'.length
+        @mode = :nowiki
+      elsif peekMatch('/nowiki>')
+        # Turn most wiki markup interpretation on.
+        @pos += '/nowiki>'.length
+        @mode = :wiki
+      else
+        # We've not found a valid control sequence. Push back the character
+        # and make sure we treat it as a normal character.
+        @ignoreInlineMarkup = true
+        returnChar
+      end
+      nil
+    end
+
+    # _c_ does not match any start of a control sequence, so we read
+    # characters until we find the end of the word.
+    def nextTokenWord(c)
+      # Reset this flag again.
+      @ignoreInlineMarkup = false
+      str = ''
+      str << c
+      # Now we can collect characters of a word until we hit a whitespace.
+      while (c = nextChar) && !" \n\t".include?(c)
+        if @mode == :wiki
+          # Or at least to ' characters in a row.
+          break if c == "'" && peek == "'"
+          # Or a ] or <
+          break if ']<'.include?(c)
+        else
+          # Make sure we find the </nowiki> tag even within a word.
+          break if c == '<'
+        end
+        str << c
+      end
+      # Return the character that indicated the word end.
+      returnChar
+      [ 'WORD', str ]
+    end
+
     # Deliver the next character. Keep track of the cursor position. In case we
     # reach the end, nil is returned.
     def nextChar
       return nil if @pos >= @textLength
       c = @text[@pos]
@@ -315,11 +422,11 @@
     # Return true if the characters of the text that start at the current
     # cursor position (@pos) match exactly the character sequence in _word_.
     # The cursor position is not changed.
     def peekMatch(word)
       # Since Ruby 1.9 is returning Strings for String#[] we need to emulate
       # this for Ruby 1.8 by appending the slice to an empty String before
       # it is compared with _word_.
-      '' << @text[@pos, word.length] == word
+      ('' << @text[@pos, word.length]) == word
     end
 
     # Read a sequence of characters that are all contained in the _chars_ Array.
     # If a character is found that is not in _chars_ the method returns the so
     # far found sequence of chars as String.
@@ -366,9 +473,46 @@
       returnChar
       @beginOfLine = true
       tok
     end
 
+    def readBlanks(c)
+      loop do
+        if c != ' ' && c != "\n" && c != "\t"
+          returnChar
+          return nil
+        end
+        c = nextChar
+      end
+    end
+
+    def readId(c)
+      token = ""
+      token << c
+      while (c = nextChar) &&
+            (('a'..'z') === c || ('A'..'Z') === c || ('0'..'9')  === c ||
+             c == '_')
+        token << c
+      end
+      returnChar
+      return [ 'ID', token ]
+    end
+
+    def readString(terminator)
+      token = ""
+      while (c = nextChar) && c != terminator
+        if c == "\\"
+          # Terminators can be used as regular characters when prefixed by a \.
+          if (c = nextChar) && c != terminator
+            # \ followed by non-terminator. Just add both.
+            token << "\\"
+          end
+        end
+        token << c
+      end
+
+      [ 'STRING', token ]
+    end
   end
 
   # Exception raised by the RichTextScanner in case of processing errors. Its
   # primary purpose is to carry the id, lineNo, error message and the currently
   # parsed line information.