RichTextScanner.rb in taskjuggler-0.0.4

- old
+ new

@@ -1,11 +1,11 @@
 #!/usr/bin/env ruby -w
 # encoding: UTF-8
 #
 # = RichTextScanner.rb -- The TaskJuggler III Project Management Software
 #
-# Copyright (c) 2006, 2007, 2008, 2009 by Chris Schlaeger <cs@kde.org>
+# Copyright (c) 2006, 2007, 2008, 2009, 2010 by Chris Schlaeger <cs@kde.org>
 #
 # This program is free software; you can redistribute it and/or modify
 # it under the terms of version 2 of the GNU General Public License as
 # published by the Free Software Foundation.
 #
@@ -51,10 +51,11 @@
     end
 
     # This is a wrapper for nextToken only used for debugging.
     #def nextToken
     #  tok = nextTokenI
+    #  raise "Token Error:" unless tok && tok[0] && tok[1]
     #  puts "#{tok[0]}: #{tok[1]}"
     #  tok
     #end
 
     # Return the next token from the input text.
@@ -66,10 +67,12 @@
         return tok
       end
 
       if @mode == :funcarg
         return nextTokenFuncArg
+      elsif @mode == :href
+        return nextTokenHRef
       end
       if @beginOfLine && @mode == :wiki
         if (res = nextTokenWikiBOL)
           return res
         end
@@ -122,11 +125,11 @@
     # Function arguments have the following formats:
     #  <[blockfunc par1="value1" par2='value2']>
     #  <-inlinefunc par1="value1" ... ->
     def nextTokenFuncArg
       token = [ '.', '<END>' ]
-      while c = nextChar
+      while (c = nextChar)
         case c
         when ' ', "\n", "\t"
           if (tok = readBlanks(c))
             token = tok
             break
@@ -151,12 +154,48 @@
             return [ 'INLINEFUNCEND', '->' ]
           end
           returnChar
         end
       end
+      token
     end
 
+    # Return the next token in :href mode (entered after a single '['): SPACE,
+    # QUERY ('<-id->'), HREFEND (switches back to :wiki) or a nextTokenWord
+    # result. At the end of the text [ '.', '<END>' ] is returned.
+    def nextTokenHRef
+      token = [ '.', '<END>' ]
+      while (c = nextChar)
+        if c == ' ' || c == "\t" || c == "\n"
+          # Sequences of tabs, spaces and newlines are treated as token
+          # boundaries, but otherwise they are ignored.
+          readSequence(" \n\t")
+          return [ 'SPACE', ' ' ]
+        elsif c == '<' && !@ignoreInlineMarkup
+          if nextChar == '-' && isIdStart(peek(1))
+            token = readId('', 'QUERY')
+            unless nextChar == '-' && nextChar == '>'
+              error('unterminated_query',
+                    "Inline query must be terminated with '->'")
+            end
+            return token
+          else
+            # Not a query: push back both chars so '<' is re-read as a word.
+            returnChar(2)
+            @ignoreInlineMarkup = true
+            next
+          end
+        elsif c == ']'
+          @mode = :wiki
+          return [ 'HREFEND', ']' ]
+        else
+          return nextTokenWord(c)
+        end
+      end
+      token
+    end
+
     def nextTokenWikiBOL
       # Some characters have only a special meaning at the start of the line.
       # When the last token pushed the cursor into a new line, this flag is set
       # to true.
 
@@ -232,11 +271,11 @@
         # We've reached the end of the text.
         [ '.', '<END>' ]
       elsif c == ' ' || c == "\t"
         # Sequences of tabs or spaces are treated as token boundaries, but
         # otherwise they are ignored.
-        readSequence(' ', "\t")
+        readSequence(" \t")
         [ 'SPACE', ' ' ]
       elsif c == "'" && !@ignoreInlineMarkup
         # Sequence of 2 ' means italic, 3 ' means bold, 4 ' means monospaced
         # code, 5 ' means italic and bold. Anything else is just normal text.
         level = readSequenceMax("'", 5)
@@ -268,14 +307,19 @@
           @ignoreInlineMarkup = true
           nil
         end
       elsif c == '['
         level = readSequenceMax('[', 2)
-        [ level == 1 ? 'HREF' : 'REF', '[' * level ]
-      elsif c == ']'
-        level = readSequenceMax(']', 2)
-        [ level == 1 ? 'HREFEND' : 'REFEND', ']' * level ]
+        if level == 1
+          @mode = :href
+          [ 'HREF' , '[' ]
+        else
+          [ 'REF', '[[' ]
+        end
+      elsif c == ']' && peek == ']'
+        nextChar
+        [ 'REFEND', ']]' ]
       elsif c == "\n"
         nextTokenNewline
       elsif c == '<' && !@ignoreInlineMarkup
         nextTokenOpenAngle
       else
@@ -289,11 +333,11 @@
         # We've reached the end of the text.
         [ '.', '<END>' ]
       elsif c == ' ' || c == "\t"
         # Sequences of tabs or spaces are treated as token boundaries, but
         # otherwise they are ignored.
-        readSequence(' ', "\t")
+        readSequence(" \t")
         [ 'SPACE', ' ' ]
       elsif c == "\n"
         nextTokenNewline
       elsif c == '<' && !@ignoreInlineMarkup
         nextTokenOpenAngle
@@ -308,18 +352,28 @@
     def nextTokenNewline
       # Newlines are pretty important as they can terminate blocks and turn
       # the next character into the start of a control sequence.
       # Hard linebreaks consist of a newline followed by another newline or
       # any of the begin-of-line control characters.
-      if (c = nextChar) && "\n*#< =-".include?(c)
+      if (c = nextChar).nil?
+        # We hit the end of the text.
+        [ '.', '<END>' ]
+      elsif c == '<' && peekMatch('[')
+        # the '<' can be a start of a block (BLOCKFUNCSTART) or inline text
+        # (INLINEFUNCSTART). Only for the first case the linebreak is real.
         returnChar if c != "\n"
         # The next character may be a control character.
         @beginOfLine = true
         [ 'LINEBREAK', "\n" ]
-      elsif c.nil?
-        # We hit the end of the text.
-        [ '.', '<END>' ]
+      elsif "\n*# =-".include?(c)
+        # These characters correspond to the first characters of a block
+        # element. When they are found at the beginning of the line, the
+        # newline was really a line break.
+        returnChar if c != "\n"
+        # The next character may be a control character.
+        @beginOfLine = true
+        [ 'LINEBREAK', "\n" ]
       else
         # Single line breaks are treated as spaces. Return the char after
         # the newline and start with this one again.
         returnChar
         [ 'SPACE', ' ' ]
@@ -333,10 +387,15 @@
         @mode = :nowiki
       elsif peekMatch('/nowiki>')
         # Turn most wiki markup interpretation on.
         @pos += '/nowiki>'.length
         @mode = :wiki
+      elsif peekMatch('-') && @mode == :wiki
+        nextChar
+        # Switch the parser to function argument parsing mode.
+        @mode = :funcarg
+        return [ 'INLINEFUNCSTART', '<-' ]
       else
         # We've not found a valid control sequence. Push back the character
         # and make sure we treat it as a normal character.
         @ignoreInlineMarkup = true
         returnChar
@@ -351,15 +410,19 @@
       @ignoreInlineMarkup = false
       str = ''
       str << c
       # Now we can collect characters of a word until we hit a whitespace.
       while (c = nextChar) && !" \n\t".include?(c)
-        if @mode == :wiki
+        case @mode
+        when :wiki
           # Or at least to ' characters in a row.
           break if c == "'" && peek == "'"
-          # Or a ] or <
+          # Or a ] or < (a - does not end a word in :wiki mode).
           break if ']<'.include?(c)
+        when :href
+          # Stop at a - (possible start of the end mark ->), ] or <.
+          break if c == '-' || c == ']' || c == '<'
         else
           # Make sure we find the </nowiki> tag even within a word.
           break if c == '<'
         end
         str << c
@@ -370,11 +433,16 @@
     end
 
     # Deliver the next character. Keep track of the cursor position. In case we
     # reach the end, nil is returned.
     def nextChar
-      return nil if @pos >= @textLength
+      if @pos >= @textLength
+        # Correct @pos so that returnChar works properly but multiple reads of
+        # EOT are ignored.
+        @pos = @textLength + 1
+        return nil
+      end
       c = @text[@pos]
       @pos += 1
       if c == ?\n
         @lineNo += 1
         # Save the position of the line start for later use during error
@@ -428,13 +496,13 @@
     end
 
     # Read a sequence of characters that are all contained in the _chars_ Array.
     # If a character is found that is not in _chars_ the method returns the so
     # far found sequence of chars as String.
-    def readSequence(*chars)
+    def readSequence(chars)
       sequence = ''
-      while chars.include?(c = nextChar)
+      while (c = nextChar) && chars.index(c)
         sequence << c
       end
       # Push back the character that did no longer match.
       returnChar
       sequence
@@ -483,19 +551,23 @@
         end
         c = nextChar
       end
     end
 
-    def readId(c)
+    def isIdStart(c) # true if _c_ is within a-z or A-Z, or is '_'
+      (('a'..'z') === c || ('A'..'Z') === c || c == '_')
+    end
+
+    def readId(c, tokenType = 'ID') # _tokenType_: type tag of the returned token
       token = ""
       token << c
       while (c = nextChar) &&
             (('a'..'z') === c || ('A'..'Z') === c || ('0'..'9')  === c ||
              c == '_')
         token << c
       end
       returnChar
-      return [ 'ID', token ]
+      return [ tokenType, token ] # token is _c_ plus the [a-zA-Z0-9_] run read
     end
 
     def readString(terminator)
       token = ""
       while (c = nextChar) && c != terminator