lib/RichTextScanner.rb in taskjuggler-0.0.2 vs lib/RichTextScanner.rb in taskjuggler-0.0.3
- old
+ new
@@ -37,13 +37,17 @@
# This flag is set to true whenever we are at the start of a text line.
@beginOfLine = true
# This is the position of the start of the currently processed line.
# It's only used for error reporting.
@lineStart = 0
- # Most of the wiki markup interpretation can be turned on/off by using
- # <nowiki>...</nowiki> in the text. This flag keeps this state.
- @wikiEnabled = true
+ # This variable stores the mode that the parser is operating in. The
+ # following modes are supported:
+ # :wiki : accept supported MediaWiki subset plus TJ extensions
+ # :nowiki : ignore most markup except for the </nowiki> token
+ # :funcarg : parse name and parameters of a block or inline parser
+ # function.
+ @mode = :wiki
# Enable to trigger printout instead of exception.
@debug = false
end
# This is a wrapper for nextToken only used for debugging.
@@ -60,174 +64,27 @@
tok = @tokenBuffer
@tokenBuffer = nil
return tok
end
- # Some characters have only a special meaning at the start of the line.
- # When the last token pushed the cursor into a new line, this flag is set
- # to true.
- if @beginOfLine && @wikiEnabled
- # Reset the flag again.
- @beginOfLine = false
-
- # We already know that the last newline was a real linebreak. Further
- # newlines can safely be ignored.
- readSequence("\n")
-
- case (c = nextChar)
- when '='
- # Headings start with 2 or more = and must be followed by a space.
- level = readSequenceMax('=', 4)
- if level == 1
- # 1 = does not mean anything. Push it back and process it as normal
- # text further down.
- returnChar
- else
- # Between the = characters and the title text must be exactly one
- # space.
- return [ "TITLE#{level - 1}", '=' * level ] if nextChar == ' '
- # If that's missing, The = are treated as normal text further down.
- returnChar(level + 1)
- end
- when '-'
- # Horizontal ruler. Must have exactly 4 -.
- level = readSequenceMax('-', 4)
- return [ "HLINE", '-' * 4 ] if level == 4
- returnChar(level)
- when '*'
- # Bullet lists start with one to three * characters.
- level = readSequenceMax('*')
- # Between the * characters and the bullet text must be exactly one
- # space.
- return [ "BULLET#{level}", '*' * level ] if nextChar == ' '
- # If that's missing, The # are treated as normal text further down.
- returnChar(level + 1)
- when '#'
- # Numbered list start with one to three # characters.
- level = readSequenceMax('#')
- # Between the # characters and the bullet text must be exactly one
- # space.
- return [ "NUMBER#{level}", '#' * level ] if nextChar == ' '
- # If that's missing, The # are treated as normal text further down.
- returnChar(level + 1)
- when ' '
- # Lines that start with a space are treated as verbatim text.
- return [ "PRE", readCode ] if (c = peek) && c != "\n"
- else
- # If the character is not a known control character we push it back
- # and treat it as normal text further down.
- returnChar
+ if @mode == :funcarg
+ return nextTokenFuncArg
+ end
+ if @beginOfLine && @mode == :wiki
+ if (res = nextTokenWikiBOL)
+ return res
end
end
- # Not all sequences of inline markup characters are control sequences. In
- # case we detect a sequence that has not the right number of characters,
- # we push them back and start over with this flag set to true.
- ignoreInlineMarkup = false
-
+ # Many inline control character sequences consist of multiple characters.
+ # In case of incomplete sequences, we roll back to the start character
+ # and set the ignoreInlineMarkup flag to simply treat them as normal
+ # text.
+ @ignoreInlineMarkup = false
loop do
- c = nextChar
- if c.nil?
- # We've reached the end of the text.
- return [ '.', '<END>' ]
- elsif c == ' ' || c == "\t"
- # Sequences of tabs or spaces are treated as token boundaries, but
- # otherwise they are ignored.
- readSequence(' ', "\t")
- return [ 'SPACE', ' ' ]
- elsif c == "'" && !ignoreInlineMarkup && @wikiEnabled
- # Sequence of 2 ' means italic, 3 ' means bold, 4 ' means monospaced
- # code, 5 ' means italic and bold. Anything else is just normal text.
- level = readSequenceMax("'", 5)
- if level == 2
- return [ 'ITALIC', "'" * level ]
- elsif level == 3
- return [ 'BOLD', "'" * level ]
- elsif level == 4
- return [ 'CODE', "'" * level ]
- elsif level == 5
- return [ 'BOLDITALIC', "'" * level ]
- else
- # We have not found the right syntax. Treat the found characters as
- # normal text. Push all ' back and start again but ignoring the '
- # code for once.
- returnChar(level)
- ignoreInlineMarkup = true
- next
- end
- elsif c == '=' && !ignoreInlineMarkup && @wikiEnabled
- level = readSequenceMax('=', 4)
- if level > 1
- return [ "TITLE#{level - 1}END", '=' * level ]
- else
- # We have not found the right syntax. Treat found characters as
- # normal text. Push all = back and start again but ignoring the =
- # code for once.
- returnChar(level)
- ignoreInlineMarkup = true
- next
- end
- elsif c == '[' && @wikiEnabled
- level = readSequenceMax('[', 2)
- return [ level == 1 ? 'HREF' : 'REF', '[' * level ]
- elsif c == ']' && @wikiEnabled
- level = readSequenceMax(']', 2)
- return [ level == 1 ? 'HREFEND' : 'REFEND', ']' * level ]
- elsif c == "\n"
- # Newlines are pretty important as they can terminate blocks and turn
- # the next character into the start of a control sequence.
- # Hard linebreaks consist of a newline followed by another newline or
- # any of the begin-of-line control characters.
- if (c = nextChar) && "\n*# =-".include?(c)
- returnChar if c != "\n"
- # The next character may be a control character.
- @beginOfLine = true
- return [ 'LINEBREAK', "\n" ]
- elsif c.nil?
- # We hit the end of the text.
- return [ '.', '<END>' ]
- else
- # Single line breaks are treated as spaces. Return the char after
- # the newline and start with this one again.
- returnChar
- return [ 'SPACE', ' ' ]
- end
- elsif c == '<'
- if peekMatch('nowiki>')
- # Turn most wiki markup interpretation off.
- @pos += 'nowiki>'.length
- @wikiEnabled = false
- next
- elsif peekMatch('/nowiki>')
- # Turn most wiki markup interpretation on.
- @pos += '/nowiki>'.length
- @wikiEnabled = true
- next
- else
- returnChar
- end
- else
- # Reset this flag again.
- ignoreInlineMarkup = false
- str = ''
- str << c
- # Now we can collect characters of a word until we hit a whitespace.
- while (c = nextChar) && !" \n\t".include?(c)
- if @wikiEnabled
- # Or at least to ' characters in a row.
- break if c == "'" && peek == "'"
- # Or a ] or <
- break if ']<'.include?(c)
- else
- # Make sure we find the </nowiki> tag even within a word.
- break if c == '<'
- end
- str << c
- end
- # Return the character that indicated the word end.
- returnChar
- return [ 'WORD', str ]
+ if res = (@mode == :wiki ? nextTokenWikiInline : nextTokenNoWikiInline)
+ return res
end
end
end
# Return the last issued token to the token buffer.
@@ -260,10 +117,260 @@
end
end
private
+ # Function arguments have the following formats:
+ # <[blockfunc par1="value1" par2='value2']>
+ # <-inlinefunc par1="value1" ... ->
+ def nextTokenFuncArg
+   # Scan the next token while @mode is :funcarg. Blanks are skipped. The
+   # result is always a [ type, text ] pair: '_=', 'STRING', 'ID',
+   # 'BLOCKFUNCEND', 'INLINEFUNCEND' or [ '.', '<END>' ] once the text is
+   # exhausted. Finding one of the end markers switches @mode back to :wiki.
+   while (c = nextChar)
+     case c
+     when ' ', "\n", "\t"
+       # Skip the whole run of blanks. Should readBlanks deliver a token, it
+       # is handed on to the caller.
+       tok = readBlanks(c)
+       return tok if tok
+     when '='
+       return [ '_=', '=' ]
+     when "'", '"'
+       # The opening quote also serves as the terminator of the string.
+       return readString(c)
+     when 'a'..'z', 'A'..'Z', '_'
+       return readId(c)
+     when ']'
+       # ']>' marks the end of a block function.
+       ahead = nextChar
+       if ahead == '>'
+         @mode = :wiki
+         return [ 'BLOCKFUNCEND', ']>' ]
+       end
+       # Not an end marker. Only push the look-ahead back if one was really
+       # read. nextChar does not move the cursor at the end of the text, so
+       # there is nothing to return in that case.
+       returnChar if ahead
+     when '-'
+       # '->' marks the end of an inline function.
+       ahead = nextChar
+       if ahead == '>'
+         @mode = :wiki
+         return [ 'INLINEFUNCEND', '->' ]
+       end
+       returnChar if ahead
+     end
+     # Any other character does not start a token and is skipped.
+   end
+
+   # The text ended before another token was found. The loop itself
+   # evaluates to nil, so the end token must be returned explicitly.
+   [ '.', '<END>' ]
+ end
+
+ def nextTokenWikiBOL
+   # Some characters have only a special meaning at the start of the line.
+   # This method looks for headings, horizontal rulers, bullet and numbered
+   # list items, the start of a block function and verbatim text. It returns
+   # the [ type, text ] token or nil if the line does not start with such a
+   # markup. In the nil case the characters read so far are pushed back so
+   # that they get processed as normal text. The only exception is a leading
+   # space that is not followed by verbatim text; it is dropped.
+
+   # The begin-of-line state is only valid for one token. Reset the flag.
+   @beginOfLine = false
+
+   # We already know that the last newline was a real linebreak. Further
+   # newlines can safely be ignored.
+   readSequence("\n")
+
+   # All the lead characters of a token here also need to be registered
+   # with nextTokenNewline!
+   case (c = nextChar)
+   when '='
+     # Headings start with 2 or more = and must be followed by a space.
+     level = readSequenceMax('=', 4)
+     if level == 1
+       # 1 = does not mean anything. Push it back and process it as normal
+       # text further down.
+       returnChar
+     else
+       # Between the = characters and the title text must be exactly one
+       # space.
+       ahead = nextChar
+       return [ "TITLE#{level - 1}", '=' * level ] if ahead == ' '
+       # If that's missing, the = are treated as normal text further down.
+       # At the end of the text no look-ahead character was consumed, so
+       # only the = themselves are pushed back.
+       returnChar(ahead ? level + 1 : level)
+     end
+   when '-'
+     # Horizontal ruler. Must have exactly 4 -.
+     level = readSequenceMax('-', 4)
+     return [ "HLINE", '-' * 4 ] if level == 4
+     returnChar(level)
+   when '*'
+     # Bullet lists start with one to three * characters.
+     level = readSequenceMax('*')
+     # Between the * characters and the bullet text must be exactly one
+     # space.
+     ahead = nextChar
+     return [ "BULLET#{level}", '*' * level ] if ahead == ' '
+     # If that's missing, the * are treated as normal text further down.
+     returnChar(ahead ? level + 1 : level)
+   when '#'
+     # Numbered list start with one to three # characters.
+     level = readSequenceMax('#')
+     # Between the # characters and the item text must be exactly one
+     # space.
+     ahead = nextChar
+     return [ "NUMBER#{level}", '#' * level ] if ahead == ' '
+     # If that's missing, the # are treated as normal text further down.
+     returnChar(ahead ? level + 1 : level)
+   when '<'
+     # This may be the start of a block generating function.
+     ahead = nextChar
+     if ahead == '['
+       # Switch the parser to block function argument parsing mode.
+       @mode = :funcarg
+       return [ 'BLOCKFUNCSTART', '<[' ]
+     end
+     # Maybe not. Push back the < and, if there was one, the look-ahead.
+     returnChar(ahead ? 2 : 1)
+   when ' '
+     # Lines that start with a space are treated as verbatim text.
+     return [ "PRE", readCode ] if (c = peek) && c != "\n"
+   else
+     # If the character is not a known control character we push it back
+     # and treat it as normal text further down. At the end of the text
+     # (c is nil) nothing was read, so there is nothing to push back.
+     returnChar if c
+   end
+
+   nil
+ end
+
+ def nextTokenWikiInline
+   # Scan one inline token in :wiki mode. The result is a [ type, text ]
+   # pair, or nil when no token was produced and the caller has to try
+   # again. nil is returned after an incomplete markup sequence has been
+   # pushed back or when nextTokenOpenAngle delivered nothing.
+   c = nextChar
+   # We've reached the end of the text.
+   return [ '.', '<END>' ] if c.nil?
+
+   # These characters are handled no matter what @ignoreInlineMarkup says.
+   case c
+   when ' ', "\t"
+     # Sequences of tabs or spaces are treated as token boundaries, but
+     # otherwise they are ignored.
+     readSequence(' ', "\t")
+     return [ 'SPACE', ' ' ]
+   when '['
+     count = readSequenceMax('[', 2)
+     return [ count == 1 ? 'HREF' : 'REF', '[' * count ]
+   when ']'
+     count = readSequenceMax(']', 2)
+     return [ count == 1 ? 'HREFEND' : 'REFEND', ']' * count ]
+   when "\n"
+     return nextTokenNewline
+   end
+
+   # The remaining control characters are plain text while the
+   # @ignoreInlineMarkup flag is set.
+   return nextTokenWord(c) if @ignoreInlineMarkup
+
+   case c
+   when "'"
+     # Sequence of 2 ' means italic, 3 ' means bold, 4 ' means monospaced
+     # code, 5 ' means italic and bold. Anything else is just normal text.
+     count = readSequenceMax("'", 5)
+     type = case count
+            when 2 then 'ITALIC'
+            when 3 then 'BOLD'
+            when 4 then 'CODE'
+            when 5 then 'BOLDITALIC'
+            end
+     return [ type, "'" * count ] if type
+
+     # Not a valid sequence. Push all ' back and have them scanned again
+     # as normal text.
+     returnChar(count)
+     @ignoreInlineMarkup = true
+     nil
+   when '='
+     count = readSequenceMax('=', 4)
+     return [ "TITLE#{count - 1}END", '=' * count ] if count > 1
+
+     # A single = is no markup. Push it back and have it scanned again as
+     # normal text.
+     returnChar(count)
+     @ignoreInlineMarkup = true
+     nil
+   when '<'
+     nextTokenOpenAngle
+   else
+     nextTokenWord(c)
+   end
+ end
+
+ def nextTokenNoWikiInline
+   # Scan one inline token in :nowiki mode. Only blanks, newlines and a <
+   # (as potential start of a nowiki tag) are special here. Everything else
+   # becomes part of a word. Returns a [ type, text ] pair or nil when
+   # nextTokenOpenAngle produced no token.
+   c = nextChar
+   # We've reached the end of the text.
+   return [ '.', '<END>' ] if c.nil?
+
+   case c
+   when ' ', "\t"
+     # Sequences of tabs or spaces are treated as token boundaries, but
+     # otherwise they are ignored.
+     readSequence(' ', "\t")
+     [ 'SPACE', ' ' ]
+   when "\n"
+     nextTokenNewline
+   when '<'
+     # A < that was pushed back before is scanned as the start of a word.
+     @ignoreInlineMarkup ? nextTokenWord(c) : nextTokenOpenAngle
+   else
+     nextTokenWord(c)
+   end
+ end
+
+ # We've just read a newline. Now we need to figure out whether this is a
+ # LINEBREAK or just a SPACE. This is determined by looking at the next
+ # character.
+ def nextTokenNewline
+   # Newlines are pretty important as they can terminate blocks and turn
+   # the next character into the start of a control sequence.
+   follower = nextChar
+   # We hit the end of the text.
+   return [ '.', '<END>' ] if follower.nil?
+
+   # Hard linebreaks consist of a newline followed by another newline or
+   # any of the begin-of-line control characters.
+   if "\n*#< =-".include?(follower)
+     # A control character must be scanned again by the begin-of-line
+     # code. A second newline is simply swallowed.
+     returnChar unless follower == "\n"
+     # The next character may be a control character.
+     @beginOfLine = true
+     return [ 'LINEBREAK', "\n" ]
+   end
+
+   # Single line breaks are treated as spaces. Return the char after the
+   # newline and start with this one again.
+   returnChar
+   [ 'SPACE', ' ' ]
+ end
+
+ def nextTokenOpenAngle
+   # A < has just been read. Check whether it starts a <nowiki> or
+   # </nowiki> tag and switch @mode accordingly. This method never produces
+   # a token, it always returns nil.
+   [
+     # Turn most wiki markup interpretation off.
+     [ 'nowiki>', :nowiki ],
+     # Turn most wiki markup interpretation on.
+     [ '/nowiki>', :wiki ]
+   ].each do |tag, newMode|
+     next unless peekMatch(tag)
+
+     # Skip the rest of the tag and change the mode.
+     @pos += tag.length
+     @mode = newMode
+     return nil
+   end
+
+   # We've not found a valid control sequence. Push back the character
+   # and make sure we treat it as a normal character.
+   @ignoreInlineMarkup = true
+   returnChar
+   nil
+ end
+
+ # _c_ does not match any start of a control sequence, so we read
+ # characters until we find the end of the word.
+ def nextTokenWord(c)
+   # Collect the characters of a word, starting with the already read _c_,
+   # and return them as [ 'WORD', text ]. The word ends at a blank, at the
+   # end of the text or at a character that may start markup in the current
+   # mode. The character that ended the word is pushed back.
+
+   # Reset this flag again.
+   @ignoreInlineMarkup = false
+   str = ''
+   str << c
+   # Now we can collect characters of a word until we hit a whitespace.
+   while (c = nextChar) && !" \n\t".include?(c)
+     if @mode == :wiki
+       # Or at least two ' characters in a row.
+       break if c == "'" && peek == "'"
+       # Or a ] or <
+       break if ']<'.include?(c)
+     else
+       # Make sure we find the </nowiki> tag even within a word.
+       break if c == '<'
+     end
+     str << c
+   end
+   # Return the character that indicated the word end. When the word was
+   # ended by the end of the text (c is nil), nextChar has not consumed
+   # anything and there is nothing to return. Pushing back regardless would
+   # deliver the last character of the word a second time.
+   returnChar if c
+   [ 'WORD', str ]
+ end
+
# Deliver the next character. Keep track of the cursor position. In case we
# reach the end, nil is returned.
def nextChar
return nil if @pos >= @textLength
c = @text[@pos]
@@ -315,11 +422,11 @@
# Return true if the next characters match exactly the character sequence in
# word.
def peekMatch(word)
+ # Compares the word.length characters that start at @pos with _word_.
+ # @pos is not modified, so no character is consumed by this check.
# Since Ruby 1.9 is returning Strings for String#[] we need to emulate
# this for Ruby 1.8.
- '' << @text[@pos, word.length] == word
+ # The parentheses only make the grouping explicit. << binds tighter than
+ # == in Ruby, so the result is the same as for the old line.
+ ('' << @text[@pos, word.length]) == word
end
# Read a sequence of characters that are all contained in the _chars_ Array.
# If a character is found that is not in _chars_ the method returns the so
# far found sequence of chars as String.
@@ -366,9 +473,46 @@
returnChar
@beginOfLine = true
tok
end
+ def readBlanks(c)
+   # Skip a run of spaces, tabs and newlines that starts with the already
+   # read character _c_. The first non-blank character is pushed back. This
+   # method always returns nil since blanks don't generate a token.
+   c = nextChar while c == ' ' || c == "\n" || c == "\t"
+   # At the end of the text (c is nil) nextChar has not consumed anything.
+   # Pushing back then would un-read the last blank again.
+   returnChar if c
+   nil
+ end
+
+ def readId(c)
+   # Read an identifier that starts with the already read character _c_ and
+   # continues with letters, digits or underscores. Returns
+   # [ 'ID', identifier ]. The character that ended the identifier is
+   # pushed back.
+   token = ""
+   token << c
+   while (c = nextChar) &&
+         (('a'..'z') === c || ('A'..'Z') === c || ('0'..'9') === c ||
+          c == '_')
+     token << c
+   end
+   # When the identifier is ended by the end of the text (c is nil) no
+   # character was consumed, so there is nothing to push back.
+   returnChar if c
+   [ 'ID', token ]
+ end
+
+ def readString(terminator)
+   # Read the characters up to, but not including, the next unescaped
+   # _terminator_ and return them as [ 'STRING', text ]. The terminator
+   # itself is consumed. If the text ends before the terminator is found,
+   # the characters read so far are returned.
+   token = ""
+   while (c = nextChar) && c != terminator
+     if c == "\\"
+       # Terminators can be used as regular characters when prefixed by a \.
+       escaped = nextChar
+       if escaped.nil?
+         # The \ was the last character of the text. Keep it and stop.
+         # Without this check nil would be appended to the token, which
+         # raises a TypeError.
+         token << "\\"
+         break
+       end
+       # \ followed by non-terminator. Just add both.
+       token << "\\" if escaped != terminator
+       c = escaped
+     end
+     token << c
+   end
+
+   [ 'STRING', token ]
+ end
end
# Exception raised by the RichTextScanner in case of processing errors. Its
# primary purpose is to carry the id, lineNo, error message and the currently
# parsed line information.