lib/RichTextScanner.rb in taskjuggler-0.0.2 vs lib/RichTextScanner.rb in taskjuggler-0.0.3

- old
+ new

# Token-level helper methods of the RichTextScanner.
#
# The scanner keeps the mode it is operating in in @mode. The following modes
# are supported:
#   :wiki    : accept supported MediaWiki subset plus TJ extensions
#   :nowiki  : ignore most markup except for the </nowiki> token
#   :funcarg : parse name and parameters of a block or inline parser function
#
# nextToken (defined outside of this section) dispatches on these: in :funcarg
# mode it returns the result of nextTokenFuncArg; at the begin of a line in
# :wiki mode it first tries nextTokenWikiBOL; otherwise it resets
# @ignoreInlineMarkup and loops over nextTokenWikiInline (:wiki) or
# nextTokenNoWikiInline (any other mode) until one of them returns a token.
#
# Every token is a two-element Array: [ token type, matched text ]. The
# helpers nextChar, returnChar, peek, readSequence, readSequenceMax and
# readCode are defined elsewhere in the class.

private

# Scan the next token while in :funcarg mode. Function arguments have the
# following formats:
#   <[blockfunc par1="value1" par2='value2']>
#   <-inlinefunc par1="value1" ... ->
# Blanks are skipped and characters that don't start a known token are
# dropped. Finding ']>' or '->' switches @mode back to :wiki. When the end of
# the text is reached the END token is returned, just like the other
# tokenizers do (the method previously fell off the while loop and returned
# nil in that case).
def nextTokenFuncArg
  while (c = nextChar)
    case c
    when ' ', "\n", "\t"
      # readBlanks consumes the whole run of blanks. It has no token to
      # report, but should it ever return one we pass it on.
      if (tok = readBlanks(c))
        return tok
      end
    when '='
      return [ '_=', '=' ]
    when "'", '"'
      # The quote character that opened the string also terminates it.
      return readString(c)
    when 'a'..'z', 'A'..'Z', '_'
      return readId(c)
    when ']'
      if nextChar == '>'
        @mode = :wiki
        return [ 'BLOCKFUNCEND', ']>' ]
      end
      # Not the end of a block function. Push back the character that
      # followed the ].
      returnChar
    when '-'
      if nextChar == '>'
        @mode = :wiki
        return [ 'INLINEFUNCEND', '->' ]
      end
      # Not the end of an inline function. Push back the character that
      # followed the -.
      returnChar
    end
  end

  # We've reached the end of the text.
  [ '.', '<END>' ]
end

# Some characters have only a special meaning at the start of the line. This
# method is called when the last token pushed the cursor into a new line. It
# returns the begin-of-line token that was found or nil if the line starts
# with normal text. In the latter case the characters are pushed back so
# that they get processed as inline text.
def nextTokenWikiBOL
  # Reset the flag again.
  @beginOfLine = false

  # We already know that the last newline was a real linebreak. Further
  # newlines can safely be ignored.
  readSequence("\n")

  # All the lead characters of a token here also need to be registered
  # with nextTokenNewline!
  case nextChar
  when '='
    # Headings start with 2 or more = and must be followed by a space.
    level = readSequenceMax('=', 4)
    if level == 1
      # 1 = does not mean anything. Push it back and process it as normal
      # text.
      returnChar
    else
      # Between the = characters and the title text must be exactly one
      # space.
      return [ "TITLE#{level - 1}", '=' * level ] if nextChar == ' '
      # If that's missing, the = are treated as normal text.
      returnChar(level + 1)
    end
  when '-'
    # Horizontal ruler. Must have exactly 4 -.
    level = readSequenceMax('-', 4)
    return [ "HLINE", '-' * 4 ] if level == 4
    returnChar(level)
  when '*'
    # Bullet lists start with one to three * characters.
    level = readSequenceMax('*')
    # Between the * characters and the bullet text must be exactly one
    # space.
    return [ "BULLET#{level}", '*' * level ] if nextChar == ' '
    # If that's missing, the * are treated as normal text.
    returnChar(level + 1)
  when '#'
    # Numbered lists start with one to three # characters.
    level = readSequenceMax('#')
    # Between the # characters and the item text must be exactly one space.
    return [ "NUMBER#{level}", '#' * level ] if nextChar == ' '
    # If that's missing, the # are treated as normal text.
    returnChar(level + 1)
  when '<'
    # This may be the start of a block generating function.
    if nextChar == '['
      # Switch the parser to block function argument parsing mode.
      @mode = :funcarg
      return [ 'BLOCKFUNCSTART', '<[' ]
    end
    # Maybe not. Push back the < and the character that followed it.
    returnChar(2)
  when ' '
    # Lines that start with a space are treated as verbatim text.
    following = peek
    return [ "PRE", readCode ] if following && following != "\n"
  else
    # If the character is not a known control character we push it back
    # and treat it as normal text.
    returnChar
  end

  nil
end

# Scan the next inline token while in :wiki mode. Returns the token or nil
# when the characters have been pushed back and the caller has to try again
# (with @ignoreInlineMarkup set, so the characters are then read as a plain
# word).
def nextTokenWikiInline
  c = nextChar
  if c.nil?
    # We've reached the end of the text.
    [ '.', '<END>' ]
  elsif c == ' ' || c == "\t"
    # Sequences of tabs or spaces are treated as token boundaries, but
    # otherwise they are ignored.
    readSequence(' ', "\t")
    [ 'SPACE', ' ' ]
  elsif c == "'" && !@ignoreInlineMarkup
    # Sequence of 2 ' means italic, 3 ' means bold, 4 ' means monospaced
    # code, 5 ' means italic and bold. Anything else is just normal text.
    level = readSequenceMax("'", 5)
    case level
    when 2 then [ 'ITALIC', "'" * level ]
    when 3 then [ 'BOLD', "'" * level ]
    when 4 then [ 'CODE', "'" * level ]
    when 5 then [ 'BOLDITALIC', "'" * level ]
    else
      # We have not found the right syntax. Treat the found characters as
      # normal text. Push all ' back and start again but ignoring the '
      # code for once.
      returnChar(level)
      @ignoreInlineMarkup = true
      nil
    end
  elsif c == '=' && !@ignoreInlineMarkup
    level = readSequenceMax('=', 4)
    if level > 1
      [ "TITLE#{level - 1}END", '=' * level ]
    else
      # We have not found the right syntax. Treat found characters as
      # normal text. Push all = back and start again but ignoring the =
      # code for once.
      returnChar(level)
      @ignoreInlineMarkup = true
      nil
    end
  elsif c == '['
    level = readSequenceMax('[', 2)
    [ level == 1 ? 'HREF' : 'REF', '[' * level ]
  elsif c == ']'
    level = readSequenceMax(']', 2)
    [ level == 1 ? 'HREFEND' : 'REFEND', ']' * level ]
  elsif c == "\n"
    nextTokenNewline
  elsif c == '<' && !@ignoreInlineMarkup
    nextTokenOpenAngle
  else
    nextTokenWord(c)
  end
end

# Scan the next inline token while wiki markup is turned off. Only blanks,
# newlines and a possible <nowiki> or </nowiki> tag are recognized here;
# everything else is read as a word. Returns the token or nil when the
# caller has to try again.
def nextTokenNoWikiInline
  c = nextChar
  if c.nil?
    # We've reached the end of the text.
    [ '.', '<END>' ]
  elsif c == ' ' || c == "\t"
    # Sequences of tabs or spaces are treated as token boundaries, but
    # otherwise they are ignored.
    readSequence(' ', "\t")
    [ 'SPACE', ' ' ]
  elsif c == "\n"
    nextTokenNewline
  elsif c == '<' && !@ignoreInlineMarkup
    nextTokenOpenAngle
  else
    nextTokenWord(c)
  end
end

# We've just read a newline. Now we need to figure out whether this is a
# LINEBREAK or just a SPACE. This is determined by looking at the next
# character.
def nextTokenNewline
  # Newlines are pretty important as they can terminate blocks and turn
  # the next character into the start of a control sequence.
  # Hard linebreaks consist of a newline followed by another newline or
  # any of the begin-of-line control characters.
  if (c = nextChar) && "\n*#< =-".include?(c)
    # A second newline is swallowed, a control character is pushed back.
    returnChar if c != "\n"
    # The next character may be a control character.
    @beginOfLine = true
    [ 'LINEBREAK', "\n" ]
  elsif c.nil?
    # We hit the end of the text.
    [ '.', '<END>' ]
  else
    # Single line breaks are treated as spaces. Return the char after
    # the newline and start with this one again.
    returnChar
    [ 'SPACE', ' ' ]
  end
end

# We've just read a <. If it starts a <nowiki> or </nowiki> tag, the tag is
# consumed and @mode is switched accordingly. Otherwise the < is pushed back
# and @ignoreInlineMarkup is set so that it is read as normal text on the
# next attempt. This method never produces a token; it always returns nil.
def nextTokenOpenAngle
  if peekMatch('nowiki>')
    # Turn most wiki markup interpretation off.
    @pos += 'nowiki>'.length
    @mode = :nowiki
  elsif peekMatch('/nowiki>')
    # Turn most wiki markup interpretation on.
    @pos += '/nowiki>'.length
    @mode = :wiki
  else
    # We've not found a valid control sequence. Push back the character
    # and make sure we treat it as a normal character.
    @ignoreInlineMarkup = true
    returnChar
  end
  nil
end

# _c_ does not match any start of a control sequence, so we read
# characters until we find the end of the word. Returns a WORD token.
def nextTokenWord(c)
  # Reset this flag again.
  @ignoreInlineMarkup = false
  str = ''
  str << c
  # Now we can collect characters of a word until we hit a whitespace.
  while (c = nextChar) && !" \n\t".include?(c)
    if @mode == :wiki
      # Or at least two ' characters in a row.
      break if c == "'" && peek == "'"
      # Or a ] or <
      break if ']<'.include?(c)
    else
      # Make sure we find the </nowiki> tag even within a word.
      break if c == '<'
    end
    str << c
  end
  # Return the character that indicated the word end.
  returnChar
  [ 'WORD', str ]
end

# Return true if the next characters match exactly the character sequence in
# word.
def peekMatch(word)
  # Since Ruby 1.9 is returning Strings for String#[] we need to emulate
  # this for Ruby 1.8.
  ('' << @text[@pos, word.length]) == word
end

# Skip a run of blanks (space, newline or tab). _c_ is the character that
# was read last. The first character that is not a blank is pushed back
# again. There is no token for blanks, so the result is always nil.
def readBlanks(c)
  loop do
    if c != ' ' && c != "\n" && c != "\t"
      returnChar
      return nil
    end
    c = nextChar
  end
end

# Read an identifier that starts with the already read character _c_ and
# continues with letters, digits or underscores. The character that ended
# the identifier is pushed back. Returns an ID token.
def readId(c)
  token = ""
  token << c
  while (c = nextChar) &&
        (('a'..'z') === c || ('A'..'Z') === c || ('0'..'9') === c ||
         c == '_')
    token << c
  end
  returnChar
  [ 'ID', token ]
end

# Read a string up to (and including) the _terminator_ character. The
# opening quote must have been read already. Returns a STRING token with
# the text between the quotes. A string that is not terminated before the
# end of the text is returned as far as it was read.
def readString(terminator)
  token = ""
  while (c = nextChar) && c != terminator
    if c == "\\"
      # Terminators can be used as regular characters when prefixed by a \.
      c = nextChar
      if c.nil?
        # The \ was the very last character of the text. Keep it and stop
        # here instead of trying to append nil to the String.
        token << "\\"
        break
      end
      # \ followed by non-terminator. Just add both.
      token << "\\" if c != terminator
    end
    token << c
  end

  [ 'STRING', token ]
end