lib/RichTextScanner.rb in taskjuggler-0.0.6 vs lib/RichTextScanner.rb in taskjuggler-0.0.7

- old
+ new

@@ -9,625 +9,204 @@ # it under the terms of version 2 of the GNU General Public License as # published by the Free Software Foundation. # require 'UTF8String' +require 'TextScanner' class TaskJuggler # The RichTextScanner is used by the RichTextParser to chop the input text - # into digestable tokens. The parser and the scanner only communicate over - # RichTextScanner#nextToken and RichTextScanner#returnToken. The scanner can - # break the text into words and special tokens. - class RichTextScanner + # into digestable tokens. It specializes the TextScanner class for RichText + # syntax. The scanner can operate in various modes. The current mode is + # context dependent. The following modes are supported: + # + # :bop : at the begining of a paragraph. + # :bol : at the begining of a line. + # :inline : in the middle of a line + # :nowiki : ignoring all MediaWiki special tokens + # :ref : inside of a REF [[ .. ]] + # :href : inside of an HREF [ .. ] + # :func : inside of a block <[ .. ]> or inline <- .. -> function + class RichTextScanner < TextScanner - # Create the RichTextScanner object and initialize all state variables. - def initialize(text) - # The token buffer is used to hold a returned token. Only one token can - # be returned at a time. - @tokenBuffer = nil - # A reference to the input text. - @text = text - # The reference text should not change during processing. So we can - # determine the length upfront. It's frequently used. - @textLength = text.length - # The number of current line. - @lineNo = 1 - # This is the current position withing @text. - @pos = 0 - # This flag is set to true whenever we are at the start of a text line. - @beginOfLine = true - # This is the position of the start of the currently processed line. - # It's only used for error reporting. - @lineStart = 0 - # This variable stores the mode that the parser is operating in. The - # following modes are supported: - # :wiki : accept supported MediaWiki subset plus TJ extensions - # :nowiki : ignore most markup except for the </nowiki> token - # :funcarg : parse name and parameters of an block or inline parser - # function. - @mode = :wiki - # Enable to trigger printout instead of exception. - @debug = false - end + def initialize(masterFile, messageHandler) + tokenPatterns = [ + # :bol mode rules + [ 'LINEBREAK', /\s*\n/, :bol, method('linebreak') ], + [ nil, /\s+/, :bol, method('inlineMode') ], - # This is a wrapper for nextToken only used for debugging. - #def nextToken - # tok = nextTokenI - # raise "Token Error:" unless tok && tok[0] && tok[1] - # puts "#{tok[0]}: #{tok[1]}" - # tok - #end + # :bop mode rules + [ 'PRE', / [^\n]+\n?/, :bop, method('pre') ], + [ nil, /\s*\n/, :bop, method('linebreak') ], - # Return the next token from the input text. - def nextToken - # If we have a returned token, this is returned first. - if @tokenBuffer - tok = @tokenBuffer - @tokenBuffer = nil - return tok - end + # :inline mode rules + [ 'SPACE', /[ \t\n]+/, :inline, method('space') ], - if @mode == :funcarg - return nextTokenFuncArg - elsif @mode == :href - return nextTokenHRef - elsif @mode == :ref - return nextTokenRef - end - if @beginOfLine && @mode == :wiki - if (res = nextTokenWikiBOL) - return res - end - end + # :bop and :bol mode rules + [ 'INLINEFUNCSTART', /<-/, [ :bop, :bol, :inline ], + method('functionStart') ], + [ 'BLOCKFUNCSTART', /<\[/, [ :bop, :bol ], method('functionStart') ], + [ 'TITLE*', /={2,5}/, [ :bop, :bol ], method('titleStart') ], + [ 'TITLE*END', /={2,5}/, :inline, method('titleEnd') ], + [ 'BULLET*', /\*{1,4} /, [ :bop, :bol ], method('bullet') ], + [ 'NUMBER*', /\#{1,4} /, [ :bop, :bol ], method('number') ], + [ 'HLINE', /----/, [ :bop, :bol ], method('inlineMode') ], - # Many inline control character sequences consit of multiple characters. - # In case of incomplete sequences, we roll back to the start character - # and set the ignoreInlineMarkup flag to simply treat them as normal - # text. - @ignoreInlineMarkup = false - loop do - if res = (@mode == :wiki ? nextTokenWikiInline : nextTokenNoWikiInline) - return res - end - end - end + # :bop, :bol and :inline mode rules + # The <nowiki> token puts the scanner into :nowiki mode. + [ nil, /<nowiki>/, [ :bop, :bol, :inline ], method('nowikiStart') ], + [ 'QUOTES', /'{2,5}/, [ :bop, :bol, :inline ], method('quotes') ], + [ 'REF', /\[\[/, [ :bop, :bol, :inline ], method('refStart') ], + [ 'HREF', /\[/, [ :bop, :bol, :inline], method('hrefStart') ], + [ 'WORD', /.[^ \n\t\[<']*/, [ :bop, :bol, :inline ], + method('inlineMode') ], - # Return the last issued token to the token buffer. - def returnToken(token) - unless @tokenBuffer.nil? - raise TjException.new, 'Token buffer overflow!' - end - @tokenBuffer = token - end + # :nowiki mode rules + [ nil, /<\/nowiki>/, :nowiki, method('nowikiEnd') ], + [ 'WORD', /(<(?!\/nowiki>)|[^ \t\n<])+/, :nowiki ], + [ 'SPACE', /[ \t]+/, :nowiki ], + [ 'LINEBREAK', /\s*\n/, :nowiki ], - # Report the current cursor position. - def sourceFileInfo - [ @lineNo, @pos ] - end + # :ref mode rules + [ 'REFEND', /\]\]/, :ref, method('refEnd') ], + [ 'WORD', /(<(?!-)|(\](?!\])|[^|<\]]))+/, :ref ], + [ 'QUERY', /<-\w+->/, :ref, method('query') ], + [ 'LITERAL', /./, :ref ], - # This function makes more sense for parsers that process actual files. As - # we don't have a file name, we just return 'input text'. - def fileName - 'input text' - end + # :href mode rules + [ 'HREFEND', /\]/, :href, method('hrefEnd') ], + [ 'WORD', /(<(?!-)|[^ \t\n\]<])+/, :href ], + [ 'QUERY', /<-\w+->/, :href, method('query') ], + [ 'SPACE', /[ \t\n]+/, :href ], - # The parser uses this function to report any errors during parsing. - def error(id, text, foo = nil, bar = nil) - if @debug - $stderr.puts "Line #{@lineNo}: #{text}\n" + - "#{@text[@lineStart, @pos - @lineStart]}" - else - raise RichTextException.new(id, @lineNo, text, - @text[@lineStart, @pos - @lineStart]) - end + # :func mode rules + [ 'INLINEFUNCEND', /->/ , :func, method('functionEnd') ], + [ 'BLOCKFUNCEND', /\]>/, :func, method('functionEnd') ], + [ 'ID', /[a-zA-Z_]\w*/, :func ], + [ 'STRING', /"(\\"|[^"])*"/, :func, method('dqString') ], + [ 'STRING', /'(\\'|[^'])*'/, :func, method('sqString') ], + [ nil, /[ \t\n]+/, :func ], + [ 'LITERAL', /./, :func ] + ] + super(masterFile, messageHandler, tokenPatterns, :bop) end - private + private - # Function arguments have the following formats: - # <[blockfunc par1="value1" par2='value2']> - # <-inlinefunc par1="value1" ... -> - def nextTokenFuncArg - token = [ '.', '<END>' ] - while (c = nextChar) - case c - when ' ', "\n", "\t" - if (tok = readBlanks(c)) - token = tok - break - end - when '=' - return [ '_=', '=' ] - when "'" - return readString(c) - when '"' - return readString(c) - when 'a'..'z', 'A'..'Z', '_' - return readId(c) - when ']' - if nextChar == '>' - @mode = :wiki - return [ 'BLOCKFUNCEND', ']>' ] - end - returnChar - when '-' - if nextChar == '>' - @mode = :wiki - return [ 'INLINEFUNCEND', '->' ] - end - returnChar - end + def space(type, match) + if match.index("\n") + # If the match contains a linebreak we switch to :bol mode. + self.mode = :bol + # And return an empty string. + match = '' end - token + [type, match ] end - def nextTokenRef - c = nextChar - return [ '.', '<END' ] if c.nil? - - return [ 'LITERAL', '|' ] if c == '|' - - if c == ']' && peek == ']' - nextChar - @mode = :wiki - return [ 'REFEND', ']]' ] - end - - token = c - while (c = nextChar) - break if c.nil? - if c == '|' || (c == ']' && peek == ']') - returnChar - break - end - token << c - end - [ 'WORD', token ] + def linebreak(type, match) + self.mode = :bop + [ type, match ] end - def nextTokenHRef - token = [ '.', '<END>' ] - while (c = nextChar) - if c.nil? - # We've reached the end of the text. - return token - elsif c == ' ' || c == "\t" || c == "\n" - # Sequences of tabs, spaces and newlines are treated as token - # boundaries, but otherwise they are ignored. - readSequence(" \n\t") - return [ 'SPACE', ' ' ] - elsif c == '<' && !@ignoreInlineMarkup - if nextChar == '-' && isIdStart(peek(1)) - token = readId('', 'QUERY') - unless nextChar == '-' && nextChar == '>' - error('unterminated_query', - "Inline query must be terminated with '->'") - end - return token - else - # It's not a query. - returnChar(2) - @ignoreInlineMarkup = true - next - end - elsif c == ']' - @mode = :wiki - return [ 'HREFEND', ']' ] - else - return nextTokenWord(c) - end - end - token + def inlineMode(type, match) + self.mode = :inline + [ type, match ] end - def nextTokenWikiBOL - # Some characters have only a special meaning at the start of the line. - # When the last token pushed the cursor into a new line, this flag is set - # to true. - - # Reset the flag again. - @beginOfLine = false - - # We already know that the last newline was a real linebreak. Further - # newlines can safely be ignored. - readSequence("\n") - - # All the lead characters of a token here also need to be registered - # with nextTokenNewline! - case (c = nextChar) - when '=' - # Headings start with 2 or more = and must be followed by a space. - level = readSequenceMax('=', 5) - if level == 1 - # 1 = does not mean anything. Push it back and process it as normal - # text further down. - returnChar - else - # Between the = characters and the title text must be exactly one - # space. - return [ "TITLE#{level - 1}", '=' * level ] if nextChar == ' ' - # If that's missing, The = are treated as normal text further down. - returnChar(level + 1) - end - when '-' - # Horizontal ruler. Must have exactly 4 -. - level = readSequenceMax('-', 4) - return [ "HLINE", '-' * 4 ] if level == 4 - returnChar(level) - when '*' - # Bullet lists start with one to three * characters. - level = readSequenceMax('*', 4) - # Between the * characters and the bullet text must be exactly one - # space. - return [ "BULLET#{level}", '*' * level ] if nextChar == ' ' - # If that's missing, The # are treated as normal text further down. - returnChar(level + 1) - when '#' - # Numbered list start with one to three # characters. - level = readSequenceMax('#', 4) - # Between the # characters and the bullet text must be exactly one - # space. - return [ "NUMBER#{level}", '#' * level ] if nextChar == ' ' - # If that's missing, The # are treated as normal text further down. - returnChar(level + 1) - when '<' - # This may be the start of a block generating function. - if nextChar == '[' - # Switch the parser to block function argument parsing mode. - @mode = :funcarg - return [ 'BLOCKFUNCSTART', '<[' ] - end - # Maybe not. - returnChar(2) - when ' ' - # Lines that start with a space are treated as verbatim text. - return [ "PRE", readCode ] if (c = peek) && c != "\n" - else - # If the character is not a known control character we push it back - # and treat it as normal text further down. - returnChar - end - - return nil + def titleStart(type, match) + self.mode = :inline + [ "TITLE#{match.length - 1}", match ] end - def nextTokenWikiInline - c = nextChar - if c.nil? - # We've reached the end of the text. - [ '.', '<END>' ] - elsif c == ' ' || c == "\t" - # Sequences of tabs or spaces are treated as token boundaries, but - # otherwise they are ignored. - readSequence(" \t") - [ 'SPACE', ' ' ] - elsif c == "'" && !@ignoreInlineMarkup - # Sequence of 2 ' means italic, 3 ' means bold, 4 ' means monospaced - # code, 5 ' means italic and bold. Anything else is just normal text. - level = readSequenceMax("'", 5) - if level == 2 - [ 'ITALIC', "'" * level ] - elsif level == 3 - [ 'BOLD', "'" * level ] - elsif level == 4 - [ 'CODE', "'" * level ] - elsif level == 5 - [ 'BOLDITALIC', "'" * level ] - else - # We have not found the right syntax. Treat the found characters as - # normal text. Push all ' back and start again but ignoring the ' - # code for once. - returnChar(level) - @ignoreInlineMarkup = true - nil - end - elsif c == '=' && !@ignoreInlineMarkup - level = readSequenceMax('=', 5) - if level > 1 - [ "TITLE#{level - 1}END", '=' * level ] - else - # We have not found the right syntax. Treat found characters as - # normal text. Push all = back and start again but ignoring the = - # code for once. - returnChar(level) - @ignoreInlineMarkup = true - nil - end - elsif c == '[' - level = readSequenceMax('[', 2) - if level == 1 - @mode = :href - [ 'HREF' , '[' ] - else - @mode = :ref - [ 'REF', '[[' ] - end - elsif c == ']' && peek == ']' - nextChar - [ 'REFEND', ']]' ] - elsif c == "\n" - nextTokenNewline - elsif c == '<' && !@ignoreInlineMarkup - nextTokenOpenAngle - else - nextTokenWord(c) - end + def titleEnd(type, match) + [ "TITLE#{match.length - 1}END", match ] end - def nextTokenNoWikiInline - c = nextChar - if c.nil? - # We've reached the end of the text. - [ '.', '<END>' ] - elsif c == ' ' || c == "\t" - # Sequences of tabs or spaces are treated as token boundaries, but - # otherwise they are ignored. - readSequence(" \t") - [ 'SPACE', ' ' ] - elsif c == "\n" - nextTokenNewline - elsif c == '<' && !@ignoreInlineMarkup - nextTokenOpenAngle - else - nextTokenWord(c) - end + def bullet(type, match) + self.mode = :inline + [ "BULLET#{match.length - 1}", match ] end - # We've just read a newline. Now we need to figure out whether this is a - # LINEBREAK or just a SPACE. This is determined by looking at the next - # character. - def nextTokenNewline - # Newlines are pretty important as they can terminate blocks and turn - # the next character into the start of a control sequence. - # Hard linebreaks consist of a newline followed by another newline or - # any of the begin-of-line control characters. - if (c = nextChar).nil? - # We hit the end of the text. - [ '.', '<END>' ] - elsif c == '<' && peekMatch('[') - # the '<' can be a start of a block (BLOCKFUNCSTART) or inline text - # (INLINEFUNCSTART). Only for the first case the linebreak is real. - returnChar if c != "\n" - # The next character may be a control character. - @beginOfLine = true - [ 'LINEBREAK', "\n" ] - elsif "\n*#=-".include?(c) - # These characters correspond to the first characters of a block - # element. When they are found at the begin of the line, the newline - # was really a line break. - returnChar if c != "\n" - # The next character may be a control character. - @beginOfLine = true - [ 'LINEBREAK', "\n" ] - else - # Single line breaks are treated as spaces. Return the char after - # the newline and start with this one again. - returnChar - [ 'SPACE', ' ' ] - end + def number(type, match) + self.mode = :inline + [ "NUMBER#{match.length - 1}", match ] end - def nextTokenOpenAngle - if peekMatch('nowiki>') - # Turn most wiki markup interpretation off. - @pos += 'nowiki>'.length - @mode = :nowiki - elsif peekMatch('/nowiki>') - # Turn most wiki markup interpretation on. - @pos += '/nowiki>'.length - @mode = :wiki - elsif peekMatch('-') && @mode == :wiki - nextChar - # Switch the parser to function argument parsing mode. - @mode = :funcarg - return [ 'INLINEFUNCSTART', '<-' ] - else - # We've not found a valid control sequence. Push back the character - # and make sure we treat it as a normal character. - @ignoreInlineMarkup = true - returnChar - end - nil + def quotes(type, match) + self.mode = :inline + types = [ nil, nil, 'ITALIC', 'BOLD' , 'CODE', 'BOLDITALIC' ] + [ types[match.length], match ] end - # _c_ does not match any start of a control sequence, so we read - # characters until we find the end of the word. - def nextTokenWord(c) - # Reset this flag again. - @ignoreInlineMarkup = false - str = '' - str << c - # Now we can collect characters of a word until we hit a whitespace. - while (c = nextChar) && !" \n\t".include?(c) - case @mode - when :wiki - # Or at least two ' characters in a row. - break if c == "'" && peek == "'" - # Or a ] or < - break if ']<'.include?(c) - when :href - # Look for - of the end mark -> end ']' - break if '-]<'.include?(c) - else - # Make sure we find the </nowiki> tag even within a word. - break if c == '<' - end - str << c - end - # Return the character that indicated the word end. - returnChar - [ 'WORD', str ] + def nowikiStart(type, match) + self.mode = :nowiki + [ type, match ] end - # Deliver the next character. Keep track of the cursor position. In case we - # reach the end, nil is returned. - def nextChar - if @pos >= @textLength - # Correct @pos so that returnChar works properly but mutliple reads of - # EOT are ignored. - @pos = @textLength + 1 - return nil - end - c = @text[@pos] - @pos += 1 - if c == ?\n - @lineNo += 1 - # Save the position of the line start for later use during error - # reporting. The line begins after the newline. - @lineStart = @pos - end - # Since Ruby 1.9 is returning Strings for String#[] we need to emulate - # this for Ruby 1.8. - '' << c + def nowikiEnd(type, match) + self.mode = :inline + [ type, match ] end - # Return one or more characters. _n_ is the number of characters to move - # back the cursor. - def returnChar(n = 1) - crossedNewline = false - if @pos <= @textLength && @pos >= n - # Check for newlines and update @lineNo accordingly. - n.times do |i| - if @text[@pos - i - 1] == ?\n - crossedNewline = true - @lineNo -= 1 - end - end - @pos -= n - end - - # If we have crossed a newline during rewind, we have to find the start of - # the current line again. - if crossedNewline - @lineStart = @pos - @lineStart -= 1 while @lineStart > 0 && @text[@lineStart - 1] != ?\n - end + def functionStart(type, match) + # When restoring :bol or :bop mode, we need to switch to :inline mode. + @funcLastMode = (@scannerMode == :bop || @scannerMode == :bol) ? + :inline : @scannerMode + self.mode = :func + [ type, match ] end - # Return a character further up the text without moving the cursor. - # _lookAhead_ is the number of characters to peek ahead. A value of 0 would - # return the last character provided by nextChar(). - def peek(lookAhead = 1) - return nil if (@pos + lookAhead - 1) >= @textLength - # Since Ruby 1.9 is returning Strings for String#[] we need to emulate - # this for Ruby 1.8. - '' << @text[@pos + lookAhead - 1] + def functionEnd(type, match) + self.mode = @funcLastMode + @funcLastMode = nil + [ type, match ] end - # Return true if the next characters match exactly the character sequence in - # word. - def peekMatch(word) - # Since Ruby 1.9 is returning Strings for String#[] we need to emulate - # this for Ruby 1.8. - ('' << @text[@pos, word.length]) == word + def pre(type, match) + [ type, match[1..-1] ] end - # Read a sequence of characters that are all contained in the _chars_ Array. - # If a character is found that is not in _chars_ the method returns the so - # far found sequence of chars as String. - def readSequence(chars) - sequence = '' - while (c = nextChar) && chars.index(c) - sequence << c - end - # Push back the character that did no longer match. - returnChar - sequence + def dqString(type, match) + # Remove first and last character and remove backslashes from quoted + # double quotes. + [ type, match[1..-2].gsub(/\\"/, '"') ] end - # Read a sequence of _c_ characters until a different character is found or - # _max_ count has been reached. - def readSequenceMax(c, max = 3) - i = 1 - while nextChar == c && i < max - i += 1 - end - # Return the non matching character. - returnChar - i + def sqString(type, match) + # Remove first and last character and remove backslashes from quoted + # single quotes. + [ type, match[1..-2].gsub(/\\'/, "'") ] end - # Read a block of pre-formatted text. All lines must start with a space - # character. - def readCode - tok = '' - loop do - # Read until the end of the line - while (c = nextChar) && c != "\n" - # Append a found characters. - tok << c - end - # Append the newline. - tok << c - # If the next line does not start with a space, we've reached the end of - # the code block. - if (c = nextChar) && c != ' ' - break - end - end - returnChar - @beginOfLine = true - tok + def query(type, match) + # Remove <- and ->. + [ type, match[2..-3] ] end - def readBlanks(c) - loop do - if c != ' ' && c != "\n" && c != "\t" - returnChar - return nil - end - c = nextChar - end + def hrefStart(type, match) + # When restoring :bol or :bop mode, we need to switch to :inline mode. + @hrefLastMode = (@scannerMode == :bop || @scannerMode == :bol) ? + :inline : @scannerMode + self.mode = :href + [ type, match ] end - def isIdStart(c) - (('a'..'z') === c || ('A'..'Z') === c || c == '_') + def hrefEnd(type, match) + self.mode = @hrefLastMode + @hrefLastMode = nil + [ type, match ] end - def readId(c, tokenType = 'ID') - token = "" - token << c - while (c = nextChar) && - (('a'..'z') === c || ('A'..'Z') === c || ('0'..'9') === c || - c == '_') - token << c - end - returnChar - return [ tokenType, token ] + def refStart(type, match) + self.mode = :ref + [ type, match ] end - def readString(terminator) - token = "" - while (c = nextChar) && c != terminator - if c == "\\" - # Terminators can be used as regular characters when prefixed by a \. - if (c = nextChar) && c != terminator - # \ followed by non-terminator. Just add both. - token << "\\" - end - end - token << c - end - - [ 'STRING', token ] + def refEnd(type, match) + self.mode = :inline + [ type, match ] end - end - # Exception raised by the RichTextScanner in case of processing errors. Its - # primary purpose is to carry the id, lineNo, error message and the currently - # parsed line information. - class RichTextException < RuntimeError - - attr_reader :lineNo, :id, :text, :line - - def initialize(id, lineNo, msgText, line) - @id = id - @lineNo = lineNo - @text = msgText - @line = line - end - end end -