lib/RichTextScanner.rb in taskjuggler-0.0.6 vs lib/RichTextScanner.rb in taskjuggler-0.0.7
- old
+ new
@@ -9,625 +9,204 @@
# it under the terms of version 2 of the GNU General Public License as
# published by the Free Software Foundation.
#
require 'UTF8String'
+require 'TextScanner'
class TaskJuggler
# The RichTextScanner is used by the RichTextParser to chop the input text
- # into digestable tokens. The parser and the scanner only communicate over
- # RichTextScanner#nextToken and RichTextScanner#returnToken. The scanner can
- # break the text into words and special tokens.
- class RichTextScanner
+ # into digestable tokens. It specializes the TextScanner class for RichText
+ # syntax. The scanner can operate in various modes. The current mode is
+ # context dependent. The following modes are supported:
+ #
+ # :bop : at the begining of a paragraph.
+ # :bol : at the begining of a line.
+ # :inline : in the middle of a line
+ # :nowiki : ignoring all MediaWiki special tokens
+ # :ref : inside of a REF [[ .. ]]
+ # :href : inside of an HREF [ .. ]
+ # :func : inside of a block <[ .. ]> or inline <- .. -> function
+ class RichTextScanner < TextScanner
- # Create the RichTextScanner object and initialize all state variables.
- def initialize(text)
- # The token buffer is used to hold a returned token. Only one token can
- # be returned at a time.
- @tokenBuffer = nil
- # A reference to the input text.
- @text = text
- # The reference text should not change during processing. So we can
- # determine the length upfront. It's frequently used.
- @textLength = text.length
- # The number of current line.
- @lineNo = 1
- # This is the current position withing @text.
- @pos = 0
- # This flag is set to true whenever we are at the start of a text line.
- @beginOfLine = true
- # This is the position of the start of the currently processed line.
- # It's only used for error reporting.
- @lineStart = 0
- # This variable stores the mode that the parser is operating in. The
- # following modes are supported:
- # :wiki : accept supported MediaWiki subset plus TJ extensions
- # :nowiki : ignore most markup except for the </nowiki> token
- # :funcarg : parse name and parameters of an block or inline parser
- # function.
- @mode = :wiki
- # Enable to trigger printout instead of exception.
- @debug = false
- end
+ def initialize(masterFile, messageHandler)
+ tokenPatterns = [
+ # :bol mode rules
+ [ 'LINEBREAK', /\s*\n/, :bol, method('linebreak') ],
+ [ nil, /\s+/, :bol, method('inlineMode') ],
- # This is a wrapper for nextToken only used for debugging.
- #def nextToken
- # tok = nextTokenI
- # raise "Token Error:" unless tok && tok[0] && tok[1]
- # puts "#{tok[0]}: #{tok[1]}"
- # tok
- #end
+ # :bop mode rules
+ [ 'PRE', / [^\n]+\n?/, :bop, method('pre') ],
+ [ nil, /\s*\n/, :bop, method('linebreak') ],
- # Return the next token from the input text.
- def nextToken
- # If we have a returned token, this is returned first.
- if @tokenBuffer
- tok = @tokenBuffer
- @tokenBuffer = nil
- return tok
- end
+ # :inline mode rules
+ [ 'SPACE', /[ \t\n]+/, :inline, method('space') ],
- if @mode == :funcarg
- return nextTokenFuncArg
- elsif @mode == :href
- return nextTokenHRef
- elsif @mode == :ref
- return nextTokenRef
- end
- if @beginOfLine && @mode == :wiki
- if (res = nextTokenWikiBOL)
- return res
- end
- end
+ # :bop and :bol mode rules
+ [ 'INLINEFUNCSTART', /<-/, [ :bop, :bol, :inline ],
+ method('functionStart') ],
+ [ 'BLOCKFUNCSTART', /<\[/, [ :bop, :bol ], method('functionStart') ],
+ [ 'TITLE*', /={2,5}/, [ :bop, :bol ], method('titleStart') ],
+ [ 'TITLE*END', /={2,5}/, :inline, method('titleEnd') ],
+ [ 'BULLET*', /\*{1,4} /, [ :bop, :bol ], method('bullet') ],
+ [ 'NUMBER*', /\#{1,4} /, [ :bop, :bol ], method('number') ],
+ [ 'HLINE', /----/, [ :bop, :bol ], method('inlineMode') ],
- # Many inline control character sequences consit of multiple characters.
- # In case of incomplete sequences, we roll back to the start character
- # and set the ignoreInlineMarkup flag to simply treat them as normal
- # text.
- @ignoreInlineMarkup = false
- loop do
- if res = (@mode == :wiki ? nextTokenWikiInline : nextTokenNoWikiInline)
- return res
- end
- end
- end
+ # :bop, :bol and :inline mode rules
+ # The <nowiki> token puts the scanner into :nowiki mode.
+ [ nil, /<nowiki>/, [ :bop, :bol, :inline ], method('nowikiStart') ],
+ [ 'QUOTES', /'{2,5}/, [ :bop, :bol, :inline ], method('quotes') ],
+ [ 'REF', /\[\[/, [ :bop, :bol, :inline ], method('refStart') ],
+ [ 'HREF', /\[/, [ :bop, :bol, :inline], method('hrefStart') ],
+ [ 'WORD', /.[^ \n\t\[<']*/, [ :bop, :bol, :inline ],
+ method('inlineMode') ],
- # Return the last issued token to the token buffer.
- def returnToken(token)
- unless @tokenBuffer.nil?
- raise TjException.new, 'Token buffer overflow!'
- end
- @tokenBuffer = token
- end
+ # :nowiki mode rules
+ [ nil, /<\/nowiki>/, :nowiki, method('nowikiEnd') ],
+ [ 'WORD', /(<(?!\/nowiki>)|[^ \t\n<])+/, :nowiki ],
+ [ 'SPACE', /[ \t]+/, :nowiki ],
+ [ 'LINEBREAK', /\s*\n/, :nowiki ],
- # Report the current cursor position.
- def sourceFileInfo
- [ @lineNo, @pos ]
- end
+ # :ref mode rules
+ [ 'REFEND', /\]\]/, :ref, method('refEnd') ],
+ [ 'WORD', /(<(?!-)|(\](?!\])|[^|<\]]))+/, :ref ],
+ [ 'QUERY', /<-\w+->/, :ref, method('query') ],
+ [ 'LITERAL', /./, :ref ],
- # This function makes more sense for parsers that process actual files. As
- # we don't have a file name, we just return 'input text'.
- def fileName
- 'input text'
- end
+ # :href mode rules
+ [ 'HREFEND', /\]/, :href, method('hrefEnd') ],
+ [ 'WORD', /(<(?!-)|[^ \t\n\]<])+/, :href ],
+ [ 'QUERY', /<-\w+->/, :href, method('query') ],
+ [ 'SPACE', /[ \t\n]+/, :href ],
- # The parser uses this function to report any errors during parsing.
- def error(id, text, foo = nil, bar = nil)
- if @debug
- $stderr.puts "Line #{@lineNo}: #{text}\n" +
- "#{@text[@lineStart, @pos - @lineStart]}"
- else
- raise RichTextException.new(id, @lineNo, text,
- @text[@lineStart, @pos - @lineStart])
- end
+ # :func mode rules
+ [ 'INLINEFUNCEND', /->/ , :func, method('functionEnd') ],
+ [ 'BLOCKFUNCEND', /\]>/, :func, method('functionEnd') ],
+ [ 'ID', /[a-zA-Z_]\w*/, :func ],
+ [ 'STRING', /"(\\"|[^"])*"/, :func, method('dqString') ],
+ [ 'STRING', /'(\\'|[^'])*'/, :func, method('sqString') ],
+ [ nil, /[ \t\n]+/, :func ],
+ [ 'LITERAL', /./, :func ]
+ ]
+ super(masterFile, messageHandler, tokenPatterns, :bop)
end
- private
+ private
- # Function arguments have the following formats:
- # <[blockfunc par1="value1" par2='value2']>
- # <-inlinefunc par1="value1" ... ->
- def nextTokenFuncArg
- token = [ '.', '<END>' ]
- while (c = nextChar)
- case c
- when ' ', "\n", "\t"
- if (tok = readBlanks(c))
- token = tok
- break
- end
- when '='
- return [ '_=', '=' ]
- when "'"
- return readString(c)
- when '"'
- return readString(c)
- when 'a'..'z', 'A'..'Z', '_'
- return readId(c)
- when ']'
- if nextChar == '>'
- @mode = :wiki
- return [ 'BLOCKFUNCEND', ']>' ]
- end
- returnChar
- when '-'
- if nextChar == '>'
- @mode = :wiki
- return [ 'INLINEFUNCEND', '->' ]
- end
- returnChar
- end
+ def space(type, match)
+ if match.index("\n")
+ # If the match contains a linebreak we switch to :bol mode.
+ self.mode = :bol
+ # And return an empty string.
+ match = ''
end
- token
+ [type, match ]
end
- def nextTokenRef
- c = nextChar
- return [ '.', '<END' ] if c.nil?
-
- return [ 'LITERAL', '|' ] if c == '|'
-
- if c == ']' && peek == ']'
- nextChar
- @mode = :wiki
- return [ 'REFEND', ']]' ]
- end
-
- token = c
- while (c = nextChar)
- break if c.nil?
- if c == '|' || (c == ']' && peek == ']')
- returnChar
- break
- end
- token << c
- end
- [ 'WORD', token ]
+ def linebreak(type, match)
+ self.mode = :bop
+ [ type, match ]
end
- def nextTokenHRef
- token = [ '.', '<END>' ]
- while (c = nextChar)
- if c.nil?
- # We've reached the end of the text.
- return token
- elsif c == ' ' || c == "\t" || c == "\n"
- # Sequences of tabs, spaces and newlines are treated as token
- # boundaries, but otherwise they are ignored.
- readSequence(" \n\t")
- return [ 'SPACE', ' ' ]
- elsif c == '<' && !@ignoreInlineMarkup
- if nextChar == '-' && isIdStart(peek(1))
- token = readId('', 'QUERY')
- unless nextChar == '-' && nextChar == '>'
- error('unterminated_query',
- "Inline query must be terminated with '->'")
- end
- return token
- else
- # It's not a query.
- returnChar(2)
- @ignoreInlineMarkup = true
- next
- end
- elsif c == ']'
- @mode = :wiki
- return [ 'HREFEND', ']' ]
- else
- return nextTokenWord(c)
- end
- end
- token
+ def inlineMode(type, match)
+ self.mode = :inline
+ [ type, match ]
end
- def nextTokenWikiBOL
- # Some characters have only a special meaning at the start of the line.
- # When the last token pushed the cursor into a new line, this flag is set
- # to true.
-
- # Reset the flag again.
- @beginOfLine = false
-
- # We already know that the last newline was a real linebreak. Further
- # newlines can safely be ignored.
- readSequence("\n")
-
- # All the lead characters of a token here also need to be registered
- # with nextTokenNewline!
- case (c = nextChar)
- when '='
- # Headings start with 2 or more = and must be followed by a space.
- level = readSequenceMax('=', 5)
- if level == 1
- # 1 = does not mean anything. Push it back and process it as normal
- # text further down.
- returnChar
- else
- # Between the = characters and the title text must be exactly one
- # space.
- return [ "TITLE#{level - 1}", '=' * level ] if nextChar == ' '
- # If that's missing, The = are treated as normal text further down.
- returnChar(level + 1)
- end
- when '-'
- # Horizontal ruler. Must have exactly 4 -.
- level = readSequenceMax('-', 4)
- return [ "HLINE", '-' * 4 ] if level == 4
- returnChar(level)
- when '*'
- # Bullet lists start with one to three * characters.
- level = readSequenceMax('*', 4)
- # Between the * characters and the bullet text must be exactly one
- # space.
- return [ "BULLET#{level}", '*' * level ] if nextChar == ' '
- # If that's missing, The # are treated as normal text further down.
- returnChar(level + 1)
- when '#'
- # Numbered list start with one to three # characters.
- level = readSequenceMax('#', 4)
- # Between the # characters and the bullet text must be exactly one
- # space.
- return [ "NUMBER#{level}", '#' * level ] if nextChar == ' '
- # If that's missing, The # are treated as normal text further down.
- returnChar(level + 1)
- when '<'
- # This may be the start of a block generating function.
- if nextChar == '['
- # Switch the parser to block function argument parsing mode.
- @mode = :funcarg
- return [ 'BLOCKFUNCSTART', '<[' ]
- end
- # Maybe not.
- returnChar(2)
- when ' '
- # Lines that start with a space are treated as verbatim text.
- return [ "PRE", readCode ] if (c = peek) && c != "\n"
- else
- # If the character is not a known control character we push it back
- # and treat it as normal text further down.
- returnChar
- end
-
- return nil
+ def titleStart(type, match)
+ self.mode = :inline
+ [ "TITLE#{match.length - 1}", match ]
end
- def nextTokenWikiInline
- c = nextChar
- if c.nil?
- # We've reached the end of the text.
- [ '.', '<END>' ]
- elsif c == ' ' || c == "\t"
- # Sequences of tabs or spaces are treated as token boundaries, but
- # otherwise they are ignored.
- readSequence(" \t")
- [ 'SPACE', ' ' ]
- elsif c == "'" && !@ignoreInlineMarkup
- # Sequence of 2 ' means italic, 3 ' means bold, 4 ' means monospaced
- # code, 5 ' means italic and bold. Anything else is just normal text.
- level = readSequenceMax("'", 5)
- if level == 2
- [ 'ITALIC', "'" * level ]
- elsif level == 3
- [ 'BOLD', "'" * level ]
- elsif level == 4
- [ 'CODE', "'" * level ]
- elsif level == 5
- [ 'BOLDITALIC', "'" * level ]
- else
- # We have not found the right syntax. Treat the found characters as
- # normal text. Push all ' back and start again but ignoring the '
- # code for once.
- returnChar(level)
- @ignoreInlineMarkup = true
- nil
- end
- elsif c == '=' && !@ignoreInlineMarkup
- level = readSequenceMax('=', 5)
- if level > 1
- [ "TITLE#{level - 1}END", '=' * level ]
- else
- # We have not found the right syntax. Treat found characters as
- # normal text. Push all = back and start again but ignoring the =
- # code for once.
- returnChar(level)
- @ignoreInlineMarkup = true
- nil
- end
- elsif c == '['
- level = readSequenceMax('[', 2)
- if level == 1
- @mode = :href
- [ 'HREF' , '[' ]
- else
- @mode = :ref
- [ 'REF', '[[' ]
- end
- elsif c == ']' && peek == ']'
- nextChar
- [ 'REFEND', ']]' ]
- elsif c == "\n"
- nextTokenNewline
- elsif c == '<' && !@ignoreInlineMarkup
- nextTokenOpenAngle
- else
- nextTokenWord(c)
- end
+ def titleEnd(type, match)
+ [ "TITLE#{match.length - 1}END", match ]
end
- def nextTokenNoWikiInline
- c = nextChar
- if c.nil?
- # We've reached the end of the text.
- [ '.', '<END>' ]
- elsif c == ' ' || c == "\t"
- # Sequences of tabs or spaces are treated as token boundaries, but
- # otherwise they are ignored.
- readSequence(" \t")
- [ 'SPACE', ' ' ]
- elsif c == "\n"
- nextTokenNewline
- elsif c == '<' && !@ignoreInlineMarkup
- nextTokenOpenAngle
- else
- nextTokenWord(c)
- end
+ def bullet(type, match)
+ self.mode = :inline
+ [ "BULLET#{match.length - 1}", match ]
end
- # We've just read a newline. Now we need to figure out whether this is a
- # LINEBREAK or just a SPACE. This is determined by looking at the next
- # character.
- def nextTokenNewline
- # Newlines are pretty important as they can terminate blocks and turn
- # the next character into the start of a control sequence.
- # Hard linebreaks consist of a newline followed by another newline or
- # any of the begin-of-line control characters.
- if (c = nextChar).nil?
- # We hit the end of the text.
- [ '.', '<END>' ]
- elsif c == '<' && peekMatch('[')
- # the '<' can be a start of a block (BLOCKFUNCSTART) or inline text
- # (INLINEFUNCSTART). Only for the first case the linebreak is real.
- returnChar if c != "\n"
- # The next character may be a control character.
- @beginOfLine = true
- [ 'LINEBREAK', "\n" ]
- elsif "\n*#=-".include?(c)
- # These characters correspond to the first characters of a block
- # element. When they are found at the begin of the line, the newline
- # was really a line break.
- returnChar if c != "\n"
- # The next character may be a control character.
- @beginOfLine = true
- [ 'LINEBREAK', "\n" ]
- else
- # Single line breaks are treated as spaces. Return the char after
- # the newline and start with this one again.
- returnChar
- [ 'SPACE', ' ' ]
- end
+ def number(type, match)
+ self.mode = :inline
+ [ "NUMBER#{match.length - 1}", match ]
end
- def nextTokenOpenAngle
- if peekMatch('nowiki>')
- # Turn most wiki markup interpretation off.
- @pos += 'nowiki>'.length
- @mode = :nowiki
- elsif peekMatch('/nowiki>')
- # Turn most wiki markup interpretation on.
- @pos += '/nowiki>'.length
- @mode = :wiki
- elsif peekMatch('-') && @mode == :wiki
- nextChar
- # Switch the parser to function argument parsing mode.
- @mode = :funcarg
- return [ 'INLINEFUNCSTART', '<-' ]
- else
- # We've not found a valid control sequence. Push back the character
- # and make sure we treat it as a normal character.
- @ignoreInlineMarkup = true
- returnChar
- end
- nil
+ def quotes(type, match)
+ self.mode = :inline
+ types = [ nil, nil, 'ITALIC', 'BOLD' , 'CODE', 'BOLDITALIC' ]
+ [ types[match.length], match ]
end
- # _c_ does not match any start of a control sequence, so we read
- # characters until we find the end of the word.
- def nextTokenWord(c)
- # Reset this flag again.
- @ignoreInlineMarkup = false
- str = ''
- str << c
- # Now we can collect characters of a word until we hit a whitespace.
- while (c = nextChar) && !" \n\t".include?(c)
- case @mode
- when :wiki
- # Or at least two ' characters in a row.
- break if c == "'" && peek == "'"
- # Or a ] or <
- break if ']<'.include?(c)
- when :href
- # Look for - of the end mark -> end ']'
- break if '-]<'.include?(c)
- else
- # Make sure we find the </nowiki> tag even within a word.
- break if c == '<'
- end
- str << c
- end
- # Return the character that indicated the word end.
- returnChar
- [ 'WORD', str ]
+ def nowikiStart(type, match)
+ self.mode = :nowiki
+ [ type, match ]
end
- # Deliver the next character. Keep track of the cursor position. In case we
- # reach the end, nil is returned.
- def nextChar
- if @pos >= @textLength
- # Correct @pos so that returnChar works properly but mutliple reads of
- # EOT are ignored.
- @pos = @textLength + 1
- return nil
- end
- c = @text[@pos]
- @pos += 1
- if c == ?\n
- @lineNo += 1
- # Save the position of the line start for later use during error
- # reporting. The line begins after the newline.
- @lineStart = @pos
- end
- # Since Ruby 1.9 is returning Strings for String#[] we need to emulate
- # this for Ruby 1.8.
- '' << c
+ def nowikiEnd(type, match)
+ self.mode = :inline
+ [ type, match ]
end
- # Return one or more characters. _n_ is the number of characters to move
- # back the cursor.
- def returnChar(n = 1)
- crossedNewline = false
- if @pos <= @textLength && @pos >= n
- # Check for newlines and update @lineNo accordingly.
- n.times do |i|
- if @text[@pos - i - 1] == ?\n
- crossedNewline = true
- @lineNo -= 1
- end
- end
- @pos -= n
- end
-
- # If we have crossed a newline during rewind, we have to find the start of
- # the current line again.
- if crossedNewline
- @lineStart = @pos
- @lineStart -= 1 while @lineStart > 0 && @text[@lineStart - 1] != ?\n
- end
+ def functionStart(type, match)
+ # When restoring :bol or :bop mode, we need to switch to :inline mode.
+ @funcLastMode = (@scannerMode == :bop || @scannerMode == :bol) ?
+ :inline : @scannerMode
+ self.mode = :func
+ [ type, match ]
end
- # Return a character further up the text without moving the cursor.
- # _lookAhead_ is the number of characters to peek ahead. A value of 0 would
- # return the last character provided by nextChar().
- def peek(lookAhead = 1)
- return nil if (@pos + lookAhead - 1) >= @textLength
- # Since Ruby 1.9 is returning Strings for String#[] we need to emulate
- # this for Ruby 1.8.
- '' << @text[@pos + lookAhead - 1]
+ def functionEnd(type, match)
+ self.mode = @funcLastMode
+ @funcLastMode = nil
+ [ type, match ]
end
- # Return true if the next characters match exactly the character sequence in
- # word.
- def peekMatch(word)
- # Since Ruby 1.9 is returning Strings for String#[] we need to emulate
- # this for Ruby 1.8.
- ('' << @text[@pos, word.length]) == word
+ def pre(type, match)
+ [ type, match[1..-1] ]
end
- # Read a sequence of characters that are all contained in the _chars_ Array.
- # If a character is found that is not in _chars_ the method returns the so
- # far found sequence of chars as String.
- def readSequence(chars)
- sequence = ''
- while (c = nextChar) && chars.index(c)
- sequence << c
- end
- # Push back the character that did no longer match.
- returnChar
- sequence
+ def dqString(type, match)
+ # Remove first and last character and remove backslashes from quoted
+ # double quotes.
+ [ type, match[1..-2].gsub(/\\"/, '"') ]
end
- # Read a sequence of _c_ characters until a different character is found or
- # _max_ count has been reached.
- def readSequenceMax(c, max = 3)
- i = 1
- while nextChar == c && i < max
- i += 1
- end
- # Return the non matching character.
- returnChar
- i
+ def sqString(type, match)
+ # Remove first and last character and remove backslashes from quoted
+ # single quotes.
+ [ type, match[1..-2].gsub(/\\'/, "'") ]
end
- # Read a block of pre-formatted text. All lines must start with a space
- # character.
- def readCode
- tok = ''
- loop do
- # Read until the end of the line
- while (c = nextChar) && c != "\n"
- # Append a found characters.
- tok << c
- end
- # Append the newline.
- tok << c
- # If the next line does not start with a space, we've reached the end of
- # the code block.
- if (c = nextChar) && c != ' '
- break
- end
- end
- returnChar
- @beginOfLine = true
- tok
+ def query(type, match)
+ # Remove <- and ->.
+ [ type, match[2..-3] ]
end
- def readBlanks(c)
- loop do
- if c != ' ' && c != "\n" && c != "\t"
- returnChar
- return nil
- end
- c = nextChar
- end
+ def hrefStart(type, match)
+ # When restoring :bol or :bop mode, we need to switch to :inline mode.
+ @hrefLastMode = (@scannerMode == :bop || @scannerMode == :bol) ?
+ :inline : @scannerMode
+ self.mode = :href
+ [ type, match ]
end
- def isIdStart(c)
- (('a'..'z') === c || ('A'..'Z') === c || c == '_')
+ def hrefEnd(type, match)
+ self.mode = @hrefLastMode
+ @hrefLastMode = nil
+ [ type, match ]
end
- def readId(c, tokenType = 'ID')
- token = ""
- token << c
- while (c = nextChar) &&
- (('a'..'z') === c || ('A'..'Z') === c || ('0'..'9') === c ||
- c == '_')
- token << c
- end
- returnChar
- return [ tokenType, token ]
+ def refStart(type, match)
+ self.mode = :ref
+ [ type, match ]
end
- def readString(terminator)
- token = ""
- while (c = nextChar) && c != terminator
- if c == "\\"
- # Terminators can be used as regular characters when prefixed by a \.
- if (c = nextChar) && c != terminator
- # \ followed by non-terminator. Just add both.
- token << "\\"
- end
- end
- token << c
- end
-
- [ 'STRING', token ]
+ def refEnd(type, match)
+ self.mode = :inline
+ [ type, match ]
end
- end
- # Exception raised by the RichTextScanner in case of processing errors. Its
- # primary purpose is to carry the id, lineNo, error message and the currently
- # parsed line information.
- class RichTextException < RuntimeError
-
- attr_reader :lineNo, :id, :text, :line
-
- def initialize(id, lineNo, msgText, line)
- @id = id
- @lineNo = lineNo
- @text = msgText
- @line = line
- end
-
end
end
-