lib/src_lexer.rb in src_lexer-0.0.1 vs lib/src_lexer.rb in src_lexer-0.0.2
- old
+ new
@@ -16,155 +16,209 @@
end
end
# Splits source text into tokens using configurable keywords, symbols,
# string-literal delimiters and comment markers.
#
# Each raw token is a triple +[text, line_no, char_no]+ (both positions are
# 1-based and refer to the first character of the token). #pop_token turns
# the triples into +[type, Token]+ pairs.
class Lexer
  # Sentinel pair returned by #pop_token once the token queue is empty.
  END_TOKEN = [false, nil].freeze
  # Digits with at most one decimal point ("12", "1.5", "3."). Anchored with
  # \A so that only the very start of the token can begin the match.
  NUMBER_REGEX = /\A\d+\.?\d*\z/
  # Double-quoted string literal; the fallback shape used when no usable
  # string_literal_marker pair was configured.
  STRING_REGEX = /\A"(.*)"\z/m

  attr_reader :keywords, :symbols, :string_literal_marker, :line_comment_marker, :comment_markers, :tokens, :str

  # @param keywords [Array<String>, nil] reserved words (nil means none)
  # @param symbols [Array<String>, nil] operator/punctuation strings; the
  #   first entry that matches at a position wins, so list longer symbols first
  # @param string_literal_marker [Array<String>, nil] +[opening, closing]+
  #   delimiters of a string literal, or nil to disable string lexing
  # @param line_comment_marker [String, nil] start of a comment that runs to
  #   the end of the line, or nil to disable
  # @param comment_markers [Array<String>, nil] +[opening, closing]+ markers
  #   of a block comment, or nil to disable
  def initialize(keywords, symbols, string_literal_marker, line_comment_marker, comment_markers)
    @keywords = (keywords ? keywords.uniq.compact : [])
    @symbols = (symbols ? symbols.uniq.compact : [])
    @string_literal_marker = string_literal_marker
    @line_comment_marker = line_comment_marker
    @comment_markers = comment_markers
    # Start with an empty queue so #pop_token is safe before #analyze.
    @tokens = []
  end

  # Tokenizes +str+ and stores the result in #tokens.
  #
  # @param str [String] source text
  # @return [Array<Array>] the token triples, also available via #tokens
  def analyze(str)
    @str = str
    tokenize
  end

  # Removes the next token from the queue and classifies it.
  #
  # @return [Array] +[:NUMBER, Token]+, +[:STRING, Token]+, +[text, Token]+
  #   for a keyword or symbol, +[:IDENT, Token]+ for anything else, or
  #   END_TOKEN when no tokens are left
  def pop_token
    token = @tokens.shift
    return END_TOKEN if token.nil?

    text, line_no, char_no = token
    type =
      case text
      when NUMBER_REGEX then :NUMBER
      when string_regex then :STRING
      else is_reserved?(text) ? text : :IDENT
      end
    [type, Token.new(text, line_no, char_no)]
  end

  # Mutable cursor position: 0-based index into the text plus the 1-based
  # line and column of that index.
  class PosInfo
    attr_accessor :index, :line_no, :char_no

    def initialize
      @index = 0
      @line_no = 1
      @char_no = 1
    end
  end

  # Cursor over the source text that tracks line/column and remembers where
  # the token currently being collected started (the "mark").
  class StringIterator
    def initialize(str)
      @str = str
      @current_pos = PosInfo.new
      @marked_pos = PosInfo.new
      mark_clear
    end

    # Forgets the mark; index -1 is the "no token in progress" sentinel.
    def mark_clear
      @marked_pos.index = -1
      @marked_pos.line_no = 0
      @marked_pos.char_no = 0
    end

    # Remembers the current position as the start of a token.
    def mark_set
      @marked_pos = @current_pos.clone
    end

    # True when the text at the cursor starts with +target_string+.
    # A nil or empty target never matches.
    def is(target_string)
      return false if target_string.nil? || target_string.empty?

      @str[@current_pos.index, target_string.length] == target_string
    end

    # True when any entry of +target_list+ matches at the cursor.
    def is_in(target_list)
      target_list.any? { |target| is(target) }
    end

    # Advances one character, keeping line and column in step.
    def move_next
      if @str[@current_pos.index] == "\n"
        @current_pos.line_no += 1
        @current_pos.char_no = 1
      else
        @current_pos.char_no += 1
      end
      @current_pos.index += 1
    end

    # Moves onto the last character before the next newline (or the last
    # character of the text when no newline follows).
    def move_to_the_end_of_the_line
      rest = @str[@current_pos.index..-1].to_s
      steps = (rest.index("\n") || rest.length) - 1
      steps.times { move_next }
    end

    # Moves onto the last character of the next occurrence of +target+.
    # When +target+ does not occur again (unterminated comment or string)
    # the cursor stops on the last character of the text instead of raising.
    def move_to(target)
      rest = @str[@current_pos.index..-1].to_s
      found_at = rest.index(target)
      steps = found_at ? found_at + target.length - 1 : rest.length - 1
      # Stepping with move_next keeps line/column bookkeeping in one place.
      steps.times { move_next }
    end

    # True while the cursor index is below +index+.
    def <(index)
      @current_pos.index < index
    end

    # Truthy when the character under the cursor is whitespace.
    def is_white_space
      /\s/.match(@str[@current_pos.index])
    end

    # True when a token is being collected.
    def marked?
      @marked_pos.index != -1
    end

    # Returns +[text, line_no, char_no]+ for the span from the mark up to
    # (not including) the cursor, and clears the mark.
    def shift
      result = [@str[@marked_pos.index..(@current_pos.index - 1)], @marked_pos.line_no, @marked_pos.char_no]
      mark_clear
      result
    end
  end

  private

  # Walks the text once and fills @tokens with +[text, line_no, char_no]+
  # triples. Whitespace and comments end the token in progress; string
  # literals and symbols become tokens of their own.
  def tokenize
    @tokens = []
    iterator = StringIterator.new(@str)
    while iterator < @str.length
      if iterator.is_white_space
        flush_pending(iterator)
        iterator.move_next
      elsif @line_comment_marker && iterator.is(@line_comment_marker)
        flush_pending(iterator)
        iterator.move_to_the_end_of_the_line
        iterator.move_next
      elsif @comment_markers && iterator.is(@comment_markers[0])
        flush_pending(iterator)
        iterator.move_to(@comment_markers[1])
        iterator.move_next
      elsif @string_literal_marker && iterator.is(@string_literal_marker[0])
        flush_pending(iterator)
        iterator.mark_set
        iterator.move_next
        iterator.move_to(@string_literal_marker[1])
        iterator.move_next
        @tokens.push(iterator.shift)
      elsif (symbol = @symbols.find { |candidate| iterator.is(candidate) })
        flush_pending(iterator)
        iterator.mark_set
        symbol.length.times { iterator.move_next }
        @tokens.push(iterator.shift)
      elsif !iterator.marked?
        # First character of a new word: mark it; the cursor advances on the
        # next pass through the loop.
        iterator.mark_set
      else
        iterator.move_next
      end
    end
    flush_pending(iterator)
    @tokens
  end

  # Emits the token in progress, if there is one.
  def flush_pending(iterator)
    @tokens.push(iterator.shift) if iterator.marked?
  end

  # Regexp recognising a complete string literal for the configured
  # delimiters. Falls back to STRING_REGEX when no usable pair is configured.
  def string_regex
    @string_regex ||= build_string_regex(@string_literal_marker)
  end

  def build_string_regex(marker)
    opener, closer = marker && [marker[0], marker[1]]
    return STRING_REGEX if opener.to_s.empty? || closer.to_s.empty?

    /\A#{Regexp.escape(opener)}(.*)#{Regexp.escape(closer)}\z/m
  end

  # True when +token+ is one of the configured keywords or symbols.
  def is_reserved?(token)
    @keywords.include?(token) || @symbols.include?(token)
  end
end
+
# Lexer preconfigured with the C# keyword list, C# operators/punctuation,
# double-quoted string literals, and "//" and "/* */" comments.
class CSharpLexer < Lexer
  # C# keywords followed by the contextual keywords.
  KEYWORDS = %w[
    abstract as base bool break
    byte case catch char checked
    class const continue decimal default
    delegate do double else enum
    event explicit extern false finally
    fixed float for foreach goto
    if implicit in int interface
    internal is lock long namespace
    new null object operator out
    override params private protected public
    readonly ref return sbyte sealed
    short sizeof stackalloc static string
    struct switch this throw true
    try typeof uint ulong unchecked
    unsafe ushort using virtual void
    volatile while
    add alias ascending async await
    descending dynamic from get global
    group into join let orderby
    partial remove select set value
    var where yield
  ].freeze

  # Operators and punctuation. Longer symbols come before their prefixes
  # ('<<=' before '<<' before '<') because this is the list handed to Lexer
  # for symbol matching and the order is kept as in the original argument.
  SYMBOLS = [
    '<<=', '>>=', '<<', '>>', '<=',
    '>=', '==', '!=', '&&', '||',
    '??', '+=', '-=', '*=', '/=',
    '%=', '&=', '|=', '^=', '=>',
    '*', '/', '%', '+', '-',
    '<', '>', '&', '^', '|',
    '?', ':', '=', '{', '}',
    '(', ')', '[', ']', ';'
  ].freeze

  def initialize
    super(
      KEYWORDS,
      SYMBOLS,
      ['"', '"'],   # string literal markers
      '//',         # line comment marker
      ['/*', '*/']  # multi line comment markers
    )
  end
end
end