# -*- encoding: utf-8 -*-
require "src_lexer/version"

# Marker-driven source lexer: splits a string into tokens using configurable
# keywords, symbols, string-literal markers and comment markers.
module SrcLexer
  # A lexeme plus the 1-based line and column at which it starts.
  class Token
    attr_reader :str, :line_no, :char_no

    # @param str [String] the token text
    # @param line_no [Integer] line on which the token starts
    # @param char_no [Integer] column at which the token starts
    def initialize(str, line_no, char_no)
      @str = str
      @line_no = line_no
      @char_no = char_no
    end

    # Two tokens are equal when text, line and column all match.
    # Objects that do not expose the three readers (e.g. nil) are simply
    # unequal instead of raising NoMethodError.
    #
    # @return [Boolean]
    def ==(other_object)
      readers = [:str, :line_no, :char_no]
      return false unless readers.all? { |reader| other_object.respond_to?(reader) }

      @str == other_object.str &&
        @line_no == other_object.line_no &&
        @char_no == other_object.char_no
    end
  end

  # Generic lexer. Call #analyze with the source text, then call #pop_token
  # repeatedly until it returns END_TOKEN.
  class Lexer
    # Returned by #pop_token once no tokens remain.
    END_TOKEN = [false, nil].freeze
    # Digits with an optional fractional part; anchored to the whole token.
    NUMBER_REGEX = /\A\d+\.?\d*\z/
    # A token wrapped in double quotes. Note that this is fixed to `"` and
    # does not consult the configured string_literal_marker.
    STRING_REGEX = /\A"(.*)"\z/m

    attr_reader :keywords, :symbols, :string_literal_marker,
                :line_comment_marker, :comment_markers, :tokens, :str

    # @param keywords [Array<String>, nil] reserved words (nil means none)
    # @param symbols [Array<String>, nil] operator/punctuation strings; the
    #   first entry that matches at a position wins, so longer symbols must be
    #   listed before their prefixes
    # @param string_literal_marker [Array<String>, nil] [opening, closing]
    # @param line_comment_marker [String, nil] start of a to-end-of-line comment
    # @param comment_markers [Array<String>, nil] [opening, closing] of a
    #   block comment
    def initialize(keywords, symbols, string_literal_marker, line_comment_marker, comment_markers)
      @keywords = keywords ? keywords.uniq.compact : []
      @symbols = symbols ? symbols.uniq.compact : []
      @string_literal_marker = string_literal_marker
      @line_comment_marker = line_comment_marker
      @comment_markers = comment_markers
    end

    # Tokenizes +str+, replacing any previously stored tokens.
    #
    # @param str [String] source text
    # @return [Lexer] self
    def analyze(str)
      @str = str
      tokenize
    end

    # Removes and returns the next token as a [type, Token] pair.
    # The type is :NUMBER, :STRING, the token text itself when it is a
    # keyword or symbol, or :IDENT otherwise.
    #
    # @return [Array] [type, Token], or END_TOKEN when nothing is left
    #   (including when #analyze has not been called yet)
    def pop_token
      raw = @tokens && @tokens.shift
      return END_TOKEN if raw.nil?

      text, line_no, char_no = raw
      type =
        case text
        when NUMBER_REGEX then :NUMBER
        when STRING_REGEX then :STRING
        else is_reserved?(text) ? text : :IDENT
        end
      [type, Token.new(text, line_no, char_no)]
    end

    # Mutable cursor: 0-based string index plus 1-based line and column.
    class PosInfo
      attr_accessor :index, :line_no, :char_no

      def initialize
        @index = 0
        @line_no = 1
        @char_no = 1
      end
    end

    # Walks a string character by character while tracking line/column, and
    # remembers an optional "mark" where the token being collected started.
    class StringIterator
      def initialize(str)
        @str = str
        @current_pos = PosInfo.new
        @marked_pos = PosInfo.new
        mark_clear
      end

      # Forgets the mark (index -1 means "no token in progress").
      def mark_clear
        @marked_pos.index = -1
        @marked_pos.line_no = 0
        @marked_pos.char_no = 0
      end

      # Records the current position as the start of a token.
      def mark_set
        @marked_pos = @current_pos.clone
      end

      # @return [Boolean] whether +target_string+ occurs at the current
      #   position; always false for an empty string
      def is(target_string)
        return false if target_string.empty?

        @str[@current_pos.index, target_string.length] == target_string
      end

      # @return [Boolean] whether any entry of +target_list+ occurs at the
      #   current position
      def is_in(target_list)
        target_list.any? { |target| is(target) }
      end

      # Steps over the current character, starting a new line after "\n".
      # Does nothing once the end of the string has been reached.
      def move_next
        return if @current_pos.index >= @str.length

        if @str[@current_pos.index] == "\n"
          @current_pos.line_no += 1
          @current_pos.char_no = 1
        else
          @current_pos.char_no += 1
        end
        @current_pos.index += 1
      end

      # Moves onto the last character before the next newline (or before the
      # end of the string when no newline follows).
      def move_to_the_end_of_the_line
        line_end = @str.index("\n", @current_pos.index) || @str.length
        distance = line_end - @current_pos.index - 1
        # No newline is crossed, so only the column changes.
        @current_pos.index += distance
        @current_pos.char_no += distance
      end

      # Moves onto the last character of the next occurrence of +target+ at or
      # after the current position. When +target+ does not occur, moves onto
      # the last character of the string so that an unterminated comment or
      # literal consumes the remaining input instead of raising.
      def move_to(target)
        found_at = @str.index(target, @current_pos.index)
        last_index = found_at ? found_at + target.length - 1 : @str.length - 1
        advance(last_index - @current_pos.index)
      end

      # @return [Boolean] whether the cursor is before +index+
      def <(index)
        @current_pos.index < index
      end

      # @return [MatchData, nil] truthy when the current character is whitespace
      def is_white_space
        /\s/.match(@str[@current_pos.index])
      end

      # @return [Boolean] whether a token is currently being collected
      def marked?
        @marked_pos.index != -1
      end

      # Returns [text, line_no, char_no] for the span from the mark up to (not
      # including) the current position, then clears the mark.
      def shift
        result = [
          @str[@marked_pos.index..(@current_pos.index - 1)],
          @marked_pos.line_no,
          @marked_pos.char_no
        ]
        mark_clear
        result
      end

      private

      # Steps forward +steps+ characters via #move_next so that line and
      # column stay consistent; a non-positive count is a no-op.
      def advance(steps)
        steps.times { move_next }
      end
    end

    private

    # Splits @str into [text, line_no, char_no] triples stored in @tokens.
    # Branch order matters: comment and string markers are tested before
    # symbols so that e.g. "//" is not read as two "/" symbols.
    #
    # @return [Lexer] self
    def tokenize
      @tokens = []
      iterator = StringIterator.new(@str)

      while iterator < @str.length
        if iterator.is_white_space
          flush_marked_token(iterator)
          iterator.move_next
        elsif @line_comment_marker && iterator.is(@line_comment_marker)
          flush_marked_token(iterator)
          iterator.move_to_the_end_of_the_line
          iterator.move_next
        elsif @comment_markers && iterator.is(@comment_markers[0])
          flush_marked_token(iterator)
          # Skip the opening marker first so the closing marker cannot overlap
          # it (otherwise "/*/" would count as a complete comment).
          @comment_markers[0].length.times { iterator.move_next }
          iterator.move_to(@comment_markers[1])
          iterator.move_next
        elsif @string_literal_marker && iterator.is(@string_literal_marker[0])
          flush_marked_token(iterator)
          iterator.mark_set
          @string_literal_marker[0].length.times { iterator.move_next }
          iterator.move_to(@string_literal_marker[1])
          iterator.move_next
          @tokens.push(iterator.shift)
        elsif (symbol = @symbols.find { |candidate| iterator.is(candidate) })
          flush_marked_token(iterator)
          iterator.mark_set
          symbol.length.times { iterator.move_next }
          @tokens.push(iterator.shift)
        else
          # Ordinary character: start a token here if none is in progress.
          iterator.mark_set unless iterator.marked?
          iterator.move_next
        end
      end

      flush_marked_token(iterator)
      self
    end

    # Emits the token currently being collected, if there is one.
    def flush_marked_token(iterator)
      @tokens.push(iterator.shift) if iterator.marked?
    end

    # @return [Boolean] whether +token+ is a configured keyword or symbol
    def is_reserved?(token)
      @keywords.include?(token) || @symbols.include?(token)
    end
  end

  # Lexer preconfigured with C# keywords, operators, string and comment markers.
  class CSharpLexer < Lexer
    def initialize
      super(
        # C# keywords
        %w[
          abstract as base bool break byte case catch char checked class const
          continue decimal default delegate do double else enum event explicit
          extern false finally fixed float for foreach goto if implicit in int
          interface internal is lock long namespace new null object operator
          out override params private protected public readonly ref return
          sbyte sealed short sizeof stackalloc static string struct switch this
          throw true try typeof uint ulong unchecked unsafe ushort using
          virtual void volatile while
        ] +
        # C# context keywords
        %w[
          add alias ascending async await descending dynamic from get global
          group into join let orderby partial remove select set value var
          where yield
        ],
        # symbols; longer operators come first so they win over their prefixes
        [
          '<<=', '>>=', '<<', '>>', '<=', '>=', '==', '!=', '&&', '||', '??',
          '+=', '-=', '*=', '/=', '%=', '&=', '|=', '^=', '=>',
          '*', '/', '%', '+', '-', '<', '>', '&', '^', '|', '?', ':', '=',
          '{', '}', '(', ')', '[', ']', ';', ','
        ],
        ['"', '"'],    # string literal markers
        '//',          # line comment marker
        ['/*', '*/']   # multi line comment markers
      )
    end
  end
end