lib/src_lexer.rb in src_lexer-1.0.2 vs lib/src_lexer.rb in src_lexer-1.0.3

- old
+ new

# Generic, configurable source-code tokenizer plus a ready-made C# configuration.
module SrcLexer
  # A lexeme together with the 1-based line and column at which it starts.
  class Token
    attr_reader :str, :line_no, :char_no

    # @param str [String] the text of the lexeme
    # @param line_no [Integer] 1-based line of the first character
    # @param char_no [Integer] 1-based column of the first character
    def initialize(str, line_no, char_no)
      @str = str
      @line_no = line_no
      @char_no = char_no
    end

    # Two tokens are equal when text, line and column all match.
    # Objects that do not expose the three readers (e.g. +nil+) compare as
    # not equal instead of raising NoMethodError.
    #
    # @param other_object [Object]
    # @return [Boolean]
    def ==(other_object)
      comparable = [:str, :line_no, :char_no].all? { |reader| other_object.respond_to?(reader) }
      return false unless comparable

      @str == other_object.str &&
        @line_no == other_object.line_no &&
        @char_no == other_object.char_no
    end
  end

  # Splits a source string into tokens using caller-supplied keywords,
  # symbols, string-literal markers and comment markers.
  class Lexer
    # Returned by #pop_token once every token has been consumed.
    END_TOKEN = [false, nil].freeze
    # \A / \z anchor the whole token; ^ would also match after an embedded newline.
    NUMBER_REGEX = /\A\d+\.?\d*\z/
    # NOTE(review): hard-coded to double quotes even though the string literal
    # markers are configurable - confirm whether other markers must map to :STRING.
    STRING_REGEX = /\A"(.*)"\z/m
    attr_reader :keywords, :symbols, :string_literal_marker, :line_comment_marker, :comment_markers, :tokens, :str

    # @param keywords [Array<String>, nil] reserved words
    # @param symbols [Array<String>, nil] operators/punctuation; the first entry
    #   that matches at the current position wins, so list longer symbols first
    # @param string_literal_marker [Array<String>, nil] [opening, closing] markers
    # @param line_comment_marker [String, nil] marker that starts a comment running to the end of the line
    # @param comment_markers [Array<String>, nil] [opening, closing] markers of a block comment
    def initialize(keywords, symbols, string_literal_marker, line_comment_marker, comment_markers)
      @keywords = (keywords ? keywords.uniq.compact : [])
      @symbols = (symbols ? symbols.uniq.compact : [])
      @string_literal_marker = string_literal_marker
      @line_comment_marker = line_comment_marker
      @comment_markers = comment_markers
      # Lets #pop_token answer END_TOKEN even when #analyze has not run yet.
      @tokens = []
    end

    # Tokenizes +str+; the tokens are then consumed with #pop_token.
    #
    # @param str [String] source text
    # @return [Lexer] self
    def analyze(str)
      @str = str
      tokenize
    end

    # Removes and returns the next token as a [type, Token] pair.
    # The type is :NUMBER, :STRING, the token text itself for keywords and
    # symbols, or :IDENT for everything else.
    #
    # @return [Array] [type, Token], or END_TOKEN when no tokens remain
    def pop_token
      entry = @tokens.shift
      return END_TOKEN if entry.nil?

      text, line_no, char_no = entry
      [token_type(text), Token.new(text, line_no, char_no)]
    end

    # Cursor state: 0-based index into the string plus 1-based line/column.
    class PosInfo
      attr_accessor :index, :line_no, :char_no

      def initialize
        @index = 0
        @line_no = 1
        @char_no = 1
      end
    end

    # Walks a string while tracking line/column and an optional "mark"
    # that remembers where the token being collected started.
    class StringIterator
      def initialize(str)
        @str = str
        @current_pos = PosInfo.new
        @marked_pos = PosInfo.new
        mark_clear
      end

      # Forgets the mark; index -1 is the "no mark" sentinel checked by #marked?.
      def mark_clear
        @marked_pos.index = -1
        @marked_pos.line_no = 0
        @marked_pos.char_no = 0
      end

      # Remembers the current position as the start of a token.
      def mark_set
        @marked_pos = @current_pos.clone
      end

      # @return [Boolean] whether +target_string+ occurs exactly at the current position
      def is(target_string)
        return false if target_string.length.zero?

        @str[@current_pos.index, target_string.length] == target_string
      end

      # @return [Boolean] whether any entry of +target_list+ occurs at the current position
      def is_in(target_list)
        target_list.any? { |target| is(target) }
      end

      # Advances one character; stepping over a newline starts a new line at column 1.
      def move_next
        if @str[@current_pos.index] == "\n"
          @current_pos.line_no += 1
          @current_pos.char_no = 1
        else
          @current_pos.char_no += 1
        end
        @current_pos.index += 1
      end

      # Moves onto the last character before the next newline
      # (or the last character of the string when there is no newline).
      def move_to_the_end_of_the_line
        line_end = @str.index("\n", @current_pos.index) || @str.length
        distance = line_end - 1 - @current_pos.index
        return unless distance > 0

        @current_pos.index += distance
        @current_pos.char_no += distance
      end

      # Moves onto the last character of the next occurrence of +target+,
      # searching from the current position. When +target+ does not occur
      # (unterminated comment or literal) the cursor stops on the last
      # character of the string instead of raising.
      def move_to(target)
        found_at = @str.index(target, @current_pos.index)
        last_index = found_at ? found_at + target.length - 1 : @str.length - 1
        # Stepping with move_next keeps line/column bookkeeping in one place.
        move_next while @current_pos.index < last_index
      end

      # @return [Boolean] whether the cursor index is below +index+
      def <(index)
        @current_pos.index < index
      end

      # @return [MatchData, nil] truthy when the current character is whitespace
      def is_white_space
        /\s/.match(@str[@current_pos.index])
      end

      # @return [Boolean] whether a token start is currently remembered
      def marked?
        @marked_pos.index != -1
      end

      # Returns [text, line_no, char_no] for the span from the mark up to
      # (not including) the current position, and clears the mark.
      def shift
        result = [@str[@marked_pos.index..(@current_pos.index - 1)], @marked_pos.line_no, @marked_pos.char_no]
        mark_clear
        result
      end
    end

    private

    # Scans @str and fills @tokens with [text, line_no, char_no] triples.
    #
    # @return [Lexer] self
    def tokenize
      @tokens = []
      iterator = StringIterator.new(@str)

      while iterator < @str.length
        if iterator.is_white_space
          flush_pending(iterator)
          iterator.move_next
        elsif @line_comment_marker && iterator.is(@line_comment_marker)
          flush_pending(iterator)
          iterator.move_to_the_end_of_the_line
          iterator.move_next
        elsif @comment_markers && iterator.is(@comment_markers[0])
          flush_pending(iterator)
          # Skip the opener first so its characters cannot be mistaken for
          # the closer (e.g. "/*/" must not count as a complete comment).
          @comment_markers[0].length.times { iterator.move_next }
          iterator.move_to(@comment_markers[1])
          iterator.move_next
        elsif @string_literal_marker && iterator.is(@string_literal_marker[0])
          flush_pending(iterator)
          iterator.mark_set
          @string_literal_marker[0].length.times { iterator.move_next }
          iterator.move_to(@string_literal_marker[1])
          iterator.move_next
          @tokens.push iterator.shift
        elsif (symbol = @symbols.find { |candidate| iterator.is(candidate) })
          flush_pending(iterator)
          iterator.mark_set
          symbol.length.times { iterator.move_next }
          @tokens.push iterator.shift
        else
          # Ordinary character: start a token if none is open, then advance.
          iterator.mark_set unless iterator.marked?
          iterator.move_next
        end
      end
      flush_pending(iterator)

      self
    end

    # Emits the token collected so far, if any.
    def flush_pending(iterator)
      @tokens.push(iterator.shift) if iterator.marked?
    end

    # Classifies token text for #pop_token.
    def token_type(text)
      case text
      when NUMBER_REGEX then :NUMBER
      when STRING_REGEX then :STRING
      else is_reserved?(text) ? text : :IDENT
      end
    end

    def is_reserved?(token)
      @keywords.include?(token) || @symbols.include?(token)
    end
  end

  # Lexer preconfigured with C# keywords, operators, string and comment markers.
  class CSharpLexer < Lexer
    def initialize
      super(
        [ # C# keywords
          'abstract', 'as', 'base', 'bool', 'break',
          'byte', 'case', 'catch', 'char', 'checked',
          'class', 'const', 'continue', 'decimal', 'default',
          'delegate', 'do', 'double', 'else', 'enum',
          'event', 'explicit', 'extern', 'false', 'finally',
          'fixed', 'float', 'for', 'foreach', 'goto',
          'if', 'implicit', 'in', 'int', 'interface',
          'internal', 'is', 'lock', 'long', 'namespace',
          'new', 'null', 'object', 'operator', 'out',
          'override', 'params', 'private', 'protected', 'public',
          'readonly', 'ref', 'return', 'sbyte', 'sealed',
          'short', 'sizeof', 'stackalloc', 'static', 'string',
          'struct', 'switch', 'this', 'throw', 'true',
          'try', 'typeof', 'uint', 'ulong', 'unchecked',
          'unsafe', 'ushort', 'using', 'virtual', 'void',
          'volatile', 'while',
          # C# context keywords
          'add', 'alias', 'ascending', 'async', 'await',
          'descending', 'dynamic', 'from', 'get', 'global',
          'group', 'into', 'join', 'let', 'orderby',
          'partial', 'remove', 'select', 'set', 'value',
          'var', 'where', 'yield'
        ],
        [ # symbols, longest first so that e.g. '<<=' wins over '<<' and '<'
          '<<=', '>>=', '<<', '>>', '<=',
          '>=', '==', '!=', '&&', '||',
          '??', '+=', '-=', '*=', '/=',
          '%=', '&=', '|=', '^=', '=>',
          '*', '/', '%', '+', '-',
          '<', '>', '&', '^', '|',
          '?', ':', '=', '{', '}',
          '(', ')', '[', ']', ';',
          ','
        ],
        ['"', '"'],   # string literal markers
        '//',         # line comment marker
        ['/*', '*/']) # multi line comment markers
    end
  end
end