src_lexer.rb in src_lexer-0.0.2

- old
+ new
@@ -16,155 +16,209 @@
     end
   end
 
   class Lexer
     END_TOKEN = [false, nil]
-    attr_reader :keywords, :symbols, :line_comment_marker, :comment_markers, :tokens, :str
+    NUMBER_REGEX = /^[\d]+[\.]?[\d]*\z/
+    STRING_REGEX = /^\"(.*)\"\z/m
+    attr_reader :keywords, :symbols, :string_literal_marker, :line_comment_marker, :comment_markers, :tokens, :str
 
-    def initialize(keywords, symbols, line_comment_marker, comment_marker)
-      @keywords = ((keywords.nil?) ? [] : keywords.uniq.compact)
-      @symbols = ((symbols.nil?) ? [] : symbols.uniq.compact)
-      @line_comment_marker = ((line_comment_marker.nil?) ? '' : line_comment_marker)
-      @comment_markers = ((comment_marker.nil?) ? ['', ''] : comment_marker)
+    def initialize(keywords, symbols, string_literal_marker, line_comment_marker, comment_markers)
+      @keywords = (keywords ? keywords.uniq.compact : [])
+      @symbols = (symbols ? symbols.uniq.compact : [])
+      @string_literal_marker = string_literal_marker
+      @line_comment_marker = line_comment_marker
+      @comment_markers = comment_markers
     end
 
     def analyze(str)
       @str = str
       tokenize
     end
 
     def pop_token
       token = @tokens.shift
-      if token.nil? then
-        return END_TOKEN
-      end
+      return END_TOKEN if token.nil?
       case token[0]
-      when /^[\d]+[\.]?[\d]*\z/
+      when NUMBER_REGEX
         [:NUMBER, Token.new(token[0], token[1], token[2])]
-      when /^\"(.*)\"\z/m
+      when STRING_REGEX
         [:STRING, Token.new(token[0], token[1], token[2])]
       else
-        id = is_reserved?(token[0]) ? token[0] : :IDENT
-        [id, Token.new(token[0], token[1], token[2])]
+        [is_reserved?(token[0]) ? token[0] : :IDENT, Token.new(token[0], token[1], token[2])]
       end
     end
 
     private
 
-    class StringIterator
-      attr_reader :index
+    class PosInfo
+      attr_accessor :index, :line_no, :char_no
+      
+      def initialize
+        @index = 0
+        @line_no = 1
+        @char_no = 1
+      end
+    end
 
+    class StringIterator
       def initialize(str)
         @str = str
-        @index = 0
-        @marked_pos = -1
+        @current_pos = PosInfo.new
+        @marked_pos = PosInfo.new
+        mark_clear()
       end
 
+      def mark_clear
+        @marked_pos.index = -1
+        @marked_pos.line_no = 0
+        @marked_pos.char_no = 0
+      end
+
       def mark_set
-        @marked_pos = @index
+        @marked_pos = @current_pos.clone
       end
 
       def is(target_string)
         return false if target_string.length.zero?
-        end_pos = (@index + target_string.length - 1)
-        @str[@index..end_pos] == target_string
+        end_pos = (@current_pos.index + target_string.length - 1)
+        @str[@current_pos.index..end_pos] == target_string
       end
 
       def is_in(target_list)
         target_list.find { |target| is(target) } != nil
       end
 
       def move_next
-        @index += 1
+        if /\n/.match @str[@current_pos.index]
+          @current_pos.line_no += 1
+          @current_pos.char_no = 1
+        else
+          @current_pos.char_no += 1
+        end
+        @current_pos.index += 1
       end
 
       def move_to_the_end_of_the_line
-        @index += (@str[@index..-1] =~ /$/) - 1
+        char_count_to_the_end_of_the_line = (@str[@current_pos.index..-1] =~ /$/) - 1
+        @current_pos.index += char_count_to_the_end_of_the_line
+        @current_pos.char_no += char_count_to_the_end_of_the_line
       end
 
       def move_to(target)
-        esceped_target = Regexp.escape(target)
-        @index += (@str[@index..-1] =~ /#{esceped_target}/m) + target.length - 1
+        char_count_to_target = (@str[@current_pos.index..-1] =~ /#{Regexp.escape(target)}/m) + target.length - 1
+        chopped_string = @str[@current_pos.index..@current_pos.index + char_count_to_target]
+        @current_pos.index += char_count_to_target
+        match = /.*\n(.*)$/m.match(chopped_string)
+        p match[1].length if match
+        if match
+          @current_pos.char_no = match[1].length
+        else
+          @current_pos.char_no += char_count_to_target
+        end
+        @current_pos.line_no += chopped_string.each_char.select{|char| /\n/.match char}.length
       end
 
-      def [](range)
-        @str[range]
+      def <(index)
+        @current_pos.index < index
       end
 
-      def <(pos)
-        @index < pos
-      end
-
-      def char
-        @str[@index]
-      end
-
       def is_white_space
-        /[\s]/.match(char)
+        /\s/.match(@str[@current_pos.index])
       end
 
-      def info(pos)
-        [0, 0] if pos == 0
-        line_no, char_no = 1, 0
-        @str[0..pos].each_char do |char|
-          if /\n/.match(char)
-            line_no += 1
-            char_no = 0
-          else
-            char_no += 1
-          end
-        end
-        [line_no, char_no]
-      end
-
       def marked?
-        @marked_pos != -1
+        @marked_pos.index != -1
       end
 
       def shift
-        result = @str[@marked_pos..(@index - 1)]
-        line_no_and_char_no = info(@marked_pos) 
-        @marked_pos = -1
-        return result, *line_no_and_char_no
+        result = [@str[@marked_pos.index..(@current_pos.index - 1)], @marked_pos.line_no, @marked_pos.char_no]
+        mark_clear()
+        return result
       end
     end
 
     def tokenize()
       @tokens = []
       iterator = StringIterator.new(@str)
 
       while iterator < @str.length do
         if iterator.is_white_space then
           @tokens.push iterator.shift if iterator.marked?
-        elsif iterator.is(@line_comment_marker) then
+          iterator.move_next
+        elsif @line_comment_marker && iterator.is(@line_comment_marker) then
           @tokens.push iterator.shift if iterator.marked?
           iterator.move_to_the_end_of_the_line
-        elsif iterator.is(@comment_markers[0]) then
+          iterator.move_next
+        elsif @comment_markers && iterator.is(@comment_markers[0]) then
           @tokens.push iterator.shift if iterator.marked?
           iterator.move_to(@comment_markers[1])
-        elsif iterator.is('"') then
+          iterator.move_next
+        elsif @string_literal_marker && iterator.is(@string_literal_marker[0]) then
           @tokens.push iterator.shift if iterator.marked?
           iterator.mark_set
           iterator.move_next
-          iterator.move_to('"')
+          iterator.move_to(@string_literal_marker[1])
           iterator.move_next
           @tokens.push iterator.shift
-          next
         elsif iterator.is_in(@symbols) then
           @tokens.push iterator.shift if iterator.marked?
-          symbol = @symbols.find { |symbol| iterator.is(symbol) }
-          @tokens.push [iterator[iterator.index..(iterator.index + symbol.length - 1)], *iterator.info(iterator.index)]
-          (symbol.length - 1).times { iterator.move_next }
+          iterator.mark_set
+          @symbols.find { |symbol| iterator.is(symbol) }.length.times { iterator.move_next }
+          @tokens.push iterator.shift
         elsif !iterator.marked? then
           iterator.mark_set
+        else
+          iterator.move_next
         end
-        iterator.move_next
       end
-
       @tokens.push iterator.shift if iterator.marked?
     end
 
     def is_reserved?(token)
       @keywords.include?(token) || @symbols.include?(token)
+    end
+  end
+
+  class CSharpLexer < Lexer
+    def initialize
+      super(
+        [ # C# keywords
+          'abstract',   'as',       'base',       'bool',      'break',
+          'byte',       'case',     'catch',      'char',      'checked',
+          'class',      'const',    'continue',   'decimal',   'default',
+          'delegate',   'do',       'double',     'else',      'enum',
+          'event',      'explicit', 'extern',     'false',     'finally',
+          'fixed',      'float',    'for',        'foreach',   'goto',
+          'if',         'implicit', 'in',         'int',       'interface',
+          'internal',   'is',       'lock',       'long',      'namespace',
+          'new',        'null',     'object',     'operator',  'out',
+          'override',   'params',   'private',    'protected', 'public',
+          'readonly',   'ref',      'return',     'sbyte',     'sealed',
+          'short',      'sizeof',   'stackalloc', 'static',    'string',
+          'struct',     'switch',   'this',       'throw',     'true',
+          'try',        'typeof',   'uint',       'ulong',     'unchecked',
+          'unsafe',     'ushort',   'using',      'virtual',   'void',
+          'volatile',   'while',
+          # C# context keywords
+          'add',        'alias',    'ascending',  'async',     'await',
+          'descending', 'dynamic',  'from',       'get',       'global',
+          'group',      'into',     'join',       'let',       'orderby',
+          'partial',    'remove',   'select',     'set',       'value',
+          'var',        'where',    'yield'
+        ],
+        [
+          '<<=', '>>=', '<<',  '>>',  '<=',
+          '>=',  '==',  '!=',  '&&',  '||',
+          '??',  '+=',  '-=',  '*=',  '/=',
+          '%=',  '&=',  '|=',  '^=',  '=>',
+          '*',   '/',   '%',   '+',   '-',
+          '<',   '>',   '&',   '^',   '|',
+          '?',   ':',   '=',   '{',   '}',
+          '(',   ')',   '[',   ']',   ';'
+        ],
+        ['"', '"'], # comment markers
+        '//', # line comment marker
+        ['/*', '*/']) # multi line comment markers
     end
   end
 end