parse_result.rb in prism-0.23.0

- old
+ new

@@ -7,22 +7,20 @@
   class Source
     # The source code that this source object represents.
     attr_reader :source
 
     # The line number where this source starts.
-    attr_accessor :start_line
+    attr_reader :start_line
 
     # The list of newline byte offsets in the source code.
     attr_reader :offsets
 
-    # Create a new source object with the given source code and newline byte
-    # offsets. If no newline byte offsets are given, they will be computed from
-    # the source code.
-    def initialize(source, start_line = 1, offsets = compute_offsets(source))
+    # Create a new source object with the given source code, start line, and offsets.
+    def initialize(source, start_line = 1, offsets = [])
       @source = source
-      @start_line = start_line
-      @offsets = offsets
+      @start_line = start_line # set after parsing is done
+      @offsets = offsets # set after parsing is done
     end
 
     # Perform a byteslice on the source code using the given byte offset and
     # byte length.
     def slice(byte_offset, length)
@@ -54,10 +52,27 @@
     # Return the column number in characters for the given byte offset.
     def character_column(byte_offset)
       character_offset(byte_offset) - character_offset(line_start(byte_offset))
     end
 
+    # Returns the offset from the start of the file for the given byte offset,
+    # counted in code units of the given encoding: 2-byte units for UTF-16LE and
+    # UTF-16BE, characters for everything else. Bytes that are invalid or have
+    # no mapping in the target encoding are replaced instead of raising.
+    # Tested with UTF-8, UTF-16, and UTF-32 only; any other encoding's notion of
+    # code units that differs from its character count is not captured here.
+    def code_units_offset(byte_offset, encoding)
+      byteslice = source.byteslice(0, byte_offset).encode(encoding, invalid: :replace, undef: :replace)
+      (encoding == Encoding::UTF_16LE || encoding == Encoding::UTF_16BE) ? (byteslice.bytesize / 2) : byteslice.length
+    end
+
+    # Returns the column for the given byte offset in code units of the given
+    # encoding: its code units offset minus that of line_start(byte_offset).
+    def code_units_column(byte_offset, encoding)
+      code_units_offset(byte_offset, encoding) - code_units_offset(line_start(byte_offset), encoding)
+    end
+
     private
 
     # Binary search through the offsets to find the line number for the given
     # byte offset.
     def find_line(byte_offset)
@@ -75,18 +90,10 @@
         end
       end
 
       left - 1
     end
-
-    # Find all of the newlines in the source code and return their byte offsets
-    # from the start of the string an array.
-    def compute_offsets(code)
-      offsets = [0]
-      code.b.scan("\n") { offsets << $~.end(0) }
-      offsets
-    end
   end
 
   # This represents a location in the source.
   class Location
     # A Source object that is used to determine more information from the given
@@ -136,10 +143,15 @@
     # starts.
     def start_character_offset
       source.character_offset(start_offset)
     end
 
+    # The start offset from the start of the file in code units of the given encoding.
+    def start_code_units_offset(encoding = Encoding::UTF_16LE)
+      source.code_units_offset(start_offset, encoding)
+    end
+
     # The byte offset from the beginning of the source where this location ends.
     def end_offset
       start_offset + length
     end
 
@@ -147,10 +159,15 @@
     # ends.
     def end_character_offset
       source.character_offset(end_offset)
     end
 
+    # The end offset from the start of the file in code units of the given encoding.
+    def end_code_units_offset(encoding = Encoding::UTF_16LE)
+      source.code_units_offset(end_offset, encoding)
+    end
+
     # The line number where this location starts.
     def start_line
       source.line(start_offset)
     end
 
@@ -175,19 +192,31 @@
     # the line.
     def start_character_column
       source.character_column(start_offset)
     end
 
+    # The column number, counted from the start of the line in code units of
+    # the given encoding (UTF-16LE by default), where this location starts.
+    def start_code_units_column(encoding = Encoding::UTF_16LE)
+      source.code_units_column(start_offset, encoding)
+    end
+
     # The column number in bytes where this location ends from the start of the
     # line.
     def end_column
       source.column(end_offset)
     end
 
     # The column number in characters where this location ends from the start of
     # the line.
     def end_character_column
       source.character_column(end_offset)
+    end
+
+    # The column number, counted from the start of the line in code units of
+    # the given encoding (UTF-16LE by default), where this location ends.
+    def end_code_units_column(encoding = Encoding::UTF_16LE)
+      source.code_units_column(end_offset, encoding)
     end
 
     # Implement the hash pattern matching interface for Location.
     def deconstruct_keys(keys)
       { start_offset: start_offset, end_offset: end_offset }