lib/prism/parse_result.rb in prism-1.1.0 vs lib/prism/parse_result.rb in prism-1.2.0
- old
+ new
@@ -10,10 +10,25 @@
# specialized and more performant `ASCIISource` if no multibyte characters
# are present in the source code.
#
# Note that when the source is tagged as binary and is not ASCII-only, its
# encoding is changed in place with force_encoding, so the string object that
# was passed in is mutated: it is relabeled as UTF-8 when its bytes are valid
# UTF-8 and reset to binary otherwise.
def self.for(source, start_line = 1, offsets = [])
if source.ascii_only?
ASCIISource.new(source, start_line, offsets)
+ elsif source.encoding == Encoding::BINARY
# Tentatively relabel the bytes as UTF-8 so that their validity can be checked.
+ source.force_encoding(Encoding::UTF_8)
+
+ if source.valid_encoding?
+ new(source, start_line, offsets)
+ else
+ # This is an extremely niche use case where the file is marked as
+ # binary, contains multi-byte characters, and those characters are not
+ # valid UTF-8. In this case we'll mark it as binary and fall back to
+ # treating everything as a single-byte character. This _may_ cause
+ # problems when asking for code units, but it appears to be the
+ # cleanest solution at the moment.
+ source.force_encoding(Encoding::BINARY)
+ ASCIISource.new(source, start_line, offsets)
+ end
else
new(source, start_line, offsets)
end
end
@@ -87,20 +102,32 @@
# counting in code units for the given encoding.
#
# This method is tested with UTF-8, UTF-16, and UTF-32. If there is the
# concept of code units that differs from the number of characters in other
# encodings, it is not captured here.
+ #
+ # We purposefully replace invalid and undefined characters with replacement
+ # characters in this conversion. This happens for two reasons. First, it's
+ # possible that the given byte offset will not occur on a character
+ # boundary. Second, it's possible that the source code will contain a
+ # character that has no equivalent in the given encoding.
def code_units_offset(byte_offset, encoding)
# Transcode the bytes before byte_offset; a nil byteslice raises here.
- byteslice = (source.byteslice(0, byte_offset) or raise).encode(encoding)
+ byteslice = (source.byteslice(0, byte_offset) or raise).encode(encoding, invalid: :replace, undef: :replace)
if encoding == Encoding::UTF_16LE || encoding == Encoding::UTF_16BE
# A UTF-16 code unit is two bytes wide, so halve the encoded byte size.
byteslice.bytesize / 2
else
# For any other encoding, count the characters of the transcoded string.
byteslice.length
end
end
+ # Generate a cache that targets a specific encoding for calculating code
+ # unit offsets. Each call builds a new CodeUnitsCache over this source.
+ def code_units_cache(encoding)
+ CodeUnitsCache.new(source, encoding)
+ end
+
# Returns the column number in code units for the given encoding for the
# given byte offset. It is computed as the code units offset of byte_offset
# minus the code units offset of line_start(byte_offset).
def code_units_column(byte_offset, encoding)
code_units_offset(byte_offset, encoding) - code_units_offset(line_start(byte_offset), encoding)
end
@@ -126,14 +153,88 @@
left - 1
end
end
+ # A cache that can be used to quickly compute code unit offsets from byte
+ # offsets. It purposefully provides only a single #[] method to access the
+ # cache in order to minimize surface area.
+ #
+ # Note that there are some known issues here that may or may not be addressed
+ # in the future:
+ #
+ # * The first is that there are issues when the cache computes values that are
+ # not on character boundaries. This can result in subsequent computations
+ # being off by one or more code units.
+ # * The second is that this cache is currently unbounded. In theory we could
+ # introduce some kind of LRU cache to limit the number of entries, but this
+ # has not yet been implemented.
+ #
+ class CodeUnitsCache
# Counts UTF-16 code units in a byte range of the source by transcoding the
# range and halving the resulting byte size.
+ class UTF16Counter # :nodoc:
+ def initialize(source, encoding)
+ @source = source
+ @encoding = encoding
+ end
+
+ def count(byte_offset, byte_length)
+ @source.byteslice(byte_offset, byte_length).encode(@encoding, invalid: :replace, undef: :replace).bytesize / 2
+ end
+ end
+
# Counts code units in a byte range of the source as the character length of
# the range after transcoding it to the target encoding.
+ class LengthCounter # :nodoc:
+ def initialize(source, encoding)
+ @source = source
+ @encoding = encoding
+ end
+
+ def count(byte_offset, byte_length)
+ @source.byteslice(byte_offset, byte_length).encode(@encoding, invalid: :replace, undef: :replace).length
+ end
+ end
+
+ private_constant :UTF16Counter, :LengthCounter
+
+ # Initialize a new cache with the given source and encoding.
+ def initialize(source, encoding)
+ @source = source
# UTF-16 encodings are counted by byte size; all others by character length.
+ @counter =
+ if encoding == Encoding::UTF_16LE || encoding == Encoding::UTF_16BE
+ UTF16Counter.new(source, encoding)
+ else
+ LengthCounter.new(source, encoding)
+ end
+
# @cache maps a byte offset to its computed code units offset. @offsets holds
# the byte offsets recorded so far, kept in ascending order for bsearch_index.
+ @cache = {}
+ @offsets = []
+ end
+
+ # Retrieve the code units offset from the given byte offset.
# On a cache miss, the first recorded offset greater than byte_offset is
# located. If there is none, or if it is the smallest recorded offset, the
# count is taken from the start of the source. Otherwise only the bytes after
# the nearest lower recorded offset are counted and added to that offset's
# cached value.
# NOTE(review): when no recorded offset is greater, the count restarts at 0
# even if lower offsets are already cached -- confirm this is intentional.
+ def [](byte_offset)
+ @cache[byte_offset] ||=
+ if (index = @offsets.bsearch_index { |offset| offset > byte_offset }).nil?
+ @offsets << byte_offset
+ @counter.count(0, byte_offset)
+ elsif index == 0
+ @offsets.unshift(byte_offset)
+ @counter.count(0, byte_offset)
+ else
+ @offsets.insert(index, byte_offset)
+ offset = @offsets[index - 1]
+ @cache[offset] + @counter.count(offset, byte_offset - offset)
+ end
+ end
+ end
+
# Specialized version of Prism::Source for source code that includes ASCII
# characters only. This class is used to apply performance optimizations that
- # cannot be applied to sources that include multibyte characters. Sources that
- # include multibyte characters are represented by the Prism::Source class.
+ # cannot be applied to sources that include multibyte characters.
+ #
+ # In the extremely rare case that a source includes multi-byte characters but
+ # is marked as binary because of a magic encoding comment and it cannot be
+ # eagerly converted to UTF-8, this class will be used as well. This is because
+ # at that point we will treat everything as single-byte characters.
class ASCIISource < Source
# Return the character offset for the given byte offset. The byte offset is
# returned unchanged, since this class treats every character as one byte.
def character_offset(byte_offset)
byte_offset
end
@@ -151,10 +252,17 @@
# encodings, it is not captured here.
#
# The byte offset is returned unchanged and the encoding argument is not used.
def code_units_offset(byte_offset, encoding)
byte_offset
end
+ # Returns a lambda that acts as an identity cache, so that callers can index
+ # it with cache[byte_offset] just like a CodeUnitsCache. This works because
+ # code units are always equivalent to byte offsets for ASCII-only sources.
+ def code_units_cache(encoding)
+ ->(byte_offset) { byte_offset }
+ end
+
# Specialized version of `code_units_column` that does not depend on
# `code_units_offset`, which is a more expensive operation. This is
# essentially the same as `Prism::Source#column`.
def code_units_column(byte_offset, encoding)
byte_offset - line_start(byte_offset)
@@ -260,10 +368,16 @@
# The offset from the start of the file in code units of the given encoding.
# This is computed by the source from this location's start offset, and the
# encoding defaults to UTF-16LE.
def start_code_units_offset(encoding = Encoding::UTF_16LE)
source.code_units_offset(start_offset, encoding)
end
+ # The start offset from the start of the file in code units, obtained by
+ # indexing the given cache with this location's start byte offset.
+ def cached_start_code_units_offset(cache)
+ cache[start_offset]
+ end
+
# The byte offset from the beginning of the source where this location ends,
# computed as the start offset plus the length.
def end_offset
start_offset + length
end
@@ -276,10 +390,16 @@
# The offset from the start of the file in code units of the given encoding.
# This is computed by the source from this location's end offset, and the
# encoding defaults to UTF-16LE.
def end_code_units_offset(encoding = Encoding::UTF_16LE)
source.code_units_offset(end_offset, encoding)
end
+ # The end offset from the start of the file in code units, obtained by
+ # indexing the given cache with this location's end byte offset.
+ def cached_end_code_units_offset(cache)
+ cache[end_offset]
+ end
+
# The line number where this location starts, as reported by the source for
# this location's start offset.
def start_line
source.line(start_offset)
end
@@ -310,10 +430,16 @@
# starts from the start of the line.
#
# The encoding defaults to UTF-16LE and the computation is delegated to the
# source using this location's start offset.
def start_code_units_column(encoding = Encoding::UTF_16LE)
source.code_units_column(start_offset, encoding)
end
+ # The start column in code units: the cached value for the start offset minus
+ # the cached value for source.line_start(start_offset).
+ def cached_start_code_units_column(cache)
+ cache[start_offset] - cache[source.line_start(start_offset)]
+ end
+
# The column number in bytes where this location ends from the start of the
# line, as reported by the source for this location's end offset.
def end_column
source.column(end_offset)
end
@@ -328,10 +454,16 @@
# ends from the start of the line.
#
# The encoding defaults to UTF-16LE and the computation is delegated to the
# source using this location's end offset.
def end_code_units_column(encoding = Encoding::UTF_16LE)
source.code_units_column(end_offset, encoding)
end
+ # The end column in code units: the cached value for the end offset minus
+ # the cached value for source.line_start(end_offset).
+ def cached_end_code_units_column(cache)
+ cache[end_offset] - cache[source.line_start(end_offset)]
+ end
+
# Implement the hash pattern matching interface for Location. The returned
# hash always contains start_offset and end_offset; the keys argument is not
# consulted.
def deconstruct_keys(keys)
{ start_offset: start_offset, end_offset: end_offset }
end
@@ -576,9 +708,14 @@
# Returns true if there were errors during parsing and false if there were
# not. This is defined as the negation of success?.
def failure?
!success?
+ end
+
+ # Create a code units cache for the given encoding by delegating to the source.
+ def code_units_cache(encoding)
+ source.code_units_cache(encoding)
end
end
# This is a result specific to the `parse` and `parse_file` methods.
class ParseResult < Result