lib/prism/parse_result.rb in prism-1.1.0 vs lib/prism/parse_result.rb in prism-1.2.0

- old
+ new

@@ -10,10 +10,25 @@
     # specialized and more performant `ASCIISource` if no multibyte characters
     # are present in the source code.
     def self.for(source, start_line = 1, offsets = [])
       if source.ascii_only?
         ASCIISource.new(source, start_line, offsets)
+      elsif source.encoding == Encoding::BINARY
+        source.force_encoding(Encoding::UTF_8)
+
+        if source.valid_encoding?
+          new(source, start_line, offsets)
+        else
+          # This is an extremely niche use case where the file is marked as
+          # binary, contains multi-byte characters, and those characters are not
+          # valid UTF-8. In this case we'll mark it as binary and fall back to
+          # treating everything as a single-byte character. This _may_ cause
+          # problems when asking for code units, but it appears to be the
+          # cleanest solution at the moment.
+          source.force_encoding(Encoding::BINARY)
+          ASCIISource.new(source, start_line, offsets)
+        end
       else
         new(source, start_line, offsets)
       end
     end

@@ -87,20 +102,32 @@
     # counting in code units for the given encoding.
     #
     # This method is tested with UTF-8, UTF-16, and UTF-32. If there is the
     # concept of code units that differs from the number of characters in other
     # encodings, it is not captured here.
+    #
+    # We purposefully replace invalid and undefined characters with replacement
+    # characters in this conversion. This happens for two reasons. First, it's
+    # possible that the given byte offset will not occur on a character
+    # boundary. Second, it's possible that the source code will contain a
+    # character that has no equivalent in the given encoding.
     def code_units_offset(byte_offset, encoding)
-      byteslice = (source.byteslice(0, byte_offset) or raise).encode(encoding)
+      byteslice = (source.byteslice(0, byte_offset) or raise).encode(encoding, invalid: :replace, undef: :replace)

       if encoding == Encoding::UTF_16LE || encoding == Encoding::UTF_16BE
         byteslice.bytesize / 2
       else
         byteslice.length
       end
     end

+    # Generate a cache that targets a specific encoding for calculating code
+    # unit offsets.
+    def code_units_cache(encoding)
+      CodeUnitsCache.new(source, encoding)
+    end
+
     # Returns the column number in code units for the given encoding for the
     # given byte offset.
     def code_units_column(byte_offset, encoding)
       code_units_offset(byte_offset, encoding) - code_units_offset(line_start(byte_offset), encoding)
     end
@@ -126,14 +153,88 @@

       left - 1
     end
   end

+  # A cache that can be used to quickly compute code unit offsets from byte
+  # offsets. It purposefully provides only a single #[] method to access the
+  # cache in order to minimize surface area.
+  #
+  # Note that there are some known issues here that may or may not be addressed
+  # in the future:
+  #
+  # * The first is that there are issues when the cache computes values that are
+  #   not on character boundaries. This can result in subsequent computations
+  #   being off by one or more code units.
+  # * The second is that this cache is currently unbounded. In theory we could
+  #   introduce some kind of LRU cache to limit the number of entries, but this
+  #   has not yet been implemented.
+  #
+  class CodeUnitsCache
+    class UTF16Counter # :nodoc:
+      def initialize(source, encoding)
+        @source = source
+        @encoding = encoding
+      end
+
+      def count(byte_offset, byte_length)
+        @source.byteslice(byte_offset, byte_length).encode(@encoding, invalid: :replace, undef: :replace).bytesize / 2
+      end
+    end
+
+    class LengthCounter # :nodoc:
+      def initialize(source, encoding)
+        @source = source
+        @encoding = encoding
+      end
+
+      def count(byte_offset, byte_length)
+        @source.byteslice(byte_offset, byte_length).encode(@encoding, invalid: :replace, undef: :replace).length
+      end
+    end
+
+    private_constant :UTF16Counter, :LengthCounter
+
+    # Initialize a new cache with the given source and encoding.
+    def initialize(source, encoding)
+      @source = source
+      @counter =
+        if encoding == Encoding::UTF_16LE || encoding == Encoding::UTF_16BE
+          UTF16Counter.new(source, encoding)
+        else
+          LengthCounter.new(source, encoding)
+        end
+
+      @cache = {}
+      @offsets = []
+    end
+
+    # Retrieve the code units offset from the given byte offset.
+    def [](byte_offset)
+      @cache[byte_offset] ||=
+        if (index = @offsets.bsearch_index { |offset| offset > byte_offset }).nil?
+          @offsets << byte_offset
+          @counter.count(0, byte_offset)
+        elsif index == 0
+          @offsets.unshift(byte_offset)
+          @counter.count(0, byte_offset)
+        else
+          @offsets.insert(index, byte_offset)
+          offset = @offsets[index - 1]
+          @cache[offset] + @counter.count(offset, byte_offset - offset)
+        end
+    end
+  end
+
   # Specialized version of Prism::Source for source code that includes ASCII
   # characters only. This class is used to apply performance optimizations that
-  # cannot be applied to sources that include multibyte characters. Sources that
-  # include multibyte characters are represented by the Prism::Source class.
+  # cannot be applied to sources that include multibyte characters.
+  #
+  # In the extremely rare case that a source includes multi-byte characters but
+  # is marked as binary because of a magic encoding comment and it cannot be
+  # eagerly converted to UTF-8, this class will be used as well. This is because
+  # at that point we will treat everything as single-byte characters.
   class ASCIISource < Source
     # Return the character offset for the given byte offset.
     def character_offset(byte_offset)
       byte_offset
     end
@@ -151,10 +252,17 @@
     # encodings, it is not captured here.
     def code_units_offset(byte_offset, encoding)
       byte_offset
     end

+    # Returns a cache that is the identity function in order to maintain the
+    # same interface. We can do this because code units are always equivalent to
+    # byte offsets for ASCII-only sources.
+    def code_units_cache(encoding)
+      ->(byte_offset) { byte_offset }
+    end
+
     # Specialized version of `code_units_column` that does not depend on
     # `code_units_offset`, which is a more expensive operation. This is
     # essentially the same as `Prism::Source#column`.
     def code_units_column(byte_offset, encoding)
       byte_offset - line_start(byte_offset)
@@ -260,10 +368,16 @@
     # The offset from the start of the file in code units of the given encoding.
     def start_code_units_offset(encoding = Encoding::UTF_16LE)
       source.code_units_offset(start_offset, encoding)
     end

+    # The start offset from the start of the file in code units using the given
+    # cache to fetch or calculate the value.
+    def cached_start_code_units_offset(cache)
+      cache[start_offset]
+    end
+
     # The byte offset from the beginning of the source where this location ends.
     def end_offset
       start_offset + length
     end

@@ -276,10 +390,16 @@
     # The offset from the start of the file in code units of the given encoding.
     def end_code_units_offset(encoding = Encoding::UTF_16LE)
       source.code_units_offset(end_offset, encoding)
     end

+    # The end offset from the start of the file in code units using the given
+    # cache to fetch or calculate the value.
+    def cached_end_code_units_offset(cache)
+      cache[end_offset]
+    end
+
     # The line number where this location starts.
     def start_line
       source.line(start_offset)
     end

@@ -310,10 +430,16 @@
     # starts from the start of the line.
     def start_code_units_column(encoding = Encoding::UTF_16LE)
       source.code_units_column(start_offset, encoding)
     end

+    # The start column in code units using the given cache to fetch or calculate
+    # the value.
+    def cached_start_code_units_column(cache)
+      cache[start_offset] - cache[source.line_start(start_offset)]
+    end
+
     # The column number in bytes where this location ends from the start of the
     # line.
     def end_column
       source.column(end_offset)
     end
@@ -328,10 +454,16 @@
     # ends from the start of the line.
     def end_code_units_column(encoding = Encoding::UTF_16LE)
       source.code_units_column(end_offset, encoding)
     end

+    # The end column in code units using the given cache to fetch or calculate
+    # the value.
+    def cached_end_code_units_column(cache)
+      cache[end_offset] - cache[source.line_start(end_offset)]
+    end
+
     # Implement the hash pattern matching interface for Location.
     def deconstruct_keys(keys)
       { start_offset: start_offset, end_offset: end_offset }
     end

@@ -576,9 +708,14 @@
     # Returns true if there were errors during parsing and false if there were
     # not.
     def failure?
       !success?
+    end
+
+    # Create a code units cache for the given encoding.
+    def code_units_cache(encoding)
+      source.code_units_cache(encoding)
     end
   end

   # This is a result specific to the `parse` and `parse_file` methods.
   class ParseResult < Result