git_ls.rb in git_ls-0.5.0

- old
+ new

@@ -1,195 +1,197 @@
 # frozen_string_literal: true
 
 # Usage:
 #   GitLS.files -> Array of strings as files.
 #   This will be identical output to git ls-files
+require 'stringio'
+
 module GitLS # rubocop:disable Metrics/ModuleLength
   class Error < StandardError; end
 
   class << self
-    def files(path = ::Dir.pwd)
+    def files(path = nil)
+      path = path ? ::File.join(path, '.git/index') : '.git/index'
+
       read(path, false)
     end
 
-    def headers(path = ::Dir.pwd)
-      read(path, true)
-    end
-
     private
 
-    def read(path, return_headers_only) # rubocop:disable Metrics/MethodLength, Metrics/AbcSize
-      path = ::File.join(path, '.git/index') if ::File.directory?(path)
-      file = ::File.new(path)
+    def read(path, _return_headers_only) # rubocop:disable Metrics/MethodLength, Metrics/AbcSize
+      begin
+        # reading the whole file into memory is faster than lots of ::File#read
+        # the biggest it's going to be is 10s of megabytes, well within ram.
+        file = ::StringIO.new(::File.read(path, mode: 'rb'))
+      rescue ::Errno::ENOENT => e
+        raise ::GitLS::Error, "Not a git directory: #{e.message}"
+      end
+
       buf = ::String.new
       # 4-byte signature:
       # The signature is { 'D', 'I', 'R', 'C' } (stands for "dircache")
       # 4-byte version number:
       # The current supported versions are 2, 3 and 4.
       # 32-bit number of index entries.
-      sig, git_index_version, length = file.read(12, buf).unpack('a4NN')
-      raise ::GitLS::Error, ".git/index file not found at #{path}" unless sig == 'DIRC'
+      sig = file.read(4, buf)
+      raise ::GitLS::Error, ".git/index file not found at '#{path}'" unless sig == 'DIRC'
 
-      return { git_index_version: git_index_version, length: length } if return_headers_only
+      git_index_version = file.read(4, buf).unpack1('N')
+      entries = file.read(4, buf).unpack1('N')
 
-      files = ::Array.new(length)
-      case git_index_version
+      files = ::Array.new(entries)
+      files = case git_index_version
       when 2 then files_2(files, file)
       when 3 then files_3(files, file)
       when 4 then files_4(files, file)
-      else raise ::GitLS::Error, 'Unrecognized git index version'
+      else raise ::GitLS::Error, "Unrecognized git index version '#{git_index_version}'"
       end
 
-      extensions(files, file, buf)
-      files
-    rescue ::Errno::ENOENT => e
-      raise ::GitLS::Error, "Not a git directory: #{e.message}"
-    ensure
-      # :nocov:
-      # coverage tracking for branches in ensure blocks is weird
-      file&.close
-      # :nocov:
-      files
+      read_extensions(files, file, path, buf)
     end
 
-    def extensions(files, file, buf)
-      case file.read(4, buf)
-      when 'link' then link_extension(files, file, buf)
-      when /[A-Z]{4}/ then ignored_extension(files, file, buf)
+    def read_extensions(files, file, path, buf) # rubocop:disable Metrics/MethodLength
+      extension = file.read(4, buf)
+      if extension == 'link'
+        read_link_extension(files, file, path, buf)
+      elsif extension.match?(/\A[A-Z]{4}\z/)
+        size = file.read(4, buf).unpack1('N')
+        file.seek(size, 1)
+        read_extensions(files, file, path, buf)
       else
-        return if (file.pos += 16) && file.eof?
+        return files if file.seek(16, 1) && file.eof?
 
-        raise ::GitLS::Error, "Unrecognized .git/index extension #{buf.inspect}"
+        raise ::GitLS::Error, "Unrecognized .git/index extension #{extension.inspect}"
       end
     end
 
-    def ignored_extension(files, file, buf)
-      size = file.read(4, buf).unpack1('N')
-      file.pos += size
-      extensions(files, file, buf)
-    end
+    def read_link_extension(files, file, path, buf) # rubocop:disable Metrics/MethodLength
+      file.seek(4, 1) # skip size
 
-    def link_extension(files, file, buf) # rubocop:disable Metrics/MethodLength, Metrics/AbcSize
-      file.pos += 4 # size = file.read(4, buf).unpack1('N')
-
       sha = file.read(20, buf)
 
-      new_files = files.dup
+      split_files = read("#{::File.dirname(path)}/sharedindex.#{sha.unpack1('H*')}", false)
 
-      files.replace files("#{::File.dirname(file.path)}/sharedindex.#{sha.unpack1('H*')}")
-
       ewah_each_value(file, buf) do |pos|
-        files[pos] = nil
+        split_files[pos] = nil
       end
 
       ewah_each_value(file, buf) do |pos|
-        replacement_file = new_files.shift
+        replacement_file = files.shift
         # the documentation *implies* that this *may* get a new filename
         # i can't get it to happen though
         # :nocov:
-        files[pos] = replacement_file unless replacement_file.empty?
+        split_files[pos] = replacement_file unless replacement_file.empty?
         # :nocov:
       end
 
-      files.compact!
-      files.concat(new_files)
-      files.sort!
+      split_files.compact!
+      split_files.concat(files)
+      split_files.sort!
 
-      extensions(files, file, buf)
+      read_extensions(split_files, file, path, buf)
     end
 
     # format is defined here:
     # https://git-scm.com/docs/bitmap-format#_appendix_a_serialization_format_for_an_ewah_bitmap
     def ewah_each_value(file, buf) # rubocop:disable Metrics/MethodLength, Metrics/AbcSize
       uncompressed_pos = 0
-      file.pos += 4 # uncompressed_bits_count = file.read(4, buf).unpack1('N')
+
+      file.seek(4, 1) # skip 4 byte uncompressed_bits_count.
       compressed_bytes = file.read(4, buf).unpack1('N') * 8
 
       final_file_pos = file.pos + compressed_bytes
 
       until file.pos == final_file_pos
         run_length_word = file.read(8, buf).unpack1('Q>')
         # 1st bit
         run_bit = run_length_word & 1
-        # the next 32 bits, masked, multiplied by 64 (which is shifted by 6 places)
-        run_length = ((run_length_word >> 1) & 0xFFFF_FFFF) << 6
+        # the next 32 bits, masked, multiplied by 64
+        run_length = ((run_length_word / 0b1_0) & 0xFFFF_FFFF) * 64
         # the next 31 bits
-        literal_length = (run_length_word >> 33)
+        literal_length = (run_length_word / 0b100000000_00000000_00000000_00000000_0)
 
         if run_bit == 1
           run_length.times do
             yield uncompressed_pos
             uncompressed_pos += 1
           end
         else
           uncompressed_pos += run_length
         end
 
-        literal_length.times do
-          word = file.read(8, buf).unpack1('B*').reverse
-          word.each_char do |char|
+        next unless literal_length > 0
+
+        words = file.read(8 * literal_length, buf).unpack('B64' * literal_length)
+        words.each do |word|
+          word.each_char.reverse_each do |char|
             yield(uncompressed_pos) if char == '1'
 
             uncompressed_pos += 1
           end
         end
       end
 
-      file.pos += 4 # bitmap metadata for adding to bitmaps
+      file.seek(4, 1) # bitmap metadata for adding to bitmaps
     end
 
     def files_2(files, file) # rubocop:disable Metrics/MethodLength
       files.map! do
-        file.pos += 60 # skip 60 bytes (40 bytes of stat, 20 bytes of sha)
-        length = ((file.getbyte & 0b0000_1111) << 8) + file.getbyte # find the 12 byte length
+        file.seek(60, 1) # skip 60 bytes (40 bytes of stat, 20 bytes of sha)
+
+        length = (file.getbyte & 0xF) * 256 + file.getbyte # find the 12 bit length
         if length < 0xFFF
           path = file.read(length)
           # :nocov:
         else
           # i can't test this i just get ENAMETOOLONG a lot
-          path = file.readline("\0").chop
-          file.pos -= 1
+          # I'm not sure it's even possible to get to this path, PATH_MAX is 4096 bytes on Linux, 1024 on macOS
+          # and length is a 12 bit number: 4095 (0xFFF) max.
+          path = file.readline("\0").chop!
+          file.seek(-1, 1)
           # :nocov:
         end
-        file.pos += 8 - ((length - 2) % 8) # 1-8 bytes padding of nuls
+        file.seek(8 - ((length - 2) % 8), 1) # 1-8 bytes padding of nuls
+        path.force_encoding(Encoding::UTF_8)
         path
       end
     end
 
     def files_3(files, file) # rubocop:disable Metrics/MethodLength, Metrics/AbcSize
       files.map! do
-        file.pos += 60 # skip 60 bytes (40 bytes of stat, 20 bytes of sha)
+        file.seek(60, 1) # skip 60 bytes (40 bytes of stat, 20 bytes of sha)
+        flags = file.getbyte
+        extended_flag = (flags & 0b0100_0000) > 0
+        length = (flags & 0xF) * 256 + file.getbyte # find the 12 bit length
+        file.seek(2, 1) if extended_flag
 
-        flags = file.getbyte * 256 + file.getbyte
-        extended_flag = (flags & 0b0100_0000_0000_0000).positive?
-        file.pos += 2 if extended_flag
-
-        length = flags & 0b0000_1111_1111_1111
         if length < 0xFFF
           path = file.read(length)
           # :nocov:
         else
           # i can't test this i just get ENAMETOOLONG a lot
-          path = file.readline("\0").chop
-          file.pos -= 1
+          # I'm not sure it's even possible to get to this path, PATH_MAX is 4096 bytes on Linux, 1024 on macOS
+          # and length is a 12 bit number: 4095 (0xFFF) max.
+          path = file.readline("\0").chop!
+          file.seek(-1, 1)
           # :nocov:
         end
-
-        file.pos += 8 - ((path.bytesize - (extended_flag ? 0 : 2)) % 8) # 1-8 bytes padding of nuls
+        file.seek(8 - ((path.bytesize - (extended_flag ? 0 : 2)) % 8), 1) # 1-8 bytes padding of nuls
+        path.force_encoding(Encoding::UTF_8)
         path
       end
     end
 
     def files_4(files, file) # rubocop:disable Metrics/MethodLength, Metrics/AbcSize
       prev_entry_path = ''
       files.map! do
-        file.pos += 60 # skip 60 bytes (40 bytes of stat, 20 bytes of sha)
-        flags = file.getbyte * 256 + file.getbyte
-        file.pos += 2 if (flags & 0b0100_0000_0000_0000).positive?
+        file.seek(60, 1) # skip 60 bytes (40 bytes of stat, 20 bytes of sha)
+        flags = file.getbyte
+        extended_flag = (flags & 0b0100_0000) > 0
+        length = (flags & 0xF) * 256 + file.getbyte # find the 12 bit length
+        file.seek(2, 1) if extended_flag
 
-        length = flags & 0b0000_1111_1111_1111
-
         # documentation for this number from
         # https://git-scm.com/docs/pack-format#_original_version_1_pack_idx_files_have_the_following_format
         # offset encoding:
         #   n bytes with MSB set in all but the last one.
         #   The offset is then the number constructed by
@@ -197,30 +199,34 @@
         #   for n >= 2 adding 2^7 + 2^14 + ... + 2^(7*(n-1))
         #   to the result.
         read_offset = 0
         prev_read_offset = file.getbyte
         n = 1
-        while (prev_read_offset & 0b1000_0000).positive?
-          read_offset += (prev_read_offset - 0b1000_0000)
+        while (prev_read_offset & 0b1000_0000) > 0
+          read_offset += (prev_read_offset & 0b0111_1111)
           read_offset += 2**(7 * n)
           n += 1
           prev_read_offset = file.getbyte
         end
         read_offset += prev_read_offset
 
         initial_part_length = prev_entry_path.bytesize - read_offset
 
         if length < 0xFFF
           rest = file.read(length - initial_part_length)
-          file.pos += 1 # the NUL
+          file.seek(1, 1) # the NUL
           # :nocov:
         else
           # i can't test this i just get ENAMETOOLONG a lot
-          rest = file.readline("\0").chop
+          # I'm not sure it's even possible to get to this path, PATH_MAX is 4096 bytes on Linux, 1024 on macOS
+          # and length is a 12 bit number: 4095 (0xFFF) max.
+          rest = file.readline("\0").chop!
+          file.seek(-1, 1)
           # :nocov:
         end
 
         prev_entry_path = prev_entry_path.byteslice(0, initial_part_length) + rest
+        prev_entry_path.force_encoding(Encoding::UTF_8)
       end
     end
   end
 end