# A replacement for RubyZip for streaming, with a couple of small differences.
# The first difference is that it is verbosely-written-to-the-spec and you can actually
# follow what is happening. It does not support quite a few fancy features of Rubyzip,
# but instead it can be digested in one reading, and has solid Zip64 support. It also does
# not attempt any tricks with Zip64 placeholder extra fields because the ZipTricks streaming
# engine assumes you _know_ how large your file is (both compressed and uncompressed) _and_
# you have the file's CRC32 checksum upfront.
#
# Just like Rubyzip it will switch to Zip64 automatically if required, but there is no global
# setting to enable that behavior - it is always on.
class ZipTricks::Microzip
  # Compression method identifiers accepted by #add_local_file_header
  STORED = 0
  DEFLATED = 8

  # Raised when a value does not fit into the field the format reserves for it
  TooMuch = Class.new(StandardError)
  # Raised when the same filename is added to the archive twice
  DuplicateFilenames = Class.new(StandardError)
  # Raised when the storage mode is neither STORED nor DEFLATED
  UnknownMode = Class.new(StandardError)

  FOUR_BYTE_MAX_UINT = 0xFFFFFFFF
  TWO_BYTE_MAX_UINT = 0xFFFF

  VERSION_MADE_BY = 52
  VERSION_NEEDED_TO_EXTRACT = 20
  VERSION_NEEDED_TO_EXTRACT_ZIP64 = 45
  DEFAULT_EXTERNAL_ATTRS = begin
    # These need to be set so that the unarchived files do not become executable on UNIX, for
    # security purposes. Strictly speaking we would want to make this user-customizable,
    # but for now just putting in sane defaults will do. For example, Trac with zipinfo does this:
    # zipinfo.external_attr = 0644 << 16L # permissions -rw-r--r--.
    # We snatch the incantations from Rubyzip for this.
    unix_perms = 0644
    file_type_file = 010
    (file_type_file << 12 | (unix_perms & 07777)) << 16
  end
  MADE_BY_SIGNATURE = begin
    # A combination of the VERSION_MADE_BY low byte and the OS type high byte
    os_type = 3 # UNIX
    [VERSION_MADE_BY, os_type].pack('CC')
  end

  # Frozen pack() directives, shared so that no new String is allocated per field written
  C_V = 'V'.freeze   # 4 bytes, unsigned, little-endian
  C_v = 'v'.freeze   # 2 bytes, unsigned, little-endian
  C_Qe = 'Q<'.freeze # 8 bytes, unsigned, little-endian

  # Helper for measuring how many bytes a piece of writing code is going to emit
  module Bytesize
    # Yields an empty binary String that can be appended to using `<<`, and returns
    # the number of bytes that have been written into it by the block.
    #
    # @yield [String] the scratch buffer to write into
    # @return [Integer] the size of the buffer, in bytes, after the block has run
    def bytesize_of
      # String.new (as opposed to a '' literal) is never frozen and is binary-encoded
      # from the start, so appending to it is safe even with frozen string literals enabled.
      String.new.tap { |buf| yield(buf) }.bytesize
    end
  end
  include Bytesize

  # A single file entry of the archive. Knows how to write out its own local file header
  # and its own central directory file header.
  class Entry < Struct.new(:filename, :crc32, :compressed_size, :uncompressed_size, :storage_mode, :mtime)
    include Bytesize

    # Accepts the struct members positionally (filename, crc32, compressed_size,
    # uncompressed_size, storage_mode, mtime).
    #
    # @raise [TooMuch] if the filename occupies more bytes than a 2-byte length field can describe
    def initialize(*)
      super
      @requires_zip64 = (compressed_size > FOUR_BYTE_MAX_UINT || uncompressed_size > FOUR_BYTE_MAX_UINT)
      if filename.bytesize > TWO_BYTE_MAX_UINT
        raise TooMuch, "The given filename is too long to fit (%d bytes)" % filename.bytesize
      end
    end

    # Tells whether this entry is going to be written with Zip64 extra fields. Note that
    # the answer can flip to `true` once the central directory header of the entry has been
    # written, if the local file header of the entry is located beyond the 4-byte offset limit.
    #
    # @return [Boolean]
    def requires_zip64?
      @requires_zip64
    end

    # Set the general purpose flags for the entry. The only flag we care about is the EFS
    # bit (bit 11) which should be set if the filename is UTF8. If it is, we need to set the
    # bit so that the unarchiving application knows that the filename in the archive is UTF-8
    # encoded, and not some DOS default. For ASCII entries it does not matter.
    #
    # Now, strictly speaking, if a diacritic-containing character (such as å) does fit into the DOS-437
    # codepage, it should be encodable as such. This would, in theory, let older Windows tools
    # decode the filename correctly. However, this kills the filename decoding for the OSX builtin
    # archive utility (it assumes the filename to be UTF-8, regardless). So if we allow filenames
    # to be encoded in DOS-437, we _potentially_ have support in Windows but we upset everyone on Mac.
    # If we just use UTF-8 and set the right EFS bit in general purpose flags, we upset Windows users
    # because most of the Windows unarchive tools (at least the builtin ones) do not give a flying eff
    # about the EFS support bit being set.
    #
    # Additionally, if we use Unarchiver on OSX (which is our recommended unpacker for large files),
    # it will (very rightfully) ask us how we should decode each filename that does not have the EFS bit,
    # but does contain something non-ASCII-decodable. This is horrible UX for users.
    #
    # So, basically, we have 2 choices, for filenames containing diacritics (for bona-fide UTF-8 you do not
    # even get those choices, you _have_ to use UTF-8):
    #
    # * Make life easier for Windows users by setting stuff to DOS, not care about the standard _and_ make
    #   most of Mac users upset
    # * Make life easy for Mac users and conform to the standard, and tell Windows users to get a _decent_
    #   ZIP unarchiving tool.
    #
    # We are going with option 2, and this is well-thought-out. Trust me. If you want the crazytown
    # filename encoding scheme that is described here http://stackoverflow.com/questions/13261347
    # you can try this:
    #
    #  [Encoding::CP437, Encoding::ISO_8859_1, Encoding::UTF_8]
    #
    # We don't want no such thing, and sorry Windows users, you are going to need a decent unarchiver
    # that honors the standard. Alas, alas.
    #
    # @return [Integer] the general purpose bit flags, with bit 11 set if the filename is not ASCII-encodable
    def gp_flags_based_on_filename
      filename.encode(Encoding::ASCII)
      0b00000000000
    rescue EncodingError
      0b00000000000 | 0b100000000000
    end

    # Writes the local file header of the entry (the part that precedes the file data).
    #
    # @param io[#<<] the buffer to write the header to
    # @return [void]
    def write_local_file_header(io)
      # TBD: caveat. If this entry _does_ fit into a standard zip segment (both compressed and
      # uncompressed size at or below 0xFFFF etc), but it is _located_ at an offset that requires
      # Zip64 to be used (beyond 4GB), we are going to be omitting the Zip64 extras in the local
      # file header, but we will be enabling them when writing the central directory. Then the
      # CD record for the file _will_ have Zip64 extra, but the local file header won't. In theory,
      # this should not pose a problem, but then again... life in this world can be harsh.
      #
      # If it turns out that it _does_ pose a problem, we can always do:
      #
      #   @requires_zip64 = true if io.tell > FOUR_BYTE_MAX_UINT
      #
      # right here, and have the data written regardless even if the file fits.
      io << [0x04034b50].pack(C_V)                        # local file header signature     4 bytes  (0x04034b50)

      if @requires_zip64                                  # version needed to extract       2 bytes
        io << [VERSION_NEEDED_TO_EXTRACT_ZIP64].pack(C_v)
      else
        io << [VERSION_NEEDED_TO_EXTRACT].pack(C_v)
      end

      io << [gp_flags_based_on_filename].pack(C_v)        # general purpose bit flag        2 bytes
      io << [storage_mode].pack(C_v)                      # compression method              2 bytes
      io << [to_binary_dos_time(mtime)].pack(C_v)         # last mod file time              2 bytes
      io << [to_binary_dos_date(mtime)].pack(C_v)         # last mod file date              2 bytes
      io << [crc32].pack(C_V)                             # crc-32                          4 bytes

      if @requires_zip64
        # The actual sizes travel in the Zip64 extra field, the standard fields are maxed out
        io << [FOUR_BYTE_MAX_UINT].pack(C_V)              # compressed size                 4 bytes
        io << [FOUR_BYTE_MAX_UINT].pack(C_V)              # uncompressed size               4 bytes
      else
        io << [compressed_size].pack(C_V)                 # compressed size                 4 bytes
        io << [uncompressed_size].pack(C_V)               # uncompressed size               4 bytes
      end

      # Filename should not be longer than 0xFFFF otherwise this won't fit here
      io << [filename.bytesize].pack(C_v)                 # file name length                2 bytes

      extra_size = 0
      if @requires_zip64
        extra_size += bytesize_of { |buf| write_zip_64_extra_for_local_file_header(buf) }
      end
      io << [extra_size].pack(C_v)                        # extra field length              2 bytes

      io << filename                                      # file name (variable size)

      # Interesting tidbit:
      # https://social.technet.microsoft.com/Forums/windows/en-US/6a60399f-2879-4859-b7ab-6ddd08a70948
      # TL;DR of it is: Windows 7 Explorer _will_ open Zip64 entries. However, it desires to have the
      # Zip64 extra field as _the first_ extra field. If we decide to add the Info-ZIP UTF-8 field...
      write_zip_64_extra_for_local_file_header(io) if @requires_zip64
    end

    # Writes the Zip64 extra field as it has to appear in the local file header.
    #
    # @param io[#<<] the buffer to write the extra field to
    # @return [void]
    def write_zip_64_extra_for_local_file_header(io)
      io << [0x0001].pack(C_v)                        # 2 bytes    Tag for this "extra" block type
      io << [16].pack(C_v)                            # 2 bytes    Size of this "extra" block. For us it will always be 16 (2x8)
      io << [uncompressed_size].pack(C_Qe)            # 8 bytes    Original uncompressed file size
      io << [compressed_size].pack(C_Qe)              # 8 bytes    Size of compressed data
    end

    # Writes the Zip64 extra field as it has to appear in the central directory file header.
    #
    # @param io[#<<] the buffer to write the extra field to
    # @param local_file_header_location[Integer] offset of the local file header of this entry
    # @return [void]
    def write_zip_64_extra_for_central_directory_file_header(io, local_file_header_location)
      io << [0x0001].pack(C_v)                        # 2 bytes    Tag for this "extra" block type
      io << [28].pack(C_v)                            # 2 bytes    Size of this "extra" block. For us it will always be 28
      io << [uncompressed_size].pack(C_Qe)            # 8 bytes    Original uncompressed file size
      io << [compressed_size].pack(C_Qe)              # 8 bytes    Size of compressed data
      io << [local_file_header_location].pack(C_Qe)   # 8 bytes    Offset of local header record
      io << [0].pack(C_V)                             # 4 bytes    Number of the disk on which this file starts
    end

    # Writes the central directory file header of the entry. As a side effect, switches the
    # entry to Zip64 if its local file header is located beyond the 4-byte offset limit.
    #
    # @param io[#<<] the buffer to write the header to
    # @param local_file_header_location[Integer] offset of the local file header of this entry
    # @return [void]
    def write_central_directory_file_header(io, local_file_header_location)
      # At this point if the header begins somewhere beyond 0xFFFFFFFF we _have_ to record the offset
      # of the local file header as a zip64 extra field, so we give up, give in, you lose, love will always win...
      @requires_zip64 = true if local_file_header_location > FOUR_BYTE_MAX_UINT

      io << [0x02014b50].pack(C_V)                        # central file header signature   4 bytes  (0x02014b50)
      io << MADE_BY_SIGNATURE                             # version made by                 2 bytes
      if @requires_zip64
        io << [VERSION_NEEDED_TO_EXTRACT_ZIP64].pack(C_v) # version needed to extract       2 bytes
      else
        io << [VERSION_NEEDED_TO_EXTRACT].pack(C_v)       # version needed to extract       2 bytes
      end

      io << [gp_flags_based_on_filename].pack(C_v)        # general purpose bit flag        2 bytes
      io << [storage_mode].pack(C_v)                      # compression method              2 bytes
      io << [to_binary_dos_time(mtime)].pack(C_v)         # last mod file time              2 bytes
      io << [to_binary_dos_date(mtime)].pack(C_v)         # last mod file date              2 bytes
      io << [crc32].pack(C_V)                             # crc-32                          4 bytes

      if @requires_zip64
        io << [FOUR_BYTE_MAX_UINT].pack(C_V)              # compressed size                 4 bytes
        io << [FOUR_BYTE_MAX_UINT].pack(C_V)              # uncompressed size               4 bytes
      else
        io << [compressed_size].pack(C_V)                 # compressed size                 4 bytes
        io << [uncompressed_size].pack(C_V)               # uncompressed size               4 bytes
      end

      # Filename should not be longer than 0xFFFF otherwise this won't fit here
      io << [filename.bytesize].pack(C_v)                 # file name length                2 bytes

      extra_size = 0
      if @requires_zip64
        extra_size += bytesize_of do |buf|
          write_zip_64_extra_for_central_directory_file_header(buf, local_file_header_location)
        end
      end
      io << [extra_size].pack(C_v)                        # extra field length              2 bytes

      io << [0].pack(C_v)                                 # file comment length             2 bytes
      io << [0].pack(C_v)                                 # disk number start               2 bytes
      io << [0].pack(C_v)                                 # internal file attributes        2 bytes
      io << [DEFAULT_EXTERNAL_ATTRS].pack(C_V)            # external file attributes        4 bytes

      if @requires_zip64
        io << [FOUR_BYTE_MAX_UINT].pack(C_V)              # relative offset of local header 4 bytes
      else
        io << [local_file_header_location].pack(C_V)      # relative offset of local header 4 bytes
      end
      io << filename                                      # file name (variable size)

      if @requires_zip64                                  # extra field (variable size)
        write_zip_64_extra_for_central_directory_file_header(io, local_file_header_location)
      end
      # file comment (variable size)
    end

    private

    # Packs the time-of-day part of `t` into the 2-byte DOS time layout:
    # seconds halved in the low bits, minutes shifted by 5, hours shifted by 11.
    def to_binary_dos_time(t)
      (t.sec/2) + (t.min << 5) + (t.hour << 11)
    end

    # Packs the date part of `t` into the 2-byte DOS date layout:
    # day in the low bits, month shifted by 5, years counted from 1980 shifted by 9.
    def to_binary_dos_date(t)
      (t.day) + (t.month << 5) + ((t.year - 1980) << 9)
    end
  end

  # Creates a new streaming writer.
  # The writer is stateful and knows its list of ZIP file entries as they are being added.
  def initialize
    @files = []
    @local_header_offsets = []
    # Filenames that are already taken, kept in a Hash so that the duplicate check
    # stays constant-time no matter how many entries the archive accumulates
    @filenames = {}
  end

  # Adds a file to the entry list and immediately writes out its local file header into the
  # output stream.
  #
  # @param io[#<<, #tell] the buffer to write the local file header to
  # @param filename[String] The name of the file
  # @param crc32[Fixnum]    The CRC32 checksum of the file
  # @param compressed_size[Fixnum]    The size of the compressed (or stored) data - how much space it uses in the ZIP
  # @param uncompressed_size[Fixnum]  The size of the file once extracted
  # @param storage_mode[Fixnum]  Either 0 for "stored" or 8 for "deflated"
  # @param mtime[Time]  What modification time to record for the file
  # @raise [DuplicateFilenames] if an entry with the same filename has been added already
  # @raise [UnknownMode] if the storage mode is neither STORED nor DEFLATED
  # @raise [TooMuch] if the filename is longer than 0xFFFF bytes
  # @return [void]
  def add_local_file_header(io:, filename:, crc32:, compressed_size:, uncompressed_size:, storage_mode:, mtime: Time.now.utc)
    if @filenames.key?(filename)
      raise DuplicateFilenames, "Filename #{filename.inspect} already used in the archive"
    end
    raise UnknownMode, "Unknown compression mode #{storage_mode}" unless [STORED, DEFLATED].include?(storage_mode)

    e = Entry.new(filename, crc32, compressed_size, uncompressed_size, storage_mode, mtime)
    @files << e
    # Only register the name once the entry has been accepted, so that a rejected
    # entry does not block the filename from being used later
    @filenames[filename] = true
    @local_header_offsets << io.tell
    e.write_local_file_header(io)
  end

  # Writes the central directory (including the Zip64 salient bits if necessary)
  #
  # @param io[#<<, #tell] the buffer to write the central directory to.
  #    The method will use `tell` on the buffer since it has to know where the central directory is located
  # @return [void]
  def write_central_directory(io)
    start_of_central_directory = io.tell

    # Central directory file headers, per file in order
    @files.each_with_index do |file, i|
      local_file_header_offset_from_start_of_file = @local_header_offsets.fetch(i)
      file.write_central_directory_file_header(io, local_file_header_offset_from_start_of_file)
    end
    central_dir_size = io.tell - start_of_central_directory

    # Has to be computed _after_ the file headers are written, since writing a header
    # can switch an entry to Zip64 (if the entry is located beyond the 4-byte offset limit)
    zip64_required = central_dir_size > FOUR_BYTE_MAX_UINT ||
      start_of_central_directory > FOUR_BYTE_MAX_UINT ||
      @files.length > TWO_BYTE_MAX_UINT ||
      @files.any?(&:requires_zip64?)

    # Then, if zip64 is used
    if zip64_required
      # [zip64 end of central directory record]
      zip64_eocdr_offset = io.tell
                                                          # zip64 end of central dir
      io << [0x06064b50].pack(C_V)                        # signature                       4 bytes  (0x06064b50)
      io << [44].pack(C_Qe)                               # size of zip64 end of central
                                                          # directory record                8 bytes
                                                          # (this is ex. the 12 bytes of the signature and the size value itself).
                                                          # Without the extensible data sector it is always 44.
      io << MADE_BY_SIGNATURE                             # version made by                 2 bytes
      io << [VERSION_NEEDED_TO_EXTRACT_ZIP64].pack(C_v)   # version needed to extract       2 bytes
      io << [0].pack(C_V)                                 # number of this disk             4 bytes
      io << [0].pack(C_V)                                 # number of the disk with the
                                                          # start of the central directory  4 bytes
      io << [@files.length].pack(C_Qe)                    # total number of entries in the
                                                          # central directory on this disk  8 bytes
      io << [@files.length].pack(C_Qe)                    # total number of entries in the
                                                          # central directory               8 bytes
      io << [central_dir_size].pack(C_Qe)                 # size of the central directory   8 bytes
                                                          # offset of start of central
                                                          # directory with respect to
      io << [start_of_central_directory].pack(C_Qe)       # the starting disk number        8 bytes
                                                          # zip64 extensible data sector    (variable size)

      # [zip64 end of central directory locator]
      io << [0x07064b50].pack(C_V)                        # zip64 end of central dir locator
                                                          # signature                       4 bytes  (0x07064b50)
      io << [0].pack(C_V)                                 # number of the disk with the
                                                          # start of the zip64 end of
                                                          # central directory               4 bytes
      io << [zip64_eocdr_offset].pack(C_Qe)               # relative offset of the zip64
                                                          # end of central directory record 8 bytes
                                                          # (note: "relative" is actually "from the start of the file")
      io << [1].pack(C_V)                                 # total number of disks           4 bytes
    end

    # Then the end of central directory record:
    io << [0x06054b50].pack(C_V)                          # end of central dir signature     4 bytes  (0x06054b50)
    io << [0].pack(C_v)                                   # number of this disk              2 bytes
    io << [0].pack(C_v)                                   # number of the disk with the
                                                          # start of the central directory   2 bytes

    if zip64_required # the number of entries will be read from the zip64 part of the central directory
      io << [TWO_BYTE_MAX_UINT].pack(C_v)                 # total number of entries in the
                                                          # central directory on this disk   2 bytes
      io << [TWO_BYTE_MAX_UINT].pack(C_v)                 # total number of entries in
                                                          # the central directory            2 bytes
    else
      io << [@files.length].pack(C_v)                     # total number of entries in the
                                                          # central directory on this disk   2 bytes
      io << [@files.length].pack(C_v)                     # total number of entries in
                                                          # the central directory            2 bytes
    end

    if zip64_required
      io << [FOUR_BYTE_MAX_UINT].pack(C_V)                # size of the central directory    4 bytes
      io << [FOUR_BYTE_MAX_UINT].pack(C_V)                # offset of start of central
                                                          # directory with respect to
                                                          # the starting disk number         4 bytes
    else
      io << [central_dir_size].pack(C_V)                  # size of the central directory    4 bytes
      io << [start_of_central_directory].pack(C_V)        # offset of start of central
                                                          # directory with respect to
                                                          # the starting disk number         4 bytes
    end
    io << [0].pack(C_v)                                   # .ZIP file comment length         2 bytes
                                                          # .ZIP file comment       (variable size)
  end

  private_constant :FOUR_BYTE_MAX_UINT, :TWO_BYTE_MAX_UINT,
    :VERSION_MADE_BY, :VERSION_NEEDED_TO_EXTRACT, :VERSION_NEEDED_TO_EXTRACT_ZIP64,
    :DEFAULT_EXTERNAL_ATTRS, :MADE_BY_SIGNATURE,
    :Entry, :C_V, :C_v, :C_Qe
end