ascii85.rb in Ascii85-2.0.0

- old
+ new
@@ -1,221 +1,453 @@
-# encoding: utf-8
 # frozen_string_literal: true
 
+require 'stringio'
 
 #
 # Ascii85 is an implementation of Adobe's binary-to-text encoding of the
 # same name in pure Ruby.
 #
-# See http://www.adobe.com/products/postscript/pdfs/PLRM.pdf page 131
-# and http://en.wikipedia.org/wiki/Ascii85 for more information about
-# the format.
+# See http://en.wikipedia.org/wiki/Ascii85 for more information about the
+# format.
 #
 # Author::  Johannes Holzfuß (johannes@holzfuss.name)
 # License:: Distributed under the MIT License (see LICENSE file)
 #
+module Ascii85
+  class << self
+    #
+    # Encodes the bytes of the given String or IO-like object as Ascii85.
+    #
+    # @param str_or_io [String, IO] The input to encode
+    # @param wrap_lines [Integer, false] The line length for wrapping, or +false+ for no wrapping
+    # @param out [IO, nil] An optional IO-like object to write the output to
+    #
+    # @return [String, IO] The encoded String or the output IO object that was passed in
+    #
+    # @example Encoding a simple String
+    #   Ascii85.encode("Ruby")
+    #   # => <~;KZGo~>
+    #
+    # @example Encoding with line wrapping
+    #   Ascii85.encode("Supercalifragilisticexpialidocious", 15)
+    #   # => <~;g!%jEarNoBkD
+    #   #    BoB5)0rF*),+AU&
+    #   #    0.@;KXgDe!L"F`R
+    #   #    ~>
+    #
+    # @example Encoding without line wrapping
+    #   Ascii85.encode("Supercalifragilisticexpialidocious", false)
+    #   # => <~;g!%jEarNoBkDBoB5)0rF*),+AU&0.@;KXgDe!L"F`R~>
+    #
+    # @example Encoding from an IO-like object
+    #   input = StringIO.new("Ruby")
+    #   Ascii85.encode(input)
+    #   # => "<~;KZGo~>"
+    #
+    # @example Encoding to an IO object
+    #   output = StringIO.new
+    #   Ascii85.encode("Ruby", out: output)
+    #   # => output (with "<~;KZGo~>" written to it)
+    #
+    def encode(str_or_io, wrap_lines = 80, out: nil)
+      reader = if io_like?(str_or_io)
+                 str_or_io
+               else
+                 StringIO.new(str_or_io.to_s, 'rb') # anything else is read via #to_s through a binary StringIO
+               end
 
+      return ''.dup if reader.eof? # NOTE(review): empty input returns a String even when +out+ was given
 
-module Ascii85
-  #
-  # Encodes the bytes of the given String as Ascii85.
-  #
-  # If +wrap_lines+ evaluates to +false+, the output will be returned as
-  # a single long line. Otherwise #encode formats the output into lines
-  # of length +wrap_lines+ (minimum is 2).
-  #
-  #     Ascii85.encode("Ruby")
-  #     => <~;KZGo~>
-  #
-  #     Ascii85.encode("Supercalifragilisticexpialidocious", 15)
-  #     => <~;g!%jEarNoBkD
-  #        BoB5)0rF*),+AU&
-  #        0.@;KXgDe!L"F`R
-  #        ~>
-  #
-  #     Ascii85.encode("Supercalifragilisticexpialidocious", false)
-  #     => <~;g!%jEarNoBkDBoB5)0rF*),+AU&0.@;KXgDe!L"F`R~>
-  #
-  #
-  def self.encode(str, wrap_lines = 80)
-    to_encode = str.to_s
-    return '' if to_encode.empty?
+      # Set up the buffered reader and writer (the input chunk size is a multiple of four bytes)
+      bufreader = BufferedReader.new(reader, unencoded_chunk_size)
+      bufwriter = BufferedWriter.new(out || StringIO.new(String.new, 'wb'), encoded_chunk_size)
+      writer = wrap_lines ? Wrapper.new(bufwriter, wrap_lines) : DummyWrapper.new(bufwriter)
 
-    # Deal with multi-byte encodings
-    if to_encode.respond_to?(:bytesize)
-      input_size = to_encode.bytesize
-    else
-      input_size = to_encode.size
-    end
+      padding = "\0\0\0\0" # zero bytes used to pad the final partial word
+      tuplebuf = '!!!!!'.dup # scratch buffer, overwritten in place for every encoded tuple
 
-    # Compute number of \0s to pad the message with (0..3)
-    padding_length = (-input_size) % 4
+      bufreader.each_chunk do |chunk|
+        chunk.unpack('N*').each do |word| # 'N*' skips trailing bytes that do not fill a whole word
+          # Encode each big-endian 32-bit word into a 5-character tuple (except
+          # for 0, which encodes to 'z')
+          if word.zero?
+            writer.write('z')
+          else
+            word, b0 = word.divmod(85) # b0 is the least significant base-85 digit
+            word, b1 = word.divmod(85)
+            word, b2 = word.divmod(85)
+            word, b3 = word.divmod(85)
+            b4 = word
 
-    # Extract big-endian integers
-    tuples = (to_encode + ("\0" * padding_length)).unpack('N*')
+            tuplebuf.setbyte(0, b4 + 33) # most significant digit first; adding 33 maps a digit to '!'..'u'
+            tuplebuf.setbyte(1, b3 + 33)
+            tuplebuf.setbyte(2, b2 + 33)
+            tuplebuf.setbyte(3, b1 + 33)
+            tuplebuf.setbyte(4, b0 + 33)
 
-    # Encode
-    tuples.map! do |tuple|
-      if tuple == 0
-        'z'
-      else
-        tmp = String.new
-        5.times do
-          tmp << ((tuple % 85) + 33).chr
-          tuple /= 85
+            writer.write(tuplebuf)
+          end
         end
-        tmp.reverse
-      end
-    end
 
-    # We can't use the z-abbreviation if we're going to cut off padding
-    if (padding_length > 0) and (tuples.last == 'z')
-      tuples[-1] = '!!!!!'
-    end
+        next if (chunk.bytesize & 0b11).zero? # bytesize is a multiple of four: no leftover bytes
 
-    # Cut off the padding
-    tuples[-1] = tuples[-1][0..(4 - padding_length)]
+        # If we have leftover bytes, we need to zero-pad to a multiple of four
+        # before converting to a 32-bit word.
+        padding_length = (-chunk.bytesize) % 4
+        trailing = chunk[-(4 - padding_length)..] # the 1..3 leftover bytes
+        word = (trailing + padding[0...padding_length]).unpack1('N')
 
-    # If we don't need to wrap the lines, add delimiters and return
-    if (!wrap_lines)
-      return '<~' + tuples.join + '~>'
-    end
+        # Encode the last word and cut off any padding
+        if word.zero?
+          writer.write('!!!!!'[0..(4 - padding_length)]) # no 'z' shortcut here: the tuple is truncated
+        else
+          word, b0 = word.divmod(85)
+          word, b1 = word.divmod(85)
+          word, b2 = word.divmod(85)
+          word, b3 = word.divmod(85)
+          b4 = word
 
-    # Otherwise we wrap the lines
-    line_length = [2, wrap_lines.to_i].max
+          tuplebuf.setbyte(0, b4 + 33)
+          tuplebuf.setbyte(1, b3 + 33)
+          tuplebuf.setbyte(2, b2 + 33)
+          tuplebuf.setbyte(3, b1 + 33)
+          tuplebuf.setbyte(4, b0 + 33)
 
-    wrapped = []
-    to_wrap = '<~' + tuples.join
+          writer.write(tuplebuf[0..(4 - padding_length)]) # keep the first 5 - padding_length characters
+        end
+      end
 
-    0.step(to_wrap.length, line_length) do |index|
-      wrapped << to_wrap.slice(index, line_length)
-    end
+      # If no output IO-object was provided, extract the encoded String from the
+      # default StringIO writer. We force the encoding to 'ASCII-8BIT' to work
+      # around a TruffleRuby bug.
+      return writer.finish.io.string.force_encoding('ASCII-8BIT') if out.nil?
 
-    # Add end-marker – on a new line if necessary
-    if (wrapped.last.length + 2) > line_length
-      wrapped << '~>'
-    else
-      wrapped[-1] << '~>'
+      # Otherwise we make sure to flush the output writer, and then return it.
+      writer.finish.io
     end
 
-    return wrapped.join("\n")
-  end
+    # Searches through a String and extracts the first substring enclosed by '<~' and '~>'.
+    #
+    # @param str [String] The String to search through
+    #
+    # @return [String] The extracted substring, or an empty String if no valid delimiters are found
+    #
+    # @example Extracting Ascii85 content
+    #   Ascii85.extract("Foo<~;KZGo~>Bar<~z~>Baz")
+    #   # => ";KZGo"
+    #
+    # @example When no delimiters are found
+    #   Ascii85.extract("No delimiters")
+    #   # => ""
+    #
+    # @note This method only accepts a String, not an IO-like object, as the entire input
+    #       needs to be available to ensure validity.
+    #
+    def extract(str)
+      input = str.to_s # accept anything that responds to #to_s
 
-  #
-  # Searches through +str+ and decodes the _first_ Ascii85-String found.
-  #
-  # #decode expects an Ascii85-encoded String enclosed in <~ and ~> — it will
-  # ignore all characters outside these markers. The returned strings are always
-  # encoded as ASCII-8BIT.
-  #
-  #     Ascii85.decode("<~;KZGo~>")
-  #     => "Ruby"
-  #
-  #     Ascii85.decode("Foo<~;KZGo~>Bar<~;KZGo~>Baz")
-  #     => "Ruby"
-  #
-  #     Ascii85.decode("No markers")
-  #     => ""
-  #
-  # #decode will raise Ascii85::DecodingError when malformed input is
-  # encountered.
-  #
-  def self.decode(str)
-    input = str.to_s
+      # Transcode the delimiter Strings to the input's encoding before searching for them.
+      opening_delim = '<~'.encode(input.encoding)
+      closing_delim = '~>'.encode(input.encoding)
 
-    opening_delim = '<~'
-    closing_delim = '~>'
+      # Get the positions of the opening delimiter and of the first closing delimiter
+      # after it. If there is no such pair, return an unfrozen empty String.
+      (start_pos = input.index(opening_delim))                or return ''.dup
+      (end_pos   = input.index(closing_delim, start_pos + 2)) or return ''.dup
 
-    # Make sure the delimiter strings have the correct encoding.
+      # Return the String between the delimiters (the delimiters themselves are excluded)
+      input[(start_pos + 2)...end_pos]
+    end
+
     #
-    # Although I don't think it likely, this may raise encoding
-    # errors if an especially exotic input encoding is introduced.
-    # As of Ruby 1.9.2 all non-dummy encodings work fine though.
+    # Searches through a String and decodes the first substring enclosed by '<~' and '~>'.
     #
-    if opening_delim.respond_to?(:encode)
-      opening_delim = opening_delim.encode(input.encoding)
-      closing_delim = closing_delim.encode(input.encoding)
+    # @param str [String] The String containing Ascii85-encoded content
+    # @param out [IO, nil] An optional IO-like object to write the output to
+    #
+    # @return [String, IO] The decoded String (in ASCII-8BIT encoding) or the output IO object (if it was provided)
+    #
+    # @raise [Ascii85::DecodingError] When malformed input is encountered
+    #
+    # @example Decoding Ascii85 content
+    #   Ascii85.decode("<~;KZGo~>")
+    #   # => "Ruby"
+    #
+    # @example Decoding with multiple Ascii85 blocks present (ignores all but the first)
+    #   Ascii85.decode("Foo<~;KZGo~>Bar<~87cURDZ~>Baz")
+    #   # => "Ruby"
+    #
+    # @example When no delimiters are found
+    #   Ascii85.decode("No delimiters")
+    #   # => ""
+    #
+    # @example Decoding to an IO object
+    #   output = StringIO.new
+    #   Ascii85.decode("<~;KZGo~>", out: output)
+    #   # => output (with "Ruby" written to it)
+    #
+    # @note This method only accepts a String, not an IO-like object, as the entire input
+    #       needs to be available to ensure validity.
+    #
+    def decode(str, out: nil)
+      decode_raw(extract(str), out: out) # extract yields '' when no delimiter pair is found
     end
 
-    # Get the positions of the opening/closing delimiters. If there is
-    # no pair of opening/closing delimiters, return the empty string.
-    (start_pos = input.index(opening_delim))                or return ''
-    (end_pos   = input.index(closing_delim, start_pos + 2)) or return ''
+    #
+    # Decodes the given raw Ascii85-encoded String or IO-like object.
+    #
+    # @param str_or_io [String, IO] The Ascii85-encoded input to decode
+    # @param out [IO, nil] An optional IO-like object to write the output to
+    #
+    # @return [String, IO] The decoded String (in ASCII-8BIT encoding) or the output IO object (if it was provided)
+    #
+    # @raise [Ascii85::DecodingError] When malformed input is encountered
+    #
+    # @example Decoding a raw Ascii85 String
+    #   Ascii85.decode_raw(";KZGo")
+    #   # => "Ruby"
+    #
+    # @example Decoding from an IO-like object
+    #   input = StringIO.new(";KZGo")
+    #   Ascii85.decode_raw(input)
+    #   # => "Ruby"
+    #
+    # @example Decoding to an IO object
+    #   output = StringIO.new
+    #   Ascii85.decode_raw(";KZGo", out: output)
+    #   # => output (with "Ruby" written to it)
+    #
+    # @note The input must not be enclosed in '<~' and '~>' delimiters.
+    #
+    def decode_raw(str_or_io, out: nil)
+      reader = if io_like?(str_or_io)
+                 str_or_io
+               else
+                 StringIO.new(str_or_io.to_s, 'rb') # anything else is read via #to_s through a binary StringIO
+               end
 
-    # Get the string inside the delimiter-pair
-    input = input[(start_pos + 2)...end_pos]
+      # Return an unfrozen String on empty input (NOTE(review): even when +out+ was given)
+      return ''.dup if reader.eof?
 
-    # Decode
-    word   = 0
-    count  = 0
-    result = []
+      # Set up the buffered reader and writer
+      bufreader = BufferedReader.new(reader, encoded_chunk_size)
+      bufwriter = BufferedWriter.new(out || StringIO.new(String.new, 'wb'), unencoded_chunk_size)
 
-    input.each_byte do |c|
-      case c.chr
-      when " ", "\t", "\r", "\n", "\f", "\0"
-        # Ignore whitespace
-        next
+      # Populate the lookup table: lut[i] is the weight 85**(4 - i) of the i-th character of a 5-tuple
+      lut = (0..4).map { |count| 85**(4 - count) }
 
-      when 'z'
-        if count == 0
-          # Expand z to 0-word
-          result << 0
-        else
-          raise(Ascii85::DecodingError, "Found 'z' inside Ascii85 5-tuple")
-        end
+      # Decoder state: +word+ accumulates the current tuple, +count+ is the number of characters seen in it
+      word   = 0
+      count  = 0
+      wordbuf = "\0\0\0\0".dup # scratch buffer, overwritten in place for every decoded word
 
-      when '!'..'u'
-        # Decode 5 characters into a 4-byte word
-        word  += (c - 33) * 85**(4 - count)
-        count += 1
+      bufreader.each_chunk do |chunk|
+        chunk.each_byte do |c|
+          case c.chr
+          when ' ', "\t", "\r", "\n", "\f", "\0"
+            # Ignore whitespace (and NUL bytes)
+            next
 
-        if count == 5
+          when 'z'
+            raise(Ascii85::DecodingError, "Found 'z' inside Ascii85 5-tuple") unless count.zero?
 
-          if word > 0xffffffff
-            raise(Ascii85::DecodingError,
-                  "Invalid Ascii85 5-tuple (#{word} >= 2**32)")
+            # Expand z to 0-word
+            bufwriter.write("\0\0\0\0")
+
+          when '!'..'u'
+            # Decode 5 characters into a 4-byte word
+            word  += (c - 33) * lut[count]
+            count += 1
+
+            if count == 5 && word > 0xffffffff
+              raise(Ascii85::DecodingError, "Invalid Ascii85 5-tuple (#{word} >= 2**32)")
+            elsif count == 5
+              b3 = word & 0xff; word >>= 8 # split the word into four bytes, least significant first
+              b2 = word & 0xff; word >>= 8
+              b1 = word & 0xff; word >>= 8
+              b0 = word
+
+              wordbuf.setbyte(0, b0) # big-endian: most significant byte first
+              wordbuf.setbyte(1, b1)
+              wordbuf.setbyte(2, b2)
+              wordbuf.setbyte(3, b3)
+
+              bufwriter.write(wordbuf)
+
+              word  = 0
+              count = 0
+            end
+
+          else
+            raise(Ascii85::DecodingError, "Illegal character inside Ascii85: #{c.chr.dump}")
           end
+        end
+      end
 
-          result << word
+      # We're done if all 5-tuples have been consumed
+      if count.zero?
+        bufwriter.flush
+        return out || bufwriter.io.string.force_encoding('ASCII-8BIT')
+      end
 
-          word  = 0
-          count = 0
+      raise(Ascii85::DecodingError, 'Last 5-tuple consists of single character') if count == 1
+
+      # Finish last, partially decoded 32-bit word: a group of n characters (2..4) yields n - 1 bytes
+      count -= 1
+      word  += lut[count] # adds 85**(number of missing characters), i.e. 'u'-padding plus one
+
+      bufwriter.write((word >> 24).chr) if count >= 1 # NOTE(review): no & 0xff mask; Integer#chr raises RangeError if word >= 2**32
+      bufwriter.write(((word >> 16) & 0xff).chr) if count >= 2
+      bufwriter.write(((word >> 8) & 0xff).chr) if count == 3
+      bufwriter.flush
+
+      out || bufwriter.io.string.force_encoding('ASCII-8BIT')
+    end
+
+    private
+
+    # Buffers an underlying IO object to increase efficiency. You do not need
+    # to use this directly.
+    #
+    # @private
+    #
+    class BufferedReader
+      def initialize(io, buffer_size) # io must respond to #read and #eof?; buffer_size is passed to io.read
+        @io = io
+        @buffer_size = buffer_size
+      end
+
+      def each_chunk # yields chunks read from io until io.eof?; returns an Enumerator when called without a block
+        return enum_for(:each_chunk) unless block_given?
+
+        until @io.eof?
+          chunk = @io.read(@buffer_size)
+          yield chunk if chunk # a nil result from read is skipped
         end
+      end
+    end
 
-      else
-        raise(Ascii85::DecodingError,
-              "Illegal character inside Ascii85: #{c.chr.dump}")
+    # Buffers an underlying IO object to increase efficiency. You do not need
+    # to use this directly.
+    #
+    # @private
+    #
+    class BufferedWriter
+      attr_accessor :io # the underlying output object that #flush writes to
+
+      def initialize(io, buffer_size) # buffer_size is the fill level (in bytes) that triggers a flush
+        @io = io
+        @buffer_size = buffer_size
+        @buffer = String.new(capacity: buffer_size)
       end
+
+      def write(tuple) # appends tuple to the buffer, flushing first if it would not fit
+        flush if @buffer.bytesize + tuple.bytesize > @buffer_size
+        @buffer << tuple
+      end
+
+      def flush # writes the buffered data to io and empties the buffer
+        @io.write(@buffer)
+        @buffer.clear
+      end
     end
 
-    # Convert result into a String
-    result = result.pack('N*')
+    # Wraps the input in '<~' and '~>' delimiters and passes it through
+    # unmodified to the underlying IO object otherwise. You do not need to
+    # use this directly.
+    #
+    # @private
+    #
+    class DummyWrapper
+      def initialize(out) # out must respond to #write and #flush
+        @out = out
+        @out.write('<~') # the opening delimiter is written immediately
+      end
 
-    if count > 0
-      # Finish last, partially decoded 32-bit-word
+      def write(buffer) # passes buffer through unchanged
+        @out.write(buffer)
+      end
 
-      if count == 1
-        raise(Ascii85::DecodingError,
-              "Last 5-tuple consists of single character")
+      def finish # writes the closing delimiter, flushes, and returns the underlying writer
+        @out.write('~>')
+        @out.flush
+
+        @out
       end
+    end
 
-      count -= 1
-      word  += 85**(4 - count)
+    # Wraps the input in '<~' and '~>' delimiters and ensures that no line is
+    # longer than the specified length. You do not need to use this directly.
+    #
+    # @private
+    #
+    class Wrapper
+      def initialize(out, wrap_lines) # out must respond to #write and #flush
+        @line_length = [2, wrap_lines.to_i].max # lines are at least two characters long
 
-      result << ((word >> 24) & 255).chr if count >= 1
-      result << ((word >> 16) & 255).chr if count >= 2
-      result << ((word >>  8) & 255).chr if count == 3
+        @out = out
+        @out.write('<~')
+
+        @cur_len = 2 # the opening delimiter already occupies two columns of the first line
+      end
+
+      def write(buffer) # writes buffer, inserting "\n" whenever a line reaches @line_length
+        loop do
+          s = buffer.bytesize
+
+          if @cur_len + s < @line_length # fits without filling the line: no line break needed
+            @out.write(buffer)
+            @cur_len += s
+            return
+          end
+
+          remaining = @line_length - @cur_len # space left on the current line
+          @out.write(buffer[0...remaining])
+          @out.write("\n")
+          @cur_len = 0
+          buffer = buffer[remaining..] # continue with whatever did not fit
+          return if buffer.empty?
+        end
+      end
+
+      def finish # writes the closing delimiter, flushes, and returns the underlying writer
+        # Add the closing delimiter (may need to be pushed to the next line)
+        @out.write("\n") if @cur_len + 2 > @line_length
+        @out.write('~>')
+
+        @out.flush
+        @out
+      end
     end
 
-    return result
+    # Check if an object is IO-like
+    #
+    # @private
+    #
+    def io_like?(obj) # duck-type check: obj must respond to both #read and #eof?
+      obj.respond_to?(:read) &&
+        obj.respond_to?(:eof?)
+    end
+
+    # @return [Integer] Buffer size for to-be-encoded input
+    #
+    def unencoded_chunk_size
+      4 * 2048 # room for 2048 four-byte words
+    end
+
+    # @return [Integer] Buffer size for encoded output
+    #
+    def encoded_chunk_size
+      5 * 2048 # room for 2048 five-character tuples
+    end
   end
 
   #
-  # This error is raised when Ascii85.decode encounters one of the following
-  # problems in the input:
+  # Error raised when Ascii85.decode or Ascii85.decode_raw encounters problems in the input.
   #
-  # * An invalid character. Valid characters are '!'..'u' and 'z'.
-  # * A 'z' character inside a 5-tuple. 'z's are only valid on their own.
+  # This error is raised for the following issues:
+  # * An invalid character (valid characters are '!'..'u' and 'z'; whitespace and NUL bytes are ignored)
+  # * A 'z' character inside a 5-tuple ('z' is only valid on its own)
   # * An invalid 5-tuple that decodes to >= 2**32
   # * The last tuple consisting of a single character. Valid tuples always have
   #   at least two characters.
   #
   class DecodingError < StandardError; end