lib/ascii85.rb in Ascii85-1.1.0 vs lib/ascii85.rb in Ascii85-2.0.0

- old
+ new

@@ -1,221 +1,453 @@ -# encoding: utf-8 # frozen_string_literal: true +require 'stringio' # # Ascii85 is an implementation of Adobe's binary-to-text encoding of the # same name in pure Ruby. # -# See http://www.adobe.com/products/postscript/pdfs/PLRM.pdf page 131 -# and http://en.wikipedia.org/wiki/Ascii85 for more information about -# the format. +# See http://en.wikipedia.org/wiki/Ascii85 for more information about the +# format. # # Author:: Johannes Holzfuß (johannes@holzfuss.name) # License:: Distributed under the MIT License (see LICENSE file) # +module Ascii85 + class << self + # + # Encodes the bytes of the given String or IO-like object as Ascii85. + # + # @param str_or_io [String, IO] The input to encode + # @param wrap_lines [Integer, false] The line length for wrapping, or +false+ for no wrapping + # @param out [IO, nil] An optional IO-like object to write the output to + # + # @return [String, IO] The encoded String or the output IO object that was passed in + # + # @example Encoding a simple String + # Ascii85.encode("Ruby") + # # => <~;KZGo~> + # + # @example Encoding with line wrapping + # Ascii85.encode("Supercalifragilisticexpialidocious", 15) + # # => <~;g!%jEarNoBkD + # # BoB5)0rF*),+AU& + # # 0.@;KXgDe!L"F`R + # # ~> + # + # @example Encoding without line wrapping + # Ascii85.encode("Supercalifragilisticexpialidocious", false) + # # => <~;g!%jEarNoBkDBoB5)0rF*),+AU&0.@;KXgDe!L"F`R~> + # + # @example Encoding from an IO-like object + # input = StringIO.new("Ruby") + # Ascii85.encode(input) + # # => "<~;KZGo~>" + # + # @example Encoding to an IO object + # output = StringIO.new + # Ascii85.encode("Ruby", out: output) + # # => output (with "<~;KZGo~>" written to it) + # + def encode(str_or_io, wrap_lines = 80, out: nil) + reader = if io_like?(str_or_io) + str_or_io + else + StringIO.new(str_or_io.to_s, 'rb') + end + return ''.dup if reader.eof? -module Ascii85 - # - # Encodes the bytes of the given String as Ascii85. - # - # If +wrap_lines+ evaluates to +false+, the output will be returned as - # a single long line. Otherwise #encode formats the output into lines - # of length +wrap_lines+ (minimum is 2). - # - # Ascii85.encode("Ruby") - # => <~;KZGo~> - # - # Ascii85.encode("Supercalifragilisticexpialidocious", 15) - # => <~;g!%jEarNoBkD - # BoB5)0rF*),+AU& - # 0.@;KXgDe!L"F`R - # ~> - # - # Ascii85.encode("Supercalifragilisticexpialidocious", false) - # => <~;g!%jEarNoBkDBoB5)0rF*),+AU&0.@;KXgDe!L"F`R~> - # - # - def self.encode(str, wrap_lines = 80) - to_encode = str.to_s - return '' if to_encode.empty? + # Setup buffered Reader and Writers + bufreader = BufferedReader.new(reader, unencoded_chunk_size) + bufwriter = BufferedWriter.new(out || StringIO.new(String.new, 'wb'), encoded_chunk_size) + writer = wrap_lines ? Wrapper.new(bufwriter, wrap_lines) : DummyWrapper.new(bufwriter) - # Deal with multi-byte encodings - if to_encode.respond_to?(:bytesize) - input_size = to_encode.bytesize - else - input_size = to_encode.size - end + padding = "\0\0\0\0" + tuplebuf = '!!!!!'.dup - # Compute number of \0s to pad the message with (0..3) - padding_length = (-input_size) % 4 + bufreader.each_chunk do |chunk| + chunk.unpack('N*').each do |word| + # Encode each big-endian 32-bit word into a 5-character tuple (except + # for 0, which encodes to 'z') + if word.zero? + writer.write('z') + else + word, b0 = word.divmod(85) + word, b1 = word.divmod(85) + word, b2 = word.divmod(85) + word, b3 = word.divmod(85) + b4 = word - # Extract big-endian integers - tuples = (to_encode + ("\0" * padding_length)).unpack('N*') + tuplebuf.setbyte(0, b4 + 33) + tuplebuf.setbyte(1, b3 + 33) + tuplebuf.setbyte(2, b2 + 33) + tuplebuf.setbyte(3, b1 + 33) + tuplebuf.setbyte(4, b0 + 33) - # Encode - tuples.map! do |tuple| - if tuple == 0 - 'z' - else - tmp = String.new - 5.times do - tmp << ((tuple % 85) + 33).chr - tuple /= 85 + writer.write(tuplebuf) + end end - tmp.reverse - end - end - # We can't use the z-abbreviation if we're going to cut off padding - if (padding_length > 0) and (tuples.last == 'z') - tuples[-1] = '!!!!!' - end + next if (chunk.bytesize & 0b11).zero? - # Cut off the padding - tuples[-1] = tuples[-1][0..(4 - padding_length)] + # If we have leftover bytes, we need to zero-pad to a multiple of four + # before converting to a 32-bit word. + padding_length = (-chunk.bytesize) % 4 + trailing = chunk[-(4 - padding_length)..] + word = (trailing + padding[0...padding_length]).unpack1('N') - # If we don't need to wrap the lines, add delimiters and return - if (!wrap_lines) - return '<~' + tuples.join + '~>' - end + # Encode the last word and cut off any padding + if word.zero? + writer.write('!!!!!'[0..(4 - padding_length)]) + else + word, b0 = word.divmod(85) + word, b1 = word.divmod(85) + word, b2 = word.divmod(85) + word, b3 = word.divmod(85) + b4 = word - # Otherwise we wrap the lines - line_length = [2, wrap_lines.to_i].max + tuplebuf.setbyte(0, b4 + 33) + tuplebuf.setbyte(1, b3 + 33) + tuplebuf.setbyte(2, b2 + 33) + tuplebuf.setbyte(3, b1 + 33) + tuplebuf.setbyte(4, b0 + 33) - wrapped = [] - to_wrap = '<~' + tuples.join + writer.write(tuplebuf[0..(4 - padding_length)]) + end + end - 0.step(to_wrap.length, line_length) do |index| - wrapped << to_wrap.slice(index, line_length) - end + # If no output IO-object was provided, extract the encoded String from the + # default StringIO writer. We force the encoding to 'ASCII-8BIT' to work + # around a TruffleRuby bug. + return writer.finish.io.string.force_encoding('ASCII-8BIT') if out.nil? - # Add end-marker – on a new line if necessary - if (wrapped.last.length + 2) > line_length - wrapped << '~>' - else - wrapped[-1] << '~>' + # Otherwise we make sure to flush the output writer, and then return it. + writer.finish.io end - return wrapped.join("\n") - end + # Searches through a String and extracts the first substring enclosed by '<~' and '~>'. + # + # @param str [String] The String to search through + # + # @return [String] The extracted substring, or an empty String if no valid delimiters are found + # + # @example Extracting Ascii85 content + # Ascii85.extract("Foo<~;KZGo~>Bar<~z~>Baz") + # # => ";KZGo" + # + # @example When no delimiters are found + # Ascii85.extract("No delimiters") + # # => "" + # + # @note This method only accepts a String, not an IO-like object, as the entire input + # needs to be available to ensure validity. + # + def extract(str) + input = str.to_s - # - # Searches through +str+ and decodes the _first_ Ascii85-String found. - # - # #decode expects an Ascii85-encoded String enclosed in <~ and ~> — it will - # ignore all characters outside these markers. The returned strings are always - # encoded as ASCII-8BIT. - # - # Ascii85.decode("<~;KZGo~>") - # => "Ruby" - # - # Ascii85.decode("Foo<~;KZGo~>Bar<~;KZGo~>Baz") - # => "Ruby" - # - # Ascii85.decode("No markers") - # => "" - # - # #decode will raise Ascii85::DecodingError when malformed input is - # encountered. - # - def self.decode(str) - input = str.to_s + # Make sure the delimiter Strings have the correct encoding. + opening_delim = '<~'.encode(input.encoding) + closing_delim = '~>'.encode(input.encoding) - opening_delim = '<~' - closing_delim = '~>' + # Get the positions of the opening/closing delimiters. If there is no pair + # of opening/closing delimiters, return an unfrozen empty String. + (start_pos = input.index(opening_delim)) or return ''.dup + (end_pos = input.index(closing_delim, start_pos + 2)) or return ''.dup - # Make sure the delimiter strings have the correct encoding. + # Get the String inside the delimiter-pair + input[(start_pos + 2)...end_pos] + end + # - # Although I don't think it likely, this may raise encoding - # errors if an especially exotic input encoding is introduced. - # As of Ruby 1.9.2 all non-dummy encodings work fine though. + # Searches through a String and decodes the first substring enclosed by '<~' and '~>'. # - if opening_delim.respond_to?(:encode) - opening_delim = opening_delim.encode(input.encoding) - closing_delim = closing_delim.encode(input.encoding) + # @param str [String] The String containing Ascii85-encoded content + # @param out [IO, nil] An optional IO-like object to write the output to + # + # @return [String, IO] The decoded String (in ASCII-8BIT encoding) or the output IO object (if it was provided) + # + # @raise [Ascii85::DecodingError] When malformed input is encountered + # + # @example Decoding Ascii85 content + # Ascii85.decode("<~;KZGo~>") + # # => "Ruby" + # + # @example Decoding with multiple Ascii85 blocks present (ignores all but the first) + # Ascii85.decode("Foo<~;KZGo~>Bar<~87cURDZ~>Baz") + # # => "Ruby" + # + # @example When no delimiters are found + # Ascii85.decode("No delimiters") + # # => "" + # + # @example Decoding to an IO object + # output = StringIO.new + # Ascii85.decode("<~;KZGo~>", out: output) + # # => output (with "Ruby" written to it) + # + # @note This method only accepts a String, not an IO-like object, as the entire input + # needs to be available to ensure validity. + # + def decode(str, out: nil) + decode_raw(extract(str), out: out) end - # Get the positions of the opening/closing delimiters. If there is - # no pair of opening/closing delimiters, return the empty string. - (start_pos = input.index(opening_delim)) or return '' - (end_pos = input.index(closing_delim, start_pos + 2)) or return '' + # + # Decodes the given raw Ascii85-encoded String or IO-like object. + # + # @param str_or_io [String, IO] The Ascii85-encoded input to decode + # @param out [IO, nil] An optional IO-like object to write the output to + # + # @return [String, IO] The decoded String (in ASCII-8BIT encoding) or the output IO object (if it was provided) + # + # @raise [Ascii85::DecodingError] When malformed input is encountered + # + # @example Decoding a raw Ascii85 String + # Ascii85.decode_raw(";KZGo") + # # => "Ruby" + # + # @example Decoding from an IO-like object + # input = StringIO.new(";KZGo") + # Ascii85.decode_raw(input) + # # => "Ruby" + # + # @example Decoding to an IO object + # output = StringIO.new + # Ascii85.decode_raw(";KZGo", out: output) + # # => output (with "Ruby" written to it) + # + # @note The input must not be enclosed in '<~' and '~>' delimiters. + # + def decode_raw(str_or_io, out: nil) + reader = if io_like?(str_or_io) + str_or_io + else + StringIO.new(str_or_io.to_s, 'rb') + end - # Get the string inside the delimiter-pair - input = input[(start_pos + 2)...end_pos] + # Return an unfrozen String on empty input + return ''.dup if reader.eof? - # Decode - word = 0 - count = 0 - result = [] + # Setup buffered Reader and Writers + bufreader = BufferedReader.new(reader, encoded_chunk_size) + bufwriter = BufferedWriter.new(out || StringIO.new(String.new, 'wb'), unencoded_chunk_size) - input.each_byte do |c| - case c.chr - when " ", "\t", "\r", "\n", "\f", "\0" - # Ignore whitespace - next + # Populate the lookup table (caches the exponentiation) + lut = (0..4).map { |count| 85**(4 - count) } - when 'z' - if count == 0 - # Expand z to 0-word - result << 0 - else - raise(Ascii85::DecodingError, "Found 'z' inside Ascii85 5-tuple") - end + # Decode + word = 0 + count = 0 + wordbuf = "\0\0\0\0".dup - when '!'..'u' - # Decode 5 characters into a 4-byte word - word += (c - 33) * 85**(4 - count) - count += 1 + bufreader.each_chunk do |chunk| + chunk.each_byte do |c| + case c.chr + when ' ', "\t", "\r", "\n", "\f", "\0" + # Ignore whitespace + next - if count == 5 + when 'z' + raise(Ascii85::DecodingError, "Found 'z' inside Ascii85 5-tuple") unless count.zero? - if word > 0xffffffff - raise(Ascii85::DecodingError, - "Invalid Ascii85 5-tuple (#{word} >= 2**32)") + # Expand z to 0-word + bufwriter.write("\0\0\0\0") + + when '!'..'u' + # Decode 5 characters into a 4-byte word + word += (c - 33) * lut[count] + count += 1 + + if count == 5 && word > 0xffffffff + raise(Ascii85::DecodingError, "Invalid Ascii85 5-tuple (#{word} >= 2**32)") + elsif count == 5 + b3 = word & 0xff; word >>= 8 + b2 = word & 0xff; word >>= 8 + b1 = word & 0xff; word >>= 8 + b0 = word + + wordbuf.setbyte(0, b0) + wordbuf.setbyte(1, b1) + wordbuf.setbyte(2, b2) + wordbuf.setbyte(3, b3) + + bufwriter.write(wordbuf) + + word = 0 + count = 0 + end + + else + raise(Ascii85::DecodingError, "Illegal character inside Ascii85: #{c.chr.dump}") end + end + end - result << word + # We're done if all 5-tuples have been consumed + if count.zero? + bufwriter.flush + return out || bufwriter.io.string.force_encoding('ASCII-8BIT') + end - word = 0 - count = 0 + raise(Ascii85::DecodingError, 'Last 5-tuple consists of single character') if count == 1 + + # Finish last, partially decoded 32-bit word + count -= 1 + word += lut[count] + + bufwriter.write((word >> 24).chr) if count >= 1 + bufwriter.write(((word >> 16) & 0xff).chr) if count >= 2 + bufwriter.write(((word >> 8) & 0xff).chr) if count == 3 + bufwriter.flush + + out || bufwriter.io.string.force_encoding('ASCII-8BIT') + end + + private + + # Buffers an underlying IO object to increase efficiency. You do not need + # to use this directly. + # + # @private + # + class BufferedReader + def initialize(io, buffer_size) + @io = io + @buffer_size = buffer_size + end + + def each_chunk + return enum_for(:each_chunk) unless block_given? + + until @io.eof? + chunk = @io.read(@buffer_size) + yield chunk if chunk end + end + end - else - raise(Ascii85::DecodingError, - "Illegal character inside Ascii85: #{c.chr.dump}") + # Buffers an underlying IO object to increase efficiency. You do not need + # to use this directly. + # + # @private + # + class BufferedWriter + attr_accessor :io + + def initialize(io, buffer_size) + @io = io + @buffer_size = buffer_size + @buffer = String.new(capacity: buffer_size) end + + def write(tuple) + flush if @buffer.bytesize + tuple.bytesize > @buffer_size + @buffer << tuple + end + + def flush + @io.write(@buffer) + @buffer.clear + end end - # Convert result into a String - result = result.pack('N*') + # Wraps the input in '<~' and '~>' delimiters and passes it through + # unmodified to the underlying IO object otherwise. You do not need to + # use this directly. + # + # @private + # + class DummyWrapper + def initialize(out) + @out = out + @out.write('<~') + end - if count > 0 - # Finish last, partially decoded 32-bit-word + def write(buffer) + @out.write(buffer) + end - if count == 1 - raise(Ascii85::DecodingError, - "Last 5-tuple consists of single character") + def finish + @out.write('~>') + @out.flush + + @out end + end - count -= 1 - word += 85**(4 - count) + # Wraps the input in '<~' and '~>' delimiters and ensures that no line is + # longer than the specified length. You do not need to use this directly. + # + # @private + # + class Wrapper + def initialize(out, wrap_lines) + @line_length = [2, wrap_lines.to_i].max - result << ((word >> 24) & 255).chr if count >= 1 - result << ((word >> 16) & 255).chr if count >= 2 - result << ((word >> 8) & 255).chr if count == 3 + @out = out + @out.write('<~') + + @cur_len = 2 + end + + def write(buffer) + loop do + s = buffer.bytesize + + if @cur_len + s < @line_length + @out.write(buffer) + @cur_len += s + return + end + + remaining = @line_length - @cur_len + @out.write(buffer[0...remaining]) + @out.write("\n") + @cur_len = 0 + buffer = buffer[remaining..] + return if buffer.empty? + end + end + + def finish + # Add the closing delimiter (may need to be pushed to the next line) + @out.write("\n") if @cur_len + 2 > @line_length + @out.write('~>') + + @out.flush + @out + end end - return result + # Check if an object is IO-like + # + # @private + # + def io_like?(obj) + obj.respond_to?(:read) && + obj.respond_to?(:eof?) + end + + # @return [Integer] Buffer size for to-be-encoded input + # + def unencoded_chunk_size + 4 * 2048 + end + + # @return [Integer] Buffer size for encoded output + # + def encoded_chunk_size + 5 * 2048 + end end # - # This error is raised when Ascii85.decode encounters one of the following - # problems in the input: + # Error raised when Ascii85 encounters problems while decoding the input. # - # * An invalid character. Valid characters are '!'..'u' and 'z'. - # * A 'z' character inside a 5-tuple. 'z's are only valid on their own. + # This error is raised for the following issues: + # * An invalid character (valid characters are '!'..'u' and 'z') + # * A 'z' character inside a 5-tuple ('z' is only valid on its own) # * An invalid 5-tuple that decodes to >= 2**32 # * The last tuple consisting of a single character. Valid tuples always have # at least two characters. # class DecodingError < StandardError; end