lib/oak.rb in oak-0.0.3 vs lib/oak.rb in oak-0.4.1

- old
+ new

@@ -1,122 +1,1085 @@ -require 'thor' +# coding: utf-8 +# +# OAK: An encoding format with enough polymorphism to support run-time +# performance experimentation and some light encryption-at-rest. +# +# author: jhw@prosperworks.com +# incept: 2016-03-02 -class Oak < Thor - include Thor::Actions - attr_reader :secret_token +require_relative 'oak/version' +require 'strscan' +require 'digest' +require 'base64' +require 'lz4-ruby' +require 'zlib' +require 'bzip2/ffi' +require 'lzma' +require 'openssl' - desc "setup oak", "Set current rails app source open ready" - def setup(working_directory = '.') - self.destination_root = working_directory - FileUtils.chdir destination_root do - check_cfg - dummy_config - git_prepare - create_config_on_deploy - commit_deploy_branch +module OAK + + # CantTouchThisObjectError is thrown when encode() or serialize() is + # called on an object which cannot be encoded losslessly by OAK. + # + class CantTouchThisObjectError < ArgumentError ; end + + # CantTouchThisStringError is thrown when decode(), deserialize(), + # or unwrap() called on a String which cannot be decoded. + # + class CantTouchThisStringError < ArgumentError ; end + + # Internal syntactic conveniences. + # + BAD_OBJ = CantTouchThisObjectError + BAD_STR = CantTouchThisStringError + + # OAK_4 supports one and only one encryption algorithm and mode of + # operation. + # + # - AES-256-GCM + # - 128 bits of security + # - 256-bit keys (32 bytes) + # - 96-bit IVs (12 bytes) + # - 128-bit auth_tags (16 bytes) + # - Random IV ("Initialization Vector") for each encryption op + # - All headers authenticated. + # - Headers encrypted when not required for decryption. + # + ENCRYPTION_ALGO_NAME = 'aes-256-gcm'.freeze + ENCRYPTION_ALGO_IV_BYTES = 12 # AES-256-GCM has 96-bit IVs + ENCRYPTION_ALGO_AUTH_TAG_BYTES = 16 # AES-256-GCM has 128-bit auth, we use all + + # Get a new instance of OpenSSL::Cipher for our algorithm. + # + def self.encryption_algo + OpenSSL::Cipher.new(ENCRYPTION_ALGO_NAME) + end + + # Generate a new random key appropriate for the OAK_4 encryption + # algorithm. + # + def self.random_key + encryption_algo.random_key + end + + # Generate a new random initialization vector appropriate for the + # OAK_4 encryption algorithm. + # + def self.random_iv + encryption_algo.random_iv + end + + class Key + + # @param key String encryption key suitable for AES-256, + # specifically a binary string of 32 bytes (256 bits), + # randomly-generated and kept very, very secret. + # + def initialize(key) + if !key.is_a?(String) + raise ArgumentError, "bad non-String key: ELIDED" + end + rk_size = OAK.random_key.size + if key.size != rk_size + raise ArgumentError, "bad key ELIDED, length not #{rk_size}" + end + @key = key.dup.freeze # happy :) end + + attr_reader :key + + def inspect + # + # Avoid exposing the key in casual logs or console session. + # + to_s[0..-2] + " @key=ELIDED>" + end + end - no_tasks do + class KeyChain - def check_cfg - ['config/application.rb', '.gitignore'].each do |f| - if !File.exists? f - raise "#{f} not found, are we at the root directory of a rails app?" + def initialize(keys) + if !keys.is_a?(Hash) + raise ArgumentError, "bogus keys #{keys}" + end + keys.each do |k,v| + if !k.is_a?(String) + raise ArgumentError, "bogus key #{k} in keys #{keys}" end + if /^[a-zA-Z][0-9a-zA-Z]*$/ !~ k + # + # In oak_4, we restrict key names to sequences which look + # like code identifiers: alphanumeric strings which start + # with a letter. + # + # This keeps the encoding simple but compact. + # + raise ArgumentError, "bad key #{k} in keys #{keys}" + end + if !v.is_a?(Key) + raise ArgumentError, "bogus val #{v} at #{k} in keys #{keys}" + end end + # + # We are a happy KeyChain object now! + # + @keys = keys.dup.freeze + end - # make 'config/database.yml' globally ignored - global_ignore_file = File.expand_path('~/.gitignore') - if File.exist? global_ignore_file - ignored = File.binread global_ignore_file - if !ignored.include?('config/database.yml') - append_to_file(global_ignore_file, 'config/database.yml') + attr_reader :keys + + end + + # Parses a KeyChain object and keys from an ENV-like object. + # + # E.g. if the ENV contains: + # + # FOO_KEYS=a,b + # FOO_KEY_a=#{OAK.encode(<binary key>)} + # FOO_KEY_b=#{OAK.encode(<binary key>)} + # + # ...then the call OAK.parse_key_chain(ENV,'FOO') will return a new + # OAK::KeyChain with two OAK::Keys, 'a' and 'b'. + # + # This self-referential (but not recursive!) use of OAK to encode + # the key and iv is to avoid the problems with binary strings in ENV + # variables, 'heroku config:set' command line arguments, etc. + # + # @param env ENV or an ENV-like Hash from String to String. + # + # @param name String the root token + # + # @returns a new OAK::KeyChain + # + def self.parse_env_chain(env,name) + key_names = (env["#{name}_KEYS"] || '').gsub(/^[, ]*/,'').split(/[ ,]+/) + keys = key_names.map do |key_name| + key = OAK.decode(env["#{name}_KEY_#{key_name}"] || '') + [ key_name, Key.new(key) ] + end.to_h + KeyChain.new(keys) + end + + ########################################################################## + # + # encode() and decode() are the top layer + # + # They coordinate the structure layer and the byte layer. + # + # These are the recommended entry points for most callers. + # + ########################################################################## + + # Encodes suitable objects string into OAK strings. + # + # Is inverted by decode(). For all obj, if encode(obj) does not + # raise an exception, decode(encode(obj)) == obj. + # + # @param obj to encode + # + # @param redundancy 'none', 'crc32' (default), or 'sha1' + # + # @param compression 'none' (default), 'lz4', 'zlib', 'bzip2', 'lzma' + # + # @param force false (default), or true. When true, always + # compress. When false, fall back to the + # original if the compressed form is larger. + # + # @param key_chain OAK::KeyChain from which to draw the encryption + # key, or nil for none. + # + # @param key String name of a key in key_chain to be used + # for encryption, or nil if none. + # + # @param format 'none', 'base64' (default) + # + # @param force_oak_4 Bool, for debugging, force oak_4 encoding even + # if no encryption key is specified. + # + # @param debug_iv String, force encryption with a known IV, TEST ONLY! + # + # WARNING: Use of debug_iv jeopardizes the security of all messages + # *ever* encrypted with that key! Never use debug_iv in production! + # + # @raises ArgumentError if obj is not handled. + # + def self.encode(obj,opts={}) + ser = _serialize(obj) + _wrap(ser,opts) + end + + # Decodes suitable OAK strings into objects. + # + # Inverts encode(). + # + # @param str String to decode + # + # @param key_chain OAK::KeyChain in which to look for keys to + # decrypt encrypted OAK strings, or nil for none. + # + # @returns obj String to decode + # + # @raises ArgumentError if str is not a recognized string. + # + def self.decode(str,opts={}) + if !str.is_a?(String) + raise ArgumentError, "str not a String" + end + ser = _unwrap(str,opts) + _deserialize(ser) + end + + ########################################################################## + # + # serialize() and deserialize() are the structure layer + # + # They are responsible for interconverting between objects and naive + # strings. + # + # This layer is analagous to TAR for files or JSON: it converts + # structure into string and vice-versa. + # + ########################################################################## + + # Serializes suitable objects string into naive strings. + # + # Is inverted by deserialize(). For all obj, if serialize(obj) does + # not raise an exception, deserialize(serialize(obj)) == obj. + # + # @raises CantTouchThisObjectError if obj contains any types or + # structure which cannot be encoded reversibly by OAK. + # + def self._serialize(obj) + seen,_reseen = _safety_dance(obj) do |child| + next if ALL_TYPES.select{ |type| child.is_a?(type) }.size > 0 + raise CantTouchThisObjectError, "#{child.class} not supported: #{child}" + end + strt = Hash.new # string table, str => id for strings already encoded + ser = 'F' + ser << seen.size.to_s + seen.each_with_index do |(_object_id,(_idx2,child)),_idx| + # + # First, identify the unique apex type in TYPE_2_CODE.keys + # which matches the child. + # + # child.class may not be listed explicitly, such as for Fixnum + # and Bigint both being Integer, so we search and assert + # uniqueness and existence. + # + is_as = ALL_TYPES.select{ |type| child.is_a?(type) } + raise CantTouchThisObjectError if 1 != is_as.size + type = is_as[0] + typecode = TYPE_2_CODE[type] + if nil == child || true == child || false == child + # + # The type code by itself is sufficient to decode NilType, + # TrueType, and FalseType. We need use other space for them. + # + ser << typecode + next + end + if child.is_a?(Symbol) || child.is_a?(String) + # + # Strings and Symbols encode as their size in chars followed + # by their bytes. + # + # We maintain a running string table, strt, to recognize when + # we encounter a string representation which has been + # previously encoded. + # + # If we find such a duplicate, we encode the current string + # via a back reference to the first one we saw. This is + # indicated by downcasing the typecode. + # + str = child.to_s + enc = str.encoding + enc_code = nil + case enc + when Encoding::ASCII_8BIT, Encoding::US_ASCII, Encoding::ASCII + enc_code = 'A' + when Encoding::UTF_8 + enc_code = 'U' + else + raise CantTouchThisObjectError, "unknown string encoding #{enc}" end - else - File.open(global_ignore_file, 'w') do |f| - f.write 'config/database.yml' + if strt.has_key?(str) + ser << typecode.downcase # downcase indicates strt reference + ser << enc_code + ser << strt[str].to_s + else + ser << typecode # upcase indicates full representation + ser << enc_code + ser << str.bytesize.to_s + if str.bytesize > 0 + ser << '_' + ser << str + end + strt[str] = strt.size end + next end - `git config --global core.excludesfile ~/.gitignore` + if child.is_a?(Numeric) + # + # Numerics primitives encode as their Ruby to_s which + # matches their JSON.dump(). + # + ser << typecode + ser << child.to_s + next + end + if child.is_a?(Array) + # + # An array is encoded as a size N followed by N indexes into + # the seen list. + # + ser << typecode + ser << child.size.to_s + child.each do |a| + ser << '_' + ser << seen[a.object_id][0].to_s + end + next + end + if child.is_a?(Hash) + # + # An array is encoded as a size N followed by 2*N indexes + # into the seen list, organized pairwise key+value. + # + ser << typecode + ser << child.size.to_s + child.each do |k,v| + ser << '_' + ser << seen[k.object_id][0].to_s + ser << '_' + ser << seen[v.object_id][0].to_s + end + next + end + raise CantTouchThisObjectError, "not handled: #{child.class} #{child}" + end + ser + end - # append config/config.yml to .gitignore if not already in - File.open('.gitignore') do |f| - f.each_line do |l| - if l == 'config/config.yml' - return + # Deserializes suitable naive strings into objects. + # + # Inverts serialize(). + # + # @raises CantTouchThisObjectError if str is not recognized + # + def self._deserialize(str) + scanner = StringScanner.new(str) + serial_code = scanner.scan(/F/) + if 'F' != serial_code + raise CantTouchThisStringError, "bogus serial_code #{serial_code}" + end + num_objs = scanner.scan(/[0-9]+/) + if !num_objs + raise CantTouchThisStringError, "missing object list size" + end + num_objs = num_objs.to_i + strt = Hash.new # string table, id => str for strings already decoded + seen = [] + # + # We parse the stream, constructing all the objects we see in to + # a seen list. + # + # In this first pass, Arrays and Hashes are created whose + # elements, keys, and values are temporarily integers. These all + # refer to slots in the seen list, and many of them will be + # forward references to objects which we have yet to decode. + # Later we will rectify the object graph by replacing these + # integers with their refrants from the seen list. + # + num_objs.times.each do |idx_obj| + code = scanner.scan(/[a-zA-Z]/) + case code + when 'n' + seen[idx_obj] = nil + when 'f' + seen[idx_obj] = false + when 't' + seen[idx_obj] = true + when 'S', 'Y', 's', 'y' + enc_code = scanner.scan(/[AU]/) + enc = nil + case enc_code + when 'A' + enc = Encoding::ASCII_8BIT + when 'U' + enc = Encoding::UTF_8 + else + raise CantTouchThisStringError, "unknown enc_code #{enc_code}" + end + num = scanner.scan(/[0-9]+/) + if !num + raise CantTouchThisStringError, "missing num" + end + num = num.to_i + case code + when 'S', 'Y' + if num > 0 + scanner.scan(/_/) or raise BAD_STR, "missing _" + seen[idx_obj] = scanner.peek(num) + scanner.pos += num # skip body + else + seen[idx_obj] = '' end + strt[strt.size] = seen[idx_obj] + when 's', 'y' + seen[idx_obj] = strt[num] end + seen[idx_obj] = seen[idx_obj].dup.force_encoding(enc) + case code + when 'Y', 'y' + seen[idx_obj] = seen[idx_obj].intern + end + when 'I' + pattern = /-?[0-9]+/ + seen[idx_obj] = scanner.scan(pattern).to_i + when 'F' + pattern = /-?(Infinity|NaN|[0-9]+(\.[0-9]*)?(e([+-][0-9]*)?)?)/ + match = scanner.scan(pattern) + case match + when 'Infinity' then seen[idx_obj] = Float::INFINITY + when '-Infinity' then seen[idx_obj] = -Float::INFINITY + when 'NaN' then seen[idx_obj] = Float::NAN + else seen[idx_obj] = match.to_f + end + when 'A' + num_items = scanner.scan(/[0-9]+/).to_i + arr = [] + num_items.times.each do |idx| + scanner.scan(/_/) or raise BAD_STR, "missing _" + val = scanner.scan(/[0-9]+/).to_i # temp obj + arr[idx] = val + end + seen[idx_obj] = arr + when 'H' + num_items = scanner.scan(/[0-9]+/).to_i + hash = Hash.new + num_items.times.each do + scanner.scan(/_/) or raise BAD_STR, "missing _" + k = scanner.scan(/[0-9]+/).to_i # temp obj + scanner.scan(/_/) or raise BAD_STR, "missing _" + v = scanner.scan(/[0-9]+/).to_i # temp obj + hash[k] = v + end + seen[idx_obj] = hash + else + raise BAD_STR, "not handled: #{code} #{scanner.pos} #{scanner.rest}" end + end + # + # If we parsed correctly, there will be no unconsumed in the + # scanner. + # + if !scanner.eos? + raise BAD_STR, "not at end-of-string: #{scanner.pos} #{scanner.rest}" + end + # + # We rectify the references for each intermediate Array and Hash + # as promised earlier. + # + # Note that this code must be inherently mutation-oriented since + # it might have to construct cyclic graphs. + # + rectified = seen.map do |elem| + if elem.is_a?(Array) + next Array.new + elsif elem.is_a?(Hash) + next Hash.new + else + elem + end + end + rectified.each_with_index do |elem,idx| + if elem.is_a?(Array) + seen[idx].each_with_index do |a,i| + elem[i] = rectified[a] + end + elsif elem.is_a?(Hash) + seen[idx].each do |k,v| + elem[rectified[k]] = rectified[v] + end + end + end + # + # By the way _safety_dance performed its walk in _serialize(), the + # object we are decoding is the first object encoded in str. + # + # Thus, we return the first element of the rectified list. + # + rectified.first + end - append_to_file '.gitignore', 'config/config.yml' + ########################################################################## + # + # wrap() and unwrap() are the byte layer + # + # They are responsible for interconverting between naive strings and + # strings which are ready to go out on the wire into external + # storage. + # + # This layer is analagous to GZIP: it converts strings into a + # different representation which is smaller, more resistant to + # corruption, and/or more recognizable. + # + ########################################################################## - # protect secret_token - full_text = File.binread 'config/initializers/secret_token.rb' - full_text.gsub! /(Application\.config\.secret_token\s=\s)'(.*)'/, '\1APP_CONFIG[\'secret_token\']' - # save per app secret_token for later use - @secret_token = "#{$2}" - File.open('config/initializers/secret_token.rb', 'w') do |f| - f.write full_text - end + # Wraps any string into a OAK string. + # + # Is inverted by unwrap(). For all str, unwrap(wrap(str)) == str. + # + # @param str naive string to be wrapped as an OAK string + # + # @param redundancy 'none', 'crc32' (default), or 'sha1' + # + # @param compression 'none' (default), 'lz4', 'zlib', 'bzip2', or 'lzma' + # + # @param force false (default), or true. When true, always + # compress. When false, fall back to the + # original if the compressed form is larger. + # + # @param key_chain OAK::KeyChain from which to draw the encryption + # key, or nil for none. + # + # @param key String name of a key in key_chain to be used + # for encryption, or nil if none. + # + # @param force_oak_4 Bool, for debugging, force oak_4 encoding even + # if no encryption key is specified. + # + # @param format 'none', 'base64' (default) + # + # @returns an OAK string + # + def self._wrap(str,opts={}) + redundancy = (opts[:redundancy] || :crc32).to_s + compression = (opts[:compression] || :none).to_s + force = (opts[:force] || false) + format = (opts[:format] || :base64).to_s + key_chain = opts[:key_chain] + key = opts[:key] + debug_iv = opts[:debug_iv] + if key_chain && !key_chain.is_a?(KeyChain) + raise ArgumentError, "bad key_chain #{key_chain}" end + if debug_iv && !debug_iv.is_a?(String) + raise ArgumentError, "bad debug_iv #{debug_iv}" + end + if debug_iv && ENCRYPTION_ALGO_IV_BYTES != debug_iv.size + raise ArgumentError, "bad debug_iv #{debug_iv}" + end + if key && !key_chain + raise ArgumentError, "key #{key} without key_chain" + end + if key && !key_chain.keys[key] + keys = key_chain.keys + raise ArgumentError, "key not found in #{keys}: #{key}" + end + encryption_key = key ? key_chain.keys[key] : nil + str = str.b # dupe to Encoding::ASCII_8BIT + if encryption_key || opts[:force_oak_4] + _wrap_oak_4( + str, + redundancy, + compression, + force, + format, + key, + encryption_key, + debug_iv + ) + else + _wrap_oak_3( + str, + redundancy, + compression, + force, + format + ) + end + end - def dummy_config - File.open('config/config.example.yml', 'w') do |f| - f.write "secret_token: 'c1cae0f52a3ef8efa369a127c63bd6ede539a4089fd952b33199100a6769c8455ab4969f2eefaf1ebcbe0208bd57531204c77f41f715207f961e7e45f139f4e7'" - end - prepend_to_file 'config/application.rb', "require 'yaml'\nAPP_CONFIG = YAML.load(File.read(File.expand_path('../config.yml', __FILE__)))\n" + def self._wrap_oak_3( + str, + redundancy, + compression, + force, + format + ) + source_redundancy = _check(redundancy,str) + compressed, compression = _compress(compression,force,str) + formatted = _format(format,compressed) + output = 'oak_3' # format id+ver + output << REDUNDANCY_2_CODE[redundancy] # redundancy + output << COMPRESSION_2_CODE[compression] # compression + output << FORMAT_2_CODE[format] # format + output << '_' + output << source_redundancy # source check + output << '_' + output << '%d' % formatted.size # formatted size + output << '_' + output << formatted # payload + output << '_' + output << 'ok' # terminator + output.force_encoding(Encoding::ASCII_8BIT) + end - # simply copy database.yml to database.example.yml - File.open('config/database.example.yml', 'w') do |f| - File.open('config/database.yml', 'r') do |o| - f.write o.read - end - end + def self._wrap_oak_4( + str, + redundancy, + compression, + force, + format, + key, + encryption_key, + debug_iv + ) + header = 'oak_4' # format id+ver + if key + header << key # key name end + header << '_' + header << FORMAT_2_CODE[format] # format + compressed, compression = _compress(compression,force,str) + plaintext = '' + plaintext << REDUNDANCY_2_CODE[redundancy] # redundancy + plaintext << COMPRESSION_2_CODE[compression] # compression + plaintext << _check(redundancy,str) # source check + plaintext << '_' + plaintext << compressed + encrypted = _encrypt( + encryption_key, + plaintext, + header, + debug_iv + ) + formatted = _format(format,encrypted) + output = header + output << '%d' % formatted.size # formatted size + output << '_' + output << formatted # payload + output << '_' + output << 'ok' # terminator + output.force_encoding(Encoding::ASCII_8BIT) + end - def git_prepare - if File.exists? '.git' - puts 'It seems a git repository has already created, I\'ll leave it untouched.' - return - end + # Unwraps any OAK string into a string. + # + # Inverts wrap(). For all str, unwrap(wrap(str)) == str. + # + # @param str OAK string to be unwrapped + # + # @param key_chain OAK::KeyChain in which to look for keys to + # decrypt encrypted OAK strings, or nil for none. + # + # @returns a string + # + # @raises ArgumentError if str is not in OAK format. + # + def self._unwrap(str,opts={}) + str = str.b # str.b for dup to ASCII_8BIT + sc = StringScanner.new(str) + ov = sc.scan(/oak_[34]/) or raise BAD_STR, "bad oak+ver" + if 'oak_4' == ov + _unwrap_oak_4(sc,opts) # encryption opts possible for decoding OAK_4 :( + else + _unwrap_oak_3(sc) # no opts for decoding OAK_3 :) + end + end - `git init && git add . && git commit -m "init"` - `git checkout -b deploy` + def self._unwrap_oak_3(sc) + r = sc.scan(/[NCS]/) or raise BAD_STR, "bad redundancy" + c = sc.scan(/[N4ZBM]/) or raise BAD_STR, "bad compression" + f = sc.scan(/[NB]/) or raise BAD_STR, "bad format" + _ = sc.scan(/_/) or raise BAD_STR, "missing _" + scheck = sc.scan(/[a-f0-9]+/) or raise BAD_STR, "bad scheck" + _ = sc.scan(/_/) or raise BAD_STR, "missing _" + fsize = sc.scan(/[0-9]+/) or raise BAD_STR, "bad fsize" + fsize = fsize.to_i + _ = sc.scan(/_/) or raise BAD_STR, "missing _" + formatted = sc.peek(fsize) + begin + sc.pos += fsize + rescue RangeError => ex + raise CantTouchThisStringError, "#{ex.class}: #{ex.message}" end + _ = sc.scan(/_ok$/) or raise BAD_STR, "bad ok: #{formatted}" + redundancy = CODE_2_REDUNDANCY[r] || r + compression = CODE_2_COMPRESSION[c] || c + format = CODE_2_FORMAT[f] || f + fsize_re = formatted.size + if fsize.to_i != fsize_re + raise CantTouchThisStringError, "fsize #{fsize} vs #{fsize_re}" + end + compressed = _deformat(format,formatted) + original = _decompress(compression,compressed) + scheck_re = _check(redundancy,original) + if scheck != scheck_re + raise CantTouchThisStringError, "scheck #{scheck} vs #{scheck_re}" + end + original + end - def create_config_on_deploy - File.open('config/config.yml', 'w') do |f| - f.write 'secret_token: \'' + secret_token + '\'' + def self._unwrap_oak_4(sc,opts={}) + key = sc.scan(/[^_]+/) # nil OK, indicates no compression + encryption_key = nil + if key + key_chain = opts[:key_chain] + if !key_chain + raise CantTouchThisStringError, "key #{key} but no key_chain" end - - # remove 'config/config.yml' from .gitignore on deploy branch - ignored = File.binread('.gitignore') - ignored.gsub! /config\/config.yml/, '' - File.open('.gitignore', 'w') do |f| - f.write ignored + encryption_key = opts[:key_chain].keys[key] + if !encryption_key + keys = key_chain.keys + raise CantTouchThisStringError, "key not found in #{keys}: #{key}" end + end + _ = sc.scan(/_/) or raise BAD_STR, "missing _" + f = sc.scan(/[NB]/) or raise BAD_STR, "bad format" + header = sc.string[0..(sc.pos-1)] # for authentication by _decrypt + format = CODE_2_FORMAT[f] + fsize = sc.scan(/[0-9]+/) or raise BAD_STR, "bad fsize" + fsize = fsize.to_i + _ = sc.scan(/_/) or raise BAD_STR, "missing _" + formatted = sc.peek(fsize) + begin + sc.pos += fsize + rescue RangeError => ex + raise CantTouchThisStringError, "#{ex.class}: #{ex.message}" + end + _ = sc.scan(/_ok$/) or raise BAD_STR, "bad ok" + encrypted = _deformat(format,formatted) + plaintext = _decrypt(encryption_key,encrypted,header) + sp = StringScanner.new(plaintext) + r = sp.scan(/[NCS]/) or raise BAD_STR, "bad redundancy" + c = sp.scan(/[N4ZBM]/) or raise BAD_STR, "bad compression" + scheck = sp.scan(/[a-f0-9]+/) or raise BAD_STR, "bad scheck" + _ = sp.scan(/_/) or raise BAD_STR, "missing _" + compressed = sp.rest + redundancy = CODE_2_REDUNDANCY[r] || r + compression = CODE_2_COMPRESSION[c] || c + original = _decompress(compression,compressed) + scheck_re = _check(redundancy,original) + if scheck != scheck_re + raise( + CantTouchThisStringError, + "scheck #{scheck} vs #{scheck_re} in #{sc.string}" + ) + end + original + end - # add checkout hook for switching from 'deploy' to 'master' - File.open('.git/hooks/post-checkout', 'w') do |f| - f.write <<-EOS -#!/bin/bash + # How we encode object type. + # + TYPE_2_CODE ||= { + Hash => 'H', + Array => 'A', + String => 'S', # downcased to 's' for string table lookup + Symbol => 'Y', # downcased to 'y' for string table lookup + Integer => 'I', + Float => 'F', + NilClass => 'n', + TrueClass => 't', + FalseClass => 'f', + }.freeze + ALL_TYPES ||= TYPE_2_CODE.keys.freeze -branch_name=$(git symbolic-ref -q HEAD) -branch_name=${branch_name##refs/heads/} + # How we encode :format and :compression in the OAK strings. + # + FORMAT_2_CODE ||= { + 'none' => 'N', + 'base64' => 'B', # urlsafe form with padding and whitespace stripped + }.freeze + CODE_2_FORMAT ||= FORMAT_2_CODE.invert.freeze -if [ "$branch_name" = master -a -e "config/config.example.yml" ]; then - cp config/config.example.yml config/config.yml - echo "cp config/config.example.yml config/config.yml" -fi - EOS + # How we encode :compression in the OAK strings. + # + # Early on, I captures some metrics using the catenation of all our + # Ruby code as a test file. + # + # I measured: + # + # SOURCE 5707334 + # none 5707370 compression 0.17s decompression 0.16s + # lzo 1804765 compression 0.18s decompression 0.16s + # lzf 1807971 compression 0.16s decompression 0.17s + # lz4 1813574 compression 0.17s decompression 0.14s + # zlib 1071216 compression 0.53s decompression 0.19s + # bzip2 868595 compression 0.62s decompression 0.33s + # lzma 760594 compression 6.22s decompression 0.20s + # + # From this, I conclude that only one of lzo,lzf,lz4 is interesting. + # They all yield approximately the same compression, and their + # compression times are indistinguishable from the rest of the + # streaming and encoding times imposed by OAK. + # + # I'm settling on supporting only lz4 because it seems to be better + # supported as a polymorphic lib - it's closer to a defacto standard + # for the LZ77 family. + # + # zlib, bzip2, and lzma each represent interesting distinct choices + # - I'm keeping support for all three. + # + COMPRESSION_2_CODE ||= { + 'none' => 'N', + 'lz4' => '4', + 'zlib' => 'Z', + 'bzip2' => 'B', + 'lzma' => 'M', + }.freeze + CODE_2_COMPRESSION ||= COMPRESSION_2_CODE.invert.freeze + + # How we encode :redundancy in the OAK strings. + # + REDUNDANCY_2_CODE ||= { + 'none' => 'N', + 'crc32' => 'C', + 'sha1' => 'S', + }.freeze + CODE_2_REDUNDANCY ||= REDUNDANCY_2_CODE.invert.freeze + + # Helper method, calculates redundancy check for str. + # + def self._check(redundancy,str) + case redundancy.to_s + when 'none' then return '0' + when 'crc32' then return '%d' % Zlib.crc32(str) + when 'sha1' then return Digest::SHA1.hexdigest(str) + else + raise ArgumentError, "unknown redundancy #{redundancy}" + end + end + + # Helper method, calculates formatted version of str. + # + def self._format(format,str) + case format.to_s + when 'none' + return str + when 'base64' + # + # We actual using "Base 64 Encoding with URL and Filename Safe + # Alphabet" aka base64url with the option not to use padding, + # per https://tools.ietf.org/html/rfc4648#section-5. + # + # If we were using Ruby 2.3+, we could use the option "padding: + # false" instead of chopping out the /=*$/ with gsub. + # + return Base64.urlsafe_encode64(str).gsub(/=.*$/,'') + else + raise ArgumentError, "unknown format #{format}" + end + end + + def self._deformat(format,str) + case format.to_s + when 'none' + return str + when 'base64' + # + # Regrettably, Base64.urlsafe_decode64(str) does not reverse + # Base64.urlsafe_encode64(str).gsub(/=.*$/,''), it raises an + # ArgumentError "invalid base64". + # + # Fortunately, simple Base64.decode64() is liberal in what it + # accepts, and handles the output of all of encode64, + # strict_encode64, and urlsafe_encode64 both with and without + # the /=*$/. + # + return Base64.decode64(str.tr('-_','+/')) + else + raise ArgumentError, "unknown format #{format}" + end + end + + # Helper for wrap() and unwrap(), multiplexes encryption. + # + def self._encrypt(encryption_key,data,auth_data,debug_iv) + return data if !encryption_key + # + # WARNING: In at least some versions of OpenSSL::Cipher, setting + # iv before key would cause the iv to be ignored in aes-*-gcm + # ciphers! + # + # https://github.com/attr-encrypted/encryptor/pull/22 + # https://github.com/attr-encrypted/encryptor/blob/master/README.md + # + # The issue was reported against version "1.0.1f 6 Jan 2014". I + # have yet to figure out whether our current version, 1.1.0, is + # affected, or when/how the fix will go live. + # + # OAK_4 only supports AES-256-GCB. Although the implementation + # bug has been fixed and OAK will almost certainly not be used + # with a buggy version of OpenSSL, nevertheless we take great + # care to set cipher.key *then* cipher.iv. + # + # Still, can't be to careful. + # + iv_size = ENCRYPTION_ALGO_IV_BYTES + auth_tag_size = ENCRYPTION_ALGO_AUTH_TAG_BYTES + if debug_iv && iv_size != debug_iv.size + raise "unexpected debug_iv.size #{debug_iv.size} not #{iv_size}" + end + cipher = encryption_algo.encrypt + cipher.key = encryption_key.key + iv = debug_iv || cipher.random_iv + cipher.iv = iv + cipher.auth_data = auth_data + ciphertext = cipher.update(data) + cipher.final + auth_tag = cipher.auth_tag + if iv_size != iv.size + raise "unexpected iv.size #{iv.size} not #{iv_size}" + end + if auth_tag_size != auth_tag.size + raise "unexpected auth_tag.size #{auth_tag.size} not #{auth_tag_size}" + end + # + # Since iv and auth_tag have fixed widths, they are trivial to + # parse without putting any effort or space into recording their + # sizes in the message body. + # + iv + auth_tag + ciphertext + end + + # Helper for wrap() and unwrap(), multiplexes decryption. + # + def self._decrypt(encryption_key,data,auth_data) + return data if !encryption_key + iv_size = ENCRYPTION_ALGO_IV_BYTES + auth_tag_size = ENCRYPTION_ALGO_AUTH_TAG_BYTES + iv = data[0..(iv_size-1)] + auth_tag = data[iv_size..(auth_tag_size+iv_size-1)] + ciphertext = data[(auth_tag_size+iv_size)..-1] + cipher = encryption_algo.decrypt + cipher.key = encryption_key.key + begin + cipher.iv = iv + cipher.auth_tag = auth_tag + cipher.auth_data = auth_data + cipher.update(ciphertext) + cipher.final + rescue OpenSSL::Cipher::CipherError => ex + raise CantTouchThisStringError, "#{ex.class}: #{ex.message}" + end + end + + # Helper for wrap() and unwrap(), multiplexes compression. + # + def self._compress(compression,force,str) + case compression.to_s + when 'none' + compressed = str + when 'lz4' + compressed = LZ4.compress(str) + when 'zlib' + compressed = Zlib.deflate(str) + when 'bzip2' + io = StringIO.new + io.set_encoding(Encoding::ASCII_8BIT) + Bzip2::FFI::Writer.write(io, str) + compressed = io.string + when 'lzma' + compressed = LZMA.compress(str) + else + raise ArgumentError, "unknown compression #{compression}" + end + if !force && compressed.size >= str.size + compressed = str + compression = 'none' + end + [compressed,compression.to_s] + end + + # Helper for wrap() and unwrap(), multiplexes decompression. + # + def self._decompress(compression,str) + case compression.to_s + when 'none' + return str + when 'lz4' + begin + return LZ4.uncompress(str) + rescue LZ4Internal::Error => ex + raise CantTouchThisStringError, "#{ex.class}: #{ex.message}" end - `chmod +x .git/hooks/post-checkout` + when 'zlib' + begin + return Zlib::Inflate.inflate(str) + rescue Zlib::DataError => ex + raise CantTouchThisStringError, "#{ex.class}: #{ex.message}" + end + when 'bzip2' + io = StringIO.new(str) + raw = nil + begin + raw = Bzip2::FFI::Reader.read(io) + rescue Bzip2::FFI::Error::MagicDataError => ex + raise CantTouchThisStringError, "#{ex.class}: #{ex.message}" + end + str = raw.b # dupe to Encoding::ASCII_8BIT + return str + when 'lzma' + begin + raw = LZMA.decompress(str) + rescue RuntimeError => ex + raise CantTouchThisStringError, "#{ex.class}: #{ex.message}" + end + str = raw.b # dupe to Encoding::ASCII_8BIT + return str + else + raise ArgumentError, "unknown compression #{compression}" end - - def commit_deploy_branch - # commit deploy branch - `git add . && git commit -m "deploy setup"` - `git checkout master` + end + + # Walks obj recursively, touching each reachable child only once + # without getting caught up cycles or touching DAGy bits twice. + # + # Only knows how to recurse into Arrays and Hashs. + # + # This traversal is depth-first pre-order with the children of + # Arrays walked in positional anbd Hash pairs walked in positional + # order k,v,k,v, etc. + # + # @param obj object to walk + # + # @param seen Hash which maps object_id => [idx,child] of every + # object touched, where idx is 0,1,2,... corresponding to the order + # in which we encountered child. + # + # @param reseen List of children which were walked more than once. + # + # @param block if present, every object touched is yielded to block + # + # @return seen,reseen + # + def self._safety_dance(obj,seen=nil,reseen=nil,&block) + # + # Note that OAK._serialize() depends on the depth-first pre-order + # specification here - at least, it assumes that the first element + # walked will be the first element added to seen. + # + seen ||= {} + reseen ||= [] + oid = obj.object_id + if seen.has_key?(oid) + reseen << obj + return seen,reseen end + seen[oid] = [seen.size,obj] + yield obj if block # pre-order: this node before children + if obj.is_a?(Hash) + obj.each do |k,v| # children in hash order and k,v,... + _safety_dance(k,seen,reseen,&block) + _safety_dance(v,seen,reseen,&block) + end + elsif obj.is_a?(Array) + obj.each do |v| # children in list order + _safety_dance(v,seen,reseen,&block) + end + end + return seen,reseen end + end