# encoding: UTF-8 # Copyright 2012 Twitter, Inc # http://www.apache.org/licenses/LICENSE-2.0 # This code was adapted from the ya2yaml gem, maintained by Akira Funai. # https://github.com/afunai/ya2yaml # Copyright (c) 2006 Akira FUNAI # # Permission is hereby granted, free of charge, to any person obtaining a # copy of this software and associated documentation files (the "Software"), # to deal in the Software without restriction, including without limitation # the rights to use, copy, modify, merge, publish, distribute, sublicense, # and/or sell copies of the Software, and to permit persons to whom the # Software is furnished to do so, subject to the following conditions: # # The above copyright notice and this permission notice shall be included in # all copies or substantial portions of the Software. # # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL # THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING # FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER # DEALINGS IN THE SOFTWARE. 
# Date is referenced by the type dispatch in YAML.emit; load it here so that
# dispatch never hits an undefined constant.
require 'date'

module TwitterCldr
  module Utils

    # A YAML emitter (adapted from ya2yaml) that writes UTF-8 text as-is
    # instead of escaping it or falling back to base64.
    #
    # Options read by the code below (all optional):
    #
    #   :indent_size           - spaces per nesting level; non-positive or
    #                            missing values fall back to 2.
    #   :minimum_block_length  - multi-line strings shorter than this are
    #                            emitted as quoted strings instead of block
    #                            scalars; non-positive or missing => 0.
    #   :syck_compatible       - when truthy, forces :printable_with_syck,
    #                            :escape_b_specific and :escape_as_utf8 on.
    #   :hash_order            - list of keys that should come first (in the
    #                            given order) in the top-level hash.
    #   :preserve_order        - keep the hash's own key order instead of
    #                            sorting keys by their #to_s.
    #   :use_natural_symbols   - emit plain symbols as ":name" rather than
    #                            "!ruby/symbol name".
    #   :quote_all_strings     - always emit String values double-quoted.
    class YAML

      UCS_0X85   = [0x85].pack('U')   #   c285@UTF8 Unicode next line
      UCS_0XA0   = [0xa0].pack('U')   #   c2a0@UTF8 Unicode non-breaking space
      UCS_0X2028 = [0x2028].pack('U') # e280a8@UTF8 Unicode line separator
      UCS_0X2029 = [0x2029].pack('U') # e280a9@UTF8 Unicode paragraph separator

      # non-break characters
      ESCAPE_SEQ = {
        "\x00" => '\\0',
        "\x07" => '\\a',
        "\x08" => '\\b',
        "\x0b" => '\\v',
        "\x0c" => '\\f',
        "\x1b" => '\\e',
        "\""   => '\\"',
        "\\"   => '\\\\',
      }

      # non-breaking space
      ESCAPE_SEQ_NS = {
        UCS_0XA0 => '\\_',
      }

      # white spaces
      ESCAPE_SEQ_WS = {
        "\x09" => '\\t',
        " "    => '\\x20',
      }

      # line breaks
      ESCAPE_SEQ_LB = {
        "\x0a"     => '\\n',
        "\x0d"     => '\\r',
        UCS_0X85   => '\\N',
        UCS_0X2028 => '\\L',
        UCS_0X2029 => '\\P',
      }

      # regexps for line breaks
      REX_LF   = Regexp.escape("\x0a")
      REX_CR   = Regexp.escape("\x0d")
      REX_CRLF = Regexp.escape("\x0d\x0a")
      REX_NEL  = Regexp.escape(UCS_0X85)
      REX_LS   = Regexp.escape(UCS_0X2028)
      REX_PS   = Regexp.escape(UCS_0X2029)

      REX_ANY_LB    = /(#{REX_LF}|#{REX_CR}|#{REX_NEL}|#{REX_LS}|#{REX_PS})/
      REX_NORMAL_LB = /(#{REX_LF}|#{REX_LS}|#{REX_PS})/

      # regexps for language-Independent types for YAML1.1
      REX_BOOL = /
         y|Y|yes|Yes|YES|n|N|no|No|NO
        |true|True|TRUE|false|False|FALSE
        |on|On|ON|off|Off|OFF
      /x

      REX_FLOAT = /
         [-+]?([0-9][0-9_]*)?\.[0-9.]*([eE][-+][0-9]+)? # (base 10)
        |[-+]?[0-9][0-9_]*(:[0-5]?[0-9])+\.[0-9_]*      # (base 60)
        |[-+]?\.(inf|Inf|INF)                           # (infinity)
        |\.(nan|NaN|NAN)                                # (not a number)
      /x

      REX_INT = /
         [-+]?0b[0-1_]+                   # (base 2)
        |[-+]?0[0-7_]+                    # (base 8)
        |[-+]?(0|[1-9][0-9_]*)            # (base 10)
        |[-+]?0x[0-9a-fA-F_]+             # (base 16)
        |[-+]?[1-9][0-9_]*(:[0-5]?[0-9])+ # (base 60)
      /x

      REX_MERGE = /
        <<
      /x

      REX_NULL = /
         ~              # (canonical)
        |null|Null|NULL # (English)
        |               # (Empty)
      /x

      REX_TIMESTAMP = /
         [0-9][0-9][0-9][0-9]-[0-9][0-9]-[0-9][0-9] # (ymd)
        |[0-9][0-9][0-9][0-9]                       # (year)
         -[0-9][0-9]?                               # (month)
         -[0-9][0-9]?                               # (day)
         ([Tt]|[ \t]+)[0-9][0-9]?                   # (hour)
         :[0-9][0-9]                                # (minute)
         :[0-9][0-9]                                # (second)
         (\.[0-9]*)?                                # (fraction)
         (([ \t]*)Z|[-+][0-9][0-9]?(:[0-9][0-9])?)? # (time zone)
      /x

      REX_VALUE = /
        =
      /x

      REX_SYMBOL = /
        \A:.*
      /x

      class << self

        # Serializes +obj+ into a YAML document string ("--- ...\n").
        #
        # @param obj  [Object] value to serialize
        # @param opts [Hash]   emitter options (see the class comment)
        # @return [String] the YAML document
        # @raise [ArgumentError] if +obj+ contains a circular reference
        #   (detected through the resulting SystemStackError)
        def dump(obj, opts = {})
          raise 'set $KCODE to "UTF8".' if (RUBY_VERSION < '1.9.0') && ($KCODE != 'UTF8')

          # NOTE: the options live on the class itself, so concurrent dumps
          # with different options would interfere with each other.
          @options = opts.dup
          @options[:indent_size] = 2 if @options[:indent_size].to_i <= 0
          @options[:minimum_block_length] = 0 if @options[:minimum_block_length].to_i <= 0

          if @options[:syck_compatible]
            @options.update(
              :printable_with_syck => true,
              :escape_b_specific   => true,
              :escape_as_utf8      => true
            )
          end

          "--- #{emit(obj, 1)}\n"
        rescue SystemStackError
          raise ArgumentError, "TwitterCLDR yaml dumper can't handle circular references"
        end

        private

        # Dispatches on the type of +obj+ and returns its YAML text.
        # +level+ is the nesting depth used to compute indentation.
        def emit(obj, level)
          case obj
          when Array
            emit_array(obj, level)
          when Hash
            emit_hash(obj, level)
          when NilClass
            '~'
          when String
            emit_string(obj, level)
          when TrueClass, FalseClass
            obj.to_s
          when Integer, Float
            # Integer covers the former Fixnum and Bignum classes, which are
            # no longer defined on current Ruby versions.
            obj.to_s
          when Date
            obj.to_s
          when Time
            emit_time(obj)
          when Symbol
            emit_symbol(obj, level)
          when Range
            "!ruby/range #{obj}"
          when Regexp
            "!ruby/regexp #{obj.inspect}"
          when Struct
            emit_struct(obj, level)
          else
            emit_object(obj, level)
          end
        end

        # Emits an Array as a block sequence ("[]" when empty).
        def emit_array(array, level)
          return '[]' if array.empty?

          indent = "\n#{s_indent(level - 1)}"
          array.map { |item| "#{indent}- #{emit(item, level + 1)}" }.join
        end

        # Emits a Hash as a block mapping ("{}" when empty). Keys that cannot
        # be written as a simple key use the explicit "? key" form.
        def emit_hash(hash, level)
          return '{}' if hash.empty?

          indent = "\n#{s_indent(level - 1)}"
          ordered_hash_keys(hash, level).map do |k|
            key = emit(k, level + 1)
            if simple_key?(key)
              "#{indent}#{key}: #{emit(hash[k], level + 1)}"
            else
              "#{indent}? #{key}#{indent}: #{emit(hash[k], level + 1)}"
            end
          end.join
        end

        # Returns the keys of +hash+ in emission order: :hash_order first
        # (top level only), else insertion order when :preserve_order is set,
        # else sorted by the keys' string form.
        def ordered_hash_keys(hash, level)
          hash_order = @options[:hash_order]

          if hash_order && level == 1
            hash.keys.sort do |x, y|
              # keys missing from :hash_order sort after all listed keys
              x_order = hash_order.index(x) || Float::MAX
              y_order = hash_order.index(y) || Float::MAX
              by_order = (x_order <=> y_order)
              by_order != 0 ? by_order : (x.to_s <=> y.to_s)
            end
          elsif @options[:preserve_order]
            hash.keys
          else
            hash.keys.sort { |x, y| x.to_s <=> y.to_s }
          end
        end

        # True when an already-emitted key can be written as "key: value".
        def simple_key?(key)
          is_one_plain_line?(key) ||
            key =~ /\A(#{REX_BOOL}|#{REX_FLOAT}|#{REX_INT}|#{REX_NULL}|#{REX_SYMBOL})\z/x
        end

        # Emits a Time as "YYYY-MM-DD HH:MM:SS[.uuuuuu] +HH:MM".
        def emit_time(time)
          u_sec = (time.usec != 0) ? sprintf(".%.6d", time.usec) : ''
          time.strftime("%Y-%m-%d %H:%M:%S#{u_sec} #{format_utc_offset(time.gmtoff)}")
        end

        # Formats a UTC offset given in seconds as "+HH:MM" / "-HH:MM".
        # Sign and magnitude are handled separately: Ruby's % follows the
        # sign of the divisor, so taking the minutes of a negative offset
        # directly gives the wrong value (e.g. -03:45 would become -03:15).
        def format_utc_offset(offset)
          sign = offset < 0 ? '-' : '+'
          hours, minutes = (offset.abs.to_i / 60).divmod(60)
          sprintf('%s%.2d:%.2d', sign, hours, minutes)
        end

        # Emits a Symbol, either ":name" (when :use_natural_symbols is set
        # and the name is a plain one-line scalar) or "!ruby/symbol name".
        def emit_symbol(sym, level)
          natural = @options[:use_natural_symbols] && is_one_plain_line?(sym.to_s)
          prefix = natural ? ":" : "!ruby/symbol "
          "#{prefix}#{emit_string(sym, level)}"
        end

        # Emits a Struct as "!ruby/struct:Name" followed by its members.
        def emit_struct(obj, level)
          struct_members = {}
          obj.each_pair { |k, v| struct_members[k.to_s] = v }
          "!ruby/struct:#{obj.class.to_s.sub(/^(Struct::(.+)|.*)$/, '\2')} #{emit(struct_members, level + 1)}"
        end

        # Emits any other object as "!ruby/object:Class" followed by a
        # mapping of its instance variables (names without the leading "@").
        def emit_object(obj, level)
          object_members = {}
          obj.instance_variables.each do |ivar|
            object_members[ivar.to_s.sub(/^@/, '')] = obj.instance_variable_get(ivar)
          end
          "!ruby/object:#{obj.class.to_s} #{emit(object_members, level + 1)}"
        end

        # Picks the scalar style for +str+ (a String or Symbol): plain,
        # double-quoted, literal block, or base64 "!binary".
        def emit_string(str, level)
          if @options[:quote_all_strings] && !str.is_a?(Symbol)
            return emit_quoted_string(str, level)
          end

          str = str.to_s
          (is_string, is_printable, is_one_line, is_one_plain_line) = string_type(str)

          return emit_base64_binary(str, level) unless is_string
          return emit_quoted_string(str, level) unless is_printable
          return emit_simple_string(str, level) if is_one_plain_line

          if is_one_line || str.length < @options[:minimum_block_length]
            emit_quoted_string(str, level)
          else
            emit_block_string(str, level)
          end
        end

        # Plain scalar: the string is emitted unchanged.
        def emit_simple_string(str, level)
          str
        end

        # Literal block scalar ("|") with indentation and chomping indicators.
        def emit_block_string(str, level)
          str = normalize_line_break(str)

          indent = s_indent(level)
          # a leading space needs an explicit indentation indicator
          indentation_indicator = (str =~ /\A /) ? indent.size.to_s : ''
          str =~ /(#{REX_NORMAL_LB}*)\z/
          # number of trailing line breaks: none => strip, one => clip, more => keep
          chomping_indicator = case $1.length
            when 0
              '-'
            when 1
              ''
            else
              '+'
          end

          str.chomp!
          str.gsub!(/#{REX_NORMAL_LB}/) { $1 + indent }
          "|#{indentation_indicator}#{chomping_indicator}\n#{indent}#{str}"
        end

        # Double-quoted scalar. Strings at least :minimum_block_length long
        # are folded onto continuation lines at their line breaks.
        def emit_quoted_string(str, level)
          str = yaml_escape(normalize_line_break(str))
          if (str.length < @options[:minimum_block_length])
            str.gsub!(/#{REX_NORMAL_LB}/) { ESCAPE_SEQ_LB[$1] }
          else
            str.gsub!(/#{REX_NORMAL_LB}$/) { ESCAPE_SEQ_LB[$1] }
            # escape a tab/space that directly follows a run of line breaks
            # ($3 is the character captured after the run)
            str.gsub!(/(#{REX_NORMAL_LB}+)(.)/) do
              trail_c = $3
              $1 + trail_c.sub(/([\t ])/) { ESCAPE_SEQ_WS[$1] }
            end
            indent = s_indent(level)
            str.gsub!(/#{REX_NORMAL_LB}/) { "#{ESCAPE_SEQ_LB[$1]}\\\n#{indent}" }
          end
          %Q("#{str}")
        end

        # "!binary" block scalar containing the base64 form of +str+.
        def emit_base64_binary(str, level)
          indent = "\n#{s_indent(level)}"
          base64 = [str].pack('m')
          "!binary |#{indent}#{base64.gsub(/\n(?!\z)/, indent)}"
        end

        # Classifies +str+.
        #
        # @return [Array] (is_string, is_printable, is_one_line, is_one_plain_line)
        def string_type(str)
          if str.respond_to?(:encoding) && (!str.valid_encoding? || str.encoding == Encoding::ASCII_8BIT)
            return false, false, false, false
          end

          begin
            ucs_codes = str.unpack('U*')
          rescue ArgumentError
            # malformed UTF-8 -> binary data
            return false, false, false, false
          end

          if (
            @options[:printable_with_syck] &&
            str =~ /\A#{REX_ANY_LB}* | #{REX_ANY_LB}*\z|#{REX_ANY_LB}{2}\z/
          )
            # detour Syck bug
            return true, false, nil, false
          end

          ucs_codes.each do |ucs_code|
            return true, false, nil, false unless is_printable?(ucs_code)
          end
          return true, true, is_one_line?(str), is_one_plain_line?(str)
        end

        # True when the code point is printable. With :escape_b_specific set,
        # LS/PS (U+2028/U+2029) are treated as non-printable.
        def is_printable?(ucs_code)
          # YAML 1.1 / 4.1.1.
          (
            [0x09, 0x0a, 0x0d, 0x85].include?(ucs_code)   ||
            (ucs_code <=     0x7e && ucs_code >=    0x20) ||
            (ucs_code <=   0xd7ff && ucs_code >=    0xa0) ||
            (ucs_code <=   0xfffd && ucs_code >=  0xe000) ||
            (ucs_code <= 0x10ffff && ucs_code >= 0x10000)
          ) &&
          !(
            # treat LS/PS as non-printable characters
            @options[:escape_b_specific] &&
            (ucs_code == 0x2028 || ucs_code == 0x2029)
          )
        end

        # True when +str+ has no line break other than one at the very end.
        def is_one_line?(str)
          str !~ /#{REX_ANY_LB}(?!\z)/
        end

        # True when +str+ can be written as a plain scalar: no indicator
        # characters, whitespace or line breaks, and not text that matches
        # one of the typed-scalar regexps (bool, number, null, timestamp, ...).
        def is_one_plain_line?(str)
          # YAML 1.1 / 4.6.11.
          str !~ /^([\-\?:,\[\]\{\}\#&\*!\|>'"%@`\s]|---|\.\.\.)/    &&
          str !~ /[:\#\s\[\]\{\},]/                                  &&
          str !~ /#{REX_ANY_LB}/                                     &&
          str !~ /^(#{REX_BOOL}|#{REX_FLOAT}|#{REX_INT}|#{REX_MERGE}
            |#{REX_NULL}|#{REX_TIMESTAMP}|#{REX_VALUE})$/x
        end

        # Indentation string for nesting depth +level+.
        def s_indent(level)
          # YAML 1.1 / 4.2.2.
          ' ' * (level * @options[:indent_size])
        end

        # Returns a copy of +str+ with CRLF, CR and NEL replaced by LF.
        def normalize_line_break(str)
          # YAML 1.1 / 4.1.4.
          str.gsub(/(#{REX_CRLF}|#{REX_CR}|#{REX_NEL})/, "\n")
        end

        # Escapes the characters of +str+ that need it inside a
        # double-quoted scalar.
        def yaml_escape(str)
          # YAML 1.1 / 4.1.6.
          str.gsub(/[^a-zA-Z0-9]/u) do |c|
            # Fall back to the code point of "?" (an Integer, so the numeric
            # comparisons below stay valid) if the character cannot be decoded.
            ucs_code, = (c.unpack('U') rescue [0x3f])
            case
              when ESCAPE_SEQ[c]
                ESCAPE_SEQ[c]
              when is_printable?(ucs_code)
                c
              when @options[:escape_as_utf8]
                c.respond_to?(:bytes) ?
                  c.bytes.collect { |b| '\\x%.2x' % b }.join :
                  '\\x' + c.unpack('H2' * c.size).join('\\x')
              when ucs_code == 0x2028 || ucs_code == 0x2029
                ESCAPE_SEQ_LB[c]
              when ucs_code <= 0x7f
                sprintf('\\x%.2x', ucs_code)
              when ucs_code <= 0xffff
                sprintf('\\u%.4x', ucs_code)
              else
                sprintf('\\U%.8x', ucs_code)
            end
          end
        end

      end

    end

  end
end