require 'spec_helper'
require 'puppet/util/character_encoding'
require 'puppet_spec/character_encoding'

# Specs for Puppet::Util::CharacterEncoding.
#
# Three entry points are exercised:
#   * ::convert_to_utf_8          - transcodes a string to UTF-8 where possible
#   * ::override_encoding_to_utf_8 - relabels bytes as UTF-8 when they are valid UTF-8
#   * ::scrub                     - replaces invalid byte sequences
#
# Every group also checks that the string handed in is left untouched.
#
# NOTE: fixtures call String#force_encoding on literals, so this file must not
# be switched to frozen string literals.
describe Puppet::Util::CharacterEncoding do
  describe "::convert_to_utf_8" do
    context "when passed a string that is already UTF-8" do
      context "with valid encoding" do
        let(:utf8_string) { "\u06FF\u2603".force_encoding(Encoding::UTF_8) }

        it "should return the string unmodified" do
          expect(Puppet::Util::CharacterEncoding.convert_to_utf_8(utf8_string)).to eq("\u06FF\u2603".force_encoding(Encoding::UTF_8))
        end

        it "should not mutate the original string" do
          # The conversion has to run before the assertion; without this call
          # the example only compares a freshly built fixture to itself.
          Puppet::Util::CharacterEncoding.convert_to_utf_8(utf8_string)
          expect(utf8_string).to eq("\u06FF\u2603".force_encoding(Encoding::UTF_8))
        end
      end

      context "with invalid encoding" do
        let(:invalid_utf8_string) { "\xfd\xf1".force_encoding(Encoding::UTF_8) }

        it "should issue a debug message" do
          expect(Puppet).to receive(:debug).with(/encoding is invalid/)
          Puppet::Util::CharacterEncoding.convert_to_utf_8(invalid_utf8_string)
        end

        it "should return the string unmodified" do
          expect(Puppet::Util::CharacterEncoding.convert_to_utf_8(invalid_utf8_string)).to eq("\xfd\xf1".force_encoding(Encoding::UTF_8))
        end

        it "should not mutate the original string" do
          Puppet::Util::CharacterEncoding.convert_to_utf_8(invalid_utf8_string)
          expect(invalid_utf8_string).to eq("\xfd\xf1".force_encoding(Encoding::UTF_8))
        end
      end
    end

    context "when passed a string in BINARY encoding" do
      context "that is valid in Encoding.default_external" do
        # Strings received as BINARY are not transcodable as-is, but by
        # "guessing" that they are really in Encoding.default_external they can
        # be transcoded to UTF-8.
        let(:win_31j) { [130, 187].pack('C*') } # pack('C*') returns string in BINARY

        it "should be able to convert to UTF-8 by labeling as Encoding.default_external" do
          # そ - HIRAGANA LETTER SO
          # In Windows_31J: \x82 \xbb - 130 187
          # In Unicode: \u305d - \xe3 \x81 \x9d - 227 129 157
          result = PuppetSpec::CharacterEncoding.with_external_encoding(Encoding::Windows_31J) do
            Puppet::Util::CharacterEncoding.convert_to_utf_8(win_31j)
          end
          expect(result).to eq("\u305d")
          expect(result.bytes.to_a).to eq([227, 129, 157])
        end

        it "should not mutate the original string" do
          PuppetSpec::CharacterEncoding.with_external_encoding(Encoding::Windows_31J) do
            Puppet::Util::CharacterEncoding.convert_to_utf_8(win_31j)
          end
          expect(win_31j).to eq([130, 187].pack('C*'))
        end
      end

      context "that is invalid in Encoding.default_external" do
        let(:invalid_win_31j) { [255, 254, 253].pack('C*') } # these bytes are not valid windows_31j

        it "should return the string unmodified" do
          result = PuppetSpec::CharacterEncoding.with_external_encoding(Encoding::Windows_31J) do
            Puppet::Util::CharacterEncoding.convert_to_utf_8(invalid_win_31j)
          end
          expect(result.bytes.to_a).to eq([255, 254, 253])
          expect(result.encoding).to eq(Encoding::BINARY)
        end

        it "should not mutate the original string" do
          PuppetSpec::CharacterEncoding.with_external_encoding(Encoding::Windows_31J) do
            Puppet::Util::CharacterEncoding.convert_to_utf_8(invalid_win_31j)
          end
          expect(invalid_win_31j).to eq([255, 254, 253].pack('C*'))
        end

        it "should issue a debug message that the string was not transcodable" do
          expect(Puppet).to receive(:debug).with(/cannot be transcoded/)
          PuppetSpec::CharacterEncoding.with_external_encoding(Encoding::Windows_31J) do
            Puppet::Util::CharacterEncoding.convert_to_utf_8(invalid_win_31j)
          end
        end
      end
    end

    # Sibling of the BINARY context rather than a child of it: the fixtures
    # below are explicitly labeled Shift_JIS / ASCII, not BINARY.
    context "Given a string labeled as neither UTF-8 nor BINARY" do
      context "that is transcodable" do
        let(:shift_jis) { [130, 174].pack('C*').force_encoding(Encoding::Shift_JIS) }

        it "should return a copy of the string transcoded to UTF-8 if it is transcodable" do
          # http://www.fileformat.info/info/unicode/char/3050/index.htm
          # ぐ - HIRAGANA LETTER GU
          # In Shift_JIS: \x82 \xae - 130 174
          # In Unicode: \u3050 - \xe3 \x81 \x90 - 227 129 144
          # if we were only ruby > 2.3.0, we could do String.new("\x82\xae", :encoding => Encoding::Shift_JIS)
          result = Puppet::Util::CharacterEncoding.convert_to_utf_8(shift_jis)
          expect(result).to eq("\u3050".force_encoding(Encoding::UTF_8))
          # largely redundant but reinforces the point - this was transcoded:
          expect(result.bytes.to_a).to eq([227, 129, 144])
        end

        it "should not mutate the original string" do
          Puppet::Util::CharacterEncoding.convert_to_utf_8(shift_jis)
          expect(shift_jis).to eq([130, 174].pack('C*').force_encoding(Encoding::Shift_JIS))
        end
      end

      context "when not transcodable" do
        # An admittedly contrived case, but perhaps not so improbable
        # http://www.fileformat.info/info/unicode/char/5e0c/index.htm
        # 希 Han Character 'rare; hope, expect, strive for'
        # In EUC_KR: \xfd \xf1 - 253 241
        # In Unicode: \u5e0c - \xe5 \xb8 \x8c - 229 184 140
        #
        # In this case, this EUC_KR character has been read in as ASCII and is
        # invalid in that encoding. This would raise an EncodingError
        # exception on transcode, but we catch it and issue a debug message,
        # leaving the original string unaltered.
        let(:euc_kr) { [253, 241].pack('C*').force_encoding(Encoding::ASCII) }

        it "should issue a debug message" do
          expect(Puppet).to receive(:debug).with(/cannot be transcoded/)
          Puppet::Util::CharacterEncoding.convert_to_utf_8(euc_kr)
        end

        it "should return the original string unmodified" do
          result = Puppet::Util::CharacterEncoding.convert_to_utf_8(euc_kr)
          expect(result).to eq([253, 241].pack('C*').force_encoding(Encoding::ASCII))
        end

        it "should not mutate the original string" do
          Puppet::Util::CharacterEncoding.convert_to_utf_8(euc_kr)
          expect(euc_kr).to eq([253, 241].pack('C*').force_encoding(Encoding::ASCII))
        end
      end
    end
  end

  describe "::override_encoding_to_utf_8" do
    context "given a string with bytes that represent valid UTF-8" do
      # ☃ - unicode snowman
      # \u2603 - \xe2 \x98 \x83 - 226 152 131
      let(:snowman) { [226, 152, 131].pack('C*') }

      it "should return a copy of the string with external encoding of the string to UTF-8" do
        result = Puppet::Util::CharacterEncoding.override_encoding_to_utf_8(snowman)
        expect(result).to eq("\u2603")
        expect(result.encoding).to eq(Encoding::UTF_8)
      end

      it "should not modify the original string" do
        Puppet::Util::CharacterEncoding.override_encoding_to_utf_8(snowman)
        expect(snowman).to eq([226, 152, 131].pack('C*'))
      end
    end

    context "given a string with bytes that do not represent valid UTF-8" do
      # Ø - Latin capital letter O with stroke
      # In ISO-8859-1: \xd8 - 216
      # Invalid in UTF-8 without transcoding
      let(:oslash) { [216].pack('C*').force_encoding(Encoding::ISO_8859_1) }

      it "should issue a debug message" do
        expect(Puppet).to receive(:debug).with(/not valid UTF-8/)
        Puppet::Util::CharacterEncoding.override_encoding_to_utf_8(oslash)
      end

      it "should return the original string unmodified" do
        result = Puppet::Util::CharacterEncoding.override_encoding_to_utf_8(oslash)
        expect(result).to eq([216].pack('C*').force_encoding(Encoding::ISO_8859_1))
      end

      it "should not modify the string" do
        Puppet::Util::CharacterEncoding.override_encoding_to_utf_8(oslash)
        expect(oslash).to eq([216].pack('C*').force_encoding(Encoding::ISO_8859_1))
      end
    end
  end

  describe "::scrub" do
    let(:utf_8_string_to_scrub) { "\xfdfoo".force_encoding(Encoding::UTF_8) } # invalid in UTF-8

    # The invalid-ness of this string comes from unpaired surrogates, i.e.
    # any value in the range 0xD800 to 0xDBFF not followed by a value in the
    # range 0xDC00 to 0xDFFF, or any value in the range 0xDC00 to 0xDFFF not
    # preceded by a value in the range 0xD800 to 0xDBFF.
    # http://unicode.org/faq/utf_bom.html#utf16-7
    # "a\ud800b"
    # We expect the "b" to be replaced as that is what makes the string invalid
    let(:utf_16LE_string_to_scrub) { [97, 237, 160, 128, 98].pack('C*').force_encoding(Encoding::UTF_16LE) } # invalid in UTF-16
    let(:invalid_non_utf) { "foo\u2603".force_encoding(Encoding::EUC_KR) } # EUC_KR foosnowman!

    it "should defer to String#scrub if defined", :if => String.method_defined?(:scrub) do
      result = Puppet::Util::CharacterEncoding.scrub(utf_8_string_to_scrub)
      # The result should have the UTF-8 replacement character if we're using Ruby scrub
      expect(result).to eq("\uFFFDfoo".force_encoding(Encoding::UTF_8))
      expect(result.bytes.to_a).to eq([239, 191, 189, 102, 111, 111])
    end

    context "when String#scrub is not defined" do
      # Each example stubs respond_to?(:scrub) on its fixture to return false,
      # simulating a String without #scrub.
      it "should still issue unicode replacement characters if the string is UTF-8" do
        allow(utf_8_string_to_scrub).to receive(:respond_to?).with(:scrub).and_return(false)
        result = Puppet::Util::CharacterEncoding.scrub(utf_8_string_to_scrub)
        expect(result).to eq("\uFFFDfoo".force_encoding(Encoding::UTF_8))
      end

      it "should still issue unicode replacement characters if the string is UTF-16LE" do
        allow(utf_16LE_string_to_scrub).to receive(:respond_to?).with(:scrub).and_return(false)
        result = Puppet::Util::CharacterEncoding.scrub(utf_16LE_string_to_scrub)
        # Bytes of replacement character on UTF_16LE are [253, 255]
        # We just check for bytes because something (ruby?) interprets this array of bytes as:
        # (97) (237 160) (128 253 255) rather than (97) (237 160 128) (253 255)
        expect(result).to eq([97, 237, 160, 128, 253, 255].pack('C*').force_encoding(Encoding::UTF_16LE))
      end

      it "should issue '?' characters if the string is not one of UTF_8 or UTF_16LE" do
        allow(invalid_non_utf).to receive(:respond_to?).with(:scrub).and_return(false)
        result = Puppet::Util::CharacterEncoding.scrub(invalid_non_utf)
        expect(result).to eq("foo???".force_encoding(Encoding::EUC_KR))
      end
    end
  end
end