# -*- encoding: utf-8 -*- require 'test/unit' require 'marc' # Testing char encodings under 1.9, don't bother running # these tests except under 1.9, will either fail (because # 1.9 func the test itself uses isn't there), or trivially pass # (becuase the func they are testing is no-op on 1.9). if "".respond_to?(:encoding) class ReaderCharEncodingsTest < Test::Unit::TestCase #### # Helper methods for our tests # #### @@utf_marc_path = 'test/utf8.marc' # tests against record at test/utf8.marc def assert_utf8_right_in_utf8(record) assert_equal "UTF-8", record['245'].subfields.first.value.encoding.name assert_equal "UTF-8", record['245'].to_s.encoding.name assert_equal "UTF-8", record['245'].subfields.first.to_s.encoding.name assert_equal "UTF-8", record['245'].subfields.first.value.encoding.name assert_equal "UTF-8", record['245']['a'].encoding.name assert record['245']['a'].start_with?("Photčhanānukrom") end # Test against multirecord just to be sure that works. # the multirecord file is just two concatenated copies # of the single one. @@cp866_marc_path = "test/cp866_multirecord.marc" # assumes record in test/cp866_unimarc.marc # Pass in an encoding name, using ruby's canonical name! # "IBM866" not "cp866". "UTF-8". def assert_cp866_right(record, encoding = "IBM866") assert_equal(encoding, record['001'].value.encoding.name) assert_equal(["d09d"], record['001'].value.encode("UTF-8").unpack('H4')) # russian capital N end #### # end helper methods #### def test_unicode_load reader = MARC::Reader.new(@@utf_marc_path) record = nil assert_nothing_raised { record = reader.first } assert_utf8_right_in_utf8(record) end def test_unicode_decode_forgiving # two kinds of forgiving invocation, they shouldn't be different, # but just in case they have slightly different code paths, test em # too. marc_string = File.open(@@utf_marc_path).read.force_encoding("utf-8") record = MARC::Reader.decode(marc_string, :forgiving => true) assert_utf8_right_in_utf8(record) reader = MARC::ForgivingReader.new(@@utf_marc_path) record = reader.first assert_utf8_right_in_utf8(record) end def test_unicode_forgiving_reader_passes_options # Make sure ForgivingReader accepts same options as MARC::Reader # We don't test them ALL though, just a sample. # Tell it we're reading cp866, but trancode to utf8 for us. reader = MARC::ForgivingReader.new(@@cp866_marc_path, :external_encoding => "cp866", :internal_encoding => "utf-8") record = reader.first assert_cp866_right(record, "UTF-8") end def test_explicit_encoding reader = MARC::Reader.new(@@cp866_marc_path, :external_encoding => 'cp866') assert_cp866_right(reader.first, "IBM866") end def test_bad_encoding_name_input reader = MARC::Reader.new(@@cp866_marc_path, :external_encoding => 'adadfadf') assert_raises ArgumentError do reader.first end end def test_marc8_with_binary # Marc8, best we can do is read it in binary. reader = MARC::Reader.new('test/marc8_accented_chars.marc', :external_encoding => 'binary') record = reader.first assert_equal "ASCII-8BIT", record['100'].subfields.first.value.encoding.name end def test_load_file_opened_with_external_encoding reader = MARC::Reader.new(File.open(@@cp866_marc_path, 'r:cp866')) record = reader.first # Make sure it's got the encoding it's supposed to. assert_cp866_right(record, "IBM866") end def test_explicit_encoding_beats_file_encoding reader = MARC::Reader.new(File.open(@@cp866_marc_path, 'r:utf-8'), :external_encoding => "cp866") record = reader.first assert_cp866_right(record, "IBM866") end def test_from_string_with_utf8_encoding marc_string = File.open(@@utf_marc_path).read.force_encoding("UTF-8") reader = MARC::Reader.new(StringIO.new(marc_string)) record = reader.first assert_utf8_right_in_utf8(record) end def test_from_string_with_cp866 marc_string = File.open(@@cp866_marc_path).read.force_encoding("cp866") reader = MARC::Reader.new(StringIO.new(marc_string)) record = reader.first assert_cp866_right(record, "IBM866") end def test_decode_from_string_with_cp866 marc_string = File.open(@@cp866_marc_path).read.force_encoding("cp866") record = MARC::Reader.decode(marc_string) assert_cp866_right(record, "IBM866") end def test_with_transcode reader = MARC::Reader.new(@@cp866_marc_path, :external_encoding => 'cp866', :internal_encoding => 'UTF-8') record = reader.first assert_cp866_right(record, "UTF-8") end def test_with_binary_filehandle # about to recommend this as a foolproof way to avoid # ruby transcoding behind your back in docs, let's make # sure it really works. reader = MARC::Reader.new(File.open(@@cp866_marc_path, :external_encoding => "binary", :internal_encoding => "binary"), :external_encoding => "IBM866") record = reader.first assert_cp866_right(record, "IBM866") end def test_with_bad_source_bytes reader = MARC::Reader.new('test/utf8_with_bad_bytes.marc', :external_encoding => "UTF-8", :validate_encoding => true) assert_raise Encoding::InvalidByteSequenceError do record = reader.first end end def test_bad_source_bytes_with_replace reader = MARC::Reader.new('test/utf8_with_bad_bytes.marc', :external_encoding => "UTF-8", :invalid => :replace) record = nil assert_nothing_raised do record = reader.first end # it should have the unicode replacement char where the bad # byte was. assert_match '=> ' + "\uFFFD" + '( <=', record['245']['a'] end def test_bad_source_bytes_with_custom_replace reader = MARC::Reader.new('test/utf8_with_bad_bytes.marc', :external_encoding => "UTF-8", :invalid => :replace, :replace => '') record = reader.first # bad byte replaced with empty string, gone. assert_match '=> ( <=', record['245']['a'] end def test_default_internal_encoding # Some people WILL be changing their Encoding.default_internal # It's even recommended by wycats # http://yehudakatz.com/2010/05/05/ruby-1-9-encodings-a-primer-and-the-solution-for-rails/ # This will in some cases make ruby File object trans-code # by default. Trans-coding a serial marc binary can change the # byte count and mess it up. # # But at present, because of the way the Reader is implemented reading # specific bytecounts, it _works_, although it does not _respect_ # Encoding.default_internal. That's the best we can do right now, # thsi test is important to ensure it stays at least this good. begin original = Encoding.default_internal Encoding.default_internal = "UTF-8" reader = MARC::Reader.new(File.open(@@cp866_marc_path, 'r:cp866')) record = reader.first assert_cp866_right(record, "IBM866") ensure Encoding.default_internal = original end end def test_default_internal_encoding_with_string_arg begin original = Encoding.default_internal Encoding.default_internal = "UTF-8" reader = MARC::Reader.new(@@cp866_marc_path, :external_encoding => "cp866") record = reader.first assert_cp866_right(record, "IBM866") ensure Encoding.default_internal = original end end end else require 'pathname' $stderr.puts "\nTests not being run in ruby 1.9.x, skipping #{Pathname.new(__FILE__).basename}\n\n" end