# Encoding: UTF-8

require 'test_helper'

require 'traject'
require 'traject/indexer'
require 'traject/marc4j_reader'

require 'marc'

# Specs for Traject::Marc4JReader: each example opens a fixture via
# support_file_path, reads it through the reader, and checks the resulting
# MARC::Record objects (count, type, string encoding, selected field values).
describe "Marc4JReader" do
  it "reads Marc binary" do
    file     = File.new(support_file_path("test_data.utf8.mrc"))
    settings = Traject::Indexer::Settings.new() # binary type is default
    reader   = Traject::Marc4JReader.new(file, settings)

    array = reader.to_a

    assert_equal 30, array.length
    first = array.first

    assert_kind_of MARC::Record, first
    assert_equal "UTF-8", first['245']['a'].encoding.name
  end

  it "can skip a bad subfield code" do
    file     = File.new(support_file_path("bad_subfield_code.marc"))
    settings = Traject::Indexer::Settings.new() # binary type is default
    reader   = Traject::Marc4JReader.new(file, settings)

    array = reader.to_a

    assert_equal 1, array.length
    assert_kind_of MARC::Record, array.first
    assert_length 2, array.first['260'].subfields
  end

  it "reads Marc binary in Marc8 encoding" do
    file     = File.new(support_file_path("one-marc8.mrc"))
    settings = Traject::Indexer::Settings.new("marc_source.encoding" => "MARC8")
    reader   = Traject::Marc4JReader.new(file, settings)

    array = reader.to_a

    assert_length 1, array

    assert_kind_of MARC::Record, array.first

    a245a = array.first['245']['a']

    # assert_equal, not bare assert: with bare assert the "UTF-8" argument
    # would be taken as the failure message and any truthy name would pass.
    assert_equal "UTF-8", a245a.encoding.name
    assert a245a.valid_encoding?

    # marc4j converts to denormalized (decomposed) unicode, bah. Although
    # it's legal, a raw literal is fragile: editors and tools may silently
    # re-normalize it. The combining cedilla (U+0327) and combining tilde
    # (U+0303) are therefore spelled out as escapes.
    assert_equal "Por uma outra globalizac\u0327a\u0303o :", a245a

    # Set leader byte to proper for unicode
    assert_equal 'a', array.first.leader[9]
  end

  it "reads XML" do
    file     = File.new(support_file_path("test_data.utf8.marc.xml"))
    settings = Traject::Indexer::Settings.new("marc_source.type" => "xml")
    reader   = Traject::Marc4JReader.new(file, settings)

    array = reader.to_a

    assert_equal 30, array.length

    first = array.first

    assert_kind_of MARC::Record, first
    # assert_equal so the encoding is actually compared (see Marc8 example).
    assert_equal "UTF-8", first['245']['a'].encoding.name
    assert_equal "Fikr-i Ayāz /", first['245']['a']
  end

  it "keeps marc4j object when asked" do
    file     = File.new(support_file_path("test_data.utf8.marc.xml"))
    settings = Traject::Indexer::Settings.new(
      "marc_source.type"            => "xml",
      'marc4j_reader.keep_marc4j'   => true
    )
    record = Traject::Marc4JReader.new(file, settings).to_a.first

    assert_kind_of MARC::Record, record
    assert_kind_of Java::org.marc4j.marc.impl::RecordImpl, record.original_marc4j
  end

  it "replaces unicode character reference in Marc8 transcode" do
    file = File.new(support_file_path("escaped_character_reference.marc8.marc"))
    # due to marc4j idiosyncrasies, this test will NOT pass with default source_encoding
    # of "BESTGUESS", it only works if you explicitly set to MARC8. Doh.
    settings = Traject::Indexer::Settings.new("marc_source.encoding" => "MARC8") # binary type is default
    record = Traject::Marc4JReader.new(file, settings).to_a.first

    assert_equal "Rio de Janeiro escaped replacement char: \uFFFD .", record['260']['a']
  end

  describe "Marc4J Java Permissive Stream Reader" do
    # needed for sanity check when our tests fail to see if Marc4J
    # is not behaving how we think it should.
    it "converts character references" do
      file   = File.new(support_file_path("escaped_character_reference.marc8.marc"))
      reader = MarcPermissiveStreamReader.new(file.to_inputstream, true, true, "MARC-8")
      record = reader.next

      field    = record.getVariableField("260")
      subfield = field.getSubfield('a'.ord)
      value    = subfield.getData

      assert_equal "Rio de Janeiro escaped replacement char: \uFFFD .", value
    end
  end

  it "replaces bad byte in UTF8 marc" do
    skip "Marc4J needs fixing on it's end" # Marc4J won't do this in 'permissive' mode, gah.

    # Note this only works because the marc file DOES correctly
    # have leader byte 9 set to 'a' for UTF8, otherwise Marc4J can't do it.
    file     = File.new(support_file_path("bad_utf_byte.utf8.marc"))
    settings = Traject::Indexer::Settings.new() # binary UTF8 type is default
    reader   = Traject::Marc4JReader.new(file, settings)

    record = reader.to_a.first

    value = record['300']['a']

    assert_equal "UTF-8", value.encoding.name
    assert value.valid_encoding?, "Has valid encoding"
    assert_equal "This is a bad byte: '\uFFFD' and another: '\uFFFD'", record['300']['a']
  end
end