require 'spec_helper'

# Spec for Spotlight::Dor::Indexer.
#
# Each example group builds a Harvestdor resource (`r`) and a
# GDor::Indexer::SolrDocBuilder (`sdb`) around a fake druid, stubs the XML the
# resource would return, invokes one indexing step through `subject.send`, and
# then inspects the `solr_doc` hash that step is expected to fill in.
#
# NOTE(review): the XML markup inside the heredoc fixtures below (element tags,
# attributes, namespaces) is absent from this copy of the file -- only the text
# content and `#{...}` interpolations remain, and several heredocs are empty.
# The markup must be restored from version control before these examples can
# pass; it has deliberately not been guessed at here. Some inline comments
# ("actually in ") also appear to have lost an element name.
describe Spotlight::Dor::Indexer do
  subject { described_class.new }

  let(:fake_druid) { 'oo000oo0000' }
  let(:r) { Harvestdor::Indexer::Resource.new(double, fake_druid) }
  # The builder logs to a throwaway StringIO so nothing reaches the console.
  let(:sdb) { GDor::Indexer::SolrDocBuilder.new(r, Logger.new(StringIO.new)) }
  let(:solr_doc) { {} }

  # MODS fixture used by the 'in /location/physicalLocation' contexts.
  # `example` is supplied by the table-driven groups further down.
  # NOTE(review): wrapping markup missing -- see header.
  let(:mods_loc_phys_loc) do
    Nokogiri::XML <<-EOF
      #{example}
    EOF
  end

  # MODS fixture used by the 'in /relatedItem/location/physicalLocation' contexts.
  # NOTE(review): wrapping markup missing -- see header.
  let(:mods_rel_item_loc_phys_loc) do
    Nokogiri::XML <<-EOF
      #{example}
    EOF
  end

  # MODS fixture used by the 'with multiple physicalLocation elements' contexts:
  # an unrelated value followed by the example under test.
  # NOTE(review): wrapping markup missing -- see header.
  let(:mods_loc_multiple_phys_loc) do
    Nokogiri::XML <<-EOF
      Irrelevant Data
      #{example}
    EOF
  end

  before do
    # reduce log noise
    allow(r).to receive(:harvestdor_client)
    i = Harvestdor::Indexer.new
    i.logger.level = Logger::WARN
    allow(r).to receive(:indexer).and_return i
  end

  describe '#add_content_metadata_fields' do
    before do
      allow(r).to receive(:public_xml).and_return(public_xml)
      allow(sdb).to receive(:bare_druid).and_return(fake_druid)

      # stacks url calculations require the druid
      solr_doc[:id] = fake_druid

      subject.send(:add_content_metadata_fields, sdb, solr_doc)
    end

    context 'with a record without contentMetadata' do
      # NOTE(review): fixture body is empty in this copy -- see header.
      let(:public_xml) do
        Nokogiri::XML <<-EOF
        EOF
      end

      it 'is blank, except for the document id' do
        expect(solr_doc.except(:id)).to be_blank
      end
    end

    context 'with a record with contentMetadata' do
      # NOTE(review): fixture body is empty in this copy -- see header. The
      # expectations below require an 'image' content type and a first image
      # file named bj356mh7176_00_0001 with width 12967 and height 22970.
      let(:public_xml) do
        Nokogiri::XML <<-EOF
        EOF
      end

      it 'indexes the declared content metadata type' do
        expect(solr_doc['content_metadata_type_ssim']).to contain_exactly 'image'
      end

      it 'indexes the thumbnail information' do
        expect(solr_doc['content_metadata_first_image_file_name_ssm']).to contain_exactly 'bj356mh7176_00_0001'
        expect(solr_doc['content_metadata_first_image_width_ssm']).to contain_exactly '12967'
        expect(solr_doc['content_metadata_first_image_height_ssm']).to contain_exactly '22970'
      end

      it 'indexes the images' do
        stacks_base_url = 'https://stacks.stanford.edu/image/iiif/oo000oo0000%2Fbj356mh7176_00_0001'

        expect(solr_doc['content_metadata_image_iiif_info_ssm']).to include "#{stacks_base_url}/info.json"
        expect(solr_doc['thumbnail_square_url_ssm']).to include "#{stacks_base_url}/square/100,100/0/default.jpg"
        expect(solr_doc['thumbnail_url_ssm']).to include "#{stacks_base_url}/full/!400,400/0/default.jpg"
        expect(solr_doc['large_image_url_ssm']).to include "#{stacks_base_url}/full/pct:25/0/default.jpg"
        expect(solr_doc['full_image_url_ssm']).to include "#{stacks_base_url}/full/full/0/default.jpg"
      end
    end
  end

  describe '#add_donor_tags' do
    before do
      allow(r).to receive(:mods).and_return(mods)
      subject.send(:add_donor_tags, sdb, solr_doc)
    end

    context 'with a record without donor tags' do
      # NOTE(review): wrapping markup missing -- see header.
      let(:mods) do
        Nokogiri::XML <<-EOF
          (not a donor tag)
        EOF
      end

      it 'is blank' do
        expect(solr_doc['donor_tags_ssim']).to be_blank
      end
    end

    context 'with a record with donor tags' do
      # NOTE(review): wrapping markup missing -- see header. The values are
      # split one per line to match the five tags asserted below.
      let(:mods) do
        # e.g. from https://purl.stanford.edu/vw282gv1740
        Nokogiri::XML <<-EOF
          Knowledge Systems Laboratory
          medical applications
          Publishing
          Stanford
          Stanford Computer Science Department
        EOF
      end

      it 'extracts the donor tags' do
        expect(solr_doc['donor_tags_ssim']).to contain_exactly 'Knowledge Systems Laboratory',
                                                               'medical applications',
                                                               'Publishing',
                                                               'Stanford',
                                                               'Stanford Computer Science Department'
      end
    end
  end

  context 'StanfordMods concern' do
    describe '#add_author_no_collector' do
      before do
        allow(r).to receive(:mods).and_return(mods)
        subject.send(:add_author_no_collector, sdb, solr_doc)
      end

      let(:name) { 'Macro Hamster' }
      # NOTE(review): wrapping markup missing -- see header. The remaining
      # text is two names, each followed by what look like role codes
      # ("cre", "col").
      let(:mods) do
        Nokogiri::XML <<-EOF
          #{name}
          cre
          Ignored
          col
        EOF
      end

      it 'populates author_no_collector_ssim field in solr doc' do
        expect(solr_doc['author_no_collector_ssim']).to eq [name]
      end

      it 'calls non_collector_person_authors on Stanford::Mods::Record object' do
        expect(sdb.smods_rec).to receive(:non_collector_person_authors)
        subject.send(:add_author_no_collector, sdb, solr_doc)
      end
    end

    describe '#add_collector' do
      before do
        allow(r).to receive(:mods).and_return(mods)
        subject.send(:add_collector, sdb, solr_doc)
      end

      let(:name) { 'Macro Hamster' }
      # NOTE(review): wrapping markup missing -- see header.
      let(:mods) do
        Nokogiri::XML <<-EOF
          #{name}
          col
        EOF
      end

      it 'populates collector_ssim field in solr doc' do
        expect(solr_doc['collector_ssim']).to eq [name]
      end

      it 'calls collectors_w_dates on Stanford::Mods::Record object' do
        expect(sdb.smods_rec).to receive(:collectors_w_dates)
        subject.send(:add_collector, sdb, solr_doc)
      end
    end

    describe '#add_genre' do
      before do
        allow(r).to receive(:mods).and_return(mods)
        subject.send(:add_genre, sdb, solr_doc)
      end

      context 'with a record without a genre' do
        # NOTE(review): fixture body is empty in this copy -- see header.
        let(:mods) do
          Nokogiri::XML <<-EOF
          EOF
        end

        it 'is blank' do
          expect(solr_doc['genre_ssim']).to be_blank
        end
      end

      context 'with a record with a genre' do
        # NOTE(review): wrapping markup missing -- see header.
        let(:mods) do
          # e.g. from https://purl.stanford.edu/vw282gv1740
          Nokogiri::XML <<-EOF
            manuscripts for publication
          EOF
        end

        it 'extracts the genre' do
          expect(solr_doc['genre_ssim']).to contain_exactly 'manuscripts for publication'
        end
      end
    end
  end

  describe '#add_series' do
    # example string as key, expected series name as value
    {
      # feigenbaum
      'Call Number: SC0340, Accession 2005-101': '2005-101',
      'Call Number: SC0340, Accession 2005-101, Box : 39, Folder: 9': '2005-101',
      'Call Number: SC0340, Accession 2005-101, Box: 2, Folder: 3': '2005-101',
      'Call Number: SC0340, Accession: 1986-052': '1986-052',
      'Call Number: SC0340, Accession: 1986-052, Box 3 Folder 38': '1986-052',
      'Call Number: SC0340, Accession: 2005-101, Box : 50, Folder: 31': '2005-101',
      'Call Number: SC0340, Accession: 1986-052, Box: 5, Folder: 1': '1986-052',
      'SC0340, Accession 1986-052': '1986-052',
      'SC0340, Accession 2005-101, Box 18': '2005-101',
      'Call Number: SC0340, Accession 2005-101, Box: 42A, Folder: 24': '2005-101',
      'Call Number: SC0340, Accession: 1986-052, Box: 42A, Folder: 59': '1986-052',
      'SC0340': nil,
      'SC0340, 1986-052, Box 18': nil,
      'Stanford University. Libraries. Department of Special Collections and University Archives': nil,
      # shpc (actually in )
      'Series Biographical Photographs | Box 42 | Folder Abbot, Nathan': 'Biographical Photographs',
      'Series General Photographs | Box 42 | Folder Administration building--Outer Quad': 'General Photographs',
      # menuez
      'MSS Photo 451, Series 1, Box 32, Folder 11, Sleeve 32-11-2, Frame B32-F11-S2-6': '1',
      'Series 1, Box 10, Folder 8': '1',
      # fuller
      'Collection: M1090 , Series: 4 , Box: 5 , Folder: 10': '4',
      # hummel (actually in )
      'Box 42 | Folder 3': nil,
      'Flat-box 228 | Volume 1': nil
    }.each do |example, expected|
      describe "for example '#{example}'" do
        let(:example) { example }

        context 'in /location/physicalLocation' do
          before do
            allow(r).to receive(:mods).and_return(mods_loc_phys_loc)
            subject.send(:add_series, sdb, solr_doc)
          end

          it "has the expected series name '#{expected}'" do
            expect(solr_doc['series_ssi']).to eq expected
          end
        end

        context 'in /relatedItem/location/physicalLocation' do
          before do
            allow(r).to receive(:mods).and_return(mods_rel_item_loc_phys_loc)
            subject.send(:add_series, sdb, solr_doc)
          end

          it "has the expected series name '#{expected}'" do
            expect(solr_doc['series_ssi']).to eq expected
          end
        end

        context 'with multiple physicalLocation elements' do
          before do
            allow(r).to receive(:mods).and_return(mods_loc_multiple_phys_loc)
            subject.send(:add_series, sdb, solr_doc)
          end

          it "has the expected series name '#{expected}'" do
            expect(solr_doc['series_ssi']).to eq expected
          end
        end
      end # for example
    end # each
  end # add_series

  describe '#add_box' do
    # example string as key, expected box name as value
    {
      # feigenbaum
      'Call Number: SC0340, Accession 2005-101, Box : 1, Folder: 1': '1',
      'Call Number: SC0340, Accession 2005-101, Box: 39, Folder: 9': '39',
      'Call Number: SC0340, Accession: 1986-052, Box 3 Folder 38': '3',
      'Call Number: SC0340, Accession: 2005-101, Box : 50, Folder: 31': '50',
      'Call Number: SC0340, Accession: 1986-052, Box: 5, Folder: 1': '5',
      'SC0340, 1986-052, Box 18': '18',
      'SC0340, Accession 2005-101, Box 18': '18',
      'Call Number: SC0340, Accession 2005-101, Box: 42A, Folder: 24': '42A',
      'Call Number: SC0340, Accession: 1986-052, Box: 42A, Folder: 59': '42A',
      'Call Number: SC0340, Accession 2005-101': nil,
      'Call Number: SC0340, Accession: 1986-052': nil,
      'SC0340': nil,
      'SC0340, Accession 1986-052': nil,
      'Stanford University. Libraries. Department of Special Collections and University Archives': nil,
      # shpc (actually in )
      'Series Biographical Photographs | Box 42 | Folder Abbot, Nathan': '42',
      'Series General Photographs | Box 42 | Folder Administration building--Outer Quad': '42',
      # menuez
      'MSS Photo 451, Series 1, Box 32, Folder 11, Sleeve 32-11-2, Frame B32-F11-S2-6': '32',
      'Series 1, Box 10, Folder 8': '10',
      # fuller
      'Collection: M1090 , Series: 1 , Box: 5 , Folder: 42': '5',
      # hummel (actually in )
      'Box 42 | Folder 3': '42',
      'Flat-box 228 | Volume 1': '228'
    }.each do |example, expected|
      describe "for example '#{example}'" do
        let(:example) { example }

        context 'in /location/physicalLocation' do
          before do
            allow(r).to receive(:mods).and_return(mods_loc_phys_loc)
            subject.send(:add_box, sdb, solr_doc)
          end

          it "has the expected box label '#{expected}'" do
            expect(solr_doc['box_ssi']).to eq expected
          end
        end

        context 'in /relatedItem/location/physicalLocation' do
          before do
            allow(r).to receive(:mods).and_return(mods_rel_item_loc_phys_loc)
            subject.send(:add_box, sdb, solr_doc)
          end

          it "has the expected box label '#{expected}'" do
            expect(solr_doc['box_ssi']).to eq expected
          end
        end

        context 'with multiple physicalLocation elements' do
          before do
            allow(r).to receive(:mods).and_return(mods_loc_multiple_phys_loc)
            subject.send(:add_box, sdb, solr_doc)
          end

          it "has the expected box label '#{expected}'" do
            expect(solr_doc['box_ssi']).to eq expected
          end
        end
      end # for example
    end # each
  end # add_box

  describe '#add_folder' do
    # example string as key, expected folder name as value
    {
      # feigenbaum
      'Call Number: SC0340, Accession 2005-101, Box : 1, Folder: 42': '42',
      'Call Number: SC0340, Accession 2005-101, Box: 2, Folder: 42': '42',
      'Call Number: SC0340, Accession: 1986-052, Box 3 Folder 42': '42',
      'Call Number: SC0340, Accession: 2005-101, Box : 4, Folder: 42': '42',
      'Call Number: SC0340, Accession: 1986-052, Box: 5, Folder: 42': '42',
      'Call Number: SC0340, Accession 2005-101, Box: 4A, Folder: 42': '42',
      'Call Number: SC0340, Accession: 1986-052, Box: 5A, Folder: 42': '42',
      'Call Number: SC0340, Accession 2005-101': nil,
      'Call Number: SC0340, Accession: 1986-052': nil,
      'SC0340': nil,
      'SC0340, 1986-052, Box 18': nil,
      'SC0340, Accession 2005-101': nil,
      'SC0340, Accession 2005-101, Box 18': nil,
      'Stanford University. Libraries. Department of Special Collections and University Archives': nil,
      # menuez
      'MSS Photo 451, Series 1, Box 32, Folder 42, Sleeve 32-11-2, Frame B32-F11-S2-6': '42',
      'Series 1, Box 10, Folder 42': '42',
      # fuller
      'Collection: M1090 , Series: 4 , Box: 5 , Folder: 42': '42',
      # hummel (actually in )
      'Box 1 | Folder 42': '42',
      'Flat-box 228 | Volume 1': nil,
      # shpc (actually in )
      'Series Biographical Photographs | Box 1 | Folder Abbot, Nathan': 'Abbot, Nathan',
      'Series General Photographs | Box 1 | Folder Administration building--Outer Quad': 'Administration building--Outer Quad',
      # hypothetical
      'Folder: 42, Sheet: 15': '42'
    }.each do |example, expected|
      describe "for example '#{example}'" do
        let(:example) { example }

        context 'in /location/physicalLocation' do
          before do
            allow(r).to receive(:mods).and_return(mods_loc_phys_loc)
            subject.send(:add_folder, sdb, solr_doc)
          end

          it "has the expected folder label '#{expected}'" do
            expect(solr_doc['folder_ssi']).to eq expected
          end
        end

        context 'in /relatedItem/location/physicalLocation' do
          before do
            allow(r).to receive(:mods).and_return(mods_rel_item_loc_phys_loc)
            subject.send(:add_folder, sdb, solr_doc)
          end

          it "has the expected folder label '#{expected}'" do
            expect(solr_doc['folder_ssi']).to eq expected
          end
        end

        context 'with multiple physicalLocation elements' do
          before do
            allow(r).to receive(:mods).and_return(mods_loc_multiple_phys_loc)
            subject.send(:add_folder, sdb, solr_doc)
          end

          it "has the expected folder label '#{expected}'" do
            expect(solr_doc['folder_ssi']).to eq expected
          end
        end
      end # for example
    end # each
  end # add_folder

  # rubocop:disable Metrics/LineLength
  describe '#add_location' do
    # example string as key, expected location string as value
    {
      # feigenbaum
      'Call Number: SC0340, Accession 2005-101, Box : 1, Folder: 1': 'Call Number: SC0340, Accession 2005-101, Box : 1, Folder: 1',
      'Call Number: SC0340, Accession 2005-101': 'Call Number: SC0340, Accession 2005-101',
      'SC0340, 1986-052, Box 18': 'SC0340, 1986-052, Box 18',
      'SC0340, Accession 2005-101, Box 18': 'SC0340, Accession 2005-101, Box 18',
      'SC0340': nil,
      'SC0340, Accession 1986-052': 'SC0340, Accession 1986-052',
      'Stanford University. Libraries. Department of Special Collections and University Archives': nil,
      # shpc (actually in )
      'Series Biographical Photographs | Box 42 | Folder Abbot, Nathan': 'Series Biographical Photographs | Box 42 | Folder Abbot, Nathan',
      'Series General Photographs | Box 42 | Folder Administration building--Outer Quad': 'Series General Photographs | Box 42 | Folder Administration building--Outer Quad',
      # menuez
      'MSS Photo 451, Series 1, Box 32, Folder 11, Sleeve 32-11-2, Frame B32-F11-S2-6': 'MSS Photo 451, Series 1, Box 32, Folder 11, Sleeve 32-11-2, Frame B32-F11-S2-6',
      'Series 1, Box 10, Folder 8': 'Series 1, Box 10, Folder 8',
      # fuller
      'Collection: M1090 , Series: 1 , Box: 5 , Folder: 42': 'Collection: M1090 , Series: 1 , Box: 5 , Folder: 42',
      # hummel (actually in )
      'Box 42 | Folder 3': 'Box 42 | Folder 3',
      'Flat-box 228 | Volume 1': 'Flat-box 228 | Volume 1'
    }.each do |example, expected|
      describe "for example '#{example}'" do
        let(:example) { example }

        context 'in /location/physicalLocation' do
          before do
            allow(r).to receive(:mods).and_return(mods_loc_phys_loc)
            subject.send(:add_location, sdb, solr_doc)
          end

          it "has the expected location '#{expected}'" do
            expect(solr_doc['location_ssi']).to eq expected
          end
        end

        context 'in /relatedItem/location/physicalLocation' do
          before do
            allow(r).to receive(:mods).and_return(mods_rel_item_loc_phys_loc)
            subject.send(:add_location, sdb, solr_doc)
          end

          it "has the expected location '#{expected}'" do
            expect(solr_doc['location_ssi']).to eq expected
          end
        end

        context 'with multiple physicalLocation elements' do
          before do
            allow(r).to receive(:mods).and_return(mods_loc_multiple_phys_loc)
            subject.send(:add_location, sdb, solr_doc)
          end

          it "has the expected location '#{expected}'" do
            expect(solr_doc['location_ssi']).to eq expected
          end
        end
      end # for example
    end # each
  end # add_location
  # rubocop:enable Metrics/LineLength

  # MODS fixture used by the 'in plain note' contexts of #add_folder_name.
  # NOTE(review): wrapping markup missing -- see header.
  let(:mods_note_plain) do
    Nokogiri::XML <<-EOF
      #{example}
    EOF
  end

  # MODS fixture used by the 'in preferred citation note' contexts of
  # #add_folder_name.
  # NOTE(review): wrapping markup missing -- see header. As written it is
  # indistinguishable from mods_note_plain.
  let(:mods_note_preferred_citation) do
    Nokogiri::XML <<-EOF
      #{example}
    EOF
  end

  # rubocop:disable Metrics/LineLength
  describe '#add_folder_name' do
    # example string as key, expected folder name as value
    # all from feigenbaum (or based on feigenbaum), as that is only coll with this data
    {
      'Call Number: SC0340, Accession: 1986-052, Box: 20, Folder: 40, Title: S': 'S',
      'Call Number: SC0340, Accession: 1986-052, Box: 54, Folder: 25, Title: Balzer': 'Balzer',
      'Call Number: SC0340, Accession: 1986-052, Box : 30, Folder: 21, Title: Feigenbaum, Publications. 2 of 2.': 'Feigenbaum, Publications. 2 of 2.',
      # colon in name
      'Call Number: SC0340, Accession 2005-101, Box: 10, Folder: 26, Title: Gordon Bell Letter rdf:about blah (AI) 1987': 'Gordon Bell Letter rdf:about blah (AI) 1987',
      'Call Number: SC0340, Accession 2005-101, Box: 11, Folder: 74, Title: Microcomputer Systems Proposal: blah blah': 'Microcomputer Systems Proposal: blah blah',
      'Call Number: SC0340, Accession 2005-101, Box: 14, Folder: 20, Title: blah "bleah: blargW^"ugh" seriously?.': 'blah "bleah: blargW^"ugh" seriously?.',
      # quotes in name
      'Call Number: SC0340, Accession 2005-101, Box: 29, Folder: 18, Title: "bleah" blah': '"bleah" blah',
      'Call Number: SC0340, Accession 2005-101, Box: 11, Folder: 58, Title: "M": blah': '"M": blah',
      'Call Number: SC0340, Accession 2005-101, Box : 32A, Folder: 19, Title: blah "bleah" blue': 'blah "bleah" blue',
      # not parseable
      'Call Number: SC0340, Accession 2005-101': nil,
      'Call Number: SC0340, Accession: 1986-052': nil,
      'Call Number: SC0340, Accession: 1986-052, Box 36 Folder 38': nil,
      'blah blah ... with the umbrella title Feigenbaum and Feldman, Computers and Thought II. blah blah': nil,
      'blah blah ... Title ... blah blah': nil
    }.each do |example, expected|
      describe "for example '#{example}'" do
        let(:example) { example }

        context 'in preferred citation note' do
          before do
            allow(r).to receive(:mods).and_return(mods_note_preferred_citation)
            subject.send(:add_folder_name, sdb, solr_doc)
          end

          it "has the expected folder name '#{expected}'" do
            expect(solr_doc['folder_name_ssi']).to eq expected
          end
        end

        context 'in plain note' do
          before do
            allow(r).to receive(:mods).and_return(mods_note_plain)
            subject.send(:add_folder_name, sdb, solr_doc)
          end

          it 'does not have a folder name' do
            expect(solr_doc['folder_name_ssi']).to be_falsey
          end
        end
      end # for example
    end # each
  end # add_folder_name
  # rubocop:enable Metrics/LineLength

  describe '#add_object_full_text' do
    let(:full_text_solr_fname) { 'full_text_tesimv' }

    before do
      allow(sdb).to receive(:bare_druid).and_return(fake_druid)
    end

    let!(:expected_text) { 'SOME full text string that is returned from the server' }
    let!(:full_file_path) { 'https://stacks.stanford.edu/file/oo000oo0000/oo000oo0000.txt' }

    it 'indexes the full text into the appropriate field if a recognized file pattern is found' do
      # NOTE(review): fixture body is empty in this copy -- see header.
      public_xml_with_feigenbaum_full_text = Nokogiri::XML <<-EOF
      EOF
      allow(sdb).to receive(:public_xml).and_return(public_xml_with_feigenbaum_full_text)
      # don't actually attempt a call to the stacks
      allow(subject).to receive(:get_file_content).with(full_file_path).and_return(expected_text)
      subject.send(:add_object_full_text, sdb, solr_doc)
      expect(subject.object_level_full_text_urls(sdb)).to eq [full_file_path]
      expect(solr_doc[full_text_solr_fname]).to eq [expected_text]
    end

    it 'does not index the full text if no recognized pattern is found' do
      # NOTE(review): fixture body is empty in this copy -- see header.
      public_xml_with_no_recognized_full_text = Nokogiri::XML <<-EOF
      EOF
      allow(sdb).to receive(:public_xml).and_return(public_xml_with_no_recognized_full_text)
      subject.send(:add_object_full_text, sdb, solr_doc)
      expect(subject.object_level_full_text_urls(sdb)).to eq []
      expect(solr_doc[full_text_solr_fname]).to be_nil
    end

    it 'indexes the full text from two files if two recognized patterns are found' do
      # NOTE(review): fixture body is empty in this copy -- see header.
      public_xml_with_two_recognized_full_text_files = Nokogiri::XML <<-EOF
      EOF
      allow(sdb).to receive(:public_xml).and_return(public_xml_with_two_recognized_full_text_files)
      allow(subject).to receive(:get_file_content).with(full_file_path).and_return(expected_text)
      subject.send(:add_object_full_text, sdb, solr_doc)
      expect(subject.object_level_full_text_urls(sdb)).to eq [full_file_path, full_file_path]
      expect(solr_doc[full_text_solr_fname]).to eq [expected_text, expected_text] # same file twice in a 2 element array
    end
  end # add_object_full_text
end