require 'spec_helper' describe Spotlight::Dor::Indexer do subject { described_class.new } let(:fake_druid) { 'oo000oo0000' } let(:r) { Harvestdor::Indexer::Resource.new(double, fake_druid) } let(:sdb) { GDor::Indexer::SolrDocBuilder.new(r, Logger.new(StringIO.new)) } let(:solr_doc) { {} } before do # reduce log noise allow(r).to receive(:harvestdor_client) i = Harvestdor::Indexer.new i.logger.level = Logger::WARN allow(r).to receive(:indexer).and_return i end describe '#add_content_metadata_fields' do before do allow(r).to receive(:public_xml).and_return(public_xml) allow(sdb).to receive(:bare_druid).and_return(fake_druid) # stacks url calculations require the druid solr_doc[:id] = fake_druid subject.send(:add_content_metadata_fields, sdb, solr_doc) end context 'with a record without contentMetadata' do let(:public_xml) do Nokogiri::XML <<-EOF EOF end it 'is blank, except for the document id' do expect(solr_doc.except(:id)).to be_blank end end context 'with a record with contentMetadata' do let(:public_xml) do Nokogiri::XML <<-EOF EOF end it 'indexes the declared content metadata type' do expect(solr_doc['content_metadata_type_ssim']).to contain_exactly 'image' end it 'indexes the thumbnail information' do expect(solr_doc['content_metadata_first_image_file_name_ssm']).to contain_exactly 'bj356mh7176_00_0001' expect(solr_doc['content_metadata_first_image_width_ssm']).to contain_exactly '12967' expect(solr_doc['content_metadata_first_image_height_ssm']).to contain_exactly '22970' end it 'indexes the images' do stacks_base_url = 'https://stacks.stanford.edu/image/iiif/oo000oo0000%2Fbj356mh7176_00_0001' expect(solr_doc['content_metadata_image_iiif_info_ssm']).to include "#{stacks_base_url}/info.json" expect(solr_doc['thumbnail_square_url_ssm']).to include "#{stacks_base_url}/square/100,100/0/default.jpg" expect(solr_doc['thumbnail_url_ssm']).to include "#{stacks_base_url}/full/!400,400/0/default.jpg" expect(solr_doc['large_image_url_ssm']).to include "#{stacks_base_url}/full/pct:25/0/default.jpg" expect(solr_doc['full_image_url_ssm']).to include "#{stacks_base_url}/full/full/0/default.jpg" end end end context 'Feigbenbaum specific fields concern' do describe '#add_document_subtype' do before do allow(r).to receive(:mods).and_return(mods) subject.send(:add_document_subtype, sdb, solr_doc) end context 'with a record without document subtype' do let(:mods) do Nokogiri::XML <<-EOF (not a document subtype) a generic note EOF end it 'is blank' do expect(solr_doc['doc_subtype_ssi']).to be_blank end end context 'with a record with document subtype' do let(:mods) do Nokogiri::XML <<-EOF memorandums a generic note EOF end it 'extracts the doc subtypes' do expect(solr_doc['doc_subtype_ssi']).to eq('memorandums') end end end # doc subtype describe '#add_donor_tags' do before do allow(r).to receive(:mods).and_return(mods) subject.send(:add_donor_tags, sdb, solr_doc) end context 'with a record without donor tags' do let(:mods) do Nokogiri::XML <<-EOF (not a donor tag) EOF end it 'is blank' do expect(solr_doc['donor_tags_ssim']).to be_blank end end context 'with a record with donor tags' do let(:mods) do # e.g. from https://purl.stanford.edu/vw282gv1740 Nokogiri::XML <<-EOF Knowledge Systems Laboratory medical applications medical Applications (second word CAPPED) Publishing Stanford Stanford Computer Science Department EOF end it 'extracts the donor tags' do expect(solr_doc['donor_tags_ssim']).to contain_exactly 'Knowledge Systems Laboratory', 'Medical applications', 'Medical Applications (second word CAPPED)', 'Publishing', 'Stanford', 'Stanford Computer Science Department' end end end # donor tags # rubocop:disable Metrics/LineLength describe '#add_folder_name' do let(:mods_note_plain) do Nokogiri::XML <<-EOF #{example} EOF end let(:mods_note_preferred_citation) do Nokogiri::XML <<-EOF #{example} EOF end # example string as key, expected folder name as value # all from feigenbaum (or based on feigenbaum), as that is only coll with this data { 'Call Number: SC0340, Accession: 1986-052, Box: 20, Folder: 40, Title: S': 'S', 'Call Number: SC0340, Accession: 1986-052, Box: 54, Folder: 25, Title: Balzer': 'Balzer', 'Call Number: SC0340, Accession: 1986-052, Box : 30, Folder: 21, Title: Feigenbaum, Publications. 2 of 2.': 'Feigenbaum, Publications. 2 of 2.', # colon in name 'Call Number: SC0340, Accession 2005-101, Box: 10, Folder: 26, Title: Gordon Bell Letter rdf:about blah (AI) 1987': 'Gordon Bell Letter rdf:about blah (AI) 1987', 'Call Number: SC0340, Accession 2005-101, Box: 11, Folder: 74, Title: Microcomputer Systems Proposal: blah blah': 'Microcomputer Systems Proposal: blah blah', 'Call Number: SC0340, Accession 2005-101, Box: 14, Folder: 20, Title: blah "bleah: blargW^"ugh" seriously?.': 'blah "bleah: blargW^"ugh" seriously?.', # quotes in name 'Call Number: SC0340, Accession 2005-101, Box: 29, Folder: 18, Title: "bleah" blah': '"bleah" blah', 'Call Number: SC0340, Accession 2005-101, Box: 11, Folder: 58, Title: "M": blah': '"M": blah', 'Call Number: SC0340, Accession 2005-101, Box : 32A, Folder: 19, Title: blah "bleah" blue': 'blah "bleah" blue', # not parseable 'Call Number: SC0340, Accession 2005-101': nil, 'Call Number: SC0340, Accession: 1986-052': nil, 'Call Number: SC0340, Accession: 1986-052, Box 36 Folder 38': nil, 'blah blah ... with the umbrella title Feigenbaum and Feldman, Computers and Thought II. blah blah': nil, 'blah blah ... Title ... blah blah': nil }.each do |example, expected| describe "for example '#{example}'" do let(:example) { example } context 'in preferred citation note' do before do allow(r).to receive(:mods).and_return(mods_note_preferred_citation) subject.send(:add_folder_name, sdb, solr_doc) end it "has the expected folder name '#{expected}'" do expect(solr_doc['folder_name_ssi']).to eq expected end end context 'in plain note' do before do allow(r).to receive(:mods).and_return(mods_note_plain) subject.send(:add_folder_name, sdb, solr_doc) end it 'does not have a folder name' do expect(solr_doc['folder_name_ssi']).to be_falsey end end end # for example end # each end # add_folder_name # rubocop:enable Metrics/LineLength describe '#add_general_notes' do before do allow(r).to receive(:mods).and_return(mods) subject.send(:add_general_notes, sdb, solr_doc) end context 'no general notes, but other types of notes' do let(:mods) do Nokogiri::XML <<-EOF (not a document subtype) memorandums Knowledge Systems Laboratory EOF end it 'is blank' do expect(solr_doc['general_notes_ssim']).to be_blank end end context 'ignore extra notes' do let(:mods) do Nokogiri::XML <<-EOF memorandums a generic note EOF end it 'extracts the doc subtypes' do expect(solr_doc['general_notes_ssim']).to contain_exactly 'a generic note' end end end # general notes end # feigbenbaum specific fields concern context 'StanfordMods concern' do describe '#add_author_no_collector' do before do allow(r).to receive(:mods).and_return(mods) subject.send(:add_author_no_collector, sdb, solr_doc) end let(:name) { 'Macro Hamster' } let(:mods) do Nokogiri::XML <<-EOF #{name} cre Ignored col EOF end it 'populates author_no_collector_ssim field in solr doc' do expect(solr_doc['author_no_collector_ssim']).to eq [name] end it 'calls non_collector_person_authors on Stanford::Mods::Record object' do expect(sdb.smods_rec).to receive(:non_collector_person_authors) subject.send(:add_author_no_collector, sdb, solr_doc) end end describe '#add_box' do before do allow(r).to receive(:mods).and_return(mods) subject.send(:add_box, sdb, solr_doc) end context 'with a record without a box' do let(:mods) do Nokogiri::XML <<-EOF EOF end it 'is blank' do expect(solr_doc['box_ssi']).to be_blank end end context 'with a record with a box' do let(:mods) do # e.g. from https://purl.stanford.edu/vw282gv1740 Nokogiri::XML <<-EOF Series 1, Box 10, Folder 8 EOF end it 'extracts the box' do expect(solr_doc['box_ssi']).to eq('10') end end end # add_box describe '#add_collector' do before do allow(r).to receive(:mods).and_return(mods) subject.send(:add_collector, sdb, solr_doc) end let(:name) { 'Macro Hamster' } let(:mods) do Nokogiri::XML <<-EOF #{name} col EOF end it 'populates collector_ssim field in solr doc' do expect(solr_doc['collector_ssim']).to eq [name] end it 'calls collectors_w_dates on Stanford::Mods::Record object' do expect(sdb.smods_rec).to receive(:collectors_w_dates) subject.send(:add_collector, sdb, solr_doc) end end describe '#add_coordinates' do before do allow(r).to receive(:mods).and_return(mods) subject.send(:add_coordinates, sdb, solr_doc) end context 'with a record without coordinates' do let(:mods) do Nokogiri::XML <<-EOF EOF end it 'is blank' do expect(solr_doc['coordinates_tesim']).to be_blank end end context 'with a record with coordinates' do let(:mods) do # e.g. from https://purl.stanford.edu/vw282gv1740 Nokogiri::XML <<-EOF Scale 1:500,000 (W16°--E28°/N13°--S15°). EOF end it 'extracts the coordinates' do expect(solr_doc['coordinates_tesim']).to eq(['(W16°--E28°/N13°--S15°).']) end end end # add_coordinates describe '#add_folder' do before do allow(r).to receive(:mods).and_return(mods) subject.send(:add_folder, sdb, solr_doc) end context 'with a record without a folder' do let(:mods) do Nokogiri::XML <<-EOF EOF end it 'is blank' do expect(solr_doc['folder_ssi']).to be_blank end end context 'with a record with a folder' do let(:mods) do # e.g. from https://purl.stanford.edu/vw282gv1740 Nokogiri::XML <<-EOF Series 1, Box 10, Folder 8 EOF end it 'extracts the folder' do expect(solr_doc['folder_ssi']).to eq('8') end end end # add_folder describe '#add_genre' do before do allow(r).to receive(:mods).and_return(mods) subject.send(:add_genre, sdb, solr_doc) end context 'with a record without a genre' do let(:mods) do Nokogiri::XML <<-EOF EOF end it 'is blank' do expect(solr_doc['genre_ssim']).to be_blank end end context 'with a record with a genre' do let(:mods) do # e.g. from https://purl.stanford.edu/vw282gv1740 Nokogiri::XML <<-EOF manuscripts for publication EOF end it 'extracts the genre' do expect(solr_doc['genre_ssim']).to contain_exactly 'manuscripts for publication' end end end describe '#add_location' do before do allow(r).to receive(:mods).and_return(mods) subject.send(:add_location, sdb, solr_doc) end context 'with a record without a location' do let(:mods) do Nokogiri::XML <<-EOF EOF end it 'is blank' do expect(solr_doc['location_ssi']).to be_blank end end context 'with a record with a location' do let(:mods) do # e.g. from https://purl.stanford.edu/vw282gv1740 Nokogiri::XML <<-EOF Series 1, Box 10, Folder 8 EOF end it 'extracts the location' do expect(solr_doc['location_ssi']).to eq('Series 1, Box 10, Folder 8') end end end # add_location describe '#add_point_bbox' do before do allow(r).to receive(:mods).and_return(mods) subject.send(:add_point_bbox, sdb, solr_doc) end context 'with a record without coordinates' do let(:mods) do Nokogiri::XML <<-EOF EOF end it 'is blank' do expect(solr_doc['point_bbox']).to be_blank end end context 'with a record with coordinates' do let(:mods) do # e.g. from https://purl.stanford.edu/vw282gv1740 Nokogiri::XML <<-EOF Scale 1:500,000 (W16°--E28°/N13°--S15°). EOF end it 'extracts the point_bbox' do expect(solr_doc['point_bbox']).to eq(['ENVELOPE(-16.0, 28.0, 13.0, -15.0)']) end end end # add_point_bbox describe '#add_series' do before do allow(r).to receive(:mods).and_return(mods) subject.send(:add_series, sdb, solr_doc) end context 'with a record without a series' do let(:mods) do Nokogiri::XML <<-EOF EOF end it 'is blank' do expect(solr_doc['series_ssi']).to be_blank end end context 'with a record with a series' do let(:mods) do # e.g. from https://purl.stanford.edu/vw282gv1740 Nokogiri::XML <<-EOF Series 1, Box 10, Folder 8 EOF end it 'extracts the series' do expect(solr_doc['series_ssi']).to eq('1') end end end # add_series end # context StanfordMods concern context 'Full Text Indexing concern' do describe '#add_object_full_text' do let(:full_text_solr_fname) { 'full_text_tesimv' } before do allow(sdb).to receive(:bare_druid).and_return(fake_druid) end let!(:expected_text) { 'SOME full text string that is returned from the server' } let!(:full_file_path) { 'https://stacks.stanford.edu/file/oo000oo0000/oo000oo0000.txt' } it 'indexes the full text into the appropriate field if a recognized file pattern is found' do public_xml_with_feigenbaum_full_text = Nokogiri::XML <<-EOF EOF allow(sdb).to receive(:public_xml).and_return(public_xml_with_feigenbaum_full_text) # don't actually attempt a call to the stacks allow(subject).to receive(:get_file_content).with(full_file_path).and_return(expected_text) subject.send(:add_object_full_text, sdb, solr_doc) expect(subject.object_level_full_text_urls(sdb)).to eq [full_file_path] expect(solr_doc[full_text_solr_fname]).to eq [expected_text] end it 'does not index the full text if no recognized pattern is found' do public_xml_with_no_recognized_full_text = Nokogiri::XML <<-EOF EOF allow(sdb).to receive(:public_xml).and_return(public_xml_with_no_recognized_full_text) subject.send(:add_object_full_text, sdb, solr_doc) expect(subject.object_level_full_text_urls(sdb)).to eq [] expect(solr_doc[full_text_solr_fname]).to be_nil end it 'indexes the full text from two files if two recognized patterns are found' do public_xml_with_two_recognized_full_text_files = Nokogiri::XML <<-EOF EOF allow(sdb).to receive(:public_xml).and_return(public_xml_with_two_recognized_full_text_files) allow(subject).to receive(:get_file_content).with(full_file_path).and_return(expected_text) subject.send(:add_object_full_text, sdb, solr_doc) expect(subject.object_level_full_text_urls(sdb)).to eq [full_file_path, full_file_path] expect(solr_doc[full_text_solr_fname]).to eq [expected_text, expected_text] # same file twice in a 2 element array end end # add_object_full_text end # full text indexing concern end