require 'yaml' describe GDor::Indexer do before(:all) do @config_yml_path = File.join(File.dirname(__FILE__), '..', 'config', 'walters_integration_spec.yml') @yaml = YAML.load_file(@config_yml_path) @ns_decl = "xmlns='#{Mods::MODS_NS}'" @fake_druid = 'oo000oo0000' @coll_druid_from_test_config = 'ww121ss5000' @mods_xml = "Indexer test" @ng_mods_xml = Nokogiri::XML("Indexer test") @pub_xml = "" @ng_pub_xml = Nokogiri::XML("") end before do @indexer = described_class.new(@config_yml_path) do |config| config.whitelist = ['druid:ww121ss5000'] end allow(@indexer.solr_client).to receive(:add) end let :resource do r = Harvestdor::Indexer::Resource.new(double, @fake_druid) allow(r).to receive(:collections).and_return [] allow(r).to receive(:mods).and_return Nokogiri::XML(@mods_xml) allow(r).to receive(:public_xml).and_return Nokogiri::XML(@pub_xml) allow(r).to receive(:public_xml?).and_return true allow(r).to receive(:content_metadata).and_return nil allow(r).to receive(:collection?).and_return false i = Harvestdor::Indexer.new i.logger.level = Logger::WARN allow(r).to receive(:indexer).and_return i r end let :collection do r = Harvestdor::Indexer::Resource.new(double, @coll_druid_from_test_config) allow(r).to receive(:collections).and_return [] allow(r).to receive(:mods).and_return Nokogiri::XML(@mods_xml) allow(r).to receive(:public_xml).and_return Nokogiri::XML(@pub_xml) allow(r).to receive(:public_xml?).and_return true allow(r).to receive(:content_metadata).and_return nil allow(r).to receive(:identity_md_obj_label).and_return '' allow(r).to receive(:collection?).and_return true i = Harvestdor::Indexer.new i.logger.level = Logger::WARN allow(r).to receive(:indexer).and_return i r end context 'logging' do it 'writes the log file to the directory indicated by log_dir' do @indexer.logger.info('walters_integration_spec logging test message') expect(File).to exist(File.join(@yaml['harvestdor']['log_dir'], @yaml['harvestdor']['log_name'])) end it 'logger level defaults to INFO' do expect(@indexer.logger.level).to eq Logger::INFO end it 'logger level can be specified in config field' do indexer = described_class.new(@config_yml_path) do |config| config.log_level = 'debug' end expect(indexer.logger.level).to eq Logger::DEBUG indexer = described_class.new(@config_yml_path) do |config| config.log_level = 'warn' end expect(indexer.logger.level).to eq Logger::WARN end end describe '#harvest_and_index' do before do allow(@indexer.harvestdor).to receive(:each_resource) allow(@indexer).to receive(:solr_client).and_return(double(commit!: nil)) allow(@indexer).to receive(:log_results) allow(@indexer).to receive(:email_results) end it 'logs and email results' do expect(@indexer).to receive(:log_results) expect(@indexer).to receive(:email_results) @indexer.harvest_and_index end it 'indexes each resource' do allow(@indexer).to receive(:harvestdor).and_return(Class.new do def initialize(*items) @items = items end def each_resource(_opts = {}) @items.each { |x| yield x } end def logger lgr = Logger.new(StringIO.new) lgr.level = Logger::WARN lgr end end.new(collection, resource)) expect(@indexer).to receive(:index).with(collection) expect(@indexer).to receive(:index).with(resource) @indexer.harvest_and_index end it 'sends a solr commit' do expect(@indexer.solr_client).to receive(:commit!) @indexer.harvest_and_index end it 'does not commit if nocommit is set' do expect(@indexer.solr_client).not_to receive(:commit!) @indexer.harvest_and_index(true) end end describe '#index' do it 'indexes collections as collections' do expect(@indexer).to receive(:collection_solr_document).with(collection) @indexer.index collection end it 'indexes other resources as items' do expect(@indexer).to receive(:item_solr_document).with(resource) @indexer.index resource end end describe '#index_with_exception_handling' do it 'captures log and re-raises any exception thrown by the indexing process' do expect(@indexer).to receive(:index).with(resource).and_raise 'xyz' expect(@indexer.logger).to receive(:error) expect { @indexer.index_with_exception_handling(resource) }.to raise_error RuntimeError expect(@indexer.druids_failed_to_ix).to include resource.druid end end context '#item_solr_document' do it 'calls Harvestdor::Indexer.solr_add' do doc_hash = @indexer.item_solr_document(resource) expect(doc_hash).to include id: @fake_druid end it 'calls validate_item' do expect_any_instance_of(GDor::Indexer::SolrDocHash).to receive(:validate_item).and_return([]) @indexer.item_solr_document resource end it 'calls GDor::Indexer::SolrDocBuilder.validate_mods' do allow_any_instance_of(GDor::Indexer::SolrDocHash).to receive(:validate_item).and_return([]) expect_any_instance_of(GDor::Indexer::SolrDocHash).to receive(:validate_mods).and_return([]) @indexer.item_solr_document resource end it 'calls add_coll_info' do expect(@indexer).to receive(:add_coll_info) @indexer.item_solr_document resource end it 'has fields populated from the collection record' do sdb = double allow(sdb).to receive(:doc_hash).and_return(GDor::Indexer::SolrDocHash.new) allow(sdb).to receive(:display_type) allow(sdb).to receive(:file_ids) allow(sdb.doc_hash).to receive(:validate_mods).and_return([]) allow(GDor::Indexer::SolrDocBuilder).to receive(:new).and_return(sdb) allow(resource).to receive(:collections).and_return([double(druid: 'foo', bare_druid: 'foo', identity_md_obj_label: 'bar')]) doc_hash = @indexer.item_solr_document resource expect(doc_hash).to include druid: @fake_druid, collection: ['foo'], collection_with_title: ['foo-|-bar'] end it 'has fields populated from the MODS' do title = 'fake title in mods' ng_mods = Nokogiri::XML("#{title}") allow(resource).to receive(:mods).and_return(ng_mods) doc_hash = @indexer.item_solr_document resource expect(doc_hash).to include id: @fake_druid, title_display: title end it 'populates url_fulltext field with purl page url' do doc_hash = @indexer.item_solr_document resource expect(doc_hash).to include id: @fake_druid, url_fulltext: "#{@yaml['harvestdor']['purl']}/#{@fake_druid}" end it 'populates druid and access_facet fields' do doc_hash = @indexer.item_solr_document resource expect(doc_hash).to include id: @fake_druid, druid: @fake_druid, access_facet: 'Online' end it 'populates display_type field by calling display_type method' do expect_any_instance_of(GDor::Indexer::SolrDocBuilder).to receive(:display_type).and_return('foo') doc_hash = @indexer.item_solr_document resource expect(doc_hash).to include id: @fake_druid, display_type: 'foo' end it 'populates file_id field by calling file_ids method' do expect_any_instance_of(GDor::Indexer::SolrDocBuilder).to receive(:file_ids).at_least(1).times.and_return(['foo']) doc_hash = @indexer.item_solr_document resource expect(doc_hash).to include id: @fake_druid, file_id: ['foo'] end it 'populates building_facet field with Stanford Digital Repository' do doc_hash = @indexer.item_solr_document resource expect(doc_hash).to include id: @fake_druid, building_facet: 'Stanford Digital Repository' end end # item_solr_document context '#collection_solr_document' do let(:doc_hash) { GDor::Indexer::SolrDocHash.new } it 'calls validate_collection' do allow_any_instance_of(GDor::Indexer::SolrDocBuilder).to receive(:doc_hash).and_return(doc_hash) # speed up the test expect(doc_hash).to receive(:validate_collection).and_return([]) @indexer.collection_solr_document collection end it 'calls GDor::Indexer::SolrDocBuilder.validate_mods' do allow_any_instance_of(GDor::Indexer::SolrDocBuilder).to receive(:doc_hash).and_return(doc_hash) # speed up the test expect(doc_hash).to receive(:validate_mods).and_return([]) @indexer.collection_solr_document collection end it 'populates druid and access_facet fields' do doc_hash = @indexer.collection_solr_document collection expect(doc_hash).to include druid: @coll_druid_from_test_config, access_facet: 'Online' end it 'populates url_fulltext field with purl page url' do doc_hash = @indexer.collection_solr_document collection expect(doc_hash).to include url_fulltext: "#{@yaml['harvestdor']['purl']}/#{@coll_druid_from_test_config}" end it "collection_type should be 'Digital Collection'" do allow_any_instance_of(GDor::Indexer::SolrDocBuilder).to receive(:doc_hash).and_return(GDor::Indexer::SolrDocHash.new) # speed up the test doc_hash = @indexer.collection_solr_document collection expect(doc_hash).to include collection_type: 'Digital Collection' end context 'add format_main_ssim Archive/Manuscript' do it 'no other values' do allow_any_instance_of(GDor::Indexer::SolrDocBuilder).to receive(:doc_hash).and_return(GDor::Indexer::SolrDocHash.new) doc_hash = @indexer.collection_solr_document collection expect(doc_hash).to include format_main_ssim: 'Archive/Manuscript' end it 'other values present' do allow_any_instance_of(GDor::Indexer::SolrDocBuilder).to receive(:doc_hash).and_return(GDor::Indexer::SolrDocHash.new({ format_main_ssim: %w(Image Video) })) doc_hash = @indexer.collection_solr_document collection expect(doc_hash).to include format_main_ssim: ['Image', 'Video', 'Archive/Manuscript'] end it 'already has values Archive/Manuscript' do allow_any_instance_of(GDor::Indexer::SolrDocBuilder).to receive(:doc_hash).and_return(GDor::Indexer::SolrDocHash.new({ format_main_ssim: 'Archive/Manuscript' })) doc_hash = @indexer.collection_solr_document collection expect(doc_hash).to include format_main_ssim: ['Archive/Manuscript'] end end it 'populates building_facet field with Stanford Digital Repository' do doc_hash = @indexer.collection_solr_document collection expect(doc_hash).to include building_facet: 'Stanford Digital Repository' end end # index_coll_obj_per_config context '#add_coll_info and supporting methods' do before do @coll_druids_array = [collection] end let(:doc_hash) { GDor::Indexer::SolrDocHash.new({}) } it 'adds no collection field values to doc_hash if there are none' do @indexer.add_coll_info(doc_hash, nil) expect(doc_hash[:collection]).to be_nil expect(doc_hash[:collection_with_title]).to be_nil expect(doc_hash[:display_type]).to be_nil end context 'collection field' do it 'is added field to doc hash' do @indexer.add_coll_info(doc_hash, @coll_druids_array) expect(doc_hash[:collection]).to match_array [@coll_druid_from_test_config] end it 'adds two values to doc_hash when object belongs to two collections' do coll_druid1 = 'oo111oo2222' coll_druid2 = 'oo333oo4444' doc_hash = GDor::Indexer::SolrDocHash.new({}) @indexer.add_coll_info(doc_hash, [double(druid: coll_druid1, bare_druid: coll_druid1, public_xml: @ng_pub_xml, identity_md_obj_label: ''), double(druid: coll_druid2, bare_druid: coll_druid2, public_xml: @ng_pub_xml, identity_md_obj_label: '')]) expect(doc_hash[:collection]).to match_array [coll_druid1, coll_druid2] end end context 'collection_with_title field' do it 'is added to doc_hash' do coll_druid = 'oo000oo1234' doc_hash = GDor::Indexer::SolrDocHash.new({}) @indexer.add_coll_info(doc_hash, [double(druid: coll_druid, bare_druid: coll_druid, public_xml: @ng_pub_xml, identity_md_obj_label: 'zzz')]) expect(doc_hash[:collection_with_title]).to match_array ["#{coll_druid}-|-zzz"] end it 'adds two values to doc_hash when object belongs to two collections' do coll_druid1 = 'oo111oo2222' coll_druid2 = 'oo333oo4444' @indexer.add_coll_info(doc_hash, [double(druid: coll_druid1, bare_druid: coll_druid1, public_xml: @ng_pub_xml, identity_md_obj_label: 'foo'), double(druid: coll_druid2, bare_druid: coll_druid2, public_xml: @ng_pub_xml, identity_md_obj_label: 'bar')]) expect(doc_hash[:collection_with_title]).to match_array ["#{coll_druid1}-|-foo", "#{coll_druid2}-|-bar"] end end context '#coll_display_types_from_items' do before do @indexer.coll_display_types_from_items(collection) end it 'gets single item display_type for single collection (and no dups)' do allow(@indexer).to receive(:identity_md_obj_label) doc_hash = GDor::Indexer::SolrDocHash.new({ display_type: 'image' }) @indexer.add_coll_info(doc_hash, @coll_druids_array) doc_hash = GDor::Indexer::SolrDocHash.new({ display_type: 'image' }) @indexer.add_coll_info(doc_hash, @coll_druids_array) expect(@indexer.coll_display_types_from_items(collection)).to match_array ['image'] end it 'gets multiple formats from multiple items for single collection' do allow(@indexer).to receive(:identity_md_obj_label) doc_hash = GDor::Indexer::SolrDocHash.new({ display_type: 'image' }) @indexer.add_coll_info(doc_hash, @coll_druids_array) doc_hash = GDor::Indexer::SolrDocHash.new({ display_type: 'file' }) @indexer.add_coll_info(doc_hash, @coll_druids_array) expect(@indexer.coll_display_types_from_items(collection)).to match_array %w(image file) end end # coll_display_types_from_items end # add_coll_info context '#num_found_in_solr' do before do @collection_response = { 'response' => { 'numFound' => '1', 'docs' => [{ 'id' => 'dm212rn7381', 'url_fulltext' => ['https://purl.stanford.edu/dm212rn7381'] }] } } @item_response = { 'response' => { 'numFound' => '265', 'docs' => [{ 'id' => 'dm212rn7381' }] } } end it 'counts the items and the collection object in the solr index after indexing' do allow(@indexer.solr_client.client).to receive(:get) do |_wt, params| if params[:params][:fq].include?('id:"dm212rn7381"') @collection_response else @item_response end end expect(@indexer.num_found_in_solr(collection: 'dm212rn7381')).to eq(266) end end # num_found_in_solr context '#email_report_body' do before do @indexer.config.notification = 'notification-list@example.com' allow(@indexer).to receive(:num_found_in_solr).and_return(500) allow(@indexer.harvestdor).to receive(:resources).and_return([collection]) allow(collection).to receive(:items).and_return([1, 2, 3]) allow(collection).to receive(:identity_md_obj_label).and_return('testcoll title') end subject do @indexer.email_report_body end it 'email body includes coll id' do expect(subject).to match(/testcoll indexed coll record is: ww121ss5000/) end it 'email body includes coll title' do expect(subject).to match(/coll title: testcoll title/) end it 'email body includes failed to index druids' do @indexer.instance_variable_set(:@druids_failed_to_ix, %w(a b)) expect(subject).to match(/records that may have failed to index: \na\nb\n\n/) end it 'email body include validation messages' do @indexer.instance_variable_set(:@validation_messages, instance_double(File, rewind: 0, read: 'this is a validation message')) expect(subject).to match /this is a validation message/ end it 'email includes reference to full log' do expect(subject).to match(/full log is at gdor_indexer\/shared\/spec\/test_logs\/testcoll\.log/) end end describe '#email_results' do before do @indexer.config.notification = 'notification-list@example.com' allow(@indexer).to receive(:send_email) allow(@indexer).to receive(:email_report_body).and_return('Report Body') end it 'has an appropriate subject' do expect(@indexer).to receive(:send_email) do |_to, opts| expect(opts[:subject]).to match(/is finished/) end @indexer.email_results end it 'sends the email to the notification list' do expect(@indexer).to receive(:send_email) do |to, _opts| expect(to).to eq @indexer.config.notification end @indexer.email_results end it 'has the report body' do expect(@indexer).to receive(:send_email) do |_to, opts| expect(opts[:body]).to eq 'Report Body' end @indexer.email_results end end describe '#send_email' do it 'sends an email to the right list' do expect_any_instance_of(Mail::Message).to receive(:deliver!) do |mail| expect(mail.to).to match_array ['notification-list@example.com'] end @indexer.send_email 'notification-list@example.com', {} end it 'has the appropriate options set' do expect_any_instance_of(Mail::Message).to receive(:deliver!) do |mail| expect(mail.subject).to eq 'Subject' expect(mail.from).to match_array ['rspec'] expect(mail.body).to eq 'Body' end @indexer.send_email 'notification-list@example.com', { from: 'rspec', subject: 'Subject', body: 'Body' } end end describe '#solr_client' do it 'defaults to the harvestdor-configured client' do expect(@indexer.solr_client).to eq @indexer.harvestdor.solr end it 'can be set as an option' do solr_client = double @indexer = described_class.new(solr_client: solr_client) expect(@indexer.solr_client).to eq solr_client end end # context "skip heartbeat" do # it "allows use of a fake url for dor-fetcher-client" do # expect {GDor::Indexer.new(@config_yml_path)}.not_to raise_error # end # end end