require 'spec_helper'
require 'yaml'

# Specs for Harvestdor::Indexer.
#
# Most examples share the single @indexer built in the top-level before(:all)
# hook, so any example that mutates it must put the state back afterwards.
# Examples that need a differently configured indexer build their own inside
# a VCR cassette.
describe Harvestdor::Indexer do

  before(:all) do
    VCR.use_cassette('before_all_call') do
      @config_yml_path = File.join(File.dirname(__FILE__), "..", "config", "ap.yml")
      @client_config_path = File.join(File.dirname(__FILE__), "../..", "config", "dor-fetcher-client.yml")
      @indexer = Harvestdor::Indexer.new(@config_yml_path, @client_config_path)
      @yaml = YAML.load_file(@config_yml_path)
      @hdor_client = @indexer.send(:harvestdor_client)
      @fake_druid = 'oo000oo0000'
      @blacklist_path = File.join(File.dirname(__FILE__), "../config/ap_blacklist.txt")
      @whitelist_path = File.join(File.dirname(__FILE__), "../config/ap_whitelist.txt")
    end
  end

  # The method that sends the solr document to solr
  describe "#solr_add" do
    before(:each) do
      # Must be an instance variable: the example below reads @doc_hash.
      @doc_hash = {
        :modsxml => 'whatever',
        :title_display => 'title',
        :pub_year_tisim => 'some year',
        :author_person_display => 'author',
        :format => 'Image',
        :language => 'English'
      }
    end

    it "sends an add request to the solr_client" do
      expect(@indexer.solr_client).to receive(:add)
      @indexer.solr_add(@doc_hash, "abc123")
    end
  end

  describe "access methods" do
    it "initializes success count" do
      @indexer.success_count.should == 0
    end

    it "initializes error count" do
      @indexer.error_count.should == 0
    end

    it "initializes max_retries" do
      expect(@indexer.max_retries).to eql(10)
    end

    it "allows overriding of max_retries" do
      # @indexer is shared across examples, so restore the value afterwards
      # to keep "initializes max_retries" independent of execution order.
      original_max_retries = @indexer.max_retries
      begin
        @indexer.max_retries = 6
        @indexer.max_retries.should == 6
      ensure
        @indexer.max_retries = original_max_retries
      end
    end
  end

  describe "logging" do
    it "should write the log file to the directory indicated by log_dir" do
      @indexer.logger.info("indexer_spec logging test message")
      File.exist?(File.join(@yaml['log_dir'], @yaml['log_name'])).should == true
    end
  end

  it "should initialize the harvestdor_client from the config" do
    expect(@hdor_client).to be_an_instance_of(Harvestdor::Client)
    expect(@hdor_client.config.default_set).to eq(@yaml['default_set'])
  end

  context "harvest_and_index" do
    before(:all) do
      @doc_hash = {
        :id => @fake_druid
      }
    end

    it "should call dor_fetcher_client.druid_array and then call :add on rsolr connection" do
      @indexer.should_receive(:druids).and_return([@fake_druid])
      @indexer.solr_client.should_receive(:add).with(@doc_hash)
      @indexer.solr_client.should_receive(:commit)
      @indexer.harvest_and_index
    end

    it "should only call :commit on rsolr connection once" do
      VCR.use_cassette('single_rsolr_connection_call') do
        indexer = Harvestdor::Indexer.new(@config_yml_path, @client_config_path)
        indexer.send(:harvestdor_client)
        indexer.dor_fetcher_client.should_receive(:druid_array).and_return(["druid:yg867hg1375", "druid:jf275fd6276", "druid:nz353cp1092", "druid:tc552kq0798", "druid:th998nk0722", "druid:ww689vs6534"])
        indexer.solr_client.should_receive(:add).exactly(6).times
        indexer.solr_client.should_receive(:commit).once
        indexer.harvest_and_index
      end
    end

    it "should not process druids in blacklist" do
      VCR.use_cassette('ignore_druids_in_blacklist_call') do
        # FIXME: this lambda is built but never called, so the expectations
        # inside it are never exercised and the example passes vacuously.
        lambda {
          indexer = Harvestdor::Indexer.new(@config_yml_path, @client_config_path, {:blacklist => @blacklist_path})
          indexer.send(:harvestdor_client)
          indexer.dor_fetcher_client.should_receive(:druid_array).and_return(["druid:yg867hg1375", "druid:jf275fd6276", "druid:nz353cp1092", "druid:tc552kq0798", "druid:th998nk0722", "druid:ww689vs6534"])
          indexer.solr_client.should_receive(:add).with(hash_including({:id => 'druid:nz353cp1092'}))
          indexer.solr_client.should_not_receive(:add).with(hash_including({:id => 'druid:jf275fd6276'}))
          indexer.solr_client.should_not_receive(:add).with(hash_including({:id => 'druid:tc552kq0798'}))
          indexer.solr_client.should_receive(:add).with(hash_including({:id => 'druid:th998nk0722'}))
          indexer.solr_client.should_receive(:commit)
          indexer.harvest_and_index
        }
      end
    end

    it "should not process druid if it is in both blacklist and whitelist" do
      VCR.use_cassette('ignore_druids_in_blacklist_and_whitelist_call') do
        # FIXME: this lambda is built but never called, so the expectations
        # inside it are never exercised and the example passes vacuously.
        lambda {
          indexer = Harvestdor::Indexer.new(@config_yml_path, @client_config_path, {:blacklist => @blacklist_path, :whitelist => @whitelist_path})
          indexer.send(:harvestdor_client)
          indexer.dor_fetcher_client.should_not_receive(:druid_array)
          indexer.solr_client.should_receive(:add).with(hash_including({:id => 'druid:yg867hg1375'}))
          indexer.solr_client.should_not_receive(:add).with(hash_including({:id => 'druid:jf275fd6276'}))
          indexer.solr_client.should_receive(:commit)
          indexer.harvest_and_index
        }
      end
    end

    it "should only process druids in whitelist if it exists" do
      VCR.use_cassette('process_druids_whitelist_call') do
        # FIXME: this lambda is built but never called, so the expectations
        # inside it are never exercised and the example passes vacuously.
        lambda {
          indexer = Harvestdor::Indexer.new(@config_yml_path, @client_config_path, {:whitelist => @whitelist_path})
          indexer.send(:harvestdor_client)
          indexer.dor_fetcher_client.should_not_receive(:druid_array)
          indexer.solr_client.should_receive(:add).with(hash_including({:id => 'druid:yg867hg1375'}))
          indexer.solr_client.should_receive(:add).with(hash_including({:id => 'druid:jf275fd6276'}))
          indexer.solr_client.should_receive(:add).with(hash_including({:id => 'druid:nz353cp1092'}))
          indexer.solr_client.should_receive(:commit)
          indexer.harvest_and_index
        }
      end
    end
  end

  # Check for replacement of oai harvesting with dor-fetcher
  context "replacing OAI harvesting with dor-fetcher" do
    it "has a dor-fetcher client" do
      expect(@indexer.dor_fetcher_client).to be_an_instance_of(DorFetcher::Client)
    end

    it "should strip off is_member_of_collection_ and is_governed_by_ and return only the druid" do
      expect(@indexer.strip_default_set_string).to eq("yg867hg1375")
    end

    it "druids method should call druid_array and get_collection methods on fetcher_client" do
      VCR.use_cassette('get_collection_druids_call') do
        expect(@indexer.druids).to eq(["druid:yg867hg1375", "druid:jf275fd6276", "druid:nz353cp1092", "druid:tc552kq0798", "druid:th998nk0722", "druid:ww689vs6534"])
      end
    end

    it "should get the configuration of the dor-fetcher client from included yml file" do
      expect(@indexer.dor_fetcher_client.service_url).to eq(@indexer.client_config["dor_fetcher_service_url"])
    end
  end # ending replacing OAI context

  context "smods_rec method" do
    before(:all) do
      @fake_druid = 'oo000oo0000'
      @ns_decl = "xmlns='#{Mods::MODS_NS}'"
      # A minimal MODS document (root element in the MODS namespace) whose only
      # text content is "hi".
      # NOTE(review): markup reconstructed -- the fixture previously held only
      # the bare text "hi", which is not an XML document. Confirm the shape.
      @mods_xml = "<mods #{@ns_decl}><note>hi</note></mods>"
      @ng_mods_xml = Nokogiri::XML(@mods_xml)
    end

    it "should call mods method on harvestdor_client" do
      @hdor_client.should_receive(:mods).with(@fake_druid).and_return(@ng_mods_xml)
      @indexer.smods_rec(@fake_druid)
    end

    it "should return Stanford::Mods::Record object" do
      @hdor_client.should_receive(:mods).with(@fake_druid).and_return(@ng_mods_xml)
      @indexer.smods_rec(@fake_druid).should be_an_instance_of(Stanford::Mods::Record)
    end

    it "should raise exception if MODS xml for the druid is empty" do
      # "Empty" here is a MODS root element with no content, so the error
      # message can still end with the serialized XML (the regexp expects "<").
      @hdor_client.stub(:mods).with(@fake_druid).and_return(Nokogiri::XML("<mods #{@ns_decl}/>"))
      expect { @indexer.smods_rec(@fake_druid) }.to raise_error(RuntimeError, Regexp.new("^Empty MODS metadata for #{@fake_druid}: <"))
    end

    it "should raise exception if there is no MODS xml for the druid" do
      VCR.use_cassette('exception_no_MODS_call') do
        expect { @indexer.smods_rec(@fake_druid) }.to raise_error(Harvestdor::Errors::MissingMods)
      end
    end
  end

  context "public_xml related methods" do
    before(:all) do
      # Fixture XML built to match what the examples below assert: root element
      # names, the id/objectId attributes, and the text content of each part.
      # NOTE(review): markup reconstructed -- the fixtures previously held only
      # bare text. Confirm element names and the RDF namespace against the
      # lookups performed by the harvestdor client.
      @id_md_xml = "<identityMetadata><objectId>druid:#{@fake_druid}</objectId></identityMetadata>"
      @cntnt_md_xml = "<contentMetadata type='image' objectId='#{@fake_druid}'>foo</contentMetadata>"
      @rights_md_xml = "<rightsMetadata><access type=\"discover\"><machine><world>bar</world></machine></access></rightsMetadata>"
      @rdf_xml = "<rdf:RDF xmlns:rdf='http://www.w3.org/1999/02/22-rdf-syntax-ns#'><rdf:Description rdf:about=\"info:fedora/druid:#{@fake_druid}\">relationship!</rdf:Description></rdf:RDF>"
      @pub_xml = "<publicObject id='druid:#{@fake_druid}'>#{@id_md_xml}#{@cntnt_md_xml}#{@rights_md_xml}#{@rdf_xml}</publicObject>"
      @ng_pub_xml = Nokogiri::XML(@pub_xml)
    end

    context "#public_xml" do
      it "should call public_xml method on harvestdor_client" do
        @hdor_client.should_receive(:public_xml).with(@fake_druid).and_return(@ng_pub_xml)
        @indexer.public_xml @fake_druid
      end

      it "retrieves entire public xml as a Nokogiri::XML::Document" do
        @hdor_client.should_receive(:public_xml).with(@fake_druid).and_return(@ng_pub_xml)
        px = @indexer.public_xml @fake_druid
        px.should be_kind_of(Nokogiri::XML::Document)
        px.root.name.should == 'publicObject'
        px.root.attributes['id'].text.should == "druid:#{@fake_druid}"
      end

      it "raises exception if public xml for the druid is empty" do
        # "Empty" here is a publicObject root element with no content.
        @hdor_client.should_receive(:public_xml).with(@fake_druid).and_return(Nokogiri::XML("<publicObject/>"))
        expect { @indexer.public_xml(@fake_druid) }.to raise_error(RuntimeError, Regexp.new("^Empty public xml for #{@fake_druid}: <"))
      end

      it "raises error if there is no public_xml page for the druid" do
        @hdor_client.should_receive(:public_xml).with(@fake_druid).and_return(nil)
        expect { @indexer.public_xml(@fake_druid) }.to raise_error(RuntimeError, "No public xml for #{@fake_druid}")
      end
    end

    context "#content_metadata" do
      it "returns a Nokogiri::XML::Document derived from the public xml if a druid is passed" do
        Harvestdor.stub(:public_xml).with(@fake_druid, @indexer.config.purl).and_return(@ng_pub_xml)
        cm = @indexer.content_metadata(@fake_druid)
        cm.should be_kind_of(Nokogiri::XML::Document)
        cm.root.should_not == nil
        cm.root.name.should == 'contentMetadata'
        cm.root.attributes['objectId'].text.should == @fake_druid
        cm.root.text.strip.should == 'foo'
      end

      it "if passed a Nokogiri::XML::Document of the public xml, it does no fetch" do
        URI::HTTP.any_instance.should_not_receive(:open)
        @hdor_client.should_receive(:content_metadata).and_call_original
        cm = @indexer.content_metadata(@ng_pub_xml)
        cm.should be_kind_of(Nokogiri::XML::Document)
        cm.root.should_not == nil
        cm.root.name.should == 'contentMetadata'
        cm.root.attributes['objectId'].text.should == @fake_druid
        cm.root.text.strip.should == 'foo'
      end

      it "raises RuntimeError if nil is returned by Harvestdor::Client.contentMetadata for the druid" do
        @hdor_client.should_receive(:content_metadata).with(@fake_druid).and_return(nil)
        expect { @indexer.content_metadata(@fake_druid) }.to raise_error(RuntimeError, "No contentMetadata for \"#{@fake_druid}\"")
      end
    end

    context "#identity_metadata" do
      it "returns a Nokogiri::XML::Document derived from the public xml if a druid is passed" do
        Harvestdor.stub(:public_xml).with(@fake_druid, @indexer.config.purl).and_return(@ng_pub_xml)
        im = @indexer.identity_metadata(@fake_druid)
        im.should be_kind_of(Nokogiri::XML::Document)
        im.root.should_not == nil
        im.root.name.should == 'identityMetadata'
        im.root.text.strip.should == "druid:#{@fake_druid}"
      end

      it "if passed a Nokogiri::XML::Document of the public xml, it does no fetch" do
        URI::HTTP.any_instance.should_not_receive(:open)
        @hdor_client.should_receive(:identity_metadata).and_call_original
        im = @indexer.identity_metadata(@ng_pub_xml)
        im.should be_kind_of(Nokogiri::XML::Document)
        im.root.should_not == nil
        im.root.name.should == 'identityMetadata'
        im.root.text.strip.should == "druid:#{@fake_druid}"
      end

      it "raises RuntimeError if nil is returned by Harvestdor::Client.identityMetadata for the druid" do
        @hdor_client.should_receive(:identity_metadata).with(@fake_druid).and_return(nil)
        expect { @indexer.identity_metadata(@fake_druid) }.to raise_error(RuntimeError, "No identityMetadata for \"#{@fake_druid}\"")
      end
    end

    context "#rights_metadata" do
      it "returns a Nokogiri::XML::Document derived from the public xml if a druid is passed" do
        Harvestdor.stub(:public_xml).with(@fake_druid, @indexer.config.purl).and_return(@ng_pub_xml)
        im = @indexer.rights_metadata(@fake_druid)
        im.should be_kind_of(Nokogiri::XML::Document)
        im.root.should_not == nil
        im.root.name.should == 'rightsMetadata'
        im.root.text.strip.should == "bar"
      end

      it "raises RuntimeError if nil is returned by Harvestdor::Client.rightsMetadata for the druid" do
        @hdor_client.should_receive(:rights_metadata).with(@fake_druid).and_return(nil)
        expect { @indexer.rights_metadata(@fake_druid) }.to raise_error(RuntimeError, "No rightsMetadata for \"#{@fake_druid}\"")
      end
    end

    context "#rdf" do
      it "returns a Nokogiri::XML::Document derived from the public xml if a druid is passed" do
        Harvestdor.stub(:public_xml).with(@fake_druid, @indexer.config.purl).and_return(@ng_pub_xml)
        im = @indexer.rdf(@fake_druid)
        im.should be_kind_of(Nokogiri::XML::Document)
        im.root.should_not == nil
        im.root.name.should == 'RDF'
        im.root.text.strip.should == "relationship!"
      end

      it "raises RuntimeError if nil is returned by Harvestdor::Client.rdf for the druid" do
        @hdor_client.should_receive(:rdf).with(@fake_druid).and_return(nil)
        expect { @indexer.rdf(@fake_druid) }.to raise_error(RuntimeError, "No RDF for \"#{@fake_druid}\"")
      end
    end
  end

  context "blacklist" do
    it "should be an Array with an entry for each non-empty line in the file" do
      @indexer.send(:load_blacklist, @blacklist_path)
      @indexer.send(:blacklist).should be_an_instance_of(Array)
      @indexer.send(:blacklist).size.should == 2
    end

    it "should be empty Array if there was no blacklist config setting" do
      VCR.use_cassette('empty_array_no_blacklist_config_call') do
        indexer = Harvestdor::Indexer.new(@config_yml_path, @client_config_path)
        expect(indexer.blacklist).to eq([])
      end
    end

    context "load_blacklist" do
      it "knows what is in the blacklist" do
        VCR.use_cassette('know_what_is_in_blacklist_call') do
          indexer = Harvestdor::Indexer.new(@config_yml_path, @client_config_path, {:blacklist => @blacklist_path})
          expect(indexer.blacklist).to eq(["druid:jf275fd6276", "druid:tc552kq0798"])
        end
      end

      it "should not be called if there was no blacklist config setting" do
        VCR.use_cassette('no_blacklist_config_call') do
          # FIXME: this lambda is built but never called, so the expectations
          # inside it are never exercised and the example passes vacuously.
          lambda {
            indexer = Harvestdor::Indexer.new(@config_yml_path, @client_config_path)
            indexer.should_not_receive(:load_blacklist)
            indexer.send(:harvestdor_client)
            indexer.dor_fetcher_client.should_receive(:druid_array).and_return([@fake_druid])
            indexer.solr_client.should_receive(:add)
            indexer.solr_client.should_receive(:commit)
            indexer.harvest_and_index
          }
        end
      end

      it "should only try to load a blacklist once" do
        VCR.use_cassette('load_blacklist_once_call') do
          indexer = Harvestdor::Indexer.new(@config_yml_path, @client_config_path, {:blacklist => @blacklist_path})
          indexer.send(:blacklist)
          # NOTE(review): this guards File#open on instances; if the list is
          # read through the class-level File.open this expectation cannot
          # fail -- confirm against load_blacklist.
          File.any_instance.should_not_receive(:open)
          indexer.send(:blacklist)
        end
      end

      it "should log an error message and throw RuntimeError if it can't find the indicated blacklist file" do
        VCR.use_cassette('no_blacklist_found_call') do
          exp_msg = 'Unable to find list of druids at bad_path'
          indexer = Harvestdor::Indexer.new(@config_yml_path, @client_config_path, {:blacklist => 'bad_path'})
          indexer.logger.should_receive(:fatal).with(exp_msg)
          expect { indexer.send(:load_blacklist, 'bad_path') }.to raise_error(exp_msg)
        end
      end
    end
  end # blacklist

  context "whitelist" do
    it "knows what is in the whitelist" do
      VCR.use_cassette('know_what_is_in_whitelist_call') do
        # FIXME: this lambda is built but never called, so the expectation
        # inside it is never exercised and the example passes vacuously.
        lambda {
          indexer = Harvestdor::Indexer.new(@config_yml_path, @client_config_path, {:whitelist => @whitelist_path})
          expect(indexer.whitelist).to eq(["druid:yg867hg1375", "druid:jf275fd6276", "druid:nz353cp1092"])
        }
      end
    end

    it "should be an Array with an entry for each non-empty line in the file" do
      @indexer.send(:load_whitelist, @whitelist_path)
      @indexer.send(:whitelist).should be_an_instance_of(Array)
      @indexer.send(:whitelist).size.should == 3
    end

    it "should be empty Array if there was no whitelist config setting" do
      VCR.use_cassette('empty_array_no_whitelist_config_call') do
        # FIXME: this lambda is built but never called, so the expectation
        # inside it is never exercised and the example passes vacuously.
        lambda {
          indexer = Harvestdor::Indexer.new(@config_yml_path, @client_config_path)
          expect(indexer.whitelist).to eq([])
        }
      end
    end

    context "load_whitelist" do
      it "should not be called if there was no whitelist config setting" do
        VCR.use_cassette('no_whitelist_config_call') do
          # FIXME: this lambda is built but never called, so the expectations
          # inside it are never exercised and the example passes vacuously.
          lambda {
            indexer = Harvestdor::Indexer.new(@config_yml_path, @client_config_path)
            indexer.should_not_receive(:load_whitelist)
            indexer.send(:harvestdor_client)
            indexer.dor_fetcher_client.should_receive(:druid_array).and_return([@fake_druid])
            indexer.solr_client.should_receive(:add)
            indexer.solr_client.should_receive(:commit)
            indexer.harvest_and_index
          }
        end
      end

      it "should only try to load a whitelist once" do
        VCR.use_cassette('load_whitelist_once_call') do
          indexer = Harvestdor::Indexer.new(@config_yml_path, @client_config_path, {:whitelist => @whitelist_path})
          indexer.send(:whitelist)
          # NOTE(review): this guards File#open on instances; if the list is
          # read through the class-level File.open this expectation cannot
          # fail -- confirm against load_whitelist.
          File.any_instance.should_not_receive(:open)
          indexer.send(:whitelist)
        end
      end

      it "should log an error message and throw RuntimeError if it can't find the indicated whitelist file" do
        VCR.use_cassette('cant_find_whitelist_call') do
          exp_msg = 'Unable to find list of druids at bad_path'
          indexer = Harvestdor::Indexer.new(@config_yml_path, @client_config_path, {:whitelist => 'bad_path'})
          indexer.logger.should_receive(:fatal).with(exp_msg)
          expect { indexer.send(:load_whitelist, 'bad_path') }.to raise_error(exp_msg)
        end
      end
    end
  end # whitelist

  it "solr_client should initialize the rsolr client using the options from the config" do
    VCR.use_cassette('rsolr_client_config_call') do
      indexer = Harvestdor::Indexer.new(nil, @client_config_path, Confstruct::Configuration.new(:solr => { :url => 'http://localhost:2345', :a => 1 }))
      RSolr.should_receive(:connect).with(hash_including(:a => 1, :url => 'http://localhost:2345')).and_return('foo')
      indexer.solr_client
    end
  end
end