require 'spec_helper'
require 'yaml'

# Specs for Harvestdor::Indexer.
#
# A shared indexer is built once from config/ap.yml (relative to this spec's
# parent directory); examples that need blacklist/whitelist options build their
# own indexer.  Collaborators (the harvestdor client and the solr client) are
# replaced with message expectations, so the examples check which calls the
# indexer makes rather than talking to real services.
describe Harvestdor::Indexer do

  before(:all) do
    @config_yml_path = File.join(File.dirname(__FILE__), "..", "config", "ap.yml")
    @indexer = Harvestdor::Indexer.new(@config_yml_path)
    # Raw config values, used to compare against what the indexer was configured with.
    @yaml = YAML.load_file(@config_yml_path)
    # harvestdor_client is reached via send, i.e. without going through the public API.
    @hdor_client = @indexer.send(:harvestdor_client)
    @fake_druid = 'oo000oo0000'
    @blacklist_path = File.join(File.dirname(__FILE__), "../config/ap_blacklist.txt")
    @whitelist_path = File.join(File.dirname(__FILE__), "../config/ap_whitelist.txt")
  end

  describe "logging" do
    it "should write the log file to the directory indicated by log_dir" do
      @indexer.logger.info("indexer_spec logging test message")
      # File.exist? rather than File.exists?: the latter was removed in Ruby 3.2.
      File.exist?(File.join(@yaml['log_dir'], @yaml['log_name'])).should == true
    end
  end

  it "should initialize the harvestdor_client from the config" do
    @hdor_client.should be_an_instance_of(Harvestdor::Client)
    @hdor_client.config.default_set.should == @yaml['default_set']
  end

  context "harvest_and_index" do
    before(:all) do
      @doc_hash = {
        :id => @fake_druid
      }
    end

    it "should call druids_via_oai and then call :add on rsolr connection" do
      @hdor_client.should_receive(:druids_via_oai).and_return([@fake_druid])
      @indexer.solr_client.should_receive(:add).with(@doc_hash)
      @indexer.solr_client.should_receive(:commit)
      @indexer.harvest_and_index
    end

    it "should not process druids in blacklist" do
      indexer = Harvestdor::Indexer.new(@config_yml_path, {:blacklist => @blacklist_path})
      hdor_client = indexer.send(:harvestdor_client)
      hdor_client.should_receive(:druids_via_oai).and_return(['oo000oo0000', 'oo111oo1111', 'oo222oo2222', 'oo333oo3333'])
      indexer.solr_client.should_receive(:add).with(hash_including({:id => 'oo000oo0000'}))
      indexer.solr_client.should_not_receive(:add).with(hash_including({:id => 'oo111oo1111'}))
      indexer.solr_client.should_not_receive(:add).with(hash_including({:id => 'oo222oo2222'}))
      indexer.solr_client.should_receive(:add).with(hash_including({:id => 'oo333oo3333'}))
      indexer.solr_client.should_receive(:commit)
      indexer.harvest_and_index
    end

    it "should only process druids in whitelist if it exists" do
      indexer = Harvestdor::Indexer.new(@config_yml_path, {:whitelist => @whitelist_path})
      hdor_client = indexer.send(:harvestdor_client)
      hdor_client.should_not_receive(:druids_via_oai)
      indexer.solr_client.should_receive(:add).with(hash_including({:id => 'oo000oo0000'}))
      indexer.solr_client.should_receive(:add).with(hash_including({:id => 'oo222oo2222'}))
      indexer.solr_client.should_receive(:commit)
      indexer.harvest_and_index
    end

    it "should not process druid if it is in both blacklist and whitelist" do
      indexer = Harvestdor::Indexer.new(@config_yml_path, {:blacklist => @blacklist_path, :whitelist => @whitelist_path})
      hdor_client = indexer.send(:harvestdor_client)
      hdor_client.should_not_receive(:druids_via_oai)
      indexer.solr_client.should_receive(:add).with(hash_including({:id => 'oo000oo0000'}))
      indexer.solr_client.should_receive(:commit)
      indexer.harvest_and_index
    end

    it "should only call :commit on rsolr connection once" do
      indexer = Harvestdor::Indexer.new(@config_yml_path)
      hdor_client = indexer.send(:harvestdor_client)
      hdor_client.should_receive(:druids_via_oai).and_return(['1', '2', '3'])
      indexer.solr_client.should_receive(:add).exactly(3).times
      indexer.solr_client.should_receive(:commit).once
      indexer.harvest_and_index
    end
  end

  it "druids method should call druids_via_oai method on harvestdor_client" do
    @hdor_client.should_receive(:druids_via_oai)
    @indexer.druids
  end

  context "smods_rec method" do
    before(:all) do
      # NOTE(review): @ns_decl is never interpolated and @mods_xml contains no
      # markup; these fixtures look like they lost their XML element tags at
      # some point -- confirm against the intended MODS fixture.
      @ns_decl = "xmlns='#{Mods::MODS_NS}'"
      @mods_xml = "hi"
      @ng_mods_xml = Nokogiri::XML(@mods_xml)
    end

    it "should call mods method on harvestdor_client" do
      @hdor_client.should_receive(:mods).with(@fake_druid).and_return(@ng_mods_xml)
      @indexer.smods_rec(@fake_druid)
    end

    it "should return Stanford::Mods::Record object" do
      @hdor_client.should_receive(:mods).with(@fake_druid).and_return(@ng_mods_xml)
      @indexer.smods_rec(@fake_druid).should be_an_instance_of(Stanford::Mods::Record)
    end

    it "should raise exception if MODS xml for the druid is empty" do
      # NOTE(review): presumably this was an empty <mods/> element originally -- verify.
      @hdor_client.stub(:mods).with(@fake_druid).and_return(Nokogiri::XML(""))
      expect { @indexer.smods_rec(@fake_druid) }.to raise_error(RuntimeError, Regexp.new("^Empty MODS metadata for #{@fake_druid}: <"))
    end

    it "should raise exception if there is no MODS xml for the druid" do
      # No stub here: the client is left to perform its own lookup for the fake druid.
      expect { @indexer.smods_rec(@fake_druid) }.to raise_error(Harvestdor::Errors::MissingMods)
    end
  end

  context "public_xml related methods" do
    before(:all) do
      # NOTE(review): the examples below assert on element names (publicObject,
      # contentMetadata, identityMetadata, rightsMetadata, RDF) and attributes
      # that these tag-less strings cannot produce; the XML markup appears to
      # have been stripped from the fixtures -- confirm and restore.
      @id_md_xml = "druid:#{@fake_druid}"
      @cntnt_md_xml = "foo"
      @rights_md_xml = "bar"
      @rdf_xml = "relationship!"
      @pub_xml = "#{@id_md_xml}#{@cntnt_md_xml}#{@rights_md_xml}#{@rdf_xml}"
      @ng_pub_xml = Nokogiri::XML(@pub_xml)
    end

    context "#public_xml" do
      it "should call public_xml method on harvestdor_client" do
        @hdor_client.should_receive(:public_xml).with(@fake_druid).and_return(@ng_pub_xml)
        @indexer.public_xml @fake_druid
      end

      it "retrieves entire public xml as a Nokogiri::XML::Document" do
        @hdor_client.should_receive(:public_xml).with(@fake_druid).and_return(@ng_pub_xml)
        px = @indexer.public_xml @fake_druid
        px.should be_kind_of(Nokogiri::XML::Document)
        px.root.name.should == 'publicObject'
        px.root.attributes['id'].text.should == "druid:#{@fake_druid}"
      end

      it "raises exception if public xml for the druid is empty" do
        @hdor_client.should_receive(:public_xml).with(@fake_druid).and_return(Nokogiri::XML(""))
        expect { @indexer.public_xml(@fake_druid) }.to raise_error(RuntimeError, Regexp.new("^Empty public xml for #{@fake_druid}: <"))
      end

      it "raises Harvestdor::Errors::MissingPurlPage if there is no purl page for the druid" do
        expect { @indexer.public_xml(@fake_druid) }.to raise_error(Harvestdor::Errors::MissingPurlPage)
      end

      it "raises error if there is no public_xml page for the druid" do
        @hdor_client.should_receive(:public_xml).with(@fake_druid).and_return(nil)
        expect { @indexer.public_xml(@fake_druid) }.to raise_error(RuntimeError, "No public xml for #{@fake_druid}")
      end
    end

    context "#content_metadata" do
      it "returns a Nokogiri::XML::Document derived from the public xml" do
        Harvestdor.stub(:public_xml).with(@fake_druid, @indexer.config.purl).and_return(@ng_pub_xml)
        cm = @indexer.content_metadata(@fake_druid)
        cm.should be_kind_of(Nokogiri::XML::Document)
        cm.root.should_not == nil
        cm.root.name.should == 'contentMetadata'
        cm.root.attributes['objectId'].text.should == @fake_druid
        cm.root.text.strip.should == 'foo'
      end

      it "raises Harvestdor::Errors::MissingPurlPage if there is no purl page for the druid" do
        expect { @indexer.content_metadata(@fake_druid) }.to raise_error(Harvestdor::Errors::MissingPurlPage)
      end

      it "should raise exception if there is no contentMetadata in the public xml" do
        pub_xml = "#{@id_md_xml}"
        Harvestdor.stub(:public_xml).with(@fake_druid, @indexer.config.purl).and_return(Nokogiri::XML(pub_xml))
        expect { @indexer.content_metadata(@fake_druid) }.to raise_error(RuntimeError, "No contentMetadata for #{@fake_druid}")
      end

      it "raises RuntimeError if nil is returned by Harvestdor::Client.contentMetadata for the druid" do
        @hdor_client.should_receive(:content_metadata).with(@fake_druid).and_return(nil)
        expect { @indexer.content_metadata(@fake_druid) }.to raise_error(RuntimeError, "No contentMetadata for #{@fake_druid}")
      end

      it "raises MissingContentMetadata error if there is no contentMetadata in the public_xml for the druid" do
        URI::HTTP.any_instance.should_receive(:open)
        expect { @indexer.content_metadata(@fake_druid) }.to raise_error(Harvestdor::Errors::MissingContentMetadata)
      end
    end

    context "#identity_metadata" do
      it "returns a Nokogiri::XML::Document derived from the public xml" do
        Harvestdor.stub(:public_xml).with(@fake_druid, @indexer.config.purl).and_return(@ng_pub_xml)
        im = @indexer.identity_metadata(@fake_druid)
        im.should be_kind_of(Nokogiri::XML::Document)
        im.root.should_not == nil
        im.root.name.should == 'identityMetadata'
        im.root.text.strip.should == "druid:#{@fake_druid}"
      end

      it "raises Harvestdor::Errors::MissingPurlPage if there is no purl page for the druid" do
        expect { @indexer.identity_metadata(@fake_druid) }.to raise_error(Harvestdor::Errors::MissingPurlPage)
      end

      it "should raise exception if there is no identityMetadata in the public xml" do
        pub_xml = "#{@cntnt_md_xml}"
        Harvestdor.stub(:public_xml).with(@fake_druid, @indexer.config.purl).and_return(Nokogiri::XML(pub_xml))
        expect { @indexer.identity_metadata(@fake_druid) }.to raise_error(RuntimeError, "No identityMetadata for #{@fake_druid}")
      end

      it "raises RuntimeError if nil is returned by Harvestdor::Client.identityMetadata for the druid" do
        @hdor_client.should_receive(:identity_metadata).with(@fake_druid).and_return(nil)
        expect { @indexer.identity_metadata(@fake_druid) }.to raise_error(RuntimeError, "No identityMetadata for #{@fake_druid}")
      end

      it "raises MissingIdentityMetadata error if there is no identityMetadata in the public_xml for the druid" do
        URI::HTTP.any_instance.should_receive(:open)
        expect { @indexer.identity_metadata(@fake_druid) }.to raise_error(Harvestdor::Errors::MissingIdentityMetadata)
      end
    end

    context "#rights_metadata" do
      it "returns a Nokogiri::XML::Document derived from the public xml" do
        Harvestdor.stub(:public_xml).with(@fake_druid, @indexer.config.purl).and_return(@ng_pub_xml)
        im = @indexer.rights_metadata(@fake_druid)
        im.should be_kind_of(Nokogiri::XML::Document)
        im.root.should_not == nil
        im.root.name.should == 'rightsMetadata'
        im.root.text.strip.should == "bar"
      end

      it "raises Harvestdor::Errors::MissingPurlPage if there is no purl page for the druid" do
        expect { @indexer.rights_metadata(@fake_druid) }.to raise_error(Harvestdor::Errors::MissingPurlPage)
      end

      it "should raise exception if there is no rightsMetadata in the public xml" do
        pub_xml = "#{@cntnt_md_xml}"
        Harvestdor.stub(:public_xml).with(@fake_druid, @indexer.config.purl).and_return(Nokogiri::XML(pub_xml))
        expect { @indexer.rights_metadata(@fake_druid) }.to raise_error(RuntimeError, "No rightsMetadata for #{@fake_druid}")
      end

      it "raises RuntimeError if nil is returned by Harvestdor::Client.rightsMetadata for the druid" do
        @hdor_client.should_receive(:rights_metadata).with(@fake_druid).and_return(nil)
        expect { @indexer.rights_metadata(@fake_druid) }.to raise_error(RuntimeError, "No rightsMetadata for #{@fake_druid}")
      end

      it "raises MissingRightsMetadata error if there is no rightsMetadata in the public_xml for the druid" do
        URI::HTTP.any_instance.should_receive(:open)
        expect { @indexer.rights_metadata(@fake_druid) }.to raise_error(Harvestdor::Errors::MissingRightsMetadata)
      end
    end

    context "#rdf" do
      it "returns a Nokogiri::XML::Document derived from the public xml" do
        Harvestdor.stub(:public_xml).with(@fake_druid, @indexer.config.purl).and_return(@ng_pub_xml)
        im = @indexer.rdf(@fake_druid)
        im.should be_kind_of(Nokogiri::XML::Document)
        im.root.should_not == nil
        im.root.name.should == 'RDF'
        im.root.text.strip.should == "relationship!"
      end

      it "raises Harvestdor::Errors::MissingPurlPage if there is no purl page for the druid" do
        expect { @indexer.rdf(@fake_druid) }.to raise_error(Harvestdor::Errors::MissingPurlPage)
      end

      it "should raise exception if there is no rdf in the public xml" do
        pub_xml = "#{@cntnt_md_xml}"
        Harvestdor.stub(:public_xml).with(@fake_druid, @indexer.config.purl).and_return(Nokogiri::XML(pub_xml))
        expect { @indexer.rdf(@fake_druid) }.to raise_error(RuntimeError, "No RDF for #{@fake_druid}")
      end

      it "raises RuntimeError if nil is returned by Harvestdor::Client.rdf for the druid" do
        @hdor_client.should_receive(:rdf).with(@fake_druid).and_return(nil)
        expect { @indexer.rdf(@fake_druid) }.to raise_error(RuntimeError, "No RDF for #{@fake_druid}")
      end

      it "raises MissingRDF error if there is no rdf in the public_xml for the druid" do
        URI::HTTP.any_instance.should_receive(:open)
        expect { @indexer.rdf(@fake_druid) }.to raise_error(Harvestdor::Errors::MissingRDF)
      end
    end
  end

  context "blacklist" do
    it "should be an Array with an entry for each non-empty line in the file" do
      @indexer.send(:load_blacklist, @blacklist_path)
      @indexer.send(:blacklist).should be_an_instance_of(Array)
      @indexer.send(:blacklist).size.should == 2
    end

    it "should be empty Array if there was no blacklist config setting" do
      indexer = Harvestdor::Indexer.new(@config_yml_path)
      indexer.send(:blacklist).should == []
    end

    context "load_blacklist" do
      it "should not be called if there was no blacklist config setting" do
        indexer = Harvestdor::Indexer.new(@config_yml_path)
        indexer.should_not_receive(:load_blacklist)
        hdor_client = indexer.send(:harvestdor_client)
        hdor_client.should_receive(:druids_via_oai).and_return([@fake_druid])
        indexer.solr_client.should_receive(:add)
        indexer.solr_client.should_receive(:commit)
        indexer.harvest_and_index
      end

      it "should only try to load a blacklist once" do
        indexer = Harvestdor::Indexer.new(@config_yml_path, {:blacklist => @blacklist_path})
        # First call may read the file; the second must not open any file again.
        indexer.send(:blacklist)
        File.any_instance.should_not_receive(:open)
        indexer.send(:blacklist)
      end

      it "should log an error message and throw RuntimeError if it can't find the indicated blacklist file" do
        exp_msg = 'Unable to find list of druids at bad_path'
        indexer = Harvestdor::Indexer.new(@config_yml_path, {:blacklist => 'bad_path'})
        indexer.logger.should_receive(:fatal).with(exp_msg)
        expect { indexer.send(:load_blacklist, 'bad_path') }.to raise_error(exp_msg)
      end
    end
  end # blacklist

  context "whitelist" do
    it "should be an Array with an entry for each non-empty line in the file" do
      @indexer.send(:load_whitelist, @whitelist_path)
      @indexer.send(:whitelist).should be_an_instance_of(Array)
      @indexer.send(:whitelist).size.should == 2
    end

    it "should be empty Array if there was no whitelist config setting" do
      indexer = Harvestdor::Indexer.new(@config_yml_path)
      indexer.send(:whitelist).should == []
    end

    context "load_whitelist" do
      it "should not be called if there was no whitelist config setting" do
        indexer = Harvestdor::Indexer.new(@config_yml_path)
        indexer.should_not_receive(:load_whitelist)
        hdor_client = indexer.send(:harvestdor_client)
        hdor_client.should_receive(:druids_via_oai).and_return([@fake_druid])
        indexer.solr_client.should_receive(:add)
        indexer.solr_client.should_receive(:commit)
        indexer.harvest_and_index
      end

      it "should only try to load a whitelist once" do
        indexer = Harvestdor::Indexer.new(@config_yml_path, {:whitelist => @whitelist_path})
        # First call may read the file; the second must not open any file again.
        indexer.send(:whitelist)
        File.any_instance.should_not_receive(:open)
        indexer.send(:whitelist)
      end

      it "should log an error message and throw RuntimeError if it can't find the indicated whitelist file" do
        exp_msg = 'Unable to find list of druids at bad_path'
        indexer = Harvestdor::Indexer.new(@config_yml_path, {:whitelist => 'bad_path'})
        indexer.logger.should_receive(:fatal).with(exp_msg)
        expect { indexer.send(:load_whitelist, 'bad_path') }.to raise_error(exp_msg)
      end
    end
  end # whitelist

  it "solr_client should initialize the rsolr client using the options from the config" do
    # No yml path here: the solr settings come from an in-memory Confstruct configuration.
    indexer = Harvestdor::Indexer.new(nil, Confstruct::Configuration.new(:solr => { :url => 'http://localhost:2345', :a => 1 }))
    RSolr.should_receive(:connect).with(hash_including(:a => 1, :url => 'http://localhost:2345')).and_return('foo')
    indexer.solr_client
  end

end