# -*- coding: utf-8 -*-
require 'spec_helper'
describe Krikri::Harvesters::OAIHarvester do
# Minimal harvester options: only the OAI endpoint URI is supplied.
# Several nested groups below redefine `args` to add an :oai options hash.
let(:args) { { uri: 'http://example.org/endpoint' } }
# Harvester under test, built from whichever `args` is in scope.
subject { described_class.new(args) }
# The harvester exposes an OAI::Client through #client.
it 'has a client' do
  expect(subject.client).to be_a_kind_of(OAI::Client)
end
context 'with connection' do
# Stubs every OAI-PMH interaction used by the examples in this context:
# the identifier listing is stubbed directly on the client object, while
# GetRecord and ListRecords are stubbed at the HTTP layer with WebMock.
before do
# TODO: webmock ListIdentifiers, test lazy resumption
# Build one OAI::Header per id in 10..110. Each identifier ends with the id
# left-padded with zeros to ten characters (e.g. ...MAN0000000010).
records = (10..110).map do |id|
element = REXML::Element.new
element.add_element REXML::Element.new('identifier').add_text(
'oai:oaipmh.huygens.knaw.nl:arthurianfiction:MAN' +
id.to_s.rjust(10, '0'))
OAI::Header.new(element)
end
# client.list_identifiers.full returns the headers built above.
allow(subject.client).to receive_message_chain(:list_identifiers, :full)
.and_return(records)
# TODO: better way of maintaining example OAI record results?
# GetRecord -- Single record OAI Request
# NOTE(review): the response bodies in this block show no XML markup; it
# looks like tags were lost from the fixtures -- confirm against the
# canonical spec file.
stub_request(:get,
'http://example.org/endpoint?identifier='\
'oai:oaipmh.huygens.knaw.nl:arthurianfiction:MAN0000000010'\
'&metadataPrefix=oai_dc&verb=GetRecord')
.with(:headers => {
'Accept' => '*/*',
'Accept-Encoding' => 'gzip;q=1.0,deflate;q=0.6,identity;q=0.3'
})
.to_return(:status => 200,
:body => '2014-10-27T22:19:17Zhttp://oaipmh.huygens.knaw.nl/oai:oaipmh.huygens.knaw.nl:arthurianfiction:MAN00000000102012-07-13T14:27:31Zarthurianfiction:manuscriptarthurianfiction
Aberystwyth, National Library of Wales, 446-E
Bart Besamusca
https://service.arthurianfiction.org/manuscript/MAN0000000010
2012-07-13T14:27:31Z
Bart Besamusca
model
eng
',
:headers => {})
# ListRecords -- Multiple record OAI Request (w/ resumption)
# The last fixture line before EOM is the same token that appears as
# `resumptionToken` in the "resumed" stub URL further down.
# NOTE(review): the assignment opens with a bare `<` yet is closed by EOM;
# presumably a heredoc opener was lost -- confirm.
response_body = <2014-10-27T23:05:33Zhttp://oaipmh.huygens.knaw.nl/oai:oaipmh.huygens.knaw.nl:arthurianfiction:MAN00000000102012-07-13T14:27:31Zarthurianfiction:manuscriptarthurianfiction
Aberystwyth, National Library of Wales, 446-E
Bart Besamusca
https://service.arthurianfiction.org/manuscript/MAN0000000010
2012-07-13T14:27:31Z
Bart Besamusca
model
eng
oai:oaipmh.huygens.knaw.nl:arthurianfiction:MAN00000000112012-07-13T14:27:31Zarthurianfictionarthurianfiction:manuscript
Aberystwyth, National Library of Wales, 5018-D
Bart Besamusca
https://service.arthurianfiction.org/manuscript/MAN0000000011
2012-07-13T14:27:31Z
Bart Besamusca
model
eng
oai:oaipmh.huygens.knaw.nl:arthurianfiction:MAN00000000122012-07-13T14:27:31Zarthurianfictionarthurianfiction:manuscript
Aberystwyth, National Library of Wales, 445-D
Bart Besamusca
https://service.arthurianfiction.org/manuscript/MAN0000000012
2012-07-13T14:27:31Z
Bart Besamusca
model
eng
http://cdm16694.contentdm.oclc.org/oai/oai.php
oai:cdm16694.contentdm.oclc.org:R6A001/1
2015-01-07
http://www.openarchives.org/OAI/2.0/
oai:oaipmh.huygens.knaw.nl:arthurianfiction:MAN00000000132012-07-13T14:27:31Zarthurianfiction:manuscriptarthurianfiction
Aberystwyth, National Library of Wales, 5667 E
Bart Besamusca
https://service.arthurianfiction.org/manuscript/MAN0000000013
2012-07-13T14:27:31Z
Bart Besamusca
model
eng
MToxMHwyOnwzOnw0Onw1Om9haV9kYw==
EOM
# Initial ListRecords request using the oai_dc metadata prefix.
stub_request(:get,
'http://example.org/endpoint?metadataPrefix=oai_dc&verb='\
'ListRecords')
.with(:headers => {
'Accept' => '*/*',
'Accept-Encoding' => 'gzip;q=1.0,deflate;q=0.6,identity;q=0.3'
})
.to_return(:status => 200,
:body => response_body,
:headers => {})
# with set
# Same request shape, but for metadataPrefix=mods and set=moomin; the
# response body is reassigned before being used by the stub below.
response_body = <2014-10-27T23:05:33Zhttp://oaipmh.huygens.knaw.nl/oai:oaipmh.huygens.knaw.nl:arthurianfiction:MAN00000000102012-07-13T14:27:31Zarthurianfiction:manuscriptarthurianfiction
Aberystwyth, National Library of Wales, 446-E
Bart Besamusca
https://service.arthurianfiction.org/manuscript/MAN0000000010
2012-07-13T14:27:31Z
Bart Besamusca
model
eng
oai:oaipmh.huygens.knaw.nl:arthurianfiction:MAN00000000112012-07-13T14:27:31Zarthurianfictionarthurianfiction:manuscript
Aberystwyth, National Library of Wales, 5018-D
Bart Besamusca
https://service.arthurianfiction.org/manuscript/MAN0000000011
2012-07-13T14:27:31Z
Bart Besamusca
model
eng
oai:oaipmh.huygens.knaw.nl:arthurianfiction:MAN00000000122012-07-13T14:27:31Zarthurianfictionarthurianfiction:manuscript
Aberystwyth, National Library of Wales, 445-D
Bart Besamusca
https://service.arthurianfiction.org/manuscript/MAN0000000012
2012-07-13T14:27:31Z
Bart Besamusca
model
eng
http://cdm16694.contentdm.oclc.org/oai/oai.php
oai:cdm16694.contentdm.oclc.org:R6A001/1
2015-01-07
http://www.openarchives.org/OAI/2.0/
oai:oaipmh.huygens.knaw.nl:arthurianfiction:MAN00000000132012-07-13T14:27:31Zarthurianfiction:manuscriptarthurianfiction
Aberystwyth, National Library of Wales, 5667 E
Bart Besamusca
https://service.arthurianfiction.org/manuscript/MAN0000000013
2012-07-13T14:27:31Z
Bart Besamusca
model
eng
MToxMHwyOnwzOnw0Onw1Om9haV9kYw==
EOM
stub_request(:get,
'http://example.org/endpoint?metadataPrefix=mods&set=moomin&' \
'verb=ListRecords')
.with(:headers => {
'Accept' => '*/*',
'Accept-Encoding' => 'gzip;q=1.0,deflate;q=0.6,identity;q=0.3'
})
.to_return(:status => 200,
:body => response_body,
:headers => {})
# ListRecords -- Multiple record OAI Request (resumed)
# Follow-up request carrying the resumption token from the pages above.
stub_request(:get,
'http://example.org/endpoint?resumptionToken='\
'MToxMHwyOnwzOnw0Onw1Om9haV9kYw==&verb=ListRecords')
.with(:headers => {
'Accept' => '*/*',
'Accept-Encoding' => 'gzip;q=1.0,deflate;q=0.6,identity;q=0.3'
})
.to_return(:status => 200,
:body => '2014-10-27T23:05:33Zhttp://oaipmh.huygens.knaw.nl/oai:oaipmh.huygens.knaw.nl:arthurianfiction:MAN00000000102012-07-13T14:27:31Zarthurianfiction:manuscriptarthurianfiction
Aberystwyth, National Library of Wales, 446-E
Bart Besamusca
https://service.arthurianfiction.org/manuscript/MAN0000000010
2012-07-13T14:27:31Z
Bart Besamusca
model
eng
oai:oaipmh.huygens.knaw.nl:arthurianfiction:MAN00000000112012-07-13T14:27:31Zarthurianfictionarthurianfiction:manuscript
Aberystwyth, National Library of Wales, 5018-D
Bart Besamusca
https://service.arthurianfiction.org/manuscript/MAN0000000011
2012-07-13T14:27:31Z
Bart Besamusca
model
eng
',
:headers => {})
end
# Parses the first harvested record's content with Nokogiri's STRICT parse
# option and expects the parse to complete without raising.
it 'produces valid xml' do
  strict_parse = lambda do
    Nokogiri::XML(subject.records.first.content) do |config|
      config.options = Nokogiri::XML::ParseOptions::STRICT
    end
  end

  expect(&strict_parse).not_to raise_error
end
# The first record's content must contain at least one <header> element in
# the document's default namespace (addressed via the `xmlns:` prefix).
it 'has oai namespace and header' do
  header_nodes = Nokogiri::XML(subject.records.first.content)
                 .xpath('//xmlns:header')

  expect(header_nodes).not_to be_empty
end
# Every low-level HTTP attempt times out; the adapter must be hit at least
# four times before the timeout finally surfaces as Faraday::TimeoutError.
it 'retries timed out requests' do
  expect_any_instance_of(Faraday::Adapter::NetHttp)
    .to receive(:perform_request)
    .at_least(4).times
    .and_raise(Net::ReadTimeout.new)

  expect { subject.records.first }.to raise_error(Faraday::TimeoutError)
end
# With every HTTP attempt timing out, Rails.logger must receive at least
# four info messages before the error propagates to the caller.
it 'logs failed requests' do
  allow_any_instance_of(Faraday::Adapter::NetHttp)
    .to receive(:perform_request)
    .and_raise(Net::ReadTimeout.new)

  expect(Rails.logger).to receive(:info).at_least(4).times
  expect { subject.records.first }.to raise_error(Faraday::TimeoutError)
end
# Checks when the harvester issues the follow-up (resumed) ListRecords call.
describe 'resumption' do
  # URL of the request that carries the resumption token.
  let(:resumed_uri) do
    'http://example.org/endpoint?resumptionToken=' \
      'MToxMHwyOnwzOnw0Onw1Om9haV9kYw==&verb=ListRecords'
  end

  it 'follows resumption token' do
    # Walk the whole enumeration so that every page gets requested.
    subject.records.each { |record| record }

    expect(WebMock).to have_requested(:get, resumed_uri).once
  end

  it 'only follows resumption token as far as requested' do
    # Consuming only four records must not trigger the resumed request.
    first_four = subject.records.take(4)
    first_four.each { |record| record }

    expect(WebMock).not_to have_requested(:get, resumed_uri)
  end
end
describe '#sets' do
# Stubs the ListSets request made by #sets.
# NOTE(review): the fixture below has no visible XML markup and the
# assignment opens with a bare `<` yet is closed by EOM; presumably the
# heredoc opener and tags were lost -- confirm against the canonical file.
# It appears to list three set spec/name pairs, matching the three sets
# the examples in this group expect.
before do
response_body = <
2015-02-21T17:07:46Z
http://fedora.digitalcommonwealth.org/oaiprovider/
commonwealth-oai:1r66j283x
History of the Academy
commonwealth-oai:1r66j2871
List of Vessels Belonging to the District of Gloucester
commonwealth-oai:wp988k074
NOBLE Collection
EOM
# Serve the fixture for GET <endpoint>?verb=ListSets.
stub_request(:get, "http://example.org/endpoint?verb=ListSets").
with(:headers => {'Accept'=>'*/*', 'Accept-Encoding'=>'gzip;q=1.0,deflate;q=0.6,identity;q=0.3'}).
to_return(:status => 200,
:body => response_body,
:headers => {})
end
# #sets must yield exactly three OAI::Set objects for the stubbed response.
it 'returns sets' do
  three_sets = Array.new(3) { an_instance_of(OAI::Set) }

  expect(subject.sets).to contain_exactly(*three_sets)
end
# When given a block (here Symbol#to_proc for :spec), #sets must return
# exactly three String results.
it 'passes block to sets' do
  three_strings = Array.new(3) { an_instance_of(String) }

  expect(subject.sets(&:spec)).to contain_exactly(*three_strings)
end
end
describe 'options' do
# Stand-in for whatever the OAI client returns from a request.
let(:result) { double }

# Harvester options carrying a default OAI metadata prefix.
let(:args) do
  { uri: 'http://example.org/endpoint', oai: { metadata_prefix: 'mods' } }
end

# Extra per-request options passed by the examples below.
let(:request_opts) { { set: 'moomin' } }
# Makes the `result` double answer #full with an empty list.
shared_context 'oai options' do
  before { allow(result).to receive(:full).and_return([]) }
end
# Examples shared by the list-style request methods. Including groups must
# define `request_type` (the client method expected to be called) and
# `method` (the harvester method under test).
shared_examples 'send options' do
  it 'sends request with option' do
    expect(subject.client).to receive(request_type)
      .with(metadata_prefix: args[:oai][:metadata_prefix])
      .and_return(result)

    subject.send(method).first
  end

  it 'adds options passed into request' do
    expect(subject.client).to receive(request_type)
      .with(metadata_prefix: args[:oai][:metadata_prefix],
            set: request_opts[:set])
      .and_return(result)

    subject.send(method, request_opts).first
  end

  context 'with multiple sets' do
    before { args[:oai][:set] = %w[moomin moomin] }

    it 'skips sets when specific sets are selected' do
      # Expected request options: the configured ones minus :set.
      opts = args[:oai].reject { |key, _value| key == :set }

      expect(subject.client).to receive(request_type)
        .with(opts)
        .and_return(result)

      subject.send(method, { skip_set: 'moomin' }).first
    end

    it 'skips sets from full list when none given' do
      args[:oai].delete(:set)
      opts = args[:oai].merge(set: 'valid')
      allow(subject).to receive(:sets).and_return(%w[valid moomin])

      expect(subject.client).to receive(request_type)
        .with(opts)
        .and_return(result)

      subject.send(method, { skip_set: 'moomin' }).first
    end

    it 'skips sets that error' do
      args[:oai].delete(:set)
      invalid = args[:oai].merge(set: 'invalid')
      valid = args[:oai].merge(set: 'valid')
      allow(subject).to receive(:sets).and_return(%w[invalid valid moomin])

      # The request for the first set raises; the next one must still be made.
      expect(subject.client).to receive(request_type)
        .with(invalid)
        .and_raise(OAI::Exception, '')
      expect(subject.client).to receive(request_type)
        .with(valid)
        .and_return(result)

      subject.send(method, { skip_set: 'moomin' }).first
    end
  end
end
# Runs the shared option examples against #records / list_records and adds
# a check that records from several configured sets are concatenated.
describe '#records' do
  let(:request_type) { :list_records }
  let(:method) { :records }

  include_context 'oai options'
  include_examples 'send options'

  it 'combines sets' do
    args[:oai][:set] = %w[moomin moomin]
    single_set = subject.records(set: 'moomin').to_a

    # The same set configured twice should yield its records twice over.
    expect(subject.records.to_a).to eq(single_set + single_set)
  end
end
# Runs the shared option examples against #record_ids / list_identifiers.
describe 'record_ids' do
  let(:request_type) { :list_identifiers }
  let(:method) { :record_ids }

  include_context 'oai options'
  include_examples 'send options'
end
# Verifies the options #get_record forwards to the client's get_record.
describe '#get_record' do
  let(:identifier) { 'comet_moominland' }
  let(:request_type) { :get_record }
  let(:oai_record) { OAI::Record.new(REXML::Element.new) }

  # The client response double hands back a bare OAI::Record.
  before { allow(result).to receive(:record).and_return(oai_record) }

  it 'sends request with option' do
    expected_opts = { identifier: identifier,
                      metadata_prefix: args[:oai][:metadata_prefix] }

    expect(subject.client).to receive(request_type)
      .with(expected_opts)
      .and_return(result)

    subject.get_record(identifier)
  end

  it 'adds options passed into request' do
    expected_opts = { identifier: identifier,
                      metadata_prefix: args[:oai][:metadata_prefix],
                      set: request_opts[:set] }

    expect(subject.client).to receive(request_type)
      .with(expected_opts)
      .and_return(result)

    subject.get_record(identifier, request_opts)
  end
end
end
# Exercises the private #request_with_sets helper via #send.
describe '#request_with_sets' do
  let(:sets) { [double('first'), double('second')] }
  let(:opts) { { set: sets } }

  it 'is lazy' do
    # Merely building the enumeration must not invoke the block.
    expect { |probe| subject.send(:request_with_sets, opts, &probe) }
      .not_to yield_control
  end

  it 'sends requests' do
    # NOTE(review): this is a negative expectation (the per-set options are
    # NOT yielded when nothing is consumed) -- confirm this is what the
    # example name intends.
    expect { |probe| subject.send(:request_with_sets, opts, &probe) }
      .not_to yield_successive_args({ set: opts[:set][0] },
                                    { set: opts[:set][1] })
  end

  it 'sends requests lazily' do
    expect(subject).to receive(:give_results).once.and_return([1, 2, 3])

    enum = subject.send(:request_with_sets, opts) do |_set_opts|
      subject.give_results
    end

    # Pulling three items must be served by a single give_results call.
    3.times { enum.next }
  end
end
# Verifies that harvester options survive the enqueue -> Activity round trip.
describe '#enqueue' do
  let(:args) do
    { uri: 'http://example.org/endpoint', oai: { metadata_prefix: 'mods' } }
  end

  before do
    Resque.remove_queue('harvest') # Not strictly necessary. Future?
    Krikri::Activity.delete_all
  end

  it 'saves harvest options correctly when creating an activity' do
    # Ascertain that options particular to this harvester type are
    # serialized and deserialized properly.
    described_class.enqueue(args)

    activity = Krikri::Activity.first
    opts = JSON.parse(activity.opts, symbolize_names: true)

    expect(opts).to eq(args)
  end
end
# Covers #concat_enum, which chains several enumerables into one.
describe 'concat_enum' do
  it 'concatenates enums' do
    expected = (1..10).to_a.concat((100..110).to_a)

    expect(subject.concat_enum([(1..10), (100..110)]).to_a).to eq(expected)
  end

  it 'works with lazy' do
    enum = double
    # The first ten items come from the range, so the second enumerable
    # must never be iterated.
    expect(enum).not_to receive(:each)

    subject.concat_enum([(1..10), enum]).lazy.take(10).each(&:inspect)
  end
end
# Covers which value of a harvested record is handed to #mint_id.
describe 'identifiers' do
  # OAI header identifier of the first stubbed record.
  let(:id) { 'oai:oaipmh.huygens.knaw.nl:arthurianfiction:MAN0000000010' }

  it 'uses oai header identifier by default' do
    expect(subject).to receive(:mint_id).with(id).and_return('id')
    subject.records.first
  end

  context 'with :id_path' do
    let(:args) do
      { uri: 'http://example.org/endpoint',
        oai: { id_path: '//dc:identifier' } }
    end

    it 'returns the identifier from a given xpath' do
      dcid = 'https://service.arthurianfiction.org/manuscript/MAN0000000010'
      expect(subject).to receive(:mint_id).with(dcid).and_return('id')
      subject.records.first
    end

    context 'and bad path' do
      let(:id_path) { '//dc:not_a_path' }
      let(:args) do
        { uri: 'http://example.org/endpoint',
          oai: { id_path: id_path } }
      end

      # Despite its name, this example asserts that an error is logged whose
      # message mentions both the configured path and the record identifier.
      it 'fails and raises error' do
        expect(Rails.logger).to receive(:error).with(include(id_path, id))
        subject.records.first
      end
    end

    context 'and bad namespace' do
      let(:id_path) { '//fkns:identifier' }
      let(:args) do
        { uri: 'http://example.org/endpoint',
          oai: { id_path: id_path } }
      end

      # Same assertion as the bad-path case: an error naming the path and
      # the record identifier is logged.
      it 'fails and raises error' do
        expect(Rails.logger).to receive(:error).with(include(id_path, id))
        subject.records.first
      end
    end
  end
end
it_behaves_like 'a harvester'
end
end
# Confirms the OAI harvester is present in the harvester registry.
describe Krikri::Harvester::Registry do
  describe '#registered?' do
    it 'knows OAIHarvester is registered' do
      # It should have been registered by the engine initializer, engine.rb.
      is_registered = described_class.registered?(:oai)

      expect(is_registered).to be(true)
    end
  end
end