module Krikri::Harvesters
  ##
  # A harvester implementation for REST APIs. The default ApiHarvester expects
  # Solr-like JSON responses/records.
  #
  # An internal interface is provided for easier subclassing. A new API
  # harvester may reimplement:
  #   - #get_docs (to retrieve record docs from a response)
  #   - #get_count (to determine total record count from a response)
  #   - #get_identifier (to retrieve an identifier from a record document)
  #   - #get_content (to retrieve a content string from a record document)
  #   - #next_options (to generate the parameters for the next request)
  #
  # If the content type of the records is other than JSON, you will also want
  # to override `#content_type`.
  class ApiHarvester
    include Krikri::Harvester
    attr_reader :opts

    ##
    # Stores the value under the `:api` key of the given options (an empty
    # hash when absent) as the request options for this harvester.
    #
    # @param opts [Hash] options for the harvester
    # @see .expected_opts
    def initialize(opts = {})
      super
      @opts = opts.fetch(:api, {})
    end

    ##
    # @return [Hash] A hash documenting the allowable options to pass to
    #   initializers.
    #
    # @see Krikri::Harvester::expected_opts
    def self.expected_opts
      {
        key: :api,
        opts: {
          params: { type: :string, required: false }
        }
      }
    end

    ##
    # Issues one request with the configured options and reads the total
    # record count from the response.
    #
    # @see Krikri::Harvester#count
    def count
      get_count(request(opts))
    end

    ##
    # @return [Enumerator::Lazy] an enumerator of the records targeted by this
    #   harvester.
    def records
      enumerate_records.lazy.map { |rec| build_record(rec) }
    end

    ##
    # @return [Enumerator::Lazy] an enumerator over the ids for the records
    #   targeted by this harvester.
    def record_ids
      enumerate_records.lazy.map { |rec| get_identifier(rec) }
    end

    ##
    # Gets a single record with the given identifier from the API, by
    # requesting `q=id:<identifier>` and building a record from the first
    # document of the response.
    #
    # NOTE(review): when the response contains no documents, `first` yields
    # nil and the default #get_identifier fails on it; confirm callers only
    # pass identifiers known to exist.
    #
    # @param identifier [#to_s] the identifier of the record to get
    # @return [#to_s] the record
    def get_record(identifier)
      response = request(params: { q: "id:#{identifier}" })
      build_record(get_docs(response).first)
    end

    ##
    # @return [String] the content type for the records generated by this
    #   harvester
    def content_type
      'application/json'
    end

    private

    ##
    # @param doc [#[]] a raw record document with an identifier
    #
    # @return [String] the provider's identifier for the document, read from
    #   its 'record_id' entry
    def get_identifier(doc)
      doc['record_id']
    end

    ##
    # @param response [#[]] a parsed response from the REST API
    #
    # @return [Integer] a count of the total records found by the request
    def get_count(response)
      response['response']['numFound']
    end

    ##
    # @param response [#[]] a parsed response from the REST API
    #
    # @return [Array] an array of record documents from the response
    def get_docs(response)
      response['response']['docs']
    end

    ##
    # @param doc [#to_json] a raw record document
    #
    # @return [String] the record content, serialized as JSON
    def get_content(doc)
      doc.to_json
    end

    ##
    # Send a GET request to `uri` via `RestClient`, and parse the result as
    # JSON.
    #
    # @param request_opts [Hash] the options handed to `RestClient.get`
    #
    # @return [Object] the parsed JSON response
    def request(request_opts)
      JSON.parse(RestClient.get(uri, request_opts))
    end

    ##
    # Given a current set of options and a number of records from the last
    # request, generate the options for the next request. The given hash is
    # updated in place and returned.
    #
    # @param opts [Hash] an options hash from the previous request
    # @param record_count [#to_i]
    #
    # @return [Hash] the next request's options hash
    def next_options(opts, record_count)
      # A harvester configured without params must still be able to page;
      # without this default the lookup below fails on nil. The entry is
      # stored first and re-read so the hash being updated is always the one
      # actually held by `opts`.
      opts['params'] ||= {}
      old_start = opts['params'].fetch('start', 0)
      opts['params']['start'] = old_start.to_i + record_count
      opts
    end

    ##
    # Requests page after page, starting from a deep copy of the configured
    # options, and yields every document. Stops when a page comes back empty
    # or #next_options returns nil.
    #
    # @return [Enumerator] an enumerator over the records
    def enumerate_records
      Enumerator.new do |yielder|
        request_opts = opts.deep_dup

        loop do
          break if request_opts.nil?

          # Each request receives its own shallow copy of the options.
          docs = get_docs(request(request_opts.dup))
          break if docs.empty?

          docs.each { |doc| yielder << doc }
          request_opts = next_options(request_opts, docs.count)
        end
      end
    end

    ##
    # Builds an instance of `@record_class` with the given doc's JSON as
    # content.
    #
    # @param doc [#to_json] the content to serialize as JSON in `#content`
    # @return [#to_s] an instance of @record_class with a minted id and
    #   content the given content
    def build_record(doc)
      @record_class.build(mint_id(get_identifier(doc)),
                          get_content(doc),
                          content_type)
    end
  end
end