lib/harvestdor/indexer.rb in harvestdor-indexer-2.1.1 vs lib/harvestdor/indexer.rb in harvestdor-indexer-2.2.0
- old
+ new
@@ -11,25 +11,25 @@
require 'dor-fetcher'
# stdlib
require 'logger'
-require "harvestdor/indexer/version"
+require 'harvestdor/indexer/version'
require 'active_support/benchmarkable'
module Harvestdor
# Base class to harvest from DOR via harvestdor gem and then index
class Indexer
- require "harvestdor/indexer/metrics"
- require "harvestdor/indexer/resource"
- require "harvestdor/indexer/solr"
+ require 'harvestdor/indexer/metrics'
+ require 'harvestdor/indexer/resource'
+ require 'harvestdor/indexer/solr'
include ActiveSupport::Benchmarkable
attr_accessor :metrics, :logger
# Set up the indexer: apply +options+ to the config, optionally let the caller
# adjust the config in a block, then create the metrics tracker.
# @param options [Hash] settings handed to config.configure
# @yield [config] the config object, only when a block is given
- def initialize options = {}
+ def initialize(options = {})
config.configure(options)
yield(config) if block_given?
# Metrics is built after configuration and is given this indexer's logger.
@metrics = Harvestdor::Indexer::Metrics.new logger: logger
end
@@ -50,30 +50,48 @@
# per this Indexer's config options
# harvest the druids via DorFetcher
# create a Solr profiling document for each druid
# write the result to the Solr index
# @param each_options [Hash] forwarded unchanged to each_resource
#   (defaults to { in_threads: 4 })
- def harvest_and_index each_options = {in_threads: 4}
- benchmark "Harvest and Indexing" do
+ def harvest_and_index(each_options = { in_threads: 4 })
+ benchmark 'Harvest and Indexing' do
each_resource(each_options) do |resource|
index resource
end
# A single commit is sent after the each_resource loop has finished,
# still inside the benchmark block.
solr.commit!
end
end
+ ##
+ # Collect all the explicitly identified resources (e.g. from the whitelist), and all
+ # members of any collections in that list, and offer them as an Enumerator.
+ #
+ # Using enumerators allows us to lazy-fetch and correctly garbage collect resources after
+ # a downstream consumer is finished processing them. If a consumer needs to use this resource
+ # list multiple times (and is confident all the resources will fit in available memory!), they
+ # could memoize the result of e.g. `#to_a` for their own use.
+ #
+ # @return [Enumerator] an enumerator of Harvestdor::Indexer::Resources for the druid whitelist,
+ # and all the items belonging to each collection id in druids.
def resources
- druids.map do |x|
- Harvestdor::Indexer::Resource.new(self, x)
- end.map do |x|
- [x, (x.items if x.collection?)]
- end.flatten.uniq.compact
# Without a block, hand back an Enumerator that re-enters this method lazily.
+ return to_enum(:resources) unless block_given?
+
# NOTE(review): the removed implementation only called #items when
# x.collection? was true and passed the combined list through #uniq and
# #compact. The added loop calls resource.items for every druid and yields
# each member with no de-duplication. Confirm that Resource#items returns an
# empty enumerable for non-collections and that callers tolerate a resource
# being yielded more than once.
+ druids.each do |x|
+ # Include the named resource in the enumerable
+ resource = Harvestdor::Indexer::Resource.new(self, x)
+ yield resource
+
+ # And also yield any members of that resources
+ resource.items.each do |coll_member|
+ yield coll_member
+ end
+ end
end
- def each_resource options = {}, &block
- benchmark "" do
+ def each_resource(options = {}, &_block)
+ benchmark '' do
Parallel.each(resources, options) do |resource|
metrics.tally on_error: method(:resource_error) do
yield resource
end
end
@@ -82,13 +100,13 @@
logger.info("Successful count: #{metrics.success_count}")
logger.info("Error count: #{metrics.error_count}")
logger.info("Total records processed: #{metrics.total}")
end
# Error callback passed to metrics.tally as on_error: (see each_resource).
# Re-raises the exception only when it is exactly a Parallel::Break or
# Parallel::Kill (instance_of? does not match subclasses); for any other
# exception it does nothing and returns nil.
# @param e [Exception] the error raised while a resource was being processed
- def resource_error e
- if e.instance_of? Parallel::Break or e.instance_of? Parallel::Kill
- raise e
+ def resource_error(e)
+ if e.instance_of?(Parallel::Break) || e.instance_of?(Parallel::Kill)
+ fail e
end
end
# return Array of druids contained in the DorFetcher pulling indicated by DorFetcher params
# @return [Array<String>] or enumeration over it, if block is given. (strings are druids, e.g. ab123cd1234)
@@ -96,12 +114,11 @@
@druids ||= whitelist
end
# create Solr doc for the druid and add it to Solr
# NOTE: don't forget to send commit to Solr, either once at end (already in harvest_and_index), or for each add, or ...
- def index resource
-
+ def index(resource)
benchmark "Indexing #{resource.druid}" do
logger.debug "About to index #{resource.druid}"
doc_hash = {}
doc_hash[:id] = resource.druid
@@ -143,15 +160,12 @@
# return an Array of druids ('oo000oo0000')
# populated by reading the File at the indicated path
# @param [String] path - path of file containing a list of druids
# @return [Array<String>] an Array of druids
# @raise [RuntimeError] carrying the "Unable to find list of druids at ..."
#   message whenever the read pipeline raises a StandardError
def load_id_list(path)
# Pipeline: strip every line, drop lines starting with '#', drop empty lines.
# The assignment is the last expression of the body, so its value is what the
# method returns; the +list+ local itself is never read afterwards.
# NOTE(review): File.open is called without a block, so the file handle is
# not closed explicitly here.
- list = File.open(path).each_line
- .map { |line| line.strip }
- .reject { |line| line.strip.start_with?('#') }
- .reject { |line| line.empty? }
+ list = File.open(path).each_line.map(&:strip).reject { |line| line.strip.start_with?('#') }.reject(&:empty?)
rescue
# The bare rescue catches any StandardError from the pipeline above, not only
# a missing file; the original exception is replaced by a RuntimeError that
# carries just the message below.
# NOTE(review): the concatenation assumes +path+ is a String; a nil path
# would raise again inside this rescue clause.
- msg = "Unable to find list of druids at " + path
+ msg = 'Unable to find list of druids at ' + path
logger.fatal msg
raise msg
end
end # Indexer class
end # Harvestdor module
\ No newline at end of file