lib/harvestdor/indexer.rb in harvestdor-indexer-2.1.1 vs lib/harvestdor/indexer.rb in harvestdor-indexer-2.2.0

- old
+ new

@@ -11,25 +11,25 @@ require 'dor-fetcher' # stdlib require 'logger' -require "harvestdor/indexer/version" +require 'harvestdor/indexer/version' require 'active_support/benchmarkable' module Harvestdor # Base class to harvest from DOR via harvestdor gem and then index class Indexer - require "harvestdor/indexer/metrics" - require "harvestdor/indexer/resource" - require "harvestdor/indexer/solr" + require 'harvestdor/indexer/metrics' + require 'harvestdor/indexer/resource' + require 'harvestdor/indexer/solr' include ActiveSupport::Benchmarkable attr_accessor :metrics, :logger - def initialize options = {} + def initialize(options = {}) config.configure(options) yield(config) if block_given? @metrics = Harvestdor::Indexer::Metrics.new logger: logger end @@ -50,30 +50,48 @@ # per this Indexer's config options # harvest the druids via DorFetcher # create a Solr profiling document for each druid # write the result to the Solr index - def harvest_and_index each_options = {in_threads: 4} - benchmark "Harvest and Indexing" do + def harvest_and_index(each_options = { in_threads: 4 }) + benchmark 'Harvest and Indexing' do each_resource(each_options) do |resource| index resource end solr.commit! end end + ## + # Collect all the explicitly identified resources (e.g. from the whitelist), and all + # members of any collections in that list, and offer them as an Enumerator. + # + # Using enumerators allows us to lazy-fetch and correctly garbage collect resources after + # a downstream consumer is finished processing them. If a consumer needs to use this resource + # list multiple times (and is confident all the resources will fit in available memory!), they + # could memoize the result of e.g. `#to_a` for their own use. + # + # @return [Enumerator] an enumerator of Harvestdor::Indexer::Resources for the druid whitelist, + # and all the items belonging to each collection id in druids. def resources - druids.map do |x| - Harvestdor::Indexer::Resource.new(self, x) - end.map do |x| - [x, (x.items if x.collection?)] - end.flatten.uniq.compact + return to_enum(:resources) unless block_given? + + druids.each do |x| + # Include the named resource in the enumerable + resource = Harvestdor::Indexer::Resource.new(self, x) + yield resource + + # And also yield any members of that resources + resource.items.each do |coll_member| + yield coll_member + end + end end - def each_resource options = {}, &block - benchmark "" do + def each_resource(options = {}, &_block) + benchmark '' do Parallel.each(resources, options) do |resource| metrics.tally on_error: method(:resource_error) do yield resource end end @@ -82,13 +100,13 @@ logger.info("Successful count: #{metrics.success_count}") logger.info("Error count: #{metrics.error_count}") logger.info("Total records processed: #{metrics.total}") end - def resource_error e - if e.instance_of? Parallel::Break or e.instance_of? Parallel::Kill - raise e + def resource_error(e) + if e.instance_of?(Parallel::Break) || e.instance_of?(Parallel::Kill) + fail e end end # return Array of druids contained in the DorFetcher pulling indicated by DorFetcher params # @return [Array<String>] or enumeration over it, if block is given. (strings are druids, e.g. ab123cd1234) @@ -96,12 +114,11 @@ @druids ||= whitelist end # create Solr doc for the druid and add it to Solr # NOTE: don't forget to send commit to Solr, either once at end (already in harvest_and_index), or for each add, or ... - def index resource - + def index(resource) benchmark "Indexing #{resource.druid}" do logger.debug "About to index #{resource.druid}" doc_hash = {} doc_hash[:id] = resource.druid @@ -143,15 +160,12 @@ # return an Array of druids ('oo000oo0000') # populated by reading the File at the indicated path # @param [String] path - path of file containing a list of druids # @return [Array<String>] an Array of druids def load_id_list(path) - list = File.open(path).each_line - .map { |line| line.strip } - .reject { |line| line.strip.start_with?('#') } - .reject { |line| line.empty? } + list = File.open(path).each_line.map(&:strip).reject { |line| line.strip.start_with?('#') }.reject(&:empty?) rescue - msg = "Unable to find list of druids at " + path + msg = 'Unable to find list of druids at ' + path logger.fatal msg raise msg end end # Indexer class end # Harvestdor module \ No newline at end of file