lib/harvestdor-indexer.rb in harvestdor-indexer-0.0.11 vs lib/harvestdor-indexer.rb in harvestdor-indexer-0.0.13
- old
+ new
@@ -1,8 +1,9 @@
# external gems
require 'confstruct'
require 'rsolr'
+require 'retries'
# sul-dlss gems
require 'harvestdor'
require 'stanford-mods'
@@ -19,11 +20,11 @@
attr_accessor :total_time_to_parse,:total_time_to_solr
def initialize yml_path, options = {}
@success_count=0 # the number of objects successfully indexed
@error_count=0 # the number of objects that failed
- @max_retries=5 # the number of times to retry an object
+ @max_retries=10 # the number of times to retry an object
@total_time_to_solr=0
@total_time_to_parse=0
@yml_path = yml_path
config.configure(YAML.load_file(yml_path)) if yml_path
config.configure options
@@ -52,17 +53,17 @@
end
solr_client.commit
total_time=elapsed_time(start_time)
total_objects=@success_count+@error_count
logger.info("Finished harvest_and_index at #{Time.now}: final Solr commit returned")
- logger.info("Total elapsed time for harvest and index: #{(total_time/60.0)} minutes")
- logger.info("Avg solr commit time per object (successful): #{@total_time_to_solr/@success_count} seconds") unless (@total_time_to_solr == 0 || @success_count == 0)
- logger.info("Avg solr commit time per object (all): #{@total_time_to_solr/total_objects} seconds") unless (@total_time_to_solr == 0 || @error_count == 0 || total_objects == 0)
- logger.info("Avg parse time per object (successful): #{@total_time_to_parse/@success_count} seconds") unless (@total_time_to_parse == 0 || @success_count == 0)
- logger.info("Avg parse time per object (all): #{@total_time_to_parse/total_objects} seconds") unless (@total_time_to_parse == 0 || @error_count == 0 || total_objects == 0)
- logger.info("Avg complete index time per object (successful): #{total_time/@success_count} seconds") unless (@success_count == 0)
- logger.info("Avg complete index time per object (all): #{total_time/total_objects} seconds") unless (@error_count == 0 || total_object == 0)
+ logger.info("Total elapsed time for harvest and index: #{(total_time/60.0).round(2)} minutes")
+ logger.info("Avg solr commit time per object (successful): #{(@total_time_to_solr/@success_count).round(2)} seconds") unless (@total_time_to_solr == 0 || @success_count == 0)
+ logger.info("Avg solr commit time per object (all): #{(@total_time_to_solr/total_objects).round(2)} seconds") unless (@total_time_to_solr == 0 || @error_count == 0 || total_objects == 0)
+ logger.info("Avg parse time per object (successful): #{(@total_time_to_parse/@success_count).round(2)} seconds") unless (@total_time_to_parse == 0 || @success_count == 0)
+ logger.info("Avg parse time per object (all): #{(@total_time_to_parse/total_objects).round(2)} seconds") unless (@total_time_to_parse == 0 || @error_count == 0 || total_objects == 0)
+ logger.info("Avg complete index time per object (successful): #{(total_time/@success_count).round(2)} seconds") unless (@success_count == 0)
+ logger.info("Avg complete index time per object (all): #{(total_time/total_objects).round(2)} seconds") unless (@error_count == 0 || total_objects == 0)
logger.info("Successful count: #{@success_count}")
logger.info("Error count: #{@error_count}")
logger.info("Total records processed: #{total_objects}")
end
@@ -76,30 +77,25 @@
logger.info("Completed OAI harves of druids at #{Time.now}. Found #{@druids.size} druids. Total elapsed time for OAI harvest = #{elapsed_time(start_time,:minutes)} minutes")
end
return @druids
end
# Add the document to solr, retrying if an error occurs.
# Uses with_retries (see https://github.com/ooyala/retries) with exponential
# backoff between attempts. If every attempt fails, the error is logged and
# @error_count is incremented; the exception is NOT re-raised, preserving the
# pre-0.0.13 contract that a single bad record doesn't abort the whole harvest
# and that the final error tally in harvest_and_index stays accurate.
# @param [Hash] doc a Hash representation of the solr document
# @param [String] id the id of the document being sent, for logging
def solr_add(doc, id)
  max_tries = @max_retries ? @max_retries : 10 # if @max_retries isn't set, use 10

  # Invoked by with_retries after each failed attempt (including the last).
  handler = Proc.new do |exception, attempt_number, total_delay|
    logger.debug "#{exception.class} on attempt #{attempt_number} for #{id}"
    # logger.debug exception.backtrace
  end

  begin
    with_retries(:max_tries => max_tries, :handler => handler, :base_sleep_seconds => 1, :max_sleep_seconds => 5) do |attempt|
      logger.debug "Attempt #{attempt} for #{id}"
      solr_client.add(doc)
      logger.info "Successfully indexed #{id} on attempt #{attempt}"
    end
  rescue => e
    # All retries exhausted: with_retries re-raises the last exception, so
    # record the failure here instead of letting it propagate to the caller.
    @error_count += 1
    logger.error "Failed saving #{id}: #{e.message}"
    logger.error e.backtrace
  end
end
# create Solr doc for the druid and add it to Solr, unless it is on the blacklist.
# NOTE: don't forget to send commit to Solr, either once at end (already in harvest_and_index), or for each add, or ...