# frozen_string_literal: true

module Bulkrax
  # This module provides the means of querying Solr for the works, collections, and file sets
  # appropriate to an export of entries.
  #
  # @see .for
  module ParserExportRecordSet
    # @api public
    #
    # A factory method that returns an object which can yield each id and its associated
    # entry_class, as well as report the count of objects in the record set.
    #
    # @param parser [Bulkrax::ApplicationParser]
    # @param export_from [String]
    #
    # @return [#each, #count] An object, likely a descendant of
    #   {Bulkrax::ParserExportRecordSet::Base}, that responds to {Base#count} and {Base#each}.
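    #
    # A minimal usage sketch; `parser` is assumed to be an instance of a concrete
    # Bulkrax::ApplicationParser subclass and is not defined in this file.
    #
    # @example Resolving the record set for a work type export
    #   Bulkrax::ParserExportRecordSet.for(parser: parser, export_from: "worktype")
    #   # => an instance of Bulkrax::ParserExportRecordSet::Worktype ("worktype".classify)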
    def self.for(parser:, export_from:)
      "Bulkrax::ParserExportRecordSet::#{export_from.classify}".constantize.new(parser: parser)
    end

    SOLR_QUERY_PAGE_SIZE = 512

    ##
    # A helper method for querying large batches of IDs. By default, Solr caps a query at 1024
    # boolean (e.g. `OR`) clauses. This method chunks a large set of IDs into batches that stay
    # below that ceiling, yielding each slice and merging the results.
    #
    # @param array [Array<Object>]
    # @param page_size [Integer]
    # @yieldparam [Array<Object>] a slice of the original array; the results of each yield are
    #   merged into the return value.
    #
    # @return [Array<Object>]
    #
    # @see https://github.com/samvera-labs/bulkrax/issues/776
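    #
    # A self-contained sketch (no Solr involved; the block stands in for the
    # ActiveFedora::SolrService calls made elsewhere in this file):
    #
    # @example Chunking 1,200 IDs into batches of at most 512
    #   ids = (1..1_200).map(&:to_s)
    #   Bulkrax::ParserExportRecordSet.in_batches(ids) { |slice| slice.size }
    #   # => [512, 512, 176]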
    def self.in_batches(array, page_size: SOLR_QUERY_PAGE_SIZE)
      array = Array.wrap(array)
      return [] if array.empty?

      results = []
      array.each_slice(page_size) do |slice|
        results += Array.wrap(yield(slice))
      end
      results
    end

    # @abstract
    #
    # @note This has {#each} and {#count} but is not an Enumerable. Because it exposes those two
    #   Array-like methods, classes that depend on this file can do some lovely mocking and
    #   stubbing. :)
    class Base
      def initialize(parser:)
        @parser = parser
      end

      attr_reader :parser
      private :parser

      delegate :limit_reached?, :work_entry_class, :collection_entry_class,
               :file_set_entry_class, :importerexporter, to: :parser
      private :limit_reached?, :work_entry_class, :collection_entry_class,
              :file_set_entry_class, :importerexporter

      ##
      # @return [Integer]
      def count
        sum = works.count + collections.count + file_sets.count
        return sum if limit.zero?
        return limit if sum > limit

        sum
      end

      ##
      # Yields first the works, then the collections, then the file sets. Once we've yielded as
      # many times as the parser's limit allows, we stop iterating and return.
      #
      # @yieldparam id [String] the ID of the work/collection/file_set
      # @yieldparam entry_class [Class] the parser's associated entry class for the
      #   work/collection/file_set
      #
      # @note The yield order (works, then collections, then file sets) was determined prior to
      #   this implementation and is preserved here.
      def each
        counter = 0

        works.each do |work|
          break if limit_reached?(limit, counter)
          yield(work.fetch('id'), work_entry_class)
          counter += 1
        end

        return if limit_reached?(limit, counter)

        collections.each do |collection|
          break if limit_reached?(limit, counter)
          yield(collection.fetch('id'), collection_entry_class)
          counter += 1
        end

        return if limit_reached?(limit, counter)

        file_sets.each do |file_set|
          break if limit_reached?(limit, counter)
          yield(file_set.fetch('id'), file_set_entry_class)
          counter += 1
        end
      end

      private

      # Why call these candidates and not the actual file_set_ids? Because of implementation
      # details in Hyrax. What are those details? The upstream application (as of v2.9.x) puts
      # child works into the `file_set_ids_ssim` field, so that property holds a mix of file
      # sets and works.
      #
      # @see #file_sets
      def candidate_file_set_ids
        @candidate_file_set_ids ||= works.flat_map do |work|
          work.fetch("#{Bulkrax.file_model_class.to_s.underscore}_ids_ssim", [])
        end
      end

      # @note Specifically not memoizing this so we can merge values without changing the
      #   object.
      #
      # There is no sense in querying for more rows than the limit.
      def query_kwargs
        { fl: "id,#{Bulkrax.file_model_class.to_s.underscore}_ids_ssim", method: :post, rows: row_limit }
      end

      # If we have a limit, we need not query beyond that limit.
      def row_limit
        return 2_147_483_647 if limit.zero?

        limit
      end

      def limit
        parser.limit.to_i
      end

      alias works_query_kwargs query_kwargs
      alias collections_query_kwargs query_kwargs

      def extra_filters
        output = ""
        if importerexporter.start_date.present?
          start_dt = importerexporter.start_date.to_datetime.strftime('%FT%TZ')
          finish_dt =
            if importerexporter.finish_date.present?
              importerexporter.finish_date.to_datetime.end_of_day.strftime('%FT%TZ')
            else
              "NOW"
            end
          output += " AND system_modified_dtsi:[#{start_dt} TO #{finish_dt}]"
        end
        output += importerexporter.work_visibility.present? ? " AND visibility_ssi:#{importerexporter.work_visibility}" : ""
        output += importerexporter.workflow_status.present? ? " AND workflow_state_name_ssim:#{importerexporter.workflow_status}" : ""
        output
      end

      def works
        @works ||= ActiveFedora::SolrService.query(works_query, **works_query_kwargs)
      end

      def collections
        @collections ||= if collections_query
                           ActiveFedora::SolrService.query(collections_query, **collections_query_kwargs)
                         else
                           []
                         end
      end

      # @note In most cases, when we don't have any candidate file sets, there is no need to
      #   query Solr.
      #
      # @see Bulkrax::ParserExportRecordSet::Importer#file_sets
      #
      # Why can't we just use the candidate_file_set_ids? Because Hyrax pushes child works into
      # the `file_set_ids_ssim` field. This was the behavior as of Hyrax v2.9.x; perhaps it has
      # since been resolved.
      #
      # @see https://github.com/scientist-softserv/britishlibrary/issues/289
      # @see https://github.com/samvera/hyrax/blob/64c0bbf0dc0d3e1b49f040b50ea70d177cc9d8f6/app/indexers/hyrax/work_indexer.rb#L15-L18
      def file_sets
        @file_sets ||= ParserExportRecordSet.in_batches(candidate_file_set_ids) do |batch_of_ids|
          fsq = "has_model_ssim:#{Bulkrax.file_model_class} AND id:(\"" + batch_of_ids.join('" OR "') + "\")"
          fsq += extra_filters if extra_filters.present?
          ActiveFedora::SolrService.query(
            fsq,
            { fl: "id", method: :post, rows: batch_of_ids.size }
          )
        end
      end

      def solr_name(base_name)
        if Module.const_defined?(:Solrizer)
          ::Solrizer.solr_name(base_name)
        else
          ::ActiveFedora.index_field_mapper.solr_name(base_name)
        end
      end
    end

    class All < Base
      def works_query
        "has_model_ssim:(#{Bulkrax.curation_concerns.join(' OR ')}) #{extra_filters}"
      end

      def collections_query
        "has_model_ssim:Collection #{extra_filters}"
      end
    end

    class Collection < Base
      def works_query
        "member_of_collection_ids_ssim:#{importerexporter.export_source} #{extra_filters} AND " \
          "has_model_ssim:(#{Bulkrax.curation_concerns.join(' OR ')})"
      end

      def collections_query
        "(id:#{importerexporter.export_source} #{extra_filters}) OR " \
          "(has_model_ssim:Collection AND member_of_collection_ids_ssim:#{importerexporter.export_source})"
      end
    end

    class Worktype < Base
      def works_query
        "has_model_ssim:#{importerexporter.export_source} #{extra_filters}"
      end

      def collections_query
        nil
      end
    end

    class Importer < Base
      private

      delegate :work_identifier, to: :parser
      private :work_identifier

      def extra_filters
        '*:*' + super
      end

      def complete_entry_identifiers
        @complete_entry_identifiers ||=
          begin
            entry_ids = Bulkrax::Importer.find(importerexporter.export_source).entries.pluck(:id)
            complete_statuses = Bulkrax::Status.latest_by_statusable
                                               .includes(:statusable)
                                               .where('bulkrax_statuses.statusable_id IN (?) AND bulkrax_statuses.statusable_type = ? AND status_message = ?', entry_ids, 'Bulkrax::Entry', 'Complete')

            complete_statuses.map { |s| s.statusable&.identifier&.gsub(':', '\:') }
          end
      end

      def works
        @works ||= ParserExportRecordSet.in_batches(complete_entry_identifiers) do |ids|
          ActiveFedora::SolrService.query(
            extra_filters.to_s,
            **query_kwargs.merge(
              fq: [
                %(#{solr_name(work_identifier)}:("#{ids.join('" OR "')}")),
                "has_model_ssim:(#{Bulkrax.curation_concerns.join(' OR ')})"
              ],
              fl: 'id'
            )
          )
        end
      end

      def collections
        @collections ||= ParserExportRecordSet.in_batches(complete_entry_identifiers) do |ids|
          ActiveFedora::SolrService.query(
            "has_model_ssim:Collection #{extra_filters}",
            **query_kwargs.merge(
              fq: [
                %(#{solr_name(work_identifier)}:("#{ids.join('" OR "')}")),
                "has_model_ssim:Collection"
              ],
              fl: "id"
            )
          )
        end
      end

      # This is an exception: we don't know how many candidate file sets there might be, so we
      # issue the query regardless (assuming there are {#complete_entry_identifiers}).
      #
      # @see Bulkrax::ParserExportRecordSet::Base#file_sets
      def file_sets
        @file_sets ||= ParserExportRecordSet.in_batches(complete_entry_identifiers) do |ids|
          ActiveFedora::SolrService.query(
            extra_filters,
            **query_kwargs.merge(
              fq: [
                %(#{solr_name(work_identifier)}:("#{ids.join('" OR "')}")),
                "has_model_ssim:#{Bulkrax.file_model_class}"
              ],
              fl: 'id'
            )
          )
        end
      end
    end
  end
end
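
# For orientation only: with hypothetical exporter settings (curation concerns of GenericWork
# and Image, an export_source of "coll-1" or "Image", and a work_visibility of "open"), the
# works queries built above come out shaped roughly like:
#
#   All#works_query        # => "has_model_ssim:(GenericWork OR Image) AND visibility_ssi:open"
#   Collection#works_query # => "member_of_collection_ids_ssim:coll-1 AND visibility_ssi:open
#                          #     AND has_model_ssim:(GenericWork OR Image)"
#   Worktype#works_query   # => "has_model_ssim:Image AND visibility_ssi:open"
#
# None of these literal values come from this file; they stand in for whatever the exporter
# happens to be configured with.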