lib/ddr/ingesttools/dpc_folder_converter/converter.rb in ddr-ingesttools-0.1.0 vs lib/ddr/ingesttools/dpc_folder_converter/converter.rb in ddr-ingesttools-0.2.0

- old
+ new

@@ -4,90 +4,139 @@
 require 'find'

 module Ddr::IngestTools::DpcFolderConverter
   class Converter

-    METADATA_HEADERS = [ 'path', 'local_id' ]
+    INTERMEDIATE_FILES_DIRNAME = 'intermediate_files'
+    DPC_TARGETS_DIRNAME = 'targets'
+    SIF_TARGETS_DIRNAME = 'dpc_targets'
+    SIF_METADATA_FILENAME = 'metadata.txt'
+    SIF_MANIFEST_SHA1_FILENAME = 'manifest-sha1.txt'

-    attr_reader :source, :target, :data_dir, :item_id_length
-    attr_accessor :local_id_metadata
+    Results = Struct.new(:file_map, :errors)

-    def initialize(source, target, item_id_length)
+    attr_reader :source, :target, :data_dir, :item_id_length, :checksums, :copy_files, :collection_title,
+                :metadata_headers
+    attr_accessor :errors, :file_map, :local_id_metadata, :results
+
+    def initialize(source:, target:, item_id_length:, checksums: nil, copy_files: false, collection_title: nil)
       @source = source
       @target = target
-      @data_dir = File.join(target, 'data')
       @item_id_length = item_id_length
-      @local_id_metadata = {}
+      @checksums = checksums
+      @copy_files = copy_files
+      @collection_title = collection_title
+      @metadata_headers = [ 'path', 'local_id' ]
+      @metadata_headers << 'title' unless collection_title.nil?
     end

     def call
-      FileUtils.mkdir_p data_dir
-      find_component_files(source).each { |file| handle_component(file) }
-      find_target_files(source).each { |file| handle_target(file) }
+      setup
+      scan_files(source)
       output_metadata
       bagitup
+      validate_checksums if checksums
+      Results.new(file_map, errors)
     end

     private

+    def setup
+      @data_dir = File.join(target, 'data')
+      @errors = []
+      @file_map = {}
+      @local_id_metadata = {}
+      FileUtils.mkdir_p data_dir
+    end
+
     def included_extensions
       Ddr::IngestTools::DpcFolderConverter.config[:included_extensions]
     end

-    def find_component_files(dir)
-      files = []
-      Find.find(dir) do |path|
-        Find.prune if path.include?('targets')
-        Find.prune if path.include?('intermediate_files')
-        next unless File.file?(path)
-        next unless included_extensions.include?(File.extname(path))
-        files << path
+    def scan_files(dirpath, file_handler='handle_component'.to_sym)
+      Dir.foreach(dirpath).each do |entry|
+        next if [ '.', '..' ].include?(entry)
+        path = File.join(dirpath, entry)
+        if File.directory?(path)
+          if entry == DPC_TARGETS_DIRNAME
+            scan_files(path, :handle_target)
+          elsif entry == INTERMEDIATE_FILES_DIRNAME
+            scan_files(path, :handle_intermediate_file)
+          else
+            scan_files(path, file_handler)
+          end
+        else
+          if included_extensions.include?(File.extname(entry))
+            self.send(file_handler, path)
+          end
+        end
       end
-      files
     end

-    def find_target_files(dir)
-      files = []
-      Find.find(dir) do |path|
-        next unless path.include?('targets')
-        next unless File.file?(path)
-        next unless included_extensions.include?(File.extname(path))
-        files << path
-      end
-      files
-    end
-
     def handle_component(file)
       base = File.basename(file, File.extname(file))
       item_id = item_id_length == 0 ? base : base[0, item_id_length]
       FileUtils.mkdir_p(File.join(data_dir, item_id))
       local_id_metadata[item_id] = item_id
-      FileUtils.cp file, File.join(data_dir, item_id)
+      handle_file(file, item_id)
       local_id_metadata[File.join(item_id, File.basename(file))] = base
     end

+    def handle_intermediate_file(file)
+      FileUtils.mkdir_p(File.join(data_dir, INTERMEDIATE_FILES_DIRNAME))
+      handle_file(file, INTERMEDIATE_FILES_DIRNAME)
+    end
+
     def handle_target(file)
       base = File.basename(file, File.extname(file))
-      FileUtils.mkdir_p(File.join(data_dir, 'dpc_targets'))
-      FileUtils.cp file, File.join(data_dir, 'dpc_targets')
-      local_id_metadata[File.join('dpc_targets', File.basename(file))] = base
+      FileUtils.mkdir_p(File.join(data_dir, SIF_TARGETS_DIRNAME))
+      handle_file(file, SIF_TARGETS_DIRNAME)
+      local_id_metadata[File.join(SIF_TARGETS_DIRNAME, File.basename(file))] = base
     end

+    def handle_file(file, folder_name)
+      if copy_files
+        FileUtils.cp file, File.join(data_dir, folder_name)
+      else
+        FileUtils.ln_s file, File.join(data_dir, folder_name)
+      end
+      file_map[file] = File.join(data_dir, folder_name, File.basename(file))
+    end
+
     def output_metadata
       metadata_rows = []
+      if collection_title
+        metadata_rows << CSV::Row.new(metadata_headers, [ nil, nil, collection_title ])
+      end
       local_id_metadata.each_pair do |k,v|
-        metadata_rows << CSV::Row.new(METADATA_HEADERS, [ k, v ])
+        row_elements = [ k, v ]
+        row_elements << nil if collection_title
+        metadata_rows << CSV::Row.new(metadata_headers, row_elements)
       end
-      File.open(File.join(data_dir, 'metadata.txt'), 'w') do |file|
-        file.puts(METADATA_HEADERS.join(Ddr::IngestTools::DpcFolderConverter.config[:csv_options][:col_sep]))
+      File.open(File.join(data_dir, SIF_METADATA_FILENAME), 'w') do |file|
+        file.puts(metadata_headers.join(Ddr::IngestTools::DpcFolderConverter.config[:csv_options][:col_sep]))
         metadata_rows.each do |row|
-          file.puts(row.to_csv(Ddr::IngestTools::DpcFolderConverter.config[:csv_options]).strip)
+          file.puts(row.to_csv(Ddr::IngestTools::DpcFolderConverter.config[:csv_options]))
         end
       end
     end

     def bagitup
       bag = BagIt::Bag.new(target)
       bag.manifest!
+    end
+
+    def validate_checksums
+      external_checksums = Ddr::IngestTools::ChecksumFile.new(checksums)
+      sif_manifest = Ddr::IngestTools::ChecksumFile.new(File.join(target, SIF_MANIFEST_SHA1_FILENAME))
+      file_map.each do |source_path, target_path|
+        external_checksum = external_checksums.digest(source_path)
+        manifest_path = target_path.sub("#{target}/", '')
+        sif_checksum = sif_manifest.digest(manifest_path)
+        unless external_checksum == sif_checksum
+          errors << I18n.translate('errors.checksum_mismatch', { c1: external_checksum, f1: source_path,
+                                                                 c2: sif_checksum, f2: target_path })
+        end
+      end
     end
   end
 end