lib/duracloud/sync_validation.rb in duracloud-client-0.8.0 vs lib/duracloud/sync_validation.rb in duracloud-client-0.9.0

- old
+ new

@@ -1,66 +1,149 @@ require 'active_model' require 'tempfile' require 'csv' +require 'fileutils' module Duracloud class SyncValidation include ActiveModel::Model TWO_SPACES = ' ' MD5_CSV_OPTS = { col_sep: TWO_SPACES }.freeze MANIFEST_CSV_OPTS = { col_sep: "\t", headers: true, return_headers: false }.freeze - attr_accessor :space_id, :content_dir, :store_id + MISSING = "MISSING" + CHANGED = "CHANGED" + FOUND = "FOUND" + attr_accessor :space_id, :content_dir, :store_id, :work_dir, :fast + def self.call(*args) new(*args).call end + def in_work_dir + if work_dir + FileUtils.cd(work_dir) { yield } + else + Dir.mktmpdir("#{space_id}-validation-") do |tmpdir| + FileUtils.cd(tmpdir) { yield } + end + end + end + def call - Tempfile.open("#{space_id}-manifest") do |manifest| + in_work_dir do + download_manifest + convert_manifest + audit + end + end + + def download_manifest + File.open(manifest_filename, "w") do |manifest| Manifest.download(space_id, store_id) do |chunk| manifest.write(chunk) end - manifest.close + end + end - # convert manifest into md5deep format - Tempfile.open("#{space_id}-md5") do |md5_list| - CSV.foreach(manifest.path, MANIFEST_CSV_OPTS) do |row| - md5_list.puts [ row[2], row[1] ].join(TWO_SPACES) - end - md5_list.close + def convert_manifest + File.open(converted_manifest_filename, "w") do |f| + CSV.foreach(manifest_filename, MANIFEST_CSV_OPTS) do |row| + f.puts [ row[2], row[1] ].join(TWO_SPACES) + end + end + end - # run md5deep to find files not listed in the manifest - Tempfile.open("#{space_id}-audit") do |audit| - audit.close - pid = spawn("md5deep", "-X", md5_list.path, "-l", "-r", ".", chdir: content_dir, out: audit.path) - Process.wait(pid) - case $?.exitstatus - when 0 - true - when 1, 2 - failures = [] - CSV.foreach(audit.path, MD5_CSV_OPTS) do |md5, path| - content_id = path.sub(/^\.\//, "") - begin - if !Duracloud::Content.exist?(space_id: space_id, store_id: store_id, content_id: content_id, md5: md5) - failures << [ "MISSING", md5, content_id ].join("\t") - end - rescue MessageDigestError => e - failures << [ "CHANGED", md5, content_id ].join("\t") - end - end - STDOUT.puts failures - failures.empty? - when 64 - raise Error, "md5deep user error." - when 128 - raise Error, "md5deep internal error." - end - end + def audit + outfile = File.join(FileUtils.pwd, audit_filename) + infile = File.join(FileUtils.pwd, converted_manifest_filename) + pid = spawn("md5deep", "-X", infile, "-l", "-r", ".", chdir: content_dir, out: outfile) + Process.wait(pid) + case $?.exitstatus + when 0 + true + when 1, 2 + recheck + when 64, 128 + raise Error, "md5deep error." + else + raise Error, "Unknown error." + end + end + + def recheck + success = true + recheck_file do |csv| + do_recheck.each do |result| + csv << result.to_a + success &&= result.found? end end + success + end + + private + + CheckResult = Struct.new(:status, :md5, :content_id) do + def found? + status == FOUND + end + end + + def recheck_file + if work_dir + CSV.open(recheck_filename, "w", col_sep: "\t") { |csv| yield(csv) } + else + CSV($stdout, col_sep: "\t") { |csv| yield(csv) } + end + end + + def check(content_id, md5 = nil) + status = begin + exist?(content_id, md5) ? FOUND : MISSING + rescue MessageDigestError => e + CHANGED + end + CheckResult.new(status, md5 || "-", content_id) + end + + def exist?(content_id, md5 = nil) + Duracloud::Content.exist?(space_id: space_id, store_id: store_id, content_id: content_id, md5: md5) + end + + def do_recheck + Enumerator.new do |e| + CSV.foreach(audit_filename, MD5_CSV_OPTS) do |md5, path| + content_id = path.sub(/^\.\//, "") + e << check(content_id, md5) + end + end + end + + def prefix + space_id + end + + def filename(suffix) + [ prefix, suffix ].join("-") + end + + def manifest_filename + filename("manifest.tsv") + end + + def converted_manifest_filename + filename("converted-manifest.txt") + end + + def audit_filename + filename("audit.txt") + end + + def recheck_filename + filename("recheck.txt") end end end