Sha256: b6a69a2f7bc366ccdc69cc78ae83bbcd751ce1ea5866822b673e67f057a5199a

Contents?: true

Size: 1.51 KB

Versions: 10

Compression:

Stored size: 1.51 KB

Contents

#!/usr/bin/env ruby

require 'optparse'

# Command-line options. :ignore_columns holds the header names whose values
# are blanked out before rows are fingerprinted for duplicate detection.
options = {
  ignore_columns: []
}

# The parser is kept in a local (rather than chained straight into parse!)
# so its option definitions remain available after parsing.
parser = OptionParser.new do |opts|
  opts.banner = "Usage: #{File.basename(__FILE__)} [options] <csv file>"

  opts.on('-h', '--help', 'Prints this help') do
    puts opts
    exit
  end

  opts.on('-i', '--ignore HEADERS', 'Comma-separated list of headers to ignore') do |v|
    options[:ignore_columns] = v.split(',')
  end
end

# parse! strips the recognised switches out of ARGV, so the CSV path is the
# first remaining argument afterwards.
parser.parse!

require 'digest/sha2'
require 'json'
require 'csv-utils'

# Finds rows of the CSV file given as ARGV[0] that are identical once the
# ignored columns are blanked, and writes them to "duplicates-<basename>" in
# the current directory. Each output line is prefixed with the SHA-256 key
# shared by its duplicate group and the row's zero-based index in iteration order.

# Fail with a readable message instead of handing nil to the iterator.
abort('missing <csv file> argument; run with --help for usage') if ARGV.empty?

csv = CSVUtils::CSVIterator.new(ARGV[0])

# Headers are taken from the first row; without one there is nothing to
# compare and calling #keys on nil would raise NoMethodError.
first_row = csv.first
abort("no rows found in #{ARGV[0]}") unless first_row
headers = first_row.keys

missing_headers = options[:ignore_columns] - headers
unless missing_headers.empty?
  raise("unknown headers #{missing_headers.join(', ')} configured as ignore headers")
end

# Pass 1: fingerprint every row (ignored columns blanked) and collect the
# indices of the rows sharing each fingerprint.
row_numbers_by_key = Hash.new { |hash, key| hash[key] = [] }

csv.each_with_index do |row, idx|
  options[:ignore_columns].each do |ignore_column|
    row[ignore_column] = ''
  end
  row_numbers_by_key[Digest::SHA256.hexdigest(row.to_json)] << idx
end

# Only fingerprints seen more than once describe duplicates.
row_numbers_by_key.select! { |_key, row_numbers| row_numbers.size > 1 }

# hashed_rows:    fingerprint => { row index => row (filled in by pass 2) }
# duplicate_rows: row index   => fingerprint
hashed_rows = {}
duplicate_rows = {}

row_numbers_by_key.each do |key, row_numbers|
  hashed_rows[key] = {}
  row_numbers.each do |row_number|
    duplicate_rows[row_number] = key
    hashed_rows[key][row_number] = nil
  end
end

# Pass 2: iterate the file again to pick up the duplicate rows as the
# iterator yields them, so the report is not built from the rows mutated in pass 1.
csv.each_with_index do |row, idx|
  next unless (key = duplicate_rows[idx])

  hashed_rows[key][idx] = row
end

CSV.open("duplicates-#{File.basename(ARGV[0])}", 'wb') do |out|
  out << ['duplicate_key', 'row_no'] + headers

  hashed_rows.each do |key, rows|
    rows.each do |idx, row|
      out << [key, idx] + row.values
    end
  end
end

Version data entries

10 entries across 10 versions & 1 rubygems

Version Path
csv-utils-0.3.24 bin/csv-duplicate-finder
csv-utils-0.3.23 bin/csv-duplicate-finder
csv-utils-0.3.22 bin/csv-duplicate-finder
csv-utils-0.3.21 bin/csv-duplicate-finder
csv-utils-0.3.20 bin/csv-duplicate-finder
csv-utils-0.3.19 bin/csv-duplicate-finder
csv-utils-0.3.18 bin/csv-duplicate-finder
csv-utils-0.3.17 bin/csv-duplicate-finder
csv-utils-0.3.16 bin/csv-duplicate-finder
csv-utils-0.3.15 bin/csv-duplicate-finder