#!/usr/bin/env ruby

require 'cvg/version'

# Answer --version before loading the heavier dependencies below.
if ARGV[0] == '--version'
  puts Cvg::VERSION
  exit
end

require 'set'
require 'csv'
require 'to_regexp'
require 'active_support'
require 'active_support/core_ext'
require 'hash_digest'

# Command-line CSV filter: reads one or more CSV files (first row = headers),
# keeps the rows that pass every requested test, and writes them as CSV to
# standard output.
#
# Arguments are classified by Cvg.parse into:
# * flags        - "--NAME" where NAME is a key of FLAGS (no value follows)
# * options      - "--NAME VALUE" pairs; a NAME accepted by one of TESTS
#                  becomes a row test, 'limit' and 'output-fields' are read
#                  directly by the runner
# * input paths  - everything else
class Cvg
  class << self
    # Splits a raw argument list into flags, input paths and options.
    #
    # @param argv [Array<String>] command-line arguments
    # @return [Array] a triple [flags, input_paths, options] where flags is
    #   an array of flag names (without the leading "--"), input_paths is an
    #   array of strings and options is an array of [name, value] pairs in
    #   the order given
    def parse(argv)
      flags = []
      options = []
      input_paths = []
      option_k = nil
      argv.each do |arg|
        if option_k
          # The previous argument was "--NAME"; this one is its value.
          options << [option_k, arg]
          option_k = nil
        elsif arg.start_with?('--') && FLAGS.key?(arg[2..-1])
          # The "--" prefix is required: otherwise a path such as "./count"
          # would be mistaken for the "count" flag.
          flags << arg[2..-1]
        elsif arg.start_with?('--')
          option_k = arg[2..-1]
        else
          input_paths << arg
        end
      end
      # NOTE: a trailing "--NAME" with no value is dropped without error.
      [flags, input_paths, options]
    end
  end

  # Base class for row tests. A test is built from one option pair: +k+ is
  # the option name and +arg+ its raw value. Subclasses implement
  # +self.accept(k, arg)+ (returns an instance when they handle +k+, nil
  # otherwise) and +pass?(row)+.
  class Test
    attr_reader :k
    attr_reader :arg

    def initialize(k, arg)
      @k = k
      @arg = arg
    end

    # only used sometimes
    # The part of +arg+ before the first ':' taken as a single column name.
    def col
      @col ||= arg.split(':', 2)[0]
    end

    # only used sometimes
    # The part of +arg+ before the first ':' parsed as a CSV list of column
    # names.
    def cols
      @cols ||= CSV.parse_line arg.split(':', 2)[0]
    end
  end

  # --present COL: passes when the column's value is present (ActiveSupport
  # String#present?). Raises KeyError if the row has no such column.
  class Present < Test
    class << self
      def accept(k, arg)
        new(k, arg) if k == 'present'
      end
    end

    def pass?(row)
      row.fetch(arg).to_s.present?
    end
  end

  # --any-present COL1,COL2,...: passes when at least one of the listed
  # columns has a present value.
  class AnyPresent < Test
    class << self
      def accept(k, arg)
        new(k, arg) if k == 'any-present'
      end
    end

    def pass?(row)
      row.values_at(*cols).any?(&:present?)
    end
  end

  # --missing COL: passes when the column's value is nil, empty or only
  # whitespace. Raises KeyError if the row has no such column.
  class Missing < Test
    class << self
      def accept(k, arg)
        new(k, arg) if k == 'missing'
      end
    end

    def pass?(row)
      row.fetch(arg).to_s.strip.length == 0
    end
  end

  # --match COL:V1,V2,...: passes when the column's value equals one of the
  # listed literals or matches one of the listed regexes. A listed value is
  # treated as a regex when it starts with "/" or "%r{".
  class Match < Test
    REGEX_START_1 = '/'
    REGEX_START_2 = '%r{'

    class << self
      def accept(k, arg)
        new(k, arg) if k == 'match'
      end
    end

    def pass?(row)
      v = row.fetch(col).to_s
      matchers.any? do |matcher|
        case matcher
        when ::Regexp
          matcher =~ v
        else
          matcher == v
        end
      end
    end

    # Literal strings and Regexp objects parsed from the part of +arg+ after
    # the first ':'.
    #
    # @raise [ArgumentError] when a regex-looking value cannot be converted
    #   (String#to_regexp, from the to_regexp gem, returned a falsy value)
    def matchers
      @matchers ||= CSV.parse_line(arg.split(':', 2)[1]).map do |x|
        if x.start_with?(REGEX_START_1, REGEX_START_2)
          # The original called an undefined `die` helper here, which
          # surfaced as a NoMethodError instead of the intended message.
          x.to_regexp || raise(ArgumentError, "invalid regex #{x.inspect}")
        else
          x
        end
      end
    end
  end

  # --lt / --lte / --gt / --gte COL:NUMBER: numeric comparison of the
  # column's value against a threshold. Rows whose value cannot be read as a
  # number do not pass.
  class GreaterOrLesser < Test
    OPERATOR = {
      'lt' => :<,
      'lte' => :<=,
      'gt' => :>,
      'gte' => :>=,
    }.freeze
    # Anything containing a digit is considered number-like.
    NUMBER = /\d/
    # Optional minus sign followed only by digits and dots.
    JUST_A_NUMBER = /\A-?[\d.]+\z/
    # Characters stripped from number-like strings before conversion.
    NUMBER_FLUFF = /[^\d.eE\+\-]+/

    class << self
      def accept(k, arg)
        new(k, arg) if OPERATOR[k]
      end

      # Converts a cell value to a Float.
      #
      # @param v [String, nil] raw value
      # @param verify [Boolean] raise instead of returning nil when +v+ is
      #   not number-like
      # @return [Float, nil] the number, or nil when +v+ contains no digit
      #   and +verify+ is false
      # @raise [RuntimeError] when +v+ contains no digit and +verify+ is true
      def numify(v, verify = false)
        case v
        when JUST_A_NUMBER
          v.to_f
        when NUMBER
          v.gsub(NUMBER_FLUFF, '').to_f
        else
          raise "#{v.inspect} not a number" if verify
        end
      end
    end

    def pass?(row)
      if v = GreaterOrLesser.numify(row.fetch(col))
        v.send(operator, threshold)
      end
    end

    # Comparison method symbol selected by the option name.
    def operator
      @operator ||= OPERATOR.fetch(k)
    end

    # Number after the first ':' in +arg+; raises if it is not number-like.
    def threshold
      @threshold ||= GreaterOrLesser.numify arg.split(':', 2)[1], true
    end
  end

  # --dedup COL1,COL2,...: passes only the first row seen for each distinct
  # combination of the listed columns. Combinations are remembered by the
  # digest returned from HashDigest.digest3.
  class Dedup < Test
    class << self
      def accept(k, arg)
        new(k, arg) if k == 'dedup'
      end
    end

    # Digests of the column combinations seen so far.
    def registry
      @registry ||= Set.new
    end

    def pass?(row)
      digest = HashDigest.digest3 row.values_at(*cols)
      if registry.include?(digest)
        false
      else
        # Set#<< returns the set itself, which is truthy.
        registry << digest
      end
    end
  end

  # Base class for flags. A flag declares when it wants to run by overriding
  # the predicate methods below; all of them default to nil (never).
  class Flag
    attr_reader :parent

    # @param parent [Cvg] the runner that owns this flag
    def initialize(parent)
      @parent = parent
    end

    # Optional mode symbol that alters the runner (see Count#mode).
    def mode
      nil
    end

    # Truthy when apply! should run on every row before the tests.
    def per_row_pre_test?
      nil
    end

    # Truthy when apply! should run on every row that passed all tests.
    def per_row_passed_test?
      nil
    end

    # Truthy when finalize should run after all rows have been processed.
    def final?
      nil
    end
  end

  # --detect-missing: before the tests run, replaces string values that are
  # blank or equal to one of the MISSING markers with nil.
  class DetectMissing < Flag
    MISSING = %w{ N/A n/a NULL null - #DIV/0 #REF! #NAME? NIL nil NA na #VALUE! #NULL! NaN #N/A #NUM! ? }.freeze

    # Mutates +row+ in place.
    def apply!(row)
      row.each do |k, v|
        if v.is_a?(::String) && (MISSING.include?(v) || v.strip.empty?)
          row[k] = nil
        end
      end
    end

    def per_row_pre_test?
      true
    end
  end

  # --count: counts the rows that pass all tests and prints "Count: N" at
  # the end instead of writing headers and rows.
  class Count < Flag
    attr_reader :num

    def initialize(*)
      super
      @num = 0
    end

    def apply!(row)
      @num += 1
    end

    def finalize
      parent.output_f.puts "Count: #{num}"
    end

    def mode
      :dont_write_rows
    end

    def per_row_passed_test?
      true
    end

    def final?
      true
    end
  end

  # Test classes offered every option pair, in this order.
  TESTS = [Present, Missing, Match, GreaterOrLesser, Dedup, AnyPresent].freeze

  # Flag name (without "--") to implementing class.
  FLAGS = {
    'detect-missing' => DetectMissing,
    'count' => Count,
  }.freeze

  attr_reader :options
  attr_reader :input_paths

  # @param argv [Array<String>] command-line arguments (see Cvg.parse)
  def initialize(argv)
    @flags, @input_paths, @options = Cvg.parse argv
  end

  # Runs the whole pipeline: headers, filtered rows, final flag output, then
  # closes the output stream.
  def perform
    write_headers if write_rows?
    each_input_row do |row|
      # Tests are evaluated in option order and stop at the first failure.
      next unless tests.all? { |t| t.pass?(row) }
      per_row_passed_test_flags.each { |flag| flag.apply! row }
      write_row row if write_rows?
    end
    final_flags.each { |flag| flag.finalize }
    close_output
  end

  # Stream that receives all output; public because flags write to it.
  def output_f
    @output_f ||= $stdout
  end

  private

  # Yields each input row as a Hash (header => value), after pre-test flags
  # have been applied, across all input files and up to +limit+ rows total.
  def each_input_row
    count = 0
    input_paths.each do |path|
      # Stop reading as soon as the limit is reached; the original kept
      # parsing (and discarding) every remaining row of every file.
      break if count >= limit
      CSV.foreach(path, headers: :first_row) do |csv_row|
        break if count >= limit
        count += 1
        row = csv_row.to_hash
        per_row_pre_test_flags.each { |flag| flag.apply! row }
        yield row
      end
    end
  end

  # Row tests built from the options. Every option pair is offered to every
  # class in TESTS; pairs no class accepts produce no test.
  def tests
    @tests ||= begin
      memo = []
      options.each do |k, v|
        TESTS.each do |test_class|
          if test = test_class.accept(k, v)
            memo << test
          end
        end
      end
      memo
    end
  end

  # False when any flag requested the :dont_write_rows mode.
  def write_rows?
    return @write_rows_query if defined?(@write_rows_query)
    @write_rows_query = !modes.include?(:dont_write_rows)
  end

  # Maximum number of input rows to read: the rounded value of the first
  # 'limit' option, or infinity when none was given.
  def limit
    return @limit if defined?(@limit)
    limit_option = options.detect { |k, _v| k == 'limit' }
    @limit = limit_option ? limit_option[1].to_f.round : Float::INFINITY
  end

  def modes
    @modes ||= flags.map(&:mode).flatten.compact
  end

  def per_row_pre_test_flags
    @per_row_pre_test_flags ||= flags.select { |flag| flag.per_row_pre_test? }
  end

  def per_row_passed_test_flags
    @per_row_passed_test_flags ||= flags.select { |flag| flag.per_row_passed_test? }
  end

  def final_flags
    @final_flags ||= flags.select { |flag| flag.final? }
  end

  # Flag instances for the flag names collected by Cvg.parse.
  def flags
    @_flags ||= @flags.map { |flag| FLAGS.fetch(flag).new(self) }
  end

  def write_row(row)
    output_f.puts row.values_at(*fields).to_csv
  end

  def write_headers
    output_f.puts fields.to_csv
  end

  def close_output
    output_f.close
  end

  # Output columns: the first 'output-fields' option parsed as a CSV list,
  # or else the union of the header rows of all input files in order.
  def fields
    @fields ||= begin
      if user = options.detect { |k, _v| k == 'output-fields' }
        CSV.parse_line user[1]
      else
        input_paths.map do |path|
          header_line = File.open(path) { |f| f.gets }
          # An empty file has no header line; it contributes no fields
          # instead of crashing CSV.parse_line with nil.
          header_line ? (CSV.parse_line(header_line) || []) : []
        end.flatten.uniq
      end
    end
  end
end

Cvg.new(ARGV).perform