#!/usr/bin/env ruby
# frozen_string_literal: true

require 'csv'
require 'set'

require 'to_regexp'
require 'active_support/core_ext'

# Command-line CSV filter.
#
# Reads one or more CSV files (first row = headers), optionally normalises
# cell values via flags, keeps only the rows that pass every test given on the
# command line, and writes the surviving rows as CSV to standard output.
class Cvg
  class << self
    # Splits a raw argument vector into flags, input paths and options.
    #
    # * `--NAME` where NAME is a key of FLAGS is a flag (takes no value).
    # * any other `--NAME` is an option and consumes the next argument as its
    #   value (even if that next argument itself starts with `--`).
    # * everything else is an input path.
    #
    # @param argv [Array<String>] command-line arguments
    # @return [Array(Array<String>, Array<String>, Array<Array(String, String)>)]
    #   `[flags, input_paths, options]`; names have the leading `--` removed
    # @raise [ArgumentError] if the last option has no value following it
    def parse(argv)
      flags = []
      options = []
      input_paths = []
      pending_option = nil

      argv.each do |arg|
        if pending_option
          options << [pending_option, arg]
          pending_option = nil
        elsif arg.start_with?('--')
          name = arg[2..-1]
          # Only a `--`-prefixed argument may be a flag; previously any
          # argument whose tail matched a flag name was accepted.
          if FLAGS.key?(name)
            flags << name
          else
            pending_option = name
          end
        else
          input_paths << arg
        end
      end

      # A trailing option used to be dropped silently, which made the filter
      # it was meant to add disappear without notice.
      raise ArgumentError, "option --#{pending_option} requires a value" if pending_option

      [flags, input_paths, options]
    end
  end

  # Base class for row tests. A test is built from an option name (`k`) and
  # its raw argument string (`arg`); subclasses implement `pass?(row)`.
  class Test
    attr_reader :k, :arg

    # @param k [String] option name, without the leading `--`
    # @param arg [String] raw option value
    def initialize(k, arg)
      @k = k
      @arg = arg
    end

    # Column name for tests whose argument has the form `column:rest`.
    # Only used by some subclasses.
    #
    # @return [String, nil] text before the first `:` in `arg`
    def col
      @col ||= arg.split(':', 2)[0]
    end

    # Column names for tests whose argument starts with a CSV list of columns.
    # Only used by some subclasses.
    #
    # @return [Array<String>] text before the first `:`, parsed as one CSV line
    def cols
      @cols ||= CSV.parse_line arg.split(':', 2)[0]
    end
  end

  # `--present COLUMN`: passes when the cell is present
  # (per ActiveSupport's `String#present?`).
  class Present < Test
    class << self
      # @return [Present, nil] a test when `k` is 'present', otherwise nil
      def accept(k, arg)
        new(k, arg) if k == 'present'
      end
    end

    # @param row [Hash] header => cell value
    # @raise [KeyError] if the row has no such column
    def pass?(row)
      row.fetch(arg).to_s.present?
    end
  end

  # `--missing COLUMN`: passes when the cell is nil, empty or only whitespace.
  class Missing < Test
    class << self
      # @return [Missing, nil] a test when `k` is 'missing', otherwise nil
      def accept(k, arg)
        new(k, arg) if k == 'missing'
      end
    end

    # @param row [Hash] header => cell value
    # @raise [KeyError] if the row has no such column
    def pass?(row)
      row.fetch(arg).to_s.strip.empty?
    end
  end

  # `--match COLUMN:M1,M2,...`: passes when the cell equals any literal
  # matcher or matches any regex matcher. A matcher starting with `/` or
  # `%r{` is converted to a Regexp via `String#to_regexp`.
  class Match < Test
    REGEX_START_1 = '/'
    REGEX_START_2 = '%r{'

    class << self
      # @return [Match, nil] a test when `k` is 'match', otherwise nil
      def accept(k, arg)
        new(k, arg) if k == 'match'
      end
    end

    # @param row [Hash] header => cell value
    # @raise [KeyError] if the row has no such column
    def pass?(row)
      v = row.fetch(col).to_s
      matchers.any? do |matcher|
        case matcher
        when ::Regexp
          matcher =~ v
        else
          matcher == v
        end
      end
    end

    # Matchers parsed from the part of `arg` after the first `:`.
    #
    # @return [Array<String, Regexp>]
    # @raise [ArgumentError] if a regex-looking matcher cannot be converted
    def matchers
      @matchers ||= CSV.parse_line(arg.split(':', 2)[1]).map do |raw|
        # An empty CSV field comes back as nil; treat it as the empty string
        # so it matches empty cells instead of crashing on nil.start_with?.
        x = raw.to_s
        if x.start_with?(REGEX_START_1, REGEX_START_2)
          # The original called an undefined `die` here.
          x.to_regexp || raise(ArgumentError, "invalid regex #{x.inspect}")
        else
          x
        end
      end
    end
  end

  # `--lt`, `--lte`, `--gt`, `--gte` with argument `COLUMN:THRESHOLD`:
  # numeric comparison of the cell against the threshold.
  class GreaterOrLesser < Test
    OPERATOR = {
      'lt'  => :<,
      'lte' => :<=,
      'gt'  => :>,
      'gte' => :>=,
    }.freeze
    NUMBER = /\d/
    JUST_A_NUMBER = /\A-?[\d.]+\z/
    NUMBER_FLUFF = /[^\d.eE\+\-]+/

    class << self
      # @return [GreaterOrLesser, nil] a test when `k` is a key of OPERATOR
      def accept(k, arg)
        new(k, arg) if OPERATOR[k]
      end

      # Converts a cell/argument to a Float.
      #
      # A plain number is converted directly; anything else containing a digit
      # has every run of characters outside `0-9 . e E + -` removed first.
      #
      # @param v [String, nil] value to convert
      # @param verify [Boolean] raise instead of returning nil on failure
      # @return [Float, nil] nil when `v` holds no digit and `verify` is false
      # @raise [RuntimeError] when `v` holds no digit and `verify` is true
      def numify(v, verify = false)
        case v
        when JUST_A_NUMBER
          v.to_f
        when NUMBER
          v.gsub(NUMBER_FLUFF, '').to_f
        else
          raise "#{v.inspect} not a number" if verify
        end
      end
    end

    # @param row [Hash] header => cell value
    # @return [Boolean, nil] nil (row rejected) when the cell is not numeric
    # @raise [KeyError] if the row has no such column
    def pass?(row)
      v = GreaterOrLesser.numify(row.fetch(col))
      v.public_send(operator, threshold) if v
    end

    # @return [Symbol] comparison method for this test's option name
    def operator
      @operator ||= OPERATOR.fetch(k)
    end

    # Threshold parsed from the part of `arg` after the first `:`.
    #
    # The whole `COLUMN:THRESHOLD` string used to be numified, so a column
    # name containing digits or any of `e E . + -` leaked into the number
    # (e.g. "price:5" became "e5" => 0.0).
    #
    # @return [Float]
    # @raise [RuntimeError] if the threshold part is absent or not a number
    def threshold
      @threshold ||= GreaterOrLesser.numify(arg.split(':', 2)[1], true)
    end
  end

  # `--dedup COL1,COL2,...`: passes a row only the first time its combination
  # of values in the given columns is seen.
  class Dedup < Test
    class << self
      # @return [Dedup, nil] a test when `k` is 'dedup', otherwise nil
      def accept(k, arg)
        new(k, arg) if k == 'dedup'
      end
    end

    # Hashes of the value combinations seen so far. A Set gives constant-time
    # membership checks; the Array used previously made dedup quadratic.
    #
    # @return [Set<Integer>]
    def registry
      @registry ||= Set.new
    end

    # @param row [Hash] header => cell value
    # @return [Boolean] true for the first occurrence, false for repeats
    def pass?(row)
      # Set#add? returns nil when the element was already present.
      !registry.add?(row.values_at(*cols).hash).nil?
    end
  end

  # `--detect-missing` flag: replaces cells holding a well-known "no value"
  # token with nil.
  class DetectMissing
    # `#DIV/0!` is the spelling Excel emits; the bare `#DIV/0` is kept for
    # backward compatibility.
    MISSING = %w{ N/A n/a NULL null - #DIV/0 #DIV/0! #REF! #NAME? NIL nil NA na #VALUE! #NULL! NaN #N/A #NUM! ? }.freeze

    # Mutates `row` in place.
    #
    # @param row [Hash] header => cell value
    def apply!(row)
      row.each do |k, v|
        row[k] = nil if v.is_a?(::String) && MISSING.include?(v)
      end
    end
  end

  TESTS = [Present, Missing, Match, GreaterOrLesser, Dedup].freeze

  FLAGS = {
    'detect-missing' => DetectMissing,
  }.freeze

  attr_reader :options
  attr_reader :input_paths

  # @param argv [Array<String>] command-line arguments (see Cvg.parse)
  def initialize(argv)
    @flags, @input_paths, @options = Cvg.parse argv
  end

  # Writes the header line, then every input row that passes all tests.
  def perform
    write_headers
    each_input_row do |row|
      write_row row if tests.all? { |t| t.pass?(row) }
    end
    close_output
  end

  private

  # Yields each data row of each input file as a Hash, after applying flags.
  def each_input_row
    input_paths.each do |path|
      CSV.foreach(path, headers: :first_row) do |csv_row|
        row = csv_row.to_hash
        flags.each { |flag| flag.apply! row }
        yield row
      end
    end
  end

  # Test instances built from the options; an option that no test class
  # accepts (e.g. 'output-fields') contributes nothing.
  def tests
    @tests ||= options.flat_map do |k, v|
      TESTS.map { |test_class| test_class.accept(k, v) }.compact
    end
  end

  # Instantiated flag handlers, in command-line order.
  def flags
    @_flags ||= @flags.map { |flag| FLAGS.fetch(flag).new }
  end

  def write_row(row)
    output_f.puts row.values_at(*fields).to_csv
  end

  def write_headers
    output_f.puts fields.to_csv
  end

  def output_f
    @output_f ||= $stdout
  end

  # Flushes the output. The process-wide $stdout is left open so that code
  # running after `perform` can still write to it; any other stream is closed.
  def close_output
    output_f.flush
    output_f.close unless output_f.equal?($stdout)
  end

  # Output columns: the 'output-fields' option (one CSV line) if given,
  # otherwise the union of the header rows of all inputs, in order.
  def fields
    @fields ||= begin
      user = options.find { |k, _v| k == 'output-fields' }
      if user
        CSV.parse_line user[1]
      else
        input_paths.flat_map do |path|
          header_line = File.open(path) { |f| f.gets }
          # An empty file has no header line; it adds no columns.
          header_line ? Array(CSV.parse_line(header_line)) : []
        end.uniq
      end
    end
  end
end

Cvg.new(ARGV).perform