lib/remi/transform.rb in remi-0.2.30 vs lib/remi/transform.rb in remi-0.2.31

- old
+ new

@@ -481,17 +481,221 @@ when :float Float(value) when :decimal Float("%.#{scale}f" % Float(value)) when :date - Date.strptime(value, in_format) + value.is_a?(Date) ? value : Date.strptime(value, in_format) # value.is_a?(Date) is only needed becuase we stub date types with actual dates, rather than strings like we probably should when :datetime Time.strptime(value, in_format) else raise ArgumentError, "Unknown type enforcement: #{type}" end end end end + + + + + + # Public: Converts strings into booleans. + # Uses a regex to convert strings representing booleans to actual booleans. + # The truthy regex is /^(t|true|y|yes|1)$/i and the falsey regex is /^(f|false|n|no|0)$/i + # + # allow_nils - Specifies whether to allow the result to include nils. If this is set + # to false, then the value is only checked against the truthy regex and + # the returned value is false if it doesn't match. If allow_nils + # is set to true, the both the truthy and the falsey regex are checked. + # If neither match, then the result is nil. (Default: false). + # + # Examples: + # + # Truthy.new.to_proc.call('True') # => true + # Truthy.new.to_proc.call('Yes') # => true + # Truthy.new.to_proc.call('y') # => true + # Truthy.new.to_proc.call('Yessire') # => false + # Truthy.new.to_proc.call('0') # => false + # Truthy.new.to_proc.call('Pineapple') # => false + # Truthy.new(allow_nils: false).to_proc.call('Pineapple') # => nil + class Truthy < Transform + def initialize(*args, allow_nils: false, **kargs, &block) + super + @allow_nils = allow_nils + + @true_regex = /^(t|true|y|yes|1)$/i + @false_regex = /^(f|false|n|no|0)$/i + end + + def match_true(value) + !!value.match(@true_regex) + end + + def match_false(value) + !!value.match(@false_regex) + end + + def transform(value) + value = value.to_s + + if @allow_nils + if match_true(value) + true + elsif match_false(value) + false + else + nil + end + else + match_true(value) + end + end + end + + + # Public: Applies a DataFrame grouping sieve. 
#
# The DataFrame sieve can be used to simplify very complex nested
# if-then logic to group data into buckets.  Given a DataFrame
# with N columns, the first N-1 columns represent the variables
# needed to group data into buckets.  The last column is the
# desired group.  The sieve then progresses down the rows of the
# DataFrame and checks to see if the input data matches the values
# in the columns of the sieve.  Nils in the sieve are treated as
# wildcards and match anything.  The first row that matches wins
# and the sieve progression stops.  If no row matches, the result
# is nil.
#
# sieve_df - The sieve, defined as a dataframe.  The arguments
#            to the transform must appear in the same order as the
#            first N-1 columns of the sieve.
#
#
# Examples:
#
#   # This sieve captures the following business logic
#   # 1 - All Non-Graduate Nursing, regardless of contact, gets assigned to the :intensive group.
#   # 2 - All Undergraduate programs with contact get assigned to the :intensive group.
#   # 3 - All Undergraduate programs without a contact get assigned to the :base group.
#   # 4 - All Graduate engineering programs with a contact get assigned to the :intensive group.
#   # 5 - All other programs get assigned to the :base group
#   sieve_df = Daru::DataFrame.new([
#       [ 'Undergrad' , 'NURS' , nil   , :intensive ],
#       [ 'Undergrad' , nil    , true  , :intensive ],
#       [ 'Undergrad' , nil    , false , :base ],
#       [ 'Grad'      , 'ENG'  , true  , :intensive ],
#       [ nil         , nil    , nil   , :base ],
#     ].transpose,
#     order: [:level, :program, :contact, :group]
#   )
#
#   test_df = Daru::DataFrame.new([
#       ['Undergrad' , 'CHEM' , false],
#       ['Undergrad' , 'CHEM' , true],
#       ['Grad'      , 'CHEM' , true],
#       ['Undergrad' , 'NURS' , false],
#       ['Unknown'   , 'CHEM' , true],
#     ].transpose,
#     order: [:level, :program, :contact]
#   )
#
#   Remi::SourceToTargetMap.apply(test_df) do
#     map source(:level, :program, :contact,) .target(:group)
#       .transform(Remi::Transform::DataFrameSieve.new(sieve_df))
#   end
#
#   test_df
#   # => #<Daru::DataFrame:70099624408400 @name = d30888fd-6ca8-48dd-9be3-558f81ae1015 @size = 5>
#   #            level  program  contact      group
#   #    0   Undergrad     CHEM      nil       base
#   #    1   Undergrad     CHEM     true  intensive
#   #    2        Grad     CHEM     true       base
#   #    3   Undergrad     NURS      nil  intensive
#   #    4     Unknown     CHEM     true       base
class DataFrameSieve < Transform
  # sieve_df - The sieve dataframe.  It is transposed and converted to a
  #            list with one entry per sieve row, in sieve order.
  #
  # All arguments are forwarded to the superclass initializer by the bare `super`.
  def initialize(sieve_df, *args, **kargs, &block)
    super
    @sieve_df = sieve_df.transpose.to_h.values
  end

  # values - The input values, in the same order as the first N-1 sieve columns.
  #
  # Returns the value in the last column of the first sieve row that matches,
  # or nil if no row matches (including when the sieve has no rows).
  def transform(*values)
    return nil if @sieve_df.empty?

    # Every row shares the same index; its last entry names the result column.
    sieve_keys = @sieve_df.first.index.to_a
    sieve_result_key = sieve_keys.pop

    matched_row = @sieve_df.find do |sieve_row|
      sieve_keys.each_with_index.all? do |key, idx|
        sieve_value = sieve_row[key]
        # A nil in the sieve is a wildcard for that column.
        sieve_value.nil? || sieve_value == values[idx]
      end
    end

    # Guard against a miss: indexing into nil would raise NoMethodError.
    matched_row && matched_row[sieve_result_key]
  end
end


# Public: Used to partition elements into groups (buckets).
#
# buckets            - A hash where the keys are groups and the values are weights or percentages.
# initial_population - A hashable object holding a count of the current number of
#                      elements in each bucket.
+ # + # Example: + # + # # The current population has 2 record in the A bucket and 3 in B + # current_pop = Daru::Vector.new([2,3], index: ['A', 'B']) + # + # # We want to generate 7 new records that will evenly populate the A, B, and C buckets, given the current populations. + # part = Remi::Transform::Partitioner.new(buckets: { 'A' => 1, 'B' => 1,'C' => 1 }, initial_population: current_pop) + # + # 1.upt(7).map { |iter| part.call } # => ["C", "C", "A", "C", "C", "B", "A"] + class Partitioner < Transform + def initialize(buckets:, initial_population: {}, **kargs, &block) + super + @buckets = buckets + @current_population = sanitize_initial_population(buckets, initial_population) + end + + attr_reader :buckets + attr_reader :current_population + + def transform(*values) + get_next_value + end + + def size + @size ||= @current_population.reduce(0) { |sum, (group, n)| sum += n } + end + + def total_weight + @total_weight ||= @buckets.reduce(0) { |sum, (bucket, weight)| sum += 1.0 * weight } + end + + def get_next_value + assigned = @buckets.max_by do |(group, weight)| + expected = @buckets[group] / total_weight * size + actual = @current_population[group] + + diff = expected - actual + if diff > 0 + rand**(1.0 / diff) + else + -rand**(- 1.0 / @buckets[group]) + end + end.first + + @current_population[assigned] += 1 + @size += 1 + + assigned + end + + private + + def sanitize_initial_population(buckets, dist) + dist = dist.to_h + + zero_distribution = buckets.keys.reduce({}) { |h, group| h[group] = 0; h } + zero_distribution.merge(dist.select { |k,v| buckets.keys.include? k }) + end + end + end end