transform.rb in remi-0.2.31

- old
+ new
@@ -481,17 +481,221 @@
           when :float
             Float(value)
           when :decimal
             Float("%.#{scale}f" % Float(value))
           when :date
-            Date.strptime(value, in_format)
+            value.is_a?(Date) ? value : Date.strptime(value, in_format) # value.is_a?(Date) is only needed because we stub date types with actual dates, rather than strings like we probably should
           when :datetime
             Time.strptime(value, in_format)
           else
             raise ArgumentError, "Unknown type enforcement: #{type}"
           end
         end
       end
     end
+
+
+
+
+
+    # Public: Converts strings into booleans.
+    # Uses a regex to convert strings representing booleans to actual booleans.
+    # The truthy regex is /^(t|true|y|yes|1)$/i and the falsey regex is /^(f|false|n|no|0)$/i
+    #
+    # allow_nils - Specifies whether to allow the result to include nils.  If this is set
+    #              to false, then the value is only checked against the truthy regex and
+    #              the returned value is false if it doesn't match.  If allow_nils
+    #              is set to true, then both the truthy and the falsey regex are checked.
+    #              If neither match, then the result is nil.  (Default: false).
+    #
+    # Examples:
+    #
+    # Truthy.new.to_proc.call('True')                         # => true
+    # Truthy.new.to_proc.call('Yes')                          # => true
+    # Truthy.new.to_proc.call('y')                            # => true
+    # Truthy.new.to_proc.call('Yessire')                      # => false
+    # Truthy.new.to_proc.call('0')                            # => false
+    # Truthy.new.to_proc.call('Pineapple')                    # => false
+    # Truthy.new(allow_nils: true).to_proc.call('Pineapple') # => nil
+    class Truthy < Transform
+      def initialize(*args, allow_nils: false, **kargs, &block)
+        super
+        @allow_nils = allow_nils
+
+        @true_regex = /^(t|true|y|yes|1)$/i
+        @false_regex = /^(f|false|n|no|0)$/i
+      end
+
+      def match_true(value)
+        !!value.match(@true_regex)
+      end
+
+      def match_false(value)
+        !!value.match(@false_regex)
+      end
+
+      def transform(value)
+        value = value.to_s
+
+        if @allow_nils
+          if match_true(value)
+            true
+          elsif match_false(value)
+            false
+          else
+            nil
+          end
+        else
+          match_true(value)
+        end
+      end
+    end
+
+
+    # Public: Applies a DataFrame grouping sieve.
+    #
+    # The DataFrame sieve can be used to simplify very complex nested
+    # if-then logic to group data into buckets.  Given a DataFrame
+    # with N columns, the first N-1 columns represent the variables
+    # needed to group data into buckets.  The last column is the
+    # desired group.  The sieve then progresses down the rows of the
+    # DataFrame and checks to see if the input data matches the values
+    # in the columns of the sieve.  Nils in the sieve are treated as
+    # wildcards and match anything.  The first row that matches wins
+    # and the sieve progression stops.
+    #
+    # sieve_df - The sieve, defined as a dataframe.  The arguments
+    #            to the transform must appear in the same order as the
+    #            first N-1 columns of the sieve.
+    #
+    #
+    # Examples:
+    #
+    #   # This sieve captures the following business logic
+    #   # 1 - All Undergraduate Nursing, regardless of contact, gets assigned to the :intensive group.
+    #   # 2 - All Undergraduate programs with contact get assigned to the :intensive group.
+    #   # 3 - All Undergraduate programs without a contact get assigned to the :base group.
+    #   # 4 - All Graduate engineering programs with a contact get assigned to the :intensive group.
+    #   # 5 - All other programs get assigned to the :base group
+    #   sieve_df = Daru::DataFrame.new([
+    #     [ 'Undergrad' , 'NURS' , nil   , :intensive ],
+    #     [ 'Undergrad' , nil    , true  , :intensive ],
+    #     [ 'Undergrad' , nil    , false , :base ],
+    #     [ 'Grad'      , 'ENG'  , true  , :intensive ],
+    #     [ nil         , nil    , nil   , :base ],
+    #     ].transpose,
+    #     order: [:level, :program, :contact, :group]
+    #     )
+    #
+    #   test_df = Daru::DataFrame.new([
+    #     ['Undergrad' , 'CHEM' , false],
+    #     ['Undergrad' , 'CHEM' , true],
+    #     ['Grad'      , 'CHEM' , true],
+    #     ['Undergrad' , 'NURS' , false],
+    #     ['Unknown'   , 'CHEM' , true],
+    #     ].transpose,
+    #     order: [:level, :program, :contact]
+    #   )
+    #
+    #   Remi::SourceToTargetMap.apply(test_df) do
+    #     map source(:level, :program, :contact,) .target(:group)
+    #     .transform(Remi::Transform::DataFrameSieve.new(sieve_df))
+    #   end
+    #
+    #   test_df
+    #   # =>  #<Daru::DataFrame:70099624408400 @name = d30888fd-6ca8-48dd-9be3-558f81ae1015 @size = 5>
+    #             level    program    contact      group
+    #      0  Undergrad       CHEM        nil       base
+    #      1  Undergrad       CHEM       true  intensive
+    #      2       Grad       CHEM       true       base
+    #      3  Undergrad       NURS        nil  intensive
+    #      4    Unknown       CHEM       true       base
+    class DataFrameSieve < Transform
+      def initialize(sieve_df, *args, **kargs, &block)
+        super
+        @sieve_df = sieve_df.transpose.to_h.values
+      end
+
+      def transform(*values)
+        sieve_keys = @sieve_df.first.index.to_a
+        sieve_result_key = sieve_keys.pop
+
+        @sieve_df.each.find do |sieve_row|
+          match_row = true
+          sieve_keys.each_with_index do |key,idx|
+            match_row &&= sieve_row[key].nil? || sieve_row[key] == values[idx]
+          end
+          match_row
+        end[sieve_result_key]
+      end
+    end
+
+
+    # Public: Used to partition elements into groups (buckets).
+    #
+    # buckets            - A hash where the keys are groups and the values are weights or percentages.
+    # initial_population - An object convertible to a hash (via to_h) holding a count of the
+    #                      number of elements already in each bucket.  (Default: {}).
+    #
+    # Example:
+    #
+    #   # The current population has 2 records in the A bucket and 3 in B
+    #   current_pop = Daru::Vector.new([2,3], index: ['A', 'B'])
+    #
+    #   # We want to generate 7 new records that will evenly populate the A, B, and C buckets, given the current populations.
+    #   part = Remi::Transform::Partitioner.new(buckets: { 'A' => 1, 'B' => 1,'C' => 1 }, initial_population: current_pop)
+    #
+    #   1.upto(7).map { |iter| part.call } # => ["C", "C", "A", "C", "C", "B", "A"]
+    class Partitioner < Transform
+      def initialize(buckets:, initial_population: {}, **kargs, &block)
+        super
+        @buckets = buckets
+        @current_population = sanitize_initial_population(buckets, initial_population)
+      end
+
+      attr_reader :buckets
+      attr_reader :current_population
+
+      def transform(*values)
+        get_next_value
+      end
+
+      def size
+        @size ||= @current_population.reduce(0) { |sum, (group, n)| sum += n }
+      end
+
+      def total_weight
+        @total_weight ||= @buckets.reduce(0) { |sum, (bucket, weight)| sum += 1.0 * weight }
+      end
+
+      def get_next_value
+        assigned = @buckets.max_by do |(group, weight)|
+          expected = @buckets[group] / total_weight * size
+          actual = @current_population[group]
+
+          diff = expected - actual
+          if diff > 0
+            rand**(1.0 / diff)
+          else
+            -rand**(- 1.0 / @buckets[group])
+          end
+        end.first
+
+        @current_population[assigned] += 1
+        @size += 1
+
+        assigned
+      end
+
+      private
+
+      def sanitize_initial_population(buckets, dist)
+        dist = dist.to_h
+
+        zero_distribution = buckets.keys.reduce({}) { |h, group| h[group] = 0; h }
+        zero_distribution.merge(dist.select { |k,v| buckets.keys.include? k })
+      end
+    end
+
 
   end
 end