module Remi
  class Transform

    # Public: Initializes the static arguments of a transform.
    #
    # source_metadata - Metadata for the transform source.
    # target_metadata - Metadata for the transform target.
    def initialize(*args, source_metadata: {}, target_metadata: {}, **kargs, &block)
      @source_metadata = source_metadata
      @target_metadata = target_metadata
      @multi_args = false
    end

    # Public: Accessor for source metadata
    attr_accessor :source_metadata

    # Public: Accessor for target metadata
    attr_accessor :target_metadata

    # Public: Set to true if the transform expects multiple arguments (default: false)
    attr_reader :multi_args

    # Public: Defines the operation of this transform class.
    #
    # value - The value to be transformed
    #
    # Returns the transformed value.
    # Raises NoMethodError unless overridden by a subclass.
    def transform(value)
      raise NoMethodError, "#{__method__} not defined for #{self.class.name}"
    end

    # Public: Allows one to call the proc defined by the transform so that
    # Remi::Transform instances can be used interchangeably with normal lambdas.
    #
    # args - The values to be transformed. They are dropped when the
    #        transform takes no arguments (e.g. Constant, Partitioner).
    #
    # Returns the transformed value.
    def call(*args)
      if to_proc.arity == 0
        to_proc.call
      else
        to_proc.call(*args)
      end
    end

    # Public: Returns the transform as a lambda (memoized).
    def to_proc
      @to_proc ||= method(:transform).to_proc
    end


    # Public: Transform used to prefix string values in a vector.
    #
    # prefix   - The string prefix.
    # if_blank - String value to substitute if the value is blank (default: '').
    #
    # Examples:
    #
    #  Prefix.new('CU').to_proc.call('123') # => "CU123"
    class Prefix < Transform
      def initialize(prefix, *args, if_blank: '', **kargs, &block)
        super
        @prefix = prefix
        @if_blank = if_blank
      end

      def transform(value)
        if value.blank?
          @if_blank
        else
          "#{@prefix}#{value}"
        end
      end
    end


    # Public: Transform used to postfix values in a vector.
    #
    # postfix  - The string postfix.
    # if_blank - String value to substitute if the value is blank (default: '').
    #
    # Examples:
    #
    #  Postfix.new('A').to_proc.call('123') # => "123A"
    class Postfix < Transform
      def initialize(postfix, *args, if_blank: '', **kargs, &block)
        super
        @postfix = postfix
        @if_blank = if_blank
      end

      def transform(value)
        if value.blank?
          @if_blank
        else
          "#{value}#{@postfix}"
        end
      end
    end


    # Public: Transform used to truncate values in a vector.
    #
    # len - The maximum length of the string.
    #
    # Examples:
    #
    #  Truncate.new(3).to_proc.call('1234') # => "123"
    class Truncate < Transform
      def initialize(len, *args, **kargs, &block)
        super
        @len = len
      end

      # A nil value is treated as an empty string.
      def transform(value)
        (value || '').slice(0, @len)
      end
    end


    # Public: Transform used to concatenate a list of values, joined by a delimiter.
    # Blank values are skipped.
    #
    # delimiter - The delimiter used between values in the list (default: '').
    #
    # Examples:
    #
    #  Concatenate.new('-').to_proc.call('a', 'b', 'c') # => "a-b-c"
    class Concatenate < Transform
      def initialize(delimiter='', *args, **kargs, &block)
        super
        @multi_args = true
        @delimiter = delimiter
      end

      def transform(row)
        row = SourceToTargetMap::Row[row]
        row.each_source.map { |_key, value| value.blank? ? nil : value }.compact.join(@delimiter)
      end
    end


    # Public: Transform used to do key-value lookup on hash-like objects
    #
    # lookup  - The lookup object that takes keys and returns values.
    # missing - What to use if a key is not found in the lookup (default: nil).  If this
    #           is a proc, it is sent the key as an argument.
    #
    # Examples:
    #
    #  my_lookup = { 1 => 'one', 2 => 'two' }
    #  Lookup.new(my_lookup).to_proc.call(1) # => "one"
    #  Lookup.new(my_lookup).to_proc.call(3) # => nil
    #  Lookup.new(my_lookup, missing: 'UNK').to_proc.call(3) # => "UNK"
    #  Lookup.new(my_lookup, missing: ->(v) { "I don't know #{v}" }).to_proc.call(3) # => "I don't know 3"
    class Lookup < Transform
      def initialize(lookup, *args, missing: nil, **kargs, &block)
        super
        @lookup = lookup
        @missing = missing
      end

      def transform(value)
        result = @lookup[value]

        # Only nil counts as "not found"; a stored false is a legitimate result.
        if !result.nil?
          result
        elsif @missing.respond_to? :call
          @missing.call(value)
        else
          @missing
        end
      end
    end


    # Public: (Next-Value-Lookup) transform used to find the first non-blank value in a list.
    #
    # default - What to use if all values are blank (default: '').
    #
    # Examples:
    #
    #  Nvl.new.to_proc.call(nil,'','a','b') # => "a"
    class Nvl < Transform
      def initialize(default='', *args, **kargs, &block)
        super
        @multi_args = true
        @default = default
      end

      def transform(row)
        row = SourceToTargetMap::Row[row]

        # The ifnone lambda supplies a [key, value] pair so that [1] works
        # whether or not a non-blank value was found.
        row.each_source.find(->() { [nil, @default] }) { |_key, value| !value.blank? }[1]
      end
    end


    # Public: Used to replace blank values.
    #
    # replace_with - Use this if the source value is blank (default: '').
    #
    # Examples:
    #
    #  IfBlank.new('MISSING VALUE').to_proc.call('alpha') # => "alpha"
    #  IfBlank.new('MISSING VALUE').to_proc.call('') # => "MISSING VALUE"
    class IfBlank < Transform
      def initialize(replace_with='', *args, **kargs, &block)
        super
        @replace_with = replace_with
      end

      def transform(value)
        value.blank? ? @replace_with : value
      end
    end


    # Public: Parses a string and converts it to a date.
    # This transform is metadata aware and will use :in_format metadata
    # from the source
    #
    # type      - Specify either :date, or :datetime type (default: date)
    # in_format - The date format to use to convert the string (default: uses :in_format
    #             from the source metadata.  If that is not defined, use '%Y-%m-%d').
    # if_blank  - Value to use if the the incoming value is blank (default: uses :if_blank
    #             from the source metadata.  If that is not defined, use nil).  If set to
    #             :high, then use the largest date, if set to :low, use the lowest date.
    #             If it responds to #call, it is called with the blank value.
    #
    # Examples:
    #
    #  ParseDate.new(in_format: '%m/%d/%Y').to_proc.call('02/22/2013') # => Date.new(2013,2,22)
    #
    #  tform = ParseDate.new
    #  tform.source_metadata = { in_format: '%m/%d/%Y' }
    #  tform.to_proc.call('02/22/2013') # => Date.new(2013,2,22)
    class ParseDate < Transform
      def initialize(*args, type: nil, in_format: nil, if_blank: nil, **kargs, &block)
        super
        @type = type
        @in_format = in_format
        @if_blank = if_blank
      end

      def type
        @type ||= @source_metadata.fetch(:type, :date)
      end

      def in_format
        @in_format ||= @source_metadata.fetch(:in_format, default_date_format)
      end

      def if_blank
        @if_blank ||= @source_metadata.fetch(:if_blank, nil)
      end

      def default_date_format
        if type == :datetime
          '%Y-%m-%d %H:%M:%S'
        else
          '%Y-%m-%d'
        end
      end

      # Values that already respond to #strftime are returned untouched.
      #
      # Raises ArgumentError (with the offending value and format in the
      # message) when the string cannot be parsed.
      def transform(value)
        if value.respond_to?(:strftime)
          value
        elsif value.blank?
          blank_handler(value)
        else
          string_to_date(value)
        end
      rescue ArgumentError => err
        raise err, "Error parsing date (#{value.class}): '#{value}' with format #{in_format})"
      end

      def class_type
        @class_type ||= type == :datetime ? Time : Date
      end

      def string_to_date(value)
        class_type.strptime(value, in_format)
      end

      def blank_handler(value)
        if if_blank == :low
          class_type.new(1900, 1, 1)
        elsif if_blank == :high
          class_type.new(2999, 12, 31)
        elsif if_blank.respond_to? :call
          if_blank.call(value)
        else
          if_blank
        end
      end
    end


    # Public: (Re)formats a date.
    # This transform is metadata aware and will use :in_format/:out_format metadata
    # from the source.
    #
    # type       - Specify either :date, or :datetime type (default: date)
    # in_format  - The date format used to parse the input value.  If the input value
    #              is a date, then this parameter is ignored.  (default: uses :in_format
    #              from the source metadata.  If that is not defined, use '%Y-%m-%d')
    # out_format - The date format applied to provide the resulting string.  (default:
    #              uses :out_format from the source metadata.  If that is not defined,
    #              use '%Y-%m-%d')
    #
    # Examples:
    #
    #  FormatDate.new(in_format: '%m/%d/%Y', out_format: '%Y-%m-%d').to_proc.call('02/22/2013') # => "2013-02-22"
    #
    #  tform = FormatDate.new
    #  tform.source_metadata = { in_format: '%m/%d/%Y', out_format: '%Y-%m-%d' }
    #  tform.to_proc.call('02/22/2013') # => "2013-02-22"
    class FormatDate < Transform
      def initialize(*args, type: nil, in_format: nil, out_format: nil, **kargs, &block)
        super
        @type = type
        @in_format = in_format
        @out_format = out_format
      end

      def type
        @type ||= @source_metadata.fetch(:type, :date)
      end

      def in_format
        @in_format ||= @source_metadata.fetch(:in_format, default_date_format)
      end

      def out_format
        @out_format ||= @source_metadata.fetch(:out_format, default_date_format)
      end

      def default_date_format
        if type == :datetime
          '%Y-%m-%d %H:%M:%S'
        else
          '%Y-%m-%d'
        end
      end

      def class_type
        @class_type ||= type == :datetime ? Time : Date
      end

      # Blank values become ''.  Values that respond to #strftime are only
      # formatted; anything else is parsed with in_format first.
      #
      # Raises ArgumentError (with the offending value and formats in the
      # message) when the string cannot be parsed.
      def transform(value)
        if value.blank?
          ''
        elsif value.respond_to? :strftime
          value.strftime(out_format)
        else
          class_type.strptime(value, in_format).strftime(out_format)
        end
      rescue ArgumentError => err
        raise err, "Error parsing date (#{value.class}): '#{value}' using the format #{in_format} => #{out_format}"
      end
    end


    # Public: Used to calculate differences between dates by a given measure.
    # The :months and :years measures compare calendar fields only (the day
    # of the month is ignored).
    #
    # measure - One of :days, :months, or :years. (default: :days).
    #
    # Examples:
    #
    #  DateDiff.new(:months).to_proc.call([Date.new(2016,1,30), Date.new(2016,3,1)]) # => 2
    class DateDiff < Transform
      def initialize(measure = :days, *args, **kargs, &block)
        super
        @multi_args = true
        @measure = measure
      end

      # Raises ArgumentError for an unknown measure.
      def transform(row)
        row = SourceToTargetMap::Row[row]

        # The first two keys of the row are the "from" and "to" dates.
        from_date = row[row.keys[0]]
        to_date = row[row.keys[1]]

        case @measure.to_sym
        when :days
          (to_date - from_date).to_i
        when :months
          (to_date.year * 12 + to_date.month) - (from_date.year * 12 + from_date.month)
        when :years
          to_date.year - from_date.year
        else
          raise ArgumentError, "Unknown date difference measure: #{@measure}"
        end
      end
    end


    # Public: Simply returns a constant.
    #
    # constant - The constant value to return.
    #
    # Examples:
    #
    #  Constant.new('ewoks').call('whatever') # => 'ewoks'
    class Constant < Transform
      def initialize(constant, *args, **kargs, &block)
        super
        @constant = constant
      end

      # Takes no arguments; Transform#call drops any that are supplied.
      def transform
        @constant
      end
    end


    # Public: Replaces one substring with another.
    #
    # to_replace   - The string or regex to be replaced.
    # replace_with - The value to substitute.
    #
    # Examples:
    #
    #  Replace.new(/\s/, '-').to_proc.call('hey jude') #=> 'hey-jude'
    class Replace < Transform
      def initialize(to_replace, replace_with, *args, **kargs, &block)
        super
        @to_replace = to_replace
        @replace_with = replace_with
      end

      # A nil value is treated as an empty string.
      def transform(value)
        (value || '').gsub(@to_replace, @replace_with)
      end
    end


    # Public: Checks to see if an email validates against a regex (imperfect)
    # and will substitute it with some value if not.
    #
    # substitute - The value used to substitute for an invalid email.  Can use a proc
    #              that accepts the value of the invalid email
    #
    # Examples:
    #
    #  ValidateEmail.new('invalid@example.com').to_proc.call('uhave.email') #=> 'invalid@example.com'
    #  ValidateEmail.new(->(v) { "#{SecureRandom.uuid}@example.com" }).to_proc.call('uhave.email') #=> '3f158f29-bc75-44f0-91ed-22fbe5157297@example.com'
    class ValidateEmail < Transform
      # Anchored with \A and \z so the WHOLE string must be an email.  The
      # line anchors ^ and $ would accept any multi-line string that merely
      # contains one valid-looking line.
      EMAIL_REGEX = /\A[A-Z0-9._%+-]+@(?:[A-Z0-9-]+\.)+[A-Z]{2,}\z/i

      def initialize(substitute='', *args, **kargs, &block)
        super
        @substitute = substitute
      end

      # A nil value is treated as an empty (and therefore invalid) email.
      def transform(value)
        value = value || ''

        if value.match(EMAIL_REGEX)
          value
        elsif @substitute.respond_to? :call
          @substitute.call value
        else
          @substitute
        end
      end
    end


    # Public: Enforces the type declared in the :type metadata field (if it exists)
    #
    # Examples:
    #
    #  tform = EnforceType.new
    #  tform.source_metadata = { type: :date, in_format: '%m/%d/%Y' }
    #  tform.to_proc.call('02/22/2013') # => Date.new(2013,2,22)
    #
    #  tform = EnforceType.new
    #  tform.source_metadata = { type: :integer }
    #  tform.to_proc.call('12') # => 12
    #
    #  tform = EnforceType.new
    #  tform.source_metadata = { type: :integer }
    #  tform.to_proc.call('12A') # => ArgumentError: invalid value for Integer(): "12A"
    class EnforceType < Transform
      def initialize(*args, **kargs, &block)
        super
      end

      def type
        @type ||= @source_metadata.fetch(:type, :string)
      end

      def in_format
        @in_format ||= @source_metadata.fetch(:in_format, '')
      end

      def scale
        @scale ||= @source_metadata.fetch(:scale, 0)
      end

      # Memoized with an explicit flag (rather than ||=) because nil is a
      # legitimate if_blank value.
      def if_blank
        return @if_blank if @if_blank_set
        @if_blank_set = true
        @if_blank = @source_metadata.fetch(:if_blank, default_if_blank)
      end

      def blank_handler(value)
        return value unless value.blank?

        if if_blank.respond_to? :to_proc
          if_blank.to_proc.call(value)
        else
          if_blank
        end
      end

      def default_if_blank
        type == :string ? '' : nil
      end

      def truthy(value)
        @truthy ||= Truthy.new(allow_nils: false).to_proc
        @truthy.call(value)
      end

      # Raises ArgumentError when the declared type is not recognised (or
      # when the underlying conversion rejects the value).
      def transform(value)
        # JSON is excluded here because [] and {} are blank but still valid
        # JSON values; the :json branch below handles them.
        if value.blank? && type != :json
          blank_handler(value)
        else
          case type
          when :string
            value
          when :integer
            Integer(value)
          when :float
            Float(value)
          when :decimal
            Float("%.#{scale}f" % Float(value))
          when :date
            value.is_a?(Date) ? value : Date.strptime(value, in_format)
          when :datetime
            value.is_a?(Time) ? value : Time.strptime(value, in_format)
          when :json
            if value.blank? && value != [] && value != {}
              blank_handler(value)
            else
              value.is_a?(Hash) || value.is_a?(Array) ? value : JSON.parse(value)
            end
          when :boolean
            # Ugh, there is a bug with Daru 0.1.4 that converts false values to nil when joining
            # For now, we'll just standardize boolean values (#to_s)
            truthy(value).to_s
          else
            raise ArgumentError, "Unknown type enforcement: #{type}"
          end
        end
      end
    end


    # Public: Converts strings into booleans.
    # Uses a regex to convert strings representing booleans to actual booleans.
    # The truthy regex is /^(t|true|y|yes|1)$/i and the falsey regex is /^(f|false|n|no|0)$/i
    #
    # allow_nils - Specifies whether to allow the result to include nils.  If this is set
    #              to false, then the value is only checked against the truthy regex and
    #              the returned value is false if it doesn't match.  If allow_nils
    #              is set to true, then both the truthy and the falsey regex are checked.
    #              If neither match, then the result is nil.  (Default: false).
    #
    # Examples:
    #
    #  Truthy.new.to_proc.call('True')      # => true
    #  Truthy.new.to_proc.call('Yes')       # => true
    #  Truthy.new.to_proc.call('y')         # => true
    #  Truthy.new.to_proc.call('Yessire')   # => false
    #  Truthy.new.to_proc.call('0')         # => false
    #  Truthy.new.to_proc.call('Pineapple') # => false
    #  Truthy.new(allow_nils: true).to_proc.call('Pineapple') # => nil
    class Truthy < Transform
      def initialize(*args, allow_nils: false, **kargs, &block)
        super
        @allow_nils = allow_nils

        @true_regex = /^(t|true|y|yes|1)$/i
        @false_regex = /^(f|false|n|no|0)$/i
      end

      def match_true(value)
        !!value.match(@true_regex)
      end

      def match_false(value)
        !!value.match(@false_regex)
      end

      def transform(value)
        value = value.to_s

        if @allow_nils
          if match_true(value)
            true
          elsif match_false(value)
            false
          else
            nil
          end
        else
          match_true(value)
        end
      end
    end


    # Public: Applies a DataFrame grouping sieve.
    #
    # The DataFrame sieve can be used to simplify very complex nested
    # if-then logic to group data into buckets.  Given a DataFrame
    # with N columns, the first N-1 columns represent the variables
    # needed to group data into buckets.  The last column is the
    # desired group.  The sieve then progresses down the rows of the
    # DataFrame and checks to see if the input data matches the values
    # in the columns of the sieve.  Nils in the sieve are treated as
    # wildcards and match anything.  Regexp values in the sieve are
    # matched against the input value.  The first row that matches wins
    # and the sieve progression stops.  If no row matches, the result
    # is nil.
    #
    # sieve_df - The sieve, defined as a dataframe.  The names of the
    #            sieve vectors must correspond to the names of the
    #            vectors in the dataframe source to target map.  The
    #            last vector in the sieve_df is used as the result of the sieve.
    #
    #
    # Examples:
    #
    #   # This sieve captures the following business logic
    #   # 1 - All Non-Graduate Nursing, regardless of contact, gets assigned to the :intensive group.
    #   # 2 - All Undergraduate programs with contact get assigned to the :intensive group.
    #   # 3 - All Undergraduate programs without a contact get assigned to the :base group.
    #   # 4 - All Graduate engineering programs with a contact get assigned to the :intensive group.
    #   # 5 - All other programs get assigned to the :base group
    #   sieve_df = Daru::DataFrame.new([
    #     [ 'Undergrad' , 'NURS' , nil   , :intensive ],
    #     [ 'Undergrad' , nil    , true  , :intensive ],
    #     [ 'Undergrad' , nil    , false , :base ],
    #     [ 'Grad'      , 'ENG'  , true  , :intensive ],
    #     [ nil         , nil    , nil   , :base ],
    #     ].transpose,
    #     order: [:level, :program, :contact, :group]
    #   )
    #
    #   test_df = Daru::DataFrame.new([
    #     ['Undergrad' , 'CHEM' , false],
    #     ['Undergrad' , 'CHEM' , true],
    #     ['Grad'      , 'CHEM' , true],
    #     ['Undergrad' , 'NURS' , false],
    #     ['Unknown'   , 'CHEM' , true],
    #     ].transpose,
    #     order: [:level, :program, :contact]
    #   )
    #
    #   Remi::SourceToTargetMap.apply(test_df) do
    #     map source(:level, :program, :contact,) .target(:group)
    #       .transform(Remi::Transform::DataFrameSieve.new(sieve_df))
    #   end
    #
    #   test_df
    #   # =>
    #   #        level    program    contact      group
    #   #   0  Undergrad       CHEM        nil       base
    #   #   1  Undergrad       CHEM       true  intensive
    #   #   2       Grad       CHEM       true       base
    #   #   3  Undergrad       NURS        nil  intensive
    #   #   4    Unknown       CHEM       true       base
    class DataFrameSieve < Transform
      def initialize(sieve_df, *args, **kargs, &block)
        super
        # One entry per sieve row, each indexed by the sieve's vector names.
        @sieve_table = sieve_df.transpose.to_h.values
      end

      # row - A source-to-target-map row; it must respond to #source_keys
      #       and #[].
      #
      # Returns the value in the last sieve column of the first matching
      # sieve row, or nil when no sieve row matches.
      # Raises ArgumentError when the row lacks one of the sieve's key columns.
      def transform(row)
        sieve_keys = @sieve_table.first.index.to_a
        sieve_result_key = sieve_keys.pop

        missing_keys = sieve_keys - row.source_keys
        raise ArgumentError, "#{missing_keys} not found in row" unless missing_keys.empty?

        matched = @sieve_table.find do |sieve_row|
          sieve_keys.all? { |sieve_key| sieve_match?(sieve_row[sieve_key], row[sieve_key]) }
        end

        # Guard against a sieve with no catch-all row: no match yields nil
        # rather than an incidental NoMethodError on nil.
        matched && matched[sieve_result_key]
      end

      private

      # Internal: True if a single sieve cell accepts the given row value.
      # nil is a wildcard, a Regexp is matched, anything else is compared with ==.
      def sieve_match?(sieve_value, row_value)
        return true if sieve_value.nil?

        if sieve_value.is_a?(Regexp)
          !!sieve_value.match(row_value)
        else
          sieve_value == row_value
        end
      end
    end


    # Public: Used to partition elements into groups (buckets).
    #
    # buckets            - A hash where the keys are groups and the values are weights or percentages.
    # initial_population - A hashable object holding a count of the current number of
    #                      elements in each bucket (default: {}).  Groups that are not
    #                      in buckets are ignored; buckets without a count start at 0.
    #
    # Example:
    #
    #   # The current population has 2 record in the A bucket and 3 in B
    #   current_pop = Daru::Vector.new([2,3], index: ['A', 'B'])
    #
    #   # We want to generate 7 new records that will evenly populate the A, B, and C buckets, given the current populations.
    #   part = Remi::Transform::Partitioner.new(buckets: { 'A' => 1, 'B' => 1,'C' => 1 }, initial_population: current_pop)
    #
    #   1.upto(7).map { |iter| part.call } # => ["C", "C", "A", "C", "C", "B", "A"]
    class Partitioner < Transform
      def initialize(buckets:, initial_population: {}, **kargs, &block)
        super
        @buckets = buckets
        @current_population = sanitize_initial_population(buckets, initial_population)
      end

      attr_reader :buckets
      attr_reader :current_population

      # Takes no arguments; each call assigns (and returns) the next bucket.
      def transform
        get_next_value
      end

      # Total number of elements across all buckets.  Memoized, then kept
      # up to date by get_next_value.
      def size
        @size ||= @current_population.reduce(0) { |sum, (_group, n)| sum + n }
      end

      def total_weight
        @total_weight ||= @buckets.reduce(0) { |sum, (_bucket, weight)| sum + 1.0 * weight }
      end

      # Randomly picks the next bucket, increments its population and
      # returns its key.  Buckets that are below their expected share get a
      # positive random score, all others get a negative one, so an
      # under-populated bucket always wins over a full one.
      def get_next_value
        assigned = @buckets.max_by do |(group, weight)|
          expected = weight / total_weight * size
          actual = @current_population[group]

          diff = expected - actual
          if diff > 0
            rand**(1.0 / diff)
          else
            -rand**(- 1.0 / weight)
          end
        end.first

        @current_population[assigned] += 1
        @size += 1

        assigned
      end

      private

      # Internal: Builds the starting population: every bucket starts at 0
      # and is overridden by the supplied counts for known buckets only.
      def sanitize_initial_population(buckets, dist)
        dist = dist.to_h
        groups = buckets.keys

        zero_distribution = groups.each_with_object({}) { |group, h| h[group] = 0 }
        zero_distribution.merge(dist.select { |k, _v| groups.include? k })
      end
    end
  end
end