lib/eps/linear_regression.rb in eps-0.2.1 vs lib/eps/linear_regression.rb in eps-0.3.0

- old
+ new

@@ -1,30 +1,107 @@ module Eps class LinearRegression < BaseEstimator - def initialize(coefficients: nil, gsl: nil) - @coefficients = Hash[coefficients.map { |k, v| [k.is_a?(Array) ? [k[0].to_sym, k[1]] : k.to_sym, v] }] if coefficients - @gsl = gsl.nil? ? defined?(GSL) : gsl + # pmml + + def self.load_pmml(data) + super do |data| + # TODO more validation + node = data.css("RegressionTable") + + coefficients = { + "_intercept" => node.attribute("intercept").value.to_f + } + + features = {} + + text_features, derived_fields = extract_text_features(data, features) + + node.css("NumericPredictor").each do |n| + name = n.attribute("name").value + if derived_fields[name] + name = derived_fields[name] + else + features[name] = "numeric" + end + coefficients[name] = n.attribute("coefficient").value.to_f + end + + node.css("CategoricalPredictor").each do |n| + name = n.attribute("name").value + coefficients[[name, n.attribute("value").value]] = n.attribute("coefficient").value.to_f + features[name] = "categorical" + end + + Evaluators::LinearRegression.new(coefficients: coefficients, features: features, text_features: text_features) + end end - def train(*args) - super + def coefficients + @evaluator.coefficients + end - x, @coefficient_names = prep_x(@x) + def r2 + @r2 ||= (sst - sse) / sst + end - if x.size <= @coefficient_names.size - raise "Number of samples must be at least two more than number of features" + def adjusted_r2 + @adjusted_r2 ||= (mst - mse) / mst + end + + private + + # https://people.richland.edu/james/ictcm/2004/multiple.html + def _summary(extended: false) + coefficients = @coefficients + str = String.new("") + len = [coefficients.keys.map(&:size).max, 15].max + if extended + str += "%-#{len}s %12s %12s %12s %12s\n" % ["", "coef", "stderr", "t", "p"] + else + str += "%-#{len}s %12s %12s\n" % ["", "coef", "p"] end + coefficients.each do |k, v| + if extended + str += "%-#{len}s %12.2f %12.2f %12.2f %12.3f\n" % [display_field(k), v, std_err[k], t_value[k], p_value[k]] + else + str += "%-#{len}s %12.2f %12.3f\n" % [display_field(k), v, p_value[k]] + end + end + str += "\n" + str += "r2: %.3f\n" % [r2] if extended + str += "adjusted r2: %.3f\n" % [adjusted_r2] + str + end + def _train(**options) + raise "Target must be numeric" if @target_type != "numeric" + check_missing_value(@train_set) + check_missing_value(@validation_set) if @validation_set + + data = prep_x(@train_set) + + if data.size < data.columns.size + 2 + raise "Number of data points must be at least two more than number of features" + end + + x = data.map_rows(&:to_a) + data.size.times do |i| + # add intercept + x[i].unshift(1) + end + + gsl = options.key?(:gsl) ? options[:gsl] : defined?(GSL) + v3 = - if @gsl + if gsl x = GSL::Matrix.alloc(*x) - y = GSL::Vector.alloc(@y) + y = GSL::Vector.alloc(data.label) c, @covariance, _, _ = GSL::MultiFit::linear(x, y) c.to_a else x = Matrix.rows(x) - y = Matrix.column_vector(@y) + y = Matrix.column_vector(data.label) removed = [] # https://statsmaths.github.io/stat612/lectures/lec13/lecture13.pdf # unforutnately, this method is unstable # haven't found an efficient way to do QR-factorization in Ruby @@ -65,215 +142,82 @@ raise "Multiple solutions - GSL is needed to select one" end end # huge performance boost # by multiplying xt * y first - v2 = matrix_arr(@xtxi * (xt * y)) + v2 = @xtxi * (xt * y) + # convert to array + v2 = v2.to_a.map { |xi| xi[0].to_f } + # add back removed removed.sort.each do |i| v2.insert(i, 0) end @removed = removed v2 end + @coefficient_names = ["_intercept"] + data.columns.keys @coefficients = Hash[@coefficient_names.zip(v3)] + Evaluators::LinearRegression.new(coefficients: @coefficients, features: @features, text_features: @text_features) end - # legacy + def generate_pmml + predictors = @coefficients.dup + predictors.delete("_intercept") - def coefficients - Hash[@coefficients.map { |k, v| [Array(k).join.to_sym, v] }] - end - - # ruby - - def self.load(data) - new(Hash[data.map { |k, v| [k.to_sym, v] }]) - end - - def dump - {coefficients: coefficients} - end - - # json - - def self.load_json(data) - data = JSON.parse(data) if data.is_a?(String) - coefficients = data["coefficients"] - - # for R models - if coefficients["(Intercept)"] - coefficients = coefficients.dup - coefficients["_intercept"] = coefficients.delete("(Intercept)") - end - - new(coefficients: coefficients) - end - - def to_json - JSON.generate(dump) - end - - # pmml - - def self.load_pmml(data) - # TODO more validation - node = data.css("RegressionTable") - coefficients = { - _intercept: node.attribute("intercept").value.to_f - } - node.css("NumericPredictor").each do |n| - coefficients[n.attribute("name").value] = n.attribute("coefficient").value.to_f - end - node.css("CategoricalPredictor").each do |n| - coefficients[[n.attribute("name").value.to_sym, n.attribute("value").value]] = n.attribute("coefficient").value.to_f - end - new(coefficients: coefficients) - end - - def to_pmml - predictors = @coefficients.reject { |k| k == :_intercept } - data_fields = {} - predictors.each do |k, v| - if k.is_a?(Array) - (data_fields[k[0]] ||= []) << k[1] + @features.each do |k, type| + if type == "categorical" + data_fields[k] = predictors.keys.select { |k, v| k.is_a?(Array) && k.first == k }.map(&:last) else data_fields[k] = nil end end - builder = Nokogiri::XML::Builder.new do |xml| - xml.PMML(version: "4.3", xmlns: "http://www.dmg.org/PMML-4_3", "xmlns:xsi" => "http://www.w3.org/2001/XMLSchema-instance") do - xml.Header - xml.DataDictionary do - data_fields.each do |k, vs| - if vs - xml.DataField(name: k, optype: "categorical", dataType: "string") do - vs.each do |v| - xml.Value(value: v) - end - end - else - xml.DataField(name: k, optype: "continuous", dataType: "double") - end + build_pmml(data_fields) do |xml| + xml.RegressionModel(functionName: "regression") do + xml.MiningSchema do + @features.each do |k, _| + xml.MiningField(name: k) end end - xml.RegressionModel(functionName: "regression") do - xml.MiningSchema do - data_fields.each do |k, _| - xml.MiningField(name: k) - end - end - xml.RegressionTable(intercept: @coefficients[:_intercept]) do - predictors.each do |k, v| - if k.is_a?(Array) - xml.CategoricalPredictor(name: k[0], value: k[1], coefficient: v) + pmml_local_transformations(xml) + xml.RegressionTable(intercept: @coefficients["_intercept"]) do + predictors.each do |k, v| + if k.is_a?(Array) + if @features[k.first] == "text" + xml.NumericPredictor(name: display_field(k), coefficient: v) else - xml.NumericPredictor(name: k, coefficient: v) + xml.CategoricalPredictor(name: k[0], value: k[1], coefficient: v) end + else + xml.NumericPredictor(name: k, coefficient: v) end end end end - end.to_xml - end - - # pfa - - def self.load_pfa(data) - data = JSON.parse(data) if data.is_a?(String) - init = data["cells"].first[1]["init"] - names = - if data["input"]["fields"] - data["input"]["fields"].map { |f| f["name"] } - else - init["coeff"].map.with_index { |_, i| "x#{i}" } - end - coefficients = { - _intercept: init["const"] - } - init["coeff"].each_with_index do |c, i| - name = names[i] - # R can export coefficients with same name - raise "Coefficients with same name" if coefficients[name] - coefficients[name] = c end - new(coefficients: coefficients) end - # metrics - - def self.metrics(actual, estimated) - errors = actual.zip(estimated).map { |yi, yi2| yi - yi2 } - - { - me: mean(errors), - mae: mean(errors.map { |v| v.abs }), - rmse: Math.sqrt(mean(errors.map { |v| v**2 })) - } - end - - # private - def self.mean(arr) - arr.inject(0, &:+) / arr.size.to_f - end - - # https://people.richland.edu/james/ictcm/2004/multiple.html - def summary(extended: false) - coefficients = @coefficients - str = String.new("") - len = [coefficients.keys.map(&:size).max, 15].max - if extended - str += "%-#{len}s %12s %12s %12s %12s\n" % ["", "coef", "stderr", "t", "p"] - else - str += "%-#{len}s %12s %12s\n" % ["", "coef", "p"] - end - coefficients.each do |k, v| - if extended - str += "%-#{len}s %12.2f %12.2f %12.2f %12.3f\n" % [display_field(k), v, std_err[k], t_value[k], p_value[k]] - else - str += "%-#{len}s %12.2f %12.3f\n" % [display_field(k), v, p_value[k]] + def prep_x(x) + x = x.dup + @features.each do |k, type| + if type == "categorical" + values = x.columns.delete(k) + labels = values.uniq[1..-1] + labels.each do |label| + x.columns[[k, label]] = values.map { |v| v == label ? 1 : 0 } + end end end - str += "\n" - str += "r2: %.3f\n" % [r2] if extended - str += "adjusted r2: %.3f\n" % [adjusted_r2] - str + prep_text_features(x) + x end - def r2 - @r2 ||= (sst - sse) / sst - end - - def adjusted_r2 - @adjusted_r2 ||= (mst - mse) / mst - end - - private - - def _predict(x) - x, c = prep_x(x, train: false) - coef = c.map do |v| - # use 0 if coefficient does not exist - # this can happen for categorical features - # since only n-1 coefficients are stored - @coefficients[v] || 0 - end - - x = Matrix.rows(x) - c = Matrix.column_vector(coef) - matrix_arr(x * c) - end - - def display_field(k) - k.is_a?(Array) ? k.join("") : k - end - def constant?(arr) arr.all? { |x| x == arr[0] } end # add epsilon for perfect fits @@ -287,11 +231,11 @@ Hash[@coefficients.map do |k, _| tp = if @gsl GSL::Cdf.tdist_P(t_value[k].abs, degrees_of_freedom) else - tdist_p(t_value[k].abs, degrees_of_freedom) + Eps::Statistics.tdist_p(t_value[k].abs, degrees_of_freedom) end [k, 2 * (1 - tp)] end] end @@ -320,238 +264,40 @@ def covariance @covariance ||= mse * @xtxi end def y_bar - @y_bar ||= mean(@y) + @y_bar ||= mean(@train_set.label) end def y_hat - @y_hat ||= predict(@x) + @y_hat ||= predict(@train_set) end # total sum of squares def sst - @sst ||= @y.map { |y| (y - y_bar)**2 }.sum + @sst ||= @train_set.label.map { |y| (y - y_bar)**2 }.sum end # sum of squared errors of prediction # not to be confused with "explained sum of squares" def sse - @sse ||= @y.zip(y_hat).map { |y, yh| (y - yh)**2 }.sum + @sse ||= @train_set.label.zip(y_hat).map { |y, yh| (y - yh)**2 }.sum end def mst - @mst ||= sst / (@y.size - 1) + @mst ||= sst / (@train_set.size - 1) end def mse @mse ||= sse / degrees_of_freedom end def degrees_of_freedom - @y.size - @coefficients.size + @train_set.size - @coefficients.size end def mean(arr) arr.sum / arr.size.to_f - end - - ### Extracted from https://github.com/estebanz01/ruby-statistics - ### The Ruby author is Esteban Zapata Rojas - ### - ### Originally extracted from https://codeplea.com/incomplete-beta-function-c - ### This function is shared under zlib license and the author is Lewis Van Winkle - def tdist_p(value, degrees_of_freedom) - upper = (value + Math.sqrt(value * value + degrees_of_freedom)) - lower = (2.0 * Math.sqrt(value * value + degrees_of_freedom)) - - x = upper/lower - - alpha = degrees_of_freedom/2.0 - beta = degrees_of_freedom/2.0 - - incomplete_beta_function(x, alpha, beta) - end - - ### Extracted from https://github.com/estebanz01/ruby-statistics - ### The Ruby author is Esteban Zapata Rojas - ### - ### This implementation is an adaptation of the incomplete beta function made in C by - ### Lewis Van Winkle, which released the code under the zlib license. - ### The whole math behind this code is described in the following post: https://codeplea.com/incomplete-beta-function-c - def incomplete_beta_function(x, alp, bet) - return if x < 0.0 - return 1.0 if x > 1.0 - - tiny = 1.0E-50 - - if x > ((alp + 1.0)/(alp + bet + 2.0)) - return 1.0 - incomplete_beta_function(1.0 - x, bet, alp) - end - - # To avoid overflow problems, the implementation applies the logarithm properties - # to calculate in a faster and safer way the values. - lbet_ab = (Math.lgamma(alp)[0] + Math.lgamma(bet)[0] - Math.lgamma(alp + bet)[0]).freeze - front = (Math.exp(Math.log(x) * alp + Math.log(1.0 - x) * bet - lbet_ab) / alp.to_f).freeze - - # This is the non-log version of the left part of the formula (before the continuous fraction) - # down_left = alp * self.beta_function(alp, bet) - # upper_left = (x ** alp) * ((1.0 - x) ** bet) - # front = upper_left/down_left - - f, c, d = 1.0, 1.0, 0.0 - - returned_value = nil - - # Let's do more iterations than the proposed implementation (200 iters) - (0..500).each do |number| - m = number/2 - - numerator = if number == 0 - 1.0 - elsif number % 2 == 0 - (m * (bet - m) * x)/((alp + 2.0 * m - 1.0)* (alp + 2.0 * m)) - else - top = -((alp + m) * (alp + bet + m) * x) - down = ((alp + 2.0 * m) * (alp + 2.0 * m + 1.0)) - - top/down - end - - d = 1.0 + numerator * d - d = tiny if d.abs < tiny - d = 1.0 / d - - c = 1.0 + numerator / c - c = tiny if c.abs < tiny - - cd = (c*d).freeze - f = f * cd - - if (1.0 - cd).abs < 1.0E-10 - returned_value = front * (f - 1.0) - break - end - end - - returned_value - end - - def prep_x(x, train: true) - coefficients = @coefficients - - if daru?(x) - x = x.to_a[0] - else - x = x.map do |xi| - case xi - when Hash - xi - when Array - Hash[xi.map.with_index { |v, i| [:"x#{i}", v] }] - else - {x0: xi} - end - end - end - - # get column types - if train - column_types = {} - if x.any? - row = x.first - row.each do |k, v| - column_types[k] = categorical?(v) ? "categorical" : "numeric" - end - end - else - # get column types for prediction - column_types = {} - coefficients.each do |k, v| - next if k == :_intercept - if k.is_a?(Array) - column_types[k.first] = "categorical" - else - column_types[k] = "numeric" - end - end - end - - # if !train && x.any? - # # check first row against coefficients - # ckeys = coefficients.keys.map(&:to_s) - # bad_keys = x[0].keys.map(&:to_s).reject { |k| ckeys.any? { |c| c.start_with?(k) } } - # raise "Unknown keys: #{bad_keys.join(", ")}" if bad_keys.any? - # end - - supports_categorical = train || coefficients.any? { |k, _| k.is_a?(Array) } - - cache = {} - first_key = {} - i = 0 - rows = [] - x.each do |xi| - row = {} - xi.each do |k, v| - categorical = column_types[k.to_sym] == "categorical" || (!supports_categorical && categorical?(v)) - - key = categorical ? [k.to_sym, v.to_s] : k.to_sym - v2 = categorical ? 1 : v - - # TODO make more efficient - check_key = supports_categorical ? key : symbolize_coef(key) - next if !train && !coefficients.key?(check_key) - - raise "Missing data" if v2.nil? - - unless cache[key] - cache[key] = i - first_key[k] ||= key if categorical - i += 1 - end - - row[key] = v2 - end - rows << row - end - - if train - # remove one degree of freedom - first_key.values.each do |v| - num = cache.delete(v) - cache.each do |k, v2| - cache[k] -= 1 if v2 > num - end - end - end - - ret2 = [] - rows.each do |row| - ret = [0] * cache.size - row.each do |k, v| - if cache[k] - ret[cache[k]] = v - end - end - ret2 << ([1] + ret) - end - - # flatten keys - c = [:_intercept] + cache.sort_by { |_, v| v }.map(&:first) - - unless supports_categorical - c = c.map { |v| symbolize_coef(v) } - end - - [ret2, c] - end - - def symbolize_coef(k) - (k.is_a?(Array) ? k.join("") : k).to_sym - end - - def matrix_arr(matrix) - matrix.to_a.map { |xi| xi[0].to_f } end end end