lib/eps/linear_regression.rb in eps-0.3.0 vs lib/eps/linear_regression.rb in eps-0.3.1

- old
+ new

@@ -1,42 +1,7 @@ module Eps class LinearRegression < BaseEstimator - # pmml - - def self.load_pmml(data) - super do |data| - # TODO more validation - node = data.css("RegressionTable") - - coefficients = { - "_intercept" => node.attribute("intercept").value.to_f - } - - features = {} - - text_features, derived_fields = extract_text_features(data, features) - - node.css("NumericPredictor").each do |n| - name = n.attribute("name").value - if derived_fields[name] - name = derived_fields[name] - else - features[name] = "numeric" - end - coefficients[name] = n.attribute("coefficient").value.to_f - end - - node.css("CategoricalPredictor").each do |n| - name = n.attribute("name").value - coefficients[[name, n.attribute("value").value]] = n.attribute("coefficient").value.to_f - features[name] = "categorical" - end - - Evaluators::LinearRegression.new(coefficients: coefficients, features: features, text_features: text_features) - end - end - def coefficients @evaluator.coefficients end def r2 @@ -82,37 +47,50 @@ if data.size < data.columns.size + 2 raise "Number of data points must be at least two more than number of features" end x = data.map_rows(&:to_a) - data.size.times do |i| - # add intercept - x[i].unshift(1) + + intercept = @options.key?(:intercept) ? @options[:intercept] : true + if intercept + data.size.times do |i| + x[i].unshift(1) + end end gsl = options.key?(:gsl) ? options[:gsl] : defined?(GSL) v3 = if gsl x = GSL::Matrix.alloc(*x) y = GSL::Vector.alloc(data.label) - c, @covariance, _, _ = GSL::MultiFit::linear(x, y) + w = GSL::Vector.alloc(data.weight) if data.weight + c, @covariance, _, _ = w ? GSL::MultiFit.wlinear(x, w, y) : GSL::MultiFit.linear(x, y) c.to_a else x = Matrix.rows(x) y = Matrix.column_vector(data.label) + + # weighted OLS + # http://www.real-statistics.com/multiple-regression/weighted-linear-regression/weighted-regression-basics/ + w = Matrix.diagonal(*data.weight) if data.weight + removed = [] # https://statsmaths.github.io/stat612/lectures/lec13/lecture13.pdf - # unforutnately, this method is unstable + # unfortunately, this method is unstable # haven't found an efficient way to do QR-factorization in Ruby # the extendmatrix gem has householder and givens (givens has bug) # but methods are too slow xt = x.t + xt *= w if w begin @xtxi = (xt * x).inverse rescue ExceptionForMatrix::ErrNotRegular + # matrix cannot be inverted + # https://en.wikipedia.org/wiki/Multicollinearity + constant = {} (1...x.column_count).each do |i| constant[i] = constant?(x.column(i)) end @@ -132,20 +110,22 @@ # @coefficient_names.delete_at(i) vectors.delete_at(i) end x = Matrix.columns(vectors) xt = x.t + xt *= w if w # try again begin @xtxi = (xt * x).inverse rescue ExceptionForMatrix::ErrNotRegular raise "Multiple solutions - GSL is needed to select one" end end # huge performance boost # by multiplying xt * y first + # for weighted, w is already included in wt v2 = @xtxi * (xt * y) # convert to array v2 = v2.to_a.map { |xi| xi[0].to_f } @@ -156,50 +136,17 @@ @removed = removed v2 end - @coefficient_names = ["_intercept"] + data.columns.keys - @coefficients = Hash[@coefficient_names.zip(v3)] - Evaluators::LinearRegression.new(coefficients: @coefficients, features: @features, text_features: @text_features) - end - - def generate_pmml - predictors = @coefficients.dup - predictors.delete("_intercept") - - data_fields = {} - @features.each do |k, type| - if type == "categorical" - data_fields[k] = predictors.keys.select { |k, v| k.is_a?(Array) && k.first == k }.map(&:last) - else - data_fields[k] = nil - end + if @xtxi && @xtxi.each(:diagonal).any? { |v| v < 0 } + raise UnstableSolution, "GSL is needed to find a stable solution for this dataset" end - build_pmml(data_fields) do |xml| - xml.RegressionModel(functionName: "regression") do - xml.MiningSchema do - @features.each do |k, _| - xml.MiningField(name: k) - end - end - pmml_local_transformations(xml) - xml.RegressionTable(intercept: @coefficients["_intercept"]) do - predictors.each do |k, v| - if k.is_a?(Array) - if @features[k.first] == "text" - xml.NumericPredictor(name: display_field(k), coefficient: v) - else - xml.CategoricalPredictor(name: k[0], value: k[1], coefficient: v) - end - else - xml.NumericPredictor(name: k, coefficient: v) - end - end - end - end - end + @coefficient_names = data.columns.keys + @coefficient_names.unshift("_intercept") if intercept + @coefficients = Hash[@coefficient_names.zip(v3)] + Evaluators::LinearRegression.new(coefficients: @coefficients, features: @features, text_features: @text_features) end def prep_x(x) x = x.dup @features.each do |k, type|