linear_regression.rb in eps-0.3.1

- old
+ new

@@ -1,42 +1,7 @@
 module Eps
   class LinearRegression < BaseEstimator
-    # pmml
-
-    def self.load_pmml(data)
-      super do |data|
-        # TODO more validation
-        node = data.css("RegressionTable")
-
-        coefficients = {
-          "_intercept" => node.attribute("intercept").value.to_f
-        }
-
-        features = {}
-
-        text_features, derived_fields = extract_text_features(data, features)
-
-        node.css("NumericPredictor").each do |n|
-          name = n.attribute("name").value
-          if derived_fields[name]
-            name = derived_fields[name]
-          else
-            features[name] = "numeric"
-          end
-          coefficients[name] = n.attribute("coefficient").value.to_f
-        end
-
-        node.css("CategoricalPredictor").each do |n|
-          name = n.attribute("name").value
-          coefficients[[name, n.attribute("value").value]] = n.attribute("coefficient").value.to_f
-          features[name] = "categorical"
-        end
-
-        Evaluators::LinearRegression.new(coefficients: coefficients, features: features, text_features: text_features)
-      end
-    end
-
     def coefficients
       @evaluator.coefficients
     end
 
     def r2
@@ -82,37 +47,50 @@
       if data.size < data.columns.size + 2
         raise "Number of data points must be at least two more than number of features"
       end
 
       x = data.map_rows(&:to_a)
-      data.size.times do |i|
-        # add intercept
-        x[i].unshift(1)
+
+      intercept = @options.key?(:intercept) ? @options[:intercept] : true
+      if intercept
+        data.size.times do |i|
+          x[i].unshift(1)
+        end
       end
 
       gsl = options.key?(:gsl) ? options[:gsl] : defined?(GSL)
 
       v3 =
         if gsl
           x = GSL::Matrix.alloc(*x)
           y = GSL::Vector.alloc(data.label)
-          c, @covariance, _, _ = GSL::MultiFit::linear(x, y)
+          w = GSL::Vector.alloc(data.weight) if data.weight
+          c, @covariance, _, _ = w ? GSL::MultiFit.wlinear(x, w, y) : GSL::MultiFit.linear(x, y)
           c.to_a
         else
           x = Matrix.rows(x)
           y = Matrix.column_vector(data.label)
+
+          # weighted OLS
+          # http://www.real-statistics.com/multiple-regression/weighted-linear-regression/weighted-regression-basics/
+          w = Matrix.diagonal(*data.weight) if data.weight
+
           removed = []
 
           # https://statsmaths.github.io/stat612/lectures/lec13/lecture13.pdf
-          # unforutnately, this method is unstable
+          # unfortunately, this method is unstable
           # haven't found an efficient way to do QR-factorization in Ruby
           # the extendmatrix gem has householder and givens (givens has bug)
           # but methods are too slow
           xt = x.t
+          xt *= w if w
           begin
             @xtxi = (xt * x).inverse
           rescue ExceptionForMatrix::ErrNotRegular
+            # matrix cannot be inverted
+            # https://en.wikipedia.org/wiki/Multicollinearity
+
             constant = {}
             (1...x.column_count).each do |i|
               constant[i] = constant?(x.column(i))
             end
 
@@ -132,20 +110,22 @@
               # @coefficient_names.delete_at(i)
               vectors.delete_at(i)
             end
             x = Matrix.columns(vectors)
             xt = x.t
+            xt *= w if w
 
             # try again
             begin
               @xtxi = (xt * x).inverse
             rescue ExceptionForMatrix::ErrNotRegular
               raise "Multiple solutions - GSL is needed to select one"
             end
           end
           # huge performance boost
           # by multiplying xt * y first
+          # for weighted, w is already included in xt
           v2 = @xtxi * (xt * y)
 
           # convert to array
           v2 = v2.to_a.map { |xi| xi[0].to_f }
 
@@ -156,50 +136,17 @@
           @removed = removed
 
           v2
         end
 
-      @coefficient_names = ["_intercept"] + data.columns.keys
-      @coefficients = Hash[@coefficient_names.zip(v3)]
-      Evaluators::LinearRegression.new(coefficients: @coefficients, features: @features, text_features: @text_features)
-    end
-
-    def generate_pmml
-      predictors = @coefficients.dup
-      predictors.delete("_intercept")
-
-      data_fields = {}
-      @features.each do |k, type|
-        if type == "categorical"
-          data_fields[k] = predictors.keys.select { |k, v| k.is_a?(Array) && k.first == k }.map(&:last)
-        else
-          data_fields[k] = nil
-        end
+      if @xtxi && @xtxi.each(:diagonal).any? { |v| v < 0 }
+        raise UnstableSolution, "GSL is needed to find a stable solution for this dataset"
       end
 
-      build_pmml(data_fields) do |xml|
-        xml.RegressionModel(functionName: "regression") do
-          xml.MiningSchema do
-            @features.each do |k, _|
-              xml.MiningField(name: k)
-            end
-          end
-          pmml_local_transformations(xml)
-          xml.RegressionTable(intercept: @coefficients["_intercept"]) do
-            predictors.each do |k, v|
-              if k.is_a?(Array)
-                if @features[k.first] == "text"
-                  xml.NumericPredictor(name: display_field(k), coefficient: v)
-                else
-                  xml.CategoricalPredictor(name: k[0], value: k[1], coefficient: v)
-                end
-              else
-                xml.NumericPredictor(name: k, coefficient: v)
-              end
-            end
-          end
-        end
-      end
+      @coefficient_names = data.columns.keys
+      @coefficient_names.unshift("_intercept") if intercept
+      @coefficients = Hash[@coefficient_names.zip(v3)]
+      Evaluators::LinearRegression.new(coefficients: @coefficients, features: @features, text_features: @text_features)
     end
 
     def prep_x(x)
       x = x.dup
       @features.each do |k, type|