lib/eps/linear_regression.rb in eps-0.3.0 vs lib/eps/linear_regression.rb in eps-0.3.1
- old
+ new
@@ -1,42 +1,7 @@
module Eps
class LinearRegression < BaseEstimator
- # pmml
-
- def self.load_pmml(data)
- super do |data|
- # TODO more validation
- node = data.css("RegressionTable")
-
- coefficients = {
- "_intercept" => node.attribute("intercept").value.to_f
- }
-
- features = {}
-
- text_features, derived_fields = extract_text_features(data, features)
-
- node.css("NumericPredictor").each do |n|
- name = n.attribute("name").value
- if derived_fields[name]
- name = derived_fields[name]
- else
- features[name] = "numeric"
- end
- coefficients[name] = n.attribute("coefficient").value.to_f
- end
-
- node.css("CategoricalPredictor").each do |n|
- name = n.attribute("name").value
- coefficients[[name, n.attribute("value").value]] = n.attribute("coefficient").value.to_f
- features[name] = "categorical"
- end
-
- Evaluators::LinearRegression.new(coefficients: coefficients, features: features, text_features: text_features)
- end
- end
-
# Returns the coefficients reported by the underlying evaluator (@evaluator).
# NOTE(review): presumably a Hash of coefficient name => value, as built
# during training — confirm against Evaluators::LinearRegression.
def coefficients
@evaluator.coefficients
end
def r2
@@ -82,37 +47,50 @@
if data.size < data.columns.size + 2
raise "Number of data points must be at least two more than number of features"
end
x = data.map_rows(&:to_a)
- data.size.times do |i|
- # add intercept
- x[i].unshift(1)
+
+ intercept = @options.key?(:intercept) ? @options[:intercept] : true
+ if intercept
+ data.size.times do |i|
+ x[i].unshift(1)
+ end
end
gsl = options.key?(:gsl) ? options[:gsl] : defined?(GSL)
v3 =
if gsl
x = GSL::Matrix.alloc(*x)
y = GSL::Vector.alloc(data.label)
- c, @covariance, _, _ = GSL::MultiFit::linear(x, y)
+ w = GSL::Vector.alloc(data.weight) if data.weight
+ c, @covariance, _, _ = w ? GSL::MultiFit.wlinear(x, w, y) : GSL::MultiFit.linear(x, y)
c.to_a
else
x = Matrix.rows(x)
y = Matrix.column_vector(data.label)
+
+ # weighted OLS
+ # http://www.real-statistics.com/multiple-regression/weighted-linear-regression/weighted-regression-basics/
+ w = Matrix.diagonal(*data.weight) if data.weight
+
removed = []
# https://statsmaths.github.io/stat612/lectures/lec13/lecture13.pdf
- # unforutnately, this method is unstable
+ # unfortunately, this method is unstable
# haven't found an efficient way to do QR-factorization in Ruby
# the extendmatrix gem has householder and givens (givens has bug)
# but methods are too slow
xt = x.t
+ xt *= w if w
begin
@xtxi = (xt * x).inverse
rescue ExceptionForMatrix::ErrNotRegular
+ # matrix cannot be inverted
+ # https://en.wikipedia.org/wiki/Multicollinearity
+
constant = {}
(1...x.column_count).each do |i|
constant[i] = constant?(x.column(i))
end
@@ -132,20 +110,22 @@
# @coefficient_names.delete_at(i)
vectors.delete_at(i)
end
x = Matrix.columns(vectors)
xt = x.t
+ xt *= w if w
# try again
begin
@xtxi = (xt * x).inverse
rescue ExceptionForMatrix::ErrNotRegular
raise "Multiple solutions - GSL is needed to select one"
end
end
# huge performance boost
# by multiplying xt * y first
+ # for weighted, w is already included in xt
v2 = @xtxi * (xt * y)
# convert to array
v2 = v2.to_a.map { |xi| xi[0].to_f }
@@ -156,50 +136,17 @@
@removed = removed
v2
end
- @coefficient_names = ["_intercept"] + data.columns.keys
- @coefficients = Hash[@coefficient_names.zip(v3)]
- Evaluators::LinearRegression.new(coefficients: @coefficients, features: @features, text_features: @text_features)
- end
-
- def generate_pmml
- predictors = @coefficients.dup
- predictors.delete("_intercept")
-
- data_fields = {}
- @features.each do |k, type|
- if type == "categorical"
- data_fields[k] = predictors.keys.select { |k, v| k.is_a?(Array) && k.first == k }.map(&:last)
- else
- data_fields[k] = nil
- end
+ if @xtxi && @xtxi.each(:diagonal).any? { |v| v < 0 }
+ raise UnstableSolution, "GSL is needed to find a stable solution for this dataset"
end
- build_pmml(data_fields) do |xml|
- xml.RegressionModel(functionName: "regression") do
- xml.MiningSchema do
- @features.each do |k, _|
- xml.MiningField(name: k)
- end
- end
- pmml_local_transformations(xml)
- xml.RegressionTable(intercept: @coefficients["_intercept"]) do
- predictors.each do |k, v|
- if k.is_a?(Array)
- if @features[k.first] == "text"
- xml.NumericPredictor(name: display_field(k), coefficient: v)
- else
- xml.CategoricalPredictor(name: k[0], value: k[1], coefficient: v)
- end
- else
- xml.NumericPredictor(name: k, coefficient: v)
- end
- end
- end
- end
- end
+ @coefficient_names = data.columns.keys
+ @coefficient_names.unshift("_intercept") if intercept
+ @coefficients = Hash[@coefficient_names.zip(v3)]
+ Evaluators::LinearRegression.new(coefficients: @coefficients, features: @features, text_features: @text_features)
end
def prep_x(x)
x = x.dup
@features.each do |k, type|