lib/statsample/regression.rb in statsample-0.3.0 vs lib/statsample/regression.rb in statsample-0.3.1
- old
+ new
@@ -1,522 +1,10 @@
+require 'statsample/regression/simple'
+require 'statsample/regression/multiple'
+require 'statsample/regression/multiple/alglibengine'
+require 'statsample/regression/multiple/rubyengine'
+
module Statsample
- # module for regression methods
+ # Module for regression procedures
module Regression
- # Class for calculation of linear regressions
- # To create a SimpleRegression object:
- # * <tt> SimpleRegression.new_from_vectors(vx,vy)</tt>
- # * <tt> SimpleRegression.new_from_gsl(gsl) </tt>
- #
- class SimpleRegression
- attr_accessor :a,:b,:cov00, :cov01, :covx1, :chisq, :status
- private_class_method :new
- def initialize(init_method, *argv)
- self.send(init_method, *argv)
- end
- def y(val_x)
- @a+@b*val_x
- end
- def x(val_y)
- (val_y-@a) / @b.to_f
- end
- # Sum of square error
- def sse
- (0...@vx.size).inject(0) {|acum,i|
- acum+((@vy[i]-y(@vx[i]))**2)
- }
- end
- def standard_error
- Math::sqrt(sse / (@vx.size-2).to_f)
- end
- # Sum of square regression
- def ssr
- vy_mean=@vy.mean
- (0...@vx.size).inject(0) {|a,i|
- a+((y(@vx[i])-vy_mean)**2)
- }
-
- end
- # Sum of square total
- def sst
- @vy.sum_of_squared_deviation
- end
- # Value of r
- def r
- @b * (@vx.sds / @vy.sds)
- end
- # Value of r^2
- def r2
- r**2
- end
- class << self
- def new_from_gsl(ar)
- new(:init_gsl, *ar)
- end
- def new_from_vectors(vx,vy)
- new(:init_vectors,vx,vy)
- end
- end
- def init_vectors(vx,vy)
- @vx,@vy=Statsample.only_valid(vx,vy)
- x_m=@vx.mean
- y_m=@vy.mean
- num=den=0
- (0...@vx.size).each {|i|
- num+=(@vx[i]-x_m)*(@vy[i]-y_m)
- den+=(@vx[i]-x_m)**2
- }
- @b=num.to_f/den
- @a=y_m - @b*x_m
- end
- def init_gsl(a,b,cov00, cov01, covx1, chisq, status)
- @a=a
- @b=b
- @cov00=cov00
- @cov01=cov01
- @covx1=covx1
- @chisq=chisq
- @status=status
- end
- end
-
-
- class MultipleRegressionBase
- def initialize(ds,y_var)
- @ds=ds
- @y_var=y_var
- @r2=nil
-
- end
- def assign_names(c)
- a={}
- @fields.each_index {|i|
- a[@fields[i]]=c[i]
- }
- a
- end
- def predicted
- (0...@ds.cases).collect { |i|
- invalid=false
- vect=@dep_columns.collect {|v| invalid=true if v[i].nil?; v[i]}
- if invalid
- nil
- else
- process(vect)
- end
- }.to_vector(:scale)
- end
- def standarized_predicted
- predicted.standarized
- end
- def residuals
- (0...@ds.cases).collect{|i|
- invalid=false
- vect=@dep_columns.collect{|v| invalid=true if v[i].nil?; v[i]}
- if invalid or @ds[@y_var][i].nil?
- nil
- else
- @ds[@y_var][i] - process(vect)
- end
- }.to_vector(:scale)
- end
- def r
- raise "You should implement this"
- end
- def sst
- raise "You should implement this"
- end
- def ssr
- r2*sst
- end
- def sse
- sst - ssr
- end
-
- def coeffs_t
- out={}
- se=coeffs_se
- coeffs.each{|k,v|
- out[k]=v / se[k]
- }
- out
- end
-
- def mse
- sse/df_e
- end
-
- def df_r
- @dep_columns.size
- end
- def df_e
- @ds_valid.cases-@dep_columns.size-1
- end
- def f
- (ssr.quo(df_r)).quo(sse.quo(df_e))
- end
- # Significance of Fisher
- def significance
- if HAS_GSL
- GSL::Cdf.fdist_Q(f,df_r,df_e)
- else
- raise "Need Ruby/GSL"
- end
- end
- # Tolerance for a given variable
- # http://talkstats.com/showthread.php?t=5056
- def tolerance(var)
- ds=assign_names(@dep_columns)
- ds.each{|k,v|
- ds[k]=v.to_vector(:scale)
- }
- if HAS_ALGIB
- lr_class=::Statsample::Regression::MultipleRegressionAlglib
- ds=ds.to_dataset
- else
- lr_class=MultipleRegressionPairwise
- ds=ds.to_dataset.dup_only_valid
- end
- lr=lr_class.new(ds,var)
- 1-lr.r2
- end
- def coeffs_tolerances
- @fields.inject({}) {|a,f|
- a[f]=tolerance(f);
- a
- }
- end
- def coeffs_se
- out={}
- mse=sse.quo(df_e)
- coeffs.each {|k,v|
- out[k]=Math::sqrt(mse/(@ds[k].sum_of_squares*tolerance(k)))
- }
- out
- end
- def estimated_variance_covariance_matrix
- mse_p=mse
- columns=[]
- @ds_valid.each_vector{|k,v|
- columns.push(v.data) unless k==@y_var
- }
- columns.unshift([1.0]*@ds_valid.cases)
- x=Matrix.columns(columns)
- matrix=((x.t*x)).inverse * mse
- matrix.collect {|i|
-
- Math::sqrt(i) if i>0
- }
- end
- def constant_t
- constant.to_f/constant_se
- end
- def constant_se
- estimated_variance_covariance_matrix[0,0]
- end
- def summary(report_type=ConsoleSummary)
- c=coeffs
- out=""
- out.extend report_type
- out.add <<HEREDOC
-Summary for regression of #{@fields.join(',')} over #{@y_var}"
-*************************************************************
-Cases(listwise)=#{@ds.cases}(#{@ds_valid.cases})
-r=#{sprintf("%0.3f",r)}
-r2=#{sprintf("%0.3f",r2)}
-ssr=#{sprintf("%0.3f",ssr)}
-sse=#{sprintf("%0.3f",sse)}
-sst=#{sprintf("%0.3f",sst)}
-F#{sprintf("(%d,%d)=%0.3f, p=%0.3f",df_r,df_e,f,significance)}
-Equation=#{sprintf("%0.3f",constant)}+#{@fields.collect {|k| sprintf("%0.3f%s",c[k],k)}.join(' + ')}
-
-HEREDOC
-
- end
-
-
- # Deprecated
- # Sum of squares of error (manual calculation)
- # using the predicted value minus the y_i value
- def sse_manual
- pr=predicted
- cases=0
- sse=(0...@ds.cases).inject(0) {|a,i|
- if !@dy.data_with_nils[i].nil? and !pr[i].nil?
- cases+=1
- a+((pr[i]-@dy[i])**2)
- else
- a
- end
- }
- sse*(min_n_valid-1.0).quo(cases-1)
- end
- # Sum of squares of regression
- # using the predicted value minus y mean
- def ssr_direct
- mean=@dy.mean
- cases=0
- ssr=(0...@ds.cases).inject(0) {|a,i|
- invalid=false
- v=@dep_columns.collect{|c| invalid=true if c[i].nil?; c[i]}
- if !invalid
- cases+=1
- a+((process(v)-mean)**2)
- else
- a
- end
- }
- ssr
- end
- def sse_direct
- sst-ssr
- end
-
-
- end
-
-
-
-
-
- if HAS_ALGIB
- # Class for calculation of multiple regression.
- # Requires Alglib gem.
- # To create a SimpleRegression object:
- # @a=[1,3,2,4,3,5,4,6,5,7].to_vector(:scale)
- # @b=[3,3,4,4,5,5,6,6,4,4].to_vector(:scale)
- # @c=[11,22,30,40,50,65,78,79,99,100].to_vector(:scale)
- # @y=[3,4,5,6,7,8,9,10,20,30].to_vector(:scale)
- # ds={'a'=>@a,'b'=>@b,'c'=>@c,'y'=>@y}.to_dataset
- # lr=Statsample::Regression::MultipleRegression.new(ds,'y')
- #
- class MultipleRegressionAlglib < MultipleRegressionBase
- def initialize(ds,y_var)
- @ds=ds.dup_only_valid
- @ds_valid=@ds
- @y_var=y_var
- @dy=@ds[@y_var]
- @ds_indep=ds.dup(ds.fields-[y_var])
- # Create a custom matrix
- columns=[]
- @fields=[]
- @ds.fields.each{|f|
- if f!=@y_var
- columns.push(@ds[f].to_a)
- @fields.push(f)
- end
- }
- @dep_columns=columns.dup
- columns.push(@ds[@y_var])
- matrix=Matrix.columns(columns)
- @lr_s=nil
- @lr=::Alglib::LinearRegression.build_from_matrix(matrix)
- end
-
- def _dump(i)
- Marshal.dump({'ds'=>@ds,'y_var'=>@y_var})
- end
- def self._load(data)
- h=Marshal.load(data)
- MultipleRegression.new(h['ds'], h['y_var'])
- end
-
- def coeffs
- assign_names(@lr.coeffs)
- end
- # Coefficients using a constant
- # Based on http://www.xycoon.com/ols1.htm
- def matrix_resolution
- mse_p=mse
- columns=@dep_columns.dup.map {|xi| xi.map{|i| i.to_f}}
- columns.unshift([1.0]*@ds.cases)
- y=Matrix.columns([@dy.data.map {|i| i.to_f}])
- x=Matrix.columns(columns)
- xt=x.t
- matrix=((xt*x)).inverse*xt
- matrix*y
- end
- def r2
- r**2
- end
- def r
- Bivariate::pearson(@dy,predicted)
- end
- def sst
- @dy.ss
- end
- def constant
- @lr.constant
- end
- def standarized_coeffs
- l=lr_s
- assign_names(l.coeffs)
- end
- def lr_s
- if @lr_s.nil?
- build_standarized
- end
- @lr_s
- end
- def build_standarized
- @ds_s=@ds.standarize
- columns=[]
- @ds_s.fields.each{|f|
- columns.push(@ds_s[f].to_a) unless f==@y_var
- }
- @dep_columns_s=columns.dup
- columns.push(@ds_s[@y_var])
- matrix=Matrix.columns(columns)
- @lr_s=Alglib::LinearRegression.build_from_matrix(matrix)
- end
- def process(v)
- @lr.process(v)
- end
- def process_s(v)
- lr_s.process(v)
- end
- # ???? Not equal to SPSS output
- def standarized_residuals
- res=residuals
- red_sd=residuals.sds
- res.collect {|v|
- v.quo(red_sd)
- }.to_vector(:scale)
- end
- end
- end
-
-
-
-
-
-
-
-
-
-
-
-
- class MultipleRegressionPairwise < MultipleRegressionBase
- def initialize(ds,y_var)
- super
- @dy=ds[@y_var]
- @ds_valid=ds.dup_only_valid
- @ds_indep=ds.dup(ds.fields-[y_var])
- @fields=@ds_indep.fields
- set_dep_columns
- obtain_y_vector
- @matrix_x = Bivariate.correlation_matrix(@ds_indep)
- @coeffs_stan=(@matrix_x.inverse * @matrix_y).column(0).to_a
- @min_n_valid=nil
- end
- def min_n_valid
- if @min_n_valid.nil?
- min=@ds.cases
- m=Bivariate::n_valid_matrix(@ds)
- for x in 0...m.row_size
- for y in 0...m.column_size
- min=m[x,y] if m[x,y] < min
- end
- end
- @min_n_valid=min
- end
- @min_n_valid
- end
- def set_dep_columns
- @dep_columns=[]
- @ds_indep.each_vector{|k,v|
- @dep_columns.push(v.data_with_nils)
- }
- end
- # Sum of square total
- def sst
- #if @sst.nil?
- @sst=@dy.variance*(min_n_valid-1.0)
- #end
- @sst
- end
- def r2
- if @r2.nil?
- c=@matrix_y
- rxx=obtain_predictor_matrix
- matrix=(c.t*rxx.inverse*c)
- @r2=matrix[0,0]
- end
- @r2
- end
- def r
- Math::sqrt(r2)
- end
-
- def df_e
- min_n_valid-@dep_columns.size-1
- end
- def fix_with_mean
- i=0
- @ds_indep.each{|row|
- empty=[]
- row.each{|k,v|
- empty.push(k) if v.nil?
- }
- if empty.size==1
- @ds_indep[empty[0]][i]=@ds[empty[0]].mean
- end
- i+=1
- }
- @ds_indep.update_valid_data
- set_dep_columns
- end
- def fix_with_regression
- i=0
- @ds_indep.each{|row|
- empty=[]
- row.each{|k,v|
- empty.push(k) if v.nil?
- }
- if empty.size==1
- field=empty[0]
- lr=MultipleRegression.new(@ds_indep,field)
- fields=[]
- @ds_indep.fields.each{|f|
- fields.push(row[f]) unless f==field
- }
- @ds_indep[field][i]=lr.process(fields)
- end
- i+=1
- }
- @ds_indep.update_valid_data
- set_dep_columns
- end
- def obtain_y_vector
- @matrix_y=Matrix.columns([@ds_indep.fields.collect{|f|
- Bivariate.pearson(@dy, @ds_indep[f])
- }])
- end
- def obtain_predictor_matrix
- Bivariate::correlation_matrix(@ds_indep)
- end
- def constant
- c=coeffs
- @dy.mean-@fields.inject(0){|a,k| a+(c[k] * @ds_indep[k].mean)}
- end
- def process(v)
- c=coeffs
- total=constant
- @fields.each_index{|i|
- total+=c[@fields[i]]*v[i]
- }
- total
- end
- def coeffs
- sc=standarized_coeffs
- assign_names(@fields.collect{|f|
- (sc[f]*@dy.sds).quo(@ds_indep[f].sds)
- })
- end
- def standarized_coeffs
- assign_names(@coeffs_stan)
- end
- end
-
-
end
end