module Statsample # module for regression methods module Regression # Class for calculation of linear regressions # To create a SimpleRegression object: # * SimpleRegression.new_from_vectors(vx,vy) # * SimpleRegression.new_from_gsl(gsl) # class SimpleRegression attr_accessor :a,:b,:cov00, :cov01, :covx1, :chisq, :status private_class_method :new def initialize(init_method, *argv) self.send(init_method, *argv) end def y(val_x) @a+@b*val_x end def x(val_y) (val_y-@a) / @b.to_f end # Sum of square error def sse (0...@vx.size).inject(0) {|acum,i| acum+((@vy[i]-y(@vx[i]))**2) } end def standard_error Math::sqrt(sse / (@vx.size-2).to_f) end # Sum of square regression def ssr vy_mean=@vy.mean (0...@vx.size).inject(0) {|a,i| a+((y(@vx[i])-vy_mean)**2) } end # Sum of square total def sst @vy.sum_of_squared_deviation end # Value of r def r @b * (@vx.sds / @vy.sds) end # Value of r^2 def r2 r**2 end class << self def new_from_gsl(ar) new(:init_gsl, *ar) end def new_from_vectors(vx,vy) new(:init_vectors,vx,vy) end end def init_vectors(vx,vy) @vx,@vy=Statsample.only_valid(vx,vy) x_m=@vx.mean y_m=@vy.mean num=den=0 (0...@vx.size).each {|i| num+=(@vx[i]-x_m)*(@vy[i]-y_m) den+=(@vx[i]-x_m)**2 } @b=num.to_f/den @a=y_m - @b*x_m end def init_gsl(a,b,cov00, cov01, covx1, chisq, status) @a=a @b=b @cov00=cov00 @cov01=cov01 @covx1=covx1 @chisq=chisq @status=status end end class MultipleRegressionBase def initialize(ds,y_var) @ds=ds @y_var=y_var @r2=nil end def assign_names(c) a={} @fields.each_index {|i| a[@fields[i]]=c[i] } a end def predicted (0...@ds.cases).collect { |i| invalid=false vect=@dep_columns.collect {|v| invalid=true if v[i].nil?; v[i]} if invalid nil else process(vect) end }.to_vector(:scale) end def standarized_predicted predicted.standarized end def residuals (0...@ds.cases).collect{|i| invalid=false vect=@dep_columns.collect{|v| invalid=true if v[i].nil?; v[i]} if invalid or @ds[@y_var][i].nil? nil else @ds[@y_var][i] - process(vect) end }.to_vector(:scale) end def r raise "You should implement this" end def sst raise "You should implement this" end def ssr r2*sst end def sse sst - ssr end def coeffs_t out={} se=coeffs_se coeffs.each{|k,v| out[k]=v / se[k] } out end def mse sse/df_e end def df_r @dep_columns.size end def df_e @ds_valid.cases-@dep_columns.size-1 end def f (ssr.quo(df_r)).quo(sse.quo(df_e)) end # Significance of Fisher def significance if HAS_GSL GSL::Cdf.fdist_Q(f,df_r,df_e) else raise "Need Ruby/GSL" end end # Tolerance for a given variable # http://talkstats.com/showthread.php?t=5056 def tolerance(var) ds=assign_names(@dep_columns) ds.each{|k,v| ds[k]=v.to_vector(:scale) } if HAS_ALGIB lr_class=::Statsample::Regression::MultipleRegressionAlglib ds=ds.to_dataset else lr_class=MultipleRegressionPairwise ds=ds.to_dataset.dup_only_valid end lr=lr_class.new(ds,var) 1-lr.r2 end def coeffs_tolerances @fields.inject({}) {|a,f| a[f]=tolerance(f); a } end def coeffs_se out={} mse=sse.quo(df_e) coeffs.each {|k,v| out[k]=Math::sqrt(mse/(@ds[k].sum_of_squares*tolerance(k))) } out end def estimated_variance_covariance_matrix mse_p=mse columns=[] @ds_valid.each_vector{|k,v| columns.push(v.data) unless k==@y_var } columns.unshift([1.0]*@ds_valid.cases) x=Matrix.columns(columns) matrix=((x.t*x)).inverse * mse matrix.collect {|i| Math::sqrt(i) if i>0 } end def constant_t constant.to_f/constant_se end def constant_se estimated_variance_covariance_matrix[0,0] end def summary(report_type=ConsoleSummary) c=coeffs out="" out.extend report_type out.add <@a,'b'=>@b,'c'=>@c,'y'=>@y}.to_dataset # lr=Statsample::Regression::MultipleRegression.new(ds,'y') # class MultipleRegressionAlglib < MultipleRegressionBase def initialize(ds,y_var) @ds=ds.dup_only_valid @ds_valid=@ds @y_var=y_var @dy=@ds[@y_var] @ds_indep=ds.dup(ds.fields-[y_var]) # Create a custom matrix columns=[] @fields=[] @ds.fields.each{|f| if f!=@y_var columns.push(@ds[f].to_a) @fields.push(f) end } @dep_columns=columns.dup columns.push(@ds[@y_var]) matrix=Matrix.columns(columns) @lr_s=nil @lr=::Alglib::LinearRegression.build_from_matrix(matrix) end def _dump(i) Marshal.dump({'ds'=>@ds,'y_var'=>@y_var}) end def self._load(data) h=Marshal.load(data) MultipleRegression.new(h['ds'], h['y_var']) end def coeffs assign_names(@lr.coeffs) end # Coefficients using a constant # Based on http://www.xycoon.com/ols1.htm def matrix_resolution mse_p=mse columns=@dep_columns.dup.map {|xi| xi.map{|i| i.to_f}} columns.unshift([1.0]*@ds.cases) y=Matrix.columns([@dy.data.map {|i| i.to_f}]) x=Matrix.columns(columns) xt=x.t matrix=((xt*x)).inverse*xt matrix*y end def r2 r**2 end def r Bivariate::pearson(@dy,predicted) end def sst @dy.ss end def constant @lr.constant end def standarized_coeffs l=lr_s assign_names(l.coeffs) end def lr_s if @lr_s.nil? build_standarized end @lr_s end def build_standarized @ds_s=@ds.standarize columns=[] @ds_s.fields.each{|f| columns.push(@ds_s[f].to_a) unless f==@y_var } @dep_columns_s=columns.dup columns.push(@ds_s[@y_var]) matrix=Matrix.columns(columns) @lr_s=Alglib::LinearRegression.build_from_matrix(matrix) end def process(v) @lr.process(v) end def process_s(v) lr_s.process(v) end # ???? Not equal to SPSS output def standarized_residuals res=residuals red_sd=residuals.sds res.collect {|v| v.quo(red_sd) }.to_vector(:scale) end end end class MultipleRegressionPairwise < MultipleRegressionBase def initialize(ds,y_var) super @dy=ds[@y_var] @ds_valid=ds.dup_only_valid @ds_indep=ds.dup(ds.fields-[y_var]) @fields=@ds_indep.fields set_dep_columns obtain_y_vector @matrix_x = Bivariate.correlation_matrix(@ds_indep) @coeffs_stan=(@matrix_x.inverse * @matrix_y).column(0).to_a @min_n_valid=nil end def min_n_valid if @min_n_valid.nil? min=@ds.cases m=Bivariate::n_valid_matrix(@ds) for x in 0...m.row_size for y in 0...m.column_size min=m[x,y] if m[x,y] < min end end @min_n_valid=min end @min_n_valid end def set_dep_columns @dep_columns=[] @ds_indep.each_vector{|k,v| @dep_columns.push(v.data_with_nils) } end # Sum of square total def sst #if @sst.nil? @sst=@dy.variance*(min_n_valid-1.0) #end @sst end def r2 if @r2.nil? c=@matrix_y rxx=obtain_predictor_matrix matrix=(c.t*rxx.inverse*c) @r2=matrix[0,0] end @r2 end def r Math::sqrt(r2) end def df_e min_n_valid-@dep_columns.size-1 end def fix_with_mean i=0 @ds_indep.each{|row| empty=[] row.each{|k,v| empty.push(k) if v.nil? } if empty.size==1 @ds_indep[empty[0]][i]=@ds[empty[0]].mean end i+=1 } @ds_indep.update_valid_data set_dep_columns end def fix_with_regression i=0 @ds_indep.each{|row| empty=[] row.each{|k,v| empty.push(k) if v.nil? } if empty.size==1 field=empty[0] lr=MultipleRegression.new(@ds_indep,field) fields=[] @ds_indep.fields.each{|f| fields.push(row[f]) unless f==field } @ds_indep[field][i]=lr.process(fields) end i+=1 } @ds_indep.update_valid_data set_dep_columns end def obtain_y_vector @matrix_y=Matrix.columns([@ds_indep.fields.collect{|f| Bivariate.pearson(@dy, @ds_indep[f]) }]) end def obtain_predictor_matrix Bivariate::correlation_matrix(@ds_indep) end def constant c=coeffs @dy.mean-@fields.inject(0){|a,k| a+(c[k] * @ds_indep[k].mean)} end def process(v) c=coeffs total=constant @fields.each_index{|i| total+=c[@fields[i]]*v[i] } total end def coeffs sc=standarized_coeffs assign_names(@fields.collect{|f| (sc[f]*@dy.sds).quo(@ds_indep[f].sds) }) end def standarized_coeffs assign_names(@coeffs_stan) end end end end