require 'statsample/bivariate/pearson' module Statsample # Diverse methods and classes to calculate bivariate relations # Specific classes: # * Statsample::Bivariate::Pearson : Pearson correlation coefficient (r) # * Statsample::Bivariate::Tetrachoric : Tetrachoric correlation # * Statsample::Bivariate::Polychoric : Polychoric correlation (using joint, two-step and polychoric series) module Bivariate autoload(:Polychoric, 'statsample/bivariate/polychoric') autoload(:Tetrachoric, 'statsample/bivariate/tetrachoric') class << self # Covariance between two vectors def covariance(v1,v2) v1a,v2a=Statsample.only_valid_clone(v1,v2) return nil if v1a.size==0 if Statsample.has_gsl? GSL::Stats::covariance(v1a.to_gsl, v2a.to_gsl) else covariance_slow(v1a,v2a) end end # Estimate the ML between two dichotomic vectors def maximum_likehood_dichotomic(pred,real) preda,reala=Statsample.only_valid_clone(pred,real) sum=0 preda.each_index{|i| sum+=(reala[i]*Math::log(preda[i])) + ((1-reala[i])*Math::log(1-preda[i])) } sum end def covariance_slow(v1,v2) # :nodoc: v1a,v2a=Statsample.only_valid(v1,v2) sum_of_squares(v1a,v2a) / (v1a.size-1) end def sum_of_squares(v1,v2) v1a,v2a=Statsample.only_valid_clone(v1,v2) v1a.reset_index! v2a.reset_index! m1=v1a.mean m2=v2a.mean (v1a.size).times.inject(0) {|ac,i| ac+(v1a[i]-m1)*(v2a[i]-m2)} end # Calculate Pearson correlation coefficient (r) between 2 vectors def pearson(v1,v2) v1a,v2a=Statsample.only_valid_clone(v1,v2) return nil if v1a.size ==0 if Statsample.has_gsl? GSL::Stats::correlation(v1a.to_gsl, v2a.to_gsl) else pearson_slow(v1a,v2a) end end def pearson_slow(v1,v2) # :nodoc: v1a,v2a=Statsample.only_valid_clone(v1,v2) # Calculate sum of squares ss=sum_of_squares(v1a,v2a) ss.quo(Math::sqrt(v1a.sum_of_squares) * Math::sqrt(v2a.sum_of_squares)) end alias :correlation :pearson # Retrieves the value for t test for a pearson correlation # between two vectors to test the null hipothesis of r=0 def t_pearson(v1,v2) v1a,v2a=Statsample.only_valid_clone(v1,v2) r=pearson(v1a,v2a) if(r==1.0) 0 else t_r(r,v1a.size) end end # Retrieves the value for t test for a pearson correlation # giving r and vector size # Source : http://faculty.chass.ncsu.edu/garson/PA765/correl.htm def t_r(r,size) r * Math::sqrt(((size)-2).to_f / (1 - r**2)) end # Retrieves the probability value (a la SPSS) # for a given t, size and number of tails. # Uses a second parameter # * :both or 2 : for r!=0 (default) # * :right, :positive or 1 : for r > 0 # * :left, :negative : for r < 0 def prop_pearson(t, size, tails=:both) tails=:both if tails==2 tails=:right if tails==1 or tails==:positive tails=:left if tails==:negative n_tails=case tails when :both then 2 else 1 end t=-t if t>0 and (tails==:both) cdf=Distribution::T.cdf(t, size-2) if(tails==:right) 1.0-(cdf*n_tails) else cdf*n_tails end end # Predicted time for pairwise correlation matrix, in miliseconds # See benchmarks/correlation_matrix.rb to see mode of calculation def prediction_pairwise(vars,cases) ((-0.518111-0.000746*cases+1.235608*vars+0.000740*cases*vars)**2) / 100 end # Predicted time for optimized correlation matrix, in miliseconds # See benchmarks/correlation_matrix.rb to see mode of calculation def prediction_optimized(vars,cases) ((4+0.018128*cases+0.246871*vars+0.001169*vars*cases)**2) / 100 end # Returns residual score after delete variance # from another variable # def residuals(from,del) r=Statsample::Bivariate.pearson(from,del) froms, dels = from.vector_standarized, del.vector_standarized nv=[] froms.reset_index! dels.reset_index! froms.each_index do |i| if froms[i].nil? or dels[i].nil? nv.push(nil) else nv.push(froms[i]-r*dels[i]) end end Daru::Vector.new(nv) end # Correlation between v1 and v2, controling the effect of # control on both. def partial_correlation(v1,v2,control) v1a,v2a,cona=Statsample.only_valid_clone(v1,v2,control) rv1v2=pearson(v1a,v2a) rv1con=pearson(v1a,cona) rv2con=pearson(v2a,cona) (rv1v2-(rv1con*rv2con)).quo(Math::sqrt(1-rv1con**2) * Math::sqrt(1-rv2con**2)) end def covariance_matrix_optimized(ds) x=ds.to_gsl n=x.row_size m=x.column_size means=((1/n.to_f)*GSL::Matrix.ones(1,n)*x).row(0) centered=x-(GSL::Matrix.ones(n,m)*GSL::Matrix.diag(means)) ss=centered.transpose*centered s=((1/(n-1).to_f))*ss s end # Covariance matrix. # Order of rows and columns depends on Dataset#fields order def covariance_matrix(ds) vars,cases = ds.ncols, ds.nrows if !ds.include_values?(*Daru::MISSING_VALUES) and Statsample.has_gsl? and prediction_optimized(vars,cases) < prediction_pairwise(vars,cases) cm=covariance_matrix_optimized(ds) else cm=covariance_matrix_pairwise(ds) end cm.extend(Statsample::CovariateMatrix) cm.fields = ds.vectors.to_a cm end def covariance_matrix_pairwise(ds) cache={} vectors = ds.vectors.to_a mat_rows = vectors.collect do |row| vectors.collect do |col| if (ds[row].type!=:numeric or ds[col].type!=:numeric) nil elsif row==col ds[row].variance else if cache[[col,row]].nil? cov=covariance(ds[row],ds[col]) cache[[row,col]]=cov cov else cache[[col,row]] end end end end Matrix.rows mat_rows end # Correlation matrix. # Order of rows and columns depends on Dataset#fields order def correlation_matrix(ds) vars, cases = ds.ncols, ds.nrows if !ds.include_values?(*Daru::MISSING_VALUES) and Statsample.has_gsl? and prediction_optimized(vars,cases) < prediction_pairwise(vars,cases) cm=correlation_matrix_optimized(ds) else cm=correlation_matrix_pairwise(ds) end cm.extend(Statsample::CovariateMatrix) cm.fields = ds.vectors.to_a cm end def correlation_matrix_optimized(ds) s=covariance_matrix_optimized(ds) sds=GSL::Matrix.diagonal(s.diagonal.sqrt.pow(-1)) cm=sds*s*sds # Fix diagonal s.row_size.times {|i| cm[i,i]=1.0 } cm end def correlation_matrix_pairwise(ds) cache={} vectors = ds.vectors.to_a cm = vectors.collect do |row| vectors.collect do |col| if row==col 1.0 elsif (ds[row].type!=:numeric or ds[col].type!=:numeric) nil else if cache[[col,row]].nil? r=pearson(ds[row],ds[col]) cache[[row,col]]=r r else cache[[col,row]] end end end end Matrix.rows cm end # Retrieves the n valid pairwise. def n_valid_matrix(ds) vectors = ds.vectors.to_a m = vectors.collect do |row| vectors.collect do |col| if row==col ds[row].reject_values(*Daru::MISSING_VALUES).size else rowa,rowb = Statsample.only_valid_clone(ds[row],ds[col]) rowa.size end end end Matrix.rows m end # Matrix of correlation probabilities. # Order of rows and columns depends on Dataset#fields order def correlation_probability_matrix(ds, tails=:both) rows=ds.fields.collect do |row| ds.fields.collect do |col| v1a,v2a=Statsample.only_valid_clone(ds[row],ds[col]) (row==col or ds[row].type!=:numeric or ds[col].type!=:numeric) ? nil : prop_pearson(t_pearson(ds[row],ds[col]), v1a.size, tails) end end Matrix.rows(rows) end # Spearman ranked correlation coefficient (rho) between 2 vectors def spearman(v1,v2) v1a,v2a = Statsample.only_valid_clone(v1,v2) v1r,v2r = v1a.ranked, v2a.ranked pearson(v1r,v2r) end # Calculate Point biserial correlation. Equal to Pearson correlation, with # one dichotomous value replaced by "0" and the other by "1" def point_biserial(dichotomous,continous) ds = Daru::DataFrame.new({:d=>dichotomous,:c=>continous}).reject_values(*Daru::MISSING_VALUES) raise(TypeError, "First vector should be dichotomous") if ds[:d].factors.size != 2 raise(TypeError, "Second vector should be continous") if ds[:c].type != :numeric f0=ds[:d].factors.sort.to_a[0] m0=ds.filter_vector(:c) {|c| c[:d] == f0} m1=ds.filter_vector(:c) {|c| c[:d] != f0} ((m1.mean-m0.mean).to_f / ds[:c].sdp) * Math::sqrt(m0.size*m1.size.to_f / ds.nrows**2) end # Kendall Rank Correlation Coefficient (Tau a) # Based on Hervé Adbi article def tau_a(v1,v2) v1a,v2a=Statsample.only_valid_clone(v1,v2) n=v1.size v1r,v2r=v1a.ranked,v2a.ranked o1=ordered_pairs(v1r) o2=ordered_pairs(v2r) delta= o1.size*2-(o2 & o1).size*2 1-(delta * 2 / (n*(n-1)).to_f) end # Calculates Goodman and Kruskal’s Tau b correlation. # Tb is an asymmetric P-R-E measure of association for nominal scales # (Mielke, X) # # Tau-b defines perfect association as strict monotonicity. Although it # requires strict monotonicity to reach 1.0, it does not penalize ties as # much as some other measures. # == Reference # Mielke, P. GOODMAN–KRUSKAL TAU AND GAMMA. # Source: http://faculty.chass.ncsu.edu/garson/PA765/assocordinal.htm def tau_b(matrix) v=pairs(matrix) ((v['P']-v['Q']).to_f / Math::sqrt((v['P']+v['Q']+v['Y'])*(v['P']+v['Q']+v['X'])).to_f) end # Calculates Goodman and Kruskal's gamma. # # Gamma is the surplus of concordant pairs over discordant pairs, as a # percentage of all pairs ignoring ties. # # Source: http://faculty.chass.ncsu.edu/garson/PA765/assocordinal.htm def gamma(matrix) v=pairs(matrix) (v['P']-v['Q']).to_f / (v['P']+v['Q']).to_f end # Calculate indexes for a matrix the rows and cols has to be ordered def pairs(matrix) # calculate concordant #p matrix rs=matrix.row_size cs=matrix.column_size conc=disc=ties_x=ties_y=0 (0...(rs-1)).each do |x| (0...(cs-1)).each do |y| ((x+1)...rs).each do |x2| ((y+1)...cs).each do |y2| # #p sprintf("%d:%d,%d:%d",x,y,x2,y2) conc+=matrix[x,y]*matrix[x2,y2] end end end end (0...(rs-1)).each {|x| (1...(cs)).each{|y| ((x+1)...rs).each{|x2| (0...y).each{|y2| # #p sprintf("%d:%d,%d:%d",x,y,x2,y2) disc+=matrix[x,y]*matrix[x2,y2] } } } } (0...(rs-1)).each {|x| (0...(cs)).each{|y| ((x+1)...(rs)).each{|x2| ties_x+=matrix[x,y]*matrix[x2,y] } } } (0...rs).each {|x| (0...(cs-1)).each{|y| ((y+1)...(cs)).each{|y2| ties_y+=matrix[x,y]*matrix[x,y2] } } } {'P'=>conc,'Q'=>disc,'Y'=>ties_y,'X'=>ties_x} end def ordered_pairs(vector) d = vector.to_a a = [] (0...(d.size-1)).each do |i| ((i+1)...(d.size)).each do |j| a.push([d[i],d[j]]) end end a end =begin def sum_of_codeviated(v1,v2) v1a,v2a=Statsample.only_valid(v1,v2) sum=0 (0...v1a.size).each{|i| sum+=v1a[i]*v2a[i] } sum-((v1a.sum*v2a.sum) / v1a.size.to_f) end =end # Report the minimum number of cases valid of a covariate matrix # based on a dataset def min_n_valid(ds) min = ds.nrows m = n_valid_matrix(ds) for x in 0...m.row_size for y in 0...m.column_size min=m[x,y] if m[x,y] < min end end min end end end end