module Statsample # Dominance Analysis is a procedure based on an examination of the R2 values # for all possible subset models, to identify the relevance of one or more # predictors in the prediction of criterium. # # See Budescu(1993), Azen & Budescu (2003, 2006) for more information. # # == Use # # a=1000.times.collect {rand}.to_scale # b=1000.times.collect {rand}.to_scale # c=1000.times.collect {rand}.to_scale # ds={'a'=>a,'b'=>b,'c'=>c}.to_dataset # ds['y']=ds.collect{|row| row['a']*5+row['b']*3+row['c']*2+rand()} # da=Statsample::DominanceAnalysis.new(ds,'y') # puts da.summary # # === Output: # # Report: Report 2010-02-08 19:10:11 -0300 # Table: Dominance Analysis result # ------------------------------------------------------------ # | | r2 | sign | a | b | c | # ------------------------------------------------------------ # | Model 0 | | | 0.648 | 0.265 | 0.109 | # ------------------------------------------------------------ # | a | 0.648 | 0.000 | -- | 0.229 | 0.104 | # | b | 0.265 | 0.000 | 0.612 | -- | 0.104 | # | c | 0.109 | 0.000 | 0.643 | 0.260 | -- | # ------------------------------------------------------------ # | k=1 Average | | | 0.627 | 0.244 | 0.104 | # ------------------------------------------------------------ # | a*b | 0.877 | 0.000 | -- | -- | 0.099 | # | a*c | 0.752 | 0.000 | -- | 0.224 | -- | # | b*c | 0.369 | 0.000 | 0.607 | -- | -- | # ------------------------------------------------------------ # | k=2 Average | | | 0.607 | 0.224 | 0.099 | # ------------------------------------------------------------ # | a*b*c | 0.976 | 0.000 | -- | -- | -- | # ------------------------------------------------------------ # | Overall averages | | | 0.628 | 0.245 | 0.104 | # ------------------------------------------------------------ # # Table: Pairwise dominance # ----------------------------------------- # | Pairs | Total | Conditional | General | # ----------------------------------------- # | a - b | 1.0 | 1.0 | 1.0 | # | a - c | 1.0 | 1.0 | 1.0 | # | b - c | 1.0 | 1.0 | 1.0 | # ----------------------------------------- # # == Reference: # * Budescu, D. V. (1993). Dominance analysis: a new approach to the problem of relative importance of predictors in multiple regression. Psychological Bulletin, 114, 542-551. # * Azen, R. & Budescu, D.V. (2003). The dominance analysis approach for comparing predictors in multiple regression. Psychological Methods, 8(2), 129-148. # * Azen, R. & Budescu, D.V. (2006). Comparing predictors in Multivariate Regression Models: An extension of Dominance Analysis. Journal of Educational and Behavioral Statistics, 31(2), 157-180. # class DominanceAnalysis include Summarizable # Class to generate the regressions. Default to Statsample::Regression::Multiple::MatrixEngine attr_accessor :regression_class # Name of analysis attr_accessor :name # Set to true if you want to build from dataset, not correlation matrix attr_accessor :build_from_dataset # Array with independent variables. You could create subarrays, # to test groups of predictors as blocks attr_accessor :predictors # If you provide a matrix as input, you should set # the number of cases to define significance of R^2 attr_accessor :cases # Method of :regression_class used to measure association. # # Only necessary to change if you have multivariate dependent. # * :r2yx (R^2_yx), the default option, is the option when distinction # between independent and dependents variable is arbitrary # * :p2yx is the option when the distinction between independent and dependents variables is real. # attr_accessor :method_association attr_reader :dependent UNIVARIATE_REGRESSION_CLASS=Statsample::Regression::Multiple::MatrixEngine MULTIVARIATE_REGRESSION_CLASS=Statsample::Regression::Multiple::MultipleDependent def self.predictor_name(variable) if variable.is_a? Array sprintf("(%s)", variable.join(",")) else variable end end # Creates a new DominanceAnalysis object # Parameters: # * input: A Matrix or Dataset object # * dependent: Name of dependent variable. Could be an array, if you want to # do an Multivariate Regression Analysis. If nil, set to all # fields on input, except criteria def initialize(input, dependent, opts=Hash.new) @build_from_dataset=false if dependent.is_a? Array @regression_class= MULTIVARIATE_REGRESSION_CLASS @method_association=:r2yx else @regression_class= UNIVARIATE_REGRESSION_CLASS @method_association=:r2 end @name=nil opts.each{|k,v| self.send("#{k}=",v) if self.respond_to? k } @dependent=dependent @dependent=[@dependent] unless @dependent.is_a? Array @predictors ||= input.fields-@dependent @name=_("Dominance Analysis: %s over %s") % [ @predictors.flatten.join(",") , @dependent.join(",")] if @name.nil? if input.is_a? Statsample::Dataset @ds=input @matrix=Statsample::Bivariate.correlation_matrix(input) @cases=Statsample::Bivariate.min_n_valid(input) elsif input.is_a? ::Matrix @ds=nil @matrix=input else raise ArgumentError.new("You should use a Matrix or a Dataset") end @models=nil @models_data=nil @general_averages=nil end # Compute models. def compute create_models fill_models end def models if @models.nil? compute end @models end def models_data if @models_data.nil? compute end @models_data end def create_models @models=[] @models_data={} for i in 1..@predictors.size c=Statsample::Combination.new(i,@predictors.size) c.each do |data| independent=data.collect {|i1| @predictors[i1] } @models.push(independent) if (@build_from_dataset) data=@ds.dup(independent.flatten+@dependent) else data=@matrix.submatrix(independent.flatten+@dependent) end modeldata=ModelData.new(independent, data, self) models_data[independent.sort {|a,b| a.to_s<=>b.to_s}]=modeldata end end end def fill_models @models.each do |m| @predictors.each do |f| next if m.include? f base_model=md(m) comp_model=md(m+[f]) base_model.add_contribution(f,comp_model.r2) end end end private :create_models, :fill_models def dominance_for_nil_model(i,j) if md([i]).r2>md([j]).r2 1 elsif md([i]).r2m.contributions[j] dominances.push(1) elsif m.contributions[i]1 ? 0.5 : final[0] end # Returns 1 if i cD k, 0 if j cD i and 0.5 if undetermined def conditional_dominance_pairwise(i,j) dm=dominance_for_nil_model(i,j) return 0.5 if dm==0.5 dominances=[dm] for k in 1...@predictors.size a=average_k(k) if a[i]>a[j] dominances.push(1) elsif a[i]1 ? 0.5 : final[0] end # Returns 1 if i gD k, 0 if j gD i and 0.5 if undetermined def general_dominance_pairwise(i,j) ga=general_averages if ga[i]>ga[j] 1 elsif ga[i]b.to_s}] end # Get all model of size k def md_k(k) out=[] models=@models.each{|m| out.push(md(m)) if m.size==k } out end # For a hash with arrays of numbers as values # Returns a hash with same keys and # value as the mean of values of original hash def get_averages(averages) out={} averages.each{|key,val| out[key]=val.to_vector(:scale).mean } out end # Hash with average for each k size model. def average_k(k) return nil if k==@predictors.size models=md_k(k) averages=@predictors.inject({}) {|a,v| a[v]=[];a} models.each do |m| @predictors.each do |f| averages[f].push(m.contributions[f]) unless m.contributions[f].nil? end end get_averages(averages) end def general_averages if @general_averages.nil? averages=@predictors.inject({}) {|a,v| a[v]=[md([v]).r2];a} for k in 1...@predictors.size ak=average_k(k) @predictors.each do |f| averages[f].push(ak[f]) end end @general_averages=get_averages(averages) end @general_averages end def report_building(g) compute if @models.nil? g.section(:name=>@name) do |generator| header=["","r2",_("sign")]+@predictors.collect {|c| DominanceAnalysis.predictor_name(c) } generator.table(:name=>_("Dominance Analysis result"), :header=>header) do |t| row=[_("Model 0"),"",""]+@predictors.collect{|f| sprintf("%0.3f",md([f]).r2) } t.row(row) t.hr for i in 1..@predictors.size mk=md_k(i) mk.each{|m| t.row(m.add_table_row) } # Report averages a=average_k(i) if !a.nil? t.hr row=[_("k=%d Average") % i,"",""] + @predictors.collect{|f| sprintf("%0.3f",a[f]) } t.row(row) t.hr end end g=general_averages t.hr row=[_("Overall averages"),"",""]+@predictors.collect{|f| sprintf("%0.3f",g[f]) } t.row(row) end td=total_dominance cd=conditional_dominance gd=general_dominance generator.table(:name=>_("Pairwise dominance"), :header=>[_("Pairs"),_("Total"),_("Conditional"),_("General")]) do |t| pairs.each{|pair| name=pair.map{|v| v.is_a?(Array) ? "("+v.join("-")+")" : v}.join(" - ") row=[name, sprintf("%0.1f",td[pair]), sprintf("%0.1f",cd[pair]), sprintf("%0.1f",gd[pair])] t.row(row) } end end end class ModelData # :nodoc: attr_reader :contributions def initialize(independent, data, da) @independent=independent @data=data @predictors=da.predictors @dependent=da.dependent @cases=da.cases @method=da.method_association @contributions=@independent.inject({}){|a,v| a[v]=nil;a} r_class=da.regression_class if @dependent.size==1 @lr=r_class.new(data, @dependent[0], :cases=>@cases) else @lr=r_class.new(data, @dependent, :cases=>@cases) end end def add_contribution(f, v) @contributions[f]=v-r2 end def r2 @lr.send(@method) end def name @independent.collect {|variable| DominanceAnalysis.predictor_name(variable) }.join("*") end def add_table_row if @cases sign=sprintf("%0.3f", @lr.probability) else sign="???" end [name, sprintf("%0.3f",r2), sign] + @predictors.collect{|k| v=@contributions[k] if v.nil? "--" else sprintf("%0.3f",v) end } end def summary out=sprintf("%s: r2=%0.3f(p=%0.2f)\n",name, r2, @lr.significance, @lr.sst) out << @predictors.collect{|k| v=@contributions[k] if v.nil? "--" else sprintf("%s=%0.3f",k,v) end }.join(" | ") out << "\n" return out end end # end ModelData end # end Dominance Analysis end require 'statsample/dominanceanalysis/bootstrap'