require 'statsample/vector'

class Hash
  # Creates a Statsample::Dataset based on a Hash
  def to_dataset(*args)
    Statsample::Dataset.new(self, *args)
  end
end

class Array
  def prefix(s) # :nodoc:
    self.collect {|c| s+c.to_s }
  end
  def suffix(s) # :nodoc:
    self.collect {|c| c.to_s+s }
  end
end

module Statsample
  class DatasetException < RuntimeError # :nodoc:
    attr_reader :ds, :exp
    def initialize(ds, e)
      @ds=ds
      @exp=e
    end
    def to_s
      m="Error on iteration: "+@exp.message+"\n"+@exp.backtrace.join("\n")
      m+="\nRow ##{@ds.i}:#{@ds.case_as_hash(@ds.i)}" unless @ds.i.nil?
      m
    end
  end
  # Set of cases with values for one or more variables,
  # analogous to a dataframe in R or a standard data file in SPSS.
  # Every vector has a field name which identifies it. By default,
  # vectors are ordered by field name, but you can change the
  # field order manually.
  # The Dataset works like a Hash, whose keys are field names
  # and whose values are Statsample::Vector objects.
  #
  # ==Usage
  # Create an empty dataset:
  #   Dataset.new()
  # Create a dataset with three empty vectors, called v1, v2 and v3:
  #   Dataset.new(%w{v1 v2 v3})
  # Create a dataset with two vectors, called v1 and v2:
  #   Dataset.new({'v1'=>%w{1 2 3}.to_vector, 'v2'=>%w{4 5 6}.to_vector})
  # Create a dataset with two given vectors (v1 and v2),
  # with the vectors in inverted order:
  #   Dataset.new({'v2'=>v2, 'v1'=>v1}, ['v2','v1'])
  #
  # The fastest way to create a dataset is Hash#to_dataset, with
  # the field order as argument:
  #   v1 = [1,2,3].to_numeric
  #   v2 = [4,5,6].to_numeric
  #   ds = {'v1'=>v1, 'v2'=>v2}.to_dataset(%w{v2 v1})
  class Dataset
    include Writable
    include Summarizable
    # Hash of Statsample::Vector
    attr_reader :vectors
    # Ordered ids of vectors
    attr_reader :fields
    # Name of dataset
    attr_accessor :name
    # Number of cases
    attr_reader :cases
    # Location of the pointer during enumeration methods (like #each)
    attr_reader :i

    # Generates a new dataset, using three vectors:
    # - Rows
    # - Columns
    # - Values
    #
    # For example, you have these values
    #
    #   x   y   v
    #   a   a   0
    #   a   b   1
    #   b   a   1
    #   b   b   0
    #
    # You obtain
    #
    #   id  a   b
    #   a   0   1
    #   b   1   0
    #
    # Useful to process outputs from databases.
    def self.crosstab_by_asignation(rows, columns, values)
      raise "Three vectors should be equal size" if rows.size!=columns.size or rows.size!=values.size
      cols_values=columns.factors
      cols_n=cols_values.size
      h_rows=rows.factors.inject({}) {|a,v|
        a[v]=cols_values.inject({}) {|a1,v1| a1[v1]=nil; a1}
        a
      }
      values.each_index {|i|
        h_rows[rows[i]][columns[i]]=values[i]
      }
      ds=Dataset.new(["_id"]+cols_values)
      cols_values.each {|c| ds[c].type=values.type }
      rows.factors.each {|row|
        n_row=Array.new(cols_n+1)
        n_row[0]=row
        cols_values.each_index {|i|
          n_row[i+1]=h_rows[row][cols_values[i]]
        }
        ds.add_case_array(n_row)
      }
      ds.update_valid_data
      ds
    end
    # Returns true if any vector has missing data
    def has_missing_data?
      @vectors.any? {|k,v| v.has_missing_data?}
    end
    # Returns a nested hash using +tree_keys+ fields as keys and
    # an array of hashes with the remaining values.
    # If a block is provided, it is used to compute the values,
    # receiving the +row+ of the dataset, the +current+ last hash
    # on the hierarchy and the +name+ of the key to include.
    def nest(*tree_keys, &block)
      tree_keys=tree_keys[0] if tree_keys[0].is_a? Array
      out=Hash.new
      each do |row|
        current=out
        # Create tree
        tree_keys[0, tree_keys.size-1].each do |f|
          root=row[f]
          current[root]||=Hash.new
          current=current[root]
        end
        name=row[tree_keys.last]
        if !block
          current[name]||=Array.new
          current[name].push(row.delete_if {|key, value| tree_keys.include? key})
        else
          current[name]=block.call(row, current, name)
        end
      end
      out
    end
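    # A minimal usage sketch for #nest (hypothetical data; the field
    # names and values are illustrative only):
    #
    #   a  = %w{a a b}.to_vector(:object)
    #   b  = %w{x y x}.to_vector(:object)
    #   v  = [1, 2, 3].to_numeric
    #   ds = {'a'=>a, 'b'=>b, 'v'=>v}.to_dataset
    #   ds.nest('a', 'b')
    #   # => {"a"=>{"x"=>[{"v"=>1}], "y"=>[{"v"=>2}]}, "b"=>{"x"=>[{"v"=>3}]}}
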
    # Creates a new dataset. A dataset is a set of ordered, named vectors
    # of the same size.
    #
    # [vectors] With an array, creates a set of empty vectors named after
    #           the values of the array. With a hash, each Vector is assigned
    #           as a variable of the Dataset, named after its key.
    # [fields]  Array of names for vectors. Only used to set the order of
    #           variables. If empty, vector keys in alphabetic order are
    #           used as fields.
    def initialize(vectors={}, fields=[])
      @@n_dataset||=0
      @@n_dataset+=1
      @name=_("Dataset %d") % @@n_dataset
      @cases=0
      @gsl=nil
      @i=nil
      if vectors.instance_of? Array
        @fields=vectors.dup
        @vectors=vectors.inject({}) {|a,x| a[x]=Statsample::Vector.new(); a}
      else
        # Check vectors
        @vectors=vectors
        @fields=fields
        check_order
        check_length
      end
    end
    #
    # Creates a copy of the given dataset, deleting all cases with
    # missing data on any of the vectors.
    #
    # @param args [Array] fields to include. With no value, includes all fields.
    #
    def dup_only_valid(*fields_to_include)
      if fields_to_include.size==1 and fields_to_include[0].is_a? Array
        fields_to_include=fields_to_include[0]
      end
      fields_to_include=@fields if fields_to_include.size==0
      if fields_to_include.any? {|f| @vectors[f].has_missing_data?}
        ds=Dataset.new(fields_to_include)
        fields_to_include.each {|f| ds[f].type=@vectors[f].type}
        each {|row|
          unless fields_to_include.any? {|f| @vectors[f].has_missing_data? and !@vectors[f].is_valid?(row[f])}
            row_2=fields_to_include.inject({}) {|ac,v| ac[v]=row[v]; ac}
            ds.add_case(row_2)
          end
        }
      else
        ds=dup fields_to_include
      end
      ds.name=self.name
      ds
    end
    #
    # Returns a duplicate of the Dataset.
    # All vectors are copied, so any modification on the new
    # dataset doesn't affect the original dataset's vectors.
    # If fields are given as a parameter, only those vectors are included.
    #
    # @param args [Array] fields to include. With no value, includes all fields.
    # @return [Statsample::Dataset]
    def dup(*fields_to_include)
      if fields_to_include.size==1 and fields_to_include[0].is_a? Array
        fields_to_include=fields_to_include[0]
      end
      fields_to_include=@fields if fields_to_include.size==0
      vectors={}
      fields=[]
      fields_to_include.each {|f|
        raise "Vector #{f} doesn't exist" unless @vectors.has_key? f
        vectors[f]=@vectors[f].dup
        fields.push(f)
      }
      ds=Dataset.new(vectors, fields)
      ds.name=self.name
      ds
    end
    # Returns an array with the fields from the first argument to the last argument
    def from_to(from, to)
      raise ArgumentError, "Field #{from} should be on dataset" if !@fields.include? from
      raise ArgumentError, "Field #{to} should be on dataset" if !@fields.include? to
      @fields.slice(@fields.index(from)..@fields.index(to))
    end
    # Returns (when possible) a cheap copy of the dataset.
    # If no vector has missing values, returns the original vectors.
    # If missing values are present, uses Dataset#dup_only_valid.
    #
    # @param args [Array] fields to include. With no value, includes all fields.
    # @return [Statsample::Dataset]
    def clone_only_valid(*fields_to_include)
      if fields_to_include.size==1 and fields_to_include[0].is_a? Array
        fields_to_include=fields_to_include[0]
      end
      fields_to_include=@fields.dup if fields_to_include.size==0
      if fields_to_include.any? {|v| @vectors[v].has_missing_data?}
        dup_only_valid(fields_to_include)
      else
        clone(fields_to_include)
      end
    end
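    # A sketch of the copy semantics of #dup, #clone and #dup_only_valid,
    # assuming a vector with one missing (nil) value:
    #
    #   v1 = [1, 2, nil].to_numeric
    #   ds = {'v1'=>v1}.to_dataset
    #   ds.dup['v1'].equal?(ds['v1'])    # => false (vectors are copied)
    #   ds.clone['v1'].equal?(ds['v1'])  # => true  (vectors are shared)
    #   ds.dup_only_valid.cases          # => 2     (case with nil dropped)
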
    # Returns a shallow copy of the Dataset.
    # The object id will be distinct, but @vectors will be the same.
    # @param args [Array] fields to include. With no value, includes all fields.
    # @return [Statsample::Dataset]
    def clone(*fields_to_include)
      if fields_to_include.size==1 and fields_to_include[0].is_a? Array
        fields_to_include=fields_to_include[0]
      end
      fields_to_include=@fields.dup if fields_to_include.size==0
      ds=Dataset.new
      fields_to_include.each {|f|
        raise "Vector #{f} doesn't exist" unless @vectors.has_key? f
        ds[f]=@vectors[f]
      }
      ds.fields=fields_to_include
      ds.name=@name
      ds.update_valid_data
      ds
    end
    # Creates a copy of the given dataset, without data on vectors
    #
    # @return [Statsample::Dataset]
    def dup_empty
      vectors=@vectors.inject({}) {|a,v|
        a[v[0]]=v[1].dup_empty
        a
      }
      Dataset.new(vectors, @fields.dup)
    end
    # Merges vectors from two datasets.
    # In case of name collision, the vector names are changed to
    # x_1, x_2, ...
    #
    # @return [Statsample::Dataset]
    def merge(other_ds)
      raise "Cases should be equal (this:#{@cases}; other:#{other_ds.cases})" unless @cases==other_ds.cases
      types=@fields.collect{|f| @vectors[f].type} + other_ds.fields.collect{|f| other_ds[f].type}
      new_fields=(@fields+other_ds.fields).recode_repeated
      ds_new=Statsample::Dataset.new(new_fields)
      new_fields.each_index {|i|
        field=new_fields[i]
        ds_new[field].type=types[i]
      }
      @cases.times {|i|
        row=case_as_array(i)+other_ds.case_as_array(i)
        ds_new.add_case_array(row)
      }
      ds_new.update_valid_data
      ds_new
    end
    # Joins two datasets by the given fields.
    # +type+ is one of :left and :inner; default is :left.
    #
    # @return [Statsample::Dataset]
    def join(other_ds, fields_1=[], fields_2=[], type=:left)
      fields_new=other_ds.fields - fields_2
      fields=self.fields + fields_new
      other_ds_hash={}
      other_ds.each do |row|
        key=row.select {|k,v| fields_2.include?(k)}.values
        value=row.select {|k,v| fields_new.include?(k)}
        if other_ds_hash[key].nil?
          other_ds_hash[key]=[value]
        else
          other_ds_hash[key] << value
        end
      end
      new_ds=Dataset.new(fields)
      self.each do |row|
        key=row.select {|k,v| fields_1.include?(k)}.values
        new_case=row.dup
        if other_ds_hash[key].nil?
          if type == :left
            fields_new.each {|field| new_case[field]=nil}
            new_ds.add_case(new_case)
          end
        else
          other_ds_hash[key].each do |new_values|
            new_ds.add_case new_case.merge(new_values)
          end
        end
      end
      new_ds
    end
    # Returns a dataset with standardized data.
    #
    # @return [Statsample::Dataset]
    def standarize
      ds=dup()
      ds.fields.each do |f|
        ds[f]=ds[f].vector_standarized
      end
      ds
    end
    # Generates a matrix, based on the fields of the dataset.
    #
    # @return [::Matrix]
    def collect_matrix
      rows=@fields.collect {|row|
        @fields.collect {|col|
          yield row, col
        }
      }
      Matrix.rows(rows)
    end
    # Two datasets are equal if their +vectors+ and +fields+ are equal.
    #
    # @return [Boolean]
    def ==(d2)
      @vectors==d2.vectors and @fields==d2.fields
    end
    # Returns vector +c+.
    #
    # @return [Statsample::Vector]
    def col(c)
      @vectors[c]
    end
    alias_method :vector, :col
    # Equal to Dataset[name]=vector.
    #
    # @return self
    def add_vector(name, vector)
      raise ArgumentError, "Vector has different size" if vector.size!=@cases
      @vectors[name]=vector
      check_order
      self
    end
    # Returns true if the dataset has vector +v+.
    #
    # @return [Boolean]
    def has_vector?(v)
      @vectors.has_key?(v)
    end
    # Creates a dataset of size +n+, sampling cases at random
    # with replacement. If +n+ is not given, uses the original
    # number of cases.
    #
    # @return [Statsample::Dataset]
    def bootstrap(n=nil)
      n||=@cases
      ds_boot=dup_empty
      n.times do
        ds_boot.add_case_array(case_as_array(rand(@cases)))
      end
      ds_boot.update_valid_data
      ds_boot
    end
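    # A hedged sketch of #join (field names and data are illustrative):
    #
    #   people = {'id'  =>[1, 2, 3].to_numeric,
    #             'name'=>%w{ana john peter}.to_vector(:object)}.to_dataset(%w{id name})
    #   jobs   = {'id' =>[1, 3].to_numeric,
    #             'job'=>%w{teacher driver}.to_vector(:object)}.to_dataset(%w{id job})
    #   people.join(jobs, ['id'], ['id'])          # left join: john gets job=nil
    #   people.join(jobs, ['id'], ['id'], :inner)  # inner join: only ids 1 and 3
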
    # Fast version of #add_case.
    # Can only add one case, and no error checking is performed.
    # You SHOULD use #update_valid_data at the end of the insertion cycle.
    def add_case_array(v)
      v.each_index {|i| d=@vectors[@fields[i]].data; d.push(v[i])}
    end
    # Inserts a case, using:
    # * Array: size equal to the number of vectors, and values in the same order as fields
    # * Hash: keys equal to fields
    # If +uvd+ is false, #update_valid_data is not executed after
    # inserting a case. This is very useful if you want to increase
    # performance when inserting many cases, because #update_valid_data
    # performs checks on the vectors and on the dataset.
    def add_case(v, uvd=true)
      case v
      when Array
        if (v[0].is_a? Array)
          v.each {|subv| add_case(subv, false)}
        else
          raise ArgumentError, "Input array size (#{v.size}) should be equal to fields number (#{@fields.size})" if @fields.size!=v.size
          v.each_index {|i| @vectors[@fields[i]].add(v[i], false)}
        end
      when Hash
        raise ArgumentError, "Hash keys should be equal to fields #{(v.keys - @fields).join(",")}" if @fields.sort!=v.keys.sort
        @fields.each {|f| @vectors[f].add(v[f], false)}
      else
        raise TypeError, 'Value must be an Array or a Hash'
      end
      if uvd
        update_valid_data
      end
    end
    # Checks vectors and fields after inserting data. Use only
    # after #add_case_array or #add_case with the second parameter false.
    def update_valid_data
      @gsl=nil
      @fields.each {|f| @vectors[f].set_valid_data}
      check_length
    end
    # Deletes the vector named +name+. Multiple fields accepted.
    def delete_vector(*args)
      if args.size==1 and args[0].is_a? Array
        names=args[0]
      else
        names=args
      end
      names.each do |name|
        @fields.delete(name)
        @vectors.delete(name)
      end
    end
    # Splits the vector +name_+ by separator +sep+ and adds one
    # recoded vector per distinct value, named +name_+ plus +join+
    # and a sequential number.
    def add_vectors_by_split_recode(name_, join='-', sep=Statsample::SPLIT_TOKEN)
      split=@vectors[name_].split_by_separator(sep)
      i=1
      split.each {|k,v|
        new_field=name_+join+i.to_s
        v.name=name_+":"+k
        add_vector(new_field, v)
        i+=1
      }
    end
    # Splits the vector +name+ by separator +sep+ and adds one
    # vector per distinct value, named +name+ plus +join+ and the value.
    def add_vectors_by_split(name, join='-', sep=Statsample::SPLIT_TOKEN)
      split=@vectors[name].split_by_separator(sep)
      split.each {|k,v|
        add_vector(name+join+k, v)
      }
    end
    # Returns a vector of +type+, built by yielding each row to the block.
    def vector_by_calculation(type=:numeric)
      a=[]
      each do |row|
        a.push(yield(row))
      end
      a.to_vector(type)
    end
    # Returns a vector with the sum of fields for each case.
    # If the fields parameter is empty, sums all fields.
    def vector_sum(fields=nil)
      fields||=@fields
      vector=collect_with_index do |row, i|
        if (fields.find {|f| !@vectors[f].data_with_nils[i]})
          nil
        else
          fields.inject(0) {|ac,v| ac + row[v].to_f}
        end
      end
      vector.name=_("Sum from %s") % @name
      vector
    end
    # Checks if the #fields attribute is correct, after inserting or deleting vectors
    def check_fields(fields)
      fields||=@fields
      raise "Fields #{(fields-@fields).join(", ")} don't exist on dataset" if (fields-@fields).size>0
      fields
    end
    # Returns a vector with the number of missing values for each case
    def vector_missing_values(fields=nil)
      fields=check_fields(fields)
      collect_with_index do |row, i|
        fields.inject(0) {|a,v|
          a + ((@vectors[v].data_with_nils[i].nil?) ? 1 : 0)
        }
      end
    end
    # Returns a vector with the number of characters of the values
    # on +fields+ for each case
    def vector_count_characters(fields=nil)
      fields=check_fields(fields)
      collect_with_index do |row, i|
        fields.inject(0) {|a,v|
          a + ((@vectors[v].data_with_nils[i].nil?) ? 0 : row[v].to_s.size)
        }
      end
    end
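    # A short sketch of case insertion and the row-wise helpers above,
    # assuming default (empty) vectors and hypothetical values:
    #
    #   ds = Statsample::Dataset.new(%w{a b})
    #   ds.add_case([1, 2])              # Array form
    #   ds.add_case({'a'=>3, 'b'=>nil})  # Hash form
    #   ds.vector_sum.to_a               # => [3.0, nil] (row with nil excluded)
    #   ds.vector_missing_values.to_a    # => [0, 1]
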
    # Returns a vector with the mean for a set of fields.
    # If the fields parameter is empty, returns the mean for all fields.
    # If max_invalid > 0, returns the mean for all tuples
    # with 0 to max_invalid invalid fields.
    def vector_mean(fields=nil, max_invalid=0)
      a=[]
      fields=check_fields(fields)
      size=fields.size
      each_with_index do |row, i|
        # number of invalid values
        sum=0
        invalids=0
        fields.each {|f|
          if !@vectors[f].data_with_nils[i].nil?
            sum+=row[f].to_f
          else
            invalids+=1
          end
        }
        if (invalids>max_invalid)
          a.push(nil)
        else
          a.push(sum.quo(size-invalids))
        end
      end
      a=a.to_vector(:numeric)
      a.name=_("Means from %s") % @name
      a
    end
    # Checks vectors for type and size.
    def check_length # :nodoc:
      size=nil
      @vectors.each do |k,v|
        raise Exception, "Data #{v.class} is not a vector on key #{k}" if !v.is_a? Statsample::Vector
        if size.nil?
          size=v.size
        else
          if v.size!=size
            raise Exception, "Vector #{k} has size #{v.size} and dataset has size #{size}"
          end
        end
      end
      @cases=size
    end
    # Retrieves each vector as [key, vector]
    def each_vector # :yield: key, vector
      @fields.each {|k| yield k, @vectors[k]}
    end
    if Statsample::STATSAMPLE__.respond_to?(:case_as_hash)
      def case_as_hash(c) # :nodoc:
        Statsample::STATSAMPLE__.case_as_hash(self, c)
      end
    else
      # Retrieves case +i+ as a hash
      def case_as_hash(i)
        _case_as_hash(i)
      end
    end
    if Statsample::STATSAMPLE__.respond_to?(:case_as_array)
      def case_as_array(c) # :nodoc:
        Statsample::STATSAMPLE__.case_as_array(self, c)
      end
    else
      # Retrieves case +i+ as an array, ordered by #fields order
      def case_as_array(i)
        _case_as_array(i)
      end
    end
    def _case_as_hash(c) # :nodoc:
      @fields.inject({}) {|a,x| a[x]=@vectors[x][c]; a}
    end
    def _case_as_array(c) # :nodoc:
      @fields.collect {|x| @vectors[x][c]}
    end
    # Yields each case as a hash
    def each
      begin
        @i=0
        @cases.times {|i|
          @i=i
          row=case_as_hash(i)
          yield row
        }
        @i=nil
      rescue => e
        raise DatasetException.new(self, e)
      end
    end
    # Yields each case as a hash, with its index
    def each_with_index # :yield: case, i
      begin
        @i=0
        @cases.times {|i|
          @i=i
          row=case_as_hash(i)
          yield row, i
        }
        @i=nil
      rescue => e
        raise DatasetException.new(self, e)
      end
    end
    # Yields each case as an array, coding missing values as nils
    def each_array_with_nils
      m=fields.size
      @cases.times {|i|
        @i=i
        row=Array.new(m)
        fields.each_index {|j|
          f=fields[j]
          row[j]=@vectors[f].data_with_nils[i]
        }
        yield row
      }
      @i=nil
    end
    # Yields each case as an array
    def each_array
      @cases.times {|i|
        @i=i
        row=case_as_array(i)
        yield row
      }
      @i=nil
    end
    # Sets the fields order. If you omit one or more vectors, they are
    # appended in alphabetic order.
    def fields=(f)
      @fields=f
      check_order
    end
    # Checks congruence between the +fields+ attribute
    # and the keys of +vectors+
    def check_order # :nodoc:
      if (@vectors.keys.sort!=@fields.sort)
        @fields=@fields & @vectors.keys
        @fields+=@vectors.keys.sort - @fields
      end
    end
    # Returns the vector named +i+
    def [](i)
      if i.is_a? Range
        fields=from_to(i.begin, i.end)
        clone(*fields)
      elsif i.is_a? Array
        clone(i)
      else
        raise Exception, "Vector '#{i}' doesn't exist on dataset" unless @vectors.has_key?(i)
        @vectors[i]
      end
    end
    # Retrieves a Statsample::Vector, based on the result
    # of a calculation performed on each case.
    def collect(type=:numeric)
      data=[]
      each {|row|
        data.push yield(row)
      }
      Statsample::Vector.new(data, type)
    end
    # Same as #collect, but giving the case index as second parameter on yield.
    def collect_with_index(type=:numeric)
      data=[]
      each_with_index {|row, i|
        data.push(yield(row, i))
      }
      Statsample::Vector.new(data, type)
    end
    # Recodes a vector based on a block
    def recode!(vector_name)
      0.upto(@cases-1) {|i|
        @vectors[vector_name].data[i]=yield case_as_hash(i)
      }
      @vectors[vector_name].set_valid_data
    end
    # Returns a Statsample::Crosstab of the vectors +v1+ and +v2+
    def crosstab(v1, v2, opts={})
      Statsample::Crosstab.new(@vectors[v1], @vectors[v2], opts)
    end
    def []=(i, v)
      if v.instance_of? Statsample::Vector
        @vectors[i]=v
        check_order
      else
        raise ArgumentError, "Should pass a Statsample::Vector"
      end
    end
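    # An iteration sketch for the accessors above (hypothetical data):
    #
    #   ds = {'x'=>[1, 2, 3].to_numeric}.to_dataset
    #   ds.each {|row| puts row['x'] }            # cases as hashes
    #   doubled = ds.collect {|row| row['x']*2 }  # => Vector [2, 4, 6]
    #   ds.recode!('x') {|row| row['x']+1 }       # rewrites 'x' in place
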
    # Returns data as a matrix. Columns are ordered by #fields and
    # rows by order of insertion.
    def to_matrix
      rows=[]
      self.each_array {|c|
        rows.push(c)
      }
      Matrix.rows(rows)
    end
    if Statsample.has_gsl?
      def clear_gsl
        @gsl=nil
      end

      def to_gsl
        if @gsl.nil?
          if cases.nil?
            update_valid_data
          end
          @gsl=GSL::Matrix.alloc(cases, fields.size)
          self.each_array {|c|
            @gsl.set_row(@i, c)
          }
        end
        @gsl
      end
    end
    # Returns a correlation matrix for the fields included as parameters.
    # By default, uses all fields of the dataset.
    def correlation_matrix(fields=nil)
      if fields
        ds=clone(fields)
      else
        ds=self
      end
      Statsample::Bivariate.correlation_matrix(ds)
    end
    # Returns a covariance matrix for the fields included as parameters.
    # By default, uses all fields of the dataset.
    def covariance_matrix(fields=nil)
      if fields
        ds=clone(fields)
      else
        ds=self
      end
      Statsample::Bivariate.covariance_matrix(ds)
    end
    # Creates a new dataset with all cases for which the block returns true
    def filter
      ds=self.dup_empty
      each {|c|
        ds.add_case(c, false) if yield c
      }
      ds.update_valid_data
      ds.name=_("%s(filtered)") % @name
      ds
    end
    # Creates a new vector with the data of a given field
    # for the cases for which the block returns true
    def filter_field(field)
      a=[]
      each do |c|
        a.push(c[field]) if yield c
      end
      a.to_vector(@vectors[field].type)
    end
    # Creates a Statsample::Multiset, using one or more fields
    # to split the dataset.
    def to_multiset_by_split(*fields)
      require 'statsample/multiset'
      if fields.size==1
        to_multiset_by_split_one_field(fields[0])
      else
        to_multiset_by_split_multiple_fields(*fields)
      end
    end
    # Creates a Statsample::Multiset, using one field
    def to_multiset_by_split_one_field(field)
      raise ArgumentError, "Should use a correct field name" if !@fields.include? field
      factors=@vectors[field].factors
      ms=Multiset.new_empty_vectors(@fields, factors)
      each {|c|
        ms[c[field]].add_case(c, false)
      }
      ms.datasets.each {|k,ds|
        ds.update_valid_data
        ds.name=@vectors[field].labeling(k)
        ds.vectors.each {|k1,v1|
          v1.type=@vectors[k1].type
          v1.name=@vectors[k1].name
          v1.labels=@vectors[k1].labels
        }
      }
      ms
    end
    # Creates a Statsample::Multiset, using multiple fields
    def to_multiset_by_split_multiple_fields(*fields)
      factors_total=nil
      fields.each do |f|
        if factors_total.nil?
          factors_total=@vectors[f].factors.collect {|c| [c]}
        else
          suma=[]
          factors=@vectors[f].factors
          factors_total.each {|f1|
            factors.each {|f2|
              suma.push(f1+[f2])
            }
          }
          factors_total=suma
        end
      end
      ms=Multiset.new_empty_vectors(@fields, factors_total)
      p1=eval "Proc.new {|c| ms[["+fields.collect{|f| "c['#{f}']"}.join(",")+"]].add_case(c,false) }"
      each {|c| p1.call(c)}
      ms.datasets.each do |k,ds|
        ds.update_valid_data
        ds.name=fields.size.times.map {|i|
          f=fields[i]
          sk=k[i]
          @vectors[f].labeling(sk)
        }.join("-")
        ds.vectors.each {|k1,v1|
          v1.type=@vectors[k1].type
          v1.name=@vectors[k1].name
          v1.labels=@vectors[k1].labels
        }
      end
      ms
    end
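    # A hedged sketch of filtering and splitting (hypothetical data):
    #
    #   sex = %w{m f m f}.to_vector(:object)
    #   age = [20, 30, 40, 50].to_numeric
    #   ds  = {'sex'=>sex, 'age'=>age}.to_dataset
    #   adults = ds.filter {|row| row['age']>=30 }  # 3 cases
    #   ms = ds.to_multiset_by_split('sex')         # one dataset per sex value
    #   ms['m'].cases                               # => 2
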
    # Returns a vector built from a string containing a calculation
    # based on the vectors of the dataset.
    # The calculation is eval'ed, so you can use any variable
    # or expression valid in Ruby.
    # For example:
    #   a=[1,2].to_vector(:numeric)
    #   b=[3,4].to_vector(:numeric)
    #   ds={'a'=>a, 'b'=>b}.to_dataset
    #   ds.compute("a+b")
    #   => Vector [4,6]
    def compute(text)
      @fields.each {|f|
        if @vectors[f].type==:numeric
          text.gsub!(f, "row['#{f}'].to_f")
        else
          text.gsub!(f, "row['#{f}']")
        end
      }
      collect_with_index {|row, i|
        invalid=false
        @fields.each {|f|
          if @vectors[f].data_with_nils[i].nil?
            invalid=true
          end
        }
        if invalid
          nil
        else
          eval(text)
        end
      }
    end
    # Tests each row with one or more tests.
    # Each test is an array of the form
    #   [description, fields, proc]
    # where the proc is called with the row, e.g.
    #   Proc.new {|row| row['age']>0}
    # The function returns an array with all errors.
    def verify(*tests)
      if (tests[0].is_a? String)
        id=tests[0]
        tests.shift
      else
        id=@fields[0]
      end
      vr=[]
      i=0
      each do |row|
        i+=1
        tests.each {|test|
          if !test[2].call(row)
            values=""
            if test[1].size>0
              values=" ("+test[1].collect{|k| "#{k}=#{row[k]}"}.join(", ")+")"
            end
            vr.push("#{i} [#{row[id]}]: #{test[0]}#{values}")
          end
        }
      end
      vr
    end
    def to_s
      "#<"+self.class.to_s+":"+self.object_id.to_s+" @name=#{@name} @fields=["+@fields.join(",")+"] cases="+@vectors[@fields[0]].size.to_s+">"
    end
    def inspect
      self.to_s
    end
    # Creates a new dataset for one-to-many relations
    # on a dataset, based on a pattern of field names.
    #
    # For example, you have a survey on the number of children,
    # with this structure:
    #   id, name, child_name_1, child_age_1, child_name_2, child_age_2
    # With
    #   ds.one_to_many(%w{id}, "child_%v_%n")
    # the fields of the first parameter are copied verbatim
    # to the new dataset, and fields which match the second
    # pattern are added, one case for each distinct %n.
    # For example:
    #   cases=[
    #     ['1','george','red',10,'blue',20,nil,nil],
    #     ['2','fred','green',15,'orange',30,'white',20],
    #     ['3','alfred',nil,nil,nil,nil,nil,nil]
    #   ]
    #   ds=Statsample::Dataset.new(%w{id name car_color1 car_value1 car_color2 car_value2 car_color3 car_value3})
    #   cases.each {|c| ds.add_case_array c }
    #   ds.one_to_many(['id'],'car_%v%n').to_matrix
    #   => Matrix[
    #      ["1", 1, "red", 10],
    #      ["1", 2, "blue", 20],
    #      ["2", 1, "green", 15],
    #      ["2", 2, "orange", 30],
    #      ["2", 3, "white", 20]
    #      ]
    def one_to_many(parent_fields, pattern)
      re=Regexp.new pattern.gsub("%v","(.+?)").gsub("%n","(\\d+?)")
      ds_vars=parent_fields.dup
      vars=[]
      max_n=0
      h=parent_fields.inject({}) {|a,v|
        a[v]=Statsample::Vector.new([], @vectors[v].type)
        a
      }
      # Adding _col_id
      h['_col_id']=[].to_numeric
      ds_vars.push("_col_id")
      @fields.each do |f|
        if f=~re
          if !vars.include? $1
            vars.push($1)
            h[$1]=Statsample::Vector.new([], @vectors[f].type)
          end
          max_n=$2.to_i if max_n < $2.to_i
        end
      end
      ds=Dataset.new(h, ds_vars+vars)
      each do |row|
        row_out={}
        parent_fields.each do |f|
          row_out[f]=row[f]
        end
        max_n.times do |n1|
          n=n1+1
          any_data=false
          vars.each do |v|
            data=row[pattern.gsub("%v",v.to_s).gsub("%n",n.to_s)]
            row_out[v]=data
            any_data=true if !data.nil?
          end
          if any_data
            row_out["_col_id"]=n
            ds.add_case(row_out, false)
          end
        end
      end
      ds.update_valid_data
      ds
    end
    def report_building(b)
      b.section(:name=>@name) do |g|
        g.text _("Cases: %d") % cases
        @fields.each do |f|
          g.text "Element:[#{f}]"
          g.parse_element(@vectors[f])
        end
      end
    end
  end
end