lib/statsample/dataset.rb in statsample-0.5.0 vs lib/statsample/dataset.rb in statsample-0.5.1
- old
+ new
@@ -1,227 +1,289 @@
require 'statsample/vector'
class Hash
- def to_dataset(*args)
- Statsample::Dataset.new(self,*args)
- end
+ def to_dataset(*args)
+ Statsample::Dataset.new(self,*args)
+ end
end
class Array
- def prefix(s)
- self.collect{|c|
- s+c.to_s
+ def prefix(s)
+ self.collect{|c| s+c.to_s }
+ end
+ def suffix(s)
+ self.collect{|c| c.to_s+s }
+ end
+end
+
+module Statsample
+ class DatasetException < RuntimeError
+ attr_reader :ds,:exp
+ def initialize(ds,e)
+ @ds=ds
+ @exp=e
+ end
+ def to_s
+ m="Error: "+@exp.message+"\n"+@exp.backtrace.join("\n")+"\nOn Dataset:"+@ds.inspect
+ m+="\nRow: #{@i}" unless @i.nil?
+ m
+ end
+ end
+ class Dataset
+ include Writable
+ attr_reader :vectors, :fields, :cases, :i
+ attr_accessor :labels
+
+ # Generates a new dataset, using three vectors
+ # - Rows
+ # - Columns
+ # - Values
+ # For example, you have these values
+ #
+ # x y v
+ # a a 0
+ # a b 1
+ # b a 1
+ # b b 0
+ #
+ # You obtain
+ # id a b
+ # a 0 1
+ # b 1 0
+ #
+ # Useful to process outputs from databases
+ #
+ def self.crosstab_by_asignation(rows,columns,values)
+ raise "Three vectors should be equal size" if rows.size!=columns.size or rows.size!=values.size
+ cols_values=columns.factors
+ cols_n=cols_values.size
+ h_rows=rows.factors.inject({}){|a,v| a[v]=cols_values.inject({}){
+ |a1,v1| a1[v1]=nil; a1
}
+ ;a}
+ values.each_index{|i|
+ h_rows[rows[i]][columns[i]]=values[i]
+ }
+ ds=Dataset.new(["_id"]+cols_values)
+ cols_values.each{|c|
+ ds[c].type=values.type
+ }
+ rows.factors.each {|row|
+ n_row=Array.new(cols_n+1)
+ n_row[0]=row
+ cols_values.each_index {|i|
+ n_row[i+1]=h_rows[row][cols_values[i]]
+ }
+ ds.add_case_array(n_row)
+ }
+ ds.update_valid_data
+ ds
end
- def suffix(s)
- self.collect{|c|
- c.to_s+s
+ # Creates a new dataset. A dataset is a set of ordered named vectors
+ # of the same size.
+ #
+ # [vectors] With an array, creates a set of empty vectors named as
+ # values on the array. With a hash, each Vector is assigned as
+ # a variable of the Dataset named as its key
+ # [fields] Array of names for vectors. Only used to set the
+ # order of variables. If empty, vector keys in alphabetic order are
+ # used as fields
+ # [labels] Hash to set names for fields.
+ #
+ #
+ # Dataset.new()
+ # Dataset.new(%w{v1 v2 v3})
+ # Dataset.new({'v1'=>%w{1 2 3}.to_vector, 'v2'=>%w{4 5 6}.to_vector})
+ # Dataset.new({'v2'=>v2,'v1'=>v1},['v1','v2'])
+ #
+ # The fast way to create a dataset uses Hash#to_dataset, with
+ # fields and labels as arguments
+ # ds = {'v1'=>[1,2,3].to_vector}.to_dataset
+ #
+ def initialize(vectors={}, fields=[], labels={})
+ if vectors.instance_of? Array
+ @fields=vectors.dup
+ @vectors=vectors.inject({}){|a,x| a[x]=Statsample::Vector.new(); a}
+ else
+ # Check vectors
+ @vectors=vectors
+ @fields=fields
+ check_order
+ check_length
+ end
+ @i=nil
+ @labels=labels
+ end
+ def to_gsl_matrix
+ matrix=GSL::Matrix.alloc(cases,@vectors.size)
+ each_array do |row|
+ row.each_index{|y| matrix.set(@i,y,row[y]) }
+ end
+ matrix
+ end
+ def vector_label(v_id)
+ raise "Vector #{v} doesn't exists" unless @fields.include? v_id
+ @labels[v_id].nil? ? v_id : @labels[v_id]
+ end
+ # Creates a copy of the given dataset, deleting all the cases with
+ # missing data on one of the vectors
+ def dup_only_valid
+ if @vectors.find{|field,vector| vector.has_missing_data?}
+ ds=dup_empty
+ each_array { |c|
+ ds.add_case_array(c) unless @fields.find{|f| @vectors[f].data_with_nils[@i].nil? }
}
+ ds.update_valid_data
+ else
+ ds=dup()
+ end
+ ds
end
-end
-
-module Statsample
- class DatasetException < RuntimeError
- attr_reader :ds,:exp
- def initialize(ds,e)
- @ds=ds
- @exp=e
- end
- def to_s
- m="Error: "+@exp.message+"\n"+@exp.backtrace.join("\n")+"\nOn Dataset:"+@ds.inspect
- m+="\nRow: #{@i}" unless @i.nil?
- m
- end
+ # Returns an array with the fields from the first argument to the last argument
+ def from_to(from,to)
+ raise ArgumentError, "Field #{from} should be on dataset" if !@fields.include? from
+ raise ArgumentError, "Field #{to} should be on dataset" if !@fields.include? to
+ @fields.slice(@fields.index(from)..@fields.index(to))
end
- class Dataset
- include Writable
- attr_reader :vectors, :fields, :cases, :i
- attr_accessor :labels
- # Creates a new dataset. A dataset is a set of ordered named vectors
- # of the same size.
- #
- # [vectors] With an array, creates a set of empty vectors named as
- # values on the array. With a hash, each Vector is assigned as
- # a variable of the Dataset named as its key
- # [fields] Array of names for vectors. Is only used for set the
- # order of variables. If empty, vectors keys on alfabethic order as
- # used as fields
- # [labels] Hash to set names for fields.
- #
- #
- # Dataset.new()
- # Dataset.new(%w{v1 v2 v3})
- # Dataset.new({'v1'=>%w{1 2 3}.to_vector, 'v2'=>%w{4 5 6}.to_vector})
- # Dataset.new({'v2'=>v2,'v1'=>v1},['v1','v2'])
- #
- # The fast way to create a dataset uses Hash#to_dataset, with
- # fields and labels as arguments
- # ds = {'v1'=>[1,2,3].to_vector}.to_dataset
- #
- def initialize(vectors={}, fields=[], labels={})
- if vectors.instance_of? Array
- @fields=vectors.dup
- @vectors=vectors.inject({}){|a,x| a[x]=Statsample::Vector.new(); a}
- else
- @vectors=vectors
- @fields=fields
- check_order
- check_length
- end
- @i=nil
- @labels=labels
+ # Returns a duplicate of the Dataset
+ # If fields given, only include those vectors
+ def dup(*fields_to_include)
+ if fields_to_include.size==1 and fields_to_include[0].is_a? Array
+ fields_to_include=fields_to_include[0]
+ end
+ fields_to_include=@fields if fields_to_include.size==0
+ vectors={}
+ fields=[]
+ new_labels={}
+ fields_to_include.each{|f|
+ raise "Vector #{f} doesn't exists" unless @vectors.has_key? f
+ vectors[f]=@vectors[f].dup
+ new_labels[f]=@labels[f]
+ fields.push(f)
+ }
+ Dataset.new(vectors,fields,new_labels)
+ end
+ # Creates a copy of the given dataset, without data on vectors
+ def dup_empty
+ vectors=@vectors.inject({}) {|a,v|
+ a[v[0]]=v[1].dup_empty
+ a
+ }
+ Dataset.new(vectors,@fields.dup,@labels.dup)
+ end
+ # Merge vectors from two datasets
+ # In case of name collision, the vector names are changed to
+ # x_1, x_2 ....
+ def merge(other_ds)
+ raise "Cases should be equal (this:#{@cases}; other:#{other_ds.cases}" unless @cases==other_ds.cases
+ types = @fields.collect{|f| @vectors[f].type} + other_ds.fields.collect{|f| other_ds[f].type}
+ new_fields = (@fields+other_ds.fields).recode_repeated
+ ds_new=Statsample::Dataset.new(new_fields)
+ new_fields.each_index{|i|
+ field=new_fields[i]
+ ds_new[field].type=types[i]
+ }
+ @cases.times {|i|
+ row=case_as_array(i)+other_ds.case_as_array(i)
+ ds_new.add_case_array(row)
+ }
+ ds_new.update_valid_data
+ ds_new
+ end
+ # Returns a dataset with standarized data
+ def standarize
+ ds=dup()
+ ds.fields.each {|f|
+ ds[f]=ds[f].vector_standarized
+ }
+ ds
+ end
+ # Generate a matrix, based on fields of dataset
+ def collect_matrix
+ rows=@fields.collect{|row|
+ @fields.collect{|col|
+ yield row,col
+ }
+ }
+ Matrix.rows(rows)
+ end
+ # Two datasets are equal if they have the same vectors and fields
+ def ==(d2)
+ @vectors==d2.vectors and @fields==d2.fields
+ end
+ def col(c)
+ @vectors[c]
+ end
+ alias_method :vector, :col
+ def add_vector(name,vector)
+ raise ArgumentError, "Vector have different size" if vector.size!=@cases
+ @vectors[name]=vector
+ check_order
+ end
+ def has_vector? (v)
+ return @vectors.has_key?(v)
+ end
+ # Creates a dataset with random data, of size n
+ # If n not given, uses original number of cases
+ def bootstrap(n=nil)
+ n||=@cases
+ ds_boot=dup_empty
+ for i in 1..n
+ ds_boot.add_case_array(case_as_array(rand(n)))
+ end
+ ds_boot.update_valid_data
+ ds_boot
+ end
+ # Fast version of add case
+ # Can only add one case and no error check is performed
+ # You SHOULD use update_valid_data at the end of insertion cycle
+ def add_case_array(v)
+ v.each_index {|i| d=@vectors[@fields[i]].data; d.push(v[i])}
+ end
+ def add_case(v,uvd=true)
+ case v
+ when Array
+ if (v[0].is_a? Array)
+ v.each{|subv| add_case(subv,false)}
+ else
+ raise ArgumentError, "Input array size (#{v.size}) should be equal to fields number (#{@fields.size})" if @fields.size!=v.size
+ v.each_index {|i| @vectors[@fields[i]].add(v[i],false)}
end
- def to_gsl_matrix
- matrix=GSL::Matrix.alloc(cases,@vectors.size)
- each_array do |row|
- row.each_index{|y| matrix.set(@i,y,row[y]) }
- end
- matrix
- end
- def vector_label(v_id)
- raise "Vector #{v} doesn't exists" unless @fields.include? v_id
- @labels[v_id].nil? ? v_id : @labels[v_id]
- end
- # Creates a copy of the given dataset, deleting all the cases with
- # missing data on one of the vectors
- def dup_only_valid
- if @vectors.find{|field,vector| vector.has_missing_data?}
- ds=dup_empty
- each_array { |c|
- ds.add_case_array(c) unless @fields.find{|f| @vectors[f].data_with_nils[@i].nil? }
- }
- ds.update_valid_data
- else
- ds=dup()
- end
- ds
- end
- # Returns an array with the fields from first argumen to last argument
- def from_to(from,to)
- raise ArgumentError, "Field #{from} should be on dataset" if !@fields.include? from
- raise ArgumentError, "Field #{to} should be on dataset" if !@fields.include? to
- @fields.slice(@fields.index(from)..@fields.index(to))
- end
- # Returns a duplicate of the Database
- # If fields given, only include those vectors
- def dup(*fields_to_include)
- if fields_to_include.size==1 and fields_to_include[0].is_a? Array
- fields_to_include=fields_to_include[0]
- end
- fields_to_include=@fields if fields_to_include.size==0
- vectors={}
- fields=[]
- labels={}
- fields_to_include.each{|f|
- raise "Vector #{f} doesn't exists" unless @vectors.has_key? f
- vectors[f]=@vectors[f].dup
- labels[f]=@labels[f]
- fields.push(f)
- }
- Dataset.new(vectors,fields,labels)
- end
- # Creates a copy of the given dataset, without data on vectors
- def dup_empty
- vectors=@vectors.inject({}) {|a,v|
- a[v[0]]=v[1].dup_empty
- a
- }
- Dataset.new(vectors,@fields.dup,@labels.dup)
- end
- # Returns a dataset with standarized data
- def standarize
- ds=dup()
- ds.fields.each {|f|
- ds[f]=ds[f].vector_standarized
- }
- ds
- end
- # Generate a matrix, based on fields of dataset
- def collect_matrix
- rows=@fields.collect{|row|
- @fields.collect{|col|
- yield row,col
- }
- }
- Matrix.rows(rows)
- end
- # We have the same datasets if the labels and vectors are the same
- def ==(d2)
- @vectors==d2.vectors and @fields==d2.fields
- end
- def col(c)
- @vectors[c]
- end
- alias_method :vector, :col
- def add_vector(name,vector)
- raise ArgumentError, "Vector have different size" if vector.size!=@cases
- @vectors[name]=vector
- check_order
- end
- def has_vector? (v)
- return @vectors.has_key?(v)
- end
- # Creates a dataset with the random data, of a n size
- # If n not given, uses original number of cases
- def bootstrap(n=nil)
- n||=@cases
- ds_boot=dup_empty
- for i in 1..n
- ds_boot.add_case_array(case_as_array(rand(n)))
- end
- ds_boot.update_valid_data
- ds_boot
- end
- # Fast version of add case
- # Can only add one case and no error check if performed
- # You SHOULD use update_valid_data at the end of insertion cycle
- def add_case_array(v)
- v.each_index {|i| d=@vectors[@fields[i]].data; d.push(v[i])}
- end
- def add_case(v,uvd=true)
- case v
- when Array
- if (v[0].is_a? Array)
- v.each{|subv| add_case(subv,false)}
- else
- raise ArgumentError, "Input array size (#{v.size}) should be equal to fields number (#{@fields.size})" if @fields.size!=v.size
- v.each_index {|i| @vectors[@fields[i]].add(v[i],false)}
- end
- when Hash
- raise ArgumentError, "Hash keys should be equal to fields" if @fields.sort!=v.keys.sort
- @fields.each{|f| @vectors[f].add(v[f],false)}
- else
- raise TypeError, 'Value must be a Array or a Hash'
- end
- if uvd
- update_valid_data
- end
- end
- def update_valid_data
- @fields.each{|f| @vectors[f].set_valid_data}
- check_length
- end
- def delete_vector(name)
- @fields.delete(name)
- @vectors.delete(name)
- end
- def add_vectors_by_split_recode(name,join='-',sep=Statsample::SPLIT_TOKEN)
- split=@vectors[name].split_by_separator(sep)
- i=1
- split.each{|k,v|
- new_field=name+join+i.to_s
- @labels[new_field]=name+":"+k
- add_vector(new_field,v)
- i+=1
- }
- end
- def add_vectors_by_split(name,join='-',sep=Statsample::SPLIT_TOKEN)
- split=@vectors[name].split_by_separator(sep)
- split.each{|k,v|
- add_vector(name+join+k,v)
- }
- end
+ when Hash
+ raise ArgumentError, "Hash keys should be equal to fields #{(v.keys - @fields).join(",")}" if @fields.sort!=v.keys.sort
+ @fields.each{|f| @vectors[f].add(v[f],false)}
+ else
+ raise TypeError, 'Value must be a Array or a Hash'
+ end
+ if uvd
+ update_valid_data
+ end
+ end
+ def update_valid_data
+ @fields.each{|f| @vectors[f].set_valid_data}
+ check_length
+ end
+ def delete_vector(name)
+ @fields.delete(name)
+ @vectors.delete(name)
+ end
+ def add_vectors_by_split_recode(name,join='-',sep=Statsample::SPLIT_TOKEN)
+ split=@vectors[name].split_by_separator(sep)
+ i=1
+ split.each{|k,v|
+ new_field=name+join+i.to_s
+ @labels[new_field]=name+":"+k
+ add_vector(new_field,v)
+ i+=1
+ }
+ end
+ def add_vectors_by_split(name,join='-',sep=Statsample::SPLIT_TOKEN)
+ split=@vectors[name].split_by_separator(sep)
+ split.each{|k,v|
+ add_vector(name+join+k,v)
+ }
+ end
def vector_by_calculation(type=:scale)
a=[]
each {|row|
a.push(yield(row))
}
@@ -236,360 +298,350 @@
if(fields.find{|f| !@vectors[f].data_with_nils[i]})
nil
else
fields.inject(0) {|ac,v| ac + row[v].to_f}
end
- end
+ end
end
- # Returns a vector with the numbers of missing values for a case
-
- def vector_missing_values(fields=nil)
- fields||=@fields
- raise "Fields #{(fields-@fields).join(", ")} doesn't exists on dataset" if (fields-@fields).size>0
-
- collect_with_index do |i,row|
- fields.inject(0){|a,v|
- a+ ((@vectors[v].data_with_nils[i].nil?) ? 1: 0)
- }
- end
- end
- def vector_count_characters(fields=nil)
- fields||=@fields
- raise "Fields #{(fields-@fields).join(", ")} doesn't exists on dataset" if (fields-@fields).size>0
- collect_with_index do |i,row|
- fields.inject(0){|a,v|
-
- a+((@vectors[v].data_with_nils[i].nil?) ? 0: row[v].to_s.size)
- }
- end
- end
- # Returns a vector with the mean for a set of fields
- # if fields parameter is empty, return the mean for all fields
- # if max invalid parameter > 0, returns the mean for all tuples
- # with 0 to max_invalid invalid fields
- def vector_mean(fields=nil,max_invalid=0)
- a=[]
- fields||=@fields
- size=fields.size
- raise "Fields #{(fields-@fields).join(", ")} doesn't exists on dataset" if (fields-@fields).size>0
- each_with_index do |i, row|
- # numero de invalidos
- sum=0
- invalids=0
- fields.each{|f|
- if !@vectors[f].data_with_nils[i].nil?
- sum+=row[f].to_f
- else
- invalids+=1
- end
- }
- if(invalids>max_invalid)
- a.push(nil)
- else
- a.push(sum.quo(size-invalids))
- end
- end
- a.to_vector(:scale)
- end
- def check_length
- size=nil
- @vectors.each do |k,v|
- raise Exception, "Data #{v.class} is not a vector on key #{k}" if !v.is_a? Statsample::Vector
- if size.nil?
- size=v.size
- else
- if v.size!=size
- p v.to_a.size
- raise Exception, "Vector #{k} have size #{v.size} and dataset have size #{size}"
- end
- end
- end
- @cases=size
- end
- def each_vector
- @fields.each{|k| yield k,@vectors[k]}
- end
- if Statsample::STATSAMPLE__.respond_to?(:case_as_hash)
- def case_as_hash(c) # :nodoc:
- Statsample::STATSAMPLE__.case_as_hash(self,c)
- end
+ def check_fields(fields)
+ fields||=@fields
+ raise "Fields #{(fields-@fields).join(", ")} doesn't exists on dataset" if (fields-@fields).size>0
+ fields
+ end
+ # Returns a vector with the number of missing values for a case
+
+ def vector_missing_values(fields=nil)
+ fields=check_fields(fields)
+ collect_with_index do |i,row|
+ fields.inject(0) {|a,v|
+ a+ ((@vectors[v].data_with_nils[i].nil?) ? 1: 0)
+ }
+ end
+ end
+ def vector_count_characters(fields=nil)
+ fields=check_fields(fields)
+ collect_with_index do |i,row|
+ fields.inject(0){|a,v|
+
+ a+((@vectors[v].data_with_nils[i].nil?) ? 0: row[v].to_s.size)
+ }
+ end
+ end
+ # Returns a vector with the mean for a set of fields
+ # if fields parameter is empty, return the mean for all fields
+ # if max invalid parameter > 0, returns the mean for all tuples
+ # with 0 to max_invalid invalid fields
+ def vector_mean(fields=nil,max_invalid=0)
+ a=[]
+ fields=check_fields(fields)
+ size=fields.size
+ each_with_index do |i, row|
+ # numero de invalidos
+ sum=0
+ invalids=0
+ fields.each{|f|
+ if !@vectors[f].data_with_nils[i].nil?
+ sum+=row[f].to_f
+ else
+ invalids+=1
+ end
+ }
+ if(invalids>max_invalid)
+ a.push(nil)
else
- def case_as_hash(c)
- _case_as_hash(c)
- end
+ a.push(sum.quo(size-invalids))
end
-
- if Statsample::STATSAMPLE__.respond_to?(:case_as_array)
- def case_as_array(c) # :nodoc:
- Statsample::STATSAMPLE__.case_as_array(self,c)
- end
+ end
+ a.to_vector(:scale)
+ end
+ def check_length
+ size=nil
+ @vectors.each do |k,v|
+ raise Exception, "Data #{v.class} is not a vector on key #{k}" if !v.is_a? Statsample::Vector
+ if size.nil?
+ size=v.size
else
- def case_as_array(c)
- _case_as_array(c)
- end
+ if v.size!=size
+ p v.to_a.size
+ raise Exception, "Vector #{k} have size #{v.size} and dataset have size #{size}"
+ end
end
- def _case_as_hash(c) # :nodoc:
- @fields.inject({}) {|a,x| a[x]=@vectors[x][c];a }
- end
- def _case_as_array(c) # :nodoc:
- @fields.collect {|x| @vectors[x][c]}
- end
- # Returns each case as a hash
- def each
- begin
- @i=0
- @cases.times {|i|
- @i=i
- row=case_as_hash(i)
- yield row
- }
- @i=nil
- rescue =>e
- raise DatasetException.new(self,e)
- end
- end
- # Returns each case as index and hash
- def each_with_index
- begin
- @i=0
- @cases.times{|i|
- @i=i
- row=case_as_hash(i)
- yield i,row
- }
- @i=nil
- rescue =>e
- raise DatasetException.new(self,e)
- end
- end
- # Returns each case as an array, coding missing values as nils
- def each_array_with_nils
- m=fields.size
- @cases.times {|i|
- @i=i
- row=Array.new(m)
- fields.each_index{|j|
- f=fields[j]
- row[j]=@vectors[f].data_with_nils[i]
- }
- yield row
- }
- @i=nil
- end
- # Returns each case as an array
- def each_array
- @cases.times {|i|
- @i=i
- row=case_as_array(i)
- yield row
- }
- @i=nil
- end
- def fields=(f)
- @fields=f
- check_order
- end
- def check_order
- if(@vectors.keys.sort!=@fields.sort)
- @fields=@fields&@vectors.keys
- @fields+=@vectors.keys.sort-@fields
- end
- end
- # Returns the vector named i
- def[](i)
- if i.is_a? String
- raise Exception,"Vector '#{i}' doesn't exists on dataset" unless @vectors.has_key?(i)
- @vectors[i]
- elsif i.is_a? Range
- fields=from_to(i.begin,i.end)
- vectors=fields.inject({}) {|a,v| a[v]=@vectors[v];a}
- ds=Dataset.new(vectors,fields)
- else
- raise ArgumentError, "You need a String or a Range"
- end
- end
- def collect(type=:scale)
- data=[]
- each {|row|
- data.push(yield(row))
- }
- Statsample::Vector.new(data,type)
- end
- def collect_with_index(type=:scale)
- data=[]
- each_with_index {|i,row|
- data.push(yield(i,row))
- }
- Statsample::Vector.new(data,type)
- end
- # Recode a vector based on a block
- def recode!(vector_name)
- 0.upto(@cases-1) {|i|
- @vectors[vector_name].data[i]=yield case_as_hash(i)
- }
- @vectors[vector_name].set_valid_data
- end
- def crosstab(v1,v2)
- Statsample::Crosstab.new(@vectors[v1],@vectors[v2])
- end
- def[]=(i,v)
- if v.instance_of? Statsample::Vector
- @vectors[i]=v
- check_order
- else
- raise ArgumentError,"Should pass a Statsample::Vector"
- end
- end
- def to_matrix
- rows=[]
- self.each_array{|c|
- rows.push(c)
- }
- Matrix.rows(rows)
- end
+ end
+ @cases=size
+ end
+ def each_vector
+ @fields.each{|k| yield k,@vectors[k]}
+ end
+ if Statsample::STATSAMPLE__.respond_to?(:case_as_hash)
+ def case_as_hash(c) # :nodoc:
+ Statsample::STATSAMPLE__.case_as_hash(self,c)
+ end
+ else
+ def case_as_hash(c)
+ _case_as_hash(c)
+ end
+ end
+
+ if Statsample::STATSAMPLE__.respond_to?(:case_as_array)
+ def case_as_array(c) # :nodoc:
+ Statsample::STATSAMPLE__.case_as_array(self,c)
+ end
+ else
+ def case_as_array(c)
+ _case_as_array(c)
+ end
+ end
+ def _case_as_hash(c) # :nodoc:
+ @fields.inject({}) {|a,x| a[x]=@vectors[x][c];a }
+ end
+ def _case_as_array(c) # :nodoc:
+ @fields.collect {|x| @vectors[x][c]}
+ end
+ # Returns each case as a hash
+ def each
+ begin
+ @i=0
+ @cases.times {|i|
+ @i=i
+ row=case_as_hash(i)
+ yield row
+ }
+ @i=nil
+ rescue =>e
+ raise DatasetException.new(self,e)
+ end
+ end
+ # Returns each case as index and hash
+ def each_with_index
+ begin
+ @i=0
+ @cases.times{|i|
+ @i=i
+ row=case_as_hash(i)
+ yield i,row
+ }
+ @i=nil
+ rescue =>e
+ raise DatasetException.new(self,e)
+ end
+ end
+ # Returns each case as an array, coding missing values as nils
+ def each_array_with_nils
+ m=fields.size
+ @cases.times {|i|
+ @i=i
+ row=Array.new(m)
+ fields.each_index{|j|
+ f=fields[j]
+ row[j]=@vectors[f].data_with_nils[i]
+ }
+ yield row
+ }
+ @i=nil
+ end
+ # Returns each case as an array
+ def each_array
+ @cases.times {|i|
+ @i=i
+ row=case_as_array(i)
+ yield row
+ }
+ @i=nil
+ end
+ def fields=(f)
+ @fields=f
+ check_order
+ end
+ def check_order
+ if(@vectors.keys.sort!=@fields.sort)
+ @fields=@fields&@vectors.keys
+ @fields+=@vectors.keys.sort-@fields
+ end
+ end
+ # Returns the vector named i
+ def[](i)
+ if i.is_a? String
+ raise Exception,"Vector '#{i}' doesn't exists on dataset" unless @vectors.has_key?(i)
+ @vectors[i]
+ elsif i.is_a? Range
+ fields=from_to(i.begin,i.end)
+ vectors=fields.inject({}) {|a,v| a[v]=@vectors[v];a}
+ ds=Dataset.new(vectors,fields)
+ else
+ raise ArgumentError, "You need a String or a Range"
+ end
+ end
+ def collect(type=:scale)
+ data=[]
+ each {|row|
+ data.push(yield(row))
+ }
+ Statsample::Vector.new(data,type)
+ end
+ def collect_with_index(type=:scale)
+ data=[]
+ each_with_index {|i,row|
+ data.push(yield(i,row))
+ }
+ Statsample::Vector.new(data,type)
+ end
+ # Recode a vector based on a block
+ def recode!(vector_name)
+ 0.upto(@cases-1) {|i|
+ @vectors[vector_name].data[i]=yield case_as_hash(i)
+ }
+ @vectors[vector_name].set_valid_data
+ end
+ def crosstab(v1,v2)
+ Statsample::Crosstab.new(@vectors[v1],@vectors[v2])
+ end
+ def[]=(i,v)
+ if v.instance_of? Statsample::Vector
+ @vectors[i]=v
+ check_order
+ else
+ raise ArgumentError,"Should pass a Statsample::Vector"
+ end
+ end
+ def to_matrix
+ rows=[]
+ self.each_array{|c|
+ rows.push(c)
+ }
+ Matrix.rows(rows)
+ end
def to_multiset_by_split(*fields)
require 'statsample/multiset'
if fields.size==1
to_multiset_by_split_one_field(fields[0])
else
to_multiset_by_split_multiple_fields(*fields)
end
end
- # create a new dataset with all the data which the block returns true
- def filter
- ds=self.dup_empty
- each {|c|
- ds.add_case(c,false) if yield c
- }
- ds.update_valid_data
- ds
- end
+ # create a new dataset with all the data which the block returns true
+ def filter
+ ds=self.dup_empty
+ each {|c|
+ ds.add_case(c,false) if yield c
+ }
+ ds.update_valid_data
+ ds
+ end
# creates a new vector with the data of a given field which the block returns true
def filter_field(field)
a=[]
each {|c|
a.push(c[field]) if yield c
}
a.to_vector(@vectors[field].type)
end
- def to_multiset_by_split_one_field(field)
- raise ArgumentError,"Should use a correct field name" if !@fields.include? field
- factors=@vectors[field].factors
- ms=Multiset.new_empty_vectors(@fields,factors)
- each {|c|
- ms[c[field]].add_case(c,false)
- }
- #puts "Ingreso a los dataset"
- ms.datasets.each {|k,ds|
- ds.update_valid_data
- ds.vectors.each{|k1,v1|
- # puts "Vector #{k1}:"+v1.to_s
- v1.type=@vectors[k1].type
- }
- }
- ms
+ def to_multiset_by_split_one_field(field)
+ raise ArgumentError,"Should use a correct field name" if !@fields.include? field
+ factors=@vectors[field].factors
+ ms=Multiset.new_empty_vectors(@fields,factors)
+ each {|c|
+ ms[c[field]].add_case(c,false)
+ }
+ #puts "Ingreso a los dataset"
+ ms.datasets.each {|k,ds|
+ ds.update_valid_data
+ ds.vectors.each{|k1,v1|
+ # puts "Vector #{k1}:"+v1.to_s
+ v1.type=@vectors[k1].type
+ }
+ }
+ ms
+ end
+ def to_multiset_by_split_multiple_fields(*fields)
+ factors_total=nil
+ fields.each do |f|
+ if factors_total.nil?
+ factors_total=@vectors[f].factors.collect{|c|
+ [c]
+ }
+ else
+ suma=[]
+ factors=@vectors[f].factors
+ factors_total.each{|f1| factors.each{|f2| suma.push(f1+[f2]) } }
+ factors_total=suma
end
- def to_multiset_by_split_multiple_fields(*fields)
- factors_total=nil
- fields.each{|f|
- if factors_total.nil?
- factors_total=@vectors[f].factors.collect{|c|
- [c]
- }
- else
- suma=[]
- factors=@vectors[f].factors
- factors_total.each{|f1|
- factors.each{|f2|
- suma.push(f1+[f2])
- }
- }
- factors_total=suma
- end
- }
- ms=Multiset.new_empty_vectors(@fields,factors_total)
- p1=eval "Proc.new {|c| ms[["+fields.collect{|f| "c['#{f}']"}.join(",")+"]].add_case(c,false) }"
- each{|c|
- p1.call(c)
- }
- ms.datasets.each {|k,ds|
- ds.update_valid_data
- ds.vectors.each{|k1,v1|
- # puts "Vector #{k1}:"+v1.to_s
- v1.type=@vectors[k1].type
- }
- }
- ms
-
- end
- # Returns a vector, based on a string with a calculation based
- # on vector
- # The calculation will be eval'ed, so you can put any variable
- # or expression valid on ruby
- # For example:
- # a=[1,2].to_vector(scale)
- # b=[3,4].to_vector(scale)
- # ds={'a'=>a,'b'=>b}.to_dataset
- # ds.calculate("a+b")
- # => Vector [4,6]
- def compute(text)
- @fields.each{|f|
- if @vectors[f].type=:scale
- text.gsub!(f,"row['#{f}'].to_f")
- else
- text.gsub!(f,"row['#{f}']")
-
- end
-
- }
- collect_with_index {|i,row|
- invalid=false
- @fields.each{|f|
- if @vectors[f].data_with_nils[i].nil?
- invalid=true
- end
- }
- if invalid
- nil
- else
- eval(text)
- end
- }
+ end
+ ms=Multiset.new_empty_vectors(@fields,factors_total)
+ p1=eval "Proc.new {|c| ms[["+fields.collect{|f| "c['#{f}']"}.join(",")+"]].add_case(c,false) }"
+ each{|c| p1.call(c)}
+ ms.datasets.each do |k,ds|
+ ds.update_valid_data
+ ds.vectors.each{|k1,v1| v1.type=@vectors[k1].type }
+ end
+ ms
+
+ end
+ # Returns a vector, based on a string with a calculation based
+ # on vector
+ # The calculation will be eval'ed, so you can put any variable
+ # or expression valid on ruby
+ # For example:
+ # a=[1,2].to_vector(scale)
+ # b=[3,4].to_vector(scale)
+ # ds={'a'=>a,'b'=>b}.to_dataset
+ # ds.compute("a+b")
+ # => Vector [4,6]
+ def compute(text)
+ @fields.each{|f|
+ if @vectors[f].type=:scale
+ text.gsub!(f,"row['#{f}'].to_f")
+ else
+ text.gsub!(f,"row['#{f}']")
end
- # Test each row with one or more tests
- # each test is a Proc with the form
- # Proc.new {|row| row['age']>0}
- # The function returns an array with all errors
- def verify(*tests)
- if(tests[0].is_a? String)
- id=tests[0]
- tests.shift
- else
- id=@fields[0]
- end
- vr=[]
- i=0
- each do |row|
- i+=1
- tests.each{|test|
- if ! test[2].call(row)
- values=""
- if test[1].size>0
- values=" ("+test[1].collect{|k| "#{k}=#{row[k]}"}.join(", ")+")"
- end
- vr.push("#{i} [#{row[id]}]: #{test[0]}#{values}")
- end
- }
- end
- vr
+ }
+ collect_with_index {|i,row|
+ invalid=false
+ @fields.each{|f|
+ if @vectors[f].data_with_nils[i].nil?
+ invalid=true
+ end
+ }
+ if invalid
+ nil
+ else
+ eval(text)
end
- def to_s
- "#<"+self.class.to_s+":"+self.object_id.to_s+" @fields=["+@fields.join(",")+"] labels="+@labels.inspect+" cases="+@vectors[@fields[0]].size.to_s
- end
- def inspect
- self.to_s
- end
+ }
+ end
+ # Test each row with one or more tests
+ # each test is a Proc with the form
+ # Proc.new {|row| row['age']>0}
+ # The function returns an array with all errors
+ def verify(*tests)
+ if(tests[0].is_a? String)
+ id=tests[0]
+ tests.shift
+ else
+ id=@fields[0]
+ end
+ vr=[]
+ i=0
+ each do |row|
+ i+=1
+ tests.each{|test|
+ if ! test[2].call(row)
+ values=""
+ if test[1].size>0
+ values=" ("+test[1].collect{|k| "#{k}=#{row[k]}"}.join(", ")+")"
+ end
+ vr.push("#{i} [#{row[id]}]: #{test[0]}#{values}")
+ end
+ }
+ end
+ vr
+ end
+ def to_s
+ "#<"+self.class.to_s+":"+self.object_id.to_s+" @fields=["+@fields.join(",")+"] labels="+@labels.inspect+" cases="+@vectors[@fields[0]].size.to_s
+ end
+ def inspect
+ self.to_s
+ end
def summary
out=""
out << "Summary for dataset\n"
@vectors.each{|k,v|
out << "###############\n"
@@ -598,12 +650,12 @@
out << "###############\n"
}
out
end
- def as_r
- require 'rsruby/dataframe'
- r=RSRuby.instance
-
- end
+ def as_r
+ require 'rsruby/dataframe'
+ r=RSRuby.instance
+
end
+ end
end