class Array
  # Builds a Statsample::Vector from this array.
  # Any extra arguments are forwarded to Statsample::Vector.new
  # (type, missing values, labels).
  def to_vector(*args)
    Statsample::Vector.new(self, *args)
  end
end

module Statsample
  class << self
    # Create a matrix using vectors as columns.
    # Use:
    #
    #   matrix=Statsample.vector_cols_matrix(v1,v2)
    #
    # Raises ArgumentError if no vector is given, if any argument is not
    # exactly a Statsample::Vector, or if the vectors differ in size.
    def vector_cols_matrix(*vs)
      # Without this guard an empty call failed with NoMethodError on nil.
      raise ArgumentError, "At least one Vector is required" if vs.empty?
      size = vs[0].size
      vs.each do |v|
        raise ArgumentError, "Arguments should be Vector" unless v.instance_of?(Statsample::Vector)
        raise ArgumentError, "Vectors size should be the same" if v.size != size
      end
      # Row i of the matrix holds the i-th value of every vector.
      Matrix.rows((0...size).collect { |i| vs.collect { |v| v[i] } })
    end
  end

  # Returns a duplicate of the input vectors, without missing data
  # for any of the vectors.
  #
  #   a=[1,2,3,6,7,nil,3,5].to_vector(:scale)
  #   b=[nil,nil,5,6,4,5,10,2].to_vector(:scale)
  #   c=[2,4,6,7,4,5,6,7].to_vector(:scale)
  #   a2,b2,c2=Statsample.only_valid(a,b,c)
  #
  # The vectors are stored in a Statsample::Dataset under the keys
  # "v1", "v2", ... and the filtering itself is done by
  # Dataset#dup_only_valid (defined outside this file).
  def self.only_valid(*vs)
    h = {}
    vs.each_with_index { |v, idx| h["v#{idx + 1}"] = v }
    ds = Statsample::Dataset.new(h).dup_only_valid
    ds.vectors.values
  end

  # A data vector with a level of measurement, a list of values treated
  # as missing, and optional value labels. Statistical methods are
  # delegated to a helper object chosen by the level of measurement
  # (see #type=).
  class Vector < DelegateClass(Array)
    include Enumerable
    attr_reader :type, :data, :valid_data, :missing_values, :missing_data, :data_with_nils
    attr_accessor :labels

    # Creates a new Vector.
    # data           = Array of data
    # t              = level of measurement. Could be:
    #   [:nominal] : Nominal level of measurement
    #   [:ordinal] : Ordinal level of measurement
    #   [:scale]   : Scale level of measurement
    # missing_values = Array of values to be treated as missing
    # labels         = Hash mapping values to labels
    #
    # Raises RuntimeError if data is not an Array or t is not a known type.
    def initialize(data=[], t=:nominal, missing_values=[], labels={})
      raise "Data should be an array" unless data.is_a? Array
      @data = data
      @missing_values = missing_values
      @labels = labels
      @type = t
      @valid_data = []
      @data_with_nils = []
      @missing_data = []
      @has_missing_data = nil
      _set_valid_data
      # type= builds @delegate from @valid_data, so it must run after
      # the valid data has been computed.
      self.type = t
      super(@delegate)
    end

    # Returns a new Vector built from copies of the data, missing values
    # and labels of this one, with the same type.
    def dup
      Vector.new(@data.dup, @type, @missing_values.dup, @labels.dup)
    end

    # Returns an empty duplicate of the vector. Maintains the type,
    # missing values and labels.
    def dup_empty
      Vector.new([], @type, @missing_values.dup, @labels.dup)
    end

    # Return a vector using the standardized values for data,
    # with sd with denominator N.
    def vector_standarized_pop
      vector_standarized(true)
    end

    # Return a vector using the standardized values for data,
    # with sd with denominator n-1 (or N when use_population is true).
    # Positions holding missing data stay nil.
    # Raises RuntimeError unless the vector is of type :scale.
    def vector_standarized(use_population=false)
      raise "Should be a scale" unless @type == :scale
      mean = @delegate.mean
      sd = use_population ? @delegate.sdp : @delegate.sds
      standardized = @data_with_nils.collect do |x|
        x.nil? ? nil : (x.to_f - mean).quo(sd)
      end
      standardized.to_vector(:scale)
    end
    alias_method :standarized, :vector_standarized

    # Returns a :scale vector with every valid value x replaced by
    # log(x) when lambda is 0, or (x**lambda - 1) / lambda otherwise.
    # Positions holding missing data stay nil.
    # Raises RuntimeError unless the vector is of type :scale.
    def box_cox_transformation(lambda)
      raise "Should be a scale" unless @type == :scale
      transformed = @data_with_nils.collect do |x|
        if x.nil?
          nil
        elsif lambda == 0
          Math.log(x)
        else
          (x**lambda - 1).quo(lambda)
        end
      end
      transformed.to_vector(:scale)
    end

    # Vector equality.
    # Two vectors are the same if their data, missing values, type and
    # labels are equal.
    # Raises TypeError if v2 is not exactly a Statsample::Vector.
    def ==(v2)
      raise TypeError, "Argument should be a Vector" unless v2.instance_of?(Statsample::Vector)
      # The labels used to be *assigned* (@labels=v2.labels) instead of
      # compared, which overwrote this vector's labels as a side effect.
      @data == v2.data && @missing_values == v2.missing_values &&
        @type == v2.type && @labels == v2.labels
    end

    # Marshal hook: serializes data, missing values, labels and type.
    def _dump(i)
      Marshal.dump({'data'=>@data, 'missing_values'=>@missing_values, 'labels'=>@labels, 'type'=>@type})
    end

    # Marshal hook: rebuilds a Vector from the string produced by #_dump.
    def self._load(data)
      h = Marshal.load(data)
      Vector.new(h['data'], h['type'], h['missing_values'], h['labels'])
    end

    # Returns a new vector of the same type whose values are the result
    # of yielding every value of this one to the block.
    def recode
      @data.collect { |x| yield x }.to_vector(@type)
    end

    # Replaces every value, in place, by the result of yielding it to
    # the block, then recomputes the valid data.
    def recode!
      @data.collect! { |x| yield x }
      set_valid_data
    end

    # Yields every value of the data, missing ones included.
    def each
      @data.each { |x| yield(x) }
    end

    # Yields every index of the data.
    def each_index
      (0...@data.size).each { |i| yield(i) }
    end

    # Add a value at the end of the vector.
    # If the second argument is set to false, you should update valid
    # data using Vector#set_valid_data at the end of your insertion cycle.
    def add(v, update_valid=true)
      @data.push(v)
      set_valid_data if update_valid
    end

    # Recomputes valid data, missing data and data-with-nils from
    # scratch; for :scale vectors also asks the delegate to refresh its
    # GSL data through set_gsl.
    def set_valid_data
      @valid_data.clear
      @missing_data.clear
      @data_with_nils.clear
      _set_valid_data
      @delegate.set_gsl if @type == :scale
    end

    # Splits @data into @valid_data / @missing_data and fills
    # @data_with_nils (missing values replaced by nil).
    # When Statsample::OPTIMIZED is set the work is handed to
    # Statsample::_set_valid_data, which is defined outside this file
    # (presumably a native extension -- verify).
    def _set_valid_data
      if Statsample::OPTIMIZED
        Statsample::_set_valid_data(self)
      else
        @data.each do |n|
          if is_valid? n
            @valid_data.push(n)
            @data_with_nils.push(n)
          else
            @data_with_nils.push(nil)
            @missing_data.push(n)
          end
        end
        @has_missing_data = @missing_data.size > 0
      end
    end

    # Returns true if data has one or more missing values.
    def has_missing_data?
      @has_missing_data
    end

    # Returns the label of x as a String, or x.to_s if x has no label.
    def labeling(x)
      @labels.has_key?(x) ? @labels[x].to_s : x.to_s
    end

    # Returns a Vector with the data with labels replaced by the label.
    def vector_labeled
      d = @data.collect { |x| @labels.has_key?(x) ? @labels[x] : x }
      Vector.new(d, @type)
    end

    # Number of values, missing ones included.
    def size
      @data.size
    end

    # Value at position i.
    def [](i)
      @data[i]
    end

    # Sets the value at position i. Valid data is NOT recomputed.
    def []=(i, v)
      @data[i] = v
    end

    # Return true if a value is valid (not nil and not included on
    # missing values).
    def is_valid?(x)
      !(x.nil? || @missing_values.include?(x))
    end

    # Set missing_values and recompute the valid data.
    def missing_values=(vals)
      @missing_values = vals
      set_valid_data
    end

    # Set level of measurement and build the matching delegate over the
    # valid data. Raises RuntimeError for an unknown type.
    def type=(t)
      case t
      when :nominal
        @delegate = Nominal.new(@valid_data)
      when :ordinal
        @delegate = Ordinal.new(@valid_data)
      when :scale
        @delegate = Scale.new(@valid_data)
      else
        raise "Type doesn't exists"
      end
      __setobj__(@delegate)
      @type = t
    end

    # Number of values, missing ones included.
    def n; @data.size; end

    # Returns a copy of the data as an Array.
    def to_a
      @data.dup
    end

    # Redundant, but necessary:
    # Spreadsheet creates Array#sum, so calling sum
    # doesn't call the delegate's method.
    def sum
      @delegate.sum
    end
    alias_method :to_ary, :to_a

    # Vector sum.
    # - If v is a scalar, add this value to all elements
    # - If v is an Array or a Vector, it should be of the same size as
    #   this vector; every item of this vector will be added to the
    #   value of the item at the same position on the other vector
    def +(v)
      _vector_ari("+", v)
    end

    # Vector subtraction.
    # - If v is a scalar, subtract this value from all elements
    # - If v is an Array or a Vector, it should be of the same size as
    #   this vector; the item at the same position on the other vector
    #   is subtracted from every item of this vector
    def -(v)
      _vector_ari("-", v)
    end

    # Reports all values that don't comply with a condition.
    # Returns a hash with the index of data and the invalid data.
    def verify
      h = {}
      @data.each_with_index do |value, i|
        h[i] = value unless yield(value)
      end
      h
    end

    # Applies the binary operator named by +method+ between this vector
    # and +v+ (scalar, Array or Vector). Positions where either operand
    # is missing (nil for arrays) yield nil. The result is a new Vector
    # created with the default type.
    def _vector_ari(method, v) # :nodoc:
      if v.is_a?(Vector) || v.is_a?(Array)
        if v.size != @data.size
          raise ArgumentError, "The array/vector parameter should be of the same size of the original vector"
        end
        sum = (0...v.size).collect do |i|
          both_usable =
            (v.is_a?(Vector) && v.is_valid?(v[i]) && is_valid?(@data[i])) ||
            (v.is_a?(Array) && !v[i].nil? && !@data[i].nil?)
          both_usable ? @data[i].send(method, v[i]) : nil
        end
        Statsample::Vector.new(sum)
      elsif v.respond_to?(method)
        Statsample::Vector.new(@data.collect { |x| x.nil? ? nil : x.send(method, v) })
      else
        raise TypeError, "You should pass a scalar or a array/vector"
      end
    end

    # Return an array with the data split by a separator.
    # Assuming the separator is ",":
    #   a=Vector.new(["a,b","c,d","a,b","d"])
    #   a.splitted
    #   [["a","b"],["c","d"],["a","b"],["d"]]
    # nil values stay nil; values that don't respond to split are
    # wrapped in a one-element array.
    def splitted(sep=Statsample::SPLIT_TOKEN)
      @data.collect do |x|
        if x.nil?
          nil
        elsif x.respond_to?(:split)
          x.split(sep)
        else
          [x]
        end
      end
    end

    # Returns a hash of nominal Vectors, one per distinct value found
    # after splitting. Each vector holds 1 where the row contains the
    # value, 0 where it doesn't, and nil where the row is nil.
    # Example, assuming the separator is ",":
    #
    #   a=Vector.new(["a,b","c,d","a,b"])
    #   a.split_by_separator
    #   # "a" and "b" => data [1,0,1]; "c" and "d" => data [0,1,0]
    #
    def split_by_separator(sep=Statsample::SPLIT_TOKEN)
      split_data = splitted(sep)
      factors = split_data.flatten.uniq.compact
      out = factors.inject({}) { |a, x| a[x] = []; a }
      split_data.each do |r|
        if r.nil?
          factors.each { |f| out[f].push(nil) }
        else
          factors.each { |f| out[f].push(r.include?(f) ? 1 : 0) }
        end
      end
      out.inject({}) { |s, v| s[v[0]] = Vector.new(v[1], :nominal); s }
    end

    # Returns a hash with, for every value found by #split_by_separator,
    # the number of rows that contain it.
    def split_by_separator_freq(sep=Statsample::SPLIT_TOKEN)
      split_by_separator(sep).inject({}) do |a, v|
        # Start from 0: without an initial value a leading nil became
        # the accumulator (nil + ... raised) and an empty vector gave nil.
        a[v[0]] = v[1].inject(0) { |s, x| s + x.to_i }
        a
      end
    end

    # Returns a random sample of size n, with replacement,
    # only with valid data.
    #
    # In all the trials, every item has the same probability
    # of being selected.
    def sample_with_replacement(sample=1)
      Vector.new(@delegate.sample_with_replacement(sample), @type)
    end

    # Returns a random sample of size n, without replacement,
    # only with valid data.
    #
    # Every element can only be selected once.
    # A sample of the same size of the vector is the vector itself.
    def sample_without_replacement(sample=1)
      Vector.new(@delegate.sample_without_replacement(sample), @type)
    end

    # With a block: number of values (missing ones included) for which
    # the block returns a truthy value.
    # Without a block: frequency of x according to +frequencies+, or 0
    # if x is not present.
    def count(x=false)
      if block_given?
        @data.inject(0) { |s, i| s + ((yield i) ? 1 : 0) }
      else
        freq = frequencies[x]
        freq.nil? ? 0 : freq
      end
    end

    # Returns the database column type for the vector, according to its
    # content: "DATE" if any value looks like dd-dd-dddd or dddd-dd-dd,
    # "VARCHAR (255)" if any value has a character other than digits,
    # "e", "." or "-", "DOUBLE" if any value has a ".", else "INTEGER".
    # NOTE(review): the dbs argument is not used by the visible code.
    def db_type(dbs='mysql')
      # first, detect any character not number
      if @data.find { |v| v.to_s =~ /\d{2,2}-\d{2,2}-\d{4,4}/ } or @data.find { |v| v.to_s =~ /\d{4,4}-\d{2,2}-\d{2,2}/ }
        return "DATE"
      elsif @data.find { |v| v.to_s =~ /[^0-9e.-]/ }
        return "VARCHAR (255)"
      elsif @data.find { |v| v.to_s =~ /\./ }
        return "DOUBLE"
      else
        return "INTEGER"
      end
    end

    # Appends the delegate's summary to +out+ and returns it.
    def summary(out="")
      @delegate.summary(@labels, out)
    end

    # String of the form "Vector(type:T, n:N)[v1,v2,...]"; nil values
    # are printed as "nil".
    def to_s
      sprintf("Vector(type:%s, n:%d)[%s]", @type.to_s, @data.size,
              @data.collect { |d| d.nil? ? "nil" : d }.join(","))
    end

    def inspect
      self.to_s
    end
  end

  # Statistics for nominal data. Receives the valid data of a Vector.
  class Nominal
    def initialize(data)
      @data = data
      # @factors=data.uniq
    end

    # The array of data this object works on.
    def delegate_data
      @data
    end

    # Return a sorted array of the different values of the data.
    def factors
      @data.uniq.sort
    end

    # Returns a hash with the distribution of frequencies of
    # the sample.
    def frequencies_slow
      @data.inject(Hash.new) do |a, x|
        a[x] ||= 0
        a[x] = a[x] + 1
        a
      end
    end

    # Plot frequencies on a chart, using gnuplot.
    def plot_frequencies
      require 'gnuplot'
      x = []
      y = []
      self.frequencies.sort.each do |k, v|
        x.push(k)
        y.push(v)
      end
      Gnuplot.open do |gp|
        Gnuplot::Plot.new( gp ) do |plot|
          plot.boxwidth("0.9 absolute")
          plot.yrange("[0:#{y.max}]")
          plot.style("fill solid 1.00 border -1")
          plot.set("xtics border in scale 1,0.5 nomirror rotate by -45 offset character 0, 0, 0")
          plot.style("histogram")
          plot.style("data histogram")
          i = -1
          plot.set("xtics", "(" + x.collect { |v| i += 1; sprintf("\"%s\" %d", v, i) }.join(",") + ")")
          plot.data << Gnuplot::DataSet.new( [y] ) do |ds|
          end
        end
      end
    end

    # Returns the most frequent item.
    def mode
      frequencies.max { |a, b| a[1] <=> b[1] }[0]
    end

    # The number of items with valid data.
    def n_valid
      @data.size
    end

    # Returns a hash with the distribution of proportions of
    # the sample.
    def proportions
      frequencies.inject({}) do |a, v|
        a[v[0]] = v[1].quo(@data.size)
        a
      end
    end

    # Proportion of a given value.
    def proportion(v=1)
      frequencies[v].quo(@data.size)
    end

    # Appends n, factors, mode and the frequency distribution to +out+
    # and returns it. Values with a label are printed using the label.
    def summary(labels, out="")
      out << sprintf("n valid:%d\n", n_valid)
      out << sprintf("factors:%s\n", factors.join(","))
      out << "mode:" + mode.to_s + "\n"
      out << "Distribution:\n"
      frequencies.sort.each do |k, v|
        key = labels.has_key?(k) ? labels[k] : k
        out << sprintf("%s : %s (%0.2f%%)\n", key, v, (v.quo(n_valid)) * 100)
      end
      out
    end

    # Returns a random sample of size n, with replacement,
    # only with valid data.
    #
    # In all the trials, every item has the same probability
    # of being selected.
    def sample_with_replacement(sample)
      (0...sample).collect { @data[rand(@data.size)] }
    end

    # Returns a random sample of size n, without replacement,
    # only with valid data.
    #
    # Every element can only be selected once.
    # A sample of the same size of the vector is the vector itself.
    def sample_without_replacement(sample)
      raise ArgumentError, "Sample size couldn't be greater than n" if sample > @data.size
      out = []
      size = @data.size
      # FIXME(review): the text of this file appears to be truncated at
      # this point. The loop condition reads `out.size0`, the loop has no
      # body and nothing is returned. The visible text also never defines
      # the Ordinal and Scale classes instantiated by Vector#type=, nor
      # the set_gsl method / @gsl variable used further down. Restore the
      # lost text from version control; the statements below are
      # reproduced exactly as found.
      while out.size0
      end
    end

    # NOTE(review): the numeric statistics from here on presumably
    # belonged to the Scale class whose header was lost -- confirm.

    # The range of the data (max - min).
    def range; @data.max - @data.min; end

    # The sum of values for the data.
    def sum
      @data.inject(0) { |a, x| x + a }
    end

    # The arithmetical mean of data.
    def mean
      sum.to_f.quo(n_valid)
    end

    # Sum of the squared differences between every value and m
    # (the mean when m is not given).
    def sum_of_squares(m=nil)
      m ||= mean
      @data.inject(0) { |a, x| a + (x - m).square }
    end

    # Sum of squared deviation.
    def sum_of_squared_deviation
      @data.inject(0) { |a, x| x.square + a } - (sum.square.quo(n_valid))
    end

    # Population variance (divided by n).
    def variance_population(m=nil)
      m ||= mean
      squares = @data.inject(0) { |a, x| x.square + a }
      squares.quo(n_valid) - m.square
    end

    # Population Standard deviation (divided by n).
    def standard_deviation_population(m=nil)
      Math::sqrt( variance_population(m) )
    end

    # Sample Variance (divided by n-1).
    def variance_sample(m=nil)
      m ||= mean
      sum_of_squares(m).quo(n_valid - 1)
    end

    # Sample Standard deviation (divided by n-1).
    def standard_deviation_sample(m=nil)
      # Was the no-op `m||=m`; default to the mean like the sibling methods.
      m ||= mean
      Math::sqrt(variance_sample(m))
    end

    # Sum of cubed deviations from the mean, divided by (n-1)*sd**3.
    def skew
      m = mean
      # Use the cached mean instead of recomputing it for every element.
      cubes = @data.inject(0) { |a, x| a + ((x - m)**3) }
      cubes.quo((@data.size - 1) * sd**3)
    end

    # Sum of fourth-power deviations from the mean, divided by (n-1)*sd**4.
    def kurtosis
      m = mean
      # Use the cached mean instead of recomputing it for every element.
      fourths = @data.inject(0) { |a, x| a + ((x - m)**4) }
      fourths.quo((@data.size - 1) * sd**4)
    end

    # Product of all the values.
    def product
      @data.inject(1) { |a, x| a * x }
    end

    if HAS_GSL
      # Keep the pure Ruby implementations reachable as <name>_slow
      # before redefining them on top of GSL.
      %w{skew kurtosis variance_sample standard_deviation_sample variance_population standard_deviation_population mean sum}.each do |m|
        slow_name = (m + "_slow").intern
        alias_method slow_name, m.intern
      end

      def sum # :nodoc:
        @gsl.sum
      end

      def mean # :nodoc:
        @gsl.mean
      end

      # NOTE(review): m is computed but not passed to variance_m, unlike
      # the sibling methods below -- confirm this is intended.
      def variance_sample(m=nil) # :nodoc:
        m ||= mean
        @gsl.variance_m
      end

      def standard_deviation_sample(m=nil) # :nodoc:
        m ||= mean
        @gsl.sd(m)
      end

      def variance_population(m=nil) # :nodoc:
        m ||= mean
        @gsl.variance_with_fixed_mean(m)
      end

      def standard_deviation_population(m=nil) # :nodoc:
        m ||= mean
        @gsl.sd_with_fixed_mean(m)
      end

      def skew
        @gsl.skew
      end

      def kurtosis
        @gsl.kurtosis
      end

      # Create a GSL::Histogram.
      # With a fixnum, creates X bins within the range of data.
      # With an Array, each value will be a cut point.
      def histogram(bins=10)
        if bins.is_a? Array
          h = GSL::Histogram.alloc(bins)
        else
          # ugly patch. The upper limit for a bin has the form
          # x < range
          h = GSL::Histogram.alloc(bins, [@data.min, @data.max + 0.0001])
        end
        h.increment(@gsl)
        h
      end

      # Graphs the histogram built by #histogram.
      def plot_histogram(bins=10, options="")
        self.histogram(bins).graph(options)
      end

      # Random sample of k values, with replacement, drawn by GSL.
      def sample_with_replacement(k)
        r = GSL::Rng.alloc(GSL::Rng::MT19937, rand(10000))
        r.sample(@gsl, k).to_a
      end

      # Random sample of k values, without replacement, drawn by GSL.
      def sample_without_replacement(k)
        r = GSL::Rng.alloc(GSL::Rng::MT19937, rand(10000))
        r.choose(@gsl, k).to_a
      end
    end

    # Coefficient of variation.
    # Calculated with the sample standard deviation.
    def coefficient_of_variation
      standard_deviation_sample.quo(mean)
    end

    # Appends n, mean, sum, range, variances and standard deviations to
    # +out+ and returns it. The labels argument is not used here.
    def summary(labels, out="")
      out << sprintf("n valid:%d\n", n_valid)
      out << "mean:" + mean.to_s + "\n"
      out << "sum:" + sum.to_s + "\n"
      out << "range:" + range.to_s + "\n"
      out << "variance (pop):" + variance_population.to_s + "\n"
      out << "sd (pop):" + sdp.to_s + "\n"
      out << "variance (sample):" + variance_sample.to_s + "\n"
      out << "sd (sample):" + sds.to_s + "\n"
      out
    end

    alias_method :sdp, :standard_deviation_population
    alias_method :sds, :standard_deviation_sample
    alias_method :cov, :coefficient_of_variation
    alias_method :variance, :variance_sample
    alias_method :sd, :standard_deviation_sample
    alias_method :ss, :sum_of_squares
  end
end