## # Module supplying probability functions module Digiproc::Probability ## # Interchangable strategy #Strategy requirements: #can be initialized with no arguments (gives mean = 0 and stddev = 1), or with a mean and stddev #instance method: #rand returns normal distribution with mean = 0, stddev = 1 @gaussian_generator = Digiproc::Strategies::GaussianGeneratorBoxMullerStrategy.new ## # Returns a new instance of a normal number random generator, which contains an instance method #rand which will return a # single random number from the normal distribution # == Arguments # mean:: [Float] mean of the distribution defaults to 0 # stddev:: [Float] standard deviation of the distribution defaults to 1 ## gen = Digiproc::Probability.normal_random_generator(100, 15) # => return rand gen of IQ scores ## scores = [] ## 10.times do ## scores << gen.rand ## end ## puts scores # => [98.22222989456891, 102.78474153867536, 130.32345913801984, 90.68720464516447, 94.45065665514275, 100.46571157090098, 105.40955807686252, 85.9616084359681, 103.05219341559669, 96.62475435904693] def self.normal_random_generator(mean = 0, stddev = 1) @gaussian_generator.class.new(mean, stddev) end ## # == Arguments # size:: [Integer] number of returns expected, defaults to 1 # If `size` is 1, returns a single number from the distribution # For `size` > 1, return an array of numbers from the distribution ## Digiproc::Probability.nrand(5) # => [-0.4870987490684469, 0.48974360810927925, -0.3722088483576355, -0.2829898781247938, 0.12540113064787742] ## Digiproc::Probability.nrand # => 0.7978655621417761 def self.nrand(size = 1) return @gaussian_generator.rand if size == 1 rand_nums = [] size.times do rand_nums << @gaussian_generator.rand end return rand_nums end ## # Returns [Float] the mean of the inputted data # == Arguments # data:: [Array] data to be evaluated ## dat = [4,12,19,4,8,9] ## Digiproc::Probability.mean(dat) # => 9.333333333333334 def self.mean(data) data.sum / data.length.to_f end ## # Returns [Float] the variance of the inputted data # == Arguments # data:: [Array] data to be evaluated # variance = sum_over_datavals( (dataval - mean) ^ 2 ) / (number_of_datavals - 1) ## dat = [4,12,19,4,8,9] ## Digiproc::Probability.variance(dat) # => 31.866666666666664 def self.variance(data) mu = mean(data) summation = data.map{ |val| (val - mu) ** 2 }.sum summation.to_f / (data.length - 1) end ## # Returns [Float] the variance of inputted data for a stationary process # (stationary proces = mean, variance, autocorrelation do not change with time) # Will run faster over large datasets because each val in data is not undergoing a subtraction with mu # If the process is not stationary, an incorrect result with be given # == Arguments # data:: [Array] data to be evaluated # stat_var = sum_over_datavals( val^2 ) / number_ov_vals ## dat = [4,12,19,4,8,9] ## Digiproc::Probability.stationary_variance(dat) # => 26.555555555555543 def self.stationary_variance(data) mu = mean(data) summation = data.map{ |val| val ** 2 }.sum (summation.to_f / data.length) - (mu ** 2) end ## # Returns [Float] the standard deviation of the inputted data # == Arguments # data:: [Array] data to be evaluated # stddev = sqrt(variance) ## dat = [4,12,19,4,8,9] ## Digiproc::Probability.stddev # => 5.645056834671079 def self.stddev(data) variance(data) ** (0.5) end ## # Returns [Float] the covariance (if process is stationary) of two datasets # == Arguments # data1:: [Array] first dataset # data2:: [Array] second datatset ## dat = [4,12,19,4,8,9] ## dat2 = [19, 20, 35, 15, 13, 17] ## Digiproc::Probability.stationary_covariance(dat, dat2) # => 31.22222222222223 def self.stationary_covariance(data1, data2) raise ArgumentError.new("Datasets must be of equal length") if data1.length != data2.length xcs = data1.dot data2 mu1 = mean(data1) mu2 = mean(data2) (xcs / (data1.length.to_f)) - (mu1 * mu2) end ## # Returns [Float] covariance of two datasets # == Arguments # data1:: [Array] first dataset # data2:: [Array] second datatset ## dat = [4,12,19,4,8,9] ## dat2 = [19, 20, 35, 15, 13, 17] ## Digiproc::Probability.ovariance(dat, dat2) # => 37.46666666666667 def self.covariance(data1, data2) raise ArgumentError.new("Datasets must be of equal length") if data1.length != data2.length mu1 = mean(data1) mu2 = mean(data2) summation = 0 for i in 0...data1.length do summation += ((data1[i] - mu1) * (data2[i] - mu2)) end summation.to_f / (data1.length - 1) end ## # Calculates Pearson's correlation coefficient of two datasets # == Arguments # data1:: [Array] first dataset # data2:: [Array] second dataset ## dat = [4,12,19,4,8,9] ## dat2 = [19, 20, 35, 15, 13, 17] ## Digiproc::Probability.correlation_coefficient(dat, dat2) # => 0.84450000297225 ## cdat = [1,2,3,4,5] ## ddat = cdat.map{ |v| v * -29 } ## Digiproc::Probability.correlation_coefficient(cdat, ddat) # => -1.0 def self.correlation_coefficient(data1, data2) covar = covariance(data1, data2) var1 = variance(data1) var2 = variance(data2) return covar.to_f / ((var1 ** 0.5) * (var2 ** 0.5)) end ## # Alias for #variance def self.var(d) variance(d) end ## # Alias for #covariance def self.cov(d1, d2) covariance(d1,d2) end ## # Alias for #correlation_coefficient def self.corr_coeff(d1, d2) correlation_coefficient(d1, d2) end ## # Returns the error function output [Float] of the input ie: # For an input x and a normal distribution with mean 0 and variance 0.5, it is the probability a random variable will be between -x and x # Mirrors Math.erf(x) # == Arguments # x:: [Float] ## Digiproc::Prbobability.erf(0.3) # => 0.32862675945912734 def self.erf(x) Math.erf(x) end ## # Returns the erfc [Float] per Math.erfc # For an input x and a normal distribution with mean 0 and variance 0.5, it is the probability a random variable will not be between -x and x # == Arguments # x:: [Float] ## Digiproc::Prbobability.erfc(0.3) # => 0.6713732405408727 def self.erfc(x) Math.erfc(x) end ## # Returns [Float] probability that a random variable from a normal distribution of a certain mean and standard deviation will be less than a value x # == Arguments # x:: [Float] the value which the probability will be evaluated that a random variable will be below # mean:: [Float] the mean of the normal distribution (defaults to 0) # stddev:: [Float] the standard deviation of the normal distribution (defaults to 1) ## Digiproc::Probability.normal_cdf(0.5) # => 0.691462461274013 ## Digiproc::Probability.normal_cdf(140, 100, 15) # => 0.9961696194324102 def self.normal_cdf(x, mean = 0, stddev = 1) 1 - normal_q(x, mean, stddev) end ## # Returns [Float] the probability that a random variable from a normal distribution of a certain mean and standard deviation will be greater than a value x # == Arguments # x:: [Float] the value which the probability will be evaluated that a random variable will be below # mean:: [Float] the mean of the normal distribution (defaults to 0) # stddev:: [Float] the standard deviation of the normal distribution (defaults to 1) ## Digiproc::Probability.normal_q(0.5) # => 0.30853753872598694 ## Digiproc::Probability.normal_q(140, 100, 15) # => 0.0038303805675897395 def self.normal_q(x, mean = 0, stddev = 1) xformed_x = (x - mean) / stddev.to_f 0.5 * erfc(xformed_x / (2 ** 0.5)) end ## # Returns [Hash] the discrete probability distribution function of an inputted array of values # == Arguments # data:: [Array] discrete values in a dataset ## dat = [3,3,4,5,4,5,6,5,6,5,7,6,5,4] ## Digiproc::Probability.pdf(dat) # => {3=>2, 4=>3, 5=>5, 6=>3, 7=>1} def self.pdf(data) pdf = {} data.each do |datapoint| pt = datapoint.round(2) if pdf[pt].nil? pdf[pt] = 1 else pdf[pt] += 1 end end pdf end end