module Statsample
# Multiset joins multiple dataset with the same fields and vectors
# but with different number of cases.
# This is the base class for stratified and cluster sampling estimation
class Multiset
attr_reader :fields, :datasets
# To create a multiset
# * Multiset.new(%w{f1 f2 f3}) # define only fields
def initialize(fields)
@fields=fields
@datasets={}
end
def self.new_empty_vectors(fields,ds_names)
ms=Multiset.new(fields)
ds_names.each{|d|
ms.add_dataset(d,Dataset.new(fields))
}
ms
end
def datasets_names
@datasets.keys.sort
end
def n_datasets
@datasets.size
end
def add_dataset(key,ds)
if(ds.fields!=@fields)
raise ArgumentError, "Dataset(#{ds.fields.to_s})must have the same fields of the Multiset(#{@fields})"
else
@datasets[key]=ds
end
end
def sum_field(field)
@datasets.inject(0) {|a,da|
stratum_name=da[0]
vector=da[1][field]
val=yield stratum_name,vector
a+val
}
end
def collect_vector(field)
@datasets.collect {|k,v|
yield k, v[field]
}
end
def each_vector(field)
@datasets.each {|k,v|
yield k, v[field]
}
end
def[](i)
@datasets[i]
end
end
class StratifiedSample
class << self
# mean for an array of vectors
def mean(*vectors)
n_total=0
means=vectors.inject(0){|a,v|
n_total+=v.size
a+v.sum
}
means.to_f/n_total
end
def standard_error_ksd_wr(es)
n_total=0
sum=es.inject(0){|a,h|
n_total+=h['N']
a+((h['N']**2 * h['s']**2) / h['n'].to_f)
}
(1.to_f / n_total)*Math::sqrt(sum)
end
def variance_ksd_wr(es)
standard_error_ksd_wr(es)**2
end
def calculate_n_total(es)
es.inject(0) {|a,h| a+h['N'] }
end
# Source : Cochran (1972)
def variance_ksd_wor(es)
n_total=calculate_n_total(es)
es.inject(0){|a,h|
val=((h['N'].to_f / n_total)**2) * (h['s']**2 / h['n'].to_f) * (1 - (h['n'].to_f / h['N']))
a+val
}
end
def standard_error_ksd_wor(es)
Math::sqrt(variance_ksd_wor(es))
end
def variance_esd_wor(es)
n_total=calculate_n_total(es)
sum=es.inject(0){|a,h|
val=h['N']*(h['N']-h['n'])*(h['s']**2 / h['n'].to_f)
a+val
}
(1.0/(n_total**2))*sum
end
def standard_error_esd_wor(es)
Math::sqrt(variance_ksd_wor(es))
end
# Based on http://stattrek.com/Lesson6/STRAnalysis.aspx
def variance_esd_wr(es)
n_total=calculate_n_total(es)
sum=es.inject(0){|a,h|
val= ((h['s']**2 * h['N']**2) / h['n'].to_f)
a+val
}
(1.0/(n_total**2))*sum
end
def standard_error_esd_wr(es)
Math::sqrt(variance_esd_wr(es))
end
def proportion_variance_ksd_wor(es)
n_total=calculate_n_total(es)
es.inject(0){|a,h|
val= (((h['N'].to_f / n_total)**2 * h['p']*(1-h['p'])) / (h['n'])) * (1- (h['n'].to_f / h['N']))
a+val
}
end
def proportion_sd_ksd_wor(es)
Math::sqrt(proportion_variance_ksd_wor(es))
end
def proportion_sd_ksd_wr(es)
n_total=calculate_n_total(es)
sum=es.inject(0){|a,h|
val= (h['N']**2 * h['p']*(1-h['p'])) / h['n'].to_f
a+val
}
Math::sqrt(sum) * (1.0/n_total)
end
def proportion_variance_ksd_wr(es)
proportion_variance_ksd_wor(es)**2
end
def proportion_variance_esd_wor(es)
n_total=n_total=calculate_n_total(es)
sum=es.inject(0){|a,h|
a=(h['N']**2 * (h['N']-h['n']) * h['p']*(1.0-h['p'])) / ((h['n']-1)*(h['N']-1))
a+val
}
Math::sqrt(sum) * (1.0/n_total**2)
end
def proportion_sd_esd_wor(es)
Math::sqrt(proportion_variance_ksd_wor(es))
end
end
def initialize(ms,strata_sizes)
raise TypeError,"ms should be a Multiset" unless ms.is_a? Statsample::Multiset
@ms=ms
raise ArgumentError,"You should put a strata size for each dataset" if strata_sizes.keys.sort!=ms.datasets_names
@strata_sizes=strata_sizes
@population_size=@strata_sizes.inject(0) {|a,x| a+x[1]}
@strata_number=@ms.n_datasets
@sample_size=@ms.datasets.inject(0) {|a,x| a+x[1].cases}
end
# Number of strata
def strata_number
@strata_number
end
# Population size. Equal to sum of strata sizes
# Symbol: Nh
def population_size
@population_size
end
# Sample size. Equal to sum of sample of each stratum
def sample_size
@sample_size
end
# Size of stratum x
def stratum_size(h)
@strata_sizes[h]
end
def vectors_by_field(field)
@ms.datasets.collect{|k,ds|
ds[field]
}
end
# Population proportion based on strata
def proportion(field, v=1)
@ms.sum_field(field) {|s_name,vector|
stratum_ponderation(s_name)*vector.proportion(v)
}
end
# Stratum ponderation.
# Symbol: W\h\
def stratum_ponderation(h)
@strata_sizes[h].to_f / @population_size
end
alias_method :wh, :stratum_ponderation
# Population mean based on strata
def mean(field)
@ms.sum_field(field) {|s_name,vector|
stratum_ponderation(s_name)*vector.mean
}
end
# Standard error with estimated population variance and without replacement.
# Source: Cochran (1972)
def standard_error_wor(field)
es=@ms.collect_vector(field) {|s_n, vector|
{'N'=>@strata_sizes[s_n],'n'=>vector.size, 's'=>vector.sds}
}
StratifiedSample.standard_error_esd_wor(es)
end
# Standard error with estimated population variance and without replacement.
# Source: http://stattrek.com/Lesson6/STRAnalysis.aspx
def standard_error_wor_2(field)
sum=@ms.sum_field(field) {|s_name,vector|
s_size=@strata_sizes[s_name]
(s_size**2 * (1-(vector.size.to_f / s_size)) * vector.variance_sample / vector.size.to_f)
}
(1/@population_size.to_f)*Math::sqrt(sum)
end
def standard_error_wr(field)
es=@ms.collect_vector(field) {|s_n, vector|
{'N'=>@strata_sizes[s_n],'n'=>vector.size, 's'=>vector.sds}
}
StratifiedSample.standard_error_esd_wr(es)
end
def proportion_sd_esd_wor(field,v=1)
es=@ms.collect_vector(field) {|s_n, vector|
{'N'=>@strata_sizes[s_n],'n'=>vector.size, 'p'=>vector.proportion(v)}
}
StratifiedSample.proportion_sd_esd_wor(es)
end
def proportion_standard_error(field,v=1)
prop=proportion(field,v)
sum=@ms.sum_field(field) {|s_name,vector|
nh=vector.size
s_size=@strata_sizes[s_name]
(s_size**2 * (1-(nh/s_size)) * prop * (1-prop) / (nh -1 ))
}
(1.quo(@population_size)) * Math::sqrt(sum)
end
# Cochran(1971), p. 150
def variance_pst(field,v=1)
sum=@ms.datasets.inject(0) {|a,da|
stratum_name=da[0]
ds=da[1]
nh=ds.cases.to_f
s_size=@strata_sizes[stratum_name]
prop=ds[field].proportion(v)
a + (((s_size**2 * (s_size-nh)) / (s_size-1))*(prop*(1-prop) / (nh-1)))
}
(1/@population_size.to_f ** 2)*sum
end
end
end