# See this project for code to compute chi_square and contingency_coefficient
# https://github.com/bioruby/bioruby/blob/master/lib/bio/util/contingency_table.rb
#
# Resources for Chi Squared Test
# * http://www.quora.com/What-is-the-most-intuitive-explanation-for-the-chi-square-test
# * http://people.revoledu.com/kardi/tutorial/Questionnaire/Chi-Square%20IndependentTest.html
# * http://stattrek.com/chi-square-test/independence.aspx?Tutorial=AP
# Contingency tables and the chi squared test are good tools for interpreting A/B tests.
class RailsDataExplorer
  class Chart
    # Cross tabulates two data series into a contingency table and summarizes
    # a Pearson chi squared test of independence for them.
    class ContingencyTable < Chart

      # @param[Object] _data_set must respond to `data_series` and `dimensions_count`
      # @param[Hash, optional] options copied into a fresh Hash
      def initialize(_data_set, options = {})
        @data_set = _data_set
        @options = {}.merge(options)
      end

      # Builds the table description consumed by `render_html_table`.
      #
      # The x series is the first data series whose chart roles for this chart
      # include :x or :any; the y series is the first other series whose roles
      # include :y or :any.
      #
      # @return[OpenStruct, false] a struct with `rows` (header row, one row
      #   per y value, footer row), or false if no x/y pair could be found.
      # @raise[ArgumentError] if the data set does not have exactly two dimensions.
      def compute_chart_attrs
        x_ds = contingency_series_for(:x).first
        y_ds = (contingency_series_for(:y) - [x_ds]).first
        return false if x_ds.nil? || y_ds.nil?

        # Fail before doing any of the table computation below.
        unless @data_set.dimensions_count == 2
          raise ArgumentError, "Exactly two data series required for contingency table."
        end

        # Compute @observed_vals, @expected_vals, etc.
        compute_contingency_and_chi_squared!(x_ds, y_ds)

        # The fallback comparators order keys by descending row/column total.
        x_sorted_keys = x_ds.uniq_vals.sort(
          &x_ds.label_sorter(
            nil,
            lambda { |a, b| @observed_vals[b][:_sum] <=> @observed_vals[a][:_sum] }
          )
        )
        y_sorted_keys = y_ds.uniq_vals.sort(
          &y_ds.label_sorter(
            nil,
            lambda { |a, b| @observed_vals[:_sum][b] <=> @observed_vals[:_sum][a] }
          )
        )

        OpenStruct.new(
          :rows => [contingency_header_row(x_sorted_keys)] +
            y_sorted_keys.map { |y_val| contingency_data_row(y_val, x_sorted_keys) } +
            [contingency_footer_row(x_sorted_keys)]
        )
      end

      # @return[String] the table markup followed by a paragraph holding the
      #   conclusion, or '' if there is nothing to render.
      def render
        return '' unless render?
        ca = compute_chart_attrs
        return '' unless ca
        table_div = content_tag(:div, :class => 'rde-chart rde-contingency-table', :id => dom_id) do
          content_tag(:h3, "Contingency Table", :class => 'rde-chart-title') +
            render_html_table(ca)
        end
        table_div + content_tag(:p, @conclusion)
      end

      # Always true. Whether the test's assumptions hold is decided while
      # computing the conclusion, not here.
      def render?
        # http://en.wikipedia.org/wiki/Pearson's_chi-squared_test#Assumptions
        true
      end

    private

      # Computes @observed_vals, @expected_vals, @degrees_of_freedom,
      # @chi_squared, @delta_attrs, @p_value, @significance_level and
      # @conclusion.
      #
      # @p_value is nil when either series has fewer than two unique values:
      # the test is not applicable then, and the distribution is not queried
      # with zero degrees of freedom.
      #
      # @param[DataSeries] x_ds
      # @param[DataSeries] y_ds
      def compute_contingency_and_chi_squared!(x_ds, y_ds)
        x_vals = x_ds.uniq_vals
        y_vals = y_ds.uniq_vals

        @observed_vals = contingency_observed_vals(x_ds, y_ds, x_vals, y_vals)
        @degrees_of_freedom = (x_ds.uniq_vals_count - 1) * (y_ds.uniq_vals_count - 1)
        @expected_vals = contingency_expected_vals(x_vals, y_vals)

        # Chi squared: sum over all cells of (observed - expected)^2 / expected
        @chi_squared = 0
        x_vals.each { |x_val|
          y_vals.each { |y_val|
            expected = @expected_vals[x_val][y_val]
            @chi_squared += ((@observed_vals[x_val][y_val] - expected) ** 2) / expected
          }
        }

        # Per cell deviation from the expected value, plus its display color
        @delta_attrs = {}
        color_scale = RailsDataExplorer::Utils::ColorScale.new
        x_vals.each { |x_val|
          @delta_attrs[x_val] = {}
          y_vals.each { |y_val|
            expected = @expected_vals[x_val][y_val]
            delta = @observed_vals[x_val][y_val] - expected
            delta_factor = delta / expected.to_f
            @delta_attrs[x_val][y_val] = {
              :expected => expected,
              :color => color_scale.compute(delta_factor),
              :delta => delta,
              :delta_factor => delta_factor,
            }
          }
        }

        # Probability of observing a sample statistic as extreme as the
        # observed test statistic. Only defined if both series vary.
        testable = [x_ds, y_ds].all? { |e| e.uniq_vals.length >= 2 }
        @p_value = if testable
          1 - Distribution::ChiSquare.cdf(@chi_squared, @degrees_of_freedom)
        end
        @significance_level = 0.05

        @conclusion = contingency_conclusion(x_ds, y_ds, x_vals, y_vals, testable)
      end

      # @param[Symbol] role :x or :y
      # @return[Array] data series whose roles for this chart include role or :any
      def contingency_series_for(role)
        @data_set.data_series.find_all { |ds|
          (ds.chart_roles[Chart::ContingencyTable] & [role, :any]).any?
        }
      end

      # Counts co-occurrences of x and y values, pairing them by position.
      # Row, column and grand totals are stored under the :_sum key.
      # @return[Hash] { x_val => { y_val => count, :_sum => row_total }, :_sum => { ... } }
      def contingency_observed_vals(x_ds, y_ds, x_vals, y_vals)
        observed = { :_sum => { :_sum => 0 } }
        y_vals.each { |y_val| observed[:_sum][y_val] = 0 }
        x_vals.each { |x_val|
          observed[x_val] = { :_sum => 0 }
          y_vals.each { |y_val| observed[x_val][y_val] = 0 }
        }
        y_values = y_ds.values
        x_ds.values.each_with_index { |x_val, idx|
          y_val = y_values[idx]
          observed[x_val][y_val] += 1
          observed[:_sum][y_val] += 1
          observed[x_val][:_sum] += 1
          observed[:_sum][:_sum] += 1
        }
        observed
      end

      # Expected count per cell under independence:
      # column total * row total / grand total.
      # @return[Hash] { x_val => { y_val => Float } }
      def contingency_expected_vals(x_vals, y_vals)
        grand_total = @observed_vals[:_sum][:_sum].to_f
        expected = {}
        x_vals.each { |x_val|
          expected[x_val] = {}
          y_vals.each { |y_val|
            expected[x_val][y_val] = (
              @observed_vals[:_sum][y_val] * @observed_vals[x_val][:_sum]
            ) / grand_total
          }
        }
        expected
      end

      # Builds the human readable summary of the test.
      # @return[String] marked as html_safe
      def contingency_conclusion(x_ds, y_ds, x_vals, y_vals, testable)
        all_observed_vals = []
        x_vals.each { |x_val|
          y_vals.each { |y_val| all_observed_vals << @observed_vals[x_val][y_val] }
        }
        # NOTE(review): this cutoff is applied to observed counts. The rule of
        # thumb is usually stated for expected counts -- confirm which is intended.
        ratio_of_observed_vals_below_five =
          all_observed_vals.count { |e| e < 5 } / all_observed_vals.length.to_f

        parts = if ratio_of_observed_vals_below_five > 0.2
          [
            "We did not run the ",
            %(Pearson chi squared test of independence ),
            "since #{ number_to_percentage(ratio_of_observed_vals_below_five * 100, :precision => 0) } ",
            "of observed values in the contingency table are below 5 (cutoff is 20%)."
          ]
        elsif !testable
          [
            "We did not run the ",
            %(Pearson chi squared test of independence ),
            "since there are not enough observed values in the contingency table."
          ]
        else
          relation = @p_value <= @significance_level ? 'dependent' : 'independent'
          [
            %(Pearson chi squared test of independence suggests that ),
            %("#{ x_ds.name }" and "#{ y_ds.name }" are #{ relation } variables ),
            "(p_value: #{ number_with_precision(@p_value) })"
          ]
        end
        # NOTE(review): series names are interpolated unescaped before the
        # string is marked html_safe -- confirm they never hold user input.
        parts.join.html_safe
      end

      # Top header row: empty corner cell, one cell per x value, 'Totals'.
      def contingency_header_row(x_keys)
        OpenStruct.new(
          :css_class => 'rde-column_header',
          :tag => :tr,
          :cells => [OpenStruct.new(:tag => :th, :value => '')] +
            x_keys.map { |x_val| OpenStruct.new(:tag => :th, :value => x_val) } +
            [OpenStruct.new(:tag => :th, :value => 'Totals')]
        )
      end

      # Data row for y_val: row header, one observed count per x value
      # (colored by its deviation from the expected value), row total.
      def contingency_data_row(y_val, x_keys)
        OpenStruct.new(
          :css_class => 'rde-data_row',
          :tag => :tr,
          :cells => [
            OpenStruct.new(:tag => :th, :value => y_val, :css_class => 'rde-row_header')
          ] +
            x_keys.map { |x_val|
              OpenStruct.new(
                :tag => :td,
                :value => @observed_vals[x_val][y_val],
                :css_class => 'rde-numerical',
                :title => "Expected value: #{ number_with_precision(@expected_vals[x_val][y_val]) }",
                :style => "color: #{ @delta_attrs[x_val][y_val][:color] };"
              )
            } +
            [OpenStruct.new(:tag => :th, :value => @observed_vals[:_sum][y_val])]
        )
      end

      # Footer row: 'Totals', one total per x value, grand total.
      def contingency_footer_row(x_keys)
        OpenStruct.new(
          :css_class => 'rde-column_header',
          :tag => :tr,
          :cells => [
            OpenStruct.new(:tag => :th, :value => 'Totals', :css_class => 'rde-row_header')
          ] +
            x_keys.map { |x_val|
              OpenStruct.new(:tag => :th, :value => @observed_vals[x_val][:_sum])
            } +
            [OpenStruct.new(:tag => :th, :value => @observed_vals[:_sum][:_sum])]
        )
      end

    end
  end
end