# -*- coding: utf-8 -*-
class RailsDataExplorer
class Chart
# Contingency table and chi squared test are great tools for interpreting
# A/B tests.
#
# Responsibilities:
# * Render a contingency table for bivariate analysis of two categorical
# data series.
#
# Collaborators:
# * DataSet
#
# See this project for code to compute chi_square and contingency_coefficient
# https://github.com/bioruby/bioruby/blob/master/lib/bio/util/contingency_table.rb
#
# Resources for Chi Squared Test
# * http://www.quora.com/What-is-the-most-intuitive-explanation-for-the-chi-square-test
# * http://people.revoledu.com/kardi/tutorial/Questionnaire/Chi-Square%20IndependentTest.html
# * http://stattrek.com/chi-square-test/independence.aspx?Tutorial=AP
class ContingencyTable < Chart
# Creates a contingency table chart for the given data set.
#
# @param _data_set [DataSet] the data set whose data series are tabulated
# @param options [Hash] chart options; they are copied into a fresh Hash so
#   the caller's object is not retained
def initialize(_data_set, options = {})
  @data_set = _data_set
  @options = {}.update(options)
end
# Builds the table structure for the contingency table.
#
# The first data series whose chart roles for ContingencyTable include :x or
# :any supplies the columns; the first *other* series whose roles include :y
# or :any supplies the rows. Column and row keys are ordered via each
# series' label_sorter, which is handed a comparator that ranks keys by
# descending total.
#
# Side effect: calls compute_contingency_and_chi_squared!, which populates
# the instance variables read here (@observed_vals, @expected_vals,
# @delta_attrs).
#
# @return [Utils::RdeTable, false] the table, or false when no suitable pair
#   of data series is available
# @raise [ArgumentError] if the data set does not have exactly two dimensions
def compute_chart_attrs
  series_with_role = lambda { |roles|
    @data_set.data_series.select { |ds|
      (ds.chart_roles[Chart::ContingencyTable] & roles).any?
    }
  }
  x_ds = series_with_role.call([:x, :any]).first
  y_ds = (series_with_role.call([:y, :any]) - [x_ds]).first
  return false if x_ds.nil? || y_ds.nil?

  # Fail fast: validate the data set before computing any statistics, so an
  # unsupported data set neither costs a full computation nor leaves
  # half-meaningful instance state behind.
  unless @data_set.dimensions_count == 2
    raise ArgumentError, "Exactly two data series required for contingency table."
  end

  # Compute @observed_vals, @expected_vals, etc.
  compute_contingency_and_chi_squared!(x_ds, y_ds)

  x_sorted_keys = x_ds.uniq_vals.sort(
    &x_ds.label_sorter(
      nil,
      lambda { |a, b| @observed_vals[b][:_sum] <=> @observed_vals[a][:_sum] }
    )
  )
  y_sorted_keys = y_ds.uniq_vals.sort(
    &y_ds.label_sorter(
      nil,
      lambda { |a, b| @observed_vals[:_sum][b] <=> @observed_vals[:_sum][a] }
    )
  )

  # All percentages in cell tooltips share the same formatting.
  pct = lambda { |value|
    number_to_percentage(value, precision: 3, significant: true)
  }

  # Top header row: empty corner cell, one cell per x value, then 'Totals'.
  header_row = Utils::RdeTableRow.new(
    :tr,
    [Utils::RdeTableCell.new(:th, '')] +
      x_sorted_keys.map { |x_val| Utils::RdeTableCell.new(:th, x_val) } +
      [Utils::RdeTableCell.new(:th, 'Totals')],
    css_class: 'rde-column_header'
  )

  # One data row per y value: row header, observed counts, row total.
  data_rows = y_sorted_keys.map { |y_val|
    row_header = Utils::RdeTableCell.new(:th, y_val, css_class: 'rde-row_header')
    value_cells = x_sorted_keys.map { |x_val|
      cell_attrs = @delta_attrs[x_val][y_val]
      Utils::RdeTableCell.new(
        :td,
        @observed_vals[x_val][y_val],
        css_class: 'rde-numerical',
        title: [
          "Expected value: #{ number_with_precision(@expected_vals[x_val][y_val], precision: 3, significant: true) }",
          "Percentage of row: #{ pct.call(cell_attrs[:percentage_of_row]) }",
          "Percentage of col: #{ pct.call(cell_attrs[:percentage_of_col]) }",
        ].join("\n"),
        style: "color: #{ cell_attrs[:color] };"
      )
    }
    row_total = Utils::RdeTableCell.new(
      :th,
      @observed_vals[:_sum][y_val],
      title: "Percentage of col: #{ pct.call(@delta_attrs[:_sum][y_val][:percentage_of_col]) }"
    )
    Utils::RdeTableRow.new(
      :tr,
      [row_header] + value_cells + [row_total],
      css_class: 'rde-data_row'
    )
  }

  # Footer row: column totals followed by the grand total.
  column_totals = x_sorted_keys.map { |x_val|
    Utils::RdeTableCell.new(
      :th,
      @observed_vals[x_val][:_sum],
      title: "Percentage of row: #{ pct.call(@delta_attrs[x_val][:_sum][:percentage_of_row]) }"
    )
  }
  footer_row = Utils::RdeTableRow.new(
    :tr,
    [Utils::RdeTableCell.new(:th, 'Totals', css_class: 'rde-row_header')] +
      column_totals +
      [Utils::RdeTableCell.new(:th, @observed_vals[:_sum][:_sum])],
    css_class: 'rde-column_header'
  )

  Utils::RdeTable.new([header_row] + data_rows + [footer_row])
end
# Renders the chart as HTML: a wrapper div holding the title and the table,
# followed by a paragraph with the chi squared conclusion.
#
# @return [String] the markup, or an empty string when render? is false or
#   compute_chart_attrs yields nothing to render
def render
  return '' unless render?
  chart_attrs = compute_chart_attrs
  return '' unless chart_attrs

  chart_markup = content_tag(:div, class: 'rde-chart rde-contingency-table', id: dom_id) do
    heading = content_tag(:h3, "Contingency Table", class: 'rde-chart-title')
    heading + render_html_table(chart_attrs)
  end
  chart_markup + content_tag(:p, @conclusion)
end
# Tells #render whether the chart should be rendered at all; #render returns
# an empty string when this is false. Currently unconditionally true.
#
# NOTE(review): the link below lists the assumptions of Pearson's chi squared
# test. Presumably this predicate was meant to verify them before rendering;
# confirm with the author.
def render?
# http://en.wikipedia.org/wiki/Pearson's_chi-squared_test#Assumptions
true
end
private
# Computes all statistics behind the contingency table for two categorical
# data series and stores them in instance variables:
#
# * @observed_vals      - observed counts keyed [x_val][y_val]; the :_sum key
#                         holds the marginal totals and the grand total
# * @degrees_of_freedom - (x categories - 1) * (y categories - 1)
# * @expected_vals      - expected counts keyed [x_val][y_val]
# * @chi_squared        - Pearson chi squared test statistic
# * @delta_attrs        - per cell deviation, color and percentages
# * @p_value            - 1 - chi squared CDF evaluated at @chi_squared
# * @significance_level - fixed at 0.05
# * @conclusion         - html_safe sentence describing the test outcome
#
# @param[DataSeries] x_ds
# @param[DataSeries] y_ds
# @return [String] the conclusion
def compute_contingency_and_chi_squared!(x_ds, y_ds)
  # Read the category lists once instead of on every loop iteration.
  x_keys = x_ds.uniq_vals
  y_keys = y_ds.uniq_vals

  @observed_vals = contingency_observed_vals(x_ds.values, y_ds.values, x_keys, y_keys)
  @degrees_of_freedom = (x_ds.uniq_vals_count - 1) * (y_ds.uniq_vals_count - 1)
  @expected_vals = contingency_expected_vals(@observed_vals, x_keys, y_keys)
  @chi_squared = contingency_chi_squared(@observed_vals, @expected_vals, x_keys, y_keys)
  @delta_attrs = contingency_delta_attrs(@observed_vals, @expected_vals, x_keys, y_keys)
  # Probability of observing a sample statistic as extreme as the observed
  # test statistic.
  @p_value = 1 - Distribution::ChiSquare.cdf(@chi_squared, @degrees_of_freedom)
  @significance_level = 0.05
  @conclusion = contingency_conclusion(x_ds, y_ds, x_keys, y_keys).html_safe
end

# Counts how often each (x, y) pair occurs. Values are paired by index.
# Marginal totals and the grand total are stored under the :_sum key.
def contingency_observed_vals(x_values, y_values, x_keys, y_keys)
  observed = { _sum: { _sum: 0 } }
  y_keys.each { |y_val| observed[:_sum][y_val] = 0 }
  x_keys.each { |x_val|
    observed[x_val] = { _sum: 0 }
    y_keys.each { |y_val| observed[x_val][y_val] = 0 }
  }
  x_values.each_with_index { |x_val, idx|
    y_val = y_values[idx]
    observed[x_val][y_val] += 1
    observed[:_sum][y_val] += 1
    observed[x_val][:_sum] += 1
    observed[:_sum][:_sum] += 1
  }
  observed
end

# Expected count of each cell: (y total * x total) / grand total.
def contingency_expected_vals(observed, x_keys, y_keys)
  grand_total = observed[:_sum][:_sum].to_f
  x_keys.each_with_object({}) { |x_val, expected|
    cells = expected[x_val] = {}
    y_keys.each { |y_val|
      cells[y_val] = (observed[:_sum][y_val] * observed[x_val][:_sum]) / grand_total
    }
  }
end

# Sums (observed - expected)^2 / expected over all cells.
def contingency_chi_squared(observed, expected, x_keys, y_keys)
  chi_squared = 0
  x_keys.each { |x_val|
    y_keys.each { |y_val|
      cell_expected = expected[x_val][y_val]
      chi_squared += ((observed[x_val][y_val] - cell_expected) ** 2) / cell_expected
    }
  }
  chi_squared
end

# Per cell presentation attributes: deviation from the expected count, the
# color derived from the relative deviation, and the cell's share of its
# totals. Shares of the grand total are stored under the :_sum keys.
def contingency_delta_attrs(observed, expected, x_keys, y_keys)
  grand_total = observed[:_sum][:_sum].to_f
  color_scale = RailsDataExplorer::Utils::ColorScale.new
  delta_attrs = { _sum: {} }
  x_keys.each { |x_val|
    delta_attrs[x_val] = {
      _sum: { percentage_of_row: (observed[x_val][:_sum] / grand_total) * 100 }
    }
    y_keys.each { |y_val|
      cell_observed = observed[x_val][y_val]
      cell_expected = expected[x_val][y_val]
      delta = cell_observed - cell_expected
      delta_factor = delta / cell_expected.to_f
      delta_attrs[x_val][y_val] = {
        expected: cell_expected,
        color: color_scale.compute(delta_factor),
        delta: delta,
        delta_factor: delta_factor,
        percentage_of_row: (cell_observed / observed[:_sum][y_val].to_f) * 100,
        percentage_of_col: (cell_observed / observed[x_val][:_sum].to_f) * 100,
      }
      delta_attrs[:_sum][y_val] ||= {
        percentage_of_col: (observed[:_sum][y_val] / grand_total) * 100
      }
    }
  }
  delta_attrs
end

# Builds the human readable conclusion from @observed_vals, @p_value and
# @significance_level. The caller marks the result html_safe, so everything
# interpolated from the data series is HTML-escaped here.
def contingency_conclusion(x_ds, y_ds, x_keys, y_keys)
  cell_counts = x_keys.flat_map { |x_val|
    y_keys.map { |y_val| @observed_vals[x_val][y_val] }
  }
  # NOTE(review): the usual rule of thumb for Pearson's test is stated in
  # terms of *expected* cell counts, whereas this check uses the observed
  # counts. Confirm that this is intended.
  ratio_below_five = cell_counts.count { |e| e < 5 } / cell_counts.length.to_f
  test_name = 'Pearson chi squared test of independence'

  if ratio_below_five > 0.2
    "We did not run the #{ test_name } " \
      "since #{ number_to_percentage(ratio_below_five * 100, precision: 0) } " \
      "of observed values in the contingency table are below 5 (cutoff is 20%)."
  elsif x_keys.length < 2 || y_keys.length < 2
    "We did not run the #{ test_name } " \
      "since there are not enough observed values in the contingency table."
  else
    # Series names are data, not markup. ERB::Util ships with Ruby and is
    # loaded by ActiveSupport's output safety extension, the same one that
    # provides String#html_safe.
    x_name = ERB::Util.html_escape(x_ds.name)
    y_name = ERB::Util.html_escape(y_ds.name)
    p_value = number_with_precision(@p_value)
    level = number_with_precision(@significance_level)
    if @p_value <= @significance_level
      %(#{ test_name } suggests that "#{ x_name }" and "#{ y_name }" are dependent variables (p_value of #{ p_value } <= #{ level }))
    else
      %(#{ test_name } suggests that "#{ x_name }" and "#{ y_name }" are independent variables (p_value of #{ p_value } > #{ level }))
    end
  end
end
end
end
end