module RubyStatistics
  module StatisticalTest
    # Wilcoxon rank-sum test (also known as the Mann-Whitney U test) for two
    # groups of observations, evaluated through a normal approximation of the
    # U statistic, with a tie-corrected standard deviation when ties are present.
    class WilcoxonRankSumTest
      # Ranks every element by its 1-based position in the sorted input.
      #
      # Elements that compare equal with +==+ belong to the same tie group, even
      # when they are distinct Hash keys (e.g. +1+ and +1.0+). Every key of a tie
      # group points to the *same* entry object, so looking up any member of the
      # group yields the statistics of the whole group.
      #
      # @param elements [Array] mutually comparable observations (they are sorted)
      # @return [Hash] { element => { counter: size of the tie group,
      #                               rank: sum of the positions held by the group } }
      def rank(elements)
        ranked_elements = {}
        current_entry = nil
        previous = nil

        elements.sort.each_with_index do |element, index|
          if current_entry && previous == element
            # Still inside the same tie group: accumulate so the average rank can
            # be derived later as rank / counter.
            current_entry[:counter] += 1
            current_entry[:rank] += index + 1
          else
            current_entry = { counter: 1, rank: index + 1 }
          end

          # Re-assigning an existing key keeps its insertion order and simply
          # points it (again) at the shared entry of its tie group.
          ranked_elements[element] = current_entry
          previous = element
        end

        # ranked_elements = { x => { counter: 1, rank: y } }
        ranked_elements
      end

      # Runs the test and reports the U statistic, z score and p-value.
      #
      # Steps to perform the calculation are based on http://www.mit.edu/~6.s085/notes/lecture5.pdf
      #
      # No validation is performed on the groups: an empty group makes the
      # standard deviation zero, so +z+ becomes 0.0 / 0.0 (NaN).
      #
      # @param alpha [Numeric] significance level compared against the p-value
      # @param tails [Symbol] +:two_tail+ doubles the p-value; any other value
      #   leaves the one-tailed p-value untouched
      # @param group_one [Array] first sample
      # @param group_two [Array] second sample
      # @return [Hash] with keys +:probability+, +:u+, +:z+, +:p_value+, +:alpha+,
      #   +:null+ (alpha < p_value), +:alternative+ (p_value <= alpha) and
      #   +:confidence_level+ (1 - alpha)
      def perform(alpha, tails, group_one, group_two)
        # Size for each group
        n1 = group_one.size
        n2 = group_two.size

        # Rank all data
        total_ranks = rank(group_one + group_two)

        # Sum rankings per group
        r1 = ranked_sum_for(total_ranks, group_one)
        r2 = ranked_sum_for(total_ranks, group_two)

        # Calculate the U statistic. The sign of each difference is irrelevant
        # because only the smaller magnitude is kept.
        u1 = (n1 * (n1 + 1) / 2.0) - r1
        u2 = (n2 * (n2 + 1) / 2.0) - r2

        u_statistic = [u1.abs, u2.abs].min

        median_u = (n1 * n2) / 2.0

        # Several keys may share one entry (see #rank), so count each tie group once.
        ties = total_ranks.values.uniq(&:object_id).select { |entry| entry[:counter] > 1 }

        std_u = if ties.empty?
                  Math.sqrt((n1 * n2 * (n1 + n2 + 1)) / 12.0)
                else
                  corrected_sigma(ties, n1, n2)
                end

        z = (u_statistic - median_u) / std_u

        # Most of the literature is not very specific about the normal distribution to be used.
        # We ran multiple tests with a Normal(median_u, std_u) and Normal(0, 1) and we found
        # the latter to be more aligned with the results.
        probability = Distribution::StandardNormal.new.cumulative_function(z.abs)
        p_value = 1 - probability
        p_value *= 2 if tails == :two_tail

        { probability: probability,
          u: u_statistic,
          z: z,
          p_value: p_value,
          alpha: alpha,
          null: alpha < p_value,
          alternative: p_value <= alpha,
          confidence_level: 1 - alpha }
      end

      # Standard deviation of U corrected for ties.
      #
      # Formula extracted from http://www.statstutor.ac.uk/resources/uploaded/mannwhitney.pdf
      #
      # @param ties [Array<Hash>] one entry per tie group (+:counter+ > 1)
      # @param total_group_one [Integer] size of the first group
      # @param total_group_two [Integer] size of the second group
      # @return [Float]
      private def corrected_sigma(ties, total_group_one, total_group_two)
        n = total_group_one + total_group_two

        # Sum the integer terms first and divide once.
        tie_correction = ties.sum { |tie| (tie[:counter]**3) - tie[:counter] } / 12.0

        left = Rational(total_group_one * total_group_two, n * (n - 1))
        right = (((n**3) - n) / 12.0) - tie_correction

        Math.sqrt(left * right)
      end

      # Sums the rank of every element of +group+, where tied elements receive
      # the average rank of their tie group (rank sum / group size), kept exact
      # as a Rational.
      #
      # @param total [Hash] ranking table as returned by #rank
      # @param group [Array] sample whose elements are all keys of +total+
      # @return [Rational, Integer] 0 for an empty group
      private def ranked_sum_for(total, group)
        group.sum do |element|
          entry = total[element]
          Rational(entry[:rank], entry[:counter])
        end
      end
    end

    # The Mann-Whitney U test and the Wilcoxon rank-sum test are the same test.
    # To make it available under both names, MannWhitneyU is simply an alias of
    # the class that holds the implementation.
    MannWhitneyU = WilcoxonRankSumTest
  end
end