require File.join File.dirname(__FILE__), 'helper'

# Shared fixture and storage-level assertions for the classifier test suites.
#
# The methods here read @storage and @classifier but never assign them, so the
# including test case (or an intermediate mixin's +setup+) must set both before
# any of these methods run.
module ClassifierBase
  # Feeds the fixture corpus to the classifier: one :spam document and two
  # :good documents. The trailing comments list the per-word counts that
  # test_train expects to find in storage afterwards.
  def train
    @classifier.train :spam, "spam and great spam"   # spam:2 great:1
    @classifier.train :good, "words for processing" # word:1 process:1
    @classifier.train :good, "good word"            # word:1 good:1
  end

  # Verifies the word counts, document counts and vocabulary sizes recorded in
  # storage for the fixture corpus. Assertions pass the expected value first so
  # that failure messages label "expected" and "actual" correctly.
  def test_train
    spam_counts = @storage.get_word_counts(:spam)
    assert_equal 2, spam_counts[:spam]
    word_counts = @storage.get_word_counts(:word)
    assert_equal 2, word_counts[:good]

    assert_equal 4, @storage.get_total_word_count(:good)
    assert_equal 2, @storage.get_doc_count(:good)
    assert_equal 3, @storage.get_total_word_count(:spam)
    assert_equal 1, @storage.get_doc_count(:spam)

    totals = @storage.doc_count_totals
    # Seeded with 0 so an empty totals hash yields 0 rather than nil.
    assert_equal 3, totals.values.inject(0) { |sum, count| sum + count }
    assert_equal 1, totals[:spam]
    assert_equal 2, totals[:good]

    vocab = @storage.get_vocabulary_sizes
    assert_equal 2, vocab[:spam]
    assert_equal 3, vocab[:good]
  end

  # Drops the storage tables and closes the storage. The close happens in an
  # +ensure+ clause so the handle is released even if dropping tables raises.
  def teardown
    @storage.drop_tables
  ensure
    @storage.close
  end
end


# Test mixin for Ankusa::NaiveBayesClassifier. The including test case must
# assign @storage before +setup+ runs; +setup+ builds the classifier on top of
# it and trains it with the ClassifierBase fixture corpus.
module NBClassifierBase
  include ClassifierBase

  # Tolerance used when comparing floating point probabilities and log
  # likelihoods; exact equality on computed floats is brittle.
  FLOAT_DELTA = 1e-9

  def setup
    @classifier = Ankusa::NaiveBayesClassifier.new @storage
    train
  end

  # After the storage is reset, both fixture classes are expected to be
  # reported as equally likely and classify is expected to return nil.
  def test_untrained
    @storage.reset

    text = "spam is tastey"

    expected = {:spam => 0.5, :good => 0.5}
    assert_equal expected, @classifier.classifications(text)
    assert_nil @classifier.classify(text)
  end

  # Checks the normalized probabilities and raw log likelihoods for a short
  # text against values computed by hand from the fixture corpus.
  def test_probs
    spamlog = Math.log(3.0 / 5.0) + Math.log(1.0 / 5.0) + Math.log(2.0 / 5.0)
    goodlog = Math.log(1.0 / 7.0) + Math.log(1.0 / 7.0) + Math.log(3.0 / 5.0)

    # exponentiate
    spamex = Math.exp(spamlog)
    goodex = Math.exp(goodlog)

    # normalize
    spam = spamex / (spamex + goodex)
    good = goodex / (spamex + goodex)

    cs = @classifier.classifications("spam is tastey")
    assert_in_delta spam, cs[:spam], FLOAT_DELTA
    assert_in_delta good, cs[:good], FLOAT_DELTA

    cs = @classifier.log_likelihoods("spam is tastey")
    assert_in_delta spamlog, cs[:spam], FLOAT_DELTA
    assert_in_delta goodlog, cs[:good], FLOAT_DELTA

    # training an extra class must not change the result when the
    # classification is restricted to the original two classes
    @classifier.train :somethingelse, "this is something else entirely spam"
    cs = @classifier.classifications("spam is tastey", [:spam, :good])
    assert_in_delta spam, cs[:spam], FLOAT_DELTA
    assert_in_delta good, cs[:good], FLOAT_DELTA

    # test for class we didn't train on
    cs = @classifier.classifications("spam is super tastey if you are a zombie", [:spam, :nothing])
    assert cs[:nothing] < Float::EPSILON
    assert cs[:nothing] < cs[:spam]
  end

  # classify must return the class with the highest probability reported by
  # classifications, which for this text is :spam.
  def test_prob_result
    most_likely = @classifier.classifications("spam is tastey").max_by { |_klass, prob| prob }.first
    klass = @classifier.classify("spam is tastey")
    assert_equal most_likely, klass
    assert_equal :spam, klass
  end
end


# Test mixin for Ankusa::KLDivergenceClassifier. The including test case must
# assign @storage before +setup+ runs; +setup+ builds the classifier on top of
# it and trains it with the ClassifierBase fixture corpus.
module KLClassifierBase
  include ClassifierBase

  # Tolerance used when comparing floating point distances; exact equality on
  # computed floats is brittle.
  FLOAT_DELTA = 1e-9

  def setup
    @classifier = Ankusa::KLDivergenceClassifier.new @storage
    train
  end

  # Compares the distances reported for "spam is tastey" with hand-computed
  # values. The per-class training probabilities below look like add-one
  # smoothed estimates, (count + 1) / (total words + vocabulary size), using
  # the totals asserted in test_train -- NOTE(review): confirm against the
  # classifier implementation.
  def test_distances
    ds = @classifier.distances("spam is tastey")

    # :spam -- "spam" seen twice, "tastey" never
    spam_dist = expected_distance((2 + 1).to_f / (3 + 2).to_f,
                                  (0 + 1).to_f / (3 + 2).to_f)
    assert_in_delta spam_dist, ds[:spam], FLOAT_DELTA

    # :good -- neither word seen
    good_dist = expected_distance(1.0 / (4 + 3).to_f,
                                  1.0 / (4 + 3).to_f)
    assert_in_delta good_dist, ds[:good], FLOAT_DELTA
  end

  # classify must return the class with the smallest distance, which for this
  # text is :spam; a class that was never trained must be infinitely far away.
  def test_distances_result
    nearest = @classifier.distances("spam is tastey").min_by { |_klass, dist| dist }.first
    klass = @classifier.classify("spam is tastey")
    assert_equal nearest, klass
    assert_equal :spam, klass

    # assert distance from class we didn't train with is Infinity (1.0/0.0 is a way to get at Infinity)
    infinity = 1.0 / 0.0
    cs = @classifier.distances("spam is tastey", [:spam, :nothing])
    assert_equal infinity, cs[:nothing]
  end

  private

  # Hand-computed distance for the text "spam is tastey" given a class's
  # training probabilities for "spam" and "tastey". Each of the two words is
  # given a probability of 1/2 within the text itself, and the result is the
  # sum over both words of p_text * log(p_text / p_train).
  def expected_distance(train_prob_spam, train_prob_tastey)
    text_prob = 1.0 / 2.0
    [train_prob_spam, train_prob_tastey].inject(0.0) do |dist, train_prob|
      dist + text_prob * Math.log(text_prob / train_prob)
    end
  end
end