lib/ankusa/classifier.rb in ankusa-0.0.6 vs lib/ankusa/classifier.rb in ankusa-0.0.7

- old
+ new

@@ -1,8 +1,8 @@
 module Ankusa
 
-  class Classifier
+  module Classifier
     attr_reader :classnames
 
     def initialize(storage)
       @storage = storage
       @storage.init_tables
@@ -40,47 +40,9 @@
       @storage.incr_doc_count klass, -doccount
       # cache is now dirty of these vars
       @doc_count_totals = nil
       @vocab_sizes = nil
       th
-    end
-
-    def classify(text, classes=nil)
-      # return the most probable class
-      log_likelihoods(text, classes).sort_by { |c| -c[1] }.first.first
-    end
-
-    # Classes is an array of classes to look at
-    def classifications(text, classnames=nil)
-      result = log_likelihoods text, classnames
-      result.keys.each { |k|
-        result[k] = Math.exp result[k]
-      }
-
-      # normalize to get probs
-      sum = result.values.inject { |x,y| x+y }
-      result.keys.each { |k| result[k] = result[k] / sum }
-      result
-    end
-
-    # Classes is an array of classes to look at
-    def log_likelihoods(text, classnames=nil)
-      classnames ||= @classnames
-      result = Hash.new 0
-
-      TextHash.new(text).each { |word, count|
-        probs = get_word_probs(word, classnames)
-        classnames.each { |k| result[k] += (Math.log(probs[k]) * count) }
-      }
-
-      # add the prior and exponentiate
-      doc_counts = doc_count_totals.select { |k,v| classnames.include? k }.map { |k,v| v }
-      doc_count_total = (doc_counts.inject { |x,y| x+y } + classnames.length).to_f
-      classnames.each { |k|
-        result[k] += Math.log((@storage.get_doc_count(k) + 1).to_f / doc_count_total)
-      }
-
-      result
     end
     protected
     def get_word_probs(word, classnames)
       probs = Hash.new 0