lib/ankusa/classifier.rb in ankusa-0.0.6 vs lib/ankusa/classifier.rb in ankusa-0.0.7
- old
+ new
@@ -1,8 +1,8 @@
module Ankusa
- class Classifier
+ module Classifier
attr_reader :classnames
def initialize(storage)
@storage = storage
@storage.init_tables
@@ -40,47 +40,9 @@
@storage.incr_doc_count klass, -doccount
# cache is now dirty of these vars
@doc_count_totals = nil
@vocab_sizes = nil
th
- end
-
- def classify(text, classes=nil)
- # return the most probable class
- log_likelihoods(text, classes).sort_by { |c| -c[1] }.first.first
- end
-
- # Classes is an array of classes to look at
- def classifications(text, classnames=nil)
- result = log_likelihoods text, classnames
- result.keys.each { |k|
- result[k] = Math.exp result[k]
- }
-
- # normalize to get probs
- sum = result.values.inject { |x,y| x+y }
- result.keys.each { |k| result[k] = result[k] / sum }
- result
- end
-
- # Classes is an array of classes to look at
- def log_likelihoods(text, classnames=nil)
- classnames ||= @classnames
- result = Hash.new 0
-
- TextHash.new(text).each { |word, count|
- probs = get_word_probs(word, classnames)
- classnames.each { |k| result[k] += (Math.log(probs[k]) * count) }
- }
-
- # add the prior and exponentiate
- doc_counts = doc_count_totals.select { |k,v| classnames.include? k }.map { |k,v| v }
- doc_count_total = (doc_counts.inject { |x,y| x+y } + classnames.length).to_f
- classnames.each { |k|
- result[k] += Math.log((@storage.get_doc_count(k) + 1).to_f / doc_count_total)
- }
-
- result
end
protected
def get_word_probs(word, classnames)
probs = Hash.new 0