lib/ankusa/naive_bayes.rb in ankusa-0.0.14 vs lib/ankusa/naive_bayes.rb in ankusa-0.0.15

- old
+ new

@@ -4,43 +4,54 @@ class NaiveBayesClassifier include Classifier def classify(text, classes=nil) # return the most probable class - log_likelihoods(text, classes).sort_by { |c| -c[1] }.first.first + + result = log_likelihoods(text, classes) + if result.values.uniq.size. === 1 + # unless all classes are equally likely, then return nil + return nil + else + result.sort_by { |c| -c[1] }.first.first + end end - + # Classes is an array of classes to look at def classifications(text, classnames=nil) result = log_likelihoods text, classnames result.keys.each { |k| result[k] = (result[k] == -INFTY) ? 0 : Math.exp(result[k]) } # normalize to get probs - sum = result.values.inject { |x,y| x+y } - result.keys.each { |k| result[k] = result[k] / sum } + sum = result.values.inject{ |x,y| x+y } + result.keys.each { |k| + result[k] = result[k] / sum + } unless sum.zero? result end # Classes is an array of classes to look at def log_likelihoods(text, classnames=nil) classnames ||= @classnames result = Hash.new 0 TextHash.new(text).each { |word, count| probs = get_word_probs(word, classnames) - classnames.each { |k| + classnames.each { |k| # log likelihood should be negative infinity if we've never seen the klass result[k] += probs[k] > 0 ? (Math.log(probs[k]) * count) : -INFTY } } # add the prior doc_counts = doc_count_totals.select { |k,v| classnames.include? k }.map { |k,v| v } - doc_count_total = (doc_counts.inject { |x,y| x+y } + classnames.length).to_f - classnames.each { |k| - result[k] += Math.log((@storage.get_doc_count(k) + 1).to_f / doc_count_total) + + doc_count_total = (doc_counts.inject(0){ |x,y| x+y } + classnames.length).to_f + + classnames.each { |k| + result[k] += Math.log((@storage.get_doc_count(k) + 1).to_f / doc_count_total) } result end