module Ankusa

  # Naive Bayes classifier backed by a pluggable storage object that
  # tracks per-class word counts and document counts.
  class Classifier
    attr_reader :classnames

    def initialize(storage)
      @storage = storage
      @storage.init_tables
      @classnames = @storage.classnames
    end

    # text can be either an array of strings or a string
    # klass is a symbol
    def train(klass, text)
      th = TextHash.new(text)
      th.each { |word, count|
        @storage.incr_word_count klass, word, count
        yield word, count if block_given?
      }
      @storage.incr_total_word_count klass, th.word_count
      doccount = (text.kind_of? Array) ? text.length : 1
      @storage.incr_doc_count klass, doccount
      @classnames << klass unless @classnames.include? klass
      th
    end

    # text can be either an array of strings or a string
    # klass is a symbol
    def untrain(klass, text)
      th = TextHash.new(text)
      th.each { |word, count|
        @storage.incr_word_count klass, word, -count
        yield word, count if block_given?
      }
      @storage.incr_total_word_count klass, -th.word_count
      doccount = (text.kind_of? Array) ? text.length : 1
      @storage.incr_doc_count klass, -doccount
      th
    end

    def classify(text)
      # return the most probable class
      classifications(text).sort_by { |c| -c[1] }.first.first
    end

    # returns a hash mapping each known class to its probability for text
    def classifications(text)
      result = Hash.new 0

      # accumulate each class's log-likelihood, word by word
      TextHash.new(text).each { |word, count|
        probs = get_word_probs(word)
        @classnames.each { |k| result[k] += (Math.log(probs[k]) * count) }
      }

      # add the prior and exponentiate
      @classnames.each { |k|
        result[k] += Math.log(@storage.get_doc_count(k).to_f / @storage.doc_count_total.to_f)
        result[k] = Math.exp(result[k])
      }

      # normalize to get probs
      sum = result.values.inject { |x, y| x + y }
      @classnames.each { |k| result[k] = result[k] / sum }
      result
    end

    protected

    # per-class probability of word, smoothed so unseen words never produce log(0)
    def get_word_probs(word)
      probs = @storage.get_word_counts(word)
      @classnames.each { |cn|
        # use a laplacian smoother
        probs[cn] = (probs[cn] + 1).to_f / (@storage.get_total_word_count(cn) + 1).to_f
      }
      probs
    end

  end
end
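
# -----------------------------------------------------------------------------
# Minimal usage sketch (illustration only, not part of the library). The storage
# object must implement the interface used above (init_tables, classnames,
# incr_word_count, incr_total_word_count, incr_doc_count, get_word_counts,
# get_doc_count, doc_count_total, get_total_word_count); the HBase-backed
# storage shown below is one assumed example, substitute whichever Ankusa
# storage backend you actually use.
#
#   require 'ankusa'
#   require 'ankusa/hbase_storage'   # assumed backend; any conforming storage works
#
#   storage    = Ankusa::HBaseStorage.new 'localhost'
#   classifier = Ankusa::Classifier.new storage
#
#   # train with a class symbol and some text
#   classifier.train :spam, "buy cheap pills now"
#   classifier.train :ham,  "meeting notes for tuesday"
#
#   classifier.classify "cheap pills"            # => :spam (most probable class)
#   classifier.classifications "cheap pills"     # => { :spam => ..., :ham => ... }
# -----------------------------------------------------------------------------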