lib/ankusa/classifier.rb in ankusa-0.0.5 vs lib/ankusa/classifier.rb in ankusa-0.0.6
- old
+ new
@@ -19,10 +19,13 @@
}
@storage.incr_total_word_count klass, th.word_count
doccount = (text.kind_of? Array) ? text.length : 1
@storage.incr_doc_count klass, doccount
@classnames << klass if not @classnames.include? klass
+ # the cached values below are now stale; clear them so they are re-read from storage
+ @doc_count_totals = nil
+ @vocab_sizes = nil
th
end
# text can be either an array of strings or a string
# klass is a symbol
@@ -33,45 +36,71 @@
yield word, count if block_given?
}
@storage.incr_total_word_count klass, -th.word_count
doccount = (text.kind_of? Array) ? text.length : 1
@storage.incr_doc_count klass, -doccount
+ # the cached values below are now stale; clear them so they are re-read from storage
+ @doc_count_totals = nil
+ @vocab_sizes = nil
th
end
# Returns the single most probable class for text.
# Old version: ranked the normalized probabilities from classifications.
# New version: takes an optional list of classes and ranks the raw
# log-likelihoods directly, without exponentiating or normalizing first.
# NOTE(review): if the scored hash is empty, sort_by yields [] and
# `.first.first` is then called on nil -- confirm callers never hit this.
- def classify(text)
+ def classify(text, classes=nil)
# return the most probable class
# (sort the [class, score] pairs by descending score, take the top pair's class)
- classifications(text).sort_by { |c| -c[1] }.first.first
+ log_likelihoods(text, classes).sort_by { |c| -c[1] }.first.first
end
# Returns a hash of class => probability for text, normalized so the
# values sum to 1 over the classes considered.
# In the new version the scoring itself lives in log_likelihoods; this
# method only exponentiates and normalizes its result.
- def classifications(text)
+ # classnames is an optional array of classes to look at; it is passed straight through to log_likelihoods
+ def classifications(text, classnames=nil)
+ result = log_likelihoods text, classnames
# turn each log-likelihood back into an (unnormalized) likelihood
# NOTE(review): for long texts the log values may be very negative, in which
# case Math.exp could underflow to 0.0 and the sum below become 0.0 -- confirm.
+ result.keys.each { |k|
+ result[k] = Math.exp result[k]
+ }
+
+ # normalize to get probs
+ sum = result.values.inject { |x,y| x+y }
+ result.keys.each { |k| result[k] = result[k] / sum }
+ result
+ end
+
# Returns a hash of class => log-likelihood score for text (word terms plus
# a log prior). The values are left in log space and are not normalized.
# The removed ("-") lines below are the body of the old classifications
# method, which did the same scoring and then exponentiated and normalized.
+ # classnames is an optional array of classes to look at; defaults to @classnames
+ def log_likelihoods(text, classnames=nil)
+ classnames ||= @classnames
result = Hash.new 0
# for every word in the text, add count * log(P(word | class)) to each class
TextHash.new(text).each { |word, count|
- probs = get_word_probs(word)
- @classnames.each { |k| result[k] += (Math.log(probs[k]) * count) }
+ probs = get_word_probs(word, classnames)
+ classnames.each { |k| result[k] += (Math.log(probs[k]) * count) }
}
# add the log prior (the old code also exponentiated here; the new code stays in log space)
- @classnames.each { |k|
- result[k] += Math.log(@storage.get_doc_count(k).to_f / @storage.doc_count_total.to_f)
- result[k] = Math.exp(result[k])
# new prior: (docs in class + 1) / (docs in the considered classes + number of
# considered classes), i.e. add-one smoothing restricted to classnames
# NOTE(review): inject without an initial value returns nil for an empty
# list, so `+ classnames.length` would raise if no class has a cached count.
# NOTE(review): the numerator reads @storage.get_doc_count directly while the
# denominator uses the cached doc_count_totals -- confirm they always agree.
+ doc_counts = doc_count_totals.select { |k,v| classnames.include? k }.map { |k,v| v }
+ doc_count_total = (doc_counts.inject { |x,y| x+y } + classnames.length).to_f
+ classnames.each { |k|
+ result[k] += Math.log((@storage.get_doc_count(k) + 1).to_f / doc_count_total)
}
# the normalization removed below now happens in classifications
- # normalize to get probs
- sum = result.values.inject { |x,y| x+y }
- @classnames.each { |k| result[k] = result[k] / sum }
result
end
protected
# Returns a hash of class => smoothed probability of word given that class.
# Old version: worked on the hash returned by storage, over all of @classnames.
# New version: builds a fresh hash restricted to the given classnames.
- def get_word_probs(word)
- probs = @storage.get_word_counts(word)
- @classnames.each { |cn|
+ def get_word_probs(word, classnames)
+ probs = Hash.new 0
# copy over only the counts for requested classes; classes with no stored count read as 0
+ @storage.get_word_counts(word).each { |k,v| probs[k] = v if classnames.include? k }
+ vs = vocab_sizes
+ classnames.each { |cn|
# use a laplacian smoother
# (the denominator changes from "+ 1" to "+ vs[cn]"; vs is presumably keyed by
# class and holds that class's vocabulary size -- TODO confirm against storage)
- probs[cn] = (probs[cn] + 1).to_f / (@storage.get_total_word_count(cn) + 1).to_f
+ probs[cn] = (probs[cn] + 1).to_f / (@storage.get_total_word_count(cn) + vs[cn]).to_f
}
probs
+ end
+
# Memoized reader for @storage.doc_count_totals (used by log_likelihoods as a
# collection of class / document-count pairs).
# The storage is only queried while @doc_count_totals is nil or false; the
# training/untraining code earlier in this diff sets it back to nil.
+ def doc_count_totals
+ @doc_count_totals ||= @storage.doc_count_totals
+ end
+
# Memoized reader for @storage.get_vocabulary_sizes (indexed by class name in
# get_word_probs).
# The storage is only queried while @vocab_sizes is nil or false; the
# training/untraining code earlier in this diff sets it back to nil.
+ def vocab_sizes
+ @vocab_sizes ||= @storage.get_vocabulary_sizes
end
end
end