classifier.rb in ankusa-0.0.6

- old
+ new

@@ -19,10 +19,13 @@
       }
       @storage.incr_total_word_count klass, th.word_count
       doccount = (text.kind_of? Array) ? text.length : 1
       @storage.incr_doc_count klass, doccount
       @classnames << klass if not @classnames.include? klass
+      # counts changed: reset the cached values so they are reloaded on next use
+      @doc_count_totals = nil
+      @vocab_sizes = nil
       th
     end
 
     # text can be either an array of strings or a string
     # klass is a symbol
@@ -33,45 +36,71 @@
         yield word, count if block_given?
       }
       @storage.incr_total_word_count klass, -th.word_count
       doccount = (text.kind_of? Array) ? text.length : 1
       @storage.incr_doc_count klass, -doccount
+      # counts changed: reset the cached values so they are reloaded on next use
+      @doc_count_totals = nil
+      @vocab_sizes = nil
       th
     end
 
-    def classify(text)
+    def classify(text, classes=nil)
       # return the most probable class
-      classifications(text).sort_by { |c| -c[1] }.first.first
+      log_likelihoods(text, classes).sort_by { |c| -c[1] }.first.first
     end
     
-    def classifications(text)
+    # classnames is an optional array of classes to consider (nil means all known classes)
+    def classifications(text, classnames=nil)
+      result = log_likelihoods text, classnames
+      result.keys.each { |k|
+        result[k] = Math.exp result[k] 
+      }
+
+      # normalize to get probs
+      sum = result.values.inject { |x,y| x+y }
+      result.keys.each { |k| result[k] = result[k] / sum }
+      result
+    end
+
+    # classnames is an optional array of classes to consider (nil means all known classes)
+    def log_likelihoods(text, classnames=nil)
+      classnames ||= @classnames
       result = Hash.new 0
 
       TextHash.new(text).each { |word, count|
-        probs = get_word_probs(word)
-        @classnames.each { |k| result[k] += (Math.log(probs[k]) * count) }
+        probs = get_word_probs(word, classnames)
+        classnames.each { |k| result[k] += (Math.log(probs[k]) * count) }
       }
 
       # add the prior and exponentiate
-      @classnames.each { |k| 
-        result[k] += Math.log(@storage.get_doc_count(k).to_f / @storage.doc_count_total.to_f) 
-        result[k] = Math.exp(result[k])
+      doc_counts = doc_count_totals.select { |k,v| classnames.include? k }.map { |k,v| v }
+      doc_count_total = (doc_counts.inject { |x,y| x+y } + classnames.length).to_f
+      classnames.each { |k| 
+        result[k] += Math.log((@storage.get_doc_count(k) + 1).to_f / doc_count_total) 
       }
       
-      # normalize to get probs
-      sum = result.values.inject { |x,y| x+y }
-      @classnames.each { |k| result[k] = result[k] / sum }
       result
     end
 
     protected
-    def get_word_probs(word)
-      probs = @storage.get_word_counts(word)
-      @classnames.each { |cn| 
+    def get_word_probs(word, classnames)
+      probs = Hash.new 0
+      @storage.get_word_counts(word).each { |k,v| probs[k] = v if classnames.include? k }
+      vs = vocab_sizes
+      classnames.each { |cn| 
         # use a laplacian smoother
-        probs[cn] = (probs[cn] + 1).to_f / (@storage.get_total_word_count(cn) + 1).to_f
+        probs[cn] = (probs[cn] + 1).to_f / (@storage.get_total_word_count(cn) + vs[cn]).to_f
       }
       probs
+    end
+
+    def doc_count_totals
+      @doc_count_totals ||= @storage.doc_count_totals
+    end
+
+    def vocab_sizes
+      @vocab_sizes ||= @storage.get_vocabulary_sizes
     end
 
   end
 
 end