hbase_storage.rb in ankusa-0.0.6

- old
+ new

@@ -51,16 +51,21 @@
       row = freq_table.get_row(word)
       return counts if row.length == 0
 
       row.first.columns.each { |colname, cell|
         classname = colname.split(':')[1].intern
-        counts[classname] = cell.to_i64.to_f
+        # in case untrain has been called too many times
+        counts[classname] = [cell.to_i64.to_f, 0].max
       }
 
       counts
     end
 
+    # Returns a Hash of vocabulary sizes, one entry per summary-table row:
+    # the key is the row key interned to a Symbol and the value is that row's
+    # "totals:vocabsize" counter (built by get_summary). Missing keys read as 0.
+    def get_vocabulary_sizes
+      get_summary "totals:vocabsize"
+    end
+
     # Returns the total word count for +klass+, using @klass_word_counts as a
     # cache. On a cache miss the "totals:wordcount" cell of the klass's
     # summary-table row is read, converted to a Float and stored in the cache.
     # NOTE(review): a cached value written by incr_total_word_count is whatever
     # atomic_increment returned, so it may not be a Float -- confirm.
     def get_total_word_count(klass)
       @klass_word_counts.fetch(klass) {
         @klass_word_counts[klass] = summary_table.get(klass, "totals:wordcount").first.to_i64.to_f
       }
     end
@@ -70,33 +75,45 @@
         @klass_doc_counts[klass] = summary_table.get(klass, "totals:doccount").first.to_i64.to_f
       }
     end
 
+    # Adds +count+ to the "classes:<klass>" counter in +word+'s frequency-table
+    # row and keeps the klass's "totals:vocabsize" counter in step.
+    #
+    # Returns the value handed back by atomic_increment (presumably the
+    # counter's new value -- the comparisons below rely on that).
     def incr_word_count(klass, word, count)
-      freq_table.atomic_increment word, "classes:#{klass.to_s}", count
+      size = freq_table.atomic_increment word, "classes:#{klass.to_s}", count
+      # If the counter now equals the increment, the word was previously absent
+      # (or at 0) for this klass, so the klass's vocab size grows by one. If the
+      # counter has dropped to 0, the word has left the vocabulary and the vocab
+      # size shrinks by one.
+      # NOTE(review): size == count is also true for a negative or zero count
+      # applied to a missing/zero cell, which would bump the vocab size --
+      # confirm callers never untrain words that were not trained.
+      if size == count
+        summary_table.atomic_increment klass, "totals:vocabsize"
+      elsif size == 0
+        summary_table.atomic_increment klass, "totals:vocabsize", -1        
+      end
+      size
     end
 
     # Adds +count+ to the klass's "totals:wordcount" cell via atomic_increment
     # and stores the returned value in the @klass_word_counts cache, so later
     # get_total_word_count calls for this klass are served from the cache.
     # Returns the value that was cached.
     def incr_total_word_count(klass, count)
       @klass_word_counts[klass] = summary_table.atomic_increment klass, "totals:wordcount", count
     end
 
     # Adds +count+ to the klass's "totals:doccount" cell via atomic_increment
     # and stores the returned value in the @klass_doc_counts cache.
     # Returns the value that was cached.
     def incr_doc_count(klass, count)
       @klass_doc_counts[klass] = summary_table.atomic_increment klass, "totals:doccount", count
     end
 
-    def doc_count_total
-      total = 0
-      summary_table.create_scanner("", "totals:doccount") { |row|
-        total += row.columns["totals:doccount"].to_i64
-      }
-      total
+    # Returns a Hash of document counts, one entry per summary-table row: the
+    # key is the row key interned to a Symbol and the value is that row's
+    # "totals:doccount" counter. Missing keys read as 0.
+    # Unlike the removed doc_count_total, which returned one summed Integer,
+    # this returns the per-row breakdown; callers wanting the grand total must
+    # sum the values themselves.
+    def doc_count_totals
+      get_summary "totals:doccount"
     end
 
     # Closes the underlying @hbase handle by delegating to its #close.
     def close
       @hbase.close
     end
 
     protected
+    # Scans the summary table for column +name+ and collects one entry per
+    # row yielded by the scanner.
+    #
+    # name - the full column name to read, e.g. "totals:doccount".
+    #
+    # Returns a Hash with default value 0 that maps each row key (interned to
+    # a Symbol) to that row's +name+ cell read via to_i64.
+    def get_summary(name)
+      counts = Hash.new 0
+      # "" as the start row presumably makes the scanner begin at the first
+      # row of the table -- confirm against the HBase client in use.
+      summary_table.create_scanner("", name) { |row|
+        counts[row.row.intern] = row.columns[name].to_i64
+      }
+      counts
+    end
+
     # Returns the summary table handle, fetching it from @hbase by
     # @stablename on first use and memoizing it in @stable afterwards.
     def summary_table
       @stable ||= @hbase.get_table @stablename
     end
 
     def freq_table