lib/ankusa/hbase_storage.rb in ankusa-0.0.5 vs lib/ankusa/hbase_storage.rb in ankusa-0.0.6
- old
+ new
@@ -51,16 +51,21 @@
row = freq_table.get_row(word)
return counts if row.length == 0
row.first.columns.each { |colname, cell|
classname = colname.split(':')[1].intern
- counts[classname] = cell.to_i64.to_f
+ # in case untrain has been called too many times
+ counts[classname] = [cell.to_i64.to_f, 0].max
}
counts
end
+ # Returns get_summary's result for the "totals:vocabsize" column: a Hash
+ # (default 0) from each summary row key, as a Symbol, to that row's value.
+ def get_vocabulary_sizes
+ get_summary "totals:vocabsize"
+ end
+
# Returns the total word count recorded for klass as a Float.
#
# A value already present in @klass_word_counts is returned untouched.
# Otherwise the "totals:wordcount" cell of klass's summary row is read,
# converted with to_i64 and to_f, stored in @klass_word_counts and returned.
def get_total_word_count(klass)
  @klass_word_counts.fetch(klass) do
    cell = summary_table.get(klass, "totals:wordcount").first
    @klass_word_counts[klass] = cell.to_i64.to_f
  end
end
@@ -70,33 +75,45 @@
@klass_doc_counts[klass] = summary_table.get(klass, "totals:doccount").first.to_i64.to_f
}
end
+ # Adds count (negative when untraining) to word's "classes:<klass>" counter
+ # and keeps klass's "totals:vocabsize" in step. Returns the word's new count.
+ # Assumes atomic_increment returns the counter's value after the increment,
+ # as the size comparisons this replaces already did.
def incr_word_count(klass, word, count)
- freq_table.atomic_increment word, "classes:#{klass.to_s}", count
+ size = freq_table.atomic_increment word, "classes:#{klass.to_s}", count
+ # A word belongs to the klass's vocabulary only while its count is positive,
+ # so compare the count before this call with the count after it. Testing
+ # size == count / size == 0 alone miscounts when count is zero or negative
+ # for an unseen word, or when untraining pushes the count below zero.
+ previous = size - count
+ if previous <= 0 && size > 0
+ summary_table.atomic_increment klass, "totals:vocabsize"
+ elsif previous > 0 && size <= 0
+ summary_table.atomic_increment klass, "totals:vocabsize", -1
+ end
+ size
end
# Adds count to klass's "totals:wordcount" cell in the summary table, stores
# the value returned by atomic_increment in @klass_word_counts and returns it.
#
# NOTE(review): the value is cached exactly as returned, whereas
# get_total_word_count caches a Float -- confirm callers tolerate both.
def incr_total_word_count(klass, count)
  updated = summary_table.atomic_increment(klass, "totals:wordcount", count)
  @klass_word_counts[klass] = updated
end
# Adds count to klass's "totals:doccount" cell in the summary table, stores
# the value returned by atomic_increment in @klass_doc_counts and returns it.
#
# NOTE(review): the value is cached exactly as returned, while the doc-count
# reader caches a Float -- confirm callers tolerate both.
def incr_doc_count(klass, count)
  updated = summary_table.atomic_increment(klass, "totals:doccount", count)
  @klass_doc_counts[klass] = updated
end
- def doc_count_total
- total = 0
- summary_table.create_scanner("", "totals:doccount") { |row|
- total += row.columns["totals:doccount"].to_i64
- }
- total
+ # Returns get_summary's result for the "totals:doccount" column: a Hash
+ # (default 0) from each summary row key, as a Symbol, to that row's count.
+ # Unlike the removed doc_count_total, the per-row counts are not summed.
+ def doc_count_totals
+ get_summary "totals:doccount"
end
# Calls close on the object held in @hbase (presumably the HBase
# connection -- confirm against the constructor).
def close
@hbase.close
end
protected
+ # Scans the summary table for column name and returns a Hash (default 0)
+ # from each row key, interned to a Symbol, to that row's to_i64 cell value.
+ def get_summary(name)
+ totals = Hash.new(0)
+ summary_table.create_scanner("", name) do |entry|
+ totals[entry.row.intern] = entry.columns[name].to_i64
+ end
+ totals
+ end
+
# Returns the table named by @stablename, fetched from @hbase on first use
# and memoized in @stable for later calls.
def summary_table
  @stable ||= @hbase.get_table(@stablename)
end
def freq_table