lib/ankusa/hbase_storage.rb in ankusa-0.0.5 vs lib/ankusa/hbase_storage.rb in ankusa-0.0.6

- old
+ new

@@ -51,16 +51,21 @@
       row = freq_table.get_row(word)
       return counts if row.length == 0

       row.first.columns.each { |colname, cell|
         classname = colname.split(':')[1].intern
-        counts[classname] = cell.to_i64.to_f
+        # in case untrain has been called too many times
+        counts[classname] = [cell.to_i64.to_f, 0].max
       }

       counts
     end

+    def get_vocabulary_sizes
+      get_summary "totals:vocabsize"
+    end
+
     def get_total_word_count(klass)
       @klass_word_counts.fetch(klass) {
         @klass_word_counts[klass] = summary_table.get(klass, "totals:wordcount").first.to_i64.to_f
       }
     end
@@ -70,33 +75,45 @@
         @klass_doc_counts[klass] = summary_table.get(klass, "totals:doccount").first.to_i64.to_f
       }
     end

     def incr_word_count(klass, word, count)
-      freq_table.atomic_increment word, "classes:#{klass.to_s}", count
+      size = freq_table.atomic_increment word, "classes:#{klass.to_s}", count
+      # if this is a new word, increase the klass's vocab size. If the new word
+      # count is 0, then we need to decrement our vocab size
+      if size == count
+        summary_table.atomic_increment klass, "totals:vocabsize"
+      elsif size == 0
+        summary_table.atomic_increment klass, "totals:vocabsize", -1
+      end
+      size
     end

     def incr_total_word_count(klass, count)
       @klass_word_counts[klass] = summary_table.atomic_increment klass, "totals:wordcount", count
     end

     def incr_doc_count(klass, count)
       @klass_doc_counts[klass] = summary_table.atomic_increment klass, "totals:doccount", count
     end

-    def doc_count_total
-      total = 0
-      summary_table.create_scanner("", "totals:doccount") { |row|
-        total += row.columns["totals:doccount"].to_i64
-      }
-      total
+    def doc_count_totals
+      get_summary "totals:doccount"
     end

     def close
       @hbase.close
     end

     protected
+    def get_summary(name)
+      counts = Hash.new 0
+      summary_table.create_scanner("", name) { |row|
+        counts[row.row.intern] = row.columns[name].to_i64
+      }
+      counts
+    end
+
     def summary_table
       @stable ||= @hbase.get_table @stablename
     end

     def freq_table