lib/omnicat/classifiers/bayes.rb in omnicat-0.1.1 vs lib/omnicat/classifiers/bayes.rb in omnicat-0.1.2

- old
+ new

@@ -1,10 +1,10 @@ module OmniCat module Classifiers class Bayes < ::OmniCat::Classifiers::Base - attr_accessor :categories, :category_count, :doc_count, :token_count + attr_accessor :categories, :category_count, :doc_count, :token_count, :uniq_token_count attr_accessor :k_value # helper val for skipping some Bayes theorem errors def initialize(bayes_hash = {}) self.categories = ::OmniCat::Hash.new if bayes_hash.has_key?(:categories) @@ -14,10 +14,11 @@ end self.category_count = bayes_hash[:category_count].to_i self.doc_count = bayes_hash[:doc_count].to_i self.k_value = bayes_hash[:k_value] || 1.0 self.token_count = bayes_hash[:token_count].to_i + self.uniq_token_count = bayes_hash[:uniq_token_count].to_i end # Allows adding new classification category # # ==== Parameters @@ -51,22 +52,30 @@ # # Train the desired category # bayes.train("positive", "clear documentation") # bayes.train("positive", "good, very well") # bayes.train("negative", "bad dog") # bayes.train("neutral", "how is the management gui") - def train(category, doc) - if category_exists?(category) + def train(category_name, doc) + if category_exists?(category_name) self.doc_count += 1 - categories[category].doc_count += 1 + categories[category_name].doc_count += 1 doc.tokenize_with_counts.each do |token, count| + uniq_token_addition = 0 + categories.each do |name, category| + if category.tokens.has_key?(token) + uniq_token_addition = 1 + break + end + end + self.uniq_token_count += 1 if uniq_token_addition == 0 self.token_count += count - self.categories[category].tokens[token] = self.categories[category].tokens[token].to_i + count - self.categories[category].token_count += count + self.categories[category_name].tokens[token] = self.categories[category_name].tokens[token].to_i + count + self.categories[category_name].token_count += count end else raise StandardError, - "Category with name '#{category}' does not exist!" + "Category with name '#{category_name}' does not exist!" end end # Classify the given document # @@ -92,17 +101,21 @@ result = ::OmniCat::Result.new categories.each do |name, category| prior = category.doc_count / doc_count.to_f result.scores[name] = k_value doc.tokenize_with_counts.each do |token, count| - result.scores[name] *= ( - (category.tokens[token].to_i + k_value) / - (category.token_count + token_count) - ) if category.tokens.has_key?(token) + if category.tokens[token].to_i == 0 + result.scores[name] *= k_value / token_count + else + result.scores[name] *= ( + count * ( + (category.tokens[token].to_i + k_value) / + (category.token_count + uniq_token_count) + ) + ) + end end - result.scores[name] = ( - result.scores[name].to_f == 1.0 ? 0 : (prior * result.scores[name]) - ) + result.scores[name] = prior * result.scores[name] if result.scores[name] > score result.category[:name] = name; score = result.scores[name]; end result.total_score += result.scores[name] \ No newline at end of file