lib/groupie.rb in groupie-0.4.1 vs lib/groupie.rb in groupie-0.5.0

- old
+ new

@@ -1,18 +1,23 @@
 # frozen_string_literal: true
 
 require_relative 'groupie/version'
 require_relative 'groupie/group'
-require_relative 'groupie/core_ext/string'
+require 'set'
 
 # Groupie is a text grouper and classifier, using naive Bayesian filtering.
 class Groupie
   # Wrap all errors we raise in this so our own errors are recognizable.
   class Error < StandardError; end
 
-  def initialize
+  attr_accessor :smart_weight
+
+  # @param [true, false] smart_weight (false) Whether smart weight is enabled or not.
+  def initialize(smart_weight: false)
     @groups = {}
+    @smart_weight = smart_weight
+    @known_words = Set.new
   end
 
   # Turn a String (or anything else that responds to #to_s) into an Array of String tokens.
   # This attempts to remove most common punctuation marks and types of whitespace.
   #
@@ -31,11 +36,11 @@
   # Access an existing Group or create a new one.
   #
   # @param [Object] group The name of the group to access.
   # @return [Groupie::Group] An existing or new group identified by +group+.
   def [](group)
-    @groups[group] ||= Group.new(group)
+    @groups[group] ||= Group.new(group, self)
   end
 
   # Classify a text by taking the average of all word classifications.
   #
   # @param [Array<String>] words List of words to be classified
@@ -49,29 +54,31 @@
     group_score_sums.each.with_object({}) do |(group, sum), averages|
       averages[group] = hits.positive? ? sum / hits : 0
     end
   end
 
-  # Classify a single word against all groups.
+  # Classify a single word against all groups, returning the probability distribution.
   #
   # @param [String] entry A word to be classified
-  # @param [Symbol] strategy
-  # @return [Hash<Object, Float>] Hash with <group, score> pairings. Scores are always in 0.0..1.0
+  # @param [Symbol] strategy (:sum) the strategy to use on the score
+  # @return [Hash<Object, Float>] Hash with <group, probability> pairings.
+  #   Probabilities are always in 0.0..1.0, and add up to 1.0 (i.e. it's a probability distribution)
   # @raise [Groupie::Error] Raise when an invalid strategy is provided
   def classify(entry, strategy = :sum)
-    results = {}
-    total_count = @groups.values.inject(0) do |sum, group|
-      sum + apply_count_strategy(group.count(entry), strategy)
+    # Calculate default weight once outside of the loop
+    default_weight = self.default_weight
+    # Each group calculates the count, then reduces it to a score: <group name, score>
+    per_group_score = @groups.transform_values do |group|
+      apply_count_strategy(default_weight + group.count(entry), strategy)
     end
-    return results if total_count.zero?
+    # When we have no scores, we have no results, so abort early
+    # Note that when smart_weight is enabled we always have a score.
+    total_score = per_group_score.values.sum
+    return {} if total_score.zero?
 
-    @groups.each do |name, group|
-      count = apply_count_strategy(group.count(entry), strategy)
-      results[name] = count.positive? ? count.to_f / total_count : 0.0
-    end
-
-    results
+    # Final results must be within 0.0..1.0, so divide each score by the total score
+    per_group_score.transform_values { |group_score| group_score.to_f / total_score }
   end
 
   # Return a word score dictionary that excludes the 4th quartile most popular words.
   # Why do this? So the most common (and thus meaningless) words are ignored
   # and less common words gain more predictive power.
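The hunks above change how classify builds its result: each group's word count (padded by default_weight, introduced in the next hunk) is reduced to a score and then normalized, so the return value is an explicit probability distribution. A minimal usage sketch of the new behaviour, assuming Group#add is the training method (it is not part of this diff) and that the :sum strategy uses raw word counts:

    require 'groupie'

    groupie = Groupie.new                         # smart_weight defaults to false
    # Train two groups; Group#add is assumed here and not shown in this diff.
    groupie[:spam].add(%w[buy cheap pills now])
    groupie[:ham].add(%w[lunch meeting at noon])

    # In 0.5.0 the result is documented as a probability distribution over the groups:
    # every value lies in 0.0..1.0 and the values add up to 1.0.
    scores = groupie.classify('cheap')
    scores.values.sum # => 1.0 when the word is known; classify returns {} when nothing scores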
@@ -83,14 +90,40 @@
     # Iterate over all Groups and merge their <word, count> dictionaries into one
     total_count = @groups.inject({}) do |total, (_name, group)|
       total.merge!(group.word_counts) { |_key, o, n| o + n }
     end
     # Extract the word count that's at the top 75%
-    top_quartile_index = [total_count.size * 3 / 4 - 1, 1].max
+    top_quartile_index = [((total_count.size * 3) / 4) - 1, 1].max
     top_quartile_frequency = total_count.values.sort[top_quartile_index]
     # Throw out all words which have a count that's above this frequency
     total_count.reject! { |_word, count| count > top_quartile_frequency }
     total_count.keys
+  end
+
+  # Default weight is used when +smart_weight+ is enabled.
+  # Each word's count is increased by the +default_weight+ value,
+  # which is the average frequency of each unique word we know about.
+  #
+  # Example: if we have indexed 1000 total words, of which 500 were unique,
+  # the default_weight would be 1000/500=2.0
+  #
+  # @return [Float] The default weight for all words
+  def default_weight
+    # Default weight only applies when smart weight is enabled
+    return 0.0 unless smart_weight
+
+    # If we don't know any words, the weight is also zero
+    return 0.0 unless @known_words.any?
+
+    # Gather counts and calculate
+    total_words = @groups.each_value.sum(&:total_word_count)
+    total_unique_words = @known_words.count
+    total_words / total_unique_words.to_f
+  end
+
+  # Private method used by Groups to register known words with the Group.
+  def add_word(word)
+    @known_words << word
   end
 
   private
 
   # Calculate grouped scores
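The new default_weight method implements the smart_weight smoothing described in its comment: the average frequency of all unique words seen so far is added to every group's count, so a group that has never seen a word still gets a small non-zero score instead of being ruled out entirely. A rough sketch of the effect, under the same Group#add assumption as above and assuming the :sum strategy leaves counts unchanged (exact scores depend on the strategy):

    require 'groupie'

    # default_weight = total word occurrences / unique words,
    # e.g. 1000 occurrences over 500 unique words => 1000 / 500.0 = 2.0

    groupie = Groupie.new(smart_weight: true)
    groupie[:spam].add(%w[buy cheap pills])   # 3 occurrences
    groupie[:ham].add(%w[lunch meeting])      # 2 occurrences; 5 unique words in total

    groupie.default_weight      # => 1.0 (5 occurrences / 5 unique words)
    groupie.classify('cheap')
    # Without smart_weight this would be { spam: 1.0, ham: 0.0 };
    # with it, :ham keeps a non-zero probability (roughly 1/3 here with the :sum strategy).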