lib/groupie.rb in groupie-0.4.1 vs lib/groupie.rb in groupie-0.5.0

- old
+ new

@@ -1,18 +1,23 @@
 # frozen_string_literal: true
 
 require_relative 'groupie/version'
 require_relative 'groupie/group'
-require_relative 'groupie/core_ext/string'
+require 'set'
 
 # Groupie is a text grouper and classifier, using naive Bayesian filtering.
 class Groupie
   # Wrap all errors we raise in this so our own errors are recognizable.
   class Error < StandardError; end
 
-  def initialize
+  attr_accessor :smart_weight
+
+  # @param [true, false] smart_weight (false) Whether smart weight is enabled or not.
+  def initialize(smart_weight: false)
     @groups = {}
+    @smart_weight = smart_weight
+    @known_words = Set.new
   end
 
   # Turn a String (or anything else that responds to #to_s) into an Array of String tokens.
   # This attempts to remove most common punctuation marks and types of whitespace.
   #
@@ -31,11 +36,11 @@
   # Access an existing Group or create a new one.
   #
   # @param [Object] group The name of the group to access.
   # @return [Groupie::Group] An existing or new group identified by +group+.
   def [](group)
-    @groups[group] ||= Group.new(group)
+    @groups[group] ||= Group.new(group, self)
   end
 
   # Classify a text by taking the average of all word classifications.
   #
   # @param [Array<String>] words List of words to be classified
@@ -49,29 +54,31 @@
     group_score_sums.each.with_object({}) do |(group, sum), averages|
       averages[group] = hits.positive? ? sum / hits : 0
     end
   end
 
-  # Classify a single word against all groups.
+  # Classify a single word against all groups, returning the probability distribution.
   #
   # @param [String] entry A word to be classified
-  # @param [Symbol] strategy
-  # @return [Hash<Object, Float>] Hash with <group, score> pairings. Scores are always in 0.0..1.0
+  # @param [Symbol] strategy (:sum) the strategy to use on the score
+  # @return [Hash<Object, Float>] Hash with <group, probability> pairings.
+  #   Probabilities are always in 0.0..1.0, and add up to 1.0 (i.e. it's a probability distribution)
   # @raise [Groupie::Error] Raise when an invalid strategy is provided
   def classify(entry, strategy = :sum)
-    results = {}
-    total_count = @groups.values.inject(0) do |sum, group|
-      sum + apply_count_strategy(group.count(entry), strategy)
+    # Calculate default weight once outside of the loop
+    default_weight = self.default_weight
+    # Each group calculates the count, then reduces it to a score: <group name, score>
+    per_group_score = @groups.transform_values do |group|
+      apply_count_strategy(default_weight + group.count(entry), strategy)
     end
-    return results if total_count.zero?
+    # When we have no scores, we have no results, so abort early
+    # Note that when smart_weight is enabled we always have a score.
+    total_score = per_group_score.values.sum
+    return {} if total_score.zero?
 
-    @groups.each do |name, group|
-      count = apply_count_strategy(group.count(entry), strategy)
-      results[name] = count.positive? ? count.to_f / total_count : 0.0
-    end
-
-    results
+    # Final results must be within 0.0..1.0, so divide each score by the total score
+    per_group_score.transform_values { |group_score| group_score.to_f / total_score }
   end
 
   # Return a word score dictionary that excludes the 4th quartile most popular words.
   # Why do this? So the most common (and thus meaningless) words are ignored
   # and less common words gain more predictive power.
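The hunks above change how classify builds its result: each group's word count (padded by default_weight, introduced in the next hunk) is reduced to a score and then normalized, so the return value is an explicit probability distribution. A minimal usage sketch of the new behaviour, assuming Group#add is the training method (it is not part of this diff) and that the :sum strategy uses raw word counts:

    require 'groupie'

    groupie = Groupie.new                         # smart_weight defaults to false
    # Train two groups; Group#add is assumed here and not shown in this diff.
    groupie[:spam].add(%w[buy cheap pills now])
    groupie[:ham].add(%w[lunch meeting at noon])

    # In 0.5.0 the result is documented as a probability distribution over the groups:
    # every value lies in 0.0..1.0 and the values add up to 1.0.
    scores = groupie.classify('cheap')
    scores.values.sum # => 1.0 when the word is known; classify returns {} when nothing scores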
@@ -83,14 +90,40 @@
     # Iterate over all Groups and merge their <word, count> dictionaries into one
     total_count = @groups.inject({}) do |total, (_name, group)|
       total.merge!(group.word_counts) { |_key, o, n| o + n }
     end
     # Extract the word count that's at the top 75%
-    top_quartile_index = [total_count.size * 3 / 4 - 1, 1].max
+    top_quartile_index = [((total_count.size * 3) / 4) - 1, 1].max
     top_quartile_frequency = total_count.values.sort[top_quartile_index]
     # Throw out all words which have a count that's above this frequency
     total_count.reject! { |_word, count| count > top_quartile_frequency }
     total_count.keys
+  end
+
+  # Default weight is used when +smart_weight+ is enabled.
+  # Each word's count is increased by the +default_weight+ value,
+  # which is the average frequency of each unique word we know about.
+  #
+  # Example: if we have indexed 1000 total words, of which 500 were unique,
+  # the default_weight would be 1000/500=2.0
+  #
+  # @return [Float] The default weight for all words
+  def default_weight
+    # Default weight only applies when smart weight is enabled
+    return 0.0 unless smart_weight
+
+    # If we don't know any words, the weight is also zero
+    return 0.0 unless @known_words.any?
+
+    # Gather counts and calculate
+    total_words = @groups.each_value.sum(&:total_word_count)
+    total_unique_words = @known_words.count
+    total_words / total_unique_words.to_f
+  end
+
+  # Private method used by Groups to register known words with the Group.
+  def add_word(word)
+    @known_words << word
   end
 
   private
 
   # Calculate grouped scores
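The new default_weight method implements the smart_weight smoothing described in its comment: the average frequency of all unique words seen so far is added to every group's count, so a group that has never seen a word still gets a small non-zero score instead of being ruled out entirely. A rough sketch of the effect, under the same Group#add assumption as above and assuming the :sum strategy leaves counts unchanged (exact scores depend on the strategy):

    require 'groupie'

    # default_weight = total word occurrences / unique words,
    # e.g. 1000 occurrences over 500 unique words => 1000 / 500.0 = 2.0

    groupie = Groupie.new(smart_weight: true)
    groupie[:spam].add(%w[buy cheap pills])   # 3 occurrences
    groupie[:ham].add(%w[lunch meeting])      # 2 occurrences; 5 unique words in total

    groupie.default_weight      # => 1.0 (5 occurrences / 5 unique words)
    groupie.classify('cheap')
    # Without smart_weight this would be { spam: 1.0, ham: 0.0 };
    # with it, :ham keeps a non-zero probability (roughly 1/3 here with the :sum strategy).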