lib/groupie.rb in groupie-0.3.0 vs lib/groupie.rb in groupie-0.4.0
- old
+ new
@@ -1,86 +1,133 @@
-lib_dir = File.expand_path(File.dirname(__FILE__))
-$LOAD_PATH.unshift(lib_dir) unless $LOAD_PATH.include?(lib_dir)
-require 'groupie/group'
-require 'groupie/core_ext/string'
+# frozen_string_literal: true
+require_relative 'groupie/version'
+require_relative 'groupie/group'
+require_relative 'groupie/core_ext/string'
+
+# Groupie is a text grouper and classifier, using naive Bayesian filtering.
class Groupie
+ # Wrap all errors we raise in this so our own errors are recognizable.
+ class Error < StandardError; end
+
def initialize
@groups = {}
end
+ # Turn a String (or anything else that responds to #to_s) into an Array of String tokens.
+ # This attempts to remove most common punctuation marks and types of whitespace.
+ #
+ # @param [String, #to_s] object
+ # @return [Array<String>]
+ def self.tokenize(object)
+ object
+ .to_s
+ .downcase
+ .gsub(/\s/, ' ')
+ .gsub(/[$']/, '')
+ .gsub(/<[^>]+?>|[^\w -.,]/, '')
+ .split.map { |str| str.gsub(/\A['"]+|[!,."']+\Z/, '') }
+ end
+
+ # Access an existing Group or create a new one.
+ #
+ # @param [Object] group The name of the group to access.
+ # @return [Groupie::Group] An existing or new group identified by +group+.
def [](group)
@groups[group] ||= Group.new(group)
end
- def unique_words
- @unique_words ||= (
- total_count = @groups.values.map {|group| group.word_counts}.inject{|total, counts| total.merge(counts){|key,o,n| o+n}}
- median_index = [total_count.values.size * 3 / 4 - 1, 1].max
- median_frequency = total_count.values.sort[median_index]
- total_count.select{|word, count| count <= median_frequency}.map(&:first)
- )
+ # Classify a text by taking the average of all word classifications.
+ #
+ # @param [Array<String>] words List of words to be classified
+ # @param [Symbol] strategy
+ # @return [Hash<Object, Float>] Hash with <group, score> pairings. Scores are always in 0.0..1.0
+ # @raise [Groupie::Error] Raise when an invalid strategy is provided
+ def classify_text(words, strategy = :sum)
+ words &= unique_words if strategy == :unique
+ group_score_sums, hits = calculate_group_scores(words, strategy)
+
+ group_score_sums.each.with_object({}) do |(group, sum), averages|
+ averages[group] = hits.positive? ? sum / hits : 0
+ end
end
- def classify(entry, strategy=:sum)
+ # Classify a single word against all groups.
+ #
+ # @param [String] entry A word to be classified
+ # @param [Symbol] strategy
+ # @return [Hash<Object, Float>] Hash with <group, score> pairings. Scores are always in 0.0..1.0
+ # @raise [Groupie::Error] Raise when an invalid strategy is provided
+ def classify(entry, strategy = :sum)
results = {}
- total_count = @groups.inject(0) do |sum, name_group|
- group = name_group.last
- count = group.count(entry)
- case strategy
- when :sum
- sum += count
- when :sqrt, :unique
- sum += Math::sqrt(count)
- when :log
- sum += Math::log10(count) if count > 0
- else
- raise "Invalid strategy: #{strategy}"
- end
- next sum
+ total_count = @groups.values.inject(0) do |sum, group|
+ sum + apply_count_strategy(group.count(entry), strategy)
end
- return results if 0 == total_count
+ return results if total_count.zero?
@groups.each do |name, group|
- count = group.count(entry)
- case strategy
- when :sum
- # keep count
- when :sqrt, :unique
- count = Math::sqrt(count)
- when :log
- count = Math::log10(count) if count > 0
- else
- raise "Invalid strategy: #{strategy}"
- end
- results[name] = count > 0 ? count.to_f / total_count : 0.0
+ count = apply_count_strategy(group.count(entry), strategy)
+ results[name] = count.positive? ? count.to_f / total_count : 0.0
end
- return results
+
+ results
end
- # Classify a text by taking the average of all word classifications.
- def classify_text(words, strategy=:sum)
- hits = 0
- if strategy==:unique
- words = words & unique_words
+ # Return a word score dictionary that excludes the 4th quartile most popular words.
+ # Why do this? So the most common (and thus meaningless) words are ignored
+ # and less common words gain more predictive power.
+ #
+ # This is used by the :unique strategy of the classifier.
+ #
+ # @return [Hash<String, Integer>]
+ def unique_words
+ # Iterate over all Groups and merge their <word, count> dictionaries into one
+ total_count = @groups.inject({}) do |total, (_name, group)|
+ total.merge!(group.word_counts) { |_key, o, n| o + n }
end
- group_score_sums = words.inject({}) do |results, word|
+ # Extract the word count that's at the top 75%
+ top_quartile_index = [total_count.size * 3 / 4 - 1, 1].max
+ top_quartile_frequency = total_count.values.sort[top_quartile_index]
+ # Throw out all words which have a count that's above this frequency
+ total_count.reject! { |_word, count| count > top_quartile_frequency }
+ total_count.keys
+ end
+
+ private
+
+ # Calculate grouped scores
+ #
+ # @param [Array<String>] words
+ # @param [Symbol] strategy
+ # @return [Array<Enumerator<String>, Integer>] a Hash with <group, score> pairs and an integer with the number of hits
+ def calculate_group_scores(words, strategy)
+ hits = 0
+ group_score_sums = words.each.with_object({}) do |word, results|
word_results = classify(word, strategy)
next results if word_results.empty?
+
hits += 1
- results.merge(word_results) do |key, old, new|
- old + new
- end
+ results.merge!(word_results) { |_key, old, new| old + new }
end
- averages={}
- group_score_sums.each do |group, sum|
- averages[group] = hits > 0 ? sum / hits : 0
- end
-
- averages
+ [group_score_sums, hits]
end
- def self.version
- File.read(File.join(File.dirname(File.expand_path(__FILE__)), "..", "VERSION")).strip
+ # Helper function to reduce a raw word count to a strategy-modified weight.
+ # @param [Integer] count
+ # @param [Symbol] strategy
+ # @return [Integer, Float]
+ # @raise [Groupie::Error] Raise when an invalid strategy is provided
+ def apply_count_strategy(count, strategy)
+ case strategy
+ when :sum
+ # keep count
+ when :sqrt, :unique
+ count = Math.sqrt(count)
+ when :log
+ count = Math.log10(count) if count.positive?
+ else
+ raise Error, "Invalid strategy: #{strategy}"
+ end
+ count
end
-end
\ No newline at end of file
+end