lib/groupie.rb in groupie-0.5.0 vs lib/groupie.rb in groupie-0.6.0
- old
+ new
@@ -1,9 +1,10 @@
# frozen_string_literal: true
require_relative 'groupie/version'
require_relative 'groupie/group'
+require_relative 'groupie/tokenizer'
require 'set'
# Groupie is a text grouper and classifier, using naive Bayesian filtering.
class Groupie
# Wrap all errors we raise in this so our own errors are recognizable.
@@ -22,17 +23,11 @@
# This attempts to remove most common punctuation marks and types of whitespace.
#
# @param [String, #to_s] object
# @return [Array<String>]
def self.tokenize(object)
- object
- .to_s
- .downcase
- .gsub(/\s/, ' ')
- .gsub(/[$']/, '')
- .gsub(/<[^>]+?>|[^\w -.,]/, '')
- .split.map { |str| str.gsub(/\A['"]+|[!,."']+\Z/, '') }
+ Tokenizer.new(object).to_tokens
end
# Access an existing Group or create a new one.
#
# @param [Object] group The name of the group to access.
@@ -50,10 +45,10 @@
def classify_text(words, strategy = :sum)
words &= unique_words if strategy == :unique
group_score_sums, hits = calculate_group_scores(words, strategy)
group_score_sums.each.with_object({}) do |(group, sum), averages|
- averages[group] = hits.positive? ? sum / hits : 0
+ averages[group] = sum / hits
end
end
# Classify a single word against all groups, returning the probability distribution.
#