lib/autosuggest.rb in autosuggest-0.1.1 vs lib/autosuggest.rb in autosuggest-0.1.2
- old
+ new
@@ -13,18 +13,22 @@
  def initialize(top_queries)
    @top_queries = top_queries
    @concepts = {}
    @words = Set.new
    @non_duplicates = Set.new
-   @blocked_words = Set.new
-   @blacklisted_words = Set.new
+   @blocked_words = {}
+   @blacklisted_words = {}
    @preferred_queries = {}
-   @profane_words = Set.new(Obscenity::Base.blacklist)
+   @profane_words = {}
+   @concept_tree = {}
+   add_nodes(@profane_words, Obscenity::Base.blacklist)
  end

  def add_concept(name, values)
-   @concepts[name] = Set.new(values.compact.uniq.map(&:downcase))
+   values = values.compact.uniq
+   add_nodes(@concept_tree, values)
+   @concepts[name] = Set.new(values.map(&:downcase))
  end

  def parse_words(phrases, options = {})
    min = options[:min] || 1
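
The word lists that were flat Sets are now nested hashes (token tries) built by the new add_nodes helper shown in the final hunk, and add_concept feeds the same structure through @concept_tree so that multi-word concept values can later be matched as consecutive tokens. A rough sketch with made-up values, assuming the gem's usual Autosuggest.new entry point:

    autosuggest = Autosuggest.new({"new york pizza" => 100})
    autosuggest.add_concept("city", ["New York", nil, "Boston"])
    # internal state, per the code in this diff:
    #   @concepts["city"] => #<Set: {"new york", "boston"}>
    #   @concept_tree     => {"new" => {"york" => {:eos => true}}, "boston" => {:eos => true}}
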
@@ -48,32 +52,31 @@
      @non_duplicates << pair.map(&:downcase).sort
    end
  end

  def block_words(words)
-   words.each do |word|
-     @blocked_words << word.downcase
-   end
+   add_nodes(@blocked_words, words)
+   words
  end

  def blacklist_words(words)
    warn "[autosuggest] blacklist_words is deprecated. Use block_words instead."
-   words.each do |word|
-     @blacklisted_words << word.downcase
-   end
+   add_nodes(@blacklisted_words, words)
+   words
  end

  def prefer(queries)
    queries.each do |query|
      @preferred_queries[normalize_query(query)] ||= query
    end
  end

- def suggestions
+ # TODO add queries method for filter: false and make suggestions use filter: true in 0.2.0
+ def suggestions(filter: false)
    stemmed_queries = {}
    added_queries = Set.new

-   @top_queries.sort_by { |_query, count| -count }.map do |query, count|
+   results = @top_queries.sort_by { |_query, count| -count }.map do |query, count|
      query = query.to_s

      # TODO do not ignore silently
      next if query.length < 2
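
With the tries in place, block_words (and the deprecated blacklist_words) simply hand their arguments to add_nodes, so multi-word phrases can be blocked as a whole, and suggestions gains an opt-in filter: keyword while the old unfiltered behavior stays the default. A usage sketch with made-up queries:

    autosuggest = Autosuggest.new({"fresh bread" => 20, "frsh bread" => 2})
    autosuggest.parse_words(["fresh bread"])
    autosuggest.block_words(["day old"])

    autosuggest.suggestions                  # every query, annotated with notes
    autosuggest.suggestions(filter: true)    # rows flagged duplicate/misspelling/profane/blocked are dropped
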
@@ -108,16 +111,18 @@
      concepts = []
      @concepts.each do |name, values|
        concepts << name if values.include?(query)
      end

+     tokens = tokenize(query)
+
      # exclude misspellings that are not brands
-     misspelling = @words.any? && misspellings?(query)
+     misspelling = @words.any? && misspellings?(tokens)

-     profane = blocked?(query, @profane_words)
-     blocked = blocked?(query, @blocked_words)
-     blacklisted = blocked?(query, @blacklisted_words)
+     profane = blocked?(tokens, @profane_words)
+     blocked = blocked?(tokens, @blocked_words)
+     blacklisted = blocked?(tokens, @blacklisted_words)

      notes = []
      notes << "duplicate of #{duplicate}" if duplicate
      notes.concat(concepts)
      notes << "misspelling" if misspelling
@@ -138,10 +143,14 @@
      }
      result[:blacklisted] = blacklisted if @blacklisted_words.any?
      result[:notes] = notes
      result
    end
+   if filter
+     results.reject! { |s| s[:duplicate] || s[:misspelling] || s[:profane] || s[:blocked] }
+   end
+   results
  end

  def pretty_suggestions
    str = "%-30s %5s %s\n" % %w(Query Score Notes)
    suggestions.each do |suggestion|
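
One small Ruby detail behind the new filter branch: Array#reject! returns nil when nothing was removed, which is presumably why results is returned explicitly on the following line rather than relying on reject!'s return value:

    [1, 2, 3].reject! { |n| n > 5 }  # => nil (nothing removed)
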
@@ -150,44 +159,47 @@
    str
  end

  protected

- def misspellings?(query)
-   recurse(tokenize(query)).each do |terms|
-     if terms.all? { |t| @concepts.any? { |_, values| values.include?(t) } || @words.include?(t) }
-       return false
+ def misspellings?(tokens)
+   pos = [0]
+   while i = pos.shift
+     return false if i == tokens.size
+
+     if @words.include?(tokens[i])
+       pos << i + 1
      end
+
+     node = @concept_tree[tokens[i]]
+     j = i
+     while node
+       j += 1
+       pos << j if node[:eos]
+       break if j == tokens.size
+       node = node[tokens[j]]
+     end
+
+     pos.uniq!
    end
    true
  end

- def blocked?(query, blocked_words)
-   recurse(tokenize(query)).each do |terms|
-     return true if terms.any? { |t| blocked_words.include?(t) }
+ def blocked?(tokens, blocked_words)
+   tokens.each_with_index do |token, i|
+     node = blocked_words[token]
+     j = i
+     while node
+       return true if node[:eos]
+       j += 1
+       break if j == tokens.size
+       node = node[tokens[j]]
+     end
    end
    false
  end

- def recurse(words)
-   if words.size == 1
-     [words]
-   else
-     result = [[words.join(" ")]]
-     i = 0
-     while i < words.size - 1
-       recurse(words[0..i]).each do |v1|
-         recurse(words[i + 1..-1]).each do |v2|
-           result << v1 + v2
-         end
-       end
-       i += 1
-     end
-     result.uniq
-   end
- end
-
  def tokenize(str)
    str.to_s.downcase.split(" ")
  end

  # from https://blog.lojic.com/2008/09/04/how-to-write-a-spelling-corrector-in-ruby/
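
The recursive recurse/misspellings? pair, which enumerated every way of splitting a query into sub-phrases, is replaced by a single left-to-right scan: a set of reachable token positions starts at 0, each position can advance by one known word or by any concept phrase found in @concept_tree, and the query counts as correctly spelled only if position tokens.size is reachable. A standalone sketch of the same idea (not the gem's API; words and concept_tree stand in for @words and @concept_tree):

    require "set"

    # True when every token can be covered by known words and/or concept phrases.
    def covered?(tokens, words, concept_tree)
      pos = [0]
      while i = pos.shift
        return true if i == tokens.size      # reached the end: fully covered
        pos << i + 1 if words.include?(tokens[i])
        node = concept_tree[tokens[i]]
        j = i
        while node
          j += 1
          pos << j if node[:eos]             # a concept phrase ends here
          break if j == tokens.size
          node = node[tokens[j]]
        end
        pos.uniq!
      end
      false                                  # scan died out
    end

    words = Set.new(["pizza"])
    concept_tree = {"new" => {"york" => {:eos => true}}}
    covered?(["new", "york", "pizza"], words, concept_tree)  # => true
    covered?(["new", "yrok", "pizza"], words, concept_tree)  # => false

blocked? does the same kind of walk over the blocked, blacklisted, and profane tries, but returns true as soon as it reaches a node marked :eos, so multi-word entries only match as consecutive tokens.
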
@@ -203,7 +215,18 @@
    deletion + transposition + alteration + insertion
  end

  def normalize_query(query)
    tokenize(query.to_s.gsub("&", "and")).map { |q| Lingua.stemmer(q) }.sort.join
+ end
+
+ def add_nodes(var, words)
+   words.each do |word|
+     node = var
+     tokenize(word).each do |token|
+       node = (node[token] ||= {})
+     end
+     node[:eos] = true
+   end
+   var
  end
end
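
add_nodes is the helper behind all of the tries above (profane, blocked, blacklisted, and concept values): each phrase is tokenized and threaded into nested hashes, with :eos marking where a complete phrase ends. A sketch with made-up blocked phrases:

    tree = {}
    add_nodes(tree, ["day old", "day trip", "stale"])
    # tree => {
    #   "day"   => {"old" => {:eos => true}, "trip" => {:eos => true}},
    #   "stale" => {:eos => true}
    # }

Given that tree, the new blocked? walk flags a query like "cheap day old bread" (the consecutive tokens "day", "old" reach a node with :eos) but not "day trips" or "old bread" on their own.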