lib/autosuggest.rb in autosuggest-0.1.1 vs lib/autosuggest.rb in autosuggest-0.1.2

- old
+ new

@@ -13,18 +13,22 @@ def initialize(top_queries)
     @top_queries = top_queries
     @concepts = {}
     @words = Set.new
     @non_duplicates = Set.new
-    @blocked_words = Set.new
-    @blacklisted_words = Set.new
+    @blocked_words = {}
+    @blacklisted_words = {}
     @preferred_queries = {}
-    @profane_words = Set.new(Obscenity::Base.blacklist)
+    @profane_words = {}
+    @concept_tree = {}
+    add_nodes(@profane_words, Obscenity::Base.blacklist)
   end
 
   def add_concept(name, values)
-    @concepts[name] = Set.new(values.compact.uniq.map(&:downcase))
+    values = values.compact.uniq
+    add_nodes(@concept_tree, values)
+    @concepts[name] = Set.new(values.map(&:downcase))
   end
 
   def parse_words(phrases, options = {})
     min = options[:min] || 1
@@ -48,32 +52,31 @@
       @non_duplicates << pair.map(&:downcase).sort
     end
   end
 
   def block_words(words)
-    words.each do |word|
-      @blocked_words << word.downcase
-    end
+    add_nodes(@blocked_words, words)
+    words
   end
 
   def blacklist_words(words)
     warn "[autosuggest] blacklist_words is deprecated. Use block_words instead."
-    words.each do |word|
-      @blacklisted_words << word.downcase
-    end
+    add_nodes(@blacklisted_words, words)
+    words
   end
 
   def prefer(queries)
     queries.each do |query|
       @preferred_queries[normalize_query(query)] ||= query
     end
   end
 
-  def suggestions
+  # TODO add queries method for filter: false and make suggestions use filter: true in 0.2.0
+  def suggestions(filter: false)
     stemmed_queries = {}
     added_queries = Set.new
-    @top_queries.sort_by { |_query, count| -count }.map do |query, count|
+    results = @top_queries.sort_by { |_query, count| -count }.map do |query, count|
       query = query.to_s
 
       # TODO do not ignore silently
       next if query.length < 2
@@ -108,16 +111,18 @@
       concepts = []
       @concepts.each do |name, values|
         concepts << name if values.include?(query)
       end
 
+      tokens = tokenize(query)
+
       # exclude misspellings that are not brands
-      misspelling = @words.any? && misspellings?(query)
+      misspelling = @words.any? && misspellings?(tokens)
 
-      profane = blocked?(query, @profane_words)
-      blocked = blocked?(query, @blocked_words)
-      blacklisted = blocked?(query, @blacklisted_words)
+      profane = blocked?(tokens, @profane_words)
+      blocked = blocked?(tokens, @blocked_words)
+      blacklisted = blocked?(tokens, @blacklisted_words)
 
       notes = []
       notes << "duplicate of #{duplicate}" if duplicate
       notes.concat(concepts)
       notes << "misspelling" if misspelling
@@ -138,10 +143,14 @@
       }
       result[:blacklisted] = blacklisted if @blacklisted_words.any?
       result[:notes] = notes
       result
     end
+    if filter
+      results.reject! { |s| s[:duplicate] || s[:misspelling] || s[:profane] || s[:blocked] }
+    end
+    results
   end
 
   def pretty_suggestions
     str = "%-30s %5s %s\n" % %w(Query Score Notes)
     suggestions.each do |suggestion|
@@ -150,44 +159,47 @@
     str
   end
 
   protected
 
-  def misspellings?(query)
-    recurse(tokenize(query)).each do |terms|
-      if terms.all? { |t| @concepts.any? { |_, values| values.include?(t) } || @words.include?(t) }
-        return false
+  def misspellings?(tokens)
+    pos = [0]
+    while i = pos.shift
+      return false if i == tokens.size
+
+      if @words.include?(tokens[i])
+        pos << i + 1
       end
+
+      node = @concept_tree[tokens[i]]
+      j = i
+      while node
+        j += 1
+        pos << j if node[:eos]
+        break if j == tokens.size
+        node = node[tokens[j]]
+      end
+
+      pos.uniq!
     end
     true
  end
 
-  def blocked?(query, blocked_words)
-    recurse(tokenize(query)).each do |terms|
-      return true if terms.any? { |t| blocked_words.include?(t) }
+  def blocked?(tokens, blocked_words)
+    tokens.each_with_index do |token, i|
+      node = blocked_words[token]
+      j = i
+      while node
+        return true if node[:eos]
+        j += 1
+        break if j == tokens.size
+        node = node[tokens[j]]
+      end
    end
     false
   end
 
-  def recurse(words)
-    if words.size == 1
-      [words]
-    else
-      result = [[words.join(" ")]]
-      i = 0
-      while i < words.size - 1
-        recurse(words[0..i]).each do |v1|
-          recurse(words[i + 1..-1]).each do |v2|
-            result << v1 + v2
-          end
-        end
-        i += 1
-      end
-      result.uniq
-    end
-  end
-
   def tokenize(str)
     str.to_s.downcase.split(" ")
   end
 
   # from https://blog.lojic.com/2008/09/04/how-to-write-a-spelling-corrector-in-ruby/
@@ -203,7 +215,18 @@
     deletion + transposition + alteration + insertion
   end
 
   def normalize_query(query)
     tokenize(query.to_s.gsub("&", "and")).map { |q| Lingua.stemmer(q) }.sort.join
+  end
+
+  def add_nodes(var, words)
+    words.each do |word|
+      node = var
+      tokenize(word).each do |token|
+        node = (node[token] ||= {})
+      end
+      node[:eos] = true
+    end
+    var
   end
 end
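
For context, here is a minimal standalone sketch (not taken from the gem's source) of the nested-hash trie that the new add_nodes helper builds and that blocked? walks: each phrase is split into tokens, a nested hash is created per token, and :eos marks the end of a phrase, so multi-word blocked phrases are matched against consecutive query tokens instead of enumerating every split with the removed recurse method. The phrases, query, and variable names below are illustrative values only.

# Illustrative only: phrases and query are made-up values, not library data.
trie = {}
["sell drugs", "spam"].each do |phrase|
  node = trie
  phrase.downcase.split(" ").each { |token| node = (node[token] ||= {}) }
  node[:eos] = true
end
# trie => {"sell"=>{"drugs"=>{:eos=>true}}, "spam"=>{:eos=>true}}

tokens = ["how", "to", "sell", "drugs", "online"]
blocked = tokens.each_with_index.any? do |token, i|
  node = trie[token]   # start walking the trie at this token position
  j = i
  match = false
  while node
    if node[:eos]      # a complete blocked phrase ends here
      match = true
      break
    end
    j += 1
    break if j == tokens.size
    node = node[tokens[j]]
  end
  match
end
blocked # => true, since "sell drugs" matches starting at token index 2

This token-level matching is also what the new filter keyword builds on: calling suggestions(filter: true) rejects results flagged as duplicate, misspelling, profane, or blocked before they are returned, while the default filter: false keeps the 0.1.1 behavior.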