lib/autosuggest.rb in autosuggest-0.1.3 vs lib/autosuggest.rb in autosuggest-0.2.0

- old
+ new

@@ -5,228 +5,13 @@ # dependencies require "lingua/stemmer" require "obscenity" # modules -require "autosuggest/version" +require_relative "autosuggest/generator" +require_relative "autosuggest/version" -class Autosuggest - def initialize(top_queries) - @top_queries = top_queries - @concepts = {} - @words = Set.new - @non_duplicates = Set.new - @blocked_words = {} - @blacklisted_words = {} - @preferred_queries = {} - @profane_words = {} - @concept_tree = {} - add_nodes(@profane_words, Obscenity::Base.blacklist) - end - - def add_concept(name, values) - values = values.compact.uniq - add_nodes(@concept_tree, values) - @concepts[name] = Set.new(values.map(&:downcase)) - end - - def parse_words(phrases, options = {}) - min = options[:min] || 1 - - word_counts = Hash.new(0) - phrases.each do |phrase| - words = tokenize(phrase) - words.each do |word| - word_counts[word] += 1 - end - end - - word_counts.select { |_, c| c >= min }.each do |word, _| - @words << word - end - - word_counts - end - - def not_duplicates(pairs) - pairs.each do |pair| - @non_duplicates << pair.map(&:downcase).sort - end - end - - def block_words(words) - add_nodes(@blocked_words, words) - words - end - - def blacklist_words(words) - warn "[autosuggest] blacklist_words is deprecated. Use block_words instead." - add_nodes(@blacklisted_words, words) - words - end - - def prefer(queries) - queries.each do |query| - @preferred_queries[normalize_query(query)] ||= query - end - end - - # TODO add queries method for filter: false and make suggestions use filter: true in 0.2.0 - def suggestions(filter: false) - stemmed_queries = {} - added_queries = Set.new - results = @top_queries.sort_by { |_query, count| -count }.map do |query, count| - query = query.to_s - - # TODO do not ignore silently - next if query.length < 2 - - stemmed_query = normalize_query(query) - - # get preferred term - preferred_query = @preferred_queries[stemmed_query] - if preferred_query && preferred_query != query - original_query, query = query, preferred_query - end - - # exclude duplicates - duplicate = stemmed_queries[stemmed_query] - stemmed_queries[stemmed_query] ||= query - - # also detect possibly misspelled duplicates - # TODO use top query as duplicate - if !duplicate && query.length > 4 - edits(query).each do |edited_query| - if added_queries.include?(edited_query) - duplicate = edited_query - break - end - end - end - if duplicate && @non_duplicates.include?([duplicate, query].sort) - duplicate = nil - end - added_queries << query unless duplicate - - # find concepts - concepts = [] - @concepts.each do |name, values| - concepts << name if values.include?(query) - end - - tokens = tokenize(query) - - # exclude misspellings that are not brands - misspelling = @words.any? && misspellings?(tokens) - - profane = blocked?(tokens, @profane_words) - blocked = blocked?(tokens, @blocked_words) - blacklisted = blocked?(tokens, @blacklisted_words) - - notes = [] - notes << "duplicate of #{duplicate}" if duplicate - notes.concat(concepts) - notes << "misspelling" if misspelling - notes << "profane" if profane - notes << "blocked" if blocked - notes << "blacklisted" if blacklisted - notes << "originally #{original_query}" if original_query - - result = { - query: query, - original_query: original_query, - score: count, - duplicate: duplicate, - concepts: concepts, - misspelling: misspelling, - profane: profane, - blocked: blocked - } - result[:blacklisted] = blacklisted if @blacklisted_words.any? - result[:notes] = notes - result - end - if filter - results.reject! { |s| s[:duplicate] || s[:misspelling] || s[:profane] || s[:blocked] } - end - results - end - - def pretty_suggestions - str = "%-30s %5s %s\n" % %w(Query Score Notes) - suggestions.each do |suggestion| - str << "%-30s %5d %s\n" % [suggestion[:query], suggestion[:score], suggestion[:notes].join(", ")] - end - str - end - - protected - - def misspellings?(tokens) - pos = [0] - while i = pos.shift - return false if i == tokens.size - - if @words.include?(tokens[i]) - pos << i + 1 - end - - node = @concept_tree[tokens[i]] - j = i - while node - j += 1 - pos << j if node[:eos] - break if j == tokens.size - node = node[tokens[j]] - end - - pos.uniq! - end - true - end - - def blocked?(tokens, blocked_words) - tokens.each_with_index do |token, i| - node = blocked_words[token] - j = i - while node - return true if node[:eos] - j += 1 - break if j == tokens.size - node = node[tokens[j]] - end - end - false - end - - def tokenize(str) - str.to_s.downcase.split(" ") - end - - # from https://blog.lojic.com/2008/09/04/how-to-write-a-spelling-corrector-in-ruby/ - LETTERS = ("a".."z").to_a.join + "'" - def edits(word) - n = word.length - deletion = (0...n).collect { |i| word[0...i] + word[i + 1..-1] } - transposition = (0...n - 1).collect { |i| word[0...i] + word[i + 1, 1] + word[i, 1] + word[i + 2..-1] } - alteration = [] - n.times { |i| LETTERS.each_byte { |l| alteration << word[0...i] + l.chr + word[i + 1..-1] } } - insertion = [] - (n + 1).times { |i| LETTERS.each_byte { |l| insertion << word[0...i] + l.chr + word[i..-1] } } - deletion + transposition + alteration + insertion - end - - def normalize_query(query) - tokenize(query.to_s.gsub("&", "and")).map { |q| Lingua.stemmer(q) }.sort.join - end - - def add_nodes(var, words) - words.each do |word| - node = var - tokenize(word).each do |token| - node = (node[token] ||= {}) - end - node[:eos] = true - end - var +module Autosuggest + def self.new(*args, **options) + Generator.new(*args, **options) end end