lib/string/similarity.rb in string-similarity-2.0.1 vs lib/string/similarity.rb in string-similarity-2.1.0

- old
+ new

# Build the n-gram frequency vector of +str+.
#
# A window of +ngram+ characters slides over the string. It starts partly
# before the first character and ends partly after the last one, so the
# leading and trailing characters also show up in truncated n-grams. The
# number of positions the window sticks out on each side is stored in the
# key, which keeps a truncated edge n-gram distinct from an identical
# substring found in the middle of the string.
#
# Keys have a special format:
# '[left padding, right padding, "string"]'
#
# @example
#   v1 = vector('aba', 1)  # => {'[0, 0, "a"]' => 2, '[0, 0, "b"]' => 1}
#   v1['[0, 0, "x"]']      # => 0
# @example
#   vector('abacaba', 2)   # => {
#                          #   '[1, 0, "a"]' => 1,
#                          #   '[0, 0, "ab"]' => 2,
#                          #   '[0, 0, "ba"]' => 2,
#                          #   '[0, 0, "ac"]' => 1,
#                          #   '[0, 0, "ca"]' => 1,
#                          #   '[0, 1, "a"]' => 1
#                          # }
#
# @param str [String] string to split into n-grams
# @param ngram [Integer] how many characters at once to use
# @return [Hash{String => Integer}] occurrences per n-gram key; keys that
#   were never seen read as 0
# @raise [ArgumentError] if +ngram+ is smaller than 1
def self.vector(str, ngram)
  # Same guard and message as .cosine, so the helper cannot silently
  # produce empty-string n-grams when it is called on its own.
  raise ArgumentError, 'ngram should be >= 1' if ngram < 1

  last = str.length - 1

  # Window start positions run from (1 - ngram), where only the last slot
  # of the window covers the first character, up to the last character.
  ((1 - ngram)..last).each_with_object(Hash.new(0)) do |start, counts|
    before = [-start, 0].max                  # slots hanging off the left edge
    after = [start + ngram - 1 - last, 0].max # slots hanging off the right edge
    slice = str[[start, 0].max..[start + ngram - 1, last].min]

    counts[[before, after, slice].to_s] += 1
  end
end