lib/string/similarity.rb in string-similarity-1.0.1 vs lib/string/similarity.rb in string-similarity-1.1.0

- old
+ new

@@ -1,25 +1,38 @@ require 'string/similarity/version' class String - + # Returns the cosine similarity to +other+ + # @see String::Similarity#cosine def cosine_similarity_to(other) String::Similarity.cosine(self, other) end + # Returns the Levenshtein distance to +other+ + # @see String::Similarity.levenshtein_distance + def levenshtein_distance_to(other) + String::Similarity.levenshtein_distance(self, other) + end + + # Returns the Levenshtein similarity to +other+ + # @see String::Similarity.levenshtein + def levenshtein_similarity_to(other) + String::Similarity.levenshtein(self, other) + end + # +String::Similarity+ provides various methods for # calculating string distances. module Similarity extend self - - - # Calcuate the - # {https://en.wikipedia.org/wiki/Cosine_similarity Cosine similarity} - # of two strings. + # Calcuate the {https://en.wikipedia.org/wiki/Cosine_similarity + # Cosine similarity} of two strings. # + # For an explanation of the Cosine similarity of two strings read + # {http://stackoverflow.com/a/1750187/405454 this excellent SO answer}. + # # @param str1 [String] first string # @param str2 [String] second string - # @return [Float] cosine distance of the two arguments. + # @return [Float] cosine similarity of the two arguments. # - +1.0+ if the strings are identical # - +0.0+ if the strings are completely different # - +0.0+ if one of the strings is empty def cosine(str1, str2) return 1.0 if str1 == str2 @@ -34,34 +47,88 @@ # calculate the magnitude magnitude = mag(v1.values) * mag(v2.values) dot_product / magnitude end + # Calculate the Levenshtein similarity for two strings. + # + # This is basically the inversion of the levenshtein_distance, i.e. + # 1 / levenshtein_distance(str1, str2) + # + # @param str1 [String] first string + # @param str2 [String] second string + # @return [Float] levenshtein similarity of the two arguments. + # - +1.0+ if the strings are identical + # - +0.0+ if one of the strings is empty + # @see #levenshtein_distance + def levenshtein(str1, str2) + return 1.0 if str1.eql?(str2) + return 0.0 if str1.empty? || str2.empty? + 1.0 / levenshtein_distance(str1, str2) + end + + # Calculate the {https://en.wikipedia.org/wiki/Levenshtein_distance + # Levenshtein distance} of two strings. + # + # @param str1 [String] first string + # @param str2 [String] second string + # @return [Fixnum] edit distance between the two strings + # - +0+ if the strings are identical + def levenshtein_distance(str1, str2) + # base cases + return 0 if str1.eql?(str2) + return str2.length if str1.empty? + return str1.length if str2.empty? + + # Initialize cost-matrix rows + previous = (0..str2.length).to_a + current = [] + + (0...str1.length).each do |i| + # first element is always the edit distance from an empty string. + current[0] = i + 1 + (0...str2.length).each do |j| + current[j+1] = [ + # insertion + current[j] + 1, + # deletion + previous[j+1] + 1, + # substitution or no operation + previous[j] + (str1[i].eql?(str2[j]) ? 0 : 1) + ].min + end + previous = current.dup + end + + current[str2.length] + end + private # create a vector from +str+ # # @example - # vector('hello') # => {"h"=>1, "e"=>1, "l"=>2, "o"=>1} + # v1 = vector('hello') # => {"h"=>1, "e"=>1, "l"=>2, "o"=>1} + # v1["x"] # => 0 def vector(str) v = Hash.new(0) str.each_char { |c| v[c] += 1 } v end # calculate the dot product of +vector1+ and +vector2+ def dot(vector1, vector2) product = 0 - vector1.each do |k,v| + vector1.each do |k, v| product += v * vector2[k] end product end # calculate the magnitude for +vector+ def mag(vector) # calculate the sum of squares - sq = vector.inject(0) { |s,n| s + n**2 } + sq = vector.inject(0) { |s, n| s + n**2 } Math.sqrt(sq) end end end