lib/string/similarity.rb in string-similarity-1.1.0 vs lib/string/similarity.rb in string-similarity-1.1.1

- old
+ new

@@ -1,7 +1,8 @@ require 'string/similarity/version' +# For convenience, String is extended by a couple of helper methods class String # Returns the cosine similarity to +other+ # @see String::Similarity#cosine def cosine_similarity_to(other) String::Similarity.cosine(self, other) @@ -19,11 +20,11 @@ String::Similarity.levenshtein(self, other) end # +String::Similarity+ provides various methods for # calculating string distances. - module Similarity extend self + module Similarity # Calcuate the {https://en.wikipedia.org/wiki/Cosine_similarity # Cosine similarity} of two strings. # # For an explanation of the Cosine similarity of two strings read # {http://stackoverflow.com/a/1750187/405454 this excellent SO answer}. @@ -32,16 +33,17 @@ # @param str2 [String] second string # @return [Float] cosine similarity of the two arguments. # - +1.0+ if the strings are identical # - +0.0+ if the strings are completely different # - +0.0+ if one of the strings is empty - def cosine(str1, str2) + def self.cosine(str1, str2) return 1.0 if str1 == str2 return 0.0 if str1.empty? || str2.empty? # convert both texts to vectors - v1, v2 = vector(str1), vector(str2) + v1 = vector(str1) + v2 = vector(str2) # calculate the dot product dot_product = dot(v1, v2) # calculate the magnitude @@ -58,11 +60,11 @@ # @param str2 [String] second string # @return [Float] levenshtein similarity of the two arguments. # - +1.0+ if the strings are identical # - +0.0+ if one of the strings is empty # @see #levenshtein_distance - def levenshtein(str1, str2) + def self.levenshtein(str1, str2) return 1.0 if str1.eql?(str2) return 0.0 if str1.empty? || str2.empty? 1.0 / levenshtein_distance(str1, str2) end @@ -71,29 +73,28 @@ # # @param str1 [String] first string # @param str2 [String] second string # @return [Fixnum] edit distance between the two strings # - +0+ if the strings are identical - def levenshtein_distance(str1, str2) + def self.levenshtein_distance(str1, str2) # base cases - return 0 if str1.eql?(str2) - return str2.length if str1.empty? - return str1.length if str2.empty? + result = base_case?(str1, str2) + return result if result # Initialize cost-matrix rows previous = (0..str2.length).to_a current = [] (0...str1.length).each do |i| # first element is always the edit distance from an empty string. current[0] = i + 1 (0...str2.length).each do |j| - current[j+1] = [ + current[j + 1] = [ # insertion current[j] + 1, # deletion - previous[j+1] + 1, + previous[j + 1] + 1, # substitution or no operation previous[j] + (str1[i].eql?(str2[j]) ? 0 : 1) ].min end previous = current.dup @@ -102,33 +103,40 @@ current[str2.length] end private + def self.base_case?(str1, str2) + return 0 if str1.eql?(str2) + return str2.length if str1.empty? + return str1.length if str2.empty? + false + end + # create a vector from +str+ # # @example # v1 = vector('hello') # => {"h"=>1, "e"=>1, "l"=>2, "o"=>1} # v1["x"] # => 0 - def vector(str) + def self.vector(str) v = Hash.new(0) str.each_char { |c| v[c] += 1 } v end # calculate the dot product of +vector1+ and +vector2+ - def dot(vector1, vector2) + def self.dot(vector1, vector2) product = 0 vector1.each do |k, v| product += v * vector2[k] end product end # calculate the magnitude for +vector+ - def mag(vector) + def self.mag(vector) # calculate the sum of squares - sq = vector.inject(0) { |s, n| s + n**2 } + sq = vector.inject(0) { |a, e| a + e**2 } Math.sqrt(sq) end end end