Sha256: 69d2169077ca010a21d63036087cec3ffa87b95428f35cce0c4a9f1bc98bcbd7
Contents?: true
Size: 1.55 KB
Versions: 1
Compression:
Stored size: 1.55 KB
Contents
# encoding: utf-8
# Original author: Wilker Lúcio <wilkerlucio@gmail.com>

require "set"

module Text

  # Ruby implementation of the string similarity described by Simon White
  # at: http://www.catalysoft.com/articles/StrikeAMatch.html
  #
  #                        2 * |pairs(s1) INTERSECT pairs(s2)|
  #   similarity(s1, s2) = -----------------------------------
  #                            |pairs(s1)| + |pairs(s2)|
  #
  # e.g.
  #                                              2 * |{FR, NC}|
  #   similarity(FRANCE, FRENCH) = ---------------------------------------
  #                                |{FR,RA,AN,NC,CE}| + |{FR,RE,EN,NC,CH}|
  #
  #                              = (2 * 2) / (5 + 5)
  #
  #                              = 0.4
  #
  #   WhiteSimilarity.new.similarity("FRANCE", "FRENCH")
  #
  # Pairs are adjacent two-character slices taken from each
  # whitespace-separated word of the upper-cased input, so the comparison is
  # case-insensitive and pairs never span a word boundary. Repeated pairs are
  # counted as many times as they occur (multiset semantics).
  #
  class WhiteSimilarity

    # Convenience wrapper: compares two strings using a throwaway instance.
    #
    # str1, str2 - the Strings to compare.
    #
    # Returns a Float between 0.0 and 1.0.
    def self.similarity(str1, str2)
      new.similarity(str1, str2)
    end

    # Creates a comparer with an empty per-instance cache of letter pairs,
    # keyed by input string. Reusing one instance avoids recomputing the
    # pairs of strings that are compared repeatedly.
    def initialize
      @word_letter_pairs = {}
    end

    # Computes the similarity of two strings.
    #
    # str1, str2 - the Strings to compare.
    #
    # Returns a Float between 0.0 (no pair in common) and 1.0 (same pairs).
    # When neither string contains any letter pair (empty strings or only
    # single-character words) the formula would divide zero by zero; in that
    # case 1.0 is returned if the strings are equal ignoring case and 0.0
    # otherwise, so the result is never NaN.
    def similarity(str1, str2)
      pairs1 = word_letter_pairs(str1)
      pairs2 = word_letter_pairs(str2)

      union = pairs1.length + pairs2.length
      return(str1.upcase == str2.upcase ? 1.0 : 0.0) if union.zero?

      # Count how many times each pair is still available in str2, so that a
      # pair occurring k times there can be matched at most k times.
      remaining = Hash.new(0)
      pairs2.each { |pair| remaining[pair] += 1 }

      intersection = 0
      pairs1.each do |pair|
        next unless remaining[pair] > 0
        remaining[pair] -= 1
        intersection += 1
      end

      (2.0 * intersection) / union
    end

  private

    # Returns the frozen, memoized Array of two-character pairs for +str+:
    # the string is upper-cased, split on runs of whitespace, and every word
    # contributes its adjacent character pairs in order. Words shorter than
    # two characters contribute nothing.
    def word_letter_pairs(str)
      @word_letter_pairs[str] ||=
        str.upcase.split(/\s+/).map { |word|
          (0...(word.length - 1)).map { |i| word[i, 2] }
        }.flatten.freeze
    end
  end
end
Version data entries
1 entry across 1 version & 1 rubygem
Version | Path |
---|---|
text-1.0.3 | lib/text/white_similarity.rb |