class String # A fuzzy matching mechanism. Returns a score from 0-1, # based on the number of shared edges. # To be effective, the strings must be of length 2 or greater. # # "Alexsander".fuzzy_match( "Aleksander" ) #=> 0.9 # # The way it works: # # * Converts each string into a "graph like" object, with edges # "alexsander" -> [ alexsander, alexsand, alexsan ... lexsand ... san ... an, etc ] # "aleksander" -> [ aleksander, aleksand ... etc. ] # * Perform match, then remove any subsets from this matched set (i.e. a hit # on "san" is a subset of a hit on "sander") # Above example, once reduced -> [ ale, sander ] # * See's how many of the matches remain, and calculates a score based # on how many matches, their length, and compare to the length of the # larger of the two words. #-- # Credit goes to Derek Lewis. Thanks Derek! # Still a bit rough. Any suggestions for improvement are welcome. #++ def similarity( str_in ) return 0 if str_in == nil return 1 if self == str_in # Make a graph of each word (okay, so its not a true graph, but is similar) graph_A = Array.new graph_B = Array.new # "graph" self last = self.length (0..last).each do |ff| loc = self.length break if ff == last - 1 wordB = (1..(last-1)).to_a.reverse! if (wordB != nil) wordB.each do |ss| break if ss == ff graph_A.push( "#{self[ff..ss]}" ) end end end # "graph" input string last = str_in.length (0..last).each{ |ff| loc = str_in.length break if ff == last - 1 wordB = (1..(last-1)).to_a.reverse! wordB.each do |ss| break if ss == ff graph_B.push( "#{str_in[ff..ss]}" ) end } # count how many of these "graph edges" we have that are the same matches = graph_A & graph_B #matches = Array.new #graph_A.each do |aa| # matches.push( aa ) if( graph_B.include?( aa ) ) #end # For eliminating subsets, we want to start with the smallest hits. matches.sort!{|x,y| x.length <=> y.length} # eliminate any subsets mclone = matches.dup mclone.each_index do |ii| reg = Regexp.compile( mclone[ii] ) count = 0.0 matches.each{|xx| count += 1 if xx =~ reg} matches.delete(mclone[ii]) if count > 1 end score = 0.0 matches.each{ |mm| score += mm.length } self.length > str_in.length ? largest = self.length : largest = str_in.length return score/largest end alias_method :fuzzy_match, :similarity end # _____ _ # |_ _|__ ___| |_ # | |/ _ \/ __| __| # | | __/\__ \ |_ # |_|\___||___/\__| # =begin test require 'test/unit' class TCString < Test::Unit::TestCase def test_similarity assert_nothing_raised { "Hello World!".similarity( "helo wrld" ) } end end =end