#-- # Credit goes to Derek Lewis. Thanks Derek! #++ class String # A fuzzy matching mechanism. # Returns a score from 0-1, based on the number of shared edges. # (matches of a sequence of characters of length 2 or more) # # "Alexsander".fuzzy_match( "Aleksander" ) #=> 0.9 # # The way it works: # # * Converts each string into a "graph like" object, with edges # "alexsander" - > [ alexsander, alexsand, alexsan ... lexsand ... san ... an, etc ] # "aleksander" - > [ aleksander, aleksand ... etc. ] # * Perform match, then remove any subsets from this matched set (i.e. a hit # on "san" is a subset of a hit on "sander") # Above example, once reduced -> [ ale, sander ] # * See's how many of the matches remain, and calculates a score based # on how many matches, their length, and compare to the length of the # larger of the two words. #-- # Still a bit rough. Any suggestions for improvement are welcome. #++ def fuzzy_match( str_in ) return 0 if str_in == nil return 1 if self == str_in # Make a graph of each word (okay, so its not a true graph, but is similar) graph_A = Array.new graph_B = Array.new # "graph" self last = self.length (0..last).each do |ff| loc = self.length break if ff == last - 1 wordB = (1..(last-1)).to_a.reverse! if (wordB != nil) wordB.each do |ss| break if ss == ff graph_A.push( "#{self[ff..ss]}" ) end end end # "graph" input string last = str_in.length (0..last).each{ |ff| loc = str_in.length break if ff == last - 1 wordB = (1..(last-1)).to_a.reverse! wordB.each do |ss| break if ss == ff graph_B.push( "#{str_in[ff..ss]}" ) end } # count how many of these "graph edges" we have that are the same matches = Array.new graph_A.each do |aa| matches.push( aa ) if( graph_B.include?( aa ) ) end matches.sort!{|x,y| x.length <=> y.length} # For eliminating subsets, we want to start with the smallest hits # eliminate any subsets mclone = matches.dup mclone.each_index do |ii| reg = Regexp.compile( mclone[ii] ) count = 0.0 matches.each{|xx| count += 1 if xx =~ reg} matches.delete(mclone[ii]) if count > 1 end score = 0.0 matches.each{ |mm| score += mm.length } self.length > str_in.length ? largest = self.length : largest = str_in.length return score/largest end end