lib/string/matching.rb in bblib-0.1.0 vs lib/string/matching.rb in bblib-0.1.1

- old
+ new

@@ -3,12 +3,11 @@ ############################################## module BBLib # A simple rendition of the levenshtein distance algorithm - def self.levenshtein_distance a, b, case_sensitive = false - if !case_sensitive then a, b = a.downcase, b.downcase end + def self.levenshtein_distance a, b costs = (0..b.length).to_a (1..a.length).each do |i| costs[0], nw = i, i - 1 (1..b.length).each do |j| costs[j], nw = [costs[j] + 1, costs[j-1] + 1, a[i-1] == b[j-1] ? nw : nw + 1].min, costs[j] @@ -16,19 +15,18 @@ end costs[b.length] end # Calculates a percentage based match using the levenshtein distance algorithm - def self.levenshtein_similarity a, b, case_sensitive = false - distance = BBLib.levenshtein_distance a, b, case_sensitive + def self.levenshtein_similarity a, b + distance = BBLib.levenshtein_distance a, b max = [a.length, b.length].max.to_f return ((max - distance.to_f) / max) * 100.0 end # Calculates a percentage based match of two strings based on their character composition. - def self.composition_similarity a, b, case_sensitive = false - if !case_sensitive then a, b = a.downcase, b.downcase end + def self.composition_similarity a, b if a.length <= b.length then t = a; a = b; b = t; end matches, temp = 0, b a.chars.each do |c| if temp.chars.include? c matches+=1 @@ -37,27 +35,25 @@ end (matches / [a.length, b.length].max.to_f )* 100.0 end # Calculates a percentage based match between two strings based on the similarity of word matches. - def self.phrase_similarity a, b, case_sensitive = false - if !case_sensitive then a, b = a.downcase, b.downcase end - temp = b.split ' ' + def self.phrase_similarity a, b + temp = b.drop_symbols.split ' ' matches = 0 - a.split(' ').each do |w| + a.drop_symbols.split(' ').each do |w| if temp.include? w matches+=1 temp.delete_at temp.find_index w end end (matches.to_f / [a.split(' ').size, b.split(' ').size].max.to_f) * 100.0 end # Extracts all numbers from two strings and compares them and generates a percentage of match. # Percentage calculations here need to be weighted better...TODO - def self.numeric_similarity a, b, case_sensitive = false - if !case_sensitive then a, b = a.downcase, b.downcase end + def self.numeric_similarity a, b a, b = a.extract_numbers, b.extract_numbers return 100.0 if a.empty? && b.empty? matches = [] for i in 0..[a.size, b.size].max-1 matches << 1.0 / ([a[i].to_f, b[i].to_f].max - [a[i].to_f, b[i].to_f].min + 1.0) @@ -65,11 +61,11 @@ (matches.inject{ |sum, m| sum + m } / matches.size.to_f) * 100.0 end # A simple character distance calculator that uses qwerty key positions to determine how similar two strings are. # May be useful for typo detection. - def self.qwerty_similarity a, b + def self.qwerty_distance a, b a, b = a.downcase.strip, b.downcase.strip if a.length <= b.length then t = a; a = b; b = t; end qwerty = { 1 => ['1','2','3','4','5','6','7','8','9','0'], 2 => ['q','w','e','r','t','y','u','i','o','p'], @@ -91,29 +87,29 @@ offset end end class String - def levenshtein_distance str, case_sensitive = false - BBLib.levenshtein_distance self, str, case_sensitive + def levenshtein_distance str + BBLib.levenshtein_distance self, str end - def levenshtein_similarity str, case_sensitive = false - BBLib.levenshtein_similarity self, str, case_sensitive + def levenshtein_similarity str + BBLib.levenshtein_similarity self, str end - def composition_similarity str, case_sensitive = false - BBLib.composition_similarity self, str, case_sensitive + def composition_similarity str + BBLib.composition_similarity self, str end - def phrase_similarity str, case_sensitive = false - BBLib.phrase_similarity self, str, case_sensitive + def phrase_similarity str + BBLib.phrase_similarity self, str end - def numeric_similarity str, case_sensitive = false - BBLib.numeric_similarity self, str, case_sensitive + def numeric_similarity str + BBLib.numeric_similarity self, str end - def qwerty_similarity str - BBLib.qwerty_similarity self, str + def qwerty_distance str + BBLib.qwerty_distance self, str end end