lib/string/matching.rb in bblib-0.1.0 vs lib/string/matching.rb in bblib-0.1.1
- old
+ new
@@ -3,12 +3,11 @@
##############################################
module BBLib
# A simple rendition of the levenshtein distance algorithm
- def self.levenshtein_distance a, b, case_sensitive = false
- if !case_sensitive then a, b = a.downcase, b.downcase end
+ def self.levenshtein_distance a, b
costs = (0..b.length).to_a
(1..a.length).each do |i|
costs[0], nw = i, i - 1
(1..b.length).each do |j|
costs[j], nw = [costs[j] + 1, costs[j-1] + 1, a[i-1] == b[j-1] ? nw : nw + 1].min, costs[j]
@@ -16,19 +15,18 @@
end
costs[b.length]
end
# Calculates a percentage based match using the levenshtein distance algorithm
# NOTE(review): lines marked "-" are the bblib-0.1.0 text and lines marked "+"
# are the bblib-0.1.1 text; unmarked lines are common to both versions.
# 0.1.1 drops the optional case_sensitive argument and no longer forwards it
# to BBLib.levenshtein_distance.
# Returns a Float: (longer length - edit distance) / longer length * 100.0.
- def self.levenshtein_similarity a, b, case_sensitive = false
- distance = BBLib.levenshtein_distance a, b, case_sensitive
+ def self.levenshtein_similarity a, b
+ distance = BBLib.levenshtein_distance a, b
# Length of the longer of the two strings, converted to a Float.
max = [a.length, b.length].max.to_f
# NOTE(review): when both strings are empty max is 0.0, so this divides by 0.0
# and the result is not a finite percentage (NaN for a distance of 0).
return ((max - distance.to_f) / max) * 100.0
end
# Calculates a percentage based match of two strings based on their character composition.
- def self.composition_similarity a, b, case_sensitive = false
- if !case_sensitive then a, b = a.downcase, b.downcase end
+ def self.composition_similarity a, b
if a.length <= b.length then t = a; a = b; b = t; end
matches, temp = 0, b
a.chars.each do |c|
if temp.chars.include? c
matches+=1
@@ -37,27 +35,25 @@
end
(matches / [a.length, b.length].max.to_f )* 100.0
end
# Calculates a percentage based match between two strings based on the similarity of word matches.
# NOTE(review): "-" lines are bblib-0.1.0, "+" lines are bblib-0.1.1, unmarked lines are shared.
# 0.1.1 removes the case_sensitive argument together with the downcase step, and
# calls drop_symbols on both strings before splitting them into words.
# drop_symbols is not defined in this view; presumably a String extension
# provided elsewhere in bblib - confirm.
- def self.phrase_similarity a, b, case_sensitive = false
- if !case_sensitive then a, b = a.downcase, b.downcase end
- temp = b.split ' '
+ def self.phrase_similarity a, b
# temp holds the words of b that have not been matched yet.
+ temp = b.drop_symbols.split ' '
matches = 0
- a.split(' ').each do |w|
+ a.drop_symbols.split(' ').each do |w|
if temp.include? w
matches+=1
# Remove only the first occurrence so each word of b is matched at most once.
temp.delete_at temp.find_index w
end
end
# The score is the match count divided by the larger of the two word counts.
# NOTE(review): in 0.1.1 these word counts come from the raw strings (no
# drop_symbols), unlike the word lists used for matching above.
# NOTE(review): two empty strings give 0.0 / 0.0, i.e. NaN.
(matches.to_f / [a.split(' ').size, b.split(' ').size].max.to_f) * 100.0
end
# Extracts all numbers from two strings and compares them and generates a percentage of match.
# Percentage calculations here need to be weighted better...TODO
- def self.numeric_similarity a, b, case_sensitive = false
- if !case_sensitive then a, b = a.downcase, b.downcase end
+ def self.numeric_similarity a, b
a, b = a.extract_numbers, b.extract_numbers
return 100.0 if a.empty? && b.empty?
matches = []
for i in 0..[a.size, b.size].max-1
matches << 1.0 / ([a[i].to_f, b[i].to_f].max - [a[i].to_f, b[i].to_f].min + 1.0)
@@ -65,11 +61,11 @@
(matches.inject{ |sum, m| sum + m } / matches.size.to_f) * 100.0
end
# A simple character distance calculator that uses qwerty key positions to determine how similar two strings are.
# May be useful for typo detection.
- def self.qwerty_similarity a, b
+ def self.qwerty_distance a, b
a, b = a.downcase.strip, b.downcase.strip
if a.length <= b.length then t = a; a = b; b = t; end
qwerty = {
1 => ['1','2','3','4','5','6','7','8','9','0'],
2 => ['q','w','e','r','t','y','u','i','o','p'],
@@ -91,29 +87,29 @@
offset
end
end
# Reopens the core String class to add instance-method shortcuts for the BBLib
# matching functions; each one passes self as the first string and str as the second.
# NOTE(review): "-" lines are bblib-0.1.0, "+" lines are bblib-0.1.1. 0.1.1 removes the
# optional case_sensitive argument from every wrapper and renames qwerty_similarity to
# qwerty_distance, so 0.1.0-style calls that pass case_sensitive or use the old
# name no longer match these signatures.
class String
# Shortcut for BBLib.levenshtein_distance(self, str).
- def levenshtein_distance str, case_sensitive = false
- BBLib.levenshtein_distance self, str, case_sensitive
+ def levenshtein_distance str
+ BBLib.levenshtein_distance self, str
end
# Shortcut for BBLib.levenshtein_similarity(self, str).
- def levenshtein_similarity str, case_sensitive = false
- BBLib.levenshtein_similarity self, str, case_sensitive
+ def levenshtein_similarity str
+ BBLib.levenshtein_similarity self, str
end
# Shortcut for BBLib.composition_similarity(self, str).
- def composition_similarity str, case_sensitive = false
- BBLib.composition_similarity self, str, case_sensitive
+ def composition_similarity str
+ BBLib.composition_similarity self, str
end
# Shortcut for BBLib.phrase_similarity(self, str).
- def phrase_similarity str, case_sensitive = false
- BBLib.phrase_similarity self, str, case_sensitive
+ def phrase_similarity str
+ BBLib.phrase_similarity self, str
end
# Shortcut for BBLib.numeric_similarity(self, str).
- def numeric_similarity str, case_sensitive = false
- BBLib.numeric_similarity self, str, case_sensitive
+ def numeric_similarity str
+ BBLib.numeric_similarity self, str
end
# Shortcut for the qwerty key-position comparison; named qwerty_similarity in
# 0.1.0 and qwerty_distance in 0.1.1.
- def qwerty_similarity str
- BBLib.qwerty_similarity self, str
+ def qwerty_distance str
+ BBLib.qwerty_distance self, str
end
end