lib/jaro_winkler.rb in jaro_winkler-1.2.8 vs lib/jaro_winkler.rb in jaro_winkler-1.3.0.beta

- old
+ new

@@ -1,53 +1,63 @@ require 'jaro_winkler/fallback' +require 'jaro_winkler/adjusting_table' require 'jaro_winkler/jaro_winkler.so' unless JaroWinkler.fallback? module JaroWinkler module_function - def jaro_distance s1, s2 + def jaro_distance s1, s2, options = {} + options = {adj_table: false}.merge options length1, length2 = s1.length, s2.length # Guarantee the length order if s1.length > s2.length s1, s2 = s2, s1 length1, length2 = length2, length1 end window_size = (length2 / 2) - 1 window_size = 0 if window_size < 0 matches = 0.0 + sim_matches = 0.0 transpositions = 0 previous_index = -1 max_index = length2 - 1 s1.chars.each_with_index do |c1, i| left = i - window_size right = i + window_size left = 0 if left < 0 right = max_index if right > max_index matched = false + sim_matched = false found = false s2[left..right].chars.each_with_index do |c2, j| if c1 == c2 matched = true s2_index = left + j if !found && s2_index > previous_index previous_index = s2_index found = true end + elsif options[:adj_table] && DEFAULT_ADJ_TABLE[c1][c2] + sim_matched = true end end if matched matches += 1 transpositions += 1 unless found + elsif sim_matched # not matched but similarly matched + sim_matches += 3 end end # Don't divide transpositions by 2 since it's been counted directly by above code. - matches == 0 ? 0 : (matches / length1 + matches / length2 + (matches - transpositions) / matches) / 3.0 + similarity = matches + similarity += sim_matches / 10 if options[:adj_table] + matches == 0 ? 0 : (similarity / length1 + similarity / length2 + (matches - transpositions) / matches) / 3.0 end def r_distance s1, s2, options = {} options = {weight: 0.1, threshold: 0.7, ignore_case: false}.merge options weight, threshold, ignore_case = options[:weight], options[:threshold], options[:ignore_case] raise 'Scaling factor should not exceed 0.25, otherwise the distance can become larger than 1' if weight > 0.25 s1, s2 = s1.upcase, s2.upcase if ignore_case - distance = jaro_distance(s1, s2) + distance = jaro_distance(s1, s2, options) prefix = 0 max_length = [4, s1.length, s2.length].min s1[0, max_length].chars.each_with_index do |c1, i| c1 == s2[i] ? prefix += 1 : break end \ No newline at end of file