lib/taxamatch_rb/phonetizer.rb in dimus-taxamatch_rb-0.1.7 vs lib/taxamatch_rb/phonetizer.rb in dimus-taxamatch_rb-0.5.0

- old
+ new

@@ -1,72 +1,75 @@ # encoding: UTF-8 +module Taxamatch -class Phonetizer + class Phonetizer - def self.near_match(a_word, normalize_ending = false) - a_word = a_word.strip rescue '' - return '' if a_word == '' - a_word = Normalizer.normalize a_word - case a_word - when /^AE/ - a_word = 'E' + a_word[2..-1] - when /^CN/ - a_word = 'N' + a_word[2..-1] - when /^CT/ - a_word = 'T' + a_word[2..-1] - when /^CZ/ - a_word = 'C' + a_word[2..-1] - when /^DJ/ - a_word = 'J' + a_word[2..-1] - when /^EA/ - a_word = 'E' + a_word[2..-1] - when /^EU/ - a_word = 'U' + a_word[2..-1] - when /^GN/ - a_word = 'N' + a_word[2..-1] - when /^KN/ - a_word = 'N' + a_word[2..-1] - when /^MC/ - a_word = 'MAC' + a_word[2..-1] - when /^MN/ - a_word = 'N' + a_word[2..-1] - when /^OE/ - a_word = 'E' + a_word[2..-1] - when /^QU/ - a_word = 'Q' + a_word[2..-1] - when /^PS/ - a_word = 'S' + a_word[2..-1] - when /^PT/ - a_word = 'T' + a_word[2..-1] - when /^TS/ - a_word = 'S' + a_word[2..-1] - when /^WR/ - a_word = 'R' + a_word[2..-1] - when /^X/ - a_word = 'Z' + a_word[1..-1] - end - first_char = a_word.split('')[0] - rest_chars = a_word.split('')[1..-1].join('') - rest_chars.gsub!('AE', 'I') - rest_chars.gsub!('IA', 'A') - rest_chars.gsub!('OE', 'I') - rest_chars.gsub!('OI', 'A') - rest_chars.gsub!('SC', 'S') - rest_chars.gsub!('H', '') - rest_chars.tr!('EOUYKZ', 'IAIICS') - a_word = (first_char + rest_chars).squeeze + def self.near_match(a_word, normalize_ending = false) + a_word = a_word.strip rescue '' + return '' if a_word == '' + a_word = Taxamatch::Normalizer.normalize a_word + case a_word + when /^AE/ + a_word = 'E' + a_word[2..-1] + when /^CN/ + a_word = 'N' + a_word[2..-1] + when /^CT/ + a_word = 'T' + a_word[2..-1] + when /^CZ/ + a_word = 'C' + a_word[2..-1] + when /^DJ/ + a_word = 'J' + a_word[2..-1] + when /^EA/ + a_word = 'E' + a_word[2..-1] + when /^EU/ + a_word = 'U' + a_word[2..-1] + when /^GN/ + a_word = 'N' + a_word[2..-1] + when /^KN/ + a_word = 'N' + a_word[2..-1] + when /^MC/ + a_word = 'MAC' + a_word[2..-1] + when /^MN/ + a_word = 'N' + a_word[2..-1] + when /^OE/ + a_word = 'E' + a_word[2..-1] + when /^QU/ + a_word = 'Q' + a_word[2..-1] + when /^PS/ + a_word = 'S' + a_word[2..-1] + when /^PT/ + a_word = 'T' + a_word[2..-1] + when /^TS/ + a_word = 'S' + a_word[2..-1] + when /^WR/ + a_word = 'R' + a_word[2..-1] + when /^X/ + a_word = 'Z' + a_word[1..-1] + end + first_char = a_word.split('')[0] + rest_chars = a_word.split('')[1..-1].join('') + rest_chars.gsub!('AE', 'I') + rest_chars.gsub!('IA', 'A') + rest_chars.gsub!('OE', 'I') + rest_chars.gsub!('OI', 'A') + rest_chars.gsub!('SC', 'S') + rest_chars.gsub!('H', '') + rest_chars.tr!('EOUYKZ', 'IAIICS') + a_word = (first_char + rest_chars).squeeze - if normalize_ending && a_word.size > 4 - a_word = self.normalize_ending(a_word) + if normalize_ending && a_word.size > 4 + a_word = self.normalize_ending(a_word) + end + a_word end - a_word - end - def self.normalize_ending(a_word) - # -- deal with variant endings -is (includes -us, -ys, -es), -im (was -um), -as (-os) - # -- at the end of a string translate all to -a - a_word.gsub!(/IS$/, 'A') - a_word.gsub!(/IM$/, 'A') - a_word.gsub(/AS$/, 'A') - end + def self.normalize_ending(a_word) + # -- deal with variant endings -is (includes -us, -ys, -es), -im (was -um), -as (-os) + # -- at the end of a string translate all to -a + a_word.gsub!(/IS$/, 'A') + a_word.gsub!(/IM$/, 'A') + a_word.gsub(/AS$/, 'A') + end + end + end \ No newline at end of file