lib/namor/namor.rb in namor-0.4.0 vs lib/namor/namor.rb in namor-0.4.1

- old
+ new

# Build a scrubber.
#
# opts - Hash of settings; the only key read by the methods below is
#        :suppress, an Array of words to strip from every name.
def initialize(opts = {})
  config(opts)
end

# Replace the current configuration.
#
# The regex cache is reset as well: every cached regex has the configured
# :suppress words baked in, so keeping it would silently apply the old list.
def config(opts)
  @config = opts
  @re_cache = {}
end

# Build the Regexp that matches any suppressed word as a whole word.
#
# supp_list - Array of extra words for this call (may be nil); they are added
#             to the configured :suppress list.
#
# Words are upcased, one trailing "." is dropped, and the remainder is escaped
# so that it is matched literally ("M.D." must not also match "MAD").
# Entries that end up empty are discarded, because an empty alternative would
# match first at every word boundary and disable the whole list.
#
# Returns a Regexp; when there is nothing to suppress it never matches.
def suppression_re(supp_list)
  words = (@config[:suppress] || []) + (supp_list || [])
  terms = words.compact
               .map { |word| Regexp.escape(word.upcase.chomp('.')) }
               .reject(&:empty?)
  return /(?!)/ if terms.empty?

  Regexp.new("\\b(#{terms.join('|')})\\b")
end

# Clean up a single name component:
# * convert to uppercase
# * strip a leading run of two or more Z/X characters
#   (frequently used as invalid-account prefixes)
# * remove any words in the configured or per-call suppression list
# * remove the common suffixes JR, SR, II, III, IV
# * remove anything inside parentheses
# * turn periods into spaces, drop underscores, apostrophes and ampersands,
#   and drop a trailing comma
# * squeeze runs of spaces and trim both ends
#
# name - String to clean; a nil/false name is returned unchanged.
# opts - Hash; :suppress is an Array of extra words to remove for this call.
def scrub(name, opts = {})
  extra_words = opts[:suppress]
  # One regex per distinct per-call list; config clears this cache.
  suppressed = (@re_cache[extra_words] ||= suppression_re(extra_words))
  return name unless name

  name.upcase
      .gsub(/^[ZX]{2,}/, '')
      .gsub(suppressed, '')
      .gsub(/\b(JR|SR|II|III|IV)\b/i, '')
      .gsub(/\([^\(]*\)/, '')
      .tr('.', ' ')
      .delete("_'&")
      .gsub(/,\s*$/, '')
      .squeeze(' ')
      .strip
end

# Scrub a name and pass the result through final_cleaning
# (final_cleaning is defined elsewhere in the class).
def fullscrub(name, opts = {})
  final_cleaning(scrub(name, opts))
end