# encoding: utf-8 # References: # http://www.w3.org/International/questions/qa-personal-names # https://github.com/berkmancenter/namae # https://github.com/mericson # http://en.wikipedia.org/wiki/Types_of_business_entity # http://en.wikipedia.org/wiki/List_of_post-nominal_letters_(USA) # http://en.wikipedia.org/wiki/List_of_post-nominal_letters_(United_Kingdom) # http://en.wikipedia.org/wiki/Nobiliary_particle # http://en.wikipedia.org/wiki/Spanish_naming_customs # http://linguistlist.org/pubs/tocs/JournalUnifiedStyleSheet2007.pdf [PDF] class NameTamer attr_reader :name, :contact_type class << self def [](name, args = {}) new name, args end end def nice_name if @nice_name.nil? @nice_name = @name.dup # Start with the name we've received tidy_spacing # " John Smith " -> "John Smith" consolidate_initials # "I. B. M." -> "I.B.M." remove_adfixes # prefixes and suffixes: "Smith, John, Jr." -> "Smith, John" fixup_last_name_first # "Smith, John" -> "John Smith" fixup_mismatched_braces # "Ceres (AZ" -> "Ceres (AZ)" remove_adfixes # prefixes and suffixes: "Mr John Smith Jr." -> "John Smith" name_wrangle # proper name case and non-breaking spaces use_nonbreaking_spaces_in_compound_names end @nice_name end def search_name if @search_name.nil? @search_name = nice_name.dup # Start with nice name remove_initials # "John Q. Doe" -> "John Doe" remove_middle_names # "Philip Seymour Hoffman" -> "Philip Hoffman" remove_dots_from_abbreviations # "J.P.R. Williams" -> "JPR Williams" standardize_words # "B&Q Intl" -> "B and Q International" @search_name = ensure_whitespace_is_ascii_space @search_name end @search_name end def slug if @slug.nil? @slug = search_name.dup # Start with search name slugify # "John Doe" -> "john-doe" end @slug end def contact_type nice_name # make sure we've done the bit which infers contact_type contact_type_best_effort end =begin These lines aren't used and aren't covered by specs def name=(new_name) initialize new_name, :contact_type => @contact_type end def contact_type=(new_contact_type) initialize @name, :contact_type => new_contact_type end def to_hash { name: @name, nice_name: @nice_name, search_name: @search_name, slug: @slug, contact_type: @contact_type, last_name: @last_name, remainder: @remainder, adfix_found: @adfix_found } end =end private #-------------------------------------------------------- # Tidy up the name we've received #-------------------------------------------------------- def tidy_spacing @nice_name.gsub!(/,\s*/, ', ') # Ensure commas have exactly one space after them @nice_name.strip! # remove leading & trailing whitespace @nice_name = ensure_whitespace_is_ascii_space @nice_name end # Remove spaces from groups of initials def consolidate_initials @nice_name.gsub!(/\b([a-z])\.* (?=[a-z][\. ])/i) { |match| "#{$1}." } # Remove spaces from initial groups @nice_name.gsub!(/\b([a-z](?:\.[a-z])+)\.?(?= )/i) { |match| "#{$1}." } # Ensure each group ends with a dot end # An adfix is either a prefix or a suffix def remove_adfixes if @last_name.nil? # Our name is still in one part, not two begin @nice_name = remove_outermost_adfix(:suffix, @nice_name) end while @adfix_found begin @nice_name = remove_outermost_adfix(:prefix, @nice_name) end while @adfix_found else # Our name is currently in two halves begin @last_name = remove_outermost_adfix(:suffix, @last_name) end while @adfix_found begin @remainder = remove_outermost_adfix(:prefix, @remainder) end while @adfix_found end end # Names in the form "Smith, John" need to be turned around to "John Smith" def fixup_last_name_first unless @contact_type == :organization parts = @nice_name.split ', ' if parts.count == 2 @last_name = parts[0] # Sometimes the last name alone is all caps and we can name-case it @remainder = parts[1] end end end # Sometimes we end up with mismatched braces after adfix stripping # e.g. "Ceres (Ceres Holdings LLC)" -> "Ceres (Ceres Holdings" def fixup_mismatched_braces left_brace_count = @nice_name.count '(' right_brace_count = @nice_name.count ')' if left_brace_count > right_brace_count @nice_name += ')' elsif left_brace_count < right_brace_count @nice_name = '(' + @nice_name end end def name_wrangle # Fix case if all caps or all lowercase if @last_name.nil? lowercase = @nice_name.downcase uppercase = @nice_name.upcase # Some companies like to be all lowercase so don't mess with them @nice_name = name_case(lowercase) if @nice_name == uppercase || ( @nice_name == lowercase && @contact_type != :organization ) else lowercase = @last_name.downcase uppercase = @last_name.upcase @last_name = name_case(lowercase) if @last_name == uppercase || @last_name == lowercase @nice_name = "#{@remainder} #{@last_name}" end end # Conjoin compound names with non-breaking spaces def use_nonbreaking_spaces_in_compound_names # Fix known last names that have spaces (not hyphens!) [ 'Lane Fox', 'Bonham Carter', 'Pitt Rivers', 'Lloyd Webber', 'Sebag Montefiore', 'Holmes à Court', 'Holmes a Court', 'Baron Cohen', 'Service Company', 'Corporation Company', 'Corporation System', 'Incorporations Limited' ].each do |compound_name| @nice_name.gsub!(compound_name, compound_name.tr(ASCII_SPACE, NONBREAKING_SPACE)) end NAME_MODIFIERS.each do |modifier| @nice_name.gsub!(/([[:space:]]#{modifier})([[:space:]])/i) { |match| "#{$1}#{NONBREAKING_SPACE}" } end end #-------------------------------------------------------- # Make search name from nice name #-------------------------------------------------------- # Remove initials from personal names unless they are the only identifier. # i.e. only remove initials if there's also a proper name there def remove_initials if @contact_type == :person name = @search_name.gsub(/\b([a-z](?:\.*\s+|\.))/i, '') # If the name still has at least one space we're OK @search_name = name if name.include?(ASCII_SPACE) end end def remove_middle_names if @contact_type == :person parts = @search_name.split @search_name = "#{parts[0]} #{parts[-1]}" if parts.count > 2 end end def remove_dots_from_abbreviations @search_name.gsub!(/\b([a-z])\./i) { |match| $1 } end def standardize_words @search_name.gsub!(/ *& */, ' and ') # replace ampersand characters with ' and ' @search_name.gsub!(/ *\+ */, ' plus ') # replace plus signs with ' plus ' @search_name.gsub!(/\bintl\b/i, 'International') # replace 'intl' with 'International' end #-------------------------------------------------------- # Make slug from search name #-------------------------------------------------------- def slugify # Inflector::parameterize just gives up with non-latin characters so... #@slug = @slug.parameterize # Can't use this # Instead we'll do it ourselves @slug = parameterize @slug end #-------------------------------------------------------- # Initialization and utilities #-------------------------------------------------------- def initialize(name, args = {}) @name = name || '' @contact_type = args[:contact_type].to_sym unless args[:contact_type].nil? @nice_name = nil @search_name = nil @slug = nil @last_name = nil @remainder = nil @adfix_found = false end def set_contact_type contact_type contact_type_sym = contact_type.to_sym puts "Changing contact type of #{@name} from #{@contact_type} to #{contact_type}".red unless @contact_type.nil? || @contact_type == contact_type_sym @contact_type = contact_type_sym end # If we don't know the contact type, what's our best guess? def contact_type_best_effort if @contact_type @contact_type else # If it's just one word we'll assume organization. # If more then we'll assume a person @name.include?(ASCII_SPACE) ? :person : :organization end end def ensure_whitespace_is_ascii_space string string.gsub(/[[:space:]]+/, ASCII_SPACE) # /\s/ doesn't match Unicode whitespace in Ruby 1.9.3 end # We pass to this routine either prefixes or suffixes def remove_outermost_adfix adfix_type, name_part adfixes = ADFIX_PATTERNS[adfix_type] contact_type = contact_type_best_effort parts = name_part.partition adfixes[contact_type] @adfix_found = !parts[1].empty? # If the contact type is indeterminate and we didn't find a diagnostic adfix # for a person then try again for an organization if @contact_type.nil? unless @adfix_found contact_type = :organization parts = name_part.partition adfixes[contact_type] @adfix_found = !parts[1].empty? end end if @adfix_found # If we've found a diagnostic adfix then set the contact type set_contact_type contact_type # The remainder of the name will be in parts[0] or parts[2] depending # on whether this is a prefix or a suffix. # We'll also remove any trailing commas we've exposed. result = (parts[0] + parts[2]).gsub(/\s*,\s*$/, '') else result = name_part end result end # Original Version of NameCase: # Copyright (c) Mark Summerfield 1998-2008. All Rights Reserved # This module may be used/distributed/modified under the same terms as Perl itself # http://dev.perl.org/licenses/ (GPL) # # Ruby Version: # Copyright (c) Aaron Patterson 2006 # NameCase is distributed under the GPL license. # # Substantially modified for Xendata # Improved in several areas, also now adds non-breaking spaces for # compound names like "van der Pump" def name_case lowercase name = lowercase # We assume the name is passed already downcased name.gsub!(/\b\w/) { |first| first.upcase } name.gsub!(/\'\w\b/) { |c| c.downcase } # Lowercase 's # Our list of terminal characters that indicate a non-celtic name used # to include o but we removed it because of MacMurdo. if name =~ /\bMac[A-Za-z]{2,}[^acizj]\b/ or name =~ /\bMc/ name.gsub!(/\b(Ma?c)([A-Za-z]+)/) { |match| $1 + $2.capitalize } # Fix Mac exceptions [ 'MacEdo', 'MacEvicius', 'MacHado', 'MacHar', 'MacHin', 'MacHlin', 'MacIas', 'MacIulis', 'MacKie', 'MacKle', 'MacKlin', 'MacKmin', 'MacKmurdo', 'MacQuarie', 'MacLise', 'MacKenzie' ].each { |mac_name| name.gsub!(/\b#{mac_name}/, mac_name.capitalize) } end # Fix ff wierdybonks [ 'Fforbes', 'Fforde', 'Ffinch', 'Ffrench', 'Ffoulkes' ].each { |ff_name| name.gsub!(ff_name,ff_name.downcase) } # Fixes for name modifiers followed by space # Also replaces spaces with non-breaking spaces NAME_MODIFIERS.each do |modifier| name.gsub!(/((?:[[:space:]]|^)#{modifier})(\s+|-)/) { |match| "#{$1.rstrip.downcase}#{$2.tr(ASCII_SPACE, NONBREAKING_SPACE)}" } end # Fixes for name modifiers followed by an apostrophe, e.g. d'Artagnan, Commedia dell'Arte ['Dell', 'D'].each do |modifier| name.gsub!(/(.#{modifier}')(\w)/) { |match| "#{$1.rstrip.downcase}#{$2}" } end # Upcase words with no vowels, e.g JPR Williams name.gsub!(/\b([bcdfghjklmnpqrstvwxz]+)\b/i) { |match| $1.upcase } # Except Ng name.gsub!(/\b(NG)\b/i) { |match| $1.capitalize } # http://en.wikipedia.org/wiki/Ng name end def parameterize string, args = {} sep = args[:sep] || SLUG_DELIMITER rfc3987 = args[:rfc3987] || false filter = args[:filter] || (rfc3987 ? FILTER_RFC3987 : FILTER_COMPAT) # First we unescape any pct-encoded characters. These might turn into # things we want to alter for the slug, like whitespace (e.g. %20) parameterized_string = URI.unescape(string) # Then we change any whitespace into our separator character parameterized_string.gsub!(/\s+/, sep) # Then we strip any illegal characters out completely parameterized_string.gsub!(filter, '') # Make sure separators are not where they shouldn't be unless sep.nil? || sep.empty? re_sep = Regexp.escape(sep) # No more than one of the separator in a row. parameterized_string.gsub!(/#{re_sep}{2,}/, sep) # Remove leading/trailing separator. parameterized_string.gsub!(/^#{re_sep}|#{re_sep}$/i, '') end # downcase if it's all latin parameterized_string.downcase end #-------------------------------------------------------- # Constants #-------------------------------------------------------- NONBREAKING_SPACE = "\u00a0" ASCII_SPACE = "\u0020" ADFIX_JOINERS = "[#{ASCII_SPACE}-]" SLUG_DELIMITER = '-' # Constants for parameterizing Unicode strings for IRIs # # Allowed characters in an IRI segment are defined by RFC 3987 # (https://tools.ietf.org/html/rfc3987#section-2.2) as follows: # # isegment-nz-nc = 1*( iunreserved / pct-encoded / sub-delims # / "@" ) # ; non-zero-length segment without any colon ":" # # iunreserved = ALPHA / DIGIT / "-" / "." / "_" / "~" / ucschar # # pct-encoded = "%" HEXDIG HEXDIG # # sub-delims = "!" / "$" / "&" / "'" / "(" / ")" # / "*" / "+" / "," / ";" / "=" # # ucschar = %xA0-D7FF / %xF900-FDCF / %xFDF0-FFEF # / %x10000-1FFFD / %x20000-2FFFD / %x30000-3FFFD # / %x40000-4FFFD / %x50000-5FFFD / %x60000-6FFFD # / %x70000-7FFFD / %x80000-8FFFD / %x90000-9FFFD # / %xA0000-AFFFD / %xB0000-BFFFD / %xC0000-CFFFD # / %xD0000-DFFFD / %xE1000-EFFFD # # Note that we can't use Unicode code points above \uFFFF because of # regex limitations, so we'll ignore ucschar above that point. # # We're using the most restrictive segment definition (isegment-nz-nc) # to avoid any possible problems with the IRI that it one day might # get placed in. ALPHA = 'A-Za-z' DIGIT = '0-9' UCSCHAR = '\u00A0-\uD7FF\uF900-\uFDCF\uFDF0-\uFFEF' IUNRESERVED = "#{ALPHA}#{DIGIT}\\-\\._~#{UCSCHAR}" SUBDELIMS = '!$&\'\(\)\*+,;=' ISEGMENT_NZ_NC = "#{IUNRESERVED}#{SUBDELIMS}@" # pct-encoded not needed FILTER_RFC3987 = /[^#{ISEGMENT_NZ_NC}]/ FILTER_COMPAT = /[^#{ALPHA}#{DIGIT}\-_#{UCSCHAR}]/ NAME_MODIFIERS = [ 'Al', 'Ap', 'Ben', 'Dell[ae]', 'D[aeiou]', 'De[lr]', 'D[ao]s', 'El', 'La', 'L[eo]', 'V[ao]n', 'Of', 'St[\.]?' ] # These are the prefixes and suffixes we want to remove # If you add to the list, you can use spaces and dots where appropriate # Ensure any single letters are followed by a dot because we'll add one to the string # during processing, e.g. "y Cía." should be "y. Cía." ADFIXES = { prefix: { person: [ 'Baron', 'Baroness', 'Capt.', 'Captain', 'Col.', 'Colonel', 'Dame', 'Doctor', 'Dr.', 'Judge', 'Justice', 'Lady', 'Lieut.', 'Lieutenant', 'Lord', 'Madame', 'Major', 'Master', 'Matron', 'Messrs.', 'Mgr.', 'Miss', 'Mister', 'Mlle.', 'Mme.', 'Mons.', 'Mr.', 'Mr. & Mrs.', 'Mr. and Mrs.', 'Mrs.', 'Msgr.', 'Prof.', 'Professor', 'Rev.', 'Reverend', 'Sir', 'Sister', 'The Hon.', 'The Lady.', 'The Lord', 'The Rt. Hon.' ], organization: [ 'Fa.', 'P.T.', 'P.T. Tbk.', 'U.D.' ], before:'\\A', after:ADFIX_JOINERS }, suffix: { person: [ 'C.I.S.S.P.', 'B.Tech.', 'D.Phil.', 'B.Eng.', 'C.F.A.', 'D.B.E.', 'D.D.S.', 'Eng.D.', 'M.B.A.', 'M.B.E.', 'M.E.P.', 'M.Eng.', 'M.S.P.', 'O.B.E.', 'P.M.C.', 'P.M.P.', 'P.S.P.', 'B.Ed.', 'B.Sc.', 'Ed.D.', 'LL.B.', 'LL.D.', 'LL.M.', 'M.Ed.', 'M.Sc.', 'Ph.D.', 'B.A.', 'Esq.', 'J.D.', 'K.C.', 'M.A.', 'M.D.', 'M.P.', 'O.K.', 'P.A.', 'Q.C.', 'III', 'Jr.', 'Sr.', 'II', 'IV', 'V' ], organization: [ 'S. de R.L. de C.V.', 'S.A.P.I. de C.V.', 'y. Cía. S. en C.', 'Private Limited', 'S.M. Pte. Ltd.', 'Cía. S. C. A.', 'y. Cía. S. C.', 'S.A. de C.V.', 'spol. s.r.o.', '(Pty.) Ltd.', '(Pvt.) Ltd.', 'A.D.S.I.Tz.', 'S.p. z.o.o.', '(Pvt.)Ltd.', 'akc. spol.', 'Cía. Ltda.', 'E.B.V.B.A.', 'P. Limited', 'S. de R.L.', 'S.I.C.A.V.', 'S.P.R.L.U.', 'А.Д.С.И.Ц.', '(P.) Ltd.', 'C. por A.', 'Comm.V.A.', 'Ltd. Şti.', 'Plc. Ltd.', 'Pte. Ltd.', 'Pty. Ltd.', 'Pvt. Ltd.', 'Soc. Col.', 'A.M.B.A.', 'A.S.B.L.', 'A.V.E.E.', 'B.V.B.A.', 'B.V.I.O.', 'C.V.B.A.', 'C.V.O.A.', 'E.E.I.G.', 'E.I.R.L.', 'E.O.O.D.', 'E.U.R.L.', 'F.M.B.A.', 'G.m.b.H.', 'Ges.b.R.', 'I.L.L.C.', 'K.G.a.A.', 'L.L.L.P.', 'Ltd. Co.', 'Ltd. Co.', 'M.E.P.E.', 'n.y.r.t.', 'O.V.E.E.', 'P.E.E.C.', 'P.L.L.C.', 'P.L.L.C.', 'S. en C.', 'S.a.p.a.', 'S.A.R.L.', 'S.à.R.L.', 'S.A.S.U.', 'S.C.e.I.', 'S.C.O.P.', 'S.C.p.A.', 'S.C.R.I.', 'S.C.R.L.', 'S.M.B.A.', 'S.P.R.L.', 'Е.О.О.Д.', 'and Co.', 'Comm.V.', 'Limited', 'P. Ltd.', 'Part.G.', 'Sh.p.k.', '&. Co.', 'C.X.A.', 'd.n.o.', 'd.o.o.', 'E.A.D.', 'e.h.f.', 'E.P.E.', 'E.S.V.', 'F.C.P.', 'F.I.E.', 'G.b.R.', 'G.I.E.', 'G.M.K.', 'G.S.K.', 'H.U.F.', 'K.D.A.', 'k.f.t.', 'k.h.t.', 'k.k.t.', 'L.L.C.', 'L.L.P.', 'o.h.f.', 'O.H.G.', 'O.O.D.', 'O.y.j.', 'p.l.c.', 'P.S.U.', 'S.A.E.', 'S.A.S.', 'S.C.A.', 'S.C.E.', 'S.C.S.', 'S.E.M.', 'S.E.P.', 's.e.s.', 'S.G.R.', 'S.N.C.', 'S.p.A.', 'S.P.E.', 'S.R.L.', 's.r.o.', 'Unltd.', 'V.O.F.', 'V.o.G.', 'v.o.s.', 'V.Z.W.', 'z.r.t.', 'А.А.Т.', 'Е.А.Д.', 'З.А.Т.', 'К.Д.А.', 'О.О.Д.', 'Т.А.А.', '股份有限公司', 'Ap.S.', 'Corp.', 'ltda.', 'Sh.A.', 'st.G.', 'Ultd.', 'a.b.', 'A.D.', 'A.E.', 'A.G.', 'A.S.', 'A.Ş.', 'A.y.', 'B.M.', 'b.t.', 'B.V.', 'C.A.', 'C.V.', 'd.d.', 'e.c.', 'E.E.', 'e.G.', 'E.I.', 'E.P.', 'E.T.', 'E.U.', 'e.v.', 'G.K.', 'G.P.', 'h.f.', 'Inc.', 'K.D.', 'K.G.', 'K.K.', 'k.s.', 'k.v.', 'K.y.', 'L.C.', 'L.P.', 'Ltd.', 'N.K.', 'N.L.', 'N.V.', 'O.E.', 'O.G.', 'O.Ü.', 'O.y.', 'P.C.', 'p.l.', 'Pty.', 'PUP.', 'Pvt.', 'r.t.', 'S.A.', 'S.D.', 'S.E.', 's.f.', 'S.L.', 'S.P.', 'S.s.', 'T.K.', 'T.Ü.', 'U.Ü.', 'Y.K.', 'А.Д.', 'І.П.', 'К.Д.', 'ПУП.', 'С.Д.', 'בע"מ', '任意組合', '匿名組合', '合同会社', '合名会社', '合資会社', '有限会社', '有限公司', '株式会社', 'A/S', 'G/S', 'I/S', 'K/S', 'P/S' ], before:ADFIX_JOINERS, after:'\\z' } } ADFIX_PATTERNS = {} [:prefix, :suffix].each do |adfix_type| patterns = {} adfix = ADFIXES[adfix_type] [:person, :organization].each do |contact_type| with_optional_spaces = adfix[contact_type].map { |p| p.gsub(ASCII_SPACE,' *') } pattern_string = with_optional_spaces.join('|').gsub('.', '\.*') patterns[contact_type] = /#{adfix[:before]}\(*(?:#{pattern_string})\)*#{adfix[:after]}/i end ADFIX_PATTERNS[adfix_type] = patterns end end