# encoding: UTF-8
module ICU
class Name
# Discard every compiled set of alternative names together with the per-type
# compilation counters, so the next lookup has to compile them again.
def self.reset_alternatives
  @@alts = {}
  @@cmps = {}
end

# Start from a clean slate when the class body is first evaluated.
reset_alternatives
# Build a name from one or two inputs. Each input is converted with to_s and
# passed through Util::String.to_utf8 before the original text and the
# canonical first/last names are derived. The derived strings are frozen.
def initialize(name1='', name2='')
  @name1, @name2 = [name1, name2].map { |input| Util::String.to_utf8(input.to_s) }
  originalize
  canonicalize
  [@first, @last, @original].each(&:freeze)
end
# Original text getter. Returns an unfrozen copy of the stored text, or the
# result of transliterate when opts[:chars] is given.
def original(opts={})
  charset = opts[:chars]
  charset ? transliterate(@original, charset) : @original.dup
end
# First name getter. Returns an unfrozen copy of the canonical first name, or
# the result of transliterate when opts[:chars] is given.
def first(opts={})
  charset = opts[:chars]
  charset ? transliterate(@first, charset) : @first.dup
end
# Last name getter. Returns an unfrozen copy of the canonical last name, or
# the result of transliterate when opts[:chars] is given.
def last(opts={})
  charset = opts[:chars]
  charset ? transliterate(@last, charset) : @last.dup
end
# Return a complete name, first name first, no comma.
# opts is forwarded to the first and last getters. A single space separates
# the two parts only when both stored names are non-empty.
def name(opts={})
  # Interpolate rather than append to a '' literal, so nothing here depends on
  # string literals being mutable.
  separator = @first.length > 0 && @last.length > 0 ? ' ' : ''
  "#{first(opts)}#{separator}#{last(opts)}"
end
# Return a reversed complete name, first name last after a comma.
# opts is forwarded to the first and last getters. The ", " separator is used
# only when both stored names are non-empty.
def rname(opts={})
  # Interpolate rather than append to a '' literal, so nothing here depends on
  # string literals being mutable.
  separator = @first.length > 0 && @last.length > 0 ? ', ' : ''
  "#{last(opts)}#{separator}#{first(opts)}"
end
# Convert to a string. Delegates to rname, so the result is the reversed form
# (last name, then a comma, then first name); opts is passed through unchanged.
def to_s(opts={})
rname(opts)
end
# Match another name against this one. The other name is built from name1 and
# name2 exactly as the constructor would; both the first names and the last
# names must match. opts is forwarded to the first/last getters of both names.
def match(name1='', name2='', opts={})
  other = Name.new(name1, name2)
  firsts_agree = match_first(first(opts), other.first(opts))
  # Last names are only compared once the first names are known to agree.
  firsts_agree && match_last(last(opts), other.last(opts))
end
# Load a set of first or last name alternatives. If data is absent (nil), compile_alts
# falls back to the YAML file for that type under the config directory.
# type should be :first or :last (check_type treats anything other than "last" as :first).
# Compilation is forced, so a previously compiled set for that type is replaced.
def self.load_alternatives(type, data=nil)
compile_alts(check_type(type), data, true)
end
# Show first name or last name alternatives for this name.
# type is normalised by check_type (anything other than "last" is treated as :first)
# and the result is the array built by get_alts.
def alternatives(type)
get_alts(check_type(type))
end
# :stopdoc:
private
# Save the original inputs without any cleanup other than whitespace.
# With one input the text is kept as given; with two it is stored as
# "second, first". Leading/trailing whitespace is removed and every run of
# whitespace is collapsed to a single space.
def originalize
  raw = @name2 == '' ? @name1 : "#{@name2.strip}, #{@name1.strip}"
  # Use non-mutating calls so a frozen input string is neither modified nor
  # able to raise.
  @original = raw.strip.gsub(/\s+/, ' ')
end
# Transliterate characters to ASCII when the requested character set mentions
# ASCII (case-insensitive); otherwise hand back a copy of the string as is.
def transliterate(str, chars='US-ASCII')
  return str.dup unless chars.match(/ASCII/i)
  Util::String.transliterate(str)
end
# Canonicalise the first and last names: split the input with partition, then
# apply the finishing touches to each half.
def canonicalize
  given, family = partition
  @first, @last = finish_first(given), finish_last(family)
end
# Split the input into cleaned first and last parts, returned as [first, last].
def partition
  # Two inputs, so we are given first and last.
  return [clean(@name1), clean(@name2)] unless @name2.length == 0

  # Only one input so we must split it into first and last.
  pieces = @name1.split(/,/)
  if pieces.size > 1
    # "Last, First ...": everything after the first comma is the first name.
    family = clean(pieces.shift || '')
    given = clean(pieces.join(' '))
    return [given, family]
  end

  # No usable comma: the final word is the last name.
  words = clean(@name1).split(/ /)
  family = words.pop || ''
  if words.size > 1 && words.last.match(/^O$/i) && !family.match(/^O'/i)
    family = "#{words.pop}'#{family}" # "O", "Reilly" => "O'Reilly"
  end
  [words.join(' '), family]
end
# Clean up characters in any name, keeping only hyphens, unaccented letters,
# accented Latin1 letters, single quotes and whitespace; full stops become
# spaces. Each word (and each hyphen-separated part of a word) is then
# re-capitalised via Util::String. The argument is never modified, so frozen
# strings and strings still owned by the caller are safe to pass in.
def clean(name)
  cleaned = name.gsub(/[`‘’′‛]/, "'") # normalise quote-like characters
  # Drop everything outside the permitted set.
  cleaned = cleaned.gsub(/[^-a-zA-Z\u{c0}-\u{d6}\u{d8}-\u{f6}\u{f8}-\u{ff}.'\s]/, '')
  cleaned = cleaned.tr('.', ' ')             # full stops act as separators
  cleaned = cleaned.gsub(/\s*-\s*/, '-')     # no whitespace around hyphens
  cleaned = cleaned.squeeze("'").strip       # collapse repeated quotes, trim
  cleaned = Util::String.downcase(cleaned)
  cleaned.split(/\s+/).map do |word|
    # Remove leading/trailing hyphens, then capitalise each hyphenated part.
    word.gsub(/\A-+|-+\z/, '').split(/-/).map { |part| Util::String.capitalize(part) }.join('-')
  end.join(' ')
end
# Apply final touches to finish canonicalising a first name: add a full stop
# after a capital letter that ends a word and upper-case the Roman numerals
# Iv, Ii and Iii. Returns a new string; the argument is left untouched.
def finish_first(names)
  names.
    gsub(/([A-Z\u{c0}-\u{de}])\b/) { |initial| "#{initial}." }. # Mark J L => Mark J. L.
    gsub(/\b(I(v|ii?))\b/) { |numeral| numeral.upcase }         # Richard Iii => Richard III
end
# Apply final touches to finish canonicalising a last name: capitalise after a
# leading X' or Mc, capitalise after Mac only when the original text did,
# turn "O Xyz" into "O'Xyz" and upper-case the Roman numerals Iv, Ii and Iii.
# Works on a copy, so the argument is never modified.
def finish_last(names)
  result = names.dup
  result.gsub!(/\b([A-Z\u{c0}-\u{de}]')([a-z\u{e0}-\u{ff}])/) { $1 + Util::String.upcase($2) }
  result.gsub!(/\b(Mc)([a-z\u{e0}-\u{ff}])/) { $1 + Util::String.upcase($2) }
  result.gsub!(/\bMac([a-z\u{e0}-\u{ff}])/) do
    letter = $1 # saved first: the match against @original below overwrites $1
    upper = Util::String.upcase(letter)
    # capitalize after "Mac" only if the original clearly indicates it
    "Mac#{@original.match(/\bMac#{upper}/) ? upper : letter}"
  end
  result.gsub!(/\bO ([A-Z\u{c0}-\u{de}])/) { "O'#{$1}" } # O Kelly => "O'Kelly"
  result.gsub!(/\b(I(v|ii?))\b/) { $1.upcase } # Ford iv => Ford IV
  result
end
# Check the type argument to the public methods (instance-level convenience
# that defers to the class-level check).
def check_type(type)
  self.class.send(:check_type, type)
end

# Normalise a type argument: anything whose to_s is "last" becomes :last,
# everything else (including nil) becomes :first.
def self.check_type(type)
  type.to_s == "last" ? :last : :first
end
# Match a complete first name.
# After an exact-equality shortcut, both names are split into words. Every
# word of the shorter list must be matched, in order, by words of the longer
# list. In addition, at least one match must either involve the very first
# word of the longer list or be a full (non-initial) match, so that
# "M. J." matches "Mark" but not "John".
def match_first(first1, first2)
  return true if first1 == first2

  words1 = split_first(first1)
  words2 = split_first(first2)
  long, short = words1.size >= words2.size ? [words1, words2] : [words2, words1]

  matched = 0   # how many words of the short list have been consumed
  extra = false # whether the additional condition has been met
  long.each_with_index do |word, position|
    score = match_first_name(word, short[matched])
    if score >= 0
      matched += 1
      extra = true if position == 0 || score == 0
    end
    break if matched == short.size
  end

  matched == short.size && extra
end
# Match a complete last name. Exact equality or a registered alternative is
# enough; otherwise both strings are normalised in place (lower-cased, a
# word-initial "mac" turned into "mc", hyphens turned into spaces) and compared.
def match_last(last1, last2)
  return true if last1 == last2
  return true if match_alt(:last, last1, last2)

  folded = [last1, last2].map do |surname|
    surname.downcase!                # case insensitive
    surname.gsub!(/\bmac/, 'mc')     # MacDonaugh and McDonaugh
    surname.tr!('-', ' ')            # Lowry-O'Reilly and Lowry O'Reilly
    surname
  end
  folded.first == folded.last
end
# Split a complete first name for matching. Hyphens are turned into spaces
# in place (so "J.-K." is handled like "J. K.") and the result is split on
# single spaces; an empty input yields [''].
def split_first(first)
  first.tr!('-', ' ')
  words = first.split(/ /)
  words.size == 0 ? [''] : words
end
# Match individual first names or initials. Returns:
# -1 = no match
# 0 = full match
# 1 = match involving 1 initial
# 2 = match involving 2 initials
def match_first_name(first1, first2)
  initial = /^[A-Z\u{c0}-\u{de}]\.?$/
  initials = [first1, first2].count { |candidate| candidate.match(initial) }

  return initials if first1 == first2 # "W." and "W." or "William" and "William"

  if initials == 0
    # Two full names: only a registered alternative can match ("William" and "Bill").
    return match_alt(:first, first1, first2) ? 0 : -1
  end

  # At least one initial: compare leading characters ("W." and "William" or "W." and "W").
  first1[0] == first2[0] ? initials : -1
end
# Match two names that might be equivalent due to nicknames, misspellings, changed married names etc.
# Looks nam2 up among the compiled alternatives of nam1. An unconditional
# entry matches outright; a conditional one is matched against the other
# part of this name (the last name when type is :first, otherwise the first name).
def match_alt(type, nam1, nam2)
  self.class.compile_alts(type)

  candidates = @@alts[type][nam1]
  return false unless candidates

  cond = candidates[nam2]
  return false unless cond
  return true if cond == true

  other_part = type == :first ? @last : @first
  cond.match(other_part)
end
# Return an array of alternative first or last names (not including the original name).
# Double barrelled last names and multiple first names are looked up part by
# part and then as a whole. Conditional alternatives are included only when
# their condition matches the other part of this name. Automatic alternatives
# are appended at the end.
def get_alts(type)
  self.class.compile_alts(type)

  full = send(type)
  candidates = full.split(/[- ]/)
  candidates << full if candidates.length > 1

  target = type == :first ? @last : @first
  table = @@alts[type]
  found = candidates.flat_map do |candidate|
    entries = table[candidate] || {}
    entries.select { |_alt, cond| cond == true || cond.match(target) }.keys
  end

  found.concat(automatic_alts(candidates))
end
# Add automatic alternatives - those not dependent on a compiled list.
# Currently the only one: for every name containing an apostrophe, the same
# name with backticks instead (the original comment notes FIDE often uses
# them). The substitution is made in place on the strings passed in.
def automatic_alts(names)
  names.each_with_object([]) do |candidate, alts|
    alts << candidate.gsub!("'", "`") if candidate.index("'")
  end
end
# Compile an alternative names hash (for either first names or last names) before matching is first attempted.
# type  - :first or :last; selects both the hash slot and the default file.
# data  - an array of groups of equivalent names; a Regexp inside a group
#         makes the whole group conditional on that pattern. When nil, the
#         YAML file for the type under the config directory is read instead.
# force - recompile even if a set for this type is already present.
# Returns nil when nothing had to be done, otherwise the number of
# compilations performed so far for this type. The data passed in is not modified.
def self.compile_alts(type, data=nil, force=false)
  return if @@alts[type] && !force

  unless data
    file = File.expand_path(File.dirname(__FILE__) + "/../../config/#{type}_alternatives.yaml")
    # NOTE(review): the loop below expects Regexp objects in the data, so the
    # bundled file presumably contains them. Psych 4 turned YAML.load into a
    # safe load that refuses such objects; use unsafe_load where it exists.
    # This is only acceptable because the file ships with the library.
    data = File.open(file) { |fd| YAML.respond_to?(:unsafe_load) ? YAML.unsafe_load(fd) : YAML.load(fd) }
  end

  @@cmps[type] ||= 0
  compiled = @@alts[type] = {}

  data.each do |group|
    # Separate conditions from names without touching the caller's array, so
    # that loading the same data a second time still sees the conditions.
    conds, names = group.partition { |item| item.instance_of?(Regexp) }
    cond = conds.last || true # the last Regexp wins; no Regexp means unconditional

    # Every name in the group is an alternative for every other name.
    names.each do |name|
      names.each do |other|
        next if other == name
        (compiled[name] ||= {})[other] = cond
      end
    end
  end

  @@cmps[type] += 1
end
# Return the number of compilations recorded for the given type (for testing);
# 0 when none has been recorded.
def self.alt_compilations(type)
  count = @@cmps[check_type(type)]
  count || 0
end
end
end