# encoding: UTF-8 # Copyright 2012 Twitter, Inc # http://www.apache.org/licenses/LICENSE-2.0 module TwitterCldr module Shared class Caser REGEX = /./ class << self def upcase(string) string.gsub(REGEX, uppercasing_hash) end def downcase(string) string.gsub(REGEX, lowercasing_hash) end # toTitlecase(X): Find the word boundaries in X according # to Unicode Standard Annex #29, "Unicode Text Segmentation." # For each word boundary, find the first cased character F # following the word boundary. If F exists, map F to # Titlecase_Mapping(F); then map all characters C between F # and the following word boundary to Lowercase_Mapping(C). def titlecase(string) string.dup.tap do |result| boundary_rule_set.each_boundary(result).each_cons(2) do |boundary_pair| if cased_pos = first_cased(string, *boundary_pair) result[cased_pos] = titlecasing_hash[result[cased_pos]] (cased_pos + 1).upto(boundary_pair.last - 1) do |pos| result[pos] = lowercasing_hash[result[pos]] end end end end end private def first_cased(string, start_pos, end_pos) start_pos.upto(end_pos) do |pos| return pos if cased?(string[pos]) end end def boundary_rule_set @boundary_rule_set ||= Segmentation::RuleSet.load(:en, 'word') end def cased?(char) props = CodePoint.properties_for_code_point(char.ord) props.general_category.include?('Lt') || props.uppercase || props.lowercase end def uppercasing_hash @uppercasing_hash ||= Hash.new do |hash, key| memoize_value(:simple_uppercase_map, hash, key) end end def lowercasing_hash @lowercasing_hash ||= Hash.new do |hash, key| memoize_value(:simple_lowercase_map, hash, key) end end def titlecasing_hash @titlecasing_hash ||= Hash.new do |hash, key| memoize_value(:simple_titlecase_map, hash, key) end end def memoize_value(field, hash, key) cp = TwitterCldr::Shared::CodePoint.get(key.ord) mapped_result = cp.send(field) hash[key] = mapped_result ? mapped_result : key end end end end end