# encoding: UTF-8 # Copyright 2012 Twitter, Inc # http://www.apache.org/licenses/LICENSE-2.0 # Documentation: https://github.com/hunspell/hyphen/blob/21127cc8493a68d4fe9adbb71377b469b4f2b550/doc/tb87nemeth.pdf module TwitterCldr module Shared class Hyphenator class UnsupportedLocaleError < StandardError; end BASE_RESOURCE_PATH = %w(shared hyphenation) DEFAULT_LEFT_HYPHEN_MIN = 2 DEFAULT_RIGHT_HYPHEN_MIN = 2 DEFAULT_NO_HYPHEN = %w(- ' ’) class << self def get(locale) locale = find_supported_locale(locale) unless locale raise UnsupportedLocaleError, "'#{locale}' is not a supported hyphenation locale" end cache[locale] ||= begin resource = resource_for(locale) new(resource[:rules], locale, resource[:options]) end end def supported_locale?(locale) !!find_supported_locale(locale) end def supported_locales @supported_locales ||= begin absolute_resource_path = TwitterCldr.absolute_resource_path( File.join(BASE_RESOURCE_PATH) ) files = Dir.glob(File.join(absolute_resource_path, '*.yml')) files.map { |f| File.basename(f).chomp('.yml') } end end private def find_supported_locale(locale) maximized_locale = Locale.parse(locale.to_s).maximize maximized_locale.permutations('-').find do |locale_candidate| TwitterCldr.resource_exists?( *BASE_RESOURCE_PATH, locale_candidate ) end end def cache @cache ||= {} end def resource_for(locale) TwitterCldr.get_resource(*BASE_RESOURCE_PATH, locale) end end attr_reader :rules, :locale, :options, :trie def initialize(rules, locale, options) @rules = rules @locale = locale @options = options @trie = build_trie_from(rules) end # 0x00AD is a soft hyphen def hyphenate(text, hyphen = "\u00AD") each_chunk(text).to_a.join(hyphen) end def each_chunk(text) if block_given? last_pos = 0 each_position(text) do |pos| yield text[last_pos...pos].tap { last_pos = pos } end if last_pos < text.size yield text[last_pos..text.size] end else to_enum(__method__, text) end end def each_position(text) if block_given? text = ".#{text}." break_weights = break_weights_for(text) left = left_hyphen_min right = text.size - right_hyphen_min - 2 (left...right).each do |idx| yield idx if break_weights[idx].odd? end else to_enum(__method__, text) end end private def break_weights_for(text) break_weights = Array.new(text.size - 1, 0) text.each_char.with_index do |char, idx| subtrie = trie.root counter = idx while subtrie subtrie = subtrie.child(text[counter]) counter += 1 if subtrie && subtrie.has_value? update_break_weights(subtrie.value, break_weights, idx) end end end remove_illegal_hyphens(break_weights, text) end def update_break_weights(pattern, break_weights, start_idx) pattern_idx = 0 pattern.each_char do |segment| if segment =~ /\d/ int_seg = segment.to_i idx = (start_idx + pattern_idx) - 1 break if idx >= break_weights.size break_weights[idx] = if break_weights[idx] > int_seg break_weights[idx] else int_seg end else pattern_idx += 1 end end end def remove_illegal_hyphens(break_weights, text) break_weights.map.with_index do |break_weight, idx| next break_weight if idx.zero? next 0 if no_hyphen.include?(text[idx - 1]) break_weight end end def left_hyphen_min @left_hyphen_min ||= options.fetch(:lefthyphenmin, DEFAULT_LEFT_HYPHEN_MIN) end def right_hyphen_min @right_hyphen_min ||= options.fetch(:righthyphenmin, DEFAULT_RIGHT_HYPHEN_MIN) end def no_hyphen @no_hyphen ||= options.fetch(:nohyphen, DEFAULT_NO_HYPHEN) end def build_trie_from(rules) TwitterCldr::Utils::Trie.new.tap do |trie| rules.each do |rule| trie.add(rule.gsub(/\d/, '').each_char, rule) end end end end end end