lib/ting/hanyu_pinyin_parser.rb in ting-0.11.0 vs lib/ting/hanyu_pinyin_parser.rb in ting-0.12.0

- old
+ new

@@ -12,29 +12,62 @@
     def all_syllables
       @all_syllables ||= Ting.all_syllables.map(&hanyu_writer).sort_by(&:length).reverse
     end

-    def sylls_with_erhua
-      @with_erhua ||= all_syllables.map{|p| p + 'r'}
+    def consonant_syllables
+      @consonant_syllables ||= all_syllables.grep(/^[bcdfghjklmnpqrstwxyz]/i)
     end

     def pinyin_regexp
-      @pinyin_regexp ||= Regexp.union(*sylls_with_erhua, *all_syllables)
+      # This will parse a cluster of pinyin, i.e. an uninterrupted string of pinyin characters without punctuation.
+      @pinyin_cluster_regexp ||= /\A
+        # Every syllable can appear at the start of a cluster.
+        (#{Regexp.union(all_syllables)})
+        # However, only syllables starting with a consonant can follow, as syllables starting with a vowel have to
+        # be prefixed with an apostrophe.
+        # Since it is common to omit the apostrophe when there is no ambiguity, also allow syllables starting with
+        # a vowel after all letters except n and g, and after -ong, since -on does not appear at the end of a valid
+        # syllable.
+        (#{Regexp.union(consonant_syllables)}|(?<=[^ng]|[ōóǒòo]ng)#{Regexp.union(all_syllables)})*
+        (r)?
+        \Z/x
     end

-    def split_pinyin(pinyin)
-      pinyin.scan(pinyin_regexp).flat_map do |syll|
-        if sylls_with_erhua.include?(syll) && ! all_syllables.include?(syll)
-          [ syll[0..-2], 'er']
-        else
-          [ syll ]
+    def pinyin_separator_regexp
+      # A regular expression that matches every character that can *not* appear in pinyin.
+      @pinyin_separator_regexp ||= Regexp.new("[^#{all_syllables.join.downcase.split("").sort.uniq.join}]+")
+    end
+
+    def parse_cluster(pinyin)
+      syllables = []
+
+      # Chop off one syllable at a time from the end by continuously matching the same regular expression.
+      # This ensures the pinyin will be split into only valid pinyin syllables. Because a match capture will
+      # only contain the *last* content it has matched, we have to use a loop.
+      while match = pinyin_regexp.match(pinyin)
+        # If an 'r' at the end was matched, this implies that all other parts of the string were matched as
+        # syllables, and this cluster uses erhua.
+        if 'r' == match[3]
+          syllables << 'er'
+          pinyin = pinyin.chop
         end
+        last_syllable = match[2] || match[1]
+        syllables << last_syllable
+        pinyin = pinyin[0, pinyin.length - last_syllable.length]
       end
+
+      raise ArgumentError, "Unparseable pinyin fragment encountered: #{pinyin}" if !pinyin.empty?
+
+      syllables.reverse
     end

     def parse(pinyin)
-      split_pinyin(pinyin).map(&hanyu_reader)
+      # hanyu_reader cannot parse uppercase pinyin.
+      pinyin = pinyin.downcase
+
+      clusters = pinyin.split(pinyin_separator_regexp)
+      clusters.flat_map {|cluster| parse_cluster(cluster)}.flat_map(&hanyu_reader)
     end

     alias call parse
   end
 end