lib/ting/hanyu_pinyin_parser.rb in ting-0.11.0 vs lib/ting/hanyu_pinyin_parser.rb in ting-0.12.0
- old
+ new
@@ -12,29 +12,62 @@
# Returns every syllable from Ting.all_syllables, rendered through hanyu_writer and ordered from the
# longest string to the shortest. The list is built on first use and cached in @all_syllables.
def all_syllables
  return @all_syllables if @all_syllables

  rendered = Ting.all_syllables.map(&hanyu_writer)
  # Sort ascending by length, then flip, so longer syllables come first.
  @all_syllables = rendered.sort_by(&:length).reverse
end
# Diff hunk: the removed sylls_with_erhua (every entry of all_syllables with 'r' appended) gives way to
# consonant_syllables, which memoizes the entries of all_syllables whose first letter is one of the
# letters listed in the character class below (matched case-insensitively).
- def sylls_with_erhua
- @with_erhua ||= all_syllables.map{|p| p + 'r'}
+ def consonant_syllables
+ @consonant_syllables ||= all_syllables.grep(/^[bcdfghjklmnpqrstwxyz]/i)
end
# Diff hunk: pinyin_regexp changes from a plain union of erhua and regular syllables into an anchored,
# extended-mode (/x) pattern that has to match one whole cluster.
# Capture groups of the new pattern (the interpolated unions add no capture groups of their own):
#   1 - the first syllable of the cluster (any entry of all_syllables)
#   2 - the last repetition of the follow-up group, i.e. the final non-initial syllable; nil when the
#       cluster holds a single syllable
#   3 - an optional trailing 'r'
# NOTE(review): the new pattern is memoized in @pinyin_cluster_regexp although the method is still named
# pinyin_regexp.
# NOTE(review): \Z also matches just before a trailing newline, whereas \z anchors at the absolute end of
# the string - confirm which one is intended.
def pinyin_regexp
- @pinyin_regexp ||= Regexp.union(*sylls_with_erhua, *all_syllables)
+ # This will parse a cluster of pinyin, i.e. an uninterrupted string of pinyin characters without punctuation.
+ @pinyin_cluster_regexp ||= /\A
+ # Every syllable can appear at the start of a cluster.
+ (#{Regexp.union(all_syllables)})
+ # However, only syllables starting with a consonant can follow, as syllables starting with a vowel have to
+ # be prefixed with an apostrophe.
+ # Since it is common to omit the apostrophe when there is no ambiguity, also allow syllables starting with
+ # a vowel after all letters except n and g, and after -ong, since -on does not appear at the end of a valid
+ # syllable.
+ (#{Regexp.union(consonant_syllables)}|(?<=[^ng]|[ōóǒòo]ng)#{Regexp.union(all_syllables)})*
+ (r)?
+ \Z/x
end
# Diff hunk: the scan-based split_pinyin is removed. It is replaced by pinyin_separator_regexp, which
# matches runs of characters that occur in no syllable, and by parse_cluster, which splits one
# separator-free cluster into its syllables.
- def split_pinyin(pinyin)
- pinyin.scan(pinyin_regexp).flat_map do |syll|
- if sylls_with_erhua.include?(syll) && ! all_syllables.include?(syll)
- [ syll[0..-2], 'er']
- else
- [ syll ]
+ def pinyin_separator_regexp
+ # A regular expression that matches every character that can *not* appear in pinyin.
# The negated character class is assembled from the distinct, lower-cased characters of all syllables.
# NOTE(review): the characters are interpolated without Regexp.escape, which is only safe as long as no
# syllable contains a character that is special inside a character class.
+ @pinyin_separator_regexp ||= Regexp.new("[^#{all_syllables.join.downcase.split("").sort.uniq.join}]+")
+ end
+
# Splits a single cluster into its syllables and returns them in reading order. A trailing erhua 'r' is
# reported as a separate 'er' entry. Raises ArgumentError when a part of the cluster cannot be matched.
+ def parse_cluster(pinyin)
+ syllables = []
+
+ # Chop off one syllable at a time from the end by continuously matching the same regular expression.
+ # This ensures the pinyin will be split into only valid pinyin syllables. Because a match capture will
+ # only contain the *last* content it has matched, we have to use a loop.
+ while match = pinyin_regexp.match(pinyin)
+ # If an 'r' at the end was matched, this implies that all other parts of the string were matched as
+ # syllables, and this cluster uses erhua.
+ if 'r' == match[3]
+ syllables << 'er'
+ pinyin = pinyin.chop
end
# Group 2 is nil when only one syllable is left; group 1 then holds that syllable.
+ last_syllable = match[2] || match[1]
+ syllables << last_syllable
# Drop the syllable just recorded from the end of the string before matching again.
+ pinyin = pinyin[0, pinyin.length - last_syllable.length]
end
+
# Anything left over at this point could not be matched by pinyin_regexp.
+ raise ArgumentError, "Unparseable pinyin fragment encountered: #{pinyin}" if !pinyin.empty?
+
# Syllables were collected back to front, so restore reading order.
+ syllables.reverse
end
# Diff hunk: parse no longer delegates to split_pinyin. It now lower-cases the input, splits it into
# clusters on pinyin_separator_regexp, splits every cluster with parse_cluster and feeds the resulting
# syllables through hanyu_reader.
def parse(pinyin)
- split_pinyin(pinyin).map(&hanyu_reader)
+ # hanyu_reader cannot parse uppercase pinyin.
+ pinyin = pinyin.downcase
+
+ clusters = pinyin.split(pinyin_separator_regexp)
+ clusters.flat_map {|cluster| parse_cluster(cluster)}.flat_map(&hanyu_reader)
end
alias call parse
end
end