lib/pascoale/syllable_separator.rb in pascoale-0.2.1 vs lib/pascoale/syllable_separator.rb in pascoale-0.3.0

- old
+ new

@@ -2,27 +2,40 @@ class SyllableSeparator include Constants ONSET = "(?:ch|lh|nh|gu|qu|[pbtdcgfv][lr]|[#{CONSONANTS}])" - # Still in doubt if we should add suffixes to the "i" semivowel... - # it slightly improves the the matches, but some of them causes more - # noise than fix things =\ - #NUCLEUS = "(?:ãe|ão|õe|[#{VOWELS}](?:u|i(?!nh|r$|m$|dora?$|ção$|dade$))?)" - NUCLEUS = "(?:ãe|ão|õe|[#{VOWELS}](?:u|i(?!nh|r$|m$|ção$|dora?$))?)" + CODA = '[bcdfghjklmnpqrstvwxz]' - CODA = "[#{CONSONANTS}]" + # Biggest problem are "sinéreses" and "diéreses". + # It seems some consonants like "n" and "m" in the next syllable can cause it. + NUCLEUS_RULES = ['ãe', + 'ão', + 'õe', + 'au', + 'ou', + 'iu(?!m$)', + '[áâàãéêíóôú][iu]', + '[aieou][iu](?=[aeo])', + "ai(?!m$|ns$|r$|ç[ãõ]|[nm]#{ONSET}|nh)", + "eu(?![nm]#{ONSET})", + "ei(?![nm]#{ONSET})", + "ui(?!m$|ns$|ç[ãõ]|r$|dade$|z|[nm]#{ONSET}|nar$|d[ao]$|dora?$)", + "oi(?!m$|ns$|ç[ãõ]|r$|dade$|z|[nm]#{ONSET}|nar$|dora?$)", + '[aáâàãeéêiíoóôuúy]'] + NUCLEUS = "(?:#{NUCLEUS_RULES.join('|')})" + # The concept of "rhyme" does not help in this algorithm. It seems the - # concept makes no sense for syllable separation in portuguese + # concept makes no sense for syllable separation in portuguese (by an algorithm, at least) KERNEL = "#{ONSET}?#{NUCLEUS}" def initialize(word) @word = word end - def separated + def separate rest = @word result = [] while rest && rest.size > 0 if rest =~ /^(#{KERNEL})(?:(#{KERNEL})|(#{CODA})(#{KERNEL})|(#{CODA}#{CODA})(#{KERNEL})|(#{CODA}#{CODA})|(#{CODA}))?(.*)$/ result << $1 + $3.to_s + $5.to_s + $7.to_s + $8.to_s @@ -31,14 +44,17 @@ # Pneu, Gnomo, Mnemônica, Pseudônimo elsif result.size == 0 if rest =~ /^([#{CONSONANTS}]#{KERNEL})(?:(#{KERNEL})|(#{CODA})(#{KERNEL})|(#{CODA}#{CODA})(#{KERNEL})|(#{CODA}#{CODA})|(#{CODA}))?(.*)$/ result << $1 + $3.to_s + $5.to_s + $7.to_s + $8.to_s rest = $2.to_s + $4.to_s + $6.to_s + $9.to_s + else + raise %(Cannot separate "#{@word}". No rule match next syllable at "#{result.join('')}|>#{rest}") end else raise %(Cannot separate "#{@word}". No rule match next syllable at "#{result.join('')}|>#{rest}") end end result end + alias separated separate end end