module Pascoale class SyllableSeparator include Constants ONSET = "(?:ch|lh|nh|gu|qu|[pbtdcgfv][lr]|[#{CONSONANTS}])" CODA = '[bcdfghjklmnpqrstvwxz]' # Biggest problem are "sinéreses" and "diéreses". # It seems some consonants like "n" and "m" in the next syllable can cause it. NUCLEUS_RULES = ['ãe', 'ão', 'õe', 'au', 'ou', 'iu(?!m$)', '[áâàãéêíóôú][iu]', '[aieou][iu](?=[aeo])', "ai(?!m$|ns$|r$|ç[ãõ]|[nm]#{ONSET}|nh)", "eu(?![nm]#{ONSET})", "ei(?![nm]#{ONSET})", "ui(?!m$|ns$|ç[ãõ]|r$|dade$|z|[nm]#{ONSET}|nar$|d[ao]$|dora?$)", "oi(?!m$|ns$|ç[ãõ]|r$|dade$|z|[nm]#{ONSET}|nar$|dora?$)", '[aáâàãeéêiíoóôuúy]'] NUCLEUS = "(?:#{NUCLEUS_RULES.join('|')})" # The concept of "rhyme" does not help in this algorithm. It seems the # concept makes no sense for syllable separation in portuguese (by an algorithm, at least) KERNEL = "#{ONSET}?#{NUCLEUS}" def initialize(word) @word = word end def separate rest = @word result = [] while rest && rest.size > 0 if rest =~ /^(#{KERNEL})(?:(#{KERNEL})|(#{CODA})(#{KERNEL})|(#{CODA}#{CODA})(#{KERNEL})|(#{CODA}#{CODA})|(#{CODA}))?(.*)$/ result << $1 + $3.to_s + $5.to_s + $7.to_s + $8.to_s rest = $2.to_s + $4.to_s + $6.to_s + $9.to_s # Special case! Hate them :( # Pneu, Gnomo, Mnemônica, Pseudônimo elsif result.size == 0 if rest =~ /^([#{CONSONANTS}]#{KERNEL})(?:(#{KERNEL})|(#{CODA})(#{KERNEL})|(#{CODA}#{CODA})(#{KERNEL})|(#{CODA}#{CODA})|(#{CODA}))?(.*)$/ result << $1 + $3.to_s + $5.to_s + $7.to_s + $8.to_s rest = $2.to_s + $4.to_s + $6.to_s + $9.to_s else raise %(Cannot separate "#{@word}". No rule match next syllable at "#{result.join('')}|>#{rest}") end else raise %(Cannot separate "#{@word}". No rule match next syllable at "#{result.join('')}|>#{rest}") end end result end alias separated separate end end