Sha256: b96537c69ec963c0b02d0455c33ab761475a9569fcf70300c0d6a54157dc2dfc

Contents?: true

Size: 1.61 KB

Versions: 3

Compression:

Stored size: 1.61 KB

Contents

module Pascoale
  # Splits a word into syllables by repeatedly matching a syllable pattern
  # against the start of the not-yet-consumed part of the word.
  #
  # The patterns are assembled from the CONSONANTS and VOWELS character sets
  # brought in by the Constants mixin.
  #
  #   SyllableSeparator.new(word).separated # => Array of syllable strings
  class SyllableSeparator
    include Constants

    # Optional consonantal start of a syllable: a digraph, a consonant
    # followed by "l" or "r", or any single consonant.
    ONSET = "(?:ch|lh|nh|gu|qu|[pbtdcgfv][lr]|[#{CONSONANTS}])".freeze

    # Still in doubt if we should add suffixes to the "i" semivowel...
    # it slightly improves the matches, but some of them cause more
    # noise than they fix =\
    #NUCLEUS = "(?:ãe|ão|õe|[#{VOWELS}](?:u|i(?!nh|r$|m$|dora?$|ção$|dade$))?)"
    NUCLEUS = "(?:ãe|ão|õe|[#{VOWELS}](?:u|i(?!nh|r$|m$|ção$|dora?$))?)".freeze

    # A single consonant that may close a syllable.
    CODA = "[#{CONSONANTS}]".freeze

    # The concept of "rhyme" does not help in this algorithm. It seems the
    # concept makes no sense for syllable separation in portuguese
    KERNEL = "#{ONSET}?#{NUCLEUS}".freeze

    # What may follow the first kernel of a match (capture groups 2..8 of the
    # full patterns below). Consonants captured in groups 3, 5, 7 and 8 close
    # the current syllable; kernels captured in groups 2, 4 and 6 are only
    # looked at to decide the split and are handed back to be parsed next.
    FOLLOW = "(?:(#{KERNEL})|(#{CODA})(#{KERNEL})|(#{CODA}#{CODA})(#{KERNEL})|(#{CODA}#{CODA})|(#{CODA}))?".freeze

    # Regular rule. Group 1 is the kernel of the syllable being extracted and
    # group 9 is whatever text is left after the part inspected by FOLLOW.
    # Compiled once here instead of on every loop iteration.
    SYLLABLE = /^(#{KERNEL})#{FOLLOW}(.*)$/

    # Special case! Hate them :(
    # Pneu, Gnomo, Mnemônica, Pseudônimo
    # Same as SYLLABLE, but accepts one extra leading consonant. It is only
    # tried for the first syllable of a word.
    INITIAL_CLUSTER_SYLLABLE = /^([#{CONSONANTS}]#{KERNEL})#{FOLLOW}(.*)$/

    # Capture groups whose text belongs to the syllable being emitted.
    SYLLABLE_GROUPS = [1, 3, 5, 7, 8].freeze

    # Capture groups whose text still has to be separated.
    REST_GROUPS = [2, 4, 6, 9].freeze

    # word - the word to be separated into syllables.
    def initialize(word)
      @word = word
    end

    # Returns the syllables of the word, in order, as an Array of Strings.
    # A nil or empty word yields an empty Array.
    #
    # Raises RuntimeError when no rule matches at the current position. This
    # includes a failure on the very first syllable, which previously left
    # the loop spinning forever because nothing consumed input or raised.
    def separated
      rest = @word
      result = []
      while rest && !rest.empty?
        match = SYLLABLE.match(rest)
        # The extra-consonant rule is only valid at the start of the word.
        match ||= INITIAL_CLUSTER_SYLLABLE.match(rest) if result.empty?

        unless match
          raise %(Cannot separate "#{@word}". No rule match next syllable at "#{result.join('')}|>#{rest}")
        end

        # Groups that did not take part in the match are nil; join renders
        # them as empty strings.
        result << match.values_at(*SYLLABLE_GROUPS).join
        rest = match.values_at(*REST_GROUPS).join
      end
      result
    end
  end
end

Version data entries

3 entries across 3 versions & 1 rubygems

Version Path
pascoale-0.2.1 lib/pascoale/syllable_separator.rb
pascoale-0.2.0 lib/pascoale/syllable_separator.rb
pascoale-0.1.0 lib/pascoale/syllable_separator.rb