# coding: utf-8
require "fonemas/version"

# Converts Spanish words into lists of candidate phoneme sequences.
#
# The main entry point is Fonemas.fonemas, which returns an Array of Strings,
# each String being one space-separated pronunciation variant.
module Fonemas
  require 'text/hyphen'
  require 'unicode_utils'

  # Returns the VERSION constant (presumably defined by "fonemas/version").
  def self.version
    VERSION
  end

  # Normalizes free text: commas become spaces, whitespace runs collapse to a
  # single space, surrounding whitespace is removed and the result is
  # downcased.
  def self.clean(text)
    s = text.gsub(/,/, ' ').gsub(/\s+/, ' ').chomp.strip
    downcase(s)
  end

  # Unicode-aware downcase (delegates to UnicodeUtils.downcase).
  def self.downcase(text)
    UnicodeUtils.downcase(text)
  end

  # Scans +word+ backwards, starting at index from-1 and ending at index 0,
  # and returns the first unaccented vowel found, or false when there is none.
  def self.lastVocal(word, from)
    (from - 1).downto(0) do |idx|
      return word[idx] if word[idx] =~ /[aeiou]/
    end
    false
  end

  # Tells whether the letter at index +i+ of +word+ (an Array as produced by
  # separar) is treated as the stressed vowel.
  #
  # Known limitation (from the original author): words that carry stress
  # without a written accent mark are not fully handled.
  #
  # NOTE(review): +i+ indexes the Array from separar, while the hyphenation
  # break positions index the joined String; the two differ after a digraph
  # ("ch", "ll", "rr") - confirm whether that is intended.
  def self.isTonica(word, i)
    return true if word.size == 1

    tildes = %w(á é í ó ú ã)
    w = word.join
    return true if tildes.include?(word[i])

    es = Text::Hyphen.new(:language => "es", :left => 0, :right => 1)
    # Presumably the break positions inside w - see the text-hyphen docs.
    cortes = es.hyphenate(w)
    silabas = es.visualize(w).split("-").size

    # A written accent exists on some other letter, so this one is not tonic.
    return false if w =~ /[áéíóú]/

    # True when the word ends in n, s or an unaccented vowel.
    termina_nsvocal = !(w =~ /[nsaeiou]$/).nil?

    if silabas == 1
      # Single syllable.
      return lastVocal(w, w.size - 1) == word[i]
    elsif silabas == 2
      if termina_nsvocal
        # Ends in n/s/vowel without accent mark: stress on the first syllable.
        return (i < cortes[0] && lastVocal(w, cortes[0]) == word[i])
      elsif i < cortes[0]
        return false
      else
        return lastVocal(w, w.size) == word[i]
      end
    elsif silabas >= 3
      if i > cortes[cortes.size - 1]
        # Letter lies after the last break: tonic only if the word does not
        # end in n/s/vowel.
        return !termina_nsvocal
      elsif i > cortes[cortes.size - 2] && i <= cortes[cortes.size - 1] && termina_nsvocal
        return true
      else
        return false
      end
    end
    false
  end

  # True when +i+ is the index of the last element of +word+.
  # (The previous comparison, word.size == i-1, could never hold for a valid
  # index, which made the "final b/v" rule in fonemas unreachable.)
  def self.isFinal(word, i)
    i == word.size - 1
  end

  # True when the letter at +i+ belongs to the list used by the b/v rule.
  # Indexes before the start of the word never match.
  def self.isFricativa(word, i)
    fricativas = %w(f s c z j ll y g b w v m x d)
    fricativas.include?(letter_at(word, i))
  end

  # True when the letter at +i+ has a vowel on both sides. Always false for
  # the first and last positions.
  def self.entreVocales(word, i)
    return false if i == 0 || word.size - 1 == i

    isVocal(word, i - 1) && isVocal(word, i + 1)
  end

  # True when exactly one neighbour of position +i+ is a vowel. A missing
  # neighbour (before the start or past the end) counts as a non-vowel.
  def self.entreVocalyConsonante(word, i)
    antes = isVocal(word, i - 1)
    despues = isVocal(word, i + 1)
    (antes && !despues) || (despues && !antes)
  end

  # True when the letter at +i+ is a vowel (with or without acute accent).
  # Indexes before the start of the word are treated as "no letter" instead
  # of wrapping around to the end of the word.
  def self.isVocal(word, i)
    vocales = %w(a e i o u á é í ó ú)
    vocales.include?(letter_at(word, i))
  end

  # True when the letters at +first+ and +second+ form one of the pairs
  # open+closed, closed+open or closed+closed (open: a e o, closed: i u).
  def self.isDiptongo(word, first, second)
    f = word[first]
    s = word[second]
    abiertas = %w(a e o)
    cerradas = %w(i u)
    (abiertas.include?(f) && cerradas.include?(s)) ||
      (abiertas.include?(s) && cerradas.include?(f)) ||
      (cerradas.include?(f) && cerradas.include?(s))
  end

  # Downcases +word+ and splits it into an Array of letters, keeping the
  # digraphs "ch", "ll" and "rr" together as single elements.
  def self.separar(word)
    word = downcase(word)
    output = []
    i = 0
    while i < word.length
      par = word[i, 2]
      if par == 'ch' || par == 'll' || par == 'rr'
        output << par
        i += 2
      else
        output << word[i]
        i += 1
      end
    end
    output
  end

  # Returns the pronunciation variants of the *name* of a single letter.
  # Raises RuntimeError for an unknown letter.
  def self.fonemaLetra(letra)
    case letra
    when 'a', 'á' then ['aa']
    when 'b' then ['b ee']
    when 'c' then ['s ee']
    when 'd' then ['d ee']
    when 'e', 'é' then ['ee']
    when 'f' then ['ee f ee']
    when 'g' then ['g ee']
    when 'h' then ['aa ch e']
    when 'i', 'í' then ['ii']
    when 'j' then ['j oo t a']
    when 'k' then ['k aa']
    when 'l' then ['ee l e']
    when 'm' then ['ee m e']
    when 'n' then ['ee n e']
    when 'ñ' then ['ee nh e']
    when 'o', 'ó' then ['oo']
    when 'p' then ['p ee']
    when 'q' then ['k uu']
    when 'r' then ['ee rr ee', 'ee r ee']
    when 's' then ['ee s e']
    when 't' then ['t ee']
    when 'u', 'ú' then ['uu']
    when 'v' then ['b ee', 'uu b e']
    when 'w' then ['d o b l e b ee', 'd o b l e uu b e']
    when 'x' then ['ee k i s']
    when 'y' then ['ll ee']
    when 'z' then ['s ee t a']
    else
      raise "error, no conozco pronunciación de #{letra}"
    end
  end

  # Returns an Array with every candidate pronunciation of +word+, each one a
  # String of space-separated phonemes.
  #
  # * Apostrophes are removed first.
  # * A one-character word is spelled out via fonemaLetra.
  # * A word containing "_" is split on it; the pronunciations of all the
  #   non-empty parts are joined into a single String.
  # * Otherwise each letter is mapped by the rules below; a letter that maps
  #   to an Array contributes alternatives ('' meaning "silent").
  #
  # Raises RuntimeError when a letter has no rule.
  def self.fonemas(word)
    word = word.gsub(/'/, '')
    # Downcase so that a lone upper-case letter is handled like any other
    # upper-case input (longer words are downcased by separar).
    return fonemaLetra(downcase(word)) if word.size == 1

    if word.include?('_')
      partes = word.split('_').reject { |a| a.size == 0 }
      return [partes.map { |a| fonemas(a) }.join(" ")]
    end

    word = separar(word)
    fonema = []
    word.each_index do |i|
      letra = word[i]
      # nil at i == 0: there is no previous letter (a plain word[i-1] would
      # wrap around to the last letter of the word).
      previa = letter_at(word, i - 1)
      siguiente = word[i + 1]

      case letra
      when 'á', 'ã'
        fonema << 'aa'
      when 'é'
        fonema << 'ee'
      when 'í'
        fonema << 'ii'
      when 'ó'
        fonema << 'oo'
      when 'ú'
        fonema << 'uu'
      when 'a'
        fonema << (isTonica(word, i) ? 'aa' : 'a')
      when 'b', 'v'
        if isVocal(word, i - 1) && (siguiente == 'b' || siguiente == 'v')
          fonema << ['bb', '']
        elsif i == 0 && isVocal(word, i + 1)
          if siguiente == 'u' && isDiptongo(word, i + 1, i + 2)
            fonema << ['bb', 'b', 'g']
          else
            fonema << ['bb', 'b']
          end
        elsif siguiente == 'u' && isDiptongo(word, i + 1, i + 2)
          if entreVocales(word, i)
            fonema << ['b', 'g', '']
          else
            fonema << ['bb', 'g']
          end
        elsif isFricativa(word, i - 1)
          fonema << 'b'
        elsif isFinal(word, i)
          fonema << 'b'
        elsif entreVocales(word, i)
          fonema << ['b', '']
        else
          fonema << 'bb'
        end
      when 'c'
        fonema << (siguiente == 'e' || siguiente == 'i' ? 's' : 'k')
      when 'ch'
        if entreVocales(word, i)
          fonema << ['ch', 'sh', 'tch', 'j']
        else
          fonema << ['ch', 'sh', 'tch']
        end
      when 'd'
        if i == 0 && isVocal(word, i + 1)
          fonema << ['dd', 'd']
        elsif entreVocales(word, i) || i == word.size - 1
          fonema << ['d', '']
        elsif entreVocalyConsonante(word, i)
          fonema << ['dd', 'd']
        else
          fonema << 'd'
        end
      when 'e'
        fonema << (isTonica(word, i) ? 'ee' : 'e')
      when 'f'
        fonema << 'f'
      when 'g'
        if siguiente == 'u' && i == 0 && isTonica(word, i + 2)
          # Emits nothing here: the following 'u' rule emits 'gu'.
        elsif siguiente == 'e' || siguiente == 'i'
          fonema << 'j'
        elsif !entreVocales(word, i) && previa != 'n'
          fonema << 'gg'
        else
          fonema << 'g'
        end
      when 'h'
        # Silent, except that before a 'u' diphthong it may sound as 'g'.
        fonema << ['', 'g'] if siguiente == 'u' && isDiptongo(word, i + 1, i + 2)
      when 'i'
        fonema << (isTonica(word, i) ? 'ii' : 'i')
      when 'j'
        fonema << 'j'
      when 'k'
        fonema << 'k'
      when 'l'
        fonema << 'l'
      when 'll'
        fonema << ['ll', 'lli', 'i']
      when 'm'
        fonema << 'm'
      when 'n'
        fonema << 'n'
      when 'ñ'
        fonema << 'nh'
      when 'o'
        fonema << (isTonica(word, i) ? 'oo' : 'o')
      when 'p'
        fonema << 'p'
      when 'q'
        fonema << 'k'
      when 'r'
        fonema << (i == 0 ? 'rr' : 'r')
      when 'rr'
        fonema << 'rr'
      when 's'
        if previa == 'r' || previa == 'd' || i == word.size - 1
          fonema << ['s', '', 'h']
        elsif entreVocalyConsonante(word, i)
          fonema << ['s', 'h']
        elsif previa == 'b' && siguiente == 't'
          fonema << ['s', 'h']
        elsif previa == 'b'
          fonema << ['s', '']
        else
          fonema << 's'
        end
      when 't'
        fonema << 't'
      when 'ü'
        fonema << 'u'
      when 'u'
        if previa == 'g' && i == 1 && isTonica(word, i + 1)
          # Counterpart of the word-initial 'g' rule that emitted nothing.
          fonema << ['gu']
        elsif isTonica(word, i)
          fonema << 'uu'
        else
          fonema << 'u'
        end
      when 'w'
        if i == 0
          fonema << ['b', 'bb']
        elsif previa == 'o'
          fonema << 'u'
        elsif siguiente == 'i'
          fonema << 'u'
        else
          fonema << 'gu'
        end
      when 'x'
        fonema << ['ks', 'k', 'h']
      when 'y'
        if i == word.size - 1
          fonema << 'i'
        else
          fonema << ['ll', 'lli', 'i']
        end
      when 'z'
        if i == word.size - 1
          fonema << ['s', 'h', '']
        else
          fonema << 's'
        end
      else
        raise "error, no conozco pronunciación de #{letra}"
      end
    end

    # NOTE(review): the result is not validated against lista_de_fonemas;
    # the original file carried such a check only as commented-out code.
    normalize(generateFonemas(fonema))
  end

  # Expands +fonema+ (an Array whose elements are either a phoneme String or
  # an Array of alternative phonemes) into every combination.
  #
  # Returns a single String when no element from index +i+ onwards is an
  # Array, otherwise a (possibly nested) Array of Strings. Empty-string
  # phonemes are skipped. +current+ holds the phonemes chosen so far.
  def self.generateFonemas(fonema, i = 0, current = [])
    return current.join(' ') if i == fonema.length

    c = fonema[i]
    if c.is_a?(Array)
      c.map do |alternativa|
        siguiente = alternativa == '' ? current : current + [alternativa]
        generateFonemas(fonema, i + 1, siguiente)
      end
    else
      siguiente = c == '' ? current : current + [c]
      generateFonemas(fonema, i + 1, siguiente)
    end
  end

  # Flattens the output of generateFonemas into a flat Array. A non-Array
  # argument is returned wrapped in a one-element Array.
  #
  # Implemented without shared module state so that it is reentrant.
  def self.normalize(t)
    [t].flatten
  end

  # Legacy helper kept for backward compatibility; normalize no longer uses
  # it. Appends +t+ (recursively, when it is an Array) to the module-level
  # @output accumulator.
  def self.anormalize(t)
    @output ||= []
    if t.is_a?(Array)
      t.each { |elemento| anormalize(elemento) }
    else
      @output << t
    end
  end

  # Returns the list of phoneme symbols, without duplicates, starting with
  # 'SIL'.
  def self.lista_de_fonemas
    phonelist = ['SIL']
    phonelist += %w{a e i o u aa ee ii oo uu}
    phonelist += %w{bb b d e f g h i j k l m n o p q rr r s t u w ks k h gu ch tch sh dd gg ll lli nh}
    phonelist.uniq
  end

  # Returns word[i], or nil when +i+ is negative, so that looking "before the
  # first letter" never wraps around to the end of the word.
  def self.letter_at(word, i)
    i < 0 ? nil : word[i]
  end
  private_class_method :letter_at
end