lib/grc.rb in grc-0.1.3 vs lib/grc.rb in grc-0.1.4

- old
+ new

@@ -6,254 +6,249 @@ module Grc class Error < StandardError; end @std_error = 'ERROR: String does not contain any greek. Summon the muse and try again.' + # General methods + + # `grc?` (str → bool) + # Returns true if the string contains greek characters. def grc? !scan(/(\p{Greek})/).empty? end - def no_downcase_diacritics - return @std_error unless grc? - - tr('ἀἄᾄἂᾂἆᾆᾀἁἅᾅἃᾃἇᾇᾁάάᾴὰᾲᾰᾶᾷᾱᾳἐἔἒἑἕἓέέὲἠἤᾔἢᾒἦᾖᾐἡἥᾕἣᾓἧᾗᾑήήῄὴῂῆῇῃἰἴἲἶἱἵἳἷίίὶῐῖϊϊΐῒῗῑὀὄὂὁὅὃόόὸῤῥὐὔὒὖὑὕὓὗύύὺῠῦϋΰΰΰῢῧῡὠὤᾤὢᾢὦᾦᾠὡὥᾥὣᾣὧᾧᾡώώῴὼῲῶῷῳ', - 'ααααααααααααααααααααααααααεεεεεεεεεηηηηηηηηηηηηηηηηηηηηηηηηιιιιιιιιιιιιιιιιιιιοοοοοοοοορρυυυυυυυυυυυυυυυυυυυυωωωωωωωωωωωωωωωωωωωωωωωω') + # `tokenize` (str → array) + # Returns an array of tokens from the string. + def tokenize + gsub(/([[:punct:]]|·|·|‧|⸱|𐄁|\.|;|;)/, ' \1').split end - def no_upcase_diacritics + # `transliterate` (str → str) + # Returns a string with greek characters replaced with their transliteration. + def transliterate return @std_error unless grc? + result = [] str = self - # Adhoc solution for odd combinations of diacritics with capital letters - ars = [[/[́̀͂́́́̀͂]/, ''], [/Α͂/, 'Α'], [/Η͂/, 'Η'], [/Ί|Ὶ|Ι͂|́Ι|̀Ι|͂Ι/, 'Ι'], - [/Ρ̓/, 'Ρ'], [/ Ὺ| ́Υ|Υ̓|Ύ|Ὺ|Υ͂|́Υ|̀Υ|͂Υ/, 'Υ'], [/͂Ω/, 'Ω']] - ars.each do |a| - str = str.gsub(/#{a[0]}/, a[1]) + str.tokenize do |token| + result << if token.grc? + the_word = token.gsub(/ῥ/, 'rh') + the_word = the_word =~ /[ἁἅᾅἃᾃἇᾇᾁἑἕἓἡἥᾕἣᾓἧᾗᾑἱἵἳἷὁὅὃὑὕὓὗὡὥᾥὣᾣὧᾧᾡ]/ ? "h#{the_word.no_diacritics}" : the_word.no_diacritics + hash.each { |k, v| the_word = the_word.gsub(/#{k}/, v) } + the_word + else + word + end end - str.tr('ἈἌἊἎἉἍἋἏΆᾺᾸᾹἘἜἚἙἝἛΈῈἨἬἪἮἩἭἫἯΉῊἸἼἺἾἹἽἻἿΊῚῘΪῙὈὌὊὉὍὋΌῸΡῬὙὝὛὟΎῪῨΫῩὨὬὪὮὩὭὫὯΏῺ', - 'ΑΑΑΑΑΑΑΑΑΑΑΑΕΕΕΕΕΕΕΕΗΗΗΗΗΗΗΗΗΗΙΙΙΙΙΙΙΙΙΙΙΙΙΟΟΟΟΟΟΟΟΡΡΥΥΥΥΥΥΥΥΥΩΩΩΩΩΩΩΩΩΩ') + result.join(' ') end - def no_diacritics - return @std_error unless grc? + # Unicode Inspection Methods - no_downcase_diacritics.no_upcase_diacritics + # `unicode_points` (str → array) + # Returns an array of unicode points from the string. + def unicode_points + unpack('U*').map { |i| "\\u#{i.to_s(16).rjust(4, "0").upcase}" } end - def tonos_to_oxia - return @std_error unless grc? + # `hash_dump`: (str → hash) + # Returns a hash of the string's unicode points (Char: Unicode_points). + def hash_dump + hash = {} + each_char do |character| + hash[character] = character.dump + end + hash + end - tr('άΆέΈήΉίΊΐόΌύΎΰώΏ', - 'άΆέΈήΉίΊΐόΌύΎΰώΏ') + # `unicode_name` (str → array) + # Returns an array of unicode names from the string. + def unicode_name + require 'unicode/name' + each_char.map { |character| Unicode::Name.of character } end - def to_oxia + # Unicode Normalization + + # `nfd` (str → str) + # Returns a string with the canonical decomposition of the string. + def nfd + unicode_normalize(:nfd) + end + + # `nfc` (str → str) + # Returns a string with the canonical composition of the string. + def nfc + unicode_normalize(:nfc) + end + + # Case folding + + # `grc_downcase` (str → str) + # Returns the lowercase version of string for greek characters resolving confusable characters. + # See https://www.w3.org/TR/charmod-norm/#PreNormalization + def grc_downcase + nfd.downcase.nfc + end + + # `grc_upcase` (str → str) + # Default `upcase` methods strips diacritical marks from greek characters. + # This method returns the corresponding uppercase version of string for greek characters preserving diacritical marks. + # See pages 1-7 of http://www.tlg.uci.edu/encoding/precomposed.pdf + # https://icu.unicode.org/design/case/greek-upper + def grc_upcase + case_map = { + ᾀ: 'ᾈ', + ᾁ: 'ᾉ', + ᾂ: 'ᾊ', + ᾃ: 'ᾋ', + ᾄ: 'ᾌ', + ᾅ: 'ᾍ', + ᾆ: 'ᾎ', + ᾇ: 'ᾏ', + ᾐ: 'ᾘ', + ᾑ: 'ᾙ', + ᾒ: 'ᾚ', + ᾓ: 'ᾛ', + ᾔ: 'ᾜ', + ᾕ: 'ᾝ', + ᾖ: 'ᾞ', + ᾗ: 'ᾟ', + ᾠ: 'ᾨ', + ᾡ: 'ᾩ', + ᾢ: 'ᾪ', + ᾣ: 'ᾫ', + ᾤ: 'ᾬ', + ᾥ: 'ᾭ', + ᾦ: 'ᾮ', + ᾧ: 'ᾯ', + ᾳ: 'ᾼ', + ῃ: 'ῌ', + ῳ: 'ῼ' + } + + nfc.each_char.map do |char| + if char.grc? + case_map[:"#{char}"] || char.upcase + else + char + end + end.join + end + + # Diacritical marks + + # `no_downcase_diacritics` (str → str) + # Returns a string with the diacritics removed from lowercase characters. + def no_downcase_diacritics return @std_error unless grc? - tonos_to_oxia + each_char.map do |char| # Loop through each character + if char.grc? && char.lower? # If character is greek and lowercase + char.nfd.gsub(/\p{Mn}/, '').nfc # decompose, remove non-spacing markers (diacritics), recompose and return + else # else + char # return char + end + end.join # end char loop end - def oxia_to_tonos + # `no_upcase_diacritics` (str → str) + # Returns a string with the diacritics removed from uppercase characters. + def no_upcase_diacritics return @std_error unless grc? - tr('άΆέΈήΉίΊΐόΌύΎΰώΏ', - 'άΆέΈήΉίΊΐόΌύΎΰώΏ') + each_char.map do |char| # Loop through each character + if char.grc? && char.upper? # If character is greek and uppercase + char.nfd.gsub(/\p{Mn}/, '').nfc # Decompose, remove non-spacing markers (diacritics), recompose and return + else # else + char # Return char + end + end.join end - def to_tonos + # `no_diacritics` (str → str) + # Returns a string with the diacritics removed. + def no_diacritics return @std_error unless grc? - oxia_to_tonos + no_downcase_diacritics.no_upcase_diacritics end - def acute_to_grave + # Accents + + # `to_grave` (str → str) + # Returns a string with the grave replacing the acute accent. + def to_grave return @std_error unless grc? + # Simple transform method with grave to acute mapping tr('ἄᾄἅᾅάάᾴἔἕέέἤᾔἥᾕήήῄἴἵίίΐὄὅόόὔὕύύΰΰὤᾤὥᾥώῴ', 'ἂᾂἃᾃὰὰᾲἒἓὲὲἢᾒἣᾓὴὴῂἲἳὶὶῒὂὃὸὸὒὓὺὺῢῢὢᾢὣᾣὼῲ') end - def grave_to_acute + # `to_acute` (str → str) + # Returns a string with the acute replacing the grave accent. + def to_acute return @std_error unless grc? + # Simple transform method with acute to grave mapping tr('ἂᾂἃᾃὰὰᾲἒἓὲὲἢᾒἣᾓὴὴῂἲἳὶὶῒὂὃὸὸὒὓὺὺῢῢὢᾢὣᾣὼῲ', 'ἄᾄἅᾅάάᾴἔἕέέἤᾔἥᾕήήῄἴἵίίΐὄὅόόὔὕύύΰΰὤᾤὥᾥώῴ') end - def tokenize - gsub(/([[:punct:]]|·|·|‧|⸱|𐄁|\.|;|;)/, ' \1').split + # `to_oxia` (str → str) + # Returns a string with the oxia replacing the tonos. + def to_oxia + return @std_error unless grc? + + tr('άΆέΈήΉίΊΐόΌύΎΰώΏ', + 'άΆέΈήΉίΊΐόΌύΎΰώΏ') end - def unicode_char - each_char.map(&:to_s) + # `to_tonos` (str → str) + # Returns a string with the tonos replacing the oxia. + # See page 9 of http://www.tlg.uci.edu/encoding/precomposed.pdf + def to_tonos + return @std_error unless grc? + + tr('άΆέΈήΉίΊΐόΌύΎΰώΏ', + 'άΆέΈήΉίΊΐόΌύΎΰώΏ') end - def unicode_name - require 'unicode/name' - each_char.map { |character| Unicode::Name.of character } + def upper? + !!match(/\p{Upper}/) end - def unicode_points - unpack('U*').map { |i| "\\u#{i.to_s(16).rjust(4, "0").upcase}" } + def lower? + !!match(/\p{Lower}/) end - def hash_dump - hash = {} - each_char do |character| - hash[character] = character.dump - end - hash + def oxia_to_tonos + return @std_error unless grc? + + to_tonos end - def transliterate + def tonos_to_oxia return @std_error unless grc? - hash = { - ῥ: 'rh', - ͱ: '', - Ͳ: '', - ͳ: '', - ʹ: '', - "\u0375": '', - Ͷ: '', - ͷ: '', - ͺ: '', - ͻ: '', - ͼ: '', - ͽ: '', - Α: 'a', - Β: 'b', - Γ: 'g', - Δ: 'd', - Ε: 'e', - Ζ: 'z', - Η: 'ē', - Θ: 'th', - Ι: 'i', - Κ: 'k', - Λ: 'l', - Μ: 'm', - Ν: 'n', - Ξ: 'x', - Ο: 'o', - Π: 'p', - Ρ: 'r', - Σ: 's', - Τ: 't', - Υ: 'y', - Φ: 'ph', - Χ: 'ch', - Ψ: 'ps', - Ω: 'ō', - α: 'a', - β: 'b', - γ: 'g', - δ: 'd', - ε: 'e', - ζ: 'z', - η: 'ē', - θ: 'th', - ι: 'i', - κ: 'k', - λ: 'l', - μ: 'm', - ν: 'n', - ξ: 'x', - ο: 'o', - π: 'p', - ρ: 'r', - ς: 's', - σ: 's', - τ: 't', - υ: 'y', - φ: 'ph', - χ: 'ch', - ψ: 'ps', - ω: 'ō', - Ϗ: '', - ϐ: '', - ϑ: '', - ϒ: '', - ϓ: '', - ϔ: '', - ϕ: '', - ϖ: '', - ϗ: '', - Ϙ: '', - ϙ: '', - Ϛ: '', - ϛ: '', - Ϝ: '', - ϝ: '', - Ϟ: '', - ϟ: '', - Ϡ: '', - ϡ: '', - Ϣ: '', - ϣ: '', - Ϥ: '', - ϥ: '', - Ϧ: '', - ϧ: '', - Ϩ: '', - ϩ: '', - Ϫ: '', - ϫ: '', - Ϭ: '', - ϭ: '', - Ϯ: '', - ϯ: '', - ϰ: '', - ϱ: '', - ϲ: '', - ϳ: '', - ϴ: '', - ϵ: '', - "\u03F6": '', - Ϸ: '', - ϸ: '', - Ϲ: '', - Ϻ: '', - ϻ: '', - ϼ: '', - Ͻ: '', - Ͼ: '', - Ͽ: '', - gg: 'ng', - gk: 'nk', - gx: 'nx', - gc: 'nc', - "\u{0314}": 'rh', - rr: 'rrh', - ay: 'au', - ey: 'eu', - ēy: 'ēu', - oy: 'ou', - yi: 'ui' - } - result = [] - str = self - str.split.each do |word| - result << if word.grc? - the_word = word.gsub(/ῥ/, 'rh') - the_word = the_word =~ /[ἁἅᾅἃᾃἇᾇᾁἑἕἓἡἥᾕἣᾓἧᾗᾑἱἵἳἷὁὅὃὑὕὓὗὡὥᾥὣᾣὧᾧᾡ]/ ? "h#{the_word.no_diacritics}" : the_word.no_diacritics - hash.each { |k, v| the_word = the_word.gsub(/#{k}/, v) } - the_word - else - word - end - end - result.join(' ') + to_oxia end - def nfc - unicode_normalize(:nfc) + def grave_to_acute + return @std_error unless grc? + + to_acute end - def nfd - unicode_normalize(:nfd) + def acute_to_grave + return @std_error unless grc? + + to_grave end + end + class String include Grc end