grc.rb in grc-0.1.4

- old
+ new
@@ -6,254 +6,249 @@
 module Grc
   class Error < StandardError; end
 
   @std_error = 'ERROR: String does not contain any greek. Summon the muse and try again.'
 
+  # General methods
+
+  # `grc?` (str → bool)
+  # Returns true if the string contains greek characters.
   def grc?
     !scan(/(\p{Greek})/).empty?
   end
 
-  def no_downcase_diacritics
-    return @std_error unless grc?
-
-    tr('ἀἄᾄἂᾂἆᾆᾀἁἅᾅἃᾃἇᾇᾁάάᾴὰᾲᾰᾶᾷᾱᾳἐἔἒἑἕἓέέὲἠἤᾔἢᾒἦᾖᾐἡἥᾕἣᾓἧᾗᾑήήῄὴῂῆῇῃἰἴἲἶἱἵἳἷίίὶῐῖϊϊΐῒῗῑὀὄὂὁὅὃόόὸῤῥὐὔὒὖὑὕὓὗύύὺῠῦϋΰΰΰῢῧῡὠὤᾤὢᾢὦᾦᾠὡὥᾥὣᾣὧᾧᾡώώῴὼῲῶῷῳ',
-       'ααααααααααααααααααααααααααεεεεεεεεεηηηηηηηηηηηηηηηηηηηηηηηηιιιιιιιιιιιιιιιιιιιοοοοοοοοορρυυυυυυυυυυυυυυυυυυυυωωωωωωωωωωωωωωωωωωωωωωωω')
+  # `tokenize` (str → array)
+  # Returns an array of tokens from the string.
+  def tokenize
+    gsub(/([[:punct:]]|·|·|‧|⸱|𐄁|\.|;|;)/, ' \1').split
   end
 
-  def no_upcase_diacritics
+  # `transliterate` (str → str)
+  # Returns a string with greek characters replaced with their transliteration.
+  def transliterate
     return @std_error unless grc?
 
+    result = []
     str = self
-    # Adhoc solution for odd combinations of diacritics with capital letters
-    ars = [[/[́̀͂́́́̀͂]/, ''], [/Α͂/, 'Α'], [/Η͂/, 'Η'], [/Ί|Ὶ|Ι͂|́Ι|̀Ι|͂Ι/, 'Ι'],
-           [/Ρ̓/, 'Ρ'], [/ Ὺ| ́Υ|Υ̓|Ύ|Ὺ|Υ͂|́Υ|̀Υ|͂Υ/, 'Υ'], [/͂Ω/, 'Ω']]
-    ars.each do |a|
-      str = str.gsub(/#{a[0]}/, a[1])
+    str.tokenize do |token|
+      result << if token.grc?
+                  the_word = token.gsub(/ῥ/, 'rh')
+                  the_word = the_word =~ /[ἁἅᾅἃᾃἇᾇᾁἑἕἓἡἥᾕἣᾓἧᾗᾑἱἵἳἷὁὅὃὑὕὓὗὡὥᾥὣᾣὧᾧᾡ]/ ? "h#{the_word.no_diacritics}" : the_word.no_diacritics
+                  hash.each { |k, v| the_word = the_word.gsub(/#{k}/, v) }
+                  the_word
+                else
+                  word
+                end
     end
-    str.tr('ἈἌἊἎἉἍἋἏΆᾺᾸᾹἘἜἚἙἝἛΈῈἨἬἪἮἩἭἫἯΉῊἸἼἺἾἹἽἻἿΊῚῘΪῙὈὌὊὉὍὋΌῸΡῬὙὝὛὟΎῪῨΫῩὨὬὪὮὩὭὫὯΏῺ',
-           'ΑΑΑΑΑΑΑΑΑΑΑΑΕΕΕΕΕΕΕΕΗΗΗΗΗΗΗΗΗΗΙΙΙΙΙΙΙΙΙΙΙΙΙΟΟΟΟΟΟΟΟΡΡΥΥΥΥΥΥΥΥΥΩΩΩΩΩΩΩΩΩΩ')
+    result.join(' ')
   end
 
-  def no_diacritics
-    return @std_error unless grc?
+  # Unicode Inspection Methods
 
-    no_downcase_diacritics.no_upcase_diacritics
+  # `unicode_points` (str → array)
+  # Returns an array of unicode points from the string.
+  def unicode_points
+    unpack('U*').map { |i| "\\u#{i.to_s(16).rjust(4, "0").upcase}" }
   end
 
-  def tonos_to_oxia
-    return @std_error unless grc?
+  # `hash_dump` (str → hash)
+  # Returns a hash mapping each character of the string to its `String#dump` escaped form (Char: dumped string).
+  def hash_dump
+    hash = {}
+    each_char do |character|
+      hash[character] = character.dump
+    end
+    hash
+  end
 
-    tr('άΆέΈήΉίΊΐόΌύΎΰώΏ',
-       'άΆέΈήΉίΊΐόΌύΎΰώΏ')
+  # `unicode_name` (str → array)
+  # Returns an array of unicode names from the string.
+  def unicode_name
+    require 'unicode/name'
+    each_char.map { |character| Unicode::Name.of character }
   end
 
-  def to_oxia
+  # Unicode Normalization
+
+  # `nfd` (str → str)
+  # Returns a string with the canonical decomposition of the string.
+  def nfd
+    unicode_normalize(:nfd)
+  end
+
+  # `nfc` (str → str)
+  # Returns a string with the canonical composition of the string.
+  def nfc
+    unicode_normalize(:nfc)
+  end
+
+  # Case folding
+
+  # `grc_downcase` (str → str)
+  # Returns the lowercase version of the string, normalizing (NFD, downcase, NFC) so that confusable Greek characters are resolved.
+  # See https://www.w3.org/TR/charmod-norm/#PreNormalization
+  def grc_downcase
+    nfd.downcase.nfc
+  end
+
+  # `grc_upcase` (str → str)
+  # The default `upcase` method strips diacritical marks from Greek characters.
+  # This method returns the corresponding uppercase version of the string for Greek characters, preserving diacritical marks.
+  # See pages 1-7 of http://www.tlg.uci.edu/encoding/precomposed.pdf
+  # https://icu.unicode.org/design/case/greek-upper
+  def grc_upcase
+    case_map = {
+      ᾀ: 'ᾈ',
+      ᾁ: 'ᾉ',
+      ᾂ: 'ᾊ',
+      ᾃ: 'ᾋ',
+      ᾄ: 'ᾌ',
+      ᾅ: 'ᾍ',
+      ᾆ: 'ᾎ',
+      ᾇ: 'ᾏ',
+      ᾐ: 'ᾘ',
+      ᾑ: 'ᾙ',
+      ᾒ: 'ᾚ',
+      ᾓ: 'ᾛ',
+      ᾔ: 'ᾜ',
+      ᾕ: 'ᾝ',
+      ᾖ: 'ᾞ',
+      ᾗ: 'ᾟ',
+      ᾠ: 'ᾨ',
+      ᾡ: 'ᾩ',
+      ᾢ: 'ᾪ',
+      ᾣ: 'ᾫ',
+      ᾤ: 'ᾬ',
+      ᾥ: 'ᾭ',
+      ᾦ: 'ᾮ',
+      ᾧ: 'ᾯ',
+      ᾳ: 'ᾼ',
+      ῃ: 'ῌ',
+      ῳ: 'ῼ'
+    }
+
+    nfc.each_char.map do |char|
+      if char.grc?
+        case_map[:"#{char}"] || char.upcase
+      else
+        char
+      end
+    end.join
+  end
+
+  # Diacritical marks
+
+  # `no_downcase_diacritics` (str → str)
+  # Returns a string with the diacritics removed from lowercase characters.
+  def no_downcase_diacritics
     return @std_error unless grc?
 
-    tonos_to_oxia
+    each_char.map do |char| # Loop through each character
+      if char.grc? && char.lower? # If character is greek and lowercase
+        char.nfd.gsub(/\p{Mn}/, '').nfc # decompose, remove non-spacing markers (diacritics), recompose and return
+      else # else
+        char # return char
+      end
+    end.join # end char loop
   end
 
-  def oxia_to_tonos
+  # `no_upcase_diacritics` (str → str)
+  # Returns a string with the diacritics removed from uppercase characters.
+  def no_upcase_diacritics
     return @std_error unless grc?
 
-    tr('άΆέΈήΉίΊΐόΌύΎΰώΏ',
-       'άΆέΈήΉίΊΐόΌύΎΰώΏ')
+    each_char.map do |char| # Loop through each character
+      if char.grc? && char.upper? # If character is greek and uppercase
+        char.nfd.gsub(/\p{Mn}/, '').nfc # Decompose, remove non-spacing markers (diacritics), recompose and return
+      else # else
+        char # Return char
+      end
+    end.join
   end
 
-  def to_tonos
+  # `no_diacritics` (str → str)
+  # Returns a string with the diacritics removed.
+  def no_diacritics
     return @std_error unless grc?
 
-    oxia_to_tonos
+    no_downcase_diacritics.no_upcase_diacritics
   end
 
-  def acute_to_grave
+  # Accents
+
+  # `to_grave` (str → str)
+  # Returns a string with the grave replacing the acute accent.
+  def to_grave
     return @std_error unless grc?
 
+    # Simple character translation: each acute-accented character maps to its grave counterpart
     tr('ἄᾄἅᾅάάᾴἔἕέέἤᾔἥᾕήήῄἴἵίίΐὄὅόόὔὕύύΰΰὤᾤὥᾥώῴ',
        'ἂᾂἃᾃὰὰᾲἒἓὲὲἢᾒἣᾓὴὴῂἲἳὶὶῒὂὃὸὸὒὓὺὺῢῢὢᾢὣᾣὼῲ')
   end
 
-  def grave_to_acute
+  # `to_acute` (str → str)
+  # Returns a string with the acute replacing the grave accent.
+  def to_acute
     return @std_error unless grc?
 
+    # Simple character translation: each grave-accented character maps to its acute counterpart
     tr('ἂᾂἃᾃὰὰᾲἒἓὲὲἢᾒἣᾓὴὴῂἲἳὶὶῒὂὃὸὸὒὓὺὺῢῢὢᾢὣᾣὼῲ',
        'ἄᾄἅᾅάάᾴἔἕέέἤᾔἥᾕήήῄἴἵίίΐὄὅόόὔὕύύΰΰὤᾤὥᾥώῴ')
   end
 
-  def tokenize
-    gsub(/([[:punct:]]|·|·|‧|⸱|𐄁|\.|;|;)/, ' \1').split
+  # `to_oxia` (str → str)
+  # Returns a string with the oxia replacing the tonos.
+  def to_oxia
+    return @std_error unless grc?
+
+    tr('άΆέΈήΉίΊΐόΌύΎΰώΏ',
+       'άΆέΈήΉίΊΐόΌύΎΰώΏ')
   end
 
-  def unicode_char
-    each_char.map(&:to_s)
+  # `to_tonos` (str → str)
+  # Returns a string with the tonos replacing the oxia.
+  # See page 9 of http://www.tlg.uci.edu/encoding/precomposed.pdf
+  def to_tonos
+    return @std_error unless grc?
+
+    tr('άΆέΈήΉίΊΐόΌύΎΰώΏ',
+       'άΆέΈήΉίΊΐόΌύΎΰώΏ')
   end
 
-  def unicode_name
-    require 'unicode/name'
-    each_char.map { |character| Unicode::Name.of character }
+  def upper?
+    !!match(/\p{Upper}/)
   end
 
-  def unicode_points
-    unpack('U*').map { |i| "\\u#{i.to_s(16).rjust(4, "0").upcase}" }
+  def lower?
+    !!match(/\p{Lower}/)
   end
 
-  def hash_dump
-    hash = {}
-    each_char do |character|
-      hash[character] = character.dump
-    end
-    hash
+  def oxia_to_tonos
+    return @std_error unless grc?
+
+    to_tonos
   end
 
-  def transliterate
+  def tonos_to_oxia
     return @std_error unless grc?
 
-    hash = {
-      ῥ: 'rh',
-      ͱ: '',
-      Ͳ: '',
-      ͳ: '',
-      ʹ: '',
-      "\u0375": '',
-      Ͷ: '',
-      ͷ: '',
-      ͺ: '',
-      ͻ: '',
-      ͼ: '',
-      ͽ: '',
-      Α: 'a',
-      Β: 'b',
-      Γ: 'g',
-      Δ: 'd',
-      Ε: 'e',
-      Ζ: 'z',
-      Η: 'ē',
-      Θ: 'th',
-      Ι: 'i',
-      Κ: 'k',
-      Λ: 'l',
-      Μ: 'm',
-      Ν: 'n',
-      Ξ: 'x',
-      Ο: 'o',
-      Π: 'p',
-      Ρ: 'r',
-      Σ: 's',
-      Τ: 't',
-      Υ: 'y',
-      Φ: 'ph',
-      Χ: 'ch',
-      Ψ: 'ps',
-      Ω: 'ō',
-      α: 'a',
-      β: 'b',
-      γ: 'g',
-      δ: 'd',
-      ε: 'e',
-      ζ: 'z',
-      η: 'ē',
-      θ: 'th',
-      ι: 'i',
-      κ: 'k',
-      λ: 'l',
-      μ: 'm',
-      ν: 'n',
-      ξ: 'x',
-      ο: 'o',
-      π: 'p',
-      ρ: 'r',
-      ς: 's',
-      σ: 's',
-      τ: 't',
-      υ: 'y',
-      φ: 'ph',
-      χ: 'ch',
-      ψ: 'ps',
-      ω: 'ō',
-      Ϗ: '',
-      ϐ: '',
-      ϑ: '',
-      ϒ: '',
-      ϓ: '',
-      ϔ: '',
-      ϕ: '',
-      ϖ: '',
-      ϗ: '',
-      Ϙ: '',
-      ϙ: '',
-      Ϛ: '',
-      ϛ: '',
-      Ϝ: '',
-      ϝ: '',
-      Ϟ: '',
-      ϟ: '',
-      Ϡ: '',
-      ϡ: '',
-      Ϣ: '',
-      ϣ: '',
-      Ϥ: '',
-      ϥ: '',
-      Ϧ: '',
-      ϧ: '',
-      Ϩ: '',
-      ϩ: '',
-      Ϫ: '',
-      ϫ: '',
-      Ϭ: '',
-      ϭ: '',
-      Ϯ: '',
-      ϯ: '',
-      ϰ: '',
-      ϱ: '',
-      ϲ: '',
-      ϳ: '',
-      ϴ: '',
-      ϵ: '',
-      "\u03F6": '',
-      Ϸ: '',
-      ϸ: '',
-      Ϲ: '',
-      Ϻ: '',
-      ϻ: '',
-      ϼ: '',
-      Ͻ: '',
-      Ͼ: '',
-      Ͽ: '',
-      gg: 'ng',
-      gk: 'nk',
-      gx: 'nx',
-      gc: 'nc',
-      "\u{0314}": 'rh',
-      rr: 'rrh',
-      ay: 'au',
-      ey: 'eu',
-      ēy: 'ēu',
-      oy: 'ou',
-      yi: 'ui'
-    }
-    result = []
-    str = self
-    str.split.each do |word|
-      result << if word.grc?
-                  the_word = word.gsub(/ῥ/, 'rh')
-                  the_word = the_word =~ /[ἁἅᾅἃᾃἇᾇᾁἑἕἓἡἥᾕἣᾓἧᾗᾑἱἵἳἷὁὅὃὑὕὓὗὡὥᾥὣᾣὧᾧᾡ]/ ? "h#{the_word.no_diacritics}" : the_word.no_diacritics
-                  hash.each { |k, v| the_word = the_word.gsub(/#{k}/, v) }
-                  the_word
-                else
-                  word
-                end
-    end
-    result.join(' ')
+    to_oxia
   end
 
-  def nfc
-    unicode_normalize(:nfc)
+  def grave_to_acute
+    return @std_error unless grc?
+
+    to_acute
   end
 
-  def nfd
-    unicode_normalize(:nfd)
+  def acute_to_grave
+    return @std_error unless grc?
+
+    to_grave
   end
+
 end
+
 
 class String
   include Grc
 end