lib/grc.rb in grc-0.1.3 vs lib/grc.rb in grc-0.1.4
- old
+ new
@@ -6,254 +6,249 @@
module Grc
class Error < StandardError; end
@std_error = 'ERROR: String does not contain any greek. Summon the muse and try again.'
+ # General methods
+
+ # `grc?` (str → bool)
+ # Returns true if the string contains greek characters.
def grc?
!scan(/(\p{Greek})/).empty?
end
- def no_downcase_diacritics
- return @std_error unless grc?
-
- tr('ἀἄᾄἂᾂἆᾆᾀἁἅᾅἃᾃἇᾇᾁάάᾴὰᾲᾰᾶᾷᾱᾳἐἔἒἑἕἓέέὲἠἤᾔἢᾒἦᾖᾐἡἥᾕἣᾓἧᾗᾑήήῄὴῂῆῇῃἰἴἲἶἱἵἳἷίίὶῐῖϊϊΐῒῗῑὀὄὂὁὅὃόόὸῤῥὐὔὒὖὑὕὓὗύύὺῠῦϋΰΰΰῢῧῡὠὤᾤὢᾢὦᾦᾠὡὥᾥὣᾣὧᾧᾡώώῴὼῲῶῷῳ',
- 'ααααααααααααααααααααααααααεεεεεεεεεηηηηηηηηηηηηηηηηηηηηηηηηιιιιιιιιιιιιιιιιιιιοοοοοοοοορρυυυυυυυυυυυυυυυυυυυυωωωωωωωωωωωωωωωωωωωωωωωω')
+ # `tokenize` (str → array)
+ # Returns an array of tokens from the string.
+ def tokenize
+ gsub(/([[:punct:]]|·|·|‧|⸱|𐄁|\.|;|;)/, ' \1').split
end
- def no_upcase_diacritics
+ # `transliterate` (str → str)
+ # Returns a string with greek characters replaced with their transliteration.
+ def transliterate
return @std_error unless grc?
+ result = []
str = self
- # Adhoc solution for odd combinations of diacritics with capital letters
- ars = [[/[́̀͂́́́̀͂]/, ''], [/Α͂/, 'Α'], [/Η͂/, 'Η'], [/Ί|Ὶ|Ι͂|́Ι|̀Ι|͂Ι/, 'Ι'],
- [/Ρ̓/, 'Ρ'], [/ Ὺ| ́Υ|Υ̓|Ύ|Ὺ|Υ͂|́Υ|̀Υ|͂Υ/, 'Υ'], [/͂Ω/, 'Ω']]
- ars.each do |a|
- str = str.gsub(/#{a[0]}/, a[1])
+ str.tokenize do |token|
+ result << if token.grc?
+ the_word = token.gsub(/ῥ/, 'rh')
+ the_word = the_word =~ /[ἁἅᾅἃᾃἇᾇᾁἑἕἓἡἥᾕἣᾓἧᾗᾑἱἵἳἷὁὅὃὑὕὓὗὡὥᾥὣᾣὧᾧᾡ]/ ? "h#{the_word.no_diacritics}" : the_word.no_diacritics
+ hash.each { |k, v| the_word = the_word.gsub(/#{k}/, v) }
+ the_word
+ else
+ word
+ end
end
- str.tr('ἈἌἊἎἉἍἋἏΆᾺᾸᾹἘἜἚἙἝἛΈῈἨἬἪἮἩἭἫἯΉῊἸἼἺἾἹἽἻἿΊῚῘΪῙὈὌὊὉὍὋΌῸΡῬὙὝὛὟΎῪῨΫῩὨὬὪὮὩὭὫὯΏῺ',
- 'ΑΑΑΑΑΑΑΑΑΑΑΑΕΕΕΕΕΕΕΕΗΗΗΗΗΗΗΗΗΗΙΙΙΙΙΙΙΙΙΙΙΙΙΟΟΟΟΟΟΟΟΡΡΥΥΥΥΥΥΥΥΥΩΩΩΩΩΩΩΩΩΩ')
+ result.join(' ')
end
- def no_diacritics
- return @std_error unless grc?
+ # Unicode Inspection Methods
- no_downcase_diacritics.no_upcase_diacritics
+ # `unicode_points` (str → array)
+ # Returns an array of the string's code points, each formatted as a `\uXXXX` escape (uppercase hex, zero-padded to at least 4 digits).
+ def unicode_points
+ unpack('U*').map { |i| "\\u#{i.to_s(16).rjust(4, "0").upcase}" }
end
- def tonos_to_oxia
- return @std_error unless grc?
+ # `hash_dump`: (str → hash)
+ # Returns a hash mapping each character of the string to its `String#dump` representation (Char: dumped string).
+ def hash_dump
+ hash = {}
+ each_char do |character|
+ hash[character] = character.dump
+ end
+ hash
+ end
- tr('άΆέΈήΉίΊΐόΌύΎΰώΏ',
- 'άΆέΈήΉίΊΐόΌύΎΰώΏ')
+ # `unicode_name` (str → array)
+ # Returns an array of unicode names from the string.
+ def unicode_name
+ require 'unicode/name'
+ each_char.map { |character| Unicode::Name.of character }
end
- def to_oxia
+ # Unicode Normalization
+
+ # `nfd` (str → str)
+ # Returns a string with the canonical decomposition of the string.
+ def nfd
+ unicode_normalize(:nfd)
+ end
+
+ # `nfc` (str → str)
+ # Returns a string with the canonical composition of the string.
+ def nfc
+ unicode_normalize(:nfc)
+ end
+
+ # Case folding
+
+ # `grc_downcase` (str → str)
+ # Returns the lowercase version of the string, resolving confusable greek characters by normalizing (NFD, then downcase, then NFC).
+ # See https://www.w3.org/TR/charmod-norm/#PreNormalization
+ def grc_downcase
+ nfd.downcase.nfc
+ end
+
+ # `grc_upcase` (str → str)
+ # The default `upcase` method strips diacritical marks from greek characters.
+ # This method returns the corresponding uppercase version of string for greek characters preserving diacritical marks.
+ # See pages 1-7 of http://www.tlg.uci.edu/encoding/precomposed.pdf
+ # https://icu.unicode.org/design/case/greek-upper
+ def grc_upcase
+ case_map = {
+ ᾀ: 'ᾈ',
+ ᾁ: 'ᾉ',
+ ᾂ: 'ᾊ',
+ ᾃ: 'ᾋ',
+ ᾄ: 'ᾌ',
+ ᾅ: 'ᾍ',
+ ᾆ: 'ᾎ',
+ ᾇ: 'ᾏ',
+ ᾐ: 'ᾘ',
+ ᾑ: 'ᾙ',
+ ᾒ: 'ᾚ',
+ ᾓ: 'ᾛ',
+ ᾔ: 'ᾜ',
+ ᾕ: 'ᾝ',
+ ᾖ: 'ᾞ',
+ ᾗ: 'ᾟ',
+ ᾠ: 'ᾨ',
+ ᾡ: 'ᾩ',
+ ᾢ: 'ᾪ',
+ ᾣ: 'ᾫ',
+ ᾤ: 'ᾬ',
+ ᾥ: 'ᾭ',
+ ᾦ: 'ᾮ',
+ ᾧ: 'ᾯ',
+ ᾳ: 'ᾼ',
+ ῃ: 'ῌ',
+ ῳ: 'ῼ'
+ }
+
+ nfc.each_char.map do |char|
+ if char.grc?
+ case_map[:"#{char}"] || char.upcase
+ else
+ char
+ end
+ end.join
+ end
+
+ # Diacritical marks
+
+ # `no_downcase_diacritics` (str → str)
+ # Returns a string with the diacritics removed from lowercase characters.
+ def no_downcase_diacritics
return @std_error unless grc?
- tonos_to_oxia
+ each_char.map do |char| # Loop through each character
+ if char.grc? && char.lower? # If character is greek and lowercase
+ char.nfd.gsub(/\p{Mn}/, '').nfc # decompose, remove non-spacing markers (diacritics), recompose and return
+ else # else
+ char # return char
+ end
+ end.join # end char loop
end
- def oxia_to_tonos
+ # `no_upcase_diacritics` (str → str)
+ # Returns a string with the diacritics removed from uppercase characters.
+ def no_upcase_diacritics
return @std_error unless grc?
- tr('άΆέΈήΉίΊΐόΌύΎΰώΏ',
- 'άΆέΈήΉίΊΐόΌύΎΰώΏ')
+ each_char.map do |char| # Loop through each character
+ if char.grc? && char.upper? # If character is greek and uppercase
+ char.nfd.gsub(/\p{Mn}/, '').nfc # Decompose, remove non-spacing markers (diacritics), recompose and return
+ else # else
+ char # Return char
+ end
+ end.join
end
- def to_tonos
+ # `no_diacritics` (str → str)
+ # Returns a string with the diacritics removed.
+ def no_diacritics
return @std_error unless grc?
- oxia_to_tonos
+ no_downcase_diacritics.no_upcase_diacritics
end
- def acute_to_grave
+ # Accents
+
+ # `to_grave` (str → str)
+ # Returns a string with the grave replacing the acute accent.
+ def to_grave
return @std_error unless grc?
+ # Simple transform method with acute to grave mapping
tr('ἄᾄἅᾅάάᾴἔἕέέἤᾔἥᾕήήῄἴἵίίΐὄὅόόὔὕύύΰΰὤᾤὥᾥώῴ',
'ἂᾂἃᾃὰὰᾲἒἓὲὲἢᾒἣᾓὴὴῂἲἳὶὶῒὂὃὸὸὒὓὺὺῢῢὢᾢὣᾣὼῲ')
end
- def grave_to_acute
+ # `to_acute` (str → str)
+ # Returns a string with the acute replacing the grave accent.
+ def to_acute
return @std_error unless grc?
+ # Simple transform method with grave to acute mapping
tr('ἂᾂἃᾃὰὰᾲἒἓὲὲἢᾒἣᾓὴὴῂἲἳὶὶῒὂὃὸὸὒὓὺὺῢῢὢᾢὣᾣὼῲ',
'ἄᾄἅᾅάάᾴἔἕέέἤᾔἥᾕήήῄἴἵίίΐὄὅόόὔὕύύΰΰὤᾤὥᾥώῴ')
end
- def tokenize
- gsub(/([[:punct:]]|·|·|‧|⸱|𐄁|\.|;|;)/, ' \1').split
+ # `to_oxia` (str → str)
+ # Returns a string with the oxia replacing the tonos.
+ def to_oxia
+ return @std_error unless grc?
+
+ tr('άΆέΈήΉίΊΐόΌύΎΰώΏ',
+ 'άΆέΈήΉίΊΐόΌύΎΰώΏ')
end
- def unicode_char
- each_char.map(&:to_s)
+ # `to_tonos` (str → str)
+ # Returns a string with the tonos replacing the oxia.
+ # See page 9 of http://www.tlg.uci.edu/encoding/precomposed.pdf
+ def to_tonos
+ return @std_error unless grc?
+
+ tr('άΆέΈήΉίΊΐόΌύΎΰώΏ',
+ 'άΆέΈήΉίΊΐόΌύΎΰώΏ')
end
- def unicode_name
- require 'unicode/name'
- each_char.map { |character| Unicode::Name.of character }
+ def upper?
+ !!match(/\p{Upper}/)
end
- def unicode_points
- unpack('U*').map { |i| "\\u#{i.to_s(16).rjust(4, "0").upcase}" }
+ def lower?
+ !!match(/\p{Lower}/)
end
- def hash_dump
- hash = {}
- each_char do |character|
- hash[character] = character.dump
- end
- hash
+ def oxia_to_tonos
+ return @std_error unless grc?
+
+ to_tonos
end
- def transliterate
+ def tonos_to_oxia
return @std_error unless grc?
- hash = {
- ῥ: 'rh',
- ͱ: '',
- Ͳ: '',
- ͳ: '',
- ʹ: '',
- "\u0375": '',
- Ͷ: '',
- ͷ: '',
- ͺ: '',
- ͻ: '',
- ͼ: '',
- ͽ: '',
- Α: 'a',
- Β: 'b',
- Γ: 'g',
- Δ: 'd',
- Ε: 'e',
- Ζ: 'z',
- Η: 'ē',
- Θ: 'th',
- Ι: 'i',
- Κ: 'k',
- Λ: 'l',
- Μ: 'm',
- Ν: 'n',
- Ξ: 'x',
- Ο: 'o',
- Π: 'p',
- Ρ: 'r',
- Σ: 's',
- Τ: 't',
- Υ: 'y',
- Φ: 'ph',
- Χ: 'ch',
- Ψ: 'ps',
- Ω: 'ō',
- α: 'a',
- β: 'b',
- γ: 'g',
- δ: 'd',
- ε: 'e',
- ζ: 'z',
- η: 'ē',
- θ: 'th',
- ι: 'i',
- κ: 'k',
- λ: 'l',
- μ: 'm',
- ν: 'n',
- ξ: 'x',
- ο: 'o',
- π: 'p',
- ρ: 'r',
- ς: 's',
- σ: 's',
- τ: 't',
- υ: 'y',
- φ: 'ph',
- χ: 'ch',
- ψ: 'ps',
- ω: 'ō',
- Ϗ: '',
- ϐ: '',
- ϑ: '',
- ϒ: '',
- ϓ: '',
- ϔ: '',
- ϕ: '',
- ϖ: '',
- ϗ: '',
- Ϙ: '',
- ϙ: '',
- Ϛ: '',
- ϛ: '',
- Ϝ: '',
- ϝ: '',
- Ϟ: '',
- ϟ: '',
- Ϡ: '',
- ϡ: '',
- Ϣ: '',
- ϣ: '',
- Ϥ: '',
- ϥ: '',
- Ϧ: '',
- ϧ: '',
- Ϩ: '',
- ϩ: '',
- Ϫ: '',
- ϫ: '',
- Ϭ: '',
- ϭ: '',
- Ϯ: '',
- ϯ: '',
- ϰ: '',
- ϱ: '',
- ϲ: '',
- ϳ: '',
- ϴ: '',
- ϵ: '',
- "\u03F6": '',
- Ϸ: '',
- ϸ: '',
- Ϲ: '',
- Ϻ: '',
- ϻ: '',
- ϼ: '',
- Ͻ: '',
- Ͼ: '',
- Ͽ: '',
- gg: 'ng',
- gk: 'nk',
- gx: 'nx',
- gc: 'nc',
- "\u{0314}": 'rh',
- rr: 'rrh',
- ay: 'au',
- ey: 'eu',
- ēy: 'ēu',
- oy: 'ou',
- yi: 'ui'
- }
- result = []
- str = self
- str.split.each do |word|
- result << if word.grc?
- the_word = word.gsub(/ῥ/, 'rh')
- the_word = the_word =~ /[ἁἅᾅἃᾃἇᾇᾁἑἕἓἡἥᾕἣᾓἧᾗᾑἱἵἳἷὁὅὃὑὕὓὗὡὥᾥὣᾣὧᾧᾡ]/ ? "h#{the_word.no_diacritics}" : the_word.no_diacritics
- hash.each { |k, v| the_word = the_word.gsub(/#{k}/, v) }
- the_word
- else
- word
- end
- end
- result.join(' ')
+ to_oxia
end
- def nfc
- unicode_normalize(:nfc)
+ def grave_to_acute
+ return @std_error unless grc?
+
+ to_acute
end
- def nfd
- unicode_normalize(:nfd)
+ def acute_to_grave
+ return @std_error unless grc?
+
+ to_grave
end
+
end
+
class String
include Grc
end