lib/unibits/symbolify.rb in unibits-1.1.0 vs lib/unibits/symbolify.rb in unibits-1.2.0

- old
+ new

@@ -1,5 +1,7 @@ +require "unicode/categories" + module Unibits module Symbolify ASCII_CONTROL_CODEPOINTS = "\x00-\x1F\x7F".freeze ASCII_CONTROL_SYMBOLS = "\u{2400}-\u{241F}\u{2421}".freeze ASCII_CHARS = "\x20-\x7E".freeze @@ -72,24 +74,22 @@ "\u{FE0D}" => "VS14", "\u{FE0E}" => "VS15", "\u{FE0F}" => "VS16", }.freeze COULD_BE_WHITESPACE = '[\p{Space}­᠎​‌‍⁠⁡⁢⁣⁤⠀𛲠𛲡𛲢𛲣𝅳𝅴𝅵𝅶𝅷𝅸𝅹𝅺]'.freeze - # UNASSIGNED = '\p{Cn}'.freeze def self.symbolify(char, encoding = char.encoding) + return "n/a" if Unicode::Categories.category(char) == "Cn" + char.tr!( ASCII_CONTROL_CODEPOINTS.encode(encoding), ASCII_CONTROL_SYMBOLS.encode(encoding) ) char.gsub!( Regexp.compile(COULD_BE_WHITESPACE.encode(encoding)), ']\0['.encode(encoding) ) - # char.gsub!( - # Regexp.compile(UNASSIGNED.encode(encoding)), - # 'n/a'.encode(encoding) - # ) + INTERESTING_CODEPOINTS.each{ |cp, desc| char.gsub! Regexp.compile(cp.encode(encoding)), desc.encode(encoding) } char.gsub! TAG_START.encode(encoding), TAG_START_SYMBOL.encode(encoding) char.gsub! TAG_SPACE.encode(encoding), TAG_SPACE_SYMBOL.encode(encoding)