# encoding: UTF-8

# Copyright 2012 Twitter, Inc
# http://www.apache.org/licenses/LICENSE-2.0

require 'forwardable'

module TwitterCldr
  module Shared
    # A regular expression assembled from a list of parsed elements, each of
    # which responds to +to_regexp_str+. Instances delegate +match+ and +=~+
    # to the regexp object returned by +to_regexp+.
    class UnicodeRegex
      # Maps single-letter modifiers to the corresponding Regexp option flags.
      MODIFIER_FLAGS = {
        "i" => Regexp::IGNORECASE,
        "m" => Regexp::MULTILINE,
        "x" => Regexp::EXTENDED
      }.freeze

      class << self
        # Tokenizes and parses +str+, then wraps the parsed elements in a new
        # UnicodeRegex.
        #
        # str          - the pattern source handed to the tokenizer.
        # modifiers    - string of modifier letters (e.g. "i", "mx"); defaults
        #                to "" (no modifiers).
        # symbol_table - passed through to the parser as the :symbol_table
        #                option; may be nil.
        #
        # Returns a UnicodeRegex.
        def compile(str, modifiers = "", symbol_table = nil)
          tokens   = tokenizer.tokenize(str)
          elements = parser.parse(tokens, { :symbol_table => symbol_table })
          new(elements, modifiers)
        end

        # All unicode characters (code points 0 through 0x10FFFF), memoized.
        def all_unicode
          @all_unicode ||= TwitterCldr::Utils::RangeSet.new(
            [0..0x10FFFF]
          )
        end

        # A few characters (i.e. 2..7) and the surrogate code points
        # (i.e. 55296..57343). These don't play nicely with Ruby's regular
        # expression engine, so they are treated as safe to disregard.
        def invalid_regexp_chars
          @invalid_regexp_chars ||= TwitterCldr::Utils::RangeSet.new(
            [2..7, 55296..57343]
          )
        end

        # Every unicode character except those in invalid_regexp_chars,
        # memoized.
        def valid_regexp_chars
          @valid_regexp_chars ||= all_unicode.subtract(invalid_regexp_chars)
        end

        private

        # Memoized tokenizer shared by all calls to compile.
        def tokenizer
          @tokenizer ||= TwitterCldr::Tokenizers::UnicodeRegexTokenizer.new
        end

        # Memoized parser shared by all calls to compile.
        def parser
          @parser ||= TwitterCldr::Parsers::UnicodeRegexParser.new
        end
      end

      extend Forwardable
      def_delegator :to_regexp, :match
      def_delegator :to_regexp, :=~

      attr_reader :elements, :modifiers

      # elements  - list of objects responding to +to_regexp_str+.
      # modifiers - nil, a string of modifier letters ("i", "m", "x"), or an
      #             Integer of Regexp option flags.
      def initialize(elements, modifiers = nil)
        @elements = elements
        # Previously this was hard-coded to nil, which silently dropped any
        # modifiers the caller supplied.
        @modifiers = modifiers
      end

      # Builds the underlying regexp object.
      #
      # On Ruby 1.8 an Oniguruma::ORegexp is built on every call; otherwise a
      # native Regexp is built once and memoized.
      #
      # Raises RuntimeError on Ruby 1.8 when the Oniguruma constant cannot be
      # resolved.
      def to_regexp
        if RUBY_VERSION <= "1.8.7"
          begin
            Oniguruma::ORegexp.new(to_regexp_str, oniguruma_options)
          rescue NameError
            raise "Unicode regular expressions require the Oniguruma gem when using Ruby 1.8. Please install, require, and retry."
          end
        else
          @regexp ||= Regexp.new(to_regexp_str, regexp_options)
        end
      end

      # Concatenation of every element's +to_regexp_str+, memoized.
      def to_regexp_str
        @regexp_str ||= elements.map(&:to_regexp_str).join
      end

      private

      # Translates +modifiers+ into something Regexp.new interprets the same
      # way on every Ruby version. Passing a String straight through is not
      # safe: older Rubies treat any truthy non-Integer options value (even
      # "") as "ignore case".
      #
      # nil and Integer values are returned unchanged. For anything else, the
      # letters found in MODIFIER_FLAGS are OR-ed together; unrecognized
      # letters are skipped so that callers whose modifiers used to be ignored
      # entirely do not start raising.
      def regexp_options
        return modifiers if modifiers.nil? || modifiers.is_a?(Integer)

        modifiers.to_s.each_char.inject(0) do |flags, letter|
          flags | MODIFIER_FLAGS.fetch(letter, 0)
        end
      end

      # Options for the Oniguruma code path. An empty or missing modifier
      # string is passed as nil.
      # NOTE(review): assumes ORegexp accepts a modifier string such as "i"
      # as its options argument -- confirm against the Oniguruma gem.
      def oniguruma_options
        modifiers.to_s.empty? ? nil : modifiers
      end
    end
  end
end