lib/twitter_cldr/tokenizers/base.rb in twitter_cldr-1.9.1 vs lib/twitter_cldr/tokenizers/base.rb in twitter_cldr-2.0.0
- old
+ new
@@ -5,12 +5,12 @@
module TwitterCldr
module Tokenizers
class Base
attr_reader :resource, :locale
- attr_reader :token_splitter_regex, :token_type_regexes, :paths
- attr_accessor :type, :placeholders
+ attr_reader :token_splitter_regexes, :token_type_regexes, :paths
+ attr_accessor :type, :format, :placeholders
def initialize(options = {})
@locale = TwitterCldr.convert_locale(options[:locale] || TwitterCldr::DEFAULT_LOCALE)
init_resources
init_placeholders
@@ -19,37 +19,49 @@
protected
# Splits a fully expanded format string into typed tokens. Not to be confused with tokenize_pattern, which pulls
# out placeholders: tokenize_format cuts the text with the token splitter regex and then classifies each piece
# with the token type regexes.
#
# In this diff view, lines prefixed with "-" are the 1.9.1 implementation and lines prefixed with "+" are the
# 2.0.0 implementation; unprefixed lines are common to both.
#
# text - the expanded format string to split.
#
# Returns an array of Token and CompositeToken objects, in the order the pieces occur in text.
#
# NOTE(review): in the "+" version, a piece that matches none of the regexes yields a nil token_type (min of an
# empty array) and is still appended as a Token with :type => nil; the "-" version only appended inside the
# `if token =~ ...` branch, so unmatched pieces were dropped. Confirm this change is intended.
def tokenize_format(text)
- final = []
- text.split(token_splitter_regex).each_with_index do |token, index|
+ text.split(token_splitter_regex_for(type)).each_with_index.inject([]) do |ret, (token, index)|
# Skip an empty first piece (presumably produced when the splitter matches at the very start of text).
unless index == 0 && token == ""
- token_type_regexes.each do |token_type|
- if token =~ token_type[:regex]
- if token_type[:type] == :composite
- content = token.match(token_type[:content])[1]
- final << CompositeToken.new(tokenize_format(content))
- else
- final << Token.new(:value => token, :type => token_type[:type])
- end
+ regexes = token_type_regexes_for(type)
- break
- end
# Collect every token type whose :regex matches, then keep the one with the smallest :priority value.
+ token_type = regexes.inject([]) do |match_ret, (token_type, matchers)|
+ match_ret << token_type if token =~ matchers[:regex]
+ match_ret
+ end.min { |a, b| regexes[a][:priority] <=> regexes[b][:priority] }
+
# Composite tokens: capture group 1 of the :content regex is tokenized recursively and wrapped.
+ if token_type == :composite
+ content = token.match(regexes[token_type][:content])[1]
+ ret << CompositeToken.new(tokenize_format(content))
+ else
+ ret << Token.new(:value => token, :type => token_type)
end
end
# The accumulator must be the block's value so inject carries it into the next iteration.
+ ret
end
- final
end
- def tokens_for(path, type)
# Looks up the token type regexes registered under the given type key, falling back to the :else entry when
# the lookup for type yields nil or false.
#
# type - key into the token_type_regexes hash.
+ def token_type_regexes_for(type)
+ token_type_regexes[type] || token_type_regexes[:else]
+ end
+
# Looks up the token splitter regex registered under the given type key, falling back to the :else entry when
# the lookup for type yields nil or false.
#
# type - key into the token_splitter_regexes hash.
+ def token_splitter_regex_for(type)
+ token_splitter_regexes[type] || token_splitter_regexes[:else]
+ end
+
# Resolves the pattern found at path (via traverse and pattern_for) and delegates to tokens_for_pattern.
#
# path                        - passed to traverse and forwarded to tokens_for_pattern, which joins it with "."
#                               when building its cache key.
# additional_cache_key_params - extra values forwarded unchanged to tokens_for_pattern (defaults to an empty
#                               array); there they are splatted into the cache key computation.
+ def tokens_for(path, additional_cache_key_params = [])
+ tokens_for_pattern(pattern_for(traverse(path)), path, additional_cache_key_params)
+ end
+
+ def tokens_for_pattern(pattern, path, additional_cache_key_params = [])
@@token_cache ||= {}
- cache_key = TwitterCldr::Utils.compute_cache_key(@locale, path.join('.'), type)
+ cache_key = TwitterCldr::Utils.compute_cache_key(@locale, path.join('.'), type, format || "nil", *additional_cache_key_params)
unless @@token_cache.include?(cache_key)
result = []
- tokens = expand_pattern(pattern_for(traverse(path)), type)
+ tokens = expand_pattern(pattern)
tokens.each do |token|
if token.is_a?(Token) || token.is_a?(CompositeToken)
result << token
else
@@ -106,13 +118,13 @@
else
current
end
end
- def expand_pattern(format_str, type)
+ def expand_pattern(format_str)
if format_str.is_a?(Symbol)
# symbols mean another path was given
- expand_pattern(pattern_for(traverse(format_str.to_s.split('.').map(&:to_sym))), type)
+ expand_pattern(pattern_for(traverse(format_str.to_s.split('.').map(&:to_sym))))
else
parts = tokenize_pattern(format_str)
final = []
parts.each do |part|