# encoding: UTF-8

# Copyright 2012 Twitter, Inc
# http://www.apache.org/licenses/LICENSE-2.0

module TwitterCldr
  module Segmentation
    # Builds the segmentation rules consumed by RuleSet: the rules compiled from
    # the shared segments_root resource, the locale-specific exception rule, and
    # the implicit rules. Everything is exposed as class-level methods, and parsed
    # results are memoized in class-level instance variables.
    class RuleSetBuilder

      class << self
        # Compiles (or fetches from cache) the rules for boundary_type and wraps
        # them in a new RuleSet.
        #
        # locale        - passed through to RuleSet.new untouched
        # boundary_type - boundary type name, e.g. "word" or "grapheme_cluster"
        # options       - passed through to RuleSet.new untouched
        def load(locale, boundary_type, options = {})
          rules = compile_rules_for(boundary_type)
          RuleSet.new(locale, rules, boundary_type, options)
        end

        # See the comment above exceptions_for. Basically, we only support exceptions
        # for the "sentence" boundary type since the ULI JSON data doesn't distinguish
        # between boundary types.
        #
        # Returns a single "no break" rule (id 0) whose left-hand side is an
        # alternation of every regexp-escaped exception string for the locale.
        # The rule is memoized per (locale, boundary_type) cache key.
        #
        # NOTE(review): when the locale has no exceptions the alternation is empty
        # and the rule text becomes "(?:) ×" - confirm callers/parser cope with that.
        def exception_rule_for(locale, boundary_type)
          cache_key = TwitterCldr::Utils.compute_cache_key(locale, boundary_type)
          exceptions_cache[cache_key] ||= begin
            exceptions = exceptions_for(locale, boundary_type)
            regex_contents = exceptions.map { |exc| Regexp.escape(exc) }.join("|")
            parse("(?:#{regex_contents}) ×", nil).tap do |rule|
              rule.id = 0
            end
          end
        end

        # The implicit final rule is always "Any ÷ Any"
        def implicit_final_rule
          @implicit_final_rule ||=
            parse('. ÷ .', nil).tap do |rule|
              rule.id = 9999
            end
        end

        # The implicit initial rules are always "start-of-text ÷"
        # and "÷ end-of-text". We don't need the start-of-text one.
        def implicit_end_of_text_rule
          @implicit_end_of_text_rule ||=
            parse('.\z ÷', nil).tap do |rule|
              rule.id = 9998
            end
        end

        private

        # The boundary_type param is not currently used since the ULI JSON resource that
        # exceptions are generated from does not distinguish between boundary types. The
        # XML version does, however, so the JSON will hopefully catch up at some point and
        # we can make use of this second parameter. For the time being, exception_rule_for
        # (which calls this function) assumes a "sentence" boundary type.
        #
        # Returns the locale's exception list, or an empty array when the resource
        # cannot be loaded. The result is memoized per locale.
        def exceptions_for(locale, boundary_type)
          exceptions_resource_cache[locale] ||= begin
            TwitterCldr.get_resource('uli', 'segments', locale)[locale][:exceptions]
          rescue Resources::ResourceLoadError
            []
          end
        end

        # Converts a snake_case boundary type into the name used as a key in the
        # segments resource, e.g. "grapheme_cluster" => "GraphemeClusterBreak".
        # Accepts a String or a Symbol.
        def boundary_name_for(str)
          # \A (not ^) so only the very first character of the name is treated
          # as a word start, regardless of embedded newlines.
          camelized = str.to_s.gsub(/(?:\A|_)([A-Za-z])/) do
            Regexp.last_match(1).upcase
          end

          camelized + 'Break'
        end

        # tokenizes and parses rules from segment_root
        # (memoized per boundary_type)
        def compile_rules_for(boundary_type)
          rule_cache[boundary_type] ||= begin
            boundary_name = boundary_name_for(boundary_type)
            boundary_data = resource_for(boundary_name)
            symbol_table = symbol_table_for(boundary_data)
            rules_for(boundary_data, symbol_table)
          end
        end

        # Builds a symbol table from boundary_data[:variables]. Variables are
        # processed in order, and each value is tokenized and has its variable
        # references expanded against the table built so far.
        def symbol_table_for(boundary_data)
          table = TwitterCldr::Parsers::SymbolTable.new
          boundary_data[:variables].each do |variable|
            id = variable[:id].to_s
            tokens = segmentation_parser.tokenize_regex(variable[:value])
            # note: variables can be redefined (add replaces if key already exists)
            table.add(id, resolve_symbols(tokens, table))
          end
          table
        end

        # Returns a new token list in which every :variable token is replaced by
        # the tokens stored for it in symbol_table; all other tokens are kept
        # as-is. Neither the input list nor the table's stored lists are mutated.
        def resolve_symbols(tokens, symbol_table)
          tokens.flat_map do |token|
            if token.type == :variable
              symbol_table.fetch(token.value)
            else
              # wrap so flat_map never flattens the token itself
              [token]
            end
          end
        end

        # Parses every entry of boundary_data[:rules], recording the original
        # rule text and id on each parsed rule.
        def rules_for(boundary_data, symbol_table)
          boundary_data[:rules].map do |rule|
            parse(rule[:value], symbol_table).tap do |parsed|
              parsed.string = rule[:value]
              parsed.id = rule[:id]
            end
          end
        end

        # Parses rule text with the shared segmentation parser. symbol_table may
        # be nil, as it is for the implicit and exception rules above.
        def parse(text, symbol_table)
          segmentation_parser.parse(
            text, { symbol_table: symbol_table }
          )
        end

        # Looks up the data for one boundary name (e.g. "WordBreak") in the
        # root segments resource.
        def resource_for(boundary_name)
          root_resource[:segments][boundary_name.to_sym]
        end

        # Shared, lazily-created parser instance.
        def segmentation_parser
          @segmentation_parser ||= Segmentation::Parser.new
        end

        # Lazily-loaded shared segments_root resource.
        def root_resource
          @root_resource ||= TwitterCldr.get_resource(
            'shared', 'segments', 'segments_root'
          )
        end

        # boundary_type => compiled rules
        def rule_cache
          @rule_cache ||= {}
        end

        # locale => exception list
        def exceptions_resource_cache
          @exceptions_resource_cache ||= {}
        end

        # cache key (locale + boundary type) => exception rule
        def exceptions_cache
          @exceptions_cache ||= {}
        end
      end

    end
  end
end