# encoding: UTF-8

# Copyright 2012 Twitter, Inc
# http://www.apache.org/licenses/LICENSE-2.0

module TwitterCldr
  module Normalization

    # Implements normalization of a Unicode string to Normalization Form KC (NFKC).
    # This normalization form includes compatibility decomposition followed by compatibility composition.
    #
    class NFKC < Base

      class << self

        VALID_HANGUL_SEQUENCES = [
          [0, :lparts],
          [1, :vparts],
          [2, :tparts]
        ]

        def normalize_code_points(code_points)
          compose(TwitterCldr::Normalization::NFKD.normalize_code_points(code_points))
        end

        protected

        def compose(code_points)
          final = []
          hangul_code_points = []

          code_points.each_with_index do |code_point, index|
            final << code_point
            hangul_type = TwitterCldr::Shared::CodePoint.hangul_type(code_point)
            next_hangul_type = TwitterCldr::Shared::CodePoint.hangul_type(code_points[index + 1])

            if valid_hangul_sequence?(hangul_code_points.size, hangul_type)
              hangul_code_points << code_point
              unless valid_hangul_sequence?(hangul_code_points.size, next_hangul_type)
                next_hangul_type = nil
              end
            else
              hangul_code_points.clear
            end

            if hangul_code_points.size > 1 && !next_hangul_type
              final.pop(hangul_code_points.size)
              final << compose_hangul(hangul_code_points)
              hangul_code_points.clear
            end
          end

          compose_normal(final)
          final
        end

        def valid_hangul_sequence?(buffer_size, hangul_type)
          VALID_HANGUL_SEQUENCES.include?([buffer_size, hangul_type])
        end

        def compose_hangul(code_points)
          TwitterCldr::Normalization::Hangul.compose(code_points)
        end

        # Implements composition of Unicode code points following the guidelines here:
        # http://www.unicode.org/versions/Unicode6.1.0/ch03.pdf - Section 3.12
        # Combining code points are combined with their base characters.  For example, "ñ"
        # can be decomposed into 006E 0303, one code point for the "n" and the "˜" respectively.
        # Composition reverses this process, turning 006E 0303 into a single 00F1 code point.
        #
        def compose_normal(code_points)
          index = 1

          while index < code_points.size
            code_point = code_points[index]
            combining_class = combining_class_for(code_point)
            starter_index = find_starter_index(index, code_points)

            # is this character blocked from combining with the last starter?
            if starter_index < index - 1
              previous_combining_class = combining_class_for(code_points[index - 1])
              blocked = (previous_combining_class == 0) || (previous_combining_class >= combining_class)
            else
              blocked = false
            end

            unless blocked
              # do a reverse-lookup for the decomposed code points
              composite = TwitterCldr::Shared::CodePoint.for_canonical_decomposition([code_points[starter_index], code_point])

              # check if two code points are canonically equivalent
              if composite && !composite.excluded_from_composition?
                # combine the characters
                code_points[starter_index] = composite.code_point
                code_points.delete_at(index)
                index -= 1
              end
            end

            index += 1
          end
        end

        def find_starter_index(start_pos, code_points)
          start_pos.times do |i|
            return start_pos - i - 1 if combining_class_for(code_points[start_pos - i - 1]) == 0
          end
        end

      end

    end
  end
end