# encoding: UTF-8

# Copyright 2012 Twitter, Inc
# http://www.apache.org/licenses/LICENSE-2.0

require 'spec_helper'

include TwitterCldr::Parsers
include TwitterCldr::Tokenizers

describe UnicodeRegexParser::CharacterClass do
  let(:tokenizer) { UnicodeRegexTokenizer.new }
  let(:parser) { UnicodeRegexParser.new }

  def tokenize(str)
    tokenizer.tokenize(str)
  end

  def parse(tokens, options = {})
    parser.parse(tokens, options)
  end

  # The parser returns a list of elements; the character class is the first one.
  def char_class_from(elements)
    elements.first
  end

  describe "#to_set" do
    it "unions together char classes with no explicit operator" do
      char_class = char_class_from(parse(tokenize("[[a][b]]")))
      expect(char_class.to_set.to_a).to eq([97..98])
    end

    it "unions together other entities within char classes when operator is not explicit" do
      char_class = char_class_from(parse(tokenize("[a-z0-9\\u0123]")))
      expect(char_class.to_set.to_a(true)).to eq([48..57, 97..122, 291])
    end

    it "intersects correctly" do
      char_class = char_class_from(parse(tokenize("[[a-m]&[g-z]]")))
      expect(char_class.to_set.to_a).to eq([103..109])
    end

    it "finds symmetric differences correctly" do
      char_class = char_class_from(parse(tokenize("[[a-m]-[g-z]]")))
      expect(char_class.to_set.to_a).to eq([97..102, 110..122])
    end

    it "computes sets for nested expressions" do
      # (97..109) C (104..106)
      # = (104..106)
      # ((104..106) U (107..122)) subtr ((104..106) C (107..122))
      # = (104..122) subtr ()
      # = (104..122)
      char_class = char_class_from(parse(tokenize("[[[a-m]&[h-j]]-[k-z]]")))
      expect(char_class.to_set.to_a).to eq([104..122])
    end

    it "pulls in ranges for unicode character sets" do
      char_class = char_class_from(parse(tokenize("[\\p{Zs}]")))
      expect(char_class.to_set.to_a(true)).to eq([
        32, 160, 5760, 6158, 8192..8202, 8239, 8287, 12288
      ])
    end

    it "computes unions between unicode character sets" do
      char_class = char_class_from(parse(tokenize("[[\\p{Zs}][\\p{Cc}]]")))
      expect(char_class.to_set.to_a(true)).to eq([
        0..1, 8..32, 127..160, 5760, 6158, 8192..8202, 8239, 8287, 12288
      ])
    end

    it "computes intersections between unicode character sets" do
      char_class = char_class_from(parse(tokenize("[[\\p{Zs}]&[\\u2000-\\u202B]]")))
      expect(char_class.to_set.to_a(true)).to eq([8192..8202])
    end

    it "supports negating character sets" do
      char_class = char_class_from(parse(tokenize("[^\\u2000-\\u202B]")))
      expect(char_class.to_set.to_a(true)).to eq([
        0..1, 8..8191, 8236..55295, 57344..1114111
      ])
    end

    it "supports literal and escaped characters" do
      char_class = char_class_from(parse(tokenize("[abc\\edf\\g]")))
      expect(char_class.to_set.to_a(true)).to eq([97..103])
    end

    it "supports special switch characters" do
      char_class = char_class_from(parse(tokenize("[\\w]")))  # a-z, A-Z, 0-9, _
      expect(char_class.to_set.to_a(true)).to eq([48..57, 65..90, 95, 97..122])
    end

    it "supports negated switch characters" do
      char_class = char_class_from(parse(tokenize("[\\D]")))  # i.e. NOT \d
      expect(char_class.to_set.to_a(true)).to eq([
        0..1, 8..47, 58..55295, 57344..1114111
      ])
    end
  end

  describe "#to_regexp_str" do
    it "wraps ranges in square brackets" do
      char_class = char_class_from(parse(tokenize("[a-z]")))
      expect(char_class.to_regexp_str).to eq("(?:[\\141-\\172])")
    end

    it "octal-encodes and wraps sequential characters to isolate bytes" do
      char_class = char_class_from(parse(tokenize("[{foo}]")))
      expect(char_class.to_regexp_str).to eq("(?:(?:\\146)(?:\\157)(?:\\157))")
    end

    it "combines multiple components with 'or' pipe characters" do
      char_class = char_class_from(parse(tokenize("[{foo}abc]")))
      expect(char_class.to_regexp_str).to eq("(?:(?:\\146)(?:\\157)(?:\\157)|[\\141-\\143])")
    end
  end
end