require File.join(File.expand_path(File.dirname(__FILE__)), '../..', 'test_helper.rb')
require 'rbbt/ner/token_trieNER'
require 'rbbt/util/tmpfile'

class TestTokenTrieNER < Test::Unit::TestCase
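
  # TokenTrieNER.tokenize splits a string into word and punctuation tokens
  # that remember where they came from: each Token carries an offset and a
  # range into the original text, so the surface string can be recovered with
  # text[token.range], as the assertions below show.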

  def test_tokenize
    assert_equal ['a', 'b', ',', 'c'], TokenTrieNER.tokenize('a b, c')

    assert_equal 10, TokenTrieNER.tokenize('123456789 12345').last.offset
    assert_equal 0, TokenTrieNER.tokenize('123456789 12345').first.offset

    text = '123456789 12345'
    assert_equal '12345', text[TokenTrieNER.tokenize('123456789 12345').last.range]
  end
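
  # The index is a trie of nested hashes keyed by token, with :END entries
  # holding the Code objects of every name that terminates at that node.
  # index_for_tokens builds the single-path trie for one name, and merge
  # folds it into an existing index, e.g. for the tokens a, b, c:
  #
  #   {'a' => {'b' => {'c' => {:END => [Code('CODE')]}}}}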

  def test_merge
    tokens = %w(a b c)
    tokens.extend TokenTrieNER::EnumeratedArray

    # The trie that index_for_tokens should produce for these tokens
    expected = {'a' => {'b' => {'c' => {:END => [TokenTrieNER::Code.new('CODE')]}}}}

    merged = TokenTrieNER.merge({}, TokenTrieNER.index_for_tokens(tokens, 'CODE'))
    assert_equal expected.keys, merged.keys
    assert_equal 'CODE', merged['a']['b']['c'][:END].first.code
  end
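
  # process folds a whole lexicon (a :flat TSV, one code per line followed by
  # its names) into the trie. Multi-word names such as 'bb b' and '3 3' are
  # tokenized first, so only the first token of each name shows up among the
  # top-level keys.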

  def test_process
    lexicon = <<-EOF
C1;aa;AA;bb b
C2;11;22;3 3;bb
    EOF

    TmpFile.with_file(lexicon) do |file|

      index = TokenTrieNER.process({}, TSV.open(file, :type => :flat, :sep => ';'))

      assert_equal ['AA', 'aa', 'bb', '11', '22', '3'].sort, index.keys.sort
      assert_equal [:END], index['aa'].keys
      assert index['bb'].keys.include? 'b'
      assert index['bb'].keys.include? :END
    end
  end
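
  # find walks the trie over an EnumeratedArray of tokens and returns a pair:
  # first the Code objects of the matched name, last the tokens it consumed.
  # The boolean argument selects longest-match mode, which decides the winner
  # when one name ('bb') is a prefix of a longer one ('bb b').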

  def test_find
    lexicon = <<-EOF
C1;aa;AA;bb b
C2;11;22;3 3;bb
    EOF

    TmpFile.with_file(lexicon) do |file|
      index = TokenTrieNER.process({}, TSV.open(file, :type => :flat, :sep => ';'))

      # Tokenize fresh for every lookup so each call to find starts from the
      # first token of the sentence
      find = lambda do |text, longest_match|
        TokenTrieNER.find(index, TokenTrieNER.tokenize(text).extend(TokenTrieNER::EnumeratedArray), longest_match)
      end

      # The single-token name 'aa' is found with or without longest match
      result = find.call('aa asdf', false)
      assert result.first.collect{|c| c.code}.include? 'C1'
      assert_equal %w(aa), result.last

      assert find.call('aa asdf', true).first.collect{|c| c.code}.include? 'C1'

      # With longest match, 'bb b' resolves to the two-token C1 name
      result = find.call('bb b asdf', true)
      assert result.first.collect{|c| c.code}.include? 'C1'
      assert_equal %w(bb b), result.last

      # Without longest match, the same text stops at the one-token C2 name 'bb'
      result = find.call('bb b asdf', false)
      assert result.first.collect{|c| c.code}.include? 'C2'
      assert_equal %w(bb), result.last

      assert find.call('bb asdf', false).first.collect{|c| c.code}.include? 'C2'
    end
  end
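
  # match is the high-level entry point: given a plain string, it tokenizes
  # the text itself and returns the mentions it found, each one reporting the
  # codes of the lexicon entries it matched.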

  def test_match
    lexicon = <<-EOF
C1;aa;AA;bb b
C2;11;22;3 3;bb
    EOF

    TmpFile.with_file(lexicon) do |file|
      index = TokenTrieNER.new("test", TSV.open(file, :type => :flat, :sep => ';'))

      assert index.match(' asdfa dsf asdf aa asdfasdf ').select{|m| m.code.include? 'C1'}.any?
    end
  end
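
  # slack takes a Proc that decides which tokens may be skipped inside a
  # match. Here any run of 'c' characters is slack, so 'bb cc cc b' can be
  # matched with some, all, or none of its 'cc' tokens present in the text.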

  def test_slack
    lexicon = <<-EOF
C1;aa;AA;bb cc cc b
C2;11;22;3 3;bb;bbbb
    EOF

    TmpFile.with_file(lexicon) do |file|
      index = TokenTrieNER.new({})
      # Tokens made up only of 'c' characters count as slack: they may be
      # skipped inside a match without breaking it
      index.slack = Proc.new{|t| t =~ /^c*$/}

      index.merge TSV.open(file, :type => :flat, :sep => ';')

      assert index.match(' aaaaa 3 cc 3').select{|m| m.code.include? 'C2'}.any?
      assert index.match(' bb cc b').select{|m| m.code.include? 'C1'}.any?
      assert index.match(' bb b').select{|m| m.code.include? 'C1'}.any?
      # Lookup tolerates case: 'BBBB' hits the lexicon entry 'bbbb'
      assert index.match(' BBBB b').select{|m| m.code.include? 'C2'}.any?
    end
  end
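
  # match also accepts a pre-tokenized input (here produced by
  # Token.tokenize) instead of a raw string, so callers can plug in their own
  # tokenization.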

  def test_own_tokens
    lexicon = <<-EOF
C1;aa;AA;bb cc cc b
C2;11;22;3 3;bb
    EOF

    TmpFile.with_file(lexicon) do |file|
      index = TokenTrieNER.new({})
      index.slack = Proc.new{|t| t =~ /^c*$/}

      index.merge TSV.open(file, :type => :flat, :sep => ';')

      assert index.match(Token.tokenize('3 cc 3')).select{|m| m.code.include? 'C2'}.any?
    end
  end
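
  # Besides literal token keys, a trie node can hold a :PROCS hash that maps
  # a Proc to the next level; the Proc is called on the incoming token, which
  # allows arbitrary predicates (here, being exactly 'c') at any position of
  # a name.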

  def test_proc_index
    index = TokenTrieNER.new({})
    index.merge({ "aa" => {:PROCS => {Proc.new{|c| c == 'c'} => {:END => [TokenTrieNER::Code.new(:entity, :C1)]}}}})

    assert index.match(Token.tokenize('3 cc 3 aa c ddd')).select{|m| m.code.include? :entity}.any?
  end
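
  # With :persistence => true the built index is persisted rather than kept
  # only in memory, so it can be reused across instantiations; matching
  # behaves just as in test_match.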

  def test_persistence
    lexicon = <<-EOF
C1;aa;AA;bb b
C2;11;22;3 3;bb
    EOF

    TmpFile.with_file(lexicon) do |file|
      index = TokenTrieNER.new("test", TSV.open(file, :type => :flat, :sep => ';'), :persistence => true)

      assert index.match(' asdfa dsf asdf aa asdfasdf ').select{|m| m.code.include? 'C1'}.any?
    end
  end

end