# encoding: utf-8
#
require 'spec_helper'

describe Picky::Tokenizer do

  describe 'examples' do
    it 'splits text before normalizing words' do
      tokenizer = described_class.new(normalizes_words: [[/\&/, 'and']])
      # Is this really correct? Shouldn't we split after normalizing?
      #
      # Yes – we split using more information.
      #
      tokenizer.tokenize('M & M').should == [['m', 'and', 'm'], ['m', 'and', 'm']]
    end

    it 'removes stopwords before normalizing words' do
      tokenizer = described_class.new(stopwords: /\b(and)\b/, normalizes_words: [[/\&/, 'and']])
      # Is this really correct? Shouldn't we remove stopwords after normalizing?
      #
      # Yes – we remove stopwords using more information.
      #
      tokenizer.tokenize('M & M').should == [['m', 'and', 'm'], ['m', 'and', 'm']]
    end

    it 'removes all stopwords if they do not occur alone' do
      tokenizer = described_class.new(stopwords: /\b(and|then)\b/)
      tokenizer.tokenize('and then').should == [[], []]
    end

    it 'does not remove a stopword if it occurs alone' do
      tokenizer = described_class.new(stopwords: /\b(and|then)\b/)
      tokenizer.tokenize('and').should == [['and'], ['and']]
    end

    it 'does not remove a stopword if it occurs alone (even with qualifier)' do
      tokenizer = described_class.new(stopwords: /\b(and|then)\b/)
      tokenizer.tokenize('title:and').should == [['title:and'], ['title:and']]
    end

    it 'removes stopwords, then only lets through max_words words' do
      tokenizer = described_class.new(stopwords: /\b(and|then|to|the)\b/, max_words: 2)
      tokenizer.tokenize('and then they went to the house').should == [['they', 'went'], ['they', 'went']]
    end

    it 'can take freaky splits_text_on options' do
      tokenizer = described_class.new(splits_text_on: /([A-Z]?[a-z]+)/, case_sensitive: false) # Explicitly set; false is the default.
      tokenizer.tokenize('TOTALCamelCaseExample').should == [
        ["total", "camel", "case", "example"],
        ["total", "camel", "case", "example"]
      ]
    end

    it 'substitutes and removes in the right order' do
      tokenizer = described_class.new(
        substitutes_characters_with: Picky::CharacterSubstituters::WestEuropean.new,
        removes_characters: /e/
      )
      # Ä -> Ae -> A
      #
      tokenizer.tokenize('Ä ä').should == [['a', 'a'], ['a', 'a']]
    end

    it 'removes characters, then only lets through an ok sized token' do
      tokenizer = described_class.new(rejects_token_if: ->(token){ token.size >= 5 }, removes_characters: /e/)
      tokenizer.tokenize('hullo').should == [[], []]
      tokenizer.tokenize('hello').should == [['hllo'], ['hllo']]
    end

    it 'is case sensitive' do
      tokenizer = described_class.new(case_sensitive: true)
      tokenizer.tokenize('Kaspar codes').should == [['Kaspar', 'codes'], ['Kaspar', 'codes']]
    end

    it 'is case sensitive, also for removing characters' do
      tokenizer = described_class.new(case_sensitive: true, removes_characters: /K/)
      tokenizer.tokenize('Kaspar codes').should == [['aspar', 'codes'], ['aspar', 'codes']]
    end
  end
end
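
# A sketch, not part of the original spec: it combines
# substitutes_characters_with, removes_characters, and rejects_token_if
# to illustrate the stage order the examples above establish pairwise
# (substitute before remove, remove before reject), assuming the three
# options compose in one linear pipeline. The expected result is derived
# from those examples, not taken from Picky's own specs.
#
describe Picky::Tokenizer do
  describe 'pipeline order sketch' do
    it 'substitutes, then removes characters, then rejects tokens' do
      tokenizer = described_class.new(
        substitutes_characters_with: Picky::CharacterSubstituters::WestEuropean.new,
        removes_characters: /e/,
        rejects_token_if: ->(token){ token.size >= 5 }
      )
      # Äxxxx -> Aexxxx -> axxxx (5 chars, rejected);
      # Äxxx  -> Aexxx  -> axxx  (4 chars, kept).
      tokenizer.tokenize('Äxxxx Äxxx').should == [['axxx'], ['axxx']]
    end
  end
end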