require 'spec_helper'

require 'tf-idf-similarity/extras/document'
require 'tf-idf-similarity/extras/tf_idf_model'

module TfIdfSimilarity
  describe TfIdfModel do
    # Builds a Document for use in the examples.
    #
    # @param text [String] the text handed to Document.new
    # @param opts [Hash] options forwarded unchanged to Document.new
    # @return [Document] the newly built document
    def build_document(text, opts = {})
      Document.new(text, opts)
    end

    # Builds a TfIdfModel over the given documents, selecting the matrix
    # library through the MATRIX_LIBRARY constant (not defined in this file).
    #
    # @param documents [Array<Document>] the documents that make up the corpus
    # @return [TfIdfModel] the model built over the documents
    def build_model(documents)
      TfIdfModel.new(documents, :library => MATRIX_LIBRARY)
    end

    # @see https://github.com/josephwilk/rsemantic/blob/master/spec/semantic/transform/tf_idf_transform_spec.rb
    # No relevant tests to reproduce.

    # @see https://github.com/mkdynamic/vss/blob/master/test/test.rb
    context 'comparing to vss gem' do
      # One document per sample sentence.
      let(:documents) do
        sentences = [
          "I'm not even going to mention any TV series.",
          "The Wire is the best thing ever. Fact.",
          "Some would argue that Lost got a bit too wierd after season 2.",
          "Lost is surely not in the same league as The Wire.",
          "You cannot compare the The Wire and Lost.",
        ]

        sentences.map { |sentence| build_document(sentence) }
      end

      # Model built over the sample documents.
      let(:model) { build_model(documents) }

      # Nothing is compared yet: the only entry in this context is skipped.
      skip "Add #search"
    end

    # @see https://github.com/bbcrd/Similarity/blob/master/test/test_corpus.rb
    # @see https://github.com/bbcrd/Similarity/blob/master/test/test_document.rb
    # @see https://github.com/bbcrd/Similarity/blob/master/test/test_term_document_matrix.rb
    context 'comparing to similarity gem' do
      # Document with three distinct terms: 'cow' three times, 'horse' twice
      # and 'elephant' once. Built through build_document, like every other
      # fixture in this file, so document construction stays in one place.
      let :document do
        build_document('cow cow cow horse horse elephant')
      end

      # Builds a model straight from raw strings: each text becomes one
      # document, and the documents are handed to build_model in the same order.
      #
      # @param texts [Array<String>] one string per document
      # @return the model returned by build_model
      def build_model_from_text(*texts)
        documents = texts.map { |text| build_document(text) }
        build_model(documents)
      end

      # Two documents of three distinct words each; only 'horse' is shared.
      let(:model_a) { build_model_from_text("cow horse sheep", "horse bird dog") }

      # 'cow' and 'horse' each occur in one document; 'bird' occurs in both.
      let(:model_b) { build_model_from_text("cow cow cow bird", "horse horse horse bird") }

      # Two documents with no words in common.
      let(:model_c) { build_model_from_text("cow cow cow", "horse horse horse") }

      # Term frequency the way the similarity gem computes it: the term's count
      # in `document` divided by the document's size, as a Float.
      # @see https://github.com/bbcrd/Similarity/blob/master/lib/similarity/document.rb#L42
      def tf(term)
        occurrences = document.term_count(term)
        occurrences / document.size.to_f
      end

      # Does not add one to the inverse document frequency.
      # Delegates to model.plain_idf, passing 0 and 1 as the extra arguments.
      # NOTE(review): 0 and 1 look like numerator/denominator offsets — confirm
      # against TfIdfModel#plain_idf.
      #
      # @param model [TfIdfModel] the model to query
      # @param term [String] the term to look up
      # @see https://github.com/bbcrd/Similarity/blob/master/lib/similarity/corpus.rb#L44
      def idf(model, term)
        model.plain_idf(term, 0, 1)
      end

      it 'should return the terms' do
        # Spacing, letter case and punctuation must not change the terms.
        expected_terms = ["brown", "fox", "quick", "the"]
        samples = [
          "the quick brown fox",
          "the quick     brown   fox",
          "The Quick Brown Fox",
          'The, Quick! Brown. "Fox"',
        ]

        samples.each do |sample|
          build_document(sample).terms.sort.should == expected_terms
        end
      end

      # model_a is built from two texts, so it should hold two documents.
      it 'should return the number of documents' do
        model_a.documents.size.should == 2
      end

      it 'should return the number of terms' do
        # 'cow', 'horse' and 'elephant'
        document_terms = document.terms
        document_terms.size.should == 3

        # 'cow', 'horse', 'sheep', 'bird' and 'dog'
        model_terms = model_a.terms
        model_terms.size.should == 5
      end

      it 'should return the term frequency' do
        cow_frequency = tf('cow') # 3 of 6 tokens
        cow_frequency.should == 0.5

        horse_frequency = tf('horse') # 2 of 6 tokens
        horse_frequency.should be_within(0.001).of(0.333)

        sheep_frequency = tf('sheep') # not in the document
        sheep_frequency.should == 0
      end

      # Placeholder: the body only calls skip, so nothing is verified yet.
      it 'should return the similarity matrix' do
        skip "Calculate the tf*idf matrix like the similarity gem does"
      end

      it 'should return the number of documents in which a term appears' do
        # Repeats inside one document count once: 'cow' and 'horse' each occur
        # three times, but in a single document.
        expected_counts = [['cow', 1], ['horse', 1], ['bird', 2]]

        expected_counts.each do |term, expected|
          model_b.document_count(term).should == expected
        end
      end

      it 'should return the inverse document frequency' do
        # 'cow' occurs in one of model_c's two documents; 'bird' occurs in none.
        expected_idfs = [['cow', 0.0], ['bird', 0.693]]

        expected_idfs.each do |term, expected|
          idf(model_c, term).should be_within(0.001).of(expected)
        end
      end

      # Placeholder: the body only calls skip, so nothing is verified yet.
      it 'should return the document vector' do
        skip "Calculate the tf*idf matrix like the similarity gem does"
      end
    end

    # @see https://github.com/mchung/tf-idf/blob/master/spec/tf-idf_spec.rb
    context 'comparing to tf-idf gem' do
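      # The tf-idf gem normalizes term frequency to the number of unique tokens
      # (terms) in the document. No term frequency helper is defined in this
      # context, so the examples below do not exercise that normalization.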
      # @see https://github.com/mchung/tf-idf/blob/master/lib/tf-idf.rb#L172

      # Fifty documents in which each word appears only in the first few:
      # 'the' in 23, 'a' in 17, 'said' in 5, 'phone' in 2, 'girl' and 'moon' in
      # 1 each. Documents 24 through 50 are built from an empty string.
      let(:corpus_a) do
        last_document_for = [
          ['the', 23],
          ['a', 17],
          ['said', 5],
          ['phone', 2],
          ['girl', 1],
          ['moon', 1],
        ]

        (1..50).map do |n|
          words = last_document_for.select { |_, last| n <= last }.map { |word, _| word }
          build_document(words.join(' '))
        end
      end

      # Fifty documents in which each word appears only in the first few:
      # 'the' in 23, 'a' in 17, 'said' in 5, 'phone' in 2 and 'girl' in 1.
      # The word 'moon' never appears. Documents 24 through 50 are built from
      # an empty string.
      let(:corpus_b) do
        last_document_for = [
          ['the', 23],
          ['a', 17],
          ['said', 5],
          ['phone', 2],
          ['girl', 1],
        ]

        (1..50).map do |n|
          words = last_document_for.select { |_, last| n <= last }.map { |word, _| word }
          build_document(words.join(' '))
        end
      end

      # Model over corpus_a.
      let(:model_a) { build_model(corpus_a) }

      # Model over corpus_b.
      let(:model_b) { build_model(corpus_b) }

      # corpus_a builds one document for each n from 1 to 50.
      it 'should return the number of documents' do
        model_a.documents.size.should == 50
      end

      # corpus_a uses six distinct words: 'the', 'a', 'said', 'phone', 'girl'
      # and 'moon'.
      it 'should return the number of terms' do
        model_a.terms.size.should == 6
      end

      # Adds one to the numerator when calculating inverse document frequency.
      # Sets a default inverse document frequency for non-occurring terms.
      # NOTE(review): the expected values below are consistent with
      # log((documents + 1) / (document_count + 1)) — confirm against
      # TfIdfModel#plain_idf.
      # @note The tf-idf gem has a #doc_keywords method for non-corpus documents.
      # @see https://github.com/mchung/tf-idf/blob/master/lib/tf-idf.rb#L153
      it 'should return the inverse document frequency' do
        # should query IDF for nonexistent terms
        # `default` is whatever the model returns for a term it has never seen.
        # Uppercase 'THE' is expected to be treated as unseen, unlike 'the'.
        default = model_a.plain_idf('xxx', 1, 1)
        model_a.plain_idf('nonexistent', 1, 1).should == default
        model_a.plain_idf('THE', 1, 1).should == default

        # should query IDF for existent terms
        # 'a' is in 17 documents and 'the' in 23; 'girl' and 'moon' are in 1 each.
        model_a.plain_idf('a', 1, 1).should > model_a.plain_idf('the', 1, 1)
        model_a.plain_idf('girl', 1, 1).should == model_a.plain_idf('moon', 1, 1)

        # should add input documents to an existing corpus
        # Baseline on the 50-document corpus, before a document is added.
        model_a.plain_idf('water', 1, 1).should == default
        model_a.plain_idf('moon', 1, 1).should be_within(0.001).of(3.238) # 3.23867845216438
        model_a.plain_idf('said', 1, 1).should be_within(0.001).of(2.140) # 2.14006616349627

        # The model takes its documents at construction, so "adding" a document
        # here means building a new model over corpus_a plus one more document.
        model = build_model(corpus_a + [build_document('water moon')])

        model.plain_idf('water', 1, 1).should be_within(0.001).of(3.258) # 3.25809653802148
        model.plain_idf('moon', 1, 1).should be_within(0.001).of(2.852) # 2.85263142991332
        model.plain_idf('said', 1, 1).should be_within(0.001).of(2.159) # 2.15948424935337

        # should add input documents to an empty corpus
        # NOTE(review): the empty-corpus case is not run with the :gsl library,
        # presumably because it cannot build a model without documents — confirm.
        unless MATRIX_LIBRARY == :gsl
          model_c = build_model([])

          default = model_c.plain_idf('xxx', 1, 1)
          model_c.plain_idf('moon', 1, 1).should == default
          model_c.plain_idf('water', 1, 1).should == default
          model_c.plain_idf('said', 1, 1).should == default
        end

        # Two documents: 'moon' is in both, 'said' in one, 'water' in neither.
        model_d = build_model([
          build_document('moon'),
          build_document('moon said hello'),
        ])

        default = model_d.plain_idf('xxx', 1, 1)
        model_d.plain_idf('water', 1, 1).should == default
        model_d.plain_idf('said', 1, 1).should be_within(0.001).of(0.405) # 0.405465108108164
        model_d.plain_idf('moon', 1, 1).should == 0 # 0

        # should observe stopwords list
        # corpus_b contains no 'moon', so the model sees it as an unknown term.
        default = model_b.plain_idf('xxx', 1, 1)
        model_b.plain_idf('water', 1, 1).should == default
        model_b.plain_idf('moon', 1, 1).should == default # the tf-idf gem returns 0 for stopwords
        model_b.plain_idf('said', 1, 1).should be_within(0.001).of(2.140) # 2.14006616349627

        # The :tokens lists below leave out 'moon' although the text contains it.
        # NOTE(review): presumably :tokens replaces tokenizing the text, which
        # is how the stopword is simulated — confirm against Document.
        model_e = build_model(corpus_b + [
          build_document('moon', :tokens => %w()),
          build_document('moon and water', :tokens => %w(and water)),
        ])

        default = model_e.plain_idf('xxx', 1, 1)
        model_e.plain_idf('water', 1, 1).should be_within(0.001).of(3.277) # 3.27714473299218
        model_e.plain_idf('moon', 1, 1).should == default # the tf-idf gem returns 0 for stopwords
        model_e.plain_idf('said', 1, 1).should be_within(0.001).of(2.178) # 2.17853244432407
      end
    end

    # @see https://github.com/reddavis/TF-IDF/blob/master/spec/tf_idf_spec.rb
    context 'comparing to tf_idf gem' do
      # Eight 'a' followed by two 'b'.
      let(:one) { build_document('a a a a a a a a b b') }

      # Two 'a' and no 'b'.
      let(:two) { build_document('a a') }

      # Model over both documents.
      let(:model) { build_model([one, two]) }

      # Term frequency of 'b' the way the tf_idf gem computes it: the count of
      # 'b' in document `one` divided by that document's size, as a Float.
      # @see https://github.com/reddavis/TF-IDF/blob/master/lib/tf_idf.rb#L76
      def tf
        occurrences = one.term_count('b')
        occurrences / one.size.to_f
      end

      # Plain inverse document frequency of 'b', expressed in base 10.
      # Dividing by Math.log(10) changes the base of the logarithm to 10
      # (assuming plain_idf returns a natural logarithm — confirm).
      # @see https://github.com/reddavis/TF-IDF/blob/master/lib/tf_idf.rb#L50
      def idf
        plain = model.plain_idf('b')
        plain / Math.log(10)
      end

      it 'should return the term frequency' do
        # The tf_idf gem's value: 'b' is 2 of the 10 tokens in `one`.
        gem_style_tf = tf
        gem_style_tf.should == 0.2

        # This library's own term frequency for the same document and term.
        library_tf = model.tf(one, 'b')
        library_tf.should be_within(0.001).of(1.414)
      end

      it 'should return the inverse document frequency' do
        # The tf_idf gem's value, in base 10.
        base_ten_idf = idf
        base_ten_idf.should be_within(0.001).of(0.301) # 0.30102999

        # This library's own inverse document frequency for the same term.
        library_idf = model.idf('b')
        library_idf.should == 1
      end

      it 'should return the tf*idf' do
        # Product of the tf_idf gem's term frequency and base-10 IDF.
        gem_style_tfidf = tf * idf
        gem_style_tfidf.should be_within(0.001).of(0.060) # 0.0602

        # This library's own tf*idf for the same document and term.
        library_tfidf = model.tfidf(one, 'b')
        library_tfidf.should be_within(0.001).of(1.414)
      end
    end
  end
end