Sha256: 2dd811935e018592ac2b226b78219ad11eb5973789d72e4d0428ee18b6fa1ad2

Contents?: true

Size: 1.93 KB

Versions: 3

Compression:

Stored size: 1.93 KB

Contents

# A simple document-term matrix.
#
# The matrix is built with one row per term and one column per document; each
# cell holds whatever `Document#term_count` returns for that term/document pair.
class TfIdfSimilarity::TermCountModel
  include TfIdfSimilarity::MatrixMethods

  # The documents in the corpus.
  attr_reader :documents
  # The set of terms in the corpus.
  attr_reader :terms
  # The average number of tokens in a document.
  attr_reader :average_document_size

  # Collects the distinct terms of all documents, builds the term-by-document
  # count matrix and computes the average document size.
  #
  # @param [Array<TfIdfSimilarity::Document>] documents documents
  # @param [Hash] opts optional arguments
  # @option opts [Symbol] :library :gsl, :narray, :nmatrix or :matrix (default)
  def initialize(documents, opts = {})
    @documents = documents
    # `uniq` keeps the first occurrence of every term, so the row order is the
    # same as de-duplicating through a Set, without depending on `set`.
    @terms = documents.map(&:terms).flatten.uniq
    # Term => row index, so per-term lookups do not scan the whole term list.
    # Lookup uses Hash equality (`eql?`), the same equality used to
    # de-duplicate the terms above.
    @term_index = {}
    @terms.each_with_index { |term, position| @term_index[term] = position }
    @library = (opts[:library] || :matrix).to_sym

    # One row per term, one column per document.
    counts = @terms.map do |term|
      documents.map { |document| document.term_count(term) }
    end

    @matrix = initialize_matrix(counts)

    # `sum` and `column_size` are not defined in this class; presumably they
    # come from MatrixMethods and operate on @matrix -- confirm there.
    @average_document_size = documents.empty? ? 0 : sum / column_size.to_f
  end

  # @param [String] term a term
  # @return [Integer] the number of documents the term appears in
  def document_count(term)
    index = @term_index[term]
    return 0 unless index # unknown terms appear in no document

    case @library
    when :gsl, :narray
      # NOTE(review): `where` is expected to return the positions of the
      # non-zero cells for these libraries -- confirm against their docs.
      row(index).where.size
    when :nmatrix
      row(index).each.count(&:nonzero?)
    else
      vector = row(index)
      # Fall back to a plain Array when the row object cannot `count`.
      vector = vector.to_a unless vector.respond_to?(:count)
      vector.count(&:nonzero?)
    end
  end

  # @param [String] term a term
  # @return [Integer] the number of times the term appears in the corpus
  def term_count(term)
    index = @term_index[term]
    return 0 unless index # unknown terms never occur in the corpus

    case @library
    when :gsl, :narray
      row(index).sum
    when :nmatrix
      row(index).each.reduce(0, :+) # NMatrix's `sum` method is slower
    else
      vector = row(index)
      # Fall back to a plain Array when the row object cannot `reduce`.
      vector = vector.to_a unless vector.respond_to?(:reduce)
      vector.reduce(0, :+)
    end
  end
end

Version data entries

3 entries across 3 versions & 1 rubygems

Version Path
tf-idf-similarity-0.1.3 lib/tf-idf-similarity/term_count_model.rb
tf-idf-similarity-0.1.2 lib/tf-idf-similarity/term_count_model.rb
tf-idf-similarity-0.1.1 lib/tf-idf-similarity/term_count_model.rb