Sha256: 1be2225c0e88ceecc33e97bcf8d8f2f16cff81d2c32e591cd10766e3369fc799

Contents?: true

Size: 1.35 KB

Versions: 3

Compression:

Stored size: 1.35 KB

Contents

# `Set` is only autoloaded from Ruby 3.2 onwards; requiring it here keeps this
# file loadable on its own on older interpreters. The call is idempotent.
require 'set'

# Namespace for the bag-of-words / TF-IDF classes.
module BowTfidf
  # Collects words grouped by category label and keeps per-category
  # occurrence counts for every word.
  #
  # Structures built by this class:
  #
  #   words      => { word => { categories: { category_id => { entries: Integer } } } }
  #   categories => { key  => { id: Integer, key: key, words: Set<String> } }
  #
  # NOTE(review): `Computation` (defined elsewhere) receives this object after
  # every update and may presumably add further fields to these hashes -
  # confirm against its implementation.
  class BagOfWords
    # @return [Hash] per-word statistics, keyed by the word itself
    # @return [Hash] category records, keyed by the caller-supplied category key
    attr_reader :words, :categories

    def initialize
      @words = {}
      @categories = {}
    end

    # Adds labeled words to the bag and then runs the TF-IDF computation.
    #
    # The whole input is validated before anything is stored, so invalid input
    # leaves the bag untouched.
    #
    # @param data [Hash{Object => Enumerable<String>}] category key mapped to
    #   the words belonging to that category (duplicates are counted)
    # @return [Object] whatever `Computation#call` returns
    # @raise [ArgumentError] if +data+ is not a Hash, or any value is not an
    #   Enumerable consisting solely of Strings
    def add_labeled_data!(data)
      validate_labeled_data(data)

      data.each do |category_key, category_words|
        category = category_by_key(category_key)
        category_words.each { |word| add_word(word, category) }
      end

      compute_tfidf
    end

    private

    # Ensures +data+ is a Hash whose values are enumerables of Strings.
    #
    # @param data [Object] candidate input for {#add_labeled_data!}
    # @raise [ArgumentError] when the shape is wrong
    def validate_labeled_data(data)
      raise ArgumentError, 'Hash with arrays expected' unless data.is_a?(Hash)

      data.each_value do |array|
        raise ArgumentError, 'Hash with arrays expected' unless array.is_a?(Enumerable)

        next if array.all? { |value| value.is_a?(String) }

        raise ArgumentError, 'Hash with arrays of strings expected'
      end
    end

    # Records one occurrence of +word+ in +category+.
    #
    # @param word [String]
    # @param category [Hash] a record obtained from {#category_by_key}
    def add_word(word, category)
      word_categories = (words[word] ||= { categories: {} })[:categories]
      stats = (word_categories[category[:id]] ||= { entries: 0 })
      stats[:entries] += 1

      # `category` is the very hash stored in `categories`, so no second
      # lookup by key is needed.
      category[:words] << word
    end

    # Returns the record for +key+, creating it on first use.
    #
    # Ids are assigned sequentially from 0 in order of first appearance: the
    # right-hand side is evaluated before the new entry is stored, so
    # `categories.length` is the count of categories already known.
    #
    # @param key [Object] caller-supplied category label
    # @return [Hash] the category record (+:id+, +:key+, +:words+)
    def category_by_key(key)
      categories[key] ||= {
        id: categories.length,
        key: key,
        words: Set[]
      }
    end

    # Hands this bag to `Computation` (defined elsewhere in the project).
    def compute_tfidf
      Computation.new(self).call
    end
  end
end

Version data entries

3 entries across 3 versions & 1 rubygem

Version Path
bow_tfidf-0.1.2 lib/bow_tfidf/bag_of_words.rb
bow_tfidf-0.1.1 lib/bow_tfidf/bag_of_words.rb
bow_tfidf-0.1.0 lib/bow_tfidf/bag_of_words.rb