Sha256: ec1d326542e0a87ea54567bdaea23e2b7502b4457e0b6f2c91f79413950cd661

Contents?: true

Size: 1.64 KB

Versions: 12

Compression:

Stored size: 1.64 KB

Contents

module Eps
  # Bag-of-words style encoder: splits each input value into tokens and
  # counts how often each token occurs, learning a vocabulary in #fit.
  #
  # Options read by this class (all optional):
  #   :vocabulary      - initial vocabulary (defaults to an empty array)
  #   :tokenizer       - pattern handed to String#split
  #   :case_sensitive  - when falsy, text (and stop words) are downcased
  #   :stop_words      - token or list of tokens removed from every row
  #   :min_length      - minimum token length kept in the vocabulary
  #   :min_occurrences - minimum total count kept in the vocabulary
  #   :max_occurrences - maximum total count kept in the vocabulary
  #   :max_features    - cap on vocabulary size; most frequent tokens win
  class TextEncoder
    attr_reader :options, :vocabulary

    def initialize(**options)
      @options = options
      @vocabulary = options[:vocabulary] || []
    end

    # Learns the vocabulary from +arr+ and stores it in #vocabulary.
    #
    # The vocabulary filters are applied to the totals across all rows, in
    # this order: min_length, min_occurrences, max_occurrences, max_features.
    #
    # Returns one Hash of token => count per element of +arr+. These row
    # hashes are NOT filtered by the vocabulary options.
    def fit(arr)
      counts, rows = count_and_fit(arr)

      min_length = options[:min_length]
      counts.select! { |token, _| token.length >= min_length } if min_length

      min_occurrences = options[:min_occurrences]
      counts.select! { |_, count| count >= min_occurrences } if min_occurrences

      max_occurrences = options[:max_occurrences]
      counts.reject! { |_, count| count > max_occurrences } if max_occurrences

      max_features = options[:max_features]
      if max_features
        # sort_by is not guaranteed to be stable, so break ties on the
        # first-seen position to keep the selected vocabulary reproducible.
        ranked = counts.sort_by.with_index { |(_, count), index| [-count, index] }
        counts = ranked[0...max_features].to_h
      end

      @vocabulary = counts.keys

      rows
    end

    # Tokenizes and counts +arr+ without touching the stored vocabulary.
    #
    # Returns one Hash of token => count per element of +arr+; the result is
    # not restricted to #vocabulary.
    def transform(arr)
      count_and_fit(arr).last
    end

    private

    # Tokenizes every element of +arr+ and tallies the tokens.
    #
    # Returns [counts, rows]: +counts+ holds the totals across all rows (with
    # the empty-string token removed) and +rows+ holds the per-element tallies.
    def count_and_fit(arr)
      tokenizer = options[:tokenizer]
      case_sensitive = options[:case_sensitive]

      # Tokens are downcased when matching is case-insensitive, so the stop
      # words must be normalized the same way or they would never match.
      stop_words = Array(options[:stop_words])
      unless case_sensitive
        stop_words = stop_words.map { |word| word.respond_to?(:downcase) ? word.downcase : word }
      end

      rows =
        arr.map do |xi|
          text = xi.to_s
          text = text.downcase unless case_sensitive
          tokens = text.split(tokenizer) - stop_words

          tokens.each_with_object(Hash.new(0)) { |token, row| row[token] += 1 }
        end

      counts = Hash.new(0)
      rows.each do |row|
        row.each { |token, count| counts[token] += count }
      end

      # remove empty strings (only from the totals; rows are left as-is)
      counts.delete("")

      [counts, rows]
    end
  end
end

Version data entries

12 entries across 12 versions & 1 rubygem

Version Path
eps-0.4.1 lib/eps/text_encoder.rb
eps-0.4.0 lib/eps/text_encoder.rb
eps-0.3.9 lib/eps/text_encoder.rb
eps-0.3.8 lib/eps/text_encoder.rb
eps-0.3.7 lib/eps/text_encoder.rb
eps-0.3.6 lib/eps/text_encoder.rb
eps-0.3.5 lib/eps/text_encoder.rb
eps-0.3.4 lib/eps/text_encoder.rb
eps-0.3.3 lib/eps/text_encoder.rb
eps-0.3.2 lib/eps/text_encoder.rb
eps-0.3.1 lib/eps/text_encoder.rb
eps-0.3.0 lib/eps/text_encoder.rb