Sha256: 6f444d14a5593ee5ce436b62acde2f1fe3d583e142ff2a95c08fa948badf00a6

Contents?: true

Size: 1.64 KB

Versions: 1

Compression:

Stored size: 1.64 KB

Contents

module Eps
  # Bag-of-words encoder: splits strings into tokens and counts how often
  # each token occurs, per input row and across all rows.
  #
  # Options read by this class (all optional):
  #   :vocabulary      - initial vocabulary (defaults to an empty array)
  #   :tokenizer       - pattern handed to String#split
  #   :stop_words      - token(s) removed after splitting (compared verbatim,
  #                      i.e. they are not downcased for you)
  #   :case_sensitive  - when falsy, text is downcased before splitting
  #   :min_length      - fit drops tokens shorter than this
  #   :min_occurrences - fit drops tokens seen fewer times than this
  #   :max_occurrences - fit drops tokens seen more times than this
  #   :max_features    - fit keeps only this many of the most frequent tokens
  class TextEncoder
    attr_reader :options, :vocabulary

    def initialize(**options)
      @options = options
      @vocabulary = options[:vocabulary] || []
    end

    # Learns the vocabulary from +arr+ and returns the per-row token counts.
    #
    # The filters are applied in this order: min_length, min_occurrences,
    # max_occurrences, max_features. The surviving tokens become #vocabulary.
    #
    # Returns an array with one Hash (token => count, default 0) per row.
    # The returned hashes are not filtered by the vocabulary.
    def fit(arr)
      counts, fit = count_and_fit(arr)

      min_length = options[:min_length]
      counts.select! { |token, _| token.length >= min_length } if min_length

      min_occurrences = options[:min_occurrences]
      counts.select! { |_, count| count >= min_occurrences } if min_occurrences

      max_occurrences = options[:max_occurrences]
      counts.reject! { |_, count| count > max_occurrences } if max_occurrences

      max_features = options[:max_features]
      if max_features
        # sort_by is not stable, so break ties on insertion order explicitly;
        # otherwise the tokens kept among equal counts would be unpredictable.
        ranked = counts.each_with_index.sort_by { |(_, count), index| [-count, index] }
        counts = ranked[0...max_features].map(&:first).to_h
      end

      @vocabulary = counts.keys

      fit
    end

    # Returns the per-row token counts for +arr+ without touching #vocabulary.
    def transform(arr)
      token_counts(arr)
    end

    private

    # Returns [totals, rows]: the token counts summed over every row and the
    # per-row token counts themselves.
    def count_and_fit(arr)
      fit = token_counts(arr)

      counts = Hash.new(0)
      fit.each do |xc|
        xc.each { |token, count| counts[token] += count }
      end

      # remove empty strings (from the totals only; rows are left as-is)
      counts.delete("")

      [counts, fit]
    end

    # Tokenizes each element of +arr+ and counts its tokens.
    # Every element is converted with to_s first, so nil becomes "".
    def token_counts(arr)
      tokenizer = options[:tokenizer]
      stop_words = Array(options[:stop_words])
      case_sensitive = options[:case_sensitive]

      arr.map do |xi|
        text = xi.to_s
        text = text.downcase unless case_sensitive

        tokens = text.split(tokenizer) - stop_words

        # Hash.new(0) so that looking up an absent token yields 0
        tokens.each_with_object(Hash.new(0)) { |token, xc| xc[token] += 1 }
      end
    end
  end
end

Version data entries

1 entry across 1 version & 1 rubygem

Version Path
eps-0.5.0 lib/eps/text_encoder.rb