Sha256: bd864197634023896fa1b2d7bb56f9c030d5b9d607a82947c612d3850436d6a6

Contents?: true

Size: 1.4 KB

Versions: 3

Compression:

Stored size: 1.4 KB

Contents

module FastText
  # Word-vector model built on top of Model. Training parameters come from
  # DEFAULT_OPTIONS overlaid with the instance's @options.
  #
  # NOTE(review): `m` and `@options` are not defined in this file; they are
  # presumably provided by the Model superclass - confirm there.
  class Vectorizer < Model
    # Baseline training options. Values in @options take precedence over
    # these when #fit builds the hash passed to the extension.
    # Frozen so the shared defaults cannot be mutated by accident; #fit only
    # ever reads them through Hash#merge, which returns a new hash.
    DEFAULT_OPTIONS = {
      lr: 0.5,
      lr_update_rate: 100,
      dim: 100,
      ws: 5,
      epoch: 5,
      min_count: 1,
      min_count_label: 0,
      neg: 5,
      word_ngrams: 1,
      loss: "ns",
      model: "skipgram",
      bucket: 2000000,
      minn: 3,
      maxn: 6,
      thread: 3,
      t: 0.0001,
      verbose: 2,
      pretrained_vectors: "",
      save_output: false,
      seed: 0,
      autotune_validation_file: "",
      autotune_metric: "f1",
      autotune_predictions: 1,
      autotune_duration: 60 * 5,
      autotune_model_size: ""
    }.freeze

    # Trains the model.
    #
    # @param x [String, #each] a String is passed through unchanged as the
    #   training input path; anything else is iterated and its documents are
    #   written to a temporary file, one per line
    # @return the value returned by the underlying `train` call
    def fit(x)
      input = input_path(x)
      @m ||= Ext::Model.new
      m.train(DEFAULT_OPTIONS.merge(@options).merge(input: input))
    ensure
      # Runs whether training succeeded or raised; does not alter the
      # method's return value.
      cleanup_input_tempfile
    end

    # Looks up the nearest neighbors of +word+.
    #
    # @param word [String] query word
    # @param k [Integer] number of neighbors requested from the extension
    # @return [Hash] built by reversing each pair returned by the extension
    #   (presumably word => score - confirm against the extension)
    def nearest_neighbors(word, k: 10)
      m.nearest_neighbors(word, k).map(&:reverse).to_h
    end

    # Runs an analogy query over three words.
    #
    # @param word_a [String]
    # @param word_b [String]
    # @param word_c [String]
    # @param k [Integer] number of results requested from the extension
    # @return [Hash] built by reversing each pair returned by the extension
    #   (presumably word => score - confirm against the extension)
    def analogies(word_a, word_b, word_c, k: 10)
      m.analogies(k, word_a, word_b, word_c).map(&:reverse).to_h
    end

    private

    # Resolves the training input to a file path.
    #
    # A String is returned as is. Otherwise each element of +x+ is written
    # to a temporary file on its own line.
    #
    # separate example by newlines
    # https://github.com/facebookresearch/fastText/issues/518
    def input_path(x)
      return x if x.is_a?(String)

      # Keep the Tempfile referenced from the instance: a Tempfile's file is
      # deleted when the object is garbage collected, so returning only the
      # path would let the file vanish before (or while) training reads it.
      tempfile = @input_tempfile = Tempfile.new("fasttext")
      x.each do |xi|
        tempfile.write(xi.gsub("\n", " ")) # replace newlines in document
        tempfile.write("\n")
      end
      tempfile.close
      tempfile.path
    end

    # Closes and deletes the temporary training file created by #input_path,
    # if there is one.
    def cleanup_input_tempfile
      return unless @input_tempfile

      @input_tempfile.close!
      @input_tempfile = nil
    end
  end
end

Version data entries

3 entries across 3 versions & 1 rubygem

Version Path
fasttext-0.2.1 lib/fasttext/vectorizer.rb
fasttext-0.2.0 lib/fasttext/vectorizer.rb
fasttext-0.1.3 lib/fasttext/vectorizer.rb