Sha256: b05f5a298781d73cb2a2d89aba3dcd27a8c04b4576b96bea4076a5f5c86e8c51

Contents?: true

Size: 1.24 KB

Versions: 2

Compression:

Stored size: 1.24 KB

Contents

module FastText
  # Model subclass that trains with the "skipgram" model / "ns" loss defaults
  # below and exposes nearest-neighbor and analogy queries on the result.
  #
  # Relies on +@options+ and the +m+ accessor, neither of which is defined in
  # this file (presumably both come from Model -- confirm there).
  class Vectorizer < Model
    # Baseline training options. #fit merges +@options+ over these, then adds
    # the +input+ path, before handing the hash to the native trainer.
    DEFAULT_OPTIONS = {
      lr: 0.5,
      lr_update_rate: 100,
      dim: 100,
      ws: 5,
      epoch: 5,
      min_count: 1,
      min_count_label: 0,
      neg: 5,
      word_ngrams: 1,
      loss: "ns",
      model: "skipgram",
      bucket: 2_000_000,
      minn: 3,
      maxn: 6,
      thread: 3,
      t: 0.0001,
      verbose: 2,
      pretrained_vectors: "",
      save_output: false,
      # seed: 0
    }

    # Trains the model on +x+.
    #
    # @param x [String, #each] a String is used as-is as the training file
    #   path; anything else is iterated and each element is written as one
    #   line of a temporary training file
    # @return the value returned by the native +train+ call
    def fit(x)
      with_input_path(x) do |input|
        @m ||= Ext::Model.new
        m.train(DEFAULT_OPTIONS.merge(@options).merge(input: input))
      end
    end

    # Queries the trained model for the +k+ nearest neighbors of +word+.
    #
    # @param word [String] query word
    # @param k [Integer] number of neighbors requested
    # @return [Hash] the native result pairs, each reversed, as a Hash
    #   (presumably word => score -- confirm against Ext::Model)
    def nearest_neighbors(word, k: 10)
      m.nearest_neighbors(word, k).map(&:reverse).to_h
    end

    # Queries the trained model for analogies over the three given words.
    #
    # @param word_a [String]
    # @param word_b [String]
    # @param word_c [String]
    # @param k [Integer] number of results requested
    # @return [Hash] the native result pairs, each reversed, as a Hash
    #   (presumably word => score -- confirm against Ext::Model)
    def analogies(word_a, word_b, word_c, k: 10)
      # Note the native call takes k first, then the words.
      m.analogies(k, word_a, word_b, word_c).map(&:reverse).to_h
    end

    private

    # Yields a path to a training file for +x+ and returns the block's value.
    #
    # A String is yielded unchanged. Otherwise each element of +x+ is written
    # to a temporary file, one example per line, see
    # https://github.com/facebookresearch/fastText/issues/518
    #
    # The Tempfile stays referenced until the block returns: an unreferenced
    # Tempfile may be garbage-collected, which unlinks the file while the
    # trainer is still reading it. The file is removed once the block finishes.
    def with_input_path(x)
      return yield(x) if x.is_a?(String)

      tempfile = Tempfile.new("fasttext")
      begin
        x.each do |xi|
          # Newlines inside a document would split it into several examples.
          tempfile.write(xi.tr("\n", " "))
          tempfile.write("\n")
        end
        tempfile.close # flush to disk before the trainer opens the path
        yield tempfile.path
      ensure
        tempfile.close! # close (if still open) and unlink
      end
    end
  end
end

Version data entries

2 entries across 2 versions & 1 rubygem

Version Path
fasttext-0.1.2 lib/fasttext/vectorizer.rb
fasttext-0.1.1 lib/fasttext/vectorizer.rb