Sha256: 0fe34ad0c8f1aee3b3ec1e99d8b50f36d2daeb4010777a9df289727ba8fe114d
Contents?: true
Size: 1.85 KB
Versions: 2
Compression:
Stored size: 1.85 KB
Contents
# frozen_string_literal: true

module Boxcars
  module VectorStore
    module InMemory
      # Builds an in-memory vector store from the files matched by a glob
      # pattern.
      #
      # The pipeline helpers (+load_data_files+, +split_text_into_chunks+,
      # +generate_vectors+), +raise_argument_error+ and +Document+ are not
      # defined in this file; presumably they are supplied by the included
      # VectorStore module / namespace -- confirm against that module.
      class BuildFromFiles
        include VectorStore

        # @param params [Hash] build options
        # @option params [Integer] :split_chunk_size chunk size, defaults to 2000.
        #   Stored here but not read in this file; presumably consumed by
        #   +split_text_into_chunks+ -- confirm.
        # @option params [String] :training_data_path glob pattern of the
        #   training files; expanded to an absolute path.
        # @option params [Symbol] :embedding_tool +:openai+ (default) or +:tensorflow+.
        def initialize(params)
          @split_chunk_size = params[:split_chunk_size] || 2000
          @training_data_path = File.absolute_path(params[:training_data_path])
          @embedding_tool = params[:embedding_tool] || :openai

          validate_params(embedding_tool, training_data_path)

          @memory_vectors = []
        end

        # Loads the training files, splits them into chunks, embeds the chunks
        # and collects one document per embedding.
        #
        # @return [Hash] +{ type: :in_memory, vector_store: <Array of documents> }+
        def call
          # Start from an empty store on every run so that invoking #call more
          # than once on the same instance neither duplicates documents nor
          # mutates the array handed out by an earlier run.
          @memory_vectors = []

          data = load_data_files(training_data_path)
          texts = split_text_into_chunks(data)
          vectors = generate_vectors(texts)
          add_vectors(vectors, texts)

          {
            type: :in_memory,
            vector_store: memory_vectors
          }
        end

        private

        attr_reader :split_chunk_size, :training_data_path, :embedding_tool, :memory_vectors

        # Checks that the directory part of the pattern exists, that the
        # pattern matches at least one file, and that the embedding tool is one
        # of the supported symbols. Failures are reported through
        # +raise_argument_error+ (defined elsewhere).
        #
        # @param embedding_tool [Symbol]
        # @param training_data_path [String] absolute glob pattern
        # @return [nil]
        def validate_params(embedding_tool, training_data_path)
          # Strip the `*` / `**` wildcards so that File.dirname yields the
          # directory the pattern is rooted in.
          training_data_dir = File.dirname(training_data_path.gsub(/\*{1,2}/, ''))

          raise_argument_error('training_data_path parent directory must exist') unless File.directory?(training_data_dir)
          raise_argument_error('No files found at the training_data_path pattern') if Dir.glob(training_data_path).empty?

          return if %i[openai tensorflow].include?(embedding_tool)

          raise_argument_error('embedding_tool is invalid')
        end

        # Appends one Document per vector to +memory_vectors+, pairing each
        # vector with the text chunk at the same index.
        #
        # @param vectors [Array<Hash>] each element is read via +[:embedding]+
        # @param texts [Array] text chunks, index-aligned with +vectors+
        # @return [Array] the accumulated +memory_vectors+
        def add_vectors(vectors, texts)
          # each_with_index: this loop runs only for its side effect, so there
          # is no reason to build (and throw away) a mapped array.
          vectors.each_with_index do |vector, index|
            memory_vectors << Document.new(
              content: texts[index],
              embedding: vector[:embedding],
              metadata: {
                doc_id: index,
                training_data_path: training_data_path
              }
            )
          end

          memory_vectors
        end
      end
    end
  end
end
Version data entries
2 entries across 2 versions & 1 rubygems
Version | Path |
---|---|
boxcars-0.2.13 | lib/boxcars/vector_store/in_memory/build_from_files.rb |
boxcars-0.2.12 | lib/boxcars/vector_store/in_memory/build_from_files.rb |