Sha256: 0fe34ad0c8f1aee3b3ec1e99d8b50f36d2daeb4010777a9df289727ba8fe114d

Contents?: true

Size: 1.85 KB

Versions: 2

Compression:

Stored size: 1.85 KB

Contents

# frozen_string_literal: true

module Boxcars
  module VectorStore
    module InMemory
      # Builds an in-memory vector store from the files matched by a glob
      # pattern: the files are loaded, split into chunks, embedded, and each
      # chunk is wrapped in a Document together with its embedding.
      #
      # NOTE(review): load_data_files, split_text_into_chunks, generate_vectors,
      # raise_argument_error and Document are not defined in this class;
      # presumably they come from the included VectorStore module - confirm.
      class BuildFromFiles
        include VectorStore

        # Embedding back ends accepted by #validate_params.
        SUPPORTED_EMBEDDING_TOOLS = %i[openai tensorflow].freeze

        # @param params [Hash] build options
        # @option params [String] :training_data_path glob pattern of the files
        #   to load; expanded with File.absolute_path, so it must not be nil
        # @option params [Integer] :split_chunk_size chunk size exposed through
        #   the private reader (defaults to 2000)
        # @option params [Symbol] :embedding_tool :openai (default) or :tensorflow
        # @raise an argument error (via raise_argument_error) when the pattern's
        #   parent directory is missing, the pattern matches no files, or the
        #   embedding tool is not supported
        def initialize(params)
          @split_chunk_size = params[:split_chunk_size] || 2000
          @training_data_path = File.absolute_path(params[:training_data_path])
          @embedding_tool = params[:embedding_tool] || :openai

          validate_params(embedding_tool, training_data_path)
          @memory_vectors = []
        end

        # Loads, splits and embeds the training data, then returns the store.
        #
        # Every invocation rebuilds the store from scratch, so calling it more
        # than once never duplicates documents.
        #
        # @return [Hash] +{ type: :in_memory, vector_store: [Document, ...] }+
        def call
          data = load_data_files(training_data_path)
          texts = split_text_into_chunks(data)
          vectors = generate_vectors(texts)
          add_vectors(vectors, texts)

          {
            type: :in_memory,
            vector_store: memory_vectors
          }
        end

        private

        # Kept as private readers; the mixed-in helpers may depend on them.
        attr_reader :split_chunk_size, :training_data_path, :embedding_tool, :memory_vectors

        # Checks the glob pattern and the embedding tool before any work starts.
        #
        # @param embedding_tool [Symbol] requested embedding back end
        # @param training_data_path [String] absolute glob pattern
        # @return [nil] when every check passes
        def validate_params(embedding_tool, training_data_path)
          # Drop the wildcard characters so that dirname yields the fixed
          # directory prefix of the pattern ("/a/b/**/*.md" -> "/a/b").
          training_data_dir = File.dirname(training_data_path.delete('*'))

          raise_argument_error('training_data_path parent directory must exist') unless File.directory?(training_data_dir)
          raise_argument_error('No files found at the training_data_path pattern') if Dir.glob(training_data_path).empty?

          return if SUPPORTED_EMBEDDING_TOOLS.include?(embedding_tool)

          raise_argument_error('embedding_tool is invalid')
        end

        # Pairs each embedding with the text chunk at the same index and stores
        # the resulting documents, replacing whatever a previous run produced.
        #
        # @param vectors [Array<Hash>] entries read through their +:embedding+ key
        # @param texts [Array] text chunks, index-aligned with +vectors+
        # @return [Array] the freshly built documents
        def add_vectors(vectors, texts)
          @memory_vectors = vectors.each_with_index.map do |vector, index|
            Document.new(
              content: texts[index],
              embedding: vector[:embedding],
              metadata: {
                doc_id: index,
                training_data_path: training_data_path
              }
            )
          end
        end
      end
    end
  end
end

Version data entries

2 entries across 2 versions & 1 rubygems

Version Path
boxcars-0.2.13 lib/boxcars/vector_store/in_memory/build_from_files.rb
boxcars-0.2.12 lib/boxcars/vector_store/in_memory/build_from_files.rb