Sha256: 3d12ece86635527fe0725a8623307d6329b0e9e1755720623c80466cb5a47777

Contents?: true

Size: 982 Bytes

Versions: 10

Compression:

Stored size: 982 Bytes

Contents

require_relative './text_splitter'

module Baran
  # Text splitter that picks the first configured separator present in the
  # text, splits on it, and recurses into any piece that is still too long.
  #
  # `chunk_size` and `merged` are not defined in this file; presumably they
  # come from TextSplitter (see ./text_splitter) -- confirm there.
  class RecursiveCharacterTextSplitter < TextSplitter
    # Candidate separators, tried in order by #splitted.
    attr_accessor :separators

    # @param chunk_size [Integer] forwarded unchanged to TextSplitter
    # @param chunk_overlap [Integer] forwarded unchanged to TextSplitter
    # @param separators [Array<String>, nil] separators in priority order;
    #   when nil, defaults to blank line ("\n\n"), newline ("\n"), space (" ")
    def initialize(chunk_size: 1024, chunk_overlap: 64, separators: nil)
      super(chunk_size: chunk_size, chunk_overlap: chunk_overlap)
      @separators = separators || ["\n\n", "\n", " "]
    end

    # Splits +text+ on the first separator it contains and returns the merged
    # pieces. Pieces shorter than `chunk_size` are collected and handed to
    # `merged` together with the separator; longer pieces are split again
    # recursively, and the results are concatenated in document order.
    #
    # @param text [String] the text to split
    # @return [Array] concatenation of the arrays returned by `merged` and by
    #   the recursive calls
    def splitted(text)
      # Fall back to '' when no separator occurs in the text; String#split('')
      # then yields the individual characters.
      # NOTE: String#split(' ') is special-cased by Ruby: it splits on runs of
      # whitespace and ignores leading whitespace.
      separator = separators.find { |candidate| text.include?(candidate) } || ''

      results = []
      good_splits = []

      text.split(separator).each do |piece|
        # A piece produced by the empty separator is a single character and
        # cannot be split any further. Keep it even when it is not below
        # chunk_size (chunk_size <= 1); recursing on it would never terminate.
        if piece.length < chunk_size || separator.empty?
          good_splits << piece
        else
          # Flush the pending short pieces first so output order is preserved.
          unless good_splits.empty?
            results.concat(merged(good_splits, separator))
            good_splits.clear
          end
          results.concat(splitted(piece))
        end
      end

      results.concat(merged(good_splits, separator)) unless good_splits.empty?
      results
    end
  end
end

Version data entries

10 entries across 10 versions & 1 rubygem

Version Path
baran-0.2.1 lib/baran/recursive_character_text_splitter.rb
baran-0.2.0 lib/baran/recursive_character_text_splitter.rb
baran-0.1.12 lib/baran/recursive_character_text_splitter.rb
baran-0.1.11 lib/baran/recursive_character_text_splitter.rb
baran-0.1.10 lib/baran/recursive_character_text_splitter.rb
baran-0.1.9 lib/baran/recursive_character_text_splitter.rb
baran-0.1.8 lib/baran/recursive_character_text_splitter.rb
baran-0.1.7 lib/baran/recursive_character_text_splitter.rb
baran-0.1.6 lib/baran/recursive_character_text_splitter.rb
baran-0.1.5 lib/baran/recursive_character_text_splitter.rb