Sha256: 3d12ece86635527fe0725a8623307d6329b0e9e1755720623c80466cb5a47777
Contents?: true
Size: 982 Bytes
Versions: 10
Compression:
Stored size: 982 Bytes
Contents
require_relative './text_splitter'

module Baran
  # Text splitter that tries a prioritized list of separators and recurses
  # into any piece that is still not below the chunk size.
  #
  # NOTE(review): +chunk_size+ and +merged+ are not defined in this file; they
  # are presumably supplied by TextSplitter -- confirm against text_splitter.rb.
  class RecursiveCharacterTextSplitter < TextSplitter
    # Separator strings in priority order; the first one found in the text
    # being split is the one used for that level of recursion.
    attr_accessor :separators

    # @param chunk_size [Integer] forwarded unchanged to TextSplitter
    # @param chunk_overlap [Integer] forwarded unchanged to TextSplitter
    # @param separators [Array<String>, nil] separators in priority order;
    #   when nil, falls back to blank line, newline, then single space
    def initialize(chunk_size: 1024, chunk_overlap: 64, separators: nil)
      super(chunk_size: chunk_size, chunk_overlap: chunk_overlap)
      @separators = separators || ["\n\n", "\n", " "]
    end

    # Splits +text+ on the first separator it contains, groups consecutive
    # pieces shorter than +chunk_size+ through +merged+, and recursively
    # re-splits every piece that is not shorter than +chunk_size+.
    #
    # @param text [String] the text to split
    # @return [Array] the chunks, in their original order
    def splitted(text)
      # With no matching separator, '' makes String#split return one piece
      # per character.
      separator = separators.find { |candidate| text.include?(candidate) } || ''
      results = []
      pending = []

      # Ruby treats a single-space separator specially: it splits on runs of
      # whitespace and ignores leading whitespace.
      text.split(separator).each do |piece|
        if piece.length < chunk_size
          pending << piece
          next
        end

        # Flush the short pieces collected so far so output order is kept.
        unless pending.empty?
          results.concat(merged(pending, separator))
          pending.clear
        end

        if piece == text
          # Splitting made no progress (a single character that is still not
          # below chunk_size); recursing again would never terminate, so the
          # piece is emitted as-is.
          results << piece
        else
          results.concat(splitted(piece))
        end
      end

      results.concat(merged(pending, separator)) unless pending.empty?
      results
    end
  end
end
Version data entries
10 entries across 10 versions & 1 rubygems