Sha256: 227218f13dd381a4b24479bfd3630b3c9e948c1cd029e855714b6cf38eb3df98
Contents?: true
Size: 1.12 KB
Versions: 1
Compression:
Stored size: 1.12 KB
Contents
# frozen_string_literal: true

require "roseflow/text/splitter"

module Roseflow
  module Text
    # Splits a text on the first separator (from an ordered candidate list)
    # that occurs in it, then groups the resulting segments into chunks,
    # repeating the trailing segments of one chunk at the start of the next.
    #
    # NOTE(review): +chunk_size+ and +chunk_overlap+ are read through the
    # attr_readers below, but this class never assigns them; presumably
    # Splitter#initialize sets @chunk_size / @chunk_overlap from the keyword
    # arguments -- confirm against the parent class.
    class RecursiveCharacterSplitter < Splitter
      # Default separator candidates, tried in order. Frozen because every
      # instance built without explicit separators shares this one array.
      SEPARATORS = ["\n\n", "\n", " ", ""].freeze

      attr_reader :chunk_size, :chunk_overlap

      # @param separators [Array<String>, nil] ordered separator candidates;
      #   SEPARATORS is used when nil
      # @param kwargs [Hash] forwarded unchanged to Splitter#initialize
      def initialize(separators = nil, **kwargs)
        super(**kwargs)
        @separators = separators || SEPARATORS
      end

      # Splits +text+ into chunks.
      #
      # The text is cut on the separator chosen by #find_separator. Segments
      # are appended to the current chunk until the size budget is exceeded;
      # a new chunk then starts with the last +chunk_overlap+ *segments* (not
      # characters) of the previous chunk followed by the new segment. Each
      # chunk is joined with a single space, whichever separator was used.
      #
      # @param text [String] the text to split
      # @return [Array<String>] the chunks; [""] when no segments are produced
      def split(text)
        segments = text.split(find_separator(text))
        current_size = 0
        results = [[]]

        segments.each do |segment|
          current = results.last

          if current_size + segment.size <= chunk_size
            # NOTE(review): this adds the number of segments already in the
            # chunk on every append, which looks like an over-estimate of the
            # joining spaces. Kept as-is so existing chunk boundaries do not
            # move -- confirm the intended accounting.
            current_size += segment.size + current.size
            current << segment
          elsif current.empty?
            # The very first segment is already over budget. Put it in the
            # still-empty chunk rather than opening a new one, which would
            # leave a spurious "" at the head of the result.
            current << segment
            current_size = segment.size
          else
            # Array#last(n) returns a new array, so the previous chunk is
            # not modified by the append.
            overlap = current.last(chunk_overlap) << segment
            current_size = overlap.sum(&:size) + chunk_overlap
            results << overlap
          end
        end

        results.map { |chunk| chunk.join(" ") }
      end

      private

      # Returns the first configured separator contained in +text+, or the
      # last configured separator when none of them occurs.
      def find_separator(text)
        @separators.find { |separator| text.include?(separator) } || @separators.last
      end
    end # RecursiveCharacterSplitter
  end # Text
end # Roseflow
Version data entries
1 entries across 1 versions & 1 rubygems
Version | Path |
---|---|
roseflow-0.1.0 | lib/roseflow/text/recursive_character_splitter.rb |