Sha256: 227218f13dd381a4b24479bfd3630b3c9e948c1cd029e855714b6cf38eb3df98

Contents?: true

Size: 1.12 KB

Versions: 1

Compression:

Stored size: 1.12 KB

Contents

# frozen_string_literal: true

require "roseflow/text/splitter"

module Roseflow
  module Text
    # Splits text into chunks by cutting it on the first separator (taken from
    # an ordered list) that occurs in the text, then greedily packing the
    # resulting segments into chunks whose joined length stays within
    # +chunk_size+ characters.
    #
    # Despite the name, only one separator is applied per call: a segment that
    # is longer than +chunk_size+ on its own is not split any further and ends
    # up in a chunk that exceeds the limit.
    #
    # NOTE(review): +chunk_size+ and +chunk_overlap+ are read from instance
    # variables that are presumably assigned by Splitter#initialize from the
    # keyword arguments -- confirm against roseflow/text/splitter.
    class RecursiveCharacterSplitter < Splitter
      # Default separators, tried in order. The trailing empty string occurs
      # in every text, so with this list a separator is always found.
      SEPARATORS = ["\n\n", "\n", " ", ""].freeze

      # Segments of a chunk are always glued back together with this string,
      # whichever separator was used to cut the text.
      JOINER = " "

      attr_reader :chunk_size, :chunk_overlap

      # @param separators [Array<String>, nil] separators to try, in priority
      #   order; SEPARATORS is used when nil
      # @param kwargs [Hash] forwarded unchanged to Splitter#initialize
      def initialize(separators = nil, **kwargs)
        super(**kwargs)
        @separators = separators || SEPARATORS
      end

      # Cuts +text+ on the chosen separator and packs the segments into chunks.
      #
      # When a segment no longer fits, the current chunk is closed and the next
      # one starts with the last +chunk_overlap+ segments (a segment count, not
      # a character count) of the closed chunk, followed by the new segment.
      #
      # @param text [String] the text to split
      # @return [Array<String>] the chunks, each made of segments joined with a
      #   single space; empty when +text+ yields no segments
      def split(text)
        chunks = []
        current = []
        current_size = 0 # length of current.join(JOINER), maintained incrementally

        text.split(find_separator(text)).each do |segment|
          # Close the chunk only if it already holds something; an oversized
          # first segment must not leave an empty chunk behind.
          if !current.empty? && current_size + JOINER.size + segment.size > chunk_size
            chunks << current
            current = current.last(chunk_overlap)
            current_size = joined_size(current)
          end

          # Exactly one joiner sits between two neighbouring segments.
          current_size += JOINER.size unless current.empty?
          current_size += segment.size
          current << segment
        end

        chunks << current unless current.empty?
        chunks.map { |chunk| chunk.join(JOINER) }
      end

      private

      # Returns the first configured separator that occurs in +text+, falling
      # back to the last configured separator when none of them does.
      def find_separator(text)
        @separators.find { |separator| text.include?(separator) } || @separators.last
      end

      # Length the given segments have once joined with JOINER.
      def joined_size(segments)
        return 0 if segments.empty?

        segments.sum(&:size) + (JOINER.size * (segments.size - 1))
      end
    end # RecursiveCharacterSplitter
  end # Text
end # Roseflow

Version data entries

1 entry across 1 version & 1 rubygem

Version Path
roseflow-0.1.0 lib/roseflow/text/recursive_character_splitter.rb