From a74d42848979c46a6bd39417397b70408112e9e7 Mon Sep 17 00:00:00 2001 From: jyong <718720800@qq.com> Date: Tue, 26 Nov 2024 18:19:18 +0800 Subject: [PATCH] update text spliter --- api/core/rag/splitter/text_splitter.py | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/api/core/rag/splitter/text_splitter.py b/api/core/rag/splitter/text_splitter.py index 7dd62f8de1..c2ead2c715 100644 --- a/api/core/rag/splitter/text_splitter.py +++ b/api/core/rag/splitter/text_splitter.py @@ -224,8 +224,8 @@ class CharacterTextSplitter(TextSplitter): splits = _split_text_with_regex(text, self._separator, self._keep_separator) _separator = "" if self._keep_separator else self._separator _good_splits_lengths = [] # cache the lengths of the splits - for split in splits: - _good_splits_lengths.append(self._length_function(split)) + if splits: + _good_splits_lengths.extend(self._length_function(splits)) return self._merge_splits(splits, _separator, _good_splits_lengths) @@ -478,9 +478,8 @@ class RecursiveCharacterTextSplitter(TextSplitter): _good_splits = [] _good_splits_lengths = [] # cache the lengths of the splits _separator = "" if self._keep_separator else separator - - for s in splits: - s_len = self._length_function(s) + s_lens = self._length_function(splits) + for s, s_len in zip(splits, s_lens): if s_len < self._chunk_size: _good_splits.append(s) _good_splits_lengths.append(s_len)