From 495163d5b3ba3ef3b82a3900673658eabcdcc59c Mon Sep 17 00:00:00 2001 From: jyong <718720800@qq.com> Date: Tue, 26 Nov 2024 18:36:11 +0800 Subject: [PATCH] update text splitter --- api/services/dataset_service.py | 4 +++- api/tasks/batch_create_segment_to_index_task.py | 4 +++- 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/api/services/dataset_service.py b/api/services/dataset_service.py index 6e071c17ec..1f631fea38 100644 --- a/api/services/dataset_service.py +++ b/api/services/dataset_service.py @@ -1467,7 +1467,9 @@ class SegmentService: if dataset.indexing_technique == "high_quality" and embedding_model: # calc embedding use tokens if document.doc_form == "qa_model": - tokens = embedding_model.get_text_embedding_num_tokens(texts=[content + segment_item["answer"]])[0] + tokens = embedding_model.get_text_embedding_num_tokens( + texts=[content + segment_item["answer"]] + )[0] else: tokens = embedding_model.get_text_embedding_num_tokens(texts=[content])[0] segment_document = DocumentSegment( diff --git a/api/tasks/batch_create_segment_to_index_task.py b/api/tasks/batch_create_segment_to_index_task.py index 39c032caad..41e1419d25 100644 --- a/api/tasks/batch_create_segment_to_index_task.py +++ b/api/tasks/batch_create_segment_to_index_task.py @@ -59,7 +59,9 @@ def batch_create_segment_to_index_task( ) word_count_change = 0 if embedding_model: - tokens_list = embedding_model.get_text_embedding_num_tokens(texts=[segment["content"] for segment in content]) + tokens_list = embedding_model.get_text_embedding_num_tokens( + texts=[segment["content"] for segment in content] + ) else: tokens_list = [0] * len(content) for segment, tokens in zip(content, tokens_list):