update text splitter

This commit is contained in:
jyong 2024-11-26 18:36:11 +08:00
parent 9ca453f7f7
commit 495163d5b3
2 changed files with 6 additions and 2 deletions

View File

@ -1467,7 +1467,9 @@ class SegmentService:
if dataset.indexing_technique == "high_quality" and embedding_model: if dataset.indexing_technique == "high_quality" and embedding_model:
# calc embedding use tokens # calc embedding use tokens
if document.doc_form == "qa_model": if document.doc_form == "qa_model":
tokens = embedding_model.get_text_embedding_num_tokens(texts=[content + segment_item["answer"]])[0] tokens = embedding_model.get_text_embedding_num_tokens(
texts=[content + segment_item["answer"]]
)[0]
else: else:
tokens = embedding_model.get_text_embedding_num_tokens(texts=[content])[0] tokens = embedding_model.get_text_embedding_num_tokens(texts=[content])[0]
segment_document = DocumentSegment( segment_document = DocumentSegment(

View File

@ -59,7 +59,9 @@ def batch_create_segment_to_index_task(
) )
word_count_change = 0 word_count_change = 0
if embedding_model: if embedding_model:
tokens_list = embedding_model.get_text_embedding_num_tokens(texts=[segment["content"] for segment in content]) tokens_list = embedding_model.get_text_embedding_num_tokens(
texts=[segment["content"] for segment in content]
)
else: else:
tokens_list = [0] * len(content) tokens_list = [0] * len(content)
for segment, tokens in zip(content, tokens_list): for segment, tokens in zip(content, tokens_list):