update text splitter
This commit is contained in:
parent
9ca453f7f7
commit
495163d5b3
@ -1467,7 +1467,9 @@ class SegmentService:
|
|||||||
if dataset.indexing_technique == "high_quality" and embedding_model:
|
if dataset.indexing_technique == "high_quality" and embedding_model:
|
||||||
# calc embedding use tokens
|
# calc embedding use tokens
|
||||||
if document.doc_form == "qa_model":
|
if document.doc_form == "qa_model":
|
||||||
tokens = embedding_model.get_text_embedding_num_tokens(texts=[content + segment_item["answer"]])[0]
|
tokens = embedding_model.get_text_embedding_num_tokens(
|
||||||
|
texts=[content + segment_item["answer"]]
|
||||||
|
)[0]
|
||||||
else:
|
else:
|
||||||
tokens = embedding_model.get_text_embedding_num_tokens(texts=[content])[0]
|
tokens = embedding_model.get_text_embedding_num_tokens(texts=[content])[0]
|
||||||
segment_document = DocumentSegment(
|
segment_document = DocumentSegment(
|
||||||
|
|||||||
@ -59,7 +59,9 @@ def batch_create_segment_to_index_task(
|
|||||||
)
|
)
|
||||||
word_count_change = 0
|
word_count_change = 0
|
||||||
if embedding_model:
|
if embedding_model:
|
||||||
tokens_list = embedding_model.get_text_embedding_num_tokens(texts=[segment["content"] for segment in content])
|
tokens_list = embedding_model.get_text_embedding_num_tokens(
|
||||||
|
texts=[segment["content"] for segment in content]
|
||||||
|
)
|
||||||
else:
|
else:
|
||||||
tokens_list = [0] * len(content)
|
tokens_list = [0] * len(content)
|
||||||
for segment, tokens in zip(content, tokens_list):
|
for segment, tokens in zip(content, tokens_list):
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user