From e90642d6246355806102bad516aea543d7821e56 Mon Sep 17 00:00:00 2001 From: jyong <718720800@qq.com> Date: Tue, 11 Mar 2025 16:53:32 +0800 Subject: [PATCH] fix metadata filter LIKE matching and empty/not empty operators --- .../tidb_on_qdrant/tidb_on_qdrant_vector.py | 2 +- api/core/rag/retrieval/dataset_retrieval.py | 28 ++++++++++------ .../knowledge_retrieval_node.py | 32 ++++++++++++------- 3 files changed, 41 insertions(+), 21 deletions(-) diff --git a/api/core/rag/datasource/vdb/tidb_on_qdrant/tidb_on_qdrant_vector.py b/api/core/rag/datasource/vdb/tidb_on_qdrant/tidb_on_qdrant_vector.py index 8af61ab8b2..3c2f6ed53e 100644 --- a/api/core/rag/datasource/vdb/tidb_on_qdrant/tidb_on_qdrant_vector.py +++ b/api/core/rag/datasource/vdb/tidb_on_qdrant/tidb_on_qdrant_vector.py @@ -331,7 +331,7 @@ class TidbOnQdrantVector(BaseVector): should_conditions = [] for document_id_filter in document_ids_filter: should_conditions.append( - models.FieldCondition( + models.FieldCondition( key="metadata.document_id", match=models.MatchValue(value=document_id_filter), ) diff --git a/api/core/rag/retrieval/dataset_retrieval.py b/api/core/rag/retrieval/dataset_retrieval.py index 0b56e7438c..8c3f4194d6 100644 --- a/api/core/rag/retrieval/dataset_retrieval.py +++ b/api/core/rag/retrieval/dataset_retrieval.py @@ -7,7 +7,7 @@ from collections.abc import Generator, Mapping from typing import Any, Optional, Union, cast from flask import Flask, current_app -from sqlalchemy import Integer, and_, or_ +from sqlalchemy import Integer, and_, or_, text from sqlalchemy import cast as sqlalchemy_cast from core.app.app_config.entities import ( @@ -847,7 +847,7 @@ class DatasetRetrieval: for condition in metadata_filtering_conditions.conditions: metadata_name = condition.name expected_value = condition.value - if expected_value: + if expected_value or condition.comparison_operator in ("empty", "not empty"): if isinstance(expected_value, str): expected_value = self._replace_metadata_filter_value(expected_value, inputs) filters = 
self._process_metadata_filter_func( @@ -930,17 +930,27 @@ class DatasetRetrieval: return None return automatic_metadata_filters - def _process_metadata_filter_func(self, condition: str, metadata_name: str, value: str, filters: list): + def _process_metadata_filter_func(self, condition: str, metadata_name: str, value: Optional[str], filters: list): match condition: case "contains": - filters.append(DatasetDocument.doc_metadata[metadata_name].like(f'"%{value}%"')) + filters.append( + (text("documents.doc_metadata ->> :key LIKE :value")).params(key=metadata_name, value=f"%{value}%") + ) case "not contains": - filters.append(DatasetDocument.doc_metadata[metadata_name].notlike(f'"%{value}%"')) + filters.append( + (text("documents.doc_metadata ->> :key NOT LIKE :value")).params( + key=metadata_name, value=f"%{value}%" + ) + ) case "start with": - filters.append(DatasetDocument.doc_metadata[metadata_name].like(f'"{value}%"')) + filters.append( + (text("documents.doc_metadata ->> :key LIKE :value")).params(key=metadata_name, value=f"{value}%") + ) case "end with": - filters.append(DatasetDocument.doc_metadata[metadata_name].like(f'"%{value}"')) + filters.append( + (text("documents.doc_metadata ->> :key LIKE :value")).params(key=metadata_name, value=f"%{value}") + ) case "is" | "=": if isinstance(value, str): filters.append(DatasetDocument.doc_metadata[metadata_name] == f'"{value}"') @@ -955,9 +965,9 @@ class DatasetRetrieval: filters.append( sqlalchemy_cast(DatasetDocument.doc_metadata[metadata_name].astext, Integer) != value ) - case "is empty": + case "empty": filters.append(DatasetDocument.doc_metadata[metadata_name].is_(None)) - case "is not empty": + case "not empty": filters.append(DatasetDocument.doc_metadata[metadata_name].isnot(None)) case "before" | "<": filters.append(sqlalchemy_cast(DatasetDocument.doc_metadata[metadata_name].astext, Integer) < value) diff --git a/api/core/workflow/nodes/knowledge_retrieval/knowledge_retrieval_node.py 
b/api/core/workflow/nodes/knowledge_retrieval/knowledge_retrieval_node.py index a99a8d6a22..7d3ba0d7ce 100644 --- a/api/core/workflow/nodes/knowledge_retrieval/knowledge_retrieval_node.py +++ b/api/core/workflow/nodes/knowledge_retrieval/knowledge_retrieval_node.py @@ -4,7 +4,7 @@ from collections import defaultdict from collections.abc import Mapping, Sequence from typing import Any, Optional, cast -from sqlalchemy import and_, func, or_ +from sqlalchemy import and_, func, or_, text from core.app.app_config.entities import DatasetRetrieveConfigEntity from core.app.entities.app_invoke_entities import ModelConfigWithCredentialsEntity @@ -293,13 +293,14 @@ class KnowledgeRetrievalNode(LLMNode): Document.enabled == True, Document.archived == False, ) + filters = [] + metadata_condition = None if node_data.metadata_filtering_mode == "disabled": return None, None elif node_data.metadata_filtering_mode == "automatic": automatic_metadata_filters = self._automatic_metadata_filter_func(dataset_ids, query, node_data) if automatic_metadata_filters: conditions = [] - filters = [] for filter in automatic_metadata_filters: self._process_metadata_filter_func( filter.get("condition"), filter.get("metadata_name"), filter.get("value"), filters @@ -318,10 +319,9 @@ class KnowledgeRetrievalNode(LLMNode): elif node_data.metadata_filtering_mode == "manual": if node_data.metadata_filtering_conditions: for condition in node_data.metadata_filtering_conditions.conditions: - filters = [] metadata_name = condition.name expected_value = condition.value - if expected_value: + if expected_value or condition.comparison_operator in ("empty", "not empty"): if isinstance(expected_value, str): expected_value = self.graph_runtime_state.variable_pool.convert_template( expected_value @@ -407,16 +407,26 @@ class KnowledgeRetrievalNode(LLMNode): return [] return automatic_metadata_filters - def _process_metadata_filter_func(self, condition: str, metadata_name: str, value: str, filters: list): + def 
_process_metadata_filter_func(self, condition: str, metadata_name: str, value: Optional[str], filters: list): match condition: case "contains": - filters.append(Document.doc_metadata[metadata_name].like(f'"%{value}%"')) + filters.append( + (text("documents.doc_metadata ->> :key LIKE :value")).params(key=metadata_name, value=f"%{value}%") + ) case "not contains": - filters.append(Document.doc_metadata[metadata_name].notlike(f'"%{value}%"')) + filters.append( + (text("documents.doc_metadata ->> :key NOT LIKE :value")).params( + key=metadata_name, value=f"%{value}%" + ) + ) case "start with": - filters.append(Document.doc_metadata[metadata_name].like(f'"{value}%"')) + filters.append( + (text("documents.doc_metadata ->> :key LIKE :value")).params(key=metadata_name, value=f"{value}%") + ) case "end with": - filters.append(Document.doc_metadata[metadata_name].like(f'"%{value}"')) + filters.append( + (text("documents.doc_metadata ->> :key LIKE :value")).params(key=metadata_name, value=f"%{value}") + ) case "=" | "is": if isinstance(value, str): filters.append(Document.doc_metadata[metadata_name] == f'"{value}"') @@ -427,9 +437,9 @@ class KnowledgeRetrievalNode(LLMNode): filters.append(Document.doc_metadata[metadata_name] != f'"{value}"') else: filters.append(Document.doc_metadata[metadata_name] != value) - case "is empty": + case "empty": filters.append(Document.doc_metadata[metadata_name].is_(None)) - case "is not empty": + case "not empty": filters.append(Document.doc_metadata[metadata_name].isnot(None)) case "before" | "<": filters.append(Document.doc_metadata[metadata_name] < value)