fix bug about fetching knowledge graph (#3394)
### What problem does this PR solve?

### Type of change

- [x] Bug Fix (non-breaking change which fixes an issue)
parent 400fc3f5e9
commit 4caf932808
```diff
@@ -301,16 +301,13 @@ def retrieval_test():
 @login_required
 def knowledge_graph():
     doc_id = request.args["doc_id"]
-    e, doc = DocumentService.get_by_id(doc_id)
-    if not e:
-        return get_data_error_result(message="Document not found!")
     tenant_id = DocumentService.get_tenant_id(doc_id)
     kb_ids = KnowledgebaseService.get_kb_ids(tenant_id)
     req = {
         "doc_ids":[doc_id],
         "knowledge_graph_kwd": ["graph", "mind_map"]
     }
-    sres = retrievaler.search(req, search.index_name(tenant_id), kb_ids, doc.kb_id)
+    sres = retrievaler.search(req, search.index_name(tenant_id), kb_ids)
     obj = {"graph": {}, "mind_map": {}}
     for id in sres.ids[:2]:
         ty = sres.field[id]["knowledge_graph_kwd"]
```
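The crux of the fix: the retriever's `search()` takes the knowledge-base id list as its third positional argument, so the extra `doc.kb_id` in the old call was bound to the next parameter instead. A minimal sketch of the misbinding, with an assumed signature (not copied from RAGFlow):

```python
# Sketch only: an assumed search() signature to illustrate the misbinding.
def search(req, idx_names, kb_ids, emb_mdl=None, highlight=False):
    ...

req = {"doc_ids": ["d1"], "knowledge_graph_kwd": ["graph", "mind_map"]}

search(req, "ragflow_tenant1", ["kb1", "kb2"], "kb1")  # old call: "kb1" lands in emb_mdl
search(req, "ragflow_tenant1", ["kb1", "kb2"])         # fixed call
```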
```diff
@@ -524,7 +524,7 @@ def upload_and_parse():
 @manager.route('/parse', methods=['POST'])
 @login_required
 def parse():
-    url = request.json.get("url")
+    url = request.json.get("url") if request.json else ""
     if url:
         if not is_valid_url(url):
             return get_json_result(
@@ -537,7 +537,7 @@ def parse():
         options.add_argument('--disable-dev-shm-usage')
         driver = Chrome(options=options)
         driver.get(url)
-        sections = RAGFlowHtmlParser()("", binary=driver.page_source)
+        sections = RAGFlowHtmlParser().parser_txt(driver.page_source)
         return get_json_result(data="\n".join(sections))
 
     if 'file' not in request.files:
```
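The `request.json` guard matters because the same endpoint also accepts multipart file uploads, whose body is not JSON. A minimal sketch of the pattern, assuming the Flask behavior this code relies on:

```python
# Sketch of the guard. In the Flask semantics assumed here, request.json is
# None when the body is not JSON (e.g. a multipart upload to the same route),
# so .get() on it would raise AttributeError. Newer Flask versions raise on
# non-JSON bodies instead; request.get_json(silent=True) is the equivalent.
from flask import Flask, request

app = Flask(__name__)

@app.post("/parse")
def parse():
    url = request.json.get("url") if request.json else ""
    if url:
        return {"mode": "crawl", "url": url}
    if "file" in request.files:
        return {"mode": "upload", "count": len(request.files.getlist("file"))}
    return {"error": "expected a url or an uploaded file"}, 400
```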
```diff
@@ -15,6 +15,8 @@
 #
 import re
+import os
+from concurrent.futures import ThreadPoolExecutor
 
 from flask_login import current_user
 from peewee import fn
 
```
```diff
@@ -385,6 +387,41 @@ class FileService(CommonService):
 
         return err, files
 
+    @staticmethod
+    def parse_docs(file_objs, user_id):
+        from rag.app import presentation, picture, naive, audio, email
+
+        def dummy(prog=None, msg=""):
+            pass
+
+        FACTORY = {
+            ParserType.PRESENTATION.value: presentation,
+            ParserType.PICTURE.value: picture,
+            ParserType.AUDIO.value: audio,
+            ParserType.EMAIL.value: email
+        }
+        parser_config = {"chunk_token_num": 16096, "delimiter": "\n!?;。;!?", "layout_recognize": False}
+        exe = ThreadPoolExecutor(max_workers=12)
+        threads = []
+        for file in file_objs:
+            kwargs = {
+                "lang": "English",
+                "callback": dummy,
+                "parser_config": parser_config,
+                "from_page": 0,
+                "to_page": 100000,
+                "tenant_id": user_id
+            }
+            filetype = filename_type(file.filename)
+            blob = file.read()
+            threads.append(exe.submit(FACTORY.get(FileService.get_parser(filetype, file.filename, ""), naive).chunk, file.filename, blob, **kwargs))
+
+        res = []
+        for th in threads:
+            res.append("\n".join([ck["content_with_weight"] for ck in th.result()]))
+
+        return "\n\n".join(res)
+
     @staticmethod
     def get_parser(doc_type, filename, default):
         if doc_type == FileType.VISUAL:
```
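The new `parse_docs` picks a chunker per file type via `FileService.get_parser` (falling back to `naive`), fans the files out on a thread pool, and joins the chunk texts. A standalone sketch of that fan-out pattern, with a stand-in chunker:

```python
# A runnable sketch of the fan-out used by parse_docs: submit one chunking
# job per file to a pool, then join results in submission order. chunk() is
# a stand-in, not RAGFlow's chunker.
from concurrent.futures import ThreadPoolExecutor

def chunk(name: str, blob: bytes) -> list[dict]:
    return [{"content_with_weight": f"{name}: {len(blob)} bytes"}]

files = [("a.txt", b"hello"), ("b.txt", b"world!")]
with ThreadPoolExecutor(max_workers=12) as exe:
    threads = [exe.submit(chunk, name, blob) for name, blob in files]
    res = ["\n".join(ck["content_with_weight"] for ck in t.result()) for t in threads]
print("\n\n".join(res))
```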
```diff
@@ -73,7 +73,7 @@ class KnowledgebaseService(CommonService):
             cls.model.id,
         ]
         kbs = cls.model.select(*fields).where(cls.model.tenant_id == tenant_id)
-        kb_ids = [kb["id"] for kb in kbs]
+        kb_ids = [kb.id for kb in kbs]
         return kb_ids
 
     @classmethod
```
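This fix is a peewee detail: a plain `select()` yields model instances, which expose columns as attributes, while subscripting like `kb["id"]` only works on a `.dicts()` query. A self-contained sketch with a toy model (not RAGFlow's):

```python
# Toy peewee model to show attribute access vs. dict access on query rows.
from peewee import Model, CharField, SqliteDatabase

db = SqliteDatabase(":memory:")

class Kb(Model):
    name = CharField()
    class Meta:
        database = db

db.create_tables([Kb])
Kb.create(name="kb1")

ids = [kb.id for kb in Kb.select(Kb.id)]               # works: model instances
rows = [kb["id"] for kb in Kb.select(Kb.id).dicts()]   # subscripting needs .dicts()
```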
```diff
@@ -10,6 +10,8 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 #
+import re
+
 from deepdoc.parser.utils import get_text
 from rag.nlp import num_tokens_from_string
 
```
```diff
@@ -29,8 +31,6 @@ class RAGFlowTxtParser:
         def add_chunk(t):
             nonlocal cks, tk_nums, delimiter
             tnum = num_tokens_from_string(t)
-            if tnum < 8:
-                pos = ""
             if tk_nums[-1] > chunk_token_num:
                 cks.append(t)
                 tk_nums.append(tnum)
```
```diff
@@ -38,15 +38,19 @@ class RAGFlowTxtParser:
                 cks[-1] += t
                 tk_nums[-1] += tnum
 
-        s, e = 0, 1
-        while e < len(txt):
-            if txt[e] in delimiter:
-                add_chunk(txt[s: e + 1])
-                s = e + 1
-                e = s + 1
-            else:
-                e += 1
-        if s < e:
-            add_chunk(txt[s: e + 1])
+        dels = []
+        s = 0
+        for m in re.finditer(r"`([^`]+)`", delimiter, re.I):
+            f, t = m.span()
+            dels.append(m.group(1))
+            dels.extend(list(delimiter[s: f]))
+            s = t
+        if s < len(delimiter):
+            dels.extend(list(delimiter[s:]))
+        dels = [re.escape(d) for d in dels if d]
+        dels = [d for d in dels if d]
+        dels = "|".join(dels)
+        secs = re.split(r"(%s)" % dels, txt)
+        for sec in secs: add_chunk(sec)
 
-        return [[c,""] for c in cks]
+        return [[c, ""] for c in cks]
```
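The rewritten splitter builds one regex out of the configured delimiters instead of scanning character by character; multi-character delimiters can be written between backticks. A standalone sketch of the intended behavior, with a toy delimiter string:

```python
# Sketch of the delimiter handling above, with made-up inputs: "##" is one
# multi-character delimiter (backtick-quoted); \n, ! and ? are single-char.
import re

delimiter = "`##`\n!?"
dels, s = [], 0
for m in re.finditer(r"`([^`]+)`", delimiter):
    f, t = m.span()
    dels.append(m.group(1))          # the quoted multi-char delimiter
    dels.extend(list(delimiter[s:f]))
    s = t
if s < len(delimiter):
    dels.extend(list(delimiter[s:]))  # remaining single-char delimiters
pattern = "|".join(re.escape(d) for d in dels if d)
print(re.split(r"(%s)" % pattern, "a##b!c"))  # ['a', '##', 'b', '!', 'c']
```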
```diff
@@ -13,7 +13,8 @@ from rag import settings
 from rag.utils import singleton
 from api.utils.file_utils import get_project_base_directory
 import polars as pl
-from rag.utils.doc_store_conn import DocStoreConnection, MatchExpr, OrderByExpr, MatchTextExpr, MatchDenseExpr, FusionExpr
+from rag.utils.doc_store_conn import DocStoreConnection, MatchExpr, OrderByExpr, MatchTextExpr, MatchDenseExpr, \
+    FusionExpr
 from rag.nlp import is_english, rag_tokenizer
 
@@ -26,7 +27,8 @@ class ESConnection(DocStoreConnection):
         try:
             self.es = Elasticsearch(
                 settings.ES["hosts"].split(","),
-                basic_auth=(settings.ES["username"], settings.ES["password"]) if "username" in settings.ES and "password" in settings.ES else None,
+                basic_auth=(settings.ES["username"], settings.ES[
+                    "password"]) if "username" in settings.ES and "password" in settings.ES else None,
                 verify_certs=False,
                 timeout=600
             )
@@ -57,6 +59,7 @@ class ESConnection(DocStoreConnection):
     """
     Database operations
     """
+
     def dbType(self) -> str:
         return "elasticsearch"
 
@@ -66,6 +69,7 @@ class ESConnection(DocStoreConnection):
     """
     Table operations
     """
+
     def createIdx(self, indexName: str, knowledgebaseId: str, vectorSize: int):
         if self.indexExist(indexName, knowledgebaseId):
             return True
@@ -97,7 +101,10 @@ class ESConnection(DocStoreConnection):
     """
     CRUD operations
     """
-    def search(self, selectFields: list[str], highlightFields: list[str], condition: dict, matchExprs: list[MatchExpr], orderBy: OrderByExpr, offset: int, limit: int, indexNames: str|list[str], knowledgebaseIds: list[str]) -> list[dict] | pl.DataFrame:
+
+    def search(self, selectFields: list[str], highlightFields: list[str], condition: dict, matchExprs: list[MatchExpr],
+               orderBy: OrderByExpr, offset: int, limit: int, indexNames: str | list[str],
+               knowledgebaseIds: list[str]) -> list[dict] | pl.DataFrame:
         """
         Refers to https://www.elastic.co/guide/en/elasticsearch/reference/current/query-dsl.html
         """
@@ -109,8 +116,10 @@ class ESConnection(DocStoreConnection):
         bqry = None
         vector_similarity_weight = 0.5
         for m in matchExprs:
-            if isinstance(m, FusionExpr) and m.method=="weighted_sum" and "weights" in m.fusion_params:
-                assert len(matchExprs)==3 and isinstance(matchExprs[0], MatchTextExpr) and isinstance(matchExprs[1], MatchDenseExpr) and isinstance(matchExprs[2], FusionExpr)
+            if isinstance(m, FusionExpr) and m.method == "weighted_sum" and "weights" in m.fusion_params:
+                assert len(matchExprs) == 3 and isinstance(matchExprs[0], MatchTextExpr) and isinstance(matchExprs[1],
+                                                                                                        MatchDenseExpr) and isinstance(
+                    matchExprs[2], FusionExpr)
                 weights = m.fusion_params["weights"]
                 vector_similarity_weight = float(weights.split(",")[1])
         for m in matchExprs:
@@ -119,36 +128,41 @@ class ESConnection(DocStoreConnection):
                 if "minimum_should_match" in m.extra_options:
                     minimum_should_match = str(int(m.extra_options["minimum_should_match"] * 100)) + "%"
                 bqry = Q("bool",
-                    must=Q("query_string", fields=m.fields,
+                         must=Q("query_string", fields=m.fields,
                                 type="best_fields", query=m.matching_text,
-                                minimum_should_match = minimum_should_match,
+                                minimum_should_match=minimum_should_match,
                                 boost=1),
-                    boost = 1.0 - vector_similarity_weight,
-                    )
-                if condition:
-                    for k, v in condition.items():
-                        if not isinstance(k, str) or not v:
-                            continue
-                        if isinstance(v, list):
-                            bqry.filter.append(Q("terms", **{k: v}))
-                        elif isinstance(v, str) or isinstance(v, int):
-                            bqry.filter.append(Q("term", **{k: v}))
-                        else:
-                            raise Exception(f"Condition `{str(k)}={str(v)}` value type is {str(type(v))}, expected to be int, str or list.")
+                         boost=1.0 - vector_similarity_weight,
+                         )
             elif isinstance(m, MatchDenseExpr):
-                assert(bqry is not None)
+                assert (bqry is not None)
                 similarity = 0.0
                 if "similarity" in m.extra_options:
                     similarity = m.extra_options["similarity"]
                 s = s.knn(m.vector_column_name,
-                    m.topn,
-                    m.topn * 2,
-                    query_vector = list(m.embedding_data),
-                    filter = bqry.to_dict(),
-                    similarity = similarity,
-                    )
-        if matchExprs:
-            s.query = bqry
+                          m.topn,
+                          m.topn * 2,
+                          query_vector=list(m.embedding_data),
+                          filter=bqry.to_dict(),
+                          similarity=similarity,
+                          )
+
+        if condition:
+            if not bqry:
+                bqry = Q("bool", must=[])
+            for k, v in condition.items():
+                if not isinstance(k, str) or not v:
+                    continue
+                if isinstance(v, list):
+                    bqry.filter.append(Q("terms", **{k: v}))
+                elif isinstance(v, str) or isinstance(v, int):
+                    bqry.filter.append(Q("term", **{k: v}))
+                else:
+                    raise Exception(
+                        f"Condition `{str(k)}={str(v)}` value type is {str(type(v))}, expected to be int, str or list.")
+
+        if bqry:
+            s = s.query(bqry)
 
         for field in highlightFields:
             s = s.highlight(field)
```
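Beyond the reformatting, the behaviorally relevant change in `search()` is that condition filters now build a bool query even when no full-text match expression is present, which is exactly the knowledge-graph case (`doc_ids` plus `knowledge_graph_kwd`, no query text), and the query is attached with `s = s.query(bqry)` rather than assigning `s.query`. A reduced sketch of that control flow with elasticsearch-dsl:

```python
# Reduced sketch of the new filter handling: a filter-only request still
# produces a bool query. Inputs are illustrative, not RAGFlow's full logic.
from elasticsearch_dsl import Q, Search

def build(condition: dict, bqry=None):
    if condition:
        if not bqry:                      # no text/vector match: start a bare bool
            bqry = Q("bool", must=[])
        for k, v in condition.items():
            if isinstance(v, list):
                bqry.filter.append(Q("terms", **{k: v}))
            else:
                bqry.filter.append(Q("term", **{k: v}))
    s = Search()
    if bqry:
        s = s.query(bqry)
    return s.to_dict()

print(build({"doc_ids": ["d1"], "knowledge_graph_kwd": ["graph", "mind_map"]}))
```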
```diff
@@ -157,12 +171,13 @@ class ESConnection(DocStoreConnection):
             for field, order in orderBy.fields:
                 order = "asc" if order == 0 else "desc"
                 orders.append({field: {"order": order, "unmapped_type": "float",
-                              "mode": "avg", "numeric_type": "double"}})
+                                       "mode": "avg", "numeric_type": "double"}})
             s = s.sort(*orders)
 
         if limit > 0:
             s = s[offset:limit]
         q = s.to_dict()
         print(json.dumps(q), flush=True)
+        # logger.info("ESConnection.search [Q]: " + json.dumps(q))
 
         for i in range(3):
@@ -189,7 +204,7 @@ class ESConnection(DocStoreConnection):
         for i in range(3):
             try:
                 res = self.es.get(index=(indexName),
-                        id=chunkId, source=True,)
+                                  id=chunkId, source=True, )
                 if str(res.get("timed_out", "")).lower() == "true":
                     raise Exception("Es Timeout.")
                 if not res.get("found"):
@@ -222,7 +237,7 @@ class ESConnection(DocStoreConnection):
         for _ in range(100):
             try:
                 r = self.es.bulk(index=(indexName), operations=operations,
-                        refresh=False, timeout="600s")
+                                 refresh=False, timeout="600s")
                 if re.search(r"False", str(r["errors"]), re.IGNORECASE):
                     return res
 
@@ -249,7 +264,8 @@ class ESConnection(DocStoreConnection):
                 self.es.update(index=indexName, id=chunkId, doc=doc)
                 return True
             except Exception as e:
-                logger.exception(f"ES failed to update(index={indexName}, id={id}, doc={json.dumps(condition, ensure_ascii=False)})")
+                logger.exception(
+                    f"ES failed to update(index={indexName}, id={id}, doc={json.dumps(condition, ensure_ascii=False)})")
                 if str(e).find("Timeout") > 0:
                     continue
             else:
@@ -263,7 +279,8 @@ class ESConnection(DocStoreConnection):
                 elif isinstance(v, str) or isinstance(v, int):
                     bqry.filter.append(Q("term", **{k: v}))
                 else:
-                    raise Exception(f"Condition `{str(k)}={str(v)}` value type is {str(type(v))}, expected to be int, str or list.")
+                    raise Exception(
+                        f"Condition `{str(k)}={str(v)}` value type is {str(type(v))}, expected to be int, str or list.")
             scripts = []
             for k, v in newValue.items():
                 if not isinstance(k, str) or not v:
@@ -273,7 +290,8 @@ class ESConnection(DocStoreConnection):
                 elif isinstance(v, int):
                     scripts.append(f"ctx._source.{k} = {v}")
                 else:
-                    raise Exception(f"newValue `{str(k)}={str(v)}` value type is {str(type(v))}, expected to be int, str.")
+                    raise Exception(
+                        f"newValue `{str(k)}={str(v)}` value type is {str(type(v))}, expected to be int, str.")
             ubq = UpdateByQuery(
                 index=indexName).using(
                 self.es).query(bqry)
@@ -313,7 +331,7 @@ class ESConnection(DocStoreConnection):
         try:
             res = self.es.delete_by_query(
                 index=indexName,
-                body = Search().query(qry).to_dict(),
+                body=Search().query(qry).to_dict(),
                 refresh=True)
             return res["deleted"]
         except Exception as e:
@@ -325,10 +343,10 @@ class ESConnection(DocStoreConnection):
                 return 0
         return 0
 
-
     """
     Helper functions for search result
     """
+
     def getTotal(self, res):
         if isinstance(res["hits"]["total"], type({})):
             return res["hits"]["total"]["value"]
@@ -376,12 +394,13 @@ class ESConnection(DocStoreConnection):
                 continue
 
             txt = d["_source"][fieldnm]
-            txt = re.sub(r"[\r\n]", " ", txt, flags=re.IGNORECASE|re.MULTILINE)
+            txt = re.sub(r"[\r\n]", " ", txt, flags=re.IGNORECASE | re.MULTILINE)
             txts = []
             for t in re.split(r"[.?!;\n]", txt):
                 for w in keywords:
-                    t = re.sub(r"(^|[ .?/'\"\(\)!,:;-])(%s)([ .?/'\"\(\)!,:;-])"%re.escape(w), r"\1<em>\2</em>\3", t, flags=re.IGNORECASE|re.MULTILINE)
-                if not re.search(r"<em>[^<>]+</em>", t, flags=re.IGNORECASE|re.MULTILINE):
+                    t = re.sub(r"(^|[ .?/'\"\(\)!,:;-])(%s)([ .?/'\"\(\)!,:;-])" % re.escape(w), r"\1<em>\2</em>\3", t,
+                               flags=re.IGNORECASE | re.MULTILINE)
+                if not re.search(r"<em>[^<>]+</em>", t, flags=re.IGNORECASE | re.MULTILINE):
                     continue
                 txts.append(t)
             ans[d["_id"]] = "...".join(txts) if txts else "...".join([a for a in list(hlts.items())[0][1]])
@@ -395,10 +414,10 @@ class ESConnection(DocStoreConnection):
         bkts = res["aggregations"][agg_field]["buckets"]
         return [(b["key"], b["doc_count"]) for b in bkts]
 
-
     """
     SQL
     """
+
     def sql(self, sql: str, fetch_size: int, format: str):
         logger.info(f"ESConnection.sql get sql: {sql}")
         sql = re.sub(r"[ `]+", " ", sql)
@@ -413,7 +432,7 @@ class ESConnection(DocStoreConnection):
                     r.group(1),
                     r.group(2),
                     r.group(3)),
-                match))
+                    match))
 
         for p, r in replaces:
             sql = sql.replace(p, r, 1)
@@ -421,7 +440,8 @@ class ESConnection(DocStoreConnection):
 
         for i in range(3):
             try:
-                res = self.es.sql.query(body={"query": sql, "fetch_size": fetch_size}, format=format, request_timeout="2s")
+                res = self.es.sql.query(body={"query": sql, "fetch_size": fetch_size}, format=format,
+                                        request_timeout="2s")
                 return res
             except ConnectionTimeout:
                 logger.exception("ESConnection.sql timeout [Q]: " + sql)
```