From bc701d7b4cc77a1d64b6278ff526ea14f946edf9 Mon Sep 17 00:00:00 2001 From: Zhichang Yu Date: Thu, 28 Nov 2024 13:00:38 +0800 Subject: [PATCH] Edit chunk shall update instead of insert it (#3709) ### What problem does this PR solve? Edit chunk shall update instead of insert it. Close #3679 ### Type of change - [x] Bug Fix (non-breaking change which fixes an issue) --- api/apps/chunk_app.py | 2 +- api/apps/kb_app.py | 4 +++- api/apps/user_app.py | 2 +- deepdoc/parser/docx_parser.py | 2 +- deepdoc/parser/pdf_parser.py | 10 +++++----- deepdoc/parser/resume/entities/corporations.py | 2 +- deepdoc/parser/resume/entities/schools.py | 2 +- deepdoc/parser/resume/step_one.py | 4 ++-- deepdoc/parser/resume/step_two.py | 6 +++--- deepdoc/vision/table_structure_recognizer.py | 2 +- rag/app/paper.py | 4 ++-- rag/app/picture.py | 2 +- rag/nlp/__init__.py | 6 +++--- rag/nlp/query.py | 10 +++++----- rag/nlp/rag_tokenizer.py | 6 +++--- rag/nlp/search.py | 18 +++++++++--------- rag/nlp/term_weight.py | 8 ++++---- rag/utils/es_conn.py | 5 ++++- rag/utils/infinity_conn.py | 2 +- 19 files changed, 51 insertions(+), 46 deletions(-) diff --git a/api/apps/chunk_app.py b/api/apps/chunk_app.py index 4606c8b1..0863df13 100644 --- a/api/apps/chunk_app.py +++ b/api/apps/chunk_app.py @@ -155,7 +155,7 @@ def set(): v, c = embd_mdl.encode([doc.name, req["content_with_weight"]]) v = 0.1 * v[0] + 0.9 * v[1] if doc.parser_id != ParserType.QA else v[1] d["q_%d_vec" % len(v)] = v.tolist() - settings.docStoreConn.insert([d], search.index_name(tenant_id), doc.kb_id) + settings.docStoreConn.update({"id": req["chunk_id"]}, d, search.index_name(tenant_id), doc.kb_id) return get_json_result(data=True) except Exception as e: return server_error_response(e) diff --git a/api/apps/kb_app.py b/api/apps/kb_app.py index ebac350f..1bb86a5d 100644 --- a/api/apps/kb_app.py +++ b/api/apps/kb_app.py @@ -168,7 +168,9 @@ def rm(): if not KnowledgebaseService.delete_by_id(req["kb_id"]): return get_data_error_result( message="Database error (Knowledgebase removal)!") - settings.docStoreConn.delete({"kb_id": req["kb_id"]}, search.index_name(kbs[0].tenant_id), req["kb_id"]) + for kb in kbs: + settings.docStoreConn.delete({"kb_id": kb.id}, search.index_name(kb.tenant_id), kb.id) + settings.docStoreConn.deleteIdx(search.index_name(kb.tenant_id), kb.id) return get_json_result(data=True) except Exception as e: return server_error_response(e) diff --git a/api/apps/user_app.py b/api/apps/user_app.py index 66cae415..cc050631 100644 --- a/api/apps/user_app.py +++ b/api/apps/user_app.py @@ -252,7 +252,7 @@ def feishu_callback(): if res["code"] != 0: return redirect("/?error=%s" % res["message"]) - if "contact:user.email:readonly" not in res["data"]["scope"].split(" "): + if "contact:user.email:readonly" not in res["data"]["scope"].split(): return redirect("/?error=contact:user.email:readonly not in scope") session["access_token"] = res["data"]["access_token"] session["access_token_from"] = "feishu" diff --git a/deepdoc/parser/docx_parser.py b/deepdoc/parser/docx_parser.py index 1c1c14d3..1910f438 100644 --- a/deepdoc/parser/docx_parser.py +++ b/deepdoc/parser/docx_parser.py @@ -47,7 +47,7 @@ class RAGFlowDocxParser: for p, n in patt: if re.search(p, b): return n - tks = [t for t in rag_tokenizer.tokenize(b).split(" ") if len(t) > 1] + tks = [t for t in rag_tokenizer.tokenize(b).split() if len(t) > 1] if len(tks) > 3: if len(tks) < 12: return "Tx" diff --git a/deepdoc/parser/pdf_parser.py b/deepdoc/parser/pdf_parser.py index 331d5da1..9c613946 100644 --- 
a/deepdoc/parser/pdf_parser.py +++ b/deepdoc/parser/pdf_parser.py @@ -108,13 +108,13 @@ class RAGFlowPdfParser: h = max(self.__height(up), self.__height(down)) y_dis = self._y_dis(up, down) LEN = 6 - tks_down = rag_tokenizer.tokenize(down["text"][:LEN]).split(" ") - tks_up = rag_tokenizer.tokenize(up["text"][-LEN:]).split(" ") + tks_down = rag_tokenizer.tokenize(down["text"][:LEN]).split() + tks_up = rag_tokenizer.tokenize(up["text"][-LEN:]).split() tks_all = up["text"][-LEN:].strip() \ + (" " if re.match(r"[a-zA-Z0-9]+", up["text"][-1] + down["text"][0]) else "") \ + down["text"][:LEN].strip() - tks_all = rag_tokenizer.tokenize(tks_all).split(" ") + tks_all = rag_tokenizer.tokenize(tks_all).split() fea = [ up.get("R", -1) == down.get("R", -1), y_dis / h, @@ -565,13 +565,13 @@ class RAGFlowPdfParser: if i >= len(self.boxes): break prefix = self.boxes[i]["text"].strip()[:3] if not eng else " ".join( - self.boxes[i]["text"].strip().split(" ")[:2]) + self.boxes[i]["text"].strip().split()[:2]) while not prefix: self.boxes.pop(i) if i >= len(self.boxes): break prefix = self.boxes[i]["text"].strip()[:3] if not eng else " ".join( - self.boxes[i]["text"].strip().split(" ")[:2]) + self.boxes[i]["text"].strip().split()[:2]) self.boxes.pop(i) if i >= len(self.boxes) or not prefix: break diff --git a/deepdoc/parser/resume/entities/corporations.py b/deepdoc/parser/resume/entities/corporations.py index c26f58ae..142b0f5e 100644 --- a/deepdoc/parser/resume/entities/corporations.py +++ b/deepdoc/parser/resume/entities/corporations.py @@ -47,7 +47,7 @@ def corpNorm(nm, add_region=True): nm = re.sub(r"(计算机|技术|(技术|科技|网络)*有限公司|公司|有限|研发中心|中国|总部)$", "", nm, 10000, re.IGNORECASE) if not nm or (len(nm)<5 and not regions.isName(nm[0:2])):return nm - tks = rag_tokenizer.tokenize(nm).split(" ") + tks = rag_tokenizer.tokenize(nm).split() reg = [t for i,t in enumerate(tks) if regions.isName(t) and (t != "中国" or i > 0)] nm = "" for t in tks: diff --git a/deepdoc/parser/resume/entities/schools.py b/deepdoc/parser/resume/entities/schools.py index 598d7ae8..31662cde 100644 --- a/deepdoc/parser/resume/entities/schools.py +++ b/deepdoc/parser/resume/entities/schools.py @@ -44,7 +44,7 @@ loadRank(os.path.join(current_file_path, "res/school.rank.csv")) def split(txt): tks = [] - for t in re.sub(r"[ \t]+", " ",txt).split(" "): + for t in re.sub(r"[ \t]+", " ",txt).split(): if tks and re.match(r".*[a-zA-Z]$", tks[-1]) and \ re.match(r"[a-zA-Z]", t) and tks: tks[-1] = tks[-1] + " " + t diff --git a/deepdoc/parser/resume/step_one.py b/deepdoc/parser/resume/step_one.py index 90e52e45..96cc668d 100644 --- a/deepdoc/parser/resume/step_one.py +++ b/deepdoc/parser/resume/step_one.py @@ -80,7 +80,7 @@ def refactor(df): def loadjson(line): try: return json.loads(line) - except Exception as e: + except Exception: pass return {} @@ -183,4 +183,4 @@ def refactor(df): "\r", "\\n")) # print(df.values.tolist()) - return dict(zip([n.split(" ")[0] for n in FIELDS], df.values.tolist()[0])) + return dict(zip([n.split()[0] for n in FIELDS], df.values.tolist()[0])) diff --git a/deepdoc/parser/resume/step_two.py b/deepdoc/parser/resume/step_two.py index afc5fb47..7d429777 100644 --- a/deepdoc/parser/resume/step_two.py +++ b/deepdoc/parser/resume/step_two.py @@ -100,7 +100,7 @@ def forEdu(cv): if n.get("school_name") and isinstance(n["school_name"], str): sch.append(re.sub(r"(211|985|重点大学|[,&;;-])", "", n["school_name"])) e["sch_nm_kwd"] = sch[-1] - fea.append(rag_tokenizer.fine_grained_tokenize(rag_tokenizer.tokenize(n.get("school_name", 
""))).split(" ")[-1]) + fea.append(rag_tokenizer.fine_grained_tokenize(rag_tokenizer.tokenize(n.get("school_name", ""))).split()[-1]) if n.get("discipline_name") and isinstance(n["discipline_name"], str): maj.append(n["discipline_name"]) @@ -485,7 +485,7 @@ def parse(cv): nm = re.sub(r"[\n——\-\((\+].*", "", cv["name"].strip()) nm = re.sub(r"[ \t ]+", " ", nm) if re.match(r"[a-zA-Z ]+$", nm): - if len(nm.split(" ")) > 1: + if len(nm.split()) > 1: cv["name"] = nm else: nm = "" @@ -503,7 +503,7 @@ def parse(cv): for py in PY.get_pinyins(nm[:20], ''): for i in range(2, len(py) + 1): cv["name_py_pref_tks"] += " " + py[:i] for py in PY.get_pinyins(nm[:20], ' '): - py = py.split(" ") + py = py.split() for i in range(1, len(py) + 1): cv["name_py_pref0_tks"] += " " + "".join(py[:i]) cv["name_kwd"] = name diff --git a/deepdoc/vision/table_structure_recognizer.py b/deepdoc/vision/table_structure_recognizer.py index 5759c0f6..be48ca95 100644 --- a/deepdoc/vision/table_structure_recognizer.py +++ b/deepdoc/vision/table_structure_recognizer.py @@ -117,7 +117,7 @@ class TableStructureRecognizer(Recognizer): for p, n in patt: if re.search(p, b["text"].strip()): return n - tks = [t for t in rag_tokenizer.tokenize(b["text"]).split(" ") if len(t) > 1] + tks = [t for t in rag_tokenizer.tokenize(b["text"]).split() if len(t) > 1] if len(tks) > 3: if len(tks) < 12: return "Tx" diff --git a/rag/app/paper.py b/rag/app/paper.py index 1be93be8..23483cc0 100644 --- a/rag/app/paper.py +++ b/rag/app/paper.py @@ -99,11 +99,11 @@ class Pdf(PdfParser): i += 1 txt = b["text"].lower().strip() if re.match("(abstract|摘要)", txt): - if len(txt.split(" ")) > 32 or len(txt) > 64: + if len(txt.split()) > 32 or len(txt) > 64: abstr = txt + self._line_tag(b, zoomin) break txt = self.boxes[i]["text"].lower().strip() - if len(txt.split(" ")) > 32 or len(txt) > 64: + if len(txt.split()) > 32 or len(txt) > 64: abstr = txt + self._line_tag(self.boxes[i], zoomin) i += 1 break diff --git a/rag/app/picture.py b/rag/app/picture.py index fa4862b2..8d5df521 100644 --- a/rag/app/picture.py +++ b/rag/app/picture.py @@ -33,7 +33,7 @@ def chunk(filename, binary, tenant_id, lang, callback=None, **kwargs): txt = "\n".join([t[0] for _, t in bxs if t[0]]) eng = lang.lower() == "english" callback(0.4, "Finish OCR: (%s ...)" % txt[:12]) - if (eng and len(txt.split(" ")) > 32) or len(txt) > 32: + if (eng and len(txt.split()) > 32) or len(txt) > 32: tokenize(doc, txt, eng) callback(0.8, "OCR results is too long to use CV LLM.") return [doc] diff --git a/rag/nlp/__init__.py b/rag/nlp/__init__.py index 27233874..41b89597 100644 --- a/rag/nlp/__init__.py +++ b/rag/nlp/__init__.py @@ -325,12 +325,12 @@ def remove_contents_table(sections, eng=False): sections.pop(i) if i >= len(sections): break - prefix = get(i)[:3] if not eng else " ".join(get(i).split(" ")[:2]) + prefix = get(i)[:3] if not eng else " ".join(get(i).split()[:2]) while not prefix: sections.pop(i) if i >= len(sections): break - prefix = get(i)[:3] if not eng else " ".join(get(i).split(" ")[:2]) + prefix = get(i)[:3] if not eng else " ".join(get(i).split()[:2]) sections.pop(i) if i >= len(sections) or not prefix: break @@ -389,7 +389,7 @@ def title_frequency(bull, sections): def not_title(txt): if re.match(r"第[零一二三四五六七八九十百0-9]+条", txt): return False - if len(txt.split(" ")) > 12 or (txt.find(" ") < 0 and len(txt) >= 32): + if len(txt.split()) > 12 or (txt.find(" ") < 0 and len(txt) >= 32): return True return re.search(r"[,;,。;!!]", txt) diff --git a/rag/nlp/query.py b/rag/nlp/query.py index 
63fed29b..9a0ceafa 100644 --- a/rag/nlp/query.py +++ b/rag/nlp/query.py @@ -74,7 +74,7 @@ class FulltextQueryer: if not self.isChinese(txt): txt = FulltextQueryer.rmWWW(txt) - tks = rag_tokenizer.tokenize(txt).split(" ") + tks = rag_tokenizer.tokenize(txt).split() keywords = [t for t in tks if t] tks_w = self.tw.weights(tks, preprocess=False) tks_w = [(re.sub(r"[ \\\"'^]", "", tk), w) for tk, w in tks_w] @@ -83,7 +83,7 @@ class FulltextQueryer: syns = [] for tk, w in tks_w: syn = self.syn.lookup(tk) - syn = rag_tokenizer.tokenize(" ".join(syn)).split(" ") + syn = rag_tokenizer.tokenize(" ".join(syn)).split() keywords.extend(syn) syn = ["\"{}\"^{:.4f}".format(s, w / 4.) for s in syn] syns.append(" ".join(syn)) @@ -114,7 +114,7 @@ class FulltextQueryer: txt = FulltextQueryer.rmWWW(txt) qs, keywords = [], [] - for tt in self.tw.split(txt)[:256]: # .split(" "): + for tt in self.tw.split(txt)[:256]: # .split(): if not tt: continue keywords.append(tt) @@ -125,7 +125,7 @@ class FulltextQueryer: tms = [] for tk, w in sorted(twts, key=lambda x: x[1] * -1): sm = ( - rag_tokenizer.fine_grained_tokenize(tk).split(" ") + rag_tokenizer.fine_grained_tokenize(tk).split() if need_fine_grained_tokenize(tk) else [] ) @@ -194,7 +194,7 @@ class FulltextQueryer: def toDict(tks): d = {} if isinstance(tks, str): - tks = tks.split(" ") + tks = tks.split() for t, c in self.tw.weights(tks, preprocess=False): if t not in d: d[t] = 0 diff --git a/rag/nlp/rag_tokenizer.py b/rag/nlp/rag_tokenizer.py index 75541f59..0815daae 100644 --- a/rag/nlp/rag_tokenizer.py +++ b/rag/nlp/rag_tokenizer.py @@ -192,7 +192,7 @@ class RagTokenizer: # if split chars is part of token res = [] - tks = re.sub(r"[ ]+", " ", tks).split(" ") + tks = re.sub(r"[ ]+", " ", tks).split() s = 0 while True: if s >= len(tks): @@ -329,7 +329,7 @@ class RagTokenizer: return self.merge_(res) def fine_grained_tokenize(self, tks): - tks = tks.split(" ") + tks = tks.split() zh_num = len([1 for c in tks if c and is_chinese(c[0])]) if zh_num < len(tks) * 0.2: res = [] @@ -393,7 +393,7 @@ def is_alphabet(s): def naiveQie(txt): tks = [] - for t in txt.split(" "): + for t in txt.split(): if tks and re.match(r".*[a-zA-Z]$", tks[-1] ) and re.match(r".*[a-zA-Z]$", t): tks.append(" ") diff --git a/rag/nlp/search.py b/rag/nlp/search.py index eb389bdd..154d5850 100644 --- a/rag/nlp/search.py +++ b/rag/nlp/search.py @@ -114,7 +114,7 @@ class Dealer: for k in keywords: kwds.add(k) - for kk in rag_tokenizer.fine_grained_tokenize(k).split(" "): + for kk in rag_tokenizer.fine_grained_tokenize(k).split(): if len(kk) < 2: continue if kk in kwds: @@ -186,7 +186,7 @@ class Dealer: assert len(ans_v[0]) == len(chunk_v[0]), "The dimension of query and chunk do not match: {} vs. 
{}".format( len(ans_v[0]), len(chunk_v[0])) - chunks_tks = [rag_tokenizer.tokenize(self.qryr.rmWWW(ck)).split(" ") + chunks_tks = [rag_tokenizer.tokenize(self.qryr.rmWWW(ck)).split() for ck in chunks] cites = {} thr = 0.63 @@ -195,7 +195,7 @@ class Dealer: sim, tksim, vtsim = self.qryr.hybrid_similarity(ans_v[i], chunk_v, rag_tokenizer.tokenize( - self.qryr.rmWWW(pieces_[i])).split(" "), + self.qryr.rmWWW(pieces_[i])).split(), chunks_tks, tkweight, vtweight) mx = np.max(sim) * 0.99 @@ -244,8 +244,8 @@ class Dealer: sres.field[i]["important_kwd"] = [sres.field[i]["important_kwd"]] ins_tw = [] for i in sres.ids: - content_ltks = sres.field[i][cfield].split(" ") - title_tks = [t for t in sres.field[i].get("title_tks", "").split(" ") if t] + content_ltks = sres.field[i][cfield].split() + title_tks = [t for t in sres.field[i].get("title_tks", "").split() if t] important_kwd = sres.field[i].get("important_kwd", []) tks = content_ltks + title_tks + important_kwd ins_tw.append(tks) @@ -265,8 +265,8 @@ class Dealer: sres.field[i]["important_kwd"] = [sres.field[i]["important_kwd"]] ins_tw = [] for i in sres.ids: - content_ltks = sres.field[i][cfield].split(" ") - title_tks = [t for t in sres.field[i].get("title_tks", "").split(" ") if t] + content_ltks = sres.field[i][cfield].split() + title_tks = [t for t in sres.field[i].get("title_tks", "").split() if t] important_kwd = sres.field[i].get("important_kwd", []) tks = content_ltks + title_tks + important_kwd ins_tw.append(tks) @@ -279,8 +279,8 @@ class Dealer: def hybrid_similarity(self, ans_embd, ins_embd, ans, inst): return self.qryr.hybrid_similarity(ans_embd, ins_embd, - rag_tokenizer.tokenize(ans).split(" "), - rag_tokenizer.tokenize(inst).split(" ")) + rag_tokenizer.tokenize(ans).split(), + rag_tokenizer.tokenize(inst).split()) def retrieval(self, question, embd_mdl, tenant_ids, kb_ids, page, page_size, similarity_threshold=0.2, vector_similarity_weight=0.3, top=1024, doc_ids=None, aggs=True, rerank_mdl=None, highlight=False): diff --git a/rag/nlp/term_weight.py b/rag/nlp/term_weight.py index 810f6b88..bbf446dd 100644 --- a/rag/nlp/term_weight.py +++ b/rag/nlp/term_weight.py @@ -99,7 +99,7 @@ class Dealer: txt = re.sub(p, r, txt) res = [] - for t in rag_tokenizer.tokenize(txt).split(" "): + for t in rag_tokenizer.tokenize(txt).split(): tk = t if (stpwd and tk in self.stop_words) or ( re.match(r"[0-9]$", tk) and not num): @@ -150,7 +150,7 @@ class Dealer: def split(self, txt): tks = [] - for t in re.sub(r"[ \t]+", " ", txt).split(" "): + for t in re.sub(r"[ \t]+", " ", txt).split(): if tks and re.match(r".*[a-zA-Z]$", tks[-1]) and \ re.match(r".*[a-zA-Z]$", t) and tks and \ self.ne.get(t, "") != "func" and self.ne.get(tks[-1], "") != "func": @@ -198,7 +198,7 @@ class Dealer: s = 0 if not s and len(t) >= 4: - s = [tt for tt in rag_tokenizer.fine_grained_tokenize(t).split(" ") if len(tt) > 1] + s = [tt for tt in rag_tokenizer.fine_grained_tokenize(t).split() if len(tt) > 1] if len(s) > 1: s = np.min([freq(tt) for tt in s]) / 6. else: @@ -214,7 +214,7 @@ class Dealer: elif re.match(r"[a-z. -]+$", t): return 300 elif len(t) >= 4: - s = [tt for tt in rag_tokenizer.fine_grained_tokenize(t).split(" ") if len(tt) > 1] + s = [tt for tt in rag_tokenizer.fine_grained_tokenize(t).split() if len(tt) > 1] if len(s) > 1: return max(3, np.min([df(tt) for tt in s]) / 6.) 
diff --git a/rag/utils/es_conn.py b/rag/utils/es_conn.py index e964a60f..09981fba 100644 --- a/rag/utils/es_conn.py +++ b/rag/utils/es_conn.py @@ -85,6 +85,9 @@ class ESConnection(DocStoreConnection): logging.exception("ESConnection.createIndex error %s" % (indexName)) def deleteIdx(self, indexName: str, knowledgebaseId: str): + if len(knowledgebaseId) > 0: + # The index need to be alive after any kb deletion since all kb under this tenant are in one index. + return try: self.es.indices.delete(index=indexName, allow_no_indices=True) except NotFoundError: @@ -400,7 +403,7 @@ class ESConnection(DocStoreConnection): if not hlts: continue txt = "...".join([a for a in list(hlts.items())[0][1]]) - if not is_english(txt.split(" ")): + if not is_english(txt.split()): ans[d["_id"]] = txt continue diff --git a/rag/utils/infinity_conn.py b/rag/utils/infinity_conn.py index 1c0ec8fb..699e279e 100644 --- a/rag/utils/infinity_conn.py +++ b/rag/utils/infinity_conn.py @@ -419,7 +419,7 @@ class InfinityConnection(DocStoreConnection): v = list(v) elif fieldnm == "important_kwd": assert isinstance(v, str) - v = v.split(" ") + v = v.split() else: if not isinstance(v, str): v = str(v)
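For readers skimming the diff, the behavioral core of the fix is the call-shape change in api/apps/chunk_app.py: an edited chunk is now written back with `docStoreConn.update(condition, fields, index_name, kb_id)` instead of `docStoreConn.insert([fields], index_name, kb_id)`. The sketch below is a hypothetical in-memory stand-in, not RAGFlow's actual DocStoreConnection; only the two method shapes are taken from the diff, and the toy's `insert()` simply appends so the difference in intent is visible. The real ES/Infinity backends differ in detail, but the point of the fix is the same: an edit must target the existing chunk by id rather than write a new document.

```python
# Hypothetical in-memory stand-in used only to illustrate insert-vs-update; not RAGFlow code.
class ToyDocStore:
    def __init__(self):
        self.rows = []  # each row: {"index": ..., "kb_id": ..., **chunk fields}

    def insert(self, documents, index_name, kb_id):
        # Old edit path (in this toy): append rows unconditionally.
        for d in documents:
            self.rows.append({"index": index_name, "kb_id": kb_id, **d})

    def update(self, condition, new_fields, index_name, kb_id):
        # New edit path: locate the existing chunk by id and overwrite its fields in place.
        for row in self.rows:
            if (row["index"], row["kb_id"], row["id"]) == (index_name, kb_id, condition["id"]):
                row.update(new_fields)
                return True
        return False


store = ToyDocStore()
store.insert([{"id": "chunk-1", "content_with_weight": "old text"}], "ragflow_tenant", "kb-1")

# Re-inserting on edit grows the store (stale copy plus edited copy) in this toy:
store.insert([{"id": "chunk-1", "content_with_weight": "new text"}], "ragflow_tenant", "kb-1")
assert len(store.rows) == 2

# Updating by id rewrites the existing row and never grows the row count:
store.rows = store.rows[:1]
store.update({"id": "chunk-1"}, {"content_with_weight": "new text"}, "ragflow_tenant", "kb-1")
assert len(store.rows) == 1 and store.rows[0]["content_with_weight"] == "new text"
```

The knowledge-base removal path changes in the same spirit: api/apps/kb_app.py now loops over every matched kb, deleting that kb's chunks and calling `deleteIdx` per kb, while `ESConnection.deleteIdx` returns early when a knowledgebaseId is given, because all knowledge bases of a tenant share one Elasticsearch index and that index must survive the deletion of any single kb.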