diff --git a/.gitignore b/.gitignore index d0ef58e9..9cab8c0a 100644 --- a/.gitignore +++ b/.gitignore @@ -20,5 +20,4 @@ Cargo.lock *.trie .idea/ -.env .vscode/ diff --git a/api/apps/document_app.py b/api/apps/document_app.py index 686365cf..43d5d0b0 100644 --- a/api/apps/document_app.py +++ b/api/apps/document_app.py @@ -141,7 +141,7 @@ def list(): try: docs, tol = DocumentService.get_by_kb_id( kb_id, page_number, items_per_page, orderby, desc, keywords) - return get_json_result(data={"total":tol, "docs": docs}) + return get_json_result(data={"total": tol, "docs": docs}) except Exception as e: return server_error_response(e) @@ -217,7 +217,7 @@ def rm(): return get_data_error_result(retmsg="Tenant not found!") ELASTICSEARCH.deleteByQuery(Q("match", doc_id=doc.id), idxnm=search.index_name(tenant_id)) - DocumentService.increment_chunk_num(doc.id, doc.kb_id, doc.token_num*-1, doc.chunk_num*-1, 0) + DocumentService.increment_chunk_num(doc.id, doc.kb_id, doc.token_num * -1, doc.chunk_num * -1, 0) if not DocumentService.delete_by_id(req["doc_id"]): return get_data_error_result( retmsg="Database error (Document removal)!") @@ -241,7 +241,7 @@ def run(): info["chunk_num"] = 0 info["token_num"] = 0 DocumentService.update_by_id(id, info) - #if str(req["run"]) == TaskStatus.CANCEL.value: + # if str(req["run"]) == TaskStatus.CANCEL.value: tenant_id = DocumentService.get_tenant_id(id) if not tenant_id: return get_data_error_result(retmsg="Tenant not found!") @@ -281,7 +281,7 @@ def rename(): @manager.route('/get/', methods=['GET']) -#@login_required +# @login_required def get(doc_id): try: e, doc = DocumentService.get_by_id(doc_id) @@ -292,8 +292,9 @@ def get(doc_id): ext = re.search(r"\.([^.]+)$", doc.name) if ext: if doc.type == FileType.VISUAL.value: - response.headers.set('Content-Type', 'image/%s'%ext.group(1)) - else: response.headers.set('Content-Type', 'application/%s'%ext.group(1)) + response.headers.set('Content-Type', 'image/%s' % ext.group(1)) + else: + response.headers.set('Content-Type', 'application/%s' % ext.group(1)) return response except Exception as e: return server_error_response(e) @@ -314,11 +315,14 @@ def change_parser(): if doc.type == FileType.VISUAL or re.search(r"\.(ppt|pptx|pages)$", doc.name): return get_data_error_result(retmsg="Not supported yet!") - e = DocumentService.update_by_id(doc.id, {"parser_id": req["parser_id"], "progress":0, "progress_msg": "", "run": "0"}) + e = DocumentService.update_by_id(doc.id, + {"parser_id": req["parser_id"], "progress": 0, "progress_msg": "", "run": "0", + "token_num": 0, "chunk_num": 0, "process_duation": 0}) if not e: return get_data_error_result(retmsg="Document not found!") - if doc.token_num>0: - e = DocumentService.increment_chunk_num(doc.id, doc.kb_id, doc.token_num*-1, doc.chunk_num*-1, doc.process_duation*-1) + if doc.token_num > 0: + e = DocumentService.increment_chunk_num(doc.id, doc.kb_id, doc.token_num * -1, doc.chunk_num * -1, + doc.process_duation * -1) if not e: return get_data_error_result(retmsg="Document not found!") tenant_id = DocumentService.get_tenant_id(req["doc_id"]) @@ -332,7 +336,7 @@ def change_parser(): @manager.route('/image/', methods=['GET']) -#@login_required +# @login_required def get_image(image_id): try: bkt, nm = image_id.split("-") @@ -341,4 +345,3 @@ def get_image(image_id): return response except Exception as e: return server_error_response(e) - diff --git a/deepdoc/parser/pdf_parser.py b/deepdoc/parser/pdf_parser.py index 8e03d7d9..61d2ab0c 100644 --- a/deepdoc/parser/pdf_parser.py +++ b/deepdoc/parser/pdf_parser.py @@ -348,6 +348,9 @@ class HuParser: if b["page_number"] < b_["page_number"] and re.match(r"[0-9 •一—-]+$", b["text"]): bxs.pop(i) continue + if not b["text"].strip(): + bxs.pop(i) + continue concatting_feats = [ b["text"].strip()[-1] in ",;:'\",、‘“;:-", len(b["text"].strip()) > 1 and b["text"].strip()[-2] in ",;:'\",‘“、;:", @@ -856,7 +859,7 @@ class HuParser: pdf = fitz.open(fnm) if not binary else fitz.open(stream=fnm, filetype="pdf") return len(pdf) - def __images__(self, fnm, zoomin=3, page_from=0, page_to=299): + def __images__(self, fnm, zoomin=3, page_from=0, page_to=299, callback=None): self.lefted_chars = [] self.mean_height = [] self.mean_width = [] @@ -917,6 +920,7 @@ class HuParser: # self.page_cum_height.append( # np.max([c["bottom"] for c in chars])) self.__ocr(i + 1, img, chars, zoomin) + if callback: callback(prog=(i+1)*0.6/len(self.page_images), msg="") if not self.is_english and not any([c for c in self.page_chars]) and self.boxes: bxes = [b for bxs in self.boxes for b in bxs] diff --git a/docker/.env b/docker/.env index 09f0572b..e036ef0c 100644 --- a/docker/.env +++ b/docker/.env @@ -16,11 +16,13 @@ MEM_LIMIT=4073741824 MYSQL_PASSWORD=infini_rag_flow MYSQL_PORT=5455 -MINIO_USER=rag_flow +MINIO_USER=infiniflow MINIO_PASSWORD=infini_rag_flow SVR_HTTP_PORT=9380 +TIMEZONE='Asia/Shanghai' + ######## OS setup for ES ########### # sysctl vm.max_map_count # sudo sysctl -w vm.max_map_count=262144 diff --git a/docker/docker-compose.yml b/docker/docker-compose.yml index 92cbbd50..7d6705bf 100644 --- a/docker/docker-compose.yml +++ b/docker/docker-compose.yml @@ -14,6 +14,7 @@ services: - ELASTIC_PASSWORD=${ELASTIC_PASSWORD} - bootstrap.memory_lock=false - xpack.security.enabled=false + - TZ=${TIMEZONE} mem_limit: ${MEM_LIMIT} ulimits: memlock: @@ -41,6 +42,7 @@ services: environment: - SERVERNAME=kibana - ELASTICSEARCH_HOSTS=http://es01:9200 + - TZ=${TIMEZONE} mem_limit: ${MEM_LIMIT} networks: - ragflow @@ -50,7 +52,7 @@ services: container_name: ragflow-mysql environment: - MYSQL_ROOT_PASSWORD=${MYSQL_PASSWORD} - - TZ="Asia/Shanghai" + - TZ=${TIMEZONE} command: --max_connections=1000 --character-set-server=utf8mb4 @@ -83,6 +85,7 @@ services: environment: - MINIO_ROOT_USER=${MINIO_USER} - MINIO_ROOT_PASSWORD=${MINIO_PASSWORD} + - TZ=${TIMEZONE} volumes: - minio_data:/data networks: @@ -108,6 +111,8 @@ services: - ./nginx/ragflow.conf:/etc/nginx/conf.d/ragflow.conf - ./nginx/proxy.conf:/etc/nginx/proxy.conf - ./nginx/nginx.conf:/etc/nginx/nginx.conf + environment: + - TZ=${TIMEZONE} networks: - ragflow restart: always diff --git a/rag/app/book.py b/rag/app/book.py index df962cbc..dd31f685 100644 --- a/rag/app/book.py +++ b/rag/app/book.py @@ -26,26 +26,27 @@ class Pdf(PdfParser): filename if not binary else binary, zoomin, from_page, - to_page) - callback(0.1, "OCR finished") + to_page, + callback) + callback("OCR finished") from timeit import default_timer as timer start = timer() self._layouts_rec(zoomin) - callback(0.47, "Layout analysis finished") + callback(0.67, "Layout analysis finished") print("paddle layouts:", timer() - start) self._table_transformer_job(zoomin) callback(0.68, "Table analysis finished") self._text_merge() - self._concat_downward(concat_between_pages=False) + tbls = self._extract_table_figure(True, zoomin, True, True) + self._naive_vertical_merge() self._filter_forpages() self._merge_with_same_bullet() callback(0.75, "Text merging finished.") - tbls = self._extract_table_figure(True, zoomin, True, True) callback(0.8, "Text extraction finished") - return [(b["text"] + self._line_tag(b, zoomin), b.get("layoutno","")) for b in self.boxes], tbls, tbl_poss + return [(b["text"] + self._line_tag(b, zoomin), b.get("layoutno","")) for b in self.boxes], tbls def chunk(filename, binary=None, from_page=0, to_page=100000, lang="Chinese", callback=None, **kwargs): @@ -92,7 +93,7 @@ def chunk(filename, binary=None, from_page=0, to_page=100000, lang="Chinese", ca bull = bullets_category([t for t in random_choices([t for t,_ in sections], k=100)]) if bull >= 0: cks = hierarchical_merge(bull, sections, 3) else: - sections = [s.split("@") for s in sections] + sections = [s.split("@") for s,_ in sections] sections = [(pr[0], "@"+pr[1]) for pr in sections if len(pr)==2] cks = naive_merge(sections, kwargs.get("chunk_token_num", 256), kwargs.get("delimer", "\n。;!?")) @@ -116,6 +117,6 @@ def chunk(filename, binary=None, from_page=0, to_page=100000, lang="Chinese", ca if __name__ == "__main__": import sys - def dummy(a, b): + def dummy(prog=None, msg=""): pass chunk(sys.argv[1], from_page=1, to_page=10, callback=dummy) diff --git a/rag/app/laws.py b/rag/app/laws.py index 256c02a4..297970a3 100644 --- a/rag/app/laws.py +++ b/rag/app/laws.py @@ -54,13 +54,15 @@ class Pdf(PdfParser): filename if not binary else binary, zoomin, from_page, - to_page) - callback(0.1, "OCR finished") + to_page, + callback + ) + callback("OCR finished") from timeit import default_timer as timer start = timer() self._layouts_rec(zoomin) - callback(0.77, "Layout analysis finished") + callback(0.67, "Layout analysis finished") cron_logger.info("paddle layouts:".format((timer()-start)/(self.total_page+0.1))) self._naive_vertical_merge() diff --git a/rag/app/manual.py b/rag/app/manual.py index 68b3faf6..75af024c 100644 --- a/rag/app/manual.py +++ b/rag/app/manual.py @@ -19,20 +19,22 @@ class Pdf(PdfParser): filename if not binary else binary, zoomin, from_page, - to_page) - callback(0.2, "OCR finished.") + to_page, + callback + ) + callback("OCR finished.") from timeit import default_timer as timer start = timer() self._layouts_rec(zoomin) - callback(0.5, "Layout analysis finished.") + callback(0.65, "Layout analysis finished.") print("paddle layouts:", timer() - start) self._table_transformer_job(zoomin) - callback(0.7, "Table analysis finished.") + callback(0.67, "Table analysis finished.") self._text_merge() self._concat_downward(concat_between_pages=False) self._filter_forpages() - callback(0.77, "Text merging finished") + callback(0.68, "Text merging finished") tbls = self._extract_table_figure(True, zoomin, True, True) # clean mess diff --git a/rag/app/naive.py b/rag/app/naive.py index 72f53a9d..3b83c53f 100644 --- a/rag/app/naive.py +++ b/rag/app/naive.py @@ -26,24 +26,24 @@ class Pdf(PdfParser): filename if not binary else binary, zoomin, from_page, - to_page) - callback(0.1, "OCR finished") + to_page, + callback + ) + callback("OCR finished") from timeit import default_timer as timer start = timer() self._layouts_rec(zoomin) - callback(0.5, "Layout analysis finished.") + callback(0.63, "Layout analysis finished.") print("paddle layouts:", timer() - start) self._table_transformer_job(zoomin) - callback(0.7, "Table analysis finished.") + callback(0.65, "Table analysis finished.") self._text_merge() - self._concat_downward(concat_between_pages=False) - self._filter_forpages() - callback(0.77, "Text merging finished") + callback(0.67, "Text merging finished") tbls = self._extract_table_figure(True, zoomin, True, True) + self._naive_vertical_merge() cron_logger.info("paddle layouts:".format((timer() - start) / (self.total_page + 0.1))) - #self._naive_vertical_merge() return [(b["text"], self._line_tag(b, zoomin)) for b in self.boxes], tbls diff --git a/rag/app/paper.py b/rag/app/paper.py index 8738ddb2..a9d1afcc 100644 --- a/rag/app/paper.py +++ b/rag/app/paper.py @@ -33,13 +33,15 @@ class Pdf(PdfParser): filename if not binary else binary, zoomin, from_page, - to_page) - callback(0.2, "OCR finished.") + to_page, + callback + ) + callback("OCR finished.") from timeit import default_timer as timer start = timer() self._layouts_rec(zoomin) - callback(0.47, "Layout analysis finished") + callback(0.63, "Layout analysis finished") print("paddle layouts:", timer() - start) self._table_transformer_job(zoomin) callback(0.68, "Table analysis finished") diff --git a/rag/app/presentation.py b/rag/app/presentation.py index dd18e720..002dc252 100644 --- a/rag/app/presentation.py +++ b/rag/app/presentation.py @@ -49,7 +49,7 @@ class Pdf(PdfParser): def __call__(self, filename, binary=None, from_page=0, to_page=100000, zoomin=3, callback=None): callback(msg="OCR is running...") - self.__images__(filename if not binary else binary, zoomin, from_page, to_page) + self.__images__(filename if not binary else binary, zoomin, from_page, to_page, callback) callback(0.8, "Page {}~{}: OCR finished".format(from_page, min(to_page, self.total_page))) assert len(self.boxes) == len(self.page_images), "{} vs. {}".format(len(self.boxes), len(self.page_images)) res = [] diff --git a/rag/llm/embedding_model.py b/rag/llm/embedding_model.py index cd525866..7ce0d877 100644 --- a/rag/llm/embedding_model.py +++ b/rag/llm/embedding_model.py @@ -56,6 +56,7 @@ class HuEmbedding(Base): def encode(self, texts: list, batch_size=32): + texts = [t[:2000] for t in texts] token_count = 0 for t in texts: token_count += num_tokens_from_string(t) res = [] diff --git a/rag/nlp/__init__.py b/rag/nlp/__init__.py index f2c9f436..b3738f7f 100644 --- a/rag/nlp/__init__.py +++ b/rag/nlp/__init__.py @@ -114,6 +114,7 @@ def add_positions(d, poss): d["page_num_int"].append(pn+1) d["top_int"].append(top) d["position_int"].append((pn+1, left, right, top, bottom)) + d["top_int"] = d["top_int"][:1] def remove_contents_table(sections, eng=False): @@ -172,7 +173,7 @@ def hierarchical_merge(bull, sections, depth): def not_title(txt): if re.match(r"第[零一二三四五六七八九十百0-9]+条", txt): return False - if len(txt) >= 128: return True + if len(txt.split(" "))>12 or (txt.find(" ")<0 and len(txt)) >= 32: return True return re.search(r"[,;,。;!!]", txt) for i, (txt, layout) in enumerate(sections): @@ -181,12 +182,12 @@ def hierarchical_merge(bull, sections, depth): levels[j].append(i) break else: - if re.search(r"(title|head)", layout): + if re.search(r"(title|head)", layout) and not not_title(txt): levels[bullets_size].append(i) else: levels[bullets_size + 1].append(i) sections = [t for t, _ in sections] - for s in sections: print("--", s) + #for s in sections: print("--", s) def binary_search(arr, target): if not arr: return -1 @@ -220,11 +221,29 @@ def hierarchical_merge(bull, sections, depth): if jj > cks[-1][-1]: cks[-1].pop(-1) cks[-1].append(levels[ii][jj]) for ii in cks[-1]: readed[ii] = True + + if not cks:return cks + for i in range(len(cks)): cks[i] = [sections[j] for j in cks[i][::-1]] print("--------------\n", "\n* ".join(cks[i])) - return cks + res = [[]] + num = [0] + for ck in cks: + if len(ck) == 1: + n = num_tokens_from_string(re.sub(r"@@[0-9]+.*", "", ck[0])) + if n + num[-1] < 218: + res[-1].append(ck[0]) + num[-1] += n + continue + res.append(ck) + num.append(n) + continue + res.append(ck) + num.append(218) + + return res def naive_merge(sections, chunk_token_num=128, delimiter="\n。;!?"): diff --git a/rag/svr/task_broker.py b/rag/svr/task_broker.py index 8b2648e7..38a2b44d 100644 --- a/rag/svr/task_broker.py +++ b/rag/svr/task_broker.py @@ -46,7 +46,7 @@ def collect(tm): def set_dispatching(docid): try: DocumentService.update_by_id( - docid, {"progress": random.randint(0, 3) / 100., + docid, {"progress": random.random()*1 / 100., "progress_msg": "Task dispatched...", "process_begin_at": get_format_time() }) diff --git a/rag/svr/task_executor.py b/rag/svr/task_executor.py index 67cc0fb8..457808c7 100644 --- a/rag/svr/task_executor.py +++ b/rag/svr/task_executor.py @@ -72,7 +72,8 @@ def set_progress(task_id, from_page=0, to_page=-1, prog = -1 if to_page > 0: - msg = f"Page({from_page}~{to_page}): " + msg + if msg: + msg = f"Page({from_page}~{to_page}): " + msg d = {"progress_msg": msg} if prog is not None: d["progress"] = prog @@ -168,7 +169,7 @@ def init_kb(row): open(os.path.join(get_project_base_directory(), "conf", "mapping.json"), "r"))) -def embedding(docs, mdl, parser_config={}): +def embedding(docs, mdl, parser_config={}, callback=None): tts, cnts = [rmSpace(d["title_tks"]) for d in docs if d.get("title_tks")], [ d["content_with_weight"] for d in docs] tk_count = 0 @@ -176,8 +177,14 @@ def embedding(docs, mdl, parser_config={}): tts, c = mdl.encode(tts) tk_count += c - cnts, c = mdl.encode(cnts) - tk_count += c + cnts_ = [] + for i in range(0, len(cnts), 32): + vts, c = mdl.encode(cnts[i: i+32]) + cnts_.extend(vts) + tk_count += c + callback(msg="") + cnts = cnts_ + title_w = float(parser_config.get("filename_embd_weight", 0.1)) vects = (title_w * tts + (1 - title_w) * cnts) if len(tts) == len(cnts) else cnts @@ -218,10 +225,11 @@ def main(comm, mod): # TODO: exception handler ## set_progress(r["did"], -1, "ERROR: ") try: - tk_count = embedding(cks, embd_mdl, r["parser_config"]) + tk_count = embedding(cks, embd_mdl, r["parser_config"], callback) except Exception as e: callback(-1, "Embedding error:{}".format(str(e))) cron_logger.error(str(e)) + tk_count = 0 callback(msg="Finished embedding! Start to build index!") init_kb(r)