From 979b3a5b4b25c6969a5b19adcf43c8cc173b7f0d Mon Sep 17 00:00:00 2001
From: KevinHuSh
Date: Wed, 27 Mar 2024 09:53:42 +0800
Subject: [PATCH] support snapshot download from local (#153)

* support snapshot download from local

* let snapshot download from local
---
 README.md                                    | 12 ++++--
 api/apps/conversation_app.py                 |  7 +++-
 deepdoc/parser/pdf_parser.py                 | 18 +++++++--
 deepdoc/vision/layout_recognizer.py          | 11 +++++-
 deepdoc/vision/ocr.py                        | 40 +++++++++++++++-----
 deepdoc/vision/recognizer.py                 | 11 +++++-
 deepdoc/vision/table_structure_recognizer.py | 11 +++++-
 docker/README.md                             |  2 +-
 docker/entrypoint.sh                         |  2 +-
 docker/service_conf.yaml                     |  2 +-
 rag/llm/embedding_model.py                   | 15 +++++++-
 rag/svr/task_executor.py                     |  2 +-
 12 files changed, 109 insertions(+), 24 deletions(-)

diff --git a/README.md b/README.md
index 2e858d85..fc58342d 100644
--- a/README.md
+++ b/README.md
@@ -1,5 +1,5 @@
-  <img src="…" alt="ragflow logo"/>
+  <img src="…" alt="ragflow logo"/>
@@ -11,7 +11,7 @@

-  <img src="…" alt="Static Badge"/>
+  <img src="…" alt="Static Badge"/>

-[RagFlow](http://ragflow.io) is a knowledge management platform built on custom-build document understanding engine and LLM,
+[RagFlow](http://demo.ragflow.io) is a knowledge management platform built on a custom-built document understanding engine and LLM,
 with reasoned and well-founded answers to your question. Clone this repository, you can deploy your own knowledge management platform to empower your business with AI.
@@ -119,6 +119,12 @@ Open your browser, enter the IP address of your server, _**Hallelujah**_ again!
 > The default serving port is 80, if you want to change that, please refer to [docker-compose.yml](./docker-compose.yaml),
 > and change the left part of *'80:80'*.
 
+# System Architecture Diagram
+
+<div align="center">
+<img src="…"/>
+</div>
+
 # Configuration
 
 If you need to change the default setting of the system when you deploy it, there are several ways to configure it. Please refer to [README](./docker/README.md) and manually set the configuration.
diff --git a/api/apps/conversation_app.py b/api/apps/conversation_app.py
index ba6d6002..5a23efb0 100644
--- a/api/apps/conversation_app.py
+++ b/api/apps/conversation_app.py
@@ -320,8 +320,13 @@ def use_sql(question, field_map, tenant_id, chat_mdl):
     rows = re.sub(r"T[0-9]{2}:[0-9]{2}:[0-9]{2}(\.[0-9]+Z)?\|", "|", rows)
     docid_idx = list(docid_idx)[0]
     docnm_idx = list(docnm_idx)[0]
+    doc_aggs = {}
+    for r in tbl["rows"]:
+        if r[docid_idx] not in doc_aggs:
+            doc_aggs[r[docid_idx]] = {"doc_name": r[docnm_idx], "count": 0}
+        doc_aggs[r[docid_idx]]["count"] += 1
     return {
         "answer": "\n".join([clmns, line, rows]),
         "reference": {"chunks": [{"doc_id": r[docid_idx], "docnm_kwd": r[docnm_idx]} for r in tbl["rows"]],
-                      "doc_aggs": [{"doc_id": r[docid_idx], "doc_name": r[docnm_idx], "count": 1} for r in tbl["rows"]]}
+                      "doc_aggs": [{"doc_id": did, "doc_name": d["doc_name"], "count": d["count"]} for did, d in doc_aggs.items()]}
     }
diff --git a/deepdoc/parser/pdf_parser.py b/deepdoc/parser/pdf_parser.py
index 767cfcfd..ed6edea8 100644
--- a/deepdoc/parser/pdf_parser.py
+++ b/deepdoc/parser/pdf_parser.py
@@ -1,4 +1,5 @@
 # -*- coding: utf-8 -*-
+import os
 import random
 
 import fitz
@@ -12,10 +13,12 @@
 from PIL import Image, ImageDraw
 import numpy as np
 from PyPDF2 import PdfReader as pdf2_read
 
+
+from api.utils.file_utils import get_project_base_directory
 from deepdoc.vision import OCR, Recognizer, LayoutRecognizer, TableStructureRecognizer
 from rag.nlp import huqie
 from copy import deepcopy
-from huggingface_hub import hf_hub_download
+from huggingface_hub import hf_hub_download, snapshot_download
 
 logging.getLogger("pdfminer").setLevel(logging.WARNING)
 
@@ -32,8 +35,17 @@ class HuParser:
         self.updown_cnt_mdl = xgb.Booster()
         if torch.cuda.is_available():
             self.updown_cnt_mdl.set_param({"device": "cuda"})
-        self.updown_cnt_mdl.load_model(hf_hub_download(repo_id="InfiniFlow/text_concat_xgb_v1.0",
-                                                       filename="updown_concat_xgb.model"))
+        try:
+            model_dir = snapshot_download(
+                repo_id="InfiniFlow/text_concat_xgb_v1.0",
+                local_dir=os.path.join(
+                    get_project_base_directory(),
+                    "rag/res/deepdoc"),
+                local_files_only=True)
+        except Exception as e:
+            model_dir = snapshot_download(repo_id="InfiniFlow/text_concat_xgb_v1.0")
+
+        self.updown_cnt_mdl.load_model(os.path.join(model_dir, "updown_concat_xgb.model"))
         self.page_from = 0
         """
         If you have trouble downloading HuggingFace models, -_^ this might help!!
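The try/except added to `HuParser.__init__` above is the local-first lookup that this patch repeats in every deepdoc vision module below. A minimal standalone sketch of the pattern, assuming the same `get_project_base_directory()` helper and `rag/res/deepdoc` layout used in the diff (the `resolve_model_dir` helper itself is illustrative and not part of the patch):

```python
import os

from huggingface_hub import snapshot_download

from api.utils.file_utils import get_project_base_directory


# Illustrative helper, not part of the patch.
def resolve_model_dir(repo_id):
    """Prefer a snapshot bundled under rag/res/deepdoc; fall back to downloading it."""
    local_dir = os.path.join(get_project_base_directory(), "rag/res/deepdoc")
    try:
        # Succeeds only if every file of the snapshot is already on disk.
        return snapshot_download(repo_id=repo_id,
                                 local_dir=local_dir,
                                 local_files_only=True)
    except Exception:
        # Missing or incomplete local copy: do a normal (network) download.
        return snapshot_download(repo_id=repo_id)
```

Failing over on any exception keeps the old behaviour (a plain `snapshot_download`) whenever the bundled copy is absent or incomplete.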
diff --git a/deepdoc/vision/layout_recognizer.py b/deepdoc/vision/layout_recognizer.py
index ba7ed85e..e107e077 100644
--- a/deepdoc/vision/layout_recognizer.py
+++ b/deepdoc/vision/layout_recognizer.py
@@ -37,7 +37,16 @@ class LayoutRecognizer(Recognizer):
         "Equation",
     ]
 
     def __init__(self, domain):
-        model_dir = snapshot_download(repo_id="InfiniFlow/deepdoc")
+        try:
+            model_dir = snapshot_download(
+                repo_id="InfiniFlow/deepdoc",
+                local_dir=os.path.join(
+                    get_project_base_directory(),
+                    "rag/res/deepdoc"),
+                local_files_only=True)
+        except Exception as e:
+            model_dir = snapshot_download(repo_id="InfiniFlow/deepdoc")
+
         super().__init__(self.labels, domain, model_dir)#os.path.join(get_project_base_directory(), "rag/res/deepdoc/"))
         self.garbage_layouts = ["footer", "header", "reference"]
diff --git a/deepdoc/vision/ocr.py b/deepdoc/vision/ocr.py
index 86565199..dd34e4b2 100644
--- a/deepdoc/vision/ocr.py
+++ b/deepdoc/vision/ocr.py
@@ -14,6 +14,10 @@
 import copy
 import time
 import os
+
+from huggingface_hub import snapshot_download
+
+from api.utils.file_utils import get_project_base_directory
 from .operators import *
 import numpy as np
 import onnxruntime as ort
@@ -21,6 +25,7 @@ import onnxruntime as ort
 from .postprocess import build_post_process
 from rag.settings import cron_logger
 
+
 def transform(data, ops=None):
     """ transform """
     if ops is None:
@@ -66,9 +71,15 @@ def load_model(model_dir, nm):
     options.intra_op_num_threads = 2
     options.inter_op_num_threads = 2
     if False and ort.get_device() == "GPU":
-        sess = ort.InferenceSession(model_file_path, options=options, providers=['CUDAExecutionProvider'])
+        sess = ort.InferenceSession(
+            model_file_path,
+            options=options,
+            providers=['CUDAExecutionProvider'])
     else:
-        sess = ort.InferenceSession(model_file_path, options=options, providers=['CPUExecutionProvider'])
+        sess = ort.InferenceSession(
+            model_file_path,
+            options=options,
+            providers=['CPUExecutionProvider'])
     return sess, sess.get_inputs()[0]
 
 
@@ -331,7 +342,8 @@ class TextRecognizer(object):
                 outputs = self.predictor.run(None, input_dict)
                 break
             except Exception as e:
-                if i >= 3: raise e
+                if i >= 3:
+                    raise e
                 time.sleep(5)
         preds = outputs[0]
         rec_result = self.postprocess_op(preds)
@@ -442,7 +454,8 @@ class TextDetector(object):
                 outputs = self.predictor.run(None, input_dict)
                 break
             except Exception as e:
-                if i >= 3: raise e
+                if i >= 3:
+                    raise e
                 time.sleep(5)
 
         post_result = self.postprocess_op({"maps": outputs[0]}, shape_list)
@@ -466,7 +479,15 @@ class OCR(object):
         """
 
         if not model_dir:
-            model_dir = snapshot_download(repo_id="InfiniFlow/deepdoc")
+            try:
+                model_dir = snapshot_download(
+                    repo_id="InfiniFlow/deepdoc",
+                    local_dir=os.path.join(
+                        get_project_base_directory(),
+                        "rag/res/deepdoc"),
+                    local_files_only=True)
+            except Exception as e:
+                model_dir = snapshot_download(repo_id="InfiniFlow/deepdoc")
 
         self.text_detector = TextDetector(model_dir)
         self.text_recognizer = TextRecognizer(model_dir)
@@ -548,14 +569,16 @@ class OCR(object):
         cron_logger.debug("dt_boxes num : {}, elapsed : {}".format(
             len(dt_boxes), elapse))
 
-        return zip(self.sorted_boxes(dt_boxes), [("",0) for _ in range(len(dt_boxes))])
+        return zip(self.sorted_boxes(dt_boxes), [
+            ("", 0) for _ in range(len(dt_boxes))])
 
     def recognize(self, ori_im, box):
         img_crop = self.get_rotate_crop_image(ori_im, box)
         rec_res, elapse = self.text_recognizer([img_crop])
         text, score = rec_res[0]
-        if score < self.drop_score:return ""
+        if score < self.drop_score:
+            return ""
         return text
 
     def __call__(self, img, cls=True):
@@ -600,8 +623,7 @@ class OCR(object):
 
         end = time.time()
         time_dict['all'] = end - start
-
-        #for bno in range(len(img_crop_list)):
+        # for bno in range(len(img_crop_list)):
         #     print(f"{bno}, {rec_res[bno]}")
 
         return list(zip([a.tolist() for a in filter_boxes], filter_rec_res))
diff --git a/deepdoc/vision/recognizer.py b/deepdoc/vision/recognizer.py
index 1de3cd50..4f619660 100644
--- a/deepdoc/vision/recognizer.py
+++ b/deepdoc/vision/recognizer.py
@@ -17,6 +17,7 @@ from copy import deepcopy
 
 import onnxruntime as ort
 from huggingface_hub import snapshot_download
 
+from api.utils.file_utils import get_project_base_directory
 from .operators import *
 from rag.settings import cron_logger
@@ -35,7 +36,15 @@ class Recognizer(object):
         """
 
         if not model_dir:
-            model_dir = snapshot_download(repo_id="InfiniFlow/deepdoc")
+            try:
+                model_dir = snapshot_download(
+                    repo_id="InfiniFlow/deepdoc",
+                    local_dir=os.path.join(
+                        get_project_base_directory(),
+                        "rag/res/deepdoc"),
+                    local_files_only=True)
+            except Exception as e:
+                model_dir = snapshot_download(repo_id="InfiniFlow/deepdoc")
 
         model_file_path = os.path.join(model_dir, task_name + ".onnx")
         if not os.path.exists(model_file_path):
diff --git a/deepdoc/vision/table_structure_recognizer.py b/deepdoc/vision/table_structure_recognizer.py
index be2430a0..022d5582 100644
--- a/deepdoc/vision/table_structure_recognizer.py
+++ b/deepdoc/vision/table_structure_recognizer.py
@@ -34,7 +34,16 @@ class TableStructureRecognizer(Recognizer):
     ]
 
     def __init__(self):
-        model_dir = snapshot_download(repo_id="InfiniFlow/deepdoc")
+        try:
+            model_dir = snapshot_download(
+                repo_id="InfiniFlow/deepdoc",
+                local_dir=os.path.join(
+                    get_project_base_directory(),
+                    "rag/res/deepdoc"),
+                local_files_only=True)
+        except Exception as e:
+            model_dir = snapshot_download(repo_id="InfiniFlow/deepdoc")
+
         super().__init__(self.labels, "tsr", model_dir)#os.path.join(get_project_base_directory(), "rag/res/deepdoc/"))
 
     def __call__(self, images, thr=0.2):
diff --git a/docker/README.md b/docker/README.md
index 812f8b13..63d435cb 100644
--- a/docker/README.md
+++ b/docker/README.md
@@ -67,7 +67,7 @@ The serving IP and port inside the docker container. This is not updating until
 Newly signed-up users use the LLM configured by this part. Otherwise, users need to configure their own LLM in *setting*.
 
 ### factory
-The LLM suppliers. '通义千问', "OpenAI" and "智谱AI" are supported.
+The LLM suppliers. 'Tongyi-Qianwen', "OpenAI", "Moonshot" and "ZHIPU-AI" are supported.
 
 ### api_key
 The corresponding API key of your assigned LLM vendor.
diff --git a/docker/entrypoint.sh b/docker/entrypoint.sh
index a3742350..1e76a13a 100644
--- a/docker/entrypoint.sh
+++ b/docker/entrypoint.sh
@@ -29,7 +29,7 @@ function task_bro(){
 
 task_bro &
 
-WS=8
+WS=2
 for ((i=0;i]{0,12})?>", " ", d["content_with_weight"]) for d in docs]
     tk_count = 0
     if len(tts) == len(cnts):
         tts_ = np.array([])
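Not part of this patch, but one possible way to seed `rag/res/deepdoc` ahead of time so the `local_files_only=True` branch above succeeds on hosts without HuggingFace access; the repo ids are the ones referenced in the diff, and the sketch assumes the same `get_project_base_directory()` helper:

```python
# Illustrative pre-fetch script, not part of the patch: run once on a machine
# with network access, then ship rag/res/deepdoc together with the deployment.
import os

from huggingface_hub import snapshot_download

from api.utils.file_utils import get_project_base_directory

local_dir = os.path.join(get_project_base_directory(), "rag/res/deepdoc")
for repo_id in ("InfiniFlow/deepdoc", "InfiniFlow/text_concat_xgb_v1.0"):
    snapshot_download(repo_id=repo_id, local_dir=local_dir)
```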