support snapshot download from local (#153)

* support snapshot download from local

* let snapshot download from local
This commit is contained in:
KevinHuSh 2024-03-27 09:53:42 +08:00 committed by GitHub
parent da21320b88
commit 979b3a5b4b
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
12 changed files with 109 additions and 24 deletions

View File

@ -1,5 +1,5 @@
<div align="center"> <div align="center">
<a href="https://ragflow.io/"> <a href="https://demo.ragflow.io/">
<img src="https://github.com/infiniflow/ragflow/assets/12318111/f034fb27-b3bf-401b-b213-e1dfa7448d2a" width="320" alt="ragflow logo"> <img src="https://github.com/infiniflow/ragflow/assets/12318111/f034fb27-b3bf-401b-b213-e1dfa7448d2a" width="320" alt="ragflow logo">
</a> </a>
</div> </div>
@ -11,7 +11,7 @@
</p> </p>
<p align="center"> <p align="center">
<a href="https://ragflow.io" target="_blank"> <a href="https://demo.ragflow.io" target="_blank">
<img alt="Static Badge" src="https://img.shields.io/badge/RAGFLOW-LLM-white?&labelColor=dd0af7"></a> <img alt="Static Badge" src="https://img.shields.io/badge/RAGFLOW-LLM-white?&labelColor=dd0af7"></a>
<a href="https://hub.docker.com/r/infiniflow/ragflow" target="_blank"> <a href="https://hub.docker.com/r/infiniflow/ragflow" target="_blank">
<img src="https://img.shields.io/badge/docker_pull-ragflow:v1.0-brightgreen" <img src="https://img.shields.io/badge/docker_pull-ragflow:v1.0-brightgreen"
@ -21,7 +21,7 @@
</a> </a>
</p> </p>
[RagFlow](http://ragflow.io) is a knowledge management platform built on a custom-built document understanding engine and LLM, [RagFlow](http://demo.ragflow.io) is a knowledge management platform built on a custom-built document understanding engine and LLM,
with reasoned and well-founded answers to your question. Clone this repository, you can deploy your own knowledge management with reasoned and well-founded answers to your question. Clone this repository, you can deploy your own knowledge management
platform to empower your business with AI. platform to empower your business with AI.
@ -119,6 +119,12 @@ Open your browser, enter the IP address of your server, _**Hallelujah**_ again!
> The default serving port is 80, if you want to change that, please refer to [docker-compose.yml](./docker-compose.yaml), > The default serving port is 80, if you want to change that, please refer to [docker-compose.yml](./docker-compose.yaml),
> and change the left part of *'80:80'*. > and change the left part of *'80:80'*.
# System Architecture Diagram
<div align="center" style="margin-top:20px;margin-bottom:20px;">
<img src="https://github.com/infiniflow/ragflow/assets/12318111/39c8e546-51ca-4b50-a1da-83731b540cd0" width="1000"/>
</div>
# Configuration # Configuration
If you need to change the default settings of the system when you deploy it, there are several ways to configure it. If you need to change the default settings of the system when you deploy it, there are several ways to configure it.
Please refer to [README](./docker/README.md) and manually set the configuration. Please refer to [README](./docker/README.md) and manually set the configuration.

View File

@ -320,8 +320,13 @@ def use_sql(question, field_map, tenant_id, chat_mdl):
rows = re.sub(r"T[0-9]{2}:[0-9]{2}:[0-9]{2}(\.[0-9]+Z)?\|", "|", rows) rows = re.sub(r"T[0-9]{2}:[0-9]{2}:[0-9]{2}(\.[0-9]+Z)?\|", "|", rows)
docid_idx = list(docid_idx)[0] docid_idx = list(docid_idx)[0]
docnm_idx = list(docnm_idx)[0] docnm_idx = list(docnm_idx)[0]
doc_aggs = {}
for r in tbl["rows"]:
if r[docid_idx] not in doc_aggs:
doc_aggs[r[docid_idx]] = {"doc_name": r[docnm_idx], "count": 0}
doc_aggs[r[docid_idx]]["count"] += 1
return { return {
"answer": "\n".join([clmns, line, rows]), "answer": "\n".join([clmns, line, rows]),
"reference": {"chunks": [{"doc_id": r[docid_idx], "docnm_kwd": r[docnm_idx]} for r in tbl["rows"]], "reference": {"chunks": [{"doc_id": r[docid_idx], "docnm_kwd": r[docnm_idx]} for r in tbl["rows"]],
"doc_aggs": [{"doc_id": r[docid_idx], "doc_name": r[docnm_idx], "count": 1} for r in tbl["rows"]]} "doc_aggs":[{"doc_id": did, "doc_name": d["doc_name"], "count": d["count"]} for did, d in doc_aggs.items()]}
} }

View File

@ -1,4 +1,5 @@
# -*- coding: utf-8 -*- # -*- coding: utf-8 -*-
import os
import random import random
import fitz import fitz
@ -12,10 +13,12 @@ from PIL import Image, ImageDraw
import numpy as np import numpy as np
from PyPDF2 import PdfReader as pdf2_read from PyPDF2 import PdfReader as pdf2_read
from api.utils.file_utils import get_project_base_directory
from deepdoc.vision import OCR, Recognizer, LayoutRecognizer, TableStructureRecognizer from deepdoc.vision import OCR, Recognizer, LayoutRecognizer, TableStructureRecognizer
from rag.nlp import huqie from rag.nlp import huqie
from copy import deepcopy from copy import deepcopy
from huggingface_hub import hf_hub_download from huggingface_hub import hf_hub_download, snapshot_download
logging.getLogger("pdfminer").setLevel(logging.WARNING) logging.getLogger("pdfminer").setLevel(logging.WARNING)
@ -32,8 +35,17 @@ class HuParser:
self.updown_cnt_mdl = xgb.Booster() self.updown_cnt_mdl = xgb.Booster()
if torch.cuda.is_available(): if torch.cuda.is_available():
self.updown_cnt_mdl.set_param({"device": "cuda"}) self.updown_cnt_mdl.set_param({"device": "cuda"})
self.updown_cnt_mdl.load_model(hf_hub_download(repo_id="InfiniFlow/text_concat_xgb_v1.0", try:
filename="updown_concat_xgb.model")) model_dir = snapshot_download(
repo_id="InfiniFlow/text_concat_xgb_v1.0",
local_dir=os.path.join(
get_project_base_directory(),
"rag/res/deepdoc"),
local_files_only=True)
except Exception as e:
model_dir = snapshot_download(repo_id="InfiniFlow/text_concat_xgb_v1.0")
self.updown_cnt_mdl.load_model(os.path.join(model_dir, "updown_concat_xgb.model"))
self.page_from = 0 self.page_from = 0
""" """
If you have trouble downloading HuggingFace models, -_^ this might help!! If you have trouble downloading HuggingFace models, -_^ this might help!!

View File

@ -37,7 +37,16 @@ class LayoutRecognizer(Recognizer):
"Equation", "Equation",
] ]
def __init__(self, domain): def __init__(self, domain):
model_dir = snapshot_download(repo_id="InfiniFlow/deepdoc") try:
model_dir = snapshot_download(
repo_id="InfiniFlow/deepdoc",
local_dir=os.path.join(
get_project_base_directory(),
"rag/res/deepdoc"),
local_files_only=True)
except Exception as e:
model_dir = snapshot_download(repo_id="InfiniFlow/deepdoc")
super().__init__(self.labels, domain, model_dir)#os.path.join(get_project_base_directory(), "rag/res/deepdoc/")) super().__init__(self.labels, domain, model_dir)#os.path.join(get_project_base_directory(), "rag/res/deepdoc/"))
self.garbage_layouts = ["footer", "header", "reference"] self.garbage_layouts = ["footer", "header", "reference"]

View File

@ -14,6 +14,10 @@
import copy import copy
import time import time
import os import os
from huggingface_hub import snapshot_download
from api.utils.file_utils import get_project_base_directory
from .operators import * from .operators import *
import numpy as np import numpy as np
import onnxruntime as ort import onnxruntime as ort
@ -21,6 +25,7 @@ import onnxruntime as ort
from .postprocess import build_post_process from .postprocess import build_post_process
from rag.settings import cron_logger from rag.settings import cron_logger
def transform(data, ops=None): def transform(data, ops=None):
""" transform """ """ transform """
if ops is None: if ops is None:
@ -66,9 +71,15 @@ def load_model(model_dir, nm):
options.intra_op_num_threads = 2 options.intra_op_num_threads = 2
options.inter_op_num_threads = 2 options.inter_op_num_threads = 2
if False and ort.get_device() == "GPU": if False and ort.get_device() == "GPU":
sess = ort.InferenceSession(model_file_path, options=options, providers=['CUDAExecutionProvider']) sess = ort.InferenceSession(
model_file_path,
options=options,
providers=['CUDAExecutionProvider'])
else: else:
sess = ort.InferenceSession(model_file_path, options=options, providers=['CPUExecutionProvider']) sess = ort.InferenceSession(
model_file_path,
options=options,
providers=['CPUExecutionProvider'])
return sess, sess.get_inputs()[0] return sess, sess.get_inputs()[0]
@ -331,7 +342,8 @@ class TextRecognizer(object):
outputs = self.predictor.run(None, input_dict) outputs = self.predictor.run(None, input_dict)
break break
except Exception as e: except Exception as e:
if i >= 3: raise e if i >= 3:
raise e
time.sleep(5) time.sleep(5)
preds = outputs[0] preds = outputs[0]
rec_result = self.postprocess_op(preds) rec_result = self.postprocess_op(preds)
@ -442,7 +454,8 @@ class TextDetector(object):
outputs = self.predictor.run(None, input_dict) outputs = self.predictor.run(None, input_dict)
break break
except Exception as e: except Exception as e:
if i >= 3: raise e if i >= 3:
raise e
time.sleep(5) time.sleep(5)
post_result = self.postprocess_op({"maps": outputs[0]}, shape_list) post_result = self.postprocess_op({"maps": outputs[0]}, shape_list)
@ -466,7 +479,15 @@ class OCR(object):
""" """
if not model_dir: if not model_dir:
model_dir = snapshot_download(repo_id="InfiniFlow/deepdoc") try:
model_dir = snapshot_download(
repo_id="InfiniFlow/deepdoc",
local_dir=os.path.join(
get_project_base_directory(),
"rag/res/deepdoc"),
local_files_only=True)
except Exception as e:
model_dir = snapshot_download(repo_id="InfiniFlow/deepdoc")
self.text_detector = TextDetector(model_dir) self.text_detector = TextDetector(model_dir)
self.text_recognizer = TextRecognizer(model_dir) self.text_recognizer = TextRecognizer(model_dir)
@ -548,14 +569,16 @@ class OCR(object):
cron_logger.debug("dt_boxes num : {}, elapsed : {}".format( cron_logger.debug("dt_boxes num : {}, elapsed : {}".format(
len(dt_boxes), elapse)) len(dt_boxes), elapse))
return zip(self.sorted_boxes(dt_boxes), [("",0) for _ in range(len(dt_boxes))]) return zip(self.sorted_boxes(dt_boxes), [
("", 0) for _ in range(len(dt_boxes))])
def recognize(self, ori_im, box): def recognize(self, ori_im, box):
img_crop = self.get_rotate_crop_image(ori_im, box) img_crop = self.get_rotate_crop_image(ori_im, box)
rec_res, elapse = self.text_recognizer([img_crop]) rec_res, elapse = self.text_recognizer([img_crop])
text, score = rec_res[0] text, score = rec_res[0]
if score < self.drop_score:return "" if score < self.drop_score:
return ""
return text return text
def __call__(self, img, cls=True): def __call__(self, img, cls=True):
@ -600,8 +623,7 @@ class OCR(object):
end = time.time() end = time.time()
time_dict['all'] = end - start time_dict['all'] = end - start
# for bno in range(len(img_crop_list)):
#for bno in range(len(img_crop_list)):
# print(f"{bno}, {rec_res[bno]}") # print(f"{bno}, {rec_res[bno]}")
return list(zip([a.tolist() for a in filter_boxes], filter_rec_res)) return list(zip([a.tolist() for a in filter_boxes], filter_rec_res))

View File

@ -17,6 +17,7 @@ from copy import deepcopy
import onnxruntime as ort import onnxruntime as ort
from huggingface_hub import snapshot_download from huggingface_hub import snapshot_download
from api.utils.file_utils import get_project_base_directory
from .operators import * from .operators import *
from rag.settings import cron_logger from rag.settings import cron_logger
@ -35,7 +36,15 @@ class Recognizer(object):
""" """
if not model_dir: if not model_dir:
model_dir = snapshot_download(repo_id="InfiniFlow/deepdoc") try:
model_dir = snapshot_download(
repo_id="InfiniFlow/deepdoc",
local_dir=os.path.join(
get_project_base_directory(),
"rag/res/deepdoc"),
local_files_only=True)
except Exception as e:
model_dir = snapshot_download(repo_id="InfiniFlow/deepdoc")
model_file_path = os.path.join(model_dir, task_name + ".onnx") model_file_path = os.path.join(model_dir, task_name + ".onnx")
if not os.path.exists(model_file_path): if not os.path.exists(model_file_path):

View File

@ -34,7 +34,16 @@ class TableStructureRecognizer(Recognizer):
] ]
def __init__(self): def __init__(self):
model_dir = snapshot_download(repo_id="InfiniFlow/deepdoc") try:
model_dir = snapshot_download(
repo_id="InfiniFlow/deepdoc",
local_dir=os.path.join(
get_project_base_directory(),
"rag/res/deepdoc"),
local_files_only=True)
except Exception as e:
model_dir = snapshot_download(repo_id="InfiniFlow/deepdoc")
super().__init__(self.labels, "tsr", model_dir)#os.path.join(get_project_base_directory(), "rag/res/deepdoc/")) super().__init__(self.labels, "tsr", model_dir)#os.path.join(get_project_base_directory(), "rag/res/deepdoc/"))
def __call__(self, images, thr=0.2): def __call__(self, images, thr=0.2):

View File

@ -67,7 +67,7 @@ The serving IP and port inside the docker container. This is not updating until
Newly signed-up users use LLM configured by this part. Otherwise, users need to configure their own LLM in *setting*. Newly signed-up users use LLM configured by this part. Otherwise, users need to configure their own LLM in *setting*.
### factory ### factory
The LLM suppliers. '通义千问', "OpenAI" and "智谱AI" are supported. The LLM suppliers. 'Tongyi-Qianwen', "OpenAI", "Moonshot" and "ZHIPU-AI" are supported.
### api_key ### api_key
The corresponding API key of your assigned LLM vendor. The corresponding API key of your assigned LLM vendor.

View File

@ -29,7 +29,7 @@ function task_bro(){
task_bro & task_bro &
WS=8 WS=2
for ((i=0;i<WS;i++)) for ((i=0;i<WS;i++))
do do
task_exe $i $WS & task_exe $i $WS &

View File

@ -16,7 +16,7 @@ minio:
es: es:
hosts: 'http://es01:9200' hosts: 'http://es01:9200'
user_default_llm: user_default_llm:
factory: '通义千问' factory: 'Tongyi-Qianwen'
api_key: 'sk-xxxxxxxxxxxxx' api_key: 'sk-xxxxxxxxxxxxx'
oauth: oauth:
github: github:

View File

@ -13,6 +13,7 @@
# See the License for the specific language governing permissions and # See the License for the specific language governing permissions and
# limitations under the License. # limitations under the License.
# #
import os
from abc import ABC from abc import ABC
import dashscope import dashscope
@ -21,9 +22,21 @@ from FlagEmbedding import FlagModel
import torch import torch
import numpy as np import numpy as np
from huggingface_hub import snapshot_download from huggingface_hub import snapshot_download
from api.utils.file_utils import get_project_base_directory
from rag.utils import num_tokens_from_string from rag.utils import num_tokens_from_string
flag_model = FlagModel(snapshot_download("BAAI/bge-large-zh-v1.5", local_files_only=True), try:
model_dir = snapshot_download(
repo_id="BAAI/bge-large-zh-v1.5",
local_dir=os.path.join(
get_project_base_directory(),
"rag/res/bge-large-zh-v1.5"),
local_files_only=True)
except Exception as e:
model_dir = snapshot_download(repo_id="BAAI/bge-large-zh-v1.5")
flag_model = FlagModel(model_dir,
query_instruction_for_retrieval="为这个句子生成表示以用于检索相关文章:", query_instruction_for_retrieval="为这个句子生成表示以用于检索相关文章:",
use_fp16=torch.cuda.is_available()) use_fp16=torch.cuda.is_available())

View File

@ -172,7 +172,7 @@ def init_kb(row):
def embedding(docs, mdl, parser_config={}, callback=None): def embedding(docs, mdl, parser_config={}, callback=None):
batch_size = 32 batch_size = 32
tts, cnts = [rmSpace(d["title_tks"]) for d in docs if d.get("title_tks")], [ tts, cnts = [rmSpace(d["title_tks"]) for d in docs if d.get("title_tks")], [
d["content_with_weight"] for d in docs] re.sub(r"</?(table|td|caption|tr|th)( [^<>]{0,12})?>", " ", d["content_with_weight"]) for d in docs]
tk_count = 0 tk_count = 0
if len(tts) == len(cnts): if len(tts) == len(cnts):
tts_ = np.array([]) tts_ = np.array([])