support snapshot download from local (#153)
* support snapshot download from local * let snapshot download from local
This commit is contained in:
parent
da21320b88
commit
979b3a5b4b
12
README.md
12
README.md
@ -1,5 +1,5 @@
|
|||||||
<div align="center">
|
<div align="center">
|
||||||
<a href="https://ragflow.io/">
|
<a href="https://demo.ragflow.io/">
|
||||||
<img src="https://github.com/infiniflow/ragflow/assets/12318111/f034fb27-b3bf-401b-b213-e1dfa7448d2a" width="320" alt="ragflow logo">
|
<img src="https://github.com/infiniflow/ragflow/assets/12318111/f034fb27-b3bf-401b-b213-e1dfa7448d2a" width="320" alt="ragflow logo">
|
||||||
</a>
|
</a>
|
||||||
</div>
|
</div>
|
||||||
@ -11,7 +11,7 @@
|
|||||||
</p>
|
</p>
|
||||||
|
|
||||||
<p align="center">
|
<p align="center">
|
||||||
<a href="https://ragflow.io" target="_blank">
|
<a href="https://demo.ragflow.io" target="_blank">
|
||||||
<img alt="Static Badge" src="https://img.shields.io/badge/RAGFLOW-LLM-white?&labelColor=dd0af7"></a>
|
<img alt="Static Badge" src="https://img.shields.io/badge/RAGFLOW-LLM-white?&labelColor=dd0af7"></a>
|
||||||
<a href="https://hub.docker.com/r/infiniflow/ragflow" target="_blank">
|
<a href="https://hub.docker.com/r/infiniflow/ragflow" target="_blank">
|
||||||
<img src="https://img.shields.io/badge/docker_pull-ragflow:v1.0-brightgreen"
|
<img src="https://img.shields.io/badge/docker_pull-ragflow:v1.0-brightgreen"
|
||||||
@ -21,7 +21,7 @@
|
|||||||
</a>
|
</a>
|
||||||
</p>
|
</p>
|
||||||
|
|
||||||
[RagFlow](http://ragflow.io) is a knowledge management platform built on a custom-built document understanding engine and LLM,
|
[RagFlow](http://demo.ragflow.io) is a knowledge management platform built on a custom-built document understanding engine and LLM,
|
||||||
with reasoned and well-founded answers to your questions. Clone this repository and you can deploy your own knowledge management
|
with reasoned and well-founded answers to your questions. Clone this repository and you can deploy your own knowledge management
|
||||||
platform to empower your business with AI.
|
platform to empower your business with AI.
|
||||||
|
|
||||||
@ -119,6 +119,12 @@ Open your browser, enter the IP address of your server, _**Hallelujah**_ again!
|
|||||||
> The default serving port is 80, if you want to change that, please refer to [docker-compose.yml](./docker-compose.yaml),
|
> The default serving port is 80, if you want to change that, please refer to [docker-compose.yml](./docker-compose.yaml),
|
||||||
> and change the left part of *'80:80'*.
|
> and change the left part of *'80:80'*.
|
||||||
|
|
||||||
|
# System Architecture Diagram
|
||||||
|
|
||||||
|
<div align="center" style="margin-top:20px;margin-bottom:20px;">
|
||||||
|
<img src="https://github.com/infiniflow/ragflow/assets/12318111/39c8e546-51ca-4b50-a1da-83731b540cd0" width="1000"/>
|
||||||
|
</div>
|
||||||
|
|
||||||
# Configuration
|
# Configuration
|
||||||
If you need to change the default settings of the system when you deploy it, there are several ways to configure it.
|
If you need to change the default settings of the system when you deploy it, there are several ways to configure it.
|
||||||
Please refer to [README](./docker/README.md) and manually set the configuration.
|
Please refer to [README](./docker/README.md) and manually set the configuration.
|
||||||
|
|||||||
@ -320,8 +320,13 @@ def use_sql(question, field_map, tenant_id, chat_mdl):
|
|||||||
rows = re.sub(r"T[0-9]{2}:[0-9]{2}:[0-9]{2}(\.[0-9]+Z)?\|", "|", rows)
|
rows = re.sub(r"T[0-9]{2}:[0-9]{2}:[0-9]{2}(\.[0-9]+Z)?\|", "|", rows)
|
||||||
docid_idx = list(docid_idx)[0]
|
docid_idx = list(docid_idx)[0]
|
||||||
docnm_idx = list(docnm_idx)[0]
|
docnm_idx = list(docnm_idx)[0]
|
||||||
|
doc_aggs = {}
|
||||||
|
for r in tbl["rows"]:
|
||||||
|
if r[docid_idx] not in doc_aggs:
|
||||||
|
doc_aggs[r[docid_idx]] = {"doc_name": r[docnm_idx], "count": 0}
|
||||||
|
doc_aggs[r[docid_idx]]["count"] += 1
|
||||||
return {
|
return {
|
||||||
"answer": "\n".join([clmns, line, rows]),
|
"answer": "\n".join([clmns, line, rows]),
|
||||||
"reference": {"chunks": [{"doc_id": r[docid_idx], "docnm_kwd": r[docnm_idx]} for r in tbl["rows"]],
|
"reference": {"chunks": [{"doc_id": r[docid_idx], "docnm_kwd": r[docnm_idx]} for r in tbl["rows"]],
|
||||||
"doc_aggs": [{"doc_id": r[docid_idx], "doc_name": r[docnm_idx], "count": 1} for r in tbl["rows"]]}
|
"doc_aggs":[{"doc_id": did, "doc_name": d["doc_name"], "count": d["count"]} for did, d in doc_aggs.items()]}
|
||||||
}
|
}
|
||||||
|
|||||||
@ -1,4 +1,5 @@
|
|||||||
# -*- coding: utf-8 -*-
|
# -*- coding: utf-8 -*-
|
||||||
|
import os
|
||||||
import random
|
import random
|
||||||
|
|
||||||
import fitz
|
import fitz
|
||||||
@ -12,10 +13,12 @@ from PIL import Image, ImageDraw
|
|||||||
import numpy as np
|
import numpy as np
|
||||||
|
|
||||||
from PyPDF2 import PdfReader as pdf2_read
|
from PyPDF2 import PdfReader as pdf2_read
|
||||||
|
|
||||||
|
from api.utils.file_utils import get_project_base_directory
|
||||||
from deepdoc.vision import OCR, Recognizer, LayoutRecognizer, TableStructureRecognizer
|
from deepdoc.vision import OCR, Recognizer, LayoutRecognizer, TableStructureRecognizer
|
||||||
from rag.nlp import huqie
|
from rag.nlp import huqie
|
||||||
from copy import deepcopy
|
from copy import deepcopy
|
||||||
from huggingface_hub import hf_hub_download
|
from huggingface_hub import hf_hub_download, snapshot_download
|
||||||
|
|
||||||
logging.getLogger("pdfminer").setLevel(logging.WARNING)
|
logging.getLogger("pdfminer").setLevel(logging.WARNING)
|
||||||
|
|
||||||
@ -32,8 +35,17 @@ class HuParser:
|
|||||||
self.updown_cnt_mdl = xgb.Booster()
|
self.updown_cnt_mdl = xgb.Booster()
|
||||||
if torch.cuda.is_available():
|
if torch.cuda.is_available():
|
||||||
self.updown_cnt_mdl.set_param({"device": "cuda"})
|
self.updown_cnt_mdl.set_param({"device": "cuda"})
|
||||||
self.updown_cnt_mdl.load_model(hf_hub_download(repo_id="InfiniFlow/text_concat_xgb_v1.0",
|
try:
|
||||||
filename="updown_concat_xgb.model"))
|
model_dir = snapshot_download(
|
||||||
|
repo_id="InfiniFlow/text_concat_xgb_v1.0",
|
||||||
|
local_dir=os.path.join(
|
||||||
|
get_project_base_directory(),
|
||||||
|
"rag/res/deepdoc"),
|
||||||
|
local_files_only=True)
|
||||||
|
except Exception as e:
|
||||||
|
model_dir = snapshot_download(repo_id="InfiniFlow/text_concat_xgb_v1.0")
|
||||||
|
|
||||||
|
self.updown_cnt_mdl.load_model(os.path.join(model_dir, "updown_concat_xgb.model"))
|
||||||
self.page_from = 0
|
self.page_from = 0
|
||||||
"""
|
"""
|
||||||
If you have trouble downloading HuggingFace models, -_^ this might help!!
|
If you have trouble downloading HuggingFace models, -_^ this might help!!
|
||||||
|
|||||||
@ -37,7 +37,16 @@ class LayoutRecognizer(Recognizer):
|
|||||||
"Equation",
|
"Equation",
|
||||||
]
|
]
|
||||||
def __init__(self, domain):
|
def __init__(self, domain):
|
||||||
|
try:
|
||||||
|
model_dir = snapshot_download(
|
||||||
|
repo_id="InfiniFlow/deepdoc",
|
||||||
|
local_dir=os.path.join(
|
||||||
|
get_project_base_directory(),
|
||||||
|
"rag/res/deepdoc"),
|
||||||
|
local_files_only=True)
|
||||||
|
except Exception as e:
|
||||||
model_dir = snapshot_download(repo_id="InfiniFlow/deepdoc")
|
model_dir = snapshot_download(repo_id="InfiniFlow/deepdoc")
|
||||||
|
|
||||||
super().__init__(self.labels, domain, model_dir)#os.path.join(get_project_base_directory(), "rag/res/deepdoc/"))
|
super().__init__(self.labels, domain, model_dir)#os.path.join(get_project_base_directory(), "rag/res/deepdoc/"))
|
||||||
self.garbage_layouts = ["footer", "header", "reference"]
|
self.garbage_layouts = ["footer", "header", "reference"]
|
||||||
|
|
||||||
|
|||||||
@ -14,6 +14,10 @@
|
|||||||
import copy
|
import copy
|
||||||
import time
|
import time
|
||||||
import os
|
import os
|
||||||
|
|
||||||
|
from huggingface_hub import snapshot_download
|
||||||
|
|
||||||
|
from api.utils.file_utils import get_project_base_directory
|
||||||
from .operators import *
|
from .operators import *
|
||||||
import numpy as np
|
import numpy as np
|
||||||
import onnxruntime as ort
|
import onnxruntime as ort
|
||||||
@ -21,6 +25,7 @@ import onnxruntime as ort
|
|||||||
from .postprocess import build_post_process
|
from .postprocess import build_post_process
|
||||||
from rag.settings import cron_logger
|
from rag.settings import cron_logger
|
||||||
|
|
||||||
|
|
||||||
def transform(data, ops=None):
|
def transform(data, ops=None):
|
||||||
""" transform """
|
""" transform """
|
||||||
if ops is None:
|
if ops is None:
|
||||||
@ -66,9 +71,15 @@ def load_model(model_dir, nm):
|
|||||||
options.intra_op_num_threads = 2
|
options.intra_op_num_threads = 2
|
||||||
options.inter_op_num_threads = 2
|
options.inter_op_num_threads = 2
|
||||||
if False and ort.get_device() == "GPU":
|
if False and ort.get_device() == "GPU":
|
||||||
sess = ort.InferenceSession(model_file_path, options=options, providers=['CUDAExecutionProvider'])
|
sess = ort.InferenceSession(
|
||||||
|
model_file_path,
|
||||||
|
options=options,
|
||||||
|
providers=['CUDAExecutionProvider'])
|
||||||
else:
|
else:
|
||||||
sess = ort.InferenceSession(model_file_path, options=options, providers=['CPUExecutionProvider'])
|
sess = ort.InferenceSession(
|
||||||
|
model_file_path,
|
||||||
|
options=options,
|
||||||
|
providers=['CPUExecutionProvider'])
|
||||||
return sess, sess.get_inputs()[0]
|
return sess, sess.get_inputs()[0]
|
||||||
|
|
||||||
|
|
||||||
@ -331,7 +342,8 @@ class TextRecognizer(object):
|
|||||||
outputs = self.predictor.run(None, input_dict)
|
outputs = self.predictor.run(None, input_dict)
|
||||||
break
|
break
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
if i >= 3: raise e
|
if i >= 3:
|
||||||
|
raise e
|
||||||
time.sleep(5)
|
time.sleep(5)
|
||||||
preds = outputs[0]
|
preds = outputs[0]
|
||||||
rec_result = self.postprocess_op(preds)
|
rec_result = self.postprocess_op(preds)
|
||||||
@ -442,7 +454,8 @@ class TextDetector(object):
|
|||||||
outputs = self.predictor.run(None, input_dict)
|
outputs = self.predictor.run(None, input_dict)
|
||||||
break
|
break
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
if i >= 3: raise e
|
if i >= 3:
|
||||||
|
raise e
|
||||||
time.sleep(5)
|
time.sleep(5)
|
||||||
|
|
||||||
post_result = self.postprocess_op({"maps": outputs[0]}, shape_list)
|
post_result = self.postprocess_op({"maps": outputs[0]}, shape_list)
|
||||||
@ -466,6 +479,14 @@ class OCR(object):
|
|||||||
|
|
||||||
"""
|
"""
|
||||||
if not model_dir:
|
if not model_dir:
|
||||||
|
try:
|
||||||
|
model_dir = snapshot_download(
|
||||||
|
repo_id="InfiniFlow/deepdoc",
|
||||||
|
local_dir=os.path.join(
|
||||||
|
get_project_base_directory(),
|
||||||
|
"rag/res/deepdoc"),
|
||||||
|
local_files_only=True)
|
||||||
|
except Exception as e:
|
||||||
model_dir = snapshot_download(repo_id="InfiniFlow/deepdoc")
|
model_dir = snapshot_download(repo_id="InfiniFlow/deepdoc")
|
||||||
|
|
||||||
self.text_detector = TextDetector(model_dir)
|
self.text_detector = TextDetector(model_dir)
|
||||||
@ -548,14 +569,16 @@ class OCR(object):
|
|||||||
cron_logger.debug("dt_boxes num : {}, elapsed : {}".format(
|
cron_logger.debug("dt_boxes num : {}, elapsed : {}".format(
|
||||||
len(dt_boxes), elapse))
|
len(dt_boxes), elapse))
|
||||||
|
|
||||||
return zip(self.sorted_boxes(dt_boxes), [("",0) for _ in range(len(dt_boxes))])
|
return zip(self.sorted_boxes(dt_boxes), [
|
||||||
|
("", 0) for _ in range(len(dt_boxes))])
|
||||||
|
|
||||||
def recognize(self, ori_im, box):
|
def recognize(self, ori_im, box):
|
||||||
img_crop = self.get_rotate_crop_image(ori_im, box)
|
img_crop = self.get_rotate_crop_image(ori_im, box)
|
||||||
|
|
||||||
rec_res, elapse = self.text_recognizer([img_crop])
|
rec_res, elapse = self.text_recognizer([img_crop])
|
||||||
text, score = rec_res[0]
|
text, score = rec_res[0]
|
||||||
if score < self.drop_score:return ""
|
if score < self.drop_score:
|
||||||
|
return ""
|
||||||
return text
|
return text
|
||||||
|
|
||||||
def __call__(self, img, cls=True):
|
def __call__(self, img, cls=True):
|
||||||
@ -600,8 +623,7 @@ class OCR(object):
|
|||||||
end = time.time()
|
end = time.time()
|
||||||
time_dict['all'] = end - start
|
time_dict['all'] = end - start
|
||||||
|
|
||||||
|
# for bno in range(len(img_crop_list)):
|
||||||
#for bno in range(len(img_crop_list)):
|
|
||||||
# print(f"{bno}, {rec_res[bno]}")
|
# print(f"{bno}, {rec_res[bno]}")
|
||||||
|
|
||||||
return list(zip([a.tolist() for a in filter_boxes], filter_rec_res))
|
return list(zip([a.tolist() for a in filter_boxes], filter_rec_res))
|
||||||
|
|||||||
@ -17,6 +17,7 @@ from copy import deepcopy
|
|||||||
import onnxruntime as ort
|
import onnxruntime as ort
|
||||||
from huggingface_hub import snapshot_download
|
from huggingface_hub import snapshot_download
|
||||||
|
|
||||||
|
from api.utils.file_utils import get_project_base_directory
|
||||||
from .operators import *
|
from .operators import *
|
||||||
from rag.settings import cron_logger
|
from rag.settings import cron_logger
|
||||||
|
|
||||||
@ -35,6 +36,14 @@ class Recognizer(object):
|
|||||||
|
|
||||||
"""
|
"""
|
||||||
if not model_dir:
|
if not model_dir:
|
||||||
|
try:
|
||||||
|
model_dir = snapshot_download(
|
||||||
|
repo_id="InfiniFlow/deepdoc",
|
||||||
|
local_dir=os.path.join(
|
||||||
|
get_project_base_directory(),
|
||||||
|
"rag/res/deepdoc"),
|
||||||
|
local_files_only=True)
|
||||||
|
except Exception as e:
|
||||||
model_dir = snapshot_download(repo_id="InfiniFlow/deepdoc")
|
model_dir = snapshot_download(repo_id="InfiniFlow/deepdoc")
|
||||||
|
|
||||||
model_file_path = os.path.join(model_dir, task_name + ".onnx")
|
model_file_path = os.path.join(model_dir, task_name + ".onnx")
|
||||||
|
|||||||
@ -34,7 +34,16 @@ class TableStructureRecognizer(Recognizer):
|
|||||||
]
|
]
|
||||||
|
|
||||||
def __init__(self):
|
def __init__(self):
|
||||||
|
try:
|
||||||
|
model_dir = snapshot_download(
|
||||||
|
repo_id="InfiniFlow/deepdoc",
|
||||||
|
local_dir=os.path.join(
|
||||||
|
get_project_base_directory(),
|
||||||
|
"rag/res/deepdoc"),
|
||||||
|
local_files_only=True)
|
||||||
|
except Exception as e:
|
||||||
model_dir = snapshot_download(repo_id="InfiniFlow/deepdoc")
|
model_dir = snapshot_download(repo_id="InfiniFlow/deepdoc")
|
||||||
|
|
||||||
super().__init__(self.labels, "tsr", model_dir)#os.path.join(get_project_base_directory(), "rag/res/deepdoc/"))
|
super().__init__(self.labels, "tsr", model_dir)#os.path.join(get_project_base_directory(), "rag/res/deepdoc/"))
|
||||||
|
|
||||||
def __call__(self, images, thr=0.2):
|
def __call__(self, images, thr=0.2):
|
||||||
|
|||||||
@ -67,7 +67,7 @@ The serving IP and port inside the docker container. This is not updating until
|
|||||||
Newly signed-up users use the LLM configured in this part. Otherwise, users need to configure their own LLM in *setting*.
|
Newly signed-up users use the LLM configured in this part. Otherwise, users need to configure their own LLM in *setting*.
|
||||||
|
|
||||||
### factory
|
### factory
|
||||||
The LLM suppliers. '通义千问', "OpenAI" and "智谱AI" are supported.
|
The LLM suppliers. 'Tongyi-Qianwen', "OpenAI", "Moonshot" and "ZHIPU-AI" are supported.
|
||||||
|
|
||||||
### api_key
|
### api_key
|
||||||
The corresponding API key of your assigned LLM vendor.
|
The corresponding API key of your assigned LLM vendor.
|
||||||
|
|||||||
@ -29,7 +29,7 @@ function task_bro(){
|
|||||||
|
|
||||||
task_bro &
|
task_bro &
|
||||||
|
|
||||||
WS=8
|
WS=2
|
||||||
for ((i=0;i<WS;i++))
|
for ((i=0;i<WS;i++))
|
||||||
do
|
do
|
||||||
task_exe $i $WS &
|
task_exe $i $WS &
|
||||||
|
|||||||
@ -16,7 +16,7 @@ minio:
|
|||||||
es:
|
es:
|
||||||
hosts: 'http://es01:9200'
|
hosts: 'http://es01:9200'
|
||||||
user_default_llm:
|
user_default_llm:
|
||||||
factory: '通义千问'
|
factory: 'Tongyi-Qianwen'
|
||||||
api_key: 'sk-xxxxxxxxxxxxx'
|
api_key: 'sk-xxxxxxxxxxxxx'
|
||||||
oauth:
|
oauth:
|
||||||
github:
|
github:
|
||||||
|
|||||||
@ -13,6 +13,7 @@
|
|||||||
# See the License for the specific language governing permissions and
|
# See the License for the specific language governing permissions and
|
||||||
# limitations under the License.
|
# limitations under the License.
|
||||||
#
|
#
|
||||||
|
import os
|
||||||
from abc import ABC
|
from abc import ABC
|
||||||
|
|
||||||
import dashscope
|
import dashscope
|
||||||
@ -21,9 +22,21 @@ from FlagEmbedding import FlagModel
|
|||||||
import torch
|
import torch
|
||||||
import numpy as np
|
import numpy as np
|
||||||
from huggingface_hub import snapshot_download
|
from huggingface_hub import snapshot_download
|
||||||
|
|
||||||
|
from api.utils.file_utils import get_project_base_directory
|
||||||
from rag.utils import num_tokens_from_string
|
from rag.utils import num_tokens_from_string
|
||||||
|
|
||||||
flag_model = FlagModel(snapshot_download("BAAI/bge-large-zh-v1.5", local_files_only=True),
|
try:
|
||||||
|
model_dir = snapshot_download(
|
||||||
|
repo_id="BAAI/bge-large-zh-v1.5",
|
||||||
|
local_dir=os.path.join(
|
||||||
|
get_project_base_directory(),
|
||||||
|
"rag/res/bge-large-zh-v1.5"),
|
||||||
|
local_files_only=True)
|
||||||
|
except Exception as e:
|
||||||
|
model_dir = snapshot_download(repo_id="BAAI/bge-large-zh-v1.5")
|
||||||
|
|
||||||
|
flag_model = FlagModel(model_dir,
|
||||||
query_instruction_for_retrieval="为这个句子生成表示以用于检索相关文章:",
|
query_instruction_for_retrieval="为这个句子生成表示以用于检索相关文章:",
|
||||||
use_fp16=torch.cuda.is_available())
|
use_fp16=torch.cuda.is_available())
|
||||||
|
|
||||||
|
|||||||
@ -172,7 +172,7 @@ def init_kb(row):
|
|||||||
def embedding(docs, mdl, parser_config={}, callback=None):
|
def embedding(docs, mdl, parser_config={}, callback=None):
|
||||||
batch_size = 32
|
batch_size = 32
|
||||||
tts, cnts = [rmSpace(d["title_tks"]) for d in docs if d.get("title_tks")], [
|
tts, cnts = [rmSpace(d["title_tks"]) for d in docs if d.get("title_tks")], [
|
||||||
d["content_with_weight"] for d in docs]
|
re.sub(r"</?(table|td|caption|tr|th)( [^<>]{0,12})?>", " ", d["content_with_weight"]) for d in docs]
|
||||||
tk_count = 0
|
tk_count = 0
|
||||||
if len(tts) == len(cnts):
|
if len(tts) == len(cnts):
|
||||||
tts_ = np.array([])
|
tts_ = np.array([])
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user