resolve table issues (#125)

This commit is contained in:
KevinHuSh 2024-03-15 14:59:28 +08:00 committed by GitHub
parent 82350c4139
commit de09b0e1a4
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
7 changed files with 15 additions and 14 deletions

View File

@ -14,7 +14,6 @@ ADD ./rag ./rag
ENV PYTHONPATH=/ragflow/ ENV PYTHONPATH=/ragflow/
ENV HF_ENDPOINT=https://hf-mirror.com ENV HF_ENDPOINT=https://hf-mirror.com
/root/miniconda3/envs/py11/bin/pip install peewee==3.17.1
ADD docker/entrypoint.sh ./entrypoint.sh ADD docker/entrypoint.sh ./entrypoint.sh
RUN chmod +x ./entrypoint.sh RUN chmod +x ./entrypoint.sh

View File

@ -19,7 +19,6 @@ ADD ./rag ./rag
ENV PYTHONPATH=/ragflow/ ENV PYTHONPATH=/ragflow/
ENV HF_ENDPOINT=https://hf-mirror.com ENV HF_ENDPOINT=https://hf-mirror.com
/root/miniconda3/envs/py11/bin/pip install peewee==3.17.1
ADD docker/entrypoint.sh ./entrypoint.sh ADD docker/entrypoint.sh ./entrypoint.sh
RUN chmod +x ./entrypoint.sh RUN chmod +x ./entrypoint.sh

View File

@ -309,6 +309,7 @@ def use_sql(question, field_map, tenant_id, chat_mdl):
# compose markdown table # compose markdown table
clmns = "|"+"|".join([re.sub(r"(/.*|[^]+)", "", field_map.get(tbl["columns"][i]["name"], tbl["columns"][i]["name"])) for i in clmn_idx]) + ("|原文|" if docid_idx and docid_idx else "|") clmns = "|"+"|".join([re.sub(r"(/.*|[^]+)", "", field_map.get(tbl["columns"][i]["name"], tbl["columns"][i]["name"])) for i in clmn_idx]) + ("|原文|" if docid_idx and docid_idx else "|")
line = "|"+"|".join(["------" for _ in range(len(clmn_idx))]) + ("|------|" if docid_idx and docid_idx else "") line = "|"+"|".join(["------" for _ in range(len(clmn_idx))]) + ("|------|" if docid_idx and docid_idx else "")
line = re.sub(r"T[0-9]{2}:[0-9]{2}:[0-9]{2}\|", "|", line)
rows = ["|"+"|".join([rmSpace(str(r[i])) for i in clmn_idx]).replace("None", " ") + "|" for r in tbl["rows"]] rows = ["|"+"|".join([rmSpace(str(r[i])) for i in clmn_idx]).replace("None", " ") + "|" for r in tbl["rows"]]
if not docid_idx or not docnm_idx: if not docid_idx or not docnm_idx:
chat_logger.warning("SQL missing field: " + sql) chat_logger.warning("SQL missing field: " + sql)

View File

@ -94,7 +94,7 @@ def init_llm_factory():
"name": "Local", "name": "Local",
"logo": "", "logo": "",
"tags": "LLM,TEXT EMBEDDING,SPEECH2TEXT,MODERATION", "tags": "LLM,TEXT EMBEDDING,SPEECH2TEXT,MODERATION",
"status": "0", "status": "1",
},{ },{
"name": "Moonshot", "name": "Moonshot",
"logo": "", "logo": "",

View File

@ -78,7 +78,7 @@ class KnowledgebaseService(CommonService):
if isinstance(v, dict): if isinstance(v, dict):
assert isinstance(old[k], dict) assert isinstance(old[k], dict)
dfs_update(old[k], v) dfs_update(old[k], v)
if isinstance(v, list): elif isinstance(v, list):
assert isinstance(old[k], list) assert isinstance(old[k], list)
old[k] = list(set(old[k]+v)) old[k] = list(set(old[k]+v))
else: old[k] = v else: old[k] = v

View File

@ -73,9 +73,9 @@ def trans_datatime(s):
def trans_bool(s): def trans_bool(s):
if re.match(r"(true|yes|是)$", str(s).strip(), flags=re.IGNORECASE): if re.match(r"(true|yes|是|\*|✓|✔|☑|✅|√)$", str(s).strip(), flags=re.IGNORECASE):
return ["yes", ""] return ["yes", ""]
if re.match(r"(false|no|否)$", str(s).strip(), flags=re.IGNORECASE): if re.match(r"(false|no|否|⍻|×)$", str(s).strip(), flags=re.IGNORECASE):
return ["no", ""] return ["no", ""]
@ -107,9 +107,9 @@ def column_data_type(arr):
arr[i] = trans[ty](str(arr[i])) arr[i] = trans[ty](str(arr[i]))
except Exception as e: except Exception as e:
arr[i] = None arr[i] = None
if ty == "text": #if ty == "text":
if len(arr) > 128 and uni / len(arr) < 0.1: # if len(arr) > 128 and uni / len(arr) < 0.1:
ty = "keyword" # ty = "keyword"
return arr, ty return arr, ty
@ -170,7 +170,7 @@ def chunk(filename, binary=None, from_page=0, to_page=10000000000, lang="Chinese
PY = Pinyin() PY = Pinyin()
fieds_map = { fieds_map = {
"text": "_tks", "text": "_tks",
"int": "_int", "int": "_long",
"keyword": "_kwd", "keyword": "_kwd",
"float": "_flt", "float": "_flt",
"datetime": "_dt", "datetime": "_dt",
@ -189,7 +189,7 @@ def chunk(filename, binary=None, from_page=0, to_page=10000000000, lang="Chinese
df[clmns[j]] = cln df[clmns[j]] = cln
if ty == "text": if ty == "text":
txts.extend([str(c) for c in cln if c]) txts.extend([str(c) for c in cln if c])
clmns_map = [(py_clmns[i] + fieds_map[clmn_tys[i]], clmns[i].replace("_", " ")) clmns_map = [(py_clmns[i].lower() + fieds_map[clmn_tys[i]], clmns[i].replace("_", " "))
for i in range(len(clmns))] for i in range(len(clmns))]
eng = lang.lower() == "english"#is_english(txts) eng = lang.lower() == "english"#is_english(txts)
@ -204,6 +204,8 @@ def chunk(filename, binary=None, from_page=0, to_page=10000000000, lang="Chinese
continue continue
if not str(row[clmns[j]]): if not str(row[clmns[j]]):
continue continue
if pd.isna(row[clmns[j]]):
continue
fld = clmns_map[j][0] fld = clmns_map[j][0]
d[fld] = row[clmns[j]] if clmn_tys[j] != "text" else huqie.qie( d[fld] = row[clmns[j]] if clmn_tys[j] != "text" else huqie.qie(
row[clmns[j]]) row[clmns[j]])
@ -223,7 +225,7 @@ def chunk(filename, binary=None, from_page=0, to_page=10000000000, lang="Chinese
if __name__ == "__main__": if __name__ == "__main__":
import sys import sys
def dummy(a, b): def dummy(prog=None, msg=""):
pass pass
chunk(sys.argv[1], callback=dummy) chunk(sys.argv[1], callback=dummy)

View File

@ -19,8 +19,8 @@ from .minio_conn import MINIO
from .es_conn import ELASTICSEARCH from .es_conn import ELASTICSEARCH
def rmSpace(txt): def rmSpace(txt):
txt = re.sub(r"([^a-z0-9.,]) +([^ ])", r"\1\2", txt) txt = re.sub(r"([^a-z0-9.,]) +([^ ])", r"\1\2", txt, flags=re.IGNORECASE)
return re.sub(r"([^ ]) +([^a-z0-9.,])", r"\1\2", txt) return re.sub(r"([^ ]) +([^a-z0-9.,])", r"\1\2", txt, flags=re.IGNORECASE)
def findMaxDt(fnm): def findMaxDt(fnm):