refine table parser (#120)

parent f1f09df901
commit 0feb085c88
@@ -51,6 +51,7 @@ class TaskService(CommonService):
             .join(Tenant, on=(Knowledgebase.tenant_id == Tenant.id))\
             .where(
                 Document.status == StatusEnum.VALID.value,
+                Document.run == TaskStatus.RUNNING.value,
                 ~(Document.type == FileType.VIRTUAL.value),
                 cls.model.progress == 0,
                 cls.model.update_time >= tm,
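Note on this hunk: the task query now also requires Document.run == TaskStatus.RUNNING.value, so only documents whose processing has actually been started are picked up. A minimal peewee sketch of how multiple .where() arguments AND together (stand-in model and literal status values, not the project's schema; the real query also joins Knowledgebase and Tenant):

from peewee import SqliteDatabase, Model, CharField, IntegerField

db = SqliteDatabase(":memory:")

class Document(Model):
    status = CharField()
    run = CharField()
    progress = IntegerField()

    class Meta:
        database = db

db.create_tables([Document])
Document.create(status="1", run="1", progress=0)
Document.create(status="1", run="0", progress=0)  # not running: now filtered out

# Expressions passed to .where() are ANDed, so the added run-status term
# simply tightens the existing filter.
q = Document.select().where(
    Document.status == "1",
    Document.run == "1",
    Document.progress == 0,
)
print(q.count())  # 1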
@@ -42,7 +42,9 @@ class HuPptParser(object):
                 BytesIO(fnm))
         txts = []
         self.total_page = len(ppt.slides)
-        for i, slide in enumerate(ppt.slides[from_page: to_page]):
+        for i, slide in enumerate(ppt.slides):
+            if i < from_page: continue
+            if i >= to_page: break
             texts = []
             for shape in slide.shapes:
                 txt = self.__extract(shape)
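Note on this hunk: iterating over all slides and filtering by index, instead of slicing, keeps i as the absolute slide number within the deck, which matters once tasks are dispatched per page range. A minimal sketch of the pattern with stand-in data:

# Iterate the full sequence and filter by absolute index, so `i` matches
# the real slide number instead of restarting at 0 for each window.
def pick_window(slides, from_page, to_page):
    picked = []
    for i, slide in enumerate(slides):
        if i < from_page:
            continue
        if i >= to_page:
            break
        picked.append((i, slide))  # absolute index preserved
    return picked

print(pick_window(list("abcdefg"), 2, 5))  # [(2, 'c'), (3, 'd'), (4, 'e')]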
@@ -13,6 +13,9 @@
 import copy
 import re
 from io import BytesIO
+
+from PIL import Image
+
 from rag.nlp import tokenize, is_english
 from rag.nlp import huqie
 from deepdoc.parser import PdfParser, PptParser
@@ -30,7 +33,7 @@ class Ppt(PptParser):
         for i, slide in enumerate(presentation.slides[from_page: to_page]):
             buffered = BytesIO()
             slide.get_thumbnail(0.5, 0.5).save(buffered, drawing.imaging.ImageFormat.jpeg)
-            imgs.append(buffered.getvalue())
+            imgs.append(Image.open(buffered))
         assert len(imgs) == len(txts), "Slides text and image do not match: {} vs. {}".format(len(imgs), len(txts))
         callback(0.9, "Image extraction finished")
         self.is_english = is_english(txts)
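Note on this hunk: imgs now holds PIL.Image objects instead of raw JPEG bytes, matching the new "from PIL import Image" above, so downstream code can inspect or resize slide images directly. get_thumbnail and drawing.imaging come from the slide-rendering library the project uses; this sketch only shows the Pillow side of the round trip:

from io import BytesIO
from PIL import Image

def bytes_to_image(jpeg_bytes: bytes) -> Image.Image:
    # Decode in-memory JPEG bytes into a PIL image, mirroring
    # imgs.append(Image.open(buffered)) in the hunk above.
    return Image.open(BytesIO(jpeg_bytes))

# Round-trip check with a tiny generated JPEG.
buf = BytesIO()
Image.new("RGB", (8, 8), "white").save(buf, format="JPEG")
img = bytes_to_image(buf.getvalue())
print(img.size)  # (8, 8)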
@@ -58,12 +58,9 @@ class Excel(ExcelParser):
                     continue
                 data.append(row)
                 done += 1
-                if done % 999 == 0:
-                    callback(done * 0.6 / total, ("Extract records: {}".format(len(res)) + (
-                        f"{len(fails)} failure({sheetname}), line: %s..." % (",".join(fails[:3])) if fails else "")))
             res.append(pd.DataFrame(np.array(data), columns=headers))

-        callback(0.6, ("Extract records: {}. ".format(done) + (
+        callback(0.3, ("Extract records: {}~{}".format(from_page+1, min(to_page, from_page+rn)) + (
             f"{len(fails)} failure, line: %s..." % (",".join(fails[:3])) if fails else "")))
         return res
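Note on this hunk: since the dispatcher now splits a table into row-range tasks, the parser reports one 0.3 progress tick describing the window it extracted, rather than a tick every 999 rows over the whole sheet. A minimal sketch of the reporting step, assuming the (progress, message) callback convention used throughout and the hunk's from_page/to_page/rn names:

def report_window(callback, from_page, to_page, rn, fails):
    # One progress tick for the extracted row window, as in the hunk above.
    msg = "Extract records: {}~{}".format(from_page + 1, min(to_page, from_page + rn))
    if fails:
        msg += "{} failure, line: {}...".format(len(fails), ",".join(fails[:3]))
    callback(0.3, msg)

report_window(lambda p, m: print(p, m), 0, 3000, 1200, [])  # 0.3 Extract records: 1~1200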
@@ -151,7 +148,7 @@ def chunk(filename, binary=None, from_page=0, to_page=10000000000, lang="Chinese
         headers = lines[0].split(kwargs.get("delimiter", "\t"))
         rows = []
         for i, line in enumerate(lines[1:]):
-            if from_page < from_page: continue
+            if i < from_page: continue
             if i >= to_page: break
             row = [l for l in line.split(kwargs.get("delimiter", "\t"))]
             if len(row) != len(headers):
@@ -191,12 +188,15 @@ def chunk(filename, binary=None, from_page=0, to_page=10000000000, lang="Chinese
             df[clmns[j]] = cln
             if ty == "text":
                 txts.extend([str(c) for c in cln if c])
-    clmns_map = [(py_clmns[j] + fieds_map[clmn_tys[j]], clmns[j])
+    clmns_map = [(py_clmns[i] + fieds_map[clmn_tys[i]], clmns[i])
                  for i in range(len(clmns))]

    eng = lang.lower() == "english"#is_english(txts)
    for ii, row in df.iterrows():
-        d = {}
+        d = {
+            "docnm_kwd": filename,
+            "title_tks": huqie.qie(re.sub(r"\.[a-zA-Z]+$", "", filename))
+        }
        row_txt = []
        for j in range(len(clmns)):
            if row[clmns[j]] is None:
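Note on this hunk: besides fixing the stray j indices in clmns_map, each per-row document is now seeded with its source file name, raw in docnm_kwd and tokenized with the extension stripped in title_tks, so table chunks stay retrievable by file name. A sketch of the seeding step; a plain tokenize_fn stands in for huqie.qie so the snippet runs anywhere:

import re

def seed_doc(filename, tokenize_fn=lambda s: s.lower()):
    # Strip a trailing extension like ".xlsx" before tokenizing the title,
    # mirroring re.sub(r"\.[a-zA-Z]+$", "", filename) in the hunk above.
    return {
        "docnm_kwd": filename,
        "title_tks": tokenize_fn(re.sub(r"\.[a-zA-Z]+$", "", filename)),
    }

print(seed_doc("sales_2023.xlsx"))
# {'docnm_kwd': 'sales_2023.xlsx', 'title_tks': 'sales_2023'}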
@@ -91,10 +91,10 @@ def dispatch():
                 tsks.append(task)
         elif r["parser_id"] == "table":
             rn = HuExcelParser.row_number(r["name"], MINIO.get(r["kb_id"], r["location"]))
-            for i in range(0, rn, 1000):
+            for i in range(0, rn, 3000):
                 task = new_task()
                 task["from_page"] = i
-                task["to_page"] = min(i + 1000, rn)
+                task["to_page"] = min(i + 3000, rn)
                 tsks.append(task)
         else:
             tsks.append(new_task())
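Note on this hunk: a table file is now split into tasks of 3,000 rows instead of 1,000, so each spreadsheet produces fewer, larger tasks. A minimal sketch of the splitting logic:

def split_rows(rn, page=3000):
    # Split `rn` rows into [from_page, to_page) tasks, as dispatch() does above.
    return [{"from_page": i, "to_page": min(i + page, rn)}
            for i in range(0, rn, page)]

print(split_rows(7000))
# [{'from_page': 0, 'to_page': 3000},
#  {'from_page': 3000, 'to_page': 6000},
#  {'from_page': 6000, 'to_page': 7000}]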
@@ -128,8 +128,6 @@ def build(row):

         return

-    callback(msg="Finished slicing files(%d). Start to embedding the content."%len(cks))
-
     docs = []
     doc = {
         "doc_id": row["doc_id"],
@@ -179,8 +177,8 @@ def embedding(docs, mdl, parser_config={}, callback=None):
         tk_count += c

     cnts_ = np.array([])
-    for i in range(0, len(cnts), 32):
-        vts, c = mdl.encode(cnts[i: i+32])
+    for i in range(0, len(cnts), 8):
+        vts, c = mdl.encode(cnts[i: i+8])
         if len(cnts_) == 0: cnts_ = vts
         else: cnts_ = np.concatenate((cnts_, vts), axis=0)
         tk_count += c
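Note on this hunk: content embedding now encodes 8 chunks per call instead of 32, presumably trading throughput for a smaller peak batch when the embedding model chokes on large inputs. A minimal sketch of the accumulate pattern with a stubbed encoder:

import numpy as np

def encode_in_batches(encode, cnts, batch=8):
    # Accumulate per-batch embeddings into one (len(cnts), dim) matrix,
    # mirroring the np.concatenate loop in the hunk above.
    cnts_ = np.array([])
    tk_count = 0
    for i in range(0, len(cnts), batch):
        vts, c = encode(cnts[i: i + batch])
        cnts_ = vts if len(cnts_) == 0 else np.concatenate((cnts_, vts), axis=0)
        tk_count += c
    return cnts_, tk_count

# Stub encoder: 4-dim zero vectors, 1 token per text.
vecs, n = encode_in_batches(lambda xs: (np.zeros((len(xs), 4)), len(xs)), ["a"] * 10)
print(vecs.shape, n)  # (10, 4) 10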
@@ -226,6 +224,7 @@ def main(comm, mod):
             continue
         # TODO: exception handler
         ## set_progress(r["did"], -1, "ERROR: ")
+        callback(msg="Finished slicing files(%d). Start to embedding the content."%len(cks))
         try:
             tk_count = embedding(cks, embd_mdl, r["parser_config"], callback)
         except Exception as e: