Some document API refined. (#53)

Add naive chunking method to RAG
KevinHuSh 2024-02-02 19:21:37 +08:00 committed by GitHub
parent 7b71fb2db6
commit 51482f3e2a
13 changed files with 447 additions and 268 deletions

View File

@@ -133,9 +133,9 @@ def list():
     orderby = request.args.get("orderby", "create_time")
     desc = request.args.get("desc", True)
     try:
-        docs = DocumentService.get_by_kb_id(
+        docs, tol = DocumentService.get_by_kb_id(
             kb_id, page_number, items_per_page, orderby, desc, keywords)
-        return get_json_result(data=docs)
+        return get_json_result(data={"total":tol, "docs": docs})
     except Exception as e:
         return server_error_response(e)
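
With this change the document-list endpoint reports the total match count alongside the current page, so a paginated UI does not need a second counting query. A rough sketch of the new payload shape (only the contents of "data" come from this diff; the envelope and the sample values are illustrative):

    resp = {
        "data": {
            "total": 42,                     # documents matching kb_id and keywords
            "docs": [                        # at most items_per_page entries
                {"name": "handbook.pdf", "kb_id": "kb1"},
            ],
        }
    }
    docs, total = resp["data"]["docs"], resp["data"]["total"]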
@@ -228,20 +228,18 @@ def run():
 @manager.route('/rename', methods=['POST'])
 @login_required
-@validate_request("doc_id", "name", "old_name")
+@validate_request("doc_id", "name")
 def rename():
     req = request.json
-    if pathlib.Path(req["name"].lower()).suffix != pathlib.Path(
-            req["old_name"].lower()).suffix:
-        get_json_result(
-            data=False,
-            retmsg="The extension of file can't be changed",
-            retcode=RetCode.ARGUMENT_ERROR)
     try:
         e, doc = DocumentService.get_by_id(req["doc_id"])
         if not e:
             return get_data_error_result(retmsg="Document not found!")
+        if pathlib.Path(req["name"].lower()).suffix != pathlib.Path(doc.name.lower()).suffix:
+            return get_json_result(
+                data=False,
+                retmsg="The extension of file can't be changed",
+                retcode=RetCode.ARGUMENT_ERROR)
         if DocumentService.query(name=req["name"], kb_id=doc.kb_id):
             return get_data_error_result(
                 retmsg="Duplicated document name in the same knowledgebase.")

View File

@@ -36,6 +36,7 @@ class DocumentService(CommonService):
                 cls.model.name.like(f"%%{keywords}%%"))
         else:
             docs = cls.model.select().where(cls.model.kb_id == kb_id)
+        count = docs.count()
         if desc:
             docs = docs.order_by(cls.model.getter_by(orderby).desc())
         else:
@@ -43,7 +44,7 @@ class DocumentService(CommonService):
         docs = docs.paginate(page_number, items_per_page)

-        return list(docs.dicts())
+        return list(docs.dicts()), count

     @classmethod
     @DB.connection_context()

View File

@@ -1,91 +0,0 @@
import re
from nltk import word_tokenize
from rag.nlp import stemmer, huqie
BULLET_PATTERN = [[
r"第[零一二三四五六七八九十百]+(编|部分)",
r"第[零一二三四五六七八九十百]+章",
r"第[零一二三四五六七八九十百]+节",
r"第[零一二三四五六七八九十百]+条",
r"[\(][零一二三四五六七八九十百]+[\)]",
], [
r"[0-9]{,3}[\. 、]",
r"[0-9]{,2}\.[0-9]{,2}",
r"[0-9]{,2}\.[0-9]{,2}\.[0-9]{,2}",
r"[0-9]{,2}\.[0-9]{,2}\.[0-9]{,2}\.[0-9]{,2}",
], [
r"第[零一二三四五六七八九十百]+章",
r"第[零一二三四五六七八九十百]+节",
r"[零一二三四五六七八九十百]+[ 、]",
r"[\(][零一二三四五六七八九十百]+[\)]",
r"[\(][0-9]{,2}[\)]",
] ,[
r"PART (ONE|TWO|THREE|FOUR|FIVE|SIX|SEVEN|EIGHT|NINE|TEN)",
r"Chapter (I+V?|VI*|XI|IX|X)",
r"Section [0-9]+",
r"Article [0-9]+"
]
]
def bullets_category(sections):
global BULLET_PATTERN
hits = [0] * len(BULLET_PATTERN)
for i, pro in enumerate(BULLET_PATTERN):
for sec in sections:
for p in pro:
if re.match(p, sec):
hits[i] += 1
break
maxium = 0
res = -1
for i,h in enumerate(hits):
if h <= maxium:continue
res = i
maxium = h
return res
def is_english(texts):
eng = 0
for t in texts:
if re.match(r"[a-zA-Z]{2,}", t.strip()):
eng += 1
if eng / len(texts) > 0.8:
return True
return False
def tokenize(d, t, eng):
d["content_with_weight"] = t
if eng:
t = re.sub(r"([a-z])-([a-z])", r"\1\2", t)
d["content_ltks"] = " ".join([stemmer.stem(w) for w in word_tokenize(t)])
else:
d["content_ltks"] = huqie.qie(t)
d["content_sm_ltks"] = huqie.qieqie(d["content_ltks"])
def remove_contents_table(sections, eng=False):
i = 0
while i < len(sections):
def get(i):
nonlocal sections
return (sections[i] if type(sections[i]) == type("") else sections[i][0]).strip()
if not re.match(r"(contents|目录|目次|table of contents|致谢|acknowledge)$", re.sub(r"( | |\u3000)+", "", get(i).split("@@")[0], re.IGNORECASE)):
i += 1
continue
sections.pop(i)
if i >= len(sections): break
prefix = get(i)[:3] if not eng else " ".join(get(i).split(" ")[:2])
while not prefix:
sections.pop(i)
if i >= len(sections): break
prefix = get(i)[:3] if not eng else " ".join(get(i).split(" ")[:2])
sections.pop(i)
if i >= len(sections) or not prefix: break
for j in range(i, min(i+128, len(sections))):
if not re.match(prefix, get(j)):
continue
for _ in range(i, j):sections.pop(i)
break

View File

@@ -1,10 +1,9 @@
 import copy
 import random
 import re
-from io import BytesIO
-from docx import Document
 import numpy as np
-from rag.app import bullets_category, BULLET_PATTERN, is_english, tokenize, remove_contents_table
+from rag.parser import bullets_category, BULLET_PATTERN, is_english, tokenize, remove_contents_table, \
+    hierarchical_merge, make_colon_as_title, naive_merge
 from rag.nlp import huqie
 from rag.parser.docx_parser import HuDocxParser
 from rag.parser.pdf_parser import HuParser
@@ -28,7 +27,6 @@ class Pdf(HuParser):
         self._table_transformer_job(zoomin)
         callback(0.68, "Table analysis finished")
         self._text_merge()
-        column_width = np.median([b["x1"] - b["x0"] for b in self.boxes])
         self._concat_downward(concat_between_pages=False)
         self._filter_forpages()
         self._merge_with_same_bullet()
@@ -37,10 +35,10 @@ class Pdf(HuParser):
         callback(0.8, "Text extraction finished")
-        return [(b["text"] + self._line_tag(b, zoomin), b.get("layoutno","")) for b in self.boxes]
+        return [(b["text"] + self._line_tag(b, zoomin), b.get("layoutno","")) for b in self.boxes], tbls


-def chunk(filename, binary=None, from_page=0, to_page=100000, callback=None):
+def chunk(filename, binary=None, from_page=0, to_page=100000, callback=None, **kwargs):
     doc = {
         "docnm_kwd": filename,
         "title_tks": huqie.qie(re.sub(r"\.[a-zA-Z]+$", "", filename))
@@ -52,8 +50,8 @@ def chunk(filename, binary=None, from_page=0, to_page=100000, callback=None):
         callback(0.1, "Start to parse.")
         doc_parser = HuDocxParser()
         # TODO: table of contents need to be removed
-        sections, tbls = doc_parser(binary if binary else filename)
-        remove_contents_table(sections, eng = is_english(random.choices([t for t,_ in sections], k=200)))
+        sections, tbls = doc_parser(binary if binary else filename, from_page=from_page, to_page=to_page)
+        remove_contents_table(sections, eng=is_english(random.choices([t for t,_ in sections], k=200)))
         callback(0.8, "Finish parsing.")
     elif re.search(r"\.pdf$", filename, re.IGNORECASE):
         pdf_parser = Pdf()
@@ -75,54 +73,12 @@ def chunk(filename, binary=None, from_page=0, to_page=100000, callback=None):
         callback(0.8, "Finish parsing.")
     else: raise NotImplementedError("file type not supported yet(docx, pdf, txt supported)")

-    bull = bullets_category([b["text"] for b in random.choices([t for t,_ in sections], k=100)])
-    projs = [len(BULLET_PATTERN[bull]) + 1] * len(sections)
-    levels = [[]] * len(BULLET_PATTERN[bull]) + 2
-    for i, (txt, layout) in enumerate(sections):
-        for j, p in enumerate(BULLET_PATTERN[bull]):
-            if re.match(p, txt.strip()):
-                projs[i] = j
-                levels[j].append(i)
-                break
-        else:
-            if re.search(r"(title|head)", layout):
-                projs[i] = BULLET_PATTERN[bull]
-                levels[BULLET_PATTERN[bull]].append(i)
-            else:
-                levels[BULLET_PATTERN[bull] + 1].append(i)
-    sections = [t for t,_ in sections]
-
-    def binary_search(arr, target):
-        if target > arr[-1]: return len(arr) - 1
-        if target > arr[0]: return -1
-        s, e = 0, len(arr)
-        while e - s > 1:
-            i = (e + s) // 2
-            if target > arr[i]:
-                s = i
-                continue
-            elif target < arr[i]:
-                e = i
-                continue
-            else:
-                assert False
-        return s
-
-    cks = []
-    readed = [False] * len(sections)
-    levels = levels[::-1]
-    for i, arr in enumerate(levels):
-        for j in arr:
-            if readed[j]: continue
-            readed[j] = True
-            cks.append([j])
-            if i + 1 == len(levels) - 1: continue
-            for ii in range(i + 1, len(levels)):
-                jj = binary_search(levels[ii], j)
-                if jj < 0: break
-                if jj > cks[-1][-1]: cks[-1].pop(-1)
-                cks[-1].append(levels[ii][jj])
+    make_colon_as_title(sections)
+    bull = bullets_category([t for t in random.choices([t for t,_ in sections], k=100)])
+    if bull >= 0: cks = hierarchical_merge(bull, sections, 3)
+    else: cks = naive_merge(sections, kwargs.get("chunk_token_num", 256), kwargs.get("delimer", "\n。;!?"))
+    sections = [t for t, _ in sections]

     # is it English
     eng = is_english(random.choices(sections, k=218))
@@ -138,11 +94,11 @@ def chunk(filename, binary=None, from_page=0, to_page=100000, callback=None):
         tokenize(d, r, eng)
         d["image"] = img
         res.append(d)
-        print("TABLE", d["content_with_weight"])
     # wrap up to es documents
     for ck in cks:
-        print("\n-".join(ck[::-1]))
-        ck = "\n".join(ck[::-1])
         d = copy.deepcopy(doc)
+        ck = "\n".join(ck)
         if pdf_parser:
             d["image"] = pdf_parser.crop(ck)
             ck = pdf_parser.remove_tag(ck)
@@ -153,4 +109,6 @@
 if __name__ == "__main__":
     import sys
-    chunk(sys.argv[1])
+    def dummy(a, b):
+        pass
+    chunk(sys.argv[1], from_page=1, to_page=10, callback=dummy)
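
Taken together, this chunker now delegates all merging to the shared helpers in rag.parser: colon-terminated lines are promoted to titles, the dominant bullet family is detected, and only when no outline style is recognized does it fall back to fixed-size merging. A condensed sketch of that dispatch (identifiers as imported above; the 256-token budget and delimiter string mirror the kwargs defaults in this diff, and hierarchical_merge returns grouped section lists that the caller later joins with newlines):

    from rag.parser import bullets_category, hierarchical_merge, make_colon_as_title, naive_merge

    def merge_sections(sections, chunk_token_num=256, delimiter="\n。;!?"):
        # sections: list of (text, layout_tag) pairs produced by the docx/pdf parsers
        make_colon_as_title(sections)                      # turn "xxx:" lines into titles
        bull = bullets_category([t for t, _ in sections])  # dominant bullet/outline family
        if bull >= 0:                                      # an outline style was recognized
            return hierarchical_merge(bull, sections, 3)   # lists of related section texts
        return naive_merge(sections, chunk_token_num, delimiter)  # flat, token-budgeted chunks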

View File

@@ -3,10 +3,12 @@ import re
 from io import BytesIO
 from docx import Document
 import numpy as np
-from rag.app import bullets_category, BULLET_PATTERN, is_english, tokenize
+from rag.parser import bullets_category, is_english, tokenize, remove_contents_table, hierarchical_merge, \
+    make_colon_as_title
 from rag.nlp import huqie
 from rag.parser.docx_parser import HuDocxParser
 from rag.parser.pdf_parser import HuParser
+from rag.settings import cron_logger


 class Docx(HuDocxParser):
@@ -17,10 +19,20 @@ class Docx(HuDocxParser):
         line = re.sub(r"\u3000", " ", line).strip()
         return line

-    def __call__(self, filename, binary=None):
+    def __call__(self, filename, binary=None, from_page=0, to_page=100000):
         self.doc = Document(
             filename) if not binary else Document(BytesIO(binary))
-        lines = [self.__clean(p.text) for p in self.doc.paragraphs]
+        pn = 0
+        lines = []
+        for p in self.doc.paragraphs:
+            if pn > to_page:break
+            if from_page <= pn < to_page and p.text.strip(): lines.append(self.__clean(p.text))
+            for run in p.runs:
+                if 'lastRenderedPageBreak' in run._element.xml:
+                    pn += 1
+                    continue
+                if 'w:br' in run._element.xml and 'type="page"' in run._element.xml:
+                    pn += 1
         return [l for l in lines if l]
@@ -38,49 +50,15 @@ class Pdf(HuParser):
         start = timer()
         self._layouts_paddle(zoomin)
         callback(0.77, "Layout analysis finished")
-        print("paddle layouts:", timer()-start)
-        bxs = self.sort_Y_firstly(self.boxes, np.median(self.mean_height) / 3)
-
-        # is it English
-        eng = is_english([b["text"] for b in bxs])
-        # Merge vertically
-        i = 0
-        while i + 1 < len(bxs):
-            b = bxs[i]
-            b_ = bxs[i + 1]
-            if b["page_number"] < b_["page_number"] and re.match(r"[0-9 •一—-]+$", b["text"]):
-                bxs.pop(i)
-                continue
-            concatting_feats = [
-                b["text"].strip()[-1] in ",;:'\",、‘“;:-",
-                len(b["text"].strip())>1 and b["text"].strip()[-2] in ",;:'\",‘“、;:",
-                b["text"].strip()[0] in "。;?!?”)),,、:",
-            ]
-            # features for not concating
-            feats = [
-                b.get("layoutno",0) != b.get("layoutno",0),
-                b["text"].strip()[-1] in "。?!?",
-                eng and b["text"].strip()[-1] in ".!?",
-                b["page_number"] == b_["page_number"] and b_["top"] - \
-                    b["bottom"] > self.mean_height[b["page_number"] - 1] * 1.5,
-                b["page_number"] < b_["page_number"] and abs(
-                    b["x0"] - b_["x0"]) > self.mean_width[b["page_number"] - 1] * 4
-            ]
-            if any(feats) and not any(concatting_feats):
-                i += 1
-                continue
-            # merge up and down
-            b["bottom"] = b_["bottom"]
-            b["text"] += b_["text"]
-            b["x0"] = min(b["x0"], b_["x0"])
-            b["x1"] = max(b["x1"], b_["x1"])
-            bxs.pop(i + 1)
+        cron_logger.info("paddle layouts:".format((timer()-start)/(self.total_page+0.1)))
+        self._naive_vertical_merge()
         callback(0.8, "Text extraction finished")
-        return [b["text"] + self._line_tag(b, zoomin) for b in bxs]
+        return [b["text"] + self._line_tag(b, zoomin) for b in self.boxes]


-def chunk(filename, binary=None, from_page=0, to_page=100000, callback=None):
+def chunk(filename, binary=None, from_page=0, to_page=100000, callback=None, **kwargs):
     doc = {
         "docnm_kwd": filename,
         "title_tks": huqie.qie(re.sub(r"\.[a-zA-Z]+$", "", filename))
@@ -116,50 +94,12 @@ def chunk(filename, binary=None, from_page=0, to_page=100000, callback=None):
     # is it English
     eng = is_english(sections)
     # Remove 'Contents' part
-    i = 0
-    while i < len(sections):
-        if not re.match(r"(contents|目录|目次|table of contents)$", re.sub(r"( | |\u3000)+", "", sections[i].split("@@")[0], re.IGNORECASE)):
-            i += 1
-            continue
-        sections.pop(i)
-        if i >= len(sections): break
-        prefix = sections[i].strip()[:3] if not eng else " ".join(sections[i].strip().split(" ")[:2])
-        while not prefix:
-            sections.pop(i)
-            if i >= len(sections): break
-            prefix = sections[i].strip()[:3] if not eng else " ".join(sections[i].strip().split(" ")[:2])
-        sections.pop(i)
-        if i >= len(sections) or not prefix: break
-        for j in range(i, min(i+128, len(sections))):
-            if not re.match(prefix, sections[j]):
-                continue
-            for _ in range(i, j):sections.pop(i)
-            break
+    remove_contents_table(sections, eng)
+    make_colon_as_title(sections)
     bull = bullets_category(sections)
-    projs = [len(BULLET_PATTERN[bull])] * len(sections)
-    for i, sec in enumerate(sections):
-        for j,p in enumerate(BULLET_PATTERN[bull]):
-            if re.match(p, sec.strip()):
-                projs[i] = j
-                break
-
-    readed = [0] * len(sections)
-    cks = []
-    for pr in range(len(BULLET_PATTERN[bull])-1, 1, -1):
-        for i in range(len(sections)):
-            if readed[i] or projs[i] < pr:
-                continue
-            # find father and grand-father and grand...father
-            p = projs[i]
-            readed[i] = 1
-            ck = [sections[i]]
-            for j in range(i-1, -1, -1):
-                if projs[j] >= p:continue
-                ck.append(sections[j])
-                readed[j] = 1
-                p = projs[j]
-                if p == 0: break
-            cks.append(ck[::-1])
+    cks = hierarchical_merge(bull, sections, 3)
+    if not cks: callback(0.99, "No chunk parsed out.")

     res = []
     # wrap up to es documents
@@ -177,4 +117,6 @@
 if __name__ == "__main__":
     import sys
-    chunk(sys.argv[1])
+    def dummy(a, b):
+        pass
+    chunk(sys.argv[1], callback=dummy)

View File

@@ -1,6 +1,6 @@
 import copy
 import re
-from rag.app import tokenize
+from rag.parser import tokenize
 from rag.nlp import huqie
 from rag.parser.pdf_parser import HuParser
 from rag.utils import num_tokens_from_string
@@ -57,7 +57,7 @@ class Pdf(HuParser):
         return [b["text"] + self._line_tag(b, zoomin) for b in self.boxes], tbls


-def chunk(filename, binary=None, from_page=0, to_page=100000, callback=None):
+def chunk(filename, binary=None, from_page=0, to_page=100000, callback=None, **kwargs):
     pdf_parser = None
     paper = {}
@@ -117,5 +117,6 @@
 if __name__ == "__main__":
     import sys
-    chunk(sys.argv[1])
+    def dummy(a, b):
+        pass
+    chunk(sys.argv[1], callback=dummy)

rag/app/naive.py (new file, 79 lines)
View File

@@ -0,0 +1,79 @@
import copy
import re
from rag.app import laws
from rag.parser import is_english, tokenize, naive_merge
from rag.nlp import huqie
from rag.parser.pdf_parser import HuParser
from rag.settings import cron_logger
class Pdf(HuParser):
def __call__(self, filename, binary=None, from_page=0,
to_page=100000, zoomin=3, callback=None):
self.__images__(
filename if not binary else binary,
zoomin,
from_page,
to_page)
callback(0.1, "OCR finished")
from timeit import default_timer as timer
start = timer()
self._layouts_paddle(zoomin)
callback(0.77, "Layout analysis finished")
cron_logger.info("paddle layouts:".format((timer()-start)/(self.total_page+0.1)))
self._naive_vertical_merge()
return [(b["text"], self._line_tag(b, zoomin)) for b in self.boxes]
def chunk(filename, binary=None, from_page=0, to_page=100000, callback=None, **kwargs):
doc = {
"docnm_kwd": filename,
"title_tks": huqie.qie(re.sub(r"\.[a-zA-Z]+$", "", filename))
}
doc["title_sm_tks"] = huqie.qieqie(doc["title_tks"])
pdf_parser = None
sections = []
if re.search(r"\.docx?$", filename, re.IGNORECASE):
callback(0.1, "Start to parse.")
for txt in laws.Docx()(filename, binary):
sections.append((txt, ""))
callback(0.8, "Finish parsing.")
elif re.search(r"\.pdf$", filename, re.IGNORECASE):
pdf_parser = Pdf()
sections = pdf_parser(filename if not binary else binary,
from_page=from_page, to_page=to_page, callback=callback)
elif re.search(r"\.txt$", filename, re.IGNORECASE):
callback(0.1, "Start to parse.")
txt = ""
if binary:txt = binary.decode("utf-8")
else:
with open(filename, "r") as f:
while True:
l = f.readline()
if not l:break
txt += l
sections = txt.split("\n")
sections = [(l,"") for l in sections if l]
callback(0.8, "Finish parsing.")
else: raise NotImplementedError("file type not supported yet(docx, pdf, txt supported)")
cks = naive_merge(sections, kwargs.get("chunk_token_num", 128), kwargs.get("delimer", "\n。;!?"))
eng = is_english(cks)
res = []
# wrap up to es documents
for ck in cks:
print("--", ck)
d = copy.deepcopy(doc)
if pdf_parser:
d["image"] = pdf_parser.crop(ck)
ck = pdf_parser.remove_tag(ck)
tokenize(d, ck, eng)
res.append(d)
return res
if __name__ == "__main__":
import sys
def dummy(a, b):
pass
chunk(sys.argv[1], from_page=0, to_page=10, callback=dummy)
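
Because chunk() forwards **kwargs into naive_merge, the token budget and the delimiter set can be tuned per call without touching the parser. A small usage sketch (the keyword names, including the delimer spelling, are exactly what the code above reads; sample.pdf is a placeholder path):

    from rag.app.naive import chunk

    def progress(prob, msg):
        print(f"{prob:.2f} {msg}")

    # 256-token chunks, split on newlines and Chinese sentence enders, pages 0-9 only.
    docs = chunk("sample.pdf", from_page=0, to_page=10, callback=progress,
                 chunk_token_num=256, delimer="\n。;!?")
    print(len(docs), "chunks")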

View File

@@ -1,7 +1,7 @@
 import copy
 import re
 from collections import Counter
-from rag.app import tokenize
+from rag.parser import tokenize
 from rag.nlp import huqie
 from rag.parser.pdf_parser import HuParser
 import numpy as np
@@ -113,7 +113,7 @@ class Pdf(HuParser):
     }


-def chunk(filename, binary=None, from_page=0, to_page=100000, callback=None):
+def chunk(filename, binary=None, from_page=0, to_page=100000, callback=None, **kwargs):
     pdf_parser = None
     paper = {}
@@ -232,5 +232,6 @@
 if __name__ == "__main__":
     import sys
-    chunk(sys.argv[1])
+    def dummy(a, b):
+        pass
+    chunk(sys.argv[1], callback=dummy)

View File

@@ -3,7 +3,7 @@ import re
 from io import BytesIO
 from pptx import Presentation
-from rag.app import tokenize, is_english
+from rag.parser import tokenize, is_english
 from rag.nlp import huqie
 from rag.parser.pdf_parser import HuParser
@@ -93,7 +93,7 @@ class Pdf(HuParser):
         return res


-def chunk(filename, binary=None, from_page=0, to_page=100000, callback=None):
+def chunk(filename, binary=None, from_page=0, to_page=100000, callback=None, **kwargs):
     doc = {
         "docnm_kwd": filename,
         "title_tks": huqie.qie(re.sub(r"\.[a-zA-Z]+$", "", filename))
@@ -122,5 +122,7 @@
 if __name__== "__main__":
     import sys
-    print(chunk(sys.argv[1]))
+    def dummy(a, b):
+        pass
+    chunk(sys.argv[1], callback=dummy)

View File

@@ -3,7 +3,7 @@ import re
 from io import BytesIO
 from nltk import word_tokenize
 from openpyxl import load_workbook
-from rag.app import is_english
+from rag.parser import is_english
 from rag.nlp import huqie, stemmer
@@ -55,7 +55,7 @@ def beAdoc(d, q, a, eng):
     return d


-def chunk(filename, binary=None, from_page=0, to_page=100000, callback=None):
+def chunk(filename, binary=None, callback=None, **kwargs):
     res = []
     if re.search(r"\.xlsx?$", filename, re.IGNORECASE):
@@ -98,7 +98,7 @@
 if __name__== "__main__":
     import sys
-    def kk(rat, ss):
+    def dummy(a, b):
         pass
-    print(chunk(sys.argv[1], callback=kk))
+    chunk(sys.argv[1], callback=dummy)

View File

@@ -1,3 +1,220 @@
import copy
from .pdf_parser import HuParser as PdfParser
from .docx_parser import HuDocxParser as DocxParser
from .excel_parser import HuExcelParser as ExcelParser
import re
from nltk import word_tokenize
from rag.nlp import stemmer, huqie
from ..utils import num_tokens_from_string
BULLET_PATTERN = [[
r"第[零一二三四五六七八九十百0-9]+(分?编|部分)",
r"第[零一二三四五六七八九十百0-9]+章",
r"第[零一二三四五六七八九十百0-9]+节",
r"第[零一二三四五六七八九十百0-9]+条",
r"[\(][零一二三四五六七八九十百]+[\)]",
], [
r"第[0-9]+章",
r"第[0-9]+节",
r"[0-9]{,3}[\. 、]",
r"[0-9]{,2}\.[0-9]{,2}",
r"[0-9]{,2}\.[0-9]{,2}\.[0-9]{,2}",
r"[0-9]{,2}\.[0-9]{,2}\.[0-9]{,2}\.[0-9]{,2}",
], [
r"第[零一二三四五六七八九十百0-9]+章",
r"第[零一二三四五六七八九十百0-9]+节",
r"[零一二三四五六七八九十百]+[ 、]",
r"[\(][零一二三四五六七八九十百]+[\)]",
r"[\(][0-9]{,2}[\)]",
], [
r"PART (ONE|TWO|THREE|FOUR|FIVE|SIX|SEVEN|EIGHT|NINE|TEN)",
r"Chapter (I+V?|VI*|XI|IX|X)",
r"Section [0-9]+",
r"Article [0-9]+"
]
]
def bullets_category(sections):
global BULLET_PATTERN
hits = [0] * len(BULLET_PATTERN)
for i, pro in enumerate(BULLET_PATTERN):
for sec in sections:
for p in pro:
if re.match(p, sec):
hits[i] += 1
break
maxium = 0
res = -1
for i, h in enumerate(hits):
if h <= maxium: continue
res = i
maxium = h
return res
def is_english(texts):
eng = 0
for t in texts:
if re.match(r"[a-zA-Z]{2,}", t.strip()):
eng += 1
if eng / len(texts) > 0.8:
return True
return False
def tokenize(d, t, eng):
d["content_with_weight"] = t
if eng:
t = re.sub(r"([a-z])-([a-z])", r"\1\2", t)
d["content_ltks"] = " ".join([stemmer.stem(w) for w in word_tokenize(t)])
else:
d["content_ltks"] = huqie.qie(t)
d["content_sm_ltks"] = huqie.qieqie(d["content_ltks"])
def remove_contents_table(sections, eng=False):
i = 0
while i < len(sections):
def get(i):
nonlocal sections
return (sections[i] if type(sections[i]) == type("") else sections[i][0]).strip()
if not re.match(r"(contents|目录|目次|table of contents|致谢|acknowledge)$",
re.sub(r"( | |\u3000)+", "", get(i).split("@@")[0], re.IGNORECASE)):
i += 1
continue
sections.pop(i)
if i >= len(sections): break
prefix = get(i)[:3] if not eng else " ".join(get(i).split(" ")[:2])
while not prefix:
sections.pop(i)
if i >= len(sections): break
prefix = get(i)[:3] if not eng else " ".join(get(i).split(" ")[:2])
sections.pop(i)
if i >= len(sections) or not prefix: break
for j in range(i, min(i + 128, len(sections))):
if not re.match(prefix, get(j)):
continue
for _ in range(i, j): sections.pop(i)
break
def make_colon_as_title(sections):
if not sections: return []
if type(sections[0]) == type(""): return sections
i = 0
while i < len(sections):
txt, layout = sections[i]
i += 1
txt = txt.split("@")[0].strip()
if not txt:
continue
if txt[-1] not in ":":
continue
txt = txt[::-1]
arr = re.split(r"([。?!!?;]| .)", txt)
if len(arr) < 2 or len(arr[1]) < 32:
continue
sections.insert(i - 1, (arr[0][::-1], "title"))
i += 1
def hierarchical_merge(bull, sections, depth):
if not sections or bull < 0: return []
if type(sections[0]) == type(""): sections = [(s, "") for s in sections]
sections = [(t,o) for t, o in sections if t and len(t.split("@")[0].strip()) > 1 and not re.match(r"[0-9]+$", t.split("@")[0].strip())]
bullets_size = len(BULLET_PATTERN[bull])
levels = [[] for _ in range(bullets_size + 2)]
def not_title(txt):
if re.match(r"第[零一二三四五六七八九十百0-9]+条", txt): return False
if len(txt) >= 128: return True
return re.search(r"[,;,。;!!]", txt)
for i, (txt, layout) in enumerate(sections):
for j, p in enumerate(BULLET_PATTERN[bull]):
if re.match(p, txt.strip()) and not not_title(txt):
levels[j].append(i)
break
else:
if re.search(r"(title|head)", layout):
levels[bullets_size].append(i)
else:
levels[bullets_size + 1].append(i)
sections = [t for t, _ in sections]
for s in sections: print("--", s)
def binary_search(arr, target):
if not arr: return -1
if target > arr[-1]: return len(arr) - 1
if target < arr[0]: return -1
s, e = 0, len(arr)
while e - s > 1:
i = (e + s) // 2
if target > arr[i]:
s = i
continue
elif target < arr[i]:
e = i
continue
else:
assert False
return s
cks = []
readed = [False] * len(sections)
levels = levels[::-1]
for i, arr in enumerate(levels[:depth]):
for j in arr:
if readed[j]: continue
readed[j] = True
cks.append([j])
if i + 1 == len(levels) - 1: continue
for ii in range(i + 1, len(levels)):
jj = binary_search(levels[ii], j)
if jj < 0: continue
if jj > cks[-1][-1]: cks[-1].pop(-1)
cks[-1].append(levels[ii][jj])
for ii in cks[-1]: readed[ii] = True
for i in range(len(cks)):
cks[i] = [sections[j] for j in cks[i][::-1]]
print("--------------\n", "\n* ".join(cks[i]))
return cks
def naive_merge(sections, chunk_token_num=128, delimiter="\n。;!?"):
if not sections: return []
if type(sections[0]) == type(""): sections = [(s, "") for s in sections]
cks = [""]
tk_nums = [0]
def add_chunk(t, pos):
nonlocal cks, tk_nums, delimiter
tnum = num_tokens_from_string(t)
if tnum < 8: pos = ""
if tk_nums[-1] > chunk_token_num:
cks.append(t + pos)
tk_nums.append(tnum)
else:
cks[-1] += t + pos
tk_nums[-1] += tnum
for sec, pos in sections:
s, e = 0, 1
while e < len(sec):
if sec[e] in delimiter:
add_chunk(sec[s: e+1], pos)
s = e + 1
e = s + 1
else:
e += 1
if s < e: add_chunk(sec[s: e], pos)
return cks
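
A quick sketch of naive_merge in isolation: bare strings are wrapped into (text, "") pairs by the function itself, and a new chunk is opened once the running token count of the current chunk exceeds chunk_token_num (counting comes from rag.utils.num_tokens_from_string; the budget of 8 below is deliberately tiny so the two sentences land in separate chunks):

    from rag.parser import naive_merge

    paras = [
        "RAGFlow splits documents into retrievable chunks.",
        "Naive merging packs text until the token budget is exceeded.",
    ]
    for ck in naive_merge(paras, chunk_token_num=8, delimiter="\n。;!?"):
        print("--", ck)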

View File

@@ -98,8 +98,19 @@ class HuDocxParser:
             return lines
         return ["\n".join(lines)]

-    def __call__(self, fnm):
+    def __call__(self, fnm, from_page=0, to_page=100000):
         self.doc = Document(fnm) if isinstance(fnm, str) else Document(BytesIO(fnm))
-        secs = [(p.text, p.style.name) for p in self.doc.paragraphs]
+        pn = 0
+        secs = []
+        for p in self.doc.paragraphs:
+            if pn > to_page: break
+            if from_page <= pn < to_page and p.text.strip(): secs.append((p.text, p.style.name))
+            for run in p.runs:
+                if 'lastRenderedPageBreak' in run._element.xml:
+                    pn += 1
+                    continue
+                if 'w:br' in run._element.xml and 'type="page"' in run._element.xml:
+                    pn += 1
         tbls = [self.__extract_table_content(tb) for tb in self.doc.tables]
         return secs, tbls
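
HuDocxParser.__call__ now honours the same page window as the app-level Docx wrapper, inferring page numbers from lastRenderedPageBreak markers and explicit <w:br type="page"> breaks in each run. A minimal usage sketch (manual.docx is a placeholder; the return value is still the (sections, tables) pair):

    from rag.parser.docx_parser import HuDocxParser

    parser = HuDocxParser()
    secs, tbls = parser("manual.docx", from_page=0, to_page=5)  # paragraphs on pages [0, 5)
    for text, style in secs[:10]:
        print(style, "|", text)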

View File

@@ -650,6 +650,41 @@ class HuParser:
            i += 1
        self.boxes = bxs
def _naive_vertical_merge(self):
bxs = self.sort_Y_firstly(self.boxes, np.median(self.mean_height) / 3)
i = 0
while i + 1 < len(bxs):
b = bxs[i]
b_ = bxs[i + 1]
if b["page_number"] < b_["page_number"] and re.match(r"[0-9 •一—-]+$", b["text"]):
bxs.pop(i)
continue
concatting_feats = [
b["text"].strip()[-1] in ",;:'\",、‘“;:-",
len(b["text"].strip()) > 1 and b["text"].strip()[-2] in ",;:'\",‘“、;:",
b["text"].strip()[0] in "。;?!?”)),,、:",
]
# features for not concating
feats = [
b.get("layoutno", 0) != b.get("layoutno", 0),
b["text"].strip()[-1] in "。?!?",
self.is_english and b["text"].strip()[-1] in ".!?",
b["page_number"] == b_["page_number"] and b_["top"] - \
b["bottom"] > self.mean_height[b["page_number"] - 1] * 1.5,
b["page_number"] < b_["page_number"] and abs(
b["x0"] - b_["x0"]) > self.mean_width[b["page_number"] - 1] * 4
]
if any(feats) and not any(concatting_feats):
i += 1
continue
# merge up and down
b["bottom"] = b_["bottom"]
b["text"] += b_["text"]
b["x0"] = min(b["x0"], b_["x0"])
b["x1"] = max(b["x1"], b_["x1"])
bxs.pop(i + 1)
self.boxes = bxs
    def _concat_downward(self, concat_between_pages=True):
        # count boxes in the same row as a feature
        for i in range(len(self.boxes)):
@@ -761,11 +796,13 @@ class HuParser:
     def _filter_forpages(self):
         if not self.boxes:
             return
+        findit = False
         i = 0
         while i < len(self.boxes):
             if not re.match(r"(contents|目录|目次|table of contents|致谢|acknowledge)$", re.sub(r"( | |\u3000)+", "", self.boxes[i]["text"].lower())):
                 i += 1
                 continue
+            findit = True
             eng = re.match(r"[0-9a-zA-Z :'.-]{5,}", self.boxes[i]["text"].strip())
             self.boxes.pop(i)
             if i >= len(self.boxes): break
@@ -781,14 +818,36 @@ class HuParser:
                 continue
             for k in range(i, j): self.boxes.pop(i)
             break
+        if findit:return
+
+        page_dirty = [0] * len(self.page_images)
+        for b in self.boxes:
+            if re.search(r"(··|··|··)", b["text"]):
+                page_dirty[b["page_number"]-1] += 1
+        page_dirty = set([i+1 for i, t in enumerate(page_dirty) if t > 3])
+        if not page_dirty: return
+        i = 0
+        while i < len(self.boxes):
+            if self.boxes[i]["page_number"] in page_dirty:
+                self.boxes.pop(i)
+                continue
+            i += 1

     def _merge_with_same_bullet(self):
         i = 0
         while i + 1 < len(self.boxes):
             b = self.boxes[i]
             b_ = self.boxes[i + 1]
+            if not b["text"].strip():
+                self.boxes.pop(i)
+                continue
+            if not b_["text"].strip():
+                self.boxes.pop(i+1)
+                continue
             if b["text"].strip()[0] != b_["text"].strip()[0] \
                     or b["text"].strip()[0].lower() in set("qwertyuopasdfghjklzxcvbnm") \
+                    or huqie.is_chinese(b["text"].strip()[0]) \
                     or b["top"] > b_["bottom"]:
                 i += 1
                 continue
@@ -1596,8 +1655,7 @@ class HuParser:
             self.pdf = pdfplumber.open(fnm) if isinstance(fnm, str) else pdfplumber.open(BytesIO(fnm))
             self.page_images = [p.to_image(resolution=72 * zoomin).annotated for i, p in
                                 enumerate(self.pdf.pages[page_from:page_to])]
-            self.page_chars = [[c for c in self.pdf.pages[i].chars if self._has_color(c)] for i in
-                               range(len(self.page_images))]
+            self.page_chars = [[c for c in page.chars if self._has_color(c)] for page in self.pdf.pages[page_from:page_to]]
             self.total_page = len(self.pdf.pages)
         except Exception as e:
             self.pdf = fitz.open(fnm) if isinstance(fnm, str) else fitz.open(stream=fnm, filetype="pdf")
@@ -1605,15 +1663,17 @@ class HuParser:
             self.page_chars = []
             mat = fitz.Matrix(zoomin, zoomin)
             self.total_page = len(self.pdf)
-            for page in self.pdf[page_from:page_to]:
-                pix = page.getPixmap(matrix=mat)
+            for i, page in enumerate(self.pdf):
+                if i < page_from:continue
+                if i >= page_to:break
+                pix = page.get_pixmap(matrix=mat)
                 img = Image.frombytes("RGB", [pix.width, pix.height],
                                       pix.samples)
                 self.page_images.append(img)
                 self.page_chars.append([])

         logging.info("Images converted.")
-        self.is_english = [re.search(r"[a-zA-Z0-9,/¸;:'\[\]\(\)!@#$%^&*\"?<>._-]{30,}", "".join(random.choices([c["text"] for c in self.page_chars[i]], k=100))) for i in range(len(self.page_chars))]
+        self.is_english = [re.search(r"[a-zA-Z0-9,/¸;:'\[\]\(\)!@#$%^&*\"?<>._-]{30,}", "".join(random.choices([c["text"] for c in self.page_chars[i]], k=min(100, len(self.page_chars[i]))))) for i in range(len(self.page_chars))]
         if sum([1 if e else 0 for e in self.is_english]) > len(self.page_images) / 2:
             self.is_english = True
         else:
@@ -1644,8 +1704,8 @@ class HuParser:
            #                      np.max([c["bottom"] for c in chars]))
            self.__ocr_paddle(i + 1, img, chars, zoomin)

-        if not self.is_english and not all([c for c in self.page_chars]) and self.boxes:
-            self.is_english = re.search(r"[\na-zA-Z0-9,/¸;:'\[\]\(\)!@#$%^&*\"?<>._-]{30,}", "".join([b["text"] for b in random.choices(self.boxes, k=30)]))
+        if not self.is_english and not any([c for c in self.page_chars]) and self.boxes:
+            self.is_english = re.search(r"[\na-zA-Z0-9,/¸;:'\[\]\(\)!@#$%^&*\"?<>._-]{30,}", "".join([b["text"] for b in random.choices([b for bxs in self.boxes for b in bxs], k=30)]))

         logging.info("Is it English:", self.is_english)