diff --git a/rag/app/naive.py b/rag/app/naive.py
index 872667ae..97f76112 100644
--- a/rag/app/naive.py
+++ b/rag/app/naive.py
@@ -16,16 +16,28 @@
 from docx import Document
 from timeit import default_timer as timer
 import re
 from deepdoc.parser.pdf_parser import PlainParser
-from rag.nlp import rag_tokenizer, naive_merge, tokenize_table, tokenize_chunks, find_codec
+from rag.nlp import rag_tokenizer, naive_merge, tokenize_table, tokenize_chunks, find_codec, concat_img, naive_merge_docx, tokenize_chunks_docx
 from deepdoc.parser import PdfParser, ExcelParser, DocxParser, HtmlParser, JsonParser
 from rag.settings import cron_logger
 from rag.utils import num_tokens_from_string
-
+from PIL import Image
+from functools import reduce
 
 
 class Docx(DocxParser):
     def __init__(self):
         pass
 
+    def get_picture(self, document, paragraph):
+        # Resolve the paragraph's first inline picture via its r:embed
+        # relationship id and return it as an RGB PIL image.
+        img = paragraph._element.xpath('.//pic:pic')
+        if not img:
+            return None
+        img = img[0]
+        embed = img.xpath('.//a:blip/@r:embed')[0]
+        related_part = document.part.related_parts[embed]
+        image = related_part.image
+        image = Image.open(BytesIO(image.blob)).convert('RGB')
+        return image
+
     def __clean(self, line):
         line = re.sub(r"\u3000", " ", line).strip()
         return line
@@ -35,17 +47,41 @@ class Docx(DocxParser):
             filename) if not binary else Document(BytesIO(binary))
         pn = 0
         lines = []
+        last_image = None
         for p in self.doc.paragraphs:
             if pn > to_page:
                 break
-            if from_page <= pn < to_page and p.text.strip():
-                lines.append(self.__clean(p.text))
+            if from_page <= pn < to_page:
+                current_image = None
+                if p.text.strip():
+                    if p.style.name == 'Caption':
+                        # A caption belongs to the image it describes: take the
+                        # image from the previous non-caption line, or the one
+                        # still pending from an image-only paragraph.
+                        former_image = None
+                        if lines and lines[-1][1] and lines[-1][2] != 'Caption':
+                            former_image = lines[-1][1].pop()
+                        elif last_image:
+                            former_image = last_image
+                            last_image = None
+                        lines.append((self.__clean(p.text), [former_image], p.style.name))
+                    else:
+                        current_image = self.get_picture(self.doc, p)
+                        image_list = [current_image]
+                        if last_image:
+                            image_list.insert(0, last_image)
+                            last_image = None
+                        lines.append((self.__clean(p.text), image_list, p.style.name))
+                else:
+                    # Image-only paragraph: attach the picture to the previous
+                    # line, or hold it until the next text paragraph arrives.
+                    if current_image := self.get_picture(self.doc, p):
+                        if lines:
+                            lines[-1][1].append(current_image)
+                        else:
+                            last_image = current_image
             for run in p.runs:
                 if 'lastRenderedPageBreak' in run._element.xml:
                     pn += 1
                     continue
                 if 'w:br' in run._element.xml and 'type="page"' in run._element.xml:
                     pn += 1
+        # Stitch each line's image list into one image per text line. The None
+        # initializer keeps reduce() safe when a caption popped the only image.
+        new_line = [(line[0], reduce(concat_img, line[1], None)) for line in lines]
 
         tbls = []
         for tb in self.doc.tables:
             html= "<table>"
@@ -64,7 +100,7 @@ class Docx(DocxParser):
                 html += "</tr>"
             html += "</table>"
             tbls.append(((None, html), ""))
-        return [(l, "") for l in lines if l], tbls
+        return new_line, tbls
 
 
 class Pdf(PdfParser):
@@ -123,8 +159,19 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
     if re.search(r"\.docx$", filename, re.IGNORECASE):
         callback(0.1, "Start to parse.")
         sections, tbls = Docx()(filename, binary)
-        res = tokenize_table(tbls, doc, eng)
+        res = tokenize_table(tbls, doc, eng)  # tables only; text chunks are appended below
+        callback(0.8, "Finish parsing.")
+        st = timer()
+
+        chunks, images = naive_merge_docx(
+            sections, int(parser_config.get(
+                "chunk_token_num", 128)), parser_config.get(
+                "delimiter", "\n!?。;!?"))
+
+        res.extend(tokenize_chunks_docx(chunks, doc, eng, images))
+        cron_logger.info("naive_merge({}): {}".format(filename, timer() - st))
+        return res
 
     elif re.search(r"\.pdf$", filename, re.IGNORECASE):
         pdf_parser = Pdf(
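The naive.py half of the change pairs each docx paragraph with the images it carries. For reviewers who want to poke at the extraction path in isolation, here is a minimal standalone sketch of the same `r:embed` lookup that `Docx.get_picture` performs; `picture_of` and `sample.docx` are placeholder names of mine, not part of the patch, and python-docx plus Pillow are assumed installed.

```python
# Standalone sketch of the patch's image lookup: find a paragraph's first
# inline picture, resolve its r:embed relationship id to the image part,
# and decode the bytes into an RGB PIL image.
from io import BytesIO

from docx import Document
from PIL import Image


def picture_of(document, paragraph):
    pics = paragraph._element.xpath('.//pic:pic')    # inline drawings, if any
    if not pics:
        return None
    embed = pics[0].xpath('.//a:blip/@r:embed')[0]   # relationship id of the blob
    part = document.part.related_parts[embed]        # id -> ImagePart
    return Image.open(BytesIO(part.image.blob)).convert('RGB')


doc = Document("sample.docx")  # placeholder path
for p in doc.paragraphs:
    if (img := picture_of(doc, p)) is not None:
        print(p.style.name, img.size)
```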
" tbls.append(((None, html), "")) - return [(l, "") for l in lines if l], tbls + return new_line, tbls class Pdf(PdfParser): @@ -123,8 +159,19 @@ def chunk(filename, binary=None, from_page=0, to_page=100000, if re.search(r"\.docx$", filename, re.IGNORECASE): callback(0.1, "Start to parse.") sections, tbls = Docx()(filename, binary) - res = tokenize_table(tbls, doc, eng) + res = tokenize_table(tbls, doc, eng) # just for table + callback(0.8, "Finish parsing.") + st = timer() + + chunks, images = naive_merge_docx( + sections, int(parser_config.get( + "chunk_token_num", 128)), parser_config.get( + "delimiter", "\n!?。;!?")) + + res.extend(tokenize_chunks_docx(chunks, doc, eng, images)) + cron_logger.info("naive_merge({}): {}".format(filename, timer() - st)) + return res elif re.search(r"\.pdf$", filename, re.IGNORECASE): pdf_parser = Pdf( diff --git a/rag/app/qa.py b/rag/app/qa.py index 1b088772..4e95cf51 100644 --- a/rag/app/qa.py +++ b/rag/app/qa.py @@ -17,7 +17,7 @@ from timeit import default_timer as timer from nltk import word_tokenize from openpyxl import load_workbook from rag.nlp import is_english, random_choices, find_codec, qbullets_category, add_positions, has_qbullet, docx_question_level -from rag.nlp import rag_tokenizer, tokenize_table +from rag.nlp import rag_tokenizer, tokenize_table, concat_img from rag.settings import cron_logger from deepdoc.parser import PdfParser, ExcelParser, DocxParser from docx import Document @@ -174,26 +174,8 @@ class Docx(DocxParser): embed = img.xpath('.//a:blip/@r:embed')[0] related_part = document.part.related_parts[embed] image = related_part.image - image = Image.open(BytesIO(image.blob)) + image = Image.open(BytesIO(image.blob)).convert('RGB') return image - def concat_img(self, img1, img2): - if img1 and not img2: - return img1 - if not img1 and img2: - return img2 - if not img1 and not img2: - return None - width1, height1 = img1.size - width2, height2 = img2.size - - new_width = max(width1, width2) - new_height = height1 + height2 - new_image = Image.new('RGB', (new_width, new_height)) - - new_image.paste(img1, (0, 0)) - new_image.paste(img2, (0, height1)) - - return new_image def __call__(self, filename, binary=None, from_page=0, to_page=100000, callback=None): self.doc = Document( @@ -211,7 +193,7 @@ class Docx(DocxParser): if not question_level or question_level > 6: # not a question last_answer = f'{last_answer}\n{p_text}' current_image = self.get_picture(self.doc, p) - last_image = self.concat_img(last_image, current_image) + last_image = concat_img(last_image, current_image) else: # is a question if last_answer or last_image: sum_question = '\n'.join(question_stack) diff --git a/rag/nlp/__init__.py b/rag/nlp/__init__.py index c65808ee..be8cb701 100644 --- a/rag/nlp/__init__.py +++ b/rag/nlp/__init__.py @@ -24,6 +24,7 @@ import copy import roman_numbers as r from word2number import w2n from cn2an import cn2an +from PIL import Image all_codecs = [ 'utf-8', 'gb2312', 'gbk', 'utf_16', 'ascii', 'big5', 'big5hkscs', @@ -246,6 +247,19 @@ def tokenize_chunks(chunks, doc, eng, pdf_parser): return res +def tokenize_chunks_docx(chunks, doc, eng, images): + res = [] + # wrap up as es documents + for ck, image in zip(chunks, images): + if len(ck.strip()) == 0:continue + print("--", ck) + d = copy.deepcopy(doc) + d["image"] = image + tokenize(d, ck, eng) + res.append(d) + return res + + def tokenize_table(tbls, doc, eng, batch_size=10): res = [] # add tables @@ -504,4 +518,54 @@ def docx_question_level(p): if p.style.name.startswith('Heading'): 
diff --git a/rag/nlp/__init__.py b/rag/nlp/__init__.py
index c65808ee..be8cb701 100644
--- a/rag/nlp/__init__.py
+++ b/rag/nlp/__init__.py
@@ -24,6 +24,7 @@ import copy
 import roman_numbers as r
 from word2number import w2n
 from cn2an import cn2an
+from PIL import Image
 
 all_codecs = [
     'utf-8', 'gb2312', 'gbk', 'utf_16', 'ascii', 'big5', 'big5hkscs',
@@ -246,6 +247,19 @@ def tokenize_chunks(chunks, doc, eng, pdf_parser):
     return res
 
 
+def tokenize_chunks_docx(chunks, doc, eng, images):
+    res = []
+    # wrap up as es documents, carrying the stitched image of each chunk
+    for ck, image in zip(chunks, images):
+        if len(ck.strip()) == 0:
+            continue
+        d = copy.deepcopy(doc)
+        d["image"] = image
+        tokenize(d, ck, eng)
+        res.append(d)
+    return res
+
+
 def tokenize_table(tbls, doc, eng, batch_size=10):
     res = []
     # add tables
@@ -504,4 +518,54 @@ def docx_question_level(p):
     if p.style.name.startswith('Heading'):
         return int(p.style.name.split(' ')[-1]), re.sub(r"\u3000", " ", p.text).strip()
     else:
-        return 0, re.sub(r"\u3000", " ", p.text).strip()
\ No newline at end of file
+        return 0, re.sub(r"\u3000", " ", p.text).strip()
+
+def concat_img(img1, img2):
+    # Vertically stack two PIL images; None on either side passes through.
+    if img1 and not img2:
+        return img1
+    if not img1 and img2:
+        return img2
+    if not img1 and not img2:
+        return None
+    width1, height1 = img1.size
+    width2, height2 = img2.size
+
+    new_width = max(width1, width2)
+    new_height = height1 + height2
+    new_image = Image.new('RGB', (new_width, new_height))
+
+    new_image.paste(img1, (0, 0))
+    new_image.paste(img2, (0, height1))
+
+    return new_image
+
+def naive_merge_docx(sections, chunk_token_num=128, delimiter="\n。;!?"):
+    if not sections:
+        return [], []  # callers unpack (chunks, images)
+
+    cks = [""]
+    images = [None]
+    tk_nums = [0]
+
+    def add_chunk(t, image, pos=""):
+        nonlocal cks, tk_nums, delimiter
+        tnum = num_tokens_from_string(t)
+        if tnum < 8:
+            pos = ""
+        if tk_nums[-1] > chunk_token_num:
+            # current chunk is full: start a new one
+            if t.find(pos) < 0:
+                t += pos
+            cks.append(t)
+            images.append(image)
+            tk_nums.append(tnum)
+        else:
+            # keep accumulating text, stitching images as we go
+            if cks[-1].find(pos) < 0:
+                t += pos
+            cks[-1] += t
+            images[-1] = concat_img(images[-1], image)
+            tk_nums[-1] += tnum
+
+    for sec, image in sections:
+        add_chunk(sec, image, '')
+
+    return cks, images
\ No newline at end of file
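For a feel of the packing behavior of `naive_merge_docx`, a small hedged example follows: the section texts are invented, token counts come from `num_tokens_from_string`, and a section opens a new chunk only after the current chunk has already exceeded `chunk_token_num`.

```python
# Sketch: sections are appended to the current chunk until its token count
# exceeds chunk_token_num; only then does the next section open a new chunk.
# Images ride along and are stitched per chunk with concat_img.
from rag.nlp import naive_merge_docx

sections = [
    ("First passage of body text from a docx paragraph.", None),
    ("Second passage, still small enough to share a chunk.", None),
    ("Third passage that may land in a fresh chunk.", None),
]

chunks, images = naive_merge_docx(sections, chunk_token_num=16)
for ck, img in zip(chunks, images):
    print(repr(ck[:40]), img)  # img is None here since no images were supplied
```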