diff --git a/rag/app/naive.py b/rag/app/naive.py
index 872667ae..97f76112 100644
--- a/rag/app/naive.py
+++ b/rag/app/naive.py
@@ -16,16 +16,28 @@ from docx import Document
from timeit import default_timer as timer
import re
from deepdoc.parser.pdf_parser import PlainParser
-from rag.nlp import rag_tokenizer, naive_merge, tokenize_table, tokenize_chunks, find_codec
+from rag.nlp import rag_tokenizer, naive_merge, tokenize_table, tokenize_chunks, find_codec, concat_img, naive_merge_docx, tokenize_chunks_docx
from deepdoc.parser import PdfParser, ExcelParser, DocxParser, HtmlParser, JsonParser
from rag.settings import cron_logger
from rag.utils import num_tokens_from_string
-
+from PIL import Image
+from functools import reduce
class Docx(DocxParser):
def __init__(self):
pass
+ def get_picture(self, document, paragraph):
+ img = paragraph._element.xpath('.//pic:pic')
+ if not img:
+ return None
+ img = img[0]
+ embed = img.xpath('.//a:blip/@r:embed')[0]
+ related_part = document.part.related_parts[embed]
+ image = related_part.image
+ image = Image.open(BytesIO(image.blob)).convert('RGB')
+ return image
+
def __clean(self, line):
line = re.sub(r"\u3000", " ", line).strip()
return line
@@ -35,17 +47,41 @@ class Docx(DocxParser):
filename) if not binary else Document(BytesIO(binary))
pn = 0
lines = []
+ last_image = None
for p in self.doc.paragraphs:
if pn > to_page:
break
- if from_page <= pn < to_page and p.text.strip():
- lines.append(self.__clean(p.text))
+ if from_page <= pn < to_page:
+ current_image = None
+ if p.text.strip():
+ if p.style.name == 'Caption':
+ former_image = None
+ if lines and lines[-1][1] and lines[-1][2] != 'Caption':
+ former_image = lines[-1][1].pop()
+ elif last_image:
+ former_image = last_image
+ last_image = None
+ lines.append((self.__clean(p.text), [former_image], p.style.name))
+ else:
+ current_image = self.get_picture(self.doc, p)
+ image_list = [current_image]
+ if last_image:
+ image_list.insert(0, last_image)
+ last_image = None
+ lines.append((self.__clean(p.text), image_list, p.style.name))
+ else:
+ if current_image := self.get_picture(self.doc, p):
+ if lines:
+ lines[-1][1].append(current_image)
+ else:
+ last_image = current_image
for run in p.runs:
if 'lastRenderedPageBreak' in run._element.xml:
pn += 1
continue
if 'w:br' in run._element.xml and 'type="page"' in run._element.xml:
pn += 1
+ new_line = [(line[0], reduce(concat_img, line[1])) for line in lines]
tbls = []
for tb in self.doc.tables:
html= "
"
@@ -64,7 +100,7 @@ class Docx(DocxParser):
html += ""
html += "
"
tbls.append(((None, html), ""))
- return [(l, "") for l in lines if l], tbls
+ return new_line, tbls
class Pdf(PdfParser):
@@ -123,8 +159,19 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
if re.search(r"\.docx$", filename, re.IGNORECASE):
callback(0.1, "Start to parse.")
sections, tbls = Docx()(filename, binary)
- res = tokenize_table(tbls, doc, eng)
+ res = tokenize_table(tbls, doc, eng) # just for table
+
callback(0.8, "Finish parsing.")
+ st = timer()
+
+ chunks, images = naive_merge_docx(
+ sections, int(parser_config.get(
+ "chunk_token_num", 128)), parser_config.get(
+ "delimiter", "\n!?。;!?"))
+
+ res.extend(tokenize_chunks_docx(chunks, doc, eng, images))
+ cron_logger.info("naive_merge({}): {}".format(filename, timer() - st))
+ return res
elif re.search(r"\.pdf$", filename, re.IGNORECASE):
pdf_parser = Pdf(
diff --git a/rag/app/qa.py b/rag/app/qa.py
index 1b088772..4e95cf51 100644
--- a/rag/app/qa.py
+++ b/rag/app/qa.py
@@ -17,7 +17,7 @@ from timeit import default_timer as timer
from nltk import word_tokenize
from openpyxl import load_workbook
from rag.nlp import is_english, random_choices, find_codec, qbullets_category, add_positions, has_qbullet, docx_question_level
-from rag.nlp import rag_tokenizer, tokenize_table
+from rag.nlp import rag_tokenizer, tokenize_table, concat_img
from rag.settings import cron_logger
from deepdoc.parser import PdfParser, ExcelParser, DocxParser
from docx import Document
@@ -174,26 +174,8 @@ class Docx(DocxParser):
embed = img.xpath('.//a:blip/@r:embed')[0]
related_part = document.part.related_parts[embed]
image = related_part.image
- image = Image.open(BytesIO(image.blob))
+ image = Image.open(BytesIO(image.blob)).convert('RGB')
return image
- def concat_img(self, img1, img2):
- if img1 and not img2:
- return img1
- if not img1 and img2:
- return img2
- if not img1 and not img2:
- return None
- width1, height1 = img1.size
- width2, height2 = img2.size
-
- new_width = max(width1, width2)
- new_height = height1 + height2
- new_image = Image.new('RGB', (new_width, new_height))
-
- new_image.paste(img1, (0, 0))
- new_image.paste(img2, (0, height1))
-
- return new_image
def __call__(self, filename, binary=None, from_page=0, to_page=100000, callback=None):
self.doc = Document(
@@ -211,7 +193,7 @@ class Docx(DocxParser):
if not question_level or question_level > 6: # not a question
last_answer = f'{last_answer}\n{p_text}'
current_image = self.get_picture(self.doc, p)
- last_image = self.concat_img(last_image, current_image)
+ last_image = concat_img(last_image, current_image)
else: # is a question
if last_answer or last_image:
sum_question = '\n'.join(question_stack)
diff --git a/rag/nlp/__init__.py b/rag/nlp/__init__.py
index c65808ee..be8cb701 100644
--- a/rag/nlp/__init__.py
+++ b/rag/nlp/__init__.py
@@ -24,6 +24,7 @@ import copy
import roman_numbers as r
from word2number import w2n
from cn2an import cn2an
+from PIL import Image
all_codecs = [
'utf-8', 'gb2312', 'gbk', 'utf_16', 'ascii', 'big5', 'big5hkscs',
@@ -246,6 +247,19 @@ def tokenize_chunks(chunks, doc, eng, pdf_parser):
return res
def tokenize_chunks_docx(chunks, doc, eng, images):
    """Wrap docx chunks as ES documents, pairing each chunk with its image.

    Mirrors tokenize_chunks(), but instead of pdf positions each document
    carries the chunk's PIL image (or None) in the "image" field.

    :param chunks: list of chunk texts
    :param doc:    template dict copied into every emitted document
    :param eng:    True when the content is English (tokenizer hint)
    :param images: per-chunk images, parallel to *chunks*
    :return: list of document dicts ready for indexing
    """
    res = []
    for ck, image in zip(chunks, images):
        if not ck.strip():
            # Skip empty chunks produced by the merger.
            continue
        d = copy.deepcopy(doc)
        d["image"] = image
        tokenize(d, ck, eng)
        res.append(d)
    return res
+
+
def tokenize_table(tbls, doc, eng, batch_size=10):
res = []
# add tables
@@ -504,4 +518,54 @@ def docx_question_level(p):
if p.style.name.startswith('Heading'):
return int(p.style.name.split(' ')[-1]), re.sub(r"\u3000", " ", p.text).strip()
else:
- return 0, re.sub(r"\u3000", " ", p.text).strip()
\ No newline at end of file
+ return 0, re.sub(r"\u3000", " ", p.text).strip()
+
def concat_img(img1, img2):
    """Stack *img2* below *img1* on a shared RGB canvas.

    If either argument is missing (None/falsy) the other is returned
    unchanged; two missing images yield None.
    """
    if not img1:
        return img2
    if not img2:
        return img1

    w1, h1 = img1.size
    w2, h2 = img2.size
    canvas = Image.new('RGB', (max(w1, w2), h1 + h2))
    canvas.paste(img1, (0, 0))
    canvas.paste(img2, (0, h1))
    return canvas
+
def naive_merge_docx(sections, chunk_token_num=128, delimiter="\n。;!?"):
    """Greedily merge (text, image) docx sections into token-bounded chunks.

    :param sections: iterable of ``(text, image_or_None)`` pairs
    :param chunk_token_num: soft token budget; a new chunk starts once the
        current one exceeds it
    :param delimiter: accepted for signature parity with naive_merge() but
        currently unused — merging here is purely token-count driven
    :return: two parallel lists ``(chunks, images)``; ``images[i]`` is the
        vertically concatenated image for ``chunks[i]`` or None
    """
    if not sections:
        # Keep the return shape consistent so callers can always unpack
        # ``chunks, images = naive_merge_docx(...)``; the previous bare
        # ``return []`` raised ValueError at the unpacking site.
        return [], []

    cks = [""]
    images = [None]
    tk_nums = [0]

    def add_chunk(t, image, pos=""):
        nonlocal cks, tk_nums
        tnum = num_tokens_from_string(t)
        if tnum < 8:
            # Very short fragments don't get a position tag appended.
            pos = ""
        if tk_nums[-1] > chunk_token_num:
            # Current chunk already over budget: start a fresh one.
            if t.find(pos) < 0:
                t += pos
            cks.append(t)
            images.append(image)
            tk_nums.append(tnum)
        else:
            # Extend the current chunk and merge its image vertically.
            if cks[-1].find(pos) < 0:
                t += pos
            cks[-1] += t
            images[-1] = concat_img(images[-1], image)
            tk_nums[-1] += tnum

    for sec, image in sections:
        add_chunk(sec, image, '')

    return cks, images
\ No newline at end of file