Support displaying images in the chunks of docx files when using general parser (#1253)

### What problem does this PR solve?

Support displaying images in the chunks of docx files when using the general parser.

### Type of change

- [x] New Feature (non-breaking change which adds functionality)

parent 9a0736b20f
commit 38bd02f402
@@ -16,16 +16,28 @@ from docx import Document
 from timeit import default_timer as timer
 import re
 from deepdoc.parser.pdf_parser import PlainParser
-from rag.nlp import rag_tokenizer, naive_merge, tokenize_table, tokenize_chunks, find_codec
+from rag.nlp import rag_tokenizer, naive_merge, tokenize_table, tokenize_chunks, find_codec, concat_img, naive_merge_docx, tokenize_chunks_docx
 from deepdoc.parser import PdfParser, ExcelParser, DocxParser, HtmlParser, JsonParser
 from rag.settings import cron_logger
 from rag.utils import num_tokens_from_string
+from PIL import Image
+from functools import reduce
 
 
 class Docx(DocxParser):
     def __init__(self):
         pass
 
+    def get_picture(self, document, paragraph):
+        img = paragraph._element.xpath('.//pic:pic')
+        if not img:
+            return None
+        img = img[0]
+        embed = img.xpath('.//a:blip/@r:embed')[0]
+        related_part = document.part.related_parts[embed]
+        image = related_part.image
+        image = Image.open(BytesIO(image.blob)).convert('RGB')
+        return image
+
     def __clean(self, line):
         line = re.sub(r"\u3000", " ", line).strip()
         return line
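The new `get_picture` helper resolves a paragraph's inline drawing to a Pillow image: the `<a:blip r:embed="...">` attribute is a relationship id that indexes into the package's related parts. A minimal standalone sketch of the same lookup (assuming `python-docx` and Pillow are installed; `sample.docx` is a hypothetical test file):

```python
from io import BytesIO

from docx import Document  # python-docx
from PIL import Image

doc = Document("sample.docx")  # hypothetical test file
for p in doc.paragraphs:
    pics = p._element.xpath('.//pic:pic')  # inline pictures in this paragraph
    if not pics:
        continue
    rid = pics[0].xpath('.//a:blip/@r:embed')[0]  # relationship id
    image_part = doc.part.related_parts[rid]      # embedded image part in the .docx package
    img = Image.open(BytesIO(image_part.blob)).convert('RGB')
    print(p.text[:40], img.size)
```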
@@ -35,17 +47,41 @@ class Docx(DocxParser):
             filename) if not binary else Document(BytesIO(binary))
         pn = 0
         lines = []
+        last_image = None
         for p in self.doc.paragraphs:
             if pn > to_page:
                 break
-            if from_page <= pn < to_page and p.text.strip():
-                lines.append(self.__clean(p.text))
+            if from_page <= pn < to_page:
+                current_image = None
+                if p.text.strip():
+                    if p.style.name == 'Caption':
+                        former_image = None
+                        if lines and lines[-1][1] and lines[-1][2] != 'Caption':
+                            former_image = lines[-1][1].pop()
+                        elif last_image:
+                            former_image = last_image
+                            last_image = None
+                        lines.append((self.__clean(p.text), [former_image], p.style.name))
+                    else:
+                        current_image = self.get_picture(self.doc, p)
+                        image_list = [current_image]
+                        if last_image:
+                            image_list.insert(0, last_image)
+                            last_image = None
+                        lines.append((self.__clean(p.text), image_list, p.style.name))
+                else:
+                    if current_image := self.get_picture(self.doc, p):
+                        if lines:
+                            lines[-1][1].append(current_image)
+                        else:
+                            last_image = current_image
             for run in p.runs:
                 if 'lastRenderedPageBreak' in run._element.xml:
                     pn += 1
                     continue
                 if 'w:br' in run._element.xml and 'type="page"' in run._element.xml:
                     pn += 1
+        new_line = [(line[0], reduce(concat_img, line[1])) for line in lines]
        tbls = []
         for tb in self.doc.tables:
             html= "<table>"
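Each entry in `lines` is now a `(text, [images], style_name)` triple: a 'Caption' paragraph claims the picture from the preceding non-caption line (or a pending `last_image`), ordinary text paragraphs collect their own inline picture, and picture-only paragraphs attach their image to the previous line. `reduce(concat_img, ...)` then folds each image list into a single stitched image. A toy trace of that fold (a sketch; `concat_img` is the helper this PR adds to `rag.nlp` below):

```python
from functools import reduce

from PIL import Image
from rag.nlp import concat_img  # added in this PR

img_a = Image.new('RGB', (100, 40), 'white')
img_b = Image.new('RGB', (80, 60), 'gray')
line = ("a paragraph followed by two pictures", [img_a, img_b], 'Normal')

# Fold the list into one image; concat_img passes None through, so a
# line whose list is [None] simply reduces to None.
merged = reduce(concat_img, line[1])
print(merged.size)  # (100, 100): max of the widths, sum of the heights
```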
@@ -64,7 +100,7 @@ class Docx(DocxParser):
                 html += "</tr>"
             html += "</table>"
             tbls.append(((None, html), ""))
-        return [(l, "") for l in lines if l], tbls
+        return new_line, tbls
 
 
 class Pdf(PdfParser):
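With `return new_line, tbls`, the general docx path now yields `(text, image_or_None)` pairs instead of the old `(text, "")` pairs, which is the shape `naive_merge_docx` consumes. A hypothetical illustration:

```python
sections, tbls = Docx()("test.docx")  # hypothetical file
text, image = sections[0]
# image is a PIL.Image.Image when the paragraph had pictures, else None
print(text[:40], type(image).__name__)
```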
@@ -123,8 +159,19 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
     if re.search(r"\.docx$", filename, re.IGNORECASE):
         callback(0.1, "Start to parse.")
         sections, tbls = Docx()(filename, binary)
-        res = tokenize_table(tbls, doc, eng)
+        res = tokenize_table(tbls, doc, eng)  # just for table
 
         callback(0.8, "Finish parsing.")
+        st = timer()
+
+        chunks, images = naive_merge_docx(
+            sections, int(parser_config.get(
+                "chunk_token_num", 128)), parser_config.get(
+                "delimiter", "\n!?。;!?"))
+
+        res.extend(tokenize_chunks_docx(chunks, doc, eng, images))
+        cron_logger.info("naive_merge({}): {}".format(filename, timer() - st))
+        return res
+
     elif re.search(r"\.pdf$", filename, re.IGNORECASE):
         pdf_parser = Pdf(
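End to end, the docx branch now parses sections with their images, merges them into token-bounded chunks whose images are stitched alongside, and indexes each chunk together with its image. A sketch of the flow under assumed inputs (`test.docx` and the `doc` template are placeholders; `chunk()` builds the real ones):

```python
from rag.nlp import naive_merge_docx, tokenize_chunks_docx

sections, tbls = Docx()("test.docx")   # [(text, image_or_None)], tables
chunks, images = naive_merge_docx(
    sections, 128, "\n!?。;!?")        # chunk_token_num, delimiter
assert len(chunks) == len(images)      # kept aligned index-for-index

doc = {"docnm_kwd": "test.docx"}       # hypothetical minimal template
res = tokenize_chunks_docx(chunks, doc, True, images)  # eng=True
```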
@@ -17,7 +17,7 @@ from timeit import default_timer as timer
 from nltk import word_tokenize
 from openpyxl import load_workbook
 from rag.nlp import is_english, random_choices, find_codec, qbullets_category, add_positions, has_qbullet, docx_question_level
-from rag.nlp import rag_tokenizer, tokenize_table
+from rag.nlp import rag_tokenizer, tokenize_table, concat_img
 from rag.settings import cron_logger
 from deepdoc.parser import PdfParser, ExcelParser, DocxParser
 from docx import Document
@@ -174,26 +174,8 @@ class Docx(DocxParser):
         embed = img.xpath('.//a:blip/@r:embed')[0]
         related_part = document.part.related_parts[embed]
         image = related_part.image
-        image = Image.open(BytesIO(image.blob))
+        image = Image.open(BytesIO(image.blob)).convert('RGB')
         return image
-    def concat_img(self, img1, img2):
-        if img1 and not img2:
-            return img1
-        if not img1 and img2:
-            return img2
-        if not img1 and not img2:
-            return None
-        width1, height1 = img1.size
-        width2, height2 = img2.size
-
-        new_width = max(width1, width2)
-        new_height = height1 + height2
-        new_image = Image.new('RGB', (new_width, new_height))
-
-        new_image.paste(img1, (0, 0))
-        new_image.paste(img2, (0, height1))
-
-        return new_image
-
     def __call__(self, filename, binary=None, from_page=0, to_page=100000, callback=None):
         self.doc = Document(
@@ -211,7 +193,7 @@ class Docx(DocxParser):
             if not question_level or question_level > 6: # not a question
                 last_answer = f'{last_answer}\n{p_text}'
                 current_image = self.get_picture(self.doc, p)
-                last_image = self.concat_img(last_image, current_image)
+                last_image = concat_img(last_image, current_image)
             else: # is a question
                 if last_answer or last_image:
                     sum_question = '\n'.join(question_stack)
@@ -24,6 +24,7 @@ import copy
 import roman_numbers as r
 from word2number import w2n
 from cn2an import cn2an
+from PIL import Image
 
 all_codecs = [
     'utf-8', 'gb2312', 'gbk', 'utf_16', 'ascii', 'big5', 'big5hkscs',
@@ -246,6 +247,19 @@ def tokenize_chunks(chunks, doc, eng, pdf_parser):
     return res
 
 
+def tokenize_chunks_docx(chunks, doc, eng, images):
+    res = []
+    # wrap up as es documents
+    for ck, image in zip(chunks, images):
+        if len(ck.strip()) == 0:continue
+        print("--", ck)
+        d = copy.deepcopy(doc)
+        d["image"] = image
+        tokenize(d, ck, eng)
+        res.append(d)
+    return res
+
+
 def tokenize_table(tbls, doc, eng, batch_size=10):
     res = []
     # add tables
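`tokenize_chunks_docx` mirrors `tokenize_chunks` but zips each chunk with its merged image and stores the image under `d["image"]`. A toy call (the `doc` skeleton is hypothetical; real callers pass the document template built in `chunk()`):

```python
from PIL import Image
from rag.nlp import tokenize_chunks_docx

doc = {"docnm_kwd": "test.docx"}  # hypothetical minimal template
chunks = ["hello world", "   ", "second chunk"]
images = [Image.new('RGB', (10, 10)), None, None]

res = tokenize_chunks_docx(chunks, doc, eng=True, images=images)
# The all-whitespace chunk is skipped; survivors carry their image.
assert len(res) == 2 and res[0]["image"] is images[0]
```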
@@ -504,4 +518,54 @@ def docx_question_level(p):
     if p.style.name.startswith('Heading'):
         return int(p.style.name.split(' ')[-1]), re.sub(r"\u3000", " ", p.text).strip()
     else:
         return 0, re.sub(r"\u3000", " ", p.text).strip()
+
+def concat_img(img1, img2):
+    if img1 and not img2:
+        return img1
+    if not img1 and img2:
+        return img2
+    if not img1 and not img2:
+        return None
+    width1, height1 = img1.size
+    width2, height2 = img2.size
+
+    new_width = max(width1, width2)
+    new_height = height1 + height2
+    new_image = Image.new('RGB', (new_width, new_height))
+
+    new_image.paste(img1, (0, 0))
+    new_image.paste(img2, (0, height1))
+
+    return new_image
+
+def naive_merge_docx(sections, chunk_token_num=128, delimiter="\n。;!?"):
+    if not sections:
+        return []
+
+    cks = [""]
+    images = [None]
+    tk_nums = [0]
+
+    def add_chunk(t, image, pos=""):
+        nonlocal cks, tk_nums, delimiter
+        tnum = num_tokens_from_string(t)
+        if tnum < 8:
+            pos = ""
+        if tk_nums[-1] > chunk_token_num:
+            if t.find(pos) < 0:
+                t += pos
+            cks.append(t)
+            images.append(image)
+            tk_nums.append(tnum)
+        else:
+            if cks[-1].find(pos) < 0:
+                t += pos
+            cks[-1] += t
+            images[-1] = concat_img(images[-1], image)
+            tk_nums[-1] += tnum
+
+    for sec, image in sections:
+        add_chunk(sec, image, '')
+
+    return cks, images
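A quick sanity check of the two helpers added here (a sketch with synthetic Pillow canvases; token counts come from `num_tokens_from_string`, so these short strings land well under the 128-token default and merge into a single chunk):

```python
from PIL import Image
from rag.nlp import concat_img, naive_merge_docx

red = Image.new('RGB', (120, 50), 'red')
blue = Image.new('RGB', (90, 70), 'blue')

# Vertical stacking: width is the max of the two, height is the sum.
assert concat_img(red, blue).size == (120, 120)
# None on either side passes the other image through unchanged.
assert concat_img(None, red) is red

sections = [("first paragraph", red), ("second paragraph", blue)]
chunks, images = naive_merge_docx(sections, chunk_token_num=128)
# Both short sections merge into one chunk, and their images are
# concatenated so the text and its pictures stay together.
assert len(chunks) == 1 and images[0].size == (120, 120)
```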