From 79482ff6724f87b79c07ea77122937c72e0dd3aa Mon Sep 17 00:00:00 2001 From: Stephen Hu Date: Mon, 17 Mar 2025 17:02:39 +0800 Subject: [PATCH] Refa: Improve ppt_parser better handle list (#6162) ### What problem does this PR solve? This pull request (PR) updates the PPTX parsing code so that text in list format is represented more precisely: bulleted and auto-numbered paragraphs are marked with a leading `.`, indented according to their list level. ### Type of change - [ ] Bug Fix (non-breaking change which fixes an issue) - [x] New Feature (non-breaking change which adds functionality) - [ ] Documentation Update - [x] Refactoring - [ ] Performance Improvement - [ ] Other (please describe): --- deepdoc/parser/ppt_parser.py | 16 ++++++++++++++-- 1 file changed, 14 insertions(+), 2 deletions(-) diff --git a/deepdoc/parser/ppt_parser.py b/deepdoc/parser/ppt_parser.py index 25e8f846..5cbd1330 100644 --- a/deepdoc/parser/ppt_parser.py +++ b/deepdoc/parser/ppt_parser.py @@ -23,6 +23,13 @@ class RAGFlowPptParser: def __init__(self): super().__init__() + def __get_bulleted_text(self, paragraph): + is_bulleted = bool(paragraph._p.xpath("./a:pPr/a:buChar")) or bool(bool(paragraph._p.xpath("./a:pPr/a:buAutoNum")) ) + if is_bulleted: + return f"{' '* paragraph.level}.{paragraph.text}" + else: + return paragraph.text + def __extract(self, shape): if shape.shape_type == 19: tb = shape.table @@ -33,7 +40,12 @@ class RAGFlowPptParser: return "\n".join(rows) if shape.has_text_frame: - return shape.text_frame.text + text_frame = shape.text_frame + texts = [] + for paragraph in text_frame.paragraphs: + if paragraph.text.strip(): + texts.append(self.__get_bulleted_text(paragraph)) + return "\n".join(texts) if shape.shape_type == 6: texts = [] @@ -65,4 +77,4 @@ class RAGFlowPptParser: logging.exception(e) txts.append("\n".join(texts)) - return txts + return txts \ No newline at end of file