From 79482ff6724f87b79c07ea77122937c72e0dd3aa Mon Sep 17 00:00:00 2001
From: Stephen Hu <stephenhu@seismic.com>
Date: Mon, 17 Mar 2025 17:02:39 +0800
Subject: [PATCH] Refa: Improve ppt_parser to better handle lists (#6162)

### What problem does this PR solve?
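This pull request (PR) adds code to the PPTX parser so that text in list
format is represented more precisely: each non-empty paragraph is emitted
on its own line, and bulleted or auto-numbered paragraphs are indented by
their list level and marked with a leading `.`.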

### Type of change

- [ ] Bug Fix (non-breaking change which fixes an issue)
- [x] New Feature (non-breaking change which adds functionality)
- [ ] Documentation Update
- [x] Refactoring
- [ ] Performance Improvement
- [ ] Other (please describe):
---
 deepdoc/parser/ppt_parser.py | 16 ++++++++++++++--
 1 file changed, 14 insertions(+), 2 deletions(-)

diff --git a/deepdoc/parser/ppt_parser.py b/deepdoc/parser/ppt_parser.py
index 25e8f846..5cbd1330 100644
--- a/deepdoc/parser/ppt_parser.py
+++ b/deepdoc/parser/ppt_parser.py
@@ -23,6 +23,13 @@ class RAGFlowPptParser:
     def __init__(self):
         super().__init__()
 
+    def __get_bulleted_text(self, paragraph):
+        """Return paragraph text; a:buChar/a:buAutoNum paragraphs get a level indent and '.' prefix."""
+        p_elm = paragraph._p  # underlying <a:p> XML element; bullet markers live in its a:pPr child
+        if p_elm.xpath("./a:pPr/a:buChar") or p_elm.xpath("./a:pPr/a:buAutoNum"):
+            return f"{'  ' * paragraph.level}.{paragraph.text}"
+        return paragraph.text
+
     def __extract(self, shape):
         if shape.shape_type == 19:
             tb = shape.table
@@ -33,7 +40,12 @@ class RAGFlowPptParser:
             return "\n".join(rows)
 
         if shape.has_text_frame:
-            return shape.text_frame.text
+            text_frame = shape.text_frame
+            texts = []
+            for paragraph in text_frame.paragraphs:
+                if paragraph.text.strip():
+                    texts.append(self.__get_bulleted_text(paragraph))
+            return "\n".join(texts)
 
         if shape.shape_type == 6:
             texts = []
@@ -65,4 +77,4 @@ class RAGFlowPptParser:
                     logging.exception(e)
             txts.append("\n".join(texts))
 
-        return txts
+        return txts
\ No newline at end of file