From 101b8ff813faaa0edc952d58bd45e52c928b38e8 Mon Sep 17 00:00:00 2001
From: ly0303521 <449043172@qq.com>
Date: Thu, 19 Dec 2024 17:30:26 +0800
Subject: [PATCH] =?UTF-8?q?fix=20chunk=20method=20"Table"=20losing=20conte?=
 =?UTF-8?q?nt=20when=20the=20Excel=20file=20has=20multi=E2=80=A6=20(#4123)?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

…ple sheets

### What problem does this PR solve?
Discussed in https://github.com/infiniflow/ragflow/pull/4102:
- In excel_parser.py, `total` is meant to be the total number of rows
across all sheets of the Excel file, but it was returned inside the
first iteration of the sheet loop, which led to a wrong `to_page`.
- In table.py, when an Excel file has multiple sheets, it is divided
into multiple parts of 3000 rows each, so `data` may be empty for a
sheet because its rows were already recorded in a previous iteration.
### Type of change

- [x] Bug Fix (non-breaking change which fixes an issue)
---
 deepdoc/parser/excel_parser.py | 2 +-
 rag/app/table.py               | 2 ++
 2 files changed, 3 insertions(+), 1 deletion(-)

diff --git a/deepdoc/parser/excel_parser.py b/deepdoc/parser/excel_parser.py
index 1d23978b..77cc7a05 100644
--- a/deepdoc/parser/excel_parser.py
+++ b/deepdoc/parser/excel_parser.py
@@ -90,7 +90,7 @@ class RAGFlowExcelParser:
             for sheetname in wb.sheetnames:
                 ws = wb[sheetname]
                 total += len(list(ws.rows))
-                return total
+            return total
 
         if fnm.split(".")[-1].lower() in ["csv", "txt"]:
             encoding = find_codec(binary)
diff --git a/rag/app/table.py b/rag/app/table.py
index d7ba35a6..e28b882d 100644
--- a/rag/app/table.py
+++ b/rag/app/table.py
@@ -66,6 +66,8 @@ class Excel(ExcelParser):
                     continue
                 data.append(row)
                 done += 1
+            if np.array(data).size == 0:
+                continue
             res.append(pd.DataFrame(np.array(data), columns=headers))
 
         callback(0.3, ("Extract records: {}~{}".format(from_page + 1, min(to_page, from_page + rn)) + (