From 101b8ff813faaa0edc952d58bd45e52c928b38e8 Mon Sep 17 00:00:00 2001 From: ly0303521 <449043172@qq.com> Date: Thu, 19 Dec 2024 17:30:26 +0800 Subject: [PATCH] =?UTF-8?q?fix=20chunk=20method=20"Table"=20losing=20conte?= =?UTF-8?q?nt=20when=20the=20Excel=20file=20has=20multi=E2=80=A6=20(#4123)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit …ple sheets ### What problem does this PR solve? Discussed in https://github.com/infiniflow/ragflow/pull/4102 - In excel_parser.py, `total` is meant to be the total number of rows in the Excel file, but the function returned during the first iteration of the sheet loop, which led to a wrong `to_page`. - In table.py, when the Excel file has multiple sheets, it is divided into multiple parts, each of size 3000; `data` may be empty because its content was already recorded in a previous iteration. ### Type of change - [x] Bug Fix (non-breaking change which fixes an issue) --- deepdoc/parser/excel_parser.py | 2 +- rag/app/table.py | 2 ++ 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/deepdoc/parser/excel_parser.py b/deepdoc/parser/excel_parser.py index 1d23978b..77cc7a05 100644 --- a/deepdoc/parser/excel_parser.py +++ b/deepdoc/parser/excel_parser.py @@ -90,7 +90,7 @@ class RAGFlowExcelParser: for sheetname in wb.sheetnames: ws = wb[sheetname] total += len(list(ws.rows)) - return total + return total if fnm.split(".")[-1].lower() in ["csv", "txt"]: encoding = find_codec(binary) diff --git a/rag/app/table.py b/rag/app/table.py index d7ba35a6..e28b882d 100644 --- a/rag/app/table.py +++ b/rag/app/table.py @@ -66,6 +66,8 @@ class Excel(ExcelParser): continue data.append(row) done += 1 + if np.array(data).size == 0: + continue res.append(pd.DataFrame(np.array(data), columns=headers)) callback(0.3, ("Extract records: {}~{}".format(from_page + 1, min(to_page, from_page + rn)) + (