From 36f2d7b7977774893dda3c79f172a28d62756ac0 Mon Sep 17 00:00:00 2001 From: KevinHuSh Date: Tue, 2 Apr 2024 10:51:21 +0800 Subject: [PATCH] To avoid assertion while no rows in excel (#197) ### What problem does this PR solve? _Briefly describe what this PR aims to solve. Include background context that will help reviewers understand the purpose of the PR._ Issue link: [#196](https://github.com/infiniflow/ragflow/issues/196) ### Type of change - [x] Bug Fix (non-breaking change which fixes an issue) - [ ] New Feature (non-breaking change which adds functionality) - [ ] Breaking Change (fix or feature that could cause existing functionality not to work as expected) - [ ] Documentation Update - [ ] Refactoring - [ ] Performance Improvement - [ ] Test cases - [ ] Python SDK impacted, Need to update PyPI - [ ] Other (please describe): --- deepdoc/parser/excel_parser.py | 2 ++ rag/app/manual.py | 1 + rag/app/one.py | 1 + rag/app/table.py | 1 + 4 files changed, 5 insertions(+) diff --git a/deepdoc/parser/excel_parser.py b/deepdoc/parser/excel_parser.py index 7d470f3c..89cabb5d 100644 --- a/deepdoc/parser/excel_parser.py +++ b/deepdoc/parser/excel_parser.py @@ -14,6 +14,7 @@ class HuExcelParser: for sheetname in wb.sheetnames: ws = wb[sheetname] rows = list(ws.rows) + if not rows:continue tb += f"<table><caption>{sheetname}</caption><tr>" for t in list(rows[0]): tb += f"<th>{t.value}</th>" @@ -38,6 +39,7 @@ class HuExcelParser: for sheetname in wb.sheetnames: ws = wb[sheetname] rows = list(ws.rows) + if not rows:continue ti = list(rows[0]) for r in list(rows[1:]): l = [] diff --git a/rag/app/manual.py b/rag/app/manual.py index 234ff9c0..284e3d6e 100644 --- a/rag/app/manual.py +++ b/rag/app/manual.py @@ -109,6 +109,7 @@ def chunk(filename, binary=None, from_page=0, to_page=100000, sections = [(txt, sec_ids[i], poss) for i, (txt, _, poss) in enumerate(sections)] for (img, rows), poss in tbls: + if not rows:continue sections.append((rows if isinstance(rows, str) else rows[0], -1, [(p[0] + 1 - from_page, p[1], p[2], 
p[3], p[4]) for p in poss])) diff --git a/rag/app/one.py b/rag/app/one.py index bd904610..8c08e6ac 100644 --- a/rag/app/one.py +++ b/rag/app/one.py @@ -44,6 +44,7 @@ class Pdf(PdfParser): sections = [(b["text"], self.get_position(b, zoomin)) for i, b in enumerate(self.boxes)] for (img, rows), poss in tbls: + if not rows:continue sections.append((rows if isinstance(rows, str) else rows[0], [(p[0] + 1 - from_page, p[1], p[2], p[3], p[4]) for p in poss])) return [(txt, "") for txt, _ in sorted(sections, key=lambda x: ( diff --git a/rag/app/table.py b/rag/app/table.py index 3d105270..29fa2546 100644 --- a/rag/app/table.py +++ b/rag/app/table.py @@ -40,6 +40,7 @@ class Excel(ExcelParser): for sheetname in wb.sheetnames: ws = wb[sheetname] rows = list(ws.rows) + if not rows:continue headers = [cell.value for cell in rows[0]] missed = set([i for i, h in enumerate(headers) if h is None]) headers = [