Fix #3230: When parsing a docx file with the Book parsing method, to_page is always -1, resulting in a block count of 0 even when parsing succeeds (#3249)
### What problem does this PR solve?

When parsing a docx file using the Book parsing method, `to_page` is always -1, resulting in a block count of 0 even if parsing is successful.

Fixes #3230

### Type of change

- [x] Bug Fix (non-breaking change which fixes an issue)

Co-authored-by: Kevin Hu <kevinhu.sh@gmail.com>
This commit is contained in:
parent
7c0d28b62d
commit
9c6cc20356
@ -840,7 +840,7 @@ class Task(DataBaseModel):
|
||||
doc_id = CharField(max_length=32, null=False, index=True)
|
||||
from_page = IntegerField(default=0)
|
||||
|
||||
to_page = IntegerField(default=-1)
|
||||
to_page = IntegerField(default=100000000)
|
||||
|
||||
begin_at = DateTimeField(null=True, index=True)
|
||||
process_duation = FloatField(default=0)
|
||||
|
||||
@ -110,7 +110,7 @@ class RAGFlowDocxParser:
|
||||
return lines
|
||||
return ["\n".join(lines)]
|
||||
|
||||
def __call__(self, fnm, from_page=0, to_page=100000):
|
||||
def __call__(self, fnm, from_page=0, to_page=100000000):
|
||||
self.doc = Document(fnm) if isinstance(
|
||||
fnm, str) else Document(BytesIO(fnm))
|
||||
pn = 0 # parsed page
|
||||
@ -130,7 +130,7 @@ class RAGFlowDocxParser:
|
||||
if 'lastRenderedPageBreak' in run._element.xml:
|
||||
pn += 1
|
||||
|
||||
secs.append(("".join(runs_within_single_paragraph), p.style.name)) # then concat run.text as part of the paragraph
|
||||
secs.append(("".join(runs_within_single_paragraph), p.style.name if hasattr(p.style, 'name') else '')) # then concat run.text as part of the paragraph
|
||||
|
||||
tbls = [self.__extract_table_content(tb) for tb in self.doc.tables]
|
||||
return secs, tbls
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user