Fix division-by-zero issue. (#4784)

### What problem does this PR solve?

#4779

### Type of change

- [x] Bug Fix (non-breaking change which fixes an issue)
This commit is contained in:
Kevin Hu 2025-02-08 10:36:26 +08:00 committed by GitHub
parent ccb72e6787
commit f374dd38b6
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194

View File

@ -69,6 +69,7 @@ def find_codec(blob):
return "utf-8" return "utf-8"
QUESTION_PATTERN = [ QUESTION_PATTERN = [
r"第([零一二三四五六七八九十百0-9]+)问", r"第([零一二三四五六七八九十百0-9]+)问",
r"第([零一二三四五六七八九十百0-9]+)条", r"第([零一二三四五六七八九十百0-9]+)条",
@ -83,6 +84,7 @@ QUESTION_PATTERN = [
r"QUESTION ([0-9]+)", r"QUESTION ([0-9]+)",
] ]
def has_qbullet(reg, box, last_box, last_index, last_bull, bull_x0_list): def has_qbullet(reg, box, last_box, last_index, last_bull, bull_x0_list):
section, last_section = box['text'], last_box['text'] section, last_section = box['text'], last_box['text']
q_reg = r'(\w|\W)*?(?:|\?|\n|$)+' q_reg = r'(\w|\W)*?(?:|\?|\n|$)+'
@ -125,6 +127,7 @@ def has_qbullet(reg, box, last_box, last_index, last_bull, bull_x0_list):
return has_bull, index return has_bull, index
return None, last_index return None, last_index
def index_int(index_str): def index_int(index_str):
res = -1 res = -1
try: try:
@ -142,6 +145,7 @@ def index_int(index_str):
return -1 return -1
return res return res
def qbullets_category(sections): def qbullets_category(sections):
global QUESTION_PATTERN global QUESTION_PATTERN
hits = [0] * len(QUESTION_PATTERN) hits = [0] * len(QUESTION_PATTERN)
@ -230,7 +234,10 @@ def is_english(texts):
return True return True
return False return False
def is_chinese(text): def is_chinese(text):
if not text:
return False
chinese = 0 chinese = 0
for ch in text: for ch in text:
if '\u4e00' <= ch <= '\u9fff': if '\u4e00' <= ch <= '\u9fff':
@ -239,6 +246,7 @@ def is_chinese(text):
return True return True
return False return False
def tokenize(d, t, eng): def tokenize(d, t, eng):
d["content_with_weight"] = t d["content_with_weight"] = t
t = re.sub(r"</?(table|td|caption|tr|th)( [^<>]{0,12})?>", " ", t) t = re.sub(r"</?(table|td|caption|tr|th)( [^<>]{0,12})?>", " ", t)
@ -416,7 +424,6 @@ def hierarchical_merge(bull, sections, depth):
bullets_size = len(BULLET_PATTERN[bull]) bullets_size = len(BULLET_PATTERN[bull])
levels = [[] for _ in range(bullets_size + 2)] levels = [[] for _ in range(bullets_size + 2)]
for i, (txt, layout) in enumerate(sections): for i, (txt, layout) in enumerate(sections):
for j, p in enumerate(BULLET_PATTERN[bull]): for j, p in enumerate(BULLET_PATTERN[bull]):
if re.match(p, txt.strip()): if re.match(p, txt.strip()):
@ -594,4 +601,3 @@ def naive_merge_docx(sections, chunk_token_num=128, delimiter="\n。"):
add_chunk(sec, image, '') add_chunk(sec, image, '')
return cks, images return cks, images