Fix division-by-zero issue. (#4784)
### What problem does this PR solve?

#4779

### Type of change

- [x] Bug Fix (non-breaking change which fixes an issue)
This commit is contained in:
parent
ccb72e6787
commit
f374dd38b6
@ -69,6 +69,7 @@ def find_codec(blob):
|
||||
|
||||
return "utf-8"
|
||||
|
||||
|
||||
QUESTION_PATTERN = [
|
||||
r"第([零一二三四五六七八九十百0-9]+)问",
|
||||
r"第([零一二三四五六七八九十百0-9]+)条",
|
||||
@ -83,6 +84,7 @@ QUESTION_PATTERN = [
|
||||
r"QUESTION ([0-9]+)",
|
||||
]
|
||||
|
||||
|
||||
def has_qbullet(reg, box, last_box, last_index, last_bull, bull_x0_list):
|
||||
section, last_section = box['text'], last_box['text']
|
||||
q_reg = r'(\w|\W)*?(?:?|\?|\n|$)+'
|
||||
@ -125,6 +127,7 @@ def has_qbullet(reg, box, last_box, last_index, last_bull, bull_x0_list):
|
||||
return has_bull, index
|
||||
return None, last_index
|
||||
|
||||
|
||||
def index_int(index_str):
|
||||
res = -1
|
||||
try:
|
||||
@ -142,6 +145,7 @@ def index_int(index_str):
|
||||
return -1
|
||||
return res
|
||||
|
||||
|
||||
def qbullets_category(sections):
|
||||
global QUESTION_PATTERN
|
||||
hits = [0] * len(QUESTION_PATTERN)
|
||||
@ -230,7 +234,10 @@ def is_english(texts):
|
||||
return True
|
||||
return False
|
||||
|
||||
|
||||
def is_chinese(text):
|
||||
if not text:
|
||||
return False
|
||||
chinese = 0
|
||||
for ch in text:
|
||||
if '\u4e00' <= ch <= '\u9fff':
|
||||
@ -239,6 +246,7 @@ def is_chinese(text):
|
||||
return True
|
||||
return False
|
||||
|
||||
|
||||
def tokenize(d, t, eng):
|
||||
d["content_with_weight"] = t
|
||||
t = re.sub(r"</?(table|td|caption|tr|th)( [^<>]{0,12})?>", " ", t)
|
||||
@ -416,7 +424,6 @@ def hierarchical_merge(bull, sections, depth):
|
||||
bullets_size = len(BULLET_PATTERN[bull])
|
||||
levels = [[] for _ in range(bullets_size + 2)]
|
||||
|
||||
|
||||
for i, (txt, layout) in enumerate(sections):
|
||||
for j, p in enumerate(BULLET_PATTERN[bull]):
|
||||
if re.match(p, txt.strip()):
|
||||
@ -594,4 +601,3 @@ def naive_merge_docx(sections, chunk_token_num=128, delimiter="\n。;!?"):
|
||||
add_chunk(sec, image, '')
|
||||
|
||||
return cks, images
|
||||
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user