refine error log while chunking (#1937)

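### What problem does this PR solve?

Makes failures during document processing easier to diagnose. In `build`, fetching the file from MinIO now has its own `try`/`except`, which reports timeouts and missing files separately from chunking failures. Errors raised while chunking are reported as "Internal server error while chunking", and the "Chunkking" typo in the existing log messages is corrected to "Chunking". In `upload_and_parse`, `tenant_id` and `lang` are now passed to the chunker.
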

### Type of change

- [x] Refactoring
This commit is contained in:
Kevin Hu 2024-08-14 11:09:07 +08:00 committed by GitHub
parent d73a75506e
commit da8802d010
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
2 changed files with 22 additions and 15 deletions

View File

@ -501,7 +501,9 @@ def upload_and_parse():
"callback": dummy, "callback": dummy,
"parser_config": parser_config, "parser_config": parser_config,
"from_page": 0, "from_page": 0,
"to_page": 100000 "to_page": 100000,
"tenant_id": kb.tenant_id,
"lang": kb.language
} }
threads.append(exe.submit(FACTORY.get(d["parser_id"], naive).chunk, d["name"], blob, **kwargs)) threads.append(exe.submit(FACTORY.get(d["parser_id"], naive).chunk, d["name"], blob, **kwargs))

View File

@ -146,27 +146,32 @@ def build(row):
binary = get_minio_binary(bucket, name) binary = get_minio_binary(bucket, name)
cron_logger.info( cron_logger.info(
"From minio({}) {}/{}".format(timer() - st, row["location"], row["name"])) "From minio({}) {}/{}".format(timer() - st, row["location"], row["name"]))
except TimeoutError as e:
callback(-1, f"Internal server error: Fetch file from minio timeout. Could you try it again.")
cron_logger.error(
"Minio {}/{}: Fetch file from minio timeout.".format(row["location"], row["name"]))
return
except Exception as e:
if re.search("(No such file|not found)", str(e)):
callback(-1, "Can not find file <%s> from minio. Could you try it again?" % row["name"])
else:
callback(-1, f"Get file from minio: %s" %
str(e).replace("'", ""))
traceback.print_exc()
return
try:
cks = chunker.chunk(row["name"], binary=binary, from_page=row["from_page"], cks = chunker.chunk(row["name"], binary=binary, from_page=row["from_page"],
to_page=row["to_page"], lang=row["language"], callback=callback, to_page=row["to_page"], lang=row["language"], callback=callback,
kb_id=row["kb_id"], parser_config=row["parser_config"], tenant_id=row["tenant_id"]) kb_id=row["kb_id"], parser_config=row["parser_config"], tenant_id=row["tenant_id"])
cron_logger.info( cron_logger.info(
"Chunkking({}) {}/{}".format(timer() - st, row["location"], row["name"])) "Chunking({}) {}/{}".format(timer() - st, row["location"], row["name"]))
except TimeoutError as e:
callback(-1, f"Internal server error: Fetch file timeout. Could you try it again.")
cron_logger.error(
"Chunkking {}/{}: Fetch file timeout.".format(row["location"], row["name"]))
return
except Exception as e: except Exception as e:
if re.search("(No such file|not found)", str(e)): callback(-1, f"Internal server error while chunking: %s" %
callback(-1, "Can not find file <%s>" % row["name"])
else:
callback(-1, f"Internal server error: %s" %
str(e).replace("'", "")) str(e).replace("'", ""))
traceback.print_exc()
cron_logger.error( cron_logger.error(
"Chunkking {}/{}: {}".format(row["location"], row["name"], str(e))) "Chunking {}/{}: {}".format(row["location"], row["name"], str(e)))
traceback.print_exc()
return return
docs = [] docs = []