Fix: add the validation for parser_config. (#5755)
### What problem does this PR solve?

#5719

### Type of change

- [x] Bug Fix (non-breaking change which fixes an issue)
This commit is contained in:
parent
b1bbb9e210
commit
da3f279495
@ -30,7 +30,7 @@ from api.utils.api_utils import (
|
|||||||
token_required,
|
token_required,
|
||||||
get_error_data_result,
|
get_error_data_result,
|
||||||
valid,
|
valid,
|
||||||
get_parser_config,
|
get_parser_config, valid_parser_config,
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
@ -89,6 +89,7 @@ def create(tenant_id):
|
|||||||
permission = req.get("permission")
|
permission = req.get("permission")
|
||||||
chunk_method = req.get("chunk_method")
|
chunk_method = req.get("chunk_method")
|
||||||
parser_config = req.get("parser_config")
|
parser_config = req.get("parser_config")
|
||||||
|
valid_parser_config(parser_config)
|
||||||
valid_permission = ["me", "team"]
|
valid_permission = ["me", "team"]
|
||||||
valid_chunk_method = [
|
valid_chunk_method = [
|
||||||
"naive",
|
"naive",
|
||||||
@ -323,6 +324,7 @@ def update(tenant_id, dataset_id):
|
|||||||
permission = req.get("permission")
|
permission = req.get("permission")
|
||||||
chunk_method = req.get("chunk_method")
|
chunk_method = req.get("chunk_method")
|
||||||
parser_config = req.get("parser_config")
|
parser_config = req.get("parser_config")
|
||||||
|
valid_parser_config(parser_config)
|
||||||
valid_permission = ["me", "team"]
|
valid_permission = ["me", "team"]
|
||||||
valid_chunk_method = [
|
valid_chunk_method = [
|
||||||
"naive",
|
"naive",
|
||||||
|
|||||||
@ -371,3 +371,32 @@ def get_parser_config(chunk_method, parser_config):
|
|||||||
"picture": None}
|
"picture": None}
|
||||||
parser_config = key_mapping[chunk_method]
|
parser_config = key_mapping[chunk_method]
|
||||||
return parser_config
|
return parser_config
|
||||||
|
|
||||||
|
|
||||||
|
def valid_parser_config(parser_config):
    """Validate a user-supplied ``parser_config`` mapping.

    An empty or missing configuration (``None``, ``{}``) is accepted as-is.
    Otherwise every key must belong to the known set of options, the numeric
    options must be integers inside their allowed range, and ``html4excel``
    must be a boolean.

    Args:
        parser_config: The ``parser_config`` value taken from the request
            payload, or a falsy value when none was supplied.

    Returns:
        None. The function only validates; it never modifies its argument.

    Raises:
        AssertionError: If ``parser_config`` is not a dict, contains an
            unknown key, or holds a value of the wrong type or out of range.
            The error is raised explicitly instead of through ``assert``
            statements so the checks still run under ``python -O``.
    """
    if not parser_config:
        return

    # A truthy non-dict (e.g. a string or list from the request body) would
    # otherwise fail later with an unrelated AttributeError.
    if not isinstance(parser_config, dict):
        raise AssertionError("Abnormal 'parser_config'. It should be a dictionary.")

    scopes = {
        "chunk_token_num",
        "delimiter",
        "raptor",
        "graphrag",
        "layout_recognize",
        "task_page_size",
        "pages",
        "html4excel",
        "auto_keywords",
        "auto_questions",
        "tag_kb_ids",
        "topn_tags",
        "filename_embd_weight",
    }
    for k in parser_config:
        if k not in scopes:
            raise AssertionError(f"Abnormal 'parser_config'. Invalid key: {k}")

    # (key, inclusive lower bound, exclusive upper bound, error message).
    # The lower bound doubles as the default used when the key is absent,
    # so a missing option always passes its own check.
    int_ranges = (
        ("chunk_token_num", 1, 100000000,
         "chunk_token_num should be in range from 1 to 100000000"),
        ("task_page_size", 1, 100000000,
         "task_page_size should be in range from 1 to 100000000"),
        ("auto_keywords", 0, 32,
         "auto_keywords should be in range from 0 to 32"),
        ("auto_questions", 0, 10,
         "auto_questions should be in range from 0 to 10"),
        ("topn_tags", 0, 10,
         "topn_tags should be in range from 0 to 10"),
    )
    for key, low, high, message in int_ranges:
        value = parser_config.get(key, low)
        # bool is a subclass of int, so it has to be excluded explicitly;
        # floats, strings and None are rejected here with the proper message
        # rather than slipping through or raising TypeError on comparison.
        if isinstance(value, bool) or not isinstance(value, int):
            raise AssertionError(message)
        if not low <= value < high:
            raise AssertionError(message)

    if not isinstance(parser_config.get("html4excel", False), bool):
        raise AssertionError("html4excel should be True or False")
|
||||||
|
|||||||
@ -178,7 +178,7 @@ class TestAdvancedConfigurations:
|
|||||||
|
|
||||||
@pytest.mark.parametrize("name, chunk_method, parser_config, expected_code", [
|
@pytest.mark.parametrize("name, chunk_method, parser_config, expected_code", [
|
||||||
("naive_default", "naive",
|
("naive_default", "naive",
|
||||||
{"chunk_token_count": 128,
|
{"chunk_token_num": 128,
|
||||||
"layout_recognize": "DeepDOC",
|
"layout_recognize": "DeepDOC",
|
||||||
"html4excel": False,
|
"html4excel": False,
|
||||||
"delimiter": "\n!?。;!?",
|
"delimiter": "\n!?。;!?",
|
||||||
@ -187,20 +187,20 @@ class TestAdvancedConfigurations:
|
|||||||
},
|
},
|
||||||
0),
|
0),
|
||||||
("naive_empty", "naive", {}, 0),
|
("naive_empty", "naive", {}, 0),
|
||||||
pytest.param("naive_chunk_token_count_negative", "naive",
|
pytest.param("naive_chunk_token_num_negative", "naive",
|
||||||
{"chunk_token_count": -1},
|
{"chunk_token_num": -1},
|
||||||
102, marks=pytest.mark.xfail(reason='issue#5719')),
|
102, marks=pytest.mark.xfail(reason='issue#5719')),
|
||||||
pytest.param("naive_chunk_token_count_zero", "naive",
|
pytest.param("naive_chunk_token_num_zero", "naive",
|
||||||
{"chunk_token_count": 0},
|
{"chunk_token_num": 0},
|
||||||
102, marks=pytest.mark.xfail(reason='issue#5719')),
|
102, marks=pytest.mark.xfail(reason='issue#5719')),
|
||||||
pytest.param("naive_chunk_token_count_float", "naive",
|
pytest.param("naive_chunk_token_num_float", "naive",
|
||||||
{"chunk_token_count": 3.14},
|
{"chunk_token_num": 3.14},
|
||||||
102, marks=pytest.mark.xfail(reason='issue#5719')),
|
102, marks=pytest.mark.xfail(reason='issue#5719')),
|
||||||
pytest.param("naive_chunk_token_count_max", "naive",
|
pytest.param("naive_chunk_token_num_max", "naive",
|
||||||
{"chunk_token_count": 1024*1024*1024},
|
{"chunk_token_num": 1024*1024*1024},
|
||||||
102, marks=pytest.mark.xfail(reason='issue#5719')),
|
102, marks=pytest.mark.xfail(reason='issue#5719')),
|
||||||
pytest.param("naive_chunk_token_count_str", "naive",
|
pytest.param("naive_chunk_token_num_str", "naive",
|
||||||
{"chunk_token_count": '1024'},
|
{"chunk_token_num": '1024'},
|
||||||
102, marks=pytest.mark.xfail(reason='issue#5719')),
|
102, marks=pytest.mark.xfail(reason='issue#5719')),
|
||||||
("naive_layout_recognize_DeepDOC", "naive",
|
("naive_layout_recognize_DeepDOC", "naive",
|
||||||
{"layout_recognize": "DeepDOC"}, 0),
|
{"layout_recognize": "DeepDOC"}, 0),
|
||||||
@ -231,10 +231,6 @@ class TestAdvancedConfigurations:
|
|||||||
102, marks=pytest.mark.xfail(reason='issue#5719')),
|
102, marks=pytest.mark.xfail(reason='issue#5719')),
|
||||||
("naive_raptor_true", "naive", {"raptor": {"use_raptor": True}}, 0),
|
("naive_raptor_true", "naive", {"raptor": {"use_raptor": True}}, 0),
|
||||||
("naive_raptor_false", "naive", {"raptor": {"use_raptor": False}}, 0),
|
("naive_raptor_false", "naive", {"raptor": {"use_raptor": False}}, 0),
|
||||||
("knowledge_graph_entity_types_default", "knowledge_graph", {
|
|
||||||
"entity_types": ["organization", "person", "location", "event", "time"]}, 0),
|
|
||||||
pytest.param("knowledge_graph_entity_types_not_list", "knowledge_graph", {
|
|
||||||
"entity_types": "organization,person,location,event,time"}, 102, marks=pytest.mark.xfail(reason='issue#5719'))
|
|
||||||
])
|
])
|
||||||
def test_parser_configs(self, get_http_api_auth, name, chunk_method, parser_config, expected_code):
|
def test_parser_configs(self, get_http_api_auth, name, chunk_method, parser_config, expected_code):
|
||||||
payload = {
|
payload = {
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user