diff --git a/api/apps/sdk/dataset.py b/api/apps/sdk/dataset.py index ff20e317..f39c6ca9 100644 --- a/api/apps/sdk/dataset.py +++ b/api/apps/sdk/dataset.py @@ -30,7 +30,7 @@ from api.utils.api_utils import ( token_required, get_error_data_result, valid, - get_parser_config, + get_parser_config, valid_parser_config, ) @@ -89,6 +89,7 @@ def create(tenant_id): permission = req.get("permission") chunk_method = req.get("chunk_method") parser_config = req.get("parser_config") + valid_parser_config(parser_config) valid_permission = ["me", "team"] valid_chunk_method = [ "naive", @@ -323,6 +324,7 @@ def update(tenant_id, dataset_id): permission = req.get("permission") chunk_method = req.get("chunk_method") parser_config = req.get("parser_config") + valid_parser_config(parser_config) valid_permission = ["me", "team"] valid_chunk_method = [ "naive", diff --git a/api/utils/api_utils.py b/api/utils/api_utils.py index 070cf397..7cc73c7b 100644 --- a/api/utils/api_utils.py +++ b/api/utils/api_utils.py @@ -371,3 +371,32 @@ def get_parser_config(chunk_method, parser_config): "picture": None} parser_config = key_mapping[chunk_method] return parser_config + + +def valid_parser_config(parser_config): + if not parser_config: + return + scopes = set([ + "chunk_token_num", + "delimiter", + "raptor", + "graphrag", + "layout_recognize", + "task_page_size", + "pages", + "html4excel", + "auto_keywords", + "auto_questions", + "tag_kb_ids", + "topn_tags", + "filename_embd_weight" + ]) + for k in parser_config.keys(): + assert k in scopes, f"Abnormal 'parser_config'. Invalid key: {k}" + + assert 1 <= parser_config.get("chunk_token_num", 1) < 100000000, "chunk_token_num should be in range from 1 to 100000000" + assert 1 <= parser_config.get("task_page_size", 1) < 100000000, "task_page_size should be in range from 1 to 100000000" + assert 0 <= parser_config.get("auto_keywords", 0) < 32, "auto_keywords should be in range from 0 to 32" + assert 0 <= parser_config.get("auto_questions", 0) < 10, "auto_questions should be in range from 0 to 10" + assert 0 <= parser_config.get("topn_tags", 0) < 10, "topn_tags should be in range from 0 to 10" + assert isinstance(parser_config.get("html4excel", False), bool), "html4excel should be True or False" diff --git a/sdk/python/test/test_http_api/test_dataset_mangement/test_create_dataset.py b/sdk/python/test/test_http_api/test_dataset_mangement/test_create_dataset.py index 1c37d1bb..cb21cb06 100644 --- a/sdk/python/test/test_http_api/test_dataset_mangement/test_create_dataset.py +++ b/sdk/python/test/test_http_api/test_dataset_mangement/test_create_dataset.py @@ -178,7 +178,7 @@ class TestAdvancedConfigurations: @pytest.mark.parametrize("name, chunk_method, parser_config, expected_code", [ ("naive_default", "naive", - {"chunk_token_count": 128, + {"chunk_token_num": 128, "layout_recognize": "DeepDOC", "html4excel": False, "delimiter": "\n!?。;!?", @@ -187,20 +187,20 @@ class TestAdvancedConfigurations: }, 0), ("naive_empty", "naive", {}, 0), - pytest.param("naive_chunk_token_count_negative", "naive", - {"chunk_token_count": -1}, + pytest.param("naive_chunk_token_num_negative", "naive", + {"chunk_token_num": -1}, 102, marks=pytest.mark.xfail(reason='issue#5719')), - pytest.param("naive_chunk_token_count_zero", "naive", - {"chunk_token_count": 0}, + pytest.param("naive_chunk_token_num_zero", "naive", + {"chunk_token_num": 0}, 102, marks=pytest.mark.xfail(reason='issue#5719')), - pytest.param("naive_chunk_token_count_float", "naive", - {"chunk_token_count": 3.14}, + pytest.param("naive_chunk_token_num_float", "naive", + {"chunk_token_num": 3.14}, 102, marks=pytest.mark.xfail(reason='issue#5719')), - pytest.param("naive_chunk_token_count_max", "naive", - {"chunk_token_count": 1024*1024*1024}, + pytest.param("naive_chunk_token_num_max", "naive", + {"chunk_token_num": 1024*1024*1024}, 102, marks=pytest.mark.xfail(reason='issue#5719')), - pytest.param("naive_chunk_token_count_str", "naive", - {"chunk_token_count": '1024'}, + pytest.param("naive_chunk_token_num_str", "naive", + {"chunk_token_num": '1024'}, 102, marks=pytest.mark.xfail(reason='issue#5719')), ("naive_layout_recognize_DeepDOC", "naive", {"layout_recognize": "DeepDOC"}, 0), @@ -231,10 +231,6 @@ class TestAdvancedConfigurations: 102, marks=pytest.mark.xfail(reason='issue#5719')), ("naive_raptor_true", "naive", {"raptor": {"use_raptor": True}}, 0), ("naive_raptor_false", "naive", {"raptor": {"use_raptor": False}}, 0), - ("knowledge_graph_entity_types_default", "knowledge_graph", { - "entity_types": ["organization", "person", "location", "event", "time"]}, 0), - pytest.param("knowledge_graph_entity_types_not_list", "knowledge_graph", { - "entity_types": "organization,person,location,event,time"}, 102, marks=pytest.mark.xfail(reason='issue#5719')) ]) def test_parser_configs(self, get_http_api_auth, name, chunk_method, parser_config, expected_code): payload = {