diff --git a/api/apps/__init__.py b/api/apps/__init__.py
index 4fdeb963..04eb28c5 100644
--- a/api/apps/__init__.py
+++ b/api/apps/__init__.py
@@ -83,7 +83,7 @@ def register_page(page_path):
     sys.modules[module_name] = page
     spec.loader.exec_module(page)
     page_name = getattr(page, 'page_name', page_name)
-    url_prefix = f'/api/{API_VERSION}/{page_name}' if "/sdk/" in path else f'/{API_VERSION}/{page_name}'
+    url_prefix = f'/api/{API_VERSION}' if "/sdk/" in path else f'/{API_VERSION}/{page_name}'
     app.register_blueprint(page.manager, url_prefix=url_prefix)
     return url_prefix
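Reviewer note: a runnable sketch of the URL shapes the changed `register_page` line now produces, assuming `API_VERSION` is `"v1"` and an SDK page whose blueprint declares resource-style routes such as `@manager.route('/dataset', ...)` (values below are hypothetical):

```python
# Illustration only, not part of the patch.
API_VERSION, page_name = "v1", "dataset"

for path in ("/apps/sdk/dataset.py", "/apps/web/dataset.py"):
    # The line under review: SDK pages drop the page-name segment because
    # their routes now carry the resource name themselves.
    url_prefix = f'/api/{API_VERSION}' if "/sdk/" in path else f'/{API_VERSION}/{page_name}'
    print(path, "->", url_prefix)
# /apps/sdk/dataset.py -> /api/v1      (blueprint route '/dataset' -> /api/v1/dataset)
# /apps/web/dataset.py -> /v1/dataset  (non-SDK pages keep the old scheme)
```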
diff --git a/api/apps/sdk/dataset.py b/api/apps/sdk/dataset.py
index 94bdf638..269edb33 100644
--- a/api/apps/sdk/dataset.py
+++ b/api/apps/sdk/dataset.py
@@ -25,143 +25,146 @@
 from api.db.services.knowledgebase_service import KnowledgebaseService
 from api.db.services.user_service import TenantService
 from api.settings import RetCode
 from api.utils import get_uuid
-from api.utils.api_utils import get_json_result, token_required, get_data_error_result
+from api.utils.api_utils import get_result, token_required, get_error_data_result
 
-
-@manager.route('/save', methods=['POST'])
+@manager.route('/dataset', methods=['POST'])
 @token_required
-def save(tenant_id):
+def create(tenant_id):
     req = request.json
     e, t = TenantService.get_by_id(tenant_id)
-    if "id" not in req:
-        if "tenant_id" in req or "embedding_model" in req:
-            return get_data_error_result(
-                retmsg="Tenant_id or embedding_model must not be provided")
-        if "name" not in req:
-            return get_data_error_result(
-                retmsg="Name is not empty!")
-        req['id'] = get_uuid()
-        req["name"] = req["name"].strip()
-        if req["name"] == "":
-            return get_data_error_result(
-                retmsg="Name is not empty string!")
-        if KnowledgebaseService.query(name=req["name"], tenant_id=tenant_id, status=StatusEnum.VALID.value):
-            return get_data_error_result(
-                retmsg="Duplicated knowledgebase name in creating dataset.")
-        req["tenant_id"] = req['created_by'] = tenant_id
-        req['embedding_model'] = t.embd_id
-        key_mapping = {
-            "chunk_num": "chunk_count",
-            "doc_num": "document_count",
-            "parser_id": "parse_method",
-            "embd_id": "embedding_model"
-        }
-        mapped_keys = {new_key: req[old_key] for new_key, old_key in key_mapping.items() if old_key in req}
-        req.update(mapped_keys)
-        if not KnowledgebaseService.save(**req):
-            return get_data_error_result(retmsg="Create dataset error.(Database error)")
-        renamed_data = {}
-        e, k = KnowledgebaseService.get_by_id(req["id"])
-        for key, value in k.to_dict().items():
-            new_key = key_mapping.get(key, key)
-            renamed_data[new_key] = value
-        return get_json_result(data=renamed_data)
-    else:
-        invalid_keys = {"embd_id", "chunk_num", "doc_num", "parser_id"}
-        if any(key in req for key in invalid_keys):
-            return get_data_error_result(retmsg="The input parameters are invalid.")
+    if "tenant_id" in req or "embedding_model" in req:
+        return get_error_data_result(
+            retmsg="Tenant_id or embedding_model must not be provided")
+    chunk_count = req.get("chunk_count")
+    document_count = req.get("document_count")
+    if chunk_count or document_count:
+        return get_error_data_result(retmsg="chunk_count or document_count must be 0 or omitted")
+    if "name" not in req:
+        return get_error_data_result(
+            retmsg="Name is required!")
+    req['id'] = get_uuid()
+    req["name"] = req["name"].strip()
+    if req["name"] == "":
+        return get_error_data_result(
+            retmsg="Name can't be an empty string!")
+    if KnowledgebaseService.query(name=req["name"], tenant_id=tenant_id, status=StatusEnum.VALID.value):
+        return get_error_data_result(
+            retmsg="Duplicated knowledgebase name in creating dataset.")
+    req["tenant_id"] = req['created_by'] = tenant_id
+    req['embedding_model'] = t.embd_id
+    key_mapping = {
+        "chunk_num": "chunk_count",
+        "doc_num": "document_count",
+        "parser_id": "parse_method",
+        "embd_id": "embedding_model"
+    }
+    mapped_keys = {new_key: req[old_key] for new_key, old_key in key_mapping.items() if old_key in req}
+    req.update(mapped_keys)
+    if not KnowledgebaseService.save(**req):
+        return get_error_data_result(retmsg="Create dataset error.(Database error)")
+    renamed_data = {}
+    e, k = KnowledgebaseService.get_by_id(req["id"])
+    for key, value in k.to_dict().items():
+        new_key = key_mapping.get(key, key)
+        renamed_data[new_key] = value
+    return get_result(data=renamed_data)
 
-    if "tenant_id" in req:
-        if req["tenant_id"] != tenant_id:
-            return get_data_error_result(
-                retmsg="Can't change tenant_id.")
-
-    if "embedding_model" in req:
-        if req["embedding_model"] != t.embd_id:
-            return get_data_error_result(
-                retmsg="Can't change embedding_model.")
-        req.pop("embedding_model")
-
-    if not KnowledgebaseService.query(
-            created_by=tenant_id, id=req["id"]):
-        return get_json_result(
-            data=False, retmsg='You do not own the dataset.',
-            retcode=RetCode.OPERATING_ERROR)
-
-    if not req["id"]:
-        return get_data_error_result(
-            retmsg="id can not be empty.")
-    e, kb = KnowledgebaseService.get_by_id(req["id"])
-
-    if "chunk_count" in req:
-        if req["chunk_count"] != kb.chunk_num:
-            return get_data_error_result(
-                retmsg="Can't change chunk_count.")
-        req.pop("chunk_count")
-
-    if "document_count" in req:
-        if req['document_count'] != kb.doc_num:
-            return get_data_error_result(
-                retmsg="Can't change document_count.")
-        req.pop("document_count")
-
-    if "parse_method" in req:
-        if kb.chunk_num != 0 and req['parse_method'] != kb.parser_id:
-            return get_data_error_result(
-                retmsg="If chunk count is not 0, parse method is not changable.")
-        req['parser_id'] = req.pop('parse_method')
-    if "name" in req:
-        req["name"] = req["name"].strip()
-        if req["name"].lower() != kb.name.lower() \
-                and len(KnowledgebaseService.query(name=req["name"], tenant_id=tenant_id,
-                                                   status=StatusEnum.VALID.value)) > 0:
-            return get_data_error_result(
-                retmsg="Duplicated knowledgebase name in updating dataset.")
-
-    del req["id"]
-    if not KnowledgebaseService.update_by_id(kb.id, req):
-        return get_data_error_result(retmsg="Update dataset error.(Database error)")
-    return get_json_result(data=True)
-
-
-@manager.route('/delete', methods=['DELETE'])
+@manager.route('/dataset', methods=['DELETE'])
 @token_required
 def delete(tenant_id):
-    req = request.args
-    if "id" not in req:
-        return get_data_error_result(
-            retmsg="id is required")
-    kbs = KnowledgebaseService.query(
-        created_by=tenant_id, id=req["id"])
-    if not kbs:
-        return get_json_result(
-            data=False, retmsg='You do not own the dataset',
-            retcode=RetCode.OPERATING_ERROR)
+    req = request.json
+    names = req.get("names")
+    ids = req.get("ids")
+    if not ids and not names:
+        return get_error_data_result(
+            retmsg="ids or names is required")
+    id_list = []
+    if names:
+        for name in names:
+            kbs = KnowledgebaseService.query(name=name, tenant_id=tenant_id)
+            if not kbs:
+                return get_error_data_result(retmsg=f"You don't own the dataset {name}")
+            id_list.append(kbs[0].id)
+    if ids:
+        for kb_id in ids:
+            kbs = KnowledgebaseService.query(id=kb_id, tenant_id=tenant_id)
+            if not kbs:
+                return get_error_data_result(retmsg=f"You don't own the dataset {kb_id}")
+        id_list.extend(ids)
+    for kb_id in id_list:
+        for doc in DocumentService.query(kb_id=kb_id):
+            if not DocumentService.remove_document(doc, tenant_id):
+                return get_error_data_result(
+                    retmsg="Remove document error.(Database error)")
+            f2d = File2DocumentService.get_by_document_id(doc.id)
+            FileService.filter_delete([File.source_type == FileSource.KNOWLEDGEBASE, File.id == f2d[0].file_id])
+            File2DocumentService.delete_by_document_id(doc.id)
+        if not KnowledgebaseService.delete_by_id(kb_id):
+            return get_error_data_result(
+                retmsg="Delete dataset error.(Database error)")
+    return get_result(retcode=RetCode.SUCCESS)
 
-    for doc in DocumentService.query(kb_id=req["id"]):
-        if not DocumentService.remove_document(doc, kbs[0].tenant_id):
-            return get_data_error_result(
-                retmsg="Remove document error.(Database error)")
-        f2d = File2DocumentService.get_by_document_id(doc.id)
-        FileService.filter_delete([File.source_type == FileSource.KNOWLEDGEBASE, File.id == f2d[0].file_id])
-        File2DocumentService.delete_by_document_id(doc.id)
-
-    if not KnowledgebaseService.delete_by_id(req["id"]):
-        return get_data_error_result(
-            retmsg="Delete dataset error.(Database serror)")
-    return get_json_result(data=True)
-
-
-@manager.route('/list', methods=['GET'])
+@manager.route('/dataset/<dataset_id>', methods=['PUT'])
 @token_required
-def list_datasets(tenant_id):
+def update(tenant_id, dataset_id):
+    if not KnowledgebaseService.query(id=dataset_id, tenant_id=tenant_id):
+        return get_error_data_result(retmsg="You don't own the dataset")
+    req = request.json
+    e, t = TenantService.get_by_id(tenant_id)
+    invalid_keys = {"id", "embd_id", "chunk_num", "doc_num", "parser_id"}
+    if any(key in req for key in invalid_keys):
+        return get_error_data_result(retmsg="The input parameters are invalid.")
+    if "tenant_id" in req:
+        if req["tenant_id"] != tenant_id:
+            return get_error_data_result(
+                retmsg="Can't change tenant_id.")
+    if "embedding_model" in req:
+        if req["embedding_model"] != t.embd_id:
+            return get_error_data_result(
+                retmsg="Can't change embedding_model.")
+        req.pop("embedding_model")
+    e, kb = KnowledgebaseService.get_by_id(dataset_id)
+    if "chunk_count" in req:
+        if req["chunk_count"] != kb.chunk_num:
+            return get_error_data_result(
+                retmsg="Can't change chunk_count.")
+        req.pop("chunk_count")
+    if "document_count" in req:
+        if req['document_count'] != kb.doc_num:
+            return get_error_data_result(
+                retmsg="Can't change document_count.")
+        req.pop("document_count")
+    if "parse_method" in req:
+        if kb.chunk_num != 0 and req['parse_method'] != kb.parser_id:
+            return get_error_data_result(
+                retmsg="If chunk count is not 0, parse method is not changeable.")
+        req['parser_id'] = req.pop('parse_method')
+    if "name" in req:
+        req["name"] = req["name"].strip()
+        if req["name"].lower() != kb.name.lower() \
+                and len(KnowledgebaseService.query(name=req["name"], tenant_id=tenant_id,
+                                                   status=StatusEnum.VALID.value)) > 0:
+            return get_error_data_result(
+                retmsg="Duplicated knowledgebase name in updating dataset.")
+    if not KnowledgebaseService.update_by_id(kb.id, req):
+        return get_error_data_result(retmsg="Update dataset error.(Database error)")
+    return get_result(retcode=RetCode.SUCCESS)
 
+@manager.route('/dataset', methods=['GET'])
+@token_required
+def list_datasets(tenant_id):
+    id = request.args.get("id")
+    name = request.args.get("name")
+    kbs = KnowledgebaseService.query(id=id, name=name, status=1)
+    if not kbs:
+        return get_error_data_result(retmsg="The dataset doesn't exist")
     page_number = int(request.args.get("page", 1))
     items_per_page = int(request.args.get("page_size", 1024))
     orderby = request.args.get("orderby", "create_time")
     desc = bool(request.args.get("desc", True))
     tenants = TenantService.get_joined_tenants_by_user_id(tenant_id)
-    kbs = KnowledgebaseService.get_by_tenant_ids(
-        [m["tenant_id"] for m in tenants], tenant_id, page_number, items_per_page, orderby, desc)
+    kbs = KnowledgebaseService.get_list(
+        [m["tenant_id"] for m in tenants], tenant_id, page_number, items_per_page, orderby, desc, id, name)
     renamed_list = []
     for kb in kbs:
         key_mapping = {
@@ -175,50 +178,4 @@ def list_datasets(tenant_id):
             new_key = key_mapping.get(key, key)
             renamed_data[new_key] = value
         renamed_list.append(renamed_data)
-    return get_json_result(data=renamed_list)
-
-
-@manager.route('/detail', methods=['GET'])
-@token_required
-def detail(tenant_id):
-    req = request.args
-    key_mapping = {
-        "chunk_num": "chunk_count",
-        "doc_num": "document_count",
-        "parser_id": "parse_method",
-        "embd_id": "embedding_model"
-    }
-    renamed_data = {}
-    if "id" in req:
-        id = req["id"]
-        kb = KnowledgebaseService.query(created_by=tenant_id, id=req["id"])
-        if not kb:
-            return get_json_result(
-                data=False, retmsg='You do not own the dataset.',
-                retcode=RetCode.OPERATING_ERROR)
-        if "name" in req:
-            name = req["name"]
-            if kb[0].name != name:
-                return get_json_result(
-                    data=False, retmsg='You do not own the dataset.',
-                    retcode=RetCode.OPERATING_ERROR)
-        e, k = KnowledgebaseService.get_by_id(id)
-        for key, value in k.to_dict().items():
-            new_key = key_mapping.get(key, key)
-            renamed_data[new_key] = value
-        return get_json_result(data=renamed_data)
-    else:
-        if "name" in req:
-            name = req["name"]
-            e, k = KnowledgebaseService.get_by_name(kb_name=name, tenant_id=tenant_id)
-            if not e:
-                return get_json_result(
-                    data=False, retmsg='You do not own the dataset.',
-                    retcode=RetCode.OPERATING_ERROR)
-            for key, value in k.to_dict().items():
-                new_key = key_mapping.get(key, key)
-                renamed_data[new_key] = value
-            return get_json_result(data=renamed_data)
-        else:
-            return get_data_error_result(
-                retmsg="At least one of `id` or `name` must be provided.")
+    return get_result(data=renamed_list)
diff --git a/api/db/services/knowledgebase_service.py b/api/db/services/knowledgebase_service.py
index 2874ee26..9fe75d46 100644
--- a/api/db/services/knowledgebase_service.py
+++ b/api/db/services/knowledgebase_service.py
@@ -142,3 +142,27 @@ class KnowledgebaseService(CommonService):
     @DB.connection_context()
     def get_all_ids(cls):
         return [m["id"] for m in cls.model.select(cls.model.id).dicts()]
+
+    @classmethod
+    @DB.connection_context()
+    def get_list(cls, joined_tenant_ids, user_id,
+                 page_number, items_per_page, orderby, desc, id, name):
+        kbs = cls.model.select()
+        if id:
+            kbs = kbs.where(cls.model.id == id)
+        if name:
+            kbs = kbs.where(cls.model.name == name)
+        kbs = kbs.where(
+            ((cls.model.tenant_id.in_(joined_tenant_ids) & (cls.model.permission ==
+                                                            TenantPermission.TEAM.value)) | (
+                    cls.model.tenant_id == user_id))
+            & (cls.model.status == StatusEnum.VALID.value)
+        )
+        if desc:
+            kbs = kbs.order_by(cls.model.getter_by(orderby).desc())
+        else:
+            kbs = kbs.order_by(cls.model.getter_by(orderby).asc())
+
+        kbs = kbs.paginate(page_number, items_per_page)
+
+        return list(kbs.dicts())
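Reviewer note: a minimal sketch of how the `GET /dataset` handler above is expected to invoke the new `get_list` helper; all argument values here are hypothetical:

```python
# Illustration only, not part of the patch.
from api.db.services.knowledgebase_service import KnowledgebaseService

rows = KnowledgebaseService.get_list(
    joined_tenant_ids=["tenant_a", "tenant_b"],  # teams whose TEAM-permission datasets are visible
    user_id="tenant_a",     # datasets owned by the caller are always included
    page_number=1,
    items_per_page=50,
    orderby="create_time",
    desc=True,
    id=None,                # optional exact-match filters from the query string
    name=None,
)
print(rows)  # a plain list of dicts, one per knowledge base row
```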
diff --git a/api/http_api.md b/api/http_api.md
index f7cec545..80c1f81a 100644
--- a/api/http_api.md
+++ b/api/http_api.md
@@ -5,63 +5,134 @@
 
 **POST** `/api/v1/dataset`
 
-Creates a dataset with a name. If dataset of the same name already exists, the new dataset will be renamed by RAGFlow automatically.
+Creates a dataset.
 
 ### Request
 
 - Method: POST
-- URL: `/api/v1/dataset`
+- URL: `http://{address}/api/v1/dataset`
 - Headers:
   - `content-Type: application/json`
   - 'Authorization: Bearer {YOUR_ACCESS_TOKEN}'
 - Body:
-  - `"dataset_name"`: `string`
+  - `"id"`: `string`
+  - `"name"`: `string`
+  - `"avatar"`: `string`
   - `"tenant_id"`: `string`
+  - `"description"`: `string`
+  - `"language"`: `string`
   - `"embedding_model"`: `string`
-  - `"chunk_count"`: `integer`
+  - `"permission"`: `string`
   - `"document_count"`: `integer`
+  - `"chunk_count"`: `integer`
   - `"parse_method"`: `string`
+  - `"parser_config"`: `Dataset.ParserConfig`
 
 #### Request example
 
-```shell
+```bash
+# "id": id must not be provided.
+# "name": name is required and can't be duplicated.
+# "tenant_id": tenant_id must not be provided.
+# "embedding_model": embedding_model must not be provided.
+# "naive" means general.
 curl --request POST \
-     --url http://{address}/api/v1/dataset \
-     --header 'Content-Type: application/json' \
-     --header 'Authorization: Bearer {YOUR_ACCESS_TOKEN}' \
-     --data-binary '{
-     "dataset_name": "test",
-     "tenant_id": "4fb0cd625f9311efba4a0242ac120006",
-     "embedding_model": "BAAI/bge--zh-v1.5",
-     "chunk_count": 0,
-     "document_count": 0,
-     "parse_method": "general"
+     --url http://{address}/api/v1/dataset \
+     --header 'Content-Type: application/json' \
+     --header 'Authorization: Bearer {YOUR_ACCESS_TOKEN}' \
+     --data '{
+      "name": "test",
+      "chunk_count": 0,
+      "document_count": 0,
+      "parse_method": "naive"
 }'
```

#### Request parameters

-- `"dataset_name"`: (*Body parameter*)
+- `"id"`: (*Body parameter*)
+  The ID of the created dataset used to uniquely identify different datasets.
+  - If creating a dataset, `id` must not be provided.
+
+- `"name"`: (*Body parameter*)
   The name of the dataset, which must adhere to the following requirements:
-  - Maximum 65,535 characters.
+  - Required when creating a dataset and must be unique.
+  - If updating a dataset, `name` must still be unique.
+
+- `"avatar"`: (*Body parameter*)
+  Base64 encoding of the avatar.
+
 - `"tenant_id"`: (*Body parameter*)
-  The ID of the tenant.
+  The ID of the tenant associated with the dataset, used to link it with specific users.
+  - If creating a dataset, `tenant_id` must not be provided.
+  - If updating a dataset, `tenant_id` cannot be changed.
+
+- `"description"`: (*Body parameter*)
+  The description of the dataset.
+
+- `"language"`: (*Body parameter*)
+  The language setting for the dataset.
+
 - `"embedding_model"`: (*Body parameter*)
-  Embedding model used in the dataset.
-- `"chunk_count"`: (*Body parameter*)
-  Chunk count of the dataset.
+  Embedding model used in the dataset to generate vector embeddings.
+  - If creating a dataset, `embedding_model` must not be provided.
+  - If updating a dataset, `embedding_model` cannot be changed.
+
+- `"permission"`: (*Body parameter*)
+  Specifies who can manipulate the dataset.
+
 - `"document_count"`: (*Body parameter*)
-  Document count of the dataset.
-- `"parse_mehtod"`: (*Body parameter*)
-  Parsing method of the dataset.
+  Document count of the dataset.
+  - If updating a dataset, `document_count` cannot be changed.
+
+- `"chunk_count"`: (*Body parameter*)
+  Chunk count of the dataset.
+  - If updating a dataset, `chunk_count` cannot be changed.
+
+- `"parse_method"`: (*Body parameter*)
+  Parsing method of the dataset.
+  - If updating `parse_method`, `chunk_count` must be 0.
+
+- `"parser_config"`: (*Body parameter*)
+  The configuration settings for the dataset parser.

### Response

The successful response includes a JSON object like the following:

-```shell
+```json
 {
-    "code": 0
+    "code": 0,
+    "data": {
+        "avatar": null,
+        "chunk_count": 0,
+        "create_date": "Thu, 10 Oct 2024 05:57:37 GMT",
+        "create_time": 1728539857641,
+        "created_by": "69736c5e723611efb51b0242ac120007",
+        "description": null,
+        "document_count": 0,
+        "embedding_model": "BAAI/bge-large-zh-v1.5",
+        "id": "8d73076886cc11ef8c270242ac120006",
+        "language": "English",
+        "name": "test_1",
+        "parse_method": "naive",
+        "parser_config": {
+            "pages": [
+                [
+                    1,
+                    1000000
+                ]
+            ]
+        },
+        "permission": "me",
+        "similarity_threshold": 0.2,
+        "status": "1",
+        "tenant_id": "69736c5e723611efb51b0242ac120007",
+        "token_num": 0,
+        "update_date": "Thu, 10 Oct 2024 05:57:37 GMT",
+        "update_time": 1728539857641,
+        "vector_similarity_weight": 0.3
+    }
 }
```

The error response includes a JSON object like the following:

-```shell
+```json
 {
-    "code": 3016,
-    "message": "Can't connect database"
+    "code": 102,
+    "message": "Duplicated knowledgebase name in creating dataset."
 }
```

## Delete dataset

**DELETE** `/api/v1/dataset`

-Deletes a dataset by its id or name.
+Deletes datasets by ids or names.

### Request

- Method: DELETE
-- URL: `/api/v1/dataset/{dataset_id}`
+- URL: `http://{address}/api/v1/dataset`
- Headers:
  - `content-Type: application/json`
  - 'Authorization: Bearer {YOUR_ACCESS_TOKEN}'
+ - Body:
+   - `"names"`: `List[string]`
+   - `"ids"`: `List[string]`

#### Request example

-```shell
+```bash
+# Either "ids" or "names" must be provided.
 curl --request DELETE \
-     --url http://{address}/api/v1/dataset/0 \
-     --header 'Content-Type: application/json' \
-     --header 'Authorization: Bearer {YOUR_ACCESS_TOKEN}'
-     --data ' {
-     "names": ["ds1", "ds2"]
-     }'
+     --url http://{address}/api/v1/dataset \
+     --header 'Content-Type: application/json' \
+     --header 'Authorization: Bearer {YOUR_ACCESS_TOKEN}' \
+     --data '{
+     "names": ["test_1", "test_2"]
+     }'
```

#### Request parameters

@@ -118,7 +193,7 @@ curl --request DELETE \
 
 The successful response includes a JSON object like the following:
 
-```shell
+```json
 {
     "code": 0
 }
```

The error response includes a JSON object like the following:

-```shell
+```json
 {
-    "code": 3016,
-    "message": "Try to delete non-existent dataset."
+    "code": 102,
+    "message": "You don't own the dataset."
 }
```

## Update dataset

**PUT** `/api/v1/dataset/{dataset_id}`

Updates a dataset by its id.

### Request

- Method: PUT
-- URL: `/api/v1/dataset/{dataset_id}`
+- URL: `http://{address}/api/v1/dataset/{dataset_id}`
- Headers:
  - `content-Type: application/json`
  - 'Authorization: Bearer {YOUR_ACCESS_TOKEN}'
+ - Body: (Refer to "Create dataset" for the complete structure of the request body.)

#### Request example

-```shell
+```bash
+# "id": id is required.
+# "name": If you update name, it can't be duplicated.
+# "tenant_id": tenant_id can't be changed.
+# "embedding_model": embedding_model can't be changed.
+# "chunk_count": chunk_count can't be changed.
+# "document_count": document_count can't be changed.
+# "parse_method": parse_method can only be changed when chunk_count is 0.
+# "naive" means general.
 curl --request PUT \
-     --url http://{address}/api/v1/dataset/0 \
-     --header 'Content-Type: application/json' \
-     --header 'Authorization: Bearer {YOUR_ACCESS_TOKEN}'
-     --data-binary '{
-     "dataset_name": "test",
-     "tenant_id": "4fb0cd625f9311efba4a0242ac120006",
-     "embedding_model": "BAAI/bge--zh-v1.5",
-     "chunk_count": 0,
-     "document_count": 0,
-     "parse_method": "general"
+     --url http://{address}/api/v1/dataset/{dataset_id} \
+     --header 'Content-Type: application/json' \
+     --header 'Authorization: Bearer {YOUR_ACCESS_TOKEN}' \
+     --data '{
+      "name": "test",
+      "tenant_id": "4fb0cd625f9311efba4a0242ac120006",
+      "embedding_model": "BAAI/bge-large-zh-v1.5",
+      "chunk_count": 0,
+      "document_count": 0,
+      "parse_method": "naive"
 }'
```

#### Request parameters

+(Refer to "Create dataset" for the complete structure of the request parameters.)

-- `"dataset_name"`: (*Body parameter*)
-  The name of the dataset, which must adhere to the following requirements:
-  - Maximum 65,535 characters.
-- `"tenant_id"`: (*Body parameter*)
-  The ID of the tenant.
-- `"embedding_model"`: (*Body parameter*)
-  Embedding model used in the dataset.
-- `"chunk_count"`: (*Body parameter*)
-  Chunk count of the dataset.
-- `"document_count"`: (*Body parameter*)
-  Document count of the dataset.
-- `"parse_mehtod"`: (*Body parameter*)
-  Parsing method of the dataset.

### Response

The successful response includes a JSON object like the following:

-```shell
+```json
 {
     "code": 0
 }
```

The error response includes a JSON object like the following:

-```shell
+```json
 {
-    "code": 3016,
-    "message": "Can't change embedding model since some files already use it."
+    "code": 102,
+    "message": "Can't change tenant_id."
 }
```

## List datasets

-**GET** `/api/v1/dataset?name={name}&page={page}&page_size={page_size}&orderby={orderby}&desc={desc}`
+**GET** `/api/v1/dataset?page={page}&page_size={page_size}&orderby={orderby}&desc={desc}&name={dataset_name}&id={dataset_id}`

List all datasets

### Request

- Method: GET
-- URL: `/api/v1/dataset?name={name}&page={page}&page_size={page_size}&orderby={orderby}&desc={desc}`
+- URL: `http://{address}/api/v1/dataset?page={page}&page_size={page_size}&orderby={orderby}&desc={desc}&name={dataset_name}&id={dataset_id}`
- Headers:
-  - `content-Type: application/json`
  - 'Authorization: Bearer {YOUR_ACCESS_TOKEN}'

#### Request example

-```shell
+```bash
+# If no page parameter is passed, the default is 1
+# If no page_size parameter is passed, the default is 1024
+# If no orderby parameter is passed, the default is "create_time"
+# If no desc parameter is passed, the default is True
 curl --request GET \
-     --url http://{address}/api/v1/dataset?page=0&page_size=50&orderby=create_time&desc=false \
-     --header 'Content-Type: application/json' \
-     --header 'Authorization: Bearer {YOUR_ACCESS_TOKEN}'
+     --url http://{address}/api/v1/dataset?page={page}&page_size={page_size}&orderby={orderby}&desc={desc}&name={dataset_name}&id={dataset_id} \
+     --header 'Authorization: Bearer {YOUR_ACCESS_TOKEN}'
```

#### Request parameters

@@ -244,54 +318,63 @@ curl --request GET \
 - `page`: (*Path parameter*)
   The current page number to retrieve from the paginated data. This parameter determines which set of records will be fetched.
 - `page_size`: (*Path parameter*)
   The number of records to retrieve per page. This controls how many records will be included in each page.
 - `orderby`: (*Path parameter*)
   The field by which the records should be sorted. This specifies the attribute or column used to order the results.
 - `desc`: (*Path parameter*)
   A boolean flag indicating whether the sorting should be in descending order.
-- `name`: (*Path parameter*)
-  Dataset name
+- `"id"`: (*Path parameter*)
+  The ID of the dataset to be retrieved.
+- `"name"`: (*Path parameter*)
+  The name of the dataset to be retrieved.
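+A concrete filtered call may also help; the values below are illustrative, and `test_1` is a hypothetical dataset name:
+
+```bash
+curl --request GET \
+     --url 'http://{address}/api/v1/dataset?page=1&page_size=10&orderby=create_time&desc=true&name=test_1' \
+     --header 'Authorization: Bearer {YOUR_ACCESS_TOKEN}'
+```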
### Response

The successful response includes a JSON object like the following:

-```shell
+```json
 {
     "code": 0,
     "data": [
         {
-            "avatar": "",
-            "chunk_count": 0,
-            "create_date": "Thu, 29 Aug 2024 03:13:07 GMT",
-            "create_time": 1724901187843,
-            "created_by": "4fb0cd625f9311efba4a0242ac120006",
-            "description": "",
-            "document_count": 0,
-            "embedding_model": "BAAI/bge-large-zh-v1.5",
-            "id": "9d3d906665b411ef87d10242ac120006",
-            "language": "English",
-            "name": "Test",
-            "parser_config": {
-                "chunk_token_count": 128,
-                "delimiter": "\n!?。;!?",
-                "layout_recognize": true,
-                "task_page_size": 12
-            },
-            "parse_method": "naive",
-            "permission": "me",
-            "similarity_threshold": 0.2,
-            "status": "1",
-            "tenant_id": "4fb0cd625f9311efba4a0242ac120006",
-            "token_count": 0,
-            "update_date": "Thu, 29 Aug 2024 03:13:07 GMT",
-            "update_time": 1724901187843,
-            "vector_similarity_weight": 0.3
+            "avatar": "",
+            "chunk_count": 59,
+            "create_date": "Sat, 14 Sep 2024 01:12:37 GMT",
+            "create_time": 1726276357324,
+            "created_by": "69736c5e723611efb51b0242ac120007",
+            "description": null,
+            "document_count": 1,
+            "embedding_model": "BAAI/bge-large-zh-v1.5",
+            "id": "6e211ee0723611efa10a0242ac120007",
+            "language": "English",
+            "name": "mysql",
+            "parse_method": "knowledge_graph",
+            "parser_config": {
+                "chunk_token_num": 8192,
+                "delimiter": "\\n!?;。;!?",
+                "entity_types": [
+                    "organization",
+                    "person",
+                    "location",
+                    "event",
+                    "time"
+                ]
+            },
+            "permission": "me",
+            "similarity_threshold": 0.2,
+            "status": "1",
+            "tenant_id": "69736c5e723611efb51b0242ac120007",
+            "token_num": 12744,
+            "update_date": "Thu, 10 Oct 2024 04:07:23 GMT",
+            "update_time": 1728533243536,
+            "vector_similarity_weight": 0.3
         }
-    ],
+    ]
 }
```

The error response includes a JSON object like the following:

-```shell
+```json
 {
-    "code": 3016,
-    "message": "Can't access database to get the dataset list."
+    "code": 102,
+    "message": "The dataset doesn't exist"
 }
```
diff --git a/api/python_api_reference.md b/api/python_api_reference.md
index c566adce..73723558 100644
--- a/api/python_api_reference.md
+++ b/api/python_api_reference.md
@@ -38,9 +38,9 @@ The unique name of the dataset to create. It must adhere to the following requir
 
 #### avatar: `str`
 
-The url or ???????????????????????? path to the avatar image associated with the created dataset. Defaults to `""`
+Base64 encoding of the avatar. Defaults to `""`.
 
-#### tenant_id: `str` ?????????????????
+#### tenant_id: `str`
 
 The id of the tenant associated with the created dataset is used to identify different users. Defaults to `None`.
 
@@ -55,9 +55,9 @@ The description of the created dataset. Defaults to `""`.
 
-The language setting of the created dataset. Defaults to `"English"`. ????????????
+The language setting of the created dataset. Defaults to `"English"`.
 
-#### embedding_model: `str` ????????????????
+#### embedding_model: `str`
 
-The specific model or algorithm used by the dataset to generate vector embeddings. Defaults to `""`.
+The specific model used by the dataset to generate vector embeddings. Defaults to `""`.
 
 - If creating a dataset, embedding_model must not be provided.
 - If updating a dataset, embedding_model can't be changed.
 
@@ -89,12 +89,10 @@ The method used by the dataset to parse and process data.
 
 The configuration settings for the parser used by the dataset.
 
 ### Returns
-
-- Success: An `infinity.local_infinity.table.LocalTable` object in Python module mode or an `infinity.remote_thrift.table.RemoteTable` object in client-server mode.
-- Failure: `InfinityException`
-  - `error_code`: `int` A non-zero value indicating a specific error condition.
-  - `error_msg`: `str` A message providing additional details about the error.
-
+```python
+DataSet
+description: dataset object
+```

### Examples

```python
from ragflow import RAGFlow

rag = RAGFlow(api_key="xxxxxx", base_url="http://xxx.xx.xx.xxx:9380")
ds = rag.create_dataset(name="kb_1")
```

---

-## Delete knowledge base
+## Delete knowledge bases

```python
-DataSet.delete() -> bool
+RAGFlow.delete_dataset(ids: List[str] = None, names: List[str] = None)
```

+Deletes knowledge bases.

### Parameters

-Deletes a knowledge base.
+#### ids: `List[str]`
+
+The ids of the datasets to be deleted.
+
+#### names: `List[str]`
+
+The names of the datasets to be deleted.
+
+Either `ids` or `names` must be provided.

### Returns

-`bool`
-
-description:the case of updating an dateset, `True` or `False`.
+```python
+no return
+```

### Examples

```python
from ragflow import RAGFlow

rag = RAGFlow(api_key="xxxxxx", base_url="http://xxx.xx.xx.xxx:9380")
-ds = rag.create_dataset(name="kb_1")
-ds.delete()
+rag.delete_dataset(names=["name_1","name_2"])
+rag.delete_dataset(ids=["id_1","id_2"])
```

---

## List knowledge bases

```python
RAGFlow.list_datasets(
    page: int = 1,
    page_size: int = 1024,
    orderby: str = "create_time",
-    desc: bool = True
+    desc: bool = True,
+    id: str = None,
+    name: str = None
) -> List[DataSet]
```

#### orderby: `str`

The field by which the records should be sorted. This specifies the attribute or column used to order the results.

#### desc: `bool`

Whether the sorting should be in descending order. Defaults to `True`.

+#### id: `str`
+
+The id of the dataset to retrieve. Defaults to `None`.
+
+#### name: `str`
+
+The name of the dataset to retrieve. Defaults to `None`.
+
 ### Returns
 
 ```python
@@ -182,57 +199,17 @@ for ds in rag.list_datasets():
 
 ---
 
-## Retrieve knowledge base
-
-```python
-RAGFlow.get_dataset(
-    id: str = None,
-    name: str = None
-) -> DataSet
-```
-
-Retrieves a knowledge base by name.
-
-### Parameters
-
-#### name: `str`
-
-The name of the dataset to be got. If `id` is not provided, `name` is required.
-
-#### id: `str`
-
-The id of the dataset to be got. If `name` is not provided, `id` is required.
-
-### Returns
-
-```python
-DataSet
-description: dataset object
-```
-
-### Examples
-
-```python
-from ragflow import RAGFlow
-
-rag = RAGFlow(api_key="xxxxxx", base_url="http://xxx.xx.xx.xxx:9380")
-ds = rag.get_dataset(name="ragflow")
-print(ds)
-```
-
----
-
-## Save knowledge base configurations
+
+## Update knowledge base

```python
-DataSet.save() -> bool
+DataSet.update(update_message: dict)
```

+Updates configurations of the current knowledge base.
+
### Returns

```python
-bool
-description:the case of updating an dateset, True or False.
+no return
```

### Examples

```python
from ragflow import RAGFlow

rag = RAGFlow(api_key="xxxxxx", base_url="http://xxx.xx.xx.xxx:9380")
-ds = rag.get_dataset(name="kb_1")
-ds.parse_method = "manual"
-ds.save()
+ds = rag.list_datasets(name="kb_1")[0]
+ds.update({"parse_method": "manual", ...})
```

---
diff --git a/api/utils/api_utils.py b/api/utils/api_utils.py
index 0988ff9f..0a91ce04 100644
--- a/api/utils/api_utils.py
+++ b/api/utils/api_utils.py
@@ -268,3 +268,32 @@ def token_required(func):
         return func(*args, **kwargs)
 
     return decorated_function
+
+def get_result(retcode=RetCode.SUCCESS, retmsg='error', data=None):
+    if retcode == 0:
+        if data is not None:
+            response = {"code": retcode, "data": data}
+        else:
+            response = {"code": retcode}
+    else:
+        response = {"code": retcode, "message": retmsg}
+    return jsonify(response)
+
+def get_error_data_result(retcode=RetCode.DATA_ERROR,
+                          retmsg='Sorry! Data missing!'):
+    import re
+    result_dict = {
+        "code": retcode,
+        "message": re.sub(
+            r"rag",
+            "seceum",
+            retmsg,
+            flags=re.IGNORECASE)}
+    response = {}
+    for key, value in result_dict.items():
+        if value is None and key != "code":
+            continue
+        response[key] = value
+    return jsonify(response)
+
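Reviewer note: a minimal sketch of the JSON envelopes these two helpers produce, assuming the module imports cleanly outside the server and that `RetCode.DATA_ERROR` is `102` (the code shown in the doc examples above):

```python
# Illustration only, not part of the patch. jsonify() needs an application
# context, so a throwaway Flask app supplies one here.
from flask import Flask

from api.utils.api_utils import get_result, get_error_data_result

app = Flask(__name__)

with app.test_request_context():
    ok = get_result(data={"id": "abc"})
    err = get_error_data_result(retmsg="You don't own the dataset.")
    print(ok.get_json())   # {'code': 0, 'data': {'id': 'abc'}}
    print(err.get_json())  # {'code': 102, 'message': "You don't own the dataset."}
```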
diff --git a/sdk/python/ragflow/modules/base.py b/sdk/python/ragflow/modules/base.py
index 09793777..522eeb12 100644
--- a/sdk/python/ragflow/modules/base.py
+++ b/sdk/python/ragflow/modules/base.py
@@ -30,5 +30,9 @@ class Base(object):
         res = self.rag.delete(path, params)
         return res
 
+    def put(self, path, json):
+        res = self.rag.put(path, json)
+        return res
+
     def __str__(self):
         return str(self.to_json())
diff --git a/sdk/python/ragflow/modules/dataset.py b/sdk/python/ragflow/modules/dataset.py
index 4efd3b95..caa4644d 100644
--- a/sdk/python/ragflow/modules/dataset.py
+++ b/sdk/python/ragflow/modules/dataset.py
@@ -32,24 +32,13 @@ class DataSet(Base):
                 res_dict.pop(k)
         super().__init__(rag, res_dict)
 
-    def save(self) -> bool:
-        res = self.post('/dataset/save',
-                        {"id": self.id, "name": self.name, "avatar": self.avatar, "tenant_id": self.tenant_id,
-                         "description": self.description, "language": self.language, "embedding_model": self.embedding_model,
-                         "permission": self.permission,
-                         "document_count": self.document_count, "chunk_count": self.chunk_count, "parse_method": self.parse_method,
-                         "parser_config": self.parser_config.to_json()
-                         })
+    def update(self, update_message: dict):
+        res = self.put(f'/dataset/{self.id}',
+                       update_message)
         res = res.json()
-        if res.get("retmsg") == "success": return True
-        raise Exception(res["retmsg"])
+        if res.get("code") != 0:
+            raise Exception(res["message"])
 
-    def delete(self) -> bool:
-        res = self.rm('/dataset/delete',
-                      {"id": self.id})
-        res = res.json()
-        if res.get("retmsg") == "success": return True
-        raise Exception(res["retmsg"])
 
     def list_docs(self, keywords: Optional[str] = None, offset: int = 0, limit: int = -1) -> List[Document]:
diff --git a/sdk/python/ragflow/ragflow.py b/sdk/python/ragflow/ragflow.py
index f30433d9..9c0d7327 100644
--- a/sdk/python/ragflow/ragflow.py
+++ b/sdk/python/ragflow/ragflow.py
@@ -18,9 +18,9 @@ from typing import List
 import requests
 
 from .modules.assistant import Assistant
+from .modules.chunk import Chunk
 from .modules.dataset import DataSet
 from .modules.document import Document
-from .modules.chunk import Chunk
 
 
 class RAGFlow:
@@ -41,7 +41,11 @@ class RAGFlow:
         return res
 
     def delete(self, path, params):
-        res = requests.delete(url=self.api_url + path, params=params, headers=self.authorization_header)
+        res = requests.delete(url=self.api_url + path, json=params, headers=self.authorization_header)
+        return res
+
+    def put(self, path, json):
+        res = requests.put(url=self.api_url + path, json=json, headers=self.authorization_header)
         return res
 
     def create_dataset(self, name: str, avatar: str = "", description: str = "", language: str = "English",
@@ -52,7 +56,7 @@
             parser_config = DataSet.ParserConfig(self, {"chunk_token_count": 128, "layout_recognize": True,
                                                         "delimiter": "\n!?。;!?", "task_page_size": 12})
         parser_config = parser_config.to_json()
-        res = self.post("/dataset/save",
+        res = self.post("/dataset",
                         {"name": name, "avatar": avatar, "description": description, "language": language,
                          "permission": permission, "document_count": document_count,
                          "chunk_count": chunk_count, "parse_method": parse_method,
                          "parser_config": parser_config
                          }
                         )
         res = res.json()
-        if res.get("retmsg") == "success":
+        if res.get("code") == 0:
             return DataSet(self, res["data"])
-        raise Exception(res["retmsg"])
+        raise Exception(res["message"])
 
-    def list_datasets(self, page: int = 1, page_size: int = 1024, orderby: str = "create_time", desc: bool = True) -> \
+    def delete_dataset(self, ids: List[str] = None, names: List[str] = None):
+        res = self.delete("/dataset", {"ids": ids, "names": names})
+        res = res.json()
+        if res.get("code") != 0:
+            raise Exception(res["message"])
+
+    def list_datasets(self, page: int = 1, page_size: int = 1024, orderby: str = "create_time", desc: bool = True,
+                      id: str = None, name: str = None) -> \
             List[DataSet]:
-        res = self.get("/dataset/list", {"page": page, "page_size": page_size, "orderby": orderby, "desc": desc})
+        res = self.get("/dataset",
+                       {"page": page, "page_size": page_size, "orderby": orderby, "desc": desc, "id": id, "name": name})
         res = res.json()
         result_list = []
-        if res.get("retmsg") == "success":
+        if res.get("code") == 0:
             for data in res['data']:
                 result_list.append(DataSet(self, data))
             return result_list
-        raise Exception(res["retmsg"])
-
-    def get_dataset(self, id: str = None, name: str = None) -> DataSet:
-        res = self.get("/dataset/detail", {"id": id, "name": name})
-        res = res.json()
-        if res.get("retmsg") == "success":
-            return DataSet(self, res['data'])
-        raise Exception(res["retmsg"])
+        raise Exception(res["message"])
 
     def create_assistant(self, name: str = "assistant", avatar: str = "path", knowledgebases: List[DataSet] = [],
                          llm: Assistant.LLM = None, prompt: Assistant.Prompt = None) -> Assistant:
@@ -272,4 +277,3 @@
         except Exception as e:
             print(f"An error occurred during retrieval: {e}")
             raise
-
diff --git a/sdk/python/test/common.py b/sdk/python/test/common.py
index c92e34de..47e57903 100644
--- a/sdk/python/test/common.py
+++ b/sdk/python/test/common.py
@@ -1,4 +1,4 @@
-API_KEY = 'ragflow-k0YzUxMGY4NjY5YTExZWY5MjI5MDI0Mm'
+API_KEY = 'ragflow-NiYmZjNTVjODYwNzExZWZiODEwMDI0Mm'
 
 HOST_ADDRESS = 'http://127.0.0.1:9380'
\ No newline at end of file
diff --git a/sdk/python/test/t_dataset.py b/sdk/python/test/t_dataset.py
index eddae95a..c995613f 100644
--- a/sdk/python/test/t_dataset.py
+++ b/sdk/python/test/t_dataset.py
@@ -24,9 +24,8 @@ class TestDataset(TestSdk):
         ds = rag.create_dataset("ABC")
         if isinstance(ds, DataSet):
             assert ds.name == "ABC", "Name does not match."
-            ds.name = 'DEF'
-            res = ds.save()
-            assert res is True, f"Failed to update dataset, error: {res}"
+            res = ds.update({"name": "DEF"})
+            assert res is None, f"Failed to update dataset, error: {res}"
         else:
             assert False, f"Failed to create dataset, error: {ds}"
 
@@ -38,8 +37,8 @@ class TestDataset(TestSdk):
         ds = rag.create_dataset("MA")
         if isinstance(ds, DataSet):
             assert ds.name == "MA", "Name does not match."
-            res = ds.delete()
-            assert res is True, f"Failed to delete dataset, error: {res}"
+            res = rag.delete_dataset(names=["MA"])
+            assert res is None, f"Failed to delete dataset, error: {res}"
         else:
             assert False, f"Failed to create dataset, error: {ds}"
 
@@ -52,12 +51,3 @@ class TestDataset(TestSdk):
         assert len(list_datasets) > 0, "Do not exist any dataset"
         for ds in list_datasets:
             assert isinstance(ds, DataSet), "Existence type is not dataset."
-
-    def test_get_detail_dataset_with_success(self):
-        """
-        Test getting a dataset's detail with success
-        """
-        rag = RAGFlow(API_KEY, HOST_ADDRESS)
-        ds = rag.get_dataset(name="God")
-        assert isinstance(ds, DataSet), f"Failed to get dataset, error: {ds}."
-        assert ds.name == "God", "Name does not match"
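Reviewer note: taken together, the SDK surface after this patch reads roughly like the end-to-end sketch below. It assumes a running RAGFlow server at the given address and a valid API key; all values are placeholders:

```python
# Minimal sketch of the revised dataset API, assuming server and key exist.
from ragflow import RAGFlow

rag = RAGFlow(api_key="<YOUR_API_KEY>", base_url="http://127.0.0.1:9380")

ds = rag.create_dataset(name="demo")    # POST   /api/v1/dataset
ds.update({"parse_method": "manual"})   # PUT    /api/v1/dataset/{id}  (chunk_count is still 0)
found = rag.list_datasets(id=ds.id)     # GET    /api/v1/dataset?id=...
rag.delete_dataset(ids=[ds.id])         # DELETE /api/v1/dataset
```

Note the design shift the sketch reflects: mutation moved off the model object (`save`/`delete`) onto explicit REST-style calls, with errors raised from the `code`/`message` envelope instead of `retmsg` string matching.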