Refactor Dataset API (#2783)

### What problem does this PR solve?

Refactor Dataset API

### Type of change

- [x] Refactoring
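
For reviewers, the net effect on the Python SDK surface looks roughly like this (a sketch assembled from the docs and test changes below; the API key and base URL are placeholders):

```python
from ragflow import RAGFlow

# Placeholders; use your own key and server address.
rag = RAGFlow(api_key="<YOUR_API_KEY>", base_url="http://127.0.0.1:9380")

ds = rag.create_dataset(name="kb_1")        # POST   /api/v1/dataset
ds.update({"name": "kb_2"})                 # PUT    /api/v1/dataset/{dataset_id}
for ds in rag.list_datasets(name="kb_2"):   # GET    /api/v1/dataset
    print(ds.name)
rag.delete_dataset(names=["kb_2"])          # DELETE /api/v1/dataset
```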

---------

Co-authored-by: liuhua <10215101452@stu.ecun.edu.cn>
Authored by liuhua on 2024-10-11 09:55:27 +08:00; committed by GitHub.
parent a2f9c03a95 · commit cbd7cd7c4d
11 changed files with 449 additions and 393 deletions

@@ -83,7 +83,7 @@ def register_page(page_path):
     sys.modules[module_name] = page
     spec.loader.exec_module(page)
     page_name = getattr(page, 'page_name', page_name)
-    url_prefix = f'/api/{API_VERSION}/{page_name}' if "/sdk/" in path else f'/{API_VERSION}/{page_name}'
+    url_prefix = f'/api/{API_VERSION}' if "/sdk/" in path else f'/{API_VERSION}/{page_name}'
     app.register_blueprint(page.manager, url_prefix=url_prefix)
     return url_prefix
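
The one-line change above mounts SDK blueprints at the bare `/api/{API_VERSION}` prefix, so a route declared as `/dataset` inside an SDK page resolves to `/api/v1/dataset`. A minimal sketch of the resulting behavior (assuming `API_VERSION` is `"v1"`):

```python
API_VERSION = "v1"  # assumed value for illustration

def url_prefix_for(path: str, page_name: str) -> str:
    # Mirrors the changed line: SDK pages drop the page name from the prefix.
    return f'/api/{API_VERSION}' if "/sdk/" in path else f'/{API_VERSION}/{page_name}'

# An SDK page's '/dataset' route now lives under '/api/v1', i.e. /api/v1/dataset.
assert url_prefix_for("api/apps/sdk/dataset.py", "dataset") == "/api/v1"
# Non-SDK pages keep their page-scoped prefix.
assert url_prefix_for("api/apps/kb_app.py", "kb") == "/v1/kb"
```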

@@ -25,143 +25,146 @@ from api.db.services.knowledgebase_service import KnowledgebaseService
 from api.db.services.user_service import TenantService
 from api.settings import RetCode
 from api.utils import get_uuid
-from api.utils.api_utils import get_json_result, token_required, get_data_error_result
+from api.utils.api_utils import get_result, token_required, get_error_data_result


-@manager.route('/save', methods=['POST'])
+@manager.route('/dataset', methods=['POST'])
 @token_required
-def save(tenant_id):
+def create(tenant_id):
     req = request.json
     e, t = TenantService.get_by_id(tenant_id)
-    if "id" not in req:
-        if "tenant_id" in req or "embedding_model" in req:
-            return get_data_error_result(
-                retmsg="Tenant_id or embedding_model must not be provided")
-        if "name" not in req:
-            return get_data_error_result(
-                retmsg="Name is not empty!")
-        req['id'] = get_uuid()
-        req["name"] = req["name"].strip()
-        if req["name"] == "":
-            return get_data_error_result(
-                retmsg="Name is not empty string!")
-        if KnowledgebaseService.query(name=req["name"], tenant_id=tenant_id, status=StatusEnum.VALID.value):
-            return get_data_error_result(
-                retmsg="Duplicated knowledgebase name in creating dataset.")
-        req["tenant_id"] = req['created_by'] = tenant_id
-        req['embedding_model'] = t.embd_id
-        key_mapping = {
-            "chunk_num": "chunk_count",
-            "doc_num": "document_count",
-            "parser_id": "parse_method",
-            "embd_id": "embedding_model"
-        }
-        mapped_keys = {new_key: req[old_key] for new_key, old_key in key_mapping.items() if old_key in req}
-        req.update(mapped_keys)
-        if not KnowledgebaseService.save(**req):
-            return get_data_error_result(retmsg="Create dataset error.(Database error)")
-        renamed_data = {}
-        e, k = KnowledgebaseService.get_by_id(req["id"])
-        for key, value in k.to_dict().items():
-            new_key = key_mapping.get(key, key)
-            renamed_data[new_key] = value
-        return get_json_result(data=renamed_data)
-    else:
-        invalid_keys = {"embd_id", "chunk_num", "doc_num", "parser_id"}
-        if any(key in req for key in invalid_keys):
-            return get_data_error_result(retmsg="The input parameters are invalid.")
-        if "tenant_id" in req:
-            if req["tenant_id"] != tenant_id:
-                return get_data_error_result(
-                    retmsg="Can't change tenant_id.")
-        if "embedding_model" in req:
-            if req["embedding_model"] != t.embd_id:
-                return get_data_error_result(
-                    retmsg="Can't change embedding_model.")
-            req.pop("embedding_model")
-        if not KnowledgebaseService.query(
-                created_by=tenant_id, id=req["id"]):
-            return get_json_result(
-                data=False, retmsg='You do not own the dataset.',
-                retcode=RetCode.OPERATING_ERROR)
-        if not req["id"]:
-            return get_data_error_result(
-                retmsg="id can not be empty.")
-        e, kb = KnowledgebaseService.get_by_id(req["id"])
-        if "chunk_count" in req:
-            if req["chunk_count"] != kb.chunk_num:
-                return get_data_error_result(
-                    retmsg="Can't change chunk_count.")
-            req.pop("chunk_count")
-        if "document_count" in req:
-            if req['document_count'] != kb.doc_num:
-                return get_data_error_result(
-                    retmsg="Can't change document_count.")
-            req.pop("document_count")
-        if "parse_method" in req:
-            if kb.chunk_num != 0 and req['parse_method'] != kb.parser_id:
-                return get_data_error_result(
-                    retmsg="If chunk count is not 0, parse method is not changeable.")
-            req['parser_id'] = req.pop('parse_method')
-        if "name" in req:
-            req["name"] = req["name"].strip()
-            if req["name"].lower() != kb.name.lower() \
-                    and len(KnowledgebaseService.query(name=req["name"], tenant_id=tenant_id,
-                                                       status=StatusEnum.VALID.value)) > 0:
-                return get_data_error_result(
-                    retmsg="Duplicated knowledgebase name in updating dataset.")
-        del req["id"]
-        if not KnowledgebaseService.update_by_id(kb.id, req):
-            return get_data_error_result(retmsg="Update dataset error.(Database error)")
-        return get_json_result(data=True)
+    if "tenant_id" in req or "embedding_model" in req:
+        return get_error_data_result(
+            retmsg="Tenant_id or embedding_model must not be provided")
+    chunk_count = req.get("chunk_count")
+    document_count = req.get("document_count")
+    if chunk_count or document_count:
+        return get_error_data_result(retmsg="chunk_count or document_count must be 0 or not be provided")
+    if "name" not in req:
+        return get_error_data_result(
+            retmsg="Name is not empty!")
+    req['id'] = get_uuid()
+    req["name"] = req["name"].strip()
+    if req["name"] == "":
+        return get_error_data_result(
+            retmsg="Name is not empty string!")
+    if KnowledgebaseService.query(name=req["name"], tenant_id=tenant_id, status=StatusEnum.VALID.value):
+        return get_error_data_result(
+            retmsg="Duplicated knowledgebase name in creating dataset.")
+    req["tenant_id"] = req['created_by'] = tenant_id
+    req['embedding_model'] = t.embd_id
+    key_mapping = {
+        "chunk_num": "chunk_count",
+        "doc_num": "document_count",
+        "parser_id": "parse_method",
+        "embd_id": "embedding_model"
+    }
+    mapped_keys = {new_key: req[old_key] for new_key, old_key in key_mapping.items() if old_key in req}
+    req.update(mapped_keys)
+    if not KnowledgebaseService.save(**req):
+        return get_error_data_result(retmsg="Create dataset error.(Database error)")
+    renamed_data = {}
+    e, k = KnowledgebaseService.get_by_id(req["id"])
+    for key, value in k.to_dict().items():
+        new_key = key_mapping.get(key, key)
+        renamed_data[new_key] = value
+    return get_result(data=renamed_data)


-@manager.route('/delete', methods=['DELETE'])
+@manager.route('/dataset', methods=['DELETE'])
 @token_required
 def delete(tenant_id):
-    req = request.args
-    if "id" not in req:
-        return get_data_error_result(
-            retmsg="id is required")
-    kbs = KnowledgebaseService.query(
-        created_by=tenant_id, id=req["id"])
-    if not kbs:
-        return get_json_result(
-            data=False, retmsg='You do not own the dataset',
-            retcode=RetCode.OPERATING_ERROR)
-    for doc in DocumentService.query(kb_id=req["id"]):
-        if not DocumentService.remove_document(doc, kbs[0].tenant_id):
-            return get_data_error_result(
-                retmsg="Remove document error.(Database error)")
-        f2d = File2DocumentService.get_by_document_id(doc.id)
-        FileService.filter_delete([File.source_type == FileSource.KNOWLEDGEBASE, File.id == f2d[0].file_id])
-        File2DocumentService.delete_by_document_id(doc.id)
-    if not KnowledgebaseService.delete_by_id(req["id"]):
-        return get_data_error_result(
-            retmsg="Delete dataset error.(Database error)")
-    return get_json_result(data=True)
+    req = request.json
+    names = req.get("names")
+    ids = req.get("ids")
+    if not ids and not names:
+        return get_error_data_result(
+            retmsg="ids or names is required")
+    id_list = []
+    if names:
+        for name in names:
+            kbs = KnowledgebaseService.query(name=name, tenant_id=tenant_id)
+            if not kbs:
+                return get_error_data_result(retmsg=f"You don't own the dataset {name}")
+            id_list.append(kbs[0].id)
+    if ids:
+        for id in ids:
+            kbs = KnowledgebaseService.query(id=id, tenant_id=tenant_id)
+            if not kbs:
+                return get_error_data_result(retmsg=f"You don't own the dataset {id}")
+        id_list.extend(ids)
+    for id in id_list:
+        for doc in DocumentService.query(kb_id=id):
+            if not DocumentService.remove_document(doc, tenant_id):
+                return get_error_data_result(
+                    retmsg="Remove document error.(Database error)")
+            f2d = File2DocumentService.get_by_document_id(doc.id)
+            FileService.filter_delete([File.source_type == FileSource.KNOWLEDGEBASE, File.id == f2d[0].file_id])
+            File2DocumentService.delete_by_document_id(doc.id)
+        if not KnowledgebaseService.delete_by_id(id):
+            return get_error_data_result(
+                retmsg="Delete dataset error.(Database error)")
+    return get_result(retcode=RetCode.SUCCESS)


-@manager.route('/list', methods=['GET'])
+@manager.route('/dataset/<dataset_id>', methods=['PUT'])
 @token_required
-def list_datasets(tenant_id):
+def update(tenant_id, dataset_id):
+    if not KnowledgebaseService.query(id=dataset_id, tenant_id=tenant_id):
+        return get_error_data_result(retmsg="You don't own the dataset")
+    req = request.json
+    e, t = TenantService.get_by_id(tenant_id)
+    invalid_keys = {"id", "embd_id", "chunk_num", "doc_num", "parser_id"}
+    if any(key in req for key in invalid_keys):
+        return get_error_data_result(retmsg="The input parameters are invalid.")
+    if "tenant_id" in req:
+        if req["tenant_id"] != tenant_id:
+            return get_error_data_result(
+                retmsg="Can't change tenant_id.")
+    if "embedding_model" in req:
+        if req["embedding_model"] != t.embd_id:
+            return get_error_data_result(
+                retmsg="Can't change embedding_model.")
+        req.pop("embedding_model")
+    e, kb = KnowledgebaseService.get_by_id(dataset_id)
+    if "chunk_count" in req:
+        if req["chunk_count"] != kb.chunk_num:
+            return get_error_data_result(
+                retmsg="Can't change chunk_count.")
+        req.pop("chunk_count")
+    if "document_count" in req:
+        if req['document_count'] != kb.doc_num:
+            return get_error_data_result(
+                retmsg="Can't change document_count.")
+        req.pop("document_count")
+    if "parse_method" in req:
+        if kb.chunk_num != 0 and req['parse_method'] != kb.parser_id:
+            return get_error_data_result(
+                retmsg="If chunk count is not 0, parse method is not changeable.")
+        req['parser_id'] = req.pop('parse_method')
+    if "name" in req:
+        req["name"] = req["name"].strip()
+        if req["name"].lower() != kb.name.lower() \
+                and len(KnowledgebaseService.query(name=req["name"], tenant_id=tenant_id,
+                                                   status=StatusEnum.VALID.value)) > 0:
+            return get_error_data_result(
+                retmsg="Duplicated knowledgebase name in updating dataset.")
+    if not KnowledgebaseService.update_by_id(kb.id, req):
+        return get_error_data_result(retmsg="Update dataset error.(Database error)")
+    return get_result(retcode=RetCode.SUCCESS)


+@manager.route('/dataset', methods=['GET'])
+@token_required
+def list(tenant_id):
+    id = request.args.get("id")
+    name = request.args.get("name")
+    kbs = KnowledgebaseService.query(id=id, name=name, status=1)
+    if not kbs:
+        return get_error_data_result(retmsg="The dataset doesn't exist")
     page_number = int(request.args.get("page", 1))
     items_per_page = int(request.args.get("page_size", 1024))
     orderby = request.args.get("orderby", "create_time")
     desc = bool(request.args.get("desc", True))
     tenants = TenantService.get_joined_tenants_by_user_id(tenant_id)
-    kbs = KnowledgebaseService.get_by_tenant_ids(
-        [m["tenant_id"] for m in tenants], tenant_id, page_number, items_per_page, orderby, desc)
+    kbs = KnowledgebaseService.get_list(
+        [m["tenant_id"] for m in tenants], tenant_id, page_number, items_per_page, orderby, desc, id, name)
     renamed_list = []
     for kb in kbs:
         key_mapping = {
@@ -175,50 +178,4 @@ def list_datasets(tenant_id):
             new_key = key_mapping.get(key, key)
             renamed_data[new_key] = value
         renamed_list.append(renamed_data)
-    return get_json_result(data=renamed_list)
+    return get_result(data=renamed_list)
-
-
-@manager.route('/detail', methods=['GET'])
-@token_required
-def detail(tenant_id):
-    req = request.args
-    key_mapping = {
-        "chunk_num": "chunk_count",
-        "doc_num": "document_count",
-        "parser_id": "parse_method",
-        "embd_id": "embedding_model"
-    }
-    renamed_data = {}
-    if "id" in req:
-        id = req["id"]
-        kb = KnowledgebaseService.query(created_by=tenant_id, id=req["id"])
-        if not kb:
-            return get_json_result(
-                data=False, retmsg='You do not own the dataset.',
-                retcode=RetCode.OPERATING_ERROR)
-        if "name" in req:
-            name = req["name"]
-            if kb[0].name != name:
-                return get_json_result(
-                    data=False, retmsg='You do not own the dataset.',
-                    retcode=RetCode.OPERATING_ERROR)
-        e, k = KnowledgebaseService.get_by_id(id)
-        for key, value in k.to_dict().items():
-            new_key = key_mapping.get(key, key)
-            renamed_data[new_key] = value
-        return get_json_result(data=renamed_data)
-    else:
-        if "name" in req:
-            name = req["name"]
-            e, k = KnowledgebaseService.get_by_name(kb_name=name, tenant_id=tenant_id)
-            if not e:
-                return get_json_result(
-                    data=False, retmsg='You do not own the dataset.',
-                    retcode=RetCode.OPERATING_ERROR)
-            for key, value in k.to_dict().items():
-                new_key = key_mapping.get(key, key)
-                renamed_data[new_key] = value
-            return get_json_result(data=renamed_data)
-        else:
-            return get_data_error_result(
-                retmsg="At least one of `id` or `name` must be provided.")

@@ -142,3 +142,27 @@ class KnowledgebaseService(CommonService):
     @DB.connection_context()
     def get_all_ids(cls):
         return [m["id"] for m in cls.model.select(cls.model.id).dicts()]
+
+    @classmethod
+    @DB.connection_context()
+    def get_list(cls, joined_tenant_ids, user_id,
+                 page_number, items_per_page, orderby, desc, id, name):
+        kbs = cls.model.select()
+        if id:
+            kbs = kbs.where(cls.model.id == id)
+        if name:
+            kbs = kbs.where(cls.model.name == name)
+        kbs = kbs.where(
+            ((cls.model.tenant_id.in_(joined_tenant_ids) & (cls.model.permission ==
+                                                            TenantPermission.TEAM.value)) | (
+                     cls.model.tenant_id == user_id))
+            & (cls.model.status == StatusEnum.VALID.value)
+        )
+        if desc:
+            kbs = kbs.order_by(cls.model.getter_by(orderby).desc())
+        else:
+            kbs = kbs.order_by(cls.model.getter_by(orderby).asc())
+        kbs = kbs.paginate(page_number, items_per_page)
+        return list(kbs.dicts())

@@ -5,63 +5,134 @@
 **POST** `/api/v1/dataset`

-Creates a dataset with a name. If dataset of the same name already exists, the new dataset will be renamed by RAGFlow automatically.
+Creates a dataset.

 ### Request

 - Method: POST
-- URL: `/api/v1/dataset`
+- URL: `http://{address}/api/v1/dataset`
 - Headers:
   - `content-Type: application/json`
   - 'Authorization: Bearer {YOUR_ACCESS_TOKEN}'
 - Body:
-  - `"dataset_name"`: `string`
+  - `"id"`: `string`
+  - `"name"`: `string`
+  - `"avatar"`: `string`
   - `"tenant_id"`: `string`
+  - `"description"`: `string`
+  - `"language"`: `string`
   - `"embedding_model"`: `string`
-  - `"chunk_count"`: `integer`
+  - `"permission"`: `string`
   - `"document_count"`: `integer`
+  - `"chunk_count"`: `integer`
   - `"parse_method"`: `string`
+  - `"parser_config"`: `Dataset.ParserConfig`

 #### Request example

-```shell
+```bash
+# "id": id must not be provided.
+# "name": name is required and can't be duplicated.
+# "tenant_id": tenant_id must not be provided.
+# "embedding_model": embedding_model must not be provided.
+# "naive" means general.
 curl --request POST \
   --url http://{address}/api/v1/dataset \
   --header 'Content-Type: application/json' \
   --header 'Authorization: Bearer {YOUR_ACCESS_TOKEN}' \
-  --data-binary '{
-  "dataset_name": "test",
-  "tenant_id": "4fb0cd625f9311efba4a0242ac120006",
-  "embedding_model": "BAAI/bge--zh-v1.5",
-  "chunk_count": 0,
-  "document_count": 0,
-  "parse_method": "general"
+  --data '{
+  "name": "test",
+  "chunk_count": 0,
+  "document_count": 0,
+  "parse_method": "naive"
 }'
 ```

 #### Request parameters

-- `"dataset_name"`: (*Body parameter*)
+- `"id"`: (*Body parameter*)
+  The ID of the created dataset used to uniquely identify different datasets.
+  - If creating a dataset, `id` must not be provided.
+
+- `"name"`: (*Body parameter*)
   The name of the dataset, which must adhere to the following requirements:
-  - Maximum 65,535 characters.
+  - Required when creating a dataset and must be unique.
+  - If updating a dataset, `name` must still be unique.
+
+- `"avatar"`: (*Body parameter*)
+  Base64 encoding of the avatar.

 - `"tenant_id"`: (*Body parameter*)
-  The ID of the tenant.
+  The ID of the tenant associated with the dataset, used to link it with specific users.
+  - If creating a dataset, `tenant_id` must not be provided.
+  - If updating a dataset, `tenant_id` cannot be changed.
+
+- `"description"`: (*Body parameter*)
+  The description of the dataset.
+
+- `"language"`: (*Body parameter*)
+  The language setting for the dataset.

 - `"embedding_model"`: (*Body parameter*)
-  Embedding model used in the dataset.
+  Embedding model used in the dataset to generate vector embeddings.
+  - If creating a dataset, `embedding_model` must not be provided.
+  - If updating a dataset, `embedding_model` cannot be changed.

-- `"chunk_count"`: (*Body parameter*)
-  Chunk count of the dataset.
+- `"permission"`: (*Body parameter*)
+  Specifies who can manipulate the dataset.

 - `"document_count"`: (*Body parameter*)
   Document count of the dataset.
+  - If updating a dataset, `document_count` cannot be changed.
+
+- `"chunk_count"`: (*Body parameter*)
+  Chunk count of the dataset.
+  - If updating a dataset, `chunk_count` cannot be changed.

-- `"parse_mehtod"`: (*Body parameter*)
+- `"parse_method"`: (*Body parameter*)
   Parsing method of the dataset.
+  - If updating `parse_method`, `chunk_count` must be 0.
+
+- `"parser_config"`: (*Body parameter*)
+  The configuration settings for the dataset parser.

 ### Response

 The successful response includes a JSON object like the following:

-```shell
+```json
 {
-  "code": 0
+  "code": 0,
+  "data": {
+    "avatar": null,
+    "chunk_count": 0,
+    "create_date": "Thu, 10 Oct 2024 05:57:37 GMT",
+    "create_time": 1728539857641,
+    "created_by": "69736c5e723611efb51b0242ac120007",
+    "description": null,
+    "document_count": 0,
+    "embedding_model": "BAAI/bge-large-zh-v1.5",
+    "id": "8d73076886cc11ef8c270242ac120006",
+    "language": "English",
+    "name": "test_1",
+    "parse_method": "naive",
+    "parser_config": {
+      "pages": [
+        [
+          1,
+          1000000
+        ]
+      ]
+    },
+    "permission": "me",
+    "similarity_threshold": 0.2,
+    "status": "1",
+    "tenant_id": "69736c5e723611efb51b0242ac120007",
+    "token_num": 0,
+    "update_date": "Thu, 10 Oct 2024 05:57:37 GMT",
+    "update_time": 1728539857641,
+    "vector_similarity_weight": 0.3
+  }
 }
 ```

@@ -71,10 +142,10 @@ The successful response includes a JSON object like the following:
 The error response includes a JSON object like the following:

-```shell
+```json
 {
-  "code": 3016,
-  "message": "Can't connect database"
+  "code": 102,
+  "message": "Duplicated knowledgebase name in creating dataset."
 }
 ```

@@ -82,27 +153,31 @@ The error response includes a JSON object like the following:
 **DELETE** `/api/v1/dataset`

-Deletes a dataset by its id or name.
+Deletes datasets by ids or names.

 ### Request

 - Method: DELETE
-- URL: `/api/v1/dataset/{dataset_id}`
+- URL: `http://{address}/api/v1/dataset`
 - Headers:
   - `content-Type: application/json`
   - 'Authorization: Bearer {YOUR_ACCESS_TOKEN}'
+- Body:
+  - `"names"`: `List[string]`
+  - `"ids"`: `List[string]`

 #### Request example

-```shell
+```bash
+# Either ids or names must be provided.
 curl --request DELETE \
-  --url http://{address}/api/v1/dataset/0 \
+  --url http://{address}/api/v1/dataset \
   --header 'Content-Type: application/json' \
-  --header 'Authorization: Bearer {YOUR_ACCESS_TOKEN}'
-  --data ' {
-  "names": ["ds1", "ds2"]
+  --header 'Authorization: Bearer {YOUR_ACCESS_TOKEN}' \
+  --data '{
+  "names": ["test_1", "test_2"]
 }'
 ```

 #### Request parameters

@@ -118,7 +193,7 @@ curl --request DELETE \
 The successful response includes a JSON object like the following:

-```shell
+```json
 {
   "code": 0
 }

@@ -130,10 +205,10 @@ The successful response includes a JSON object like the following:
 The error response includes a JSON object like the following:

-```shell
+```json
 {
-  "code": 3016,
-  "message": "Try to delete non-existent dataset."
+  "code": 102,
+  "message": "You don't own the dataset."
 }
 ```

@@ -146,50 +221,47 @@ Updates a dataset by its id.
 ### Request

 - Method: PUT
-- URL: `/api/v1/dataset/{dataset_id}`
+- URL: `http://{address}/api/v1/dataset/{dataset_id}`
 - Headers:
   - `content-Type: application/json`
   - 'Authorization: Bearer {YOUR_ACCESS_TOKEN}'
+- Body: (Refer to the "Create Dataset" for the complete structure of the request body.)

 #### Request example

-```shell
+```bash
+# "id": id is required.
+# "name": If you update name, it can't be duplicated.
+# "tenant_id": If you update tenant_id, it can't be changed.
+# "embedding_model": If you update embedding_model, it can't be changed.
+# "chunk_count": If you update chunk_count, it can't be changed.
+# "document_count": If you update document_count, it can't be changed.
+# "parse_method": If you update parse_method, chunk_count must be 0.
+# "naive" means general.
 curl --request PUT \
-  --url http://{address}/api/v1/dataset/0 \
+  --url http://{address}/api/v1/dataset/{dataset_id} \
   --header 'Content-Type: application/json' \
-  --header 'Authorization: Bearer {YOUR_ACCESS_TOKEN}'
-  --data-binary '{
-  "dataset_name": "test",
+  --header 'Authorization: Bearer {YOUR_ACCESS_TOKEN}' \
+  --data '{
+  "name": "test",
   "tenant_id": "4fb0cd625f9311efba4a0242ac120006",
-  "embedding_model": "BAAI/bge--zh-v1.5",
+  "embedding_model": "BAAI/bge-zh-v1.5",
   "chunk_count": 0,
   "document_count": 0,
-  "parse_method": "general"
+  "parse_method": "naive"
 }'
 ```

 #### Request parameters
+(Refer to the "Create Dataset" for the complete structure of the request parameters.)

-- `"dataset_name"`: (*Body parameter*)
-  The name of the dataset, which must adhere to the following requirements:
-  - Maximum 65,535 characters.
-
-- `"tenant_id"`: (*Body parameter*)
-  The ID of the tenant.
-
-- `"embedding_model"`: (*Body parameter*)
-  Embedding model used in the dataset.
-
-- `"chunk_count"`: (*Body parameter*)
-  Chunk count of the dataset.
-
-- `"document_count"`: (*Body parameter*)
-  Document count of the dataset.
-
-- `"parse_mehtod"`: (*Body parameter*)
-  Parsing method of the dataset.

 ### Response

 The successful response includes a JSON object like the following:

-```shell
+```json
 {
   "code": 0
 }

@@ -201,35 +273,37 @@ The successful response includes a JSON object like the following:
 The error response includes a JSON object like the following:

-```shell
+```json
 {
-  "code": 3016,
-  "message": "Can't change embedding model since some files already use it."
+  "code": 102,
+  "message": "Can't change tenant_id."
 }
 ```

 ## List datasets

-**GET** `/api/v1/dataset?name={name}&page={page}&page_size={page_size}&orderby={orderby}&desc={desc}`
+**GET** `/api/v1/dataset?page={page}&page_size={page_size}&orderby={orderby}&desc={desc}&name={dataset_name}&id={dataset_id}`

 List all datasets

 ### Request

 - Method: GET
-- URL: `/api/v1/dataset?name={name}&page={page}&page_size={page_size}&orderby={orderby}&desc={desc}`
+- URL: `http://{address}/api/v1/dataset?page={page}&page_size={page_size}&orderby={orderby}&desc={desc}&name={dataset_name}&id={dataset_id}`
 - Headers:
-  - `content-Type: application/json`
   - 'Authorization: Bearer {YOUR_ACCESS_TOKEN}'

 #### Request example

-```shell
+```bash
+# If no page parameter is passed, the default is 1
+# If no page_size parameter is passed, the default is 1024
+# If no orderby parameter is passed, the default is "create_time"
+# If no desc parameter is passed, the default is True
 curl --request GET \
-  --url http://{address}/api/v1/dataset?page=0&page_size=50&orderby=create_time&desc=false \
-  --header 'Content-Type: application/json' \
+  --url http://{address}/api/v1/dataset?page={page}&page_size={page_size}&orderby={orderby}&desc={desc}&name={dataset_name}&id={dataset_id} \
   --header 'Authorization: Bearer {YOUR_ACCESS_TOKEN}'
 ```

 #### Request parameters

@@ -244,54 +318,63 @@ curl --request GET \
   A boolean flag indicating whether the sorting should be in descending order.
 - `name`: (*Path parameter*)
   Dataset name
-
+- `"id"`: (*Path parameter*)
+  The ID of the dataset to be retrieved.
+- `"name"`: (*Path parameter*)
+  The name of the dataset to be retrieved.

 ### Response

 The successful response includes a JSON object like the following:

-```shell
+```json
 {
   "code": 0,
   "data": [
     {
       "avatar": "",
-      "chunk_count": 0,
-      "create_date": "Thu, 29 Aug 2024 03:13:07 GMT",
-      "create_time": 1724901187843,
-      "created_by": "4fb0cd625f9311efba4a0242ac120006",
-      "description": "",
-      "document_count": 0,
+      "chunk_count": 59,
+      "create_date": "Sat, 14 Sep 2024 01:12:37 GMT",
+      "create_time": 1726276357324,
+      "created_by": "69736c5e723611efb51b0242ac120007",
+      "description": null,
+      "document_count": 1,
       "embedding_model": "BAAI/bge-large-zh-v1.5",
-      "id": "9d3d906665b411ef87d10242ac120006",
+      "id": "6e211ee0723611efa10a0242ac120007",
       "language": "English",
-      "name": "Test",
-      "parser_config": {
-        "chunk_token_count": 128,
-        "delimiter": "\n!?。;!?",
-        "layout_recognize": true,
-        "task_page_size": 12
-      },
-      "parse_method": "naive",
+      "name": "mysql",
+      "parse_method": "knowledge_graph",
+      "parser_config": {
+        "chunk_token_num": 8192,
+        "delimiter": "\\n!?;。;!?",
+        "entity_types": [
+          "organization",
+          "person",
+          "location",
+          "event",
+          "time"
+        ]
+      },
       "permission": "me",
       "similarity_threshold": 0.2,
       "status": "1",
-      "tenant_id": "4fb0cd625f9311efba4a0242ac120006",
-      "token_count": 0,
-      "update_date": "Thu, 29 Aug 2024 03:13:07 GMT",
-      "update_time": 1724901187843,
+      "tenant_id": "69736c5e723611efb51b0242ac120007",
+      "token_num": 12744,
+      "update_date": "Thu, 10 Oct 2024 04:07:23 GMT",
+      "update_time": 1728533243536,
       "vector_similarity_weight": 0.3
     }
-  ],
+  ]
 }
 ```

 The error response includes a JSON object like the following:

-```shell
+```json
 {
-  "code": 3016,
-  "message": "Can't access database to get the dataset list."
+  "code": 102,
+  "message": "The dataset doesn't exist"
 }
 ```

@@ -38,9 +38,9 @@ The unique name of the dataset to create. It must adhere to the following requir
 #### avatar: `str`

-The url or ???????????????????????? path to the avatar image associated with the created dataset. Defaults to `""`
+Base64 encoding of the avatar. Defaults to `""`

-#### tenant_id: `str` ?????????????????
+#### tenant_id: `str`

 The id of the tenant associated with the created dataset is used to identify different users. Defaults to `None`.

@@ -55,9 +55,9 @@ The description of the created dataset. Defaults to `""`.
 The language setting of the created dataset. Defaults to `"English"`. ????????????

-#### embedding_model: `str` ????????????????
+#### embedding_model: `str`

-The specific model or algorithm used by the dataset to generate vector embeddings. Defaults to `""`.
+The specific model used by the dataset to generate vector embeddings. Defaults to `""`.

 - If creating a dataset, embedding_model must not be provided.
 - If updating a dataset, embedding_model can't be changed.

@@ -89,12 +89,10 @@ The method used by the dataset to parse and process data.
 The configuration settings for the parser used by the dataset.

 ### Returns

-- Success: An `infinity.local_infinity.table.LocalTable` object in Python module mode or an `infinity.remote_thrift.table.RemoteTable` object in client-server mode.
-- Failure: `InfinityException`
-  - `error_code`: `int` A non-zero value indicating a specific error condition.
-  - `error_msg`: `str` A message providing additional details about the error.
+```python
+DataSet
+description: dataset object
+```

 ### Examples

 ```python

@@ -106,19 +104,28 @@ ds = rag.create_dataset(name="kb_1")
 ---

-## Delete knowledge base
+## Delete knowledge bases

 ```python
-DataSet.delete() -> bool
+RAGFlow.delete_dataset(ids: List[str] = None, names: List[str] = None)
 ```

-Deletes a knowledge base.
+Deletes knowledge bases.
+
+### Parameters
+
+#### ids: `List[str]`
+The ids of the datasets to be deleted.
+
+#### names: `List[str]`
+The names of the datasets to be deleted.
+
+Either `ids` or `names` must be provided.

 ### Returns

-`bool`
-description:the case of updating an dateset, `True` or `False`.
+```python
+no return
+```

 ### Examples

@@ -126,8 +133,8 @@ description:the case of updating an dateset, `True` or `False`.
 from ragflow import RAGFlow

 rag = RAGFlow(api_key="xxxxxx", base_url="http://xxx.xx.xx.xxx:9380")
-ds = rag.create_dataset(name="kb_1")
-ds.delete()
+rag.delete_dataset(names=["name_1","name_2"])
+rag.delete_dataset(ids=["id_1","id_2"])
 ```

 ---

@@ -139,7 +146,9 @@ RAGFlow.list_datasets(
     page: int = 1,
     page_size: int = 1024,
     orderby: str = "create_time",
-    desc: bool = True
+    desc: bool = True,
+    id: str = None,
+    name: str = None
 ) -> List[DataSet]
 ```

@@ -163,6 +172,14 @@ The field by which the records should be sorted. This specifies the attribute or
 Whether the sorting should be in descending order. Defaults to `True`.

+#### id: `str`
+The id of the dataset to be got. Defaults to `None`.
+
+#### name: `str`
+The name of the dataset to be got. Defaults to `None`.
+
 ### Returns

 ```python

@@ -182,57 +199,17 @@ for ds in rag.list_datasets():
 ---

-## Retrieve knowledge base
+## Update knowledge base

 ```python
-RAGFlow.get_dataset(
-    id: str = None,
-    name: str = None
-) -> DataSet
+DataSet.update(update_message: dict)
 ```

-Retrieves a knowledge base by name.
-
-### Parameters
-
-#### name: `str`
-The name of the dataset to be got. If `id` is not provided, `name` is required.
-
-#### id: `str`
-The id of the dataset to be got. If `name` is not provided, `id` is required.
-
-### Returns
-
-```python
-DataSet
-description: dataset object
-```
-
-### Examples
-
-```python
-from ragflow import RAGFlow
-
-rag = RAGFlow(api_key="xxxxxx", base_url="http://xxx.xx.xx.xxx:9380")
-ds = rag.get_dataset(name="ragflow")
-print(ds)
-```
-
----
-
-## Save knowledge base configurations
-
-```python
-DataSet.save() -> bool
-```
-
 ### Returns

 ```python
-bool
-description:the case of updating an dateset, True or False.
+no return
 ```

 ### Examples

@@ -242,8 +219,7 @@ from ragflow import RAGFlow
 rag = RAGFlow(api_key="xxxxxx", base_url="http://xxx.xx.xx.xxx:9380")
 ds = rag.get_dataset(name="kb_1")
-ds.parse_method = "manual"
-ds.save()
+ds.update({"parse_method": "manual", ...})
 ```

 ---

@@ -268,3 +268,32 @@ def token_required(func):
         return func(*args, **kwargs)

     return decorated_function
+
+
+def get_result(retcode=RetCode.SUCCESS, retmsg='error', data=None):
+    if retcode == 0:
+        if data is not None:
+            response = {"code": retcode, "data": data}
+        else:
+            response = {"code": retcode}
+    else:
+        response = {"code": retcode, "message": retmsg}
+    return jsonify(response)
+
+
+def get_error_data_result(retcode=RetCode.DATA_ERROR,
+                          retmsg='Sorry! Data missing!'):
+    import re
+    result_dict = {
+        "code": retcode,
+        "message": re.sub(
+            r"rag",
+            "seceum",
+            retmsg,
+            flags=re.IGNORECASE)}
+    response = {}
+    for key, value in result_dict.items():
+        if value is None and key != "code":
+            continue
+        else:
+            response[key] = value
+    return jsonify(response)
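
These two helpers standardize SDK responses on a `code`/`message`/`data` envelope, replacing the older `retcode`/`retmsg` one. A quick sketch of the shapes they produce, assuming the helpers are importable in the app's environment and that `RetCode.DATA_ERROR` is the 102 seen in the doc examples above:

```python
from flask import Flask

from api.utils.api_utils import get_result, get_error_data_result

app = Flask(__name__)

with app.test_request_context():
    print(get_result(data={"id": "xxx"}).get_json())  # {'code': 0, 'data': {'id': 'xxx'}}
    print(get_result().get_json())                    # {'code': 0}
    print(get_error_data_result(retmsg="oops").get_json())
    # {'code': 102, 'message': 'oops'}
```

Note that `get_error_data_result` also rewrites any case-insensitive "rag" in the message to "seceum" before returning it.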

@@ -30,5 +30,9 @@ class Base(object):
         res = self.rag.delete(path, params)
         return res

+    def put(self, path, json):
+        res = self.rag.put(path, json)
+        return res
+
     def __str__(self):
         return str(self.to_json())

@@ -32,24 +32,13 @@ class DataSet(Base):
                 res_dict.pop(k)
         super().__init__(rag, res_dict)

-    def save(self) -> bool:
-        res = self.post('/dataset/save',
-                        {"id": self.id, "name": self.name, "avatar": self.avatar, "tenant_id": self.tenant_id,
-                         "description": self.description, "language": self.language, "embedding_model": self.embedding_model,
-                         "permission": self.permission,
-                         "document_count": self.document_count, "chunk_count": self.chunk_count, "parse_method": self.parse_method,
-                         "parser_config": self.parser_config.to_json()
-                         })
+    def update(self, update_message: dict):
+        res = self.put(f'/dataset/{self.id}',
+                       update_message)
         res = res.json()
-        if res.get("retmsg") == "success": return True
-        raise Exception(res["retmsg"])
-
-    def delete(self) -> bool:
-        res = self.rm('/dataset/delete',
-                      {"id": self.id})
-        res = res.json()
-        if res.get("retmsg") == "success": return True
-        raise Exception(res["retmsg"])
+        if res.get("code") != 0:
+            raise Exception(res["message"])

     def list_docs(self, keywords: Optional[str] = None, offset: int = 0, limit: int = -1) -> List[Document]:
         """

@@ -18,9 +18,9 @@ from typing import List
 import requests

 from .modules.assistant import Assistant
+from .modules.chunk import Chunk
 from .modules.dataset import DataSet
 from .modules.document import Document
-from .modules.chunk import Chunk


 class RAGFlow:

@@ -41,7 +41,11 @@ class RAGFlow:
         return res

     def delete(self, path, params):
-        res = requests.delete(url=self.api_url + path, params=params, headers=self.authorization_header)
+        res = requests.delete(url=self.api_url + path, json=params, headers=self.authorization_header)
+        return res
+
+    def put(self, path, json):
+        res = requests.put(url=self.api_url + path, json=json, headers=self.authorization_header)
         return res

     def create_dataset(self, name: str, avatar: str = "", description: str = "", language: str = "English",

@@ -52,7 +56,7 @@ class RAGFlow:
             parser_config = DataSet.ParserConfig(self, {"chunk_token_count": 128, "layout_recognize": True,
                                                         "delimiter": "\n!?。;!?", "task_page_size": 12})
         parser_config = parser_config.to_json()
-        res = self.post("/dataset/save",
+        res = self.post("/dataset",
                         {"name": name, "avatar": avatar, "description": description, "language": language,
                          "permission": permission,
                          "document_count": document_count, "chunk_count": chunk_count, "parse_method": parse_method,

@@ -60,27 +64,28 @@ class RAGFlow:
                          }
                         )
         res = res.json()
-        if res.get("retmsg") == "success":
+        if res.get("code") == 0:
             return DataSet(self, res["data"])
-        raise Exception(res["retmsg"])
+        raise Exception(res["message"])

-    def list_datasets(self, page: int = 1, page_size: int = 1024, orderby: str = "create_time", desc: bool = True) -> \
+    def delete_dataset(self, ids: List[str] = None, names: List[str] = None):
+        res = self.delete("/dataset", {"ids": ids, "names": names})
+        res = res.json()
+        if res.get("code") != 0:
+            raise Exception(res["message"])
+
+    def list_datasets(self, page: int = 1, page_size: int = 1024, orderby: str = "create_time", desc: bool = True,
+                      id: str = None, name: str = None) -> \
             List[DataSet]:
-        res = self.get("/dataset/list", {"page": page, "page_size": page_size, "orderby": orderby, "desc": desc})
+        res = self.get("/dataset",
+                       {"page": page, "page_size": page_size, "orderby": orderby, "desc": desc, "id": id, "name": name})
         res = res.json()
         result_list = []
-        if res.get("retmsg") == "success":
+        if res.get("code") == 0:
             for data in res['data']:
                 result_list.append(DataSet(self, data))
             return result_list
-        raise Exception(res["retmsg"])
-
-    def get_dataset(self, id: str = None, name: str = None) -> DataSet:
-        res = self.get("/dataset/detail", {"id": id, "name": name})
-        res = res.json()
-        if res.get("retmsg") == "success":
-            return DataSet(self, res['data'])
-        raise Exception(res["retmsg"])

     def create_assistant(self, name: str = "assistant", avatar: str = "path", knowledgebases: List[DataSet] = [],
                          llm: Assistant.LLM = None, prompt: Assistant.Prompt = None) -> Assistant:

@@ -272,4 +277,3 @@ class RAGFlow:
         except Exception as e:
             print(f"An error occurred during retrieval: {e}")
             raise
-
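
One subtle transport change above: `delete()` now sends its payload as a JSON body (`json=params`) instead of query parameters, matching the server's `request.json` parsing in the new DELETE handler. The equivalent raw HTTP call would look something like this (address and token are placeholders):

```python
import requests

res = requests.delete(
    "http://127.0.0.1:9380/api/v1/dataset",
    json={"names": ["test_1", "test_2"]},
    headers={"Authorization": "Bearer <YOUR_ACCESS_TOKEN>"},
)
print(res.json())  # {"code": 0} on success
```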

@@ -1,4 +1,4 @@
-API_KEY = 'ragflow-k0YzUxMGY4NjY5YTExZWY5MjI5MDI0Mm'
+API_KEY = 'ragflow-NiYmZjNTVjODYwNzExZWZiODEwMDI0Mm'
 HOST_ADDRESS = 'http://127.0.0.1:9380'

@@ -24,9 +24,8 @@ class TestDataset(TestSdk):
         ds = rag.create_dataset("ABC")
         if isinstance(ds, DataSet):
             assert ds.name == "ABC", "Name does not match."
-            ds.name = 'DEF'
-            res = ds.save()
-            assert res is True, f"Failed to update dataset, error: {res}"
+            res = ds.update({"name": "DEF"})
+            assert res is None, f"Failed to update dataset, error: {res}"
         else:
             assert False, f"Failed to create dataset, error: {ds}"

@@ -38,8 +37,8 @@ class TestDataset(TestSdk):
         ds = rag.create_dataset("MA")
         if isinstance(ds, DataSet):
             assert ds.name == "MA", "Name does not match."
-            res = ds.delete()
-            assert res is True, f"Failed to delete dataset, error: {res}"
+            res = rag.delete_dataset(names=["MA"])
+            assert res is None, f"Failed to delete dataset, error: {res}"
         else:
             assert False, f"Failed to create dataset, error: {ds}"

@@ -52,12 +51,3 @@ class TestDataset(TestSdk):
         assert len(list_datasets) > 0, "Do not exist any dataset"
         for ds in list_datasets:
             assert isinstance(ds, DataSet), "Existence type is not dataset."
-
-    def test_get_detail_dataset_with_success(self):
-        """
-        Test getting a dataset's detail with success
-        """
-        rag = RAGFlow(API_KEY, HOST_ADDRESS)
-        ds = rag.get_dataset(name="God")
-        assert isinstance(ds, DataSet), f"Failed to get dataset, error: {ds}."
-        assert ds.name == "God", "Name does not match"