diff --git a/api/apps/api_app.py b/api/apps/api_app.py
index b0ab20ed..52f0ba1f 100644
--- a/api/apps/api_app.py
+++ b/api/apps/api_app.py
@@ -335,6 +335,8 @@ def upload():
         doc["parser_id"] = request.form.get("parser_id").strip()
         if doc["type"] == FileType.VISUAL:
             doc["parser_id"] = ParserType.PICTURE.value
+        if doc["type"] == FileType.AURAL:
+            doc["parser_id"] = ParserType.AUDIO.value
         if re.search(r"\.(ppt|pptx|pages)$", filename):
             doc["parser_id"] = ParserType.PRESENTATION.value
 
@@ -581,4 +583,4 @@ def completion_faq():
 
         return response
     except Exception as e:
-        return server_error_response(e)
\ No newline at end of file
+        return server_error_response(e)
diff --git a/api/apps/dataset_api.py b/api/apps/dataset_api.py
index b9d1d7d3..111cf2ec 100644
--- a/api/apps/dataset_api.py
+++ b/api/apps/dataset_api.py
@@ -39,7 +39,7 @@ from api.utils import get_uuid
 from api.utils.api_utils import construct_json_result, construct_error_response
 from api.utils.api_utils import construct_result, validate_request
 from api.utils.file_utils import filename_type, thumbnail
-from rag.app import book, laws, manual, naive, one, paper, presentation, qa, resume, table, picture
+from rag.app import book, laws, manual, naive, one, paper, presentation, qa, resume, table, picture, audio
 from rag.nlp import search
 from rag.utils.es_conn import ELASTICSEARCH
 from rag.utils.minio_conn import MINIO
@@ -377,6 +377,8 @@ def upload_documents(dataset_id):
         }
         if doc["type"] == FileType.VISUAL:
             doc["parser_id"] = ParserType.PICTURE.value
+        if doc["type"] == FileType.AURAL:
+            doc["parser_id"] = ParserType.AUDIO.value
         if re.search(r"\.(ppt|pptx|pages)$", filename):
             doc["parser_id"] = ParserType.PRESENTATION.value
         DocumentService.insert(doc)
@@ -648,6 +650,8 @@ def doc_parse(binary, doc_name, parser_name, tenant_id, doc_id):
             resume.chunk(doc_name, binary=binary, callback=partial(doc_parse_callback, doc_id))
         case "table":
             table.chunk(doc_name, binary=binary, callback=partial(doc_parse_callback, doc_id))
+        case "audio":
+            audio.chunk(doc_name, binary=binary, callback=partial(doc_parse_callback, doc_id))
         case _:
             return False
 
diff --git a/api/apps/document_app.py b/api/apps/document_app.py
index ea178bde..31611dd9 100644
--- a/api/apps/document_app.py
+++ b/api/apps/document_app.py
@@ -105,6 +105,8 @@ def upload():
         }
         if doc["type"] == FileType.VISUAL:
             doc["parser_id"] = ParserType.PICTURE.value
+        if doc["type"] == FileType.AURAL:
+            doc["parser_id"] = ParserType.AUDIO.value
         if re.search(r"\.(ppt|pptx|pages)$", filename):
             doc["parser_id"] = ParserType.PRESENTATION.value
         DocumentService.insert(doc)
@@ -171,6 +173,8 @@ def web_crawl():
         }
         if doc["type"] == FileType.VISUAL:
             doc["parser_id"] = ParserType.PICTURE.value
+        if doc["type"] == FileType.AURAL:
+            doc["parser_id"] = ParserType.AUDIO.value
         if re.search(r"\.(ppt|pptx|pages)$", filename):
             doc["parser_id"] = ParserType.PRESENTATION.value
         DocumentService.insert(doc)
diff --git a/api/db/__init__.py b/api/db/__init__.py
index 6d8fb0ab..81d79c1a 100644
--- a/api/db/__init__.py
+++ b/api/db/__init__.py
@@ -84,6 +84,7 @@ class ParserType(StrEnum):
     NAIVE = "naive"
     PICTURE = "picture"
     ONE = "one"
+    AUDIO = "audio"
 
 
 class FileSource(StrEnum):
@@ -96,4 +97,4 @@ class CanvasType(StrEnum):
     ChatBot = "chatbot"
     DocBot = "docbot"
 
-KNOWLEDGEBASE_FOLDER_NAME=".knowledgebase"
\ No newline at end of file
+KNOWLEDGEBASE_FOLDER_NAME=".knowledgebase"
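
The same three-way parser selection is now repeated in api_app.py, dataset_api.py, and document_app.py (twice there: upload() and web_crawl()). For readers skimming the hunks, here is a condensed sketch of the routing rule; the helper name and the default_parser_id argument are illustrative only and do not exist in this patch:

import re

from api.db import FileType, ParserType


def pick_parser_id(filename, file_type, default_parser_id):
    # Hypothetical helper, not part of this patch: it mirrors the branch order
    # used in upload()/web_crawl(): visual files -> picture parser, aural files
    # -> the new audio parser, slide decks (by extension) -> presentation parser.
    parser_id = default_parser_id
    if file_type == FileType.VISUAL:
        parser_id = ParserType.PICTURE.value
    if file_type == FileType.AURAL:
        parser_id = ParserType.AUDIO.value
    if re.search(r"\.(ppt|pptx|pages)$", filename):
        parser_id = ParserType.PRESENTATION.value
    return parser_id

Because the branches are independent if statements rather than an if/elif chain, the extension check still overrides the type check, matching the pre-existing behavior this patch extends.
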
diff --git a/api/db/init_data.py b/api/db/init_data.py
index 0acfe7b1..a042f450 100644
--- a/api/db/init_data.py
+++ b/api/db/init_data.py
@@ -121,6 +121,8 @@ def init_llm_factory():
     LLMFactoriesService.filter_delete([LLMFactoriesService.model.name == "QAnything"])
     LLMService.filter_delete([LLMService.model.fid == "QAnything"])
     TenantLLMService.filter_update([TenantLLMService.model.llm_factory == "QAnything"], {"llm_factory": "Youdao"})
+    TenantService.filter_update([1 == 1], {
+        "parser_ids": "naive:General,qa:Q&A,resume:Resume,manual:Manual,table:Table,paper:Paper,book:Book,laws:Laws,presentation:Presentation,picture:Picture,one:One,audio:Audio"})
     ## insert openai two embedding models to the current openai user.
     print("Start to insert 2 OpenAI embedding models...")
     tenant_ids = set([row["tenant_id"] for row in TenantLLMService.get_openai_models()])
@@ -143,7 +145,7 @@ def init_llm_factory():
     """
     drop table llm;
     drop table llm_factories;
-    update tenant set parser_ids='naive:General,qa:Q&A,resume:Resume,manual:Manual,table:Table,paper:Paper,book:Book,laws:Laws,presentation:Presentation,picture:Picture,one:One';
+    update tenant set parser_ids='naive:General,qa:Q&A,resume:Resume,manual:Manual,table:Table,paper:Paper,book:Book,laws:Laws,presentation:Presentation,picture:Picture,one:One,audio:Audio';
     alter table knowledgebase modify avatar longtext;
     alter table user modify avatar longtext;
     alter table dialog modify icon longtext;
diff --git a/api/db/services/llm_service.py b/api/db/services/llm_service.py
index a994d610..4c34b7e1 100644
--- a/api/db/services/llm_service.py
+++ b/api/db/services/llm_service.py
@@ -15,7 +15,7 @@
 #
 from api.db.services.user_service import TenantService
 from api.settings import database_logger
-from rag.llm import EmbeddingModel, CvModel, ChatModel, RerankModel
+from rag.llm import EmbeddingModel, CvModel, ChatModel, RerankModel, Seq2txtModel
 from api.db import LLMType
 from api.db.db_models import DB, UserTenant
 from api.db.db_models import LLMFactories, LLM, TenantLLM
@@ -120,6 +120,14 @@ class TenantLLMService(CommonService):
             return ChatModel[model_config["llm_factory"]](
                 model_config["api_key"], model_config["llm_name"], base_url=model_config["api_base"])
 
+        if llm_type == LLMType.SPEECH2TEXT:
+            if model_config["llm_factory"] not in Seq2txtModel:
+                return
+            return Seq2txtModel[model_config["llm_factory"]](
+                model_config["api_key"], model_config["llm_name"], lang,
+                base_url=model_config["api_base"]
+            )
+
     @classmethod
     @DB.connection_context()
     def increase_usage(cls, tenant_id, llm_type, used_tokens, llm_name=None):
@@ -207,6 +215,14 @@ class LLMBundle(object):
                 "Can't update token usage for {}/IMAGE2TEXT".format(self.tenant_id))
         return txt
 
+    def transcription(self, audio):
+        txt, used_tokens = self.mdl.transcription(audio)
+        if not TenantLLMService.increase_usage(
+                self.tenant_id, self.llm_type, used_tokens):
+            database_logger.error(
+                "Can't update token usage for {}/SEQUENCE2TXT".format(self.tenant_id))
+        return txt
+
     def chat(self, system, history, gen_conf):
         txt, used_tokens = self.mdl.chat(system, history, gen_conf)
         if not TenantLLMService.increase_usage(
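
Together, model_instance() and LLMBundle.transcription() pin down the contract a speech-to-text backend has to satisfy: it is constructed with (api_key, llm_name, lang, base_url=...) and its transcription(audio) method returns the transcript plus a token count. The patch does not show rag.llm's Seq2txtModel registry itself; the sketch below is an assumed illustration of that contract, with a made-up backend class and mapping key:

class DummySeq2txt:
    # Hypothetical backend, not part of this patch: it only demonstrates the
    # constructor arguments passed by TenantLLMService.model_instance() for
    # LLMType.SPEECH2TEXT and the (text, used_tokens) return shape expected
    # by LLMBundle.transcription().
    def __init__(self, key, model_name, lang, base_url=None):
        self.key = key
        self.model_name = model_name
        self.lang = lang
        self.base_url = base_url

    def transcription(self, audio):
        # A real backend would send the raw audio bytes to a speech-to-text
        # API here and count the tokens of the transcript it gets back.
        text = "transcript placeholder"
        return text, len(text.split())


# Presumably keyed by llm_factory name, like the other model registries imported above.
Seq2txtModel = {"Dummy": DummySeq2txt}

Keying the registry by factory name is what lets model_instance() bail out (returning None) when a tenant's configured factory has no speech-to-text implementation, which is exactly what the `not in Seq2txtModel` guard does.
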
"naive:General,qa:Q&A,resume:Resume,manual:Manual,table:Table,paper:Paper,book:Book,laws:Laws,presentation:Presentation,picture:Picture,one:One,audio:Audio") # distribution DEPENDENT_DISTRIBUTION = get_base_config("dependent_distribution", False) diff --git a/rag/app/audio.py b/rag/app/audio.py new file mode 100644 index 00000000..397e4e75 --- /dev/null +++ b/rag/app/audio.py @@ -0,0 +1,42 @@ +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +import io +import re +import numpy as np + +from api.db import LLMType +from rag.nlp import rag_tokenizer +from api.db.services.llm_service import LLMBundle +from rag.nlp import tokenize + + +def chunk(filename, binary, tenant_id, lang, callback=None, **kwargs): + doc = { + "docnm_kwd": filename, + "title_tks": rag_tokenizer.tokenize(re.sub(r"\.[a-zA-Z]+$", "", filename)) + } + doc["title_sm_tks"] = rag_tokenizer.fine_grained_tokenize(doc["title_tks"]) + + # is it English + eng = lang.lower() == "english" # is_english(sections) + try: + callback(0.1, "USE Sequence2Txt LLM to transcription the audio") + seq2txt_mdl = LLMBundle(tenant_id, LLMType.SPEECH2TEXT, lang=lang) + ans = seq2txt_mdl.transcription(binary) + callback(0.8, "Sequence2Txt LLM respond: %s ..." % ans[:32]) + tokenize(doc, ans, eng) + return [doc] + except Exception as e: + callback(prog=-1, msg=str(e)) + + return [] diff --git a/rag/app/picture.py b/rag/app/picture.py index 475b6f3b..0474b759 100644 --- a/rag/app/picture.py +++ b/rag/app/picture.py @@ -42,7 +42,7 @@ def chunk(filename, binary, tenant_id, lang, callback=None, **kwargs): callback(0.4, "Use CV LLM to describe the picture.") cv_mdl = LLMBundle(tenant_id, LLMType.IMAGE2TEXT, lang=lang) ans = cv_mdl.describe(binary) - callback(0.8, "CV LLM respoond: %s ..." % ans[:32]) + callback(0.8, "CV LLM respond: %s ..." % ans[:32]) txt += "\n" + ans tokenize(doc, txt, eng) return [doc] diff --git a/rag/svr/task_executor.py b/rag/svr/task_executor.py index a405e5ca..323b689a 100644 --- a/rag/svr/task_executor.py +++ b/rag/svr/task_executor.py @@ -45,7 +45,7 @@ from rag.nlp import search, rag_tokenizer from io import BytesIO import pandas as pd -from rag.app import laws, paper, presentation, manual, qa, table, book, resume, picture, naive, one +from rag.app import laws, paper, presentation, manual, qa, table, book, resume, picture, naive, one, audio from api.db import LLMType, ParserType from api.db.services.document_service import DocumentService @@ -68,6 +68,7 @@ FACTORY = { ParserType.RESUME.value: resume, ParserType.PICTURE.value: picture, ParserType.ONE.value: one, + ParserType.AUDIO.value: audio }