Add ParserType Audio (#1637)

### What problem does this PR solve?

#1514 

### Type of change

- [x] New Feature (non-breaking change which adds functionality)
H 2024-07-22 19:17:30 +08:00 committed by GitHub
parent 9f109adf28
commit ac7a0d4fbf
10 changed files with 80 additions and 8 deletions


@@ -335,6 +335,8 @@ def upload():
             doc["parser_id"] = request.form.get("parser_id").strip()
         if doc["type"] == FileType.VISUAL:
             doc["parser_id"] = ParserType.PICTURE.value
+        if doc["type"] == FileType.AURAL:
+            doc["parser_id"] = ParserType.AUDIO.value
         if re.search(r"\.(ppt|pptx|pages)$", filename):
             doc["parser_id"] = ParserType.PRESENTATION.value
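The effect of the two added lines: the detected `FileType` overrides whatever `parser_id` the client sent, so aural uploads are always routed to the audio parser, mirroring the existing visual-to-picture rule. A standalone sketch of that precedence, using a hypothetical `pick_parser_id` helper and plain strings in place of the repo's enums:

```python
# Minimal sketch of the parser-selection precedence added in this PR.
# pick_parser_id is a hypothetical helper; the real logic lives inline in upload(),
# and the string literals stand in for FileType/ParserType enum values.
import re


def pick_parser_id(file_type: str, filename: str, requested: str | None) -> str:
    parser_id = (requested or "naive").strip()
    if file_type == "visual":
        parser_id = "picture"
    if file_type == "aural":        # new rule: audio files ...
        parser_id = "audio"         # ... go to the audio parser
    if re.search(r"\.(ppt|pptx|pages)$", filename):
        parser_id = "presentation"
    return parser_id


assert pick_parser_id("aural", "meeting.mp3", None) == "audio"
```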


@@ -39,7 +39,7 @@ from api.utils import get_uuid
 from api.utils.api_utils import construct_json_result, construct_error_response
 from api.utils.api_utils import construct_result, validate_request
 from api.utils.file_utils import filename_type, thumbnail
-from rag.app import book, laws, manual, naive, one, paper, presentation, qa, resume, table, picture
+from rag.app import book, laws, manual, naive, one, paper, presentation, qa, resume, table, picture, audio
 from rag.nlp import search
 from rag.utils.es_conn import ELASTICSEARCH
 from rag.utils.minio_conn import MINIO
@@ -377,6 +377,8 @@ def upload_documents(dataset_id):
             }
             if doc["type"] == FileType.VISUAL:
                 doc["parser_id"] = ParserType.PICTURE.value
+            if doc["type"] == FileType.AURAL:
+                doc["parser_id"] = ParserType.AUDIO.value
             if re.search(r"\.(ppt|pptx|pages)$", filename):
                 doc["parser_id"] = ParserType.PRESENTATION.value
             DocumentService.insert(doc)
@@ -648,6 +650,8 @@ def doc_parse(binary, doc_name, parser_name, tenant_id, doc_id):
             resume.chunk(doc_name, binary=binary, callback=partial(doc_parse_callback, doc_id))
         case "table":
             table.chunk(doc_name, binary=binary, callback=partial(doc_parse_callback, doc_id))
+        case "audio":
+            audio.chunk(doc_name, binary=binary, callback=partial(doc_parse_callback, doc_id))
         case _:
             return False
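`doc_parse` dispatches on the parser name with a `match` statement, and unknown names fall through to `case _` and report failure, which is why the explicit `"audio"` arm is needed. A minimal, self-contained sketch of that pattern (print statements stand in for the real chunker calls):

```python
# Self-contained sketch of the doc_parse dispatch pattern; prints stand in for
# the real chunker calls, and unrecognized parser names still return False.
def dispatch(parser_name: str) -> bool:
    match parser_name:
        case "picture":
            print("picture chunker")
        case "audio":               # arm added by this PR
            print("audio chunker")
        case _:
            return False
    return True


print(dispatch("audio"))   # audio chunker / True
print(dispatch("video"))   # False
```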


@@ -105,6 +105,8 @@ def upload():
             }
             if doc["type"] == FileType.VISUAL:
                 doc["parser_id"] = ParserType.PICTURE.value
+            if doc["type"] == FileType.AURAL:
+                doc["parser_id"] = ParserType.AUDIO.value
             if re.search(r"\.(ppt|pptx|pages)$", filename):
                 doc["parser_id"] = ParserType.PRESENTATION.value
             DocumentService.insert(doc)
@@ -171,6 +173,8 @@ def web_crawl():
             }
             if doc["type"] == FileType.VISUAL:
                 doc["parser_id"] = ParserType.PICTURE.value
+            if doc["type"] == FileType.AURAL:
+                doc["parser_id"] = ParserType.AUDIO.value
             if re.search(r"\.(ppt|pptx|pages)$", filename):
                 doc["parser_id"] = ParserType.PRESENTATION.value
             DocumentService.insert(doc)


@@ -84,6 +84,7 @@ class ParserType(StrEnum):
     NAIVE = "naive"
     PICTURE = "picture"
     ONE = "one"
+    AUDIO = "audio"


 class FileSource(StrEnum):
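Assuming the repo's `StrEnum` is the usual `str`/`Enum` mixin, the new member both stores and compares as the plain string `"audio"`, which is how `parser_id` round-trips through the database. A small standalone illustration (the enum is re-declared here rather than imported):

```python
# Re-declared here for illustration; the real enum lives in api/db/__init__.py
# (assuming its StrEnum base is the usual str/Enum mixin).
from enum import Enum


class ParserType(str, Enum):
    PICTURE = "picture"
    AUDIO = "audio"        # member added by this PR


# The str mixin makes the member interchangeable with its plain-string value,
# which is how parser_id is stored and compared throughout the code base.
assert ParserType.AUDIO.value == "audio"
assert ParserType.AUDIO == "audio"
print(ParserType("audio"))   # ParserType.AUDIO
```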


@@ -121,6 +121,8 @@ def init_llm_factory():
     LLMFactoriesService.filter_delete([LLMFactoriesService.model.name == "QAnything"])
     LLMService.filter_delete([LLMService.model.fid == "QAnything"])
     TenantLLMService.filter_update([TenantLLMService.model.llm_factory == "QAnything"], {"llm_factory": "Youdao"})
+    TenantService.filter_update([1 == 1], {
+        "parser_ids": "naive:General,qa:Q&A,resume:Resume,manual:Manual,table:Table,paper:Paper,book:Book,laws:Laws,presentation:Presentation,picture:Picture,one:One,audio:Audio"})
     ## insert openai two embedding models to the current openai user.
     print("Start to insert 2 OpenAI embedding models...")
     tenant_ids = set([row["tenant_id"] for row in TenantLLMService.get_openai_models()])
@@ -143,7 +145,7 @@ def init_llm_factory():
     """
     drop table llm;
     drop table llm_factories;
-    update tenant set parser_ids='naive:General,qa:Q&A,resume:Resume,manual:Manual,table:Table,paper:Paper,book:Book,laws:Laws,presentation:Presentation,picture:Picture,one:One';
+    update tenant set parser_ids='naive:General,qa:Q&A,resume:Resume,manual:Manual,table:Table,paper:Paper,book:Book,laws:Laws,presentation:Presentation,picture:Picture,one:One,audio:Audio';
     alter table knowledgebase modify avatar longtext;
     alter table user modify avatar longtext;
     alter table dialog modify icon longtext;
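Both the Python migration and the raw SQL above extend the tenant's `parser_ids` setting, a comma-separated list of `id:Label` pairs that drives the parser choices offered per tenant. A short sketch of how such a string unpacks; the `parse_parser_ids` helper is illustrative, not code from the repo:

```python
# Illustrative parsing of the parser_ids string into {parser_id: display label}.
PARSER_IDS = ("naive:General,qa:Q&A,resume:Resume,manual:Manual,table:Table,"
              "paper:Paper,book:Book,laws:Laws,presentation:Presentation,"
              "picture:Picture,one:One,audio:Audio")


def parse_parser_ids(raw: str) -> dict[str, str]:
    # Split each pair on the first ':' only, so labels such as "Q&A" stay intact.
    return dict(pair.split(":", 1) for pair in raw.split(","))


labels = parse_parser_ids(PARSER_IDS)
print(labels["audio"])   # Audio
```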


@@ -15,7 +15,7 @@
 #
 from api.db.services.user_service import TenantService
 from api.settings import database_logger
-from rag.llm import EmbeddingModel, CvModel, ChatModel, RerankModel
+from rag.llm import EmbeddingModel, CvModel, ChatModel, RerankModel, Seq2txtModel
 from api.db import LLMType
 from api.db.db_models import DB, UserTenant
 from api.db.db_models import LLMFactories, LLM, TenantLLM
@@ -120,6 +120,14 @@ class TenantLLMService(CommonService):
             return ChatModel[model_config["llm_factory"]](
                 model_config["api_key"], model_config["llm_name"], base_url=model_config["api_base"])

+        if llm_type == LLMType.SPEECH2TEXT:
+            if model_config["llm_factory"] not in Seq2txtModel:
+                return
+            return Seq2txtModel[model_config["llm_factory"]](
+                model_config["api_key"], model_config["llm_name"], lang,
+                base_url=model_config["api_base"]
+            )
+
     @classmethod
     @DB.connection_context()
     def increase_usage(cls, tenant_id, llm_type, used_tokens, llm_name=None):
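`Seq2txtModel` is used as a registry keyed by factory name, mirroring `ChatModel` and `CvModel` above, and each entry is constructed with `(api_key, llm_name, lang, base_url=...)`; the return shape of `transcription()` is inferred from `LLMBundle.transcription` below. A hedged sketch of a conforming entry, with the class and its internals invented for illustration:

```python
# Hypothetical speech-to-text wrapper that satisfies the interface assumed above:
# constructed with (api_key, llm_name, lang, base_url=...) and exposing
# transcription(audio) -> (text, used_token_count).
class DummySeq2txt:
    def __init__(self, key: str, model_name: str, lang: str = "Chinese",
                 base_url: str | None = None):
        self.key, self.model_name, self.lang, self.base_url = key, model_name, lang, base_url

    def transcription(self, audio: bytes) -> tuple[str, int]:
        # A real entry would call its factory's speech-to-text API here.
        text = f"<transcript of {len(audio)} bytes from {self.model_name}>"
        return text, len(text)


# Registry keyed by factory name, the shape model_instance indexes into.
Seq2txtModel = {"DummyFactory": DummySeq2txt}

mdl = Seq2txtModel["DummyFactory"]("sk-placeholder", "some-asr-model", "English")
print(mdl.transcription(b"\x00" * 1024))
```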
@@ -207,6 +215,14 @@ class LLMBundle(object):
                 "Can't update token usage for {}/IMAGE2TEXT".format(self.tenant_id))
         return txt

+    def transcription(self, audio):
+        txt, used_tokens = self.mdl.transcription(audio)
+        if not TenantLLMService.increase_usage(
+                self.tenant_id, self.llm_type, used_tokens):
+            database_logger.error(
+                "Can't update token usage for {}/SEQUENCE2TXT".format(self.tenant_id))
+        return txt
+
     def chat(self, system, history, gen_conf):
         txt, used_tokens = self.mdl.chat(system, history, gen_conf)
         if not TenantLLMService.increase_usage(
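With the bundle method in place, callers only need a tenant id and `LLMType.SPEECH2TEXT`; token usage is recorded through `TenantLLMService.increase_usage` as with the other modalities. A usage sketch with a placeholder tenant id and audio file:

```python
# Usage sketch; "tenant-0" and meeting.wav are placeholders.
from api.db import LLMType
from api.db.services.llm_service import LLMBundle

seq2txt_mdl = LLMBundle("tenant-0", LLMType.SPEECH2TEXT, lang="English")
with open("meeting.wav", "rb") as f:
    transcript = seq2txt_mdl.transcription(f.read())   # token usage is logged internally
print(transcript[:80])
```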


@@ -131,7 +131,7 @@ IMAGE2TEXT_MDL = default_llm[LLM_FACTORY]["image2text_model"]
 API_KEY = LLM.get("api_key", "")
 PARSERS = LLM.get(
     "parsers",
-    "naive:General,qa:Q&A,resume:Resume,manual:Manual,table:Table,paper:Paper,book:Book,laws:Laws,presentation:Presentation,picture:Picture,one:One")
+    "naive:General,qa:Q&A,resume:Resume,manual:Manual,table:Table,paper:Paper,book:Book,laws:Laws,presentation:Presentation,picture:Picture,one:One,audio:Audio")

 # distribution
 DEPENDENT_DISTRIBUTION = get_base_config("dependent_distribution", False)

rag/app/audio.py (new file, 42 lines)

@@ -0,0 +1,42 @@
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+#
+import io
+import re
+import numpy as np
+
+from api.db import LLMType
+from rag.nlp import rag_tokenizer
+from api.db.services.llm_service import LLMBundle
+from rag.nlp import tokenize
+
+
+def chunk(filename, binary, tenant_id, lang, callback=None, **kwargs):
+    doc = {
+        "docnm_kwd": filename,
+        "title_tks": rag_tokenizer.tokenize(re.sub(r"\.[a-zA-Z]+$", "", filename))
+    }
+    doc["title_sm_tks"] = rag_tokenizer.fine_grained_tokenize(doc["title_tks"])
+
+    # is it English
+    eng = lang.lower() == "english"  # is_english(sections)
+    try:
+        callback(0.1, "USE Sequence2Txt LLM to transcription the audio")
+        seq2txt_mdl = LLMBundle(tenant_id, LLMType.SPEECH2TEXT, lang=lang)
+        ans = seq2txt_mdl.transcription(binary)
+        callback(0.8, "Sequence2Txt LLM respond: %s ..." % ans[:32])
+        tokenize(doc, ans, eng)
+        return [doc]
+    except Exception as e:
+        callback(prog=-1, msg=str(e))
+
+    return []
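For completeness, a sketch of driving the new chunker directly; the file, tenant id, and progress callback are placeholders, and the callback signature matches how `doc_parse` wires it up with `functools.partial`:

```python
# Sketch of calling the new chunker directly; file, tenant id, and callback are placeholders.
from functools import partial

from rag.app import audio


def progress(doc_id, prog=None, msg=""):
    print(f"[{doc_id}] {prog}: {msg}")


with open("meeting.mp3", "rb") as f:
    docs = audio.chunk(
        "meeting.mp3",
        binary=f.read(),
        tenant_id="tenant-0",
        lang="English",
        callback=partial(progress, "doc-0"),
    )
print(len(docs))   # 1 on success, 0 if transcription failed
```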


@@ -42,7 +42,7 @@ def chunk(filename, binary, tenant_id, lang, callback=None, **kwargs):
         callback(0.4, "Use CV LLM to describe the picture.")
         cv_mdl = LLMBundle(tenant_id, LLMType.IMAGE2TEXT, lang=lang)
         ans = cv_mdl.describe(binary)
-        callback(0.8, "CV LLM respoond: %s ..." % ans[:32])
+        callback(0.8, "CV LLM respond: %s ..." % ans[:32])
         txt += "\n" + ans
         tokenize(doc, txt, eng)
         return [doc]


@@ -45,7 +45,7 @@ from rag.nlp import search, rag_tokenizer
 from io import BytesIO
 import pandas as pd
-from rag.app import laws, paper, presentation, manual, qa, table, book, resume, picture, naive, one
+from rag.app import laws, paper, presentation, manual, qa, table, book, resume, picture, naive, one, audio
 from api.db import LLMType, ParserType
 from api.db.services.document_service import DocumentService
@@ -68,6 +68,7 @@ FACTORY = {
     ParserType.RESUME.value: resume,
     ParserType.PICTURE.value: picture,
     ParserType.ONE.value: one,
+    ParserType.AUDIO.value: audio
 }
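With this registration, documents whose `parser_id` is `audio` are picked up by the background worker and routed to `rag.app.audio.chunk`. An illustrative lookup only; the real dispatch in `task_executor.py` passes additional arguments into `chunk()`:

```python
# Illustrative FACTORY lookup; the real dispatch in task_executor.py passes page
# ranges, the knowledge-base parser config, and a progress callback into chunk().
from api.db import ParserType
from rag.app import audio, picture

FACTORY = {
    ParserType.PICTURE.value: picture,
    ParserType.AUDIO.value: audio,   # registration added by this PR
}

chunker = FACTORY[ParserType.AUDIO.value]
print(chunker.__name__)   # rag.app.audio
```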