diff --git a/api/apps/api_app.py b/api/apps/api_app.py
index b0ab20ed..52f0ba1f 100644
--- a/api/apps/api_app.py
+++ b/api/apps/api_app.py
@@ -335,6 +335,8 @@ def upload():
         doc["parser_id"] = request.form.get("parser_id").strip()
         if doc["type"] == FileType.VISUAL:
             doc["parser_id"] = ParserType.PICTURE.value
+        if doc["type"] == FileType.AURAL:
+            doc["parser_id"] = ParserType.AUDIO.value
         if re.search(r"\.(ppt|pptx|pages)$", filename):
             doc["parser_id"] = ParserType.PRESENTATION.value
 
@@ -581,4 +583,4 @@ def completion_faq():
 
         return response
     except Exception as e:
-        return server_error_response(e)
\ No newline at end of file
+        return server_error_response(e)
diff --git a/api/apps/dataset_api.py b/api/apps/dataset_api.py
index b9d1d7d3..111cf2ec 100644
--- a/api/apps/dataset_api.py
+++ b/api/apps/dataset_api.py
@@ -39,7 +39,7 @@ from api.utils import get_uuid
 from api.utils.api_utils import construct_json_result, construct_error_response
 from api.utils.api_utils import construct_result, validate_request
 from api.utils.file_utils import filename_type, thumbnail
-from rag.app import book, laws, manual, naive, one, paper, presentation, qa, resume, table, picture
+from rag.app import book, laws, manual, naive, one, paper, presentation, qa, resume, table, picture, audio
 from rag.nlp import search
 from rag.utils.es_conn import ELASTICSEARCH
 from rag.utils.minio_conn import MINIO
@@ -377,6 +377,8 @@ def upload_documents(dataset_id):
         }
         if doc["type"] == FileType.VISUAL:
             doc["parser_id"] = ParserType.PICTURE.value
+        if doc["type"] == FileType.AURAL:
+            doc["parser_id"] = ParserType.AUDIO.value
         if re.search(r"\.(ppt|pptx|pages)$", filename):
             doc["parser_id"] = ParserType.PRESENTATION.value
         DocumentService.insert(doc)
@@ -648,6 +650,8 @@ def doc_parse(binary, doc_name, parser_name, tenant_id, doc_id):
             resume.chunk(doc_name, binary=binary, callback=partial(doc_parse_callback, doc_id))
         case "table":
             table.chunk(doc_name, binary=binary, callback=partial(doc_parse_callback, doc_id))
+        case "audio":
+            audio.chunk(doc_name, binary=binary, callback=partial(doc_parse_callback, doc_id))
         case _:
             return False
 
diff --git a/api/apps/document_app.py b/api/apps/document_app.py
index ea178bde..31611dd9 100644
--- a/api/apps/document_app.py
+++ b/api/apps/document_app.py
@@ -105,6 +105,8 @@ def upload():
         }
         if doc["type"] == FileType.VISUAL:
             doc["parser_id"] = ParserType.PICTURE.value
+        if doc["type"] == FileType.AURAL:
+            doc["parser_id"] = ParserType.AUDIO.value
         if re.search(r"\.(ppt|pptx|pages)$", filename):
             doc["parser_id"] = ParserType.PRESENTATION.value
         DocumentService.insert(doc)
@@ -171,6 +173,8 @@ def web_crawl():
         }
         if doc["type"] == FileType.VISUAL:
             doc["parser_id"] = ParserType.PICTURE.value
+        if doc["type"] == FileType.AURAL:
+            doc["parser_id"] = ParserType.AUDIO.value
         if re.search(r"\.(ppt|pptx|pages)$", filename):
             doc["parser_id"] = ParserType.PRESENTATION.value
         DocumentService.insert(doc)
diff --git a/api/db/__init__.py b/api/db/__init__.py
index 6d8fb0ab..81d79c1a 100644
--- a/api/db/__init__.py
+++ b/api/db/__init__.py
@@ -84,6 +84,7 @@ class ParserType(StrEnum):
     NAIVE = "naive"
     PICTURE = "picture"
     ONE = "one"
+    AUDIO = "audio"
 
 
 class FileSource(StrEnum):
@@ -96,4 +97,4 @@ class CanvasType(StrEnum):
     ChatBot = "chatbot"
     DocBot = "docbot"
 
-KNOWLEDGEBASE_FOLDER_NAME=".knowledgebase"
\ No newline at end of file
+KNOWLEDGEBASE_FOLDER_NAME=".knowledgebase"
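
The same three-way parser selection is now repeated in api_app.py, dataset_api.py, and document_app.py (twice there: upload() and web_crawl()). For readers skimming the hunks, here is a condensed sketch of the routing rule; the helper name and the default_parser_id argument are illustrative only and do not exist in this patch:

import re

from api.db import FileType, ParserType


def pick_parser_id(filename, file_type, default_parser_id):
    # Hypothetical helper, not part of this patch: it mirrors the branch order
    # used in upload()/web_crawl(): visual files -> picture parser, aural files
    # -> the new audio parser, slide decks (by extension) -> presentation parser.
    parser_id = default_parser_id
    if file_type == FileType.VISUAL:
        parser_id = ParserType.PICTURE.value
    if file_type == FileType.AURAL:
        parser_id = ParserType.AUDIO.value
    if re.search(r"\.(ppt|pptx|pages)$", filename):
        parser_id = ParserType.PRESENTATION.value
    return parser_id

Because the branches are independent if statements rather than an if/elif chain, the extension check still overrides the type check, matching the pre-existing behavior this patch extends.
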
diff --git a/api/db/init_data.py b/api/db/init_data.py
index 0acfe7b1..a042f450 100644
--- a/api/db/init_data.py
+++ b/api/db/init_data.py
@@ -121,6 +121,8 @@ def init_llm_factory():
     LLMFactoriesService.filter_delete([LLMFactoriesService.model.name == "QAnything"])
     LLMService.filter_delete([LLMService.model.fid == "QAnything"])
     TenantLLMService.filter_update([TenantLLMService.model.llm_factory == "QAnything"], {"llm_factory": "Youdao"})
+    TenantService.filter_update([1 == 1], {
+        "parser_ids": "naive:General,qa:Q&A,resume:Resume,manual:Manual,table:Table,paper:Paper,book:Book,laws:Laws,presentation:Presentation,picture:Picture,one:One,audio:Audio"})
     ## insert openai two embedding models to the current openai user.
     print("Start to insert 2 OpenAI embedding models...")
     tenant_ids = set([row["tenant_id"] for row in TenantLLMService.get_openai_models()])
@@ -143,7 +145,7 @@ def init_llm_factory():
     """
     drop table llm;
     drop table llm_factories;
-    update tenant set parser_ids='naive:General,qa:Q&A,resume:Resume,manual:Manual,table:Table,paper:Paper,book:Book,laws:Laws,presentation:Presentation,picture:Picture,one:One';
+    update tenant set parser_ids='naive:General,qa:Q&A,resume:Resume,manual:Manual,table:Table,paper:Paper,book:Book,laws:Laws,presentation:Presentation,picture:Picture,one:One,audio:Audio';
     alter table knowledgebase modify avatar longtext;
     alter table user modify avatar longtext;
     alter table dialog modify icon longtext;
diff --git a/api/db/services/llm_service.py b/api/db/services/llm_service.py
index a994d610..4c34b7e1 100644
--- a/api/db/services/llm_service.py
+++ b/api/db/services/llm_service.py
@@ -15,7 +15,7 @@
 #
 from api.db.services.user_service import TenantService
 from api.settings import database_logger
-from rag.llm import EmbeddingModel, CvModel, ChatModel, RerankModel
+from rag.llm import EmbeddingModel, CvModel, ChatModel, RerankModel, Seq2txtModel
 from api.db import LLMType
 from api.db.db_models import DB, UserTenant
 from api.db.db_models import LLMFactories, LLM, TenantLLM
@@ -120,6 +120,14 @@ class TenantLLMService(CommonService):
             return ChatModel[model_config["llm_factory"]](
                 model_config["api_key"], model_config["llm_name"], base_url=model_config["api_base"])
 
+        if llm_type == LLMType.SPEECH2TEXT:
+            if model_config["llm_factory"] not in Seq2txtModel:
+                return
+            return Seq2txtModel[model_config["llm_factory"]](
+                model_config["api_key"], model_config["llm_name"], lang,
+                base_url=model_config["api_base"]
+            )
+
     @classmethod
     @DB.connection_context()
     def increase_usage(cls, tenant_id, llm_type, used_tokens, llm_name=None):
@@ -207,6 +215,14 @@ class LLMBundle(object):
                 "Can't update token usage for {}/IMAGE2TEXT".format(self.tenant_id))
         return txt
 
+    def transcription(self, audio):
+        txt, used_tokens = self.mdl.transcription(audio)
+        if not TenantLLMService.increase_usage(
+                self.tenant_id, self.llm_type, used_tokens):
+            database_logger.error(
+                "Can't update token usage for {}/SEQUENCE2TXT".format(self.tenant_id))
+        return txt
+
     def chat(self, system, history, gen_conf):
         txt, used_tokens = self.mdl.chat(system, history, gen_conf)
         if not TenantLLMService.increase_usage(
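
Together, model_instance() and LLMBundle.transcription() pin down the contract a speech-to-text backend has to satisfy: it is constructed with (api_key, llm_name, lang, base_url=...) and its transcription(audio) method returns the transcript plus a token count. The patch does not show rag.llm's Seq2txtModel registry itself; the sketch below is an assumed illustration of that contract, with a made-up backend class and mapping key:

class DummySeq2txt:
    # Hypothetical backend, not part of this patch: it only demonstrates the
    # constructor arguments passed by TenantLLMService.model_instance() for
    # LLMType.SPEECH2TEXT and the (text, used_tokens) return shape expected
    # by LLMBundle.transcription().
    def __init__(self, key, model_name, lang, base_url=None):
        self.key = key
        self.model_name = model_name
        self.lang = lang
        self.base_url = base_url

    def transcription(self, audio):
        # A real backend would send the raw audio bytes to a speech-to-text
        # API here and count the tokens of the transcript it gets back.
        text = "transcript placeholder"
        return text, len(text.split())


# Presumably keyed by llm_factory name, like the other model registries imported above.
Seq2txtModel = {"Dummy": DummySeq2txt}

Keying the registry by factory name is what lets model_instance() bail out (returning None) when a tenant's configured factory has no speech-to-text implementation, which is exactly what the `not in Seq2txtModel` guard does.
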
"naive:General,qa:Q&A,resume:Resume,manual:Manual,table:Table,paper:Paper,book:Book,laws:Laws,presentation:Presentation,picture:Picture,one:One,audio:Audio") # distribution DEPENDENT_DISTRIBUTION = get_base_config("dependent_distribution", False) diff --git a/rag/app/audio.py b/rag/app/audio.py new file mode 100644 index 00000000..397e4e75 --- /dev/null +++ b/rag/app/audio.py @@ -0,0 +1,42 @@ +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +import io +import re +import numpy as np + +from api.db import LLMType +from rag.nlp import rag_tokenizer +from api.db.services.llm_service import LLMBundle +from rag.nlp import tokenize + + +def chunk(filename, binary, tenant_id, lang, callback=None, **kwargs): + doc = { + "docnm_kwd": filename, + "title_tks": rag_tokenizer.tokenize(re.sub(r"\.[a-zA-Z]+$", "", filename)) + } + doc["title_sm_tks"] = rag_tokenizer.fine_grained_tokenize(doc["title_tks"]) + + # is it English + eng = lang.lower() == "english" # is_english(sections) + try: + callback(0.1, "USE Sequence2Txt LLM to transcription the audio") + seq2txt_mdl = LLMBundle(tenant_id, LLMType.SPEECH2TEXT, lang=lang) + ans = seq2txt_mdl.transcription(binary) + callback(0.8, "Sequence2Txt LLM respond: %s ..." % ans[:32]) + tokenize(doc, ans, eng) + return [doc] + except Exception as e: + callback(prog=-1, msg=str(e)) + + return [] diff --git a/rag/app/picture.py b/rag/app/picture.py index 475b6f3b..0474b759 100644 --- a/rag/app/picture.py +++ b/rag/app/picture.py @@ -42,7 +42,7 @@ def chunk(filename, binary, tenant_id, lang, callback=None, **kwargs): callback(0.4, "Use CV LLM to describe the picture.") cv_mdl = LLMBundle(tenant_id, LLMType.IMAGE2TEXT, lang=lang) ans = cv_mdl.describe(binary) - callback(0.8, "CV LLM respoond: %s ..." % ans[:32]) + callback(0.8, "CV LLM respond: %s ..." % ans[:32]) txt += "\n" + ans tokenize(doc, txt, eng) return [doc] diff --git a/rag/svr/task_executor.py b/rag/svr/task_executor.py index a405e5ca..323b689a 100644 --- a/rag/svr/task_executor.py +++ b/rag/svr/task_executor.py @@ -45,7 +45,7 @@ from rag.nlp import search, rag_tokenizer from io import BytesIO import pandas as pd -from rag.app import laws, paper, presentation, manual, qa, table, book, resume, picture, naive, one +from rag.app import laws, paper, presentation, manual, qa, table, book, resume, picture, naive, one, audio from api.db import LLMType, ParserType from api.db.services.document_service import DocumentService @@ -68,6 +68,7 @@ FACTORY = { ParserType.RESUME.value: resume, ParserType.PICTURE.value: picture, ParserType.ONE.value: one, + ParserType.AUDIO.value: audio }