diff --git a/python/cosevoice/Dockerfile b/python/cosevoice/Dockerfile
new file mode 100644
index 000000000..655b62ed0
--- /dev/null
+++ b/python/cosevoice/Dockerfile
@@ -0,0 +1,12 @@
+FROM dockerhub.icu/pytorch/pytorch:2.1.0-cuda11.8-cudnn8-runtime
+ENV DEBIAN_FRONTEND=noninteractive
+
+WORKDIR /opt/CosyVoice
+
+RUN chmod 777 /tmp && sed -i 's@//.*archive.ubuntu.com@//mirrors.ustc.edu.cn@g' /etc/apt/sources.list && apt-get update -y && apt-get -y install git unzip git-lfs
+RUN git lfs install && git clone --recursive https://github.com/FunAudioLLM/CosyVoice.git
+# here we use python==3.10 because we could not find an image that has both python3.8 and torch2.0.1-cu118 installed
+COPY ./requirements.txt CosyVoice
+RUN cd CosyVoice && pip3 install -r requirements.txt -i https://pypi.tuna.tsinghua.edu.cn/simple
+RUN cd CosyVoice/runtime/python/grpc && python3 -m grpc_tools.protoc -I. --python_out=. --grpc_python_out=. cosyvoice.proto
+COPY fastapi/server.py CosyVoice/runtime/python/fastapi/
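+# Example build & run (image name, host port, and model location are assumptions; the image
+# sets no CMD, and the pretrained model must be downloaded or mounted separately — the
+# command below mirrors the comments at the top of server.py):
+#   docker build -t cosyvoice-fastapi .
+#   docker run --gpus all -p 6006:6006 -e MODEL_DIR=pretrained_models/CosyVoice-300M-SFT \
+#       cosyvoice-fastapi fastapi run --port 6006 CosyVoice/runtime/python/fastapi/server.py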
diff --git a/python/cosevoice/fastapi/client.py b/python/cosevoice/fastapi/client.py
new file mode 100644
index 000000000..115d92940
--- /dev/null
+++ b/python/cosevoice/fastapi/client.py
@@ -0,0 +1,78 @@
+import argparse
+import logging
+import requests
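+# Example invocations (the defaults point at http://127.0.0.1:50000; adjust --api_base to your server):
+#   python client.py --mode sft --spk_id 中文男 --tts_wav output.mp3
+#   python client.py --mode zero_shot --prompt_wav ../../../zero_shot_prompt.wav --tts_wav output.mp3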
+
+def saveResponse(path, response):
+    # Open the file in binary write mode
+ with open(path, 'wb') as file:
+        # Write the binary content of the response to the file
+ file.write(response.content)
+
+def main():
+ api = args.api_base
+ if args.mode == 'sft':
+ url = api + "/api/inference/sft"
+ payload={
+ 'tts': args.tts_text,
+ 'role': args.spk_id
+ }
+ response = requests.request("POST", url, data=payload)
+ saveResponse(args.tts_wav, response)
+ elif args.mode == 'zero_shot':
+ url = api + "/api/inference/zero-shot"
+ payload={
+ 'tts': args.tts_text,
+ 'prompt': args.prompt_text
+ }
+ files=[('audio', ('prompt_audio.wav', open(args.prompt_wav,'rb'), 'application/octet-stream'))]
+ response = requests.request("POST", url, data=payload, files=files)
+ saveResponse(args.tts_wav, response)
+ elif args.mode == 'cross_lingual':
+ url = api + "/api/inference/cross-lingual"
+ payload={
+ 'tts': args.tts_text,
+ }
+ files=[('audio', ('prompt_audio.wav', open(args.prompt_wav,'rb'), 'application/octet-stream'))]
+ response = requests.request("POST", url, data=payload, files=files)
+ saveResponse(args.tts_wav, response)
+ else:
+ url = api + "/api/inference/instruct"
+ payload = {
+ 'tts': args.tts_text,
+ 'role': args.spk_id,
+ 'instruct': args.instruct_text
+ }
+ response = requests.request("POST", url, data=payload)
+ saveResponse(args.tts_wav, response)
+    logging.info("Response saved to %s", args.tts_wav)
+
+if __name__ == "__main__":
+ parser = argparse.ArgumentParser()
+ parser.add_argument('--api_base',
+ type=str,
+ default='http://127.0.0.1:50000')
+ parser.add_argument('--mode',
+ default='sft',
+ choices=['sft', 'zero_shot', 'cross_lingual', 'instruct'],
+ help='request mode')
+ parser.add_argument('--tts_text',
+ type=str,
+ default='你好,我是通义千问语音合成大模型,请问有什么可以帮您的吗?')
+ parser.add_argument('--spk_id',
+ type=str,
+ default='中文男')
+ parser.add_argument('--prompt_text',
+ type=str,
+ default='希望你以后能够做的比我还好呦。')
+ parser.add_argument('--prompt_wav',
+ type=str,
+ default='../../../zero_shot_prompt.wav')
+ parser.add_argument('--instruct_text',
+ type=str,
+ default='Theo \'Crimson\', is a fiery, passionate rebel leader. Fights with fervor for justice, but struggles with impulsiveness.')
+ parser.add_argument('--tts_wav',
+ type=str,
+ default='loushiming.mp3')
+    args = parser.parse_args()
+    logging.basicConfig(level=logging.INFO)
+    prompt_sr, target_sr = 16000, 22050
+ main()
diff --git a/python/cosevoice/fastapi/server.py b/python/cosevoice/fastapi/server.py
new file mode 100644
index 000000000..105a7c406
--- /dev/null
+++ b/python/cosevoice/fastapi/server.py
@@ -0,0 +1,136 @@
+# Set inference model
+# export MODEL_DIR=pretrained_models/CosyVoice-300M-Instruct
+# For development
+# fastapi dev --port 6006 server.py
+# For production deployment
+# fastapi run --port 6006 server.py
+
+import os
+import sys
+import io,time
+from fastapi import FastAPI, Request, Response, File, UploadFile, Form, Body
+from fastapi.responses import HTMLResponse
+from fastapi.middleware.cors import CORSMiddleware  # CORS middleware
+from contextlib import asynccontextmanager
+ROOT_DIR = os.path.dirname(os.path.abspath(__file__))
+sys.path.append('{}/../../..'.format(ROOT_DIR))
+sys.path.append('{}/../../../third_party/Matcha-TTS'.format(ROOT_DIR))
+from cosyvoice.cli.cosyvoice import CosyVoice
+from cosyvoice.utils.file_utils import load_wav
+import numpy as np
+import torch
+import torchaudio
+import logging
+from pydantic import BaseModel
+logging.getLogger('matplotlib').setLevel(logging.WARNING)
+
+class LaunchFailed(Exception):
+ pass
+
+@asynccontextmanager
+async def lifespan(app: FastAPI):
+ model_dir = os.getenv("MODEL_DIR", "pretrained_models/CosyVoice-300M-SFT")
+ if model_dir:
+        logging.info("MODEL_DIR is %s", model_dir)
+        app.cosyvoice = CosyVoice(model_dir)
+        # sft usage
+        logging.info("Available speakers %s", app.cosyvoice.list_avaliable_spks())
+    else:
+        raise LaunchFailed("MODEL_DIR environment variable must be set")
+ yield
+
+app = FastAPI(lifespan=lifespan)
+
+# Origins allowed to access the API
+origins = ["*"]  # "*" allows every origin; restrict this to specific hosts if needed.
+app.add_middleware(
+    CORSMiddleware,
+    allow_origins=origins,      # allowed origins
+    allow_credentials=True,
+    allow_methods=["*"],        # allowed HTTP methods (GET, POST, PUT, ...)
+    allow_headers=["*"])        # allowed request headers
+
+def buildResponse(output):
+ buffer = io.BytesIO()
+ torchaudio.save(buffer, output, 22050, format="mp3")
+ buffer.seek(0)
+ return Response(content=buffer.read(-1), media_type="audio/mpeg")
+
+@app.post("/api/inference/sft")
+@app.get("/api/inference/sft")
+async def sft(tts: str = Form(), role: str = Form()):
+ start = time.process_time()
+ output = app.cosyvoice.inference_sft(tts, role)
+ end = time.process_time()
+    logging.info("infer time is %s seconds", end - start)
+ return buildResponse(output['tts_speech'])
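+# Example request (host/port are assumptions; mirrors what client.py sends):
+#   curl -X POST http://127.0.0.1:6006/api/inference/sft \
+#        --data-urlencode 'tts=你好' --data-urlencode 'role=中文男' --output sft.mp3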
+
+class SpeechRequest(BaseModel):
+ model: str
+ input: str
+ voice: str
+
+@app.post("/v1/audio/speech")
+async def speech(request: Request, speech_request: SpeechRequest):
+    # Parse the JSON body of the request
+    data = speech_request.dict()
+
+    start = time.process_time()
+    output = app.cosyvoice.inference_sft(data['input'], data['voice'])
+    end = time.process_time()
+    logging.info("infer time is %s seconds", end - start)
+ return buildResponse(output['tts_speech'])
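+# Example request for the OpenAI-style endpoint above (host/port and model name are
+# assumptions; the "model" field is accepted but not used by this handler):
+#   curl http://127.0.0.1:6006/v1/audio/speech \
+#        -H 'Content-Type: application/json' \
+#        -d '{"model": "cosyvoice-300m-sft", "input": "你好", "voice": "中文男"}' \
+#        --output speech.mp3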
+
+@app.post("/api/inference/zero-shot")
+async def zeroShot(tts: str = Form(), prompt: str = Form(), audio: UploadFile = File()):
+ start = time.process_time()
+ prompt_speech = load_wav(audio.file, 16000)
+ prompt_audio = (prompt_speech.numpy() * (2**15)).astype(np.int16).tobytes()
+ prompt_speech_16k = torch.from_numpy(np.array(np.frombuffer(prompt_audio, dtype=np.int16))).unsqueeze(dim=0)
+ prompt_speech_16k = prompt_speech_16k.float() / (2**15)
+
+ output = app.cosyvoice.inference_zero_shot(tts, prompt, prompt_speech_16k)
+ end = time.process_time()
+    logging.info("infer time is %s seconds", end - start)
+ return buildResponse(output['tts_speech'])
+
+@app.post("/api/inference/cross-lingual")
+async def crossLingual(tts: str = Form(), audio: UploadFile = File()):
+ start = time.process_time()
+ prompt_speech = load_wav(audio.file, 16000)
+ prompt_audio = (prompt_speech.numpy() * (2**15)).astype(np.int16).tobytes()
+ prompt_speech_16k = torch.from_numpy(np.array(np.frombuffer(prompt_audio, dtype=np.int16))).unsqueeze(dim=0)
+ prompt_speech_16k = prompt_speech_16k.float() / (2**15)
+
+ output = app.cosyvoice.inference_cross_lingual(tts, prompt_speech_16k)
+ end = time.process_time()
+    logging.info("infer time is %s seconds", end - start)
+ return buildResponse(output['tts_speech'])
+
+@app.post("/api/inference/instruct")
+@app.get("/api/inference/instruct")
+async def instruct(tts: str = Form(), role: str = Form(), instruct: str = Form()):
+ start = time.process_time()
+ output = app.cosyvoice.inference_instruct(tts, role, instruct)
+ end = time.process_time()
+    logging.info("infer time is %s seconds", end - start)
+ return buildResponse(output['tts_speech'])
+
+@app.get("/api/roles")
+async def roles():
+ return {"roles": app.cosyvoice.list_avaliable_spks()}
+
+@app.get("/", response_class=HTMLResponse)
+async def root():
+ return """
+
+
+
+
+ split_with_space: true
+
+frontend: WavFrontend
+frontend_conf:
+ fs: 16000
+ window: hamming
+ n_mels: 80
+ frame_length: 25
+ frame_shift: 10
+ lfr_m: 7
+ lfr_n: 6
+ cmvn_file: null
+
+
+dataset: SenseVoiceCTCDataset
+dataset_conf:
+ index_ds: IndexDSJsonl
+ batch_sampler: EspnetStyleBatchSampler
+ data_split_num: 32
+ batch_type: token
+ batch_size: 14000
+ max_token_length: 2000
+ min_token_length: 60
+ max_source_length: 2000
+ min_source_length: 60
+ max_target_length: 200
+ min_target_length: 0
+ shuffle: true
+ num_workers: 4
+ sos: ${model_conf.sos}
+ eos: ${model_conf.eos}
+ IndexDSJsonl: IndexDSJsonl
+ retry: 20
+
+train_conf:
+ accum_grad: 1
+ grad_clip: 5
+ max_epoch: 20
+ keep_nbest_models: 10
+ avg_nbest_model: 10
+ log_interval: 100
+ resume: true
+ validate_interval: 10000
+ save_checkpoint_interval: 10000
+
+optim: adamw
+optim_conf:
+ lr: 0.00002
+scheduler: warmuplr
+scheduler_conf:
+ warmup_steps: 25000
+
+specaug: SpecAugLFR
+specaug_conf:
+ apply_time_warp: false
+ time_warp_window: 5
+ time_warp_mode: bicubic
+ apply_freq_mask: true
+ freq_mask_width_range:
+ - 0
+ - 30
+ lfr_rate: 6
+ num_freq_mask: 1
+ apply_time_mask: true
+ time_mask_width_range:
+ - 0
+ - 12
+ num_time_mask: 1
diff --git a/python/sensevoice/app/iic/SenseVoiceSmall/configuration.json b/python/sensevoice/app/iic/SenseVoiceSmall/configuration.json
new file mode 100644
index 000000000..264d8171d
--- /dev/null
+++ b/python/sensevoice/app/iic/SenseVoiceSmall/configuration.json
@@ -0,0 +1,14 @@
+{
+ "framework": "pytorch",
+ "task" : "auto-speech-recognition",
+ "model": {"type" : "funasr"},
+ "pipeline": {"type":"funasr-pipeline"},
+ "model_name_in_hub": {
+ "ms":"",
+ "hf":""},
+ "file_path_metas": {
+ "init_param":"model.pt",
+ "config":"config.yaml",
+ "tokenizer_conf": {"bpemodel": "chn_jpn_yue_eng_ko_spectok.bpe.model"},
+ "frontend_conf":{"cmvn_file": "am.mvn"}}
+}
\ No newline at end of file
diff --git a/python/sensevoice/app/iic/SenseVoiceSmall/fig/aed_figure.png b/python/sensevoice/app/iic/SenseVoiceSmall/fig/aed_figure.png
new file mode 100644
index 000000000..995cedf3b
Binary files /dev/null and b/python/sensevoice/app/iic/SenseVoiceSmall/fig/aed_figure.png differ
diff --git a/python/sensevoice/app/iic/SenseVoiceSmall/fig/asr_results.png b/python/sensevoice/app/iic/SenseVoiceSmall/fig/asr_results.png
new file mode 100644
index 000000000..d962606e9
Binary files /dev/null and b/python/sensevoice/app/iic/SenseVoiceSmall/fig/asr_results.png differ
diff --git a/python/sensevoice/app/iic/SenseVoiceSmall/fig/inference.png b/python/sensevoice/app/iic/SenseVoiceSmall/fig/inference.png
new file mode 100644
index 000000000..0af617430
Binary files /dev/null and b/python/sensevoice/app/iic/SenseVoiceSmall/fig/inference.png differ
diff --git a/python/sensevoice/app/iic/SenseVoiceSmall/fig/sensevoice.png b/python/sensevoice/app/iic/SenseVoiceSmall/fig/sensevoice.png
new file mode 100644
index 000000000..8b8786b75
Binary files /dev/null and b/python/sensevoice/app/iic/SenseVoiceSmall/fig/sensevoice.png differ
diff --git a/python/sensevoice/app/iic/SenseVoiceSmall/fig/ser_figure.png b/python/sensevoice/app/iic/SenseVoiceSmall/fig/ser_figure.png
new file mode 100644
index 000000000..e3348900a
Binary files /dev/null and b/python/sensevoice/app/iic/SenseVoiceSmall/fig/ser_figure.png differ
diff --git a/python/sensevoice/app/iic/SenseVoiceSmall/fig/ser_table.png b/python/sensevoice/app/iic/SenseVoiceSmall/fig/ser_table.png
new file mode 100644
index 000000000..da432dfc3
Binary files /dev/null and b/python/sensevoice/app/iic/SenseVoiceSmall/fig/ser_table.png differ
diff --git a/python/sensevoice/app/iic/speech_fsmn_vad_zh-cn-16k-common-pytorch/.mdl b/python/sensevoice/app/iic/speech_fsmn_vad_zh-cn-16k-common-pytorch/.mdl
new file mode 100644
index 000000000..dbe7c19ac
Binary files /dev/null and b/python/sensevoice/app/iic/speech_fsmn_vad_zh-cn-16k-common-pytorch/.mdl differ
diff --git a/python/sensevoice/app/iic/speech_fsmn_vad_zh-cn-16k-common-pytorch/.msc b/python/sensevoice/app/iic/speech_fsmn_vad_zh-cn-16k-common-pytorch/.msc
new file mode 100644
index 000000000..0223aee8f
Binary files /dev/null and b/python/sensevoice/app/iic/speech_fsmn_vad_zh-cn-16k-common-pytorch/.msc differ
diff --git a/python/sensevoice/app/iic/speech_fsmn_vad_zh-cn-16k-common-pytorch/.mv b/python/sensevoice/app/iic/speech_fsmn_vad_zh-cn-16k-common-pytorch/.mv
new file mode 100644
index 000000000..ed0eee2ab
--- /dev/null
+++ b/python/sensevoice/app/iic/speech_fsmn_vad_zh-cn-16k-common-pytorch/.mv
@@ -0,0 +1 @@
+Revision:master,CreatedAt:1707184291
\ No newline at end of file
diff --git a/python/sensevoice/app/iic/speech_fsmn_vad_zh-cn-16k-common-pytorch/README.md b/python/sensevoice/app/iic/speech_fsmn_vad_zh-cn-16k-common-pytorch/README.md
new file mode 100644
index 000000000..1953b3602
--- /dev/null
+++ b/python/sensevoice/app/iic/speech_fsmn_vad_zh-cn-16k-common-pytorch/README.md
@@ -0,0 +1,296 @@
+---
+tasks:
+- voice-activity-detection
+domain:
+- audio
+model-type:
+- VAD model
+frameworks:
+- pytorch
+backbone:
+- fsmn
+metrics:
+- f1_score
+license: Apache License 2.0
+language:
+- cn
+tags:
+- FunASR
+- FSMN
+- Alibaba
+- Online
+datasets:
+ train:
+ - 20,000 hour industrial Mandarin task
+ test:
+ - 20,000 hour industrial Mandarin task
+widgets:
+ - task: voice-activity-detection
+ model_revision: v2.0.4
+ inputs:
+ - type: audio
+ name: input
+ title: 音频
+ examples:
+ - name: 1
+ title: 示例1
+ inputs:
+ - name: input
+ data: git://example/vad_example.wav
+ inferencespec:
+      cpu: 1 # number of CPUs
+ memory: 4096
+---
+
+# FSMN-Monophone VAD Model Introduction
+
+[//]: # (FSMN-Monophone VAD 模型)
+
+## Highlights
+- 16 kHz general-purpose Chinese VAD model: detects the start and end timestamps of valid speech within long audio.
+  - Used in the [Paraformer-large long-audio model](https://www.modelscope.cn/models/damo/speech_paraformer-large-vad-punc_asr_nat-zh-cn-16k-common-vocab8404-pytorch/summary) scenario
+  - Built on the [FunASR framework](https://github.com/alibaba-damo-academy/FunASR); ASR, VAD, and [Chinese punctuation](https://www.modelscope.cn/models/damo/punc_ct-transformer_zh-cn-common-vocab272727-pytorch/summary) can be combined freely
+  - Detects the start and end timestamps of valid speech segments in audio data
+
+## [About the FunASR open-source project](https://github.com/alibaba-damo-academy/FunASR)
+[FunASR](https://github.com/alibaba-damo-academy/FunASR) aims to build a bridge between academic research and industrial applications of speech recognition. By releasing the training and fine-tuning recipes of industrial-grade speech recognition models, it lets researchers and developers study and productionize ASR models more conveniently and helps the speech recognition ecosystem grow. Make speech recognition fun!
+
+[**GitHub repository**](https://github.com/alibaba-damo-academy/FunASR)
+| [**What's new**](https://github.com/alibaba-damo-academy/FunASR#whats-new)
+| [**Installation**](https://github.com/alibaba-damo-academy/FunASR#installation)
+| [**Service deployment**](https://www.funasr.com)
+| [**Model zoo**](https://github.com/alibaba-damo-academy/FunASR/tree/main/model_zoo)
+| [**Contact us**](https://github.com/alibaba-damo-academy/FunASR#contact)
+
+
+## Model overview
+
+FSMN-Monophone VAD is an efficient voice activity detection model proposed by the speech team of DAMO Academy. It detects the start/end timestamps of valid speech in the input audio; the detected speech segments are then passed to the recognition engine, reducing recognition errors caused by non-speech audio.
+
+![FSMN-Monophone VAD model structure](fig/struct.png)
+
+The FSMN-Monophone VAD model structure is shown in the figure above. At the architecture level, FSMN models contextual information, trains and decodes quickly, and has controllable latency; the network structure and the number of right-context frames were tuned for the model-size and low-latency requirements of VAD. At the modeling-unit level, the "speech" class carries a lot of acoustic variety, so a single class has limited learning capacity; it is therefore split into monophones. Finer modeling units avoid parameter averaging, strengthen abstraction, and give better discrimination.
+
+## Inference with ModelScope
+
+- The following audio input formats are supported for inference:
+  - wav file path, e.g.: data/test/audios/vad_example.wav
+  - wav file URL, e.g.: https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_audio/vad_example.wav
+  - raw wav data as bytes, e.g. bytes read directly from a file or captured from a microphone (see the example further below)
+  - already-decoded audio, e.g.: audio, rate = soundfile.read("vad_example_zh.wav"), of type numpy.ndarray or torch.Tensor
+  - a wav.scp file, which must follow the format below:
+
+```sh
+cat wav.scp
+vad_example1 data/test/audios/vad_example1.wav
+vad_example2 data/test/audios/vad_example2.wav
+...
+```
+
+- If the input is a wav file URL, the API can be called as in the following example:
+
+```python
+from modelscope.pipelines import pipeline
+from modelscope.utils.constant import Tasks
+
+inference_pipeline = pipeline(
+ task=Tasks.voice_activity_detection,
+ model='iic/speech_fsmn_vad_zh-cn-16k-common-pytorch',
+ model_revision="v2.0.4",
+)
+
+segments_result = inference_pipeline(input='https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_audio/vad_example.wav')
+print(segments_result)
+```
+
+- If the input audio is in pcm format, pass the sampling-rate parameter fs when calling the API, for example:
+
+```python
+segments_result = inference_pipeline(input='https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_audio/vad_example.pcm', fs=16000)
+```
+
+- If the input is a wav.scp file (note: the file name must end in .scp), you can add the output_dir parameter to write the recognition results to files, for example:
+
+```python
+inference_pipeline(input="wav.scp", output_dir='./output_dir')
+```
+The recognition output directory is structured as follows:
+
+```sh
+tree output_dir/
+output_dir/
+└── 1best_recog
+ └── text
+
+1 directory, 1 file
+```
+text: result file with the detected speech start/end timestamps (unit: ms)
+
+- If the input is already-decoded audio, the API can be called as in the following example:
+
+```python
+import soundfile
+
+waveform, sample_rate = soundfile.read("vad_example_zh.wav")
+segments_result = inference_pipeline(input=waveform)
+print(segments_result)
+```
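+
+- If the input is raw wav bytes (read from a file or captured from a microphone), the call is analogous; a minimal sketch reusing the inference_pipeline created above (the file path is just an example):
+
+```python
+# Read a wav file as raw bytes; a recorded microphone buffer can be passed the same way.
+with open("data/test/audios/vad_example.wav", "rb") as f:
+    audio_bytes = f.read()
+
+segments_result = inference_pipeline(input=audio_bytes)
+print(segments_result)
+```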
+
+- Commonly tuned VAD parameters (see the vad.yaml / config.yaml file, and the snippet below):
+  - max_end_silence_time: how long continuous trailing silence must last before the end point is declared; range 500 ms to 6000 ms, default 800 ms (too small a value tends to truncate speech early).
+  - speech_noise_thres: a frame is judged as speech when the speech score minus the noise score exceeds this value; range (-1, 1)
+    - the closer to -1, the more likely noise is misclassified as speech (higher false-alarm rate, FA)
+    - the closer to +1, the more likely speech is misclassified as noise (higher miss rate, Pmiss)
+    - in practice this value is balanced against the model's results on a long-audio test set
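+
+The defaults this model ships with can be inspected directly in the config.yaml stored next to model.pt; a minimal sketch (the path assumes the directory layout used in this repository):
+
+```python
+import yaml  # requires pyyaml
+
+# Read the VAD thresholds shipped with the model.
+with open("iic/speech_fsmn_vad_zh-cn-16k-common-pytorch/config.yaml") as f:
+    model_conf = yaml.safe_load(f)["model_conf"]
+
+print(model_conf["max_end_silence_time"])  # 800 (ms)
+print(model_conf["speech_noise_thres"])    # 0.6
+```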
+
+
+
+
+## Inference with FunASR
+
+Below is a quick-start tutorial. Test audio: ([Chinese](https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_audio/vad_example.wav), [English](https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_audio/asr_example_en.wav))
+
+### Command-line usage
+Run in a terminal:
+
+```shell
+funasr ++model=paraformer-zh ++vad_model="fsmn-vad" ++punc_model="ct-punc" ++input=vad_example.wav
+```
+
+Note: single audio files are supported as well as file lists in kaldi-style wav.scp format: `wav_id wav_path`
+
+### Python examples
+#### Non-streaming speech recognition
+```python
+from funasr import AutoModel
+# paraformer-zh is a multi-functional asr model
+# use vad, punc, spk or not as you need
+model = AutoModel(model="paraformer-zh", model_revision="v2.0.4",
+ vad_model="fsmn-vad", vad_model_revision="v2.0.4",
+ punc_model="ct-punc-c", punc_model_revision="v2.0.4",
+ # spk_model="cam++", spk_model_revision="v2.0.2",
+ )
+res = model.generate(input=f"{model.model_path}/example/asr_example.wav",
+ batch_size_s=300,
+ hotword='魔搭')
+print(res)
+```
+Note: `model_hub` selects the model hub: `ms` downloads from ModelScope, `hf` downloads from Hugging Face.
+
+#### Streaming speech recognition
+
+```python
+from funasr import AutoModel
+
+chunk_size = [0, 10, 5] #[0, 10, 5] 600ms, [0, 8, 4] 480ms
+encoder_chunk_look_back = 4 #number of chunks to lookback for encoder self-attention
+decoder_chunk_look_back = 1 #number of encoder chunks to lookback for decoder cross-attention
+
+model = AutoModel(model="paraformer-zh-streaming", model_revision="v2.0.4")
+
+import soundfile
+import os
+
+wav_file = os.path.join(model.model_path, "example/asr_example.wav")
+speech, sample_rate = soundfile.read(wav_file)
+chunk_stride = chunk_size[1] * 960 # 600ms
+
+cache = {}
+total_chunk_num = int(len((speech)-1)/chunk_stride+1)
+for i in range(total_chunk_num):
+ speech_chunk = speech[i*chunk_stride:(i+1)*chunk_stride]
+ is_final = i == total_chunk_num - 1
+ res = model.generate(input=speech_chunk, cache=cache, is_final=is_final, chunk_size=chunk_size, encoder_chunk_look_back=encoder_chunk_look_back, decoder_chunk_look_back=decoder_chunk_look_back)
+ print(res)
+```
+
+Note: `chunk_size` configures the streaming latency. `[0,10,5]` means text is emitted in real time with a granularity of `10*60=600ms`, with `5*60=300ms` of lookahead. Each inference call takes `600ms` of input (`16000*0.6=9600` samples) and outputs the corresponding text; for the last audio chunk, set `is_final=True` to force the final characters out.
+
+#### Voice activity detection (non-streaming)
+```python
+from funasr import AutoModel
+
+model = AutoModel(model="fsmn-vad", model_revision="v2.0.4")
+
+wav_file = f"{model.model_path}/example/asr_example.wav"
+res = model.generate(input=wav_file)
+print(res)
+```
+
+#### Voice activity detection (streaming)
+```python
+from funasr import AutoModel
+
+chunk_size = 200 # ms
+model = AutoModel(model="fsmn-vad", model_revision="v2.0.4")
+
+import soundfile
+
+wav_file = f"{model.model_path}/example/vad_example.wav"
+speech, sample_rate = soundfile.read(wav_file)
+chunk_stride = int(chunk_size * sample_rate / 1000)
+
+cache = {}
+total_chunk_num = int(len((speech)-1)/chunk_stride+1)
+for i in range(total_chunk_num):
+ speech_chunk = speech[i*chunk_stride:(i+1)*chunk_stride]
+ is_final = i == total_chunk_num - 1
+ res = model.generate(input=speech_chunk, cache=cache, is_final=is_final, chunk_size=chunk_size)
+ if len(res[0]["value"]):
+ print(res)
+```
+
+#### Punctuation restoration
+```python
+from funasr import AutoModel
+
+model = AutoModel(model="ct-punc", model_revision="v2.0.4")
+
+res = model.generate(input="那今天的会就到这里吧 happy new year 明年见")
+print(res)
+```
+
+#### Timestamp prediction
+```python
+from funasr import AutoModel
+
+model = AutoModel(model="fa-zh", model_revision="v2.0.4")
+
+wav_file = f"{model.model_path}/example/asr_example.wav"
+text_file = f"{model.model_path}/example/text.txt"
+res = model.generate(input=(wav_file, text_file), data_type=("sound", "text"))
+print(res)
+```
+
+More detailed usage: [examples](https://github.com/alibaba-damo-academy/FunASR/tree/main/examples/industrial_data_pretraining)
+
+
+## Fine-tuning
+
+Detailed usage: [examples](https://github.com/alibaba-damo-academy/FunASR/tree/main/examples/industrial_data_pretraining)
+
+
+
+
+
+## Usage and scope
+
+Supported platforms
+- Runs on Linux-x86_64, macOS, and Windows.
+
+Usage
+- Direct inference: run the model directly on long audio to obtain the start/end timestamps (unit: ms) of the valid speech segments.
+
+## Related papers and citation
+
+```BibTeX
+@inproceedings{zhang2018deep,
+ title={Deep-FSMN for large vocabulary continuous speech recognition},
+ author={Zhang, Shiliang and Lei, Ming and Yan, Zhijie and Dai, Lirong},
+ booktitle={2018 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)},
+ pages={5869--5873},
+ year={2018},
+ organization={IEEE}
+}
+```
diff --git a/python/sensevoice/app/iic/speech_fsmn_vad_zh-cn-16k-common-pytorch/am.mvn b/python/sensevoice/app/iic/speech_fsmn_vad_zh-cn-16k-common-pytorch/am.mvn
new file mode 100644
index 000000000..59f64ee6d
--- /dev/null
+++ b/python/sensevoice/app/iic/speech_fsmn_vad_zh-cn-16k-common-pytorch/am.mvn
@@ -0,0 +1,8 @@
+
+ 400 400
+[ 0 ]
+ 400 400
+ 0 [ -8.311879 -8.600912 -9.615928 -10.43595 -11.21292 -11.88333 -12.36243 -12.63706 -12.8818 -12.83066 -12.89103 -12.95666 -13.19763 -13.40598 -13.49113 -13.5546 -13.55639 -13.51915 -13.68284 -13.53289 -13.42107 -13.65519 -13.50713 -13.75251 -13.76715 -13.87408 -13.73109 -13.70412 -13.56073 -13.53488 -13.54895 -13.56228 -13.59408 -13.62047 -13.64198 -13.66109 -13.62669 -13.58297 -13.57387 -13.4739 -13.53063 -13.48348 -13.61047 -13.64716 -13.71546 -13.79184 -13.90614 -14.03098 -14.18205 -14.35881 -14.48419 -14.60172 -14.70591 -14.83362 -14.92122 -15.00622 -15.05122 -15.03119 -14.99028 -14.92302 -14.86927 -14.82691 -14.7972 -14.76909 -14.71356 -14.61277 -14.51696 -14.42252 -14.36405 -14.30451 -14.23161 -14.19851 -14.16633 -14.15649 -14.10504 -13.99518 -13.79562 -13.3996 -12.7767 -11.71208 -8.311879 -8.600912 -9.615928 -10.43595 -11.21292 -11.88333 -12.36243 -12.63706 -12.8818 -12.83066 -12.89103 -12.95666 -13.19763 -13.40598 -13.49113 -13.5546 -13.55639 -13.51915 -13.68284 -13.53289 -13.42107 -13.65519 -13.50713 -13.75251 -13.76715 -13.87408 -13.73109 -13.70412 -13.56073 -13.53488 -13.54895 -13.56228 -13.59408 -13.62047 -13.64198 -13.66109 -13.62669 -13.58297 -13.57387 -13.4739 -13.53063 -13.48348 -13.61047 -13.64716 -13.71546 -13.79184 -13.90614 -14.03098 -14.18205 -14.35881 -14.48419 -14.60172 -14.70591 -14.83362 -14.92122 -15.00622 -15.05122 -15.03119 -14.99028 -14.92302 -14.86927 -14.82691 -14.7972 -14.76909 -14.71356 -14.61277 -14.51696 -14.42252 -14.36405 -14.30451 -14.23161 -14.19851 -14.16633 -14.15649 -14.10504 -13.99518 -13.79562 -13.3996 -12.7767 -11.71208 -8.311879 -8.600912 -9.615928 -10.43595 -11.21292 -11.88333 -12.36243 -12.63706 -12.8818 -12.83066 -12.89103 -12.95666 -13.19763 -13.40598 -13.49113 -13.5546 -13.55639 -13.51915 -13.68284 -13.53289 -13.42107 -13.65519 -13.50713 -13.75251 -13.76715 -13.87408 -13.73109 -13.70412 -13.56073 -13.53488 -13.54895 -13.56228 -13.59408 -13.62047 -13.64198 -13.66109 -13.62669 -13.58297 -13.57387 -13.4739 -13.53063 -13.48348 -13.61047 -13.64716 -13.71546 -13.79184 -13.90614 -14.03098 -14.18205 -14.35881 -14.48419 -14.60172 -14.70591 -14.83362 -14.92122 -15.00622 -15.05122 -15.03119 -14.99028 -14.92302 -14.86927 -14.82691 -14.7972 -14.76909 -14.71356 -14.61277 -14.51696 -14.42252 -14.36405 -14.30451 -14.23161 -14.19851 -14.16633 -14.15649 -14.10504 -13.99518 -13.79562 -13.3996 -12.7767 -11.71208 -8.311879 -8.600912 -9.615928 -10.43595 -11.21292 -11.88333 -12.36243 -12.63706 -12.8818 -12.83066 -12.89103 -12.95666 -13.19763 -13.40598 -13.49113 -13.5546 -13.55639 -13.51915 -13.68284 -13.53289 -13.42107 -13.65519 -13.50713 -13.75251 -13.76715 -13.87408 -13.73109 -13.70412 -13.56073 -13.53488 -13.54895 -13.56228 -13.59408 -13.62047 -13.64198 -13.66109 -13.62669 -13.58297 -13.57387 -13.4739 -13.53063 -13.48348 -13.61047 -13.64716 -13.71546 -13.79184 -13.90614 -14.03098 -14.18205 -14.35881 -14.48419 -14.60172 -14.70591 -14.83362 -14.92122 -15.00622 -15.05122 -15.03119 -14.99028 -14.92302 -14.86927 -14.82691 -14.7972 -14.76909 -14.71356 -14.61277 -14.51696 -14.42252 -14.36405 -14.30451 -14.23161 -14.19851 -14.16633 -14.15649 -14.10504 -13.99518 -13.79562 -13.3996 -12.7767 -11.71208 -8.311879 -8.600912 -9.615928 -10.43595 -11.21292 -11.88333 -12.36243 -12.63706 -12.8818 -12.83066 -12.89103 -12.95666 -13.19763 -13.40598 -13.49113 -13.5546 -13.55639 -13.51915 -13.68284 -13.53289 -13.42107 -13.65519 -13.50713 -13.75251 -13.76715 -13.87408 -13.73109 -13.70412 -13.56073 -13.53488 -13.54895 -13.56228 -13.59408 -13.62047 -13.64198 -13.66109 -13.62669 
-13.58297 -13.57387 -13.4739 -13.53063 -13.48348 -13.61047 -13.64716 -13.71546 -13.79184 -13.90614 -14.03098 -14.18205 -14.35881 -14.48419 -14.60172 -14.70591 -14.83362 -14.92122 -15.00622 -15.05122 -15.03119 -14.99028 -14.92302 -14.86927 -14.82691 -14.7972 -14.76909 -14.71356 -14.61277 -14.51696 -14.42252 -14.36405 -14.30451 -14.23161 -14.19851 -14.16633 -14.15649 -14.10504 -13.99518 -13.79562 -13.3996 -12.7767 -11.71208 ]
+ 400 400
+ 0 [ 0.155775 0.154484 0.1527379 0.1518718 0.1506028 0.1489256 0.147067 0.1447061 0.1436307 0.1443568 0.1451849 0.1455157 0.1452821 0.1445717 0.1439195 0.1435867 0.1436018 0.1438781 0.1442086 0.1448844 0.1454756 0.145663 0.146268 0.1467386 0.1472724 0.147664 0.1480913 0.1483739 0.1488841 0.1493636 0.1497088 0.1500379 0.1502916 0.1505389 0.1506787 0.1507102 0.1505992 0.1505445 0.1505938 0.1508133 0.1509569 0.1512396 0.1514625 0.1516195 0.1516156 0.1515561 0.1514966 0.1513976 0.1512612 0.151076 0.1510596 0.1510431 0.151077 0.1511168 0.1511917 0.151023 0.1508045 0.1505885 0.1503493 0.1502373 0.1501726 0.1500762 0.1500065 0.1499782 0.150057 0.1502658 0.150469 0.1505335 0.1505505 0.1505328 0.1504275 0.1502438 0.1499674 0.1497118 0.1494661 0.1493102 0.1493681 0.1495501 0.1499738 0.1509654 0.155775 0.154484 0.1527379 0.1518718 0.1506028 0.1489256 0.147067 0.1447061 0.1436307 0.1443568 0.1451849 0.1455157 0.1452821 0.1445717 0.1439195 0.1435867 0.1436018 0.1438781 0.1442086 0.1448844 0.1454756 0.145663 0.146268 0.1467386 0.1472724 0.147664 0.1480913 0.1483739 0.1488841 0.1493636 0.1497088 0.1500379 0.1502916 0.1505389 0.1506787 0.1507102 0.1505992 0.1505445 0.1505938 0.1508133 0.1509569 0.1512396 0.1514625 0.1516195 0.1516156 0.1515561 0.1514966 0.1513976 0.1512612 0.151076 0.1510596 0.1510431 0.151077 0.1511168 0.1511917 0.151023 0.1508045 0.1505885 0.1503493 0.1502373 0.1501726 0.1500762 0.1500065 0.1499782 0.150057 0.1502658 0.150469 0.1505335 0.1505505 0.1505328 0.1504275 0.1502438 0.1499674 0.1497118 0.1494661 0.1493102 0.1493681 0.1495501 0.1499738 0.1509654 0.155775 0.154484 0.1527379 0.1518718 0.1506028 0.1489256 0.147067 0.1447061 0.1436307 0.1443568 0.1451849 0.1455157 0.1452821 0.1445717 0.1439195 0.1435867 0.1436018 0.1438781 0.1442086 0.1448844 0.1454756 0.145663 0.146268 0.1467386 0.1472724 0.147664 0.1480913 0.1483739 0.1488841 0.1493636 0.1497088 0.1500379 0.1502916 0.1505389 0.1506787 0.1507102 0.1505992 0.1505445 0.1505938 0.1508133 0.1509569 0.1512396 0.1514625 0.1516195 0.1516156 0.1515561 0.1514966 0.1513976 0.1512612 0.151076 0.1510596 0.1510431 0.151077 0.1511168 0.1511917 0.151023 0.1508045 0.1505885 0.1503493 0.1502373 0.1501726 0.1500762 0.1500065 0.1499782 0.150057 0.1502658 0.150469 0.1505335 0.1505505 0.1505328 0.1504275 0.1502438 0.1499674 0.1497118 0.1494661 0.1493102 0.1493681 0.1495501 0.1499738 0.1509654 0.155775 0.154484 0.1527379 0.1518718 0.1506028 0.1489256 0.147067 0.1447061 0.1436307 0.1443568 0.1451849 0.1455157 0.1452821 0.1445717 0.1439195 0.1435867 0.1436018 0.1438781 0.1442086 0.1448844 0.1454756 0.145663 0.146268 0.1467386 0.1472724 0.147664 0.1480913 0.1483739 0.1488841 0.1493636 0.1497088 0.1500379 0.1502916 0.1505389 0.1506787 0.1507102 0.1505992 0.1505445 0.1505938 0.1508133 0.1509569 0.1512396 0.1514625 0.1516195 0.1516156 0.1515561 0.1514966 0.1513976 0.1512612 0.151076 0.1510596 0.1510431 0.151077 0.1511168 0.1511917 0.151023 0.1508045 0.1505885 0.1503493 0.1502373 0.1501726 0.1500762 0.1500065 0.1499782 0.150057 0.1502658 0.150469 0.1505335 0.1505505 0.1505328 0.1504275 0.1502438 0.1499674 0.1497118 0.1494661 0.1493102 0.1493681 0.1495501 0.1499738 0.1509654 0.155775 0.154484 0.1527379 0.1518718 0.1506028 0.1489256 0.147067 0.1447061 0.1436307 0.1443568 0.1451849 0.1455157 0.1452821 0.1445717 0.1439195 0.1435867 0.1436018 0.1438781 0.1442086 0.1448844 0.1454756 0.145663 0.146268 0.1467386 0.1472724 0.147664 0.1480913 0.1483739 0.1488841 0.1493636 0.1497088 0.1500379 0.1502916 0.1505389 0.1506787 0.1507102 0.1505992 0.1505445 0.1505938 
0.1508133 0.1509569 0.1512396 0.1514625 0.1516195 0.1516156 0.1515561 0.1514966 0.1513976 0.1512612 0.151076 0.1510596 0.1510431 0.151077 0.1511168 0.1511917 0.151023 0.1508045 0.1505885 0.1503493 0.1502373 0.1501726 0.1500762 0.1500065 0.1499782 0.150057 0.1502658 0.150469 0.1505335 0.1505505 0.1505328 0.1504275 0.1502438 0.1499674 0.1497118 0.1494661 0.1493102 0.1493681 0.1495501 0.1499738 0.1509654 ]
+
\ No newline at end of file
diff --git a/python/sensevoice/app/iic/speech_fsmn_vad_zh-cn-16k-common-pytorch/config.yaml b/python/sensevoice/app/iic/speech_fsmn_vad_zh-cn-16k-common-pytorch/config.yaml
new file mode 100644
index 000000000..4664aef95
--- /dev/null
+++ b/python/sensevoice/app/iic/speech_fsmn_vad_zh-cn-16k-common-pytorch/config.yaml
@@ -0,0 +1,56 @@
+frontend: WavFrontendOnline
+frontend_conf:
+ fs: 16000
+ window: hamming
+ n_mels: 80
+ frame_length: 25
+ frame_shift: 10
+ dither: 0.0
+ lfr_m: 5
+ lfr_n: 1
+
+model: FsmnVADStreaming
+model_conf:
+ sample_rate: 16000
+ detect_mode: 1
+ snr_mode: 0
+ max_end_silence_time: 800
+ max_start_silence_time: 3000
+ do_start_point_detection: True
+ do_end_point_detection: True
+ window_size_ms: 200
+ sil_to_speech_time_thres: 150
+ speech_to_sil_time_thres: 150
+ speech_2_noise_ratio: 1.0
+ do_extend: 1
+ lookback_time_start_point: 200
+ lookahead_time_end_point: 100
+ max_single_segment_time: 60000
+ snr_thres: -100.0
+ noise_frame_num_used_for_snr: 100
+ decibel_thres: -100.0
+ speech_noise_thres: 0.6
+ fe_prior_thres: 0.0001
+ silence_pdf_num: 1
+ sil_pdf_ids: [0]
+ speech_noise_thresh_low: -0.1
+ speech_noise_thresh_high: 0.3
+ output_frame_probs: False
+ frame_in_ms: 10
+ frame_length_ms: 25
+
+encoder: FSMN
+encoder_conf:
+ input_dim: 400
+ input_affine_dim: 140
+ fsmn_layers: 4
+ linear_dim: 250
+ proj_dim: 128
+ lorder: 20
+ rorder: 0
+ lstride: 1
+ rstride: 0
+ output_affine_dim: 140
+ output_dim: 248
+
+
diff --git a/python/sensevoice/app/iic/speech_fsmn_vad_zh-cn-16k-common-pytorch/configuration.json b/python/sensevoice/app/iic/speech_fsmn_vad_zh-cn-16k-common-pytorch/configuration.json
new file mode 100644
index 000000000..c5f8b9e15
--- /dev/null
+++ b/python/sensevoice/app/iic/speech_fsmn_vad_zh-cn-16k-common-pytorch/configuration.json
@@ -0,0 +1,13 @@
+{
+ "framework": "pytorch",
+ "task" : "voice-activity-detection",
+ "pipeline": {"type":"funasr-pipeline"},
+ "model": {"type" : "funasr"},
+ "file_path_metas": {
+ "init_param":"model.pt",
+ "config":"config.yaml",
+ "frontend_conf":{"cmvn_file": "am.mvn"}},
+ "model_name_in_hub": {
+ "ms":"iic/speech_fsmn_vad_zh-cn-16k-common-pytorch",
+ "hf":""}
+}
\ No newline at end of file
diff --git a/python/sensevoice/app/iic/speech_fsmn_vad_zh-cn-16k-common-pytorch/example/vad_example.wav b/python/sensevoice/app/iic/speech_fsmn_vad_zh-cn-16k-common-pytorch/example/vad_example.wav
new file mode 100644
index 000000000..2ebc8c776
Binary files /dev/null and b/python/sensevoice/app/iic/speech_fsmn_vad_zh-cn-16k-common-pytorch/example/vad_example.wav differ
diff --git a/python/sensevoice/app/iic/speech_fsmn_vad_zh-cn-16k-common-pytorch/fig/struct.png b/python/sensevoice/app/iic/speech_fsmn_vad_zh-cn-16k-common-pytorch/fig/struct.png
new file mode 100644
index 000000000..1d102a97c
Binary files /dev/null and b/python/sensevoice/app/iic/speech_fsmn_vad_zh-cn-16k-common-pytorch/fig/struct.png differ
diff --git a/python/sensevoice/app/iic/speech_fsmn_vad_zh-cn-16k-common-pytorch/model.pt b/python/sensevoice/app/iic/speech_fsmn_vad_zh-cn-16k-common-pytorch/model.pt
new file mode 100644
index 000000000..dc8c365c9
Binary files /dev/null and b/python/sensevoice/app/iic/speech_fsmn_vad_zh-cn-16k-common-pytorch/model.pt differ
diff --git a/python/sensevoice/app/main.py b/python/sensevoice/app/main.py
new file mode 100644
index 000000000..c1c8c2cbe
--- /dev/null
+++ b/python/sensevoice/app/main.py
@@ -0,0 +1,95 @@
+# -*- coding: utf-8 -*-
+
+from fastapi import FastAPI, File, UploadFile, HTTPException
+from pydantic import BaseModel, HttpUrl, ValidationError
+from typing import List
+from funasr import AutoModel
+from funasr.utils.postprocess_utils import rich_transcription_postprocess
+import uuid
+import os
+
+app = FastAPI()
+
+
+# Request validation model
+class UrlInput(BaseModel):
+ audio_urls: List[HttpUrl]
+
+
+# Model loading
+model_dir = "iic/SenseVoiceSmall"
+
+# Fast prediction (no VAD model)
+# model = AutoModel(model=model_dir, trust_remote_code=True, device="cpu")
+
+# Accurate prediction (with VAD segmentation)
+model = AutoModel(
+ model=model_dir,
+ vad_model="fsmn-vad",
+ vad_kwargs={"max_single_segment_time": 30000},
+ trust_remote_code=True,
+ device="cuda:0",
+)
+
+
+@app.post("/upload-url/")
+async def upload_url(data: UrlInput):
+ try:
+ results = []
+ for url in data.audio_urls:
+ res = model.generate(
+                input=str(url),  # convert the URL to a string
+ cache={},
+ language=language,
+ use_itn=False,
+ batch_size=batch_size,
+ )
+ data = rich_transcription_postprocess(res[0]["text"])
+ results.append(data)
+ return {"message": "URL input processed successfully", "results": results}
+ except ValidationError as e:
+ raise HTTPException(status_code=400, detail=e.errors())
+ except Exception as e:
+ raise HTTPException(status_code=500, detail=str(e))
+
+
+@app.post("/v1/audio/transcriptions")
+async def upload_file(file: UploadFile = File(...)):
+ try:
+ #for file in files:
+ if not file.content_type.startswith("audio/"):
+ raise HTTPException(status_code=400, detail="Invalid file type")
+
+        # Read the file as bytes
+ #audio_bytes = await file.read()
+
+ unique_filename = str(uuid.uuid4()) + ".mp3"
+
+        # Save the uploaded audio file
+ audio_file_path = os.path.join("/tmp", unique_filename)
+ with open(audio_file_path, "wb") as audio_file:
+ audio_file.write(await file.read())
+
+        # Pass the saved file path to the model
+ res = model.generate(
+ input=audio_file_path,
+ cache={},
+ language=language,
+ use_itn=True,
+ batch_size=batch_size,
+            merge_vad=True,
+ merge_length_s=15,
+ )
+ data = rich_transcription_postprocess(res[0]["text"])
+ return {"message": "File inputs processed successfully", "text": data}
+ except Exception as e:
+ raise HTTPException(status_code=500, detail=str(e))
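+# Example request (host/port from the uvicorn call below; the explicit type hint matters
+# because the endpoint rejects non-audio/* content types):
+#   curl -F "file=@sample.mp3;type=audio/mpeg" http://127.0.0.1:8000/v1/audio/transcriptions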
+
+
+if __name__ == "__main__":
+ batch_size = 60
+ language = "auto"
+
+ import uvicorn
+
+ uvicorn.run(app, host="0.0.0.0", port=8000)
diff --git a/python/sensevoice/app/model.py b/python/sensevoice/app/model.py
new file mode 100644
index 000000000..177660a9f
--- /dev/null
+++ b/python/sensevoice/app/model.py
@@ -0,0 +1,895 @@
+
+import time
+import torch
+from torch import nn
+import torch.nn.functional as F
+from typing import Iterable, Optional
+
+from funasr.register import tables
+from funasr.models.ctc.ctc import CTC
+from funasr.utils.datadir_writer import DatadirWriter
+from funasr.models.paraformer.search import Hypothesis
+from funasr.train_utils.device_funcs import force_gatherable
+from funasr.losses.label_smoothing_loss import LabelSmoothingLoss
+from funasr.metrics.compute_acc import compute_accuracy, th_accuracy
+from funasr.utils.load_utils import load_audio_text_image_video, extract_fbank
+
+
+class SinusoidalPositionEncoder(torch.nn.Module):
+ """ """
+
+ def __int__(self, d_model=80, dropout_rate=0.1):
+ pass
+
+ def encode(
+ self, positions: torch.Tensor = None, depth: int = None, dtype: torch.dtype = torch.float32
+ ):
+ batch_size = positions.size(0)
+ positions = positions.type(dtype)
+ device = positions.device
+ log_timescale_increment = torch.log(torch.tensor([10000], dtype=dtype, device=device)) / (
+ depth / 2 - 1
+ )
+ inv_timescales = torch.exp(
+ torch.arange(depth / 2, device=device).type(dtype) * (-log_timescale_increment)
+ )
+ inv_timescales = torch.reshape(inv_timescales, [batch_size, -1])
+ scaled_time = torch.reshape(positions, [1, -1, 1]) * torch.reshape(
+ inv_timescales, [1, 1, -1]
+ )
+ encoding = torch.cat([torch.sin(scaled_time), torch.cos(scaled_time)], dim=2)
+ return encoding.type(dtype)
+
+ def forward(self, x):
+ batch_size, timesteps, input_dim = x.size()
+ positions = torch.arange(1, timesteps + 1, device=x.device)[None, :]
+ position_encoding = self.encode(positions, input_dim, x.dtype).to(x.device)
+
+ return x + position_encoding
+
+
+class PositionwiseFeedForward(torch.nn.Module):
+ """Positionwise feed forward layer.
+
+ Args:
+        idim (int): Input dimension.
+ hidden_units (int): The number of hidden units.
+ dropout_rate (float): Dropout rate.
+
+ """
+
+ def __init__(self, idim, hidden_units, dropout_rate, activation=torch.nn.ReLU()):
+ """Construct an PositionwiseFeedForward object."""
+ super(PositionwiseFeedForward, self).__init__()
+ self.w_1 = torch.nn.Linear(idim, hidden_units)
+ self.w_2 = torch.nn.Linear(hidden_units, idim)
+ self.dropout = torch.nn.Dropout(dropout_rate)
+ self.activation = activation
+
+ def forward(self, x):
+ """Forward function."""
+ return self.w_2(self.dropout(self.activation(self.w_1(x))))
+
+
+class MultiHeadedAttentionSANM(nn.Module):
+ """Multi-Head Attention layer.
+
+ Args:
+ n_head (int): The number of heads.
+ n_feat (int): The number of features.
+ dropout_rate (float): Dropout rate.
+
+ """
+
+ def __init__(
+ self,
+ n_head,
+ in_feat,
+ n_feat,
+ dropout_rate,
+ kernel_size,
+ sanm_shfit=0,
+ lora_list=None,
+ lora_rank=8,
+ lora_alpha=16,
+ lora_dropout=0.1,
+ ):
+ """Construct an MultiHeadedAttention object."""
+ super().__init__()
+ assert n_feat % n_head == 0
+ # We assume d_v always equals d_k
+ self.d_k = n_feat // n_head
+ self.h = n_head
+ # self.linear_q = nn.Linear(n_feat, n_feat)
+ # self.linear_k = nn.Linear(n_feat, n_feat)
+ # self.linear_v = nn.Linear(n_feat, n_feat)
+
+ self.linear_out = nn.Linear(n_feat, n_feat)
+ self.linear_q_k_v = nn.Linear(in_feat, n_feat * 3)
+ self.attn = None
+ self.dropout = nn.Dropout(p=dropout_rate)
+
+ self.fsmn_block = nn.Conv1d(
+ n_feat, n_feat, kernel_size, stride=1, padding=0, groups=n_feat, bias=False
+ )
+ # padding
+ left_padding = (kernel_size - 1) // 2
+ if sanm_shfit > 0:
+ left_padding = left_padding + sanm_shfit
+ right_padding = kernel_size - 1 - left_padding
+ self.pad_fn = nn.ConstantPad1d((left_padding, right_padding), 0.0)
+
+ def forward_fsmn(self, inputs, mask, mask_shfit_chunk=None):
+ b, t, d = inputs.size()
+ if mask is not None:
+ mask = torch.reshape(mask, (b, -1, 1))
+ if mask_shfit_chunk is not None:
+ mask = mask * mask_shfit_chunk
+ inputs = inputs * mask
+
+ x = inputs.transpose(1, 2)
+ x = self.pad_fn(x)
+ x = self.fsmn_block(x)
+ x = x.transpose(1, 2)
+ x += inputs
+ x = self.dropout(x)
+ if mask is not None:
+ x = x * mask
+ return x
+
+ def forward_qkv(self, x):
+ """Transform query, key and value.
+
+ Args:
+ query (torch.Tensor): Query tensor (#batch, time1, size).
+ key (torch.Tensor): Key tensor (#batch, time2, size).
+ value (torch.Tensor): Value tensor (#batch, time2, size).
+
+ Returns:
+ torch.Tensor: Transformed query tensor (#batch, n_head, time1, d_k).
+ torch.Tensor: Transformed key tensor (#batch, n_head, time2, d_k).
+ torch.Tensor: Transformed value tensor (#batch, n_head, time2, d_k).
+
+ """
+ b, t, d = x.size()
+ q_k_v = self.linear_q_k_v(x)
+ q, k, v = torch.split(q_k_v, int(self.h * self.d_k), dim=-1)
+ q_h = torch.reshape(q, (b, t, self.h, self.d_k)).transpose(
+ 1, 2
+ ) # (batch, head, time1, d_k)
+ k_h = torch.reshape(k, (b, t, self.h, self.d_k)).transpose(
+ 1, 2
+ ) # (batch, head, time2, d_k)
+ v_h = torch.reshape(v, (b, t, self.h, self.d_k)).transpose(
+ 1, 2
+ ) # (batch, head, time2, d_k)
+
+ return q_h, k_h, v_h, v
+
+ def forward_attention(self, value, scores, mask, mask_att_chunk_encoder=None):
+ """Compute attention context vector.
+
+ Args:
+ value (torch.Tensor): Transformed value (#batch, n_head, time2, d_k).
+ scores (torch.Tensor): Attention score (#batch, n_head, time1, time2).
+ mask (torch.Tensor): Mask (#batch, 1, time2) or (#batch, time1, time2).
+
+ Returns:
+ torch.Tensor: Transformed value (#batch, time1, d_model)
+ weighted by the attention score (#batch, time1, time2).
+
+ """
+ n_batch = value.size(0)
+ if mask is not None:
+ if mask_att_chunk_encoder is not None:
+ mask = mask * mask_att_chunk_encoder
+
+ mask = mask.unsqueeze(1).eq(0) # (batch, 1, *, time2)
+
+ min_value = -float(
+ "inf"
+ ) # float(numpy.finfo(torch.tensor(0, dtype=scores.dtype).numpy().dtype).min)
+ scores = scores.masked_fill(mask, min_value)
+ self.attn = torch.softmax(scores, dim=-1).masked_fill(
+ mask, 0.0
+ ) # (batch, head, time1, time2)
+ else:
+ self.attn = torch.softmax(scores, dim=-1) # (batch, head, time1, time2)
+
+ p_attn = self.dropout(self.attn)
+ x = torch.matmul(p_attn, value) # (batch, head, time1, d_k)
+ x = (
+ x.transpose(1, 2).contiguous().view(n_batch, -1, self.h * self.d_k)
+ ) # (batch, time1, d_model)
+
+ return self.linear_out(x) # (batch, time1, d_model)
+
+ def forward(self, x, mask, mask_shfit_chunk=None, mask_att_chunk_encoder=None):
+ """Compute scaled dot product attention.
+
+ Args:
+ query (torch.Tensor): Query tensor (#batch, time1, size).
+ key (torch.Tensor): Key tensor (#batch, time2, size).
+ value (torch.Tensor): Value tensor (#batch, time2, size).
+ mask (torch.Tensor): Mask tensor (#batch, 1, time2) or
+ (#batch, time1, time2).
+
+ Returns:
+ torch.Tensor: Output tensor (#batch, time1, d_model).
+
+ """
+ q_h, k_h, v_h, v = self.forward_qkv(x)
+ fsmn_memory = self.forward_fsmn(v, mask, mask_shfit_chunk)
+ q_h = q_h * self.d_k ** (-0.5)
+ scores = torch.matmul(q_h, k_h.transpose(-2, -1))
+ att_outs = self.forward_attention(v_h, scores, mask, mask_att_chunk_encoder)
+ return att_outs + fsmn_memory
+
+ def forward_chunk(self, x, cache=None, chunk_size=None, look_back=0):
+ """Compute scaled dot product attention.
+
+ Args:
+ query (torch.Tensor): Query tensor (#batch, time1, size).
+ key (torch.Tensor): Key tensor (#batch, time2, size).
+ value (torch.Tensor): Value tensor (#batch, time2, size).
+ mask (torch.Tensor): Mask tensor (#batch, 1, time2) or
+ (#batch, time1, time2).
+
+ Returns:
+ torch.Tensor: Output tensor (#batch, time1, d_model).
+
+ """
+ q_h, k_h, v_h, v = self.forward_qkv(x)
+ if chunk_size is not None and look_back > 0 or look_back == -1:
+ if cache is not None:
+ k_h_stride = k_h[:, :, : -(chunk_size[2]), :]
+ v_h_stride = v_h[:, :, : -(chunk_size[2]), :]
+ k_h = torch.cat((cache["k"], k_h), dim=2)
+ v_h = torch.cat((cache["v"], v_h), dim=2)
+
+ cache["k"] = torch.cat((cache["k"], k_h_stride), dim=2)
+ cache["v"] = torch.cat((cache["v"], v_h_stride), dim=2)
+ if look_back != -1:
+ cache["k"] = cache["k"][:, :, -(look_back * chunk_size[1]) :, :]
+ cache["v"] = cache["v"][:, :, -(look_back * chunk_size[1]) :, :]
+ else:
+ cache_tmp = {
+ "k": k_h[:, :, : -(chunk_size[2]), :],
+ "v": v_h[:, :, : -(chunk_size[2]), :],
+ }
+ cache = cache_tmp
+ fsmn_memory = self.forward_fsmn(v, None)
+ q_h = q_h * self.d_k ** (-0.5)
+ scores = torch.matmul(q_h, k_h.transpose(-2, -1))
+ att_outs = self.forward_attention(v_h, scores, None)
+ return att_outs + fsmn_memory, cache
+
+
+class LayerNorm(nn.LayerNorm):
+ def __init__(self, *args, **kwargs):
+ super().__init__(*args, **kwargs)
+
+ def forward(self, input):
+ output = F.layer_norm(
+ input.float(),
+ self.normalized_shape,
+ self.weight.float() if self.weight is not None else None,
+ self.bias.float() if self.bias is not None else None,
+ self.eps,
+ )
+ return output.type_as(input)
+
+
+def sequence_mask(lengths, maxlen=None, dtype=torch.float32, device=None):
+ if maxlen is None:
+ maxlen = lengths.max()
+ row_vector = torch.arange(0, maxlen, 1).to(lengths.device)
+ matrix = torch.unsqueeze(lengths, dim=-1)
+ mask = row_vector < matrix
+ mask = mask.detach()
+
+ return mask.type(dtype).to(device) if device is not None else mask.type(dtype)
+
+
+class EncoderLayerSANM(nn.Module):
+ def __init__(
+ self,
+ in_size,
+ size,
+ self_attn,
+ feed_forward,
+ dropout_rate,
+ normalize_before=True,
+ concat_after=False,
+ stochastic_depth_rate=0.0,
+ ):
+ """Construct an EncoderLayer object."""
+ super(EncoderLayerSANM, self).__init__()
+ self.self_attn = self_attn
+ self.feed_forward = feed_forward
+ self.norm1 = LayerNorm(in_size)
+ self.norm2 = LayerNorm(size)
+ self.dropout = nn.Dropout(dropout_rate)
+ self.in_size = in_size
+ self.size = size
+ self.normalize_before = normalize_before
+ self.concat_after = concat_after
+ if self.concat_after:
+ self.concat_linear = nn.Linear(size + size, size)
+ self.stochastic_depth_rate = stochastic_depth_rate
+ self.dropout_rate = dropout_rate
+
+ def forward(self, x, mask, cache=None, mask_shfit_chunk=None, mask_att_chunk_encoder=None):
+ """Compute encoded features.
+
+ Args:
+ x_input (torch.Tensor): Input tensor (#batch, time, size).
+ mask (torch.Tensor): Mask tensor for the input (#batch, time).
+ cache (torch.Tensor): Cache tensor of the input (#batch, time - 1, size).
+
+ Returns:
+ torch.Tensor: Output tensor (#batch, time, size).
+ torch.Tensor: Mask tensor (#batch, time).
+
+ """
+ skip_layer = False
+ # with stochastic depth, residual connection `x + f(x)` becomes
+ # `x <- x + 1 / (1 - p) * f(x)` at training time.
+ stoch_layer_coeff = 1.0
+ if self.training and self.stochastic_depth_rate > 0:
+ skip_layer = torch.rand(1).item() < self.stochastic_depth_rate
+ stoch_layer_coeff = 1.0 / (1 - self.stochastic_depth_rate)
+
+ if skip_layer:
+ if cache is not None:
+ x = torch.cat([cache, x], dim=1)
+ return x, mask
+
+ residual = x
+ if self.normalize_before:
+ x = self.norm1(x)
+
+ if self.concat_after:
+ x_concat = torch.cat(
+ (
+ x,
+ self.self_attn(
+ x,
+ mask,
+ mask_shfit_chunk=mask_shfit_chunk,
+ mask_att_chunk_encoder=mask_att_chunk_encoder,
+ ),
+ ),
+ dim=-1,
+ )
+ if self.in_size == self.size:
+ x = residual + stoch_layer_coeff * self.concat_linear(x_concat)
+ else:
+ x = stoch_layer_coeff * self.concat_linear(x_concat)
+ else:
+ if self.in_size == self.size:
+ x = residual + stoch_layer_coeff * self.dropout(
+ self.self_attn(
+ x,
+ mask,
+ mask_shfit_chunk=mask_shfit_chunk,
+ mask_att_chunk_encoder=mask_att_chunk_encoder,
+ )
+ )
+ else:
+ x = stoch_layer_coeff * self.dropout(
+ self.self_attn(
+ x,
+ mask,
+ mask_shfit_chunk=mask_shfit_chunk,
+ mask_att_chunk_encoder=mask_att_chunk_encoder,
+ )
+ )
+ if not self.normalize_before:
+ x = self.norm1(x)
+
+ residual = x
+ if self.normalize_before:
+ x = self.norm2(x)
+ x = residual + stoch_layer_coeff * self.dropout(self.feed_forward(x))
+ if not self.normalize_before:
+ x = self.norm2(x)
+
+ return x, mask, cache, mask_shfit_chunk, mask_att_chunk_encoder
+
+ def forward_chunk(self, x, cache=None, chunk_size=None, look_back=0):
+ """Compute encoded features.
+
+ Args:
+ x_input (torch.Tensor): Input tensor (#batch, time, size).
+ mask (torch.Tensor): Mask tensor for the input (#batch, time).
+ cache (torch.Tensor): Cache tensor of the input (#batch, time - 1, size).
+
+ Returns:
+ torch.Tensor: Output tensor (#batch, time, size).
+ torch.Tensor: Mask tensor (#batch, time).
+
+ """
+
+ residual = x
+ if self.normalize_before:
+ x = self.norm1(x)
+
+ if self.in_size == self.size:
+ attn, cache = self.self_attn.forward_chunk(x, cache, chunk_size, look_back)
+ x = residual + attn
+ else:
+ x, cache = self.self_attn.forward_chunk(x, cache, chunk_size, look_back)
+
+ if not self.normalize_before:
+ x = self.norm1(x)
+
+ residual = x
+ if self.normalize_before:
+ x = self.norm2(x)
+ x = residual + self.feed_forward(x)
+ if not self.normalize_before:
+ x = self.norm2(x)
+
+ return x, cache
+
+
+@tables.register("encoder_classes", "SenseVoiceEncoderSmall")
+class SenseVoiceEncoderSmall(nn.Module):
+ """
+ Author: Speech Lab of DAMO Academy, Alibaba Group
+ SCAMA: Streaming chunk-aware multihead attention for online end-to-end speech recognition
+ https://arxiv.org/abs/2006.01713
+ """
+
+ def __init__(
+ self,
+ input_size: int,
+ output_size: int = 256,
+ attention_heads: int = 4,
+ linear_units: int = 2048,
+ num_blocks: int = 6,
+ tp_blocks: int = 0,
+ dropout_rate: float = 0.1,
+ positional_dropout_rate: float = 0.1,
+ attention_dropout_rate: float = 0.0,
+ stochastic_depth_rate: float = 0.0,
+ input_layer: Optional[str] = "conv2d",
+ pos_enc_class=SinusoidalPositionEncoder,
+ normalize_before: bool = True,
+ concat_after: bool = False,
+ positionwise_layer_type: str = "linear",
+ positionwise_conv_kernel_size: int = 1,
+ padding_idx: int = -1,
+ kernel_size: int = 11,
+ sanm_shfit: int = 0,
+ selfattention_layer_type: str = "sanm",
+ **kwargs,
+ ):
+ super().__init__()
+ self._output_size = output_size
+
+ self.embed = SinusoidalPositionEncoder()
+
+ self.normalize_before = normalize_before
+
+ positionwise_layer = PositionwiseFeedForward
+ positionwise_layer_args = (
+ output_size,
+ linear_units,
+ dropout_rate,
+ )
+
+ encoder_selfattn_layer = MultiHeadedAttentionSANM
+ encoder_selfattn_layer_args0 = (
+ attention_heads,
+ input_size,
+ output_size,
+ attention_dropout_rate,
+ kernel_size,
+ sanm_shfit,
+ )
+ encoder_selfattn_layer_args = (
+ attention_heads,
+ output_size,
+ output_size,
+ attention_dropout_rate,
+ kernel_size,
+ sanm_shfit,
+ )
+
+ self.encoders0 = nn.ModuleList(
+ [
+ EncoderLayerSANM(
+ input_size,
+ output_size,
+ encoder_selfattn_layer(*encoder_selfattn_layer_args0),
+ positionwise_layer(*positionwise_layer_args),
+ dropout_rate,
+ )
+ for i in range(1)
+ ]
+ )
+ self.encoders = nn.ModuleList(
+ [
+ EncoderLayerSANM(
+ output_size,
+ output_size,
+ encoder_selfattn_layer(*encoder_selfattn_layer_args),
+ positionwise_layer(*positionwise_layer_args),
+ dropout_rate,
+ )
+ for i in range(num_blocks - 1)
+ ]
+ )
+
+ self.tp_encoders = nn.ModuleList(
+ [
+ EncoderLayerSANM(
+ output_size,
+ output_size,
+ encoder_selfattn_layer(*encoder_selfattn_layer_args),
+ positionwise_layer(*positionwise_layer_args),
+ dropout_rate,
+ )
+ for i in range(tp_blocks)
+ ]
+ )
+
+ self.after_norm = LayerNorm(output_size)
+
+ self.tp_norm = LayerNorm(output_size)
+
+ def output_size(self) -> int:
+ return self._output_size
+
+ def forward(
+ self,
+ xs_pad: torch.Tensor,
+ ilens: torch.Tensor,
+ ):
+ """Embed positions in tensor."""
+ masks = sequence_mask(ilens, device=ilens.device)[:, None, :]
+
+ xs_pad *= self.output_size() ** 0.5
+
+ xs_pad = self.embed(xs_pad)
+
+ # forward encoder1
+ for layer_idx, encoder_layer in enumerate(self.encoders0):
+ encoder_outs = encoder_layer(xs_pad, masks)
+ xs_pad, masks = encoder_outs[0], encoder_outs[1]
+
+ for layer_idx, encoder_layer in enumerate(self.encoders):
+ encoder_outs = encoder_layer(xs_pad, masks)
+ xs_pad, masks = encoder_outs[0], encoder_outs[1]
+
+ xs_pad = self.after_norm(xs_pad)
+
+ # forward encoder2
+ olens = masks.squeeze(1).sum(1).int()
+
+ for layer_idx, encoder_layer in enumerate(self.tp_encoders):
+ encoder_outs = encoder_layer(xs_pad, masks)
+ xs_pad, masks = encoder_outs[0], encoder_outs[1]
+
+ xs_pad = self.tp_norm(xs_pad)
+ return xs_pad, olens
+
+
+@tables.register("model_classes", "SenseVoiceSmall")
+class SenseVoiceSmall(nn.Module):
+ """CTC-attention hybrid Encoder-Decoder model"""
+
+ def __init__(
+ self,
+ specaug: str = None,
+ specaug_conf: dict = None,
+ normalize: str = None,
+ normalize_conf: dict = None,
+ encoder: str = None,
+ encoder_conf: dict = None,
+ ctc_conf: dict = None,
+ input_size: int = 80,
+ vocab_size: int = -1,
+ ignore_id: int = -1,
+ blank_id: int = 0,
+ sos: int = 1,
+ eos: int = 2,
+ length_normalized_loss: bool = False,
+ **kwargs,
+ ):
+
+ super().__init__()
+
+ if specaug is not None:
+ specaug_class = tables.specaug_classes.get(specaug)
+ specaug = specaug_class(**specaug_conf)
+ if normalize is not None:
+ normalize_class = tables.normalize_classes.get(normalize)
+ normalize = normalize_class(**normalize_conf)
+ encoder_class = tables.encoder_classes.get(encoder)
+ encoder = encoder_class(input_size=input_size, **encoder_conf)
+ encoder_output_size = encoder.output_size()
+
+ if ctc_conf is None:
+ ctc_conf = {}
+ ctc = CTC(odim=vocab_size, encoder_output_size=encoder_output_size, **ctc_conf)
+
+ self.blank_id = blank_id
+ self.sos = sos if sos is not None else vocab_size - 1
+ self.eos = eos if eos is not None else vocab_size - 1
+ self.vocab_size = vocab_size
+ self.ignore_id = ignore_id
+ self.specaug = specaug
+ self.normalize = normalize
+ self.encoder = encoder
+ self.error_calculator = None
+
+ self.ctc = ctc
+
+ self.length_normalized_loss = length_normalized_loss
+ self.encoder_output_size = encoder_output_size
+
+ self.lid_dict = {"auto": 0, "zh": 3, "en": 4, "yue": 7, "ja": 11, "ko": 12, "nospeech": 13}
+ self.lid_int_dict = {24884: 3, 24885: 4, 24888: 7, 24892: 11, 24896: 12, 24992: 13}
+ self.textnorm_dict = {"withitn": 14, "woitn": 15}
+ self.textnorm_int_dict = {25016: 14, 25017: 15}
+ self.embed = torch.nn.Embedding(7 + len(self.lid_dict) + len(self.textnorm_dict), input_size)
+ self.emo_dict = {"unk": 25009, "happy": 25001, "sad": 25002, "angry": 25003, "neutral": 25004}
+
+ self.criterion_att = LabelSmoothingLoss(
+ size=self.vocab_size,
+ padding_idx=self.ignore_id,
+ smoothing=kwargs.get("lsm_weight", 0.0),
+ normalize_length=self.length_normalized_loss,
+ )
+
+ @staticmethod
+ def from_pretrained(model:str=None, **kwargs):
+ from funasr import AutoModel
+ model, kwargs = AutoModel.build_model(model=model, trust_remote_code=True, **kwargs)
+
+ return model, kwargs
+
+ def forward(
+ self,
+ speech: torch.Tensor,
+ speech_lengths: torch.Tensor,
+ text: torch.Tensor,
+ text_lengths: torch.Tensor,
+ **kwargs,
+ ):
+ """Encoder + Decoder + Calc loss
+ Args:
+ speech: (Batch, Length, ...)
+ speech_lengths: (Batch, )
+ text: (Batch, Length)
+ text_lengths: (Batch,)
+ """
+ # import pdb;
+ # pdb.set_trace()
+ if len(text_lengths.size()) > 1:
+ text_lengths = text_lengths[:, 0]
+ if len(speech_lengths.size()) > 1:
+ speech_lengths = speech_lengths[:, 0]
+
+ batch_size = speech.shape[0]
+
+ # 1. Encoder
+ encoder_out, encoder_out_lens = self.encode(speech, speech_lengths, text)
+
+ loss_ctc, cer_ctc = None, None
+ loss_rich, acc_rich = None, None
+ stats = dict()
+
+ loss_ctc, cer_ctc = self._calc_ctc_loss(
+ encoder_out[:, 4:, :], encoder_out_lens - 4, text[:, 4:], text_lengths - 4
+ )
+
+ loss_rich, acc_rich = self._calc_rich_ce_loss(
+ encoder_out[:, :4, :], text[:, :4]
+ )
+
+ loss = loss_ctc
+ # Collect total loss stats
+ stats["loss"] = torch.clone(loss.detach()) if loss_ctc is not None else None
+ stats["loss_rich"] = torch.clone(loss_rich.detach()) if loss_rich is not None else None
+ stats["acc_rich"] = acc_rich
+
+ # force_gatherable: to-device and to-tensor if scalar for DataParallel
+ if self.length_normalized_loss:
+ batch_size = int((text_lengths + 1).sum())
+ loss, stats, weight = force_gatherable((loss, stats, batch_size), loss.device)
+ return loss, stats, weight
+
+ def encode(
+ self,
+ speech: torch.Tensor,
+ speech_lengths: torch.Tensor,
+ text: torch.Tensor,
+ **kwargs,
+ ):
+ """Frontend + Encoder. Note that this method is used by asr_inference.py
+ Args:
+ speech: (Batch, Length, ...)
+ speech_lengths: (Batch, )
+ ind: int
+ """
+
+ # Data augmentation
+ if self.specaug is not None and self.training:
+ speech, speech_lengths = self.specaug(speech, speech_lengths)
+
+ # Normalization for feature: e.g. Global-CMVN, Utterance-CMVN
+ if self.normalize is not None:
+ speech, speech_lengths = self.normalize(speech, speech_lengths)
+
+
+ lids = torch.LongTensor([[self.lid_int_dict[int(lid)] if torch.rand(1) > 0.2 and int(lid) in self.lid_int_dict else 0 ] for lid in text[:, 0]]).to(speech.device)
+ language_query = self.embed(lids)
+
+ styles = torch.LongTensor([[self.textnorm_int_dict[int(style)]] for style in text[:, 3]]).to(speech.device)
+ style_query = self.embed(styles)
+ speech = torch.cat((style_query, speech), dim=1)
+ speech_lengths += 1
+
+ event_emo_query = self.embed(torch.LongTensor([[1, 2]]).to(speech.device)).repeat(speech.size(0), 1, 1)
+ input_query = torch.cat((language_query, event_emo_query), dim=1)
+ speech = torch.cat((input_query, speech), dim=1)
+ speech_lengths += 3
+
+ encoder_out, encoder_out_lens = self.encoder(speech, speech_lengths)
+
+ return encoder_out, encoder_out_lens
+
+ def _calc_ctc_loss(
+ self,
+ encoder_out: torch.Tensor,
+ encoder_out_lens: torch.Tensor,
+ ys_pad: torch.Tensor,
+ ys_pad_lens: torch.Tensor,
+ ):
+ # Calc CTC loss
+ loss_ctc = self.ctc(encoder_out, encoder_out_lens, ys_pad, ys_pad_lens)
+
+ # Calc CER using CTC
+ cer_ctc = None
+ if not self.training and self.error_calculator is not None:
+ ys_hat = self.ctc.argmax(encoder_out).data
+ cer_ctc = self.error_calculator(ys_hat.cpu(), ys_pad.cpu(), is_ctc=True)
+ return loss_ctc, cer_ctc
+
+ def _calc_rich_ce_loss(
+ self,
+ encoder_out: torch.Tensor,
+ ys_pad: torch.Tensor,
+ ):
+ decoder_out = self.ctc.ctc_lo(encoder_out)
+ # 2. Compute attention loss
+ loss_rich = self.criterion_att(decoder_out, ys_pad.contiguous())
+ acc_rich = th_accuracy(
+ decoder_out.view(-1, self.vocab_size),
+ ys_pad.contiguous(),
+ ignore_label=self.ignore_id,
+ )
+
+ return loss_rich, acc_rich
+
+
+ def inference(
+ self,
+ data_in,
+ data_lengths=None,
+ key: list = ["wav_file_tmp_name"],
+ tokenizer=None,
+ frontend=None,
+ **kwargs,
+ ):
+
+
+ meta_data = {}
+ if (
+ isinstance(data_in, torch.Tensor) and kwargs.get("data_type", "sound") == "fbank"
+ ): # fbank
+ speech, speech_lengths = data_in, data_lengths
+ if len(speech.shape) < 3:
+ speech = speech[None, :, :]
+ if speech_lengths is None:
+ speech_lengths = speech.shape[1]
+ else:
+ # extract fbank feats
+ time1 = time.perf_counter()
+ audio_sample_list = load_audio_text_image_video(
+ data_in,
+ fs=frontend.fs,
+ audio_fs=kwargs.get("fs", 16000),
+ data_type=kwargs.get("data_type", "sound"),
+ tokenizer=tokenizer,
+ )
+ time2 = time.perf_counter()
+ meta_data["load_data"] = f"{time2 - time1:0.3f}"
+ speech, speech_lengths = extract_fbank(
+ audio_sample_list, data_type=kwargs.get("data_type", "sound"), frontend=frontend
+ )
+ time3 = time.perf_counter()
+ meta_data["extract_feat"] = f"{time3 - time2:0.3f}"
+ meta_data["batch_data_time"] = (
+ speech_lengths.sum().item() * frontend.frame_shift * frontend.lfr_n / 1000
+ )
+
+ speech = speech.to(device=kwargs["device"])
+ speech_lengths = speech_lengths.to(device=kwargs["device"])
+
+ language = kwargs.get("language", "auto")
+ language_query = self.embed(
+ torch.LongTensor(
+ [[self.lid_dict[language] if language in self.lid_dict else 0]]
+ ).to(speech.device)
+ ).repeat(speech.size(0), 1, 1)
+
+ use_itn = kwargs.get("use_itn", False)
+ textnorm = kwargs.get("text_norm", None)
+ if textnorm is None:
+ textnorm = "withitn" if use_itn else "woitn"
+ textnorm_query = self.embed(
+ torch.LongTensor([[self.textnorm_dict[textnorm]]]).to(speech.device)
+ ).repeat(speech.size(0), 1, 1)
+ speech = torch.cat((textnorm_query, speech), dim=1)
+ speech_lengths += 1
+
+ event_emo_query = self.embed(torch.LongTensor([[1, 2]]).to(speech.device)).repeat(
+ speech.size(0), 1, 1
+ )
+ input_query = torch.cat((language_query, event_emo_query), dim=1)
+ speech = torch.cat((input_query, speech), dim=1)
+ speech_lengths += 3
+
+ # Encoder
+ encoder_out, encoder_out_lens = self.encoder(speech, speech_lengths)
+ if isinstance(encoder_out, tuple):
+ encoder_out = encoder_out[0]
+
+        # c. Pass the encoder output to CTC greedy decoding
+ ctc_logits = self.ctc.log_softmax(encoder_out)
+ if kwargs.get("ban_emo_unk", False):
+ ctc_logits[:, :, self.emo_dict["unk"]] = -float("inf")
+
+ results = []
+ b, n, d = encoder_out.size()
+ if isinstance(key[0], (list, tuple)):
+ key = key[0]
+ if len(key) < b:
+ key = key * b
+ for i in range(b):
+ x = ctc_logits[i, : encoder_out_lens[i].item(), :]
+ yseq = x.argmax(dim=-1)
+ yseq = torch.unique_consecutive(yseq, dim=-1)
+
+ ibest_writer = None
+ if kwargs.get("output_dir") is not None:
+ if not hasattr(self, "writer"):
+ self.writer = DatadirWriter(kwargs.get("output_dir"))
+ ibest_writer = self.writer[f"1best_recog"]
+
+ mask = yseq != self.blank_id
+ token_int = yseq[mask].tolist()
+
+ # Change integer-ids to tokens
+ text = tokenizer.decode(token_int)
+
+ result_i = {"key": key[i], "text": text}
+ results.append(result_i)
+
+ if ibest_writer is not None:
+ ibest_writer["text"][key[i]] = text
+
+ return results, meta_data
+
+ def export(self, **kwargs):
+ from export_meta import export_rebuild_model
+
+ if "max_seq_len" not in kwargs:
+ kwargs["max_seq_len"] = 512
+ models = export_rebuild_model(model=self, **kwargs)
+ return models
diff --git a/python/sensevoice/app/requirements.txt b/python/sensevoice/app/requirements.txt
new file mode 100644
index 000000000..0395d6fe9
--- /dev/null
+++ b/python/sensevoice/app/requirements.txt
@@ -0,0 +1,5 @@
+torch>=1.13
+torchaudio
+funasr>=1.1.1
+fastapi
+modelscope
diff --git a/python/sensevoice/main.py b/python/sensevoice/main.py
new file mode 100644
index 000000000..ef065a87a
--- /dev/null
+++ b/python/sensevoice/main.py
@@ -0,0 +1,56 @@
+from fastapi import FastAPI, File, UploadFile, HTTPException
+from fastapi.responses import JSONResponse
+from tempfile import NamedTemporaryFile
+from funasr import AutoModel
+from funasr.utils.postprocess_utils import rich_transcription_postprocess
+import os
+
+# Load the model
+model_dir = "./iic/SenseVoiceSmall"
+
+model = AutoModel(
+ model=model_dir,
+ trust_remote_code=True,
+ remote_code="./model.py",
+ vad_model="fsmn-vad",
+ vad_kwargs={"max_single_segment_time": 30000},
+ device="cuda:0",
+)
+
+app = FastAPI()
+
+@app.post("/v1/audio/transcriptions")
+async def handler(file: UploadFile = File(...)):
+ if not file:
+ raise HTTPException(status_code=400, detail="No file was provided")
+
+    # Create a temporary file using NamedTemporaryFile
+ with NamedTemporaryFile(delete=False) as temp_file:
+        # Write the uploaded file to the temporary file
+ content = await file.read()
+ temp_file.write(content)
+ temp_file_path = temp_file.name
+
+ try:
+        # Run the model
+ result = model.generate(
+ input=temp_file_path,
+ cache={},
+ language="auto",
+ use_itn=True,
+ batch_size_s=60,
+ merge_vad=True,
+ merge_length_s=15,
+ )
+ text = rich_transcription_postprocess(result[0]["text"])
+
+        # Return a JSON response containing the result
+ return JSONResponse(content={'text': text})
+ finally:
+        # Delete the temporary file
+ os.unlink(temp_file_path)
+
+if __name__ == "__main__":
+ import uvicorn
+
+ uvicorn.run(app, host="0.0.0.0", port=8000)
diff --git a/python/sensevoice/run.sh b/python/sensevoice/run.sh
new file mode 100644
index 000000000..c1311ecce
--- /dev/null
+++ b/python/sensevoice/run.sh
@@ -0,0 +1 @@
+docker run -d -p 8000:8000 registry.cn-hangzhou.aliyuncs.com/luanshaotong/sensevoice:v0.1
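+# Smoke test once the container is up (assumes the image serves main.py on port 8000;
+# the audio file name is an example):
+# curl -F "file=@sample.mp3" http://127.0.0.1:8000/v1/audio/transcriptions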