update code position (#3907)

2025-07-23 13:03:50 +00:00 · 2025-02-27 10:30:43 +08:00
parent fb0eb49196
commit c3d3b30d7e
107 changed files with 24 additions and 480 deletions
--- a/plugins/model/llm-Baichuan2/openai_api.py
+++ b/plugins/model/llm-Baichuan2/openai_api.py
@@ -0,0 +1,233 @@
+# coding=utf-8
+# Implements API for Baichuan2-7B-Chat in OpenAI's format. (https://platform.openai.com/docs/api-reference/chat)
+# Usage: python openai_api.py
+
+import gc
+import time
+import torch
+import uvicorn
+from pydantic import BaseModel, Field, validator
+from fastapi import FastAPI, HTTPException
+from fastapi.middleware.cors import CORSMiddleware
+from contextlib import asynccontextmanager
+from typing import Any, Dict, List, Optional, Union
+from transformers import AutoModelForCausalLM, AutoTokenizer
+from sse_starlette.sse import ServerSentEvent, EventSourceResponse
+from transformers.generation.utils import GenerationConfig
+import random
+import string
+
+
+@asynccontextmanager
+async def lifespan(app: FastAPI): # collects GPU memory
+    """FastAPI lifespan hook: no startup work; on shutdown release cached CUDA memory."""
+    yield
+    # Everything after the yield runs when the application shuts down.
+    if torch.cuda.is_available():
+        torch.cuda.empty_cache()
+        torch.cuda.ipc_collect()
+
+
+# Application instance; `lifespan` frees cached CUDA memory when the server stops.
+app = FastAPI(lifespan=lifespan)
+
+# NOTE(review): CORS is fully open (any origin, method and header, with
+# credentials allowed) — confirm this is acceptable for the deployment.
+app.add_middleware(
+    CORSMiddleware,
+    allow_origins=["*"],
+    allow_credentials=True,
+    allow_methods=["*"],
+    allow_headers=["*"],
+)
+
+# Metadata describing one served model (an entry of the /v1/models listing).
+class ModelCard(BaseModel):
+    id: str
+    object: str = "model"
+    created: int = Field(default_factory=lambda: int(time.time()))  # Unix seconds at instantiation
+    owned_by: str = "owner"
+    root: Optional[str] = None
+    parent: Optional[str] = None
+    permission: Optional[list] = None
+
+# Response body of GET /v1/models: a list wrapper around the model cards.
+class ModelList(BaseModel):
+    object: str = "list"
+    # list_models() stores ModelCard instances here; the former `List[str]`
+    # annotation made validation reject every card.
+    data: List[ModelCard] = []
+
+# One complete chat turn sent by, or returned to, the client.
+class ChatMessage(BaseModel):
+    role: str
+    content: str
+
+    @validator('role')
+    def check_role(cls, v):
+        # Accept only the three known chat roles.
+        if v in ("user", "assistant", "system"):
+            return v
+        raise ValueError('role must be one of "user", "assistant", "system"')
+
+# Incremental piece of an assistant message used in streamed chunks; both fields may be absent.
+class DeltaMessage(BaseModel):
+    role: Optional[str] = None
+    content: Optional[str] = None
+
+    @validator('role', allow_reuse=True)
+    def check_role(cls, v):
+        # A missing role is fine; a present one must be a known chat role.
+        if v is None or v in ("user", "assistant", "system"):
+            return v
+        raise ValueError('role must be one of "user", "assistant", "system"')
+
+# Request body of POST /v1/chat/completions.
+# NOTE(review): temperature, top_p and max_length are accepted but not read by
+# the handlers visible in this file.
+class ChatCompletionRequest(BaseModel):
+    model: str
+    messages: List[ChatMessage]
+    temperature: Optional[float] = None
+    top_p: Optional[float] = None
+    max_length: Optional[int] = 8192  # max_length should be an integer.
+    stream: Optional[bool] = False
+
+# One choice of a non-streamed chat completion.
+class ChatCompletionResponseChoice(BaseModel):
+    index: int
+    message: ChatMessage
+    finish_reason: str
+
+    @validator('finish_reason')
+    def check_finish_reason(cls, v):
+        # Only the two supported termination reasons are allowed.
+        if v in ("stop", "length"):
+            return v
+        raise ValueError('finish_reason must be one of "stop" or "length"')
+
+# One choice of a streamed chunk; finish_reason stays None until the last chunk.
+class ChatCompletionResponseStreamChoice(BaseModel):
+    index: int
+    delta: DeltaMessage
+    finish_reason: Optional[str]
+
+    @validator('finish_reason', allow_reuse=True)
+    def check_finish_reason(cls, v):
+        # None means "still generating"; otherwise it must be a supported reason.
+        if v is None or v in ("stop", "length"):
+            return v
+        raise ValueError('finish_reason must be one of "stop" or "length"')
+
+# Envelope used for both complete responses and streamed chunks.
+class ChatCompletionResponse(BaseModel):
+    id: str
+    object: str
+    created: Optional[int] = Field(default_factory=lambda: int(time.time()))
+    model: str
+    choices: List[Union[ChatCompletionResponseChoice, ChatCompletionResponseStreamChoice]]
+
+    @validator('object')
+    def check_object(cls, v):
+        # The object tag distinguishes a full completion from a stream chunk.
+        if v in ("chat.completion", "chat.completion.chunk"):
+            return v
+        raise ValueError("object must be one of 'chat.completion' or 'chat.completion.chunk'")
+
+
+def generate_id():
+    """Return a pseudo-random completion id: 'chatcmpl-' followed by 29 ASCII letters/digits."""
+    alphabet = string.ascii_letters + string.digits
+    suffix = random.choices(alphabet, k=29)
+    return 'chatcmpl-' + ''.join(suffix)
+    
+
+@app.get("/v1/models", response_model=ModelList)
+async def list_models():
+    """Return the list of served models: a single card advertised under the id "gpt-3.5-turbo"."""
+    model_card = ModelCard(id="gpt-3.5-turbo")
+    return ModelList(data=[model_card])
+
+
+@app.post("/v1/chat/completions", response_model=ChatCompletionResponse)
+async def create_chat_completion(request: ChatCompletionRequest):
+    """Handle a chat completion request.
+
+    The last message must come from the user. A leading system message is
+    prepended to the user query; the remaining messages are forwarded as
+    conversation history. Only streaming is implemented: with ``stream=True``
+    the reply is sent as server-sent events, otherwise a fixed notice is
+    returned as the assistant message.
+
+    Raises:
+        HTTPException: 400 when ``messages`` is empty or does not end with a
+            user message.
+    """
+    # An empty list used to raise IndexError here and surface as a 500.
+    if not request.messages or request.messages[-1].role != "user":
+        raise HTTPException(status_code=400, detail="Invalid request")
+    query = request.messages[-1].content
+    prev_messages = request.messages[:-1]
+    if prev_messages and prev_messages[0].role == "system":
+        query = prev_messages.pop(0).content + query
+    messages = [
+        {"role": message.role, "content": message.content}
+        for message in prev_messages
+    ]
+    messages.append({"role": "user", "content": query})
+
+    if request.stream:
+        generate = predict(messages, request.model)
+        return EventSourceResponse(generate, media_type="text/event-stream")
+
+    response = '本接口不支持非stream模式'
+    choice_data = ChatCompletionResponseChoice(
+        index=0,
+        message=ChatMessage(role="assistant", content=response),
+        finish_reason="stop"
+    )
+
+    # Use a fresh id per response, as the streaming path does, instead of one
+    # hard-coded id shared by every reply.
+    return ChatCompletionResponse(
+        id=generate_id(),
+        model=request.model,
+        choices=[choice_data],
+        object="chat.completion",
+    )
+
+
+async def predict(messages: List[Dict[str, str]], model_id: str):
+    """Stream a chat completion as ``chat.completion.chunk`` JSON payloads.
+
+    Yields, in order: a chunk announcing the assistant role, one chunk per
+    piece of newly generated text, a final chunk with ``finish_reason="stop"``
+    and the literal ``[DONE]`` marker.
+
+    Args:
+        messages: Conversation as ``{"role": ..., "content": ...}`` dicts, as
+            built by create_chat_completion.
+        model_id: Model name echoed back in every chunk.
+    """
+    global model, tokenizer
+    completion_id = generate_id()
+    created = int(time.time())
+
+    def make_chunk(delta, finish_reason=None):
+        # Serialise one stream chunk; exclude_unset drops delta fields that were never set.
+        choice = ChatCompletionResponseStreamChoice(
+            index=0,
+            delta=delta,
+            finish_reason=finish_reason
+        )
+        chunk = ChatCompletionResponse(
+            id=completion_id,
+            object="chat.completion.chunk",
+            created=created,
+            model=model_id,
+            choices=[choice],
+        )
+        return chunk.json(exclude_unset=True, ensure_ascii=False)
+
+    yield make_chunk(DeltaMessage(role="assistant", content=""))
+
+    current_length = 0
+
+    # NOTE(review): the generator is consumed synchronously inside this coroutine,
+    # which presumably blocks the event loop during generation — confirm.
+    for new_response in model.chat(tokenizer, messages, stream=True):
+        # Each item is treated as the cumulative text so far; skip it when nothing was added.
+        if len(new_response) == current_length:
+            continue
+
+        new_text = new_response[current_length:]
+        current_length = len(new_response)
+        yield make_chunk(DeltaMessage(content=new_text))
+
+    yield make_chunk(DeltaMessage(), finish_reason="stop")
+    yield '[DONE]'
+
+
+def load_models():
+    """Load the Baichuan2-7B-Chat tokenizer and fp16 model, move the model to the GPU.
+
+    Returns:
+        A ``(tokenizer, model)`` tuple.
+    """
+    model_id = "baichuan-inc/Baichuan2-7B-Chat"
+    # The message previously announced Baichuan-13B-Chat although the 7B chat model is loaded.
+    print("本次加载的大语言模型为: Baichuan2-7B-Chat")
+    tokenizer = AutoTokenizer.from_pretrained(model_id, use_fast=False, trust_remote_code=True)
+    model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype=torch.float16, trust_remote_code=True)
+    model = model.cuda()
+    model.generation_config = GenerationConfig.from_pretrained(model_id)
+    return tokenizer, model
+
+if __name__ == "__main__":
+    tokenizer, model = load_models()
+    # uvicorn.run() blocks until the server stops. Code placed after it cannot
+    # monitor GPU memory while requests are served; an endless loop here would
+    # only start after shutdown and keep the process from exiting.
+    uvicorn.run(app, host='0.0.0.0', port=6006, workers=1)
+
+    # Best-effort cleanup once the server has stopped.
+    gc.collect()
+    if torch.cuda.is_available():
+        torch.cuda.empty_cache()
+
+    
--- a/plugins/model/llm-Baichuan2/requirements.txt
+++ b/plugins/model/llm-Baichuan2/requirements.txt
@@ -0,0 +1,14 @@
+protobuf
+transformers==4.30.2
+cpm_kernels
+torch>=2.0
+gradio
+mdtex2html
+sentencepiece
+accelerate
+sse-starlette
+fastapi==0.99.1
+pydantic==1.10.7
+uvicorn==0.21.1
+xformers
+bitsandbytes
--- a/plugins/model/llm-ChatGLM2/openai_api.py
+++ b/plugins/model/llm-ChatGLM2/openai_api.py
@@ -0,0 +1,260 @@
+# coding=utf-8
+import argparse
+import time
+from contextlib import asynccontextmanager
+from typing import List, Literal, Optional, Union
+
+import numpy as np
+import tiktoken
+import torch
+import uvicorn
+from fastapi import Depends, FastAPI, HTTPException, Request
+from fastapi.middleware.cors import CORSMiddleware
+from pydantic import BaseModel, Field
+from sentence_transformers import SentenceTransformer
+from sklearn.preprocessing import PolynomialFeatures
+from sse_starlette.sse import EventSourceResponse
+from starlette.status import HTTP_401_UNAUTHORIZED
+from transformers import AutoModel, AutoTokenizer
+
+
+@asynccontextmanager
+async def lifespan(app: FastAPI):  # collects GPU memory
+    """FastAPI lifespan hook: no startup work; on shutdown release cached CUDA memory."""
+    yield
+    # Everything after the yield runs when the application shuts down.
+    if torch.cuda.is_available():
+        torch.cuda.empty_cache()
+        torch.cuda.ipc_collect()
+
+
+# Application instance; `lifespan` frees cached CUDA memory when the server stops.
+app = FastAPI(lifespan=lifespan)
+
+# NOTE(review): CORS is fully open (any origin, method and header, with
+# credentials allowed) — confirm this is acceptable for the deployment.
+app.add_middleware(
+    CORSMiddleware,
+    allow_origins=["*"],
+    allow_credentials=True,
+    allow_methods=["*"],
+    allow_headers=["*"],
+)
+
+
+# One complete chat turn; the role is restricted to the three listed literals.
+class ChatMessage(BaseModel):
+    role: Literal["user", "assistant", "system"]
+    content: str
+
+
+# Incremental piece of an assistant message used in streamed chunks; both fields may be absent.
+class DeltaMessage(BaseModel):
+    role: Optional[Literal["user", "assistant", "system"]] = None
+    content: Optional[str] = None
+
+
+# Request body of POST /v1/chat/completions.
+# NOTE(review): temperature, top_p and max_length are accepted but not read by
+# the handlers visible in this file.
+class ChatCompletionRequest(BaseModel):
+    model: str
+    messages: List[ChatMessage]
+    temperature: Optional[float] = None
+    top_p: Optional[float] = None
+    max_length: Optional[int] = None
+    stream: Optional[bool] = False
+
+
+# One choice of a non-streamed chat completion.
+class ChatCompletionResponseChoice(BaseModel):
+    index: int
+    message: ChatMessage
+    finish_reason: Literal["stop", "length"]
+
+
+# One choice of a streamed chunk; predict() passes finish_reason=None until the final chunk.
+class ChatCompletionResponseStreamChoice(BaseModel):
+    index: int
+    delta: DeltaMessage
+    finish_reason: Optional[Literal["stop", "length"]]
+
+
+# Envelope used for both complete responses and streamed chunks.
+class ChatCompletionResponse(BaseModel):
+    model: str
+    object: Literal["chat.completion", "chat.completion.chunk"]
+    choices: List[
+        Union[ChatCompletionResponseChoice, ChatCompletionResponseStreamChoice]
+    ]
+    created: Optional[int] = Field(default_factory=lambda: int(time.time()))  # Unix seconds at instantiation
+
+
+async def verify_token(request: Request):
+    """FastAPI dependency enforcing bearer-token authentication.
+
+    The expected token is read from the ``API_TOKEN`` environment variable and
+    falls back to the built-in placeholder, so existing deployments keep
+    working while real deployments can configure a private token without
+    editing the source.
+
+    Returns:
+        True when the ``Authorization`` header carries the expected bearer token.
+
+    Raises:
+        HTTPException: 401 when the header is missing or the token is wrong.
+    """
+    import os
+    import secrets
+
+    expected_token = os.environ.get(
+        "API_TOKEN", "sk-aaabbbcccdddeeefffggghhhiiijjjkkk"
+    )
+    auth_header = request.headers.get('Authorization')
+    if auth_header:
+        token_type, _, token = auth_header.partition(' ')
+        # compare_digest keeps the comparison time independent of how many
+        # leading characters match; bytes are used so non-ASCII input is accepted.
+        if token_type.lower() == "bearer" and secrets.compare_digest(
+            token.encode("utf-8"), expected_token.encode("utf-8")
+        ):
+            return True
+    raise HTTPException(
+        status_code=HTTP_401_UNAUTHORIZED,
+        detail="Invalid authorization credentials",
+    )
+
+
+# Request body of POST /v1/embeddings: the texts to embed and the model name to echo back.
+class EmbeddingRequest(BaseModel):
+    input: List[str]
+    model: str
+
+
+# Response body of POST /v1/embeddings, filled from the dict built in get_embeddings().
+class EmbeddingResponse(BaseModel):
+    data: list
+    model: str
+    object: str
+    usage: dict
+
+
+def num_tokens_from_string(string: str) -> int:
+    """Count the tokens of ``string`` under tiktoken's ``cl100k_base`` encoding."""
+    return len(tiktoken.get_encoding('cl100k_base').encode(string))
+
+
+def expand_features(embedding, target_length):
+    """Expand ``embedding`` with degree-2 polynomial features to exactly ``target_length`` values.
+
+    The expanded vector is truncated when it is longer than ``target_length``
+    and right-padded (np.pad defaults) when it is shorter.
+    """
+    transformer = PolynomialFeatures(degree=2)
+    flat = transformer.fit_transform(embedding.reshape(1, -1)).flatten()
+    size = len(flat)
+    if size > target_length:
+        # Too many features: keep only the leading target_length entries.
+        return flat[:target_length]
+    if size < target_length:
+        # Too few features: pad the tail up to the target size.
+        return np.pad(flat, (0, target_length - size))
+    return flat
+
+
+@app.post("/v1/chat/completions", response_model=ChatCompletionResponse)
+async def create_chat_completion(
+    request: ChatCompletionRequest, token: bool = Depends(verify_token)
+):
+    """Handle a chat completion request, streamed or not.
+
+    The last message must come from the user. A leading system message is
+    prepended to the query. Earlier messages become history only when they
+    form complete user/assistant pairs.
+
+    Raises:
+        HTTPException: 400 when ``messages`` is empty or does not end with a
+            user message.
+    """
+    global model, tokenizer
+
+    # An empty list used to raise IndexError here and surface as a 500.
+    if not request.messages or request.messages[-1].role != "user":
+        raise HTTPException(status_code=400, detail="Invalid request")
+    query = request.messages[-1].content
+
+    prev_messages = request.messages[:-1]
+    if prev_messages and prev_messages[0].role == "system":
+        query = prev_messages.pop(0).content + query
+
+    history = []
+    # History is only used when the remaining messages pair up evenly;
+    # pairs that are not (user, assistant) are dropped.
+    if len(prev_messages) % 2 == 0:
+        history = [
+            [asked.content, answered.content]
+            for asked, answered in zip(prev_messages[::2], prev_messages[1::2])
+            if asked.role == "user" and answered.role == "assistant"
+        ]
+
+    if request.stream:
+        generate = predict(query, history, request.model)
+        return EventSourceResponse(generate, media_type="text/event-stream")
+
+    response, _ = model.chat(tokenizer, query, history=history)
+    choice_data = ChatCompletionResponseChoice(
+        index=0,
+        message=ChatMessage(role="assistant", content=response),
+        finish_reason="stop",
+    )
+
+    return ChatCompletionResponse(
+        model=request.model, choices=[choice_data], object="chat.completion"
+    )
+
+
+async def predict(query: str, history: List[List[str]], model_id: str):
+    """Stream a chat completion as ``chat.completion.chunk`` JSON payloads.
+
+    Yields a role-announcing chunk, one chunk per newly generated piece of
+    text, a closing chunk with ``finish_reason="stop"`` and finally ``[DONE]``.
+    """
+    global model, tokenizer
+
+    def render(delta, finish_reason):
+        # Build and serialise one chunk; exclude_unset omits fields never assigned.
+        choice = ChatCompletionResponseStreamChoice(
+            index=0, delta=delta, finish_reason=finish_reason
+        )
+        envelope = ChatCompletionResponse(
+            model=model_id, choices=[choice], object="chat.completion.chunk"
+        )
+        return envelope.json(exclude_unset=True, ensure_ascii=False)
+
+    yield render(DeltaMessage(role="assistant"), None)
+
+    emitted = 0
+
+    for partial, _ in model.stream_chat(tokenizer, query, history):
+        # Each item is treated as the cumulative text so far; forward only the new suffix.
+        if len(partial) != emitted:
+            fresh = partial[emitted:]
+            emitted = len(partial)
+            yield render(DeltaMessage(content=fresh), None)
+
+    yield render(DeltaMessage(), "stop")
+    yield '[DONE]'
+
+
+@app.post("/v1/embeddings", response_model=EmbeddingResponse)
+async def get_embeddings(
+    request: EmbeddingRequest, token: bool = Depends(verify_token)
+):
+    """Embed every input text and return the vectors with token usage.
+
+    Vectors shorter than 1536 dimensions are expanded to 1536; every vector is
+    then scaled to unit L2 norm.
+    """
+    # Encode each input text separately.
+    embeddings = [embeddings_model.encode(text) for text in request.input]
+
+    # Only vectors shorter than 1536 dimensions are expanded; longer ones pass through unchanged.
+    embeddings = [
+        expand_features(embedding, 1536) if len(embedding) < 1536 else embedding
+        for embedding in embeddings
+    ]
+
+    # L2 normalisation. An all-zero vector is left as is: dividing by its zero
+    # norm would fill the response with NaN.
+    normalized = []
+    for embedding in embeddings:
+        norm = np.linalg.norm(embedding)
+        normalized.append(embedding / norm if norm > 0 else embedding)
+
+    # Convert the numpy arrays to plain lists for JSON serialisation.
+    embeddings = [embedding.tolist() for embedding in normalized]
+
+    # An embedding request produces no completion tokens, so both counters use
+    # the same tokenizer-based count (the former whitespace split miscounted
+    # text without spaces, e.g. Chinese).
+    total_tokens = sum(num_tokens_from_string(text) for text in request.input)
+    prompt_tokens = total_tokens
+
+    response = {
+        "data": [
+            {"embedding": embedding, "index": index, "object": "embedding"}
+            for index, embedding in enumerate(embeddings)
+        ],
+        "model": request.model,
+        "object": "list",
+        "usage": {
+            "prompt_tokens": prompt_tokens,
+            "total_tokens": total_tokens,
+        },
+    }
+
+    return response
+
+
+if __name__ == "__main__":
+    # --model_name selects a checkpoint by key ("4", "8" or "16"); unknown keys
+    # fall back to the default THUDM/chatglm2-6b checkpoint.
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--model_name", default="16", type=str, help="Model name")
+    args = parser.parse_args()
+
+    model_dict = {
+        "4": "THUDM/chatglm2-6b-int4",
+        "8": "THUDM/chatglm2-6b-int8",
+        "16": "THUDM/chatglm2-6b",
+    }
+
+    model_name = model_dict.get(args.model_name, "THUDM/chatglm2-6b")
+
+    # These names become the module globals used by the request handlers.
+    tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
+    model = AutoModel.from_pretrained(model_name, trust_remote_code=True).cuda()
+    embeddings_model = SentenceTransformer('moka-ai/m3e-large', device='cpu')
+
+    uvicorn.run(app, host='0.0.0.0', port=6006, workers=1)
--- a/plugins/model/llm-ChatGLM2/requirements.txt
+++ b/plugins/model/llm-ChatGLM2/requirements.txt
@@ -0,0 +1,11 @@
+fastapi==0.101.1
+numpy==1.24.3
+pydantic==1.10.7
+scikit_learn==1.2.2
+sentence_transformers==2.2.2
+sse_starlette==1.6.5
+starlette==0.27.0
+tiktoken==0.4.0
+torch==2.0.1
+transformers==4.31.0
+uvicorn==0.23.2
--- a/plugins/model/ocr-surya/Dockerfile
+++ b/plugins/model/ocr-surya/Dockerfile
@@ -0,0 +1,17 @@
+FROM pytorch/pytorch:2.4.0-cuda11.8-cudnn9-runtime
+
+# please download the model from https://huggingface.co/vikp/surya_det3 
+# and https://huggingface.co/vikp/surya_rec2, and put it in the directory vikp/
+COPY ./vikp ./vikp
+
+COPY requirements.txt .
+
+RUN python3 -m pip install -r requirements.txt -i https://pypi.tuna.tsinghua.edu.cn/simple
+
+RUN python3 -m pip uninstall opencv-python -y
+
+RUN python3 -m pip install opencv-python-headless -i https://pypi.tuna.tsinghua.edu.cn/simple
+
+COPY app.py Dockerfile ./
+
+ENTRYPOINT python3 app.py
--- a/plugins/model/ocr-surya/README.md
+++ b/plugins/model/ocr-surya/README.md
--- a/plugins/model/ocr-surya/app.py
+++ b/plugins/model/ocr-surya/app.py
@@ -0,0 +1,157 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+import base64
+import io
+import json
+import logging
+import os
+from typing import List, Optional
+
+import torch
+import uvicorn
+from fastapi import FastAPI, HTTPException, Security
+from fastapi.security import HTTPAuthorizationCredentials, HTTPBearer
+from PIL import Image, ImageFile
+from pydantic import BaseModel
+from surya.model.detection.model import load_model as load_det_model
+from surya.model.detection.model import load_processor as load_det_processor
+from surya.model.recognition.model import load_model as load_rec_model
+from surya.model.recognition.processor import load_processor as load_rec_processor
+from surya.ocr import run_ocr
+from surya.schema import OCRResult
+import warnings
+warnings.filterwarnings("ignore", category=FutureWarning, module="transformers")
+app = FastAPI()
+security = HTTPBearer()
+# Expected bearer token. It is only filled from ACCESS_TOKEN in the __main__
+# block, so it stays None (token check skipped) when this module is imported.
+# NOTE(review): confirm that behaviour is intended for non-script launches.
+env_bearer_token = None
+
+
+# GPU显存回收
+def torch_gc():
+    """Release cached CUDA memory; does nothing when CUDA is unavailable."""
+    if not torch.cuda.is_available():
+        return
+    torch.cuda.empty_cache()
+    torch.cuda.ipc_collect()
+
+
+# Request body of POST /v1/ocr/text.
+class ImageReq(BaseModel):
+    images: List[str]  # passed one by one to Chat.query_ocr, which base64-decodes them
+    sorted: Optional[bool] = False  # forwarded to Chat.query_ocr as its `sorted` flag
+
+
+class Singleton(type):
+    """Metaclass that creates an instance on the first call and returns it on every later call."""
+
+    def __call__(cls, *args, **kwargs):
+        # Arguments of later calls are ignored once the instance exists.
+        if not hasattr(cls, '_instance'):
+            cls._instance = super().__call__(*args, **kwargs)
+        return cls._instance
+
+
+class Surya(metaclass=Singleton):
+    """Shared holder of the Surya detection and recognition models.
+
+    Configuration comes from the environment: ``LANGS`` (a JSON list,
+    default ``["zh", "en"]``) and the optional integer ``BATCH_SIZE``.
+    """
+
+    def __init__(self):
+        self.langs = json.loads(os.getenv("LANGS", '["zh", "en"]'))
+        self.batch_size = os.getenv("BATCH_SIZE")
+        # BATCH_SIZE stays None when the variable is unset.
+        if self.batch_size is not None:
+            self.batch_size = int(self.batch_size)
+        self.det_processor, self.det_model = load_det_processor(
+        ), load_det_model()
+        self.rec_model, self.rec_processor = load_rec_model(
+        ), load_rec_processor()
+
+    def run(self, image: ImageFile.ImageFile) -> List[OCRResult]:
+        """Run OCR on a single image and return run_ocr's predictions."""
+        predictions = run_ocr([image], [self.langs], self.det_model,
+                              self.det_processor, self.rec_model,
+                              self.rec_processor, self.batch_size)
+        return predictions
+
+
+class Chat(object):
+    """Decodes base64 images and runs them through the shared Surya OCR instance."""
+
+    def __init__(self):
+        self.surya = Surya()
+
+    @staticmethod
+    def base64_to_image(base64_string: str) -> ImageFile.ImageFile:
+        """Decode a base64 string into a PIL image."""
+        image_data = base64.b64decode(base64_string)
+        image_stream = io.BytesIO(image_data)
+        return Image.open(image_stream)
+
+    @staticmethod
+    def sort_text_by_bbox(original_data: List[dict]) -> str:
+        """Group items into lines by bbox and return the text, one line per row.
+
+        Items are ordered by ``bbox[1]`` and grouped into a line while their
+        vertical midpoint lies within the first item's ``bbox[1]..bbox[3]``
+        range; each line is then ordered by ``bbox[0]``. ``original_data`` is
+        sorted in place.
+        """
+        lines, line = [], []
+        original_data.sort(key=lambda item: item["bbox"][1])
+        for item in original_data:
+            mid_h = (item["bbox"][1] + item["bbox"][3]) / 2
+            if len(line) == 0 or (mid_h >= line[0]["bbox"][1]
+                                  and mid_h <= line[0]["bbox"][3]):
+                line.append(item)
+            else:
+                lines.append(line)
+                line = [item]
+        lines.append(line)
+        for line in lines:
+            line.sort(key=lambda item: item["bbox"][0])
+        # Every item is followed by a space and every line by a newline.
+        return "".join(
+            "".join(item["text"] + " " for item in line) + "\n"
+            for line in lines
+        )
+
+    def query_ocr(self, image_base64: str,
+                  sorted: bool) -> str:
+        """Run OCR on one base64-encoded image and return the text lines joined by newlines.
+
+        Args:
+            image_base64: Base64 image data; ``None`` or empty yields ``""``.
+            sorted: When true, the lines pass through sort_text_lines().
+
+        Raises:
+            HTTPException: 400 when decoding or recognition fails.
+        """
+        if image_base64 is None or len(image_base64) == 0:
+            return ""
+        try:
+            image = Chat.base64_to_image(image_base64)
+            ocr_result = self.surya.run(image)
+            result = [text_line.text for text_line in ocr_result[0].text_lines]
+
+            # `sorted` is the request flag here; it shadows the builtin inside this method.
+            if sorted:
+                result = self.sort_text_lines(result)
+
+            # Merge all text lines into one newline-separated string.
+            return "\n".join(result)
+        except Exception as e:
+            logging.error(f"OCR 处理失败: {e}")
+            raise HTTPException(status_code=400, detail=f"OCR 处理失败: {str(e)}") from e
+        finally:
+            # Release cached GPU memory after failures as well as successes.
+            torch_gc()
+
+    @staticmethod
+    def sort_text_lines(text_lines: List[str]) -> List[str]:
+        """Placeholder for custom ordering; returns the list unchanged because no position data is available."""
+        return text_lines
+
+@app.post('/v1/ocr/text')
+async def handle_post_request(
+    image_req: ImageReq,
+    credentials: HTTPAuthorizationCredentials = Security(security)):
+    """Authenticate the caller, OCR every submitted image and return the texts in order."""
+    token = credentials.credentials
+    # The token is only checked when an expected token has been configured.
+    if env_bearer_token is not None and token != env_bearer_token:
+        raise HTTPException(status_code=401, detail="无效的令牌")
+    chat = Chat()
+    try:
+        results = [
+            chat.query_ocr(image_base64, image_req.sorted)
+            for image_base64 in image_req.images
+        ]
+        return {"error": None, "results": results}
+    except HTTPException as he:
+        # Errors already shaped for the client pass through untouched.
+        raise he
+    except Exception as e:
+        logging.error(f"识别报错：{e}")
+        raise HTTPException(status_code=500, detail=f"识别出错: {str(e)}")
+
+if __name__ == "__main__":
+    # Read the expected bearer token; when ACCESS_TOKEN is unset the handler skips the check.
+    env_bearer_token = os.getenv("ACCESS_TOKEN")
+    try:
+        uvicorn.run(app, host='0.0.0.0', port=7230)
+    except Exception as e:
+        # Startup failures are logged rather than propagated.
+        logging.error(f"API启动失败！报错：{e}")
--- a/plugins/model/ocr-surya/requirements.txt
+++ b/plugins/model/ocr-surya/requirements.txt
@@ -0,0 +1,3 @@
+surya-ocr==0.5.0
+fastapi==0.104.1
+uvicorn==0.17.6
--- a/plugins/model/pdf-marker/Dockerfile
+++ b/plugins/model/pdf-marker/Dockerfile
@@ -0,0 +1,38 @@
+FROM pytorch/pytorch:2.4.1-cuda12.4-cudnn9-devel
+
+ENV DEBIAN_FRONTEND=noninteractive
+ENV LANG=C.UTF-8
+# 安装构建依赖 cv2 dependencies
+RUN apt-get update && apt-get install ffmpeg libsm6 libxext6 -y
+
+# 设置 pip 配置
+RUN mkdir -p /root/.pip
+COPY pip.conf /root/.pip/
+
+# 创建模型文件夹
+RUN mkdir -p /root/huggingface
+
+# 复制依赖文件
+COPY requirements.txt /root/
+COPY api_mp.py /root/
+
+
+# 导入huggingface的代理和huggingface模型位置
+ENV HF_ENDPOINT=https://hf-mirror.com \
+    HF_DATASETS_CACHE=/root/huggingface \
+    HUGGINGFACE_HUB_CACHE=/root/huggingface \
+    HF_HOME=/root/huggingface
+
+# 设置工作目录
+WORKDIR /root
+
+# 安装 Python 依赖
+RUN pip3 install --no-cache-dir -r requirements.txt
+
+# 删除不必要的工具和文件以减小镜像体积
+RUN apt-get purge -y vim && apt-get autoremove -y && rm -rf /root/.pip /root/.cache/pip
+
+
+
+# 设置容器启动命令
+CMD ["python3", "api_mp.py"]
--- a/plugins/model/pdf-marker/Readme.md
+++ b/plugins/model/pdf-marker/Readme.md
@@ -0,0 +1,134 @@
+# 项目介绍
+
+本项目实现了一个高效的 **PDF 转 Markdown 接口服务**，支持多进程并行处理多个 PDF 文件。通过高性能的接口设计，快速将 PDF 文档转换为 Markdown 格式文本。
+
+- **简洁性：**项目无需修改代码，仅需调整文件路径即可使用，简单易用
+- **易用性：**通过提供简洁的 API，开发者只需发送 HTTP 请求即可完成 PDF 转换
+- **灵活性：**支持本地部署和 Docker 容器部署两种方式，便于快速上手和灵活集成
+
+# 配置推荐
+
+## 常规配置
+
+24G显存的显卡两张，可以支持四个文件同时处理
+
+## 最低配置
+
+**不低于11G** 显存的显卡一张
+
+并设置每张卡处理的进程数为1
+
+```bash
+export PROCESSES_PER_GPU="1"
+```
+
+## 单文件实测速率
+
+| 显卡          | 中文PDF      | 英文PDF      | 扫描件       |
+| ------------- | ------------ | ------------ | ------------ |
+| **4090D 24G** | **0.75s/页** | **1.60s/页** | **3.26s/页** |
+| **P40 24G**   | **0.99s/页** | **2.22s/页** | **5.24s/页** |
+
+## 多文件实测速率
+
+中文PDF+英文PDF
+
+| 显卡          | 串行处理     | 并行处理     | 提升效率  |
+| ------------- | ------------ | ------------ | --------- |
+| **4090D 24G** | **0.92s/页** | **0.62s/页** | **31.9%** |
+| **P40 24G**   | **1.22s/页** | **0.85s/页** | **30.5%** |
+
+# 本地开发
+
+## 基本流程
+
+1. 克隆一个FastGPT的项目文件
+
+   ```
+   git clone https://github.com/labring/FastGPT.git
+   ```
+
+2. 进入项目中 plugins/model 下的 pdf-marker 目录
+
+   ```
+   cd FastGPT/plugins/model/pdf-marker
+   ```
+
+3. 创建 conda 环境并安装 requirements.txt 中的依赖
+
+   安装的Anaconda版本：**conda 24.7.1**
+
+   ```
+   conda create -n pdf-marker python=3.11
+   conda activate pdf-marker
+   pip install -r requirements.txt -i https://pypi.tuna.tsinghua.edu.cn/simple
+   ```
+
+4. 执行主文件启动pdf2md服务
+
+   ```
+   python api_mp.py
+   ```
+
+# 镜像打包和部署
+
+## 本地构建镜像
+
+1. 在 `pdf-marker` 根目录下执行：
+
+    ```bash
+    sudo docker build -t model_pdf -f Dockerfile .
+    ```
+2. 运行容器
+    ```bash
+    sudo docker run --gpus all -itd -p 7231:7231 --name model_pdf_v1 -e PROCESSES_PER_GPU="2" model_pdf
+    ```
+## 快速构建镜像
+```bash
+docker pull crpi-h3snc261q1dosroc.cn-hangzhou.personal.cr.aliyuncs.com/marker11/marker_images:latest
+docker run --gpus all -itd -p 7231:7231 --name model_pdf_v1 -e PROCESSES_PER_GPU="2" crpi-h3snc261q1dosroc.cn-hangzhou.personal.cr.aliyuncs.com/marker11/marker_images:latest
+```
+*注意*：参数PROCESSES_PER_GPU设置每张显卡上文件处理的并行数量，24G的显卡可以设置为2。在多显卡的环境中会自动切换显卡来运行多文件的并行处理。 
+# 访问示例
+
+用 POST 方法访问端口为 `7231` 的 `v1/parse/file` 服务
+
+参数：file-->本地文件的地址
+
+- 访问方法
+
+  ```
+  curl --location --request POST "http://localhost:7231/v1/parse/file" \
+  --header "Authorization: Bearer your_access_token" \
+  --form "file=@./file/chinese_test.pdf"
+  ```
+
+- 多文件测试数据
+
+  运行 `test` 文件下的 `test.py` 文件，修改里面的 `file_paths` 为自己仓库的 `url` 即可
+
+# FAQ
+
+- 如果出现huggingface模型下载不下来?
+
+  可以选择在环境变量中加入huggingface镜像
+
+  ```bash
+  export HF_ENDPOINT=https://hf-mirror.com
+  export HF_DATASETS_CACHE=/huggingface
+  export HUGGINGFACE_HUB_CACHE=/huggingface
+  export HF_HOME=/huggingface
+  ```
+
+  也可以直接访问 [huggingface](https://huggingface.co) 来下载模型到 `/huggingface` 文件夹下
+
+  ```
+  https://huggingface.co/vikp/surya_det3/tree/main
+  https://huggingface.co/vikp/surya_layout3/tree/main
+  https://huggingface.co/vikp/surya_order/tree/main
+  https://huggingface.co/vikp/surya_rec2/tree/main
+  https://huggingface.co/vikp/surya_tablerec/tree/main
+  https://huggingface.co/vikp/texify2/tree/main
+  ```
+
+  
--- a/plugins/model/pdf-marker/api_mp.py
+++ b/plugins/model/pdf-marker/api_mp.py
@@ -0,0 +1,141 @@
+import asyncio
+import base64
+import fitz
+import torch.multiprocessing as mp
+import shutil
+import time
+from contextlib import asynccontextmanager
+from loguru import logger
+from fastapi import HTTPException, FastAPI, UploadFile, File
+import multiprocessing
+from marker.output import save_markdown
+from marker.convert import convert_single_pdf
+from marker.models import load_all_models
+import torch
+from concurrent.futures import ProcessPoolExecutor
+import os
+app = FastAPI()
+model_lst = None
+model_refs = None
+temp_dir = "./temp"
+# Default to two worker processes per GPU, but keep a value supplied through
+# the environment (e.g. `docker run -e PROCESSES_PER_GPU=1`); an unconditional
+# assignment here would silently override it.
+os.environ.setdefault('PROCESSES_PER_GPU', str(2))
+
+def worker_init(counter, lock):
+    """Pool-worker initializer: pick a device for this worker and load the models into ``model_lst``.
+
+    Args:
+        counter: Shared integer value used to hand out consecutive worker ids.
+        lock: Lock guarding the read-and-increment of ``counter``.
+
+    Raises:
+        ValueError: When the computed GPU index is not below the GPU count.
+    """
+    global model_lst
+    num_gpus = torch.cuda.device_count()
+    processes_per_gpu = int(os.environ.get('PROCESSES_PER_GPU', 1))
+    # Take the next worker id atomically.
+    with lock:
+        worker_id = counter.value
+        counter.value += 1
+    if num_gpus == 0:
+        device = 'cpu'
+    else:
+        # Consecutive ids fill one GPU with processes_per_gpu workers before moving to the next.
+        device_id = worker_id // processes_per_gpu
+        if device_id >= num_gpus:
+            raise ValueError(f"Worker ID {worker_id} exceeds available GPUs ({num_gpus}).")
+        device = f'cuda:{device_id}'
+    model_lst = load_all_models(device=device, dtype=torch.float32)
+    print(f"Worker {worker_id}: Models loaded successfully on {device}!")
+    for model in model_lst:
+        # Entries may be None; those are skipped.
+        if model is None:
+            continue
+        model.share_memory()
+
+def process_file_with_multiprocessing(temp_file_path):
+    """Convert one PDF with this worker's models.
+
+    Returns:
+        ``(markdown, metadata)`` where the Markdown has its images embedded as base64.
+    """
+    global model_lst
+    markdown_text, page_images, metadata = convert_single_pdf(temp_file_path, model_lst, batch_multiplier=1)
+    # save_markdown writes the result under ./result and returns the folder it used.
+    output_dir = save_markdown(r'./result', os.path.basename(temp_file_path), markdown_text, page_images, metadata)
+    return embed_images_as_base64(markdown_text, output_dir), metadata
+
+@asynccontextmanager
+async def lifespan(app: FastAPI):
+    """Create the worker pool on startup; stop it and remove the temp directory on shutdown.
+
+    Raises:
+        RuntimeError: When the multiprocessing start method was already set.
+    """
+    global my_pool
+    try:
+        mp.set_start_method('spawn')
+    except RuntimeError as err:
+        raise RuntimeError("Set start method to spawn twice. This may be a temporary issue with the script. Please try running it again.") from err
+    manager = multiprocessing.Manager()
+    worker_counter = manager.Value('i', 0)
+    worker_lock = manager.Lock()
+    processes_per_gpu = int(os.environ.get('PROCESSES_PER_GPU', 1))
+    # On a machine without GPUs the workers fall back to the CPU; keep at least
+    # one worker because ProcessPoolExecutor rejects max_workers=0.
+    max_workers = max(torch.cuda.device_count() * processes_per_gpu, 1)
+    my_pool = ProcessPoolExecutor(max_workers=max_workers, initializer=worker_init, initargs=(worker_counter, worker_lock))
+
+    try:
+        yield
+    finally:
+        # The models live in the worker processes, so stopping the pool is what
+        # releases them; there is nothing to delete in this process.
+        my_pool.shutdown(wait=True, cancel_futures=True)
+        if temp_dir and os.path.exists(temp_dir):
+            shutil.rmtree(temp_dir)
+        print("Application shutdown, cleaning up...")
+
+app.router.lifespan_context = lifespan
+
+@app.post("/v1/parse/file")
+async def read_file(
+        file: UploadFile = File(...)):
+    """Convert an uploaded PDF to Markdown in the worker pool.
+
+    Returns:
+        A dict with ``success``, ``message`` and ``data`` (``markdown``,
+        ``page`` count and processing ``duration`` in seconds).
+
+    Raises:
+        HTTPException: 500 when saving or converting the file fails.
+    """
+    # Bound before the try block so the cleanup in `finally` can always test it.
+    temp_file_path = None
+    try:
+        start_time = time.time()
+        os.makedirs(temp_dir, exist_ok=True)
+        # The client controls the file name: keep only its last path component
+        # so the upload cannot be written outside temp_dir.
+        safe_name = os.path.basename((file.filename or "").replace("\\", "/"))
+        if safe_name in ("", ".", ".."):
+            safe_name = "upload.pdf"
+        temp_file_path = os.path.join(temp_dir, safe_name)
+        with open(temp_file_path, "wb") as temp_file:
+            temp_file.write(await file.read())
+        pdf_document = fitz.open(temp_file_path)
+        total_pages = pdf_document.page_count
+        pdf_document.close()
+        # Inside a coroutine the running loop is the one to schedule on.
+        loop = asyncio.get_running_loop()
+        md_content_with_base64_images, out_meta = await loop.run_in_executor(my_pool, process_file_with_multiprocessing, temp_file_path)
+
+        end_time = time.time()
+        duration = end_time - start_time
+        print(safe_name+"Total time:", duration)
+        return {
+                "success": True,
+                "message": "",
+                "data": {
+                    "markdown": md_content_with_base64_images,
+                    "page": total_pages,
+                    "duration": duration
+                }
+            }
+
+    except Exception as e:
+        logger.exception(e)
+        raise HTTPException(status_code=500, detail=f"错误信息: {str(e)}")
+
+    finally:
+        # Remove the uploaded copy whether or not the conversion succeeded.
+        if temp_file_path and os.path.isfile(temp_file_path):
+            os.remove(temp_file_path)
+def img_to_base64(img_path):
+    """Read the file at ``img_path`` and return its bytes as a base64 text string."""
+    with open(img_path, "rb") as img_file:
+        raw = img_file.read()
+    return base64.b64encode(raw).decode('utf-8')
+def embed_images_as_base64(md_content, image_dir):
+    """Inline images referenced by Markdown image lines as base64 data URIs.
+
+    Only lines that start with ``![`` are considered. The referenced file is
+    looked up by base name inside ``image_dir``; lines whose image is missing
+    are left unchanged.
+    """
+    rewritten = []
+    for text in md_content.split('\n'):
+        looks_like_image = text.startswith("![") and "](" in text and ")" in text
+        if not looks_like_image:
+            rewritten.append(text)
+            continue
+
+        # The link target sits between "](" and the next ")".
+        path_begin = text.index("](") + 2
+        path_end = text.index(")", path_begin)
+        candidate = os.path.join(image_dir, os.path.basename(text[path_begin:path_end]))
+
+        if not os.path.exists(candidate):
+            rewritten.append(text)
+            continue
+
+        encoded = img_to_base64(candidate)
+        rewritten.append(f'{text[:path_begin]}data:image/png;base64,{encoded}{text[path_end:]}')
+    return '\n'.join(rewritten)
+
+if __name__ == "__main__":
+    # Script entry point: serve the API on all interfaces, port 7231.
+    import uvicorn
+    uvicorn.run(app, host="0.0.0.0", port=7231)
+
--- a/plugins/model/pdf-marker/pip.conf
+++ b/plugins/model/pdf-marker/pip.conf
@@ -0,0 +1,5 @@
+[global]
+timeout=60
+index-url=https://pypi.tuna.tsinghua.edu.cn/simple
+[install]
+trusted-host=pypi.tuna.tsinghua.edu.cn
--- a/plugins/model/pdf-marker/requirements.txt
+++ b/plugins/model/pdf-marker/requirements.txt
@@ -0,0 +1,108 @@
+acres==0.1.0
+aiofiles==24.1.0
+annotated-types==0.7.0
+anyio==4.6.2.post1
+certifi==2024.8.30
+charset-normalizer==3.4.0
+ci-info==0.3.0
+click==8.1.7
+coloredlogs==15.0.1
+configobj==5.0.9
+configparser==7.1.0
+dol==0.2.83
+etelemetry==0.3.1
+fastapi==0.115.5
+filelock==3.16.1
+filetype==1.2.0
+flatbuffers==24.3.25
+frontend==0.0.3
+fsspec==2024.10.0
+ftfy==6.3.1
+h11==0.14.0
+httplib2==0.22.0
+huggingface-hub==0.26.2
+humanfriendly==10.0
+i2==0.1.36
+idna==3.10
+importlib_resources==6.4.5
+isodate==0.6.1
+itsdangerous==2.2.0
+Jinja2==3.1.4
+joblib==1.4.2
+loguru==0.7.2
+looseversion==1.3.0
+lxml==5.3.0
+marker-pdf==0.3.10
+MarkupSafe==3.0.2
+mpmath==1.3.0
+networkx==3.4.2
+nibabel==5.3.2
+nipype==1.9.1
+numpy==2.1.3
+nvidia-cublas-cu12==12.4.5.8
+nvidia-cuda-cupti-cu12==12.4.127
+nvidia-cuda-nvrtc-cu12==12.4.127
+nvidia-cuda-runtime-cu12==12.4.127
+nvidia-cudnn-cu12==9.1.0.70
+nvidia-cufft-cu12==11.2.1.3
+nvidia-curand-cu12==10.3.5.147
+nvidia-cusolver-cu12==11.6.1.9
+nvidia-cusparse-cu12==12.3.1.170
+nvidia-nccl-cu12==2.21.5
+nvidia-nvjitlink-cu12==12.4.127
+nvidia-nvtx-cu12==12.4.127
+onnxruntime==1.20.1
+opencv-python==4.10.0.84
+opencv-python-headless==4.10.0.84
+packaging==24.2
+pandas==2.2.3
+pathlib==1.0.1
+pdftext==0.3.19
+pillow==10.4.0
+pip==24.3.1
+protobuf==5.28.3
+prov==2.0.1
+puremagic==1.28
+pydantic==2.10.0
+pydantic_core==2.27.0
+pydantic-settings==2.6.1
+pydot==3.0.2
+PyMuPDF==1.24.14
+pyparsing==3.2.0
+pypdfium2==4.30.0
+python-dateutil==2.9.0.post0
+python-dotenv==1.0.1
+python-multipart==0.0.17
+pytz==2024.2
+pyxnat==1.6.2
+PyYAML==6.0.2
+RapidFuzz==3.10.1
+rdflib==6.3.2
+regex==2024.11.6
+requests==2.32.3
+safetensors==0.4.5
+scikit-learn==1.5.2
+scipy==1.14.1
+setuptools==75.6.0
+simplejson==3.19.3
+six==1.16.0
+sniffio==1.3.1
+starlette==0.41.3
+surya-ocr==0.6.13
+sympy==1.13.1
+tabled-pdf==0.1.4
+tabulate==0.9.0
+texify==0.2.1
+threadpoolctl==3.5.0
+tokenizers==0.20.3
+torch==2.5.1
+tqdm==4.67.0
+traits==6.4.3
+transformers==4.46.3
+triton==3.1.0
+typing_extensions==4.12.2
+tzdata==2024.2
+urllib3==2.2.3
+uvicorn==0.32.1
+wcwidth==0.2.13
+wheel==0.45.0
--- a/plugins/model/pdf-marker/test/test.py
+++ b/plugins/model/pdf-marker/test/test.py
@@ -0,0 +1,26 @@
+import json
+import os
+from io import BytesIO
+import requests
+from multiprocessing import Process
+def request_(file_path):
+    """Download the PDF at URL ``file_path`` and post it to the local parse service.
+
+    Prints the parsed JSON on success; otherwise prints the failing status
+    code (plus the response body when the parse request itself fails).
+    """
+    url = "http://127.0.0.1:7231/v1/parse/file"
+    download = requests.get(file_path)
+    if download.status_code != 200:
+        # A failed download used to be ignored silently; report it instead.
+        print(f"Download failed with status code: {download.status_code} ({file_path})")
+        return
+
+    pdf_name = os.path.basename(file_path)
+    files = {'file': (pdf_name, BytesIO(download.content), 'application/pdf')}
+    response = requests.post(url, files=files)
+    if response.status_code == 200:
+        print("Response JSON:", json.dumps(response.json(), indent=4, ensure_ascii=False))
+    else:
+        print(f"Request failed with status code: {response.status_code}")
+        print(response.text)
+
+if __name__ == "__main__":
+    # Sample PDFs; each one is sent from its own process, started without waiting for the others.
+    file_paths = ["https://objectstorageapi.bja.sealos.run/czrn86r1-yyh/english_test.pdf", "https://objectstorageapi.bja.sealos.run/czrn86r1-yyh/chinese_test.pdf",
+                 "https://objectstorageapi.bja.sealos.run/czrn86r1-yyh/ocr_test.pdf","https://objectstorageapi.bja.sealos.run/czrn86r1-yyh/english_file/3649329.3658477.pdf"]
+    for file_path in file_paths:
+        # args must be a tuple: "(file_path)" is just a string, which Process
+        # would unpack character by character into positional arguments.
+        p = Process(target=request_, args=(file_path,))
+        p.start()
+
--- a/plugins/model/rerank-bge/README.md
+++ b/plugins/model/rerank-bge/README.md
@@ -0,0 +1,114 @@
+# 接入 bge-rerank 重排模型
+
+## 不同模型推荐配置
+
+推荐配置如下：
+
+| 模型名           | 内存  | 显存  | 硬盘空间 | 启动命令      |
+| ---------------- | ----- | ----- | -------- | ------------- |
+| bge-reranker-base  | >=4GB | >=4GB | >=8GB    | python app.py |
+| bge-reranker-large | >=8GB | >=8GB | >=8GB    | python app.py |
+| bge-reranker-v2-m3 | >=8GB | >=8GB | >=8GB    | python app.py |
+
+## 源码部署
+
+### 1. 安装环境
+
+- Python 3.9, 3.10
+- CUDA 11.7
+- 科学上网环境
+
+### 2. 下载代码
+
+3 个模型代码分别为：
+
+1. [https://github.com/labring/FastGPT/tree/main/plugins/model/rerank-bge/bge-reranker-base](https://github.com/labring/FastGPT/tree/main/plugins/model/rerank-bge/bge-reranker-base)
+2. [https://github.com/labring/FastGPT/tree/main/plugins/model/rerank-bge/bge-reranker-large](https://github.com/labring/FastGPT/tree/main/plugins/model/rerank-bge/bge-reranker-large)
+3. [https://github.com/labring/FastGPT/tree/main/plugins/model/rerank-bge/bge-reranker-v2-m3](https://github.com/labring/FastGPT/tree/main/plugins/model/rerank-bge/bge-reranker-v2-m3)
+
+### 3. 安装依赖
+
+```sh
+pip install -r requirements.txt
+```
+
+### 4. 下载模型
+
+3个模型的 huggingface 仓库地址如下：
+
+1. [https://huggingface.co/BAAI/bge-reranker-base](https://huggingface.co/BAAI/bge-reranker-base)
+2. [https://huggingface.co/BAAI/bge-reranker-large](https://huggingface.co/BAAI/bge-reranker-large)
+3. [https://huggingface.co/BAAI/bge-reranker-v2-m3](https://huggingface.co/BAAI/bge-reranker-v2-m3)
+
+在对应代码目录下 clone 模型。目录结构：
+
+```
+bge-reranker-base/
+app.py
+Dockerfile
+requirements.txt
+```
+
+### 5. 运行代码
+
+```bash
+python app.py
+```
+
+启动成功后应该会显示如下地址：
+
+![](./rerank1.png)
+
+> 这里的 `http://0.0.0.0:6006` 就是请求地址。
+
+## docker 部署
+
+**镜像名分别为:**
+
+1. registry.cn-hangzhou.aliyuncs.com/fastgpt/bge-rerank-base:v0.1
+2. registry.cn-hangzhou.aliyuncs.com/fastgpt/bge-rerank-large:v0.1
+3. registry.cn-hangzhou.aliyuncs.com/fastgpt/bge-rerank-v2-m3:v0.1
+
+**端口**
+
+6006
+
+**环境变量**
+
+```
+ACCESS_TOKEN=访问安全凭证，请求时，Authorization: Bearer ${ACCESS_TOKEN}
+```
+
+**运行命令示例**
+
+```sh
+# auth token 为mytoken
+docker run -d --name reranker -p 6006:6006 -e ACCESS_TOKEN=mytoken --gpus all registry.cn-hangzhou.aliyuncs.com/fastgpt/bge-rerank-base:v0.1
+```
+
+**docker-compose.yml示例**
+
+```
+version: "3"
+services:
+  reranker:
+    image: registry.cn-hangzhou.aliyuncs.com/fastgpt/bge-rerank-base:v0.1
+    container_name: reranker
+    # GPU 运行环境；如果宿主机未安装 GPU 运行环境，将 deploy 配置注释掉即可
+    deploy:
+      resources:
+        reservations:
+          devices:
+          - driver: nvidia
+            count: all
+            capabilities: [gpu]
+    ports:
+      - 6006:6006
+    environment:
+      - ACCESS_TOKEN=mytoken
+
+```
+
+## 接入 FastGPT
+
+参考 [ReRank模型接入](https://doc.tryfastgpt.ai/docs/development/configuration/#rerank-接入)
--- a/plugins/model/rerank-bge/bge-reranker-base/Dockerfile
+++ b/plugins/model/rerank-bge/bge-reranker-base/Dockerfile
@@ -0,0 +1,12 @@
+FROM pytorch/pytorch:2.0.1-cuda11.7-cudnn8-runtime
+
+# please download the model from https://huggingface.co/BAAI/bge-reranker-base and put it in the same directory as Dockerfile
+COPY ./bge-reranker-base ./bge-reranker-base
+
+COPY requirements.txt .
+
+RUN python3 -m pip install -r requirements.txt -i https://pypi.tuna.tsinghua.edu.cn/simple
+
+COPY app.py Dockerfile .
+
+ENTRYPOINT python3 app.py
--- a/plugins/model/rerank-bge/bge-reranker-base/app.py
+++ b/plugins/model/rerank-bge/bge-reranker-base/app.py
@@ -0,0 +1,88 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+"""
+@Time: 2023/11/7 22:45
+@Author: zhidong
+@File: reranker.py
+@Desc:
+"""
+import os
+import numpy as np
+import logging
+import uvicorn
+import datetime
+from fastapi import FastAPI, Security, HTTPException
+from fastapi.security import HTTPBearer, HTTPAuthorizationCredentials
+from FlagEmbedding import FlagReranker
+from pydantic import Field, BaseModel, validator
+from typing import Optional, List
+
+app = FastAPI()
+security = HTTPBearer()
+env_bearer_token = 'ACCESS_TOKEN'
+
+class QADocs(BaseModel):
+    """Request body of the rerank endpoint: one query plus the candidate documents."""
+    query: Optional[str]
+    documents: Optional[List[str]]
+
+
+class Singleton(type):
+    """Metaclass that builds a class's instance once and hands the same object back afterwards."""
+
+    def __call__(cls, *args, **kwargs):
+        # Once ``_instance`` exists, later calls return it and ignore their arguments.
+        if hasattr(cls, '_instance'):
+            return cls._instance
+        created = super().__call__(*args, **kwargs)
+        cls._instance = created
+        return cls._instance
+
+
+RERANK_MODEL_PATH = os.path.join(os.path.dirname(__file__), "bge-reranker-base")
+
+class ReRanker(metaclass=Singleton):
+    """Wrapper around a ``FlagReranker`` model; ``Singleton`` makes it a single shared instance."""
+
+    def __init__(self, model_path):
+        self.reranker = FlagReranker(model_path, use_fp16=False)
+
+    def compute_score(self, pairs: List[List[str]]):
+        """Score ``pairs`` with the model, requesting normalized scores.
+
+        Returns ``None`` when ``pairs`` is empty; a bare float result is
+        wrapped into a one-element list, anything else is returned as-is.
+        """
+        if not len(pairs) > 0:
+            return None
+        scores = self.reranker.compute_score(pairs, normalize=True)
+        return [scores] if isinstance(scores, float) else scores
+
+class Chat(object):
+    """Ranks documents against a query using the shared ``ReRanker``."""
+
+    def __init__(self, rerank_model_path: str = RERANK_MODEL_PATH):
+        self.reranker = ReRanker(rerank_model_path)
+
+    def fit_query_answer_rerank(self, query_docs: QADocs) -> List:
+        """Score every document against the query and rank the results.
+
+        Returns a list of ``{"index", "relevance_score"}`` dicts sorted by
+        score, highest first; ``index`` is the document's position in the
+        request. Returns ``[]`` when there is nothing to rank.
+        """
+        # ``documents`` is Optional, so guard against None as well as empty.
+        if query_docs is None or not query_docs.documents:
+            return []
+
+        pairs = [[query_docs.query, doc] for doc in query_docs.documents]
+        scores = self.reranker.compute_score(pairs)
+
+        # sorted() is stable, so equally scored documents keep request order.
+        ranked = sorted(enumerate(scores), key=lambda item: item[1], reverse=True)
+        return [{"index": index, "relevance_score": score} for index, score in ranked]
+
+@app.post('/v1/rerank')
+async def handle_post_request(docs: QADocs, credentials: HTTPAuthorizationCredentials = Security(security)):
+    """Handle ``POST /v1/rerank``: verify the bearer token, then rerank ``docs``.
+
+    Raises ``HTTPException(401)`` when the token differs from
+    ``env_bearer_token``. Reranking errors are printed and reported in an
+    ``{"error": ...}`` body instead of being raised.
+    """
+    supplied = credentials.credentials
+    token_ok = env_bearer_token is None or supplied == env_bearer_token
+    if not token_ok:
+        raise HTTPException(status_code=401, detail="Invalid token")
+    chat = Chat()
+    try:
+        return {"results": chat.fit_query_answer_rerank(docs)}
+    except Exception as e:
+        print(f"报错：\n{e}")
+        return {"error": "重排出错"}
+
+if __name__ == "__main__":
+    # Use ACCESS_TOKEN from the environment as the expected bearer token. If it
+    # is unset, the module-level default (the literal string 'ACCESS_TOKEN')
+    # stays in effect.
+    token = os.getenv("ACCESS_TOKEN")
+    if token is not None:
+        env_bearer_token = token
+    try:
+        # Serve on all interfaces, port 6006; startup errors are printed.
+        uvicorn.run(app, host='0.0.0.0', port=6006)
+    except Exception as e:
+        print(f"API启动失败！\n报错：\n{e}")
--- a/plugins/model/rerank-bge/bge-reranker-base/requirements.txt
+++ b/plugins/model/rerank-bge/bge-reranker-base/requirements.txt
@@ -0,0 +1,7 @@
+fastapi==0.104.1
+transformers[sentencepiece]
+FlagEmbedding==1.2.8
+pydantic==1.10.13
+uvicorn==0.17.6
+itsdangerous
+protobuf
--- a/plugins/model/rerank-bge/bge-reranker-large/Dockerfile
+++ b/plugins/model/rerank-bge/bge-reranker-large/Dockerfile
@@ -0,0 +1,12 @@
+FROM pytorch/pytorch:2.0.1-cuda11.7-cudnn8-runtime
+
+# please download the model from https://huggingface.co/BAAI/bge-reranker-large and put it in the same directory as Dockerfile
+COPY ./bge-reranker-large ./bge-reranker-large
+
+COPY requirements.txt .
+
+RUN python3 -m pip install -r requirements.txt -i https://pypi.tuna.tsinghua.edu.cn/simple
+
+COPY app.py Dockerfile .
+
+ENTRYPOINT python3 app.py
--- a/plugins/model/rerank-bge/bge-reranker-large/app.py
+++ b/plugins/model/rerank-bge/bge-reranker-large/app.py
@@ -0,0 +1,88 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+"""
+@Time: 2023/11/7 22:45
+@Author: zhidong
+@File: reranker.py
+@Desc:
+"""
+import os
+import numpy as np
+import logging
+import uvicorn
+import datetime
+from fastapi import FastAPI, Security, HTTPException
+from fastapi.security import HTTPBearer, HTTPAuthorizationCredentials
+from FlagEmbedding import FlagReranker
+from pydantic import Field, BaseModel, validator
+from typing import Optional, List
+
+app = FastAPI()
+security = HTTPBearer()
+env_bearer_token = 'ACCESS_TOKEN'
+
+class QADocs(BaseModel):
+    """Request body of the rerank endpoint: one query plus the candidate documents."""
+    query: Optional[str]
+    documents: Optional[List[str]]
+
+
+class Singleton(type):
+    """Metaclass that builds a class's instance once and hands the same object back afterwards."""
+
+    def __call__(cls, *args, **kwargs):
+        # Once ``_instance`` exists, later calls return it and ignore their arguments.
+        if hasattr(cls, '_instance'):
+            return cls._instance
+        created = super().__call__(*args, **kwargs)
+        cls._instance = created
+        return cls._instance
+
+
+RERANK_MODEL_PATH = os.path.join(os.path.dirname(__file__), "bge-reranker-large")
+
+class ReRanker(metaclass=Singleton):
+    """Wrapper around a ``FlagReranker`` model; ``Singleton`` makes it a single shared instance."""
+
+    def __init__(self, model_path):
+        self.reranker = FlagReranker(model_path, use_fp16=False)
+
+    def compute_score(self, pairs: List[List[str]]):
+        """Score ``pairs`` with the model, requesting normalized scores.
+
+        Returns ``None`` when ``pairs`` is empty; a bare float result is
+        wrapped into a one-element list, anything else is returned as-is.
+        """
+        if not len(pairs) > 0:
+            return None
+        scores = self.reranker.compute_score(pairs, normalize=True)
+        return [scores] if isinstance(scores, float) else scores
+
+class Chat(object):
+    """Ranks documents against a query using the shared ``ReRanker``."""
+
+    def __init__(self, rerank_model_path: str = RERANK_MODEL_PATH):
+        self.reranker = ReRanker(rerank_model_path)
+
+    def fit_query_answer_rerank(self, query_docs: QADocs) -> List:
+        """Score every document against the query and rank the results.
+
+        Returns a list of ``{"index", "relevance_score"}`` dicts sorted by
+        score, highest first; ``index`` is the document's position in the
+        request. Returns ``[]`` when there is nothing to rank.
+        """
+        # ``documents`` is Optional, so guard against None as well as empty.
+        if query_docs is None or not query_docs.documents:
+            return []
+
+        pairs = [[query_docs.query, doc] for doc in query_docs.documents]
+        scores = self.reranker.compute_score(pairs)
+
+        # sorted() is stable, so equally scored documents keep request order.
+        ranked = sorted(enumerate(scores), key=lambda item: item[1], reverse=True)
+        return [{"index": index, "relevance_score": score} for index, score in ranked]
+
+@app.post('/v1/rerank')
+async def handle_post_request(docs: QADocs, credentials: HTTPAuthorizationCredentials = Security(security)):
+    """Handle ``POST /v1/rerank``: verify the bearer token, then rerank ``docs``.
+
+    Raises ``HTTPException(401)`` when the token differs from
+    ``env_bearer_token``. Reranking errors are printed and reported in an
+    ``{"error": ...}`` body instead of being raised.
+    """
+    supplied = credentials.credentials
+    token_ok = env_bearer_token is None or supplied == env_bearer_token
+    if not token_ok:
+        raise HTTPException(status_code=401, detail="Invalid token")
+    chat = Chat()
+    try:
+        return {"results": chat.fit_query_answer_rerank(docs)}
+    except Exception as e:
+        print(f"报错：\n{e}")
+        return {"error": "重排出错"}
+
+if __name__ == "__main__":
+    # Use ACCESS_TOKEN from the environment as the expected bearer token. If it
+    # is unset, the module-level default (the literal string 'ACCESS_TOKEN')
+    # stays in effect.
+    token = os.getenv("ACCESS_TOKEN")
+    if token is not None:
+        env_bearer_token = token
+    try:
+        # Serve on all interfaces, port 6006; startup errors are printed.
+        uvicorn.run(app, host='0.0.0.0', port=6006)
+    except Exception as e:
+        print(f"API启动失败！\n报错：\n{e}")
--- a/plugins/model/rerank-bge/bge-reranker-large/requirements.txt
+++ b/plugins/model/rerank-bge/bge-reranker-large/requirements.txt
@@ -0,0 +1,7 @@
+fastapi==0.104.1
+transformers[sentencepiece]
+FlagEmbedding==1.2.8
+pydantic==1.10.13
+uvicorn==0.17.6
+itsdangerous
+protobuf
--- a/plugins/model/rerank-bge/bge-reranker-v2-m3/Dockerfile
+++ b/plugins/model/rerank-bge/bge-reranker-v2-m3/Dockerfile
@@ -0,0 +1,12 @@
+FROM pytorch/pytorch:2.0.1-cuda11.7-cudnn8-runtime
+
+# please download the model from https://huggingface.co/BAAI/bge-reranker-v2-m3 and put it in the same directory as Dockerfile
+COPY ./bge-reranker-v2-m3 ./bge-reranker-v2-m3
+
+COPY requirements.txt .
+
+RUN python3 -m pip install -r requirements.txt -i https://pypi.tuna.tsinghua.edu.cn/simple
+
+COPY app.py Dockerfile .
+
+ENTRYPOINT python3 app.py
--- a/plugins/model/rerank-bge/bge-reranker-v2-m3/app.py
+++ b/plugins/model/rerank-bge/bge-reranker-v2-m3/app.py
@@ -0,0 +1,88 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+"""
+@Time: 2023/11/7 22:45
+@Author: zhidong
+@File: reranker.py
+@Desc:
+"""
+import os
+import numpy as np
+import logging
+import uvicorn
+import datetime
+from fastapi import FastAPI, Security, HTTPException
+from fastapi.security import HTTPBearer, HTTPAuthorizationCredentials
+from FlagEmbedding import FlagReranker
+from pydantic import Field, BaseModel, validator
+from typing import Optional, List
+
+app = FastAPI()
+security = HTTPBearer()
+env_bearer_token = 'ACCESS_TOKEN'
+
+class QADocs(BaseModel):
+    """Request body of the rerank endpoint: one query plus the candidate documents."""
+    query: Optional[str]
+    documents: Optional[List[str]]
+
+
+class Singleton(type):
+    """Metaclass that builds a class's instance once and hands the same object back afterwards."""
+
+    def __call__(cls, *args, **kwargs):
+        # Once ``_instance`` exists, later calls return it and ignore their arguments.
+        if hasattr(cls, '_instance'):
+            return cls._instance
+        created = super().__call__(*args, **kwargs)
+        cls._instance = created
+        return cls._instance
+
+
+RERANK_MODEL_PATH = os.path.join(os.path.dirname(__file__), "bge-reranker-v2-m3")
+
+class ReRanker(metaclass=Singleton):
+    """Wrapper around a ``FlagReranker`` model; ``Singleton`` makes it a single shared instance."""
+
+    def __init__(self, model_path):
+        self.reranker = FlagReranker(model_path, use_fp16=False)
+
+    def compute_score(self, pairs: List[List[str]]):
+        """Score ``pairs`` with the model, requesting normalized scores.
+
+        Returns ``None`` when ``pairs`` is empty; a bare float result is
+        wrapped into a one-element list, anything else is returned as-is.
+        """
+        if not len(pairs) > 0:
+            return None
+        scores = self.reranker.compute_score(pairs, normalize=True)
+        return [scores] if isinstance(scores, float) else scores
+
+class Chat(object):
+    """Ranks documents against a query using the shared ``ReRanker``."""
+
+    def __init__(self, rerank_model_path: str = RERANK_MODEL_PATH):
+        self.reranker = ReRanker(rerank_model_path)
+
+    def fit_query_answer_rerank(self, query_docs: QADocs) -> List:
+        """Score every document against the query and rank the results.
+
+        Returns a list of ``{"index", "relevance_score"}`` dicts sorted by
+        score, highest first; ``index`` is the document's position in the
+        request. Returns ``[]`` when there is nothing to rank.
+        """
+        # ``documents`` is Optional, so guard against None as well as empty.
+        if query_docs is None or not query_docs.documents:
+            return []
+
+        pairs = [[query_docs.query, doc] for doc in query_docs.documents]
+        scores = self.reranker.compute_score(pairs)
+
+        # sorted() is stable, so equally scored documents keep request order.
+        ranked = sorted(enumerate(scores), key=lambda item: item[1], reverse=True)
+        return [{"index": index, "relevance_score": score} for index, score in ranked]
+
+@app.post('/v1/rerank')
+async def handle_post_request(docs: QADocs, credentials: HTTPAuthorizationCredentials = Security(security)):
+    """Handle ``POST /v1/rerank``: verify the bearer token, then rerank ``docs``.
+
+    Raises ``HTTPException(401)`` when the token differs from
+    ``env_bearer_token``. Reranking errors are printed and reported in an
+    ``{"error": ...}`` body instead of being raised.
+    """
+    supplied = credentials.credentials
+    token_ok = env_bearer_token is None or supplied == env_bearer_token
+    if not token_ok:
+        raise HTTPException(status_code=401, detail="Invalid token")
+    chat = Chat()
+    try:
+        return {"results": chat.fit_query_answer_rerank(docs)}
+    except Exception as e:
+        print(f"报错：\n{e}")
+        return {"error": "重排出错"}
+
+if __name__ == "__main__":
+    # Use ACCESS_TOKEN from the environment as the expected bearer token. If it
+    # is unset, the module-level default (the literal string 'ACCESS_TOKEN')
+    # stays in effect.
+    token = os.getenv("ACCESS_TOKEN")
+    if token is not None:
+        env_bearer_token = token
+    try:
+        # Serve on all interfaces, port 6006; startup errors are printed.
+        uvicorn.run(app, host='0.0.0.0', port=6006)
+    except Exception as e:
+        print(f"API启动失败！\n报错：\n{e}")
--- a/plugins/model/rerank-bge/bge-reranker-v2-m3/requirements.txt
+++ b/plugins/model/rerank-bge/bge-reranker-v2-m3/requirements.txt
@@ -0,0 +1,7 @@
+fastapi==0.104.1
+transformers[sentencepiece]
+FlagEmbedding==1.2.8
+pydantic==1.10.13
+uvicorn==0.17.6
+itsdangerous
+protobuf
--- a/plugins/model/rerank-bge/rerank1.png
+++ b/plugins/model/rerank-bge/rerank1.png
--- a/plugins/model/stt-sensevoice/Dockerfile
+++ b/plugins/model/stt-sensevoice/Dockerfile
@@ -0,0 +1,12 @@
+#FROM yiminger/sensevoice:latest
+FROM pytorch/pytorch:2.1.2-cuda12.1-cudnn8-runtime
+
+COPY ./app /app
+
+WORKDIR /app
+
+#COPY main.py /app/main.py
+
+RUN pip3 install -r requirements.txt -i https://pypi.tuna.tsinghua.edu.cn/simple
+
+CMD ["python3","main.py"]
--- a/plugins/model/stt-sensevoice/app/=1.13
+++ b/plugins/model/stt-sensevoice/app/=1.13
@@ -0,0 +1,33 @@
+Looking in indexes: https://download.pytorch.org/whl/cpu
+Collecting torch
+  Downloading https://download.pytorch.org/whl/cpu/torch-2.3.1%2Bcpu-cp38-cp38-linux_x86_64.whl (190.4 MB)
+     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 190.4/190.4 MB 1.3 MB/s eta 0:00:00
+Collecting torchaudio
+  Downloading https://download.pytorch.org/whl/cpu/torchaudio-2.3.1%2Bcpu-cp38-cp38-linux_x86_64.whl (1.7 MB)
+     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 1.7/1.7 MB 22.6 MB/s eta 0:00:00
+Collecting typing-extensions>=4.8.0
+  Downloading https://download.pytorch.org/whl/typing_extensions-4.9.0-py3-none-any.whl (32 kB)
+Collecting networkx
+  Downloading https://download.pytorch.org/whl/networkx-3.2.1-py3-none-any.whl (1.6 MB)
+     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 1.6/1.6 MB 22.8 MB/s eta 0:00:00
+Collecting jinja2
+  Downloading https://download.pytorch.org/whl/Jinja2-3.1.3-py3-none-any.whl (133 kB)
+     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 133.2/133.2 kB 25.7 MB/s eta 0:00:00
+Collecting fsspec
+  Downloading https://download.pytorch.org/whl/fsspec-2024.2.0-py3-none-any.whl (170 kB)
+     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 170.9/170.9 kB 24.7 MB/s eta 0:00:00
+Collecting sympy
+  Downloading https://download.pytorch.org/whl/sympy-1.12-py3-none-any.whl (5.7 MB)
+     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 5.7/5.7 MB 22.4 MB/s eta 0:00:00
+Collecting filelock
+  Downloading https://download.pytorch.org/whl/filelock-3.13.1-py3-none-any.whl (11 kB)
+Collecting MarkupSafe>=2.0
+  Downloading https://download.pytorch.org/whl/MarkupSafe-2.1.5-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (26 kB)
+Collecting networkx
+  Downloading https://download.pytorch.org/whl/networkx-3.0-py3-none-any.whl (2.0 MB)
+     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 2.0/2.0 MB 22.7 MB/s eta 0:00:00
+Collecting mpmath>=0.19
+  Downloading https://download.pytorch.org/whl/mpmath-1.3.0-py3-none-any.whl (536 kB)
+     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 536.2/536.2 kB 106.6 MB/s eta 0:00:00
+Installing collected packages: mpmath, typing-extensions, sympy, networkx, MarkupSafe, fsspec, filelock, jinja2, torch, torchaudio
+Successfully installed MarkupSafe-2.1.5 filelock-3.13.1 fsspec-2024.2.0 jinja2-3.1.3 mpmath-1.3.0 networkx-3.0 sympy-1.12 torch-2.3.1+cpu torchaudio-2.3.1+cpu typing-extensions-4.9.0
--- a/plugins/model/stt-sensevoice/app/iic/SenseVoiceSmall/.mdl
+++ b/plugins/model/stt-sensevoice/app/iic/SenseVoiceSmall/.mdl
--- a/plugins/model/stt-sensevoice/app/iic/SenseVoiceSmall/.msc
+++ b/plugins/model/stt-sensevoice/app/iic/SenseVoiceSmall/.msc
--- a/plugins/model/stt-sensevoice/app/iic/SenseVoiceSmall/.mv
+++ b/plugins/model/stt-sensevoice/app/iic/SenseVoiceSmall/.mv
@@ -0,0 +1 @@
+Revision:master,CreatedAt:1720157464
--- a/plugins/model/stt-sensevoice/app/iic/SenseVoiceSmall/README.md
+++ b/plugins/model/stt-sensevoice/app/iic/SenseVoiceSmall/README.md
@@ -0,0 +1,210 @@
+---
+frameworks:
+- Pytorch
+license: Apache License 2.0
+tasks:
+- auto-speech-recognition
+
+#model-type:
+##如 gpt、phi、llama、chatglm、baichuan 等
+#- gpt
+
+#domain:
+##如 nlp、cv、audio、multi-modal
+#- nlp
+
+#language:
+##语言代码列表 https://help.aliyun.com/document_detail/215387.html?spm=a2c4g.11186623.0.0.9f8d7467kni6Aa
+#- cn 
+
+#metrics:
+##如 CIDEr、Blue、ROUGE 等
+#- CIDEr
+
+#tags:
+##各种自定义，包括 pretrained、fine-tuned、instruction-tuned、RL-tuned 等训练方法和其他
+#- pretrained
+
+#tools:
+##如 vllm、fastchat、llamacpp、AdaSeq 等
+#- vllm
+---
+
+# Highlights
+**SenseVoice**专注于高精度多语言语音识别、情感辨识和音频事件检测
+- **多语言识别：** 采用超过40万小时数据训练，支持超过50种语言，识别效果上优于Whisper模型。
+- **富文本识别：** 
+  - 具备优秀的情感识别，能够在测试数据上达到和超过目前最佳情感识别模型的效果。
+  - 支持声音事件检测能力，支持音乐、掌声、笑声、哭声、咳嗽、喷嚏等多种常见人机交互事件进行检测。
+- **高效推理：** SenseVoice-Small模型采用非自回归端到端框架，推理延迟极低，10s音频推理仅耗时70ms，15倍优于Whisper-Large。
+- **微调定制：** 具备便捷的微调脚本与策略，方便用户根据业务场景修复长尾样本问题。
+- **服务部署：** 具有完整的服务部署链路，支持多并发请求，支持的客户端语言包括 python、c++、html、java 与 c# 等。
+
+
+## <strong>[SenseVoice开源项目介绍]()</strong>
+<strong>[SenseVoice]()</strong>开源模型是多语言音频理解模型，具有包括语音识别、语种识别、语音情感识别，声学事件检测能力。
+
+[**github仓库**]()
+| [**最新动态**]()
+| [**环境安装**]()
+
+# 模型结构图
+SenseVoice多语言音频理解模型，支持语音识别、语种识别、语音情感识别、声学事件检测、逆文本正则化等能力，采用工业级数十万小时的标注音频进行模型训练，保证了模型的通用识别效果。模型可以被应用于中文、粤语、英语、日语、韩语音频识别，并输出带有情感和事件的富文本转写结果。
+
+<p align="center">
+<img src="fig/sensevoice.png" alt="SenseVoice模型结构"  width="1500" />
+</p>
+
+SenseVoice-Small是基于非自回归端到端框架模型，为了指定任务，我们在语音特征前添加四个嵌入作为输入传递给编码器：
+- LID：用于预测音频语种标签。
+- SER：用于预测音频情感标签。
+- AED：用于预测音频包含的事件标签。
+- ITN：用于指定识别输出文本是否进行逆文本正则化。
+
+
+# 用法
+
+## 推理
+
+### modelscope pipeline推理
+```python
+from modelscope.pipelines import pipeline
+from modelscope.utils.constant import Tasks
+
+inference_pipeline = pipeline(
+    task=Tasks.auto_speech_recognition,
+    model='iic/SenseVoiceSmall',
+    model_revision="master")
+
+rec_result = inference_pipeline('https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_audio/asr_example_zh.wav')
+print(rec_result)
+```
+
+### 直接推理
+
+```python
+from model import SenseVoiceSmall
+
+model_dir = "iic/SenseVoiceSmall"
+m, kwargs = SenseVoiceSmall.from_pretrained(model=model_dir)
+
+
+res = m.inference(
+    data_in="https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_audio/asr_example_zh.wav",
+    language="auto", # "zh", "en", "yue", "ja", "ko", "nospeech"
+    use_itn=False,
+    **kwargs,
+)
+
+print(res)
+```
+
+### 使用funasr推理
+
+```python
+from funasr import AutoModel
+
+model_dir = "iic/SenseVoiceSmall"
+input_file = (
+    "https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_audio/asr_example_zh.wav"
+)
+
+model = AutoModel(model=model_dir,
+                  vad_model="fsmn-vad",
+                  vad_kwargs={"max_single_segment_time": 30000},
+                  trust_remote_code=True, device="cuda:0")
+
+res = model.generate(
+    input=input_file,
+    cache={},
+    language="auto", # "zh", "en", "yue", "ja", "ko", "nospeech"
+    use_itn=False,
+    batch_size_s=0,
+)
+
+print(res)
+```
+
+funasr版本已经集成了vad模型，支持任意时长音频输入，`batch_size_s`单位为秒。
+如果输入均为短音频，并且需要批量化推理，为了加快推理效率，可以移除vad模型，并设置`batch_size`
+
+```python
+model = AutoModel(model=model_dir, trust_remote_code=True, device="cuda:0")
+
+res = model.generate(
+    input=input_file,
+    cache={},
+    language="auto", # "zh", "en", "yue", "ja", "ko", "nospeech"
+    use_itn=False,
+    batch_size=64,
+)
+```
+
+更多详细用法，请参考 [文档](https://github.com/modelscope/FunASR/blob/main/docs/tutorial/README.md)
+
+## 模型下载
+
+
+SDK下载
+```bash
+#安装ModelScope
+pip install modelscope
+```
+```python
+#SDK模型下载
+from modelscope import snapshot_download
+model_dir = snapshot_download('iic/SenseVoiceSmall')
+```
+Git下载
+```
+#Git模型下载
+git clone https://www.modelscope.cn/iic/SenseVoiceSmall.git
+```
+
+## 服务部署
+
+TODO（待补充）
+
+# Performance
+
+## 语音识别效果
+我们在开源基准数据集（包括 AISHELL-1、AISHELL-2、Wenetspeech、Librispeech和Common Voice）上比较了SenseVoice与Whisper的多语言语音识别性能和推理效率。在中文和粤语识别效果上，SenseVoice-Small模型具有明显的效果优势。
+
+<p align="center">
+<img src="fig/asr_results.png" alt="SenseVoice模型在开源测试集上的表现"  width="2500" />
+</p>
+
+
+
+## 情感识别效果
+由于目前缺乏被广泛使用的情感识别测试指标和方法，我们在多个测试集的多种指标进行测试，并与近年来Benchmark上的多个结果进行了全面的对比。所选取的测试集同时包含中文/英文两种语言以及表演、影视剧、自然对话等多种风格的数据，在不进行目标数据微调的前提下，SenseVoice能够在测试数据上达到和超过目前最佳情感识别模型的效果。
+
+<p align="center">
+<img src="fig/ser_table.png" alt="SenseVoice模型SER效果1"  width="1500" />
+</p>
+
+同时，我们还在测试集上对多个开源情感识别模型进行对比，结果表明，SenseVoice-Large模型在几乎所有数据集上都达到了最佳效果，而SenseVoice-Small模型同样可以在多数数据集上取得超越其他开源模型的效果。
+
+<p align="center">
+<img src="fig/ser_figure.png" alt="SenseVoice模型SER效果2"  width="500" />
+</p>
+
+## 事件检测效果
+
+尽管SenseVoice只在语音数据上进行训练，它仍然可以作为事件检测模型进行单独使用。我们在环境音分类ESC-50数据集上与目前业内广泛使用的BEATS与PANN模型的效果进行了对比。SenseVoice模型能够在这些任务上取得较好的效果，但受限于训练数据与训练方式，其事件分类效果与专业的事件检测模型相比仍然有一定的差距。
+
+<p align="center">
+<img src="fig/aed_figure.png" alt="SenseVoice模型AED效果"  width="500" />
+</p>
+
+
+
+## 推理效率
+SenseVoice-Small模型采用非自回归端到端架构，推理延迟极低。在参数量与Whisper-Small模型相当的情况下，比Whisper-Small模型推理速度快7倍，比Whisper-Large模型快17倍。同时SenseVoice-small模型在音频时长增加的情况下，推理耗时也无明显增加。
+
+
+<p align="center">
+<img src="fig/inference.png" alt="SenseVoice模型的推理效率"  width="1500" />
+</p>
+
+<p style="color: lightgrey;">如果您是本模型的贡献者，我们邀请您根据<a href="https://modelscope.cn/docs/ModelScope%E6%A8%A1%E5%9E%8B%E6%8E%A5%E5%85%A5%E6%B5%81%E7%A8%8B%E6%A6%82%E8%A7%88" style="color: lightgrey; text-decoration: underline;">模型贡献文档</a>，及时完善模型卡片内容。</p>
--- a/plugins/model/stt-sensevoice/app/iic/SenseVoiceSmall/am.mvn
+++ b/plugins/model/stt-sensevoice/app/iic/SenseVoiceSmall/am.mvn
--- a/plugins/model/stt-sensevoice/app/iic/SenseVoiceSmall/chn_jpn_yue_eng_ko_spectok.bpe.model
+++ b/plugins/model/stt-sensevoice/app/iic/SenseVoiceSmall/chn_jpn_yue_eng_ko_spectok.bpe.model
--- a/plugins/model/stt-sensevoice/app/iic/SenseVoiceSmall/config.yaml
+++ b/plugins/model/stt-sensevoice/app/iic/SenseVoiceSmall/config.yaml
@@ -0,0 +1,97 @@
+encoder: SenseVoiceEncoderSmall
+encoder_conf:
+    output_size: 512
+    attention_heads: 4
+    linear_units: 2048
+    num_blocks: 50
+    tp_blocks: 20
+    dropout_rate: 0.1
+    positional_dropout_rate: 0.1
+    attention_dropout_rate: 0.1
+    input_layer: pe
+    pos_enc_class: SinusoidalPositionEncoder
+    normalize_before: true
+    kernel_size: 11
+    sanm_shfit: 0
+    selfattention_layer_type: sanm
+
+
+model: SenseVoiceSmall
+model_conf:
+    length_normalized_loss: true
+    sos: 1
+    eos: 2
+    ignore_id: -1
+
+tokenizer: SentencepiecesTokenizer
+tokenizer_conf:
+  bpemodel: null
+  unk_symbol: <unk>
+  split_with_space: true
+
+frontend: WavFrontend
+frontend_conf:
+    fs: 16000
+    window: hamming
+    n_mels: 80
+    frame_length: 25
+    frame_shift: 10
+    lfr_m: 7
+    lfr_n: 6
+    cmvn_file: null
+
+
+dataset: SenseVoiceCTCDataset
+dataset_conf:
+  index_ds: IndexDSJsonl
+  batch_sampler: EspnetStyleBatchSampler
+  data_split_num: 32
+  batch_type: token
+  batch_size: 14000
+  max_token_length: 2000
+  min_token_length: 60
+  max_source_length: 2000
+  min_source_length: 60
+  max_target_length: 200
+  min_target_length: 0
+  shuffle: true
+  num_workers: 4
+  sos: ${model_conf.sos}
+  eos: ${model_conf.eos}
+  IndexDSJsonl: IndexDSJsonl
+  retry: 20
+
+train_conf:
+  accum_grad: 1
+  grad_clip: 5
+  max_epoch: 20
+  keep_nbest_models: 10
+  avg_nbest_model: 10
+  log_interval: 100
+  resume: true
+  validate_interval: 10000
+  save_checkpoint_interval: 10000
+
+optim: adamw
+optim_conf:
+  lr: 0.00002
+scheduler: warmuplr
+scheduler_conf:
+  warmup_steps: 25000
+
+specaug: SpecAugLFR
+specaug_conf:
+    apply_time_warp: false
+    time_warp_window: 5
+    time_warp_mode: bicubic
+    apply_freq_mask: true
+    freq_mask_width_range:
+    - 0
+    - 30
+    lfr_rate: 6
+    num_freq_mask: 1
+    apply_time_mask: true
+    time_mask_width_range:
+    - 0
+    - 12
+    num_time_mask: 1
--- a/plugins/model/stt-sensevoice/app/iic/SenseVoiceSmall/configuration.json
+++ b/plugins/model/stt-sensevoice/app/iic/SenseVoiceSmall/configuration.json
@@ -0,0 +1,14 @@
+{
+  "framework": "pytorch",
+  "task" : "auto-speech-recognition",
+  "model": {"type" : "funasr"},
+  "pipeline": {"type":"funasr-pipeline"},
+  "model_name_in_hub": {
+    "ms":"", 
+    "hf":""},
+  "file_path_metas": {
+    "init_param":"model.pt", 
+    "config":"config.yaml",
+    "tokenizer_conf": {"bpemodel": "chn_jpn_yue_eng_ko_spectok.bpe.model"},
+    "frontend_conf":{"cmvn_file": "am.mvn"}}
+}
--- a/plugins/model/stt-sensevoice/app/iic/SenseVoiceSmall/fig/aed_figure.png
+++ b/plugins/model/stt-sensevoice/app/iic/SenseVoiceSmall/fig/aed_figure.png
--- a/plugins/model/stt-sensevoice/app/iic/SenseVoiceSmall/fig/asr_results.png
+++ b/plugins/model/stt-sensevoice/app/iic/SenseVoiceSmall/fig/asr_results.png
--- a/plugins/model/stt-sensevoice/app/iic/SenseVoiceSmall/fig/inference.png
+++ b/plugins/model/stt-sensevoice/app/iic/SenseVoiceSmall/fig/inference.png
--- a/plugins/model/stt-sensevoice/app/iic/SenseVoiceSmall/fig/sensevoice.png
+++ b/plugins/model/stt-sensevoice/app/iic/SenseVoiceSmall/fig/sensevoice.png
--- a/plugins/model/stt-sensevoice/app/iic/SenseVoiceSmall/fig/ser_figure.png
+++ b/plugins/model/stt-sensevoice/app/iic/SenseVoiceSmall/fig/ser_figure.png
--- a/plugins/model/stt-sensevoice/app/iic/SenseVoiceSmall/fig/ser_table.png
+++ b/plugins/model/stt-sensevoice/app/iic/SenseVoiceSmall/fig/ser_table.png
--- a/plugins/model/stt-sensevoice/app/iic/speech_fsmn_vad_zh-cn-16k-common-pytorch/.mdl
+++ b/plugins/model/stt-sensevoice/app/iic/speech_fsmn_vad_zh-cn-16k-common-pytorch/.mdl
--- a/plugins/model/stt-sensevoice/app/iic/speech_fsmn_vad_zh-cn-16k-common-pytorch/.msc
+++ b/plugins/model/stt-sensevoice/app/iic/speech_fsmn_vad_zh-cn-16k-common-pytorch/.msc
--- a/plugins/model/stt-sensevoice/app/iic/speech_fsmn_vad_zh-cn-16k-common-pytorch/.mv
+++ b/plugins/model/stt-sensevoice/app/iic/speech_fsmn_vad_zh-cn-16k-common-pytorch/.mv
@@ -0,0 +1 @@
+Revision:master,CreatedAt:1707184291
--- a/plugins/model/stt-sensevoice/app/iic/speech_fsmn_vad_zh-cn-16k-common-pytorch/README.md
+++ b/plugins/model/stt-sensevoice/app/iic/speech_fsmn_vad_zh-cn-16k-common-pytorch/README.md
@@ -0,0 +1,296 @@
+---
+tasks:
+- voice-activity-detection
+domain:
+- audio
+model-type:
+- VAD model
+frameworks:
+- pytorch
+backbone:
+- fsmn
+metrics:
+- f1_score
+license: Apache License 2.0
+language: 
+- cn
+tags:
+- FunASR
+- FSMN
+- Alibaba
+- Online
+datasets:
+  train:
+  - 20,000 hour industrial Mandarin task
+  test:
+  - 20,000 hour industrial Mandarin task
+widgets:
+  - task: voice-activity-detection
+    model_revision: v2.0.4
+    inputs:
+      - type: audio
+        name: input
+        title: 音频
+    examples:
+      - name: 1
+        title: 示例1
+        inputs:
+          - name: input
+            data: git://example/vad_example.wav 
+    inferencespec:
+      cpu: 1 #CPU数量
+      memory: 4096
+---
+
+# FSMN-Monophone VAD 模型介绍
+
+[//]: # (FSMN-Monophone VAD 模型)
+
+## Highlight
+- 16k中文通用VAD模型：可用于检测长语音片段中有效语音的起止时间点。
+  - 基于[Paraformer-large长音频模型](https://www.modelscope.cn/models/damo/speech_paraformer-large-vad-punc_asr_nat-zh-cn-16k-common-vocab8404-pytorch/summary)场景的使用
+  - 基于[FunASR框架](https://github.com/alibaba-damo-academy/FunASR)，可进行ASR，VAD，[中文标点](https://www.modelscope.cn/models/damo/punc_ct-transformer_zh-cn-common-vocab272727-pytorch/summary)的自由组合
+  - 基于音频数据的有效语音片段起止时间点检测
+
+## <strong>[FunASR开源项目介绍](https://github.com/alibaba-damo-academy/FunASR)</strong>
+<strong>[FunASR](https://github.com/alibaba-damo-academy/FunASR)</strong>希望在语音识别的学术研究和工业应用之间架起一座桥梁。通过发布工业级语音识别模型的训练和微调，研究人员和开发人员可以更方便地进行语音识别模型的研究和生产，并推动语音识别生态的发展。让语音识别更有趣！
+
+[**github仓库**](https://github.com/alibaba-damo-academy/FunASR)
+| [**最新动态**](https://github.com/alibaba-damo-academy/FunASR#whats-new) 
+| [**环境安装**](https://github.com/alibaba-damo-academy/FunASR#installation)
+| [**服务部署**](https://www.funasr.com)
+| [**模型库**](https://github.com/alibaba-damo-academy/FunASR/tree/main/model_zoo)
+| [**联系我们**](https://github.com/alibaba-damo-academy/FunASR#contact)
+
+
+## 模型原理介绍
+
+FSMN-Monophone VAD是达摩院语音团队提出的高效语音端点检测模型，用于检测输入音频中有效语音的起止时间点信息，并将检测出来的有效音频片段输入识别引擎进行识别，减少无效语音带来的识别错误。
+
+<p align="center">
+<img src="fig/struct.png" alt="VAD模型结构"  width="500" />
+</p>
+
+FSMN-Monophone VAD模型结构如上图所示：模型结构层面，FSMN模型结构建模时可考虑上下文信息，训练和推理速度快，且时延可控；同时根据VAD模型size以及低时延的要求，对FSMN的网络结构、右看帧数进行了适配。在建模单元层面，speech信息比较丰富，仅用单类来表征学习能力有限，我们将单一speech类升级为Monophone。建模单元细分，可以避免参数平均，抽象学习能力增强，区分性更好。
+
+## 基于ModelScope进行推理
+
+- 推理支持音频格式如下：
+  - wav文件路径，例如：data/test/audios/vad_example.wav
+  - wav文件url，例如：https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_audio/vad_example.wav
+  - wav二进制数据，格式bytes，例如：用户直接从文件里读出bytes数据或者是麦克风录出bytes数据。
+  - 已解析的audio音频，例如：audio, rate = soundfile.read("vad_example_zh.wav")，类型为numpy.ndarray或者torch.Tensor。
+  - wav.scp文件，需符合如下要求：
+
+```sh
+cat wav.scp
+vad_example1  data/test/audios/vad_example1.wav
+vad_example2  data/test/audios/vad_example2.wav
+...
+```
+
+- 若输入格式为wav文件url，api调用方式可参考如下范例：
+
+```python
+from modelscope.pipelines import pipeline
+from modelscope.utils.constant import Tasks
+
+inference_pipeline = pipeline(
+    task=Tasks.voice_activity_detection,
+    model='iic/speech_fsmn_vad_zh-cn-16k-common-pytorch',
+    model_revision="v2.0.4",
+)
+
+segments_result = inference_pipeline(input='https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_audio/vad_example.wav')
+print(segments_result)
+```
+
+- 输入音频为pcm格式，调用api时需要传入音频采样率参数fs，例如：
+
+```python
+segments_result = inference_pipeline(input='https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_audio/vad_example.pcm', fs=16000)
+```
+
+- 若输入格式为文件wav.scp(注：文件名需要以.scp结尾)，可添加 output_dir 参数将识别结果写入文件中，参考示例如下：
+
+```python
+inference_pipeline(input="wav.scp", output_dir='./output_dir')
+```
+识别结果输出路径结构如下：
+
+```sh
+tree output_dir/
+output_dir/
+└── 1best_recog
+    └── text
+
+1 directory, 1 files
+```
+text：VAD检测语音起止时间点结果文件（单位：ms）
+
+- 若输入音频为已解析的audio音频，api调用方式可参考如下范例：
+
+```python
+import soundfile
+
+waveform, sample_rate = soundfile.read("vad_example_zh.wav")
+segments_result = inference_pipeline(input=waveform)
+print(segments_result)
+```
+
+- VAD常用参数调整说明（参考：vad.yaml文件）：
+  - max_end_silence_time：尾部连续检测到多长时间静音进行尾点判停，参数范围500ms～6000ms，默认值800ms(该值过低容易出现语音提前截断的情况)。
+  - speech_noise_thres：speech的得分减去noise的得分大于此值则判断为speech，参数范围：（-1,1）
+    - 取值越趋于-1，噪音被误判定为语音的概率越大，FA越高
+    - 取值越趋于+1，语音被误判定为噪音的概率越大，Pmiss越高
+    - 通常情况下，该值会根据当前模型在长语音测试集上的效果取balance
+    
+
+
+
+## 基于FunASR进行推理
+
+下面为快速上手教程，测试音频（[中文](https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_audio/vad_example.wav)，[英文](https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_audio/asr_example_en.wav)）
+
+### 可执行命令行
+在命令行终端执行：
+
+```shell
+funasr ++model=paraformer-zh ++vad_model="fsmn-vad" ++punc_model="ct-punc" ++input=vad_example.wav
+```
+
+注：支持单条音频文件识别，也支持文件列表，列表为kaldi风格wav.scp：`wav_id   wav_path`
+
+### python示例
+#### 非实时语音识别
+```python
+from funasr import AutoModel
+# paraformer-zh is a multi-functional asr model
+# use vad, punc, spk or not as you need
+model = AutoModel(model="paraformer-zh", model_revision="v2.0.4",
+                  vad_model="fsmn-vad", vad_model_revision="v2.0.4",
+                  punc_model="ct-punc-c", punc_model_revision="v2.0.4",
+                  # spk_model="cam++", spk_model_revision="v2.0.2",
+                  )
+res = model.generate(input=f"{model.model_path}/example/asr_example.wav", 
+            batch_size_s=300, 
+            hotword='魔搭')
+print(res)
+```
+注：`model_hub`：表示模型仓库，`ms`为选择modelscope下载，`hf`为选择huggingface下载。
+
+#### 实时语音识别
+
+```python
+from funasr import AutoModel
+
+chunk_size = [0, 10, 5] #[0, 10, 5] 600ms, [0, 8, 4] 480ms
+encoder_chunk_look_back = 4 #number of chunks to lookback for encoder self-attention
+decoder_chunk_look_back = 1 #number of encoder chunks to lookback for decoder cross-attention
+
+model = AutoModel(model="paraformer-zh-streaming", model_revision="v2.0.4")
+
+import soundfile
+import os
+
+wav_file = os.path.join(model.model_path, "example/asr_example.wav")
+speech, sample_rate = soundfile.read(wav_file)
+chunk_stride = chunk_size[1] * 960 # 600ms
+
+cache = {}
+total_chunk_num = int((len(speech)-1)/chunk_stride+1)
+for i in range(total_chunk_num):
+    speech_chunk = speech[i*chunk_stride:(i+1)*chunk_stride]
+    is_final = i == total_chunk_num - 1
+    res = model.generate(input=speech_chunk, cache=cache, is_final=is_final, chunk_size=chunk_size, encoder_chunk_look_back=encoder_chunk_look_back, decoder_chunk_look_back=decoder_chunk_look_back)
+    print(res)
+```
+
+注：`chunk_size`为流式延时配置，`[0,10,5]`表示上屏实时出字粒度为`10*60=600ms`，未来信息为`5*60=300ms`。每次推理输入为`600ms`（采样点数为`16000*0.6=9600`），输出为对应文字，最后一个语音片段输入需要设置`is_final=True`来强制输出最后一个字。
+
+#### 语音端点检测（非实时）
+```python
+from funasr import AutoModel
+
+model = AutoModel(model="fsmn-vad", model_revision="v2.0.4")
+
+wav_file = f"{model.model_path}/example/asr_example.wav"
+res = model.generate(input=wav_file)
+print(res)
+```
+
+#### 语音端点检测（实时）
+```python
+from funasr import AutoModel
+
+chunk_size = 200 # ms
+model = AutoModel(model="fsmn-vad", model_revision="v2.0.4")
+
+import soundfile
+
+wav_file = f"{model.model_path}/example/vad_example.wav"
+speech, sample_rate = soundfile.read(wav_file)
+chunk_stride = int(chunk_size * sample_rate / 1000)
+
+cache = {}
+total_chunk_num = int((len(speech)-1)/chunk_stride+1)
+for i in range(total_chunk_num):
+    speech_chunk = speech[i*chunk_stride:(i+1)*chunk_stride]
+    is_final = i == total_chunk_num - 1
+    res = model.generate(input=speech_chunk, cache=cache, is_final=is_final, chunk_size=chunk_size)
+    if len(res[0]["value"]):
+        print(res)
+```
+
+#### 标点恢复
+```python
+from funasr import AutoModel
+
+model = AutoModel(model="ct-punc", model_revision="v2.0.4")
+
+res = model.generate(input="那今天的会就到这里吧 happy new year 明年见")
+print(res)
+```
+
+#### 时间戳预测
+```python
+from funasr import AutoModel
+
+model = AutoModel(model="fa-zh", model_revision="v2.0.4")
+
+wav_file = f"{model.model_path}/example/asr_example.wav"
+text_file = f"{model.model_path}/example/text.txt"
+res = model.generate(input=(wav_file, text_file), data_type=("sound", "text"))
+print(res)
+```
+
+更多详细用法（[示例](https://github.com/alibaba-damo-academy/FunASR/tree/main/examples/industrial_data_pretraining)）
+
+
+## 微调
+
+详细用法（[示例](https://github.com/alibaba-damo-academy/FunASR/tree/main/examples/industrial_data_pretraining)）
+
+
+
+
+
+## 使用方式以及适用范围
+
+运行范围
+- 支持Linux-x86_64、Mac和Windows运行。
+
+使用方式
+- 直接推理：可以直接对长语音数据进行计算，得到有效语音片段的起止时间点信息（单位：ms）。
+
+## 相关论文以及引用信息
+
+```BibTeX
+@inproceedings{zhang2018deep,
+  title={Deep-FSMN for large vocabulary continuous speech recognition},
+  author={Zhang, Shiliang and Lei, Ming and Yan, Zhijie and Dai, Lirong},
+  booktitle={2018 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)},
+  pages={5869--5873},
+  year={2018},
+  organization={IEEE}
+}
+```
--- a/plugins/model/stt-sensevoice/app/iic/speech_fsmn_vad_zh-cn-16k-common-pytorch/am.mvn
+++ b/plugins/model/stt-sensevoice/app/iic/speech_fsmn_vad_zh-cn-16k-common-pytorch/am.mvn
@@ -0,0 +1,8 @@
+<Nnet>
+<Splice> 400 400
+[ 0 ]
+<AddShift> 400 400
+<LearnRateCoef> 0 [ -8.311879 -8.600912 -9.615928 -10.43595 -11.21292 -11.88333 -12.36243 -12.63706 -12.8818 -12.83066 -12.89103 -12.95666 -13.19763 -13.40598 -13.49113 -13.5546 -13.55639 -13.51915 -13.68284 -13.53289 -13.42107 -13.65519 -13.50713 -13.75251 -13.76715 -13.87408 -13.73109 -13.70412 -13.56073 -13.53488 -13.54895 -13.56228 -13.59408 -13.62047 -13.64198 -13.66109 -13.62669 -13.58297 -13.57387 -13.4739 -13.53063 -13.48348 -13.61047 -13.64716 -13.71546 -13.79184 -13.90614 -14.03098 -14.18205 -14.35881 -14.48419 -14.60172 -14.70591 -14.83362 -14.92122 -15.00622 -15.05122 -15.03119 -14.99028 -14.92302 -14.86927 -14.82691 -14.7972 -14.76909 -14.71356 -14.61277 -14.51696 -14.42252 -14.36405 -14.30451 -14.23161 -14.19851 -14.16633 -14.15649 -14.10504 -13.99518 -13.79562 -13.3996 -12.7767 -11.71208 -8.311879 -8.600912 -9.615928 -10.43595 -11.21292 -11.88333 -12.36243 -12.63706 -12.8818 -12.83066 -12.89103 -12.95666 -13.19763 -13.40598 -13.49113 -13.5546 -13.55639 -13.51915 -13.68284 -13.53289 -13.42107 -13.65519 -13.50713 -13.75251 -13.76715 -13.87408 -13.73109 -13.70412 -13.56073 -13.53488 -13.54895 -13.56228 -13.59408 -13.62047 -13.64198 -13.66109 -13.62669 -13.58297 -13.57387 -13.4739 -13.53063 -13.48348 -13.61047 -13.64716 -13.71546 -13.79184 -13.90614 -14.03098 -14.18205 -14.35881 -14.48419 -14.60172 -14.70591 -14.83362 -14.92122 -15.00622 -15.05122 -15.03119 -14.99028 -14.92302 -14.86927 -14.82691 -14.7972 -14.76909 -14.71356 -14.61277 -14.51696 -14.42252 -14.36405 -14.30451 -14.23161 -14.19851 -14.16633 -14.15649 -14.10504 -13.99518 -13.79562 -13.3996 -12.7767 -11.71208 -8.311879 -8.600912 -9.615928 -10.43595 -11.21292 -11.88333 -12.36243 -12.63706 -12.8818 -12.83066 -12.89103 -12.95666 -13.19763 -13.40598 -13.49113 -13.5546 -13.55639 -13.51915 -13.68284 -13.53289 -13.42107 -13.65519 -13.50713 -13.75251 -13.76715 -13.87408 -13.73109 -13.70412 -13.56073 -13.53488 -13.54895 -13.56228 -13.59408 -13.62047 -13.64198 -13.66109 -13.62669 -13.58297 -13.57387 
-13.4739 -13.53063 -13.48348 -13.61047 -13.64716 -13.71546 -13.79184 -13.90614 -14.03098 -14.18205 -14.35881 -14.48419 -14.60172 -14.70591 -14.83362 -14.92122 -15.00622 -15.05122 -15.03119 -14.99028 -14.92302 -14.86927 -14.82691 -14.7972 -14.76909 -14.71356 -14.61277 -14.51696 -14.42252 -14.36405 -14.30451 -14.23161 -14.19851 -14.16633 -14.15649 -14.10504 -13.99518 -13.79562 -13.3996 -12.7767 -11.71208 -8.311879 -8.600912 -9.615928 -10.43595 -11.21292 -11.88333 -12.36243 -12.63706 -12.8818 -12.83066 -12.89103 -12.95666 -13.19763 -13.40598 -13.49113 -13.5546 -13.55639 -13.51915 -13.68284 -13.53289 -13.42107 -13.65519 -13.50713 -13.75251 -13.76715 -13.87408 -13.73109 -13.70412 -13.56073 -13.53488 -13.54895 -13.56228 -13.59408 -13.62047 -13.64198 -13.66109 -13.62669 -13.58297 -13.57387 -13.4739 -13.53063 -13.48348 -13.61047 -13.64716 -13.71546 -13.79184 -13.90614 -14.03098 -14.18205 -14.35881 -14.48419 -14.60172 -14.70591 -14.83362 -14.92122 -15.00622 -15.05122 -15.03119 -14.99028 -14.92302 -14.86927 -14.82691 -14.7972 -14.76909 -14.71356 -14.61277 -14.51696 -14.42252 -14.36405 -14.30451 -14.23161 -14.19851 -14.16633 -14.15649 -14.10504 -13.99518 -13.79562 -13.3996 -12.7767 -11.71208 -8.311879 -8.600912 -9.615928 -10.43595 -11.21292 -11.88333 -12.36243 -12.63706 -12.8818 -12.83066 -12.89103 -12.95666 -13.19763 -13.40598 -13.49113 -13.5546 -13.55639 -13.51915 -13.68284 -13.53289 -13.42107 -13.65519 -13.50713 -13.75251 -13.76715 -13.87408 -13.73109 -13.70412 -13.56073 -13.53488 -13.54895 -13.56228 -13.59408 -13.62047 -13.64198 -13.66109 -13.62669 -13.58297 -13.57387 -13.4739 -13.53063 -13.48348 -13.61047 -13.64716 -13.71546 -13.79184 -13.90614 -14.03098 -14.18205 -14.35881 -14.48419 -14.60172 -14.70591 -14.83362 -14.92122 -15.00622 -15.05122 -15.03119 -14.99028 -14.92302 -14.86927 -14.82691 -14.7972 -14.76909 -14.71356 -14.61277 -14.51696 -14.42252 -14.36405 -14.30451 -14.23161 -14.19851 -14.16633 -14.15649 -14.10504 -13.99518 -13.79562 -13.3996 -12.7767 -11.71208 ]
+<Rescale> 400 400
+<LearnRateCoef> 0 [ 0.155775 0.154484 0.1527379 0.1518718 0.1506028 0.1489256 0.147067 0.1447061 0.1436307 0.1443568 0.1451849 0.1455157 0.1452821 0.1445717 0.1439195 0.1435867 0.1436018 0.1438781 0.1442086 0.1448844 0.1454756 0.145663 0.146268 0.1467386 0.1472724 0.147664 0.1480913 0.1483739 0.1488841 0.1493636 0.1497088 0.1500379 0.1502916 0.1505389 0.1506787 0.1507102 0.1505992 0.1505445 0.1505938 0.1508133 0.1509569 0.1512396 0.1514625 0.1516195 0.1516156 0.1515561 0.1514966 0.1513976 0.1512612 0.151076 0.1510596 0.1510431 0.151077 0.1511168 0.1511917 0.151023 0.1508045 0.1505885 0.1503493 0.1502373 0.1501726 0.1500762 0.1500065 0.1499782 0.150057 0.1502658 0.150469 0.1505335 0.1505505 0.1505328 0.1504275 0.1502438 0.1499674 0.1497118 0.1494661 0.1493102 0.1493681 0.1495501 0.1499738 0.1509654 0.155775 0.154484 0.1527379 0.1518718 0.1506028 0.1489256 0.147067 0.1447061 0.1436307 0.1443568 0.1451849 0.1455157 0.1452821 0.1445717 0.1439195 0.1435867 0.1436018 0.1438781 0.1442086 0.1448844 0.1454756 0.145663 0.146268 0.1467386 0.1472724 0.147664 0.1480913 0.1483739 0.1488841 0.1493636 0.1497088 0.1500379 0.1502916 0.1505389 0.1506787 0.1507102 0.1505992 0.1505445 0.1505938 0.1508133 0.1509569 0.1512396 0.1514625 0.1516195 0.1516156 0.1515561 0.1514966 0.1513976 0.1512612 0.151076 0.1510596 0.1510431 0.151077 0.1511168 0.1511917 0.151023 0.1508045 0.1505885 0.1503493 0.1502373 0.1501726 0.1500762 0.1500065 0.1499782 0.150057 0.1502658 0.150469 0.1505335 0.1505505 0.1505328 0.1504275 0.1502438 0.1499674 0.1497118 0.1494661 0.1493102 0.1493681 0.1495501 0.1499738 0.1509654 0.155775 0.154484 0.1527379 0.1518718 0.1506028 0.1489256 0.147067 0.1447061 0.1436307 0.1443568 0.1451849 0.1455157 0.1452821 0.1445717 0.1439195 0.1435867 0.1436018 0.1438781 0.1442086 0.1448844 0.1454756 0.145663 0.146268 0.1467386 0.1472724 0.147664 0.1480913 0.1483739 0.1488841 0.1493636 0.1497088 0.1500379 0.1502916 0.1505389 0.1506787 0.1507102 0.1505992 0.1505445 0.1505938 0.1508133 
0.1509569 0.1512396 0.1514625 0.1516195 0.1516156 0.1515561 0.1514966 0.1513976 0.1512612 0.151076 0.1510596 0.1510431 0.151077 0.1511168 0.1511917 0.151023 0.1508045 0.1505885 0.1503493 0.1502373 0.1501726 0.1500762 0.1500065 0.1499782 0.150057 0.1502658 0.150469 0.1505335 0.1505505 0.1505328 0.1504275 0.1502438 0.1499674 0.1497118 0.1494661 0.1493102 0.1493681 0.1495501 0.1499738 0.1509654 0.155775 0.154484 0.1527379 0.1518718 0.1506028 0.1489256 0.147067 0.1447061 0.1436307 0.1443568 0.1451849 0.1455157 0.1452821 0.1445717 0.1439195 0.1435867 0.1436018 0.1438781 0.1442086 0.1448844 0.1454756 0.145663 0.146268 0.1467386 0.1472724 0.147664 0.1480913 0.1483739 0.1488841 0.1493636 0.1497088 0.1500379 0.1502916 0.1505389 0.1506787 0.1507102 0.1505992 0.1505445 0.1505938 0.1508133 0.1509569 0.1512396 0.1514625 0.1516195 0.1516156 0.1515561 0.1514966 0.1513976 0.1512612 0.151076 0.1510596 0.1510431 0.151077 0.1511168 0.1511917 0.151023 0.1508045 0.1505885 0.1503493 0.1502373 0.1501726 0.1500762 0.1500065 0.1499782 0.150057 0.1502658 0.150469 0.1505335 0.1505505 0.1505328 0.1504275 0.1502438 0.1499674 0.1497118 0.1494661 0.1493102 0.1493681 0.1495501 0.1499738 0.1509654 0.155775 0.154484 0.1527379 0.1518718 0.1506028 0.1489256 0.147067 0.1447061 0.1436307 0.1443568 0.1451849 0.1455157 0.1452821 0.1445717 0.1439195 0.1435867 0.1436018 0.1438781 0.1442086 0.1448844 0.1454756 0.145663 0.146268 0.1467386 0.1472724 0.147664 0.1480913 0.1483739 0.1488841 0.1493636 0.1497088 0.1500379 0.1502916 0.1505389 0.1506787 0.1507102 0.1505992 0.1505445 0.1505938 0.1508133 0.1509569 0.1512396 0.1514625 0.1516195 0.1516156 0.1515561 0.1514966 0.1513976 0.1512612 0.151076 0.1510596 0.1510431 0.151077 0.1511168 0.1511917 0.151023 0.1508045 0.1505885 0.1503493 0.1502373 0.1501726 0.1500762 0.1500065 0.1499782 0.150057 0.1502658 0.150469 0.1505335 0.1505505 0.1505328 0.1504275 0.1502438 0.1499674 0.1497118 0.1494661 0.1493102 0.1493681 0.1495501 0.1499738 0.1509654 ]
+</Nnet>
--- a/plugins/model/stt-sensevoice/app/iic/speech_fsmn_vad_zh-cn-16k-common-pytorch/config.yaml
+++ b/plugins/model/stt-sensevoice/app/iic/speech_fsmn_vad_zh-cn-16k-common-pytorch/config.yaml
@@ -0,0 +1,56 @@
+frontend: WavFrontendOnline
+frontend_conf:
+    fs: 16000
+    window: hamming
+    n_mels: 80
+    frame_length: 25
+    frame_shift: 10
+    dither: 0.0
+    lfr_m: 5
+    lfr_n: 1
+
+model: FsmnVADStreaming
+model_conf:
+    sample_rate: 16000
+    detect_mode: 1 
+    snr_mode: 0
+    max_end_silence_time: 800
+    max_start_silence_time: 3000
+    do_start_point_detection: True
+    do_end_point_detection: True
+    window_size_ms: 200
+    sil_to_speech_time_thres: 150
+    speech_to_sil_time_thres: 150
+    speech_2_noise_ratio: 1.0
+    do_extend: 1
+    lookback_time_start_point: 200
+    lookahead_time_end_point: 100
+    max_single_segment_time: 60000
+    snr_thres: -100.0
+    noise_frame_num_used_for_snr: 100
+    decibel_thres: -100.0
+    speech_noise_thres: 0.6
+    fe_prior_thres: 0.0001
+    silence_pdf_num: 1
+    sil_pdf_ids: [0]
+    speech_noise_thresh_low: -0.1
+    speech_noise_thresh_high: 0.3
+    output_frame_probs: False
+    frame_in_ms: 10
+    frame_length_ms: 25
+    
+encoder: FSMN
+encoder_conf:
+    input_dim: 400
+    input_affine_dim: 140
+    fsmn_layers: 4
+    linear_dim: 250
+    proj_dim: 128
+    lorder: 20
+    rorder: 0
+    lstride: 1
+    rstride: 0
+    output_affine_dim: 140
+    output_dim: 248
+
+
--- a/plugins/model/stt-sensevoice/app/iic/speech_fsmn_vad_zh-cn-16k-common-pytorch/configuration.json
+++ b/plugins/model/stt-sensevoice/app/iic/speech_fsmn_vad_zh-cn-16k-common-pytorch/configuration.json
@@ -0,0 +1,13 @@
+{
+  "framework": "pytorch",
+  "task" : "voice-activity-detection",
+  "pipeline": {"type":"funasr-pipeline"},
+  "model": {"type" : "funasr"},
+  "file_path_metas": {
+    "init_param":"model.pt", 
+    "config":"config.yaml",
+    "frontend_conf":{"cmvn_file": "am.mvn"}},
+  "model_name_in_hub": {
+    "ms":"iic/speech_fsmn_vad_zh-cn-16k-common-pytorch", 
+    "hf":""}
+}
--- a/plugins/model/stt-sensevoice/app/iic/speech_fsmn_vad_zh-cn-16k-common-pytorch/example/vad_example.wav
+++ b/plugins/model/stt-sensevoice/app/iic/speech_fsmn_vad_zh-cn-16k-common-pytorch/example/vad_example.wav
--- a/plugins/model/stt-sensevoice/app/iic/speech_fsmn_vad_zh-cn-16k-common-pytorch/fig/struct.png
+++ b/plugins/model/stt-sensevoice/app/iic/speech_fsmn_vad_zh-cn-16k-common-pytorch/fig/struct.png
--- a/plugins/model/stt-sensevoice/app/iic/speech_fsmn_vad_zh-cn-16k-common-pytorch/model.pt
+++ b/plugins/model/stt-sensevoice/app/iic/speech_fsmn_vad_zh-cn-16k-common-pytorch/model.pt
--- a/plugins/model/stt-sensevoice/app/main.py
+++ b/plugins/model/stt-sensevoice/app/main.py
@@ -0,0 +1,95 @@
+# -*- coding: utf-8 -*-
+
+from fastapi import FastAPI, File, UploadFile, HTTPException
+from pydantic import BaseModel, HttpUrl, ValidationError
+from typing import List
+from funasr import AutoModel
+from funasr.utils.postprocess_utils import rich_transcription_postprocess
+import uuid
+import os
+
+app = FastAPI()
+
+
+# Request validation model
+class UrlInput(BaseModel):
+    """Request body accepted by the ``/upload-url/`` endpoint."""
+
+    # Audio locations to transcribe; each entry is validated as a pydantic HttpUrl.
+    audio_urls: List[HttpUrl]
+
+
+# Model loading
+model_dir = "iic/SenseVoiceSmall"
+
+# Fast prediction (no VAD front-end, CPU)
+# model = AutoModel(model=model_dir, trust_remote_code=True, device="cpu")
+
+# Accurate prediction: pairs the model with the "fsmn-vad" VAD model.
+# NOTE(review): device is hard-coded to "cuda:0" — presumably this fails on
+# hosts without a GPU; confirm against the deployment target.
+model = AutoModel(
+    model=model_dir,
+    vad_model="fsmn-vad",
+    vad_kwargs={"max_single_segment_time": 30000},
+    trust_remote_code=True,
+    device="cuda:0",
+)
+
+
+@app.post("/upload-url/")
+async def upload_url(data: UrlInput):
+    try:
+        results = []
+        for url in data.audio_urls:
+            res = model.generate(
+                input=str(url),  # 将 URL 转换为字符串
+                cache={},
+                language=language,
+                use_itn=False,
+                batch_size=batch_size,
+            )
+            data = rich_transcription_postprocess(res[0]["text"])
+            results.append(data)
+        return {"message": "URL input processed successfully", "results": results}
+    except ValidationError as e:
+        raise HTTPException(status_code=400, detail=e.errors())
+    except Exception as e:
+        raise HTTPException(status_code=500, detail=str(e))
+
+
+@app.post("/v1/audio/transcriptions")
+async def upload_file(file: UploadFile = File(...)):
+    try:
+        #for file in files:
+            if not file.content_type.startswith("audio/"):
+                raise HTTPException(status_code=400, detail="Invalid file type")
+
+            # 读取文件为 bytes
+            #audio_bytes = await file.read()
+
+            unique_filename = str(uuid.uuid4()) + ".mp3"
+
+            # 保存上传的音频文件
+            audio_file_path = os.path.join("/tmp", unique_filename)
+            with open(audio_file_path, "wb") as audio_file:
+                audio_file.write(await file.read())
+
+            # 直接将文件对象传递给模型
+            res = model.generate(
+                input=audio_file_path,
+                cache={},
+                language=language,
+                use_itn=True,
+                batch_size=batch_size,
+                merge_vad=True,  #
+                merge_length_s=15,
+            )
+            data = rich_transcription_postprocess(res[0]["text"])
+            return {"message": "File inputs processed successfully", "text": data}
+    except Exception as e:
+        raise HTTPException(status_code=500, detail=str(e))
+
+
+if __name__ == "__main__":
+    batch_size = 60
+    language = "auto"
+
+    import uvicorn
+
+    uvicorn.run(app, host="0.0.0.0", port=8000)
--- a/plugins/model/stt-sensevoice/app/model.py
+++ b/plugins/model/stt-sensevoice/app/model.py
@@ -0,0 +1,895 @@
+
+import time
+import torch
+from torch import nn
+import torch.nn.functional as F
+from typing import Iterable, Optional
+
+from funasr.register import tables
+from funasr.models.ctc.ctc import CTC
+from funasr.utils.datadir_writer import DatadirWriter
+from funasr.models.paraformer.search import Hypothesis
+from funasr.train_utils.device_funcs import force_gatherable
+from funasr.losses.label_smoothing_loss import LabelSmoothingLoss
+from funasr.metrics.compute_acc import compute_accuracy, th_accuracy
+from funasr.utils.load_utils import load_audio_text_image_video, extract_fbank
+
+
+class SinusoidalPositionEncoder(torch.nn.Module):
+    """ """
+
+    def __int__(self, d_model=80, dropout_rate=0.1):
+        pass
+
+    def encode(
+        self, positions: torch.Tensor = None, depth: int = None, dtype: torch.dtype = torch.float32
+    ):
+        batch_size = positions.size(0)
+        positions = positions.type(dtype)
+        device = positions.device
+        log_timescale_increment = torch.log(torch.tensor([10000], dtype=dtype, device=device)) / (
+            depth / 2 - 1
+        )
+        inv_timescales = torch.exp(
+            torch.arange(depth / 2, device=device).type(dtype) * (-log_timescale_increment)
+        )
+        inv_timescales = torch.reshape(inv_timescales, [batch_size, -1])
+        scaled_time = torch.reshape(positions, [1, -1, 1]) * torch.reshape(
+            inv_timescales, [1, 1, -1]
+        )
+        encoding = torch.cat([torch.sin(scaled_time), torch.cos(scaled_time)], dim=2)
+        return encoding.type(dtype)
+
+    def forward(self, x):
+        batch_size, timesteps, input_dim = x.size()
+        positions = torch.arange(1, timesteps + 1, device=x.device)[None, :]
+        position_encoding = self.encode(positions, input_dim, x.dtype).to(x.device)
+
+        return x + position_encoding
+
+
+class PositionwiseFeedForward(torch.nn.Module):
+    """Positionwise feed forward layer.
+
+    Args:
+        idim (int): Input dimenstion.
+        hidden_units (int): The number of hidden units.
+        dropout_rate (float): Dropout rate.
+
+    """
+
+    def __init__(self, idim, hidden_units, dropout_rate, activation=torch.nn.ReLU()):
+        """Construct an PositionwiseFeedForward object."""
+        super(PositionwiseFeedForward, self).__init__()
+        self.w_1 = torch.nn.Linear(idim, hidden_units)
+        self.w_2 = torch.nn.Linear(hidden_units, idim)
+        self.dropout = torch.nn.Dropout(dropout_rate)
+        self.activation = activation
+
+    def forward(self, x):
+        """Forward function."""
+        return self.w_2(self.dropout(self.activation(self.w_1(x))))
+
+
+class MultiHeadedAttentionSANM(nn.Module):
+    """Multi-Head Attention layer.
+
+    Args:
+        n_head (int): The number of heads.
+        n_feat (int): The number of features.
+        dropout_rate (float): Dropout rate.
+
+    """
+
+    def __init__(
+        self,
+        n_head,
+        in_feat,
+        n_feat,
+        dropout_rate,
+        kernel_size,
+        sanm_shfit=0,
+        lora_list=None,
+        lora_rank=8,
+        lora_alpha=16,
+        lora_dropout=0.1,
+    ):
+        """Construct an MultiHeadedAttention object."""
+        super().__init__()
+        assert n_feat % n_head == 0
+        # We assume d_v always equals d_k
+        self.d_k = n_feat // n_head
+        self.h = n_head
+        # self.linear_q = nn.Linear(n_feat, n_feat)
+        # self.linear_k = nn.Linear(n_feat, n_feat)
+        # self.linear_v = nn.Linear(n_feat, n_feat)
+
+        self.linear_out = nn.Linear(n_feat, n_feat)
+        self.linear_q_k_v = nn.Linear(in_feat, n_feat * 3)
+        self.attn = None
+        self.dropout = nn.Dropout(p=dropout_rate)
+
+        self.fsmn_block = nn.Conv1d(
+            n_feat, n_feat, kernel_size, stride=1, padding=0, groups=n_feat, bias=False
+        )
+        # padding
+        left_padding = (kernel_size - 1) // 2
+        if sanm_shfit > 0:
+            left_padding = left_padding + sanm_shfit
+        right_padding = kernel_size - 1 - left_padding
+        self.pad_fn = nn.ConstantPad1d((left_padding, right_padding), 0.0)
+
+    def forward_fsmn(self, inputs, mask, mask_shfit_chunk=None):
+        b, t, d = inputs.size()
+        if mask is not None:
+            mask = torch.reshape(mask, (b, -1, 1))
+            if mask_shfit_chunk is not None:
+                mask = mask * mask_shfit_chunk
+            inputs = inputs * mask
+
+        x = inputs.transpose(1, 2)
+        x = self.pad_fn(x)
+        x = self.fsmn_block(x)
+        x = x.transpose(1, 2)
+        x += inputs
+        x = self.dropout(x)
+        if mask is not None:
+            x = x * mask
+        return x
+
+    def forward_qkv(self, x):
+        """Transform query, key and value.
+
+        Args:
+            query (torch.Tensor): Query tensor (#batch, time1, size).
+            key (torch.Tensor): Key tensor (#batch, time2, size).
+            value (torch.Tensor): Value tensor (#batch, time2, size).
+
+        Returns:
+            torch.Tensor: Transformed query tensor (#batch, n_head, time1, d_k).
+            torch.Tensor: Transformed key tensor (#batch, n_head, time2, d_k).
+            torch.Tensor: Transformed value tensor (#batch, n_head, time2, d_k).
+
+        """
+        b, t, d = x.size()
+        q_k_v = self.linear_q_k_v(x)
+        q, k, v = torch.split(q_k_v, int(self.h * self.d_k), dim=-1)
+        q_h = torch.reshape(q, (b, t, self.h, self.d_k)).transpose(
+            1, 2
+        )  # (batch, head, time1, d_k)
+        k_h = torch.reshape(k, (b, t, self.h, self.d_k)).transpose(
+            1, 2
+        )  # (batch, head, time2, d_k)
+        v_h = torch.reshape(v, (b, t, self.h, self.d_k)).transpose(
+            1, 2
+        )  # (batch, head, time2, d_k)
+
+        return q_h, k_h, v_h, v
+
+    def forward_attention(self, value, scores, mask, mask_att_chunk_encoder=None):
+        """Compute attention context vector.
+
+        Args:
+            value (torch.Tensor): Transformed value (#batch, n_head, time2, d_k).
+            scores (torch.Tensor): Attention score (#batch, n_head, time1, time2).
+            mask (torch.Tensor): Mask (#batch, 1, time2) or (#batch, time1, time2).
+
+        Returns:
+            torch.Tensor: Transformed value (#batch, time1, d_model)
+                weighted by the attention score (#batch, time1, time2).
+
+        """
+        n_batch = value.size(0)
+        if mask is not None:
+            if mask_att_chunk_encoder is not None:
+                mask = mask * mask_att_chunk_encoder
+
+            mask = mask.unsqueeze(1).eq(0)  # (batch, 1, *, time2)
+
+            min_value = -float(
+                "inf"
+            )  # float(numpy.finfo(torch.tensor(0, dtype=scores.dtype).numpy().dtype).min)
+            scores = scores.masked_fill(mask, min_value)
+            self.attn = torch.softmax(scores, dim=-1).masked_fill(
+                mask, 0.0
+            )  # (batch, head, time1, time2)
+        else:
+            self.attn = torch.softmax(scores, dim=-1)  # (batch, head, time1, time2)
+
+        p_attn = self.dropout(self.attn)
+        x = torch.matmul(p_attn, value)  # (batch, head, time1, d_k)
+        x = (
+            x.transpose(1, 2).contiguous().view(n_batch, -1, self.h * self.d_k)
+        )  # (batch, time1, d_model)
+
+        return self.linear_out(x)  # (batch, time1, d_model)
+
+    def forward(self, x, mask, mask_shfit_chunk=None, mask_att_chunk_encoder=None):
+        """Compute scaled dot product attention.
+
+        Args:
+            query (torch.Tensor): Query tensor (#batch, time1, size).
+            key (torch.Tensor): Key tensor (#batch, time2, size).
+            value (torch.Tensor): Value tensor (#batch, time2, size).
+            mask (torch.Tensor): Mask tensor (#batch, 1, time2) or
+                (#batch, time1, time2).
+
+        Returns:
+            torch.Tensor: Output tensor (#batch, time1, d_model).
+
+        """
+        q_h, k_h, v_h, v = self.forward_qkv(x)
+        fsmn_memory = self.forward_fsmn(v, mask, mask_shfit_chunk)
+        q_h = q_h * self.d_k ** (-0.5)
+        scores = torch.matmul(q_h, k_h.transpose(-2, -1))
+        att_outs = self.forward_attention(v_h, scores, mask, mask_att_chunk_encoder)
+        return att_outs + fsmn_memory
+
+    def forward_chunk(self, x, cache=None, chunk_size=None, look_back=0):
+        """Chunk-wise (streaming) variant of ``forward`` with a key/value cache.
+
+        Args:
+            x (torch.Tensor): Input features of the current chunk.
+            cache (dict | None): Dict with entries ``"k"`` and ``"v"`` holding
+                keys/values of earlier chunks, or None on the first call.
+                An existing dict is updated in place.
+            chunk_size: Indexable chunk configuration; element ``[2]`` is the
+                number of trailing frames excluded from the cache and
+                element ``[1]`` scales ``look_back``.
+            look_back (int): Number of chunks kept in the cache; ``-1`` keeps
+                everything, ``0`` disables caching.
+
+        Returns:
+            tuple: ``(attention output + FSMN memory output, cache)``.
+
+        """
+        q_h, k_h, v_h, v = self.forward_qkv(x)
+        # ``and`` binds tighter than ``or``: look_back == -1 enables caching
+        # regardless of whether chunk_size was given.
+        caching = (chunk_size is not None and look_back > 0) or look_back == -1
+        if caching:
+            tail = chunk_size[2]
+            if cache is None:
+                # First chunk: start the cache from the current keys/values.
+                cache = {
+                    "k": k_h[:, :, :-tail, :],
+                    "v": v_h[:, :, :-tail, :],
+                }
+            else:
+                new_k = k_h[:, :, :-tail, :]
+                new_v = v_h[:, :, :-tail, :]
+                # Attend over the cached context followed by the full current chunk.
+                k_h = torch.cat((cache["k"], k_h), dim=2)
+                v_h = torch.cat((cache["v"], v_h), dim=2)
+
+                cache["k"] = torch.cat((cache["k"], new_k), dim=2)
+                cache["v"] = torch.cat((cache["v"], new_v), dim=2)
+                if look_back != -1:
+                    # Bounded history: keep only the most recent frames.
+                    keep = look_back * chunk_size[1]
+                    cache["k"] = cache["k"][:, :, -keep:, :]
+                    cache["v"] = cache["v"][:, :, -keep:, :]
+        memory = self.forward_fsmn(v, None)
+        scaled_q = q_h * self.d_k ** (-0.5)
+        attn_scores = torch.matmul(scaled_q, k_h.transpose(-2, -1))
+        attended = self.forward_attention(v_h, attn_scores, None)
+        return attended + memory, cache
+
+
+class LayerNorm(nn.LayerNorm):
+    """``nn.LayerNorm`` whose computation always runs on float-cast tensors.
+
+    Input, weight and bias are cast with ``.float()`` for the normalization and
+    the result is cast back to the type of the input.
+    """
+
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+
+    def forward(self, input):
+        # weight/bias may be None (e.g. elementwise_affine=False).
+        weight = None if self.weight is None else self.weight.float()
+        bias = None if self.bias is None else self.bias.float()
+        normalized = F.layer_norm(input.float(), self.normalized_shape, weight, bias, self.eps)
+        return normalized.type_as(input)
+
+
+def sequence_mask(lengths, maxlen=None, dtype=torch.float32, device=None):
+    """Turn a tensor of lengths into a mask that is non-zero on valid positions.
+
+    Args:
+        lengths (torch.Tensor): Sequence lengths.
+        maxlen: Width of the mask; defaults to ``lengths.max()``.
+        dtype: Type of the returned mask.
+        device: If given, the mask is moved to this device before returning.
+
+    Returns:
+        torch.Tensor: Mask of shape ``lengths.shape + (maxlen,)`` where entry
+        ``[..., t]`` is set when ``t < length``.
+    """
+    width = lengths.max() if maxlen is None else maxlen
+    positions = torch.arange(0, width, 1).to(lengths.device)
+    valid = (positions < torch.unsqueeze(lengths, dim=-1)).detach()
+
+    typed = valid.type(dtype)
+    if device is None:
+        return typed
+    return typed.to(device)
+
+
+class EncoderLayerSANM(nn.Module):
+    """Encoder block: a self-attention module followed by a feed-forward module.
+
+    Both sub-modules are injected by the caller. The attention branch only gets
+    a residual connection when ``in_size == size``; the feed-forward branch
+    always gets one.
+    """
+
+    def __init__(
+        self,
+        in_size,
+        size,
+        self_attn,
+        feed_forward,
+        dropout_rate,
+        normalize_before=True,
+        concat_after=False,
+        stochastic_depth_rate=0.0,
+    ):
+        """Construct an EncoderLayer object.
+
+        Args:
+            in_size: Feature size normalized by ``norm1`` (attention branch input).
+            size: Feature size normalized by ``norm2`` (feed-forward branch input).
+            self_attn: Attention module, called as
+                ``self_attn(x, mask, mask_shfit_chunk=..., mask_att_chunk_encoder=...)``
+                and, when streaming, as
+                ``self_attn.forward_chunk(x, cache, chunk_size, look_back)``.
+            feed_forward: Module applied after the attention branch.
+            dropout_rate: Probability of the shared ``nn.Dropout``.
+            normalize_before: Apply layer norm before each branch if True,
+                after it otherwise.
+            concat_after: If True, concatenate attention input and output and
+                project them with a linear layer instead of applying dropout
+                to the attention output.
+            stochastic_depth_rate: Probability of skipping the whole layer
+                while training.
+        """
+        super(EncoderLayerSANM, self).__init__()
+        self.self_attn = self_attn
+        self.feed_forward = feed_forward
+        self.norm1 = LayerNorm(in_size)
+        self.norm2 = LayerNorm(size)
+        self.dropout = nn.Dropout(dropout_rate)
+        self.in_size = in_size
+        self.size = size
+        self.normalize_before = normalize_before
+        self.concat_after = concat_after
+        if self.concat_after:
+            self.concat_linear = nn.Linear(size + size, size)
+        self.stochastic_depth_rate = stochastic_depth_rate
+        self.dropout_rate = dropout_rate
+
+    def forward(self, x, mask, cache=None, mask_shfit_chunk=None, mask_att_chunk_encoder=None):
+        """Compute encoded features.
+
+        Args:
+            x (torch.Tensor): Input tensor.
+            mask: Mask passed through to the attention module.
+            cache: Optional tensor; only used when the layer is skipped, in
+                which case it is concatenated in front of ``x`` along dim 1.
+            mask_shfit_chunk: Optional mask forwarded to the attention module.
+            mask_att_chunk_encoder: Optional mask forwarded to the attention module.
+
+        Returns:
+            tuple: ``(x, mask, cache, mask_shfit_chunk, mask_att_chunk_encoder)``
+            where ``x`` is the layer output. The same five-element tuple is
+            returned whether or not the layer was skipped.
+
+        """
+        skip_layer = False
+        # with stochastic depth, residual connection `x + f(x)` becomes
+        # `x <- x + 1 / (1 - p) * f(x)` at training time.
+        stoch_layer_coeff = 1.0
+        if self.training and self.stochastic_depth_rate > 0:
+            skip_layer = torch.rand(1).item() < self.stochastic_depth_rate
+            stoch_layer_coeff = 1.0 / (1 - self.stochastic_depth_rate)
+
+        if skip_layer:
+            if cache is not None:
+                x = torch.cat([cache, x], dim=1)
+            # Return the same arity as the regular path so callers can unpack
+            # the result uniformly.
+            return x, mask, cache, mask_shfit_chunk, mask_att_chunk_encoder
+
+        residual = x
+        if self.normalize_before:
+            x = self.norm1(x)
+
+        attn_out = self.self_attn(
+            x,
+            mask,
+            mask_shfit_chunk=mask_shfit_chunk,
+            mask_att_chunk_encoder=mask_att_chunk_encoder,
+        )
+        if self.concat_after:
+            # No dropout on this path: input and attention output are fused
+            # by the linear projection.
+            branch = self.concat_linear(torch.cat((x, attn_out), dim=-1))
+        else:
+            branch = self.dropout(attn_out)
+        branch = stoch_layer_coeff * branch
+        # The residual can only be added when the attention branch keeps the
+        # feature size.
+        x = residual + branch if self.in_size == self.size else branch
+        if not self.normalize_before:
+            x = self.norm1(x)
+
+        residual = x
+        if self.normalize_before:
+            x = self.norm2(x)
+        x = residual + stoch_layer_coeff * self.dropout(self.feed_forward(x))
+        if not self.normalize_before:
+            x = self.norm2(x)
+
+        return x, mask, cache, mask_shfit_chunk, mask_att_chunk_encoder
+
+    def forward_chunk(self, x, cache=None, chunk_size=None, look_back=0):
+        """Compute encoded features for one streaming chunk.
+
+        Unlike ``forward`` this path applies neither dropout nor stochastic
+        depth and takes no mask.
+
+        Args:
+            x (torch.Tensor): Input tensor of the current chunk.
+            cache: Attention cache handed to ``self_attn.forward_chunk``.
+            chunk_size: Chunk configuration handed to ``self_attn.forward_chunk``.
+            look_back: Look-back setting handed to ``self_attn.forward_chunk``.
+
+        Returns:
+            tuple: ``(x, cache)`` with the layer output and the cache returned
+            by the attention module.
+
+        """
+
+        residual = x
+        if self.normalize_before:
+            x = self.norm1(x)
+
+        attn, cache = self.self_attn.forward_chunk(x, cache, chunk_size, look_back)
+        x = residual + attn if self.in_size == self.size else attn
+
+        if not self.normalize_before:
+            x = self.norm1(x)
+
+        residual = x
+        if self.normalize_before:
+            x = self.norm2(x)
+        x = residual + self.feed_forward(x)
+        if not self.normalize_before:
+            x = self.norm2(x)
+
+        return x, cache
+
+
+@tables.register("encoder_classes", "SenseVoiceEncoderSmall")
+class SenseVoiceEncoderSmall(nn.Module):
+    """
+    Author: Speech Lab of DAMO Academy, Alibaba Group
+    SCAMA: Streaming chunk-aware multihead attention for online end-to-end speech recognition
+    https://arxiv.org/abs/2006.01713
+
+    Stack of ``EncoderLayerSANM`` blocks: one input block mapping
+    ``input_size`` to ``output_size``, ``num_blocks - 1`` main blocks, a layer
+    norm, ``tp_blocks`` further blocks and a final layer norm.
+    """
+
+    def __init__(
+        self,
+        input_size: int,
+        output_size: int = 256,
+        attention_heads: int = 4,
+        linear_units: int = 2048,
+        num_blocks: int = 6,
+        tp_blocks: int = 0,
+        dropout_rate: float = 0.1,
+        positional_dropout_rate: float = 0.1,
+        attention_dropout_rate: float = 0.0,
+        stochastic_depth_rate: float = 0.0,
+        input_layer: Optional[str] = "conv2d",
+        pos_enc_class=SinusoidalPositionEncoder,
+        normalize_before: bool = True,
+        concat_after: bool = False,
+        positionwise_layer_type: str = "linear",
+        positionwise_conv_kernel_size: int = 1,
+        padding_idx: int = -1,
+        kernel_size: int = 11,
+        sanm_shfit: int = 0,
+        selfattention_layer_type: str = "sanm",
+        **kwargs,
+    ):
+        """Build the encoder stack.
+
+        Only ``input_size``, ``output_size``, ``attention_heads``,
+        ``linear_units``, ``num_blocks``, ``tp_blocks``, ``dropout_rate``,
+        ``attention_dropout_rate``, ``normalize_before``, ``kernel_size`` and
+        ``sanm_shfit`` are read here; the remaining parameters (including
+        ``pos_enc_class`` and ``**kwargs``) are accepted but not used by this
+        constructor.
+        """
+        super().__init__()
+        self._output_size = output_size
+
+        self.embed = SinusoidalPositionEncoder()
+
+        self.normalize_before = normalize_before
+
+        positionwise_layer = PositionwiseFeedForward
+        positionwise_layer_args = (
+            output_size,
+            linear_units,
+            dropout_rate,
+        )
+
+        encoder_selfattn_layer = MultiHeadedAttentionSANM
+        # Arguments for the first block, whose attention consumes input_size features.
+        encoder_selfattn_layer_args0 = (
+            attention_heads,
+            input_size,
+            output_size,
+            attention_dropout_rate,
+            kernel_size,
+            sanm_shfit,
+        )
+        # Arguments for every later block (output_size in, output_size out).
+        encoder_selfattn_layer_args = (
+            attention_heads,
+            output_size,
+            output_size,
+            attention_dropout_rate,
+            kernel_size,
+            sanm_shfit,
+        )
+
+        self.encoders0 = nn.ModuleList(
+            [
+                EncoderLayerSANM(
+                    input_size,
+                    output_size,
+                    encoder_selfattn_layer(*encoder_selfattn_layer_args0),
+                    positionwise_layer(*positionwise_layer_args),
+                    dropout_rate,
+                )
+                for _ in range(1)
+            ]
+        )
+        self.encoders = nn.ModuleList(
+            [
+                EncoderLayerSANM(
+                    output_size,
+                    output_size,
+                    encoder_selfattn_layer(*encoder_selfattn_layer_args),
+                    positionwise_layer(*positionwise_layer_args),
+                    dropout_rate,
+                )
+                for _ in range(num_blocks - 1)
+            ]
+        )
+
+        self.tp_encoders = nn.ModuleList(
+            [
+                EncoderLayerSANM(
+                    output_size,
+                    output_size,
+                    encoder_selfattn_layer(*encoder_selfattn_layer_args),
+                    positionwise_layer(*positionwise_layer_args),
+                    dropout_rate,
+                )
+                for _ in range(tp_blocks)
+            ]
+        )
+
+        self.after_norm = LayerNorm(output_size)
+
+        self.tp_norm = LayerNorm(output_size)
+
+    def output_size(self) -> int:
+        """Return the feature size produced by the encoder."""
+        return self._output_size
+
+    def forward(
+        self,
+        xs_pad: torch.Tensor,
+        ilens: torch.Tensor,
+    ):
+        """Encode padded input features.
+
+        Args:
+            xs_pad: Padded input features; left unmodified.
+            ilens: Valid length of every sequence in ``xs_pad``.
+
+        Returns:
+            tuple: ``(encoded features, output lengths)`` where the lengths
+            are recomputed from the mask as an int tensor.
+        """
+        masks = sequence_mask(ilens, device=ilens.device)[:, None, :]
+
+        # Out-of-place scaling: an in-place ``*=`` would also rescale the
+        # caller's tensor, so feeding the same features twice would scale
+        # them twice.
+        xs_pad = xs_pad * self.output_size() ** 0.5
+
+        xs_pad = self.embed(xs_pad)
+
+        # forward encoder1
+        for encoder_layer in self.encoders0:
+            encoder_outs = encoder_layer(xs_pad, masks)
+            xs_pad, masks = encoder_outs[0], encoder_outs[1]
+
+        for encoder_layer in self.encoders:
+            encoder_outs = encoder_layer(xs_pad, masks)
+            xs_pad, masks = encoder_outs[0], encoder_outs[1]
+
+        xs_pad = self.after_norm(xs_pad)
+
+        # forward encoder2
+        olens = masks.squeeze(1).sum(1).int()
+
+        for encoder_layer in self.tp_encoders:
+            encoder_outs = encoder_layer(xs_pad, masks)
+            xs_pad, masks = encoder_outs[0], encoder_outs[1]
+
+        xs_pad = self.tp_norm(xs_pad)
+        return xs_pad, olens
+
+
+@tables.register("model_classes", "SenseVoiceSmall")
+class SenseVoiceSmall(nn.Module):
+    """Encoder + CTC speech model driven by four prepended query embeddings.
+
+    Before encoding, one language query, two event/emotion queries and one
+    text-normalization query are placed in front of the acoustic features, so
+    the first four encoder outputs belong to those queries.
+    """
+
+    def __init__(
+        self,
+        specaug: str = None,
+        specaug_conf: dict = None,
+        normalize: str = None,
+        normalize_conf: dict = None,
+        encoder: str = None,
+        encoder_conf: dict = None,
+        ctc_conf: dict = None,
+        input_size: int = 80,
+        vocab_size: int = -1,
+        ignore_id: int = -1,
+        blank_id: int = 0,
+        sos: int = 1,
+        eos: int = 2,
+        length_normalized_loss: bool = False,
+        **kwargs,
+    ):
+        """Build sub-modules from their registry names.
+
+        Args:
+            specaug: Optional name looked up in ``tables.specaug_classes``.
+            specaug_conf: Keyword arguments for the spec-augment class.
+            normalize: Optional name looked up in ``tables.normalize_classes``.
+            normalize_conf: Keyword arguments for the normalization class.
+            encoder: Name looked up in ``tables.encoder_classes``.
+            encoder_conf: Keyword arguments for the encoder class.
+            ctc_conf: Keyword arguments for ``CTC``; None means no extras.
+            input_size: Feature size; also the size of the query embeddings.
+            vocab_size: Output vocabulary size.
+            ignore_id: Label id ignored by the label-smoothing loss.
+            blank_id: Token id removed from the decoded sequence in ``inference``.
+            sos: Start id; falls back to ``vocab_size - 1`` when None.
+            eos: End id; falls back to ``vocab_size - 1`` when None.
+            length_normalized_loss: Forwarded to the label-smoothing loss and
+                used to pick the batch weight in ``forward``.
+            **kwargs: ``lsm_weight`` is read as the label-smoothing factor.
+        """
+
+        super().__init__()
+
+        if specaug is not None:
+            specaug_class = tables.specaug_classes.get(specaug)
+            specaug = specaug_class(**specaug_conf)
+        if normalize is not None:
+            normalize_class = tables.normalize_classes.get(normalize)
+            normalize = normalize_class(**normalize_conf)
+        encoder_class = tables.encoder_classes.get(encoder)
+        encoder = encoder_class(input_size=input_size, **encoder_conf)
+        encoder_output_size = encoder.output_size()
+
+        if ctc_conf is None:
+            ctc_conf = {}
+        ctc = CTC(odim=vocab_size, encoder_output_size=encoder_output_size, **ctc_conf)
+
+        self.blank_id = blank_id
+        self.sos = sos if sos is not None else vocab_size - 1
+        self.eos = eos if eos is not None else vocab_size - 1
+        self.vocab_size = vocab_size
+        self.ignore_id = ignore_id
+        self.specaug = specaug
+        self.normalize = normalize
+        self.encoder = encoder
+        self.error_calculator = None
+
+        self.ctc = ctc
+
+        self.length_normalized_loss = length_normalized_loss
+        self.encoder_output_size = encoder_output_size
+
+        # Language name -> row of ``self.embed`` (used by ``inference``).
+        self.lid_dict = {"auto": 0, "zh": 3, "en": 4, "yue": 7, "ja": 11, "ko": 12, "nospeech": 13}
+        # Label id found in ``text[:, 0]`` -> row of ``self.embed`` (used by ``encode``).
+        self.lid_int_dict = {24884: 3, 24885: 4, 24888: 7, 24892: 11, 24896: 12, 24992: 13}
+        # Text-normalization name -> row of ``self.embed`` (used by ``inference``).
+        self.textnorm_dict = {"withitn": 14, "woitn": 15}
+        # Label id found in ``text[:, 3]`` -> row of ``self.embed`` (used by ``encode``).
+        self.textnorm_int_dict = {25016: 14, 25017: 15}
+        self.embed = torch.nn.Embedding(7 + len(self.lid_dict) + len(self.textnorm_dict), input_size)
+        self.emo_dict = {"unk": 25009, "happy": 25001, "sad": 25002, "angry": 25003, "neutral": 25004}
+
+        self.criterion_att = LabelSmoothingLoss(
+            size=self.vocab_size,
+            padding_idx=self.ignore_id,
+            smoothing=kwargs.get("lsm_weight", 0.0),
+            normalize_length=self.length_normalized_loss,
+        )
+
+    @staticmethod
+    def from_pretrained(model: str = None, **kwargs):
+        """Build a model through ``funasr.AutoModel.build_model``.
+
+        Returns:
+            tuple: ``(model, kwargs)`` exactly as returned by ``build_model``.
+        """
+        from funasr import AutoModel
+
+        model, kwargs = AutoModel.build_model(model=model, trust_remote_code=True, **kwargs)
+
+        return model, kwargs
+
+    def forward(
+        self,
+        speech: torch.Tensor,
+        speech_lengths: torch.Tensor,
+        text: torch.Tensor,
+        text_lengths: torch.Tensor,
+        **kwargs,
+    ):
+        """Encoder + Decoder + Calc loss
+        Args:
+                speech: (Batch, Length, ...)
+                speech_lengths: (Batch, )
+                text: (Batch, Length)
+                text_lengths: (Batch,)
+        Returns:
+                tuple: ``(loss, stats, weight)`` as produced by ``force_gatherable``.
+        """
+        if len(text_lengths.size()) > 1:
+            text_lengths = text_lengths[:, 0]
+        if len(speech_lengths.size()) > 1:
+            speech_lengths = speech_lengths[:, 0]
+
+        batch_size = speech.shape[0]
+
+        # 1. Encoder
+        encoder_out, encoder_out_lens = self.encode(speech, speech_lengths, text)
+
+        stats = dict()
+
+        # The first four encoder frames / label positions belong to the
+        # prepended queries; CTC is computed on everything after them.
+        loss_ctc, cer_ctc = self._calc_ctc_loss(
+            encoder_out[:, 4:, :], encoder_out_lens - 4, text[:, 4:], text_lengths - 4
+        )
+
+        loss_rich, acc_rich = self._calc_rich_ce_loss(encoder_out[:, :4, :], text[:, :4])
+
+        # NOTE(review): only the CTC loss is returned for optimization;
+        # loss_rich is reported in stats but never added to ``loss``.
+        # Confirm this is intended.
+        loss = loss_ctc
+        # Collect total loss stats
+        stats["loss"] = torch.clone(loss.detach()) if loss_ctc is not None else None
+        stats["loss_rich"] = torch.clone(loss_rich.detach()) if loss_rich is not None else None
+        stats["acc_rich"] = acc_rich
+
+        # force_gatherable: to-device and to-tensor if scalar for DataParallel
+        if self.length_normalized_loss:
+            batch_size = int((text_lengths + 1).sum())
+        loss, stats, weight = force_gatherable((loss, stats, batch_size), loss.device)
+        return loss, stats, weight
+
+    def encode(
+        self,
+        speech: torch.Tensor,
+        speech_lengths: torch.Tensor,
+        text: torch.Tensor,
+        **kwargs,
+    ):
+        """Frontend + Encoder. Note that this method is used by asr_inference.py
+        Args:
+                speech: (Batch, Length, ...)
+                speech_lengths: (Batch, )
+                text: (Batch, Length); column 0 carries the language label and
+                    column 3 the text-normalization label.
+        Returns:
+                tuple: ``(encoder_out, encoder_out_lens)``.
+        """
+
+        # Data augmentation
+        if self.specaug is not None and self.training:
+            speech, speech_lengths = self.specaug(speech, speech_lengths)
+
+        # Normalization for feature: e.g. Global-CMVN, Utterance-CMVN
+        if self.normalize is not None:
+            speech, speech_lengths = self.normalize(speech, speech_lengths)
+
+        # Per sample: use the labelled language only when a random draw
+        # exceeds 0.2 and the label is known; otherwise fall back to row 0
+        # (the "auto" entry of lid_dict).
+        lids = torch.LongTensor(
+            [
+                [
+                    self.lid_int_dict[int(lid)]
+                    if torch.rand(1) > 0.2 and int(lid) in self.lid_int_dict
+                    else 0
+                ]
+                for lid in text[:, 0]
+            ]
+        ).to(speech.device)
+        language_query = self.embed(lids)
+
+        styles = torch.LongTensor(
+            [[self.textnorm_int_dict[int(style)]] for style in text[:, 3]]
+        ).to(speech.device)
+        style_query = self.embed(styles)
+        speech = torch.cat((style_query, speech), dim=1)
+        # Out-of-place on purpose: ``+=`` would silently modify the length
+        # tensor owned by the caller.
+        speech_lengths = speech_lengths + 1
+
+        event_emo_query = self.embed(torch.LongTensor([[1, 2]]).to(speech.device)).repeat(
+            speech.size(0), 1, 1
+        )
+        input_query = torch.cat((language_query, event_emo_query), dim=1)
+        speech = torch.cat((input_query, speech), dim=1)
+        speech_lengths = speech_lengths + 3
+
+        encoder_out, encoder_out_lens = self.encoder(speech, speech_lengths)
+
+        return encoder_out, encoder_out_lens
+
+    def _calc_ctc_loss(
+        self,
+        encoder_out: torch.Tensor,
+        encoder_out_lens: torch.Tensor,
+        ys_pad: torch.Tensor,
+        ys_pad_lens: torch.Tensor,
+    ):
+        """Return ``(loss_ctc, cer_ctc)``; the CER is only computed in eval
+        mode when an error calculator has been set."""
+        # Calc CTC loss
+        loss_ctc = self.ctc(encoder_out, encoder_out_lens, ys_pad, ys_pad_lens)
+
+        # Calc CER using CTC
+        cer_ctc = None
+        if not self.training and self.error_calculator is not None:
+            ys_hat = self.ctc.argmax(encoder_out).data
+            cer_ctc = self.error_calculator(ys_hat.cpu(), ys_pad.cpu(), is_ctc=True)
+        return loss_ctc, cer_ctc
+
+    def _calc_rich_ce_loss(
+        self,
+        encoder_out: torch.Tensor,
+        ys_pad: torch.Tensor,
+    ):
+        """Return ``(loss_rich, acc_rich)`` for the query positions.
+
+        The encoder outputs are projected with the CTC output layer
+        (``self.ctc.ctc_lo``) and scored with the label-smoothing criterion.
+        """
+        decoder_out = self.ctc.ctc_lo(encoder_out)
+        # 2. Compute attention loss
+        loss_rich = self.criterion_att(decoder_out, ys_pad.contiguous())
+        acc_rich = th_accuracy(
+            decoder_out.view(-1, self.vocab_size),
+            ys_pad.contiguous(),
+            ignore_label=self.ignore_id,
+        )
+
+        return loss_rich, acc_rich
+
+    def inference(
+        self,
+        data_in,
+        data_lengths=None,
+        key: list = ["wav_file_tmp_name"],
+        tokenizer=None,
+        frontend=None,
+        **kwargs,
+    ):
+        """Greedy CTC decoding of a batch.
+
+        Args:
+            data_in: Either a feature tensor (with ``data_type="fbank"`` in
+                ``kwargs``) or any input accepted by
+                ``load_audio_text_image_video``.
+            data_lengths: Lengths for the fbank case; when None every sequence
+                is assumed to span the full time axis.
+            key: Utterance names; the default list is never mutated in place.
+            tokenizer: Object whose ``decode`` turns token ids into text.
+            frontend: Feature frontend used for non-fbank input.
+            **kwargs: Reads ``device`` (required), ``data_type``, ``fs``,
+                ``language``, ``use_itn``, ``text_norm``, ``ban_emo_unk`` and
+                ``output_dir``.
+
+        Returns:
+            tuple: ``(results, meta_data)`` where ``results`` is a list of
+            ``{"key": ..., "text": ...}`` dicts.
+        """
+
+        meta_data = {}
+        if (
+            isinstance(data_in, torch.Tensor) and kwargs.get("data_type", "sound") == "fbank"
+        ):  # fbank
+            speech, speech_lengths = data_in, data_lengths
+            if len(speech.shape) < 3:
+                speech = speech[None, :, :]
+            if speech_lengths is None:
+                # Must be a tensor: it is moved to the device and used as a
+                # per-sequence length vector below.
+                speech_lengths = torch.full(
+                    (speech.shape[0],), speech.shape[1], dtype=torch.long
+                )
+        else:
+            # extract fbank feats
+            time1 = time.perf_counter()
+            audio_sample_list = load_audio_text_image_video(
+                data_in,
+                fs=frontend.fs,
+                audio_fs=kwargs.get("fs", 16000),
+                data_type=kwargs.get("data_type", "sound"),
+                tokenizer=tokenizer,
+            )
+            time2 = time.perf_counter()
+            meta_data["load_data"] = f"{time2 - time1:0.3f}"
+            speech, speech_lengths = extract_fbank(
+                audio_sample_list, data_type=kwargs.get("data_type", "sound"), frontend=frontend
+            )
+            time3 = time.perf_counter()
+            meta_data["extract_feat"] = f"{time3 - time2:0.3f}"
+            meta_data["batch_data_time"] = (
+                speech_lengths.sum().item() * frontend.frame_shift * frontend.lfr_n / 1000
+            )
+
+        speech = speech.to(device=kwargs["device"])
+        speech_lengths = speech_lengths.to(device=kwargs["device"])
+
+        language = kwargs.get("language", "auto")
+        language_query = self.embed(
+            torch.LongTensor(
+                [[self.lid_dict[language] if language in self.lid_dict else 0]]
+            ).to(speech.device)
+        ).repeat(speech.size(0), 1, 1)
+
+        use_itn = kwargs.get("use_itn", False)
+        textnorm = kwargs.get("text_norm", None)
+        if textnorm is None:
+            textnorm = "withitn" if use_itn else "woitn"
+        textnorm_query = self.embed(
+            torch.LongTensor([[self.textnorm_dict[textnorm]]]).to(speech.device)
+        ).repeat(speech.size(0), 1, 1)
+        speech = torch.cat((textnorm_query, speech), dim=1)
+        # Out-of-place so a caller-supplied ``data_lengths`` tensor is not modified.
+        speech_lengths = speech_lengths + 1
+
+        event_emo_query = self.embed(torch.LongTensor([[1, 2]]).to(speech.device)).repeat(
+            speech.size(0), 1, 1
+        )
+        input_query = torch.cat((language_query, event_emo_query), dim=1)
+        speech = torch.cat((input_query, speech), dim=1)
+        speech_lengths = speech_lengths + 3
+
+        # Encoder
+        encoder_out, encoder_out_lens = self.encoder(speech, speech_lengths)
+        if isinstance(encoder_out, tuple):
+            encoder_out = encoder_out[0]
+
+        # c. Passed the encoder result and the beam search
+        ctc_logits = self.ctc.log_softmax(encoder_out)
+        if kwargs.get("ban_emo_unk", False):
+            ctc_logits[:, :, self.emo_dict["unk"]] = -float("inf")
+
+        results = []
+        b, n, d = encoder_out.size()
+        if isinstance(key[0], (list, tuple)):
+            key = key[0]
+        if len(key) < b:
+            key = key * b
+
+        # The writer does not depend on the utterance, so resolve it once.
+        ibest_writer = None
+        if kwargs.get("output_dir") is not None:
+            if not hasattr(self, "writer"):
+                self.writer = DatadirWriter(kwargs.get("output_dir"))
+            ibest_writer = self.writer["1best_recog"]
+
+        for i in range(b):
+            x = ctc_logits[i, : encoder_out_lens[i].item(), :]
+            # Greedy CTC: best token per frame, collapse repeats, drop blanks.
+            yseq = x.argmax(dim=-1)
+            yseq = torch.unique_consecutive(yseq, dim=-1)
+
+            mask = yseq != self.blank_id
+            token_int = yseq[mask].tolist()
+
+            # Change integer-ids to tokens
+            text = tokenizer.decode(token_int)
+
+            result_i = {"key": key[i], "text": text}
+            results.append(result_i)
+
+            if ibest_writer is not None:
+                ibest_writer["text"][key[i]] = text
+
+        return results, meta_data
+
+    def export(self, **kwargs):
+        """Rebuild the model for export via ``export_meta.export_rebuild_model``.
+
+        ``max_seq_len`` defaults to 512 when the caller does not provide it.
+        """
+        from export_meta import export_rebuild_model
+
+        if "max_seq_len" not in kwargs:
+            kwargs["max_seq_len"] = 512
+        models = export_rebuild_model(model=self, **kwargs)
+        return models
--- a/plugins/model/stt-sensevoice/app/requirements.txt
+++ b/plugins/model/stt-sensevoice/app/requirements.txt
@@ -0,0 +1,5 @@
+torch>=1.13
+torchaudio
+funasr>=1.1.1
+fastapi
+modelscope
--- a/plugins/model/stt-sensevoice/main.py
+++ b/plugins/model/stt-sensevoice/main.py
@@ -0,0 +1,56 @@
+from fastapi import FastAPI, File, UploadFile, HTTPException
+from fastapi.responses import JSONResponse
+from tempfile import NamedTemporaryFile
+from funasr import AutoModel
+from funasr.utils.postprocess_utils import rich_transcription_postprocess
+import os
+
+# Load the model
+model_dir = "./iic/SenseVoiceSmall"
+
+# The model is created once, at import time, and shared by every request.
+model = AutoModel(
+    model=model_dir,
+    trust_remote_code=True,
+    remote_code="./model.py",  # model definition shipped next to this service
+    vad_model="fsmn-vad",
+    vad_kwargs={"max_single_segment_time": 30000},
+    device="cuda:0",
+)
+
+app = FastAPI()
+
+@app.post("/v1/audio/transcriptions")
+async def handler(file: UploadFile = File(...)):
+    """Transcribe an uploaded audio file.
+
+    The upload is written to a temporary file, passed to the module-level
+    ``model`` and the post-processed text is returned as ``{"text": ...}``.
+    The temporary file is removed whether or not transcription succeeds.
+
+    Raises:
+        HTTPException: 400 when no file was provided.
+    """
+    if not file:
+        raise HTTPException(status_code=400, detail="No file was provided")
+
+    temp_file_path = None
+    try:
+        # Create the temporary file inside the try block so that it is also
+        # cleaned up when reading or writing the upload fails.
+        with NamedTemporaryFile(delete=False) as temp_file:
+            temp_file_path = temp_file.name
+            # Copy the uploaded content into the temporary file
+            temp_file.write(await file.read())
+
+        # Run the model
+        result = model.generate(
+            input=temp_file_path,
+            cache={},
+            language="auto",
+            use_itn=True,
+            batch_size_s=60,
+            merge_vad=True,
+            merge_length_s=15,
+        )
+        # An empty result list yields an empty transcription instead of an IndexError.
+        text = rich_transcription_postprocess(result[0]["text"]) if result else ""
+
+        # Return the result as a JSON response
+        return JSONResponse(content={'text': text})
+    finally:
+        # Remove the temporary file
+        if temp_file_path is not None:
+            os.unlink(temp_file_path)
+
+# When run as a script, serve the app with uvicorn on all interfaces, port 8000.
+if __name__ == "__main__":
+    import uvicorn
+
+    uvicorn.run(app, host="0.0.0.0", port=8000)
--- a/plugins/model/stt-sensevoice/run.sh
+++ b/plugins/model/stt-sensevoice/run.sh
@@ -0,0 +1 @@
+# Start the SenseVoice image detached, publishing container port 8000 on host port 8000.
+docker run -d -p 8000:8000 registry.cn-hangzhou.aliyuncs.com/luanshaotong/sensevoice:v0.1
--- a/plugins/model/tts-cosevoice/Dockerfile
+++ b/plugins/model/tts-cosevoice/Dockerfile
@@ -0,0 +1,12 @@
+# Base image: pytorch/pytorch, tag 2.1.0-cuda11.8-cudnn8-runtime, pulled via dockerhub.icu.
+FROM dockerhub.icu/pytorch/pytorch:2.1.0-cuda11.8-cudnn8-runtime
+ENV DEBIAN_FRONTEND=noninteractive
+
+WORKDIR /opt/CosyVoice
+
+# Switch apt to the USTC mirror, then install git, unzip and git-lfs.
+RUN chmod 777 /tmp && sed -i 's@//.*archive.ubuntu.com@//mirrors.ustc.edu.cn@g' /etc/apt/sources.list && apt-get update -y && apt-get -y install git unzip git-lfs
+# Clone CosyVoice together with its submodules.
+RUN git lfs install && git clone --recursive https://github.com/FunAudioLLM/CosyVoice.git
+# Python 3.10 is used here because no image could be found that has both Python 3.8 and torch 2.0.1-cu118 installed.
+COPY ./requirements.txt CosyVoice
+# Install the Python dependencies from the Tsinghua PyPI mirror.
+RUN cd CosyVoice && pip3 install -r requirements.txt -i https://pypi.tuna.tsinghua.edu.cn/simple
+# Generate the gRPC Python stubs from cosyvoice.proto.
+RUN cd CosyVoice/runtime/python/grpc && python3 -m grpc_tools.protoc -I. --python_out=. --grpc_python_out=. cosyvoice.proto
+# Add this repository's FastAPI server to the cloned runtime directory.
+COPY fastapi/server.py CosyVoice/runtime/python/fastapi/
--- a/plugins/model/tts-cosevoice/fastapi/client.py
+++ b/plugins/model/tts-cosevoice/fastapi/client.py
@@ -0,0 +1,78 @@
+import argparse
+import logging
+import requests
+
+def saveResponse(path, response):
+    """Write the raw body (``response.content``) of ``response`` to ``path``."""
+    # Binary mode: the payload is stored exactly as received.
+    with open(path, 'wb') as out:
+        out.write(response.content)
+
+def main():
+    """Send one synthesis request and save the returned audio.
+
+    Reads the module-level ``args`` namespace: ``args.mode`` selects the
+    endpoint (any value other than ``sft``, ``zero_shot`` and
+    ``cross_lingual`` is treated as ``instruct``) and the response body is
+    written to ``args.tts_wav``.
+    """
+    api = args.api_base
+    payload = {'tts': args.tts_text}
+    # Only the prompt-based modes upload a reference recording.
+    needs_prompt_audio = False
+    if args.mode == 'sft':
+        url = api + "/api/inference/sft"
+        payload['role'] = args.spk_id
+    elif args.mode == 'zero_shot':
+        url = api + "/api/inference/zero-shot"
+        payload['prompt'] = args.prompt_text
+        needs_prompt_audio = True
+    elif args.mode == 'cross_lingual':
+        url = api + "/api/inference/cross-lingual"
+        needs_prompt_audio = True
+    else:
+        url = api + "/api/inference/instruct"
+        payload['role'] = args.spk_id
+        payload['instruct'] = args.instruct_text
+
+    if needs_prompt_audio:
+        # Keep the file open only for the duration of the upload.
+        with open(args.prompt_wav, 'rb') as prompt_file:
+            files = [('audio', ('prompt_audio.wav', prompt_file, 'application/octet-stream'))]
+            response = requests.request("POST", url, data=payload, files=files)
+    else:
+        response = requests.request("POST", url, data=payload)
+    saveResponse(args.tts_wav, response)
+    # logging uses %-style placeholders; "{}" with an argument is a format error.
+    logging.info("Response save to %s", args.tts_wav)
+
+# Command-line entry point: parse the options into the module-level ``args``
+# that ``main()`` reads, then send the request.
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+    # Base URL of the server.
+    parser.add_argument('--api_base',
+                        type=str,
+                        default='http://127.0.0.1:50000')
+    parser.add_argument('--mode',
+                        default='sft',
+                        choices=['sft', 'zero_shot', 'cross_lingual', 'instruct'],
+                        help='request mode')
+    # Text to synthesize.
+    parser.add_argument('--tts_text',
+                        type=str,
+                        default='你好，我是通义千问语音合成大模型，请问有什么可以帮您的吗？')
+    # Sent as the 'role' form field in sft and instruct mode.
+    parser.add_argument('--spk_id',
+                        type=str,
+                        default='中文男')
+    # Sent as the 'prompt' form field in zero_shot mode.
+    parser.add_argument('--prompt_text',
+                        type=str,
+                        default='希望你以后能够做的比我还好呦。')
+    # Audio file uploaded in zero_shot and cross_lingual mode.
+    parser.add_argument('--prompt_wav',
+                        type=str,
+                        default='../../../zero_shot_prompt.wav')
+    # Sent as the 'instruct' form field in instruct mode.
+    parser.add_argument('--instruct_text',
+                        type=str,
+                        default='Theo \'Crimson\', is a fiery, passionate rebel leader. Fights with fervor for justice, but struggles with impulsiveness.')
+    # Path the response body is written to.
+    parser.add_argument('--tts_wav',
+                        type=str,
+                        default='loushiming.mp3')
+    args = parser.parse_args()
+    # NOTE(review): these two values are not referenced anywhere else in this script.
+    prompt_sr, target_sr = 16000, 22050
+    main()
--- a/plugins/model/tts-cosevoice/fastapi/server.py
+++ b/plugins/model/tts-cosevoice/fastapi/server.py
@@ -0,0 +1,136 @@
+# Set the inference model
+# export MODEL_DIR=pretrained_models/CosyVoice-300M-Instruct
+# For development
+# fastapi dev --port 6006 server.py
+# For production deployment
+# fastapi run --port 6006 server.py
+
+import os
+import sys
+import io,time
+from fastapi import FastAPI, Request, Response, File, UploadFile, Form, Body
+from fastapi.responses import HTMLResponse
+from fastapi.middleware.cors import CORSMiddleware  #引入 CORS中间件模块
+from contextlib import asynccontextmanager
+ROOT_DIR = os.path.dirname(os.path.abspath(__file__))
+sys.path.append('{}/../../..'.format(ROOT_DIR))
+sys.path.append('{}/../../../third_party/Matcha-TTS'.format(ROOT_DIR))
+from cosyvoice.cli.cosyvoice import CosyVoice
+from cosyvoice.utils.file_utils import load_wav
+import numpy as np
+import torch
+import torchaudio
+import logging
+from pydantic import BaseModel
+logging.getLogger('matplotlib').setLevel(logging.WARNING)
+
+class LaunchFailed(Exception):
+    """Raised at startup when no model directory is configured."""
+    pass
+
+@asynccontextmanager
+async def lifespan(app: FastAPI):
+    """Load the CosyVoice model before the application starts serving.
+
+    The model directory is read from the ``MODEL_DIR`` environment variable
+    (default ``pretrained_models/CosyVoice-300M-SFT``) and the loaded model
+    is stored on ``app.cosyvoice``.
+
+    Raises:
+        LaunchFailed: If ``MODEL_DIR`` resolves to an empty value.
+    """
+    model_dir = os.getenv("MODEL_DIR", "pretrained_models/CosyVoice-300M-SFT")
+    if not model_dir:
+        raise LaunchFailed("MODEL_DIR environment must set")
+    # logging uses %-style placeholders; "{}" with an argument is a format error.
+    logging.info("MODEL_DIR is %s", model_dir)
+    app.cosyvoice = CosyVoice(model_dir)
+    # sft usage
+    logging.info("Avaliable speakers %s", app.cosyvoice.list_avaliable_spks())
+    yield
+
+app = FastAPI(lifespan=lifespan)
+
+# Origins that are allowed to access the API
+origins = ["*"]  # "*" means every origin; can be replaced by specific allowed addresses.
+app.add_middleware(
+    CORSMiddleware,  # imported from fastapi.middleware.cors
+    allow_origins=origins,  # allowed request origins
+    allow_credentials=True,
+    allow_methods=["*"],  # HTTP methods allowed cross-origin, e.g. GET, POST, PUT.
+    allow_headers=["*"])  # headers allowed cross-origin; can be used e.g. to identify the source.
+
+def buildResponse(output):
+    """Encode ``output`` as MP3 at 22050 Hz and wrap it in an ``audio/mpeg`` response."""
+    mp3_buffer = io.BytesIO()
+    torchaudio.save(mp3_buffer, output, 22050, format="mp3")
+    # Rewind so the whole encoded stream is read back.
+    mp3_buffer.seek(0)
+    payload = mp3_buffer.read(-1)
+    return Response(content=payload, media_type="audio/mpeg")
+
+@app.post("/api/inference/sft")
+@app.get("/api/inference/sft")
+async def sft(tts: str = Form(), role: str = Form()):
+    """Synthesize ``tts`` with the speaker ``role`` and return the audio as MP3."""
+    # process_time() counts CPU time of this process, not wall-clock time.
+    start = time.process_time()
+    output = app.cosyvoice.inference_sft(tts, role)
+    end = time.process_time()
+    # logging uses %-style placeholders; "{}" with an argument is a format error.
+    logging.info("infer time is %s seconds", end-start)
+    return buildResponse(output['tts_speech'])
+
+class SpeechRequest(BaseModel):
+    """JSON body of the ``/v1/audio/speech`` endpoint."""
+    # Accepted but not read by the /v1/audio/speech handler in this file.
+    model: str
+    # Text to synthesize.
+    input: str
+    # Speaker passed to inference_sft.
+    voice: str
+
+@app.post("/v1/audio/speech")
+async def sft(request: Request, speech_request: SpeechRequest):
+    """Synthesize ``speech_request.input`` with the speaker ``speech_request.voice``.
+
+    NOTE(review): this function reuses the name ``sft`` of the
+    ``/api/inference/sft`` handler defined earlier in this file; both routes
+    stay registered, but the module-level name now refers to this function.
+    """
+    # Read the validated fields directly instead of converting to a dict first.
+    text = speech_request.input
+    voice = speech_request.voice
+
+    # process_time() counts CPU time of this process, not wall-clock time.
+    start = time.process_time()
+    output = app.cosyvoice.inference_sft(text, voice)
+    end = time.process_time()
+    # logging uses %-style placeholders; "{}" with an argument is a format error.
+    logging.info("infer time is %s seconds", end-start)
+    return buildResponse(output['tts_speech'])
+
+@app.post("/api/inference/zero-shot")
+async def zeroShot(tts: str = Form(), prompt: str = Form(), audio: UploadFile = File()):
+    """Synthesize ``tts`` using the uploaded ``audio`` and its transcript ``prompt`` as the voice prompt."""
+    # process_time() counts CPU time of this process, not wall-clock time.
+    start = time.process_time()
+    prompt_speech = load_wav(audio.file, 16000)
+    # Round-trip through int16 samples, so the prompt is quantized to 16 bit.
+    prompt_audio = (prompt_speech.numpy() * (2**15)).astype(np.int16).tobytes()
+    prompt_speech_16k = torch.from_numpy(np.array(np.frombuffer(prompt_audio, dtype=np.int16))).unsqueeze(dim=0)
+    prompt_speech_16k = prompt_speech_16k.float() / (2**15)
+
+    output = app.cosyvoice.inference_zero_shot(tts, prompt, prompt_speech_16k)
+    end = time.process_time()
+    # logging uses %-style placeholders; "{}" with an argument is a format error.
+    logging.info("infer time is %s seconds", end-start)
+    return buildResponse(output['tts_speech'])
+
+@app.post("/api/inference/cross-lingual")
+async def crossLingual(tts: str = Form(), audio: UploadFile = File()):
+    """Synthesize ``tts`` using the uploaded ``audio`` as the voice prompt."""
+    # process_time() counts CPU time of this process, not wall-clock time.
+    start = time.process_time()
+    prompt_speech = load_wav(audio.file, 16000)
+    # Round-trip through int16 samples, so the prompt is quantized to 16 bit.
+    prompt_audio = (prompt_speech.numpy() * (2**15)).astype(np.int16).tobytes()
+    prompt_speech_16k = torch.from_numpy(np.array(np.frombuffer(prompt_audio, dtype=np.int16))).unsqueeze(dim=0)
+    prompt_speech_16k = prompt_speech_16k.float() / (2**15)
+
+    output = app.cosyvoice.inference_cross_lingual(tts, prompt_speech_16k)
+    end = time.process_time()
+    # logging uses %-style placeholders; "{}" with an argument is a format error.
+    logging.info("infer time is %s seconds", end-start)
+    return buildResponse(output['tts_speech'])
+
+@app.post("/api/inference/instruct")
+@app.get("/api/inference/instruct")
+async def instruct(tts: str = Form(), role: str = Form(), instruct: str = Form()):
+    """Synthesize ``tts`` with the speaker ``role`` following the ``instruct`` text."""
+    # process_time() counts CPU time of this process, not wall-clock time.
+    start = time.process_time()
+    output = app.cosyvoice.inference_instruct(tts, role, instruct)
+    end = time.process_time()
+    # logging uses %-style placeholders; "{}" with an argument is a format error.
+    logging.info("infer time is %s seconds", end-start)
+    return buildResponse(output['tts_speech'])
+
+@app.get("/api/roles")
+async def roles():
+    """Return the speakers reported by the loaded model under the key ``roles``."""
+    speakers = app.cosyvoice.list_avaliable_spks()
+    return {"roles": speakers}
+
+@app.get("/", response_class=HTMLResponse)
+async def root():
+    """Serve a static HTML page that describes the API usage and links to ./docs."""
+    return """
+    <!DOCTYPE html>
+    <html lang=zh-cn>
+        <head>
+            <meta charset=utf-8>
+            <title>Api information</title>
+        </head>
+        <body>
+            Get the supported tones from the Roles API first, then enter the tones and textual content in the TTS API for synthesis. <a href='./docs'>Documents of API</a>
+        </body>
+    </html>
+    """
--- a/plugins/model/tts-cosevoice/grpc/client.py
+++ b/plugins/model/tts-cosevoice/grpc/client.py
@@ -0,0 +1,102 @@
+# Copyright (c) 2024 Alibaba Inc (authors: Xiang Lyu)
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import os
+import sys
+ROOT_DIR = os.path.dirname(os.path.abspath(__file__))
+sys.path.append('{}/../../..'.format(ROOT_DIR))
+sys.path.append('{}/../../../third_party/Matcha-TTS'.format(ROOT_DIR))
+import logging
+import argparse
+import torchaudio
+import cosyvoice_pb2
+import cosyvoice_pb2_grpc
+import grpc
+import torch
+import numpy as np
+from cosyvoice.utils.file_utils import load_wav
+
+
+def main():
+    with grpc.insecure_channel("{}:{}".format(args.host, args.port)) as channel:
+        stub = cosyvoice_pb2_grpc.CosyVoiceStub(channel)
+        request = cosyvoice_pb2.Request()
+        if args.mode == 'sft':
+            logging.info('send sft request')
+            sft_request = cosyvoice_pb2.sftRequest()
+            sft_request.spk_id = args.spk_id
+            sft_request.tts_text = args.tts_text
+            request.sft_request.CopyFrom(sft_request)
+        elif args.mode == 'zero_shot':
+            logging.info('send zero_shot request')
+            zero_shot_request = cosyvoice_pb2.zeroshotRequest()
+            zero_shot_request.tts_text = args.tts_text
+            zero_shot_request.prompt_text = args.prompt_text
+            prompt_speech = load_wav(args.prompt_wav, 16000)
+            zero_shot_request.prompt_audio = (prompt_speech.numpy() * (2**15)).astype(np.int16).tobytes()
+            request.zero_shot_request.CopyFrom(zero_shot_request)
+        elif args.mode == 'cross_lingual':
+            logging.info('send cross_lingual request')
+            cross_lingual_request = cosyvoice_pb2.crosslingualRequest()
+            cross_lingual_request.tts_text = args.tts_text
+            prompt_speech = load_wav(args.prompt_wav, 16000)
+            cross_lingual_request.prompt_audio = (prompt_speech.numpy() * (2**15)).astype(np.int16).tobytes()
+            request.cross_lingual_request.CopyFrom(cross_lingual_request)
+        else:
+            logging.info('send instruct request')
+            instruct_request = cosyvoice_pb2.instructRequest()
+            instruct_request.tts_text = args.tts_text
+            instruct_request.spk_id = args.spk_id
+            instruct_request.instruct_text = args.instruct_text
+            request.instruct_request.CopyFrom(instruct_request)
+
+        response = stub.Inference(request)
+        logging.info('save response to {}'.format(args.tts_wav))
+        tts_speech = torch.from_numpy(np.array(np.frombuffer(response.tts_audio, dtype=np.int16))).unsqueeze(dim=0)
+        torchaudio.save(args.tts_wav, tts_speech, target_sr)
+        logging.info('get response')
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+    parser.add_argument('--host',
+                        type=str,
+                        default='0.0.0.0')
+    parser.add_argument('--port',
+                        type=int,
+                        default='50000')
+    parser.add_argument('--mode',
+                        default='sft',
+                        choices=['sft', 'zero_shot', 'cross_lingual', 'instruct'],
+                        help='request mode')
+    parser.add_argument('--tts_text',
+                        type=str,
+                        default='你好，我是通义千问语音合成大模型，请问有什么可以帮您的吗？')
+    parser.add_argument('--spk_id',
+                        type=str,
+                        default='中文女')
+    parser.add_argument('--prompt_text',
+                        type=str,
+                        default='希望你以后能够做的比我还好呦。')
+    parser.add_argument('--prompt_wav',
+                        type=str,
+                        default='../../../zero_shot_prompt.wav')
+    parser.add_argument('--instruct_text',
+                        type=str,
+                        default='Theo \'Crimson\', is a fiery, passionate rebel leader. Fights with fervor for justice, but struggles with impulsiveness.')
+    parser.add_argument('--tts_wav',
+                        type=str,
+                        default='demo.wav')
+    args = parser.parse_args()
+    prompt_sr, target_sr = 16000, 22050
+    main()
--- a/plugins/model/tts-cosevoice/grpc/cosyvoice.proto
+++ b/plugins/model/tts-cosevoice/grpc/cosyvoice.proto
@@ -0,0 +1,43 @@
+// Wire contract for the CosyVoice gRPC inference service.
+syntax = "proto3";
+
+package cosyvoice;
+// Go import path for code generated from this file.
+option go_package = "protos/";
+
+// Single unary RPC: one Request in, one Response with the synthesised audio out.
+service CosyVoice{
+  rpc Inference(Request) returns (Response) {}
+}
+
+// Envelope for an inference call. At most one of the payloads can be set;
+// the field that is set selects the inference mode.
+message Request{
+  oneof RequestPayload {
+    sftRequest sft_request = 1;
+    zeroshotRequest zero_shot_request = 2;
+    crosslingualRequest cross_lingual_request = 3;
+    instructRequest instruct_request = 4;
+  }
+}
+
+// Synthesis with a named speaker.
+message sftRequest{
+  string spk_id = 1;    // speaker identifier
+  string tts_text = 2;  // text to synthesise
+}
+
+// Synthesis that takes its voice from a prompt recording and its transcript.
+message zeroshotRequest{
+  string tts_text = 1;     // text to synthesise
+  string prompt_text = 2;  // text accompanying the prompt audio
+  bytes prompt_audio = 3;  // raw int16 PCM samples (the bundled client loads the wav at 16000 Hz)
+}
+
+// Synthesis that takes its voice from a prompt recording, without a transcript.
+message crosslingualRequest{
+  string tts_text = 1;     // text to synthesise
+  bytes prompt_audio = 2;  // raw int16 PCM samples (the bundled client loads the wav at 16000 Hz)
+}
+
+// Synthesis with a named speaker plus a free-text instruction.
+message instructRequest{
+  string tts_text = 1;       // text to synthesise
+  string spk_id = 2;         // speaker identifier
+  string instruct_text = 3;  // instruction text forwarded to the model
+}
+
+// Result of an Inference call.
+message Response{
+  bytes tts_audio = 1;  // raw int16 PCM samples (the bundled client saves them at 22050 Hz)
+}
--- a/plugins/model/tts-cosevoice/grpc/server.py
+++ b/plugins/model/tts-cosevoice/grpc/server.py
@@ -0,0 +1,83 @@
+# Copyright (c) 2024 Alibaba Inc (authors: Xiang Lyu)
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import os
+import sys
+ROOT_DIR = os.path.dirname(os.path.abspath(__file__))
+sys.path.append('{}/../../..'.format(ROOT_DIR))
+sys.path.append('{}/../../../third_party/Matcha-TTS'.format(ROOT_DIR))
+from concurrent import futures
+import argparse
+import cosyvoice_pb2
+import cosyvoice_pb2_grpc
+import logging
+logging.getLogger('matplotlib').setLevel(logging.WARNING)
+import grpc
+import torch
+import numpy as np
+from cosyvoice.cli.cosyvoice import CosyVoice
+
+# Configure the root logger at DEBUG level with "time level message" lines.
+logging.basicConfig(level=logging.DEBUG,
+                    format='%(asctime)s %(levelname)s %(message)s')
+
+class CosyVoiceServiceImpl(cosyvoice_pb2_grpc.CosyVoiceServicer):
+    def __init__(self, args):
+        self.cosyvoice = CosyVoice(args.model_dir)
+        logging.info('grpc service initialized')
+
+    def Inference(self, request, context):
+        if request.HasField('sft_request'):
+            logging.info('get sft inference request')
+            model_output = self.cosyvoice.inference_sft(request.sft_request.tts_text, request.sft_request.spk_id)
+        elif request.HasField('zero_shot_request'):
+            logging.info('get zero_shot inference request')
+            prompt_speech_16k = torch.from_numpy(np.array(np.frombuffer(request.zero_shot_request.prompt_audio, dtype=np.int16))).unsqueeze(dim=0)
+            prompt_speech_16k = prompt_speech_16k.float() / (2**15)
+            model_output = self.cosyvoice.inference_zero_shot(request.zero_shot_request.tts_text, request.zero_shot_request.prompt_text, prompt_speech_16k)
+        elif request.HasField('cross_lingual_request'):
+            logging.info('get cross_lingual inference request')
+            prompt_speech_16k = torch.from_numpy(np.array(np.frombuffer(request.cross_lingual_request.prompt_audio, dtype=np.int16))).unsqueeze(dim=0)
+            prompt_speech_16k = prompt_speech_16k.float() / (2**15)
+            model_output = self.cosyvoice.inference_cross_lingual(request.cross_lingual_request.tts_text, prompt_speech_16k)
+        else:
+            logging.info('get instruct inference request')
+            model_output = self.cosyvoice.inference_instruct(request.instruct_request.tts_text, request.instruct_request.spk_id, request.instruct_request.instruct_text)
+
+        logging.info('send inference response')
+        response = cosyvoice_pb2.Response()
+        response.tts_audio = (model_output['tts_speech'].numpy() * (2 ** 15)).astype(np.int16).tobytes()
+        return response
+
+def main():
+    grpcServer = grpc.server(futures.ThreadPoolExecutor(max_workers=args.max_conc), maximum_concurrent_rpcs=args.max_conc)
+    cosyvoice_pb2_grpc.add_CosyVoiceServicer_to_server(CosyVoiceServiceImpl(args), grpcServer)
+    grpcServer.add_insecure_port('0.0.0.0:{}'.format(args.port))
+    grpcServer.start()
+    logging.info("server listening on 0.0.0.0:{}".format(args.port))
+    grpcServer.wait_for_termination()
+
+
+if __name__ == '__main__':
+    parser = argparse.ArgumentParser()
+    parser.add_argument('--port',
+                        type=int,
+                        default=50000)
+    parser.add_argument('--max_conc',
+                        type=int,
+                        default=4)
+    parser.add_argument('--model_dir',
+                        type=str,
+                        default='iic/CosyVoice-300M',
+                        help='local path or modelscope repo id')
+    args = parser.parse_args()
+    main()
--- a/plugins/model/tts-cosevoice/requirements.txt
+++ b/plugins/model/tts-cosevoice/requirements.txt
@@ -0,0 +1,29 @@
+--extra-index-url https://download.pytorch.org/whl/cu118
+conformer==0.3.2
+deepspeed==0.14.2; sys_platform == 'linux'
+diffusers==0.27.2
+gdown==5.1.0
+gradio==4.32.2
+grpcio==1.57.0
+grpcio-tools==1.57.0
+hydra-core==1.3.2
+HyperPyYAML==1.2.2
+inflect==7.3.1
+librosa==0.10.2
+lightning==2.2.4
+matplotlib==3.7.5
+modelscope==1.15.0
+networkx==3.1
+omegaconf==2.3.0
+onnxruntime-gpu; sys_platform == 'linux'
+onnxruntime; sys_platform == 'darwin' or sys_platform == 'win32'  # sys_platform is 'win32' on Windows, never 'windows'
+openai-whisper==20231117
+protobuf==4.25
+pydantic==2.7.0
+rich==13.7.1
+soundfile==0.12.1
+tensorboard
+wget==3.2
+fastapi==0.111.0
+fastapi-cli==0.0.4
+WeTextProcessing==1.0.3
				`@@ -0,0 +1 @@`
				`docker run -d -p 8000:8000 registry.cn-hangzhou.aliyuncs.com/luanshaotong/sensevoice:v0.1`