From 4358b6de4d8c6a6216870eab3e3009e702f1018b Mon Sep 17 00:00:00 2001
From: Archer <545436317@qq.com>
Date: Fri, 17 Nov 2023 00:03:05 +0800
Subject: [PATCH] Add whisper and tts ui (#484)

Co-authored-by: heheer <71265218+newfish-cmyk@users.noreply.github.com>
---
 .../content/docs/development/configuration.md |  24 +-
 packages/global/core/ai/model.d.ts            |   6 +
 packages/global/core/ai/model.ts              |   9 +-
 packages/service/common/file/upload/multer.ts |   4 +-
 packages/service/core/ai/config.ts            |   2 +-
 projects/app/data/config.json                 |   9 +-
 projects/app/package.json                     |   2 +-
 projects/app/public/locales/en/common.json    |  14 +-
 projects/app/public/locales/zh/common.json    |  14 +-
 .../src/components/ChatBox/MessageInput.tsx   | 230 ++++++++++++++++++
 projects/app/src/components/ChatBox/index.tsx | 146 +++--------
 .../Icon/icons/core/app/headphones.svg        |   3 +
 .../components/Icon/icons/core/app/tts.svg    |   3 +
 .../Icon/icons/core/chat/recordFill.svg       |   4 +-
 .../Icon/icons/core/chat/speaking.svg         |  11 +
 .../Icon/icons/core/chat/stopSpeech.svg       |   4 +
 .../Icon/icons/core/chat/stopSpeechFill.svg   |  16 +-
 projects/app/src/components/Icon/index.tsx    |   6 +-
 .../app/src/components/Markdown/img/Image.tsx |  27 +-
 projects/app/src/components/MyModal/index.tsx |  19 +-
 projects/app/src/components/Select/index.tsx  | 136 ++++++-----
 .../app/src/global/common/api/systemRes.d.ts  |  15 +-
 projects/app/src/global/core/chat/api.d.ts    |   1 +
 .../src/pages/api/core/chat/item/getSpeech.ts |  13 +-
 .../app/src/pages/api/system/getInitData.ts   |  75 ++----
 .../src/pages/api/v1/audio/transcriptions.ts  |  20 +-
 .../pages/app/detail/components/TTSSelect.tsx | 144 ++++++++---
 projects/app/src/pages/index.tsx              |   4 -
 .../src/service/support/wallet/bill/push.ts   |  36 ++-
 projects/app/src/types/index.d.ts             |   4 +-
 projects/app/src/web/common/api/fetch.ts      |   6 +
 .../app/src/web/common/hooks/useSpeech.ts     | 101 ++++++--
 projects/app/src/web/common/utils/voice.ts    |   6 +-
 projects/app/src/web/styles/theme.ts          |  25 ++
 34 files changed, 806 insertions(+), 333 deletions(-)
 create mode 100644 projects/app/src/components/ChatBox/MessageInput.tsx
 create mode 100644 projects/app/src/components/Icon/icons/core/app/headphones.svg
 create mode 100644 projects/app/src/components/Icon/icons/core/app/tts.svg
 create mode 100644 projects/app/src/components/Icon/icons/core/chat/speaking.svg
 create mode 100644 projects/app/src/components/Icon/icons/core/chat/stopSpeech.svg

diff --git a/docSite/content/docs/development/configuration.md b/docSite/content/docs/development/configuration.md
index 971ed48f5..c6766157d 100644
--- a/docSite/content/docs/development/configuration.md
+++ b/docSite/content/docs/development/configuration.md
@@ -123,13 +123,23 @@ weight: 520
     {
       "model": "tts-1",
       "name": "OpenAI TTS1",
-      "price": 0
-    },
-    {
-      "model": "tts-1-hd",
-      "name": "OpenAI TTS1HD",
-      "price": 0
+      "price": 0,
+      "baseUrl": "",
+      "key": "",
+      "voices": [
+        { "label": "Alloy", "value": "alloy", "bufferId": "openai-Alloy" },
+        { "label": "Echo", "value": "echo", "bufferId": "openai-Echo" },
+        { "label": "Fable", "value": "fable", "bufferId": "openai-Fable" },
+        { "label": "Onyx", "value": "onyx", "bufferId": "openai-Onyx" },
+        { "label": "Nova", "value": "nova", "bufferId": "openai-Nova" },
+        { "label": "Shimmer", "value": "shimmer", "bufferId": "openai-Shimmer" }
+      ]
     }
-  ]
+  ],
+  "WhisperModel": {
+    "model": "whisper-1",
+    "name": "Whisper1",
+    "price": 0
+  }
 }
 ```
diff --git a/packages/global/core/ai/model.d.ts b/packages/global/core/ai/model.d.ts
index 71c86c6f6..2d40c32a9 100644
--- a/packages/global/core/ai/model.d.ts
+++ b/packages/global/core/ai/model.d.ts
@@ -33,3 +33,9 @@ export type AudioSpeechModelType = {
   key?: string;
   voices: { label: string; value: string; bufferId: string }[];
 };
+
+export type WhisperModelType = {
+  model: string; // e.g. 'whisper-1'
+  name: string; // e.g. 'Whisper1'
+  price: number; // NOTE(review): billing unit is not visible here — confirm
+};
diff --git a/packages/global/core/ai/model.ts b/packages/global/core/ai/model.ts
index cbe8591e7..021ee1a7f 100644
--- a/packages/global/core/ai/model.ts
+++ b/packages/global/core/ai/model.ts
@@ -3,7 +3,8 @@ import type {
   ChatModelItemType,
   FunctionModelItemType,
   VectorModelItemType,
-  AudioSpeechModelType
+  AudioSpeechModelType,
+  WhisperModelType
 } from './model.d';
 
 export const defaultChatModels: ChatModelItemType[] = [
@@ -116,3 +117,9 @@ export const defaultAudioSpeechModels: AudioSpeechModelType[] = [
     ]
   }
 ];
+
+export const defaultWhisperModel: WhisperModelType = {
+  model: 'whisper-1',
+  name: 'Whisper1',
+  price: 0 // mirrors the "WhisperModel" entry in projects/app/data/config.json
+};
diff --git a/packages/service/common/file/upload/multer.ts b/packages/service/common/file/upload/multer.ts
index ee0cb8ecd..a20fd7e19 100644
--- a/packages/service/common/file/upload/multer.ts
+++ b/packages/service/common/file/upload/multer.ts
@@ -32,10 +32,10 @@ export function getUploadModel({ maxSize = 500 }: { maxSize?: number }) {
       })
     }).any();
 
-    async doUpload(req: NextApiRequest, res: NextApiResponse) {
+    async doUpload<T = Record<string, any>>(req: NextApiRequest, res: NextApiResponse) {
       return new Promise<{
         files: FileType[];
-        metadata: Record<string, any>;
+        metadata: T;
         bucketName?: `${BucketNameEnum}`;
       }>((resolve, reject) => {
         // @ts-ignore
diff --git a/packages/service/core/ai/config.ts b/packages/service/core/ai/config.ts
index 3c443db01..e13a174cc 100644
--- a/packages/service/core/ai/config.ts
+++ b/packages/service/core/ai/config.ts
@@ -6,7 +6,7 @@ export const baseUrl = process.env.ONEAPI_URL || openaiBaseUrl;
 
 export const systemAIChatKey = process.env.CHAT_API_KEY || '';
 
-export const getAIApi = (props?: UserModelSchema['openaiAccount'], timeout = 6000) => {
+export const getAIApi = (props?: UserModelSchema['openaiAccount'], timeout = 60000) => {
   return new OpenAI({
     apiKey: props?.key || systemAIChatKey,
     baseURL: props?.baseUrl || baseUrl,
diff --git a/projects/app/data/config.json b/projects/app/data/config.json
index fb3388d37..f675dc51e 100644
--- a/projects/app/data/config.json
+++ b/projects/app/data/config.json
@@ -103,7 +103,7 @@
       "model": "tts-1",
       "name": "OpenAI TTS1",
       "price": 0,
-      "baseUrl": "https://api.openai.com/v1",
+      "baseUrl": "",
       "key": "",
       "voices": [
         { "label": "Alloy", "value": "alloy", "bufferId": "openai-Alloy" },
@@ -114,5 +114,10 @@
         { "label": "Shimmer", "value": "shimmer", "bufferId": "openai-Shimmer" }
       ]
     }
-  ]
+  ],
+  "WhisperModel": {
+    "model": "whisper-1",
+    "name": "Whisper1",
+    "price": 0
+  }
 }
diff --git a/projects/app/package.json b/projects/app/package.json
index 041439161..1ddb71266 100644
--- a/projects/app/package.json
+++ b/projects/app/package.json
@@ -1,6 +1,6 @@
 {
   "name": "app",
-  "version": "4.6",
+  "version": "4.6.1",
   "private": false,
   "scripts": {
     "dev": "next dev",
diff --git a/projects/app/public/locales/en/common.json b/projects/app/public/locales/en/common.json
index 671a828a5..b8c2118e8 100644
--- a/projects/app/public/locales/en/common.json
+++ b/projects/app/public/locales/en/common.json
@@ -73,6 +73,7 @@
     "Complete Response": "Complete Response",
     "Confirm to clear history": "Confirm to clear history?",
     "Confirm to clear share chat history": " Are you sure to delete all chats?",
+    "Converting to text": "Converting to text...",
     "Exit Chat": "Exit",
     "Feedback Close": "Close Feedback",
     "Feedback Failed": "Feedback Failed",
@@ -216,12 +217,15 @@
     "app": {
       "Next Step Guide": "Next step guide",
       "Question Guide Tip": "At the end of the conversation, three leading questions will be asked.",
+      "Select TTS": "Select TTS",
       "TTS": "Audio Speech",
       "TTS Tip": "After this function is enabled, the voice playback function can be used after each conversation. Use of this feature may incur additional charges.",
       "tts": {
         "Close": "NoUse",
         "Model alloy": "Female - Alloy",
         "Model echo": "Male - Echo",
+        "Speech model": "Speech model",
+        "Speech speed": "Speed",
         "Test Listen": "Test",
         "Test Listen Text": "Hello, this is FastGPT, how can I help you?",
         "Web": "Browser (free)"
@@ -229,10 +233,15 @@
     },
     "chat": {
       "Audio Speech Error": "Audio Speech Error",
+      "Speaking": "I'm listening...",
       "Record": "Speech",
       "Restart": "Restart",
       "Send Message": "Send Message",
-      "Stop Speak": "Stop Speak"
+      "Stop Speak": "Stop Speak",
+      "Type a message": "Enter your question",
+      "tts": {
+        "Stop Speech": "Stop"
+      }
     },
     "dataset": {
       "Choose Dataset": "Choose Dataset",
@@ -580,7 +589,8 @@
   "wallet": {
     "bill": {
       "Audio Speech": "Audio Speech",
-      "bill username": "User"
+      "bill username": "User",
+      "Whisper": "Whisper"
     }
   }
 }
diff --git a/projects/app/public/locales/zh/common.json b/projects/app/public/locales/zh/common.json
index 474ccd724..881bd4a37 100644
--- a/projects/app/public/locales/zh/common.json
+++ b/projects/app/public/locales/zh/common.json
@@ -73,6 +73,7 @@
     "Complete Response": "完整响应",
     "Confirm to clear history": "确认清空该应用的在线聊天记录？分享和 API 调用的记录不会被清空。",
     "Confirm to clear share chat history": "确认删除所有聊天记录？",
+    "Converting to text": "正在转换为文本...",
     "Exit Chat": "退出聊天",
     "Feedback Close": "关闭反馈",
     "Feedback Failed": "提交反馈异常",
@@ -216,12 +217,15 @@
     "app": {
       "Next Step Guide": "下一步指引",
       "Question Guide Tip": "对话结束后，会为生成 3 个引导性问题。",
+      "Select TTS": "选择语音播放模式",
       "TTS": "语音播报",
       "TTS Tip": "开启后，每次对话后可使用语音播放功能。使用该功能可能产生额外费用。",
       "tts": {
         "Close": "不使用",
         "Model alloy": "女声 - Alloy",
         "Model echo": "男声 - Echo",
+        "Speech model": "语音模型",
+        "Speech speed": "语速",
         "Test Listen": "试听",
         "Test Listen Text": "你好，我是 FastGPT，有什么可以帮助你么？",
         "Web": "浏览器自带(免费)"
@@ -232,7 +236,12 @@
       "Record": "语音输入",
       "Restart": "重开对话",
       "Send Message": "发送",
-      "Stop Speak": "停止录音"
+      "Speaking": "我在听，请说...",
+      "Stop Speak": "停止录音",
+      "Type a message": "输入问题",
+      "tts": {
+        "Stop Speech": "停止"
+      }
     },
     "dataset": {
       "Choose Dataset": "关联知识库",
@@ -580,7 +589,8 @@
   "wallet": {
     "bill": {
       "Audio Speech": "语音播报",
-      "bill username": "用户"
+      "bill username": "用户",
+      "Whisper": "语音输入"
     }
   }
 }
diff --git a/projects/app/src/components/ChatBox/MessageInput.tsx b/projects/app/src/components/ChatBox/MessageInput.tsx
new file mode 100644
index 000000000..28e77fd90
--- /dev/null
+++ b/projects/app/src/components/ChatBox/MessageInput.tsx
@@ -0,0 +1,257 @@
+import { useSpeech } from '@/web/common/hooks/useSpeech';
+import { useSystemStore } from '@/web/common/system/useSystemStore';
+import { Box, Flex, Spinner, Textarea } from '@chakra-ui/react';
+import React, { useRef, useEffect } from 'react';
+import { useTranslation } from 'react-i18next';
+import MyTooltip from '../MyTooltip';
+import MyIcon from '../Icon';
+import styles from './index.module.scss';
+import { useRouter } from 'next/router';
+
+/**
+ * Chat input bar: an auto-growing textarea plus a voice-input toggle and a send/stop button.
+ *
+ * @param onChange - Called with the current textarea value on every change.
+ * @param onSendMessage - Called with the textarea value when Enter (without Shift) is pressed
+ *   on PC or inside an iframe, or when the send button is clicked while there is input.
+ * @param onStop - Called when the button is clicked while `isChatting` is true.
+ * @param isChatting - Shows the stop icon instead of the send icon and hides voice input.
+ * @param TextareaDom - Ref attached to the textarea; its current value is what gets sent.
+ * @param resetInputVal - Handed to `startSpeak`; presumably receives the transcribed text
+ *   (TODO confirm against `useSpeech`).
+ */
+const MessageInput = ({
+  onChange,
+  onSendMessage,
+  onStop,
+  isChatting,
+  TextareaDom,
+  resetInputVal
+}: {
+  onChange: (e: string) => void;
+  onSendMessage: (e: string) => void;
+  onStop: () => void;
+  isChatting: boolean;
+  TextareaDom: React.MutableRefObject<HTMLTextAreaElement | null>;
+  resetInputVal: (val: string) => void;
+}) => {
+  const { shareId } = useRouter().query as { shareId?: string };
+  const {
+    isSpeaking,
+    isTransCription,
+    stopSpeak,
+    startSpeak,
+    speakingTimeString,
+    renderAudioGraph,
+    stream
+  } = useSpeech({ shareId });
+  const { isPc } = useSystemStore();
+  const canvasRef = useRef<HTMLCanvasElement>(null);
+  const { t } = useTranslation();
+  const textareaMinH = '22px';
+  // Read straight from the DOM node, so it only updates when this component re-renders.
+  const havInput = !!TextareaDom.current?.value;
+
+  // Feed the microphone stream into an analyser and draw it on the canvas every frame.
+  useEffect(() => {
+    if (!stream) {
+      return;
+    }
+    const audioContext = new AudioContext();
+    const analyser = audioContext.createAnalyser();
+    analyser.fftSize = 4096;
+    analyser.smoothingTimeConstant = 1;
+    const source = audioContext.createMediaStreamSource(stream);
+    source.connect(analyser);
+
+    let frameId = 0;
+    const renderCurve = () => {
+      // The canvas is rendered conditionally, so it may be absent on a given frame.
+      if (canvasRef.current) {
+        renderAudioGraph(analyser, canvasRef.current);
+      }
+      frameId = window.requestAnimationFrame(renderCurve);
+    };
+    renderCurve();
+
+    // Stop the draw loop and release the audio graph when the stream changes or on unmount.
+    return () => {
+      window.cancelAnimationFrame(frameId);
+      source.disconnect();
+      // close() can reject if the context is already closed; there is nothing to recover.
+      audioContext.close().catch(() => undefined);
+    };
+  }, [renderAudioGraph, stream]);
+
+  return (
+    <Box m={['0 auto', '10px auto']} w={'100%'} maxW={['auto', 'min(800px, 100%)']} px={[0, 5]}>
+      <Box
+        py={'18px'}
+        position={'relative'}
+        boxShadow={isSpeaking ? `0 0 10px rgba(54,111,255,0.4)` : `0 0 10px rgba(0,0,0,0.2)`}
+        {...(isPc
+          ? {
+              border: '1px solid',
+              borderColor: 'rgba(0,0,0,0.12)'
+            }
+          : {
+              borderTop: '1px solid',
+              borderTopColor: 'rgba(0,0,0,0.15)'
+            })}
+        borderRadius={['none', 'md']}
+        backgroundColor={'white'}
+      >
+        {/* translate loading */}
+        <Box
+          position={'absolute'}
+          top={0}
+          bottom={0}
+          left={4}
+          right={['8px', '4px']}
+          zIndex={10}
+          display={'flex'}
+          alignItems={'center'}
+          bg={'white'}
+          pl={['5px', '10px']}
+          color="rgba(54,111,255,0.6)"
+          visibility={isSpeaking && isTransCription ? 'visible' : 'hidden'}
+        >
+          <Spinner size={'sm'} mr={4} />
+          {t('chat.Converting to text')}
+        </Box>
+        {/* input area */}
+        <Textarea
+          ref={TextareaDom}
+          py={0}
+          pr={['45px', '55px']}
+          border={'none'}
+          _focusVisible={{
+            border: 'none'
+          }}
+          placeholder={isSpeaking ? t('core.chat.Speaking') : t('core.chat.Type a message')}
+          resize={'none'}
+          rows={1}
+          height={textareaMinH}
+          lineHeight={'22px'}
+          maxHeight={'150px'}
+          maxLength={-1}
+          overflowY={'auto'}
+          whiteSpace={'pre-wrap'}
+          wordBreak={'break-all'}
+          boxShadow={'none !important'}
+          color={'myGray.900'}
+          isDisabled={isSpeaking}
+          onChange={(e) => {
+            // Collapse first so scrollHeight reflects the content, then grow to fit.
+            const textarea = e.target;
+            textarea.style.height = textareaMinH;
+            textarea.style.height = `${textarea.scrollHeight}px`;
+            onChange(textarea.value);
+          }}
+          onKeyDown={(e) => {
+            // Enter (without Shift) sends the message on PC or when embedded in an iframe.
+            if ((isPc || window !== parent) && e.keyCode === 13 && !e.shiftKey) {
+              onSendMessage(TextareaDom.current?.value || '');
+              e.preventDefault();
+            }
+            // Ctrl+A selects the whole input.
+            if (e.key === 'a' && e.ctrlKey) {
+              e.currentTarget.select();
+            }
+          }}
+        />
+        <Flex
+          position={'absolute'}
+          alignItems={'center'}
+          right={['12px', '14px']}
+          bottom={['15px', '13px']}
+        >
+          {/* voice-input */}
+          {!shareId && !havInput && !isChatting && (
+            <>
+              <canvas
+                ref={canvasRef}
+                style={{
+                  height: '30px',
+                  width: isSpeaking && !isTransCription ? '100px' : 0,
+                  background: 'white',
+                  zIndex: 0
+                }}
+              />
+              <Flex
+                mr={2}
+                alignItems={'center'}
+                justifyContent={'center'}
+                h={['26px', '32px']}
+                w={['26px', '32px']}
+                borderRadius={'md'}
+                cursor={'pointer'}
+                _hover={{ bg: '#F5F5F8' }}
+                onClick={() => {
+                  if (isSpeaking) {
+                    return stopSpeak();
+                  }
+                  startSpeak(resetInputVal);
+                }}
+              >
+                <MyTooltip label={isSpeaking ? t('core.chat.Stop Speak') : t('core.chat.Record')}>
+                  <MyIcon
+                    name={isSpeaking ? 'core/chat/stopSpeechFill' : 'core/chat/recordFill'}
+                    width={['20px', '22px']}
+                    height={['20px', '22px']}
+                    color={'myBlue.600'}
+                  />
+                </MyTooltip>
+              </Flex>
+            </>
+          )}
+          {/* send and stop icon */}
+          {isSpeaking ? (
+            <Box color={'#5A646E'}>{speakingTimeString}</Box>
+          ) : (
+            <Flex
+              alignItems={'center'}
+              justifyContent={'center'}
+              h={['28px', '32px']}
+              w={['28px', '32px']}
+              borderRadius={'md'}
+              bg={isChatting ? '' : havInput ? 'myBlue.600' : '#E5E5E5'}
+              cursor={isChatting || havInput ? 'pointer' : 'not-allowed'}
+              lineHeight={1}
+              onClick={() => {
+                if (isChatting) {
+                  return onStop();
+                }
+                if (havInput) {
+                  onSendMessage(TextareaDom.current?.value || '');
+                }
+              }}
+            >
+              {isChatting ? (
+                <MyIcon
+                  className={styles.stopIcon}
+                  width={['22px', '25px']}
+                  height={['22px', '25px']}
+                  cursor={'pointer'}
+                  name={'stop'}
+                  color={'gray.500'}
+                />
+              ) : (
+                <MyTooltip label={t('core.chat.Send Message')}>
+                  <MyIcon
+                    name={'core/chat/sendFill'}
+                    width={['18px', '20px']}
+                    height={['18px', '20px']}
+                    color={'white'}
+                  />
+                </MyTooltip>
+              )}
+            </Flex>
+          )}
+        </Flex>
+      </Box>
+    </Box>
+  );
+};
+
+export default React.memo(MessageInput);
diff --git a/projects/app/src/components/ChatBox/index.tsx b/projects/app/src/components/ChatBox/index.tsx
index 247729ba0..c8e150ad7 100644
--- a/projects/app/src/components/ChatBox/index.tsx
+++ b/projects/app/src/components/ChatBox/index.tsx
@@ -26,7 +26,8 @@ import {
   Button,
   useTheme,
   BoxProps,
-  FlexProps
+  FlexProps,
+  Spinner
 } from '@chakra-ui/react';
 import { feConfigs } from '@/web/common/system/staticData';
 import { eventBus } from '@/web/common/utils/eventbus';
@@ -62,7 +63,7 @@ import styles from './index.module.scss';
 import { postQuestionGuide } from '@/web/core/ai/api';
 import { splitGuideModule } from '@/global/core/app/modules/utils';
 import { AppTTSConfigType } from '@/types/app';
-import { useSpeech } from '@/web/common/hooks/useSpeech';
+import MessageInput from './MessageInput';
 
 const nanoid = customAlphabet('abcdefghijklmnopqrstuvwxyz1234567890', 24);
 
@@ -150,8 +151,6 @@ const ChatBox = (
   const [adminMarkData, setAdminMarkData] = useState<AdminMarkType & { chatItemId: string }>();
   const [questionGuides, setQuestionGuide] = useState<string[]>([]);
 
-  const { isSpeaking, startSpeak, stopSpeak } = useSpeech();
-
   const isChatting = useMemo(
     () =>
       chatHistory[chatHistory.length - 1] &&
@@ -241,6 +240,7 @@ const ChatBox = (
         TextareaDom.current.style.height =
           val === '' ? textareaMinH : `${TextareaDom.current.scrollHeight}px`;
       }
+      setRefresh((state) => !state);
     }, 100);
   }, []);
 
@@ -795,110 +795,18 @@ const ChatBox = (
       </Box>
       {/* message input */}
       {onStartChat && variableIsFinish && active ? (
-        <Box m={['0 auto', '10px auto']} w={'100%'} maxW={['auto', 'min(800px, 100%)']} px={[0, 5]}>
-          <Box
-            py={'18px'}
-            position={'relative'}
-            boxShadow={`0 0 10px rgba(0,0,0,0.2)`}
-            {...(isPc
-              ? {
-                  border: '1px solid',
-                  borderColor: 'rgba(0,0,0,0.12)'
-                }
-              : {
-                  borderTop: '1px solid',
-                  borderTopColor: 'rgba(0,0,0,0.15)'
-                })}
-            borderRadius={['none', 'md']}
-            backgroundColor={'white'}
-          >
-            {/* 输入框 */}
-            <Textarea
-              ref={TextareaDom}
-              py={0}
-              pr={['45px', '55px']}
-              border={'none'}
-              _focusVisible={{
-                border: 'none'
-              }}
-              placeholder="提问"
-              resize={'none'}
-              rows={1}
-              height={'22px'}
-              lineHeight={'22px'}
-              maxHeight={'150px'}
-              maxLength={-1}
-              overflowY={'auto'}
-              whiteSpace={'pre-wrap'}
-              wordBreak={'break-all'}
-              boxShadow={'none !important'}
-              color={'myGray.900'}
-              onChange={(e) => {
-                const textarea = e.target;
-                textarea.style.height = textareaMinH;
-                textarea.style.height = `${textarea.scrollHeight}px`;
-                setRefresh((state) => !state);
-              }}
-              onKeyDown={(e) => {
-                // enter send.(pc or iframe && enter and unPress shift)
-                if ((isPc || window !== parent) && e.keyCode === 13 && !e.shiftKey) {
-                  handleSubmit((data) => sendPrompt(data, TextareaDom.current?.value))();
-                  e.preventDefault();
-                }
-                // 全选内容
-                // @ts-ignore
-                e.key === 'a' && e.ctrlKey && e.target?.select();
-              }}
-            />
-            {/* 发送和等待按键 */}
-            <Flex
-              alignItems={'center'}
-              justifyContent={'center'}
-              h={['26px', '32px']}
-              w={['26px', '32px']}
-              position={'absolute'}
-              right={['12px', '14px']}
-              bottom={['15px', '13px']}
-              borderRadius={'md'}
-              // bg={TextareaDom.current?.value ? 'myBlue.600' : ''}
-              cursor={'pointer'}
-              lineHeight={1}
-              onClick={() => {
-                if (isChatting) {
-                  return chatController.current?.abort('stop');
-                }
-                if (TextareaDom.current?.value) {
-                  return handleSubmit((data) => sendPrompt(data, TextareaDom.current?.value))();
-                }
-                // speech
-                // if (isSpeaking) {
-                //   return stopSpeak();
-                // }
-                // startSpeak();
-              }}
-            >
-              {isChatting ? (
-                <MyIcon
-                  className={styles.stopIcon}
-                  width={['22px', '25px']}
-                  height={['22px', '25px']}
-                  cursor={'pointer'}
-                  name={'stop'}
-                  color={'gray.500'}
-                />
-              ) : (
-                <MyTooltip label={t('core.chat.Send Message')}>
-                  <MyIcon
-                    name={'core/chat/sendFill'}
-                    width={['16px', '22px']}
-                    height={['16px', '22px']}
-                    color={TextareaDom.current?.value ? 'myBlue.600' : 'myGray.400'}
-                  />
-                </MyTooltip>
-              )}
-            </Flex>
-          </Box>
-        </Box>
+        <MessageInput
+          onChange={(e) => {
+            setRefresh(!refresh);
+          }}
+          onSendMessage={(e) => {
+            handleSubmit((data) => sendPrompt(data, e))();
+          }}
+          onStop={() => chatController.current?.abort('stop')}
+          isChatting={isChatting}
+          TextareaDom={TextareaDom}
+          resetInputVal={resetInputVal}
+        />
       ) : null}
 
       {/* user feedback modal */}
@@ -1206,16 +1114,20 @@ function ChatController({
             <MyIcon {...controlIconStyle} name={'loading'} />
           </MyTooltip>
         ) : audioPlaying ? (
-          <MyTooltip label={'终止播放'}>
-            <MyIcon
-              {...controlIconStyle}
-              name={'pause'}
-              _hover={{ color: '#E74694' }}
-              onClick={() => cancelAudio()}
-            />
-          </MyTooltip>
+          <Flex alignItems={'center'} mr={2}>
+            <MyTooltip label={t('core.chat.tts.Stop Speech')}>
+              <MyIcon
+                {...controlIconStyle}
+                mr={1}
+                name={'core/chat/stopSpeech'}
+                _hover={{ color: '#E74694' }}
+                onClick={() => cancelAudio()}
+              />
+            </MyTooltip>
+            {/* <MyIcon name={'loading'} w={'16px'} /> */}
+          </Flex>
         ) : (
-          <MyTooltip label={'语音播报'}>
+          <MyTooltip label={t('core.app.TTS')}>
             <MyIcon
               {...controlIconStyle}
               name={'voice'}
diff --git a/projects/app/src/components/Icon/icons/core/app/headphones.svg b/projects/app/src/components/Icon/icons/core/app/headphones.svg
new file mode 100644
index 000000000..93230462f
--- /dev/null
+++ b/projects/app/src/components/Icon/icons/core/app/headphones.svg
@@ -0,0 +1,3 @@
+<svg xmlns="http://www.w3.org/2000/svg" width="12" height="12" viewBox="0 0 16 16" fill="none">
+  <path fill-rule="evenodd" clip-rule="evenodd" d="M8.00004 2.66665C6.58555 2.66665 5.229 3.22855 4.2288 4.22874C3.22861 5.22894 2.66671 6.58549 2.66671 7.99998V8.66665H4.00004C4.53047 8.66665 5.03918 8.87736 5.41425 9.25243C5.78933 9.62751 6.00004 10.1362 6.00004 10.6666V12.6666C6.00004 13.1971 5.78933 13.7058 5.41425 14.0809C5.03918 14.4559 4.53047 14.6666 4.00004 14.6666H3.33337C2.80294 14.6666 2.29423 14.4559 1.91916 14.0809C1.54409 13.7058 1.33337 13.1971 1.33337 12.6666V7.99998C1.33337 6.23187 2.03575 4.53618 3.286 3.28593C4.53624 2.03569 6.23193 1.33331 8.00004 1.33331C9.76815 1.33331 11.4638 2.03569 12.7141 3.28593C13.9643 4.53618 14.6667 6.23187 14.6667 7.99998V12.6666C14.6667 13.1971 14.456 13.7058 14.0809 14.0809C13.7058 14.4559 13.1971 14.6666 12.6667 14.6666H12C11.4696 14.6666 10.9609 14.4559 10.5858 14.0809C10.2108 13.7058 10 13.1971 10 12.6666V10.6666C10 10.1362 10.2108 9.62751 10.5858 9.25243C10.9609 8.87736 11.4696 8.66665 12 8.66665H13.3334V7.99998C13.3334 6.58549 12.7715 5.22894 11.7713 4.22874C10.7711 3.22855 9.41453 2.66665 8.00004 2.66665ZM13.3334 9.99998H12C11.8232 9.99998 11.6537 10.0702 11.5286 10.1952C11.4036 10.3203 11.3334 10.4898 11.3334 10.6666V12.6666C11.3334 12.8435 11.4036 13.013 11.5286 13.138C11.6537 13.2631 11.8232 13.3333 12 13.3333H12.6667C12.8435 13.3333 13.0131 13.2631 13.1381 13.138C13.2631 13.013 13.3334 12.8435 13.3334 12.6666V9.99998ZM2.66671 12.6666C2.66671 12.8435 2.73695 13.013 2.86197 13.138C2.98699 13.2631 3.15656 13.3333 3.33337 13.3333H4.00004C4.17685 13.3333 4.34642 13.2631 4.47144 13.138C4.59647 13.013 4.66671 12.8435 4.66671 12.6666V10.6666C4.66671 10.4898 4.59647 10.3203 4.47144 10.1952C4.34642 10.0702 4.17685 9.99998 4.00004 9.99998H2.66671V12.6666Z" fill="white"/>
+</svg>
\ No newline at end of file
diff --git a/projects/app/src/components/Icon/icons/core/app/tts.svg b/projects/app/src/components/Icon/icons/core/app/tts.svg
new file mode 100644
index 000000000..2679946e8
--- /dev/null
+++ b/projects/app/src/components/Icon/icons/core/app/tts.svg
@@ -0,0 +1,3 @@
+<svg xmlns="http://www.w3.org/2000/svg" width="20" height="16" viewBox="0 0 22 18" fill="none">
+  <path fill-rule="evenodd" clip-rule="evenodd" d="M4.63694 1.22278C5.02752 1.61324 5.02762 2.24641 4.63715 2.63699C2.94991 4.32474 2.00208 6.61352 2.00208 8.99999C2.00208 11.3865 2.94991 13.6752 4.63715 15.363C5.02762 15.7536 5.02752 16.3867 4.63694 16.7772C4.24636 17.1677 3.61319 17.1676 3.22273 16.777C1.16054 14.7142 0.0020752 11.9168 0.0020752 8.99999C0.0020752 6.08319 1.16054 3.2858 3.22273 1.22299C3.61319 0.832409 4.24636 0.832314 4.63694 1.22278ZM17.3629 1.22278C17.7535 0.832314 18.3867 0.832409 18.7772 1.22299C20.8393 3.2858 21.9978 6.08319 21.9978 8.99999C21.9978 11.9168 20.8393 14.7142 18.7772 16.777C18.3867 17.1676 17.7535 17.1677 17.3629 16.7772C16.9724 16.3867 16.9723 15.7536 17.3627 15.363C19.05 13.6752 19.9978 11.3865 19.9978 8.99999C19.9978 6.61352 19.05 4.32474 17.3627 2.63699C16.9723 2.24641 16.9724 1.61324 17.3629 1.22278ZM7.46744 4.04328C7.85775 4.43402 7.8574 5.06719 7.46665 5.45749C7.00177 5.92186 6.63298 6.4733 6.38135 7.08029C6.12973 7.68728 6.00022 8.33792 6.00022 8.99499C6.00022 9.65207 6.12973 10.3027 6.38135 10.9097C6.63298 11.5167 7.00177 12.0681 7.46665 12.5325C7.8574 12.9228 7.85775 13.556 7.46744 13.9467C7.07713 14.3374 6.44397 14.3378 6.05323 13.9475C5.40239 13.2974 4.88608 12.5254 4.53381 11.6756C4.18154 10.8258 4.00022 9.9149 4.00022 8.99499C4.00022 8.07508 4.18154 7.1642 4.53381 6.31441C4.88608 5.46462 5.40239 4.6926 6.05323 4.04249C6.44397 3.65219 7.07713 3.65254 7.46744 4.04328ZM14.5324 4.05328C14.9227 3.66254 15.5559 3.66219 15.9467 4.05249C16.5975 4.7026 17.1138 5.47462 17.4661 6.32441C17.8183 7.1742 17.9997 8.08509 17.9997 9.00499C17.9997 9.9249 17.8183 10.8358 17.4661 11.6856C17.1138 12.5354 16.5975 13.3074 15.9467 13.9575C15.5559 14.3478 14.9227 14.3474 14.5324 13.9567C14.1421 13.566 14.1425 12.9328 14.5332 12.5425C14.9981 12.0781 15.3669 11.5267 15.6185 10.9197C15.8701 10.3127 15.9997 9.66207 15.9997 9.00499C15.9997 8.34792 15.8701 7.69728 15.6185 7.09029C15.3669 6.4833 14.9981 5.93186 14.5332 5.46749C14.1425 5.07719 
14.1421 4.44402 14.5324 4.05328ZM10.9999 7.99999C10.4477 7.99999 9.99994 8.44771 9.99994 8.99999C9.99994 9.55228 10.4477 9.99999 10.9999 9.99999C11.5522 9.99999 11.9999 9.55228 11.9999 8.99999C11.9999 8.44771 11.5522 7.99999 10.9999 7.99999ZM7.99994 8.99999C7.99994 7.34314 9.34309 5.99999 10.9999 5.99999C12.6568 5.99999 13.9999 7.34314 13.9999 8.99999C13.9999 10.6568 12.6568 12 10.9999 12C9.34309 12 7.99994 10.6568 7.99994 8.99999Z" fill="#3370FF"/>
+</svg>
\ No newline at end of file
diff --git a/projects/app/src/components/Icon/icons/core/chat/recordFill.svg b/projects/app/src/components/Icon/icons/core/chat/recordFill.svg
index 959dea3f7..23921cbc8 100644
--- a/projects/app/src/components/Icon/icons/core/chat/recordFill.svg
+++ b/projects/app/src/components/Icon/icons/core/chat/recordFill.svg
@@ -1 +1,3 @@
-<?xml version="1.0" standalone="no"?><!DOCTYPE svg PUBLIC "-//W3C//DTD SVG 1.1//EN" "http://www.w3.org/Graphics/SVG/1.1/DTD/svg11.dtd"><svg t="1699507042803" class="icon" viewBox="0 0 1024 1024" version="1.1" xmlns="http://www.w3.org/2000/svg" p-id="2849" xmlns:xlink="http://www.w3.org/1999/xlink" width="128" height="128"><path d="M512 628.50844445L512 628.50844445c106.79940741 0 194.18074075-87.38133333 194.18074075-194.18074075L706.18074075 201.31081482c0-106.79940741-87.38133333-194.18074075-194.18074075-194.18074074l0 0c-106.79940741 0-194.18074075 87.38133333-194.18074075 194.18074074l0 233.01688888C317.81925925 541.12711111 405.20059259 628.50844445 512 628.50844445z" p-id="2850"></path><path d="M857.39899259 488.21285925c3.2768-21.23851852-11.16539259-41.02068148-32.40391111-44.29748147-21.23851852-3.15543703-41.02068148 11.28675555-44.29748148 32.40391111C760.30862222 607.39128889 644.89244445 706.18074075 512 706.18074075c-132.89244445 0-248.42998518-98.91081482-268.6976-229.98281483-3.2768-21.23851852-23.18032592-35.68071111-44.29748148-32.4039111-21.23851852 3.2768-35.68071111 23.05896297-32.40391111 44.29748148 24.51531852 158.37866667 150.49007408 276.46482963 306.56284444 293.45564445L473.16385185 900.36148148l-116.50844444 0c-21.48124445 0-38.83614815 17.3549037-38.83614816 38.83614815s17.3549037 38.83614815 38.83614816 38.83614815l310.68918518 0c21.48124445 0 38.83614815-17.3549037 38.83614816-38.83614815s-17.3549037-38.83614815-38.83614816-38.83614815l-116.50844444 0 0-118.81434073C706.78755555 764.55632592 832.88367408 646.59152592 857.39899259 488.21285925z" p-id="2851"></path></svg>
\ No newline at end of file
+<svg xmlns="http://www.w3.org/2000/svg" width="20" height="20" viewBox="0 0 20 20" fill="none">
+  <path fill-rule="evenodd" clip-rule="evenodd" d="M7.64302 0.976311C8.26814 0.351189 9.11599 0 10 0C10.8841 0 11.7319 0.351189 12.3571 0.976311C12.9822 1.60143 13.3334 2.44928 13.3334 3.33333V10C13.3334 10.8841 12.9822 11.7319 12.3571 12.357C11.7319 12.9821 10.8841 13.3333 10 13.3333C9.11599 13.3333 8.26814 12.9821 7.64302 12.357C7.0179 11.7319 6.66671 10.8841 6.66671 10V3.33333C6.66671 2.44928 7.0179 1.60143 7.64302 0.976311ZM10 1.66667C9.55801 1.66667 9.13409 1.84226 8.82153 2.15482C8.50897 2.46738 8.33337 2.89131 8.33337 3.33333V10C8.33337 10.442 8.50897 10.866 8.82153 11.1785C9.13409 11.4911 9.55801 11.6667 10 11.6667C10.4421 11.6667 10.866 11.4911 11.1786 11.1785C11.4911 10.866 11.6667 10.442 11.6667 10V3.33333C11.6667 2.89131 11.4911 2.46738 11.1786 2.15482C10.866 1.84226 10.4421 1.66667 10 1.66667ZM4.16671 7.5C4.62694 7.5 5.00004 7.8731 5.00004 8.33333V10C5.00004 11.3261 5.52682 12.5979 6.46451 13.5355C7.40219 14.4732 8.67396 15 10 15C11.3261 15 12.5979 14.4732 13.5356 13.5355C14.4733 12.5979 15 11.3261 15 10V8.33333C15 7.8731 15.3731 7.5 15.8334 7.5C16.2936 7.5 16.6667 7.8731 16.6667 8.33333V10C16.6667 11.7681 15.9643 13.4638 14.7141 14.714C13.6619 15.7662 12.2942 16.4304 10.8334 16.6144V18.3333H13.3334C13.7936 18.3333 14.1667 18.7064 14.1667 19.1667C14.1667 19.6269 13.7936 20 13.3334 20H6.66671C6.20647 20 5.83337 19.6269 5.83337 19.1667C5.83337 18.7064 6.20647 18.3333 6.66671 18.3333H9.16671V16.6144C7.70587 16.4304 6.33818 15.7662 5.286 14.714C4.03575 13.4638 3.33337 11.7681 3.33337 10V8.33333C3.33337 7.8731 3.70647 7.5 4.16671 7.5Z" fill="#485058"/>
+</svg>
\ No newline at end of file
diff --git a/projects/app/src/components/Icon/icons/core/chat/speaking.svg b/projects/app/src/components/Icon/icons/core/chat/speaking.svg
new file mode 100644
index 000000000..77eaa5819
--- /dev/null
+++ b/projects/app/src/components/Icon/icons/core/chat/speaking.svg
@@ -0,0 +1,11 @@
+<svg width="32" height="32" viewBox="0 0 32 32" fill="none" xmlns="http://www.w3.org/2000/svg"
+    xmlns:xlink="http://www.w3.org/1999/xlink">
+    <rect width="32" height="32" fill="url(#pattern0)" />
+    <defs>
+        <pattern id="pattern0" patternContentUnits="objectBoundingBox" width="1" height="1">
+            <use xlink:href="#image0_18_1411" transform="scale(0.00666667)" />
+        </pattern>
+        <image id="image0_18_1411" width="150" height="150"
+            xlink:href="data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAAJYAAACWCAYAAAA8AXHiAAAAAXNSR0IArs4c6QAABV1JREFUeF7tnEFy3DYQRTV738Y5gG/kHMW5UQ6Q3Mb7caFSqEAsggQo/j+Q/9NOLqob/fpNgyAlP974goCAwEMQk5AQeEMsJJAQQCwJVoIiFg5ICCCWBCtBEQsHJAQQS4KVoIiFAxICiCXBSlDEwgEJAcSSYCUoYuGAhABiSbASFLFwQEIAsSRYCYpYOCAhgFgSrARFLByQEEAsCVaCIhYOSAgglgQrQRELByQEEEuClaCIhQMSAoglwUpQxMIBCQHEkmAlKGLhgIQAYkmwEhSxcEBCALEkWAmKWDggIYBYEqwERSwckBBALAlWgiIWDkgIIJYEK0ERCwckBBBLgpWgiIUDEgKIJcFKUMTCAQkBxJJgJShi4YCEAGJJsBIUsXBAQgCxJFgJilg4ICGAWBKsBEUsHJAQQCwJVoIiFg5ICCCWBCtBEQsHJAQQS4KVoIiFAxICiCXBSlDEwgEJAcSSYCUoYuGAhMBSYn378/n8+fy/zn/+eiy1PkkHLgRtOa3KaJnGFViFcSvWl8fb298/kKu698f357Mw2TIq368m2BJi7UlVYBaI5Qu5/uOwnehVuBU5vVysnlTtDrHap/HC7vXhH+lJtSqnJcRqR/u2Ayt+Gj9syYUAZRtsf6x80zZvtduGl4u1BVbgbaGVf0ufWpXTO7sauRBr82ltxepBmxWrbq/tYeCqmNtYr4rzdTOxWoxlOiDWjlhbobbQZsTqnS5nDwH1BNbKeXVb3q6pxplZE2JN3j+UBt4l1tFBYFaKu05gZ4eT0QmIWDeLVcKVUT/SgLOT0+h2cRZndoKeHU5GHqcg1ovFareuvYPAiKC9A0WVfFTQ+uzpjjUh1oJiVSFGJ83e0b6WVW+UR+6P9rbBuu23x/ER2RFrUbFmttTe0b4VdETS7ZbaO/Ui1qQ0I5ef3bzPCHE0Iepa/h14sX20pplJc/QopZ2AiDViyuQ1CrHae5rtlPioWDPb6tHku0usKvrMfd9kiy5dvsST96PHDatNLMQa8+y3FEs9sUZlv3Ma927emVgd0e+E33sY2U7EO7ZCxDqfWkysHUZ3yX5XnLJEJta5zO+uuBP+3stnxc07E+u8yb/1xNo7FNy1FY7EKfjbZ1l76xl9XcXEOpfZNrFUYo3KsBWrFt6uazQWYi0gVvvS90oTjybN7O8+nT19H518iPVCsaoQvccNo00sP997Xzg6YVoMPblmYiHWi8XqyTUjVS1hT4grcXpb4sirnLoWxFpArMklfIrLEWuyTXc+bphM/akuR6zJdh39nlENdXX7mVzK0pf3PoC80jlo29HJaeYGd2kzbljc3jMxxDoBuycXUr2HdvRHqzMHgRscPw3x8ifvR8fy1WCd0jRd8Bn+V56lxDL1hTQGAohlgJyYArESu26oGbEMkBNTIFZi1w01I5YBcmIKxErsuqFmxDJATkyBWIldN9SMWAbIiSkQK7HrhpoRywA5MQViJXbdUDNiGSAnpkCsxK4bakYsA+TEFIiV2HVDzYhlgJyYArESu26oGbEMkBNTIFZi1w01I5YBcmIKxErsuqFmxDJATkyBWIldN9SMWAbIiSkQK7HrhpoRywA5MQViJXbdUDNiGSAnpkCsxK4bakYsA+TEFIiV2HVDzYhlgJyYArESu26oGbEMkBNTIFZi1w01I5YBcmIKxErsuqFmxDJATkyBWIldN9SMWAbIiSkQK7HrhpoRywA5MQViJXbdUDNiGSAnpkCsxK4bakYsA+TEFIiV2HVDzYhlgJyYArESu26oGbEMkBNTIFZi1w01I5YBcmIKxErsuqFmxDJATkyBWIldN9SMWAbIiSl+AcEzjbVWPFoJAAAAAElFTkSuQmCC" />
+    </defs>
+</svg>
\ No newline at end of file
diff --git a/projects/app/src/components/Icon/icons/core/chat/stopSpeech.svg b/projects/app/src/components/Icon/icons/core/chat/stopSpeech.svg
new file mode 100644
index 000000000..c607e6dd3
--- /dev/null
+++ b/projects/app/src/components/Icon/icons/core/chat/stopSpeech.svg
@@ -0,0 +1,4 @@
+<svg viewBox="0 0 16 16" fill="none" xmlns="http://www.w3.org/2000/svg">
+    <path fill-rule="evenodd" clip-rule="evenodd"
+        d="M7.99996 1.99999C4.68625 1.99999 1.99996 4.68628 1.99996 7.99999C1.99996 11.3137 4.68625 14 7.99996 14C11.3137 14 14 11.3137 14 7.99999C14 4.68628 11.3137 1.99999 7.99996 1.99999ZM0.666626 7.99999C0.666626 3.9499 3.94987 0.666656 7.99996 0.666656C12.05 0.666656 15.3333 3.9499 15.3333 7.99999C15.3333 12.0501 12.05 15.3333 7.99996 15.3333C3.94987 15.3333 0.666626 12.0501 0.666626 7.99999ZM5.33329 5.99999C5.33329 5.6318 5.63177 5.33332 5.99996 5.33332H9.99996C10.3682 5.33332 10.6666 5.6318 10.6666 5.99999V9.99999C10.6666 10.3682 10.3682 10.6667 9.99996 10.6667H5.99996C5.63177 10.6667 5.33329 10.3682 5.33329 9.99999V5.99999ZM6.66663 6.66666V9.33332H9.33329V6.66666H6.66663Z" />
+</svg>
\ No newline at end of file
diff --git a/projects/app/src/components/Icon/icons/core/chat/stopSpeechFill.svg b/projects/app/src/components/Icon/icons/core/chat/stopSpeechFill.svg
index 7fa7b69ab..64b726127 100644
--- a/projects/app/src/components/Icon/icons/core/chat/stopSpeechFill.svg
+++ b/projects/app/src/components/Icon/icons/core/chat/stopSpeechFill.svg
@@ -1,8 +1,10 @@
-<?xml version="1.0" standalone="no"?>
-<!DOCTYPE svg PUBLIC "-//W3C//DTD SVG 1.1//EN" "http://www.w3.org/Graphics/SVG/1.1/DTD/svg11.dtd"><svg t="1699507299637"
-    class="icon" viewBox="0 0 1024 1024" version="1.1" xmlns="http://www.w3.org/2000/svg" p-id="3033"
-    xmlns:xlink="http://www.w3.org/1999/xlink" width="128" height="128">
-    <path
-        d="M512 0a512 512 0 0 1 512 512c0 282.769067-229.230933 512-512 512S0 794.769067 0 512 229.230933 0 512 0zM388.022613 314.88C347.62752 314.88 314.88 347.62752 314.88 388.022613v247.954774C314.88 676.37248 347.62752 709.12 388.022613 709.12h247.954774C676.37248 709.12 709.12 676.37248 709.12 635.977387V388.022613C709.12 347.62752 676.37248 314.88 635.977387 314.88H388.022613z"
-        p-id="3034"></path>
+<svg xmlns="http://www.w3.org/2000/svg" width="20" height="20" viewBox="0 0 20 20" fill="none">
+  <g clip-path="url(#clip0_74_2)">
+    <path fill-rule="evenodd" clip-rule="evenodd" d="M10 2.49999C5.85791 2.49999 2.50004 5.85786 2.50004 10C2.50004 14.1421 5.85791 17.5 10 17.5C14.1422 17.5 17.5 14.1421 17.5 10C17.5 5.85786 14.1422 2.49999 10 2.49999ZM0.833374 10C0.833374 4.93739 4.93743 0.833328 10 0.833328C15.0627 0.833328 19.1667 4.93739 19.1667 10C19.1667 15.0626 15.0627 19.1667 10 19.1667C4.93743 19.1667 0.833374 15.0626 0.833374 10ZM6.66671 7.5C6.66671 7.03976 7.0398 6.66666 7.50004 6.66666H12.5C12.9603 6.66666 13.3334 7.03976 13.3334 7.5V12.5C13.3334 12.9602 12.9603 13.3333 12.5 13.3333H7.50004C7.0398 13.3333 6.66671 12.9602 6.66671 12.5V7.5ZM8.33337 8.33333V11.6667H11.6667V8.33333H8.33337Z" fill="#3370FF"/>
+  </g>
+  <defs>
+    <clipPath id="clip0_74_2">
+      <rect width="20" height="20" fill="white"/>
+    </clipPath>
+  </defs>
 </svg>
\ No newline at end of file
diff --git a/projects/app/src/components/Icon/index.tsx b/projects/app/src/components/Icon/index.tsx
index 5f24622f7..b73d1838a 100644
--- a/projects/app/src/components/Icon/index.tsx
+++ b/projects/app/src/components/Icon/index.tsx
@@ -105,10 +105,14 @@ const iconPaths = {
   'support/permission/privateLight': () => import('./icons/support/permission/privateLight.svg'),
   'support/permission/publicLight': () => import('./icons/support/permission/publicLight.svg'),
   'core/app/ttsFill': () => import('./icons/core/app/ttsFill.svg'),
+  'core/app/tts': () => import('./icons/core/app/tts.svg'),
+  'core/app/headphones': () => import('./icons/core/app/headphones.svg'),
   'common/playLight': () => import('./icons/common/playLight.svg'),
   'core/chat/sendFill': () => import('./icons/core/chat/sendFill.svg'),
   'core/chat/recordFill': () => import('./icons/core/chat/recordFill.svg'),
-  'core/chat/stopSpeechFill': () => import('./icons/core/chat/stopSpeechFill.svg')
+  'core/chat/stopSpeechFill': () => import('./icons/core/chat/stopSpeechFill.svg'),
+  'core/chat/stopSpeech': () => import('./icons/core/chat/stopSpeech.svg'),
+  'core/chat/speaking': () => import('./icons/core/chat/speaking.svg')
 };
 
 export type IconName = keyof typeof iconPaths;
diff --git a/projects/app/src/components/Markdown/img/Image.tsx b/projects/app/src/components/Markdown/img/Image.tsx
index 16cecdae2..e2770b9b5 100644
--- a/projects/app/src/components/Markdown/img/Image.tsx
+++ b/projects/app/src/components/Markdown/img/Image.tsx
@@ -1,9 +1,18 @@
 import React, { useState } from 'react';
-import { Image, Skeleton } from '@chakra-ui/react';
+import {
+  Image,
+  Modal,
+  ModalCloseButton,
+  ModalContent,
+  ModalOverlay,
+  Skeleton,
+  useDisclosure
+} from '@chakra-ui/react';
 
 const MdImage = ({ src }: { src?: string }) => {
   const [isLoading, setIsLoading] = useState(true);
   const [succeed, setSucceed] = useState(false);
+  const { isOpen, onOpen, onClose } = useDisclosure();
   return (
     <Skeleton
       minH="100px"
@@ -30,9 +39,23 @@ const MdImage = ({ src }: { src?: string }) => {
         onError={() => setIsLoading(false)}
         onClick={() => {
           if (!succeed) return;
-          window.open(src, '_blank');
+          onOpen();
         }}
       />
+      <Modal isOpen={isOpen} onClose={onClose}>
+        <ModalOverlay />
+        <ModalContent m={'auto'}>
+          <ModalCloseButton />
+          <Image
+            src={src}
+            alt={''}
+            fallbackSrc={'/imgs/errImg.png'}
+            fallbackStrategy={'onError'}
+            loading="eager"
+            objectFit={'contain'}
+          />
+        </ModalContent>
+      </Modal>
     </Skeleton>
   );
 };
diff --git a/projects/app/src/components/MyModal/index.tsx b/projects/app/src/components/MyModal/index.tsx
index 24dab5b79..72627d605 100644
--- a/projects/app/src/components/MyModal/index.tsx
+++ b/projects/app/src/components/MyModal/index.tsx
@@ -42,7 +42,23 @@ const MyModal = ({
         maxH={'90vh'}
         {...props}
       >
-        {!!title && <ModalHeader>{title}</ModalHeader>}
+        {!title && onClose && <ModalCloseButton zIndex={1} />}
+        {!!title && (
+          <ModalHeader
+            display={'flex'}
+            alignItems={'center'}
+            fontWeight={500}
+            background={'#FBFBFC'}
+            borderBottom={'1px solid #F4F6F8'}
+            roundedTop={'lg'}
+            py={3}
+          >
+            {title}
+            <Box flex={1} />
+            {onClose && <ModalCloseButton position={'relative'} top={0} right={0} />}
+          </ModalHeader>
+        )}
+
         <Box
           overflow={props.overflow || 'overlay'}
           h={'100%'}
@@ -51,7 +67,6 @@ const MyModal = ({
         >
           {children}
         </Box>
-        {onClose && <ModalCloseButton />}
       </ModalContent>
     </Modal>
   );
diff --git a/projects/app/src/components/Select/index.tsx b/projects/app/src/components/Select/index.tsx
index 81591933f..c28fc951b 100644
--- a/projects/app/src/components/Select/index.tsx
+++ b/projects/app/src/components/Select/index.tsx
@@ -6,7 +6,8 @@ import {
   MenuItem,
   Button,
   useDisclosure,
-  useOutsideClick
+  useOutsideClick,
+  MenuButton
 } from '@chakra-ui/react';
 import type { ButtonProps } from '@chakra-ui/react';
 import { ChevronDownIcon } from '@chakra-ui/icons';
@@ -47,80 +48,81 @@ const MySelect = (
   });
 
   return (
-    <Menu autoSelect={false} isOpen={isOpen} onOpen={onOpen} onClose={onClose}>
-      <Box
+    <Menu
+      autoSelect={false}
+      isOpen={isOpen}
+      onOpen={onOpen}
+      onClose={onClose}
+      strategy={'fixed'}
+      matchWidth
+    >
+      {/* <Box
         ref={SelectRef}
         position={'relative'}
         onClick={() => {
           isOpen ? onClose() : onOpen();
         }}
-      >
-        <Button
-          ref={ref}
-          width={width}
-          px={3}
-          variant={'base'}
-          display={'flex'}
-          alignItems={'center'}
-          justifyContent={'space-between'}
-          _active={{
-            transform: ''
-          }}
-          {...(isOpen
-            ? {
-                boxShadow: '0px 0px 4px #A8DBFF',
-                borderColor: 'myBlue.600'
-              }
-            : {})}
-          {...props}
-        >
-          {selectItem?.alias || selectItem?.label || placeholder}
-          <Box flex={1} />
-          <ChevronDownIcon />
-        </Button>
-
-        <MenuList
-          minW={(() => {
-            const w = ref.current?.clientWidth;
-            if (w) {
-              return `${w}px !important`;
+      > */}
+      <MenuButton
+        as={Button}
+        ref={ref}
+        width={width}
+        px={3}
+        rightIcon={<ChevronDownIcon />}
+        variant={'base'}
+        textAlign={'left'}
+        _active={{
+          transform: 'none'
+        }}
+        {...(isOpen
+          ? {
+              boxShadow: '0px 0px 4px #A8DBFF',
+              borderColor: 'myBlue.600'
             }
-            return Array.isArray(width)
-              ? width.map((item) => `${item} !important`)
-              : `${width} !important`;
-          })()}
-          p={'6px'}
-          border={'1px solid #fff'}
-          boxShadow={
-            '0px 2px 4px rgba(161, 167, 179, 0.25), 0px 0px 1px rgba(121, 141, 159, 0.25);'
+          : {})}
+        {...props}
+      >
+        {selectItem?.alias || selectItem?.label || placeholder}
+      </MenuButton>
+
+      <MenuList
+        minW={(() => {
+          const w = ref.current?.clientWidth;
+          if (w) {
+            return `${w}px !important`;
           }
-          zIndex={99}
-          transform={'translateY(35px) !important'}
-          maxH={'40vh'}
-          overflowY={'auto'}
-        >
-          {list.map((item) => (
-            <MenuItem
-              key={item.value}
-              {...menuItemStyles}
-              {...(value === item.value
-                ? {
-                    color: 'myBlue.600',
-                    bg: 'myWhite.300'
-                  }
-                : {})}
-              onClick={() => {
-                if (onchange && value !== item.value) {
-                  onchange(item.value);
+          return Array.isArray(width)
+            ? width.map((item) => `${item} !important`)
+            : `${width} !important`;
+        })()}
+        p={'6px'}
+        border={'1px solid #fff'}
+        boxShadow={'0px 2px 4px rgba(161, 167, 179, 0.25), 0px 0px 1px rgba(121, 141, 159, 0.25);'}
+        zIndex={99}
+        maxH={'40vh'}
+        overflowY={'auto'}
+      >
+        {list.map((item) => (
+          <MenuItem
+            key={item.value}
+            {...menuItemStyles}
+            {...(value === item.value
+              ? {
+                  color: 'myBlue.600',
+                  bg: 'myWhite.300'
                 }
-              }}
-              whiteSpace={'pre-wrap'}
-            >
-              {item.label}
-            </MenuItem>
-          ))}
-        </MenuList>
-      </Box>
+              : {})}
+            onClick={() => {
+              if (onchange && value !== item.value) {
+                onchange(item.value);
+              }
+            }}
+            whiteSpace={'pre-wrap'}
+          >
+            {item.label}
+          </MenuItem>
+        ))}
+      </MenuList>
     </Menu>
   );
 };
diff --git a/projects/app/src/global/common/api/systemRes.d.ts b/projects/app/src/global/common/api/systemRes.d.ts
index 1a7d9f216..cb2af314d 100644
--- a/projects/app/src/global/common/api/systemRes.d.ts
+++ b/projects/app/src/global/common/api/systemRes.d.ts
@@ -3,11 +3,29 @@ import type {
   FunctionModelItemType,
   LLMModelItemType,
   VectorModelItemType,
-  AudioSpeechModels
+  AudioSpeechModels,
+  AudioSpeechModelType,
+  WhisperModelType
 } from '@fastgpt/global/core/ai/model.d';
 
-import type { FeConfigsType } from '@fastgpt/global/common/system/types/index.d';
+import type { FeConfigsType, SystemEnvType } from '@fastgpt/global/common/system/types/index.d';
 
+/**
+ * Shape of the JSON config file parsed by getInitConfig (pages/api/system/getInitData.ts).
+ * Sections absent from the file are replaced with defaults when the config is applied.
+ */
+export type ConfigFileType = {
+  FeConfig: FeConfigsType;
+  SystemParams: SystemEnvType;
+  ChatModels: ChatModelItemType[];
+  QAModels: LLMModelItemType[];
+  CQModels: FunctionModelItemType[];
+  ExtractModels: FunctionModelItemType[];
+  QGModels: LLMModelItemType[];
+  VectorModels: VectorModelItemType[];
+  AudioSpeechModels: AudioSpeechModelType[];
+  WhisperModel: WhisperModelType;
+};
 export type InitDateResponse = {
   chatModels: ChatModelItemType[];
   qaModels: LLMModelItemType[];
diff --git a/projects/app/src/global/core/chat/api.d.ts b/projects/app/src/global/core/chat/api.d.ts
index 8da11b219..2c609a7de 100644
--- a/projects/app/src/global/core/chat/api.d.ts
+++ b/projects/app/src/global/core/chat/api.d.ts
@@ -3,4 +3,5 @@ import type { AppTTSConfigType } from '@/types/app';
 export type GetChatSpeechProps = {
   ttsConfig: AppTTSConfigType;
   input: string;
+  shareId?: string;
 };
diff --git a/projects/app/src/pages/api/core/chat/item/getSpeech.ts b/projects/app/src/pages/api/core/chat/item/getSpeech.ts
index 277c1ba2e..2844e80e1 100644
--- a/projects/app/src/pages/api/core/chat/item/getSpeech.ts
+++ b/projects/app/src/pages/api/core/chat/item/getSpeech.ts
@@ -4,7 +4,7 @@ import { connectToDatabase } from '@/service/mongo';
 import { GetChatSpeechProps } from '@/global/core/chat/api.d';
 import { text2Speech } from '@fastgpt/service/core/ai/audio/speech';
 import { pushAudioSpeechBill } from '@/service/support/wallet/bill/push';
-import { authCert } from '@fastgpt/service/support/permission/auth/common';
+import { authCertAndShareId } from '@fastgpt/service/support/permission/auth/common';
 import { authType2BillSource } from '@/service/support/wallet/bill/utils';
 import { getAudioSpeechModel } from '@/service/core/ai/model';
 import { MongoTTSBuffer } from '@fastgpt/service/common/buffer/tts/schema';
@@ -19,16 +19,16 @@ import { MongoTTSBuffer } from '@fastgpt/service/common/buffer/tts/schema';
 export default async function handler(req: NextApiRequest, res: NextApiResponse) {
   try {
     await connectToDatabase();
-    const { ttsConfig, input } = req.body as GetChatSpeechProps;
+    const { ttsConfig, input, shareId } = req.body as GetChatSpeechProps;
 
     if (!ttsConfig.model || !ttsConfig.voice) {
       throw new Error('model or voice not found');
     }
 
-    const { teamId, tmbId, authType } = await authCert({ req, authToken: true });
+    const { teamId, tmbId, authType } = await authCertAndShareId({ req, authToken: true, shareId });
 
     const ttsModel = getAudioSpeechModel(ttsConfig.model);
-    const voiceData = ttsModel.voices.find((item) => item.value === ttsConfig.voice);
+    const voiceData = ttsModel.voices?.find((item) => item.value === ttsConfig.voice);
 
     if (!voiceData) {
       throw new Error('voice not found');
@@ -37,7 +37,7 @@ export default async function handler(req: NextApiRequest, res: NextApiResponse)
     const ttsBuffer = await MongoTTSBuffer.findOne(
       {
         bufferId: voiceData.bufferId,
-        text: input
+        text: JSON.stringify({ text: input, speed: ttsConfig.speed })
       },
       'buffer'
     );
@@ -51,6 +51,7 @@ export default async function handler(req: NextApiRequest, res: NextApiResponse)
       input,
       model: ttsConfig.model,
       voice: ttsConfig.voice,
+      speed: ttsConfig.speed,
       props: {
         // temp code
         baseUrl: ttsModel.baseUrl || '',
@@ -68,7 +69,7 @@ export default async function handler(req: NextApiRequest, res: NextApiResponse)
 
           await MongoTTSBuffer.create({
             bufferId: voiceData.bufferId,
-            text: input,
+            text: JSON.stringify({ text: input, speed: ttsConfig.speed }),
             buffer
           });
         } catch (error) {}
diff --git a/projects/app/src/pages/api/system/getInitData.ts b/projects/app/src/pages/api/system/getInitData.ts
index 715ef58ba..83c818e8f 100644
--- a/projects/app/src/pages/api/system/getInitData.ts
+++ b/projects/app/src/pages/api/system/getInitData.ts
@@ -2,7 +2,7 @@ import type { FeConfigsType, SystemEnvType } from '@fastgpt/global/common/system
 import type { NextApiRequest, NextApiResponse } from 'next';
 import { jsonRes } from '@fastgpt/service/common/response';
 import { readFileSync } from 'fs';
-import type { InitDateResponse } from '@/global/common/api/systemRes';
+import type { ConfigFileType, InitDateResponse } from '@/global/common/api/systemRes';
 import { formatPrice } from '@fastgpt/global/support/wallet/bill/tools';
 import { getTikTokenEnc } from '@fastgpt/global/common/string/tiktoken';
 import { initHttpAgent } from '@fastgpt/service/common/middle/httpAgent';
@@ -13,15 +13,9 @@ import {
   defaultExtractModels,
   defaultQGModels,
   defaultVectorModels,
-  defaultAudioSpeechModels
+  defaultAudioSpeechModels,
+  defaultWhisperModel
 } from '@fastgpt/global/core/ai/model';
-import {
-  AudioSpeechModelType,
-  ChatModelItemType,
-  FunctionModelItemType,
-  LLMModelItemType,
-  VectorModelItemType
-} from '@fastgpt/global/core/ai/model.d';
 
 export default async function handler(req: NextApiRequest, res: NextApiResponse) {
   getInitConfig();
@@ -83,60 +77,39 @@ export function getInitConfig() {
 
     const filename =
       process.env.NODE_ENV === 'development' ? 'data/config.local.json' : '/app/data/config.json';
-    const res = JSON.parse(readFileSync(filename, 'utf-8')) as {
-      FeConfig: FeConfigsType;
-      SystemParams: SystemEnvType;
-      ChatModels: ChatModelItemType[];
-      QAModels: LLMModelItemType[];
-      CQModels: FunctionModelItemType[];
-      ExtractModels: FunctionModelItemType[];
-      QGModels: LLMModelItemType[];
-      VectorModels: VectorModelItemType[];
-      AudioSpeechModels: AudioSpeechModelType[];
-    };
+    const res = JSON.parse(readFileSync(filename, 'utf-8')) as ConfigFileType;
 
     console.log(`System Version: ${global.systemVersion}`);
 
-    console.log(res);
-
-    global.systemEnv = res.SystemParams
-      ? { ...defaultSystemEnv, ...res.SystemParams }
-      : defaultSystemEnv;
-    global.feConfigs = res.FeConfig
-      ? { ...defaultFeConfigs, ...res.FeConfig, isPlus: !!res.SystemParams?.pluginBaseUrl }
-      : defaultFeConfigs;
-
-    global.chatModels = res.ChatModels || defaultChatModels;
-    global.qaModels = res.QAModels || defaultQAModels;
-    global.cqModels = res.CQModels || defaultCQModels;
-    global.extractModels = res.ExtractModels || defaultExtractModels;
-    global.qgModels = res.QGModels || defaultQGModels;
-
-    global.vectorModels = res.VectorModels || defaultVectorModels;
-
-    global.audioSpeechModels = res.AudioSpeechModels || defaultAudioSpeechModels;
+    setDefaultData(res);
   } catch (error) {
     setDefaultData();
     console.log('get init config error, set default', error);
   }
 }
 
-export function setDefaultData() {
-  global.systemEnv = defaultSystemEnv;
-  global.feConfigs = defaultFeConfigs;
+export function setDefaultData(res?: ConfigFileType) {
+  global.systemEnv = res?.SystemParams
+    ? { ...defaultSystemEnv, ...res.SystemParams }
+    : defaultSystemEnv;
+  global.feConfigs = res?.FeConfig
+    ? { ...defaultFeConfigs, ...res.FeConfig, isPlus: !!res.SystemParams?.pluginBaseUrl }
+    : defaultFeConfigs;
 
-  global.chatModels = defaultChatModels;
-  global.qaModels = defaultQAModels;
-  global.cqModels = defaultCQModels;
-  global.extractModels = defaultExtractModels;
-  global.qgModels = defaultQGModels;
+  global.chatModels = res?.ChatModels || defaultChatModels;
+  global.qaModels = res?.QAModels || defaultQAModels;
+  global.cqModels = res?.CQModels || defaultCQModels;
+  global.extractModels = res?.ExtractModels || defaultExtractModels;
+  global.qgModels = res?.QGModels || defaultQGModels;
 
-  global.vectorModels = defaultVectorModels;
-  global.audioSpeechModels = defaultAudioSpeechModels;
+  global.vectorModels = res?.VectorModels || defaultVectorModels;
+
+  global.audioSpeechModels = res?.AudioSpeechModels || defaultAudioSpeechModels;
+
+  global.whisperModel = res?.WhisperModel || defaultWhisperModel;
 
   global.priceMd = '';
 
-  console.log('use default config');
   console.log(global);
 }
 
@@ -178,6 +151,10 @@ ${global.extractModels
 ${global.qgModels
   ?.map((item) => `| 下一步指引-${item.name} | ${formatPrice(item.price, 1000)} |`)
   .join('\n')}
+${global.audioSpeechModels
+  ?.map((item) => `| 语音播放-${item.name} | ${formatPrice(item.price, 1000)} |`)
+  .join('\n')}
+${`| 语音输入-${global.whisperModel.name} | ${global.whisperModel.price}/分钟 |`}
 `;
   console.log(global.priceMd);
 }
diff --git a/projects/app/src/pages/api/v1/audio/transcriptions.ts b/projects/app/src/pages/api/v1/audio/transcriptions.ts
index 7294b7e3a..bf0ba0fac 100644
--- a/projects/app/src/pages/api/v1/audio/transcriptions.ts
+++ b/projects/app/src/pages/api/v1/audio/transcriptions.ts
@@ -1,10 +1,11 @@
 import type { NextApiRequest, NextApiResponse } from 'next';
 import { jsonRes } from '@fastgpt/service/common/response';
-import { authCert } from '@fastgpt/service/support/permission/auth/common';
+import { authCert, authCertAndShareId } from '@fastgpt/service/support/permission/auth/common';
 import { withNextCors } from '@fastgpt/service/common/middle/cors';
 import { getUploadModel } from '@fastgpt/service/common/file/upload/multer';
 import fs from 'fs';
 import { getAIApi } from '@fastgpt/service/core/ai/config';
+import { pushWhisperBill } from '@/service/support/wallet/bill/push';
 
 const upload = getUploadModel({
   maxSize: 2
@@ -12,9 +13,16 @@ const upload = getUploadModel({
 
 export default withNextCors(async function handler(req: NextApiRequest, res: NextApiResponse<any>) {
   try {
+    const {
+      files,
+      metadata: { duration, shareId }
+    } = await upload.doUpload<{ duration: number; shareId?: string }>(req, res);
+
-    const { teamId, tmbId } = await authCert({ req, authToken: true });
+    const { teamId, tmbId } = await authCertAndShareId({ req, authToken: true, shareId });
 
-    const { files } = await upload.doUpload(req, res);
+    if (!global.whisperModel) {
+      throw new Error('whisper model not found');
+    }
 
     const file = files[0];
 
@@ -26,7 +34,13 @@ export default withNextCors(async function handler(req: NextApiRequest, res: Nex
 
     const result = await ai.audio.transcriptions.create({
       file: fs.createReadStream(file.path),
-      model: 'whisper-1'
+      model: global.whisperModel.model
+    });
+
+    pushWhisperBill({
+      teamId,
+      tmbId,
+      duration
     });
 
     jsonRes(res, {
diff --git a/projects/app/src/pages/app/detail/components/TTSSelect.tsx b/projects/app/src/pages/app/detail/components/TTSSelect.tsx
index a407e6fdb..22c2d6017 100644
--- a/projects/app/src/pages/app/detail/components/TTSSelect.tsx
+++ b/projects/app/src/pages/app/detail/components/TTSSelect.tsx
@@ -1,15 +1,16 @@
 import MyIcon from '@/components/Icon';
 import MyTooltip from '@/components/MyTooltip';
 import { QuestionOutlineIcon } from '@chakra-ui/icons';
-import { Box, Flex } from '@chakra-ui/react';
+import { Box, Button, Flex, ModalBody, useDisclosure } from '@chakra-ui/react';
 import React, { useCallback, useMemo } from 'react';
 import { useTranslation } from 'next-i18next';
 import MySelect from '@/components/Select';
 import { TTSTypeEnum } from '@/constants/app';
 import { AppTTSConfigType } from '@/types/app';
 import { useAudioPlay } from '@/web/common/utils/voice';
-import { useLoading } from '@/web/common/hooks/useLoading';
 import { audioSpeechModels } from '@/web/common/system/staticData';
+import MyModal from '@/components/MyModal';
+import MySlider from '@/components/Slider';
 
 const TTSSelect = ({
   value,
@@ -19,8 +20,16 @@ const TTSSelect = ({
   onChange: (e: AppTTSConfigType) => void;
 }) => {
   const { t } = useTranslation();
-  const { playAudio, audioLoading } = useAudioPlay({ ttsConfig: value });
-  const { Loading } = useLoading();
+  const { isOpen, onOpen, onClose } = useDisclosure();
+
+  const list = useMemo(
+    () => [
+      { label: t('core.app.tts.Close'), value: TTSTypeEnum.none },
+      { label: t('core.app.tts.Web'), value: TTSTypeEnum.web },
+      ...audioSpeechModels.map((item) => item?.voices || []).flat()
+    ],
+    [t]
+  );
 
   const formatValue = useMemo(() => {
     if (!value || !value.type) {
@@ -31,63 +40,126 @@ const TTSSelect = ({
     }
     return value.voice;
   }, [value]);
+  const formLabel = useMemo(
+    () => list.find((item) => item.value === formatValue)?.label || t('common.UnKnow'),
+    [formatValue, list, t]
+  );
+
+  const { playAudio, cancelAudio, audioLoading, audioPlaying } = useAudioPlay({ ttsConfig: value });
 
   const onclickChange = useCallback(
     (e: string) => {
       if (e === TTSTypeEnum.none || e === TTSTypeEnum.web) {
         onChange({ type: e as `${TTSTypeEnum}` });
       } else {
-        const audioModel = audioSpeechModels.find((item) =>
-          item.voices.find((voice) => voice.value === e)
+        const audioModel = audioSpeechModels.find(
+          (item) => item.voices?.find((voice) => voice.value === e)
         );
         if (!audioModel) {
           return;
         }
         onChange({
+          ...value,
           type: TTSTypeEnum.model,
           model: audioModel.model,
-          voice: e,
-          speed: 1
+          voice: e
         });
       }
     },
-    [onChange]
+    [onChange, value]
   );
 
   return (
     <Flex alignItems={'center'}>
-      <MyIcon name={'core/app/ttsFill'} mr={2} w={'16px'} />
+      <MyIcon name={'core/app/tts'} mr={2} w={'16px'} />
       <Box>{t('core.app.TTS')}</Box>
       <MyTooltip label={t('core.app.TTS Tip')} forceShow>
         <QuestionOutlineIcon display={['none', 'inline']} ml={1} />
       </MyTooltip>
       <Box flex={1} />
-      {formatValue !== TTSTypeEnum.none && (
-        <MyTooltip label={t('core.app.tts.Test Listen')}>
-          <MyIcon
-            mr={1}
-            name="common/playLight"
-            w={['14px', '16px']}
-            cursor={'pointer'}
-            onClick={() => {
-              playAudio({
-                text: t('core.app.tts.Test Listen Text')
-              });
-            }}
-          />
-        </MyTooltip>
-      )}
-      <MySelect
-        w={'150px'}
-        value={formatValue}
-        list={[
-          { label: t('core.app.tts.Close'), value: TTSTypeEnum.none },
-          { label: t('core.app.tts.Web'), value: TTSTypeEnum.web },
-          ...audioSpeechModels.map((item) => item.voices).flat()
-        ]}
-        onchange={onclickChange}
-      />
-      <Loading loading={audioLoading} />
+      <MyTooltip label={t('core.app.Select TTS')}>
+        <Box
+          cursor={'pointer'}
+          _hover={{ bg: 'myGray.100' }}
+          py={2}
+          px={3}
+          borderRadius={'md'}
+          onClick={onOpen}
+          color={'myGray.600'}
+        >
+          {formLabel}
+        </Box>
+      </MyTooltip>
+      <MyModal
+        title={
+          <>
+            <MyIcon name={'core/app/tts'} mr={2} w={'20px'} />
+            {t('core.app.TTS')}
+          </>
+        }
+        isOpen={isOpen}
+        onClose={onClose}
+        w={'500px'}
+      >
+        <ModalBody px={[5, 16]} py={[4, 8]}>
+          <Flex justifyContent={'space-between'} alignItems={'center'}>
+            {t('core.app.tts.Speech model')}
+            <MySelect w={'220px'} value={formatValue} list={list} onchange={onclickChange} />
+          </Flex>
+          <Flex mt={8} justifyContent={'space-between'} alignItems={'center'}>
+            {t('core.app.tts.Speech speed')}
+            <MySlider
+              markList={[
+                { label: '0.3', value: 0.3 },
+                { label: '2', value: 2 }
+              ]}
+              width={'220px'}
+              min={0.3}
+              max={2}
+              step={0.1}
+              value={value.speed || 1}
+              onChange={(e) => {
+                onChange({
+                  ...value,
+                  speed: e
+                });
+              }}
+            />
+          </Flex>
+          {formatValue !== TTSTypeEnum.none && (
+            <Flex mt={10} justifyContent={'end'}>
+              {audioPlaying ? (
+                <Flex>
+                  <MyIcon name={'core/chat/speaking'} w={'16px'} />
+                  <Button
+                    ml={3}
+                    variant={'gray'}
+                    isLoading={audioLoading}
+                    leftIcon={<MyIcon name={'core/chat/stopSpeech'} w={'16px'} />}
+                    onClick={() => {
+                      cancelAudio();
+                    }}
+                  >
+                    {t('core.chat.tts.Stop Speech')}
+                  </Button>
+                </Flex>
+              ) : (
+                <Button
+                  isLoading={audioLoading}
+                  leftIcon={<MyIcon name={'core/app/headphones'} w={'16px'} />}
+                  onClick={() => {
+                    playAudio({
+                      text: t('core.app.tts.Test Listen Text')
+                    });
+                  }}
+                >
+                  {t('core.app.tts.Test Listen')}
+                </Button>
+              )}
+            </Flex>
+          )}
+        </ModalBody>
+      </MyModal>
     </Flex>
   );
 };
diff --git a/projects/app/src/pages/index.tsx b/projects/app/src/pages/index.tsx
index f6a460334..997ed7786 100644
--- a/projects/app/src/pages/index.tsx
+++ b/projects/app/src/pages/index.tsx
@@ -10,7 +10,6 @@ import Ability from './components/Ability';
 import Choice from './components/Choice';
 import Footer from './components/Footer';
 import Loading from '@/components/Loading';
-import Head from 'next/head';
 
 const Home = ({ homeUrl = '/' }: { homeUrl: string }) => {
   const router = useRouter();
@@ -26,9 +25,6 @@ const Home = ({ homeUrl = '/' }: { homeUrl: string }) => {
 
   return (
     <>
-      <Head>
-        <title>{feConfigs?.systemTitle || 'FastGPT'}</title>
-      </Head>
       <Box id="home" bg={'myWhite.600'} h={'100vh'} overflowY={'auto'} overflowX={'hidden'}>
         <Box position={'fixed'} zIndex={10} top={0} left={0} right={0}>
           <Navbar />
diff --git a/projects/app/src/service/support/wallet/bill/push.ts b/projects/app/src/service/support/wallet/bill/push.ts
index 4efa32f95..8388b01ed 100644
--- a/projects/app/src/service/support/wallet/bill/push.ts
+++ b/projects/app/src/service/support/wallet/bill/push.ts
@@ -1,4 +1,4 @@
-import { BillSourceEnum } from '@fastgpt/global/support/wallet/bill/constants';
+import { BillSourceEnum, PRICE_SCALE } from '@fastgpt/global/support/wallet/bill/constants';
 import { getAudioSpeechModel, getQAModel } from '@/service/core/ai/model';
 import type { ChatHistoryItemResType } from '@fastgpt/global/core/chat/api.d';
 import { formatPrice } from '@fastgpt/global/support/wallet/bill/tools';
@@ -205,3 +205,37 @@ export function pushAudioSpeechBill({
     ]
   });
 }
+/** Creates a bill for one Whisper transcription, priced from global.whisperModel and the audio duration. */
+export function pushWhisperBill({
+  teamId,
+  tmbId,
+  duration
+}: {
+  teamId: string;
+  tmbId: string;
+  duration: number; // presumably seconds, since the total divides by 60 — TODO confirm against the caller
+}) {
+  const modelData = global.whisperModel;
+  // Without a configured Whisper model there is no price to apply, so no bill is created.
+  if (!modelData) return;
+  // Price is applied per 60 units of duration, then multiplied by PRICE_SCALE; duration is used as-is (not validated here).
+  const total = ((modelData.price * duration) / 60) * PRICE_SCALE;
+  // The same string serves as the bill's appName and as the moduleName of its only list entry.
+  const name = 'wallet.bill.Whisper';
+  // The result of createBill is neither awaited nor returned from this function.
+  createBill({
+    teamId,
+    tmbId,
+    appName: name,
+    total,
+    source: BillSourceEnum.fastgpt,
+    list: [
+      {
+        moduleName: name,
+        amount: total,
+        model: modelData.name,
+        tokenLen: duration // stores the audio duration rather than a token count
+      }
+    ]
+  });
+}
diff --git a/projects/app/src/types/index.d.ts b/projects/app/src/types/index.d.ts
index 6ad97a55d..173262084 100644
--- a/projects/app/src/types/index.d.ts
+++ b/projects/app/src/types/index.d.ts
@@ -3,7 +3,8 @@ import {
   ChatModelItemType,
   FunctionModelItemType,
   LLMModelItemType,
-  VectorModelItemType
+  VectorModelItemType,
+  WhisperModelType
 } from '@fastgpt/global/core/ai/model.d';
 import { TrackEventName } from '@/constants/common';
 
@@ -27,6 +28,7 @@ declare global {
   var extractModels: FunctionModelItemType[];
   var qgModels: LLMModelItemType[];
   var audioSpeechModels: AudioSpeechModelType[];
+  var whisperModel: WhisperModelType;
 
   var priceMd: string;
   var systemVersion: string;
diff --git a/projects/app/src/web/common/api/fetch.ts b/projects/app/src/web/common/api/fetch.ts
index 052fd3328..3f32bbebb 100644
--- a/projects/app/src/web/common/api/fetch.ts
+++ b/projects/app/src/web/common/api/fetch.ts
@@ -110,6 +110,12 @@ export const streamFetch = ({
       };
       read();
     } catch (err: any) {
+      if (abortSignal.signal.aborted) {
+        return resolve({
+          responseText: '',
+          responseData: []
+        });
+      }
       console.log(err, 'fetch error');
 
       reject(getErrText(err, '请求异常'));
diff --git a/projects/app/src/web/common/hooks/useSpeech.ts b/projects/app/src/web/common/hooks/useSpeech.ts
index 4a2192e8e..7ae42daa0 100644
--- a/projects/app/src/web/common/hooks/useSpeech.ts
+++ b/projects/app/src/web/common/hooks/useSpeech.ts
@@ -1,20 +1,71 @@
-import { useEffect, useRef, useState } from 'react';
+import { useEffect, useMemo, useRef, useState } from 'react';
 import { POST } from '../api/request';
 import { useToast } from './useToast';
 import { useTranslation } from 'next-i18next';
 import { getErrText } from '@fastgpt/global/common/error/utils';
 
-export const useSpeech = () => {
+export const useSpeech = (props?: { shareId?: string }) => {
+  const { shareId } = props || {};
   const { t } = useTranslation();
   const mediaRecorder = useRef<MediaRecorder>();
+  const mediaStream = useRef<MediaStream>();
   const { toast } = useToast();
   const [isSpeaking, setIsSpeaking] = useState(false);
+  const [isTransCription, setIsTransCription] = useState(false);
+  const [audioSecond, setAudioSecone] = useState(0);
+  const intervalRef = useRef<any>();
+  const startTimestamp = useRef(0);
 
-  const startSpeak = async () => {
+  const speakingTimeString = useMemo(() => {
+    const minutes: number = Math.floor(audioSecond / 60);
+    const remainingSeconds: number = Math.floor(audioSecond % 60);
+    const formattedMinutes: string = minutes.toString().padStart(2, '0');
+    const formattedSeconds: string = remainingSeconds.toString().padStart(2, '0');
+    return `${formattedMinutes}:${formattedSeconds}`;
+  }, [audioSecond]);
+  // Paints one frame of the analyser's current time-domain samples as blue bars on the canvas.
+  const renderAudioGraph = (analyser: AnalyserNode, canvas: HTMLCanvasElement) => {
+    const bufferLength = analyser.frequencyBinCount;
+    const backgroundColor = 'white';
+    const dataArray = new Uint8Array(bufferLength);
+    analyser.getByteTimeDomainData(dataArray); // fills dataArray with byte samples (0-255)
+    const canvasCtx = canvas?.getContext('2d');
+    const width = 300; // drawing area is hard-coded; the canvas element's own size is not read
+    const height = 200;
+    if (!canvasCtx) return; // no 2D context available: skip drawing
+    canvasCtx.clearRect(0, 0, width, height);
+    canvasCtx.fillStyle = backgroundColor;
+    canvasCtx.fillRect(0, 0, width, height);
+    const barWidth = (width / bufferLength) * 2.5;
+    let x = 0;
+    // NOTE(review): nothing is stroked after this moveTo, so it appears to have no visible effect.
+    canvasCtx.moveTo(x, height / 2);
+    for (let i = 0; i < bufferLength; i += 10) { // only every 10th sample is drawn
+      const barHeight = (dataArray[i] / 256) * height - height * 0.15;
+      canvasCtx.fillStyle = '#3370FF';
+      const adjustedBarHeight = Math.max(0, barHeight); // clamp so a bar never gets a negative height
+      canvasCtx.fillRect(x, height - adjustedBarHeight, barWidth, adjustedBarHeight); // bar rises from the bottom edge
+      x += barWidth + 1; // leaves a 1-unit gap between bars
+    }
+  };
+
+  const startSpeak = async (onFinish: (text: string) => void) => {
     try {
       const stream = await navigator.mediaDevices.getUserMedia({ audio: true });
+      mediaStream.current = stream;
       mediaRecorder.current = new MediaRecorder(stream);
       const chunks: Blob[] = [];
+      setIsSpeaking(true);
+
+      mediaRecorder.current.onstart = () => {
+        startTimestamp.current = Date.now();
+        setAudioSecone(0);
+        intervalRef.current = setInterval(() => {
+          const currentTimestamp = Date.now();
+          const duration = (currentTimestamp - startTimestamp.current) / 1000;
+          setAudioSecone(duration);
+        }, 1000);
+      };
 
       mediaRecorder.current.ondataavailable = (e) => {
         chunks.push(e.data);
@@ -23,48 +74,66 @@ export const useSpeech = () => {
       mediaRecorder.current.onstop = async () => {
         const formData = new FormData();
         const blob = new Blob(chunks, { type: 'audio/webm' });
+
+        const duration = Math.round((Date.now() - startTimestamp.current) / 1000);
+
         formData.append('files', blob, 'recording.webm');
+        formData.append('metadata', JSON.stringify({ duration, shareId }));
 
-        const link = document.createElement('a');
-        link.href = URL.createObjectURL(blob);
-        link.download = 'recording.webm';
-        document.body.appendChild(link);
-        link.click();
-        link.remove();
-
+        setIsTransCription(true);
         try {
-          const result = await POST<string[]>('/v1/audio/transcriptions', formData, {
+          const result = await POST<string>('/v1/audio/transcriptions', formData, {
             timeout: 60000,
             headers: {
               'Content-Type': 'multipart/form-data; charset=utf-8'
             }
           });
-
-          console.log(result, '===');
+          onFinish(result);
         } catch (error) {
           toast({
             status: 'warning',
             title: getErrText(error, t('common.speech.error tip'))
           });
         }
+        setIsTransCription(false);
+        setIsSpeaking(false);
+      };
+
+      mediaRecorder.current.onerror = (e) => {
+        console.log('error', e);
         setIsSpeaking(false);
       };
 
       mediaRecorder.current.start();
-
-      setIsSpeaking(true);
     } catch (error) {}
   };
 
   const stopSpeak = () => {
     if (mediaRecorder.current) {
       mediaRecorder.current?.stop();
+      clearInterval(intervalRef.current);
     }
   };
 
+  useEffect(() => {
+    return () => {
+      clearInterval(intervalRef.current);
+      if (mediaRecorder.current && mediaRecorder.current.state !== 'inactive') {
+        mediaRecorder.current.stop();
+      }
+      if (mediaStream.current) {
+        mediaStream.current.getTracks().forEach((track) => track.stop());
+      }
+    };
+  }, []);
+
   return {
     startSpeak,
     stopSpeak,
-    isSpeaking
+    isSpeaking,
+    isTransCription,
+    renderAudioGraph,
+    stream: mediaStream.current,
+    speakingTimeString
   };
 };
diff --git a/projects/app/src/web/common/utils/voice.ts b/projects/app/src/web/common/utils/voice.ts
index adb06f229..c93b85ccb 100644
--- a/projects/app/src/web/common/utils/voice.ts
+++ b/projects/app/src/web/common/utils/voice.ts
@@ -4,9 +4,11 @@ import { getErrText } from '@fastgpt/global/common/error/utils';
 import { AppTTSConfigType } from '@/types/app';
 import { TTSTypeEnum } from '@/constants/app';
 import { useTranslation } from 'next-i18next';
+import { useRouter } from 'next/router';
 
 export const useAudioPlay = (props?: { ttsConfig?: AppTTSConfigType }) => {
   const { t } = useTranslation();
+  const { shareId } = useRouter().query as { shareId?: string };
   const { ttsConfig } = props || {};
   const { toast } = useToast();
   const [audio, setAudio] = useState<HTMLAudioElement>();
@@ -16,6 +18,7 @@ export const useAudioPlay = (props?: { ttsConfig?: AppTTSConfigType }) => {
   // Check whether the voice is supported
   const hasAudio = useMemo(() => {
     if (ttsConfig?.type === TTSTypeEnum.none) return false;
+    if (ttsConfig?.type === TTSTypeEnum.model) return true;
     const voices = window.speechSynthesis?.getVoices?.() || []; // 获取语言包
     const voice = voices.find((item) => {
       return item.lang === 'zh-CN';
@@ -55,7 +58,8 @@ export const useAudioPlay = (props?: { ttsConfig?: AppTTSConfigType }) => {
             body: JSON.stringify({
               chatItemId,
               ttsConfig,
-              input: text
+              input: text,
+              shareId
             })
           });
           setAudioLoading(false);
diff --git a/projects/app/src/web/styles/theme.ts b/projects/app/src/web/styles/theme.ts
index 16508c4ec..8b158c3f6 100644
--- a/projects/app/src/web/styles/theme.ts
+++ b/projects/app/src/web/styles/theme.ts
@@ -66,6 +66,14 @@ const Button = defineStyleConfig({
         bg: '#3370ff !important'
       }
     },
+    gray: {
+      bg: '#F5F5F8',
+      color: 'myBlue.700',
+      border: '1px solid #EFF0F1',
+      _hover: {
+        bg: '#3370FF1A'
+      }
+    },
     base: {
       color: 'myGray.900',
       border: '1px solid',
@@ -81,6 +89,23 @@ const Button = defineStyleConfig({
         color: 'myBlue.700'
       },
       _disabled: { bg: 'myGray.100 !important', color: 'myGray.700 !important' }
+    },
+    boxBtn: {
+      px: 3,
+      py: '2px',
+      borderRadius: 'md',
+      _hover: {
+        bg: 'myGray.200'
+      }
+    },
+    blue: {
+      borderRadius: 'md',
+      bg: '#3370FF',
+      color: 'white',
+      fontSize: 'sm',
+      _hover: {
+        bg: '#145BFF'
+      }
     }
   },
   defaultProps: {