Mirror of https://github.com/labring/FastGPT.git (synced 2025-10-15 07:31:19 +00:00)
perf: mobile voice input (#4437)
* update: Mobile voice interaction (#4362) * Add files via upload * Add files via upload * Update ollama.md * Update ollama.md * Add files via upload * Update useSpeech.ts * Update ChatInput.tsx * Update useSpeech.ts * Update ChatInput.tsx * Update useSpeech.ts * Update constants.ts * Add files via upload * Update ChatInput.tsx * Update useSpeech.ts * Update useSpeech.ts * Update useSpeech.ts * Update ChatInput.tsx * Add files via upload * Update common.json * Update VoiceInput.tsx * Update ChatInput.tsx * Update VoiceInput.tsx * Update useSpeech.ts * Update useSpeech.ts * Update common.json * Update common.json * Update common.json * Update VoiceInput.tsx * Update VoiceInput.tsx * Update ChatInput.tsx * Update VoiceInput.tsx * Update ChatInput.tsx * Update VoiceInput.tsx * Update ChatInput.tsx * Update useSpeech.ts * Update common.json * Update chat.json * Update common.json * Update chat.json * Update common.json * Update chat.json * Update VoiceInput.tsx * Update ChatInput.tsx * Update useSpeech.ts * Update VoiceInput.tsx * speech ui * Polish the voice input component: adjust when the text input is shown, fix the voice-input mask styles, make the canvas background transparent, and improve the interaction. (#4435) * perf: mobile voice input --------- Co-authored-by: dreamer6680 <1468683855@qq.com>
Icon registry (constants.ts):

@@ -183,6 +183,7 @@ export const iconPaths = {
   'core/chat/feedback/goodLight': () => import('./icons/core/chat/feedback/goodLight.svg'),
   'core/chat/fileSelect': () => import('./icons/core/chat/fileSelect.svg'),
   'core/chat/finishSpeak': () => import('./icons/core/chat/finishSpeak.svg'),
+  'core/chat/backText': () => import('./icons/core/chat/backText.svg'),
   'core/chat/imgSelect': () => import('./icons/core/chat/imgSelect.svg'),
   'core/chat/quoteFill': () => import('./icons/core/chat/quoteFill.svg'),
   'core/chat/quoteSign': () => import('./icons/core/chat/quoteSign.svg'),
backText.svg (new file):

@@ -0,0 +1,4 @@
+<svg
+class="icon" viewBox="0 0 1024 1024" version="1.1" xmlns="http://www.w3.org/2000/svg" width="200" height="200">
+<path d="M512 74.666667C270.933333 74.666667 74.666667 270.933333 74.666667 512S270.933333 949.333333 512 949.333333 949.333333 753.066667 949.333333 512 753.066667 74.666667 512 74.666667z m0 810.666666c-204.8 0-373.333333-168.533333-373.333333-373.333333S307.2 138.666667 512 138.666667 885.333333 307.2 885.333333 512 716.8 885.333333 512 885.333333z" fill="#666666"></path>
+<path d="M448 437.333333c17.066667 0 32-14.933333 32-32v-42.666666c0-17.066667-14.933333-32-32-32s-32 14.933333-32 32v42.666666c0 17.066667 14.933333 32 32 32zM576 437.333333c17.066667 0 32-14.933333 32-32v-42.666666c0-17.066667-14.933333-32-32-32s-32 14.933333-32 32v42.666666c0 17.066667 14.933333 32 32 32zM320 437.333333c17.066667 0 32-14.933333 32-32v-42.666666c0-17.066667-14.933333-32-32-32s-32 14.933333-32 32v42.666666c0 17.066667 14.933333 32 32 32zM704 330.666667c-17.066667 0-32 14.933333-32 32v42.666666c0 17.066667 14.933333 32 32 32s32-14.933333 32-32v-42.666666c0-17.066667-14.933333-32-32-32zM448 586.666667c17.066667 0 32-14.933333 32-32v-42.666667c0-17.066667-14.933333-32-32-32s-32 14.933333-32 32v42.666667c0 17.066667 14.933333 32 32 32zM576 586.666667c17.066667 0 32-14.933333 32-32v-42.666667c0-17.066667-14.933333-32-32-32s-32 14.933333-32 32v42.666667c0 17.066667 14.933333 32 32 32zM352 554.666667v-42.666667c0-17.066667-14.933333-32-32-32s-32 14.933333-32 32v42.666667c0 17.066667 14.933333 32 32 32s32-14.933333 32-32zM704 480c-17.066667 0-32 14.933333-32 32v42.666667c0 17.066667 14.933333 32 32 32s32-14.933333 32-32v-42.666667c0-17.066667-14.933333-32-32-32zM682.666667 650.666667H341.333333c-17.066667 0-32 14.933333-32 32s14.933333 32 32 32h341.333334c17.066667 0 32-14.933333 32-32s-14.933333-32-32-32z" fill="#666666" ></path></svg>
en chat.json:

@@ -3,6 +3,7 @@
   "Delete_all": "Clear All Lexicon",
   "LLM_model_response_empty": "The model flow response is empty, please check whether the model flow output is normal.",
   "ai_reasoning": "Thinking process",
+  "back_to_text": "Text input",
   "chat.quote.No Data": "The file cannot be found",
   "chat.quote.deleted": "This data has been deleted ~",
   "chat_history": "Conversation History",
@@ -16,6 +17,8 @@
   "content_empty": "No Content",
   "contextual": "{{num}} Contexts",
   "contextual_preview": "Contextual Preview {{num}} Items",
+  "core.chat.moveCancel": "Swipe to Cancel",
+  "core.chat.shortSpeak": "Speaking Time is Too Short",
   "csv_input_lexicon_tip": "Only CSV batch import is supported, click to download the template",
   "custom_input_guide_url": "Custom Lexicon URL",
   "data_source": "Source Dataset: {{name}}",
@@ -41,11 +44,14 @@
   "not_query": "Missing query content",
   "not_select_file": "No file selected",
   "plugins_output": "Plugin Output",
+  "press_to_speak": "Hold down to speak",
   "query_extension_IO_tokens": "Problem Optimization Input/Output Tokens",
   "query_extension_result": "Problem optimization results",
   "question_tip": "From top to bottom, the response order of each module",
   "read_raw_source": "Open the original text",
   "reasoning_text": "Thinking process",
+  "release_cancel": "Release Cancel",
+  "release_send": "Release send, slide up to cancel",
   "response.child total points": "Sub-workflow point consumption",
   "response.dataset_concat_length": "Combined total",
   "response.node_inputs": "Node Inputs",
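Taken together with the two locale diffs that follow, these keys all live in the `chat` i18n namespace. A minimal consumption sketch (assuming next-i18next is already configured, as elsewhere in this diff):

    import { useTranslation } from 'next-i18next';

    // Renders the hold-to-talk hint; resolves per active locale:
    // en "Hold down to speak" / zh-CN "按住说话" / zh-Hant "按住說話".
    const PressToSpeakHint = () => {
      const { t } = useTranslation();
      return <span>{t('chat:press_to_speak')}</span>;
    };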
zh-CN chat.json:

@@ -3,6 +3,7 @@
   "Delete_all": "清空词库",
   "LLM_model_response_empty": "模型流响应为空,请检查模型流输出是否正常",
   "ai_reasoning": "思考过程",
+  "back_to_text": "返回输入",
   "chat.quote.No Data": "找不到该文件",
   "chat.quote.deleted": "该数据已被删除~",
   "chat_history": "聊天记录",
@@ -16,6 +17,8 @@
   "content_empty": "内容为空",
   "contextual": "{{num}}条上下文",
   "contextual_preview": "上下文预览 {{num}} 条",
+  "core.chat.moveCancel": "上滑取消",
+  "core.chat.shortSpeak": "说话时间太短",
   "csv_input_lexicon_tip": "仅支持 CSV 批量导入,点击下载模板",
   "custom_input_guide_url": "自定义词库地址",
   "data_source": "来源知识库: {{name}}",
@@ -41,11 +44,14 @@
   "not_query": "缺少查询内容",
   "not_select_file": "未选择文件",
   "plugins_output": "插件输出",
+  "press_to_speak": "按住说话",
   "query_extension_IO_tokens": "问题优化输入/输出 Tokens",
   "query_extension_result": "问题优化结果",
   "question_tip": "从上到下,为各个模块的响应顺序",
   "read_raw_source": "打开原文",
   "reasoning_text": "思考过程",
+  "release_cancel": "松开取消",
+  "release_send": "松开发送,上滑取消",
   "response.child total points": "子工作流积分消耗",
   "response.dataset_concat_length": "合并后总数",
   "response.node_inputs": "节点输入",
zh-Hant chat.json:

@@ -3,6 +3,7 @@
   "Delete_all": "清除所有詞彙",
   "LLM_model_response_empty": "模型流程回應為空,請檢查模型流程輸出是否正常",
   "ai_reasoning": "思考過程",
+  "back_to_text": "返回輸入",
   "chat.quote.No Data": "找不到該文件",
   "chat.quote.deleted": "該數據已被刪除~",
   "chat_history": "對話紀錄",
@@ -35,16 +36,20 @@
   "is_chatting": "對話進行中...請稍候",
   "items": "筆",
   "module_runtime_and": "模組執行總時間",
+  "moveCancel": "上滑取消",
   "multiple_AI_conversations": "多組 AI 對話",
   "new_input_guide_lexicon": "新增詞彙庫",
   "no_workflow_response": "無工作流程資料",
   "not_query": "缺少查詢內容",
   "not_select_file": "尚未選取檔案",
   "plugins_output": "外掛程式輸出",
+  "press_to_speak": "按住說話",
   "query_extension_IO_tokens": "問題優化輸入/輸出 Tokens",
   "question_tip": "由上至下,各個模組的回應順序",
   "read_raw_source": "打開原文",
   "reasoning_text": "思考過程",
+  "release_cancel": "鬆開取消",
+  "release_send": "鬆開發送,上滑取消",
   "response.child total points": "子工作流程點數消耗",
   "response.dataset_concat_length": "合併總數",
   "response.node_inputs": "節點輸入",
@@ -53,6 +58,7 @@
   "select_file": "上傳檔案",
   "select_file_img": "上傳檔案 / 圖片",
   "select_img": "上傳圖片",
+  "shortSpeak ": "說話時間太短",
   "source_cronJob": "定時執行",
   "stream_output": "串流輸出",
   "to_dataset": "前往知識庫",
ChatInput.tsx:

@@ -1,7 +1,6 @@
-import { useSpeech } from '@/web/common/hooks/useSpeech';
 import { useSystemStore } from '@/web/common/system/useSystemStore';
 import { Box, Flex, Spinner, Textarea } from '@chakra-ui/react';
-import React, { useRef, useEffect, useCallback, useMemo } from 'react';
+import React, { useRef, useEffect, useCallback, useMemo, useState } from 'react';
 import { useTranslation } from 'next-i18next';
 import MyTooltip from '@fastgpt/web/components/common/MyTooltip';
 import MyIcon from '@fastgpt/web/components/common/Icon';
@@ -18,6 +17,7 @@ import FilePreview from '../../components/FilePreview';
 import { useFileUpload } from '../hooks/useFileUpload';
 import ComplianceTip from '@/components/common/ComplianceTip/index';
 import { useToast } from '@fastgpt/web/hooks/useToast';
+import VoiceInput, { type VoiceInputComponentRef } from './VoiceInput';
 
 const InputGuideBox = dynamic(() => import('./InputGuideBox'));
 
@@ -44,6 +44,7 @@ const ChatInput = ({
   const { t } = useTranslation();
   const { toast } = useToast();
   const { isPc } = useSystem();
+  const VoiceInputRef = useRef<VoiceInputComponentRef>(null);
 
   const { setValue, watch, control } = chatForm;
   const inputValue = watch('input');
@@ -53,7 +54,6 @@ const ChatInput = ({
   const chatId = useContextSelector(ChatBoxContext, (v) => v.chatId);
   const isChatting = useContextSelector(ChatBoxContext, (v) => v.isChatting);
   const whisperConfig = useContextSelector(ChatBoxContext, (v) => v.whisperConfig);
-  const autoTTSResponse = useContextSelector(ChatBoxContext, (v) => v.autoTTSResponse);
   const chatInputGuide = useContextSelector(ChatBoxContext, (v) => v.chatInputGuide);
   const fileSelectConfig = useContextSelector(ChatBoxContext, (v) => v.fileSelectConfig);
 
@@ -106,86 +106,6 @@ const ChatInput = ({
     [TextareaDom, canSendMessage, fileList, onSendMessage, replaceFiles]
   );
 
-  /* whisper init */
-  const canvasRef = useRef<HTMLCanvasElement>(null);
-  const {
-    isSpeaking,
-    isTransCription,
-    stopSpeak,
-    startSpeak,
-    speakingTimeString,
-    renderAudioGraph,
-    stream
-  } = useSpeech({ appId, ...outLinkAuthData });
-  const onWhisperRecord = useCallback(() => {
-    const finishWhisperTranscription = (text: string) => {
-      if (!text) return;
-      if (whisperConfig?.autoSend) {
-        onSendMessage({
-          text,
-          files: fileList,
-          autoTTSResponse
-        });
-        replaceFiles([]);
-      } else {
-        resetInputVal({ text });
-      }
-    };
-    if (isSpeaking) {
-      return stopSpeak();
-    }
-    startSpeak(finishWhisperTranscription);
-  }, [
-    autoTTSResponse,
-    fileList,
-    isSpeaking,
-    onSendMessage,
-    replaceFiles,
-    resetInputVal,
-    startSpeak,
-    stopSpeak,
-    whisperConfig?.autoSend
-  ]);
-  useEffect(() => {
-    if (!stream) {
-      return;
-    }
-    const audioContext = new AudioContext();
-    const analyser = audioContext.createAnalyser();
-    analyser.fftSize = 4096;
-    analyser.smoothingTimeConstant = 1;
-    const source = audioContext.createMediaStreamSource(stream);
-    source.connect(analyser);
-    const renderCurve = () => {
-      if (!canvasRef.current) return;
-      renderAudioGraph(analyser, canvasRef.current);
-      window.requestAnimationFrame(renderCurve);
-    };
-    renderCurve();
-  }, [renderAudioGraph, stream]);
-
-  const RenderTranslateLoading = useMemo(
-    () => (
-      <Flex
-        position={'absolute'}
-        top={0}
-        bottom={0}
-        left={0}
-        right={0}
-        zIndex={10}
-        pl={5}
-        alignItems={'center'}
-        bg={'white'}
-        color={'primary.500'}
-        visibility={isSpeaking && isTransCription ? 'visible' : 'hidden'}
-      >
-        <Spinner size={'sm'} mr={4} />
-        {t('common:core.chat.Converting to text')}
-      </Flex>
-    ),
-    [isSpeaking, isTransCription, t]
-  );
-
   const RenderTextarea = useMemo(
     () => (
       <Flex alignItems={'flex-end'} mt={fileList.length > 0 ? 1 : 0} pl={[2, 4]}>
@@ -198,7 +118,6 @@ const ChatInput = ({
             cursor={'pointer'}
             transform={'translateY(1px)'}
             onClick={() => {
-              if (isSpeaking) return;
               onOpenSelectFile();
             }}
           >
@@ -208,7 +127,6 @@ const ChatInput = ({
             <File onSelect={(files) => onSelectFile({ files })} />
           </Flex>
         )}
-
         {/* input area */}
         <Textarea
           ref={TextareaDom}
@@ -220,11 +138,7 @@ const ChatInput = ({
             border: 'none'
           }}
           placeholder={
-            isSpeaking
-              ? t('common:core.chat.Speaking')
-              : isPc
-                ? t('common:core.chat.Type a message')
-                : t('chat:input_placeholder_phone')
+            isPc ? t('common:core.chat.Type a message') : t('chat:input_placeholder_phone')
           }
           resize={'none'}
           rows={1}
@@ -237,9 +151,8 @@ const ChatInput = ({
           wordBreak={'break-all'}
           boxShadow={'none !important'}
           color={'myGray.900'}
-          isDisabled={isSpeaking}
-          value={inputValue}
           fontSize={['md', 'sm']}
+          value={inputValue}
           onChange={(e) => {
             const textarea = e.target;
             textarea.style.height = textareaMinH;
@@ -290,118 +203,78 @@ const ChatInput = ({
             }
           }}
         />
-        <Flex alignItems={'center'} position={'absolute'} right={[2, 4]} bottom={['10px', '12px']}>
-          {/* voice-input */}
-          {whisperConfig?.open && !inputValue && !isChatting && (
-            <>
-              <canvas
-                ref={canvasRef}
-                style={{
-                  height: '30px',
-                  width: isSpeaking && !isTransCription ? '100px' : 0,
-                  background: 'white',
-                  zIndex: 0
-                }}
-              />
-              {isSpeaking && (
-                <MyTooltip label={t('common:core.chat.Cancel Speak')}>
-                  <Flex
-                    mr={2}
-                    alignItems={'center'}
-                    justifyContent={'center'}
-                    flexShrink={0}
-                    h={['26px', '32px']}
-                    w={['26px', '32px']}
-                    borderRadius={'md'}
-                    cursor={'pointer'}
-                    _hover={{ bg: '#F5F5F8' }}
-                    onClick={() => stopSpeak(true)}
-                  >
-                    <MyIcon
-                      name={'core/chat/cancelSpeak'}
-                      width={['20px', '22px']}
-                      height={['20px', '22px']}
-                    />
-                  </Flex>
-                </MyTooltip>
-              )}
-              <MyTooltip
-                label={
-                  isSpeaking ? t('common:core.chat.Finish Speak') : t('common:core.chat.Record')
-                }
-              >
-                <Flex
-                  mr={2}
-                  alignItems={'center'}
-                  justifyContent={'center'}
-                  flexShrink={0}
-                  h={['26px', '32px']}
-                  w={['26px', '32px']}
-                  borderRadius={'md'}
-                  cursor={'pointer'}
-                  _hover={{ bg: '#F5F5F8' }}
-                  onClick={onWhisperRecord}
-                >
-                  <MyIcon
-                    name={isSpeaking ? 'core/chat/finishSpeak' : 'core/chat/recordFill'}
-                    width={['20px', '22px']}
-                    height={['20px', '22px']}
-                    color={isSpeaking ? 'primary.500' : 'myGray.600'}
-                  />
-                </Flex>
-              </MyTooltip>
-            </>
-          )}
-          {/* send and stop icon */}
-          {isSpeaking ? (
-            <Box color={'#5A646E'} w={'36px'} textAlign={'right'} whiteSpace={'nowrap'}>
-              {speakingTimeString}
-            </Box>
-          ) : (
-            <Flex
-              alignItems={'center'}
-              justifyContent={'center'}
-              flexShrink={0}
-              h={['28px', '32px']}
-              w={['28px', '32px']}
-              borderRadius={'md'}
-              bg={
-                isSpeaking || isChatting
-                  ? ''
-                  : !havInput || hasFileUploading
-                    ? '#E5E5E5'
-                    : 'primary.500'
-              }
-              cursor={havInput ? 'pointer' : 'not-allowed'}
-              lineHeight={1}
-              onClick={() => {
-                if (isChatting) {
-                  return onStop();
-                }
-                return handleSend();
-              }}
-            >
-              {isChatting ? (
-                <MyIcon
-                  animation={'zoomStopIcon 0.4s infinite alternate'}
-                  width={['22px', '25px']}
-                  height={['22px', '25px']}
-                  cursor={'pointer'}
-                  name={'stop'}
-                  color={'gray.500'}
-                />
-              ) : (
-                <MyTooltip label={t('common:core.chat.Send Message')}>
-                  <MyIcon
-                    name={'core/chat/sendFill'}
-                    width={['18px', '20px']}
-                    height={['18px', '20px']}
-                    color={'white'}
-                  />
-                </MyTooltip>
-              )}
-            </Flex>
-          )}
+        <Flex
+          alignItems={'center'}
+          position={'absolute'}
+          right={[2, 4]}
+          bottom={['10px', '12px']}
+          zIndex={3}
+        >
+          {/* Voice input icon */}
+          {whisperConfig?.open && !inputValue && (
+            <MyTooltip label={t('common:core.chat.Record')}>
+              <Flex
+                alignItems={'center'}
+                justifyContent={'center'}
+                flexShrink={0}
+                h={['28px', '32px']}
+                w={['28px', '32px']}
+                mr={2}
+                borderRadius={'md'}
+                cursor={'pointer'}
+                _hover={{ bg: '#F5F5F8' }}
+                onClick={() => {
+                  VoiceInputRef.current?.onSpeak?.();
+                }}
+              >
+                <MyIcon
+                  name={'core/chat/recordFill'}
+                  width={['22px', '25px']}
+                  height={['22px', '25px']}
+                  color={'myGray.600'}
+                />
+              </Flex>
+            </MyTooltip>
+          )}
+
+          {/* send and stop icon */}
+          <Flex
+            alignItems={'center'}
+            justifyContent={'center'}
+            flexShrink={0}
+            h={['28px', '32px']}
+            w={['28px', '32px']}
+            borderRadius={'md'}
+            bg={isChatting ? '' : !havInput || hasFileUploading ? '#E5E5E5' : 'primary.500'}
+            cursor={havInput ? 'pointer' : 'not-allowed'}
+            lineHeight={1}
+            onClick={() => {
+              if (isChatting) {
+                return onStop();
+              }
+              return handleSend();
+            }}
+          >
+            {isChatting ? (
+              <MyIcon
+                animation={'zoomStopIcon 0.4s infinite alternate'}
+                width={['22px', '25px']}
+                height={['22px', '25px']}
+                cursor={'pointer'}
+                name={'stop'}
+                color={'gray.500'}
+              />
+            ) : (
+              <MyTooltip label={t('common:core.chat.Send Message')}>
+                <MyIcon
+                  name={'core/chat/sendFill'}
+                  width={['18px', '20px']}
+                  height={['18px', '20px']}
+                  color={'white'}
+                />
+              </MyTooltip>
+            )}
+          </Flex>
         </Flex>
       </Flex>
     ),
@@ -415,21 +288,15 @@ const ChatInput = ({
       inputValue,
       isChatting,
       isPc,
-      isSpeaking,
-      isTransCription,
       onOpenSelectFile,
       onSelectFile,
       onStop,
-      onWhisperRecord,
       selectFileIcon,
       selectFileLabel,
       setValue,
       showSelectFile,
       showSelectImg,
-      speakingTimeString,
-      stopSpeak,
-      t,
-      whisperConfig?.open
+      t
     ]
   );
 
@@ -468,7 +335,7 @@ const ChatInput = ({
       pt={fileList.length > 0 ? '0' : ['14px', '18px']}
       pb={['14px', '18px']}
       position={'relative'}
-      boxShadow={isSpeaking ? `0 0 10px rgba(54,111,255,0.4)` : `0 0 10px rgba(0,0,0,0.2)`}
+      boxShadow={`0 0 10px rgba(0,0,0,0.2)`}
       borderRadius={['none', 'md']}
       bg={'white'}
       overflow={'display'}
@@ -495,15 +362,20 @@ const ChatInput = ({
         }}
       />
     )}
 
-    {/* translate loading */}
-    {RenderTranslateLoading}
-
     {/* file preview */}
     <Box px={[1, 3]}>
       <FilePreview fileList={fileList} removeFiles={removeFiles} />
     </Box>
 
+    {/* voice input and loading container */}
+    {!inputValue && (
+      <VoiceInput
+        ref={VoiceInputRef}
+        onSendMessage={onSendMessage}
+        resetInputVal={resetInputVal}
+      />
+    )}
+
     {RenderTextarea}
   </Box>
   <ComplianceTip type={'chat'} />
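The structural point of this file's diff: ChatInput no longer owns any recording state. It keeps only a ref and forwards the mic-icon click, while all speech state moves into the new VoiceInput component below. A stripped-down sketch of that parent/child contract (names taken from the diff; everything else elided):

    import React, { useRef, forwardRef, useImperativeHandle } from 'react';

    export interface VoiceInputComponentRef {
      onSpeak: () => void;
    }

    // Child: exposes a single imperative entry point to its parent.
    const VoiceInput = forwardRef<VoiceInputComponentRef>((_, ref) => {
      useImperativeHandle(ref, () => ({
        onSpeak: () => {
          // start recording (PC) or arm the press-to-speak panel (mobile)
        }
      }));
      return null; // only rendered while pre-speak / speaking / transcribing
    });
    VoiceInput.displayName = 'VoiceInput';

    // Parent: triggers the child without holding any speech state itself.
    const Parent = () => {
      const voiceInputRef = useRef<VoiceInputComponentRef>(null);
      return (
        <>
          <button onClick={() => voiceInputRef.current?.onSpeak?.()}>record</button>
          <VoiceInput ref={voiceInputRef} />
        </>
      );
    };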
VoiceInput.tsx (new file):

@@ -0,0 +1,367 @@
+import { useSpeech } from '@/web/common/hooks/useSpeech';
+import { Box, Flex, HStack, Spinner } from '@chakra-ui/react';
+import React, {
+  useRef,
+  useEffect,
+  useCallback,
+  useState,
+  forwardRef,
+  useImperativeHandle,
+  useMemo
+} from 'react';
+import { useTranslation } from 'next-i18next';
+import MyTooltip from '@fastgpt/web/components/common/MyTooltip';
+import MyIcon from '@fastgpt/web/components/common/Icon';
+import { useSystem } from '@fastgpt/web/hooks/useSystem';
+import { useContextSelector } from 'use-context-selector';
+import { ChatBoxContext } from '../Provider';
+import MyIconButton from '@/pageComponents/account/team/OrgManage/IconButton';
+
+export interface VoiceInputComponentRef {
+  onSpeak: () => void;
+}
+
+type VoiceInputProps = {
+  onSendMessage: (params: { text: string; files?: any[]; autoTTSResponse?: boolean }) => void;
+  resetInputVal: (val: { text: string }) => void;
+};
+
+// PC voice input
+const PCVoiceInput = ({
+  speakingTimeString,
+  stopSpeak,
+  canvasRef
+}: {
+  speakingTimeString: string;
+  stopSpeak: (param: boolean) => void;
+  canvasRef: React.RefObject<HTMLCanvasElement>;
+}) => {
+  const { t } = useTranslation();
+
+  return (
+    <HStack h={'100%'} px={4}>
+      <Box fontSize="sm" color="myGray.500" flex={'1 0 0'}>
+        {t('common:core.chat.Speaking')}
+      </Box>
+      <canvas
+        ref={canvasRef}
+        style={{
+          height: '10px',
+          width: '100px',
+          background: 'white'
+        }}
+      />
+      <Box fontSize="sm" color="myGray.500" whiteSpace={'nowrap'}>
+        {speakingTimeString}
+      </Box>
+      <MyTooltip label={t('common:core.chat.Cancel Speak')}>
+        <MyIconButton
+          name={'core/chat/cancelSpeak'}
+          h={'22px'}
+          w={'22px'}
+          onClick={() => stopSpeak(true)}
+        />
+      </MyTooltip>
+      <MyTooltip label={t('common:core.chat.Finish Speak')}>
+        <MyIconButton
+          name={'core/chat/finishSpeak'}
+          h={'22px'}
+          w={'22px'}
+          onClick={() => stopSpeak(false)}
+        />
+      </MyTooltip>
+    </HStack>
+  );
+};
+
+// mobile voice input
+const MobileVoiceInput = ({
+  isSpeaking,
+  onStartSpeak,
+  onCloseSpeak,
+  stopSpeak,
+  canvasRef
+}: {
+  isSpeaking: boolean;
+  onStartSpeak: () => void;
+  onCloseSpeak: () => any;
+  stopSpeak: (param: boolean) => void;
+  canvasRef: React.RefObject<HTMLCanvasElement>;
+}) => {
+  const { t } = useTranslation();
+
+  const isPressing = useRef(false);
+  const startTimeRef = useRef(0); // debounce: used to reject presses that are too short
+
+  const startYRef = useRef(0);
+
+  const [isCancel, setIsCancel] = useState(false);
+
+  const handleTouchStart = useCallback(
+    (e: React.TouchEvent<HTMLDivElement>) => {
+      isPressing.current = true;
+      setIsCancel(false);
+
+      startTimeRef.current = Date.now();
+      const touch = e.touches[0];
+      startYRef.current = touch.pageY;
+
+      onStartSpeak();
+    },
+    [onStartSpeak]
+  );
+
+  const handleTouchMove = useCallback(
+    (e: React.TouchEvent<HTMLDivElement>) => {
+      const touch = e.touches[0] as Touch;
+      const currentY = touch.pageY;
+      const deltaY = startYRef.current - currentY;
+
+      if (deltaY > 90) {
+        setIsCancel(true);
+      } else if (deltaY <= 90) {
+        setIsCancel(false);
+      }
+    },
+    [startYRef]
+  );
+
+  const handleTouchEnd = useCallback(
+    (e: React.TouchEvent<HTMLDivElement>) => {
+      if (!isPressing.current) return;
+
+      const endTime = Date.now();
+      const timeDifference = endTime - startTimeRef.current;
+
+      if (isCancel || timeDifference < 200) {
+        stopSpeak(true);
+      } else {
+        stopSpeak(false);
+      }
+    },
+    [isCancel, stopSpeak]
+  );
+
+  return (
+    <Flex position="relative" h="100%">
+      {/* Back Icon */}
+      {!isSpeaking && (
+        <MyTooltip label={t('chat:back_to_text')}>
+          <MyIconButton
+            position="absolute"
+            right={2}
+            top={'50%'}
+            transform={'translateY(-50%)'}
+            zIndex={5}
+            name={'core/chat/backText'}
+            h={'22px'}
+            w={'22px'}
+            onClick={onCloseSpeak}
+          />
+        </MyTooltip>
+      )}
+      <Flex
+        alignItems={'center'}
+        justifyContent={'center'}
+        h="100%"
+        flex="1 0 0"
+        bg={isSpeaking ? (isCancel ? 'red.500' : 'primary.500') : 'white'}
+        onTouchMove={handleTouchMove}
+        onTouchEnd={handleTouchEnd}
+        onTouchStart={handleTouchStart}
+        onTouchCancel={() => {
+          stopSpeak(true);
+        }}
+        zIndex={4}
+      >
+        <Box visibility={isSpeaking ? 'hidden' : 'visible'}>{t('chat:press_to_speak')}</Box>
+        <Box
+          position="absolute"
+          h={'100%'}
+          w={'100%'}
+          as="canvas"
+          ref={canvasRef}
+          flex="0 0 80%"
+          visibility={isSpeaking ? 'visible' : 'hidden'}
+        />
+      </Flex>
+
+      {/* Mask */}
+      {isSpeaking && (
+        <Flex
+          justifyContent="center"
+          alignItems="center"
+          height="100%"
+          position="fixed"
+          left={0}
+          right={0}
+          bottom={'50px'}
+          h={'200px'}
+          bg="linear-gradient(to top, white, rgba(255, 255, 255, 0.7), rgba(255, 255, 255, 0))"
+        >
+          <Box fontSize="sm" color="myGray.500" position="absolute" bottom={'10px'}>
+            {isCancel ? t('chat:release_cancel') : t('chat:release_send')}
+          </Box>
+        </Flex>
+      )}
+    </Flex>
+  );
+};
+
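Two magic numbers drive the gesture logic above: sliding more than 90px upward arms cancellation, and a press shorter than 200 ms is discarded as accidental. The same rule as a pure function (an illustrative sketch; constants copied from the diff):

    type PressGesture = { deltaY: number; pressMs: number };

    // true → discard the recording; false → releasing sends it.
    const shouldCancel = ({ deltaY, pressMs }: PressGesture): boolean =>
      deltaY > 90 || // finger slid up past the cancel threshold
      pressMs < 200; // press too short to contain usable speech

    // shouldCancel({ deltaY: 120, pressMs: 800 }) === true  (slide up to cancel)
    // shouldCancel({ deltaY: 0,   pressMs: 150 }) === true  (too short)
    // shouldCancel({ deltaY: 0,   pressMs: 800 }) === false (release to send)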
+const VoiceInput = forwardRef<VoiceInputComponentRef, VoiceInputProps>(
+  ({ onSendMessage, resetInputVal }, ref) => {
+    const { t } = useTranslation();
+    const { isPc } = useSystem();
+
+    const outLinkAuthData = useContextSelector(ChatBoxContext, (v) => v.outLinkAuthData);
+    const appId = useContextSelector(ChatBoxContext, (v) => v.appId);
+    const whisperConfig = useContextSelector(ChatBoxContext, (v) => v.whisperConfig);
+    const autoTTSResponse = useContextSelector(ChatBoxContext, (v) => v.autoTTSResponse);
+    const canvasRef = useRef<HTMLCanvasElement>(null);
+
+    const {
+      isSpeaking,
+      isTransCription,
+      stopSpeak,
+      startSpeak,
+      speakingTimeString,
+      renderAudioGraphPc,
+      renderAudioGraphMobile,
+      stream
+    } = useSpeech({ appId, ...outLinkAuthData });
+
+    const [mobilePreSpeak, setMobilePreSpeak] = useState(false);
+
+    // Canvas render
+    useEffect(() => {
+      if (!stream) {
+        return;
+      }
+
+      const audioContext = new AudioContext();
+      const analyser = audioContext.createAnalyser();
+      analyser.fftSize = 4096;
+      analyser.smoothingTimeConstant = 1;
+      const source = audioContext.createMediaStreamSource(stream);
+      source.connect(analyser);
+
+      let animationFrameId: number | null = null;
+      const renderCurve = () => {
+        const canvas = canvasRef.current;
+        if (!canvas) return;
+
+        const ctx = canvas.getContext('2d');
+        if (!ctx) return;
+
+        if (!stream.active) {
+          ctx.clearRect(0, 0, canvas.width, canvas.height);
+          if (animationFrameId) {
+            window.cancelAnimationFrame(animationFrameId);
+            animationFrameId = null;
+          }
+          return;
+        }
+
+        if (isPc) {
+          renderAudioGraphPc(analyser, canvas);
+        } else {
+          renderAudioGraphMobile(analyser, canvas);
+        }
+        animationFrameId = window.requestAnimationFrame(renderCurve);
+      };
+
+      renderCurve();
+
+      return () => {
+        if (animationFrameId) {
+          window.cancelAnimationFrame(animationFrameId);
+        }
+        audioContext.close();
+        source.disconnect();
+        analyser.disconnect();
+      };
+    }, [stream, canvasRef, renderAudioGraphPc, renderAudioGraphMobile, isPc]);
+
+    const onStartSpeak = useCallback(() => {
+      const finishWhisperTranscription = (text: string) => {
+        if (!text) return;
+        if (whisperConfig?.autoSend) {
+          onSendMessage({
+            text,
+            autoTTSResponse
+          });
+        } else {
+          resetInputVal({ text });
+        }
+      };
+      startSpeak(finishWhisperTranscription);
+    }, []);
+
+    const onSpeach = useCallback(() => {
+      if (isPc) {
+        onStartSpeak();
+      } else {
+        setMobilePreSpeak(true);
+      }
+    }, []);
+    useImperativeHandle(ref, () => ({
+      onSpeak: onSpeach
+    }));
+
+    if (!whisperConfig?.open) return null;
+    if (!mobilePreSpeak && !isSpeaking && !isTransCription) return null;
+
+    return (
+      <Box
+        position="absolute"
+        overflow={'hidden'}
+        userSelect={'none'}
+        top={0}
+        left={0}
+        right={0}
+        bottom={0}
+        bg="white"
+        zIndex={5}
+        borderRadius={isPc ? 'md' : ''}
+        onContextMenu={(e) => e.preventDefault()}
+      >
+        {isPc ? (
+          <PCVoiceInput
+            speakingTimeString={speakingTimeString}
+            stopSpeak={stopSpeak}
+            canvasRef={canvasRef}
+          />
+        ) : (
+          <MobileVoiceInput
+            isSpeaking={isSpeaking}
+            onStartSpeak={onStartSpeak}
+            onCloseSpeak={() => setMobilePreSpeak(false)}
+            stopSpeak={stopSpeak}
+            canvasRef={canvasRef}
+          />
+        )}
+
+        {isTransCription && (
+          <Flex
+            position={'absolute'}
+            top={0}
+            bottom={0}
+            left={0}
+            right={0}
+            pl={5}
+            alignItems={'center'}
+            bg={'white'}
+            color={'primary.500'}
+            zIndex={6}
+          >
+            <Spinner size={'sm'} mr={4} />
+            {t('common:core.chat.Converting to text')}
+          </Flex>
+        )}
+      </Box>
+    );
+  }
+);
+VoiceInput.displayName = 'VoiceInput';
+
+export default VoiceInput;
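One detail in the canvas effect above is easy to overlook: the loop stops itself once the MediaStream goes inactive, and the effect cleanup both cancels the pending frame and closes the AudioContext, so a re-render mid-recording cannot leak a second loop. The teardown shape in isolation (a generic sketch, not FastGPT API):

    // requestAnimationFrame loop with condition-based self-stop and external cancel.
    function startRenderLoop(draw: () => boolean /* false stops the loop */): () => void {
      let frameId: number | null = null;
      const tick = () => {
        if (!draw()) return; // cf. the stream.active check above
        frameId = window.requestAnimationFrame(tick);
      };
      tick();
      return () => {
        // effect cleanup: cancel whichever frame is still queued
        if (frameId !== null) window.cancelAnimationFrame(frameId);
      };
    }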
useSpeech.ts:

@@ -7,16 +7,21 @@ import { OutLinkChatAuthProps } from '@fastgpt/global/support/permission/chat';
 
 export const useSpeech = (props?: OutLinkChatAuthProps & { appId?: string }) => {
   const { t } = useTranslation();
-  const mediaRecorder = useRef<MediaRecorder>();
-  const [mediaStream, setMediaStream] = useState<MediaStream>();
   const { toast } = useToast();
+
   const [isSpeaking, setIsSpeaking] = useState(false);
   const [isTransCription, setIsTransCription] = useState(false);
-  const [audioSecond, setAudioSecond] = useState(0);
-  const intervalRef = useRef<any>();
-  const startTimestamp = useRef(0);
-  const cancelWhisperSignal = useRef(false);
+
+  const mediaRecorder = useRef<MediaRecorder>();
+  const [mediaStream, setMediaStream] = useState<MediaStream>();
+
+  const timeIntervalRef = useRef<any>();
+  const cancelWhisperSignal = useRef(false);
+  const stopCalledRef = useRef(false);
+
+  const startTimestamp = useRef(0);
+
+  const [audioSecond, setAudioSecond] = useState(0);
   const speakingTimeString = useMemo(() => {
     const minutes: number = Math.floor(audioSecond / 60);
     const remainingSeconds: number = Math.floor(audioSecond % 60);
@@ -25,17 +30,16 @@ export const useSpeech = (props?: OutLinkChatAuthProps & { appId?: string }) =>
     return `${formattedMinutes}:${formattedSeconds}`;
   }, [audioSecond]);
 
-  const renderAudioGraph = useCallback((analyser: AnalyserNode, canvas: HTMLCanvasElement) => {
+  const renderAudioGraphPc = useCallback((analyser: AnalyserNode, canvas: HTMLCanvasElement) => {
     const bufferLength = analyser.frequencyBinCount;
-    const backgroundColor = 'white';
     const dataArray = new Uint8Array(bufferLength);
     analyser.getByteTimeDomainData(dataArray);
     const canvasCtx = canvas?.getContext('2d');
-    const width = 300;
-    const height = 200;
+    const width = canvas.width;
+    const height = canvas.height;
     if (!canvasCtx) return;
     canvasCtx.clearRect(0, 0, width, height);
-    canvasCtx.fillStyle = backgroundColor;
+    canvasCtx.fillStyle = 'white';
     canvasCtx.fillRect(0, 0, width, height);
     const barWidth = (width / bufferLength) * 2.5;
     let x = 0;
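For reference, `getByteTimeDomainData` yields bytes where 128 is silence and 0/255 are full swing, which is why both renderers size their bars by deviation from a midpoint; the PC renderer uses the fixed 128 midpoint, while the mobile renderer added below re-centers on the per-frame average. In miniature (a sketch of the Web Audio contract, not FastGPT code):

    const audioCtx = new AudioContext();
    const analyser = audioCtx.createAnalyser();

    const dataArray = new Uint8Array(analyser.frequencyBinCount);
    analyser.getByteTimeDomainData(dataArray); // 128 = zero signal

    // Deviation from the midpoint, normalized to [-1, 1]; e.g. a byte of 192 -> 0.5.
    const normalized = (dataArray[0] - 128) / 128;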
@@ -49,127 +53,212 @@ export const useSpeech = (props?: OutLinkChatAuthProps & { appId?: string }) =>
       x += barWidth + 1;
     }
   }, []);
 
-  const startSpeak = async (onFinish: (text: string) => void) => {
-    if (!navigator?.mediaDevices?.getUserMedia) {
-      return toast({
-        status: 'warning',
-        title: t('common:common.speech.not support')
-      });
-    }
-    try {
-      cancelWhisperSignal.current = false;
-      const stream = await navigator.mediaDevices.getUserMedia({ audio: true });
-      setMediaStream(stream);
-      mediaRecorder.current = new MediaRecorder(stream);
-      const chunks: Blob[] = [];
-      setIsSpeaking(true);
-      mediaRecorder.current.onstart = () => {
-        startTimestamp.current = Date.now();
-        setAudioSecond(0);
-        intervalRef.current = setInterval(() => {
-          const currentTimestamp = Date.now();
-          const duration = (currentTimestamp - startTimestamp.current) / 1000;
-          setAudioSecond(duration);
-        }, 1000);
-      };
-      mediaRecorder.current.ondataavailable = (e) => {
-        chunks.push(e.data);
-      };
-      mediaRecorder.current.onstop = async () => {
-        if (!cancelWhisperSignal.current) {
-          const formData = new FormData();
-          const { options, filename } = (() => {
-            if (MediaRecorder.isTypeSupported('video/webm; codecs=vp9')) {
-              return {
-                options: { mimeType: 'video/webm; codecs=vp9' },
-                filename: 'recording.mp3'
-              };
-            }
-            if (MediaRecorder.isTypeSupported('video/webm')) {
-              return {
-                options: { type: 'video/webm' },
-                filename: 'recording.mp3'
-              };
-            }
-            if (MediaRecorder.isTypeSupported('video/mp4')) {
-              return {
-                options: { mimeType: 'video/mp4', videoBitsPerSecond: 100000 },
-                filename: 'recording.mp4'
-              };
-            }
-            return {
-              options: { type: 'video/webm' },
-              filename: 'recording.mp3'
-            };
-          })();
-          const blob = new Blob(chunks, options);
-          const duration = Math.round((Date.now() - startTimestamp.current) / 1000);
-          formData.append('file', blob, filename);
-          formData.append(
-            'data',
-            JSON.stringify({
-              ...props,
-              duration
-            })
-          );
-          setIsTransCription(true);
-          try {
-            const result = await POST<string>('/v1/audio/transcriptions', formData, {
-              timeout: 60000,
-              headers: {
-                'Content-Type': 'multipart/form-data; charset=utf-8'
-              }
-            });
-            onFinish(result);
-          } catch (error) {
-            toast({
-              status: 'warning',
-              title: getErrText(error, t('common:common.speech.error tip'))
-            });
-          }
-        }
-        // close media stream
-        stream.getTracks().forEach((track) => track.stop());
-        setIsTransCription(false);
-        setIsSpeaking(false);
-      };
-      mediaRecorder.current.onerror = (e) => {
-        console.log('error', e);
-        setIsSpeaking(false);
-      };
-      mediaRecorder.current.start();
-    } catch (error) {
-      toast({
-        status: 'warning',
-        title: getErrText(error, 'Whisper error')
-      });
-      console.log(error);
-    }
-  };
-
-  const stopSpeak = (cancel = false) => {
-    cancelWhisperSignal.current = cancel;
-    if (mediaRecorder.current) {
-      mediaRecorder.current?.stop();
-      clearInterval(intervalRef.current);
-    }
-  };
-
+  const renderAudioGraphMobile = useCallback(
+    (analyser: AnalyserNode, canvas: HTMLCanvasElement) => {
+      const canvasCtx = canvas?.getContext('2d');
+      if (!canvasCtx) return;
+
+      const bufferLength = analyser.frequencyBinCount;
+      const dataArray = new Uint8Array(bufferLength);
+      analyser.getByteTimeDomainData(dataArray);
+
+      const width = canvas.width;
+      const height = canvas.height;
+      canvasCtx.clearRect(0, 0, width, height);
+
+      // Set transparent background
+      canvasCtx.fillStyle = 'rgba(255, 255, 255, 0)';
+      canvasCtx.fillRect(0, 0, width, height);
+
+      const centerY = height / 2;
+      const barWidth = (width / bufferLength) * 15;
+      const gap = 2; // gap between bars
+      let x = width * 0.1;
+
+      let sum = 0;
+      let maxDiff = 0;
+
+      for (let i = 0; i < bufferLength; i++) {
+        sum += dataArray[i];
+        maxDiff = Math.max(maxDiff, Math.abs(dataArray[i] - 128));
+      }
+      const average = sum / bufferLength;
+
+      // draw initial rectangle waveform
+      canvasCtx.beginPath();
+      canvasCtx.fillStyle = '#FFFFFF';
+
+      const initialHeight = height * 0.1;
+      for (let i = 0; i < width * 0.8; i += barWidth + gap) {
+        canvasCtx.fillRect(i + width * 0.1, centerY - initialHeight, barWidth, initialHeight);
+        canvasCtx.fillRect(i + width * 0.1, centerY, barWidth, initialHeight);
+      }
+
+      // draw dynamic waveform
+      canvasCtx.beginPath();
+      for (let i = 0; i < bufferLength; i += 4) {
+        const value = dataArray[i];
+        const normalizedValue = (value - average) / 128;
+        const amplification = 2.5;
+        const barHeight = normalizedValue * height * 0.4 * amplification;
+
+        canvasCtx.fillStyle = '#FFFFFF';
+
+        canvasCtx.fillRect(x, centerY - Math.abs(barHeight), barWidth, Math.abs(barHeight));
+        canvasCtx.fillRect(x, centerY, barWidth, Math.abs(barHeight));
+
+        x += barWidth + gap; // advance to the next bar
+
+        if (x > width * 0.9) break;
+      }
+    },
+    []
+  );
+
+  const startSpeak = useCallback(
+    async (onFinish: (text: string) => void) => {
+      if (!navigator?.mediaDevices?.getUserMedia) {
+        return toast({
+          status: 'warning',
+          title: t('common:common.speech.not support')
+        });
+      }
+
+      // Init status
+      if (timeIntervalRef.current) {
+        clearInterval(timeIntervalRef.current);
+      }
+      cancelWhisperSignal.current = false;
+      stopCalledRef.current = false;
+
+      setIsSpeaking(true);
+      setAudioSecond(0);
+
+      try {
+        const stream = await navigator.mediaDevices.getUserMedia({ audio: true });
+        setMediaStream(stream);
+
+        mediaRecorder.current = new MediaRecorder(stream);
+        const chunks: Blob[] = [];
+
+        mediaRecorder.current.onstart = () => {
+          startTimestamp.current = Date.now();
+          timeIntervalRef.current = setInterval(() => {
+            const currentTimestamp = Date.now();
+            const duration = (currentTimestamp - startTimestamp.current) / 1000;
+            setAudioSecond(duration);
+          }, 1000);
+        };
+        mediaRecorder.current.ondataavailable = (e) => {
+          chunks.push(e.data);
+        };
+        mediaRecorder.current.onstop = async () => {
+          // close media stream
+          stream.getTracks().forEach((track) => track.stop());
+          setIsSpeaking(false);
+
+          if (timeIntervalRef.current) {
+            clearInterval(timeIntervalRef.current);
+          }
+
+          if (!cancelWhisperSignal.current) {
+            const formData = new FormData();
+            const { options, filename } = (() => {
+              if (MediaRecorder.isTypeSupported('video/webm; codecs=vp9')) {
+                return {
+                  options: { mimeType: 'video/webm; codecs=vp9' },
+                  filename: 'recording.mp3'
+                };
+              }
+              if (MediaRecorder.isTypeSupported('video/webm')) {
+                return {
+                  options: { type: 'video/webm' },
+                  filename: 'recording.mp3'
+                };
+              }
+              if (MediaRecorder.isTypeSupported('video/mp4')) {
+                return {
+                  options: { mimeType: 'video/mp4', videoBitsPerSecond: 100000 },
+                  filename: 'recording.mp4'
+                };
+              }
+              return {
+                options: { type: 'video/webm' },
+                filename: 'recording.mp3'
+              };
+            })();
+
+            const blob = new Blob(chunks, options);
+            const duration = Math.round((Date.now() - startTimestamp.current) / 1000);
+            formData.append('file', blob, filename);
+            formData.append(
+              'data',
+              JSON.stringify({
+                ...props,
+                duration
+              })
+            );
+
+            setIsTransCription(true);
+            try {
+              const result = await POST<string>('/v1/audio/transcriptions', formData, {
+                timeout: 60000,
+                headers: {
+                  'Content-Type': 'multipart/form-data; charset=utf-8'
+                }
+              });
+              onFinish(result);
+            } catch (error) {
+              toast({
+                status: 'warning',
+                title: getErrText(error, t('common:common.speech.error tip'))
+              });
+            }
+            setIsTransCription(false);
+          }
+        };
+        mediaRecorder.current.onerror = (e) => {
+          if (timeIntervalRef.current) {
+            clearInterval(timeIntervalRef.current);
+          }
+          console.log('error', e);
+          setIsSpeaking(false);
+        };
+
+        // If onclick stop, stop speak
+        if (stopCalledRef.current) {
+          mediaRecorder.current.stop();
+        } else {
+          mediaRecorder.current.start();
+        }
+      } catch (error) {
+        toast({
+          status: 'warning',
+          title: getErrText(error, 'Whisper error')
+        });
+        console.log(error);
+      }
+    },
+    [toast, t, props]
+  );
+
+  const stopSpeak = useCallback((cancel = false) => {
+    cancelWhisperSignal.current = cancel;
+    stopCalledRef.current = true;
+
+    if (timeIntervalRef.current) {
+      clearInterval(timeIntervalRef.current);
+    }
+
+    if (mediaRecorder.current && mediaRecorder.current.state !== 'inactive') {
+      mediaRecorder.current.stop();
+    }
+  }, []);
+
+  // Leave page, stop speak
   useEffect(() => {
     return () => {
-      clearInterval(intervalRef.current);
+      clearInterval(timeIntervalRef.current);
       if (mediaRecorder.current && mediaRecorder.current.state !== 'inactive') {
         mediaRecorder.current.stop();
       }
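The new `stopCalledRef` closes a real race: on mobile a tap can finish (touchend → stopSpeak) before the awaited getUserMedia resolves, and the old code would then start a recorder that nothing would ever stop. stopSpeak now flags the ref, and startSpeak checks it after constructing the recorder. Reduced to its shape (a sketch; where the diff calls mediaRecorder.stop(), the sketch releases the tracks directly, which is the effect that matters):

    let stopCalled = false;

    async function startSpeak(): Promise<void> {
      stopCalled = false;
      // May resolve only after the user has already released the button.
      const stream = await navigator.mediaDevices.getUserMedia({ audio: true });
      const recorder = new MediaRecorder(stream);
      if (stopCalled) {
        // Too late: release the mic instead of starting an orphan recording.
        stream.getTracks().forEach((track) => track.stop());
        return;
      }
      recorder.start();
    }

    function stopSpeak(): void {
      stopCalled = true; // safe to call even before the recorder exists
    }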
@@ -184,14 +273,15 @@ export const useSpeech = (props?: OutLinkChatAuthProps & { appId?: string }) =>
     if (audioSecond >= 60) {
       stopSpeak();
     }
-  }, [audioSecond]);
+  }, [audioSecond, stopSpeak]);
 
   return {
     startSpeak,
     stopSpeak,
     isSpeaking,
     isTransCription,
-    renderAudioGraph,
+    renderAudioGraphPc,
+    renderAudioGraphMobile,
     stream: mediaStream,
     speakingTimeString
   };
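Finally, the container negotiation buried in startSpeak's onstop handler above is worth isolating: prefer `video/webm; codecs=vp9`, fall back to plain `video/webm`, then `video/mp4` for Safari-family browsers, and upload under a fixed filename either way. A standalone sketch with the same probe order (the mismatched `.mp3` extension is carried over from the diff as-is):

    // Pick the first container this browser's MediaRecorder can produce.
    function pickRecordingFormat(): { options: MediaRecorderOptions; filename: string } {
      if (MediaRecorder.isTypeSupported('video/webm; codecs=vp9')) {
        return { options: { mimeType: 'video/webm; codecs=vp9' }, filename: 'recording.mp3' };
      }
      if (MediaRecorder.isTypeSupported('video/webm')) {
        return { options: { mimeType: 'video/webm' }, filename: 'recording.mp3' };
      }
      if (MediaRecorder.isTypeSupported('video/mp4')) {
        return { options: { mimeType: 'video/mp4', videoBitsPerSecond: 100000 }, filename: 'recording.mp4' };
      }
      return { options: {}, filename: 'recording.mp3' };
    }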