perf: mobile voice input (#4437)

* update: Mobile voice interaction (#4362)

* Add files via upload

* Add files via upload

* Update ollama.md

* Update ollama.md

* Add files via upload

* Update useSpeech.ts

* Update ChatInput.tsx

* Update useSpeech.ts

* Update ChatInput.tsx

* Update useSpeech.ts

* Update constants.ts

* Add files via upload

* Update ChatInput.tsx

* Update useSpeech.ts

* Update useSpeech.ts

* Update useSpeech.ts

* Update ChatInput.tsx

* Add files via upload

* Update common.json

* Update VoiceInput.tsx

* Update ChatInput.tsx

* Update VoiceInput.tsx

* Update useSpeech.ts

* Update useSpeech.ts

* Update common.json

* Update common.json

* Update common.json

* Update VoiceInput.tsx

* Update VoiceInput.tsx

* Update ChatInput.tsx

* Update VoiceInput.tsx

* Update ChatInput.tsx

* Update VoiceInput.tsx

* Update ChatInput.tsx

* Update useSpeech.ts

* Update common.json

* Update chat.json

* Update common.json

* Update chat.json

* Update common.json

* Update chat.json

* Update VoiceInput.tsx

* Update ChatInput.tsx

* Update useSpeech.ts

* Update VoiceInput.tsx

* speech ui

* Optimize the voice input component: adjust the input box display logic, fix the voice input mask layer style, update the canvas background transparency, and improve the interaction experience. (#4435)

* perf: mobile voice input

---------

Co-authored-by: dreamer6680 <1468683855@qq.com>
Author: Archer
Date: 2025-04-02 22:25:50 +08:00
Committed by: archer
Parent: c2e088cf39
Commit: e4c4941a50
8 changed files with 675 additions and 323 deletions

View File: ChatInput.tsx

@@ -1,7 +1,6 @@
import { useSpeech } from '@/web/common/hooks/useSpeech';
import { useSystemStore } from '@/web/common/system/useSystemStore';
import { Box, Flex, Spinner, Textarea } from '@chakra-ui/react';
import React, { useRef, useEffect, useCallback, useMemo } from 'react';
import React, { useRef, useEffect, useCallback, useMemo, useState } from 'react';
import { useTranslation } from 'next-i18next';
import MyTooltip from '@fastgpt/web/components/common/MyTooltip';
import MyIcon from '@fastgpt/web/components/common/Icon';
@@ -18,6 +17,7 @@ import FilePreview from '../../components/FilePreview';
import { useFileUpload } from '../hooks/useFileUpload';
import ComplianceTip from '@/components/common/ComplianceTip/index';
import { useToast } from '@fastgpt/web/hooks/useToast';
import VoiceInput, { type VoiceInputComponentRef } from './VoiceInput';
const InputGuideBox = dynamic(() => import('./InputGuideBox'));
@@ -44,6 +44,7 @@ const ChatInput = ({
const { t } = useTranslation();
const { toast } = useToast();
const { isPc } = useSystem();
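// Imperative ref to the new VoiceInput component; the record icon below opens the voice panel via onSpeak()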
const VoiceInputRef = useRef<VoiceInputComponentRef>(null);
const { setValue, watch, control } = chatForm;
const inputValue = watch('input');
@@ -53,7 +54,6 @@ const ChatInput = ({
const chatId = useContextSelector(ChatBoxContext, (v) => v.chatId);
const isChatting = useContextSelector(ChatBoxContext, (v) => v.isChatting);
const whisperConfig = useContextSelector(ChatBoxContext, (v) => v.whisperConfig);
const autoTTSResponse = useContextSelector(ChatBoxContext, (v) => v.autoTTSResponse);
const chatInputGuide = useContextSelector(ChatBoxContext, (v) => v.chatInputGuide);
const fileSelectConfig = useContextSelector(ChatBoxContext, (v) => v.fileSelectConfig);
@@ -106,86 +106,6 @@ const ChatInput = ({
[TextareaDom, canSendMessage, fileList, onSendMessage, replaceFiles]
);
/* whisper init */
const canvasRef = useRef<HTMLCanvasElement>(null);
const {
isSpeaking,
isTransCription,
stopSpeak,
startSpeak,
speakingTimeString,
renderAudioGraph,
stream
} = useSpeech({ appId, ...outLinkAuthData });
const onWhisperRecord = useCallback(() => {
const finishWhisperTranscription = (text: string) => {
if (!text) return;
if (whisperConfig?.autoSend) {
onSendMessage({
text,
files: fileList,
autoTTSResponse
});
replaceFiles([]);
} else {
resetInputVal({ text });
}
};
if (isSpeaking) {
return stopSpeak();
}
startSpeak(finishWhisperTranscription);
}, [
autoTTSResponse,
fileList,
isSpeaking,
onSendMessage,
replaceFiles,
resetInputVal,
startSpeak,
stopSpeak,
whisperConfig?.autoSend
]);
useEffect(() => {
if (!stream) {
return;
}
const audioContext = new AudioContext();
const analyser = audioContext.createAnalyser();
analyser.fftSize = 4096;
analyser.smoothingTimeConstant = 1;
const source = audioContext.createMediaStreamSource(stream);
source.connect(analyser);
const renderCurve = () => {
if (!canvasRef.current) return;
renderAudioGraph(analyser, canvasRef.current);
window.requestAnimationFrame(renderCurve);
};
renderCurve();
}, [renderAudioGraph, stream]);
const RenderTranslateLoading = useMemo(
() => (
<Flex
position={'absolute'}
top={0}
bottom={0}
left={0}
right={0}
zIndex={10}
pl={5}
alignItems={'center'}
bg={'white'}
color={'primary.500'}
visibility={isSpeaking && isTransCription ? 'visible' : 'hidden'}
>
<Spinner size={'sm'} mr={4} />
{t('common:core.chat.Converting to text')}
</Flex>
),
[isSpeaking, isTransCription, t]
);
const RenderTextarea = useMemo(
() => (
<Flex alignItems={'flex-end'} mt={fileList.length > 0 ? 1 : 0} pl={[2, 4]}>
@@ -198,7 +118,6 @@ const ChatInput = ({
cursor={'pointer'}
transform={'translateY(1px)'}
onClick={() => {
if (isSpeaking) return;
onOpenSelectFile();
}}
>
@@ -208,7 +127,6 @@ const ChatInput = ({
<File onSelect={(files) => onSelectFile({ files })} />
</Flex>
)}
{/* input area */}
<Textarea
ref={TextareaDom}
@@ -220,11 +138,7 @@ const ChatInput = ({
border: 'none'
}}
placeholder={
isSpeaking
? t('common:core.chat.Speaking')
: isPc
? t('common:core.chat.Type a message')
: t('chat:input_placeholder_phone')
isPc ? t('common:core.chat.Type a message') : t('chat:input_placeholder_phone')
}
resize={'none'}
rows={1}
@@ -237,9 +151,8 @@ const ChatInput = ({
wordBreak={'break-all'}
boxShadow={'none !important'}
color={'myGray.900'}
isDisabled={isSpeaking}
value={inputValue}
fontSize={['md', 'sm']}
value={inputValue}
onChange={(e) => {
const textarea = e.target;
textarea.style.height = textareaMinH;
@@ -290,118 +203,78 @@ const ChatInput = ({
}
}}
/>
<Flex alignItems={'center'} position={'absolute'} right={[2, 4]} bottom={['10px', '12px']}>
{/* voice-input */}
{whisperConfig?.open && !inputValue && !isChatting && (
<>
<canvas
ref={canvasRef}
style={{
height: '30px',
width: isSpeaking && !isTransCription ? '100px' : 0,
background: 'white',
zIndex: 0
<Flex
alignItems={'center'}
position={'absolute'}
right={[2, 4]}
bottom={['10px', '12px']}
zIndex={3}
>
{/* Voice input icon */}
{whisperConfig?.open && !inputValue && (
<MyTooltip label={t('common:core.chat.Record')}>
<Flex
alignItems={'center'}
justifyContent={'center'}
flexShrink={0}
h={['28px', '32px']}
w={['28px', '32px']}
mr={2}
borderRadius={'md'}
cursor={'pointer'}
_hover={{ bg: '#F5F5F8' }}
onClick={() => {
VoiceInputRef.current?.onSpeak?.();
}}
/>
{isSpeaking && (
<MyTooltip label={t('common:core.chat.Cancel Speak')}>
<Flex
mr={2}
alignItems={'center'}
justifyContent={'center'}
flexShrink={0}
h={['26px', '32px']}
w={['26px', '32px']}
borderRadius={'md'}
cursor={'pointer'}
_hover={{ bg: '#F5F5F8' }}
onClick={() => stopSpeak(true)}
>
<MyIcon
name={'core/chat/cancelSpeak'}
width={['20px', '22px']}
height={['20px', '22px']}
/>
</Flex>
</MyTooltip>
)}
<MyTooltip
label={
isSpeaking ? t('common:core.chat.Finish Speak') : t('common:core.chat.Record')
}
>
<Flex
mr={2}
alignItems={'center'}
justifyContent={'center'}
flexShrink={0}
h={['26px', '32px']}
w={['26px', '32px']}
borderRadius={'md'}
cursor={'pointer'}
_hover={{ bg: '#F5F5F8' }}
onClick={onWhisperRecord}
>
<MyIcon
name={isSpeaking ? 'core/chat/finishSpeak' : 'core/chat/recordFill'}
width={['20px', '22px']}
height={['20px', '22px']}
color={isSpeaking ? 'primary.500' : 'myGray.600'}
/>
</Flex>
</MyTooltip>
</>
)}
{/* send and stop icon */}
{isSpeaking ? (
<Box color={'#5A646E'} w={'36px'} textAlign={'right'} whiteSpace={'nowrap'}>
{speakingTimeString}
</Box>
) : (
<Flex
alignItems={'center'}
justifyContent={'center'}
flexShrink={0}
h={['28px', '32px']}
w={['28px', '32px']}
borderRadius={'md'}
bg={
isSpeaking || isChatting
? ''
: !havInput || hasFileUploading
? '#E5E5E5'
: 'primary.500'
}
cursor={havInput ? 'pointer' : 'not-allowed'}
lineHeight={1}
onClick={() => {
if (isChatting) {
return onStop();
}
return handleSend();
}}
>
{isChatting ? (
<MyIcon
animation={'zoomStopIcon 0.4s infinite alternate'}
name={'core/chat/recordFill'}
width={['22px', '25px']}
height={['22px', '25px']}
cursor={'pointer'}
name={'stop'}
color={'gray.500'}
color={'myGray.600'}
/>
) : (
<MyTooltip label={t('common:core.chat.Send Message')}>
<MyIcon
name={'core/chat/sendFill'}
width={['18px', '20px']}
height={['18px', '20px']}
color={'white'}
/>
</MyTooltip>
)}
</Flex>
</Flex>
</MyTooltip>
)}
{/* send and stop icon */}
<Flex
alignItems={'center'}
justifyContent={'center'}
flexShrink={0}
h={['28px', '32px']}
w={['28px', '32px']}
borderRadius={'md'}
bg={isChatting ? '' : !havInput || hasFileUploading ? '#E5E5E5' : 'primary.500'}
cursor={havInput ? 'pointer' : 'not-allowed'}
lineHeight={1}
onClick={() => {
if (isChatting) {
return onStop();
}
return handleSend();
}}
>
{isChatting ? (
<MyIcon
animation={'zoomStopIcon 0.4s infinite alternate'}
width={['22px', '25px']}
height={['22px', '25px']}
cursor={'pointer'}
name={'stop'}
color={'gray.500'}
/>
) : (
<MyTooltip label={t('common:core.chat.Send Message')}>
<MyIcon
name={'core/chat/sendFill'}
width={['18px', '20px']}
height={['18px', '20px']}
color={'white'}
/>
</MyTooltip>
)}
</Flex>
</Flex>
</Flex>
),
@@ -415,21 +288,15 @@ const ChatInput = ({
inputValue,
isChatting,
isPc,
isSpeaking,
isTransCription,
onOpenSelectFile,
onSelectFile,
onStop,
onWhisperRecord,
selectFileIcon,
selectFileLabel,
setValue,
showSelectFile,
showSelectImg,
speakingTimeString,
stopSpeak,
t,
whisperConfig?.open
t
]
);
@@ -468,7 +335,7 @@ const ChatInput = ({
pt={fileList.length > 0 ? '0' : ['14px', '18px']}
pb={['14px', '18px']}
position={'relative'}
boxShadow={isSpeaking ? `0 0 10px rgba(54,111,255,0.4)` : `0 0 10px rgba(0,0,0,0.2)`}
boxShadow={`0 0 10px rgba(0,0,0,0.2)`}
borderRadius={['none', 'md']}
bg={'white'}
overflow={'display'}
@@ -495,15 +362,20 @@ const ChatInput = ({
}}
/>
)}
{/* translate loading */}
{RenderTranslateLoading}
{/* file preview */}
<Box px={[1, 3]}>
<FilePreview fileList={fileList} removeFiles={removeFiles} />
</Box>
{/* voice input and loading container */}
{!inputValue && (
<VoiceInput
ref={VoiceInputRef}
onSendMessage={onSendMessage}
resetInputVal={resetInputVal}
/>
)}
{RenderTextarea}
</Box>
<ComplianceTip type={'chat'} />

View File: VoiceInput.tsx

@@ -0,0 +1,367 @@
import { useSpeech } from '@/web/common/hooks/useSpeech';
import { Box, Flex, HStack, Spinner } from '@chakra-ui/react';
import React, {
useRef,
useEffect,
useCallback,
useState,
forwardRef,
useImperativeHandle,
useMemo
} from 'react';
import { useTranslation } from 'next-i18next';
import MyTooltip from '@fastgpt/web/components/common/MyTooltip';
import MyIcon from '@fastgpt/web/components/common/Icon';
import { useSystem } from '@fastgpt/web/hooks/useSystem';
import { useContextSelector } from 'use-context-selector';
import { ChatBoxContext } from '../Provider';
import MyIconButton from '@/pageComponents/account/team/OrgManage/IconButton';
export interface VoiceInputComponentRef {
onSpeak: () => void;
}
type VoiceInputProps = {
onSendMessage: (params: { text: string; files?: any[]; autoTTSResponse?: boolean }) => void;
resetInputVal: (val: { text: string }) => void;
};
// PC voice input
const PCVoiceInput = ({
speakingTimeString,
stopSpeak,
canvasRef
}: {
speakingTimeString: string;
stopSpeak: (param: boolean) => void;
canvasRef: React.RefObject<HTMLCanvasElement>;
}) => {
const { t } = useTranslation();
return (
<HStack h={'100%'} px={4}>
<Box fontSize="sm" color="myGray.500" flex={'1 0 0'}>
{t('common:core.chat.Speaking')}
</Box>
<canvas
ref={canvasRef}
style={{
height: '10px',
width: '100px',
background: 'white'
}}
/>
<Box fontSize="sm" color="myGray.500" whiteSpace={'nowrap'}>
{speakingTimeString}
</Box>
<MyTooltip label={t('common:core.chat.Cancel Speak')}>
<MyIconButton
name={'core/chat/cancelSpeak'}
h={'22px'}
w={'22px'}
onClick={() => stopSpeak(true)}
/>
</MyTooltip>
<MyTooltip label={t('common:core.chat.Finish Speak')}>
<MyIconButton
name={'core/chat/finishSpeak'}
h={'22px'}
w={'22px'}
onClick={() => stopSpeak(false)}
/>
</MyTooltip>
</HStack>
);
};
// mobile voice input
const MobileVoiceInput = ({
isSpeaking,
onStartSpeak,
onCloseSpeak,
stopSpeak,
canvasRef
}: {
isSpeaking: boolean;
onStartSpeak: () => void;
onCloseSpeak: () => any;
stopSpeak: (param: boolean) => void;
canvasRef: React.RefObject<HTMLCanvasElement>;
}) => {
const { t } = useTranslation();
const isPressing = useRef(false);
const startTimeRef = useRef(0); // press start time, used to debounce accidental short presses
const startYRef = useRef(0);
const [isCancel, setIsCancel] = useState(false);
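// Hold-to-record gesture: slide up more than 90px to cancel; releases shorter than 200ms count as accidental taps and are discarded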
const handleTouchStart = useCallback(
(e: React.TouchEvent<HTMLDivElement>) => {
isPressing.current = true;
setIsCancel(false);
startTimeRef.current = Date.now();
const touch = e.touches[0];
startYRef.current = touch.pageY;
onStartSpeak();
},
[onStartSpeak]
);
const handleTouchMove = useCallback(
(e: React.TouchEvent<HTMLDivElement>) => {
const touch = e.touches[0] as Touch;
const currentY = touch.pageY;
const deltaY = startYRef.current - currentY;
if (deltaY > 90) {
setIsCancel(true);
} else if (deltaY <= 90) {
setIsCancel(false);
}
},
[startYRef]
);
const handleTouchEnd = useCallback(
(e: React.TouchEvent<HTMLDivElement>) => {
if (!isPressing.current) return;
const endTime = Date.now();
const timeDifference = endTime - startTimeRef.current;
if (isCancel || timeDifference < 200) {
stopSpeak(true);
} else {
stopSpeak(false);
}
},
[isCancel, stopSpeak]
);
return (
<Flex position="relative" h="100%">
{/* Back Icon */}
{!isSpeaking && (
<MyTooltip label={t('chat:back_to_text')}>
<MyIconButton
position="absolute"
right={2}
top={'50%'}
transform={'translateY(-50%)'}
zIndex={5}
name={'core/chat/backText'}
h={'22px'}
w={'22px'}
onClick={onCloseSpeak}
/>
</MyTooltip>
)}
<Flex
alignItems={'center'}
justifyContent={'center'}
h="100%"
flex="1 0 0"
bg={isSpeaking ? (isCancel ? 'red.500' : 'primary.500') : 'white'}
onTouchMove={handleTouchMove}
onTouchEnd={handleTouchEnd}
onTouchStart={handleTouchStart}
onTouchCancel={() => {
stopSpeak(true);
}}
zIndex={4}
>
<Box visibility={isSpeaking ? 'hidden' : 'visible'}>{t('chat:press_to_speak')}</Box>
<Box
position="absolute"
h={'100%'}
w={'100%'}
as="canvas"
ref={canvasRef}
flex="0 0 80%"
visibility={isSpeaking ? 'visible' : 'hidden'}
/>
</Flex>
{/* Mask */}
{isSpeaking && (
<Flex
justifyContent="center"
alignItems="center"
height="100%"
position="fixed"
left={0}
right={0}
bottom={'50px'}
h={'200px'}
bg="linear-gradient(to top, white, rgba(255, 255, 255, 0.7), rgba(255, 255, 255, 0))"
>
<Box fontSize="sm" color="myGray.500" position="absolute" bottom={'10px'}>
{isCancel ? t('chat:release_cancel') : t('chat:release_send')}
</Box>
</Flex>
)}
</Flex>
);
};
const VoiceInput = forwardRef<VoiceInputComponentRef, VoiceInputProps>(
({ onSendMessage, resetInputVal }, ref) => {
const { t } = useTranslation();
const { isPc } = useSystem();
const outLinkAuthData = useContextSelector(ChatBoxContext, (v) => v.outLinkAuthData);
const appId = useContextSelector(ChatBoxContext, (v) => v.appId);
const whisperConfig = useContextSelector(ChatBoxContext, (v) => v.whisperConfig);
const autoTTSResponse = useContextSelector(ChatBoxContext, (v) => v.autoTTSResponse);
const canvasRef = useRef<HTMLCanvasElement>(null);
const {
isSpeaking,
isTransCription,
stopSpeak,
startSpeak,
speakingTimeString,
renderAudioGraphPc,
renderAudioGraphMobile,
stream
} = useSpeech({ appId, ...outLinkAuthData });
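// On mobile the voice panel is shown first (hold to speak); recording only starts on touch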
const [mobilePreSpeak, setMobilePreSpeak] = useState(false);
// Canvas render
useEffect(() => {
if (!stream) {
return;
}
const audioContext = new AudioContext();
const analyser = audioContext.createAnalyser();
analyser.fftSize = 4096;
analyser.smoothingTimeConstant = 1;
const source = audioContext.createMediaStreamSource(stream);
source.connect(analyser);
let animationFrameId: number | null = null;
const renderCurve = () => {
const canvas = canvasRef.current;
if (!canvas) return;
const ctx = canvas.getContext('2d');
if (!ctx) return;
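// Once the media stream closes, clear the canvas and stop the animation loop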
if (!stream.active) {
ctx.clearRect(0, 0, canvas.width, canvas.height);
if (animationFrameId) {
window.cancelAnimationFrame(animationFrameId);
animationFrameId = null;
}
return;
}
if (isPc) {
renderAudioGraphPc(analyser, canvas);
} else {
renderAudioGraphMobile(analyser, canvas);
}
animationFrameId = window.requestAnimationFrame(renderCurve);
};
renderCurve();
return () => {
if (animationFrameId) {
window.cancelAnimationFrame(animationFrameId);
}
audioContext.close();
source.disconnect();
analyser.disconnect();
};
}, [stream, canvasRef, renderAudioGraphPc, renderAudioGraphMobile, isPc]);
const onStartSpeak = useCallback(() => {
const finishWhisperTranscription = (text: string) => {
if (!text) return;
if (whisperConfig?.autoSend) {
onSendMessage({
text,
autoTTSResponse
});
} else {
resetInputVal({ text });
}
};
startSpeak(finishWhisperTranscription);
}, []);
const onSpeak = useCallback(() => {
if (isPc) {
onStartSpeak();
} else {
setMobilePreSpeak(true);
}
}, []);
useImperativeHandle(ref, () => ({
onSpeak
}));
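// Render nothing unless whisper is enabled and a voice session is pending, active, or transcribing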
if (!whisperConfig?.open) return null;
if (!mobilePreSpeak && !isSpeaking && !isTransCription) return null;
return (
<Box
position="absolute"
overflow={'hidden'}
userSelect={'none'}
top={0}
left={0}
right={0}
bottom={0}
bg="white"
zIndex={5}
borderRadius={isPc ? 'md' : ''}
onContextMenu={(e) => e.preventDefault()}
>
{isPc ? (
<PCVoiceInput
speakingTimeString={speakingTimeString}
stopSpeak={stopSpeak}
canvasRef={canvasRef}
/>
) : (
<MobileVoiceInput
isSpeaking={isSpeaking}
onStartSpeak={onStartSpeak}
onCloseSpeak={() => setMobilePreSpeak(false)}
stopSpeak={stopSpeak}
canvasRef={canvasRef}
/>
)}
{isTransCription && (
<Flex
position={'absolute'}
top={0}
bottom={0}
left={0}
right={0}
pl={5}
alignItems={'center'}
bg={'white'}
color={'primary.500'}
zIndex={6}
>
<Spinner size={'sm'} mr={4} />
{t('common:core.chat.Converting to text')}
</Flex>
)}
</Box>
);
}
);
VoiceInput.displayName = 'VoiceInput';
export default VoiceInput;

View File: useSpeech.ts

@@ -7,16 +7,21 @@ import { OutLinkChatAuthProps } from '@fastgpt/global/support/permission/chat';
export const useSpeech = (props?: OutLinkChatAuthProps & { appId?: string }) => {
const { t } = useTranslation();
const mediaRecorder = useRef<MediaRecorder>();
const [mediaStream, setMediaStream] = useState<MediaStream>();
const { toast } = useToast();
const [isSpeaking, setIsSpeaking] = useState(false);
const [isTransCription, setIsTransCription] = useState(false);
const [audioSecond, setAudioSecond] = useState(0);
const intervalRef = useRef<any>();
const startTimestamp = useRef(0);
const cancelWhisperSignal = useRef(false);
const mediaRecorder = useRef<MediaRecorder>();
const [mediaStream, setMediaStream] = useState<MediaStream>();
const timeIntervalRef = useRef<any>();
const cancelWhisperSignal = useRef(false);
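// Set when stopSpeak is called before the recorder has started (getUserMedia resolves asynchronously)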
const stopCalledRef = useRef(false);
const startTimestamp = useRef(0);
const [audioSecond, setAudioSecond] = useState(0);
const speakingTimeString = useMemo(() => {
const minutes: number = Math.floor(audioSecond / 60);
const remainingSeconds: number = Math.floor(audioSecond % 60);
@@ -25,17 +30,16 @@ export const useSpeech = (props?: OutLinkChatAuthProps & { appId?: string }) =>
return `${formattedMinutes}:${formattedSeconds}`;
}, [audioSecond]);
const renderAudioGraph = useCallback((analyser: AnalyserNode, canvas: HTMLCanvasElement) => {
const renderAudioGraphPc = useCallback((analyser: AnalyserNode, canvas: HTMLCanvasElement) => {
const bufferLength = analyser.frequencyBinCount;
const backgroundColor = 'white';
const dataArray = new Uint8Array(bufferLength);
analyser.getByteTimeDomainData(dataArray);
const canvasCtx = canvas?.getContext('2d');
const width = 300;
const height = 200;
const width = canvas.width;
const height = canvas.height;
if (!canvasCtx) return;
canvasCtx.clearRect(0, 0, width, height);
canvasCtx.fillStyle = backgroundColor;
canvasCtx.fillStyle = 'white';
canvasCtx.fillRect(0, 0, width, height);
const barWidth = (width / bufferLength) * 2.5;
let x = 0;
@@ -49,127 +53,212 @@ export const useSpeech = (props?: OutLinkChatAuthProps & { appId?: string }) =>
x += barWidth + 1;
}
}, []);
const renderAudioGraphMobile = useCallback(
(analyser: AnalyserNode, canvas: HTMLCanvasElement) => {
const canvasCtx = canvas?.getContext('2d');
if (!canvasCtx) return;
const startSpeak = async (onFinish: (text: string) => void) => {
if (!navigator?.mediaDevices?.getUserMedia) {
return toast({
status: 'warning',
title: t('common:common.speech.not support')
});
}
try {
const bufferLength = analyser.frequencyBinCount;
const dataArray = new Uint8Array(bufferLength);
analyser.getByteTimeDomainData(dataArray);
const width = canvas.width;
const height = canvas.height;
canvasCtx.clearRect(0, 0, width, height);
// Set transparent background
canvasCtx.fillStyle = 'rgba(255, 255, 255, 0)';
canvasCtx.fillRect(0, 0, width, height);
const centerY = height / 2;
const barWidth = (width / bufferLength) * 15;
const gap = 2; // gap between waveform bars
let x = width * 0.1;
let sum = 0;
let maxDiff = 0;
for (let i = 0; i < bufferLength; i++) {
sum += dataArray[i];
maxDiff = Math.max(maxDiff, Math.abs(dataArray[i] - 128));
}
const average = sum / bufferLength;
// draw initial rectangle waveform
canvasCtx.beginPath();
canvasCtx.fillStyle = '#FFFFFF';
const initialHeight = height * 0.1;
for (let i = 0; i < width * 0.8; i += barWidth + gap) {
canvasCtx.fillRect(i + width * 0.1, centerY - initialHeight, barWidth, initialHeight);
canvasCtx.fillRect(i + width * 0.1, centerY, barWidth, initialHeight);
}
// draw dynamic waveform
canvasCtx.beginPath();
for (let i = 0; i < bufferLength; i += 4) {
const value = dataArray[i];
const normalizedValue = (value - average) / 128;
const amplification = 2.5;
const barHeight = normalizedValue * height * 0.4 * amplification;
canvasCtx.fillStyle = '#FFFFFF';
canvasCtx.fillRect(x, centerY - Math.abs(barHeight), barWidth, Math.abs(barHeight));
canvasCtx.fillRect(x, centerY, barWidth, Math.abs(barHeight));
x += barWidth + gap; // advance by one bar plus the gap
if (x > width * 0.9) break;
}
},
[]
);
const startSpeak = useCallback(
async (onFinish: (text: string) => void) => {
if (!navigator?.mediaDevices?.getUserMedia) {
return toast({
status: 'warning',
title: t('common:common.speech.not support')
});
}
// Init status
if (timeIntervalRef.current) {
clearInterval(timeIntervalRef.current);
}
cancelWhisperSignal.current = false;
stopCalledRef.current = false;
const stream = await navigator.mediaDevices.getUserMedia({ audio: true });
setMediaStream(stream);
mediaRecorder.current = new MediaRecorder(stream);
const chunks: Blob[] = [];
setIsSpeaking(true);
setAudioSecond(0);
mediaRecorder.current.onstart = () => {
startTimestamp.current = Date.now();
setAudioSecond(0);
intervalRef.current = setInterval(() => {
const currentTimestamp = Date.now();
const duration = (currentTimestamp - startTimestamp.current) / 1000;
setAudioSecond(duration);
}, 1000);
};
try {
const stream = await navigator.mediaDevices.getUserMedia({ audio: true });
setMediaStream(stream);
mediaRecorder.current.ondataavailable = (e) => {
chunks.push(e.data);
};
mediaRecorder.current = new MediaRecorder(stream);
const chunks: Blob[] = [];
mediaRecorder.current.onstop = async () => {
if (!cancelWhisperSignal.current) {
const formData = new FormData();
const { options, filename } = (() => {
if (MediaRecorder.isTypeSupported('video/webm; codecs=vp9')) {
return {
options: { mimeType: 'video/webm; codecs=vp9' },
filename: 'recording.mp3'
};
}
if (MediaRecorder.isTypeSupported('video/webm')) {
mediaRecorder.current.onstart = () => {
startTimestamp.current = Date.now();
timeIntervalRef.current = setInterval(() => {
const currentTimestamp = Date.now();
const duration = (currentTimestamp - startTimestamp.current) / 1000;
setAudioSecond(duration);
}, 1000);
};
mediaRecorder.current.ondataavailable = (e) => {
chunks.push(e.data);
};
mediaRecorder.current.onstop = async () => {
// close media stream
stream.getTracks().forEach((track) => track.stop());
setIsSpeaking(false);
if (timeIntervalRef.current) {
clearInterval(timeIntervalRef.current);
}
if (!cancelWhisperSignal.current) {
const formData = new FormData();
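// Pick a mimeType the browser's MediaRecorder supports, falling back from webm/vp9 to webm to mp4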
const { options, filename } = (() => {
if (MediaRecorder.isTypeSupported('video/webm; codecs=vp9')) {
return {
options: { mimeType: 'video/webm; codecs=vp9' },
filename: 'recording.mp3'
};
}
if (MediaRecorder.isTypeSupported('video/webm')) {
return {
options: { type: 'video/webm' },
filename: 'recording.mp3'
};
}
if (MediaRecorder.isTypeSupported('video/mp4')) {
return {
options: { mimeType: 'video/mp4', videoBitsPerSecond: 100000 },
filename: 'recording.mp4'
};
}
return {
options: { type: 'video/webm' },
filename: 'recording.mp3'
};
}
if (MediaRecorder.isTypeSupported('video/mp4')) {
return {
options: { mimeType: 'video/mp4', videoBitsPerSecond: 100000 },
filename: 'recording.mp4'
};
}
return {
options: { type: 'video/webm' },
filename: 'recording.mp3'
};
})();
})();
const blob = new Blob(chunks, options);
const duration = Math.round((Date.now() - startTimestamp.current) / 1000);
formData.append('file', blob, filename);
formData.append(
'data',
JSON.stringify({
...props,
duration
})
);
const blob = new Blob(chunks, options);
const duration = Math.round((Date.now() - startTimestamp.current) / 1000);
formData.append('file', blob, filename);
formData.append(
'data',
JSON.stringify({
...props,
duration
})
);
setIsTransCription(true);
try {
const result = await POST<string>('/v1/audio/transcriptions', formData, {
timeout: 60000,
headers: {
'Content-Type': 'multipart/form-data; charset=utf-8'
}
});
onFinish(result);
} catch (error) {
toast({
status: 'warning',
title: getErrText(error, t('common:common.speech.error tip'))
});
setIsTransCription(true);
try {
const result = await POST<string>('/v1/audio/transcriptions', formData, {
timeout: 60000,
headers: {
'Content-Type': 'multipart/form-data; charset=utf-8'
}
});
onFinish(result);
} catch (error) {
toast({
status: 'warning',
title: getErrText(error, t('common:common.speech.error tip'))
});
}
setIsTransCription(false);
}
};
mediaRecorder.current.onerror = (e) => {
if (timeIntervalRef.current) {
clearInterval(timeIntervalRef.current);
}
console.log('error', e);
setIsSpeaking(false);
};
// If stopSpeak was already called while the recorder was being set up, stop immediately instead of starting
if (stopCalledRef.current) {
mediaRecorder.current.stop();
} else {
mediaRecorder.current.start();
}
} catch (error) {
toast({
status: 'warning',
title: getErrText(error, 'Whisper error')
});
console.log(error);
}
},
[toast, t, props]
);
// close media stream
stream.getTracks().forEach((track) => track.stop());
setIsTransCription(false);
setIsSpeaking(false);
};
mediaRecorder.current.onerror = (e) => {
console.log('error', e);
setIsSpeaking(false);
};
mediaRecorder.current.start();
} catch (error) {
toast({
status: 'warning',
title: getErrText(error, 'Whisper error')
});
console.log(error);
}
};
const stopSpeak = (cancel = false) => {
const stopSpeak = useCallback((cancel = false) => {
cancelWhisperSignal.current = cancel;
if (mediaRecorder.current) {
mediaRecorder.current?.stop();
clearInterval(intervalRef.current);
}
};
stopCalledRef.current = true;
if (timeIntervalRef.current) {
clearInterval(timeIntervalRef.current);
}
if (mediaRecorder.current && mediaRecorder.current.state !== 'inactive') {
mediaRecorder.current.stop();
}
}, []);
// Stop recording and clear the timer when the component unmounts
useEffect(() => {
return () => {
clearInterval(intervalRef.current);
clearInterval(timeIntervalRef.current);
if (mediaRecorder.current && mediaRecorder.current.state !== 'inactive') {
mediaRecorder.current.stop();
}
@@ -184,14 +273,15 @@ export const useSpeech = (props?: OutLinkChatAuthProps & { appId?: string }) =>
if (audioSecond >= 60) {
stopSpeak();
}
}, [audioSecond]);
}, [audioSecond, stopSpeak]);
return {
startSpeak,
stopSpeak,
isSpeaking,
isTransCription,
renderAudioGraph,
renderAudioGraphPc,
renderAudioGraphMobile,
stream: mediaStream,
speakingTimeString
};