Add whisper and tts ui (#484)

Co-authored-by: heheer <71265218+newfish-cmyk@users.noreply.github.com>
Author: Archer
Date: 2023-11-17 00:03:05 +08:00
Committed by: GitHub
Parent: f6aea484ce
Commit: 4358b6de4d
34 changed files with 806 additions and 333 deletions

@@ -110,6 +110,12 @@ export const streamFetch = ({
};
read();
} catch (err: any) {
if (abortSignal.signal.aborted) {
return resolve({
responseText: '',
responseData: []
});
}
console.log(err, 'fetch error');
reject(getErrText(err, '请求异常'));
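For context, the abort branch above is what makes "stop generating" feel graceful: when the caller's AbortController fires, the stream read loop throws, and the catch resolves with empty data instead of surfacing an error toast. A minimal TypeScript sketch of that pattern (standalone, not the project's exact streamFetch API):

// Sketch: a fetch/read loop that treats an intentional abort as success.
// `abortSignal` is an AbortController, mirroring the naming in the code above.
const abortSignal = new AbortController();

async function readAll(url: string): Promise<{ responseText: string }> {
  let responseText = '';
  try {
    const res = await fetch(url, { signal: abortSignal.signal });
    const reader = res.body!.getReader();
    while (true) {
      const { done, value } = await reader.read();
      if (done) break;
      responseText += new TextDecoder().decode(value);
    }
  } catch (err) {
    // A user-triggered abort is not an error: return whatever has been read so far.
    if (!abortSignal.signal.aborted) throw err;
  }
  return { responseText };
}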

@@ -1,20 +1,71 @@
import { useEffect, useRef, useState } from 'react';
import { useEffect, useMemo, useRef, useState } from 'react';
import { POST } from '../api/request';
import { useToast } from './useToast';
import { useTranslation } from 'next-i18next';
import { getErrText } from '@fastgpt/global/common/error/utils';
export const useSpeech = () => {
export const useSpeech = (props?: { shareId?: string }) => {
const { shareId } = props || {};
const { t } = useTranslation();
const mediaRecorder = useRef<MediaRecorder>();
const mediaStream = useRef<MediaStream>();
const { toast } = useToast();
const [isSpeaking, setIsSpeaking] = useState(false);
const [isTransCription, setIsTransCription] = useState(false);
const [audioSecond, setAudioSecond] = useState(0);
const intervalRef = useRef<any>();
const startTimestamp = useRef(0);
const startSpeak = async () => {
const speakingTimeString = useMemo(() => {
const minutes: number = Math.floor(audioSecond / 60);
const remainingSeconds: number = Math.floor(audioSecond % 60);
const formattedMinutes: string = minutes.toString().padStart(2, '0');
const formattedSeconds: string = remainingSeconds.toString().padStart(2, '0');
return `${formattedMinutes}:${formattedSeconds}`;
}, [audioSecond]);
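// Draw the current time-domain audio frame as a simple bar graph on the given canvas.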
const renderAudioGraph = (analyser: AnalyserNode, canvas: HTMLCanvasElement) => {
const bufferLength = analyser.frequencyBinCount;
const backgroundColor = 'white';
const dataArray = new Uint8Array(bufferLength);
analyser.getByteTimeDomainData(dataArray);
const canvasCtx = canvas?.getContext('2d');
const width = 300;
const height = 200;
if (!canvasCtx) return;
canvasCtx.clearRect(0, 0, width, height);
canvasCtx.fillStyle = backgroundColor;
canvasCtx.fillRect(0, 0, width, height);
const barWidth = (width / bufferLength) * 2.5;
let x = 0;
canvasCtx.moveTo(x, height / 2);
for (let i = 0; i < bufferLength; i += 10) {
const barHeight = (dataArray[i] / 256) * height - height * 0.15;
canvasCtx.fillStyle = '#3370FF';
const adjustedBarHeight = Math.max(0, barHeight);
canvasCtx.fillRect(x, height - adjustedBarHeight, barWidth, adjustedBarHeight);
x += barWidth + 1;
}
};
const startSpeak = async (onFinish: (text: string) => void) => {
try {
const stream = await navigator.mediaDevices.getUserMedia({ audio: true });
mediaStream.current = stream;
mediaRecorder.current = new MediaRecorder(stream);
const chunks: Blob[] = [];
setIsSpeaking(true);
mediaRecorder.current.onstart = () => {
startTimestamp.current = Date.now();
setAudioSecond(0);
intervalRef.current = setInterval(() => {
const currentTimestamp = Date.now();
const duration = (currentTimestamp - startTimestamp.current) / 1000;
setAudioSecond(duration);
}, 1000);
};
mediaRecorder.current.ondataavailable = (e) => {
chunks.push(e.data);
@@ -23,48 +74,66 @@ export const useSpeech = () => {
mediaRecorder.current.onstop = async () => {
const formData = new FormData();
const blob = new Blob(chunks, { type: 'audio/webm' });
const duration = Math.round((Date.now() - startTimestamp.current) / 1000);
formData.append('files', blob, 'recording.webm');
formData.append('metadata', JSON.stringify({ duration, shareId }));
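// Also save a local copy of the recording as recording.webm.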
const link = document.createElement('a');
link.href = URL.createObjectURL(blob);
link.download = 'recording.webm';
document.body.appendChild(link);
link.click();
link.remove();
setIsTransCription(true);
try {
const result = await POST<string[]>('/v1/audio/transcriptions', formData, {
const result = await POST<string>('/v1/audio/transcriptions', formData, {
timeout: 60000,
headers: {
'Content-Type': 'multipart/form-data; charset=utf-8'
}
});
console.log(result, '===');
onFinish(result);
} catch (error) {
toast({
status: 'warning',
title: getErrText(error, t('common.speech.error tip'))
});
}
setIsTransCription(false);
setIsSpeaking(false);
};
mediaRecorder.current.onerror = (e) => {
console.log('error', e);
setIsSpeaking(false);
};
mediaRecorder.current.start();
setIsSpeaking(true);
} catch (error) {
console.log('getUserMedia error', error);
}
};
const stopSpeak = () => {
if (mediaRecorder.current) {
mediaRecorder.current.stop();
clearInterval(intervalRef.current);
}
};
useEffect(() => {
return () => {
clearInterval(intervalRef.current);
if (mediaRecorder.current && mediaRecorder.current.state !== 'inactive') {
mediaRecorder.current.stop();
}
if (mediaStream.current) {
mediaStream.current.getTracks().forEach((track) => track.stop());
}
};
}, []);
return {
startSpeak,
stopSpeak,
isSpeaking
isSpeaking,
isTransCription,
renderAudioGraph,
stream: mediaStream.current,
speakingTimeString
};
};
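A minimal sketch of how a chat input could consume this hook: the returned stream is fed into a Web Audio AnalyserNode and redrawn each animation frame with renderAudioGraph, while startSpeak/stopSpeak drive recording. The component name, import path, and AnalyserNode wiring below are illustrative assumptions, not the commit's actual input component.

// Illustrative usage only; paths and component names are hypothetical.
import { useEffect, useRef } from 'react';
import { useSpeech } from '@/web/common/hooks/useSpeech'; // hypothetical import path

function SpeechInput({ onText }: { onText: (text: string) => void }) {
  const canvasRef = useRef<HTMLCanvasElement>(null);
  const { isSpeaking, isTransCription, speakingTimeString, renderAudioGraph, stream, startSpeak, stopSpeak } =
    useSpeech();

  // Feed the live MediaStream into an AnalyserNode and redraw the waveform every frame.
  useEffect(() => {
    if (!stream || !canvasRef.current) return;
    const audioCtx = new AudioContext();
    const source = audioCtx.createMediaStreamSource(stream);
    const analyser = audioCtx.createAnalyser();
    analyser.fftSize = 4096;
    source.connect(analyser);

    let rafId = 0;
    const draw = () => {
      renderAudioGraph(analyser, canvasRef.current as HTMLCanvasElement);
      rafId = window.requestAnimationFrame(draw);
    };
    draw();

    return () => {
      window.cancelAnimationFrame(rafId);
      audioCtx.close();
    };
  }, [stream]); // the sketch only re-wires when the stream changes

  return (
    <div>
      <canvas ref={canvasRef} width={300} height={200} />
      <span>{isTransCription ? 'Transcribing...' : speakingTimeString}</span>
      {isSpeaking ? (
        <button onClick={stopSpeak}>Stop</button>
      ) : (
        <button onClick={() => startSpeak((text) => onText(text))}>Record</button>
      )}
    </div>
  );
}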

@@ -4,9 +4,11 @@ import { getErrText } from '@fastgpt/global/common/error/utils';
import { AppTTSConfigType } from '@/types/app';
import { TTSTypeEnum } from '@/constants/app';
import { useTranslation } from 'next-i18next';
import { useRouter } from 'next/router';
export const useAudioPlay = (props?: { ttsConfig?: AppTTSConfigType }) => {
const { t } = useTranslation();
const { shareId } = useRouter().query as { shareId?: string };
const { ttsConfig } = props || {};
const { toast } = useToast();
const [audio, setAudio] = useState<HTMLAudioElement>();
@@ -16,6 +18,7 @@ export const useAudioPlay = (props?: { ttsConfig?: AppTTSConfigType }) => {
// Check whether the voice is supported
const hasAudio = useMemo(() => {
if (ttsConfig?.type === TTSTypeEnum.none) return false;
if (ttsConfig?.type === TTSTypeEnum.model) return true;
const voices = window.speechSynthesis?.getVoices?.() || []; // get the browser's built-in voices
const voice = voices.find((item) => {
return item.lang === 'zh-CN';
@@ -55,7 +58,8 @@ export const useAudioPlay = (props?: { ttsConfig?: AppTTSConfigType }) => {
body: JSON.stringify({
chatItemId,
ttsConfig,
input: text
input: text,
shareId
})
});
setAudioLoading(false);
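The request body above is the whole contract for model-based TTS: the hook posts chatItemId, ttsConfig, the text to speak, and the shareId read from the router query, then plays back the returned audio. A simplified sketch of that flow, buffering the whole response rather than streaming it; the endpoint path and error handling are assumptions, not the project's actual route:

// Sketch only: endpoint path and error handling are assumptions.
async function playModelTTS(opts: {
  url: string; // the project's TTS API route (hypothetical here)
  chatItemId?: string;
  ttsConfig: unknown;
  text: string;
  shareId?: string;
  audio: HTMLAudioElement;
}) {
  const { url, chatItemId, ttsConfig, text, shareId, audio } = opts;

  const response = await fetch(url, {
    method: 'POST',
    headers: { 'Content-Type': 'application/json' },
    body: JSON.stringify({ chatItemId, ttsConfig, input: text, shareId })
  });
  if (!response.ok) throw new Error('TTS request failed');

  // Buffer the synthesized audio and hand it to an <audio> element.
  const blob = await response.blob();
  audio.src = URL.createObjectURL(blob);
  await audio.play();
}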

@@ -66,6 +66,14 @@ const Button = defineStyleConfig({
bg: '#3370ff !important'
}
},
gray: {
bg: '#F5F5F8',
color: 'myBlue.700',
border: '1px solid #EFF0F1',
_hover: {
bg: '#3370FF1A'
}
},
base: {
color: 'myGray.900',
border: '1px solid',
@@ -81,6 +89,23 @@ const Button = defineStyleConfig({
color: 'myBlue.700'
},
_disabled: { bg: 'myGray.100 !important', color: 'myGray.700 !important' }
},
boxBtn: {
px: 3,
py: '2px',
borderRadius: 'md',
_hover: {
bg: 'myGray.200'
}
},
blue: {
borderRadius: 'md',
bg: '#3370FF',
color: 'white',
fontSize: 'sm',
_hover: {
bg: '#145BFF'
}
}
},
defaultProps: {
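For completeness, the new gray, boxBtn, and blue variants above are opted into per button via Chakra's variant prop. A quick illustrative usage; the component name is hypothetical and it assumes the app is wrapped in a ChakraProvider using this theme:

// Illustrative only: exercising the new Button variants defined above.
import { Button, HStack } from '@chakra-ui/react';

export const VoiceControls = () => (
  <HStack spacing={2}>
    <Button variant="gray">Cancel</Button>
    <Button variant="boxBtn">Settings</Button>
    <Button variant="blue">Send</Button>
  </HStack>
);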