perf: mobile voice input (#4437)

* update: Mobile voice interaction (#4362)

* Add files via upload

* Add files via upload

* Update ollama.md

* Update ollama.md

* Add files via upload

* Update useSpeech.ts

* Update ChatInput.tsx

* Update useSpeech.ts

* Update ChatInput.tsx

* Update useSpeech.ts

* Update constants.ts

* Add files via upload

* Update ChatInput.tsx

* Update useSpeech.ts

* Update useSpeech.ts

* Update useSpeech.ts

* Update ChatInput.tsx

* Add files via upload

* Update common.json

* Update VoiceInput.tsx

* Update ChatInput.tsx

* Update VoiceInput.tsx

* Update useSpeech.ts

* Update useSpeech.ts

* Update common.json

* Update common.json

* Update common.json

* Update VoiceInput.tsx

* Update VoiceInput.tsx

* Update ChatInput.tsx

* Update VoiceInput.tsx

* Update ChatInput.tsx

* Update VoiceInput.tsx

* Update ChatInput.tsx

* Update useSpeech.ts

* Update common.json

* Update chat.json

* Update common.json

* Update chat.json

* Update common.json

* Update chat.json

* Update VoiceInput.tsx

* Update ChatInput.tsx

* Update useSpeech.ts

* Update VoiceInput.tsx

* speech ui

* Optimize the voice input component: adjust the input box display logic, fix the voice input mask layer style, update the canvas background transparency, and improve the interaction experience. (#4435)

* perf: mobile voice input

---------

Co-authored-by: dreamer6680 <1468683855@qq.com>
Author: Archer
Date: 2025-04-02 22:25:50 +08:00
Committed by: archer
Parent: c2e088cf39
Commit: e4c4941a50
8 changed files with 675 additions and 323 deletions

View File: ChatInput.tsx

@@ -1,7 +1,6 @@
import { useSpeech } from '@/web/common/hooks/useSpeech';
import { useSystemStore } from '@/web/common/system/useSystemStore';
import { Box, Flex, Spinner, Textarea } from '@chakra-ui/react';
import React, { useRef, useEffect, useCallback, useMemo } from 'react';
import React, { useRef, useEffect, useCallback, useMemo, useState } from 'react';
import { useTranslation } from 'next-i18next';
import MyTooltip from '@fastgpt/web/components/common/MyTooltip';
import MyIcon from '@fastgpt/web/components/common/Icon';
@@ -18,6 +17,7 @@ import FilePreview from '../../components/FilePreview';
import { useFileUpload } from '../hooks/useFileUpload';
import ComplianceTip from '@/components/common/ComplianceTip/index';
import { useToast } from '@fastgpt/web/hooks/useToast';
import VoiceInput, { type VoiceInputComponentRef } from './VoiceInput';
const InputGuideBox = dynamic(() => import('./InputGuideBox'));
@@ -44,6 +44,7 @@ const ChatInput = ({
const { t } = useTranslation();
const { toast } = useToast();
const { isPc } = useSystem();
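// Imperative ref to the new VoiceInput component; the record icon below opens the voice panel via onSpeak()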
const VoiceInputRef = useRef<VoiceInputComponentRef>(null);
const { setValue, watch, control } = chatForm;
const inputValue = watch('input');
@@ -53,7 +54,6 @@ const ChatInput = ({
const chatId = useContextSelector(ChatBoxContext, (v) => v.chatId);
const isChatting = useContextSelector(ChatBoxContext, (v) => v.isChatting);
const whisperConfig = useContextSelector(ChatBoxContext, (v) => v.whisperConfig);
const autoTTSResponse = useContextSelector(ChatBoxContext, (v) => v.autoTTSResponse);
const chatInputGuide = useContextSelector(ChatBoxContext, (v) => v.chatInputGuide);
const fileSelectConfig = useContextSelector(ChatBoxContext, (v) => v.fileSelectConfig);
@@ -106,86 +106,6 @@ const ChatInput = ({
[TextareaDom, canSendMessage, fileList, onSendMessage, replaceFiles]
);
/* whisper init */
const canvasRef = useRef<HTMLCanvasElement>(null);
const {
isSpeaking,
isTransCription,
stopSpeak,
startSpeak,
speakingTimeString,
renderAudioGraph,
stream
} = useSpeech({ appId, ...outLinkAuthData });
const onWhisperRecord = useCallback(() => {
const finishWhisperTranscription = (text: string) => {
if (!text) return;
if (whisperConfig?.autoSend) {
onSendMessage({
text,
files: fileList,
autoTTSResponse
});
replaceFiles([]);
} else {
resetInputVal({ text });
}
};
if (isSpeaking) {
return stopSpeak();
}
startSpeak(finishWhisperTranscription);
}, [
autoTTSResponse,
fileList,
isSpeaking,
onSendMessage,
replaceFiles,
resetInputVal,
startSpeak,
stopSpeak,
whisperConfig?.autoSend
]);
useEffect(() => {
if (!stream) {
return;
}
const audioContext = new AudioContext();
const analyser = audioContext.createAnalyser();
analyser.fftSize = 4096;
analyser.smoothingTimeConstant = 1;
const source = audioContext.createMediaStreamSource(stream);
source.connect(analyser);
const renderCurve = () => {
if (!canvasRef.current) return;
renderAudioGraph(analyser, canvasRef.current);
window.requestAnimationFrame(renderCurve);
};
renderCurve();
}, [renderAudioGraph, stream]);
const RenderTranslateLoading = useMemo(
() => (
<Flex
position={'absolute'}
top={0}
bottom={0}
left={0}
right={0}
zIndex={10}
pl={5}
alignItems={'center'}
bg={'white'}
color={'primary.500'}
visibility={isSpeaking && isTransCription ? 'visible' : 'hidden'}
>
<Spinner size={'sm'} mr={4} />
{t('common:core.chat.Converting to text')}
</Flex>
),
[isSpeaking, isTransCription, t]
);
const RenderTextarea = useMemo(
() => (
<Flex alignItems={'flex-end'} mt={fileList.length > 0 ? 1 : 0} pl={[2, 4]}>
@@ -198,7 +118,6 @@ const ChatInput = ({
cursor={'pointer'}
transform={'translateY(1px)'}
onClick={() => {
if (isSpeaking) return;
onOpenSelectFile();
}}
>
@@ -208,7 +127,6 @@ const ChatInput = ({
<File onSelect={(files) => onSelectFile({ files })} />
</Flex>
)}
{/* input area */}
<Textarea
ref={TextareaDom}
@@ -220,11 +138,7 @@ const ChatInput = ({
border: 'none'
}}
placeholder={
isSpeaking
? t('common:core.chat.Speaking')
: isPc
? t('common:core.chat.Type a message')
: t('chat:input_placeholder_phone')
isPc ? t('common:core.chat.Type a message') : t('chat:input_placeholder_phone')
}
resize={'none'}
rows={1}
@@ -237,9 +151,8 @@ const ChatInput = ({
wordBreak={'break-all'}
boxShadow={'none !important'}
color={'myGray.900'}
isDisabled={isSpeaking}
value={inputValue}
fontSize={['md', 'sm']}
value={inputValue}
onChange={(e) => {
const textarea = e.target;
textarea.style.height = textareaMinH;
@@ -290,118 +203,78 @@ const ChatInput = ({
}
}}
/>
<Flex alignItems={'center'} position={'absolute'} right={[2, 4]} bottom={['10px', '12px']}>
{/* voice-input */}
{whisperConfig?.open && !inputValue && !isChatting && (
<>
<canvas
ref={canvasRef}
style={{
height: '30px',
width: isSpeaking && !isTransCription ? '100px' : 0,
background: 'white',
zIndex: 0
<Flex
alignItems={'center'}
position={'absolute'}
right={[2, 4]}
bottom={['10px', '12px']}
zIndex={3}
>
{/* Voice input icon */}
{whisperConfig?.open && !inputValue && (
<MyTooltip label={t('common:core.chat.Record')}>
<Flex
alignItems={'center'}
justifyContent={'center'}
flexShrink={0}
h={['28px', '32px']}
w={['28px', '32px']}
mr={2}
borderRadius={'md'}
cursor={'pointer'}
_hover={{ bg: '#F5F5F8' }}
onClick={() => {
VoiceInputRef.current?.onSpeak?.();
}}
/>
{isSpeaking && (
<MyTooltip label={t('common:core.chat.Cancel Speak')}>
<Flex
mr={2}
alignItems={'center'}
justifyContent={'center'}
flexShrink={0}
h={['26px', '32px']}
w={['26px', '32px']}
borderRadius={'md'}
cursor={'pointer'}
_hover={{ bg: '#F5F5F8' }}
onClick={() => stopSpeak(true)}
>
<MyIcon
name={'core/chat/cancelSpeak'}
width={['20px', '22px']}
height={['20px', '22px']}
/>
</Flex>
</MyTooltip>
)}
<MyTooltip
label={
isSpeaking ? t('common:core.chat.Finish Speak') : t('common:core.chat.Record')
}
>
<Flex
mr={2}
alignItems={'center'}
justifyContent={'center'}
flexShrink={0}
h={['26px', '32px']}
w={['26px', '32px']}
borderRadius={'md'}
cursor={'pointer'}
_hover={{ bg: '#F5F5F8' }}
onClick={onWhisperRecord}
>
<MyIcon
name={isSpeaking ? 'core/chat/finishSpeak' : 'core/chat/recordFill'}
width={['20px', '22px']}
height={['20px', '22px']}
color={isSpeaking ? 'primary.500' : 'myGray.600'}
/>
</Flex>
</MyTooltip>
</>
)}
{/* send and stop icon */}
{isSpeaking ? (
<Box color={'#5A646E'} w={'36px'} textAlign={'right'} whiteSpace={'nowrap'}>
{speakingTimeString}
</Box>
) : (
<Flex
alignItems={'center'}
justifyContent={'center'}
flexShrink={0}
h={['28px', '32px']}
w={['28px', '32px']}
borderRadius={'md'}
bg={
isSpeaking || isChatting
? ''
: !havInput || hasFileUploading
? '#E5E5E5'
: 'primary.500'
}
cursor={havInput ? 'pointer' : 'not-allowed'}
lineHeight={1}
onClick={() => {
if (isChatting) {
return onStop();
}
return handleSend();
}}
>
{isChatting ? (
<MyIcon
animation={'zoomStopIcon 0.4s infinite alternate'}
name={'core/chat/recordFill'}
width={['22px', '25px']}
height={['22px', '25px']}
cursor={'pointer'}
name={'stop'}
color={'gray.500'}
color={'myGray.600'}
/>
) : (
<MyTooltip label={t('common:core.chat.Send Message')}>
<MyIcon
name={'core/chat/sendFill'}
width={['18px', '20px']}
height={['18px', '20px']}
color={'white'}
/>
</MyTooltip>
)}
</Flex>
</Flex>
</MyTooltip>
)}
{/* send and stop icon */}
<Flex
alignItems={'center'}
justifyContent={'center'}
flexShrink={0}
h={['28px', '32px']}
w={['28px', '32px']}
borderRadius={'md'}
bg={isChatting ? '' : !havInput || hasFileUploading ? '#E5E5E5' : 'primary.500'}
cursor={havInput ? 'pointer' : 'not-allowed'}
lineHeight={1}
onClick={() => {
if (isChatting) {
return onStop();
}
return handleSend();
}}
>
{isChatting ? (
<MyIcon
animation={'zoomStopIcon 0.4s infinite alternate'}
width={['22px', '25px']}
height={['22px', '25px']}
cursor={'pointer'}
name={'stop'}
color={'gray.500'}
/>
) : (
<MyTooltip label={t('common:core.chat.Send Message')}>
<MyIcon
name={'core/chat/sendFill'}
width={['18px', '20px']}
height={['18px', '20px']}
color={'white'}
/>
</MyTooltip>
)}
</Flex>
</Flex>
</Flex>
),
@@ -415,21 +288,15 @@ const ChatInput = ({
inputValue,
isChatting,
isPc,
isSpeaking,
isTransCription,
onOpenSelectFile,
onSelectFile,
onStop,
onWhisperRecord,
selectFileIcon,
selectFileLabel,
setValue,
showSelectFile,
showSelectImg,
speakingTimeString,
stopSpeak,
t,
whisperConfig?.open
t
]
);
@@ -468,7 +335,7 @@ const ChatInput = ({
pt={fileList.length > 0 ? '0' : ['14px', '18px']}
pb={['14px', '18px']}
position={'relative'}
boxShadow={isSpeaking ? `0 0 10px rgba(54,111,255,0.4)` : `0 0 10px rgba(0,0,0,0.2)`}
boxShadow={`0 0 10px rgba(0,0,0,0.2)`}
borderRadius={['none', 'md']}
bg={'white'}
overflow={'display'}
@@ -495,15 +362,20 @@ const ChatInput = ({
}}
/>
)}
{/* translate loading */}
{RenderTranslateLoading}
{/* file preview */}
<Box px={[1, 3]}>
<FilePreview fileList={fileList} removeFiles={removeFiles} />
</Box>
{/* voice input and loading container */}
{!inputValue && (
<VoiceInput
ref={VoiceInputRef}
onSendMessage={onSendMessage}
resetInputVal={resetInputVal}
/>
)}
{RenderTextarea}
</Box>
<ComplianceTip type={'chat'} />

View File: VoiceInput.tsx

@@ -0,0 +1,367 @@
import { useSpeech } from '@/web/common/hooks/useSpeech';
import { Box, Flex, HStack, Spinner } from '@chakra-ui/react';
import React, {
useRef,
useEffect,
useCallback,
useState,
forwardRef,
useImperativeHandle,
useMemo
} from 'react';
import { useTranslation } from 'next-i18next';
import MyTooltip from '@fastgpt/web/components/common/MyTooltip';
import MyIcon from '@fastgpt/web/components/common/Icon';
import { useSystem } from '@fastgpt/web/hooks/useSystem';
import { useContextSelector } from 'use-context-selector';
import { ChatBoxContext } from '../Provider';
import MyIconButton from '@/pageComponents/account/team/OrgManage/IconButton';
export interface VoiceInputComponentRef {
onSpeak: () => void;
}
type VoiceInputProps = {
onSendMessage: (params: { text: string; files?: any[]; autoTTSResponse?: boolean }) => void;
resetInputVal: (val: { text: string }) => void;
};
// PC voice input
const PCVoiceInput = ({
speakingTimeString,
stopSpeak,
canvasRef
}: {
speakingTimeString: string;
stopSpeak: (param: boolean) => void;
canvasRef: React.RefObject<HTMLCanvasElement>;
}) => {
const { t } = useTranslation();
return (
<HStack h={'100%'} px={4}>
<Box fontSize="sm" color="myGray.500" flex={'1 0 0'}>
{t('common:core.chat.Speaking')}
</Box>
<canvas
ref={canvasRef}
style={{
height: '10px',
width: '100px',
background: 'white'
}}
/>
<Box fontSize="sm" color="myGray.500" whiteSpace={'nowrap'}>
{speakingTimeString}
</Box>
<MyTooltip label={t('common:core.chat.Cancel Speak')}>
<MyIconButton
name={'core/chat/cancelSpeak'}
h={'22px'}
w={'22px'}
onClick={() => stopSpeak(true)}
/>
</MyTooltip>
<MyTooltip label={t('common:core.chat.Finish Speak')}>
<MyIconButton
name={'core/chat/finishSpeak'}
h={'22px'}
w={'22px'}
onClick={() => stopSpeak(false)}
/>
</MyTooltip>
</HStack>
);
};
// mobile voice input
const MobileVoiceInput = ({
isSpeaking,
onStartSpeak,
onCloseSpeak,
stopSpeak,
canvasRef
}: {
isSpeaking: boolean;
onStartSpeak: () => void;
onCloseSpeak: () => any;
stopSpeak: (param: boolean) => void;
canvasRef: React.RefObject<HTMLCanvasElement>;
}) => {
const { t } = useTranslation();
const isPressing = useRef(false);
const startTimeRef = useRef(0); // press start time, used to debounce accidental short presses
const startYRef = useRef(0);
const [isCancel, setIsCancel] = useState(false);
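// Hold-to-record gesture: slide up more than 90px to cancel; releases shorter than 200ms count as accidental taps and are discarded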
const handleTouchStart = useCallback(
(e: React.TouchEvent<HTMLDivElement>) => {
isPressing.current = true;
setIsCancel(false);
startTimeRef.current = Date.now();
const touch = e.touches[0];
startYRef.current = touch.pageY;
onStartSpeak();
},
[onStartSpeak]
);
const handleTouchMove = useCallback(
(e: React.TouchEvent<HTMLDivElement>) => {
const touch = e.touches[0] as Touch;
const currentY = touch.pageY;
const deltaY = startYRef.current - currentY;
if (deltaY > 90) {
setIsCancel(true);
} else if (deltaY <= 90) {
setIsCancel(false);
}
},
[startYRef]
);
const handleTouchEnd = useCallback(
(e: React.TouchEvent<HTMLDivElement>) => {
if (!isPressing.current) return;
const endTime = Date.now();
const timeDifference = endTime - startTimeRef.current;
if (isCancel || timeDifference < 200) {
stopSpeak(true);
} else {
stopSpeak(false);
}
},
[isCancel, stopSpeak]
);
return (
<Flex position="relative" h="100%">
{/* Back Icon */}
{!isSpeaking && (
<MyTooltip label={t('chat:back_to_text')}>
<MyIconButton
position="absolute"
right={2}
top={'50%'}
transform={'translateY(-50%)'}
zIndex={5}
name={'core/chat/backText'}
h={'22px'}
w={'22px'}
onClick={onCloseSpeak}
/>
</MyTooltip>
)}
<Flex
alignItems={'center'}
justifyContent={'center'}
h="100%"
flex="1 0 0"
bg={isSpeaking ? (isCancel ? 'red.500' : 'primary.500') : 'white'}
onTouchMove={handleTouchMove}
onTouchEnd={handleTouchEnd}
onTouchStart={handleTouchStart}
onTouchCancel={() => {
stopSpeak(true);
}}
zIndex={4}
>
<Box visibility={isSpeaking ? 'hidden' : 'visible'}>{t('chat:press_to_speak')}</Box>
<Box
position="absolute"
h={'100%'}
w={'100%'}
as="canvas"
ref={canvasRef}
flex="0 0 80%"
visibility={isSpeaking ? 'visible' : 'hidden'}
/>
</Flex>
{/* Mask */}
{isSpeaking && (
<Flex
justifyContent="center"
alignItems="center"
height="100%"
position="fixed"
left={0}
right={0}
bottom={'50px'}
h={'200px'}
bg="linear-gradient(to top, white, rgba(255, 255, 255, 0.7), rgba(255, 255, 255, 0))"
>
<Box fontSize="sm" color="myGray.500" position="absolute" bottom={'10px'}>
{isCancel ? t('chat:release_cancel') : t('chat:release_send')}
</Box>
</Flex>
)}
</Flex>
);
};
const VoiceInput = forwardRef<VoiceInputComponentRef, VoiceInputProps>(
({ onSendMessage, resetInputVal }, ref) => {
const { t } = useTranslation();
const { isPc } = useSystem();
const outLinkAuthData = useContextSelector(ChatBoxContext, (v) => v.outLinkAuthData);
const appId = useContextSelector(ChatBoxContext, (v) => v.appId);
const whisperConfig = useContextSelector(ChatBoxContext, (v) => v.whisperConfig);
const autoTTSResponse = useContextSelector(ChatBoxContext, (v) => v.autoTTSResponse);
const canvasRef = useRef<HTMLCanvasElement>(null);
const {
isSpeaking,
isTransCription,
stopSpeak,
startSpeak,
speakingTimeString,
renderAudioGraphPc,
renderAudioGraphMobile,
stream
} = useSpeech({ appId, ...outLinkAuthData });
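// On mobile the voice panel is shown first (hold to speak); recording only starts on touch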
const [mobilePreSpeak, setMobilePreSpeak] = useState(false);
// Canvas render
useEffect(() => {
if (!stream) {
return;
}
const audioContext = new AudioContext();
const analyser = audioContext.createAnalyser();
analyser.fftSize = 4096;
analyser.smoothingTimeConstant = 1;
const source = audioContext.createMediaStreamSource(stream);
source.connect(analyser);
let animationFrameId: number | null = null;
const renderCurve = () => {
const canvas = canvasRef.current;
if (!canvas) return;
const ctx = canvas.getContext('2d');
if (!ctx) return;
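// Once the media stream closes, clear the canvas and stop the animation loop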
if (!stream.active) {
ctx.clearRect(0, 0, canvas.width, canvas.height);
if (animationFrameId) {
window.cancelAnimationFrame(animationFrameId);
animationFrameId = null;
}
return;
}
if (isPc) {
renderAudioGraphPc(analyser, canvas);
} else {
renderAudioGraphMobile(analyser, canvas);
}
animationFrameId = window.requestAnimationFrame(renderCurve);
};
renderCurve();
return () => {
if (animationFrameId) {
window.cancelAnimationFrame(animationFrameId);
}
audioContext.close();
source.disconnect();
analyser.disconnect();
};
}, [stream, canvasRef, renderAudioGraphPc, renderAudioGraphMobile, isPc]);
const onStartSpeak = useCallback(() => {
const finishWhisperTranscription = (text: string) => {
if (!text) return;
if (whisperConfig?.autoSend) {
onSendMessage({
text,
autoTTSResponse
});
} else {
resetInputVal({ text });
}
};
startSpeak(finishWhisperTranscription);
}, []);
const onSpeak = useCallback(() => {
if (isPc) {
onStartSpeak();
} else {
setMobilePreSpeak(true);
}
}, []);
useImperativeHandle(ref, () => ({
onSpeak
}));
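// Render nothing unless whisper is enabled and a voice session is pending, active, or transcribing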
if (!whisperConfig?.open) return null;
if (!mobilePreSpeak && !isSpeaking && !isTransCription) return null;
return (
<Box
position="absolute"
overflow={'hidden'}
userSelect={'none'}
top={0}
left={0}
right={0}
bottom={0}
bg="white"
zIndex={5}
borderRadius={isPc ? 'md' : ''}
onContextMenu={(e) => e.preventDefault()}
>
{isPc ? (
<PCVoiceInput
speakingTimeString={speakingTimeString}
stopSpeak={stopSpeak}
canvasRef={canvasRef}
/>
) : (
<MobileVoiceInput
isSpeaking={isSpeaking}
onStartSpeak={onStartSpeak}
onCloseSpeak={() => setMobilePreSpeak(false)}
stopSpeak={stopSpeak}
canvasRef={canvasRef}
/>
)}
{isTransCription && (
<Flex
position={'absolute'}
top={0}
bottom={0}
left={0}
right={0}
pl={5}
alignItems={'center'}
bg={'white'}
color={'primary.500'}
zIndex={6}
>
<Spinner size={'sm'} mr={4} />
{t('common:core.chat.Converting to text')}
</Flex>
)}
</Box>
);
}
);
VoiceInput.displayName = 'VoiceInput';
export default VoiceInput;

View File: useSpeech.ts

@@ -7,16 +7,21 @@ import { OutLinkChatAuthProps } from '@fastgpt/global/support/permission/chat';
export const useSpeech = (props?: OutLinkChatAuthProps & { appId?: string }) => {
const { t } = useTranslation();
const mediaRecorder = useRef<MediaRecorder>();
const [mediaStream, setMediaStream] = useState<MediaStream>();
const { toast } = useToast();
const [isSpeaking, setIsSpeaking] = useState(false);
const [isTransCription, setIsTransCription] = useState(false);
const [audioSecond, setAudioSecond] = useState(0);
const intervalRef = useRef<any>();
const startTimestamp = useRef(0);
const cancelWhisperSignal = useRef(false);
const mediaRecorder = useRef<MediaRecorder>();
const [mediaStream, setMediaStream] = useState<MediaStream>();
const timeIntervalRef = useRef<any>();
const cancelWhisperSignal = useRef(false);
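// Set when stopSpeak is called before the recorder has started (getUserMedia resolves asynchronously)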
const stopCalledRef = useRef(false);
const startTimestamp = useRef(0);
const [audioSecond, setAudioSecond] = useState(0);
const speakingTimeString = useMemo(() => {
const minutes: number = Math.floor(audioSecond / 60);
const remainingSeconds: number = Math.floor(audioSecond % 60);
@@ -25,17 +30,16 @@ export const useSpeech = (props?: OutLinkChatAuthProps & { appId?: string }) =>
return `${formattedMinutes}:${formattedSeconds}`;
}, [audioSecond]);
const renderAudioGraph = useCallback((analyser: AnalyserNode, canvas: HTMLCanvasElement) => {
const renderAudioGraphPc = useCallback((analyser: AnalyserNode, canvas: HTMLCanvasElement) => {
const bufferLength = analyser.frequencyBinCount;
const backgroundColor = 'white';
const dataArray = new Uint8Array(bufferLength);
analyser.getByteTimeDomainData(dataArray);
const canvasCtx = canvas?.getContext('2d');
const width = 300;
const height = 200;
const width = canvas.width;
const height = canvas.height;
if (!canvasCtx) return;
canvasCtx.clearRect(0, 0, width, height);
canvasCtx.fillStyle = backgroundColor;
canvasCtx.fillStyle = 'white';
canvasCtx.fillRect(0, 0, width, height);
const barWidth = (width / bufferLength) * 2.5;
let x = 0;
@@ -49,127 +53,212 @@ export const useSpeech = (props?: OutLinkChatAuthProps & { appId?: string }) =>
x += barWidth + 1;
}
}, []);
const renderAudioGraphMobile = useCallback(
(analyser: AnalyserNode, canvas: HTMLCanvasElement) => {
const canvasCtx = canvas?.getContext('2d');
if (!canvasCtx) return;
const startSpeak = async (onFinish: (text: string) => void) => {
if (!navigator?.mediaDevices?.getUserMedia) {
return toast({
status: 'warning',
title: t('common:common.speech.not support')
});
}
try {
const bufferLength = analyser.frequencyBinCount;
const dataArray = new Uint8Array(bufferLength);
analyser.getByteTimeDomainData(dataArray);
const width = canvas.width;
const height = canvas.height;
canvasCtx.clearRect(0, 0, width, height);
// Set transparent background
canvasCtx.fillStyle = 'rgba(255, 255, 255, 0)';
canvasCtx.fillRect(0, 0, width, height);
const centerY = height / 2;
const barWidth = (width / bufferLength) * 15;
const gap = 2; // gap between waveform bars
let x = width * 0.1;
let sum = 0;
let maxDiff = 0;
for (let i = 0; i < bufferLength; i++) {
sum += dataArray[i];
maxDiff = Math.max(maxDiff, Math.abs(dataArray[i] - 128));
}
const average = sum / bufferLength;
// draw initial rectangle waveform
canvasCtx.beginPath();
canvasCtx.fillStyle = '#FFFFFF';
const initialHeight = height * 0.1;
for (let i = 0; i < width * 0.8; i += barWidth + gap) {
canvasCtx.fillRect(i + width * 0.1, centerY - initialHeight, barWidth, initialHeight);
canvasCtx.fillRect(i + width * 0.1, centerY, barWidth, initialHeight);
}
// draw dynamic waveform
canvasCtx.beginPath();
for (let i = 0; i < bufferLength; i += 4) {
const value = dataArray[i];
const normalizedValue = (value - average) / 128;
const amplification = 2.5;
const barHeight = normalizedValue * height * 0.4 * amplification;
canvasCtx.fillStyle = '#FFFFFF';
canvasCtx.fillRect(x, centerY - Math.abs(barHeight), barWidth, Math.abs(barHeight));
canvasCtx.fillRect(x, centerY, barWidth, Math.abs(barHeight));
x += barWidth + gap; // advance by one bar plus the gap
if (x > width * 0.9) break;
}
},
[]
);
const startSpeak = useCallback(
async (onFinish: (text: string) => void) => {
if (!navigator?.mediaDevices?.getUserMedia) {
return toast({
status: 'warning',
title: t('common:common.speech.not support')
});
}
// Init status
if (timeIntervalRef.current) {
clearInterval(timeIntervalRef.current);
}
cancelWhisperSignal.current = false;
stopCalledRef.current = false;
const stream = await navigator.mediaDevices.getUserMedia({ audio: true });
setMediaStream(stream);
mediaRecorder.current = new MediaRecorder(stream);
const chunks: Blob[] = [];
setIsSpeaking(true);
setAudioSecond(0);
mediaRecorder.current.onstart = () => {
startTimestamp.current = Date.now();
setAudioSecond(0);
intervalRef.current = setInterval(() => {
const currentTimestamp = Date.now();
const duration = (currentTimestamp - startTimestamp.current) / 1000;
setAudioSecond(duration);
}, 1000);
};
try {
const stream = await navigator.mediaDevices.getUserMedia({ audio: true });
setMediaStream(stream);
mediaRecorder.current.ondataavailable = (e) => {
chunks.push(e.data);
};
mediaRecorder.current = new MediaRecorder(stream);
const chunks: Blob[] = [];
mediaRecorder.current.onstop = async () => {
if (!cancelWhisperSignal.current) {
const formData = new FormData();
const { options, filename } = (() => {
if (MediaRecorder.isTypeSupported('video/webm; codecs=vp9')) {
return {
options: { mimeType: 'video/webm; codecs=vp9' },
filename: 'recording.mp3'
};
}
if (MediaRecorder.isTypeSupported('video/webm')) {
mediaRecorder.current.onstart = () => {
startTimestamp.current = Date.now();
timeIntervalRef.current = setInterval(() => {
const currentTimestamp = Date.now();
const duration = (currentTimestamp - startTimestamp.current) / 1000;
setAudioSecond(duration);
}, 1000);
};
mediaRecorder.current.ondataavailable = (e) => {
chunks.push(e.data);
};
mediaRecorder.current.onstop = async () => {
// close media stream
stream.getTracks().forEach((track) => track.stop());
setIsSpeaking(false);
if (timeIntervalRef.current) {
clearInterval(timeIntervalRef.current);
}
if (!cancelWhisperSignal.current) {
const formData = new FormData();
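// Pick a mimeType the browser's MediaRecorder supports, falling back from webm/vp9 to webm to mp4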
const { options, filename } = (() => {
if (MediaRecorder.isTypeSupported('video/webm; codecs=vp9')) {
return {
options: { mimeType: 'video/webm; codecs=vp9' },
filename: 'recording.mp3'
};
}
if (MediaRecorder.isTypeSupported('video/webm')) {
return {
options: { type: 'video/webm' },
filename: 'recording.mp3'
};
}
if (MediaRecorder.isTypeSupported('video/mp4')) {
return {
options: { mimeType: 'video/mp4', videoBitsPerSecond: 100000 },
filename: 'recording.mp4'
};
}
return {
options: { type: 'video/webm' },
filename: 'recording.mp3'
};
}
if (MediaRecorder.isTypeSupported('video/mp4')) {
return {
options: { mimeType: 'video/mp4', videoBitsPerSecond: 100000 },
filename: 'recording.mp4'
};
}
return {
options: { type: 'video/webm' },
filename: 'recording.mp3'
};
})();
})();
const blob = new Blob(chunks, options);
const duration = Math.round((Date.now() - startTimestamp.current) / 1000);
formData.append('file', blob, filename);
formData.append(
'data',
JSON.stringify({
...props,
duration
})
);
const blob = new Blob(chunks, options);
const duration = Math.round((Date.now() - startTimestamp.current) / 1000);
formData.append('file', blob, filename);
formData.append(
'data',
JSON.stringify({
...props,
duration
})
);
setIsTransCription(true);
try {
const result = await POST<string>('/v1/audio/transcriptions', formData, {
timeout: 60000,
headers: {
'Content-Type': 'multipart/form-data; charset=utf-8'
}
});
onFinish(result);
} catch (error) {
toast({
status: 'warning',
title: getErrText(error, t('common:common.speech.error tip'))
});
setIsTransCription(true);
try {
const result = await POST<string>('/v1/audio/transcriptions', formData, {
timeout: 60000,
headers: {
'Content-Type': 'multipart/form-data; charset=utf-8'
}
});
onFinish(result);
} catch (error) {
toast({
status: 'warning',
title: getErrText(error, t('common:common.speech.error tip'))
});
}
setIsTransCription(false);
}
};
mediaRecorder.current.onerror = (e) => {
if (timeIntervalRef.current) {
clearInterval(timeIntervalRef.current);
}
console.log('error', e);
setIsSpeaking(false);
};
// If stopSpeak was already called while the recorder was being set up, stop immediately instead of starting
if (stopCalledRef.current) {
mediaRecorder.current.stop();
} else {
mediaRecorder.current.start();
}
} catch (error) {
toast({
status: 'warning',
title: getErrText(error, 'Whisper error')
});
console.log(error);
}
},
[toast, t, props]
);
// close media stream
stream.getTracks().forEach((track) => track.stop());
setIsTransCription(false);
setIsSpeaking(false);
};
mediaRecorder.current.onerror = (e) => {
console.log('error', e);
setIsSpeaking(false);
};
mediaRecorder.current.start();
} catch (error) {
toast({
status: 'warning',
title: getErrText(error, 'Whisper error')
});
console.log(error);
}
};
const stopSpeak = (cancel = false) => {
const stopSpeak = useCallback((cancel = false) => {
cancelWhisperSignal.current = cancel;
if (mediaRecorder.current) {
mediaRecorder.current?.stop();
clearInterval(intervalRef.current);
}
};
stopCalledRef.current = true;
if (timeIntervalRef.current) {
clearInterval(timeIntervalRef.current);
}
if (mediaRecorder.current && mediaRecorder.current.state !== 'inactive') {
mediaRecorder.current.stop();
}
}, []);
// Stop recording and clear the timer when the component unmounts
useEffect(() => {
return () => {
clearInterval(intervalRef.current);
clearInterval(timeIntervalRef.current);
if (mediaRecorder.current && mediaRecorder.current.state !== 'inactive') {
mediaRecorder.current.stop();
}
@@ -184,14 +273,15 @@ export const useSpeech = (props?: OutLinkChatAuthProps & { appId?: string }) =>
if (audioSecond >= 60) {
stopSpeak();
}
}, [audioSecond]);
}, [audioSecond, stopSpeak]);
return {
startSpeak,
stopSpeak,
isSpeaking,
isTransCription,
renderAudioGraph,
renderAudioGraphPc,
renderAudioGraphMobile,
stream: mediaStream,
speakingTimeString
};