mirror of
https://github.com/siteboon/claudecodeui.git
synced 2026-06-26 05:15:48 +08:00
feat(voice): add optional speech-to-text input and read-aloud TTS
Adds a push-to-talk mic button in the composer and a read-aloud button on assistant messages. Both are opt-in and hidden unless a voice backend is configured via VOICE_SIDECAR_URL. The auth-gated /api/voice proxy forwards to a configurable backend exposing /transcribe and /tts (provider-agnostic); the frontend probes /api/voice/health and hides the controls when disabled. Adds i18n keys and docs/voice.md. Includes a local, no-API-key reference backend in voice-sidecar/ (faster-whisper for STT, Kokoro-82M for TTS, both CPU-capable).
This commit is contained in:
88
src/components/chat/hooks/useTts.ts
Normal file
88
src/components/chat/hooks/useTts.ts
Normal file
@@ -0,0 +1,88 @@
|
||||
import { useCallback, useEffect, useRef, useState } from 'react';
|
||||
import { authenticatedFetch } from '../../../utils/api';
|
||||
|
||||
// Only one message speaks at a time across the whole app.
|
||||
let stopActive: (() => void) | null = null;
|
||||
|
||||
export type TtsState = 'idle' | 'loading' | 'playing';
|
||||
|
||||
/**
 * Tap-to-speak for a single message. POSTs the raw text returned by `getText`
 * to /api/voice/tts and plays the audio blob that comes back. (Per the
 * original note, markdown cleaning is expected to happen server-side.)
 *
 * Playback is only ever started from a user gesture: the audio element is
 * created and "unlocked" synchronously inside the click handler so that
 * iOS Safari still allows `play()` after the async fetch resolves.
 *
 * Only one message speaks at a time across the app; the module-level
 * `stopActive` slot holds the `stop` of whichever instance currently owns playback.
 *
 * @param getText Returns the text to speak; called on every play request.
 * @returns `state` ('idle' | 'loading' | 'playing') and a `toggle` that starts
 *          playback when idle and stops it otherwise.
 */
export function useTts(getText: () => string) {
  const [state, setState] = useState<TtsState>('idle');
  const audioRef = useRef<HTMLAudioElement | null>(null);
  const urlRef = useRef<string | null>(null);

  // Tear down the current audio element and release its blob URL.
  const reset = useCallback(() => {
    const audio = audioRef.current;
    if (audio) {
      // Detach handlers first so clearing `src` cannot re-enter stop().
      audio.onended = null;
      audio.onerror = null;
      audio.pause();
      audio.src = '';
      audioRef.current = null;
    }
    if (urlRef.current) {
      URL.revokeObjectURL(urlRef.current);
      urlRef.current = null;
    }
  }, []);

  // Explicit annotation because the callback compares against itself below.
  const stop: () => void = useCallback(() => {
    reset();
    setState('idle');
    // Only release the app-wide slot if this instance still owns it.
    if (stopActive === stop) stopActive = null;
  }, [reset]);

  // Cleanup on unmount: free audio resources and drop our stale global handle.
  useEffect(
    () => () => {
      reset();
      if (stopActive === stop) stopActive = null;
    },
    [reset, stop],
  );

  const play = useCallback(async () => {
    if (stopActive) stopActive();
    const text = getText();
    if (!text || !text.trim()) return;

    // Create + "unlock" the audio element synchronously inside the click gesture,
    // so iOS Safari lets us play it after the async fetch resolves.
    const audio = new Audio();
    audioRef.current = audio;
    audio.onended = () => stop();
    audio.onerror = () => stop();
    try {
      audio.play().catch(() => {});
      audio.pause();
    } catch {
      /* unlock attempt; ignore */
    }
    stopActive = stop;
    setState('loading');

    // True while this request still owns the hook's audio element, i.e. it was
    // not stopped or superseded by a newer play() during an await.
    const isCurrent = () => audioRef.current === audio;

    try {
      const res = await authenticatedFetch('/api/voice/tts', {
        method: 'POST',
        body: JSON.stringify({ text }),
      });
      if (!res.ok) throw new Error(`tts ${res.status}`);
      const blob = await res.blob();
      // Check BEFORE allocating the object URL: a superseded request must not
      // create (and then leak) a blob URL or overwrite the newer request's one.
      if (!isCurrent()) return;
      const url = URL.createObjectURL(blob);
      urlRef.current = url;
      audio.src = url;
      audio.load();
      await audio.play();
      if (isCurrent()) setState('playing');
    } catch {
      // A failure (or the AbortError raised by pausing) of an outdated request
      // must not tear down whatever playback replaced it.
      if (isCurrent()) stop();
    }
  }, [getText, stop]);

  const toggle = useCallback(() => {
    if (state === 'playing' || state === 'loading') stop();
    else play();
  }, [state, play, stop]);

  return { state, toggle };
}
|
||||
38
src/components/chat/hooks/useVoiceAvailable.ts
Normal file
38
src/components/chat/hooks/useVoiceAvailable.ts
Normal file
@@ -0,0 +1,38 @@
|
||||
import { useEffect, useState } from 'react';
|
||||
import { authenticatedFetch } from '../../../utils/api';
|
||||
|
||||
// Whether the optional voice feature is configured on the server (VOICE_SIDECAR_URL set).
|
||||
// Probed once and cached app-wide so the mic/speak controls can hide themselves when off.
|
||||
let cached: boolean | null = null;
|
||||
let inflight: Promise<boolean> | null = null;
|
||||
|
||||
/**
 * Asks the server whether the voice feature is enabled.
 *
 * The result is memoised in the module-level `cached`; while a request is in
 * flight every caller shares the same `inflight` promise. A non-OK response
 * or any error during the request is recorded as "disabled".
 */
function probe(): Promise<boolean> {
  if (cached !== null) {
    return Promise.resolve(cached);
  }
  if (inflight) {
    return inflight;
  }

  // Kick off the request synchronously, then settle the cache asynchronously.
  const pendingResponse = authenticatedFetch('/api/voice/health');
  const settle = async (): Promise<boolean> => {
    try {
      const response = await pendingResponse;
      const payload = response.ok ? await response.json() : { enabled: false };
      cached = Boolean(payload?.enabled);
      return cached;
    } catch {
      cached = false;
      return false;
    }
  };

  inflight = settle();
  return inflight;
}
|
||||
|
||||
/**
 * React hook reporting whether the optional voice feature is available.
 * Starts from the cached probe result (false while unknown) and updates once
 * the shared probe settles, unless the component has unmounted by then.
 */
export function useVoiceAvailable(): boolean {
  const initial = cached === null ? false : cached;
  const [available, setAvailable] = useState<boolean>(initial);

  useEffect(() => {
    let cancelled = false;
    const pending = probe();
    const apply = (enabled: boolean) => {
      if (cancelled) return;
      setAvailable(enabled);
    };
    pending.then(apply);

    return () => {
      cancelled = true;
    };
  }, []);

  return available;
}
|
||||
106
src/components/chat/hooks/useVoiceInput.ts
Normal file
106
src/components/chat/hooks/useVoiceInput.ts
Normal file
@@ -0,0 +1,106 @@
|
||||
import { useCallback, useEffect, useRef, useState } from 'react';
import { authenticatedFetch } from '../../../utils/api';
|
||||
|
||||
// Mobile-safe recording: iOS Safari 18.4+ supports webm/opus; older iOS needs mp4.
|
||||
const MIME_CANDIDATES = [
|
||||
'audio/webm;codecs=opus',
|
||||
'audio/webm',
|
||||
'audio/mp4',
|
||||
'audio/ogg;codecs=opus',
|
||||
'audio/ogg',
|
||||
];
|
||||
|
||||
/**
 * Returns the first entry of MIME_CANDIDATES that MediaRecorder reports as
 * supported, or '' when MediaRecorder is missing or supports none of them
 * (the caller then lets the browser choose its default container).
 */
function pickMime(): string {
  const isSupported = (candidate: string): boolean => {
    try {
      return typeof MediaRecorder !== 'undefined' && MediaRecorder.isTypeSupported(candidate);
    } catch {
      /* isTypeSupported can throw on some iOS versions */
      return false;
    }
  };

  return MIME_CANDIDATES.find(isSupported) ?? '';
}
|
||||
|
||||
export type VoiceInputState = 'idle' | 'recording' | 'transcribing';
|
||||
|
||||
/**
 * Push-to-talk dictation. Records the microphone, uploads the recording to
 * /api/voice/transcribe and passes the recognised text to `onTranscript`.
 *
 * Robustness notes:
 * - Callbacks are read through refs, so a transcript that arrives after the
 *   caller re-rendered is delivered to the caller's latest closure rather than
 *   the one captured when recording started.
 * - Microphone tracks are released on every exit path (normal stop, failure
 *   while setting up the recorder, unmount).
 * - A second `toggle` during the permission prompt is ignored instead of
 *   opening a second stream.
 *
 * @param onTranscript Called with the trimmed, non-empty transcript.
 * @param onError Optional; called with a human-readable message on failure.
 * @returns `state` ('idle' | 'recording' | 'transcribing') and a `toggle` that
 *          starts recording when idle and stops it while recording.
 */
export function useVoiceInput(onTranscript: (text: string) => void, onError?: (msg: string) => void) {
  const [state, setState] = useState<VoiceInputState>('idle');
  const recorderRef = useRef<MediaRecorder | null>(null);
  const chunksRef = useRef<Blob[]>([]);
  const streamRef = useRef<MediaStream | null>(null);
  // True while start() is awaiting getUserMedia; hook state is still 'idle' then.
  const startingRef = useRef(false);
  const mountedRef = useRef(true);
  const onTranscriptRef = useRef(onTranscript);
  const onErrorRef = useRef(onError);

  // Keep the latest callbacks reachable from the long-lived recorder handlers.
  useEffect(() => {
    onTranscriptRef.current = onTranscript;
    onErrorRef.current = onError;
  }, [onTranscript, onError]);

  // Stop every track of the active stream so the browser releases the mic.
  const stopTracks = useCallback(() => {
    streamRef.current?.getTracks().forEach((t) => t.stop());
    streamRef.current = null;
  }, []);

  // On unmount: abandon any in-progress recording and release the microphone.
  useEffect(() => {
    mountedRef.current = true;
    return () => {
      mountedRef.current = false;
      const rec = recorderRef.current;
      recorderRef.current = null;
      if (rec) {
        // Detach handlers so the discarded recording is not uploaded.
        rec.ondataavailable = null;
        rec.onstop = null;
        try {
          if (rec.state !== 'inactive') rec.stop();
        } catch {
          /* recorder already stopping; nothing to do */
        }
      }
      stopTracks();
    };
  }, [stopTracks]);

  const start = useCallback(async () => {
    if (startingRef.current) return;
    startingRef.current = true;
    try {
      const stream = await navigator.mediaDevices.getUserMedia({
        audio: { echoCancellation: true, noiseSuppression: true },
      });
      if (!mountedRef.current) {
        // Unmounted while the permission prompt was open: nobody could stop
        // this recording, so release the mic immediately.
        stream.getTracks().forEach((t) => t.stop());
        return;
      }
      streamRef.current = stream;
      const mimeType = pickMime();
      const rec = mimeType ? new MediaRecorder(stream, { mimeType }) : new MediaRecorder(stream);
      recorderRef.current = rec;
      chunksRef.current = [];

      rec.ondataavailable = (e) => {
        if (e.data.size > 0) chunksRef.current.push(e.data);
      };

      rec.onstop = async () => {
        stopTracks();
        const type = rec.mimeType || 'audio/webm';
        const blob = new Blob(chunksRef.current, { type });
        // Recordings under 800 bytes are reported as too short and not uploaded.
        if (blob.size < 800) {
          setState('idle');
          onErrorRef.current?.('Recording too short');
          return;
        }
        setState('transcribing');
        try {
          const ext = type.includes('mp4') ? 'm4a' : type.includes('ogg') ? 'ogg' : 'webm';
          const fd = new FormData();
          fd.append('audio', blob, `recording.${ext}`);
          const res = await authenticatedFetch('/api/voice/transcribe', { method: 'POST', body: fd });
          if (!res.ok) throw new Error(`transcribe ${res.status}`);
          const data = await res.json();
          const text = String(data?.text || '').trim();
          if (text) onTranscriptRef.current(text);
          else onErrorRef.current?.('No speech detected');
        } catch (e) {
          onErrorRef.current?.(`Transcription failed: ${e instanceof Error ? e.message : String(e)}`);
        } finally {
          setState('idle');
        }
      };

      rec.start();
      setState('recording');
    } catch (e) {
      // The stream may already be open if MediaRecorder construction or
      // start() threw; release it so the mic indicator does not stay on.
      stopTracks();
      const err = e as { name?: string; message?: string };
      let msg = `Mic error: ${err?.message || e}`;
      if (err?.name === 'NotAllowedError') msg = 'Microphone access denied.';
      else if (err?.name === 'NotFoundError') msg = 'No microphone found.';
      onErrorRef.current?.(msg);
      setState('idle');
    } finally {
      startingRef.current = false;
    }
  }, [stopTracks]);

  const stop = useCallback(() => {
    // Ask the recorder itself: MediaRecorder.stop() throws when already inactive.
    const rec = recorderRef.current;
    if (rec && rec.state !== 'inactive') rec.stop();
  }, []);

  const toggle = useCallback(() => {
    if (state === 'recording') stop();
    else if (state === 'idle') start();
  }, [state, start, stop]);

  return { state, toggle };
}
|
||||
@@ -404,6 +404,7 @@ function ChatInterface({
|
||||
renderInputWithMentions={renderInputWithMentions}
|
||||
textareaRef={textareaRef}
|
||||
input={input}
|
||||
onVoiceTranscript={(text) => setInput(input ? `${input} ${text}` : text)}
|
||||
onInputChange={handleInputChange}
|
||||
onTextareaClick={handleTextareaClick}
|
||||
onTextareaKeyDown={handleKeyDown}
|
||||
|
||||
@@ -26,6 +26,7 @@ import {
|
||||
import CommandMenu from './CommandMenu';
|
||||
import ClaudeStatus from './ClaudeStatus';
|
||||
import ImageAttachment from './ImageAttachment';
|
||||
import VoiceInputButton from './VoiceInputButton';
|
||||
import PermissionRequestsBanner from './PermissionRequestsBanner';
|
||||
import TokenUsageSummary from './TokenUsageSummary';
|
||||
|
||||
@@ -89,6 +90,7 @@ interface ChatComposerProps {
|
||||
renderInputWithMentions: (text: string) => ReactNode;
|
||||
textareaRef: RefObject<HTMLTextAreaElement>;
|
||||
input: string;
|
||||
onVoiceTranscript?: (text: string) => void;
|
||||
onInputChange: (event: ChangeEvent<HTMLTextAreaElement>) => void;
|
||||
onTextareaClick: (event: MouseEvent<HTMLTextAreaElement>) => void;
|
||||
onTextareaKeyDown: (event: KeyboardEvent<HTMLTextAreaElement>) => void;
|
||||
@@ -143,6 +145,7 @@ export default function ChatComposer({
|
||||
renderInputWithMentions,
|
||||
textareaRef,
|
||||
input,
|
||||
onVoiceTranscript,
|
||||
onInputChange,
|
||||
onTextareaClick,
|
||||
onTextareaKeyDown,
|
||||
@@ -315,6 +318,8 @@ export default function ChatComposer({
|
||||
<ImageIcon />
|
||||
</PromptInputButton>
|
||||
|
||||
{onVoiceTranscript && <VoiceInputButton onTranscript={onVoiceTranscript} />}
|
||||
|
||||
<button
|
||||
type="button"
|
||||
onClick={onModeSwitch}
|
||||
|
||||
@@ -15,6 +15,7 @@ import { Reasoning, ReasoningTrigger, ReasoningContent } from '../../../../share
|
||||
|
||||
import { Markdown } from './Markdown';
|
||||
import MessageCopyControl from './MessageCopyControl';
|
||||
import MessageSpeakControl from './MessageSpeakControl';
|
||||
|
||||
type DiffLine = {
|
||||
type: string;
|
||||
@@ -415,6 +416,9 @@ const MessageComponent = memo(({ message, prevMessage, createDiff, onFileOpen, a
|
||||
{shouldShowAssistantCopyControl && (
|
||||
<MessageCopyControl content={assistantCopyContent} messageType="assistant" />
|
||||
)}
|
||||
{shouldShowAssistantCopyControl && (
|
||||
<MessageSpeakControl content={assistantCopyContent} />
|
||||
)}
|
||||
{!isGrouped && <span>{formattedTime}</span>}
|
||||
</div>
|
||||
)}
|
||||
|
||||
@@ -0,0 +1,37 @@
|
||||
import { Volume2, Loader2, Square } from 'lucide-react';
|
||||
import { useTranslation } from 'react-i18next';
|
||||
import { useTts } from '../../hooks/useTts';
|
||||
import { useVoiceAvailable } from '../../hooks/useVoiceAvailable';
|
||||
|
||||
// Read-aloud toggle shown next to the copy control on assistant messages.
// Renders nothing while the optional voice feature is not reported as available.
const MessageSpeakControl = ({ content }: { content: string }) => {
  const { t } = useTranslation('chat');
  const available = useVoiceAvailable();
  const { state, toggle } = useTts(() => content);

  if (!available) {
    return null;
  }

  // Tooltip / accessible label for the current playback state.
  const describeAction = () => {
    if (state === 'playing') return t('voice.stopSpeaking');
    if (state === 'loading') return t('voice.loading');
    return t('voice.speak');
  };
  const title = describeAction();

  // Stop square while playing, spinner while loading, speaker when idle.
  const renderIcon = () => {
    switch (state) {
      case 'playing':
        return <Square className="h-3.5 w-3.5" />;
      case 'loading':
        return <Loader2 className="h-3.5 w-3.5 animate-spin" />;
      default:
        return <Volume2 className="h-3.5 w-3.5" />;
    }
  };

  return (
    <button
      type="button"
      onClick={toggle}
      title={title}
      aria-label={title}
      className="inline-flex items-center gap-1 rounded px-1 py-0.5 text-gray-400 transition-colors hover:text-gray-600 dark:text-gray-500 dark:hover:text-gray-300"
    >
      {renderIcon()}
    </button>
  );
};
|
||||
|
||||
export default MessageSpeakControl;
|
||||
40
src/components/chat/view/subcomponents/VoiceInputButton.tsx
Normal file
40
src/components/chat/view/subcomponents/VoiceInputButton.tsx
Normal file
@@ -0,0 +1,40 @@
|
||||
import { Mic, Square, Loader2 } from 'lucide-react';
|
||||
import { useTranslation } from 'react-i18next';
|
||||
import { useVoiceInput } from '../../hooks/useVoiceInput';
|
||||
import { useVoiceAvailable } from '../../hooks/useVoiceAvailable';
|
||||
import { PromptInputButton } from '../../../../shared/view/ui';
|
||||
|
||||
type Props = {
|
||||
onTranscript: (text: string) => void;
|
||||
onError?: (msg: string) => void;
|
||||
};
|
||||
|
||||
// Microphone toggle for dictation in the composer.
// Renders nothing while the optional voice feature is not reported as available.
export default function VoiceInputButton({ onTranscript, onError }: Props) {
  const { t } = useTranslation('chat');
  const available = useVoiceAvailable();
  const { state, toggle } = useVoiceInput(onTranscript, onError);

  if (!available) {
    return null;
  }

  const isRecording = state === 'recording';

  // Red stop square while recording, spinner while transcribing, mic when idle.
  const renderIcon = () => {
    if (isRecording) return <Square className="text-red-500" />;
    if (state === 'transcribing') return <Loader2 className="animate-spin" />;
    return <Mic />;
  };

  const tooltipText = isRecording ? t('voice.stopRecording') : t('voice.input');

  // Suppress the button's default action, then start/stop recording.
  const handleClick = (e: { preventDefault: () => void }) => {
    e.preventDefault();
    toggle();
  };

  return (
    <PromptInputButton tooltip={{ content: tooltipText }} onClick={handleClick}>
      {renderIcon()}
    </PromptInputButton>
  );
}
|
||||
Reference in New Issue
Block a user