feat(voice): add optional speech-to-text input and read-aloud TTS

Adds a push-to-talk mic button in the composer and a read-aloud button on
assistant messages. Both are opt-in and hidden unless a voice backend is
configured via VOICE_SIDECAR_URL.

The auth-gated /api/voice proxy forwards to a configurable backend exposing
/transcribe and /tts (provider-agnostic); the frontend probes /api/voice/health
and hides the controls when disabled. Adds i18n keys and docs/voice.md.

Includes a local, no-API-key reference backend in voice-sidecar/ (faster-whisper
for STT, Kokoro-82M for TTS, both CPU-capable).
This commit is contained in:
newsbubbles
2026-06-08 00:47:14 +01:00
parent af3a28abc7
commit d05585e1f4
17 changed files with 720 additions and 0 deletions

View File

@@ -0,0 +1,88 @@
import { useCallback, useEffect, useRef, useState } from 'react';
import { authenticatedFetch } from '../../../utils/api';
// Only one message speaks at a time across the whole app.
// Holds the `stop` callback of the useTts instance that most recently started
// playback (assigned in play(), invoked before the next playback begins), or
// null when no instance has claimed it.
let stopActive: (() => void) | null = null;
/** Playback lifecycle of one useTts instance: 'idle', 'loading' (audio being fetched), or 'playing'. */
export type TtsState = 'idle' | 'loading' | 'playing';
/**
 * Tap-to-speak for a single message. Sends raw markdown to /api/voice/tts
 * (Kokoro sidecar via the Express proxy; cleaning happens server-side) and
 * plays the returned audio. Manual-gesture only (v1) to satisfy iOS autoplay.
 *
 * Only one hook instance plays at a time: starting playback first invokes the
 * module-level `stopActive` callback of whichever instance was active.
 *
 * @param getText Returns the text to speak; called on every play attempt.
 *   Empty or whitespace-only text makes the play attempt a no-op.
 * @returns `state` ('idle' | 'loading' | 'playing') and `toggle`, which starts
 *   playback when idle and stops it otherwise.
 */
export function useTts(getText: () => string) {
  const [state, setState] = useState<TtsState>('idle');
  const audioRef = useRef<HTMLAudioElement | null>(null);
  const urlRef = useRef<string | null>(null);

  // Releases the audio element and its object URL. Handlers are detached first
  // so that clearing `src` cannot re-enter stop() through onerror.
  const reset = useCallback(() => {
    const audio = audioRef.current;
    if (audio) {
      audio.onended = null;
      audio.onerror = null;
      audio.pause();
      audio.src = '';
      audioRef.current = null;
    }
    if (urlRef.current) {
      URL.revokeObjectURL(urlRef.current);
      urlRef.current = null;
    }
  }, []);

  // Explicit annotation avoids a self-referential inference cycle: the body
  // compares against `stop` itself.
  const stop: () => void = useCallback(() => {
    reset();
    setState('idle');
    // Release the app-wide slot only if this instance owns it, so stopping one
    // message can never clear another message's registration.
    if (stopActive === stop) stopActive = null;
  }, [reset]);

  // Cleanup on unmount: free audio resources and drop our registration so the
  // global slot does not keep a callback for an unmounted component.
  useEffect(
    () => () => {
      reset();
      if (stopActive === stop) stopActive = null;
    },
    [reset, stop],
  );

  const play = useCallback(async () => {
    if (stopActive) stopActive();
    const text = getText();
    if (!text || !text.trim()) return;

    // Create + "unlock" the audio element synchronously inside the click gesture,
    // so iOS Safari lets us play it after the async fetch resolves.
    const audio = new Audio();
    audioRef.current = audio;
    audio.onended = () => stop();
    audio.onerror = () => stop();
    try {
      audio.play().catch(() => {});
      audio.pause();
    } catch {
      /* unlock attempt; ignore */
    }
    stopActive = stop;
    setState('loading');

    // True while this invocation's element is still the one the hook owns.
    // It turns false once stop()/reset() ran or a newer play() replaced it.
    const isCurrent = () => audioRef.current === audio;

    try {
      const res = await authenticatedFetch('/api/voice/tts', {
        method: 'POST',
        body: JSON.stringify({ text }),
      });
      if (!res.ok) throw new Error(`tts ${res.status}`);
      const blob = await res.blob();
      // Stopped or superseded while loading: bail out BEFORE creating an object
      // URL, so a stale request can neither leak a URL nor overwrite the URL
      // that belongs to the live playback.
      if (!isCurrent()) return;
      const url = URL.createObjectURL(blob);
      urlRef.current = url;
      audio.src = url;
      audio.load();
      await audio.play();
      if (isCurrent()) setState('playing');
    } catch {
      // A failure of a stale request must not tear down a newer playback.
      // For the live request, stop() also releases the app-wide slot.
      if (isCurrent()) stop();
    }
  }, [getText, stop]);

  const toggle = useCallback(() => {
    if (state === 'playing' || state === 'loading') stop();
    else void play();
  }, [state, play, stop]);

  return { state, toggle };
}

View File

@@ -0,0 +1,38 @@
import { useEffect, useState } from 'react';
import { authenticatedFetch } from '../../../utils/api';
// Whether the optional voice feature is configured on the server (VOICE_SIDECAR_URL set).
// Probed once and cached app-wide so the mic/speak controls can hide themselves when off.
// `cached` is null until the first probe settles; afterwards it holds the result.
let cached: boolean | null = null;
// The pending (or settled) health request, shared so concurrent callers reuse one fetch.
let inflight: Promise<boolean> | null = null;
/**
 * Resolves to whether the server reports the voice feature as enabled.
 *
 * The answer comes from the module-level cache when one exists; otherwise a
 * single request to /api/voice/health is shared by all callers. A non-OK
 * response, a payload without a truthy `enabled`, or any thrown error is
 * recorded as `false`.
 */
function probe(): Promise<boolean> {
  if (cached !== null) return Promise.resolve(cached);
  if (inflight) return inflight;

  // Record the outcome in the app-wide cache and hand it back to the chain.
  const settle = (enabled: boolean): boolean => {
    cached = enabled;
    return enabled;
  };

  inflight = authenticatedFetch('/api/voice/health')
    .then((response) => (response.ok ? response.json() : { enabled: false }))
    .then((payload) => settle(Boolean(payload?.enabled)))
    .catch(() => settle(false));
  return inflight;
}
/**
 * React hook reporting whether voice controls should be shown.
 *
 * Starts from the cached probe result (or `false` when none exists yet) and
 * updates once the probe resolves, unless the component has unmounted by then.
 */
export function useVoiceAvailable(): boolean {
  const [available, setAvailable] = useState<boolean>(cached ?? false);

  useEffect(() => {
    // Flipped by the cleanup so a late probe result is dropped after unmount.
    let active = true;
    const publish = (enabled: boolean): void => {
      if (active) setAvailable(enabled);
    };
    probe().then(publish);
    return () => {
      active = false;
    };
  }, []);

  return available;
}

View File

@@ -0,0 +1,106 @@
import { useCallback, useRef, useState } from 'react';
import { authenticatedFetch } from '../../../utils/api';
// Mobile-safe recording: iOS Safari 18.4+ supports webm/opus; older iOS needs mp4.
// Listed in preference order: pickMime() returns the first entry the browser's
// MediaRecorder reports as supported.
const MIME_CANDIDATES = [
'audio/webm;codecs=opus',
'audio/webm',
'audio/mp4',
'audio/ogg;codecs=opus',
'audio/ogg',
];
/**
 * Picks the recording MIME type: the first entry of MIME_CANDIDATES that
 * MediaRecorder reports as supported.
 *
 * @returns The chosen MIME type, or '' when MediaRecorder is unavailable or
 *   no candidate is supported.
 */
function pickMime(): string {
  if (typeof MediaRecorder === 'undefined') return '';

  const isRecordable = (mime: string): boolean => {
    try {
      return MediaRecorder.isTypeSupported(mime);
    } catch {
      /* isTypeSupported can throw on some iOS versions */
      return false;
    }
  };

  return MIME_CANDIDATES.find(isRecordable) ?? '';
}
/** Dictation lifecycle: 'idle', 'recording' (mic capturing), or 'transcribing' (upload in progress). */
export type VoiceInputState = 'idle' | 'recording' | 'transcribing';
/**
 * Push-to-talk dictation. Records the mic, uploads to /api/voice/transcribe
 * (faster-whisper sidecar via the Express proxy), returns text via onTranscript.
 * Ported from tooler's VoiceInput.js.
 *
 * @param onTranscript Called with the trimmed transcript when the server
 *   returns non-empty text.
 * @param onError Optional; called with a human-readable message when the mic
 *   cannot be opened, the recording is too short, no speech is detected, or
 *   transcription fails.
 * @returns `state` ('idle' | 'recording' | 'transcribing') and `toggle`, which
 *   starts recording when idle and stops it when recording.
 *
 * NOTE(review): nothing stops an in-progress recording when the component
 * unmounts; callers that can unmount mid-recording should stop it first.
 */
export function useVoiceInput(onTranscript: (text: string) => void, onError?: (msg: string) => void) {
  const [state, setState] = useState<VoiceInputState>('idle');
  const recorderRef = useRef<MediaRecorder | null>(null);
  const chunksRef = useRef<Blob[]>([]);
  const streamRef = useRef<MediaStream | null>(null);
  // True while start() is between its first line and a running recorder (or a
  // failure). React state stays 'idle' during the permission prompt, so this
  // ref is what prevents a second click from opening a second stream.
  const startingRef = useRef(false);

  // Stops every track of the current stream, which releases the microphone.
  const stopTracks = () => {
    streamRef.current?.getTracks().forEach((t) => t.stop());
    streamRef.current = null;
  };

  const start = useCallback(async () => {
    if (startingRef.current) return;
    startingRef.current = true;
    try {
      const stream = await navigator.mediaDevices.getUserMedia({
        audio: { echoCancellation: true, noiseSuppression: true },
      });
      streamRef.current = stream;
      const mimeType = pickMime();
      const rec = mimeType ? new MediaRecorder(stream, { mimeType }) : new MediaRecorder(stream);
      recorderRef.current = rec;
      chunksRef.current = [];

      rec.ondataavailable = (e) => {
        if (e.data.size > 0) chunksRef.current.push(e.data);
      };

      rec.onstop = async () => {
        if (recorderRef.current === rec) recorderRef.current = null;
        stopTracks();
        const type = rec.mimeType || 'audio/webm';
        const blob = new Blob(chunksRef.current, { type });
        // The blob holds the data now; drop the chunk list so it is not retained.
        chunksRef.current = [];
        // Recordings under 800 bytes are rejected as too short to transcribe.
        if (blob.size < 800) {
          setState('idle');
          onError?.('Recording too short');
          return;
        }
        setState('transcribing');
        try {
          const ext = type.includes('mp4') ? 'm4a' : type.includes('ogg') ? 'ogg' : 'webm';
          const fd = new FormData();
          fd.append('audio', blob, `recording.${ext}`);
          const res = await authenticatedFetch('/api/voice/transcribe', { method: 'POST', body: fd });
          if (!res.ok) throw new Error(`transcribe ${res.status}`);
          const data = await res.json();
          const text = String(data?.text || '').trim();
          if (text) onTranscript(text);
          else onError?.('No speech detected');
        } catch (e) {
          onError?.(`Transcription failed: ${e instanceof Error ? e.message : String(e)}`);
        } finally {
          setState('idle');
        }
      };

      rec.start();
      setState('recording');
    } catch (e) {
      // If the stream was acquired but the recorder could not be created or
      // started, release the microphone instead of leaving it live.
      stopTracks();
      recorderRef.current = null;
      const err = e as { name?: string; message?: string };
      let msg = `Mic error: ${err?.message || e}`;
      if (err?.name === 'NotAllowedError') msg = 'Microphone access denied.';
      else if (err?.name === 'NotFoundError') msg = 'No microphone found.';
      onError?.(msg);
      setState('idle');
    } finally {
      startingRef.current = false;
    }
  }, [onTranscript, onError]);

  const stop = useCallback(() => {
    const rec = recorderRef.current;
    // MediaRecorder.stop() throws InvalidStateError on an inactive recorder,
    // which a second click can hit before onstop has updated React state.
    if (rec && state === 'recording' && rec.state !== 'inactive') rec.stop();
  }, [state]);

  const toggle = useCallback(() => {
    if (state === 'recording') stop();
    else if (state === 'idle') void start();
  }, [state, start, stop]);

  return { state, toggle };
}