feat(voice): add optional speech-to-text input and read-aloud TTS

Adds a push-to-talk mic button in the composer and a read-aloud button on assistant messages. Both are opt-in and hidden unless a voice backend is configured via VOICE_SIDECAR_URL. The auth-gated /api/voice proxy forwards to a configurable backend exposing /transcribe and /tts (provider-agnostic); the frontend probes /api/voice/health and hides the controls when disabled. Adds i18n keys and docs/voice.md. Includes a local, no-API-key reference backend in voice-sidecar/ (faster-whisper for STT, Kokoro-82M for TTS, both CPU-capable).
2026-06-26 13:35:49 +08:00 · 2026-06-08 00:47:14 +01:00
parent af3a28abc7
commit d05585e1f4
17 changed files with 720 additions and 0 deletions
--- a/src/components/chat/hooks/useTts.ts
+++ b/src/components/chat/hooks/useTts.ts
@@ -0,0 +1,88 @@
+import { useCallback, useEffect, useRef, useState } from 'react';
+import { authenticatedFetch } from '../../../utils/api';
+
+// Stop function of the hook instance that most recently started speaking (null when none); `play` calls it first so only one message speaks at a time app-wide.
+let stopActive: (() => void) | null = null;
+
+/**
+ * Playback phase of a single message's read-aloud control:
+ * `idle` (initial state and after any stop or failure), `loading` (set before
+ * the TTS request is sent), `playing` (set once the audio has started).
+ */
+export type TtsState =
+  | 'idle'
+  | 'loading'
+  | 'playing';
+
+/**
+ * Tap-to-speak for a single message.
+ *
+ * `toggle` starts playback when idle and stops it while loading or playing.
+ * Starting sends the raw text from `getText` to `/api/voice/tts` and plays the
+ * returned audio blob. The text is sent unmodified (per the original note,
+ * markdown cleanup is expected to happen server-side). Playback is
+ * manual-gesture only (v1) to satisfy iOS autoplay rules.
+ *
+ * Starting playback first stops whichever hook instance is registered in the
+ * module-level `stopActive` slot, so only one message speaks at a time.
+ *
+ * @param getText Returns the text to speak; called once per play attempt. When
+ *   it returns an empty or whitespace-only string, nothing is requested or played.
+ * @returns The current playback `state` and a `toggle` handler intended to be
+ *   called from a user gesture (e.g. a click).
+ */
+export function useTts(getText: () => string): { state: TtsState; toggle: () => void } {
+  const [state, setState] = useState<TtsState>('idle');
+  const audioRef = useRef<HTMLAudioElement | null>(null);
+  const urlRef = useRef<string | null>(null);
+  // The function this instance last placed in `stopActive` (null when none),
+  // kept so the slot is only cleared while it still belongs to this instance.
+  const registeredStopRef = useRef<(() => void) | null>(null);
+
+  // Tear down the current audio element and object URL, if any.
+  const reset = useCallback(() => {
+    if (audioRef.current) {
+      // Detach handlers first so the teardown below cannot re-enter `stop`.
+      audioRef.current.onended = null;
+      audioRef.current.onerror = null;
+      audioRef.current.pause();
+      audioRef.current.src = '';
+      audioRef.current = null;
+    }
+    if (urlRef.current) {
+      URL.revokeObjectURL(urlRef.current);
+      urlRef.current = null;
+    }
+  }, []);
+
+  // Give up the app-wide slot, but only if this instance still owns it; another
+  // message may have registered itself in the meantime.
+  const releaseSlot = useCallback(() => {
+    if (registeredStopRef.current !== null && stopActive === registeredStopRef.current) {
+      stopActive = null;
+    }
+    registeredStopRef.current = null;
+  }, []);
+
+  const stop = useCallback(() => {
+    reset();
+    setState('idle');
+    releaseSlot();
+  }, [reset, releaseSlot]);
+
+  // Cleanup on unmount: free the audio resources and drop our registration so
+  // the module-level slot never keeps a closure of an unmounted component.
+  useEffect(
+    () => () => {
+      reset();
+      releaseSlot();
+    },
+    [reset, releaseSlot],
+  );
+
+  const play = useCallback(async () => {
+    if (stopActive) stopActive();
+    const text = getText();
+    if (!text || !text.trim()) return;
+
+    // Create + "unlock" the audio element synchronously inside the click gesture,
+    // so iOS Safari lets us play it after the async fetch resolves.
+    const audio = new Audio();
+    audioRef.current = audio;
+    audio.onended = () => stop();
+    audio.onerror = () => stop();
+    try {
+      audio.play().catch(() => {});
+      audio.pause();
+    } catch {
+      /* unlock attempt; ignore */
+    }
+    stopActive = stop;
+    registeredStopRef.current = stop;
+    setState('loading');
+
+    // This attempt is identified by its audio element. Once `reset` has run or a
+    // newer attempt has replaced the element, everything below must be a no-op
+    // so a late response or failure cannot disturb the newer playback.
+    const isCurrent = () => audioRef.current === audio;
+
+    try {
+      const res = await authenticatedFetch('/api/voice/tts', {
+        method: 'POST',
+        body: JSON.stringify({ text }),
+      });
+      if (!res.ok) throw new Error(`tts ${res.status}`);
+      const blob = await res.blob();
+      // Check before creating the object URL so a cancelled attempt never
+      // allocates one (it would leak or overwrite the newer attempt's URL).
+      if (!isCurrent()) return; // stopped while loading
+      const url = URL.createObjectURL(blob);
+      urlRef.current = url;
+      audio.src = url;
+      audio.load();
+      await audio.play();
+      if (!isCurrent()) return; // stopped while playback was starting
+      setState('playing');
+    } catch {
+      // Best-effort: any request or playback failure returns the control to idle.
+      // A failure of a superseded attempt is ignored entirely.
+      if (isCurrent()) stop();
+    }
+  }, [getText, stop]);
+
+  const toggle = useCallback(() => {
+    if (state === 'playing' || state === 'loading') stop();
+    else void play(); // fire-and-forget; `play` handles its own request/playback errors
+  }, [state, play, stop]);
+
+  return { state, toggle };
+}