feat(voice): add optional speech-to-text input and read-aloud TTS

Adds a push-to-talk mic button in the composer and a read-aloud button on assistant messages. Both are opt-in and hidden unless a voice backend is configured via VOICE_SIDECAR_URL. The auth-gated /api/voice proxy forwards to a configurable backend exposing /transcribe and /tts (provider-agnostic); the frontend probes /api/voice/health and hides the controls when disabled. Adds i18n keys and docs/voice.md. Includes a local, no-API-key reference backend in voice-sidecar/ (faster-whisper for STT, Kokoro-82M for TTS, both CPU-capable).
2026-06-26 05:15:48 +08:00 · 2026-06-08 00:47:14 +01:00
parent af3a28abc7
commit d05585e1f4
17 changed files with 720 additions and 0 deletions
--- a/src/components/chat/hooks/useTts.ts
+++ b/src/components/chat/hooks/useTts.ts
@@ -0,0 +1,88 @@
+import { useCallback, useEffect, useRef, useState } from 'react';
+import { authenticatedFetch } from '../../../utils/api';
+
+// Only one message speaks at a time across the whole app.
+// Module-level handle to the `stop` callback of the hook instance that most
+// recently started playback; `play` invokes it before starting a new utterance.
+let stopActive: (() => void) | null = null;
+
+/** Playback lifecycle exposed by `useTts`: idle, loading (audio being fetched), or playing. */
+export type TtsState = 'idle' | 'loading' | 'playing';
+
+/**
+ * Tap-to-speak for a single message. Sends raw markdown to /api/voice/tts
+ * (Kokoro sidecar via the Express proxy; cleaning happens server-side),
+ * plays the returned audio. Manual-gesture only (v1) to satisfy iOS autoplay.
+ *
+ * @param getText Returns the text to speak; it is read when playback starts.
+ * @returns `state` (idle / loading / playing) and `toggle`, which starts
+ *   playback when idle and stops it while loading or playing.
+ */
+export function useTts(getText: () => string) {
+  const [state, setState] = useState<TtsState>('idle');
+  const audioRef = useRef<HTMLAudioElement | null>(null);
+  const urlRef = useRef<string | null>(null);
+
+  // Tear down the current audio element and release its object URL.
+  // Handlers are detached first so that clearing `src` cannot re-enter `stop`.
+  const reset = useCallback(() => {
+    if (audioRef.current) {
+      audioRef.current.onended = null;
+      audioRef.current.onerror = null;
+      audioRef.current.pause();
+      audioRef.current.src = '';
+      audioRef.current = null;
+    }
+    if (urlRef.current) {
+      URL.revokeObjectURL(urlRef.current);
+      urlRef.current = null;
+    }
+  }, []);
+
+  const stop: () => void = useCallback(() => {
+    reset();
+    setState('idle');
+    // Release the app-wide slot only if this instance still owns it, so a late
+    // stop from one message cannot orphan another message's playback.
+    if (stopActive === stop) stopActive = null;
+  }, [reset]);
+
+  // Cleanup on unmount: free the audio resources and drop the global handle so
+  // it never points at an unmounted instance.
+  useEffect(
+    () => () => {
+      reset();
+      if (stopActive === stop) stopActive = null;
+    },
+    [reset, stop],
+  );
+
+  const play = useCallback(async () => {
+    if (stopActive) stopActive();
+    const text = getText();
+    if (!text || !text.trim()) return;
+
+    // Create + "unlock" the audio element synchronously inside the click gesture,
+    // so iOS Safari lets us play it after the async fetch resolves.
+    const audio = new Audio();
+    audioRef.current = audio;
+    audio.onended = () => stop();
+    audio.onerror = () => stop();
+    try {
+      audio.play().catch(() => {});
+      audio.pause();
+    } catch {
+      /* unlock attempt; ignore */
+    }
+    stopActive = stop;
+    setState('loading');
+
+    try {
+      const res = await authenticatedFetch('/api/voice/tts', {
+        method: 'POST',
+        body: JSON.stringify({ text }),
+      });
+      if (!res.ok) throw new Error(`tts ${res.status}`);
+      const blob = await res.blob();
+      // Stopped, superseded or unmounted while loading: bail out before an
+      // object URL is created, otherwise nothing would ever revoke it.
+      if (audioRef.current !== audio) return;
+      const url = URL.createObjectURL(blob);
+      urlRef.current = url;
+      audio.src = url;
+      audio.load();
+      await audio.play();
+      if (audioRef.current !== audio) return;
+      setState('playing');
+    } catch {
+      // A failure of a stale request must not tear down a newer playback that
+      // was started on this same hook in the meantime.
+      if (audioRef.current !== audio) return;
+      stop();
+    }
+  }, [getText, stop]);
+
+  const toggle = useCallback(() => {
+    if (state === 'playing' || state === 'loading') stop();
+    else play();
+  }, [state, play, stop]);
+
+  return { state, toggle };
+}
--- a/src/components/chat/hooks/useVoiceAvailable.ts
+++ b/src/components/chat/hooks/useVoiceAvailable.ts
@@ -0,0 +1,38 @@
+import { useEffect, useState } from 'react';
+import { authenticatedFetch } from '../../../utils/api';
+
+// Whether the optional voice feature is configured on the server (VOICE_SIDECAR_URL set).
+// Probed once and cached app-wide so the mic/speak controls can hide themselves when off.
+// `cached` holds the settled answer; `inflight` is the shared pending request so
+// concurrent callers trigger a single health check.
+let cached: boolean | null = null;
+let inflight: Promise<boolean> | null = null;
+
+/**
+ * Resolves to true when /api/voice/health reports `enabled`. Any non-OK
+ * response, malformed body or request failure resolves to false, and the
+ * outcome is remembered for all later calls.
+ */
+function probe(): Promise<boolean> {
+  if (cached !== null) return Promise.resolve(cached);
+  if (inflight === null) {
+    const request = authenticatedFetch('/api/voice/health');
+    inflight = (async () => {
+      let enabled = false;
+      try {
+        const response = await request;
+        if (response.ok) {
+          const payload = await response.json();
+          enabled = Boolean(payload?.enabled);
+        }
+      } catch {
+        enabled = false;
+      }
+      cached = enabled;
+      return enabled;
+    })();
+  }
+  return inflight;
+}
+
+/**
+ * Reports whether the voice feature is enabled. Starts from the cached probe
+ * result (false while unknown) and updates once the shared probe settles.
+ */
+export function useVoiceAvailable(): boolean {
+  const [available, setAvailable] = useState<boolean>(cached ?? false);
+
+  useEffect(() => {
+    // Flipped by the cleanup so a late probe result is ignored after unmount.
+    let cancelled = false;
+    const applyResult = (enabled: boolean) => {
+      if (cancelled) return;
+      setAvailable(enabled);
+    };
+    probe().then(applyResult);
+    return () => {
+      cancelled = true;
+    };
+  }, []);
+
+  return available;
+}
--- a/src/components/chat/hooks/useVoiceInput.ts
+++ b/src/components/chat/hooks/useVoiceInput.ts
@@ -0,0 +1,106 @@
+import { useCallback, useEffect, useRef, useState } from 'react';
+import { authenticatedFetch } from '../../../utils/api';
+
+// Mobile-safe recording: iOS Safari 18.4+ supports webm/opus; older iOS needs mp4.
+// Listed in preference order: pickMime() returns the first entry the browser's
+// MediaRecorder reports as supported.
+const MIME_CANDIDATES = [
+  'audio/webm;codecs=opus',
+  'audio/webm',
+  'audio/mp4',
+  'audio/ogg;codecs=opus',
+  'audio/ogg',
+];
+
+/**
+ * Returns the first MIME type from MIME_CANDIDATES that MediaRecorder accepts,
+ * or '' when MediaRecorder is missing or supports none of them.
+ */
+function pickMime(): string {
+  const supported = MIME_CANDIDATES.find((candidate) => {
+    try {
+      return typeof MediaRecorder !== 'undefined' && MediaRecorder.isTypeSupported(candidate);
+    } catch {
+      /* isTypeSupported can throw on some iOS versions */
+      return false;
+    }
+  });
+  return supported ?? '';
+}
+
+/** Dictation lifecycle exposed by `useVoiceInput`: idle, recording the mic, or transcribing the upload. */
+export type VoiceInputState = 'idle' | 'recording' | 'transcribing';
+
+/**
+ * Push-to-talk dictation. Records the mic, uploads to /api/voice/transcribe
+ * (faster-whisper sidecar via the Express proxy), returns text via onTranscript.
+ * Ported from tooler's VoiceInput.js.
+ *
+ * @param onTranscript Called with the trimmed transcript when speech was recognised.
+ * @param onError Optional; called with a human-readable message on mic or
+ *   transcription failure, on a too-short recording, or when no speech was detected.
+ * @returns `state` (idle / recording / transcribing) and `toggle`, which starts
+ *   recording when idle and stops it while recording.
+ */
+export function useVoiceInput(onTranscript: (text: string) => void, onError?: (msg: string) => void) {
+  const [state, setState] = useState<VoiceInputState>('idle');
+  const recorderRef = useRef<MediaRecorder | null>(null);
+  const chunksRef = useRef<Blob[]>([]);
+  const streamRef = useRef<MediaStream | null>(null);
+  // True while getUserMedia is pending, so a second click cannot open a second stream.
+  const startingRef = useRef(false);
+  const mountedRef = useRef(true);
+
+  // Always call the callbacks from the latest render. The recorder's onstop
+  // handler runs long after `start` was invoked, and callers commonly pass
+  // closures over current input text that would otherwise be stale by then.
+  const onTranscriptRef = useRef(onTranscript);
+  const onErrorRef = useRef(onError);
+  useEffect(() => {
+    onTranscriptRef.current = onTranscript;
+    onErrorRef.current = onError;
+  });
+
+  const stopTracks = useCallback(() => {
+    streamRef.current?.getTracks().forEach((t) => t.stop());
+    streamRef.current = null;
+  }, []);
+
+  // Release the microphone if the component unmounts mid-recording.
+  useEffect(() => {
+    mountedRef.current = true;
+    return () => {
+      mountedRef.current = false;
+      const rec = recorderRef.current;
+      recorderRef.current = null;
+      if (rec) {
+        // Detach handlers first: nothing should be uploaded or reported after unmount.
+        rec.ondataavailable = null;
+        rec.onstop = null;
+        if (rec.state !== 'inactive') {
+          try {
+            rec.stop();
+          } catch {
+            /* recorder already stopping; ignore */
+          }
+        }
+      }
+      stopTracks();
+    };
+  }, [stopTracks]);
+
+  const start = useCallback(async () => {
+    if (startingRef.current || recorderRef.current) return;
+    startingRef.current = true;
+    try {
+      const stream = await navigator.mediaDevices.getUserMedia({
+        audio: { echoCancellation: true, noiseSuppression: true },
+      });
+      if (!mountedRef.current) {
+        // Unmounted while the permission prompt was open: give the mic back.
+        stream.getTracks().forEach((t) => t.stop());
+        return;
+      }
+      streamRef.current = stream;
+      const mimeType = pickMime();
+      const rec = mimeType ? new MediaRecorder(stream, { mimeType }) : new MediaRecorder(stream);
+      recorderRef.current = rec;
+      chunksRef.current = [];
+
+      rec.ondataavailable = (e) => {
+        if (e.data.size > 0) chunksRef.current.push(e.data);
+      };
+
+      rec.onstop = async () => {
+        recorderRef.current = null;
+        stopTracks();
+        const type = rec.mimeType || 'audio/webm';
+        const blob = new Blob(chunksRef.current, { type });
+        chunksRef.current = [];
+        // Recordings this small are treated as accidental taps rather than speech.
+        if (blob.size < 800) {
+          setState('idle');
+          onErrorRef.current?.('Recording too short');
+          return;
+        }
+        setState('transcribing');
+        try {
+          const ext = type.includes('mp4') ? 'm4a' : type.includes('ogg') ? 'ogg' : 'webm';
+          const fd = new FormData();
+          fd.append('audio', blob, `recording.${ext}`);
+          const res = await authenticatedFetch('/api/voice/transcribe', { method: 'POST', body: fd });
+          if (!res.ok) throw new Error(`transcribe ${res.status}`);
+          const data = await res.json();
+          if (!mountedRef.current) return;
+          const text = String(data?.text || '').trim();
+          if (text) onTranscriptRef.current(text);
+          else onErrorRef.current?.('No speech detected');
+        } catch (e) {
+          if (mountedRef.current) {
+            onErrorRef.current?.(`Transcription failed: ${e instanceof Error ? e.message : String(e)}`);
+          }
+        } finally {
+          if (mountedRef.current) setState('idle');
+        }
+      };
+
+      rec.start();
+      setState('recording');
+    } catch (e) {
+      // The stream may already be open when the MediaRecorder constructor or
+      // start() throws; release it so the mic indicator does not stay on.
+      recorderRef.current = null;
+      stopTracks();
+      if (!mountedRef.current) return;
+      const err = e as { name?: string; message?: string };
+      let msg = `Mic error: ${err?.message || e}`;
+      if (err?.name === 'NotAllowedError') msg = 'Microphone access denied.';
+      else if (err?.name === 'NotFoundError') msg = 'No microphone found.';
+      onErrorRef.current?.(msg);
+      setState('idle');
+    } finally {
+      startingRef.current = false;
+    }
+  }, [stopTracks]);
+
+  const stop = useCallback(() => {
+    // Ask the recorder itself rather than React state, which can lag a render
+    // behind; stop() throws on an inactive recorder.
+    const rec = recorderRef.current;
+    if (rec && rec.state !== 'inactive') rec.stop();
+  }, []);
+
+  const toggle = useCallback(() => {
+    if (state === 'recording') stop();
+    else if (state === 'idle') start();
+  }, [state, start, stop]);
+
+  return { state, toggle };
+}
--- a/src/components/chat/view/ChatInterface.tsx
+++ b/src/components/chat/view/ChatInterface.tsx
@@ -404,6 +404,7 @@ function ChatInterface({
          renderInputWithMentions={renderInputWithMentions}
          textareaRef={textareaRef}
          input={input}
+          onVoiceTranscript={(text) => setInput(input ? `${input} ${text}` : text)}
          onInputChange={handleInputChange}
          onTextareaClick={handleTextareaClick}
          onTextareaKeyDown={handleKeyDown}
--- a/src/components/chat/view/subcomponents/ChatComposer.tsx
+++ b/src/components/chat/view/subcomponents/ChatComposer.tsx
@@ -26,6 +26,7 @@ import {
 import CommandMenu from './CommandMenu';
 import ClaudeStatus from './ClaudeStatus';
 import ImageAttachment from './ImageAttachment';
+import VoiceInputButton from './VoiceInputButton';
 import PermissionRequestsBanner from './PermissionRequestsBanner';
 import TokenUsageSummary from './TokenUsageSummary';

@@ -89,6 +90,7 @@ interface ChatComposerProps {
  renderInputWithMentions: (text: string) => ReactNode;
  textareaRef: RefObject<HTMLTextAreaElement>;
  input: string;
+  onVoiceTranscript?: (text: string) => void;
  onInputChange: (event: ChangeEvent<HTMLTextAreaElement>) => void;
  onTextareaClick: (event: MouseEvent<HTMLTextAreaElement>) => void;
  onTextareaKeyDown: (event: KeyboardEvent<HTMLTextAreaElement>) => void;
@@ -143,6 +145,7 @@ export default function ChatComposer({
  renderInputWithMentions,
  textareaRef,
  input,
+  onVoiceTranscript,
  onInputChange,
  onTextareaClick,
  onTextareaKeyDown,
@@ -315,6 +318,8 @@ export default function ChatComposer({
              <ImageIcon />
            </PromptInputButton>

+            {onVoiceTranscript && <VoiceInputButton onTranscript={onVoiceTranscript} />}
+
            <button
              type="button"
              onClick={onModeSwitch}
--- a/src/components/chat/view/subcomponents/MessageComponent.tsx
+++ b/src/components/chat/view/subcomponents/MessageComponent.tsx
@@ -15,6 +15,7 @@ import { Reasoning, ReasoningTrigger, ReasoningContent } from '../../../../share

 import { Markdown } from './Markdown';
 import MessageCopyControl from './MessageCopyControl';
+import MessageSpeakControl from './MessageSpeakControl';

 type DiffLine = {
  type: string;
@@ -415,6 +416,9 @@ const MessageComponent = memo(({ message, prevMessage, createDiff, onFileOpen, a
                {shouldShowAssistantCopyControl && (
                  <MessageCopyControl content={assistantCopyContent} messageType="assistant" />
                )}
+                {shouldShowAssistantCopyControl && (
+                  <MessageSpeakControl content={assistantCopyContent} />
+                )}
                {!isGrouped && <span>{formattedTime}</span>}
              </div>
            )}
--- a/src/components/chat/view/subcomponents/MessageSpeakControl.tsx
+++ b/src/components/chat/view/subcomponents/MessageSpeakControl.tsx
@@ -0,0 +1,37 @@
+import { Volume2, Loader2, Square } from 'lucide-react';
+import { useTranslation } from 'react-i18next';
+import { useTts } from '../../hooks/useTts';
+import { useVoiceAvailable } from '../../hooks/useVoiceAvailable';
+
+// Read-aloud button shown next to the copy control on assistant messages.
+// Produces no output while the optional voice feature is unavailable.
+type MessageSpeakControlProps = { content: string };
+
+function MessageSpeakControl({ content }: MessageSpeakControlProps) {
+  const { t } = useTranslation('chat');
+  const available = useVoiceAvailable();
+  const { state, toggle } = useTts(() => content);
+
+  if (!available) return null;
+
+  // Label doubles as tooltip and accessible name; it follows the playback state.
+  const resolveTitle = () => {
+    switch (state) {
+      case 'playing':
+        return t('voice.stopSpeaking');
+      case 'loading':
+        return t('voice.loading');
+      default:
+        return t('voice.speak');
+    }
+  };
+
+  // Stop square while playing, spinner while loading, speaker otherwise.
+  const renderIcon = () => {
+    switch (state) {
+      case 'playing':
+        return <Square className="h-3.5 w-3.5" />;
+      case 'loading':
+        return <Loader2 className="h-3.5 w-3.5 animate-spin" />;
+      default:
+        return <Volume2 className="h-3.5 w-3.5" />;
+    }
+  };
+
+  const title = resolveTitle();
+
+  return (
+    <button
+      type="button"
+      onClick={toggle}
+      title={title}
+      aria-label={title}
+      className="inline-flex items-center gap-1 rounded px-1 py-0.5 text-gray-400 transition-colors hover:text-gray-600 dark:text-gray-500 dark:hover:text-gray-300"
+    >
+      {renderIcon()}
+    </button>
+  );
+}
+
+export default MessageSpeakControl;
--- a/src/components/chat/view/subcomponents/VoiceInputButton.tsx
+++ b/src/components/chat/view/subcomponents/VoiceInputButton.tsx
@@ -0,0 +1,40 @@
+import { Mic, Square, Loader2 } from 'lucide-react';
+import { useTranslation } from 'react-i18next';
+import { useVoiceInput } from '../../hooks/useVoiceInput';
+import { useVoiceAvailable } from '../../hooks/useVoiceAvailable';
+import { PromptInputButton } from '../../../../shared/view/ui';
+
+type Props = {
+  // Receives the recognised text once transcription succeeds.
+  onTranscript: (text: string) => void;
+  // Receives a message when recording or transcription fails.
+  onError?: (msg: string) => void;
+};
+
+// Microphone toggle for dictation in the composer.
+// Produces no output while the optional voice feature is unavailable.
+export default function VoiceInputButton({ onTranscript, onError }: Props) {
+  const { t } = useTranslation('chat');
+  const available = useVoiceAvailable();
+  const { state, toggle } = useVoiceInput(onTranscript, onError);
+
+  if (!available) return null;
+
+  const isRecording = state === 'recording';
+  const tooltipText = isRecording ? t('voice.stopRecording') : t('voice.input');
+
+  // Red stop square while recording, spinner while transcribing, mic when idle.
+  const renderIcon = () => {
+    if (isRecording) return <Square className="text-red-500" />;
+    if (state === 'transcribing') return <Loader2 className="animate-spin" />;
+    return <Mic />;
+  };
+
+  // preventDefault is called before toggling, as the button sits inside the prompt input.
+  const handleClick = (event: { preventDefault: () => void }) => {
+    event.preventDefault();
+    toggle();
+  };
+
+  return (
+    <PromptInputButton tooltip={{ content: tooltipText }} onClick={handleClick}>
+      {renderIcon()}
+    </PromptInputButton>
+  );
+}