feat(voice): add optional speech-to-text input and read-aloud TTS

Adds a push-to-talk mic button in the composer and a read-aloud button on assistant messages. Both are opt-in and hidden unless a voice backend is configured via VOICE_SIDECAR_URL. The auth-gated /api/voice proxy forwards to a configurable backend exposing /transcribe and /tts (provider-agnostic); the frontend probes /api/voice/health and hides the controls when disabled. Adds i18n keys and docs/voice.md. Includes a local, no-API-key reference backend in voice-sidecar/ (faster-whisper for STT, Kokoro-82M for TTS, both CPU-capable).
2026-06-25 04:13:51 +08:00 · 2026-06-08 00:47:14 +01:00
parent af3a28abc7
commit d05585e1f4
17 changed files with 720 additions and 0 deletions
--- a/.gitignore
+++ b/.gitignore
@@ -142,3 +142,10 @@ tasks/

 # Git worktrees
 .worktrees/
+
+# Voice sidecar (Python) — generated, machine-specific, not committed
+voice-sidecar/.venv/
+voice-sidecar/voice_messages/
+voice-sidecar/**/__pycache__/
+*.pyc
+*.wav
--- a/docs/voice.md
+++ b/docs/voice.md
@@ -0,0 +1,57 @@
+# Voice (optional)
+
+Adds two opt-in voice features to the chat:
+
+- **Push-to-talk dictation** — a mic button in the composer records your voice, transcribes it
+  (speech-to-text), and drops the text into the input.
+- **Read-aloud** — a speaker button on each assistant message plays it back (text-to-speech).
+
+Voice is **disabled by default**. The UI only appears when a voice backend is configured, so it has
+zero impact on installs that don't use it.
+
+## Enable it
+
+Set `VOICE_SIDECAR_URL` for the server to point at a voice backend, then restart:
+
+```bash
+VOICE_SIDECAR_URL=http://127.0.0.1:8765 npm run server
+```
+
+When set, `GET /api/voice/health` reports `{ "enabled": true }` and the mic + speaker controls appear.
+All voice requests are proxied through the app's authenticated `/api/voice/*` routes, so the backend
+itself only needs to listen on localhost and is never exposed directly.
+
+## Backend contract
+
+`VOICE_SIDECAR_URL` can point at **any** service that implements two endpoints:
+
+| Method & path | Request | Response |
+|---|---|---|
+| `POST /transcribe` | multipart, field `audio` (webm/mp4/wav/…) | `{ "text": "..." }` |
+| `POST /tts` | form field `text` | audio bytes (`audio/*`, e.g. wav/mp3) |
+
+This keeps the feature provider-agnostic — you can back it with the bundled local sidecar, or a cloud
+transcription + TTS gateway, as long as it speaks that contract.
+
+## Reference backend: `voice-sidecar/`
+
+A local, no-API-key reference implementation using **faster-whisper** (STT) and **Kokoro-82M** (TTS),
+both CPU-capable.
+
+```bash
+cd voice-sidecar
+python -m venv .venv && . .venv/bin/activate    # (Windows: .venv\Scripts\activate)
+pip install -r requirements.txt
+python -m uvicorn app:app --host 127.0.0.1 --port 8765
+```
+
+Then run the app with `VOICE_SIDECAR_URL=http://127.0.0.1:8765`.
+
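+Config (environment variables, all optional) — see `voice-sidecar/.env.example` for the defaults:
+`WHISPER_MODEL_SIZE`, `WHISPER_DEVICE` (`cpu`/`cuda`), `KOKORO_VOICE`, `KOKORO_LANG`, `VOICE_PORT`.
+`VOICE_PORT` is only read when the sidecar is started with `python app.py`; the `uvicorn` command
+above takes its port from `--port`.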
+
+## Notes
+
+- The first read-aloud is slow (~10–20s) while the model lazy-loads; later requests skip the load, and
+  audio for identical text is served from the sidecar's cache.
+- Recording needs a secure context (HTTPS or localhost) for microphone access.
+- On iOS, playback is tap-initiated (manual read-aloud) to satisfy Safari's autoplay policy.
--- a/server/index.js
+++ b/server/index.js
@@ -72,6 +72,7 @@ import userRoutes from './routes/user.js';
 import geminiRoutes from './routes/gemini.js';
 import pluginsRoutes from './routes/plugins.js';
 import providerRoutes from './modules/providers/provider.routes.js';
+import voiceRoutes from './voice-proxy.js';
 import { startEnabledPluginServers, stopAllPlugins, getPluginPort } from './utils/plugin-process-manager.js';
 import { initializeDatabase, projectsDb, sessionsDb } from './modules/database/index.js';
 import { configureWebPush } from './services/vapid-keys.js';
@@ -204,6 +205,8 @@ app.use('/api/providers', authenticateToken, providerRoutes);
 // Agent API Routes (uses API key authentication)
 app.use('/api/agent', agentRoutes);

+app.use('/api/voice', authenticateToken, voiceRoutes);
+
 // Serve public files (like api-docs.html)
 app.use(express.static(path.join(APP_ROOT, 'public')));

--- a/server/voice-proxy.js
+++ b/server/voice-proxy.js
@@ -0,0 +1,87 @@
+// Optional voice proxy — forwards speech-to-text / text-to-speech to a configurable backend.
+//
+// Opt-in: voice is DISABLED unless VOICE_SIDECAR_URL is set. When set, it must point at a
+// backend (any implementation) exposing:
+//     POST /transcribe   (multipart field 'audio')  -> { text }
+//     POST /tts          (form field 'text')        -> audio bytes (audio/*)
+// A reference backend (local faster-whisper + Kokoro) ships in /voice-sidecar, but any
+// service implementing the two endpoints works (e.g. a cloud transcription + TTS gateway).
+//
+// Mounted at /api/voice behind authenticateToken, so it inherits the app's auth. The backend
+// should bind to localhost and is never exposed directly.
+import express from 'express';
+
+// Base URL of the voice backend, read once at startup. A single trailing slash is dropped so
+// request paths can be appended as `${VOICE_SIDECAR_URL}/transcribe`.
+const configuredVoiceUrl = process.env.VOICE_SIDECAR_URL || '';
+const VOICE_SIDECAR_URL = configuredVoiceUrl.endsWith('/')
+  ? configuredVoiceUrl.slice(0, -1)
+  : configuredVoiceUrl;
+// Voice is enabled exactly when a non-empty backend URL is configured.
+const VOICE_ENABLED = VOICE_SIDECAR_URL !== '';
+
+const router = express.Router();
+
+// Multer instance (memory storage) for the audio upload, built on first use so the module is
+// only imported once a transcription request actually arrives.
+let _upload = null;
+async function getUpload() {
+  if (_upload) return _upload;
+  const { default: multer } = await import('multer');
+  _upload = multer({
+    limits: { fileSize: 25 * 1024 * 1024 }, // 25MB — short dictation clips
+    storage: multer.memoryStorage(),
+  });
+  return _upload;
+}
+
+/**
+ * Guard for the voice routes: when no backend is configured, answers 503 with a JSON error.
+ *
+ * @param {import('express').Response} res - Response used to send the 503.
+ * @returns {boolean} true when voice is enabled and the caller may proceed; false when the
+ *   503 response has already been sent.
+ */
+function ensureEnabled(res) {
+  if (VOICE_ENABLED) return true;
+  res.status(503).json({ error: 'Voice is not configured. Set VOICE_SIDECAR_URL to enable it.' });
+  return false;
+}
+
+// GET /api/voice/health -> { enabled }  (frontend hides the voice UI when disabled)
+router.get('/health', (_req, res) => {
+  res.json({ enabled: VOICE_ENABLED });
+});
+
+// POST /api/voice/transcribe  (multipart 'audio') -> { text }
+//
+// Parses the upload into memory, re-posts it to `${VOICE_SIDECAR_URL}/transcribe` and relays the
+// backend's status and JSON body. Error responses:
+//   503 voice not configured · 413 upload over the size limit · 400 other upload errors or a
+//   missing 'audio' field · 502 backend unreachable or answered 2xx with a non-JSON body ·
+//   500 the upload parser could not be initialised.
+router.post('/transcribe', async (req, res) => {
+  if (!ensureEnabled(res)) return;
+
+  let upload;
+  try {
+    // Awaited inside try: a failed lazy import must not turn into an unhandled rejection
+    // that leaves the request hanging.
+    upload = await getUpload();
+  } catch (e) {
+    res.status(500).json({ error: `voice upload unavailable: ${e.message}` });
+    return;
+  }
+
+  upload.single('audio')(req, res, async (err) => {
+    if (err) {
+      // Multer tags an oversized file with code LIMIT_FILE_SIZE.
+      const status = err.code === 'LIMIT_FILE_SIZE' ? 413 : 400;
+      return res.status(status).json({ error: err.message });
+    }
+    if (!req.file) return res.status(400).json({ error: 'No audio uploaded' });
+    try {
+      const fd = new FormData();
+      fd.append(
+        'audio',
+        new Blob([req.file.buffer], { type: req.file.mimetype || 'audio/webm' }),
+        req.file.originalname || 'recording.webm',
+      );
+      const r = await fetch(`${VOICE_SIDECAR_URL}/transcribe`, { method: 'POST', body: fd });
+      let data;
+      try {
+        data = await r.json();
+      } catch {
+        // A non-JSON body breaks the backend contract; never relay it under the backend's 2xx.
+        return res.status(r.ok ? 502 : r.status).json({ error: 'bad voice backend response' });
+      }
+      res.status(r.status).json(data);
+    } catch (e) {
+      res.status(502).json({ error: `voice backend unreachable: ${e.message}` });
+    }
+  });
+});
+
+// POST /api/voice/tts  { text } -> audio bytes
+//
+// Reads `text` from the parsed request body, forwards it as a form field to
+// `${VOICE_SIDECAR_URL}/tts` and relays the audio with the backend's Content-Type
+// (falling back to audio/wav). Error responses: 503 voice not configured · 400 `text` missing,
+// not a string, or blank · the backend's own status when it reports a failure · 502 backend
+// unreachable.
+router.post('/tts', async (req, res) => {
+  if (!ensureEnabled(res)) return;
+  const text = req.body?.text;
+  // Reject non-strings up front: calling .trim() on e.g. a number would throw inside this
+  // async handler instead of producing a 400.
+  if (typeof text !== 'string' || !text.trim()) {
+    return res.status(400).json({ error: 'text required' });
+  }
+  try {
+    const fd = new FormData();
+    fd.append('text', text);
+    const r = await fetch(`${VOICE_SIDECAR_URL}/tts`, { method: 'POST', body: fd });
+    if (!r.ok) {
+      const errText = await r.text().catch(() => 'tts failed');
+      return res.status(r.status).json({ error: errText });
+    }
+    res.setHeader('Content-Type', r.headers.get('content-type') || 'audio/wav');
+    res.setHeader('Cache-Control', 'no-store');
+    res.send(Buffer.from(await r.arrayBuffer()));
+  } catch (e) {
+    res.status(502).json({ error: `voice backend unreachable: ${e.message}` });
+  }
+});
+
+export default router;
--- a/src/components/chat/hooks/useTts.ts
+++ b/src/components/chat/hooks/useTts.ts
@@ -0,0 +1,88 @@
+import { useCallback, useEffect, useRef, useState } from 'react';
+import { authenticatedFetch } from '../../../utils/api';
+
+// Only one message speaks at a time across the whole app.
+let stopActive: (() => void) | null = null;
+
+export type TtsState = 'idle' | 'loading' | 'playing';
+
+/**
+ * Tap-to-speak for a single message. POSTs the text returned by `getText` to /api/voice/tts
+ * as-is and plays the returned audio. Playback only ever starts from a user gesture, to
+ * satisfy iOS autoplay rules.
+ *
+ * @param getText Returns the text to speak; called each time playback is requested.
+ * @returns `state` ('idle' | 'loading' | 'playing') and `toggle`, which starts playback when
+ *   idle and stops it while loading or playing.
+ */
+export function useTts(getText: () => string) {
+  const [state, setState] = useState<TtsState>('idle');
+  const audioRef = useRef<HTMLAudioElement | null>(null);
+  const urlRef = useRef<string | null>(null);
+
+  // Tears down the current audio element and releases its object URL.
+  const reset = useCallback(() => {
+    if (audioRef.current) {
+      // Detach handlers first so pausing / clearing src cannot re-enter stop().
+      audioRef.current.onended = null;
+      audioRef.current.onerror = null;
+      audioRef.current.pause();
+      audioRef.current.src = '';
+      audioRef.current = null;
+    }
+    if (urlRef.current) {
+      URL.revokeObjectURL(urlRef.current);
+      urlRef.current = null;
+    }
+  }, []);
+
+  const stop = useCallback(() => {
+    reset();
+    setState('idle');
+    stopActive = null;
+  }, [reset]);
+
+  // Cleanup on unmount. Also drop the app-wide handle if it still points at this instance,
+  // so a later play() elsewhere does not call into an unmounted component.
+  useEffect(
+    () => () => {
+      reset();
+      if (stopActive === stop) stopActive = null;
+    },
+    [reset, stop],
+  );
+
+  const play = useCallback(async () => {
+    if (stopActive) stopActive();
+    const text = getText();
+    if (!text || !text.trim()) return;
+
+    // Create + "unlock" the audio element synchronously inside the click gesture,
+    // so iOS Safari lets us play it after the async fetch resolves.
+    const audio = new Audio();
+    audioRef.current = audio;
+    audio.onended = () => stop();
+    audio.onerror = () => stop();
+    try {
+      audio.play().catch(() => {});
+      audio.pause();
+    } catch {
+      /* unlock attempt; ignore */
+    }
+    stopActive = stop;
+    setState('loading');
+
+    try {
+      const res = await authenticatedFetch('/api/voice/tts', {
+        method: 'POST',
+        body: JSON.stringify({ text }),
+      });
+      if (!res.ok) throw new Error(`tts ${res.status}`);
+      const blob = await res.blob();
+      // Stopped or superseded while loading: bail out BEFORE creating the object URL, so it is
+      // neither leaked nor written over the URL owned by a newer playback.
+      if (audioRef.current !== audio) return;
+      const url = URL.createObjectURL(blob);
+      urlRef.current = url;
+      audio.src = url;
+      audio.load();
+      await audio.play();
+      if (audioRef.current !== audio) return;
+      setState('playing');
+    } catch {
+      // A failure of an old request must not tear down a newer playback.
+      if (audioRef.current !== audio) return;
+      reset();
+      setState('idle');
+      if (stopActive === stop) stopActive = null;
+    }
+  }, [getText, reset, stop]);
+
+  const toggle = useCallback(() => {
+    if (state === 'playing' || state === 'loading') stop();
+    else play();
+  }, [state, play, stop]);
+
+  return { state, toggle };
+}
--- a/src/components/chat/hooks/useVoiceAvailable.ts
+++ b/src/components/chat/hooks/useVoiceAvailable.ts
@@ -0,0 +1,38 @@
+import { useEffect, useState } from 'react';
+import { authenticatedFetch } from '../../../utils/api';
+
+// Whether the optional voice feature is configured on the server (VOICE_SIDECAR_URL set).
+// Probed once and cached app-wide so the mic/speak controls can hide themselves when off.
+let cached: boolean | null = null;
+let inflight: Promise<boolean> | null = null;
+
+/**
+ * Asks the server whether voice is configured, sharing one request among concurrent callers.
+ *
+ * A definitive answer (HTTP 2xx) is cached for the lifetime of the page. A failed probe
+ * (network error, or a non-2xx response) resolves to false but is NOT cached, so the next
+ * caller probes again instead of the voice controls staying hidden until a reload.
+ */
+function probe(): Promise<boolean> {
+  if (cached !== null) return Promise.resolve(cached);
+  if (!inflight) {
+    inflight = authenticatedFetch('/api/voice/health')
+      .then(async (r) => {
+        if (!r.ok) return false;
+        const d = await r.json();
+        const enabled = Boolean(d?.enabled);
+        cached = enabled;
+        return enabled;
+      })
+      .catch(() => false)
+      .then((result) => {
+        // Settled either way: allow a fresh request when nothing was cached.
+        inflight = null;
+        return result;
+      });
+  }
+  return inflight;
+}
+
+/**
+ * Reports whether the optional voice feature is enabled on the server.
+ * Starts from the cached answer (false while unknown) and updates once the probe settles.
+ */
+export function useVoiceAvailable(): boolean {
+  const [available, setAvailable] = useState<boolean>(cached ?? false);
+
+  useEffect(() => {
+    let cancelled = false;
+    const apply = (result: boolean) => {
+      // Skip the state update when the component unmounted before the probe settled.
+      if (cancelled) return;
+      setAvailable(result);
+    };
+    probe().then(apply);
+    return () => {
+      cancelled = true;
+    };
+  }, []);
+
+  return available;
+}
--- a/src/components/chat/hooks/useVoiceInput.ts
+++ b/src/components/chat/hooks/useVoiceInput.ts
@@ -0,0 +1,106 @@
+import { useCallback, useEffect, useRef, useState } from 'react';
+import { authenticatedFetch } from '../../../utils/api';
+
+// Mobile-safe recording: iOS Safari 18.4+ supports webm/opus; older iOS needs mp4.
+// Candidates are tried in order; the first supported one wins.
+const MIME_CANDIDATES = [
+  'audio/webm;codecs=opus',
+  'audio/webm',
+  'audio/mp4',
+  'audio/ogg;codecs=opus',
+  'audio/ogg',
+];
+
+/**
+ * Returns the first entry of MIME_CANDIDATES that MediaRecorder reports as supported, or ''
+ * when MediaRecorder is missing or supports none of them.
+ */
+function pickMime(): string {
+  if (typeof MediaRecorder === 'undefined') return '';
+  for (const candidate of MIME_CANDIDATES) {
+    let supported = false;
+    try {
+      supported = MediaRecorder.isTypeSupported(candidate);
+    } catch {
+      /* isTypeSupported can throw on some iOS versions */
+    }
+    if (supported) return candidate;
+  }
+  return '';
+}
+
+export type VoiceInputState = 'idle' | 'recording' | 'transcribing';
+
+/**
+ * Push-to-talk dictation. Records the mic, uploads the clip to /api/voice/transcribe and
+ * hands the trimmed text to `onTranscript`.
+ *
+ * @param onTranscript Called with the transcribed text when it is non-empty.
+ * @param onError Optional; called with a user-facing message when recording or transcription
+ *   fails, the clip is too short, or no speech is detected.
+ * @returns `state` ('idle' | 'recording' | 'transcribing') and `toggle`, which starts
+ *   recording when idle and stops it when recording (no-op while transcribing).
+ */
+export function useVoiceInput(onTranscript: (text: string) => void, onError?: (msg: string) => void) {
+  const [state, setState] = useState<VoiceInputState>('idle');
+  const recorderRef = useRef<MediaRecorder | null>(null);
+  const chunksRef = useRef<Blob[]>([]);
+  const streamRef = useRef<MediaStream | null>(null);
+  const startingRef = useRef(false); // true while a start() call is still setting up
+  const unmountedRef = useRef(false);
+
+  // Always invoke the callbacks from the latest render. The recorder's onstop fires long after
+  // start() ran, so callbacks captured at start time could close over stale props/state
+  // (e.g. an outdated composer value).
+  const onTranscriptRef = useRef(onTranscript);
+  const onErrorRef = useRef(onError);
+  onTranscriptRef.current = onTranscript;
+  onErrorRef.current = onError;
+
+  const stopTracks = useCallback(() => {
+    streamRef.current?.getTracks().forEach((t) => t.stop());
+    streamRef.current = null;
+  }, []);
+
+  // Release the microphone if the component unmounts mid-recording; the clip is discarded.
+  useEffect(() => {
+    unmountedRef.current = false;
+    return () => {
+      unmountedRef.current = true;
+      const rec = recorderRef.current;
+      recorderRef.current = null;
+      if (rec) {
+        rec.ondataavailable = null;
+        rec.onstop = null;
+        try {
+          if (rec.state !== 'inactive') rec.stop();
+        } catch {
+          /* recorder already stopped */
+        }
+      }
+      stopTracks();
+    };
+  }, [stopTracks]);
+
+  const start = useCallback(async () => {
+    // Ignore a second click while getUserMedia is still pending; it would open a second
+    // stream and orphan the first one.
+    if (startingRef.current) return;
+    startingRef.current = true;
+    try {
+      if (!navigator.mediaDevices?.getUserMedia) {
+        onErrorRef.current?.('Microphone unavailable (requires HTTPS or localhost).');
+        return;
+      }
+      const stream = await navigator.mediaDevices.getUserMedia({
+        audio: { echoCancellation: true, noiseSuppression: true },
+      });
+      if (unmountedRef.current) {
+        // Unmounted while the permission prompt was open: hand the mic straight back.
+        stream.getTracks().forEach((t) => t.stop());
+        return;
+      }
+      streamRef.current = stream;
+      const mimeType = pickMime();
+      const rec = mimeType ? new MediaRecorder(stream, { mimeType }) : new MediaRecorder(stream);
+      recorderRef.current = rec;
+      chunksRef.current = [];
+
+      rec.ondataavailable = (e) => {
+        if (e.data.size > 0) chunksRef.current.push(e.data);
+      };
+
+      rec.onstop = async () => {
+        stopTracks();
+        const type = rec.mimeType || 'audio/webm';
+        const blob = new Blob(chunksRef.current, { type });
+        // Clips below 800 bytes are treated as accidental taps and not uploaded.
+        if (blob.size < 800) {
+          setState('idle');
+          onErrorRef.current?.('Recording too short');
+          return;
+        }
+        setState('transcribing');
+        try {
+          const ext = type.includes('mp4') ? 'm4a' : type.includes('ogg') ? 'ogg' : 'webm';
+          const fd = new FormData();
+          fd.append('audio', blob, `recording.${ext}`);
+          const res = await authenticatedFetch('/api/voice/transcribe', { method: 'POST', body: fd });
+          if (!res.ok) throw new Error(`transcribe ${res.status}`);
+          const data = await res.json();
+          const text = String(data?.text || '').trim();
+          if (text) onTranscriptRef.current(text);
+          else onErrorRef.current?.('No speech detected');
+        } catch (e) {
+          onErrorRef.current?.(`Transcription failed: ${e instanceof Error ? e.message : String(e)}`);
+        } finally {
+          setState('idle');
+        }
+      };
+
+      rec.start();
+      setState('recording');
+    } catch (e) {
+      // The stream may already be open if MediaRecorder construction or start() threw.
+      stopTracks();
+      recorderRef.current = null;
+      const err = e as { name?: string; message?: string };
+      let msg = `Mic error: ${err?.message || e}`;
+      if (err?.name === 'NotAllowedError') msg = 'Microphone access denied.';
+      else if (err?.name === 'NotFoundError') msg = 'No microphone found.';
+      onErrorRef.current?.(msg);
+      setState('idle');
+    } finally {
+      startingRef.current = false;
+    }
+  }, [stopTracks]);
+
+  const stop = useCallback(() => {
+    if (recorderRef.current && state === 'recording') recorderRef.current.stop();
+  }, [state]);
+
+  const toggle = useCallback(() => {
+    if (state === 'recording') stop();
+    else if (state === 'idle') start();
+  }, [state, start, stop]);
+
+  return { state, toggle };
+}
--- a/src/components/chat/view/ChatInterface.tsx
+++ b/src/components/chat/view/ChatInterface.tsx
@@ -404,6 +404,7 @@ function ChatInterface({
          renderInputWithMentions={renderInputWithMentions}
          textareaRef={textareaRef}
          input={input}
+          onVoiceTranscript={(text) => setInput(input ? `${input} ${text}` : text)}
          onInputChange={handleInputChange}
          onTextareaClick={handleTextareaClick}
          onTextareaKeyDown={handleKeyDown}
--- a/src/components/chat/view/subcomponents/ChatComposer.tsx
+++ b/src/components/chat/view/subcomponents/ChatComposer.tsx
@@ -26,6 +26,7 @@ import {
 import CommandMenu from './CommandMenu';
 import ClaudeStatus from './ClaudeStatus';
 import ImageAttachment from './ImageAttachment';
+import VoiceInputButton from './VoiceInputButton';
 import PermissionRequestsBanner from './PermissionRequestsBanner';
 import TokenUsageSummary from './TokenUsageSummary';

@@ -89,6 +90,7 @@ interface ChatComposerProps {
  renderInputWithMentions: (text: string) => ReactNode;
  textareaRef: RefObject<HTMLTextAreaElement>;
  input: string;
+  onVoiceTranscript?: (text: string) => void;
  onInputChange: (event: ChangeEvent<HTMLTextAreaElement>) => void;
  onTextareaClick: (event: MouseEvent<HTMLTextAreaElement>) => void;
  onTextareaKeyDown: (event: KeyboardEvent<HTMLTextAreaElement>) => void;
@@ -143,6 +145,7 @@ export default function ChatComposer({
  renderInputWithMentions,
  textareaRef,
  input,
+  onVoiceTranscript,
  onInputChange,
  onTextareaClick,
  onTextareaKeyDown,
@@ -315,6 +318,8 @@ export default function ChatComposer({
              <ImageIcon />
            </PromptInputButton>

+            {onVoiceTranscript && <VoiceInputButton onTranscript={onVoiceTranscript} />}
+
            <button
              type="button"
              onClick={onModeSwitch}
--- a/src/components/chat/view/subcomponents/MessageComponent.tsx
+++ b/src/components/chat/view/subcomponents/MessageComponent.tsx
@@ -15,6 +15,7 @@ import { Reasoning, ReasoningTrigger, ReasoningContent } from '../../../../share

 import { Markdown } from './Markdown';
 import MessageCopyControl from './MessageCopyControl';
+import MessageSpeakControl from './MessageSpeakControl';

 type DiffLine = {
  type: string;
@@ -415,6 +416,9 @@ const MessageComponent = memo(({ message, prevMessage, createDiff, onFileOpen, a
                {shouldShowAssistantCopyControl && (
                  <MessageCopyControl content={assistantCopyContent} messageType="assistant" />
                )}
+                {shouldShowAssistantCopyControl && (
+                  <MessageSpeakControl content={assistantCopyContent} />
+                )}
                {!isGrouped && <span>{formattedTime}</span>}
              </div>
            )}
--- a/src/components/chat/view/subcomponents/MessageSpeakControl.tsx
+++ b/src/components/chat/view/subcomponents/MessageSpeakControl.tsx
@@ -0,0 +1,37 @@
+import { Volume2, Loader2, Square } from 'lucide-react';
+import { useTranslation } from 'react-i18next';
+import { useTts } from '../../hooks/useTts';
+import { useVoiceAvailable } from '../../hooks/useVoiceAvailable';
+
+// Read-aloud button shown next to the copy control on assistant messages.
+// Renders nothing until the voice-availability probe reports the feature as enabled.
+const MessageSpeakControl = ({ content }: { content: string }) => {
+  const { t } = useTranslation('chat');
+  const available = useVoiceAvailable();
+  const { state, toggle } = useTts(() => content);
+
+  if (!available) return null;
+
+  // Label and icon both follow the playback state.
+  let title;
+  let icon;
+  if (state === 'playing') {
+    title = t('voice.stopSpeaking');
+    icon = <Square className="h-3.5 w-3.5" />;
+  } else if (state === 'loading') {
+    title = t('voice.loading');
+    icon = <Loader2 className="h-3.5 w-3.5 animate-spin" />;
+  } else {
+    title = t('voice.speak');
+    icon = <Volume2 className="h-3.5 w-3.5" />;
+  }
+
+  return (
+    <button
+      type="button"
+      onClick={toggle}
+      title={title}
+      aria-label={title}
+      className="inline-flex items-center gap-1 rounded px-1 py-0.5 text-gray-400 transition-colors hover:text-gray-600 dark:text-gray-500 dark:hover:text-gray-300"
+    >
+      {icon}
+    </button>
+  );
+};
+
+export default MessageSpeakControl;
--- a/src/components/chat/view/subcomponents/VoiceInputButton.tsx
+++ b/src/components/chat/view/subcomponents/VoiceInputButton.tsx
@@ -0,0 +1,40 @@
+import { Mic, Square, Loader2 } from 'lucide-react';
+import { useTranslation } from 'react-i18next';
+import { useVoiceInput } from '../../hooks/useVoiceInput';
+import { useVoiceAvailable } from '../../hooks/useVoiceAvailable';
+import { PromptInputButton } from '../../../../shared/view/ui';
+
+type Props = {
+  // Receives the transcribed text once a recording has been processed.
+  onTranscript: (text: string) => void;
+  // Optional sink for user-facing recording/transcription error messages.
+  onError?: (msg: string) => void;
+};
+
+// Push-to-talk mic button. Renders nothing unless the optional voice feature is enabled.
+export default function VoiceInputButton({ onTranscript, onError }: Props) {
+  const { t } = useTranslation('chat');
+  const available = useVoiceAvailable();
+  const { state, toggle } = useVoiceInput(onTranscript, onError);
+
+  if (!available) return null;
+
+  const icon =
+    state === 'recording' ? (
+      <Square className="text-red-500" />
+    ) : state === 'transcribing' ? (
+      <Loader2 className="animate-spin" />
+    ) : (
+      <Mic />
+    );
+
+  // Each state gets its own tooltip; while the spinner shows, the button reports
+  // "Transcribing…" rather than inviting a new recording.
+  const tooltipLabel =
+    state === 'recording'
+      ? t('voice.stopRecording')
+      : state === 'transcribing'
+        ? t('voice.transcribing')
+        : t('voice.input');
+
+  return (
+    <PromptInputButton
+      tooltip={{ content: tooltipLabel }}
+      onClick={(e: { preventDefault: () => void }) => {
+        e.preventDefault();
+        toggle();
+      }}
+    >
+      {icon}
+    </PromptInputButton>
+  );
+}
--- a/src/i18n/locales/en/chat.json
+++ b/src/i18n/locales/en/chat.json
@@ -122,6 +122,14 @@
      }
    }
  },
+  "voice": {
+    "input": "Voice input",
+    "stopRecording": "Stop recording",
+    "transcribing": "Transcribing…",
+    "speak": "Read aloud",
+    "stopSpeaking": "Stop",
+    "loading": "Loading…"
+  },
  "input": {
    "placeholder": "Type / for commands, @ for files, or ask {{provider}} anything...",
    "placeholderDefault": "Type your message...",
--- a/voice-sidecar/.env.example
+++ b/voice-sidecar/.env.example
@@ -0,0 +1,14 @@
+# Voice sidecar config (all optional — these are the defaults).
+# The sidecar binds 127.0.0.1 only; CloudCLI's Express proxy reaches it.
+
+# Port the sidecar listens on (CloudCLI reaches it via VOICE_SIDECAR_URL).
+VOICE_PORT=8765
+
+# faster-whisper model size: tiny | base | small | medium | large-v3
+WHISPER_MODEL_SIZE=base
+# cpu (int8, default) or cuda (float16, needs a CUDA torch in the venv)
+WHISPER_DEVICE=cpu
+
+# Kokoro voice (see https://github.com/hexgrad/kokoro for the full list) and language code.
+KOKORO_VOICE=af_heart
+KOKORO_LANG=a
--- a/voice-sidecar/app.py
+++ b/voice-sidecar/app.py
@@ -0,0 +1,187 @@
+"""
+CloudCLI voice sidecar — local STT (faster-whisper) + local TTS (Kokoro-82M).
+
+Ported from the tooler voice endpoints (D:\\tooler\\backend\\server.py), swapping
+edge-tts -> Kokoro. Bound to 127.0.0.1 only; CloudCLI's Express server proxies to
+it behind JWT auth. Never exposed to the tailnet directly.
+
+Endpoints:
+  GET  /health           -> {status, whisper_loaded, kokoro_loaded}
+  POST /transcribe       (multipart 'audio')        -> {text, duration_ms}
+  POST /tts              (form 'text')              -> audio/wav bytes (cached)
+"""
+import asyncio
+import hashlib
+import logging
+import os
+import re
+import tempfile
+import time
+from pathlib import Path
+
+import numpy as np
+import soundfile as sf
+from fastapi import FastAPI, File, Form, HTTPException, UploadFile
+from fastapi.responses import Response
+
+logging.basicConfig(level=logging.INFO)
+logger = logging.getLogger("voice-sidecar")
+
+# ---- Config (env-overridable) -------------------------------------------------
+# Each value is read once at import time; unset variables fall back to the default shown.
+PORT = int(os.environ.get("VOICE_PORT", "8765"))                  # used by the __main__ launcher
+WHISPER_MODEL_SIZE = os.environ.get("WHISPER_MODEL_SIZE", "base")
+WHISPER_DEVICE = os.environ.get("WHISPER_DEVICE", "cpu").lower()  # "cpu" | "cuda"
+KOKORO_VOICE = os.environ.get("KOKORO_VOICE", "af_heart")
+KOKORO_LANG = os.environ.get("KOKORO_LANG", "a")                  # 'a' = American English
+KOKORO_SR = 24000                                                 # sample rate used when writing TTS audio
+
+# Directory next to this file where generated TTS audio is cached; created on import.
+VOICE_DIR = Path(__file__).with_name("voice_messages")
+VOICE_DIR.mkdir(exist_ok=True)
+
+# ---- Lazy model singletons ----------------------------------------------------
+# Each model starts as None and is filled in by its getter; the lock serialises the first load.
+_whisper = None
+_whisper_lock = asyncio.Lock()
+_kpipe = None
+_kpipe_lock = asyncio.Lock()
+
+
+async def get_whisper():
+    """Return the shared faster-whisper model, loading it on first use.
+
+    The load runs in the default executor so the event loop stays responsive. With
+    WHISPER_DEVICE == "cuda" a float16 CUDA model is tried first; any failure falls back
+    to the int8 CPU model.
+    """
+    global _whisper
+
+    def _build_model():
+        from faster_whisper import WhisperModel
+        if WHISPER_DEVICE == "cuda":
+            try:
+                logger.info("[WHISPER] loading on CUDA (float16)...")
+                return WhisperModel(WHISPER_MODEL_SIZE, device="cuda", compute_type="float16")
+            except Exception as e:  # noqa: BLE001
+                logger.warning("[WHISPER] CUDA failed (%s), falling back to CPU", e)
+        logger.info("[WHISPER] loading '%s' on CPU (int8)", WHISPER_MODEL_SIZE)
+        return WhisperModel(WHISPER_MODEL_SIZE, device="cpu", compute_type="int8")
+
+    if _whisper is None:
+        async with _whisper_lock:
+            # Re-check under the lock: another request may have finished the load meanwhile.
+            if _whisper is None:
+                _whisper = await asyncio.get_event_loop().run_in_executor(None, _build_model)
+                logger.info("[WHISPER] ready")
+    return _whisper
+
+
+async def get_kokoro():
+    """Return the shared Kokoro pipeline for KOKORO_LANG, loading it on first use.
+
+    The load runs in the default executor so the event loop stays responsive.
+    """
+    global _kpipe
+
+    def _build_pipeline():
+        from kokoro import KPipeline
+        logger.info("[KOKORO] loading pipeline (lang=%s)...", KOKORO_LANG)
+        return KPipeline(lang_code=KOKORO_LANG)
+
+    if _kpipe is None:
+        async with _kpipe_lock:
+            # Re-check under the lock: another request may have finished the load meanwhile.
+            if _kpipe is None:
+                _kpipe = await asyncio.get_event_loop().run_in_executor(None, _build_pipeline)
+                logger.info("[KOKORO] ready")
+    return _kpipe
+
+
+# ---- Text cleaning (same rules as tooler's prepare_text_for_tts) ---------------
+def prepare_text_for_tts(text: str) -> str:
+    """Strip/transform markdown so the text reads naturally when spoken.
+
+    Applies the rewrites below in order, then returns the result with surrounding
+    whitespace removed.
+    """
+    rewrites = (
+        (r"```[\s\S]*?```", " code block ", 0),        # code fences -> spoken stub
+        (r"`([^`]+)`", r"\1", 0),                      # unwrap inline code
+        (r"\*\*([^*]+)\*\*", r"\1", 0),                # bold
+        (r"\*([^*]+)\*", r"\1", 0),                    # italic
+        (r"\[([^\]]+)\]\([^)]+\)", r"\1", 0),          # links -> link text
+        (r"^#{1,6}\s+", "", re.MULTILINE),             # headers
+        (r"\s+", " ", 0),                              # collapse whitespace runs
+    )
+    for pattern, replacement, flags in rewrites:
+        text = re.sub(pattern, replacement, text, flags=flags)
+    return text.strip()
+
+
+# ---- App ----------------------------------------------------------------------
+app = FastAPI(title="CloudCLI voice sidecar")
+
+
+@app.get("/health")
+async def health():
+    """Liveness probe; also reports which models have been loaded so far."""
+    loaded = {
+        "whisper_loaded": _whisper is not None,
+        "kokoro_loaded": _kpipe is not None,
+    }
+    return {"status": "ok", **loaded}
+
+
+@app.post("/transcribe")
+async def transcribe(audio: UploadFile = File(...)):
+    """Transcribe an uploaded audio clip with faster-whisper.
+
+    The upload is written to a temporary file (keeping the original extension, default
+    ``.webm``), transcribed in the default executor, and the file is removed afterwards.
+
+    Returns:
+        ``{"text": <transcript>, "duration_ms": <elapsed milliseconds>}``.
+
+    Raises:
+        HTTPException: 400 when the upload is empty; 500 when transcription fails.
+    """
+    start = time.time()
+    suffix = Path(audio.filename or "rec.webm").suffix or ".webm"
+    content = await audio.read()
+    logger.info("[STT] %d bytes (%s)", len(content), audio.content_type)
+    if not content:
+        # An empty upload is a client error; reject it here instead of letting the decoder
+        # fail and surface as a 500.
+        raise HTTPException(status_code=400, detail="Audio cannot be empty")
+
+    tmp_path = None
+    try:
+        with tempfile.NamedTemporaryFile(delete=False, suffix=suffix) as tmp:
+            tmp.write(content)
+            tmp_path = tmp.name
+
+        model = await get_whisper()
+
+        def _run():
+            # Segments are produced lazily, so they are joined here, inside the worker thread.
+            segments, _info = model.transcribe(tmp_path, beam_size=5)
+            return "".join(seg.text for seg in segments).strip()
+
+        text = await asyncio.get_running_loop().run_in_executor(None, _run)
+        duration_ms = int((time.time() - start) * 1000)
+        logger.info("[STT] %dms: %s", duration_ms, text[:100])
+        return {"text": text, "duration_ms": duration_ms}
+    except Exception as e:  # noqa: BLE001
+        logger.error("[STT] failed: %s", e, exc_info=True)
+        raise HTTPException(status_code=500, detail=f"Transcription failed: {e}")
+    finally:
+        # Best-effort cleanup of the temporary upload.
+        if tmp_path and os.path.exists(tmp_path):
+            try:
+                os.unlink(tmp_path)
+            except OSError:
+                pass
+
+
+@app.post("/tts")
+async def tts(text: str = Form(...)):
+    """Synthesize ``text`` with Kokoro and return it as WAV bytes.
+
+    Markdown is cleaned first (see ``prepare_text_for_tts``). Generated audio is cached in
+    VOICE_DIR under a hash of the language, voice and raw text, so identical requests reuse
+    the file and changing KOKORO_VOICE / KOKORO_LANG does not serve audio from another voice.
+
+    Raises:
+        HTTPException: 400 when the text is blank (before or after cleaning) or longer than
+            8000 characters; 500 when synthesis fails.
+    """
+    if not text.strip():
+        raise HTTPException(status_code=400, detail="Text cannot be empty")
+    if len(text) > 8000:
+        raise HTTPException(status_code=400, detail="Text too long (max 8000 chars)")
+
+    start = time.time()
+    clean = prepare_text_for_tts(text)
+    if not clean:
+        # e.g. a lone markdown header marker: nothing is left to speak after cleaning.
+        raise HTTPException(status_code=400, detail="Text cannot be empty")
+
+    key = hashlib.sha256(f"{KOKORO_LANG}\0{KOKORO_VOICE}\0{text}".encode()).hexdigest()[:16]
+    out_path = VOICE_DIR / f"{key}.wav"
+
+    if not out_path.exists():
+        try:
+            pipeline = await get_kokoro()
+
+            def _synth():
+                chunks = [audio for _gs, _ps, audio in pipeline(clean, voice=KOKORO_VOICE)]
+                if not chunks:
+                    raise RuntimeError("Kokoro produced no audio")
+                full = np.concatenate([np.asarray(c, dtype=np.float32) for c in chunks])
+                # Write to a unique temp file and rename into place: a failed or concurrent
+                # synthesis can then never leave (or serve) a half-written cache entry.
+                fd, tmp_name = tempfile.mkstemp(prefix=f"{key}.", suffix=".tmp.wav", dir=str(VOICE_DIR))
+                os.close(fd)
+                try:
+                    sf.write(tmp_name, full, KOKORO_SR)
+                    os.replace(tmp_name, out_path)
+                finally:
+                    # Only still present when the write or rename failed.
+                    if os.path.exists(tmp_name):
+                        os.unlink(tmp_name)
+
+            await asyncio.get_running_loop().run_in_executor(None, _synth)
+            logger.info("[TTS] generated %s in %dms", out_path.name, int((time.time() - start) * 1000))
+        except Exception as e:  # noqa: BLE001
+            logger.error("[TTS] failed: %s", e, exc_info=True)
+            raise HTTPException(status_code=500, detail=f"TTS failed: {e}")
+    else:
+        logger.info("[TTS] cache hit %s", out_path.name)
+
+    return Response(content=out_path.read_bytes(), media_type="audio/wav")
+
+
+if __name__ == "__main__":
+    # Direct launch (`python app.py`): serve on loopback only, on the VOICE_PORT-configured port.
+    from uvicorn import run as serve
+
+    serve(app, host="127.0.0.1", port=PORT, log_level="info")
--- a/voice-sidecar/requirements.txt
+++ b/voice-sidecar/requirements.txt
@@ -0,0 +1,9 @@
+# CloudCLI voice sidecar — STT (faster-whisper) + TTS (Kokoro-82M)
+fastapi>=0.110.0
+uvicorn[standard]>=0.27.0
+python-multipart>=0.0.9
+faster-whisper>=1.0.0
+kokoro>=0.9.4
+misaki[en]>=0.9.4
+soundfile>=0.12.1
+numpy>=1.26.0
--- a/voice-sidecar/test_smoke.py
+++ b/voice-sidecar/test_smoke.py
@@ -0,0 +1,29 @@
+"""Smoke test: Kokoro TTS -> faster-whisper STT round-trip.
+
+Run with `python test_smoke.py`. Exits non-zero when synthesis yields no audio or the
+transcription comes back empty, so it can gate a script or CI step. The intermediate WAV
+goes to a temporary file that is removed afterwards.
+"""
+import os
+import sys
+import tempfile
+import time
+
+import numpy as np
+import soundfile as sf
+
+PHRASE = "Hello, this is a test of the CloudCLI voice sidecar."
+SAMPLE_RATE = 24000  # sample rate the synthesized audio is written with
+
+print("[1/3] Loading Kokoro pipeline...")
+t = time.time()
+from kokoro import KPipeline
+pipe = KPipeline(lang_code="a")
+print(f"      loaded in {time.time()-t:.1f}s")
+
+print("[2/3] Synthesizing...")
+t = time.time()
+chunks = [audio for _gs, _ps, audio in pipe(PHRASE, voice="af_heart")]
+if not chunks:
+    sys.exit("ROUND-TRIP FAILED: Kokoro produced no audio")
+full = np.concatenate([np.asarray(c, dtype=np.float32) for c in chunks])
+fd, wav_path = tempfile.mkstemp(suffix=".wav")
+os.close(fd)
+try:
+    sf.write(wav_path, full, SAMPLE_RATE)
+    dur = len(full) / SAMPLE_RATE
+    print(f"      synth {time.time()-t:.1f}s -> {wav_path} ({dur:.1f}s audio, {len(full)} samples)")
+
+    print("[3/3] Transcribing back with faster-whisper (base, cpu int8)...")
+    t = time.time()
+    from faster_whisper import WhisperModel
+    model = WhisperModel("base", device="cpu", compute_type="int8")
+    segments, _info = model.transcribe(wav_path, beam_size=5)
+    # Join inside the try block: segments are produced lazily and still need the file.
+    text = "".join(s.text for s in segments).strip()
+    print(f"      transcribe {time.time()-t:.1f}s -> {text!r}")
+finally:
+    os.unlink(wav_path)
+
+if not text:
+    sys.exit("\nROUND-TRIP PRODUCED NO TEXT")
+print("\nROUND-TRIP OK")