feat(voice): add optional speech-to-text input and read-aloud TTS

Adds a push-to-talk mic button in the composer and a read-aloud button on
assistant messages. Both are opt-in and hidden unless a voice backend is
configured via VOICE_SIDECAR_URL.

The auth-gated /api/voice proxy forwards to a configurable backend exposing
/transcribe and /tts (provider-agnostic); the frontend probes /api/voice/health
and hides the controls when disabled. Adds i18n keys and docs/voice.md.

Includes a local, no-API-key reference backend in voice-sidecar/ (faster-whisper
for STT, Kokoro-82M for TTS, both CPU-capable).
This commit is contained in:
newsbubbles
2026-06-08 00:47:14 +01:00
parent af3a28abc7
commit d05585e1f4
17 changed files with 720 additions and 0 deletions

7
.gitignore vendored
View File

@@ -142,3 +142,10 @@ tasks/
# Git worktrees
.worktrees/
# Voice sidecar (Python) — generated, machine-specific, not committed
voice-sidecar/.venv/
voice-sidecar/voice_messages/
voice-sidecar/**/__pycache__/
*.pyc
*.wav

57
docs/voice.md Normal file
View File

@@ -0,0 +1,57 @@
# Voice (optional)
Adds two opt-in voice features to the chat:
- **Push-to-talk dictation** — a mic button in the composer records your voice, transcribes it
(speech-to-text), and drops the text into the input.
- **Read-aloud** — a speaker button on each assistant message plays it back (text-to-speech).
Voice is **disabled by default**. The UI only appears when a voice backend is configured, so it has
zero impact on installs that don't use it.
## Enable it
Set `VOICE_SIDECAR_URL` for the server to point at a voice backend, then restart:
```bash
VOICE_SIDECAR_URL=http://127.0.0.1:8765 npm run server
```
When set, `GET /api/voice/health` reports `{ "enabled": true }` and the mic + speaker controls appear.
All voice requests are proxied through the app's authenticated `/api/voice/*` routes, so the backend
itself only needs to listen on localhost and is never exposed directly.
## Backend contract
`VOICE_SIDECAR_URL` can point at **any** service that implements two endpoints:
| Method & path | Request | Response |
|---|---|---|
| `POST /transcribe` | multipart, field `audio` (webm/mp4/wav/…) | `{ "text": "..." }` |
| `POST /tts` | form field `text` | audio bytes (`audio/*`, e.g. wav/mp3) |
This keeps the feature provider-agnostic — you can back it with the bundled local sidecar, or a cloud
transcription + TTS gateway, as long as it speaks that contract.
## Reference backend: `voice-sidecar/`
A local, no-API-key reference implementation using **faster-whisper** (STT) and **Kokoro-82M** (TTS),
both CPU-capable.
```bash
cd voice-sidecar
python -m venv .venv && . .venv/bin/activate # (Windows: .venv\Scripts\activate)
pip install -r requirements.txt
python -m uvicorn app:app --host 127.0.0.1 --port 8765
```
Then run the app with `VOICE_SIDECAR_URL=http://127.0.0.1:8765`.
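Config (env, all optional) — see `voice-sidecar/.env.example`: `WHISPER_MODEL_SIZE`, `WHISPER_DEVICE`
(`cpu`/`cuda`), `KOKORO_VOICE`, `KOKORO_LANG`, `VOICE_PORT` (used when the sidecar is started with `python app.py`; the `uvicorn` command above sets the port via `--port`).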
## Notes
- The first read-aloud is slow (~10–20s) while the model lazy-loads; after that, synthesis is near-instant and repeated text is served from cache.
- Recording needs a secure context (HTTPS or localhost) for microphone access.
- On iOS, playback is tap-initiated (manual read-aloud) to satisfy Safari's autoplay policy.

View File

@@ -72,6 +72,7 @@ import userRoutes from './routes/user.js';
import geminiRoutes from './routes/gemini.js';
import pluginsRoutes from './routes/plugins.js';
import providerRoutes from './modules/providers/provider.routes.js';
import voiceRoutes from './voice-proxy.js';
import { startEnabledPluginServers, stopAllPlugins, getPluginPort } from './utils/plugin-process-manager.js';
import { initializeDatabase, projectsDb, sessionsDb } from './modules/database/index.js';
import { configureWebPush } from './services/vapid-keys.js';
@@ -204,6 +205,8 @@ app.use('/api/providers', authenticateToken, providerRoutes);
// Agent API Routes (uses API key authentication)
app.use('/api/agent', agentRoutes);
app.use('/api/voice', authenticateToken, voiceRoutes);
// Serve public files (like api-docs.html)
app.use(express.static(path.join(APP_ROOT, 'public')));

87
server/voice-proxy.js Normal file
View File

@@ -0,0 +1,87 @@
// Optional voice proxy — forwards speech-to-text / text-to-speech to a configurable backend.
//
// Opt-in: voice is DISABLED unless VOICE_SIDECAR_URL is set. When set, it must point at a
// backend (any implementation) exposing:
// POST /transcribe (multipart field 'audio') -> { text }
// POST /tts (form field 'text') -> audio bytes (audio/*)
// A reference backend (local faster-whisper + Kokoro) ships in /voice-sidecar, but any
// service implementing the two endpoints works (e.g. a cloud transcription + TTS gateway).
//
// Mounted at /api/voice behind authenticateToken, so it inherits the app's auth. The backend
// should bind to localhost and is never exposed directly.
import express from 'express';
// Base URL of the voice backend, read once at module load. A single trailing slash is
// removed so endpoint paths can be appended as `${VOICE_SIDECAR_URL}/tts`. Empty when unset.
const VOICE_SIDECAR_URL = (process.env.VOICE_SIDECAR_URL || '').replace(/\/$/, '');
// Voice is enabled exactly when a non-empty backend URL is configured.
const VOICE_ENABLED = Boolean(VOICE_SIDECAR_URL);
// Router that carries the /health, /transcribe and /tts routes; exported as the module default.
const router = express.Router();
// Lazy multer (memory storage) for the audio upload — matches index.js's pattern.
// The instance is created on the first call and reused afterwards.
let _upload = null;

/**
 * Returns the shared multer instance, importing multer and building it on first use.
 * Uploads are kept in memory and capped at 25MB per file.
 *
 * @returns {Promise<object>} the configured multer instance
 */
async function getUpload() {
  if (_upload) return _upload;

  const { default: multer } = await import('multer');
  const maxAudioBytes = 25 * 1024 * 1024; // 25MB — short dictation clips
  _upload = multer({
    storage: multer.memoryStorage(),
    limits: { fileSize: maxAudioBytes },
  });
  return _upload;
}
/**
 * Guard for routes that need a configured voice backend.
 * When voice is disabled it answers the request with a 503 JSON error.
 *
 * @param {object} res Express response object
 * @returns {boolean} true when voice is enabled; false after the 503 has been sent
 */
function ensureEnabled(res) {
  if (VOICE_ENABLED) return true;

  const body = { error: 'Voice is not configured. Set VOICE_SIDECAR_URL to enable it.' };
  res.status(503).json(body);
  return false;
}
// GET /api/voice/health -> { enabled } (frontend hides the voice UI when disabled)
// Always answers, even when voice is off, so the client can probe without errors.
router.get('/health', (_req, res) => {
  res.json({ enabled: VOICE_ENABLED });
});
// POST /api/voice/transcribe (multipart 'audio') -> { text }
//
// Parses the uploaded clip into memory with multer, re-wraps it as multipart form data and
// forwards it to `${VOICE_SIDECAR_URL}/transcribe`. The backend's status code and JSON body
// are relayed to the caller.
//
// Responses produced here:
//   503 voice not configured · 500 upload parser could not be loaded
//   413 clip exceeds the multer size limit · 400 other upload errors / no file
//   502 backend unreachable
router.post('/transcribe', async (req, res) => {
  if (!ensureEnabled(res)) return;

  // Loading multer can fail (lazy import). Handle it here so the rejection does not escape
  // this async handler, which would leave the request without a response on Express versions
  // that do not forward rejected handler promises.
  let upload;
  try {
    upload = await getUpload();
  } catch (e) {
    res.status(500).json({ error: `voice upload handler unavailable: ${e.message}` });
    return;
  }

  upload.single('audio')(req, res, async (err) => {
    if (err) {
      // Multer reports an over-limit file with code LIMIT_FILE_SIZE; that is a 413, not a 400.
      const status = err.code === 'LIMIT_FILE_SIZE' ? 413 : 400;
      return res.status(status).json({ error: err.message });
    }
    if (!req.file) return res.status(400).json({ error: 'No audio uploaded' });

    try {
      const fd = new FormData();
      fd.append(
        'audio',
        new Blob([req.file.buffer], { type: req.file.mimetype || 'audio/webm' }),
        req.file.originalname || 'recording.webm',
      );
      const r = await fetch(`${VOICE_SIDECAR_URL}/transcribe`, { method: 'POST', body: fd });
      // A non-JSON backend reply is replaced by a generic error body; the status is still relayed.
      const data = await r.json().catch(() => ({ error: 'bad voice backend response' }));
      res.status(r.status).json(data);
    } catch (e) {
      res.status(502).json({ error: `voice backend unreachable: ${e.message}` });
    }
  });
});
// POST /api/voice/tts { text } -> audio bytes
//
// Forwards `text` as a form field to `${VOICE_SIDECAR_URL}/tts` and relays the audio bytes
// with the backend's Content-Type (defaulting to audio/wav). Backend failures are relayed
// with the backend's status and its body text as `error`.
//
// Responses produced here:
//   503 voice not configured · 400 missing, blank or non-string text · 502 backend unreachable
router.post('/tts', async (req, res) => {
  if (!ensureEnabled(res)) return;

  const text = req.body?.text;
  // The type check matters: a non-string value (e.g. { "text": 123 }) has no .trim(), and the
  // resulting TypeError would be thrown outside the try block below.
  if (typeof text !== 'string' || !text.trim()) {
    return res.status(400).json({ error: 'text required' });
  }

  try {
    const fd = new FormData();
    fd.append('text', text);
    const r = await fetch(`${VOICE_SIDECAR_URL}/tts`, { method: 'POST', body: fd });
    if (!r.ok) {
      const errText = await r.text().catch(() => 'tts failed');
      return res.status(r.status).json({ error: errText });
    }
    res.setHeader('Content-Type', r.headers.get('content-type') || 'audio/wav');
    res.setHeader('Cache-Control', 'no-store');
    res.send(Buffer.from(await r.arrayBuffer()));
  } catch (e) {
    res.status(502).json({ error: `voice backend unreachable: ${e.message}` });
  }
});
export default router;

View File

@@ -0,0 +1,88 @@
import { useCallback, useEffect, useRef, useState } from 'react';
import { authenticatedFetch } from '../../../utils/api';
// Only one message speaks at a time across the whole app.
let stopActive: (() => void) | null = null;
export type TtsState = 'idle' | 'loading' | 'playing';
/**
 * Tap-to-speak for a single message. Sends raw markdown to /api/voice/tts
 * (Kokoro sidecar via the Express proxy; cleaning happens server-side),
 * plays the returned audio. Manual-gesture only (v1) to satisfy iOS autoplay.
 *
 * @param getText Returns the text to speak; read each time playback starts.
 *   Blank text makes `toggle` a no-op.
 * @returns `state` ('idle' | 'loading' | 'playing') and `toggle`, which starts
 *   playback when idle and stops it while loading or playing.
 */
export function useTts(getText: () => string) {
  const [state, setState] = useState<TtsState>('idle');
  const audioRef = useRef<HTMLAudioElement | null>(null);
  const urlRef = useRef<string | null>(null);

  // Tear down the current audio element and release its object URL.
  const reset = useCallback(() => {
    if (audioRef.current) {
      // Detach handlers first so pausing / clearing src cannot re-enter stop().
      audioRef.current.onended = null;
      audioRef.current.onerror = null;
      audioRef.current.pause();
      audioRef.current.src = '';
      audioRef.current = null;
    }
    if (urlRef.current) {
      URL.revokeObjectURL(urlRef.current);
      urlRef.current = null;
    }
  }, []);

  const stop = useCallback(() => {
    reset();
    setState('idle');
    if (stopActive) stopActive = null;
  }, [reset]);

  // Cleanup on unmount.
  useEffect(() => () => reset(), [reset]);

  const play = useCallback(async () => {
    // Silence whichever message is currently speaking (possibly this one).
    if (stopActive) stopActive();
    const text = getText();
    if (!text || !text.trim()) return;

    // Create + "unlock" the audio element synchronously inside the click gesture,
    // so iOS Safari lets us play it after the async fetch resolves.
    const audio = new Audio();
    audioRef.current = audio;
    audio.onended = () => stop();
    audio.onerror = () => stop();
    try {
      audio.play().catch(() => {});
      audio.pause();
    } catch {
      /* unlock attempt; ignore */
    }

    stopActive = stop;
    setState('loading');
    try {
      const res = await authenticatedFetch('/api/voice/tts', {
        method: 'POST',
        body: JSON.stringify({ text }),
      });
      if (!res.ok) throw new Error(`tts ${res.status}`);
      const blob = await res.blob();
      // Stopped (or replaced by a newer play) while loading: bail out BEFORE creating
      // the object URL, otherwise the URL is stored after reset() already ran and is
      // overwritten by the next request without ever being revoked.
      if (audioRef.current !== audio) return;
      const url = URL.createObjectURL(blob);
      urlRef.current = url;
      audio.src = url;
      audio.load();
      await audio.play();
      setState('playing');
    } catch {
      // A failure of a superseded request must not tear down the playback that replaced it.
      if (audioRef.current !== audio) return;
      reset();
      setState('idle');
    }
  }, [getText, reset, stop]);

  const toggle = useCallback(() => {
    if (state === 'playing' || state === 'loading') stop();
    else play();
  }, [state, play, stop]);

  return { state, toggle };
}

View File

@@ -0,0 +1,38 @@
import { useEffect, useState } from 'react';
import { authenticatedFetch } from '../../../utils/api';
// Whether the optional voice feature is configured on the server (VOICE_SIDECAR_URL set).
// Probed once and cached app-wide so the mic/speak controls can hide themselves when off.
let cached: boolean | null = null;
let inflight: Promise<boolean> | null = null;
/**
 * Resolves to whether the server reports voice as enabled.
 * The answer is cached after the first settled request (a failed or non-OK
 * request is cached as `false`), and concurrent callers share one request.
 */
function probe(): Promise<boolean> {
  if (cached !== null) return Promise.resolve(cached);
  if (inflight) return inflight;

  const pending = authenticatedFetch('/api/voice/health');
  const resolveAvailability = async (): Promise<boolean> => {
    let enabled = false;
    try {
      const response = await pending;
      const payload = response.ok ? await response.json() : { enabled: false };
      enabled = Boolean(payload?.enabled);
    } catch {
      enabled = false;
    }
    cached = enabled;
    return enabled;
  };

  inflight = resolveAvailability();
  return inflight;
}
/**
 * React hook reporting whether the voice feature is available.
 * Starts from the cached answer (or `false` when none exists yet) and updates
 * once the probe settles, unless the component has unmounted by then.
 */
export function useVoiceAvailable(): boolean {
  const [available, setAvailable] = useState<boolean>(cached ?? false);

  useEffect(() => {
    let active = true;
    const apply = (enabled: boolean) => {
      if (active) setAvailable(enabled);
    };
    probe().then(apply);

    return () => {
      active = false;
    };
  }, []);

  return available;
}

View File

@@ -0,0 +1,106 @@
import { useCallback, useRef, useState } from 'react';
import { authenticatedFetch } from '../../../utils/api';
// Mobile-safe recording: iOS Safari 18.4+ supports webm/opus; older iOS needs mp4.
// Ordered by preference — the first supported entry wins.
const MIME_CANDIDATES = [
  'audio/webm;codecs=opus',
  'audio/webm',
  'audio/mp4',
  'audio/ogg;codecs=opus',
  'audio/ogg',
];

/**
 * Picks the first candidate MIME type the browser's MediaRecorder supports.
 * Returns '' when MediaRecorder is missing or nothing is supported, in which
 * case the caller lets the browser choose its default format.
 */
function pickMime(): string {
  if (typeof MediaRecorder === 'undefined') return '';

  const supported = MIME_CANDIDATES.find((candidate) => {
    try {
      return MediaRecorder.isTypeSupported(candidate);
    } catch {
      /* isTypeSupported can throw on some iOS versions */
      return false;
    }
  });
  return supported ?? '';
}
export type VoiceInputState = 'idle' | 'recording' | 'transcribing';
/**
 * Push-to-talk dictation. Records the mic, uploads to /api/voice/transcribe
 * (faster-whisper sidecar via the Express proxy), returns text via onTranscript.
 * Ported from tooler's VoiceInput.js.
 *
 * @param onTranscript Called with the trimmed transcript when speech was recognised.
 * @param onError Optional; called with a short message for mic errors, too-short
 *   recordings, empty transcripts and failed uploads.
 * @returns `state` ('idle' | 'recording' | 'transcribing') and `toggle`, which starts
 *   recording when idle, stops it when recording, and does nothing while transcribing.
 */
export function useVoiceInput(onTranscript: (text: string) => void, onError?: (msg: string) => void) {
  const [state, setState] = useState<VoiceInputState>('idle');
  const recorderRef = useRef<MediaRecorder | null>(null);
  const chunksRef = useRef<Blob[]>([]);
  const streamRef = useRef<MediaStream | null>(null);
  // True while start() is awaiting the microphone. `state` stays 'idle' during the
  // permission prompt, so without this flag a second click would open a second stream
  // and orphan the first one with the mic still live.
  const startingRef = useRef(false);

  // Release the microphone (turns off the browser's recording indicator).
  const stopTracks = () => {
    streamRef.current?.getTracks().forEach((t) => t.stop());
    streamRef.current = null;
  };

  const start = useCallback(async () => {
    if (startingRef.current || recorderRef.current?.state === 'recording') return;
    startingRef.current = true;
    try {
      const stream = await navigator.mediaDevices.getUserMedia({
        audio: { echoCancellation: true, noiseSuppression: true },
      });
      streamRef.current = stream;
      const mimeType = pickMime();
      const rec = mimeType ? new MediaRecorder(stream, { mimeType }) : new MediaRecorder(stream);
      recorderRef.current = rec;
      chunksRef.current = [];
      rec.ondataavailable = (e) => {
        if (e.data.size > 0) chunksRef.current.push(e.data);
      };
      rec.onstop = async () => {
        stopTracks();
        const type = rec.mimeType || 'audio/webm';
        const blob = new Blob(chunksRef.current, { type });
        // Tiny blobs are treated as accidental taps and are not uploaded.
        if (blob.size < 800) {
          setState('idle');
          onError?.('Recording too short');
          return;
        }
        setState('transcribing');
        try {
          const ext = type.includes('mp4') ? 'm4a' : type.includes('ogg') ? 'ogg' : 'webm';
          const fd = new FormData();
          fd.append('audio', blob, `recording.${ext}`);
          const res = await authenticatedFetch('/api/voice/transcribe', { method: 'POST', body: fd });
          if (!res.ok) throw new Error(`transcribe ${res.status}`);
          const data = await res.json();
          const text = String(data?.text || '').trim();
          if (text) onTranscript(text);
          else onError?.('No speech detected');
        } catch (e) {
          onError?.(`Transcription failed: ${e instanceof Error ? e.message : String(e)}`);
        } finally {
          setState('idle');
        }
      };
      rec.start();
      setState('recording');
    } catch (e) {
      // If the stream was acquired but MediaRecorder construction or start() threw,
      // the tracks would otherwise stay live because onstop never fires.
      stopTracks();
      recorderRef.current = null;
      const err = e as { name?: string; message?: string };
      let msg = `Mic error: ${err?.message || e}`;
      if (err?.name === 'NotAllowedError') msg = 'Microphone access denied.';
      else if (err?.name === 'NotFoundError') msg = 'No microphone found.';
      onError?.(msg);
      setState('idle');
    } finally {
      startingRef.current = false;
    }
  }, [onTranscript, onError]);

  const stop = useCallback(() => {
    const rec = recorderRef.current;
    // MediaRecorder.stop() throws on an inactive recorder, so check its own state too.
    if (rec && state === 'recording' && rec.state !== 'inactive') rec.stop();
  }, [state]);

  const toggle = useCallback(() => {
    if (state === 'recording') stop();
    else if (state === 'idle') start();
  }, [state, start, stop]);

  return { state, toggle };
}

View File

@@ -404,6 +404,7 @@ function ChatInterface({
renderInputWithMentions={renderInputWithMentions}
textareaRef={textareaRef}
input={input}
onVoiceTranscript={(text) => setInput(input ? `${input} ${text}` : text)}
onInputChange={handleInputChange}
onTextareaClick={handleTextareaClick}
onTextareaKeyDown={handleKeyDown}

View File

@@ -26,6 +26,7 @@ import {
import CommandMenu from './CommandMenu';
import ClaudeStatus from './ClaudeStatus';
import ImageAttachment from './ImageAttachment';
import VoiceInputButton from './VoiceInputButton';
import PermissionRequestsBanner from './PermissionRequestsBanner';
import TokenUsageSummary from './TokenUsageSummary';
@@ -89,6 +90,7 @@ interface ChatComposerProps {
renderInputWithMentions: (text: string) => ReactNode;
textareaRef: RefObject<HTMLTextAreaElement>;
input: string;
onVoiceTranscript?: (text: string) => void;
onInputChange: (event: ChangeEvent<HTMLTextAreaElement>) => void;
onTextareaClick: (event: MouseEvent<HTMLTextAreaElement>) => void;
onTextareaKeyDown: (event: KeyboardEvent<HTMLTextAreaElement>) => void;
@@ -143,6 +145,7 @@ export default function ChatComposer({
renderInputWithMentions,
textareaRef,
input,
onVoiceTranscript,
onInputChange,
onTextareaClick,
onTextareaKeyDown,
@@ -315,6 +318,8 @@ export default function ChatComposer({
<ImageIcon />
</PromptInputButton>
{onVoiceTranscript && <VoiceInputButton onTranscript={onVoiceTranscript} />}
<button
type="button"
onClick={onModeSwitch}

View File

@@ -15,6 +15,7 @@ import { Reasoning, ReasoningTrigger, ReasoningContent } from '../../../../share
import { Markdown } from './Markdown';
import MessageCopyControl from './MessageCopyControl';
import MessageSpeakControl from './MessageSpeakControl';
type DiffLine = {
type: string;
@@ -415,6 +416,9 @@ const MessageComponent = memo(({ message, prevMessage, createDiff, onFileOpen, a
{shouldShowAssistantCopyControl && (
<MessageCopyControl content={assistantCopyContent} messageType="assistant" />
)}
{shouldShowAssistantCopyControl && (
<MessageSpeakControl content={assistantCopyContent} />
)}
{!isGrouped && <span>{formattedTime}</span>}
</div>
)}

View File

@@ -0,0 +1,37 @@
import { Volume2, Loader2, Square } from 'lucide-react';
import { useTranslation } from 'react-i18next';
import { useTts } from '../../hooks/useTts';
import { useVoiceAvailable } from '../../hooks/useVoiceAvailable';
// Tap-to-speak button beside the copy control on assistant messages.
// Renders nothing unless the optional voice feature is enabled.
// Label and icon follow the TTS state: playing -> stop, loading -> spinner, otherwise speaker.
const MessageSpeakControl = ({ content }: { content: string }) => {
  const { t } = useTranslation('chat');
  const available = useVoiceAvailable();
  const { state, toggle } = useTts(() => content);

  // All hooks run above; only the rendering is skipped when voice is off.
  if (!available) return null;

  const isPlaying = state === 'playing';
  const isLoading = state === 'loading';
  const labelKey = isPlaying ? 'voice.stopSpeaking' : isLoading ? 'voice.loading' : 'voice.speak';
  const label = t(labelKey);
  const Icon = isPlaying ? Square : isLoading ? Loader2 : Volume2;
  const iconClassName = isLoading ? 'h-3.5 w-3.5 animate-spin' : 'h-3.5 w-3.5';

  return (
    <button
      type="button"
      onClick={toggle}
      title={label}
      aria-label={label}
      className="inline-flex items-center gap-1 rounded px-1 py-0.5 text-gray-400 transition-colors hover:text-gray-600 dark:text-gray-500 dark:hover:text-gray-300"
    >
      <Icon className={iconClassName} />
    </button>
  );
};
export default MessageSpeakControl;

View File

@@ -0,0 +1,40 @@
import { Mic, Square, Loader2 } from 'lucide-react';
import { useTranslation } from 'react-i18next';
import { useVoiceInput } from '../../hooks/useVoiceInput';
import { useVoiceAvailable } from '../../hooks/useVoiceAvailable';
import { PromptInputButton } from '../../../../shared/view/ui';
type Props = {
onTranscript: (text: string) => void;
onError?: (msg: string) => void;
};
// Push-to-talk mic button. Renders nothing unless the optional voice feature is enabled.
// Icon and tooltip follow the recorder state: recording -> stop square, transcribing ->
// spinner, idle -> microphone.
export default function VoiceInputButton({ onTranscript, onError }: Props) {
  const { t } = useTranslation('chat');
  const available = useVoiceAvailable();
  const { state, toggle } = useVoiceInput(onTranscript, onError);

  // All hooks run above; only the rendering is skipped when voice is off.
  if (!available) return null;

  const icon =
    state === 'recording' ? (
      <Square className="text-red-500" />
    ) : state === 'transcribing' ? (
      <Loader2 className="animate-spin" />
    ) : (
      <Mic />
    );

  // While the clip is being transcribed a click does nothing, so say what is happening
  // instead of advertising "Voice input".
  const tooltipText =
    state === 'recording'
      ? t('voice.stopRecording')
      : state === 'transcribing'
        ? t('voice.transcribing')
        : t('voice.input');

  return (
    <PromptInputButton
      tooltip={{ content: tooltipText }}
      onClick={(e: { preventDefault: () => void }) => {
        e.preventDefault();
        toggle();
      }}
    >
      {icon}
    </PromptInputButton>
  );
}

View File

@@ -122,6 +122,14 @@
}
}
},
"voice": {
"input": "Voice input",
"stopRecording": "Stop recording",
"transcribing": "Transcribing…",
"speak": "Read aloud",
"stopSpeaking": "Stop",
"loading": "Loading…"
},
"input": {
"placeholder": "Type / for commands, @ for files, or ask {{provider}} anything...",
"placeholderDefault": "Type your message...",

View File

@@ -0,0 +1,14 @@
# Voice sidecar config (all optional — these are the defaults).
# The sidecar binds 127.0.0.1 only; CloudCLI's Express proxy reaches it.
# Port the sidecar listens on (CloudCLI reaches it via VOICE_SIDECAR_URL).
VOICE_PORT=8765
# faster-whisper model size: tiny | base | small | medium | large-v3
WHISPER_MODEL_SIZE=base
# cpu (int8, default) or cuda (float16, needs a CUDA torch in the venv)
WHISPER_DEVICE=cpu
# Kokoro voice (see https://github.com/hexgrad/kokoro for the full list) and language code.
KOKORO_VOICE=af_heart
KOKORO_LANG=a

187
voice-sidecar/app.py Normal file
View File

@@ -0,0 +1,187 @@
"""
CloudCLI voice sidecar — local STT (faster-whisper) + local TTS (Kokoro-82M).
Ported from the tooler voice endpoints (D:\\tooler\\backend\\server.py), swapping
edge-tts -> Kokoro. Bound to 127.0.0.1 only; CloudCLI's Express server proxies to
it behind JWT auth. Never exposed to the tailnet directly.
Endpoints:
GET /health -> {status, whisper_loaded, kokoro_loaded}
POST /transcribe (multipart 'audio') -> {text, duration_ms}
POST /tts (form 'text') -> audio/wav bytes (cached)
"""
import asyncio
import hashlib
import logging
import os
import re
import tempfile
import time
from pathlib import Path
import numpy as np
import soundfile as sf
from fastapi import FastAPI, File, Form, HTTPException, UploadFile
from fastapi.responses import Response
# Log at INFO so model loading, cache hits and request timings are visible.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger("voice-sidecar")
# ---- Config (env-overridable) -------------------------------------------------
# Listening port; only read by the `__main__` entry point at the bottom of this file.
PORT = int(os.getenv("VOICE_PORT", "8765"))
# Model size name passed to faster-whisper's WhisperModel.
WHISPER_MODEL_SIZE = os.getenv("WHISPER_MODEL_SIZE", "base")
# Lower-cased so "CUDA" and "cuda" are treated alike; anything other than "cuda" loads on CPU.
WHISPER_DEVICE = os.getenv("WHISPER_DEVICE", "cpu").lower() # "cpu" | "cuda"
# Voice name passed to the Kokoro pipeline on every synthesis call.
KOKORO_VOICE = os.getenv("KOKORO_VOICE", "af_heart")
# Language code passed to KPipeline when the pipeline is created.
KOKORO_LANG = os.getenv("KOKORO_LANG", "a") # 'a' = American English
# Sample rate (Hz) used when writing synthesized audio to WAV.
KOKORO_SR = 24000
# Directory next to this file where generated WAV files are cached; created at import time.
VOICE_DIR = Path(__file__).parent / "voice_messages"
VOICE_DIR.mkdir(exist_ok=True)
# ---- Lazy model singletons ----------------------------------------------------
# Both models are created on first use; each lock serialises the first load of its model
# so concurrent first requests do not load it twice.
_whisper = None
_whisper_lock = asyncio.Lock()
_kpipe = None
_kpipe_lock = asyncio.Lock()
async def get_whisper():
    """Return the shared faster-whisper model, loading it on first use.

    The load runs in the default executor so the event loop stays responsive.
    When WHISPER_DEVICE is "cuda" a float16 CUDA model is tried first; if that
    raises, the model is loaded on CPU with int8 instead.
    """
    global _whisper
    if _whisper is None:
        async with _whisper_lock:
            # Re-check under the lock: another request may have finished loading meanwhile.
            if _whisper is None:

                def _build_model():
                    from faster_whisper import WhisperModel

                    wants_cuda = WHISPER_DEVICE == "cuda"
                    if wants_cuda:
                        try:
                            logger.info("[WHISPER] loading on CUDA (float16)...")
                            return WhisperModel(WHISPER_MODEL_SIZE, device="cuda", compute_type="float16")
                        except Exception as exc:  # noqa: BLE001
                            logger.warning("[WHISPER] CUDA failed (%s), falling back to CPU", exc)
                    logger.info("[WHISPER] loading '%s' on CPU (int8)", WHISPER_MODEL_SIZE)
                    return WhisperModel(WHISPER_MODEL_SIZE, device="cpu", compute_type="int8")

                loop = asyncio.get_running_loop()
                _whisper = await loop.run_in_executor(None, _build_model)
                logger.info("[WHISPER] ready")
    return _whisper
async def get_kokoro():
    """Return the shared Kokoro pipeline, creating it on first use.

    Construction runs in the default executor so the event loop stays responsive.
    The pipeline is built for the language given by KOKORO_LANG.
    """
    global _kpipe
    if _kpipe is None:
        async with _kpipe_lock:
            # Re-check under the lock: another request may have finished loading meanwhile.
            if _kpipe is None:

                def _build_pipeline():
                    from kokoro import KPipeline

                    logger.info("[KOKORO] loading pipeline (lang=%s)...", KOKORO_LANG)
                    return KPipeline(lang_code=KOKORO_LANG)

                loop = asyncio.get_running_loop()
                _kpipe = await loop.run_in_executor(None, _build_pipeline)
                logger.info("[KOKORO] ready")
    return _kpipe
# ---- Text cleaning (based on tooler prepare_text_for_tts) ----------------------
def prepare_text_for_tts(text: str) -> str:
    """Strip/transform markdown so the text reads naturally when spoken.

    Fenced code blocks become the words "code block"; inline code, bold, italic,
    links and images are reduced to their visible text; list bullets and header
    markers are dropped; all whitespace runs collapse to single spaces.

    Args:
        text: Raw markdown.

    Returns:
        The cleaned single-line string (may be empty).
    """
    text = re.sub(r"```[\s\S]*?```", " code block ", text)  # code fences -> spoken stub
    text = re.sub(r"`([^`]+)`", r"\1", text)  # unwrap inline code
    # Drop list bullets before touching emphasis. Otherwise "* a\n* b\n* c" is seen by the
    # italic rule as "*...*" pairs spanning lines and an odd bullet is left behind as "*".
    # A marker only counts as a bullet when whitespace follows it, so "*word*" is untouched.
    text = re.sub(r"^[ \t]*[-*+][ \t]+", "", text, flags=re.MULTILINE)
    text = re.sub(r"\*\*([^*]+)\*\*", r"\1", text)  # bold
    text = re.sub(r"\*([^*]+)\*", r"\1", text)  # italic
    # Images must be handled before links, or "![alt](url)" would leave a stray "!".
    text = re.sub(r"!\[([^\]]*)\]\([^)]+\)", r"\1", text)  # images -> alt text
    text = re.sub(r"\[([^\]]+)\]\([^)]+\)", r"\1", text)  # links -> link text
    text = re.sub(r"^#{1,6}\s+", "", text, flags=re.MULTILINE)  # headers
    text = re.sub(r"\s+", " ", text).strip()
    return text
# ---- App ----------------------------------------------------------------------
app = FastAPI(title="CloudCLI voice sidecar")
@app.get("/health")
async def health():
    """Liveness probe that also reports which models have been loaded so far."""
    whisper_ready = _whisper is not None
    kokoro_ready = _kpipe is not None
    return {
        "status": "ok",
        "whisper_loaded": whisper_ready,
        "kokoro_loaded": kokoro_ready,
    }
@app.post("/transcribe")
async def transcribe(audio: UploadFile = File(...)):
    """Transcribe an uploaded audio clip with faster-whisper.

    The upload is written to a temporary file (keeping the original extension,
    ".webm" when there is none), transcribed in the default executor, and the
    temporary file is removed afterwards.

    Returns:
        {"text": <transcript>, "duration_ms": <elapsed time for the request>}

    Raises:
        HTTPException: 500 with the failure message if any step fails.
    """
    started = time.time()
    extension = Path(audio.filename or "rec.webm").suffix or ".webm"
    payload = await audio.read()
    logger.info("[STT] %d bytes (%s)", len(payload), audio.content_type)

    tmp_path = None
    try:
        with tempfile.NamedTemporaryFile(delete=False, suffix=extension) as handle:
            handle.write(payload)
            tmp_path = handle.name

        model = await get_whisper()

        def _decode():
            segments, _info = model.transcribe(tmp_path, beam_size=5)
            pieces = [segment.text for segment in segments]
            return "".join(pieces).strip()

        loop = asyncio.get_running_loop()
        text = await loop.run_in_executor(None, _decode)
        elapsed_ms = int((time.time() - started) * 1000)
        logger.info("[STT] %dms: %s", elapsed_ms, text[:100])
        return {"text": text, "duration_ms": elapsed_ms}
    except Exception as exc:  # noqa: BLE001
        logger.error("[STT] failed: %s", exc, exc_info=True)
        raise HTTPException(status_code=500, detail=f"Transcription failed: {exc}")
    finally:
        # Best-effort removal of the temporary file; a failed delete is not an error.
        if tmp_path and os.path.exists(tmp_path):
            try:
                os.unlink(tmp_path)
            except OSError:
                pass
@app.post("/tts")
async def tts(text: str = Form(...)):
    """Synthesize speech for `text` with Kokoro and return it as WAV bytes.

    Results are cached on disk in VOICE_DIR. The cache key covers the raw text
    plus the configured voice and language, so changing KOKORO_VOICE or
    KOKORO_LANG does not serve audio generated with the previous settings.

    Args:
        text: Raw (markdown) text, at most 8000 characters.

    Returns:
        A Response carrying the WAV bytes with media type audio/wav.

    Raises:
        HTTPException: 400 for blank text, text over 8000 characters, or text with
            nothing left to speak after markdown cleaning; 500 if synthesis fails.
    """
    if not text.strip():
        raise HTTPException(status_code=400, detail="Text cannot be empty")
    if len(text) > 8000:
        raise HTTPException(status_code=400, detail="Text too long (max 8000 chars)")

    start = time.time()
    clean = prepare_text_for_tts(text)
    if not clean:
        # e.g. a lone "# ": not blank, but nothing remains to speak after cleaning.
        raise HTTPException(status_code=400, detail="Text has no speakable content")

    # Identical messages reuse audio, but only for the same voice and language.
    cache_id = f"{KOKORO_LANG}\x00{KOKORO_VOICE}\x00{text}"
    key = hashlib.sha256(cache_id.encode()).hexdigest()[:16]
    out_path = VOICE_DIR / f"{key}.wav"

    if not out_path.exists():
        try:
            pipeline = await get_kokoro()

            def _synth():
                chunks = [audio for _gs, _ps, audio in pipeline(clean, voice=KOKORO_VOICE)]
                if not chunks:
                    raise RuntimeError("Kokoro produced no audio")
                full = np.concatenate([np.asarray(c, dtype=np.float32) for c in chunks])
                # Write to a temporary file and rename into place, so a concurrent request
                # for the same text (or a failed write) never sees a partial cache file.
                fd, tmp_name = tempfile.mkstemp(dir=str(VOICE_DIR), suffix=".tmp")
                os.close(fd)
                try:
                    # The temp name has no .wav extension, so the format is given explicitly.
                    sf.write(tmp_name, full, KOKORO_SR, format="WAV")
                    os.replace(tmp_name, out_path)
                finally:
                    # Still present only if the write or rename failed.
                    if os.path.exists(tmp_name):
                        try:
                            os.unlink(tmp_name)
                        except OSError:
                            pass

            await asyncio.get_event_loop().run_in_executor(None, _synth)
            logger.info("[TTS] generated %s in %dms", out_path.name, int((time.time() - start) * 1000))
        except Exception as e:  # noqa: BLE001
            logger.error("[TTS] failed: %s", e, exc_info=True)
            raise HTTPException(status_code=500, detail=f"TTS failed: {e}")
    else:
        logger.info("[TTS] cache hit %s", out_path.name)

    return Response(content=out_path.read_bytes(), media_type="audio/wav")
# Direct-run entry point (`python app.py`): serves on the loopback interface only,
# on the port taken from VOICE_PORT.
if __name__ == "__main__":
    import uvicorn
    uvicorn.run(app, host="127.0.0.1", port=PORT, log_level="info")

View File

@@ -0,0 +1,9 @@
# CloudCLI voice sidecar — STT (faster-whisper) + TTS (Kokoro-82M)
fastapi>=0.110.0
uvicorn[standard]>=0.27.0
python-multipart>=0.0.9
faster-whisper>=1.0.0
kokoro>=0.9.4
misaki[en]>=0.9.4
soundfile>=0.12.1
numpy>=1.26.0

View File

@@ -0,0 +1,29 @@
"""Smoke test: Kokoro TTS -> faster-whisper STT round-trip."""
import time
import numpy as np
import soundfile as sf
# Phrase that is synthesized and then transcribed back.
PHRASE = "Hello, this is a test of the CloudCLI voice sidecar."
# Step 1: build the Kokoro pipeline. The import sits inside the timed section on purpose,
# so the reported load time includes importing the package.
print("[1/3] Loading Kokoro pipeline...")
t = time.time()
from kokoro import KPipeline
pipe = KPipeline(lang_code="a")
print(f" loaded in {time.time()-t:.1f}s")
# Step 2: synthesize the phrase and write it to test.wav in the current directory (24 kHz).
print("[2/3] Synthesizing...")
t = time.time()
chunks = [audio for _gs, _ps, audio in pipe(PHRASE, voice="af_heart")]
full = np.concatenate([np.asarray(c, dtype=np.float32) for c in chunks])
sf.write("test.wav", full, 24000)
dur = len(full) / 24000
print(f" synth {time.time()-t:.1f}s -> test.wav ({dur:.1f}s audio, {len(full)} samples)")
# Step 3: transcribe the generated file; the timing covers the import and model load too.
print("[3/3] Transcribing back with faster-whisper (base, cpu int8)...")
t = time.time()
from faster_whisper import WhisperModel
model = WhisperModel("base", device="cpu", compute_type="int8")
segments, _info = model.transcribe("test.wav", beam_size=5)
text = "".join(s.text for s in segments).strip()
print(f" transcribe {time.time()-t:.1f}s -> {text!r}")
# Passes whenever any text comes back; the transcript is not compared against PHRASE.
print("\nROUND-TRIP OK" if text else "\nROUND-TRIP PRODUCED NO TEXT")