Files
claudecodeui/voice-sidecar/app.py
newsbubbles d05585e1f4 feat(voice): add optional speech-to-text input and read-aloud TTS
Adds a push-to-talk mic button in the composer and a read-aloud button on
assistant messages. Both are opt-in and hidden unless a voice backend is
configured via VOICE_SIDECAR_URL.

The auth-gated /api/voice proxy forwards to a configurable backend exposing
/transcribe and /tts (provider-agnostic); the frontend probes /api/voice/health
and hides the controls when disabled. Adds i18n keys and docs/voice.md.

Includes a local, no-API-key reference backend in voice-sidecar/ (faster-whisper
for STT, Kokoro-82M for TTS, both CPU-capable).
2026-06-08 00:48:24 +01:00

188 lines
6.7 KiB
Python

"""
CloudCLI voice sidecar — local STT (faster-whisper) + local TTS (Kokoro-82M).
Ported from the tooler voice endpoints (D:\\tooler\\backend\\server.py), swapping
edge-tts -> Kokoro. Bound to 127.0.0.1 only; CloudCLI's Express server proxies to
it behind JWT auth. Never exposed to the tailnet directly.
Endpoints:
GET /health -> {status, whisper_loaded, kokoro_loaded}
POST /transcribe (multipart 'audio') -> {text, duration_ms}
POST /tts (form 'text') -> audio/wav bytes (cached)
"""
import asyncio
import hashlib
import logging
import os
import re
import tempfile
import time
from pathlib import Path
import numpy as np
import soundfile as sf
from fastapi import FastAPI, File, Form, HTTPException, UploadFile
from fastapi.responses import Response
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger("voice-sidecar")
# ---- Config (env-overridable) -------------------------------------------------
PORT = int(os.getenv("VOICE_PORT", "8765"))
WHISPER_MODEL_SIZE = os.getenv("WHISPER_MODEL_SIZE", "base")
WHISPER_DEVICE = os.getenv("WHISPER_DEVICE", "cpu").lower() # "cpu" | "cuda"
KOKORO_VOICE = os.getenv("KOKORO_VOICE", "af_heart")
KOKORO_LANG = os.getenv("KOKORO_LANG", "a") # 'a' = American English
KOKORO_SR = 24000
VOICE_DIR = Path(__file__).parent / "voice_messages"
VOICE_DIR.mkdir(exist_ok=True)
# ---- Lazy model singletons ----------------------------------------------------
_whisper = None
_whisper_lock = asyncio.Lock()
_kpipe = None
_kpipe_lock = asyncio.Lock()
async def get_whisper():
global _whisper
if _whisper is not None:
return _whisper
async with _whisper_lock:
if _whisper is not None:
return _whisper
def _load():
from faster_whisper import WhisperModel
if WHISPER_DEVICE == "cuda":
try:
logger.info("[WHISPER] loading on CUDA (float16)...")
return WhisperModel(WHISPER_MODEL_SIZE, device="cuda", compute_type="float16")
except Exception as e: # noqa: BLE001
logger.warning("[WHISPER] CUDA failed (%s), falling back to CPU", e)
logger.info("[WHISPER] loading '%s' on CPU (int8)", WHISPER_MODEL_SIZE)
return WhisperModel(WHISPER_MODEL_SIZE, device="cpu", compute_type="int8")
_whisper = await asyncio.get_event_loop().run_in_executor(None, _load)
logger.info("[WHISPER] ready")
return _whisper
async def get_kokoro():
global _kpipe
if _kpipe is not None:
return _kpipe
async with _kpipe_lock:
if _kpipe is not None:
return _kpipe
def _load():
from kokoro import KPipeline
logger.info("[KOKORO] loading pipeline (lang=%s)...", KOKORO_LANG)
return KPipeline(lang_code=KOKORO_LANG)
_kpipe = await asyncio.get_event_loop().run_in_executor(None, _load)
logger.info("[KOKORO] ready")
return _kpipe
# ---- Text cleaning (ported verbatim from tooler prepare_text_for_tts) ---------
def prepare_text_for_tts(text: str) -> str:
"""Strip/transform markdown for natural speech."""
text = re.sub(r"```[\s\S]*?```", " code block ", text) # code fences -> spoken stub
text = re.sub(r"`([^`]+)`", r"\1", text) # unwrap inline code
text = re.sub(r"\*\*([^*]+)\*\*", r"\1", text) # bold
text = re.sub(r"\*([^*]+)\*", r"\1", text) # italic
text = re.sub(r"\[([^\]]+)\]\([^)]+\)", r"\1", text) # links -> link text
text = re.sub(r"^#{1,6}\s+", "", text, flags=re.MULTILINE) # headers
text = re.sub(r"\s+", " ", text).strip()
return text
# ---- App ----------------------------------------------------------------------
app = FastAPI(title="CloudCLI voice sidecar")
@app.get("/health")
async def health():
return {
"status": "ok",
"whisper_loaded": _whisper is not None,
"kokoro_loaded": _kpipe is not None,
}
@app.post("/transcribe")
async def transcribe(audio: UploadFile = File(...)):
start = time.time()
suffix = Path(audio.filename or "rec.webm").suffix or ".webm"
content = await audio.read()
logger.info("[STT] %d bytes (%s)", len(content), audio.content_type)
tmp_path = None
try:
with tempfile.NamedTemporaryFile(delete=False, suffix=suffix) as tmp:
tmp.write(content)
tmp_path = tmp.name
model = await get_whisper()
def _run():
segments, _info = model.transcribe(tmp_path, beam_size=5)
return "".join(seg.text for seg in segments).strip()
text = await asyncio.get_event_loop().run_in_executor(None, _run)
duration_ms = int((time.time() - start) * 1000)
logger.info("[STT] %dms: %s", duration_ms, text[:100])
return {"text": text, "duration_ms": duration_ms}
except Exception as e: # noqa: BLE001
logger.error("[STT] failed: %s", e, exc_info=True)
raise HTTPException(status_code=500, detail=f"Transcription failed: {e}")
finally:
if tmp_path and os.path.exists(tmp_path):
try:
os.unlink(tmp_path)
except OSError:
pass
@app.post("/tts")
async def tts(text: str = Form(...)):
if not text.strip():
raise HTTPException(status_code=400, detail="Text cannot be empty")
if len(text) > 8000:
raise HTTPException(status_code=400, detail="Text too long (max 8000 chars)")
start = time.time()
clean = prepare_text_for_tts(text)
# Cache on the RAW text hash (matches tooler) so identical messages reuse audio.
key = hashlib.sha256(text.encode()).hexdigest()[:16]
out_path = VOICE_DIR / f"{key}.wav"
if not out_path.exists():
try:
pipeline = await get_kokoro()
def _synth():
chunks = [audio for _gs, _ps, audio in pipeline(clean, voice=KOKORO_VOICE)]
if not chunks:
raise RuntimeError("Kokoro produced no audio")
full = np.concatenate([np.asarray(c, dtype=np.float32) for c in chunks])
sf.write(str(out_path), full, KOKORO_SR)
await asyncio.get_event_loop().run_in_executor(None, _synth)
logger.info("[TTS] generated %s in %dms", out_path.name, int((time.time() - start) * 1000))
except Exception as e: # noqa: BLE001
logger.error("[TTS] failed: %s", e, exc_info=True)
raise HTTPException(status_code=500, detail=f"TTS failed: {e}")
else:
logger.info("[TTS] cache hit %s", out_path.name)
return Response(content=out_path.read_bytes(), media_type="audio/wav")
if __name__ == "__main__":
import uvicorn
uvicorn.run(app, host="127.0.0.1", port=PORT, log_level="info")