Files
claudecodeui/voice-sidecar/test_smoke.py
newsbubbles d05585e1f4 feat(voice): add optional speech-to-text input and read-aloud TTS
Adds a push-to-talk mic button in the composer and a read-aloud button on
assistant messages. Both are opt-in and hidden unless a voice backend is
configured via VOICE_SIDECAR_URL.

The auth-gated /api/voice proxy forwards to a configurable backend exposing
/transcribe and /tts (provider-agnostic); the frontend probes /api/voice/health
and hides the controls when disabled. Adds i18n keys and docs/voice.md.

Includes a local, no-API-key reference backend in voice-sidecar/ (faster-whisper
for STT, Kokoro-82M for TTS, both CPU-capable).
2026-06-08 00:48:24 +01:00

30 lines
1.1 KiB
Python

"""Smoke test: Kokoro TTS -> faster-whisper STT round-trip."""
import time
import numpy as np
import soundfile as sf
PHRASE = "Hello, this is a test of the CloudCLI voice sidecar."
# Rate (Hz) used both to write the WAV file and to compute the audio duration.
SAMPLE_RATE = 24000
# Intermediate file handed from the TTS step to the STT step.
WAV_PATH = "test.wav"


def concat_audio(chunks):
    """Join synthesized audio chunks into a single float32 array.

    Args:
        chunks: Sequence of array-likes, one per synthesized chunk.

    Returns:
        The chunks converted to ``float32`` and concatenated in order.

    Raises:
        RuntimeError: If ``chunks`` is empty, i.e. nothing was synthesized.
    """
    arrays = [np.asarray(chunk, dtype=np.float32) for chunk in chunks]
    if not arrays:
        # np.concatenate would fail with an opaque ValueError here.
        raise RuntimeError("Kokoro produced no audio chunks")
    return np.concatenate(arrays)


def synthesize(phrase, wav_path):
    """Synthesize ``phrase`` with Kokoro and write the audio to ``wav_path``.

    Prints progress and timings for the load and synthesis steps.

    Returns:
        The concatenated float32 samples that were written.
    """
    print("[1/3] Loading Kokoro pipeline...")
    started = time.perf_counter()
    # Imported here, not at file top, so the import cost is part of the
    # reported load time and importing this module stays cheap.
    from kokoro import KPipeline

    pipe = KPipeline(lang_code="a")
    print(f" loaded in {time.perf_counter() - started:.1f}s")

    print("[2/3] Synthesizing...")
    started = time.perf_counter()
    full = concat_audio(
        [audio for _gs, _ps, audio in pipe(phrase, voice="af_heart")]
    )
    sf.write(wav_path, full, SAMPLE_RATE)
    duration = len(full) / SAMPLE_RATE
    elapsed = time.perf_counter() - started
    print(
        f" synth {elapsed:.1f}s -> {wav_path} "
        f"({duration:.1f}s audio, {len(full)} samples)"
    )
    return full


def transcribe(wav_path):
    """Transcribe ``wav_path`` with faster-whisper and return the text.

    Uses the ``base`` model on CPU with int8 compute. Prints progress and the
    elapsed time together with the transcription.

    Returns:
        The stripped transcription; an empty string if nothing was recognized.
    """
    print("[3/3] Transcribing back with faster-whisper (base, cpu int8)...")
    started = time.perf_counter()
    # Imported here so the import cost is part of the reported timing.
    from faster_whisper import WhisperModel

    model = WhisperModel("base", device="cpu", compute_type="int8")
    segments, _info = model.transcribe(wav_path, beam_size=5)
    # Join before reading the clock so consuming the segments is timed too.
    text = "".join(segment.text for segment in segments).strip()
    print(f" transcribe {time.perf_counter() - started:.1f}s -> {text!r}")
    return text


def main():
    """Run the TTS -> STT round-trip.

    Returns:
        Process exit status: 0 if the transcription is non-empty, 1 otherwise.
    """
    synthesize(PHRASE, WAV_PATH)
    text = transcribe(WAV_PATH)
    if not text:
        print("\nROUND-TRIP PRODUCED NO TEXT")
        return 1
    print("\nROUND-TRIP OK")
    return 0


# Guarded so that importing this module (e.g. a test runner collecting
# test_*.py files) does not load the models or write the WAV file.
if __name__ == "__main__":
    raise SystemExit(main())