From d05585e1f4ad6d6af354649ddb4d79f194bbf5c9 Mon Sep 17 00:00:00 2001 From: newsbubbles Date: Mon, 8 Jun 2026 00:47:14 +0100 Subject: [PATCH] feat(voice): add optional speech-to-text input and read-aloud TTS Adds a push-to-talk mic button in the composer and a read-aloud button on assistant messages. Both are opt-in and hidden unless a voice backend is configured via VOICE_SIDECAR_URL. The auth-gated /api/voice proxy forwards to a configurable backend exposing /transcribe and /tts (provider-agnostic); the frontend probes /api/voice/health and hides the controls when disabled. Adds i18n keys and docs/voice.md. Includes a local, no-API-key reference backend in voice-sidecar/ (faster-whisper for STT, Kokoro-82M for TTS, both CPU-capable). --- .gitignore | 7 + docs/voice.md | 57 ++++++ server/index.js | 3 + server/voice-proxy.js | 87 ++++++++ src/components/chat/hooks/useTts.ts | 88 +++++++++ .../chat/hooks/useVoiceAvailable.ts | 38 ++++ src/components/chat/hooks/useVoiceInput.ts | 106 ++++++++++ src/components/chat/view/ChatInterface.tsx | 1 + .../chat/view/subcomponents/ChatComposer.tsx | 5 + .../view/subcomponents/MessageComponent.tsx | 4 + .../subcomponents/MessageSpeakControl.tsx | 37 ++++ .../view/subcomponents/VoiceInputButton.tsx | 40 ++++ src/i18n/locales/en/chat.json | 8 + voice-sidecar/.env.example | 14 ++ voice-sidecar/app.py | 187 ++++++++++++++++++ voice-sidecar/requirements.txt | 9 + voice-sidecar/test_smoke.py | 29 +++ 17 files changed, 720 insertions(+) create mode 100644 docs/voice.md create mode 100644 server/voice-proxy.js create mode 100644 src/components/chat/hooks/useTts.ts create mode 100644 src/components/chat/hooks/useVoiceAvailable.ts create mode 100644 src/components/chat/hooks/useVoiceInput.ts create mode 100644 src/components/chat/view/subcomponents/MessageSpeakControl.tsx create mode 100644 src/components/chat/view/subcomponents/VoiceInputButton.tsx create mode 100644 voice-sidecar/.env.example create mode 100644 voice-sidecar/app.py 
create mode 100644 voice-sidecar/requirements.txt create mode 100644 voice-sidecar/test_smoke.py diff --git a/.gitignore b/.gitignore index e6b7985b..8b7815d7 100755 --- a/.gitignore +++ b/.gitignore @@ -142,3 +142,10 @@ tasks/ # Git worktrees .worktrees/ + +# Voice sidecar (Python) — generated, machine-specific, not committed +voice-sidecar/.venv/ +voice-sidecar/voice_messages/ +voice-sidecar/**/__pycache__/ +*.pyc +*.wav diff --git a/docs/voice.md b/docs/voice.md new file mode 100644 index 00000000..b8e5baec --- /dev/null +++ b/docs/voice.md @@ -0,0 +1,57 @@ +# Voice (optional) + +Adds two opt-in voice features to the chat: + +- **Push-to-talk dictation** — a mic button in the composer records your voice, transcribes it + (speech-to-text), and drops the text into the input. +- **Read-aloud** — a speaker button on each assistant message plays it back (text-to-speech). + +Voice is **disabled by default**. The UI only appears when a voice backend is configured, so it has +zero impact on installs that don't use it. + +## Enable it + +Set `VOICE_SIDECAR_URL` for the server to point at a voice backend, then restart: + +```bash +VOICE_SIDECAR_URL=http://127.0.0.1:8765 npm run server +``` + +When set, `GET /api/voice/health` reports `{ "enabled": true }` and the mic + speaker controls appear. +All voice requests are proxied through the app's authenticated `/api/voice/*` routes, so the backend +itself only needs to listen on localhost and is never exposed directly. + +## Backend contract + +`VOICE_SIDECAR_URL` can point at **any** service that implements two endpoints: + +| Method & path | Request | Response | +|---|---|---| +| `POST /transcribe` | multipart, field `audio` (webm/mp4/wav/…) | `{ "text": "..." }` | +| `POST /tts` | form field `text` | audio bytes (`audio/*`, e.g. wav/mp3) | + +This keeps the feature provider-agnostic — you can back it with the bundled local sidecar, or a cloud +transcription + TTS gateway, as long as it speaks that contract. 
+ +## Reference backend: `voice-sidecar/` + +A local, no-API-key reference implementation using **faster-whisper** (STT) and **Kokoro-82M** (TTS), +both CPU-capable. + +```bash +cd voice-sidecar +python -m venv .venv && . .venv/bin/activate # (Windows: .venv\Scripts\activate) +pip install -r requirements.txt +python -m uvicorn app:app --host 127.0.0.1 --port 8765 +``` + +Then run the app with `VOICE_SIDECAR_URL=http://127.0.0.1:8765`. + +Config (env, all optional) — see `voice-sidecar/.env.example`: `WHISPER_MODEL_SIZE`, `WHISPER_DEVICE` +(`cpu`/`cuda`), `KOKORO_VOICE`, `KOKORO_LANG`, `VOICE_PORT` (honored only by `python app.py`). + +## Notes + +- The first read-aloud is slow (~10–20s) while the model lazy-loads; it's near-instant and cached after. +- Recording needs a secure context (HTTPS or localhost) for microphone access. +- On iOS, playback is tap-initiated (manual read-aloud) to satisfy Safari's autoplay policy. diff --git a/server/index.js b/server/index.js index 58c4ce74..b03d3a81 100755 --- a/server/index.js +++ b/server/index.js @@ -72,6 +72,7 @@ import userRoutes from './routes/user.js'; import geminiRoutes from './routes/gemini.js'; import pluginsRoutes from './routes/plugins.js'; import providerRoutes from './modules/providers/provider.routes.js'; +import voiceRoutes from './voice-proxy.js'; import { startEnabledPluginServers, stopAllPlugins, getPluginPort } from './utils/plugin-process-manager.js'; import { initializeDatabase, projectsDb, sessionsDb } from './modules/database/index.js'; import { configureWebPush } from './services/vapid-keys.js'; @@ -204,6 +205,8 @@ app.use('/api/providers', authenticateToken, providerRoutes); // Agent API Routes (uses API key authentication) app.use('/api/agent', agentRoutes); +app.use('/api/voice', authenticateToken, voiceRoutes); + // Serve public files (like api-docs.html) app.use(express.static(path.join(APP_ROOT, 'public'))); diff --git a/server/voice-proxy.js b/server/voice-proxy.js new file mode 100644 index 00000000..3bdb748a --- /dev/null +++
b/server/voice-proxy.js
@@ -0,0 +1,99 @@
+// Optional voice proxy — forwards speech-to-text / text-to-speech to a configurable backend.
+//
+// Opt-in: voice is DISABLED unless VOICE_SIDECAR_URL is set. When set, it must point at a
+// backend (any implementation) exposing:
+//   POST /transcribe (multipart field 'audio') -> { text }
+//   POST /tts (form field 'text') -> audio bytes (audio/*)
+// A reference backend (local faster-whisper + Kokoro) ships in /voice-sidecar, but any
+// service implementing the two endpoints works (e.g. a cloud transcription + TTS gateway).
+//
+// Mounted at /api/voice behind authenticateToken, so it inherits the app's auth. The backend
+// should bind to localhost and is never exposed directly.
+import express from 'express';
+
+// Base URL of the voice backend with trailing slashes removed; empty string means disabled.
+const VOICE_SIDECAR_URL = (process.env.VOICE_SIDECAR_URL || '').replace(/\/+$/, '');
+const VOICE_ENABLED = Boolean(VOICE_SIDECAR_URL);
+
+const router = express.Router();
+
+// Lazy multer (memory storage) for the audio upload — matches index.js's pattern.
+let _upload = null;
+async function getUpload() {
+  if (!_upload) {
+    const multer = (await import('multer')).default;
+    _upload = multer({
+      storage: multer.memoryStorage(),
+      limits: { fileSize: 25 * 1024 * 1024 }, // 25MB — short dictation clips
+    });
+  }
+  return _upload;
+}
+
+// Sends a 503 and returns false when voice is not configured; callers must stop on false.
+function ensureEnabled(res) {
+  if (!VOICE_ENABLED) {
+    res.status(503).json({ error: 'Voice is not configured. Set VOICE_SIDECAR_URL to enable it.' });
+    return false;
+  }
+  return true;
+}
+
+// GET /api/voice/health -> { enabled } (frontend hides the voice UI when disabled)
+router.get('/health', (_req, res) => res.json({ enabled: VOICE_ENABLED }));
+
+// POST /api/voice/transcribe (multipart 'audio') -> { text }
+router.post('/transcribe', async (req, res) => {
+  if (!ensureEnabled(res)) return;
+  let upload;
+  try {
+    upload = await getUpload();
+  } catch (e) {
+    // A failed dynamic import would otherwise reject this async handler with no response sent.
+    return res.status(500).json({ error: `voice upload unavailable: ${e.message}` });
+  }
+  upload.single('audio')(req, res, async (err) => {
+    if (err) return res.status(400).json({ error: err.message });
+    if (!req.file) return res.status(400).json({ error: 'No audio uploaded' });
+    try {
+      const fd = new FormData();
+      fd.append(
+        'audio',
+        new Blob([req.file.buffer], { type: req.file.mimetype || 'audio/webm' }),
+        req.file.originalname || 'recording.webm',
+      );
+      const r = await fetch(`${VOICE_SIDECAR_URL}/transcribe`, { method: 'POST', body: fd });
+      const data = await r.json().catch(() => ({ error: 'bad voice backend response' }));
+      res.status(r.status).json(data);
+    } catch (e) {
+      res.status(502).json({ error: `voice backend unreachable: ${e.message}` });
+    }
+  });
+});
+
+// POST /api/voice/tts { text } -> audio bytes
+router.post('/tts', async (req, res) => {
+  if (!ensureEnabled(res)) return;
+  const text = req.body?.text;
+  // Check the type first: a non-string `text` (number, array, object) has no .trim() and
+  // would throw inside this async handler instead of producing a 400.
+  if (typeof text !== 'string' || !text.trim()) {
+    return res.status(400).json({ error: 'text required' });
+  }
+  try {
+    const fd = new FormData();
+    fd.append('text', text);
+    const r = await fetch(`${VOICE_SIDECAR_URL}/tts`, { method: 'POST', body: fd });
+    if (!r.ok) {
+      const errText = await r.text().catch(() => 'tts failed');
+      return res.status(r.status).json({ error: errText });
+    }
+    res.setHeader('Content-Type', r.headers.get('content-type') || 'audio/wav');
+    res.setHeader('Cache-Control', 'no-store');
+    res.send(Buffer.from(await r.arrayBuffer()));
+  } catch (e) {
+    res.status(502).json({ error: `voice backend unreachable: ${e.message}` });
+  }
+});
+
+export default router;
diff --git a/src/components/chat/hooks/useTts.ts
b/src/components/chat/hooks/useTts.ts new file mode 100644 index 00000000..46ab0f27 --- /dev/null +++ b/src/components/chat/hooks/useTts.ts @@ -0,0 +1,88 @@ +import { useCallback, useEffect, useRef, useState } from 'react'; +import { authenticatedFetch } from '../../../utils/api'; + +// Only one message speaks at a time across the whole app. +let stopActive: (() => void) | null = null; + +export type TtsState = 'idle' | 'loading' | 'playing'; + +/** + * Tap-to-speak for a single message. Sends raw markdown to /api/voice/tts + * (Kokoro sidecar via the Express proxy; cleaning happens server-side), + * plays the returned audio. Manual-gesture only (v1) to satisfy iOS autoplay. + */ +export function useTts(getText: () => string) { + const [state, setState] = useState('idle'); + const audioRef = useRef(null); + const urlRef = useRef(null); + + const reset = useCallback(() => { + if (audioRef.current) { + audioRef.current.onended = null; + audioRef.current.onerror = null; + audioRef.current.pause(); + audioRef.current.src = ''; + audioRef.current = null; + } + if (urlRef.current) { + URL.revokeObjectURL(urlRef.current); + urlRef.current = null; + } + }, []); + + const stop = useCallback(() => { + reset(); + setState('idle'); + if (stopActive) stopActive = null; + }, [reset]); + + // Cleanup on unmount. + useEffect(() => () => reset(), [reset]); + + const play = useCallback(async () => { + if (stopActive) stopActive(); + const text = getText(); + if (!text || !text.trim()) return; + + // Create + "unlock" the audio element synchronously inside the click gesture, + // so iOS Safari lets us play it after the async fetch resolves. 
+ const audio = new Audio(); + audioRef.current = audio; + audio.onended = () => stop(); + audio.onerror = () => stop(); + try { + audio.play().catch(() => {}); + audio.pause(); + } catch { + /* unlock attempt; ignore */ + } + stopActive = stop; + setState('loading'); + + try { + const res = await authenticatedFetch('/api/voice/tts', { + method: 'POST', + body: JSON.stringify({ text }), + }); + if (!res.ok) throw new Error(`tts ${res.status}`); + const blob = await res.blob(); + const url = URL.createObjectURL(blob); + urlRef.current = url; + if (audioRef.current !== audio) return; // stopped while loading + audio.src = url; + audio.load(); + await audio.play(); + setState('playing'); + } catch { + reset(); + setState('idle'); + } + }, [getText, reset, stop]); + + const toggle = useCallback(() => { + if (state === 'playing' || state === 'loading') stop(); + else play(); + }, [state, play, stop]); + + return { state, toggle }; +} diff --git a/src/components/chat/hooks/useVoiceAvailable.ts b/src/components/chat/hooks/useVoiceAvailable.ts new file mode 100644 index 00000000..463e4ff3 --- /dev/null +++ b/src/components/chat/hooks/useVoiceAvailable.ts @@ -0,0 +1,38 @@ +import { useEffect, useState } from 'react'; +import { authenticatedFetch } from '../../../utils/api'; + +// Whether the optional voice feature is configured on the server (VOICE_SIDECAR_URL set). +// Probed once and cached app-wide so the mic/speak controls can hide themselves when off. +let cached: boolean | null = null; +let inflight: Promise | null = null; + +function probe(): Promise { + if (cached !== null) return Promise.resolve(cached); + if (!inflight) { + inflight = authenticatedFetch('/api/voice/health') + .then((r) => (r.ok ? 
r.json() : { enabled: false })) + .then((d) => { + cached = Boolean(d?.enabled); + return cached; + }) + .catch(() => { + cached = false; + return false; + }); + } + return inflight; +} + +export function useVoiceAvailable(): boolean { + const [available, setAvailable] = useState(cached ?? false); + useEffect(() => { + let mounted = true; + probe().then((v) => { + if (mounted) setAvailable(v); + }); + return () => { + mounted = false; + }; + }, []); + return available; +} diff --git a/src/components/chat/hooks/useVoiceInput.ts b/src/components/chat/hooks/useVoiceInput.ts new file mode 100644 index 00000000..bc83a803 --- /dev/null +++ b/src/components/chat/hooks/useVoiceInput.ts @@ -0,0 +1,106 @@ +import { useCallback, useRef, useState } from 'react'; +import { authenticatedFetch } from '../../../utils/api'; + +// Mobile-safe recording: iOS Safari 18.4+ supports webm/opus; older iOS needs mp4. +const MIME_CANDIDATES = [ + 'audio/webm;codecs=opus', + 'audio/webm', + 'audio/mp4', + 'audio/ogg;codecs=opus', + 'audio/ogg', +]; + +function pickMime(): string { + for (const t of MIME_CANDIDATES) { + try { + if (typeof MediaRecorder !== 'undefined' && MediaRecorder.isTypeSupported(t)) return t; + } catch { + /* isTypeSupported can throw on some iOS versions */ + } + } + return ''; +} + +export type VoiceInputState = 'idle' | 'recording' | 'transcribing'; + +/** + * Push-to-talk dictation. Records the mic, uploads to /api/voice/transcribe + * (faster-whisper sidecar via the Express proxy), returns text via onTranscript. + * Ported from tooler's VoiceInput.js. 
+ */ +export function useVoiceInput(onTranscript: (text: string) => void, onError?: (msg: string) => void) { + const [state, setState] = useState('idle'); + const recorderRef = useRef(null); + const chunksRef = useRef([]); + const streamRef = useRef(null); + + const stopTracks = () => { + streamRef.current?.getTracks().forEach((t) => t.stop()); + streamRef.current = null; + }; + + const start = useCallback(async () => { + try { + const stream = await navigator.mediaDevices.getUserMedia({ + audio: { echoCancellation: true, noiseSuppression: true }, + }); + streamRef.current = stream; + const mimeType = pickMime(); + const rec = mimeType ? new MediaRecorder(stream, { mimeType }) : new MediaRecorder(stream); + recorderRef.current = rec; + chunksRef.current = []; + + rec.ondataavailable = (e) => { + if (e.data.size > 0) chunksRef.current.push(e.data); + }; + + rec.onstop = async () => { + stopTracks(); + const type = rec.mimeType || 'audio/webm'; + const blob = new Blob(chunksRef.current, { type }); + if (blob.size < 800) { + setState('idle'); + onError?.('Recording too short'); + return; + } + setState('transcribing'); + try { + const ext = type.includes('mp4') ? 'm4a' : type.includes('ogg') ? 'ogg' : 'webm'; + const fd = new FormData(); + fd.append('audio', blob, `recording.${ext}`); + const res = await authenticatedFetch('/api/voice/transcribe', { method: 'POST', body: fd }); + if (!res.ok) throw new Error(`transcribe ${res.status}`); + const data = await res.json(); + const text = String(data?.text || '').trim(); + if (text) onTranscript(text); + else onError?.('No speech detected'); + } catch (e) { + onError?.(`Transcription failed: ${e instanceof Error ? 
e.message : String(e)}`); + } finally { + setState('idle'); + } + }; + + rec.start(); + setState('recording'); + } catch (e) { + const err = e as { name?: string; message?: string }; + let msg = `Mic error: ${err?.message || e}`; + if (err?.name === 'NotAllowedError') msg = 'Microphone access denied.'; + else if (err?.name === 'NotFoundError') msg = 'No microphone found.'; + onError?.(msg); + setState('idle'); + } + }, [onTranscript, onError]); + + const stop = useCallback(() => { + if (recorderRef.current && state === 'recording') recorderRef.current.stop(); + }, [state]); + + const toggle = useCallback(() => { + if (state === 'recording') stop(); + else if (state === 'idle') start(); + }, [state, start, stop]); + + return { state, toggle }; +} diff --git a/src/components/chat/view/ChatInterface.tsx b/src/components/chat/view/ChatInterface.tsx index 01ecb68a..df2bcd88 100644 --- a/src/components/chat/view/ChatInterface.tsx +++ b/src/components/chat/view/ChatInterface.tsx @@ -404,6 +404,7 @@ function ChatInterface({ renderInputWithMentions={renderInputWithMentions} textareaRef={textareaRef} input={input} + onVoiceTranscript={(text) => setInput(input ? 
`${input} ${text}` : text)} onInputChange={handleInputChange} onTextareaClick={handleTextareaClick} onTextareaKeyDown={handleKeyDown} diff --git a/src/components/chat/view/subcomponents/ChatComposer.tsx b/src/components/chat/view/subcomponents/ChatComposer.tsx index 4812078b..ada0bca0 100644 --- a/src/components/chat/view/subcomponents/ChatComposer.tsx +++ b/src/components/chat/view/subcomponents/ChatComposer.tsx @@ -26,6 +26,7 @@ import { import CommandMenu from './CommandMenu'; import ClaudeStatus from './ClaudeStatus'; import ImageAttachment from './ImageAttachment'; +import VoiceInputButton from './VoiceInputButton'; import PermissionRequestsBanner from './PermissionRequestsBanner'; import TokenUsageSummary from './TokenUsageSummary'; @@ -89,6 +90,7 @@ interface ChatComposerProps { renderInputWithMentions: (text: string) => ReactNode; textareaRef: RefObject; input: string; + onVoiceTranscript?: (text: string) => void; onInputChange: (event: ChangeEvent) => void; onTextareaClick: (event: MouseEvent) => void; onTextareaKeyDown: (event: KeyboardEvent) => void; @@ -143,6 +145,7 @@ export default function ChatComposer({ renderInputWithMentions, textareaRef, input, + onVoiceTranscript, onInputChange, onTextareaClick, onTextareaKeyDown, @@ -315,6 +318,8 @@ export default function ChatComposer({ + {onVoiceTranscript && } + + ); +}; + +export default MessageSpeakControl; diff --git a/src/components/chat/view/subcomponents/VoiceInputButton.tsx b/src/components/chat/view/subcomponents/VoiceInputButton.tsx new file mode 100644 index 00000000..aeb3585f --- /dev/null +++ b/src/components/chat/view/subcomponents/VoiceInputButton.tsx @@ -0,0 +1,40 @@ +import { Mic, Square, Loader2 } from 'lucide-react'; +import { useTranslation } from 'react-i18next'; +import { useVoiceInput } from '../../hooks/useVoiceInput'; +import { useVoiceAvailable } from '../../hooks/useVoiceAvailable'; +import { PromptInputButton } from '../../../../shared/view/ui'; + +type Props = { + onTranscript: 
(text: string) => void; + onError?: (msg: string) => void; +}; + +// Push-to-talk mic button. Renders nothing unless the optional voice feature is enabled. +export default function VoiceInputButton({ onTranscript, onError }: Props) { + const { t } = useTranslation('chat'); + const available = useVoiceAvailable(); + const { state, toggle } = useVoiceInput(onTranscript, onError); + + if (!available) return null; + + const icon = + state === 'recording' ? ( + + ) : state === 'transcribing' ? ( + + ) : ( + + ); + + return ( + void }) => { + e.preventDefault(); + toggle(); + }} + > + {icon} + + ); +} diff --git a/src/i18n/locales/en/chat.json b/src/i18n/locales/en/chat.json index 8d3f4e93..2766f17c 100644 --- a/src/i18n/locales/en/chat.json +++ b/src/i18n/locales/en/chat.json @@ -122,6 +122,14 @@ } } }, + "voice": { + "input": "Voice input", + "stopRecording": "Stop recording", + "transcribing": "Transcribing…", + "speak": "Read aloud", + "stopSpeaking": "Stop", + "loading": "Loading…" + }, "input": { "placeholder": "Type / for commands, @ for files, or ask {{provider}} anything...", "placeholderDefault": "Type your message...", diff --git a/voice-sidecar/.env.example b/voice-sidecar/.env.example new file mode 100644 index 00000000..92842059 --- /dev/null +++ b/voice-sidecar/.env.example @@ -0,0 +1,14 @@ +# Voice sidecar config (all optional — these are the defaults). +# The sidecar binds 127.0.0.1 only; CloudCLI's Express proxy reaches it. + +# Port the sidecar listens on (CloudCLI reaches it via VOICE_SIDECAR_URL). +VOICE_PORT=8765 + +# faster-whisper model size: tiny | base | small | medium | large-v3 +WHISPER_MODEL_SIZE=base +# cpu (int8, default) or cuda (float16, needs a CUDA torch in the venv) +WHISPER_DEVICE=cpu + +# Kokoro voice (see https://github.com/hexgrad/kokoro for the full list) and language code. 
+KOKORO_VOICE=af_heart
+KOKORO_LANG=a
diff --git a/voice-sidecar/app.py b/voice-sidecar/app.py
new file mode 100644
index 00000000..518f83bf
--- /dev/null
+++ b/voice-sidecar/app.py
@@ -0,0 +1,217 @@
+"""
+CloudCLI voice sidecar — local STT (faster-whisper) + local TTS (Kokoro-82M).
+
+Ported from the tooler voice endpoints (D:\\tooler\\backend\\server.py), swapping
+edge-tts -> Kokoro. Bound to 127.0.0.1 only; CloudCLI's Express server proxies to
+it behind JWT auth. Never exposed to the tailnet directly.
+
+Endpoints:
+  GET /health -> {status, whisper_loaded, kokoro_loaded}
+  POST /transcribe (multipart 'audio') -> {text, duration_ms}
+  POST /tts (form 'text') -> audio/wav bytes (cached)
+"""
+import asyncio
+import hashlib
+import logging
+import os
+import re
+import tempfile
+import time
+from pathlib import Path
+
+import numpy as np
+import soundfile as sf
+from fastapi import FastAPI, File, Form, HTTPException, UploadFile
+from fastapi.responses import Response
+
+logging.basicConfig(level=logging.INFO)
+logger = logging.getLogger("voice-sidecar")
+
+# ---- Config (env-overridable) -------------------------------------------------
+PORT = int(os.getenv("VOICE_PORT", "8765"))
+WHISPER_MODEL_SIZE = os.getenv("WHISPER_MODEL_SIZE", "base")
+WHISPER_DEVICE = os.getenv("WHISPER_DEVICE", "cpu").lower()  # "cpu" | "cuda"
+KOKORO_VOICE = os.getenv("KOKORO_VOICE", "af_heart")
+KOKORO_LANG = os.getenv("KOKORO_LANG", "a")  # 'a' = American English
+KOKORO_SR = 24000
+
+VOICE_DIR = Path(__file__).parent / "voice_messages"
+VOICE_DIR.mkdir(exist_ok=True)
+
+# ---- Lazy model singletons ----------------------------------------------------
+_whisper = None
+_whisper_lock = asyncio.Lock()
+_kpipe = None
+_kpipe_lock = asyncio.Lock()
+
+
+async def get_whisper():
+    """Return the shared WhisperModel, loading it on first use.
+
+    The load runs in the default executor so the event loop stays responsive;
+    the lock plus the second None-check keeps concurrent callers from loading twice.
+    """
+    global _whisper
+    if _whisper is not None:
+        return _whisper
+    async with _whisper_lock:
+        if _whisper is not None:
+            return _whisper
+
+        def _load():
+            from faster_whisper import WhisperModel
+            if WHISPER_DEVICE == "cuda":
+                try:
+                    logger.info("[WHISPER] loading on CUDA (float16)...")
+                    return WhisperModel(WHISPER_MODEL_SIZE, device="cuda", compute_type="float16")
+                except Exception as e:  # noqa: BLE001
+                    logger.warning("[WHISPER] CUDA failed (%s), falling back to CPU", e)
+            logger.info("[WHISPER] loading '%s' on CPU (int8)", WHISPER_MODEL_SIZE)
+            return WhisperModel(WHISPER_MODEL_SIZE, device="cpu", compute_type="int8")
+
+        _whisper = await asyncio.get_running_loop().run_in_executor(None, _load)
+        logger.info("[WHISPER] ready")
+        return _whisper
+
+
+async def get_kokoro():
+    """Return the shared Kokoro KPipeline, loading it on first use in the default executor."""
+    global _kpipe
+    if _kpipe is not None:
+        return _kpipe
+    async with _kpipe_lock:
+        if _kpipe is not None:
+            return _kpipe
+
+        def _load():
+            from kokoro import KPipeline
+            logger.info("[KOKORO] loading pipeline (lang=%s)...", KOKORO_LANG)
+            return KPipeline(lang_code=KOKORO_LANG)
+
+        _kpipe = await asyncio.get_running_loop().run_in_executor(None, _load)
+        logger.info("[KOKORO] ready")
+        return _kpipe
+
+
+# ---- Text cleaning (ported verbatim from tooler prepare_text_for_tts) ---------
+def prepare_text_for_tts(text: str) -> str:
+    """Strip/transform markdown for natural speech."""
+    text = re.sub(r"```[\s\S]*?```", " code block ", text)  # code fences -> spoken stub
+    text = re.sub(r"`([^`]+)`", r"\1", text)  # unwrap inline code
+    text = re.sub(r"\*\*([^*]+)\*\*", r"\1", text)  # bold
+    text = re.sub(r"\*([^*]+)\*", r"\1", text)  # italic
+    text = re.sub(r"\[([^\]]+)\]\([^)]+\)", r"\1", text)  # links -> link text
+    text = re.sub(r"^#{1,6}\s+", "", text, flags=re.MULTILINE)  # headers
+    text = re.sub(r"\s+", " ", text).strip()
+    return text
+
+
+# ---- App ----------------------------------------------------------------------
+app = FastAPI(title="CloudCLI voice sidecar")
+
+
+@app.get("/health")
+async def health():
+    """Report liveness and whether each model has been loaded yet."""
+    return {
+        "status": "ok",
+        "whisper_loaded": _whisper is not None,
+        "kokoro_loaded": _kpipe is not None,
+    }
+
+
+@app.post("/transcribe")
+async def transcribe(audio: UploadFile = File(...)):
+    """Transcribe an uploaded audio clip with faster-whisper.
+
+    Returns {"text", "duration_ms"}; any failure is logged and surfaced as HTTP 500.
+    """
+    start = time.time()
+    suffix = Path(audio.filename or "rec.webm").suffix or ".webm"
+    content = await audio.read()
+    logger.info("[STT] %d bytes (%s)", len(content), audio.content_type)
+
+    tmp_path = None
+    try:
+        with tempfile.NamedTemporaryFile(delete=False, suffix=suffix) as tmp:
+            tmp.write(content)
+            tmp_path = tmp.name
+
+        model = await get_whisper()
+
+        def _run():
+            segments, _info = model.transcribe(tmp_path, beam_size=5)
+            return "".join(seg.text for seg in segments).strip()
+
+        text = await asyncio.get_running_loop().run_in_executor(None, _run)
+        duration_ms = int((time.time() - start) * 1000)
+        logger.info("[STT] %dms: %s", duration_ms, text[:100])
+        return {"text": text, "duration_ms": duration_ms}
+    except Exception as e:  # noqa: BLE001
+        logger.error("[STT] failed: %s", e, exc_info=True)
+        raise HTTPException(status_code=500, detail=f"Transcription failed: {e}") from e
+    finally:
+        if tmp_path and os.path.exists(tmp_path):
+            try:
+                os.unlink(tmp_path)
+            except OSError:
+                pass
+
+
+@app.post("/tts")
+async def tts(text: str = Form(...)):
+    """Synthesize *text* to WAV with Kokoro, caching the result on disk.
+
+    Returns audio/wav bytes. Raises HTTP 400 when the text is empty (before or after
+    markdown stripping) or longer than 8000 chars, and HTTP 500 if synthesis fails.
+    """
+    if not text.strip():
+        raise HTTPException(status_code=400, detail="Text cannot be empty")
+    if len(text) > 8000:
+        raise HTTPException(status_code=400, detail="Text too long (max 8000 chars)")
+
+    start = time.time()
+    clean = prepare_text_for_tts(text)
+    if not clean:
+        # Input was only markdown markers (e.g. "# "); there is nothing left to speak.
+        raise HTTPException(status_code=400, detail="Text cannot be empty")
+
+    # Cache on the RAW text plus the voice/language that shape the audio, so identical
+    # messages reuse audio but changing KOKORO_VOICE/KOKORO_LANG never serves stale files.
+    key = hashlib.sha256("\x00".join((KOKORO_LANG, KOKORO_VOICE, text)).encode()).hexdigest()[:16]
+    out_path = VOICE_DIR / f"{key}.wav"
+
+    if not out_path.exists():
+        try:
+            pipeline = await get_kokoro()
+
+            def _synth():
+                chunks = [audio for _gs, _ps, audio in pipeline(clean, voice=KOKORO_VOICE)]
+                if not chunks:
+                    raise RuntimeError("Kokoro produced no audio")
+                full = np.concatenate([np.asarray(c, dtype=np.float32) for c in chunks])
+                # Write to a temp file and rename into place: a failed or concurrent
+                # write must never leave a truncated WAV that later counts as a cache hit.
+                fd, tmp_name = tempfile.mkstemp(dir=VOICE_DIR, suffix=".wav")
+                os.close(fd)
+                try:
+                    sf.write(tmp_name, full, KOKORO_SR)
+                    os.replace(tmp_name, out_path)
+                finally:
+                    if os.path.exists(tmp_name):
+                        os.unlink(tmp_name)
+
+            await asyncio.get_running_loop().run_in_executor(None, _synth)
+            logger.info("[TTS] generated %s in %dms", out_path.name, int((time.time() - start) * 1000))
+        except Exception as e:  # noqa: BLE001
+            logger.error("[TTS] failed: %s", e, exc_info=True)
+            raise HTTPException(status_code=500, detail=f"TTS failed: {e}") from e
+    else:
+        logger.info("[TTS] cache hit %s", out_path.name)
+
+    return Response(content=out_path.read_bytes(), media_type="audio/wav")
+
+
+if __name__ == "__main__":
+    import uvicorn
+    uvicorn.run(app, host="127.0.0.1", port=PORT, log_level="info")
diff --git a/voice-sidecar/requirements.txt b/voice-sidecar/requirements.txt
new file mode 100644
index 00000000..c37d56e9
--- /dev/null
+++ b/voice-sidecar/requirements.txt
@@ -0,0 +1,9 @@
+# CloudCLI voice sidecar — STT (faster-whisper) + TTS (Kokoro-82M)
+fastapi>=0.110.0
+uvicorn[standard]>=0.27.0
+python-multipart>=0.0.9
+faster-whisper>=1.0.0
+kokoro>=0.9.4
+misaki[en]>=0.9.4
+soundfile>=0.12.1
+numpy>=1.26.0
diff --git a/voice-sidecar/test_smoke.py b/voice-sidecar/test_smoke.py
new file mode 100644
index 00000000..224729fe
--- /dev/null
+++ b/voice-sidecar/test_smoke.py
@@ -0,0 +1,29 @@
+"""Smoke test: Kokoro TTS -> faster-whisper STT round-trip."""
+import time
+import numpy as np
+import soundfile as sf
+
+PHRASE = "Hello, this is a test of the CloudCLI voice sidecar."
+
+print("[1/3] Loading Kokoro pipeline...")
+t = time.time()
+from kokoro import KPipeline
+pipe = KPipeline(lang_code="a")
+print(f" loaded in {time.time()-t:.1f}s")
+
+print("[2/3] Synthesizing...")
+t = time.time()
+chunks = [audio for _gs, _ps, audio in pipe(PHRASE, voice="af_heart")]
+full = np.concatenate([np.asarray(c, dtype=np.float32) for c in chunks])
+sf.write("test.wav", full, 24000)
+dur = len(full) / 24000
+print(f" synth {time.time()-t:.1f}s -> test.wav ({dur:.1f}s audio, {len(full)} samples)")
+
+print("[3/3] Transcribing back with faster-whisper (base, cpu int8)...")
+t = time.time()
+from faster_whisper import WhisperModel
+model = WhisperModel("base", device="cpu", compute_type="int8")
+segments, _info = model.transcribe("test.wav", beam_size=5)
+text = "".join(s.text for s in segments).strip()
+print(f" transcribe {time.time()-t:.1f}s -> {text!r}")
+print("\nROUND-TRIP OK" if text else "\nROUND-TRIP PRODUCED NO TEXT")