mirror of
https://github.com/siteboon/claudecodeui.git
synced 2026-06-25 04:13:51 +08:00
feat(voice): add optional speech-to-text input and read-aloud TTS
Adds a push-to-talk mic button in the composer and a read-aloud button on assistant messages. Both are opt-in and hidden unless a voice backend is configured via VOICE_SIDECAR_URL. The auth-gated /api/voice proxy forwards to a configurable backend exposing /transcribe and /tts (provider-agnostic); the frontend probes /api/voice/health and hides the controls when disabled. Adds i18n keys and docs/voice.md. Includes a local, no-API-key reference backend in voice-sidecar/ (faster-whisper for STT, Kokoro-82M for TTS, both CPU-capable).
This commit is contained in:
7
.gitignore
vendored
7
.gitignore
vendored
@@ -142,3 +142,10 @@ tasks/
|
||||
|
||||
# Git worktrees
|
||||
.worktrees/
|
||||
|
||||
# Voice sidecar (Python) — generated, machine-specific, not committed
|
||||
voice-sidecar/.venv/
|
||||
voice-sidecar/voice_messages/
|
||||
voice-sidecar/**/__pycache__/
|
||||
*.pyc
|
||||
*.wav
|
||||
|
||||
57
docs/voice.md
Normal file
57
docs/voice.md
Normal file
@@ -0,0 +1,57 @@
|
||||
# Voice (optional)
|
||||
|
||||
Adds two opt-in voice features to the chat:
|
||||
|
||||
- **Push-to-talk dictation** — a mic button in the composer records your voice, transcribes it
|
||||
(speech-to-text), and drops the text into the input.
|
||||
- **Read-aloud** — a speaker button on each assistant message plays it back (text-to-speech).
|
||||
|
||||
Voice is **disabled by default**. The UI only appears when a voice backend is configured, so it has
|
||||
zero impact on installs that don't use it.
|
||||
|
||||
## Enable it
|
||||
|
||||
Set `VOICE_SIDECAR_URL` for the server to point at a voice backend, then restart:
|
||||
|
||||
```bash
|
||||
VOICE_SIDECAR_URL=http://127.0.0.1:8765 npm run server
|
||||
```
|
||||
|
||||
When set, `GET /api/voice/health` reports `{ "enabled": true }` and the mic + speaker controls appear.
|
||||
All voice requests are proxied through the app's authenticated `/api/voice/*` routes, so the backend
|
||||
itself only needs to listen on localhost and is never exposed directly.
|
||||
|
||||
## Backend contract
|
||||
|
||||
`VOICE_SIDECAR_URL` can point at **any** service that implements two endpoints:
|
||||
|
||||
| Method & path | Request | Response |
|
||||
|---|---|---|
|
||||
| `POST /transcribe` | multipart, field `audio` (webm/mp4/wav/…) | `{ "text": "..." }` |
|
||||
| `POST /tts` | form field `text` | audio bytes (`audio/*`, e.g. wav/mp3) |
|
||||
|
||||
This keeps the feature provider-agnostic — you can back it with the bundled local sidecar, or a cloud
|
||||
transcription + TTS gateway, as long as it speaks that contract.
|
||||
|
||||
## Reference backend: `voice-sidecar/`
|
||||
|
||||
A local, no-API-key reference implementation using **faster-whisper** (STT) and **Kokoro-82M** (TTS),
|
||||
both CPU-capable.
|
||||
|
||||
```bash
|
||||
cd voice-sidecar
|
||||
python -m venv .venv && . .venv/bin/activate # (Windows: .venv\Scripts\activate)
|
||||
pip install -r requirements.txt
|
||||
python -m uvicorn app:app --host 127.0.0.1 --port 8765
|
||||
```
|
||||
|
||||
Then run the app with `VOICE_SIDECAR_URL=http://127.0.0.1:8765`.
|
||||
|
||||
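Config (env, all optional) — see `voice-sidecar/.env.example`: `WHISPER_MODEL_SIZE`, `WHISPER_DEVICE`
(`cpu`/`cuda`), `KOKORO_VOICE`, `KOKORO_LANG`, `VOICE_PORT`. Note that `VOICE_PORT` is only read when the
sidecar is started with `python app.py`; the `uvicorn` command above takes its port from `--port`.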
|
||||
|
||||
## Notes
|
||||
|
||||
- The first read-aloud is slow (~10–20s) while the model lazy-loads; it's near-instant and cached after.
|
||||
- Recording needs a secure context (HTTPS or localhost) for microphone access.
|
||||
- On iOS, playback is tap-initiated (manual read-aloud) to satisfy Safari's autoplay policy.
|
||||
@@ -72,6 +72,7 @@ import userRoutes from './routes/user.js';
|
||||
import geminiRoutes from './routes/gemini.js';
|
||||
import pluginsRoutes from './routes/plugins.js';
|
||||
import providerRoutes from './modules/providers/provider.routes.js';
|
||||
import voiceRoutes from './voice-proxy.js';
|
||||
import { startEnabledPluginServers, stopAllPlugins, getPluginPort } from './utils/plugin-process-manager.js';
|
||||
import { initializeDatabase, projectsDb, sessionsDb } from './modules/database/index.js';
|
||||
import { configureWebPush } from './services/vapid-keys.js';
|
||||
@@ -204,6 +205,8 @@ app.use('/api/providers', authenticateToken, providerRoutes);
|
||||
// Agent API Routes (uses API key authentication)
|
||||
app.use('/api/agent', agentRoutes);
|
||||
|
||||
app.use('/api/voice', authenticateToken, voiceRoutes);
|
||||
|
||||
// Serve public files (like api-docs.html)
|
||||
app.use(express.static(path.join(APP_ROOT, 'public')));
|
||||
|
||||
|
||||
87
server/voice-proxy.js
Normal file
87
server/voice-proxy.js
Normal file
@@ -0,0 +1,87 @@
|
||||
// Optional voice proxy — forwards speech-to-text / text-to-speech to a configurable backend.
|
||||
//
|
||||
// Opt-in: voice is DISABLED unless VOICE_SIDECAR_URL is set. When set, it must point at a
|
||||
// backend (any implementation) exposing:
|
||||
// POST /transcribe (multipart field 'audio') -> { text }
|
||||
// POST /tts (form field 'text') -> audio bytes (audio/*)
|
||||
// A reference backend (local faster-whisper + Kokoro) ships in /voice-sidecar, but any
|
||||
// service implementing the two endpoints works (e.g. a cloud transcription + TTS gateway).
|
||||
//
|
||||
// Mounted at /api/voice behind authenticateToken, so it inherits the app's auth. The backend
|
||||
// should bind to localhost and is never exposed directly.
|
||||
import express from 'express';
|
||||
|
||||
const VOICE_SIDECAR_URL = (process.env.VOICE_SIDECAR_URL || '').replace(/\/$/, '');
|
||||
const VOICE_ENABLED = Boolean(VOICE_SIDECAR_URL);
|
||||
|
||||
const router = express.Router();
|
||||
|
||||
// Lazy multer (memory storage) for the audio upload — matches index.js's pattern.
|
||||
let _upload = null;
|
||||
/**
 * Returns the shared multer instance, building it the first time it is needed.
 *
 * multer is pulled in with a dynamic import inside this function, so the module is
 * only loaded once an upload is actually handled. Files are kept in memory
 * (memoryStorage) and capped at 25 MB each.
 *
 * @returns {Promise<object>} the cached multer instance
 */
async function getUpload() {
  if (_upload) return _upload;

  const { default: multer } = await import('multer');
  const storage = multer.memoryStorage();
  const limits = { fileSize: 25 * 1024 * 1024 }; // 25MB — short dictation clips
  _upload = multer({ storage, limits });
  return _upload;
}
|
||||
|
||||
/**
 * Route guard for the voice endpoints.
 *
 * When voice is disabled (VOICE_ENABLED is false) it answers the request itself
 * with HTTP 503 and a JSON error, so the caller must simply return.
 *
 * @param {object} res - Express response object
 * @returns {boolean} true when the caller may proceed; false when a 503 was already sent
 */
function ensureEnabled(res) {
  if (VOICE_ENABLED) return true;

  const error = 'Voice is not configured. Set VOICE_SIDECAR_URL to enable it.';
  res.status(503).json({ error });
  return false;
}
|
||||
|
||||
// GET /api/voice/health -> { enabled } (frontend hides the voice UI when disabled)
|
||||
// Registered without the ensureEnabled guard: it answers with { enabled: false } rather than a
// 503 when voice is off, which is what lets the frontend probe it to decide whether to show the UI.
router.get('/health', (_req, res) => res.json({ enabled: VOICE_ENABLED }));
|
||||
|
||||
// POST /api/voice/transcribe (multipart 'audio') -> { text }
//
// Buffers the uploaded clip in memory via multer, re-posts it to the backend's /transcribe
// endpoint as multipart field 'audio', and relays the backend's status code and JSON body.
//
// Responses produced here (anything else is relayed from the backend):
//   503  voice is disabled
//   500  the upload handler (multer) could not be initialised
//   400  multer rejected the upload, or no 'audio' file was sent
//   502  the backend could not be reached
router.post('/transcribe', async (req, res) => {
  if (!ensureEnabled(res)) return;

  let upload;
  try {
    upload = await getUpload();
  } catch (e) {
    // getUpload() does a dynamic import; if that fails this async handler would reject
    // without ever answering the request.
    // NOTE(review): Express 4 does not forward rejected handler promises to the error
    // middleware — confirm which Express major version this app runs.
    res.status(500).json({ error: `voice upload unavailable: ${e.message}` });
    return;
  }

  upload.single('audio')(req, res, async (err) => {
    if (err) return res.status(400).json({ error: err.message });
    if (!req.file) return res.status(400).json({ error: 'No audio uploaded' });

    try {
      const fd = new FormData();
      fd.append(
        'audio',
        new Blob([req.file.buffer], { type: req.file.mimetype || 'audio/webm' }),
        req.file.originalname || 'recording.webm',
      );
      const r = await fetch(`${VOICE_SIDECAR_URL}/transcribe`, { method: 'POST', body: fd });
      // A non-JSON backend reply is replaced by a generic error body; the status is still relayed.
      const data = await r.json().catch(() => ({ error: 'bad voice backend response' }));
      res.status(r.status).json(data);
    } catch (e) {
      res.status(502).json({ error: `voice backend unreachable: ${e.message}` });
    }
  });
});
|
||||
|
||||
// POST /api/voice/tts { text } -> audio bytes
//
// Forwards `text` to the backend's /tts endpoint as a form field and relays the audio it
// returns (Content-Type taken from the backend, defaulting to audio/wav; never cached).
//
// Responses produced here:
//   503  voice is disabled
//   400  `text` is missing, not a string, or blank
//   502  the backend could not be reached
// A non-2xx backend reply is relayed with the backend's status and its body as `error`.
router.post('/tts', async (req, res) => {
  if (!ensureEnabled(res)) return;

  const text = req.body?.text;
  // The typeof guard matters: a JSON body such as { "text": 5 } or { "text": ["a"] } has no
  // usable .trim(), and the resulting TypeError would be thrown outside the try block below.
  if (typeof text !== 'string' || !text.trim()) {
    return res.status(400).json({ error: 'text required' });
  }

  try {
    const fd = new FormData();
    fd.append('text', text);
    const r = await fetch(`${VOICE_SIDECAR_URL}/tts`, { method: 'POST', body: fd });
    if (!r.ok) {
      const errText = await r.text().catch(() => 'tts failed');
      return res.status(r.status).json({ error: errText });
    }
    res.setHeader('Content-Type', r.headers.get('content-type') || 'audio/wav');
    res.setHeader('Cache-Control', 'no-store');
    res.send(Buffer.from(await r.arrayBuffer()));
  } catch (e) {
    res.status(502).json({ error: `voice backend unreachable: ${e.message}` });
  }
});
|
||||
|
||||
export default router;
|
||||
88
src/components/chat/hooks/useTts.ts
Normal file
88
src/components/chat/hooks/useTts.ts
Normal file
@@ -0,0 +1,88 @@
|
||||
import { useCallback, useEffect, useRef, useState } from 'react';
|
||||
import { authenticatedFetch } from '../../../utils/api';
|
||||
|
||||
// Only one message speaks at a time across the whole app.
|
||||
let stopActive: (() => void) | null = null;
|
||||
|
||||
export type TtsState = 'idle' | 'loading' | 'playing';
|
||||
|
||||
/**
 * Tap-to-speak for a single message.
 *
 * Sends the raw message text to POST /api/voice/tts and plays the audio that comes
 * back; any markdown cleanup is left to the voice backend. Playback is only ever
 * started from a user gesture (v1) to satisfy iOS autoplay rules, and only one
 * message speaks at a time across the app (tracked by the module-level `stopActive`).
 *
 * @param getText - returns the text to speak; called when playback is requested
 * @returns `state` ('idle' | 'loading' | 'playing') and `toggle`, which starts
 *          playback when idle and stops it when loading or playing
 */
export function useTts(getText: () => string) {
  const [state, setState] = useState<TtsState>('idle');
  const audioRef = useRef<HTMLAudioElement | null>(null);
  const urlRef = useRef<string | null>(null);

  // Tear down the current audio element and release its object URL.
  const reset = useCallback(() => {
    if (audioRef.current) {
      // Detach handlers first so clearing `src` below cannot re-enter stop().
      audioRef.current.onended = null;
      audioRef.current.onerror = null;
      audioRef.current.pause();
      audioRef.current.src = '';
      audioRef.current = null;
    }
    if (urlRef.current) {
      URL.revokeObjectURL(urlRef.current);
      urlRef.current = null;
    }
  }, []);

  const stop = useCallback(() => {
    reset();
    setState('idle');
    // Release the app-wide slot only if this instance is the one holding it.
    if (stopActive === stop) stopActive = null;
  }, [reset]);

  // Cleanup on unmount: stop audio and drop the global reference to this instance,
  // so a later play() elsewhere does not call into an unmounted component.
  useEffect(
    () => () => {
      reset();
      if (stopActive === stop) stopActive = null;
    },
    [reset, stop],
  );

  const play = useCallback(async () => {
    if (stopActive) stopActive();
    const text = getText();
    if (!text || !text.trim()) return;

    // Create + "unlock" the audio element synchronously inside the click gesture,
    // so iOS Safari lets us play it after the async fetch resolves.
    const audio = new Audio();
    audioRef.current = audio;
    audio.onended = () => stop();
    audio.onerror = () => stop();
    try {
      audio.play().catch(() => {});
      audio.pause();
    } catch {
      /* unlock attempt; ignore */
    }
    stopActive = stop;
    setState('loading');

    // True once this attempt was stopped or replaced by a newer play(); a superseded
    // attempt must not touch refs or state that now belong to the newer one.
    const superseded = () => audioRef.current !== audio;

    try {
      const res = await authenticatedFetch('/api/voice/tts', {
        method: 'POST',
        body: JSON.stringify({ text }),
      });
      if (!res.ok) throw new Error(`tts ${res.status}`);
      const blob = await res.blob();
      // Check before creating the object URL so a stopped request never allocates
      // (and then leaks) one.
      if (superseded()) return;
      const url = URL.createObjectURL(blob);
      urlRef.current = url;
      audio.src = url;
      audio.load();
      await audio.play();
      if (superseded()) return; // stopped while play() was settling
      setState('playing');
    } catch {
      // A failure of an old request must not tear down a newer playback attempt.
      if (superseded()) return;
      stop();
    }
  }, [getText, stop]);

  const toggle = useCallback(() => {
    if (state === 'playing' || state === 'loading') stop();
    else play();
  }, [state, play, stop]);

  return { state, toggle };
}
|
||||
38
src/components/chat/hooks/useVoiceAvailable.ts
Normal file
38
src/components/chat/hooks/useVoiceAvailable.ts
Normal file
@@ -0,0 +1,38 @@
|
||||
import { useEffect, useState } from 'react';
|
||||
import { authenticatedFetch } from '../../../utils/api';
|
||||
|
||||
// Whether the optional voice feature is configured on the server (VOICE_SIDECAR_URL set).
|
||||
// Probed once and cached app-wide so the mic/speak controls can hide themselves when off.
|
||||
let cached: boolean | null = null;
|
||||
let inflight: Promise<boolean> | null = null;
|
||||
|
||||
/**
 * Asks the server once whether voice is enabled and memoises the answer.
 *
 * - A settled answer (`cached`) is returned immediately.
 * - While a request is in flight, every caller shares the same promise.
 * - A non-OK response or any failure is recorded as "disabled".
 */
function probe(): Promise<boolean> {
  if (cached !== null) return Promise.resolve(cached);
  if (inflight) return inflight;

  // Store the outcome in the module-level cache and hand it back.
  const remember = (value: boolean): boolean => {
    cached = value;
    return value;
  };

  inflight = authenticatedFetch('/api/voice/health')
    .then((response) => (response.ok ? response.json() : { enabled: false }))
    .then((payload) => remember(Boolean(payload?.enabled)))
    .catch(() => remember(false));

  return inflight;
}
|
||||
|
||||
/**
 * React hook reporting whether the optional voice feature is enabled on the server.
 *
 * Starts from the cached probe result (or `false` when nothing is cached yet) and
 * updates once the probe settles, unless the component has unmounted by then.
 */
export function useVoiceAvailable(): boolean {
  const [available, setAvailable] = useState<boolean>(cached ?? false);

  useEffect(() => {
    let active = true;

    const apply = (value: boolean) => {
      if (active) setAvailable(value);
    };
    probe().then(apply);

    return () => {
      active = false;
    };
  }, []);

  return available;
}
|
||||
106
src/components/chat/hooks/useVoiceInput.ts
Normal file
106
src/components/chat/hooks/useVoiceInput.ts
Normal file
@@ -0,0 +1,106 @@
|
||||
import { useCallback, useRef, useState } from 'react';
|
||||
import { authenticatedFetch } from '../../../utils/api';
|
||||
|
||||
// Mobile-safe recording: iOS Safari 18.4+ supports webm/opus; older iOS needs mp4.
|
||||
const MIME_CANDIDATES = [
|
||||
'audio/webm;codecs=opus',
|
||||
'audio/webm',
|
||||
'audio/mp4',
|
||||
'audio/ogg;codecs=opus',
|
||||
'audio/ogg',
|
||||
];
|
||||
|
||||
/**
 * Picks the first entry of MIME_CANDIDATES that MediaRecorder reports as supported.
 *
 * @returns the chosen MIME type, or '' when MediaRecorder is unavailable or none of
 *          the candidates is supported (the caller then lets the browser choose)
 */
function pickMime(): string {
  if (typeof MediaRecorder === 'undefined') return '';

  const isSupported = (mime: string): boolean => {
    try {
      return MediaRecorder.isTypeSupported(mime);
    } catch {
      /* isTypeSupported can throw on some iOS versions */
      return false;
    }
  };

  return MIME_CANDIDATES.find(isSupported) ?? '';
}
|
||||
|
||||
export type VoiceInputState = 'idle' | 'recording' | 'transcribing';
|
||||
|
||||
/**
 * Push-to-talk dictation. Records the mic, uploads the clip to
 * POST /api/voice/transcribe as multipart field `audio`, and hands the resulting
 * text to `onTranscript`. Ported from tooler's VoiceInput.js.
 *
 * @param onTranscript - called with the trimmed transcript when speech was recognised
 * @param onError - optional; called with a human-readable message on mic errors,
 *                  too-short recordings, empty transcripts and failed uploads
 * @returns `state` ('idle' | 'recording' | 'transcribing') and `toggle`, which starts
 *          recording when idle and stops it when recording
 */
export function useVoiceInput(onTranscript: (text: string) => void, onError?: (msg: string) => void) {
  const [state, setState] = useState<VoiceInputState>('idle');
  const recorderRef = useRef<MediaRecorder | null>(null);
  const chunksRef = useRef<Blob[]>([]);
  const streamRef = useRef<MediaStream | null>(null);
  // True while start() is awaiting getUserMedia. `state` is still 'idle' during the
  // permission prompt, so without this a second tap would open a second stream and
  // orphan the first recorder with the mic still live.
  const startingRef = useRef(false);

  // Release the microphone (only touches refs, so a stale closure is harmless).
  const stopTracks = () => {
    streamRef.current?.getTracks().forEach((t) => t.stop());
    streamRef.current = null;
  };

  const start = useCallback(async () => {
    if (startingRef.current) return;
    startingRef.current = true;
    try {
      const stream = await navigator.mediaDevices.getUserMedia({
        audio: { echoCancellation: true, noiseSuppression: true },
      });
      streamRef.current = stream;
      const mimeType = pickMime();
      const rec = mimeType ? new MediaRecorder(stream, { mimeType }) : new MediaRecorder(stream);
      recorderRef.current = rec;
      chunksRef.current = [];

      rec.ondataavailable = (e) => {
        if (e.data.size > 0) chunksRef.current.push(e.data);
      };

      rec.onstop = async () => {
        stopTracks();
        const type = rec.mimeType || 'audio/webm';
        const blob = new Blob(chunksRef.current, { type });
        // Clips under 800 bytes are treated as an accidental tap and not uploaded.
        if (blob.size < 800) {
          setState('idle');
          onError?.('Recording too short');
          return;
        }
        setState('transcribing');
        try {
          const ext = type.includes('mp4') ? 'm4a' : type.includes('ogg') ? 'ogg' : 'webm';
          const fd = new FormData();
          fd.append('audio', blob, `recording.${ext}`);
          const res = await authenticatedFetch('/api/voice/transcribe', { method: 'POST', body: fd });
          if (!res.ok) throw new Error(`transcribe ${res.status}`);
          const data = await res.json();
          const text = String(data?.text || '').trim();
          if (text) onTranscript(text);
          else onError?.('No speech detected');
        } catch (e) {
          onError?.(`Transcription failed: ${e instanceof Error ? e.message : String(e)}`);
        } finally {
          setState('idle');
        }
      };

      rec.start();
      setState('recording');
    } catch (e) {
      // getUserMedia may already have succeeded (e.g. MediaRecorder construction or
      // start() threw); release the stream so the mic does not stay open.
      stopTracks();
      recorderRef.current = null;
      const err = e as { name?: string; message?: string };
      let msg = `Mic error: ${err?.message || e}`;
      if (err?.name === 'NotAllowedError') msg = 'Microphone access denied.';
      else if (err?.name === 'NotFoundError') msg = 'No microphone found.';
      onError?.(msg);
      setState('idle');
    } finally {
      startingRef.current = false;
    }
  }, [onTranscript, onError]);

  const stop = useCallback(() => {
    if (recorderRef.current && state === 'recording') recorderRef.current.stop();
  }, [state]);

  const toggle = useCallback(() => {
    if (state === 'recording') stop();
    else if (state === 'idle') start();
    // While 'transcribing', taps are ignored until the upload settles.
  }, [state, start, stop]);

  return { state, toggle };
}
|
||||
@@ -404,6 +404,7 @@ function ChatInterface({
|
||||
renderInputWithMentions={renderInputWithMentions}
|
||||
textareaRef={textareaRef}
|
||||
input={input}
|
||||
onVoiceTranscript={(text) => setInput(input ? `${input} ${text}` : text)}
|
||||
onInputChange={handleInputChange}
|
||||
onTextareaClick={handleTextareaClick}
|
||||
onTextareaKeyDown={handleKeyDown}
|
||||
|
||||
@@ -26,6 +26,7 @@ import {
|
||||
import CommandMenu from './CommandMenu';
|
||||
import ClaudeStatus from './ClaudeStatus';
|
||||
import ImageAttachment from './ImageAttachment';
|
||||
import VoiceInputButton from './VoiceInputButton';
|
||||
import PermissionRequestsBanner from './PermissionRequestsBanner';
|
||||
import TokenUsageSummary from './TokenUsageSummary';
|
||||
|
||||
@@ -89,6 +90,7 @@ interface ChatComposerProps {
|
||||
renderInputWithMentions: (text: string) => ReactNode;
|
||||
textareaRef: RefObject<HTMLTextAreaElement>;
|
||||
input: string;
|
||||
onVoiceTranscript?: (text: string) => void;
|
||||
onInputChange: (event: ChangeEvent<HTMLTextAreaElement>) => void;
|
||||
onTextareaClick: (event: MouseEvent<HTMLTextAreaElement>) => void;
|
||||
onTextareaKeyDown: (event: KeyboardEvent<HTMLTextAreaElement>) => void;
|
||||
@@ -143,6 +145,7 @@ export default function ChatComposer({
|
||||
renderInputWithMentions,
|
||||
textareaRef,
|
||||
input,
|
||||
onVoiceTranscript,
|
||||
onInputChange,
|
||||
onTextareaClick,
|
||||
onTextareaKeyDown,
|
||||
@@ -315,6 +318,8 @@ export default function ChatComposer({
|
||||
<ImageIcon />
|
||||
</PromptInputButton>
|
||||
|
||||
{onVoiceTranscript && <VoiceInputButton onTranscript={onVoiceTranscript} />}
|
||||
|
||||
<button
|
||||
type="button"
|
||||
onClick={onModeSwitch}
|
||||
|
||||
@@ -15,6 +15,7 @@ import { Reasoning, ReasoningTrigger, ReasoningContent } from '../../../../share
|
||||
|
||||
import { Markdown } from './Markdown';
|
||||
import MessageCopyControl from './MessageCopyControl';
|
||||
import MessageSpeakControl from './MessageSpeakControl';
|
||||
|
||||
type DiffLine = {
|
||||
type: string;
|
||||
@@ -415,6 +416,9 @@ const MessageComponent = memo(({ message, prevMessage, createDiff, onFileOpen, a
|
||||
{shouldShowAssistantCopyControl && (
|
||||
<MessageCopyControl content={assistantCopyContent} messageType="assistant" />
|
||||
)}
|
||||
{shouldShowAssistantCopyControl && (
|
||||
<MessageSpeakControl content={assistantCopyContent} />
|
||||
)}
|
||||
{!isGrouped && <span>{formattedTime}</span>}
|
||||
</div>
|
||||
)}
|
||||
|
||||
@@ -0,0 +1,37 @@
|
||||
import { Volume2, Loader2, Square } from 'lucide-react';
|
||||
import { useTranslation } from 'react-i18next';
|
||||
import { useTts } from '../../hooks/useTts';
|
||||
import { useVoiceAvailable } from '../../hooks/useVoiceAvailable';
|
||||
|
||||
// Read-aloud button shown next to the copy control on assistant messages.
// Renders nothing while the optional voice feature is reported as unavailable.
const MessageSpeakControl = ({ content }: { content: string }) => {
  const { t } = useTranslation('chat');
  const available = useVoiceAvailable();
  const { state, toggle } = useTts(() => content);

  if (!available) return null;

  // Label and glyph both follow the TTS state: playing -> stop, loading -> spinner, idle -> speaker.
  let title;
  let icon;
  switch (state) {
    case 'playing':
      title = t('voice.stopSpeaking');
      icon = <Square className="h-3.5 w-3.5" />;
      break;
    case 'loading':
      title = t('voice.loading');
      icon = <Loader2 className="h-3.5 w-3.5 animate-spin" />;
      break;
    default:
      title = t('voice.speak');
      icon = <Volume2 className="h-3.5 w-3.5" />;
  }

  return (
    <button
      type="button"
      onClick={toggle}
      title={title}
      aria-label={title}
      className="inline-flex items-center gap-1 rounded px-1 py-0.5 text-gray-400 transition-colors hover:text-gray-600 dark:text-gray-500 dark:hover:text-gray-300"
    >
      {icon}
    </button>
  );
};
|
||||
|
||||
export default MessageSpeakControl;
|
||||
40
src/components/chat/view/subcomponents/VoiceInputButton.tsx
Normal file
40
src/components/chat/view/subcomponents/VoiceInputButton.tsx
Normal file
@@ -0,0 +1,40 @@
|
||||
import { Mic, Square, Loader2 } from 'lucide-react';
|
||||
import { useTranslation } from 'react-i18next';
|
||||
import { useVoiceInput } from '../../hooks/useVoiceInput';
|
||||
import { useVoiceAvailable } from '../../hooks/useVoiceAvailable';
|
||||
import { PromptInputButton } from '../../../../shared/view/ui';
|
||||
|
||||
type Props = {
|
||||
onTranscript: (text: string) => void;
|
||||
onError?: (msg: string) => void;
|
||||
};
|
||||
|
||||
// Push-to-talk mic button. Renders nothing unless the optional voice feature is enabled.
//
// Icon and tooltip both track the dictation state:
//   idle -> mic / "voice.input", recording -> red square / "voice.stopRecording",
//   transcribing -> spinner / "voice.transcribing".
export default function VoiceInputButton({ onTranscript, onError }: Props) {
  const { t } = useTranslation('chat');
  const available = useVoiceAvailable();
  const { state, toggle } = useVoiceInput(onTranscript, onError);

  if (!available) return null;

  const icon =
    state === 'recording' ? (
      <Square className="text-red-500" />
    ) : state === 'transcribing' ? (
      <Loader2 className="animate-spin" />
    ) : (
      <Mic />
    );

  // Previously the tooltip fell back to "voice.input" while transcribing, leaving the
  // "voice.transcribing" string unused and the spinner unexplained.
  const tooltipText =
    state === 'recording'
      ? t('voice.stopRecording')
      : state === 'transcribing'
        ? t('voice.transcribing')
        : t('voice.input');

  return (
    <PromptInputButton
      tooltip={{ content: tooltipText }}
      onClick={(e: { preventDefault: () => void }) => {
        e.preventDefault();
        toggle();
      }}
    >
      {icon}
    </PromptInputButton>
  );
}
|
||||
@@ -122,6 +122,14 @@
|
||||
}
|
||||
}
|
||||
},
|
||||
"voice": {
|
||||
"input": "Voice input",
|
||||
"stopRecording": "Stop recording",
|
||||
"transcribing": "Transcribing…",
|
||||
"speak": "Read aloud",
|
||||
"stopSpeaking": "Stop",
|
||||
"loading": "Loading…"
|
||||
},
|
||||
"input": {
|
||||
"placeholder": "Type / for commands, @ for files, or ask {{provider}} anything...",
|
||||
"placeholderDefault": "Type your message...",
|
||||
|
||||
14
voice-sidecar/.env.example
Normal file
14
voice-sidecar/.env.example
Normal file
@@ -0,0 +1,14 @@
|
||||
# Voice sidecar config (all optional — these are the defaults).
|
||||
# The sidecar binds 127.0.0.1 only; CloudCLI's Express proxy reaches it.
|
||||
|
||||
# Port the sidecar listens on (CloudCLI reaches it via VOICE_SIDECAR_URL).
|
||||
VOICE_PORT=8765
|
||||
|
||||
# faster-whisper model size: tiny | base | small | medium | large-v3
|
||||
WHISPER_MODEL_SIZE=base
|
||||
# cpu (int8, default) or cuda (float16, needs a CUDA torch in the venv)
|
||||
WHISPER_DEVICE=cpu
|
||||
|
||||
# Kokoro voice (see https://github.com/hexgrad/kokoro for the full list) and language code.
|
||||
KOKORO_VOICE=af_heart
|
||||
KOKORO_LANG=a
|
||||
187
voice-sidecar/app.py
Normal file
187
voice-sidecar/app.py
Normal file
@@ -0,0 +1,187 @@
|
||||
"""
|
||||
CloudCLI voice sidecar — local STT (faster-whisper) + local TTS (Kokoro-82M).
|
||||
|
||||
Ported from the tooler voice endpoints (D:\\tooler\\backend\\server.py), swapping
|
||||
edge-tts -> Kokoro. Bound to 127.0.0.1 only; CloudCLI's Express server proxies to
|
||||
it behind JWT auth. Never exposed to the tailnet directly.
|
||||
|
||||
Endpoints:
|
||||
GET /health -> {status, whisper_loaded, kokoro_loaded}
|
||||
POST /transcribe (multipart 'audio') -> {text, duration_ms}
|
||||
POST /tts (form 'text') -> audio/wav bytes (cached)
|
||||
"""
|
||||
import asyncio
|
||||
import hashlib
|
||||
import logging
|
||||
import os
|
||||
import re
|
||||
import tempfile
|
||||
import time
|
||||
from pathlib import Path
|
||||
|
||||
import numpy as np
|
||||
import soundfile as sf
|
||||
from fastapi import FastAPI, File, Form, HTTPException, UploadFile
|
||||
from fastapi.responses import Response
|
||||
|
||||
logging.basicConfig(level=logging.INFO)
|
||||
logger = logging.getLogger("voice-sidecar")
|
||||
|
||||
# ---- Config (env-overridable) -------------------------------------------------
|
||||
PORT = int(os.getenv("VOICE_PORT", "8765"))
|
||||
WHISPER_MODEL_SIZE = os.getenv("WHISPER_MODEL_SIZE", "base")
|
||||
WHISPER_DEVICE = os.getenv("WHISPER_DEVICE", "cpu").lower() # "cpu" | "cuda"
|
||||
KOKORO_VOICE = os.getenv("KOKORO_VOICE", "af_heart")
|
||||
KOKORO_LANG = os.getenv("KOKORO_LANG", "a") # 'a' = American English
|
||||
KOKORO_SR = 24000
|
||||
|
||||
VOICE_DIR = Path(__file__).parent / "voice_messages"
|
||||
VOICE_DIR.mkdir(exist_ok=True)
|
||||
|
||||
# ---- Lazy model singletons ----------------------------------------------------
|
||||
_whisper = None
|
||||
_whisper_lock = asyncio.Lock()
|
||||
_kpipe = None
|
||||
_kpipe_lock = asyncio.Lock()
|
||||
|
||||
|
||||
async def get_whisper():
    """Return the shared faster-whisper model, loading it on first use.

    The model is built in the default executor so the event loop is not blocked
    while it loads. ``_whisper_lock`` serialises first-time loading, and the
    ``None`` check is repeated once the lock is held so that tasks which were
    waiting reuse the model that was just loaded.

    With ``WHISPER_DEVICE == "cuda"`` a float16 CUDA model is attempted first;
    if that raises, a warning is logged and the int8 CPU model is loaded instead.
    """
    global _whisper

    def _build_model():
        # Imported here so the dependency is only loaded when STT is first used.
        from faster_whisper import WhisperModel

        if WHISPER_DEVICE == "cuda":
            try:
                logger.info("[WHISPER] loading on CUDA (float16)...")
                return WhisperModel(WHISPER_MODEL_SIZE, device="cuda", compute_type="float16")
            except Exception as exc:  # noqa: BLE001
                logger.warning("[WHISPER] CUDA failed (%s), falling back to CPU", exc)
        logger.info("[WHISPER] loading '%s' on CPU (int8)", WHISPER_MODEL_SIZE)
        return WhisperModel(WHISPER_MODEL_SIZE, device="cpu", compute_type="int8")

    if _whisper is None:
        async with _whisper_lock:
            if _whisper is None:
                loop = asyncio.get_event_loop()
                _whisper = await loop.run_in_executor(None, _build_model)
                logger.info("[WHISPER] ready")
    return _whisper
|
||||
|
||||
|
||||
async def get_kokoro():
    """Return the shared Kokoro pipeline, creating it on first use.

    Construction runs in the default executor so the event loop stays responsive.
    ``_kpipe_lock`` serialises first-time construction, and the ``None`` check is
    repeated once the lock is held so waiting tasks reuse the new pipeline.
    """
    global _kpipe

    def _build_pipeline():
        # Imported here so the dependency is only loaded when TTS is first used.
        from kokoro import KPipeline

        logger.info("[KOKORO] loading pipeline (lang=%s)...", KOKORO_LANG)
        return KPipeline(lang_code=KOKORO_LANG)

    if _kpipe is None:
        async with _kpipe_lock:
            if _kpipe is None:
                loop = asyncio.get_event_loop()
                _kpipe = await loop.run_in_executor(None, _build_pipeline)
                logger.info("[KOKORO] ready")
    return _kpipe
|
||||
|
||||
|
||||
# ---- Text cleaning (ported verbatim from tooler prepare_text_for_tts) ---------
|
||||
def prepare_text_for_tts(text: str) -> str:
    """Turn markdown into plain text that reads naturally when spoken.

    The rewrites are applied in the order listed; order matters (bold must be
    unwrapped before italic, fenced blocks before inline code). Finally every
    run of whitespace is collapsed to a single space and the ends are trimmed.
    """
    rewrites = (
        (r"```[\s\S]*?```", " code block ", 0),   # fenced code -> spoken stub
        (r"`([^`]+)`", r"\1", 0),                  # inline code -> its contents
        (r"\*\*([^*]+)\*\*", r"\1", 0),            # **bold** -> bold
        (r"\*([^*]+)\*", r"\1", 0),                # *italic* -> italic
        (r"\[([^\]]+)\]\([^)]+\)", r"\1", 0),      # [label](url) -> label
        (r"^#{1,6}\s+", "", re.MULTILINE),         # leading header markers
    )

    spoken = text
    for pattern, replacement, flags in rewrites:
        spoken = re.sub(pattern, replacement, spoken, flags=flags)

    return re.sub(r"\s+", " ", spoken).strip()
|
||||
|
||||
|
||||
# ---- App ----------------------------------------------------------------------
|
||||
app = FastAPI(title="CloudCLI voice sidecar")
|
||||
|
||||
|
||||
@app.get("/health")
async def health():
    """Report that the sidecar is up and which models are already loaded.

    Reads the module-level singletons only; it never triggers a model load.
    """
    loaded = {
        "whisper_loaded": _whisper is not None,
        "kokoro_loaded": _kpipe is not None,
    }
    return {"status": "ok", **loaded}
|
||||
|
||||
|
||||
@app.post("/transcribe")
async def transcribe(audio: UploadFile = File(...)):
    """Transcribe an uploaded audio clip with faster-whisper.

    The upload is written to a temporary file (keeping the original file
    extension, ``.webm`` when there is none), transcribed in the default
    executor with ``beam_size=5``, and the temp file is removed afterwards.

    Returns:
        ``{"text": <transcript>, "duration_ms": <elapsed time for the request>}``

    Raises:
        HTTPException: 500 with the underlying error message when anything fails.
    """
    started = time.time()
    upload_name = audio.filename or "rec.webm"
    extension = Path(upload_name).suffix or ".webm"
    payload = await audio.read()
    logger.info("[STT] %d bytes (%s)", len(payload), audio.content_type)

    scratch_path = None
    try:
        # delete=False: the file must outlive this `with` so the worker thread can read it.
        with tempfile.NamedTemporaryFile(delete=False, suffix=extension) as handle:
            handle.write(payload)
            scratch_path = handle.name

        model = await get_whisper()

        def _decode():
            segments, _info = model.transcribe(scratch_path, beam_size=5)
            pieces = [segment.text for segment in segments]
            return "".join(pieces).strip()

        loop = asyncio.get_event_loop()
        text = await loop.run_in_executor(None, _decode)
        elapsed_ms = int((time.time() - started) * 1000)
        logger.info("[STT] %dms: %s", elapsed_ms, text[:100])
        return {"text": text, "duration_ms": elapsed_ms}
    except Exception as exc:  # noqa: BLE001
        logger.error("[STT] failed: %s", exc, exc_info=True)
        raise HTTPException(status_code=500, detail=f"Transcription failed: {exc}")
    finally:
        # Best-effort cleanup; a failed delete must not mask the real outcome.
        if scratch_path and os.path.exists(scratch_path):
            try:
                os.unlink(scratch_path)
            except OSError:
                pass
|
||||
|
||||
|
||||
@app.post("/tts")
async def tts(text: str = Form(...)):
    """Synthesize speech for ``text`` with Kokoro and return it as WAV bytes.

    The markdown is cleaned with ``prepare_text_for_tts`` before synthesis, and the
    result is cached on disk in ``VOICE_DIR``.

    The cache key is derived from the raw text together with ``KOKORO_LANG`` and
    ``KOKORO_VOICE``, so changing the configured voice does not replay audio that
    was generated with the previous one.

    New audio is written to a temporary file and moved into place with
    ``os.replace``. A concurrent request for the same text therefore never reads a
    half-written file, and a failed synthesis never leaves a corrupt cache entry.

    Raises:
        HTTPException: 400 when the text is blank or longer than 8000 characters;
            500 with the underlying error message when synthesis fails.
    """
    if not text.strip():
        raise HTTPException(status_code=400, detail="Text cannot be empty")
    if len(text) > 8000:
        raise HTTPException(status_code=400, detail="Text too long (max 8000 chars)")

    start = time.time()
    clean = prepare_text_for_tts(text)
    # Keyed on the RAW text (not the cleaned text) plus the voice settings.
    cache_material = "\n".join((KOKORO_LANG, KOKORO_VOICE, text))
    key = hashlib.sha256(cache_material.encode()).hexdigest()[:16]
    out_path = VOICE_DIR / f"{key}.wav"

    if not out_path.exists():
        try:
            pipeline = await get_kokoro()

            def _synth():
                chunks = [audio for _gs, _ps, audio in pipeline(clean, voice=KOKORO_VOICE)]
                if not chunks:
                    raise RuntimeError("Kokoro produced no audio")
                full = np.concatenate([np.asarray(c, dtype=np.float32) for c in chunks])

                # Write next to the final path so os.replace stays on one filesystem.
                fd, tmp_name = tempfile.mkstemp(dir=str(VOICE_DIR), suffix=".part")
                os.close(fd)
                try:
                    # The temp name has no .wav extension, so the format is given explicitly.
                    sf.write(tmp_name, full, KOKORO_SR, format="WAV")
                    os.replace(tmp_name, str(out_path))
                except BaseException:
                    try:
                        os.unlink(tmp_name)
                    except OSError:
                        pass
                    raise

            await asyncio.get_event_loop().run_in_executor(None, _synth)
            logger.info("[TTS] generated %s in %dms", out_path.name, int((time.time() - start) * 1000))
        except Exception as e:  # noqa: BLE001
            logger.error("[TTS] failed: %s", e, exc_info=True)
            raise HTTPException(status_code=500, detail=f"TTS failed: {e}")
    else:
        logger.info("[TTS] cache hit %s", out_path.name)

    return Response(content=out_path.read_bytes(), media_type="audio/wav")
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # Direct launch (`python app.py`): serve on the loopback interface only, on the
    # port taken from VOICE_PORT.
    from uvicorn import run as _serve

    _serve(app, host="127.0.0.1", port=PORT, log_level="info")
|
||||
9
voice-sidecar/requirements.txt
Normal file
9
voice-sidecar/requirements.txt
Normal file
@@ -0,0 +1,9 @@
|
||||
# CloudCLI voice sidecar — STT (faster-whisper) + TTS (Kokoro-82M)
|
||||
fastapi>=0.110.0
|
||||
uvicorn[standard]>=0.27.0
|
||||
python-multipart>=0.0.9
|
||||
faster-whisper>=1.0.0
|
||||
kokoro>=0.9.4
|
||||
misaki[en]>=0.9.4
|
||||
soundfile>=0.12.1
|
||||
numpy>=1.26.0
|
||||
29
voice-sidecar/test_smoke.py
Normal file
29
voice-sidecar/test_smoke.py
Normal file
@@ -0,0 +1,29 @@
|
||||
"""Smoke test: Kokoro TTS -> faster-whisper STT round-trip."""
|
||||
import time
|
||||
import numpy as np
|
||||
import soundfile as sf
|
||||
|
||||
PHRASE = "Hello, this is a test of the CloudCLI voice sidecar."
SAMPLE_RATE = 24000  # the rate this script writes test.wav at
WAV_PATH = "test.wav"

# The model imports are deliberately done inside the timed sections so the
# printed durations include import + load time.
print("[1/3] Loading Kokoro pipeline...")
t = time.time()
from kokoro import KPipeline
pipe = KPipeline(lang_code="a")
print(f"      loaded in {time.time()-t:.1f}s")

print("[2/3] Synthesizing...")
t = time.time()
chunks = [audio for _gs, _ps, audio in pipe(PHRASE, voice="af_heart")]
if not chunks:
    # Fail with a clear message instead of np.concatenate's error on an empty list.
    raise SystemExit("Kokoro produced no audio")
full = np.concatenate([np.asarray(c, dtype=np.float32) for c in chunks])
sf.write(WAV_PATH, full, SAMPLE_RATE)
dur = len(full) / SAMPLE_RATE
print(f"      synth {time.time()-t:.1f}s -> {WAV_PATH} ({dur:.1f}s audio, {len(full)} samples)")

print("[3/3] Transcribing back with faster-whisper (base, cpu int8)...")
t = time.time()
from faster_whisper import WhisperModel
model = WhisperModel("base", device="cpu", compute_type="int8")
segments, _info = model.transcribe(WAV_PATH, beam_size=5)
text = "".join(s.text for s in segments).strip()
print(f"      transcribe {time.time()-t:.1f}s -> {text!r}")

if not text:
    print("\nROUND-TRIP PRODUCED NO TEXT")
    # Exit non-zero so callers (shell, CI) can tell the round-trip failed.
    raise SystemExit(1)
print("\nROUND-TRIP OK")
|
||||
Reference in New Issue
Block a user