feat(voice): add optional speech-to-text input and read-aloud TTS

Adds a push-to-talk mic button in the composer and a read-aloud button on assistant messages. Both are opt-in and hidden unless a voice backend is configured via VOICE_SIDECAR_URL. The auth-gated /api/voice proxy forwards to a configurable backend exposing /transcribe and /tts (provider-agnostic); the frontend probes /api/voice/health and hides the controls when disabled. Adds i18n keys and docs/voice.md. Includes a local, no-API-key reference backend in voice-sidecar/ (faster-whisper for STT, Kokoro-82M for TTS, both CPU-capable).
2026-06-25 12:16:00 +08:00 · 2026-06-08 00:47:14 +01:00
parent af3a28abc7
commit d05585e1f4
17 changed files with 720 additions and 0 deletions
--- a/server/index.js
+++ b/server/index.js
@@ -72,6 +72,7 @@ import userRoutes from './routes/user.js';
 import geminiRoutes from './routes/gemini.js';
 import pluginsRoutes from './routes/plugins.js';
 import providerRoutes from './modules/providers/provider.routes.js';
+import voiceRoutes from './voice-proxy.js';
 import { startEnabledPluginServers, stopAllPlugins, getPluginPort } from './utils/plugin-process-manager.js';
 import { initializeDatabase, projectsDb, sessionsDb } from './modules/database/index.js';
 import { configureWebPush } from './services/vapid-keys.js';
@@ -204,6 +205,8 @@ app.use('/api/providers', authenticateToken, providerRoutes);
 // Agent API Routes (uses API key authentication)
 app.use('/api/agent', agentRoutes);

+app.use('/api/voice', authenticateToken, voiceRoutes);
+
 // Serve public files (like api-docs.html)
 app.use(express.static(path.join(APP_ROOT, 'public')));

--- a/server/voice-proxy.js
+++ b/server/voice-proxy.js
@@ -0,0 +1,87 @@
+// Optional voice proxy — forwards speech-to-text / text-to-speech to a configurable backend.
+//
+// Opt-in: voice is DISABLED unless VOICE_SIDECAR_URL is set. When set, it must point at a
+// backend (any implementation) exposing:
+//     POST /transcribe   (multipart field 'audio')  -> { text }
+//     POST /tts          (form field 'text')        -> audio bytes (audio/*)
+// A reference backend (local faster-whisper + Kokoro) ships in /voice-sidecar, but any
+// service implementing the two endpoints works (e.g. a cloud transcription + TTS gateway).
+//
+// Mounted at /api/voice behind authenticateToken, so it inherits the app's auth. The backend
+// should bind to localhost and is never exposed directly.
+import express from 'express';
+
+const VOICE_SIDECAR_URL = (process.env.VOICE_SIDECAR_URL || '').replace(/\/$/, '');
+const VOICE_ENABLED = Boolean(VOICE_SIDECAR_URL);
+
+const router = express.Router();
+
+// Lazy multer (memory storage) for the audio upload — matches index.js's pattern.
+let _upload = null;
+async function getUpload() {
+  if (!_upload) {
+    const multer = (await import('multer')).default;
+    _upload = multer({
+      storage: multer.memoryStorage(),
+      limits: { fileSize: 25 * 1024 * 1024 }, // 25MB — short dictation clips
+    });
+  }
+  return _upload;
+}
+
+function ensureEnabled(res) {
+  if (!VOICE_ENABLED) {
+    res.status(503).json({ error: 'Voice is not configured. Set VOICE_SIDECAR_URL to enable it.' });
+    return false;
+  }
+  return true;
+}
+
+// GET /api/voice/health -> { enabled: boolean }; the frontend hides the voice UI when it is false.
+router.get('/health', function voiceHealth(_req, res) { res.json({ enabled: VOICE_ENABLED }); });
+
+// POST /api/voice/transcribe  (multipart 'audio') -> { text }
+router.post('/transcribe', async (req, res) => {
+  if (!ensureEnabled(res)) return;
+  const upload = await getUpload();
+  upload.single('audio')(req, res, async (err) => {
+    if (err) return res.status(400).json({ error: err.message });
+    if (!req.file) return res.status(400).json({ error: 'No audio uploaded' });
+    try {
+      const fd = new FormData();
+      fd.append(
+        'audio',
+        new Blob([req.file.buffer], { type: req.file.mimetype || 'audio/webm' }),
+        req.file.originalname || 'recording.webm',
+      );
+      const r = await fetch(`${VOICE_SIDECAR_URL}/transcribe`, { method: 'POST', body: fd });
+      const data = await r.json().catch(() => ({ error: 'bad voice backend response' }));
+      res.status(r.status).json(data);
+    } catch (e) {
+      res.status(502).json({ error: `voice backend unreachable: ${e.message}` });
+    }
+  });
+});
+
+// POST /api/voice/tts  { text } -> audio bytes
+router.post('/tts', async (req, res) => {
+  if (!ensureEnabled(res)) return;
+  const text = req.body?.text;
+  if (!text || !text.trim()) return res.status(400).json({ error: 'text required' });
+  try {
+    const fd = new FormData();
+    fd.append('text', text);
+    const r = await fetch(`${VOICE_SIDECAR_URL}/tts`, { method: 'POST', body: fd });
+    if (!r.ok) {
+      const errText = await r.text().catch(() => 'tts failed');
+      return res.status(r.status).json({ error: errText });
+    }
+    res.setHeader('Content-Type', r.headers.get('content-type') || 'audio/wav');
+    res.setHeader('Cache-Control', 'no-store');
+    res.send(Buffer.from(await r.arrayBuffer()));
+  } catch (e) {
+    res.status(502).json({ error: `voice backend unreachable: ${e.message}` });
+  }
+});
+
+export default router;