feat(voice): add optional speech-to-text input and read-aloud TTS

Adds a push-to-talk mic button in the composer and a read-aloud button on
assistant messages. Both are opt-in and hidden unless a voice backend is
configured via VOICE_SIDECAR_URL.

The auth-gated /api/voice proxy forwards to a configurable backend exposing
/transcribe and /tts (provider-agnostic); the frontend probes /api/voice/health
and hides the controls when disabled. Adds i18n keys and docs/voice.md.

Includes a local, no-API-key reference backend in voice-sidecar/ (faster-whisper
for STT, Kokoro-82M for TTS, both CPU-capable).
This commit is contained in:
newsbubbles
2026-06-08 00:47:14 +01:00
parent af3a28abc7
commit d05585e1f4
17 changed files with 720 additions and 0 deletions

View File

@@ -72,6 +72,7 @@ import userRoutes from './routes/user.js';
import geminiRoutes from './routes/gemini.js';
import pluginsRoutes from './routes/plugins.js';
import providerRoutes from './modules/providers/provider.routes.js';
import voiceRoutes from './voice-proxy.js';
import { startEnabledPluginServers, stopAllPlugins, getPluginPort } from './utils/plugin-process-manager.js';
import { initializeDatabase, projectsDb, sessionsDb } from './modules/database/index.js';
import { configureWebPush } from './services/vapid-keys.js';
@@ -204,6 +205,8 @@ app.use('/api/providers', authenticateToken, providerRoutes);
// Agent API Routes (uses API key authentication)
app.use('/api/agent', agentRoutes);
// Voice API Routes (behind authenticateToken; router comes from ./voice-proxy.js)
app.use('/api/voice', authenticateToken, voiceRoutes);
// Serve public files (like api-docs.html)
app.use(express.static(path.join(APP_ROOT, 'public')));

87
server/voice-proxy.js Normal file
View File

@@ -0,0 +1,87 @@
// Optional voice proxy — forwards speech-to-text / text-to-speech to a configurable backend.
//
// Opt-in: voice is DISABLED unless VOICE_SIDECAR_URL is set. When set, it must point at a
// backend (any implementation) exposing:
// POST /transcribe (multipart field 'audio') -> { text }
// POST /tts (form field 'text') -> audio bytes (audio/*)
// A reference backend (local faster-whisper + Kokoro) ships in /voice-sidecar, but any
// service implementing the two endpoints works (e.g. a cloud transcription + TTS gateway).
//
// Mounted at /api/voice behind authenticateToken, so it inherits the app's auth. The backend
// should bind to localhost so that clients can reach it only through this proxy.
import express from 'express';
/**
 * Normalize a configured backend base URL so endpoint paths can be appended as `${base}/path`.
 *
 * Surrounding whitespace and every trailing slash are removed, so values such as
 * 'http://host:8000//' or ' http://host:8000/ ' do not produce malformed request URLs, and a
 * whitespace-only value counts as "not configured".
 *
 * @param {string | undefined | null} raw - Raw configuration value (e.g. from process.env).
 * @returns {string} The cleaned base URL, or '' when nothing usable was provided.
 */
function normalizeBaseUrl(raw) {
  return (raw ?? '').trim().replace(/\/+$/, '');
}

// Base URL of the voice backend, read once at module load; '' when unset.
const VOICE_SIDECAR_URL = normalizeBaseUrl(process.env.VOICE_SIDECAR_URL);
// Voice is opt-in: enabled only when a non-empty backend URL is configured.
const VOICE_ENABLED = Boolean(VOICE_SIDECAR_URL);
// Router that carries the voice endpoints defined in this module.
const router = express.Router();
// Upload parser for the audio clip, built on first use so multer is only imported when a
// transcription request actually arrives. Files are held in memory, never written to disk.
let cachedUpload = null;

/**
 * Return the shared multer instance, creating it on the first call.
 *
 * @returns {Promise<object>} Resolves to the multer instance (memory storage, 25MB file limit).
 */
async function getUpload() {
  if (cachedUpload) return cachedUpload;

  const { default: multer } = await import('multer');
  const maxAudioBytes = 25 * 1024 * 1024; // 25MB — short dictation clips
  cachedUpload = multer({
    storage: multer.memoryStorage(),
    limits: { fileSize: maxAudioBytes },
  });
  return cachedUpload;
}
/**
 * Guard for routes that need a configured voice backend.
 *
 * When voice is disabled this sends a 503 JSON error on `res`, so the caller should just return.
 *
 * @param {object} res - Express response object.
 * @returns {boolean} true when voice is enabled; false after the 503 has been sent.
 */
function ensureEnabled(res) {
  if (VOICE_ENABLED) return true;

  res
    .status(503)
    .json({ error: 'Voice is not configured. Set VOICE_SIDECAR_URL to enable it.' });
  return false;
}
// GET /api/voice/health -> { enabled }
// Reports whether a voice backend is configured; the frontend hides the voice UI when it is not.
router.get('/health', function reportVoiceStatus(_req, res) {
  res.json({ enabled: VOICE_ENABLED });
});
// POST /api/voice/transcribe (multipart 'audio') -> { text }
//
// Parses the uploaded clip in memory, re-posts it to `${VOICE_SIDECAR_URL}/transcribe`, and relays
// the backend's status code and JSON body. Responses produced here:
//   400 — multer rejected the upload, or no 'audio' file part was present
//   500 — the upload parser (multer) could not be loaded
//   502 — the backend was unreachable, or answered 2xx with a body that is not JSON
//   503 — voice is not configured
router.post('/transcribe', async (req, res) => {
  if (!ensureEnabled(res)) return;

  let upload;
  try {
    upload = await getUpload();
  } catch (e) {
    // Answer here instead of letting this async handler reject: a rejection at this point is
    // not guaranteed to reach an error handler, which would leave the request hanging.
    res.status(500).json({ error: `upload handler unavailable: ${e.message}` });
    return;
  }

  upload.single('audio')(req, res, async (err) => {
    if (err) return res.status(400).json({ error: err.message });
    if (!req.file) return res.status(400).json({ error: 'No audio uploaded' });
    try {
      // Rebuild the multipart body from the in-memory buffer for the backend request.
      const fd = new FormData();
      fd.append(
        'audio',
        new Blob([req.file.buffer], { type: req.file.mimetype || 'audio/webm' }),
        req.file.originalname || 'recording.webm',
      );
      const r = await fetch(`${VOICE_SIDECAR_URL}/transcribe`, { method: 'POST', body: fd });

      let data;
      try {
        data = await r.json();
      } catch {
        // A success status with an unparseable body is a backend fault; report it as 502 rather
        // than relaying a 2xx that carries an error payload. Error statuses pass through as-is.
        return res
          .status(r.ok ? 502 : r.status)
          .json({ error: 'bad voice backend response' });
      }
      res.status(r.status).json(data);
    } catch (e) {
      res.status(502).json({ error: `voice backend unreachable: ${e.message}` });
    }
  });
});
// POST /api/voice/tts { text } -> audio bytes
//
// Forwards `text` as a form field to `${VOICE_SIDECAR_URL}/tts` and returns the audio it
// produces. `req.body` is expected to be filled in by body-parsing middleware outside this
// module (not visible here). Responses produced here:
//   400 — `text` is missing, not a string, or blank
//   502 — the backend was unreachable
//   503 — voice is not configured
// A non-2xx backend response is relayed with its status and its body text as `error`.
router.post('/tts', async (req, res) => {
  if (!ensureEnabled(res)) return;

  const text = req.body?.text;
  // A JSON body can carry a number, object or array here; those have no .trim(), so reject
  // them up front instead of throwing outside the try block below.
  if (typeof text !== 'string' || !text.trim()) {
    return res.status(400).json({ error: 'text required' });
  }

  try {
    const fd = new FormData();
    fd.append('text', text);
    const r = await fetch(`${VOICE_SIDECAR_URL}/tts`, { method: 'POST', body: fd });
    if (!r.ok) {
      const errText = await r.text().catch(() => 'tts failed');
      return res.status(r.status).json({ error: errText });
    }
    // Pass the backend's audio type through; fall back to WAV when it does not declare one.
    res.setHeader('Content-Type', r.headers.get('content-type') || 'audio/wav');
    res.setHeader('Cache-Control', 'no-store');
    res.send(Buffer.from(await r.arrayBuffer()));
  } catch (e) {
    res.status(502).json({ error: `voice backend unreachable: ${e.message}` });
  }
});
export default router;