refactor(voice): provider-agnostic backend and in-app config

Switches the voice proxy to the OpenAI audio API (/v1/audio/transcriptions and /v1/audio/speech) so it works with OpenAI, Groq, or a local server. Adds a Settings -> Voice tab (base URL, API key, models, voice) plus a Quick Settings toggle, and removes the bundled Python sidecar. Review fixes: stop mic tracks on unmount, clear the global TTS stop handler and revoke leaked blob URLs, add fetch timeouts in the proxy, surface mic errors in the button, trim before appending transcripts, and drop the repo-wide wav ignore.
2026-06-26 13:35:49 +08:00 · 2026-06-09 10:05:06 +01:00
parent d05585e1f4
commit 711936d279
21 changed files with 367 additions and 365 deletions
--- a/server/voice-proxy.js
+++ b/server/voice-proxy.js
@@ -1,48 +1,71 @@
-// Optional voice proxy — forwards speech-to-text / text-to-speech to a configurable backend.
+// Optional voice proxy — forwards STT/TTS to an OpenAI-compatible audio backend.
 //
-// Opt-in: voice is DISABLED unless VOICE_SIDECAR_URL is set. When set, it must point at a
-// backend (any implementation) exposing:
-//     POST /transcribe   (multipart field 'audio')  -> { text }
-//     POST /tts          (form field 'text')        -> audio bytes (audio/*)
-// A reference backend (local faster-whisper + Kokoro) ships in /voice-sidecar, but any
-// service implementing the two endpoints works (e.g. a cloud transcription + TTS gateway).
+// The backend is whatever the user points at: OpenAI, Groq, or a local server
+// (LocalAI / Speaches / Kokoro-FastAPI / openedai-speech / etc.). It must expose the
+// standard OpenAI audio endpoints:
+//     POST {base}/audio/transcriptions   (multipart 'file' + 'model')      -> { text }
+//     POST {base}/audio/speech           ({ model, voice, input })         -> audio bytes
 //
-// Mounted at /api/voice behind authenticateToken, so it inherits the app's auth. The backend
-// should bind to localhost and is never exposed directly.
+// Config is resolved per-request from headers (set by the client's voice settings),
+// falling back to server env defaults. Mounted at /api/voice behind authenticateToken.
 import express from 'express';

-const VOICE_SIDECAR_URL = (process.env.VOICE_SIDECAR_URL || '').replace(/\/$/, '');
-const VOICE_ENABLED = Boolean(VOICE_SIDECAR_URL);
+const ENV = {
+  baseUrl: (process.env.VOICE_API_BASE_URL || '').replace(/\/$/, ''),
+  apiKey: process.env.VOICE_API_KEY || '',
+  sttModel: process.env.VOICE_STT_MODEL || 'whisper-1',
+  ttsModel: process.env.VOICE_TTS_MODEL || 'tts-1',
+  ttsVoice: process.env.VOICE_TTS_VOICE || 'alloy',
+  ttsFormat: process.env.VOICE_TTS_FORMAT || 'mp3',
+};
+
+// Per-request config: client headers override env defaults; the env API key only goes to the env base URL.
+function resolveConfig(req) {
+  const pick = (name, fallback) => String(req.headers[name] || '') || fallback;
+  const baseUrl = pick('x-voice-base-url', ENV.baseUrl).replace(/\/$/, '');
+  // A client-chosen backend must never receive the server's VOICE_API_KEY (secret exfiltration).
+  const apiKey = pick('x-voice-api-key', baseUrl === ENV.baseUrl ? ENV.apiKey : '');
+  const sttModel = pick('x-voice-stt-model', ENV.sttModel);
+  const ttsModel = pick('x-voice-tts-model', ENV.ttsModel);
+  const ttsVoice = pick('x-voice-tts-voice', ENV.ttsVoice);
+  return { baseUrl, apiKey, sttModel, ttsModel, ttsVoice };
+}

 const router = express.Router();

-// Lazy multer (memory storage) for the audio upload — matches index.js's pattern.
+// Upstream deadline in ms. An unset, non-numeric or non-positive VOICE_TIMEOUT_MS falls back to 60s
+// (Number('abc') is NaN, which timers treat as ~1ms); the cap keeps it inside the 32-bit timer range.
+const parsedTimeoutMs = Number.parseInt(process.env.VOICE_TIMEOUT_MS ?? '', 10);
+const VOICE_TIMEOUT_MS = parsedTimeoutMs > 0 ? Math.min(parsedTimeoutMs, 2 ** 31 - 1) : 60000;
+
+// fetch() with a deadline. AbortSignal.timeout() is not cancelled once headers arrive, so the same
+// deadline also bounds the caller's body read (r.text() / r.arrayBuffer()), not just the connect phase.
+async function fetchWithTimeout(url, options = {}) {
+  return fetch(url, { ...options, signal: AbortSignal.timeout(VOICE_TIMEOUT_MS) });
+}
+
 let _upload = null;
 async function getUpload() {
  if (!_upload) {
    const multer = (await import('multer')).default;
-    _upload = multer({
-      storage: multer.memoryStorage(),
-      limits: { fileSize: 25 * 1024 * 1024 }, // 25MB — short dictation clips
-    });
+    _upload = multer({ storage: multer.memoryStorage(), limits: { fileSize: 25 * 1024 * 1024 } });
  }
  return _upload;
 }

-function ensureEnabled(res) {
-  if (!VOICE_ENABLED) {
-    res.status(503).json({ error: 'Voice is not configured. Set VOICE_SIDECAR_URL to enable it.' });
-    return false;
-  }
-  return true;
+function authHeader(apiKey) {
+  return apiKey ? { Authorization: `Bearer ${apiKey}` } : {};
 }

-// GET /api/voice/health -> { enabled }  (frontend hides the voice UI when disabled)
-router.get('/health', (_req, res) => res.json({ enabled: VOICE_ENABLED }));
+// GET /api/voice/health -> { configured } (true if a base URL is available)
+router.get('/health', (req, res) => {
+  res.json({ configured: Boolean(resolveConfig(req).baseUrl) });
+});

 // POST /api/voice/transcribe  (multipart 'audio') -> { text }
 router.post('/transcribe', async (req, res) => {
-  if (!ensureEnabled(res)) return;
+  const cfg = resolveConfig(req);
+  if (!cfg.baseUrl) return res.status(503).json({ error: 'No voice backend configured' });
  const upload = await getUpload();
  upload.single('audio')(req, res, async (err) => {
    if (err) return res.status(400).json({ error: err.message });
@@ -50,13 +73,21 @@ router.post('/transcribe', async (req, res) => {
    try {
      const fd = new FormData();
      fd.append(
-        'audio',
+        'file',
        new Blob([req.file.buffer], { type: req.file.mimetype || 'audio/webm' }),
        req.file.originalname || 'recording.webm',
      );
-      const r = await fetch(`${VOICE_SIDECAR_URL}/transcribe`, { method: 'POST', body: fd });
-      const data = await r.json().catch(() => ({ error: 'bad voice backend response' }));
-      res.status(r.status).json(data);
+      fd.append('model', cfg.sttModel);
+      const r = await fetchWithTimeout(`${cfg.baseUrl}/audio/transcriptions`, {
+        method: 'POST',
+        headers: authHeader(cfg.apiKey),
+        body: fd,
+      });
+      const text = await r.text();
+      if (!r.ok) return res.status(r.status).json({ error: text || 'transcription failed' });
+      let data;
+      try { data = JSON.parse(text); } catch { data = { text }; }
+      res.json({ text: data.text ?? '' });
    } catch (e) {
      res.status(502).json({ error: `voice backend unreachable: ${e.message}` });
    }
@@ -65,18 +96,26 @@ router.post('/transcribe', async (req, res) => {

 // POST /api/voice/tts  { text } -> audio bytes
 router.post('/tts', async (req, res) => {
-  if (!ensureEnabled(res)) return;
+  const cfg = resolveConfig(req);
+  if (!cfg.baseUrl) return res.status(503).json({ error: 'No voice backend configured' });
  const text = req.body?.text;
  if (!text || !text.trim()) return res.status(400).json({ error: 'text required' });
  try {
-    const fd = new FormData();
-    fd.append('text', text);
-    const r = await fetch(`${VOICE_SIDECAR_URL}/tts`, { method: 'POST', body: fd });
+    const r = await fetchWithTimeout(`${cfg.baseUrl}/audio/speech`, {
+      method: 'POST',
+      headers: { 'Content-Type': 'application/json', ...authHeader(cfg.apiKey) },
+      body: JSON.stringify({
+        model: cfg.ttsModel,
+        voice: cfg.ttsVoice,
+        input: text,
+        response_format: ENV.ttsFormat,
+      }),
+    });
    if (!r.ok) {
      const errText = await r.text().catch(() => 'tts failed');
      return res.status(r.status).json({ error: errText });
    }
-    res.setHeader('Content-Type', r.headers.get('content-type') || 'audio/wav');
+    res.setHeader('Content-Type', r.headers.get('content-type') || 'audio/mpeg');
    res.setHeader('Cache-Control', 'no-store');
    res.send(Buffer.from(await r.arrayBuffer()));
  } catch (e) {