Files
claudecodeui/server/voice-proxy.js
newsbubbles f285715e31 fix(voice): address review (SSRF guard, auth mapping, client timeout)
Validates the user-supplied backend URL (http/https only, blocks the link-local
metadata range) to prevent SSRF; remaps upstream 401/403 so a bad voice API key
isn't read as the app's own auth failing; adds a client-side AbortController timeout
on the read-aloud request so the button can't sit in loading if a request stalls.
2026-06-10 17:56:02 +01:00

163 lines
6.1 KiB
JavaScript

// Optional voice proxy — forwards STT/TTS to an OpenAI-compatible audio backend.
//
// The backend is whatever the user points at: OpenAI, Groq, or a local server
// (LocalAI / Speaches / Kokoro-FastAPI / openedai-speech / etc.). It must expose the
// standard OpenAI audio endpoints:
// POST {base}/audio/transcriptions (multipart 'file' + 'model') -> { text }
// POST {base}/audio/speech ({ model, voice, input }) -> audio bytes
//
// Config is resolved per-request from headers (set by the client's voice settings),
// falling back to server env defaults. Mounted at /api/voice behind authenticateToken.
import express from 'express';
const ENV = {
baseUrl: (process.env.VOICE_API_BASE_URL || '').replace(/\/$/, ''),
apiKey: process.env.VOICE_API_KEY || '',
sttModel: process.env.VOICE_STT_MODEL || 'whisper-1',
ttsModel: process.env.VOICE_TTS_MODEL || 'tts-1',
ttsVoice: process.env.VOICE_TTS_VOICE || 'alloy',
ttsFormat: process.env.VOICE_TTS_FORMAT || 'mp3',
};
// Per-request config: client headers (from the user's voice settings) override env defaults.
function resolveConfig(req) {
const h = req.headers;
return {
baseUrl: (String(h['x-voice-base-url'] || '') || ENV.baseUrl).replace(/\/$/, ''),
apiKey: String(h['x-voice-api-key'] || '') || ENV.apiKey,
sttModel: String(h['x-voice-stt-model'] || '') || ENV.sttModel,
ttsModel: String(h['x-voice-tts-model'] || '') || ENV.ttsModel,
ttsVoice: String(h['x-voice-tts-voice'] || '') || ENV.ttsVoice,
};
}
const router = express.Router();
// Generous by default — local TTS can synthesize long messages at ~real-time on CPU.
const VOICE_TIMEOUT_MS = Number(process.env.VOICE_TIMEOUT_MS || 300000);
async function fetchWithTimeout(url, options = {}) {
const controller = new AbortController();
const timer = setTimeout(() => controller.abort(), VOICE_TIMEOUT_MS);
try {
return await fetch(url, { ...options, signal: controller.signal });
} finally {
clearTimeout(timer);
}
}
// Turn backend failures into a clear, actionable message for the client.
function backendError(res, e) {
if (e && e.name === 'AbortError') {
return res.status(504).json({
error: `Voice backend timed out after ${Math.round(VOICE_TIMEOUT_MS / 1000)}s. Check your sidecar or API.`,
});
}
return res.status(502).json({ error: `Voice backend unreachable: ${e.message}` });
}
// SSRF guard for the user-configurable backend URL: http/https only, and block the
// link-local / cloud-metadata range. localhost/private are allowed on purpose so users
// can run a local voice server (LocalAI, Speaches, etc.).
function isAllowedBackendUrl(raw) {
let u;
try {
u = new URL(raw);
} catch {
return false;
}
if (u.protocol !== 'http:' && u.protocol !== 'https:') return false;
if (u.hostname === '169.254.169.254' || u.hostname.startsWith('169.254.')) return false;
return true;
}
// Don't surface an upstream 401/403 as if the user's own app login failed.
function upstreamError(res, status, text) {
if (status === 401 || status === 403) {
return res.status(502).json({ error: 'Voice backend rejected the request (check the API key).' });
}
return res.status(status).json({ error: text || 'voice backend error' });
}
let _upload = null;
async function getUpload() {
if (!_upload) {
const multer = (await import('multer')).default;
_upload = multer({ storage: multer.memoryStorage(), limits: { fileSize: 25 * 1024 * 1024 } });
}
return _upload;
}
function authHeader(apiKey) {
return apiKey ? { Authorization: `Bearer ${apiKey}` } : {};
}
// GET /api/voice/health -> { configured } (true if a base URL is available)
router.get('/health', (req, res) => {
res.json({ configured: Boolean(resolveConfig(req).baseUrl) });
});
// POST /api/voice/transcribe (multipart 'audio') -> { text }
router.post('/transcribe', async (req, res) => {
const cfg = resolveConfig(req);
if (!cfg.baseUrl) return res.status(503).json({ error: 'No voice backend configured' });
if (!isAllowedBackendUrl(cfg.baseUrl)) return res.status(400).json({ error: 'Invalid voice backend URL.' });
const upload = await getUpload();
upload.single('audio')(req, res, async (err) => {
if (err) return res.status(400).json({ error: err.message });
if (!req.file) return res.status(400).json({ error: 'No audio uploaded' });
try {
const fd = new FormData();
fd.append(
'file',
new Blob([req.file.buffer], { type: req.file.mimetype || 'audio/webm' }),
req.file.originalname || 'recording.webm',
);
fd.append('model', cfg.sttModel);
const r = await fetchWithTimeout(`${cfg.baseUrl}/audio/transcriptions`, {
method: 'POST',
headers: authHeader(cfg.apiKey),
body: fd,
});
const text = await r.text();
if (!r.ok) return upstreamError(res, r.status, text);
let data;
try { data = JSON.parse(text); } catch { data = { text }; }
res.json({ text: data.text ?? '' });
} catch (e) {
backendError(res, e);
}
});
});
// POST /api/voice/tts { text } -> audio bytes
router.post('/tts', async (req, res) => {
const cfg = resolveConfig(req);
if (!cfg.baseUrl) return res.status(503).json({ error: 'No voice backend configured' });
if (!isAllowedBackendUrl(cfg.baseUrl)) return res.status(400).json({ error: 'Invalid voice backend URL.' });
const text = req.body?.text;
if (!text || !text.trim()) return res.status(400).json({ error: 'text required' });
try {
const r = await fetchWithTimeout(`${cfg.baseUrl}/audio/speech`, {
method: 'POST',
headers: { 'Content-Type': 'application/json', ...authHeader(cfg.apiKey) },
body: JSON.stringify({
model: cfg.ttsModel,
voice: cfg.ttsVoice,
input: text,
response_format: ENV.ttsFormat,
}),
});
if (!r.ok) {
const errText = await r.text().catch(() => 'tts failed');
return upstreamError(res, r.status, errText);
}
res.setHeader('Content-Type', r.headers.get('content-type') || 'audio/mpeg');
res.setHeader('Cache-Control', 'no-store');
res.send(Buffer.from(await r.arrayBuffer()));
} catch (e) {
backendError(res, e);
}
});
export default router;