mirror of
https://github.com/siteboon/claudecodeui.git
synced 2026-06-26 05:15:48 +08:00
User-selected backend URLs must remain usable without letting clients control server requests. Call custom providers from the browser while keeping the server proxy bound to its configured host. This restores voice controls for frontend settings without reopening the SSRF path.
225 lines
8.3 KiB
JavaScript
225 lines
8.3 KiB
JavaScript
// Optional voice proxy — forwards STT/TTS to an OpenAI-compatible audio backend.
|
|
//
|
|
// The backend is whatever the user points at: OpenAI, Groq, or a local server
|
|
// (LocalAI / Speaches / Kokoro-FastAPI / openedai-speech / etc.). It must expose the
|
|
// standard OpenAI audio endpoints:
|
|
// POST {base}/audio/transcriptions (multipart 'file' + 'model') -> { text }
|
|
// POST {base}/audio/speech ({ model, voice, input }) -> audio bytes
|
|
//
|
|
// Config is resolved per-request from headers (set by the client's voice settings),
|
|
// falling back to server env defaults. Mounted at /api/voice behind authenticateToken.
|
|
import { Readable } from 'node:stream';
|
|
|
|
import express from 'express';
|
|
|
|
const ENV = {
|
|
baseUrl: (process.env.VOICE_API_BASE_URL || '').replace(/\/$/, ''),
|
|
apiKey: process.env.VOICE_API_KEY || '',
|
|
sttModel: process.env.VOICE_STT_MODEL || 'whisper-1',
|
|
ttsModel: process.env.VOICE_TTS_MODEL || 'tts-1',
|
|
ttsVoice: process.env.VOICE_TTS_VOICE || 'alloy',
|
|
};
|
|
|
|
/**
|
|
* Resolve the voice backend config for a request. Client headers (set from the
|
|
* user's in-app voice settings) take precedence over the server env defaults.
|
|
* @param {import('express').Request} req
|
|
* @returns {{baseUrl: string, apiKey: string, sttModel: string, ttsModel: string, ttsVoice: string, ttsFormat: string}}
|
|
*/
|
|
function resolveConfig(req) {
|
|
const h = req.headers;
|
|
return {
|
|
// Security: do not allow clients to control the outbound backend host.
|
|
// Always use the server-side configured base URL.
|
|
baseUrl: ENV.baseUrl,
|
|
apiKey: String(h['x-voice-api-key'] || '') || ENV.apiKey,
|
|
sttModel: String(h['x-voice-stt-model'] || '') || ENV.sttModel,
|
|
ttsModel: String(h['x-voice-tts-model'] || '') || ENV.ttsModel,
|
|
ttsVoice: String(h['x-voice-tts-voice'] || '') || ENV.ttsVoice,
|
|
ttsFormat: String(h['x-voice-tts-format'] || '').trim(),
|
|
};
|
|
}
|
|
|
|
const router = express.Router();
|
|
|
|
// Generous by default — local TTS can synthesize long messages at ~real-time on CPU.
|
|
// Guard against a non-numeric/zero override that would make setTimeout fire immediately.
|
|
const DEFAULT_VOICE_TIMEOUT_MS = 300000;
|
|
const _parsedTimeout = Number(process.env.VOICE_TIMEOUT_MS);
|
|
const VOICE_TIMEOUT_MS = Number.isFinite(_parsedTimeout) && _parsedTimeout > 0
|
|
? _parsedTimeout
|
|
: DEFAULT_VOICE_TIMEOUT_MS;
|
|
|
|
/**
|
|
* fetch() with an AbortController timeout so a stalled backend can't hold the
|
|
* request open indefinitely. Aborts after VOICE_TIMEOUT_MS.
|
|
* @param {string} url
|
|
* @param {RequestInit} [options]
|
|
* @returns {Promise<Response>}
|
|
*/
|
|
async function fetchWithTimeout(url, options = {}) {
|
|
const parsed = new URL(url);
|
|
if (!['http:', 'https:'].includes(parsed.protocol) || !isAllowedBackendUrl(parsed.origin)) {
|
|
throw new Error('Blocked outbound voice backend URL');
|
|
}
|
|
const controller = new AbortController();
|
|
const timer = setTimeout(() => controller.abort(), VOICE_TIMEOUT_MS);
|
|
try {
|
|
return await fetch(parsed.toString(), { redirect: 'manual', ...options, signal: controller.signal });
|
|
} finally {
|
|
clearTimeout(timer);
|
|
}
|
|
}
|
|
|
|
/**
|
|
* Turn a backend fetch failure into a clear, actionable client response:
|
|
* 504 on timeout (AbortError), 502 otherwise.
|
|
* @param {import('express').Response} res
|
|
* @param {Error} e
|
|
*/
|
|
function backendError(res, e) {
|
|
if (e && e.name === 'AbortError') {
|
|
return res.status(504).json({
|
|
error: `Voice backend timed out after ${Math.round(VOICE_TIMEOUT_MS / 1000)}s. Check your voice backend.`,
|
|
});
|
|
}
|
|
return res.status(502).json({ error: `Voice backend unreachable: ${e.message}` });
|
|
}
|
|
|
|
/**
|
|
* SSRF guard for the user-configurable backend URL: allow http/https only and
|
|
* block the link-local / cloud-metadata range (169.254.x). localhost and private
|
|
* ranges are allowed on purpose so users can point at a local voice server
|
|
* (LocalAI, Speaches, Kokoro-FastAPI, etc.).
|
|
* @param {string} raw
|
|
* @returns {boolean}
|
|
*/
|
|
function isAllowedBackendUrl(raw) {
|
|
let u;
|
|
try {
|
|
u = new URL(raw);
|
|
} catch {
|
|
return false;
|
|
}
|
|
if (u.protocol !== 'http:' && u.protocol !== 'https:') return false;
|
|
if (u.hostname === '169.254.169.254' || u.hostname.startsWith('169.254.')) return false;
|
|
return true;
|
|
}
|
|
|
|
/**
|
|
* Relay an upstream (backend) error to the client without making an upstream
|
|
* 401/403 look like the user's own app login failed.
|
|
* @param {import('express').Response} res
|
|
* @param {number} status
|
|
* @param {string} [text]
|
|
*/
|
|
function upstreamError(res, status, text) {
|
|
if (status === 401 || status === 403) {
|
|
return res.status(502).json({ error: 'Voice backend rejected the request (check the API key).' });
|
|
}
|
|
return res.status(status).json({ error: text || 'voice backend error' });
|
|
}
|
|
|
|
let _upload = null;
|
|
/**
|
|
* Lazily build a memory-storage multer instance (25 MB cap) for audio uploads,
|
|
* so multer is only imported when the voice feature is actually used.
|
|
* @returns {Promise<import('multer').Multer>}
|
|
*/
|
|
async function getUpload() {
|
|
if (!_upload) {
|
|
const multer = (await import('multer')).default;
|
|
_upload = multer({ storage: multer.memoryStorage(), limits: { fileSize: 25 * 1024 * 1024 } });
|
|
}
|
|
return _upload;
|
|
}
|
|
|
|
/**
|
|
* Build the Authorization header for the backend, or an empty object when no
|
|
* key is configured (e.g. a local server that needs none).
|
|
* @param {string} apiKey
|
|
* @returns {Record<string, string>}
|
|
*/
|
|
function authHeader(apiKey) {
|
|
return apiKey ? { Authorization: `Bearer ${apiKey}` } : {};
|
|
}
|
|
|
|
/**
|
|
* GET /api/voice/health -> { configured } (true when a backend base URL is set).
|
|
*/
|
|
router.get('/health', (req, res) => {
|
|
res.json({ configured: Boolean(resolveConfig(req).baseUrl) });
|
|
});
|
|
|
|
/**
|
|
* POST /api/voice/transcribe (multipart 'audio') -> { text }.
|
|
* Forwards the uploaded audio to the backend's /audio/transcriptions endpoint.
|
|
*/
|
|
router.post('/transcribe', async (req, res) => {
|
|
const cfg = resolveConfig(req);
|
|
if (!cfg.baseUrl) return res.status(503).json({ error: 'No voice backend configured' });
|
|
if (!isAllowedBackendUrl(cfg.baseUrl)) return res.status(400).json({ error: 'Invalid voice backend URL.' });
|
|
const upload = await getUpload();
|
|
upload.single('audio')(req, res, async (err) => {
|
|
if (err) return res.status(400).json({ error: err.message });
|
|
if (!req.file) return res.status(400).json({ error: 'No audio uploaded' });
|
|
try {
|
|
const fd = new FormData();
|
|
fd.append(
|
|
'file',
|
|
new Blob([req.file.buffer], { type: req.file.mimetype || 'audio/webm' }),
|
|
req.file.originalname || 'recording.webm',
|
|
);
|
|
fd.append('model', cfg.sttModel);
|
|
const r = await fetchWithTimeout(`${cfg.baseUrl}/audio/transcriptions`, {
|
|
method: 'POST',
|
|
headers: authHeader(cfg.apiKey),
|
|
body: fd,
|
|
});
|
|
const text = await r.text();
|
|
if (!r.ok) return upstreamError(res, r.status, text);
|
|
let data;
|
|
try { data = JSON.parse(text); } catch { data = { text }; }
|
|
res.json({ text: data.text ?? '' });
|
|
} catch (e) {
|
|
backendError(res, e);
|
|
}
|
|
});
|
|
});
|
|
|
|
/**
|
|
* POST /api/voice/tts { text } -> audio bytes.
|
|
* Forwards the text to the backend's /audio/speech endpoint and streams the audio back.
|
|
*/
|
|
router.post('/tts', async (req, res) => {
|
|
const cfg = resolveConfig(req);
|
|
if (!cfg.baseUrl) return res.status(503).json({ error: 'No voice backend configured' });
|
|
if (!isAllowedBackendUrl(cfg.baseUrl)) return res.status(400).json({ error: 'Invalid voice backend URL.' });
|
|
const text = req.body?.text;
|
|
if (typeof text !== 'string' || !text.trim()) return res.status(400).json({ error: 'text required' });
|
|
try {
|
|
const r = await fetchWithTimeout(`${cfg.baseUrl}/audio/speech`, {
|
|
method: 'POST',
|
|
headers: { 'Content-Type': 'application/json', ...authHeader(cfg.apiKey) },
|
|
body: JSON.stringify({
|
|
model: cfg.ttsModel,
|
|
voice: cfg.ttsVoice,
|
|
input: text,
|
|
...(cfg.ttsFormat ? { response_format: cfg.ttsFormat } : {}),
|
|
}),
|
|
});
|
|
if (!r.ok) {
|
|
const errText = await r.text().catch(() => 'tts failed');
|
|
return upstreamError(res, r.status, errText);
|
|
}
|
|
res.setHeader('Content-Type', r.headers.get('content-type') || 'audio/mpeg');
|
|
res.setHeader('Cache-Control', 'no-store');
|
|
if (!r.body) return res.end();
|
|
Readable.fromWeb(r.body).on('error', (error) => res.destroy(error)).pipe(res);
|
|
} catch (e) {
|
|
backendError(res, e);
|
|
}
|
|
});
|
|
|
|
export default router;
|