From 0e6373305bb8c08e91a57ed7118ea1d5eff155a5 Mon Sep 17 00:00:00 2001 From: Haileyesus <118998054+blackmammoth@users.noreply.github.com> Date: Thu, 25 Jun 2026 17:10:42 +0300 Subject: [PATCH] fix(voice): separate client and server backends User-selected backend URLs must remain usable without letting clients control server requests. Call custom providers from the browser while keeping the server proxy bound to its configured host. This restores voice controls for frontend settings without reopening the SSRF path. --- server/voice-proxy.js | 4 +- .../chat/hooks/useVoiceAvailable.ts | 21 ++++--- src/components/chat/hooks/useVoiceInput.ts | 11 +--- src/hooks/useVoiceConfig.ts | 7 +-- src/i18n/locales/en/settings.json | 2 +- src/lib/voiceApi.ts | 60 +++++++++++++++++++ src/lib/voicePlayer.ts | 21 +++---- 7 files changed, 86 insertions(+), 40 deletions(-) create mode 100644 src/lib/voiceApi.ts diff --git a/server/voice-proxy.js b/server/voice-proxy.js index 149459fb..1ea4a6d8 100644 --- a/server/voice-proxy.js +++ b/server/voice-proxy.js @@ -29,7 +29,9 @@ const ENV = { function resolveConfig(req) { const h = req.headers; return { - baseUrl: (String(h['x-voice-base-url'] || '') || ENV.baseUrl).replace(/\/$/, ''), + // Security: do not allow clients to control the outbound backend host. + // Always use the server-side configured base URL. + baseUrl: ENV.baseUrl, apiKey: String(h['x-voice-api-key'] || '') || ENV.apiKey, sttModel: String(h['x-voice-stt-model'] || '') || ENV.sttModel, ttsModel: String(h['x-voice-tts-model'] || '') || ENV.ttsModel, diff --git a/src/components/chat/hooks/useVoiceAvailable.ts b/src/components/chat/hooks/useVoiceAvailable.ts index 0adccd0d..9ee92c48 100644 --- a/src/components/chat/hooks/useVoiceAvailable.ts +++ b/src/components/chat/hooks/useVoiceAvailable.ts @@ -1,31 +1,26 @@ import { useEffect, useState } from 'react'; import { authenticatedFetch } from '../../../utils/api'; -import { VOICE_CONFIG_SYNC_EVENT, voiceConfigHeaders } from '../../../hooks/useVoiceConfig'; +import { readVoiceConfig, VOICE_CONFIG_SYNC_EVENT } from '../../../hooks/useVoiceConfig'; // Voice UI is gated on the `voiceEnabled` UI preference (toggled in Quick Settings / // the Settings modal) and a configured voice backend. const STORAGE_KEY = 'uiPreferences'; const SYNC_EVENT = 'ui-preferences:sync'; -const healthRequests = new Map>(); +let healthRequest: Promise | null = null; function checkVoiceHealth(): Promise { - const baseUrl = voiceConfigHeaders()['x-voice-base-url']; - const signature = baseUrl || ''; - const pending = healthRequests.get(signature); - if (pending) return pending; - const request = authenticatedFetch('/api/voice/health', { - headers: baseUrl ? { 'x-voice-base-url': baseUrl } : {}, - }) + if (healthRequest) return healthRequest; + const request = authenticatedFetch('/api/voice/health') .then(async (response) => { if (!response.ok) throw new Error(`Voice health check failed (${response.status})`); const data = await response.json(); return data?.configured === true; }) .finally(() => { - healthRequests.delete(signature); + healthRequest = null; }); - healthRequests.set(signature, request); + healthRequest = request; return request; } @@ -65,6 +60,10 @@ export function useVoiceAvailable(): boolean { setAvailable(false); return; } + if (readVoiceConfig().baseUrl.trim()) { + setAvailable(true); + return; + } const id = ++requestId; try { const result = await checkVoiceHealth(); diff --git a/src/components/chat/hooks/useVoiceInput.ts b/src/components/chat/hooks/useVoiceInput.ts index 400612a0..6fcadd56 100644 --- a/src/components/chat/hooks/useVoiceInput.ts +++ b/src/components/chat/hooks/useVoiceInput.ts @@ -1,7 +1,6 @@ import { useCallback, useEffect, useRef, useState } from 'react'; -import { authenticatedFetch } from '../../../utils/api'; -import { voiceConfigHeaders } from '../../../hooks/useVoiceConfig'; +import { transcribeVoice } from '../../../lib/voiceApi'; // Mobile-safe recording: iOS Safari 18.4+ supports webm/opus; older iOS needs mp4. const MIME_CANDIDATES = [ @@ -97,13 +96,7 @@ export function useVoiceInput( setState('transcribing'); try { const ext = type.includes('mp4') ? 'm4a' : type.includes('ogg') ? 'ogg' : 'webm'; - const fd = new FormData(); - fd.append('audio', blob, `recording.${ext}`); - const res = await authenticatedFetch('/api/voice/transcribe', { - method: 'POST', - body: fd, - headers: voiceConfigHeaders(), - }); + const res = await transcribeVoice(blob, `recording.${ext}`); if (!res.ok) throw new Error(`transcribe ${res.status}`); const data = await res.json(); if (cancelledRef.current) return; diff --git a/src/hooks/useVoiceConfig.ts b/src/hooks/useVoiceConfig.ts index c9141f45..303b6467 100644 --- a/src/hooks/useVoiceConfig.ts +++ b/src/hooks/useVoiceConfig.ts @@ -13,7 +13,7 @@ const STORAGE_KEY = 'voiceConfig'; export const VOICE_CONFIG_SYNC_EVENT = 'voice-config:sync'; const DEFAULTS: VoiceConfig = { baseUrl: '', apiKey: '', sttModel: '', ttsModel: '', ttsVoice: '', ttsFormat: '' }; -function read(): VoiceConfig { +export function readVoiceConfig(): VoiceConfig { try { const raw = localStorage.getItem(STORAGE_KEY); if (!raw) return { ...DEFAULTS }; @@ -33,9 +33,8 @@ function read(): VoiceConfig { // Empty fields are omitted so the server's env defaults apply. export function voiceConfigHeaders(): Record { if (typeof window === 'undefined') return {}; - const c = read(); + const c = readVoiceConfig(); const h: Record = {}; - if (c.baseUrl) h['x-voice-base-url'] = c.baseUrl; if (c.apiKey) h['x-voice-api-key'] = c.apiKey; if (c.sttModel) h['x-voice-stt-model'] = c.sttModel; if (c.ttsModel) h['x-voice-tts-model'] = c.ttsModel; @@ -46,7 +45,7 @@ export function voiceConfigHeaders(): Record { export function useVoiceConfig() { const [config, setConfig] = useState(() => - typeof window === 'undefined' ? { ...DEFAULTS } : read(), + typeof window === 'undefined' ? { ...DEFAULTS } : readVoiceConfig(), ); const update = (patch: Partial) => { diff --git a/src/i18n/locales/en/settings.json b/src/i18n/locales/en/settings.json index 11df5929..2d9772b1 100644 --- a/src/i18n/locales/en/settings.json +++ b/src/i18n/locales/en/settings.json @@ -63,7 +63,7 @@ "ttsModel": "Text-to-speech model", "voice": "Voice", "format": "Audio format", - "note": "The shown defaults work with OpenAI once you add a key. For other providers, set the base URL, model names, and audio format to match." + "note": "A custom base URL is called directly by your browser and must allow browser CORS requests. Leave it blank to use the server-configured backend." }, "quickSettings": { "title": "Quick Settings", diff --git a/src/lib/voiceApi.ts b/src/lib/voiceApi.ts new file mode 100644 index 00000000..3f9549b4 --- /dev/null +++ b/src/lib/voiceApi.ts @@ -0,0 +1,60 @@ +import { authenticatedFetch } from '../utils/api'; +import { readVoiceConfig, voiceConfigHeaders } from '../hooks/useVoiceConfig'; + +function directUrl(baseUrl: string, path: string): string { + return `${baseUrl.replace(/\/$/, '')}${path}`; +} + +export function voiceConfigSignature(): string { + return JSON.stringify(readVoiceConfig()); +} + +export function transcribeVoice(blob: Blob, filename: string): Promise { + const config = readVoiceConfig(); + const body = new FormData(); + + if (config.baseUrl.trim()) { + body.append('file', blob, filename); + body.append('model', config.sttModel || 'whisper-1'); + return fetch(directUrl(config.baseUrl.trim(), '/audio/transcriptions'), { + method: 'POST', + headers: config.apiKey ? { Authorization: `Bearer ${config.apiKey}` } : {}, + body, + }); + } + + body.append('audio', blob, filename); + return authenticatedFetch('/api/voice/transcribe', { + method: 'POST', + headers: voiceConfigHeaders(), + body, + }); +} + +export function synthesizeVoice(text: string, signal: AbortSignal): Promise { + const config = readVoiceConfig(); + + if (config.baseUrl.trim()) { + return fetch(directUrl(config.baseUrl.trim(), '/audio/speech'), { + method: 'POST', + headers: { + 'Content-Type': 'application/json', + ...(config.apiKey ? { Authorization: `Bearer ${config.apiKey}` } : {}), + }, + body: JSON.stringify({ + model: config.ttsModel || 'tts-1', + voice: config.ttsVoice || 'alloy', + input: text, + ...(config.ttsFormat.trim() ? { response_format: config.ttsFormat.trim() } : {}), + }), + signal, + }); + } + + return authenticatedFetch('/api/voice/tts', { + method: 'POST', + body: JSON.stringify({ text }), + headers: voiceConfigHeaders(), + signal, + }); +} diff --git a/src/lib/voicePlayer.ts b/src/lib/voicePlayer.ts index b09f5170..4c239c29 100644 --- a/src/lib/voicePlayer.ts +++ b/src/lib/voicePlayer.ts @@ -1,5 +1,4 @@ -import { authenticatedFetch } from '../utils/api'; -import { voiceConfigHeaders } from '../hooks/useVoiceConfig'; +import { synthesizeVoice, voiceConfigSignature } from './voiceApi'; // A single app-level audio player for read-aloud. It owns one