From 0e6373305bb8c08e91a57ed7118ea1d5eff155a5 Mon Sep 17 00:00:00 2001
From: Haileyesus <118998054+blackmammoth@users.noreply.github.com>
Date: Thu, 25 Jun 2026 17:10:42 +0300
Subject: [PATCH] fix(voice): separate client and server backends

User-selected backend URLs must remain usable without letting clients control server requests.

Call custom providers from the browser while keeping the server proxy bound to its configured host.

This restores voice controls for frontend settings without reopening the SSRF path.
---
 server/voice-proxy.js                         |  4 +-
 .../chat/hooks/useVoiceAvailable.ts           | 21 ++++---
 src/components/chat/hooks/useVoiceInput.ts    | 11 +---
 src/hooks/useVoiceConfig.ts                   |  7 +--
 src/i18n/locales/en/settings.json             |  2 +-
 src/lib/voiceApi.ts                           | 60 +++++++++++++++++++
 src/lib/voicePlayer.ts                        | 21 +++----
 7 files changed, 86 insertions(+), 40 deletions(-)
 create mode 100644 src/lib/voiceApi.ts
diff --git a/server/voice-proxy.js b/server/voice-proxy.js
index 149459fb..1ea4a6d8 100644
--- a/server/voice-proxy.js
+++ b/server/voice-proxy.js
@@ -29,7 +29,9 @@ const ENV = {
 function resolveConfig(req) {
   const h = req.headers;
   return {
-    baseUrl: (String(h['x-voice-base-url'] || '') || ENV.baseUrl).replace(/\/$/, ''),
+    // Security: do not allow clients to control the outbound backend host.
+    // Always use the server-side configured base URL.
+    baseUrl: ENV.baseUrl,
     apiKey: String(h['x-voice-api-key'] || '') || ENV.apiKey,
     sttModel: String(h['x-voice-stt-model'] || '') || ENV.sttModel,
     ttsModel: String(h['x-voice-tts-model'] || '') || ENV.ttsModel,
diff --git a/src/components/chat/hooks/useVoiceAvailable.ts b/src/components/chat/hooks/useVoiceAvailable.ts
index 0adccd0d..9ee92c48 100644
--- a/src/components/chat/hooks/useVoiceAvailable.ts
+++ b/src/components/chat/hooks/useVoiceAvailable.ts
@@ -1,31 +1,26 @@
 import { useEffect, useState } from 'react';
 
 import { authenticatedFetch } from '../../../utils/api';
-import { VOICE_CONFIG_SYNC_EVENT, voiceConfigHeaders } from '../../../hooks/useVoiceConfig';
+import { readVoiceConfig, VOICE_CONFIG_SYNC_EVENT } from '../../../hooks/useVoiceConfig';
 
 // Voice UI is gated on the `voiceEnabled` UI preference (toggled in Quick Settings /
 // the Settings modal) and a configured voice backend.
 const STORAGE_KEY = 'uiPreferences';
 const SYNC_EVENT = 'ui-preferences:sync';
-const healthRequests = new Map<string, Promise<boolean>>();
+let healthRequest: Promise<boolean> | null = null;
 
 function checkVoiceHealth(): Promise<boolean> {
-  const baseUrl = voiceConfigHeaders()['x-voice-base-url'];
-  const signature = baseUrl || '';
-  const pending = healthRequests.get(signature);
-  if (pending) return pending;
-  const request = authenticatedFetch('/api/voice/health', {
-    headers: baseUrl ? { 'x-voice-base-url': baseUrl } : {},
-  })
+  if (healthRequest) return healthRequest;
+  const request = authenticatedFetch('/api/voice/health')
     .then(async (response) => {
       if (!response.ok) throw new Error(`Voice health check failed (${response.status})`);
       const data = await response.json();
       return data?.configured === true;
     })
     .finally(() => {
-      healthRequests.delete(signature);
+      healthRequest = null;
     });
-  healthRequests.set(signature, request);
+  healthRequest = request;
   return request;
 }
 
@@ -65,6 +60,10 @@ export function useVoiceAvailable(): boolean {
         setAvailable(false);
         return;
       }
+      if (readVoiceConfig().baseUrl.trim()) {
+        setAvailable(true);
+        return;
+      }
       const id = ++requestId;
       try {
         const result = await checkVoiceHealth();
diff --git a/src/components/chat/hooks/useVoiceInput.ts b/src/components/chat/hooks/useVoiceInput.ts
index 400612a0..6fcadd56 100644
--- a/src/components/chat/hooks/useVoiceInput.ts
+++ b/src/components/chat/hooks/useVoiceInput.ts
@@ -1,7 +1,6 @@
 import { useCallback, useEffect, useRef, useState } from 'react';
 
-import { authenticatedFetch } from '../../../utils/api';
-import { voiceConfigHeaders } from '../../../hooks/useVoiceConfig';
+import { transcribeVoice } from '../../../lib/voiceApi';
 
 // Mobile-safe recording: iOS Safari 18.4+ supports webm/opus; older iOS needs mp4.
 const MIME_CANDIDATES = [
@@ -97,13 +96,7 @@ export function useVoiceInput(
         setState('transcribing');
         try {
           const ext = type.includes('mp4') ? 'm4a' : type.includes('ogg') ? 'ogg' : 'webm';
-          const fd = new FormData();
-          fd.append('audio', blob, `recording.${ext}`);
-          const res = await authenticatedFetch('/api/voice/transcribe', {
-            method: 'POST',
-            body: fd,
-            headers: voiceConfigHeaders(),
-          });
+          const res = await transcribeVoice(blob, `recording.${ext}`);
           if (!res.ok) throw new Error(`transcribe ${res.status}`);
           const data = await res.json();
           if (cancelledRef.current) return;
diff --git a/src/hooks/useVoiceConfig.ts b/src/hooks/useVoiceConfig.ts
index c9141f45..303b6467 100644
--- a/src/hooks/useVoiceConfig.ts
+++ b/src/hooks/useVoiceConfig.ts
@@ -13,7 +13,7 @@ const STORAGE_KEY = 'voiceConfig';
 export const VOICE_CONFIG_SYNC_EVENT = 'voice-config:sync';
 const DEFAULTS: VoiceConfig = { baseUrl: '', apiKey: '', sttModel: '', ttsModel: '', ttsVoice: '', ttsFormat: '' };
 
-function read(): VoiceConfig {
+export function readVoiceConfig(): VoiceConfig {
   try {
     const raw = localStorage.getItem(STORAGE_KEY);
     if (!raw) return { ...DEFAULTS };
@@ -33,9 +33,8 @@ function read(): VoiceConfig {
 // Empty fields are omitted so the server's env defaults apply.
 export function voiceConfigHeaders(): Record<string, string> {
   if (typeof window === 'undefined') return {};
-  const c = read();
+  const c = readVoiceConfig();
   const h: Record<string, string> = {};
-  if (c.baseUrl) h['x-voice-base-url'] = c.baseUrl;
   if (c.apiKey) h['x-voice-api-key'] = c.apiKey;
   if (c.sttModel) h['x-voice-stt-model'] = c.sttModel;
   if (c.ttsModel) h['x-voice-tts-model'] = c.ttsModel;
@@ -46,7 +45,7 @@ export function voiceConfigHeaders(): Record<string, string> {
 
 export function useVoiceConfig() {
   const [config, setConfig] = useState<VoiceConfig>(() =>
-    typeof window === 'undefined' ? { ...DEFAULTS } : read(),
+    typeof window === 'undefined' ? { ...DEFAULTS } : readVoiceConfig(),
   );
 
   const update = (patch: Partial<VoiceConfig>) => {
diff --git a/src/i18n/locales/en/settings.json b/src/i18n/locales/en/settings.json
index 11df5929..2d9772b1 100644
--- a/src/i18n/locales/en/settings.json
+++ b/src/i18n/locales/en/settings.json
@@ -63,7 +63,7 @@
     "ttsModel": "Text-to-speech model",
     "voice": "Voice",
     "format": "Audio format",
-    "note": "The shown defaults work with OpenAI once you add a key. For other providers, set the base URL, model names, and audio format to match."
+    "note": "A custom base URL is called directly by your browser and must allow browser CORS requests. Leave it blank to use the server-configured backend."
   },
   "quickSettings": {
     "title": "Quick Settings",
diff --git a/src/lib/voiceApi.ts b/src/lib/voiceApi.ts
new file mode 100644
index 00000000..3f9549b4
--- /dev/null
+++ b/src/lib/voiceApi.ts
@@ -0,0 +1,93 @@
+import { authenticatedFetch } from '../utils/api';
+import { readVoiceConfig, voiceConfigHeaders } from '../hooks/useVoiceConfig';
+
+// Voice requests take one of two routes, chosen by the stored voice config:
+//  - base URL set: the browser calls that provider's OpenAI-style endpoints
+//    directly, so the provider must allow CORS and receives the API key;
+//  - base URL blank: the request goes through this app's /api/voice/* routes.
+
+/** Joins a provider base URL and an absolute path, ignoring trailing slashes on the base. */
+function directUrl(baseUrl: string, path: string): string {
+  // `/+$` rather than `/$`: "https://host/v1//" must not become "https://host/v1//audio/...".
+  return `${baseUrl.replace(/\/+$/, '')}${path}`;
+}
+
+/** Authorization header for direct provider calls; empty when no API key is stored. */
+function bearerHeaders(apiKey: string): Record<string, string> {
+  return apiKey ? { Authorization: `Bearer ${apiKey}` } : {};
+}
+
+/**
+ * Serializes the entire stored voice config. voicePlayer folds this into its audio
+ * id / cache key, so changing any setting changes the key input.
+ */
+export function voiceConfigSignature(): string {
+  return JSON.stringify(readVoiceConfig());
+}
+
+/**
+ * Sends recorded audio for speech-to-text.
+ *
+ * @param blob - Recorded audio to transcribe.
+ * @param filename - File name attached to the multipart upload.
+ * @returns The raw `Response`, left unparsed for the caller.
+ */
+export function transcribeVoice(blob: Blob, filename: string): Promise<Response> {
+  const config = readVoiceConfig();
+  const baseUrl = config.baseUrl.trim();
+  const body = new FormData();
+
+  if (baseUrl) {
+    // Direct provider call: the upload goes in `file`, next to the `model` to use.
+    body.append('file', blob, filename);
+    body.append('model', config.sttModel.trim() || 'whisper-1');
+    return fetch(directUrl(baseUrl, '/audio/transcriptions'), {
+      method: 'POST',
+      headers: bearerHeaders(config.apiKey),
+      body,
+    });
+  }
+
+  // Proxy call: the upload goes in `audio`; overrides travel as x-voice-* headers.
+  body.append('audio', blob, filename);
+  return authenticatedFetch('/api/voice/transcribe', {
+    method: 'POST',
+    headers: voiceConfigHeaders(),
+    body,
+  });
+}
+
+/**
+ * Requests text-to-speech audio for `text`.
+ *
+ * @param text - Text to synthesize.
+ * @param signal - Abort signal for the request; voicePlayer aborts it on its client timeout.
+ * @returns The raw `Response`, left unparsed for the caller.
+ */
+export function synthesizeVoice(text: string, signal: AbortSignal): Promise<Response> {
+  const config = readVoiceConfig();
+  const baseUrl = config.baseUrl.trim();
+
+  if (baseUrl) {
+    const format = config.ttsFormat.trim();
+    return fetch(directUrl(baseUrl, '/audio/speech'), {
+      method: 'POST',
+      headers: { 'Content-Type': 'application/json', ...bearerHeaders(config.apiKey) },
+      body: JSON.stringify({
+        model: config.ttsModel.trim() || 'tts-1',
+        voice: config.ttsVoice.trim() || 'alloy',
+        input: text,
+        // Leave response_format out entirely when no format is configured.
+        ...(format ? { response_format: format } : {}),
+      }),
+      signal,
+    });
+  }
+
+  return authenticatedFetch('/api/voice/tts', {
+    method: 'POST',
+    body: JSON.stringify({ text }),
+    headers: voiceConfigHeaders(),
+    signal,
+  });
+}
diff --git a/src/lib/voicePlayer.ts b/src/lib/voicePlayer.ts
index b09f5170..4c239c29 100644
--- a/src/lib/voicePlayer.ts
+++ b/src/lib/voicePlayer.ts
@@ -1,5 +1,4 @@
-import { authenticatedFetch } from '../utils/api';
-import { voiceConfigHeaders } from '../hooks/useVoiceConfig';
+import { synthesizeVoice, voiceConfigSignature } from './voiceApi';
 
 // A single app-level audio player for read-aloud. It owns one <audio> element, lives
 // outside the React tree, and caches generated audio by content. Because playback is not
@@ -16,8 +15,8 @@ const CACHE_MAX = 24;
 const CLIENT_TIMEOUT_MS = 330000; // backstop; the server proxy already times out at 5 min
 
 // Stable id / cache key from the text and voice settings that affect its audio (djb2).
-export function voiceId(content: string, headers = voiceConfigHeaders()): string {
-  const input = JSON.stringify([content, Object.entries(headers).sort(([a], [b]) => a.localeCompare(b))]);
+export function voiceId(content: string, signature = voiceConfigSignature()): string {
+  const input = JSON.stringify([content, signature]);
   let h = 5381;
   for (let i = 0; i < input.length; i++) h = (((h << 5) + h) + input.charCodeAt(i)) | 0;
   return (h >>> 0).toString(36);
@@ -82,13 +81,12 @@ class VoicePlayer {
   }
 
   toggle(content: string) {
-    const headers = voiceConfigHeaders();
-    const id = voiceId(content, headers);
+    const id = voiceId(content);
     if (this.currentId === id && (this.state === 'playing' || this.state === 'loading')) {
       this.stop();
       return;
     }
-    void this.play(id, content, headers);
+    void this.play(id, content);
   }
 
   stop() {
@@ -131,7 +129,7 @@ class VoicePlayer {
     }, 6000);
   }
 
-  private async play(id: string, content: string, headers: Record<string, string>) {
+  private async play(id: string, content: string) {
     const audio = this.ensureAudio();
     audio.pause();
     this.currentId = id;
@@ -149,12 +147,7 @@ class VoicePlayer {
         const controller = new AbortController();
         this.activeController = controller;
         const timer = setTimeout(() => controller.abort(), CLIENT_TIMEOUT_MS);
-        const res = await authenticatedFetch('/api/voice/tts', {
-          method: 'POST',
-          body: JSON.stringify({ text: content }),
-          headers,
-          signal: controller.signal,
-        }).finally(() => {
+        const res = await synthesizeVoice(content, controller.signal).finally(() => {
           clearTimeout(timer);
           if (this.activeController === controller) this.activeController = null;
         });