fix(voice): separate client and server backends

User-selected backend URLs must remain usable without letting clients control server requests. Call custom providers from the browser while keeping the server proxy bound to its configured host. This restores voice controls for frontend settings without reopening the SSRF path.
2026-06-26 05:15:48 +08:00 · 2026-06-25 17:10:42 +03:00
parent 43c0cca96e
commit 0e6373305b
7 changed files with 86 additions and 40 deletions
--- a/server/voice-proxy.js
+++ b/server/voice-proxy.js
@@ -29,7 +29,9 @@ const ENV = {
 function resolveConfig(req) {
  const h = req.headers;
  return {
-    baseUrl: (String(h['x-voice-base-url'] || '') || ENV.baseUrl).replace(/\/$/, ''),
+    // Security: do not allow clients to control the outbound backend host.
+    // Always use the server-side configured base URL.
+    baseUrl: ENV.baseUrl,
    apiKey: String(h['x-voice-api-key'] || '') || ENV.apiKey,
    sttModel: String(h['x-voice-stt-model'] || '') || ENV.sttModel,
    ttsModel: String(h['x-voice-tts-model'] || '') || ENV.ttsModel,
--- a/src/components/chat/hooks/useVoiceAvailable.ts
+++ b/src/components/chat/hooks/useVoiceAvailable.ts
@@ -1,31 +1,26 @@
 import { useEffect, useState } from 'react';

 import { authenticatedFetch } from '../../../utils/api';
-import { VOICE_CONFIG_SYNC_EVENT, voiceConfigHeaders } from '../../../hooks/useVoiceConfig';
+import { readVoiceConfig, VOICE_CONFIG_SYNC_EVENT } from '../../../hooks/useVoiceConfig';

 // Voice UI is gated on the `voiceEnabled` UI preference (toggled in Quick Settings /
 // the Settings modal) and a configured voice backend.
 const STORAGE_KEY = 'uiPreferences';
 const SYNC_EVENT = 'ui-preferences:sync';
-const healthRequests = new Map<string, Promise<boolean>>();
+let healthRequest: Promise<boolean> | null = null;

 function checkVoiceHealth(): Promise<boolean> {
-  const baseUrl = voiceConfigHeaders()['x-voice-base-url'];
-  const signature = baseUrl || '';
-  const pending = healthRequests.get(signature);
-  if (pending) return pending;
-  const request = authenticatedFetch('/api/voice/health', {
-    headers: baseUrl ? { 'x-voice-base-url': baseUrl } : {},
-  })
+  if (healthRequest) return healthRequest; // a check is already in flight: share it instead of issuing another request
+  const request = authenticatedFetch('/api/voice/health') // no base-URL header is sent any more, so one shared slot replaces the per-URL map
    .then(async (response) => {
      if (!response.ok) throw new Error(`Voice health check failed (${response.status})`);
      const data = await response.json();
      return data?.configured === true;
    })
    .finally(() => {
-      healthRequests.delete(signature);
+      healthRequest = null; // settled (success or failure): the next call starts a fresh check
    });
-  healthRequests.set(signature, request);
+  healthRequest = request; // remember the pending promise so concurrent callers reuse it
  return request;
 }

@@ -65,6 +60,10 @@ export function useVoiceAvailable(): boolean {
        setAvailable(false);
        return;
      }
+      if (readVoiceConfig().baseUrl.trim()) {
+        setAvailable(true);
+        return;
+      }
      const id = ++requestId;
      try {
        const result = await checkVoiceHealth();
--- a/src/components/chat/hooks/useVoiceInput.ts
+++ b/src/components/chat/hooks/useVoiceInput.ts
@@ -1,7 +1,6 @@
 import { useCallback, useEffect, useRef, useState } from 'react';

-import { authenticatedFetch } from '../../../utils/api';
-import { voiceConfigHeaders } from '../../../hooks/useVoiceConfig';
+import { transcribeVoice } from '../../../lib/voiceApi';

 // Mobile-safe recording: iOS Safari 18.4+ supports webm/opus; older iOS needs mp4.
 const MIME_CANDIDATES = [
@@ -97,13 +96,7 @@ export function useVoiceInput(
        setState('transcribing');
        try {
          const ext = type.includes('mp4') ? 'm4a' : type.includes('ogg') ? 'ogg' : 'webm';
-          const fd = new FormData();
-          fd.append('audio', blob, `recording.${ext}`);
-          const res = await authenticatedFetch('/api/voice/transcribe', {
-            method: 'POST',
-            body: fd,
-            headers: voiceConfigHeaders(),
-          });
+          const res = await transcribeVoice(blob, `recording.${ext}`);
          if (!res.ok) throw new Error(`transcribe ${res.status}`);
          const data = await res.json();
          if (cancelledRef.current) return;
--- a/src/hooks/useVoiceConfig.ts
+++ b/src/hooks/useVoiceConfig.ts
@@ -13,7 +13,7 @@ const STORAGE_KEY = 'voiceConfig';
 export const VOICE_CONFIG_SYNC_EVENT = 'voice-config:sync';
 const DEFAULTS: VoiceConfig = { baseUrl: '', apiKey: '', sttModel: '', ttsModel: '', ttsVoice: '', ttsFormat: '' };

-function read(): VoiceConfig {
+export function readVoiceConfig(): VoiceConfig {
  try {
    const raw = localStorage.getItem(STORAGE_KEY);
    if (!raw) return { ...DEFAULTS };
@@ -33,9 +33,8 @@ function read(): VoiceConfig {
 // Empty fields are omitted so the server's env defaults apply.
 export function voiceConfigHeaders(): Record<string, string> {
  if (typeof window === 'undefined') return {};
-  const c = read();
+  const c = readVoiceConfig();
  const h: Record<string, string> = {};
-  if (c.baseUrl) h['x-voice-base-url'] = c.baseUrl;
  if (c.apiKey) h['x-voice-api-key'] = c.apiKey;
  if (c.sttModel) h['x-voice-stt-model'] = c.sttModel;
  if (c.ttsModel) h['x-voice-tts-model'] = c.ttsModel;
@@ -46,7 +45,7 @@ export function voiceConfigHeaders(): Record<string, string> {

 export function useVoiceConfig() {
  const [config, setConfig] = useState<VoiceConfig>(() =>
-    typeof window === 'undefined' ? { ...DEFAULTS } : read(),
+    typeof window === 'undefined' ? { ...DEFAULTS } : readVoiceConfig(),
  );

  const update = (patch: Partial<VoiceConfig>) => {
--- a/src/i18n/locales/en/settings.json
+++ b/src/i18n/locales/en/settings.json
@@ -63,7 +63,7 @@
    "ttsModel": "Text-to-speech model",
    "voice": "Voice",
    "format": "Audio format",
-    "note": "The shown defaults work with OpenAI once you add a key. For other providers, set the base URL, model names, and audio format to match."
+    "note": "A custom base URL is called directly by your browser and must allow browser CORS requests. Leave it blank to use the server-configured backend."
  },
  "quickSettings": {
    "title": "Quick Settings",
--- a/src/lib/voiceApi.ts
+++ b/src/lib/voiceApi.ts
@@ -0,0 +1,60 @@
+import { authenticatedFetch } from '../utils/api';
+import { readVoiceConfig, voiceConfigHeaders } from '../hooks/useVoiceConfig';
+
+function directUrl(baseUrl: string, path: string): string { // Builds the endpoint URL for a direct (browser-to-provider) call.
+  return `${baseUrl.replace(/\/$/, '')}${path}`; // drops at most one trailing "/" from baseUrl, then appends path exactly as given
+}
+
+export function voiceConfigSignature(): string { // Serialized snapshot of the stored voice config; voiceId() folds it into its cache key.
+  return JSON.stringify(readVoiceConfig()); // the whole config object is serialized, so changing any stored field changes the signature
+}
+
+export function transcribeVoice(blob: Blob, filename: string): Promise<Response> { // Speech-to-text: direct call when a custom base URL is stored, server proxy otherwise; resolves to the raw Response.
+  const config = readVoiceConfig();
+  const body = new FormData();
+
+  if (config.baseUrl.trim()) { // non-blank base URL: call the custom backend from the browser, bypassing the server proxy
+    body.append('file', blob, filename); // the direct request carries the audio under "file" plus an explicit "model" field
+    body.append('model', config.sttModel || 'whisper-1'); // an empty model setting falls back to 'whisper-1'
+    return fetch(directUrl(config.baseUrl.trim(), '/audio/transcriptions'), { // NOTE(review): no AbortSignal or timeout on this path
+      method: 'POST',
+      headers: config.apiKey ? { Authorization: `Bearer ${config.apiKey}` } : {}, // the stored API key, if any, is sent straight to the custom backend
+      body,
+    });
+  }
+
+  body.append('audio', blob, filename); // the proxy route receives the audio under "audio"
+  return authenticatedFetch('/api/voice/transcribe', {
+    method: 'POST',
+    headers: voiceConfigHeaders(), // non-empty settings other than the base URL travel as x-voice-* headers
+    body,
+  });
+}
+
+export function synthesizeVoice(text: string, signal: AbortSignal): Promise<Response> { // Text-to-speech: direct call when a custom base URL is stored, server proxy otherwise; resolves to the raw Response.
+  const config = readVoiceConfig();
+
+  if (config.baseUrl.trim()) { // non-blank base URL: POST JSON to the custom backend directly from the browser
+    return fetch(directUrl(config.baseUrl.trim(), '/audio/speech'), {
+      method: 'POST',
+      headers: {
+        'Content-Type': 'application/json',
+        ...(config.apiKey ? { Authorization: `Bearer ${config.apiKey}` } : {}), // Authorization is omitted when no key is stored
+      },
+      body: JSON.stringify({
+        model: config.ttsModel || 'tts-1', // empty model / voice settings fall back to 'tts-1' / 'alloy'
+        voice: config.ttsVoice || 'alloy',
+        input: text,
+        ...(config.ttsFormat.trim() ? { response_format: config.ttsFormat.trim() } : {}), // response_format is sent only when a format is set
+      }),
+      signal, // lets the caller abort the in-flight request
+    });
+  }
+
+  return authenticatedFetch('/api/voice/tts', { // no custom base URL: go through the server proxy
+    method: 'POST',
+    body: JSON.stringify({ text }),
+    headers: voiceConfigHeaders(), // non-empty settings other than the base URL travel as x-voice-* headers
+    signal,
+  });
+}
--- a/src/lib/voicePlayer.ts
+++ b/src/lib/voicePlayer.ts
@@ -1,5 +1,4 @@
-import { authenticatedFetch } from '../utils/api';
-import { voiceConfigHeaders } from '../hooks/useVoiceConfig';
+import { synthesizeVoice, voiceConfigSignature } from './voiceApi';

 // A single app-level audio player for read-aloud. It owns one <audio> element, lives
 // outside the React tree, and caches generated audio by content. Because playback is not
@@ -16,8 +15,8 @@ const CACHE_MAX = 24;
 const CLIENT_TIMEOUT_MS = 330000; // backstop; the server proxy already times out at 5 min

 // Stable id / cache key from the text and voice settings that affect its audio (djb2).
-export function voiceId(content: string, headers = voiceConfigHeaders()): string {
-  const input = JSON.stringify([content, Object.entries(headers).sort(([a], [b]) => a.localeCompare(b))]);
+export function voiceId(content: string, signature = voiceConfigSignature()): string {
+  const input = JSON.stringify([content, signature]);
  let h = 5381;
  for (let i = 0; i < input.length; i++) h = (((h << 5) + h) + input.charCodeAt(i)) | 0;
  return (h >>> 0).toString(36);
@@ -82,13 +81,12 @@ class VoicePlayer {
  }

  toggle(content: string) {
-    const headers = voiceConfigHeaders();
-    const id = voiceId(content, headers);
+    const id = voiceId(content); // voiceId's default signature argument reads the current voice config
    if (this.currentId === id && (this.state === 'playing' || this.state === 'loading')) {
      this.stop();
      return;
    }
-    void this.play(id, content, headers);
+    void this.play(id, content); // fire-and-forget; play() no longer takes request headers
  }

  stop() {
@@ -131,7 +129,7 @@ class VoicePlayer {
    }, 6000);
  }

-  private async play(id: string, content: string, headers: Record<string, string>) {
+  private async play(id: string, content: string) {
    const audio = this.ensureAudio();
    audio.pause();
    this.currentId = id;
@@ -149,12 +147,7 @@ class VoicePlayer {
        const controller = new AbortController();
        this.activeController = controller;
        const timer = setTimeout(() => controller.abort(), CLIENT_TIMEOUT_MS);
-        const res = await authenticatedFetch('/api/voice/tts', {
-          method: 'POST',
-          body: JSON.stringify({ text: content }),
-          headers,
-          signal: controller.signal,
-        }).finally(() => {
+        const res = await synthesizeVoice(content, controller.signal).finally(() => {
          clearTimeout(timer);
          if (this.activeController === controller) this.activeController = null;
        });