diff --git a/server/voice-proxy.js b/server/voice-proxy.js index 149459fb..1ea4a6d8 100644 --- a/server/voice-proxy.js +++ b/server/voice-proxy.js @@ -29,7 +29,9 @@ const ENV = { function resolveConfig(req) { const h = req.headers; return { - baseUrl: (String(h['x-voice-base-url'] || '') || ENV.baseUrl).replace(/\/$/, ''), + // Security: do not allow clients to control the outbound backend host. + // Always use the server-side configured base URL. + baseUrl: ENV.baseUrl, apiKey: String(h['x-voice-api-key'] || '') || ENV.apiKey, sttModel: String(h['x-voice-stt-model'] || '') || ENV.sttModel, ttsModel: String(h['x-voice-tts-model'] || '') || ENV.ttsModel, diff --git a/src/components/chat/hooks/useVoiceAvailable.ts b/src/components/chat/hooks/useVoiceAvailable.ts index 0adccd0d..9ee92c48 100644 --- a/src/components/chat/hooks/useVoiceAvailable.ts +++ b/src/components/chat/hooks/useVoiceAvailable.ts @@ -1,31 +1,26 @@ import { useEffect, useState } from 'react'; import { authenticatedFetch } from '../../../utils/api'; -import { VOICE_CONFIG_SYNC_EVENT, voiceConfigHeaders } from '../../../hooks/useVoiceConfig'; +import { readVoiceConfig, VOICE_CONFIG_SYNC_EVENT } from '../../../hooks/useVoiceConfig'; // Voice UI is gated on the `voiceEnabled` UI preference (toggled in Quick Settings / // the Settings modal) and a configured voice backend. const STORAGE_KEY = 'uiPreferences'; const SYNC_EVENT = 'ui-preferences:sync'; -const healthRequests = new Map>(); +let healthRequest: Promise | null = null; function checkVoiceHealth(): Promise { - const baseUrl = voiceConfigHeaders()['x-voice-base-url']; - const signature = baseUrl || ''; - const pending = healthRequests.get(signature); - if (pending) return pending; - const request = authenticatedFetch('/api/voice/health', { - headers: baseUrl ? { 'x-voice-base-url': baseUrl } : {}, - }) + if (healthRequest) return healthRequest; + const request = authenticatedFetch('/api/voice/health') .then(async (response) => { if (!response.ok) throw new Error(`Voice health check failed (${response.status})`); const data = await response.json(); return data?.configured === true; }) .finally(() => { - healthRequests.delete(signature); + healthRequest = null; }); - healthRequests.set(signature, request); + healthRequest = request; return request; } @@ -65,6 +60,10 @@ export function useVoiceAvailable(): boolean { setAvailable(false); return; } + if (readVoiceConfig().baseUrl.trim()) { + setAvailable(true); + return; + } const id = ++requestId; try { const result = await checkVoiceHealth(); diff --git a/src/components/chat/hooks/useVoiceInput.ts b/src/components/chat/hooks/useVoiceInput.ts index 400612a0..6fcadd56 100644 --- a/src/components/chat/hooks/useVoiceInput.ts +++ b/src/components/chat/hooks/useVoiceInput.ts @@ -1,7 +1,6 @@ import { useCallback, useEffect, useRef, useState } from 'react'; -import { authenticatedFetch } from '../../../utils/api'; -import { voiceConfigHeaders } from '../../../hooks/useVoiceConfig'; +import { transcribeVoice } from '../../../lib/voiceApi'; // Mobile-safe recording: iOS Safari 18.4+ supports webm/opus; older iOS needs mp4. const MIME_CANDIDATES = [ @@ -97,13 +96,7 @@ export function useVoiceInput( setState('transcribing'); try { const ext = type.includes('mp4') ? 'm4a' : type.includes('ogg') ? 'ogg' : 'webm'; - const fd = new FormData(); - fd.append('audio', blob, `recording.${ext}`); - const res = await authenticatedFetch('/api/voice/transcribe', { - method: 'POST', - body: fd, - headers: voiceConfigHeaders(), - }); + const res = await transcribeVoice(blob, `recording.${ext}`); if (!res.ok) throw new Error(`transcribe ${res.status}`); const data = await res.json(); if (cancelledRef.current) return; diff --git a/src/hooks/useVoiceConfig.ts b/src/hooks/useVoiceConfig.ts index c9141f45..303b6467 100644 --- a/src/hooks/useVoiceConfig.ts +++ b/src/hooks/useVoiceConfig.ts @@ -13,7 +13,7 @@ const STORAGE_KEY = 'voiceConfig'; export const VOICE_CONFIG_SYNC_EVENT = 'voice-config:sync'; const DEFAULTS: VoiceConfig = { baseUrl: '', apiKey: '', sttModel: '', ttsModel: '', ttsVoice: '', ttsFormat: '' }; -function read(): VoiceConfig { +export function readVoiceConfig(): VoiceConfig { try { const raw = localStorage.getItem(STORAGE_KEY); if (!raw) return { ...DEFAULTS }; @@ -33,9 +33,8 @@ function read(): VoiceConfig { // Empty fields are omitted so the server's env defaults apply. export function voiceConfigHeaders(): Record { if (typeof window === 'undefined') return {}; - const c = read(); + const c = readVoiceConfig(); const h: Record = {}; - if (c.baseUrl) h['x-voice-base-url'] = c.baseUrl; if (c.apiKey) h['x-voice-api-key'] = c.apiKey; if (c.sttModel) h['x-voice-stt-model'] = c.sttModel; if (c.ttsModel) h['x-voice-tts-model'] = c.ttsModel; @@ -46,7 +45,7 @@ export function voiceConfigHeaders(): Record { export function useVoiceConfig() { const [config, setConfig] = useState(() => - typeof window === 'undefined' ? { ...DEFAULTS } : read(), + typeof window === 'undefined' ? { ...DEFAULTS } : readVoiceConfig(), ); const update = (patch: Partial) => { diff --git a/src/i18n/locales/en/settings.json b/src/i18n/locales/en/settings.json index 11df5929..2d9772b1 100644 --- a/src/i18n/locales/en/settings.json +++ b/src/i18n/locales/en/settings.json @@ -63,7 +63,7 @@ "ttsModel": "Text-to-speech model", "voice": "Voice", "format": "Audio format", - "note": "The shown defaults work with OpenAI once you add a key. For other providers, set the base URL, model names, and audio format to match." + "note": "A custom base URL is called directly by your browser and must allow browser CORS requests. Leave it blank to use the server-configured backend." }, "quickSettings": { "title": "Quick Settings", diff --git a/src/lib/voiceApi.ts b/src/lib/voiceApi.ts new file mode 100644 index 00000000..3f9549b4 --- /dev/null +++ b/src/lib/voiceApi.ts @@ -0,0 +1,60 @@ +import { authenticatedFetch } from '../utils/api'; +import { readVoiceConfig, voiceConfigHeaders } from '../hooks/useVoiceConfig'; + +function directUrl(baseUrl: string, path: string): string { + return `${baseUrl.replace(/\/$/, '')}${path}`; +} + +export function voiceConfigSignature(): string { + return JSON.stringify(readVoiceConfig()); +} + +export function transcribeVoice(blob: Blob, filename: string): Promise { + const config = readVoiceConfig(); + const body = new FormData(); + + if (config.baseUrl.trim()) { + body.append('file', blob, filename); + body.append('model', config.sttModel || 'whisper-1'); + return fetch(directUrl(config.baseUrl.trim(), '/audio/transcriptions'), { + method: 'POST', + headers: config.apiKey ? { Authorization: `Bearer ${config.apiKey}` } : {}, + body, + }); + } + + body.append('audio', blob, filename); + return authenticatedFetch('/api/voice/transcribe', { + method: 'POST', + headers: voiceConfigHeaders(), + body, + }); +} + +export function synthesizeVoice(text: string, signal: AbortSignal): Promise { + const config = readVoiceConfig(); + + if (config.baseUrl.trim()) { + return fetch(directUrl(config.baseUrl.trim(), '/audio/speech'), { + method: 'POST', + headers: { + 'Content-Type': 'application/json', + ...(config.apiKey ? { Authorization: `Bearer ${config.apiKey}` } : {}), + }, + body: JSON.stringify({ + model: config.ttsModel || 'tts-1', + voice: config.ttsVoice || 'alloy', + input: text, + ...(config.ttsFormat.trim() ? { response_format: config.ttsFormat.trim() } : {}), + }), + signal, + }); + } + + return authenticatedFetch('/api/voice/tts', { + method: 'POST', + body: JSON.stringify({ text }), + headers: voiceConfigHeaders(), + signal, + }); +} diff --git a/src/lib/voicePlayer.ts b/src/lib/voicePlayer.ts index b09f5170..4c239c29 100644 --- a/src/lib/voicePlayer.ts +++ b/src/lib/voicePlayer.ts @@ -1,5 +1,4 @@ -import { authenticatedFetch } from '../utils/api'; -import { voiceConfigHeaders } from '../hooks/useVoiceConfig'; +import { synthesizeVoice, voiceConfigSignature } from './voiceApi'; // A single app-level audio player for read-aloud. It owns one