From 8cbfac6ab11d983ce6029169b650c0cd57c47bf0 Mon Sep 17 00:00:00 2001 From: Haileyesus <118998054+blackmammoth@users.noreply.github.com> Date: Wed, 24 Jun 2026 10:05:35 +0300 Subject: [PATCH] fix(voice): expose TTS format in user settings --- docs/voice.md | 51 ------------------- server/voice-proxy.js | 6 +-- .../settings/view/tabs/VoiceSettingsTab.tsx | 8 ++- src/hooks/useVoiceConfig.ts | 8 ++- src/i18n/locales/en/settings.json | 3 +- 5 files changed, 18 insertions(+), 58 deletions(-) delete mode 100644 docs/voice.md diff --git a/docs/voice.md b/docs/voice.md deleted file mode 100644 index 71f81b84..00000000 --- a/docs/voice.md +++ /dev/null @@ -1,51 +0,0 @@ -# Voice (optional) - -Two opt-in voice features in the chat: - -- **Push-to-talk dictation** — a mic button in the composer records, transcribes, and fills the input. -- **Read-aloud** — a speaker button on each assistant message plays it back. - -Voice is **off by default**. Turn it on with the **Voice** toggle in Quick Settings or in -**Settings → Voice**. When off, the mic and speaker controls are hidden. - -## Backend - -Voice uses any **OpenAI-compatible audio backend**, configured in **Settings → Voice**: - -| Field | Example | Notes | -|---|---|---| -| Base URL | `https://api.openai.com/v1` | OpenAI, Groq, or a local server | -| API key | `sk-…` | sent only to this app's backend, which proxies the request | -| Speech-to-text model | `whisper-1`, `gpt-4o-transcribe`, `whisper-large-v3-turbo` | | -| Text-to-speech model | `tts-1`, `gpt-4o-mini-tts`, `kokoro` | | -| Voice | `alloy`, `af_heart`, … | depends on the backend | - -The backend must expose the standard endpoints: - -``` -POST {baseUrl}/audio/transcriptions (multipart 'file' + 'model') -> { "text": "..." } -POST {baseUrl}/audio/speech ({ model, voice, input }) -> audio bytes -``` - -That covers OpenAI and Groq, plus local servers like **LocalAI**, **Speaches**, **Kokoro-FastAPI**, -and **openedai-speech**. Requests are proxied through the app's authenticated `/api/voice/*` routes, -so a local backend only needs to listen on localhost. - -### Server-side defaults (optional) - -Instead of (or as defaults behind) the Settings fields, you can set env vars on the server: - -``` -VOICE_API_BASE_URL=http://127.0.0.1:8765/v1 -VOICE_API_KEY=... -VOICE_STT_MODEL=whisper-1 -VOICE_TTS_MODEL=tts-1 -VOICE_TTS_VOICE=alloy -``` - -Per-user Settings values override these. If neither is set, the voice routes return 503. - -## Notes - -- Recording needs a secure context (HTTPS or localhost) for microphone access. -- On iOS, read-aloud is tap-initiated to satisfy Safari's autoplay policy. diff --git a/server/voice-proxy.js b/server/voice-proxy.js index e1ae5e36..a628ce63 100644 --- a/server/voice-proxy.js +++ b/server/voice-proxy.js @@ -16,14 +16,13 @@ const ENV = { sttModel: process.env.VOICE_STT_MODEL || 'whisper-1', ttsModel: process.env.VOICE_TTS_MODEL || 'tts-1', ttsVoice: process.env.VOICE_TTS_VOICE || 'alloy', - ttsFormat: process.env.VOICE_TTS_FORMAT || 'mp3', }; /** * Resolve the voice backend config for a request. Client headers (set from the * user's in-app voice settings) take precedence over the server env defaults. * @param {import('express').Request} req - * @returns {{baseUrl: string, apiKey: string, sttModel: string, ttsModel: string, ttsVoice: string}} + * @returns {{baseUrl: string, apiKey: string, sttModel: string, ttsModel: string, ttsVoice: string, ttsFormat: string}} */ function resolveConfig(req) { const h = req.headers; @@ -33,6 +32,7 @@ function resolveConfig(req) { sttModel: String(h['x-voice-stt-model'] || '') || ENV.sttModel, ttsModel: String(h['x-voice-tts-model'] || '') || ENV.ttsModel, ttsVoice: String(h['x-voice-tts-voice'] || '') || ENV.ttsVoice, + ttsFormat: String(h['x-voice-tts-format'] || ''), }; } @@ -197,7 +197,7 @@ router.post('/tts', async (req, res) => { model: cfg.ttsModel, voice: cfg.ttsVoice, input: text, - response_format: ENV.ttsFormat, + ...(cfg.ttsFormat ? { response_format: cfg.ttsFormat } : {}), }), }); if (!r.ok) { diff --git a/src/components/settings/view/tabs/VoiceSettingsTab.tsx b/src/components/settings/view/tabs/VoiceSettingsTab.tsx index 3de61fba..7a8a2ba0 100644 --- a/src/components/settings/view/tabs/VoiceSettingsTab.tsx +++ b/src/components/settings/view/tabs/VoiceSettingsTab.tsx @@ -54,7 +54,7 @@ export default function VoiceSettingsTab() { value={config.apiKey} onChange={(e) => update({ apiKey: e.target.value })} /> -
{t('voiceSettings.note')}