From 8cbfac6ab11d983ce6029169b650c0cd57c47bf0 Mon Sep 17 00:00:00 2001
From: Haileyesus <118998054+blackmammoth@users.noreply.github.com>
Date: Wed, 24 Jun 2026 10:05:35 +0300
Subject: [PATCH] fix(voice): expose TTS format in user settings

---
 docs/voice.md                                 | 51 -------------------
 server/voice-proxy.js                         |  6 +--
 .../settings/view/tabs/VoiceSettingsTab.tsx   |  8 ++-
 src/hooks/useVoiceConfig.ts                   |  8 ++-
 src/i18n/locales/en/settings.json             |  3 +-
 5 files changed, 18 insertions(+), 58 deletions(-)
 delete mode 100644 docs/voice.md

diff --git a/docs/voice.md b/docs/voice.md
deleted file mode 100644
index 71f81b84..00000000
--- a/docs/voice.md
+++ /dev/null
@@ -1,51 +0,0 @@
-# Voice (optional)
-
-Two opt-in voice features in the chat:
-
-- **Push-to-talk dictation** — a mic button in the composer records, transcribes, and fills the input.
-- **Read-aloud** — a speaker button on each assistant message plays it back.
-
-Voice is **off by default**. Turn it on with the **Voice** toggle in Quick Settings or in
-**Settings → Voice**. When off, the mic and speaker controls are hidden.
-
-## Backend
-
-Voice uses any **OpenAI-compatible audio backend**, configured in **Settings → Voice**:
-
-| Field | Example | Notes |
-|---|---|---|
-| Base URL | `https://api.openai.com/v1` | OpenAI, Groq, or a local server |
-| API key | `sk-…` | sent only to this app's backend, which proxies the request |
-| Speech-to-text model | `whisper-1`, `gpt-4o-transcribe`, `whisper-large-v3-turbo` | |
-| Text-to-speech model | `tts-1`, `gpt-4o-mini-tts`, `kokoro` | |
-| Voice | `alloy`, `af_heart`, … | depends on the backend |
-
-The backend must expose the standard endpoints:
-
-```
-POST {baseUrl}/audio/transcriptions   (multipart 'file' + 'model')   -> { "text": "..." }
-POST {baseUrl}/audio/speech           ({ model, voice, input })       -> audio bytes
-```
-
-That covers OpenAI and Groq, plus local servers like **LocalAI**, **Speaches**, **Kokoro-FastAPI**,
-and **openedai-speech**. Requests are proxied through the app's authenticated `/api/voice/*` routes,
-so a local backend only needs to listen on localhost.
-
-### Server-side defaults (optional)
-
-Instead of (or as defaults behind) the Settings fields, you can set env vars on the server:
-
-```
-VOICE_API_BASE_URL=http://127.0.0.1:8765/v1
-VOICE_API_KEY=...
-VOICE_STT_MODEL=whisper-1
-VOICE_TTS_MODEL=tts-1
-VOICE_TTS_VOICE=alloy
-```
-
-Per-user Settings values override these. If neither is set, the voice routes return 503.
-
-## Notes
-
-- Recording needs a secure context (HTTPS or localhost) for microphone access.
-- On iOS, read-aloud is tap-initiated to satisfy Safari's autoplay policy.
diff --git a/server/voice-proxy.js b/server/voice-proxy.js
index e1ae5e36..a628ce63 100644
--- a/server/voice-proxy.js
+++ b/server/voice-proxy.js
@@ -16,14 +16,13 @@ const ENV = {
   sttModel: process.env.VOICE_STT_MODEL || 'whisper-1',
   ttsModel: process.env.VOICE_TTS_MODEL || 'tts-1',
   ttsVoice: process.env.VOICE_TTS_VOICE || 'alloy',
-  ttsFormat: process.env.VOICE_TTS_FORMAT || 'mp3',
 };
 
 /**
  * Resolve the voice backend config for a request. Client headers (set from the
  * user's in-app voice settings) take precedence over the server env defaults.
  * @param {import('express').Request} req
- * @returns {{baseUrl: string, apiKey: string, sttModel: string, ttsModel: string, ttsVoice: string}}
+ * @returns {{baseUrl: string, apiKey: string, sttModel: string, ttsModel: string, ttsVoice: string, ttsFormat: string}}
  */
 function resolveConfig(req) {
   const h = req.headers;
@@ -33,6 +32,7 @@ function resolveConfig(req) {
     sttModel: String(h['x-voice-stt-model'] || '') || ENV.sttModel,
     ttsModel: String(h['x-voice-tts-model'] || '') || ENV.ttsModel,
     ttsVoice: String(h['x-voice-tts-voice'] || '') || ENV.ttsVoice,
+    ttsFormat: String(h['x-voice-tts-format'] || ''),
   };
 }
 
@@ -197,7 +197,7 @@ router.post('/tts', async (req, res) => {
         model: cfg.ttsModel,
         voice: cfg.ttsVoice,
         input: text,
-        response_format: ENV.ttsFormat,
+        ...(cfg.ttsFormat ? { response_format: cfg.ttsFormat } : {}),
       }),
     });
     if (!r.ok) {
diff --git a/src/components/settings/view/tabs/VoiceSettingsTab.tsx b/src/components/settings/view/tabs/VoiceSettingsTab.tsx
index 3de61fba..7a8a2ba0 100644
--- a/src/components/settings/view/tabs/VoiceSettingsTab.tsx
+++ b/src/components/settings/view/tabs/VoiceSettingsTab.tsx
@@ -54,7 +54,7 @@ export default function VoiceSettingsTab() {
             value={config.apiKey}
             onChange={(e) => update({ apiKey: e.target.value })}
           />
-          <div className="grid grid-cols-1 gap-4 sm:grid-cols-3">
+          <div className="grid grid-cols-1 gap-4 sm:grid-cols-4">
             <Field
               label={t('voiceSettings.sttModel')}
               placeholder="whisper-1"
@@ -73,6 +73,12 @@ export default function VoiceSettingsTab() {
               value={config.ttsVoice}
               onChange={(e) => update({ ttsVoice: e.target.value })}
             />
+            <Field
+              label={t('voiceSettings.format')}
+              placeholder="mp3"
+              value={config.ttsFormat}
+              onChange={(e) => update({ ttsFormat: e.target.value })}
+            />
           </div>
           <p className="text-xs text-muted-foreground">{t('voiceSettings.note')}</p>
         </div>
diff --git a/src/hooks/useVoiceConfig.ts b/src/hooks/useVoiceConfig.ts
index fa170bca..9e3e2551 100644
--- a/src/hooks/useVoiceConfig.ts
+++ b/src/hooks/useVoiceConfig.ts
@@ -6,17 +6,20 @@ export type VoiceConfig = {
   sttModel: string;
   ttsModel: string;
   ttsVoice: string;
+  ttsFormat: string;
 };
 
 const STORAGE_KEY = 'voiceConfig';
-const DEFAULTS: VoiceConfig = { baseUrl: '', apiKey: '', sttModel: '', ttsModel: '', ttsVoice: '' };
+const DEFAULTS: VoiceConfig = { baseUrl: '', apiKey: '', sttModel: '', ttsModel: '', ttsVoice: '', ttsFormat: 'mp3' };
 
 function read(): VoiceConfig {
   try {
     const raw = localStorage.getItem(STORAGE_KEY);
     if (!raw) return { ...DEFAULTS };
     const parsed = JSON.parse(raw);
-    return { ...DEFAULTS, ...(parsed && typeof parsed === 'object' ? parsed : {}) };
+    const next = { ...DEFAULTS, ...(parsed && typeof parsed === 'object' ? parsed : {}) };
+    if (!next.ttsFormat) next.ttsFormat = DEFAULTS.ttsFormat;
+    return next;
   } catch {
     return { ...DEFAULTS };
   }
@@ -33,6 +36,7 @@ export function voiceConfigHeaders(): Record<string, string> {
   if (c.sttModel) h['x-voice-stt-model'] = c.sttModel;
   if (c.ttsModel) h['x-voice-tts-model'] = c.ttsModel;
   if (c.ttsVoice) h['x-voice-tts-voice'] = c.ttsVoice;
+  if (c.ttsFormat) h['x-voice-tts-format'] = c.ttsFormat;
   return h;
 }
 
diff --git a/src/i18n/locales/en/settings.json b/src/i18n/locales/en/settings.json
index da80c0dc..90ae38a6 100644
--- a/src/i18n/locales/en/settings.json
+++ b/src/i18n/locales/en/settings.json
@@ -61,7 +61,8 @@
     "sttModel": "Speech-to-text model",
     "ttsModel": "Text-to-speech model",
     "voice": "Voice",
-    "note": "The shown defaults work with OpenAI once you add a key. For other providers, set the base URL and model names to match."
+    "format": "Audio format",
+    "note": "The defaults shown work with OpenAI once you add a key. For other providers, set the base URL, model names, and audio format to match."
   },
   "quickSettings": {
     "title": "Quick Settings",