fix(voice): expose TTS format in user settings

This commit is contained in:
Haileyesus
2026-06-24 10:05:35 +03:00
parent 9919851be7
commit 8cbfac6ab1
5 changed files with 18 additions and 58 deletions

View File

@@ -1,51 +0,0 @@
# Voice (optional)
The chat offers two opt-in voice features:
- **Push-to-talk dictation** — a mic button in the composer records, transcribes, and fills the input.
- **Read-aloud** — a speaker button on each assistant message plays it back.
Voice is **off by default**. Turn it on with the **Voice** toggle in Quick Settings or in
**Settings → Voice**. When off, the mic and speaker controls are hidden.
## Backend
Voice uses any **OpenAI-compatible audio backend**, configured in **Settings → Voice**:
| Field | Example | Notes |
|---|---|---|
| Base URL | `https://api.openai.com/v1` | OpenAI, Groq, or a local server |
| API key | `sk-…` | sent only to this app's backend, which proxies the request |
| Speech-to-text model | `whisper-1`, `gpt-4o-transcribe`, `whisper-large-v3-turbo` | |
| Text-to-speech model | `tts-1`, `gpt-4o-mini-tts`, `kokoro` | |
| Voice | `alloy`, `af_heart`, … | depends on the backend |
The backend must expose the standard endpoints:
```
POST {baseUrl}/audio/transcriptions (multipart 'file' + 'model') -> { "text": "..." }
POST {baseUrl}/audio/speech ({ model, voice, input, response_format }) -> audio bytes
```
That covers OpenAI and Groq, plus local servers like **LocalAI**, **Speaches**, **Kokoro-FastAPI**,
and **openedai-speech**. Requests are proxied through the app's authenticated `/api/voice/*` routes,
so a backend running on the same machine as the app server only needs to listen on localhost.
### Server-side defaults (optional)
Instead of filling in the Settings fields (or to provide fallback defaults behind them), you can set environment variables on the server:
```
VOICE_API_BASE_URL=http://127.0.0.1:8765/v1
VOICE_API_KEY=...
VOICE_STT_MODEL=whisper-1
VOICE_TTS_MODEL=tts-1
VOICE_TTS_VOICE=alloy
```
Per-user Settings values override these. If neither is set, the voice routes return 503.
## Notes
- Recording needs a secure context (HTTPS or localhost) for microphone access.
- On iOS, read-aloud is tap-initiated to satisfy Safari's autoplay policy.

View File

@@ -16,14 +16,13 @@ const ENV = {
sttModel: process.env.VOICE_STT_MODEL || 'whisper-1',
ttsModel: process.env.VOICE_TTS_MODEL || 'tts-1',
ttsVoice: process.env.VOICE_TTS_VOICE || 'alloy',
ttsFormat: process.env.VOICE_TTS_FORMAT || 'mp3',
};
/**
* Resolve the voice backend config for a request. Client headers (set from the
* user's in-app voice settings) take precedence over the server env defaults.
* @param {import('express').Request} req
* @returns {{baseUrl: string, apiKey: string, sttModel: string, ttsModel: string, ttsVoice: string}}
* @returns {{baseUrl: string, apiKey: string, sttModel: string, ttsModel: string, ttsVoice: string, ttsFormat: string}}
*/
function resolveConfig(req) {
const h = req.headers;
@@ -33,6 +32,7 @@ function resolveConfig(req) {
sttModel: String(h['x-voice-stt-model'] || '') || ENV.sttModel,
ttsModel: String(h['x-voice-tts-model'] || '') || ENV.ttsModel,
ttsVoice: String(h['x-voice-tts-voice'] || '') || ENV.ttsVoice,
ttsFormat: String(h['x-voice-tts-format'] || ''),
};
}
@@ -197,7 +197,7 @@ router.post('/tts', async (req, res) => {
model: cfg.ttsModel,
voice: cfg.ttsVoice,
input: text,
response_format: ENV.ttsFormat,
...(cfg.ttsFormat ? { response_format: cfg.ttsFormat } : {}),
}),
});
if (!r.ok) {

View File

@@ -54,7 +54,7 @@ export default function VoiceSettingsTab() {
value={config.apiKey}
onChange={(e) => update({ apiKey: e.target.value })}
/>
<div className="grid grid-cols-1 gap-4 sm:grid-cols-3">
<div className="grid grid-cols-1 gap-4 sm:grid-cols-4">
<Field
label={t('voiceSettings.sttModel')}
placeholder="whisper-1"
@@ -73,6 +73,12 @@ export default function VoiceSettingsTab() {
value={config.ttsVoice}
onChange={(e) => update({ ttsVoice: e.target.value })}
/>
<Field
label={t('voiceSettings.format')}
placeholder="mp3"
value={config.ttsFormat}
onChange={(e) => update({ ttsFormat: e.target.value })}
/>
</div>
<p className="text-xs text-muted-foreground">{t('voiceSettings.note')}</p>
</div>

View File

@@ -6,17 +6,20 @@ export type VoiceConfig = {
sttModel: string;
ttsModel: string;
ttsVoice: string;
ttsFormat: string;
};
const STORAGE_KEY = 'voiceConfig';
const DEFAULTS: VoiceConfig = { baseUrl: '', apiKey: '', sttModel: '', ttsModel: '', ttsVoice: '' };
const DEFAULTS: VoiceConfig = { baseUrl: '', apiKey: '', sttModel: '', ttsModel: '', ttsVoice: '', ttsFormat: 'mp3' };
function read(): VoiceConfig {
try {
const raw = localStorage.getItem(STORAGE_KEY);
if (!raw) return { ...DEFAULTS };
const parsed = JSON.parse(raw);
return { ...DEFAULTS, ...(parsed && typeof parsed === 'object' ? parsed : {}) };
const next = { ...DEFAULTS, ...(parsed && typeof parsed === 'object' ? parsed : {}) };
if (!next.ttsFormat) next.ttsFormat = DEFAULTS.ttsFormat;
return next;
} catch {
return { ...DEFAULTS };
}
@@ -33,6 +36,7 @@ export function voiceConfigHeaders(): Record<string, string> {
if (c.sttModel) h['x-voice-stt-model'] = c.sttModel;
if (c.ttsModel) h['x-voice-tts-model'] = c.ttsModel;
if (c.ttsVoice) h['x-voice-tts-voice'] = c.ttsVoice;
if (c.ttsFormat) h['x-voice-tts-format'] = c.ttsFormat;
return h;
}

View File

@@ -61,7 +61,8 @@
"sttModel": "Speech-to-text model",
"ttsModel": "Text-to-speech model",
"voice": "Voice",
"note": "The shown defaults work with OpenAI once you add a key. For other providers, set the base URL and model names to match."
"format": "Audio format",
"note": "The shown defaults work with OpenAI once you add a key. For other providers, set the base URL, model names, and audio format to match."
},
"quickSettings": {
"title": "Quick Settings",