mirror of
https://github.com/siteboon/claudecodeui.git
synced 2026-06-25 04:13:51 +08:00
fix(voice): expose TTS format in user settings
This commit is contained in:
@@ -1,51 +0,0 @@
|
||||
# Voice (optional)
|
||||
|
||||
Two opt-in voice features in the chat:
|
||||
|
||||
- **Push-to-talk dictation** — a mic button in the composer records, transcribes, and fills the input.
|
||||
- **Read-aloud** — a speaker button on each assistant message plays it back.
|
||||
|
||||
Voice is **off by default**. Turn it on with the **Voice** toggle in Quick Settings or in
|
||||
**Settings → Voice**. When off, the mic and speaker controls are hidden.
|
||||
|
||||
## Backend
|
||||
|
||||
Voice uses any **OpenAI-compatible audio backend**, configured in **Settings → Voice**:
|
||||
|
||||
| Field | Example | Notes |
|
||||
|---|---|---|
|
||||
| Base URL | `https://api.openai.com/v1` | OpenAI, Groq, or a local server |
|
||||
| API key | `sk-…` | sent only to this app's backend, which proxies the request |
|
||||
| Speech-to-text model | `whisper-1`, `gpt-4o-transcribe`, `whisper-large-v3-turbo` | |
|
||||
| Text-to-speech model | `tts-1`, `gpt-4o-mini-tts`, `kokoro` | |
|
||||
| Voice | `alloy`, `af_heart`, … | depends on the backend |
|
||||
|
||||
The backend must expose the standard endpoints:
|
||||
|
||||
```
|
||||
POST {baseUrl}/audio/transcriptions (multipart 'file' + 'model') -> { "text": "..." }
|
||||
POST {baseUrl}/audio/speech ({ model, voice, input }) -> audio bytes
|
||||
```
|
||||
|
||||
That covers OpenAI and Groq, plus local servers like **LocalAI**, **Speaches**, **Kokoro-FastAPI**,
|
||||
and **openedai-speech**. Requests are proxied through the app's authenticated `/api/voice/*` routes,
|
||||
so a local backend only needs to listen on localhost.
|
||||
|
||||
### Server-side defaults (optional)
|
||||
|
||||
Instead of the Settings fields (or as fallback defaults behind them), you can set environment variables on the server:
|
||||
|
||||
```
|
||||
VOICE_API_BASE_URL=http://127.0.0.1:8765/v1
|
||||
VOICE_API_KEY=...
|
||||
VOICE_STT_MODEL=whisper-1
|
||||
VOICE_TTS_MODEL=tts-1
|
||||
VOICE_TTS_VOICE=alloy
|
||||
```
|
||||
|
||||
Per-user Settings values override these. If neither is set, the voice routes return 503.
|
||||
|
||||
## Notes
|
||||
|
||||
- Recording needs a secure context (HTTPS or localhost) for microphone access.
|
||||
- On iOS, read-aloud is tap-initiated to satisfy Safari's autoplay policy.
|
||||
@@ -16,14 +16,13 @@ const ENV = {
|
||||
sttModel: process.env.VOICE_STT_MODEL || 'whisper-1',
|
||||
ttsModel: process.env.VOICE_TTS_MODEL || 'tts-1',
|
||||
ttsVoice: process.env.VOICE_TTS_VOICE || 'alloy',
|
||||
ttsFormat: process.env.VOICE_TTS_FORMAT || 'mp3',
|
||||
};
|
||||
|
||||
/**
|
||||
* Resolve the voice backend config for a request. Client headers (set from the
|
||||
* user's in-app voice settings) take precedence over the server env defaults.
|
||||
* @param {import('express').Request} req
|
||||
* @returns {{baseUrl: string, apiKey: string, sttModel: string, ttsModel: string, ttsVoice: string}}
|
||||
* @returns {{baseUrl: string, apiKey: string, sttModel: string, ttsModel: string, ttsVoice: string, ttsFormat: string}}
|
||||
*/
|
||||
function resolveConfig(req) {
|
||||
const h = req.headers;
|
||||
@@ -33,6 +32,7 @@ function resolveConfig(req) {
|
||||
sttModel: String(h['x-voice-stt-model'] || '') || ENV.sttModel,
|
||||
ttsModel: String(h['x-voice-tts-model'] || '') || ENV.ttsModel,
|
||||
ttsVoice: String(h['x-voice-tts-voice'] || '') || ENV.ttsVoice,
|
||||
ttsFormat: String(h['x-voice-tts-format'] || ''),
|
||||
};
|
||||
}
|
||||
|
||||
@@ -197,7 +197,7 @@ router.post('/tts', async (req, res) => {
|
||||
model: cfg.ttsModel,
|
||||
voice: cfg.ttsVoice,
|
||||
input: text,
|
||||
response_format: ENV.ttsFormat,
|
||||
...(cfg.ttsFormat ? { response_format: cfg.ttsFormat } : {}),
|
||||
}),
|
||||
});
|
||||
if (!r.ok) {
|
||||
|
||||
@@ -54,7 +54,7 @@ export default function VoiceSettingsTab() {
|
||||
value={config.apiKey}
|
||||
onChange={(e) => update({ apiKey: e.target.value })}
|
||||
/>
|
||||
<div className="grid grid-cols-1 gap-4 sm:grid-cols-3">
|
||||
<div className="grid grid-cols-1 gap-4 sm:grid-cols-4">
|
||||
<Field
|
||||
label={t('voiceSettings.sttModel')}
|
||||
placeholder="whisper-1"
|
||||
@@ -73,6 +73,12 @@ export default function VoiceSettingsTab() {
|
||||
value={config.ttsVoice}
|
||||
onChange={(e) => update({ ttsVoice: e.target.value })}
|
||||
/>
|
||||
<Field
|
||||
label={t('voiceSettings.format')}
|
||||
placeholder="mp3"
|
||||
value={config.ttsFormat}
|
||||
onChange={(e) => update({ ttsFormat: e.target.value })}
|
||||
/>
|
||||
</div>
|
||||
<p className="text-xs text-muted-foreground">{t('voiceSettings.note')}</p>
|
||||
</div>
|
||||
|
||||
@@ -6,17 +6,20 @@ export type VoiceConfig = {
|
||||
sttModel: string;
|
||||
ttsModel: string;
|
||||
ttsVoice: string;
|
||||
ttsFormat: string;
|
||||
};
|
||||
|
||||
const STORAGE_KEY = 'voiceConfig';
|
||||
const DEFAULTS: VoiceConfig = { baseUrl: '', apiKey: '', sttModel: '', ttsModel: '', ttsVoice: '' };
|
||||
const DEFAULTS: VoiceConfig = { baseUrl: '', apiKey: '', sttModel: '', ttsModel: '', ttsVoice: '', ttsFormat: 'mp3' };
|
||||
|
||||
function read(): VoiceConfig {
|
||||
try {
|
||||
const raw = localStorage.getItem(STORAGE_KEY);
|
||||
if (!raw) return { ...DEFAULTS };
|
||||
const parsed = JSON.parse(raw);
|
||||
return { ...DEFAULTS, ...(parsed && typeof parsed === 'object' ? parsed : {}) };
|
||||
const next = { ...DEFAULTS, ...(parsed && typeof parsed === 'object' ? parsed : {}) };
|
||||
if (!next.ttsFormat) next.ttsFormat = DEFAULTS.ttsFormat;
|
||||
return next;
|
||||
} catch {
|
||||
return { ...DEFAULTS };
|
||||
}
|
||||
@@ -33,6 +36,7 @@ export function voiceConfigHeaders(): Record<string, string> {
|
||||
if (c.sttModel) h['x-voice-stt-model'] = c.sttModel;
|
||||
if (c.ttsModel) h['x-voice-tts-model'] = c.ttsModel;
|
||||
if (c.ttsVoice) h['x-voice-tts-voice'] = c.ttsVoice;
|
||||
if (c.ttsFormat) h['x-voice-tts-format'] = c.ttsFormat;
|
||||
return h;
|
||||
}
|
||||
|
||||
|
||||
@@ -61,7 +61,8 @@
|
||||
"sttModel": "Speech-to-text model",
|
||||
"ttsModel": "Text-to-speech model",
|
||||
"voice": "Voice",
|
||||
"note": "The shown defaults work with OpenAI once you add a key. For other providers, set the base URL and model names to match."
|
||||
"format": "Audio format",
|
||||
"note": "The shown defaults work with OpenAI once you add a key. For other providers, set the base URL, model names, and audio format to match."
|
||||
},
|
||||
"quickSettings": {
|
||||
"title": "Quick Settings",
|
||||
|
||||
Reference in New Issue
Block a user