refactor(voice): provider-agnostic backend and in-app config

Switches the voice proxy to the OpenAI audio API (/v1/audio/transcriptions and
/v1/audio/speech) so it works with OpenAI, Groq, or a local server. Adds a
Settings -> Voice tab (base URL, API key, models, voice) plus a Quick Settings
toggle, and removes the bundled Python sidecar.

Review fixes: stop mic tracks on unmount, clear the global TTS stop handler and
revoke leaked blob URLs, add fetch timeouts in the proxy, surface mic errors in
the button, trim before appending transcripts, and drop the repo-wide wav ignore.
This commit is contained in:
newsbubbles
2026-06-09 10:05:06 +01:00
parent d05585e1f4
commit 711936d279
21 changed files with 367 additions and 365 deletions

View File

@@ -1,5 +1,6 @@
import { useCallback, useEffect, useRef, useState } from 'react';
import { authenticatedFetch } from '../../../utils/api';
import { voiceConfigHeaders } from '../../../hooks/useVoiceConfig';
// Only one message speaks at a time across the whole app.
let stopActive: (() => void) | null = null;
@@ -36,8 +37,14 @@ export function useTts(getText: () => string) {
if (stopActive) stopActive = null;
}, [reset]);
// Cleanup on unmount.
useEffect(() => () => reset(), [reset]);
// Cleanup on unmount: drop the global stop handler if it points at us, then reset.
useEffect(
() => () => {
if (stopActive === stop) stopActive = null;
reset();
},
[reset, stop],
);
const play = useCallback(async () => {
if (stopActive) stopActive();
@@ -63,12 +70,16 @@ export function useTts(getText: () => string) {
const res = await authenticatedFetch('/api/voice/tts', {
method: 'POST',
body: JSON.stringify({ text }),
headers: voiceConfigHeaders(),
});
if (!res.ok) throw new Error(`tts ${res.status}`);
const blob = await res.blob();
const url = URL.createObjectURL(blob);
if (audioRef.current !== audio) {
URL.revokeObjectURL(url); // stopped while loading; don't leak the blob URL
return;
}
urlRef.current = url;
if (audioRef.current !== audio) return; // stopped while loading
audio.src = url;
audio.load();
await audio.play();

View File

@@ -1,38 +1,37 @@
import { useEffect, useState } from 'react';
import { authenticatedFetch } from '../../../utils/api';
// Whether the optional voice feature is configured on the server (VOICE_SIDECAR_URL set).
// Probed once and cached app-wide so the mic/speak controls can hide themselves when off.
let cached: boolean | null = null;
let inflight: Promise<boolean> | null = null;
// Voice UI is gated on the `voiceEnabled` UI preference (toggled in Quick Settings /
// the Settings modal). This is a lightweight read-only view of that preference so the
// mic/speak controls can hide themselves, kept in sync via the same events
// useUiPreferences emits. No server probe.
const STORAGE_KEY = 'uiPreferences';
const SYNC_EVENT = 'ui-preferences:sync';
function probe(): Promise<boolean> {
if (cached !== null) return Promise.resolve(cached);
if (!inflight) {
inflight = authenticatedFetch('/api/voice/health')
.then((r) => (r.ok ? r.json() : { enabled: false }))
.then((d) => {
cached = Boolean(d?.enabled);
return cached;
})
.catch(() => {
cached = false;
return false;
});
function readVoiceEnabled(): boolean {
try {
const raw = localStorage.getItem(STORAGE_KEY);
if (!raw) return false;
const parsed = JSON.parse(raw);
return parsed?.voiceEnabled === true || parsed?.voiceEnabled === 'true';
} catch {
return false;
}
return inflight;
}
export function useVoiceAvailable(): boolean {
const [available, setAvailable] = useState<boolean>(cached ?? false);
const [enabled, setEnabled] = useState<boolean>(() =>
typeof window === 'undefined' ? false : readVoiceEnabled(),
);
useEffect(() => {
let mounted = true;
probe().then((v) => {
if (mounted) setAvailable(v);
});
const update = () => setEnabled(readVoiceEnabled());
window.addEventListener('storage', update);
window.addEventListener(SYNC_EVENT, update as EventListener);
return () => {
mounted = false;
window.removeEventListener('storage', update);
window.removeEventListener(SYNC_EVENT, update as EventListener);
};
}, []);
return available;
return enabled;
}

View File

@@ -1,5 +1,6 @@
import { useCallback, useRef, useState } from 'react';
import { useCallback, useEffect, useRef, useState } from 'react';
import { authenticatedFetch } from '../../../utils/api';
import { voiceConfigHeaders } from '../../../hooks/useVoiceConfig';
// Mobile-safe recording: iOS Safari 18.4+ supports webm/opus; older iOS needs mp4.
const MIME_CANDIDATES = [
@@ -39,6 +40,15 @@ export function useVoiceInput(onTranscript: (text: string) => void, onError?: (m
streamRef.current = null;
};
// Stop the mic if the component unmounts mid-recording.
useEffect(() => {
return () => {
streamRef.current?.getTracks().forEach((t) => t.stop());
streamRef.current = null;
recorderRef.current = null;
};
}, []);
const start = useCallback(async () => {
try {
const stream = await navigator.mediaDevices.getUserMedia({
@@ -68,7 +78,11 @@ export function useVoiceInput(onTranscript: (text: string) => void, onError?: (m
const ext = type.includes('mp4') ? 'm4a' : type.includes('ogg') ? 'ogg' : 'webm';
const fd = new FormData();
fd.append('audio', blob, `recording.${ext}`);
const res = await authenticatedFetch('/api/voice/transcribe', { method: 'POST', body: fd });
const res = await authenticatedFetch('/api/voice/transcribe', {
method: 'POST',
body: fd,
headers: voiceConfigHeaders(),
});
if (!res.ok) throw new Error(`transcribe ${res.status}`);
const data = await res.json();
const text = String(data?.text || '').trim();

View File

@@ -404,7 +404,7 @@ function ChatInterface({
renderInputWithMentions={renderInputWithMentions}
textareaRef={textareaRef}
input={input}
onVoiceTranscript={(text) => setInput(input ? `${input} ${text}` : text)}
onVoiceTranscript={(text) => setInput(input.trim() ? `${input.trim()} ${text}` : text)}
onInputChange={handleInputChange}
onTextareaClick={handleTextareaClick}
onTextareaKeyDown={handleKeyDown}

View File

@@ -1,3 +1,4 @@
import { useEffect, useRef, useState } from 'react';
import { Mic, Square, Loader2 } from 'lucide-react';
import { useTranslation } from 'react-i18next';
import { useVoiceInput } from '../../hooks/useVoiceInput';
@@ -10,10 +11,25 @@ type Props = {
};
// Push-to-talk mic button. Renders nothing unless the optional voice feature is enabled.
// Surfaces transcription errors itself (transiently) so they aren't silently swallowed.
export default function VoiceInputButton({ onTranscript, onError }: Props) {
const { t } = useTranslation('chat');
const available = useVoiceAvailable();
const { state, toggle } = useVoiceInput(onTranscript, onError);
const [errorMsg, setErrorMsg] = useState<string | null>(null);
const errorTimer = useRef<ReturnType<typeof setTimeout> | null>(null);
const handleError = (msg: string) => {
onError?.(msg);
setErrorMsg(msg);
if (errorTimer.current) clearTimeout(errorTimer.current);
errorTimer.current = setTimeout(() => setErrorMsg(null), 4000);
};
const { state, toggle } = useVoiceInput(onTranscript, handleError);
useEffect(() => () => {
if (errorTimer.current) clearTimeout(errorTimer.current);
}, []);
if (!available) return null;
@@ -27,14 +43,21 @@ export default function VoiceInputButton({ onTranscript, onError }: Props) {
);
return (
<PromptInputButton
tooltip={{ content: state === 'recording' ? t('voice.stopRecording') : t('voice.input') }}
onClick={(e: { preventDefault: () => void }) => {
e.preventDefault();
toggle();
}}
>
{icon}
</PromptInputButton>
<span className="relative inline-flex">
{errorMsg && (
<span className="absolute bottom-full left-1/2 mb-1 -translate-x-1/2 whitespace-nowrap rounded bg-red-600 px-2 py-1 text-xs text-white shadow-lg">
{errorMsg}
</span>
)}
<PromptInputButton
tooltip={{ content: state === 'recording' ? t('voice.stopRecording') : t('voice.input') }}
onClick={(e: { preventDefault: () => void }) => {
e.preventDefault();
toggle();
}}
>
{icon}
</PromptInputButton>
</span>
);
}

View File

@@ -4,6 +4,7 @@ import {
Eye,
Languages,
Maximize2,
Mic,
} from 'lucide-react';
import type { PreferenceToggleItem } from './types';
@@ -54,4 +55,9 @@ export const INPUT_SETTING_TOGGLES: PreferenceToggleItem[] = [
labelKey: 'quickSettings.sendByCtrlEnter',
icon: Languages,
},
{
key: 'voiceEnabled',
labelKey: 'quickSettings.voiceEnabled',
icon: Mic,
},
];

View File

@@ -6,7 +6,8 @@ export type PreferenceToggleKey =
| 'showRawParameters'
| 'showThinking'
| 'autoScrollToBottom'
| 'sendByCtrlEnter';
| 'sendByCtrlEnter'
| 'voiceEnabled';
export type QuickSettingsPreferences = Record<PreferenceToggleKey, boolean>;

View File

@@ -2,7 +2,7 @@ import type { Dispatch, SetStateAction } from 'react';
import type { LLMProvider } from '../../../types/app';
import type { ProviderAuthStatus } from '../../provider-auth/types';
export type SettingsMainTab = 'agents' | 'appearance' | 'git' | 'api' | 'tasks' | 'notifications' | 'plugins' | 'about';
export type SettingsMainTab = 'agents' | 'appearance' | 'git' | 'api' | 'voice' | 'tasks' | 'notifications' | 'plugins' | 'about';
export type AgentProvider = LLMProvider;
export type AgentCategory = 'account' | 'permissions' | 'mcp';
export type ProjectSortOrder = 'name' | 'date';

View File

@@ -6,6 +6,7 @@ import SettingsSidebar from '../view/SettingsSidebar';
import AgentsSettingsTab from '../view/tabs/agents-settings/AgentsSettingsTab';
import AppearanceSettingsTab from '../view/tabs/AppearanceSettingsTab';
import CredentialsSettingsTab from '../view/tabs/api-settings/CredentialsSettingsTab';
import VoiceSettingsTab from '../view/tabs/VoiceSettingsTab';
import GitSettingsTab from '../view/tabs/git-settings/GitSettingsTab';
import NotificationsSettingsTab from '../view/tabs/NotificationsSettingsTab';
import TasksSettingsTab from '../view/tabs/tasks-settings/TasksSettingsTab';
@@ -153,6 +154,8 @@ function Settings({ isOpen, onClose, projects = [], initialTab = 'agents' }: Set
{activeTab === 'api' && <CredentialsSettingsTab />}
{activeTab === 'voice' && <VoiceSettingsTab />}
{activeTab === 'plugins' && <PluginSettingsTab />}
{activeTab === 'about' && <AboutTab />}

View File

@@ -1,4 +1,4 @@
import { Bell, Bot, GitBranch, Info, Key, ListChecks, Palette, Puzzle } from 'lucide-react';
import { Bell, Bot, GitBranch, Info, Key, ListChecks, Mic, Palette, Puzzle } from 'lucide-react';
import { useTranslation } from 'react-i18next';
import { cn } from '../../../lib/utils';
import { PillBar, Pill } from '../../../shared/view/ui';
@@ -20,6 +20,7 @@ const NAV_ITEMS: NavItem[] = [
{ id: 'appearance', labelKey: 'mainTabs.appearance', icon: Palette },
{ id: 'git', labelKey: 'mainTabs.git', icon: GitBranch },
{ id: 'api', labelKey: 'mainTabs.apiTokens', icon: Key },
{ id: 'voice', labelKey: 'mainTabs.voice', icon: Mic },
{ id: 'tasks', labelKey: 'mainTabs.tasks', icon: ListChecks },
{ id: 'plugins', labelKey: 'mainTabs.plugins', icon: Puzzle },
{ id: 'notifications', labelKey: 'mainTabs.notifications', icon: Bell },

View File

@@ -0,0 +1,82 @@
import type { InputHTMLAttributes } from 'react';
import { useTranslation } from 'react-i18next';
import SettingsSection from '../SettingsSection';
import SettingsToggle from '../SettingsToggle';
import { useUiPreferences } from '../../../../hooks/useUiPreferences';
import { useVoiceConfig } from '../../../../hooks/useVoiceConfig';
const inputClass =
'w-full rounded-md border border-border bg-background px-3 py-2 text-sm text-foreground placeholder:text-muted-foreground focus:outline-none focus:ring-2 focus:ring-ring';
function Field({ label, ...props }: { label: string } & InputHTMLAttributes<HTMLInputElement>) {
return (
<label className="block space-y-1">
<span className="text-sm font-medium text-foreground">{label}</span>
<input className={inputClass} {...props} />
</label>
);
}
export default function VoiceSettingsTab() {
const { t } = useTranslation('settings');
const { preferences, setPreference } = useUiPreferences();
const { config, update } = useVoiceConfig();
return (
<div className="space-y-8">
<SettingsSection title={t('voiceSettings.title')} description={t('voiceSettings.description')}>
<div className="flex items-center justify-between rounded-lg border border-border p-3">
<div className="pr-3">
<div className="text-sm font-medium text-foreground">{t('voiceSettings.enable')}</div>
<div className="text-xs text-muted-foreground">{t('voiceSettings.enableDescription')}</div>
</div>
<SettingsToggle
checked={preferences.voiceEnabled}
onChange={(v) => setPreference('voiceEnabled', v)}
ariaLabel={t('voiceSettings.enable')}
/>
</div>
</SettingsSection>
<SettingsSection title={t('voiceSettings.backendTitle')} description={t('voiceSettings.backendDescription')}>
<div className="space-y-4">
<Field
label={t('voiceSettings.baseUrl')}
placeholder="https://api.openai.com/v1"
value={config.baseUrl}
onChange={(e) => update({ baseUrl: e.target.value })}
/>
<Field
label={t('voiceSettings.apiKey')}
type="password"
autoComplete="off"
placeholder="sk-…"
value={config.apiKey}
onChange={(e) => update({ apiKey: e.target.value })}
/>
<div className="grid grid-cols-1 gap-4 sm:grid-cols-3">
<Field
label={t('voiceSettings.sttModel')}
placeholder="whisper-1"
value={config.sttModel}
onChange={(e) => update({ sttModel: e.target.value })}
/>
<Field
label={t('voiceSettings.ttsModel')}
placeholder="tts-1"
value={config.ttsModel}
onChange={(e) => update({ ttsModel: e.target.value })}
/>
<Field
label={t('voiceSettings.voice')}
placeholder="alloy"
value={config.ttsVoice}
onChange={(e) => update({ ttsVoice: e.target.value })}
/>
</div>
<p className="text-xs text-muted-foreground">{t('voiceSettings.note')}</p>
</div>
</SettingsSection>
</div>
);
}

View File

@@ -7,6 +7,7 @@ type UiPreferences = {
autoScrollToBottom: boolean;
sendByCtrlEnter: boolean;
sidebarVisible: boolean;
voiceEnabled: boolean;
};
type UiPreferenceKey = keyof UiPreferences;
@@ -39,6 +40,7 @@ const DEFAULTS: UiPreferences = {
autoScrollToBottom: true,
sendByCtrlEnter: false,
sidebarVisible: true,
voiceEnabled: false,
};
const PREFERENCE_KEYS = Object.keys(DEFAULTS) as UiPreferenceKey[];

View File

@@ -0,0 +1,57 @@
import { useState } from 'react';
export type VoiceConfig = {
baseUrl: string;
apiKey: string;
sttModel: string;
ttsModel: string;
ttsVoice: string;
};
const STORAGE_KEY = 'voiceConfig';
const DEFAULTS: VoiceConfig = { baseUrl: '', apiKey: '', sttModel: '', ttsModel: '', ttsVoice: '' };
function read(): VoiceConfig {
try {
const raw = localStorage.getItem(STORAGE_KEY);
if (!raw) return { ...DEFAULTS };
const parsed = JSON.parse(raw);
return { ...DEFAULTS, ...(parsed && typeof parsed === 'object' ? parsed : {}) };
} catch {
return { ...DEFAULTS };
}
}
// Headers the voice proxy reads to target a per-user OpenAI-compatible backend.
// Empty fields are omitted so the server's env defaults apply.
export function voiceConfigHeaders(): Record<string, string> {
if (typeof window === 'undefined') return {};
const c = read();
const h: Record<string, string> = {};
if (c.baseUrl) h['x-voice-base-url'] = c.baseUrl;
if (c.apiKey) h['x-voice-api-key'] = c.apiKey;
if (c.sttModel) h['x-voice-stt-model'] = c.sttModel;
if (c.ttsModel) h['x-voice-tts-model'] = c.ttsModel;
if (c.ttsVoice) h['x-voice-tts-voice'] = c.ttsVoice;
return h;
}
export function useVoiceConfig() {
const [config, setConfig] = useState<VoiceConfig>(() =>
typeof window === 'undefined' ? { ...DEFAULTS } : read(),
);
const update = (patch: Partial<VoiceConfig>) => {
setConfig((prev) => {
const next = { ...prev, ...patch };
try {
localStorage.setItem(STORAGE_KEY, JSON.stringify(next));
} catch {
/* ignore persistence errors */
}
return next;
});
};
return { config, update };
}

View File

@@ -49,6 +49,20 @@
"resetToDefaults": "Reset to Defaults",
"cancelChanges": "Cancel Changes"
},
"voiceSettings": {
"title": "Voice",
"description": "Speech-to-text input and read-aloud, via an OpenAI-compatible audio backend.",
"enable": "Enable voice",
"enableDescription": "Show the mic button and the read-aloud button on messages.",
"backendTitle": "Backend",
"backendDescription": "Point at OpenAI, Groq, or a local server (LocalAI, Speaches, Kokoro-FastAPI). Leave blank to use the server default.",
"baseUrl": "Base URL",
"apiKey": "API key",
"sttModel": "Speech-to-text model",
"ttsModel": "Text-to-speech model",
"voice": "Voice",
"note": "The shown defaults work with OpenAI once you add a key. For other providers, set the base URL and model names to match."
},
"quickSettings": {
"title": "Quick Settings",
"sections": {
@@ -63,6 +77,7 @@
"showThinking": "Show thinking",
"autoScrollToBottom": "Auto-scroll to bottom",
"sendByCtrlEnter": "Send by Ctrl+Enter",
"voiceEnabled": "Voice (mic + read aloud)",
"sendByCtrlEnterDescription": "When enabled, pressing Ctrl+Enter will send the message instead of just Enter. This is useful for IME users to avoid accidental sends.",
"dragHandle": {
"dragging": "Dragging handle",
@@ -93,6 +108,7 @@
"appearance": "Appearance",
"git": "Git",
"apiTokens": "API & Tokens",
"voice": "Voice",
"tasks": "Tasks",
"notifications": "Notifications",
"plugins": "Plugins",