From c947eaaee5fbc959563efb917f4ec7c88847dd6b Mon Sep 17 00:00:00 2001 From: Haile <118998054+blackmammoth@users.noreply.github.com> Date: Thu, 25 Jun 2026 15:57:10 +0300 Subject: [PATCH 1/2] feat: play sound for pending tool requests (#918) --- .../chat/hooks/useChatProviderState.ts | 11 +-- .../chat/hooks/useChatRealtimeHandlers.ts | 67 ++++++++++++++++--- src/components/chat/view/ChatInterface.tsx | 1 + src/i18n/locales/en/settings.json | 2 +- src/utils/notificationSound.ts | 4 +- 5 files changed, 65 insertions(+), 20 deletions(-) diff --git a/src/components/chat/hooks/useChatProviderState.ts b/src/components/chat/hooks/useChatProviderState.ts index a2910b0d..ea49d841 100644 --- a/src/components/chat/hooks/useChatProviderState.ts +++ b/src/components/chat/hooks/useChatProviderState.ts @@ -114,7 +114,6 @@ export function useChatProviderState({ selectedSession, selectedProject }: UseCh const [providerModelsLoading, setProviderModelsLoading] = useState(true); const [providerModelsRefreshing, setProviderModelsRefreshing] = useState(false); - const lastProviderRef = useRef(provider); const providerModelsRequestIdRef = useRef(0); const setStoredProviderModel = useCallback((targetProvider: LLMProvider, model: string) => { @@ -344,14 +343,8 @@ export function useChatProviderState({ selectedSession, selectedProject }: UseCh localStorage.setItem('selected-provider', selectedSession.__provider); }, [provider, selectedSession]); - useEffect(() => { - if (lastProviderRef.current === provider) { - return; - } - setPendingPermissionRequests([]); - lastProviderRef.current = provider; - }, [provider]); - + // Permission prompts belong to a session, not to the transient provider + // selection that is synchronized after navigation. 
useEffect(() => { setPendingPermissionRequests((previous) => previous.filter((request) => !request.sessionId || request.sessionId === selectedSession?.id), diff --git a/src/components/chat/hooks/useChatRealtimeHandlers.ts b/src/components/chat/hooks/useChatRealtimeHandlers.ts index 5c294efa..80826486 100644 --- a/src/components/chat/hooks/useChatRealtimeHandlers.ts +++ b/src/components/chat/hooks/useChatRealtimeHandlers.ts @@ -1,20 +1,29 @@ -import { useEffect } from 'react'; +import { useEffect, useRef } from 'react'; import type { Dispatch, MutableRefObject, SetStateAction } from 'react'; import type { ServerEvent } from '../../../contexts/WebSocketContext'; import { showCompletionTitleIndicator } from '../../../utils/pageTitleNotification'; -import { playChatCompletionSound } from '../../../utils/notificationSound'; +import { playChatCompletionSound, playNotificationSound } from '../../../utils/notificationSound'; import type { MarkSessionIdle, MarkSessionProcessing } from '../../../hooks/useSessionProtection'; import type { PendingPermissionRequest } from '../types/types'; import type { ProjectSession, LLMProvider } from '../../../types/app'; import type { SessionStore, NormalizedMessage } from '../../../stores/useSessionStore'; +const isActionablePermissionRequest = (request: { toolName?: unknown } | null | undefined): boolean => { + return request?.toolName !== 'ExitPlanMode' && request?.toolName !== 'exit_plan_mode'; +}; + +const hasActionablePermissionRequests = (requests: Array<{ toolName?: unknown }> | null | undefined): boolean => { + return Array.isArray(requests) && requests.some((request) => isActionablePermissionRequest(request)); +}; + interface UseChatRealtimeHandlersArgs { subscribe: (listener: (event: ServerEvent) => void) => () => void; provider: LLMProvider; selectedSession: ProjectSession | null; currentSessionId: string | null; setTokenBudget: (budget: Record | null) => void; + pendingPermissionRequests: PendingPermissionRequest[]; 
setPendingPermissionRequests: Dispatch>; streamTimerRef: MutableRefObject; accumulatedStreamRef: MutableRefObject; @@ -52,6 +61,7 @@ export function useChatRealtimeHandlers({ selectedSession, currentSessionId, setTokenBudget, + pendingPermissionRequests, setPendingPermissionRequests, streamTimerRef, accumulatedStreamRef, @@ -62,13 +72,29 @@ export function useChatRealtimeHandlers({ onWebSocketReconnect, sessionStore, }: UseChatRealtimeHandlersArgs) { + // Session switches can send `chat.subscribe` before this effect has a chance + // to rebind the websocket listener. Read the visible session id from a ref + // so a fast `chat_subscribed` ack is matched against the current view, not + // the previous render's closed-over selection. + const activeViewSessionIdRef = useRef(selectedSession?.id || currentSessionId || null); + activeViewSessionIdRef.current = selectedSession?.id || currentSessionId || null; + + // Keep the latest pending-permission snapshot available to the websocket + // listener so back-to-back permission events can dedupe and re-arm the + // notification sound before React finishes a rerender. + const pendingPermissionRequestsRef = useRef(pendingPermissionRequests); + + useEffect(() => { + pendingPermissionRequestsRef.current = pendingPermissionRequests; + }, [pendingPermissionRequests]); + useEffect(() => { const handleEvent = (msg: ServerEvent) => { if (!msg.kind) { return; } - const activeViewSessionId = selectedSession?.id || currentSessionId || null; + const activeViewSessionId = activeViewSessionIdRef.current; const sid = (typeof msg.sessionId === 'string' && msg.sessionId) || activeViewSessionId; // Record replay progress for every sequenced live event. 
@@ -101,7 +127,16 @@ export function useChatRealtimeHandlers({ const isViewedSession = sid === activeViewSessionId; if (isViewedSession && Array.isArray(msg.pendingPermissions)) { - setPendingPermissionRequests(msg.pendingPermissions as PendingPermissionRequest[]); + const nextPendingPermissionRequests = msg.pendingPermissions as PendingPermissionRequest[]; + const hadActionablePermissionRequests = hasActionablePermissionRequests(pendingPermissionRequestsRef.current); + const hasPendingActionablePermissionRequests = hasActionablePermissionRequests(nextPendingPermissionRequests); + + pendingPermissionRequestsRef.current = nextPendingPermissionRequests; + setPendingPermissionRequests(nextPendingPermissionRequests); + + if (hasPendingActionablePermissionRequests && !hadActionablePermissionRequests) { + void playNotificationSound(); + } } return; } @@ -203,6 +238,7 @@ export function useChatRealtimeHandlers({ // hides it immediately and atomically. onSessionIdle?.(sid); if (sid === activeViewSessionId) { + pendingPermissionRequestsRef.current = []; setPendingPermissionRequests([]); } @@ -234,10 +270,14 @@ export function useChatRealtimeHandlers({ case 'permission_request': { if (!msg.requestId) break; + if (isActionablePermissionRequest({ toolName: msg.toolName })) { + void playNotificationSound(); + } + if (sid === activeViewSessionId) { - setPendingPermissionRequests((prev) => { - if (prev.some((r: PendingPermissionRequest) => r.requestId === msg.requestId)) return prev; - return [...prev, { + const previousPendingPermissionRequests = pendingPermissionRequestsRef.current; + if (!previousPendingPermissionRequests.some((request) => request.requestId === msg.requestId)) { + const nextPendingPermissionRequests = [...previousPendingPermissionRequests, { requestId: msg.requestId as string, toolName: (msg.toolName as string) || 'UnknownTool', input: msg.input, @@ -245,7 +285,10 @@ export function useChatRealtimeHandlers({ sessionId: sid || null, receivedAt: new Date(), }]; 
- }); + + pendingPermissionRequestsRef.current = nextPendingPermissionRequests; + setPendingPermissionRequests(nextPendingPermissionRequests); + } } if (sid) { onSessionProcessing?.(sid); @@ -255,7 +298,12 @@ export function useChatRealtimeHandlers({ case 'permission_cancelled': { if (msg.requestId && sid === activeViewSessionId) { - setPendingPermissionRequests((prev) => prev.filter((r: PendingPermissionRequest) => r.requestId !== msg.requestId)); + const nextPendingPermissionRequests = pendingPermissionRequestsRef.current.filter( + (request: PendingPermissionRequest) => request.requestId !== msg.requestId, + ); + + pendingPermissionRequestsRef.current = nextPendingPermissionRequests; + setPendingPermissionRequests(nextPendingPermissionRequests); } break; } @@ -286,6 +334,7 @@ export function useChatRealtimeHandlers({ selectedSession, currentSessionId, setTokenBudget, + pendingPermissionRequests, setPendingPermissionRequests, streamTimerRef, accumulatedStreamRef, diff --git a/src/components/chat/view/ChatInterface.tsx b/src/components/chat/view/ChatInterface.tsx index 5efe6af4..15786a41 100644 --- a/src/components/chat/view/ChatInterface.tsx +++ b/src/components/chat/view/ChatInterface.tsx @@ -239,6 +239,7 @@ function ChatInterface({ selectedSession, currentSessionId, setTokenBudget, + pendingPermissionRequests, setPendingPermissionRequests, streamTimerRef, accumulatedStreamRef, diff --git a/src/i18n/locales/en/settings.json b/src/i18n/locales/en/settings.json index fbcd797a..89d4e651 100644 --- a/src/i18n/locales/en/settings.json +++ b/src/i18n/locales/en/settings.json @@ -114,7 +114,7 @@ }, "sound": { "title": "Sound", - "description": "Play a short tone when a chat run finishes.", + "description": "Play a short tone when a chat run finishes or needs tool approval.", "enabled": "Enabled", "test": "Test sound" }, diff --git a/src/utils/notificationSound.ts b/src/utils/notificationSound.ts index 78af2d99..d28d6659 100644 --- a/src/utils/notificationSound.ts +++ 
b/src/utils/notificationSound.ts @@ -58,7 +58,7 @@ const playTone = ( oscillator.stop(startsAt + duration + 0.02); }; -export const playChatCompletionSound = async ({ force = false } = {}): Promise => { +export const playNotificationSound = async ({ force = false } = {}): Promise => { if (!force && !isNotificationSoundEnabled()) { return; } @@ -81,3 +81,5 @@ export const playChatCompletionSound = async ({ force = false } = {}): Promise => playNotificationSound(options); From 591e8e7642589b0584f9b29b46b881aaab54624e Mon Sep 17 00:00:00 2001 From: Haile <118998054+blackmammoth@users.noreply.github.com> Date: Fri, 26 Jun 2026 17:06:40 +0300 Subject: [PATCH 2/2] fix: voice tts format settings (#919) * feat(voice): add optional speech-to-text input and read-aloud TTS Adds a push-to-talk mic button in the composer and a read-aloud button on assistant messages. Both are opt-in and hidden unless a voice backend is configured via VOICE_SIDECAR_URL. The auth-gated /api/voice proxy forwards to a configurable backend exposing /transcribe and /tts (provider-agnostic); the frontend probes /api/voice/health and hides the controls when disabled. Adds i18n keys and docs/voice.md. Includes a local, no-API-key reference backend in voice-sidecar/ (faster-whisper for STT, Kokoro-82M for TTS, both CPU-capable). * refactor(voice): provider-agnostic backend and in-app config Switches the voice proxy to the OpenAI audio API (/v1/audio/transcriptions and /v1/audio/speech) so it works with OpenAI, Groq, or a local server. Adds a Settings -> Voice tab (base URL, API key, models, voice) plus a Quick Settings toggle, and removes the bundled Python sidecar. Review fixes: stop mic tracks on unmount, clear the global TTS stop handler and revoke leaked blob URLs, add fetch timeouts in the proxy, surface mic errors in the button, trim before appending transcripts, and drop the repo-wide wav ignore. 
* fix(voice): relax backend timeout and surface timeout errors Bumps the proxy timeout to 5 minutes (VOICE_TIMEOUT_MS) since local TTS can synthesize long messages at roughly real-time, and returns a clear timed-out message (504) instead of failing silently. The read-aloud button now shows backend errors. * fix(voice): play read-aloud through an app-level player to stop cutoffs Read-aloud now runs in a single module-level player outside the React tree instead of per-message component state. Switching chats or re-rendering a message no longer revokes the blob URL mid-play (the 'Invalid URI' cutoff). Adds content-keyed caching so re-listening doesn't regenerate, and reuses one audio element (also unlocks iOS once). * fix(voice): address review (SSRF guard, auth mapping, client timeout) Validates the user-supplied backend URL (http/https only, blocks the link-local metadata range) to prevent SSRF; remaps upstream 401/403 so a bad voice API key isn't read as the app's own auth failing; adds a client-side AbortController timeout on the read-aloud request so the button can't sit in loading if a request stalls. * docs(voice): provider-agnostic wording and jsdoc on proxy functions drop leftover sidecar/faster-whisper references now that the backend is any openai-compatible voice api, and add jsdoc to the voice-proxy functions so the docstring coverage check passes. 
* fix(voice): harden timeout parsing, tts input check, and player abort - fall back to the default when VOICE_TIMEOUT_MS is non-numeric or <= 0, so a bad override can't make the abort fire immediately - type-check the tts `text` before calling .trim() so a non-string body returns 400 instead of throwing - abort the in-flight TTS fetch on stop() and on a superseding play, so tapping read-aloud repeatedly doesn't leave orphaned requests generating audio * feat(voice): send transcript with the main send button while recording while dictating, the main send button stops recording, transcribes, and sends in one tap, matching the codex-style flow. the mic button still stops and drops the transcript into the input box to edit before sending. voice recording state is lifted into the composer so both buttons share it, and the send button is enabled (not grayed) while recording. also fix a pre-existing type error: the quick-settings preferences map was missing voiceEnabled. * fix(voice): make stop() idempotent so a double tap can't throw guard on the recorder's own state instead of react state, so a double tap or the mic and send buttons both firing won't call stop() on an already-inactive MediaRecorder. * fix(voice): expose TTS format in user settings * fix(voice): harden recording and backend behavior Redirects could bypass the backend URL guard, and TTS playback waited for full buffering. Recording could overlap or finish after teardown. Controls also ignored backend readiness. Explicit formats and config-aware cache keys prevent stale audio after settings change. * fix(voice): validate config and request boundaries Malformed stored settings could break voice requests instead of using safe defaults. Health results could outlive auth changes. URL checks also did not guard the fetch sink. Remove constant recorder branches so lifecycle cancellation stays clear. 
* fix(voice): separate client and server backends User-selected backend URLs must remain usable without letting clients control server requests. Call custom providers from the browser while keeping the server proxy bound to its configured host. This restores voice controls for frontend settings without reopening the SSRF path. * fix: hide voice options until enabled --------- Co-authored-by: newsbubbles Co-authored-by: Simos Mikelatos --- server/index.js | 3 + server/voice-proxy.js | 224 ++++++++++++++++++ .../chat/hooks/useChatComposerState.ts | 12 + src/components/chat/hooks/useTts.ts | 33 +++ .../chat/hooks/useVoiceAvailable.ts | 85 +++++++ src/components/chat/hooks/useVoiceInput.ts | 149 ++++++++++++ src/components/chat/view/ChatInterface.tsx | 2 + .../chat/view/subcomponents/ChatComposer.tsx | 51 +++- .../view/subcomponents/MessageComponent.tsx | 4 + .../subcomponents/MessageSpeakControl.tsx | 44 ++++ .../view/subcomponents/VoiceInputButton.tsx | 46 ++++ .../quick-settings-panel/constants.ts | 6 + src/components/quick-settings-panel/types.ts | 3 +- .../view/QuickSettingsContent.tsx | 5 +- .../view/QuickSettingsPanelView.tsx | 2 + src/components/settings/types/types.ts | 2 +- src/components/settings/view/Settings.tsx | 3 + .../settings/view/SettingsSidebar.tsx | 4 +- .../settings/view/tabs/VoiceSettingsTab.tsx | 91 +++++++ src/hooks/useUiPreferences.ts | 2 + src/hooks/useVoiceConfig.ts | 68 ++++++ src/i18n/locales/en/chat.json | 8 + src/i18n/locales/en/settings.json | 17 ++ src/lib/voiceApi.ts | 60 +++++ src/lib/voicePlayer.ts | 196 +++++++++++++++ 25 files changed, 1112 insertions(+), 8 deletions(-) create mode 100644 server/voice-proxy.js create mode 100644 src/components/chat/hooks/useTts.ts create mode 100644 src/components/chat/hooks/useVoiceAvailable.ts create mode 100644 src/components/chat/hooks/useVoiceInput.ts create mode 100644 src/components/chat/view/subcomponents/MessageSpeakControl.tsx create mode 100644 
src/components/chat/view/subcomponents/VoiceInputButton.tsx create mode 100644 src/components/settings/view/tabs/VoiceSettingsTab.tsx create mode 100644 src/hooks/useVoiceConfig.ts create mode 100644 src/lib/voiceApi.ts create mode 100644 src/lib/voicePlayer.ts diff --git a/server/index.js b/server/index.js index d957ef58..7db1b122 100755 --- a/server/index.js +++ b/server/index.js @@ -61,6 +61,7 @@ import userRoutes from './routes/user.js'; import geminiRoutes from './routes/gemini.js'; import pluginsRoutes from './routes/plugins.js'; import providerRoutes from './modules/providers/provider.routes.js'; +import voiceRoutes from './voice-proxy.js'; import browserUseRoutes from './modules/browser-use/browser-use.routes.js'; import browserUseMcpRoutes from './modules/browser-use/browser-use-mcp.routes.js'; import { browserUseService } from './modules/browser-use/browser-use.service.js'; @@ -222,6 +223,8 @@ app.use('/api/providers', authenticateToken, providerRoutes); // Agent API Routes (uses API key authentication) app.use('/api/agent', agentRoutes); +app.use('/api/voice', authenticateToken, voiceRoutes); + // Serve public files (like api-docs.html) app.use(express.static(path.join(APP_ROOT, 'public'))); diff --git a/server/voice-proxy.js b/server/voice-proxy.js new file mode 100644 index 00000000..1ea4a6d8 --- /dev/null +++ b/server/voice-proxy.js @@ -0,0 +1,224 @@ +// Optional voice proxy — forwards STT/TTS to an OpenAI-compatible audio backend. +// +// The backend is whatever the user points at: OpenAI, Groq, or a local server +// (LocalAI / Speaches / Kokoro-FastAPI / openedai-speech / etc.). It must expose the +// standard OpenAI audio endpoints: +// POST {base}/audio/transcriptions (multipart 'file' + 'model') -> { text } +// POST {base}/audio/speech ({ model, voice, input }) -> audio bytes +// +// Config is resolved per-request from headers (set by the client's voice settings), +// falling back to server env defaults. 
Mounted at /api/voice behind authenticateToken. +import { Readable } from 'node:stream'; + +import express from 'express'; + +const ENV = { + baseUrl: (process.env.VOICE_API_BASE_URL || '').replace(/\/$/, ''), + apiKey: process.env.VOICE_API_KEY || '', + sttModel: process.env.VOICE_STT_MODEL || 'whisper-1', + ttsModel: process.env.VOICE_TTS_MODEL || 'tts-1', + ttsVoice: process.env.VOICE_TTS_VOICE || 'alloy', +}; + +/** + * Resolve the voice backend config for a request. Client headers (set from the user's in-app voice settings) + * take precedence over the server env defaults, except baseUrl, which always comes from the server env. + * @param {import('express').Request} req + * @returns {{baseUrl: string, apiKey: string, sttModel: string, ttsModel: string, ttsVoice: string, ttsFormat: string}} + */ +function resolveConfig(req) { + const h = req.headers; + return { + // Security: do not allow clients to control the outbound backend host. + // Always use the server-side configured base URL. + baseUrl: ENV.baseUrl, + apiKey: String(h['x-voice-api-key'] || '') || ENV.apiKey, + sttModel: String(h['x-voice-stt-model'] || '') || ENV.sttModel, + ttsModel: String(h['x-voice-tts-model'] || '') || ENV.ttsModel, + ttsVoice: String(h['x-voice-tts-voice'] || '') || ENV.ttsVoice, + ttsFormat: String(h['x-voice-tts-format'] || '').trim(), + }; +} + +const router = express.Router(); + +// Generous by default — local TTS can synthesize long messages at ~real-time on CPU. +// Guard against a non-numeric/zero override that would make setTimeout fire immediately. +const DEFAULT_VOICE_TIMEOUT_MS = 300000; +const _parsedTimeout = Number(process.env.VOICE_TIMEOUT_MS); +const VOICE_TIMEOUT_MS = Number.isFinite(_parsedTimeout) && _parsedTimeout > 0 + ? _parsedTimeout + : DEFAULT_VOICE_TIMEOUT_MS; + +/** + * fetch() with an AbortController timeout so a stalled backend can't hold the + * request open indefinitely. Aborts after VOICE_TIMEOUT_MS.
+ * @param {string} url + * @param {RequestInit} [options] + * @returns {Promise<Response>} + */ +async function fetchWithTimeout(url, options = {}) { + const parsed = new URL(url); + if (!['http:', 'https:'].includes(parsed.protocol) || !isAllowedBackendUrl(parsed.origin)) { + throw new Error('Blocked outbound voice backend URL'); + } + const controller = new AbortController(); + const timer = setTimeout(() => controller.abort(), VOICE_TIMEOUT_MS); + try { + return await fetch(parsed.toString(), { redirect: 'manual', ...options, signal: controller.signal }); + } finally { + clearTimeout(timer); + } +} + +/** + * Turn a backend fetch failure into a clear, actionable client response: + * 504 on timeout (AbortError), 502 otherwise. + * @param {import('express').Response} res + * @param {Error} e + */ +function backendError(res, e) { + if (e && e.name === 'AbortError') { + return res.status(504).json({ + error: `Voice backend timed out after ${Math.round(VOICE_TIMEOUT_MS / 1000)}s. Check your voice backend.`, + }); + } + return res.status(502).json({ error: `Voice backend unreachable: ${e.message}` }); +} + +/** + * SSRF guard for the server-configured backend URL: allow http/https only and + * block the link-local / cloud-metadata range (169.254.x). localhost and private + * ranges are allowed on purpose so users can point at a local voice server + * (LocalAI, Speaches, Kokoro-FastAPI, etc.). + * @param {string} raw + * @returns {boolean} + */ +function isAllowedBackendUrl(raw) { + let u; + try { + u = new URL(raw); + } catch { + return false; + } + if (u.protocol !== 'http:' && u.protocol !== 'https:') return false; + if (u.hostname === '169.254.169.254' || u.hostname.startsWith('169.254.')) return false; + return true; +} + +/** + * Relay an upstream (backend) error to the client without making an upstream + * 401/403 look like the user's own app login failed.
+ * @param {import('express').Response} res + * @param {number} status + * @param {string} [text] + */ +function upstreamError(res, status, text) { + if (status === 401 || status === 403) { + return res.status(502).json({ error: 'Voice backend rejected the request (check the API key).' }); + } + return res.status(status).json({ error: text || 'voice backend error' }); +} + +let _upload = null; +/** + * Lazily build a memory-storage multer instance (25 MB cap) for audio uploads, + * so multer is only imported when the voice feature is actually used. + * @returns {Promise<import('multer').Multer>} + */ +async function getUpload() { + if (!_upload) { + const multer = (await import('multer')).default; + _upload = multer({ storage: multer.memoryStorage(), limits: { fileSize: 25 * 1024 * 1024 } }); + } + return _upload; +} + +/** + * Build the Authorization header for the backend, or an empty object when no + * key is configured (e.g. a local server that needs none). + * @param {string} apiKey + * @returns {Record<string, string>} + */ +function authHeader(apiKey) { + return apiKey ? { Authorization: `Bearer ${apiKey}` } : {}; +} + +/** + * GET /api/voice/health -> { configured } (true when a backend base URL is set). + */ +router.get('/health', (req, res) => { + res.json({ configured: Boolean(resolveConfig(req).baseUrl) }); +}); + +/** + * POST /api/voice/transcribe (multipart 'audio') -> { text }. + * Forwards the uploaded audio to the backend's /audio/transcriptions endpoint. + */ +router.post('/transcribe', async (req, res) => { + const cfg = resolveConfig(req); + if (!cfg.baseUrl) return res.status(503).json({ error: 'No voice backend configured' }); + if (!isAllowedBackendUrl(cfg.baseUrl)) return res.status(400).json({ error: 'Invalid voice backend URL.'
}); + const upload = await getUpload(); + upload.single('audio')(req, res, async (err) => { + if (err) return res.status(400).json({ error: err.message }); + if (!req.file) return res.status(400).json({ error: 'No audio uploaded' }); + try { + const fd = new FormData(); + fd.append( + 'file', + new Blob([req.file.buffer], { type: req.file.mimetype || 'audio/webm' }), + req.file.originalname || 'recording.webm', + ); + fd.append('model', cfg.sttModel); + const r = await fetchWithTimeout(`${cfg.baseUrl}/audio/transcriptions`, { + method: 'POST', + headers: authHeader(cfg.apiKey), + body: fd, + }); + const text = await r.text(); + if (!r.ok) return upstreamError(res, r.status, text); + let data; + try { data = JSON.parse(text); } catch { data = { text }; } + res.json({ text: data.text ?? '' }); + } catch (e) { + backendError(res, e); + } + }); +}); + +/** + * POST /api/voice/tts { text } -> audio bytes. + * Forwards the text to the backend's /audio/speech endpoint and streams the audio back. + */ +router.post('/tts', async (req, res) => { + const cfg = resolveConfig(req); + if (!cfg.baseUrl) return res.status(503).json({ error: 'No voice backend configured' }); + if (!isAllowedBackendUrl(cfg.baseUrl)) return res.status(400).json({ error: 'Invalid voice backend URL.' }); + const text = req.body?.text; + if (typeof text !== 'string' || !text.trim()) return res.status(400).json({ error: 'text required' }); + try { + const r = await fetchWithTimeout(`${cfg.baseUrl}/audio/speech`, { + method: 'POST', + headers: { 'Content-Type': 'application/json', ...authHeader(cfg.apiKey) }, + body: JSON.stringify({ + model: cfg.ttsModel, + voice: cfg.ttsVoice, + input: text, + ...(cfg.ttsFormat ? 
{ response_format: cfg.ttsFormat } : {}), + }), + }); + if (!r.ok) { + const errText = await r.text().catch(() => 'tts failed'); + return upstreamError(res, r.status, errText); + } + res.setHeader('Content-Type', r.headers.get('content-type') || 'audio/mpeg'); + res.setHeader('Cache-Control', 'no-store'); + if (!r.body) return res.end(); + Readable.fromWeb(r.body).on('error', (error) => res.destroy(error)).pipe(res); + } catch (e) { + backendError(res, e); + } +}); + +export default router; diff --git a/src/components/chat/hooks/useChatComposerState.ts b/src/components/chat/hooks/useChatComposerState.ts index c1f86f2d..e3e65b77 100644 --- a/src/components/chat/hooks/useChatComposerState.ts +++ b/src/components/chat/hooks/useChatComposerState.ts @@ -775,6 +775,17 @@ export function useChatComposerState({ handleSubmitRef.current = handleSubmit; }, [handleSubmit]); + // A voice transcript either fills the input (to edit before sending) or, when the + // user tapped "stop and send", is submitted straight away. Mirror the value into + // inputValueRef synchronously so handleSubmit reads the new text, not the stale state. + const handleVoiceTranscript = useCallback((text: string, send?: boolean) => { + const base = inputValueRef.current.trim(); + const next = base ? 
`${base} ${text}` : text; + setInput(next); + inputValueRef.current = next; + if (send) handleSubmitRef.current?.(createFakeSubmitEvent()); + }, [setInput]); + useEffect(() => { inputValueRef.current = input; }, [input]); @@ -1013,6 +1024,7 @@ export function useChatComposerState({ isDragActive, openImagePicker: open, handleSubmit, + handleVoiceTranscript, handleInputChange, handleKeyDown, handlePaste, diff --git a/src/components/chat/hooks/useTts.ts b/src/components/chat/hooks/useTts.ts new file mode 100644 index 00000000..fc4a6c33 --- /dev/null +++ b/src/components/chat/hooks/useTts.ts @@ -0,0 +1,33 @@ +import { useCallback, useEffect, useState } from 'react'; +import { voicePlayer, voiceId, type VoiceSnapshot } from '../../../lib/voicePlayer'; + +export type TtsState = VoiceSnapshot['state']; + +/** + * Thin adapter over the app-level voicePlayer. Playback lives outside React (see + * lib/voicePlayer), so switching chats or re-rendering a message no longer cuts the + * audio off. This hook just reflects the player's state for one message and forwards taps. + */ +export function useTts(getText: () => string) { + const content = getText(); + const id = voiceId(content); + + const [snap, setSnap] = useState(() => voicePlayer.getSnapshot(id)); + + useEffect(() => { + const update = () => + setSnap((prev) => { + const next = voicePlayer.getSnapshot(id); + return prev.state === next.state && prev.error === next.error ? 
prev : next; + }); + update(); + return voicePlayer.subscribe(update); + }, [id]); + + const toggle = useCallback(() => { + voicePlayer.unlock(); // synchronous, within the click gesture (iOS) + voicePlayer.toggle(content); + }, [content]); + + return { state: snap.state, toggle, error: snap.error }; +} diff --git a/src/components/chat/hooks/useVoiceAvailable.ts b/src/components/chat/hooks/useVoiceAvailable.ts new file mode 100644 index 00000000..9ee92c48 --- /dev/null +++ b/src/components/chat/hooks/useVoiceAvailable.ts @@ -0,0 +1,85 @@ +import { useEffect, useState } from 'react'; + +import { authenticatedFetch } from '../../../utils/api'; +import { readVoiceConfig, VOICE_CONFIG_SYNC_EVENT } from '../../../hooks/useVoiceConfig'; + +// Voice UI is gated on the `voiceEnabled` UI preference (toggled in Quick Settings / +// the Settings modal) and a configured voice backend. +const STORAGE_KEY = 'uiPreferences'; +const SYNC_EVENT = 'ui-preferences:sync'; +let healthRequest: Promise | null = null; + +function checkVoiceHealth(): Promise { + if (healthRequest) return healthRequest; + const request = authenticatedFetch('/api/voice/health') + .then(async (response) => { + if (!response.ok) throw new Error(`Voice health check failed (${response.status})`); + const data = await response.json(); + return data?.configured === true; + }) + .finally(() => { + healthRequest = null; + }); + healthRequest = request; + return request; +} + +function readVoiceEnabled(): boolean { + try { + const raw = localStorage.getItem(STORAGE_KEY); + if (!raw) return false; + const parsed = JSON.parse(raw); + return parsed?.voiceEnabled === true || parsed?.voiceEnabled === 'true'; + } catch { + return false; + } +} + +export function useVoiceAvailable(): boolean { + const [enabled, setEnabled] = useState(() => + typeof window === 'undefined' ? 
false : readVoiceEnabled(), + ); + const [available, setAvailable] = useState(false); + + useEffect(() => { + const update = () => setEnabled(readVoiceEnabled()); + window.addEventListener('storage', update); + window.addEventListener(SYNC_EVENT, update as EventListener); + return () => { + window.removeEventListener('storage', update); + window.removeEventListener(SYNC_EVENT, update as EventListener); + }; + }, []); + + useEffect(() => { + let active = true; + let requestId = 0; + + const check = async () => { + if (!enabled) { + setAvailable(false); + return; + } + if (readVoiceConfig().baseUrl.trim()) { + setAvailable(true); + return; + } + const id = ++requestId; + try { + const result = await checkVoiceHealth(); + if (active && id === requestId) setAvailable(result); + } catch { + if (active && id === requestId) setAvailable(false); + } + }; + + void check(); + window.addEventListener(VOICE_CONFIG_SYNC_EVENT, check); + return () => { + active = false; + window.removeEventListener(VOICE_CONFIG_SYNC_EVENT, check); + }; + }, [enabled]); + + return enabled && available; +} diff --git a/src/components/chat/hooks/useVoiceInput.ts b/src/components/chat/hooks/useVoiceInput.ts new file mode 100644 index 00000000..6fcadd56 --- /dev/null +++ b/src/components/chat/hooks/useVoiceInput.ts @@ -0,0 +1,149 @@ +import { useCallback, useEffect, useRef, useState } from 'react'; + +import { transcribeVoice } from '../../../lib/voiceApi'; + +// Mobile-safe recording: iOS Safari 18.4+ supports webm/opus; older iOS needs mp4. 
+const MIME_CANDIDATES = [
+  'audio/webm;codecs=opus',
+  'audio/webm',
+  'audio/mp4',
+  'audio/ogg;codecs=opus',
+  'audio/ogg',
+];
+
+function pickMime(): string {
+  for (const t of MIME_CANDIDATES) {
+    try {
+      if (typeof MediaRecorder !== 'undefined' && MediaRecorder.isTypeSupported(t)) return t;
+    } catch {
+      /* isTypeSupported can throw on some iOS versions */
+    }
+  }
+  return ''; // no candidate supported: start() then builds the recorder without a mimeType
+}
+
+export type VoiceInputState = 'idle' | 'recording' | 'transcribing';
+
+/**
+ * Push-to-talk dictation. Records the mic, uploads the clip through transcribeVoice, and
+ * returns the transcript via onTranscript(text, send); failures are reported via onError.
+ * Returns { state, toggle, stop }; stop({ send: true }) flags that transcript for sending.
+ */
+export function useVoiceInput(
+  onTranscript: (text: string, send?: boolean) => void,
+  onError?: (msg: string) => void,
+) {
+  const [state, setState] = useState<VoiceInputState>('idle');
+  const recorderRef = useRef<MediaRecorder | null>(null);
+  const chunksRef = useRef<Blob[]>([]);
+  const streamRef = useRef<MediaStream | null>(null);
+  const cancelledRef = useRef(false);
+  const startingRef = useRef(false);
+  // Whether the in-progress stop should auto-send the transcript (vs just fill the box).
+  const sendRef = useRef(false);
+
+  const stopTracks = () => {
+    streamRef.current?.getTracks().forEach((t) => t.stop());
+    streamRef.current = null;
+  };
+
+  // Stop the mic if the component unmounts mid-recording.
+  useEffect(() => {
+    cancelledRef.current = false;
+    return () => {
+      cancelledRef.current = true;
+      startingRef.current = false;
+      streamRef.current?.getTracks().forEach((t) => t.stop());
+      streamRef.current = null;
+      recorderRef.current = null;
+    };
+  }, []);
+
+  const start = useCallback(async () => {
+    if (startingRef.current || (recorderRef.current && recorderRef.current.state !== 'inactive')) return;
+    startingRef.current = true;
+    try {
+      const stream = await navigator.mediaDevices.getUserMedia({
+        audio: { echoCancellation: true, noiseSuppression: true },
+      });
+      if (cancelledRef.current) {
+        stream.getTracks().forEach((t) => t.stop());
+        return;
+      }
+      streamRef.current = stream;
+      const mimeType = pickMime();
+      const rec = mimeType ? new MediaRecorder(stream, { mimeType }) : new MediaRecorder(stream);
+      recorderRef.current = rec;
+      chunksRef.current = [];
+
+      rec.ondataavailable = (e) => {
+        if (e.data.size > 0) chunksRef.current.push(e.data);
+      };
+
+      rec.onstop = async () => {
+        stopTracks();
+        if (cancelledRef.current) return; // unmounted: skip state updates and callbacks
+        // Capture and clear the send intent for this stop before any async work.
+        const shouldSend = sendRef.current;
+        sendRef.current = false;
+        const type = rec.mimeType || 'audio/webm';
+        const blob = new Blob(chunksRef.current, { type });
+        if (blob.size < 800) { // tiny clip: reported as too short instead of being uploaded
+          setState('idle');
+          onError?.('Recording too short');
+          return;
+        }
+        setState('transcribing');
+        try {
+          const ext = type.includes('mp4') ? 'm4a' : type.includes('ogg') ? 'ogg' : 'webm';
+          const res = await transcribeVoice(blob, `recording.${ext}`);
+          if (!res.ok) throw new Error(`transcribe ${res.status}`);
+          const data = await res.json();
+          if (cancelledRef.current) return;
+          const text = String(data?.text || '').trim();
+          if (text) onTranscript(text, shouldSend);
+          else onError?.('No speech detected');
+        } catch (e) {
+          if (!cancelledRef.current) {
+            onError?.(`Transcription failed: ${e instanceof Error ? e.message : String(e)}`);
+          }
+        } finally {
+          if (!cancelledRef.current) setState('idle');
+        }
+      };
+
+      rec.start();
+      setState('recording');
+    } catch (e) {
+      recorderRef.current = null;
+      stopTracks();
+      if (cancelledRef.current) return;
+      const err = e as { name?: string; message?: string };
+      let msg = `Mic error: ${err?.message || e}`;
+      if (err?.name === 'NotAllowedError') msg = 'Microphone access denied.';
+      else if (err?.name === 'NotFoundError') msg = 'No microphone found.';
+      onError?.(msg);
+      setState('idle');
+    } finally {
+      startingRef.current = false;
+    }
+  }, [onTranscript, onError]);
+
+  // Stop recording. Pass { send: true } to auto-send the transcript once it's ready.
+  // Guard on the recorder's own state (not React state) so a double tap, or the mic
+  // and Send buttons both firing, can't call stop() on an already-inactive recorder.
+  const stop = useCallback((opts?: { send?: boolean }) => {
+    const rec = recorderRef.current;
+    if (rec && rec.state !== 'inactive') {
+      sendRef.current = opts?.send ?? false;
+      rec.stop();
+    }
+  }, []);
+
+  const toggle = useCallback(() => {
+    if (state === 'recording') stop();
+    else if (state === 'idle') void start(); // start() handles its own errors; nothing to await
+  }, [state, start, stop]);
+
+  return { state, toggle, stop };
+}
diff --git a/src/components/chat/view/ChatInterface.tsx b/src/components/chat/view/ChatInterface.tsx
index 15786a41..a83dfbdc 100644
--- a/src/components/chat/view/ChatInterface.tsx
+++ b/src/components/chat/view/ChatInterface.tsx
@@ -173,6 +173,7 @@ function ChatInterface({
     isDragActive,
     openImagePicker,
     handleSubmit,
+    handleVoiceTranscript,
     handleInputChange,
     handleKeyDown,
     handlePaste,
@@ -406,6 +407,7 @@ function ChatInterface({
           renderInputWithMentions={renderInputWithMentions}
           textareaRef={textareaRef}
           input={input}
+          onVoiceTranscript={handleVoiceTranscript}
           onInputChange={handleInputChange}
           onTextareaClick={handleTextareaClick}
           onTextareaKeyDown={handleKeyDown}
diff --git a/src/components/chat/view/subcomponents/ChatComposer.tsx b/src/components/chat/view/subcomponents/ChatComposer.tsx
index c60aa893..d679df11 100644
--- a/src/components/chat/view/subcomponents/ChatComposer.tsx
+++ b/src/components/chat/view/subcomponents/ChatComposer.tsx
@@ -1,4 +1,5 @@
 import { useTranslation } from 'react-i18next';
+import { useCallback, useEffect, useRef, useState } from 'react';
 import type {
   ChangeEvent,
   ClipboardEvent,
@@ -9,8 +10,10 @@ import type {
   RefObject,
   TouchEvent,
 } from 'react';
-import { ImageIcon, MessageSquareIcon, XIcon, ArrowDownIcon } from 'lucide-react';
+import { ImageIcon, MessageSquareIcon, XIcon, ArrowDownIcon, Loader2 } from 'lucide-react';
 
+import { useVoiceInput } from '../../hooks/useVoiceInput';
+import { useVoiceAvailable } from '../../hooks/useVoiceAvailable';
 import type { SessionActivity } from '../../../../hooks/useSessionProtection';
 import type { PendingPermissionRequest, PermissionMode } from '../../types/types';
 import {
@@ -27,6 +30,7 @@ import {
 import CommandMenu from './CommandMenu';
 import ActivityIndicator from
'./ActivityIndicator'; import ImageAttachment from './ImageAttachment'; +import VoiceInputButton from './VoiceInputButton'; import PermissionRequestsBanner from './PermissionRequestsBanner'; import TokenUsageSummary from './TokenUsageSummary'; @@ -89,6 +93,7 @@ interface ChatComposerProps { renderInputWithMentions: (text: string) => ReactNode; textareaRef: RefObject; input: string; + onVoiceTranscript?: (text: string, send?: boolean) => void; onInputChange: (event: ChangeEvent) => void; onTextareaClick: (event: MouseEvent) => void; onTextareaKeyDown: (event: KeyboardEvent) => void; @@ -142,6 +147,7 @@ export default function ChatComposer({ renderInputWithMentions, textareaRef, input, + onVoiceTranscript, onInputChange, onTextareaClick, onTextareaKeyDown, @@ -154,6 +160,28 @@ export default function ChatComposer({ sendByCtrlEnter, }: ChatComposerProps) { const { t } = useTranslation('chat'); + + // Voice state is hosted here (not in the mic button) so the main Send button can stop + // recording and send the transcript in one tap, the way the mic button drops it in the box. + const voiceAvailable = useVoiceAvailable(); + const [voiceError, setVoiceError] = useState(null); + const voiceErrorTimer = useRef | null>(null); + const handleVoiceError = useCallback((msg: string) => { + setVoiceError(msg); + if (voiceErrorTimer.current) clearTimeout(voiceErrorTimer.current); + voiceErrorTimer.current = setTimeout(() => setVoiceError(null), 4000); + }, []); + useEffect(() => () => { + if (voiceErrorTimer.current) clearTimeout(voiceErrorTimer.current); + }, []); + const noopTranscript = useCallback(() => {}, []); + const { state: voiceState, toggle: voiceToggle, stop: voiceStop } = useVoiceInput( + onVoiceTranscript ?? 
noopTranscript, + handleVoiceError, + ); + const isRecording = voiceState === 'recording'; + const isTranscribing = voiceState === 'transcribing'; + const textareaRect = textareaRef.current?.getBoundingClientRect(); const commandMenuPosition = { top: textareaRect ? Math.max(16, textareaRect.top - 316) : 0, @@ -309,6 +337,10 @@ export default function ChatComposer({ + {onVoiceTranscript && voiceAvailable && ( + + )} + + + ); +}; + +export default MessageSpeakControl; diff --git a/src/components/chat/view/subcomponents/VoiceInputButton.tsx b/src/components/chat/view/subcomponents/VoiceInputButton.tsx new file mode 100644 index 00000000..249afacd --- /dev/null +++ b/src/components/chat/view/subcomponents/VoiceInputButton.tsx @@ -0,0 +1,46 @@ +import { useTranslation } from 'react-i18next'; +import { Mic, Square, Loader2 } from 'lucide-react'; + +import { PromptInputButton } from '../../../../shared/view/ui'; +import type { VoiceInputState } from '../../hooks/useVoiceInput'; + +type Props = { + state: VoiceInputState; + onToggle: () => void; + errorMsg?: string | null; +}; + +// Push-to-talk mic button (presentational). Recording state and the stop-and-send action +// are owned by the composer so the main Send button can drive them too. This button just +// starts recording and, while recording, stops and drops the transcript into the input box. +export default function VoiceInputButton({ state, onToggle, errorMsg }: Props) { + const { t } = useTranslation('chat'); + + const icon = + state === 'recording' ? ( + + ) : state === 'transcribing' ? 
( + + ) : ( + + ); + + return ( + + {errorMsg && ( + + {errorMsg} + + )} + void }) => { + e.preventDefault(); + onToggle(); + }} + > + {icon} + + + ); +} diff --git a/src/components/quick-settings-panel/constants.ts b/src/components/quick-settings-panel/constants.ts index 15c15458..408a64c7 100644 --- a/src/components/quick-settings-panel/constants.ts +++ b/src/components/quick-settings-panel/constants.ts @@ -4,6 +4,7 @@ import { Eye, Languages, Maximize2, + Mic, } from 'lucide-react'; import type { PreferenceToggleItem } from './types'; @@ -54,4 +55,9 @@ export const INPUT_SETTING_TOGGLES: PreferenceToggleItem[] = [ labelKey: 'quickSettings.sendByCtrlEnter', icon: Languages, }, + { + key: 'voiceEnabled', + labelKey: 'quickSettings.voiceEnabled', + icon: Mic, + }, ]; diff --git a/src/components/quick-settings-panel/types.ts b/src/components/quick-settings-panel/types.ts index 16002694..8d4f0826 100644 --- a/src/components/quick-settings-panel/types.ts +++ b/src/components/quick-settings-panel/types.ts @@ -6,7 +6,8 @@ export type PreferenceToggleKey = | 'showRawParameters' | 'showThinking' | 'autoScrollToBottom' - | 'sendByCtrlEnter'; + | 'sendByCtrlEnter' + | 'voiceEnabled'; export type QuickSettingsPreferences = Record; diff --git a/src/components/quick-settings-panel/view/QuickSettingsContent.tsx b/src/components/quick-settings-panel/view/QuickSettingsContent.tsx index 8d805fe9..dc539621 100644 --- a/src/components/quick-settings-panel/view/QuickSettingsContent.tsx +++ b/src/components/quick-settings-panel/view/QuickSettingsContent.tsx @@ -28,6 +28,9 @@ export default function QuickSettingsContent({ onPreferenceChange, }: QuickSettingsContentProps) { const { t } = useTranslation('settings'); + const inputSettingToggles = preferences.voiceEnabled + ? 
INPUT_SETTING_TOGGLES + : INPUT_SETTING_TOGGLES.filter(({ key }) => key !== 'voiceEnabled'); const renderToggleRows = (items: PreferenceToggleItem[]) => ( items.map(({ key, labelKey, icon }) => ( @@ -67,7 +70,7 @@ export default function QuickSettingsContent({ - {renderToggleRows(INPUT_SETTING_TOGGLES)} + {renderToggleRows(inputSettingToggles)}

{t('quickSettings.sendByCtrlEnterDescription')}

diff --git a/src/components/quick-settings-panel/view/QuickSettingsPanelView.tsx b/src/components/quick-settings-panel/view/QuickSettingsPanelView.tsx index 0de1bbc7..5f630a61 100644 --- a/src/components/quick-settings-panel/view/QuickSettingsPanelView.tsx +++ b/src/components/quick-settings-panel/view/QuickSettingsPanelView.tsx @@ -27,12 +27,14 @@ export default function QuickSettingsPanelView() { showThinking: preferences.showThinking, autoScrollToBottom: preferences.autoScrollToBottom, sendByCtrlEnter: preferences.sendByCtrlEnter, + voiceEnabled: preferences.voiceEnabled, }), [ preferences.autoExpandTools, preferences.autoScrollToBottom, preferences.sendByCtrlEnter, preferences.showRawParameters, preferences.showThinking, + preferences.voiceEnabled, ]); const handlePreferenceChange = useCallback( diff --git a/src/components/settings/types/types.ts b/src/components/settings/types/types.ts index 74c3d309..5efac8d3 100644 --- a/src/components/settings/types/types.ts +++ b/src/components/settings/types/types.ts @@ -3,7 +3,7 @@ import type { Dispatch, SetStateAction } from 'react'; import type { LLMProvider } from '../../../types/app'; import type { ProviderAuthStatus } from '../../provider-auth/types'; -export type SettingsMainTab = 'agents' | 'appearance' | 'git' | 'api' | 'tasks' | 'browser' | 'notifications' | 'plugins' | 'about'; +export type SettingsMainTab = 'agents' | 'appearance' | 'git' | 'api' | 'voice' | 'tasks' | 'browser' | 'notifications' | 'plugins' | 'about'; export type AgentProvider = LLMProvider; export type AgentCategory = 'account' | 'permissions' | 'mcp' | 'skills'; export type ProjectSortOrder = 'name' | 'date'; diff --git a/src/components/settings/view/Settings.tsx b/src/components/settings/view/Settings.tsx index bfa98edf..2d434d04 100644 --- a/src/components/settings/view/Settings.tsx +++ b/src/components/settings/view/Settings.tsx @@ -7,6 +7,7 @@ import SettingsSidebar from '../view/SettingsSidebar'; import AgentsSettingsTab from 
'../view/tabs/agents-settings/AgentsSettingsTab'; import AppearanceSettingsTab from '../view/tabs/AppearanceSettingsTab'; import CredentialsSettingsTab from '../view/tabs/api-settings/CredentialsSettingsTab'; +import VoiceSettingsTab from '../view/tabs/VoiceSettingsTab'; import GitSettingsTab from '../view/tabs/git-settings/GitSettingsTab'; import BrowserUseSettingsTab from '../view/tabs/browser-use-settings/BrowserUseSettingsTab'; import NotificationsSettingsTab from '../view/tabs/NotificationsSettingsTab'; @@ -157,6 +158,8 @@ function Settings({ isOpen, onClose, projects = [], initialTab = 'agents' }: Set {activeTab === 'api' && } + {activeTab === 'voice' && } + {activeTab === 'plugins' && } {activeTab === 'about' && } diff --git a/src/components/settings/view/SettingsSidebar.tsx b/src/components/settings/view/SettingsSidebar.tsx index dde32a9e..3b76976e 100644 --- a/src/components/settings/view/SettingsSidebar.tsx +++ b/src/components/settings/view/SettingsSidebar.tsx @@ -1,5 +1,6 @@ -import { Bell, Bot, GitBranch, Info, Key, ListChecks, MonitorPlay, Palette, Puzzle } from 'lucide-react'; +import { Bell, Bot, GitBranch, Info, Key, ListChecks, Mic, MonitorPlay, Palette, Puzzle } from 'lucide-react'; import { useTranslation } from 'react-i18next'; + import { cn } from '../../../lib/utils'; import { PillBar, Pill } from '../../../shared/view/ui'; import type { SettingsMainTab } from '../types/types'; @@ -20,6 +21,7 @@ const NAV_ITEMS: NavItem[] = [ { id: 'appearance', labelKey: 'mainTabs.appearance', icon: Palette }, { id: 'git', labelKey: 'mainTabs.git', icon: GitBranch }, { id: 'api', labelKey: 'mainTabs.apiTokens', icon: Key }, + { id: 'voice', labelKey: 'mainTabs.voice', icon: Mic }, { id: 'tasks', labelKey: 'mainTabs.tasks', icon: ListChecks }, { id: 'browser', labelKey: 'mainTabs.browser', icon: MonitorPlay }, { id: 'plugins', labelKey: 'mainTabs.plugins', icon: Puzzle }, diff --git a/src/components/settings/view/tabs/VoiceSettingsTab.tsx 
b/src/components/settings/view/tabs/VoiceSettingsTab.tsx new file mode 100644 index 00000000..8dcf7585 --- /dev/null +++ b/src/components/settings/view/tabs/VoiceSettingsTab.tsx @@ -0,0 +1,91 @@ +import type { InputHTMLAttributes } from 'react'; +import { useTranslation } from 'react-i18next'; +import SettingsSection from '../SettingsSection'; +import SettingsToggle from '../SettingsToggle'; +import { useUiPreferences } from '../../../../hooks/useUiPreferences'; +import { useVoiceConfig } from '../../../../hooks/useVoiceConfig'; + +const inputClass = + 'w-full rounded-md border border-border bg-background px-3 py-2 text-sm text-foreground placeholder:text-muted-foreground focus:outline-none focus:ring-2 focus:ring-ring'; + +function Field({ label, ...props }: { label: string } & InputHTMLAttributes) { + return ( + + ); +} + +export default function VoiceSettingsTab() { + const { t } = useTranslation('settings'); + const { preferences, setPreference } = useUiPreferences(); + const { config, update } = useVoiceConfig(); + const voiceEnabled = preferences.voiceEnabled; + + return ( +
+ +
+
+
{t('voiceSettings.enable')}
+
{t('voiceSettings.enableDescription')}
+
+ setPreference('voiceEnabled', v)} + ariaLabel={t('voiceSettings.enable')} + /> +
+
+ + {voiceEnabled && ( + +
+ update({ baseUrl: e.target.value })} + /> + update({ apiKey: e.target.value })} + /> +
+ update({ sttModel: e.target.value })} + /> + update({ ttsModel: e.target.value })} + /> + update({ ttsVoice: e.target.value })} + /> + update({ ttsFormat: e.target.value })} + /> +
+

{t('voiceSettings.note')}

+
+
+ )} +
+  );
+}
diff --git a/src/hooks/useUiPreferences.ts b/src/hooks/useUiPreferences.ts
index eb0b8339..342f1698 100644
--- a/src/hooks/useUiPreferences.ts
+++ b/src/hooks/useUiPreferences.ts
@@ -7,6 +7,7 @@ type UiPreferences = {
   autoScrollToBottom: boolean;
   sendByCtrlEnter: boolean;
   sidebarVisible: boolean;
+  voiceEnabled: boolean;
 };
 
 type UiPreferenceKey = keyof UiPreferences;
@@ -39,6 +40,7 @@ const DEFAULTS: UiPreferences = {
   autoScrollToBottom: true,
   sendByCtrlEnter: false,
   sidebarVisible: true,
+  voiceEnabled: false,
 };
 
 const PREFERENCE_KEYS = Object.keys(DEFAULTS) as UiPreferenceKey[];
diff --git a/src/hooks/useVoiceConfig.ts b/src/hooks/useVoiceConfig.ts
new file mode 100644
index 00000000..303b6467
--- /dev/null
+++ b/src/hooks/useVoiceConfig.ts
@@ -0,0 +1,87 @@
+import { useRef, useState } from 'react';
+
+export type VoiceConfig = {
+  baseUrl: string;
+  apiKey: string;
+  sttModel: string;
+  ttsModel: string;
+  ttsVoice: string;
+  ttsFormat: string;
+};
+
+const STORAGE_KEY = 'voiceConfig';
+export const VOICE_CONFIG_SYNC_EVENT = 'voice-config:sync';
+const DEFAULTS: VoiceConfig = { baseUrl: '', apiKey: '', sttModel: '', ttsModel: '', ttsVoice: '', ttsFormat: '' };
+
+/**
+ * Reads the voice config from localStorage. Only string-valued known keys are copied over
+ * DEFAULTS; missing storage, malformed JSON, or a non-object payload yields plain defaults.
+ */
+export function readVoiceConfig(): VoiceConfig {
+  try {
+    const raw = localStorage.getItem(STORAGE_KEY);
+    if (!raw) return { ...DEFAULTS };
+    const parsed: unknown = JSON.parse(raw);
+    if (!parsed || typeof parsed !== 'object' || Array.isArray(parsed)) return { ...DEFAULTS };
+    const stored = parsed as Record<string, unknown>;
+    const config = { ...DEFAULTS };
+    for (const key of Object.keys(DEFAULTS) as (keyof VoiceConfig)[]) {
+      const value = stored[key];
+      if (typeof value === 'string') config[key] = value;
+    }
+    return config;
+  } catch {
+    return { ...DEFAULTS };
+  }
+}
+
+// Headers the voice proxy reads to target a per-user OpenAI-compatible backend.
+// Empty fields are omitted so the server's env defaults apply.
+export function voiceConfigHeaders(): Record<string, string> {
+  if (typeof window === 'undefined') return {};
+  const c = readVoiceConfig();
+  const h: Record<string, string> = {};
+  if (c.apiKey) h['x-voice-api-key'] = c.apiKey;
+  if (c.sttModel) h['x-voice-stt-model'] = c.sttModel;
+  if (c.ttsModel) h['x-voice-tts-model'] = c.ttsModel;
+  if (c.ttsVoice) h['x-voice-tts-voice'] = c.ttsVoice;
+  if (c.ttsFormat.trim()) h['x-voice-tts-format'] = c.ttsFormat.trim();
+  return h;
+}
+
+// Writes the config to localStorage (ttsFormat trimmed, dropped when blank) and then fires
+// VOICE_CONFIG_SYNC_EVENT on window. Best-effort: persistence errors are ignored.
+function persistVoiceConfig(next: VoiceConfig): void {
+  try {
+    const stored: Partial<VoiceConfig> = { ...next };
+    if (next.ttsFormat.trim()) stored.ttsFormat = next.ttsFormat.trim();
+    else delete stored.ttsFormat;
+    localStorage.setItem(STORAGE_KEY, JSON.stringify(stored));
+    window.dispatchEvent(new Event(VOICE_CONFIG_SYNC_EVENT));
+  } catch {
+    /* ignore persistence errors */
+  }
+}
+
+/**
+ * State plus updater for the voice backend settings. update(patch) merges the patch into
+ * the current config, stores it, and broadcasts VOICE_CONFIG_SYNC_EVENT.
+ */
+export function useVoiceConfig() {
+  const [config, setConfig] = useState<VoiceConfig>(() =>
+    typeof window === 'undefined' ? { ...DEFAULTS } : readVoiceConfig(),
+  );
+  // Latest merged config, so consecutive update() calls compose without a state updater.
+  const latestRef = useRef(config);
+
+  const update = (patch: Partial<VoiceConfig>) => {
+    const next = { ...latestRef.current, ...patch };
+    latestRef.current = next;
+    setConfig(next);
+    // Side effects run here, not inside a setState updater: updaters must stay pure and
+    // React may invoke them twice (Strict Mode), which would double-fire the sync event.
+    persistVoiceConfig(next);
+  };
+
+  return { config, update };
+}
diff --git a/src/i18n/locales/en/chat.json b/src/i18n/locales/en/chat.json
index 2c75fad0..656fa328 100644
--- a/src/i18n/locales/en/chat.json
+++ b/src/i18n/locales/en/chat.json
@@ -122,6 +122,14 @@
       }
     }
   },
+  "voice": {
+    "input": "Voice input",
+    "stopRecording": "Stop recording",
+    "transcribing": "Transcribing…",
+    "speak": "Read aloud",
+    "stopSpeaking": "Stop",
+    "loading": "Loading…"
+  },
   "input": {
     "placeholder": "Type / for commands, @ for files, or ask {{provider}} anything...",
     "placeholderDefault": "Type your message...",
diff --git a/src/i18n/locales/en/settings.json b/src/i18n/locales/en/settings.json
index 89d4e651..a04067e2 100644
--- a/src/i18n/locales/en/settings.json
+++ b/src/i18n/locales/en/settings.json
@@ -50,6 +50,21 @@
     "resetToDefaults": "Reset to Defaults",
     "cancelChanges": "Cancel Changes"
   },
+  "voiceSettings": {
+    "title": "Voice",
+    "description": "Speech-to-text
input and read-aloud, via an OpenAI-compatible audio backend.", + "enable": "Enable voice", + "enableDescription": "Show the mic button and the read-aloud button on messages.", + "backendTitle": "Backend", + "backendDescription": "Point at OpenAI, Groq, or a local server (LocalAI, Speaches, Kokoro-FastAPI). Leave blank to use the server default.", + "baseUrl": "Base URL", + "apiKey": "API key", + "sttModel": "Speech-to-text model", + "ttsModel": "Text-to-speech model", + "voice": "Voice", + "format": "Audio format", + "note": "A custom base URL is called directly by your browser and must allow browser CORS requests. Leave it blank to use the server-configured backend." + }, "quickSettings": { "title": "Quick Settings", "sections": { @@ -64,6 +79,7 @@ "showThinking": "Show thinking", "autoScrollToBottom": "Auto-scroll to bottom", "sendByCtrlEnter": "Send by Ctrl+Enter", + "voiceEnabled": "Voice (mic + read aloud)", "sendByCtrlEnterDescription": "When enabled, pressing Ctrl+Enter will send the message instead of just Enter. 
This is useful for IME users to avoid accidental sends.",
     "dragHandle": {
       "dragging": "Dragging handle",
@@ -94,6 +110,7 @@
     "appearance": "Appearance",
     "git": "Git",
     "apiTokens": "API & Tokens",
+    "voice": "Voice",
     "tasks": "Tasks",
     "browser": "Browser",
     "notifications": "Notifications",
diff --git a/src/lib/voiceApi.ts b/src/lib/voiceApi.ts
new file mode 100644
index 00000000..3f9549b4
--- /dev/null
+++ b/src/lib/voiceApi.ts
@@ -0,0 +1,75 @@
+import { authenticatedFetch } from '../utils/api';
+import { readVoiceConfig, voiceConfigHeaders } from '../hooks/useVoiceConfig';
+
+// Joins a user-supplied base URL and an API path, tolerating any number of trailing slashes.
+function directUrl(baseUrl: string, path: string): string {
+  return `${baseUrl.replace(/\/+$/, '')}${path}`;
+}
+
+// JSON-serialized snapshot of the currently stored voice config.
+export function voiceConfigSignature(): string {
+  return JSON.stringify(readVoiceConfig());
+}
+
+/**
+ * Sends a recording for speech-to-text. With a custom base URL the browser posts straight to
+ * `<baseUrl>/audio/transcriptions` (field `file`, model defaults to 'whisper-1'); otherwise
+ * the request goes to the app's `/api/voice/transcribe` endpoint (field `audio`).
+ */
+export function transcribeVoice(blob: Blob, filename: string): Promise<Response> {
+  const config = readVoiceConfig();
+  const baseUrl = config.baseUrl.trim();
+  const body = new FormData();
+
+  if (baseUrl) {
+    body.append('file', blob, filename);
+    body.append('model', config.sttModel || 'whisper-1');
+    return fetch(directUrl(baseUrl, '/audio/transcriptions'), {
+      method: 'POST',
+      headers: config.apiKey ? { Authorization: `Bearer ${config.apiKey}` } : {},
+      body,
+    });
+  }
+
+  body.append('audio', blob, filename);
+  return authenticatedFetch('/api/voice/transcribe', {
+    method: 'POST',
+    headers: voiceConfigHeaders(),
+    body,
+  });
+}
+
+/**
+ * Requests text-to-speech audio for `text`; `signal` aborts the request. Calls
+ * `<baseUrl>/audio/speech` directly when a custom base URL is set (defaults: model 'tts-1',
+ * voice 'alloy'), otherwise the app's `/api/voice/tts` endpoint.
+ */
+export function synthesizeVoice(text: string, signal: AbortSignal): Promise<Response> {
+  const config = readVoiceConfig();
+  const baseUrl = config.baseUrl.trim();
+  const format = config.ttsFormat.trim();
+
+  if (baseUrl) {
+    return fetch(directUrl(baseUrl, '/audio/speech'), {
+      method: 'POST',
+      headers: {
+        'Content-Type': 'application/json',
+        ...(config.apiKey ? { Authorization: `Bearer ${config.apiKey}` } : {}),
+      },
+      body: JSON.stringify({
+        model: config.ttsModel || 'tts-1',
+        voice: config.ttsVoice || 'alloy',
+        input: text,
+        ...(format ? { response_format: format } : {}),
+      }),
+      signal,
+    });
+  }
+
+  return authenticatedFetch('/api/voice/tts', {
+    method: 'POST',
+    body: JSON.stringify({ text }),
+    headers: voiceConfigHeaders(),
+    signal,
+  });
+}
diff --git a/src/lib/voicePlayer.ts b/src/lib/voicePlayer.ts
new file mode 100644
index 00000000..4c239c29
--- /dev/null
+++ b/src/lib/voicePlayer.ts
@@ -0,0 +1,196 @@
+import { synthesizeVoice, voiceConfigSignature } from './voiceApi';
+
+// A single app-level audio player for read-aloud. It owns one