diff --git a/server/index.js b/server/index.js index d957ef58..7db1b122 100755 --- a/server/index.js +++ b/server/index.js @@ -61,6 +61,7 @@ import userRoutes from './routes/user.js'; import geminiRoutes from './routes/gemini.js'; import pluginsRoutes from './routes/plugins.js'; import providerRoutes from './modules/providers/provider.routes.js'; +import voiceRoutes from './voice-proxy.js'; import browserUseRoutes from './modules/browser-use/browser-use.routes.js'; import browserUseMcpRoutes from './modules/browser-use/browser-use-mcp.routes.js'; import { browserUseService } from './modules/browser-use/browser-use.service.js'; @@ -222,6 +223,8 @@ app.use('/api/providers', authenticateToken, providerRoutes); // Agent API Routes (uses API key authentication) app.use('/api/agent', agentRoutes); +app.use('/api/voice', authenticateToken, voiceRoutes); + // Serve public files (like api-docs.html) app.use(express.static(path.join(APP_ROOT, 'public'))); diff --git a/server/voice-proxy.js b/server/voice-proxy.js new file mode 100644 index 00000000..1ea4a6d8 --- /dev/null +++ b/server/voice-proxy.js @@ -0,0 +1,224 @@ +// Optional voice proxy — forwards STT/TTS to an OpenAI-compatible audio backend. +// +// The backend is whatever VOICE_API_BASE_URL points at: OpenAI, Groq, or a local server +// (LocalAI / Speaches / Kokoro-FastAPI / openedai-speech / etc.). It must expose the +// standard OpenAI audio endpoints: +// POST {base}/audio/transcriptions (multipart 'file' + 'model') -> { text } +// POST {base}/audio/speech ({ model, voice, input }) -> audio bytes +// +// API key, models and voice are resolved per-request from headers (set by the client's voice settings), +// falling back to server env defaults; the base URL always comes from server env. Mounted at /api/voice behind authenticateToken. 
+import { Readable } from 'node:stream'; + +import express from 'express'; + +const ENV = { + baseUrl: (process.env.VOICE_API_BASE_URL || '').replace(/\/$/, ''), + apiKey: process.env.VOICE_API_KEY || '', + sttModel: process.env.VOICE_STT_MODEL || 'whisper-1', + ttsModel: process.env.VOICE_TTS_MODEL || 'tts-1', + ttsVoice: process.env.VOICE_TTS_VOICE || 'alloy', +}; + +/** + * Resolve the voice backend config for a request. Client headers (set from the user's in-app + * voice settings) override the server env defaults, except baseUrl, which is server env only. + * @param {import('express').Request} req + * @returns {{baseUrl: string, apiKey: string, sttModel: string, ttsModel: string, ttsVoice: string, ttsFormat: string}} + */ +function resolveConfig(req) { + const h = req.headers; + return { + // Security: do not allow clients to control the outbound backend host. + // Always use the server-side configured base URL. + baseUrl: ENV.baseUrl, + apiKey: String(h['x-voice-api-key'] || '') || ENV.apiKey, + sttModel: String(h['x-voice-stt-model'] || '') || ENV.sttModel, + ttsModel: String(h['x-voice-tts-model'] || '') || ENV.ttsModel, + ttsVoice: String(h['x-voice-tts-voice'] || '') || ENV.ttsVoice, + ttsFormat: String(h['x-voice-tts-format'] || '').trim(), + }; +} + +const router = express.Router(); + +// Generous by default — local TTS can synthesize long messages at ~real-time on CPU. +// Guard against a non-numeric/zero override that would make setTimeout fire immediately. +const DEFAULT_VOICE_TIMEOUT_MS = 300000; +const _parsedTimeout = Number(process.env.VOICE_TIMEOUT_MS); +const VOICE_TIMEOUT_MS = Number.isFinite(_parsedTimeout) && _parsedTimeout > 0 + ? _parsedTimeout + : DEFAULT_VOICE_TIMEOUT_MS; + +/** + * fetch() with an AbortController timeout: aborts when the backend has not responded within + * VOICE_TIMEOUT_MS. The timer is cleared once fetch() settles, so body reads are not timed. 
+ * @param {string} url + * @param {RequestInit} [options] + * @returns {Promise<Response>} + */ +async function fetchWithTimeout(url, options = {}) { + const parsed = new URL(url); + if (!['http:', 'https:'].includes(parsed.protocol) || !isAllowedBackendUrl(parsed.origin)) { + throw new Error('Blocked outbound voice backend URL'); + } + const controller = new AbortController(); + const timer = setTimeout(() => controller.abort(), VOICE_TIMEOUT_MS); + try { + return await fetch(parsed.toString(), { redirect: 'manual', ...options, signal: controller.signal }); + } finally { + clearTimeout(timer); + } +} + +/** + * Turn a backend fetch failure into a clear, actionable client response: + * 504 on timeout (AbortError), 502 otherwise. + * @param {import('express').Response} res + * @param {Error} e + */ +function backendError(res, e) { + if (e && e.name === 'AbortError') { + return res.status(504).json({ + error: `Voice backend timed out after ${Math.round(VOICE_TIMEOUT_MS / 1000)}s. Check your voice backend.`, + }); + } + return res.status(502).json({ error: `Voice backend unreachable: ${e.message}` }); +} + +/** + * SSRF guard for the configured backend URL: allow http/https only and block IPv4 literals in + * the link-local / cloud-metadata range (169.254.x); hostnames are not resolved. localhost and + * private ranges are allowed on purpose so users can point at a local voice server + * (LocalAI, Speaches, Kokoro-FastAPI, etc.). + * @param {string} raw + * @returns {boolean} + */ +function isAllowedBackendUrl(raw) { + let u; + try { + u = new URL(raw); + } catch { + return false; + } + if (u.protocol !== 'http:' && u.protocol !== 'https:') return false; + if (u.hostname === '169.254.169.254' || u.hostname.startsWith('169.254.')) return false; + return true; +} + +/** + * Relay an upstream (backend) error to the client without making an upstream + * 401/403 look like the user's own app login failed. 
+ * @param {import('express').Response} res + * @param {number} status + * @param {string} [text] + */ +function upstreamError(res, status, text) { + if (status === 401 || status === 403) { + return res.status(502).json({ error: 'Voice backend rejected the request (check the API key).' }); + } + return res.status(status).json({ error: text || 'voice backend error' }); +} + +let _upload = null; +/** + * Lazily build a memory-storage multer instance (25 MB cap) for audio uploads, + * so multer is only imported when the voice feature is actually used. + * @returns {Promise} + */ +async function getUpload() { + if (!_upload) { + const multer = (await import('multer')).default; + _upload = multer({ storage: multer.memoryStorage(), limits: { fileSize: 25 * 1024 * 1024 } }); + } + return _upload; +} + +/** + * Build the Authorization header for the backend, or an empty object when no + * key is configured (e.g. a local server that needs none). + * @param {string} apiKey + * @returns {Record} + */ +function authHeader(apiKey) { + return apiKey ? { Authorization: `Bearer ${apiKey}` } : {}; +} + +/** + * GET /api/voice/health -> { configured } (true when a backend base URL is set). + */ +router.get('/health', (req, res) => { + res.json({ configured: Boolean(resolveConfig(req).baseUrl) }); +}); + +/** + * POST /api/voice/transcribe (multipart 'audio') -> { text }. + * Forwards the uploaded audio to the backend's /audio/transcriptions endpoint. + */ +router.post('/transcribe', async (req, res) => { + const cfg = resolveConfig(req); + if (!cfg.baseUrl) return res.status(503).json({ error: 'No voice backend configured' }); + if (!isAllowedBackendUrl(cfg.baseUrl)) return res.status(400).json({ error: 'Invalid voice backend URL.' 
}); + const upload = await getUpload(); + upload.single('audio')(req, res, async (err) => { + if (err) return res.status(400).json({ error: err.message }); + if (!req.file) return res.status(400).json({ error: 'No audio uploaded' }); + try { + const fd = new FormData(); + fd.append( + 'file', + new Blob([req.file.buffer], { type: req.file.mimetype || 'audio/webm' }), + req.file.originalname || 'recording.webm', + ); + fd.append('model', cfg.sttModel); + const r = await fetchWithTimeout(`${cfg.baseUrl}/audio/transcriptions`, { + method: 'POST', + headers: authHeader(cfg.apiKey), + body: fd, + }); + const text = await r.text(); + if (!r.ok) return upstreamError(res, r.status, text); + let data; + try { data = JSON.parse(text); } catch { data = { text }; } + res.json({ text: data.text ?? '' }); + } catch (e) { + backendError(res, e); + } + }); +}); + +/** + * POST /api/voice/tts { text } -> audio bytes. + * Forwards the text to the backend's /audio/speech endpoint and streams the audio back. + */ +router.post('/tts', async (req, res) => { + const cfg = resolveConfig(req); + if (!cfg.baseUrl) return res.status(503).json({ error: 'No voice backend configured' }); + if (!isAllowedBackendUrl(cfg.baseUrl)) return res.status(400).json({ error: 'Invalid voice backend URL.' }); + const text = req.body?.text; + if (typeof text !== 'string' || !text.trim()) return res.status(400).json({ error: 'text required' }); + try { + const r = await fetchWithTimeout(`${cfg.baseUrl}/audio/speech`, { + method: 'POST', + headers: { 'Content-Type': 'application/json', ...authHeader(cfg.apiKey) }, + body: JSON.stringify({ + model: cfg.ttsModel, + voice: cfg.ttsVoice, + input: text, + ...(cfg.ttsFormat ? 
{ response_format: cfg.ttsFormat } : {}), + }), + }); + if (!r.ok) { + const errText = await r.text().catch(() => 'tts failed'); + return upstreamError(res, r.status, errText); + } + res.setHeader('Content-Type', r.headers.get('content-type') || 'audio/mpeg'); + res.setHeader('Cache-Control', 'no-store'); + if (!r.body) return res.end(); + Readable.fromWeb(r.body).on('error', (error) => res.destroy(error)).pipe(res); + } catch (e) { + backendError(res, e); + } +}); + +export default router; diff --git a/src/components/chat/hooks/useChatComposerState.ts b/src/components/chat/hooks/useChatComposerState.ts index c1f86f2d..e3e65b77 100644 --- a/src/components/chat/hooks/useChatComposerState.ts +++ b/src/components/chat/hooks/useChatComposerState.ts @@ -775,6 +775,17 @@ export function useChatComposerState({ handleSubmitRef.current = handleSubmit; }, [handleSubmit]); + // A voice transcript either fills the input (to edit before sending) or, when the + // user tapped "stop and send", is submitted straight away. Mirror the value into + // inputValueRef synchronously so handleSubmit reads the new text, not the stale state. + const handleVoiceTranscript = useCallback((text: string, send?: boolean) => { + const base = inputValueRef.current.trim(); + const next = base ? 
`${base} ${text}` : text; + setInput(next); + inputValueRef.current = next; + if (send) handleSubmitRef.current?.(createFakeSubmitEvent()); + }, [setInput]); + useEffect(() => { inputValueRef.current = input; }, [input]); @@ -1013,6 +1024,7 @@ export function useChatComposerState({ isDragActive, openImagePicker: open, handleSubmit, + handleVoiceTranscript, handleInputChange, handleKeyDown, handlePaste, diff --git a/src/components/chat/hooks/useTts.ts b/src/components/chat/hooks/useTts.ts new file mode 100644 index 00000000..fc4a6c33 --- /dev/null +++ b/src/components/chat/hooks/useTts.ts @@ -0,0 +1,33 @@ +import { useCallback, useEffect, useState } from 'react'; +import { voicePlayer, voiceId, type VoiceSnapshot } from '../../../lib/voicePlayer'; + +export type TtsState = VoiceSnapshot['state']; + +/** + * Thin adapter over the app-level voicePlayer. Playback lives outside React (see + * lib/voicePlayer), so switching chats or re-rendering a message no longer cuts the + * audio off. This hook just reflects the player's state for one message and forwards taps. + */ +export function useTts(getText: () => string) { + const content = getText(); + const id = voiceId(content); + + const [snap, setSnap] = useState(() => voicePlayer.getSnapshot(id)); + + useEffect(() => { + const update = () => + setSnap((prev) => { + const next = voicePlayer.getSnapshot(id); + return prev.state === next.state && prev.error === next.error ? 
prev : next; + }); + update(); + return voicePlayer.subscribe(update); + }, [id]); + + const toggle = useCallback(() => { + voicePlayer.unlock(); // synchronous, within the click gesture (iOS) + voicePlayer.toggle(content); + }, [content]); + + return { state: snap.state, toggle, error: snap.error }; +} diff --git a/src/components/chat/hooks/useVoiceAvailable.ts b/src/components/chat/hooks/useVoiceAvailable.ts new file mode 100644 index 00000000..9ee92c48 --- /dev/null +++ b/src/components/chat/hooks/useVoiceAvailable.ts @@ -0,0 +1,85 @@ +import { useEffect, useState } from 'react'; + +import { authenticatedFetch } from '../../../utils/api'; +import { readVoiceConfig, VOICE_CONFIG_SYNC_EVENT } from '../../../hooks/useVoiceConfig'; + +// Voice UI is gated on the `voiceEnabled` UI preference (toggled in Quick Settings / +// the Settings modal) and a configured voice backend. +const STORAGE_KEY = 'uiPreferences'; +const SYNC_EVENT = 'ui-preferences:sync'; +let healthRequest: Promise | null = null; + +function checkVoiceHealth(): Promise { + if (healthRequest) return healthRequest; + const request = authenticatedFetch('/api/voice/health') + .then(async (response) => { + if (!response.ok) throw new Error(`Voice health check failed (${response.status})`); + const data = await response.json(); + return data?.configured === true; + }) + .finally(() => { + healthRequest = null; + }); + healthRequest = request; + return request; +} + +function readVoiceEnabled(): boolean { + try { + const raw = localStorage.getItem(STORAGE_KEY); + if (!raw) return false; + const parsed = JSON.parse(raw); + return parsed?.voiceEnabled === true || parsed?.voiceEnabled === 'true'; + } catch { + return false; + } +} + +export function useVoiceAvailable(): boolean { + const [enabled, setEnabled] = useState(() => + typeof window === 'undefined' ? 
false : readVoiceEnabled(), + ); + const [available, setAvailable] = useState(false); + + useEffect(() => { + const update = () => setEnabled(readVoiceEnabled()); + window.addEventListener('storage', update); + window.addEventListener(SYNC_EVENT, update as EventListener); + return () => { + window.removeEventListener('storage', update); + window.removeEventListener(SYNC_EVENT, update as EventListener); + }; + }, []); + + useEffect(() => { + let active = true; + let requestId = 0; + + const check = async () => { + if (!enabled) { + setAvailable(false); + return; + } + if (readVoiceConfig().baseUrl.trim()) { + setAvailable(true); + return; + } + const id = ++requestId; + try { + const result = await checkVoiceHealth(); + if (active && id === requestId) setAvailable(result); + } catch { + if (active && id === requestId) setAvailable(false); + } + }; + + void check(); + window.addEventListener(VOICE_CONFIG_SYNC_EVENT, check); + return () => { + active = false; + window.removeEventListener(VOICE_CONFIG_SYNC_EVENT, check); + }; + }, [enabled]); + + return enabled && available; +} diff --git a/src/components/chat/hooks/useVoiceInput.ts b/src/components/chat/hooks/useVoiceInput.ts new file mode 100644 index 00000000..6fcadd56 --- /dev/null +++ b/src/components/chat/hooks/useVoiceInput.ts @@ -0,0 +1,149 @@ +import { useCallback, useEffect, useRef, useState } from 'react'; + +import { transcribeVoice } from '../../../lib/voiceApi'; + +// Mobile-safe recording: iOS Safari 18.4+ supports webm/opus; older iOS needs mp4. 
+const MIME_CANDIDATES = [ + 'audio/webm;codecs=opus', + 'audio/webm', + 'audio/mp4', + 'audio/ogg;codecs=opus', + 'audio/ogg', +]; + +function pickMime(): string { + for (const t of MIME_CANDIDATES) { + try { + if (typeof MediaRecorder !== 'undefined' && MediaRecorder.isTypeSupported(t)) return t; + } catch { + /* isTypeSupported can throw on some iOS versions */ + } + } + return ''; +} + +export type VoiceInputState = 'idle' | 'recording' | 'transcribing'; + +/** + * Push-to-talk dictation. Records the mic, uploads to /api/voice/transcribe + * (an OpenAI-compatible speech-to-text backend via the Express proxy), and + * returns the transcript through onTranscript. + */ +export function useVoiceInput( + onTranscript: (text: string, send?: boolean) => void, + onError?: (msg: string) => void, +) { + const [state, setState] = useState('idle'); + const recorderRef = useRef(null); + const chunksRef = useRef([]); + const streamRef = useRef(null); + const cancelledRef = useRef(false); + const startingRef = useRef(false); + // Whether the in-progress stop should auto-send the transcript (vs just fill the box). + const sendRef = useRef(false); + + const stopTracks = () => { + streamRef.current?.getTracks().forEach((t) => t.stop()); + streamRef.current = null; + }; + + // Stop the mic if the component unmounts mid-recording. 
+ useEffect(() => { + cancelledRef.current = false; + return () => { + cancelledRef.current = true; + startingRef.current = false; + streamRef.current?.getTracks().forEach((t) => t.stop()); + streamRef.current = null; + recorderRef.current = null; + }; + }, []); + + const start = useCallback(async () => { + if (startingRef.current || (recorderRef.current && recorderRef.current.state !== 'inactive')) return; + startingRef.current = true; + try { + const stream = await navigator.mediaDevices.getUserMedia({ + audio: { echoCancellation: true, noiseSuppression: true }, + }); + if (cancelledRef.current) { + stream.getTracks().forEach((t) => t.stop()); + return; + } + streamRef.current = stream; + const mimeType = pickMime(); + const rec = mimeType ? new MediaRecorder(stream, { mimeType }) : new MediaRecorder(stream); + recorderRef.current = rec; + chunksRef.current = []; + + rec.ondataavailable = (e) => { + if (e.data.size > 0) chunksRef.current.push(e.data); + }; + + rec.onstop = async () => { + stopTracks(); + if (cancelledRef.current) return; + // Capture and clear the send intent for this stop before any async work. + const shouldSend = sendRef.current; + sendRef.current = false; + const type = rec.mimeType || 'audio/webm'; + const blob = new Blob(chunksRef.current, { type }); + if (blob.size < 800) { + setState('idle'); + onError?.('Recording too short'); + return; + } + setState('transcribing'); + try { + const ext = type.includes('mp4') ? 'm4a' : type.includes('ogg') ? 'ogg' : 'webm'; + const res = await transcribeVoice(blob, `recording.${ext}`); + if (!res.ok) throw new Error(`transcribe ${res.status}`); + const data = await res.json(); + if (cancelledRef.current) return; + const text = String(data?.text || '').trim(); + if (text) onTranscript(text, shouldSend); + else onError?.('No speech detected'); + } catch (e) { + if (!cancelledRef.current) { + onError?.(`Transcription failed: ${e instanceof Error ? 
e.message : String(e)}`); + } + } finally { + if (!cancelledRef.current) setState('idle'); + } + }; + + rec.start(); + setState('recording'); + } catch (e) { + recorderRef.current = null; + stopTracks(); + if (cancelledRef.current) return; + const err = e as { name?: string; message?: string }; + let msg = `Mic error: ${err?.message || e}`; + if (err?.name === 'NotAllowedError') msg = 'Microphone access denied.'; + else if (err?.name === 'NotFoundError') msg = 'No microphone found.'; + onError?.(msg); + setState('idle'); + } finally { + startingRef.current = false; + } + }, [onTranscript, onError]); + + // Stop recording. Pass { send: true } to auto-send the transcript once it's ready. + // Guard on the recorder's own state (not React state) so a double tap, or the mic + // and Send buttons both firing, can't call stop() on an already-inactive recorder. + const stop = useCallback((opts?: { send?: boolean }) => { + const rec = recorderRef.current; + if (rec && rec.state !== 'inactive') { + sendRef.current = opts?.send ?? 
false; + rec.stop(); + } + }, []); + + const toggle = useCallback(() => { + if (state === 'recording') stop(); + else if (state === 'idle') start(); + }, [state, start, stop]); + + return { state, toggle, stop }; +} diff --git a/src/components/chat/view/ChatInterface.tsx b/src/components/chat/view/ChatInterface.tsx index 15786a41..a83dfbdc 100644 --- a/src/components/chat/view/ChatInterface.tsx +++ b/src/components/chat/view/ChatInterface.tsx @@ -173,6 +173,7 @@ function ChatInterface({ isDragActive, openImagePicker, handleSubmit, + handleVoiceTranscript, handleInputChange, handleKeyDown, handlePaste, @@ -406,6 +407,7 @@ function ChatInterface({ renderInputWithMentions={renderInputWithMentions} textareaRef={textareaRef} input={input} + onVoiceTranscript={handleVoiceTranscript} onInputChange={handleInputChange} onTextareaClick={handleTextareaClick} onTextareaKeyDown={handleKeyDown} diff --git a/src/components/chat/view/subcomponents/ChatComposer.tsx b/src/components/chat/view/subcomponents/ChatComposer.tsx index c60aa893..d679df11 100644 --- a/src/components/chat/view/subcomponents/ChatComposer.tsx +++ b/src/components/chat/view/subcomponents/ChatComposer.tsx @@ -1,4 +1,5 @@ import { useTranslation } from 'react-i18next'; +import { useCallback, useEffect, useRef, useState } from 'react'; import type { ChangeEvent, ClipboardEvent, @@ -9,8 +10,10 @@ import type { RefObject, TouchEvent, } from 'react'; -import { ImageIcon, MessageSquareIcon, XIcon, ArrowDownIcon } from 'lucide-react'; +import { ImageIcon, MessageSquareIcon, XIcon, ArrowDownIcon, Loader2 } from 'lucide-react'; +import { useVoiceInput } from '../../hooks/useVoiceInput'; +import { useVoiceAvailable } from '../../hooks/useVoiceAvailable'; import type { SessionActivity } from '../../../../hooks/useSessionProtection'; import type { PendingPermissionRequest, PermissionMode } from '../../types/types'; import { @@ -27,6 +30,7 @@ import { import CommandMenu from './CommandMenu'; import ActivityIndicator from 
'./ActivityIndicator'; import ImageAttachment from './ImageAttachment'; +import VoiceInputButton from './VoiceInputButton'; import PermissionRequestsBanner from './PermissionRequestsBanner'; import TokenUsageSummary from './TokenUsageSummary'; @@ -89,6 +93,7 @@ interface ChatComposerProps { renderInputWithMentions: (text: string) => ReactNode; textareaRef: RefObject; input: string; + onVoiceTranscript?: (text: string, send?: boolean) => void; onInputChange: (event: ChangeEvent) => void; onTextareaClick: (event: MouseEvent) => void; onTextareaKeyDown: (event: KeyboardEvent) => void; @@ -142,6 +147,7 @@ export default function ChatComposer({ renderInputWithMentions, textareaRef, input, + onVoiceTranscript, onInputChange, onTextareaClick, onTextareaKeyDown, @@ -154,6 +160,28 @@ export default function ChatComposer({ sendByCtrlEnter, }: ChatComposerProps) { const { t } = useTranslation('chat'); + + // Voice state is hosted here (not in the mic button) so the main Send button can stop + // recording and send the transcript in one tap, the way the mic button drops it in the box. + const voiceAvailable = useVoiceAvailable(); + const [voiceError, setVoiceError] = useState(null); + const voiceErrorTimer = useRef | null>(null); + const handleVoiceError = useCallback((msg: string) => { + setVoiceError(msg); + if (voiceErrorTimer.current) clearTimeout(voiceErrorTimer.current); + voiceErrorTimer.current = setTimeout(() => setVoiceError(null), 4000); + }, []); + useEffect(() => () => { + if (voiceErrorTimer.current) clearTimeout(voiceErrorTimer.current); + }, []); + const noopTranscript = useCallback(() => {}, []); + const { state: voiceState, toggle: voiceToggle, stop: voiceStop } = useVoiceInput( + onVoiceTranscript ?? 
noopTranscript, + handleVoiceError, + ); + const isRecording = voiceState === 'recording'; + const isTranscribing = voiceState === 'transcribing'; + const textareaRect = textareaRef.current?.getBoundingClientRect(); const commandMenuPosition = { top: textareaRect ? Math.max(16, textareaRect.top - 316) : 0, @@ -309,6 +337,10 @@ export default function ChatComposer({ + {onVoiceTranscript && voiceAvailable && ( + + )} + + + ); +}; + +export default MessageSpeakControl; diff --git a/src/components/chat/view/subcomponents/VoiceInputButton.tsx b/src/components/chat/view/subcomponents/VoiceInputButton.tsx new file mode 100644 index 00000000..249afacd --- /dev/null +++ b/src/components/chat/view/subcomponents/VoiceInputButton.tsx @@ -0,0 +1,46 @@ +import { useTranslation } from 'react-i18next'; +import { Mic, Square, Loader2 } from 'lucide-react'; + +import { PromptInputButton } from '../../../../shared/view/ui'; +import type { VoiceInputState } from '../../hooks/useVoiceInput'; + +type Props = { + state: VoiceInputState; + onToggle: () => void; + errorMsg?: string | null; +}; + +// Push-to-talk mic button (presentational). Recording state and the stop-and-send action +// are owned by the composer so the main Send button can drive them too. This button just +// starts recording and, while recording, stops and drops the transcript into the input box. +export default function VoiceInputButton({ state, onToggle, errorMsg }: Props) { + const { t } = useTranslation('chat'); + + const icon = + state === 'recording' ? ( + + ) : state === 'transcribing' ? 
( + + ) : ( + + ); + + return ( + + {errorMsg && ( + + {errorMsg} + + )} + void }) => { + e.preventDefault(); + onToggle(); + }} + > + {icon} + + + ); +} diff --git a/src/components/quick-settings-panel/constants.ts b/src/components/quick-settings-panel/constants.ts index 15c15458..408a64c7 100644 --- a/src/components/quick-settings-panel/constants.ts +++ b/src/components/quick-settings-panel/constants.ts @@ -4,6 +4,7 @@ import { Eye, Languages, Maximize2, + Mic, } from 'lucide-react'; import type { PreferenceToggleItem } from './types'; @@ -54,4 +55,9 @@ export const INPUT_SETTING_TOGGLES: PreferenceToggleItem[] = [ labelKey: 'quickSettings.sendByCtrlEnter', icon: Languages, }, + { + key: 'voiceEnabled', + labelKey: 'quickSettings.voiceEnabled', + icon: Mic, + }, ]; diff --git a/src/components/quick-settings-panel/types.ts b/src/components/quick-settings-panel/types.ts index 16002694..8d4f0826 100644 --- a/src/components/quick-settings-panel/types.ts +++ b/src/components/quick-settings-panel/types.ts @@ -6,7 +6,8 @@ export type PreferenceToggleKey = | 'showRawParameters' | 'showThinking' | 'autoScrollToBottom' - | 'sendByCtrlEnter'; + | 'sendByCtrlEnter' + | 'voiceEnabled'; export type QuickSettingsPreferences = Record; diff --git a/src/components/quick-settings-panel/view/QuickSettingsContent.tsx b/src/components/quick-settings-panel/view/QuickSettingsContent.tsx index 8d805fe9..dc539621 100644 --- a/src/components/quick-settings-panel/view/QuickSettingsContent.tsx +++ b/src/components/quick-settings-panel/view/QuickSettingsContent.tsx @@ -28,6 +28,9 @@ export default function QuickSettingsContent({ onPreferenceChange, }: QuickSettingsContentProps) { const { t } = useTranslation('settings'); + const inputSettingToggles = preferences.voiceEnabled + ? 
INPUT_SETTING_TOGGLES + : INPUT_SETTING_TOGGLES.filter(({ key }) => key !== 'voiceEnabled'); const renderToggleRows = (items: PreferenceToggleItem[]) => ( items.map(({ key, labelKey, icon }) => ( @@ -67,7 +70,7 @@ export default function QuickSettingsContent({ - {renderToggleRows(INPUT_SETTING_TOGGLES)} + {renderToggleRows(inputSettingToggles)}

{t('quickSettings.sendByCtrlEnterDescription')}

diff --git a/src/components/quick-settings-panel/view/QuickSettingsPanelView.tsx b/src/components/quick-settings-panel/view/QuickSettingsPanelView.tsx index 0de1bbc7..5f630a61 100644 --- a/src/components/quick-settings-panel/view/QuickSettingsPanelView.tsx +++ b/src/components/quick-settings-panel/view/QuickSettingsPanelView.tsx @@ -27,12 +27,14 @@ export default function QuickSettingsPanelView() { showThinking: preferences.showThinking, autoScrollToBottom: preferences.autoScrollToBottom, sendByCtrlEnter: preferences.sendByCtrlEnter, + voiceEnabled: preferences.voiceEnabled, }), [ preferences.autoExpandTools, preferences.autoScrollToBottom, preferences.sendByCtrlEnter, preferences.showRawParameters, preferences.showThinking, + preferences.voiceEnabled, ]); const handlePreferenceChange = useCallback( diff --git a/src/components/settings/types/types.ts b/src/components/settings/types/types.ts index 74c3d309..5efac8d3 100644 --- a/src/components/settings/types/types.ts +++ b/src/components/settings/types/types.ts @@ -3,7 +3,7 @@ import type { Dispatch, SetStateAction } from 'react'; import type { LLMProvider } from '../../../types/app'; import type { ProviderAuthStatus } from '../../provider-auth/types'; -export type SettingsMainTab = 'agents' | 'appearance' | 'git' | 'api' | 'tasks' | 'browser' | 'notifications' | 'plugins' | 'about'; +export type SettingsMainTab = 'agents' | 'appearance' | 'git' | 'api' | 'voice' | 'tasks' | 'browser' | 'notifications' | 'plugins' | 'about'; export type AgentProvider = LLMProvider; export type AgentCategory = 'account' | 'permissions' | 'mcp' | 'skills'; export type ProjectSortOrder = 'name' | 'date'; diff --git a/src/components/settings/view/Settings.tsx b/src/components/settings/view/Settings.tsx index bfa98edf..2d434d04 100644 --- a/src/components/settings/view/Settings.tsx +++ b/src/components/settings/view/Settings.tsx @@ -7,6 +7,7 @@ import SettingsSidebar from '../view/SettingsSidebar'; import AgentsSettingsTab from 
'../view/tabs/agents-settings/AgentsSettingsTab'; import AppearanceSettingsTab from '../view/tabs/AppearanceSettingsTab'; import CredentialsSettingsTab from '../view/tabs/api-settings/CredentialsSettingsTab'; +import VoiceSettingsTab from '../view/tabs/VoiceSettingsTab'; import GitSettingsTab from '../view/tabs/git-settings/GitSettingsTab'; import BrowserUseSettingsTab from '../view/tabs/browser-use-settings/BrowserUseSettingsTab'; import NotificationsSettingsTab from '../view/tabs/NotificationsSettingsTab'; @@ -157,6 +158,8 @@ function Settings({ isOpen, onClose, projects = [], initialTab = 'agents' }: Set {activeTab === 'api' && } + {activeTab === 'voice' && } + {activeTab === 'plugins' && } {activeTab === 'about' && } diff --git a/src/components/settings/view/SettingsSidebar.tsx b/src/components/settings/view/SettingsSidebar.tsx index dde32a9e..3b76976e 100644 --- a/src/components/settings/view/SettingsSidebar.tsx +++ b/src/components/settings/view/SettingsSidebar.tsx @@ -1,5 +1,6 @@ -import { Bell, Bot, GitBranch, Info, Key, ListChecks, MonitorPlay, Palette, Puzzle } from 'lucide-react'; +import { Bell, Bot, GitBranch, Info, Key, ListChecks, Mic, MonitorPlay, Palette, Puzzle } from 'lucide-react'; import { useTranslation } from 'react-i18next'; + import { cn } from '../../../lib/utils'; import { PillBar, Pill } from '../../../shared/view/ui'; import type { SettingsMainTab } from '../types/types'; @@ -20,6 +21,7 @@ const NAV_ITEMS: NavItem[] = [ { id: 'appearance', labelKey: 'mainTabs.appearance', icon: Palette }, { id: 'git', labelKey: 'mainTabs.git', icon: GitBranch }, { id: 'api', labelKey: 'mainTabs.apiTokens', icon: Key }, + { id: 'voice', labelKey: 'mainTabs.voice', icon: Mic }, { id: 'tasks', labelKey: 'mainTabs.tasks', icon: ListChecks }, { id: 'browser', labelKey: 'mainTabs.browser', icon: MonitorPlay }, { id: 'plugins', labelKey: 'mainTabs.plugins', icon: Puzzle }, diff --git a/src/components/settings/view/tabs/VoiceSettingsTab.tsx 
b/src/components/settings/view/tabs/VoiceSettingsTab.tsx new file mode 100644 index 00000000..8dcf7585 --- /dev/null +++ b/src/components/settings/view/tabs/VoiceSettingsTab.tsx @@ -0,0 +1,91 @@ +import type { InputHTMLAttributes } from 'react'; +import { useTranslation } from 'react-i18next'; +import SettingsSection from '../SettingsSection'; +import SettingsToggle from '../SettingsToggle'; +import { useUiPreferences } from '../../../../hooks/useUiPreferences'; +import { useVoiceConfig } from '../../../../hooks/useVoiceConfig'; + +const inputClass = + 'w-full rounded-md border border-border bg-background px-3 py-2 text-sm text-foreground placeholder:text-muted-foreground focus:outline-none focus:ring-2 focus:ring-ring'; + +function Field({ label, ...props }: { label: string } & InputHTMLAttributes) { + return ( + + ); +} + +export default function VoiceSettingsTab() { + const { t } = useTranslation('settings'); + const { preferences, setPreference } = useUiPreferences(); + const { config, update } = useVoiceConfig(); + const voiceEnabled = preferences.voiceEnabled; + + return ( +
+ +
+
+
{t('voiceSettings.enable')}
+
{t('voiceSettings.enableDescription')}
+
+ setPreference('voiceEnabled', v)} + ariaLabel={t('voiceSettings.enable')} + /> +
+
+ + {voiceEnabled && ( + +
+ update({ baseUrl: e.target.value })} + /> + update({ apiKey: e.target.value })} + /> +
+ update({ sttModel: e.target.value })} + /> + update({ ttsModel: e.target.value })} + /> + update({ ttsVoice: e.target.value })} + /> + update({ ttsFormat: e.target.value })} + /> +
+

{t('voiceSettings.note')}

+
+
+ )} +
+ ); +} diff --git a/src/hooks/useUiPreferences.ts b/src/hooks/useUiPreferences.ts index eb0b8339..342f1698 100644 --- a/src/hooks/useUiPreferences.ts +++ b/src/hooks/useUiPreferences.ts @@ -7,6 +7,7 @@ type UiPreferences = { autoScrollToBottom: boolean; sendByCtrlEnter: boolean; sidebarVisible: boolean; + voiceEnabled: boolean; }; type UiPreferenceKey = keyof UiPreferences; @@ -39,6 +40,7 @@ const DEFAULTS: UiPreferences = { autoScrollToBottom: true, sendByCtrlEnter: false, sidebarVisible: true, + voiceEnabled: false, }; const PREFERENCE_KEYS = Object.keys(DEFAULTS) as UiPreferenceKey[]; diff --git a/src/hooks/useVoiceConfig.ts b/src/hooks/useVoiceConfig.ts new file mode 100644 index 00000000..303b6467 --- /dev/null +++ b/src/hooks/useVoiceConfig.ts @@ -0,0 +1,68 @@ +import { useState } from 'react'; + +export type VoiceConfig = { + baseUrl: string; + apiKey: string; + sttModel: string; + ttsModel: string; + ttsVoice: string; + ttsFormat: string; +}; + +const STORAGE_KEY = 'voiceConfig'; +export const VOICE_CONFIG_SYNC_EVENT = 'voice-config:sync'; +const DEFAULTS: VoiceConfig = { baseUrl: '', apiKey: '', sttModel: '', ttsModel: '', ttsVoice: '', ttsFormat: '' }; + +export function readVoiceConfig(): VoiceConfig { + try { + const raw = localStorage.getItem(STORAGE_KEY); + if (!raw) return { ...DEFAULTS }; + const parsed = JSON.parse(raw); + if (!parsed || typeof parsed !== 'object' || Array.isArray(parsed)) return { ...DEFAULTS }; + const config = { ...DEFAULTS }; + for (const key of Object.keys(DEFAULTS) as (keyof VoiceConfig)[]) { + if (typeof parsed[key] === 'string') config[key] = parsed[key]; + } + return config; + } catch { + return { ...DEFAULTS }; + } +} + +// Headers the voice proxy reads to target a per-user OpenAI-compatible backend. +// Empty fields are omitted so the server's env defaults apply. 
+export function voiceConfigHeaders(): Record { + if (typeof window === 'undefined') return {}; + const c = readVoiceConfig(); + const h: Record = {}; + if (c.apiKey) h['x-voice-api-key'] = c.apiKey; + if (c.sttModel) h['x-voice-stt-model'] = c.sttModel; + if (c.ttsModel) h['x-voice-tts-model'] = c.ttsModel; + if (c.ttsVoice) h['x-voice-tts-voice'] = c.ttsVoice; + if (c.ttsFormat.trim()) h['x-voice-tts-format'] = c.ttsFormat.trim(); + return h; +} + +export function useVoiceConfig() { + const [config, setConfig] = useState(() => + typeof window === 'undefined' ? { ...DEFAULTS } : readVoiceConfig(), + ); + + const update = (patch: Partial) => { + setConfig((prev) => { + const next = { ...prev, ...patch }; + try { + const stored: Partial = { ...next }; + if (next.ttsFormat.trim()) stored.ttsFormat = next.ttsFormat.trim(); + else delete stored.ttsFormat; + localStorage.setItem(STORAGE_KEY, JSON.stringify(stored)); + window.dispatchEvent(new Event(VOICE_CONFIG_SYNC_EVENT)); + } catch { + /* ignore persistence errors */ + } + return next; + }); + }; + + return { config, update }; +} diff --git a/src/i18n/locales/en/chat.json b/src/i18n/locales/en/chat.json index 2c75fad0..656fa328 100644 --- a/src/i18n/locales/en/chat.json +++ b/src/i18n/locales/en/chat.json @@ -122,6 +122,14 @@ } } }, + "voice": { + "input": "Voice input", + "stopRecording": "Stop recording", + "transcribing": "Transcribing…", + "speak": "Read aloud", + "stopSpeaking": "Stop", + "loading": "Loading…" + }, "input": { "placeholder": "Type / for commands, @ for files, or ask {{provider}} anything...", "placeholderDefault": "Type your message...", diff --git a/src/i18n/locales/en/settings.json b/src/i18n/locales/en/settings.json index 89d4e651..a04067e2 100644 --- a/src/i18n/locales/en/settings.json +++ b/src/i18n/locales/en/settings.json @@ -50,6 +50,21 @@ "resetToDefaults": "Reset to Defaults", "cancelChanges": "Cancel Changes" }, + "voiceSettings": { + "title": "Voice", + "description": "Speech-to-text 
input and read-aloud, via an OpenAI-compatible audio backend.", + "enable": "Enable voice", + "enableDescription": "Show the mic button and the read-aloud button on messages.", + "backendTitle": "Backend", + "backendDescription": "Point at OpenAI, Groq, or a local server (LocalAI, Speaches, Kokoro-FastAPI). Leave blank to use the server default.", + "baseUrl": "Base URL", + "apiKey": "API key", + "sttModel": "Speech-to-text model", + "ttsModel": "Text-to-speech model", + "voice": "Voice", + "format": "Audio format", + "note": "A custom base URL is called directly by your browser and must allow browser CORS requests. Leave it blank to use the server-configured backend." + }, "quickSettings": { "title": "Quick Settings", "sections": { @@ -64,6 +79,7 @@ "showThinking": "Show thinking", "autoScrollToBottom": "Auto-scroll to bottom", "sendByCtrlEnter": "Send by Ctrl+Enter", + "voiceEnabled": "Voice (mic + read aloud)", "sendByCtrlEnterDescription": "When enabled, pressing Ctrl+Enter will send the message instead of just Enter. 
This is useful for IME users to avoid accidental sends.", "dragHandle": { "dragging": "Dragging handle", @@ -94,6 +110,7 @@ "appearance": "Appearance", "git": "Git", "apiTokens": "API & Tokens", + "voice": "Voice", "tasks": "Tasks", "browser": "Browser", "notifications": "Notifications", diff --git a/src/lib/voiceApi.ts b/src/lib/voiceApi.ts new file mode 100644 index 00000000..3f9549b4 --- /dev/null +++ b/src/lib/voiceApi.ts @@ -0,0 +1,60 @@ +import { authenticatedFetch } from '../utils/api'; +import { readVoiceConfig, voiceConfigHeaders } from '../hooks/useVoiceConfig'; + +function directUrl(baseUrl: string, path: string): string { + return `${baseUrl.replace(/\/$/, '')}${path}`; +} + +export function voiceConfigSignature(): string { + return JSON.stringify(readVoiceConfig()); +} + +export function transcribeVoice(blob: Blob, filename: string): Promise { + const config = readVoiceConfig(); + const body = new FormData(); + + if (config.baseUrl.trim()) { + body.append('file', blob, filename); + body.append('model', config.sttModel || 'whisper-1'); + return fetch(directUrl(config.baseUrl.trim(), '/audio/transcriptions'), { + method: 'POST', + headers: config.apiKey ? { Authorization: `Bearer ${config.apiKey}` } : {}, + body, + }); + } + + body.append('audio', blob, filename); + return authenticatedFetch('/api/voice/transcribe', { + method: 'POST', + headers: voiceConfigHeaders(), + body, + }); +} + +export function synthesizeVoice(text: string, signal: AbortSignal): Promise { + const config = readVoiceConfig(); + + if (config.baseUrl.trim()) { + return fetch(directUrl(config.baseUrl.trim(), '/audio/speech'), { + method: 'POST', + headers: { + 'Content-Type': 'application/json', + ...(config.apiKey ? { Authorization: `Bearer ${config.apiKey}` } : {}), + }, + body: JSON.stringify({ + model: config.ttsModel || 'tts-1', + voice: config.ttsVoice || 'alloy', + input: text, + ...(config.ttsFormat.trim() ? 
{ response_format: config.ttsFormat.trim() } : {}), + }), + signal, + }); + } + + return authenticatedFetch('/api/voice/tts', { + method: 'POST', + body: JSON.stringify({ text }), + headers: voiceConfigHeaders(), + signal, + }); +} diff --git a/src/lib/voicePlayer.ts b/src/lib/voicePlayer.ts new file mode 100644 index 00000000..4c239c29 --- /dev/null +++ b/src/lib/voicePlayer.ts @@ -0,0 +1,196 @@ +import { synthesizeVoice, voiceConfigSignature } from './voiceApi'; + +// A single app-level audio player for read-aloud. It owns one