diff --git a/server/index.js b/server/index.js
index d957ef58..7db1b122 100755
--- a/server/index.js
+++ b/server/index.js
@@ -61,6 +61,7 @@ import userRoutes from './routes/user.js';
 import geminiRoutes from './routes/gemini.js';
 import pluginsRoutes from './routes/plugins.js';
 import providerRoutes from './modules/providers/provider.routes.js';
+import voiceRoutes from './voice-proxy.js';
 import browserUseRoutes from './modules/browser-use/browser-use.routes.js';
 import browserUseMcpRoutes from './modules/browser-use/browser-use-mcp.routes.js';
 import { browserUseService } from './modules/browser-use/browser-use.service.js';
@@ -222,6 +223,8 @@ app.use('/api/providers', authenticateToken, providerRoutes);
 // Agent API Routes (uses API key authentication)
 app.use('/api/agent', agentRoutes);
 
+app.use('/api/voice', authenticateToken, voiceRoutes);
+
 // Serve public files (like api-docs.html)
 app.use(express.static(path.join(APP_ROOT, 'public')));
 
diff --git a/server/voice-proxy.js b/server/voice-proxy.js
new file mode 100644
index 00000000..1ea4a6d8
--- /dev/null
+++ b/server/voice-proxy.js
@@ -0,0 +1,224 @@
+// Optional voice proxy — forwards STT/TTS to an OpenAI-compatible audio backend.
+//
+// The backend is whatever the user points at: OpenAI, Groq, or a local server
+// (LocalAI / Speaches / Kokoro-FastAPI / openedai-speech / etc.). It must expose the
+// standard OpenAI audio endpoints:
+//     POST {base}/audio/transcriptions   (multipart 'file' + 'model')      -> { text }
+//     POST {base}/audio/speech           ({ model, voice, input })         -> audio bytes
+//
+// Config is resolved per-request from headers (set by the client's voice settings),
+// falling back to server env defaults. Mounted at /api/voice behind authenticateToken.
+import { Readable } from 'node:stream';
+
+import express from 'express';
+
+const ENV = {
+  baseUrl: (process.env.VOICE_API_BASE_URL || '').replace(/\/$/, ''),
+  apiKey: process.env.VOICE_API_KEY || '',
+  sttModel: process.env.VOICE_STT_MODEL || 'whisper-1',
+  ttsModel: process.env.VOICE_TTS_MODEL || 'tts-1',
+  ttsVoice: process.env.VOICE_TTS_VOICE || 'alloy',
+};
+
+/**
+ * Resolve the voice backend config for a request. Client headers (set from the
+ * user's in-app voice settings) take precedence over the server env defaults.
+ * @param {import('express').Request} req
+ * @returns {{baseUrl: string, apiKey: string, sttModel: string, ttsModel: string, ttsVoice: string, ttsFormat: string}}
+ */
+function resolveConfig(req) {
+  const h = req.headers;
+  return {
+    // Security: do not allow clients to control the outbound backend host.
+    // Always use the server-side configured base URL.
+    baseUrl: ENV.baseUrl,
+    apiKey: String(h['x-voice-api-key'] || '') || ENV.apiKey,
+    sttModel: String(h['x-voice-stt-model'] || '') || ENV.sttModel,
+    ttsModel: String(h['x-voice-tts-model'] || '') || ENV.ttsModel,
+    ttsVoice: String(h['x-voice-tts-voice'] || '') || ENV.ttsVoice,
+    ttsFormat: String(h['x-voice-tts-format'] || '').trim(),
+  };
+}
+
+const router = express.Router();
+
+// Generous by default — local TTS can synthesize long messages at ~real-time on CPU.
+// Guard against a non-numeric/zero override that would make setTimeout fire immediately.
+const DEFAULT_VOICE_TIMEOUT_MS = 300000;
+const _parsedTimeout = Number(process.env.VOICE_TIMEOUT_MS);
+const VOICE_TIMEOUT_MS = Number.isFinite(_parsedTimeout) && _parsedTimeout > 0
+  ? _parsedTimeout
+  : DEFAULT_VOICE_TIMEOUT_MS;
+
+/**
+ * fetch() with an AbortController timeout so a stalled backend can't hold the
+ * request open indefinitely. Aborts after VOICE_TIMEOUT_MS.
+ * @param {string} url
+ * @param {RequestInit} [options]
+ * @returns {Promise<Response>}
+ */
+async function fetchWithTimeout(url, options = {}) {
+  const parsed = new URL(url);
+  if (!['http:', 'https:'].includes(parsed.protocol) || !isAllowedBackendUrl(parsed.origin)) {
+    throw new Error('Blocked outbound voice backend URL');
+  }
+  const controller = new AbortController();
+  const timer = setTimeout(() => controller.abort(), VOICE_TIMEOUT_MS);
+  try {
+    return await fetch(parsed.toString(), { redirect: 'manual', ...options, signal: controller.signal });
+  } finally {
+    clearTimeout(timer);
+  }
+}
+
+/**
+ * Turn a backend fetch failure into a clear, actionable client response:
+ * 504 on timeout (AbortError), 502 otherwise.
+ * @param {import('express').Response} res
+ * @param {Error} e
+ */
+function backendError(res, e) {
+  if (e && e.name === 'AbortError') {
+    return res.status(504).json({
+      error: `Voice backend timed out after ${Math.round(VOICE_TIMEOUT_MS / 1000)}s. Check your voice backend.`,
+    });
+  }
+  return res.status(502).json({ error: `Voice backend unreachable: ${e.message}` });
+}
+
+/**
+ * SSRF guard for the user-configurable backend URL: allow http/https only and
+ * block the link-local / cloud-metadata range (169.254.x). localhost and private
+ * ranges are allowed on purpose so users can point at a local voice server
+ * (LocalAI, Speaches, Kokoro-FastAPI, etc.).
+ * @param {string} raw
+ * @returns {boolean}
+ */
+function isAllowedBackendUrl(raw) {
+  let u;
+  try {
+    u = new URL(raw);
+  } catch {
+    return false;
+  }
+  if (u.protocol !== 'http:' && u.protocol !== 'https:') return false;
+  if (u.hostname === '169.254.169.254' || u.hostname.startsWith('169.254.')) return false;
+  return true;
+}
+
+/**
+ * Relay an upstream (backend) error to the client without making an upstream
+ * 401/403 look like the user's own app login failed.
+ * @param {import('express').Response} res
+ * @param {number} status
+ * @param {string} [text]
+ */
+function upstreamError(res, status, text) {
+  if (status === 401 || status === 403) {
+    return res.status(502).json({ error: 'Voice backend rejected the request (check the API key).' });
+  }
+  return res.status(status).json({ error: text || 'voice backend error' });
+}
+
+let _upload = null;
+/**
+ * Lazily build a memory-storage multer instance (25 MB cap) for audio uploads,
+ * so multer is only imported when the voice feature is actually used.
+ * @returns {Promise<import('multer').Multer>}
+ */
+async function getUpload() {
+  if (!_upload) {
+    const multer = (await import('multer')).default;
+    _upload = multer({ storage: multer.memoryStorage(), limits: { fileSize: 25 * 1024 * 1024 } });
+  }
+  return _upload;
+}
+
+/**
+ * Build the Authorization header for the backend, or an empty object when no
+ * key is configured (e.g. a local server that needs none).
+ * @param {string} apiKey
+ * @returns {Record<string, string>}
+ */
+function authHeader(apiKey) {
+  return apiKey ? { Authorization: `Bearer ${apiKey}` } : {};
+}
+
+/**
+ * GET /api/voice/health -> { configured } (true when a backend base URL is set).
+ */
+router.get('/health', (req, res) => {
+  res.json({ configured: Boolean(resolveConfig(req).baseUrl) });
+});
+
+/**
+ * POST /api/voice/transcribe (multipart 'audio') -> { text }.
+ * Forwards the uploaded audio to the backend's /audio/transcriptions endpoint.
+ */
+router.post('/transcribe', async (req, res) => {
+  const cfg = resolveConfig(req);
+  if (!cfg.baseUrl) return res.status(503).json({ error: 'No voice backend configured' });
+  if (!isAllowedBackendUrl(cfg.baseUrl)) return res.status(400).json({ error: 'Invalid voice backend URL.' });
+  const upload = await getUpload();
+  upload.single('audio')(req, res, async (err) => {
+    if (err) return res.status(400).json({ error: err.message });
+    if (!req.file) return res.status(400).json({ error: 'No audio uploaded' });
+    try {
+      const fd = new FormData();
+      fd.append(
+        'file',
+        new Blob([req.file.buffer], { type: req.file.mimetype || 'audio/webm' }),
+        req.file.originalname || 'recording.webm',
+      );
+      fd.append('model', cfg.sttModel);
+      const r = await fetchWithTimeout(`${cfg.baseUrl}/audio/transcriptions`, {
+        method: 'POST',
+        headers: authHeader(cfg.apiKey),
+        body: fd,
+      });
+      const text = await r.text();
+      if (!r.ok) return upstreamError(res, r.status, text);
+      let data;
+      try { data = JSON.parse(text); } catch { data = { text }; }
+      res.json({ text: data.text ?? '' });
+    } catch (e) {
+      backendError(res, e);
+    }
+  });
+});
+
+/**
+ * POST /api/voice/tts { text } -> audio bytes.
+ * Forwards the text to the backend's /audio/speech endpoint and streams the audio back.
+ */
+router.post('/tts', async (req, res) => {
+  const cfg = resolveConfig(req);
+  if (!cfg.baseUrl) return res.status(503).json({ error: 'No voice backend configured' });
+  if (!isAllowedBackendUrl(cfg.baseUrl)) return res.status(400).json({ error: 'Invalid voice backend URL.' });
+  const text = req.body?.text;
+  if (typeof text !== 'string' || !text.trim()) return res.status(400).json({ error: 'text required' });
+  try {
+    const r = await fetchWithTimeout(`${cfg.baseUrl}/audio/speech`, {
+      method: 'POST',
+      headers: { 'Content-Type': 'application/json', ...authHeader(cfg.apiKey) },
+      body: JSON.stringify({
+        model: cfg.ttsModel,
+        voice: cfg.ttsVoice,
+        input: text,
+        ...(cfg.ttsFormat ? { response_format: cfg.ttsFormat } : {}),
+      }),
+    });
+    if (!r.ok) {
+      const errText = await r.text().catch(() => 'tts failed');
+      return upstreamError(res, r.status, errText);
+    }
+    res.setHeader('Content-Type', r.headers.get('content-type') || 'audio/mpeg');
+    res.setHeader('Cache-Control', 'no-store');
+    if (!r.body) return res.end();
+    Readable.fromWeb(r.body).on('error', (error) => res.destroy(error)).pipe(res);
+  } catch (e) {
+    backendError(res, e);
+  }
+});
+
+export default router;
diff --git a/src/components/chat/hooks/useChatComposerState.ts b/src/components/chat/hooks/useChatComposerState.ts
index c1f86f2d..e3e65b77 100644
--- a/src/components/chat/hooks/useChatComposerState.ts
+++ b/src/components/chat/hooks/useChatComposerState.ts
@@ -775,6 +775,17 @@ export function useChatComposerState({
     handleSubmitRef.current = handleSubmit;
   }, [handleSubmit]);
 
+  // A voice transcript either fills the input (to edit before sending) or, when the
+  // user tapped "stop and send", is submitted straight away. Mirror the value into
+  // inputValueRef synchronously so handleSubmit reads the new text, not the stale state.
+  const handleVoiceTranscript = useCallback((text: string, send?: boolean) => {
+    const base = inputValueRef.current.trim();
+    const next = base ? `${base} ${text}` : text;
+    setInput(next);
+    inputValueRef.current = next;
+    if (send) handleSubmitRef.current?.(createFakeSubmitEvent());
+  }, [setInput]);
+
   useEffect(() => {
     inputValueRef.current = input;
   }, [input]);
@@ -1013,6 +1024,7 @@ export function useChatComposerState({
     isDragActive,
     openImagePicker: open,
     handleSubmit,
+    handleVoiceTranscript,
     handleInputChange,
     handleKeyDown,
     handlePaste,
diff --git a/src/components/chat/hooks/useTts.ts b/src/components/chat/hooks/useTts.ts
new file mode 100644
index 00000000..fc4a6c33
--- /dev/null
+++ b/src/components/chat/hooks/useTts.ts
@@ -0,0 +1,33 @@
+import { useCallback, useEffect, useState } from 'react';
+import { voicePlayer, voiceId, type VoiceSnapshot } from '../../../lib/voicePlayer';
+
+export type TtsState = VoiceSnapshot['state'];
+
+/**
+ * Thin adapter over the app-level voicePlayer. Playback lives outside React (see
+ * lib/voicePlayer), so switching chats or re-rendering a message no longer cuts the
+ * audio off. This hook just reflects the player's state for one message and forwards taps.
+ */
+export function useTts(getText: () => string) {
+  const content = getText();
+  const id = voiceId(content);
+
+  const [snap, setSnap] = useState<VoiceSnapshot>(() => voicePlayer.getSnapshot(id));
+
+  useEffect(() => {
+    const update = () =>
+      setSnap((prev) => {
+        const next = voicePlayer.getSnapshot(id);
+        return prev.state === next.state && prev.error === next.error ? prev : next;
+      });
+    update();
+    return voicePlayer.subscribe(update);
+  }, [id]);
+
+  const toggle = useCallback(() => {
+    voicePlayer.unlock(); // synchronous, within the click gesture (iOS)
+    voicePlayer.toggle(content);
+  }, [content]);
+
+  return { state: snap.state, toggle, error: snap.error };
+}
diff --git a/src/components/chat/hooks/useVoiceAvailable.ts b/src/components/chat/hooks/useVoiceAvailable.ts
new file mode 100644
index 00000000..9ee92c48
--- /dev/null
+++ b/src/components/chat/hooks/useVoiceAvailable.ts
@@ -0,0 +1,85 @@
+import { useEffect, useState } from 'react';
+
+import { authenticatedFetch } from '../../../utils/api';
+import { readVoiceConfig, VOICE_CONFIG_SYNC_EVENT } from '../../../hooks/useVoiceConfig';
+
+// Voice UI is gated on the `voiceEnabled` UI preference (toggled in Quick Settings /
+// the Settings modal) and a configured voice backend.
+const STORAGE_KEY = 'uiPreferences';
+const SYNC_EVENT = 'ui-preferences:sync';
+let healthRequest: Promise<boolean> | null = null;
+
+function checkVoiceHealth(): Promise<boolean> {
+  if (healthRequest) return healthRequest;
+  const request = authenticatedFetch('/api/voice/health')
+    .then(async (response) => {
+      if (!response.ok) throw new Error(`Voice health check failed (${response.status})`);
+      const data = await response.json();
+      return data?.configured === true;
+    })
+    .finally(() => {
+      healthRequest = null;
+    });
+  healthRequest = request;
+  return request;
+}
+
+function readVoiceEnabled(): boolean {
+  try {
+    const raw = localStorage.getItem(STORAGE_KEY);
+    if (!raw) return false;
+    const parsed = JSON.parse(raw);
+    return parsed?.voiceEnabled === true || parsed?.voiceEnabled === 'true';
+  } catch {
+    return false;
+  }
+}
+
+export function useVoiceAvailable(): boolean {
+  const [enabled, setEnabled] = useState<boolean>(() =>
+    typeof window === 'undefined' ? false : readVoiceEnabled(),
+  );
+  const [available, setAvailable] = useState(false);
+
+  useEffect(() => {
+    const update = () => setEnabled(readVoiceEnabled());
+    window.addEventListener('storage', update);
+    window.addEventListener(SYNC_EVENT, update as EventListener);
+    return () => {
+      window.removeEventListener('storage', update);
+      window.removeEventListener(SYNC_EVENT, update as EventListener);
+    };
+  }, []);
+
+  useEffect(() => {
+    let active = true;
+    let requestId = 0;
+
+    const check = async () => {
+      if (!enabled) {
+        setAvailable(false);
+        return;
+      }
+      if (readVoiceConfig().baseUrl.trim()) {
+        setAvailable(true);
+        return;
+      }
+      const id = ++requestId;
+      try {
+        const result = await checkVoiceHealth();
+        if (active && id === requestId) setAvailable(result);
+      } catch {
+        if (active && id === requestId) setAvailable(false);
+      }
+    };
+
+    void check();
+    window.addEventListener(VOICE_CONFIG_SYNC_EVENT, check);
+    return () => {
+      active = false;
+      window.removeEventListener(VOICE_CONFIG_SYNC_EVENT, check);
+    };
+  }, [enabled]);
+
+  return enabled && available;
+}
diff --git a/src/components/chat/hooks/useVoiceInput.ts b/src/components/chat/hooks/useVoiceInput.ts
new file mode 100644
index 00000000..6fcadd56
--- /dev/null
+++ b/src/components/chat/hooks/useVoiceInput.ts
@@ -0,0 +1,149 @@
+import { useCallback, useEffect, useRef, useState } from 'react';
+
+import { transcribeVoice } from '../../../lib/voiceApi';
+
+// Mobile-safe recording: iOS Safari 18.4+ supports webm/opus; older iOS needs mp4.
+const MIME_CANDIDATES = [
+  'audio/webm;codecs=opus',
+  'audio/webm',
+  'audio/mp4',
+  'audio/ogg;codecs=opus',
+  'audio/ogg',
+];
+
+function pickMime(): string {
+  for (const t of MIME_CANDIDATES) {
+    try {
+      if (typeof MediaRecorder !== 'undefined' && MediaRecorder.isTypeSupported(t)) return t;
+    } catch {
+      /* isTypeSupported can throw on some iOS versions */
+    }
+  }
+  return '';
+}
+
+export type VoiceInputState = 'idle' | 'recording' | 'transcribing';
+
+/**
+ * Push-to-talk dictation. Records the mic, uploads to /api/voice/transcribe
+ * (an OpenAI-compatible speech-to-text backend via the Express proxy), and
+ * returns the transcript through onTranscript.
+ */
+export function useVoiceInput(
+  onTranscript: (text: string, send?: boolean) => void,
+  onError?: (msg: string) => void,
+) {
+  const [state, setState] = useState<VoiceInputState>('idle');
+  const recorderRef = useRef<MediaRecorder | null>(null);
+  const chunksRef = useRef<Blob[]>([]);
+  const streamRef = useRef<MediaStream | null>(null);
+  const cancelledRef = useRef(false);
+  const startingRef = useRef(false);
+  // Whether the in-progress stop should auto-send the transcript (vs just fill the box).
+  const sendRef = useRef(false);
+
+  const stopTracks = () => {
+    streamRef.current?.getTracks().forEach((t) => t.stop());
+    streamRef.current = null;
+  };
+
+  // Stop the mic if the component unmounts mid-recording.
+  useEffect(() => {
+    cancelledRef.current = false;
+    return () => {
+      cancelledRef.current = true;
+      startingRef.current = false;
+      streamRef.current?.getTracks().forEach((t) => t.stop());
+      streamRef.current = null;
+      recorderRef.current = null;
+    };
+  }, []);
+
+  const start = useCallback(async () => {
+    if (startingRef.current || (recorderRef.current && recorderRef.current.state !== 'inactive')) return;
+    startingRef.current = true;
+    try {
+      const stream = await navigator.mediaDevices.getUserMedia({
+        audio: { echoCancellation: true, noiseSuppression: true },
+      });
+      if (cancelledRef.current) {
+        stream.getTracks().forEach((t) => t.stop());
+        return;
+      }
+      streamRef.current = stream;
+      const mimeType = pickMime();
+      const rec = mimeType ? new MediaRecorder(stream, { mimeType }) : new MediaRecorder(stream);
+      recorderRef.current = rec;
+      chunksRef.current = [];
+
+      rec.ondataavailable = (e) => {
+        if (e.data.size > 0) chunksRef.current.push(e.data);
+      };
+
+      rec.onstop = async () => {
+        stopTracks();
+        if (cancelledRef.current) return;
+        // Capture and clear the send intent for this stop before any async work.
+        const shouldSend = sendRef.current;
+        sendRef.current = false;
+        const type = rec.mimeType || 'audio/webm';
+        const blob = new Blob(chunksRef.current, { type });
+        if (blob.size < 800) {
+          setState('idle');
+          onError?.('Recording too short');
+          return;
+        }
+        setState('transcribing');
+        try {
+          const ext = type.includes('mp4') ? 'm4a' : type.includes('ogg') ? 'ogg' : 'webm';
+          const res = await transcribeVoice(blob, `recording.${ext}`);
+          if (!res.ok) throw new Error(`transcribe ${res.status}`);
+          const data = await res.json();
+          if (cancelledRef.current) return;
+          const text = String(data?.text || '').trim();
+          if (text) onTranscript(text, shouldSend);
+          else onError?.('No speech detected');
+        } catch (e) {
+          if (!cancelledRef.current) {
+            onError?.(`Transcription failed: ${e instanceof Error ? e.message : String(e)}`);
+          }
+        } finally {
+          if (!cancelledRef.current) setState('idle');
+        }
+      };
+
+      rec.start();
+      setState('recording');
+    } catch (e) {
+      recorderRef.current = null;
+      stopTracks();
+      if (cancelledRef.current) return;
+      const err = e as { name?: string; message?: string };
+      let msg = `Mic error: ${err?.message || e}`;
+      if (err?.name === 'NotAllowedError') msg = 'Microphone access denied.';
+      else if (err?.name === 'NotFoundError') msg = 'No microphone found.';
+      onError?.(msg);
+      setState('idle');
+    } finally {
+      startingRef.current = false;
+    }
+  }, [onTranscript, onError]);
+
+  // Stop recording. Pass { send: true } to auto-send the transcript once it's ready.
+  // Guard on the recorder's own state (not React state) so a double tap, or the mic
+  // and Send buttons both firing, can't call stop() on an already-inactive recorder.
+  const stop = useCallback((opts?: { send?: boolean }) => {
+    const rec = recorderRef.current;
+    if (rec && rec.state !== 'inactive') {
+      sendRef.current = opts?.send ?? false;
+      rec.stop();
+    }
+  }, []);
+
+  const toggle = useCallback(() => {
+    if (state === 'recording') stop();
+    else if (state === 'idle') start();
+  }, [state, start, stop]);
+
+  return { state, toggle, stop };
+}
diff --git a/src/components/chat/view/ChatInterface.tsx b/src/components/chat/view/ChatInterface.tsx
index 15786a41..a83dfbdc 100644
--- a/src/components/chat/view/ChatInterface.tsx
+++ b/src/components/chat/view/ChatInterface.tsx
@@ -173,6 +173,7 @@ function ChatInterface({
     isDragActive,
     openImagePicker,
     handleSubmit,
+    handleVoiceTranscript,
     handleInputChange,
     handleKeyDown,
     handlePaste,
@@ -406,6 +407,7 @@ function ChatInterface({
           renderInputWithMentions={renderInputWithMentions}
           textareaRef={textareaRef}
           input={input}
+          onVoiceTranscript={handleVoiceTranscript}
           onInputChange={handleInputChange}
           onTextareaClick={handleTextareaClick}
           onTextareaKeyDown={handleKeyDown}
diff --git a/src/components/chat/view/subcomponents/ChatComposer.tsx b/src/components/chat/view/subcomponents/ChatComposer.tsx
index c60aa893..d679df11 100644
--- a/src/components/chat/view/subcomponents/ChatComposer.tsx
+++ b/src/components/chat/view/subcomponents/ChatComposer.tsx
@@ -1,4 +1,5 @@
 import { useTranslation } from 'react-i18next';
+import { useCallback, useEffect, useRef, useState } from 'react';
 import type {
   ChangeEvent,
   ClipboardEvent,
@@ -9,8 +10,10 @@ import type {
   RefObject,
   TouchEvent,
 } from 'react';
-import { ImageIcon, MessageSquareIcon, XIcon, ArrowDownIcon } from 'lucide-react';
+import { ImageIcon, MessageSquareIcon, XIcon, ArrowDownIcon, Loader2 } from 'lucide-react';
 
+import { useVoiceInput } from '../../hooks/useVoiceInput';
+import { useVoiceAvailable } from '../../hooks/useVoiceAvailable';
 import type { SessionActivity } from '../../../../hooks/useSessionProtection';
 import type { PendingPermissionRequest, PermissionMode } from '../../types/types';
 import {
@@ -27,6 +30,7 @@ import {
 import CommandMenu from './CommandMenu';
 import ActivityIndicator from './ActivityIndicator';
 import ImageAttachment from './ImageAttachment';
+import VoiceInputButton from './VoiceInputButton';
 import PermissionRequestsBanner from './PermissionRequestsBanner';
 import TokenUsageSummary from './TokenUsageSummary';
 
@@ -89,6 +93,7 @@ interface ChatComposerProps {
   renderInputWithMentions: (text: string) => ReactNode;
   textareaRef: RefObject<HTMLTextAreaElement>;
   input: string;
+  onVoiceTranscript?: (text: string, send?: boolean) => void;
   onInputChange: (event: ChangeEvent<HTMLTextAreaElement>) => void;
   onTextareaClick: (event: MouseEvent<HTMLTextAreaElement>) => void;
   onTextareaKeyDown: (event: KeyboardEvent<HTMLTextAreaElement>) => void;
@@ -142,6 +147,7 @@ export default function ChatComposer({
   renderInputWithMentions,
   textareaRef,
   input,
+  onVoiceTranscript,
   onInputChange,
   onTextareaClick,
   onTextareaKeyDown,
@@ -154,6 +160,28 @@ export default function ChatComposer({
   sendByCtrlEnter,
 }: ChatComposerProps) {
   const { t } = useTranslation('chat');
+
+  // Voice state is hosted here (not in the mic button) so the main Send button can stop
+  // recording and send the transcript in one tap, the way the mic button drops it in the box.
+  const voiceAvailable = useVoiceAvailable();
+  const [voiceError, setVoiceError] = useState<string | null>(null);
+  const voiceErrorTimer = useRef<ReturnType<typeof setTimeout> | null>(null);
+  const handleVoiceError = useCallback((msg: string) => {
+    setVoiceError(msg);
+    if (voiceErrorTimer.current) clearTimeout(voiceErrorTimer.current);
+    voiceErrorTimer.current = setTimeout(() => setVoiceError(null), 4000);
+  }, []);
+  useEffect(() => () => {
+    if (voiceErrorTimer.current) clearTimeout(voiceErrorTimer.current);
+  }, []);
+  const noopTranscript = useCallback(() => {}, []);
+  const { state: voiceState, toggle: voiceToggle, stop: voiceStop } = useVoiceInput(
+    onVoiceTranscript ?? noopTranscript,
+    handleVoiceError,
+  );
+  const isRecording = voiceState === 'recording';
+  const isTranscribing = voiceState === 'transcribing';
+
   const textareaRect = textareaRef.current?.getBoundingClientRect();
   const commandMenuPosition = {
     top: textareaRect ? Math.max(16, textareaRect.top - 316) : 0,
@@ -309,6 +337,10 @@ export default function ChatComposer({
               <ImageIcon />
             </PromptInputButton>
 
+            {onVoiceTranscript && voiceAvailable && (
+              <VoiceInputButton state={voiceState} onToggle={voiceToggle} errorMsg={voiceError} />
+            )}
+
             <button
               type="button"
               onClick={onModeSwitch}
@@ -387,10 +419,21 @@ export default function ChatComposer({
               {sendByCtrlEnter ? t('input.hintText.ctrlEnter') : t('input.hintText.enter')}
             </div>
             <PromptInputSubmit
-              onClick={isLoading ? onAbortSession : undefined}
-              disabled={!isLoading && !input.trim()}
+              onClick={
+                isLoading
+                  ? onAbortSession
+                  : isRecording
+                    ? (e: MouseEvent<HTMLButtonElement>) => {
+                        e.preventDefault();
+                        voiceStop({ send: true });
+                      }
+                    : undefined
+              }
+              disabled={isLoading ? false : isRecording ? false : isTranscribing ? true : !input.trim()}
               className="h-10 w-10 sm:h-10 sm:w-10"
-            />
+            >
+              {isTranscribing ? <Loader2 className="h-4 w-4 animate-spin" /> : undefined}
+            </PromptInputSubmit>
           </div>
         </PromptInputFooter>
       </PromptInput>
diff --git a/src/components/chat/view/subcomponents/MessageComponent.tsx b/src/components/chat/view/subcomponents/MessageComponent.tsx
index ba69bf90..17b27918 100644
--- a/src/components/chat/view/subcomponents/MessageComponent.tsx
+++ b/src/components/chat/view/subcomponents/MessageComponent.tsx
@@ -15,6 +15,7 @@ import { Reasoning, ReasoningTrigger, ReasoningContent } from '../../../../share
 
 import { Markdown } from './Markdown';
 import MessageCopyControl from './MessageCopyControl';
+import MessageSpeakControl from './MessageSpeakControl';
 
 type DiffLine = {
   type: string;
@@ -415,6 +416,9 @@ const MessageComponent = memo(({ message, prevMessage, createDiff, onFileOpen, a
                 {shouldShowAssistantCopyControl && (
                   <MessageCopyControl content={assistantCopyContent} messageType="assistant" />
                 )}
+                {shouldShowAssistantCopyControl && (
+                  <MessageSpeakControl content={assistantCopyContent} />
+                )}
                 {!isGrouped && <span>{formattedTime}</span>}
               </div>
             )}
diff --git a/src/components/chat/view/subcomponents/MessageSpeakControl.tsx b/src/components/chat/view/subcomponents/MessageSpeakControl.tsx
new file mode 100644
index 00000000..01a90dfa
--- /dev/null
+++ b/src/components/chat/view/subcomponents/MessageSpeakControl.tsx
@@ -0,0 +1,44 @@
+import { Volume2, Loader2, Square } from 'lucide-react';
+import { useTranslation } from 'react-i18next';
+import { useTts } from '../../hooks/useTts';
+import { useVoiceAvailable } from '../../hooks/useVoiceAvailable';
+
+// Tap-to-speak button beside the copy control on assistant messages.
+// Renders nothing unless the optional voice feature is enabled.
+const MessageSpeakControl = ({ content }: { content: string }) => {
+  const { t } = useTranslation('chat');
+  const available = useVoiceAvailable();
+  const { state, toggle, error } = useTts(() => content);
+
+  if (!available) return null;
+
+  const title =
+    state === 'playing' ? t('voice.stopSpeaking') : state === 'loading' ? t('voice.loading') : t('voice.speak');
+
+  return (
+    <span className="relative inline-flex">
+      {error && (
+        <span className="absolute bottom-full left-1/2 z-10 mb-1 max-w-[240px] -translate-x-1/2 whitespace-normal rounded bg-red-600 px-2 py-1 text-center text-xs text-white shadow-lg">
+          {error}
+        </span>
+      )}
+      <button
+        type="button"
+        onClick={toggle}
+        title={title}
+        aria-label={title}
+        className="inline-flex items-center gap-1 rounded px-1 py-0.5 text-gray-400 transition-colors hover:text-gray-600 dark:text-gray-500 dark:hover:text-gray-300"
+      >
+        {state === 'playing' ? (
+          <Square className="h-3.5 w-3.5" />
+        ) : state === 'loading' ? (
+          <Loader2 className="h-3.5 w-3.5 animate-spin" />
+        ) : (
+          <Volume2 className="h-3.5 w-3.5" />
+        )}
+      </button>
+    </span>
+  );
+};
+
+export default MessageSpeakControl;
diff --git a/src/components/chat/view/subcomponents/VoiceInputButton.tsx b/src/components/chat/view/subcomponents/VoiceInputButton.tsx
new file mode 100644
index 00000000..249afacd
--- /dev/null
+++ b/src/components/chat/view/subcomponents/VoiceInputButton.tsx
@@ -0,0 +1,46 @@
+import { useTranslation } from 'react-i18next';
+import { Mic, Square, Loader2 } from 'lucide-react';
+
+import { PromptInputButton } from '../../../../shared/view/ui';
+import type { VoiceInputState } from '../../hooks/useVoiceInput';
+
+type Props = {
+  state: VoiceInputState;
+  onToggle: () => void;
+  errorMsg?: string | null;
+};
+
+// Push-to-talk mic button (presentational). Recording state and the stop-and-send action
+// are owned by the composer so the main Send button can drive them too. This button just
+// starts recording and, while recording, stops and drops the transcript into the input box.
+export default function VoiceInputButton({ state, onToggle, errorMsg }: Props) {
+  const { t } = useTranslation('chat');
+
+  const icon =
+    state === 'recording' ? (
+      <Square className="text-red-500" />
+    ) : state === 'transcribing' ? (
+      <Loader2 className="animate-spin" />
+    ) : (
+      <Mic />
+    );
+
+  return (
+    <span className="relative inline-flex">
+      {errorMsg && (
+        <span className="absolute bottom-full left-1/2 mb-1 -translate-x-1/2 whitespace-nowrap rounded bg-red-600 px-2 py-1 text-xs text-white shadow-lg">
+          {errorMsg}
+        </span>
+      )}
+      <PromptInputButton
+        tooltip={{ content: state === 'recording' ? t('voice.stopRecording') : t('voice.input') }}
+        onClick={(e: { preventDefault: () => void }) => {
+          e.preventDefault();
+          onToggle();
+        }}
+      >
+        {icon}
+      </PromptInputButton>
+    </span>
+  );
+}
diff --git a/src/components/quick-settings-panel/constants.ts b/src/components/quick-settings-panel/constants.ts
index 15c15458..408a64c7 100644
--- a/src/components/quick-settings-panel/constants.ts
+++ b/src/components/quick-settings-panel/constants.ts
@@ -4,6 +4,7 @@ import {
   Eye,
   Languages,
   Maximize2,
+  Mic,
 } from 'lucide-react';
 import type { PreferenceToggleItem } from './types';
 
@@ -54,4 +55,9 @@ export const INPUT_SETTING_TOGGLES: PreferenceToggleItem[] = [
     labelKey: 'quickSettings.sendByCtrlEnter',
     icon: Languages,
   },
+  {
+    key: 'voiceEnabled',
+    labelKey: 'quickSettings.voiceEnabled',
+    icon: Mic,
+  },
 ];
diff --git a/src/components/quick-settings-panel/types.ts b/src/components/quick-settings-panel/types.ts
index 16002694..8d4f0826 100644
--- a/src/components/quick-settings-panel/types.ts
+++ b/src/components/quick-settings-panel/types.ts
@@ -6,7 +6,8 @@ export type PreferenceToggleKey =
   | 'showRawParameters'
   | 'showThinking'
   | 'autoScrollToBottom'
-  | 'sendByCtrlEnter';
+  | 'sendByCtrlEnter'
+  | 'voiceEnabled';
 
 export type QuickSettingsPreferences = Record<PreferenceToggleKey, boolean>;
 
diff --git a/src/components/quick-settings-panel/view/QuickSettingsContent.tsx b/src/components/quick-settings-panel/view/QuickSettingsContent.tsx
index 8d805fe9..dc539621 100644
--- a/src/components/quick-settings-panel/view/QuickSettingsContent.tsx
+++ b/src/components/quick-settings-panel/view/QuickSettingsContent.tsx
@@ -28,6 +28,9 @@ export default function QuickSettingsContent({
   onPreferenceChange,
 }: QuickSettingsContentProps) {
   const { t } = useTranslation('settings');
+  const inputSettingToggles = preferences.voiceEnabled
+    ? INPUT_SETTING_TOGGLES
+    : INPUT_SETTING_TOGGLES.filter(({ key }) => key !== 'voiceEnabled');
 
   const renderToggleRows = (items: PreferenceToggleItem[]) => (
     items.map(({ key, labelKey, icon }) => (
@@ -67,7 +70,7 @@ export default function QuickSettingsContent({
       </QuickSettingsSection>
 
       <QuickSettingsSection title={t('quickSettings.sections.inputSettings')}>
-        {renderToggleRows(INPUT_SETTING_TOGGLES)}
+        {renderToggleRows(inputSettingToggles)}
         <p className="ml-3 text-xs text-gray-500 dark:text-gray-400">
           {t('quickSettings.sendByCtrlEnterDescription')}
         </p>
diff --git a/src/components/quick-settings-panel/view/QuickSettingsPanelView.tsx b/src/components/quick-settings-panel/view/QuickSettingsPanelView.tsx
index 0de1bbc7..5f630a61 100644
--- a/src/components/quick-settings-panel/view/QuickSettingsPanelView.tsx
+++ b/src/components/quick-settings-panel/view/QuickSettingsPanelView.tsx
@@ -27,12 +27,14 @@ export default function QuickSettingsPanelView() {
     showThinking: preferences.showThinking,
     autoScrollToBottom: preferences.autoScrollToBottom,
     sendByCtrlEnter: preferences.sendByCtrlEnter,
+    voiceEnabled: preferences.voiceEnabled,
   }), [
     preferences.autoExpandTools,
     preferences.autoScrollToBottom,
     preferences.sendByCtrlEnter,
     preferences.showRawParameters,
     preferences.showThinking,
+    preferences.voiceEnabled,
   ]);
 
   const handlePreferenceChange = useCallback(
diff --git a/src/components/settings/types/types.ts b/src/components/settings/types/types.ts
index 74c3d309..5efac8d3 100644
--- a/src/components/settings/types/types.ts
+++ b/src/components/settings/types/types.ts
@@ -3,7 +3,7 @@ import type { Dispatch, SetStateAction } from 'react';
 import type { LLMProvider } from '../../../types/app';
 import type { ProviderAuthStatus } from '../../provider-auth/types';
 
-export type SettingsMainTab = 'agents' | 'appearance' | 'git' | 'api' | 'tasks' | 'browser' | 'notifications' | 'plugins' | 'about';
+export type SettingsMainTab = 'agents' | 'appearance' | 'git' | 'api' | 'voice' | 'tasks' | 'browser' | 'notifications' | 'plugins' | 'about';
 export type AgentProvider = LLMProvider;
 export type AgentCategory = 'account' | 'permissions' | 'mcp' | 'skills';
 export type ProjectSortOrder = 'name' | 'date';
diff --git a/src/components/settings/view/Settings.tsx b/src/components/settings/view/Settings.tsx
index bfa98edf..2d434d04 100644
--- a/src/components/settings/view/Settings.tsx
+++ b/src/components/settings/view/Settings.tsx
@@ -7,6 +7,7 @@ import SettingsSidebar from '../view/SettingsSidebar';
 import AgentsSettingsTab from '../view/tabs/agents-settings/AgentsSettingsTab';
 import AppearanceSettingsTab from '../view/tabs/AppearanceSettingsTab';
 import CredentialsSettingsTab from '../view/tabs/api-settings/CredentialsSettingsTab';
+import VoiceSettingsTab from '../view/tabs/VoiceSettingsTab';
 import GitSettingsTab from '../view/tabs/git-settings/GitSettingsTab';
 import BrowserUseSettingsTab from '../view/tabs/browser-use-settings/BrowserUseSettingsTab';
 import NotificationsSettingsTab from '../view/tabs/NotificationsSettingsTab';
@@ -157,6 +158,8 @@ function Settings({ isOpen, onClose, projects = [], initialTab = 'agents' }: Set
 
               {activeTab === 'api' && <CredentialsSettingsTab />}
 
+              {activeTab === 'voice' && <VoiceSettingsTab />}
+
               {activeTab === 'plugins' && <PluginSettingsTab />}
 
               {activeTab === 'about' && <AboutTab />}
diff --git a/src/components/settings/view/SettingsSidebar.tsx b/src/components/settings/view/SettingsSidebar.tsx
index dde32a9e..3b76976e 100644
--- a/src/components/settings/view/SettingsSidebar.tsx
+++ b/src/components/settings/view/SettingsSidebar.tsx
@@ -1,5 +1,6 @@
-import { Bell, Bot, GitBranch, Info, Key, ListChecks, MonitorPlay, Palette, Puzzle } from 'lucide-react';
+import { Bell, Bot, GitBranch, Info, Key, ListChecks, Mic, MonitorPlay, Palette, Puzzle } from 'lucide-react';
 import { useTranslation } from 'react-i18next';
+
 import { cn } from '../../../lib/utils';
 import { PillBar, Pill } from '../../../shared/view/ui';
 import type { SettingsMainTab } from '../types/types';
@@ -20,6 +21,7 @@ const NAV_ITEMS: NavItem[] = [
   { id: 'appearance', labelKey: 'mainTabs.appearance', icon: Palette },
   { id: 'git', labelKey: 'mainTabs.git', icon: GitBranch },
   { id: 'api', labelKey: 'mainTabs.apiTokens', icon: Key },
+  { id: 'voice', labelKey: 'mainTabs.voice', icon: Mic },
   { id: 'tasks', labelKey: 'mainTabs.tasks', icon: ListChecks },
   { id: 'browser', labelKey: 'mainTabs.browser', icon: MonitorPlay },
   { id: 'plugins', labelKey: 'mainTabs.plugins', icon: Puzzle },
diff --git a/src/components/settings/view/tabs/VoiceSettingsTab.tsx b/src/components/settings/view/tabs/VoiceSettingsTab.tsx
new file mode 100644
index 00000000..8dcf7585
--- /dev/null
+++ b/src/components/settings/view/tabs/VoiceSettingsTab.tsx
@@ -0,0 +1,91 @@
+import type { InputHTMLAttributes } from 'react';
+import { useTranslation } from 'react-i18next';
+import SettingsSection from '../SettingsSection';
+import SettingsToggle from '../SettingsToggle';
+import { useUiPreferences } from '../../../../hooks/useUiPreferences';
+import { useVoiceConfig } from '../../../../hooks/useVoiceConfig';
+
+const inputClass =
+  'w-full rounded-md border border-border bg-background px-3 py-2 text-sm text-foreground placeholder:text-muted-foreground focus:outline-none focus:ring-2 focus:ring-ring';
+
+function Field({ label, ...props }: { label: string } & InputHTMLAttributes<HTMLInputElement>) {
+  return (
+    <label className="block space-y-1">
+      <span className="text-sm font-medium text-foreground">{label}</span>
+      <input className={inputClass} {...props} />
+    </label>
+  );
+}
+
+export default function VoiceSettingsTab() {
+  const { t } = useTranslation('settings');
+  const { preferences, setPreference } = useUiPreferences();
+  const { config, update } = useVoiceConfig();
+  const voiceEnabled = preferences.voiceEnabled;
+
+  return (
+    <div className="space-y-8">
+      <SettingsSection title={t('voiceSettings.title')} description={t('voiceSettings.description')}>
+        <div className="flex items-center justify-between rounded-lg border border-border p-3">
+          <div className="pr-3">
+            <div className="text-sm font-medium text-foreground">{t('voiceSettings.enable')}</div>
+            <div className="text-xs text-muted-foreground">{t('voiceSettings.enableDescription')}</div>
+          </div>
+          <SettingsToggle
+            checked={voiceEnabled}
+            onChange={(v) => setPreference('voiceEnabled', v)}
+            ariaLabel={t('voiceSettings.enable')}
+          />
+        </div>
+      </SettingsSection>
+
+      {voiceEnabled && (
+        <SettingsSection title={t('voiceSettings.backendTitle')} description={t('voiceSettings.backendDescription')}>
+          <div className="space-y-4">
+            <Field
+              label={t('voiceSettings.baseUrl')}
+              placeholder="https://api.openai.com/v1"
+              value={config.baseUrl}
+              onChange={(e) => update({ baseUrl: e.target.value })}
+            />
+            <Field
+              label={t('voiceSettings.apiKey')}
+              type="password"
+              autoComplete="off"
+              placeholder="sk-…"
+              value={config.apiKey}
+              onChange={(e) => update({ apiKey: e.target.value })}
+            />
+            <div className="grid grid-cols-1 gap-4 sm:grid-cols-4">
+              <Field
+                label={t('voiceSettings.sttModel')}
+                placeholder="whisper-1"
+                value={config.sttModel}
+                onChange={(e) => update({ sttModel: e.target.value })}
+              />
+              <Field
+                label={t('voiceSettings.ttsModel')}
+                placeholder="tts-1"
+                value={config.ttsModel}
+                onChange={(e) => update({ ttsModel: e.target.value })}
+              />
+              <Field
+                label={t('voiceSettings.voice')}
+                placeholder="alloy"
+                value={config.ttsVoice}
+                onChange={(e) => update({ ttsVoice: e.target.value })}
+              />
+              <Field
+                label={t('voiceSettings.format')}
+                placeholder="mp3"
+                value={config.ttsFormat}
+                onChange={(e) => update({ ttsFormat: e.target.value })}
+              />
+            </div>
+            <p className="text-xs text-muted-foreground">{t('voiceSettings.note')}</p>
+          </div>
+        </SettingsSection>
+      )}
+    </div>
+  );
+}
diff --git a/src/hooks/useUiPreferences.ts b/src/hooks/useUiPreferences.ts
index eb0b8339..342f1698 100644
--- a/src/hooks/useUiPreferences.ts
+++ b/src/hooks/useUiPreferences.ts
@@ -7,6 +7,7 @@ type UiPreferences = {
   autoScrollToBottom: boolean;
   sendByCtrlEnter: boolean;
   sidebarVisible: boolean;
+  voiceEnabled: boolean;
 };
 
 type UiPreferenceKey = keyof UiPreferences;
@@ -39,6 +40,7 @@ const DEFAULTS: UiPreferences = {
   autoScrollToBottom: true,
   sendByCtrlEnter: false,
   sidebarVisible: true,
+  voiceEnabled: false,
 };
 
 const PREFERENCE_KEYS = Object.keys(DEFAULTS) as UiPreferenceKey[];
diff --git a/src/hooks/useVoiceConfig.ts b/src/hooks/useVoiceConfig.ts
new file mode 100644
index 00000000..303b6467
--- /dev/null
+++ b/src/hooks/useVoiceConfig.ts
@@ -0,0 +1,68 @@
+import { useState } from 'react';
+
+export type VoiceConfig = {
+  baseUrl: string;
+  apiKey: string;
+  sttModel: string;
+  ttsModel: string;
+  ttsVoice: string;
+  ttsFormat: string;
+};
+
+const STORAGE_KEY = 'voiceConfig';
+export const VOICE_CONFIG_SYNC_EVENT = 'voice-config:sync';
+const DEFAULTS: VoiceConfig = { baseUrl: '', apiKey: '', sttModel: '', ttsModel: '', ttsVoice: '', ttsFormat: '' };
+
+export function readVoiceConfig(): VoiceConfig {
+  try {
+    const raw = localStorage.getItem(STORAGE_KEY);
+    if (!raw) return { ...DEFAULTS };
+    const parsed = JSON.parse(raw);
+    if (!parsed || typeof parsed !== 'object' || Array.isArray(parsed)) return { ...DEFAULTS };
+    const config = { ...DEFAULTS };
+    for (const key of Object.keys(DEFAULTS) as (keyof VoiceConfig)[]) {
+      if (typeof parsed[key] === 'string') config[key] = parsed[key];
+    }
+    return config;
+  } catch {
+    return { ...DEFAULTS };
+  }
+}
+
+// Headers the voice proxy reads to target a per-user OpenAI-compatible backend.
+// Empty fields are omitted so the server's env defaults apply.
+export function voiceConfigHeaders(): Record<string, string> {
+  if (typeof window === 'undefined') return {};
+  const c = readVoiceConfig();
+  const h: Record<string, string> = {};
+  if (c.apiKey) h['x-voice-api-key'] = c.apiKey;
+  if (c.sttModel) h['x-voice-stt-model'] = c.sttModel;
+  if (c.ttsModel) h['x-voice-tts-model'] = c.ttsModel;
+  if (c.ttsVoice) h['x-voice-tts-voice'] = c.ttsVoice;
+  if (c.ttsFormat.trim()) h['x-voice-tts-format'] = c.ttsFormat.trim();
+  return h;
+}
+
+export function useVoiceConfig() {
+  const [config, setConfig] = useState<VoiceConfig>(() =>
+    typeof window === 'undefined' ? { ...DEFAULTS } : readVoiceConfig(),
+  );
+
+  const update = (patch: Partial<VoiceConfig>) => {
+    setConfig((prev) => {
+      const next = { ...prev, ...patch };
+      try {
+        const stored: Partial<VoiceConfig> = { ...next };
+        if (next.ttsFormat.trim()) stored.ttsFormat = next.ttsFormat.trim();
+        else delete stored.ttsFormat;
+        localStorage.setItem(STORAGE_KEY, JSON.stringify(stored));
+        window.dispatchEvent(new Event(VOICE_CONFIG_SYNC_EVENT));
+      } catch {
+        /* ignore persistence errors */
+      }
+      return next;
+    });
+  };
+
+  return { config, update };
+}
diff --git a/src/i18n/locales/en/chat.json b/src/i18n/locales/en/chat.json
index 2c75fad0..656fa328 100644
--- a/src/i18n/locales/en/chat.json
+++ b/src/i18n/locales/en/chat.json
@@ -122,6 +122,14 @@
       }
     }
   },
+  "voice": {
+    "input": "Voice input",
+    "stopRecording": "Stop recording",
+    "transcribing": "Transcribing…",
+    "speak": "Read aloud",
+    "stopSpeaking": "Stop",
+    "loading": "Loading…"
+  },
   "input": {
     "placeholder": "Type / for commands, @ for files, or ask {{provider}} anything...",
     "placeholderDefault": "Type your message...",
diff --git a/src/i18n/locales/en/settings.json b/src/i18n/locales/en/settings.json
index 89d4e651..a04067e2 100644
--- a/src/i18n/locales/en/settings.json
+++ b/src/i18n/locales/en/settings.json
@@ -50,6 +50,21 @@
     "resetToDefaults": "Reset to Defaults",
     "cancelChanges": "Cancel Changes"
   },
+  "voiceSettings": {
+    "title": "Voice",
+    "description": "Speech-to-text input and read-aloud, via an OpenAI-compatible audio backend.",
+    "enable": "Enable voice",
+    "enableDescription": "Show the mic button and the read-aloud button on messages.",
+    "backendTitle": "Backend",
+    "backendDescription": "Point at OpenAI, Groq, or a local server (LocalAI, Speaches, Kokoro-FastAPI). Leave blank to use the server default.",
+    "baseUrl": "Base URL",
+    "apiKey": "API key",
+    "sttModel": "Speech-to-text model",
+    "ttsModel": "Text-to-speech model",
+    "voice": "Voice",
+    "format": "Audio format",
+    "note": "A custom base URL is called directly by your browser and must allow browser CORS requests. Leave it blank to use the server-configured backend."
+  },
   "quickSettings": {
     "title": "Quick Settings",
     "sections": {
@@ -64,6 +79,7 @@
     "showThinking": "Show thinking",
     "autoScrollToBottom": "Auto-scroll to bottom",
     "sendByCtrlEnter": "Send by Ctrl+Enter",
+    "voiceEnabled": "Voice (mic + read aloud)",
     "sendByCtrlEnterDescription": "When enabled, pressing Ctrl+Enter will send the message instead of just Enter. This is useful for IME users to avoid accidental sends.",
     "dragHandle": {
       "dragging": "Dragging handle",
@@ -94,6 +110,7 @@
     "appearance": "Appearance",
     "git": "Git",
     "apiTokens": "API & Tokens",
+    "voice": "Voice",
     "tasks": "Tasks",
     "browser": "Browser",
     "notifications": "Notifications",
diff --git a/src/lib/voiceApi.ts b/src/lib/voiceApi.ts
new file mode 100644
index 00000000..3f9549b4
--- /dev/null
+++ b/src/lib/voiceApi.ts
@@ -0,0 +1,60 @@
+import { authenticatedFetch } from '../utils/api';
+import { readVoiceConfig, voiceConfigHeaders } from '../hooks/useVoiceConfig';
+
+function directUrl(baseUrl: string, path: string): string {
+  return `${baseUrl.replace(/\/$/, '')}${path}`;
+}
+
+export function voiceConfigSignature(): string {
+  return JSON.stringify(readVoiceConfig());
+}
+
+export function transcribeVoice(blob: Blob, filename: string): Promise<Response> {
+  const config = readVoiceConfig();
+  const body = new FormData();
+
+  if (config.baseUrl.trim()) {
+    body.append('file', blob, filename);
+    body.append('model', config.sttModel || 'whisper-1');
+    return fetch(directUrl(config.baseUrl.trim(), '/audio/transcriptions'), {
+      method: 'POST',
+      headers: config.apiKey ? { Authorization: `Bearer ${config.apiKey}` } : {},
+      body,
+    });
+  }
+
+  body.append('audio', blob, filename);
+  return authenticatedFetch('/api/voice/transcribe', {
+    method: 'POST',
+    headers: voiceConfigHeaders(),
+    body,
+  });
+}
+
+export function synthesizeVoice(text: string, signal: AbortSignal): Promise<Response> {
+  const config = readVoiceConfig();
+
+  if (config.baseUrl.trim()) {
+    return fetch(directUrl(config.baseUrl.trim(), '/audio/speech'), {
+      method: 'POST',
+      headers: {
+        'Content-Type': 'application/json',
+        ...(config.apiKey ? { Authorization: `Bearer ${config.apiKey}` } : {}),
+      },
+      body: JSON.stringify({
+        model: config.ttsModel || 'tts-1',
+        voice: config.ttsVoice || 'alloy',
+        input: text,
+        ...(config.ttsFormat.trim() ? { response_format: config.ttsFormat.trim() } : {}),
+      }),
+      signal,
+    });
+  }
+
+  return authenticatedFetch('/api/voice/tts', {
+    method: 'POST',
+    body: JSON.stringify({ text }),
+    headers: voiceConfigHeaders(),
+    signal,
+  });
+}
diff --git a/src/lib/voicePlayer.ts b/src/lib/voicePlayer.ts
new file mode 100644
index 00000000..4c239c29
--- /dev/null
+++ b/src/lib/voicePlayer.ts
@@ -0,0 +1,196 @@
+import { synthesizeVoice, voiceConfigSignature } from './voiceApi';
+
+// A single app-level audio player for read-aloud. It owns one <audio> element, lives
+// outside the React tree, and caches generated audio by content. Because playback is not
+// tied to a component, switching chats or re-rendering a message can't revoke the blob URL
+// out from under it (the cause of mid-play cutoffs). v1 plays one message at a time
+// (a new play replaces the current one); the design leaves room for a queue later.
+
+export type VoicePlayState = 'idle' | 'loading' | 'playing';
+
+export type VoiceSnapshot = { state: VoicePlayState; error: string | null };
+
+const IDLE: VoiceSnapshot = { state: 'idle', error: null };
+const CACHE_MAX = 24;
+const CLIENT_TIMEOUT_MS = 330000; // backstop; the server proxy already times out at 5 min
+
+// Stable id / cache key from the text and voice settings that affect its audio (djb2).
+export function voiceId(content: string, signature = voiceConfigSignature()): string {
+  const input = JSON.stringify([content, signature]);
+  let h = 5381;
+  for (let i = 0; i < input.length; i++) h = (((h << 5) + h) + input.charCodeAt(i)) | 0;
+  return (h >>> 0).toString(36);
+}
+
+class VoicePlayer {
+  private audio: HTMLAudioElement | null = null;
+  private unlocked = false;
+  private cache = new Map<string, string>(); // id -> blob URL (insertion order = LRU)
+  private currentId: string | null = null;
+  private state: VoicePlayState = 'idle';
+  private errorId: string | null = null;
+  private errorMsg: string | null = null;
+  private token = 0; // bumps to ignore stale in-flight results
+  private activeController: AbortController | null = null; // aborts the in-flight TTS fetch
+  private errorTimer: ReturnType<typeof setTimeout> | null = null;
+  private listeners = new Set<() => void>();
+
+  subscribe(listener: () => void): () => void {
+    this.listeners.add(listener);
+    return () => {
+      this.listeners.delete(listener);
+    };
+  }
+
+  private emit() {
+    this.listeners.forEach((l) => l());
+  }
+
+  getSnapshot(id: string): VoiceSnapshot {
+    const state = this.currentId === id ? this.state : 'idle';
+    const error = this.errorId === id ? this.errorMsg : null;
+    if (state === 'idle' && error === null) return IDLE;
+    return { state, error };
+  }
+
+  private ensureAudio(): HTMLAudioElement {
+    if (!this.audio) {
+      const audio = new Audio();
+      audio.addEventListener('ended', () => this.onEnded());
+      audio.addEventListener('error', () => {
+        // Only meaningful while we believe we're playing.
+        if (this.state === 'playing') this.onEnded();
+      });
+      this.audio = audio;
+    }
+    return this.audio;
+  }
+
+  // Call synchronously from the click handler so iOS grants the (reused) element playback.
+  unlock() {
+    if (this.unlocked) return;
+    const audio = this.ensureAudio();
+    try {
+      const p = audio.play();
+      if (p && typeof p.catch === 'function') p.catch(() => {});
+      audio.pause();
+    } catch {
+      /* priming attempt; ignore */
+    }
+    this.unlocked = true;
+  }
+
+  toggle(content: string) {
+    const id = voiceId(content);
+    if (this.currentId === id && (this.state === 'playing' || this.state === 'loading')) {
+      this.stop();
+      return;
+    }
+    void this.play(id, content);
+  }
+
+  stop() {
+    this.token++; // ignore any stale in-flight result
+    this.abortActive(); // and actually cancel the network request
+    if (this.audio) this.audio.pause();
+    this.state = 'idle';
+    this.currentId = null;
+    this.emit();
+  }
+
+  private abortActive() {
+    if (this.activeController) {
+      this.activeController.abort();
+      this.activeController = null;
+    }
+  }
+
+  private onEnded() {
+    this.state = 'idle';
+    this.currentId = null;
+    this.emit();
+    // (queue auto-advance would hook in here)
+  }
+
+  private setError(id: string, msg: string) {
+    this.state = 'idle';
+    this.currentId = id;
+    this.errorId = id;
+    this.errorMsg = msg;
+    this.emit();
+    if (this.errorTimer) clearTimeout(this.errorTimer);
+    this.errorTimer = setTimeout(() => {
+      if (this.errorId === id) {
+        this.errorId = null;
+        this.errorMsg = null;
+        if (this.currentId === id) this.currentId = null;
+        this.emit();
+      }
+    }, 6000);
+  }
+
+  private async play(id: string, content: string) {
+    const audio = this.ensureAudio();
+    audio.pause();
+    this.currentId = id;
+    this.errorId = null;
+    this.errorMsg = null;
+    this.state = 'loading';
+    this.emit();
+
+    const myToken = ++this.token;
+    this.abortActive(); // cancel any request this play supersedes
+
+    try {
+      let url = this.cache.get(id);
+      if (!url) {
+        const controller = new AbortController();
+        this.activeController = controller;
+        const timer = setTimeout(() => controller.abort(), CLIENT_TIMEOUT_MS);
+        const res = await synthesizeVoice(content, controller.signal).finally(() => {
+          clearTimeout(timer);
+          if (this.activeController === controller) this.activeController = null;
+        });
+        if (myToken !== this.token) return; // superseded by another play/stop
+        if (!res.ok) {
+          let msg = `Read-aloud failed (${res.status})`;
+          try {
+            const j = await res.json();
+            if (j?.error) msg = String(j.error);
+          } catch {
+            /* non-JSON error body */
+          }
+          throw new Error(msg);
+        }
+        const blob = await res.blob();
+        if (myToken !== this.token) return;
+        url = URL.createObjectURL(blob);
+        this.cacheSet(id, url);
+      }
+      if (myToken !== this.token) return;
+      audio.src = url;
+      audio.load();
+      await audio.play();
+      if (myToken !== this.token) return;
+      this.state = 'playing';
+      this.emit();
+    } catch (e) {
+      if (myToken !== this.token) return;
+      const aborted = e instanceof Error && e.name === 'AbortError';
+      this.setError(id, aborted ? 'Read-aloud timed out.' : e instanceof Error ? e.message : 'Read-aloud failed');
+    }
+  }
+
+  private cacheSet(id: string, url: string) {
+    this.cache.set(id, url);
+    while (this.cache.size > CACHE_MAX) {
+      const oldest = this.cache.keys().next().value as string | undefined;
+      if (oldest === undefined) break;
+      const oldUrl = this.cache.get(oldest);
+      this.cache.delete(oldest);
+      if (oldUrl && oldUrl !== this.audio?.src) URL.revokeObjectURL(oldUrl);
+    }
+  }
+}
+
+export const voicePlayer = new VoicePlayer();