From af16d8ebdcdaabbfa25f3c82f76a6e16a1b30180 Mon Sep 17 00:00:00 2001 From: Haileyesus <118998054+blackmammoth@users.noreply.github.com> Date: Thu, 25 Jun 2026 16:35:30 +0300 Subject: [PATCH] fix(voice): harden recording and backend behavior Redirects could bypass the backend URL guard, and TTS playback waited for full buffering. Recording could overlap or finish after teardown. Controls also ignored backend readiness. Explicit formats and config-aware cache keys prevent stale audio after settings change. --- server/voice-proxy.js | 9 ++- .../chat/hooks/useVoiceAvailable.ts | 63 +++++++++++++++++-- src/components/chat/hooks/useVoiceInput.ts | 27 +++++++- src/hooks/useVoiceConfig.ts | 15 +++-- src/lib/voicePlayer.ts | 16 ++--- 5 files changed, 108 insertions(+), 22 deletions(-) diff --git a/server/voice-proxy.js b/server/voice-proxy.js index a628ce63..304535a2 100644 --- a/server/voice-proxy.js +++ b/server/voice-proxy.js @@ -8,6 +8,8 @@ // // Config is resolved per-request from headers (set by the client's voice settings), // falling back to server env defaults. Mounted at /api/voice behind authenticateToken. +import { Readable } from 'node:stream'; + import express from 'express'; const ENV = { @@ -32,7 +34,7 @@ function resolveConfig(req) { sttModel: String(h['x-voice-stt-model'] || '') || ENV.sttModel, ttsModel: String(h['x-voice-tts-model'] || '') || ENV.ttsModel, ttsVoice: String(h['x-voice-tts-voice'] || '') || ENV.ttsVoice, - ttsFormat: String(h['x-voice-tts-format'] || ''), + ttsFormat: String(h['x-voice-tts-format'] || '').trim(), }; } @@ -57,7 +59,7 @@ async function fetchWithTimeout(url, options = {}) { const controller = new AbortController(); const timer = setTimeout(() => controller.abort(), VOICE_TIMEOUT_MS); try { - return await fetch(url, { ...options, signal: controller.signal }); + return await fetch(url, { redirect: 'manual', ...options, signal: controller.signal }); } finally { clearTimeout(timer); } @@ -206,7 +208,8 @@ router.post('/tts', async (req, res) => { } res.setHeader('Content-Type', r.headers.get('content-type') || 'audio/mpeg'); res.setHeader('Cache-Control', 'no-store'); - res.send(Buffer.from(await r.arrayBuffer())); + if (!r.body) return res.end(); + Readable.fromWeb(r.body).on('error', (error) => res.destroy(error)).pipe(res); } catch (e) { backendError(res, e); } diff --git a/src/components/chat/hooks/useVoiceAvailable.ts b/src/components/chat/hooks/useVoiceAvailable.ts index 388411f0..df5fe533 100644 --- a/src/components/chat/hooks/useVoiceAvailable.ts +++ b/src/components/chat/hooks/useVoiceAvailable.ts @@ -1,11 +1,39 @@ import { useEffect, useState } from 'react'; +import { authenticatedFetch } from '../../../utils/api'; +import { VOICE_CONFIG_SYNC_EVENT, voiceConfigHeaders } from '../../../hooks/useVoiceConfig'; + // Voice UI is gated on the `voiceEnabled` UI preference (toggled in Quick Settings / -// the Settings modal). This is a lightweight read-only view of that preference so the -// mic/speak controls can hide themselves, kept in sync via the same events -// useUiPreferences emits. No server probe. +// the Settings modal) and a configured voice backend. const STORAGE_KEY = 'uiPreferences'; const SYNC_EVENT = 'ui-preferences:sync'; +const healthCache = new Map(); +const healthRequests = new Map>(); + +function checkVoiceHealth(): Promise { + const baseUrl = voiceConfigHeaders()['x-voice-base-url']; + const signature = baseUrl || ''; + if (healthCache.has(signature)) return Promise.resolve(healthCache.get(signature) ?? false); + const pending = healthRequests.get(signature); + if (pending) return pending; + const request = authenticatedFetch('/api/voice/health', { + headers: baseUrl ? { 'x-voice-base-url': baseUrl } : {}, + }) + .then(async (response) => { + if (!response.ok) throw new Error(`Voice health check failed (${response.status})`); + const data = await response.json(); + return data?.configured === true; + }) + .then((available) => { + healthCache.set(signature, available); + return available; + }) + .finally(() => { + healthRequests.delete(signature); + }); + healthRequests.set(signature, request); + return request; +} function readVoiceEnabled(): boolean { try { @@ -22,6 +50,7 @@ export function useVoiceAvailable(): boolean { const [enabled, setEnabled] = useState(() => typeof window === 'undefined' ? false : readVoiceEnabled(), ); + const [available, setAvailable] = useState(false); useEffect(() => { const update = () => setEnabled(readVoiceEnabled()); @@ -33,5 +62,31 @@ export function useVoiceAvailable(): boolean { }; }, []); - return enabled; + useEffect(() => { + let active = true; + let requestId = 0; + + const check = async () => { + if (!enabled) { + setAvailable(false); + return; + } + const id = ++requestId; + try { + const result = await checkVoiceHealth(); + if (active && id === requestId) setAvailable(result); + } catch { + if (active && id === requestId) setAvailable(false); + } + }; + + void check(); + window.addEventListener(VOICE_CONFIG_SYNC_EVENT, check); + return () => { + active = false; + window.removeEventListener(VOICE_CONFIG_SYNC_EVENT, check); + }; + }, [enabled]); + + return enabled && available; } diff --git a/src/components/chat/hooks/useVoiceInput.ts b/src/components/chat/hooks/useVoiceInput.ts index f344fa81..706fd6c4 100644 --- a/src/components/chat/hooks/useVoiceInput.ts +++ b/src/components/chat/hooks/useVoiceInput.ts @@ -1,4 +1,5 @@ import { useCallback, useEffect, useRef, useState } from 'react'; + import { authenticatedFetch } from '../../../utils/api'; import { voiceConfigHeaders } from '../../../hooks/useVoiceConfig'; @@ -37,6 +38,8 @@ export function useVoiceInput( const recorderRef = useRef(null); const chunksRef = useRef([]); const streamRef = useRef(null); + const cancelledRef = useRef(false); + const startingRef = useRef(false); // Whether the in-progress stop should auto-send the transcript (vs just fill the box). const sendRef = useRef(false); @@ -47,7 +50,10 @@ export function useVoiceInput( // Stop the mic if the component unmounts mid-recording. useEffect(() => { + cancelledRef.current = false; return () => { + cancelledRef.current = true; + startingRef.current = false; streamRef.current?.getTracks().forEach((t) => t.stop()); streamRef.current = null; recorderRef.current = null; @@ -55,10 +61,17 @@ export function useVoiceInput( }, []); const start = useCallback(async () => { + if (startingRef.current || (recorderRef.current && recorderRef.current.state !== 'inactive')) return; + startingRef.current = true; + let recordingCancelled = false; try { const stream = await navigator.mediaDevices.getUserMedia({ audio: { echoCancellation: true, noiseSuppression: true }, }); + if (cancelledRef.current) { + stream.getTracks().forEach((t) => t.stop()); + return; + } streamRef.current = stream; const mimeType = pickMime(); const rec = mimeType ? new MediaRecorder(stream, { mimeType }) : new MediaRecorder(stream); @@ -71,6 +84,7 @@ export function useVoiceInput( rec.onstop = async () => { stopTracks(); + if (recordingCancelled || cancelledRef.current) return; // Capture and clear the send intent for this stop before any async work. const shouldSend = sendRef.current; sendRef.current = false; @@ -93,25 +107,34 @@ export function useVoiceInput( }); if (!res.ok) throw new Error(`transcribe ${res.status}`); const data = await res.json(); + if (recordingCancelled || cancelledRef.current) return; const text = String(data?.text || '').trim(); if (text) onTranscript(text, shouldSend); else onError?.('No speech detected'); } catch (e) { - onError?.(`Transcription failed: ${e instanceof Error ? e.message : String(e)}`); + if (!recordingCancelled && !cancelledRef.current) { + onError?.(`Transcription failed: ${e instanceof Error ? e.message : String(e)}`); + } } finally { - setState('idle'); + if (!recordingCancelled && !cancelledRef.current) setState('idle'); } }; rec.start(); setState('recording'); } catch (e) { + recordingCancelled = true; + recorderRef.current = null; + stopTracks(); + if (cancelledRef.current) return; const err = e as { name?: string; message?: string }; let msg = `Mic error: ${err?.message || e}`; if (err?.name === 'NotAllowedError') msg = 'Microphone access denied.'; else if (err?.name === 'NotFoundError') msg = 'No microphone found.'; onError?.(msg); setState('idle'); + } finally { + startingRef.current = false; } }, [onTranscript, onError]); diff --git a/src/hooks/useVoiceConfig.ts b/src/hooks/useVoiceConfig.ts index 9e3e2551..77e22546 100644 --- a/src/hooks/useVoiceConfig.ts +++ b/src/hooks/useVoiceConfig.ts @@ -10,16 +10,15 @@ export type VoiceConfig = { }; const STORAGE_KEY = 'voiceConfig'; -const DEFAULTS: VoiceConfig = { baseUrl: '', apiKey: '', sttModel: '', ttsModel: '', ttsVoice: '', ttsFormat: 'mp3' }; +export const VOICE_CONFIG_SYNC_EVENT = 'voice-config:sync'; +const DEFAULTS: VoiceConfig = { baseUrl: '', apiKey: '', sttModel: '', ttsModel: '', ttsVoice: '', ttsFormat: '' }; function read(): VoiceConfig { try { const raw = localStorage.getItem(STORAGE_KEY); if (!raw) return { ...DEFAULTS }; const parsed = JSON.parse(raw); - const next = { ...DEFAULTS, ...(parsed && typeof parsed === 'object' ? parsed : {}) }; - if (!next.ttsFormat) next.ttsFormat = DEFAULTS.ttsFormat; - return next; + return { ...DEFAULTS, ...(parsed && typeof parsed === 'object' ? parsed : {}) }; } catch { return { ...DEFAULTS }; } @@ -36,7 +35,7 @@ export function voiceConfigHeaders(): Record { if (c.sttModel) h['x-voice-stt-model'] = c.sttModel; if (c.ttsModel) h['x-voice-tts-model'] = c.ttsModel; if (c.ttsVoice) h['x-voice-tts-voice'] = c.ttsVoice; - if (c.ttsFormat) h['x-voice-tts-format'] = c.ttsFormat; + if (c.ttsFormat.trim()) h['x-voice-tts-format'] = c.ttsFormat.trim(); return h; } @@ -49,7 +48,11 @@ export function useVoiceConfig() { setConfig((prev) => { const next = { ...prev, ...patch }; try { - localStorage.setItem(STORAGE_KEY, JSON.stringify(next)); + const stored: Partial = { ...next }; + if (next.ttsFormat.trim()) stored.ttsFormat = next.ttsFormat.trim(); + else delete stored.ttsFormat; + localStorage.setItem(STORAGE_KEY, JSON.stringify(stored)); + window.dispatchEvent(new Event(VOICE_CONFIG_SYNC_EVENT)); } catch { /* ignore persistence errors */ } diff --git a/src/lib/voicePlayer.ts b/src/lib/voicePlayer.ts index a51700d3..b09f5170 100644 --- a/src/lib/voicePlayer.ts +++ b/src/lib/voicePlayer.ts @@ -15,10 +15,11 @@ const IDLE: VoiceSnapshot = { state: 'idle', error: null }; const CACHE_MAX = 24; const CLIENT_TIMEOUT_MS = 330000; // backstop; the server proxy already times out at 5 min -// Stable id / cache key from a message's text (djb2). -export function voiceId(content: string): string { +// Stable id / cache key from the text and voice settings that affect its audio (djb2). +export function voiceId(content: string, headers = voiceConfigHeaders()): string { + const input = JSON.stringify([content, Object.entries(headers).sort(([a], [b]) => a.localeCompare(b))]); let h = 5381; - for (let i = 0; i < content.length; i++) h = (((h << 5) + h) + content.charCodeAt(i)) | 0; + for (let i = 0; i < input.length; i++) h = (((h << 5) + h) + input.charCodeAt(i)) | 0; return (h >>> 0).toString(36); } @@ -81,12 +82,13 @@ class VoicePlayer { } toggle(content: string) { - const id = voiceId(content); + const headers = voiceConfigHeaders(); + const id = voiceId(content, headers); if (this.currentId === id && (this.state === 'playing' || this.state === 'loading')) { this.stop(); return; } - void this.play(id, content); + void this.play(id, content, headers); } stop() { @@ -129,7 +131,7 @@ class VoicePlayer { }, 6000); } - private async play(id: string, content: string) { + private async play(id: string, content: string, headers: Record) { const audio = this.ensureAudio(); audio.pause(); this.currentId = id; @@ -150,7 +152,7 @@ class VoicePlayer { const res = await authenticatedFetch('/api/voice/tts', { method: 'POST', body: JSON.stringify({ text: content }), - headers: voiceConfigHeaders(), + headers, signal: controller.signal, }).finally(() => { clearTimeout(timer);