fix(voice): harden recording and backend behavior

Redirects could bypass the backend URL guard, and TTS playback waited for full buffering.

Recording could overlap or finish after teardown. Controls also ignored backend readiness.

Explicit formats and config-aware cache keys prevent stale audio after settings change.
This commit is contained in:
Haileyesus
2026-06-25 16:35:30 +03:00
parent b0a49120cc
commit af16d8ebdc
5 changed files with 108 additions and 22 deletions

View File

@@ -8,6 +8,8 @@
// //
// Config is resolved per-request from headers (set by the client's voice settings), // Config is resolved per-request from headers (set by the client's voice settings),
// falling back to server env defaults. Mounted at /api/voice behind authenticateToken. // falling back to server env defaults. Mounted at /api/voice behind authenticateToken.
import { Readable } from 'node:stream';
import express from 'express'; import express from 'express';
const ENV = { const ENV = {
@@ -32,7 +34,7 @@ function resolveConfig(req) {
sttModel: String(h['x-voice-stt-model'] || '') || ENV.sttModel, sttModel: String(h['x-voice-stt-model'] || '') || ENV.sttModel,
ttsModel: String(h['x-voice-tts-model'] || '') || ENV.ttsModel, ttsModel: String(h['x-voice-tts-model'] || '') || ENV.ttsModel,
ttsVoice: String(h['x-voice-tts-voice'] || '') || ENV.ttsVoice, ttsVoice: String(h['x-voice-tts-voice'] || '') || ENV.ttsVoice,
ttsFormat: String(h['x-voice-tts-format'] || ''), ttsFormat: String(h['x-voice-tts-format'] || '').trim(),
}; };
} }
@@ -57,7 +59,7 @@ async function fetchWithTimeout(url, options = {}) {
const controller = new AbortController(); const controller = new AbortController();
const timer = setTimeout(() => controller.abort(), VOICE_TIMEOUT_MS); const timer = setTimeout(() => controller.abort(), VOICE_TIMEOUT_MS);
try { try {
return await fetch(url, { ...options, signal: controller.signal }); return await fetch(url, { redirect: 'manual', ...options, signal: controller.signal });
} finally { } finally {
clearTimeout(timer); clearTimeout(timer);
} }
@@ -206,7 +208,8 @@ router.post('/tts', async (req, res) => {
} }
res.setHeader('Content-Type', r.headers.get('content-type') || 'audio/mpeg'); res.setHeader('Content-Type', r.headers.get('content-type') || 'audio/mpeg');
res.setHeader('Cache-Control', 'no-store'); res.setHeader('Cache-Control', 'no-store');
res.send(Buffer.from(await r.arrayBuffer())); if (!r.body) return res.end();
Readable.fromWeb(r.body).on('error', (error) => res.destroy(error)).pipe(res);
} catch (e) { } catch (e) {
backendError(res, e); backendError(res, e);
} }

View File

@@ -1,11 +1,39 @@
import { useEffect, useState } from 'react'; import { useEffect, useState } from 'react';
import { authenticatedFetch } from '../../../utils/api';
import { VOICE_CONFIG_SYNC_EVENT, voiceConfigHeaders } from '../../../hooks/useVoiceConfig';
// Voice UI is gated on the `voiceEnabled` UI preference (toggled in Quick Settings / // Voice UI is gated on the `voiceEnabled` UI preference (toggled in Quick Settings /
// the Settings modal). This is a lightweight read-only view of that preference so the // the Settings modal) and a configured voice backend.
// mic/speak controls can hide themselves, kept in sync via the same events
// useUiPreferences emits. No server probe.
const STORAGE_KEY = 'uiPreferences'; const STORAGE_KEY = 'uiPreferences';
const SYNC_EVENT = 'ui-preferences:sync'; const SYNC_EVENT = 'ui-preferences:sync';
const healthCache = new Map<string, boolean>();
const healthRequests = new Map<string, Promise<boolean>>();
/**
 * Asks the server whether a voice backend is configured for the current base URL.
 *
 * Outcomes are memoised per base URL in `healthCache`, and callers that arrive while
 * a probe for the same base URL is still running receive that same promise from
 * `healthRequests`. A failed probe (non-OK status, rejected fetch, unreadable JSON)
 * rejects the returned promise and is not cached, so a later call tries again.
 *
 * @returns A promise that resolves to `true` only when the response JSON carries
 *   `configured === true`, and to `false` for any other successfully parsed body.
 */
function checkVoiceHealth(): Promise<boolean> {
  const baseUrl = voiceConfigHeaders()['x-voice-base-url'];
  // A missing base URL is keyed under the empty string.
  const cacheKey = baseUrl || '';

  if (healthCache.has(cacheKey)) {
    return Promise.resolve(healthCache.get(cacheKey) ?? false);
  }

  const inFlight = healthRequests.get(cacheKey);
  if (inFlight) {
    return inFlight;
  }

  const probe = authenticatedFetch('/api/voice/health', {
    headers: baseUrl ? { 'x-voice-base-url': baseUrl } : {},
  })
    .then(async (res) => {
      if (!res.ok) {
        throw new Error(`Voice health check failed (${res.status})`);
      }
      const payload = await res.json();
      return payload?.configured === true;
    })
    .then((isConfigured) => {
      // Only successful probes reach this stage, so failures are never memoised.
      healthCache.set(cacheKey, isConfigured);
      return isConfigured;
    })
    .finally(() => {
      // Settled either way: drop the in-flight entry so the next call can re-probe or hit the cache.
      healthRequests.delete(cacheKey);
    });

  // Registered synchronously so callers arriving before the probe settles share it.
  healthRequests.set(cacheKey, probe);
  return probe;
}
function readVoiceEnabled(): boolean { function readVoiceEnabled(): boolean {
try { try {
@@ -22,6 +50,7 @@ export function useVoiceAvailable(): boolean {
const [enabled, setEnabled] = useState<boolean>(() => const [enabled, setEnabled] = useState<boolean>(() =>
typeof window === 'undefined' ? false : readVoiceEnabled(), typeof window === 'undefined' ? false : readVoiceEnabled(),
); );
const [available, setAvailable] = useState(false);
useEffect(() => { useEffect(() => {
const update = () => setEnabled(readVoiceEnabled()); const update = () => setEnabled(readVoiceEnabled());
@@ -33,5 +62,31 @@ export function useVoiceAvailable(): boolean {
}; };
}, []); }, []);
return enabled; useEffect(() => {
let active = true;
let requestId = 0;
const check = async () => {
if (!enabled) {
setAvailable(false);
return;
}
const id = ++requestId;
try {
const result = await checkVoiceHealth();
if (active && id === requestId) setAvailable(result);
} catch {
if (active && id === requestId) setAvailable(false);
}
};
void check();
window.addEventListener(VOICE_CONFIG_SYNC_EVENT, check);
return () => {
active = false;
window.removeEventListener(VOICE_CONFIG_SYNC_EVENT, check);
};
}, [enabled]);
return enabled && available;
} }

View File

@@ -1,4 +1,5 @@
import { useCallback, useEffect, useRef, useState } from 'react'; import { useCallback, useEffect, useRef, useState } from 'react';
import { authenticatedFetch } from '../../../utils/api'; import { authenticatedFetch } from '../../../utils/api';
import { voiceConfigHeaders } from '../../../hooks/useVoiceConfig'; import { voiceConfigHeaders } from '../../../hooks/useVoiceConfig';
@@ -37,6 +38,8 @@ export function useVoiceInput(
const recorderRef = useRef<MediaRecorder | null>(null); const recorderRef = useRef<MediaRecorder | null>(null);
const chunksRef = useRef<Blob[]>([]); const chunksRef = useRef<Blob[]>([]);
const streamRef = useRef<MediaStream | null>(null); const streamRef = useRef<MediaStream | null>(null);
const cancelledRef = useRef(false);
const startingRef = useRef(false);
// Whether the in-progress stop should auto-send the transcript (vs just fill the box). // Whether the in-progress stop should auto-send the transcript (vs just fill the box).
const sendRef = useRef(false); const sendRef = useRef(false);
@@ -47,7 +50,10 @@ export function useVoiceInput(
// Stop the mic if the component unmounts mid-recording. // Stop the mic if the component unmounts mid-recording.
useEffect(() => { useEffect(() => {
cancelledRef.current = false;
return () => { return () => {
cancelledRef.current = true;
startingRef.current = false;
streamRef.current?.getTracks().forEach((t) => t.stop()); streamRef.current?.getTracks().forEach((t) => t.stop());
streamRef.current = null; streamRef.current = null;
recorderRef.current = null; recorderRef.current = null;
@@ -55,10 +61,17 @@ export function useVoiceInput(
}, []); }, []);
const start = useCallback(async () => { const start = useCallback(async () => {
if (startingRef.current || (recorderRef.current && recorderRef.current.state !== 'inactive')) return;
startingRef.current = true;
let recordingCancelled = false;
try { try {
const stream = await navigator.mediaDevices.getUserMedia({ const stream = await navigator.mediaDevices.getUserMedia({
audio: { echoCancellation: true, noiseSuppression: true }, audio: { echoCancellation: true, noiseSuppression: true },
}); });
if (cancelledRef.current) {
stream.getTracks().forEach((t) => t.stop());
return;
}
streamRef.current = stream; streamRef.current = stream;
const mimeType = pickMime(); const mimeType = pickMime();
const rec = mimeType ? new MediaRecorder(stream, { mimeType }) : new MediaRecorder(stream); const rec = mimeType ? new MediaRecorder(stream, { mimeType }) : new MediaRecorder(stream);
@@ -71,6 +84,7 @@ export function useVoiceInput(
rec.onstop = async () => { rec.onstop = async () => {
stopTracks(); stopTracks();
if (recordingCancelled || cancelledRef.current) return;
// Capture and clear the send intent for this stop before any async work. // Capture and clear the send intent for this stop before any async work.
const shouldSend = sendRef.current; const shouldSend = sendRef.current;
sendRef.current = false; sendRef.current = false;
@@ -93,25 +107,34 @@ export function useVoiceInput(
}); });
if (!res.ok) throw new Error(`transcribe ${res.status}`); if (!res.ok) throw new Error(`transcribe ${res.status}`);
const data = await res.json(); const data = await res.json();
if (recordingCancelled || cancelledRef.current) return;
const text = String(data?.text || '').trim(); const text = String(data?.text || '').trim();
if (text) onTranscript(text, shouldSend); if (text) onTranscript(text, shouldSend);
else onError?.('No speech detected'); else onError?.('No speech detected');
} catch (e) { } catch (e) {
onError?.(`Transcription failed: ${e instanceof Error ? e.message : String(e)}`); if (!recordingCancelled && !cancelledRef.current) {
onError?.(`Transcription failed: ${e instanceof Error ? e.message : String(e)}`);
}
} finally { } finally {
setState('idle'); if (!recordingCancelled && !cancelledRef.current) setState('idle');
} }
}; };
rec.start(); rec.start();
setState('recording'); setState('recording');
} catch (e) { } catch (e) {
recordingCancelled = true;
recorderRef.current = null;
stopTracks();
if (cancelledRef.current) return;
const err = e as { name?: string; message?: string }; const err = e as { name?: string; message?: string };
let msg = `Mic error: ${err?.message || e}`; let msg = `Mic error: ${err?.message || e}`;
if (err?.name === 'NotAllowedError') msg = 'Microphone access denied.'; if (err?.name === 'NotAllowedError') msg = 'Microphone access denied.';
else if (err?.name === 'NotFoundError') msg = 'No microphone found.'; else if (err?.name === 'NotFoundError') msg = 'No microphone found.';
onError?.(msg); onError?.(msg);
setState('idle'); setState('idle');
} finally {
startingRef.current = false;
} }
}, [onTranscript, onError]); }, [onTranscript, onError]);

View File

@@ -10,16 +10,15 @@ export type VoiceConfig = {
}; };
const STORAGE_KEY = 'voiceConfig'; const STORAGE_KEY = 'voiceConfig';
const DEFAULTS: VoiceConfig = { baseUrl: '', apiKey: '', sttModel: '', ttsModel: '', ttsVoice: '', ttsFormat: 'mp3' }; export const VOICE_CONFIG_SYNC_EVENT = 'voice-config:sync';
const DEFAULTS: VoiceConfig = { baseUrl: '', apiKey: '', sttModel: '', ttsModel: '', ttsVoice: '', ttsFormat: '' };
function read(): VoiceConfig { function read(): VoiceConfig {
try { try {
const raw = localStorage.getItem(STORAGE_KEY); const raw = localStorage.getItem(STORAGE_KEY);
if (!raw) return { ...DEFAULTS }; if (!raw) return { ...DEFAULTS };
const parsed = JSON.parse(raw); const parsed = JSON.parse(raw);
const next = { ...DEFAULTS, ...(parsed && typeof parsed === 'object' ? parsed : {}) }; return { ...DEFAULTS, ...(parsed && typeof parsed === 'object' ? parsed : {}) };
if (!next.ttsFormat) next.ttsFormat = DEFAULTS.ttsFormat;
return next;
} catch { } catch {
return { ...DEFAULTS }; return { ...DEFAULTS };
} }
@@ -36,7 +35,7 @@ export function voiceConfigHeaders(): Record<string, string> {
if (c.sttModel) h['x-voice-stt-model'] = c.sttModel; if (c.sttModel) h['x-voice-stt-model'] = c.sttModel;
if (c.ttsModel) h['x-voice-tts-model'] = c.ttsModel; if (c.ttsModel) h['x-voice-tts-model'] = c.ttsModel;
if (c.ttsVoice) h['x-voice-tts-voice'] = c.ttsVoice; if (c.ttsVoice) h['x-voice-tts-voice'] = c.ttsVoice;
if (c.ttsFormat) h['x-voice-tts-format'] = c.ttsFormat; if (c.ttsFormat.trim()) h['x-voice-tts-format'] = c.ttsFormat.trim();
return h; return h;
} }
@@ -49,7 +48,11 @@ export function useVoiceConfig() {
setConfig((prev) => { setConfig((prev) => {
const next = { ...prev, ...patch }; const next = { ...prev, ...patch };
try { try {
localStorage.setItem(STORAGE_KEY, JSON.stringify(next)); const stored: Partial<VoiceConfig> = { ...next };
if (next.ttsFormat.trim()) stored.ttsFormat = next.ttsFormat.trim();
else delete stored.ttsFormat;
localStorage.setItem(STORAGE_KEY, JSON.stringify(stored));
window.dispatchEvent(new Event(VOICE_CONFIG_SYNC_EVENT));
} catch { } catch {
/* ignore persistence errors */ /* ignore persistence errors */
} }

View File

@@ -15,10 +15,11 @@ const IDLE: VoiceSnapshot = { state: 'idle', error: null };
const CACHE_MAX = 24; const CACHE_MAX = 24;
const CLIENT_TIMEOUT_MS = 330000; // backstop; the server proxy already times out at 5 min const CLIENT_TIMEOUT_MS = 330000; // backstop; the server proxy already times out at 5 min
// Stable id / cache key from a message's text (djb2). // Stable id / cache key from the text and voice settings that affect its audio (djb2).
export function voiceId(content: string): string { export function voiceId(content: string, headers = voiceConfigHeaders()): string {
const input = JSON.stringify([content, Object.entries(headers).sort(([a], [b]) => a.localeCompare(b))]);
let h = 5381; let h = 5381;
for (let i = 0; i < content.length; i++) h = (((h << 5) + h) + content.charCodeAt(i)) | 0; for (let i = 0; i < input.length; i++) h = (((h << 5) + h) + input.charCodeAt(i)) | 0;
return (h >>> 0).toString(36); return (h >>> 0).toString(36);
} }
@@ -81,12 +82,13 @@ class VoicePlayer {
} }
toggle(content: string) { toggle(content: string) {
const id = voiceId(content); const headers = voiceConfigHeaders();
const id = voiceId(content, headers);
if (this.currentId === id && (this.state === 'playing' || this.state === 'loading')) { if (this.currentId === id && (this.state === 'playing' || this.state === 'loading')) {
this.stop(); this.stop();
return; return;
} }
void this.play(id, content); void this.play(id, content, headers);
} }
stop() { stop() {
@@ -129,7 +131,7 @@ class VoicePlayer {
}, 6000); }, 6000);
} }
private async play(id: string, content: string) { private async play(id: string, content: string, headers: Record<string, string>) {
const audio = this.ensureAudio(); const audio = this.ensureAudio();
audio.pause(); audio.pause();
this.currentId = id; this.currentId = id;
@@ -150,7 +152,7 @@ class VoicePlayer {
const res = await authenticatedFetch('/api/voice/tts', { const res = await authenticatedFetch('/api/voice/tts', {
method: 'POST', method: 'POST',
body: JSON.stringify({ text: content }), body: JSON.stringify({ text: content }),
headers: voiceConfigHeaders(), headers,
signal: controller.signal, signal: controller.signal,
}).finally(() => { }).finally(() => {
clearTimeout(timer); clearTimeout(timer);