fix(voice): harden recording and backend behavior

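Upstream redirects could bypass the backend URL guard, so proxy fetches now default to `redirect: 'manual'`. TTS playback waited for the whole response to buffer; the proxy now streams the upstream body to the client. Recording could start twice in parallel or finish after the component was torn down; both cases are now guarded. Voice controls ignored backend readiness and now also require `/api/voice/health` to report a configured backend. The TTS format is sent only when explicitly configured, and playback cache keys include the voice settings, so changing settings no longer replays stale audio.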
2026-06-26 05:15:48 +08:00 · 2026-06-25 16:35:30 +03:00
parent b0a49120cc
commit af16d8ebdc
5 changed files with 108 additions and 22 deletions
--- a/server/voice-proxy.js
+++ b/server/voice-proxy.js
@@ -8,6 +8,8 @@
 //
 // Config is resolved per-request from headers (set by the client's voice settings),
 // falling back to server env defaults. Mounted at /api/voice behind authenticateToken.
 import { Readable } from 'node:stream';
 import express from 'express';
 const ENV = {
@@ -32,7 +34,7 @@ function resolveConfig(req) {
    sttModel: String(h['x-voice-stt-model'] || '') || ENV.sttModel,
    ttsModel: String(h['x-voice-tts-model'] || '') || ENV.ttsModel,
    ttsVoice: String(h['x-voice-tts-voice'] || '') || ENV.ttsVoice,
-    ttsFormat: String(h['x-voice-tts-format'] || ''),
+    ttsFormat: String(h['x-voice-tts-format'] || '').trim(),
  };
 }
@@ -57,7 +59,7 @@ async function fetchWithTimeout(url, options = {}) {
  const controller = new AbortController();
  const timer = setTimeout(() => controller.abort(), VOICE_TIMEOUT_MS);
  try {
-    return await fetch(url, { ...options, signal: controller.signal });
+    return await fetch(url, { redirect: 'manual', ...options, signal: controller.signal });
  } finally {
    clearTimeout(timer);
  }
@@ -206,7 +208,8 @@ router.post('/tts', async (req, res) => {
    }
    res.setHeader('Content-Type', r.headers.get('content-type') || 'audio/mpeg');
    res.setHeader('Cache-Control', 'no-store');
-    res.send(Buffer.from(await r.arrayBuffer()));
+    if (!r.body) return res.end();
    Readable.fromWeb(r.body).on('error', (error) => res.destroy(error)).pipe(res);
  } catch (e) {
    backendError(res, e);
  }
--- a/src/components/chat/hooks/useVoiceAvailable.ts
+++ b/src/components/chat/hooks/useVoiceAvailable.ts
@@ -1,11 +1,39 @@
 import { useEffect, useState } from 'react';
 import { authenticatedFetch } from '../../../utils/api';
 import { VOICE_CONFIG_SYNC_EVENT, voiceConfigHeaders } from '../../../hooks/useVoiceConfig';
 // Voice UI is gated on the `voiceEnabled` UI preference (toggled in Quick Settings /
-// the Settings modal). This is a lightweight read-only view of that preference so the
+// the Settings modal) and a configured voice backend.
 // The mic/speak controls can hide themselves unless both hold. The preference is kept in sync
 // via the same events useUiPreferences emits; the backend is probed through /api/voice/health.
 const STORAGE_KEY = 'uiPreferences';
 const SYNC_EVENT = 'ui-preferences:sync';
 const healthCache = new Map<string, boolean>();
 const healthRequests = new Map<string, Promise<boolean>>();
 // Resolves to whether /api/voice/health reports `configured: true` for the current
 // base URL override. Settled answers are reused and concurrent callers share one probe.
 // Rejects when the health endpoint answers with a non-OK status.
 function checkVoiceHealth(): Promise<boolean> {
  // Results are keyed by the client's base URL override ('' when none is set).
  const baseUrl = voiceConfigHeaders()['x-voice-base-url'];
  const cacheKey = baseUrl || '';
  // A settled answer for this key is reused without another request.
  const known = healthCache.get(cacheKey);
  if (known !== undefined) return Promise.resolve(known);
  // Callers asking while a probe is in flight share that probe.
  const inFlight = healthRequests.get(cacheKey);
  if (inFlight) return inFlight;
  const probe = authenticatedFetch('/api/voice/health', {
    headers: baseUrl ? { 'x-voice-base-url': baseUrl } : {},
  })
    .then(async (res) => {
      if (!res.ok) throw new Error(`Voice health check failed (${res.status})`);
      const payload = await res.json();
      return payload?.configured === true;
    })
    .then((configured) => {
      // Only a completed probe is remembered; a rejection leaves the cache untouched.
      healthCache.set(cacheKey, configured);
      return configured;
    })
    .finally(() => {
      // Success or failure, the in-flight slot is released so a later call can probe again.
      healthRequests.delete(cacheKey);
    });
  healthRequests.set(cacheKey, probe);
  return probe;
 }
 function readVoiceEnabled(): boolean {
  try {
@@ -22,6 +50,7 @@ export function useVoiceAvailable(): boolean {
  const [enabled, setEnabled] = useState<boolean>(() =>
    typeof window === 'undefined' ? false : readVoiceEnabled(),
  );
  const [available, setAvailable] = useState(false);
  useEffect(() => {
    const update = () => setEnabled(readVoiceEnabled());
@@ -33,5 +62,31 @@ export function useVoiceAvailable(): boolean {
    };
  }, []);
-  return enabled;
+  useEffect(() => {
    let active = true;
    let requestId = 0;
    const check = async () => {
      if (!enabled) {
        setAvailable(false);
        return;
      }
      const id = ++requestId;
      try {
        const result = await checkVoiceHealth();
        if (active && id === requestId) setAvailable(result);
      } catch {
        if (active && id === requestId) setAvailable(false);
      }
    };
    void check();
    window.addEventListener(VOICE_CONFIG_SYNC_EVENT, check);
    return () => {
      active = false;
      window.removeEventListener(VOICE_CONFIG_SYNC_EVENT, check);
    };
  }, [enabled]);
  return enabled && available;
 }
--- a/src/components/chat/hooks/useVoiceInput.ts
+++ b/src/components/chat/hooks/useVoiceInput.ts
@@ -1,4 +1,5 @@
 import { useCallback, useEffect, useRef, useState } from 'react';
 import { authenticatedFetch } from '../../../utils/api';
 import { voiceConfigHeaders } from '../../../hooks/useVoiceConfig';
@@ -37,6 +38,8 @@ export function useVoiceInput(
  const recorderRef = useRef<MediaRecorder | null>(null);
  const chunksRef = useRef<Blob[]>([]);
  const streamRef = useRef<MediaStream | null>(null);
  const cancelledRef = useRef(false);
  const startingRef = useRef(false);
  // Whether the in-progress stop should auto-send the transcript (vs just fill the box).
  const sendRef = useRef(false);
@@ -47,7 +50,10 @@ export function useVoiceInput(
  // Stop the mic if the component unmounts mid-recording.
  useEffect(() => {
    cancelledRef.current = false;
    return () => {
      cancelledRef.current = true;
      startingRef.current = false;
      streamRef.current?.getTracks().forEach((t) => t.stop());
      streamRef.current = null;
      recorderRef.current = null;
@@ -55,10 +61,17 @@ export function useVoiceInput(
  }, []);
  const start = useCallback(async () => {
    if (startingRef.current || (recorderRef.current && recorderRef.current.state !== 'inactive')) return;
    startingRef.current = true;
    let recordingCancelled = false;
    try {
      const stream = await navigator.mediaDevices.getUserMedia({
        audio: { echoCancellation: true, noiseSuppression: true },
      });
      if (cancelledRef.current) {
        stream.getTracks().forEach((t) => t.stop());
        return;
      }
      streamRef.current = stream;
      const mimeType = pickMime();
      const rec = mimeType ? new MediaRecorder(stream, { mimeType }) : new MediaRecorder(stream);
@@ -71,6 +84,7 @@ export function useVoiceInput(
      rec.onstop = async () => {
        stopTracks();
        if (recordingCancelled || cancelledRef.current) return;
        // Capture and clear the send intent for this stop before any async work.
        const shouldSend = sendRef.current;
        sendRef.current = false;
@@ -93,25 +107,34 @@ export function useVoiceInput(
          });
          if (!res.ok) throw new Error(`transcribe ${res.status}`);
          const data = await res.json();
          if (recordingCancelled || cancelledRef.current) return;
          const text = String(data?.text || '').trim();
          if (text) onTranscript(text, shouldSend);
          else onError?.('No speech detected');
        } catch (e) {
-          onError?.(`Transcription failed: ${e instanceof Error ? e.message : String(e)}`);
+          if (!recordingCancelled && !cancelledRef.current) {
            onError?.(`Transcription failed: ${e instanceof Error ? e.message : String(e)}`);
          }
        } finally {
-          setState('idle');
+          if (!recordingCancelled && !cancelledRef.current) setState('idle');
        }
      };
      rec.start();
      setState('recording');
    } catch (e) {
      recordingCancelled = true;
      recorderRef.current = null;
      stopTracks();
      if (cancelledRef.current) return;
      const err = e as { name?: string; message?: string };
      let msg = `Mic error: ${err?.message || e}`;
      if (err?.name === 'NotAllowedError') msg = 'Microphone access denied.';
      else if (err?.name === 'NotFoundError') msg = 'No microphone found.';
      onError?.(msg);
      setState('idle');
    } finally {
      startingRef.current = false;
    }
  }, [onTranscript, onError]);
--- a/src/hooks/useVoiceConfig.ts
+++ b/src/hooks/useVoiceConfig.ts
@@ -10,16 +10,15 @@ export type VoiceConfig = {
 };
 const STORAGE_KEY = 'voiceConfig';
-const DEFAULTS: VoiceConfig = { baseUrl: '', apiKey: '', sttModel: '', ttsModel: '', ttsVoice: '', ttsFormat: 'mp3' };
+export const VOICE_CONFIG_SYNC_EVENT = 'voice-config:sync';
 const DEFAULTS: VoiceConfig = { baseUrl: '', apiKey: '', sttModel: '', ttsModel: '', ttsVoice: '', ttsFormat: '' };
 function read(): VoiceConfig {
  try {
    const raw = localStorage.getItem(STORAGE_KEY);
    if (!raw) return { ...DEFAULTS };
    const parsed = JSON.parse(raw);
-    const next = { ...DEFAULTS, ...(parsed && typeof parsed === 'object' ? parsed : {}) };
+    return { ...DEFAULTS, ...(parsed && typeof parsed === 'object' ? parsed : {}) };
    if (!next.ttsFormat) next.ttsFormat = DEFAULTS.ttsFormat;
    return next;
  } catch {
    return { ...DEFAULTS };
  }
@@ -36,7 +35,7 @@ export function voiceConfigHeaders(): Record<string, string> {
  if (c.sttModel) h['x-voice-stt-model'] = c.sttModel;
  if (c.ttsModel) h['x-voice-tts-model'] = c.ttsModel;
  if (c.ttsVoice) h['x-voice-tts-voice'] = c.ttsVoice;
-  if (c.ttsFormat) h['x-voice-tts-format'] = c.ttsFormat;
+  if (c.ttsFormat.trim()) h['x-voice-tts-format'] = c.ttsFormat.trim();
  return h;
 }
@@ -49,7 +48,11 @@ export function useVoiceConfig() {
    setConfig((prev) => {
      const next = { ...prev, ...patch };
      try {
-        localStorage.setItem(STORAGE_KEY, JSON.stringify(next));
+        const stored: Partial<VoiceConfig> = { ...next };
        if (next.ttsFormat.trim()) stored.ttsFormat = next.ttsFormat.trim();
        else delete stored.ttsFormat;
        localStorage.setItem(STORAGE_KEY, JSON.stringify(stored));
        window.dispatchEvent(new Event(VOICE_CONFIG_SYNC_EVENT));
      } catch {
        /* ignore persistence errors */
      }
--- a/src/lib/voicePlayer.ts
+++ b/src/lib/voicePlayer.ts
@@ -15,10 +15,11 @@ const IDLE: VoiceSnapshot = { state: 'idle', error: null };
 const CACHE_MAX = 24;
 const CLIENT_TIMEOUT_MS = 330000; // backstop; the server proxy already times out at 5 min
-// Stable id / cache key from a message's text (djb2).
+// Stable id / cache key from the text and voice settings that affect its audio (djb2).
-export function voiceId(content: string): string {
+export function voiceId(content: string, headers = voiceConfigHeaders()): string {
  const input = JSON.stringify([content, Object.entries(headers).sort(([a], [b]) => a.localeCompare(b))]);
  let h = 5381;
-  for (let i = 0; i < content.length; i++) h = (((h << 5) + h) + content.charCodeAt(i)) | 0;
+  for (let i = 0; i < input.length; i++) h = (((h << 5) + h) + input.charCodeAt(i)) | 0;
  return (h >>> 0).toString(36);
 }
@@ -81,12 +82,13 @@ class VoicePlayer {
  }
  toggle(content: string) {
-    const id = voiceId(content);
+    const headers = voiceConfigHeaders();
    const id = voiceId(content, headers);
    if (this.currentId === id && (this.state === 'playing' || this.state === 'loading')) {
      this.stop();
      return;
    }
-    void this.play(id, content);
+    void this.play(id, content, headers);
  }
  stop() {
@@ -129,7 +131,7 @@ class VoicePlayer {
    }, 6000);
  }
-  private async play(id: string, content: string) {
+  private async play(id: string, content: string, headers: Record<string, string>) {
    const audio = this.ensureAudio();
    audio.pause();
    this.currentId = id;
@@ -150,7 +152,7 @@ class VoicePlayer {
        const res = await authenticatedFetch('/api/voice/tts', {
          method: 'POST',
          body: JSON.stringify({ text: content }),
-          headers: voiceConfigHeaders(),
+          headers,
          signal: controller.signal,
        }).finally(() => {
          clearTimeout(timer);