fix(voice): play read-aloud through an app-level player to stop cutoffs

Read-aloud now runs in a single module-level player that lives outside the React tree, instead of in per-message component state. Switching chats or re-rendering a message no longer revokes the blob URL mid-play (the 'Invalid URI' cutoff). Also adds content-keyed caching, so re-listening to a message doesn't regenerate its audio, and reuses one audio element, so iOS playback only has to be unlocked once.
2026-06-25 04:13:51 +08:00 · 2026-06-09 15:19:36 +01:00
parent 32a6405537
commit cb3ad16139
2 changed files with 203 additions and 107 deletions
--- a/src/components/chat/hooks/useTts.ts
+++ b/src/components/chat/hooks/useTts.ts
@@ -1,119 +1,33 @@
-import { useCallback, useEffect, useRef, useState } from 'react';
-import { authenticatedFetch } from '../../../utils/api';
-import { voiceConfigHeaders } from '../../../hooks/useVoiceConfig';
+import { useCallback, useEffect, useState } from 'react';
+import { voicePlayer, voiceId, type VoiceSnapshot } from '../../../lib/voicePlayer';

-// Only one message speaks at a time across the whole app.
-let stopActive: (() => void) | null = null;
-
-export type TtsState = 'idle' | 'loading' | 'playing';
+export type TtsState = VoiceSnapshot['state'];

 /**
- * Tap-to-speak for a single message. Sends raw markdown to /api/voice/tts and plays
- * the returned audio. Manual-gesture only (v1) to satisfy iOS autoplay. Exposes the
- * last error (e.g. a backend timeout) so the control can surface it.
+ * Thin adapter over the app-level voicePlayer. Playback lives outside React (see
+ * lib/voicePlayer), so switching chats or re-rendering a message no longer cuts the
+ * audio off. This hook just reflects the player's state for one message and forwards taps.
 */
 export function useTts(getText: () => string) {
-  const [state, setState] = useState<TtsState>('idle');
-  const [error, setError] = useState<string | null>(null);
-  const audioRef = useRef<HTMLAudioElement | null>(null);
-  const urlRef = useRef<string | null>(null);
-  const errorTimer = useRef<ReturnType<typeof setTimeout> | null>(null);
+  const content = getText();
+  const id = voiceId(content);

-  const reset = useCallback(() => {
-    if (audioRef.current) {
-      audioRef.current.onended = null;
-      audioRef.current.onerror = null;
-      audioRef.current.pause();
-      audioRef.current.src = '';
-      audioRef.current = null;
-    }
-    if (urlRef.current) {
-      URL.revokeObjectURL(urlRef.current);
-      urlRef.current = null;
-    }
-  }, []);
+  const [snap, setSnap] = useState<VoiceSnapshot>(() => voicePlayer.getSnapshot(id));

-  const stop = useCallback(() => {
-    reset();
-    setState('idle');
-    if (stopActive) stopActive = null;
-  }, [reset]);
-
-  const showError = useCallback((msg: string) => {
-    setError(msg);
-    if (errorTimer.current) clearTimeout(errorTimer.current);
-    errorTimer.current = setTimeout(() => setError(null), 6000);
-  }, []);
-
-  // Cleanup on unmount: drop the global stop handler if it points at us, then reset.
-  useEffect(
-    () => () => {
-      if (stopActive === stop) stopActive = null;
-      if (errorTimer.current) clearTimeout(errorTimer.current);
-      reset();
-    },
-    [reset, stop],
-  );
-
-  const play = useCallback(async () => {
-    if (stopActive) stopActive();
-    const text = getText();
-    if (!text || !text.trim()) return;
-    setError(null);
-
-    // Create + "unlock" the audio element synchronously inside the click gesture,
-    // so iOS Safari lets us play it after the async fetch resolves.
-    const audio = new Audio();
-    audioRef.current = audio;
-    audio.onended = () => stop();
-    audio.onerror = () => stop();
-    try {
-      audio.play().catch(() => {});
-      audio.pause();
-    } catch {
-      /* unlock attempt; ignore */
-    }
-    stopActive = stop;
-    setState('loading');
-
-    try {
-      const res = await authenticatedFetch('/api/voice/tts', {
-        method: 'POST',
-        body: JSON.stringify({ text }),
-        headers: voiceConfigHeaders(),
+  useEffect(() => {
+    const update = () =>
+      setSnap((prev) => {
+        const next = voicePlayer.getSnapshot(id);
+        return prev.state === next.state && prev.error === next.error ? prev : next;
      });
-      if (!res.ok) {
-        let msg = `Read-aloud failed (${res.status})`;
-        try {
-          const j = await res.json();
-          if (j?.error) msg = String(j.error);
-        } catch {
-          /* non-JSON error body */
-        }
-        throw new Error(msg);
-      }
-      const blob = await res.blob();
-      const url = URL.createObjectURL(blob);
-      if (audioRef.current !== audio) {
-        URL.revokeObjectURL(url); // stopped while loading; don't leak the blob URL
-        return;
-      }
-      urlRef.current = url;
-      audio.src = url;
-      audio.load();
-      await audio.play();
-      setState('playing');
-    } catch (e) {
-      reset();
-      setState('idle');
-      showError(e instanceof Error ? e.message : 'Read-aloud failed');
-    }
-  }, [getText, reset, stop, showError]);
+    update();
+    return voicePlayer.subscribe(update);
+  }, [id]);

  const toggle = useCallback(() => {
-    if (state === 'playing' || state === 'loading') stop();
-    else play();
-  }, [state, play, stop]);
+    voicePlayer.unlock(); // synchronous, within the click gesture (iOS)
+    voicePlayer.toggle(content);
+  }, [content]);

-  return { state, toggle, error };
+  return { state: snap.state, toggle, error: snap.error };
 }
--- a/src/lib/voicePlayer.ts
+++ b/src/lib/voicePlayer.ts
@@ -0,0 +1,182 @@
+import { authenticatedFetch } from '../utils/api';
+import { voiceConfigHeaders } from '../hooks/useVoiceConfig';
+
+// A single app-level audio player for read-aloud. It owns one <audio> element, lives
+// outside the React tree, and caches generated audio by content. Because playback is not
+// tied to a component, switching chats or re-rendering a message can't revoke the blob URL
+// out from under it (the cause of mid-play cutoffs). v1 plays one message at a time
+// (a new play replaces the current one); the design leaves room for a queue later.
+
+export type VoicePlayState = 'idle' | 'loading' | 'playing';
+
+export type VoiceSnapshot = { state: VoicePlayState; error: string | null };
+
+const IDLE: VoiceSnapshot = { state: 'idle', error: null };
+const CACHE_MAX = 24;
+
+// Stable id / cache key from a message's text: 32-bit djb2 hash rendered in base 36 (collisions are possible).
+export function voiceId(content: string): string {
+  let h = 5381;
+  for (let i = 0; i < content.length; i++) h = (((h << 5) + h) + content.charCodeAt(i)) | 0;
+  return (h >>> 0).toString(36);
+}
+
+class VoicePlayer {
+  private audio: HTMLAudioElement | null = null;
+  private unlocked = false;
+  private cache = new Map<string, string>(); // id -> blob URL (evicted oldest-inserted first; reads don't refresh order)
+  private currentId: string | null = null;
+  private state: VoicePlayState = 'idle';
+  private errorId: string | null = null;
+  private errorMsg: string | null = null;
+  private token = 0; // bumped on each play/stop so superseded in-flight requests are ignored
+  private errorTimer: ReturnType<typeof setTimeout> | null = null;
+  private listeners = new Set<() => void>();
+
+  subscribe(listener: () => void): () => void {
+    this.listeners.add(listener);
+    return () => {
+      this.listeners.delete(listener);
+    };
+  }
+
+  private emit() {
+    this.listeners.forEach((l) => l());
+  }
+
+  getSnapshot(id: string): VoiceSnapshot {
+    const state = this.currentId === id ? this.state : 'idle';
+    const error = this.errorId === id ? this.errorMsg : null;
+    if (state === 'idle' && error === null) return IDLE;
+    return { state, error };
+  }
+
+  private ensureAudio(): HTMLAudioElement {
+    if (!this.audio) {
+      const audio = new Audio();
+      audio.addEventListener('ended', () => this.onEnded());
+      audio.addEventListener('error', () => {
+        // Only meaningful while we believe we're playing.
+        if (this.state === 'playing') this.onEnded();
+      });
+      this.audio = audio;
+    }
+    return this.audio;
+  }
+
+  // Call synchronously from the click handler so iOS grants the (reused) element playback.
+  unlock() {
+    if (this.unlocked) return;
+    const audio = this.ensureAudio();
+    try {
+      const p = audio.play();
+      if (p && typeof p.catch === 'function') p.catch(() => {});
+      audio.pause();
+    } catch {
+      /* priming attempt; ignore */
+    }
+    this.unlocked = true;
+  }
+
+  toggle(content: string) {
+    const id = voiceId(content);
+    if (this.currentId === id && (this.state === 'playing' || this.state === 'loading')) {
+      this.stop();
+      return;
+    }
+    void this.play(id, content);
+  }
+
+  stop() {
+    this.token++; // cancel any in-flight fetch
+    if (this.audio) this.audio.pause();
+    this.state = 'idle';
+    this.currentId = null;
+    this.emit();
+  }
+
+  private onEnded() {
+    this.state = 'idle';
+    this.currentId = null;
+    this.emit();
+    // (queue auto-advance would hook in here)
+  }
+
+  private setError(id: string, msg: string) {
+    this.state = 'idle';
+    this.currentId = id;
+    this.errorId = id;
+    this.errorMsg = msg;
+    this.emit();
+    if (this.errorTimer) clearTimeout(this.errorTimer);
+    this.errorTimer = setTimeout(() => {
+      if (this.errorId === id) {
+        this.errorId = null;
+        this.errorMsg = null;
+        if (this.currentId === id) this.currentId = null;
+        this.emit();
+      }
+    }, 6000);
+  }
+
+  private async play(id: string, content: string) {
+    const audio = this.ensureAudio();
+    audio.pause();
+    this.currentId = id;
+    this.errorId = null;
+    this.errorMsg = null;
+    this.state = 'loading';
+    this.emit();
+
+    const myToken = ++this.token;
+
+    try {
+      let url = this.cache.get(id);
+      if (!url) {
+        const res = await authenticatedFetch('/api/voice/tts', {
+          method: 'POST',
+          body: JSON.stringify({ text: content }),
+          headers: voiceConfigHeaders(),
+        });
+        if (myToken !== this.token) return; // superseded by another play/stop
+        if (!res.ok) {
+          let msg = `Read-aloud failed (${res.status})`;
+          try {
+            const j = await res.json();
+            if (j?.error) msg = String(j.error);
+          } catch {
+            /* non-JSON error body */
+          }
+          throw new Error(msg);
+        }
+        const blob = await res.blob();
+        if (myToken !== this.token) return;
+        url = URL.createObjectURL(blob);
+        this.cacheSet(id, url);
+      }
+      if (myToken !== this.token) return;
+      audio.src = url;
+      audio.load();
+      await audio.play();
+      if (myToken !== this.token) return;
+      this.state = 'playing';
+      this.emit();
+    } catch (e) {
+      if (myToken !== this.token) return;
+      this.setError(id, e instanceof Error ? e.message : 'Read-aloud failed');
+    }
+  }
+
+  private cacheSet(id: string, url: string) {
+    this.cache.set(id, url);
+    while (this.cache.size > CACHE_MAX) {
+      const oldest = this.cache.keys().next().value as string | undefined;
+      if (oldest === undefined) break;
+      const oldUrl = this.cache.get(oldest);
+      this.cache.delete(oldest);
+      if (oldUrl && oldUrl !== this.audio?.src) URL.revokeObjectURL(oldUrl);
+    }
+  }
+}
+
+export const voicePlayer = new VoicePlayer();