diff --git a/server/voice-proxy.js b/server/voice-proxy.js index a628ce63..304535a2 100644 --- a/server/voice-proxy.js +++ b/server/voice-proxy.js @@ -8,6 +8,8 @@ // // Config is resolved per-request from headers (set by the client's voice settings), // falling back to server env defaults. Mounted at /api/voice behind authenticateToken. +import { Readable } from 'node:stream'; + import express from 'express'; const ENV = { @@ -32,7 +34,7 @@ function resolveConfig(req) { sttModel: String(h['x-voice-stt-model'] || '') || ENV.sttModel, ttsModel: String(h['x-voice-tts-model'] || '') || ENV.ttsModel, ttsVoice: String(h['x-voice-tts-voice'] || '') || ENV.ttsVoice, - ttsFormat: String(h['x-voice-tts-format'] || ''), + ttsFormat: String(h['x-voice-tts-format'] || '').trim(), }; } @@ -57,7 +59,7 @@ async function fetchWithTimeout(url, options = {}) { const controller = new AbortController(); const timer = setTimeout(() => controller.abort(), VOICE_TIMEOUT_MS); try { - return await fetch(url, { ...options, signal: controller.signal }); + return await fetch(url, { redirect: 'manual', ...options, signal: controller.signal }); } finally { clearTimeout(timer); } @@ -206,7 +208,8 @@ router.post('/tts', async (req, res) => { } res.setHeader('Content-Type', r.headers.get('content-type') || 'audio/mpeg'); res.setHeader('Cache-Control', 'no-store'); - res.send(Buffer.from(await r.arrayBuffer())); + if (!r.body) return res.end(); + Readable.fromWeb(r.body).on('error', (error) => res.destroy(error)).pipe(res); } catch (e) { backendError(res, e); } diff --git a/src/components/chat/hooks/useVoiceAvailable.ts b/src/components/chat/hooks/useVoiceAvailable.ts index 388411f0..df5fe533 100644 --- a/src/components/chat/hooks/useVoiceAvailable.ts +++ b/src/components/chat/hooks/useVoiceAvailable.ts @@ -1,11 +1,39 @@ import { useEffect, useState } from 'react'; +import { authenticatedFetch } from '../../../utils/api'; +import { VOICE_CONFIG_SYNC_EVENT, voiceConfigHeaders } from '../../../hooks/useVoiceConfig'; + // Voice UI is gated on the `voiceEnabled` UI preference (toggled in Quick Settings / -// the Settings modal). This is a lightweight read-only view of that preference so the -// mic/speak controls can hide themselves, kept in sync via the same events -// useUiPreferences emits. No server probe. +// the Settings modal) and a configured voice backend. const STORAGE_KEY = 'uiPreferences'; const SYNC_EVENT = 'ui-preferences:sync'; +const healthCache = new Map(); +const healthRequests = new Map>(); + +function checkVoiceHealth(): Promise { + const baseUrl = voiceConfigHeaders()['x-voice-base-url']; + const signature = baseUrl || ''; + if (healthCache.has(signature)) return Promise.resolve(healthCache.get(signature) ?? false); + const pending = healthRequests.get(signature); + if (pending) return pending; + const request = authenticatedFetch('/api/voice/health', { + headers: baseUrl ? { 'x-voice-base-url': baseUrl } : {}, + }) + .then(async (response) => { + if (!response.ok) throw new Error(`Voice health check failed (${response.status})`); + const data = await response.json(); + return data?.configured === true; + }) + .then((available) => { + healthCache.set(signature, available); + return available; + }) + .finally(() => { + healthRequests.delete(signature); + }); + healthRequests.set(signature, request); + return request; +} function readVoiceEnabled(): boolean { try { @@ -22,6 +50,7 @@ export function useVoiceAvailable(): boolean { const [enabled, setEnabled] = useState(() => typeof window === 'undefined' ? false : readVoiceEnabled(), ); + const [available, setAvailable] = useState(false); useEffect(() => { const update = () => setEnabled(readVoiceEnabled()); @@ -33,5 +62,31 @@ export function useVoiceAvailable(): boolean { }; }, []); - return enabled; + useEffect(() => { + let active = true; + let requestId = 0; + + const check = async () => { + if (!enabled) { + setAvailable(false); + return; + } + const id = ++requestId; + try { + const result = await checkVoiceHealth(); + if (active && id === requestId) setAvailable(result); + } catch { + if (active && id === requestId) setAvailable(false); + } + }; + + void check(); + window.addEventListener(VOICE_CONFIG_SYNC_EVENT, check); + return () => { + active = false; + window.removeEventListener(VOICE_CONFIG_SYNC_EVENT, check); + }; + }, [enabled]); + + return enabled && available; } diff --git a/src/components/chat/hooks/useVoiceInput.ts b/src/components/chat/hooks/useVoiceInput.ts index f344fa81..706fd6c4 100644 --- a/src/components/chat/hooks/useVoiceInput.ts +++ b/src/components/chat/hooks/useVoiceInput.ts @@ -1,4 +1,5 @@ import { useCallback, useEffect, useRef, useState } from 'react'; + import { authenticatedFetch } from '../../../utils/api'; import { voiceConfigHeaders } from '../../../hooks/useVoiceConfig'; @@ -37,6 +38,8 @@ export function useVoiceInput( const recorderRef = useRef(null); const chunksRef = useRef([]); const streamRef = useRef(null); + const cancelledRef = useRef(false); + const startingRef = useRef(false); // Whether the in-progress stop should auto-send the transcript (vs just fill the box). const sendRef = useRef(false); @@ -47,7 +50,10 @@ export function useVoiceInput( // Stop the mic if the component unmounts mid-recording. useEffect(() => { + cancelledRef.current = false; return () => { + cancelledRef.current = true; + startingRef.current = false; streamRef.current?.getTracks().forEach((t) => t.stop()); streamRef.current = null; recorderRef.current = null; @@ -55,10 +61,17 @@ export function useVoiceInput( }, []); const start = useCallback(async () => { + if (startingRef.current || (recorderRef.current && recorderRef.current.state !== 'inactive')) return; + startingRef.current = true; + let recordingCancelled = false; try { const stream = await navigator.mediaDevices.getUserMedia({ audio: { echoCancellation: true, noiseSuppression: true }, }); + if (cancelledRef.current) { + stream.getTracks().forEach((t) => t.stop()); + return; + } streamRef.current = stream; const mimeType = pickMime(); const rec = mimeType ? new MediaRecorder(stream, { mimeType }) : new MediaRecorder(stream); @@ -71,6 +84,7 @@ export function useVoiceInput( rec.onstop = async () => { stopTracks(); + if (recordingCancelled || cancelledRef.current) return; // Capture and clear the send intent for this stop before any async work. const shouldSend = sendRef.current; sendRef.current = false; @@ -93,25 +107,34 @@ export function useVoiceInput( }); if (!res.ok) throw new Error(`transcribe ${res.status}`); const data = await res.json(); + if (recordingCancelled || cancelledRef.current) return; const text = String(data?.text || '').trim(); if (text) onTranscript(text, shouldSend); else onError?.('No speech detected'); } catch (e) { - onError?.(`Transcription failed: ${e instanceof Error ? e.message : String(e)}`); + if (!recordingCancelled && !cancelledRef.current) { + onError?.(`Transcription failed: ${e instanceof Error ? e.message : String(e)}`); + } } finally { - setState('idle'); + if (!recordingCancelled && !cancelledRef.current) setState('idle'); } }; rec.start(); setState('recording'); } catch (e) { + recordingCancelled = true; + recorderRef.current = null; + stopTracks(); + if (cancelledRef.current) return; const err = e as { name?: string; message?: string }; let msg = `Mic error: ${err?.message || e}`; if (err?.name === 'NotAllowedError') msg = 'Microphone access denied.'; else if (err?.name === 'NotFoundError') msg = 'No microphone found.'; onError?.(msg); setState('idle'); + } finally { + startingRef.current = false; } }, [onTranscript, onError]); diff --git a/src/hooks/useVoiceConfig.ts b/src/hooks/useVoiceConfig.ts index 9e3e2551..77e22546 100644 --- a/src/hooks/useVoiceConfig.ts +++ b/src/hooks/useVoiceConfig.ts @@ -10,16 +10,15 @@ export type VoiceConfig = { }; const STORAGE_KEY = 'voiceConfig'; -const DEFAULTS: VoiceConfig = { baseUrl: '', apiKey: '', sttModel: '', ttsModel: '', ttsVoice: '', ttsFormat: 'mp3' }; +export const VOICE_CONFIG_SYNC_EVENT = 'voice-config:sync'; +const DEFAULTS: VoiceConfig = { baseUrl: '', apiKey: '', sttModel: '', ttsModel: '', ttsVoice: '', ttsFormat: '' }; function read(): VoiceConfig { try { const raw = localStorage.getItem(STORAGE_KEY); if (!raw) return { ...DEFAULTS }; const parsed = JSON.parse(raw); - const next = { ...DEFAULTS, ...(parsed && typeof parsed === 'object' ? parsed : {}) }; - if (!next.ttsFormat) next.ttsFormat = DEFAULTS.ttsFormat; - return next; + return { ...DEFAULTS, ...(parsed && typeof parsed === 'object' ? parsed : {}) }; } catch { return { ...DEFAULTS }; } @@ -36,7 +35,7 @@ export function voiceConfigHeaders(): Record { if (c.sttModel) h['x-voice-stt-model'] = c.sttModel; if (c.ttsModel) h['x-voice-tts-model'] = c.ttsModel; if (c.ttsVoice) h['x-voice-tts-voice'] = c.ttsVoice; - if (c.ttsFormat) h['x-voice-tts-format'] = c.ttsFormat; + if (c.ttsFormat.trim()) h['x-voice-tts-format'] = c.ttsFormat.trim(); return h; } @@ -49,7 +48,11 @@ export function useVoiceConfig() { setConfig((prev) => { const next = { ...prev, ...patch }; try { - localStorage.setItem(STORAGE_KEY, JSON.stringify(next)); + const stored: Partial = { ...next }; + if (next.ttsFormat.trim()) stored.ttsFormat = next.ttsFormat.trim(); + else delete stored.ttsFormat; + localStorage.setItem(STORAGE_KEY, JSON.stringify(stored)); + window.dispatchEvent(new Event(VOICE_CONFIG_SYNC_EVENT)); } catch { /* ignore persistence errors */ } diff --git a/src/lib/voicePlayer.ts b/src/lib/voicePlayer.ts index a51700d3..b09f5170 100644 --- a/src/lib/voicePlayer.ts +++ b/src/lib/voicePlayer.ts @@ -15,10 +15,11 @@ const IDLE: VoiceSnapshot = { state: 'idle', error: null }; const CACHE_MAX = 24; const CLIENT_TIMEOUT_MS = 330000; // backstop; the server proxy already times out at 5 min -// Stable id / cache key from a message's text (djb2). -export function voiceId(content: string): string { +// Stable id / cache key from the text and voice settings that affect its audio (djb2). +export function voiceId(content: string, headers = voiceConfigHeaders()): string { + const input = JSON.stringify([content, Object.entries(headers).sort(([a], [b]) => a.localeCompare(b))]); let h = 5381; - for (let i = 0; i < content.length; i++) h = (((h << 5) + h) + content.charCodeAt(i)) | 0; + for (let i = 0; i < input.length; i++) h = (((h << 5) + h) + input.charCodeAt(i)) | 0; return (h >>> 0).toString(36); } @@ -81,12 +82,13 @@ class VoicePlayer { } toggle(content: string) { - const id = voiceId(content); + const headers = voiceConfigHeaders(); + const id = voiceId(content, headers); if (this.currentId === id && (this.state === 'playing' || this.state === 'loading')) { this.stop(); return; } - void this.play(id, content); + void this.play(id, content, headers); } stop() { @@ -129,7 +131,7 @@ class VoicePlayer { }, 6000); } - private async play(id: string, content: string) { + private async play(id: string, content: string, headers: Record) { const audio = this.ensureAudio(); audio.pause(); this.currentId = id; @@ -150,7 +152,7 @@ class VoicePlayer { const res = await authenticatedFetch('/api/voice/tts', { method: 'POST', body: JSON.stringify({ text: content }), - headers: voiceConfigHeaders(), + headers, signal: controller.signal, }).finally(() => { clearTimeout(timer);