feat(voice): send transcript with the main send button while recording

While dictating, the main send button stops recording, transcribes, and sends
in one tap, matching the codex-style flow. The mic button still stops recording
and drops the transcript into the input box to edit before sending. Voice
recording state is lifted into the composer so both buttons share it, and the
send button is enabled (not grayed out) while recording. Also fix a pre-existing
type error: the quick-settings preferences map was missing voiceEnabled.
This commit is contained in:
newsbubbles
2026-06-13 13:09:14 +01:00
parent 7f8ae7023d
commit 952ddb9eb7
6 changed files with 88 additions and 40 deletions

View File

@@ -798,6 +798,17 @@ export function useChatComposerState({
handleSubmitRef.current = handleSubmit; handleSubmitRef.current = handleSubmit;
}, [handleSubmit]); }, [handleSubmit]);
// Entry point for voice transcripts. The transcript is appended to the trimmed current
// input (joined by a single space), or used on its own when the box is empty. When `send`
// is set (the user tapped "stop and send") the result is submitted right away; otherwise
// it stays in the box to be edited. inputValueRef is updated synchronously next to
// setInput so that handleSubmit reads the new text rather than the stale state value.
const handleVoiceTranscript = useCallback((text: string, send?: boolean) => {
  const existing = inputValueRef.current.trim();
  const combined = existing ? `${existing} ${text}` : text;
  setInput(combined);
  inputValueRef.current = combined;
  if (!send) {
    return;
  }
  handleSubmitRef.current?.(createFakeSubmitEvent());
}, [setInput]);
useEffect(() => { useEffect(() => {
inputValueRef.current = input; inputValueRef.current = input;
}, [input]); }, [input]);
@@ -1050,6 +1061,7 @@ export function useChatComposerState({
isDragActive, isDragActive,
openImagePicker: open, openImagePicker: open,
handleSubmit, handleSubmit,
handleVoiceTranscript,
handleInputChange, handleInputChange,
handleKeyDown, handleKeyDown,
handlePaste, handlePaste,

View File

@@ -29,11 +29,16 @@ export type VoiceInputState = 'idle' | 'recording' | 'transcribing';
* (an OpenAI-compatible speech-to-text backend via the Express proxy), and * (an OpenAI-compatible speech-to-text backend via the Express proxy), and
* returns the transcript through onTranscript. * returns the transcript through onTranscript.
*/ */
export function useVoiceInput(onTranscript: (text: string) => void, onError?: (msg: string) => void) { export function useVoiceInput(
onTranscript: (text: string, send?: boolean) => void,
onError?: (msg: string) => void,
) {
const [state, setState] = useState<VoiceInputState>('idle'); const [state, setState] = useState<VoiceInputState>('idle');
const recorderRef = useRef<MediaRecorder | null>(null); const recorderRef = useRef<MediaRecorder | null>(null);
const chunksRef = useRef<Blob[]>([]); const chunksRef = useRef<Blob[]>([]);
const streamRef = useRef<MediaStream | null>(null); const streamRef = useRef<MediaStream | null>(null);
// Whether the in-progress stop should auto-send the transcript (vs just fill the box).
const sendRef = useRef(false);
const stopTracks = () => { const stopTracks = () => {
streamRef.current?.getTracks().forEach((t) => t.stop()); streamRef.current?.getTracks().forEach((t) => t.stop());
@@ -66,6 +71,9 @@ export function useVoiceInput(onTranscript: (text: string) => void, onError?: (m
rec.onstop = async () => { rec.onstop = async () => {
stopTracks(); stopTracks();
// Capture and clear the send intent for this stop before any async work.
const shouldSend = sendRef.current;
sendRef.current = false;
const type = rec.mimeType || 'audio/webm'; const type = rec.mimeType || 'audio/webm';
const blob = new Blob(chunksRef.current, { type }); const blob = new Blob(chunksRef.current, { type });
if (blob.size < 800) { if (blob.size < 800) {
@@ -86,7 +94,7 @@ export function useVoiceInput(onTranscript: (text: string) => void, onError?: (m
if (!res.ok) throw new Error(`transcribe ${res.status}`); if (!res.ok) throw new Error(`transcribe ${res.status}`);
const data = await res.json(); const data = await res.json();
const text = String(data?.text || '').trim(); const text = String(data?.text || '').trim();
if (text) onTranscript(text); if (text) onTranscript(text, shouldSend);
else onError?.('No speech detected'); else onError?.('No speech detected');
} catch (e) { } catch (e) {
onError?.(`Transcription failed: ${e instanceof Error ? e.message : String(e)}`); onError?.(`Transcription failed: ${e instanceof Error ? e.message : String(e)}`);
@@ -107,8 +115,12 @@ export function useVoiceInput(onTranscript: (text: string) => void, onError?: (m
} }
}, [onTranscript, onError]); }, [onTranscript, onError]);
const stop = useCallback(() => { // Stop recording. Pass { send: true } to auto-send the transcript once it's ready.
if (recorderRef.current && state === 'recording') recorderRef.current.stop(); const stop = useCallback((opts?: { send?: boolean }) => {
if (recorderRef.current && state === 'recording') {
sendRef.current = opts?.send ?? false;
recorderRef.current.stop();
}
}, [state]); }, [state]);
const toggle = useCallback(() => { const toggle = useCallback(() => {
@@ -116,5 +128,5 @@ export function useVoiceInput(onTranscript: (text: string) => void, onError?: (m
else if (state === 'idle') start(); else if (state === 'idle') start();
}, [state, start, stop]); }, [state, start, stop]);
return { state, toggle }; return { state, toggle, stop };
} }

View File

@@ -164,6 +164,7 @@ function ChatInterface({
isDragActive, isDragActive,
openImagePicker, openImagePicker,
handleSubmit, handleSubmit,
handleVoiceTranscript,
handleInputChange, handleInputChange,
handleKeyDown, handleKeyDown,
handlePaste, handlePaste,
@@ -404,7 +405,7 @@ function ChatInterface({
renderInputWithMentions={renderInputWithMentions} renderInputWithMentions={renderInputWithMentions}
textareaRef={textareaRef} textareaRef={textareaRef}
input={input} input={input}
onVoiceTranscript={(text) => setInput(input.trim() ? `${input.trim()} ${text}` : text)} onVoiceTranscript={handleVoiceTranscript}
onInputChange={handleInputChange} onInputChange={handleInputChange}
onTextareaClick={handleTextareaClick} onTextareaClick={handleTextareaClick}
onTextareaKeyDown={handleKeyDown} onTextareaKeyDown={handleKeyDown}

View File

@@ -1,4 +1,5 @@
import { useTranslation } from 'react-i18next'; import { useTranslation } from 'react-i18next';
import { useCallback, useEffect, useRef, useState } from 'react';
import type { import type {
ChangeEvent, ChangeEvent,
ClipboardEvent, ClipboardEvent,
@@ -9,8 +10,10 @@ import type {
RefObject, RefObject,
TouchEvent, TouchEvent,
} from 'react'; } from 'react';
import { ImageIcon, MessageSquareIcon, XIcon, ArrowDownIcon } from 'lucide-react'; import { ImageIcon, MessageSquareIcon, XIcon, ArrowDownIcon, Loader2 } from 'lucide-react';
import { useVoiceInput } from '../../hooks/useVoiceInput';
import { useVoiceAvailable } from '../../hooks/useVoiceAvailable';
import type { PendingPermissionRequest, PermissionMode, Provider } from '../../types/types'; import type { PendingPermissionRequest, PermissionMode, Provider } from '../../types/types';
import { import {
PromptInput, PromptInput,
@@ -90,7 +93,7 @@ interface ChatComposerProps {
renderInputWithMentions: (text: string) => ReactNode; renderInputWithMentions: (text: string) => ReactNode;
textareaRef: RefObject<HTMLTextAreaElement>; textareaRef: RefObject<HTMLTextAreaElement>;
input: string; input: string;
onVoiceTranscript?: (text: string) => void; onVoiceTranscript?: (text: string, send?: boolean) => void;
onInputChange: (event: ChangeEvent<HTMLTextAreaElement>) => void; onInputChange: (event: ChangeEvent<HTMLTextAreaElement>) => void;
onTextareaClick: (event: MouseEvent<HTMLTextAreaElement>) => void; onTextareaClick: (event: MouseEvent<HTMLTextAreaElement>) => void;
onTextareaKeyDown: (event: KeyboardEvent<HTMLTextAreaElement>) => void; onTextareaKeyDown: (event: KeyboardEvent<HTMLTextAreaElement>) => void;
@@ -158,6 +161,28 @@ export default function ChatComposer({
sendByCtrlEnter, sendByCtrlEnter,
}: ChatComposerProps) { }: ChatComposerProps) {
const { t } = useTranslation('chat'); const { t } = useTranslation('chat');
// The composer (not the mic button) hosts the voice hook, so the main Send button can
// stop a recording and send the transcript in a single tap, alongside the mic button
// flow that drops the transcript into the input box.
const voiceAvailable = useVoiceAvailable();
const [voiceError, setVoiceError] = useState<string | null>(null);
const voiceErrorTimer = useRef<ReturnType<typeof setTimeout> | null>(null);
// Show a voice error and schedule it to clear after 4000 ms, cancelling any clear that
// is still pending from an earlier error.
const handleVoiceError = useCallback((message: string) => {
  setVoiceError(message);
  const pending = voiceErrorTimer.current;
  if (pending) {
    clearTimeout(pending);
  }
  voiceErrorTimer.current = setTimeout(() => {
    setVoiceError(null);
  }, 4000);
}, []);
// On unmount, cancel a still-pending error-clear timer.
useEffect(() => {
  return () => {
    const pending = voiceErrorTimer.current;
    if (pending) {
      clearTimeout(pending);
    }
  };
}, []);
// Stable fallback handed to the hook when no onVoiceTranscript prop was provided.
const noopTranscript = useCallback(() => {}, []);
const {
  state: voiceState,
  toggle: voiceToggle,
  stop: voiceStop,
} = useVoiceInput(onVoiceTranscript ?? noopTranscript, handleVoiceError);
const isRecording = voiceState === 'recording';
const isTranscribing = voiceState === 'transcribing';
const textareaRect = textareaRef.current?.getBoundingClientRect(); const textareaRect = textareaRef.current?.getBoundingClientRect();
const commandMenuPosition = { const commandMenuPosition = {
top: textareaRect ? Math.max(16, textareaRect.top - 316) : 0, top: textareaRect ? Math.max(16, textareaRect.top - 316) : 0,
@@ -318,7 +343,9 @@ export default function ChatComposer({
<ImageIcon /> <ImageIcon />
</PromptInputButton> </PromptInputButton>
{onVoiceTranscript && <VoiceInputButton onTranscript={onVoiceTranscript} />} {onVoiceTranscript && voiceAvailable && (
<VoiceInputButton state={voiceState} onToggle={voiceToggle} errorMsg={voiceError} />
)}
<button <button
type="button" type="button"
@@ -398,10 +425,21 @@ export default function ChatComposer({
{sendByCtrlEnter ? t('input.hintText.ctrlEnter') : t('input.hintText.enter')} {sendByCtrlEnter ? t('input.hintText.ctrlEnter') : t('input.hintText.enter')}
</div> </div>
<PromptInputSubmit <PromptInputSubmit
onClick={isLoading ? onAbortSession : undefined} onClick={
disabled={!isLoading && !input.trim()} isLoading
? onAbortSession
: isRecording
? (e: MouseEvent<HTMLButtonElement>) => {
e.preventDefault();
voiceStop({ send: true });
}
: undefined
}
disabled={isLoading ? false : isRecording ? false : isTranscribing ? true : !input.trim()}
className="h-10 w-10 sm:h-10 sm:w-10" className="h-10 w-10 sm:h-10 sm:w-10"
/> >
{isTranscribing ? <Loader2 className="h-4 w-4 animate-spin" /> : undefined}
</PromptInputSubmit>
</div> </div>
</PromptInputFooter> </PromptInputFooter>
</PromptInput> </PromptInput>

View File

@@ -1,37 +1,20 @@
import { useEffect, useRef, useState } from 'react';
import { Mic, Square, Loader2 } from 'lucide-react';
import { useTranslation } from 'react-i18next'; import { useTranslation } from 'react-i18next';
import { useVoiceInput } from '../../hooks/useVoiceInput'; import { Mic, Square, Loader2 } from 'lucide-react';
import { useVoiceAvailable } from '../../hooks/useVoiceAvailable';
import { PromptInputButton } from '../../../../shared/view/ui'; import { PromptInputButton } from '../../../../shared/view/ui';
import type { VoiceInputState } from '../../hooks/useVoiceInput';
type Props = { type Props = {
onTranscript: (text: string) => void; state: VoiceInputState;
onError?: (msg: string) => void; onToggle: () => void;
errorMsg?: string | null;
}; };
// Push-to-talk mic button. Renders nothing unless the optional voice feature is enabled. // Push-to-talk mic button (presentational). Recording state and the stop-and-send action
// Surfaces transcription errors itself (transiently) so they aren't silently swallowed. // are owned by the composer so the main Send button can drive them too. This button just
export default function VoiceInputButton({ onTranscript, onError }: Props) { // starts recording and, while recording, stops and drops the transcript into the input box.
export default function VoiceInputButton({ state, onToggle, errorMsg }: Props) {
const { t } = useTranslation('chat'); const { t } = useTranslation('chat');
const available = useVoiceAvailable();
const [errorMsg, setErrorMsg] = useState<string | null>(null);
const errorTimer = useRef<ReturnType<typeof setTimeout> | null>(null);
const handleError = (msg: string) => {
onError?.(msg);
setErrorMsg(msg);
if (errorTimer.current) clearTimeout(errorTimer.current);
errorTimer.current = setTimeout(() => setErrorMsg(null), 4000);
};
const { state, toggle } = useVoiceInput(onTranscript, handleError);
useEffect(() => () => {
if (errorTimer.current) clearTimeout(errorTimer.current);
}, []);
if (!available) return null;
const icon = const icon =
state === 'recording' ? ( state === 'recording' ? (
@@ -53,7 +36,7 @@ export default function VoiceInputButton({ onTranscript, onError }: Props) {
tooltip={{ content: state === 'recording' ? t('voice.stopRecording') : t('voice.input') }} tooltip={{ content: state === 'recording' ? t('voice.stopRecording') : t('voice.input') }}
onClick={(e: { preventDefault: () => void }) => { onClick={(e: { preventDefault: () => void }) => {
e.preventDefault(); e.preventDefault();
toggle(); onToggle();
}} }}
> >
{icon} {icon}

View File

@@ -27,12 +27,14 @@ export default function QuickSettingsPanelView() {
showThinking: preferences.showThinking, showThinking: preferences.showThinking,
autoScrollToBottom: preferences.autoScrollToBottom, autoScrollToBottom: preferences.autoScrollToBottom,
sendByCtrlEnter: preferences.sendByCtrlEnter, sendByCtrlEnter: preferences.sendByCtrlEnter,
voiceEnabled: preferences.voiceEnabled,
}), [ }), [
preferences.autoExpandTools, preferences.autoExpandTools,
preferences.autoScrollToBottom, preferences.autoScrollToBottom,
preferences.sendByCtrlEnter, preferences.sendByCtrlEnter,
preferences.showRawParameters, preferences.showRawParameters,
preferences.showThinking, preferences.showThinking,
preferences.voiceEnabled,
]); ]);
const handlePreferenceChange = useCallback( const handlePreferenceChange = useCallback(