feat(voice): send transcript with the main send button while recording

While dictating, the main send button stops recording, transcribes, and sends in one tap, matching the Codex-style flow. The mic button still stops recording and drops the transcript into the input box for editing before sending. Voice recording state is lifted into the composer so both buttons share it, and the send button stays enabled (not grayed out) while recording. Also fixes a pre-existing type error: the quick-settings preferences map was missing voiceEnabled.
2026-06-25 12:16:00 +08:00 · 2026-06-13 13:09:14 +01:00
parent 7f8ae7023d
commit 952ddb9eb7
6 changed files with 88 additions and 40 deletions
--- a/src/components/chat/hooks/useChatComposerState.ts
+++ b/src/components/chat/hooks/useChatComposerState.ts
@@ -798,6 +798,17 @@ export function useChatComposerState({
    handleSubmitRef.current = handleSubmit;
  }, [handleSubmit]);

+  // A voice transcript is appended to the current input (to edit before sending) or, when
+  // the user tapped "stop and send", is also submitted straight away. Mirror the value into
+  // inputValueRef synchronously so handleSubmit reads the new text, not the stale state.
+  const handleVoiceTranscript = useCallback((text: string, send?: boolean) => {
+    const base = inputValueRef.current.trim();
+    const next = base ? `${base} ${text}` : text;
+    setInput(next);
+    inputValueRef.current = next;
+    if (send) handleSubmitRef.current?.(createFakeSubmitEvent());
+  }, [setInput]);
+
  useEffect(() => {
    inputValueRef.current = input;
  }, [input]);
@@ -1050,6 +1061,7 @@ export function useChatComposerState({
    isDragActive,
    openImagePicker: open,
    handleSubmit,
+    handleVoiceTranscript,
    handleInputChange,
    handleKeyDown,
    handlePaste,
--- a/src/components/chat/hooks/useVoiceInput.ts
+++ b/src/components/chat/hooks/useVoiceInput.ts
@@ -29,11 +29,16 @@ export type VoiceInputState = 'idle' | 'recording' | 'transcribing';
 * (an OpenAI-compatible speech-to-text backend via the Express proxy), and
 * returns the transcript through onTranscript.
 */
-export function useVoiceInput(onTranscript: (text: string) => void, onError?: (msg: string) => void) {
+export function useVoiceInput(
+  onTranscript: (text: string, send?: boolean) => void,
+  onError?: (msg: string) => void,
+) {
  const [state, setState] = useState<VoiceInputState>('idle');
  const recorderRef = useRef<MediaRecorder | null>(null);
  const chunksRef = useRef<Blob[]>([]);
  const streamRef = useRef<MediaStream | null>(null);
+  // Whether the in-progress stop should auto-send the transcript (vs just fill the box).
+  const sendRef = useRef(false);

  const stopTracks = () => {
    streamRef.current?.getTracks().forEach((t) => t.stop());
@@ -66,6 +71,9 @@ export function useVoiceInput(onTranscript: (text: string) => void, onError?: (m

      rec.onstop = async () => {
        stopTracks();
+        // Capture and clear the send intent for this stop before any async work.
+        const shouldSend = sendRef.current;
+        sendRef.current = false;
        const type = rec.mimeType || 'audio/webm';
        const blob = new Blob(chunksRef.current, { type });
        if (blob.size < 800) {
@@ -86,7 +94,7 @@ export function useVoiceInput(onTranscript: (text: string) => void, onError?: (m
          if (!res.ok) throw new Error(`transcribe ${res.status}`);
          const data = await res.json();
          const text = String(data?.text || '').trim();
-          if (text) onTranscript(text);
+          if (text) onTranscript(text, shouldSend);
          else onError?.('No speech detected');
        } catch (e) {
          onError?.(`Transcription failed: ${e instanceof Error ? e.message : String(e)}`);
@@ -107,8 +115,12 @@ export function useVoiceInput(onTranscript: (text: string) => void, onError?: (m
    }
  }, [onTranscript, onError]);

-  const stop = useCallback(() => {
-    if (recorderRef.current && state === 'recording') recorderRef.current.stop();
+  // Stop recording. Pass { send: true } to auto-send the transcript once it's ready.
+  const stop = useCallback((opts?: { send?: boolean }) => {
+    if (recorderRef.current && state === 'recording') {
+      sendRef.current = opts?.send ?? false;
+      recorderRef.current.stop();
+    }
  }, [state]);

  const toggle = useCallback(() => {
@@ -116,5 +128,5 @@ export function useVoiceInput(onTranscript: (text: string) => void, onError?: (m
    else if (state === 'idle') start();
  }, [state, start, stop]);

-  return { state, toggle };
+  return { state, toggle, stop };
 }
--- a/src/components/chat/view/ChatInterface.tsx
+++ b/src/components/chat/view/ChatInterface.tsx
@@ -164,6 +164,7 @@ function ChatInterface({
    isDragActive,
    openImagePicker,
    handleSubmit,
+    handleVoiceTranscript,
    handleInputChange,
    handleKeyDown,
    handlePaste,
@@ -404,7 +405,7 @@ function ChatInterface({
          renderInputWithMentions={renderInputWithMentions}
          textareaRef={textareaRef}
          input={input}
-          onVoiceTranscript={(text) => setInput(input.trim() ? `${input.trim()} ${text}` : text)}
+          onVoiceTranscript={handleVoiceTranscript}
          onInputChange={handleInputChange}
          onTextareaClick={handleTextareaClick}
          onTextareaKeyDown={handleKeyDown}
--- a/src/components/chat/view/subcomponents/ChatComposer.tsx
+++ b/src/components/chat/view/subcomponents/ChatComposer.tsx
@@ -1,4 +1,5 @@
 import { useTranslation } from 'react-i18next';
+import { useCallback, useEffect, useRef, useState } from 'react';
 import type {
  ChangeEvent,
  ClipboardEvent,
@@ -9,8 +10,10 @@ import type {
  RefObject,
  TouchEvent,
 } from 'react';
-import { ImageIcon, MessageSquareIcon, XIcon, ArrowDownIcon } from 'lucide-react';
+import { ImageIcon, MessageSquareIcon, XIcon, ArrowDownIcon, Loader2 } from 'lucide-react';

+import { useVoiceInput } from '../../hooks/useVoiceInput';
+import { useVoiceAvailable } from '../../hooks/useVoiceAvailable';
 import type { PendingPermissionRequest, PermissionMode, Provider } from '../../types/types';
 import {
  PromptInput,
@@ -90,7 +93,7 @@ interface ChatComposerProps {
  renderInputWithMentions: (text: string) => ReactNode;
  textareaRef: RefObject<HTMLTextAreaElement>;
  input: string;
-  onVoiceTranscript?: (text: string) => void;
+  onVoiceTranscript?: (text: string, send?: boolean) => void;
  onInputChange: (event: ChangeEvent<HTMLTextAreaElement>) => void;
  onTextareaClick: (event: MouseEvent<HTMLTextAreaElement>) => void;
  onTextareaKeyDown: (event: KeyboardEvent<HTMLTextAreaElement>) => void;
@@ -158,6 +161,28 @@ export default function ChatComposer({
  sendByCtrlEnter,
 }: ChatComposerProps) {
  const { t } = useTranslation('chat');
+
+  // Voice state is hosted here (not in the mic button) so the main Send button can stop
+  // recording and send the transcript in one tap, while the mic button still drops it in the box.
+  const voiceAvailable = useVoiceAvailable();
+  const [voiceError, setVoiceError] = useState<string | null>(null);
+  const voiceErrorTimer = useRef<ReturnType<typeof setTimeout> | null>(null);
+  const handleVoiceError = useCallback((msg: string) => {
+    setVoiceError(msg);
+    if (voiceErrorTimer.current) clearTimeout(voiceErrorTimer.current);
+    voiceErrorTimer.current = setTimeout(() => setVoiceError(null), 4000);
+  }, []);
+  useEffect(() => () => {
+    if (voiceErrorTimer.current) clearTimeout(voiceErrorTimer.current);
+  }, []);
+  const noopTranscript = useCallback(() => {}, []);
+  const { state: voiceState, toggle: voiceToggle, stop: voiceStop } = useVoiceInput(
+    onVoiceTranscript ?? noopTranscript,
+    handleVoiceError,
+  );
+  const isRecording = voiceState === 'recording';
+  const isTranscribing = voiceState === 'transcribing';
+
  const textareaRect = textareaRef.current?.getBoundingClientRect();
  const commandMenuPosition = {
    top: textareaRect ? Math.max(16, textareaRect.top - 316) : 0,
@@ -318,7 +343,9 @@ export default function ChatComposer({
              <ImageIcon />
            </PromptInputButton>

-            {onVoiceTranscript && <VoiceInputButton onTranscript={onVoiceTranscript} />}
+            {onVoiceTranscript && voiceAvailable && (
+              <VoiceInputButton state={voiceState} onToggle={voiceToggle} errorMsg={voiceError} />
+            )}

            <button
              type="button"
@@ -398,10 +425,21 @@ export default function ChatComposer({
              {sendByCtrlEnter ? t('input.hintText.ctrlEnter') : t('input.hintText.enter')}
            </div>
            <PromptInputSubmit
-              onClick={isLoading ? onAbortSession : undefined}
-              disabled={!isLoading && !input.trim()}
+              onClick={
+                isLoading
+                  ? onAbortSession
+                  : isRecording
+                    ? (e: MouseEvent<HTMLButtonElement>) => {
+                        e.preventDefault();
+                        voiceStop({ send: true });
+                      }
+                    : undefined
+              }
+              disabled={isLoading ? false : isRecording ? false : isTranscribing ? true : !input.trim()}
              className="h-10 w-10 sm:h-10 sm:w-10"
-            />
+            >
+              {isTranscribing ? <Loader2 className="h-4 w-4 animate-spin" /> : undefined}
+            </PromptInputSubmit>
          </div>
        </PromptInputFooter>
      </PromptInput>
--- a/src/components/chat/view/subcomponents/VoiceInputButton.tsx
+++ b/src/components/chat/view/subcomponents/VoiceInputButton.tsx
@@ -1,37 +1,20 @@
-import { useEffect, useRef, useState } from 'react';
-import { Mic, Square, Loader2 } from 'lucide-react';
 import { useTranslation } from 'react-i18next';
-import { useVoiceInput } from '../../hooks/useVoiceInput';
-import { useVoiceAvailable } from '../../hooks/useVoiceAvailable';
+import { Mic, Square, Loader2 } from 'lucide-react';
+
 import { PromptInputButton } from '../../../../shared/view/ui';
+import type { VoiceInputState } from '../../hooks/useVoiceInput';

 type Props = {
-  onTranscript: (text: string) => void;
-  onError?: (msg: string) => void;
+  state: VoiceInputState;
+  onToggle: () => void;
+  errorMsg?: string | null;
 };

-// Push-to-talk mic button. Renders nothing unless the optional voice feature is enabled.
-// Surfaces transcription errors itself (transiently) so they aren't silently swallowed.
-export default function VoiceInputButton({ onTranscript, onError }: Props) {
+// Click-to-toggle mic button (presentational). Recording state and the stop-and-send action
+// are owned by the composer so the main Send button can drive them too. This button just
+// starts recording and, while recording, stops and drops the transcript into the input box.
+export default function VoiceInputButton({ state, onToggle, errorMsg }: Props) {
  const { t } = useTranslation('chat');
-  const available = useVoiceAvailable();
-  const [errorMsg, setErrorMsg] = useState<string | null>(null);
-  const errorTimer = useRef<ReturnType<typeof setTimeout> | null>(null);
-
-  const handleError = (msg: string) => {
-    onError?.(msg);
-    setErrorMsg(msg);
-    if (errorTimer.current) clearTimeout(errorTimer.current);
-    errorTimer.current = setTimeout(() => setErrorMsg(null), 4000);
-  };
-
-  const { state, toggle } = useVoiceInput(onTranscript, handleError);
-
-  useEffect(() => () => {
-    if (errorTimer.current) clearTimeout(errorTimer.current);
-  }, []);
-
-  if (!available) return null;

  const icon =
    state === 'recording' ? (
@@ -53,7 +36,7 @@ export default function VoiceInputButton({ onTranscript, onError }: Props) {
        tooltip={{ content: state === 'recording' ? t('voice.stopRecording') : t('voice.input') }}
        onClick={(e: { preventDefault: () => void }) => {
          e.preventDefault();
-          toggle();
+          onToggle();
        }}
      >
        {icon}
--- a/src/components/quick-settings-panel/view/QuickSettingsPanelView.tsx
+++ b/src/components/quick-settings-panel/view/QuickSettingsPanelView.tsx
@@ -27,12 +27,14 @@ export default function QuickSettingsPanelView() {
    showThinking: preferences.showThinking,
    autoScrollToBottom: preferences.autoScrollToBottom,
    sendByCtrlEnter: preferences.sendByCtrlEnter,
+    voiceEnabled: preferences.voiceEnabled,
  }), [
    preferences.autoExpandTools,
    preferences.autoScrollToBottom,
    preferences.sendByCtrlEnter,
    preferences.showRawParameters,
    preferences.showThinking,
+    preferences.voiceEnabled,
  ]);

  const handlePreferenceChange = useCallback(