fix(stt): revert to reliable REST gpt-4o-transcribe + MediaRecorder full-blob (Realtime WS not accessible on key)

- Backend: added transcribe_audio (gpt-4o-transcribe) and switched the audio handler to full blob -> REST -> LLM -> streaming TTS.
- Frontend: MediaRecorder (webm/opus) captures the full recording, which is sent on stop (one blob per utterance).
- Removed dead WhisperStream callbacks and pending_transcript/lock.
- This unblocks voice per AUDIT item 1 (Option B fallback). Deltas will come in a later item.
- Also preps for the deprecation fix (MediaRecorder is the good path).
2026-06-04 15:23:57 -04:00
parent 188da1d52a
commit 0e74a16b40
2 changed files with 33 additions and 200 deletions
@@ -201,7 +201,7 @@ export function useConversation() {

    try {
      setMicError(null);
-      const stream = await navigator.mediaDevices.getUserMedia({ audio: true });
+      const stream = await navigator.mediaDevices.getUserMedia({ audio: { echoCancellation: true, noiseSuppression: true } });
      streamRef.current = stream;

      const ws = wsRef.current;
@@ -211,14 +211,27 @@ export function useConversation() {
        return;
      }

-      // PCM16 capture for Realtime WebSocket STT
-      captureRef.current = startPCMCapture(stream, (pcm16) => {
-        if (ws.readyState === WebSocket.OPEN) {
-          const base64 = arrayBufferToBase64(pcm16.buffer);
-          ws.send(JSON.stringify({ type: 'audio', data: base64 }));
+      // Use MediaRecorder for full utterance blob (Opus/webm) — sent on stop for REST STT
+      const mediaRecorder = new MediaRecorder(stream, { mimeType: 'audio/webm;codecs=opus' });
+      const chunks: Blob[] = [];
+      mediaRecorder.ondataavailable = (e) => {
+        if (e.data.size > 0) chunks.push(e.data);
+      };
+      mediaRecorder.onstop = () => {
+        if (chunks.length > 0 && ws.readyState === WebSocket.OPEN) {
+          const blob = new Blob(chunks, { type: 'audio/webm' });
+          blob.arrayBuffer().then((buf) => {
+            const base64 = arrayBufferToBase64(buf);
+            ws.send(JSON.stringify({ type: 'audio', data: base64 }));
+          });
        }
-      });
-
+        chunks.length = 0;
+        stream.getTracks().forEach((t) => t.stop());
+        streamRef.current = null;
+        setIsRecording(false);
+      };
+      recorderRef.current = mediaRecorder;
+      mediaRecorder.start();
      setIsRecording(true);
    } catch (err) {
      const msg = err instanceof Error ? err.message : String(err);
@@ -228,11 +241,16 @@ export function useConversation() {
  }, [addMessage]);

  const stopRecording = useCallback(() => {
-    captureRef.current?.stop();
-    captureRef.current = null;
-    streamRef.current?.getTracks().forEach((t) => t.stop());
-    streamRef.current = null;
-    setIsRecording(false);
+    if (recorderRef.current && recorderRef.current.state === 'recording') {
+      recorderRef.current.stop();
+      // onstop will handle sending the blob and cleanup
+    } else {
+      // fallback cleanup
+      streamRef.current?.getTracks().forEach((t) => t.stop());
+      streamRef.current = null;
+      setIsRecording(false);
+    }
+    captureRef.current = null; // legacy
  }, []);

  // ── Text ──
@@ -249,8 +267,8 @@ export function useConversation() {
    connect();
    return () => {
      wsRef.current?.close();
+      if (recorderRef.current && recorderRef.current.state === 'recording') recorderRef.current.stop();
      captureRef.current?.stop();
-      recorderRef.current?.stop();
      streamRef.current?.getTracks().forEach((t) => t.stop());
    };
  }, [connect]);