feat: Realtime WebSocket STT via gpt-realtime-whisper

Replaces REST-based transcription (gpt-4o-transcribe) with WebSocket streaming via gpt-realtime-whisper. The frontend captures PCM16 audio and streams it through the backend to a Realtime transcription session.

- Server-side VAD detects utterance boundaries automatically
- Word-level transcript deltas stream to the client in real time
- On utterance end, gpt-5.4-nano generates a response
- TTS streams back via with_streaming_response
- Total pipeline: PCM16 → Realtime WS → LLM → streaming TTS
2026-06-04 14:26:19 -04:00
parent 25b12ee14f
commit 7502f201c7
3 changed files with 326 additions and 153 deletions
@@ -42,6 +42,7 @@ export function useConversation() {

  const wsRef = useRef<WebSocket | null>(null);
  const audioRef = useRef<HTMLAudioElement | null>(null);
+  const captureRef = useRef<{ stop: () => void } | null>(null);
  const recorderRef = useRef<MediaRecorder | null>(null);
  const streamRef = useRef<MediaStream | null>(null);
  const audioBufferRef = useRef<Uint8Array[]>([]);
@@ -193,7 +194,6 @@ export function useConversation() {
  // ── Audio (Realtime PCM16) ──

  const startRecording = useCallback(async () => {
-    // Check HTTPS
    if (!navigator.mediaDevices || !navigator.mediaDevices.getUserMedia) {
      addMessage('kira', 'Mic requires HTTPS. Try accessing via HTTPS!');
      return;
@@ -211,37 +211,14 @@ export function useConversation() {
        return;
      }

-      // Record Opus/webm — much more efficient than PCM16
-      const chunks: BlobPart[] = [];
-      const recorder = new MediaRecorder(stream, {
-        mimeType: MediaRecorder.isTypeSupported('audio/webm;codecs=opus')
-          ? 'audio/webm;codecs=opus'
-          : 'audio/webm',
+      // PCM16 capture for Realtime WebSocket STT
+      captureRef.current = startPCMCapture(stream, (pcm16) => {
+        if (ws.readyState === WebSocket.OPEN) {
+          const base64 = arrayBufferToBase64(pcm16.buffer);
+          ws.send(JSON.stringify({ type: 'audio', data: base64 }));
+        }
      });

-      recorder.ondataavailable = (e) => {
-        if (e.data.size > 0) chunks.push(e.data);
-      };
-
-      recorder.onstop = () => {
-        // Send recorded audio as one blob, then transcribe
-        const blob = new Blob(chunks, { type: 'audio/webm' });
-        const reader = new FileReader();
-        reader.onload = () => {
-          const base64 = (reader.result as string).split(',')[1];
-          if (ws.readyState === WebSocket.OPEN) {
-            ws.send(JSON.stringify({ type: 'audio_chunk', data: base64 }));
-            ws.send(JSON.stringify({ type: 'transcribe' }));
-          }
-        };
-        reader.readAsDataURL(blob);
-
-        stream.getTracks().forEach((t) => t.stop());
-        setIsRecording(false);
-      };
-
-      recorder.start();
-      recorderRef.current = recorder;
      setIsRecording(true);
    } catch (err) {
      const msg = err instanceof Error ? err.message : String(err);
@@ -251,7 +228,11 @@ export function useConversation() {
  }, [addMessage]);

  const stopRecording = useCallback(() => {
-    recorderRef.current?.stop();
+    captureRef.current?.stop();
+    captureRef.current = null;
+    streamRef.current?.getTracks().forEach((t) => t.stop());
+    streamRef.current = null;
+    setIsRecording(false);
  }, []);

  // ── Text ──
@@ -268,6 +249,7 @@ export function useConversation() {
    connect();
    return () => {
      wsRef.current?.close();
+      captureRef.current?.stop();
      recorderRef.current?.stop();
      streamRef.current?.getTracks().forEach((t) => t.stop());
    };
@@ -289,3 +271,48 @@ export function useConversation() {
    stopRecording,
  };
 }
+
+// ── Helpers ──
+
+function arrayBufferToBase64(buffer: ArrayBufferLike): string { // Base64-encode the raw bytes of `buffer`.
+  const bytes = new Uint8Array(buffer); // byte view over the whole buffer
+  let binary = ''; // accumulates one character per byte
+  for (let i = 0; i < bytes.length; i++) {
+    binary += String.fromCharCode(bytes[i]); // code units 0–255, the "binary string" form btoa() accepts
+  }
+  return btoa(binary);
+}
+
+/** Capture mono PCM16 audio from `stream` (e.g. a mic stream) through a 24 kHz AudioContext; each converted buffer is passed to `onChunk` as bytes, and the returned `stop()` tears the audio graph down. */
+function startPCMCapture(
+  stream: MediaStream,
+  onChunk: (pcm16: Uint8Array) => void,
+): { stop: () => void } {
+  const ctx = new AudioContext({ sampleRate: 24000 }); // request a 24 kHz context, the rate named in the doc comment
+  const source = ctx.createMediaStreamSource(stream);
+  const processor = ctx.createScriptProcessor(4096, 1, 1); // 4096-frame buffer, 1 input / 1 output channel; NOTE(review): ScriptProcessorNode is deprecated in favor of AudioWorklet
+  let running = true; // cleared by stop() so callbacks that fire afterwards are ignored
+
+  processor.onaudioprocess = (e) => {
+    if (!running) return;
+    const input = e.inputBuffer.getChannelData(0); // float samples of the single input channel
+    const pcm16 = new Int16Array(input.length);
+    for (let i = 0; i < input.length; i++) {
+      const s = Math.max(-1, Math.min(1, input[i])); // clamp to [-1, 1] before scaling
+      pcm16[i] = s < 0 ? s * 0x8000 : s * 0x7fff; // asymmetric scale so -1 maps to -32768 and +1 to 32767
+    }
+    onChunk(new Uint8Array(pcm16.buffer)); // byte view over the same buffer; byte order is the platform's
+  };
+
+  source.connect(processor);
+  processor.connect(ctx.destination); // NOTE(review): presumably needed so onaudioprocess fires; the output buffer is never written here — confirm
+
+  return {
+    stop: () => {
+      running = false; // drop any callback that arrives after this point
+      source.disconnect();
+      processor.disconnect();
+      ctx.close(); // returns a Promise that is not awaited or caught here
+    },
+  };
+}