feat: OpenAI Realtime API pipeline

Replaced the 3-step sequential pipeline (Whisper STT → DeepSeek LLM → OpenAI TTS) with a single OpenAI Realtime API WebSocket using gpt-4o-mini-realtime-preview. - ~300-800ms latency vs 1-3s - Server VAD for automatic turn detection - Streaming audio chunks during playback - Interruptions: user can speak over Kira mid-response - Honcho memory still injected into session instructions - Frontend captures PCM16 mono 24kHz via AudioContext - Backend relays client ↔ OpenAI Realtime API - Supports both voice (PCM16) and text input
2026-06-04 13:32:39 -04:00
parent e64698b0ab
commit e2332af8d0
4 changed files with 551 additions and 251 deletions
@@ -25,6 +25,41 @@ function saveUserId(id: string) {
  localStorage.setItem(USER_ID_KEY, id);
 }

+/** Capture PCM16 mono 24kHz audio from mic and send via callback. */
+function startPCMCapture(
+  stream: MediaStream,
+  onChunk: (pcm16: Uint8Array) => void,
+): { stop: () => void } {
+  const ctx = new AudioContext({ sampleRate: 24000 });
+  const source = ctx.createMediaStreamSource(stream);
+  const processor = ctx.createScriptProcessor(4096, 1, 1);
+  let running = true;
+
+  processor.onaudioprocess = (e) => {
+    if (!running) return;
+    const input = e.inputBuffer.getChannelData(0); // Float32Array [-1, 1]
+    // Convert float32 → PCM16 int16
+    const pcm16 = new Int16Array(input.length);
+    for (let i = 0; i < input.length; i++) {
+      const s = Math.max(-1, Math.min(1, input[i]));
+      pcm16[i] = s < 0 ? s * 0x8000 : s * 0x7fff;
+    }
+    onChunk(new Uint8Array(pcm16.buffer));
+  };
+
+  source.connect(processor);
+  processor.connect(ctx.destination);
+
+  return {
+    stop: () => {
+      running = false;
+      source.disconnect();
+      processor.disconnect();
+      ctx.close();
+    },
+  };
+}
+
 export function useConversation() {
  const [messages, setMessages] = useState<Message[]>([]);
  const [isConnected, setIsConnected] = useState(false);
@@ -38,11 +73,13 @@ export function useConversation() {
    accessory: '',
  });
  const [loadingPrefs, setLoadingPrefs] = useState(true);
+  const [micError, setMicError] = useState<string | null>(null);

  const wsRef = useRef<WebSocket | null>(null);
  const audioRef = useRef<HTMLAudioElement | null>(null);
-  const recorderRef = useRef<MediaRecorder | null>(null);
+  const captureRef = useRef<{ stop: () => void } | null>(null);
  const streamRef = useRef<MediaStream | null>(null);
+  const audioBufferRef = useRef<Uint8Array[]>([]);

  // Connect WebSocket
  const connect = useCallback(() => {
@@ -54,7 +91,6 @@ export function useConversation() {

    ws.onopen = () => {
      setIsConnected(true);
-      // Auto-identify if returning user
      const savedId = loadUserId();
      if (savedId) {
        ws.send(JSON.stringify({ type: 'identify', user_id: savedId }));
@@ -102,35 +138,56 @@ export function useConversation() {
        break;
      }

-      case 'preference_saved':
-        // Already optimistically updated locally
-        break;
-
      case 'transcript':
-        addMessage('user', msg.text);
+        addMessage(msg.role === 'user' ? 'user' : 'kira', msg.text);
        break;

      case 'speaking_start':
        setIsKiraSpeaking(true);
-        addMessage('kira', msg.text || '...');
        break;

-      case 'audio':
+      case 'audio': {
+        // Incoming PCM16 audio from Kira
        if (msg.data && audioRef.current) {
+          // Accumulate audio chunks and create a blob
          const binary = atob(msg.data);
          const bytes = new Uint8Array(binary.length);
          for (let i = 0; i < binary.length; i++) {
            bytes[i] = binary.charCodeAt(i);
          }
-          const blob = new Blob([bytes], { type: 'audio/ogg' });
+          audioBufferRef.current.push(bytes);
+
+          // Convert accumulated PCM16 to WAV blob for playback
+          const allChunks = audioBufferRef.current;
+          const totalLen = allChunks.reduce((s, c) => s + c.length, 0);
+          const combined = new Uint8Array(totalLen);
+          let offset = 0;
+          for (const chunk of allChunks) {
+            combined.set(chunk, offset);
+            offset += chunk.length;
+          }
+
+          const wav = pcm16ToWav(combined);
+          const blob = new Blob([wav], { type: 'audio/wav' });
          const url = URL.createObjectURL(blob);
          audioRef.current.src = url;
          audioRef.current.play().catch(() => {});
        }
        break;
+      }

      case 'speaking_end':
        setIsKiraSpeaking(false);
+        audioBufferRef.current = [];
+        break;
+
+      case 'interruption':
+        setIsKiraSpeaking(false);
+        audioBufferRef.current = [];
+        if (audioRef.current) {
+          audioRef.current.pause();
+          audioRef.current.currentTime = 0;
+        }
        break;

      case 'error':
@@ -154,99 +211,80 @@ export function useConversation() {
    setPreferences((p) => ({ ...p, name }));

    if (wsRef.current?.readyState === WebSocket.OPEN) {
-      wsRef.current.send(JSON.stringify({
-        type: 'identify',
-        user_id: userId,
-        name,
-      }));
+      wsRef.current.send(JSON.stringify({ type: 'identify', user_id: userId, name }));
    }
  }, []);

  // ── Preferences ──

  const setPreference = useCallback((key: string, value: string) => {
-    // Optimistic update
    setPreferences((p) => ({ ...p, [key]: value }));
-
-    // Sync to backend
    if (wsRef.current?.readyState === WebSocket.OPEN && identified) {
-      wsRef.current.send(JSON.stringify({
-        type: 'set_preference',
-        key,
-        value,
-      }));
+      wsRef.current.send(JSON.stringify({ type: 'set_preference', key, value }));
    }
  }, [identified]);

+  // ── Audio (Realtime PCM16) ──
+
+  const startRecording = useCallback(async () => {
+    // Check HTTPS
+    if (!navigator.mediaDevices || !navigator.mediaDevices.getUserMedia) {
+      addMessage('kira', 'Mic requires HTTPS. Try accessing via HTTPS!');
+      return;
+    }
+
+    try {
+      setMicError(null);
+      const stream = await navigator.mediaDevices.getUserMedia({ audio: true });
+      streamRef.current = stream;
+
+      const ws = wsRef.current;
+      if (!ws || ws.readyState !== WebSocket.OPEN) {
+        addMessage('kira', 'Not connected to server yet...');
+        stream.getTracks().forEach((t) => t.stop());
+        return;
+      }
+
+      // Start PCM16 capture — each chunk sent as WS message
+      const capture = startPCMCapture(stream, (pcm16) => {
+        if (ws.readyState === WebSocket.OPEN) {
+          const base64 = arrayBufferToBase64(pcm16.buffer);
+          ws.send(JSON.stringify({ type: 'audio', data: base64 }));
+        }
+      });
+
+      captureRef.current = capture;
+      setIsRecording(true);
+    } catch (err) {
+      const msg = err instanceof Error ? err.message : String(err);
+      setMicError(msg);
+      console.error('[Kira Mic]', msg);
+    }
+  }, [addMessage]);
+
+  const stopRecording = useCallback(() => {
+    captureRef.current?.stop();
+    captureRef.current = null;
+    streamRef.current?.getTracks().forEach((t) => t.stop());
+    streamRef.current = null;
+    setIsRecording(false);
+  }, []);
+
  // ── Text ──

  const sendText = useCallback((text: string) => {
    if (!text.trim()) return;
    if (wsRef.current?.readyState === WebSocket.OPEN) {
-      wsRef.current.send(JSON.stringify({
-        type: 'conversation_text',
-        text: text.trim(),
-      }));
+      wsRef.current.send(JSON.stringify({ type: 'conversation_text', text: text.trim() }));
    }
  }, []);

-  // ── Audio ──
-
-  const startRecording = useCallback(async () => {
-    // Check if mediaDevices is available (requires HTTPS/localhost)
-    if (!navigator.mediaDevices || !navigator.mediaDevices.getUserMedia) {
-      addMessage('kira', 'Your browser needs HTTPS to use the microphone. Try accessing Kira through the HTTPS address instead!');
-      return;
-    }
-
-    try {
-      const stream = await navigator.mediaDevices.getUserMedia({ audio: true });
-      streamRef.current = stream;
-
-      const recorder = new MediaRecorder(stream, {
-        mimeType: MediaRecorder.isTypeSupported('audio/webm;codecs=opus')
-          ? 'audio/webm;codecs=opus'
-          : 'audio/webm',
-      });
-
-      const chunks: BlobPart[] = [];
-      recorder.ondataavailable = (e) => {
-        if (e.data.size > 0) chunks.push(e.data);
-      };
-
-      recorder.onstop = () => {
-        const blob = new Blob(chunks, { type: 'audio/webm' });
-        const reader = new FileReader();
-        reader.onload = () => {
-          const base64 = (reader.result as string).split(',')[1];
-          if (wsRef.current?.readyState === WebSocket.OPEN) {
-            wsRef.current.send(JSON.stringify({ type: 'audio_chunk', data: base64 }));
-            wsRef.current.send(JSON.stringify({ type: 'transcribe' }));
-          }
-        };
-        reader.readAsDataURL(blob);
-
-        stream.getTracks().forEach((t) => t.stop());
-        setIsRecording(false);
-      };
-
-      recorder.start();
-      recorderRef.current = recorder;
-      setIsRecording(true);
-    } catch (err) {
-      console.error('[Kira Mic] failed:', err);
-    }
-  }, []);
-
-  const stopRecording = useCallback(() => {
-    recorderRef.current?.stop();
-  }, []);
-
  // Connect on mount
  useEffect(() => {
    connect();
    return () => {
      wsRef.current?.close();
+      captureRef.current?.stop();
      streamRef.current?.getTracks().forEach((t) => t.stop());
    };
  }, [connect]);
@@ -259,6 +297,7 @@ export function useConversation() {
    identified,
    preferences,
    loadingPrefs,
+    micError,
    identify,
    setPreference,
    sendText,
@@ -266,3 +305,61 @@ export function useConversation() {
    stopRecording,
  };
 }
+
+// ── Helpers ──
+
+function arrayBufferToBase64(buffer: ArrayBufferLike): string {
+  const bytes = new Uint8Array(buffer);
+  let binary = '';
+  for (let i = 0; i < bytes.length; i++) {
+    binary += String.fromCharCode(bytes[i]);
+  }
+  return btoa(binary);
+}
+
+/** Convert raw PCM16 mono 24kHz to a playable WAV blob. */
+function pcm16ToWav(pcm16: Uint8Array): ArrayBuffer {
+  const numChannels = 1;
+  const sampleRate = 24000;
+  const bitsPerSample = 16;
+  const byteRate = sampleRate * numChannels * (bitsPerSample / 8);
+  const blockAlign = numChannels * (bitsPerSample / 8);
+  const dataSize = pcm16.length;
+  const headerSize = 44;
+  const totalSize = headerSize + dataSize;
+
+  const buf = new ArrayBuffer(totalSize);
+  const view = new DataView(buf);
+
+  // RIFF header
+  writeString(view, 0, 'RIFF');
+  view.setUint32(4, totalSize - 8, true);
+  writeString(view, 8, 'WAVE');
+
+  // fmt subchunk
+  writeString(view, 12, 'fmt ');
+  view.setUint32(16, 16, true); // subchunk size
+  view.setUint16(20, 1, true);  // PCM
+  view.setUint16(22, numChannels, true);
+  view.setUint32(24, sampleRate, true);
+  view.setUint32(28, byteRate, true);
+  view.setUint16(32, blockAlign, true);
+  view.setUint16(34, bitsPerSample, true);
+
+  // data subchunk
+  writeString(view, 36, 'data');
+  view.setUint32(40, dataSize, true);
+
+  // PCM data
+  for (let i = 0; i < pcm16.length; i++) {
+    view.setUint8(44 + i, pcm16[i]);
+  }
+
+  return buf;
+}
+
+function writeString(view: DataView, offset: number, str: string) {
+  for (let i = 0; i < str.length; i++) {
+    view.setUint8(offset + i, str.charCodeAt(i));
+  }
+}