feat: OpenAI Realtime API pipeline

Replaced the 3-step sequential pipeline (Whisper STT → DeepSeek LLM → OpenAI TTS) with a single OpenAI Realtime API WebSocket using gpt-4o-mini-realtime-preview.

- ~300-800 ms latency, down from 1-3 s
- Server VAD for automatic turn detection
- Audio chunks are streamed to the client during playback
- Interruptions: the user can speak over Kira mid-response
- Honcho memory is still injected into the session instructions
- Frontend captures PCM16 mono 24 kHz audio via AudioContext
- Backend relays between the client and the OpenAI Realtime API
- Supports both voice (PCM16) and text input
2026-06-04 13:32:39 -04:00
parent e64698b0ab
commit e2332af8d0
4 changed files with 551 additions and 251 deletions
@@ -1,26 +1,21 @@
 """Kira — AI body double backend

-Real-time speech-to-speech pipeline:
-  mic audio → Whisper API → text → DeepSeek LLM → response text → OpenAI TTS → audio
-
-Honcho memory integration:
-  Cross-session user context injected into LLM prompts,
-  conversation exchanges stored for continuous learning.
-  User preferences (name, scene, outfit, accessory) persisted in peer metadata.
+OpenAI Realtime API pipeline:
+  mic audio → [built-in STT → GPT-4o-mini → built-in TTS] → speaker audio
+                    Single WebSocket, ~300-800ms latency
 """

 import json
 import base64
 import uuid
 import logging
+import asyncio

 from fastapi import FastAPI, WebSocket, WebSocketDisconnect
 from fastapi.middleware.cors import CORSMiddleware

 from config import settings
-from services.stt import transcribe_audio
-from services.llm import get_kira_response
-from services.tts import synthesize_speech
+from services.realtime import RealtimeRelay
 from services.memory import kira_memory

 logging.basicConfig(level=logging.INFO)
@@ -36,24 +31,13 @@ app.add_middleware(
    allow_headers=["*"],
 )

-# ─── Base system prompt (static part) ───
-BASE_SYSTEM_PROMPT = (
-    "You are Kira, a warm, kind, and encouraging AI body double. "
-    "You speak in a friendly, girly-pop tone. You are helping someone with ADHD "
-    "stay focused and on task. Keep responses short, supportive, and uplifting. "
-    "Check in on them. Remind them to take breaks. Celebrate small wins. "
-    "Use occasional emoji but don't overdo it. Never be judgmental. "
-    "You remember things about them between conversations."
-)
-

@app.on_event("startup")
 async def startup():
-    """Initialize Honcho memory on app startup."""
    if kira_memory.init():
        logger.info("Honcho memory initialized")
    else:
-        logger.info("Honcho memory not configured — running without memory")
+        logger.info("Honcho memory not configured")


@app.get("/api/health")
@@ -62,61 +46,6 @@ async def health():
    return {"status": "ok", "name": "kira", "memory": mem_status}


-def build_system_prompt(user_id: str) -> dict:
-    """Build system prompt with Honcho memory context injected."""
-    base = BASE_SYSTEM_PROMPT
-
-    if kira_memory.enabled:
-        try:
-            kira_memory.ensure_peers(user_id)
-            memory_suffix = kira_memory.build_system_prompt_suffix()
-            if memory_suffix:
-                base += memory_suffix
-        except Exception as e:
-            logger.warning(f"Failed to build memory context: {e}")
-
-    return {"role": "system", "content": base}
-
-
-def handle_identify(msg: dict, session_id: str) -> dict | None:
-    """Handle user identification. Returns user preferences or None."""
-    user_id = msg.get("user_id", "").strip()
-    if not user_id:
-        return {"type": "error", "message": "user_id is required"}
-
-    user_name = msg.get("name", "").strip()
-    if user_name:
-        kira_memory.set_user_preference(user_id, "name", user_name)
-
-    prefs = kira_memory.get_user_preferences(user_id)
-    logger.info(f"[{session_id}] Identified as {user_id} (name={user_name or prefs.get('name', '')})")
-
-    return {
-        "type": "identified",
-        "user_id": user_id,
-        "preferences": prefs,
-    }
-
-
-def handle_set_preference(msg: dict, session_id: str, user_id: str) -> dict | None:
-    """Handle preference update. Returns success status."""
-    if not user_id or user_id == "default-user":
-        return {"type": "error", "message": "Must identify first"}
-
-    key = msg.get("key", "").strip()
-    value = msg.get("value", "").strip()
-
-    if not key:
-        return {"type": "error", "message": "key is required"}
-
-    ok = kira_memory.set_user_preference(user_id, key, value)
-    return {
-        "type": "preference_saved",
-        "key": key,
-        "success": ok,
-    }
-
-
@app.websocket("/api/ws")
 async def conversation_ws(websocket: WebSocket):
    await websocket.accept()
@@ -125,8 +54,102 @@ async def conversation_ws(websocket: WebSocket):
    identified = False
    logger.info(f"[{session_id}] WebSocket connected")

-    audio_buffer = bytearray()
-    conversation_history: list[dict] = []
+    # Track conversation for Honcho
+    pending_transcripts: list[tuple[str, str]] = []
+
+    # Will be set when Realtime relay is ready
+    relay_ready = asyncio.Event()
+    relay: RealtimeRelay | None = None
+    relay_task: asyncio.Task | None = None
+    audio_queue: asyncio.Queue[bytes] = asyncio.Queue()
+    text_queue: asyncio.Queue[str] = asyncio.Queue()
+
+    async def on_ready():
+        relay_ready.set()
+        logger.info(f"[{session_id}] Realtime relay ready")
+
+    async def on_audio_delta(audio_bytes: bytes):
+        """Forward audio chunks from OpenAI to the client."""
+        try:
+            audio_b64 = base64.b64encode(audio_bytes).decode("utf-8")
+            await websocket.send_json({
+                "type": "audio",
+                "data": audio_b64,
+            })
+        except Exception:
+            pass
+
+    async def on_transcript(text: str):
+        """Store transcripts for Honcho."""
+        pending_transcripts.append(("transcript", text))
+        role, content = text.split(": ", 1)
+        logger.info(f"[{session_id}] {role}: {content}")
+        await websocket.send_json({
+            "type": "transcript",
+            "role": role,
+            "text": content,
+        })
+
+    async def on_speech_started():
+        """Kira started speaking."""
+        await websocket.send_json({"type": "speaking_start"})
+
+    async def on_speech_stopped():
+        """Kira finished speaking."""
+        await websocket.send_json({"type": "speaking_end"})
+
+    async def on_interruption():
+        """User interrupted — Kira stops speaking."""
+        await websocket.send_json({"type": "interruption"})
+
+    async def on_error(msg: str):
+        await websocket.send_json({"type": "error", "message": msg})
+
+    # ── Create and start the Realtime relay ──
+    relay = RealtimeRelay(
+        on_audio_delta=on_audio_delta,
+        on_transcript=on_transcript,
+        on_speech_started=on_speech_started,
+        on_speech_stopped=on_speech_stopped,
+        on_interruption=on_interruption,
+        on_error=on_error,
+        on_ready=on_ready,
+    )
+
+    relay_task = asyncio.create_task(relay.connect())
+
+    # Wait for relay to be ready
+    try:
+        await asyncio.wait_for(relay_ready.wait(), timeout=15)
+    except asyncio.TimeoutError:
+        logger.error(f"[{session_id}] Realtime relay failed to connect")
+        await websocket.send_json({"type": "error", "message": "Failed to connect to AI"})
+        relay_task.cancel()
+        return
+
+    # ── Forward audio/text from client to relay ──
+    async def forward_audio():
+        while relay and relay._connected:
+            try:
+                pcm16 = await asyncio.wait_for(audio_queue.get(), timeout=1)
+                await relay.send_audio(pcm16)
+            except asyncio.TimeoutError:
+                continue
+            except Exception:
+                break
+
+    async def forward_text():
+        while relay and relay._connected:
+            try:
+                text = await asyncio.wait_for(text_queue.get(), timeout=1)
+                await relay.send_text(text)
+            except asyncio.TimeoutError:
+                continue
+            except Exception:
+                break
+
+    fwd_audio_task = asyncio.create_task(forward_audio())
+    fwd_text_task = asyncio.create_task(forward_text())

    try:
        while True:
@@ -134,110 +157,98 @@ async def conversation_ws(websocket: WebSocket):
            msg = json.loads(raw)
            msg_type = msg.get("type", "")

-            # ── Identity & Preferences ──
-
+            # ── Identity ──
            if msg_type == "identify":
-                response = handle_identify(msg, session_id)
-                if response:
-                    await websocket.send_json(response)
-                    if response["type"] == "identified":
-                        user_id = response["user_id"]
-                        identified = True
-                        # Set up Honcho for this user
-                        if kira_memory.enabled:
-                            try:
-                                kira_memory.ensure_peers(user_id)
-                                kira_memory.ensure_session(session_id)
-                                logger.info(f"[{session_id}] Honcho session ready for {user_id}")
-                            except Exception as e:
-                                logger.warning(f"[{session_id}] Honcho setup failed: {e}")
+                user_id = msg.get("user_id", "").strip()
+                user_name = msg.get("name", "").strip()
+                if user_name and user_id:
+                    kira_memory.set_user_preference(user_id, "name", user_name)
+
+                prefs = kira_memory.get_user_preferences(user_id)
+                identified = True
+
+                if kira_memory.enabled:
+                    kira_memory.ensure_peers(user_id)
+                    kira_memory.ensure_session(session_id)
+
+                # Inject Honcho context into the Realtime session instructions
+                memory_suffix = ""
+                if kira_memory.enabled:
+                    try:
+                        ctx = kira_memory.build_system_prompt_suffix()
+                        if ctx:
+                            memory_suffix = ctx
+                    except Exception:
+                        pass
+
+                if relay and relay._connected and memory_suffix:
+                    await relay._send({
+                        "type": "session.update",
+                        "session": {
+                            "instructions": (
+                                "You are Kira, a warm, kind, and encouraging AI body double. "
+                                "Speak in a friendly, girly-pop tone. Help someone with ADHD "
+                                "stay focused. Keep responses short and supportive. "
+                                "Check in, remind breaks, celebrate wins. Never judgmental."
+                                + memory_suffix
+                            ),
+                        },
+                    })
+
+                await websocket.send_json({
+                    "type": "identified",
+                    "user_id": user_id,
+                    "preferences": prefs,
+                })
                continue

+            # ── Preferences ──
            if msg_type == "set_preference":
-                response = handle_set_preference(msg, session_id, user_id)
-                if response:
-                    await websocket.send_json(response)
+                key = msg.get("key", "").strip()
+                value = msg.get("value", "").strip()
+                if key and user_id and user_id != "default-user":
+                    kira_memory.set_user_preference(user_id, key, value)
                continue

-            # ── Conversation ──
+            # ── Audio from frontend (PCM16) ──
+            if msg_type == "audio":
+                audio_b64 = msg.get("data", "")
+                if audio_b64:
+                    pcm16 = base64.b64decode(audio_b64)
+                    await audio_queue.put(pcm16)
+                continue

-            system_prompt = build_system_prompt(user_id)
+            # ── Text input ──
+            if msg_type == "conversation_text":
+                text = msg.get("text", "").strip()
+                if text:
+                    await text_queue.put(text)
+                    # Also store in Honcho immediately
+                    if kira_memory.enabled and identified:
+                        kira_memory.store_user_message(text)
+                continue

-            if msg_type == "audio_chunk":
-                chunk = base64.b64decode(msg["data"])
-                audio_buffer.extend(chunk)
-
-            elif msg_type == "transcribe":
-                if not audio_buffer:
-                    await websocket.send_json({"type": "error", "message": "No audio data"})
-                    continue
-
-                logger.info(f"[{session_id}] Transcribing {len(audio_buffer)} bytes...")
-
-                transcript = await transcribe_audio(bytes(audio_buffer))
-                audio_buffer.clear()
-
-                if not transcript:
-                    await websocket.send_json({"type": "error", "message": "Could not transcribe audio"})
-                    continue
-
-                await websocket.send_json({"type": "transcript", "text": transcript})
-
-                logger.info(f"[{session_id}] User: {transcript}")
-                conversation_history.append({"role": "user", "content": transcript})
-
-                messages = [system_prompt] + conversation_history[-10:]
-                kira_text = await get_kira_response(messages)
-
-                conversation_history.append({"role": "assistant", "content": kira_text})
-                logger.info(f"[{session_id}] Kira: {kira_text}")
-
-                if kira_memory.enabled and identified:
-                    try:
-                        kira_memory.store_messages(transcript, kira_text)
-                    except Exception as e:
-                        logger.warning(f"[{session_id}] Failed to store messages: {e}")
-
-                await websocket.send_json({"type": "speaking_start", "text": kira_text})
-                audio_bytes = await synthesize_speech(kira_text)
-                audio_b64 = base64.b64encode(audio_bytes).decode("utf-8")
-                await websocket.send_json({"type": "audio", "data": audio_b64, "text": kira_text})
-                await websocket.send_json({"type": "speaking_end"})
-
-            elif msg_type == "ping":
+            if msg_type == "ping":
                await websocket.send_json({"type": "pong"})

-            elif msg_type == "conversation_text":
-                user_text = msg.get("text", "").strip()
-                if not user_text:
-                    continue
-
-                logger.info(f"[{session_id}] User (text): {user_text}")
-                conversation_history.append({"role": "user", "content": user_text})
-
-                messages = [system_prompt] + conversation_history[-10:]
-                kira_text = await get_kira_response(messages)
-
-                conversation_history.append({"role": "assistant", "content": kira_text})
-                logger.info(f"[{session_id}] Kira: {kira_text}")
-
-                if kira_memory.enabled and identified:
-                    try:
-                        kira_memory.store_messages(user_text, kira_text)
-                    except Exception as e:
-                        logger.warning(f"[{session_id}] Failed to store messages: {e}")
-
-                await websocket.send_json({"type": "speaking_start", "text": kira_text})
-                audio_bytes = await synthesize_speech(kira_text)
-                audio_b64 = base64.b64encode(audio_bytes).decode("utf-8")
-                await websocket.send_json({"type": "audio", "data": audio_b64, "text": kira_text})
-                await websocket.send_json({"type": "speaking_end"})
-
    except WebSocketDisconnect:
        logger.info(f"[{session_id}] Disconnected")
    except Exception as e:
        logger.error(f"[{session_id}] Error: {e}")
-        try:
-            await websocket.send_json({"type": "error", "message": str(e)})
-        except Exception:
-            pass
+    finally:
+        # Store pending transcripts in Honcho
+        if kira_memory.enabled and identified:
+            for _, transcript_text in pending_transcripts:
+                if transcript_text.startswith("user: "):
+                    content = transcript_text[6:]
+                    kira_memory.store_user_message(content)
+                elif transcript_text.startswith("assistant: "):
+                    content = transcript_text[11:]
+                    kira_memory.store_kira_message(content)
+
+        fwd_audio_task.cancel()
+        fwd_text_task.cancel()
+        if relay:
+            await relay.disconnect()
+        if relay_task:
+            relay_task.cancel()
@@ -7,3 +7,4 @@ pydantic>=2.10.0
 pydantic-settings>=2.7.0
 httpx>=0.28.0
 honcho-ai>=2.1.0
+openai[realtime]>=2.41.0
@@ -0,0 +1,191 @@
+"""OpenAI Realtime API relay service.
+
+Manages a WebSocket connection to OpenAI's Realtime API and relays
+audio/text events between the client and OpenAI.
+"""
+
+import json
+import logging
+from typing import Callable, Awaitable
+from config import settings
+
+logger = logging.getLogger("kira.realtime")
+
+# ─── System instructions for Kira's personality ───
+# Base prompt sent as the ``instructions`` field of the initial
+# ``session.update`` issued from ``RealtimeRelay.connect()``.
+
+KIRA_INSTRUCTIONS = (
+    "You are Kira, a warm, kind, and encouraging AI body double. "
+    "You speak in a friendly, girly-pop tone. You are helping someone with ADHD "
+    "stay focused and on task. Keep responses short, supportive, and uplifting. "
+    "Check in on them. Remind them to take breaks. Celebrate small wins. "
+    "Use occasional emoji but don't overdo it. Never be judgmental. "
+    "You remember things about them between conversations."
+)
+
+
+class RealtimeRelay:
+    """Relay audio and text between a client WebSocket and the OpenAI Realtime API.
+
+    The relay owns a single Realtime connection.  ``connect()`` opens it,
+    configures the session and then stays in a receive loop, translating
+    server events into the async callbacks supplied to the constructor.
+    ``send_audio()`` and ``send_text()`` push client input upstream and
+    ``disconnect()`` ends the loop.
+
+    Transcripts reach ``on_transcript`` as one string of the form
+    ``"<role>: <text>"`` where ``<role>`` is ``user`` or ``assistant``.
+    """
+
+    def __init__(
+        self,
+        on_audio_delta: Callable[[bytes], Awaitable[None]],
+        on_transcript: Callable[[str], Awaitable[None]],
+        on_speech_started: Callable[[], Awaitable[None]],
+        on_speech_stopped: Callable[[], Awaitable[None]],
+        on_interruption: Callable[[], Awaitable[None]],
+        on_error: Callable[[str], Awaitable[None]],
+        on_ready: Callable[[], Awaitable[None]],
+    ):
+        """Store the event callbacks; no network activity happens here.
+
+        Args:
+            on_audio_delta: Receives each decoded audio chunk from the model.
+            on_transcript: Receives ``"user: ..."`` / ``"assistant: ..."`` lines.
+            on_speech_started: Called when the model's speech starts.
+            on_speech_stopped: Called when the model's speech stops.
+            on_interruption: Called when the user starts speaking.
+            on_error: Receives a human-readable error message.
+            on_ready: Called once the session has been configured.
+        """
+        self._on_audio_delta = on_audio_delta
+        self._on_transcript = on_transcript
+        self._on_speech_started = on_speech_started
+        self._on_speech_stopped = on_speech_stopped
+        self._on_interruption = on_interruption
+        self._on_error = on_error
+        self._on_ready = on_ready
+        self._conn = None
+        self._connected = False
+
+    @staticmethod
+    def _field(obj, name, default=None):
+        """Read ``name`` from an SDK event object or a plain dict.
+
+        Events may arrive as typed model objects (attribute access) or as
+        dicts (key access).  Returns ``default`` when ``obj`` is ``None`` or
+        the field is missing or ``None``; never raises for a missing field.
+        """
+        if obj is None:
+            return default
+        if isinstance(obj, dict):
+            value = obj.get(name)
+        else:
+            value = getattr(obj, name, None)
+        return default if value is None else value
+
+    async def connect(self):
+        """Open the Realtime connection and pump events until it closes.
+
+        Runs for the lifetime of the connection, so callers normally schedule
+        it as a task.  ``on_ready`` fires after the initial ``session.update``
+        has been sent.  Connection failures are logged and reported through
+        ``on_error`` instead of being raised.
+        """
+        if self._connected:
+            return
+
+        try:
+            from openai import AsyncOpenAI
+
+            client = AsyncOpenAI(api_key=settings.openai_api_key)
+
+            logger.info("Connecting to OpenAI Realtime API...")
+            async with client.beta.realtime.connect(
+                model="gpt-4o-mini-realtime-preview-2025-07-18",
+            ) as conn:
+                self._conn = conn
+                self._connected = True
+                logger.info("Connected to OpenAI Realtime API")
+
+                # Configure session
+                await self._send({
+                    "type": "session.update",
+                    "session": {
+                        "instructions": KIRA_INSTRUCTIONS,
+                        "voice": "alloy",
+                        # Transcription is switched on by naming a model;
+                        # NOTE(review): the schema has no "enabled" flag as far
+                        # as I can tell — confirm against the API reference.
+                        "input_audio_transcription": {"model": "whisper-1"},
+                        "turn_detection": {
+                            "type": "server_vad",
+                            "threshold": 0.5,
+                            "prefix_padding_ms": 300,
+                            "silence_duration_ms": 600,
+                        },
+                    },
+                })
+
+                await self._on_ready()
+
+                # Listen for events
+                while self._connected:
+                    try:
+                        event = await conn.recv()
+                    except Exception as e:
+                        # disconnect() clears the flag before closing, so a
+                        # failure is only worth reporting while still connected.
+                        if self._connected:
+                            logger.warning(f"Realtime recv error: {e}")
+                        break
+
+                    # A failing callback (for example a closed client socket)
+                    # must not tear down the upstream connection.
+                    try:
+                        await self._handle_event(event)
+                    except Exception as e:
+                        logger.warning(f"Realtime event handling error: {e}")
+
+        except ImportError:
+            logger.error("openai[realtime] not installed — run: pip install 'openai[realtime]'")
+            await self._on_error("Missing openai[realtime] dependency")
+        except Exception as e:
+            logger.error(f"Realtime connection error: {e}")
+            await self._on_error(str(e))
+        finally:
+            self._connected = False
+            self._conn = None
+
+    async def _handle_event(self, event):
+        """Dispatch one server event to the matching callback.
+
+        Unknown event types are ignored.  Fields are read through
+        ``_field`` so empty or missing values never raise.
+        """
+        field = self._field
+        event_type = field(event, "type", "")
+
+        if event_type == "response.audio.delta":
+            audio_b64 = field(event, "delta", "")
+            if audio_b64:
+                import base64
+                audio_bytes = base64.b64decode(audio_b64)
+                await self._on_audio_delta(audio_bytes)
+
+        # NOTE(review): confirm the two ``response.audio_buffer.*`` names below
+        # are actually emitted on this transport; they are kept as written.
+        elif event_type == "response.audio_buffer.speech_started":
+            await self._on_speech_started()
+
+        elif event_type == "response.audio_buffer.speech_stopped":
+            await self._on_speech_stopped()
+
+        elif event_type == "input_audio_buffer.speech_started":
+            # User started speaking — interrupt Kira
+            await self._on_interruption()
+
+        elif event_type == "response.audio_transcript.done":
+            # Final transcript of what the model said
+            spoken = field(event, "transcript", "")
+            if spoken:
+                await self._on_transcript(f"assistant: {spoken}")
+
+        elif event_type == "conversation.item.input_audio_transcription.completed":
+            # Final transcript of the user's audio turn
+            heard = field(event, "transcript", "")
+            if heard:
+                await self._on_transcript(f"user: {heard}")
+
+        elif event_type == "conversation.item.created":
+            item = field(event, "item")
+            role = field(item, "role", "")
+            for part in field(item, "content", []):
+                part_type = field(part, "type", "")
+                part_text = field(part, "text", "")
+                if part_type == "text" and part_text and role == "assistant":
+                    await self._on_transcript(f"assistant: {part_text}")
+                part_transcript = field(part, "transcript", "")
+                if part_type == "transcript" and part_transcript and role == "user":
+                    await self._on_transcript(f"user: {part_transcript}")
+
+        elif event_type == "error":
+            err = field(event, "error")
+            msg = field(err, "message") or str(event)
+            logger.warning(f"Realtime API error: {msg}")
+            await self._on_error(msg)
+
+    async def send_audio(self, pcm16_bytes: bytes):
+        """Append one PCM16 audio chunk to the input audio buffer.
+
+        Silently does nothing when the relay is not connected; send failures
+        are logged, not raised.
+        """
+        if not self._connected or not self._conn:
+            return
+        try:
+            import base64
+            audio_b64 = base64.b64encode(pcm16_bytes).decode("utf-8")
+            await self._send({
+                "type": "input_audio_buffer.append",
+                "audio": audio_b64,
+            })
+        except Exception as e:
+            logger.warning(f"Failed to send audio: {e}")
+
+    async def send_text(self, text: str):
+        """Add a user text message to the conversation and request a response.
+
+        Silently does nothing when the relay is not connected; send failures
+        are logged, not raised.
+        """
+        if not self._connected or not self._conn:
+            return
+        try:
+            await self._send({
+                "type": "conversation.item.create",
+                "item": {
+                    "type": "message",
+                    "role": "user",
+                    "content": [{"type": "input_text", "text": text}],
+                },
+            })
+            await self._send({"type": "response.create"})
+        except Exception as e:
+            logger.warning(f"Failed to send text: {e}")
+
+    async def _send(self, data: dict):
+        """Send one JSON event upstream; failures are logged, never raised."""
+        conn = self._conn
+        if conn is None:
+            logger.warning("Realtime send skipped: not connected")
+            return
+        try:
+            await conn.send(data)
+        except Exception as e:
+            logger.warning(f"Realtime send error: {e}")
+
+    async def disconnect(self):
+        """Close the Realtime connection and stop the receive loop.
+
+        The connected flag is cleared first so the receive loop treats the
+        resulting close as expected.  Errors while closing are ignored
+        because the connection is being discarded anyway.
+        """
+        self._connected = False
+        if self._conn:
+            try:
+                await self._conn.close()
+            except Exception:
+                pass
+        self._conn = None
@@ -25,6 +25,41 @@ function saveUserId(id: string) {
  localStorage.setItem(USER_ID_KEY, id);
 }

+/**
+ * Capture mono audio from `stream` and deliver it as PCM16 chunks.
+ *
+ * An AudioContext created at 24 kHz feeds the stream through a
+ * ScriptProcessorNode (4096-frame buffers, one input channel). Each buffer
+ * is converted from float32 to signed 16-bit samples and handed to
+ * `onChunk` as raw bytes.
+ *
+ * @param stream  Media stream to read audio from.
+ * @param onChunk Called once per processed buffer with the PCM16 bytes.
+ * @returns A handle whose `stop()` tears the audio graph down; calling it
+ *          more than once is harmless.
+ */
+function startPCMCapture(
+  stream: MediaStream,
+  onChunk: (pcm16: Uint8Array) => void,
+): { stop: () => void } {
+  const ctx = new AudioContext({ sampleRate: 24000 });
+  const source = ctx.createMediaStreamSource(stream);
+  const processor = ctx.createScriptProcessor(4096, 1, 1);
+  let running = true;
+
+  processor.onaudioprocess = (e) => {
+    if (!running) return;
+    const input = e.inputBuffer.getChannelData(0); // Float32Array [-1, 1]
+    // Convert float32 → PCM16 int16
+    const pcm16 = new Int16Array(input.length);
+    for (let i = 0; i < input.length; i++) {
+      const s = Math.max(-1, Math.min(1, input[i]));
+      // Negative and positive halves scale differently because the int16
+      // range is asymmetric (-32768..32767).
+      pcm16[i] = s < 0 ? s * 0x8000 : s * 0x7fff;
+    }
+    onChunk(new Uint8Array(pcm16.buffer));
+  };
+
+  source.connect(processor);
+  // The processor only fires while it is connected onward in the graph.
+  processor.connect(ctx.destination);
+
+  return {
+    stop: () => {
+      // Guard against repeated calls: closing an already-closed context rejects.
+      if (!running) return;
+      running = false;
+      processor.onaudioprocess = null;
+      source.disconnect();
+      processor.disconnect();
+      // close() is async; a rejection here is not actionable during teardown.
+      void ctx.close().catch(() => {});
+    },
+  };
+}
+
 export function useConversation() {
  const [messages, setMessages] = useState<Message[]>([]);
  const [isConnected, setIsConnected] = useState(false);
@@ -38,11 +73,13 @@ export function useConversation() {
    accessory: '',
  });
  const [loadingPrefs, setLoadingPrefs] = useState(true);
+  const [micError, setMicError] = useState<string | null>(null);

  const wsRef = useRef<WebSocket | null>(null);
  const audioRef = useRef<HTMLAudioElement | null>(null);
-  const recorderRef = useRef<MediaRecorder | null>(null);
+  const captureRef = useRef<{ stop: () => void } | null>(null);
  const streamRef = useRef<MediaStream | null>(null);
+  const audioBufferRef = useRef<Uint8Array[]>([]);

  // Connect WebSocket
  const connect = useCallback(() => {
@@ -54,7 +91,6 @@ export function useConversation() {

    ws.onopen = () => {
      setIsConnected(true);
-      // Auto-identify if returning user
      const savedId = loadUserId();
      if (savedId) {
        ws.send(JSON.stringify({ type: 'identify', user_id: savedId }));
@@ -102,35 +138,56 @@ export function useConversation() {
        break;
      }

-      case 'preference_saved':
-        // Already optimistically updated locally
-        break;
-
      case 'transcript':
-        addMessage('user', msg.text);
+        addMessage(msg.role === 'user' ? 'user' : 'kira', msg.text);
        break;

      case 'speaking_start':
        setIsKiraSpeaking(true);
-        addMessage('kira', msg.text || '...');
        break;

-      case 'audio':
+      case 'audio': {
+        // Incoming PCM16 audio from Kira
        if (msg.data && audioRef.current) {
+          // Accumulate audio chunks and create a blob
          const binary = atob(msg.data);
          const bytes = new Uint8Array(binary.length);
          for (let i = 0; i < binary.length; i++) {
            bytes[i] = binary.charCodeAt(i);
          }
-          const blob = new Blob([bytes], { type: 'audio/ogg' });
+          audioBufferRef.current.push(bytes);
+
+          // Convert accumulated PCM16 to WAV blob for playback
+          const allChunks = audioBufferRef.current;
+          const totalLen = allChunks.reduce((s, c) => s + c.length, 0);
+          const combined = new Uint8Array(totalLen);
+          let offset = 0;
+          for (const chunk of allChunks) {
+            combined.set(chunk, offset);
+            offset += chunk.length;
+          }
+
+          const wav = pcm16ToWav(combined);
+          const blob = new Blob([wav], { type: 'audio/wav' });
          const url = URL.createObjectURL(blob);
          audioRef.current.src = url;
          audioRef.current.play().catch(() => {});
        }
        break;
+      }

      case 'speaking_end':
        setIsKiraSpeaking(false);
+        audioBufferRef.current = [];
+        break;
+
+      case 'interruption':
+        setIsKiraSpeaking(false);
+        audioBufferRef.current = [];
+        if (audioRef.current) {
+          audioRef.current.pause();
+          audioRef.current.currentTime = 0;
+        }
        break;

      case 'error':
@@ -154,99 +211,80 @@ export function useConversation() {
    setPreferences((p) => ({ ...p, name }));

    if (wsRef.current?.readyState === WebSocket.OPEN) {
-      wsRef.current.send(JSON.stringify({
-        type: 'identify',
-        user_id: userId,
-        name,
-      }));
+      wsRef.current.send(JSON.stringify({ type: 'identify', user_id: userId, name }));
    }
  }, []);

  // ── Preferences ──

  const setPreference = useCallback((key: string, value: string) => {
-    // Optimistic update
    setPreferences((p) => ({ ...p, [key]: value }));
-
-    // Sync to backend
    if (wsRef.current?.readyState === WebSocket.OPEN && identified) {
-      wsRef.current.send(JSON.stringify({
-        type: 'set_preference',
-        key,
-        value,
-      }));
+      wsRef.current.send(JSON.stringify({ type: 'set_preference', key, value }));
    }
  }, [identified]);

+  // ── Audio (Realtime PCM16) ──
+
+  const startRecording = useCallback(async () => {
+    // Check HTTPS
+    if (!navigator.mediaDevices || !navigator.mediaDevices.getUserMedia) {
+      addMessage('kira', 'Mic requires HTTPS. Try accessing via HTTPS!');
+      return;
+    }
+
+    try {
+      setMicError(null);
+      const stream = await navigator.mediaDevices.getUserMedia({ audio: true });
+      streamRef.current = stream;
+
+      const ws = wsRef.current;
+      if (!ws || ws.readyState !== WebSocket.OPEN) {
+        addMessage('kira', 'Not connected to server yet...');
+        stream.getTracks().forEach((t) => t.stop());
+        return;
+      }
+
+      // Start PCM16 capture — each chunk sent as WS message
+      const capture = startPCMCapture(stream, (pcm16) => {
+        if (ws.readyState === WebSocket.OPEN) {
+          const base64 = arrayBufferToBase64(pcm16.buffer);
+          ws.send(JSON.stringify({ type: 'audio', data: base64 }));
+        }
+      });
+
+      captureRef.current = capture;
+      setIsRecording(true);
+    } catch (err) {
+      const msg = err instanceof Error ? err.message : String(err);
+      setMicError(msg);
+      console.error('[Kira Mic]', msg);
+    }
+  }, [addMessage]);
+
+  const stopRecording = useCallback(() => {
+    captureRef.current?.stop();
+    captureRef.current = null;
+    streamRef.current?.getTracks().forEach((t) => t.stop());
+    streamRef.current = null;
+    setIsRecording(false);
+  }, []);
+
  // ── Text ──

  const sendText = useCallback((text: string) => {
    if (!text.trim()) return;
    if (wsRef.current?.readyState === WebSocket.OPEN) {
-      wsRef.current.send(JSON.stringify({
-        type: 'conversation_text',
-        text: text.trim(),
-      }));
+      wsRef.current.send(JSON.stringify({ type: 'conversation_text', text: text.trim() }));
    }
  }, []);

-  // ── Audio ──
-
-  const startRecording = useCallback(async () => {
-    // Check if mediaDevices is available (requires HTTPS/localhost)
-    if (!navigator.mediaDevices || !navigator.mediaDevices.getUserMedia) {
-      addMessage('kira', 'Your browser needs HTTPS to use the microphone. Try accessing Kira through the HTTPS address instead!');
-      return;
-    }
-
-    try {
-      const stream = await navigator.mediaDevices.getUserMedia({ audio: true });
-      streamRef.current = stream;
-
-      const recorder = new MediaRecorder(stream, {
-        mimeType: MediaRecorder.isTypeSupported('audio/webm;codecs=opus')
-          ? 'audio/webm;codecs=opus'
-          : 'audio/webm',
-      });
-
-      const chunks: BlobPart[] = [];
-      recorder.ondataavailable = (e) => {
-        if (e.data.size > 0) chunks.push(e.data);
-      };
-
-      recorder.onstop = () => {
-        const blob = new Blob(chunks, { type: 'audio/webm' });
-        const reader = new FileReader();
-        reader.onload = () => {
-          const base64 = (reader.result as string).split(',')[1];
-          if (wsRef.current?.readyState === WebSocket.OPEN) {
-            wsRef.current.send(JSON.stringify({ type: 'audio_chunk', data: base64 }));
-            wsRef.current.send(JSON.stringify({ type: 'transcribe' }));
-          }
-        };
-        reader.readAsDataURL(blob);
-
-        stream.getTracks().forEach((t) => t.stop());
-        setIsRecording(false);
-      };
-
-      recorder.start();
-      recorderRef.current = recorder;
-      setIsRecording(true);
-    } catch (err) {
-      console.error('[Kira Mic] failed:', err);
-    }
-  }, []);
-
-  const stopRecording = useCallback(() => {
-    recorderRef.current?.stop();
-  }, []);
-
  // Connect on mount
  useEffect(() => {
    connect();
    return () => {
      wsRef.current?.close();
+      captureRef.current?.stop();
      streamRef.current?.getTracks().forEach((t) => t.stop());
    };
  }, [connect]);
@@ -259,6 +297,7 @@ export function useConversation() {
    identified,
    preferences,
    loadingPrefs,
+    micError,
    identify,
    setPreference,
    sendText,
@@ -266,3 +305,61 @@ export function useConversation() {
    stopRecording,
  };
 }
+
+// ── Helpers ──
+
+/** Base64-encode the raw bytes of `buffer`. */
+function arrayBufferToBase64(buffer: ArrayBufferLike): string {
+  // btoa() expects a "binary string": one character per byte.
+  const chars: string[] = [];
+  new Uint8Array(buffer).forEach((byte) => {
+    chars.push(String.fromCharCode(byte));
+  });
+  return btoa(chars.join(''));
+}
+
+/**
+ * Wrap raw PCM16 mono 24 kHz sample bytes in a 44-byte RIFF/WAVE header.
+ *
+ * @param pcm16 Header-less 16-bit sample bytes; copied verbatim.
+ * @returns A new ArrayBuffer holding the WAV header followed by the samples.
+ */
+function pcm16ToWav(pcm16: Uint8Array): ArrayBuffer {
+  const numChannels = 1;
+  const sampleRate = 24000;
+  const bitsPerSample = 16;
+  const byteRate = sampleRate * numChannels * (bitsPerSample / 8);
+  const blockAlign = numChannels * (bitsPerSample / 8);
+  const dataSize = pcm16.length;
+  const headerSize = 44;
+  const totalSize = headerSize + dataSize;
+
+  const buf = new ArrayBuffer(totalSize);
+  const view = new DataView(buf);
+
+  // RIFF header
+  writeString(view, 0, 'RIFF');
+  view.setUint32(4, totalSize - 8, true); // chunk size excludes 'RIFF' + this field
+  writeString(view, 8, 'WAVE');
+
+  // fmt subchunk
+  writeString(view, 12, 'fmt ');
+  view.setUint32(16, 16, true); // subchunk size
+  view.setUint16(20, 1, true);  // PCM
+  view.setUint16(22, numChannels, true);
+  view.setUint32(24, sampleRate, true);
+  view.setUint32(28, byteRate, true);
+  view.setUint16(32, blockAlign, true);
+  view.setUint16(34, bitsPerSample, true);
+
+  // data subchunk
+  writeString(view, 36, 'data');
+  view.setUint32(40, dataSize, true);
+
+  // PCM data: one bulk copy after the header instead of a DataView write
+  // per byte — this runs on every incoming audio chunk.
+  new Uint8Array(buf, headerSize).set(pcm16);
+
+  return buf;
+}
+
+/** Write `str` into `view` at `offset`, one byte per UTF-16 code unit. */
+function writeString(view: DataView, offset: number, str: string) {
+  str.split('').forEach((unit, idx) => {
+    view.setUint8(offset + idx, unit.charCodeAt(0));
+  });
+}