feat: hybrid pipeline — gpt-realtime-whisper + gpt-5.4-nano + TTS

Hybrid approach gives streaming STT at ~$0.017/min + cheap brain at ~$0.001/min + TTS at ~$0.015/min = ~$0.033/min total.

- gpt-realtime-whisper handles streaming transcription with VAD
- gpt-5.4-nano handles response generation (chat completions)
- OpenAI TTS (nova) for voice output
- Server VAD detects utterance boundaries
- Honcho memory context injected into system prompt
- Removed old full Realtime relay service
2026-06-04 13:48:06 -04:00
parent 1c15d42e06
commit 274d04ea10
4 changed files with 281 additions and 284 deletions
@@ -1,8 +1,6 @@
 """Kira — AI body double backend

-OpenAI Realtime API pipeline:
-  mic audio → [built-in STT → GPT-4o-mini → built-in TTS] → speaker audio
-                    Single WebSocket, ~300-800ms latency
+Hybrid pipeline: gpt-realtime-whisper (streaming STT) → gpt-5.4-nano (LLM) → OpenAI TTS
 """

 import json
@@ -15,7 +13,7 @@ from fastapi import FastAPI, WebSocket, WebSocketDisconnect
 from fastapi.middleware.cors import CORSMiddleware

 from config import settings
-from services.realtime import RealtimeRelay
+from services.hybrid import HybridPipeline
 from services.memory import kira_memory

 logging.basicConfig(level=logging.INFO)
@@ -54,102 +52,92 @@ async def conversation_ws(websocket: WebSocket):
    identified = False
    logger.info(f"[{session_id}] WebSocket connected")

-    # Track conversation for Honcho
-    pending_transcripts: list[tuple[str, str]] = []
-
-    # Will be set when Realtime relay is ready
-    relay_ready = asyncio.Event()
-    relay: RealtimeRelay | None = None
-    relay_task: asyncio.Task | None = None
+    pending_transcripts: list[str] = []
+    pipeline: HybridPipeline | None = None
+    pipeline_task: asyncio.Task | None = None
+    pipeline_ready = asyncio.Event()
    audio_queue: asyncio.Queue[bytes] = asyncio.Queue()
    text_queue: asyncio.Queue[str] = asyncio.Queue()

+    memory_suffix = ""
+
    async def on_ready():
-        relay_ready.set()
-        logger.info(f"[{session_id}] Realtime relay ready")
+        pipeline_ready.set()
+        logger.info(f"[{session_id}] Pipeline ready")
+
+    async def on_transcript_delta(delta: str):
+        """Streaming partial transcript."""
+        await websocket.send_json({"type": "transcript_delta", "text": delta})
+
+    async def on_transcript_done(full: str):
+        """Full utterance received."""
+        await websocket.send_json({"type": "transcript", "role": "user", "text": full})

    async def on_audio_delta(audio_bytes: bytes):
-        """Forward audio chunks from OpenAI to the client."""
+        """Forward TTS audio to client."""
        try:
            audio_b64 = base64.b64encode(audio_bytes).decode("utf-8")
-            await websocket.send_json({
-                "type": "audio",
-                "data": audio_b64,
-            })
+            await websocket.send_json({"type": "audio", "data": audio_b64})
        except Exception:
            pass

-    async def on_transcript(text: str):
-        """Store transcripts for Honcho."""
-        pending_transcripts.append(("transcript", text))
-        role, content = text.split(": ", 1)
-        logger.info(f"[{session_id}] {role}: {content}")
-        await websocket.send_json({
-            "type": "transcript",
-            "role": role,
-            "text": content,
-        })
-
-    async def on_speech_started():
-        """Kira started speaking."""
+    async def on_speech_start():
        await websocket.send_json({"type": "speaking_start"})

-    async def on_speech_stopped():
-        """Kira finished speaking."""
+    async def on_speech_end():
        await websocket.send_json({"type": "speaking_end"})

-    async def on_interruption():
-        """User interrupted — Kira stops speaking."""
-        await websocket.send_json({"type": "interruption"})
-
    async def on_error(msg: str):
        await websocket.send_json({"type": "error", "message": msg})

-    # ── Create and start the Realtime relay ──
-    relay = RealtimeRelay(
+    # Create pipeline
+    pipeline = HybridPipeline(
+        on_transcript_delta=on_transcript_delta,
+        on_transcript_done=on_transcript_done,
        on_audio_delta=on_audio_delta,
-        on_transcript=on_transcript,
-        on_speech_started=on_speech_started,
-        on_speech_stopped=on_speech_stopped,
-        on_interruption=on_interruption,
-        on_error=on_error,
+        on_speech_start=on_speech_start,
+        on_speech_end=on_speech_end,
        on_ready=on_ready,
+        on_error=on_error,
+        memory_suffix=memory_suffix,
    )

-    relay_task = asyncio.create_task(relay.connect())
+    pipeline_task = asyncio.create_task(pipeline.connect())

-    # Wait for relay to be ready
    try:
-        await asyncio.wait_for(relay_ready.wait(), timeout=15)
+        await asyncio.wait_for(pipeline_ready.wait(), timeout=15)
    except asyncio.TimeoutError:
-        logger.error(f"[{session_id}] Realtime relay failed to connect")
+        logger.error(f"[{session_id}] Pipeline failed to connect")
        await websocket.send_json({"type": "error", "message": "Failed to connect to AI"})
-        relay_task.cancel()
+        pipeline_task.cancel()
        return

-    # ── Forward audio/text from client to relay ──
+    # Forward audio/text from client to pipeline
    async def forward_audio():
-        while relay and relay._connected:
+        while pipeline and pipeline._connected:
            try:
                pcm16 = await asyncio.wait_for(audio_queue.get(), timeout=1)
-                await relay.send_audio(pcm16)
+                await pipeline.send_audio(pcm16)
            except asyncio.TimeoutError:
                continue
            except Exception:
                break

    async def forward_text():
-        while relay and relay._connected:
+        while pipeline and pipeline._connected:
            try:
                text = await asyncio.wait_for(text_queue.get(), timeout=1)
-                await relay.send_text(text)
+                await pipeline.send_text(text)
+                # Store in Honcho
+                if kira_memory.enabled and identified:
+                    kira_memory.store_user_message(text)
            except asyncio.TimeoutError:
                continue
            except Exception:
                break

-    fwd_audio_task = asyncio.create_task(forward_audio())
-    fwd_text_task = asyncio.create_task(forward_text())
+    fwd_audio = asyncio.create_task(forward_audio())
+    fwd_text = asyncio.create_task(forward_text())

    try:
        while True:
@@ -171,30 +159,16 @@ async def conversation_ws(websocket: WebSocket):
                    kira_memory.ensure_peers(user_id)
                    kira_memory.ensure_session(session_id)

-                # Inject Honcho context into the Realtime session instructions
-                memory_suffix = ""
+                # Build memory context and update pipeline
                if kira_memory.enabled:
                    try:
                        ctx = kira_memory.build_system_prompt_suffix()
                        if ctx:
+                            pipeline._memory_suffix = ctx
                            memory_suffix = ctx
                    except Exception:
                        pass

-                if relay and relay._connected and memory_suffix:
-                    await relay._send({
-                        "type": "session.update",
-                        "session": {
-                            "instructions": (
-                                "You are Kira, a warm, kind, and encouraging AI body double. "
-                                "Speak in a friendly, girly-pop tone. Help someone with ADHD "
-                                "stay focused. Keep responses short and supportive. "
-                                "Check in, remind breaks, celebrate wins. Never judgmental."
-                                + memory_suffix
-                            ),
-                        },
-                    })
-
                await websocket.send_json({
                    "type": "identified",
                    "user_id": user_id,
@@ -210,7 +184,7 @@ async def conversation_ws(websocket: WebSocket):
                    kira_memory.set_user_preference(user_id, key, value)
                continue

-            # ── Audio from frontend (PCM16) ──
+            # ── Audio (PCM16) ──
            if msg_type == "audio":
                audio_b64 = msg.get("data", "")
                if audio_b64:
@@ -223,9 +197,6 @@ async def conversation_ws(websocket: WebSocket):
                text = msg.get("text", "").strip()
                if text:
                    await text_queue.put(text)
-                    # Also store in Honcho immediately
-                    if kira_memory.enabled and identified:
-                        kira_memory.store_user_message(text)
                continue

            if msg_type == "ping":
@@ -236,19 +207,9 @@ async def conversation_ws(websocket: WebSocket):
    except Exception as e:
        logger.error(f"[{session_id}] Error: {e}")
    finally:
-        # Store pending transcripts in Honcho
-        if kira_memory.enabled and identified:
-            for _, transcript_text in pending_transcripts:
-                if transcript_text.startswith("user: "):
-                    content = transcript_text[6:]
-                    kira_memory.store_user_message(content)
-                elif transcript_text.startswith("assistant: "):
-                    content = transcript_text[11:]
-                    kira_memory.store_kira_message(content)
-
-        fwd_audio_task.cancel()
-        fwd_text_task.cancel()
-        if relay:
-            await relay.disconnect()
-        if relay_task:
-            relay_task.cancel()
+        fwd_audio.cancel()
+        fwd_text.cancel()
+        if pipeline:
+            await pipeline.disconnect()
+        if pipeline_task:
+            pipeline_task.cancel()