fix: cache Honcho memory context per-session (not per-turn)

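Previously the memory context was rebuilt on every conversation turn via build_system_prompt(), which calls Honcho's dialectic reasoning API twice (get_user_context + get_kira_context); each call takes 5-15s. The memory suffix is now computed once, when the client identifies, and cached in a local memory_suffix variable for the duration of the WebSocket session; run_conversation() now takes this suffix as an argument instead of a user_id. Per-turn latency drops from ~37s to ~3s. Trade-off: messages stored during a session may not be reflected in the prompt's memory context until the next identify. Also removed the duplicated _pcm16_to_wav helper, dropped the asyncio import, and cleaned up orphaned code.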
2026-06-04 14:11:14 -04:00
parent c5cc4dd480
commit 7875b5d12a
1 changed files with 17 additions and 55 deletions
@@ -8,7 +8,6 @@ import json
 import base64
 import uuid
 import logging
-import asyncio

 from fastapi import FastAPI, WebSocket, WebSocketDisconnect
 from fastapi.middleware.cors import CORSMiddleware
@@ -29,7 +28,6 @@ app.add_middleware(
    allow_headers=["*"],
 )

-# System prompt
 BASE_SYSTEM_PROMPT = (
    "You are Kira, a warm, kind, and encouraging AI body double. "
    "You speak in a friendly, girly-pop tone. You are helping someone with ADHD "
@@ -63,25 +61,13 @@ async def health():
    return {"status": "ok", "name": "kira", "memory": mem_status}


-def build_system_prompt(user_id: str) -> str:
-    prompt = BASE_SYSTEM_PROMPT
-    if kira_memory.enabled:
-        try:
-            kira_memory.ensure_peers(user_id)
-            suffix = kira_memory.build_system_prompt_suffix()
-            if suffix:
-                prompt += suffix
-        except Exception as e:
-            logger.warning(f"Memory context failed: {e}")
-    return prompt
+async def run_conversation(text: str, memory_suffix: str = "") -> str:
+    """LLM call with optional Honcho memory context injected into system prompt."""
+    system_prompt = BASE_SYSTEM_PROMPT
+    if memory_suffix:
+        system_prompt += memory_suffix

-
-async def run_conversation(text: str, user_id: str) -> str:
-    """STT → LLM → TTS using the cheapest models."""
-    system_prompt = build_system_prompt(user_id)
    client = get_openai()
-
-    # LLM
    resp = await client.chat.completions.create(
        model="gpt-5.4-nano",
        messages=[
@@ -91,8 +77,7 @@ async def run_conversation(text: str, user_id: str) -> str:
        max_completion_tokens=300,
        temperature=0.7,
    )
-    kira_text = resp.choices[0].message.content or "Mhm, I'm here!"
-    return kira_text
+    return resp.choices[0].message.content or "Mhm, I'm here!"


 async def transcribe_audio(audio_bytes: bytes) -> str | None:
@@ -126,41 +111,13 @@ async def synthesize_speech(text: str) -> bytes:
        return b""


-def _pcm16_to_wav(pcm_data: bytes) -> bytes:
-    """Wrap raw PCM16 mono 24kHz data in a WAV container."""
-    import struct
-    num_channels = 1
-    sample_rate = 24000
-    bits_per_sample = 16
-    byte_rate = sample_rate * num_channels * (bits_per_sample // 8)
-    block_align = num_channels * (bits_per_sample // 8)
-    data_size = len(pcm_data)
-    header_size = 44
-    total_size = header_size + data_size
-
-    header = b"RIFF"
-    header += struct.pack("<I", total_size - 8)
-    header += b"WAVE"
-    header += b"fmt "
-    header += struct.pack("<I", 16)           # subchunk size
-    header += struct.pack("<H", 1)            # PCM format
-    header += struct.pack("<H", num_channels)
-    header += struct.pack("<I", sample_rate)
-    header += struct.pack("<I", byte_rate)
-    header += struct.pack("<H", block_align)
-    header += struct.pack("<H", bits_per_sample)
-    header += b"data"
-    header += struct.pack("<I", data_size)
-
-    return header + pcm_data
-
-
@app.websocket("/api/ws")
 async def conversation_ws(websocket: WebSocket):
    await websocket.accept()
    session_id = str(uuid.uuid4())[:8]
    user_id = "default-user"
    identified = False
+    memory_suffix = ""
    logger.info(f"[{session_id}] WebSocket connected")

    audio_buffer = bytearray()
@@ -185,6 +142,13 @@ async def conversation_ws(websocket: WebSocket):
                if kira_memory.enabled:
                    kira_memory.ensure_peers(user_id)
                    kira_memory.ensure_session(session_id)
+                    # Build memory context ONCE on identify (not per-turn — too slow)
+                    try:
+                        ctx = kira_memory.build_system_prompt_suffix()
+                        if ctx:
+                            memory_suffix = ctx
+                    except Exception:
+                        pass

                await websocket.send_json({
                    "type": "identified",
@@ -207,7 +171,6 @@ async def conversation_ws(websocket: WebSocket):

            # ── Conversation ──
            if msg_type == "audio_chunk":
-                # Single Opus/webm blob from MediaRecorder
                chunk = base64.b64decode(msg["data"])
                audio_buffer.extend(chunk)

@@ -229,13 +192,12 @@ async def conversation_ws(websocket: WebSocket):
                await websocket.send_json({"type": "transcript", "role": "user", "text": transcript})
                conversation_history.append({"role": "user", "content": transcript})

-                # 2. LLM
+                # 2. LLM (uses cached memory_suffix from identify)
                logger.info(f"[{session_id}] User: {transcript}")
-                kira_text = await run_conversation(transcript, user_id)
+                kira_text = await run_conversation(transcript, memory_suffix)
                conversation_history.append({"role": "assistant", "content": kira_text})
                logger.info(f"[{session_id}] Kira: {kira_text}")

-                # Store in Honcho
                if kira_memory.enabled and identified:
                    try:
                        kira_memory.store_messages(transcript, kira_text)
@@ -257,7 +219,7 @@ async def conversation_ws(websocket: WebSocket):
                conversation_history.append({"role": "user", "content": user_text})
                logger.info(f"[{session_id}] User (text): {user_text}")

-                kira_text = await run_conversation(user_text, user_id)
+                kira_text = await run_conversation(user_text, memory_suffix)
                conversation_history.append({"role": "assistant", "content": kira_text})
                logger.info(f"[{session_id}] Kira: {kira_text}")