fix: cache Honcho memory context per-session (not per-turn)

Previously the memory context was rebuilt on every conversation turn: run_conversation() called build_system_prompt(), which calls Honcho's dialectic reasoning API twice (get_user_context + get_kira_context), and each call takes 5-15s. The memory suffix is now computed once, when the client identifies, and cached in the memory_suffix variable for the rest of the WebSocket session, so per-turn latency drops from ~37s to ~3s. Also removed the duplicated _pcm16_to_wav and cleaned up orphaned code.
2026-06-04 14:11:14 -04:00
parent c5cc4dd480
commit 7875b5d12a
1 changed files with 17 additions and 55 deletions
@@ -8,7 +8,6 @@ import json
 import base64
 import uuid
 import logging
 import asyncio
 from fastapi import FastAPI, WebSocket, WebSocketDisconnect
 from fastapi.middleware.cors import CORSMiddleware
@@ -29,7 +28,6 @@ app.add_middleware(
    allow_headers=["*"],
 )
 # System prompt
 BASE_SYSTEM_PROMPT = (
    "You are Kira, a warm, kind, and encouraging AI body double. "
    "You speak in a friendly, girly-pop tone. You are helping someone with ADHD "
@@ -63,25 +61,13 @@ async def health():
    return {"status": "ok", "name": "kira", "memory": mem_status}
-def build_system_prompt(user_id: str) -> str:
+async def run_conversation(text: str, memory_suffix: str = "") -> str:
-    prompt = BASE_SYSTEM_PROMPT
+    """LLM call with optional Honcho memory context injected into system prompt."""
-    if kira_memory.enabled:
+    system_prompt = BASE_SYSTEM_PROMPT
-        try:
+    if memory_suffix:
-            kira_memory.ensure_peers(user_id)
+        system_prompt += memory_suffix
            suffix = kira_memory.build_system_prompt_suffix()
            if suffix:
                prompt += suffix
        except Exception as e:
            logger.warning(f"Memory context failed: {e}")
    return prompt
 async def run_conversation(text: str, user_id: str) -> str:
    """STT → LLM → TTS using the cheapest models."""
    system_prompt = build_system_prompt(user_id)
    client = get_openai()
    # LLM
    resp = await client.chat.completions.create(
        model="gpt-5.4-nano",
        messages=[
@@ -91,8 +77,7 @@ async def run_conversation(text: str, user_id: str) -> str:
        max_completion_tokens=300,
        temperature=0.7,
    )
-    kira_text = resp.choices[0].message.content or "Mhm, I'm here!"
+    return resp.choices[0].message.content or "Mhm, I'm here!"
    return kira_text
 async def transcribe_audio(audio_bytes: bytes) -> str | None:
@@ -126,41 +111,13 @@ async def synthesize_speech(text: str) -> bytes:
        return b""
 def _pcm16_to_wav(pcm_data: bytes) -> bytes:
    """Wrap raw PCM16 mono 24kHz data in a WAV container."""
    import struct
    num_channels = 1
    sample_rate = 24000
    bits_per_sample = 16
    byte_rate = sample_rate * num_channels * (bits_per_sample // 8)
    block_align = num_channels * (bits_per_sample // 8)
    data_size = len(pcm_data)
    header_size = 44
    total_size = header_size + data_size
    header = b"RIFF"
    header += struct.pack("<I", total_size - 8)
    header += b"WAVE"
    header += b"fmt "
    header += struct.pack("<I", 16)           # subchunk size
    header += struct.pack("<H", 1)            # PCM format
    header += struct.pack("<H", num_channels)
    header += struct.pack("<I", sample_rate)
    header += struct.pack("<I", byte_rate)
    header += struct.pack("<H", block_align)
    header += struct.pack("<H", bits_per_sample)
    header += b"data"
    header += struct.pack("<I", data_size)
    return header + pcm_data
@app.websocket("/api/ws")
 async def conversation_ws(websocket: WebSocket):
    await websocket.accept()
    session_id = str(uuid.uuid4())[:8]
    user_id = "default-user"
    identified = False
    memory_suffix = ""
    logger.info(f"[{session_id}] WebSocket connected")
    audio_buffer = bytearray()
@@ -185,6 +142,13 @@ async def conversation_ws(websocket: WebSocket):
                if kira_memory.enabled:
                    kira_memory.ensure_peers(user_id)
                    kira_memory.ensure_session(session_id)
                    # Build memory context ONCE on identify (not per-turn — too slow)
                    try:
                        ctx = kira_memory.build_system_prompt_suffix()
                        if ctx:
                            memory_suffix = ctx
                    except Exception:
                        pass
                await websocket.send_json({
                    "type": "identified",
@@ -207,7 +171,6 @@ async def conversation_ws(websocket: WebSocket):
            # ── Conversation ──
            if msg_type == "audio_chunk":
                # Single Opus/webm blob from MediaRecorder
                chunk = base64.b64decode(msg["data"])
                audio_buffer.extend(chunk)
@@ -229,13 +192,12 @@ async def conversation_ws(websocket: WebSocket):
                await websocket.send_json({"type": "transcript", "role": "user", "text": transcript})
                conversation_history.append({"role": "user", "content": transcript})
-                # 2. LLM
+                # 2. LLM (uses cached memory_suffix from identify)
                logger.info(f"[{session_id}] User: {transcript}")
-                kira_text = await run_conversation(transcript, user_id)
+                kira_text = await run_conversation(transcript, memory_suffix)
                conversation_history.append({"role": "assistant", "content": kira_text})
                logger.info(f"[{session_id}] Kira: {kira_text}")
                # Store in Honcho
                if kira_memory.enabled and identified:
                    try:
                        kira_memory.store_messages(transcript, kira_text)
@@ -257,7 +219,7 @@ async def conversation_ws(websocket: WebSocket):
                conversation_history.append({"role": "user", "content": user_text})
                logger.info(f"[{session_id}] User (text): {user_text}")
-                kira_text = await run_conversation(user_text, user_id)
+                kira_text = await run_conversation(user_text, memory_suffix)
                conversation_history.append({"role": "assistant", "content": kira_text})
                logger.info(f"[{session_id}] Kira: {kira_text}")