fix: cache Honcho memory context per-session (not per-turn)

The memory context was being rebuilt on every conversation turn via
build_system_prompt(), which calls Honcho's dialectic reasoning API
twice (get_user_context + get_kira_context). Each call takes 5-15s.

Now the memory suffix is computed once, when the client identifies, and
cached in the memory_suffix variable for the rest of the WebSocket session.
Per-turn latency drops from ~37s to ~3s.

Also removed duplicated _pcm16_to_wav and cleaned up orphaned code.
This commit is contained in:
2026-06-04 14:11:14 -04:00
parent c5cc4dd480
commit 7875b5d12a
+17 -55
View File
@@ -8,7 +8,6 @@ import json
import base64
import uuid
import logging
import asyncio
from fastapi import FastAPI, WebSocket, WebSocketDisconnect
from fastapi.middleware.cors import CORSMiddleware
@@ -29,7 +28,6 @@ app.add_middleware(
allow_headers=["*"],
)
# System prompt
BASE_SYSTEM_PROMPT = (
"You are Kira, a warm, kind, and encouraging AI body double. "
"You speak in a friendly, girly-pop tone. You are helping someone with ADHD "
@@ -63,25 +61,13 @@ async def health():
return {"status": "ok", "name": "kira", "memory": mem_status}
def build_system_prompt(user_id: str) -> str:
    """Return the base system prompt, extended with memory context if available.

    When ``kira_memory`` is disabled the base prompt is returned untouched.
    Otherwise the peers for ``user_id`` are ensured and the memory suffix is
    appended when it is non-empty. Any exception raised while gathering the
    memory context is logged as a warning and the base prompt is returned.
    """
    if not kira_memory.enabled:
        return BASE_SYSTEM_PROMPT

    try:
        kira_memory.ensure_peers(user_id)
        memory_context = kira_memory.build_system_prompt_suffix()
        # Concatenate inside the try so a malformed suffix is also treated
        # as a memory failure rather than propagating to the caller.
        if memory_context:
            return BASE_SYSTEM_PROMPT + memory_context
    except Exception as e:
        logger.warning(f"Memory context failed: {e}")
    return BASE_SYSTEM_PROMPT
async def run_conversation(text: str, memory_suffix: str = "") -> str:
"""LLM call with optional Honcho memory context injected into system prompt."""
system_prompt = BASE_SYSTEM_PROMPT
if memory_suffix:
system_prompt += memory_suffix
async def run_conversation(text: str, user_id: str) -> str:
"""STT → LLM → TTS using the cheapest models."""
system_prompt = build_system_prompt(user_id)
client = get_openai()
# LLM
resp = await client.chat.completions.create(
model="gpt-5.4-nano",
messages=[
@@ -91,8 +77,7 @@ async def run_conversation(text: str, user_id: str) -> str:
max_completion_tokens=300,
temperature=0.7,
)
kira_text = resp.choices[0].message.content or "Mhm, I'm here!"
return kira_text
return resp.choices[0].message.content or "Mhm, I'm here!"
async def transcribe_audio(audio_bytes: bytes) -> str | None:
@@ -126,41 +111,13 @@ async def synthesize_speech(text: str) -> bytes:
return b""
def _pcm16_to_wav(pcm_data: bytes) -> bytes:
"""Wrap raw PCM16 mono 24kHz data in a WAV container."""
import struct
num_channels = 1
sample_rate = 24000
bits_per_sample = 16
byte_rate = sample_rate * num_channels * (bits_per_sample // 8)
block_align = num_channels * (bits_per_sample // 8)
data_size = len(pcm_data)
header_size = 44
total_size = header_size + data_size
header = b"RIFF"
header += struct.pack("<I", total_size - 8)
header += b"WAVE"
header += b"fmt "
header += struct.pack("<I", 16) # subchunk size
header += struct.pack("<H", 1) # PCM format
header += struct.pack("<H", num_channels)
header += struct.pack("<I", sample_rate)
header += struct.pack("<I", byte_rate)
header += struct.pack("<H", block_align)
header += struct.pack("<H", bits_per_sample)
header += b"data"
header += struct.pack("<I", data_size)
return header + pcm_data
@app.websocket("/api/ws")
async def conversation_ws(websocket: WebSocket):
await websocket.accept()
session_id = str(uuid.uuid4())[:8]
user_id = "default-user"
identified = False
memory_suffix = ""
logger.info(f"[{session_id}] WebSocket connected")
audio_buffer = bytearray()
@@ -185,6 +142,13 @@ async def conversation_ws(websocket: WebSocket):
if kira_memory.enabled:
kira_memory.ensure_peers(user_id)
kira_memory.ensure_session(session_id)
# Build memory context ONCE on identify (not per-turn — too slow)
try:
ctx = kira_memory.build_system_prompt_suffix()
if ctx:
memory_suffix = ctx
except Exception:
pass
await websocket.send_json({
"type": "identified",
@@ -207,7 +171,6 @@ async def conversation_ws(websocket: WebSocket):
# ── Conversation ──
if msg_type == "audio_chunk":
# Single Opus/webm blob from MediaRecorder
chunk = base64.b64decode(msg["data"])
audio_buffer.extend(chunk)
@@ -229,13 +192,12 @@ async def conversation_ws(websocket: WebSocket):
await websocket.send_json({"type": "transcript", "role": "user", "text": transcript})
conversation_history.append({"role": "user", "content": transcript})
# 2. LLM
# 2. LLM (uses cached memory_suffix from identify)
logger.info(f"[{session_id}] User: {transcript}")
kira_text = await run_conversation(transcript, user_id)
kira_text = await run_conversation(transcript, memory_suffix)
conversation_history.append({"role": "assistant", "content": kira_text})
logger.info(f"[{session_id}] Kira: {kira_text}")
# Store in Honcho
if kira_memory.enabled and identified:
try:
kira_memory.store_messages(transcript, kira_text)
@@ -257,7 +219,7 @@ async def conversation_ws(websocket: WebSocket):
conversation_history.append({"role": "user", "content": user_text})
logger.info(f"[{session_id}] User (text): {user_text}")
kira_text = await run_conversation(user_text, user_id)
kira_text = await run_conversation(user_text, memory_suffix)
conversation_history.append({"role": "assistant", "content": kira_text})
logger.info(f"[{session_id}] Kira: {kira_text}")