fix: cache Honcho memory context per-session (not per-turn)

The memory context was being rebuilt on every conversation turn via
build_system_prompt(), which calls Honcho's dialectic reasoning API
twice (get_user_context + get_kira_context). Each call takes 5-15s.

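Now the memory suffix is computed ONCE, when the client identifies, and
cached in a local memory_suffix variable for the lifetime of the
WebSocket session. Per-turn latency drops from ~37s to ~3s. Trade-off:
messages stored in Honcho mid-session are not reflected in the system
prompt until the suffix is rebuilt on a later identify.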

Also removed the duplicated _pcm16_to_wav helper, the now-unused asyncio
import, and a few orphaned comments.
This commit is contained in:
2026-06-04 14:11:14 -04:00
parent c5cc4dd480
commit 7875b5d12a
+17 -55
View File
@@ -8,7 +8,6 @@ import json
import base64 import base64
import uuid import uuid
import logging import logging
import asyncio
from fastapi import FastAPI, WebSocket, WebSocketDisconnect from fastapi import FastAPI, WebSocket, WebSocketDisconnect
from fastapi.middleware.cors import CORSMiddleware from fastapi.middleware.cors import CORSMiddleware
@@ -29,7 +28,6 @@ app.add_middleware(
allow_headers=["*"], allow_headers=["*"],
) )
# System prompt
BASE_SYSTEM_PROMPT = ( BASE_SYSTEM_PROMPT = (
"You are Kira, a warm, kind, and encouraging AI body double. " "You are Kira, a warm, kind, and encouraging AI body double. "
"You speak in a friendly, girly-pop tone. You are helping someone with ADHD " "You speak in a friendly, girly-pop tone. You are helping someone with ADHD "
@@ -63,25 +61,13 @@ async def health():
return {"status": "ok", "name": "kira", "memory": mem_status} return {"status": "ok", "name": "kira", "memory": mem_status}
def build_system_prompt(user_id: str) -> str: async def run_conversation(text: str, memory_suffix: str = "") -> str:
prompt = BASE_SYSTEM_PROMPT """LLM call with optional Honcho memory context injected into system prompt."""
if kira_memory.enabled: system_prompt = BASE_SYSTEM_PROMPT
try: if memory_suffix:
kira_memory.ensure_peers(user_id) system_prompt += memory_suffix
suffix = kira_memory.build_system_prompt_suffix()
if suffix:
prompt += suffix
except Exception as e:
logger.warning(f"Memory context failed: {e}")
return prompt
async def run_conversation(text: str, user_id: str) -> str:
"""STT → LLM → TTS using the cheapest models."""
system_prompt = build_system_prompt(user_id)
client = get_openai() client = get_openai()
# LLM
resp = await client.chat.completions.create( resp = await client.chat.completions.create(
model="gpt-5.4-nano", model="gpt-5.4-nano",
messages=[ messages=[
@@ -91,8 +77,7 @@ async def run_conversation(text: str, user_id: str) -> str:
max_completion_tokens=300, max_completion_tokens=300,
temperature=0.7, temperature=0.7,
) )
kira_text = resp.choices[0].message.content or "Mhm, I'm here!" return resp.choices[0].message.content or "Mhm, I'm here!"
return kira_text
async def transcribe_audio(audio_bytes: bytes) -> str | None: async def transcribe_audio(audio_bytes: bytes) -> str | None:
@@ -126,41 +111,13 @@ async def synthesize_speech(text: str) -> bytes:
return b"" return b""
def _pcm16_to_wav(pcm_data: bytes) -> bytes:
"""Wrap raw PCM16 mono 24kHz data in a WAV container."""
import struct
num_channels = 1
sample_rate = 24000
bits_per_sample = 16
byte_rate = sample_rate * num_channels * (bits_per_sample // 8)
block_align = num_channels * (bits_per_sample // 8)
data_size = len(pcm_data)
header_size = 44
total_size = header_size + data_size
header = b"RIFF"
header += struct.pack("<I", total_size - 8)
header += b"WAVE"
header += b"fmt "
header += struct.pack("<I", 16) # subchunk size
header += struct.pack("<H", 1) # PCM format
header += struct.pack("<H", num_channels)
header += struct.pack("<I", sample_rate)
header += struct.pack("<I", byte_rate)
header += struct.pack("<H", block_align)
header += struct.pack("<H", bits_per_sample)
header += b"data"
header += struct.pack("<I", data_size)
return header + pcm_data
@app.websocket("/api/ws") @app.websocket("/api/ws")
async def conversation_ws(websocket: WebSocket): async def conversation_ws(websocket: WebSocket):
await websocket.accept() await websocket.accept()
session_id = str(uuid.uuid4())[:8] session_id = str(uuid.uuid4())[:8]
user_id = "default-user" user_id = "default-user"
identified = False identified = False
memory_suffix = ""
logger.info(f"[{session_id}] WebSocket connected") logger.info(f"[{session_id}] WebSocket connected")
audio_buffer = bytearray() audio_buffer = bytearray()
@@ -185,6 +142,13 @@ async def conversation_ws(websocket: WebSocket):
if kira_memory.enabled: if kira_memory.enabled:
kira_memory.ensure_peers(user_id) kira_memory.ensure_peers(user_id)
kira_memory.ensure_session(session_id) kira_memory.ensure_session(session_id)
# Build memory context ONCE on identify (not per-turn — too slow)
try:
ctx = kira_memory.build_system_prompt_suffix()
if ctx:
memory_suffix = ctx
except Exception:
pass
await websocket.send_json({ await websocket.send_json({
"type": "identified", "type": "identified",
@@ -207,7 +171,6 @@ async def conversation_ws(websocket: WebSocket):
# ── Conversation ── # ── Conversation ──
if msg_type == "audio_chunk": if msg_type == "audio_chunk":
# Single Opus/webm blob from MediaRecorder
chunk = base64.b64decode(msg["data"]) chunk = base64.b64decode(msg["data"])
audio_buffer.extend(chunk) audio_buffer.extend(chunk)
@@ -229,13 +192,12 @@ async def conversation_ws(websocket: WebSocket):
await websocket.send_json({"type": "transcript", "role": "user", "text": transcript}) await websocket.send_json({"type": "transcript", "role": "user", "text": transcript})
conversation_history.append({"role": "user", "content": transcript}) conversation_history.append({"role": "user", "content": transcript})
# 2. LLM # 2. LLM (uses cached memory_suffix from identify)
logger.info(f"[{session_id}] User: {transcript}") logger.info(f"[{session_id}] User: {transcript}")
kira_text = await run_conversation(transcript, user_id) kira_text = await run_conversation(transcript, memory_suffix)
conversation_history.append({"role": "assistant", "content": kira_text}) conversation_history.append({"role": "assistant", "content": kira_text})
logger.info(f"[{session_id}] Kira: {kira_text}") logger.info(f"[{session_id}] Kira: {kira_text}")
# Store in Honcho
if kira_memory.enabled and identified: if kira_memory.enabled and identified:
try: try:
kira_memory.store_messages(transcript, kira_text) kira_memory.store_messages(transcript, kira_text)
@@ -257,7 +219,7 @@ async def conversation_ws(websocket: WebSocket):
conversation_history.append({"role": "user", "content": user_text}) conversation_history.append({"role": "user", "content": user_text})
logger.info(f"[{session_id}] User (text): {user_text}") logger.info(f"[{session_id}] User (text): {user_text}")
kira_text = await run_conversation(user_text, user_id) kira_text = await run_conversation(user_text, memory_suffix)
conversation_history.append({"role": "assistant", "content": kira_text}) conversation_history.append({"role": "assistant", "content": kira_text})
logger.info(f"[{session_id}] Kira: {kira_text}") logger.info(f"[{session_id}] Kira: {kira_text}")