fix: cache Honcho memory context per-session (not per-turn)
Previously, the memory context was rebuilt on every conversation turn via build_system_prompt(), which calls Honcho's dialectic reasoning API twice (get_user_context + get_kira_context); each call takes 5-15s. The memory suffix is now computed ONCE, when the client identifies, and cached in a memory_suffix variable for the duration of the session. Per-turn latency drops from ~37s to ~3s. Also removed the duplicated _pcm16_to_wav helper and cleaned up orphaned code.
This commit is contained in:
+17
-55
@@ -8,7 +8,6 @@ import json
|
|||||||
import base64
|
import base64
|
||||||
import uuid
|
import uuid
|
||||||
import logging
|
import logging
|
||||||
import asyncio
|
|
||||||
|
|
||||||
from fastapi import FastAPI, WebSocket, WebSocketDisconnect
|
from fastapi import FastAPI, WebSocket, WebSocketDisconnect
|
||||||
from fastapi.middleware.cors import CORSMiddleware
|
from fastapi.middleware.cors import CORSMiddleware
|
||||||
@@ -29,7 +28,6 @@ app.add_middleware(
|
|||||||
allow_headers=["*"],
|
allow_headers=["*"],
|
||||||
)
|
)
|
||||||
|
|
||||||
# System prompt
|
|
||||||
BASE_SYSTEM_PROMPT = (
|
BASE_SYSTEM_PROMPT = (
|
||||||
"You are Kira, a warm, kind, and encouraging AI body double. "
|
"You are Kira, a warm, kind, and encouraging AI body double. "
|
||||||
"You speak in a friendly, girly-pop tone. You are helping someone with ADHD "
|
"You speak in a friendly, girly-pop tone. You are helping someone with ADHD "
|
||||||
@@ -63,25 +61,13 @@ async def health():
|
|||||||
return {"status": "ok", "name": "kira", "memory": mem_status}
|
return {"status": "ok", "name": "kira", "memory": mem_status}
|
||||||
|
|
||||||
|
|
||||||
def build_system_prompt(user_id: str) -> str:
|
async def run_conversation(text: str, memory_suffix: str = "") -> str:
|
||||||
prompt = BASE_SYSTEM_PROMPT
|
"""LLM call with optional Honcho memory context injected into system prompt."""
|
||||||
if kira_memory.enabled:
|
system_prompt = BASE_SYSTEM_PROMPT
|
||||||
try:
|
if memory_suffix:
|
||||||
kira_memory.ensure_peers(user_id)
|
system_prompt += memory_suffix
|
||||||
suffix = kira_memory.build_system_prompt_suffix()
|
|
||||||
if suffix:
|
|
||||||
prompt += suffix
|
|
||||||
except Exception as e:
|
|
||||||
logger.warning(f"Memory context failed: {e}")
|
|
||||||
return prompt
|
|
||||||
|
|
||||||
|
|
||||||
async def run_conversation(text: str, user_id: str) -> str:
|
|
||||||
"""STT → LLM → TTS using the cheapest models."""
|
|
||||||
system_prompt = build_system_prompt(user_id)
|
|
||||||
client = get_openai()
|
client = get_openai()
|
||||||
|
|
||||||
# LLM
|
|
||||||
resp = await client.chat.completions.create(
|
resp = await client.chat.completions.create(
|
||||||
model="gpt-5.4-nano",
|
model="gpt-5.4-nano",
|
||||||
messages=[
|
messages=[
|
||||||
@@ -91,8 +77,7 @@ async def run_conversation(text: str, user_id: str) -> str:
|
|||||||
max_completion_tokens=300,
|
max_completion_tokens=300,
|
||||||
temperature=0.7,
|
temperature=0.7,
|
||||||
)
|
)
|
||||||
kira_text = resp.choices[0].message.content or "Mhm, I'm here!"
|
return resp.choices[0].message.content or "Mhm, I'm here!"
|
||||||
return kira_text
|
|
||||||
|
|
||||||
|
|
||||||
async def transcribe_audio(audio_bytes: bytes) -> str | None:
|
async def transcribe_audio(audio_bytes: bytes) -> str | None:
|
||||||
@@ -126,41 +111,13 @@ async def synthesize_speech(text: str) -> bytes:
|
|||||||
return b""
|
return b""
|
||||||
|
|
||||||
|
|
||||||
def _pcm16_to_wav(pcm_data: bytes) -> bytes:
|
|
||||||
"""Wrap raw PCM16 mono 24kHz data in a WAV container."""
|
|
||||||
import struct
|
|
||||||
num_channels = 1
|
|
||||||
sample_rate = 24000
|
|
||||||
bits_per_sample = 16
|
|
||||||
byte_rate = sample_rate * num_channels * (bits_per_sample // 8)
|
|
||||||
block_align = num_channels * (bits_per_sample // 8)
|
|
||||||
data_size = len(pcm_data)
|
|
||||||
header_size = 44
|
|
||||||
total_size = header_size + data_size
|
|
||||||
|
|
||||||
header = b"RIFF"
|
|
||||||
header += struct.pack("<I", total_size - 8)
|
|
||||||
header += b"WAVE"
|
|
||||||
header += b"fmt "
|
|
||||||
header += struct.pack("<I", 16) # subchunk size
|
|
||||||
header += struct.pack("<H", 1) # PCM format
|
|
||||||
header += struct.pack("<H", num_channels)
|
|
||||||
header += struct.pack("<I", sample_rate)
|
|
||||||
header += struct.pack("<I", byte_rate)
|
|
||||||
header += struct.pack("<H", block_align)
|
|
||||||
header += struct.pack("<H", bits_per_sample)
|
|
||||||
header += b"data"
|
|
||||||
header += struct.pack("<I", data_size)
|
|
||||||
|
|
||||||
return header + pcm_data
|
|
||||||
|
|
||||||
|
|
||||||
@app.websocket("/api/ws")
|
@app.websocket("/api/ws")
|
||||||
async def conversation_ws(websocket: WebSocket):
|
async def conversation_ws(websocket: WebSocket):
|
||||||
await websocket.accept()
|
await websocket.accept()
|
||||||
session_id = str(uuid.uuid4())[:8]
|
session_id = str(uuid.uuid4())[:8]
|
||||||
user_id = "default-user"
|
user_id = "default-user"
|
||||||
identified = False
|
identified = False
|
||||||
|
memory_suffix = ""
|
||||||
logger.info(f"[{session_id}] WebSocket connected")
|
logger.info(f"[{session_id}] WebSocket connected")
|
||||||
|
|
||||||
audio_buffer = bytearray()
|
audio_buffer = bytearray()
|
||||||
@@ -185,6 +142,13 @@ async def conversation_ws(websocket: WebSocket):
|
|||||||
if kira_memory.enabled:
|
if kira_memory.enabled:
|
||||||
kira_memory.ensure_peers(user_id)
|
kira_memory.ensure_peers(user_id)
|
||||||
kira_memory.ensure_session(session_id)
|
kira_memory.ensure_session(session_id)
|
||||||
|
# Build memory context ONCE on identify (not per-turn — too slow)
|
||||||
|
try:
|
||||||
|
ctx = kira_memory.build_system_prompt_suffix()
|
||||||
|
if ctx:
|
||||||
|
memory_suffix = ctx
|
||||||
|
except Exception:
|
||||||
|
pass
|
||||||
|
|
||||||
await websocket.send_json({
|
await websocket.send_json({
|
||||||
"type": "identified",
|
"type": "identified",
|
||||||
@@ -207,7 +171,6 @@ async def conversation_ws(websocket: WebSocket):
|
|||||||
|
|
||||||
# ── Conversation ──
|
# ── Conversation ──
|
||||||
if msg_type == "audio_chunk":
|
if msg_type == "audio_chunk":
|
||||||
# Single Opus/webm blob from MediaRecorder
|
|
||||||
chunk = base64.b64decode(msg["data"])
|
chunk = base64.b64decode(msg["data"])
|
||||||
audio_buffer.extend(chunk)
|
audio_buffer.extend(chunk)
|
||||||
|
|
||||||
@@ -229,13 +192,12 @@ async def conversation_ws(websocket: WebSocket):
|
|||||||
await websocket.send_json({"type": "transcript", "role": "user", "text": transcript})
|
await websocket.send_json({"type": "transcript", "role": "user", "text": transcript})
|
||||||
conversation_history.append({"role": "user", "content": transcript})
|
conversation_history.append({"role": "user", "content": transcript})
|
||||||
|
|
||||||
# 2. LLM
|
# 2. LLM (uses cached memory_suffix from identify)
|
||||||
logger.info(f"[{session_id}] User: {transcript}")
|
logger.info(f"[{session_id}] User: {transcript}")
|
||||||
kira_text = await run_conversation(transcript, user_id)
|
kira_text = await run_conversation(transcript, memory_suffix)
|
||||||
conversation_history.append({"role": "assistant", "content": kira_text})
|
conversation_history.append({"role": "assistant", "content": kira_text})
|
||||||
logger.info(f"[{session_id}] Kira: {kira_text}")
|
logger.info(f"[{session_id}] Kira: {kira_text}")
|
||||||
|
|
||||||
# Store in Honcho
|
|
||||||
if kira_memory.enabled and identified:
|
if kira_memory.enabled and identified:
|
||||||
try:
|
try:
|
||||||
kira_memory.store_messages(transcript, kira_text)
|
kira_memory.store_messages(transcript, kira_text)
|
||||||
@@ -257,7 +219,7 @@ async def conversation_ws(websocket: WebSocket):
|
|||||||
conversation_history.append({"role": "user", "content": user_text})
|
conversation_history.append({"role": "user", "content": user_text})
|
||||||
logger.info(f"[{session_id}] User (text): {user_text}")
|
logger.info(f"[{session_id}] User (text): {user_text}")
|
||||||
|
|
||||||
kira_text = await run_conversation(user_text, user_id)
|
kira_text = await run_conversation(user_text, memory_suffix)
|
||||||
conversation_history.append({"role": "assistant", "content": kira_text})
|
conversation_history.append({"role": "assistant", "content": kira_text})
|
||||||
logger.info(f"[{session_id}] Kira: {kira_text}")
|
logger.info(f"[{session_id}] Kira: {kira_text}")
|
||||||
|
|
||||||
|
|||||||
Reference in New Issue
Block a user