Files
kira/backend/main.py
T
hobokenchicken 9cd183a83b fix: streaming TTS via with_streaming_response
Replaced synchronous TTS (waiting for full audio at 5.9s) with
streaming TTS that sends audio chunks as they arrive. Backend now
accumulates chunks in audioBufferRef and plays the complete stream
on speaking_end. Reduces TTS latency from ~6s to ~1s first byte.
2026-06-04 14:17:54 -04:00

257 lines
9.1 KiB
Python

"""Kira — AI body double backend
Cheapest pipeline: gpt-4o-mini-transcribe STT → gpt-5.4-nano LLM → OpenAI TTS
~$0.019/min total, simple 3-step chat completions.
"""
import json
import base64
import uuid
import logging
from fastapi import FastAPI, WebSocket, WebSocketDisconnect
from fastapi.middleware.cors import CORSMiddleware
from config import settings
from services.memory import kira_memory
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger("kira")
app = FastAPI(title="Kira Backend")
app.add_middleware(
CORSMiddleware,
allow_origins=["*"],
allow_credentials=True,
allow_methods=["*"],
allow_headers=["*"],
)
BASE_SYSTEM_PROMPT = (
"You are Kira, a warm, kind, and encouraging AI body double. "
"You speak in a friendly, girly-pop tone. You are helping someone with ADHD "
"stay focused and on task. Keep responses short, supportive, and uplifting. "
"Check in on them. Remind them to take breaks. Celebrate small wins. "
"Use occasional emoji but don't overdo it. Never be judgmental."
)
_openai = None
def get_openai():
global _openai
if _openai is None:
from openai import AsyncOpenAI
_openai = AsyncOpenAI(api_key=settings.openai_api_key)
return _openai
@app.on_event("startup")
async def startup():
if kira_memory.init():
logger.info("Honcho memory initialized")
else:
logger.info("Honcho memory not configured")
@app.get("/api/health")
async def health():
mem_status = "active" if kira_memory.enabled else "disabled"
return {"status": "ok", "name": "kira", "memory": mem_status}
async def run_conversation(text: str, memory_suffix: str = "") -> str:
"""LLM call with optional Honcho memory context injected into system prompt."""
system_prompt = BASE_SYSTEM_PROMPT
if memory_suffix:
system_prompt += memory_suffix
client = get_openai()
resp = await client.chat.completions.create(
model="gpt-5.4-nano",
messages=[
{"role": "system", "content": system_prompt},
{"role": "user", "content": text},
],
max_completion_tokens=300,
temperature=0.7,
)
return resp.choices[0].message.content or "Mhm, I'm here!"
async def transcribe_audio(audio_bytes: bytes) -> str | None:
"""Transcribe Opus/webm audio using cheapest STT model."""
client = get_openai()
try:
transcript = await client.audio.transcriptions.create(
model="gpt-4o-mini-transcribe",
file=("audio.webm", audio_bytes, "audio/webm"),
response_format="text",
)
return transcript.strip() if transcript and transcript.strip() else None
except Exception as e:
logger.warning(f"STT error: {e}")
return None
async def synthesize_speech(text: str, websocket, speaking_start_sent: bool = False) -> None:
"""Generate TTS audio from text, streaming chunks to the client."""
client = get_openai()
try:
async with client.audio.speech.with_streaming_response.create(
model="tts-1",
voice="nova",
input=text,
response_format="opus",
) as resp:
async for chunk in resp.iter_bytes():
if chunk:
audio_b64 = base64.b64encode(chunk).decode("utf-8")
await websocket.send_json({"type": "audio", "data": audio_b64, "text": text if speaking_start_sent else ""})
speaking_start_sent = True
except Exception as e:
logger.warning(f"TTS error: {e}")
@app.websocket("/api/ws")
async def conversation_ws(websocket: WebSocket):
await websocket.accept()
session_id = str(uuid.uuid4())[:8]
user_id = "default-user"
identified = False
memory_suffix = ""
logger.info(f"[{session_id}] WebSocket connected")
audio_buffer = bytearray()
conversation_history: list[dict] = []
try:
while True:
raw = await websocket.receive_text()
msg = json.loads(raw)
msg_type = msg.get("type", "")
# ── Identity & Preferences ──
if msg_type == "identify":
user_id = msg.get("user_id", "").strip()
user_name = msg.get("name", "").strip()
if user_name and user_id:
kira_memory.set_user_preference(user_id, "name", user_name)
prefs = kira_memory.get_user_preferences(user_id)
identified = True
if kira_memory.enabled:
kira_memory.ensure_peers(user_id)
kira_memory.ensure_session(session_id)
# Build memory context ONCE on identify (not per-turn — too slow)
try:
ctx = kira_memory.build_system_prompt_suffix()
if ctx:
memory_suffix = ctx
except Exception:
pass
await websocket.send_json({
"type": "identified",
"user_id": user_id,
"preferences": prefs,
})
continue
if msg_type == "set_preference":
key = msg.get("key", "").strip()
value = msg.get("value", "").strip()
if key and user_id and user_id != "default-user":
kira_memory.set_user_preference(user_id, key, value)
await websocket.send_json({
"type": "preference_saved",
"key": key,
"success": True,
})
continue
# ── Conversation ──
if msg_type == "audio_chunk":
chunk = base64.b64decode(msg["data"])
audio_buffer.extend(chunk)
elif msg_type == "transcribe":
if not audio_buffer:
await websocket.send_json({"type": "error", "message": "No audio data"})
continue
import time
t0 = time.time()
logger.info(f"[{session_id}] Transcribing {len(audio_buffer)} bytes...")
# 1. STT
transcript = await transcribe_audio(bytes(audio_buffer))
t1 = time.time()
audio_buffer.clear()
if not transcript:
await websocket.send_json({"type": "error", "message": "Could not transcribe"})
continue
logger.info(f"[{session_id}] STT took {t1-t0:.1f}s")
await websocket.send_json({"type": "transcript", "role": "user", "text": transcript})
conversation_history.append({"role": "user", "content": transcript})
# 2. LLM (uses cached memory_suffix from identify)
logger.info(f"[{session_id}] User: {transcript}")
kira_text = await run_conversation(transcript, memory_suffix)
t2 = time.time()
logger.info(f"[{session_id}] LLM took {t2-t1:.1f}s")
conversation_history.append({"role": "assistant", "content": kira_text})
logger.info(f"[{session_id}] Kira: {kira_text}")
if kira_memory.enabled and identified:
try:
kira_memory.store_messages(transcript, kira_text)
except Exception:
pass
# 3. TTS
await websocket.send_json({"type": "speaking_start", "text": kira_text})
await synthesize_speech(kira_text, websocket)
t3 = time.time()
logger.info(f"[{session_id}] TTS took {t3-t2:.1f}s. Total: {t3-t0:.1f}s")
await websocket.send_json({"type": "speaking_end"})
elif msg_type == "conversation_text":
user_text = msg.get("text", "").strip()
if not user_text:
continue
conversation_history.append({"role": "user", "content": user_text})
logger.info(f"[{session_id}] User (text): {user_text}")
kira_text = await run_conversation(user_text, memory_suffix)
conversation_history.append({"role": "assistant", "content": kira_text})
logger.info(f"[{session_id}] Kira: {kira_text}")
if kira_memory.enabled and identified:
try:
kira_memory.store_messages(user_text, kira_text)
except Exception:
pass
await websocket.send_json({"type": "speaking_start", "text": kira_text})
await synthesize_speech(kira_text, websocket)
await websocket.send_json({"type": "speaking_end"})
elif msg_type == "ping":
await websocket.send_json({"type": "pong"})
except WebSocketDisconnect:
logger.info(f"[{session_id}] Disconnected")
except Exception as e:
logger.error(f"[{session_id}] Error: {e}")
try:
await websocket.send_json({"type": "error", "message": str(e)})
except Exception:
pass