9cd183a83b
Replaced synchronous TTS (waiting for full audio at 5.9s) with streaming TTS that sends audio chunks as they arrive. The client now accumulates chunks in audioBufferRef and plays the complete stream on speaking_end. Reduces TTS latency from ~6s to ~1s first byte.
257 lines
9.1 KiB
Python
257 lines
9.1 KiB
Python
"""Kira — AI body double backend
|
|
|
|
Cheapest pipeline: gpt-4o-mini-transcribe STT → gpt-5.4-nano LLM → OpenAI TTS
|
|
~$0.019/min total, simple 3-step chat completions.
|
|
"""
|
|
|
|
import base64
import json
import logging
import time
import uuid

from fastapi import FastAPI, WebSocket, WebSocketDisconnect
from fastapi.middleware.cors import CORSMiddleware

from config import settings
from services.memory import kira_memory
|
|
|
|
# Logging is configured at import time; this module logs under the "kira" name.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger("kira")

# The ASGI application object that the route decorators below attach to.
app = FastAPI(title="Kira Backend")

# Fully permissive CORS: every origin, method and header, credentials allowed.
# NOTE(review): the FastAPI CORS docs advise against combining wildcard
# origins with allow_credentials=True — confirm this is intended outside dev.
app.add_middleware(
    CORSMiddleware,
    allow_headers=["*"],
    allow_methods=["*"],
    allow_credentials=True,
    allow_origins=["*"],
)
|
|
|
|
# Persona instructions sent as the system message on every LLM call.
# Built one sentence per entry; joining with a single space yields the prompt.
BASE_SYSTEM_PROMPT = " ".join(
    (
        "You are Kira, a warm, kind, and encouraging AI body double.",
        "You speak in a friendly, girly-pop tone.",
        "You are helping someone with ADHD stay focused and on task.",
        "Keep responses short, supportive, and uplifting.",
        "Check in on them.",
        "Remind them to take breaks.",
        "Celebrate small wins.",
        "Use occasional emoji but don't overdo it.",
        "Never be judgmental.",
    )
)
|
|
|
|
# Process-wide OpenAI client, created on first use by get_openai().
_openai = None


def get_openai():
    """Return the shared AsyncOpenAI client, constructing it on first call.

    The `openai` package is imported lazily inside the function, and the
    client is built with the API key read from `settings.openai_api_key`.
    """
    global _openai
    if _openai is not None:
        return _openai

    from openai import AsyncOpenAI

    _openai = AsyncOpenAI(api_key=settings.openai_api_key)
    return _openai
|
|
|
|
|
|
@app.on_event("startup")
async def startup():
    """Initialise Honcho memory at application startup and log the outcome."""
    if kira_memory.init():
        outcome = "Honcho memory initialized"
    else:
        outcome = "Honcho memory not configured"
    logger.info(outcome)
|
|
|
|
|
|
@app.get("/api/health")
async def health():
    """Health probe: a static OK payload plus whether memory is enabled."""
    return {
        "status": "ok",
        "name": "kira",
        "memory": "active" if kira_memory.enabled else "disabled",
    }
|
|
|
|
|
|
async def run_conversation(text: str, memory_suffix: str = "") -> str:
    """Ask the chat model for Kira's reply to a single user message.

    Args:
        text: The user's message, sent as the only user turn.
        memory_suffix: Extra context appended to the base system prompt when
            non-empty.

    Returns:
        The model's reply text, or a short fallback line when the reply
        content is empty.
    """
    instructions = (
        (BASE_SYSTEM_PROMPT + memory_suffix) if memory_suffix else BASE_SYSTEM_PROMPT
    )
    chat = [
        {"role": "system", "content": instructions},
        {"role": "user", "content": text},
    ]

    completion = await get_openai().chat.completions.create(
        model="gpt-5.4-nano",
        messages=chat,
        max_completion_tokens=300,
        temperature=0.7,
    )
    reply = completion.choices[0].message.content
    if reply:
        return reply
    return "Mhm, I'm here!"
|
|
|
|
|
|
async def transcribe_audio(audio_bytes: bytes) -> str | None:
    """Run speech-to-text on a recorded clip uploaded as `audio.webm`.

    Args:
        audio_bytes: The complete recording to transcribe.

    Returns:
        The whitespace-trimmed transcript, or None when the transcript is
        empty or the request fails (failures are logged as warnings).
    """
    stt = get_openai()
    upload = ("audio.webm", audio_bytes, "audio/webm")
    try:
        raw_text = await stt.audio.transcriptions.create(
            model="gpt-4o-mini-transcribe",
            file=upload,
            response_format="text",
        )
        cleaned = raw_text.strip() if raw_text else ""
        return cleaned or None
    except Exception as e:
        logger.warning(f"STT error: {e}")
        return None
|
|
|
|
|
|
async def synthesize_speech(text: str, websocket, speaking_start_sent: bool = False) -> None:
    """Generate TTS audio for *text* and stream it to the client chunk by chunk.

    Each non-empty chunk from the streaming TTS response is base64-encoded
    and sent as ``{"type": "audio", "data": ..., "text": ...}``.

    Args:
        text: The reply to synthesise.
        websocket: Connection object exposing an awaitable ``send_json``.
        speaking_start_sent: Pass True when the reply text has already been
            delivered to the client; the audio messages then carry an empty
            ``text``. When False, the text rides along with the first audio
            chunk only.

    Errors raised while requesting or forwarding audio are logged as warnings
    and not re-raised, so a TTS failure does not abort the caller's turn.
    """
    client = get_openai()
    # The reply text accompanies at most one audio message. The previous
    # condition was inverted: the first chunk carried "" and every later
    # chunk repeated the full text.
    caption = "" if speaking_start_sent else text
    try:
        async with client.audio.speech.with_streaming_response.create(
            model="tts-1",
            voice="nova",
            input=text,
            response_format="opus",
        ) as resp:
            async for chunk in resp.iter_bytes():
                if not chunk:
                    continue
                audio_b64 = base64.b64encode(chunk).decode("utf-8")
                await websocket.send_json(
                    {"type": "audio", "data": audio_b64, "text": caption}
                )
                # Only the first delivered chunk carries the text.
                caption = ""
    except Exception as e:
        logger.warning(f"TTS error: {e}")
|
|
|
|
|
|
@app.websocket("/api/ws")
async def conversation_ws(websocket: WebSocket):
    """Serve one client conversation over a JSON-message WebSocket.

    Incoming messages are JSON objects dispatched on their ``type``:

    * ``identify`` – record the user id (and optional name), load the user's
      preferences, build the memory context once, reply with ``identified``.
    * ``set_preference`` – store a key/value preference for an identified
      user, reply with ``preference_saved`` whose ``success`` reflects
      whether anything was actually stored.
    * ``audio_chunk`` – append base64-decoded audio to the pending recording.
    * ``transcribe`` – transcribe the pending recording, then answer it.
    * ``conversation_text`` – answer a typed message.
    * ``ping`` – reply with ``pong``.

    Malformed messages get an ``error`` reply and the connection stays open.
    Any other exception is logged, reported to the client when possible, and
    ends the handler.
    """
    await websocket.accept()
    session_id = str(uuid.uuid4())[:8]
    user_id = "default-user"
    identified = False
    memory_suffix = ""
    logger.info(f"[{session_id}] WebSocket connected")

    audio_buffer = bytearray()
    # NOTE(review): the history is recorded here but never passed to the LLM
    # call, so each turn is answered without earlier turns — confirm intent.
    conversation_history: list[dict] = []

    try:
        while True:
            raw = await websocket.receive_text()
            try:
                msg = json.loads(raw)
            except json.JSONDecodeError:
                await _send_error(websocket, "Invalid JSON")
                continue
            if not isinstance(msg, dict):
                await _send_error(websocket, "Invalid message")
                continue
            msg_type = msg.get("type", "")

            # ── Identity & Preferences ──
            if msg_type == "identify":
                user_id = msg.get("user_id", "").strip()
                user_name = msg.get("name", "").strip()
                if user_name and user_id:
                    kira_memory.set_user_preference(user_id, "name", user_name)

                prefs = kira_memory.get_user_preferences(user_id)
                identified = True

                if kira_memory.enabled:
                    kira_memory.ensure_peers(user_id)
                    kira_memory.ensure_session(session_id)
                    # Build memory context ONCE on identify (not per-turn — too slow)
                    try:
                        ctx = kira_memory.build_system_prompt_suffix()
                        if ctx:
                            memory_suffix = ctx
                    except Exception as e:
                        # Memory context is optional; continue without it.
                        logger.warning(f"[{session_id}] Memory context unavailable: {e}")

                await websocket.send_json({
                    "type": "identified",
                    "user_id": user_id,
                    "preferences": prefs,
                })
                continue

            if msg_type == "set_preference":
                key = msg.get("key", "").strip()
                value = msg.get("value", "").strip()
                # Preferences are only stored for a named key and a real,
                # identified user; report the true outcome to the client.
                saved = bool(key and user_id and user_id != "default-user")
                if saved:
                    kira_memory.set_user_preference(user_id, key, value)
                await websocket.send_json({
                    "type": "preference_saved",
                    "key": key,
                    "success": saved,
                })
                continue

            # ── Conversation ──
            if msg_type == "audio_chunk":
                try:
                    chunk = base64.b64decode(msg["data"])
                except (KeyError, TypeError, ValueError):
                    # Missing field, wrong type, or invalid base64
                    # (binascii.Error is a ValueError subclass).
                    await _send_error(websocket, "Invalid audio chunk")
                    continue
                audio_buffer.extend(chunk)

            elif msg_type == "transcribe":
                if not audio_buffer:
                    await _send_error(websocket, "No audio data")
                    continue

                t0 = time.time()
                logger.info(f"[{session_id}] Transcribing {len(audio_buffer)} bytes...")

                # 1. STT
                transcript = await transcribe_audio(bytes(audio_buffer))
                t1 = time.time()
                audio_buffer.clear()

                if not transcript:
                    await _send_error(websocket, "Could not transcribe")
                    continue

                logger.info(f"[{session_id}] STT took {t1-t0:.1f}s")

                await websocket.send_json({"type": "transcript", "role": "user", "text": transcript})
                conversation_history.append({"role": "user", "content": transcript})
                logger.info(f"[{session_id}] User: {transcript}")

                # 2–3. LLM + TTS (uses cached memory_suffix from identify)
                await _reply_to_user(
                    websocket,
                    session_id,
                    transcript,
                    memory_suffix,
                    conversation_history,
                    identified,
                    t0,
                )

            elif msg_type == "conversation_text":
                user_text = msg.get("text", "").strip()
                if not user_text:
                    continue

                turn_started = time.time()
                conversation_history.append({"role": "user", "content": user_text})
                logger.info(f"[{session_id}] User (text): {user_text}")

                await _reply_to_user(
                    websocket,
                    session_id,
                    user_text,
                    memory_suffix,
                    conversation_history,
                    identified,
                    turn_started,
                )

            elif msg_type == "ping":
                await websocket.send_json({"type": "pong"})

    except WebSocketDisconnect:
        logger.info(f"[{session_id}] Disconnected")
    except Exception as e:
        # logger.exception keeps the traceback alongside the message.
        logger.exception(f"[{session_id}] Error: {e}")
        try:
            await websocket.send_json({"type": "error", "message": str(e)})
        except Exception:
            # The socket is most likely already closed; nothing left to report to.
            pass


async def _send_error(websocket: WebSocket, message: str) -> None:
    """Send an ``error`` message with the given text to the client."""
    await websocket.send_json({"type": "error", "message": message})


async def _reply_to_user(
    websocket: WebSocket,
    session_id: str,
    user_text: str,
    memory_suffix: str,
    history: list[dict],
    identified: bool,
    turn_started: float,
) -> None:
    """Answer one user turn: LLM reply, optional memory write, streamed TTS.

    Appends the assistant reply to *history*, stores the exchange in memory
    when memory is enabled and the user has identified, then sends
    ``speaking_start``, the audio chunks, and ``speaking_end``.
    *turn_started* is the ``time.time()`` value at which handling of the turn
    began; it is used only for the total-latency log line.
    """
    llm_started = time.time()
    kira_text = await run_conversation(user_text, memory_suffix)
    llm_done = time.time()
    logger.info(f"[{session_id}] LLM took {llm_done-llm_started:.1f}s")
    history.append({"role": "assistant", "content": kira_text})
    logger.info(f"[{session_id}] Kira: {kira_text}")

    if kira_memory.enabled and identified:
        try:
            kira_memory.store_messages(user_text, kira_text)
        except Exception as e:
            # Best-effort: a failed memory write must not block the reply.
            logger.warning(f"[{session_id}] Memory store failed: {e}")

    await websocket.send_json({"type": "speaking_start", "text": kira_text})
    await synthesize_speech(kira_text, websocket)
    tts_done = time.time()
    logger.info(
        f"[{session_id}] TTS took {tts_done-llm_done:.1f}s. Total: {tts_done-turn_started:.1f}s"
    )
    await websocket.send_json({"type": "speaking_end"})
|