Files
kira/backend/main.py
T
hobokenchicken 191b7ad9b5 fix(stt): correct Realtime WS model to gpt-realtime-whisper + enhance event handling for deltas/completed
- URL now uses ?model=gpt-realtime-whisper (was invalid gpt-4o-mini-realtime-preview)
- Cleaned session.update (removed modalities that may not apply)
- Expanded _handle to catch input_audio_transcription.delta and .completed events
- on_error now forwards transcription errors to frontend client
- Per AUDIT + PLAN item 1
2026-06-04 15:14:26 -04:00

263 lines
8.9 KiB
Python

"""Kira — AI body double backend
Realtime WebSocket STT (gpt-realtime-whisper) → gpt-5.4-nano → streaming TTS
"""
import json
import base64
import uuid
import logging
import time
import asyncio
from fastapi import FastAPI, WebSocket, WebSocketDisconnect
from fastapi.middleware.cors import CORSMiddleware
from config import settings
from services.memory import kira_memory
from services.whisper_stream import WhisperStream
# Process-wide logging at INFO level; everything in this module logs through
# the shared "kira" logger.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger("kira")

# CORS is left fully open: any origin, method and header is accepted.
# NOTE(review): wildcard origins combined with allow_credentials=True lets any
# site issue credentialed requests -- confirm this is intended before exposing
# the service beyond local development.
_CORS_OPTIONS = {
    "allow_origins": ["*"],
    "allow_credentials": True,
    "allow_methods": ["*"],
    "allow_headers": ["*"],
}

app = FastAPI(title="Kira Backend")
app.add_middleware(CORSMiddleware, **_CORS_OPTIONS)
# Persona instructions sent as the system message on every LLM call.
# Written one sentence per entry and joined with single spaces.
BASE_SYSTEM_PROMPT = " ".join(
    (
        "You are Kira, a warm, kind, and encouraging AI body double.",
        "You speak in a friendly, girly-pop tone.",
        "You are helping someone with ADHD stay focused and on task.",
        "Keep responses short, supportive, and uplifting.",
        "Check in on them.",
        "Remind them to take breaks.",
        "Celebrate small wins.",
        "Use occasional emoji but don't overdo it.",
        "Never be judgmental.",
    )
)
# Module-wide cache for the OpenAI client; filled in on first use.
_openai = None


def get_openai():
    """Return the shared ``AsyncOpenAI`` client, creating it on first call.

    The ``openai`` package is imported lazily, so it is only loaded once a
    client is actually requested. The API key is read from
    ``settings.openai_api_key``.
    """
    global _openai
    if _openai is not None:
        return _openai

    from openai import AsyncOpenAI

    _openai = AsyncOpenAI(api_key=settings.openai_api_key)
    return _openai
@app.on_event("startup")
async def startup():
    """Initialise the Honcho memory service and log whether it is available."""
    memory_ready = kira_memory.init()
    message = (
        "Honcho memory initialized"
        if memory_ready
        else "Honcho memory not configured"
    )
    logger.info(message)
@app.get("/api/health")
async def health():
    """Liveness probe: report service name and whether memory is enabled."""
    return {
        "status": "ok",
        "name": "kira",
        "memory": "active" if kira_memory.enabled else "disabled",
    }
@app.websocket("/api/ws")
async def conversation_ws(websocket: WebSocket):
    """Run one conversation session (voice and/or text) over a WebSocket.

    Incoming frames are JSON objects dispatched on their ``type`` field:

    * ``identify`` -- records ``user_id`` (and stores ``name`` as a preference
      when both are present), loads the user's preferences and, when memory is
      enabled, the memory-derived system-prompt suffix. Replies ``identified``.
    * ``set_preference`` -- stores ``key``/``value`` for an identified user and
      replies ``preference_saved``.
    * ``audio`` -- base64 audio in ``data``; decoded and forwarded to the
      WhisperStream.
    * ``conversation_text`` -- typed input in ``text``; answered directly.
    * ``ping`` -- answered with ``pong``.

    Outgoing frames: ``transcript_delta``, ``transcript``, ``speaking_start``,
    ``audio`` (base64 chunks of opus TTS output), ``speaking_end``, ``error``,
    plus the replies listed above.

    A malformed frame is answered with an ``error`` frame and does not end the
    session. The WhisperStream is always disconnected and its task cancelled
    when the session ends.
    """
    await websocket.accept()
    session_id = str(uuid.uuid4())[:8]
    user_id = "default-user"
    identified = False
    memory_suffix = ""
    logger.info(f"[{session_id}] WebSocket connected")

    # Only the most recent turns are kept and sent to the model, so prompt size
    # and memory stay bounded during long sessions.
    max_history_messages = 20
    conversation_history: list[dict] = []
    # Voice replies (WhisperStream callback) and text replies (receive loop)
    # both stream multi-frame answers over the same socket and share the
    # history list; the lock keeps one reply from interleaving with another.
    reply_lock = asyncio.Lock()

    def text_field(message: dict, key: str) -> str:
        """Return ``message[key]`` as a stripped string ('' if missing/null)."""
        value = message.get(key, "")
        return "" if value is None else str(value).strip()

    async def send_best_effort(payload: dict) -> None:
        """Send a frame, logging (not raising) if the socket is unusable."""
        try:
            await websocket.send_json(payload)
        except Exception:
            logger.debug(
                f"[{session_id}] Could not deliver {payload.get('type')} frame",
                exc_info=True,
            )

    async def respond(user_text: str, fallback: str) -> None:
        """Generate Kira's reply to ``user_text`` and stream it as speech.

        ``fallback`` is spoken when the model returns empty content.
        """
        async with reply_lock:
            conversation_history.append({"role": "user", "content": user_text})
            del conversation_history[:-max_history_messages]

            client = get_openai()
            resp = await client.chat.completions.create(
                model="gpt-5.4-nano",
                messages=[
                    {
                        "role": "system",
                        "content": BASE_SYSTEM_PROMPT + memory_suffix,
                    },
                    # Earlier turns plus the message just appended above.
                    *conversation_history,
                ],
                max_completion_tokens=300,
                temperature=0.7,
            )
            kira_text = resp.choices[0].message.content or fallback
            conversation_history.append(
                {"role": "assistant", "content": kira_text}
            )
            logger.info(f"[{session_id}] Kira: {kira_text}")

            # Persisting to memory is best-effort: a failure must not cost the
            # user their reply, but it should be visible in the logs.
            if kira_memory.enabled and identified:
                try:
                    kira_memory.store_messages(user_text, kira_text)
                except Exception:
                    logger.warning(
                        f"[{session_id}] Failed to store messages in memory",
                        exc_info=True,
                    )

            # Streaming TTS
            await websocket.send_json(
                {"type": "speaking_start", "text": kira_text}
            )
            async with client.audio.speech.with_streaming_response.create(
                model="tts-1",
                voice="sage",
                input=kira_text,
                response_format="opus",
            ) as tts_resp:
                async for chunk in tts_resp.iter_bytes():
                    if chunk:
                        b64 = base64.b64encode(chunk).decode("utf-8")
                        await websocket.send_json({"type": "audio", "data": b64})
            await websocket.send_json({"type": "speaking_end"})

    # ── Whisper stream callbacks ──
    async def on_ready():
        logger.info(f"[{session_id}] Whisper stream ready")

    async def on_delta(delta: str):
        """Streaming partial transcript — forward to client."""
        await send_best_effort({"type": "transcript_delta", "text": delta})

    async def on_done(full: str):
        """Full utterance from VAD. Kick off LLM + TTS."""
        logger.info(f"[{session_id}] Full transcript ({len(full)} chars): {full}")
        if not full.strip():
            # Nothing was said; mirror the text path, which skips empty input.
            return
        try:
            await websocket.send_json(
                {"type": "transcript", "role": "user", "text": full}
            )
            await respond(full, "Mhm, I'm here!")
        except Exception:
            # Contain failures here so they do not escape into the
            # WhisperStream that invoked this callback.
            logger.exception(f"[{session_id}] Voice reply failed")
            await send_best_effort(
                {"type": "error", "message": "Failed to generate a reply"}
            )

    async def on_error(msg: str):
        logger.warning(f"Whisper error: {msg}")
        await send_best_effort(
            {"type": "error", "message": f"Transcription error: {msg}"}
        )

    # Start WhisperStream
    stream = WhisperStream(
        on_transcript_delta=on_delta,
        on_transcript_done=on_done,
        on_ready=on_ready,
        on_error=on_error,
    )
    stream_task = asyncio.create_task(stream.connect())
    try:
        await asyncio.sleep(2)  # brief wait for connection
        while True:
            raw = await websocket.receive_text()
            try:
                msg = json.loads(raw)
            except json.JSONDecodeError:
                await websocket.send_json(
                    {"type": "error", "message": "Invalid JSON"}
                )
                continue
            if not isinstance(msg, dict):
                await websocket.send_json(
                    {"type": "error", "message": "Message must be a JSON object"}
                )
                continue
            msg_type = msg.get("type", "")

            # ── Identity & Preferences ──
            if msg_type == "identify":
                user_id = text_field(msg, "user_id")
                user_name = text_field(msg, "name")
                if user_name and user_id:
                    kira_memory.set_user_preference(user_id, "name", user_name)
                prefs = kira_memory.get_user_preferences(user_id)
                identified = True
                if kira_memory.enabled:
                    kira_memory.ensure_peers(user_id)
                    kira_memory.ensure_session(session_id)
                    try:
                        ctx = kira_memory.build_system_prompt_suffix()
                    except Exception:
                        logger.warning(
                            f"[{session_id}] Could not build memory prompt suffix",
                            exc_info=True,
                        )
                    else:
                        if ctx:
                            memory_suffix = ctx
                await websocket.send_json({
                    "type": "identified",
                    "user_id": user_id,
                    "preferences": prefs,
                })
                continue

            if msg_type == "set_preference":
                key = text_field(msg, "key")
                value = text_field(msg, "value")
                # Preferences are only stored for a real, identified user.
                if key and user_id and user_id != "default-user":
                    kira_memory.set_user_preference(user_id, key, value)
                    await websocket.send_json(
                        {"type": "preference_saved", "key": key, "success": True}
                    )
                continue

            # ── PCM16 audio → WhisperStream ──
            if msg_type == "audio":
                try:
                    pcm16 = base64.b64decode(msg["data"])
                except (KeyError, TypeError, ValueError):
                    # Missing field, wrong type, or invalid base64.
                    await websocket.send_json(
                        {"type": "error", "message": "Invalid audio payload"}
                    )
                    continue
                await stream.send_audio(pcm16)
                continue

            # ── Text input → direct LLM + TTS ──
            if msg_type == "conversation_text":
                text = text_field(msg, "text")
                if not text:
                    continue
                logger.info(f"[{session_id}] User (text): {text}")
                await respond(text, "Mhm!")
                continue

            if msg_type == "ping":
                await websocket.send_json({"type": "pong"})
    except WebSocketDisconnect:
        logger.info(f"[{session_id}] Disconnected")
    except Exception as e:
        # logger.exception keeps the traceback that logger.error dropped.
        logger.exception(f"[{session_id}] Error: {e}")
    finally:
        # Cancel the stream task even if the disconnect itself fails.
        try:
            await stream.disconnect()
        finally:
            stream_task.cancel()