"""Kira — AI body double backend Realtime WebSocket STT (gpt-realtime-whisper) → gpt-5.4-nano → streaming TTS """ import json import base64 import uuid import logging import time import asyncio from fastapi import FastAPI, WebSocket, WebSocketDisconnect from fastapi.middleware.cors import CORSMiddleware from config import settings from services.memory import kira_memory from services.whisper_stream import WhisperStream logging.basicConfig(level=logging.INFO) logger = logging.getLogger("kira") app = FastAPI(title="Kira Backend") app.add_middleware( CORSMiddleware, allow_origins=["*"], allow_credentials=True, allow_methods=["*"], allow_headers=["*"], ) BASE_SYSTEM_PROMPT = ( "You are Kira, a warm, kind, and encouraging AI body double. " "You speak in a friendly, girly-pop tone. You are helping someone with ADHD " "stay focused and on task. Keep responses short, supportive, and uplifting. " "Check in on them. Remind them to take breaks. Celebrate small wins. " "Use occasional emoji but don't overdo it. Never be judgmental." ) _openai = None def get_openai(): global _openai if _openai is None: from openai import AsyncOpenAI _openai = AsyncOpenAI(api_key=settings.openai_api_key) return _openai @app.on_event("startup") async def startup(): if kira_memory.init(): logger.info("Honcho memory initialized") else: logger.info("Honcho memory not configured") @app.get("/api/health") async def health(): mem_status = "active" if kira_memory.enabled else "disabled" return {"status": "ok", "name": "kira", "memory": mem_status} @app.websocket("/api/ws") async def conversation_ws(websocket: WebSocket): await websocket.accept() session_id = str(uuid.uuid4())[:8] user_id = "default-user" identified = False memory_suffix = "" logger.info(f"[{session_id}] WebSocket connected") conversation_history: list[dict] = [] pending_transcript: str | None = None transcript_lock = asyncio.Lock() # ── Whisper stream callbacks ── async def on_ready(): logger.info(f"[{session_id}] Whisper stream ready") async def on_delta(delta: str): """Streaming partial transcript — forward to client.""" try: await websocket.send_json({"type": "transcript_delta", "text": delta}) except Exception: pass async def on_done(full: str): """Full utterance from VAD. Kick off LLM + TTS.""" nonlocal pending_transcript logger.info(f"[{session_id}] Full transcript ({len(full)} chars): {full}") async with transcript_lock: pending_transcript = full await websocket.send_json({"type": "transcript", "role": "user", "text": full}) conversation_history.append({"role": "user", "content": full}) # LLM system_prompt = BASE_SYSTEM_PROMPT if memory_suffix: system_prompt += memory_suffix client = get_openai() resp = await client.chat.completions.create( model="gpt-5.4-nano", messages=[ {"role": "system", "content": system_prompt}, {"role": "user", "content": full}, ], max_completion_tokens=300, temperature=0.7, ) kira_text = resp.choices[0].message.content or "Mhm, I'm here!" conversation_history.append({"role": "assistant", "content": kira_text}) logger.info(f"[{session_id}] Kira: {kira_text}") # Store in Honcho if kira_memory.enabled and identified: try: kira_memory.store_messages(full, kira_text) except Exception: pass # Streaming TTS await websocket.send_json({"type": "speaking_start", "text": kira_text}) async with client.audio.speech.with_streaming_response.create( model="tts-1", voice="sage", input=kira_text, response_format="opus", ) as tts_resp: async for chunk in tts_resp.iter_bytes(): if chunk: b64 = base64.b64encode(chunk).decode("utf-8") await websocket.send_json({"type": "audio", "data": b64}) await websocket.send_json({"type": "speaking_end"}) async def on_error(msg: str): logger.warning(f"Whisper error: {msg}") try: await websocket.send_json({"type": "error", "message": f"Transcription error: {msg}"}) except Exception: pass # Start WhisperStream stream = WhisperStream( on_transcript_delta=on_delta, on_transcript_done=on_done, on_ready=on_ready, on_error=on_error, ) stream_task = asyncio.create_task(stream.connect()) await asyncio.sleep(2) # brief wait for connection try: while True: raw = await websocket.receive_text() msg = json.loads(raw) msg_type = msg.get("type", "") # ── Identity & Preferences ── if msg_type == "identify": user_id = msg.get("user_id", "").strip() user_name = msg.get("name", "").strip() if user_name and user_id: kira_memory.set_user_preference(user_id, "name", user_name) prefs = kira_memory.get_user_preferences(user_id) identified = True if kira_memory.enabled: kira_memory.ensure_peers(user_id) kira_memory.ensure_session(session_id) try: ctx = kira_memory.build_system_prompt_suffix() if ctx: memory_suffix = ctx except Exception: pass await websocket.send_json({ "type": "identified", "user_id": user_id, "preferences": prefs, }) continue if msg_type == "set_preference": key = msg.get("key", "").strip() value = msg.get("value", "").strip() if key and user_id and user_id != "default-user": kira_memory.set_user_preference(user_id, key, value) await websocket.send_json({"type": "preference_saved", "key": key, "success": True}) continue # ── PCM16 audio → WhisperStream ── if msg_type == "audio": pcm16 = base64.b64decode(msg["data"]) await stream.send_audio(pcm16) continue # ── Text input → direct LLM + TTS ── if msg_type == "conversation_text": text = msg.get("text", "").strip() if not text: continue logger.info(f"[{session_id}] User (text): {text}") conversation_history.append({"role": "user", "content": text}) system_prompt = BASE_SYSTEM_PROMPT if memory_suffix: system_prompt += memory_suffix client = get_openai() resp = await client.chat.completions.create( model="gpt-5.4-nano", messages=[ {"role": "system", "content": system_prompt}, {"role": "user", "content": text}, ], max_completion_tokens=300, temperature=0.7, ) kira_text = resp.choices[0].message.content or "Mhm!" conversation_history.append({"role": "assistant", "content": kira_text}) logger.info(f"[{session_id}] Kira: {kira_text}") if kira_memory.enabled and identified: try: kira_memory.store_messages(text, kira_text) except Exception: pass await websocket.send_json({"type": "speaking_start", "text": kira_text}) async with client.audio.speech.with_streaming_response.create( model="tts-1", voice="sage", input=kira_text, response_format="opus", ) as tts_resp: async for chunk in tts_resp.iter_bytes(): if chunk: b64 = base64.b64encode(chunk).decode("utf-8") await websocket.send_json({"type": "audio", "data": b64}) await websocket.send_json({"type": "speaking_end"}) continue if msg_type == "ping": await websocket.send_json({"type": "pong"}) except WebSocketDisconnect: logger.info(f"[{session_id}] Disconnected") except Exception as e: logger.error(f"[{session_id}] Error: {e}") finally: await stream.disconnect() stream_task.cancel()