"""Kira — AI body double backend Cheapest pipeline: gpt-4o-mini-transcribe STT → gpt-5.4-nano LLM → OpenAI TTS ~$0.019/min total, simple 3-step chat completions. """ import json import base64 import uuid import logging import asyncio from fastapi import FastAPI, WebSocket, WebSocketDisconnect from fastapi.middleware.cors import CORSMiddleware from config import settings from services.memory import kira_memory logging.basicConfig(level=logging.INFO) logger = logging.getLogger("kira") app = FastAPI(title="Kira Backend") app.add_middleware( CORSMiddleware, allow_origins=["*"], allow_credentials=True, allow_methods=["*"], allow_headers=["*"], ) # System prompt BASE_SYSTEM_PROMPT = ( "You are Kira, a warm, kind, and encouraging AI body double. " "You speak in a friendly, girly-pop tone. You are helping someone with ADHD " "stay focused and on task. Keep responses short, supportive, and uplifting. " "Check in on them. Remind them to take breaks. Celebrate small wins. " "Use occasional emoji but don't overdo it. Never be judgmental." ) _openai = None def get_openai(): global _openai if _openai is None: from openai import AsyncOpenAI _openai = AsyncOpenAI(api_key=settings.openai_api_key) return _openai @app.on_event("startup") async def startup(): if kira_memory.init(): logger.info("Honcho memory initialized") else: logger.info("Honcho memory not configured") @app.get("/api/health") async def health(): mem_status = "active" if kira_memory.enabled else "disabled" return {"status": "ok", "name": "kira", "memory": mem_status} def build_system_prompt(user_id: str) -> str: prompt = BASE_SYSTEM_PROMPT if kira_memory.enabled: try: kira_memory.ensure_peers(user_id) suffix = kira_memory.build_system_prompt_suffix() if suffix: prompt += suffix except Exception as e: logger.warning(f"Memory context failed: {e}") return prompt async def run_conversation(text: str, user_id: str) -> str: """STT → LLM → TTS using the cheapest models.""" system_prompt = build_system_prompt(user_id) client = get_openai() # LLM resp = await client.chat.completions.create( model="gpt-5.4-nano", messages=[ {"role": "system", "content": system_prompt}, {"role": "user", "content": text}, ], max_tokens=300, temperature=0.7, ) kira_text = resp.choices[0].message.content or "Mhm, I'm here!" return kira_text async def transcribe_audio(audio_bytes: bytes) -> str | None: """Transcribe audio bytes using cheapest STT model.""" client = get_openai() try: transcript = await client.audio.transcriptions.create( model="gpt-4o-mini-transcribe", file=("audio.webm", audio_bytes, "audio/webm"), response_format="text", ) return transcript.strip() if transcript and transcript.strip() else None except Exception as e: logger.warning(f"STT error: {e}") return None async def synthesize_speech(text: str) -> bytes: """Generate TTS audio from text.""" client = get_openai() try: resp = await client.audio.speech.create( model="tts-1", voice="nova", input=text, response_format="opus", ) return resp.content except Exception as e: logger.warning(f"TTS error: {e}") return b"" @app.websocket("/api/ws") async def conversation_ws(websocket: WebSocket): await websocket.accept() session_id = str(uuid.uuid4())[:8] user_id = "default-user" identified = False logger.info(f"[{session_id}] WebSocket connected") audio_buffer = bytearray() conversation_history: list[dict] = [] try: while True: raw = await websocket.receive_text() msg = json.loads(raw) msg_type = msg.get("type", "") # ── Identity & Preferences ── if msg_type == "identify": user_id = msg.get("user_id", "").strip() user_name = msg.get("name", "").strip() if user_name and user_id: kira_memory.set_user_preference(user_id, "name", user_name) prefs = kira_memory.get_user_preferences(user_id) identified = True if kira_memory.enabled: kira_memory.ensure_peers(user_id) kira_memory.ensure_session(session_id) await websocket.send_json({ "type": "identified", "user_id": user_id, "preferences": prefs, }) continue if msg_type == "set_preference": key = msg.get("key", "").strip() value = msg.get("value", "").strip() if key and user_id and user_id != "default-user": kira_memory.set_user_preference(user_id, key, value) await websocket.send_json({ "type": "preference_saved", "key": key, "success": True, }) continue # ── Conversation ── if msg_type == "audio": # Accumulate PCM16 audio chunks chunk = base64.b64decode(msg["data"]) audio_buffer.extend(chunk) elif msg_type == "transcribe": if not audio_buffer: await websocket.send_json({"type": "error", "message": "No audio data"}) continue logger.info(f"[{session_id}] Transcribing {len(audio_buffer)} bytes...") # 1. STT transcript = await transcribe_audio(bytes(audio_buffer)) audio_buffer.clear() if not transcript: await websocket.send_json({"type": "error", "message": "Could not transcribe"}) continue await websocket.send_json({"type": "transcript", "role": "user", "text": transcript}) conversation_history.append({"role": "user", "content": transcript}) # 2. LLM logger.info(f"[{session_id}] User: {transcript}") kira_text = await run_conversation(transcript, user_id) conversation_history.append({"role": "assistant", "content": kira_text}) logger.info(f"[{session_id}] Kira: {kira_text}") # Store in Honcho if kira_memory.enabled and identified: try: kira_memory.store_messages(transcript, kira_text) except Exception: pass # 3. TTS await websocket.send_json({"type": "speaking_start", "text": kira_text}) audio_bytes = await synthesize_speech(kira_text) audio_b64 = base64.b64encode(audio_bytes).decode("utf-8") await websocket.send_json({"type": "audio", "data": audio_b64, "text": kira_text}) await websocket.send_json({"type": "speaking_end"}) elif msg_type == "conversation_text": user_text = msg.get("text", "").strip() if not user_text: continue conversation_history.append({"role": "user", "content": user_text}) logger.info(f"[{session_id}] User (text): {user_text}") kira_text = await run_conversation(user_text, user_id) conversation_history.append({"role": "assistant", "content": kira_text}) logger.info(f"[{session_id}] Kira: {kira_text}") if kira_memory.enabled and identified: try: kira_memory.store_messages(user_text, kira_text) except Exception: pass await websocket.send_json({"type": "speaking_start", "text": kira_text}) audio_bytes = await synthesize_speech(kira_text) audio_b64 = base64.b64encode(audio_bytes).decode("utf-8") await websocket.send_json({"type": "audio", "data": audio_b64, "text": kira_text}) await websocket.send_json({"type": "speaking_end"}) elif msg_type == "ping": await websocket.send_json({"type": "pong"}) except WebSocketDisconnect: logger.info(f"[{session_id}] Disconnected") except Exception as e: logger.error(f"[{session_id}] Error: {e}") try: await websocket.send_json({"type": "error", "message": str(e)}) except Exception: pass