# kira/backend/main.py

"""Kira — AI body double backend

Realtime WebSocket STT (gpt-realtime-whisper) → gpt-5.4-nano → streaming TTS
"""

import json
import base64
import uuid
import logging
import time
import asyncio

from fastapi import FastAPI, WebSocket, WebSocketDisconnect
from fastapi.middleware.cors import CORSMiddleware

from config import settings
from services.memory import kira_memory
from services.whisper_stream import WhisperStream

# Root logging is configured at INFO; this module logs through the "kira" logger.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger("kira")

# The ASGI application; routes and the WebSocket endpoint are registered on it below.
app = FastAPI(title="Kira Backend")

# CORS: every origin, method and header is accepted, with credentials allowed.
# NOTE(review): FastAPI's CORS documentation says allow_origins/allow_methods/
# allow_headers should not be ["*"] when allow_credentials=True — confirm whether
# credentials are actually needed, or replace the wildcard with an explicit
# origin list for deployment.
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"],
)

# Persona prompt sent as the system message of every chat-completion request.
# A per-session memory suffix may be appended to it by the WebSocket handler.
BASE_SYSTEM_PROMPT = (
    "You are Kira, a warm, kind, and encouraging AI body double. "
    "You speak in a friendly, girly-pop tone. You are helping someone with ADHD "
    "stay focused and on task. Keep responses short, supportive, and uplifting. "
    "Check in on them. Remind them to take breaks. Celebrate small wins. "
    "Use occasional emoji but don't overdo it. Never be judgmental."
)

# Process-wide AsyncOpenAI client, created on first use by get_openai().
_openai = None


def get_openai():
    """Return the shared AsyncOpenAI client, building it on the first call.

    The ``openai`` package is imported lazily, so it is only loaded once a
    client is actually requested. Later calls return the cached instance.
    """
    global _openai
    if _openai is not None:
        return _openai

    from openai import AsyncOpenAI

    _openai = AsyncOpenAI(api_key=settings.openai_api_key)
    return _openai


@app.on_event("startup")
async def startup():
    """Initialise the memory service at application start and log the outcome."""
    memory_ready = kira_memory.init()
    logger.info(
        "Honcho memory initialized" if memory_ready else "Honcho memory not configured"
    )


@app.get("/api/health")
async def health():
    """Liveness probe: report service status and whether memory is enabled."""
    payload = {"status": "ok", "name": "kira"}
    payload["memory"] = "active" if kira_memory.enabled else "disabled"
    return payload


# Only this many of the most recent chat messages are kept and replayed to the LLM.
_MAX_HISTORY_MESSAGES = 20

# Upper bound, in seconds, on waiting for the Whisper stream's ready callback.
_STREAM_READY_TIMEOUT = 2.0


def _clean_field(value) -> str:
    """Return a client-supplied field as a stripped string (``None`` becomes ``""``)."""
    return "" if value is None else str(value).strip()


@app.websocket("/api/ws")
async def conversation_ws(websocket: WebSocket):
    """Run one conversation session over a WebSocket.

    Client → server JSON messages, selected by their ``type`` field:

    * ``identify``          – ``user_id`` and optional ``name``; answered with ``identified``.
    * ``set_preference``    – ``key`` / ``value``; answered with ``preference_saved``.
    * ``audio``             – base64 audio in ``data``, forwarded to the Whisper stream.
    * ``conversation_text`` – typed user text in ``text``; answered with a spoken reply.
    * ``ping``              – answered with ``pong``.

    Server → client messages: ``transcript_delta``, ``transcript``,
    ``speaking_start``, ``audio`` (base64 opus chunks), ``speaking_end``,
    ``identified``, ``preference_saved`` and ``pong``.

    Malformed client messages (invalid JSON, non-object payloads, undecodable
    audio) are logged and skipped instead of ending the session. The Whisper
    stream is always shut down when the handler exits.
    """
    await websocket.accept()
    session_id = str(uuid.uuid4())[:8]
    user_id = "default-user"
    identified = False
    memory_suffix = ""
    logger.info(f"[{session_id}] WebSocket connected")

    conversation_history: list[dict] = []
    # Voice replies are produced from the Whisper callbacks while text replies are
    # produced from the receive loop; serialising them keeps one reply's
    # speaking_start/audio/speaking_end frames from interleaving with another's.
    reply_lock = asyncio.Lock()
    stream_ready = asyncio.Event()

    async def reply_to(user_text: str, fallback: str) -> None:
        """Generate Kira's answer to *user_text* and stream it to the client as speech.

        *fallback* is used as the reply when the model returns empty content.
        """
        async with reply_lock:
            conversation_history.append({"role": "user", "content": user_text})
            # Keep the history bounded; this is a no-op while it is short enough.
            del conversation_history[:-_MAX_HISTORY_MESSAGES]

            client = get_openai()
            resp = await client.chat.completions.create(
                model="gpt-5.4-nano",
                messages=[
                    {"role": "system", "content": BASE_SYSTEM_PROMPT + memory_suffix},
                    # Recent turns (ending with the current user message) give the
                    # model the context of the ongoing conversation.
                    *conversation_history,
                ],
                max_completion_tokens=300,
                temperature=0.7,
            )
            kira_text = resp.choices[0].message.content or fallback
            conversation_history.append({"role": "assistant", "content": kira_text})
            logger.info(f"[{session_id}] Kira: {kira_text}")

            # Store in Honcho — best effort, a memory failure must not block the reply.
            if kira_memory.enabled and identified:
                try:
                    kira_memory.store_messages(user_text, kira_text)
                except Exception:
                    logger.warning(
                        f"[{session_id}] Failed to store messages in memory",
                        exc_info=True,
                    )

            # Streaming TTS
            await websocket.send_json({"type": "speaking_start", "text": kira_text})
            async with client.audio.speech.with_streaming_response.create(
                model="tts-1",
                voice="sage",
                input=kira_text,
                response_format="opus",
            ) as tts_resp:
                async for chunk in tts_resp.iter_bytes():
                    if chunk:
                        b64 = base64.b64encode(chunk).decode("utf-8")
                        await websocket.send_json({"type": "audio", "data": b64})
            await websocket.send_json({"type": "speaking_end"})

    # ── Whisper stream callbacks ──

    async def on_ready():
        logger.info(f"[{session_id}] Whisper stream ready")
        stream_ready.set()

    async def on_delta(delta: str):
        """Streaming partial transcript — forward to client (best effort)."""
        try:
            await websocket.send_json({"type": "transcript_delta", "text": delta})
        except Exception:
            logger.debug(f"[{session_id}] Dropped transcript delta", exc_info=True)

    async def on_done(full: str):
        """Full utterance from VAD. Kick off LLM + TTS."""
        logger.info(f"[{session_id}] Full transcript ({len(full)} chars): {full}")
        if not full or not full.strip():
            # Nothing was said; the text path skips empty input the same way.
            return

        try:
            await websocket.send_json({"type": "transcript", "role": "user", "text": full})
            await reply_to(full, "Mhm, I'm here!")
        except Exception:
            # Log here so a failed reply is visible instead of surfacing (or being
            # lost) inside the Whisper stream that invoked this callback.
            logger.exception(f"[{session_id}] Failed to answer voice utterance")

    async def on_error(msg: str):
        logger.warning(f"[{session_id}] Whisper error: {msg}")

    # Start WhisperStream
    stream = WhisperStream(
        on_transcript_delta=on_delta,
        on_transcript_done=on_done,
        on_ready=on_ready,
        on_error=on_error,
    )
    stream_task = asyncio.create_task(stream.connect())

    try:
        # Wait for the stream to report ready, but never longer than the
        # previous fixed delay; on timeout carry on as before.
        try:
            await asyncio.wait_for(stream_ready.wait(), timeout=_STREAM_READY_TIMEOUT)
        except asyncio.TimeoutError:
            logger.warning(
                f"[{session_id}] Whisper stream not ready after {_STREAM_READY_TIMEOUT}s"
            )

        while True:
            raw = await websocket.receive_text()
            try:
                msg = json.loads(raw)
            except ValueError:
                logger.warning(f"[{session_id}] Ignoring non-JSON message")
                continue
            if not isinstance(msg, dict):
                logger.warning(f"[{session_id}] Ignoring non-object message")
                continue
            msg_type = msg.get("type", "")

            # ── Identity & Preferences ──
            if msg_type == "identify":
                requested_id = _clean_field(msg.get("user_id"))
                user_name = _clean_field(msg.get("name"))

                if requested_id:
                    user_id = requested_id
                    identified = True
                    if user_name:
                        kira_memory.set_user_preference(user_id, "name", user_name)
                else:
                    # Keep the current identity rather than switching to an empty id.
                    logger.warning(f"[{session_id}] identify without user_id ignored")

                prefs = kira_memory.get_user_preferences(user_id)

                if requested_id and kira_memory.enabled:
                    kira_memory.ensure_peers(user_id)
                    kira_memory.ensure_session(session_id)
                    try:
                        ctx = kira_memory.build_system_prompt_suffix()
                        if ctx:
                            memory_suffix = ctx
                    except Exception:
                        logger.warning(
                            f"[{session_id}] Failed to build memory context",
                            exc_info=True,
                        )

                # user_id echoes the identity the server is actually using.
                await websocket.send_json({
                    "type": "identified",
                    "user_id": user_id,
                    "preferences": prefs,
                })
                continue

            if msg_type == "set_preference":
                key = _clean_field(msg.get("key"))
                value = _clean_field(msg.get("value"))
                # Preferences are only stored for a real (non-default) user.
                saved = bool(key and user_id and user_id != "default-user")
                if saved:
                    kira_memory.set_user_preference(user_id, key, value)
                await websocket.send_json(
                    {"type": "preference_saved", "key": key, "success": saved}
                )
                continue

            # ── PCM16 audio → WhisperStream ──
            if msg_type == "audio":
                try:
                    pcm16 = base64.b64decode(msg.get("data"))
                except (TypeError, ValueError):
                    # Missing or undecodable payload; binascii.Error is a ValueError.
                    logger.warning(f"[{session_id}] Ignoring invalid audio payload")
                    continue
                await stream.send_audio(pcm16)
                continue

            # ── Text input → direct LLM + TTS ──
            if msg_type == "conversation_text":
                text = _clean_field(msg.get("text"))
                if not text:
                    continue

                logger.info(f"[{session_id}] User (text): {text}")
                await reply_to(text, "Mhm!")
                continue

            if msg_type == "ping":
                await websocket.send_json({"type": "pong"})

    except WebSocketDisconnect:
        logger.info(f"[{session_id}] Disconnected")
    except Exception as e:
        logger.exception(f"[{session_id}] Error: {e}")
    finally:
        # Cancel the stream task even when the graceful disconnect itself fails.
        try:
            await stream.disconnect()
        except Exception:
            logger.exception(f"[{session_id}] Error while closing Whisper stream")
        finally:
            stream_task.cancel()