# kira/backend/main.py

"""Kira — AI body double backend

REST STT (gpt-4o-transcribe) → gpt-5.4-nano + Honcho → streaming TTS (sage)
"""

import json
import base64
import uuid
import logging
import asyncio

from fastapi import FastAPI, WebSocket, WebSocketDisconnect
from fastapi.middleware.cors import CORSMiddleware

from config import settings
from services.memory import kira_memory

# Configure the root logger at INFO and use a dedicated "kira" logger for
# everything emitted by this module.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger("kira")

# The ASGI application; routes and the WebSocket endpoint are registered on it
# further down in this file.
app = FastAPI(title="Kira Backend")

# CORS is fully open: any origin, method, and header is accepted.
# NOTE(review): the FastAPI/Starlette CORS docs discourage combining wildcard
# origins with allow_credentials=True — consider listing the real frontend
# origins explicitly before deploying this publicly.
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"],
)

# Persona instructions used as the system message of every LLM call.
# get_kira_response() appends the optional memory suffix to this text, so it
# should remain a complete, self-contained prompt on its own.
BASE_SYSTEM_PROMPT = (
    "You are Kira, a warm, kind, and encouraging AI body double. "
    "You speak in a friendly, girly-pop tone. You are helping someone with ADHD "
    "stay focused and on task. Keep responses short, supportive, and uplifting. "
    "Check in on them. Remind them to take breaks. Celebrate small wins. "
    "Use occasional emoji but don't overdo it. Never be judgmental."
)

# Process-wide AsyncOpenAI client; created on first use by get_openai().
_openai = None


def get_openai():
    """Return the shared AsyncOpenAI client, building it on the first call.

    The ``openai`` package is imported lazily so that importing this module
    does not require the SDK until a client is actually needed.
    """
    global _openai
    if _openai is not None:
        return _openai

    from openai import AsyncOpenAI

    _openai = AsyncOpenAI(api_key=settings.openai_api_key)
    return _openai


@app.on_event("startup")
async def startup():
    """Initialise Honcho memory at application start and log the outcome."""
    message = (
        "Honcho memory initialized"
        if kira_memory.init()
        else "Honcho memory not configured"
    )
    logger.info(message)


@app.get("/api/health")
async def health():
    """Health probe: reports service status and whether memory is enabled."""
    payload = {"status": "ok", "name": "kira"}
    payload["memory"] = "active" if kira_memory.enabled else "disabled"
    return payload


async def transcribe_audio(client, audio_b64: str) -> str | None:
    """REST transcription via gpt-4o-transcribe (full utterance blob)."""
    try:
        audio_bytes = base64.b64decode(audio_b64)
        import io
        audio_file = io.BytesIO(audio_bytes)
        audio_file.name = "audio.webm"
        transcript = await client.audio.transcriptions.create(
            model="gpt-4o-transcribe",
            file=audio_file,
        )
        return transcript.text.strip() if transcript.text else None
    except Exception as e:
        logger.error(f"Transcription error: {e}")
        return None


async def get_kira_response(client, user_text: str, memory_suffix: str) -> str:
    """Get Kira's response from gpt-5.4-nano."""
    system_prompt = BASE_SYSTEM_PROMPT
    if memory_suffix:
        system_prompt += memory_suffix
    resp = await client.chat.completions.create(
        model="gpt-5.4-nano",
        messages=[
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": user_text},
        ],
        max_completion_tokens=300,
        temperature=0.7,
    )
    return resp.choices[0].message.content or "Mhm, I'm here! ✨"


async def stream_tts(client, text: str, websocket: WebSocket):
    """Synthesise *text* with the "sage" voice and stream it to the client.

    Frames sent over *websocket*, in order:
        1. ``{"type": "speaking_start", "text": text}``
        2. one ``{"type": "audio", "data": <base64 opus bytes>}`` per
           non-empty chunk received from the TTS response
        3. ``{"type": "speaking_end"}``

    Args:
        client: Object exposing the async streaming
            ``audio.speech.with_streaming_response.create`` call.
        text: The sentence(s) to speak.
        websocket: Connection the frames are written to.

    Raises:
        Whatever the TTS request or the socket raises. On failure a
        ``speaking_end`` frame is still attempted first, so the client is
        not left waiting in its "speaking" state.
    """
    await websocket.send_json({"type": "speaking_start", "text": text})
    try:
        async with client.audio.speech.with_streaming_response.create(
            model="tts-1",
            voice="sage",
            input=text,
            response_format="opus",
        ) as tts_resp:
            async for chunk in tts_resp.iter_bytes():
                if chunk:
                    b64 = base64.b64encode(chunk).decode("utf-8")
                    await websocket.send_json({"type": "audio", "data": b64})
    except Exception:
        # Close the speaking_start/speaking_end pair even though streaming
        # failed. If the socket itself is the problem this send fails too;
        # that secondary error is dropped so the original one propagates.
        try:
            await websocket.send_json({"type": "speaking_end"})
        except Exception:
            pass
        raise
    await websocket.send_json({"type": "speaking_end"})


# Identity used until (or unless) the client identifies with a real user id.
_DEFAULT_USER_ID = "default-user"


def _as_text(value) -> str:
    """Return *value* stripped when it is a string, otherwise ``""``.

    Client payloads are untrusted JSON: a field may be missing, null, or a
    non-string type, and calling ``.strip()`` on those would raise.
    """
    return value.strip() if isinstance(value, str) else ""


async def _reply_to_user(
    websocket: WebSocket,
    client,
    session_id: str,
    user_text: str,
    memory_suffix: str,
    store_memory: bool,
    history: list[dict],
) -> None:
    """Record the user's turn, get Kira's reply, persist it, and speak it."""
    history.append({"role": "user", "content": user_text})

    kira_text = await get_kira_response(client, user_text, memory_suffix)
    history.append({"role": "assistant", "content": kira_text})
    logger.info("[%s] Kira: %s", session_id, kira_text)

    if store_memory:
        # Memory is best effort: a storage failure must not cost the user
        # their spoken reply, but it should be visible in the logs.
        try:
            kira_memory.store_messages(user_text, kira_text)
        except Exception:
            logger.exception("[%s] Failed to store messages in memory", session_id)

    await stream_tts(client, kira_text, websocket)


@app.websocket("/api/ws")
async def conversation_ws(websocket: WebSocket):
    """Run one conversation session over a WebSocket.

    Each incoming text frame is a JSON object whose ``type`` selects the
    action:

    * ``identify`` (``user_id``, ``name``) – bind the session to a user,
      load preferences and memory context; replies with ``identified``.
    * ``set_preference`` (``key``, ``value``) – store a preference for an
      identified user; replies with ``preference_saved`` whose ``success``
      reflects whether anything was actually stored.
    * ``audio`` (``data``: base64 blob) – transcribe, reply, and speak.
    * ``conversation_text`` (``text``) – reply and speak without STT.
    * ``ping`` – replies with ``pong``.

    Unknown types are ignored. A malformed frame, or a failure while
    handling one message, is reported to the client as an ``error`` message
    and the session keeps running; only a disconnect or a broken socket ends
    the loop.
    """
    await websocket.accept()
    session_id = str(uuid.uuid4())[:8]
    user_id = _DEFAULT_USER_ID
    identified = False
    memory_suffix = ""
    logger.info("[%s] WebSocket connected", session_id)

    # NOTE(review): the turns collected here are never sent to the model —
    # each LLM call only sees the system prompt and the latest user message.
    conversation_history: list[dict] = []

    try:
        while True:
            raw = await websocket.receive_text()

            try:
                msg = json.loads(raw)
            except json.JSONDecodeError:
                msg = None
            if not isinstance(msg, dict):
                # Bad input from the client should not tear the session down.
                await websocket.send_json({
                    "type": "error",
                    "message": "Invalid message: expected a JSON object",
                })
                continue

            msg_type = msg.get("type", "")

            try:
                # ── Identity & Preferences ──
                if msg_type == "identify":
                    requested_id = _as_text(msg.get("user_id"))
                    user_name = _as_text(msg.get("name"))
                    # A blank id keeps the anonymous default instead of
                    # replacing it with an empty string.
                    user_id = requested_id or _DEFAULT_USER_ID
                    if user_name and requested_id:
                        kira_memory.set_user_preference(user_id, "name", user_name)

                    prefs = kira_memory.get_user_preferences(user_id)
                    identified = True

                    if kira_memory.enabled:
                        kira_memory.ensure_peers(user_id)
                        kira_memory.ensure_session(session_id)
                        try:
                            ctx = kira_memory.build_system_prompt_suffix()
                        except Exception:
                            # Missing context only makes replies less
                            # personal; carry on without it.
                            logger.exception(
                                "[%s] Failed to build memory context", session_id
                            )
                            ctx = ""
                        if ctx:
                            memory_suffix = ctx

                    await websocket.send_json({
                        "type": "identified",
                        "user_id": user_id,
                        "preferences": prefs,
                    })

                elif msg_type == "set_preference":
                    key = _as_text(msg.get("key"))
                    raw_value = msg.get("value", "")
                    saved = False
                    # Preferences are only stored for identified users, and
                    # only when the value is a string.
                    if (
                        key
                        and isinstance(raw_value, str)
                        and user_id != _DEFAULT_USER_ID
                    ):
                        try:
                            kira_memory.set_user_preference(
                                user_id, key, raw_value.strip()
                            )
                            saved = True
                        except Exception:
                            logger.exception(
                                "[%s] Failed to save preference %r", session_id, key
                            )
                    await websocket.send_json(
                        {"type": "preference_saved", "key": key, "success": saved}
                    )

                # ── Audio (full webm/opus blob from MediaRecorder) → REST STT → LLM → TTS ──
                elif msg_type == "audio":
                    audio_b64 = msg.get("data", "")
                    if not audio_b64:
                        continue

                    client = get_openai()

                    transcript = await transcribe_audio(client, audio_b64)
                    if not transcript:
                        await websocket.send_json({"type": "transcript", "role": "user", "text": "(could not transcribe)"})
                        await websocket.send_json({"type": "error", "message": "Could not transcribe audio"})
                        continue

                    logger.info("[%s] User: %s", session_id, transcript)
                    await websocket.send_json({"type": "transcript_delta", "text": transcript})
                    await websocket.send_json({"type": "transcript", "role": "user", "text": transcript})

                    await _reply_to_user(
                        websocket,
                        client,
                        session_id,
                        transcript,
                        memory_suffix,
                        kira_memory.enabled and identified,
                        conversation_history,
                    )

                # ── Text input → direct LLM + TTS ──
                elif msg_type == "conversation_text":
                    text = _as_text(msg.get("text"))
                    if not text:
                        continue

                    logger.info("[%s] User (text): %s", session_id, text)
                    await _reply_to_user(
                        websocket,
                        get_openai(),
                        session_id,
                        text,
                        memory_suffix,
                        kira_memory.enabled and identified,
                        conversation_history,
                    )

                elif msg_type == "ping":
                    await websocket.send_json({"type": "pong"})

            except WebSocketDisconnect:
                raise
            except Exception:
                # A failed LLM/TTS/memory call affects this message only:
                # report it and keep the session alive. If the socket itself
                # is broken, the send below raises and the outer handler
                # ends the session.
                logger.exception(
                    "[%s] Failed to handle %r message", session_id, msg_type
                )
                await websocket.send_json({
                    "type": "error",
                    "message": "Something went wrong, please try again",
                })

    except WebSocketDisconnect:
        logger.info("[%s] Disconnected", session_id)
    except Exception as e:
        logger.exception("[%s] Error: %s", session_id, e)