# kira/backend/main.py

"""Kira — AI body double backend

Real-time speech-to-speech pipeline:
  mic audio → Whisper API → text → DeepSeek LLM → response text → OpenAI TTS → audio

Honcho memory integration:
  Cross-session user context injected into LLM prompts,
  conversation exchanges stored for continuous learning.
"""

import json
import base64
import uuid
import logging

from fastapi import FastAPI, WebSocket, WebSocketDisconnect
from fastapi.middleware.cors import CORSMiddleware

from config import settings
from services.stt import transcribe_audio
from services.llm import get_kira_response
from services.tts import synthesize_speech
from services.memory import kira_memory

# Root logging at INFO; every log line in this module goes through "kira".
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger("kira")

app = FastAPI(title="Kira Backend")

# CORS is fully open: any origin, method and header, with credentials allowed.
# NOTE(review): the FastAPI CORS docs say wildcard origins cannot be combined
# with credentialed requests — confirm whether an explicit origin list is
# needed before deploying.
_CORS_OPTIONS = {
    "allow_origins": ["*"],
    "allow_credentials": True,
    "allow_methods": ["*"],
    "allow_headers": ["*"],
}
app.add_middleware(CORSMiddleware, **_CORS_OPTIONS)

# ─── Base system prompt (static part) ───
# Persona text sent as the start of every system message. One sentence per
# fragment; adjacent literals are concatenated, so each fragment except the
# last carries its own trailing space.
BASE_SYSTEM_PROMPT = (
    "You are Kira, a warm, kind, and encouraging AI body double. "
    "You speak in a friendly, girly-pop tone. "
    "You are helping someone with ADHD stay focused and on task. "
    "Keep responses short, supportive, and uplifting. "
    "Check in on them. "
    "Remind them to take breaks. "
    "Celebrate small wins. "
    "Use occasional emoji but don't overdo it. "
    "Never be judgmental. "
    "You remember things about them between conversations."
)


@app.on_event("startup")
async def startup():
    """Try to bring up Honcho memory when the app starts and log the outcome.

    A falsy result from ``kira_memory.init()`` is not treated as an error:
    the app simply logs that it is running without memory.
    """
    memory_ready = kira_memory.init()
    if not memory_ready:
        logger.info("Honcho memory not configured — running without memory")
        return
    logger.info("Honcho memory initialized")


@app.get("/api/health")
async def health():
    """Liveness probe: report service name and whether memory is enabled."""
    if kira_memory.enabled:
        memory_state = "active"
    else:
        memory_state = "disabled"

    return {
        "status": "ok",
        "name": "kira",
        "memory": memory_state,
    }


def build_system_prompt(user_id: str) -> dict:
    """Return the LLM system message, extended with memory context if any.

    Starts from ``BASE_SYSTEM_PROMPT``. When ``kira_memory`` is enabled, peers
    are ensured for ``user_id`` and whatever ``build_system_prompt_suffix()``
    returns is appended (if truthy). Any exception raised along the way is
    logged as a warning and the prompt built so far is used.

    Args:
        user_id: Identifier passed to ``kira_memory.ensure_peers``.

    Returns:
        A chat message dict with ``role`` set to ``"system"``.
    """
    content = BASE_SYSTEM_PROMPT

    if not kira_memory.enabled:
        return {"role": "system", "content": content}

    try:
        kira_memory.ensure_peers(user_id)
        suffix = kira_memory.build_system_prompt_suffix()
        # Concatenate inside the try so a bad suffix degrades to the base
        # prompt instead of raising.
        content = content + suffix if suffix else content
    except Exception as exc:
        logger.warning(f"Failed to build memory context: {exc}")

    return {"role": "system", "content": content}


async def _respond_and_speak(
    websocket: WebSocket,
    session_id: str,
    user_id: str,
    conversation_history: list[dict],
    user_text: str,
    log_label: str = "User",
) -> None:
    """Run one LLM → memory → TTS turn for ``user_text`` and send it to the client.

    Appends the user message and Kira's reply to ``conversation_history``
    (mutated in place), stores the exchange in Honcho when memory is enabled,
    then emits ``speaking_start``, ``audio`` and ``speaking_end`` messages.

    Args:
        websocket: Accepted connection to send results on.
        session_id: Short id used as a log prefix.
        user_id: Passed to ``build_system_prompt``.
        conversation_history: Running list of chat messages for this socket.
        user_text: What the user said or typed.
        log_label: Label used in the log line for the user's message.
    """
    logger.info(f"[{session_id}] {log_label}: {user_text}")
    conversation_history.append({"role": "user", "content": user_text})

    # Build the prompt here, right before the LLM call, so it is only built
    # for turns that actually use it and reflects the latest stored context.
    system_prompt = build_system_prompt(user_id)
    # Only the 10 most recent history entries are sent to the model.
    messages = [system_prompt] + conversation_history[-10:]
    kira_text = await get_kira_response(messages)

    conversation_history.append({"role": "assistant", "content": kira_text})
    logger.info(f"[{session_id}] Kira: {kira_text}")

    # Memory storage is best-effort: a failure must not block the reply.
    if kira_memory.enabled:
        try:
            kira_memory.store_messages(user_text, kira_text)
        except Exception as e:
            logger.warning(f"[{session_id}] Failed to store messages: {e}")

    await websocket.send_json({"type": "speaking_start", "text": kira_text})
    audio_bytes = await synthesize_speech(kira_text)
    audio_b64 = base64.b64encode(audio_bytes).decode("utf-8")
    await websocket.send_json({
        "type": "audio",
        "data": audio_b64,
        "text": kira_text,
    })
    await websocket.send_json({"type": "speaking_end"})


@app.websocket("/api/ws")
async def conversation_ws(websocket: WebSocket):
    """Serve one conversation over a WebSocket.

    Each inbound frame is a JSON object with a ``type`` field:

    * ``audio_chunk`` — base64 audio in ``data``; appended to the utterance
      buffer.
    * ``transcribe`` — transcribe the buffered audio, echo a ``transcript``
      message, then reply via LLM and TTS.
    * ``conversation_text`` — reply to the typed ``text`` via LLM and TTS.
    * ``ping`` — answered with ``pong``.

    Unknown types are ignored. Malformed frames get an ``error`` message and
    the session continues. Any other exception is logged, reported to the
    client if possible, and ends the handler.

    NOTE(review): ``kira_memory`` is module-global and ``store_messages`` is
    called without a session or user argument, so concurrent connections
    presumably share Honcho session state — confirm in ``services.memory``.
    """
    await websocket.accept()
    session_id = str(uuid.uuid4())[:8]
    user_id = "default-user"
    logger.info(f"[{session_id}] WebSocket connected")

    # Audio buffer accumulates chunks from one utterance
    audio_buffer = bytearray()
    conversation_history: list[dict] = []

    # Initialize Honcho for this session; failure only disables memory setup.
    if kira_memory.enabled:
        try:
            kira_memory.ensure_peers(user_id)
            kira_memory.ensure_session(session_id)
            logger.info(f"[{session_id}] Honcho session ready")
        except Exception as e:
            logger.warning(f"[{session_id}] Honcho setup failed: {e}")

    try:
        while True:
            raw = await websocket.receive_text()

            # A single bad frame should not tear down the whole session.
            try:
                msg = json.loads(raw)
            except ValueError:  # json.JSONDecodeError subclasses ValueError
                await websocket.send_json({"type": "error", "message": "Invalid JSON message"})
                continue
            if not isinstance(msg, dict):
                await websocket.send_json({"type": "error", "message": "Invalid message format"})
                continue

            msg_type = msg.get("type", "")

            if msg_type == "audio_chunk":
                try:
                    chunk = base64.b64decode(msg["data"])
                except (KeyError, TypeError, ValueError):
                    # Missing field, non-string payload, or bad base64
                    # (binascii.Error is a ValueError).
                    await websocket.send_json({"type": "error", "message": "Invalid audio chunk"})
                    continue
                audio_buffer.extend(chunk)

            elif msg_type == "transcribe":
                if not audio_buffer:
                    await websocket.send_json({"type": "error", "message": "No audio data"})
                    continue

                logger.info(f"[{session_id}] Transcribing {len(audio_buffer)} bytes...")

                # 1. Speech-to-text; the buffer is reset for the next utterance.
                transcript = await transcribe_audio(bytes(audio_buffer))
                audio_buffer.clear()

                if not transcript:
                    await websocket.send_json({"type": "error", "message": "Could not transcribe audio"})
                    continue

                # Echo transcript
                await websocket.send_json({
                    "type": "transcript",
                    "text": transcript,
                })

                # 2-4. LLM reply, Honcho storage, TTS
                await _respond_and_speak(
                    websocket, session_id, user_id, conversation_history, transcript
                )

            elif msg_type == "ping":
                await websocket.send_json({"type": "pong"})

            elif msg_type == "conversation_text":
                text_field = msg.get("text", "")
                # Ignore empty or non-string text rather than failing on .strip().
                user_text = text_field.strip() if isinstance(text_field, str) else ""
                if not user_text:
                    continue

                await _respond_and_speak(
                    websocket,
                    session_id,
                    user_id,
                    conversation_history,
                    user_text,
                    log_label="User (text)",
                )

    except WebSocketDisconnect:
        logger.info(f"[{session_id}] Disconnected")
    except Exception as e:
        # logger.exception logs at ERROR level and keeps the traceback.
        logger.exception(f"[{session_id}] Error: {e}")
        try:
            await websocket.send_json({"type": "error", "message": str(e)})
        except Exception:
            # Best-effort: the socket may already be closed.
            logger.debug(f"[{session_id}] Could not deliver error to client")