Files
kira/backend/main.py
T
hobokenchicken 274d04ea10 feat: hybrid pipeline — gpt-realtime-whisper + gpt-5.4-nano + TTS
Hybrid approach gives streaming STT at ~$0.017/min + cheap brain
at ~$0.001/min + TTS at ~$0.015/min = ~$0.033/min total.

- gpt-realtime-whisper handles streaming transcription with VAD
- gpt-5.4-nano handles response generation (chat completions)
- OpenAI TTS (nova) for voice output
- Server VAD detects utterance boundaries
- Honcho memory context injected into system prompt
- Removed old full Realtime relay service
2026-06-04 13:48:06 -04:00

216 lines
6.9 KiB
Python

"""Kira — AI body double backend
Hybrid pipeline: gpt-realtime-whisper (streaming STT) → gpt-5.4-nano (LLM) → OpenAI TTS
"""
import json
import base64
import uuid
import logging
import asyncio
from fastapi import FastAPI, WebSocket, WebSocketDisconnect
from fastapi.middleware.cors import CORSMiddleware
from config import settings
from services.hybrid import HybridPipeline
from services.memory import kira_memory
# Process-wide logging at INFO; the module logs through the "kira" logger.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger("kira")
# ASGI application; the routes below are registered on it via decorators.
app = FastAPI(title="Kira Backend")
# CORS is fully open: any origin, method and header, with credentials allowed.
# NOTE(review): browsers do not accept a wildcard Access-Control-Allow-Origin
# on credentialed requests — confirm whether credentials are really needed, or
# replace "*" with an explicit origin list before exposing this publicly.
app.add_middleware(
CORSMiddleware,
allow_origins=["*"],
allow_credentials=True,
allow_methods=["*"],
allow_headers=["*"],
)
@app.on_event("startup")
async def startup():
    """Initialise Honcho memory at application startup and log the outcome.

    ``kira_memory.init()`` is called exactly once; a truthy result is logged
    as initialised, anything else as not configured.
    """
    outcome = (
        "Honcho memory initialized"
        if kira_memory.init()
        else "Honcho memory not configured"
    )
    logger.info(outcome)
@app.get("/api/health")
async def health():
    """Return a static liveness payload plus the memory subsystem state.

    The ``memory`` field is ``"active"`` when ``kira_memory.enabled`` is
    truthy and ``"disabled"`` otherwise.
    """
    if kira_memory.enabled:
        memory_state = "active"
    else:
        memory_state = "disabled"
    return {"status": "ok", "name": "kira", "memory": memory_state}
@app.websocket("/api/ws")
async def conversation_ws(websocket: WebSocket):
    """Serve one conversation session over a WebSocket.

    The handler accepts the socket, starts a ``HybridPipeline`` and waits up
    to 15 seconds for its ``on_ready`` callback. It then relays traffic in
    both directions until the client disconnects.

    Client -> server JSON messages (selected by ``type``):
      * ``identify``          -- ``user_id`` / ``name``; replies ``identified``
      * ``set_preference``    -- ``key`` / ``value`` for an identified user
      * ``audio``             -- base64 payload in ``data``, queued for the pipeline
      * ``conversation_text`` -- ``text``, queued for the pipeline
      * ``ping``              -- replies ``pong``
    Other types are ignored.

    Server -> client JSON messages: ``transcript_delta``, ``transcript``,
    ``audio``, ``speaking_start``, ``speaking_end``, ``identified``, ``pong``
    and ``error``.

    A malformed frame (invalid JSON, non-object JSON, undecodable base64) is
    reported or skipped without ending the session. All pipeline resources
    are released in a single ``finally`` block, including when the pipeline
    never becomes ready.
    """
    await websocket.accept()
    session_id = str(uuid.uuid4())[:8]
    user_id = "default-user"
    identified = False
    logger.info(f"[{session_id}] WebSocket connected")

    pipeline_ready = asyncio.Event()
    audio_queue: asyncio.Queue[bytes] = asyncio.Queue()
    text_queue: asyncio.Queue[str] = asyncio.Queue()

    # ── Pipeline -> client callbacks ──
    async def on_ready():
        pipeline_ready.set()
        logger.info(f"[{session_id}] Pipeline ready")

    async def on_transcript_delta(delta: str):
        """Streaming partial transcript."""
        await websocket.send_json({"type": "transcript_delta", "text": delta})

    async def on_transcript_done(full: str):
        """Full utterance received."""
        await websocket.send_json({"type": "transcript", "role": "user", "text": full})

    async def on_audio_delta(audio_bytes: bytes):
        """Forward TTS audio to client (best effort)."""
        try:
            audio_b64 = base64.b64encode(audio_bytes).decode("utf-8")
            await websocket.send_json({"type": "audio", "data": audio_b64})
        except Exception:
            # Still best effort, but leave a trace instead of failing silently.
            logger.debug(f"[{session_id}] Dropped outgoing audio chunk", exc_info=True)

    async def on_speech_start():
        await websocket.send_json({"type": "speaking_start"})

    async def on_speech_end():
        await websocket.send_json({"type": "speaking_end"})

    async def on_error(msg: str):
        await websocket.send_json({"type": "error", "message": msg})

    # The memory suffix starts empty; it is filled in on "identify" below.
    pipeline = HybridPipeline(
        on_transcript_delta=on_transcript_delta,
        on_transcript_done=on_transcript_done,
        on_audio_delta=on_audio_delta,
        on_speech_start=on_speech_start,
        on_speech_end=on_speech_end,
        on_ready=on_ready,
        on_error=on_error,
        memory_suffix="",
    )
    pipeline_task = asyncio.create_task(pipeline.connect())

    # ── Client -> pipeline forwarders ──
    # NOTE(review): both loops poll the pipeline's private `_connected` flag;
    # a public accessor on HybridPipeline would be cleaner if one exists.
    # NOTE(review): once these loops exit, the receive loop keeps enqueueing
    # into unbounded queues until the client disconnects.
    async def forward_audio():
        while pipeline._connected:
            try:
                pcm16 = await asyncio.wait_for(audio_queue.get(), timeout=1)
                await pipeline.send_audio(pcm16)
            except asyncio.TimeoutError:
                continue
            except Exception:
                logger.exception(f"[{session_id}] Audio forwarding stopped")
                break

    async def forward_text():
        while pipeline._connected:
            try:
                text = await asyncio.wait_for(text_queue.get(), timeout=1)
                await pipeline.send_text(text)
                # Store in Honcho
                if kira_memory.enabled and identified:
                    kira_memory.store_user_message(text)
            except asyncio.TimeoutError:
                continue
            except Exception:
                logger.exception(f"[{session_id}] Text forwarding stopped")
                break

    forwarders: list[asyncio.Task] = []
    try:
        try:
            await asyncio.wait_for(pipeline_ready.wait(), timeout=15)
        except asyncio.TimeoutError:
            logger.error(f"[{session_id}] Pipeline failed to connect")
            await websocket.send_json({"type": "error", "message": "Failed to connect to AI"})
            return  # the finally block below releases the pipeline

        forwarders = [
            asyncio.create_task(forward_audio()),
            asyncio.create_task(forward_text()),
        ]

        while True:
            raw = await websocket.receive_text()
            try:
                msg = json.loads(raw)
            except json.JSONDecodeError:
                await websocket.send_json({"type": "error", "message": "Invalid JSON"})
                continue
            if not isinstance(msg, dict):
                await websocket.send_json({"type": "error", "message": "Expected a JSON object"})
                continue
            msg_type = msg.get("type", "")

            # ── Identity ──
            if msg_type == "identify":
                user_id = msg.get("user_id", "").strip()
                user_name = msg.get("name", "").strip()
                if user_name and user_id:
                    kira_memory.set_user_preference(user_id, "name", user_name)
                prefs = kira_memory.get_user_preferences(user_id)
                identified = True
                if kira_memory.enabled:
                    kira_memory.ensure_peers(user_id)
                    kira_memory.ensure_session(session_id)
                    # Build memory context and update pipeline
                    try:
                        ctx = kira_memory.build_system_prompt_suffix()
                        if ctx:
                            pipeline._memory_suffix = ctx
                    except Exception:
                        # Memory context is optional; carry on without it.
                        logger.warning(
                            f"[{session_id}] Could not build memory context",
                            exc_info=True,
                        )
                await websocket.send_json({
                    "type": "identified",
                    "user_id": user_id,
                    "preferences": prefs,
                })
                continue

            # ── Preferences ──
            if msg_type == "set_preference":
                key = msg.get("key", "").strip()
                value = msg.get("value", "").strip()
                if key and user_id and user_id != "default-user":
                    kira_memory.set_user_preference(user_id, key, value)
                continue

            # ── Audio (PCM16) ──
            if msg_type == "audio":
                audio_b64 = msg.get("data", "")
                if audio_b64:
                    try:
                        pcm16 = base64.b64decode(audio_b64)
                    except (ValueError, TypeError):
                        # binascii.Error is a ValueError subclass; skip the
                        # bad frame rather than dropping the whole session.
                        logger.warning(f"[{session_id}] Ignoring undecodable audio frame")
                        continue
                    await audio_queue.put(pcm16)
                continue

            # ── Text input ──
            if msg_type == "conversation_text":
                text = msg.get("text", "").strip()
                if text:
                    await text_queue.put(text)
                continue

            if msg_type == "ping":
                await websocket.send_json({"type": "pong"})
    except WebSocketDisconnect:
        logger.info(f"[{session_id}] Disconnected")
    except Exception:
        # logger.exception keeps the traceback, which the f"{e}" form lost.
        logger.exception(f"[{session_id}] Error")
    finally:
        for task in forwarders:
            task.cancel()
        try:
            await pipeline.disconnect()
        except Exception:
            logger.exception(f"[{session_id}] Pipeline disconnect failed")
        finally:
            # Always cancel the connect task, even if disconnect() raised.
            pipeline_task.cancel()