Files
kira/backend/main.py
T
hobokenchicken 7502f201c7 feat: Realtime WebSocket STT via gpt-realtime-whisper
Replaces REST-based transcription (gpt-4o-transcribe) with WebSocket
streaming via gpt-realtime-whisper. Frontend captures PCM16 audio and
streams it through the backend to a Realtime transcription session.

- Server-side VAD detects utterance boundaries automatically
- Word-level transcript deltas stream to the client in real-time
- On utterance end, gpt-5.4-nano generates a response
- TTS streams back via with_streaming_response
- Total pipeline: PCM16 → Realtime WS → LLM → streaming TTS
2026-06-04 14:26:19 -04:00

259 lines
8.8 KiB
Python

"""Kira — AI body double backend
Realtime WebSocket STT (gpt-realtime-whisper) → gpt-5.4-nano → streaming TTS
"""
import json
import base64
import uuid
import logging
import time
import asyncio
from fastapi import FastAPI, WebSocket, WebSocketDisconnect
from fastapi.middleware.cors import CORSMiddleware
from config import settings
from services.memory import kira_memory
from services.whisper_stream import WhisperStream
# Process-wide logging: INFO and above, through the shared "kira" logger.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger("kira")

app = FastAPI(title="Kira Backend")

# CORS: any origin may call the HTTP endpoints. Credentialed requests are
# disabled because a wildcard origin must not be combined with
# allow_credentials=True; nothing in this module relies on cookies or
# browser-managed credentials. To re-enable credentials, list the exact
# frontend origins in allow_origins instead of "*".
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_credentials=False,
    allow_methods=["*"],
    allow_headers=["*"],
)
# Persona and behavior instructions used as the system message for every chat
# completion request. conversation_ws appends the per-user memory suffix to
# this text when one has been built.
BASE_SYSTEM_PROMPT = (
"You are Kira, a warm, kind, and encouraging AI body double. "
"You speak in a friendly, girly-pop tone. You are helping someone with ADHD "
"stay focused and on task. Keep responses short, supportive, and uplifting. "
"Check in on them. Remind them to take breaks. Celebrate small wins. "
"Use occasional emoji but don't overdo it. Never be judgmental."
)
# Lazily created, process-wide AsyncOpenAI client (see get_openai).
_openai = None


def get_openai():
    """Return the shared AsyncOpenAI client, creating it on first use.

    The ``openai`` package is imported only when the client is first needed,
    and the instance is cached in the module-level ``_openai`` so later calls
    reuse it.
    """
    global _openai
    if _openai is not None:
        return _openai

    from openai import AsyncOpenAI

    _openai = AsyncOpenAI(api_key=settings.openai_api_key)
    return _openai
@app.on_event("startup")
async def startup():
    """Initialize the memory service at application start and log the outcome."""
    message = (
        "Honcho memory initialized"
        if kira_memory.init()
        else "Honcho memory not configured"
    )
    logger.info(message)
@app.get("/api/health")
async def health():
    """Report service liveness and whether the memory service is enabled."""
    if kira_memory.enabled:
        memory_state = "active"
    else:
        memory_state = "disabled"
    return {"status": "ok", "name": "kira", "memory": memory_state}
def _text_field(message: dict, field: str) -> str:
    """Return ``message[field]`` as a stripped string ("" if missing or null)."""
    value = message.get(field)
    return "" if value is None else str(value).strip()


@app.websocket("/api/ws")
async def conversation_ws(websocket: WebSocket):
    """Run one Kira conversation session over a WebSocket.

    Client -> server JSON messages (dispatched on ``type``):
      * ``identify``          -- ``user_id`` / ``name``; replies ``identified``
        with the user's stored preferences.
      * ``set_preference``    -- ``key`` / ``value``; replies ``preference_saved``
        whose ``success`` tells whether the value was actually stored.
      * ``audio``             -- base64 audio in ``data``, forwarded to the
        WhisperStream.
      * ``conversation_text`` -- typed ``text``; answered with LLM + TTS.
      * ``ping``              -- replies ``pong``.

    Server -> client messages: ``transcript_delta``, ``transcript``,
    ``speaking_start``, ``audio`` (base64 TTS chunks) and ``speaking_end``.

    Malformed frames are logged and skipped instead of ending the session.
    The session ends when the client disconnects or an unexpected error
    escapes the receive loop; the WhisperStream is always torn down.
    """
    await websocket.accept()
    session_id = str(uuid.uuid4())[:8]
    user_id = "default-user"
    identified = False
    memory_suffix = ""
    logger.info(f"[{session_id}] WebSocket connected")

    # Only the most recent turns are sent to the model, which bounds both the
    # request size and the memory held by long-running sessions.
    max_history_messages = 20
    conversation_history: list[dict] = []
    stream_ready = asyncio.Event()

    async def respond(user_text: str, fallback: str) -> None:
        """Generate Kira's reply to *user_text* and stream it back as speech.

        *fallback* is used when the model returns empty content. Exceptions
        from the LLM call or from a closed WebSocket propagate to the caller.
        """
        conversation_history.append({"role": "user", "content": user_text})
        del conversation_history[:-max_history_messages]

        client = get_openai()
        resp = await client.chat.completions.create(
            model="gpt-5.4-nano",
            messages=[
                {"role": "system", "content": BASE_SYSTEM_PROMPT + memory_suffix},
                # Recent turns, ending with the user message appended above.
                *conversation_history,
            ],
            max_completion_tokens=300,
            temperature=0.7,
        )
        kira_text = resp.choices[0].message.content or fallback
        conversation_history.append({"role": "assistant", "content": kira_text})
        logger.info(f"[{session_id}] Kira: {kira_text}")

        # Persisting to memory is best-effort: a failure must not block speech.
        if kira_memory.enabled and identified:
            try:
                kira_memory.store_messages(user_text, kira_text)
            except Exception:
                logger.exception(f"[{session_id}] Failed to store messages in memory")

        await websocket.send_json({"type": "speaking_start", "text": kira_text})
        try:
            async with client.audio.speech.with_streaming_response.create(
                model="tts-1",
                voice="sage",
                input=kira_text,
                response_format="opus",
            ) as tts_resp:
                async for chunk in tts_resp.iter_bytes():
                    if chunk:
                        b64 = base64.b64encode(chunk).decode("utf-8")
                        await websocket.send_json({"type": "audio", "data": b64})
        except Exception:
            # Fall through so the client still receives speaking_end and does
            # not stay stuck in the "speaking" state. If the socket itself is
            # gone, the send below raises and the caller handles it.
            logger.exception(f"[{session_id}] TTS streaming failed")
        await websocket.send_json({"type": "speaking_end"})

    # ── Whisper stream callbacks ──
    async def on_ready() -> None:
        logger.info(f"[{session_id}] Whisper stream ready")
        stream_ready.set()

    async def on_delta(delta: str) -> None:
        """Streaming partial transcript — forward to client (best-effort)."""
        try:
            await websocket.send_json({"type": "transcript_delta", "text": delta})
        except Exception:
            logger.debug(
                f"[{session_id}] Could not forward transcript delta", exc_info=True
            )

    async def on_done(full: str) -> None:
        """Full utterance from the stream. Kick off LLM + TTS."""
        logger.info(f"[{session_id}] Full transcript ({len(full)} chars): {full}")
        try:
            await websocket.send_json(
                {"type": "transcript", "role": "user", "text": full}
            )
            await respond(full, "Mhm, I'm here!")
        except Exception:
            # Contain failures here so they do not escape into the stream's
            # callback machinery.
            logger.exception(f"[{session_id}] Failed to respond to utterance")

    async def on_error(msg: str) -> None:
        logger.warning(f"[{session_id}] Whisper error: {msg}")

    # Start WhisperStream
    stream = WhisperStream(
        on_transcript_delta=on_delta,
        on_transcript_done=on_done,
        on_ready=on_ready,
        on_error=on_error,
    )
    stream_task = asyncio.create_task(stream.connect())

    try:
        # Wait for the stream to report ready, but never longer than 2s.
        # NOTE(review): assumes WhisperStream invokes on_ready once connected;
        # if it never does, this simply waits the full 2s before continuing.
        try:
            await asyncio.wait_for(stream_ready.wait(), timeout=2)
        except asyncio.TimeoutError:
            logger.warning(
                f"[{session_id}] Whisper stream not ready after 2s; continuing"
            )

        while True:
            raw = await websocket.receive_text()
            try:
                msg = json.loads(raw)
            except json.JSONDecodeError:
                logger.warning(f"[{session_id}] Ignoring malformed JSON message")
                continue
            if not isinstance(msg, dict):
                logger.warning(f"[{session_id}] Ignoring non-object JSON message")
                continue
            msg_type = msg.get("type", "")

            # ── Identity & Preferences ──
            if msg_type == "identify":
                # NOTE(review): an empty user_id is accepted and still marks
                # the session identified — confirm whether it should be
                # rejected instead.
                user_id = _text_field(msg, "user_id")
                user_name = _text_field(msg, "name")
                if user_name and user_id:
                    kira_memory.set_user_preference(user_id, "name", user_name)
                prefs = kira_memory.get_user_preferences(user_id)
                identified = True
                if kira_memory.enabled:
                    kira_memory.ensure_peers(user_id)
                    kira_memory.ensure_session(session_id)
                    try:
                        ctx = kira_memory.build_system_prompt_suffix()
                        if ctx:
                            memory_suffix = ctx
                    except Exception:
                        logger.exception(
                            f"[{session_id}] Failed to build memory prompt suffix"
                        )
                await websocket.send_json({
                    "type": "identified",
                    "user_id": user_id,
                    "preferences": prefs,
                })
                continue

            if msg_type == "set_preference":
                key = _text_field(msg, "key")
                value = _text_field(msg, "value")
                # Preferences are only stored for a real, identified user.
                saved = bool(key) and user_id not in ("", "default-user")
                if saved:
                    kira_memory.set_user_preference(user_id, key, value)
                await websocket.send_json(
                    {"type": "preference_saved", "key": key, "success": saved}
                )
                continue

            # ── PCM16 audio → WhisperStream ──
            if msg_type == "audio":
                try:
                    pcm16 = base64.b64decode(msg["data"])
                except (KeyError, TypeError, ValueError):
                    # binascii.Error (bad base64) is a ValueError subclass.
                    logger.warning(
                        f"[{session_id}] Ignoring audio message with missing or invalid data"
                    )
                    continue
                await stream.send_audio(pcm16)
                continue

            # ── Text input → direct LLM + TTS ──
            if msg_type == "conversation_text":
                text = _text_field(msg, "text")
                if not text:
                    continue
                logger.info(f"[{session_id}] User (text): {text}")
                try:
                    await respond(text, "Mhm!")
                except WebSocketDisconnect:
                    raise
                except Exception:
                    # Keep the session alive after a failed reply; if the
                    # socket is actually closed, the next receive ends the loop.
                    logger.exception(
                        f"[{session_id}] Failed to respond to text input"
                    )
                continue

            if msg_type == "ping":
                await websocket.send_json({"type": "pong"})
    except WebSocketDisconnect:
        logger.info(f"[{session_id}] Disconnected")
    except Exception as e:
        logger.exception(f"[{session_id}] Error: {e}")
    finally:
        # Always cancel the connect task, even if disconnect itself fails.
        try:
            await stream.disconnect()
        except Exception:
            logger.exception(f"[{session_id}] Error while closing Whisper stream")
        finally:
            stream_task.cancel()