fix: restore full WebSocket message loop in main.py (was truncated to 77 lines, missing the entire try/except message handler)

The REST STT revert commit (0e74a16) deleted lines 78-262 including the message loop, identify handler, audio handler, text handler, and disconnect cleanup. This caused the WS to accept then immediately close, triggering a reconnect loop.

Refactored for clarity: transcribe_audio(), get_kira_response(), stream_tts() as standalone async helpers. Full pipeline restored.
This commit is contained in:
2026-06-05 02:10:41 -04:00
parent 86b1e9aa04
commit 92250a668b
+157 -3
View File
@@ -1,13 +1,12 @@
"""Kira — AI body double backend """Kira — AI body double backend
Realtime WebSocket STT (gpt-realtime-whisper) → gpt-5.4-nano → streaming TTS REST STT (gpt-4o-transcribe) → gpt-5.4-nano + Honcho → streaming TTS (sage)
""" """
import json import json
import base64 import base64
import uuid import uuid
import logging import logging
import time
import asyncio import asyncio
from fastapi import FastAPI, WebSocket, WebSocketDisconnect from fastapi import FastAPI, WebSocket, WebSocketDisconnect
@@ -15,7 +14,6 @@ from fastapi.middleware.cors import CORSMiddleware
from config import settings from config import settings
from services.memory import kira_memory from services.memory import kira_memory
# from services.whisper_stream import WhisperStream # REST fallback active
logging.basicConfig(level=logging.INFO) logging.basicConfig(level=logging.INFO)
logger = logging.getLogger("kira") logger = logging.getLogger("kira")
@@ -63,6 +61,56 @@ async def health():
return {"status": "ok", "name": "kira", "memory": mem_status} return {"status": "ok", "name": "kira", "memory": mem_status}
async def transcribe_audio(client, audio_b64: str) -> str | None:
"""REST transcription via gpt-4o-transcribe (full utterance blob)."""
try:
audio_bytes = base64.b64decode(audio_b64)
import io
audio_file = io.BytesIO(audio_bytes)
audio_file.name = "audio.webm"
transcript = await client.audio.transcriptions.create(
model="gpt-4o-transcribe",
file=audio_file,
)
return transcript.text.strip() if transcript.text else None
except Exception as e:
logger.error(f"Transcription error: {e}")
return None
async def get_kira_response(client, user_text: str, memory_suffix: str) -> str:
"""Get Kira's response from gpt-5.4-nano."""
system_prompt = BASE_SYSTEM_PROMPT
if memory_suffix:
system_prompt += memory_suffix
resp = await client.chat.completions.create(
model="gpt-5.4-nano",
messages=[
{"role": "system", "content": system_prompt},
{"role": "user", "content": user_text},
],
max_completion_tokens=300,
temperature=0.7,
)
return resp.choices[0].message.content or "Mhm, I'm here! ✨"
async def stream_tts(client, text: str, websocket: WebSocket):
"""Stream TTS audio as Opus chunks over WebSocket."""
await websocket.send_json({"type": "speaking_start", "text": text})
async with client.audio.speech.with_streaming_response.create(
model="tts-1",
voice="sage",
input=text,
response_format="opus",
) as tts_resp:
async for chunk in tts_resp.iter_bytes():
if chunk:
b64 = base64.b64encode(chunk).decode("utf-8")
await websocket.send_json({"type": "audio", "data": b64})
await websocket.send_json({"type": "speaking_end"})
@app.websocket("/api/ws") @app.websocket("/api/ws")
async def conversation_ws(websocket: WebSocket): async def conversation_ws(websocket: WebSocket):
await websocket.accept() await websocket.accept()
@@ -74,4 +122,110 @@ async def conversation_ws(websocket: WebSocket):
conversation_history: list[dict] = [] conversation_history: list[dict] = []
try:
while True:
raw = await websocket.receive_text()
msg = json.loads(raw)
msg_type = msg.get("type", "")
# ── Identity & Preferences ──
if msg_type == "identify":
user_id = msg.get("user_id", "").strip()
user_name = msg.get("name", "").strip()
if user_name and user_id:
kira_memory.set_user_preference(user_id, "name", user_name)
prefs = kira_memory.get_user_preferences(user_id)
identified = True
if kira_memory.enabled:
kira_memory.ensure_peers(user_id)
kira_memory.ensure_session(session_id)
try:
ctx = kira_memory.build_system_prompt_suffix()
if ctx:
memory_suffix = ctx
except Exception:
pass
await websocket.send_json({
"type": "identified",
"user_id": user_id,
"preferences": prefs,
})
continue
if msg_type == "set_preference":
key = msg.get("key", "").strip()
value = msg.get("value", "").strip()
if key and user_id and user_id != "default-user":
kira_memory.set_user_preference(user_id, key, value)
await websocket.send_json({"type": "preference_saved", "key": key, "success": True})
continue
# ── Audio (full webm/opus blob from MediaRecorder) → REST STT → LLM → TTS ──
if msg_type == "audio":
audio_b64 = msg.get("data", "")
if not audio_b64:
continue
client = get_openai()
# STT (REST)
transcript = await transcribe_audio(client, audio_b64)
if not transcript:
await websocket.send_json({"type": "transcript", "role": "user", "text": "(could not transcribe)"})
await websocket.send_json({"type": "error", "message": "Could not transcribe audio"})
continue
logger.info(f"[{session_id}] User: {transcript}")
await websocket.send_json({"type": "transcript_delta", "text": transcript})
await websocket.send_json({"type": "transcript", "role": "user", "text": transcript})
conversation_history.append({"role": "user", "content": transcript})
# LLM
kira_text = await get_kira_response(client, transcript, memory_suffix)
conversation_history.append({"role": "assistant", "content": kira_text})
logger.info(f"[{session_id}] Kira: {kira_text}")
# Store in Honcho
if kira_memory.enabled and identified:
try:
kira_memory.store_messages(transcript, kira_text)
except Exception:
pass
# TTS (streaming)
await stream_tts(client, kira_text, websocket)
continue
# ── Text input → direct LLM + TTS ──
if msg_type == "conversation_text":
text = msg.get("text", "").strip()
if not text:
continue
logger.info(f"[{session_id}] User (text): {text}")
conversation_history.append({"role": "user", "content": text})
client = get_openai()
kira_text = await get_kira_response(client, text, memory_suffix)
conversation_history.append({"role": "assistant", "content": kira_text})
logger.info(f"[{session_id}] Kira: {kira_text}")
if kira_memory.enabled and identified:
try:
kira_memory.store_messages(text, kira_text)
except Exception:
pass
await stream_tts(client, kira_text, websocket)
continue
if msg_type == "ping":
await websocket.send_json({"type": "pong"})
except WebSocketDisconnect:
logger.info(f"[{session_id}] Disconnected")
except Exception as e:
logger.error(f"[{session_id}] Error: {e}")