Files
kira/backend/main.py
T
hobokenchicken c5cc4dd480 fix: replace PCM16 capture with MediaRecorder (Opus/webm)
PCM16 capture via AudioContext was streaming raw audio continuously,
causing massive accumulated buffers that took ~20s to transcribe.
Replaced with MediaRecorder which records compressed Opus/webm and
sends a single blob on release — much smaller, faster to transcribe.

Also removed all unused PCM16/WAV helper functions from both frontend
and backend.
2026-06-04 14:04:44 -04:00

287 lines
9.7 KiB
Python

"""Kira — AI body double backend
Cheapest pipeline: gpt-4o-mini-transcribe STT → gpt-5.4-nano LLM → OpenAI TTS
~$0.019/min total, simple 3-step chat completions.
"""
import json
import base64
import uuid
import logging
import asyncio
from fastapi import FastAPI, WebSocket, WebSocketDisconnect
from fastapi.middleware.cors import CORSMiddleware
from config import settings
from services.memory import kira_memory
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger("kira")
app = FastAPI(title="Kira Backend")
app.add_middleware(
CORSMiddleware,
allow_origins=["*"],
allow_credentials=True,
allow_methods=["*"],
allow_headers=["*"],
)
# System prompt
BASE_SYSTEM_PROMPT = (
"You are Kira, a warm, kind, and encouraging AI body double. "
"You speak in a friendly, girly-pop tone. You are helping someone with ADHD "
"stay focused and on task. Keep responses short, supportive, and uplifting. "
"Check in on them. Remind them to take breaks. Celebrate small wins. "
"Use occasional emoji but don't overdo it. Never be judgmental."
)
_openai = None
def get_openai():
global _openai
if _openai is None:
from openai import AsyncOpenAI
_openai = AsyncOpenAI(api_key=settings.openai_api_key)
return _openai
@app.on_event("startup")
async def startup():
if kira_memory.init():
logger.info("Honcho memory initialized")
else:
logger.info("Honcho memory not configured")
@app.get("/api/health")
async def health():
mem_status = "active" if kira_memory.enabled else "disabled"
return {"status": "ok", "name": "kira", "memory": mem_status}
def build_system_prompt(user_id: str) -> str:
prompt = BASE_SYSTEM_PROMPT
if kira_memory.enabled:
try:
kira_memory.ensure_peers(user_id)
suffix = kira_memory.build_system_prompt_suffix()
if suffix:
prompt += suffix
except Exception as e:
logger.warning(f"Memory context failed: {e}")
return prompt
async def run_conversation(text: str, user_id: str) -> str:
"""STT → LLM → TTS using the cheapest models."""
system_prompt = build_system_prompt(user_id)
client = get_openai()
# LLM
resp = await client.chat.completions.create(
model="gpt-5.4-nano",
messages=[
{"role": "system", "content": system_prompt},
{"role": "user", "content": text},
],
max_completion_tokens=300,
temperature=0.7,
)
kira_text = resp.choices[0].message.content or "Mhm, I'm here!"
return kira_text
async def transcribe_audio(audio_bytes: bytes) -> str | None:
"""Transcribe Opus/webm audio using cheapest STT model."""
client = get_openai()
try:
transcript = await client.audio.transcriptions.create(
model="gpt-4o-mini-transcribe",
file=("audio.webm", audio_bytes, "audio/webm"),
response_format="text",
)
return transcript.strip() if transcript and transcript.strip() else None
except Exception as e:
logger.warning(f"STT error: {e}")
return None
async def synthesize_speech(text: str) -> bytes:
"""Generate TTS audio from text."""
client = get_openai()
try:
resp = await client.audio.speech.create(
model="tts-1",
voice="nova",
input=text,
response_format="opus",
)
return resp.content
except Exception as e:
logger.warning(f"TTS error: {e}")
return b""
def _pcm16_to_wav(pcm_data: bytes) -> bytes:
"""Wrap raw PCM16 mono 24kHz data in a WAV container."""
import struct
num_channels = 1
sample_rate = 24000
bits_per_sample = 16
byte_rate = sample_rate * num_channels * (bits_per_sample // 8)
block_align = num_channels * (bits_per_sample // 8)
data_size = len(pcm_data)
header_size = 44
total_size = header_size + data_size
header = b"RIFF"
header += struct.pack("<I", total_size - 8)
header += b"WAVE"
header += b"fmt "
header += struct.pack("<I", 16) # subchunk size
header += struct.pack("<H", 1) # PCM format
header += struct.pack("<H", num_channels)
header += struct.pack("<I", sample_rate)
header += struct.pack("<I", byte_rate)
header += struct.pack("<H", block_align)
header += struct.pack("<H", bits_per_sample)
header += b"data"
header += struct.pack("<I", data_size)
return header + pcm_data
@app.websocket("/api/ws")
async def conversation_ws(websocket: WebSocket):
await websocket.accept()
session_id = str(uuid.uuid4())[:8]
user_id = "default-user"
identified = False
logger.info(f"[{session_id}] WebSocket connected")
audio_buffer = bytearray()
conversation_history: list[dict] = []
try:
while True:
raw = await websocket.receive_text()
msg = json.loads(raw)
msg_type = msg.get("type", "")
# ── Identity & Preferences ──
if msg_type == "identify":
user_id = msg.get("user_id", "").strip()
user_name = msg.get("name", "").strip()
if user_name and user_id:
kira_memory.set_user_preference(user_id, "name", user_name)
prefs = kira_memory.get_user_preferences(user_id)
identified = True
if kira_memory.enabled:
kira_memory.ensure_peers(user_id)
kira_memory.ensure_session(session_id)
await websocket.send_json({
"type": "identified",
"user_id": user_id,
"preferences": prefs,
})
continue
if msg_type == "set_preference":
key = msg.get("key", "").strip()
value = msg.get("value", "").strip()
if key and user_id and user_id != "default-user":
kira_memory.set_user_preference(user_id, key, value)
await websocket.send_json({
"type": "preference_saved",
"key": key,
"success": True,
})
continue
# ── Conversation ──
if msg_type == "audio_chunk":
# Single Opus/webm blob from MediaRecorder
chunk = base64.b64decode(msg["data"])
audio_buffer.extend(chunk)
elif msg_type == "transcribe":
if not audio_buffer:
await websocket.send_json({"type": "error", "message": "No audio data"})
continue
logger.info(f"[{session_id}] Transcribing {len(audio_buffer)} bytes...")
# 1. STT
transcript = await transcribe_audio(bytes(audio_buffer))
audio_buffer.clear()
if not transcript:
await websocket.send_json({"type": "error", "message": "Could not transcribe"})
continue
await websocket.send_json({"type": "transcript", "role": "user", "text": transcript})
conversation_history.append({"role": "user", "content": transcript})
# 2. LLM
logger.info(f"[{session_id}] User: {transcript}")
kira_text = await run_conversation(transcript, user_id)
conversation_history.append({"role": "assistant", "content": kira_text})
logger.info(f"[{session_id}] Kira: {kira_text}")
# Store in Honcho
if kira_memory.enabled and identified:
try:
kira_memory.store_messages(transcript, kira_text)
except Exception:
pass
# 3. TTS
await websocket.send_json({"type": "speaking_start", "text": kira_text})
audio_bytes = await synthesize_speech(kira_text)
audio_b64 = base64.b64encode(audio_bytes).decode("utf-8")
await websocket.send_json({"type": "audio", "data": audio_b64, "text": kira_text})
await websocket.send_json({"type": "speaking_end"})
elif msg_type == "conversation_text":
user_text = msg.get("text", "").strip()
if not user_text:
continue
conversation_history.append({"role": "user", "content": user_text})
logger.info(f"[{session_id}] User (text): {user_text}")
kira_text = await run_conversation(user_text, user_id)
conversation_history.append({"role": "assistant", "content": kira_text})
logger.info(f"[{session_id}] Kira: {kira_text}")
if kira_memory.enabled and identified:
try:
kira_memory.store_messages(user_text, kira_text)
except Exception:
pass
await websocket.send_json({"type": "speaking_start", "text": kira_text})
audio_bytes = await synthesize_speech(kira_text)
audio_b64 = base64.b64encode(audio_bytes).decode("utf-8")
await websocket.send_json({"type": "audio", "data": audio_b64, "text": kira_text})
await websocket.send_json({"type": "speaking_end"})
elif msg_type == "ping":
await websocket.send_json({"type": "pong"})
except WebSocketDisconnect:
logger.info(f"[{session_id}] Disconnected")
except Exception as e:
logger.error(f"[{session_id}] Error: {e}")
try:
await websocket.send_json({"type": "error", "message": str(e)})
except Exception:
pass