Files
kira/backend/services/realtime.py
T
hobokenchicken 1c15d42e06 fix: remove OpenAI-Beta header, use gpt-realtime-2 GA model
The old OpenAI-Beta: realtime=v1 header is rejected by the GA API.
Removing it via extra_headers override. Using gpt-realtime-2 which
is the current production Realtime model.
2026-06-04 13:42:06 -04:00

193 lines
7.3 KiB
Python

"""OpenAI Realtime API relay service.
Manages a WebSocket connection to OpenAI's Realtime API and relays
audio/text events between the client and OpenAI.
"""
import json
import logging
from typing import Callable, Awaitable
from config import settings
logger = logging.getLogger("kira.realtime")
# ─── System instructions for Kira's personality ───
KIRA_INSTRUCTIONS = (
"You are Kira, a warm, kind, and encouraging AI body double. "
"You speak in a friendly, girly-pop tone. You are helping someone with ADHD "
"stay focused and on task. Keep responses short, supportive, and uplifting. "
"Check in on them. Remind them to take breaks. Celebrate small wins. "
"Use occasional emoji but don't overdo it. Never be judgmental. "
"You remember things about them between conversations."
)
class RealtimeRelay:
"""Relays audio/text between a client WS and OpenAI Realtime API."""
def __init__(
self,
on_audio_delta: Callable[[bytes], Awaitable[None]],
on_transcript: Callable[[str], Awaitable[None]],
on_speech_started: Callable[[], Awaitable[None]],
on_speech_stopped: Callable[[], Awaitable[None]],
on_interruption: Callable[[], Awaitable[None]],
on_error: Callable[[str], Awaitable[None]],
on_ready: Callable[[], Awaitable[None]],
):
self._on_audio_delta = on_audio_delta
self._on_transcript = on_transcript
self._on_speech_started = on_speech_started
self._on_speech_stopped = on_speech_stopped
self._on_interruption = on_interruption
self._on_error = on_error
self._on_ready = on_ready
self._conn = None
self._connected = False
async def connect(self):
"""Open a WebSocket to OpenAI Realtime API."""
if self._connected:
return
try:
from openai import AsyncOpenAI
client = AsyncOpenAI(api_key=settings.openai_api_key)
logger.info("Connecting to OpenAI Realtime API...")
async with client.beta.realtime.connect(
model="gpt-realtime-2",
extra_headers={"OpenAI-Beta": ""},
) as conn:
self._conn = conn
self._connected = True
logger.info("Connected to OpenAI Realtime API")
# Configure session
await self._send({
"type": "session.update",
"session": {
"instructions": KIRA_INSTRUCTIONS,
"voice": "alloy",
"input_audio_transcription": {"enabled": True},
"turn_detection": {
"type": "server_vad",
"threshold": 0.5,
"prefix_padding_ms": 300,
"silence_duration_ms": 600,
},
},
})
await self._on_ready()
# Listen for events
while self._connected:
try:
event = await conn.recv()
await self._handle_event(event)
except Exception as e:
if self._connected:
logger.warning(f"Realtime recv error: {e}")
break
except ImportError:
logger.error("openai[realtime] not installed — run: pip install 'openai[realtime]'")
await self._on_error("Missing openai[realtime] dependency")
except Exception as e:
logger.error(f"Realtime connection error: {e}")
await self._on_error(str(e))
finally:
self._connected = False
self._conn = None
async def _handle_event(self, event):
"""Process an event from the OpenAI Realtime API."""
event_type = getattr(event, "type", None) or event.get("type", "")
if event_type == "response.audio.delta":
audio_b64 = getattr(event, "delta", None) or event.get("delta", "")
if audio_b64:
import base64
audio_bytes = base64.b64decode(audio_b64)
await self._on_audio_delta(audio_bytes)
elif event_type == "response.audio_buffer.speech_started":
await self._on_speech_started()
elif event_type == "response.audio_buffer.speech_stopped":
await self._on_speech_stopped()
elif event_type == "input_audio_buffer.speech_started":
# User started speaking — interrupt Kira
await self._on_interruption()
elif event_type == "conversation.item.created":
item = getattr(event, "item", None) or event.get("item", {})
role = getattr(item, "role", None) or item.get("role", "")
content = getattr(item, "content", None) or item.get("content", [])
for part in (content or []):
part_type = getattr(part, "type", None) or part.get("type", "")
part_text = getattr(part, "text", None) or part.get("text", "")
if part_type == "text" and part_text and role == "assistant":
await self._on_transcript(f"assistant: {part_text}")
part_transcript = getattr(part, "transcript", None) or part.get("transcript", "")
if part_type == "transcript" and part_transcript and role == "user":
await self._on_transcript(f"user: {part_transcript}")
elif event_type == "error":
err = getattr(event, "error", None) or event.get("error", {})
msg = getattr(err, "message", None) or err.get("message", str(event))
logger.warning(f"Realtime API error: {msg}")
await self._on_error(msg)
async def send_audio(self, pcm16_bytes: bytes):
"""Send PCM16 audio chunk to OpenAI."""
if not self._connected or not self._conn:
return
try:
import base64
audio_b64 = base64.b64encode(pcm16_bytes).decode("utf-8")
await self._send({
"type": "input_audio_buffer.append",
"audio": audio_b64,
})
except Exception as e:
logger.warning(f"Failed to send audio: {e}")
async def send_text(self, text: str):
"""Send a text message to OpenAI and trigger a response."""
if not self._connected or not self._conn:
return
try:
await self._send({
"type": "conversation.item.create",
"item": {
"type": "message",
"role": "user",
"content": [{"type": "input_text", "text": text}],
},
})
await self._send({"type": "response.create"})
except Exception as e:
logger.warning(f"Failed to send text: {e}")
async def _send(self, data: dict):
"""Send a JSON event to the Realtime API."""
try:
await self._conn.send(data)
except Exception as e:
logger.warning(f"Realtime send error: {e}")
async def disconnect(self):
"""Close the Realtime connection."""
self._connected = False
if self._conn:
try:
await self._conn.close()
except Exception:
pass
self._conn = None