feat: OpenAI Realtime API pipeline
Replaced the 3-step sequential pipeline (Whisper STT → DeepSeek LLM → OpenAI TTS) with a single OpenAI Realtime API WebSocket using gpt-4o-mini-realtime-preview. - ~300-800ms latency vs 1-3s - Server VAD for automatic turn detection - Streaming audio chunks during playback - Interruptions: user can speak over Kira mid-response - Honcho memory still injected into session instructions - Frontend captures PCM16 mono 24kHz via AudioContext - Backend relays client ↔ OpenAI Realtime API - Supports both voice (PCM16) and text input
This commit is contained in:
+183
-172
@@ -1,26 +1,21 @@
|
||||
"""Kira — AI body double backend
|
||||
|
||||
Real-time speech-to-speech pipeline:
|
||||
mic audio → Whisper API → text → DeepSeek LLM → response text → OpenAI TTS → audio
|
||||
|
||||
Honcho memory integration:
|
||||
Cross-session user context injected into LLM prompts,
|
||||
conversation exchanges stored for continuous learning.
|
||||
User preferences (name, scene, outfit, accessory) persisted in peer metadata.
|
||||
OpenAI Realtime API pipeline:
|
||||
mic audio → [built-in STT → GPT-4o-mini → built-in TTS] → speaker audio
|
||||
Single WebSocket, ~300-800ms latency
|
||||
"""
|
||||
|
||||
import json
|
||||
import base64
|
||||
import uuid
|
||||
import logging
|
||||
import asyncio
|
||||
|
||||
from fastapi import FastAPI, WebSocket, WebSocketDisconnect
|
||||
from fastapi.middleware.cors import CORSMiddleware
|
||||
|
||||
from config import settings
|
||||
from services.stt import transcribe_audio
|
||||
from services.llm import get_kira_response
|
||||
from services.tts import synthesize_speech
|
||||
from services.realtime import RealtimeRelay
|
||||
from services.memory import kira_memory
|
||||
|
||||
logging.basicConfig(level=logging.INFO)
|
||||
@@ -36,24 +31,13 @@ app.add_middleware(
|
||||
allow_headers=["*"],
|
||||
)
|
||||
|
||||
# ─── Base system prompt (static part) ───
|
||||
BASE_SYSTEM_PROMPT = (
|
||||
"You are Kira, a warm, kind, and encouraging AI body double. "
|
||||
"You speak in a friendly, girly-pop tone. You are helping someone with ADHD "
|
||||
"stay focused and on task. Keep responses short, supportive, and uplifting. "
|
||||
"Check in on them. Remind them to take breaks. Celebrate small wins. "
|
||||
"Use occasional emoji but don't overdo it. Never be judgmental. "
|
||||
"You remember things about them between conversations."
|
||||
)
|
||||
|
||||
|
||||
@app.on_event("startup")
|
||||
async def startup():
|
||||
"""Initialize Honcho memory on app startup."""
|
||||
if kira_memory.init():
|
||||
logger.info("Honcho memory initialized")
|
||||
else:
|
||||
logger.info("Honcho memory not configured — running without memory")
|
||||
logger.info("Honcho memory not configured")
|
||||
|
||||
|
||||
@app.get("/api/health")
|
||||
@@ -62,61 +46,6 @@ async def health():
|
||||
return {"status": "ok", "name": "kira", "memory": mem_status}
|
||||
|
||||
|
||||
def build_system_prompt(user_id: str) -> dict:
|
||||
"""Build system prompt with Honcho memory context injected."""
|
||||
base = BASE_SYSTEM_PROMPT
|
||||
|
||||
if kira_memory.enabled:
|
||||
try:
|
||||
kira_memory.ensure_peers(user_id)
|
||||
memory_suffix = kira_memory.build_system_prompt_suffix()
|
||||
if memory_suffix:
|
||||
base += memory_suffix
|
||||
except Exception as e:
|
||||
logger.warning(f"Failed to build memory context: {e}")
|
||||
|
||||
return {"role": "system", "content": base}
|
||||
|
||||
|
||||
def handle_identify(msg: dict, session_id: str) -> dict | None:
|
||||
"""Handle user identification. Returns user preferences or None."""
|
||||
user_id = msg.get("user_id", "").strip()
|
||||
if not user_id:
|
||||
return {"type": "error", "message": "user_id is required"}
|
||||
|
||||
user_name = msg.get("name", "").strip()
|
||||
if user_name:
|
||||
kira_memory.set_user_preference(user_id, "name", user_name)
|
||||
|
||||
prefs = kira_memory.get_user_preferences(user_id)
|
||||
logger.info(f"[{session_id}] Identified as {user_id} (name={user_name or prefs.get('name', '')})")
|
||||
|
||||
return {
|
||||
"type": "identified",
|
||||
"user_id": user_id,
|
||||
"preferences": prefs,
|
||||
}
|
||||
|
||||
|
||||
def handle_set_preference(msg: dict, session_id: str, user_id: str) -> dict | None:
|
||||
"""Handle preference update. Returns success status."""
|
||||
if not user_id or user_id == "default-user":
|
||||
return {"type": "error", "message": "Must identify first"}
|
||||
|
||||
key = msg.get("key", "").strip()
|
||||
value = msg.get("value", "").strip()
|
||||
|
||||
if not key:
|
||||
return {"type": "error", "message": "key is required"}
|
||||
|
||||
ok = kira_memory.set_user_preference(user_id, key, value)
|
||||
return {
|
||||
"type": "preference_saved",
|
||||
"key": key,
|
||||
"success": ok,
|
||||
}
|
||||
|
||||
|
||||
@app.websocket("/api/ws")
|
||||
async def conversation_ws(websocket: WebSocket):
|
||||
await websocket.accept()
|
||||
@@ -125,8 +54,102 @@ async def conversation_ws(websocket: WebSocket):
|
||||
identified = False
|
||||
logger.info(f"[{session_id}] WebSocket connected")
|
||||
|
||||
audio_buffer = bytearray()
|
||||
conversation_history: list[dict] = []
|
||||
# Track conversation for Honcho
|
||||
pending_transcripts: list[tuple[str, str]] = []
|
||||
|
||||
# Will be set when Realtime relay is ready
|
||||
relay_ready = asyncio.Event()
|
||||
relay: RealtimeRelay | None = None
|
||||
relay_task: asyncio.Task | None = None
|
||||
audio_queue: asyncio.Queue[bytes] = asyncio.Queue()
|
||||
text_queue: asyncio.Queue[str] = asyncio.Queue()
|
||||
|
||||
async def on_ready():
|
||||
relay_ready.set()
|
||||
logger.info(f"[{session_id}] Realtime relay ready")
|
||||
|
||||
async def on_audio_delta(audio_bytes: bytes):
|
||||
"""Forward audio chunks from OpenAI to the client."""
|
||||
try:
|
||||
audio_b64 = base64.b64encode(audio_bytes).decode("utf-8")
|
||||
await websocket.send_json({
|
||||
"type": "audio",
|
||||
"data": audio_b64,
|
||||
})
|
||||
except Exception:
|
||||
pass
|
||||
|
||||
async def on_transcript(text: str):
|
||||
"""Store transcripts for Honcho."""
|
||||
pending_transcripts.append(("transcript", text))
|
||||
role, content = text.split(": ", 1)
|
||||
logger.info(f"[{session_id}] {role}: {content}")
|
||||
await websocket.send_json({
|
||||
"type": "transcript",
|
||||
"role": role,
|
||||
"text": content,
|
||||
})
|
||||
|
||||
async def on_speech_started():
|
||||
"""Kira started speaking."""
|
||||
await websocket.send_json({"type": "speaking_start"})
|
||||
|
||||
async def on_speech_stopped():
|
||||
"""Kira finished speaking."""
|
||||
await websocket.send_json({"type": "speaking_end"})
|
||||
|
||||
async def on_interruption():
|
||||
"""User interrupted — Kira stops speaking."""
|
||||
await websocket.send_json({"type": "interruption"})
|
||||
|
||||
async def on_error(msg: str):
|
||||
await websocket.send_json({"type": "error", "message": msg})
|
||||
|
||||
# ── Create and start the Realtime relay ──
|
||||
relay = RealtimeRelay(
|
||||
on_audio_delta=on_audio_delta,
|
||||
on_transcript=on_transcript,
|
||||
on_speech_started=on_speech_started,
|
||||
on_speech_stopped=on_speech_stopped,
|
||||
on_interruption=on_interruption,
|
||||
on_error=on_error,
|
||||
on_ready=on_ready,
|
||||
)
|
||||
|
||||
relay_task = asyncio.create_task(relay.connect())
|
||||
|
||||
# Wait for relay to be ready
|
||||
try:
|
||||
await asyncio.wait_for(relay_ready.wait(), timeout=15)
|
||||
except asyncio.TimeoutError:
|
||||
logger.error(f"[{session_id}] Realtime relay failed to connect")
|
||||
await websocket.send_json({"type": "error", "message": "Failed to connect to AI"})
|
||||
relay_task.cancel()
|
||||
return
|
||||
|
||||
# ── Forward audio/text from client to relay ──
|
||||
async def forward_audio():
|
||||
while relay and relay._connected:
|
||||
try:
|
||||
pcm16 = await asyncio.wait_for(audio_queue.get(), timeout=1)
|
||||
await relay.send_audio(pcm16)
|
||||
except asyncio.TimeoutError:
|
||||
continue
|
||||
except Exception:
|
||||
break
|
||||
|
||||
async def forward_text():
|
||||
while relay and relay._connected:
|
||||
try:
|
||||
text = await asyncio.wait_for(text_queue.get(), timeout=1)
|
||||
await relay.send_text(text)
|
||||
except asyncio.TimeoutError:
|
||||
continue
|
||||
except Exception:
|
||||
break
|
||||
|
||||
fwd_audio_task = asyncio.create_task(forward_audio())
|
||||
fwd_text_task = asyncio.create_task(forward_text())
|
||||
|
||||
try:
|
||||
while True:
|
||||
@@ -134,110 +157,98 @@ async def conversation_ws(websocket: WebSocket):
|
||||
msg = json.loads(raw)
|
||||
msg_type = msg.get("type", "")
|
||||
|
||||
# ── Identity & Preferences ──
|
||||
|
||||
# ── Identity ──
|
||||
if msg_type == "identify":
|
||||
response = handle_identify(msg, session_id)
|
||||
if response:
|
||||
await websocket.send_json(response)
|
||||
if response["type"] == "identified":
|
||||
user_id = response["user_id"]
|
||||
identified = True
|
||||
# Set up Honcho for this user
|
||||
if kira_memory.enabled:
|
||||
try:
|
||||
kira_memory.ensure_peers(user_id)
|
||||
kira_memory.ensure_session(session_id)
|
||||
logger.info(f"[{session_id}] Honcho session ready for {user_id}")
|
||||
except Exception as e:
|
||||
logger.warning(f"[{session_id}] Honcho setup failed: {e}")
|
||||
user_id = msg.get("user_id", "").strip()
|
||||
user_name = msg.get("name", "").strip()
|
||||
if user_name and user_id:
|
||||
kira_memory.set_user_preference(user_id, "name", user_name)
|
||||
|
||||
prefs = kira_memory.get_user_preferences(user_id)
|
||||
identified = True
|
||||
|
||||
if kira_memory.enabled:
|
||||
kira_memory.ensure_peers(user_id)
|
||||
kira_memory.ensure_session(session_id)
|
||||
|
||||
# Inject Honcho context into the Realtime session instructions
|
||||
memory_suffix = ""
|
||||
if kira_memory.enabled:
|
||||
try:
|
||||
ctx = kira_memory.build_system_prompt_suffix()
|
||||
if ctx:
|
||||
memory_suffix = ctx
|
||||
except Exception:
|
||||
pass
|
||||
|
||||
if relay and relay._connected and memory_suffix:
|
||||
await relay._send({
|
||||
"type": "session.update",
|
||||
"session": {
|
||||
"instructions": (
|
||||
"You are Kira, a warm, kind, and encouraging AI body double. "
|
||||
"Speak in a friendly, girly-pop tone. Help someone with ADHD "
|
||||
"stay focused. Keep responses short and supportive. "
|
||||
"Check in, remind breaks, celebrate wins. Never judgmental."
|
||||
+ memory_suffix
|
||||
),
|
||||
},
|
||||
})
|
||||
|
||||
await websocket.send_json({
|
||||
"type": "identified",
|
||||
"user_id": user_id,
|
||||
"preferences": prefs,
|
||||
})
|
||||
continue
|
||||
|
||||
# ── Preferences ──
|
||||
if msg_type == "set_preference":
|
||||
response = handle_set_preference(msg, session_id, user_id)
|
||||
if response:
|
||||
await websocket.send_json(response)
|
||||
key = msg.get("key", "").strip()
|
||||
value = msg.get("value", "").strip()
|
||||
if key and user_id and user_id != "default-user":
|
||||
kira_memory.set_user_preference(user_id, key, value)
|
||||
continue
|
||||
|
||||
# ── Conversation ──
|
||||
# ── Audio from frontend (PCM16) ──
|
||||
if msg_type == "audio":
|
||||
audio_b64 = msg.get("data", "")
|
||||
if audio_b64:
|
||||
pcm16 = base64.b64decode(audio_b64)
|
||||
await audio_queue.put(pcm16)
|
||||
continue
|
||||
|
||||
system_prompt = build_system_prompt(user_id)
|
||||
# ── Text input ──
|
||||
if msg_type == "conversation_text":
|
||||
text = msg.get("text", "").strip()
|
||||
if text:
|
||||
await text_queue.put(text)
|
||||
# Also store in Honcho immediately
|
||||
if kira_memory.enabled and identified:
|
||||
kira_memory.store_user_message(text)
|
||||
continue
|
||||
|
||||
if msg_type == "audio_chunk":
|
||||
chunk = base64.b64decode(msg["data"])
|
||||
audio_buffer.extend(chunk)
|
||||
|
||||
elif msg_type == "transcribe":
|
||||
if not audio_buffer:
|
||||
await websocket.send_json({"type": "error", "message": "No audio data"})
|
||||
continue
|
||||
|
||||
logger.info(f"[{session_id}] Transcribing {len(audio_buffer)} bytes...")
|
||||
|
||||
transcript = await transcribe_audio(bytes(audio_buffer))
|
||||
audio_buffer.clear()
|
||||
|
||||
if not transcript:
|
||||
await websocket.send_json({"type": "error", "message": "Could not transcribe audio"})
|
||||
continue
|
||||
|
||||
await websocket.send_json({"type": "transcript", "text": transcript})
|
||||
|
||||
logger.info(f"[{session_id}] User: {transcript}")
|
||||
conversation_history.append({"role": "user", "content": transcript})
|
||||
|
||||
messages = [system_prompt] + conversation_history[-10:]
|
||||
kira_text = await get_kira_response(messages)
|
||||
|
||||
conversation_history.append({"role": "assistant", "content": kira_text})
|
||||
logger.info(f"[{session_id}] Kira: {kira_text}")
|
||||
|
||||
if kira_memory.enabled and identified:
|
||||
try:
|
||||
kira_memory.store_messages(transcript, kira_text)
|
||||
except Exception as e:
|
||||
logger.warning(f"[{session_id}] Failed to store messages: {e}")
|
||||
|
||||
await websocket.send_json({"type": "speaking_start", "text": kira_text})
|
||||
audio_bytes = await synthesize_speech(kira_text)
|
||||
audio_b64 = base64.b64encode(audio_bytes).decode("utf-8")
|
||||
await websocket.send_json({"type": "audio", "data": audio_b64, "text": kira_text})
|
||||
await websocket.send_json({"type": "speaking_end"})
|
||||
|
||||
elif msg_type == "ping":
|
||||
if msg_type == "ping":
|
||||
await websocket.send_json({"type": "pong"})
|
||||
|
||||
elif msg_type == "conversation_text":
|
||||
user_text = msg.get("text", "").strip()
|
||||
if not user_text:
|
||||
continue
|
||||
|
||||
logger.info(f"[{session_id}] User (text): {user_text}")
|
||||
conversation_history.append({"role": "user", "content": user_text})
|
||||
|
||||
messages = [system_prompt] + conversation_history[-10:]
|
||||
kira_text = await get_kira_response(messages)
|
||||
|
||||
conversation_history.append({"role": "assistant", "content": kira_text})
|
||||
logger.info(f"[{session_id}] Kira: {kira_text}")
|
||||
|
||||
if kira_memory.enabled and identified:
|
||||
try:
|
||||
kira_memory.store_messages(user_text, kira_text)
|
||||
except Exception as e:
|
||||
logger.warning(f"[{session_id}] Failed to store messages: {e}")
|
||||
|
||||
await websocket.send_json({"type": "speaking_start", "text": kira_text})
|
||||
audio_bytes = await synthesize_speech(kira_text)
|
||||
audio_b64 = base64.b64encode(audio_bytes).decode("utf-8")
|
||||
await websocket.send_json({"type": "audio", "data": audio_b64, "text": kira_text})
|
||||
await websocket.send_json({"type": "speaking_end"})
|
||||
|
||||
except WebSocketDisconnect:
|
||||
logger.info(f"[{session_id}] Disconnected")
|
||||
except Exception as e:
|
||||
logger.error(f"[{session_id}] Error: {e}")
|
||||
try:
|
||||
await websocket.send_json({"type": "error", "message": str(e)})
|
||||
except Exception:
|
||||
pass
|
||||
finally:
|
||||
# Store pending transcripts in Honcho
|
||||
if kira_memory.enabled and identified:
|
||||
for _, transcript_text in pending_transcripts:
|
||||
if transcript_text.startswith("user: "):
|
||||
content = transcript_text[6:]
|
||||
kira_memory.store_user_message(content)
|
||||
elif transcript_text.startswith("assistant: "):
|
||||
content = transcript_text[11:]
|
||||
kira_memory.store_kira_message(content)
|
||||
|
||||
fwd_audio_task.cancel()
|
||||
fwd_text_task.cancel()
|
||||
if relay:
|
||||
await relay.disconnect()
|
||||
if relay_task:
|
||||
relay_task.cancel()
|
||||
|
||||
@@ -7,3 +7,4 @@ pydantic>=2.10.0
|
||||
pydantic-settings>=2.7.0
|
||||
httpx>=0.28.0
|
||||
honcho-ai>=2.1.0
|
||||
openai[realtime]>=2.41.0
|
||||
|
||||
@@ -0,0 +1,191 @@
|
||||
"""OpenAI Realtime API relay service.
|
||||
|
||||
Manages a WebSocket connection to OpenAI's Realtime API and relays
|
||||
audio/text events between the client and OpenAI.
|
||||
"""
|
||||
|
||||
import json
|
||||
import logging
|
||||
from typing import Callable, Awaitable
|
||||
from config import settings
|
||||
|
||||
logger = logging.getLogger("kira.realtime")
|
||||
|
||||
# ─── System instructions for Kira's personality ───
|
||||
|
||||
KIRA_INSTRUCTIONS = (
|
||||
"You are Kira, a warm, kind, and encouraging AI body double. "
|
||||
"You speak in a friendly, girly-pop tone. You are helping someone with ADHD "
|
||||
"stay focused and on task. Keep responses short, supportive, and uplifting. "
|
||||
"Check in on them. Remind them to take breaks. Celebrate small wins. "
|
||||
"Use occasional emoji but don't overdo it. Never be judgmental. "
|
||||
"You remember things about them between conversations."
|
||||
)
|
||||
|
||||
|
||||
class RealtimeRelay:
|
||||
"""Relays audio/text between a client WS and OpenAI Realtime API."""
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
on_audio_delta: Callable[[bytes], Awaitable[None]],
|
||||
on_transcript: Callable[[str], Awaitable[None]],
|
||||
on_speech_started: Callable[[], Awaitable[None]],
|
||||
on_speech_stopped: Callable[[], Awaitable[None]],
|
||||
on_interruption: Callable[[], Awaitable[None]],
|
||||
on_error: Callable[[str], Awaitable[None]],
|
||||
on_ready: Callable[[], Awaitable[None]],
|
||||
):
|
||||
self._on_audio_delta = on_audio_delta
|
||||
self._on_transcript = on_transcript
|
||||
self._on_speech_started = on_speech_started
|
||||
self._on_speech_stopped = on_speech_stopped
|
||||
self._on_interruption = on_interruption
|
||||
self._on_error = on_error
|
||||
self._on_ready = on_ready
|
||||
self._conn = None
|
||||
self._connected = False
|
||||
|
||||
async def connect(self):
|
||||
"""Open a WebSocket to OpenAI Realtime API."""
|
||||
if self._connected:
|
||||
return
|
||||
|
||||
try:
|
||||
from openai import AsyncOpenAI
|
||||
|
||||
client = AsyncOpenAI(api_key=settings.openai_api_key)
|
||||
|
||||
logger.info("Connecting to OpenAI Realtime API...")
|
||||
async with client.beta.realtime.connect(
|
||||
model="gpt-4o-mini-realtime-preview-2025-07-18",
|
||||
) as conn:
|
||||
self._conn = conn
|
||||
self._connected = True
|
||||
logger.info("Connected to OpenAI Realtime API")
|
||||
|
||||
# Configure session
|
||||
await self._send({
|
||||
"type": "session.update",
|
||||
"session": {
|
||||
"instructions": KIRA_INSTRUCTIONS,
|
||||
"voice": "alloy",
|
||||
"input_audio_transcription": {"enabled": True},
|
||||
"turn_detection": {
|
||||
"type": "server_vad",
|
||||
"threshold": 0.5,
|
||||
"prefix_padding_ms": 300,
|
||||
"silence_duration_ms": 600,
|
||||
},
|
||||
},
|
||||
})
|
||||
|
||||
await self._on_ready()
|
||||
|
||||
# Listen for events
|
||||
while self._connected:
|
||||
try:
|
||||
event = await conn.recv()
|
||||
await self._handle_event(event)
|
||||
except Exception as e:
|
||||
if self._connected:
|
||||
logger.warning(f"Realtime recv error: {e}")
|
||||
break
|
||||
|
||||
except ImportError:
|
||||
logger.error("openai[realtime] not installed — run: pip install 'openai[realtime]'")
|
||||
await self._on_error("Missing openai[realtime] dependency")
|
||||
except Exception as e:
|
||||
logger.error(f"Realtime connection error: {e}")
|
||||
await self._on_error(str(e))
|
||||
finally:
|
||||
self._connected = False
|
||||
self._conn = None
|
||||
|
||||
async def _handle_event(self, event):
|
||||
"""Process an event from the OpenAI Realtime API."""
|
||||
event_type = getattr(event, "type", None) or event.get("type", "")
|
||||
|
||||
if event_type == "response.audio.delta":
|
||||
audio_b64 = getattr(event, "delta", None) or event.get("delta", "")
|
||||
if audio_b64:
|
||||
import base64
|
||||
audio_bytes = base64.b64decode(audio_b64)
|
||||
await self._on_audio_delta(audio_bytes)
|
||||
|
||||
elif event_type == "response.audio_buffer.speech_started":
|
||||
await self._on_speech_started()
|
||||
|
||||
elif event_type == "response.audio_buffer.speech_stopped":
|
||||
await self._on_speech_stopped()
|
||||
|
||||
elif event_type == "input_audio_buffer.speech_started":
|
||||
# User started speaking — interrupt Kira
|
||||
await self._on_interruption()
|
||||
|
||||
elif event_type == "conversation.item.created":
|
||||
item = getattr(event, "item", None) or event.get("item", {})
|
||||
role = getattr(item, "role", None) or item.get("role", "")
|
||||
content = getattr(item, "content", None) or item.get("content", [])
|
||||
for part in (content or []):
|
||||
part_type = getattr(part, "type", None) or part.get("type", "")
|
||||
part_text = getattr(part, "text", None) or part.get("text", "")
|
||||
if part_type == "text" and part_text and role == "assistant":
|
||||
await self._on_transcript(f"assistant: {part_text}")
|
||||
part_transcript = getattr(part, "transcript", None) or part.get("transcript", "")
|
||||
if part_type == "transcript" and part_transcript and role == "user":
|
||||
await self._on_transcript(f"user: {part_transcript}")
|
||||
|
||||
elif event_type == "error":
|
||||
err = getattr(event, "error", None) or event.get("error", {})
|
||||
msg = getattr(err, "message", None) or err.get("message", str(event))
|
||||
logger.warning(f"Realtime API error: {msg}")
|
||||
await self._on_error(msg)
|
||||
|
||||
async def send_audio(self, pcm16_bytes: bytes):
|
||||
"""Send PCM16 audio chunk to OpenAI."""
|
||||
if not self._connected or not self._conn:
|
||||
return
|
||||
try:
|
||||
import base64
|
||||
audio_b64 = base64.b64encode(pcm16_bytes).decode("utf-8")
|
||||
await self._send({
|
||||
"type": "input_audio_buffer.append",
|
||||
"audio": audio_b64,
|
||||
})
|
||||
except Exception as e:
|
||||
logger.warning(f"Failed to send audio: {e}")
|
||||
|
||||
async def send_text(self, text: str):
|
||||
"""Send a text message to OpenAI and trigger a response."""
|
||||
if not self._connected or not self._conn:
|
||||
return
|
||||
try:
|
||||
await self._send({
|
||||
"type": "conversation.item.create",
|
||||
"item": {
|
||||
"type": "message",
|
||||
"role": "user",
|
||||
"content": [{"type": "input_text", "text": text}],
|
||||
},
|
||||
})
|
||||
await self._send({"type": "response.create"})
|
||||
except Exception as e:
|
||||
logger.warning(f"Failed to send text: {e}")
|
||||
|
||||
async def _send(self, data: dict):
|
||||
"""Send a JSON event to the Realtime API."""
|
||||
try:
|
||||
await self._conn.send(data)
|
||||
except Exception as e:
|
||||
logger.warning(f"Realtime send error: {e}")
|
||||
|
||||
async def disconnect(self):
|
||||
"""Close the Realtime connection."""
|
||||
self._connected = False
|
||||
if self._conn:
|
||||
try:
|
||||
await self._conn.close()
|
||||
except Exception:
|
||||
pass
|
||||
self._conn = None
|
||||
@@ -25,6 +25,41 @@ function saveUserId(id: string) {
|
||||
localStorage.setItem(USER_ID_KEY, id);
|
||||
}
|
||||
|
||||
/** Capture PCM16 mono 24kHz audio from mic and send via callback. */
|
||||
function startPCMCapture(
|
||||
stream: MediaStream,
|
||||
onChunk: (pcm16: Uint8Array) => void,
|
||||
): { stop: () => void } {
|
||||
const ctx = new AudioContext({ sampleRate: 24000 });
|
||||
const source = ctx.createMediaStreamSource(stream);
|
||||
const processor = ctx.createScriptProcessor(4096, 1, 1);
|
||||
let running = true;
|
||||
|
||||
processor.onaudioprocess = (e) => {
|
||||
if (!running) return;
|
||||
const input = e.inputBuffer.getChannelData(0); // Float32Array [-1, 1]
|
||||
// Convert float32 → PCM16 int16
|
||||
const pcm16 = new Int16Array(input.length);
|
||||
for (let i = 0; i < input.length; i++) {
|
||||
const s = Math.max(-1, Math.min(1, input[i]));
|
||||
pcm16[i] = s < 0 ? s * 0x8000 : s * 0x7fff;
|
||||
}
|
||||
onChunk(new Uint8Array(pcm16.buffer));
|
||||
};
|
||||
|
||||
source.connect(processor);
|
||||
processor.connect(ctx.destination);
|
||||
|
||||
return {
|
||||
stop: () => {
|
||||
running = false;
|
||||
source.disconnect();
|
||||
processor.disconnect();
|
||||
ctx.close();
|
||||
},
|
||||
};
|
||||
}
|
||||
|
||||
export function useConversation() {
|
||||
const [messages, setMessages] = useState<Message[]>([]);
|
||||
const [isConnected, setIsConnected] = useState(false);
|
||||
@@ -38,11 +73,13 @@ export function useConversation() {
|
||||
accessory: '',
|
||||
});
|
||||
const [loadingPrefs, setLoadingPrefs] = useState(true);
|
||||
const [micError, setMicError] = useState<string | null>(null);
|
||||
|
||||
const wsRef = useRef<WebSocket | null>(null);
|
||||
const audioRef = useRef<HTMLAudioElement | null>(null);
|
||||
const recorderRef = useRef<MediaRecorder | null>(null);
|
||||
const captureRef = useRef<{ stop: () => void } | null>(null);
|
||||
const streamRef = useRef<MediaStream | null>(null);
|
||||
const audioBufferRef = useRef<Uint8Array[]>([]);
|
||||
|
||||
// Connect WebSocket
|
||||
const connect = useCallback(() => {
|
||||
@@ -54,7 +91,6 @@ export function useConversation() {
|
||||
|
||||
ws.onopen = () => {
|
||||
setIsConnected(true);
|
||||
// Auto-identify if returning user
|
||||
const savedId = loadUserId();
|
||||
if (savedId) {
|
||||
ws.send(JSON.stringify({ type: 'identify', user_id: savedId }));
|
||||
@@ -102,35 +138,56 @@ export function useConversation() {
|
||||
break;
|
||||
}
|
||||
|
||||
case 'preference_saved':
|
||||
// Already optimistically updated locally
|
||||
break;
|
||||
|
||||
case 'transcript':
|
||||
addMessage('user', msg.text);
|
||||
addMessage(msg.role === 'user' ? 'user' : 'kira', msg.text);
|
||||
break;
|
||||
|
||||
case 'speaking_start':
|
||||
setIsKiraSpeaking(true);
|
||||
addMessage('kira', msg.text || '...');
|
||||
break;
|
||||
|
||||
case 'audio':
|
||||
case 'audio': {
|
||||
// Incoming PCM16 audio from Kira
|
||||
if (msg.data && audioRef.current) {
|
||||
// Accumulate audio chunks and create a blob
|
||||
const binary = atob(msg.data);
|
||||
const bytes = new Uint8Array(binary.length);
|
||||
for (let i = 0; i < binary.length; i++) {
|
||||
bytes[i] = binary.charCodeAt(i);
|
||||
}
|
||||
const blob = new Blob([bytes], { type: 'audio/ogg' });
|
||||
audioBufferRef.current.push(bytes);
|
||||
|
||||
// Convert accumulated PCM16 to WAV blob for playback
|
||||
const allChunks = audioBufferRef.current;
|
||||
const totalLen = allChunks.reduce((s, c) => s + c.length, 0);
|
||||
const combined = new Uint8Array(totalLen);
|
||||
let offset = 0;
|
||||
for (const chunk of allChunks) {
|
||||
combined.set(chunk, offset);
|
||||
offset += chunk.length;
|
||||
}
|
||||
|
||||
const wav = pcm16ToWav(combined);
|
||||
const blob = new Blob([wav], { type: 'audio/wav' });
|
||||
const url = URL.createObjectURL(blob);
|
||||
audioRef.current.src = url;
|
||||
audioRef.current.play().catch(() => {});
|
||||
}
|
||||
break;
|
||||
}
|
||||
|
||||
case 'speaking_end':
|
||||
setIsKiraSpeaking(false);
|
||||
audioBufferRef.current = [];
|
||||
break;
|
||||
|
||||
case 'interruption':
|
||||
setIsKiraSpeaking(false);
|
||||
audioBufferRef.current = [];
|
||||
if (audioRef.current) {
|
||||
audioRef.current.pause();
|
||||
audioRef.current.currentTime = 0;
|
||||
}
|
||||
break;
|
||||
|
||||
case 'error':
|
||||
@@ -154,99 +211,80 @@ export function useConversation() {
|
||||
setPreferences((p) => ({ ...p, name }));
|
||||
|
||||
if (wsRef.current?.readyState === WebSocket.OPEN) {
|
||||
wsRef.current.send(JSON.stringify({
|
||||
type: 'identify',
|
||||
user_id: userId,
|
||||
name,
|
||||
}));
|
||||
wsRef.current.send(JSON.stringify({ type: 'identify', user_id: userId, name }));
|
||||
}
|
||||
}, []);
|
||||
|
||||
// ── Preferences ──
|
||||
|
||||
const setPreference = useCallback((key: string, value: string) => {
|
||||
// Optimistic update
|
||||
setPreferences((p) => ({ ...p, [key]: value }));
|
||||
|
||||
// Sync to backend
|
||||
if (wsRef.current?.readyState === WebSocket.OPEN && identified) {
|
||||
wsRef.current.send(JSON.stringify({
|
||||
type: 'set_preference',
|
||||
key,
|
||||
value,
|
||||
}));
|
||||
wsRef.current.send(JSON.stringify({ type: 'set_preference', key, value }));
|
||||
}
|
||||
}, [identified]);
|
||||
|
||||
// ── Audio (Realtime PCM16) ──
|
||||
|
||||
const startRecording = useCallback(async () => {
|
||||
// Check HTTPS
|
||||
if (!navigator.mediaDevices || !navigator.mediaDevices.getUserMedia) {
|
||||
addMessage('kira', 'Mic requires HTTPS. Try accessing via HTTPS!');
|
||||
return;
|
||||
}
|
||||
|
||||
try {
|
||||
setMicError(null);
|
||||
const stream = await navigator.mediaDevices.getUserMedia({ audio: true });
|
||||
streamRef.current = stream;
|
||||
|
||||
const ws = wsRef.current;
|
||||
if (!ws || ws.readyState !== WebSocket.OPEN) {
|
||||
addMessage('kira', 'Not connected to server yet...');
|
||||
stream.getTracks().forEach((t) => t.stop());
|
||||
return;
|
||||
}
|
||||
|
||||
// Start PCM16 capture — each chunk sent as WS message
|
||||
const capture = startPCMCapture(stream, (pcm16) => {
|
||||
if (ws.readyState === WebSocket.OPEN) {
|
||||
const base64 = arrayBufferToBase64(pcm16.buffer);
|
||||
ws.send(JSON.stringify({ type: 'audio', data: base64 }));
|
||||
}
|
||||
});
|
||||
|
||||
captureRef.current = capture;
|
||||
setIsRecording(true);
|
||||
} catch (err) {
|
||||
const msg = err instanceof Error ? err.message : String(err);
|
||||
setMicError(msg);
|
||||
console.error('[Kira Mic]', msg);
|
||||
}
|
||||
}, [addMessage]);
|
||||
|
||||
const stopRecording = useCallback(() => {
|
||||
captureRef.current?.stop();
|
||||
captureRef.current = null;
|
||||
streamRef.current?.getTracks().forEach((t) => t.stop());
|
||||
streamRef.current = null;
|
||||
setIsRecording(false);
|
||||
}, []);
|
||||
|
||||
// ── Text ──
|
||||
|
||||
const sendText = useCallback((text: string) => {
|
||||
if (!text.trim()) return;
|
||||
if (wsRef.current?.readyState === WebSocket.OPEN) {
|
||||
wsRef.current.send(JSON.stringify({
|
||||
type: 'conversation_text',
|
||||
text: text.trim(),
|
||||
}));
|
||||
wsRef.current.send(JSON.stringify({ type: 'conversation_text', text: text.trim() }));
|
||||
}
|
||||
}, []);
|
||||
|
||||
// ── Audio ──
|
||||
|
||||
const startRecording = useCallback(async () => {
|
||||
// Check if mediaDevices is available (requires HTTPS/localhost)
|
||||
if (!navigator.mediaDevices || !navigator.mediaDevices.getUserMedia) {
|
||||
addMessage('kira', 'Your browser needs HTTPS to use the microphone. Try accessing Kira through the HTTPS address instead!');
|
||||
return;
|
||||
}
|
||||
|
||||
try {
|
||||
const stream = await navigator.mediaDevices.getUserMedia({ audio: true });
|
||||
streamRef.current = stream;
|
||||
|
||||
const recorder = new MediaRecorder(stream, {
|
||||
mimeType: MediaRecorder.isTypeSupported('audio/webm;codecs=opus')
|
||||
? 'audio/webm;codecs=opus'
|
||||
: 'audio/webm',
|
||||
});
|
||||
|
||||
const chunks: BlobPart[] = [];
|
||||
recorder.ondataavailable = (e) => {
|
||||
if (e.data.size > 0) chunks.push(e.data);
|
||||
};
|
||||
|
||||
recorder.onstop = () => {
|
||||
const blob = new Blob(chunks, { type: 'audio/webm' });
|
||||
const reader = new FileReader();
|
||||
reader.onload = () => {
|
||||
const base64 = (reader.result as string).split(',')[1];
|
||||
if (wsRef.current?.readyState === WebSocket.OPEN) {
|
||||
wsRef.current.send(JSON.stringify({ type: 'audio_chunk', data: base64 }));
|
||||
wsRef.current.send(JSON.stringify({ type: 'transcribe' }));
|
||||
}
|
||||
};
|
||||
reader.readAsDataURL(blob);
|
||||
|
||||
stream.getTracks().forEach((t) => t.stop());
|
||||
setIsRecording(false);
|
||||
};
|
||||
|
||||
recorder.start();
|
||||
recorderRef.current = recorder;
|
||||
setIsRecording(true);
|
||||
} catch (err) {
|
||||
console.error('[Kira Mic] failed:', err);
|
||||
}
|
||||
}, []);
|
||||
|
||||
const stopRecording = useCallback(() => {
|
||||
recorderRef.current?.stop();
|
||||
}, []);
|
||||
|
||||
// Connect on mount
|
||||
useEffect(() => {
|
||||
connect();
|
||||
return () => {
|
||||
wsRef.current?.close();
|
||||
captureRef.current?.stop();
|
||||
streamRef.current?.getTracks().forEach((t) => t.stop());
|
||||
};
|
||||
}, [connect]);
|
||||
@@ -259,6 +297,7 @@ export function useConversation() {
|
||||
identified,
|
||||
preferences,
|
||||
loadingPrefs,
|
||||
micError,
|
||||
identify,
|
||||
setPreference,
|
||||
sendText,
|
||||
@@ -266,3 +305,61 @@ export function useConversation() {
|
||||
stopRecording,
|
||||
};
|
||||
}
|
||||
|
||||
// ── Helpers ──
|
||||
|
||||
function arrayBufferToBase64(buffer: ArrayBufferLike): string {
|
||||
const bytes = new Uint8Array(buffer);
|
||||
let binary = '';
|
||||
for (let i = 0; i < bytes.length; i++) {
|
||||
binary += String.fromCharCode(bytes[i]);
|
||||
}
|
||||
return btoa(binary);
|
||||
}
|
||||
|
||||
/** Convert raw PCM16 mono 24kHz to a playable WAV blob. */
|
||||
function pcm16ToWav(pcm16: Uint8Array): ArrayBuffer {
|
||||
const numChannels = 1;
|
||||
const sampleRate = 24000;
|
||||
const bitsPerSample = 16;
|
||||
const byteRate = sampleRate * numChannels * (bitsPerSample / 8);
|
||||
const blockAlign = numChannels * (bitsPerSample / 8);
|
||||
const dataSize = pcm16.length;
|
||||
const headerSize = 44;
|
||||
const totalSize = headerSize + dataSize;
|
||||
|
||||
const buf = new ArrayBuffer(totalSize);
|
||||
const view = new DataView(buf);
|
||||
|
||||
// RIFF header
|
||||
writeString(view, 0, 'RIFF');
|
||||
view.setUint32(4, totalSize - 8, true);
|
||||
writeString(view, 8, 'WAVE');
|
||||
|
||||
// fmt subchunk
|
||||
writeString(view, 12, 'fmt ');
|
||||
view.setUint32(16, 16, true); // subchunk size
|
||||
view.setUint16(20, 1, true); // PCM
|
||||
view.setUint16(22, numChannels, true);
|
||||
view.setUint32(24, sampleRate, true);
|
||||
view.setUint32(28, byteRate, true);
|
||||
view.setUint16(32, blockAlign, true);
|
||||
view.setUint16(34, bitsPerSample, true);
|
||||
|
||||
// data subchunk
|
||||
writeString(view, 36, 'data');
|
||||
view.setUint32(40, dataSize, true);
|
||||
|
||||
// PCM data
|
||||
for (let i = 0; i < pcm16.length; i++) {
|
||||
view.setUint8(44 + i, pcm16[i]);
|
||||
}
|
||||
|
||||
return buf;
|
||||
}
|
||||
|
||||
function writeString(view: DataView, offset: number, str: string) {
|
||||
for (let i = 0; i < str.length; i++) {
|
||||
view.setUint8(offset + i, str.charCodeAt(i));
|
||||
}
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user