feat: OpenAI Realtime API pipeline

Replaced the 3-step sequential pipeline (Whisper STT → DeepSeek LLM
→ OpenAI TTS) with a single OpenAI Realtime API WebSocket using
gpt-4o-mini-realtime-preview.

- ~300-800ms latency (vs. 1-3s with the old sequential pipeline)
- Server VAD for automatic turn detection
- Streaming audio chunks during playback
- Interruptions: user can speak over Kira mid-response
- Honcho memory still injected into session instructions
- Frontend captures PCM16 mono 24kHz via AudioContext
- Backend relays client ↔ OpenAI Realtime API
- Supports both voice (PCM16) and text input
This commit is contained in:
2026-06-04 13:32:39 -04:00
parent e64698b0ab
commit e2332af8d0
4 changed files with 551 additions and 251 deletions
+183 -172
View File
@@ -1,26 +1,21 @@
"""Kira — AI body double backend
Real-time speech-to-speech pipeline:
mic audio → Whisper API → text → DeepSeek LLM → response text → OpenAI TTS → audio
Honcho memory integration:
Cross-session user context injected into LLM prompts,
conversation exchanges stored for continuous learning.
User preferences (name, scene, outfit, accessory) persisted in peer metadata.
OpenAI Realtime API pipeline:
mic audio → [built-in STT → GPT-4o-mini → built-in TTS] → speaker audio
Single WebSocket, ~300-800ms latency
"""
import json
import base64
import uuid
import logging
import asyncio
from fastapi import FastAPI, WebSocket, WebSocketDisconnect
from fastapi.middleware.cors import CORSMiddleware
from config import settings
from services.stt import transcribe_audio
from services.llm import get_kira_response
from services.tts import synthesize_speech
from services.realtime import RealtimeRelay
from services.memory import kira_memory
logging.basicConfig(level=logging.INFO)
@@ -36,24 +31,13 @@ app.add_middleware(
allow_headers=["*"],
)
# ─── Base system prompt (static part) ───
BASE_SYSTEM_PROMPT = (
"You are Kira, a warm, kind, and encouraging AI body double. "
"You speak in a friendly, girly-pop tone. You are helping someone with ADHD "
"stay focused and on task. Keep responses short, supportive, and uplifting. "
"Check in on them. Remind them to take breaks. Celebrate small wins. "
"Use occasional emoji but don't overdo it. Never be judgmental. "
"You remember things about them between conversations."
)
@app.on_event("startup")
async def startup():
"""Initialize Honcho memory on app startup."""
if kira_memory.init():
logger.info("Honcho memory initialized")
else:
logger.info("Honcho memory not configured — running without memory")
logger.info("Honcho memory not configured")
@app.get("/api/health")
@@ -62,61 +46,6 @@ async def health():
return {"status": "ok", "name": "kira", "memory": mem_status}
def build_system_prompt(user_id: str) -> dict:
    """Assemble the system message, appending Honcho memory context if any.

    Starts from ``BASE_SYSTEM_PROMPT``. When the memory service is enabled,
    ensures the peers for ``user_id`` exist and appends the suffix returned by
    ``kira_memory.build_system_prompt_suffix()``. Any exception raised while
    doing so is logged as a warning and the prompt built so far is used.

    Returns:
        A chat message dict with ``role`` set to ``"system"``.
    """
    prompt = BASE_SYSTEM_PROMPT

    if not kira_memory.enabled:
        return {"role": "system", "content": prompt}

    try:
        kira_memory.ensure_peers(user_id)
        suffix = kira_memory.build_system_prompt_suffix()
        if suffix:
            prompt = prompt + suffix
    except Exception as e:
        # Memory is best-effort: fall back to the prompt without context.
        logger.warning(f"Failed to build memory context: {e}")

    return {"role": "system", "content": prompt}
def handle_identify(msg: dict, session_id: str) -> dict | None:
"""Handle user identification. Returns user preferences or None."""
user_id = msg.get("user_id", "").strip()
if not user_id:
return {"type": "error", "message": "user_id is required"}
user_name = msg.get("name", "").strip()
if user_name:
kira_memory.set_user_preference(user_id, "name", user_name)
prefs = kira_memory.get_user_preferences(user_id)
logger.info(f"[{session_id}] Identified as {user_id} (name={user_name or prefs.get('name', '')})")
return {
"type": "identified",
"user_id": user_id,
"preferences": prefs,
}
def handle_set_preference(msg: dict, session_id: str, user_id: str) -> dict | None:
"""Handle preference update. Returns success status."""
if not user_id or user_id == "default-user":
return {"type": "error", "message": "Must identify first"}
key = msg.get("key", "").strip()
value = msg.get("value", "").strip()
if not key:
return {"type": "error", "message": "key is required"}
ok = kira_memory.set_user_preference(user_id, key, value)
return {
"type": "preference_saved",
"key": key,
"success": ok,
}
@app.websocket("/api/ws")
async def conversation_ws(websocket: WebSocket):
await websocket.accept()
@@ -125,8 +54,102 @@ async def conversation_ws(websocket: WebSocket):
identified = False
logger.info(f"[{session_id}] WebSocket connected")
audio_buffer = bytearray()
conversation_history: list[dict] = []
# Track conversation for Honcho
pending_transcripts: list[tuple[str, str]] = []
# Will be set when Realtime relay is ready
relay_ready = asyncio.Event()
relay: RealtimeRelay | None = None
relay_task: asyncio.Task | None = None
audio_queue: asyncio.Queue[bytes] = asyncio.Queue()
text_queue: asyncio.Queue[str] = asyncio.Queue()
async def on_ready():
relay_ready.set()
logger.info(f"[{session_id}] Realtime relay ready")
async def on_audio_delta(audio_bytes: bytes):
"""Forward audio chunks from OpenAI to the client."""
try:
audio_b64 = base64.b64encode(audio_bytes).decode("utf-8")
await websocket.send_json({
"type": "audio",
"data": audio_b64,
})
except Exception:
pass
async def on_transcript(text: str):
"""Store transcripts for Honcho."""
pending_transcripts.append(("transcript", text))
role, content = text.split(": ", 1)
logger.info(f"[{session_id}] {role}: {content}")
await websocket.send_json({
"type": "transcript",
"role": role,
"text": content,
})
async def on_speech_started():
"""Kira started speaking."""
await websocket.send_json({"type": "speaking_start"})
async def on_speech_stopped():
"""Kira finished speaking."""
await websocket.send_json({"type": "speaking_end"})
async def on_interruption():
"""User interrupted — Kira stops speaking."""
await websocket.send_json({"type": "interruption"})
async def on_error(msg: str):
await websocket.send_json({"type": "error", "message": msg})
# ── Create and start the Realtime relay ──
relay = RealtimeRelay(
on_audio_delta=on_audio_delta,
on_transcript=on_transcript,
on_speech_started=on_speech_started,
on_speech_stopped=on_speech_stopped,
on_interruption=on_interruption,
on_error=on_error,
on_ready=on_ready,
)
relay_task = asyncio.create_task(relay.connect())
# Wait for relay to be ready
try:
await asyncio.wait_for(relay_ready.wait(), timeout=15)
except asyncio.TimeoutError:
logger.error(f"[{session_id}] Realtime relay failed to connect")
await websocket.send_json({"type": "error", "message": "Failed to connect to AI"})
relay_task.cancel()
return
# ── Forward audio/text from client to relay ──
async def forward_audio():
while relay and relay._connected:
try:
pcm16 = await asyncio.wait_for(audio_queue.get(), timeout=1)
await relay.send_audio(pcm16)
except asyncio.TimeoutError:
continue
except Exception:
break
async def forward_text():
while relay and relay._connected:
try:
text = await asyncio.wait_for(text_queue.get(), timeout=1)
await relay.send_text(text)
except asyncio.TimeoutError:
continue
except Exception:
break
fwd_audio_task = asyncio.create_task(forward_audio())
fwd_text_task = asyncio.create_task(forward_text())
try:
while True:
@@ -134,110 +157,98 @@ async def conversation_ws(websocket: WebSocket):
msg = json.loads(raw)
msg_type = msg.get("type", "")
# ── Identity & Preferences ──
# ── Identity ──
if msg_type == "identify":
response = handle_identify(msg, session_id)
if response:
await websocket.send_json(response)
if response["type"] == "identified":
user_id = response["user_id"]
identified = True
# Set up Honcho for this user
if kira_memory.enabled:
try:
kira_memory.ensure_peers(user_id)
kira_memory.ensure_session(session_id)
logger.info(f"[{session_id}] Honcho session ready for {user_id}")
except Exception as e:
logger.warning(f"[{session_id}] Honcho setup failed: {e}")
user_id = msg.get("user_id", "").strip()
user_name = msg.get("name", "").strip()
if user_name and user_id:
kira_memory.set_user_preference(user_id, "name", user_name)
prefs = kira_memory.get_user_preferences(user_id)
identified = True
if kira_memory.enabled:
kira_memory.ensure_peers(user_id)
kira_memory.ensure_session(session_id)
# Inject Honcho context into the Realtime session instructions
memory_suffix = ""
if kira_memory.enabled:
try:
ctx = kira_memory.build_system_prompt_suffix()
if ctx:
memory_suffix = ctx
except Exception:
pass
if relay and relay._connected and memory_suffix:
await relay._send({
"type": "session.update",
"session": {
"instructions": (
"You are Kira, a warm, kind, and encouraging AI body double. "
"Speak in a friendly, girly-pop tone. Help someone with ADHD "
"stay focused. Keep responses short and supportive. "
"Check in, remind breaks, celebrate wins. Never judgmental."
+ memory_suffix
),
},
})
await websocket.send_json({
"type": "identified",
"user_id": user_id,
"preferences": prefs,
})
continue
# ── Preferences ──
if msg_type == "set_preference":
response = handle_set_preference(msg, session_id, user_id)
if response:
await websocket.send_json(response)
key = msg.get("key", "").strip()
value = msg.get("value", "").strip()
if key and user_id and user_id != "default-user":
kira_memory.set_user_preference(user_id, key, value)
continue
# ── Conversation ──
# ── Audio from frontend (PCM16) ──
if msg_type == "audio":
audio_b64 = msg.get("data", "")
if audio_b64:
pcm16 = base64.b64decode(audio_b64)
await audio_queue.put(pcm16)
continue
system_prompt = build_system_prompt(user_id)
# ── Text input ──
if msg_type == "conversation_text":
text = msg.get("text", "").strip()
if text:
await text_queue.put(text)
# Also store in Honcho immediately
if kira_memory.enabled and identified:
kira_memory.store_user_message(text)
continue
if msg_type == "audio_chunk":
chunk = base64.b64decode(msg["data"])
audio_buffer.extend(chunk)
elif msg_type == "transcribe":
if not audio_buffer:
await websocket.send_json({"type": "error", "message": "No audio data"})
continue
logger.info(f"[{session_id}] Transcribing {len(audio_buffer)} bytes...")
transcript = await transcribe_audio(bytes(audio_buffer))
audio_buffer.clear()
if not transcript:
await websocket.send_json({"type": "error", "message": "Could not transcribe audio"})
continue
await websocket.send_json({"type": "transcript", "text": transcript})
logger.info(f"[{session_id}] User: {transcript}")
conversation_history.append({"role": "user", "content": transcript})
messages = [system_prompt] + conversation_history[-10:]
kira_text = await get_kira_response(messages)
conversation_history.append({"role": "assistant", "content": kira_text})
logger.info(f"[{session_id}] Kira: {kira_text}")
if kira_memory.enabled and identified:
try:
kira_memory.store_messages(transcript, kira_text)
except Exception as e:
logger.warning(f"[{session_id}] Failed to store messages: {e}")
await websocket.send_json({"type": "speaking_start", "text": kira_text})
audio_bytes = await synthesize_speech(kira_text)
audio_b64 = base64.b64encode(audio_bytes).decode("utf-8")
await websocket.send_json({"type": "audio", "data": audio_b64, "text": kira_text})
await websocket.send_json({"type": "speaking_end"})
elif msg_type == "ping":
if msg_type == "ping":
await websocket.send_json({"type": "pong"})
elif msg_type == "conversation_text":
user_text = msg.get("text", "").strip()
if not user_text:
continue
logger.info(f"[{session_id}] User (text): {user_text}")
conversation_history.append({"role": "user", "content": user_text})
messages = [system_prompt] + conversation_history[-10:]
kira_text = await get_kira_response(messages)
conversation_history.append({"role": "assistant", "content": kira_text})
logger.info(f"[{session_id}] Kira: {kira_text}")
if kira_memory.enabled and identified:
try:
kira_memory.store_messages(user_text, kira_text)
except Exception as e:
logger.warning(f"[{session_id}] Failed to store messages: {e}")
await websocket.send_json({"type": "speaking_start", "text": kira_text})
audio_bytes = await synthesize_speech(kira_text)
audio_b64 = base64.b64encode(audio_bytes).decode("utf-8")
await websocket.send_json({"type": "audio", "data": audio_b64, "text": kira_text})
await websocket.send_json({"type": "speaking_end"})
except WebSocketDisconnect:
logger.info(f"[{session_id}] Disconnected")
except Exception as e:
logger.error(f"[{session_id}] Error: {e}")
try:
await websocket.send_json({"type": "error", "message": str(e)})
except Exception:
pass
finally:
# Store pending transcripts in Honcho
if kira_memory.enabled and identified:
for _, transcript_text in pending_transcripts:
if transcript_text.startswith("user: "):
content = transcript_text[6:]
kira_memory.store_user_message(content)
elif transcript_text.startswith("assistant: "):
content = transcript_text[11:]
kira_memory.store_kira_message(content)
fwd_audio_task.cancel()
fwd_text_task.cancel()
if relay:
await relay.disconnect()
if relay_task:
relay_task.cancel()
+1
View File
@@ -7,3 +7,4 @@ pydantic>=2.10.0
pydantic-settings>=2.7.0
httpx>=0.28.0
honcho-ai>=2.1.0
openai[realtime]>=2.41.0
+191
View File
@@ -0,0 +1,191 @@
"""OpenAI Realtime API relay service.
Manages a WebSocket connection to OpenAI's Realtime API and relays
audio/text events between the client and OpenAI.
"""
import json
import logging
from typing import Callable, Awaitable
from config import settings
logger = logging.getLogger("kira.realtime")
# ─── System instructions for Kira's personality ───
KIRA_INSTRUCTIONS = (
"You are Kira, a warm, kind, and encouraging AI body double. "
"You speak in a friendly, girly-pop tone. You are helping someone with ADHD "
"stay focused and on task. Keep responses short, supportive, and uplifting. "
"Check in on them. Remind them to take breaks. Celebrate small wins. "
"Use occasional emoji but don't overdo it. Never be judgmental. "
"You remember things about them between conversations."
)
class RealtimeRelay:
    """Relays audio/text between a client WS and OpenAI Realtime API.

    The relay owns a single Realtime connection. Events received from OpenAI
    are dispatched to the async callbacks supplied at construction time;
    ``send_audio`` and ``send_text`` push client input the other way.
    """

    def __init__(
        self,
        on_audio_delta: Callable[[bytes], Awaitable[None]],
        on_transcript: Callable[[str], Awaitable[None]],
        on_speech_started: Callable[[], Awaitable[None]],
        on_speech_stopped: Callable[[], Awaitable[None]],
        on_interruption: Callable[[], Awaitable[None]],
        on_error: Callable[[str], Awaitable[None]],
        on_ready: Callable[[], Awaitable[None]],
    ):
        """Store the callbacks; nothing is connected until ``connect()``.

        Args:
            on_audio_delta: Awaited with the decoded bytes of every
                ``response.audio.delta`` event.
            on_transcript: Awaited with ``"assistant: <text>"`` or
                ``"user: <text>"`` strings.
            on_speech_started: Awaited on
                ``response.audio_buffer.speech_started``.
            on_speech_stopped: Awaited on
                ``response.audio_buffer.speech_stopped``.
            on_interruption: Awaited on ``input_audio_buffer.speech_started``.
            on_error: Awaited with an error message for API ``error`` events
                and for connection failures.
            on_ready: Awaited once the session configuration has been sent.
        """
        self._on_audio_delta = on_audio_delta
        self._on_transcript = on_transcript
        self._on_speech_started = on_speech_started
        self._on_speech_stopped = on_speech_stopped
        self._on_interruption = on_interruption
        self._on_error = on_error
        self._on_ready = on_ready
        self._conn = None
        self._connected = False

    @staticmethod
    def _field(obj, key, default=None):
        """Read ``key`` from a dict or an attribute-style object.

        Returns ``default`` when ``obj`` is ``None`` or the field is missing
        or ``None``. Unlike ``getattr(...) or obj.get(...)``, this never calls
        ``.get`` on an object that is not a dict, so falsy fields on SDK
        event objects cannot raise ``AttributeError``.
        """
        if obj is None:
            return default
        if isinstance(obj, dict):
            value = obj.get(key)
        else:
            value = getattr(obj, key, None)
        return default if value is None else value

    async def connect(self):
        """Open a WebSocket to OpenAI Realtime API and pump events.

        Runs until the connection drops or ``disconnect()`` is called.
        Connection-level failures are logged and reported through
        ``on_error``; they are not raised to the caller.
        """
        if self._connected:
            return

        try:
            # Imported lazily so a missing optional dependency is reported
            # through on_error instead of failing at module import.
            from openai import AsyncOpenAI

            client = AsyncOpenAI(api_key=settings.openai_api_key)
            logger.info("Connecting to OpenAI Realtime API...")

            async with client.beta.realtime.connect(
                model="gpt-4o-mini-realtime-preview-2025-07-18",
            ) as conn:
                self._conn = conn
                self._connected = True
                logger.info("Connected to OpenAI Realtime API")

                # Configure session
                # NOTE(review): confirm the shape of "input_audio_transcription"
                # against the Realtime API reference — it may expect a
                # transcription model name rather than an "enabled" flag.
                await self._send({
                    "type": "session.update",
                    "session": {
                        "instructions": KIRA_INSTRUCTIONS,
                        "voice": "alloy",
                        "input_audio_transcription": {"enabled": True},
                        "turn_detection": {
                            "type": "server_vad",
                            "threshold": 0.5,
                            "prefix_padding_ms": 300,
                            "silence_duration_ms": 600,
                        },
                    },
                })

                await self._on_ready()

                # Listen for events
                while self._connected:
                    try:
                        event = await conn.recv()
                    except Exception as e:
                        # A failed receive means the socket is gone; stay
                        # quiet if we initiated the shutdown ourselves.
                        if self._connected:
                            logger.warning(f"Realtime recv error: {e}")
                        break

                    try:
                        await self._handle_event(event)
                    except Exception as e:
                        # One bad event or a failing callback must not tear
                        # down the upstream session; log and keep listening.
                        logger.warning(f"Realtime event handling error: {e}")

        except ImportError:
            logger.error("openai[realtime] not installed — run: pip install 'openai[realtime]'")
            await self._on_error("Missing openai[realtime] dependency")
        except Exception as e:
            logger.error(f"Realtime connection error: {e}")
            await self._on_error(str(e))
        finally:
            self._connected = False
            self._conn = None

    async def _handle_event(self, event):
        """Dispatch one Realtime API event to the matching callback.

        ``event`` may be a plain dict or an attribute-style object; unknown
        event types are ignored.
        """
        field = self._field
        event_type = field(event, "type", "")

        if event_type == "response.audio.delta":
            audio_b64 = field(event, "delta", "")
            if audio_b64:
                import base64

                await self._on_audio_delta(base64.b64decode(audio_b64))

        # NOTE(review): verify the two "response.audio_buffer.*" event names
        # against the Realtime API server-event reference.
        elif event_type == "response.audio_buffer.speech_started":
            await self._on_speech_started()

        elif event_type == "response.audio_buffer.speech_stopped":
            await self._on_speech_stopped()

        elif event_type == "input_audio_buffer.speech_started":
            # User started speaking — interrupt Kira
            await self._on_interruption()

        elif event_type == "conversation.item.created":
            item = field(event, "item", {})
            role = field(item, "role", "")
            for part in field(item, "content", []):
                part_type = field(part, "type", "")
                part_text = field(part, "text", "")
                if part_type == "text" and part_text and role == "assistant":
                    await self._on_transcript(f"assistant: {part_text}")
                part_transcript = field(part, "transcript", "")
                if part_type == "transcript" and part_transcript and role == "user":
                    await self._on_transcript(f"user: {part_transcript}")

        elif event_type == "error":
            err = field(event, "error", {})
            msg = field(err, "message") or str(event)
            logger.warning(f"Realtime API error: {msg}")
            await self._on_error(msg)

    async def send_audio(self, pcm16_bytes: bytes):
        """Send PCM16 audio chunk to OpenAI.

        Silently does nothing when the relay is not connected.
        """
        if not self._connected or not self._conn:
            return
        try:
            import base64

            audio_b64 = base64.b64encode(pcm16_bytes).decode("utf-8")
            await self._send({
                "type": "input_audio_buffer.append",
                "audio": audio_b64,
            })
        except Exception as e:
            logger.warning(f"Failed to send audio: {e}")

    async def send_text(self, text: str):
        """Send a text message to OpenAI and trigger a response.

        Silently does nothing when the relay is not connected.
        """
        if not self._connected or not self._conn:
            return
        try:
            await self._send({
                "type": "conversation.item.create",
                "item": {
                    "type": "message",
                    "role": "user",
                    "content": [{"type": "input_text", "text": text}],
                },
            })
            # Creating the item alone does not start generation; request it.
            await self._send({"type": "response.create"})
        except Exception as e:
            logger.warning(f"Failed to send text: {e}")

    async def _send(self, data: dict):
        """Send a JSON event to the Realtime API.

        Send failures are logged, never raised.
        """
        conn = self._conn
        if conn is None:
            logger.warning("Realtime send error: not connected")
            return
        try:
            await conn.send(data)
        except Exception as e:
            logger.warning(f"Realtime send error: {e}")

    async def disconnect(self):
        """Close the Realtime connection.

        Safe to call when already disconnected; close errors are ignored.
        """
        # Clear the flag first so the receive loop exits without logging.
        self._connected = False
        if self._conn:
            try:
                await self._conn.close()
            except Exception:
                pass
            self._conn = None
+176 -79
View File
@@ -25,6 +25,41 @@ function saveUserId(id: string) {
localStorage.setItem(USER_ID_KEY, id);
}
/**
 * Capture PCM16 mono 24kHz audio from mic and send via callback.
 *
 * Creates an AudioContext at 24 kHz, taps the stream through a
 * ScriptProcessorNode (4096-frame buffer, 1 in / 1 out channel) and hands
 * each buffer to `onChunk` as the raw bytes of little-endian-on-typical-
 * hardware Int16 samples (the byte view of an Int16Array).
 *
 * @param stream  Microphone stream to read from; the caller keeps ownership
 *                and is responsible for stopping its tracks.
 * @param onChunk Invoked once per processed buffer with the PCM16 bytes.
 * @returns A handle whose `stop()` detaches the nodes and closes the
 *          context. `stop()` is idempotent.
 */
function startPCMCapture(
  stream: MediaStream,
  onChunk: (pcm16: Uint8Array) => void,
): { stop: () => void } {
  const ctx = new AudioContext({ sampleRate: 24000 });
  const source = ctx.createMediaStreamSource(stream);
  const processor = ctx.createScriptProcessor(4096, 1, 1);
  let running = true;

  processor.onaudioprocess = (e) => {
    if (!running) return;
    const input = e.inputBuffer.getChannelData(0); // Float32Array [-1, 1]
    // Convert float32 → PCM16 int16; negative and positive halves scale
    // differently because int16 spans -32768..32767.
    const pcm16 = new Int16Array(input.length);
    for (let i = 0; i < input.length; i++) {
      const s = Math.max(-1, Math.min(1, input[i]));
      pcm16[i] = s < 0 ? s * 0x8000 : s * 0x7fff;
    }
    onChunk(new Uint8Array(pcm16.buffer));
  };

  source.connect(processor);
  // NOTE(review): presumably connected to the destination so the processor
  // keeps receiving audio callbacks — confirm against target browsers.
  processor.connect(ctx.destination);

  return {
    stop: () => {
      // Guard against double stop: closing an already-closed AudioContext
      // rejects, which would surface as an unhandled promise rejection.
      if (!running) return;
      running = false;
      processor.onaudioprocess = null;
      source.disconnect();
      processor.disconnect();
      ctx.close().catch(() => {});
    },
  };
}
export function useConversation() {
const [messages, setMessages] = useState<Message[]>([]);
const [isConnected, setIsConnected] = useState(false);
@@ -38,11 +73,13 @@ export function useConversation() {
accessory: '',
});
const [loadingPrefs, setLoadingPrefs] = useState(true);
const [micError, setMicError] = useState<string | null>(null);
const wsRef = useRef<WebSocket | null>(null);
const audioRef = useRef<HTMLAudioElement | null>(null);
const recorderRef = useRef<MediaRecorder | null>(null);
const captureRef = useRef<{ stop: () => void } | null>(null);
const streamRef = useRef<MediaStream | null>(null);
const audioBufferRef = useRef<Uint8Array[]>([]);
// Connect WebSocket
const connect = useCallback(() => {
@@ -54,7 +91,6 @@ export function useConversation() {
ws.onopen = () => {
setIsConnected(true);
// Auto-identify if returning user
const savedId = loadUserId();
if (savedId) {
ws.send(JSON.stringify({ type: 'identify', user_id: savedId }));
@@ -102,35 +138,56 @@ export function useConversation() {
break;
}
case 'preference_saved':
// Already optimistically updated locally
break;
case 'transcript':
addMessage('user', msg.text);
addMessage(msg.role === 'user' ? 'user' : 'kira', msg.text);
break;
case 'speaking_start':
setIsKiraSpeaking(true);
addMessage('kira', msg.text || '...');
break;
case 'audio':
case 'audio': {
// Incoming PCM16 audio from Kira
if (msg.data && audioRef.current) {
// Accumulate audio chunks and create a blob
const binary = atob(msg.data);
const bytes = new Uint8Array(binary.length);
for (let i = 0; i < binary.length; i++) {
bytes[i] = binary.charCodeAt(i);
}
const blob = new Blob([bytes], { type: 'audio/ogg' });
audioBufferRef.current.push(bytes);
// Convert accumulated PCM16 to WAV blob for playback
const allChunks = audioBufferRef.current;
const totalLen = allChunks.reduce((s, c) => s + c.length, 0);
const combined = new Uint8Array(totalLen);
let offset = 0;
for (const chunk of allChunks) {
combined.set(chunk, offset);
offset += chunk.length;
}
const wav = pcm16ToWav(combined);
const blob = new Blob([wav], { type: 'audio/wav' });
const url = URL.createObjectURL(blob);
audioRef.current.src = url;
audioRef.current.play().catch(() => {});
}
break;
}
case 'speaking_end':
setIsKiraSpeaking(false);
audioBufferRef.current = [];
break;
case 'interruption':
setIsKiraSpeaking(false);
audioBufferRef.current = [];
if (audioRef.current) {
audioRef.current.pause();
audioRef.current.currentTime = 0;
}
break;
case 'error':
@@ -154,99 +211,80 @@ export function useConversation() {
setPreferences((p) => ({ ...p, name }));
if (wsRef.current?.readyState === WebSocket.OPEN) {
wsRef.current.send(JSON.stringify({
type: 'identify',
user_id: userId,
name,
}));
wsRef.current.send(JSON.stringify({ type: 'identify', user_id: userId, name }));
}
}, []);
// ── Preferences ──
const setPreference = useCallback((key: string, value: string) => {
// Optimistic update
setPreferences((p) => ({ ...p, [key]: value }));
// Sync to backend
if (wsRef.current?.readyState === WebSocket.OPEN && identified) {
wsRef.current.send(JSON.stringify({
type: 'set_preference',
key,
value,
}));
wsRef.current.send(JSON.stringify({ type: 'set_preference', key, value }));
}
}, [identified]);
// ── Audio (Realtime PCM16) ──
const startRecording = useCallback(async () => {
// Check HTTPS
if (!navigator.mediaDevices || !navigator.mediaDevices.getUserMedia) {
addMessage('kira', 'Mic requires HTTPS. Try accessing via HTTPS!');
return;
}
try {
setMicError(null);
const stream = await navigator.mediaDevices.getUserMedia({ audio: true });
streamRef.current = stream;
const ws = wsRef.current;
if (!ws || ws.readyState !== WebSocket.OPEN) {
addMessage('kira', 'Not connected to server yet...');
stream.getTracks().forEach((t) => t.stop());
return;
}
// Start PCM16 capture — each chunk sent as WS message
const capture = startPCMCapture(stream, (pcm16) => {
if (ws.readyState === WebSocket.OPEN) {
const base64 = arrayBufferToBase64(pcm16.buffer);
ws.send(JSON.stringify({ type: 'audio', data: base64 }));
}
});
captureRef.current = capture;
setIsRecording(true);
} catch (err) {
const msg = err instanceof Error ? err.message : String(err);
setMicError(msg);
console.error('[Kira Mic]', msg);
}
}, [addMessage]);
const stopRecording = useCallback(() => {
captureRef.current?.stop();
captureRef.current = null;
streamRef.current?.getTracks().forEach((t) => t.stop());
streamRef.current = null;
setIsRecording(false);
}, []);
// ── Text ──
const sendText = useCallback((text: string) => {
if (!text.trim()) return;
if (wsRef.current?.readyState === WebSocket.OPEN) {
wsRef.current.send(JSON.stringify({
type: 'conversation_text',
text: text.trim(),
}));
wsRef.current.send(JSON.stringify({ type: 'conversation_text', text: text.trim() }));
}
}, []);
// ── Audio ──
const startRecording = useCallback(async () => {
// Check if mediaDevices is available (requires HTTPS/localhost)
if (!navigator.mediaDevices || !navigator.mediaDevices.getUserMedia) {
addMessage('kira', 'Your browser needs HTTPS to use the microphone. Try accessing Kira through the HTTPS address instead!');
return;
}
try {
const stream = await navigator.mediaDevices.getUserMedia({ audio: true });
streamRef.current = stream;
const recorder = new MediaRecorder(stream, {
mimeType: MediaRecorder.isTypeSupported('audio/webm;codecs=opus')
? 'audio/webm;codecs=opus'
: 'audio/webm',
});
const chunks: BlobPart[] = [];
recorder.ondataavailable = (e) => {
if (e.data.size > 0) chunks.push(e.data);
};
recorder.onstop = () => {
const blob = new Blob(chunks, { type: 'audio/webm' });
const reader = new FileReader();
reader.onload = () => {
const base64 = (reader.result as string).split(',')[1];
if (wsRef.current?.readyState === WebSocket.OPEN) {
wsRef.current.send(JSON.stringify({ type: 'audio_chunk', data: base64 }));
wsRef.current.send(JSON.stringify({ type: 'transcribe' }));
}
};
reader.readAsDataURL(blob);
stream.getTracks().forEach((t) => t.stop());
setIsRecording(false);
};
recorder.start();
recorderRef.current = recorder;
setIsRecording(true);
} catch (err) {
console.error('[Kira Mic] failed:', err);
}
}, []);
const stopRecording = useCallback(() => {
recorderRef.current?.stop();
}, []);
// Connect on mount
useEffect(() => {
connect();
return () => {
wsRef.current?.close();
captureRef.current?.stop();
streamRef.current?.getTracks().forEach((t) => t.stop());
};
}, [connect]);
@@ -259,6 +297,7 @@ export function useConversation() {
identified,
preferences,
loadingPrefs,
micError,
identify,
setPreference,
sendText,
@@ -266,3 +305,61 @@ export function useConversation() {
stopRecording,
};
}
// ── Helpers ──
/** Encode the bytes of `buffer` as a base64 string. */
function arrayBufferToBase64(buffer: ArrayBufferLike): string {
  // btoa() wants a "binary string": one character per byte, code 0-255.
  const chars = Array.from(new Uint8Array(buffer), (byte) => String.fromCharCode(byte));
  return btoa(chars.join(''));
}
/**
 * Wrap raw PCM16 mono 24kHz bytes in a 44-byte RIFF/WAVE header.
 *
 * @param pcm16 Raw sample bytes (two bytes per sample).
 * @returns An ArrayBuffer holding the header followed by the samples,
 *          suitable for constructing an `audio/wav` Blob.
 */
function pcm16ToWav(pcm16: Uint8Array): ArrayBuffer {
  const CHANNELS = 1;
  const SAMPLE_RATE = 24000;
  const BITS_PER_SAMPLE = 16;
  const HEADER_BYTES = 44;

  const bytesPerFrame = CHANNELS * (BITS_PER_SAMPLE / 8);
  const bytesPerSecond = SAMPLE_RATE * bytesPerFrame;
  const payloadBytes = pcm16.length;

  const wav = new ArrayBuffer(HEADER_BYTES + payloadBytes);
  const header = new DataView(wav);

  // RIFF container: chunk size excludes the 8-byte "RIFF"+size prefix.
  writeString(header, 0, 'RIFF');
  header.setUint32(4, HEADER_BYTES + payloadBytes - 8, true);
  writeString(header, 8, 'WAVE');

  // "fmt " subchunk: 16-byte body, format tag 1 = uncompressed PCM.
  writeString(header, 12, 'fmt ');
  header.setUint32(16, 16, true);
  header.setUint16(20, 1, true);
  header.setUint16(22, CHANNELS, true);
  header.setUint32(24, SAMPLE_RATE, true);
  header.setUint32(28, bytesPerSecond, true);
  header.setUint16(32, bytesPerFrame, true);
  header.setUint16(34, BITS_PER_SAMPLE, true);

  // "data" subchunk followed by the samples, copied verbatim.
  writeString(header, 36, 'data');
  header.setUint32(40, payloadBytes, true);
  new Uint8Array(wav, HEADER_BYTES).set(pcm16);

  return wav;
}
/** Write the UTF-16 code units of `str` into `view` as single bytes, starting at `offset`. */
function writeString(view: DataView, offset: number, str: string) {
  // split('') yields one entry per code unit, matching charCodeAt(i).
  str.split('').forEach((unit, index) => {
    view.setUint8(offset + index, unit.charCodeAt(0));
  });
}