e2332af8d0
Replaced the 3-step sequential pipeline (Whisper STT → DeepSeek LLM → OpenAI TTS) with a single OpenAI Realtime API WebSocket using gpt-4o-mini-realtime-preview.
- ~300-800ms latency vs 1-3s
- Server VAD for automatic turn detection
- Streaming audio chunks during playback
- Interruptions: user can speak over Kira mid-response
- Honcho memory still injected into session instructions
- Frontend captures PCM16 mono 24kHz via AudioContext
- Backend relays client ↔ OpenAI Realtime API
- Supports both voice (PCM16) and text input
366 lines
10 KiB
TypeScript
366 lines
10 KiB
TypeScript
import { useState, useCallback, useRef, useEffect } from 'react';
|
|
|
|
/**
 * Per-user settings held by the conversation hook and mirrored to the server
 * through `set_preference` messages.
 */
export interface UserPreferences {
  /** Name the user identified with; empty until `identify` has been called. */
  name: string;
  /** Scene identifier; the hook falls back to `'cozy-room'`. */
  scene: string;
  /** Outfit identifier; the hook falls back to `'cozy-hoodie'`. */
  outfit: string;
  /** Accessory identifier; the hook falls back to the empty string. */
  accessory: string;
}
|
|
|
|
/** One entry of the conversation transcript kept in hook state. */
interface Message {
  /** Client-generated identifier for the entry. */
  id: string;
  /** Who produced the text: the local user or Kira. */
  role: 'user' | 'kira';
  /** Transcript text of the entry. */
  text: string;
  /** `Date.now()` value captured when the entry was added. */
  timestamp: number;
}
|
|
|
|
// WebSocket endpoint on the page's own host; `wss:` when the page is served over HTTPS, `ws:` otherwise.
const WS_URL =
  (location.protocol === 'https:' ? 'wss:' : 'ws:') + '//' + location.host + '/api/ws';

// localStorage key under which the user id is persisted.
const USER_ID_KEY = 'kira-user-id';
|
|
|
|
/**
 * Reads the persisted user id from localStorage.
 *
 * @returns The stored id, or an empty string when nothing is stored.
 */
function loadUserId(): string {
  const stored = localStorage.getItem(USER_ID_KEY);
  return stored === null ? '' : stored;
}
|
|
|
|
/**
 * Persists the user id in localStorage, replacing any previous value.
 *
 * @param id - Identifier to store.
 */
function saveUserId(id: string): void {
  localStorage.setItem(USER_ID_KEY, id);
}
|
|
|
|
/**
 * Streams microphone audio as PCM16 mono chunks.
 *
 * A 24 kHz `AudioContext` feeds the stream through a 4096-frame script
 * processor; every processed buffer is clamped to [-1, 1], converted to
 * 16-bit integers and passed to `onChunk` as the raw bytes of that
 * `Int16Array`.
 *
 * @param stream - Microphone stream to read from.
 * @param onChunk - Called once per processed buffer with the PCM16 bytes.
 * @returns A handle whose `stop()` silences further chunks, disconnects the
 *   audio graph and closes the context.
 */
function startPCMCapture(
  stream: MediaStream,
  onChunk: (pcm16: Uint8Array) => void,
): { stop: () => void } {
  const audioCtx = new AudioContext({ sampleRate: 24000 });
  const micNode = audioCtx.createMediaStreamSource(stream);
  const tapNode = audioCtx.createScriptProcessor(4096, 1, 1);
  let active = true;

  // Float sample → int16: negatives scale by 0x8000, positives by 0x7fff.
  const toInt16 = (sample: number): number => {
    const clamped = Math.max(-1, Math.min(1, sample));
    return clamped < 0 ? clamped * 0x8000 : clamped * 0x7fff;
  };

  tapNode.onaudioprocess = (event) => {
    if (!active) {
      return;
    }
    const floats = event.inputBuffer.getChannelData(0);
    const ints = Int16Array.from(floats, toInt16);
    onChunk(new Uint8Array(ints.buffer));
  };

  micNode.connect(tapNode);
  tapNode.connect(audioCtx.destination);

  const stop = (): void => {
    active = false;
    micNode.disconnect();
    tapNode.disconnect();
    audioCtx.close();
  };

  return { stop };
}
|
|
|
|
/** Delay before a reconnect attempt after the socket closes, in milliseconds. */
const RECONNECT_DELAY_MS = 3000;

/** Returns `value` when it is a non-empty string, otherwise `fallback`. */
function stringOr(value: unknown, fallback: string): string {
  return typeof value === 'string' && value !== '' ? value : fallback;
}

/** Decodes a base64 string to bytes; returns `null` when the input is not valid base64. */
function base64ToBytes(data: string): Uint8Array | null {
  try {
    const binary = atob(data);
    const bytes = new Uint8Array(binary.length);
    for (let i = 0; i < binary.length; i++) {
      bytes[i] = binary.charCodeAt(i);
    }
    return bytes;
  } catch {
    return null;
  }
}

/**
 * Creates an id for a transcript entry. `crypto.randomUUID` only exists in
 * secure contexts, and this hook is explicitly usable (minus the mic) on plain
 * HTTP, so a non-cryptographic fallback is required.
 */
function createMessageId(): string {
  if (typeof crypto !== 'undefined' && typeof crypto.randomUUID === 'function') {
    return crypto.randomUUID();
  }
  return `${Date.now().toString(36)}-${Math.random().toString(36).slice(2, 10)}`;
}

/**
 * React hook that owns the realtime conversation with Kira.
 *
 * It keeps one WebSocket to `WS_URL` (reconnecting after
 * `RECONNECT_DELAY_MS` while mounted), sends mic audio as base64 PCM16
 * `audio` messages and text as `conversation_text` messages, and reacts to the
 * server messages `identified`, `transcript`, `speaking_start`, `audio`,
 * `speaking_end`, `interruption` and `error`.
 *
 * Incoming audio chunks are decoded and scheduled back-to-back on an
 * `AudioContext`, so playback is gapless and never restarts; an
 * `interruption` message stops everything that is queued.
 *
 * @returns Conversation state (`messages`, `isConnected`, `isKiraSpeaking`,
 *   `isRecording`, `identified`, `preferences`, `loadingPrefs`, `micError`)
 *   and actions (`identify`, `setPreference`, `sendText`, `startRecording`,
 *   `stopRecording`). `isKiraSpeaking` stays true until the server has
 *   signalled `speaking_end` AND all queued audio has finished playing.
 */
export function useConversation() {
  const [messages, setMessages] = useState<Message[]>([]);
  const [isConnected, setIsConnected] = useState(false);
  const [isKiraSpeaking, setIsKiraSpeaking] = useState(false);
  const [isRecording, setIsRecording] = useState(false);
  const [identified, setIdentified] = useState(false);
  const [preferences, setPreferences] = useState<UserPreferences>({
    name: '',
    scene: 'cozy-room',
    outfit: 'cozy-hoodie',
    accessory: '',
  });
  const [loadingPrefs, setLoadingPrefs] = useState(true);
  const [micError, setMicError] = useState<string | null>(null);

  // Connection bookkeeping.
  const wsRef = useRef<WebSocket | null>(null);
  const reconnectTimerRef = useRef<ReturnType<typeof setTimeout> | null>(null);
  const disposedRef = useRef(false); // true between unmount and (StrictMode) remount

  // Microphone capture bookkeeping.
  const captureRef = useRef<{ stop: () => void } | null>(null);
  const streamRef = useRef<MediaStream | null>(null);
  const startingRef = useRef(false); // a getUserMedia request is in flight

  // Playback bookkeeping.
  const playbackCtxRef = useRef<AudioContext | null>(null);
  const playbackNodesRef = useRef<Set<AudioBufferSourceNode>>(new Set());
  const playbackCursorRef = useRef(0); // context time at which the next chunk starts
  const playbackEpochRef = useRef(0); // bumped on flush so in-flight decodes are discarded
  const pendingChunksRef = useRef(0); // chunks being decoded or still playing
  const responseDoneRef = useRef(true); // server has finished sending the current response
  const decodeQueueRef = useRef<Promise<void>>(Promise.resolve());

  const addMessage = useCallback((role: 'user' | 'kira', text: string): void => {
    // Build the entry outside the updater so the updater stays pure.
    const entry: Message = { id: createMessageId(), role, text, timestamp: Date.now() };
    setMessages((prev) => [...prev, entry]);
  }, []);

  // ── Playback ──

  /** Returns the playback context, creating it on demand and resuming it if suspended. */
  const getPlaybackContext = useCallback((): AudioContext | null => {
    if (typeof AudioContext === 'undefined') return null;
    let ctx = playbackCtxRef.current;
    if (!ctx || ctx.state === 'closed') {
      ctx = new AudioContext();
      playbackCtxRef.current = ctx;
    }
    if (ctx.state === 'suspended') {
      // Autoplay policies keep the context suspended until a user gesture.
      void ctx.resume().catch(() => {});
    }
    return ctx;
  }, []);

  /** Marks one chunk as finished and clears the speaking flag once the queue drains. */
  const settleChunk = useCallback((): void => {
    pendingChunksRef.current = Math.max(0, pendingChunksRef.current - 1);
    if (pendingChunksRef.current === 0 && responseDoneRef.current) {
      setIsKiraSpeaking(false);
    }
  }, []);

  /** Stops every scheduled chunk and invalidates decodes that are still in flight. */
  const flushPlayback = useCallback((): void => {
    playbackEpochRef.current += 1;
    pendingChunksRef.current = 0;
    playbackCursorRef.current = 0;
    for (const node of playbackNodesRef.current) {
      node.onended = null;
      try {
        node.stop();
      } catch {
        /* node never started or already stopped */
      }
      node.disconnect();
    }
    playbackNodesRef.current.clear();
  }, []);

  /** Decodes one PCM16 chunk and schedules it right after the previously queued audio. */
  const enqueueAudio = useCallback(
    (pcm: Uint8Array): void => {
      // Ignore a trailing half sample so the WAV header stays consistent.
      const usable = pcm.length - (pcm.length % 2);
      if (usable === 0) return;
      const ctx = getPlaybackContext();
      if (!ctx) return;

      const samples = usable === pcm.length ? pcm : pcm.subarray(0, usable);
      const epoch = playbackEpochRef.current;
      pendingChunksRef.current += 1;

      // Decodes are chained so chunks are scheduled in arrival order.
      decodeQueueRef.current = decodeQueueRef.current
        .then(() => ctx.decodeAudioData(pcm16ToWav(samples)))
        .then((buffer) => {
          if (epoch !== playbackEpochRef.current) return; // flushed while decoding
          const node = ctx.createBufferSource();
          node.buffer = buffer;
          node.connect(ctx.destination);
          const startAt = Math.max(ctx.currentTime, playbackCursorRef.current);
          playbackCursorRef.current = startAt + buffer.duration;
          playbackNodesRef.current.add(node);
          node.onended = () => {
            playbackNodesRef.current.delete(node);
            node.disconnect();
            if (epoch === playbackEpochRef.current) settleChunk();
          };
          node.start(startAt);
        })
        .catch((err: unknown) => {
          if (epoch !== playbackEpochRef.current) return;
          console.error('[Kira Audio]', err);
          settleChunk();
        });
    },
    [getPlaybackContext, settleChunk],
  );

  // ── Incoming messages ──

  const handleMessage = useCallback(
    (msg: Record<string, unknown>): void => {
      switch (msg.type) {
        case 'identified': {
          setIdentified(true);
          setLoadingPrefs(false);
          if (typeof msg.user_id === 'string' && msg.user_id) saveUserId(msg.user_id);
          const prefs = msg.preferences;
          if (typeof prefs === 'object' && prefs !== null) {
            const raw = prefs as Record<string, unknown>;
            setPreferences({
              name: stringOr(raw.name, ''),
              scene: stringOr(raw.scene, 'cozy-room'),
              outfit: stringOr(raw.outfit, 'cozy-hoodie'),
              accessory: stringOr(raw.accessory, ''),
            });
          }
          break;
        }

        case 'transcript':
          if (typeof msg.text === 'string') {
            addMessage(msg.role === 'user' ? 'user' : 'kira', msg.text);
          }
          break;

        case 'speaking_start':
          responseDoneRef.current = false;
          setIsKiraSpeaking(true);
          break;

        case 'audio': {
          // Incoming base64 PCM16 audio from Kira.
          if (typeof msg.data === 'string' && msg.data) {
            const bytes = base64ToBytes(msg.data);
            if (bytes) enqueueAudio(bytes);
          }
          break;
        }

        case 'speaking_end':
          // The server is done sending; queued audio may still be playing.
          responseDoneRef.current = true;
          if (pendingChunksRef.current === 0) setIsKiraSpeaking(false);
          break;

        case 'interruption':
          responseDoneRef.current = true;
          flushPlayback();
          setIsKiraSpeaking(false);
          break;

        case 'error':
          console.error('[Kira]', msg.message);
          break;
      }
    },
    [addMessage, enqueueAudio, flushPlayback],
  );

  // ── Connection ──

  const connect = useCallback((): void => {
    if (disposedRef.current) return;
    const current = wsRef.current;
    if (
      current &&
      (current.readyState === WebSocket.OPEN || current.readyState === WebSocket.CONNECTING)
    ) {
      return;
    }
    reconnectTimerRef.current = null;
    setLoadingPrefs(true);

    const ws = new WebSocket(WS_URL);
    wsRef.current = ws;

    ws.onopen = () => {
      if (wsRef.current !== ws) return; // superseded by a newer socket
      setIsConnected(true);
      const savedId = loadUserId();
      if (savedId) {
        ws.send(JSON.stringify({ type: 'identify', user_id: savedId }));
      } else {
        setLoadingPrefs(false);
      }
    };

    ws.onclose = () => {
      if (wsRef.current !== ws) return; // superseded by a newer socket
      wsRef.current = null;
      setIsConnected(false);
      if (!disposedRef.current) {
        reconnectTimerRef.current = setTimeout(connect, RECONNECT_DELAY_MS);
      }
    };

    ws.onmessage = (event) => {
      if (typeof event.data !== 'string') return;
      let parsed: unknown;
      try {
        parsed = JSON.parse(event.data);
      } catch {
        return; // not JSON — ignore
      }
      // Handler errors are deliberately NOT swallowed together with parse errors.
      if (typeof parsed === 'object' && parsed !== null) {
        handleMessage(parsed as Record<string, unknown>);
      }
    };
  }, [handleMessage]);

  // ── Identity ──

  const identify = useCallback((name: string) => {
    const userId = `kira-${name.toLowerCase().replace(/[^a-z0-9]/g, '-')}`;
    saveUserId(userId);
    setPreferences((p) => ({ ...p, name }));

    if (wsRef.current?.readyState === WebSocket.OPEN) {
      wsRef.current.send(JSON.stringify({ type: 'identify', user_id: userId, name }));
    }
  }, []);

  // ── Preferences ──

  const setPreference = useCallback(
    (key: string, value: string) => {
      setPreferences((p) => ({ ...p, [key]: value }));
      if (wsRef.current?.readyState === WebSocket.OPEN && identified) {
        wsRef.current.send(JSON.stringify({ type: 'set_preference', key, value }));
      }
    },
    [identified],
  );

  // ── Audio (Realtime PCM16) ──

  const startRecording = useCallback(async (): Promise<void> => {
    // getUserMedia is only exposed in secure contexts.
    if (!navigator.mediaDevices || !navigator.mediaDevices.getUserMedia) {
      addMessage('kira', 'Mic requires HTTPS. Try accessing via HTTPS!');
      return;
    }
    // Already recording (or about to): a second capture would leak the first.
    if (captureRef.current || startingRef.current) return;

    // Check the connection before prompting for mic permission.
    if (wsRef.current?.readyState !== WebSocket.OPEN) {
      addMessage('kira', 'Not connected to server yet...');
      return;
    }

    startingRef.current = true;
    let stream: MediaStream | null = null;
    try {
      setMicError(null);
      getPlaybackContext(); // user gesture: unlock audio output for Kira's replies
      stream = await navigator.mediaDevices.getUserMedia({ audio: true });
      if (disposedRef.current) {
        stream.getTracks().forEach((t) => t.stop());
        return;
      }

      // Look the socket up per chunk so audio keeps flowing after a reconnect.
      const capture = startPCMCapture(stream, (pcm16) => {
        const socket = wsRef.current;
        if (socket?.readyState === WebSocket.OPEN) {
          socket.send(JSON.stringify({ type: 'audio', data: arrayBufferToBase64(pcm16.buffer) }));
        }
      });

      streamRef.current = stream;
      captureRef.current = capture;
      setIsRecording(true);
    } catch (err) {
      // Release the mic if capture setup failed after permission was granted.
      stream?.getTracks().forEach((t) => t.stop());
      const msg = err instanceof Error ? err.message : String(err);
      setMicError(msg);
      console.error('[Kira Mic]', msg);
    } finally {
      startingRef.current = false;
    }
  }, [addMessage, getPlaybackContext]);

  const stopRecording = useCallback(() => {
    captureRef.current?.stop();
    captureRef.current = null;
    streamRef.current?.getTracks().forEach((t) => t.stop());
    streamRef.current = null;
    setIsRecording(false);
  }, []);

  // ── Text ──

  const sendText = useCallback(
    (text: string) => {
      if (!text.trim()) return;
      getPlaybackContext(); // user gesture: unlock audio output for Kira's reply
      if (wsRef.current?.readyState === WebSocket.OPEN) {
        wsRef.current.send(JSON.stringify({ type: 'conversation_text', text: text.trim() }));
      }
    },
    [getPlaybackContext],
  );

  // Connect on mount; tear everything down on unmount.
  useEffect(() => {
    disposedRef.current = false;
    connect();

    return () => {
      disposedRef.current = true;

      if (reconnectTimerRef.current !== null) {
        clearTimeout(reconnectTimerRef.current);
        reconnectTimerRef.current = null;
      }

      // Detach handlers first so closing cannot schedule a reconnect.
      const ws = wsRef.current;
      wsRef.current = null;
      if (ws) {
        ws.onopen = null;
        ws.onclose = null;
        ws.onmessage = null;
        ws.close();
      }

      captureRef.current?.stop();
      captureRef.current = null;
      streamRef.current?.getTracks().forEach((t) => t.stop());
      streamRef.current = null;

      flushPlayback();
      const ctx = playbackCtxRef.current;
      playbackCtxRef.current = null;
      if (ctx && ctx.state !== 'closed') {
        void ctx.close().catch(() => {});
      }
    };
  }, [connect, flushPlayback]);

  return {
    messages,
    isConnected,
    isKiraSpeaking,
    isRecording,
    identified,
    preferences,
    loadingPrefs,
    micError,
    identify,
    setPreference,
    sendText,
    startRecording,
    stopRecording,
  };
}
|
|
|
|
// ── Helpers ──
|
|
|
|
/**
 * Base64-encodes the bytes of a buffer.
 *
 * @param buffer - Buffer whose entire contents are encoded.
 * @returns The base64 text; an empty string for an empty buffer.
 */
function arrayBufferToBase64(buffer: ArrayBufferLike): string {
  const view = new Uint8Array(buffer);
  // Build the binary string in slices to keep each fromCharCode call's argument list bounded.
  const SLICE = 0x2000;
  const pieces: string[] = [];
  for (let start = 0; start < view.length; start += SLICE) {
    pieces.push(String.fromCharCode(...view.subarray(start, start + SLICE)));
  }
  return btoa(pieces.join(''));
}
|
|
|
|
/**
 * Wraps raw PCM16 mono 24 kHz bytes in a 44-byte RIFF/WAVE header.
 *
 * @param pcm16 - Raw little-endian sample bytes; copied unchanged after the header.
 * @returns A new `ArrayBuffer` holding the header followed by the sample bytes.
 */
function pcm16ToWav(pcm16: Uint8Array): ArrayBuffer {
  const CHANNELS = 1;
  const SAMPLE_RATE = 24000;
  const BITS_PER_SAMPLE = 16;
  const HEADER_BYTES = 44;

  const bytesPerFrame = CHANNELS * (BITS_PER_SAMPLE / 8);
  const bytesPerSecond = SAMPLE_RATE * bytesPerFrame;
  const payloadBytes = pcm16.length;

  const wav = new ArrayBuffer(HEADER_BYTES + payloadBytes);
  const header = new DataView(wav);

  // RIFF chunk descriptor; its size field excludes the 8 bytes of tag + size.
  writeString(header, 0, 'RIFF');
  header.setUint32(4, HEADER_BYTES + payloadBytes - 8, true);
  writeString(header, 8, 'WAVE');

  // "fmt " subchunk: 16-byte body, format code 1 (PCM).
  writeString(header, 12, 'fmt ');
  header.setUint32(16, 16, true);
  header.setUint16(20, 1, true);
  header.setUint16(22, CHANNELS, true);
  header.setUint32(24, SAMPLE_RATE, true);
  header.setUint32(28, bytesPerSecond, true);
  header.setUint16(32, bytesPerFrame, true);
  header.setUint16(34, BITS_PER_SAMPLE, true);

  // "data" subchunk followed by the samples themselves.
  writeString(header, 36, 'data');
  header.setUint32(40, payloadBytes, true);
  new Uint8Array(wav, HEADER_BYTES).set(pcm16);

  return wav;
}
|
|
|
|
/**
 * Writes `str` into `view` starting at `offset`, one byte per UTF-16 code unit.
 *
 * @param view - Destination view.
 * @param offset - Byte position of the first character.
 * @param str - Text to write; intended for ASCII tags such as `'RIFF'`.
 */
function writeString(view: DataView, offset: number, str: string): void {
  let cursor = offset;
  for (const unit of str.split('')) {
    view.setUint8(cursor, unit.charCodeAt(0));
    cursor += 1;
  }
}
|