Files
kira/frontend/src/hooks/useConversation.ts
T
hobokenchicken 77cbd91b93 fix(tts): play Opus chunks immediately as they arrive instead of buffering until speaking_end
This makes the voice start playing the first words while the rest of the response is still generating (big win for perceived latency).
Per PLAN item 2.
2026-06-04 15:28:40 -04:00

337 lines
9.8 KiB
TypeScript

import { useState, useCallback, useRef, useEffect } from 'react';
/**
 * Per-user appearance/identity settings held by `useConversation`.
 * Values arrive from the server in the `identified` message and are changed
 * locally through `identify` / `setPreference`.
 */
export interface UserPreferences {
/** Display name the user identified with; empty string until known. */
name: string;
/** Scene identifier; the hook falls back to 'cozy-room' when none is provided. */
scene: string;
/** Outfit identifier; the hook falls back to 'cozy-hoodie' when none is provided. */
outfit: string;
/** Accessory identifier; empty string when none is set. */
accessory: string;
}
/** One entry in the conversation transcript kept in the hook's `messages` state. */
interface Message {
/** Unique id generated client-side when the message is appended. */
id: string;
/** Who produced the text: the human user or the assistant ("kira"). */
role: 'user' | 'kira';
/** Transcript text of the message. */
text: string;
/** Client clock time the message was appended, as returned by `Date.now()`. */
timestamp: number;
}
// WebSocket endpoint served from the page's own host: `wss:` when the page was
// loaded over HTTPS, `ws:` otherwise.
const WS_URL = (location.protocol === 'https:' ? 'wss:' : 'ws:') + '//' + location.host + '/api/ws';
/** localStorage key under which the user id is persisted between visits. */
const USER_ID_KEY = 'kira-user-id';

/**
 * Read the persisted user id.
 *
 * @returns The stored id, or an empty string when nothing is stored or when
 *   storage is unavailable (accessing `localStorage` can throw, e.g. when the
 *   browser blocks site data).
 */
function loadUserId(): string {
  try {
    return localStorage.getItem(USER_ID_KEY) ?? '';
  } catch {
    // Treat inaccessible storage the same as "no saved id".
    return '';
  }
}

/**
 * Persist the user id for later visits.
 *
 * Persistence is best-effort: a storage failure (blocked storage, quota
 * exceeded) is logged and otherwise ignored so that callers — which go on to
 * send the identify frame or apply preferences — are not aborted by it.
 *
 * @param id - The user id to store.
 */
function saveUserId(id: string) {
  try {
    localStorage.setItem(USER_ID_KEY, id);
  } catch (err) {
    console.warn('[Kira] could not persist user id', err);
  }
}
/**
 * Loosely-typed view of a frame received from the server. It lists only the
 * fields this hook reads; everything is optional because the payload comes
 * straight from `JSON.parse` and is not validated.
 */
interface ServerMessage {
  type?: string;
  user_id?: string;
  preferences?: Partial<UserPreferences> | null;
  role?: string;
  text?: string;
  data?: string;
  message?: string;
}

/** Delay before a dropped WebSocket connection is retried, in milliseconds. */
const RECONNECT_DELAY_MS = 3000;

/**
 * Produce an id for a transcript entry.
 * `crypto.randomUUID` is only exposed in secure contexts, and this hook is
 * explicitly expected to run on plain HTTP too (see the mic warning below),
 * so fall back to a time+random id when it is missing.
 */
function newMessageId(): string {
  if (typeof crypto !== 'undefined' && typeof crypto.randomUUID === 'function') {
    return crypto.randomUUID();
  }
  return `${Date.now().toString(36)}-${Math.random().toString(36).slice(2, 10)}`;
}

/**
 * React hook that owns the conversation session with the server.
 *
 * It opens the WebSocket at `WS_URL` on mount (retrying after
 * `RECONNECT_DELAY_MS` whenever the socket closes), identifies the user,
 * keeps the transcript and preferences in state, records microphone audio
 * with `MediaRecorder` and sends it as one base64 `audio` frame on stop, and
 * plays the audio chunks received for each response once `speaking_end`
 * arrives.
 *
 * @returns Session state (`messages`, `isConnected`, `isKiraSpeaking`,
 *   `isRecording`, `identified`, `preferences`, `loadingPrefs`, `micError`)
 *   and actions (`identify`, `setPreference`, `sendText`, `startRecording`,
 *   `stopRecording`).
 */
export function useConversation() {
  const [messages, setMessages] = useState<Message[]>([]);
  const [isConnected, setIsConnected] = useState(false);
  const [isKiraSpeaking, setIsKiraSpeaking] = useState(false);
  const [isRecording, setIsRecording] = useState(false);
  const [identified, setIdentified] = useState(false);
  const [preferences, setPreferences] = useState<UserPreferences>({
    name: '',
    scene: 'cozy-room',
    outfit: 'cozy-hoodie',
    accessory: '',
  });
  const [loadingPrefs, setLoadingPrefs] = useState(true);
  const [micError, setMicError] = useState<string | null>(null);

  const wsRef = useRef<WebSocket | null>(null);
  const audioRef = useRef<HTMLAudioElement | null>(null);
  const captureRef = useRef<{ stop: () => void } | null>(null);
  const recorderRef = useRef<MediaRecorder | null>(null);
  const streamRef = useRef<MediaStream | null>(null);
  // Audio chunks of the response currently being received.
  const audioBufferRef = useRef<Uint8Array[]>([]);
  // Object URL currently assigned to the audio element, kept so it can be revoked.
  const audioUrlRef = useRef<string | null>(null);
  // Pending reconnect timer, kept so unmount can cancel it.
  const reconnectTimerRef = useRef<ReturnType<typeof setTimeout> | null>(null);

  /** Revoke the object URL of the previous response, if any, to free its blob. */
  const releaseAudioUrl = useCallback(() => {
    if (audioUrlRef.current !== null) {
      URL.revokeObjectURL(audioUrlRef.current);
      audioUrlRef.current = null;
    }
  }, []);

  /** Append one entry to the transcript. */
  const addMessage = useCallback((role: 'user' | 'kira', text: string) => {
    // Build the entry outside the state updater so the updater stays pure.
    const entry: Message = { id: newMessageId(), role, text, timestamp: Date.now() };
    setMessages((prev) => [...prev, entry]);
  }, []);

  // Audio playback element
  useEffect(() => {
    if (!audioRef.current) {
      const player = new Audio();
      player.onended = () => {
        setIsKiraSpeaking(false);
        releaseAudioUrl();
      };
      // A decode/load failure never fires `ended`; don't leave the flag stuck on.
      player.onerror = () => setIsKiraSpeaking(false);
      audioRef.current = player;
    }
    return () => {
      audioRef.current?.pause();
      releaseAudioUrl();
    };
  }, [releaseAudioUrl]);

  // Handle incoming WS messages
  const handleMessage = useCallback(
    (msg: ServerMessage) => {
      switch (msg.type) {
        case 'identified': {
          setIdentified(true);
          setLoadingPrefs(false);
          if (msg.user_id) saveUserId(msg.user_id);
          if (msg.preferences) {
            setPreferences({
              name: msg.preferences.name || '',
              scene: msg.preferences.scene || 'cozy-room',
              outfit: msg.preferences.outfit || 'cozy-hoodie',
              accessory: msg.preferences.accessory || '',
            });
          }
          break;
        }
        case 'transcript':
          // Skip frames without text instead of storing a non-string in the transcript.
          if (typeof msg.text === 'string') {
            addMessage(msg.role === 'user' ? 'user' : 'kira', msg.text);
          }
          break;
        case 'transcript_delta':
          // Streaming partial transcript — could show as typing indicator
          break;
        case 'speaking_start':
          setIsKiraSpeaking(true);
          break;
        case 'audio': {
          // Incoming base64 audio chunk from streaming TTS; buffered until speaking_end.
          if (msg.data) {
            const binary = atob(msg.data);
            const bytes = new Uint8Array(binary.length);
            for (let i = 0; i < binary.length; i++) {
              bytes[i] = binary.charCodeAt(i);
            }
            audioBufferRef.current.push(bytes);
          }
          break;
        }
        case 'speaking_end': {
          // Take this response's chunks and reset the buffer, otherwise the
          // next response would replay everything received so far.
          const chunks = audioBufferRef.current;
          audioBufferRef.current = [];
          const player = audioRef.current;
          if (chunks.length === 0 || !player) {
            setIsKiraSpeaking(false);
            break;
          }
          const totalLen = chunks.reduce((sum, c) => sum + c.length, 0);
          const combined = new Uint8Array(totalLen);
          let offset = 0;
          for (const chunk of chunks) {
            combined.set(chunk, offset);
            offset += chunk.length;
          }
          const blob = new Blob([combined], { type: 'audio/ogg' });
          releaseAudioUrl();
          const url = URL.createObjectURL(blob);
          audioUrlRef.current = url;
          player.src = url;
          // Playback only starts now, so the "speaking" state must stay on
          // until the element reports ended/error (or play() is refused).
          setIsKiraSpeaking(true);
          player.play().catch(() => setIsKiraSpeaking(false));
          break;
        }
        case 'interruption':
          // Drop any chunks of the interrupted response so they are not
          // prepended to the next one.
          audioBufferRef.current = [];
          setIsKiraSpeaking(false);
          if (audioRef.current) {
            audioRef.current.pause();
            audioRef.current.currentTime = 0;
          }
          break;
        case 'error':
          console.error('[Kira]', msg.message);
          break;
      }
    },
    [addMessage, releaseAudioUrl],
  );

  // Connect WebSocket
  const connect = useCallback(() => {
    const current = wsRef.current;
    if (
      current &&
      (current.readyState === WebSocket.OPEN || current.readyState === WebSocket.CONNECTING)
    ) {
      return;
    }
    setLoadingPrefs(true);
    const ws = new WebSocket(WS_URL);
    wsRef.current = ws;

    ws.onopen = () => {
      setIsConnected(true);
      const savedId = loadUserId();
      if (savedId) {
        ws.send(JSON.stringify({ type: 'identify', user_id: savedId }));
      } else {
        setLoadingPrefs(false);
      }
    };

    ws.onclose = () => {
      // A socket that was replaced, or released on unmount, must neither
      // touch state nor schedule a reconnect.
      if (wsRef.current !== ws) return;
      setIsConnected(false);
      reconnectTimerRef.current = setTimeout(connect, RECONNECT_DELAY_MS);
    };

    ws.onmessage = (event) => {
      try {
        handleMessage(JSON.parse(event.data));
      } catch (err) {
        // Best-effort: a bad frame must not break the handler, but leave a trace.
        console.warn('[Kira] dropped unprocessable message', err);
      }
    };
  }, [handleMessage]);

  // ── Identity ──
  const identify = useCallback((name: string) => {
    const userId = `kira-${name.toLowerCase().replace(/[^a-z0-9]/g, '-')}`;
    saveUserId(userId);
    setPreferences((p) => ({ ...p, name }));
    if (wsRef.current?.readyState === WebSocket.OPEN) {
      wsRef.current.send(JSON.stringify({ type: 'identify', user_id: userId, name }));
    }
  }, []);

  // ── Preferences ──
  const setPreference = useCallback(
    (key: string, value: string) => {
      setPreferences((p) => ({ ...p, [key]: value }));
      // Only identified sessions push preference changes to the server.
      if (wsRef.current?.readyState === WebSocket.OPEN && identified) {
        wsRef.current.send(JSON.stringify({ type: 'set_preference', key, value }));
      }
    },
    [identified],
  );

  // ── Audio (MediaRecorder, whole utterance sent on stop) ──
  const startRecording = useCallback(async () => {
    if (!navigator.mediaDevices || !navigator.mediaDevices.getUserMedia) {
      addMessage('kira', 'Mic requires HTTPS. Try accessing via HTTPS!');
      return;
    }
    // Starting again while a capture is running would orphan the first stream.
    if (recorderRef.current?.state === 'recording') return;

    let acquired: MediaStream | null = null;
    try {
      setMicError(null);
      const stream = await navigator.mediaDevices.getUserMedia({
        audio: { echoCancellation: true, noiseSuppression: true },
      });
      acquired = stream;

      const ws = wsRef.current;
      if (!ws || ws.readyState !== WebSocket.OPEN) {
        addMessage('kira', 'Not connected to server yet...');
        stream.getTracks().forEach((t) => t.stop());
        return;
      }
      streamRef.current = stream;

      // Use MediaRecorder for full utterance blob (Opus/webm) — sent on stop for REST STT
      const mediaRecorder = new MediaRecorder(stream, { mimeType: 'audio/webm;codecs=opus' });
      const chunks: Blob[] = [];
      mediaRecorder.ondataavailable = (e) => {
        if (e.data.size > 0) chunks.push(e.data);
      };
      mediaRecorder.onstop = () => {
        if (chunks.length > 0 && ws.readyState === WebSocket.OPEN) {
          const blob = new Blob(chunks, { type: 'audio/webm' });
          blob
            .arrayBuffer()
            .then((buf) => {
              // The socket may have closed while the blob was being read.
              if (ws.readyState !== WebSocket.OPEN) return;
              ws.send(JSON.stringify({ type: 'audio', data: arrayBufferToBase64(buf) }));
            })
            .catch((err: unknown) => console.error('[Kira Mic]', err));
        }
        chunks.length = 0;
        stream.getTracks().forEach((t) => t.stop());
        if (streamRef.current === stream) streamRef.current = null;
        setIsRecording(false);
      };
      recorderRef.current = mediaRecorder;
      mediaRecorder.start();
      setIsRecording(true);
    } catch (err) {
      // If the recorder could not be created or started (e.g. unsupported
      // mimeType), release the microphone instead of leaving it live.
      if (acquired) {
        acquired.getTracks().forEach((t) => t.stop());
        if (streamRef.current === acquired) streamRef.current = null;
      }
      const msg = err instanceof Error ? err.message : String(err);
      setMicError(msg);
      console.error('[Kira Mic]', msg);
    }
  }, [addMessage]);

  const stopRecording = useCallback(() => {
    if (recorderRef.current && recorderRef.current.state === 'recording') {
      recorderRef.current.stop();
      // onstop will handle sending the blob and cleanup
    } else {
      // fallback cleanup
      streamRef.current?.getTracks().forEach((t) => t.stop());
      streamRef.current = null;
      setIsRecording(false);
    }
    captureRef.current = null; // legacy
  }, []);

  // ── Text ──
  const sendText = useCallback((text: string) => {
    if (!text.trim()) return;
    if (wsRef.current?.readyState === WebSocket.OPEN) {
      wsRef.current.send(JSON.stringify({ type: 'conversation_text', text: text.trim() }));
    }
  }, []);

  // Connect on mount
  useEffect(() => {
    connect();
    return () => {
      if (reconnectTimerRef.current !== null) {
        clearTimeout(reconnectTimerRef.current);
        reconnectTimerRef.current = null;
      }
      // Clearing the ref first marks the socket as released, so its onclose
      // handler does not schedule a reconnect after unmount.
      const ws = wsRef.current;
      wsRef.current = null;
      if (ws) {
        ws.onmessage = null;
        ws.close();
      }
      if (recorderRef.current && recorderRef.current.state === 'recording') recorderRef.current.stop();
      captureRef.current?.stop();
      streamRef.current?.getTracks().forEach((t) => t.stop());
    };
  }, [connect]);

  return {
    messages,
    isConnected,
    isKiraSpeaking,
    isRecording,
    identified,
    preferences,
    loadingPrefs,
    micError,
    identify,
    setPreference,
    sendText,
    startRecording,
    stopRecording,
  };
}
// ── Helpers ──
/**
 * Encode the raw bytes of a buffer as a base64 string.
 *
 * @param buffer - Bytes to encode.
 * @returns The base64 text produced by `btoa`; an empty buffer yields ''.
 */
function arrayBufferToBase64(buffer: ArrayBufferLike): string {
  const view = new Uint8Array(buffer);
  // btoa works on a "binary string": one character (code 0–255) per byte.
  const chars: string[] = [];
  view.forEach((byte) => {
    chars.push(String.fromCharCode(byte));
  });
  return btoa(chars.join(''));
}
/**
 * Tap a microphone stream and hand its audio to `onChunk` as 16-bit PCM.
 *
 * An `AudioContext` created at 24 kHz feeds the stream through a
 * single-channel `ScriptProcessorNode` (4096-frame buffers); each buffer is
 * converted from float samples to signed 16-bit integers and delivered as the
 * raw bytes of that `Int16Array`.
 *
 * @param stream - Media stream to read audio from.
 * @param onChunk - Called once per processed buffer with the PCM16 bytes.
 * @returns A handle whose `stop()` silences the callback, disconnects the
 *   nodes and closes the audio context.
 */
function startPCMCapture(
  stream: MediaStream,
  onChunk: (pcm16: Uint8Array) => void,
): { stop: () => void } {
  const audioCtx = new AudioContext({ sampleRate: 24000 });
  const micNode = audioCtx.createMediaStreamSource(stream);
  const tapNode = audioCtx.createScriptProcessor(4096, 1, 1);
  let active = true;

  // Clamp to [-1, 1], then scale: negatives by 0x8000, non-negatives by 0x7fff.
  const toInt16 = (sample: number): number => {
    const clamped = Math.max(-1, Math.min(1, sample));
    return clamped < 0 ? clamped * 0x8000 : clamped * 0x7fff;
  };

  tapNode.onaudioprocess = (event) => {
    if (active) {
      const samples = Int16Array.from(event.inputBuffer.getChannelData(0), toInt16);
      onChunk(new Uint8Array(samples.buffer));
    }
  };

  micNode.connect(tapNode);
  // The processor is wired to the destination as well as to the source.
  tapNode.connect(audioCtx.destination);

  const stop = (): void => {
    active = false;
    micNode.disconnect();
    tapNode.disconnect();
    void audioCtx.close();
  };
  return { stop };
}