// Source revision 77cbd91b93 (337 lines, 9.8 KiB, TypeScript).
// Revision note: This change makes the voice start playing the first words while the rest of
// the response is still generating (big win for perceived latency). Per PLAN item 2.
import { useState, useCallback, useRef, useEffect } from 'react';
|
|
|
|
/**
 * Per-user settings held by the conversation hook and mirrored to the server
 * through `set_preference` frames.
 */
export interface UserPreferences {
  /** Display name entered by the user; empty until they identify. */
  name: string;
  /** Identifier of the selected scene (the hook defaults it to `'cozy-room'`). */
  scene: string;
  /** Identifier of the selected outfit (the hook defaults it to `'cozy-hoodie'`). */
  outfit: string;
  /** Identifier of the selected accessory; empty string when none is chosen. */
  accessory: string;
}
|
|
|
|
/** One entry of the conversation transcript kept in hook state. */
interface Message {
  /** Client-generated identifier, unique within the session. */
  id: string;
  /** Who produced the text: the local user or the assistant ("kira"). */
  role: 'user' | 'kira';
  /** Transcript text of the turn. */
  text: string;
  /** Client-side creation time, as returned by `Date.now()`. */
  timestamp: number;
}
|
|
|
|
/**
 * Builds the WebSocket endpoint for a page served from `loc`: `wss:` when the
 * page itself was loaded over HTTPS, `ws:` otherwise, always on the same host.
 */
function buildWsUrl(loc: { protocol: string; host: string }): string {
  const scheme = loc.protocol === 'https:' ? 'wss:' : 'ws:';
  return `${scheme}//${loc.host}/api/ws`;
}

/**
 * WebSocket endpoint used by the conversation hook.
 *
 * `location` only exists in a browser. The guard keeps merely importing this
 * module (server-side rendering, unit tests) from throwing a ReferenceError;
 * the localhost value is a placeholder for those environments only.
 */
const WS_URL = typeof location === 'undefined' ? 'ws://localhost/api/ws' : buildWsUrl(location);

/** localStorage key under which the user id is persisted between visits. */
const USER_ID_KEY = 'kira-user-id';
|
|
|
|
/**
 * Reads the persisted user id.
 *
 * @returns The stored id, or an empty string when nothing is stored or when
 *   storage cannot be read.
 */
function loadUserId(): string {
  try {
    return localStorage.getItem(USER_ID_KEY) ?? '';
  } catch {
    // Merely touching localStorage can throw (e.g. SecurityError when site data
    // is blocked); treat that the same as "no saved id".
    return '';
  }
}
|
|
|
|
/**
 * Persists the user id so the next visit can re-identify automatically.
 *
 * Persistence is best-effort: a storage failure is logged and swallowed so it
 * cannot abort the caller (for example the handler of the server's
 * `identified` frame, which still has preferences to apply).
 *
 * @param id - User id to store.
 */
function saveUserId(id: string): void {
  try {
    localStorage.setItem(USER_ID_KEY, id);
  } catch (err) {
    console.warn('[Kira] could not persist user id', err);
  }
}
|
|
|
|
/** Milliseconds to wait before re-opening a dropped WebSocket. */
const RECONNECT_DELAY_MS = 3000;

/**
 * Loose view of a JSON frame received from the server. Every field is
 * `unknown` because the payload is untrusted until narrowed where it is used.
 */
interface ServerFrame {
  type?: unknown;
  user_id?: unknown;
  preferences?: unknown;
  role?: unknown;
  text?: unknown;
  data?: unknown;
  message?: unknown;
}

/** Returns `value` when it is a non-empty string, otherwise `fallback`. */
function stringOr(value: unknown, fallback: string): string {
  return typeof value === 'string' && value !== '' ? value : fallback;
}

/**
 * Creates an id for a transcript entry.
 *
 * `crypto.randomUUID` is only exposed in secure contexts, and this hook posts
 * its "Mic requires HTTPS" notice precisely when the page is NOT secure, so a
 * non-cryptographic fallback is required for that path to work at all.
 */
function createMessageId(): string {
  if (typeof crypto !== 'undefined' && typeof crypto.randomUUID === 'function') {
    return crypto.randomUUID();
  }
  return `msg-${Date.now().toString(36)}-${Math.random().toString(36).slice(2, 10)}`;
}

/**
 * React hook that owns the conversation session: the WebSocket to `WS_URL`
 * (with automatic reconnect), user identification and preferences, microphone
 * recording, and playback of the assistant's audio.
 *
 * Audio handling: base64 `audio` frames are buffered and played as a single
 * `audio/ogg` blob once `speaking_end` arrives; playback is not incremental.
 *
 * @returns Reactive state (`messages`, `isConnected`, `isKiraSpeaking`,
 *   `isRecording`, `identified`, `preferences`, `loadingPrefs`, `micError`)
 *   and actions (`identify`, `setPreference`, `sendText`, `startRecording`,
 *   `stopRecording`).
 */
export function useConversation() {
  const [messages, setMessages] = useState<Message[]>([]);
  const [isConnected, setIsConnected] = useState(false);
  const [isKiraSpeaking, setIsKiraSpeaking] = useState(false);
  const [isRecording, setIsRecording] = useState(false);
  const [identified, setIdentified] = useState(false);
  const [preferences, setPreferences] = useState<UserPreferences>({
    name: '',
    scene: 'cozy-room',
    outfit: 'cozy-hoodie',
    accessory: '',
  });
  const [loadingPrefs, setLoadingPrefs] = useState(true);
  const [micError, setMicError] = useState<string | null>(null);

  const wsRef = useRef<WebSocket | null>(null);
  const reconnectTimerRef = useRef<ReturnType<typeof setTimeout> | null>(null);
  // True between unmount cleanup and the next mount; blocks reconnects and late async work.
  const disposedRef = useRef(false);
  const audioRef = useRef<HTMLAudioElement | null>(null);
  // Object URL currently assigned to the audio element, so it can be revoked.
  const audioUrlRef = useRef<string | null>(null);
  const captureRef = useRef<{ stop: () => void } | null>(null);
  const recorderRef = useRef<MediaRecorder | null>(null);
  const streamRef = useRef<MediaStream | null>(null);
  // Decoded audio chunks of the reply currently being received.
  const audioBufferRef = useRef<Uint8Array[]>([]);

  /** Revokes the object URL of the last played reply, if any. */
  const releaseAudioUrl = useCallback(() => {
    if (audioUrlRef.current !== null) {
      URL.revokeObjectURL(audioUrlRef.current);
      audioUrlRef.current = null;
    }
  }, []);

  /** Appends one entry to the transcript. */
  const addMessage = useCallback((role: 'user' | 'kira', text: string) => {
    setMessages((prev) => [
      ...prev,
      { id: createMessageId(), role, text, timestamp: Date.now() },
    ]);
  }, []);

  // Audio playback element (created once, reused for every reply).
  useEffect(() => {
    if (audioRef.current) return;
    const audio = new Audio();
    const finishPlayback = () => {
      setIsKiraSpeaking(false);
      releaseAudioUrl();
    };
    audio.onended = finishPlayback;
    // A decode/load failure never fires `ended`; without this the flag would stick.
    audio.onerror = finishPlayback;
    audioRef.current = audio;
  }, [releaseAudioUrl]);

  /** Applies one parsed server frame to hook state. */
  const handleMessage = useCallback((msg: ServerFrame) => {
    switch (msg.type) {
      case 'identified': {
        setIdentified(true);
        setLoadingPrefs(false);
        if (typeof msg.user_id === 'string' && msg.user_id) saveUserId(msg.user_id);
        const stored = msg.preferences;
        if (typeof stored === 'object' && stored !== null) {
          const prefs = stored as Record<string, unknown>;
          // Empty strings fall back to the defaults, as missing values do.
          setPreferences({
            name: stringOr(prefs.name, ''),
            scene: stringOr(prefs.scene, 'cozy-room'),
            outfit: stringOr(prefs.outfit, 'cozy-hoodie'),
            accessory: stringOr(prefs.accessory, ''),
          });
        }
        break;
      }

      case 'transcript':
        if (typeof msg.text === 'string') {
          addMessage(msg.role === 'user' ? 'user' : 'kira', msg.text);
        }
        break;

      case 'transcript_delta':
        // Streaming partial transcript — could show as typing indicator
        break;

      case 'speaking_start':
        setIsKiraSpeaking(true);
        break;

      case 'audio': {
        // Incoming Opus audio chunk from streaming TTS
        if (typeof msg.data === 'string' && msg.data) {
          let binary: string;
          try {
            binary = atob(msg.data);
          } catch (err) {
            console.error('[Kira] dropping undecodable audio chunk', err);
            break;
          }
          const bytes = new Uint8Array(binary.length);
          for (let i = 0; i < binary.length; i++) {
            bytes[i] = binary.charCodeAt(i);
          }
          audioBufferRef.current.push(bytes);
        }
        break;
      }

      case 'speaking_end': {
        // Take ownership of the buffered chunks and reset the buffer so the
        // next reply does not replay this one.
        const chunks = audioBufferRef.current;
        audioBufferRef.current = [];
        const audio = audioRef.current;
        if (chunks.length === 0 || !audio) {
          setIsKiraSpeaking(false);
          break;
        }

        // Play all accumulated chunks as one blob
        const totalLen = chunks.reduce((sum, chunk) => sum + chunk.length, 0);
        const combined = new Uint8Array(totalLen);
        let offset = 0;
        for (const chunk of chunks) {
          combined.set(chunk, offset);
          offset += chunk.length;
        }

        releaseAudioUrl();
        const url = URL.createObjectURL(new Blob([combined], { type: 'audio/ogg' }));
        audioUrlRef.current = url;
        audio.src = url;
        // Stay "speaking" until playback ends (see the audio element's
        // ended/error handlers) rather than clearing the flag before any
        // sound has been produced.
        setIsKiraSpeaking(true);
        audio.play().catch((err: unknown) => {
          // A rejection for a superseded or interrupted reply is expected; ignore it.
          if (audioUrlRef.current !== url) return;
          console.warn('[Kira] audio playback failed', err);
          setIsKiraSpeaking(false);
          releaseAudioUrl();
        });
        break;
      }

      case 'interruption':
        setIsKiraSpeaking(false);
        // Discard chunks of the interrupted reply so they are never played.
        audioBufferRef.current = [];
        if (audioRef.current) {
          audioRef.current.pause();
          audioRef.current.currentTime = 0;
        }
        releaseAudioUrl();
        break;

      case 'error':
        console.error('[Kira]', msg.message);
        break;
    }
  }, [addMessage, releaseAudioUrl]);

  // Connect WebSocket
  const connect = useCallback(() => {
    if (disposedRef.current) return;
    const current = wsRef.current;
    if (
      current &&
      (current.readyState === WebSocket.OPEN || current.readyState === WebSocket.CONNECTING)
    ) {
      return;
    }
    setLoadingPrefs(true);

    const ws = new WebSocket(WS_URL);
    wsRef.current = ws;

    ws.onopen = () => {
      setIsConnected(true);
      const savedId = loadUserId();
      if (savedId) {
        ws.send(JSON.stringify({ type: 'identify', user_id: savedId }));
      } else {
        setLoadingPrefs(false);
      }
    };

    ws.onclose = () => {
      // Ignore close events from a socket that has already been replaced.
      if (wsRef.current !== ws) return;
      setIsConnected(false);
      if (disposedRef.current) return;
      if (reconnectTimerRef.current !== null) clearTimeout(reconnectTimerRef.current);
      reconnectTimerRef.current = setTimeout(() => {
        reconnectTimerRef.current = null;
        connect();
      }, RECONNECT_DELAY_MS);
    };

    ws.onmessage = (event) => {
      let frame: unknown;
      try {
        frame = JSON.parse(event.data);
      } catch {
        return; // not JSON — ignore
      }
      if (typeof frame !== 'object' || frame === null) return;
      try {
        handleMessage(frame as ServerFrame);
      } catch (err) {
        console.error('[Kira] failed to handle server message', err);
      }
    };
  }, [handleMessage]);

  // ── Identity ──

  /**
   * Derives a user id from `name`, persists it, and announces it to the
   * server when the socket is open.
   */
  const identify = useCallback((name: string) => {
    const userId = `kira-${name.toLowerCase().replace(/[^a-z0-9]/g, '-')}`;
    saveUserId(userId);
    setPreferences((p) => ({ ...p, name }));

    if (wsRef.current?.readyState === WebSocket.OPEN) {
      wsRef.current.send(JSON.stringify({ type: 'identify', user_id: userId, name }));
    }
  }, []);

  // ── Preferences ──

  /**
   * Updates one preference locally and, once the user is identified and the
   * socket is open, forwards the change to the server.
   */
  const setPreference = useCallback((key: string, value: string) => {
    setPreferences((p) => ({ ...p, [key]: value }));
    if (wsRef.current?.readyState === WebSocket.OPEN && identified) {
      wsRef.current.send(JSON.stringify({ type: 'set_preference', key, value }));
    }
  }, [identified]);

  // ── Audio ──

  /**
   * Starts recording one utterance from the microphone. The recording is sent
   * to the server as a single base64 `audio` frame when it stops.
   */
  const startRecording = useCallback(async () => {
    if (!navigator.mediaDevices || !navigator.mediaDevices.getUserMedia) {
      addMessage('kira', 'Mic requires HTTPS. Try accessing via HTTPS!');
      return;
    }
    // A second start while recording would orphan the first stream.
    if (recorderRef.current?.state === 'recording') return;

    let stream: MediaStream | null = null;
    try {
      setMicError(null);
      stream = await navigator.mediaDevices.getUserMedia({
        audio: { echoCancellation: true, noiseSuppression: true },
      });
      const activeStream = stream;

      if (disposedRef.current) {
        // Unmounted while the permission prompt was open.
        activeStream.getTracks().forEach((t) => t.stop());
        return;
      }
      streamRef.current = activeStream;

      const ws = wsRef.current;
      if (!ws || ws.readyState !== WebSocket.OPEN) {
        addMessage('kira', 'Not connected to server yet...');
        activeStream.getTracks().forEach((t) => t.stop());
        streamRef.current = null;
        return;
      }

      // Use MediaRecorder for full utterance blob (Opus/webm) — sent on stop for REST STT
      const mediaRecorder = new MediaRecorder(activeStream, { mimeType: 'audio/webm;codecs=opus' });
      const chunks: Blob[] = [];
      mediaRecorder.ondataavailable = (e) => {
        if (e.data.size > 0) chunks.push(e.data);
      };
      mediaRecorder.onstop = () => {
        if (chunks.length > 0 && ws.readyState === WebSocket.OPEN) {
          const blob = new Blob(chunks, { type: 'audio/webm' });
          blob
            .arrayBuffer()
            .then((buf) => {
              // The socket may have closed while the blob was being read.
              if (ws.readyState !== WebSocket.OPEN) return;
              ws.send(JSON.stringify({ type: 'audio', data: arrayBufferToBase64(buf) }));
            })
            .catch((err: unknown) => {
              console.error('[Kira Mic] failed to send recording', err);
            });
        }
        chunks.length = 0;
        activeStream.getTracks().forEach((t) => t.stop());
        if (streamRef.current === activeStream) streamRef.current = null;
        if (recorderRef.current === mediaRecorder) recorderRef.current = null;
        setIsRecording(false);
      };
      recorderRef.current = mediaRecorder;
      mediaRecorder.start();
      setIsRecording(true);
    } catch (err) {
      // Release the microphone if the failure happened after it was acquired
      // (e.g. the MediaRecorder constructor rejected the MIME type).
      if (stream) {
        stream.getTracks().forEach((t) => t.stop());
        if (streamRef.current === stream) streamRef.current = null;
      }
      const msg = err instanceof Error ? err.message : String(err);
      setMicError(msg);
      console.error('[Kira Mic]', msg);
    }
  }, [addMessage]);

  /** Stops the current recording; the recorder's `onstop` sends it and cleans up. */
  const stopRecording = useCallback(() => {
    if (recorderRef.current && recorderRef.current.state === 'recording') {
      recorderRef.current.stop();
      // onstop will handle sending the blob and cleanup
    } else {
      // fallback cleanup
      streamRef.current?.getTracks().forEach((t) => t.stop());
      streamRef.current = null;
      setIsRecording(false);
    }
    captureRef.current = null; // legacy
  }, []);

  // ── Text ──

  /** Sends a typed message to the server; blank text is ignored. */
  const sendText = useCallback((text: string) => {
    if (!text.trim()) return;
    if (wsRef.current?.readyState === WebSocket.OPEN) {
      wsRef.current.send(JSON.stringify({ type: 'conversation_text', text: text.trim() }));
    }
  }, []);

  // Connect on mount; tear everything down on unmount.
  useEffect(() => {
    disposedRef.current = false;
    connect();
    return () => {
      disposedRef.current = true;
      if (reconnectTimerRef.current !== null) {
        clearTimeout(reconnectTimerRef.current);
        reconnectTimerRef.current = null;
      }
      const ws = wsRef.current;
      wsRef.current = null;
      if (ws) {
        // Detach handlers first so closing cannot schedule a reconnect.
        ws.onopen = null;
        ws.onmessage = null;
        ws.onclose = null;
        ws.close();
      }
      if (recorderRef.current && recorderRef.current.state === 'recording') recorderRef.current.stop();
      captureRef.current?.stop();
      streamRef.current?.getTracks().forEach((t) => t.stop());
      audioRef.current?.pause();
      releaseAudioUrl();
    };
  }, [connect, releaseAudioUrl]);

  return {
    messages,
    isConnected,
    isKiraSpeaking,
    isRecording,
    identified,
    preferences,
    loadingPrefs,
    micError,
    identify,
    setPreference,
    sendText,
    startRecording,
    stopRecording,
  };
}
|
|
|
|
// ── Helpers ──
|
|
|
|
/**
 * Encodes the bytes of `buffer` as a base64 string.
 *
 * Bytes are converted to a binary string in fixed-size slices: one
 * `String.fromCharCode` call per slice avoids both per-byte string
 * concatenation and the engine's argument-count limit on large recordings.
 *
 * @param buffer - Raw bytes to encode.
 * @returns The base64 text; an empty string for an empty buffer.
 */
function arrayBufferToBase64(buffer: ArrayBufferLike): string {
  const bytes = new Uint8Array(buffer);
  const sliceSize = 0x8000;
  const parts: string[] = [];
  for (let start = 0; start < bytes.length; start += sliceSize) {
    parts.push(String.fromCharCode(...bytes.subarray(start, start + sliceSize)));
  }
  return btoa(parts.join(''));
}
|
|
|
|
/**
 * Captures mono audio from `stream` and delivers it as 16-bit PCM chunks.
 *
 * An `AudioContext` is created with a requested sample rate of 24 kHz and a
 * 4096-frame script processor; each processed buffer is clamped to [-1, 1],
 * converted to signed 16-bit samples and passed to `onChunk` as the raw bytes
 * of that `Int16Array` (platform byte order).
 *
 * NOTE(review): this relies on `createScriptProcessor`, which is deprecated in
 * favour of AudioWorklet, and nothing in the visible hook calls this function
 * (recording there goes through MediaRecorder) — confirm it is still needed.
 *
 * @param stream - Microphone stream to read from.
 * @param onChunk - Receives each converted chunk.
 * @returns A handle whose `stop()` ends the capture and releases the audio
 *   graph; calling it more than once is safe.
 */
function startPCMCapture(
  stream: MediaStream,
  onChunk: (pcm16: Uint8Array) => void,
): { stop: () => void } {
  const ctx = new AudioContext({ sampleRate: 24000 });
  const source = ctx.createMediaStreamSource(stream);
  const processor = ctx.createScriptProcessor(4096, 1, 1);
  let running = true;

  processor.onaudioprocess = (e) => {
    if (!running) return;
    const input = e.inputBuffer.getChannelData(0);
    const pcm16 = new Int16Array(input.length);
    for (let i = 0; i < input.length; i++) {
      const clamped = Math.max(-1, Math.min(1, input[i] ?? 0));
      // Asymmetric scale: the int16 range is [-32768, 32767].
      pcm16[i] = clamped < 0 ? clamped * 0x8000 : clamped * 0x7fff;
    }
    onChunk(new Uint8Array(pcm16.buffer));
  };

  source.connect(processor);
  // The processor is connected to the destination so that `onaudioprocess` keeps firing.
  processor.connect(ctx.destination);

  return {
    stop: () => {
      // Closing an already-closed context rejects, so only tear down once.
      if (!running) return;
      running = false;
      processor.onaudioprocess = null;
      source.disconnect();
      processor.disconnect();
      void ctx.close().catch(() => {
        /* context already closed — nothing left to release */
      });
    },
  };
}
|