feat: OpenAI Realtime API pipeline
Replaced the 3-step sequential pipeline (Whisper STT → DeepSeek LLM → OpenAI TTS) with a single OpenAI Realtime API WebSocket using gpt-4o-mini-realtime-preview.
- ~300-800 ms latency vs. 1-3 s
- Server VAD for automatic turn detection
- Streaming audio chunks during playback
- Interruptions: the user can speak over Kira mid-response
- Honcho memory is still injected into the session instructions
- Frontend captures PCM16 mono 24 kHz audio via AudioContext
- Backend relays client ↔ OpenAI Realtime API
- Supports both voice (PCM16) and text input
This commit is contained in:
@@ -25,6 +25,41 @@ function saveUserId(id: string) {
|
||||
localStorage.setItem(USER_ID_KEY, id);
|
||||
}
|
||||
|
||||
/**
 * Capture microphone audio as PCM16 mono and deliver it chunk-by-chunk.
 *
 * An AudioContext is created with a requested sample rate of 24 kHz and the
 * given stream is routed through a ScriptProcessorNode (4096-frame buffer,
 * one input channel, one output channel). Every processing callback converts
 * the Float32 samples of channel 0 to signed 16-bit integers and passes the
 * raw bytes (2 bytes per sample, in the platform's native byte order) to
 * `onChunk`.
 *
 * @param stream  Media stream to read audio from.
 * @param onChunk Receives the bytes of each converted buffer.
 * @returns A handle whose `stop()` halts delivery, disconnects the audio
 *          nodes and closes the AudioContext. `stop()` is idempotent: calls
 *          after the first one do nothing.
 */
function startPCMCapture(
  stream: MediaStream,
  onChunk: (pcm16: Uint8Array) => void,
): { stop: () => void } {
  const ctx = new AudioContext({ sampleRate: 24000 });
  const source = ctx.createMediaStreamSource(stream);
  const processor = ctx.createScriptProcessor(4096, 1, 1);
  let running = true;

  processor.onaudioprocess = (e) => {
    if (!running) return;
    const input = e.inputBuffer.getChannelData(0); // Float32Array [-1, 1]
    // Convert float32 → PCM16 int16. Samples are clamped to [-1, 1] first;
    // negatives scale by 0x8000 and positives by 0x7fff so both ends of the
    // int16 range are reachable without overflow.
    const pcm16 = new Int16Array(input.length);
    for (let i = 0; i < input.length; i++) {
      const s = Math.max(-1, Math.min(1, input[i]));
      pcm16[i] = s < 0 ? s * 0x8000 : s * 0x7fff;
    }
    onChunk(new Uint8Array(pcm16.buffer));
  };

  source.connect(processor);
  // NOTE(review): the processor's output buffer is never written here, so this
  // connection presumably exists only to keep the node being processed — confirm.
  processor.connect(ctx.destination);

  return {
    stop: () => {
      // Guard against repeated calls: closing an already-closed AudioContext
      // rejects, which previously surfaced as an unhandled promise rejection.
      if (!running) return;
      running = false;
      processor.onaudioprocess = null;
      source.disconnect();
      processor.disconnect();
      ctx.close().catch((err: unknown) => {
        console.warn('[Kira Mic] failed to close AudioContext:', err);
      });
    },
  };
}
|
||||
|
||||
export function useConversation() {
|
||||
const [messages, setMessages] = useState<Message[]>([]);
|
||||
const [isConnected, setIsConnected] = useState(false);
|
||||
@@ -38,11 +73,13 @@ export function useConversation() {
|
||||
accessory: '',
|
||||
});
|
||||
const [loadingPrefs, setLoadingPrefs] = useState(true);
|
||||
const [micError, setMicError] = useState<string | null>(null);
|
||||
|
||||
const wsRef = useRef<WebSocket | null>(null);
|
||||
const audioRef = useRef<HTMLAudioElement | null>(null);
|
||||
const recorderRef = useRef<MediaRecorder | null>(null);
|
||||
const captureRef = useRef<{ stop: () => void } | null>(null);
|
||||
const streamRef = useRef<MediaStream | null>(null);
|
||||
const audioBufferRef = useRef<Uint8Array[]>([]);
|
||||
|
||||
// Connect WebSocket
|
||||
const connect = useCallback(() => {
|
||||
@@ -54,7 +91,6 @@ export function useConversation() {
|
||||
|
||||
ws.onopen = () => {
|
||||
setIsConnected(true);
|
||||
// Auto-identify if returning user
|
||||
const savedId = loadUserId();
|
||||
if (savedId) {
|
||||
ws.send(JSON.stringify({ type: 'identify', user_id: savedId }));
|
||||
@@ -102,35 +138,56 @@ export function useConversation() {
|
||||
break;
|
||||
}
|
||||
|
||||
case 'preference_saved':
|
||||
// Already optimistically updated locally
|
||||
break;
|
||||
|
||||
case 'transcript':
|
||||
addMessage('user', msg.text);
|
||||
addMessage(msg.role === 'user' ? 'user' : 'kira', msg.text);
|
||||
break;
|
||||
|
||||
case 'speaking_start':
|
||||
setIsKiraSpeaking(true);
|
||||
addMessage('kira', msg.text || '...');
|
||||
break;
|
||||
|
||||
case 'audio':
|
||||
case 'audio': {
|
||||
// Incoming PCM16 audio from Kira
|
||||
if (msg.data && audioRef.current) {
|
||||
// Accumulate audio chunks and create a blob
|
||||
const binary = atob(msg.data);
|
||||
const bytes = new Uint8Array(binary.length);
|
||||
for (let i = 0; i < binary.length; i++) {
|
||||
bytes[i] = binary.charCodeAt(i);
|
||||
}
|
||||
const blob = new Blob([bytes], { type: 'audio/ogg' });
|
||||
audioBufferRef.current.push(bytes);
|
||||
|
||||
// Convert accumulated PCM16 to WAV blob for playback
|
||||
const allChunks = audioBufferRef.current;
|
||||
const totalLen = allChunks.reduce((s, c) => s + c.length, 0);
|
||||
const combined = new Uint8Array(totalLen);
|
||||
let offset = 0;
|
||||
for (const chunk of allChunks) {
|
||||
combined.set(chunk, offset);
|
||||
offset += chunk.length;
|
||||
}
|
||||
|
||||
const wav = pcm16ToWav(combined);
|
||||
const blob = new Blob([wav], { type: 'audio/wav' });
|
||||
const url = URL.createObjectURL(blob);
|
||||
audioRef.current.src = url;
|
||||
audioRef.current.play().catch(() => {});
|
||||
}
|
||||
break;
|
||||
}
|
||||
|
||||
case 'speaking_end':
|
||||
setIsKiraSpeaking(false);
|
||||
audioBufferRef.current = [];
|
||||
break;
|
||||
|
||||
case 'interruption':
|
||||
setIsKiraSpeaking(false);
|
||||
audioBufferRef.current = [];
|
||||
if (audioRef.current) {
|
||||
audioRef.current.pause();
|
||||
audioRef.current.currentTime = 0;
|
||||
}
|
||||
break;
|
||||
|
||||
case 'error':
|
||||
@@ -154,99 +211,80 @@ export function useConversation() {
|
||||
setPreferences((p) => ({ ...p, name }));
|
||||
|
||||
if (wsRef.current?.readyState === WebSocket.OPEN) {
|
||||
wsRef.current.send(JSON.stringify({
|
||||
type: 'identify',
|
||||
user_id: userId,
|
||||
name,
|
||||
}));
|
||||
wsRef.current.send(JSON.stringify({ type: 'identify', user_id: userId, name }));
|
||||
}
|
||||
}, []);
|
||||
|
||||
// ── Preferences ──
|
||||
|
||||
const setPreference = useCallback((key: string, value: string) => {
|
||||
// Optimistic update
|
||||
setPreferences((p) => ({ ...p, [key]: value }));
|
||||
|
||||
// Sync to backend
|
||||
if (wsRef.current?.readyState === WebSocket.OPEN && identified) {
|
||||
wsRef.current.send(JSON.stringify({
|
||||
type: 'set_preference',
|
||||
key,
|
||||
value,
|
||||
}));
|
||||
wsRef.current.send(JSON.stringify({ type: 'set_preference', key, value }));
|
||||
}
|
||||
}, [identified]);
|
||||
|
||||
// ── Audio (Realtime PCM16) ──
|
||||
|
||||
const startRecording = useCallback(async () => {
|
||||
// Check HTTPS
|
||||
if (!navigator.mediaDevices || !navigator.mediaDevices.getUserMedia) {
|
||||
addMessage('kira', 'Mic requires HTTPS. Try accessing via HTTPS!');
|
||||
return;
|
||||
}
|
||||
|
||||
try {
|
||||
setMicError(null);
|
||||
const stream = await navigator.mediaDevices.getUserMedia({ audio: true });
|
||||
streamRef.current = stream;
|
||||
|
||||
const ws = wsRef.current;
|
||||
if (!ws || ws.readyState !== WebSocket.OPEN) {
|
||||
addMessage('kira', 'Not connected to server yet...');
|
||||
stream.getTracks().forEach((t) => t.stop());
|
||||
return;
|
||||
}
|
||||
|
||||
// Start PCM16 capture — each chunk sent as WS message
|
||||
const capture = startPCMCapture(stream, (pcm16) => {
|
||||
if (ws.readyState === WebSocket.OPEN) {
|
||||
const base64 = arrayBufferToBase64(pcm16.buffer);
|
||||
ws.send(JSON.stringify({ type: 'audio', data: base64 }));
|
||||
}
|
||||
});
|
||||
|
||||
captureRef.current = capture;
|
||||
setIsRecording(true);
|
||||
} catch (err) {
|
||||
const msg = err instanceof Error ? err.message : String(err);
|
||||
setMicError(msg);
|
||||
console.error('[Kira Mic]', msg);
|
||||
}
|
||||
}, [addMessage]);
|
||||
|
||||
const stopRecording = useCallback(() => {
|
||||
captureRef.current?.stop();
|
||||
captureRef.current = null;
|
||||
streamRef.current?.getTracks().forEach((t) => t.stop());
|
||||
streamRef.current = null;
|
||||
setIsRecording(false);
|
||||
}, []);
|
||||
|
||||
// ── Text ──
|
||||
|
||||
const sendText = useCallback((text: string) => {
|
||||
if (!text.trim()) return;
|
||||
if (wsRef.current?.readyState === WebSocket.OPEN) {
|
||||
wsRef.current.send(JSON.stringify({
|
||||
type: 'conversation_text',
|
||||
text: text.trim(),
|
||||
}));
|
||||
wsRef.current.send(JSON.stringify({ type: 'conversation_text', text: text.trim() }));
|
||||
}
|
||||
}, []);
|
||||
|
||||
// ── Audio ──
|
||||
|
||||
const startRecording = useCallback(async () => {
|
||||
// Check if mediaDevices is available (requires HTTPS/localhost)
|
||||
if (!navigator.mediaDevices || !navigator.mediaDevices.getUserMedia) {
|
||||
addMessage('kira', 'Your browser needs HTTPS to use the microphone. Try accessing Kira through the HTTPS address instead!');
|
||||
return;
|
||||
}
|
||||
|
||||
try {
|
||||
const stream = await navigator.mediaDevices.getUserMedia({ audio: true });
|
||||
streamRef.current = stream;
|
||||
|
||||
const recorder = new MediaRecorder(stream, {
|
||||
mimeType: MediaRecorder.isTypeSupported('audio/webm;codecs=opus')
|
||||
? 'audio/webm;codecs=opus'
|
||||
: 'audio/webm',
|
||||
});
|
||||
|
||||
const chunks: BlobPart[] = [];
|
||||
recorder.ondataavailable = (e) => {
|
||||
if (e.data.size > 0) chunks.push(e.data);
|
||||
};
|
||||
|
||||
recorder.onstop = () => {
|
||||
const blob = new Blob(chunks, { type: 'audio/webm' });
|
||||
const reader = new FileReader();
|
||||
reader.onload = () => {
|
||||
const base64 = (reader.result as string).split(',')[1];
|
||||
if (wsRef.current?.readyState === WebSocket.OPEN) {
|
||||
wsRef.current.send(JSON.stringify({ type: 'audio_chunk', data: base64 }));
|
||||
wsRef.current.send(JSON.stringify({ type: 'transcribe' }));
|
||||
}
|
||||
};
|
||||
reader.readAsDataURL(blob);
|
||||
|
||||
stream.getTracks().forEach((t) => t.stop());
|
||||
setIsRecording(false);
|
||||
};
|
||||
|
||||
recorder.start();
|
||||
recorderRef.current = recorder;
|
||||
setIsRecording(true);
|
||||
} catch (err) {
|
||||
console.error('[Kira Mic] failed:', err);
|
||||
}
|
||||
}, []);
|
||||
|
||||
const stopRecording = useCallback(() => {
|
||||
recorderRef.current?.stop();
|
||||
}, []);
|
||||
|
||||
// Connect on mount
|
||||
useEffect(() => {
|
||||
connect();
|
||||
return () => {
|
||||
wsRef.current?.close();
|
||||
captureRef.current?.stop();
|
||||
streamRef.current?.getTracks().forEach((t) => t.stop());
|
||||
};
|
||||
}, [connect]);
|
||||
@@ -259,6 +297,7 @@ export function useConversation() {
|
||||
identified,
|
||||
preferences,
|
||||
loadingPrefs,
|
||||
micError,
|
||||
identify,
|
||||
setPreference,
|
||||
sendText,
|
||||
@@ -266,3 +305,61 @@ export function useConversation() {
|
||||
stopRecording,
|
||||
};
|
||||
}
|
||||
|
||||
// ── Helpers ──
|
||||
|
||||
/**
 * Base64-encode the full contents of a buffer.
 *
 * Each byte is mapped to the character with the same code and the resulting
 * binary string is passed through `btoa`.
 *
 * @param buffer Buffer whose bytes are encoded.
 * @returns The base64 text; an empty string for an empty buffer.
 */
function arrayBufferToBase64(buffer: ArrayBufferLike): string {
  const chars = Array.from(new Uint8Array(buffer), (byte) => String.fromCharCode(byte));
  return btoa(chars.join(''));
}
|
||||
|
||||
/**
 * Wrap raw PCM16 bytes in a WAV container.
 *
 * A 44-byte RIFF/WAVE header describing uncompressed PCM, 1 channel,
 * 24000 Hz, 16 bits per sample is written in little-endian form, followed by
 * the input bytes unchanged.
 *
 * @param pcm16 Raw sample bytes to place in the `data` chunk.
 * @returns A new ArrayBuffer of `44 + pcm16.length` bytes holding the WAV file.
 */
function pcm16ToWav(pcm16: Uint8Array): ArrayBuffer {
  const HEADER_BYTES = 44;
  const channels = 1;
  const rate = 24000;
  const bits = 16;
  const frameBytes = channels * (bits / 8); // block align
  const bytesPerSecond = rate * frameBytes; // byte rate
  const payloadBytes = pcm16.length;

  const wav = new ArrayBuffer(HEADER_BYTES + payloadBytes);
  const header = new DataView(wav);

  // RIFF chunk descriptor; the size field excludes the 8-byte "RIFF"+size prefix.
  writeString(header, 0, 'RIFF');
  header.setUint32(4, HEADER_BYTES + payloadBytes - 8, true);
  writeString(header, 8, 'WAVE');

  // "fmt " subchunk: 16-byte body, format tag 1 = PCM.
  writeString(header, 12, 'fmt ');
  header.setUint32(16, 16, true);
  header.setUint16(20, 1, true);
  header.setUint16(22, channels, true);
  header.setUint32(24, rate, true);
  header.setUint32(28, bytesPerSecond, true);
  header.setUint16(32, frameBytes, true);
  header.setUint16(34, bits, true);

  // "data" subchunk header, then the samples copied verbatim after it.
  writeString(header, 36, 'data');
  header.setUint32(40, payloadBytes, true);
  new Uint8Array(wav, HEADER_BYTES).set(pcm16);

  return wav;
}
|
||||
|
||||
/**
 * Write a string into a DataView, one byte per UTF-16 code unit.
 *
 * @param view   Destination view.
 * @param offset Byte position in `view` where the first character goes.
 * @param str    Text to write; each code unit is stored with `setUint8`.
 */
function writeString(view: DataView, offset: number, str: string) {
  str.split('').forEach((ch, idx) => {
    view.setUint8(offset + idx, ch.charCodeAt(0));
  });
}
|
||||
|
||||
Reference in New Issue
Block a user