feat: Realtime WebSocket STT via gpt-realtime-whisper

Replaces REST-based transcription (gpt-4o-transcribe) with WebSocket
streaming via gpt-realtime-whisper. Frontend captures PCM16 audio and
streams it through the backend to a Realtime transcription session.

- Server-side VAD detects utterance boundaries automatically
- Word-level transcript deltas stream to the client in real-time
- On utterance end, gpt-5.4-nano generates a response
- TTS streams back via with_streaming_response
- Total pipeline: PCM16 → Realtime WS → LLM → streaming TTS
This commit is contained in:
2026-06-04 14:26:19 -04:00
parent 25b12ee14f
commit 7502f201c7
3 changed files with 326 additions and 153 deletions
+58 -31
View File
@@ -42,6 +42,7 @@ export function useConversation() {
const wsRef = useRef<WebSocket | null>(null);
const audioRef = useRef<HTMLAudioElement | null>(null);
const captureRef = useRef<{ stop: () => void } | null>(null);
const recorderRef = useRef<MediaRecorder | null>(null);
const streamRef = useRef<MediaStream | null>(null);
const audioBufferRef = useRef<Uint8Array[]>([]);
@@ -193,7 +194,6 @@ export function useConversation() {
// ── Audio (Realtime PCM16) ──
const startRecording = useCallback(async () => {
// Check HTTPS
if (!navigator.mediaDevices || !navigator.mediaDevices.getUserMedia) {
addMessage('kira', 'Mic requires HTTPS. Try accessing via HTTPS!');
return;
@@ -211,37 +211,14 @@ export function useConversation() {
return;
}
// Record Opus/webm — much more efficient than PCM16
const chunks: BlobPart[] = [];
const recorder = new MediaRecorder(stream, {
mimeType: MediaRecorder.isTypeSupported('audio/webm;codecs=opus')
? 'audio/webm;codecs=opus'
: 'audio/webm',
// PCM16 capture for Realtime WebSocket STT
captureRef.current = startPCMCapture(stream, (pcm16) => {
if (ws.readyState === WebSocket.OPEN) {
const base64 = arrayBufferToBase64(pcm16.buffer);
ws.send(JSON.stringify({ type: 'audio', data: base64 }));
}
});
recorder.ondataavailable = (e) => {
if (e.data.size > 0) chunks.push(e.data);
};
recorder.onstop = () => {
// Send recorded audio as one blob, then transcribe
const blob = new Blob(chunks, { type: 'audio/webm' });
const reader = new FileReader();
reader.onload = () => {
const base64 = (reader.result as string).split(',')[1];
if (ws.readyState === WebSocket.OPEN) {
ws.send(JSON.stringify({ type: 'audio_chunk', data: base64 }));
ws.send(JSON.stringify({ type: 'transcribe' }));
}
};
reader.readAsDataURL(blob);
stream.getTracks().forEach((t) => t.stop());
setIsRecording(false);
};
recorder.start();
recorderRef.current = recorder;
setIsRecording(true);
} catch (err) {
const msg = err instanceof Error ? err.message : String(err);
@@ -251,7 +228,11 @@ export function useConversation() {
}, [addMessage]);
// Stop whatever capture is active, stop the tracks of the held stream, and
// clear the recording flag. Every step is optional-chained, so calling this
// when nothing is recording is harmless.
const stopRecording = useCallback(() => {
// NOTE(review): recorderRef belongs to the MediaRecorder-based path; with PCM
// capture in use this call looks like a leftover — confirm it is still needed.
recorderRef.current?.stop();
// Halt PCM capture and drop the handle so it is not stopped a second time.
captureRef.current?.stop();
captureRef.current = null;
// Stop every track of the stream referenced by streamRef, then drop the reference.
streamRef.current?.getTracks().forEach((t) => t.stop());
streamRef.current = null;
setIsRecording(false);
}, []);
// ── Text ──
@@ -268,6 +249,7 @@ export function useConversation() {
connect();
return () => {
wsRef.current?.close();
captureRef.current?.stop();
recorderRef.current?.stop();
streamRef.current?.getTracks().forEach((t) => t.stop());
};
@@ -289,3 +271,48 @@ export function useConversation() {
stopRecording,
};
}
// ── Helpers ──
/**
 * Encode the raw bytes of a buffer as a base64 string.
 *
 * Each byte is mapped to the character with the same code unit (0–255) so
 * that `btoa` receives a binary string it is able to encode.
 *
 * @param buffer Buffer whose entire contents are encoded.
 * @returns The base64 text; an empty buffer yields an empty string.
 */
function arrayBufferToBase64(buffer: ArrayBufferLike): string {
  const octets = new Uint8Array(buffer);
  const binaryText = Array.from(octets, (octet) => String.fromCharCode(octet)).join('');
  return btoa(binaryText);
}
/**
 * Capture mono PCM16 audio from a microphone stream and deliver it in chunks.
 *
 * Creates an AudioContext requested at 24 kHz, routes `stream` through a
 * ScriptProcessorNode (4096-frame buffer, one input and one output channel)
 * and, for every processed buffer, converts the float samples to signed
 * 16-bit integers and hands their raw bytes to `onChunk`.
 *
 * @param stream  Media stream to capture from.
 * @param onChunk Called once per processed buffer with the converted samples
 *                as bytes (two bytes per sample, in the platform's native
 *                byte order as laid out by `Int16Array`).
 * @returns A handle whose `stop()` halts delivery and tears down the audio
 *          graph. `stop()` may be called any number of times; only the first
 *          call has an effect.
 */
function startPCMCapture(
  stream: MediaStream,
  onChunk: (pcm16: Uint8Array) => void,
): { stop: () => void } {
  const ctx = new AudioContext({ sampleRate: 24000 });
  const source = ctx.createMediaStreamSource(stream);
  const processor = ctx.createScriptProcessor(4096, 1, 1);
  let running = true;

  processor.onaudioprocess = (e) => {
    // A buffer may already be queued when stop() runs; ignore it.
    if (!running) return;
    const input = e.inputBuffer.getChannelData(0);
    const pcm16 = new Int16Array(input.length);
    for (let i = 0; i < input.length; i++) {
      // Clamp to [-1, 1] first so out-of-range samples saturate instead of wrapping.
      const s = Math.max(-1, Math.min(1, input[i]));
      // Asymmetric scale: int16 spans -32768..32767.
      pcm16[i] = s < 0 ? s * 0x8000 : s * 0x7fff;
    }
    onChunk(new Uint8Array(pcm16.buffer));
  };

  // The processor is wired to the destination so the graph is pulled and
  // onaudioprocess fires; nothing is written to its output buffer.
  source.connect(processor);
  processor.connect(ctx.destination);

  return {
    stop: () => {
      // Idempotent: closing an already-closed AudioContext rejects, and the
      // nodes only need to be disconnected once.
      if (!running) return;
      running = false;
      // Detach the handler so the node no longer references onChunk.
      processor.onaudioprocess = null;
      source.disconnect();
      processor.disconnect();
      // close() is asynchronous; teardown is best-effort, so a rejection is
      // deliberately swallowed rather than surfacing as an unhandled rejection.
      void ctx.close().catch(() => {});
    },
  };
}