feat: Realtime WebSocket STT via gpt-realtime-whisper
Replaces REST-based transcription (gpt-4o-transcribe) with WebSocket streaming via gpt-realtime-whisper. The frontend captures PCM16 audio and streams it through the backend to a Realtime transcription session.
- Server-side VAD detects utterance boundaries automatically
- Word-level transcript deltas stream to the client in real time
- On utterance end, gpt-5.4-nano generates a response
- TTS streams back via with_streaming_response
- Total pipeline: PCM16 → Realtime WS → LLM → streaming TTS
This commit is contained in:
@@ -42,6 +42,7 @@ export function useConversation() {
|
||||
|
||||
const wsRef = useRef<WebSocket | null>(null);
|
||||
const audioRef = useRef<HTMLAudioElement | null>(null);
|
||||
const captureRef = useRef<{ stop: () => void } | null>(null);
|
||||
const recorderRef = useRef<MediaRecorder | null>(null);
|
||||
const streamRef = useRef<MediaStream | null>(null);
|
||||
const audioBufferRef = useRef<Uint8Array[]>([]);
|
||||
@@ -193,7 +194,6 @@ export function useConversation() {
|
||||
// ── Audio (Realtime PCM16) ──
|
||||
|
||||
const startRecording = useCallback(async () => {
|
||||
// Check HTTPS
|
||||
if (!navigator.mediaDevices || !navigator.mediaDevices.getUserMedia) {
|
||||
addMessage('kira', 'Mic requires HTTPS. Try accessing via HTTPS!');
|
||||
return;
|
||||
@@ -211,37 +211,14 @@ export function useConversation() {
|
||||
return;
|
||||
}
|
||||
|
||||
// Record Opus/webm — much more efficient than PCM16
|
||||
const chunks: BlobPart[] = [];
|
||||
const recorder = new MediaRecorder(stream, {
|
||||
mimeType: MediaRecorder.isTypeSupported('audio/webm;codecs=opus')
|
||||
? 'audio/webm;codecs=opus'
|
||||
: 'audio/webm',
|
||||
// PCM16 capture for Realtime WebSocket STT
|
||||
captureRef.current = startPCMCapture(stream, (pcm16) => {
|
||||
if (ws.readyState === WebSocket.OPEN) {
|
||||
const base64 = arrayBufferToBase64(pcm16.buffer);
|
||||
ws.send(JSON.stringify({ type: 'audio', data: base64 }));
|
||||
}
|
||||
});
|
||||
|
||||
recorder.ondataavailable = (e) => {
|
||||
if (e.data.size > 0) chunks.push(e.data);
|
||||
};
|
||||
|
||||
recorder.onstop = () => {
|
||||
// Send recorded audio as one blob, then transcribe
|
||||
const blob = new Blob(chunks, { type: 'audio/webm' });
|
||||
const reader = new FileReader();
|
||||
reader.onload = () => {
|
||||
const base64 = (reader.result as string).split(',')[1];
|
||||
if (ws.readyState === WebSocket.OPEN) {
|
||||
ws.send(JSON.stringify({ type: 'audio_chunk', data: base64 }));
|
||||
ws.send(JSON.stringify({ type: 'transcribe' }));
|
||||
}
|
||||
};
|
||||
reader.readAsDataURL(blob);
|
||||
|
||||
stream.getTracks().forEach((t) => t.stop());
|
||||
setIsRecording(false);
|
||||
};
|
||||
|
||||
recorder.start();
|
||||
recorderRef.current = recorder;
|
||||
setIsRecording(true);
|
||||
} catch (err) {
|
||||
const msg = err instanceof Error ? err.message : String(err);
|
||||
@@ -251,7 +228,11 @@ export function useConversation() {
|
||||
}, [addMessage]);
|
||||
|
||||
const stopRecording = useCallback(() => {
|
||||
recorderRef.current?.stop();
|
||||
captureRef.current?.stop();
|
||||
captureRef.current = null;
|
||||
streamRef.current?.getTracks().forEach((t) => t.stop());
|
||||
streamRef.current = null;
|
||||
setIsRecording(false);
|
||||
}, []);
|
||||
|
||||
// ── Text ──
|
||||
@@ -268,6 +249,7 @@ export function useConversation() {
|
||||
connect();
|
||||
return () => {
|
||||
wsRef.current?.close();
|
||||
captureRef.current?.stop();
|
||||
recorderRef.current?.stop();
|
||||
streamRef.current?.getTracks().forEach((t) => t.stop());
|
||||
};
|
||||
@@ -289,3 +271,48 @@ export function useConversation() {
|
||||
stopRecording,
|
||||
};
|
||||
}
|
||||
|
||||
// ── Helpers ──
|
||||
|
||||
/**
 * Encode raw bytes as a base64 string.
 *
 * Accepts either a buffer (encoded in full) or a typed-array / DataView view,
 * in which case only the bytes the view actually covers are encoded. Passing
 * the view itself is safer than passing `view.buffer`, which would include
 * bytes outside a sub-view created with `subarray()` or a byte offset.
 *
 * @param buffer - Buffer or view whose bytes should be encoded.
 * @returns The base64 text; an empty string for empty input.
 */
function arrayBufferToBase64(buffer: ArrayBufferLike | ArrayBufferView): string {
  const bytes = ArrayBuffer.isView(buffer)
    ? new Uint8Array(buffer.buffer, buffer.byteOffset, buffer.byteLength)
    : new Uint8Array(buffer);

  // btoa() wants a "binary string" (one char per byte). Build it in bounded
  // slices so we never exceed the engine's argument-count limit while still
  // avoiding a per-byte string concatenation.
  const sliceSize = 0x2000;
  let binary = '';
  for (let offset = 0; offset < bytes.length; offset += sliceSize) {
    binary += String.fromCharCode(...bytes.subarray(offset, offset + sliceSize));
  }
  return btoa(binary);
}
|
||||
|
||||
/**
 * Convert float audio samples to signed 16-bit PCM and return the raw bytes
 * (two bytes per sample, in the platform's native byte order).
 */
function encodeFloat32AsPCM16(samples: Float32Array): Uint8Array {
  const pcm16 = new Int16Array(samples.length);
  for (let i = 0; i < samples.length; i++) {
    // Clamp first: an out-of-range sample would otherwise wrap when stored as int16.
    const s = Math.max(-1, Math.min(1, samples[i]));
    // Asymmetric scale so -1 maps to -32768 and +1 maps to 32767.
    pcm16[i] = s < 0 ? s * 0x8000 : s * 0x7fff;
  }
  return new Uint8Array(pcm16.buffer);
}

/**
 * Capture PCM16 mono audio from a mic stream and deliver it via callback.
 *
 * An AudioContext is created with a requested sample rate of 24 kHz and the
 * stream is routed through a ScriptProcessorNode with a 4096-frame buffer;
 * each processed buffer is converted to PCM16 and passed to `onChunk`.
 *
 * @param stream - Media stream to read audio from (first channel only).
 * @param onChunk - Called with the PCM16 bytes of every processed buffer.
 * @returns A handle whose `stop()` tears the audio graph down and closes the
 *   context. `stop()` is idempotent: calls after the first are no-ops. It does
 *   not stop the tracks of `stream`; that remains the caller's job.
 */
function startPCMCapture(
  stream: MediaStream,
  onChunk: (pcm16: Uint8Array) => void,
): { stop: () => void } {
  const ctx = new AudioContext({ sampleRate: 24000 });
  const source = ctx.createMediaStreamSource(stream);
  // NOTE(review): ScriptProcessorNode is deprecated in favour of AudioWorklet.
  // It is kept here because a worklet needs a separately loaded module.
  const processor = ctx.createScriptProcessor(4096, 1, 1);
  let running = true;

  processor.onaudioprocess = (e) => {
    // Buffers already queued may still be delivered after stop(); drop them.
    if (!running) return;
    onChunk(encodeFloat32AsPCM16(e.inputBuffer.getChannelData(0)));
  };

  source.connect(processor);
  // The processor is wired to the destination so the graph is pulled and
  // onaudioprocess fires; its output buffer is never written to.
  processor.connect(ctx.destination);

  return {
    stop: () => {
      // Callers may stop from more than one place (explicit stop + cleanup).
      // Closing an already-closed context rejects, so only tear down once.
      if (!running) return;
      running = false;
      processor.onaudioprocess = null;
      source.disconnect();
      processor.disconnect();
      if (ctx.state !== 'closed') {
        ctx.close().catch((err: unknown) => {
          console.warn('Failed to close capture AudioContext', err);
        });
      }
    },
  };
}
|
||||
|
||||
Reference in New Issue
Block a user