fix: replace PCM16 capture with MediaRecorder (Opus/webm)

PCM16 capture via AudioContext was streaming raw audio continuously,
causing massive accumulated buffers that took ~20s to transcribe.
Replaced with MediaRecorder, which records compressed Opus/webm and
sends a single blob on release — much smaller and faster to transcribe.

Also removed all unused PCM16/WAV helper functions from both frontend
and backend.
This commit is contained in:
2026-06-04 14:04:44 -04:00
parent 537ddcd841
commit c5cc4dd480
2 changed files with 36 additions and 124 deletions
+4 -9
View File
@@ -96,17 +96,12 @@ async def run_conversation(text: str, user_id: str) -> str:
async def transcribe_audio(audio_bytes: bytes) -> str | None: async def transcribe_audio(audio_bytes: bytes) -> str | None:
"""Transcribe audio bytes using cheapest STT model. """Transcribe Opus/webm audio using cheapest STT model."""
Accepts raw PCM16 mono 24kHz data — wraps in WAV container automatically.
"""
client = get_openai() client = get_openai()
try: try:
# Wrap raw PCM16 in WAV container for the API
wav_bytes = _pcm16_to_wav(audio_bytes)
transcript = await client.audio.transcriptions.create( transcript = await client.audio.transcriptions.create(
model="gpt-4o-mini-transcribe", model="gpt-4o-mini-transcribe",
file=("audio.wav", wav_bytes, "audio/wav"), file=("audio.webm", audio_bytes, "audio/webm"),
response_format="text", response_format="text",
) )
return transcript.strip() if transcript and transcript.strip() else None return transcript.strip() if transcript and transcript.strip() else None
@@ -211,8 +206,8 @@ async def conversation_ws(websocket: WebSocket):
continue continue
# ── Conversation ── # ── Conversation ──
if msg_type == "audio": if msg_type == "audio_chunk":
# Accumulate PCM16 audio chunks # Single Opus/webm blob from MediaRecorder
chunk = base64.b64decode(msg["data"]) chunk = base64.b64decode(msg["data"])
audio_buffer.extend(chunk) audio_buffer.extend(chunk)
+32 -115
View File
@@ -25,41 +25,6 @@ function saveUserId(id: string) {
localStorage.setItem(USER_ID_KEY, id); localStorage.setItem(USER_ID_KEY, id);
} }
/**
 * Capture PCM16 mono 24kHz audio from mic and send via callback.
 *
 * Feeds `stream` through a script processor (4096-frame buffers, one input and
 * one output channel) on an AudioContext created at 24000 Hz. Every processed
 * buffer is clamped to [-1, 1], converted to signed 16-bit samples and handed
 * to `onChunk` as the raw bytes of the backing Int16Array.
 *
 * @param stream - Media stream to read audio from.
 * @param onChunk - Called once per processed buffer with the PCM16 bytes.
 * @returns A handle whose `stop()` halts chunk delivery, disconnects both
 *   nodes and closes the AudioContext. Repeated `stop()` calls are no-ops.
 */
function startPCMCapture(
  stream: MediaStream,
  onChunk: (pcm16: Uint8Array) => void,
): { stop: () => void } {
  const ctx = new AudioContext({ sampleRate: 24000 });
  const source = ctx.createMediaStreamSource(stream);
  const processor = ctx.createScriptProcessor(4096, 1, 1);
  let running = true;

  processor.onaudioprocess = (e) => {
    // Buffers may still be delivered after stop(); drop them.
    if (!running) return;
    const input = e.inputBuffer.getChannelData(0); // Float32Array [-1, 1]
    // Convert float32 → PCM16 int16
    const pcm16 = new Int16Array(input.length);
    for (let i = 0; i < input.length; i++) {
      const s = Math.max(-1, Math.min(1, input[i]));
      // Asymmetric scale so -1 maps to -32768 and +1 maps to 32767.
      pcm16[i] = s < 0 ? s * 0x8000 : s * 0x7fff;
    }
    onChunk(new Uint8Array(pcm16.buffer));
  };

  source.connect(processor);
  processor.connect(ctx.destination);

  return {
    stop: () => {
      // Guard against double stop: closing an already-closed context rejects.
      if (!running) return;
      running = false;
      source.disconnect();
      processor.disconnect();
      // Fire-and-forget: nothing downstream depends on close() settling.
      void ctx.close();
    },
  };
}
export function useConversation() { export function useConversation() {
const [messages, setMessages] = useState<Message[]>([]); const [messages, setMessages] = useState<Message[]>([]);
const [isConnected, setIsConnected] = useState(false); const [isConnected, setIsConnected] = useState(false);
@@ -77,9 +42,8 @@ export function useConversation() {
const wsRef = useRef<WebSocket | null>(null); const wsRef = useRef<WebSocket | null>(null);
const audioRef = useRef<HTMLAudioElement | null>(null); const audioRef = useRef<HTMLAudioElement | null>(null);
const captureRef = useRef<{ stop: () => void } | null>(null); const recorderRef = useRef<MediaRecorder | null>(null);
const streamRef = useRef<MediaStream | null>(null); const streamRef = useRef<MediaStream | null>(null);
const audioBufferRef = useRef<Uint8Array[]>([]);
// Connect WebSocket // Connect WebSocket
const connect = useCallback(() => { const connect = useCallback(() => {
@@ -168,12 +132,10 @@ export function useConversation() {
case 'speaking_end': case 'speaking_end':
setIsKiraSpeaking(false); setIsKiraSpeaking(false);
audioBufferRef.current = [];
break; break;
case 'interruption': case 'interruption':
setIsKiraSpeaking(false); setIsKiraSpeaking(false);
audioBufferRef.current = [];
if (audioRef.current) { if (audioRef.current) {
audioRef.current.pause(); audioRef.current.pause();
audioRef.current.currentTime = 0; audioRef.current.currentTime = 0;
@@ -235,15 +197,37 @@ export function useConversation() {
return; return;
} }
// Start PCM16 capture — each chunk sent as WS message // Record Opus/webm — much more efficient than PCM16
const capture = startPCMCapture(stream, (pcm16) => { const chunks: BlobPart[] = [];
if (ws.readyState === WebSocket.OPEN) { const recorder = new MediaRecorder(stream, {
const base64 = arrayBufferToBase64(pcm16.buffer); mimeType: MediaRecorder.isTypeSupported('audio/webm;codecs=opus')
ws.send(JSON.stringify({ type: 'audio', data: base64 })); ? 'audio/webm;codecs=opus'
} : 'audio/webm',
}); });
captureRef.current = capture; recorder.ondataavailable = (e) => {
if (e.data.size > 0) chunks.push(e.data);
};
recorder.onstop = () => {
// Send recorded audio as one blob, then transcribe
const blob = new Blob(chunks, { type: 'audio/webm' });
const reader = new FileReader();
reader.onload = () => {
const base64 = (reader.result as string).split(',')[1];
if (ws.readyState === WebSocket.OPEN) {
ws.send(JSON.stringify({ type: 'audio_chunk', data: base64 }));
ws.send(JSON.stringify({ type: 'transcribe' }));
}
};
reader.readAsDataURL(blob);
stream.getTracks().forEach((t) => t.stop());
setIsRecording(false);
};
recorder.start();
recorderRef.current = recorder;
setIsRecording(true); setIsRecording(true);
} catch (err) { } catch (err) {
const msg = err instanceof Error ? err.message : String(err); const msg = err instanceof Error ? err.message : String(err);
@@ -253,16 +237,7 @@ export function useConversation() {
}, [addMessage]); }, [addMessage]);
const stopRecording = useCallback(() => { const stopRecording = useCallback(() => {
captureRef.current?.stop(); recorderRef.current?.stop();
captureRef.current = null;
streamRef.current?.getTracks().forEach((t) => t.stop());
streamRef.current = null;
setIsRecording(false);
// Tell backend to process accumulated audio
if (wsRef.current?.readyState === WebSocket.OPEN) {
wsRef.current.send(JSON.stringify({ type: 'transcribe' }));
}
}, []); }, []);
// ── Text ── // ── Text ──
@@ -279,7 +254,7 @@ export function useConversation() {
connect(); connect();
return () => { return () => {
wsRef.current?.close(); wsRef.current?.close();
captureRef.current?.stop(); recorderRef.current?.stop();
streamRef.current?.getTracks().forEach((t) => t.stop()); streamRef.current?.getTracks().forEach((t) => t.stop());
}; };
}, [connect]); }, [connect]);
@@ -300,61 +275,3 @@ export function useConversation() {
stopRecording, stopRecording,
}; };
} }
// ── Helpers ──
/**
 * Encode the bytes of `buffer` as a base64 string.
 *
 * Each byte becomes one character of an intermediate binary string, which is
 * then passed through `btoa`.
 *
 * @param buffer - Buffer whose bytes are encoded.
 * @returns The base64 text; an empty buffer yields an empty string.
 */
function arrayBufferToBase64(buffer: ArrayBufferLike): string {
  const octets = new Uint8Array(buffer);
  const chars: string[] = [];
  octets.forEach((octet) => {
    chars.push(String.fromCharCode(octet));
  });
  return btoa(chars.join(''));
}
/**
 * Wrap raw PCM16 mono 24kHz bytes in a WAV container.
 *
 * Writes a 44-byte little-endian RIFF/WAVE header (format tag 1, one channel,
 * 24000 Hz, 16 bits per sample) followed by a copy of `pcm16`.
 *
 * @param pcm16 - Raw sample bytes; copied verbatim after the header.
 * @returns A new ArrayBuffer of `44 + pcm16.length` bytes.
 */
function pcm16ToWav(pcm16: Uint8Array): ArrayBuffer {
  const HEADER_BYTES = 44;
  const channels = 1;
  const hz = 24000;
  const bits = 16;
  const frameBytes = channels * (bits / 8);
  const bytesPerSecond = hz * frameBytes;
  const payloadBytes = pcm16.length;

  const wav = new ArrayBuffer(HEADER_BYTES + payloadBytes);
  const dv = new DataView(wav);

  // RIFF chunk descriptor; its size field excludes the first 8 bytes.
  writeString(dv, 0, 'RIFF');
  dv.setUint32(4, HEADER_BYTES + payloadBytes - 8, true);
  writeString(dv, 8, 'WAVE');

  // "fmt " subchunk: 16-byte body, format tag 1 (PCM).
  writeString(dv, 12, 'fmt ');
  dv.setUint32(16, 16, true);
  dv.setUint16(20, 1, true);
  dv.setUint16(22, channels, true);
  dv.setUint32(24, hz, true);
  dv.setUint32(28, bytesPerSecond, true);
  dv.setUint16(32, frameBytes, true);
  dv.setUint16(34, bits, true);

  // "data" subchunk header, then the samples themselves.
  writeString(dv, 36, 'data');
  dv.setUint32(40, payloadBytes, true);
  new Uint8Array(wav, HEADER_BYTES).set(pcm16);

  return wav;
}
/**
 * Write `str` into `view` one byte per UTF-16 code unit, starting at `offset`.
 *
 * @param view - Destination DataView.
 * @param offset - Byte position of the first character.
 * @param str - Text to write; an empty string writes nothing.
 */
function writeString(view: DataView, offset: number, str: string) {
  str.split('').forEach((unit, position) => {
    view.setUint8(offset + position, unit.charCodeAt(0));
  });
}