From c5cc4dd4809f33ae2f5527cf0f73e3a4ac2e403a Mon Sep 17 00:00:00 2001 From: hobokenchicken Date: Thu, 4 Jun 2026 14:04:44 -0400 Subject: [PATCH] fix: replace PCM16 capture with MediaRecorder (Opus/webm) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit PCM16 capture via AudioContext was streaming raw audio continuously, causing massive accumulated buffers that took ~20s to transcribe. Replaced with MediaRecorder which records compressed Opus/webm and sends a single blob on release — much smaller and faster to transcribe. Also removed the now-unused PCM16/WAV helper functions from the frontend and the PCM16-to-WAV wrapping call from the backend transcription path. --- backend/main.py | 13 +-- frontend/src/hooks/useConversation.ts | 147 ++++++-------------------- 2 files changed, 36 insertions(+), 124 deletions(-) diff --git a/backend/main.py b/backend/main.py index f581ddb..85ebc11 100644 --- a/backend/main.py +++ b/backend/main.py @@ -96,17 +96,12 @@ async def run_conversation(text: str, user_id: str) -> str: async def transcribe_audio(audio_bytes: bytes) -> str | None: - """Transcribe audio bytes using cheapest STT model. - - Accepts raw PCM16 mono 24kHz data — wraps in WAV container automatically. 
- """ + """Transcribe Opus/webm audio using cheapest STT model.""" client = get_openai() try: - # Wrap raw PCM16 in WAV container for the API - wav_bytes = _pcm16_to_wav(audio_bytes) transcript = await client.audio.transcriptions.create( model="gpt-4o-mini-transcribe", - file=("audio.wav", wav_bytes, "audio/wav"), + file=("audio.webm", audio_bytes, "audio/webm"), response_format="text", ) return transcript.strip() if transcript and transcript.strip() else None @@ -211,8 +206,8 @@ async def conversation_ws(websocket: WebSocket): continue # ── Conversation ── - if msg_type == "audio": - # Accumulate PCM16 audio chunks + if msg_type == "audio_chunk": + # Single Opus/webm blob from MediaRecorder chunk = base64.b64decode(msg["data"]) audio_buffer.extend(chunk) diff --git a/frontend/src/hooks/useConversation.ts b/frontend/src/hooks/useConversation.ts index 8f44cc6..8f90e72 100644 --- a/frontend/src/hooks/useConversation.ts +++ b/frontend/src/hooks/useConversation.ts @@ -25,41 +25,6 @@ function saveUserId(id: string) { localStorage.setItem(USER_ID_KEY, id); } -/** Capture PCM16 mono 24kHz audio from mic and send via callback. */ -function startPCMCapture( - stream: MediaStream, - onChunk: (pcm16: Uint8Array) => void, -): { stop: () => void } { - const ctx = new AudioContext({ sampleRate: 24000 }); - const source = ctx.createMediaStreamSource(stream); - const processor = ctx.createScriptProcessor(4096, 1, 1); - let running = true; - - processor.onaudioprocess = (e) => { - if (!running) return; - const input = e.inputBuffer.getChannelData(0); // Float32Array [-1, 1] - // Convert float32 → PCM16 int16 - const pcm16 = new Int16Array(input.length); - for (let i = 0; i < input.length; i++) { - const s = Math.max(-1, Math.min(1, input[i])); - pcm16[i] = s < 0 ? 
s * 0x8000 : s * 0x7fff; - } - onChunk(new Uint8Array(pcm16.buffer)); - }; - - source.connect(processor); - processor.connect(ctx.destination); - - return { - stop: () => { - running = false; - source.disconnect(); - processor.disconnect(); - ctx.close(); - }, - }; -} - export function useConversation() { const [messages, setMessages] = useState([]); const [isConnected, setIsConnected] = useState(false); @@ -77,9 +42,8 @@ export function useConversation() { const wsRef = useRef(null); const audioRef = useRef(null); - const captureRef = useRef<{ stop: () => void } | null>(null); + const recorderRef = useRef(null); const streamRef = useRef(null); - const audioBufferRef = useRef([]); // Connect WebSocket const connect = useCallback(() => { @@ -168,12 +132,10 @@ export function useConversation() { case 'speaking_end': setIsKiraSpeaking(false); - audioBufferRef.current = []; break; case 'interruption': setIsKiraSpeaking(false); - audioBufferRef.current = []; if (audioRef.current) { audioRef.current.pause(); audioRef.current.currentTime = 0; @@ -235,15 +197,37 @@ export function useConversation() { return; } - // Start PCM16 capture — each chunk sent as WS message - const capture = startPCMCapture(stream, (pcm16) => { - if (ws.readyState === WebSocket.OPEN) { - const base64 = arrayBufferToBase64(pcm16.buffer); - ws.send(JSON.stringify({ type: 'audio', data: base64 })); - } + // Record Opus/webm — much more efficient than PCM16 + const chunks: BlobPart[] = []; + const recorder = new MediaRecorder(stream, { + mimeType: MediaRecorder.isTypeSupported('audio/webm;codecs=opus') + ? 
'audio/webm;codecs=opus' + : 'audio/webm', }); - captureRef.current = capture; + recorder.ondataavailable = (e) => { + if (e.data.size > 0) chunks.push(e.data); + }; + + recorder.onstop = () => { + // Send recorded audio as one blob, then transcribe + const blob = new Blob(chunks, { type: 'audio/webm' }); + const reader = new FileReader(); + reader.onload = () => { + const base64 = (reader.result as string).split(',')[1]; + if (ws.readyState === WebSocket.OPEN) { + ws.send(JSON.stringify({ type: 'audio_chunk', data: base64 })); + ws.send(JSON.stringify({ type: 'transcribe' })); + } + }; + reader.readAsDataURL(blob); + + stream.getTracks().forEach((t) => t.stop()); + setIsRecording(false); + }; + + recorder.start(); + recorderRef.current = recorder; setIsRecording(true); } catch (err) { const msg = err instanceof Error ? err.message : String(err); @@ -253,16 +237,7 @@ export function useConversation() { }, [addMessage]); const stopRecording = useCallback(() => { - captureRef.current?.stop(); - captureRef.current = null; - streamRef.current?.getTracks().forEach((t) => t.stop()); - streamRef.current = null; - setIsRecording(false); - - // Tell backend to process accumulated audio - if (wsRef.current?.readyState === WebSocket.OPEN) { - wsRef.current.send(JSON.stringify({ type: 'transcribe' })); - } + recorderRef.current?.stop(); }, []); // ── Text ── @@ -279,7 +254,7 @@ export function useConversation() { connect(); return () => { wsRef.current?.close(); - captureRef.current?.stop(); + recorderRef.current?.stop(); streamRef.current?.getTracks().forEach((t) => t.stop()); }; }, [connect]); @@ -300,61 +275,3 @@ export function useConversation() { stopRecording, }; } - -// ── Helpers ── - -function arrayBufferToBase64(buffer: ArrayBufferLike): string { - const bytes = new Uint8Array(buffer); - let binary = ''; - for (let i = 0; i < bytes.length; i++) { - binary += String.fromCharCode(bytes[i]); - } - return btoa(binary); -} - -/** Convert raw PCM16 mono 24kHz to a playable 
WAV blob. */ -function pcm16ToWav(pcm16: Uint8Array): ArrayBuffer { - const numChannels = 1; - const sampleRate = 24000; - const bitsPerSample = 16; - const byteRate = sampleRate * numChannels * (bitsPerSample / 8); - const blockAlign = numChannels * (bitsPerSample / 8); - const dataSize = pcm16.length; - const headerSize = 44; - const totalSize = headerSize + dataSize; - - const buf = new ArrayBuffer(totalSize); - const view = new DataView(buf); - - // RIFF header - writeString(view, 0, 'RIFF'); - view.setUint32(4, totalSize - 8, true); - writeString(view, 8, 'WAVE'); - - // fmt subchunk - writeString(view, 12, 'fmt '); - view.setUint32(16, 16, true); // subchunk size - view.setUint16(20, 1, true); // PCM - view.setUint16(22, numChannels, true); - view.setUint32(24, sampleRate, true); - view.setUint32(28, byteRate, true); - view.setUint16(32, blockAlign, true); - view.setUint16(34, bitsPerSample, true); - - // data subchunk - writeString(view, 36, 'data'); - view.setUint32(40, dataSize, true); - - // PCM data - for (let i = 0; i < pcm16.length; i++) { - view.setUint8(44 + i, pcm16[i]); - } - - return buf; -} - -function writeString(view: DataView, offset: number, str: string) { - for (let i = 0; i < str.length; i++) { - view.setUint8(offset + i, str.charCodeAt(i)); - } -}