fix: replace PCM16 capture with MediaRecorder (Opus/webm)

PCM16 capture via AudioContext streamed raw audio continuously, so the accumulated buffers grew very large and took ~20s to transcribe. Replaced it with MediaRecorder, which records compressed Opus/webm and sends a single blob on release — much smaller and faster to transcribe. Also removed all now-unused PCM16/WAV helper functions from both frontend and backend.
2026-06-04 14:04:44 -04:00
parent 537ddcd841
commit c5cc4dd480
2 changed files with 36 additions and 124 deletions
@@ -96,17 +96,12 @@ async def run_conversation(text: str, user_id: str) -> str:


 async def transcribe_audio(audio_bytes: bytes) -> str | None:
-    """Transcribe audio bytes using cheapest STT model.
-    
-    Accepts raw PCM16 mono 24kHz data — wraps in WAV container automatically.
-    """
+    """Transcribe Opus/webm audio using cheapest STT model."""
    client = get_openai()
    try:
-        # Wrap raw PCM16 in WAV container for the API
-        wav_bytes = _pcm16_to_wav(audio_bytes)
        transcript = await client.audio.transcriptions.create(
            model="gpt-4o-mini-transcribe",
-            file=("audio.wav", wav_bytes, "audio/wav"),
+            file=("audio.webm", audio_bytes, "audio/webm"),
            response_format="text",
        )
        return transcript.strip() if transcript and transcript.strip() else None
@@ -211,8 +206,8 @@ async def conversation_ws(websocket: WebSocket):
                continue

            # ── Conversation ──
-            if msg_type == "audio":
-                # Accumulate PCM16 audio chunks
+            if msg_type == "audio_chunk":
+                # Single Opus/webm blob from MediaRecorder
                chunk = base64.b64decode(msg["data"])
                audio_buffer.extend(chunk)