From a19ac46312eb0e03c693b03393a672f24065044d Mon Sep 17 00:00:00 2001
From: hobokenchicken <dustin@dustin.coffee>
Date: Thu, 4 Jun 2026 13:55:05 -0400
Subject: [PATCH] fix: wrap PCM16 in WAV container before STT API call

The frontend captures raw PCM16 mono 24 kHz audio, but the backend was
uploading those bytes labelled as "audio.webm". The transcription API
expects a proper audio container format (wav, webm, etc.), not
headerless PCM16 data. Add _pcm16_to_wav() to wrap the raw bytes in a
WAV header before sending them to gpt-4o-mini-transcribe.
---
 backend/main.py | 38 ++++++++++++++++++++++++++++++++++++--
 1 file changed, 36 insertions(+), 2 deletions(-)

diff --git a/backend/main.py b/backend/main.py
index 681e8c8..cba6858 100644
--- a/backend/main.py
+++ b/backend/main.py
@@ -96,12 +96,17 @@ async def run_conversation(text: str, user_id: str) -> str:
 
 
 async def transcribe_audio(audio_bytes: bytes) -> str | None:
-    """Transcribe audio bytes using cheapest STT model."""
+    """Transcribe audio bytes using cheapest STT model.
+    
+    Accepts raw PCM16 mono 24kHz data — wraps in WAV container automatically.
+    """
     client = get_openai()
     try:
+        # Wrap raw PCM16 in WAV container for the API
+        wav_bytes = _pcm16_to_wav(audio_bytes)
         transcript = await client.audio.transcriptions.create(
             model="gpt-4o-mini-transcribe",
-            file=("audio.webm", audio_bytes, "audio/webm"),
+            file=("audio.wav", wav_bytes, "audio/wav"),
             response_format="text",
         )
         return transcript.strip() if transcript and transcript.strip() else None
@@ -126,6 +131,35 @@ async def synthesize_speech(text: str) -> bytes:
         return b""
 
 
+def _pcm16_to_wav(pcm_data: bytes) -> bytes:
+    """Return pcm_data (raw PCM16 mono 24kHz) prefixed with a 44-byte little-endian WAV header."""
+    import struct  # function-scope import; packs the binary header fields below
+    num_channels = 1  # mono
+    sample_rate = 24000  # Hz
+    bits_per_sample = 16  # PCM16
+    byte_rate = sample_rate * num_channels * (bits_per_sample // 8)  # audio bytes per second
+    block_align = num_channels * (bits_per_sample // 8)  # bytes per frame (one sample per channel)
+    data_size = len(pcm_data)  # payload bytes; even length is not verified here
+    header_size = 44  # 12 (RIFF/WAVE) + 24 (fmt chunk) + 8 (data chunk header)
+    total_size = header_size + data_size
+
+    header = b"RIFF"
+    header += struct.pack("<I", total_size - 8)  # RIFF size omits the 8-byte "RIFF" + size prefix
+    header += b"WAVE"
+    header += b"fmt "
+    header += struct.pack("<I", 16)           # fmt chunk body size in bytes
+    header += struct.pack("<H", 1)            # audio format tag 1 = PCM
+    header += struct.pack("<H", num_channels)
+    header += struct.pack("<I", sample_rate)
+    header += struct.pack("<I", byte_rate)
+    header += struct.pack("<H", block_align)
+    header += struct.pack("<H", bits_per_sample)
+    header += b"data"
+    header += struct.pack("<I", data_size)  # length of the sample payload that follows
+
+    return header + pcm_data
+
+
 @app.websocket("/api/ws")
 async def conversation_ws(websocket: WebSocket):
     await websocket.accept()