fix: wrap PCM16 in WAV container before STT API call

The frontend captures raw PCM16 mono 24 kHz audio. The transcription API expects a proper audio container format (wav, webm, etc.), not raw PCM16 data; previously the raw bytes were uploaded labelled as audio/webm even though they had no container. Added _pcm16_to_wav() to wrap the raw bytes in a WAV header before sending them to gpt-4o-mini-transcribe.
2026-06-04 13:55:05 -04:00
parent f2a5416408
commit a19ac46312
1 changed files with 36 additions and 2 deletions
@@ -96,12 +96,17 @@ async def run_conversation(text: str, user_id: str) -> str:
 async def transcribe_audio(audio_bytes: bytes) -> str | None:
-    """Transcribe audio bytes using cheapest STT model."""
+    """Transcribe audio bytes using cheapest STT model.
    Accepts raw PCM16 mono 24kHz data — wraps in WAV container automatically.
    """
    client = get_openai()
    try:
        # Wrap raw PCM16 in WAV container for the API
        wav_bytes = _pcm16_to_wav(audio_bytes)
        transcript = await client.audio.transcriptions.create(
            model="gpt-4o-mini-transcribe",
-            file=("audio.webm", audio_bytes, "audio/webm"),
+            file=("audio.wav", wav_bytes, "audio/wav"),
            response_format="text",
        )
        return transcript.strip() if transcript and transcript.strip() else None
@@ -126,6 +131,35 @@ async def synthesize_speech(text: str) -> bytes:
        return b""
 def _pcm16_to_wav(pcm_data: bytes) -> bytes:
    """Wrap raw PCM16 mono 24kHz data in a WAV container."""
    import struct
    num_channels = 1
    sample_rate = 24000
    bits_per_sample = 16
    byte_rate = sample_rate * num_channels * (bits_per_sample // 8)
    block_align = num_channels * (bits_per_sample // 8)
    data_size = len(pcm_data)
    header_size = 44
    total_size = header_size + data_size
    header = b"RIFF"
    header += struct.pack("<I", total_size - 8)
    header += b"WAVE"
    header += b"fmt "
    header += struct.pack("<I", 16)           # subchunk size
    header += struct.pack("<H", 1)            # PCM format
    header += struct.pack("<H", num_channels)
    header += struct.pack("<I", sample_rate)
    header += struct.pack("<I", byte_rate)
    header += struct.pack("<H", block_align)
    header += struct.pack("<H", bits_per_sample)
    header += b"data"
    header += struct.pack("<I", data_size)
    return header + pcm_data
@app.websocket("/api/ws")
 async def conversation_ws(websocket: WebSocket):
    await websocket.accept()