From a19ac46312eb0e03c693b03393a672f24065044d Mon Sep 17 00:00:00 2001
From: hobokenchicken
Date: Thu, 4 Jun 2026 13:55:05 -0400
Subject: [PATCH] fix: wrap PCM16 in WAV container before STT API call

Frontend captures PCM16 mono 24kHz audio. The transcription API expects
a proper audio container format (wav, webm, etc.), not raw PCM16 data.
Added _pcm16_to_wav() to wrap the raw bytes in a WAV header before
sending to gpt-4o-mini-transcribe.
---
 backend/main.py | 38 ++++++++++++++++++++++++++++++++++++--
 1 file changed, 36 insertions(+), 2 deletions(-)

diff --git a/backend/main.py b/backend/main.py
index 681e8c8..cba6858 100644
--- a/backend/main.py
+++ b/backend/main.py
@@ -96,12 +96,17 @@ async def run_conversation(text: str, user_id: str) -> str:
 
 
 async def transcribe_audio(audio_bytes: bytes) -> str | None:
-    """Transcribe audio bytes using cheapest STT model."""
+    """Transcribe audio bytes using cheapest STT model.
+
+    Accepts raw PCM16 mono 24kHz data — wraps in WAV container automatically.
+    """
     client = get_openai()
     try:
+        # Wrap raw PCM16 in WAV container for the API
+        wav_bytes = _pcm16_to_wav(audio_bytes)
         transcript = await client.audio.transcriptions.create(
             model="gpt-4o-mini-transcribe",
-            file=("audio.webm", audio_bytes, "audio/webm"),
+            file=("audio.wav", wav_bytes, "audio/wav"),
             response_format="text",
         )
         return transcript.strip() if transcript and transcript.strip() else None
@@ -126,6 +131,35 @@ async def synthesize_speech(text: str) -> bytes:
         return b""
 
 
+def _pcm16_to_wav(pcm_data: bytes) -> bytes:
+    """Wrap raw PCM16 mono 24kHz data in a WAV container."""
+    import struct
+    num_channels = 1
+    sample_rate = 24000
+    bits_per_sample = 16
+    byte_rate = sample_rate * num_channels * (bits_per_sample // 8)
+    block_align = num_channels * (bits_per_sample // 8)
+    data_size = len(pcm_data)
+    header_size = 44
+    total_size = header_size + data_size
+
+    header = b"RIFF"
+    header += struct.pack("