fix: wrap PCM16 in WAV container before STT API call

Frontend captures PCM16 mono 24kHz audio. The transcription API
expects a proper audio container format (wav, webm, etc.), not raw
PCM16 data. Added _pcm16_to_wav() to wrap the raw bytes in a WAV
header before sending to gpt-4o-mini-transcribe.
This commit is contained in:
2026-06-04 13:55:05 -04:00
parent f2a5416408
commit a19ac46312
+36 -2
View File
@@ -96,12 +96,17 @@ async def run_conversation(text: str, user_id: str) -> str:
async def transcribe_audio(audio_bytes: bytes) -> str | None: async def transcribe_audio(audio_bytes: bytes) -> str | None:
"""Transcribe audio bytes using cheapest STT model.""" """Transcribe audio bytes using cheapest STT model.
Accepts raw PCM16 mono 24kHz data — wraps in WAV container automatically.
"""
client = get_openai() client = get_openai()
try: try:
# Wrap raw PCM16 in WAV container for the API
wav_bytes = _pcm16_to_wav(audio_bytes)
transcript = await client.audio.transcriptions.create( transcript = await client.audio.transcriptions.create(
model="gpt-4o-mini-transcribe", model="gpt-4o-mini-transcribe",
file=("audio.webm", audio_bytes, "audio/webm"), file=("audio.wav", wav_bytes, "audio/wav"),
response_format="text", response_format="text",
) )
return transcript.strip() if transcript and transcript.strip() else None return transcript.strip() if transcript and transcript.strip() else None
@@ -126,6 +131,35 @@ async def synthesize_speech(text: str) -> bytes:
return b"" return b""
def _pcm16_to_wav(
    pcm_data: bytes,
    *,
    sample_rate: int = 24000,
    num_channels: int = 1,
    bits_per_sample: int = 16,
) -> bytes:
    """Wrap raw little-endian PCM samples in a canonical 44-byte WAV header.

    The defaults describe PCM16 mono at 24 kHz, so existing callers that
    pass only ``pcm_data`` get exactly the same bytes as before.

    Args:
        pcm_data: Raw interleaved PCM sample bytes (no header).
        sample_rate: Samples per second per channel.
        num_channels: Number of interleaved channels.
        bits_per_sample: Bit depth of a single sample.

    Returns:
        ``RIFF``/``WAVE`` header followed by ``pcm_data`` unchanged.

    Raises:
        struct.error: If a field does not fit its fixed-width slot, e.g.
            the payload is too large for the 32-bit RIFF size fields.
    """
    import struct

    bytes_per_sample = bits_per_sample // 8
    block_align = num_channels * bytes_per_sample
    byte_rate = sample_rate * block_align
    data_size = len(pcm_data)

    # "<" selects little-endian with no alignment padding, so the packed
    # header is exactly 44 bytes: RIFF descriptor (12) + fmt chunk (24)
    # + data chunk header (8).
    header = struct.pack(
        "<4sI4s4sIHHIIHH4sI",
        b"RIFF",
        36 + data_size,  # RIFF size = whole file minus the 8-byte "RIFF"+size prefix
        b"WAVE",
        b"fmt ",
        16,  # fmt chunk payload size for plain PCM
        1,  # audio format tag 1 = uncompressed PCM
        num_channels,
        sample_rate,
        byte_rate,
        block_align,
        bits_per_sample,
        b"data",
        data_size,
    )
    return header + pcm_data
@app.websocket("/api/ws") @app.websocket("/api/ws")
async def conversation_ws(websocket: WebSocket): async def conversation_ws(websocket: WebSocket):
await websocket.accept() await websocket.accept()