fix: wrap PCM16 in WAV container before STT API call
Frontend captures PCM16 mono 24kHz audio. The transcription API expects a proper audio container format (wav, webm, etc.), not raw PCM16 data. Added _pcm16_to_wav() to wrap the raw bytes in a WAV header before sending to gpt-4o-mini-transcribe.
This commit is contained in:
+36
-2
@@ -96,12 +96,17 @@ async def run_conversation(text: str, user_id: str) -> str:
|
||||
|
||||
|
||||
async def transcribe_audio(audio_bytes: bytes) -> str | None:
|
||||
"""Transcribe audio bytes using cheapest STT model."""
|
||||
"""Transcribe audio bytes using cheapest STT model.
|
||||
|
||||
Accepts raw PCM16 mono 24kHz data — wraps in WAV container automatically.
|
||||
"""
|
||||
client = get_openai()
|
||||
try:
|
||||
# Wrap raw PCM16 in WAV container for the API
|
||||
wav_bytes = _pcm16_to_wav(audio_bytes)
|
||||
transcript = await client.audio.transcriptions.create(
|
||||
model="gpt-4o-mini-transcribe",
|
||||
file=("audio.webm", audio_bytes, "audio/webm"),
|
||||
file=("audio.wav", wav_bytes, "audio/wav"),
|
||||
response_format="text",
|
||||
)
|
||||
return transcript.strip() if transcript and transcript.strip() else None
|
||||
@@ -126,6 +131,35 @@ async def synthesize_speech(text: str) -> bytes:
|
||||
return b""
|
||||
|
||||
|
||||
def _pcm16_to_wav(pcm_data: bytes) -> bytes:
|
||||
"""Wrap raw PCM16 mono 24kHz data in a WAV container."""
|
||||
import struct
|
||||
num_channels = 1
|
||||
sample_rate = 24000
|
||||
bits_per_sample = 16
|
||||
byte_rate = sample_rate * num_channels * (bits_per_sample // 8)
|
||||
block_align = num_channels * (bits_per_sample // 8)
|
||||
data_size = len(pcm_data)
|
||||
header_size = 44
|
||||
total_size = header_size + data_size
|
||||
|
||||
header = b"RIFF"
|
||||
header += struct.pack("<I", total_size - 8)
|
||||
header += b"WAVE"
|
||||
header += b"fmt "
|
||||
header += struct.pack("<I", 16) # subchunk size
|
||||
header += struct.pack("<H", 1) # PCM format
|
||||
header += struct.pack("<H", num_channels)
|
||||
header += struct.pack("<I", sample_rate)
|
||||
header += struct.pack("<I", byte_rate)
|
||||
header += struct.pack("<H", block_align)
|
||||
header += struct.pack("<H", bits_per_sample)
|
||||
header += b"data"
|
||||
header += struct.pack("<I", data_size)
|
||||
|
||||
return header + pcm_data
|
||||
|
||||
|
||||
@app.websocket("/api/ws")
|
||||
async def conversation_ws(websocket: WebSocket):
|
||||
await websocket.accept()
|
||||
|
||||
Reference in New Issue
Block a user