fix: wrap PCM16 in WAV container before STT API call
The frontend captures raw PCM16 mono 24kHz audio, but the transcription API expects a proper audio container format (wav, webm, etc.), not raw PCM16 data. Previously the raw bytes were uploaded unchanged and labeled as audio/webm. This commit adds _pcm16_to_wav(), which wraps the raw bytes in a WAV header before they are sent to gpt-4o-mini-transcribe.
This commit is contained in:
+36
-2
@@ -96,12 +96,17 @@ async def run_conversation(text: str, user_id: str) -> str:
|
|||||||
|
|
||||||
|
|
||||||
async def transcribe_audio(audio_bytes: bytes) -> str | None:
|
async def transcribe_audio(audio_bytes: bytes) -> str | None:
|
||||||
"""Transcribe audio bytes using cheapest STT model."""
|
"""Transcribe audio bytes using cheapest STT model.
|
||||||
|
|
||||||
|
Accepts raw PCM16 mono 24kHz data — wraps in WAV container automatically.
|
||||||
|
"""
|
||||||
client = get_openai()
|
client = get_openai()
|
||||||
try:
|
try:
|
||||||
|
# Wrap raw PCM16 in WAV container for the API
|
||||||
|
wav_bytes = _pcm16_to_wav(audio_bytes)
|
||||||
transcript = await client.audio.transcriptions.create(
|
transcript = await client.audio.transcriptions.create(
|
||||||
model="gpt-4o-mini-transcribe",
|
model="gpt-4o-mini-transcribe",
|
||||||
file=("audio.webm", audio_bytes, "audio/webm"),
|
file=("audio.wav", wav_bytes, "audio/wav"),
|
||||||
response_format="text",
|
response_format="text",
|
||||||
)
|
)
|
||||||
return transcript.strip() if transcript and transcript.strip() else None
|
return transcript.strip() if transcript and transcript.strip() else None
|
||||||
@@ -126,6 +131,35 @@ async def synthesize_speech(text: str) -> bytes:
|
|||||||
return b""
|
return b""
|
||||||
|
|
||||||
|
|
||||||
|
def _pcm16_to_wav(
    pcm_data: bytes,
    sample_rate: int = 24000,
    num_channels: int = 1,
    bits_per_sample: int = 16,
) -> bytes:
    """Wrap raw little-endian PCM audio in a canonical 44-byte WAV header.

    The defaults describe PCM16 mono at 24 kHz, so existing callers that
    pass only ``pcm_data`` get the same header fields as before.

    Args:
        pcm_data: Raw interleaved PCM samples with no container.
        sample_rate: Frames per second written to the ``fmt `` chunk.
        num_channels: Channel count written to the ``fmt `` chunk.
        bits_per_sample: Bit depth of one sample (a multiple of 8).

    Returns:
        ``RIFF``/``WAVE`` bytes: a 44-byte header followed by the samples.
        A trailing partial frame (for example a dangling odd byte in PCM16
        input) is dropped so the ``data`` chunk holds whole frames only.

    Raises:
        ValueError: If the format parameters yield a non-positive frame size.
        struct.error: If the payload is too large for the 32-bit RIFF
            size fields.
    """
    import struct

    block_align = num_channels * (bits_per_sample // 8)
    if block_align <= 0:
        raise ValueError("num_channels and bits_per_sample must give a positive frame size")
    byte_rate = sample_rate * block_align

    # A partial trailing frame cannot be decoded and would leave the data
    # chunk with a size that is not a multiple of block_align (and, for an
    # odd length, an unpadded RIFF chunk), so keep whole frames only.
    data_size = len(pcm_data) - (len(pcm_data) % block_align)
    if data_size != len(pcm_data):
        pcm_data = pcm_data[:data_size]

    # One pack call for the whole header: RIFF descriptor, 16-byte PCM
    # "fmt " chunk, then the "data" chunk header.
    header = struct.pack(
        "<4sI4s4sIHHIIHH4sI",
        b"RIFF",
        36 + data_size,  # RIFF chunk size = total file size - 8
        b"WAVE",
        b"fmt ",
        16,  # fmt chunk size for plain PCM
        1,  # audio format tag 1 = uncompressed PCM
        num_channels,
        sample_rate,
        byte_rate,
        block_align,
        bits_per_sample,
        b"data",
        data_size,
    )
    return header + pcm_data
|
||||||
|
|
||||||
|
|
||||||
@app.websocket("/api/ws")
|
@app.websocket("/api/ws")
|
||||||
async def conversation_ws(websocket: WebSocket):
|
async def conversation_ws(websocket: WebSocket):
|
||||||
await websocket.accept()
|
await websocket.accept()
|
||||||
|
|||||||
Reference in New Issue
Block a user