fix: replace PCM16 capture with MediaRecorder (Opus/webm)
PCM16 capture via AudioContext streamed raw audio continuously, producing massive accumulated buffers that took ~20s to transcribe. Replaced it with MediaRecorder, which records compressed Opus/webm and sends a single blob on release — much smaller and faster to transcribe. Also removed all unused PCM16/WAV helper functions from both the frontend and the backend.
This commit is contained in:
+4
-9
@@ -96,17 +96,12 @@ async def run_conversation(text: str, user_id: str) -> str:
|
||||
|
||||
|
||||
async def transcribe_audio(audio_bytes: bytes) -> str | None:
|
||||
"""Transcribe audio bytes using cheapest STT model.
|
||||
|
||||
Accepts raw PCM16 mono 24kHz data — wraps in WAV container automatically.
|
||||
"""
|
||||
"""Transcribe Opus/webm audio using cheapest STT model."""
|
||||
client = get_openai()
|
||||
try:
|
||||
# Wrap raw PCM16 in WAV container for the API
|
||||
wav_bytes = _pcm16_to_wav(audio_bytes)
|
||||
transcript = await client.audio.transcriptions.create(
|
||||
model="gpt-4o-mini-transcribe",
|
||||
file=("audio.wav", wav_bytes, "audio/wav"),
|
||||
file=("audio.webm", audio_bytes, "audio/webm"),
|
||||
response_format="text",
|
||||
)
|
||||
return transcript.strip() if transcript and transcript.strip() else None
|
||||
@@ -211,8 +206,8 @@ async def conversation_ws(websocket: WebSocket):
|
||||
continue
|
||||
|
||||
# ── Conversation ──
|
||||
if msg_type == "audio":
|
||||
# Accumulate PCM16 audio chunks
|
||||
if msg_type == "audio_chunk":
|
||||
# Single Opus/webm blob from MediaRecorder
|
||||
chunk = base64.b64decode(msg["data"])
|
||||
audio_buffer.extend(chunk)
|
||||
|
||||
|
||||
Reference in New Issue
Block a user