fix(stt): correct Realtime WS model to gpt-realtime-whisper and enhance handling of transcription delta/completed events

- URL now uses ?model=gpt-realtime-whisper (was the invalid gpt-4o-mini-realtime-preview)
- Cleaned up session.update (removed the "modalities" field, which may not apply)
- Expanded _handle to catch input_audio_transcription.delta and .completed events
- on_error now forwards transcription errors to the frontend client
- Per AUDIT + PLAN item 1
This commit is contained in:
2026-06-04 15:14:26 -04:00
parent 7502f201c7
commit 191b7ad9b5
4 changed files with 367 additions and 12 deletions
+26 -12
View File
@@ -41,7 +41,7 @@ class WhisperStream:
try:
import websockets
url = "wss://api.openai.com/v1/realtime?model=gpt-4o-mini-realtime-preview"
url = "wss://api.openai.com/v1/realtime?model=gpt-realtime-whisper"
ws = await websockets.connect(
url,
additional_headers={
@@ -58,7 +58,6 @@ class WhisperStream:
await self._send({
"type": "session.update",
"session": {
"modalities": ["text"], # no audio output
"input_audio_format": "pcm16",
"input_audio_transcription": {
"model": "gpt-realtime-whisper",
@@ -98,26 +97,41 @@ class WhisperStream:
if et == "input_audio_buffer.speech_started":
self._transcript = ""
logger.debug("speech_started")
elif et in ("conversation.item.input_audio_transcription.delta", "input_audio_buffer.transcription.delta"):
# Partial streaming transcript
delta = data.get("delta", "") or data.get("transcript", "")
if delta:
self._transcript = delta # or append if cumulative
await self._on_delta(delta)
elif et in ("conversation.item.input_audio_transcription.completed", "input_audio_buffer.transcription.completed", "conversation.item.created"):
# Final or item created with transcript
item = data.get("item", {})
content = item.get("content", []) if item else []
transcript = data.get("transcript", "")
if not transcript:
for part in (content or []):
if part.get("type") in ("transcript", "text"):
transcript = part.get("transcript", "") or part.get("text", "")
break
if transcript:
self._transcript = transcript
await self._on_delta(transcript)
await self._on_done(transcript.strip())
self._transcript = ""
elif et == "input_audio_buffer.speech_stopped":
if self._transcript.strip():
await self._on_done(self._transcript.strip())
self._transcript = ""
elif et == "conversation.item.created":
item = data.get("item", {})
content = item.get("content", [])
for part in (content or []):
pt = part.get("type", "")
txt = part.get("transcript", "") or part.get("text", "")
if pt == "transcript" and txt:
self._transcript = txt
await self._on_delta(txt)
elif et == "error":
err = data.get("error", {})
msg = err.get("message", str(data))
logger.warning(f"Whisper error: {msg}")
await self._on_error(msg)
async def send_audio(self, pcm16_bytes: bytes):
if not self._connected: