fix(stt): correct Realtime WS model to gpt-realtime-whisper + enhance event handling for deltas/completed

- URL now uses ?model=gpt-realtime-whisper (previously the invalid gpt-4o-mini-realtime-preview).
- Cleaned up session.update: removed the modalities field, which may not apply to this session type.
- Expanded _handle to catch the input_audio_transcription.delta and input_audio_transcription.completed events.
- on_error now forwards transcription errors to the frontend client.
- Per AUDIT + PLAN item 1.
2026-06-04 15:14:26 -04:00
parent 7502f201c7
commit 191b7ad9b5
4 changed files with 367 additions and 12 deletions
@@ -41,7 +41,7 @@ class WhisperStream:
        try:
            import websockets

-            url = "wss://api.openai.com/v1/realtime?model=gpt-4o-mini-realtime-preview"
+            url = "wss://api.openai.com/v1/realtime?model=gpt-realtime-whisper"
            ws = await websockets.connect(
                url,
                additional_headers={
@@ -58,7 +58,6 @@ class WhisperStream:
                await self._send({
                    "type": "session.update",
                    "session": {
-                        "modalities": ["text"],  # no audio output
                        "input_audio_format": "pcm16",
                        "input_audio_transcription": {
                            "model": "gpt-realtime-whisper",
@@ -98,26 +97,41 @@ class WhisperStream:

        if et == "input_audio_buffer.speech_started":
            self._transcript = ""
+            logger.debug("speech_started")
+
+        elif et in ("conversation.item.input_audio_transcription.delta", "input_audio_buffer.transcription.delta"):
+            # Partial streaming transcript
+            delta = data.get("delta", "") or data.get("transcript", "")
+            if delta:
+                self._transcript = delta  # or append if cumulative
+                await self._on_delta(delta)
+
+        elif et in ("conversation.item.input_audio_transcription.completed", "input_audio_buffer.transcription.completed", "conversation.item.created"):
+            # Final or item created with transcript
+            item = data.get("item", {})
+            content = item.get("content", []) if item else []
+            transcript = data.get("transcript", "")
+            if not transcript:
+                for part in (content or []):
+                    if part.get("type") in ("transcript", "text"):
+                        transcript = part.get("transcript", "") or part.get("text", "")
+                        break
+            if transcript:
+                self._transcript = transcript
+                await self._on_delta(transcript)
+                await self._on_done(transcript.strip())
+                self._transcript = ""

        elif et == "input_audio_buffer.speech_stopped":
            if self._transcript.strip():
                await self._on_done(self._transcript.strip())
            self._transcript = ""

-        elif et == "conversation.item.created":
-            item = data.get("item", {})
-            content = item.get("content", [])
-            for part in (content or []):
-                pt = part.get("type", "")
-                txt = part.get("transcript", "") or part.get("text", "")
-                if pt == "transcript" and txt:
-                    self._transcript = txt
-                    await self._on_delta(txt)
-
        elif et == "error":
            err = data.get("error", {})
            msg = err.get("message", str(data))
            logger.warning(f"Whisper error: {msg}")
+            await self._on_error(msg)

    async def send_audio(self, pcm16_bytes: bytes):
        if not self._connected: