fix(stt): correct Realtime WS model to gpt-realtime-whisper + enhance event handling for deltas/completed
- URL now uses ?model=gpt-realtime-whisper (the previous value, gpt-4o-mini-realtime-preview, was invalid)
- Cleaned up session.update (removed modalities that may not apply)
- Expanded _handle to catch input_audio_transcription.delta and .completed events
- on_error now forwards transcription errors to the frontend client
- Per AUDIT + PLAN item 1
This commit is contained in:
@@ -41,7 +41,7 @@ class WhisperStream:
|
||||
try:
|
||||
import websockets
|
||||
|
||||
url = "wss://api.openai.com/v1/realtime?model=gpt-4o-mini-realtime-preview"
|
||||
url = "wss://api.openai.com/v1/realtime?model=gpt-realtime-whisper"
|
||||
ws = await websockets.connect(
|
||||
url,
|
||||
additional_headers={
|
||||
@@ -58,7 +58,6 @@ class WhisperStream:
|
||||
await self._send({
|
||||
"type": "session.update",
|
||||
"session": {
|
||||
"modalities": ["text"], # no audio output
|
||||
"input_audio_format": "pcm16",
|
||||
"input_audio_transcription": {
|
||||
"model": "gpt-realtime-whisper",
|
||||
@@ -98,26 +97,41 @@ class WhisperStream:
|
||||
|
||||
if et == "input_audio_buffer.speech_started":
|
||||
self._transcript = ""
|
||||
logger.debug("speech_started")
|
||||
|
||||
elif et in ("conversation.item.input_audio_transcription.delta", "input_audio_buffer.transcription.delta"):
|
||||
# Partial streaming transcript
|
||||
delta = data.get("delta", "") or data.get("transcript", "")
|
||||
if delta:
|
||||
self._transcript = delta # or append if cumulative
|
||||
await self._on_delta(delta)
|
||||
|
||||
elif et in ("conversation.item.input_audio_transcription.completed", "input_audio_buffer.transcription.completed", "conversation.item.created"):
|
||||
# Final or item created with transcript
|
||||
item = data.get("item", {})
|
||||
content = item.get("content", []) if item else []
|
||||
transcript = data.get("transcript", "")
|
||||
if not transcript:
|
||||
for part in (content or []):
|
||||
if part.get("type") in ("transcript", "text"):
|
||||
transcript = part.get("transcript", "") or part.get("text", "")
|
||||
break
|
||||
if transcript:
|
||||
self._transcript = transcript
|
||||
await self._on_delta(transcript)
|
||||
await self._on_done(transcript.strip())
|
||||
self._transcript = ""
|
||||
|
||||
elif et == "input_audio_buffer.speech_stopped":
|
||||
if self._transcript.strip():
|
||||
await self._on_done(self._transcript.strip())
|
||||
self._transcript = ""
|
||||
|
||||
elif et == "conversation.item.created":
|
||||
item = data.get("item", {})
|
||||
content = item.get("content", [])
|
||||
for part in (content or []):
|
||||
pt = part.get("type", "")
|
||||
txt = part.get("transcript", "") or part.get("text", "")
|
||||
if pt == "transcript" and txt:
|
||||
self._transcript = txt
|
||||
await self._on_delta(txt)
|
||||
|
||||
elif et == "error":
|
||||
err = data.get("error", {})
|
||||
msg = err.get("message", str(data))
|
||||
logger.warning(f"Whisper error: {msg}")
|
||||
await self._on_error(msg)
|
||||
|
||||
async def send_audio(self, pcm16_bytes: bytes):
|
||||
if not self._connected:
|
||||
|
||||
Reference in New Issue
Block a user