fix: streaming TTS via with_streaming_response
Replaced synchronous TTS (which waited ~5.9s for the full audio before sending anything) with streaming TTS that sends audio chunks to the client as they arrive. The frontend now accumulates chunks in audioBufferRef and plays the complete stream on speaking_end. Reduces TTS time-to-first-byte from ~6s to ~1s.
This commit is contained in:
+11
-12
@@ -95,20 +95,23 @@ async def transcribe_audio(audio_bytes: bytes) -> str | None:
|
||||
return None
|
||||
|
||||
|
||||
async def synthesize_speech(text: str) -> bytes:
|
||||
"""Generate TTS audio from text."""
|
||||
async def synthesize_speech(text: str, websocket, speaking_start_sent: bool = False) -> None:
|
||||
"""Generate TTS audio from text, streaming chunks to the client."""
|
||||
client = get_openai()
|
||||
try:
|
||||
resp = await client.audio.speech.create(
|
||||
async with client.audio.speech.with_streaming_response.create(
|
||||
model="tts-1",
|
||||
voice="nova",
|
||||
input=text,
|
||||
response_format="opus",
|
||||
)
|
||||
return resp.content
|
||||
) as resp:
|
||||
async for chunk in resp.iter_bytes():
|
||||
if chunk:
|
||||
audio_b64 = base64.b64encode(chunk).decode("utf-8")
|
||||
await websocket.send_json({"type": "audio", "data": audio_b64, "text": text if speaking_start_sent else ""})
|
||||
speaking_start_sent = True
|
||||
except Exception as e:
|
||||
logger.warning(f"TTS error: {e}")
|
||||
return b""
|
||||
|
||||
|
||||
@app.websocket("/api/ws")
|
||||
@@ -213,11 +216,9 @@ async def conversation_ws(websocket: WebSocket):
|
||||
|
||||
# 3. TTS
|
||||
await websocket.send_json({"type": "speaking_start", "text": kira_text})
|
||||
audio_bytes = await synthesize_speech(kira_text)
|
||||
await synthesize_speech(kira_text, websocket)
|
||||
t3 = time.time()
|
||||
logger.info(f"[{session_id}] TTS took {t3-t2:.1f}s. Total: {t3-t0:.1f}s")
|
||||
audio_b64 = base64.b64encode(audio_bytes).decode("utf-8")
|
||||
await websocket.send_json({"type": "audio", "data": audio_b64, "text": kira_text})
|
||||
await websocket.send_json({"type": "speaking_end"})
|
||||
|
||||
elif msg_type == "conversation_text":
|
||||
@@ -239,9 +240,7 @@ async def conversation_ws(websocket: WebSocket):
|
||||
pass
|
||||
|
||||
await websocket.send_json({"type": "speaking_start", "text": kira_text})
|
||||
audio_bytes = await synthesize_speech(kira_text)
|
||||
audio_b64 = base64.b64encode(audio_bytes).decode("utf-8")
|
||||
await websocket.send_json({"type": "audio", "data": audio_b64, "text": kira_text})
|
||||
await synthesize_speech(kira_text, websocket)
|
||||
await websocket.send_json({"type": "speaking_end"})
|
||||
|
||||
elif msg_type == "ping":
|
||||
|
||||
Reference in New Issue
Block a user