fix: streaming TTS via with_streaming_response

Replaced synchronous TTS (which waited for the full audio, ~5.9s) with
streaming TTS that sends audio chunks as they arrive. The frontend now
accumulates chunks in audioBufferRef and plays the complete stream
on speaking_end. Reduces TTS time to first byte from ~6s to ~1s.
This commit is contained in:
2026-06-04 14:17:54 -04:00
parent 2cd5636ad6
commit 9cd183a83b
2 changed files with 31 additions and 18 deletions
+11 -12
View File
@@ -95,20 +95,23 @@ async def transcribe_audio(audio_bytes: bytes) -> str | None:
return None return None
async def synthesize_speech(text: str) -> bytes: async def synthesize_speech(text: str, websocket, speaking_start_sent: bool = False) -> None:
"""Generate TTS audio from text.""" """Generate TTS audio from text, streaming chunks to the client."""
client = get_openai() client = get_openai()
try: try:
resp = await client.audio.speech.create( async with client.audio.speech.with_streaming_response.create(
model="tts-1", model="tts-1",
voice="nova", voice="nova",
input=text, input=text,
response_format="opus", response_format="opus",
) ) as resp:
return resp.content async for chunk in resp.iter_bytes():
if chunk:
audio_b64 = base64.b64encode(chunk).decode("utf-8")
await websocket.send_json({"type": "audio", "data": audio_b64, "text": text if speaking_start_sent else ""})
speaking_start_sent = True
except Exception as e: except Exception as e:
logger.warning(f"TTS error: {e}") logger.warning(f"TTS error: {e}")
return b""
@app.websocket("/api/ws") @app.websocket("/api/ws")
@@ -213,11 +216,9 @@ async def conversation_ws(websocket: WebSocket):
# 3. TTS # 3. TTS
await websocket.send_json({"type": "speaking_start", "text": kira_text}) await websocket.send_json({"type": "speaking_start", "text": kira_text})
audio_bytes = await synthesize_speech(kira_text) await synthesize_speech(kira_text, websocket)
t3 = time.time() t3 = time.time()
logger.info(f"[{session_id}] TTS took {t3-t2:.1f}s. Total: {t3-t0:.1f}s") logger.info(f"[{session_id}] TTS took {t3-t2:.1f}s. Total: {t3-t0:.1f}s")
audio_b64 = base64.b64encode(audio_bytes).decode("utf-8")
await websocket.send_json({"type": "audio", "data": audio_b64, "text": kira_text})
await websocket.send_json({"type": "speaking_end"}) await websocket.send_json({"type": "speaking_end"})
elif msg_type == "conversation_text": elif msg_type == "conversation_text":
@@ -239,9 +240,7 @@ async def conversation_ws(websocket: WebSocket):
pass pass
await websocket.send_json({"type": "speaking_start", "text": kira_text}) await websocket.send_json({"type": "speaking_start", "text": kira_text})
audio_bytes = await synthesize_speech(kira_text) await synthesize_speech(kira_text, websocket)
audio_b64 = base64.b64encode(audio_bytes).decode("utf-8")
await websocket.send_json({"type": "audio", "data": audio_b64, "text": kira_text})
await websocket.send_json({"type": "speaking_end"}) await websocket.send_json({"type": "speaking_end"})
elif msg_type == "ping": elif msg_type == "ping":
+20 -6
View File
@@ -44,6 +44,7 @@ export function useConversation() {
const audioRef = useRef<HTMLAudioElement | null>(null); const audioRef = useRef<HTMLAudioElement | null>(null);
const recorderRef = useRef<MediaRecorder | null>(null); const recorderRef = useRef<MediaRecorder | null>(null);
const streamRef = useRef<MediaStream | null>(null); const streamRef = useRef<MediaStream | null>(null);
const audioBufferRef = useRef<Uint8Array[]>([]);
// Connect WebSocket // Connect WebSocket
const connect = useCallback(() => { const connect = useCallback(() => {
@@ -115,23 +116,36 @@ export function useConversation() {
break; break;
case 'audio': { case 'audio': {
// Incoming Opus audio from TTS (full response, not streamed) // Incoming Opus audio chunk from streaming TTS
if (msg.data && audioRef.current) { if (msg.data) {
const binary = atob(msg.data); const binary = atob(msg.data);
const bytes = new Uint8Array(binary.length); const bytes = new Uint8Array(binary.length);
for (let i = 0; i < binary.length; i++) { for (let i = 0; i < binary.length; i++) {
bytes[i] = binary.charCodeAt(i); bytes[i] = binary.charCodeAt(i);
} }
const blob = new Blob([bytes], { type: 'audio/ogg' }); audioBufferRef.current.push(bytes);
const url = URL.createObjectURL(blob);
audioRef.current.src = url;
audioRef.current.play().catch(() => {});
} }
break; break;
} }
case 'speaking_end': case 'speaking_end':
setIsKiraSpeaking(false); setIsKiraSpeaking(false);
// Play all accumulated chunks as one blob
if (audioBufferRef.current.length > 0 && audioRef.current) {
const allChunks = audioBufferRef.current;
const totalLen = allChunks.reduce((s, c) => s + c.length, 0);
const combined = new Uint8Array(totalLen);
let offset = 0;
for (const chunk of allChunks) {
combined.set(chunk, offset);
offset += chunk.length;
}
audioBufferRef.current = [];
const blob = new Blob([combined], { type: 'audio/ogg' });
const url = URL.createObjectURL(blob);
audioRef.current.src = url;
audioRef.current.play().catch(() => {});
}
break; break;
case 'interruption': case 'interruption':