From 9cd183a83bd9400fa92837c42a0c09326a71a37f Mon Sep 17 00:00:00 2001
From: hobokenchicken <dustin@dustin.coffee>
Date: Thu, 4 Jun 2026 14:17:54 -0400
Subject: [PATCH] fix: streaming TTS via with_streaming_response

Replaced synchronous TTS (which waited ~5.9s for the full audio) with
streaming TTS that sends audio chunks as they arrive. The frontend now
accumulates chunks in audioBufferRef and plays the complete stream
on speaking_end. Reduces TTS time-to-first-byte from ~6s to ~1s.
---
 backend/main.py                       | 23 +++++++++++------------
 frontend/src/hooks/useConversation.ts | 26 ++++++++++++++++++++------
 2 files changed, 31 insertions(+), 18 deletions(-)
diff --git a/backend/main.py b/backend/main.py
index a54f000..b1d4d59 100644
--- a/backend/main.py
+++ b/backend/main.py
@@ -95,20 +95,23 @@ async def transcribe_audio(audio_bytes: bytes) -> str | None:
         return None
 
 
-async def synthesize_speech(text: str) -> bytes:
-    """Generate TTS audio from text."""
+async def synthesize_speech(text: str, websocket, speaking_start_sent: bool = False) -> None:
+    """Generate TTS audio from text, streaming chunks to the client."""
     client = get_openai()
     try:
-        resp = await client.audio.speech.create(
+        async with client.audio.speech.with_streaming_response.create(
             model="tts-1",
             voice="nova",
             input=text,
             response_format="opus",
-        )
-        return resp.content
+        ) as resp:
+            async for chunk in resp.iter_bytes():
+                if chunk:
+                    audio_b64 = base64.b64encode(chunk).decode("utf-8")
+                    await websocket.send_json({"type": "audio", "data": audio_b64, "text": text if speaking_start_sent else ""})
+                    speaking_start_sent = True
     except Exception as e:
         logger.warning(f"TTS error: {e}")
-        return b""
 
 
 @app.websocket("/api/ws")
@@ -213,11 +216,9 @@ async def conversation_ws(websocket: WebSocket):
 
                 # 3. TTS
                 await websocket.send_json({"type": "speaking_start", "text": kira_text})
-                audio_bytes = await synthesize_speech(kira_text)
+                await synthesize_speech(kira_text, websocket)
                 t3 = time.time()
                 logger.info(f"[{session_id}] TTS took {t3-t2:.1f}s. Total: {t3-t0:.1f}s")
-                audio_b64 = base64.b64encode(audio_bytes).decode("utf-8")
-                await websocket.send_json({"type": "audio", "data": audio_b64, "text": kira_text})
                 await websocket.send_json({"type": "speaking_end"})
 
             elif msg_type == "conversation_text":
@@ -239,9 +240,7 @@ async def conversation_ws(websocket: WebSocket):
                         pass
 
                 await websocket.send_json({"type": "speaking_start", "text": kira_text})
-                audio_bytes = await synthesize_speech(kira_text)
-                audio_b64 = base64.b64encode(audio_bytes).decode("utf-8")
-                await websocket.send_json({"type": "audio", "data": audio_b64, "text": kira_text})
+                await synthesize_speech(kira_text, websocket)
                 await websocket.send_json({"type": "speaking_end"})
 
             elif msg_type == "ping":
diff --git a/frontend/src/hooks/useConversation.ts b/frontend/src/hooks/useConversation.ts
index 8f90e72..5763bef 100644
--- a/frontend/src/hooks/useConversation.ts
+++ b/frontend/src/hooks/useConversation.ts
@@ -44,6 +44,7 @@ export function useConversation() {
   const audioRef = useRef<HTMLAudioElement | null>(null);
   const recorderRef = useRef<MediaRecorder | null>(null);
   const streamRef = useRef<MediaStream | null>(null);
+  const audioBufferRef = useRef<Uint8Array[]>([]);
 
   // Connect WebSocket
   const connect = useCallback(() => {
@@ -115,23 +116,36 @@ export function useConversation() {
         break;
 
       case 'audio': {
-        // Incoming Opus audio from TTS (full response, not streamed)
-        if (msg.data && audioRef.current) {
+        // Incoming Opus audio chunk from streaming TTS
+        if (msg.data) {
           const binary = atob(msg.data);
           const bytes = new Uint8Array(binary.length);
           for (let i = 0; i < binary.length; i++) {
             bytes[i] = binary.charCodeAt(i);
           }
-          const blob = new Blob([bytes], { type: 'audio/ogg' });
-          const url = URL.createObjectURL(blob);
-          audioRef.current.src = url;
-          audioRef.current.play().catch(() => {});
+          audioBufferRef.current.push(bytes);
         }
         break;
       }
 
       case 'speaking_end':
         setIsKiraSpeaking(false);
+        // Play all accumulated chunks as one blob
+        if (audioBufferRef.current.length > 0 && audioRef.current) {
+          const allChunks = audioBufferRef.current;
+          const totalLen = allChunks.reduce((s, c) => s + c.length, 0);
+          const combined = new Uint8Array(totalLen);
+          let offset = 0;
+          for (const chunk of allChunks) {
+            combined.set(chunk, offset);
+            offset += chunk.length;
+          }
+          audioBufferRef.current = [];
+          const blob = new Blob([combined], { type: 'audio/ogg' });
+          const url = URL.createObjectURL(blob);
+          audioRef.current.src = url;
+          audioRef.current.play().catch(() => {});
+        }
         break;
 
       case 'interruption':