diff --git a/backend/main.py b/backend/main.py index a54f000..b1d4d59 100644 --- a/backend/main.py +++ b/backend/main.py @@ -95,20 +95,23 @@ async def transcribe_audio(audio_bytes: bytes) -> str | None: return None -async def synthesize_speech(text: str) -> bytes: - """Generate TTS audio from text.""" +async def synthesize_speech(text: str, websocket, speaking_start_sent: bool = False) -> None: + """Generate TTS audio from text, streaming chunks to the client.""" client = get_openai() try: - resp = await client.audio.speech.create( + async with client.audio.speech.with_streaming_response.create( model="tts-1", voice="nova", input=text, response_format="opus", - ) - return resp.content + ) as resp: + async for chunk in resp.iter_bytes(): + if chunk: + audio_b64 = base64.b64encode(chunk).decode("utf-8") + await websocket.send_json({"type": "audio", "data": audio_b64, "text": text if speaking_start_sent else ""}) + speaking_start_sent = True except Exception as e: logger.warning(f"TTS error: {e}") - return b"" @app.websocket("/api/ws") @@ -213,11 +216,9 @@ async def conversation_ws(websocket: WebSocket): # 3. TTS await websocket.send_json({"type": "speaking_start", "text": kira_text}) - audio_bytes = await synthesize_speech(kira_text) + await synthesize_speech(kira_text, websocket) t3 = time.time() logger.info(f"[{session_id}] TTS took {t3-t2:.1f}s. Total: {t3-t0:.1f}s") - audio_b64 = base64.b64encode(audio_bytes).decode("utf-8") - await websocket.send_json({"type": "audio", "data": audio_b64, "text": kira_text}) await websocket.send_json({"type": "speaking_end"}) elif msg_type == "conversation_text": @@ -239,9 +240,7 @@ async def conversation_ws(websocket: WebSocket): pass await websocket.send_json({"type": "speaking_start", "text": kira_text}) - audio_bytes = await synthesize_speech(kira_text) - audio_b64 = base64.b64encode(audio_bytes).decode("utf-8") - await websocket.send_json({"type": "audio", "data": audio_b64, "text": kira_text}) + await synthesize_speech(kira_text, websocket) await websocket.send_json({"type": "speaking_end"}) elif msg_type == "ping": diff --git a/frontend/src/hooks/useConversation.ts b/frontend/src/hooks/useConversation.ts index 8f90e72..5763bef 100644 --- a/frontend/src/hooks/useConversation.ts +++ b/frontend/src/hooks/useConversation.ts @@ -44,6 +44,7 @@ export function useConversation() { const audioRef = useRef(null); const recorderRef = useRef(null); const streamRef = useRef(null); + const audioBufferRef = useRef([]); // Connect WebSocket const connect = useCallback(() => { @@ -115,23 +116,36 @@ export function useConversation() { break; case 'audio': { - // Incoming Opus audio from TTS (full response, not streamed) - if (msg.data && audioRef.current) { + // Incoming Opus audio chunk from streaming TTS + if (msg.data) { const binary = atob(msg.data); const bytes = new Uint8Array(binary.length); for (let i = 0; i < binary.length; i++) { bytes[i] = binary.charCodeAt(i); } - const blob = new Blob([bytes], { type: 'audio/ogg' }); - const url = URL.createObjectURL(blob); - audioRef.current.src = url; - audioRef.current.play().catch(() => {}); + audioBufferRef.current.push(bytes); } break; } case 'speaking_end': setIsKiraSpeaking(false); + // Play all accumulated chunks as one blob + if (audioBufferRef.current.length > 0 && audioRef.current) { + const allChunks = audioBufferRef.current; + const totalLen = allChunks.reduce((s, c) => s + c.length, 0); + const combined = new Uint8Array(totalLen); + let offset = 0; + for (const chunk of allChunks) { + combined.set(chunk, offset); + offset += chunk.length; + } + audioBufferRef.current = []; + const blob = new Blob([combined], { type: 'audio/ogg' }); + const url = URL.createObjectURL(blob); + audioRef.current.src = url; + audioRef.current.play().catch(() => {}); + } break; case 'interruption':