diff --git a/backend/main.py b/backend/main.py index c5abf76..a54f000 100644 --- a/backend/main.py +++ b/backend/main.py @@ -179,22 +179,29 @@ async def conversation_ws(websocket: WebSocket): await websocket.send_json({"type": "error", "message": "No audio data"}) continue + import time + t0 = time.time() logger.info(f"[{session_id}] Transcribing {len(audio_buffer)} bytes...") # 1. STT transcript = await transcribe_audio(bytes(audio_buffer)) + t1 = time.time() audio_buffer.clear() if not transcript: await websocket.send_json({"type": "error", "message": "Could not transcribe"}) continue + logger.info(f"[{session_id}] STT took {t1-t0:.1f}s") + await websocket.send_json({"type": "transcript", "role": "user", "text": transcript}) conversation_history.append({"role": "user", "content": transcript}) # 2. LLM (uses cached memory_suffix from identify) logger.info(f"[{session_id}] User: {transcript}") kira_text = await run_conversation(transcript, memory_suffix) + t2 = time.time() + logger.info(f"[{session_id}] LLM took {t2-t1:.1f}s") conversation_history.append({"role": "assistant", "content": kira_text}) logger.info(f"[{session_id}] Kira: {kira_text}") @@ -207,6 +214,8 @@ async def conversation_ws(websocket: WebSocket): # 3. TTS await websocket.send_json({"type": "speaking_start", "text": kira_text}) audio_bytes = await synthesize_speech(kira_text) + t3 = time.time() + logger.info(f"[{session_id}] TTS took {t3-t2:.1f}s. Total: {t3-t0:.1f}s") audio_b64 = base64.b64encode(audio_bytes).decode("utf-8") await websocket.send_json({"type": "audio", "data": audio_b64, "text": kira_text}) await websocket.send_json({"type": "speaking_end"})