feat(tasks): Gemini tool calling for task list management

5 tools: add_task, remove_task, complete_task, get_tasks, clear_completed_tasks
Backend stores tasks in memory per WebSocket session (the list is created when the socket connects and is not persisted). Frontend TaskList component syncs via WebSocket.
Kira can manage tasks via voice or text conversation.
This commit is contained in:
2026-06-06 00:01:52 -04:00
parent 3dd3032ffa
commit cbeec65637
5 changed files with 227 additions and 29 deletions
+157 -26
View File
@@ -1,15 +1,13 @@
"""Kira — AI body double backend """Kira — AI body double backend
Gemini Live API (gemini-3.1-flash-live-preview) for real-time voice. Gemini Live API (gemini-3.1-flash-live-preview) for real-time voice.
Text chat still goes through Gemini generateContent REST endpoint. Task list management via Gemini function calling.
""" """
import json import json
import base64
import uuid import uuid
import logging import logging
import asyncio import asyncio
import struct
import websockets import websockets
from fastapi import FastAPI, WebSocket, WebSocketDisconnect from fastapi import FastAPI, WebSocket, WebSocketDisconnect
@@ -38,11 +36,118 @@ BASE_SYSTEM_PROMPT = (
"Check in on them. Remind them to take breaks. Celebrate small wins. " "Check in on them. Remind them to take breaks. Celebrate small wins. "
"Use occasional emoji but don't overdo it. Never be judgmental. " "Use occasional emoji but don't overdo it. Never be judgmental. "
"You are speaking out loud via voice, so keep natural conversational flow. " "You are speaking out loud via voice, so keep natural conversational flow. "
"You have tools to manage a task list for the user. When they mention tasks, "
"todos, things to do, or things to remember, use the task tools to help. "
"Always confirm when you add or remove something. When asked what's on the "
"list, read it back to them."
) )
GEMINI_WS_URL = "wss://generativelanguage.googleapis.com/ws/google.ai.generativelanguage.v1beta.GenerativeService.BidiGenerateContent" GEMINI_WS_URL = "wss://generativelanguage.googleapis.com/ws/google.ai.generativelanguage.v1beta.GenerativeService.BidiGenerateContent"
GEMINI_MODEL = "models/gemini-3.1-flash-live-preview" GEMINI_MODEL = "models/gemini-3.1-flash-live-preview"
# ── Gemini tool declarations ──
TOOLS = [
{
"functionDeclarations": [
{
"name": "add_task",
"description": "Add a new task to the user's task list.",
"parameters": {
"type": "OBJECT",
"properties": {
"text": {
"type": "STRING",
"description": "The task description.",
}
},
"required": ["text"],
},
},
{
"name": "remove_task",
"description": "Remove a task from the list by its ID.",
"parameters": {
"type": "OBJECT",
"properties": {
"task_id": {
"type": "STRING",
"description": "The ID of the task to remove.",
}
},
"required": ["task_id"],
},
},
{
"name": "complete_task",
"description": "Mark a task as completed.",
"parameters": {
"type": "OBJECT",
"properties": {
"task_id": {
"type": "STRING",
"description": "The ID of the task to mark complete.",
}
},
"required": ["task_id"],
},
},
{
"name": "get_tasks",
"description": "Get the current task list.",
"parameters": {
"type": "OBJECT",
"properties": {},
},
},
{
"name": "clear_completed_tasks",
"description": "Remove all completed tasks from the list.",
"parameters": {
"type": "OBJECT",
"properties": {},
},
},
]
}
]
def execute_tool(name: str, args: dict, tasks: list[dict]) -> dict:
    """Run one task-list tool call against ``tasks`` and return its result.

    ``tasks`` is modified in place by ``add_task``, ``remove_task``,
    ``complete_task`` and ``clear_completed_tasks``; ``get_tasks`` only
    reads it.

    Args:
        name: Tool name, one of ``add_task``, ``remove_task``,
            ``complete_task``, ``get_tasks`` or ``clear_completed_tasks``.
        args: Arguments supplied with the call. ``None``, missing keys and
            a non-string ``text`` are tolerated rather than raising.
        tasks: The task dicts, each with ``id``, ``text`` and ``completed``.

    Returns:
        A dict describing the outcome. Failures (empty or invalid task text,
        unknown task id, unknown tool name) are reported as
        ``{"error": "<message>"}`` instead of raising.
    """
    # Arguments come from the model's function call, so they may be null or
    # malformed; normalise them instead of letting AttributeError escape.
    if not isinstance(args, dict):
        args = {}

    if name == "add_task":
        raw_text = args.get("text")
        text = raw_text.strip() if isinstance(raw_text, str) else ""
        if not text:
            return {"error": "Task text cannot be empty."}
        # Short id: the first 8 characters of a UUID4 string.
        task = {"id": str(uuid.uuid4())[:8], "text": text, "completed": False}
        tasks.append(task)
        return {"status": "added", "task": task, "total": len(tasks)}

    if name == "remove_task":
        task_id = args.get("task_id", "")
        for i, t in enumerate(tasks):
            if t["id"] == task_id:
                # Returning right after the pop keeps the in-loop mutation safe.
                removed = tasks.pop(i)
                return {"status": "removed", "task": removed, "total": len(tasks)}
        return {"error": f"Task {task_id} not found."}

    if name == "complete_task":
        task_id = args.get("task_id", "")
        for t in tasks:
            if t["id"] == task_id:
                t["completed"] = True
                return {"status": "completed", "task": t}
        return {"error": f"Task {task_id} not found."}

    if name == "get_tasks":
        return {"tasks": tasks, "total": len(tasks)}

    if name == "clear_completed_tasks":
        before = len(tasks)
        # Slice assignment keeps the same list object that the caller holds.
        tasks[:] = [t for t in tasks if not t["completed"]]
        return {"status": "cleared", "removed": before - len(tasks), "total": len(tasks)}

    return {"error": f"Unknown tool: {name}"}
@app.on_event("startup") @app.on_event("startup")
async def startup(): async def startup():
@@ -62,33 +167,37 @@ async def health():
async def gemini_voice_ws(websocket: WebSocket): async def gemini_voice_ws(websocket: WebSocket):
"""WebSocket proxy between frontend and Gemini Live API. """WebSocket proxy between frontend and Gemini Live API.
Protocol (frontend this proxy): Protocol (frontend <-> this proxy):
{"type": "audio", "data": "<base64 PCM16 16kHz>"} -> {"type": "audio", "data": "<base64 PCM16 16kHz>"}
{"type": "conversation_text", "text": "..."} -> {"type": "conversation_text", "text": "..."}
{"type": "identify", "user_id": "...", "name": "..."} -> {"type": "identify", "user_id": "...", "name": "..."}
{"type": "ping"} -> {"type": "ping"}
{"type": "audio", "data": "<base64 PCM16 24kHz>"} <- {"type": "audio", "data": "<base64 PCM16 24kHz>"}
{"type": "transcript", "role": "user"|"kira", "text": "..."} <- {"type": "transcript", "role": "user"|"kira", "text": "..."}
{"type": "turn_complete"} <- {"type": "turn_complete"}
{"type": "interrupted"} <- {"type": "interrupted"}
{"type": "error", "message": "..."} <- {"type": "tasks", "tasks": [...]}
<- {"type": "error", "message": "..."}
""" """
await websocket.accept() await websocket.accept()
session_id = str(uuid.uuid4())[:8] session_id = str(uuid.uuid4())[:8]
user_id = "default-user" user_id = "default-user"
memory_suffix = "" memory_suffix = ""
tasks: list[dict] = []
logger.info(f"[{session_id}] WebSocket connected") logger.info(f"[{session_id}] WebSocket connected")
gemini_ws = None gemini_ws = None
gemini_task = None
frontend_task = None async def send_tasks_to_frontend():
"""Push current task list to frontend."""
await websocket.send_json({"type": "tasks", "tasks": tasks})
try: try:
# ── Connect to Gemini Live API ── # ── Connect to Gemini Live API ──
gemini_url = f"{GEMINI_WS_URL}?key={settings.gemini_api_key}" gemini_url = f"{GEMINI_WS_URL}?key={settings.gemini_api_key}"
gemini_ws = await websockets.connect(gemini_url, max_size=2**24) gemini_ws = await websockets.connect(gemini_url, max_size=2**24)
# ── Send setup ── # ── Send setup with tools ──
system_prompt = BASE_SYSTEM_PROMPT system_prompt = BASE_SYSTEM_PROMPT
setup_msg = { setup_msg = {
"setup": { "setup": {
@@ -106,6 +215,7 @@ async def gemini_voice_ws(websocket: WebSocket):
"systemInstruction": { "systemInstruction": {
"parts": [{"text": system_prompt}] "parts": [{"text": system_prompt}]
}, },
"tools": TOOLS,
} }
} }
await gemini_ws.send(json.dumps(setup_msg)) await gemini_ws.send(json.dumps(setup_msg))
@@ -119,7 +229,7 @@ async def gemini_voice_ws(websocket: WebSocket):
else: else:
logger.warning(f"[{session_id}] Unexpected setup response: {list(setup_resp.keys())}") logger.warning(f"[{session_id}] Unexpected setup response: {list(setup_resp.keys())}")
# ── Gemini Frontend relay ── # ── Gemini -> Frontend relay ──
async def relay_gemini(): async def relay_gemini():
try: try:
async for raw in gemini_ws: async for raw in gemini_ws:
@@ -131,7 +241,6 @@ async def gemini_voice_ws(websocket: WebSocket):
parts = model_turn.get("parts", []) parts = model_turn.get("parts", [])
for part in parts: for part in parts:
# Text response
if "text" in part: if "text" in part:
await websocket.send_json({ await websocket.send_json({
"type": "transcript", "type": "transcript",
@@ -139,7 +248,6 @@ async def gemini_voice_ws(websocket: WebSocket):
"text": part["text"], "text": part["text"],
}) })
# Audio response (PCM16 24kHz)
if "inlineData" in part: if "inlineData" in part:
audio_data = part["inlineData"].get("data", "") audio_data = part["inlineData"].get("data", "")
if audio_data: if audio_data:
@@ -148,16 +256,40 @@ async def gemini_voice_ws(websocket: WebSocket):
"data": audio_data, "data": audio_data,
}) })
# Turn complete
if sc.get("turnComplete"): if sc.get("turnComplete"):
await websocket.send_json({"type": "turn_complete"}) await websocket.send_json({"type": "turn_complete"})
# Interrupted
if sc.get("interrupted"): if sc.get("interrupted"):
await websocket.send_json({"type": "interrupted"}) await websocket.send_json({"type": "interrupted"})
elif "toolCall" in msg: elif "toolCall" in msg:
pass # future: tool use # Execute tool calls and send responses back
tool_call = msg["toolCall"]
function_calls = tool_call.get("functionCalls", [])
tool_results = []
for fc in function_calls:
call_id = fc.get("id", "")
fn_name = fc.get("name", "")
fn_args = fc.get("args", {})
logger.info(f"[{session_id}] Tool call: {fn_name}({fn_args})")
result = execute_tool(fn_name, fn_args, tasks)
tool_results.append({
"id": call_id,
"name": fn_name,
"response": result,
})
# Push updated task list to frontend after any mutation
if fn_name in ("add_task", "remove_task", "complete_task", "clear_completed_tasks"):
await send_tasks_to_frontend()
# Send tool response back to Gemini
if tool_results:
resp = {"toolResponse": {"functionResponses": tool_results}}
await gemini_ws.send(json.dumps(resp))
logger.info(f"[{session_id}] Sent {len(tool_results)} tool responses")
elif "toolCallCancellation" in msg: elif "toolCallCancellation" in msg:
pass pass
@@ -175,7 +307,7 @@ async def gemini_voice_ws(websocket: WebSocket):
except Exception as e: except Exception as e:
logger.error(f"[{session_id}] Gemini relay error: {e}") logger.error(f"[{session_id}] Gemini relay error: {e}")
# ── Frontend Gemini relay ── # ── Frontend -> Gemini relay ──
async def relay_frontend(): async def relay_frontend():
nonlocal user_id, memory_suffix nonlocal user_id, memory_suffix
try: try:
@@ -205,6 +337,8 @@ async def gemini_voice_ws(websocket: WebSocket):
"user_id": user_id, "user_id": user_id,
"preferences": prefs, "preferences": prefs,
}) })
# Send current tasks on identify
await send_tasks_to_frontend()
continue continue
if msg_type == "set_preference": if msg_type == "set_preference":
@@ -219,7 +353,6 @@ async def gemini_voice_ws(websocket: WebSocket):
continue continue
if msg_type == "audio": if msg_type == "audio":
# Forward PCM16 audio to Gemini as realtimeInput
audio_b64 = msg.get("data", "") audio_b64 = msg.get("data", "")
if audio_b64 and gemini_ws and gemini_ws.state.name == "OPEN": if audio_b64 and gemini_ws and gemini_ws.state.name == "OPEN":
gemini_msg = { gemini_msg = {
@@ -238,7 +371,6 @@ async def gemini_voice_ws(websocket: WebSocket):
if not text: if not text:
continue continue
logger.info(f"[{session_id}] User (text): {text}") logger.info(f"[{session_id}] User (text): {text}")
# Send as a text turn to Gemini
if gemini_ws and gemini_ws.state.name == "OPEN": if gemini_ws and gemini_ws.state.name == "OPEN":
user_part = {"text": text} user_part = {"text": text}
if memory_suffix: if memory_suffix:
@@ -268,7 +400,6 @@ async def gemini_voice_ws(websocket: WebSocket):
gemini_task = asyncio.create_task(relay_gemini()) gemini_task = asyncio.create_task(relay_gemini())
frontend_task = asyncio.create_task(relay_frontend()) frontend_task = asyncio.create_task(relay_frontend())
# Wait for either to finish
done, pending = await asyncio.wait( done, pending = await asyncio.wait(
[gemini_task, frontend_task], [gemini_task, frontend_task],
return_when=asyncio.FIRST_COMPLETED, return_when=asyncio.FIRST_COMPLETED,
+3 -1
View File
@@ -1,5 +1,6 @@
import { useState, useEffect, useRef } from 'react'; import { useState, useEffect, useRef } from 'react';
import MusicPlayer from './components/MusicPlayer'; import MusicPlayer from './components/MusicPlayer';
import TaskList from './components/TaskList';
import Timer from './components/Timer'; import Timer from './components/Timer';
import Notes from './components/Notes'; import Notes from './components/Notes';
import WhiteNoise from './components/WhiteNoise'; import WhiteNoise from './components/WhiteNoise';
@@ -27,7 +28,7 @@ export default function App() {
sendText, sendText,
startRecording, startRecording,
stopRecording, stopRecording,
tasks,
} = useConversation(); } = useConversation();
const [currentSceneId, setCurrentSceneId] = useState('cozy-room'); const [currentSceneId, setCurrentSceneId] = useState('cozy-room');
@@ -143,6 +144,7 @@ export default function App() {
</div> </div>
<div className="shrink-0"> <div className="shrink-0">
<Notes /> <Notes />
<TaskList tasks={tasks} />
</div> </div>
<div className="shrink-0"> <div className="shrink-0">
<ChatBubble messages={messages} isKiraSpeaking={isKiraSpeaking} userName={userName} /> <ChatBubble messages={messages} isKiraSpeaking={isKiraSpeaking} userName={userName} />
+53
View File
@@ -0,0 +1,53 @@
import { Task } from '../hooks/useConversation';
/** Props accepted by the TaskList component. */
interface Props {
  /** Tasks to display; the component splits them by their `completed` flag. */
  tasks: Task[];
}
/**
 * Read-only task panel.
 *
 * Renders open tasks first, then completed tasks struck through, with a
 * badge showing the number of open tasks and a placeholder when the list
 * is empty.
 */
export default function TaskList({ tasks }: Props) {
  // Partition in a single pass, keeping the incoming order within each group.
  const openTasks: Task[] = [];
  const finishedTasks: Task[] = [];
  for (const task of tasks) {
    (task.completed ? finishedTasks : openTasks).push(task);
  }

  const hasOpen = openTasks.length > 0;
  const hasFinished = finishedTasks.length > 0;
  const isEmpty = tasks.length === 0;

  return (
    <div className="p-3">
      <h3 className="text-sm font-bold text-kira-plum mb-2 flex items-center gap-2">
        <span></span> Tasks
        {hasOpen ? (
          <span className="text-xs font-normal bg-kira-pink/30 text-kira-plum px-1.5 py-0.5 rounded-full">
            {openTasks.length}
          </span>
        ) : null}
      </h3>

      {isEmpty ? (
        <div className="text-xs text-kira-plum/30 text-center py-4">
          tell Kira what you need to do!
        </div>
      ) : null}

      {hasOpen ? (
        <ul className="space-y-1.5 mb-2">
          {openTasks.map((task) => (
            <li key={task.id} className="flex items-start gap-2 text-sm text-kira-plum">
              <span className="mt-0.5 shrink-0 w-4 h-4 rounded-full border-2 border-kira-pink/40" />
              <span className="leading-snug">{task.text}</span>
            </li>
          ))}
        </ul>
      ) : null}

      {hasFinished ? (
        <ul className="space-y-1">
          {finishedTasks.map((task) => (
            <li key={task.id} className="flex items-start gap-2 text-sm text-kira-plum/40 line-through">
              <span className="mt-0.5 shrink-0 w-4 h-4 rounded-full bg-kira-mint flex items-center justify-center">
                <span className="text-[8px] text-white"></span>
              </span>
              <span className="leading-snug">{task.text}</span>
            </li>
          ))}
        </ul>
      ) : null}
    </div>
  );
}
+12
View File
@@ -7,6 +7,12 @@ export interface UserPreferences {
accessory: string; accessory: string;
} }
/** One task-list entry, as delivered by the backend in `tasks` messages. */
export interface Task {
  /** Short identifier assigned by the backend; used as the React list key. */
  id: string;
  /** The task description shown to the user. */
  text: string;
  /** True once the task has been marked complete. */
  completed: boolean;
}
interface Message { interface Message {
id: string; id: string;
role: 'user' | 'kira'; role: 'user' | 'kira';
@@ -63,6 +69,7 @@ export function useConversation() {
}); });
const [loadingPrefs, setLoadingPrefs] = useState(true); const [loadingPrefs, setLoadingPrefs] = useState(true);
const [micError, setMicError] = useState<string | null>(null); const [micError, setMicError] = useState<string | null>(null);
const [tasks, setTasks] = useState<Task[]>([]);
const wsRef = useRef<WebSocket | null>(null); const wsRef = useRef<WebSocket | null>(null);
const streamRef = useRef<MediaStream | null>(null); const streamRef = useRef<MediaStream | null>(null);
@@ -159,6 +166,10 @@ export function useConversation() {
case 'error': case 'error':
console.error('[Kira]', msg.message); console.error('[Kira]', msg.message);
break; break;
case 'tasks':
if (msg.tasks) setTasks(msg.tasks);
break;
} }
}, []); }, []);
@@ -319,5 +330,6 @@ export function useConversation() {
sendText, sendText,
startRecording, startRecording,
stopRecording, stopRecording,
tasks,
}; };
} }
+1 -1
View File
@@ -1 +1 @@
{"root":["./src/App.tsx","./src/main.tsx","./src/vite-env.d.ts","./src/components/AnimatedAvatar.tsx","./src/components/BackgroundScene.tsx","./src/components/ChatBubble.tsx","./src/components/Clock.tsx","./src/components/KiraAvatar.tsx","./src/components/Live2DCat.tsx","./src/components/Live2DStage.tsx","./src/components/MusicPlayer.tsx","./src/components/Notes.tsx","./src/components/Particles.tsx","./src/components/PetZone.tsx","./src/components/Timer.tsx","./src/components/Toolbar.tsx","./src/components/Wardrobe.tsx","./src/components/WelcomeScreen.tsx","./src/components/WhiteNoise.tsx","./src/components/scenes.ts","./src/hooks/useConversation.ts","./src/types/index.ts"],"version":"6.0.3"} {"root":["./src/App.tsx","./src/main.tsx","./src/vite-env.d.ts","./src/components/AnimatedAvatar.tsx","./src/components/BackgroundScene.tsx","./src/components/ChatBubble.tsx","./src/components/Clock.tsx","./src/components/KiraAvatar.tsx","./src/components/Live2DCat.tsx","./src/components/Live2DStage.tsx","./src/components/MusicPlayer.tsx","./src/components/Notes.tsx","./src/components/Particles.tsx","./src/components/PetZone.tsx","./src/components/TaskList.tsx","./src/components/Timer.tsx","./src/components/Toolbar.tsx","./src/components/Wardrobe.tsx","./src/components/WelcomeScreen.tsx","./src/components/WhiteNoise.tsx","./src/components/scenes.ts","./src/hooks/useConversation.ts","./src/types/index.ts"],"version":"6.0.3"}