feat(tasks): Gemini tool calling for task list management

5 tools: add_task, remove_task, complete_task, get_tasks, clear_completed_tasks
Backend stores tasks in-memory per session. Frontend TaskList component syncs via WS.
Kira can manage tasks via voice or text conversation.
This commit is contained in:
2026-06-06 00:01:52 -04:00
parent 3dd3032ffa
commit cbeec65637
5 changed files with 227 additions and 29 deletions
+157 -26
View File
@@ -1,15 +1,13 @@
"""Kira — AI body double backend
Gemini Live API (gemini-3.1-flash-live-preview) for real-time voice.
Text chat still goes through Gemini generateContent REST endpoint.
Task list management via Gemini function calling.
"""
import json
import base64
import uuid
import logging
import asyncio
import struct
import websockets
from fastapi import FastAPI, WebSocket, WebSocketDisconnect
@@ -38,11 +36,118 @@ BASE_SYSTEM_PROMPT = (
"Check in on them. Remind them to take breaks. Celebrate small wins. "
"Use occasional emoji but don't overdo it. Never be judgmental. "
"You are speaking out loud via voice, so keep natural conversational flow. "
"You have tools to manage a task list for the user. When they mention tasks, "
"todos, things to do, or things to remember, use the task tools to help. "
"Always confirm when you add or remove something. When asked what's on the "
"list, read it back to them."
)
GEMINI_WS_URL = "wss://generativelanguage.googleapis.com/ws/google.ai.generativelanguage.v1beta.GenerativeService.BidiGenerateContent"
GEMINI_MODEL = "models/gemini-3.1-flash-live-preview"
# ── Gemini tool declarations ──
TOOLS = [
{
"functionDeclarations": [
{
"name": "add_task",
"description": "Add a new task to the user's task list.",
"parameters": {
"type": "OBJECT",
"properties": {
"text": {
"type": "STRING",
"description": "The task description.",
}
},
"required": ["text"],
},
},
{
"name": "remove_task",
"description": "Remove a task from the list by its ID.",
"parameters": {
"type": "OBJECT",
"properties": {
"task_id": {
"type": "STRING",
"description": "The ID of the task to remove.",
}
},
"required": ["task_id"],
},
},
{
"name": "complete_task",
"description": "Mark a task as completed.",
"parameters": {
"type": "OBJECT",
"properties": {
"task_id": {
"type": "STRING",
"description": "The ID of the task to mark complete.",
}
},
"required": ["task_id"],
},
},
{
"name": "get_tasks",
"description": "Get the current task list.",
"parameters": {
"type": "OBJECT",
"properties": {},
},
},
{
"name": "clear_completed_tasks",
"description": "Remove all completed tasks from the list.",
"parameters": {
"type": "OBJECT",
"properties": {},
},
},
]
}
]
def execute_tool(name: str, args: dict, tasks: list[dict]) -> dict:
"""Execute a tool call and return the result."""
if name == "add_task":
text = args.get("text", "").strip()
if not text:
return {"error": "Task text cannot be empty."}
task = {"id": str(uuid.uuid4())[:8], "text": text, "completed": False}
tasks.append(task)
return {"status": "added", "task": task, "total": len(tasks)}
elif name == "remove_task":
task_id = args.get("task_id", "")
for i, t in enumerate(tasks):
if t["id"] == task_id:
removed = tasks.pop(i)
return {"status": "removed", "task": removed, "total": len(tasks)}
return {"error": f"Task {task_id} not found."}
elif name == "complete_task":
task_id = args.get("task_id", "")
for t in tasks:
if t["id"] == task_id:
t["completed"] = True
return {"status": "completed", "task": t}
return {"error": f"Task {task_id} not found."}
elif name == "get_tasks":
return {"tasks": tasks, "total": len(tasks)}
elif name == "clear_completed_tasks":
before = len(tasks)
tasks[:] = [t for t in tasks if not t["completed"]]
return {"status": "cleared", "removed": before - len(tasks), "total": len(tasks)}
return {"error": f"Unknown tool: {name}"}
@app.on_event("startup")
async def startup():
@@ -62,33 +167,37 @@ async def health():
async def gemini_voice_ws(websocket: WebSocket):
"""WebSocket proxy between frontend and Gemini Live API.
Protocol (frontend this proxy):
{"type": "audio", "data": "<base64 PCM16 16kHz>"}
{"type": "conversation_text", "text": "..."}
{"type": "identify", "user_id": "...", "name": "..."}
{"type": "ping"}
{"type": "audio", "data": "<base64 PCM16 24kHz>"}
{"type": "transcript", "role": "user"|"kira", "text": "..."}
{"type": "turn_complete"}
{"type": "interrupted"}
{"type": "error", "message": "..."}
Protocol (frontend <-> this proxy):
-> {"type": "audio", "data": "<base64 PCM16 16kHz>"}
-> {"type": "conversation_text", "text": "..."}
-> {"type": "identify", "user_id": "...", "name": "..."}
-> {"type": "ping"}
<- {"type": "audio", "data": "<base64 PCM16 24kHz>"}
<- {"type": "transcript", "role": "user"|"kira", "text": "..."}
<- {"type": "turn_complete"}
<- {"type": "interrupted"}
<- {"type": "tasks", "tasks": [...]}
<- {"type": "error", "message": "..."}
"""
await websocket.accept()
session_id = str(uuid.uuid4())[:8]
user_id = "default-user"
memory_suffix = ""
tasks: list[dict] = []
logger.info(f"[{session_id}] WebSocket connected")
gemini_ws = None
gemini_task = None
frontend_task = None
async def send_tasks_to_frontend():
"""Push current task list to frontend."""
await websocket.send_json({"type": "tasks", "tasks": tasks})
try:
# ── Connect to Gemini Live API ──
gemini_url = f"{GEMINI_WS_URL}?key={settings.gemini_api_key}"
gemini_ws = await websockets.connect(gemini_url, max_size=2**24)
# ── Send setup ──
# ── Send setup with tools ──
system_prompt = BASE_SYSTEM_PROMPT
setup_msg = {
"setup": {
@@ -106,6 +215,7 @@ async def gemini_voice_ws(websocket: WebSocket):
"systemInstruction": {
"parts": [{"text": system_prompt}]
},
"tools": TOOLS,
}
}
await gemini_ws.send(json.dumps(setup_msg))
@@ -119,7 +229,7 @@ async def gemini_voice_ws(websocket: WebSocket):
else:
logger.warning(f"[{session_id}] Unexpected setup response: {list(setup_resp.keys())}")
# ── Gemini Frontend relay ──
# ── Gemini -> Frontend relay ──
async def relay_gemini():
try:
async for raw in gemini_ws:
@@ -131,7 +241,6 @@ async def gemini_voice_ws(websocket: WebSocket):
parts = model_turn.get("parts", [])
for part in parts:
# Text response
if "text" in part:
await websocket.send_json({
"type": "transcript",
@@ -139,7 +248,6 @@ async def gemini_voice_ws(websocket: WebSocket):
"text": part["text"],
})
# Audio response (PCM16 24kHz)
if "inlineData" in part:
audio_data = part["inlineData"].get("data", "")
if audio_data:
@@ -148,16 +256,40 @@ async def gemini_voice_ws(websocket: WebSocket):
"data": audio_data,
})
# Turn complete
if sc.get("turnComplete"):
await websocket.send_json({"type": "turn_complete"})
# Interrupted
if sc.get("interrupted"):
await websocket.send_json({"type": "interrupted"})
elif "toolCall" in msg:
pass # future: tool use
# Execute tool calls and send responses back
tool_call = msg["toolCall"]
function_calls = tool_call.get("functionCalls", [])
tool_results = []
for fc in function_calls:
call_id = fc.get("id", "")
fn_name = fc.get("name", "")
fn_args = fc.get("args", {})
logger.info(f"[{session_id}] Tool call: {fn_name}({fn_args})")
result = execute_tool(fn_name, fn_args, tasks)
tool_results.append({
"id": call_id,
"name": fn_name,
"response": result,
})
# Push updated task list to frontend after any mutation
if fn_name in ("add_task", "remove_task", "complete_task", "clear_completed_tasks"):
await send_tasks_to_frontend()
# Send tool response back to Gemini
if tool_results:
resp = {"toolResponse": {"functionResponses": tool_results}}
await gemini_ws.send(json.dumps(resp))
logger.info(f"[{session_id}] Sent {len(tool_results)} tool responses")
elif "toolCallCancellation" in msg:
pass
@@ -175,7 +307,7 @@ async def gemini_voice_ws(websocket: WebSocket):
except Exception as e:
logger.error(f"[{session_id}] Gemini relay error: {e}")
# ── Frontend Gemini relay ──
# ── Frontend -> Gemini relay ──
async def relay_frontend():
nonlocal user_id, memory_suffix
try:
@@ -205,6 +337,8 @@ async def gemini_voice_ws(websocket: WebSocket):
"user_id": user_id,
"preferences": prefs,
})
# Send current tasks on identify
await send_tasks_to_frontend()
continue
if msg_type == "set_preference":
@@ -219,7 +353,6 @@ async def gemini_voice_ws(websocket: WebSocket):
continue
if msg_type == "audio":
# Forward PCM16 audio to Gemini as realtimeInput
audio_b64 = msg.get("data", "")
if audio_b64 and gemini_ws and gemini_ws.state.name == "OPEN":
gemini_msg = {
@@ -238,7 +371,6 @@ async def gemini_voice_ws(websocket: WebSocket):
if not text:
continue
logger.info(f"[{session_id}] User (text): {text}")
# Send as a text turn to Gemini
if gemini_ws and gemini_ws.state.name == "OPEN":
user_part = {"text": text}
if memory_suffix:
@@ -268,7 +400,6 @@ async def gemini_voice_ws(websocket: WebSocket):
gemini_task = asyncio.create_task(relay_gemini())
frontend_task = asyncio.create_task(relay_frontend())
# Wait for either to finish
done, pending = await asyncio.wait(
[gemini_task, frontend_task],
return_when=asyncio.FIRST_COMPLETED,
+3 -1
View File
@@ -1,5 +1,6 @@
import { useState, useEffect, useRef } from 'react';
import MusicPlayer from './components/MusicPlayer';
import TaskList from './components/TaskList';
import Timer from './components/Timer';
import Notes from './components/Notes';
import WhiteNoise from './components/WhiteNoise';
@@ -27,7 +28,7 @@ export default function App() {
sendText,
startRecording,
stopRecording,
tasks,
} = useConversation();
const [currentSceneId, setCurrentSceneId] = useState('cozy-room');
@@ -143,6 +144,7 @@ export default function App() {
</div>
<div className="shrink-0">
<Notes />
<TaskList tasks={tasks} />
</div>
<div className="shrink-0">
<ChatBubble messages={messages} isKiraSpeaking={isKiraSpeaking} userName={userName} />
+53
View File
@@ -0,0 +1,53 @@
import { Task } from '../hooks/useConversation';
interface Props {
tasks: Task[];
}
export default function TaskList({ tasks }: Props) {
const pending = tasks.filter((t) => !t.completed);
const done = tasks.filter((t) => t.completed);
return (
<div className="p-3">
<h3 className="text-sm font-bold text-kira-plum mb-2 flex items-center gap-2">
<span></span> Tasks
{pending.length > 0 && (
<span className="text-xs font-normal bg-kira-pink/30 text-kira-plum px-1.5 py-0.5 rounded-full">
{pending.length}
</span>
)}
</h3>
{tasks.length === 0 && (
<div className="text-xs text-kira-plum/30 text-center py-4">
tell Kira what you need to do!
</div>
)}
{pending.length > 0 && (
<ul className="space-y-1.5 mb-2">
{pending.map((t) => (
<li key={t.id} className="flex items-start gap-2 text-sm text-kira-plum">
<span className="mt-0.5 shrink-0 w-4 h-4 rounded-full border-2 border-kira-pink/40" />
<span className="leading-snug">{t.text}</span>
</li>
))}
</ul>
)}
{done.length > 0 && (
<ul className="space-y-1">
{done.map((t) => (
<li key={t.id} className="flex items-start gap-2 text-sm text-kira-plum/40 line-through">
<span className="mt-0.5 shrink-0 w-4 h-4 rounded-full bg-kira-mint flex items-center justify-center">
<span className="text-[8px] text-white"></span>
</span>
<span className="leading-snug">{t.text}</span>
</li>
))}
</ul>
)}
</div>
);
}
+12
View File
@@ -7,6 +7,12 @@ export interface UserPreferences {
accessory: string;
}
export interface Task {
id: string;
text: string;
completed: boolean;
}
interface Message {
id: string;
role: 'user' | 'kira';
@@ -63,6 +69,7 @@ export function useConversation() {
});
const [loadingPrefs, setLoadingPrefs] = useState(true);
const [micError, setMicError] = useState<string | null>(null);
const [tasks, setTasks] = useState<Task[]>([]);
const wsRef = useRef<WebSocket | null>(null);
const streamRef = useRef<MediaStream | null>(null);
@@ -159,6 +166,10 @@ export function useConversation() {
case 'error':
console.error('[Kira]', msg.message);
break;
case 'tasks':
if (msg.tasks) setTasks(msg.tasks);
break;
}
}, []);
@@ -319,5 +330,6 @@ export function useConversation() {
sendText,
startRecording,
stopRecording,
tasks,
};
}
+1 -1
View File
@@ -1 +1 @@
{"root":["./src/App.tsx","./src/main.tsx","./src/vite-env.d.ts","./src/components/AnimatedAvatar.tsx","./src/components/BackgroundScene.tsx","./src/components/ChatBubble.tsx","./src/components/Clock.tsx","./src/components/KiraAvatar.tsx","./src/components/Live2DCat.tsx","./src/components/Live2DStage.tsx","./src/components/MusicPlayer.tsx","./src/components/Notes.tsx","./src/components/Particles.tsx","./src/components/PetZone.tsx","./src/components/Timer.tsx","./src/components/Toolbar.tsx","./src/components/Wardrobe.tsx","./src/components/WelcomeScreen.tsx","./src/components/WhiteNoise.tsx","./src/components/scenes.ts","./src/hooks/useConversation.ts","./src/types/index.ts"],"version":"6.0.3"}
{"root":["./src/App.tsx","./src/main.tsx","./src/vite-env.d.ts","./src/components/AnimatedAvatar.tsx","./src/components/BackgroundScene.tsx","./src/components/ChatBubble.tsx","./src/components/Clock.tsx","./src/components/KiraAvatar.tsx","./src/components/Live2DCat.tsx","./src/components/Live2DStage.tsx","./src/components/MusicPlayer.tsx","./src/components/Notes.tsx","./src/components/Particles.tsx","./src/components/PetZone.tsx","./src/components/TaskList.tsx","./src/components/Timer.tsx","./src/components/Toolbar.tsx","./src/components/Wardrobe.tsx","./src/components/WelcomeScreen.tsx","./src/components/WhiteNoise.tsx","./src/components/scenes.ts","./src/hooks/useConversation.ts","./src/types/index.ts"],"version":"6.0.3"}