feat(audio): Gemini Live API replaces Whisper+GPT+ElevenLabs

Single WebSocket proxy: frontend PCM16 16kHz → backend → Gemini Live API
Gemini returns PCM16 24kHz audio + text. Playback via Web Audio API queue.
Removed OpenAI/DeepSeek deps. Model: gemini-3.1-flash-live-preview.
Voice: Aoede. Streaming bidirectional audio with silence gating.
This commit is contained in:
2026-06-05 23:36:29 -04:00
parent d2bde65645
commit 83a990e838
6 changed files with 331 additions and 286 deletions
+2 -7
View File
@@ -1,13 +1,8 @@
from pydantic_settings import BaseSettings from pydantic_settings import BaseSettings
class Settings(BaseSettings): class Settings(BaseSettings):
# OpenAI (used for STT + TTS) # Gemini Live API
openai_api_key: str = "" gemini_api_key: str = ""
# DeepSeek (LLM)
deepseek_api_key: str = ""
deepseek_base_url: str = "https://api.deepseek.com/v1"
deepseek_model: str = "deepseek-chat"
# Honcho (memory) # Honcho (memory)
honcho_api_key: str = "" honcho_api_key: str = ""
+169 -120
View File
@@ -1,6 +1,7 @@
"""Kira — AI body double backend """Kira — AI body double backend
REST STT (gpt-4o-transcribe) → gpt-5.4-nano + Honcho → streaming TTS (sage) Gemini Live API (gemini-3.1-flash-live-preview) for real-time voice.
Text chat is sent over the same Live API WebSocket as a clientContent turn.
""" """
import json import json
@@ -8,7 +9,9 @@ import base64
import uuid import uuid
import logging import logging
import asyncio import asyncio
import struct
import websockets
from fastapi import FastAPI, WebSocket, WebSocketDisconnect from fastapi import FastAPI, WebSocket, WebSocketDisconnect
from fastapi.middleware.cors import CORSMiddleware from fastapi.middleware.cors import CORSMiddleware
@@ -33,18 +36,12 @@ BASE_SYSTEM_PROMPT = (
"You speak in a friendly, girly-pop tone. You are helping someone with ADHD " "You speak in a friendly, girly-pop tone. You are helping someone with ADHD "
"stay focused and on task. Keep responses short, supportive, and uplifting. " "stay focused and on task. Keep responses short, supportive, and uplifting. "
"Check in on them. Remind them to take breaks. Celebrate small wins. " "Check in on them. Remind them to take breaks. Celebrate small wins. "
"Use occasional emoji but don't overdo it. Never be judgmental." "Use occasional emoji but don't overdo it. Never be judgmental. "
"You are speaking out loud via voice, so keep natural conversational flow."
) )
_openai = None GEMINI_WS_URL = "wss://generativelanguage.googleapis.com/ws/google.ai.generativelanguage.v1beta.BidiGenerateContent"
GEMINI_MODEL = "models/gemini-3.1-flash-live-preview"
def get_openai():
global _openai
if _openai is None:
from openai import AsyncOpenAI
_openai = AsyncOpenAI(api_key=settings.openai_api_key)
return _openai
@app.on_event("startup") @app.on_event("startup")
@@ -61,83 +58,138 @@ async def health():
return {"status": "ok", "name": "kira", "memory": mem_status} return {"status": "ok", "name": "kira", "memory": mem_status}
async def transcribe_audio(client, audio_b64: str) -> str | None:
"""REST transcription via gpt-4o-transcribe (full utterance blob)."""
try:
audio_bytes = base64.b64decode(audio_b64)
import io
audio_file = io.BytesIO(audio_bytes)
audio_file.name = "audio.webm"
transcript = await client.audio.transcriptions.create(
model="gpt-4o-transcribe",
file=audio_file,
)
return transcript.text.strip() if transcript.text else None
except Exception as e:
logger.error(f"Transcription error: {e}")
return None
async def get_kira_response(client, user_text: str, memory_suffix: str) -> str:
"""Get Kira's response from gpt-5.4-nano."""
system_prompt = BASE_SYSTEM_PROMPT
if memory_suffix:
system_prompt += memory_suffix
resp = await client.chat.completions.create(
model="gpt-5.4-nano",
messages=[
{"role": "system", "content": system_prompt},
{"role": "user", "content": user_text},
],
max_completion_tokens=300,
temperature=0.7,
)
return resp.choices[0].message.content or "Mhm, I'm here! ✨"
async def stream_tts(client, text: str, websocket: WebSocket):
"""Stream TTS audio as Opus chunks over WebSocket."""
await websocket.send_json({"type": "speaking_start", "text": text})
async with client.audio.speech.with_streaming_response.create(
model="tts-1",
voice="sage",
input=text,
response_format="opus",
) as tts_resp:
async for chunk in tts_resp.iter_bytes():
if chunk:
b64 = base64.b64encode(chunk).decode("utf-8")
await websocket.send_json({"type": "audio", "data": b64})
await websocket.send_json({"type": "speaking_end"})
@app.websocket("/api/ws") @app.websocket("/api/ws")
async def conversation_ws(websocket: WebSocket): async def gemini_voice_ws(websocket: WebSocket):
"""WebSocket proxy between frontend and Gemini Live API.
Protocol, frontend → this proxy:
{"type": "audio", "data": "<base64 PCM16 16kHz>"}
{"type": "conversation_text", "text": "..."}
{"type": "identify", "user_id": "...", "name": "..."}
{"type": "ping"}
Protocol, this proxy → frontend:
{"type": "audio", "data": "<base64 PCM16 24kHz>"}
{"type": "transcript", "role": "user"|"kira", "text": "..."}
{"type": "identified", "user_id": "...", ...}
{"type": "turn_complete"}
{"type": "interrupted"}
{"type": "pong"}
{"type": "error", "message": "..."}
"""
await websocket.accept() await websocket.accept()
session_id = str(uuid.uuid4())[:8] session_id = str(uuid.uuid4())[:8]
user_id = "default-user" user_id = "default-user"
identified = False
memory_suffix = "" memory_suffix = ""
logger.info(f"[{session_id}] WebSocket connected") logger.info(f"[{session_id}] WebSocket connected")
conversation_history: list[dict] = [] gemini_ws = None
gemini_task = None
frontend_task = None
try:
# ── Connect to Gemini Live API ──
gemini_url = f"{GEMINI_WS_URL}?key={settings.gemini_api_key}"
gemini_ws = await websockets.connect(gemini_url, max_size=2**24)
# ── Send setup ──
system_prompt = BASE_SYSTEM_PROMPT
setup_msg = {
"setup": {
"model": GEMINI_MODEL,
"generationConfig": {
"responseModalities": ["AUDIO", "TEXT"],
"speechConfig": {
"voiceConfig": {
"prebuiltVoiceConfig": {
"voiceName": "Aoede"
}
}
},
},
"systemInstruction": {
"parts": [{"text": system_prompt}]
},
}
}
await gemini_ws.send(json.dumps(setup_msg))
logger.info(f"[{session_id}] Connected to Gemini Live API")
# Wait for setup complete
raw = await asyncio.wait_for(gemini_ws.recv(), timeout=10)
setup_resp = json.loads(raw)
if "setupComplete" in setup_resp:
logger.info(f"[{session_id}] Gemini setup complete")
else:
logger.warning(f"[{session_id}] Unexpected setup response: {list(setup_resp.keys())}")
# ── Gemini → Frontend relay ──
async def relay_gemini():
    """Forward Gemini Live API server messages to the frontend socket.

    Reads frames from ``gemini_ws`` until it closes and translates them
    into the proxy's frontend protocol:

    * ``serverContent.modelTurn.parts[*].text``  -> ``transcript`` (role "kira")
    * ``serverContent.modelTurn.parts[*].inlineData.data`` -> ``audio``
      (base64 payload passed through untouched)
    * ``serverContent.turnComplete`` -> ``turn_complete``
    * ``serverContent.interrupted``  -> ``interrupted``
    * ``error`` -> ``error``

    ``toolCall`` / ``toolCallCancellation`` frames are ignored for now.
    The coroutine returns (never raises) when either socket goes away, so
    the caller can use its completion as the signal to tear down the session.
    """
    try:
        async for raw in gemini_ws:
            # A single undecodable frame must not end the relay: the caller
            # treats this task finishing as "session over".
            try:
                msg = json.loads(raw)
            except (ValueError, TypeError):
                logger.warning(f"[{session_id}] Skipping undecodable Gemini frame")
                continue
            if not isinstance(msg, dict):
                continue

            if "serverContent" in msg:
                sc = msg["serverContent"] or {}
                # `or {}` / `or []` tolerate explicit nulls in the payload.
                model_turn = sc.get("modelTurn") or {}
                for part in model_turn.get("parts") or []:
                    # Text response
                    if "text" in part:
                        await websocket.send_json({
                            "type": "transcript",
                            "role": "kira",
                            "text": part["text"],
                        })

                    # Audio response (PCM16 24kHz)
                    if "inlineData" in part:
                        audio_data = (part["inlineData"] or {}).get("data", "")
                        if audio_data:
                            await websocket.send_json({
                                "type": "audio",
                                "data": audio_data,
                            })

                # Turn complete
                if sc.get("turnComplete"):
                    await websocket.send_json({"type": "turn_complete"})

                # Interrupted
                if sc.get("interrupted"):
                    await websocket.send_json({"type": "interrupted"})

            elif "toolCall" in msg:
                pass  # future: tool use

            elif "toolCallCancellation" in msg:
                pass

            elif "error" in msg:
                err = msg["error"]
                logger.error(f"[{session_id}] Gemini error: {err}")
                # The error payload is not guaranteed to be an object; only
                # call .get() when it actually is one.
                detail = err.get("message", err) if isinstance(err, dict) else err
                await websocket.send_json({
                    "type": "error",
                    "message": str(detail),
                })

    except websockets.exceptions.ConnectionClosed:
        logger.info(f"[{session_id}] Gemini WS closed")
    except Exception as e:
        # Top-level boundary of the relay task (includes a frontend socket
        # that has gone away mid-send): log and let the task finish.
        logger.error(f"[{session_id}] Gemini relay error: {e}")
# ── Frontend → Gemini relay ──
async def relay_frontend():
nonlocal user_id, memory_suffix
try: try:
while True: while True:
raw = await websocket.receive_text() raw = await websocket.receive_text()
msg = json.loads(raw) msg = json.loads(raw)
msg_type = msg.get("type", "") msg_type = msg.get("type", "")
# ── Identity & Preferences ──
if msg_type == "identify": if msg_type == "identify":
user_id = msg.get("user_id", "").strip() user_id = msg.get("user_id", "default-user").strip()
user_name = msg.get("name", "").strip() user_name = msg.get("name", "").strip()
if user_name and user_id: if user_name and user_id:
kira_memory.set_user_preference(user_id, "name", user_name) kira_memory.set_user_preference(user_id, "name", user_name)
prefs = kira_memory.get_user_preferences(user_id) prefs = kira_memory.get_user_preferences(user_id)
identified = True
if kira_memory.enabled: if kira_memory.enabled:
kira_memory.ensure_peers(user_id) kira_memory.ensure_peers(user_id)
kira_memory.ensure_session(session_id) kira_memory.ensure_session(session_id)
@@ -147,7 +199,6 @@ async def conversation_ws(websocket: WebSocket):
memory_suffix = ctx memory_suffix = ctx
except Exception: except Exception:
pass pass
await websocket.send_json({ await websocket.send_json({
"type": "identified", "type": "identified",
"user_id": user_id, "user_id": user_id,
@@ -163,69 +214,67 @@ async def conversation_ws(websocket: WebSocket):
await websocket.send_json({"type": "preference_saved", "key": key, "success": True}) await websocket.send_json({"type": "preference_saved", "key": key, "success": True})
continue continue
# ── Audio (full webm/opus blob from MediaRecorder) → REST STT → LLM → TTS ──
if msg_type == "audio": if msg_type == "audio":
# Forward PCM16 audio to Gemini as realtimeInput
audio_b64 = msg.get("data", "") audio_b64 = msg.get("data", "")
if not audio_b64: if audio_b64 and gemini_ws and gemini_ws.state.name == "OPEN":
gemini_msg = {
"realtimeInput": {
"audio": {
"mimeType": "audio/pcm;rate=16000",
"data": audio_b64,
}
}
}
await gemini_ws.send(json.dumps(gemini_msg))
continue continue
client = get_openai()
# STT (REST)
transcript = await transcribe_audio(client, audio_b64)
if not transcript:
await websocket.send_json({"type": "transcript", "role": "user", "text": "(could not transcribe)"})
await websocket.send_json({"type": "error", "message": "Could not transcribe audio"})
continue
logger.info(f"[{session_id}] User: {transcript}")
await websocket.send_json({"type": "transcript_delta", "text": transcript})
await websocket.send_json({"type": "transcript", "role": "user", "text": transcript})
conversation_history.append({"role": "user", "content": transcript})
# LLM
kira_text = await get_kira_response(client, transcript, memory_suffix)
conversation_history.append({"role": "assistant", "content": kira_text})
logger.info(f"[{session_id}] Kira: {kira_text}")
# Store in Honcho
if kira_memory.enabled and identified:
try:
kira_memory.store_messages(transcript, kira_text)
except Exception:
pass
# TTS (streaming)
await stream_tts(client, kira_text, websocket)
continue
# ── Text input → direct LLM + TTS ──
if msg_type == "conversation_text": if msg_type == "conversation_text":
text = msg.get("text", "").strip() text = msg.get("text", "").strip()
if not text: if not text:
continue continue
logger.info(f"[{session_id}] User (text): {text}") logger.info(f"[{session_id}] User (text): {text}")
conversation_history.append({"role": "user", "content": text}) # Send as a text turn to Gemini
if gemini_ws and gemini_ws.state.name == "OPEN":
client = get_openai() user_part = {"text": text}
kira_text = await get_kira_response(client, text, memory_suffix) if memory_suffix:
conversation_history.append({"role": "assistant", "content": kira_text}) user_part = {"text": f"[Context: {memory_suffix}]\n{text}"}
logger.info(f"[{session_id}] Kira: {kira_text}") gemini_msg = {
"clientContent": {
if kira_memory.enabled and identified: "turns": [{"role": "user", "parts": [user_part]}],
try: "turnComplete": True,
kira_memory.store_messages(text, kira_text) }
except Exception: }
pass await gemini_ws.send(json.dumps(gemini_msg))
await websocket.send_json({
await stream_tts(client, kira_text, websocket) "type": "transcript",
"role": "user",
"text": text,
})
continue continue
if msg_type == "ping": if msg_type == "ping":
await websocket.send_json({"type": "pong"}) await websocket.send_json({"type": "pong"})
except WebSocketDisconnect: except WebSocketDisconnect:
logger.info(f"[{session_id}] Disconnected") pass
except Exception as e: except Exception as e:
logger.error(f"[{session_id}] Error: {e}") logger.error(f"[{session_id}] Frontend relay error: {e}")
gemini_task = asyncio.create_task(relay_gemini())
frontend_task = asyncio.create_task(relay_frontend())
# Wait for either to finish
done, pending = await asyncio.wait(
[gemini_task, frontend_task],
return_when=asyncio.FIRST_COMPLETED,
)
for t in pending:
t.cancel()
except Exception as e:
logger.error(f"[{session_id}] Connection error: {e}")
finally:
if gemini_ws:
await gemini_ws.close()
logger.info(f"[{session_id}] Disconnected")
-2
View File
@@ -1,10 +1,8 @@
fastapi>=0.115.0 fastapi>=0.115.0
uvicorn[standard]>=0.34.0 uvicorn[standard]>=0.34.0
python-dotenv>=1.1.0 python-dotenv>=1.1.0
openai>=1.55.0
websockets>=14.1 websockets>=14.1
pydantic>=2.10.0 pydantic>=2.10.0
pydantic-settings>=2.7.0 pydantic-settings>=2.7.0
httpx>=0.28.0 httpx>=0.28.0
honcho-ai>=2.1.0 honcho-ai>=2.1.0
openai[realtime]>=2.41.0
+2 -2
View File
@@ -27,7 +27,7 @@ export default function App() {
sendText, sendText,
startRecording, startRecording,
stopRecording, stopRecording,
livePartial,
} = useConversation(); } = useConversation();
const [currentSceneId, setCurrentSceneId] = useState('cozy-room'); const [currentSceneId, setCurrentSceneId] = useState('cozy-room');
@@ -145,7 +145,7 @@ export default function App() {
<Notes /> <Notes />
</div> </div>
<div className="shrink-0"> <div className="shrink-0">
<ChatBubble messages={messages} isKiraSpeaking={isKiraSpeaking} userName={userName} livePartial={livePartial} /> <ChatBubble messages={messages} isKiraSpeaking={isKiraSpeaking} userName={userName} />
</div> </div>
<div className="shrink-0 flex gap-2"> <div className="shrink-0 flex gap-2">
<input <input
+1 -11
View File
@@ -11,10 +11,9 @@ interface Props {
messages: Message[]; messages: Message[];
isKiraSpeaking: boolean; isKiraSpeaking: boolean;
userName?: string; userName?: string;
livePartial?: string;
} }
export default function ChatBubble({ messages, isKiraSpeaking, livePartial }: Props) { export default function ChatBubble({ messages, isKiraSpeaking }: Props) {
const bottomRef = useRef<HTMLDivElement>(null); const bottomRef = useRef<HTMLDivElement>(null);
useEffect(() => { useEffect(() => {
@@ -28,15 +27,6 @@ export default function ChatBubble({ messages, isKiraSpeaking, livePartial }: Pr
<span className={`w-2 h-2 rounded-full ${isKiraSpeaking ? 'bg-kira-pink animate-pulse' : 'bg-kira-mint'}`} /> <span className={`w-2 h-2 rounded-full ${isKiraSpeaking ? 'bg-kira-pink animate-pulse' : 'bg-kira-mint'}`} />
</h3> </h3>
{livePartial && (
<div className="mb-2 px-3 py-1.5 bg-kira-lav/20 text-kira-plum/70 text-xs rounded-xl flex items-center gap-2">
<span>👂</span>
<span className="font-medium">Hearing:</span>
<span className="truncate">{livePartial}</span>
<span className="animate-pulse">...</span>
</div>
)}
<div className="flex-1 overflow-y-auto space-y-2 scrollbar-thin pr-1"> <div className="flex-1 overflow-y-auto space-y-2 scrollbar-thin pr-1">
{messages.length === 0 && ( {messages.length === 0 && (
<div className="text-xs text-kira-plum/30 text-center py-6"> <div className="text-xs text-kira-plum/30 text-center py-6">
+121 -108
View File
@@ -20,11 +20,35 @@ const USER_ID_KEY = 'kira-user-id';
function loadUserId(): string { function loadUserId(): string {
return localStorage.getItem(USER_ID_KEY) || ''; return localStorage.getItem(USER_ID_KEY) || '';
} }
function saveUserId(id: string) { function saveUserId(id: string) {
localStorage.setItem(USER_ID_KEY, id); localStorage.setItem(USER_ID_KEY, id);
} }
/**
 * Encode mono Float32 samples (any source rate) as base64 PCM16 at 16 kHz.
 *
 * Resampling is plain nearest-neighbour: output sample `i` is taken from
 * input index `floor(i * srcRate / 16000)`; no filtering is applied.
 * Samples are clamped to [-1, 1] and scaled asymmetrically (0x8000 for
 * negatives, 0x7FFF for positives) so both extremes map onto the int16 range.
 *
 * Bytes are written explicitly little-endian through a DataView, so the
 * result does not depend on the platform byte order the way an Int16Array
 * view would.
 *
 * @param float32 Mono samples.
 * @param srcRate Sample rate of `float32` in Hz.
 * @returns Base64 of the raw PCM16-LE bytes; '' for empty input or a
 *          non-positive / non-finite `srcRate`.
 */
function float32ToPcm16Base64(float32: Float32Array, srcRate: number): string {
  const targetRate = 16000;
  // A zero/negative/NaN rate would yield an infinite or NaN output length
  // and make the buffer allocation below throw.
  if (float32.length === 0 || !Number.isFinite(srcRate) || srcRate <= 0) {
    return '';
  }

  const ratio = srcRate / targetRate;
  const outLen = Math.floor(float32.length / ratio);
  const bytes = new Uint8Array(outLen * 2);
  const view = new DataView(bytes.buffer);

  for (let i = 0; i < outLen; i++) {
    const s = float32[Math.floor(i * ratio)];
    const clamped = Math.max(-1, Math.min(1, s));
    const sample = clamped < 0 ? clamped * 0x8000 : clamped * 0x7fff;
    // setInt16 truncates the fractional part, same as an Int16Array store.
    view.setInt16(i * 2, sample, true);
  }

  // base64 encode raw PCM16 bytes
  let binary = '';
  for (let i = 0; i < bytes.length; i++) {
    binary += String.fromCharCode(bytes[i]);
  }
  return btoa(binary);
}
export function useConversation() { export function useConversation() {
const [messages, setMessages] = useState<Message[]>([]); const [messages, setMessages] = useState<Message[]>([]);
const [isConnected, setIsConnected] = useState(false); const [isConnected, setIsConnected] = useState(false);
@@ -39,16 +63,17 @@ export function useConversation() {
}); });
const [loadingPrefs, setLoadingPrefs] = useState(true); const [loadingPrefs, setLoadingPrefs] = useState(true);
const [micError, setMicError] = useState<string | null>(null); const [micError, setMicError] = useState<string | null>(null);
const [livePartial, setLivePartial] = useState<string>('');
const wsRef = useRef<WebSocket | null>(null); const wsRef = useRef<WebSocket | null>(null);
const audioRef = useRef<HTMLAudioElement | null>(null);
const captureRef = useRef<{ stop: () => void } | null>(null);
const recorderRef = useRef<MediaRecorder | null>(null);
const streamRef = useRef<MediaStream | null>(null); const streamRef = useRef<MediaStream | null>(null);
const audioBufferRef = useRef<Uint8Array[]>([]); const audioCtxRef = useRef<AudioContext | null>(null);
const processorRef = useRef<ScriptProcessorNode | null>(null);
// Audio playback queue
const playbackCtxRef = useRef<AudioContext | null>(null);
const playbackQueueRef = useRef<ArrayBuffer[]>([]);
const isPlayingRef = useRef(false);
// Connect WebSocket
const connect = useCallback(() => { const connect = useCallback(() => {
if (wsRef.current?.readyState === WebSocket.OPEN) return; if (wsRef.current?.readyState === WebSocket.OPEN) return;
setLoadingPrefs(true); setLoadingPrefs(true);
@@ -75,22 +100,14 @@ export function useConversation() {
try { try {
const msg = JSON.parse(event.data); const msg = JSON.parse(event.data);
handleMessage(msg); handleMessage(msg);
} catch { /* ignore parse errors */ } } catch { /* ignore */ }
}; };
}, []); }, []);
// Audio playback element // Handle incoming messages from backend
useEffect(() => {
if (!audioRef.current) {
audioRef.current = new Audio();
audioRef.current.onended = () => setIsKiraSpeaking(false);
}
}, []);
// Handle incoming WS messages
const handleMessage = useCallback((msg: any) => { const handleMessage = useCallback((msg: any) => {
switch (msg.type) { switch (msg.type) {
case 'identified': { case 'identified':
setIdentified(true); setIdentified(true);
setLoadingPrefs(false); setLoadingPrefs(false);
if (msg.user_id) saveUserId(msg.user_id); if (msg.user_id) saveUserId(msg.user_id);
@@ -103,63 +120,40 @@ export function useConversation() {
}); });
} }
break; break;
}
case 'transcript': case 'transcript':
addMessage(msg.role === 'user' ? 'user' : 'kira', msg.text); addMessage(msg.role === 'user' ? 'user' : 'kira', msg.text);
break; break;
case 'transcript_delta':
if (msg.text) {
setLivePartial(msg.text);
// Clear after short delay so it doesn't stick (for REST full-text case)
setTimeout(() => setLivePartial(''), 1500);
}
break;
case 'speaking_start':
setIsKiraSpeaking(true);
break;
case 'audio': { case 'audio': {
// Incoming Opus audio chunk from streaming TTS // Incoming PCM16 24kHz audio from Gemini
if (msg.data) { if (msg.data) {
const binary = atob(msg.data); const binary = atob(msg.data);
const bytes = new Uint8Array(binary.length); const bytes = new Uint8Array(binary.length);
for (let i = 0; i < binary.length; i++) { for (let i = 0; i < binary.length; i++) {
bytes[i] = binary.charCodeAt(i); bytes[i] = binary.charCodeAt(i);
} }
audioBufferRef.current.push(bytes); // Convert PCM16 24kHz to Float32 for Web Audio API
const int16 = new Int16Array(bytes.buffer);
const float32 = new Float32Array(int16.length);
for (let i = 0; i < int16.length; i++) {
float32[i] = int16[i] / 32768;
}
enqueueAudio(float32, 24000);
setIsKiraSpeaking(true);
} }
break; break;
} }
case 'speaking_end': case 'turn_complete':
setIsKiraSpeaking(false); setIsKiraSpeaking(false);
// Play all accumulated chunks as one blob
if (audioBufferRef.current.length > 0 && audioRef.current) {
const allChunks = audioBufferRef.current;
const totalLen = allChunks.reduce((s, c) => s + c.length, 0);
const combined = new Uint8Array(totalLen);
let offset = 0;
for (const chunk of allChunks) {
combined.set(chunk, offset);
offset += chunk.length;
}
// audioBufferRef no longer used for playback (incremental)
const blob = new Blob([combined], { type: 'audio/ogg' });
const url = URL.createObjectURL(blob);
audioRef.current.src = url;
audioRef.current.play().catch(() => {});
}
break; break;
case 'interruption': case 'interrupted':
setIsKiraSpeaking(false); setIsKiraSpeaking(false);
if (audioRef.current) { // Clear playback queue
audioRef.current.pause(); playbackQueueRef.current = [];
audioRef.current.currentTime = 0; isPlayingRef.current = false;
}
break; break;
case 'error': case 'error':
@@ -168,6 +162,42 @@ export function useConversation() {
} }
}, []); }, []);
// Source node that is currently sounding, so a stale playback chain can be
// cancelled before a new one starts.
const activeSourceRef = useRef<AudioBufferSourceNode | null>(null);

// Queue PCM float32 audio for playback.
// Chunks are played strictly one after another: each source's `onended`
// pulls the next buffer from playbackQueueRef until the queue is empty.
const enqueueAudio = useCallback((float32: Float32Array, sampleRate: number) => {
  // createBuffer() rejects a zero-length buffer, which would break the chain.
  if (float32.length === 0) return;

  playbackQueueRef.current.push(float32.buffer as ArrayBuffer);
  if (isPlayingRef.current) return;

  // isPlayingRef can be reset from outside (the 'interrupted' handler) while
  // a chunk is still sounding. Starting a second chain on top of it would
  // play two chunks at once, so silence and detach the old source first.
  const stale = activeSourceRef.current;
  if (stale) {
    activeSourceRef.current = null;
    stale.onended = null;
    try {
      stale.stop();
    } catch { /* already stopped */ }
  }
  playNext();

  function playNext() {
    const next = playbackQueueRef.current.shift();
    if (!next) {
      isPlayingRef.current = false;
      activeSourceRef.current = null;
      return;
    }
    isPlayingRef.current = true;

    const ctx = getPlaybackCtx();
    // Browsers may create the context suspended (autoplay policy); without
    // a resume the scheduled sources never produce sound.
    if (ctx.state === 'suspended') {
      void ctx.resume().catch(() => {});
    }

    const samples = new Float32Array(next);
    const buf = ctx.createBuffer(1, samples.length, sampleRate);
    buf.getChannelData(0).set(samples);

    const src = ctx.createBufferSource();
    src.buffer = buf;
    src.connect(ctx.destination);
    src.onended = playNext;
    activeSourceRef.current = src;
    src.start();
  }
}, []);
// Return the shared playback AudioContext, creating it at 24 kHz on first
// use and caching it in playbackCtxRef for every later call.
function getPlaybackCtx(): AudioContext {
  const existing = playbackCtxRef.current;
  if (existing) {
    return existing;
  }
  const created = new AudioContext({ sampleRate: 24000 });
  playbackCtxRef.current = created;
  return created;
}
const addMessage = useCallback((role: 'user' | 'kira', text: string) => { const addMessage = useCallback((role: 'user' | 'kira', text: string) => {
setMessages((prev) => [ setMessages((prev) => [
...prev, ...prev,
@@ -176,19 +206,16 @@ export function useConversation() {
}, []); }, []);
// ── Identity ── // ── Identity ──
const identify = useCallback((name: string) => { const identify = useCallback((name: string) => {
const userId = `kira-${name.toLowerCase().replace(/[^a-z0-9]/g, '-')}`; const userId = `kira-${name.toLowerCase().replace(/[^a-z0-9]/g, '-')}`;
saveUserId(userId); saveUserId(userId);
setPreferences((p) => ({ ...p, name })); setPreferences((p) => ({ ...p, name }));
if (wsRef.current?.readyState === WebSocket.OPEN) { if (wsRef.current?.readyState === WebSocket.OPEN) {
wsRef.current.send(JSON.stringify({ type: 'identify', user_id: userId, name })); wsRef.current.send(JSON.stringify({ type: 'identify', user_id: userId, name }));
} }
}, []); }, []);
// ── Preferences ── // ── Preferences ──
const setPreference = useCallback((key: string, value: string) => { const setPreference = useCallback((key: string, value: string) => {
setPreferences((p) => ({ ...p, [key]: value })); setPreferences((p) => ({ ...p, [key]: value }));
if (wsRef.current?.readyState === WebSocket.OPEN && identified) { if (wsRef.current?.readyState === WebSocket.OPEN && identified) {
@@ -196,17 +223,18 @@ export function useConversation() {
} }
}, [identified]); }, [identified]);
// ── Audio (Realtime PCM16) ── // ── Audio capture via ScriptProcessorNode (PCM16 16kHz stream) ──
const startRecording = useCallback(async () => { const startRecording = useCallback(async () => {
if (!navigator.mediaDevices || !navigator.mediaDevices.getUserMedia) { if (!navigator.mediaDevices?.getUserMedia) {
addMessage('kira', 'Mic requires HTTPS. Try accessing via HTTPS!'); addMessage('kira', 'Mic requires HTTPS. Try accessing via HTTPS!');
return; return;
} }
try { try {
setMicError(null); setMicError(null);
const stream = await navigator.mediaDevices.getUserMedia({ audio: { echoCancellation: true, noiseSuppression: true } }); const stream = await navigator.mediaDevices.getUserMedia({
audio: { echoCancellation: true, noiseSuppression: true, sampleRate: 48000 },
});
streamRef.current = stream; streamRef.current = stream;
const ws = wsRef.current; const ws = wsRef.current;
@@ -216,27 +244,32 @@ export function useConversation() {
return; return;
} }
// Use MediaRecorder for full utterance blob (Opus/webm) — sent on stop for REST STT // Create AudioContext at native sample rate, capture via ScriptProcessor
const mediaRecorder = new MediaRecorder(stream, { mimeType: 'audio/webm;codecs=opus' }); const audioCtx = new AudioContext({ sampleRate: 48000 });
const chunks: Blob[] = []; audioCtxRef.current = audioCtx;
mediaRecorder.ondataavailable = (e) => {
if (e.data.size > 0) chunks.push(e.data); const source = audioCtx.createMediaStreamSource(stream);
}; // 4096 buffer size → ~85ms chunks at 48kHz
mediaRecorder.onstop = () => { const processor = audioCtx.createScriptProcessor(4096, 1, 1);
if (chunks.length > 0 && ws.readyState === WebSocket.OPEN) { processorRef.current = processor;
const blob = new Blob(chunks, { type: 'audio/webm' });
blob.arrayBuffer().then((buf) => { processor.onaudioprocess = (e) => {
const base64 = arrayBufferToBase64(buf); if (ws.readyState !== WebSocket.OPEN) return;
ws.send(JSON.stringify({ type: 'audio', data: base64 })); const float32 = e.inputBuffer.getChannelData(0);
}); // Skip silent frames (reduces network traffic)
let maxAbs = 0;
for (let i = 0; i < float32.length; i += 4) {
const v = Math.abs(float32[i]);
if (v > maxAbs) maxAbs = v;
} }
chunks.length = 0; if (maxAbs < 0.01) return; // silence gate
stream.getTracks().forEach((t) => t.stop());
streamRef.current = null; const b64 = float32ToPcm16Base64(float32, audioCtx.sampleRate);
setIsRecording(false); ws.send(JSON.stringify({ type: 'audio', data: b64 }));
}; };
recorderRef.current = mediaRecorder;
mediaRecorder.start(); source.connect(processor);
processor.connect(audioCtx.destination);
setIsRecording(true); setIsRecording(true);
} catch (err) { } catch (err) {
const msg = err instanceof Error ? err.message : String(err); const msg = err instanceof Error ? err.message : String(err);
@@ -246,20 +279,16 @@ export function useConversation() {
}, [addMessage]); }, [addMessage]);
const stopRecording = useCallback(() => { const stopRecording = useCallback(() => {
if (recorderRef.current && recorderRef.current.state === 'recording') { processorRef.current?.disconnect();
recorderRef.current.stop(); processorRef.current = null;
// onstop will handle sending the blob and cleanup audioCtxRef.current?.close().catch(() => {});
} else { audioCtxRef.current = null;
// fallback cleanup
streamRef.current?.getTracks().forEach((t) => t.stop()); streamRef.current?.getTracks().forEach((t) => t.stop());
streamRef.current = null; streamRef.current = null;
setIsRecording(false); setIsRecording(false);
}
captureRef.current = null; // legacy
}, []); }, []);
// ── Text ── // ── Text input ──
const sendText = useCallback((text: string) => { const sendText = useCallback((text: string) => {
if (!text.trim()) return; if (!text.trim()) return;
if (wsRef.current?.readyState === WebSocket.OPEN) { if (wsRef.current?.readyState === WebSocket.OPEN) {
@@ -272,11 +301,9 @@ export function useConversation() {
connect(); connect();
return () => { return () => {
wsRef.current?.close(); wsRef.current?.close();
if (recorderRef.current && recorderRef.current.state === 'recording') recorderRef.current.stop(); stopRecording();
captureRef.current?.stop();
streamRef.current?.getTracks().forEach((t) => t.stop());
}; };
}, [connect]); }, [connect, stopRecording]);
return { return {
messages, messages,
@@ -287,7 +314,6 @@ export function useConversation() {
preferences, preferences,
loadingPrefs, loadingPrefs,
micError, micError,
livePartial,
identify, identify,
setPreference, setPreference,
sendText, sendText,
@@ -295,16 +321,3 @@ export function useConversation() {
stopRecording, stopRecording,
}; };
} }
// ── Helpers ──
function arrayBufferToBase64(buffer: ArrayBufferLike): string {
const bytes = new Uint8Array(buffer);
let binary = '';
for (let i = 0; i < bytes.length; i++) {
binary += String.fromCharCode(bytes[i]);
}
return btoa(binary);
}
// (Legacy PCM capture removed - MediaRecorder full-blob path is active; eliminates ScriptProcessorNode deprecation)