diff --git a/backend/main.py b/backend/main.py index 66b937e..6c0d9c3 100644 --- a/backend/main.py +++ b/backend/main.py @@ -1,26 +1,21 @@ """Kira — AI body double backend -Real-time speech-to-speech pipeline: - mic audio → Whisper API → text → DeepSeek LLM → response text → OpenAI TTS → audio - -Honcho memory integration: - Cross-session user context injected into LLM prompts, - conversation exchanges stored for continuous learning. - User preferences (name, scene, outfit, accessory) persisted in peer metadata. +OpenAI Realtime API pipeline: + mic audio → [built-in STT → GPT-4o-mini → built-in TTS] → speaker audio + Single WebSocket, ~300-800ms latency """ import json import base64 import uuid import logging +import asyncio from fastapi import FastAPI, WebSocket, WebSocketDisconnect from fastapi.middleware.cors import CORSMiddleware from config import settings -from services.stt import transcribe_audio -from services.llm import get_kira_response -from services.tts import synthesize_speech +from services.realtime import RealtimeRelay from services.memory import kira_memory logging.basicConfig(level=logging.INFO) @@ -36,24 +31,13 @@ app.add_middleware( allow_headers=["*"], ) -# ─── Base system prompt (static part) ─── -BASE_SYSTEM_PROMPT = ( - "You are Kira, a warm, kind, and encouraging AI body double. " - "You speak in a friendly, girly-pop tone. You are helping someone with ADHD " - "stay focused and on task. Keep responses short, supportive, and uplifting. " - "Check in on them. Remind them to take breaks. Celebrate small wins. " - "Use occasional emoji but don't overdo it. Never be judgmental. " - "You remember things about them between conversations." -) - @app.on_event("startup") async def startup(): - """Initialize Honcho memory on app startup.""" if kira_memory.init(): logger.info("Honcho memory initialized") else: - logger.info("Honcho memory not configured — running without memory") + logger.info("Honcho memory not configured") @app.get("/api/health") @@ -62,61 +46,6 @@ async def health(): return {"status": "ok", "name": "kira", "memory": mem_status} -def build_system_prompt(user_id: str) -> dict: - """Build system prompt with Honcho memory context injected.""" - base = BASE_SYSTEM_PROMPT - - if kira_memory.enabled: - try: - kira_memory.ensure_peers(user_id) - memory_suffix = kira_memory.build_system_prompt_suffix() - if memory_suffix: - base += memory_suffix - except Exception as e: - logger.warning(f"Failed to build memory context: {e}") - - return {"role": "system", "content": base} - - -def handle_identify(msg: dict, session_id: str) -> dict | None: - """Handle user identification. Returns user preferences or None.""" - user_id = msg.get("user_id", "").strip() - if not user_id: - return {"type": "error", "message": "user_id is required"} - - user_name = msg.get("name", "").strip() - if user_name: - kira_memory.set_user_preference(user_id, "name", user_name) - - prefs = kira_memory.get_user_preferences(user_id) - logger.info(f"[{session_id}] Identified as {user_id} (name={user_name or prefs.get('name', '')})") - - return { - "type": "identified", - "user_id": user_id, - "preferences": prefs, - } - - -def handle_set_preference(msg: dict, session_id: str, user_id: str) -> dict | None: - """Handle preference update. Returns success status.""" - if not user_id or user_id == "default-user": - return {"type": "error", "message": "Must identify first"} - - key = msg.get("key", "").strip() - value = msg.get("value", "").strip() - - if not key: - return {"type": "error", "message": "key is required"} - - ok = kira_memory.set_user_preference(user_id, key, value) - return { - "type": "preference_saved", - "key": key, - "success": ok, - } - - @app.websocket("/api/ws") async def conversation_ws(websocket: WebSocket): await websocket.accept() @@ -125,8 +54,102 @@ async def conversation_ws(websocket: WebSocket): identified = False logger.info(f"[{session_id}] WebSocket connected") - audio_buffer = bytearray() - conversation_history: list[dict] = [] + # Track conversation for Honcho + pending_transcripts: list[tuple[str, str]] = [] + + # Will be set when Realtime relay is ready + relay_ready = asyncio.Event() + relay: RealtimeRelay | None = None + relay_task: asyncio.Task | None = None + audio_queue: asyncio.Queue[bytes] = asyncio.Queue() + text_queue: asyncio.Queue[str] = asyncio.Queue() + + async def on_ready(): + relay_ready.set() + logger.info(f"[{session_id}] Realtime relay ready") + + async def on_audio_delta(audio_bytes: bytes): + """Forward audio chunks from OpenAI to the client.""" + try: + audio_b64 = base64.b64encode(audio_bytes).decode("utf-8") + await websocket.send_json({ + "type": "audio", + "data": audio_b64, + }) + except Exception: + pass + + async def on_transcript(text: str): + """Store transcripts for Honcho.""" + pending_transcripts.append(("transcript", text)) + role, content = text.split(": ", 1) + logger.info(f"[{session_id}] {role}: {content}") + await websocket.send_json({ + "type": "transcript", + "role": role, + "text": content, + }) + + async def on_speech_started(): + """Kira started speaking.""" + await websocket.send_json({"type": "speaking_start"}) + + async def on_speech_stopped(): + """Kira finished speaking.""" + await websocket.send_json({"type": "speaking_end"}) + + async def on_interruption(): + """User interrupted — Kira stops speaking.""" + await websocket.send_json({"type": "interruption"}) + + async def on_error(msg: str): + await websocket.send_json({"type": "error", "message": msg}) + + # ── Create and start the Realtime relay ── + relay = RealtimeRelay( + on_audio_delta=on_audio_delta, + on_transcript=on_transcript, + on_speech_started=on_speech_started, + on_speech_stopped=on_speech_stopped, + on_interruption=on_interruption, + on_error=on_error, + on_ready=on_ready, + ) + + relay_task = asyncio.create_task(relay.connect()) + + # Wait for relay to be ready + try: + await asyncio.wait_for(relay_ready.wait(), timeout=15) + except asyncio.TimeoutError: + logger.error(f"[{session_id}] Realtime relay failed to connect") + await websocket.send_json({"type": "error", "message": "Failed to connect to AI"}) + relay_task.cancel() + return + + # ── Forward audio/text from client to relay ── + async def forward_audio(): + while relay and relay._connected: + try: + pcm16 = await asyncio.wait_for(audio_queue.get(), timeout=1) + await relay.send_audio(pcm16) + except asyncio.TimeoutError: + continue + except Exception: + break + + async def forward_text(): + while relay and relay._connected: + try: + text = await asyncio.wait_for(text_queue.get(), timeout=1) + await relay.send_text(text) + except asyncio.TimeoutError: + continue + except Exception: + break + + fwd_audio_task = asyncio.create_task(forward_audio()) + fwd_text_task = asyncio.create_task(forward_text()) try: while True: @@ -134,110 +157,98 @@ async def conversation_ws(websocket: WebSocket): msg = json.loads(raw) msg_type = msg.get("type", "") - # ── Identity & Preferences ── - + # ── Identity ── if msg_type == "identify": - response = handle_identify(msg, session_id) - if response: - await websocket.send_json(response) - if response["type"] == "identified": - user_id = response["user_id"] - identified = True - # Set up Honcho for this user - if kira_memory.enabled: - try: - kira_memory.ensure_peers(user_id) - kira_memory.ensure_session(session_id) - logger.info(f"[{session_id}] Honcho session ready for {user_id}") - except Exception as e: - logger.warning(f"[{session_id}] Honcho setup failed: {e}") + user_id = msg.get("user_id", "").strip() + user_name = msg.get("name", "").strip() + if user_name and user_id: + kira_memory.set_user_preference(user_id, "name", user_name) + + prefs = kira_memory.get_user_preferences(user_id) + identified = True + + if kira_memory.enabled: + kira_memory.ensure_peers(user_id) + kira_memory.ensure_session(session_id) + + # Inject Honcho context into the Realtime session instructions + memory_suffix = "" + if kira_memory.enabled: + try: + ctx = kira_memory.build_system_prompt_suffix() + if ctx: + memory_suffix = ctx + except Exception: + pass + + if relay and relay._connected and memory_suffix: + await relay._send({ + "type": "session.update", + "session": { + "instructions": ( + "You are Kira, a warm, kind, and encouraging AI body double. " + "Speak in a friendly, girly-pop tone. Help someone with ADHD " + "stay focused. Keep responses short and supportive. " + "Check in, remind breaks, celebrate wins. Never judgmental." + + memory_suffix + ), + }, + }) + + await websocket.send_json({ + "type": "identified", + "user_id": user_id, + "preferences": prefs, + }) continue + # ── Preferences ── if msg_type == "set_preference": - response = handle_set_preference(msg, session_id, user_id) - if response: - await websocket.send_json(response) + key = msg.get("key", "").strip() + value = msg.get("value", "").strip() + if key and user_id and user_id != "default-user": + kira_memory.set_user_preference(user_id, key, value) continue - # ── Conversation ── + # ── Audio from frontend (PCM16) ── + if msg_type == "audio": + audio_b64 = msg.get("data", "") + if audio_b64: + pcm16 = base64.b64decode(audio_b64) + await audio_queue.put(pcm16) + continue - system_prompt = build_system_prompt(user_id) + # ── Text input ── + if msg_type == "conversation_text": + text = msg.get("text", "").strip() + if text: + await text_queue.put(text) + # Also store in Honcho immediately + if kira_memory.enabled and identified: + kira_memory.store_user_message(text) + continue - if msg_type == "audio_chunk": - chunk = base64.b64decode(msg["data"]) - audio_buffer.extend(chunk) - - elif msg_type == "transcribe": - if not audio_buffer: - await websocket.send_json({"type": "error", "message": "No audio data"}) - continue - - logger.info(f"[{session_id}] Transcribing {len(audio_buffer)} bytes...") - - transcript = await transcribe_audio(bytes(audio_buffer)) - audio_buffer.clear() - - if not transcript: - await websocket.send_json({"type": "error", "message": "Could not transcribe audio"}) - continue - - await websocket.send_json({"type": "transcript", "text": transcript}) - - logger.info(f"[{session_id}] User: {transcript}") - conversation_history.append({"role": "user", "content": transcript}) - - messages = [system_prompt] + conversation_history[-10:] - kira_text = await get_kira_response(messages) - - conversation_history.append({"role": "assistant", "content": kira_text}) - logger.info(f"[{session_id}] Kira: {kira_text}") - - if kira_memory.enabled and identified: - try: - kira_memory.store_messages(transcript, kira_text) - except Exception as e: - logger.warning(f"[{session_id}] Failed to store messages: {e}") - - await websocket.send_json({"type": "speaking_start", "text": kira_text}) - audio_bytes = await synthesize_speech(kira_text) - audio_b64 = base64.b64encode(audio_bytes).decode("utf-8") - await websocket.send_json({"type": "audio", "data": audio_b64, "text": kira_text}) - await websocket.send_json({"type": "speaking_end"}) - - elif msg_type == "ping": + if msg_type == "ping": await websocket.send_json({"type": "pong"}) - elif msg_type == "conversation_text": - user_text = msg.get("text", "").strip() - if not user_text: - continue - - logger.info(f"[{session_id}] User (text): {user_text}") - conversation_history.append({"role": "user", "content": user_text}) - - messages = [system_prompt] + conversation_history[-10:] - kira_text = await get_kira_response(messages) - - conversation_history.append({"role": "assistant", "content": kira_text}) - logger.info(f"[{session_id}] Kira: {kira_text}") - - if kira_memory.enabled and identified: - try: - kira_memory.store_messages(user_text, kira_text) - except Exception as e: - logger.warning(f"[{session_id}] Failed to store messages: {e}") - - await websocket.send_json({"type": "speaking_start", "text": kira_text}) - audio_bytes = await synthesize_speech(kira_text) - audio_b64 = base64.b64encode(audio_bytes).decode("utf-8") - await websocket.send_json({"type": "audio", "data": audio_b64, "text": kira_text}) - await websocket.send_json({"type": "speaking_end"}) - except WebSocketDisconnect: logger.info(f"[{session_id}] Disconnected") except Exception as e: logger.error(f"[{session_id}] Error: {e}") - try: - await websocket.send_json({"type": "error", "message": str(e)}) - except Exception: - pass + finally: + # Store pending transcripts in Honcho + if kira_memory.enabled and identified: + for _, transcript_text in pending_transcripts: + if transcript_text.startswith("user: "): + content = transcript_text[6:] + kira_memory.store_user_message(content) + elif transcript_text.startswith("assistant: "): + content = transcript_text[11:] + kira_memory.store_kira_message(content) + + fwd_audio_task.cancel() + fwd_text_task.cancel() + if relay: + await relay.disconnect() + if relay_task: + relay_task.cancel() diff --git a/backend/requirements.txt b/backend/requirements.txt index 1720374..fd79264 100644 --- a/backend/requirements.txt +++ b/backend/requirements.txt @@ -7,3 +7,4 @@ pydantic>=2.10.0 pydantic-settings>=2.7.0 httpx>=0.28.0 honcho-ai>=2.1.0 +openai[realtime]>=2.41.0 diff --git a/backend/services/realtime.py b/backend/services/realtime.py new file mode 100644 index 0000000..f831176 --- /dev/null +++ b/backend/services/realtime.py @@ -0,0 +1,191 @@ +"""OpenAI Realtime API relay service. + +Manages a WebSocket connection to OpenAI's Realtime API and relays +audio/text events between the client and OpenAI. +""" + +import json +import logging +from typing import Callable, Awaitable +from config import settings + +logger = logging.getLogger("kira.realtime") + +# ─── System instructions for Kira's personality ─── + +KIRA_INSTRUCTIONS = ( + "You are Kira, a warm, kind, and encouraging AI body double. " + "You speak in a friendly, girly-pop tone. You are helping someone with ADHD " + "stay focused and on task. Keep responses short, supportive, and uplifting. " + "Check in on them. Remind them to take breaks. Celebrate small wins. " + "Use occasional emoji but don't overdo it. Never be judgmental. " + "You remember things about them between conversations." +) + + +class RealtimeRelay: + """Relays audio/text between a client WS and OpenAI Realtime API.""" + + def __init__( + self, + on_audio_delta: Callable[[bytes], Awaitable[None]], + on_transcript: Callable[[str], Awaitable[None]], + on_speech_started: Callable[[], Awaitable[None]], + on_speech_stopped: Callable[[], Awaitable[None]], + on_interruption: Callable[[], Awaitable[None]], + on_error: Callable[[str], Awaitable[None]], + on_ready: Callable[[], Awaitable[None]], + ): + self._on_audio_delta = on_audio_delta + self._on_transcript = on_transcript + self._on_speech_started = on_speech_started + self._on_speech_stopped = on_speech_stopped + self._on_interruption = on_interruption + self._on_error = on_error + self._on_ready = on_ready + self._conn = None + self._connected = False + + async def connect(self): + """Open a WebSocket to OpenAI Realtime API.""" + if self._connected: + return + + try: + from openai import AsyncOpenAI + + client = AsyncOpenAI(api_key=settings.openai_api_key) + + logger.info("Connecting to OpenAI Realtime API...") + async with client.beta.realtime.connect( + model="gpt-4o-mini-realtime-preview-2025-07-18", + ) as conn: + self._conn = conn + self._connected = True + logger.info("Connected to OpenAI Realtime API") + + # Configure session + await self._send({ + "type": "session.update", + "session": { + "instructions": KIRA_INSTRUCTIONS, + "voice": "alloy", + "input_audio_transcription": {"enabled": True}, + "turn_detection": { + "type": "server_vad", + "threshold": 0.5, + "prefix_padding_ms": 300, + "silence_duration_ms": 600, + }, + }, + }) + + await self._on_ready() + + # Listen for events + while self._connected: + try: + event = await conn.recv() + await self._handle_event(event) + except Exception as e: + if self._connected: + logger.warning(f"Realtime recv error: {e}") + break + + except ImportError: + logger.error("openai[realtime] not installed — run: pip install 'openai[realtime]'") + await self._on_error("Missing openai[realtime] dependency") + except Exception as e: + logger.error(f"Realtime connection error: {e}") + await self._on_error(str(e)) + finally: + self._connected = False + self._conn = None + + async def _handle_event(self, event): + """Process an event from the OpenAI Realtime API.""" + event_type = getattr(event, "type", None) or event.get("type", "") + + if event_type == "response.audio.delta": + audio_b64 = getattr(event, "delta", None) or event.get("delta", "") + if audio_b64: + import base64 + audio_bytes = base64.b64decode(audio_b64) + await self._on_audio_delta(audio_bytes) + + elif event_type == "response.audio_buffer.speech_started": + await self._on_speech_started() + + elif event_type == "response.audio_buffer.speech_stopped": + await self._on_speech_stopped() + + elif event_type == "input_audio_buffer.speech_started": + # User started speaking — interrupt Kira + await self._on_interruption() + + elif event_type == "conversation.item.created": + item = getattr(event, "item", None) or event.get("item", {}) + role = getattr(item, "role", None) or item.get("role", "") + content = getattr(item, "content", None) or item.get("content", []) + for part in (content or []): + part_type = getattr(part, "type", None) or part.get("type", "") + part_text = getattr(part, "text", None) or part.get("text", "") + if part_type == "text" and part_text and role == "assistant": + await self._on_transcript(f"assistant: {part_text}") + part_transcript = getattr(part, "transcript", None) or part.get("transcript", "") + if part_type == "transcript" and part_transcript and role == "user": + await self._on_transcript(f"user: {part_transcript}") + + elif event_type == "error": + err = getattr(event, "error", None) or event.get("error", {}) + msg = getattr(err, "message", None) or err.get("message", str(event)) + logger.warning(f"Realtime API error: {msg}") + await self._on_error(msg) + + async def send_audio(self, pcm16_bytes: bytes): + """Send PCM16 audio chunk to OpenAI.""" + if not self._connected or not self._conn: + return + try: + import base64 + audio_b64 = base64.b64encode(pcm16_bytes).decode("utf-8") + await self._send({ + "type": "input_audio_buffer.append", + "audio": audio_b64, + }) + except Exception as e: + logger.warning(f"Failed to send audio: {e}") + + async def send_text(self, text: str): + """Send a text message to OpenAI and trigger a response.""" + if not self._connected or not self._conn: + return + try: + await self._send({ + "type": "conversation.item.create", + "item": { + "type": "message", + "role": "user", + "content": [{"type": "input_text", "text": text}], + }, + }) + await self._send({"type": "response.create"}) + except Exception as e: + logger.warning(f"Failed to send text: {e}") + + async def _send(self, data: dict): + """Send a JSON event to the Realtime API.""" + try: + await self._conn.send(data) + except Exception as e: + logger.warning(f"Realtime send error: {e}") + + async def disconnect(self): + """Close the Realtime connection.""" + self._connected = False + if self._conn: + try: + await self._conn.close() + except Exception: + pass + self._conn = None diff --git a/frontend/src/hooks/useConversation.ts b/frontend/src/hooks/useConversation.ts index abb403e..35a85d1 100644 --- a/frontend/src/hooks/useConversation.ts +++ b/frontend/src/hooks/useConversation.ts @@ -25,6 +25,41 @@ function saveUserId(id: string) { localStorage.setItem(USER_ID_KEY, id); } +/** Capture PCM16 mono 24kHz audio from mic and send via callback. */ +function startPCMCapture( + stream: MediaStream, + onChunk: (pcm16: Uint8Array) => void, +): { stop: () => void } { + const ctx = new AudioContext({ sampleRate: 24000 }); + const source = ctx.createMediaStreamSource(stream); + const processor = ctx.createScriptProcessor(4096, 1, 1); + let running = true; + + processor.onaudioprocess = (e) => { + if (!running) return; + const input = e.inputBuffer.getChannelData(0); // Float32Array [-1, 1] + // Convert float32 → PCM16 int16 + const pcm16 = new Int16Array(input.length); + for (let i = 0; i < input.length; i++) { + const s = Math.max(-1, Math.min(1, input[i])); + pcm16[i] = s < 0 ? s * 0x8000 : s * 0x7fff; + } + onChunk(new Uint8Array(pcm16.buffer)); + }; + + source.connect(processor); + processor.connect(ctx.destination); + + return { + stop: () => { + running = false; + source.disconnect(); + processor.disconnect(); + ctx.close(); + }, + }; +} + export function useConversation() { const [messages, setMessages] = useState([]); const [isConnected, setIsConnected] = useState(false); @@ -38,11 +73,13 @@ export function useConversation() { accessory: '', }); const [loadingPrefs, setLoadingPrefs] = useState(true); + const [micError, setMicError] = useState(null); const wsRef = useRef(null); const audioRef = useRef(null); - const recorderRef = useRef(null); + const captureRef = useRef<{ stop: () => void } | null>(null); const streamRef = useRef(null); + const audioBufferRef = useRef([]); // Connect WebSocket const connect = useCallback(() => { @@ -54,7 +91,6 @@ export function useConversation() { ws.onopen = () => { setIsConnected(true); - // Auto-identify if returning user const savedId = loadUserId(); if (savedId) { ws.send(JSON.stringify({ type: 'identify', user_id: savedId })); @@ -102,35 +138,56 @@ export function useConversation() { break; } - case 'preference_saved': - // Already optimistically updated locally - break; - case 'transcript': - addMessage('user', msg.text); + addMessage(msg.role === 'user' ? 'user' : 'kira', msg.text); break; case 'speaking_start': setIsKiraSpeaking(true); - addMessage('kira', msg.text || '...'); break; - case 'audio': + case 'audio': { + // Incoming PCM16 audio from Kira if (msg.data && audioRef.current) { + // Accumulate audio chunks and create a blob const binary = atob(msg.data); const bytes = new Uint8Array(binary.length); for (let i = 0; i < binary.length; i++) { bytes[i] = binary.charCodeAt(i); } - const blob = new Blob([bytes], { type: 'audio/ogg' }); + audioBufferRef.current.push(bytes); + + // Convert accumulated PCM16 to WAV blob for playback + const allChunks = audioBufferRef.current; + const totalLen = allChunks.reduce((s, c) => s + c.length, 0); + const combined = new Uint8Array(totalLen); + let offset = 0; + for (const chunk of allChunks) { + combined.set(chunk, offset); + offset += chunk.length; + } + + const wav = pcm16ToWav(combined); + const blob = new Blob([wav], { type: 'audio/wav' }); const url = URL.createObjectURL(blob); audioRef.current.src = url; audioRef.current.play().catch(() => {}); } break; + } case 'speaking_end': setIsKiraSpeaking(false); + audioBufferRef.current = []; + break; + + case 'interruption': + setIsKiraSpeaking(false); + audioBufferRef.current = []; + if (audioRef.current) { + audioRef.current.pause(); + audioRef.current.currentTime = 0; + } break; case 'error': @@ -154,99 +211,80 @@ export function useConversation() { setPreferences((p) => ({ ...p, name })); if (wsRef.current?.readyState === WebSocket.OPEN) { - wsRef.current.send(JSON.stringify({ - type: 'identify', - user_id: userId, - name, - })); + wsRef.current.send(JSON.stringify({ type: 'identify', user_id: userId, name })); } }, []); // ── Preferences ── const setPreference = useCallback((key: string, value: string) => { - // Optimistic update setPreferences((p) => ({ ...p, [key]: value })); - - // Sync to backend if (wsRef.current?.readyState === WebSocket.OPEN && identified) { - wsRef.current.send(JSON.stringify({ - type: 'set_preference', - key, - value, - })); + wsRef.current.send(JSON.stringify({ type: 'set_preference', key, value })); } }, [identified]); + // ── Audio (Realtime PCM16) ── + + const startRecording = useCallback(async () => { + // Check HTTPS + if (!navigator.mediaDevices || !navigator.mediaDevices.getUserMedia) { + addMessage('kira', 'Mic requires HTTPS. Try accessing via HTTPS!'); + return; + } + + try { + setMicError(null); + const stream = await navigator.mediaDevices.getUserMedia({ audio: true }); + streamRef.current = stream; + + const ws = wsRef.current; + if (!ws || ws.readyState !== WebSocket.OPEN) { + addMessage('kira', 'Not connected to server yet...'); + stream.getTracks().forEach((t) => t.stop()); + return; + } + + // Start PCM16 capture — each chunk sent as WS message + const capture = startPCMCapture(stream, (pcm16) => { + if (ws.readyState === WebSocket.OPEN) { + const base64 = arrayBufferToBase64(pcm16.buffer); + ws.send(JSON.stringify({ type: 'audio', data: base64 })); + } + }); + + captureRef.current = capture; + setIsRecording(true); + } catch (err) { + const msg = err instanceof Error ? err.message : String(err); + setMicError(msg); + console.error('[Kira Mic]', msg); + } + }, [addMessage]); + + const stopRecording = useCallback(() => { + captureRef.current?.stop(); + captureRef.current = null; + streamRef.current?.getTracks().forEach((t) => t.stop()); + streamRef.current = null; + setIsRecording(false); + }, []); + // ── Text ── const sendText = useCallback((text: string) => { if (!text.trim()) return; if (wsRef.current?.readyState === WebSocket.OPEN) { - wsRef.current.send(JSON.stringify({ - type: 'conversation_text', - text: text.trim(), - })); + wsRef.current.send(JSON.stringify({ type: 'conversation_text', text: text.trim() })); } }, []); - // ── Audio ── - - const startRecording = useCallback(async () => { - // Check if mediaDevices is available (requires HTTPS/localhost) - if (!navigator.mediaDevices || !navigator.mediaDevices.getUserMedia) { - addMessage('kira', 'Your browser needs HTTPS to use the microphone. Try accessing Kira through the HTTPS address instead!'); - return; - } - - try { - const stream = await navigator.mediaDevices.getUserMedia({ audio: true }); - streamRef.current = stream; - - const recorder = new MediaRecorder(stream, { - mimeType: MediaRecorder.isTypeSupported('audio/webm;codecs=opus') - ? 'audio/webm;codecs=opus' - : 'audio/webm', - }); - - const chunks: BlobPart[] = []; - recorder.ondataavailable = (e) => { - if (e.data.size > 0) chunks.push(e.data); - }; - - recorder.onstop = () => { - const blob = new Blob(chunks, { type: 'audio/webm' }); - const reader = new FileReader(); - reader.onload = () => { - const base64 = (reader.result as string).split(',')[1]; - if (wsRef.current?.readyState === WebSocket.OPEN) { - wsRef.current.send(JSON.stringify({ type: 'audio_chunk', data: base64 })); - wsRef.current.send(JSON.stringify({ type: 'transcribe' })); - } - }; - reader.readAsDataURL(blob); - - stream.getTracks().forEach((t) => t.stop()); - setIsRecording(false); - }; - - recorder.start(); - recorderRef.current = recorder; - setIsRecording(true); - } catch (err) { - console.error('[Kira Mic] failed:', err); - } - }, []); - - const stopRecording = useCallback(() => { - recorderRef.current?.stop(); - }, []); - // Connect on mount useEffect(() => { connect(); return () => { wsRef.current?.close(); + captureRef.current?.stop(); streamRef.current?.getTracks().forEach((t) => t.stop()); }; }, [connect]); @@ -259,6 +297,7 @@ export function useConversation() { identified, preferences, loadingPrefs, + micError, identify, setPreference, sendText, @@ -266,3 +305,61 @@ export function useConversation() { stopRecording, }; } + +// ── Helpers ── + +function arrayBufferToBase64(buffer: ArrayBufferLike): string { + const bytes = new Uint8Array(buffer); + let binary = ''; + for (let i = 0; i < bytes.length; i++) { + binary += String.fromCharCode(bytes[i]); + } + return btoa(binary); +} + +/** Convert raw PCM16 mono 24kHz to a playable WAV blob. */ +function pcm16ToWav(pcm16: Uint8Array): ArrayBuffer { + const numChannels = 1; + const sampleRate = 24000; + const bitsPerSample = 16; + const byteRate = sampleRate * numChannels * (bitsPerSample / 8); + const blockAlign = numChannels * (bitsPerSample / 8); + const dataSize = pcm16.length; + const headerSize = 44; + const totalSize = headerSize + dataSize; + + const buf = new ArrayBuffer(totalSize); + const view = new DataView(buf); + + // RIFF header + writeString(view, 0, 'RIFF'); + view.setUint32(4, totalSize - 8, true); + writeString(view, 8, 'WAVE'); + + // fmt subchunk + writeString(view, 12, 'fmt '); + view.setUint32(16, 16, true); // subchunk size + view.setUint16(20, 1, true); // PCM + view.setUint16(22, numChannels, true); + view.setUint32(24, sampleRate, true); + view.setUint32(28, byteRate, true); + view.setUint16(32, blockAlign, true); + view.setUint16(34, bitsPerSample, true); + + // data subchunk + writeString(view, 36, 'data'); + view.setUint32(40, dataSize, true); + + // PCM data + for (let i = 0; i < pcm16.length; i++) { + view.setUint8(44 + i, pcm16[i]); + } + + return buf; +} + +function writeString(view: DataView, offset: number, str: string) { + for (let i = 0; i < str.length; i++) { + view.setUint8(offset + i, str.charCodeAt(i)); + } +}