diff --git a/ARCHITECTURE.md b/ARCHITECTURE.md index d040464..2ceba26 100644 --- a/ARCHITECTURE.md +++ b/ARCHITECTURE.md @@ -27,23 +27,24 @@ Browser-based real-time AI body double. She talks to Kira (microphone → STT ``` ┌─────────────────────────────────────────────────────────┐ -│ Browser │ +│ Browser (React + Live2D + girly UI) │ │ │ -│ [Mic] → MediaRecorder → audio chunks │ -│ ↓ (WebSocket) │ +│ [Mic] → MediaRecorder (webm/opus full utterance) │ +│ ↓ (WebSocket /api/ws) │ │ [FastAPI Backend] │ │ ↓ │ -│ 1. Whisper API → text transcript │ -│ 2. DeepSeek V4 (system prompt: "You are Kira...") │ -│ 3. OpenAI TTS → audio buffer │ +│ 1. REST gpt-4o-transcribe → full text (with delta emit)│ +│ 2. gpt-5.4-nano + Honcho memory suffix │ +│ 3. OpenAI TTS (sage) streaming response → Opus chunks │ │ ↑ (WebSocket) │ -│ [Audio Player + Live2D Lip-Sync] │ +│ [Incremental Audio playback + Live2D + "Hearing" UI] │ │ │ -│ Kira's idle animations run between conversation turns │ +│ White noise (Web Audio) and lofi run independently. │ +│ VAD is client-button based (toggle Talk). │ └─────────────────────────────────────────────────────────┘ ``` -Lo-fi music runs independently via YouTube embed — no backend involvement. +Current: REST STT is used for reliability (Realtime WS was attempted but blocked by model access restrictions). --- @@ -65,9 +66,8 @@ ai-body-double/ │ │ └── assets.py # REST: outfit/pet state, backgrounds │ ├── services/ │ │ ├── __init__.py -│ │ ├── stt.py # Whisper API client -│ │ ├── llm.py # DeepSeek chat client -│ │ └── tts.py # OpenAI TTS client +│ │ ├── memory.py # Honcho wrapper (context, prefs, summaries) +│ │ └── whisper_stream.py # Realtime WS attempt (archived use; fallback to REST) │ └── models/ │ ├── __init__.py │ └── schemas.py # Pydantic models diff --git a/README.md b/README.md index 0af05a0..5935ab8 100644 --- a/README.md +++ b/README.md @@ -34,7 +34,7 @@ cd ~/Projects/ai-body-double cp .env.example .env # Edit .env with your API keys: # OPENAI_API_KEY=sk-... 
-# DEEPSEEK_API_KEY=sk-... +# HONCHO_API_KEY=... ``` ### 3. Run @@ -56,17 +56,19 @@ kira.hobokenchicken.com { ## Architecture ``` -Browser ──WebSocket──▶ Backend (FastAPI) - │ │ - ├─ Mic audio ──────────▶ ├─ Whisper API (STT) - │ ├─ DeepSeek (LLM) - │ ◀── TTS audio ──────── ├─ OpenAI TTS - │ │ - ├─ YouTube embed (lo-fi) │ - ├─ Timer / Notes / Cats │ - └─ Animated avatar │ +Browser (React + Live2D + WhiteNoise + Notes) ──WebSocket──▶ Backend (FastAPI) + │ │ + ├─ Mic (MediaRecorder full webm on stop) ────────────────────▶ ├─ gpt-4o-transcribe (REST STT, emits delta) + │ ├─ gpt-5.4-nano + Honcho memory context + │ ◀── Incremental Opus chunks (play on arrival) ─────────────── ├─ OpenAI TTS streaming (sage) + │ │ + ├─ YouTube lofi + Web Audio noise (independent) │ + ├─ Timer / Notes / Pets / Wardrobe / Scenes │ + └─ Live2D (or fallback) avatar │ ``` +Note: Realtime WebSocket STT (gpt-realtime-whisper) was attempted for true streaming but was blocked by model access restrictions — the current REST path is stable and cheap. + ## Live2D Model Setup Kira currently uses a CSS/SVG animated placeholder avatar. To add a Live2D model: diff --git a/backend/services/llm.py b/archive/legacy-pipeline/llm.py similarity index 100% rename from backend/services/llm.py rename to archive/legacy-pipeline/llm.py diff --git a/backend/services/stt.py b/archive/legacy-pipeline/stt.py similarity index 100% rename from backend/services/stt.py rename to archive/legacy-pipeline/stt.py diff --git a/backend/services/tts.py b/archive/legacy-pipeline/tts.py similarity index 100% rename from backend/services/tts.py rename to archive/legacy-pipeline/tts.py