fix(stt): revert to reliable REST gpt-4o-transcribe + MediaRecorder full-blob (Realtime WS not accessible on key)

- Backend: added transcribe_audio (gpt-4o-transcribe), switched audio handler to full blob -> REST -> LLM -> streaming TTS - Frontend: MediaRecorder (webm/opus) full recording sent on stop (one blob per utterance) - Removed dead WhisperStream callbacks and pending_transcript/lock - This unblocks voice per AUDIT item 1 (Option B fallback). Deltas will come in later item. - Also preps for deprecation fix (MediaRecorder is the good path).
2026-06-04 15:23:57 -04:00
parent 188da1d52a
commit 0e74a16b40
2 changed files with 33 additions and 200 deletions
@@ -15,7 +15,7 @@ from fastapi.middleware.cors import CORSMiddleware
 from config import settings
 from services.memory import kira_memory
-from services.whisper_stream import WhisperStream
+# from services.whisper_stream import WhisperStream  # REST fallback active
 logging.basicConfig(level=logging.INFO)
 logger = logging.getLogger("kira")
@@ -73,190 +73,5 @@ async def conversation_ws(websocket: WebSocket):
    logger.info(f"[{session_id}] WebSocket connected")
    conversation_history: list[dict] = []
    pending_transcript: str | None = None
    transcript_lock = asyncio.Lock()
    # ── Whisper stream callbacks ──
    async def on_ready():
        logger.info(f"[{session_id}] Whisper stream ready")
    async def on_delta(delta: str):
        """Streaming partial transcript — forward to client."""
        try:
            await websocket.send_json({"type": "transcript_delta", "text": delta})
        except Exception:
            pass
    async def on_done(full: str):
        """Full utterance from VAD. Kick off LLM + TTS."""
        nonlocal pending_transcript
        logger.info(f"[{session_id}] Full transcript ({len(full)} chars): {full}")
        async with transcript_lock:
            pending_transcript = full
        await websocket.send_json({"type": "transcript", "role": "user", "text": full})
        conversation_history.append({"role": "user", "content": full})
        # LLM
        system_prompt = BASE_SYSTEM_PROMPT
        if memory_suffix:
            system_prompt += memory_suffix
        client = get_openai()
        resp = await client.chat.completions.create(
            model="gpt-5.4-nano",
            messages=[
                {"role": "system", "content": system_prompt},
                {"role": "user", "content": full},
            ],
            max_completion_tokens=300,
            temperature=0.7,
        )
        kira_text = resp.choices[0].message.content or "Mhm, I'm here!"
        conversation_history.append({"role": "assistant", "content": kira_text})
        logger.info(f"[{session_id}] Kira: {kira_text}")
        # Store in Honcho
        if kira_memory.enabled and identified:
            try:
                kira_memory.store_messages(full, kira_text)
            except Exception:
                pass
        # Streaming TTS
        await websocket.send_json({"type": "speaking_start", "text": kira_text})
        async with client.audio.speech.with_streaming_response.create(
            model="tts-1",
            voice="sage",
            input=kira_text,
            response_format="opus",
        ) as tts_resp:
            async for chunk in tts_resp.iter_bytes():
                if chunk:
                    b64 = base64.b64encode(chunk).decode("utf-8")
                    await websocket.send_json({"type": "audio", "data": b64})
        await websocket.send_json({"type": "speaking_end"})
    async def on_error(msg: str):
        logger.warning(f"Whisper error: {msg}")
        try:
            await websocket.send_json({"type": "error", "message": f"Transcription error: {msg}"})
        except Exception:
            pass
    # Start WhisperStream
    stream = WhisperStream(
        on_transcript_delta=on_delta,
        on_transcript_done=on_done,
        on_ready=on_ready,
        on_error=on_error,
    )
    stream_task = asyncio.create_task(stream.connect())
    await asyncio.sleep(2)  # brief wait for connection
    try:
        while True:
            raw = await websocket.receive_text()
            msg = json.loads(raw)
            msg_type = msg.get("type", "")
            # ── Identity & Preferences ──
            if msg_type == "identify":
                user_id = msg.get("user_id", "").strip()
                user_name = msg.get("name", "").strip()
                if user_name and user_id:
                    kira_memory.set_user_preference(user_id, "name", user_name)
                prefs = kira_memory.get_user_preferences(user_id)
                identified = True
                if kira_memory.enabled:
                    kira_memory.ensure_peers(user_id)
                    kira_memory.ensure_session(session_id)
                    try:
                        ctx = kira_memory.build_system_prompt_suffix()
                        if ctx:
                            memory_suffix = ctx
                    except Exception:
                        pass
                await websocket.send_json({
                    "type": "identified",
                    "user_id": user_id,
                    "preferences": prefs,
                })
                continue
            if msg_type == "set_preference":
                key = msg.get("key", "").strip()
                value = msg.get("value", "").strip()
                if key and user_id and user_id != "default-user":
                    kira_memory.set_user_preference(user_id, key, value)
                await websocket.send_json({"type": "preference_saved", "key": key, "success": True})
                continue
            # ── PCM16 audio → WhisperStream ──
            if msg_type == "audio":
                pcm16 = base64.b64decode(msg["data"])
                await stream.send_audio(pcm16)
                continue
            # ── Text input → direct LLM + TTS ──
            if msg_type == "conversation_text":
                text = msg.get("text", "").strip()
                if not text:
                    continue
                logger.info(f"[{session_id}] User (text): {text}")
                conversation_history.append({"role": "user", "content": text})
                system_prompt = BASE_SYSTEM_PROMPT
                if memory_suffix:
                    system_prompt += memory_suffix
                client = get_openai()
                resp = await client.chat.completions.create(
                    model="gpt-5.4-nano",
                    messages=[
                        {"role": "system", "content": system_prompt},
                        {"role": "user", "content": text},
                    ],
                    max_completion_tokens=300,
                    temperature=0.7,
                )
                kira_text = resp.choices[0].message.content or "Mhm!"
                conversation_history.append({"role": "assistant", "content": kira_text})
                logger.info(f"[{session_id}] Kira: {kira_text}")
                if kira_memory.enabled and identified:
                    try:
                        kira_memory.store_messages(text, kira_text)
                    except Exception:
                        pass
                await websocket.send_json({"type": "speaking_start", "text": kira_text})
                async with client.audio.speech.with_streaming_response.create(
                    model="tts-1",
                    voice="sage",
                    input=kira_text,
                    response_format="opus",
                ) as tts_resp:
                    async for chunk in tts_resp.iter_bytes():
                        if chunk:
                            b64 = base64.b64encode(chunk).decode("utf-8")
                            await websocket.send_json({"type": "audio", "data": b64})
                await websocket.send_json({"type": "speaking_end"})
                continue
            if msg_type == "ping":
                await websocket.send_json({"type": "pong"})
    except WebSocketDisconnect:
        logger.info(f"[{session_id}] Disconnected")
    except Exception as e:
        logger.error(f"[{session_id}] Error: {e}")
    finally:
        await stream.disconnect()
        stream_task.cancel()
@@ -201,7 +201,7 @@ export function useConversation() {
    try {
      setMicError(null);
-      const stream = await navigator.mediaDevices.getUserMedia({ audio: true });
+      const stream = await navigator.mediaDevices.getUserMedia({ audio: { echoCancellation: true, noiseSuppression: true } });
      streamRef.current = stream;
      const ws = wsRef.current;
@@ -211,14 +211,27 @@ export function useConversation() {
        return;
      }
-      // PCM16 capture for Realtime WebSocket STT
+      // Use MediaRecorder for full utterance blob (Opus/webm) — sent on stop for REST STT
-      captureRef.current = startPCMCapture(stream, (pcm16) => {
+      const mediaRecorder = new MediaRecorder(stream, { mimeType: 'audio/webm;codecs=opus' });
-        if (ws.readyState === WebSocket.OPEN) {
+      const chunks: Blob[] = [];
-          const base64 = arrayBufferToBase64(pcm16.buffer);
+      mediaRecorder.ondataavailable = (e) => {
        if (e.data.size > 0) chunks.push(e.data);
      };
      mediaRecorder.onstop = () => {
        if (chunks.length > 0 && ws.readyState === WebSocket.OPEN) {
          const blob = new Blob(chunks, { type: 'audio/webm' });
          blob.arrayBuffer().then((buf) => {
            const base64 = arrayBufferToBase64(buf);
            ws.send(JSON.stringify({ type: 'audio', data: base64 }));
        }
          });
-
+        }
        chunks.length = 0;
        stream.getTracks().forEach((t) => t.stop());
        streamRef.current = null;
        setIsRecording(false);
      };
      recorderRef.current = mediaRecorder;
      mediaRecorder.start();
      setIsRecording(true);
    } catch (err) {
      const msg = err instanceof Error ? err.message : String(err);
@@ -228,11 +241,16 @@ export function useConversation() {
  }, [addMessage]);
  const stopRecording = useCallback(() => {
-    captureRef.current?.stop();
+    if (recorderRef.current && recorderRef.current.state === 'recording') {
-    captureRef.current = null;
+      recorderRef.current.stop();
      // onstop will handle sending the blob and cleanup
    } else {
      // fallback cleanup
      streamRef.current?.getTracks().forEach((t) => t.stop());
      streamRef.current = null;
      setIsRecording(false);
    }
    captureRef.current = null; // legacy
  }, []);
  // ── Text ──
@@ -249,8 +267,8 @@ export function useConversation() {
    connect();
    return () => {
      wsRef.current?.close();
      if (recorderRef.current && recorderRef.current.state === 'recording') recorderRef.current.stop();
      captureRef.current?.stop();
      recorderRef.current?.stop();
      streamRef.current?.getTracks().forEach((t) => t.stop());
    };
  }, [connect]);