perf: eliminate per-request SQLite queries and optimize proxy latency

- Add in-memory ModelConfigCache (30s refresh, explicit invalidation) replacing 2 SQLite queries per request (model lookup + cost override)
- Configure all 5 provider HTTP clients with proper timeouts (300s), connection pooling (4 idle/host, 90s idle timeout), and TCP keepalive
- Move client_usage update to tokio::spawn in non-streaming path
- Use fast chars/4 heuristic for token estimation on large inputs (>1KB)
- Generate single UUID/timestamp per SSE stream instead of per chunk
- Add shared LazyLock<Client> for image fetching in multimodal module
- Add proxy overhead timing instrumentation for both request paths
- Fix test helper to include new model_config_cache field
2026-03-02 12:53:22 -05:00
parent e4cf088071
commit 8d50ce7c22
13 changed files with 232 additions and 74 deletions
--- a/src/dashboard/models.rs
+++ b/src/dashboard/models.rs
@@ -159,7 +159,11 @@ pub(super) async fn handle_update_model(
    .await;

    match result {
-        Ok(_) => Json(ApiResponse::success(serde_json::json!({ "message": "Model updated" }))),
+        Ok(_) => {
+            // Invalidate the in-memory cache so the proxy picks up the change immediately
+            state.app_state.model_config_cache.invalidate().await;
+            Json(ApiResponse::success(serde_json::json!({ "message": "Model updated" })))
+        }
        Err(e) => Json(ApiResponse::error(format!("Failed to update model: {}", e))),
    }
 }