perf: eliminate per-request SQLite queries and optimize proxy latency
- Add in-memory ModelConfigCache (30s refresh, explicit invalidation), replacing 2 SQLite queries per request (model lookup + cost override)
- Configure all 5 provider HTTP clients with proper timeouts (300s), connection pooling (4 idle connections per host, 90s idle timeout), and TCP keepalive
- Move client_usage update to tokio::spawn in the non-streaming path
- Use a fast chars/4 heuristic for token estimation on large inputs (>1KB)
- Generate a single UUID/timestamp per SSE stream instead of per chunk
- Add a shared LazyLock<Client> for image fetching in the multimodal module
- Add proxy overhead timing instrumentation for both request paths
- Fix test helper to include the new model_config_cache field
This commit is contained in:
@@ -65,6 +65,11 @@ async fn main() -> Result<()> {
|
||||
config.server.auth_tokens.clone(),
|
||||
);
|
||||
|
||||
// Initialize model config cache and start background refresh (every 30s)
|
||||
state.model_config_cache.refresh().await;
|
||||
state.model_config_cache.clone().start_refresh_task(30);
|
||||
info!("Model config cache initialized");
|
||||
|
||||
// Create application router
|
||||
let app = Router::new()
|
||||
.route("/health", get(health_check))
|
||||
|
||||
Reference in New Issue
Block a user