perf: eliminate per-request SQLite queries and optimize proxy latency

- Add in-memory ModelConfigCache (30s refresh, explicit invalidation) replacing 2 SQLite queries per request (model lookup + cost override) - Configure all 5 provider HTTP clients with proper timeouts (300s), connection pooling (4 idle/host, 90s idle timeout), and TCP keepalive - Move client_usage update to tokio::spawn in non-streaming path - Use fast chars/4 heuristic for token estimation on large inputs (>1KB) - Generate single UUID/timestamp per SSE stream instead of per chunk - Add shared LazyLock<Client> for image fetching in multimodal module - Add proxy overhead timing instrumentation for both request paths - Fix test helper to include new model_config_cache field
2026-03-02 12:53:22 -05:00
parent e4cf088071
commit 8d50ce7c22
13 changed files with 232 additions and 74 deletions
--- a/src/utils/streaming.rs
+++ b/src/utils/streaming.rs
@@ -3,9 +3,9 @@ use crate::errors::AppError;
 use crate::logging::{RequestLog, RequestLogger};
 use crate::models::ToolCall;
 use crate::providers::{Provider, ProviderStreamChunk};
+use crate::state::ModelConfigCache;
 use crate::utils::tokens::estimate_completion_tokens;
 use futures::stream::Stream;
-use sqlx::Row;
 use std::pin::Pin;
 use std::sync::Arc;
 use std::task::{Context, Poll};
@@ -20,7 +20,7 @@ pub struct StreamConfig {
    pub logger: Arc<RequestLogger>,
    pub client_manager: Arc<ClientManager>,
    pub model_registry: Arc<crate::models::registry::ModelRegistry>,
-    pub db_pool: crate::database::DbPool,
+    pub model_config_cache: ModelConfigCache,
 }

 pub struct AggregatingStream<S> {
@@ -36,7 +36,7 @@ pub struct AggregatingStream<S> {
    logger: Arc<RequestLogger>,
    client_manager: Arc<ClientManager>,
    model_registry: Arc<crate::models::registry::ModelRegistry>,
-    db_pool: crate::database::DbPool,
+    model_config_cache: ModelConfigCache,
    start_time: std::time::Instant,
    has_logged: bool,
 }
@@ -59,7 +59,7 @@ where
            logger: config.logger,
            client_manager: config.client_manager,
            model_registry: config.model_registry,
-            db_pool: config.db_pool,
+            model_config_cache: config.model_config_cache,
            start_time: std::time::Instant::now(),
            has_logged: false,
        }
@@ -81,7 +81,7 @@ where
        let prompt_tokens = self.prompt_tokens;
        let has_images = self.has_images;
        let registry = self.model_registry.clone();
-        let pool = self.db_pool.clone();
+        let config_cache = self.model_config_cache.clone();

        // Estimate completion tokens (including reasoning if present)
        let content_tokens = estimate_completion_tokens(&self.accumulated_content, &model);
@@ -96,19 +96,9 @@ where

        // Spawn a background task to log the completion
        tokio::spawn(async move {
-            // Check database for cost overrides
-            let db_cost =
-                sqlx::query("SELECT prompt_cost_per_m, completion_cost_per_m FROM model_configs WHERE id = ?")
-                    .bind(&model)
-                    .fetch_optional(&pool)
-                    .await
-                    .unwrap_or(None);
-
-            let cost = if let Some(row) = db_cost {
-                let prompt_rate = row.get::<Option<f64>, _>("prompt_cost_per_m");
-                let completion_rate = row.get::<Option<f64>, _>("completion_cost_per_m");
-
-                if let (Some(p), Some(c)) = (prompt_rate, completion_rate) {
+            // Check in-memory cache for cost overrides (no SQLite hit)
+            let cost = if let Some(cached) = config_cache.get(&model).await {
+                if let (Some(p), Some(c)) = (cached.prompt_cost_per_m, cached.completion_cost_per_m) {
                    (prompt_tokens as f64 * p / 1_000_000.0) + (completion_tokens as f64 * c / 1_000_000.0)
                } else {
                    provider.calculate_cost(&model, prompt_tokens, completion_tokens, &registry)
@@ -284,7 +274,7 @@ mod tests {
                logger,
                client_manager,
                model_registry: registry,
-                db_pool: pool.clone(),
+                model_config_cache: ModelConfigCache::new(pool.clone()),
            },
        );