perf: eliminate per-request SQLite queries and optimize proxy latency
- Add in-memory ModelConfigCache (30s refresh, explicit invalidation) replacing 2 SQLite queries per request (model lookup + cost override)
- Configure all 5 provider HTTP clients with proper timeouts (300s), connection pooling (4 idle/host, 90s idle timeout), and TCP keepalive
- Move client_usage update to tokio::spawn in non-streaming path
- Use fast chars/4 heuristic for token estimation on large inputs (>1KB)
- Generate single UUID/timestamp per SSE stream instead of per chunk
- Add shared LazyLock<Client> for image fetching in multimodal module
- Add proxy overhead timing instrumentation for both request paths
- Fix test helper to include new model_config_cache field
This commit is contained in:
@@ -6,7 +6,6 @@ use axum::{
|
||||
routing::post,
|
||||
};
|
||||
use futures::stream::StreamExt;
|
||||
use sqlx::Row;
|
||||
use std::sync::Arc;
|
||||
use tracing::{info, warn};
|
||||
use uuid::Uuid;
|
||||
@@ -39,18 +38,9 @@ async fn get_model_cost(
|
||||
provider: &Arc<dyn crate::providers::Provider>,
|
||||
state: &AppState,
|
||||
) -> f64 {
|
||||
// Check database for cost overrides
|
||||
let db_cost = sqlx::query("SELECT prompt_cost_per_m, completion_cost_per_m FROM model_configs WHERE id = ?")
|
||||
.bind(model)
|
||||
.fetch_optional(&state.db_pool)
|
||||
.await
|
||||
.unwrap_or(None);
|
||||
|
||||
if let Some(row) = db_cost {
|
||||
let prompt_rate = row.get::<Option<f64>, _>("prompt_cost_per_m");
|
||||
let completion_rate = row.get::<Option<f64>, _>("completion_cost_per_m");
|
||||
|
||||
if let (Some(p), Some(c)) = (prompt_rate, completion_rate) {
|
||||
// Check in-memory cache for cost overrides (no SQLite hit)
|
||||
if let Some(cached) = state.model_config_cache.get(model).await {
|
||||
if let (Some(p), Some(c)) = (cached.prompt_cost_per_m, cached.completion_cost_per_m) {
|
||||
return (prompt_tokens as f64 * p / 1_000_000.0) + (completion_tokens as f64 * c / 1_000_000.0);
|
||||
}
|
||||
}
|
||||
@@ -75,15 +65,11 @@ async fn chat_completions(
|
||||
|
||||
info!("Chat completion request from client {} for model {}", client_id, model);
|
||||
|
||||
// Check if model is enabled in database and get potential mapping
|
||||
let model_config = sqlx::query("SELECT enabled, mapping FROM model_configs WHERE id = ?")
|
||||
.bind(&model)
|
||||
.fetch_optional(&state.db_pool)
|
||||
.await
|
||||
.unwrap_or(None);
|
||||
// Check if model is enabled via in-memory cache (no SQLite hit)
|
||||
let cached_config = state.model_config_cache.get(&model).await;
|
||||
|
||||
let (model_enabled, model_mapping) = match model_config {
|
||||
Some(row) => (row.get::<bool, _>("enabled"), row.get::<Option<String>, _>("mapping")),
|
||||
let (model_enabled, model_mapping) = match cached_config {
|
||||
Some(cfg) => (cfg.enabled, cfg.mapping),
|
||||
None => (true, None),
|
||||
};
|
||||
|
||||
@@ -129,6 +115,9 @@ async fn chat_completions(
|
||||
|
||||
let has_images = unified_request.has_images;
|
||||
|
||||
// Measure proxy overhead (time spent before sending to upstream provider)
|
||||
let proxy_overhead = start_time.elapsed();
|
||||
|
||||
// Check if streaming is requested
|
||||
if unified_request.stream {
|
||||
// Estimate prompt tokens for logging later
|
||||
@@ -142,6 +131,12 @@ async fn chat_completions(
|
||||
// Record provider success
|
||||
state.rate_limit_manager.record_provider_success(&provider_name).await;
|
||||
|
||||
info!(
|
||||
"Streaming started for {} (proxy overhead: {}ms)",
|
||||
model,
|
||||
proxy_overhead.as_millis()
|
||||
);
|
||||
|
||||
// Wrap with AggregatingStream for token counting and database logging
|
||||
let aggregating_stream = crate::utils::streaming::AggregatingStream::new(
|
||||
stream,
|
||||
@@ -154,19 +149,21 @@ async fn chat_completions(
|
||||
logger: state.request_logger.clone(),
|
||||
client_manager: state.client_manager.clone(),
|
||||
model_registry: state.model_registry.clone(),
|
||||
db_pool: state.db_pool.clone(),
|
||||
model_config_cache: state.model_config_cache.clone(),
|
||||
},
|
||||
);
|
||||
|
||||
// Create SSE stream from aggregating stream
|
||||
let stream_id = format!("chatcmpl-{}", Uuid::new_v4());
|
||||
let stream_created = chrono::Utc::now().timestamp() as u64;
|
||||
let sse_stream = aggregating_stream.map(move |chunk_result| {
|
||||
match chunk_result {
|
||||
Ok(chunk) => {
|
||||
// Convert provider chunk to OpenAI-compatible SSE event
|
||||
let response = ChatCompletionStreamResponse {
|
||||
id: format!("chatcmpl-{}", Uuid::new_v4()),
|
||||
id: stream_id.clone(),
|
||||
object: "chat.completion.chunk".to_string(),
|
||||
created: chrono::Utc::now().timestamp() as u64,
|
||||
created: stream_created,
|
||||
model: chunk.model.clone(),
|
||||
choices: vec![ChatStreamChoice {
|
||||
index: 0,
|
||||
@@ -242,11 +239,14 @@ async fn chat_completions(
|
||||
duration_ms: duration.as_millis() as u64,
|
||||
});
|
||||
|
||||
// Update client usage
|
||||
let _ = state
|
||||
.client_manager
|
||||
.update_client_usage(&client_id, response.total_tokens as i64, cost)
|
||||
.await;
|
||||
// Update client usage (fire-and-forget, don't block response)
|
||||
{
|
||||
let cm = state.client_manager.clone();
|
||||
let cid = client_id.clone();
|
||||
tokio::spawn(async move {
|
||||
let _ = cm.update_client_usage(&cid, response.total_tokens as i64, cost).await;
|
||||
});
|
||||
}
|
||||
|
||||
// Convert ProviderResponse to ChatCompletionResponse
|
||||
let finish_reason = if response.tool_calls.is_some() {
|
||||
@@ -281,8 +281,14 @@ async fn chat_completions(
|
||||
}),
|
||||
};
|
||||
|
||||
// Log successful request
|
||||
info!("Request completed successfully in {:?}", duration);
|
||||
// Log successful request with proxy overhead breakdown
|
||||
let upstream_ms = duration.as_millis() as u64 - proxy_overhead.as_millis() as u64;
|
||||
info!(
|
||||
"Request completed in {:?} (proxy: {}ms, upstream: {}ms)",
|
||||
duration,
|
||||
proxy_overhead.as_millis(),
|
||||
upstream_ms
|
||||
);
|
||||
|
||||
Ok(Json(chat_response).into_response())
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user