perf: eliminate per-request SQLite queries and optimize proxy latency
- Add in-memory ModelConfigCache (30s refresh, explicit invalidation) replacing 2 SQLite queries per request (model lookup + cost override)
- Configure all 5 provider HTTP clients with proper timeouts (300s), connection pooling (4 idle/host, 90s idle timeout), and TCP keepalive
- Move client_usage update to tokio::spawn in non-streaming path
- Use fast chars/4 heuristic for token estimation on large inputs (>1KB)
- Generate single UUID/timestamp per SSE stream instead of per chunk
- Add shared LazyLock<Client> for image fetching in multimodal module
- Add proxy overhead timing instrumentation for both request paths
- Fix test helper to include new model_config_cache field
This commit is contained in:
@@ -6,7 +6,6 @@ use axum::{
|
||||
routing::post,
|
||||
};
|
||||
use futures::stream::StreamExt;
|
||||
use sqlx::Row;
|
||||
use std::sync::Arc;
|
||||
use tracing::{info, warn};
|
||||
use uuid::Uuid;
|
||||
@@ -39,18 +38,9 @@ async fn get_model_cost(
|
||||
provider: &Arc<dyn crate::providers::Provider>,
|
||||
state: &AppState,
|
||||
) -> f64 {
|
||||
// Check database for cost overrides
|
||||
let db_cost = sqlx::query("SELECT prompt_cost_per_m, completion_cost_per_m FROM model_configs WHERE id = ?")
|
||||
.bind(model)
|
||||
.fetch_optional(&state.db_pool)
|
||||
.await
|
||||
.unwrap_or(None);
|
||||
|
||||
if let Some(row) = db_cost {
|
||||
let prompt_rate = row.get::<Option<f64>, _>("prompt_cost_per_m");
|
||||
let completion_rate = row.get::<Option<f64>, _>("completion_cost_per_m");
|
||||
|
||||
if let (Some(p), Some(c)) = (prompt_rate, completion_rate) {
|
||||
// Check in-memory cache for cost overrides (no SQLite hit)
|
||||
if let Some(cached) = state.model_config_cache.get(model).await {
|
||||
if let (Some(p), Some(c)) = (cached.prompt_cost_per_m, cached.completion_cost_per_m) {
|
||||
return (prompt_tokens as f64 * p / 1_000_000.0) + (completion_tokens as f64 * c / 1_000_000.0);
|
||||
}
|
||||
}
|
||||
@@ -75,15 +65,11 @@ async fn chat_completions(
|
||||
|
||||
info!("Chat completion request from client {} for model {}", client_id, model);
|
||||
|
||||
// Check if model is enabled in database and get potential mapping
|
||||
let model_config = sqlx::query("SELECT enabled, mapping FROM model_configs WHERE id = ?")
|
||||
.bind(&model)
|
||||
.fetch_optional(&state.db_pool)
|
||||
.await
|
||||
.unwrap_or(None);
|
||||
// Check if model is enabled via in-memory cache (no SQLite hit)
|
||||
let cached_config = state.model_config_cache.get(&model).await;
|
||||
|
||||
let (model_enabled, model_mapping) = match model_config {
|
||||
Some(row) => (row.get::<bool, _>("enabled"), row.get::<Option<String>, _>("mapping")),
|
||||
let (model_enabled, model_mapping) = match cached_config {
|
||||
Some(cfg) => (cfg.enabled, cfg.mapping),
|
||||
None => (true, None),
|
||||
};
|
||||
|
||||
@@ -129,6 +115,9 @@ async fn chat_completions(
|
||||
|
||||
let has_images = unified_request.has_images;
|
||||
|
||||
// Measure proxy overhead (time spent before sending to upstream provider)
|
||||
let proxy_overhead = start_time.elapsed();
|
||||
|
||||
// Check if streaming is requested
|
||||
if unified_request.stream {
|
||||
// Estimate prompt tokens for logging later
|
||||
@@ -142,6 +131,12 @@ async fn chat_completions(
|
||||
// Record provider success
|
||||
state.rate_limit_manager.record_provider_success(&provider_name).await;
|
||||
|
||||
info!(
|
||||
"Streaming started for {} (proxy overhead: {}ms)",
|
||||
model,
|
||||
proxy_overhead.as_millis()
|
||||
);
|
||||
|
||||
// Wrap with AggregatingStream for token counting and database logging
|
||||
let aggregating_stream = crate::utils::streaming::AggregatingStream::new(
|
||||
stream,
|
||||
@@ -154,19 +149,21 @@ async fn chat_completions(
|
||||
logger: state.request_logger.clone(),
|
||||
client_manager: state.client_manager.clone(),
|
||||
model_registry: state.model_registry.clone(),
|
||||
db_pool: state.db_pool.clone(),
|
||||
model_config_cache: state.model_config_cache.clone(),
|
||||
},
|
||||
);
|
||||
|
||||
// Create SSE stream from aggregating stream
|
||||
let stream_id = format!("chatcmpl-{}", Uuid::new_v4());
|
||||
let stream_created = chrono::Utc::now().timestamp() as u64;
|
||||
let sse_stream = aggregating_stream.map(move |chunk_result| {
|
||||
match chunk_result {
|
||||
Ok(chunk) => {
|
||||
// Convert provider chunk to OpenAI-compatible SSE event
|
||||
let response = ChatCompletionStreamResponse {
|
||||
id: format!("chatcmpl-{}", Uuid::new_v4()),
|
||||
id: stream_id.clone(),
|
||||
object: "chat.completion.chunk".to_string(),
|
||||
created: chrono::Utc::now().timestamp() as u64,
|
||||
created: stream_created,
|
||||
model: chunk.model.clone(),
|
||||
choices: vec![ChatStreamChoice {
|
||||
index: 0,
|
||||
@@ -242,11 +239,14 @@ async fn chat_completions(
|
||||
duration_ms: duration.as_millis() as u64,
|
||||
});
|
||||
|
||||
// Update client usage
|
||||
let _ = state
|
||||
.client_manager
|
||||
.update_client_usage(&client_id, response.total_tokens as i64, cost)
|
||||
.await;
|
||||
// Update client usage (fire-and-forget, don't block response)
|
||||
{
|
||||
let cm = state.client_manager.clone();
|
||||
let cid = client_id.clone();
|
||||
tokio::spawn(async move {
|
||||
let _ = cm.update_client_usage(&cid, response.total_tokens as i64, cost).await;
|
||||
});
|
||||
}
|
||||
|
||||
// Convert ProviderResponse to ChatCompletionResponse
|
||||
let finish_reason = if response.tool_calls.is_some() {
|
||||
@@ -281,8 +281,14 @@ async fn chat_completions(
|
||||
}),
|
||||
};
|
||||
|
||||
// Log successful request
|
||||
info!("Request completed successfully in {:?}", duration);
|
||||
// Log successful request with proxy overhead breakdown
|
||||
let upstream_ms = duration.as_millis() as u64 - proxy_overhead.as_millis() as u64;
|
||||
info!(
|
||||
"Request completed in {:?} (proxy: {}ms, upstream: {}ms)",
|
||||
duration,
|
||||
proxy_overhead.as_millis(),
|
||||
upstream_ms
|
||||
);
|
||||
|
||||
Ok(Json(chat_response).into_response())
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user