perf: eliminate per-request SQLite queries and optimize proxy latency

- Add in-memory ModelConfigCache (30s refresh, explicit invalidation) replacing 2 SQLite queries per request (model lookup + cost override)
- Configure all 5 provider HTTP clients with proper timeouts (300s), connection pooling (4 idle/host, 90s idle timeout), and TCP keepalive
- Move client_usage update to tokio::spawn in non-streaming path
- Use fast chars/4 heuristic for token estimation on large inputs (>1KB)
- Generate single UUID/timestamp per SSE stream instead of per chunk
- Add shared LazyLock<Client> for image fetching in multimodal module
- Add proxy overhead timing instrumentation for both request paths
- Fix test helper to include new model_config_cache field
2026-03-02 12:53:22 -05:00
parent e4cf088071
commit 8d50ce7c22
13 changed files with 232 additions and 74 deletions
--- a/src/providers/ollama.rs
+++ b/src/providers/ollama.rs
@@ -14,8 +14,16 @@ pub struct OllamaProvider {

 impl OllamaProvider {
    pub fn new(config: &crate::config::OllamaConfig, app_config: &AppConfig) -> Result<Self> {
+        let client = reqwest::Client::builder()
+            .connect_timeout(std::time::Duration::from_secs(5))
+            .timeout(std::time::Duration::from_secs(300))
+            .pool_idle_timeout(std::time::Duration::from_secs(90))
+            .pool_max_idle_per_host(4)
+            .tcp_keepalive(std::time::Duration::from_secs(30))
+            .build()?;
+
        Ok(Self {
-            client: reqwest::Client::new(),
+            client,
            config: config.clone(),
            pricing: app_config.pricing.ollama.clone(),
        })