diff --git a/src/dashboard/models.rs b/src/dashboard/models.rs
index b75c3944..eb870e91 100644
--- a/src/dashboard/models.rs
+++ b/src/dashboard/models.rs
@@ -84,6 +84,8 @@ pub(super) async fn handle_get_models(
         let mut enabled = true;
         let mut prompt_cost = m_meta.cost.as_ref().map(|c| c.input).unwrap_or(0.0);
         let mut completion_cost = m_meta.cost.as_ref().map(|c| c.output).unwrap_or(0.0);
+        let cache_read_cost = m_meta.cost.as_ref().and_then(|c| c.cache_read);
+        let cache_write_cost = m_meta.cost.as_ref().and_then(|c| c.cache_write);
         let mut mapping = None::<String>;
 
         if let Some(row) = db_models.get(m_key) {
@@ -105,6 +107,8 @@ pub(super) async fn handle_get_models(
             "enabled": enabled,
             "prompt_cost": prompt_cost,
             "completion_cost": completion_cost,
+            "cache_read_cost": cache_read_cost,
+            "cache_write_cost": cache_write_cost,
             "mapping": mapping,
             "context_limit": m_meta.limit.as_ref().map(|l| l.context).unwrap_or(0),
             "output_limit": m_meta.limit.as_ref().map(|l| l.output).unwrap_or(0),
diff --git a/src/dashboard/usage.rs b/src/dashboard/usage.rs
index 2b522b18..5a0c7133 100644
--- a/src/dashboard/usage.rs
+++ b/src/dashboard/usage.rs
@@ -16,7 +16,9 @@ pub(super) async fn handle_usage_summary(State(state): State<AppState>) ->
             COUNT(*) as total_requests,
             COALESCE(SUM(total_tokens), 0) as total_tokens,
             COALESCE(SUM(cost), 0.0) as total_cost,
-            COUNT(DISTINCT client_id) as active_clients
+            COUNT(DISTINCT client_id) as active_clients,
+            COALESCE(SUM(cache_read_tokens), 0) as total_cache_read,
+            COALESCE(SUM(cache_write_tokens), 0) as total_cache_write
         FROM llm_requests
         "#,
     )
@@ -64,6 +66,8 @@ pub(super) async fn handle_usage_summary(State(state): State<AppState>) ->
             let total_tokens: i64 = t.get("total_tokens");
             let total_cost: f64 = t.get("total_cost");
             let active_clients: i64 = t.get("active_clients");
+            let total_cache_read: i64 = t.get("total_cache_read");
+            let total_cache_write: i64 = t.get("total_cache_write");
 
             let today_requests: i64 = d.get("today_requests");
             let today_cost: f64 = d.get("today_cost");
@@ -87,6 +91,8 @@ pub(super) async fn handle_usage_summary(State(state): State<AppState>) ->
                 "today_cost": today_cost,
                 "error_rate": error_rate,
                 "avg_response_time": avg_response_time,
+                "total_cache_read_tokens": total_cache_read,
+                "total_cache_write_tokens": total_cache_write,
             })))
         }
         _ => Json(ApiResponse::error("Failed to fetch usage statistics".to_string())),
@@ -208,7 +214,9 @@ pub(super) async fn handle_providers_usage(
             provider,
             COUNT(*) as requests,
             COALESCE(SUM(total_tokens), 0) as tokens,
-            COALESCE(SUM(cost), 0.0) as cost
+            COALESCE(SUM(cost), 0.0) as cost,
+            COALESCE(SUM(cache_read_tokens), 0) as cache_read,
+            COALESCE(SUM(cache_write_tokens), 0) as cache_write
         FROM llm_requests
         GROUP BY provider
         ORDER BY requests DESC
@@ -226,12 +234,16 @@ pub(super) async fn handle_providers_usage(
         let requests: i64 = row.get("requests");
         let tokens: i64 = row.get("tokens");
         let cost: f64 = row.get("cost");
+        let cache_read: i64 = row.get("cache_read");
+        let cache_write: i64 = row.get("cache_write");
 
         provider_usage.push(serde_json::json!({
             "provider": provider,
             "requests": requests,
             "tokens": tokens,
             "cost": cost,
+            "cache_read_tokens": cache_read,
+            "cache_write_tokens": cache_write,
         }));
     }
@@ -256,7 +268,9 @@ pub(super) async fn handle_detailed_usage(State(state): State<AppState>) ->
             model,
             COUNT(*) as requests,
             COALESCE(SUM(total_tokens), 0) as tokens,
-            COALESCE(SUM(cost), 0.0) as cost
+            COALESCE(SUM(cost), 0.0) as cost,
+            COALESCE(SUM(cache_read_tokens), 0) as cache_read,
+            COALESCE(SUM(cache_write_tokens), 0) as cache_write
         FROM llm_requests
         GROUP BY date, client_id, provider, model
         ORDER BY date DESC
@@ -279,6 +293,8 @@ pub(super) async fn handle_detailed_usage(State(state): State<AppState>) ->
                 "requests": row.get::<i64, _>("requests"),
                 "tokens": row.get::<i64, _>("tokens"),
                 "cost": row.get::<f64, _>("cost"),
+                "cache_read_tokens": row.get::<i64, _>("cache_read"),
+                "cache_write_tokens": row.get::<i64, _>("cache_write"),
             })
         })
         .collect();
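Note on the new aggregates: `SUM` over zero rows yields `NULL` in SQLite, so each cache column is wrapped in `COALESCE` before the typed `row.get` reads above can run. A minimal standalone sketch of the failure mode being avoided (not part of this patch; assumes sqlx with the `sqlite` and `runtime-tokio` features):

```rust
use sqlx::{Row, SqlitePool};

#[tokio::main]
async fn main() -> Result<(), sqlx::Error> {
    let pool = SqlitePool::connect("sqlite::memory:").await?;
    sqlx::query("CREATE TABLE llm_requests (cache_read_tokens INTEGER DEFAULT 0)")
        .execute(&pool)
        .await?;

    // With no rows, SUM(cache_read_tokens) is NULL; COALESCE maps it to 0 so the
    // typed i64 read below cannot fail on a fresh database.
    let row = sqlx::query(
        "SELECT COALESCE(SUM(cache_read_tokens), 0) AS total_cache_read FROM llm_requests",
    )
    .fetch_one(&pool)
    .await?;
    assert_eq!(row.get::<i64, _>("total_cache_read"), 0);
    Ok(())
}
```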
diff --git a/src/database/mod.rs b/src/database/mod.rs
index c352abfb..1c796a40 100644
--- a/src/database/mod.rs
+++ b/src/database/mod.rs
@@ -135,6 +135,14 @@ async fn run_migrations(pool: &DbPool) -> Result<()> {
         .execute(pool)
         .await;
 
+    // Add cache token columns if they don't exist (migration for existing DBs)
+    let _ = sqlx::query("ALTER TABLE llm_requests ADD COLUMN cache_read_tokens INTEGER DEFAULT 0")
+        .execute(pool)
+        .await;
+    let _ = sqlx::query("ALTER TABLE llm_requests ADD COLUMN cache_write_tokens INTEGER DEFAULT 0")
+        .execute(pool)
+        .await;
+
     // Insert default admin user if none exists (default password: admin)
     let user_count: (i64,) = sqlx::query_as("SELECT COUNT(*) FROM users").fetch_one(pool).await?;
diff --git a/src/logging/mod.rs b/src/logging/mod.rs
index d80fdd10..b1bd1c42 100644
--- a/src/logging/mod.rs
+++ b/src/logging/mod.rs
@@ -16,6 +16,8 @@ pub struct RequestLog {
     pub prompt_tokens: u32,
     pub completion_tokens: u32,
     pub total_tokens: u32,
+    pub cache_read_tokens: u32,
+    pub cache_write_tokens: u32,
     pub cost: f64,
     pub has_images: bool,
     pub status: String, // "success", "error"
@@ -75,8 +77,8 @@ impl RequestLogger {
         sqlx::query(
             r#"
             INSERT INTO llm_requests
-            (timestamp, client_id, provider, model, prompt_tokens, completion_tokens, total_tokens, cost, has_images, status, error_message, duration_ms, request_body, response_body)
-            VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
+            (timestamp, client_id, provider, model, prompt_tokens, completion_tokens, total_tokens, cache_read_tokens, cache_write_tokens, cost, has_images, status, error_message, duration_ms, request_body, response_body)
+            VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
             "#,
         )
         .bind(log.timestamp)
@@ -86,6 +88,8 @@ impl RequestLogger {
         .bind(log.prompt_tokens as i64)
         .bind(log.completion_tokens as i64)
         .bind(log.total_tokens as i64)
+        .bind(log.cache_read_tokens as i64)
+        .bind(log.cache_write_tokens as i64)
        .bind(log.cost)
         .bind(log.has_images)
         .bind(log.status)
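On the ignored-error migration above: SQLite has no `ADD COLUMN IF NOT EXISTS`, so on an already-migrated database the `ALTER TABLE` fails with a "duplicate column name" error, which the discarded `Result` (`let _ =`) deliberately swallows, making the migration idempotent. The same idea with the error handled explicitly, as a hypothetical helper that is not in this patch:

```rust
use sqlx::SqlitePool;

/// Attempt a single ALTER TABLE and treat "duplicate column name" as a no-op.
async fn add_column_if_missing(pool: &SqlitePool, ddl: &str) -> Result<(), sqlx::Error> {
    match sqlx::query(ddl).execute(pool).await {
        Ok(_) => Ok(()),
        // Expected on databases that already ran this migration.
        Err(e) if e.to_string().contains("duplicate column name") => Ok(()),
        Err(e) => Err(e),
    }
}
```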
"#, ) .bind(log.timestamp) @@ -86,6 +88,8 @@ impl RequestLogger { .bind(log.prompt_tokens as i64) .bind(log.completion_tokens as i64) .bind(log.total_tokens as i64) + .bind(log.cache_read_tokens as i64) + .bind(log.cache_write_tokens as i64) .bind(log.cost) .bind(log.has_images) .bind(log.status) diff --git a/src/models/mod.rs b/src/models/mod.rs index d632d9d7..2eea0b8f 100644 --- a/src/models/mod.rs +++ b/src/models/mod.rs @@ -152,6 +152,10 @@ pub struct Usage { pub prompt_tokens: u32, pub completion_tokens: u32, pub total_tokens: u32, + #[serde(skip_serializing_if = "Option::is_none")] + pub cache_read_tokens: Option, + #[serde(skip_serializing_if = "Option::is_none")] + pub cache_write_tokens: Option, } // ========== Streaming Response Structs ========== diff --git a/src/providers/deepseek.rs b/src/providers/deepseek.rs index 905452c7..611294cd 100644 --- a/src/providers/deepseek.rs +++ b/src/providers/deepseek.rs @@ -90,12 +90,16 @@ impl super::Provider for DeepSeekProvider { model: &str, prompt_tokens: u32, completion_tokens: u32, + cache_read_tokens: u32, + cache_write_tokens: u32, registry: &crate::models::registry::ModelRegistry, ) -> f64 { helpers::calculate_cost_with_registry( model, prompt_tokens, completion_tokens, + cache_read_tokens, + cache_write_tokens, registry, &self.pricing, 0.14, diff --git a/src/providers/gemini.rs b/src/providers/gemini.rs index 53ad336a..357a7866 100644 --- a/src/providers/gemini.rs +++ b/src/providers/gemini.rs @@ -119,6 +119,8 @@ struct GeminiUsageMetadata { candidates_token_count: u32, #[serde(default)] total_token_count: u32, + #[serde(default)] + cached_content_token_count: u32, } #[derive(Debug, Deserialize)] @@ -454,6 +456,11 @@ impl super::Provider for GeminiProvider { .as_ref() .map(|u| u.total_token_count) .unwrap_or(0); + let cache_read_tokens = gemini_response + .usage_metadata + .as_ref() + .map(|u| u.cached_content_token_count) + .unwrap_or(0); Ok(ProviderResponse { content, @@ -462,6 +469,8 @@ impl super::Provider for GeminiProvider { prompt_tokens, completion_tokens, total_tokens, + cache_read_tokens, + cache_write_tokens: 0, // Gemini doesn't report cache writes separately model, }) } @@ -475,12 +484,16 @@ impl super::Provider for GeminiProvider { model: &str, prompt_tokens: u32, completion_tokens: u32, + cache_read_tokens: u32, + cache_write_tokens: u32, registry: &crate::models::registry::ModelRegistry, ) -> f64 { super::helpers::calculate_cost_with_registry( model, prompt_tokens, completion_tokens, + cache_read_tokens, + cache_write_tokens, registry, &self.pricing, 0.075, @@ -537,6 +550,17 @@ impl super::Provider for GeminiProvider { let gemini_response: GeminiResponse = serde_json::from_str(&msg.data) .map_err(|e| AppError::ProviderError(format!("Failed to parse stream chunk: {}", e)))?; + // Extract usage from usageMetadata if present (reported on every/last chunk) + let stream_usage = gemini_response.usage_metadata.as_ref().map(|u| { + super::StreamUsage { + prompt_tokens: u.prompt_token_count, + completion_tokens: u.candidates_token_count, + total_tokens: u.total_token_count, + cache_read_tokens: u.cached_content_token_count, + cache_write_tokens: 0, + } + }); + if let Some(candidate) = gemini_response.candidates.first() { let content = candidate .content @@ -561,6 +585,7 @@ impl super::Provider for GeminiProvider { finish_reason, tool_calls, model: model.clone(), + usage: stream_usage, }; } } diff --git a/src/providers/grok.rs b/src/providers/grok.rs index 858c83b9..3da7fa89 100644 --- a/src/providers/grok.rs +++ 
diff --git a/src/providers/grok.rs b/src/providers/grok.rs
index 858c83b9..3da7fa89 100644
--- a/src/providers/grok.rs
+++ b/src/providers/grok.rs
@@ -86,12 +86,16 @@ impl super::Provider for GrokProvider {
         model: &str,
         prompt_tokens: u32,
         completion_tokens: u32,
+        cache_read_tokens: u32,
+        cache_write_tokens: u32,
         registry: &crate::models::registry::ModelRegistry,
     ) -> f64 {
         helpers::calculate_cost_with_registry(
             model,
             prompt_tokens,
             completion_tokens,
+            cache_read_tokens,
+            cache_write_tokens,
             registry,
             &self.pricing,
             5.0,
diff --git a/src/providers/helpers.rs b/src/providers/helpers.rs
index 81bc214d..4a048323 100644
--- a/src/providers/helpers.rs
+++ b/src/providers/helpers.rs
@@ -1,4 +1,4 @@
-use super::{ProviderResponse, ProviderStreamChunk};
+use super::{ProviderResponse, ProviderStreamChunk, StreamUsage};
 use crate::errors::AppError;
 use crate::models::{ContentPart, ToolCall, ToolCallDelta, UnifiedMessage, UnifiedRequest};
 use futures::stream::{BoxStream, StreamExt};
@@ -156,6 +156,8 @@ pub async fn messages_to_openai_json_text_only(
 
 /// Build an OpenAI-compatible request body from a UnifiedRequest and pre-converted messages.
 /// Includes tools and tool_choice when present.
+/// When streaming, adds `stream_options.include_usage: true` so providers report
+/// token counts in the final SSE chunk.
 pub fn build_openai_body(
     request: &UnifiedRequest,
     messages_json: Vec<serde_json::Value>,
@@ -167,6 +169,10 @@ pub fn build_openai_body(
         "stream": stream,
     });
 
+    if stream {
+        body["stream_options"] = serde_json::json!({ "include_usage": true });
+    }
+
     if let Some(temp) = request.temperature {
         body["temperature"] = serde_json::json!(temp);
     }
@@ -185,6 +191,9 @@ pub fn build_openai_body(
 
 /// Parse an OpenAI-compatible chat completion response JSON into a ProviderResponse.
 /// Extracts tool_calls from the message when present.
+/// Extracts cache token counts from:
+/// - OpenAI/Grok: `usage.prompt_tokens_details.cached_tokens`
+/// - DeepSeek: `usage.prompt_cache_hit_tokens` / `usage.prompt_cache_miss_tokens`
 pub fn parse_openai_response(resp_json: &Value, model: String) -> Result<ProviderResponse> {
     let choice = resp_json["choices"]
         .get(0)
@@ -204,6 +213,17 @@ pub fn parse_openai_response(resp_json: &Value, model: String) -> Result<ProviderResponse> {
+    // OpenAI/Grok put cached prompt tokens under prompt_tokens_details;
+    // DeepSeek reports an explicit prompt_cache_hit_tokens count instead.
+    let cache_read_tokens = resp_json["usage"]["prompt_tokens_details"]["cached_tokens"]
+        .as_u64()
+        .or_else(|| resp_json["usage"]["prompt_cache_hit_tokens"].as_u64())
+        .unwrap_or(0) as u32;
+    // Neither format reports cache writes in the response body.
+    let cache_write_tokens = 0;
+
     Ok(ProviderResponse {
         ...
         prompt_tokens,
         completion_tokens,
         total_tokens,
+        cache_read_tokens,
+        cache_write_tokens,
         model,
     })
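The two usage formats named in the doc comment above differ in shape; a standalone sketch of the fallback order (sample payloads invented, field paths per the comment):

```rust
use serde_json::{json, Value};

// Prefer the OpenAI/Grok location, then fall back to DeepSeek's field.
fn cache_read_tokens(usage: &Value) -> u32 {
    usage["prompt_tokens_details"]["cached_tokens"]
        .as_u64()
        .or_else(|| usage["prompt_cache_hit_tokens"].as_u64())
        .unwrap_or(0) as u32
}

fn main() {
    let openai = json!({ "prompt_tokens_details": { "cached_tokens": 512 } });
    let deepseek = json!({ "prompt_cache_hit_tokens": 256, "prompt_cache_miss_tokens": 64 });
    assert_eq!(cache_read_tokens(&openai), 512);
    assert_eq!(cache_read_tokens(&deepseek), 256);
    assert_eq!(cache_read_tokens(&json!({})), 0); // no cache info reported
}
```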
diff --git a/src/providers/mod.rs b/src/providers/mod.rs
--- a/src/providers/mod.rs
+++ b/src/providers/mod.rs
@@ ... @@ pub trait Provider {
     fn estimate_tokens(&self, request: &UnifiedRequest) -> Result<u32>;
 
-    /// Calculate cost based on token usage and model using the registry
+    /// Calculate cost based on token usage and model using the registry.
+    /// `cache_read_tokens` / `cache_write_tokens` allow cache-aware pricing
+    /// when the registry provides `cache_read` / `cache_write` rates.
     fn calculate_cost(
         &self,
         model: &str,
         prompt_tokens: u32,
         completion_tokens: u32,
+        cache_read_tokens: u32,
+        cache_write_tokens: u32,
         registry: &crate::models::registry::ModelRegistry,
     ) -> f64;
 }
 
@@ -54,9 +58,21 @@ pub struct ProviderResponse {
     pub prompt_tokens: u32,
     pub completion_tokens: u32,
     pub total_tokens: u32,
+    pub cache_read_tokens: u32,
+    pub cache_write_tokens: u32,
     pub model: String,
 }
 
+/// Usage data from the final streaming chunk (when providers report real token counts).
+#[derive(Debug, Clone, Default)]
+pub struct StreamUsage {
+    pub prompt_tokens: u32,
+    pub completion_tokens: u32,
+    pub total_tokens: u32,
+    pub cache_read_tokens: u32,
+    pub cache_write_tokens: u32,
+}
+
 #[derive(Debug, Clone)]
 pub struct ProviderStreamChunk {
     pub content: String,
@@ -64,6 +80,8 @@ pub struct ProviderStreamChunk {
     pub finish_reason: Option<String>,
     pub tool_calls: Option<Vec<ToolCallDelta>>,
     pub model: String,
+    /// Populated only on the final chunk when providers report usage (e.g. stream_options.include_usage).
+    pub usage: Option<StreamUsage>,
 }
 
 use tokio::sync::RwLock;
@@ -299,6 +317,8 @@ pub mod placeholder {
         _model: &str,
         _prompt_tokens: u32,
         _completion_tokens: u32,
+        _cache_read_tokens: u32,
+        _cache_write_tokens: u32,
         _registry: &crate::models::registry::ModelRegistry,
     ) -> f64 {
         0.0
diff --git a/src/providers/ollama.rs b/src/providers/ollama.rs
index 8f5fed90..90b63d10 100644
--- a/src/providers/ollama.rs
+++ b/src/providers/ollama.rs
@@ -95,12 +95,16 @@ impl super::Provider for OllamaProvider {
         model: &str,
         prompt_tokens: u32,
         completion_tokens: u32,
+        cache_read_tokens: u32,
+        cache_write_tokens: u32,
         registry: &crate::models::registry::ModelRegistry,
     ) -> f64 {
         helpers::calculate_cost_with_registry(
             model,
             prompt_tokens,
             completion_tokens,
+            cache_read_tokens,
+            cache_write_tokens,
             registry,
             &self.pricing,
             0.0,
diff --git a/src/providers/openai.rs b/src/providers/openai.rs
index b424065d..15001f0a 100644
--- a/src/providers/openai.rs
+++ b/src/providers/openai.rs
@@ -86,12 +86,16 @@ impl super::Provider for OpenAIProvider {
         model: &str,
         prompt_tokens: u32,
         completion_tokens: u32,
+        cache_read_tokens: u32,
+        cache_write_tokens: u32,
         registry: &crate::models::registry::ModelRegistry,
     ) -> f64 {
         helpers::calculate_cost_with_registry(
             model,
             prompt_tokens,
             completion_tokens,
+            cache_read_tokens,
+            cache_write_tokens,
             registry,
             &self.pricing,
             0.15,
diff --git a/src/server/mod.rs b/src/server/mod.rs
index 69666bf1..962131d0 100644
--- a/src/server/mod.rs
+++ b/src/server/mod.rs
@@ -97,18 +97,21 @@ async fn get_model_cost(
     model: &str,
     prompt_tokens: u32,
     completion_tokens: u32,
+    cache_read_tokens: u32,
+    cache_write_tokens: u32,
     provider: &Arc<dyn Provider>,
     state: &AppState,
 ) -> f64 {
     // Check in-memory cache for cost overrides (no SQLite hit)
     if let Some(cached) = state.model_config_cache.get(model).await {
         if let (Some(p), Some(c)) = (cached.prompt_cost_per_m, cached.completion_cost_per_m) {
+            // Manual overrides don't have cache-specific rates, so use the simple formula
             return (prompt_tokens as f64 * p / 1_000_000.0)
                 + (completion_tokens as f64 * c / 1_000_000.0);
         }
     }
 
-    // Fallback to provider's registry-based calculation
-    provider.calculate_cost(model, prompt_tokens, completion_tokens, &state.model_registry)
+    // Fall back to the provider's registry-based calculation (cache-aware)
+    provider.calculate_cost(model, prompt_tokens, completion_tokens, cache_read_tokens, cache_write_tokens, &state.model_registry)
 }
 
 async fn chat_completions(
@@ -281,6 +284,8 @@ async fn chat_completions(
         &response.model,
         response.prompt_tokens,
         response.completion_tokens,
+        response.cache_read_tokens,
+        response.cache_write_tokens,
         &provider,
         &state,
     )
@@ -294,6 +299,8 @@ async fn chat_completions(
         prompt_tokens: response.prompt_tokens,
         completion_tokens: response.completion_tokens,
         total_tokens: response.total_tokens,
+        cache_read_tokens: response.cache_read_tokens,
+        cache_write_tokens: response.cache_write_tokens,
         cost,
         has_images,
         status: "success".to_string(),
@@ -340,6 +347,8 @@ async fn chat_completions(
         prompt_tokens: response.prompt_tokens,
         completion_tokens: response.completion_tokens,
         total_tokens: response.total_tokens,
+        cache_read_tokens: if response.cache_read_tokens > 0 { Some(response.cache_read_tokens) } else { None },
+        cache_write_tokens: if response.cache_write_tokens > 0 { Some(response.cache_write_tokens) } else { None },
     }),
 };
@@ -368,6 +377,8 @@ async fn chat_completions(
         prompt_tokens: 0,
         completion_tokens: 0,
         total_tokens: 0,
+        cache_read_tokens: 0,
+        cache_write_tokens: 0,
         cost: 0.0,
         has_images: false,
         status: "error".to_string(),
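`calculate_cost_with_registry` itself is not shown in this diff; the cache-aware math it is expected to apply looks roughly like the sketch below, under the assumption that cached prompt tokens are billed at the `cache_read` rate instead of the input rate, with all rates per 1M tokens:

```rust
fn cache_aware_cost(
    prompt_tokens: u32,
    completion_tokens: u32,
    cache_read_tokens: u32,
    cache_write_tokens: u32,
    input_rate: f64,       // $/1M uncached prompt tokens
    output_rate: f64,      // $/1M completion tokens
    cache_read_rate: f64,  // $/1M cached prompt tokens
    cache_write_rate: f64, // $/1M tokens written to cache
) -> f64 {
    // Cached tokens are a subset of the prompt, so bill them separately
    // and charge the input rate only on the uncached remainder.
    let uncached_prompt = prompt_tokens.saturating_sub(cache_read_tokens) as f64;
    (uncached_prompt * input_rate
        + cache_read_tokens as f64 * cache_read_rate
        + cache_write_tokens as f64 * cache_write_rate
        + completion_tokens as f64 * output_rate)
        / 1_000_000.0
}

fn main() {
    // 1,000 prompt tokens, 768 of them cached; rates are invented examples.
    let cost = cache_aware_cost(1_000, 200, 768, 0, 0.15, 0.60, 0.075, 0.0);
    assert!((cost - (232.0 * 0.15 + 768.0 * 0.075 + 200.0 * 0.60) / 1e6).abs() < 1e-12);
}
```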
"error".to_string(), diff --git a/src/utils/streaming.rs b/src/utils/streaming.rs index db1cb893..123eba2c 100644 --- a/src/utils/streaming.rs +++ b/src/utils/streaming.rs @@ -2,7 +2,7 @@ use crate::client::ClientManager; use crate::errors::AppError; use crate::logging::{RequestLog, RequestLogger}; use crate::models::ToolCall; -use crate::providers::{Provider, ProviderStreamChunk}; +use crate::providers::{Provider, ProviderStreamChunk, StreamUsage}; use crate::state::ModelConfigCache; use crate::utils::tokens::estimate_completion_tokens; use futures::stream::Stream; @@ -33,6 +33,8 @@ pub struct AggregatingStream { accumulated_content: String, accumulated_reasoning: String, accumulated_tool_calls: Vec, + /// Real usage data from the provider's final stream chunk (when available). + real_usage: Option, logger: Arc, client_manager: Arc, model_registry: Arc, @@ -56,6 +58,7 @@ where accumulated_content: String::new(), accumulated_reasoning: String::new(), accumulated_tool_calls: Vec::new(), + real_usage: None, logger: config.logger, client_manager: config.client_manager, model_registry: config.model_registry, @@ -78,33 +81,68 @@ where let logger = self.logger.clone(); let client_manager = self.client_manager.clone(); let provider = self.provider.clone(); - let prompt_tokens = self.prompt_tokens; + let estimated_prompt_tokens = self.prompt_tokens; let has_images = self.has_images; let registry = self.model_registry.clone(); let config_cache = self.model_config_cache.clone(); + let real_usage = self.real_usage.take(); // Estimate completion tokens (including reasoning if present) - let content_tokens = estimate_completion_tokens(&self.accumulated_content, &model); - let reasoning_tokens = if !self.accumulated_reasoning.is_empty() { + let estimated_content_tokens = estimate_completion_tokens(&self.accumulated_content, &model); + let estimated_reasoning_tokens = if !self.accumulated_reasoning.is_empty() { estimate_completion_tokens(&self.accumulated_reasoning, &model) } else { 0 }; - let completion_tokens = content_tokens + reasoning_tokens; - let total_tokens = prompt_tokens + completion_tokens; + let estimated_completion = estimated_content_tokens + estimated_reasoning_tokens; // Spawn a background task to log the completion tokio::spawn(async move { + // Use real usage from the provider when available, otherwise fall back to estimates + let (prompt_tokens, completion_tokens, total_tokens, cache_read_tokens, cache_write_tokens) = + if let Some(usage) = &real_usage { + ( + usage.prompt_tokens, + usage.completion_tokens, + usage.total_tokens, + usage.cache_read_tokens, + usage.cache_write_tokens, + ) + } else { + ( + estimated_prompt_tokens, + estimated_completion, + estimated_prompt_tokens + estimated_completion, + 0u32, + 0u32, + ) + }; + // Check in-memory cache for cost overrides (no SQLite hit) let cost = if let Some(cached) = config_cache.get(&model).await { if let (Some(p), Some(c)) = (cached.prompt_cost_per_m, cached.completion_cost_per_m) { + // Cost override doesn't have cache-aware pricing, use simple formula (prompt_tokens as f64 * p / 1_000_000.0) + (completion_tokens as f64 * c / 1_000_000.0) } else { - provider.calculate_cost(&model, prompt_tokens, completion_tokens, ®istry) + provider.calculate_cost( + &model, + prompt_tokens, + completion_tokens, + cache_read_tokens, + cache_write_tokens, + ®istry, + ) } } else { - provider.calculate_cost(&model, prompt_tokens, completion_tokens, ®istry) + provider.calculate_cost( + &model, + prompt_tokens, + completion_tokens, + cache_read_tokens, 
diff --git a/static/index.html b/static/index.html
index f73cb3e4..913d6b64 100644
--- a/static/index.html
+++ b/static/index.html
@@ -4,7 +4,7 @@
     <title>LLM Proxy Gateway - Admin Dashboard</title>
-    [tag markup stripped in extraction]
+    [tag markup stripped in extraction]
@@ -166,19 +166,19 @@
-    [14 removed lines; markup stripped in extraction]
+    [14 added lines; markup stripped in extraction]
\ No newline at end of file
diff --git a/static/js/api.js b/static/js/api.js
index 1ef094a2..140e9511 100644
--- a/static/js/api.js
+++ b/static/js/api.js
@@ -17,61 +17,25 @@ class ApiClient {
             headers['Authorization'] = `Bearer ${window.authManager.token}`;
         }
 
+        const response = await fetch(url, {
+            ...options,
+            headers
+        });
+
+        const text = await response.text();
+
+        let result;
         try {
-            console.log(`[API] Fetching ${url}...`);
-            const response = await fetch(url, {
-                ...options,
-                headers
-            });
-
-            console.log(`[API] ${url} → status=${response.status} ok=${response.ok} type=${response.headers.get('content-type')}`);
-
-            const text = await response.text();
-            console.log(`[API] ${url} → body length=${text.length}, first 200 chars:`, text.substring(0, 200));
-
-            let result;
-            try {
-                result = JSON.parse(text);
-            } catch (parseErr) {
-                const msg = `JSON parse failed for ${url}: ${parseErr.message}. Body: ${text.substring(0, 300)}`;
-                console.error(`[API] ${msg}`);
-                this._addDebugEntry(url, 'JSON_PARSE_ERROR', msg);
-                throw new Error(msg);
-            }
-
-            if (!response.ok || !result.success) {
-                const msg = `API error for ${url}: ok=${response.ok} success=${result.success} error=${result.error} status=${response.status}`;
-                console.error(`[API] ${msg}`);
-                this._addDebugEntry(url, 'API_ERROR', msg);
-                throw new Error(result.error || `HTTP error! status: ${response.status}`);
-            }
-
-            console.log(`[API] ${url} → SUCCESS, data keys:`, result.data ? Object.keys(result.data) : 'null');
-            return result.data;
-        } catch (error) {
-            console.error(`[API] Request failed (${path}):`, error);
-            this._addDebugEntry(url, 'EXCEPTION', error.message);
-            throw error;
+            result = JSON.parse(text);
+        } catch (parseErr) {
+            throw new Error(`JSON parse failed for ${url}: ${parseErr.message}`);
         }
-    }
 
-    // Visible on-page debug panel for diagnosing fetch failures
-    _addDebugEntry(url, status, detail) {
-        let panel = document.getElementById('api-debug-panel');
-        if (!panel) {
-            panel = document.createElement('div');
-            panel.id = 'api-debug-panel';
-            panel.style.cssText = 'position:fixed;bottom:0;left:0;right:0;max-height:200px;overflow-y:auto;background:#1d2021;color:#fbf1c7;font-family:monospace;font-size:11px;padding:8px;z-index:99999;border-top:2px solid #cc241d;';
-            const title = document.createElement('div');
-            title.style.cssText = 'font-weight:bold;margin-bottom:4px;color:#fb4934;';
-            title.textContent = 'API Debug Panel (remove after fixing)';
-            panel.appendChild(title);
-            document.body.appendChild(panel);
+        if (!response.ok || !result.success) {
+            throw new Error(result.error || `HTTP error! status: ${response.status}`);
         }
-        const entry = document.createElement('div');
-        entry.style.cssText = 'margin:2px 0;padding:2px 4px;background:#282828;border-left:3px solid ' + (status === 'EXCEPTION' ? '#fb4934' : '#fabd2f') + ';';
-        entry.textContent = `[${status}] ${url}: ${detail}`;
-        panel.appendChild(entry);
+
+        return result.data;
     }
 
     async get(path) {
diff --git a/static/js/dashboard.js b/static/js/dashboard.js
index fd88935c..41d19636 100644
--- a/static/js/dashboard.js
+++ b/static/js/dashboard.js
@@ -348,7 +348,7 @@ class Dashboard {
-                        <tr><th>Date</th><th>Client</th><th>Provider</th><th>Model</th><th>Requests</th><th>Tokens</th><th>Cost</th></tr>
+                        <tr><th>Date</th><th>Client</th><th>Provider</th><th>Model</th><th>Requests</th><th>Tokens</th><th>Cache Read</th><th>Cache Write</th><th>Cost</th></tr>
@@ -383,7 +383,7 @@ class Dashboard {
-                        <tr><th>Provider</th><th>Model</th><th>Input Cost</th><th>Output Cost</th><th>Last Updated</th></tr>
+                        <tr><th>Provider</th><th>Model</th><th>Input Cost</th><th>Output Cost</th><th>Cache Read</th><th>Cache Write</th></tr>
diff --git a/static/js/pages/analytics.js b/static/js/pages/analytics.js
index a6a6c6de..6f926462 100644
--- a/static/js/pages/analytics.js
+++ b/static/js/pages/analytics.js
@@ -184,21 +184,27 @@ class AnalyticsPage {
         if (!tableBody) return;
 
         if (data.length === 0) {
-            tableBody.innerHTML = '<tr><td colspan="7">No historical data found</td></tr>';
+            tableBody.innerHTML = '<tr><td colspan="9">No historical data found</td></tr>';
             return;
         }
 
-        tableBody.innerHTML = data.map(row => `
-            <tr>
-                <td>${row.date}</td>
-                <td>${row.client}</td>
-                <td>${row.provider}</td>
-                <td>${row.model}</td>
-                <td>${row.requests.toLocaleString()}</td>
-                <td>${window.api.formatNumber(row.tokens)}</td>
-                <td>${window.api.formatCurrency(row.cost)}</td>
-            </tr>
-        `).join('');
+        tableBody.innerHTML = data.map(row => {
+            const cacheRead = row.cache_read_tokens || 0;
+            const cacheWrite = row.cache_write_tokens || 0;
+            return `
+                <tr>
+                    <td>${row.date}</td>
+                    <td>${row.client}</td>
+                    <td>${row.provider}</td>
+                    <td>${row.model}</td>
+                    <td>${row.requests.toLocaleString()}</td>
+                    <td>${window.api.formatNumber(row.tokens)}</td>
+                    <td>${window.api.formatNumber(cacheRead)}</td>
+                    <td>${window.api.formatNumber(cacheWrite)}</td>
+                    <td>${window.api.formatCurrency(row.cost)}</td>
+                </tr>
+            `;
+        }).join('');
     }
 
     setupEventListeners() {
diff --git a/static/js/pages/costs.js b/static/js/pages/costs.js
index d84a1257..034ee167 100644
--- a/static/js/pages/costs.js
+++ b/static/js/pages/costs.js
@@ -31,7 +31,10 @@ class CostsPage {
             avgDailyCost: data.total_cost / 30, // Simplified
             costTrend: 5.2,
             budgetUsed: Math.min(Math.round((data.total_cost / 100) * 100), 100), // Assuming $100 budget
-            projectedMonthEnd: data.today_cost * 30
+            projectedMonthEnd: data.today_cost * 30,
+            cacheReadTokens: data.total_cache_read_tokens || 0,
+            cacheWriteTokens: data.total_cache_write_tokens || 0,
+            totalTokens: data.total_tokens || 0,
         };
 
         this.renderCostStats();
@@ -44,6 +47,10 @@ class CostsPage {
     renderCostStats() {
         const container = document.getElementById('cost-stats');
         if (!container) return;
 
+        const cacheHitRate = this.costData.totalTokens > 0
+            ? ((this.costData.cacheReadTokens / this.costData.totalTokens) * 100).toFixed(1)
+            : '0.0';
+
         container.innerHTML = `
@@ -74,6 +81,19 @@ class CostsPage {
+                [stat-card markup stripped in extraction; recoverable template content:]
+                ${cacheHitRate}%
+                Cache Hit Rate
+                ${window.api.formatNumber(this.costData.cacheReadTokens)} cached tokens
@@ -181,15 +201,24 @@ class CostsPage {
         const tableBody = document.querySelector('#pricing-table tbody');
         if (!tableBody) return;
 
-        tableBody.innerHTML = data.map(row => `
-            <tr>
-                <td>${row.provider.toUpperCase()}</td>
-                <td>${row.id}</td>
-                <td>${window.api.formatCurrency(row.prompt_cost)} / 1M</td>
-                <td>${window.api.formatCurrency(row.completion_cost)} / 1M</td>
-                <td>Now</td>
-            </tr>
-        `).join('');
+        tableBody.innerHTML = data.map(row => {
+            const cacheRead = row.cache_read_cost != null
+                ? `${window.api.formatCurrency(row.cache_read_cost)} / 1M`
+                : '--';
+            const cacheWrite = row.cache_write_cost != null
+                ? `${window.api.formatCurrency(row.cache_write_cost)} / 1M`
+                : '--';
+            return `
+                <tr>
+                    <td>${row.provider.toUpperCase()}</td>
+                    <td>${row.id}</td>
+                    <td>${window.api.formatCurrency(row.prompt_cost)} / 1M</td>
+                    <td>${window.api.formatCurrency(row.completion_cost)} / 1M</td>
+                    <td>${cacheRead}</td>
+                    <td>${cacheWrite}</td>
+                </tr>
+            `;
+        }).join('');
     }
 
     setupEventListeners() {
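A quick sanity check on the hit-rate math used in `renderCostStats` above, as a Rust mirror of the JS expression (sample numbers invented):

```rust
fn cache_hit_rate(cache_read: u64, total: u64) -> f64 {
    // Same guard as the JS ternary: report 0.0 when there is no traffic yet.
    if total == 0 { 0.0 } else { cache_read as f64 / total as f64 * 100.0 }
}

fn main() {
    assert_eq!(format!("{:.1}", cache_hit_rate(0, 0)), "0.0");
    assert_eq!(format!("{:.1}", cache_hit_rate(768, 1020)), "75.3");
}
```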