Compare commits

..

10 Commits

SHA1 Message Date
649371154f fix(openai): implement parsing for real-time Responses API streaming format
2026-03-17 19:00:40 +00:00
78fff61660 fix(openai): map system to developer role and enhance stream diagnostics for Responses API
2026-03-17 18:50:55 +00:00
b131094dfd fix(openai): improve Responses API streaming reliability and diagnostics
2026-03-17 18:45:09 +00:00
c3d81c1733 fix(openai): remove unsupported stream_options from Responses API
2026-03-17 18:40:06 +00:00
e123f542f1 fix(openai): use max_output_tokens for Responses API
2026-03-17 18:36:05 +00:00
0d28241e39 fix(openai): enhance Responses API integration with full parameters and improved parsing
2026-03-17 18:31:23 +00:00
754ee9cb84 fix(openai): implement role-based content mapping for Responses API
2026-03-17 18:23:50 +00:00
5a9086b883 fix(openai): map content types for Responses API (v1/responses)
2026-03-17 18:18:23 +00:00
cc5eba1957 feat: implement reasoning_tokens tracking and enhanced usage logging
2026-03-11 17:14:49 +00:00
3ab00fb188 fixed tracking
2026-03-11 16:21:32 +00:00
13 changed files with 306 additions and 163 deletions

Binary file not shown.

View File

@@ -255,7 +255,10 @@ pub(super) async fn handle_system_logs(
model,
prompt_tokens,
completion_tokens,
reasoning_tokens,
total_tokens,
cache_read_tokens,
cache_write_tokens,
cost,
status,
error_message,
@@ -279,6 +282,11 @@ pub(super) async fn handle_system_logs(
"client_id": row.get::<String, _>("client_id"),
"provider": row.get::<String, _>("provider"),
"model": row.get::<String, _>("model"),
"prompt_tokens": row.get::<i64, _>("prompt_tokens"),
"completion_tokens": row.get::<i64, _>("completion_tokens"),
"reasoning_tokens": row.get::<i64, _>("reasoning_tokens"),
"cache_read_tokens": row.get::<i64, _>("cache_read_tokens"),
"cache_write_tokens": row.get::<i64, _>("cache_write_tokens"),
"tokens": row.get::<i64, _>("total_tokens"),
"cost": row.get::<f64, _>("cost"),
"status": row.get::<String, _>("status"),

View File

@@ -64,6 +64,7 @@ pub async fn run_migrations(pool: &DbPool) -> Result<()> {
model TEXT,
prompt_tokens INTEGER,
completion_tokens INTEGER,
reasoning_tokens INTEGER DEFAULT 0,
total_tokens INTEGER,
cost REAL,
has_images BOOLEAN DEFAULT FALSE,
@@ -172,6 +173,9 @@ pub async fn run_migrations(pool: &DbPool) -> Result<()> {
let _ = sqlx::query("ALTER TABLE llm_requests ADD COLUMN cache_write_tokens INTEGER DEFAULT 0")
.execute(pool)
.await;
let _ = sqlx::query("ALTER TABLE llm_requests ADD COLUMN reasoning_tokens INTEGER DEFAULT 0")
.execute(pool)
.await;
// Add billing_mode column if it doesn't exist (migration for existing DBs)
let _ = sqlx::query("ALTER TABLE provider_configs ADD COLUMN billing_mode TEXT")

View File

@@ -15,6 +15,7 @@ pub struct RequestLog {
pub model: String,
pub prompt_tokens: u32,
pub completion_tokens: u32,
pub reasoning_tokens: u32,
pub total_tokens: u32,
pub cache_read_tokens: u32,
pub cache_write_tokens: u32,
@@ -77,8 +78,8 @@ impl RequestLogger {
sqlx::query(
r#"
INSERT INTO llm_requests
(timestamp, client_id, provider, model, prompt_tokens, completion_tokens, total_tokens, cache_read_tokens, cache_write_tokens, cost, has_images, status, error_message, duration_ms, request_body, response_body)
VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
(timestamp, client_id, provider, model, prompt_tokens, completion_tokens, reasoning_tokens, total_tokens, cache_read_tokens, cache_write_tokens, cost, has_images, status, error_message, duration_ms, request_body, response_body)
VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
"#,
)
.bind(log.timestamp)
@@ -87,6 +88,7 @@ impl RequestLogger {
.bind(&log.model)
.bind(log.prompt_tokens as i64)
.bind(log.completion_tokens as i64)
.bind(log.reasoning_tokens as i64)
.bind(log.total_tokens as i64)
.bind(log.cache_read_tokens as i64)
.bind(log.cache_write_tokens as i64)

View File

@@ -165,6 +165,8 @@ pub struct Usage {
pub completion_tokens: u32,
pub total_tokens: u32,
#[serde(skip_serializing_if = "Option::is_none")]
pub reasoning_tokens: Option<u32>,
#[serde(skip_serializing_if = "Option::is_none")]
pub cache_read_tokens: Option<u32>,
#[serde(skip_serializing_if = "Option::is_none")]
pub cache_write_tokens: Option<u32>,
@@ -179,6 +181,8 @@ pub struct ChatCompletionStreamResponse {
pub created: u64,
pub model: String,
pub choices: Vec<ChatStreamChoice>,
#[serde(skip_serializing_if = "Option::is_none")]
pub usage: Option<Usage>,
}
#[derive(Debug, Clone, Serialize, Deserialize)]

View File

@@ -128,17 +128,36 @@ impl super::Provider for DeepSeekProvider {
cache_write_tokens: u32,
registry: &crate::models::registry::ModelRegistry,
) -> f64 {
helpers::calculate_cost_with_registry(
model,
prompt_tokens,
completion_tokens,
cache_read_tokens,
cache_write_tokens,
registry,
&self.pricing,
0.14,
0.28,
)
if let Some(metadata) = registry.find_model(model) {
if metadata.cost.is_some() {
return helpers::calculate_cost_with_registry(
model,
prompt_tokens,
completion_tokens,
cache_read_tokens,
cache_write_tokens,
registry,
&self.pricing,
0.28,
0.42,
);
}
}
// Custom DeepSeek fallback that correctly handles cache hits
let (prompt_rate, completion_rate) = self
.pricing
.iter()
.find(|p| model.contains(&p.model))
.map(|p| (p.prompt_tokens_per_million, p.completion_tokens_per_million))
.unwrap_or((0.28, 0.42)); // Default to DeepSeek's current API pricing
let cache_hit_rate = prompt_rate / 10.0;
let non_cached_prompt = prompt_tokens.saturating_sub(cache_read_tokens);
(non_cached_prompt as f64 * prompt_rate / 1_000_000.0)
+ (cache_read_tokens as f64 * cache_hit_rate / 1_000_000.0)
+ (completion_tokens as f64 * completion_rate / 1_000_000.0)
}
async fn chat_completion_stream(
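As a sanity check on the fallback arithmetic above, a standalone sketch using the default rates from the code; the token counts are invented for illustration. The Gemini provider below follows the same pattern with a 0.25 cache-hit multiplier and 0.075/0.30 defaults.

// Sketch of the DeepSeek fallback pricing: non-cached prompt tokens at the full
// input rate, cache hits at one tenth of it, completion tokens at the output rate.
fn deepseek_fallback_cost(prompt_tokens: u32, completion_tokens: u32, cache_read_tokens: u32) -> f64 {
    let prompt_rate = 0.28;                  // USD per 1M prompt tokens (default above)
    let completion_rate = 0.42;              // USD per 1M completion tokens (default above)
    let cache_hit_rate = prompt_rate / 10.0; // cache hits billed at 10% of the input rate
    let non_cached = prompt_tokens.saturating_sub(cache_read_tokens);
    (non_cached as f64 * prompt_rate
        + cache_read_tokens as f64 * cache_hit_rate
        + completion_tokens as f64 * completion_rate)
        / 1_000_000.0
}

fn main() {
    // 100k prompt tokens with 60k cache hits and 20k completion tokens:
    // 40_000 * 0.28 + 60_000 * 0.028 + 20_000 * 0.42 = 21_280 per million, about $0.0213.
    println!("{:.5}", deepseek_fallback_cost(100_000, 20_000, 60_000));
}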

View File

@@ -722,6 +722,10 @@ impl super::Provider for GeminiProvider {
let reasoning_content = candidate
.and_then(|c| c.content.parts.iter().find_map(|p| p.thought.clone()));
let reasoning_tokens = reasoning_content.as_ref()
.map(|r| crate::utils::tokens::estimate_completion_tokens(r, &model))
.unwrap_or(0);
// Extract function calls → OpenAI tool_calls
let tool_calls = candidate.and_then(|c| Self::extract_tool_calls(&c.content.parts));
@@ -752,6 +756,7 @@ impl super::Provider for GeminiProvider {
tool_calls,
prompt_tokens,
completion_tokens,
reasoning_tokens,
total_tokens,
cache_read_tokens,
cache_write_tokens: 0, // Gemini doesn't report cache writes separately
@@ -772,17 +777,36 @@ impl super::Provider for GeminiProvider {
cache_write_tokens: u32,
registry: &crate::models::registry::ModelRegistry,
) -> f64 {
super::helpers::calculate_cost_with_registry(
model,
prompt_tokens,
completion_tokens,
cache_read_tokens,
cache_write_tokens,
registry,
&self.pricing,
0.075,
0.30,
)
if let Some(metadata) = registry.find_model(model) {
if metadata.cost.is_some() {
return super::helpers::calculate_cost_with_registry(
model,
prompt_tokens,
completion_tokens,
cache_read_tokens,
cache_write_tokens,
registry,
&self.pricing,
0.075,
0.30,
);
}
}
// Custom Gemini fallback that correctly handles cache hits (25% of input cost)
let (prompt_rate, completion_rate) = self
.pricing
.iter()
.find(|p| model.contains(&p.model))
.map(|p| (p.prompt_tokens_per_million, p.completion_tokens_per_million))
.unwrap_or((0.075, 0.30)); // Default to Gemini 1.5 Flash current API pricing
let cache_hit_rate = prompt_rate * 0.25;
let non_cached_prompt = prompt_tokens.saturating_sub(cache_read_tokens);
(non_cached_prompt as f64 * prompt_rate / 1_000_000.0)
+ (cache_read_tokens as f64 * cache_hit_rate / 1_000_000.0)
+ (completion_tokens as f64 * completion_rate / 1_000_000.0)
}
async fn chat_completion_stream(
@@ -883,6 +907,7 @@ impl super::Provider for GeminiProvider {
super::StreamUsage {
prompt_tokens: u.prompt_token_count,
completion_tokens: u.candidates_token_count,
reasoning_tokens: 0,
total_tokens: u.total_token_count,
cache_read_tokens: u.cached_content_token_count,
cache_write_tokens: 0,

View File

@@ -254,6 +254,11 @@ pub fn parse_openai_response(resp_json: &Value, model: String) -> Result<Provide
let completion_tokens = usage["completion_tokens"].as_u64().unwrap_or(0) as u32;
let total_tokens = usage["total_tokens"].as_u64().unwrap_or(0) as u32;
// Extract reasoning tokens
let reasoning_tokens = usage["completion_tokens_details"]["reasoning_tokens"]
.as_u64()
.unwrap_or(0) as u32;
// Extract cache tokens — try OpenAI/Grok format first, then DeepSeek format
let cache_read_tokens = usage["prompt_tokens_details"]["cached_tokens"]
.as_u64()
@@ -261,9 +266,9 @@ pub fn parse_openai_response(resp_json: &Value, model: String) -> Result<Provide
.or_else(|| usage["prompt_cache_hit_tokens"].as_u64())
.unwrap_or(0) as u32;
// DeepSeek reports cache_write as prompt_cache_miss_tokens (tokens written to cache for future use).
// OpenAI doesn't report cache_write in this location, but may in the future.
let cache_write_tokens = usage["prompt_cache_miss_tokens"].as_u64().unwrap_or(0) as u32;
// DeepSeek reports prompt_cache_miss_tokens which are just regular non-cached tokens.
// They do not incur a separate cache_write fee, so we don't map them here to avoid double-charging.
let cache_write_tokens = 0;
Ok(ProviderResponse {
content,
@@ -271,6 +276,7 @@ pub fn parse_openai_response(resp_json: &Value, model: String) -> Result<Provide
tool_calls,
prompt_tokens,
completion_tokens,
reasoning_tokens,
total_tokens,
cache_read_tokens,
cache_write_tokens,
@@ -295,18 +301,21 @@ pub fn parse_openai_stream_chunk(
let completion_tokens = u["completion_tokens"].as_u64().unwrap_or(0) as u32;
let total_tokens = u["total_tokens"].as_u64().unwrap_or(0) as u32;
let reasoning_tokens = u["completion_tokens_details"]["reasoning_tokens"]
.as_u64()
.unwrap_or(0) as u32;
let cache_read_tokens = u["prompt_tokens_details"]["cached_tokens"]
.as_u64()
.or_else(|| u["prompt_cache_hit_tokens"].as_u64())
.unwrap_or(0) as u32;
let cache_write_tokens = u["prompt_cache_miss_tokens"]
.as_u64()
.unwrap_or(0) as u32;
let cache_write_tokens = 0;
Some(StreamUsage {
prompt_tokens,
completion_tokens,
reasoning_tokens,
total_tokens,
cache_read_tokens,
cache_write_tokens,
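For reference, a sketch of the two usage shapes this parser distinguishes; only the field names come from the code above, the numbers are invented:

use serde_json::json;

fn main() {
    // OpenAI/Grok style: reasoning and cached tokens live in the *_details objects.
    let openai_usage = json!({
        "prompt_tokens": 1200,
        "completion_tokens": 450,
        "total_tokens": 1650,
        "completion_tokens_details": { "reasoning_tokens": 128 },
        "prompt_tokens_details": { "cached_tokens": 800 }
    });
    // DeepSeek style: cache hits arrive as prompt_cache_hit_tokens, while
    // prompt_cache_miss_tokens are ordinary non-cached tokens and are no longer
    // mapped to cache_write_tokens (that stays 0 to avoid double-charging).
    let deepseek_usage = json!({
        "prompt_tokens": 1200,
        "completion_tokens": 450,
        "total_tokens": 1650,
        "prompt_cache_hit_tokens": 800,
        "prompt_cache_miss_tokens": 400
    });
    for usage in [&openai_usage, &deepseek_usage] {
        let cache_read = usage["prompt_tokens_details"]["cached_tokens"]
            .as_u64()
            .or_else(|| usage["prompt_cache_hit_tokens"].as_u64())
            .unwrap_or(0);
        let reasoning = usage["completion_tokens_details"]["reasoning_tokens"]
            .as_u64()
            .unwrap_or(0);
        println!("cache_read_tokens={cache_read} reasoning_tokens={reasoning}");
    }
}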

View File

@@ -75,6 +75,7 @@ pub struct ProviderResponse {
pub tool_calls: Option<Vec<crate::models::ToolCall>>,
pub prompt_tokens: u32,
pub completion_tokens: u32,
pub reasoning_tokens: u32,
pub total_tokens: u32,
pub cache_read_tokens: u32,
pub cache_write_tokens: u32,
@@ -86,6 +87,7 @@ pub struct ProviderResponse {
pub struct StreamUsage {
pub prompt_tokens: u32,
pub completion_tokens: u32,
pub reasoning_tokens: u32,
pub total_tokens: u32,
pub cache_read_tokens: u32,
pub cache_write_tokens: u32,

View File

@@ -26,7 +26,7 @@ impl OpenAIProvider {
.timeout(std::time::Duration::from_secs(300))
.pool_idle_timeout(std::time::Duration::from_secs(90))
.pool_max_idle_per_host(4)
.tcp_keepalive(std::time::Duration::from_secs(30))
.tcp_keepalive(std::time::Duration::from_secs(15))
.build()?;
Ok(Self {
@@ -92,96 +92,7 @@ impl super::Provider for OpenAIProvider {
// Read error body to diagnose. If the model requires the Responses
// API (v1/responses), retry against that endpoint.
if error_text.to_lowercase().contains("v1/responses") || error_text.to_lowercase().contains("only supported in v1/responses") {
// Build a simple `input` string by concatenating message parts.
let messages_json = helpers::messages_to_openai_json(&request.messages).await?;
let mut inputs: Vec<String> = Vec::new();
for m in &messages_json {
let role = m["role"].as_str().unwrap_or("");
let parts = m.get("content").and_then(|c| c.as_array()).cloned().unwrap_or_default();
let mut text_parts = Vec::new();
for p in parts {
if let Some(t) = p.get("text").and_then(|v| v.as_str()) {
text_parts.push(t.to_string());
}
}
inputs.push(format!("{}: {}", role, text_parts.join("")));
}
let input_text = inputs.join("\n");
let resp = self
.client
.post(format!("{}/responses", self.config.base_url))
.header("Authorization", format!("Bearer {}", self.api_key))
.json(&serde_json::json!({ "model": request.model, "input": input_text }))
.send()
.await
.map_err(|e| AppError::ProviderError(e.to_string()))?;
if !resp.status().is_success() {
let err = resp.text().await.unwrap_or_default();
return Err(AppError::ProviderError(format!("OpenAI Responses API error: {}", err)));
}
let resp_json: serde_json::Value = resp.json().await.map_err(|e| AppError::ProviderError(e.to_string()))?;
// Try to normalize: if it's chat-style, use existing parser
if resp_json.get("choices").is_some() {
return helpers::parse_openai_response(&resp_json, request.model);
}
// Responses API: try to extract text from `output` or `candidates`
let mut content_text = String::new();
if let Some(output) = resp_json.get("output").and_then(|o| o.as_array()) {
if let Some(first) = output.get(0) {
if let Some(contents) = first.get("content").and_then(|c| c.as_array()) {
for item in contents {
if let Some(text) = item.get("text").and_then(|t| t.as_str()) {
if !content_text.is_empty() { content_text.push_str("\n"); }
content_text.push_str(text);
} else if let Some(parts) = item.get("parts").and_then(|p| p.as_array()) {
for p in parts {
if let Some(t) = p.as_str() {
if !content_text.is_empty() { content_text.push_str("\n"); }
content_text.push_str(t);
}
}
}
}
}
}
}
if content_text.is_empty() {
if let Some(cands) = resp_json.get("candidates").and_then(|c| c.as_array()) {
if let Some(c0) = cands.get(0) {
if let Some(content) = c0.get("content") {
if let Some(parts) = content.get("parts").and_then(|p| p.as_array()) {
for p in parts {
if let Some(t) = p.get("text").and_then(|v| v.as_str()) {
if !content_text.is_empty() { content_text.push_str("\n"); }
content_text.push_str(t);
}
}
}
}
}
}
}
let prompt_tokens = resp_json.get("usage").and_then(|u| u.get("prompt_tokens")).and_then(|v| v.as_u64()).unwrap_or(0) as u32;
let completion_tokens = resp_json.get("usage").and_then(|u| u.get("completion_tokens")).and_then(|v| v.as_u64()).unwrap_or(0) as u32;
let total_tokens = resp_json.get("usage").and_then(|u| u.get("total_tokens")).and_then(|v| v.as_u64()).unwrap_or(0) as u32;
return Ok(ProviderResponse {
content: content_text,
reasoning_content: None,
tool_calls: None,
prompt_tokens,
completion_tokens,
total_tokens,
cache_read_tokens: 0,
cache_write_tokens: 0,
model: request.model,
});
return self.chat_responses(request).await;
}
tracing::error!("OpenAI API error ({}): {}", status, error_text);
@@ -201,20 +112,76 @@ impl super::Provider for OpenAIProvider {
let messages_json = helpers::messages_to_openai_json(&request.messages).await?;
let mut input_parts = Vec::new();
for m in &messages_json {
let role = m["role"].as_str().unwrap_or("user");
let content = m.get("content").cloned().unwrap_or(serde_json::json!(""));
let mut role = m["role"].as_str().unwrap_or("user").to_string();
// Newer models (gpt-5, o1) prefer "developer" over "system"
if role == "system" {
role = "developer".to_string();
}
let mut content = m.get("content").cloned().unwrap_or(serde_json::json!([]));
// Map content types based on role for Responses API
if let Some(content_array) = content.as_array_mut() {
for part in content_array {
if let Some(part_obj) = part.as_object_mut() {
if let Some(t) = part_obj.get("type").and_then(|v| v.as_str()) {
match t {
"text" => {
let new_type = if role == "assistant" { "output_text" } else { "input_text" };
part_obj.insert("type".to_string(), serde_json::json!(new_type));
}
"image_url" => {
// Assistant typically doesn't have image_url in history this way, but for safety:
let new_type = if role == "assistant" { "output_image" } else { "input_image" };
part_obj.insert("type".to_string(), serde_json::json!(new_type));
if let Some(img_url) = part_obj.remove("image_url") {
part_obj.insert("image".to_string(), img_url);
}
}
_ => {}
}
}
}
}
} else if let Some(text) = content.as_str() {
let new_type = if role == "assistant" { "output_text" } else { "input_text" };
content = serde_json::json!([{ "type": new_type, "text": text }]);
}
input_parts.push(serde_json::json!({
"role": role,
"content": content
}));
}
let mut body = serde_json::json!({
"model": request.model,
"input": input_parts,
});
// Add standard parameters
if let Some(temp) = request.temperature {
body["temperature"] = serde_json::json!(temp);
}
// Newer models (gpt-5, o1) in Responses API use max_output_tokens
if let Some(max_tokens) = request.max_tokens {
if request.model.contains("gpt-5") || request.model.starts_with("o1-") || request.model.starts_with("o3-") {
body["max_output_tokens"] = serde_json::json!(max_tokens);
} else {
body["max_tokens"] = serde_json::json!(max_tokens);
}
}
if let Some(tools) = &request.tools {
body["tools"] = serde_json::json!(tools);
}
let resp = self
.client
.post(format!("{}/responses", self.config.base_url))
.header("Authorization", format!("Bearer {}", self.api_key))
.json(&serde_json::json!({ "model": request.model, "input": input_parts }))
.json(&body)
.send()
.await
.map_err(|e| AppError::ProviderError(e.to_string()))?;
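Put together, a chat-style conversation ends up in the v1/responses request body roughly like this (a hand-written sketch of the mapping above, not captured output; the model name is illustrative):

use serde_json::json;

fn main() {
    let body = json!({
        "model": "gpt-5-codex",  // illustrative model name
        "input": [
            // "system" is renamed to "developer", text parts become input_text
            { "role": "developer", "content": [
                { "type": "input_text", "text": "You are terse." }
            ]},
            // user text and images become input_text / input_image, with the
            // original image_url value moved under "image"
            { "role": "user", "content": [
                { "type": "input_text", "text": "Describe this picture." },
                { "type": "input_image", "image": { "url": "https://example.com/cat.png" } }
            ]},
            // earlier assistant turns become output_text
            { "role": "assistant", "content": [
                { "type": "output_text", "text": "A cat on a sofa." }
            ]}
        ],
        "temperature": 0.2,
        // gpt-5 / o1 / o3 models take max_output_tokens instead of max_tokens
        "max_output_tokens": 1024
    });
    println!("{}", serde_json::to_string_pretty(&body).unwrap());
}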
@@ -226,11 +193,16 @@ impl super::Provider for OpenAIProvider {
let resp_json: serde_json::Value = resp.json().await.map_err(|e| AppError::ProviderError(e.to_string()))?;
// Try to normalize: if it's chat-style, use existing parser
if resp_json.get("choices").is_some() {
return helpers::parse_openai_response(&resp_json, request.model);
}
// Normalize Responses API output into ProviderResponse
let mut content_text = String::new();
if let Some(output) = resp_json.get("output").and_then(|o| o.as_array()) {
if let Some(first) = output.get(0) {
if let Some(contents) = first.get("content").and_then(|c| c.as_array()) {
for out in output {
if let Some(contents) = out.get("content").and_then(|c| c.as_array()) {
for item in contents {
if let Some(text) = item.get("text").and_then(|t| t.as_str()) {
if !content_text.is_empty() { content_text.push_str("\n"); }
@@ -275,6 +247,7 @@ impl super::Provider for OpenAIProvider {
tool_calls: None,
prompt_tokens,
completion_tokens,
reasoning_tokens: 0,
total_tokens,
cache_read_tokens: 0,
cache_write_tokens: 0,
@@ -312,6 +285,12 @@ impl super::Provider for OpenAIProvider {
&self,
request: UnifiedRequest,
) -> Result<BoxStream<'static, Result<ProviderStreamChunk, AppError>>, AppError> {
// Allow proactive routing to Responses API based on heuristic
let model_lc = request.model.to_lowercase();
if model_lc.contains("gpt-5") || model_lc.contains("codex") {
return self.chat_responses_stream(request).await;
}
let messages_json = helpers::messages_to_openai_json(&request.messages).await?;
let mut body = helpers::build_openai_body(&request, messages_json, true);
@@ -400,21 +379,68 @@ impl super::Provider for OpenAIProvider {
let messages_json = helpers::messages_to_openai_json(&request.messages).await?;
let mut input_parts = Vec::new();
for m in &messages_json {
let role = m["role"].as_str().unwrap_or("user");
let content = m.get("content").cloned().unwrap_or(serde_json::json!(""));
let mut role = m["role"].as_str().unwrap_or("user").to_string();
// Newer models (gpt-5, o1) prefer "developer" over "system"
if role == "system" {
role = "developer".to_string();
}
let mut content = m.get("content").cloned().unwrap_or(serde_json::json!([]));
// Map content types based on role for Responses API
if let Some(content_array) = content.as_array_mut() {
for part in content_array {
if let Some(part_obj) = part.as_object_mut() {
if let Some(t) = part_obj.get("type").and_then(|v| v.as_str()) {
match t {
"text" => {
let new_type = if role == "assistant" { "output_text" } else { "input_text" };
part_obj.insert("type".to_string(), serde_json::json!(new_type));
}
"image_url" => {
// Assistant typically doesn't have image_url in history this way, but for safety:
let new_type = if role == "assistant" { "output_image" } else { "input_image" };
part_obj.insert("type".to_string(), serde_json::json!(new_type));
if let Some(img_url) = part_obj.remove("image_url") {
part_obj.insert("image".to_string(), img_url);
}
}
_ => {}
}
}
}
}
} else if let Some(text) = content.as_str() {
let new_type = if role == "assistant" { "output_text" } else { "input_text" };
content = serde_json::json!([{ "type": new_type, "text": text }]);
}
input_parts.push(serde_json::json!({
"role": role,
"content": content
}));
}
let body = serde_json::json!({
let mut body = serde_json::json!({
"model": request.model,
"input": input_parts,
"stream": true
"stream": true,
});
// Add standard parameters
if let Some(temp) = request.temperature {
body["temperature"] = serde_json::json!(temp);
}
// Newer models (gpt-5, o1) in Responses API use max_output_tokens
if let Some(max_tokens) = request.max_tokens {
if request.model.contains("gpt-5") || request.model.starts_with("o1-") || request.model.starts_with("o3-") {
body["max_output_tokens"] = serde_json::json!(max_tokens);
} else {
body["max_tokens"] = serde_json::json!(max_tokens);
}
}
let url = format!("{}/responses", self.config.base_url);
let api_key = self.api_key.clone();
let model = request.model.clone();
@@ -425,6 +451,7 @@ impl super::Provider for OpenAIProvider {
self.client
.post(&url)
.header("Authorization", format!("Bearer {}", api_key))
.header("Accept", "text/event-stream")
.json(&body),
)
.map_err(|e| AppError::ProviderError(format!("Failed to create EventSource for Responses API: {}", e)))?;
@@ -441,36 +468,44 @@ impl super::Provider for OpenAIProvider {
let chunk: serde_json::Value = serde_json::from_str(&msg.data)
.map_err(|e| AppError::ProviderError(format!("Failed to parse Responses stream chunk: {}", e)))?;
// Try standard OpenAI parsing first
// Try standard OpenAI parsing first (choices/usage)
if let Some(p_chunk) = helpers::parse_openai_stream_chunk(&chunk, &model, None) {
yield p_chunk?;
} else {
// Responses API specific parsing for streaming
// Often it follows a similar structure to the non-streaming response but in chunks
let mut content = String::new();
let mut finish_reason = None;
// Check for output[0].content[0].text (similar to non-stream)
if let Some(output) = chunk.get("output").and_then(|o| o.as_array()) {
if let Some(first) = output.get(0) {
if let Some(contents) = first.get("content").and_then(|c| c.as_array()) {
for item in contents {
if let Some(text) = item.get("text").and_then(|t| t.as_str()) {
content.push_str(text);
}
}
let event_type = chunk.get("type").and_then(|v| v.as_str()).unwrap_or("");
match event_type {
"response.output_text.delta" => {
if let Some(delta) = chunk.get("delta").and_then(|v| v.as_str()) {
content.push_str(delta);
}
}
}
// Check for candidates[0].content.parts[0].text (Gemini-like, which OpenAI sometimes uses for v1/responses)
if content.is_empty() {
if let Some(cands) = chunk.get("candidates").and_then(|c| c.as_array()) {
if let Some(c0) = cands.get(0) {
if let Some(content_obj) = c0.get("content") {
if let Some(parts) = content_obj.get("parts").and_then(|p| p.as_array()) {
for p in parts {
if let Some(t) = p.get("text").and_then(|v| v.as_str()) {
content.push_str(t);
"response.output_text.done" => {
if let Some(text) = chunk.get("text").and_then(|v| v.as_str()) {
// Some implementations send the full text at the end
// We usually prefer deltas, but if we haven't seen them, this is the fallback.
// However, if we're already yielding deltas, we might not want this.
// For now, let's just use it as a signal that we're done.
finish_reason = Some("stop".to_string());
}
}
"response.done" => {
finish_reason = Some("stop".to_string());
}
_ => {
// Fallback to older nested structure if present
if let Some(output) = chunk.get("output").and_then(|o| o.as_array()) {
for out in output {
if let Some(contents) = out.get("content").and_then(|c| c.as_array()) {
for item in contents {
if let Some(text) = item.get("text").and_then(|t| t.as_str()) {
content.push_str(text);
} else if let Some(delta) = item.get("delta").and_then(|d| d.get("text")).and_then(|t| t.as_str()) {
content.push_str(delta);
}
}
}
@@ -479,11 +514,11 @@ impl super::Provider for OpenAIProvider {
}
}
if !content.is_empty() {
if !content.is_empty() || finish_reason.is_some() {
yield ProviderStreamChunk {
content,
reasoning_content: None,
finish_reason: None,
finish_reason,
tool_calls: None,
model: model.clone(),
usage: None,
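To make the event handling above concrete, a sketch of the kind of event sequence the Responses stream sends and what the loop does with each one (payloads abbreviated, values illustrative):

use serde_json::json;

fn main() {
    let events = [
        json!({ "type": "response.output_text.delta", "delta": "Hel" }),  // appended to content
        json!({ "type": "response.output_text.delta", "delta": "lo" }),   // appended to content
        json!({ "type": "response.output_text.done", "text": "Hello" }),  // finish_reason = "stop"
        json!({ "type": "response.done" }),                               // finish_reason = "stop"
    ];
    for ev in &events {
        let kind = ev["type"].as_str().unwrap_or("");
        let delta = ev["delta"].as_str().unwrap_or("");
        println!("{kind} -> {delta:?}");
    }
}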
@@ -497,19 +532,22 @@ impl super::Provider for OpenAIProvider {
let probe_resp = probe_client
.post(&url)
.header("Authorization", format!("Bearer {}", api_key))
.header("Accept", "application/json") // Ask for JSON during probe
.json(&probe_body)
.send()
.await;
match probe_resp {
Ok(r) if !r.status().is_success() => {
Ok(r) => {
let status = r.status();
let error_body = r.text().await.unwrap_or_default();
tracing::error!("OpenAI Responses Stream Error Probe ({}): {}", status, error_body);
Err(AppError::ProviderError(format!("OpenAI Responses API error ({}): {}", status, error_body)))?;
}
Ok(_) => {
Err(AppError::ProviderError(format!("Responses stream error (probe returned 200): {}", e)))?;
let body = r.text().await.unwrap_or_default();
if status.is_success() {
tracing::warn!("Responses stream ended prematurely but probe returned 200 OK. Body: {}", body);
Err(AppError::ProviderError(format!("Responses stream ended (server sent 200 OK with body: {})", body)))?;
} else {
tracing::error!("OpenAI Responses Stream Error Probe ({}): {}", status, body);
Err(AppError::ProviderError(format!("OpenAI Responses API error ({}): {}", status, body)))?;
}
}
Err(probe_err) => {
tracing::error!("OpenAI Responses Stream Error Probe failed: {}", probe_err);

View File

@@ -312,6 +312,14 @@ async fn chat_completions(
},
finish_reason: chunk.finish_reason,
}],
usage: chunk.usage.as_ref().map(|u| crate::models::Usage {
prompt_tokens: u.prompt_tokens,
completion_tokens: u.completion_tokens,
total_tokens: u.total_tokens,
reasoning_tokens: if u.reasoning_tokens > 0 { Some(u.reasoning_tokens) } else { None },
cache_read_tokens: if u.cache_read_tokens > 0 { Some(u.cache_read_tokens) } else { None },
cache_write_tokens: if u.cache_write_tokens > 0 { Some(u.cache_write_tokens) } else { None },
}),
};
// Use axum's Event directly, wrap in Ok
@@ -383,6 +391,7 @@ async fn chat_completions(
model: response.model.clone(),
prompt_tokens: response.prompt_tokens,
completion_tokens: response.completion_tokens,
reasoning_tokens: response.reasoning_tokens,
total_tokens: response.total_tokens,
cache_read_tokens: response.cache_read_tokens,
cache_write_tokens: response.cache_write_tokens,
@@ -423,6 +432,7 @@ async fn chat_completions(
prompt_tokens: response.prompt_tokens,
completion_tokens: response.completion_tokens,
total_tokens: response.total_tokens,
reasoning_tokens: if response.reasoning_tokens > 0 { Some(response.reasoning_tokens) } else { None },
cache_read_tokens: if response.cache_read_tokens > 0 { Some(response.cache_read_tokens) } else { None },
cache_write_tokens: if response.cache_write_tokens > 0 { Some(response.cache_write_tokens) } else { None },
}),
@@ -452,6 +462,7 @@ async fn chat_completions(
model: model.clone(),
prompt_tokens: 0,
completion_tokens: 0,
reasoning_tokens: 0,
total_tokens: 0,
cache_read_tokens: 0,
cache_write_tokens: 0,

View File

@@ -96,11 +96,12 @@ where
// Spawn a background task to log the completion
tokio::spawn(async move {
// Use real usage from the provider when available, otherwise fall back to estimates
let (prompt_tokens, completion_tokens, total_tokens, cache_read_tokens, cache_write_tokens) =
let (prompt_tokens, completion_tokens, reasoning_tokens, total_tokens, cache_read_tokens, cache_write_tokens) =
if let Some(usage) = &real_usage {
(
usage.prompt_tokens,
usage.completion_tokens,
usage.reasoning_tokens,
usage.total_tokens,
usage.cache_read_tokens,
usage.cache_write_tokens,
@@ -109,6 +110,7 @@ where
(
estimated_prompt_tokens,
estimated_completion,
estimated_reasoning_tokens,
estimated_prompt_tokens + estimated_completion,
0u32,
0u32,
@@ -163,6 +165,7 @@ where
model,
prompt_tokens,
completion_tokens,
reasoning_tokens,
total_tokens,
cache_read_tokens,
cache_write_tokens,

View File

@@ -38,6 +38,24 @@ class LogsPage {
const statusClass = log.status === 'success' ? 'success' : 'danger';
const timestamp = luxon.DateTime.fromISO(log.timestamp).toFormat('yyyy-MM-dd HH:mm:ss');
let tokenDetails = `${log.tokens} total tokens`;
if (log.status === 'success') {
const parts = [];
parts.push(`${log.prompt_tokens} in`);
let completionStr = `${log.completion_tokens} out`;
if (log.reasoning_tokens > 0) {
completionStr += ` (${log.reasoning_tokens} reasoning)`;
}
parts.push(completionStr);
if (log.cache_read_tokens > 0) {
parts.push(`${log.cache_read_tokens} cache-hit`);
}
tokenDetails = parts.join(', ');
}
return `
<tr class="log-row">
<td class="whitespace-nowrap">${timestamp}</td>
@@ -55,7 +73,7 @@ class LogsPage {
<td>
<div class="log-message-container">
<code class="log-model">${log.model}</code>
<span class="log-tokens">${log.tokens} tokens</span>
<span class="log-tokens" title="${log.tokens} total tokens">${tokenDetails}</span>
<span class="log-duration">${log.duration}ms</span>
${log.error ? `<div class="log-error-msg">${log.error}</div>` : ''}
</div>
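With these fields populated, a successful request renders as something like "1523 in, 412 out (96 reasoning), 800 cache-hit" in place of the bare total, and the full token count is kept in the tooltip (the numbers here are illustrative).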