From e8955fd36cbb23d821523ca6038f9ab1ac5fd34e Mon Sep 17 00:00:00 2001 From: hobokenchicken Date: Fri, 6 Mar 2026 15:22:06 -0500 Subject: [PATCH] merge --- .env | 10 ++-- .env.backup | 22 +++++++++ PLAN.md | 45 ++++++++++++++++-- ...0526ae213e830b3f1f33e785ef2b68cf48c7853fcd | Bin 0 -> 106496 bytes server.log | 11 +++++ server.pid | 1 + src/database/mod.rs | 10 ++++ src/providers/deepseek.rs | 10 ++-- src/providers/openai.rs | 3 +- src/server/mod.rs | 19 +++++++- src/state/mod.rs | 10 ++-- src/utils/streaming.rs | 18 ++++++- static/js/pages/models.js | 16 ++++++- 13 files changed, 150 insertions(+), 25 deletions(-) create mode 100644 .env.backup create mode 100644 data/llm_proxy.dbLLM_PROXY__ENCRYPTION_KEY=69879f5b7913ba169982190526ae213e830b3f1f33e785ef2b68cf48c7853fcd create mode 100644 server.log create mode 100644 server.pid diff --git a/.env b/.env index 35887aca..3bed8907 100644 --- a/.env +++ b/.env @@ -15,12 +15,14 @@ GROK_API_KEY=gk-demo-grok-key # Authentication tokens (comma-separated list) LLM_PROXY__SERVER__AUTH_TOKENS=demo-token-123456,another-token -# Server port (optional) -LLM_PROXY__SERVER__PORT=8080 - # Database path (optional) LLM_PROXY__DATABASE__PATH=./data/llm_proxy.db +# Session Secret (for signed tokens) SESSION_SECRET=ki9khXAk9usDkasMrD2UbK4LOgrDRJz0 -LLM_PROXY__ENCRYPTION_KEY=eac0239bfc402c7eb888366dd76c314288a8693efd5b7457819aeaf1fe429ac2 +# Encryption key (required) +LLM_PROXY__ENCRYPTION_KEY=69879f5b7913ba169982190526ae213e830b3f1f33e785ef2b68cf48c7853fcd + +# Server port (optional) +LLM_PROXY__SERVER__PORT=8081 diff --git a/.env.backup b/.env.backup new file mode 100644 index 00000000..796ebbe0 --- /dev/null +++ b/.env.backup @@ -0,0 +1,22 @@ +# LLM Proxy Gateway Environment Variables + +# OpenAI +OPENAI_API_KEY=sk-demo-openai-key + +# Google Gemini +GEMINI_API_KEY=AIza-demo-gemini-key + +# DeepSeek +DEEPSEEK_API_KEY=sk-demo-deepseek-key + +# xAI Grok (not yet available) +GROK_API_KEY=gk-demo-grok-key + +# Authentication tokens (comma-separated list) +LLM_PROXY__SERVER__AUTH_TOKENS=demo-token-123456,another-token + +# Server port (optional) +LLM_PROXY__SERVER__PORT=8080 + +# Database path (optional) +LLM_PROXY__DATABASE__PATH=./data/llm_proxy.db \ No newline at end of file diff --git a/PLAN.md b/PLAN.md index 10e31f6a..6220d5cf 100644 --- a/PLAN.md +++ b/PLAN.md @@ -56,7 +56,44 @@ This document outlines the roadmap for standardizing frontend security, cleaning --- -## Technical Standards -- **Rust:** No `unwrap()` in production code; use proper error handling (`Result`). -- **Frontend:** Always use `window.api` wrappers for sensitive operations. -- **Security:** Secrets must never be logged or hardcoded. +# Phase 6: Cache Cost & Provider Audit (ACTIVE) +**Primary Agents:** `frontend-developer`, `backend-developer`, `database-optimizer`, `lab-assistant` + +## 6.1 Dashboard UI Updates (@frontend-developer) +- [ ] **Update Models Page Modal:** Add input fields for `Cache Read Cost` and `Cache Write Cost` in `static/js/pages/models.js`. +- [ ] **API Integration:** Ensure `window.api.put` includes these new cost fields in the request body. +- [ ] **Verify Costs Page:** Confirm `static/js/pages/costs.js` displays these rates correctly in the pricing table. + +## 6.2 Provider Audit & Stream Fixes (@backend-developer) +- [ ] **Standard DeepSeek Fix:** Modify `src/providers/deepseek.rs` to stop stripping `stream_options` for `deepseek-chat`. +- [ ] **Grok Audit:** Verify if Grok correctly returns usage in streaming; it uses `build_openai_body` and doesn't seem to strip it. +- [ ] **Gemini Audit:** Confirm Gemini returns `usage_metadata` reliably in the final chunk. +- [ ] **Anthropic Audit:** Check if Anthropic streaming requires `include_usage` or similar flags. + +## 6.3 Database & Migration Validation (@database-optimizer) +- [ ] **Test Migrations:** Run the server to ensure `ALTER TABLE` logic in `src/database/mod.rs` applies the new columns correctly. +- [ ] **Schema Verification:** Verify `model_configs` has `cache_read_cost_per_m` and `cache_write_cost_per_m` columns. + +## 6.4 Token Estimation Refinement (@lab-assistant) +- [ ] **Analyze Heuristic:** Review `chars / 4` in `src/utils/tokens.rs`. +- [ ] **Background Precise Recount:** Propose a mechanism for a precise token count (using Tiktoken) after the response is finalized. + +## Critical Path +Migration Validation → UI Fields → Provider Stream Usage Reporting. + +```mermaid +gantt + title Phase 6 Timeline + dateFormat YYYY-MM-DD + section Frontend + Models Page UI :2026-03-06, 1d + Costs Table Update:after Models Page UI, 1d + section Backend + DeepSeek Fix :2026-03-06, 1d + Provider Audit (Grok/Gemini):after DeepSeek Fix, 2d + section Database + Migration Test :2026-03-06, 1d + section Optimization + Token Heuristic Review :2026-03-06, 1d +``` + diff --git a/data/llm_proxy.dbLLM_PROXY__ENCRYPTION_KEY=69879f5b7913ba169982190526ae213e830b3f1f33e785ef2b68cf48c7853fcd b/data/llm_proxy.dbLLM_PROXY__ENCRYPTION_KEY=69879f5b7913ba169982190526ae213e830b3f1f33e785ef2b68cf48c7853fcd new file mode 100644 index 0000000000000000000000000000000000000000..f9b058c51054ad3178af5b566ce7af932c507434 GIT binary patch literal 106496 zcmeI5%WoUU9mlznMN%)*j^k__A;2DNkc2Ill4~1w8YHx|QdComkPiJ@ypz;@;Du$fdJS?(Bm$Z6JY^FCk0ZdC%wf zo!`u8hF0r?nrac@j&Ag1OSqUhk>R+^dxDV3WKPq6Z_lA%+%zvOixzPKxx2H1~ zem|J86BoX>)4X2zekwnIX7N=1>$zvSxAI@iEM&jszU1D^e3Mqh3x*F1Ilg+9J2ytGpl^w7q28(0fSCa@SJtlbGJx&1j(`zclek)nwC>hNj~cgUoE#Ue&B?)z(^dT^w2U z>Oz*^E^-;icbfM!TVe9h(w%3CF`l%-JT87>&gOWY=RS3vB5k-~I^X6P0XRXgYZEKEf;2b#Pu1?95QhOR~BAuBz#9}!1yNUhf0ll!|w z@~e*NB*8CDB$ScC%5z!%9X`!q4s2<~`7!^vlygSx(}FW%9gi+`_5K}o*JK{@jKP?w zxZ&(sBs^=e(EJ(2pKnrZ^c?i0Z%?yI3}bw_dDD?SQ%D0sop-`~(H+f4%b#GN2TtfY^Eu-JLbw^<9)x@5E~!d6i+DwjLUgkcVcT^UwIr^Msv=)t>RBkaHrReR&1+^p7`FUt0Eq&yHD%~YX6FVKt zifTzavL^St#F1cN&dP-fJ47`7fxY~-Ggy0cNwBAD3Y+CpG|P^vX;es(O+GS;xXdP8j(anqx2X137b)6HnO+oP zb`5o4uO|~sI??RF>T4pyvRNlt=+LDwku#4(8=+KSzYq>@Goh$q~L%;9>0T2KI z5C8!X009sH0T2KI5C8!X_@M~&xP{D8`L`D@&T{#D-aD;XcMolZHRmA9e{whT_SwB4 z_x1jMPp7AV{x2R@E3226D;Jk5mxa~JwaTSym6e2`U(L~eafR%hyL*b0T2KI5C8!X009sH0T2KI5C8!XIKl+5 z|3AXUMb|(81V8`;KmY_l00ck)1V8`;K)@qlfBqlm|BwR+fB*=900@8p2!H?xfB*=9 z00hfB*=900@8p2!H?xfB*=900@9U5&?Yv zKZy#$KmY_l00ck)1V8`;KmY_l00cnbXc2Jl|9_V$e0Q|^h2DSw2!H?xfB*=900@8p z2!H?xfWS|lz})oGtU~U{L(Ovc|IadoXFqu=0006Y00JNY0w4eaAOHd&00JNY0*9Z# z8`BqOQ~v+I%!$L_J@fzsKmY_l00ck)1V8`;KmY_l00clFod7=npUws`AOHd&00JNY z0w4eaAOHd&00JOzR0u52EM#(1`OM;%g}*Phr}Fb>7Ek5Bo_m&iEC0pJLiStkOYXhQ zH<{*(_jh?g)o8{~;q8^n~A<2nB75_iA_y>**JJ8BTw zB8ntiUPvKUTRf>rPsvo_2bQ03$8!AHv)t#R zCGTh?Oey?+#w)*Fy?h&8_Wf^R{q9ye(8a?Z#G}7P2kY z+vk}a0aIa}N};{AEeh-6MzvFG3u~QblctyKz*f7ueUr`U_MJ*fXp6TZr49`JzN!!- zA*!b6Gji$J@S$AX= z@&lCShDg=2CbmW+DUi7>{&le?QeV>&+d{ols|AIY=blY}#TSd*6XJ{kPgbrDn1=;s z6fjY*JPcSd3-yNEO4f`{n_PqY34~64>w`{28GX6uS~);n^gj0$(S>SCa@SJtlbGJx z&1j(`zclek)nwC>hNj~cgUoE#Ue&B?)z(^dT?|{5`{#C%%Q(K%yrJp0p; zYh-DKvE||?=4_7VdG1r!Dbj`;rt@u%5r7l)%7(W~8QIIVW{UX-vS~ih4Mn1UVvm&p z3sX_efhO-uLAh+Sp=(ijIDZr%b#GN z2TtfY^Eu-JLb%Ie-yVc}<}Rs9Ipp1AZh$Ue-%fV}j3T5J=5h9sHk0F*m$}d89hF6E zjy`7@twm!em0L_lDSCTxLG8$NejXZ0OCNclN_R=p#7@VuqFU09tjYZ@aU>X+vvQ%r z4iQa%U@w2|4Avf9671=k!e+S?&9b9v8WoaclTT2S*D#6nyT<;2E~{e$C&B+rB*BsY zc{iKoTgz$wCu&b=B^r{uIi2Oto#SphYVl?q)87A3a*@Ylj8-C%CEIPEr#Z`yOI(g$ zTH>A-Ln{;0ovh3$rVKAIF0%=j_0-`nbj)d-|G)TeHjokoKmY_l00ck)1V8`;KmY_l00a&zfkoQz6WM=d3V%D^ zIDTfKzHn;(uk$zN{xSEv*=MtVn4QW0Dfhn|p$YH;0T2KI5CDNgM&OTpj^8YD2X6(} zr<$`YE)2g2Pz!> z^H?*&u?d;&#)OjuZ8T2F(X6rd1ZPX(2JOF| z Result<()> { enabled BOOLEAN DEFAULT TRUE, prompt_cost_per_m REAL, completion_cost_per_m REAL, + cache_read_cost_per_m REAL, + cache_write_cost_per_m REAL, mapping TEXT, updated_at DATETIME DEFAULT CURRENT_TIMESTAMP, FOREIGN KEY (provider_id) REFERENCES provider_configs(id) ON DELETE CASCADE @@ -180,6 +182,14 @@ pub async fn run_migrations(pool: &DbPool) -> Result<()> { .execute(pool) .await; + // Add manual cache cost columns to model_configs if they don't exist + let _ = sqlx::query("ALTER TABLE model_configs ADD COLUMN cache_read_cost_per_m REAL") + .execute(pool) + .await; + let _ = sqlx::query("ALTER TABLE model_configs ADD COLUMN cache_write_cost_per_m REAL") + .execute(pool) + .await; + // Insert default admin user if none exists (default password: admin) let user_count: (i64,) = sqlx::query_as("SELECT COUNT(*) FROM users").fetch_one(pool).await?; diff --git a/src/providers/deepseek.rs b/src/providers/deepseek.rs index 315dcc84..022e616f 100644 --- a/src/providers/deepseek.rs +++ b/src/providers/deepseek.rs @@ -152,8 +152,11 @@ impl super::Provider for DeepSeekProvider { // Sanitize and fix for deepseek-reasoner (R1) if request.model == "deepseek-reasoner" { if let Some(obj) = body.as_object_mut() { - obj.remove("stream_options"); + // Keep stream_options if present (DeepSeek supports include_usage) + + // Remove unsupported parameters obj.remove("temperature"); + obj.remove("top_p"); obj.remove("presence_penalty"); obj.remove("frequency_penalty"); @@ -177,11 +180,6 @@ impl super::Provider for DeepSeekProvider { } } } - } else { - // For standard deepseek-chat, keep it clean - if let Some(obj) = body.as_object_mut() { - obj.remove("stream_options"); - } } let url = format!("{}/chat/completions", self.config.base_url); diff --git a/src/providers/openai.rs b/src/providers/openai.rs index 913ca59b..7fbbcb89 100644 --- a/src/providers/openai.rs +++ b/src/providers/openai.rs @@ -317,8 +317,7 @@ impl super::Provider for OpenAIProvider { // Standard OpenAI cleanup if let Some(obj) = body.as_object_mut() { - obj.remove("stream_options"); - + // stream_options.include_usage is supported by OpenAI for token usage in streaming // Transition: Newer OpenAI models (o1, o3, gpt-5) require max_completion_tokens if request.model.starts_with("o1-") || request.model.starts_with("o3-") || request.model.contains("gpt-5") { if let Some(max_tokens) = obj.remove("max_tokens") { diff --git a/src/server/mod.rs b/src/server/mod.rs index 1726e890..9fe39b34 100644 --- a/src/server/mod.rs +++ b/src/server/mod.rs @@ -137,8 +137,23 @@ async fn get_model_cost( // Check in-memory cache for cost overrides (no SQLite hit) if let Some(cached) = state.model_config_cache.get(model).await { if let (Some(p), Some(c)) = (cached.prompt_cost_per_m, cached.completion_cost_per_m) { - // Manual overrides don't have cache-specific rates, so use simple formula - return (prompt_tokens as f64 * p / 1_000_000.0) + (completion_tokens as f64 * c / 1_000_000.0); + // Manual overrides logic: if cache rates are provided, use cache-aware formula. + // Formula: (non_cached_prompt * input_rate) + (cache_read * read_rate) + (cache_write * write_rate) + (completion * output_rate) + let non_cached_prompt = prompt_tokens.saturating_sub(cache_read_tokens); + let mut total = (non_cached_prompt as f64 * p / 1_000_000.0) + (completion_tokens as f64 * c / 1_000_000.0); + + if let Some(cr) = cached.cache_read_cost_per_m { + total += cache_read_tokens as f64 * cr / 1_000_000.0; + } else { + // No manual cache_read rate — charge cached tokens at full input rate (backwards compatibility) + total += cache_read_tokens as f64 * p / 1_000_000.0; + } + + if let Some(cw) = cached.cache_write_cost_per_m { + total += cache_write_tokens as f64 * cw / 1_000_000.0; + } + + return total; } } diff --git a/src/state/mod.rs b/src/state/mod.rs index 71dc867a..690b528b 100644 --- a/src/state/mod.rs +++ b/src/state/mod.rs @@ -15,6 +15,8 @@ pub struct CachedModelConfig { pub mapping: Option, pub prompt_cost_per_m: Option, pub completion_cost_per_m: Option, + pub cache_read_cost_per_m: Option, + pub cache_write_cost_per_m: Option, } /// In-memory cache for model_configs table. @@ -35,15 +37,15 @@ impl ModelConfigCache { /// Load all model configs from the database into cache pub async fn refresh(&self) { - match sqlx::query_as::<_, (String, bool, Option, Option, Option)>( - "SELECT id, enabled, mapping, prompt_cost_per_m, completion_cost_per_m FROM model_configs", + match sqlx::query_as::<_, (String, bool, Option, Option, Option, Option, Option)>( + "SELECT id, enabled, mapping, prompt_cost_per_m, completion_cost_per_m, cache_read_cost_per_m, cache_write_cost_per_m FROM model_configs", ) .fetch_all(&self.db_pool) .await { Ok(rows) => { let mut map = HashMap::with_capacity(rows.len()); - for (id, enabled, mapping, prompt_cost, completion_cost) in rows { + for (id, enabled, mapping, prompt_cost, completion_cost, cache_read_cost, cache_write_cost) in rows { map.insert( id, CachedModelConfig { @@ -51,6 +53,8 @@ impl ModelConfigCache { mapping, prompt_cost_per_m: prompt_cost, completion_cost_per_m: completion_cost, + cache_read_cost_per_m: cache_read_cost, + cache_write_cost_per_m: cache_write_cost, }, ); } diff --git a/src/utils/streaming.rs b/src/utils/streaming.rs index 03e2b3fc..50a8075b 100644 --- a/src/utils/streaming.rs +++ b/src/utils/streaming.rs @@ -118,8 +118,22 @@ where // Check in-memory cache for cost overrides (no SQLite hit) let cost = if let Some(cached) = config_cache.get(&model).await { if let (Some(p), Some(c)) = (cached.prompt_cost_per_m, cached.completion_cost_per_m) { - // Cost override doesn't have cache-aware pricing, use simple formula - (prompt_tokens as f64 * p / 1_000_000.0) + (completion_tokens as f64 * c / 1_000_000.0) + // Manual overrides logic: if cache rates are provided, use cache-aware formula. + let non_cached_prompt = prompt_tokens.saturating_sub(cache_read_tokens); + let mut total = (non_cached_prompt as f64 * p / 1_000_000.0) + (completion_tokens as f64 * c / 1_000_000.0); + + if let Some(cr) = cached.cache_read_cost_per_m { + total += cache_read_tokens as f64 * cr / 1_000_000.0; + } else { + // Charge cached tokens at full input rate if no specific rate provided + total += cache_read_tokens as f64 * p / 1_000_000.0; + } + + if let Some(cw) = cached.cache_write_cost_per_m { + total += cache_write_tokens as f64 * cw / 1_000_000.0; + } + + total } else { provider.calculate_cost( &model, diff --git a/static/js/pages/models.js b/static/js/pages/models.js index 9b6449d5..726d963f 100644 --- a/static/js/pages/models.js +++ b/static/js/pages/models.js @@ -30,13 +30,13 @@ class ModelsPage { tableBody.innerHTML = 'No models found in registry'; return; } - + // Sort by provider then name this.models.sort((a, b) => { if (a.provider !== b.provider) return a.provider.localeCompare(b.provider); return a.name.localeCompare(b.name); }); - + tableBody.innerHTML = this.models.map(model => { const statusClass = model.enabled ? 'success' : 'secondary'; const statusIcon = model.enabled ? 'check-circle' : 'ban'; @@ -99,6 +99,14 @@ class ModelsPage { +
+ + +
+
+ + +
@@ -118,6 +126,8 @@ class ModelsPage { const enabled = modal.querySelector('#model-enabled').checked; const promptCost = parseFloat(modal.querySelector('#model-prompt-cost').value); const completionCost = parseFloat(modal.querySelector('#model-completion-cost').value); + const cacheReadCost = parseFloat(modal.querySelector('#model-cache-read-cost').value); + const cacheWriteCost = parseFloat(modal.querySelector('#model-cache-write-cost').value); const mapping = modal.querySelector('#model-mapping').value; try { @@ -125,6 +135,8 @@ class ModelsPage { enabled, prompt_cost: promptCost, completion_cost: completionCost, + cache_read_cost: isNaN(cacheReadCost) ? null : cacheReadCost, + cache_write_cost: isNaN(cacheWriteCost) ? null : cacheWriteCost, mapping: mapping || null });