Files
GopherGate/src/utils/tokens.rs
hobokenchicken 8d50ce7c22
Some checks failed
CI / Check (push) Has been cancelled
CI / Clippy (push) Has been cancelled
CI / Formatting (push) Has been cancelled
CI / Test (push) Has been cancelled
CI / Release Build (push) Has been cancelled
perf: eliminate per-request SQLite queries and optimize proxy latency
- Add in-memory ModelConfigCache (30s refresh, explicit invalidation)
  replacing 2 SQLite queries per request (model lookup + cost override)
- Configure all 5 provider HTTP clients with proper timeouts (300s),
  connection pooling (4 idle/host, 90s idle timeout), and TCP keepalive
- Move client_usage update to tokio::spawn in non-streaming path
- Use fast chars/4 heuristic for token estimation on large inputs (>1KB)
- Generate single UUID/timestamp per SSE stream instead of per chunk
- Add shared LazyLock<Client> for image fetching in multimodal module
- Add proxy overhead timing instrumentation for both request paths
- Fix test helper to include new model_config_cache field
2026-03-02 12:53:22 -05:00

70 lines
2.6 KiB
Rust

use crate::models::UnifiedRequest;
use tiktoken_rs::get_bpe_from_model;
/// Count tokens in `text` using the tokenizer associated with `model`.
///
/// Models unknown to tiktoken fall back to the `cl100k_base` encoding
/// (used by GPT-4 / o1), which is a safe approximation for modern chat models.
pub fn count_tokens(model: &str, text: &str) -> u32 {
    let bpe = match get_bpe_from_model(model) {
        Ok(encoding) => encoding,
        // Unrecognized model name: fall back to the safe default encoding.
        Err(_) => tiktoken_rs::cl100k_base().expect("Failed to get cl100k_base encoding"),
    };
    bpe.encode_with_special_tokens(text).len() as u32
}
/// Estimate tokens for a unified request.
///
/// Small prompts (< 1 KiB of concatenated text) are tokenized precisely with
/// tiktoken, which is cheap at that size. Larger prompts use a fast
/// `chars / 4` heuristic so the async runtime is never blocked on a long
/// encode; precise counting for billing happens later in the background
/// finalize step.
///
/// Images are charged a flat 1000-token estimate each; a fixed 3-token
/// assistant reply header is always added.
pub fn estimate_request_tokens(model: &str, request: &UnifiedRequest) -> u32 {
    let mut total_text = String::new();
    let msg_count = request.messages.len() as u32;
    // Base per-message overhead for OpenAI-style chat formatting (approximate).
    let tokens_per_message: u32 = 3;
    // Count images in the same pass that gathers text, so neither estimation
    // path below has to re-scan the messages.
    let mut image_count: u32 = 0;
    for msg in &request.messages {
        for part in &msg.content {
            match part {
                crate::models::ContentPart::Text { text } => {
                    total_text.push_str(text);
                    total_text.push('\n');
                }
                crate::models::ContentPart::Image { .. } => {
                    // Vision models usually have a fixed cost or a size-based
                    // calculation; we use a flat 1000-token estimate below.
                    image_count += 1;
                }
            }
        }
    }

    let text_tokens = if total_text.len() < 1024 {
        // Precise path: small input, tiktoken encode is fast enough inline.
        count_tokens(model, &total_text)
    } else {
        // Fast heuristic (chars / 4) for large inputs to avoid blocking the
        // async runtime. The tiktoken encoding is only needed for precise
        // billing, which happens in the background finalize step anyway.
        (total_text.len() as u32) / 4
    };

    // Per-message overhead + text + images + assistant reply header.
    msg_count * tokens_per_message + text_tokens + image_count * 1000 + 3
}
/// Estimate tokens for completion text.
///
/// Thin wrapper over [`count_tokens`]; note the `(text, model)` parameter
/// order here is the reverse of that function's `(model, text)`.
pub fn estimate_completion_tokens(text: &str, model: &str) -> u32 {
    count_tokens(model, text)
}