use crate::models::{ContentPart, UnifiedRequest};
use tiktoken_rs::get_bpe_from_model;

/// Count tokens for a given model and text.
pub fn count_tokens(model: &str, text: &str) -> u32 {
    // If we can't get a BPE for the model, fall back to a safe default
    // (cl100k_base, the encoding used by GPT-4-era models).
    let bpe = get_bpe_from_model(model).unwrap_or_else(|_| {
        tiktoken_rs::cl100k_base().expect("Failed to get cl100k_base encoding")
    });
    bpe.encode_with_special_tokens(text).len() as u32
}

/// Estimate tokens for a unified request.
///
/// Small inputs (< 1 KB of text) are counted exactly with tiktoken; larger
/// inputs fall back to a ~4-chars-per-token heuristic so we never stall the
/// caller on a long encode. Precise counting is only needed for billing,
/// which happens in the background finalize step.
pub fn estimate_request_tokens(model: &str, request: &UnifiedRequest) -> u32 {
    let mut total_text = String::new();
    let msg_count = request.messages.len() as u32;

    // Base tokens per message for OpenAI-style chat formatting (approximate).
    let tokens_per_message: u32 = 3;

    let mut image_count: u32 = 0;
    for msg in &request.messages {
        for part in &msg.content {
            match part {
                ContentPart::Text { text } => {
                    total_text.push_str(text);
                    total_text.push('\n');
                }
                ContentPart::Image { .. } => {
                    // Vision models usually have a fixed per-image cost or a
                    // size-based calculation; we apply a flat estimate below.
                    image_count += 1;
                }
            }
        }
    }

    // Per-message formatting tokens, a flat 1000-token estimate per image,
    // and 3 tokens for the assistant reply header.
    let overhead = msg_count * tokens_per_message + image_count * 1000 + 3;

    // Small inputs (< 1 KB): count exactly with tiktoken; the encode is
    // cheap enough here that heuristics aren't worth it.
    if total_text.len() < 1024 {
        return overhead + count_tokens(model, &total_text);
    }

    // Large inputs: fast heuristic (chars / 4) to avoid blocking the async
    // runtime on a long encode.
    overhead + (total_text.len() as u32) / 4
}

/// Estimate tokens for completion text.
pub fn estimate_completion_tokens(text: &str, model: &str) -> u32 {
    count_tokens(model, text)
}
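
// The comments above defer precise counting to a background finalize step.
// A minimal sketch of that path, assuming a tokio runtime is available
// (`tokio` is an assumption of this sketch, not a confirmed dependency of
// this crate): the CPU-bound exact encode is moved onto the blocking thread
// pool so large prompts never stall the async runtime.
#[allow(dead_code)]
pub async fn count_tokens_offloaded(model: String, text: String) -> u32 {
    // spawn_blocking hands the encode to tokio's dedicated blocking pool
    // and yields until it completes.
    tokio::task::spawn_blocking(move || count_tokens(&model, &text))
        .await
        .expect("token counting task panicked")
}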
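
// A small usage sketch in test form. Exact token counts depend on the
// tiktoken vocabulary, so these assert only coarse properties; the model
// name "definitely-not-a-real-model" is a deliberate unknown used to
// exercise the cl100k_base fallback.
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn counting_is_positive_and_fallback_does_not_panic() {
        let n = count_tokens("gpt-4", "Hello, world!");
        assert!(n > 0);

        // Unknown model: should silently fall back to cl100k_base.
        let m = count_tokens("definitely-not-a-real-model", "Hello, world!");
        assert!(m > 0);

        // The completion estimator is a thin wrapper over count_tokens.
        assert_eq!(estimate_completion_tokens("Hello, world!", "gpt-4"), n);
    }
}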