fix(ollama): improve model detection and ensure robust token/context limits
- Use case-insensitive matching for model names and routing
- Default max_tokens/num_predict to 8192 for all Ollama models to prevent truncation
- Increase default context window and add more large-context model families
- Ensure DeepSeek routing handles Ollama-hosted variants correctly
This commit is contained in:
@@ -117,11 +117,20 @@ func BuildOllamaBody(request *models.UnifiedRequest, messagesJSON []interface{},
|
||||
}
|
||||
|
||||
options := make(map[string]interface{})
|
||||
modelLower := strings.ToLower(request.Model)
|
||||
|
||||
// Context window size (32k for modern models to avoid truncation)
|
||||
if strings.Contains(request.Model, "gemma") || strings.Contains(request.Model, "llama") || strings.Contains(request.Model, "mistral") || strings.Contains(request.Model, "qwen") {
|
||||
options["num_ctx"] = 32768
|
||||
// Context window size (default 8k for all, 32k+ for modern large-context models)
|
||||
ctxSize := 8192
|
||||
if strings.Contains(modelLower, "llama3") ||
|
||||
strings.Contains(modelLower, "mistral") ||
|
||||
strings.Contains(modelLower, "mixtral") ||
|
||||
strings.Contains(modelLower, "qwen") ||
|
||||
strings.Contains(modelLower, "deepseek") ||
|
||||
strings.Contains(modelLower, "command-r") ||
|
||||
strings.Contains(modelLower, "phi3") {
|
||||
ctxSize = 32768
|
||||
}
|
||||
options["num_ctx"] = ctxSize
|
||||
|
||||
if request.Temperature != nil {
|
||||
body["temperature"] = *request.Temperature
|
||||
@@ -131,9 +140,10 @@ func BuildOllamaBody(request *models.UnifiedRequest, messagesJSON []interface{},
|
||||
if request.MaxTokens != nil {
|
||||
body["max_tokens"] = *request.MaxTokens
|
||||
options["num_predict"] = *request.MaxTokens
|
||||
} else if strings.Contains(request.Model, "gemma") || strings.Contains(request.Model, "llama") || strings.Contains(request.Model, "mistral") || strings.Contains(request.Model, "qwen") {
|
||||
// Default to 8192 for common Ollama models if not specified,
|
||||
// as Ollama's compatibility layer sometimes defaults to 128
|
||||
} else {
|
||||
// Default to 8192 for all Ollama models if not specified,
|
||||
// as Ollama's compatibility layer defaults to 128 if neither
|
||||
// max_tokens nor num_predict are provided.
|
||||
body["max_tokens"] = 8192
|
||||
options["num_predict"] = 8192
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user