fix(ollama): significantly increase context and prediction limits
CI / Lint (push) Has been cancelled
CI / Test (push) Has been cancelled
CI / Build (push) Has been cancelled

- Increase the Ollama HTTP client timeout from 5m to 15m
- Raise num_ctx from 8192 to 32768 (32k) for common models (names containing gemma, llama, mistral, or qwen)
- Raise the default num_predict (and max_tokens) from 4096 to 8192 for the same common models when the request does not specify MaxTokens
This commit is contained in:
2026-04-07 13:48:02 +00:00
parent fdbb068a6c
commit ef37dc5af0
+7 -6
View File
@@ -22,7 +22,8 @@ type OllamaProvider struct {
func NewOllamaProvider(cfg config.OllamaConfig) *OllamaProvider { func NewOllamaProvider(cfg config.OllamaConfig) *OllamaProvider {
client := resty.New() client := resty.New()
// Set reasonable timeouts for local Ollama server (longer for larger models) // Set reasonable timeouts for local Ollama server (longer for larger models)
client.SetTimeout(5 * time.Minute) // For streaming, we want a very long timeout or none at all to handle generation time
client.SetTimeout(15 * time.Minute)
client.SetRetryCount(2) client.SetRetryCount(2)
client.SetRetryWaitTime(1 * time.Second) client.SetRetryWaitTime(1 * time.Second)
@@ -117,9 +118,9 @@ func BuildOllamaBody(request *models.UnifiedRequest, messagesJSON []interface{},
options := make(map[string]interface{}) options := make(map[string]interface{})
// Context window size (8192 for common models) // Context window size (32k for modern models to avoid truncation)
if strings.Contains(request.Model, "gemma") || strings.Contains(request.Model, "llama") || strings.Contains(request.Model, "mistral") || strings.Contains(request.Model, "qwen") { if strings.Contains(request.Model, "gemma") || strings.Contains(request.Model, "llama") || strings.Contains(request.Model, "mistral") || strings.Contains(request.Model, "qwen") {
options["num_ctx"] = 8192 options["num_ctx"] = 32768
} }
if request.Temperature != nil { if request.Temperature != nil {
@@ -131,10 +132,10 @@ func BuildOllamaBody(request *models.UnifiedRequest, messagesJSON []interface{},
body["max_tokens"] = *request.MaxTokens body["max_tokens"] = *request.MaxTokens
options["num_predict"] = *request.MaxTokens options["num_predict"] = *request.MaxTokens
} else if strings.Contains(request.Model, "gemma") || strings.Contains(request.Model, "llama") || strings.Contains(request.Model, "mistral") || strings.Contains(request.Model, "qwen") { } else if strings.Contains(request.Model, "gemma") || strings.Contains(request.Model, "llama") || strings.Contains(request.Model, "mistral") || strings.Contains(request.Model, "qwen") {
// Default to 4096 for common Ollama models if not specified, // Default to 8192 for common Ollama models if not specified,
// as Ollama's compatibility layer sometimes defaults to 128 // as Ollama's compatibility layer sometimes defaults to 128
body["max_tokens"] = 4096 body["max_tokens"] = 8192
options["num_predict"] = 4096 options["num_predict"] = 8192
} }
if request.TopP != nil { if request.TopP != nil {