From ef37dc5af02b7fb21886bce443b52b036df04ae7 Mon Sep 17 00:00:00 2001
From: hobokenchicken
Date: Tue, 7 Apr 2026 13:48:02 +0000
Subject: [PATCH] fix(ollama): significantly increase context and prediction limits

- Increase timeout to 15m
- Set num_ctx to 32k for common models
- Set default num_predict to 8192 for common models
---
 internal/providers/ollama.go | 13 +++++++------
 1 file changed, 7 insertions(+), 6 deletions(-)

diff --git a/internal/providers/ollama.go b/internal/providers/ollama.go
index 51a0341b..589aa4a8 100644
--- a/internal/providers/ollama.go
+++ b/internal/providers/ollama.go
@@ -22,7 +22,8 @@ type OllamaProvider struct {
 func NewOllamaProvider(cfg config.OllamaConfig) *OllamaProvider {
 	client := resty.New()
 	// Set reasonable timeouts for local Ollama server (longer for larger models)
-	client.SetTimeout(5 * time.Minute)
+	// For streaming, we want a very long timeout or none at all to handle generation time
+	client.SetTimeout(15 * time.Minute)
 	client.SetRetryCount(2)
 	client.SetRetryWaitTime(1 * time.Second)
 
@@ -117,9 +118,9 @@ func BuildOllamaBody(request *models.UnifiedRequest, messagesJSON []interface{},
 
 	options := make(map[string]interface{})
 
-	// Context window size (8192 for common models)
+	// Context window size (32k for modern models to avoid truncation)
 	if strings.Contains(request.Model, "gemma") || strings.Contains(request.Model, "llama") || strings.Contains(request.Model, "mistral") || strings.Contains(request.Model, "qwen") {
-		options["num_ctx"] = 8192
+		options["num_ctx"] = 32768
 	}
 
 	if request.Temperature != nil {
@@ -131,10 +132,10 @@ func BuildOllamaBody(request *models.UnifiedRequest, messagesJSON []interface{},
 		body["max_tokens"] = *request.MaxTokens
 		options["num_predict"] = *request.MaxTokens
 	} else if strings.Contains(request.Model, "gemma") || strings.Contains(request.Model, "llama") || strings.Contains(request.Model, "mistral") || strings.Contains(request.Model, "qwen") {
-		// Default to 4096 for common Ollama models if not specified,
+		// Default to 8192 for common Ollama models if not specified,
 		// as Ollama's compatibility layer sometimes defaults to 128
-		body["max_tokens"] = 4096
-		options["num_predict"] = 4096
+		body["max_tokens"] = 8192
+		options["num_predict"] = 8192
 	}
 
 	if request.TopP != nil {