From ef37dc5af02b7fb21886bce443b52b036df04ae7 Mon Sep 17 00:00:00 2001
From: hobokenchicken <dustin@dustin.coffee>
Date: Tue, 7 Apr 2026 13:48:02 +0000
Subject: [PATCH] fix(ollama): significantly increase context and prediction
 limits

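- Increase the Ollama HTTP client timeout from 5m to 15m
- Raise num_ctx from 8192 to 32768 (32k) for gemma, llama, mistral, and qwen models
- Raise the default num_predict/max_tokens from 4096 to 8192 for the same models when the request does not set MaxTokens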
---
 internal/providers/ollama.go | 13 +++++++------
 1 file changed, 7 insertions(+), 6 deletions(-)

diff --git a/internal/providers/ollama.go b/internal/providers/ollama.go
index 51a0341b..589aa4a8 100644
--- a/internal/providers/ollama.go
+++ b/internal/providers/ollama.go
@@ -22,7 +22,8 @@ type OllamaProvider struct {
 func NewOllamaProvider(cfg config.OllamaConfig) *OllamaProvider {
 	client := resty.New()
 	// Set reasonable timeouts for local Ollama server (longer for larger models)
-	client.SetTimeout(5 * time.Minute)
+	// Streaming responses can take a long time to generate, so use a long (15 minute) client timeout
+	client.SetTimeout(15 * time.Minute)
 	client.SetRetryCount(2)
 	client.SetRetryWaitTime(1 * time.Second)
 	
@@ -117,9 +118,9 @@ func BuildOllamaBody(request *models.UnifiedRequest, messagesJSON []interface{},
 
 	options := make(map[string]interface{})
 
-	// Context window size (8192 for common models)
+	// Context window size (32k for modern models to avoid truncation)
 	if strings.Contains(request.Model, "gemma") || strings.Contains(request.Model, "llama") || strings.Contains(request.Model, "mistral") || strings.Contains(request.Model, "qwen") {
-		options["num_ctx"] = 8192
+		options["num_ctx"] = 32768
 	}
 
 	if request.Temperature != nil {
@@ -131,10 +132,10 @@ func BuildOllamaBody(request *models.UnifiedRequest, messagesJSON []interface{},
 		body["max_tokens"] = *request.MaxTokens
 		options["num_predict"] = *request.MaxTokens
 	} else if strings.Contains(request.Model, "gemma") || strings.Contains(request.Model, "llama") || strings.Contains(request.Model, "mistral") || strings.Contains(request.Model, "qwen") {
-		// Default to 4096 for common Ollama models if not specified, 
+		// Default to 8192 for common Ollama models if not specified,
 		// as Ollama's compatibility layer sometimes defaults to 128
-		body["max_tokens"] = 4096
-		options["num_predict"] = 4096
+		body["max_tokens"] = 8192
+		options["num_predict"] = 8192
 	}
 
 	if request.TopP != nil {