From fdbb068a6c62c82b6162b29e66936769c0bb3ddc Mon Sep 17 00:00:00 2001 From: hobokenchicken Date: Tue, 7 Apr 2026 13:44:17 +0000 Subject: [PATCH] fix(ollama): map max_tokens to num_predict and increase context window - Map MaxTokens to num_predict in options map - Set default num_ctx to 8192 for common models (gemma, llama, etc.) - This ensures Ollama doesn't cut off responses early due to default limits --- internal/providers/ollama.go | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/internal/providers/ollama.go b/internal/providers/ollama.go index 182c4ae2..51a0341b 100644 --- a/internal/providers/ollama.go +++ b/internal/providers/ollama.go @@ -115,24 +115,41 @@ func BuildOllamaBody(request *models.UnifiedRequest, messagesJSON []interface{}, "stream": stream, } + options := make(map[string]interface{}) + + // Context window size (8192 for common models) + if strings.Contains(request.Model, "gemma") || strings.Contains(request.Model, "llama") || strings.Contains(request.Model, "mistral") || strings.Contains(request.Model, "qwen") { + options["num_ctx"] = 8192 + } + if request.Temperature != nil { body["temperature"] = *request.Temperature + options["temperature"] = *request.Temperature } if request.MaxTokens != nil { body["max_tokens"] = *request.MaxTokens + options["num_predict"] = *request.MaxTokens } else if strings.Contains(request.Model, "gemma") || strings.Contains(request.Model, "llama") || strings.Contains(request.Model, "mistral") || strings.Contains(request.Model, "qwen") { // Default to 4096 for common Ollama models if not specified, // as Ollama's compatibility layer sometimes defaults to 128 body["max_tokens"] = 4096 + options["num_predict"] = 4096 } if request.TopP != nil { body["top_p"] = *request.TopP + options["top_p"] = *request.TopP } if request.TopK != nil { body["top_k"] = *request.TopK + options["top_k"] = *request.TopK } + + if len(options) > 0 { + body["options"] = options + } + if len(request.Stop) > 0 { body["stop"] = request.Stop }