fix(ollama): significantly increase context and prediction limits
- Increase timeout to 15m
- Set num_ctx to 32k for common models
- Set default num_predict to 8192 for common models
This commit is contained in:
@@ -22,7 +22,8 @@ type OllamaProvider struct {
|
|||||||
func NewOllamaProvider(cfg config.OllamaConfig) *OllamaProvider {
|
func NewOllamaProvider(cfg config.OllamaConfig) *OllamaProvider {
|
||||||
client := resty.New()
|
client := resty.New()
|
||||||
// Set reasonable timeouts for local Ollama server (longer for larger models)
|
// Set reasonable timeouts for local Ollama server (longer for larger models)
|
||||||
client.SetTimeout(5 * time.Minute)
|
// For streaming, we want a very long timeout or none at all to handle generation time
|
||||||
|
client.SetTimeout(15 * time.Minute)
|
||||||
client.SetRetryCount(2)
|
client.SetRetryCount(2)
|
||||||
client.SetRetryWaitTime(1 * time.Second)
|
client.SetRetryWaitTime(1 * time.Second)
|
||||||
|
|
||||||
@@ -117,9 +118,9 @@ func BuildOllamaBody(request *models.UnifiedRequest, messagesJSON []interface{},
|
|||||||
|
|
||||||
options := make(map[string]interface{})
|
options := make(map[string]interface{})
|
||||||
|
|
||||||
// Context window size (8192 for common models)
|
// Context window size (32k for modern models to avoid truncation)
|
||||||
if strings.Contains(request.Model, "gemma") || strings.Contains(request.Model, "llama") || strings.Contains(request.Model, "mistral") || strings.Contains(request.Model, "qwen") {
|
if strings.Contains(request.Model, "gemma") || strings.Contains(request.Model, "llama") || strings.Contains(request.Model, "mistral") || strings.Contains(request.Model, "qwen") {
|
||||||
options["num_ctx"] = 8192
|
options["num_ctx"] = 32768
|
||||||
}
|
}
|
||||||
|
|
||||||
if request.Temperature != nil {
|
if request.Temperature != nil {
|
||||||
@@ -131,10 +132,10 @@ func BuildOllamaBody(request *models.UnifiedRequest, messagesJSON []interface{},
|
|||||||
body["max_tokens"] = *request.MaxTokens
|
body["max_tokens"] = *request.MaxTokens
|
||||||
options["num_predict"] = *request.MaxTokens
|
options["num_predict"] = *request.MaxTokens
|
||||||
} else if strings.Contains(request.Model, "gemma") || strings.Contains(request.Model, "llama") || strings.Contains(request.Model, "mistral") || strings.Contains(request.Model, "qwen") {
|
} else if strings.Contains(request.Model, "gemma") || strings.Contains(request.Model, "llama") || strings.Contains(request.Model, "mistral") || strings.Contains(request.Model, "qwen") {
|
||||||
// Default to 4096 for common Ollama models if not specified,
|
// Default to 8192 for common Ollama models if not specified,
|
||||||
// as Ollama's compatibility layer sometimes defaults to 128
|
// as Ollama's compatibility layer sometimes defaults to 128
|
||||||
body["max_tokens"] = 4096
|
body["max_tokens"] = 8192
|
||||||
options["num_predict"] = 4096
|
options["num_predict"] = 8192
|
||||||
}
|
}
|
||||||
|
|
||||||
if request.TopP != nil {
|
if request.TopP != nil {
|
||||||
|
|||||||
Reference in New Issue
Block a user