fix(ollama): increase timeout and add default max_tokens for large models
- Increase the Ollama client timeout from 30s to 5m for larger models (e.g. gemma4)
- Set default max_tokens to 4096 for common Ollama models (gemma, llama, mistral, qwen) when the request does not specify it
- Expand the stream scanner's maximum buffer to 10MB to prevent truncation
- Improve model routing and prefix stripping in server
This commit is contained in:
@@ -21,8 +21,8 @@ type OllamaProvider struct {
|
||||
|
||||
func NewOllamaProvider(cfg config.OllamaConfig) *OllamaProvider {
|
||||
client := resty.New()
|
||||
// Set reasonable timeouts for local Ollama server
|
||||
client.SetTimeout(30 * time.Second)
|
||||
// Set reasonable timeouts for local Ollama server (longer for larger models)
|
||||
client.SetTimeout(5 * time.Minute)
|
||||
client.SetRetryCount(2)
|
||||
client.SetRetryWaitTime(1 * time.Second)
|
||||
|
||||
@@ -118,9 +118,15 @@ func BuildOllamaBody(request *models.UnifiedRequest, messagesJSON []interface{},
|
||||
if request.Temperature != nil {
|
||||
body["temperature"] = *request.Temperature
|
||||
}
|
||||
|
||||
if request.MaxTokens != nil {
|
||||
body["max_tokens"] = *request.MaxTokens
|
||||
} else if strings.Contains(request.Model, "gemma") || strings.Contains(request.Model, "llama") || strings.Contains(request.Model, "mistral") || strings.Contains(request.Model, "qwen") {
|
||||
// Default to 4096 for common Ollama models if not specified,
|
||||
// as Ollama's compatibility layer sometimes defaults to 128
|
||||
body["max_tokens"] = 4096
|
||||
}
|
||||
|
||||
if request.TopP != nil {
|
||||
body["top_p"] = *request.TopP
|
||||
}
|
||||
@@ -196,6 +202,11 @@ func ParseOllamaStreamChunk(line string) (*models.ChatCompletionStreamResponse,
|
||||
func StreamOllama(ctx io.ReadCloser, ch chan<- *models.ChatCompletionStreamResponse, model string) error {
|
||||
defer ctx.Close()
|
||||
scanner := bufio.NewScanner(ctx)
|
||||
// Set a larger buffer for scanning to handle large chunks if they occur
|
||||
const maxCapacity = 10 * 1024 * 1024 // 10MB
|
||||
buf := make([]byte, 64*1024)
|
||||
scanner.Buffer(buf, maxCapacity)
|
||||
|
||||
for scanner.Scan() {
|
||||
line := scanner.Text()
|
||||
chunk, done, err := ParseOllamaStreamChunk(line)
|
||||
|
||||
Reference in New Issue
Block a user