From 07d6689d87545f34666f1ba491a2ed9d968cd7ba Mon Sep 17 00:00:00 2001 From: Blue-B Date: Sat, 7 Mar 2026 21:31:10 +0900 Subject: [PATCH 1/2] fix(claude): add interleaved-thinking beta header, AMP gzip error decoding, normalizeClaudeBudget max_tokens MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit 1. Always include interleaved-thinking-2025-05-14 beta header so that thinking blocks are returned correctly for all Claude models. 2. Remove status-code guard in AMP reverse proxy ModifyResponse so that error responses (4xx/5xx) with hidden gzip encoding are decoded properly — prevents garbled error messages reaching the client. 3. In normalizeClaudeBudget, when the adjusted budget falls below the model minimum, set max_tokens = budgetTokens+1 instead of leaving the request unchanged (which causes a 400 from the API). --- internal/api/modules/amp/proxy.go | 5 ----- internal/runtime/executor/claude_executor.go | 3 +++ internal/thinking/provider/claude/apply.go | 4 +++- 3 files changed, 6 insertions(+), 6 deletions(-) diff --git a/internal/api/modules/amp/proxy.go b/internal/api/modules/amp/proxy.go index ecc9da77..c8010854 100644 --- a/internal/api/modules/amp/proxy.go +++ b/internal/api/modules/amp/proxy.go @@ -108,11 +108,6 @@ func createReverseProxy(upstreamURL string, secretSource SecretSource) (*httputi // Modify incoming responses to handle gzip without Content-Encoding // This addresses the same issue as inline handler gzip handling, but at the proxy level proxy.ModifyResponse = func(resp *http.Response) error { - // Only process successful responses - if resp.StatusCode < 200 || resp.StatusCode >= 300 { - return nil - } - // Skip if already marked as gzip (Content-Encoding set) if resp.Header.Get("Content-Encoding") != "" { return nil diff --git a/internal/runtime/executor/claude_executor.go b/internal/runtime/executor/claude_executor.go index 7d0ddcf2..8cdbbf4f 100644 --- a/internal/runtime/executor/claude_executor.go +++ 
b/internal/runtime/executor/claude_executor.go @@ -832,6 +832,9 @@ func applyClaudeHeaders(r *http.Request, auth *cliproxyauth.Auth, apiKey string, baseBetas += ",oauth-2025-04-20" } } + if !strings.Contains(baseBetas, "interleaved-thinking") { + baseBetas += ",interleaved-thinking-2025-05-14" + } hasClaude1MHeader := false if ginHeaders != nil { diff --git a/internal/thinking/provider/claude/apply.go b/internal/thinking/provider/claude/apply.go index 275be469..af031907 100644 --- a/internal/thinking/provider/claude/apply.go +++ b/internal/thinking/provider/claude/apply.go @@ -194,7 +194,9 @@ func (a *Applier) normalizeClaudeBudget(body []byte, budgetTokens int, modelInfo } if minBudget > 0 && adjustedBudget > 0 && adjustedBudget < minBudget { // If enforcing the max_tokens constraint would push the budget below the model minimum, - // leave the request unchanged. + // increase max_tokens to accommodate the original budget instead of leaving the + // request unchanged (which would cause a 400 error from the API). + body, _ = sjson.SetBytes(body, "max_tokens", budgetTokens+1) return body } From 5f58248016c33c7a3cc01691abf2452e4810f7e7 Mon Sep 17 00:00:00 2001 From: Blue-B Date: Mon, 9 Mar 2026 22:10:30 +0900 Subject: [PATCH 2/2] fix(claude): clamp max_tokens to model limit in normalizeClaudeBudget When adjustedBudget < minBudget, the previous fix blindly set max_tokens = budgetTokens+1 which could exceed MaxCompletionTokens. Now: cap max_tokens at MaxCompletionTokens, recalculate budget, and disable thinking entirely if constraints are unsatisfiable. Add unit tests covering raise, clamp, disable, and no-op scenarios. 
--- internal/thinking/provider/claude/apply.go | 29 +++++- .../thinking/provider/claude/apply_test.go | 99 +++++++++++++++++++ 2 files changed, 123 insertions(+), 5 deletions(-) create mode 100644 internal/thinking/provider/claude/apply_test.go diff --git a/internal/thinking/provider/claude/apply.go b/internal/thinking/provider/claude/apply.go index af031907..c92f539e 100644 --- a/internal/thinking/provider/claude/apply.go +++ b/internal/thinking/provider/claude/apply.go @@ -174,7 +174,8 @@ func (a *Applier) normalizeClaudeBudget(body []byte, budgetTokens int, modelInfo // Ensure the request satisfies Claude constraints: // 1) Determine effective max_tokens (request overrides model default) // 2) If budget_tokens >= max_tokens, reduce budget_tokens to max_tokens-1 - // 3) If the adjusted budget falls below the model minimum, leave the request unchanged + // 3) If the adjusted budget falls below the model minimum, try raising max_tokens + // (clamped to MaxCompletionTokens); disable thinking if constraints are unsatisfiable // 4) If max_tokens came from model default, write it back into the request effectiveMax, setDefaultMax := a.effectiveMaxTokens(body, modelInfo) @@ -193,10 +194,28 @@ func (a *Applier) normalizeClaudeBudget(body []byte, budgetTokens int, modelInfo minBudget = modelInfo.Thinking.Min } if minBudget > 0 && adjustedBudget > 0 && adjustedBudget < minBudget { - // If enforcing the max_tokens constraint would push the budget below the model minimum, - // increase max_tokens to accommodate the original budget instead of leaving the - // request unchanged (which would cause a 400 error from the API). - body, _ = sjson.SetBytes(body, "max_tokens", budgetTokens+1) + // Enforcing budget_tokens < max_tokens pushed the budget below the model minimum. + // Try raising max_tokens to fit the original budget. 
+ needed := budgetTokens + 1 + maxAllowed := 0 + if modelInfo != nil { + maxAllowed = modelInfo.MaxCompletionTokens + } + if maxAllowed > 0 && needed > maxAllowed { + // Cannot use original budget; cap max_tokens at model limit. + needed = maxAllowed + } + cappedBudget := needed - 1 + if cappedBudget < minBudget { + // Impossible to satisfy both budget >= minBudget and budget < max_tokens + // within the model's completion limit. Disable thinking entirely. + body, _ = sjson.DeleteBytes(body, "thinking") + return body + } + body, _ = sjson.SetBytes(body, "max_tokens", needed) + if cappedBudget != budgetTokens { + body, _ = sjson.SetBytes(body, "thinking.budget_tokens", cappedBudget) + } return body } diff --git a/internal/thinking/provider/claude/apply_test.go b/internal/thinking/provider/claude/apply_test.go new file mode 100644 index 00000000..46b3f3b7 --- /dev/null +++ b/internal/thinking/provider/claude/apply_test.go @@ -0,0 +1,99 @@ +package claude + +import ( + "testing" + + "github.com/router-for-me/CLIProxyAPI/v6/internal/registry" + "github.com/tidwall/gjson" +) + +func TestNormalizeClaudeBudget_RaisesMaxTokens(t *testing.T) { + a := &Applier{} + modelInfo := &registry.ModelInfo{ + MaxCompletionTokens: 64000, + Thinking: &registry.ThinkingSupport{Min: 1024, Max: 128000}, + } + body := []byte(`{"max_tokens":1000,"thinking":{"type":"enabled","budget_tokens":5000}}`) + + out := a.normalizeClaudeBudget(body, 5000, modelInfo) + + maxTok := gjson.GetBytes(out, "max_tokens").Int() + if maxTok != 5001 { + t.Fatalf("max_tokens = %d, want 5001, body=%s", maxTok, string(out)) + } +} + +func TestNormalizeClaudeBudget_ClampsToModelMax(t *testing.T) { + a := &Applier{} + modelInfo := &registry.ModelInfo{ + MaxCompletionTokens: 64000, + Thinking: &registry.ThinkingSupport{Min: 1024, Max: 128000}, + } + body := []byte(`{"max_tokens":500,"thinking":{"type":"enabled","budget_tokens":200000}}`) + + out := a.normalizeClaudeBudget(body, 200000, modelInfo) + + maxTok := gjson.GetBytes(out, 
"max_tokens").Int() + if maxTok != 64000 { + t.Fatalf("max_tokens = %d, want 64000 (capped to model limit), body=%s", maxTok, string(out)) + } + budget := gjson.GetBytes(out, "thinking.budget_tokens").Int() + if budget != 63999 { + t.Fatalf("budget_tokens = %d, want 63999 (max_tokens-1), body=%s", budget, string(out)) + } +} + +func TestNormalizeClaudeBudget_DisablesThinkingWhenUnsatisfiable(t *testing.T) { + a := &Applier{} + modelInfo := &registry.ModelInfo{ + MaxCompletionTokens: 1000, + Thinking: &registry.ThinkingSupport{Min: 1024, Max: 128000}, + } + body := []byte(`{"max_tokens":500,"thinking":{"type":"enabled","budget_tokens":2000}}`) + + out := a.normalizeClaudeBudget(body, 2000, modelInfo) + + if gjson.GetBytes(out, "thinking").Exists() { + t.Fatalf("thinking should be removed when constraints are unsatisfiable, body=%s", string(out)) + } +} + +func TestNormalizeClaudeBudget_NoClamping(t *testing.T) { + a := &Applier{} + modelInfo := &registry.ModelInfo{ + MaxCompletionTokens: 64000, + Thinking: &registry.ThinkingSupport{Min: 1024, Max: 128000}, + } + body := []byte(`{"max_tokens":32000,"thinking":{"type":"enabled","budget_tokens":16000}}`) + + out := a.normalizeClaudeBudget(body, 16000, modelInfo) + + maxTok := gjson.GetBytes(out, "max_tokens").Int() + if maxTok != 32000 { + t.Fatalf("max_tokens should remain 32000, got %d, body=%s", maxTok, string(out)) + } + budget := gjson.GetBytes(out, "thinking.budget_tokens").Int() + if budget != 16000 { + t.Fatalf("budget_tokens should remain 16000, got %d, body=%s", budget, string(out)) + } +} + +func TestNormalizeClaudeBudget_AdjustsBudgetToMaxMinus1(t *testing.T) { + a := &Applier{} + modelInfo := &registry.ModelInfo{ + MaxCompletionTokens: 8192, + Thinking: &registry.ThinkingSupport{Min: 1024, Max: 128000}, + } + body := []byte(`{"max_tokens":8192,"thinking":{"type":"enabled","budget_tokens":10000}}`) + + out := a.normalizeClaudeBudget(body, 10000, modelInfo) + + maxTok := gjson.GetBytes(out, "max_tokens").Int() + if maxTok != 
8192 { + t.Fatalf("max_tokens = %d, want 8192 (unchanged), body=%s", maxTok, string(out)) + } + budget := gjson.GetBytes(out, "thinking.budget_tokens").Int() + if budget != 8191 { + t.Fatalf("budget_tokens = %d, want 8191 (max_tokens-1), body=%s", budget, string(out)) + } +}