fix(copilot): use dynamic API limits to prevent prompt token overflow

The Copilot API enforces per-account prompt token limits (128K individual, 168K business) that differ from the static 200K context length advertised by the proxy. This mismatch caused Claude Code to accumulate context beyond the actual limit, triggering "prompt token count exceeds the limit of 128000" errors.

Changes:
- Extract max_prompt_tokens and max_output_tokens from the Copilot /models API response (capabilities.limits) and use them as the authoritative ContextLength and MaxCompletionTokens values
- Add CopilotModelLimits struct and Limits() helper to parse limits from the existing Capabilities map
- Fix GitLab Duo context-1m beta header not being set when routing through the Anthropic gateway (the gitlab_duo_force_context_1m attr was set, but only gin headers were checked)
- Fix flaky parallel tests that shared global model registry state
2026-04-03 19:21:17 +00:00 · 2026-04-03 23:54:17 +08:00
parent b849bf79d6
commit 87bf0b73d5
4 changed files with 202 additions and 2 deletions
--- a/internal/auth/copilot/copilot_auth.go
+++ b/internal/auth/copilot/copilot_auth.go
@@ -235,6 +235,74 @@ type CopilotModelEntry struct {
 	Capabilities map[string]any `json:"capabilities,omitempty"`
 }

+// CopilotModelLimits holds the token limits returned by the Copilot /models API
+// under capabilities.limits. They vary by account type (individual vs business)
+// and are authoritative for prompt size; a field is 0 when the API omits it.
+type CopilotModelLimits struct {
+	// MaxContextWindowTokens ("max_context_window_tokens") is the total context window (prompt + output).
+	MaxContextWindowTokens int
+	// MaxPromptTokens ("max_prompt_tokens") is the hard limit on input/prompt tokens.
+	// Exceeding this triggers a 400 error from the Copilot API.
+	MaxPromptTokens int
+	// MaxOutputTokens ("max_output_tokens") is the maximum number of output/completion tokens.
+	MaxOutputTokens int
+}
+
+// Limits extracts the token limits from the model's capabilities map.
+// Returns nil if "limits" is missing, is not a JSON object, or has no non-zero field.
+//
+// Expected Copilot API shape:
+//
+//	"capabilities": {
+//	    "limits": {
+//	        "max_context_window_tokens": 200000,
+//	        "max_prompt_tokens": 168000,
+//	        "max_output_tokens": 32000
+//	    }
+//	}
+func (e *CopilotModelEntry) Limits() *CopilotModelLimits {
+	if e.Capabilities == nil { // NOTE(review): e itself is dereferenced here, so a nil receiver would panic
+		return nil
+	}
+	limitsRaw, ok := e.Capabilities["limits"]
+	if !ok {
+		return nil
+	}
+	limitsMap, ok := limitsRaw.(map[string]any) // a JSON object decoded into any arrives as map[string]any
+	if !ok {
+		return nil
+	}
+
+	result := &CopilotModelLimits{ // a missing key indexes as nil, which anyToInt maps to 0
+		MaxContextWindowTokens: anyToInt(limitsMap["max_context_window_tokens"]),
+		MaxPromptTokens:        anyToInt(limitsMap["max_prompt_tokens"]),
+		MaxOutputTokens:        anyToInt(limitsMap["max_output_tokens"]),
+	}
+
+	// Only return if at least one field is populated, so an all-zero result is reported the same as absent limits.
+	if result.MaxContextWindowTokens == 0 && result.MaxPromptTokens == 0 && result.MaxOutputTokens == 0 {
+		return nil
+	}
+	return result
+}
+
+// anyToInt converts a JSON-decoded numeric value to int, returning 0 for nil or any type not listed below.
+// Go's encoding/json decodes numbers into float64 when the target is any/interface{}.
+func anyToInt(v any) int {
+	switch n := v.(type) {
+	case float64: // the usual case for values coming out of encoding/json
+		return int(n) // conversion drops any fractional part
+	case float32:
+		return int(n)
+	case int:
+		return n
+	case int64:
+		return int(n)
+	default: // nil, strings, json.Number, and other integer widths all land here
+		return 0
+	}
+}
+
 // CopilotModelsResponse represents the response from the Copilot /models endpoint.
 type CopilotModelsResponse struct {
 	Data   []CopilotModelEntry `json:"data"`
--- a/internal/runtime/executor/claude_executor.go
+++ b/internal/runtime/executor/claude_executor.go
@@ -827,6 +827,14 @@ func applyClaudeHeaders(r *http.Request, auth *cliproxyauth.Auth, apiKey string,
 			hasClaude1MHeader = true
 		}
 	}
+	// Also check auth attributes — GitLab Duo sets gitlab_duo_force_context_1m
+	// when routing through the Anthropic gateway, but the gin headers won't have
+	// X-CPA-CLAUDE-1M because the request is internally constructed.
+	if !hasClaude1MHeader && auth != nil && auth.Attributes != nil {
+		if auth.Attributes["gitlab_duo_force_context_1m"] == "true" {
+			hasClaude1MHeader = true
+		}
+	}

 	// Merge extra betas from request body and request flags.
 	if len(extraBetas) > 0 || hasClaude1MHeader {
--- a/internal/runtime/executor/github_copilot_executor.go
+++ b/internal/runtime/executor/github_copilot_executor.go
@@ -1626,6 +1626,21 @@ func FetchGitHubCopilotModels(ctx context.Context, auth *cliproxyauth.Auth, cfg
 			m.MaxCompletionTokens = defaultCopilotMaxCompletionTokens
 		}

+		// Override with real limits from the Copilot API when available.
+		// The API returns per-account limits (individual vs business) under
+		// capabilities.limits, which are more accurate than our static
+		// fallback values. We use max_prompt_tokens as ContextLength because
+		// that's the hard limit the Copilot API enforces on prompt size —
+		// exceeding it triggers "prompt token count exceeds the limit" errors.
+		if limits := entry.Limits(); limits != nil {
+			if limits.MaxPromptTokens > 0 {
+				m.ContextLength = limits.MaxPromptTokens
+			}
+			if limits.MaxOutputTokens > 0 {
+				m.MaxCompletionTokens = limits.MaxOutputTokens
+			}
+		}
+
 		models = append(models, m)
 	}

--- a/internal/runtime/executor/github_copilot_executor_test.go
+++ b/internal/runtime/executor/github_copilot_executor_test.go
@@ -6,6 +6,7 @@ import (
 	"strings"
 	"testing"

+	copilotauth "github.com/router-for-me/CLIProxyAPI/v6/internal/auth/copilot"
 	"github.com/router-for-me/CLIProxyAPI/v6/internal/registry"
 	cliproxyexecutor "github.com/router-for-me/CLIProxyAPI/v6/sdk/cliproxy/executor"
 	sdktranslator "github.com/router-for-me/CLIProxyAPI/v6/sdk/translator"
@@ -74,7 +75,7 @@ func TestUseGitHubCopilotResponsesEndpoint_CodexModel(t *testing.T) {
 }

 func TestUseGitHubCopilotResponsesEndpoint_RegistryResponsesOnlyModel(t *testing.T) {
-	t.Parallel()
+	// Not parallel: shares global model registry with DynamicRegistryWinsOverStatic.
 	if !useGitHubCopilotResponsesEndpoint(sdktranslator.FromString("openai"), "gpt-5.4") {
 		t.Fatal("expected responses-only registry model to use /responses")
 	}
@@ -84,7 +85,7 @@ func TestUseGitHubCopilotResponsesEndpoint_RegistryResponsesOnlyModel(t *testing
 }

 func TestUseGitHubCopilotResponsesEndpoint_DynamicRegistryWinsOverStatic(t *testing.T) {
-	t.Parallel()
+	// Not parallel: mutates global model registry, conflicts with RegistryResponsesOnlyModel.

 	reg := registry.GetGlobalRegistry()
 	clientID := "github-copilot-test-client"
@@ -706,3 +707,111 @@ func TestStripUnsupportedBetas_AllBetasStripped(t *testing.T) {
 		t.Fatal("betas field should be deleted when all betas are stripped")
 	}
 }
+
+func TestCopilotModelEntry_Limits(t *testing.T) {
+	t.Parallel()
+
+	tests := []struct { // each case: a capabilities map and what CopilotModelEntry.Limits should return for it
+		name         string
+		capabilities map[string]any
+		wantNil      bool
+		wantPrompt   int
+		wantOutput   int
+		wantContext  int // 0 means MaxContextWindowTokens is not asserted for this case
+	}{
+		{
+			name:         "nil capabilities",
+			capabilities: nil,
+			wantNil:      true,
+		},
+		{
+			name:         "no limits key",
+			capabilities: map[string]any{"family": "claude-opus-4.6"},
+			wantNil:      true,
+		},
+		{
+			name:         "limits is not a map",
+			capabilities: map[string]any{"limits": "invalid"},
+			wantNil:      true,
+		},
+		{
+			name: "all zero values",
+			capabilities: map[string]any{
+				"limits": map[string]any{
+					"max_context_window_tokens": float64(0), // float64 mirrors how encoding/json decodes numbers into any
+					"max_prompt_tokens":         float64(0),
+					"max_output_tokens":         float64(0),
+				},
+			},
+			wantNil: true,
+		},
+		{
+			name: "individual account limits (128K prompt)",
+			capabilities: map[string]any{
+				"limits": map[string]any{
+					"max_context_window_tokens": float64(144000),
+					"max_prompt_tokens":         float64(128000),
+					"max_output_tokens":         float64(64000),
+				},
+			},
+			wantNil:     false,
+			wantPrompt:  128000,
+			wantOutput:  64000,
+			wantContext: 144000,
+		},
+		{
+			name: "business account limits (168K prompt)",
+			capabilities: map[string]any{
+				"limits": map[string]any{
+					"max_context_window_tokens": float64(200000),
+					"max_prompt_tokens":         float64(168000),
+					"max_output_tokens":         float64(32000),
+				},
+			},
+			wantNil:     false,
+			wantPrompt:  168000,
+			wantOutput:  32000,
+			wantContext: 200000,
+		},
+		{
+			name: "partial limits (only prompt)",
+			capabilities: map[string]any{
+				"limits": map[string]any{
+					"max_prompt_tokens": float64(128000),
+				},
+			},
+			wantNil:    false,
+			wantPrompt: 128000,
+			wantOutput: 0,
+		},
+	}
+
+	for _, tt := range tests {
+		t.Run(tt.name, func(t *testing.T) { // NOTE(review): parallel closure captures tt; needs Go 1.22+ per-iteration loop vars — confirm go.mod
+			t.Parallel()
+			entry := copilotauth.CopilotModelEntry{
+				ID:           "claude-opus-4.6",
+				Capabilities: tt.capabilities,
+			}
+			limits := entry.Limits()
+			if tt.wantNil {
+				if limits != nil {
+					t.Fatalf("expected nil limits, got %+v", limits)
+				}
+				return
+			}
+			if limits == nil { // Fatal stops the subtest so the field checks below never dereference nil
+				t.Fatal("expected non-nil limits, got nil")
+			}
+			if limits.MaxPromptTokens != tt.wantPrompt {
+				t.Errorf("MaxPromptTokens = %d, want %d", limits.MaxPromptTokens, tt.wantPrompt)
+			}
+			if limits.MaxOutputTokens != tt.wantOutput {
+				t.Errorf("MaxOutputTokens = %d, want %d", limits.MaxOutputTokens, tt.wantOutput)
+			}
+			if tt.wantContext > 0 && limits.MaxContextWindowTokens != tt.wantContext { // skipped when wantContext is 0, e.g. the partial-limits case
+				t.Errorf("MaxContextWindowTokens = %d, want %d", limits.MaxContextWindowTokens, tt.wantContext)
+			}
+		})
+	}
+}