From 0ea768011b93de3d45fbc05442e451df55069280 Mon Sep 17 00:00:00 2001 From: zilianpn Date: Tue, 7 Apr 2026 00:24:08 +0800 Subject: [PATCH] fix(auth): honor disable-cooling and enrich no-auth errors --- config.example.yaml | 3 + sdk/api/handlers/handlers.go | 54 ++++- .../handlers/handlers_error_response_test.go | 45 ++++ .../handlers_stream_bootstrap_test.go | 73 +++++++ sdk/cliproxy/auth/conductor.go | 100 ++++++--- sdk/cliproxy/auth/conductor_overrides_test.go | 196 ++++++++++++++++++ 6 files changed, 436 insertions(+), 35 deletions(-) diff --git a/config.example.yaml b/config.example.yaml index 5dd872ea..ce2d0a5a 100644 --- a/config.example.yaml +++ b/config.example.yaml @@ -87,6 +87,9 @@ max-retry-credentials: 0 # Maximum wait time in seconds for a cooled-down credential before triggering a retry. max-retry-interval: 30 +# When true, disable auth/model cooldown scheduling globally (prevents blackout windows after failure states). +disable-cooling: false + # Quota exceeded behavior quota-exceeded: switch-project: true # Whether to automatically switch to another project when a quota is exceeded diff --git a/sdk/api/handlers/handlers.go b/sdk/api/handlers/handlers.go index 28ab970d..1f7996c0 100644 --- a/sdk/api/handlers/handlers.go +++ b/sdk/api/handlers/handlers.go @@ -6,6 +6,7 @@ package handlers import ( "bytes" "encoding/json" + "errors" "fmt" "net/http" "strings" @@ -492,6 +493,7 @@ func (h *BaseAPIHandler) ExecuteWithAuthManager(ctx context.Context, handlerType opts.Metadata = reqMeta resp, err := h.AuthManager.Execute(ctx, providers, req, opts) if err != nil { + err = enrichAuthSelectionError(err, providers, normalizedModel) status := http.StatusInternalServerError if se, ok := err.(interface{ StatusCode() int }); ok && se != nil { if code := se.StatusCode(); code > 0 { @@ -538,6 +540,7 @@ func (h *BaseAPIHandler) ExecuteCountWithAuthManager(ctx context.Context, handle opts.Metadata = reqMeta resp, err := h.AuthManager.ExecuteCount(ctx, providers, req, opts) if err != nil { + err = enrichAuthSelectionError(err, providers, normalizedModel) status := http.StatusInternalServerError if se, ok := err.(interface{ StatusCode() int }); ok && se != nil { if code := se.StatusCode(); code > 0 { @@ -588,6 +591,7 @@ func (h *BaseAPIHandler) ExecuteStreamWithAuthManager(ctx context.Context, handl opts.Metadata = reqMeta streamResult, err := h.AuthManager.ExecuteStream(ctx, providers, req, opts) if err != nil { + err = enrichAuthSelectionError(err, providers, normalizedModel) errChan := make(chan *interfaces.ErrorMessage, 1) status := http.StatusInternalServerError if se, ok := err.(interface{ StatusCode() int }); ok && se != nil { @@ -697,7 +701,7 @@ func (h *BaseAPIHandler) ExecuteStreamWithAuthManager(ctx context.Context, handl chunks = retryResult.Chunks continue outer } - streamErr = retryErr + streamErr = enrichAuthSelectionError(retryErr, providers, normalizedModel) } } @@ -840,6 +844,54 @@ func replaceHeader(dst http.Header, src http.Header) { } } +func enrichAuthSelectionError(err error, providers []string, model string) error { + if err == nil { + return nil + } + + var authErr *coreauth.Error + if !errors.As(err, &authErr) || authErr == nil { + return err + } + + code := strings.TrimSpace(authErr.Code) + if code != "auth_not_found" && code != "auth_unavailable" { + return err + } + + providerText := strings.Join(providers, ",") + if providerText == "" { + providerText = "unknown" + } + modelText := strings.TrimSpace(model) + if modelText == "" { + modelText = "unknown" + } + + baseMessage := strings.TrimSpace(authErr.Message) + if baseMessage == "" { + baseMessage = "no auth available" + } + detail := fmt.Sprintf("%s (providers=%s, model=%s)", baseMessage, providerText, modelText) + + // Clarify the most common alias confusion between Anthropic route names and internal provider keys. + if strings.Contains(","+providerText+",", ",claude,") { + detail += "; check Claude auth/key session and cooldown state via /v0/management/auth-files" + } + + status := authErr.HTTPStatus + if status <= 0 { + status = http.StatusServiceUnavailable + } + + return &coreauth.Error{ + Code: authErr.Code, + Message: detail, + Retryable: authErr.Retryable, + HTTPStatus: status, + } +} + // WriteErrorResponse writes an error message to the response writer using the HTTP status embedded in the message. func (h *BaseAPIHandler) WriteErrorResponse(c *gin.Context, msg *interfaces.ErrorMessage) { status := http.StatusInternalServerError diff --git a/sdk/api/handlers/handlers_error_response_test.go b/sdk/api/handlers/handlers_error_response_test.go index cde4547f..917971c2 100644 --- a/sdk/api/handlers/handlers_error_response_test.go +++ b/sdk/api/handlers/handlers_error_response_test.go @@ -5,10 +5,12 @@ import ( "net/http" "net/http/httptest" "reflect" + "strings" "testing" "github.com/gin-gonic/gin" "github.com/router-for-me/CLIProxyAPI/v6/internal/interfaces" + coreauth "github.com/router-for-me/CLIProxyAPI/v6/sdk/cliproxy/auth" sdkconfig "github.com/router-for-me/CLIProxyAPI/v6/sdk/config" ) @@ -66,3 +68,46 @@ func TestWriteErrorResponse_AddonHeadersEnabled(t *testing.T) { t.Fatalf("X-Request-Id = %#v, want %#v", got, []string{"new-1", "new-2"}) } } + +func TestEnrichAuthSelectionError_DefaultsTo503WithContext(t *testing.T) { + in := &coreauth.Error{Code: "auth_not_found", Message: "no auth available"} + out := enrichAuthSelectionError(in, []string{"claude"}, "claude-sonnet-4-6") + + var got *coreauth.Error + if !errors.As(out, &got) || got == nil { + t.Fatalf("expected coreauth.Error, got %T", out) + } + if got.StatusCode() != http.StatusServiceUnavailable { + t.Fatalf("status = %d, want %d", got.StatusCode(), http.StatusServiceUnavailable) + } + if !strings.Contains(got.Message, "providers=claude") { + t.Fatalf("message missing provider context: %q", got.Message) + } + if !strings.Contains(got.Message, "model=claude-sonnet-4-6") { + t.Fatalf("message missing model context: %q", got.Message) + } + if !strings.Contains(got.Message, "/v0/management/auth-files") { + t.Fatalf("message missing management hint: %q", got.Message) + } +} + +func TestEnrichAuthSelectionError_PreservesExplicitStatus(t *testing.T) { + in := &coreauth.Error{Code: "auth_unavailable", Message: "no auth available", HTTPStatus: http.StatusTooManyRequests} + out := enrichAuthSelectionError(in, []string{"gemini"}, "gemini-2.5-pro") + + var got *coreauth.Error + if !errors.As(out, &got) || got == nil { + t.Fatalf("expected coreauth.Error, got %T", out) + } + if got.StatusCode() != http.StatusTooManyRequests { + t.Fatalf("status = %d, want %d", got.StatusCode(), http.StatusTooManyRequests) + } +} + +func TestEnrichAuthSelectionError_IgnoresOtherErrors(t *testing.T) { + in := errors.New("boom") + out := enrichAuthSelectionError(in, []string{"claude"}, "claude-sonnet-4-6") + if out != in { + t.Fatalf("expected original error to be returned unchanged") + } +} diff --git a/sdk/api/handlers/handlers_stream_bootstrap_test.go b/sdk/api/handlers/handlers_stream_bootstrap_test.go index 61c03332..f357962f 100644 --- a/sdk/api/handlers/handlers_stream_bootstrap_test.go +++ b/sdk/api/handlers/handlers_stream_bootstrap_test.go @@ -2,10 +2,13 @@ package handlers import ( "context" + "errors" "net/http" + "strings" "sync" "testing" + "github.com/router-for-me/CLIProxyAPI/v6/internal/interfaces" "github.com/router-for-me/CLIProxyAPI/v6/internal/registry" coreauth "github.com/router-for-me/CLIProxyAPI/v6/sdk/cliproxy/auth" coreexecutor "github.com/router-for-me/CLIProxyAPI/v6/sdk/cliproxy/executor" @@ -463,6 +466,76 @@ func TestExecuteStreamWithAuthManager_DoesNotRetryAfterFirstByte(t *testing.T) { } } +func TestExecuteStreamWithAuthManager_EnrichesBootstrapRetryAuthUnavailableError(t *testing.T) { + executor := &failOnceStreamExecutor{} + manager := coreauth.NewManager(nil, nil, nil) + manager.RegisterExecutor(executor) + + auth1 := &coreauth.Auth{ + ID: "auth1", + Provider: "codex", + Status: coreauth.StatusActive, + Metadata: map[string]any{"email": "test1@example.com"}, + } + if _, err := manager.Register(context.Background(), auth1); err != nil { + t.Fatalf("manager.Register(auth1): %v", err) + } + + registry.GetGlobalRegistry().RegisterClient(auth1.ID, auth1.Provider, []*registry.ModelInfo{{ID: "test-model"}}) + t.Cleanup(func() { + registry.GetGlobalRegistry().UnregisterClient(auth1.ID) + }) + + handler := NewBaseAPIHandlers(&sdkconfig.SDKConfig{ + Streaming: sdkconfig.StreamingConfig{ + BootstrapRetries: 1, + }, + }, manager) + dataChan, _, errChan := handler.ExecuteStreamWithAuthManager(context.Background(), "openai", "test-model", []byte(`{"model":"test-model"}`), "") + if dataChan == nil || errChan == nil { + t.Fatalf("expected non-nil channels") + } + + var got []byte + for chunk := range dataChan { + got = append(got, chunk...) + } + if len(got) != 0 { + t.Fatalf("expected empty payload, got %q", string(got)) + } + + var gotErr *interfaces.ErrorMessage + for msg := range errChan { + if msg != nil { + gotErr = msg + } + } + if gotErr == nil { + t.Fatalf("expected terminal error") + } + if gotErr.StatusCode != http.StatusServiceUnavailable { + t.Fatalf("status = %d, want %d", gotErr.StatusCode, http.StatusServiceUnavailable) + } + + var authErr *coreauth.Error + if !errors.As(gotErr.Error, &authErr) || authErr == nil { + t.Fatalf("expected coreauth.Error, got %T", gotErr.Error) + } + if authErr.Code != "auth_unavailable" { + t.Fatalf("code = %q, want %q", authErr.Code, "auth_unavailable") + } + if !strings.Contains(authErr.Message, "providers=codex") { + t.Fatalf("message missing provider context: %q", authErr.Message) + } + if !strings.Contains(authErr.Message, "model=test-model") { + t.Fatalf("message missing model context: %q", authErr.Message) + } + + if executor.Calls() != 1 { + t.Fatalf("expected exactly one upstream call before retry path selection failure, got %d", executor.Calls()) + } +} + func TestExecuteStreamWithAuthManager_PinnedAuthKeepsSameUpstream(t *testing.T) { executor := &authAwareStreamExecutor{} manager := coreauth.NewManager(nil, nil, nil) diff --git a/sdk/cliproxy/auth/conductor.go b/sdk/cliproxy/auth/conductor.go index 478c7921..f5f7a60a 100644 --- a/sdk/cliproxy/auth/conductor.go +++ b/sdk/cliproxy/auth/conductor.go @@ -1838,6 +1838,7 @@ func (m *Manager) MarkResult(ctx context.Context, result Result) { } else { if result.Model != "" { if !isRequestScopedNotFoundResultError(result.Error) { + disableCooling := quotaCooldownDisabledForAuth(auth) state := ensureModelState(auth, result.Model) state.Unavailable = true state.Status = StatusError @@ -1858,31 +1859,45 @@ func (m *Manager) MarkResult(ctx context.Context, result Result) { } else { switch statusCode { case 401: - next := now.Add(30 * time.Minute) - state.NextRetryAfter = next - suspendReason = "unauthorized" - shouldSuspendModel = true + if disableCooling { + state.NextRetryAfter = time.Time{} + } else { + next := now.Add(30 * time.Minute) + state.NextRetryAfter = next + suspendReason = "unauthorized" + shouldSuspendModel = true + } case 402, 403: - next := now.Add(30 * time.Minute) - state.NextRetryAfter = next - suspendReason = "payment_required" - shouldSuspendModel = true + if disableCooling { + state.NextRetryAfter = time.Time{} + } else { + next := now.Add(30 * time.Minute) + state.NextRetryAfter = next + suspendReason = "payment_required" + shouldSuspendModel = true + } case 404: - next := now.Add(12 * time.Hour) - state.NextRetryAfter = next - suspendReason = "not_found" - shouldSuspendModel = true + if disableCooling { + state.NextRetryAfter = time.Time{} + } else { + next := now.Add(12 * time.Hour) + state.NextRetryAfter = next + suspendReason = "not_found" + shouldSuspendModel = true + } case 429: var next time.Time backoffLevel := state.Quota.BackoffLevel - if result.RetryAfter != nil { - next = now.Add(*result.RetryAfter) - } else { - cooldown, nextLevel := nextQuotaCooldown(backoffLevel, quotaCooldownDisabledForAuth(auth)) - if cooldown > 0 { - next = now.Add(cooldown) + if !disableCooling { + if result.RetryAfter != nil { + next = now.Add(*result.RetryAfter) + } else { + cooldown, nextLevel := nextQuotaCooldown(backoffLevel, disableCooling) + if cooldown > 0 { + next = now.Add(cooldown) + } + backoffLevel = nextLevel } - backoffLevel = nextLevel } state.NextRetryAfter = next state.Quota = QuotaState{ @@ -1891,11 +1906,13 @@ func (m *Manager) MarkResult(ctx context.Context, result Result) { NextRecoverAt: next, BackoffLevel: backoffLevel, } - suspendReason = "quota" - shouldSuspendModel = true - setModelQuota = true + if !disableCooling { + suspendReason = "quota" + shouldSuspendModel = true + setModelQuota = true + } case 408, 500, 502, 503, 504: - if quotaCooldownDisabledForAuth(auth) { + if disableCooling { state.NextRetryAfter = time.Time{} } else { next := now.Add(1 * time.Minute) @@ -2211,6 +2228,7 @@ func applyAuthFailureState(auth *Auth, resultErr *Error, retryAfter *time.Durati if isRequestScopedNotFoundResultError(resultErr) { return } + disableCooling := quotaCooldownDisabledForAuth(auth) auth.Unavailable = true auth.Status = StatusError auth.UpdatedAt = now @@ -2224,32 +2242,46 @@ func applyAuthFailureState(auth *Auth, resultErr *Error, retryAfter *time.Durati switch statusCode { case 401: auth.StatusMessage = "unauthorized" - auth.NextRetryAfter = now.Add(30 * time.Minute) + if disableCooling { + auth.NextRetryAfter = time.Time{} + } else { + auth.NextRetryAfter = now.Add(30 * time.Minute) + } case 402, 403: auth.StatusMessage = "payment_required" - auth.NextRetryAfter = now.Add(30 * time.Minute) + if disableCooling { + auth.NextRetryAfter = time.Time{} + } else { + auth.NextRetryAfter = now.Add(30 * time.Minute) + } case 404: auth.StatusMessage = "not_found" - auth.NextRetryAfter = now.Add(12 * time.Hour) + if disableCooling { + auth.NextRetryAfter = time.Time{} + } else { + auth.NextRetryAfter = now.Add(12 * time.Hour) + } case 429: auth.StatusMessage = "quota exhausted" auth.Quota.Exceeded = true auth.Quota.Reason = "quota" var next time.Time - if retryAfter != nil { - next = now.Add(*retryAfter) - } else { - cooldown, nextLevel := nextQuotaCooldown(auth.Quota.BackoffLevel, quotaCooldownDisabledForAuth(auth)) - if cooldown > 0 { - next = now.Add(cooldown) + if !disableCooling { + if retryAfter != nil { + next = now.Add(*retryAfter) + } else { + cooldown, nextLevel := nextQuotaCooldown(auth.Quota.BackoffLevel, disableCooling) + if cooldown > 0 { + next = now.Add(cooldown) + } + auth.Quota.BackoffLevel = nextLevel } - auth.Quota.BackoffLevel = nextLevel } auth.Quota.NextRecoverAt = next auth.NextRetryAfter = next case 408, 500, 502, 503, 504: auth.StatusMessage = "transient upstream error" - if quotaCooldownDisabledForAuth(auth) { + if disableCooling { auth.NextRetryAfter = time.Time{} } else { auth.NextRetryAfter = now.Add(1 * time.Minute) diff --git a/sdk/cliproxy/auth/conductor_overrides_test.go b/sdk/cliproxy/auth/conductor_overrides_test.go index 50915ce0..0c72c833 100644 --- a/sdk/cliproxy/auth/conductor_overrides_test.go +++ b/sdk/cliproxy/auth/conductor_overrides_test.go @@ -180,6 +180,34 @@ func (e *authFallbackExecutor) StreamCalls() []string { return out } +type retryAfterStatusError struct { + status int + message string + retryAfter time.Duration +} + +func (e *retryAfterStatusError) Error() string { + if e == nil { + return "" + } + return e.message +} + +func (e *retryAfterStatusError) StatusCode() int { + if e == nil { + return 0 + } + return e.status +} + +func (e *retryAfterStatusError) RetryAfter() *time.Duration { + if e == nil { + return nil + } + d := e.retryAfter + return &d +} + func newCredentialRetryLimitTestManager(t *testing.T, maxRetryCredentials int) (*Manager, *credentialRetryLimitExecutor) { t.Helper() @@ -450,6 +478,174 @@ func TestManager_MarkResult_RespectsAuthDisableCoolingOverride(t *testing.T) { } } +func TestManager_MarkResult_RespectsAuthDisableCoolingOverride_On403(t *testing.T) { + prev := quotaCooldownDisabled.Load() + quotaCooldownDisabled.Store(false) + t.Cleanup(func() { quotaCooldownDisabled.Store(prev) }) + + m := NewManager(nil, nil, nil) + + auth := &Auth{ + ID: "auth-403", + Provider: "claude", + Metadata: map[string]any{ + "disable_cooling": true, + }, + } + if _, errRegister := m.Register(context.Background(), auth); errRegister != nil { + t.Fatalf("register auth: %v", errRegister) + } + + model := "test-model-403" + reg := registry.GetGlobalRegistry() + reg.RegisterClient(auth.ID, "claude", []*registry.ModelInfo{{ID: model}}) + t.Cleanup(func() { reg.UnregisterClient(auth.ID) }) + + m.MarkResult(context.Background(), Result{ + AuthID: auth.ID, + Provider: "claude", + Model: model, + Success: false, + Error: &Error{HTTPStatus: http.StatusForbidden, Message: "forbidden"}, + }) + + updated, ok := m.GetByID(auth.ID) + if !ok || updated == nil { + t.Fatalf("expected auth to be present") + } + state := updated.ModelStates[model] + if state == nil { + t.Fatalf("expected model state to be present") + } + if !state.NextRetryAfter.IsZero() { + t.Fatalf("expected NextRetryAfter to be zero when disable_cooling=true, got %v", state.NextRetryAfter) + } + + if count := reg.GetModelCount(model); count <= 0 { + t.Fatalf("expected model count > 0 when disable_cooling=true, got %d", count) + } +} + +func TestManager_Execute_DisableCooling_DoesNotBlackoutAfter403(t *testing.T) { + prev := quotaCooldownDisabled.Load() + quotaCooldownDisabled.Store(false) + t.Cleanup(func() { quotaCooldownDisabled.Store(prev) }) + + m := NewManager(nil, nil, nil) + executor := &authFallbackExecutor{ + id: "claude", + executeErrors: map[string]error{ + "auth-403-exec": &Error{ + HTTPStatus: http.StatusForbidden, + Message: "forbidden", + }, + }, + } + m.RegisterExecutor(executor) + + auth := &Auth{ + ID: "auth-403-exec", + Provider: "claude", + Metadata: map[string]any{ + "disable_cooling": true, + }, + } + if _, errRegister := m.Register(context.Background(), auth); errRegister != nil { + t.Fatalf("register auth: %v", errRegister) + } + + model := "test-model-403-exec" + reg := registry.GetGlobalRegistry() + reg.RegisterClient(auth.ID, "claude", []*registry.ModelInfo{{ID: model}}) + t.Cleanup(func() { reg.UnregisterClient(auth.ID) }) + + req := cliproxyexecutor.Request{Model: model} + _, errExecute1 := m.Execute(context.Background(), []string{"claude"}, req, cliproxyexecutor.Options{}) + if errExecute1 == nil { + t.Fatal("expected first execute error") + } + if statusCodeFromError(errExecute1) != http.StatusForbidden { + t.Fatalf("first execute status = %d, want %d", statusCodeFromError(errExecute1), http.StatusForbidden) + } + + _, errExecute2 := m.Execute(context.Background(), []string{"claude"}, req, cliproxyexecutor.Options{}) + if errExecute2 == nil { + t.Fatal("expected second execute error") + } + if statusCodeFromError(errExecute2) != http.StatusForbidden { + t.Fatalf("second execute status = %d, want %d", statusCodeFromError(errExecute2), http.StatusForbidden) + } +} + +func TestManager_Execute_DisableCooling_DoesNotBlackoutAfter429RetryAfter(t *testing.T) { + prev := quotaCooldownDisabled.Load() + quotaCooldownDisabled.Store(false) + t.Cleanup(func() { quotaCooldownDisabled.Store(prev) }) + + m := NewManager(nil, nil, nil) + executor := &authFallbackExecutor{ + id: "claude", + executeErrors: map[string]error{ + "auth-429-exec": &retryAfterStatusError{ + status: http.StatusTooManyRequests, + message: "quota exhausted", + retryAfter: 2 * time.Minute, + }, + }, + } + m.RegisterExecutor(executor) + + auth := &Auth{ + ID: "auth-429-exec", + Provider: "claude", + Metadata: map[string]any{ + "disable_cooling": true, + }, + } + if _, errRegister := m.Register(context.Background(), auth); errRegister != nil { + t.Fatalf("register auth: %v", errRegister) + } + + model := "test-model-429-exec" + reg := registry.GetGlobalRegistry() + reg.RegisterClient(auth.ID, "claude", []*registry.ModelInfo{{ID: model}}) + t.Cleanup(func() { reg.UnregisterClient(auth.ID) }) + + req := cliproxyexecutor.Request{Model: model} + _, errExecute1 := m.Execute(context.Background(), []string{"claude"}, req, cliproxyexecutor.Options{}) + if errExecute1 == nil { + t.Fatal("expected first execute error") + } + if statusCodeFromError(errExecute1) != http.StatusTooManyRequests { + t.Fatalf("first execute status = %d, want %d", statusCodeFromError(errExecute1), http.StatusTooManyRequests) + } + + _, errExecute2 := m.Execute(context.Background(), []string{"claude"}, req, cliproxyexecutor.Options{}) + if errExecute2 == nil { + t.Fatal("expected second execute error") + } + if statusCodeFromError(errExecute2) != http.StatusTooManyRequests { + t.Fatalf("second execute status = %d, want %d", statusCodeFromError(errExecute2), http.StatusTooManyRequests) + } + + calls := executor.ExecuteCalls() + if len(calls) != 2 { + t.Fatalf("execute calls = %d, want 2", len(calls)) + } + + updated, ok := m.GetByID(auth.ID) + if !ok || updated == nil { + t.Fatalf("expected auth to be present") + } + state := updated.ModelStates[model] + if state == nil { + t.Fatalf("expected model state to be present") + } + if !state.NextRetryAfter.IsZero() { + t.Fatalf("expected NextRetryAfter to be zero when disable_cooling=true, got %v", state.NextRetryAfter) + } +} + func TestManager_MarkResult_RequestScopedNotFoundDoesNotCooldownAuth(t *testing.T) { m := NewManager(nil, nil, nil)