diff --git a/sdk/cliproxy/auth/conductor.go b/sdk/cliproxy/auth/conductor.go
index 9fc65274..1152bca0 100644
--- a/sdk/cliproxy/auth/conductor.go
+++ b/sdk/cliproxy/auth/conductor.go
@@ -233,23 +233,19 @@ func (m *Manager) RefreshSchedulerEntry(authID string) {
 	m.scheduler.upsertAuth(snapshot)
 }
 
-// ReconcileRegistryModelStates clears stale per-model runtime failures for
-// models that are currently registered for the auth in the global model registry.
+// ReconcileRegistryModelStates aligns per-model runtime state with the current
+// registry snapshot for one auth.
 //
-// This keeps the scheduler and the global registry aligned after model
-// re-registration. Without this reconciliation, a model can reappear in
-// /v1/models after registry refresh while the scheduler still blocks it because
-// auth.ModelStates retained an older failure such as not_found or quota.
+// Supported models are reset to a clean state because re-registration already
+// cleared the registry-side cooldown/suspension snapshot. ModelStates for models
+// missing from the registry (all of them when it lists none for the auth) are
+// pruned entirely so renamed/removed models cannot keep auth-level status stale.
 func (m *Manager) ReconcileRegistryModelStates(ctx context.Context, authID string) {
 	if m == nil || authID == "" {
 		return
 	}
 
 	supportedModels := registry.GetGlobalRegistry().GetModelsForClient(authID)
-	if len(supportedModels) == 0 {
-		return
-	}
-
 	supported := make(map[string]struct{}, len(supportedModels))
 	for _, model := range supportedModels {
 		if model == nil {
@@ -261,9 +257,6 @@ func (m *Manager) ReconcileRegistryModelStates(ctx context.Context, authID strin
 		}
 		supported[modelKey] = struct{}{}
 	}
-	if len(supported) == 0 {
-		return
-	}
 
 	var snapshot *Auth
 	now := time.Now()
@@ -273,14 +266,19 @@ func (m *Manager) ReconcileRegistryModelStates(ctx context.Context, authID strin
 	if ok && auth != nil && len(auth.ModelStates) > 0 {
 		changed := false
 		for modelKey, state := range auth.ModelStates {
-			if state == nil {
-				continue
-			}
 			baseModel := canonicalModelKey(modelKey)
 			if baseModel == "" {
 				baseModel = strings.TrimSpace(modelKey)
 			}
 			if _, supportedModel := supported[baseModel]; !supportedModel {
+				// Drop state for models missing from the current registry snapshot; deleting
+				// during range is safe in Go. Stale entries would leak errors into auth-level
+				// status, management output, and websocket fallback checks.
+				delete(auth.ModelStates, modelKey)
+				changed = true
+				continue
+			}
+			if state == nil {
 				continue
 			}
 			if modelStateIsClean(state) {
@@ -289,6 +287,9 @@ func (m *Manager) ReconcileRegistryModelStates(ctx context.Context, authID strin
 			resetModelState(state, now)
 			changed = true
 		}
+		if len(auth.ModelStates) == 0 {
+			auth.ModelStates = nil
+		}
 		if changed {
 			updateAggregatedAvailability(auth, now)
 			if !hasModelError(auth, now) {
@@ -297,7 +298,9 @@ func (m *Manager) ReconcileRegistryModelStates(ctx context.Context, authID strin
 				auth.Status = StatusActive
 			}
 			auth.UpdatedAt = now
-			_ = m.persist(ctx, auth)
+			if errPersist := m.persist(ctx, auth); errPersist != nil {
+				logEntryWithRequestID(ctx).WithField("auth_id", auth.ID).Warnf("failed to persist auth changes during model state reconciliation: %v", errPersist)
+			}
 			snapshot = auth.Clone()
 		}
 	}
@@ -1827,7 +1830,11 @@ func modelStateIsClean(state *ModelState) bool {
 }
 
 func updateAggregatedAvailability(auth *Auth, now time.Time) {
-	if auth == nil || len(auth.ModelStates) == 0 {
+	if auth == nil {
+		return
+	}
+	if len(auth.ModelStates) == 0 {
+		clearAggregatedAvailability(auth)
 		return
 	}
 	allUnavailable := true
@@ -1835,10 +1842,12 @@ func updateAggregatedAvailability(auth *Auth, now time.Time) {
 	quotaExceeded := false
 	quotaRecover := time.Time{}
 	maxBackoffLevel := 0
+	hasState := false
 	for _, state := range auth.ModelStates {
 		if state == nil {
 			continue
 		}
+		hasState = true
 		stateUnavailable := false
 		if state.Status == StatusDisabled {
 			stateUnavailable = true
@@ -1868,6 +1877,10 @@ func updateAggregatedAvailability(auth *Auth, now time.Time) {
 			}
 		}
 	}
+	if !hasState {
+		clearAggregatedAvailability(auth)
+		return
+	}
 	auth.Unavailable = allUnavailable
 	if allUnavailable {
 		auth.NextRetryAfter = earliestRetry
@@ -1887,6 +1900,15 @@ func updateAggregatedAvailability(auth *Auth, now time.Time) {
 	}
 }
 
+func clearAggregatedAvailability(auth *Auth) {
+	if auth == nil {
+		return
+	}
+	auth.Unavailable = false
+	auth.NextRetryAfter = time.Time{}
+	auth.Quota = QuotaState{}
+}
+
 func hasModelError(auth *Auth, now time.Time) bool {
 	if auth == nil || len(auth.ModelStates) == 0 {
 		return false
diff --git a/sdk/cliproxy/auth/conductor_registry_reconcile_test.go b/sdk/cliproxy/auth/conductor_registry_reconcile_test.go
new file mode 100644
index 00000000..dc4b95a9
--- /dev/null
+++ b/sdk/cliproxy/auth/conductor_registry_reconcile_test.go
@@ -0,0 +1,182 @@
+package auth
+
+import (
+	"context"
+	"errors"
+	"net/http"
+	"testing"
+	"time"
+
+	cliproxyexecutor "github.com/router-for-me/CLIProxyAPI/v6/sdk/cliproxy/executor"
+)
+
+func TestManager_ReconcileRegistryModelStates_ClearsStaleSupportedModelErrors(t *testing.T) {
+	ctx := context.Background()
+	manager := NewManager(nil, &RoundRobinSelector{}, nil)
+
+	auth := &Auth{
+		ID:       "reconcile-auth",
+		Provider: "codex",
+		ModelStates: map[string]*ModelState{
+			"gpt-5.4": {
+				Status:         StatusError,
+				StatusMessage:  "not_found",
+				Unavailable:    true,
+				NextRetryAfter: time.Now().Add(12 * time.Hour),
+				LastError:      &Error{HTTPStatus: http.StatusNotFound, Message: "not_found"},
+			},
+		},
+	}
+	if _, errRegister := manager.Register(ctx, auth); errRegister != nil {
+		t.Fatalf("register auth: %v", errRegister)
+	}
+
+	registerSchedulerModels(t, "codex", "gpt-5.4", auth.ID)
+	manager.RefreshSchedulerEntry(auth.ID)
+
+	got, errPick := manager.scheduler.pickSingle(ctx, "codex", "gpt-5.4", cliproxyexecutor.Options{}, nil)
+	var authErr *Error
+	if !errors.As(errPick, &authErr) || authErr == nil {
+		t.Fatalf("pickSingle() before reconcile error = %v, want auth_unavailable", errPick)
+	}
+	if authErr.Code != "auth_unavailable" {
+		t.Fatalf("pickSingle() before reconcile code = %q, want %q", authErr.Code, "auth_unavailable")
+	}
+	if got != nil {
+		t.Fatalf("pickSingle() before reconcile auth = %v, want nil", got)
+	}
+
+	manager.ReconcileRegistryModelStates(ctx, auth.ID)
+
+	got, errPick = manager.scheduler.pickSingle(ctx, "codex", "gpt-5.4", cliproxyexecutor.Options{}, nil)
+	if errPick != nil {
+		t.Fatalf("pickSingle() after reconcile error = %v", errPick)
+	}
+	if got == nil || got.ID != auth.ID {
+		t.Fatalf("pickSingle() after reconcile auth = %v, want %q", got, auth.ID)
+	}
+
+	reconciled, ok := manager.GetByID(auth.ID)
+	if !ok || reconciled == nil {
+		t.Fatalf("expected auth to still exist")
+	}
+	state := reconciled.ModelStates["gpt-5.4"]
+	if state == nil {
+		t.Fatalf("expected reconciled model state to exist")
+	}
+	if state.Unavailable {
+		t.Fatalf("state.Unavailable = true, want false")
+	}
+	if state.Status != StatusActive {
+		t.Fatalf("state.Status = %q, want %q", state.Status, StatusActive)
+	}
+	if !state.NextRetryAfter.IsZero() {
+		t.Fatalf("state.NextRetryAfter = %v, want zero", state.NextRetryAfter)
+	}
+	if state.LastError != nil {
+		t.Fatalf("state.LastError = %v, want nil", state.LastError)
+	}
+}
+
+func TestManager_ReconcileRegistryModelStates_PrunesUnsupportedModelStates(t *testing.T) {
+	ctx := context.Background()
+	manager := NewManager(nil, &RoundRobinSelector{}, nil)
+
+	nextRetry := time.Now().Add(30 * time.Minute)
+	auth := &Auth{
+		ID:            "reconcile-unsupported-auth",
+		Provider:      "codex",
+		Status:        StatusError,
+		Unavailable:   true,
+		StatusMessage: "payment_required",
+		LastError:     &Error{HTTPStatus: http.StatusPaymentRequired, Message: "payment_required"},
+		ModelStates: map[string]*ModelState{
+			"gpt-5.4": {
+				Status:         StatusError,
+				StatusMessage:  "payment_required",
+				Unavailable:    true,
+				NextRetryAfter: nextRetry,
+			},
+		},
+	}
+	if _, errRegister := manager.Register(ctx, auth); errRegister != nil {
+		t.Fatalf("register auth: %v", errRegister)
+	}
+
+	registerSchedulerModels(t, "codex", "gpt-5.5", auth.ID)
+	manager.ReconcileRegistryModelStates(ctx, auth.ID)
+
+	reconciled, ok := manager.GetByID(auth.ID)
+	if !ok || reconciled == nil {
+		t.Fatalf("expected auth to still exist")
+	}
+	if len(reconciled.ModelStates) != 0 {
+		t.Fatalf("expected stale unsupported model state to be pruned, got %+v", reconciled.ModelStates)
+	}
+	if reconciled.Unavailable {
+		t.Fatalf("auth.Unavailable = true, want false")
+	}
+	if reconciled.Status != StatusActive {
+		t.Fatalf("auth.Status = %q, want %q", reconciled.Status, StatusActive)
+	}
+	if reconciled.StatusMessage != "" {
+		t.Fatalf("auth.StatusMessage = %q, want empty", reconciled.StatusMessage)
+	}
+	if reconciled.LastError != nil {
+		t.Fatalf("auth.LastError = %v, want nil", reconciled.LastError)
+	}
+	if !reconciled.NextRetryAfter.IsZero() {
+		t.Fatalf("auth.NextRetryAfter = %v, want zero", reconciled.NextRetryAfter)
+	}
+}
+
+func TestManager_ReconcileRegistryModelStates_ClearsRemovedModelStateWhenRegistryIsEmpty(t *testing.T) {
+	ctx := context.Background()
+	manager := NewManager(nil, &RoundRobinSelector{}, nil)
+
+	auth := &Auth{
+		ID:            "reconcile-empty-registry-auth",
+		Provider:      "codex",
+		Status:        StatusError,
+		Unavailable:   true,
+		StatusMessage: "not_found",
+		LastError:     &Error{HTTPStatus: http.StatusNotFound, Message: "not_found"},
+		ModelStates: map[string]*ModelState{
+			"gpt-5.4": {
+				Status:         StatusError,
+				StatusMessage:  "not_found",
+				Unavailable:    true,
+				NextRetryAfter: time.Now().Add(12 * time.Hour),
+				LastError:      &Error{HTTPStatus: http.StatusNotFound, Message: "not_found"},
+			},
+		},
+	}
+	if _, errRegister := manager.Register(ctx, auth); errRegister != nil {
+		t.Fatalf("register auth: %v", errRegister)
+	}
+
+	manager.ReconcileRegistryModelStates(ctx, auth.ID)
+
+	reconciled, ok := manager.GetByID(auth.ID)
+	if !ok || reconciled == nil {
+		t.Fatalf("expected auth to still exist")
+	}
+	if len(reconciled.ModelStates) != 0 {
+		t.Fatalf("expected stale model state to be pruned when registry is empty, got %+v", reconciled.ModelStates)
+	}
+	if reconciled.Unavailable {
+		t.Fatalf("auth.Unavailable = true, want false")
+	}
+	if reconciled.Status != StatusActive {
+		t.Fatalf("auth.Status = %q, want %q", reconciled.Status, StatusActive)
+	}
+	if reconciled.StatusMessage != "" {
+		t.Fatalf("auth.StatusMessage = %q, want empty", reconciled.StatusMessage)
+	}
+	if reconciled.LastError != nil {
+		t.Fatalf("auth.LastError = %v, want nil", reconciled.LastError)
+	}
+	if !reconciled.NextRetryAfter.IsZero() {
+		t.Fatalf("auth.NextRetryAfter = %v, want zero", reconciled.NextRetryAfter)
+	}
+}