fix(auth): tighten registry model reconciliation

2026-04-10 00:08:44 +00:00 · 2026-03-14 14:40:06 +00:00
parent e166e56249
commit f09ed25fd3
2 changed files with 222 additions and 18 deletions
--- a/sdk/cliproxy/auth/conductor.go
+++ b/sdk/cliproxy/auth/conductor.go
@@ -233,23 +233,19 @@ func (m *Manager) RefreshSchedulerEntry(authID string) {
 	m.scheduler.upsertAuth(snapshot)
 }

-// ReconcileRegistryModelStates clears stale per-model runtime failures for
-// models that are currently registered for the auth in the global model registry.
+// ReconcileRegistryModelStates aligns per-model runtime state with the current
+// registry snapshot for one auth.
 //
-// This keeps the scheduler and the global registry aligned after model
-// re-registration. Without this reconciliation, a model can reappear in
-// /v1/models after registry refresh while the scheduler still blocks it because
-// auth.ModelStates retained an older failure such as not_found or quota.
+// Supported models are reset to a clean state because re-registration already
+// cleared the registry-side cooldown/suspension snapshot. ModelStates for models
+// missing from the registry (including when none are registered for the auth)
+// are pruned so renamed/removed models cannot keep auth-level status stale.
 func (m *Manager) ReconcileRegistryModelStates(ctx context.Context, authID string) {
 	if m == nil || authID == "" {
 		return
 	}

 	supportedModels := registry.GetGlobalRegistry().GetModelsForClient(authID)
-	if len(supportedModels) == 0 {
-		return
-	}
-
 	supported := make(map[string]struct{}, len(supportedModels))
 	for _, model := range supportedModels {
 		if model == nil {
@@ -261,9 +257,6 @@ func (m *Manager) ReconcileRegistryModelStates(ctx context.Context, authID strin
 		}
 		supported[modelKey] = struct{}{}
 	}
-	if len(supported) == 0 {
-		return
-	}

 	var snapshot *Auth
 	now := time.Now()
@@ -273,14 +266,19 @@ func (m *Manager) ReconcileRegistryModelStates(ctx context.Context, authID strin
 	if ok && auth != nil && len(auth.ModelStates) > 0 {
 		changed := false
 		for modelKey, state := range auth.ModelStates {
-			if state == nil {
-				continue
-			}
 			baseModel := canonicalModelKey(modelKey)
 			if baseModel == "" {
 				baseModel = strings.TrimSpace(modelKey)
 			}
 			if _, supportedModel := supported[baseModel]; !supportedModel {
+				// Drop state for models that disappeared from the current registry
+				// snapshot. Keeping them around leaks stale errors into auth-level
+				// status, management output, and websocket fallback checks.
+				delete(auth.ModelStates, modelKey)
+				changed = true
+				continue
+			}
+			if state == nil {
 				continue
 			}
 			if modelStateIsClean(state) {
@@ -289,6 +287,9 @@ func (m *Manager) ReconcileRegistryModelStates(ctx context.Context, authID strin
 			resetModelState(state, now)
 			changed = true
 		}
+		if len(auth.ModelStates) == 0 {
+			auth.ModelStates = nil
+		}
 		if changed {
 			updateAggregatedAvailability(auth, now)
 			if !hasModelError(auth, now) {
@@ -297,7 +298,9 @@ func (m *Manager) ReconcileRegistryModelStates(ctx context.Context, authID strin
 				auth.Status = StatusActive
 			}
 			auth.UpdatedAt = now
-			_ = m.persist(ctx, auth)
+			if errPersist := m.persist(ctx, auth); errPersist != nil {
+				logEntryWithRequestID(ctx).WithField("auth_id", auth.ID).Warnf("failed to persist auth changes during model state reconciliation: %v", errPersist)
+			}
 			snapshot = auth.Clone()
 		}
 	}
@@ -1827,7 +1830,11 @@ func modelStateIsClean(state *ModelState) bool {
 }

 func updateAggregatedAvailability(auth *Auth, now time.Time) {
-	if auth == nil || len(auth.ModelStates) == 0 {
+	if auth == nil {
+		return
+	}
+	if len(auth.ModelStates) == 0 {
+		clearAggregatedAvailability(auth)
 		return
 	}
 	allUnavailable := true
@@ -1835,10 +1842,12 @@ func updateAggregatedAvailability(auth *Auth, now time.Time) {
 	quotaExceeded := false
 	quotaRecover := time.Time{}
 	maxBackoffLevel := 0
+	hasState := false
 	for _, state := range auth.ModelStates {
 		if state == nil {
 			continue
 		}
+		hasState = true
 		stateUnavailable := false
 		if state.Status == StatusDisabled {
 			stateUnavailable = true
@@ -1868,6 +1877,10 @@ func updateAggregatedAvailability(auth *Auth, now time.Time) {
 			}
 		}
 	}
+	if !hasState {
+		clearAggregatedAvailability(auth)
+		return
+	}
 	auth.Unavailable = allUnavailable
 	if allUnavailable {
 		auth.NextRetryAfter = earliestRetry
@@ -1887,6 +1900,15 @@ func updateAggregatedAvailability(auth *Auth, now time.Time) {
 	}
 }

+func clearAggregatedAvailability(auth *Auth) { // resets auth-level Unavailable, NextRetryAfter and Quota
+	if auth == nil {
+		return // nothing to reset
+	}
+	auth.Unavailable = false          // mark the auth as available again
+	auth.NextRetryAfter = time.Time{} // zero time: no retry deadline recorded
+	auth.Quota = QuotaState{}         // reset Quota to its zero value
+}
+
 func hasModelError(auth *Auth, now time.Time) bool {
 	if auth == nil || len(auth.ModelStates) == 0 {
 		return false
--- a/sdk/cliproxy/auth/conductor_registry_reconcile_test.go
+++ b/sdk/cliproxy/auth/conductor_registry_reconcile_test.go
@@ -0,0 +1,182 @@
+package auth
+
+import (
+	"context"
+	"errors"
+	"net/http"
+	"testing"
+	"time"
+
+	cliproxyexecutor "github.com/router-for-me/CLIProxyAPI/v6/sdk/cliproxy/executor"
+)
+
+func TestManager_ReconcileRegistryModelStates_ClearsStaleSupportedModelErrors(t *testing.T) {
+	ctx := context.Background()
+	manager := NewManager(nil, &RoundRobinSelector{}, nil)
+	// Seed the auth with a failed, unavailable state for gpt-5.4.
+	auth := &Auth{
+		ID:       "reconcile-auth",
+		Provider: "codex",
+		ModelStates: map[string]*ModelState{
+			"gpt-5.4": {
+				Status:         StatusError,
+				StatusMessage:  "not_found",
+				Unavailable:    true,
+				NextRetryAfter: time.Now().Add(12 * time.Hour),
+				LastError:      &Error{HTTPStatus: http.StatusNotFound, Message: "not_found"},
+			},
+		},
+	}
+	if _, errRegister := manager.Register(ctx, auth); errRegister != nil {
+		t.Fatalf("register auth: %v", errRegister)
+	}
+	// Presumably registers gpt-5.4 for this auth (helper is defined elsewhere).
+	registerSchedulerModels(t, "codex", "gpt-5.4", auth.ID)
+	manager.RefreshSchedulerEntry(auth.ID)
+	// Before reconciling, the pick must fail with auth_unavailable and no auth.
+	got, errPick := manager.scheduler.pickSingle(ctx, "codex", "gpt-5.4", cliproxyexecutor.Options{}, nil)
+	var authErr *Error
+	if !errors.As(errPick, &authErr) || authErr == nil {
+		t.Fatalf("pickSingle() before reconcile error = %v, want auth_unavailable", errPick)
+	}
+	if authErr.Code != "auth_unavailable" {
+		t.Fatalf("pickSingle() before reconcile code = %q, want %q", authErr.Code, "auth_unavailable")
+	}
+	if got != nil {
+		t.Fatalf("pickSingle() before reconcile auth = %v, want nil", got)
+	}
+
+	manager.ReconcileRegistryModelStates(ctx, auth.ID)
+	// After reconciling, the same pick must succeed and return this auth.
+	got, errPick = manager.scheduler.pickSingle(ctx, "codex", "gpt-5.4", cliproxyexecutor.Options{}, nil)
+	if errPick != nil {
+		t.Fatalf("pickSingle() after reconcile error = %v", errPick)
+	}
+	if got == nil || got.ID != auth.ID {
+		t.Fatalf("pickSingle() after reconcile auth = %v, want %q", got, auth.ID)
+	}
+	// The stored model state must still exist but carry no failure data.
+	reconciled, ok := manager.GetByID(auth.ID)
+	if !ok || reconciled == nil {
+		t.Fatalf("expected auth to still exist")
+	}
+	state := reconciled.ModelStates["gpt-5.4"]
+	if state == nil {
+		t.Fatalf("expected reconciled model state to exist")
+	}
+	if state.Unavailable {
+		t.Fatalf("state.Unavailable = true, want false")
+	}
+	if state.Status != StatusActive {
+		t.Fatalf("state.Status = %q, want %q", state.Status, StatusActive)
+	}
+	if !state.NextRetryAfter.IsZero() {
+		t.Fatalf("state.NextRetryAfter = %v, want zero", state.NextRetryAfter)
+	}
+	if state.LastError != nil {
+		t.Fatalf("state.LastError = %v, want nil", state.LastError)
+	}
+}
+
+func TestManager_ReconcileRegistryModelStates_PrunesUnsupportedModelStates(t *testing.T) {
+	ctx := context.Background()
+	manager := NewManager(nil, &RoundRobinSelector{}, nil)
+	// Seed an auth-level error plus a failed state for gpt-5.4 only.
+	nextRetry := time.Now().Add(30 * time.Minute)
+	auth := &Auth{
+		ID:            "reconcile-unsupported-auth",
+		Provider:      "codex",
+		Status:        StatusError,
+		Unavailable:   true,
+		StatusMessage: "payment_required",
+		LastError:     &Error{HTTPStatus: http.StatusPaymentRequired, Message: "payment_required"},
+		ModelStates: map[string]*ModelState{
+			"gpt-5.4": {
+				Status:         StatusError,
+				StatusMessage:  "payment_required",
+				Unavailable:    true,
+				NextRetryAfter: nextRetry,
+			},
+		},
+	}
+	if _, errRegister := manager.Register(ctx, auth); errRegister != nil {
+		t.Fatalf("register auth: %v", errRegister)
+	}
+	// Only gpt-5.5 is passed to the helper, so gpt-5.4 is expected to be unsupported.
+	registerSchedulerModels(t, "codex", "gpt-5.5", auth.ID)
+	manager.ReconcileRegistryModelStates(ctx, auth.ID)
+	// The stale state must be gone and the auth-level error fields cleared.
+	reconciled, ok := manager.GetByID(auth.ID)
+	if !ok || reconciled == nil {
+		t.Fatalf("expected auth to still exist")
+	}
+	if len(reconciled.ModelStates) != 0 {
+		t.Fatalf("expected stale unsupported model state to be pruned, got %+v", reconciled.ModelStates)
+	}
+	if reconciled.Unavailable {
+		t.Fatalf("auth.Unavailable = true, want false")
+	}
+	if reconciled.Status != StatusActive {
+		t.Fatalf("auth.Status = %q, want %q", reconciled.Status, StatusActive)
+	}
+	if reconciled.StatusMessage != "" {
+		t.Fatalf("auth.StatusMessage = %q, want empty", reconciled.StatusMessage)
+	}
+	if reconciled.LastError != nil {
+		t.Fatalf("auth.LastError = %v, want nil", reconciled.LastError)
+	}
+	if !reconciled.NextRetryAfter.IsZero() {
+		t.Fatalf("auth.NextRetryAfter = %v, want zero", reconciled.NextRetryAfter)
+	}
+}
+
+func TestManager_ReconcileRegistryModelStates_ClearsRemovedModelStateWhenRegistryIsEmpty(t *testing.T) {
+	ctx := context.Background()
+	manager := NewManager(nil, &RoundRobinSelector{}, nil)
+	// Seed an auth-level error plus a failed state for gpt-5.4.
+	auth := &Auth{
+		ID:            "reconcile-empty-registry-auth",
+		Provider:      "codex",
+		Status:        StatusError,
+		Unavailable:   true,
+		StatusMessage: "not_found",
+		LastError:     &Error{HTTPStatus: http.StatusNotFound, Message: "not_found"},
+		ModelStates: map[string]*ModelState{
+			"gpt-5.4": {
+				Status:         StatusError,
+				StatusMessage:  "not_found",
+				Unavailable:    true,
+				NextRetryAfter: time.Now().Add(12 * time.Hour),
+				LastError:      &Error{HTTPStatus: http.StatusNotFound, Message: "not_found"},
+			},
+		},
+	}
+	if _, errRegister := manager.Register(ctx, auth); errRegister != nil {
+		t.Fatalf("register auth: %v", errRegister)
+	}
+	// This test registers no models for the auth before reconciling.
+	manager.ReconcileRegistryModelStates(ctx, auth.ID)
+	// The state must be pruned and the auth-level error fields cleared.
+	reconciled, ok := manager.GetByID(auth.ID)
+	if !ok || reconciled == nil {
+		t.Fatalf("expected auth to still exist")
+	}
+	if len(reconciled.ModelStates) != 0 {
+		t.Fatalf("expected stale model state to be pruned when registry is empty, got %+v", reconciled.ModelStates)
+	}
+	if reconciled.Unavailable {
+		t.Fatalf("auth.Unavailable = true, want false")
+	}
+	if reconciled.Status != StatusActive {
+		t.Fatalf("auth.Status = %q, want %q", reconciled.Status, StatusActive)
+	}
+	if reconciled.StatusMessage != "" {
+		t.Fatalf("auth.StatusMessage = %q, want empty", reconciled.StatusMessage)
+	}
+	if reconciled.LastError != nil {
+		t.Fatalf("auth.LastError = %v, want nil", reconciled.LastError)
+	}
+	if !reconciled.NextRetryAfter.IsZero() {
+		t.Fatalf("auth.NextRetryAfter = %v, want zero", reconciled.NextRetryAfter)
+	}
+}