feat: add Kiro OAuth web, rate limiter, metrics, fingerprint, background refresh and model converter

This commit is contained in:
781456868@qq.com
2026-01-18 15:04:29 +08:00
parent 93d7883513
commit 0e77e93e5d
37 changed files with 10396 additions and 282 deletions

View File

@@ -7,13 +7,16 @@ import (
"encoding/base64"
"encoding/binary"
"encoding/json"
"errors"
"fmt"
"io"
"net"
"net/http"
"os"
"path/filepath"
"strings"
"sync"
"syscall"
"time"
"github.com/google/uuid"
@@ -53,9 +56,28 @@ const (
kiroIDEUserAgent = "aws-sdk-js/1.0.18 ua/2.1 os/darwin#25.0.0 lang/js md/nodejs#20.16.0 api/codewhispererstreaming#1.0.18 m/E KiroIDE-0.2.13-66c23a8c5d15afabec89ef9954ef52a119f10d369df04d548fc6c1eac694b0d1"
kiroIDEAmzUserAgent = "aws-sdk-js/1.0.18 KiroIDE-0.2.13-66c23a8c5d15afabec89ef9954ef52a119f10d369df04d548fc6c1eac694b0d1"
kiroIDEAgentModeSpec = "spec"
kiroAgentModeVibe = "vibe"
// Socket retry configuration constants (based on kiro2Api reference implementation)
// Maximum number of retry attempts for socket/network errors
kiroSocketMaxRetries = 3
// Base delay between retry attempts (uses exponential backoff: delay * 2^attempt)
kiroSocketBaseRetryDelay = 1 * time.Second
// Maximum delay between retry attempts (cap for exponential backoff)
kiroSocketMaxRetryDelay = 30 * time.Second
// First token timeout for streaming responses (how long to wait for first response)
kiroFirstTokenTimeout = 15 * time.Second
// Streaming read timeout (how long to wait between chunks)
kiroStreamingReadTimeout = 300 * time.Second
)
// retryableHTTPStatusCodes defines HTTP status codes that are considered retryable.
// Based on kiro2Api reference: 502 (Bad Gateway), 503 (Service Unavailable), 504 (Gateway Timeout).
// Deliberately excludes 500/501: those usually indicate a request-level problem
// that a retry will not fix, so callers fall back to their generic backoff path.
var retryableHTTPStatusCodes = map[int]bool{
	502: true, // Bad Gateway - upstream server error
	503: true, // Service Unavailable - server temporarily overloaded
	504: true, // Gateway Timeout - upstream server timeout
}
// Real-time usage estimation configuration
// These control how often usage updates are sent during streaming
var (
@@ -63,6 +85,241 @@ var (
usageUpdateTimeInterval = 15 * time.Second // Or every 15 seconds, whichever comes first
)
// Global FingerprintManager for dynamic User-Agent generation per token
// Each token gets a unique fingerprint on first use, which is cached for subsequent requests
var (
globalFingerprintManager *kiroauth.FingerprintManager
globalFingerprintManagerOnce sync.Once
)
// getGlobalFingerprintManager lazily creates and returns the process-wide
// FingerprintManager used to derive a per-token User-Agent fingerprint.
// Initialization happens exactly once and is safe for concurrent callers
// via sync.Once.
func getGlobalFingerprintManager() *kiroauth.FingerprintManager {
	globalFingerprintManagerOnce.Do(func() {
		mgr := kiroauth.NewFingerprintManager()
		globalFingerprintManager = mgr
		log.Infof("kiro: initialized global FingerprintManager for dynamic UA generation")
	})
	return globalFingerprintManager
}
// retryConfig holds configuration for socket retry logic.
// Based on kiro2Api Python implementation patterns.
// Populated by defaultRetryConfig; not currently loaded from user config.
type retryConfig struct {
	MaxRetries      int           // Maximum number of retry attempts for a single endpoint
	BaseDelay       time.Duration // Base delay between retries (doubled each attempt)
	MaxDelay        time.Duration // Upper cap applied to the exponential backoff
	RetryableErrors []string      // Lowercase substrings matched against err.Error() (see isRetryableError)
	RetryableStatus map[int]bool  // HTTP status codes to retry (502/503/504)
	FirstTokenTmout time.Duration // Timeout for the first token in a streaming response
	StreamReadTmout time.Duration // Timeout between successive stream chunks
}
// defaultRetryConfig returns the default retry configuration for Kiro socket
// operations. Numeric values come from the kiroSocket* constants; the status
// map is the shared retryableHTTPStatusCodes table.
func defaultRetryConfig() retryConfig {
	return retryConfig{
		MaxRetries:      kiroSocketMaxRetries,
		BaseDelay:       kiroSocketBaseRetryDelay,
		MaxDelay:        kiroSocketMaxRetryDelay,
		RetryableStatus: retryableHTTPStatusCodes,
		// Patterns are matched against strings.ToLower(err.Error()) in
		// isRetryableError, so every entry must be lowercase. The original
		// list contained "EOF", which could never match the lowercased
		// message; fixed to "eof".
		RetryableErrors: []string{
			"connection reset",
			"connection refused",
			"broken pipe",
			"eof",
			"timeout",
			"temporary failure",
			"no such host",
			"network is unreachable",
			"i/o timeout",
		},
		FirstTokenTmout: kiroFirstTokenTimeout,
		StreamReadTmout: kiroStreamingReadTimeout,
	}
}
// isRetryableError reports whether err represents a transient failure worth
// retrying: network timeouts, connection resets/refusals, broken pipes,
// unreachable hosts, and peer-closed connections (EOF).
// Context cancellation and deadline expiry are never retryable - they mean
// the caller gave up, not that the upstream hiccuped.
// Based on kiro2Api's retry logic patterns.
func isRetryableError(err error) bool {
	if err == nil {
		return false
	}
	// Check for context cancellation - not retryable
	if errors.Is(err, context.Canceled) || errors.Is(err, context.DeadlineExceeded) {
		return false
	}
	// Check for net.Error (timeout, temporary)
	var netErr net.Error
	if errors.As(err, &netErr) {
		if netErr.Timeout() {
			log.Debugf("kiro: isRetryableError: network timeout detected")
			return true
		}
		// Note: Temporary() is deprecated but still useful for some error types
	}
	// Check for specific syscall errors (connection reset, broken pipe, etc.)
	var syscallErr syscall.Errno
	if errors.As(err, &syscallErr) {
		switch syscallErr {
		case syscall.ECONNRESET: // Connection reset by peer
			log.Debugf("kiro: isRetryableError: ECONNRESET detected")
			return true
		case syscall.ECONNREFUSED: // Connection refused
			log.Debugf("kiro: isRetryableError: ECONNREFUSED detected")
			return true
		case syscall.EPIPE: // Broken pipe
			log.Debugf("kiro: isRetryableError: EPIPE (broken pipe) detected")
			return true
		case syscall.ETIMEDOUT: // Connection timed out
			log.Debugf("kiro: isRetryableError: ETIMEDOUT detected")
			return true
		case syscall.ENETUNREACH: // Network is unreachable
			log.Debugf("kiro: isRetryableError: ENETUNREACH detected")
			return true
		case syscall.EHOSTUNREACH: // No route to host
			log.Debugf("kiro: isRetryableError: EHOSTUNREACH detected")
			return true
		}
	}
	// net.OpError wraps the underlying cause; recurse into it so the typed
	// checks above get a chance at the inner error.
	var opErr *net.OpError
	if errors.As(err, &opErr) {
		log.Debugf("kiro: isRetryableError: net.OpError detected, op=%s", opErr.Op)
		if opErr.Err != nil {
			return isRetryableError(opErr.Err)
		}
		return true
	}
	// Fall back to message matching. The message is lowercased, so lowercase
	// each pattern too - otherwise an uppercase pattern (e.g. "EOF") would
	// silently never match.
	errMsg := strings.ToLower(err.Error())
	cfg := defaultRetryConfig()
	for _, pattern := range cfg.RetryableErrors {
		if strings.Contains(errMsg, strings.ToLower(pattern)) {
			log.Debugf("kiro: isRetryableError: pattern '%s' matched in error: %s", pattern, errMsg)
			return true
		}
	}
	// Bare EOF / unexpected EOF usually means the peer closed the connection mid-exchange.
	if errors.Is(err, io.EOF) || errors.Is(err, io.ErrUnexpectedEOF) {
		log.Debugf("kiro: isRetryableError: EOF/UnexpectedEOF detected")
		return true
	}
	return false
}
// isRetryableHTTPStatus reports whether statusCode is a retryable server error.
// Per the kiro2Api reference, only 502, 503 and 504 qualify; the lookup table
// is retryableHTTPStatusCodes.
func isRetryableHTTPStatus(statusCode int) bool {
	retryable, known := retryableHTTPStatusCodes[statusCode]
	return known && retryable
}
// calculateRetryDelay calculates the delay for the next retry attempt using exponential backoff.
// delay = min(baseDelay * 2^attempt, maxDelay)
// Jitter is applied by the kiroauth helper to prevent thundering herd
// (reportedly ±30% - confirm against kiroauth.ExponentialBackoffWithJitter).
func calculateRetryDelay(attempt int, cfg retryConfig) time.Duration {
	return kiroauth.ExponentialBackoffWithJitter(attempt, cfg.BaseDelay, cfg.MaxDelay)
}
// logRetryAttempt emits a warning describing an upcoming retry: which attempt
// this is (1-based in the output), why it is happening, how long we will wait,
// and which endpoint is involved.
func logRetryAttempt(attempt, maxRetries int, reason string, delay time.Duration, endpoint string) {
	msg := fmt.Sprintf("kiro: retry attempt %d/%d for %s, waiting %v before next attempt (endpoint: %s)",
		attempt+1, maxRetries, reason, delay, endpoint)
	log.Warnf("%s", msg)
}
// kiroHTTPClientPool provides a shared HTTP client with connection pooling for Kiro API.
// This reduces connection overhead and improves performance for concurrent requests.
// Based on kiro2Api's connection pooling pattern.
var (
kiroHTTPClientPool *http.Client
kiroHTTPClientPoolOnce sync.Once
)
// getKiroPooledHTTPClient returns a shared HTTP client backed by a pooled
// transport. The client is built once (sync.Once) and reused by every
// subsequent caller, which:
//   - avoids repeated TCP/TLS handshakes,
//   - enables HTTP/2 multiplexing where the server supports it,
//   - keeps idle keep-alive connections available for reuse.
func getKiroPooledHTTPClient() *http.Client {
	kiroHTTPClientPoolOnce.Do(func() {
		dialer := &net.Dialer{
			Timeout:   30 * time.Second, // TCP connection timeout
			KeepAlive: 30 * time.Second, // TCP keep-alive interval
		}
		transport := &http.Transport{
			// Pool sizing
			MaxIdleConns:        100, // Max idle connections across all hosts
			MaxIdleConnsPerHost: 20,  // Max idle connections per host
			MaxConnsPerHost:     50,  // Max total connections per host
			IdleConnTimeout:     90 * time.Second,
			// Connection establishment
			DialContext:         dialer.DialContext,
			TLSHandshakeTimeout: 10 * time.Second,
			// Per-request phases
			ResponseHeaderTimeout: 30 * time.Second,
			ExpectContinueTimeout: 1 * time.Second,
			// Prefer HTTP/2 when the server offers it
			ForceAttemptHTTP2: true,
		}
		// Deliberately no Client.Timeout here: individual requests bound
		// themselves via their context.
		kiroHTTPClientPool = &http.Client{Transport: transport}
		log.Debugf("kiro: initialized pooled HTTP client (MaxIdleConns=%d, MaxIdleConnsPerHost=%d, MaxConnsPerHost=%d)",
			transport.MaxIdleConns, transport.MaxIdleConnsPerHost, transport.MaxConnsPerHost)
	})
	return kiroHTTPClientPool
}
// newKiroHTTPClientWithPooling returns an HTTP client for Kiro API calls.
// When a proxy is configured (auth-level first, then config-level) it defers
// to the existing proxy-aware client, which does not pool connections.
// Otherwise it hands out the shared pooled client, optionally wrapped with a
// per-call timeout so the shared client itself is never mutated.
func newKiroHTTPClientWithPooling(ctx context.Context, cfg *config.Config, auth *cliproxyauth.Auth, timeout time.Duration) *http.Client {
	// Resolve the effective proxy URL: auth setting wins over global config.
	proxyURL := ""
	if auth != nil {
		proxyURL = strings.TrimSpace(auth.ProxyURL)
	}
	if proxyURL == "" && cfg != nil {
		proxyURL = strings.TrimSpace(cfg.ProxyURL)
	}
	if proxyURL != "" {
		log.Debugf("kiro: using proxy-aware HTTP client (proxy=%s)", proxyURL)
		return newProxyAwareHTTPClient(ctx, cfg, auth, timeout)
	}
	shared := getKiroPooledHTTPClient()
	if timeout <= 0 {
		return shared
	}
	// Reuse the pooled transport but apply the caller's timeout on a wrapper
	// client, leaving the shared instance untouched.
	return &http.Client{
		Transport: shared.Transport,
		Timeout:   timeout,
	}
}
// kiroEndpointConfig bundles endpoint URL with its compatible Origin and AmzTarget values.
// This solves the "triple mismatch" problem where different endpoints require matching
// Origin and X-Amz-Target header values.
@@ -99,7 +356,7 @@ var kiroEndpointConfigs = []kiroEndpointConfig{
Name: "CodeWhisperer",
},
{
URL: "https://q.us-east-1.amazonaws.com/generateAssistantResponse",
URL: "https://q.us-east-1.amazonaws.com/",
Origin: "CLI",
AmzTarget: "AmazonQDeveloperStreamingService.SendMessage",
Name: "AmazonQ",
@@ -217,6 +474,29 @@ func NewKiroExecutor(cfg *config.Config) *KiroExecutor {
// Identifier returns the unique identifier for this executor.
func (e *KiroExecutor) Identifier() string { return "kiro" }
// applyDynamicFingerprint applies token-specific fingerprint headers to the request.
// For IDC auth, uses a dynamic fingerprint-based User-Agent generated per token.
// For other auth types, uses static Amazon Q CLI style headers.
func applyDynamicFingerprint(req *http.Request, auth *cliproxyauth.Auth) {
	if isIDCAuth(auth) {
		// Get token-specific fingerprint for dynamic UA generation
		tokenKey := getTokenKey(auth)
		fp := getGlobalFingerprintManager().GetFingerprint(tokenKey)
		// Use fingerprint-generated dynamic User-Agent
		req.Header.Set("User-Agent", fp.BuildUserAgent())
		req.Header.Set("X-Amz-User-Agent", fp.BuildAmzUserAgent())
		req.Header.Set("x-amzn-kiro-agent-mode", kiroIDEAgentModeSpec)
		// Truncate the key defensively for logging: auth IDs may be shorter
		// than 8 bytes, and tokenKey[:8] would panic on them.
		keyPreview := tokenKey
		if len(keyPreview) > 8 {
			keyPreview = keyPreview[:8] + "..."
		}
		log.Debugf("kiro: using dynamic fingerprint for token %s (SDK:%s, OS:%s/%s, Kiro:%s)",
			keyPreview, fp.SDKVersion, fp.OSType, fp.OSVersion, fp.KiroVersion)
	} else {
		// Use static Amazon Q CLI style headers for non-IDC auth.
		req.Header.Set("User-Agent", kiroUserAgent)
		req.Header.Set("X-Amz-User-Agent", kiroFullUserAgent)
		// Restore the agent-mode header the pre-refactor code set for non-IDC
		// auth; omitting it changes the request profile the upstream sees.
		req.Header.Set("x-amzn-kiro-agent-mode", kiroAgentModeVibe)
	}
}
// PrepareRequest prepares the HTTP request before execution.
func (e *KiroExecutor) PrepareRequest(req *http.Request, auth *cliproxyauth.Auth) error {
if req == nil {
@@ -226,16 +506,10 @@ func (e *KiroExecutor) PrepareRequest(req *http.Request, auth *cliproxyauth.Auth
if strings.TrimSpace(accessToken) == "" {
return statusErr{code: http.StatusUnauthorized, msg: "missing access token"}
}
if isIDCAuth(auth) {
req.Header.Set("User-Agent", kiroIDEUserAgent)
req.Header.Set("X-Amz-User-Agent", kiroIDEAmzUserAgent)
req.Header.Set("x-amzn-kiro-agent-mode", kiroIDEAgentModeSpec)
} else {
req.Header.Set("User-Agent", kiroUserAgent)
req.Header.Set("X-Amz-User-Agent", kiroFullUserAgent)
req.Header.Set("x-amzn-kiro-agent-mode", kiroAgentModeVibe)
}
req.Header.Set("x-amzn-codewhisperer-optout", "true")
// Apply dynamic fingerprint-based headers
applyDynamicFingerprint(req, auth)
req.Header.Set("Amz-Sdk-Request", "attempt=1; max=3")
req.Header.Set("Amz-Sdk-Invocation-Id", uuid.New().String())
req.Header.Set("Authorization", "Bearer "+accessToken)
@@ -259,10 +533,23 @@ func (e *KiroExecutor) HttpRequest(ctx context.Context, auth *cliproxyauth.Auth,
if errPrepare := e.PrepareRequest(httpReq, auth); errPrepare != nil {
return nil, errPrepare
}
httpClient := newProxyAwareHTTPClient(ctx, e.cfg, auth, 0)
httpClient := newKiroHTTPClientWithPooling(ctx, e.cfg, auth, 0)
return httpClient.Do(httpReq)
}
// getTokenKey returns a unique key for rate limiting based on auth credentials.
// Uses auth ID if available, otherwise falls back to a hash of the access token.
// The key is logged verbatim by rate-limit and cooldown messages, so the
// fallback hashes the token (as the contract states) instead of returning a
// raw prefix of the secret, which would leak token material into logs.
func getTokenKey(auth *cliproxyauth.Auth) string {
	if auth != nil && auth.ID != "" {
		return auth.ID
	}
	accessToken, _ := kiroCredentials(auth)
	if accessToken == "" {
		return ""
	}
	sum := sha256.Sum256([]byte(accessToken))
	// 16 hex chars (64 bits) is ample to distinguish the handful of tokens
	// a single proxy instance manages.
	return hex.EncodeToString(sum[:8])
}
// Execute sends the request to Kiro API and returns the response.
// Supports automatic token refresh on 401/403 errors.
func (e *KiroExecutor) Execute(ctx context.Context, auth *cliproxyauth.Auth, req cliproxyexecutor.Request, opts cliproxyexecutor.Options) (resp cliproxyexecutor.Response, err error) {
@@ -271,6 +558,24 @@ func (e *KiroExecutor) Execute(ctx context.Context, auth *cliproxyauth.Auth, req
return resp, fmt.Errorf("kiro: access token not found in auth")
}
// Rate limiting: get token key for tracking
tokenKey := getTokenKey(auth)
rateLimiter := kiroauth.GetGlobalRateLimiter()
cooldownMgr := kiroauth.GetGlobalCooldownManager()
// Check if token is in cooldown period
if cooldownMgr.IsInCooldown(tokenKey) {
remaining := cooldownMgr.GetRemainingCooldown(tokenKey)
reason := cooldownMgr.GetCooldownReason(tokenKey)
log.Warnf("kiro: token %s is in cooldown (reason: %s), remaining: %v", tokenKey, reason, remaining)
return resp, fmt.Errorf("kiro: token is in cooldown for %v (reason: %s)", remaining, reason)
}
// Wait for rate limiter before proceeding
log.Debugf("kiro: waiting for rate limiter for token %s", tokenKey)
rateLimiter.WaitForToken(tokenKey)
log.Debugf("kiro: rate limiter cleared for token %s", tokenKey)
reporter := newUsageReporter(ctx, e.Identifier(), req.Model, auth)
defer reporter.trackFailure(ctx, &err)
@@ -303,7 +608,7 @@ func (e *KiroExecutor) Execute(ctx context.Context, auth *cliproxyauth.Auth, req
// Execute with retry on 401/403 and 429 (quota exhausted)
// Note: currentOrigin and kiroPayload are built inside executeWithRetry for each endpoint
resp, err = e.executeWithRetry(ctx, auth, req, opts, accessToken, effectiveProfileArn, nil, body, from, to, reporter, "", kiroModelID, isAgentic, isChatOnly)
resp, err = e.executeWithRetry(ctx, auth, req, opts, accessToken, effectiveProfileArn, nil, body, from, to, reporter, "", kiroModelID, isAgentic, isChatOnly, tokenKey)
return resp, err
}
@@ -312,9 +617,12 @@ func (e *KiroExecutor) Execute(ctx context.Context, auth *cliproxyauth.Auth, req
// - Amazon Q endpoint (CLI origin) uses Amazon Q Developer quota
// - CodeWhisperer endpoint (AI_EDITOR origin) uses Kiro IDE quota
// Also supports multi-endpoint fallback similar to Antigravity implementation.
func (e *KiroExecutor) executeWithRetry(ctx context.Context, auth *cliproxyauth.Auth, req cliproxyexecutor.Request, opts cliproxyexecutor.Options, accessToken, profileArn string, kiroPayload, body []byte, from, to sdktranslator.Format, reporter *usageReporter, currentOrigin, kiroModelID string, isAgentic, isChatOnly bool) (cliproxyexecutor.Response, error) {
// tokenKey is used for rate limiting and cooldown tracking.
func (e *KiroExecutor) executeWithRetry(ctx context.Context, auth *cliproxyauth.Auth, req cliproxyexecutor.Request, opts cliproxyexecutor.Options, accessToken, profileArn string, kiroPayload, body []byte, from, to sdktranslator.Format, reporter *usageReporter, currentOrigin, kiroModelID string, isAgentic, isChatOnly bool, tokenKey string) (cliproxyexecutor.Response, error) {
var resp cliproxyexecutor.Response
maxRetries := 2 // Allow retries for token refresh + endpoint fallback
rateLimiter := kiroauth.GetGlobalRateLimiter()
cooldownMgr := kiroauth.GetGlobalCooldownManager()
endpointConfigs := getKiroEndpointConfigs(auth)
var last429Err error
@@ -332,6 +640,12 @@ func (e *KiroExecutor) executeWithRetry(ctx context.Context, auth *cliproxyauth.
endpointIdx+1, len(endpointConfigs), url, endpointConfig.Name, currentOrigin)
for attempt := 0; attempt <= maxRetries; attempt++ {
// Apply human-like delay before first request (not on retries)
// This mimics natural user behavior patterns
if attempt == 0 && endpointIdx == 0 {
kiroauth.ApplyHumanLikeDelay()
}
httpReq, err := http.NewRequestWithContext(ctx, http.MethodPost, url, bytes.NewReader(kiroPayload))
if err != nil {
return resp, err
@@ -342,20 +656,9 @@ func (e *KiroExecutor) executeWithRetry(ctx context.Context, auth *cliproxyauth.
// Use endpoint-specific X-Amz-Target (critical for avoiding 403 errors)
httpReq.Header.Set("X-Amz-Target", endpointConfig.AmzTarget)
// Use different headers based on auth type
// IDC auth uses Kiro IDE style headers (from kiro2api)
// Other auth types use Amazon Q CLI style headers
if isIDCAuth(auth) {
httpReq.Header.Set("User-Agent", kiroIDEUserAgent)
httpReq.Header.Set("X-Amz-User-Agent", kiroIDEAmzUserAgent)
httpReq.Header.Set("x-amzn-kiro-agent-mode", kiroIDEAgentModeSpec)
log.Debugf("kiro: using Kiro IDE headers for IDC auth")
} else {
httpReq.Header.Set("User-Agent", kiroUserAgent)
httpReq.Header.Set("X-Amz-User-Agent", kiroFullUserAgent)
httpReq.Header.Set("x-amzn-kiro-agent-mode", kiroAgentModeVibe)
}
httpReq.Header.Set("x-amzn-codewhisperer-optout", "true")
// Apply dynamic fingerprint-based headers
applyDynamicFingerprint(httpReq, auth)
httpReq.Header.Set("Amz-Sdk-Request", "attempt=1; max=3")
httpReq.Header.Set("Amz-Sdk-Invocation-Id", uuid.New().String())
@@ -386,10 +689,34 @@ func (e *KiroExecutor) executeWithRetry(ctx context.Context, auth *cliproxyauth.
AuthValue: authValue,
})
httpClient := newProxyAwareHTTPClient(ctx, e.cfg, auth, 120*time.Second)
httpClient := newKiroHTTPClientWithPooling(ctx, e.cfg, auth, 120*time.Second)
httpResp, err := httpClient.Do(httpReq)
if err != nil {
// Check for context cancellation first - client disconnected, not a server error
// Use 499 (Client Closed Request - nginx convention) instead of 500
if errors.Is(err, context.Canceled) {
log.Debugf("kiro: request canceled by client (context.Canceled)")
return resp, statusErr{code: 499, msg: "client canceled request"}
}
// Check for context deadline exceeded - request timed out
// Return 504 Gateway Timeout instead of 500
if errors.Is(err, context.DeadlineExceeded) {
log.Debugf("kiro: request timed out (context.DeadlineExceeded)")
return resp, statusErr{code: http.StatusGatewayTimeout, msg: "upstream request timed out"}
}
recordAPIResponseError(ctx, e.cfg, err)
// Enhanced socket retry: Check if error is retryable (network timeout, connection reset, etc.)
retryCfg := defaultRetryConfig()
if isRetryableError(err) && attempt < retryCfg.MaxRetries {
delay := calculateRetryDelay(attempt, retryCfg)
logRetryAttempt(attempt, retryCfg.MaxRetries, fmt.Sprintf("socket error: %v", err), delay, endpointConfig.Name)
time.Sleep(delay)
continue
}
return resp, err
}
recordAPIResponseMetadata(ctx, e.cfg, httpResp.StatusCode, httpResp.Header.Clone())
@@ -401,6 +728,12 @@ func (e *KiroExecutor) executeWithRetry(ctx context.Context, auth *cliproxyauth.
_ = httpResp.Body.Close()
appendAPIResponseChunk(ctx, e.cfg, respBody)
// Record failure and set cooldown for 429
rateLimiter.MarkTokenFailed(tokenKey)
cooldownDuration := kiroauth.CalculateCooldownFor429(attempt)
cooldownMgr.SetCooldown(tokenKey, cooldownDuration, kiroauth.CooldownReason429)
log.Warnf("kiro: rate limit hit (429), token %s set to cooldown for %v", tokenKey, cooldownDuration)
// Preserve last 429 so callers can correctly backoff when all endpoints are exhausted
last429Err = statusErr{code: httpResp.StatusCode, msg: string(respBody)}
@@ -412,13 +745,21 @@ func (e *KiroExecutor) executeWithRetry(ctx context.Context, auth *cliproxyauth.
}
// Handle 5xx server errors with exponential backoff retry
// Enhanced: Use retryConfig for consistent retry behavior
if httpResp.StatusCode >= 500 && httpResp.StatusCode < 600 {
respBody, _ := io.ReadAll(httpResp.Body)
_ = httpResp.Body.Close()
appendAPIResponseChunk(ctx, e.cfg, respBody)
if attempt < maxRetries {
// Exponential backoff: 1s, 2s, 4s... (max 30s)
retryCfg := defaultRetryConfig()
// Check if this specific 5xx code is retryable (502, 503, 504)
if isRetryableHTTPStatus(httpResp.StatusCode) && attempt < retryCfg.MaxRetries {
delay := calculateRetryDelay(attempt, retryCfg)
logRetryAttempt(attempt, retryCfg.MaxRetries, fmt.Sprintf("HTTP %d", httpResp.StatusCode), delay, endpointConfig.Name)
time.Sleep(delay)
continue
} else if attempt < maxRetries {
// Fallback for other 5xx errors (500, 501, etc.)
backoff := time.Duration(1<<attempt) * time.Second
if backoff > 30*time.Second {
backoff = 30 * time.Second
@@ -492,7 +833,10 @@ func (e *KiroExecutor) executeWithRetry(ctx context.Context, auth *cliproxyauth.
// Check for SUSPENDED status - return immediately without retry
if strings.Contains(respBodyStr, "SUSPENDED") || strings.Contains(respBodyStr, "TEMPORARILY_SUSPENDED") {
log.Errorf("kiro: account is suspended, cannot proceed")
// Set long cooldown for suspended accounts
rateLimiter.CheckAndMarkSuspended(tokenKey, respBodyStr)
cooldownMgr.SetCooldown(tokenKey, kiroauth.LongCooldown, kiroauth.CooldownReasonSuspended)
log.Errorf("kiro: account is suspended, token %s set to cooldown for %v", tokenKey, kiroauth.LongCooldown)
return resp, statusErr{code: httpResp.StatusCode, msg: "account suspended: " + string(respBody)}
}
@@ -581,6 +925,10 @@ func (e *KiroExecutor) executeWithRetry(ctx context.Context, auth *cliproxyauth.
appendAPIResponseChunk(ctx, e.cfg, []byte(content))
reporter.publish(ctx, usageInfo)
// Record success for rate limiting
rateLimiter.MarkTokenSuccess(tokenKey)
log.Debugf("kiro: request successful, token %s marked as success", tokenKey)
// Build response in Claude format for Kiro translator
// stopReason is extracted from upstream response by parseEventStream
kiroResponse := kiroclaude.BuildClaudeResponse(content, toolUses, req.Model, usageInfo, stopReason)
@@ -608,6 +956,24 @@ func (e *KiroExecutor) ExecuteStream(ctx context.Context, auth *cliproxyauth.Aut
return nil, fmt.Errorf("kiro: access token not found in auth")
}
// Rate limiting: get token key for tracking
tokenKey := getTokenKey(auth)
rateLimiter := kiroauth.GetGlobalRateLimiter()
cooldownMgr := kiroauth.GetGlobalCooldownManager()
// Check if token is in cooldown period
if cooldownMgr.IsInCooldown(tokenKey) {
remaining := cooldownMgr.GetRemainingCooldown(tokenKey)
reason := cooldownMgr.GetCooldownReason(tokenKey)
log.Warnf("kiro: token %s is in cooldown (reason: %s), remaining: %v", tokenKey, reason, remaining)
return nil, fmt.Errorf("kiro: token is in cooldown for %v (reason: %s)", remaining, reason)
}
// Wait for rate limiter before proceeding
log.Debugf("kiro: stream waiting for rate limiter for token %s", tokenKey)
rateLimiter.WaitForToken(tokenKey)
log.Debugf("kiro: stream rate limiter cleared for token %s", tokenKey)
reporter := newUsageReporter(ctx, e.Identifier(), req.Model, auth)
defer reporter.trackFailure(ctx, &err)
@@ -640,7 +1006,7 @@ func (e *KiroExecutor) ExecuteStream(ctx context.Context, auth *cliproxyauth.Aut
// Execute stream with retry on 401/403 and 429 (quota exhausted)
// Note: currentOrigin and kiroPayload are built inside executeStreamWithRetry for each endpoint
return e.executeStreamWithRetry(ctx, auth, req, opts, accessToken, effectiveProfileArn, nil, body, from, reporter, "", kiroModelID, isAgentic, isChatOnly)
return e.executeStreamWithRetry(ctx, auth, req, opts, accessToken, effectiveProfileArn, nil, body, from, reporter, "", kiroModelID, isAgentic, isChatOnly, tokenKey)
}
// executeStreamWithRetry performs the streaming HTTP request with automatic retry on auth errors.
@@ -648,8 +1014,11 @@ func (e *KiroExecutor) ExecuteStream(ctx context.Context, auth *cliproxyauth.Aut
// - Amazon Q endpoint (CLI origin) uses Amazon Q Developer quota
// - CodeWhisperer endpoint (AI_EDITOR origin) uses Kiro IDE quota
// Also supports multi-endpoint fallback similar to Antigravity implementation.
func (e *KiroExecutor) executeStreamWithRetry(ctx context.Context, auth *cliproxyauth.Auth, req cliproxyexecutor.Request, opts cliproxyexecutor.Options, accessToken, profileArn string, kiroPayload, body []byte, from sdktranslator.Format, reporter *usageReporter, currentOrigin, kiroModelID string, isAgentic, isChatOnly bool) (<-chan cliproxyexecutor.StreamChunk, error) {
// tokenKey is used for rate limiting and cooldown tracking.
func (e *KiroExecutor) executeStreamWithRetry(ctx context.Context, auth *cliproxyauth.Auth, req cliproxyexecutor.Request, opts cliproxyexecutor.Options, accessToken, profileArn string, kiroPayload, body []byte, from sdktranslator.Format, reporter *usageReporter, currentOrigin, kiroModelID string, isAgentic, isChatOnly bool, tokenKey string) (<-chan cliproxyexecutor.StreamChunk, error) {
maxRetries := 2 // Allow retries for token refresh + endpoint fallback
rateLimiter := kiroauth.GetGlobalRateLimiter()
cooldownMgr := kiroauth.GetGlobalCooldownManager()
endpointConfigs := getKiroEndpointConfigs(auth)
var last429Err error
@@ -667,6 +1036,13 @@ func (e *KiroExecutor) executeStreamWithRetry(ctx context.Context, auth *cliprox
endpointIdx+1, len(endpointConfigs), url, endpointConfig.Name, currentOrigin)
for attempt := 0; attempt <= maxRetries; attempt++ {
// Apply human-like delay before first streaming request (not on retries)
// This mimics natural user behavior patterns
// Note: Delay is NOT applied during streaming response - only before initial request
if attempt == 0 && endpointIdx == 0 {
kiroauth.ApplyHumanLikeDelay()
}
httpReq, err := http.NewRequestWithContext(ctx, http.MethodPost, url, bytes.NewReader(kiroPayload))
if err != nil {
return nil, err
@@ -677,20 +1053,9 @@ func (e *KiroExecutor) executeStreamWithRetry(ctx context.Context, auth *cliprox
// Use endpoint-specific X-Amz-Target (critical for avoiding 403 errors)
httpReq.Header.Set("X-Amz-Target", endpointConfig.AmzTarget)
// Use different headers based on auth type
// IDC auth uses Kiro IDE style headers (from kiro2api)
// Other auth types use Amazon Q CLI style headers
if isIDCAuth(auth) {
httpReq.Header.Set("User-Agent", kiroIDEUserAgent)
httpReq.Header.Set("X-Amz-User-Agent", kiroIDEAmzUserAgent)
httpReq.Header.Set("x-amzn-kiro-agent-mode", kiroIDEAgentModeSpec)
log.Debugf("kiro: using Kiro IDE headers for IDC auth")
} else {
httpReq.Header.Set("User-Agent", kiroUserAgent)
httpReq.Header.Set("X-Amz-User-Agent", kiroFullUserAgent)
httpReq.Header.Set("x-amzn-kiro-agent-mode", kiroAgentModeVibe)
}
httpReq.Header.Set("x-amzn-codewhisperer-optout", "true")
// Apply dynamic fingerprint-based headers
applyDynamicFingerprint(httpReq, auth)
httpReq.Header.Set("Amz-Sdk-Request", "attempt=1; max=3")
httpReq.Header.Set("Amz-Sdk-Invocation-Id", uuid.New().String())
@@ -721,10 +1086,20 @@ func (e *KiroExecutor) executeStreamWithRetry(ctx context.Context, auth *cliprox
AuthValue: authValue,
})
httpClient := newProxyAwareHTTPClient(ctx, e.cfg, auth, 0)
httpClient := newKiroHTTPClientWithPooling(ctx, e.cfg, auth, 0)
httpResp, err := httpClient.Do(httpReq)
if err != nil {
recordAPIResponseError(ctx, e.cfg, err)
// Enhanced socket retry for streaming: Check if error is retryable (network timeout, connection reset, etc.)
retryCfg := defaultRetryConfig()
if isRetryableError(err) && attempt < retryCfg.MaxRetries {
delay := calculateRetryDelay(attempt, retryCfg)
logRetryAttempt(attempt, retryCfg.MaxRetries, fmt.Sprintf("stream socket error: %v", err), delay, endpointConfig.Name)
time.Sleep(delay)
continue
}
return nil, err
}
recordAPIResponseMetadata(ctx, e.cfg, httpResp.StatusCode, httpResp.Header.Clone())
@@ -736,6 +1111,12 @@ func (e *KiroExecutor) executeStreamWithRetry(ctx context.Context, auth *cliprox
_ = httpResp.Body.Close()
appendAPIResponseChunk(ctx, e.cfg, respBody)
// Record failure and set cooldown for 429
rateLimiter.MarkTokenFailed(tokenKey)
cooldownDuration := kiroauth.CalculateCooldownFor429(attempt)
cooldownMgr.SetCooldown(tokenKey, cooldownDuration, kiroauth.CooldownReason429)
log.Warnf("kiro: stream rate limit hit (429), token %s set to cooldown for %v", tokenKey, cooldownDuration)
// Preserve last 429 so callers can correctly backoff when all endpoints are exhausted
last429Err = statusErr{code: httpResp.StatusCode, msg: string(respBody)}
@@ -747,13 +1128,21 @@ func (e *KiroExecutor) executeStreamWithRetry(ctx context.Context, auth *cliprox
}
// Handle 5xx server errors with exponential backoff retry
// Enhanced: Use retryConfig for consistent retry behavior
if httpResp.StatusCode >= 500 && httpResp.StatusCode < 600 {
respBody, _ := io.ReadAll(httpResp.Body)
_ = httpResp.Body.Close()
appendAPIResponseChunk(ctx, e.cfg, respBody)
if attempt < maxRetries {
// Exponential backoff: 1s, 2s, 4s... (max 30s)
retryCfg := defaultRetryConfig()
// Check if this specific 5xx code is retryable (502, 503, 504)
if isRetryableHTTPStatus(httpResp.StatusCode) && attempt < retryCfg.MaxRetries {
delay := calculateRetryDelay(attempt, retryCfg)
logRetryAttempt(attempt, retryCfg.MaxRetries, fmt.Sprintf("stream HTTP %d", httpResp.StatusCode), delay, endpointConfig.Name)
time.Sleep(delay)
continue
} else if attempt < maxRetries {
// Fallback for other 5xx errors (500, 501, etc.)
backoff := time.Duration(1<<attempt) * time.Second
if backoff > 30*time.Second {
backoff = 30 * time.Second
@@ -840,7 +1229,10 @@ func (e *KiroExecutor) executeStreamWithRetry(ctx context.Context, auth *cliprox
// Check for SUSPENDED status - return immediately without retry
if strings.Contains(respBodyStr, "SUSPENDED") || strings.Contains(respBodyStr, "TEMPORARILY_SUSPENDED") {
log.Errorf("kiro: account is suspended, cannot proceed")
// Set long cooldown for suspended accounts
rateLimiter.CheckAndMarkSuspended(tokenKey, respBodyStr)
cooldownMgr.SetCooldown(tokenKey, kiroauth.LongCooldown, kiroauth.CooldownReasonSuspended)
log.Errorf("kiro: stream account is suspended, token %s set to cooldown for %v", tokenKey, kiroauth.LongCooldown)
return nil, statusErr{code: httpResp.StatusCode, msg: "account suspended: " + string(respBody)}
}
@@ -890,6 +1282,11 @@ func (e *KiroExecutor) executeStreamWithRetry(ctx context.Context, auth *cliprox
out := make(chan cliproxyexecutor.StreamChunk)
// Record success immediately since connection was established successfully
// Streaming errors will be handled separately
rateLimiter.MarkTokenSuccess(tokenKey)
log.Debugf("kiro: stream request successful, token %s marked as success", tokenKey)
go func(resp *http.Response, thinkingEnabled bool) {
defer close(out)
defer func() {