From 134f41496dd3d3bcbd1601b223856830c8f3a88e Mon Sep 17 00:00:00 2001
From: hkfires <10558748+hkfires@users.noreply.github.com>
Date: Sun, 1 Mar 2026 10:05:29 +0800
Subject: [PATCH 1/7] fix(antigravity): update model configurations and add new
 models for Antigravity

---
 internal/registry/model_definitions_static_data.go | 9 ++++-----
 internal/runtime/executor/antigravity_executor.go  | 2 +-
 2 files changed, 5 insertions(+), 6 deletions(-)

diff --git a/internal/registry/model_definitions_static_data.go b/internal/registry/model_definitions_static_data.go
index 7cfe15db..f70d3984 100644
--- a/internal/registry/model_definitions_static_data.go
+++ b/internal/registry/model_definitions_static_data.go
@@ -947,18 +947,17 @@ type AntigravityModelConfig struct {
 // Keys use upstream model names returned by the Antigravity models endpoint.
 func GetAntigravityModelConfig() map[string]*AntigravityModelConfig {
 	return map[string]*AntigravityModelConfig{
-		// "rev19-uic3-1p":              {Thinking: &ThinkingSupport{Min: 128, Max: 32768, ZeroAllowed: false, DynamicAllowed: true}},
 		"gemini-2.5-flash":         {Thinking: &ThinkingSupport{Min: 0, Max: 24576, ZeroAllowed: true, DynamicAllowed: true}},
 		"gemini-2.5-flash-lite":    {Thinking: &ThinkingSupport{Min: 0, Max: 24576, ZeroAllowed: true, DynamicAllowed: true}},
 		"gemini-3-pro-high":        {Thinking: &ThinkingSupport{Min: 128, Max: 32768, ZeroAllowed: false, DynamicAllowed: true, Levels: []string{"low", "high"}}},
-		"gemini-3-pro-image":       {Thinking: &ThinkingSupport{Min: 128, Max: 32768, ZeroAllowed: false, DynamicAllowed: true, Levels: []string{"low", "high"}}},
+		"gemini-3-pro-low":         {Thinking: &ThinkingSupport{Min: 128, Max: 32768, ZeroAllowed: false, DynamicAllowed: true, Levels: []string{"low", "high"}}},
 		"gemini-3.1-pro-high":      {Thinking: &ThinkingSupport{Min: 128, Max: 32768, ZeroAllowed: false, DynamicAllowed: true, Levels: []string{"low", "high"}}},
+		"gemini-3.1-pro-low":       {Thinking: &ThinkingSupport{Min: 128, Max: 32768, ZeroAllowed: false, DynamicAllowed: true, Levels: []string{"low", "high"}}},
 		"gemini-3.1-flash-image":   {Thinking: &ThinkingSupport{Min: 128, Max: 32768, ZeroAllowed: false, DynamicAllowed: true, Levels: []string{"minimal", "high"}}},
 		"gemini-3-flash":           {Thinking: &ThinkingSupport{Min: 128, Max: 32768, ZeroAllowed: false, DynamicAllowed: true, Levels: []string{"minimal", "low", "medium", "high"}}},
-		"claude-opus-4-6-thinking": {Thinking: &ThinkingSupport{Min: 1024, Max: 64000, ZeroAllowed: true, DynamicAllowed: true}},
-		"claude-sonnet-4-6":        {Thinking: &ThinkingSupport{Min: 1024, Max: 64000, ZeroAllowed: true, DynamicAllowed: true}},
+		"claude-opus-4-6-thinking": {Thinking: &ThinkingSupport{Min: 1024, Max: 64000, ZeroAllowed: true, DynamicAllowed: true}, MaxCompletionTokens: 64000},
+		"claude-sonnet-4-6":        {Thinking: &ThinkingSupport{Min: 1024, Max: 64000, ZeroAllowed: true, DynamicAllowed: true}, MaxCompletionTokens: 64000},
 		"gpt-oss-120b-medium":      {},
-		"tab_flash_lite_preview":   {},
 	}
 }
 
diff --git a/internal/runtime/executor/antigravity_executor.go b/internal/runtime/executor/antigravity_executor.go
index 00959a22..919d96fa 100644
--- a/internal/runtime/executor/antigravity_executor.go
+++ b/internal/runtime/executor/antigravity_executor.go
@@ -1152,7 +1152,7 @@ func FetchAntigravityModels(ctx context.Context, auth *cliproxyauth.Auth, cfg *c
 				continue
 			}
 			switch modelID {
-			case "chat_20706", "chat_23310", "gemini-2.5-flash-thinking", "gemini-3-pro-low", "gemini-2.5-pro":
+			case "chat_20706", "chat_23310", "tab_flash_lite_preview", "tab_jump_flash_lite_preview", "gemini-2.5-flash-thinking", "gemini-2.5-pro":
 				continue
 			}
 			modelCfg := modelConfig[modelID]

From b148820c358480220e2a5ca8958accec8599071d Mon Sep 17 00:00:00 2001
From: hkfires <10558748+hkfires@users.noreply.github.com>
Date: Sun, 1 Mar 2026 10:30:19 +0800
Subject: [PATCH 2/7] fix(translator): handle Claude thinking type "auto" like
 adaptive

---
 .../antigravity/claude/antigravity_claude_request.go   | 10 ++--------
 .../translator/codex/claude/codex_claude_request.go    |  4 ++--
 .../gemini-cli/claude/gemini-cli_claude_request.go     |  4 ++--
 .../translator/gemini/claude/gemini_claude_request.go  |  4 ++--
 .../translator/openai/claude/openai_claude_request.go  |  4 ++--
 5 files changed, 10 insertions(+), 16 deletions(-)

diff --git a/internal/translator/antigravity/claude/antigravity_claude_request.go b/internal/translator/antigravity/claude/antigravity_claude_request.go
index a3f9fa48..c4e07b6a 100644
--- a/internal/translator/antigravity/claude/antigravity_claude_request.go
+++ b/internal/translator/antigravity/claude/antigravity_claude_request.go
@@ -440,14 +440,8 @@ func ConvertClaudeRequestToAntigravity(modelName string, inputRawJSON []byte, _
 				out, _ = sjson.Set(out, "request.generationConfig.thinkingConfig.thinkingBudget", budget)
 				out, _ = sjson.Set(out, "request.generationConfig.thinkingConfig.includeThoughts", true)
 			}
-		case "auto":
-			// Amp sends thinking.type="auto" — use max budget from model config
-			// Antigravity API for Claude models requires a concrete positive budget,
-			// not -1. Use a high default that ApplyThinking will cap to model max.
-			out, _ = sjson.Set(out, "request.generationConfig.thinkingConfig.thinkingBudget", 64000)
-			out, _ = sjson.Set(out, "request.generationConfig.thinkingConfig.includeThoughts", true)
-		case "adaptive":
-			// Keep adaptive as a high level sentinel; ApplyThinking resolves it
+		case "adaptive", "auto":
+			// Keep adaptive/auto as a "high" thinkingLevel sentinel; ApplyThinking resolves it
 			// to model-specific max capability.
 			out, _ = sjson.Set(out, "request.generationConfig.thinkingConfig.thinkingLevel", "high")
 			out, _ = sjson.Set(out, "request.generationConfig.thinkingConfig.includeThoughts", true)
diff --git a/internal/translator/codex/claude/codex_claude_request.go b/internal/translator/codex/claude/codex_claude_request.go
index 64e41fb5..739b39e9 100644
--- a/internal/translator/codex/claude/codex_claude_request.go
+++ b/internal/translator/codex/claude/codex_claude_request.go
@@ -230,8 +230,8 @@ func ConvertClaudeRequestToCodex(modelName string, inputRawJSON []byte, _ bool)
 					reasoningEffort = effort
 				}
 			}
-		case "adaptive":
-			// Claude adaptive means "enable with max capacity"; keep it as highest level
+		case "adaptive", "auto":
+			// Claude adaptive/auto means "enable with max capacity"; keep it as highest level
 			// and let ApplyThinking normalize per target model capability.
 			reasoningEffort = string(thinking.LevelXHigh)
 		case "disabled":
diff --git a/internal/translator/gemini-cli/claude/gemini-cli_claude_request.go b/internal/translator/gemini-cli/claude/gemini-cli_claude_request.go
index ee661381..653bbeb2 100644
--- a/internal/translator/gemini-cli/claude/gemini-cli_claude_request.go
+++ b/internal/translator/gemini-cli/claude/gemini-cli_claude_request.go
@@ -180,8 +180,8 @@ func ConvertClaudeRequestToCLI(modelName string, inputRawJSON []byte, _ bool) []
 				out, _ = sjson.Set(out, "request.generationConfig.thinkingConfig.thinkingBudget", budget)
 				out, _ = sjson.Set(out, "request.generationConfig.thinkingConfig.includeThoughts", true)
 			}
-		case "adaptive":
-			// Keep adaptive as a high level sentinel; ApplyThinking resolves it
+		case "adaptive", "auto":
+			// Keep adaptive/auto as a "high" thinkingLevel sentinel; ApplyThinking resolves it
 			// to model-specific max capability.
 			out, _ = sjson.Set(out, "request.generationConfig.thinkingConfig.thinkingLevel", "high")
 			out, _ = sjson.Set(out, "request.generationConfig.thinkingConfig.includeThoughts", true)
diff --git a/internal/translator/gemini/claude/gemini_claude_request.go b/internal/translator/gemini/claude/gemini_claude_request.go
index e882f769..b5756d20 100644
--- a/internal/translator/gemini/claude/gemini_claude_request.go
+++ b/internal/translator/gemini/claude/gemini_claude_request.go
@@ -161,8 +161,8 @@ func ConvertClaudeRequestToGemini(modelName string, inputRawJSON []byte, _ bool)
 				out, _ = sjson.Set(out, "generationConfig.thinkingConfig.thinkingBudget", budget)
 				out, _ = sjson.Set(out, "generationConfig.thinkingConfig.includeThoughts", true)
 			}
-		case "adaptive":
-			// Keep adaptive as a high level sentinel; ApplyThinking resolves it
+		case "adaptive", "auto":
+			// Keep adaptive/auto as a "high" thinkingLevel sentinel; ApplyThinking resolves it
 			// to model-specific max capability.
 			out, _ = sjson.Set(out, "generationConfig.thinkingConfig.thinkingLevel", "high")
 			out, _ = sjson.Set(out, "generationConfig.thinkingConfig.includeThoughts", true)
diff --git a/internal/translator/openai/claude/openai_claude_request.go b/internal/translator/openai/claude/openai_claude_request.go
index acb79a13..e3efb83c 100644
--- a/internal/translator/openai/claude/openai_claude_request.go
+++ b/internal/translator/openai/claude/openai_claude_request.go
@@ -75,8 +75,8 @@ func ConvertClaudeRequestToOpenAI(modelName string, inputRawJSON []byte, stream
 						out, _ = sjson.Set(out, "reasoning_effort", effort)
 					}
 				}
-			case "adaptive":
-				// Claude adaptive means "enable with max capacity"; keep it as highest level
+			case "adaptive", "auto":
+				// Claude adaptive/auto means "enable with max capacity"; keep it as highest level
 				// and let ApplyThinking normalize per target model capability.
 				out, _ = sjson.Set(out, "reasoning_effort", string(thinking.LevelXHigh))
 			case "disabled":

From 444a47ae63375aaf5b29a322e13f2d4f21623c8e Mon Sep 17 00:00:00 2001
From: edlsh <enzo@edl.sh>
Date: Sat, 28 Feb 2026 22:32:33 -0500
Subject: [PATCH 3/7] Fix Claude cache-control guardrails and gzip error
 decoding

---
 internal/runtime/executor/claude_executor.go  | 303 +++++++++++++++++-
 .../runtime/executor/claude_executor_test.go  | 171 ++++++++++
 2 files changed, 465 insertions(+), 9 deletions(-)

diff --git a/internal/runtime/executor/claude_executor.go b/internal/runtime/executor/claude_executor.go
index fcb3a9c9..8826b061 100644
--- a/internal/runtime/executor/claude_executor.go
+++ b/internal/runtime/executor/claude_executor.go
@@ -135,6 +135,15 @@ func (e *ClaudeExecutor) Execute(ctx context.Context, auth *cliproxyauth.Auth, r
 		body = ensureCacheControl(body)
 	}
 
+	// Enforce Anthropic's cache_control block limit (max 4 breakpoints per request).
+	// Cloaking and ensureCacheControl may push the total over 4 when the client
+	// (e.g. Amp CLI) already sends multiple cache_control blocks.
+	body = enforceCacheControlLimit(body, 4)
+
+	// Normalize TTL values to prevent ordering violations under prompt-caching-scope-2026-01-05.
+	// A 1h-TTL block must not appear after a 5m-TTL block in evaluation order (tools→system→messages).
+	body = normalizeCacheControlTTL(body)
+
 	// Extract betas from body and convert to header
 	var extraBetas []string
 	extraBetas, body = extractAndRemoveBetas(body)
@@ -176,11 +185,18 @@ func (e *ClaudeExecutor) Execute(ctx context.Context, auth *cliproxyauth.Auth, r
 	}
 	recordAPIResponseMetadata(ctx, e.cfg, httpResp.StatusCode, httpResp.Header.Clone())
 	if httpResp.StatusCode < 200 || httpResp.StatusCode >= 300 {
-		b, _ := io.ReadAll(httpResp.Body)
+		// Decompress error responses (e.g. gzip-compressed 400 errors from Anthropic API)
+		errBody := httpResp.Body
+		if ce := httpResp.Header.Get("Content-Encoding"); ce != "" {
+			if decoded, decErr := decodeResponseBody(httpResp.Body, ce); decErr == nil {
+				errBody = decoded
+			}
+		}
+		b, _ := io.ReadAll(errBody)
 		appendAPIResponseChunk(ctx, e.cfg, b)
 		logWithRequestID(ctx).Debugf("request error, error status: %d, error message: %s", httpResp.StatusCode, summarizeErrorBody(httpResp.Header.Get("Content-Type"), b))
 		err = statusErr{code: httpResp.StatusCode, msg: string(b)}
-		if errClose := httpResp.Body.Close(); errClose != nil {
+		if errClose := errBody.Close(); errClose != nil {
 			log.Errorf("response body close error: %v", errClose)
 		}
 		return resp, err
@@ -276,6 +292,12 @@ func (e *ClaudeExecutor) ExecuteStream(ctx context.Context, auth *cliproxyauth.A
 		body = ensureCacheControl(body)
 	}
 
+	// Enforce Anthropic's cache_control block limit (max 4 breakpoints per request).
+	body = enforceCacheControlLimit(body, 4)
+
+	// Normalize TTL values to prevent ordering violations under prompt-caching-scope-2026-01-05.
+	body = normalizeCacheControlTTL(body)
+
 	// Extract betas from body and convert to header
 	var extraBetas []string
 	extraBetas, body = extractAndRemoveBetas(body)
@@ -317,10 +339,17 @@ func (e *ClaudeExecutor) ExecuteStream(ctx context.Context, auth *cliproxyauth.A
 	}
 	recordAPIResponseMetadata(ctx, e.cfg, httpResp.StatusCode, httpResp.Header.Clone())
 	if httpResp.StatusCode < 200 || httpResp.StatusCode >= 300 {
-		b, _ := io.ReadAll(httpResp.Body)
+		// Decompress error responses (e.g. gzip-compressed 400 errors from Anthropic API)
+		errBody := httpResp.Body
+		if ce := httpResp.Header.Get("Content-Encoding"); ce != "" {
+			if decoded, decErr := decodeResponseBody(httpResp.Body, ce); decErr == nil {
+				errBody = decoded
+			}
+		}
+		b, _ := io.ReadAll(errBody)
 		appendAPIResponseChunk(ctx, e.cfg, b)
 		logWithRequestID(ctx).Debugf("request error, error status: %d, error message: %s", httpResp.StatusCode, summarizeErrorBody(httpResp.Header.Get("Content-Type"), b))
-		if errClose := httpResp.Body.Close(); errClose != nil {
+		if errClose := errBody.Close(); errClose != nil {
 			log.Errorf("response body close error: %v", errClose)
 		}
 		err = statusErr{code: httpResp.StatusCode, msg: string(b)}
@@ -425,6 +454,10 @@ func (e *ClaudeExecutor) CountTokens(ctx context.Context, auth *cliproxyauth.Aut
 		body = checkSystemInstructions(body)
 	}
 
+	// Keep count_tokens requests compatible with Anthropic cache-control constraints too.
+	body = enforceCacheControlLimit(body, 4)
+	body = normalizeCacheControlTTL(body)
+
 	// Extract betas from body and convert to header (for count_tokens too)
 	var extraBetas []string
 	extraBetas, body = extractAndRemoveBetas(body)
@@ -464,9 +497,16 @@ func (e *ClaudeExecutor) CountTokens(ctx context.Context, auth *cliproxyauth.Aut
 	}
 	recordAPIResponseMetadata(ctx, e.cfg, resp.StatusCode, resp.Header.Clone())
 	if resp.StatusCode < 200 || resp.StatusCode >= 300 {
-		b, _ := io.ReadAll(resp.Body)
+		// Decompress error responses (e.g. gzip-compressed 400 errors from Anthropic API)
+		errBody := io.ReadCloser(resp.Body)
+		if ce := resp.Header.Get("Content-Encoding"); ce != "" {
+			if decoded, decErr := decodeResponseBody(resp.Body, ce); decErr == nil {
+				errBody = decoded
+			}
+		}
+		b, _ := io.ReadAll(errBody)
 		appendAPIResponseChunk(ctx, e.cfg, b)
-		if errClose := resp.Body.Close(); errClose != nil {
+		if errClose := errBody.Close(); errClose != nil {
 			log.Errorf("response body close error: %v", errClose)
 		}
 		return cliproxyexecutor.Response{}, statusErr{code: resp.StatusCode, msg: string(b)}
@@ -1083,7 +1123,12 @@ func checkSystemInstructionsWithMode(payload []byte, strictMode bool) []byte {
 
 	billingText := generateBillingHeader(payload)
 	billingBlock := fmt.Sprintf(`{"type":"text","text":"%s"}`, billingText)
-	agentBlock := `{"type":"text","text":"You are a Claude agent, built on Anthropic's Claude Agent SDK.","cache_control":{"type":"ephemeral","ttl":"1h"}}`
+	// No cache_control on the agent block. It is a cloaking artifact with zero cache
+	// value (the last system block is what actually triggers caching of all system content).
+	// Including any cache_control here creates an intra-system TTL ordering violation
+	// when the client's system blocks use ttl='1h' (prompt-caching-scope-2026-01-05 beta
+	// forbids 1h blocks after 5m blocks, and a no-TTL block defaults to 5m).
+	agentBlock := `{"type":"text","text":"You are a Claude agent, built on Anthropic's Claude Agent SDK."}`
 
 	if strictMode {
 		// Strict mode: billing header + agent identifier only
@@ -1103,11 +1148,12 @@ func checkSystemInstructionsWithMode(payload []byte, strictMode bool) []byte {
 	if system.IsArray() {
 		system.ForEach(func(_, part gjson.Result) bool {
 			if part.Get("type").String() == "text" {
-				// Add cache_control with ttl to user system messages if not present
+				// Add cache_control to user system messages if not present.
+				// Do NOT add ttl — let it inherit the default (5m) to avoid
+				// TTL ordering violations with the prompt-caching-scope-2026-01-05 beta.
 				partJSON := part.Raw
 				if !part.Get("cache_control").Exists() {
 					partJSON, _ = sjson.Set(partJSON, "cache_control.type", "ephemeral")
-					partJSON, _ = sjson.Set(partJSON, "cache_control.ttl", "1h")
 				}
 				result += "," + partJSON
 			}
@@ -1254,6 +1300,245 @@ func countCacheControls(payload []byte) int {
 	return count
 }
 
+// normalizeCacheControlTTL rewrites payload so that no cache_control block with
+// ttl "1h" comes after a block whose ttl is anything else (including absent),
+// which this function treats as the 5m default. That ordering is what the
+// prompt-caching-scope-2026-01-05 beta requires.
+//
+// Blocks are visited in the order tools → system (index 0..N) → messages, and in
+// array order inside each section. The "5m seen" state carries across sections,
+// so system[1]=5m followed by system[3]=1h, or a 5m tool followed by a 1h message
+// block, are both rewritten. Non-array tools/system/content values are skipped.
+//
+// An offending block keeps its cache_control object; only its ttl key is deleted.
+func normalizeCacheControlTTL(payload []byte) []byte {
+	seen5m := false // set by the first non-"1h" block; shared by all three phases below
+
+	// Phase 1: tools (visited first)
+	tools := gjson.GetBytes(payload, "tools")
+	if tools.IsArray() {
+		idx := 0 // array position, used to build the sjson path into payload
+		tools.ForEach(func(_, tool gjson.Result) bool {
+			cc := tool.Get("cache_control")
+			if cc.Exists() {
+				ttl := cc.Get("ttl").String()
+				if ttl != "1h" { // a missing ttl yields "" here and so counts as 5m
+					seen5m = true
+				} else if seen5m { // 1h after a 5m block: drop ttl (DeleteBytes error is discarded)
+					payload, _ = sjson.DeleteBytes(payload, fmt.Sprintf("tools.%d.cache_control.ttl", idx))
+				}
+			}
+			idx++
+			return true
+		})
+	}
+
+	// Phase 2: system blocks (visited second, in array order)
+	system := gjson.GetBytes(payload, "system")
+	if system.IsArray() {
+		idx := 0 // only nested ttl keys are deleted, so positions stay aligned with payload
+		system.ForEach(func(_, item gjson.Result) bool {
+			cc := item.Get("cache_control")
+			if cc.Exists() {
+				ttl := cc.Get("ttl").String()
+				if ttl != "1h" {
+					seen5m = true
+				} else if seen5m {
+					payload, _ = sjson.DeleteBytes(payload, fmt.Sprintf("system.%d.cache_control.ttl", idx))
+				}
+			}
+			idx++
+			return true
+		})
+	}
+
+	// Phase 3: message content blocks (visited last, message by message, in array order)
+	messages := gjson.GetBytes(payload, "messages")
+	if messages.IsArray() {
+		msgIdx := 0
+		messages.ForEach(func(_, msg gjson.Result) bool {
+			content := msg.Get("content")
+			if content.IsArray() { // non-array content is not inspected
+				contentIdx := 0
+				content.ForEach(func(_, item gjson.Result) bool {
+					cc := item.Get("cache_control")
+					if cc.Exists() {
+						ttl := cc.Get("ttl").String()
+						if ttl != "1h" {
+							seen5m = true
+						} else if seen5m {
+							payload, _ = sjson.DeleteBytes(payload, fmt.Sprintf("messages.%d.content.%d.cache_control.ttl", msgIdx, contentIdx))
+						}
+					}
+					contentIdx++
+					return true
+				})
+			}
+			msgIdx++ // advanced for every message, whether or not its content was an array
+			return true
+		})
+	}
+
+	return payload
+}
+
+// enforceCacheControlLimit deletes cache_control objects from payload until at
+// most maxBlocks remain (callers in this file pass 4, Anthropic's limit). If
+// countCacheControls already reports <= maxBlocks, payload is returned as is.
+// Anthropic evaluates cache breakpoints in order: tools → system → messages.
+// The most valuable breakpoints are:
+//   1. Last tool         — caches ALL tool definitions
+//   2. Last system block — caches ALL system content
+//   3. Recent messages   — cache conversation context
+//
+// Removal priority (strip lowest-value first, stopping as soon as the limit is met):
+//   Phase 1: system blocks earliest-first, preserving the last one.
+//   Phase 2: tool blocks earliest-first, preserving the last one.
+//   Phase 3: message content blocks earliest-first.
+//   Phase 4: remaining system blocks (last system).
+//   Phase 5: remaining tool blocks (last tool).
+func enforceCacheControlLimit(payload []byte, maxBlocks int) []byte {
+	total := countCacheControls(payload) // assumes it counts the same tools/system/messages spots visited below — TODO confirm
+	if total <= maxBlocks {
+		return payload
+	}
+
+	excess := total - maxBlocks // number of cache_control objects still to delete
+
+	// Phase 1: strip cache_control from system blocks earliest-first, but SKIP the last one.
+	// The last system cache_control is high-value because it caches all system content.
+	system := gjson.GetBytes(payload, "system")
+	if system.IsArray() {
+		lastSysCCIdx := -1 // index of the last system block carrying cache_control; stays -1 if none
+		sysIdx := 0
+		system.ForEach(func(_, item gjson.Result) bool {
+			if item.Get("cache_control").Exists() {
+				lastSysCCIdx = sysIdx
+			}
+			sysIdx++
+			return true
+		})
+
+		idx := 0
+		system.ForEach(func(_, item gjson.Result) bool {
+			if excess <= 0 { // limit reached: stop iterating
+				return false
+			}
+			if item.Get("cache_control").Exists() && idx != lastSysCCIdx {
+				payload, _ = sjson.DeleteBytes(payload, fmt.Sprintf("system.%d.cache_control", idx))
+				excess--
+			}
+			idx++
+			return true
+		})
+	}
+	if excess <= 0 {
+		return payload
+	}
+
+	// Phase 2: strip cache_control from tools earliest-first, but SKIP the last one.
+	// Only the last tool cache_control is needed to cache all tool definitions.
+	tools := gjson.GetBytes(payload, "tools")
+	if tools.IsArray() {
+		lastToolCCIdx := -1 // index of the last tool carrying cache_control; stays -1 if none
+		toolIdx := 0
+		tools.ForEach(func(_, tool gjson.Result) bool {
+			if tool.Get("cache_control").Exists() {
+				lastToolCCIdx = toolIdx
+			}
+			toolIdx++
+			return true
+		})
+
+		idx := 0
+		tools.ForEach(func(_, tool gjson.Result) bool {
+			if excess <= 0 {
+				return false
+			}
+			if tool.Get("cache_control").Exists() && idx != lastToolCCIdx {
+				payload, _ = sjson.DeleteBytes(payload, fmt.Sprintf("tools.%d.cache_control", idx))
+				excess--
+			}
+			idx++
+			return true
+		})
+	}
+	if excess <= 0 {
+		return payload
+	}
+
+	// Phase 3: strip cache_control from message content blocks, earliest first.
+	// Older conversation turns are least likely to help immediate reuse.
+	messages := gjson.GetBytes(payload, "messages")
+	if messages.IsArray() {
+		msgIdx := 0
+		messages.ForEach(func(_, msg gjson.Result) bool {
+			if excess <= 0 {
+				return false
+			}
+			content := msg.Get("content")
+			if content.IsArray() { // non-array content is not inspected
+				contentIdx := 0
+				content.ForEach(func(_, item gjson.Result) bool {
+					if excess <= 0 {
+						return false
+					}
+					if item.Get("cache_control").Exists() {
+						payload, _ = sjson.DeleteBytes(payload, fmt.Sprintf("messages.%d.content.%d.cache_control", msgIdx, contentIdx))
+						excess--
+					}
+					contentIdx++
+					return true
+				})
+			}
+			msgIdx++
+			return true
+		})
+	}
+	if excess <= 0 {
+		return payload
+	}
+
+	// Phase 4: strip remaining system cache_control blocks (re-read so Phase 1 deletions are not recounted).
+	system = gjson.GetBytes(payload, "system")
+	if system.IsArray() {
+		idx := 0
+		system.ForEach(func(_, item gjson.Result) bool {
+			if excess <= 0 {
+				return false
+			}
+			if item.Get("cache_control").Exists() {
+				payload, _ = sjson.DeleteBytes(payload, fmt.Sprintf("system.%d.cache_control", idx))
+				excess--
+			}
+			idx++
+			return true
+		})
+	}
+	if excess <= 0 {
+		return payload
+	}
+
+	// Phase 5: strip remaining tool cache_control blocks, including the last tool (re-read after Phase 2).
+	tools = gjson.GetBytes(payload, "tools")
+	if tools.IsArray() {
+		idx := 0
+		tools.ForEach(func(_, tool gjson.Result) bool {
+			if excess <= 0 {
+				return false
+			}
+			if tool.Get("cache_control").Exists() {
+				payload, _ = sjson.DeleteBytes(payload, fmt.Sprintf("tools.%d.cache_control", idx))
+				excess--
+			}
+			idx++
+			return true
+		})
+	}
+
+	return payload
+}
+
 // injectMessagesCacheControl adds cache_control to the second-to-last user turn for multi-turn caching.
 // Per Anthropic docs: "Place cache_control on the second-to-last User message to let the model reuse the earlier cache."
 // This enables caching of conversation history, which is especially beneficial for long multi-turn conversations.
diff --git a/internal/runtime/executor/claude_executor_test.go b/internal/runtime/executor/claude_executor_test.go
index dd29ed8a..d90076b6 100644
--- a/internal/runtime/executor/claude_executor_test.go
+++ b/internal/runtime/executor/claude_executor_test.go
@@ -348,3 +348,174 @@ func TestApplyClaudeToolPrefix_SkipsBuiltinToolReference(t *testing.T) {
 		t.Fatalf("built-in tool_reference should not be prefixed, got %q", got)
 	}
 }
+
+func TestNormalizeCacheControlTTL_DowngradesLaterOneHourBlocks(t *testing.T) {
+	payload := []byte(`{
+		"tools": [{"name":"t1","cache_control":{"type":"ephemeral","ttl":"1h"}}],
+		"system": [{"type":"text","text":"s1","cache_control":{"type":"ephemeral"}}],
+		"messages": [{"role":"user","content":[{"type":"text","text":"u1","cache_control":{"type":"ephemeral","ttl":"1h"}}]}]
+	}`) // order under test: tool with 1h → system block without ttl → message block with 1h
+
+	out := normalizeCacheControlTTL(payload)
+
+	if got := gjson.GetBytes(out, "tools.0.cache_control.ttl").String(); got != "1h" { // nothing precedes the tool, so its 1h must survive
+		t.Fatalf("tools.0.cache_control.ttl = %q, want %q", got, "1h")
+	}
+	if gjson.GetBytes(out, "messages.0.content.0.cache_control.ttl").Exists() { // the ttl-less system block precedes it, so ttl must be gone
+		t.Fatalf("messages.0.content.0.cache_control.ttl should be removed after a default-5m block")
+	}
+}
+
+func TestEnforceCacheControlLimit_StripsNonLastToolBeforeMessages(t *testing.T) {
+	payload := []byte(`{
+		"tools": [
+			{"name":"t1","cache_control":{"type":"ephemeral"}},
+			{"name":"t2","cache_control":{"type":"ephemeral"}}
+		],
+		"system": [{"type":"text","text":"s1","cache_control":{"type":"ephemeral"}}],
+		"messages": [
+			{"role":"user","content":[{"type":"text","text":"u1","cache_control":{"type":"ephemeral"}}]},
+			{"role":"user","content":[{"type":"text","text":"u2","cache_control":{"type":"ephemeral"}}]}
+		]
+	}`) // 5 cache_control blocks: 2 tools, 1 system, 2 messages
+
+	out := enforceCacheControlLimit(payload, 4) // one block over the limit, so exactly one must be stripped
+
+	if got := countCacheControls(out); got != 4 {
+		t.Fatalf("cache_control count = %d, want 4", got)
+	}
+	if gjson.GetBytes(out, "tools.0.cache_control").Exists() { // the only system block is "last" and kept, so the first tool goes
+		t.Fatalf("tools.0.cache_control should be removed first (non-last tool)")
+	}
+	if !gjson.GetBytes(out, "tools.1.cache_control").Exists() {
+		t.Fatalf("tools.1.cache_control (last tool) should be preserved")
+	}
+	if !gjson.GetBytes(out, "messages.0.content.0.cache_control").Exists() || !gjson.GetBytes(out, "messages.1.content.0.cache_control").Exists() {
+		t.Fatalf("message cache_control blocks should be preserved when non-last tool removal is enough")
+	}
+}
+
+func TestEnforceCacheControlLimit_ToolOnlyPayloadStillRespectsLimit(t *testing.T) {
+	payload := []byte(`{
+		"tools": [
+			{"name":"t1","cache_control":{"type":"ephemeral"}},
+			{"name":"t2","cache_control":{"type":"ephemeral"}},
+			{"name":"t3","cache_control":{"type":"ephemeral"}},
+			{"name":"t4","cache_control":{"type":"ephemeral"}},
+			{"name":"t5","cache_control":{"type":"ephemeral"}}
+		]
+	}`) // 5 tool blocks and no system or messages keys at all
+
+	out := enforceCacheControlLimit(payload, 4)
+
+	if got := countCacheControls(out); got != 4 {
+		t.Fatalf("cache_control count = %d, want 4", got)
+	}
+	if gjson.GetBytes(out, "tools.0.cache_control").Exists() { // tools are stripped earliest-first
+		t.Fatalf("tools.0.cache_control should be removed to satisfy max=4")
+	}
+	if !gjson.GetBytes(out, "tools.4.cache_control").Exists() {
+		t.Fatalf("last tool cache_control should be preserved when possible")
+	}
+}
+
+func TestClaudeExecutor_CountTokens_AppliesCacheControlGuards(t *testing.T) {
+	var seenBody []byte // request body captured by the stub server below
+	server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
+		body, _ := io.ReadAll(r.Body)
+		seenBody = bytes.Clone(body)
+		w.Header().Set("Content-Type", "application/json")
+		_, _ = w.Write([]byte(`{"input_tokens":42}`))
+	}))
+	defer server.Close()
+
+	executor := NewClaudeExecutor(&config.Config{})
+	auth := &cliproxyauth.Auth{Attributes: map[string]string{
+		"api_key":  "key-123",
+		"base_url": server.URL,
+	}}
+
+	payload := []byte(`{
+		"tools": [
+			{"name":"t1","cache_control":{"type":"ephemeral","ttl":"1h"}},
+			{"name":"t2","cache_control":{"type":"ephemeral"}}
+		],
+		"system": [
+			{"type":"text","text":"s1","cache_control":{"type":"ephemeral","ttl":"1h"}},
+			{"type":"text","text":"s2","cache_control":{"type":"ephemeral","ttl":"1h"}}
+		],
+		"messages": [
+			{"role":"user","content":[{"type":"text","text":"u1","cache_control":{"type":"ephemeral","ttl":"1h"}}]},
+			{"role":"user","content":[{"type":"text","text":"u2","cache_control":{"type":"ephemeral","ttl":"1h"}}]}
+		]
+	}`) // 6 cache_control blocks; tools.1 has no ttl while every later block asks for 1h
+
+	_, err := executor.CountTokens(context.Background(), auth, cliproxyexecutor.Request{
+		Model:   "claude-3-5-haiku-20241022",
+		Payload: payload,
+	}, cliproxyexecutor.Options{SourceFormat: sdktranslator.FromString("claude")})
+	if err != nil {
+		t.Fatalf("CountTokens error: %v", err)
+	}
+
+	if len(seenBody) == 0 {
+		t.Fatal("expected count_tokens request body to be captured")
+	}
+	if got := countCacheControls(seenBody); got > 4 { // checks the body as it was actually sent upstream
+		t.Fatalf("count_tokens body has %d cache_control blocks, want <= 4", got)
+	}
+	if hasTTLOrderingViolation(seenBody) { // no 1h block may follow a non-1h block in the sent body
+		t.Fatalf("count_tokens body still has ttl ordering violations: %s", string(seenBody))
+	}
+}
+
+func hasTTLOrderingViolation(payload []byte) bool {
+	seen5m := false   // set once a cache_control whose ttl is not "1h" has been visited
+	violates := false // set when a "1h" block is visited after seen5m; this is the return value
+
+	checkCC := func(cc gjson.Result) { // folds one cache_control value into seen5m/violates
+		if !cc.Exists() || violates {
+			return
+		}
+		ttl := cc.Get("ttl").String()
+		if ttl != "1h" { // a missing ttl yields "" and is treated like 5m
+			seen5m = true
+			return
+		}
+		if seen5m {
+			violates = true
+		}
+	}
+
+	tools := gjson.GetBytes(payload, "tools") // visit order: tools, then system, then message content
+	if tools.IsArray() {
+		tools.ForEach(func(_, tool gjson.Result) bool {
+			checkCC(tool.Get("cache_control"))
+			return !violates // stop iterating at the first violation
+		})
+	}
+
+	system := gjson.GetBytes(payload, "system")
+	if system.IsArray() {
+		system.ForEach(func(_, item gjson.Result) bool {
+			checkCC(item.Get("cache_control"))
+			return !violates
+		})
+	}
+
+	messages := gjson.GetBytes(payload, "messages")
+	if messages.IsArray() {
+		messages.ForEach(func(_, msg gjson.Result) bool {
+			content := msg.Get("content")
+			if content.IsArray() { // non-array content is not inspected
+				content.ForEach(func(_, item gjson.Result) bool {
+					checkCC(item.Get("cache_control"))
+					return !violates
+				})
+			}
+			return !violates
+		})
+	}
+
+	return violates
+}

From 0ad3e8457f9d3121b0fa24b95c96b4d6d3030ca3 Mon Sep 17 00:00:00 2001
From: edlsh <enzo@edl.sh>
Date: Sat, 28 Feb 2026 22:34:14 -0500
Subject: [PATCH 4/7] Clarify cloaking system block cache-control comments

---
 internal/runtime/executor/claude_executor.go | 7 +++----
 1 file changed, 3 insertions(+), 4 deletions(-)

diff --git a/internal/runtime/executor/claude_executor.go b/internal/runtime/executor/claude_executor.go
index 8826b061..ddbe9297 100644
--- a/internal/runtime/executor/claude_executor.go
+++ b/internal/runtime/executor/claude_executor.go
@@ -1113,11 +1113,10 @@ func generateBillingHeader(payload []byte) string {
 	return fmt.Sprintf("x-anthropic-billing-header: cc_version=2.1.63.%s; cc_entrypoint=cli; cch=%s;", buildHash, cch)
 }
 
-// checkSystemInstructionsWithMode injects Claude Code system prompt to match
-// the real Claude Code request format:
+// checkSystemInstructionsWithMode injects Claude Code-style system blocks:
 //   system[0]: billing header (no cache_control)
-//   system[1]: "You are a Claude agent, built on Anthropic's Claude Agent SDK." (with cache_control)
-//   system[2..]: user's system messages (with cache_control on last)
+//   system[1]: agent identifier (no cache_control)
+//   system[2..]: user system messages (cache_control added when missing)
 func checkSystemInstructionsWithMode(payload []byte, strictMode bool) []byte {
 	system := gjson.GetBytes(payload, "system")
 

From 6ac9b31e4eeb743b89b9fbccee1c4fe2e2c5b43a Mon Sep 17 00:00:00 2001
From: edlsh <enzo@edl.sh>
Date: Sat, 28 Feb 2026 22:43:46 -0500
Subject: [PATCH 5/7] Handle compressed error decode failures safely

---
 internal/runtime/executor/claude_executor.go  | 59 +++++++++++++----
 .../runtime/executor/claude_executor_test.go  | 64 +++++++++++++++++++
 2 files changed, 110 insertions(+), 13 deletions(-)

diff --git a/internal/runtime/executor/claude_executor.go b/internal/runtime/executor/claude_executor.go
index ddbe9297..483a4830 100644
--- a/internal/runtime/executor/claude_executor.go
+++ b/internal/runtime/executor/claude_executor.go
@@ -185,14 +185,25 @@ func (e *ClaudeExecutor) Execute(ctx context.Context, auth *cliproxyauth.Auth, r
 	}
 	recordAPIResponseMetadata(ctx, e.cfg, httpResp.StatusCode, httpResp.Header.Clone())
 	if httpResp.StatusCode < 200 || httpResp.StatusCode >= 300 {
-		// Decompress error responses (e.g. gzip-compressed 400 errors from Anthropic API)
+		// Decompress error responses (e.g. gzip-compressed 400 errors from Anthropic API).
 		errBody := httpResp.Body
 		if ce := httpResp.Header.Get("Content-Encoding"); ce != "" {
-			if decoded, decErr := decodeResponseBody(httpResp.Body, ce); decErr == nil {
-				errBody = decoded
+			var decErr error
+			errBody, decErr = decodeResponseBody(httpResp.Body, ce)
+			if decErr != nil {
+				recordAPIResponseError(ctx, e.cfg, decErr)
+				msg := fmt.Sprintf("failed to decode error response body (encoding=%s): %v", ce, decErr)
+				logWithRequestID(ctx).Warn(msg)
+				return resp, statusErr{code: httpResp.StatusCode, msg: msg}
 			}
 		}
-		b, _ := io.ReadAll(errBody)
+		b, readErr := io.ReadAll(errBody)
+		if readErr != nil {
+			recordAPIResponseError(ctx, e.cfg, readErr)
+			msg := fmt.Sprintf("failed to read error response body: %v", readErr)
+			logWithRequestID(ctx).Warn(msg)
+			b = []byte(msg)
+		}
 		appendAPIResponseChunk(ctx, e.cfg, b)
 		logWithRequestID(ctx).Debugf("request error, error status: %d, error message: %s", httpResp.StatusCode, summarizeErrorBody(httpResp.Header.Get("Content-Type"), b))
 		err = statusErr{code: httpResp.StatusCode, msg: string(b)}
@@ -339,14 +350,25 @@ func (e *ClaudeExecutor) ExecuteStream(ctx context.Context, auth *cliproxyauth.A
 	}
 	recordAPIResponseMetadata(ctx, e.cfg, httpResp.StatusCode, httpResp.Header.Clone())
 	if httpResp.StatusCode < 200 || httpResp.StatusCode >= 300 {
-		// Decompress error responses (e.g. gzip-compressed 400 errors from Anthropic API)
+		// Decompress error responses (e.g. gzip-compressed 400 errors from Anthropic API).
 		errBody := httpResp.Body
 		if ce := httpResp.Header.Get("Content-Encoding"); ce != "" {
-			if decoded, decErr := decodeResponseBody(httpResp.Body, ce); decErr == nil {
-				errBody = decoded
+			var decErr error
+			errBody, decErr = decodeResponseBody(httpResp.Body, ce)
+			if decErr != nil {
+				recordAPIResponseError(ctx, e.cfg, decErr)
+				msg := fmt.Sprintf("failed to decode error response body (encoding=%s): %v", ce, decErr)
+				logWithRequestID(ctx).Warn(msg)
+				return nil, statusErr{code: httpResp.StatusCode, msg: msg}
 			}
 		}
-		b, _ := io.ReadAll(errBody)
+		b, readErr := io.ReadAll(errBody)
+		if readErr != nil {
+			recordAPIResponseError(ctx, e.cfg, readErr)
+			msg := fmt.Sprintf("failed to read error response body: %v", readErr)
+			logWithRequestID(ctx).Warn(msg)
+			b = []byte(msg)
+		}
 		appendAPIResponseChunk(ctx, e.cfg, b)
 		logWithRequestID(ctx).Debugf("request error, error status: %d, error message: %s", httpResp.StatusCode, summarizeErrorBody(httpResp.Header.Get("Content-Type"), b))
 		if errClose := errBody.Close(); errClose != nil {
@@ -497,14 +519,25 @@ func (e *ClaudeExecutor) CountTokens(ctx context.Context, auth *cliproxyauth.Aut
 	}
 	recordAPIResponseMetadata(ctx, e.cfg, resp.StatusCode, resp.Header.Clone())
 	if resp.StatusCode < 200 || resp.StatusCode >= 300 {
-		// Decompress error responses (e.g. gzip-compressed 400 errors from Anthropic API)
-		errBody := io.ReadCloser(resp.Body)
+		// Decompress error responses (e.g. gzip-compressed 400 errors from Anthropic API).
+		errBody := resp.Body
 		if ce := resp.Header.Get("Content-Encoding"); ce != "" {
-			if decoded, decErr := decodeResponseBody(resp.Body, ce); decErr == nil {
-				errBody = decoded
+			var decErr error
+			errBody, decErr = decodeResponseBody(resp.Body, ce)
+			if decErr != nil {
+				recordAPIResponseError(ctx, e.cfg, decErr)
+				msg := fmt.Sprintf("failed to decode error response body (encoding=%s): %v", ce, decErr)
+				logWithRequestID(ctx).Warn(msg)
+				return cliproxyexecutor.Response{}, statusErr{code: resp.StatusCode, msg: msg}
 			}
 		}
-		b, _ := io.ReadAll(errBody)
+		b, readErr := io.ReadAll(errBody)
+		if readErr != nil {
+			recordAPIResponseError(ctx, e.cfg, readErr)
+			msg := fmt.Sprintf("failed to read error response body: %v", readErr)
+			logWithRequestID(ctx).Warn(msg)
+			b = []byte(msg)
+		}
 		appendAPIResponseChunk(ctx, e.cfg, b)
 		if errClose := errBody.Close(); errClose != nil {
 			log.Errorf("response body close error: %v", errClose)
diff --git a/internal/runtime/executor/claude_executor_test.go b/internal/runtime/executor/claude_executor_test.go
index d90076b6..f9553f9a 100644
--- a/internal/runtime/executor/claude_executor_test.go
+++ b/internal/runtime/executor/claude_executor_test.go
@@ -6,6 +6,7 @@ import (
 	"io"
 	"net/http"
 	"net/http/httptest"
+	"strings"
 	"testing"
 
 	"github.com/router-for-me/CLIProxyAPI/v6/internal/config"
@@ -519,3 +520,66 @@ func hasTTLOrderingViolation(payload []byte) bool {
 
 	return violates
 }
+
+func TestClaudeExecutor_Execute_InvalidGzipErrorBodyReturnsDecodeMessage(t *testing.T) {
+	testClaudeExecutorInvalidCompressedErrorBody(t, func(executor *ClaudeExecutor, auth *cliproxyauth.Auth, payload []byte) error {
+		_, err := executor.Execute(context.Background(), auth, cliproxyexecutor.Request{
+			Model:   "claude-3-5-sonnet-20241022",
+			Payload: payload,
+		}, cliproxyexecutor.Options{SourceFormat: sdktranslator.FromString("claude")})
+		return err
+	})
+}
+
+func TestClaudeExecutor_ExecuteStream_InvalidGzipErrorBodyReturnsDecodeMessage(t *testing.T) {
+	testClaudeExecutorInvalidCompressedErrorBody(t, func(executor *ClaudeExecutor, auth *cliproxyauth.Auth, payload []byte) error {
+		_, err := executor.ExecuteStream(context.Background(), auth, cliproxyexecutor.Request{
+			Model:   "claude-3-5-sonnet-20241022",
+			Payload: payload,
+		}, cliproxyexecutor.Options{SourceFormat: sdktranslator.FromString("claude")})
+		return err
+	})
+}
+
+func TestClaudeExecutor_CountTokens_InvalidGzipErrorBodyReturnsDecodeMessage(t *testing.T) {
+	testClaudeExecutorInvalidCompressedErrorBody(t, func(executor *ClaudeExecutor, auth *cliproxyauth.Auth, payload []byte) error {
+		_, err := executor.CountTokens(context.Background(), auth, cliproxyexecutor.Request{
+			Model:   "claude-3-5-sonnet-20241022",
+			Payload: payload,
+		}, cliproxyexecutor.Options{SourceFormat: sdktranslator.FromString("claude")})
+		return err
+	})
+}
+
+func testClaudeExecutorInvalidCompressedErrorBody(
+	t *testing.T,
+	invoke func(executor *ClaudeExecutor, auth *cliproxyauth.Auth, payload []byte) error,
+) {
+	t.Helper()
+
+	server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
+		w.Header().Set("Content-Type", "application/json")
+		w.Header().Set("Content-Encoding", "gzip")
+		w.WriteHeader(http.StatusBadRequest)
+		_, _ = w.Write([]byte("not-a-valid-gzip-stream"))
+	}))
+	defer server.Close()
+
+	executor := NewClaudeExecutor(&config.Config{})
+	auth := &cliproxyauth.Auth{Attributes: map[string]string{
+		"api_key":  "key-123",
+		"base_url": server.URL,
+	}}
+	payload := []byte(`{"messages":[{"role":"user","content":[{"type":"text","text":"hi"}]}]}`)
+
+	err := invoke(executor, auth, payload)
+	if err == nil {
+		t.Fatal("expected error, got nil")
+	}
+	if !strings.Contains(err.Error(), "failed to decode error response body") {
+		t.Fatalf("expected decode failure message, got: %v", err)
+	}
+	if statusProvider, ok := err.(interface{ StatusCode() int }); !ok || statusProvider.StatusCode() != http.StatusBadRequest {
+		t.Fatalf("expected status code 400, got: %v", err)
+	}
+}

From 76aa917882acb78eb98d08b32ce35354ba2f162d Mon Sep 17 00:00:00 2001
From: edlsh <enzo@edl.sh>
Date: Sat, 28 Feb 2026 22:47:04 -0500
Subject: [PATCH 6/7] Optimize cache-control JSON mutations in Claude executor

---
 internal/runtime/executor/claude_executor.go | 446 +++++++++++--------
 1 file changed, 258 insertions(+), 188 deletions(-)

diff --git a/internal/runtime/executor/claude_executor.go b/internal/runtime/executor/claude_executor.go
index 483a4830..0845d168 100644
--- a/internal/runtime/executor/claude_executor.go
+++ b/internal/runtime/executor/claude_executor.go
@@ -9,6 +9,7 @@ import (
 	"crypto/rand"
 	"crypto/sha256"
 	"encoding/hex"
+	"encoding/json"
 	"fmt"
 	"io"
 	"net/http"
@@ -1147,9 +1148,10 @@ func generateBillingHeader(payload []byte) string {
 }
 
 // checkSystemInstructionsWithMode injects Claude Code-style system blocks:
-//   system[0]: billing header (no cache_control)
-//   system[1]: agent identifier (no cache_control)
-//   system[2..]: user system messages (cache_control added when missing)
+//
+//	system[0]: billing header (no cache_control)
+//	system[1]: agent identifier (no cache_control)
+//	system[2..]: user system messages (cache_control added when missing)
 func checkSystemInstructionsWithMode(payload []byte, strictMode bool) []byte {
 	system := gjson.GetBytes(payload, "system")
 
@@ -1332,6 +1334,180 @@ func countCacheControls(payload []byte) int {
 	return count
 }
 
+func parsePayloadObject(payload []byte) (map[string]any, bool) {
+	if len(payload) == 0 {
+		return nil, false
+	}
+	var root map[string]any
+	if err := json.Unmarshal(payload, &root); err != nil {
+		return nil, false
+	}
+	return root, true
+}
+
+func marshalPayloadObject(original []byte, root map[string]any) []byte {
+	if root == nil {
+		return original
+	}
+	out, err := json.Marshal(root)
+	if err != nil {
+		return original
+	}
+	return out
+}
+
+func asObject(v any) (map[string]any, bool) {
+	obj, ok := v.(map[string]any)
+	return obj, ok
+}
+
+func asArray(v any) ([]any, bool) {
+	arr, ok := v.([]any)
+	return arr, ok
+}
+
+func countCacheControlsMap(root map[string]any) int {
+	count := 0
+
+	if system, ok := asArray(root["system"]); ok {
+		for _, item := range system {
+			if obj, ok := asObject(item); ok {
+				if _, exists := obj["cache_control"]; exists {
+					count++
+				}
+			}
+		}
+	}
+
+	if tools, ok := asArray(root["tools"]); ok {
+		for _, item := range tools {
+			if obj, ok := asObject(item); ok {
+				if _, exists := obj["cache_control"]; exists {
+					count++
+				}
+			}
+		}
+	}
+
+	if messages, ok := asArray(root["messages"]); ok {
+		for _, msg := range messages {
+			msgObj, ok := asObject(msg)
+			if !ok {
+				continue
+			}
+			content, ok := asArray(msgObj["content"])
+			if !ok {
+				continue
+			}
+			for _, item := range content {
+				if obj, ok := asObject(item); ok {
+					if _, exists := obj["cache_control"]; exists {
+						count++
+					}
+				}
+			}
+		}
+	}
+
+	return count
+}
+
+func normalizeTTLForBlock(obj map[string]any, seen5m *bool) {
+	ccRaw, exists := obj["cache_control"]
+	if !exists {
+		return
+	}
+	cc, ok := asObject(ccRaw)
+	if !ok {
+		*seen5m = true
+		return
+	}
+	ttlRaw, ttlExists := cc["ttl"]
+	ttl, ttlIsString := ttlRaw.(string)
+	if !ttlExists || !ttlIsString || ttl != "1h" {
+		*seen5m = true
+		return
+	}
+	if *seen5m {
+		delete(cc, "ttl")
+	}
+}
+
+func findLastCacheControlIndex(arr []any) int {
+	last := -1
+	for idx, item := range arr {
+		obj, ok := asObject(item)
+		if !ok {
+			continue
+		}
+		if _, exists := obj["cache_control"]; exists {
+			last = idx
+		}
+	}
+	return last
+}
+
+func stripCacheControlExceptIndex(arr []any, preserveIdx int, excess *int) {
+	for idx, item := range arr {
+		if *excess <= 0 {
+			return
+		}
+		obj, ok := asObject(item)
+		if !ok {
+			continue
+		}
+		if _, exists := obj["cache_control"]; exists && idx != preserveIdx {
+			delete(obj, "cache_control")
+			*excess--
+		}
+	}
+}
+
+func stripAllCacheControl(arr []any, excess *int) {
+	for _, item := range arr {
+		if *excess <= 0 {
+			return
+		}
+		obj, ok := asObject(item)
+		if !ok {
+			continue
+		}
+		if _, exists := obj["cache_control"]; exists {
+			delete(obj, "cache_control")
+			*excess--
+		}
+	}
+}
+
+func stripMessageCacheControl(messages []any, excess *int) {
+	for _, msg := range messages {
+		if *excess <= 0 {
+			return
+		}
+		msgObj, ok := asObject(msg)
+		if !ok {
+			continue
+		}
+		content, ok := asArray(msgObj["content"])
+		if !ok {
+			continue
+		}
+		for _, item := range content {
+			if *excess <= 0 {
+				return
+			}
+			obj, ok := asObject(item)
+			if !ok {
+				continue
+			}
+			if _, exists := obj["cache_control"]; exists {
+				delete(obj, "cache_control")
+				*excess--
+			}
+		}
+	}
+}
+
 // normalizeCacheControlTTL ensures cache_control TTL values don't violate the
 // prompt-caching-scope-2026-01-05 ordering constraint: a 1h-TTL block must not
 // appear after a 5m-TTL block anywhere in the evaluation order.
@@ -1344,74 +1520,48 @@ func countCacheControls(payload []byte) int {
 // Strategy: walk all cache_control blocks in evaluation order. Once a 5m block
 // is seen, strip ttl from ALL subsequent 1h blocks (downgrading them to 5m).
 func normalizeCacheControlTTL(payload []byte) []byte {
-	seen5m := false // once true, all subsequent 1h blocks must be downgraded
+	root, ok := parsePayloadObject(payload)
+	if !ok {
+		return payload
+	}
 
-	// Phase 1: tools (evaluated first)
-	tools := gjson.GetBytes(payload, "tools")
-	if tools.IsArray() {
-		idx := 0
-		tools.ForEach(func(_, tool gjson.Result) bool {
-			cc := tool.Get("cache_control")
-			if cc.Exists() {
-				ttl := cc.Get("ttl").String()
-				if ttl != "1h" {
-					seen5m = true
-				} else if seen5m {
-					payload, _ = sjson.DeleteBytes(payload, fmt.Sprintf("tools.%d.cache_control.ttl", idx))
+	seen5m := false
+
+	if tools, ok := asArray(root["tools"]); ok {
+		for _, tool := range tools {
+			if obj, ok := asObject(tool); ok {
+				normalizeTTLForBlock(obj, &seen5m)
+			}
+		}
+	}
+
+	if system, ok := asArray(root["system"]); ok {
+		for _, item := range system {
+			if obj, ok := asObject(item); ok {
+				normalizeTTLForBlock(obj, &seen5m)
+			}
+		}
+	}
+
+	if messages, ok := asArray(root["messages"]); ok {
+		for _, msg := range messages {
+			msgObj, ok := asObject(msg)
+			if !ok {
+				continue
+			}
+			content, ok := asArray(msgObj["content"])
+			if !ok {
+				continue
+			}
+			for _, item := range content {
+				if obj, ok := asObject(item); ok {
+					normalizeTTLForBlock(obj, &seen5m)
 				}
 			}
-			idx++
-			return true
-		})
+		}
 	}
 
-	// Phase 2: system blocks (evaluated second, in array order)
-	system := gjson.GetBytes(payload, "system")
-	if system.IsArray() {
-		idx := 0
-		system.ForEach(func(_, item gjson.Result) bool {
-			cc := item.Get("cache_control")
-			if cc.Exists() {
-				ttl := cc.Get("ttl").String()
-				if ttl != "1h" {
-					seen5m = true
-				} else if seen5m {
-					payload, _ = sjson.DeleteBytes(payload, fmt.Sprintf("system.%d.cache_control.ttl", idx))
-				}
-			}
-			idx++
-			return true
-		})
-	}
-
-	// Phase 3: message content blocks (evaluated last, in array order)
-	messages := gjson.GetBytes(payload, "messages")
-	if messages.IsArray() {
-		msgIdx := 0
-		messages.ForEach(func(_, msg gjson.Result) bool {
-			content := msg.Get("content")
-			if content.IsArray() {
-				contentIdx := 0
-				content.ForEach(func(_, item gjson.Result) bool {
-					cc := item.Get("cache_control")
-					if cc.Exists() {
-						ttl := cc.Get("ttl").String()
-						if ttl != "1h" {
-							seen5m = true
-						} else if seen5m {
-							payload, _ = sjson.DeleteBytes(payload, fmt.Sprintf("messages.%d.content.%d.cache_control.ttl", msgIdx, contentIdx))
-						}
-					}
-					contentIdx++
-					return true
-				})
-			}
-			msgIdx++
-			return true
-		})
-	}
-
-	return payload
+	return marshalPayloadObject(payload, root)
 }
 
 // enforceCacheControlLimit removes excess cache_control blocks from a payload
@@ -1419,156 +1569,76 @@ func normalizeCacheControlTTL(payload []byte) []byte {
 //
 // Anthropic evaluates cache breakpoints in order: tools → system → messages.
 // The most valuable breakpoints are:
-//   1. Last tool         — caches ALL tool definitions
-//   2. Last system block — caches ALL system content
-//   3. Recent messages   — cache conversation context
+//  1. Last tool         — caches ALL tool definitions
+//  2. Last system block — caches ALL system content
+//  3. Recent messages   — cache conversation context
 //
 // Removal priority (strip lowest-value first):
-//   Phase 1: system blocks earliest-first, preserving the last one.
-//   Phase 2: tool blocks earliest-first, preserving the last one.
-//   Phase 3: message content blocks earliest-first.
-//   Phase 4: remaining system blocks (last system).
-//   Phase 5: remaining tool blocks (last tool).
+//
+//	Phase 1: system blocks earliest-first, preserving the last one.
+//	Phase 2: tool blocks earliest-first, preserving the last one.
+//	Phase 3: message content blocks earliest-first.
+//	Phase 4: remaining system blocks (last system).
+//	Phase 5: remaining tool blocks (last tool).
 func enforceCacheControlLimit(payload []byte, maxBlocks int) []byte {
-	total := countCacheControls(payload)
+	root, ok := parsePayloadObject(payload)
+	if !ok {
+		return payload
+	}
+
+	total := countCacheControlsMap(root)
 	if total <= maxBlocks {
 		return payload
 	}
 
 	excess := total - maxBlocks
 
-	// Phase 1: strip cache_control from system blocks earliest-first, but SKIP the last one.
-	// The last system cache_control is high-value because it caches all system content.
-	system := gjson.GetBytes(payload, "system")
-	if system.IsArray() {
-		lastSysCCIdx := -1
-		sysIdx := 0
-		system.ForEach(func(_, item gjson.Result) bool {
-			if item.Get("cache_control").Exists() {
-				lastSysCCIdx = sysIdx
-			}
-			sysIdx++
-			return true
-		})
+	var system []any
+	if arr, ok := asArray(root["system"]); ok {
+		system = arr
+	}
+	var tools []any
+	if arr, ok := asArray(root["tools"]); ok {
+		tools = arr
+	}
+	var messages []any
+	if arr, ok := asArray(root["messages"]); ok {
+		messages = arr
+	}
 
-		idx := 0
-		system.ForEach(func(_, item gjson.Result) bool {
-			if excess <= 0 {
-				return false
-			}
-			if item.Get("cache_control").Exists() && idx != lastSysCCIdx {
-				payload, _ = sjson.DeleteBytes(payload, fmt.Sprintf("system.%d.cache_control", idx))
-				excess--
-			}
-			idx++
-			return true
-		})
+	if len(system) > 0 {
+		stripCacheControlExceptIndex(system, findLastCacheControlIndex(system), &excess)
 	}
 	if excess <= 0 {
-		return payload
+		return marshalPayloadObject(payload, root)
 	}
 
-	// Phase 2: strip cache_control from tools earliest-first, but SKIP the last one.
-	// Only the last tool cache_control is needed to cache all tool definitions.
-	tools := gjson.GetBytes(payload, "tools")
-	if tools.IsArray() {
-		lastToolCCIdx := -1
-		toolIdx := 0
-		tools.ForEach(func(_, tool gjson.Result) bool {
-			if tool.Get("cache_control").Exists() {
-				lastToolCCIdx = toolIdx
-			}
-			toolIdx++
-			return true
-		})
-
-		idx := 0
-		tools.ForEach(func(_, tool gjson.Result) bool {
-			if excess <= 0 {
-				return false
-			}
-			if tool.Get("cache_control").Exists() && idx != lastToolCCIdx {
-				payload, _ = sjson.DeleteBytes(payload, fmt.Sprintf("tools.%d.cache_control", idx))
-				excess--
-			}
-			idx++
-			return true
-		})
+	if len(tools) > 0 {
+		stripCacheControlExceptIndex(tools, findLastCacheControlIndex(tools), &excess)
 	}
 	if excess <= 0 {
-		return payload
+		return marshalPayloadObject(payload, root)
 	}
 
-	// Phase 3: strip cache_control from message content blocks, earliest first.
-	// Older conversation turns are least likely to help immediate reuse.
-	messages := gjson.GetBytes(payload, "messages")
-	if messages.IsArray() {
-		msgIdx := 0
-		messages.ForEach(func(_, msg gjson.Result) bool {
-			if excess <= 0 {
-				return false
-			}
-			content := msg.Get("content")
-			if content.IsArray() {
-				contentIdx := 0
-				content.ForEach(func(_, item gjson.Result) bool {
-					if excess <= 0 {
-						return false
-					}
-					if item.Get("cache_control").Exists() {
-						payload, _ = sjson.DeleteBytes(payload, fmt.Sprintf("messages.%d.content.%d.cache_control", msgIdx, contentIdx))
-						excess--
-					}
-					contentIdx++
-					return true
-				})
-			}
-			msgIdx++
-			return true
-		})
+	if len(messages) > 0 {
+		stripMessageCacheControl(messages, &excess)
 	}
 	if excess <= 0 {
-		return payload
+		return marshalPayloadObject(payload, root)
 	}
 
-	// Phase 4: strip any remaining system cache_control blocks.
-	system = gjson.GetBytes(payload, "system")
-	if system.IsArray() {
-		idx := 0
-		system.ForEach(func(_, item gjson.Result) bool {
-			if excess <= 0 {
-				return false
-			}
-			if item.Get("cache_control").Exists() {
-				payload, _ = sjson.DeleteBytes(payload, fmt.Sprintf("system.%d.cache_control", idx))
-				excess--
-			}
-			idx++
-			return true
-		})
+	if len(system) > 0 {
+		stripAllCacheControl(system, &excess)
 	}
 	if excess <= 0 {
-		return payload
+		return marshalPayloadObject(payload, root)
 	}
 
-	// Phase 5: strip any remaining tool cache_control blocks (including the last tool).
-	tools = gjson.GetBytes(payload, "tools")
-	if tools.IsArray() {
-		idx := 0
-		tools.ForEach(func(_, tool gjson.Result) bool {
-			if excess <= 0 {
-				return false
-			}
-			if tool.Get("cache_control").Exists() {
-				payload, _ = sjson.DeleteBytes(payload, fmt.Sprintf("tools.%d.cache_control", idx))
-				excess--
-			}
-			idx++
-			return true
-		})
+	if len(tools) > 0 {
+		stripAllCacheControl(tools, &excess)
 	}
 
-	return payload
+	return marshalPayloadObject(payload, root)
 }
 
 // injectMessagesCacheControl adds cache_control to the second-to-last user turn for multi-turn caching.

From a8a5d03c33609f05703114ec7a27e8a455761de2 Mon Sep 17 00:00:00 2001
From: hkfires <10558748+hkfires@users.noreply.github.com>
Date: Sun, 1 Mar 2026 12:42:59 +0800
Subject: [PATCH 7/7] chore: ignore .idea directory in git and docker builds

---
 .dockerignore | 1 +
 .gitignore    | 1 +
 2 files changed, 2 insertions(+)

diff --git a/.dockerignore b/.dockerignore
index ef021aea..843c7e04 100644
--- a/.dockerignore
+++ b/.dockerignore
@@ -31,6 +31,7 @@ bin/*
 .agent/*
 .agents/*
 .opencode/*
+.idea/*
 .bmad/*
 _bmad/*
 _bmad-output/*
diff --git a/.gitignore b/.gitignore
index 183138f9..90ff3a94 100644
--- a/.gitignore
+++ b/.gitignore
@@ -41,6 +41,7 @@ GEMINI.md
 .agents/*
 .agents/*
 .opencode/*
+.idea/*
 .bmad/*
 _bmad/*
 _bmad-output/*