Merge branch 'router-for-me:main' into main

refactor(claude): replace strings.Builder with simpler output string concatenation
Merge pull request #488 from router-for-me/gemini
2026-03-11 00:03:36 +00:00 · 2025-12-12 02:41:05 +08:00 · 2025-12-11 22:34:06 +08:00 · 2025-12-11 22:14:17 +08:00 · 2025-12-11 21:56:44 +08:00 · 2025-12-11 21:56:44 +08:00
25 changed files with 1411 additions and 617 deletions
--- a/config.example.yaml
+++ b/config.example.yaml
@@ -105,7 +105,7 @@ ws-auth: false
 #     excluded-models:
 #       - "claude-opus-4-5-20251101" # exclude specific models (exact match)
 #       - "claude-3-*"               # wildcard matching prefix (e.g. claude-3-7-sonnet-20250219)
-#       - "*-think"                  # wildcard matching suffix (e.g. claude-opus-4-5-thinking)
+#       - "*-thinking"               # wildcard matching suffix (e.g. claude-opus-4-5-thinking)
 #       - "*haiku*"                  # wildcard matching substring (e.g. claude-3-5-haiku-20241022)

 # Kiro (AWS CodeWhisperer) configuration
--- a/internal/registry/model_definitions.go
+++ b/internal/registry/model_definitions.go
@@ -16,6 +16,7 @@ func GetClaudeModels() []*ModelInfo {
 			DisplayName:         "Claude 4.5 Haiku",
 			ContextLength:       200000,
 			MaxCompletionTokens: 64000,
+			// Thinking: not supported for Haiku models
 		},
 		{
 			ID:                  "claude-sonnet-4-5-20250929",
@@ -49,6 +50,7 @@ func GetClaudeModels() []*ModelInfo {
 			DisplayName:         "Claude 4.1 Opus",
 			ContextLength:       200000,
 			MaxCompletionTokens: 32000,
+			Thinking:            &ThinkingSupport{Min: 1024, Max: 100000, ZeroAllowed: false, DynamicAllowed: true},
 		},
 		{
 			ID:                  "claude-opus-4-20250514",
@@ -59,6 +61,7 @@ func GetClaudeModels() []*ModelInfo {
 			DisplayName:         "Claude 4 Opus",
 			ContextLength:       200000,
 			MaxCompletionTokens: 32000,
+			Thinking:            &ThinkingSupport{Min: 1024, Max: 100000, ZeroAllowed: false, DynamicAllowed: true},
 		},
 		{
 			ID:                  "claude-sonnet-4-20250514",
@@ -69,6 +72,7 @@ func GetClaudeModels() []*ModelInfo {
 			DisplayName:         "Claude 4 Sonnet",
 			ContextLength:       200000,
 			MaxCompletionTokens: 64000,
+			Thinking:            &ThinkingSupport{Min: 1024, Max: 100000, ZeroAllowed: false, DynamicAllowed: true},
 		},
 		{
 			ID:                  "claude-3-7-sonnet-20250219",
@@ -79,6 +83,7 @@ func GetClaudeModels() []*ModelInfo {
 			DisplayName:         "Claude 3.7 Sonnet",
 			ContextLength:       128000,
 			MaxCompletionTokens: 8192,
+			Thinking:            &ThinkingSupport{Min: 1024, Max: 100000, ZeroAllowed: false, DynamicAllowed: true},
 		},
 		{
 			ID:                  "claude-3-5-haiku-20241022",
@@ -89,6 +94,7 @@ func GetClaudeModels() []*ModelInfo {
 			DisplayName:         "Claude 3.5 Haiku",
 			ContextLength:       128000,
 			MaxCompletionTokens: 8192,
+			// Thinking: not supported for Haiku models
 		},
 	}
 }
@@ -476,6 +482,7 @@ func GetOpenAIModels() []*ModelInfo {
 			ContextLength:       400000,
 			MaxCompletionTokens: 128000,
 			SupportedParameters: []string{"tools"},
+			Thinking:            &ThinkingSupport{Levels: []string{"minimal", "low", "medium", "high"}},
 		},
 		{
 			ID:                  "gpt-5-codex",
@@ -489,6 +496,7 @@ func GetOpenAIModels() []*ModelInfo {
 			ContextLength:       400000,
 			MaxCompletionTokens: 128000,
 			SupportedParameters: []string{"tools"},
+			Thinking:            &ThinkingSupport{Levels: []string{"low", "medium", "high"}},
 		},
 		{
 			ID:                  "gpt-5-codex-mini",
@@ -502,6 +510,7 @@ func GetOpenAIModels() []*ModelInfo {
 			ContextLength:       400000,
 			MaxCompletionTokens: 128000,
 			SupportedParameters: []string{"tools"},
+			Thinking:            &ThinkingSupport{Levels: []string{"low", "medium", "high"}},
 		},
 		{
 			ID:                  "gpt-5.1",
@@ -515,6 +524,7 @@ func GetOpenAIModels() []*ModelInfo {
 			ContextLength:       400000,
 			MaxCompletionTokens: 128000,
 			SupportedParameters: []string{"tools"},
+			Thinking:            &ThinkingSupport{Levels: []string{"none", "low", "medium", "high"}},
 		},
 		{
 			ID:                  "gpt-5.1-codex",
@@ -528,6 +538,7 @@ func GetOpenAIModels() []*ModelInfo {
 			ContextLength:       400000,
 			MaxCompletionTokens: 128000,
 			SupportedParameters: []string{"tools"},
+			Thinking:            &ThinkingSupport{Levels: []string{"low", "medium", "high"}},
 		},
 		{
 			ID:                  "gpt-5.1-codex-mini",
@@ -541,6 +552,7 @@ func GetOpenAIModels() []*ModelInfo {
 			ContextLength:       400000,
 			MaxCompletionTokens: 128000,
 			SupportedParameters: []string{"tools"},
+			Thinking:            &ThinkingSupport{Levels: []string{"low", "medium", "high"}},
 		},
 		{
 			ID:                  "gpt-5.1-codex-max",
@@ -554,6 +566,7 @@ func GetOpenAIModels() []*ModelInfo {
 			ContextLength:       400000,
 			MaxCompletionTokens: 128000,
 			SupportedParameters: []string{"tools"},
+			Thinking:            &ThinkingSupport{Levels: []string{"low", "medium", "high", "xhigh"}},
 		},
 	}
 }
@@ -610,6 +623,7 @@ func GetIFlowModels() []*ModelInfo {
 		DisplayName string
 		Description string
 		Created     int64
+		Thinking    *ThinkingSupport
 	}{
 		{ID: "tstars2.0", DisplayName: "TStars-2.0", Description: "iFlow TStars-2.0 multimodal assistant", Created: 1746489600},
 		{ID: "qwen3-coder-plus", DisplayName: "Qwen3-Coder-Plus", Description: "Qwen3 Coder Plus code generation", Created: 1753228800},
@@ -619,17 +633,17 @@ func GetIFlowModels() []*ModelInfo {
 		{ID: "kimi-k2-0905", DisplayName: "Kimi-K2-Instruct-0905", Description: "Moonshot Kimi K2 instruct 0905", Created: 1757030400},
 		{ID: "glm-4.6", DisplayName: "GLM-4.6", Description: "Zhipu GLM 4.6 general model", Created: 1759190400},
 		{ID: "kimi-k2", DisplayName: "Kimi-K2", Description: "Moonshot Kimi K2 general model", Created: 1752192000},
-		{ID: "kimi-k2-thinking", DisplayName: "Kimi-K2-Thinking", Description: "Moonshot Kimi K2 general model", Created: 1762387200},
+		{ID: "kimi-k2-thinking", DisplayName: "Kimi-K2-Thinking", Description: "Moonshot Kimi K2 thinking model", Created: 1762387200, Thinking: &ThinkingSupport{Levels: []string{"low", "medium", "high"}}},
 		{ID: "deepseek-v3.2-chat", DisplayName: "DeepSeek-V3.2", Description: "DeepSeek V3.2", Created: 1764576000},
 		{ID: "deepseek-v3.2", DisplayName: "DeepSeek-V3.2-Exp", Description: "DeepSeek V3.2 experimental", Created: 1759104000},
 		{ID: "deepseek-v3.1", DisplayName: "DeepSeek-V3.1-Terminus", Description: "DeepSeek V3.1 Terminus", Created: 1756339200},
-		{ID: "deepseek-r1", DisplayName: "DeepSeek-R1", Description: "DeepSeek reasoning model R1", Created: 1737331200},
+		{ID: "deepseek-r1", DisplayName: "DeepSeek-R1", Description: "DeepSeek reasoning model R1", Created: 1737331200, Thinking: &ThinkingSupport{Levels: []string{"low", "medium", "high"}}},
 		{ID: "deepseek-v3", DisplayName: "DeepSeek-V3-671B", Description: "DeepSeek V3 671B", Created: 1734307200},
 		{ID: "qwen3-32b", DisplayName: "Qwen3-32B", Description: "Qwen3 32B", Created: 1747094400},
-		{ID: "qwen3-235b-a22b-thinking-2507", DisplayName: "Qwen3-235B-A22B-Thinking", Description: "Qwen3 235B A22B Thinking (2507)", Created: 1753401600},
+		{ID: "qwen3-235b-a22b-thinking-2507", DisplayName: "Qwen3-235B-A22B-Thinking", Description: "Qwen3 235B A22B Thinking (2507)", Created: 1753401600, Thinking: &ThinkingSupport{Levels: []string{"low", "medium", "high"}}},
 		{ID: "qwen3-235b-a22b-instruct", DisplayName: "Qwen3-235B-A22B-Instruct", Description: "Qwen3 235B A22B Instruct", Created: 1753401600},
 		{ID: "qwen3-235b", DisplayName: "Qwen3-235B-A22B", Description: "Qwen3 235B A22B", Created: 1753401600},
-		{ID: "minimax-m2", DisplayName: "MiniMax-M2", Description: "MiniMax M2", Created: 1758672000},
+		{ID: "minimax-m2", DisplayName: "MiniMax-M2", Description: "MiniMax M2", Created: 1758672000, Thinking: &ThinkingSupport{Levels: []string{"low", "medium", "high"}}},
 	}
 	models := make([]*ModelInfo, 0, len(entries))
 	for _, entry := range entries {
@@ -641,6 +655,7 @@ func GetIFlowModels() []*ModelInfo {
 			Type:        "iflow",
 			DisplayName: entry.DisplayName,
 			Description: entry.Description,
+			Thinking:    entry.Thinking,
 		})
 	}
 	return models
--- a/internal/registry/model_registry.go
+++ b/internal/registry/model_registry.go
@@ -63,6 +63,9 @@ type ThinkingSupport struct {
 	ZeroAllowed bool `json:"zero_allowed,omitempty"`
 	// DynamicAllowed indicates whether -1 is a valid value (dynamic thinking budget).
 	DynamicAllowed bool `json:"dynamic_allowed,omitempty"`
+	// Levels defines discrete reasoning effort levels (e.g., "low", "medium", "high").
+	// When set, the model uses level-based reasoning instead of token budgets.
+	Levels []string `json:"levels,omitempty"`
 }

 // ModelRegistration tracks a model's availability
--- a/internal/runtime/executor/aistudio_executor.go
+++ b/internal/runtime/executor/aistudio_executor.go
@@ -1,3 +1,6 @@
+// Package executor provides runtime execution capabilities for various AI service providers.
+// This file implements the AI Studio executor that routes requests through a websocket-backed
+// transport for the AI Studio provider.
 package executor

 import (
@@ -26,19 +29,28 @@ type AIStudioExecutor struct {
 	cfg      *config.Config
 }

-// NewAIStudioExecutor constructs a websocket executor for the provider name.
+// NewAIStudioExecutor creates a new AI Studio executor instance.
+//
+// Parameters:
+//   - cfg: The application configuration
+//   - provider: The provider name
+//   - relay: The websocket relay manager
+//
+// Returns:
+//   - *AIStudioExecutor: A new AI Studio executor instance
 func NewAIStudioExecutor(cfg *config.Config, provider string, relay *wsrelay.Manager) *AIStudioExecutor {
 	return &AIStudioExecutor{provider: strings.ToLower(provider), relay: relay, cfg: cfg}
 }

-// Identifier returns the logical provider key for routing.
+// Identifier returns the executor identifier.
 func (e *AIStudioExecutor) Identifier() string { return "aistudio" }

-// PrepareRequest is a no-op because websocket transport already injects headers.
+// PrepareRequest prepares the HTTP request for execution (no-op for AI Studio).
 func (e *AIStudioExecutor) PrepareRequest(_ *http.Request, _ *cliproxyauth.Auth) error {
 	return nil
 }

+// Execute performs a non-streaming request to the AI Studio API.
 func (e *AIStudioExecutor) Execute(ctx context.Context, auth *cliproxyauth.Auth, req cliproxyexecutor.Request, opts cliproxyexecutor.Options) (resp cliproxyexecutor.Response, err error) {
 	reporter := newUsageReporter(ctx, e.Identifier(), req.Model, auth)
 	defer reporter.trackFailure(ctx, &err)
@@ -92,6 +104,7 @@ func (e *AIStudioExecutor) Execute(ctx context.Context, auth *cliproxyauth.Auth,
 	return resp, nil
 }

+// ExecuteStream performs a streaming request to the AI Studio API.
 func (e *AIStudioExecutor) ExecuteStream(ctx context.Context, auth *cliproxyauth.Auth, req cliproxyexecutor.Request, opts cliproxyexecutor.Options) (stream <-chan cliproxyexecutor.StreamChunk, err error) {
 	reporter := newUsageReporter(ctx, e.Identifier(), req.Model, auth)
 	defer reporter.trackFailure(ctx, &err)
@@ -239,6 +252,7 @@ func (e *AIStudioExecutor) ExecuteStream(ctx context.Context, auth *cliproxyauth
 	return stream, nil
 }

+// CountTokens counts tokens for the given request using the AI Studio API.
 func (e *AIStudioExecutor) CountTokens(ctx context.Context, auth *cliproxyauth.Auth, req cliproxyexecutor.Request, opts cliproxyexecutor.Options) (cliproxyexecutor.Response, error) {
 	_, body, err := e.translateRequest(req, opts, false)
 	if err != nil {
@@ -293,8 +307,8 @@ func (e *AIStudioExecutor) CountTokens(ctx context.Context, auth *cliproxyauth.A
 	return cliproxyexecutor.Response{Payload: []byte(translated)}, nil
 }

-func (e *AIStudioExecutor) Refresh(ctx context.Context, auth *cliproxyauth.Auth) (*cliproxyauth.Auth, error) {
-	_ = ctx
+// Refresh refreshes the authentication credentials (no-op for AI Studio).
+func (e *AIStudioExecutor) Refresh(_ context.Context, auth *cliproxyauth.Auth) (*cliproxyauth.Auth, error) {
 	return auth, nil
 }

--- a/internal/runtime/executor/antigravity_executor.go
+++ b/internal/runtime/executor/antigravity_executor.go
@@ -1,3 +1,6 @@
+// Package executor provides runtime execution capabilities for various AI service providers.
+// This file implements the Antigravity executor that proxies requests to the antigravity
+// upstream using OAuth credentials.
 package executor

 import (
@@ -30,16 +33,15 @@ import (
 const (
 	antigravityBaseURLDaily = "https://daily-cloudcode-pa.sandbox.googleapis.com"
 	// antigravityBaseURLAutopush     = "https://autopush-cloudcode-pa.sandbox.googleapis.com"
-	antigravityBaseURLProd      = "https://cloudcode-pa.googleapis.com"
-	antigravityStreamPath       = "/v1internal:streamGenerateContent"
-	antigravityGeneratePath     = "/v1internal:generateContent"
-	antigravityModelsPath       = "/v1internal:fetchAvailableModels"
-	antigravityClientID         = "1071006060591-tmhssin2h21lcre235vtolojh4g403ep.apps.googleusercontent.com"
-	antigravityClientSecret     = "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"
-	defaultAntigravityAgent     = "antigravity/1.11.5 windows/amd64"
-	antigravityAuthType         = "antigravity"
-	refreshSkew                 = 3000 * time.Second
-	streamScannerBuffer     int = 20_971_520
+	antigravityBaseURLProd  = "https://cloudcode-pa.googleapis.com"
+	antigravityStreamPath   = "/v1internal:streamGenerateContent"
+	antigravityGeneratePath = "/v1internal:generateContent"
+	antigravityModelsPath   = "/v1internal:fetchAvailableModels"
+	antigravityClientID     = "1071006060591-tmhssin2h21lcre235vtolojh4g403ep.apps.googleusercontent.com"
+	antigravityClientSecret = "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"
+	defaultAntigravityAgent = "antigravity/1.11.5 windows/amd64"
+	antigravityAuthType     = "antigravity"
+	refreshSkew             = 3000 * time.Second
 )

 var (
@@ -52,18 +54,24 @@ type AntigravityExecutor struct {
 	cfg *config.Config
 }

-// NewAntigravityExecutor constructs a new executor instance.
+// NewAntigravityExecutor creates a new Antigravity executor instance.
+//
+// Parameters:
+//   - cfg: The application configuration
+//
+// Returns:
+//   - *AntigravityExecutor: A new Antigravity executor instance
 func NewAntigravityExecutor(cfg *config.Config) *AntigravityExecutor {
 	return &AntigravityExecutor{cfg: cfg}
 }

-// Identifier implements ProviderExecutor.
+// Identifier returns the executor identifier.
 func (e *AntigravityExecutor) Identifier() string { return antigravityAuthType }

-// PrepareRequest implements ProviderExecutor.
+// PrepareRequest prepares the HTTP request for execution (no-op for Antigravity).
 func (e *AntigravityExecutor) PrepareRequest(_ *http.Request, _ *cliproxyauth.Auth) error { return nil }

-// Execute handles non-streaming requests via the antigravity generate endpoint.
+// Execute performs a non-streaming request to the Antigravity API.
 func (e *AntigravityExecutor) Execute(ctx context.Context, auth *cliproxyauth.Auth, req cliproxyexecutor.Request, opts cliproxyexecutor.Options) (resp cliproxyexecutor.Response, err error) {
 	token, updatedAuth, errToken := e.ensureAccessToken(ctx, auth)
 	if errToken != nil {
@@ -156,7 +164,7 @@ func (e *AntigravityExecutor) Execute(ctx context.Context, auth *cliproxyauth.Au
 	return resp, err
 }

-// ExecuteStream handles streaming requests via the antigravity upstream.
+// ExecuteStream performs a streaming request to the Antigravity API.
 func (e *AntigravityExecutor) ExecuteStream(ctx context.Context, auth *cliproxyauth.Auth, req cliproxyexecutor.Request, opts cliproxyexecutor.Options) (stream <-chan cliproxyexecutor.StreamChunk, err error) {
 	ctx = context.WithValue(ctx, "alt", "")

@@ -296,7 +304,7 @@ func (e *AntigravityExecutor) ExecuteStream(ctx context.Context, auth *cliproxya
 	return nil, err
 }

-// Refresh refreshes the OAuth token using the refresh token.
+// Refresh refreshes the authentication credentials using the refresh token.
 func (e *AntigravityExecutor) Refresh(ctx context.Context, auth *cliproxyauth.Auth) (*cliproxyauth.Auth, error) {
 	if auth == nil {
 		return auth, nil
@@ -308,7 +316,7 @@ func (e *AntigravityExecutor) Refresh(ctx context.Context, auth *cliproxyauth.Au
 	return updated, nil
 }

-// CountTokens is not supported for the antigravity provider.
+// CountTokens counts tokens for the given request (not supported for Antigravity).
 func (e *AntigravityExecutor) CountTokens(context.Context, *cliproxyauth.Auth, cliproxyexecutor.Request, cliproxyexecutor.Options) (cliproxyexecutor.Response, error) {
 	return cliproxyexecutor.Response{}, statusErr{code: http.StatusNotImplemented, msg: "count tokens not supported"}
 }
--- a/internal/runtime/executor/claude_executor.go
+++ b/internal/runtime/executor/claude_executor.go
@@ -254,7 +254,7 @@ func (e *ClaudeExecutor) ExecuteStream(ctx context.Context, auth *cliproxyauth.A
 		// If from == to (Claude → Claude), directly forward the SSE stream without translation
 		if from == to {
 			scanner := bufio.NewScanner(decodedBody)
-			scanner.Buffer(nil, 20_971_520)
+			scanner.Buffer(nil, 52_428_800) // 50MB
 			for scanner.Scan() {
 				line := scanner.Bytes()
 				appendAPIResponseChunk(ctx, e.cfg, line)
@@ -277,7 +277,7 @@ func (e *ClaudeExecutor) ExecuteStream(ctx context.Context, auth *cliproxyauth.A

 		// For other formats, use translation
 		scanner := bufio.NewScanner(decodedBody)
-		scanner.Buffer(nil, 20_971_520)
+		scanner.Buffer(nil, 52_428_800) // 50MB
 		var param any
 		for scanner.Scan() {
 			line := scanner.Bytes()
@@ -450,59 +450,15 @@ func extractAndRemoveBetas(body []byte) ([]string, []byte) {
 	return betas, body
 }

-// injectThinkingConfig adds thinking configuration based on metadata or legacy suffixes.
+// injectThinkingConfig adds thinking configuration based on metadata using the unified flow.
+// It uses util.ResolveClaudeThinkingConfig which internally calls ResolveThinkingConfigFromMetadata
+// and NormalizeThinkingBudget, ensuring consistency with other executors like Gemini.
 func (e *ClaudeExecutor) injectThinkingConfig(modelName string, metadata map[string]any, body []byte) []byte {
-	// Only inject if thinking config is not already present
-	if gjson.GetBytes(body, "thinking").Exists() {
+	budget, ok := util.ResolveClaudeThinkingConfig(modelName, metadata)
+	if !ok {
 		return body
 	}
-
-	budgetTokens, ok := resolveClaudeThinkingBudget(modelName, metadata)
-	if !ok || budgetTokens <= 0 {
-		return body
-	}
-
-	body, _ = sjson.SetBytes(body, "thinking.type", "enabled")
-	body, _ = sjson.SetBytes(body, "thinking.budget_tokens", budgetTokens)
-	return body
-}
-
-func resolveClaudeThinkingBudget(modelName string, metadata map[string]any) (int, bool) {
-	budget, include, effort, matched := util.ThinkingFromMetadata(metadata)
-	if matched {
-		if include != nil && !*include {
-			return 0, false
-		}
-		if budget != nil {
-			normalized := util.NormalizeThinkingBudget(modelName, *budget)
-			if normalized > 0 {
-				return normalized, true
-			}
-			return 0, false
-		}
-		if effort != nil {
-			if derived, ok := util.ThinkingEffortToBudget(modelName, *effort); ok && derived > 0 {
-				return derived, true
-			}
-		}
-	}
-	return claudeBudgetFromSuffix(modelName)
-}
-
-func claudeBudgetFromSuffix(modelName string) (int, bool) {
-	lower := strings.ToLower(strings.TrimSpace(modelName))
-	switch {
-	case strings.HasSuffix(lower, "-thinking-low"):
-		return 1024, true
-	case strings.HasSuffix(lower, "-thinking-medium"):
-		return 8192, true
-	case strings.HasSuffix(lower, "-thinking-high"):
-		return 24576, true
-	case strings.HasSuffix(lower, "-thinking"):
-		return 8192, true
-	default:
-		return 0, false
-	}
+	return util.ApplyClaudeThinkingConfig(body, budget)
 }

 // ensureMaxTokensForThinking ensures max_tokens > thinking.budget_tokens when thinking is enabled.
--- a/internal/runtime/executor/codex_executor.go
+++ b/internal/runtime/executor/codex_executor.go
@@ -54,7 +54,11 @@ func (e *CodexExecutor) Execute(ctx context.Context, auth *cliproxyauth.Auth, re
 	from := opts.SourceFormat
 	to := sdktranslator.FromString("codex")
 	body := sdktranslator.TranslateRequest(from, to, req.Model, bytes.Clone(req.Payload), false)
-	body = applyReasoningEffortMetadata(body, req.Metadata, req.Model)
+	body = applyReasoningEffortMetadata(body, req.Metadata, req.Model, "reasoning.effort")
+	body = normalizeThinkingConfig(body, upstreamModel)
+	if errValidate := validateThinkingConfig(body, upstreamModel); errValidate != nil {
+		return resp, errValidate
+	}
 	body = applyPayloadConfig(e.cfg, req.Model, body)
 	body, _ = sjson.SetBytes(body, "model", upstreamModel)
 	body, _ = sjson.SetBytes(body, "stream", true)
@@ -148,7 +152,11 @@ func (e *CodexExecutor) ExecuteStream(ctx context.Context, auth *cliproxyauth.Au
 	to := sdktranslator.FromString("codex")
 	body := sdktranslator.TranslateRequest(from, to, req.Model, bytes.Clone(req.Payload), true)

-	body = applyReasoningEffortMetadata(body, req.Metadata, req.Model)
+	body = applyReasoningEffortMetadata(body, req.Metadata, req.Model, "reasoning.effort")
+	body = normalizeThinkingConfig(body, upstreamModel)
+	if errValidate := validateThinkingConfig(body, upstreamModel); errValidate != nil {
+		return nil, errValidate
+	}
 	body = applyPayloadConfig(e.cfg, req.Model, body)
 	body, _ = sjson.DeleteBytes(body, "previous_response_id")
 	body, _ = sjson.SetBytes(body, "model", upstreamModel)
@@ -208,7 +216,7 @@ func (e *CodexExecutor) ExecuteStream(ctx context.Context, auth *cliproxyauth.Au
 			}
 		}()
 		scanner := bufio.NewScanner(httpResp.Body)
-		scanner.Buffer(nil, 20_971_520)
+		scanner.Buffer(nil, 52_428_800) // 50MB
 		var param any
 		for scanner.Scan() {
 			line := scanner.Bytes()
@@ -246,7 +254,7 @@ func (e *CodexExecutor) CountTokens(ctx context.Context, auth *cliproxyauth.Auth

 	modelForCounting := req.Model

-	body = applyReasoningEffortMetadata(body, req.Metadata, req.Model)
+	body = applyReasoningEffortMetadata(body, req.Metadata, req.Model, "reasoning.effort")
 	body, _ = sjson.SetBytes(body, "model", upstreamModel)
 	body, _ = sjson.DeleteBytes(body, "previous_response_id")
 	body, _ = sjson.SetBytes(body, "stream", false)
--- a/internal/runtime/executor/gemini_cli_executor.go
+++ b/internal/runtime/executor/gemini_cli_executor.go
@@ -1,3 +1,6 @@
+// Package executor provides runtime execution capabilities for various AI service providers.
+// This file implements the Gemini CLI executor that talks to Cloud Code Assist endpoints
+// using OAuth credentials from auth metadata.
 package executor

 import (
@@ -29,11 +32,11 @@ import (
 const (
 	codeAssistEndpoint      = "https://cloudcode-pa.googleapis.com"
 	codeAssistVersion       = "v1internal"
-	geminiOauthClientID     = "681255809395-oo8ft2oprdrnp9e3aqf6av3hmdib135j.apps.googleusercontent.com"
-	geminiOauthClientSecret = "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"
+	geminiOAuthClientID     = "681255809395-oo8ft2oprdrnp9e3aqf6av3hmdib135j.apps.googleusercontent.com"
+	geminiOAuthClientSecret = "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"
 )

-var geminiOauthScopes = []string{
+var geminiOAuthScopes = []string{
 	"https://www.googleapis.com/auth/cloud-platform",
 	"https://www.googleapis.com/auth/userinfo.email",
 	"https://www.googleapis.com/auth/userinfo.profile",
@@ -44,14 +47,24 @@ type GeminiCLIExecutor struct {
 	cfg *config.Config
 }

+// NewGeminiCLIExecutor creates a new Gemini CLI executor instance.
+//
+// Parameters:
+//   - cfg: The application configuration
+//
+// Returns:
+//   - *GeminiCLIExecutor: A new Gemini CLI executor instance
 func NewGeminiCLIExecutor(cfg *config.Config) *GeminiCLIExecutor {
 	return &GeminiCLIExecutor{cfg: cfg}
 }

+// Identifier returns the executor identifier.
 func (e *GeminiCLIExecutor) Identifier() string { return "gemini-cli" }

+// PrepareRequest prepares the HTTP request for execution (no-op for Gemini CLI).
 func (e *GeminiCLIExecutor) PrepareRequest(_ *http.Request, _ *cliproxyauth.Auth) error { return nil }

+// Execute performs a non-streaming request to the Gemini CLI API.
 func (e *GeminiCLIExecutor) Execute(ctx context.Context, auth *cliproxyauth.Auth, req cliproxyexecutor.Request, opts cliproxyexecutor.Options) (resp cliproxyexecutor.Response, err error) {
 	tokenSource, baseTokenData, err := prepareGeminiCLITokenSource(ctx, e.cfg, auth)
 	if err != nil {
@@ -189,6 +202,7 @@ func (e *GeminiCLIExecutor) Execute(ctx context.Context, auth *cliproxyauth.Auth
 	return resp, err
 }

+// ExecuteStream performs a streaming request to the Gemini CLI API.
 func (e *GeminiCLIExecutor) ExecuteStream(ctx context.Context, auth *cliproxyauth.Auth, req cliproxyexecutor.Request, opts cliproxyexecutor.Options) (stream <-chan cliproxyexecutor.StreamChunk, err error) {
 	tokenSource, baseTokenData, err := prepareGeminiCLITokenSource(ctx, e.cfg, auth)
 	if err != nil {
@@ -309,7 +323,7 @@ func (e *GeminiCLIExecutor) ExecuteStream(ctx context.Context, auth *cliproxyaut
 			}()
 			if opts.Alt == "" {
 				scanner := bufio.NewScanner(resp.Body)
-				scanner.Buffer(nil, 20_971_520)
+				scanner.Buffer(nil, streamScannerBuffer)
 				var param any
 				for scanner.Scan() {
 					line := scanner.Bytes()
@@ -371,6 +385,7 @@ func (e *GeminiCLIExecutor) ExecuteStream(ctx context.Context, auth *cliproxyaut
 	return nil, err
 }

+// CountTokens counts tokens for the given request using the Gemini CLI API.
 func (e *GeminiCLIExecutor) CountTokens(ctx context.Context, auth *cliproxyauth.Auth, req cliproxyexecutor.Request, opts cliproxyexecutor.Options) (cliproxyexecutor.Response, error) {
 	tokenSource, baseTokenData, err := prepareGeminiCLITokenSource(ctx, e.cfg, auth)
 	if err != nil {
@@ -471,9 +486,8 @@ func (e *GeminiCLIExecutor) CountTokens(ctx context.Context, auth *cliproxyauth.
 	return cliproxyexecutor.Response{}, newGeminiStatusErr(lastStatus, lastBody)
 }

-func (e *GeminiCLIExecutor) Refresh(ctx context.Context, auth *cliproxyauth.Auth) (*cliproxyauth.Auth, error) {
-	log.Debugf("gemini cli executor: refresh called")
-	_ = ctx
+// Refresh refreshes the authentication credentials (no-op for Gemini CLI).
+func (e *GeminiCLIExecutor) Refresh(_ context.Context, auth *cliproxyauth.Auth) (*cliproxyauth.Auth, error) {
 	return auth, nil
 }

@@ -515,9 +529,9 @@ func prepareGeminiCLITokenSource(ctx context.Context, cfg *config.Config, auth *
 	}

 	conf := &oauth2.Config{
-		ClientID:     geminiOauthClientID,
-		ClientSecret: geminiOauthClientSecret,
-		Scopes:       geminiOauthScopes,
+		ClientID:     geminiOAuthClientID,
+		ClientSecret: geminiOAuthClientSecret,
+		Scopes:       geminiOAuthScopes,
 		Endpoint:     google.Endpoint,
 	}

--- a/internal/runtime/executor/gemini_executor.go
+++ b/internal/runtime/executor/gemini_executor.go
@@ -11,7 +11,6 @@ import (
 	"io"
 	"net/http"
 	"strings"
-	"time"

 	"github.com/router-for-me/CLIProxyAPI/v6/internal/config"
 	"github.com/router-for-me/CLIProxyAPI/v6/internal/util"
@@ -21,8 +20,6 @@ import (
 	log "github.com/sirupsen/logrus"
 	"github.com/tidwall/gjson"
 	"github.com/tidwall/sjson"
-	"golang.org/x/oauth2"
-	"golang.org/x/oauth2/google"
 )

 const (
@@ -31,6 +28,9 @@ const (

 	// glAPIVersion is the API version used for Gemini requests.
 	glAPIVersion = "v1beta"
+
+	// streamScannerBuffer is the buffer size for SSE stream scanning.
+	streamScannerBuffer = 52_428_800
 )

 // GeminiExecutor is a stateless executor for the official Gemini API using API keys.
@@ -48,9 +48,11 @@ type GeminiExecutor struct {
 //
 // Returns:
 //   - *GeminiExecutor: A new Gemini executor instance
-func NewGeminiExecutor(cfg *config.Config) *GeminiExecutor { return &GeminiExecutor{cfg: cfg} }
+func NewGeminiExecutor(cfg *config.Config) *GeminiExecutor {
+	return &GeminiExecutor{cfg: cfg}
+}

-// Identifier returns the executor identifier for Gemini.
+// Identifier returns the executor identifier.
 func (e *GeminiExecutor) Identifier() string { return "gemini" }

 // PrepareRequest prepares the HTTP request for execution (no-op for Gemini).
@@ -164,6 +166,7 @@ func (e *GeminiExecutor) Execute(ctx context.Context, auth *cliproxyauth.Auth, r
 	return resp, nil
 }

+// ExecuteStream performs a streaming request to the Gemini API.
 func (e *GeminiExecutor) ExecuteStream(ctx context.Context, auth *cliproxyauth.Auth, req cliproxyexecutor.Request, opts cliproxyexecutor.Options) (stream <-chan cliproxyexecutor.StreamChunk, err error) {
 	apiKey, bearer := geminiCreds(auth)

@@ -249,7 +252,7 @@ func (e *GeminiExecutor) ExecuteStream(ctx context.Context, auth *cliproxyauth.A
 			}
 		}()
 		scanner := bufio.NewScanner(httpResp.Body)
-		scanner.Buffer(nil, 20_971_520)
+		scanner.Buffer(nil, streamScannerBuffer)
 		var param any
 		for scanner.Scan() {
 			line := scanner.Bytes()
@@ -280,6 +283,7 @@ func (e *GeminiExecutor) ExecuteStream(ctx context.Context, auth *cliproxyauth.A
 	return stream, nil
 }

+// CountTokens counts tokens for the given request using the Gemini API.
 func (e *GeminiExecutor) CountTokens(ctx context.Context, auth *cliproxyauth.Auth, req cliproxyexecutor.Request, opts cliproxyexecutor.Options) (cliproxyexecutor.Response, error) {
 	apiKey, bearer := geminiCreds(auth)

@@ -353,106 +357,8 @@ func (e *GeminiExecutor) CountTokens(ctx context.Context, auth *cliproxyauth.Aut
 	return cliproxyexecutor.Response{Payload: []byte(translated)}, nil
 }

-func (e *GeminiExecutor) Refresh(ctx context.Context, auth *cliproxyauth.Auth) (*cliproxyauth.Auth, error) {
-	log.Debugf("gemini executor: refresh called")
-	// OAuth bearer token refresh for official Gemini API.
-	if auth == nil {
-		return nil, fmt.Errorf("gemini executor: auth is nil")
-	}
-	if auth.Metadata == nil {
-		return auth, nil
-	}
-	// Token data is typically nested under "token" map in Gemini files.
-	tokenMap, _ := auth.Metadata["token"].(map[string]any)
-	var refreshToken, accessToken, clientID, clientSecret, tokenURI, expiryStr string
-	if tokenMap != nil {
-		if v, ok := tokenMap["refresh_token"].(string); ok {
-			refreshToken = v
-		}
-		if v, ok := tokenMap["access_token"].(string); ok {
-			accessToken = v
-		}
-		if v, ok := tokenMap["client_id"].(string); ok {
-			clientID = v
-		}
-		if v, ok := tokenMap["client_secret"].(string); ok {
-			clientSecret = v
-		}
-		if v, ok := tokenMap["token_uri"].(string); ok {
-			tokenURI = v
-		}
-		if v, ok := tokenMap["expiry"].(string); ok {
-			expiryStr = v
-		}
-	} else {
-		// Fallback to top-level keys if present
-		if v, ok := auth.Metadata["refresh_token"].(string); ok {
-			refreshToken = v
-		}
-		if v, ok := auth.Metadata["access_token"].(string); ok {
-			accessToken = v
-		}
-		if v, ok := auth.Metadata["client_id"].(string); ok {
-			clientID = v
-		}
-		if v, ok := auth.Metadata["client_secret"].(string); ok {
-			clientSecret = v
-		}
-		if v, ok := auth.Metadata["token_uri"].(string); ok {
-			tokenURI = v
-		}
-		if v, ok := auth.Metadata["expiry"].(string); ok {
-			expiryStr = v
-		}
-	}
-	if refreshToken == "" {
-		// Nothing to do for API key or cookie based entries
-		return auth, nil
-	}
-
-	// Prepare oauth2 config; default to Google endpoints
-	endpoint := google.Endpoint
-	if tokenURI != "" {
-		endpoint.TokenURL = tokenURI
-	}
-	conf := &oauth2.Config{ClientID: clientID, ClientSecret: clientSecret, Endpoint: endpoint}
-
-	// Ensure proxy-aware HTTP client for token refresh
-	httpClient := util.SetProxy(&e.cfg.SDKConfig, &http.Client{})
-	ctx = context.WithValue(ctx, oauth2.HTTPClient, httpClient)
-
-	// Build base token
-	tok := &oauth2.Token{AccessToken: accessToken, RefreshToken: refreshToken}
-	if t, err := time.Parse(time.RFC3339, expiryStr); err == nil {
-		tok.Expiry = t
-	}
-	newTok, err := conf.TokenSource(ctx, tok).Token()
-	if err != nil {
-		return nil, err
-	}
-
-	// Persist back to metadata; prefer nested token map if present
-	if tokenMap == nil {
-		tokenMap = make(map[string]any)
-	}
-	tokenMap["access_token"] = newTok.AccessToken
-	tokenMap["refresh_token"] = newTok.RefreshToken
-	tokenMap["expiry"] = newTok.Expiry.Format(time.RFC3339)
-	if clientID != "" {
-		tokenMap["client_id"] = clientID
-	}
-	if clientSecret != "" {
-		tokenMap["client_secret"] = clientSecret
-	}
-	if tokenURI != "" {
-		tokenMap["token_uri"] = tokenURI
-	}
-	auth.Metadata["token"] = tokenMap
-
-	// Also mirror top-level access_token for compatibility if previously present
-	if _, ok := auth.Metadata["access_token"]; ok {
-		auth.Metadata["access_token"] = newTok.AccessToken
-	}
+// Refresh refreshes the authentication credentials (no-op for Gemini API key).
+func (e *GeminiExecutor) Refresh(_ context.Context, auth *cliproxyauth.Auth) (*cliproxyauth.Auth, error) {
 	return auth, nil
 }

--- a/internal/runtime/executor/gemini_vertex_executor.go
+++ b/internal/runtime/executor/gemini_vertex_executor.go
@@ -1,6 +1,6 @@
-// Package executor contains provider executors. This file implements the Vertex AI
-// Gemini executor that talks to Google Vertex AI endpoints using service account
-// credentials imported by the CLI.
+// Package executor provides runtime execution capabilities for various AI service providers.
+// This file implements the Vertex AI Gemini executor that talks to Google Vertex AI
+// endpoints using service account credentials or API keys.
 package executor

 import (
@@ -36,20 +36,26 @@ type GeminiVertexExecutor struct {
 	cfg *config.Config
 }

-// NewGeminiVertexExecutor constructs the Vertex executor.
+// NewGeminiVertexExecutor creates a new Vertex AI Gemini executor instance.
+//
+// Parameters:
+//   - cfg: The application configuration
+//
+// Returns:
+//   - *GeminiVertexExecutor: A new Vertex AI Gemini executor instance
 func NewGeminiVertexExecutor(cfg *config.Config) *GeminiVertexExecutor {
 	return &GeminiVertexExecutor{cfg: cfg}
 }

-// Identifier returns provider key for manager routing.
+// Identifier returns the executor identifier.
 func (e *GeminiVertexExecutor) Identifier() string { return "vertex" }

-// PrepareRequest is a no-op for Vertex.
+// PrepareRequest prepares the HTTP request for execution (no-op for Vertex).
 func (e *GeminiVertexExecutor) PrepareRequest(_ *http.Request, _ *cliproxyauth.Auth) error {
 	return nil
 }

-// Execute handles non-streaming requests.
+// Execute performs a non-streaming request to the Vertex AI API.
 func (e *GeminiVertexExecutor) Execute(ctx context.Context, auth *cliproxyauth.Auth, req cliproxyexecutor.Request, opts cliproxyexecutor.Options) (resp cliproxyexecutor.Response, err error) {
 	// Try API key authentication first
 	apiKey, baseURL := vertexAPICreds(auth)
@@ -67,7 +73,7 @@ func (e *GeminiVertexExecutor) Execute(ctx context.Context, auth *cliproxyauth.A
 	return e.executeWithAPIKey(ctx, auth, req, opts, apiKey, baseURL)
 }

-// ExecuteStream handles SSE streaming for Vertex.
+// ExecuteStream performs a streaming request to the Vertex AI API.
 func (e *GeminiVertexExecutor) ExecuteStream(ctx context.Context, auth *cliproxyauth.Auth, req cliproxyexecutor.Request, opts cliproxyexecutor.Options) (stream <-chan cliproxyexecutor.StreamChunk, err error) {
 	// Try API key authentication first
 	apiKey, baseURL := vertexAPICreds(auth)
@@ -85,7 +91,7 @@ func (e *GeminiVertexExecutor) ExecuteStream(ctx context.Context, auth *cliproxy
 	return e.executeStreamWithAPIKey(ctx, auth, req, opts, apiKey, baseURL)
 }

-// CountTokens calls Vertex countTokens endpoint.
+// CountTokens counts tokens for the given request using the Vertex AI API.
 func (e *GeminiVertexExecutor) CountTokens(ctx context.Context, auth *cliproxyauth.Auth, req cliproxyexecutor.Request, opts cliproxyexecutor.Options) (cliproxyexecutor.Response, error) {
 	// Try API key authentication first
 	apiKey, baseURL := vertexAPICreds(auth)
@@ -103,185 +109,7 @@ func (e *GeminiVertexExecutor) CountTokens(ctx context.Context, auth *cliproxyau
 	return e.countTokensWithAPIKey(ctx, auth, req, opts, apiKey, baseURL)
 }

-// countTokensWithServiceAccount handles token counting using service account credentials.
-func (e *GeminiVertexExecutor) countTokensWithServiceAccount(ctx context.Context, auth *cliproxyauth.Auth, req cliproxyexecutor.Request, opts cliproxyexecutor.Options, projectID, location string, saJSON []byte) (cliproxyexecutor.Response, error) {
-	upstreamModel := util.ResolveOriginalModel(req.Model, req.Metadata)
-
-	from := opts.SourceFormat
-	to := sdktranslator.FromString("gemini")
-	translatedReq := sdktranslator.TranslateRequest(from, to, req.Model, bytes.Clone(req.Payload), false)
-	if budgetOverride, includeOverride, ok := util.ResolveThinkingConfigFromMetadata(req.Model, req.Metadata); ok && util.ModelSupportsThinking(req.Model) {
-		if budgetOverride != nil {
-			norm := util.NormalizeThinkingBudget(req.Model, *budgetOverride)
-			budgetOverride = &norm
-		}
-		translatedReq = util.ApplyGeminiThinkingConfig(translatedReq, budgetOverride, includeOverride)
-	}
-	translatedReq = util.StripThinkingConfigIfUnsupported(req.Model, translatedReq)
-	translatedReq = fixGeminiImageAspectRatio(req.Model, translatedReq)
-	translatedReq, _ = sjson.SetBytes(translatedReq, "model", upstreamModel)
-	respCtx := context.WithValue(ctx, "alt", opts.Alt)
-	translatedReq, _ = sjson.DeleteBytes(translatedReq, "tools")
-	translatedReq, _ = sjson.DeleteBytes(translatedReq, "generationConfig")
-	translatedReq, _ = sjson.DeleteBytes(translatedReq, "safetySettings")
-
-	baseURL := vertexBaseURL(location)
-	url := fmt.Sprintf("%s/%s/projects/%s/locations/%s/publishers/google/models/%s:%s", baseURL, vertexAPIVersion, projectID, location, upstreamModel, "countTokens")
-
-	httpReq, errNewReq := http.NewRequestWithContext(respCtx, http.MethodPost, url, bytes.NewReader(translatedReq))
-	if errNewReq != nil {
-		return cliproxyexecutor.Response{}, errNewReq
-	}
-	httpReq.Header.Set("Content-Type", "application/json")
-	if token, errTok := vertexAccessToken(ctx, e.cfg, auth, saJSON); errTok == nil && token != "" {
-		httpReq.Header.Set("Authorization", "Bearer "+token)
-	} else if errTok != nil {
-		log.Errorf("vertex executor: access token error: %v", errTok)
-		return cliproxyexecutor.Response{}, statusErr{code: 500, msg: "internal server error"}
-	}
-	applyGeminiHeaders(httpReq, auth)
-
-	var authID, authLabel, authType, authValue string
-	if auth != nil {
-		authID = auth.ID
-		authLabel = auth.Label
-		authType, authValue = auth.AccountInfo()
-	}
-	recordAPIRequest(ctx, e.cfg, upstreamRequestLog{
-		URL:       url,
-		Method:    http.MethodPost,
-		Headers:   httpReq.Header.Clone(),
-		Body:      translatedReq,
-		Provider:  e.Identifier(),
-		AuthID:    authID,
-		AuthLabel: authLabel,
-		AuthType:  authType,
-		AuthValue: authValue,
-	})
-
-	httpClient := newProxyAwareHTTPClient(ctx, e.cfg, auth, 0)
-	httpResp, errDo := httpClient.Do(httpReq)
-	if errDo != nil {
-		recordAPIResponseError(ctx, e.cfg, errDo)
-		return cliproxyexecutor.Response{}, errDo
-	}
-	defer func() {
-		if errClose := httpResp.Body.Close(); errClose != nil {
-			log.Errorf("vertex executor: close response body error: %v", errClose)
-		}
-	}()
-	recordAPIResponseMetadata(ctx, e.cfg, httpResp.StatusCode, httpResp.Header.Clone())
-	if httpResp.StatusCode < 200 || httpResp.StatusCode >= 300 {
-		b, _ := io.ReadAll(httpResp.Body)
-		appendAPIResponseChunk(ctx, e.cfg, b)
-		log.Debugf("request error, error status: %d, error body: %s", httpResp.StatusCode, summarizeErrorBody(httpResp.Header.Get("Content-Type"), b))
-		return cliproxyexecutor.Response{}, statusErr{code: httpResp.StatusCode, msg: string(b)}
-	}
-	data, errRead := io.ReadAll(httpResp.Body)
-	if errRead != nil {
-		recordAPIResponseError(ctx, e.cfg, errRead)
-		return cliproxyexecutor.Response{}, errRead
-	}
-	appendAPIResponseChunk(ctx, e.cfg, data)
-	if httpResp.StatusCode < 200 || httpResp.StatusCode >= 300 {
-		log.Debugf("request error, error status: %d, error body: %s", httpResp.StatusCode, summarizeErrorBody(httpResp.Header.Get("Content-Type"), data))
-		return cliproxyexecutor.Response{}, statusErr{code: httpResp.StatusCode, msg: string(data)}
-	}
-	count := gjson.GetBytes(data, "totalTokens").Int()
-	out := sdktranslator.TranslateTokenCount(ctx, to, from, count, data)
-	return cliproxyexecutor.Response{Payload: []byte(out)}, nil
-}
-
-// countTokensWithAPIKey handles token counting using API key credentials.
-func (e *GeminiVertexExecutor) countTokensWithAPIKey(ctx context.Context, auth *cliproxyauth.Auth, req cliproxyexecutor.Request, opts cliproxyexecutor.Options, apiKey, baseURL string) (cliproxyexecutor.Response, error) {
-	upstreamModel := util.ResolveOriginalModel(req.Model, req.Metadata)
-
-	from := opts.SourceFormat
-	to := sdktranslator.FromString("gemini")
-	translatedReq := sdktranslator.TranslateRequest(from, to, req.Model, bytes.Clone(req.Payload), false)
-	if budgetOverride, includeOverride, ok := util.ResolveThinkingConfigFromMetadata(req.Model, req.Metadata); ok && util.ModelSupportsThinking(req.Model) {
-		if budgetOverride != nil {
-			norm := util.NormalizeThinkingBudget(req.Model, *budgetOverride)
-			budgetOverride = &norm
-		}
-		translatedReq = util.ApplyGeminiThinkingConfig(translatedReq, budgetOverride, includeOverride)
-	}
-	translatedReq = util.StripThinkingConfigIfUnsupported(req.Model, translatedReq)
-	translatedReq = fixGeminiImageAspectRatio(req.Model, translatedReq)
-	translatedReq, _ = sjson.SetBytes(translatedReq, "model", upstreamModel)
-	respCtx := context.WithValue(ctx, "alt", opts.Alt)
-	translatedReq, _ = sjson.DeleteBytes(translatedReq, "tools")
-	translatedReq, _ = sjson.DeleteBytes(translatedReq, "generationConfig")
-	translatedReq, _ = sjson.DeleteBytes(translatedReq, "safetySettings")
-
-	// For API key auth, use simpler URL format without project/location
-	if baseURL == "" {
-		baseURL = "https://generativelanguage.googleapis.com"
-	}
-	url := fmt.Sprintf("%s/%s/publishers/google/models/%s:%s", baseURL, vertexAPIVersion, req.Model, "countTokens")
-
-	httpReq, errNewReq := http.NewRequestWithContext(respCtx, http.MethodPost, url, bytes.NewReader(translatedReq))
-	if errNewReq != nil {
-		return cliproxyexecutor.Response{}, errNewReq
-	}
-	httpReq.Header.Set("Content-Type", "application/json")
-	if apiKey != "" {
-		httpReq.Header.Set("x-goog-api-key", apiKey)
-	}
-	applyGeminiHeaders(httpReq, auth)
-
-	var authID, authLabel, authType, authValue string
-	if auth != nil {
-		authID = auth.ID
-		authLabel = auth.Label
-		authType, authValue = auth.AccountInfo()
-	}
-	recordAPIRequest(ctx, e.cfg, upstreamRequestLog{
-		URL:       url,
-		Method:    http.MethodPost,
-		Headers:   httpReq.Header.Clone(),
-		Body:      translatedReq,
-		Provider:  e.Identifier(),
-		AuthID:    authID,
-		AuthLabel: authLabel,
-		AuthType:  authType,
-		AuthValue: authValue,
-	})
-
-	httpClient := newProxyAwareHTTPClient(ctx, e.cfg, auth, 0)
-	httpResp, errDo := httpClient.Do(httpReq)
-	if errDo != nil {
-		recordAPIResponseError(ctx, e.cfg, errDo)
-		return cliproxyexecutor.Response{}, errDo
-	}
-	defer func() {
-		if errClose := httpResp.Body.Close(); errClose != nil {
-			log.Errorf("vertex executor: close response body error: %v", errClose)
-		}
-	}()
-	recordAPIResponseMetadata(ctx, e.cfg, httpResp.StatusCode, httpResp.Header.Clone())
-	if httpResp.StatusCode < 200 || httpResp.StatusCode >= 300 {
-		b, _ := io.ReadAll(httpResp.Body)
-		appendAPIResponseChunk(ctx, e.cfg, b)
-		log.Debugf("request error, error status: %d, error body: %s", httpResp.StatusCode, summarizeErrorBody(httpResp.Header.Get("Content-Type"), b))
-		return cliproxyexecutor.Response{}, statusErr{code: httpResp.StatusCode, msg: string(b)}
-	}
-	data, errRead := io.ReadAll(httpResp.Body)
-	if errRead != nil {
-		recordAPIResponseError(ctx, e.cfg, errRead)
-		return cliproxyexecutor.Response{}, errRead
-	}
-	appendAPIResponseChunk(ctx, e.cfg, data)
-	if httpResp.StatusCode < 200 || httpResp.StatusCode >= 300 {
-		log.Debugf("request error, error status: %d, error body: %s", httpResp.StatusCode, summarizeErrorBody(httpResp.Header.Get("Content-Type"), data))
-		return cliproxyexecutor.Response{}, statusErr{code: httpResp.StatusCode, msg: string(data)}
-	}
-	count := gjson.GetBytes(data, "totalTokens").Int()
-	out := sdktranslator.TranslateTokenCount(ctx, to, from, count, data)
-	return cliproxyexecutor.Response{Payload: []byte(out)}, nil
-}
-
-// Refresh is a no-op for service account based credentials.
+// Refresh refreshes the authentication credentials (no-op for Vertex).
 func (e *GeminiVertexExecutor) Refresh(_ context.Context, auth *cliproxyauth.Auth) (*cliproxyauth.Auth, error) {
 	return auth, nil
 }
@@ -579,7 +407,7 @@ func (e *GeminiVertexExecutor) executeStreamWithServiceAccount(ctx context.Conte
 			}
 		}()
 		scanner := bufio.NewScanner(httpResp.Body)
-		scanner.Buffer(nil, 20_971_520)
+		scanner.Buffer(nil, streamScannerBuffer)
 		var param any
 		for scanner.Scan() {
 			line := scanner.Bytes()
@@ -696,7 +524,7 @@ func (e *GeminiVertexExecutor) executeStreamWithAPIKey(ctx context.Context, auth
 			}
 		}()
 		scanner := bufio.NewScanner(httpResp.Body)
-		scanner.Buffer(nil, 20_971_520)
+		scanner.Buffer(nil, streamScannerBuffer)
 		var param any
 		for scanner.Scan() {
 			line := scanner.Bytes()
@@ -722,6 +550,184 @@ func (e *GeminiVertexExecutor) executeStreamWithAPIKey(ctx context.Context, auth
 	return stream, nil
 }

+// countTokensWithServiceAccount counts tokens using service account credentials.
+func (e *GeminiVertexExecutor) countTokensWithServiceAccount(ctx context.Context, auth *cliproxyauth.Auth, req cliproxyexecutor.Request, opts cliproxyexecutor.Options, projectID, location string, saJSON []byte) (cliproxyexecutor.Response, error) {
+	upstreamModel := util.ResolveOriginalModel(req.Model, req.Metadata)
+
+	from := opts.SourceFormat
+	to := sdktranslator.FromString("gemini")
+	translatedReq := sdktranslator.TranslateRequest(from, to, req.Model, bytes.Clone(req.Payload), false)
+	if budgetOverride, includeOverride, ok := util.ResolveThinkingConfigFromMetadata(req.Model, req.Metadata); ok && util.ModelSupportsThinking(req.Model) {
+		if budgetOverride != nil {
+			norm := util.NormalizeThinkingBudget(req.Model, *budgetOverride)
+			budgetOverride = &norm
+		}
+		translatedReq = util.ApplyGeminiThinkingConfig(translatedReq, budgetOverride, includeOverride)
+	}
+	translatedReq = util.StripThinkingConfigIfUnsupported(req.Model, translatedReq)
+	translatedReq = fixGeminiImageAspectRatio(req.Model, translatedReq)
+	translatedReq, _ = sjson.SetBytes(translatedReq, "model", upstreamModel)
+	respCtx := context.WithValue(ctx, "alt", opts.Alt)
+	translatedReq, _ = sjson.DeleteBytes(translatedReq, "tools")
+	translatedReq, _ = sjson.DeleteBytes(translatedReq, "generationConfig")
+	translatedReq, _ = sjson.DeleteBytes(translatedReq, "safetySettings")
+
+	baseURL := vertexBaseURL(location)
+	url := fmt.Sprintf("%s/%s/projects/%s/locations/%s/publishers/google/models/%s:%s", baseURL, vertexAPIVersion, projectID, location, upstreamModel, "countTokens")
+
+	httpReq, errNewReq := http.NewRequestWithContext(respCtx, http.MethodPost, url, bytes.NewReader(translatedReq))
+	if errNewReq != nil {
+		return cliproxyexecutor.Response{}, errNewReq
+	}
+	httpReq.Header.Set("Content-Type", "application/json")
+	if token, errTok := vertexAccessToken(ctx, e.cfg, auth, saJSON); errTok == nil && token != "" {
+		httpReq.Header.Set("Authorization", "Bearer "+token)
+	} else if errTok != nil {
+		log.Errorf("vertex executor: access token error: %v", errTok)
+		return cliproxyexecutor.Response{}, statusErr{code: 500, msg: "internal server error"}
+	}
+	applyGeminiHeaders(httpReq, auth)
+
+	var authID, authLabel, authType, authValue string
+	if auth != nil {
+		authID = auth.ID
+		authLabel = auth.Label
+		authType, authValue = auth.AccountInfo()
+	}
+	recordAPIRequest(ctx, e.cfg, upstreamRequestLog{
+		URL:       url,
+		Method:    http.MethodPost,
+		Headers:   httpReq.Header.Clone(),
+		Body:      translatedReq,
+		Provider:  e.Identifier(),
+		AuthID:    authID,
+		AuthLabel: authLabel,
+		AuthType:  authType,
+		AuthValue: authValue,
+	})
+
+	httpClient := newProxyAwareHTTPClient(ctx, e.cfg, auth, 0)
+	httpResp, errDo := httpClient.Do(httpReq)
+	if errDo != nil {
+		recordAPIResponseError(ctx, e.cfg, errDo)
+		return cliproxyexecutor.Response{}, errDo
+	}
+	defer func() {
+		if errClose := httpResp.Body.Close(); errClose != nil {
+			log.Errorf("vertex executor: close response body error: %v", errClose)
+		}
+	}()
+	recordAPIResponseMetadata(ctx, e.cfg, httpResp.StatusCode, httpResp.Header.Clone())
+	if httpResp.StatusCode < 200 || httpResp.StatusCode >= 300 {
+		b, _ := io.ReadAll(httpResp.Body)
+		appendAPIResponseChunk(ctx, e.cfg, b)
+		log.Debugf("request error, error status: %d, error body: %s", httpResp.StatusCode, summarizeErrorBody(httpResp.Header.Get("Content-Type"), b))
+		return cliproxyexecutor.Response{}, statusErr{code: httpResp.StatusCode, msg: string(b)}
+	}
+	data, errRead := io.ReadAll(httpResp.Body)
+	if errRead != nil {
+		recordAPIResponseError(ctx, e.cfg, errRead)
+		return cliproxyexecutor.Response{}, errRead
+	}
+	appendAPIResponseChunk(ctx, e.cfg, data)
+	if httpResp.StatusCode < 200 || httpResp.StatusCode >= 300 {
+		log.Debugf("request error, error status: %d, error body: %s", httpResp.StatusCode, summarizeErrorBody(httpResp.Header.Get("Content-Type"), data))
+		return cliproxyexecutor.Response{}, statusErr{code: httpResp.StatusCode, msg: string(data)}
+	}
+	count := gjson.GetBytes(data, "totalTokens").Int()
+	out := sdktranslator.TranslateTokenCount(ctx, to, from, count, data)
+	return cliproxyexecutor.Response{Payload: []byte(out)}, nil
+}
+
+// countTokensWithAPIKey handles token counting using API key credentials.
+func (e *GeminiVertexExecutor) countTokensWithAPIKey(ctx context.Context, auth *cliproxyauth.Auth, req cliproxyexecutor.Request, opts cliproxyexecutor.Options, apiKey, baseURL string) (cliproxyexecutor.Response, error) {
+	upstreamModel := util.ResolveOriginalModel(req.Model, req.Metadata)
+
+	from := opts.SourceFormat
+	to := sdktranslator.FromString("gemini")
+	translatedReq := sdktranslator.TranslateRequest(from, to, req.Model, bytes.Clone(req.Payload), false)
+	if budgetOverride, includeOverride, ok := util.ResolveThinkingConfigFromMetadata(req.Model, req.Metadata); ok && util.ModelSupportsThinking(req.Model) {
+		if budgetOverride != nil {
+			norm := util.NormalizeThinkingBudget(req.Model, *budgetOverride)
+			budgetOverride = &norm
+		}
+		translatedReq = util.ApplyGeminiThinkingConfig(translatedReq, budgetOverride, includeOverride)
+	}
+	translatedReq = util.StripThinkingConfigIfUnsupported(req.Model, translatedReq)
+	translatedReq = fixGeminiImageAspectRatio(req.Model, translatedReq)
+	translatedReq, _ = sjson.SetBytes(translatedReq, "model", upstreamModel)
+	respCtx := context.WithValue(ctx, "alt", opts.Alt)
+	translatedReq, _ = sjson.DeleteBytes(translatedReq, "tools")
+	translatedReq, _ = sjson.DeleteBytes(translatedReq, "generationConfig")
+	translatedReq, _ = sjson.DeleteBytes(translatedReq, "safetySettings")
+
+	// For API key auth, use simpler URL format without project/location
+	if baseURL == "" {
+		baseURL = "https://generativelanguage.googleapis.com"
+	}
+	url := fmt.Sprintf("%s/%s/publishers/google/models/%s:%s", baseURL, vertexAPIVersion, req.Model, "countTokens")
+
+	httpReq, errNewReq := http.NewRequestWithContext(respCtx, http.MethodPost, url, bytes.NewReader(translatedReq))
+	if errNewReq != nil {
+		return cliproxyexecutor.Response{}, errNewReq
+	}
+	httpReq.Header.Set("Content-Type", "application/json")
+	if apiKey != "" {
+		httpReq.Header.Set("x-goog-api-key", apiKey)
+	}
+	applyGeminiHeaders(httpReq, auth)
+
+	var authID, authLabel, authType, authValue string
+	if auth != nil {
+		authID = auth.ID
+		authLabel = auth.Label
+		authType, authValue = auth.AccountInfo()
+	}
+	recordAPIRequest(ctx, e.cfg, upstreamRequestLog{
+		URL:       url,
+		Method:    http.MethodPost,
+		Headers:   httpReq.Header.Clone(),
+		Body:      translatedReq,
+		Provider:  e.Identifier(),
+		AuthID:    authID,
+		AuthLabel: authLabel,
+		AuthType:  authType,
+		AuthValue: authValue,
+	})
+
+	httpClient := newProxyAwareHTTPClient(ctx, e.cfg, auth, 0)
+	httpResp, errDo := httpClient.Do(httpReq)
+	if errDo != nil {
+		recordAPIResponseError(ctx, e.cfg, errDo)
+		return cliproxyexecutor.Response{}, errDo
+	}
+	defer func() {
+		if errClose := httpResp.Body.Close(); errClose != nil {
+			log.Errorf("vertex executor: close response body error: %v", errClose)
+		}
+	}()
+	recordAPIResponseMetadata(ctx, e.cfg, httpResp.StatusCode, httpResp.Header.Clone())
+	if httpResp.StatusCode < 200 || httpResp.StatusCode >= 300 {
+		b, _ := io.ReadAll(httpResp.Body)
+		appendAPIResponseChunk(ctx, e.cfg, b)
+		log.Debugf("request error, error status: %d, error body: %s", httpResp.StatusCode, summarizeErrorBody(httpResp.Header.Get("Content-Type"), b))
+		return cliproxyexecutor.Response{}, statusErr{code: httpResp.StatusCode, msg: string(b)}
+	}
+	data, errRead := io.ReadAll(httpResp.Body)
+	if errRead != nil {
+		recordAPIResponseError(ctx, e.cfg, errRead)
+		return cliproxyexecutor.Response{}, errRead
+	}
+	appendAPIResponseChunk(ctx, e.cfg, data)
+	if httpResp.StatusCode < 200 || httpResp.StatusCode >= 300 {
+		log.Debugf("request error, error status: %d, error body: %s", httpResp.StatusCode, summarizeErrorBody(httpResp.Header.Get("Content-Type"), data))
+		return cliproxyexecutor.Response{}, statusErr{code: httpResp.StatusCode, msg: string(data)}
+	}
+	count := gjson.GetBytes(data, "totalTokens").Int()
+	out := sdktranslator.TranslateTokenCount(ctx, to, from, count, data)
+	return cliproxyexecutor.Response{Payload: []byte(out)}, nil
+}
+
 // vertexCreds extracts project, location and raw service account JSON from auth metadata.
 func vertexCreds(a *cliproxyauth.Auth) (projectID, location string, serviceAccountJSON []byte, err error) {
 	if a == nil || a.Metadata == nil {
--- a/internal/runtime/executor/iflow_executor.go
+++ b/internal/runtime/executor/iflow_executor.go
@@ -57,10 +57,15 @@ func (e *IFlowExecutor) Execute(ctx context.Context, auth *cliproxyauth.Auth, re
 	from := opts.SourceFormat
 	to := sdktranslator.FromString("openai")
 	body := sdktranslator.TranslateRequest(from, to, req.Model, bytes.Clone(req.Payload), false)
-	body = applyReasoningEffortMetadataChatCompletions(body, req.Metadata, req.Model)
-	if upstreamModel := util.ResolveOriginalModel(req.Model, req.Metadata); upstreamModel != "" {
+	body = applyReasoningEffortMetadata(body, req.Metadata, req.Model, "reasoning_effort")
+	upstreamModel := util.ResolveOriginalModel(req.Model, req.Metadata)
+	if upstreamModel != "" {
 		body, _ = sjson.SetBytes(body, "model", upstreamModel)
 	}
+	body = normalizeThinkingConfig(body, upstreamModel)
+	if errValidate := validateThinkingConfig(body, upstreamModel); errValidate != nil {
+		return resp, errValidate
+	}
 	body = applyPayloadConfig(e.cfg, req.Model, body)

 	endpoint := strings.TrimSuffix(baseURL, "/") + iflowDefaultEndpoint
@@ -143,10 +148,15 @@ func (e *IFlowExecutor) ExecuteStream(ctx context.Context, auth *cliproxyauth.Au
 	to := sdktranslator.FromString("openai")
 	body := sdktranslator.TranslateRequest(from, to, req.Model, bytes.Clone(req.Payload), true)

-	body = applyReasoningEffortMetadataChatCompletions(body, req.Metadata, req.Model)
-	if upstreamModel := util.ResolveOriginalModel(req.Model, req.Metadata); upstreamModel != "" {
+	body = applyReasoningEffortMetadata(body, req.Metadata, req.Model, "reasoning_effort")
+	upstreamModel := util.ResolveOriginalModel(req.Model, req.Metadata)
+	if upstreamModel != "" {
 		body, _ = sjson.SetBytes(body, "model", upstreamModel)
 	}
+	body = normalizeThinkingConfig(body, upstreamModel)
+	if errValidate := validateThinkingConfig(body, upstreamModel); errValidate != nil {
+		return nil, errValidate
+	}
 	// Ensure tools array exists to avoid provider quirks similar to Qwen's behaviour.
 	toolsResult := gjson.GetBytes(body, "tools")
 	if toolsResult.Exists() && toolsResult.IsArray() && len(toolsResult.Array()) == 0 {
@@ -209,7 +219,7 @@ func (e *IFlowExecutor) ExecuteStream(ctx context.Context, auth *cliproxyauth.Au
 		}()

 		scanner := bufio.NewScanner(httpResp.Body)
-		scanner.Buffer(nil, 20_971_520)
+		scanner.Buffer(nil, 52_428_800) // 50MB 
 		var param any
 		for scanner.Scan() {
 			line := scanner.Bytes()
--- a/internal/runtime/executor/kiro_executor.go
+++ b/internal/runtime/executor/kiro_executor.go
@@ -121,7 +121,12 @@ func (e *KiroExecutor) Execute(ctx context.Context, auth *cliproxyauth.Auth, req
 		return resp, fmt.Errorf("kiro: access token not found in auth")
 	}
 	if profileArn == "" {
-		log.Warnf("kiro: profile ARN not found in auth, API calls may fail")
+		// Only warn if not using builder-id auth (which doesn't need profileArn)
+		if auth == nil || auth.Metadata == nil {
+			log.Debugf("kiro: profile ARN not found in auth (may be normal for builder-id)")
+		} else if authMethod, ok := auth.Metadata["auth_method"].(string); !ok || authMethod != "builder-id" {
+			log.Warnf("kiro: profile ARN not found in auth, API calls may fail")
+		}
 	}

 	reporter := newUsageReporter(ctx, e.Identifier(), req.Model, auth)
@@ -161,10 +166,19 @@ func (e *KiroExecutor) Execute(ctx context.Context, auth *cliproxyauth.Auth, req
 		currentOrigin = "CLI"
 	}
 	
-	kiroPayload := e.buildKiroPayload(body, kiroModelID, profileArn, currentOrigin, isAgentic, isChatOnly)
+	// Determine if profileArn should be included based on auth method
+	// profileArn is only needed for social auth (Google OAuth), not for builder-id (AWS SSO)
+	effectiveProfileArn := profileArn
+	if auth != nil && auth.Metadata != nil {
+		if authMethod, ok := auth.Metadata["auth_method"].(string); ok && authMethod == "builder-id" {
+			effectiveProfileArn = "" // Don't include profileArn for builder-id auth
+		}
+	}
+	
+	kiroPayload := e.buildKiroPayload(body, kiroModelID, effectiveProfileArn, currentOrigin, isAgentic, isChatOnly)

 	// Execute with retry on 401/403 and 429 (quota exhausted)
-	resp, err = e.executeWithRetry(ctx, auth, req, opts, accessToken, profileArn, kiroPayload, body, from, to, reporter, currentOrigin, kiroModelID, isAgentic, isChatOnly)
+	resp, err = e.executeWithRetry(ctx, auth, req, opts, accessToken, effectiveProfileArn, kiroPayload, body, from, to, reporter, currentOrigin, kiroModelID, isAgentic, isChatOnly)
 	return resp, err
 }

@@ -301,9 +315,18 @@ func (e *KiroExecutor) executeWithRetry(ctx context.Context, auth *cliproxyauth.
 				}
 			}
 			if len(content) > 0 {
-				usageInfo.OutputTokens = int64(len(content) / 4)
+				// Use tiktoken for more accurate output token calculation
+				if enc, encErr := tokenizerForModel(req.Model); encErr == nil {
+					if tokenCount, countErr := enc.Count(content); countErr == nil {
+						usageInfo.OutputTokens = int64(tokenCount)
+					}
+				}
+				// Fallback to character count estimation if tiktoken fails
 				if usageInfo.OutputTokens == 0 {
-					usageInfo.OutputTokens = 1
+					usageInfo.OutputTokens = int64(len(content) / 4)
+					if usageInfo.OutputTokens == 0 {
+						usageInfo.OutputTokens = 1
+					}
 				}
 			}
 			usageInfo.TotalTokens = usageInfo.InputTokens + usageInfo.OutputTokens
@@ -330,7 +353,12 @@ func (e *KiroExecutor) ExecuteStream(ctx context.Context, auth *cliproxyauth.Aut
 		return nil, fmt.Errorf("kiro: access token not found in auth")
 	}
 	if profileArn == "" {
-		log.Warnf("kiro: profile ARN not found in auth, API calls may fail")
+		// Only warn if not using builder-id auth (which doesn't need profileArn)
+		if auth == nil || auth.Metadata == nil {
+			log.Debugf("kiro: profile ARN not found in auth (may be normal for builder-id)")
+		} else if authMethod, ok := auth.Metadata["auth_method"].(string); !ok || authMethod != "builder-id" {
+			log.Warnf("kiro: profile ARN not found in auth, API calls may fail")
+		}
 	}

 	reporter := newUsageReporter(ctx, e.Identifier(), req.Model, auth)
@@ -370,10 +398,19 @@ func (e *KiroExecutor) ExecuteStream(ctx context.Context, auth *cliproxyauth.Aut
 		currentOrigin = "CLI"
 	}
 	
-	kiroPayload := e.buildKiroPayload(body, kiroModelID, profileArn, currentOrigin, isAgentic, isChatOnly)
+	// Determine if profileArn should be included based on auth method
+	// profileArn is only needed for social auth (Google OAuth), not for builder-id (AWS SSO)
+	effectiveProfileArn := profileArn
+	if auth != nil && auth.Metadata != nil {
+		if authMethod, ok := auth.Metadata["auth_method"].(string); ok && authMethod == "builder-id" {
+			effectiveProfileArn = "" // Don't include profileArn for builder-id auth
+		}
+	}
+	
+	kiroPayload := e.buildKiroPayload(body, kiroModelID, effectiveProfileArn, currentOrigin, isAgentic, isChatOnly)

 	// Execute stream with retry on 401/403 and 429 (quota exhausted)
-	return e.executeStreamWithRetry(ctx, auth, req, opts, accessToken, profileArn, kiroPayload, body, from, reporter, currentOrigin, kiroModelID, isAgentic, isChatOnly)
+	return e.executeStreamWithRetry(ctx, auth, req, opts, accessToken, effectiveProfileArn, kiroPayload, body, from, reporter, currentOrigin, kiroModelID, isAgentic, isChatOnly)
 }

 // executeStreamWithRetry performs the streaming HTTP request with automatic retry on auth errors.
@@ -491,6 +528,12 @@ func (e *KiroExecutor) executeStreamWithRetry(ctx context.Context, auth *cliprox

 		go func(resp *http.Response) {
 			defer close(out)
+			defer func() {
+				if r := recover(); r != nil {
+					log.Errorf("kiro: panic in stream handler: %v", r)
+					out <- cliproxyexecutor.StreamChunk{Err: fmt.Errorf("internal error: %v", r)}
+				}
+			}()
 			defer func() {
 				if errClose := resp.Body.Close(); errClose != nil {
 					log.Errorf("response body close error: %v", errClose)
@@ -587,10 +630,10 @@ type kiroPayload struct {
 }

 type kiroConversationState struct {
+	ChatTriggerType string               `json:"chatTriggerType"` // Required: "MANUAL" - must be first field
 	ConversationID  string               `json:"conversationId"`
-	History         []kiroHistoryMessage `json:"history"`
 	CurrentMessage  kiroCurrentMessage   `json:"currentMessage"`
-	ChatTriggerType string               `json:"chatTriggerType"` // Required: "MANUAL"
+	History         []kiroHistoryMessage `json:"history,omitempty"` // Only include when non-empty
 }

 type kiroCurrentMessage struct {
@@ -627,9 +670,9 @@ type kiroUserInputMessageContext struct {
 }

 type kiroToolResult struct {
-	ToolUseID string              `json:"toolUseId"`
 	Content   []kiroTextContent   `json:"content"`
 	Status    string              `json:"status"`
+	ToolUseID string              `json:"toolUseId"`
 }

 type kiroTextContent struct {
@@ -735,7 +778,9 @@ func (e *KiroExecutor) buildKiroPayload(claudeBody []byte, modelID, profileArn,
 	var currentUserMsg *kiroUserInputMessage
 	var currentToolResults []kiroToolResult

-	messagesArray := messages.Array()
+	// Merge adjacent messages with the same role before processing
+	// This reduces API call complexity and improves compatibility
+	messagesArray := mergeAdjacentMessages(messages.Array())
 	for i, msg := range messagesArray {
 		role := msg.Get("role").String()
 		isLastMessage := i == len(messagesArray)-1
@@ -746,6 +791,14 @@ func (e *KiroExecutor) buildKiroPayload(claudeBody []byte, modelID, profileArn,
 				currentUserMsg = &userMsg
 				currentToolResults = toolResults
 			} else {
+				// CRITICAL: Kiro API requires content to be non-empty for history messages too
+				if strings.TrimSpace(userMsg.Content) == "" {
+					if len(toolResults) > 0 {
+						userMsg.Content = "Tool results provided."
+					} else {
+						userMsg.Content = "Continue"
+					}
+				}
 				// For history messages, embed tool results in context
 				if len(toolResults) > 0 {
 					userMsg.UserInputMessageContext = &kiroUserInputMessageContext{
@@ -758,9 +811,24 @@ func (e *KiroExecutor) buildKiroPayload(claudeBody []byte, modelID, profileArn,
 			}
 		} else if role == "assistant" {
 			assistantMsg := e.buildAssistantMessageStruct(msg)
-			history = append(history, kiroHistoryMessage{
-				AssistantResponseMessage: &assistantMsg,
-			})
+			// If this is the last message and it's an assistant message,
+			// we need to add it to history and create a "Continue" user message
+			// because Kiro API requires currentMessage to be userInputMessage type
+			if isLastMessage {
+				history = append(history, kiroHistoryMessage{
+					AssistantResponseMessage: &assistantMsg,
+				})
+				// Create a "Continue" user message as currentMessage
+				currentUserMsg = &kiroUserInputMessage{
+					Content: "Continue",
+					ModelID: modelID,
+					Origin:  origin,
+				}
+			} else {
+				history = append(history, kiroHistoryMessage{
+					AssistantResponseMessage: &assistantMsg,
+				})
+			}
 		}
 	}

@@ -777,7 +845,35 @@ func (e *KiroExecutor) buildKiroPayload(claudeBody []byte, modelID, profileArn,
 		
 		// Add the actual user message
 		contentBuilder.WriteString(currentUserMsg.Content)
-		currentUserMsg.Content = contentBuilder.String()
+		finalContent := contentBuilder.String()
+		
+		// CRITICAL: Kiro API requires content to be non-empty, even when toolResults are present
+		// If content is empty or only whitespace, provide a default message
+		if strings.TrimSpace(finalContent) == "" {
+			if len(currentToolResults) > 0 {
+				finalContent = "Tool results provided."
+			} else {
+				finalContent = "Continue"
+			}
+			log.Debugf("kiro: content was empty, using default: %s", finalContent)
+		}
+		currentUserMsg.Content = finalContent
+		
+		// Deduplicate currentToolResults before adding to context
+		// Kiro API does not accept duplicate toolUseIds
+		if len(currentToolResults) > 0 {
+			seenIDs := make(map[string]bool)
+			uniqueToolResults := make([]kiroToolResult, 0, len(currentToolResults))
+			for _, tr := range currentToolResults {
+				if !seenIDs[tr.ToolUseID] {
+					seenIDs[tr.ToolUseID] = true
+					uniqueToolResults = append(uniqueToolResults, tr)
+				} else {
+					log.Debugf("kiro: skipping duplicate toolResult in currentMessage: %s", tr.ToolUseID)
+				}
+			}
+			currentToolResults = uniqueToolResults
+		}
 		
 		// Build userInputMessageContext with tools and tool results
 		if len(kiroTools) > 0 || len(currentToolResults) > 0 {
@@ -805,21 +901,18 @@ func (e *KiroExecutor) buildKiroPayload(claudeBody []byte, modelID, profileArn,
 		}}
 	}
 	
+	// Build payload with correct field order (matches struct definition)
+	// Note: history is omitempty, so nil/empty slice won't be serialized
 	payload := kiroPayload{
 		ConversationState: kiroConversationState{
+			ChatTriggerType: "MANUAL", // Required by Kiro API - must be first
 			ConversationID:  uuid.New().String(),
-			History:         history,
 			CurrentMessage:  currentMessage,
-			ChatTriggerType: "MANUAL", // Required by Kiro API
+			History:         history, // Will be omitted if empty due to omitempty tag
 		},
 		ProfileArn: profileArn,
 	}

-	// Ensure history is not nil (empty array)
-	if payload.ConversationState.History == nil {
-		payload.ConversationState.History = []kiroHistoryMessage{}
-	}
-
 	result, err := json.Marshal(payload)
 	if err != nil {
 		log.Debugf("kiro: failed to marshal payload: %v", err)
@@ -830,11 +923,15 @@ func (e *KiroExecutor) buildKiroPayload(claudeBody []byte, modelID, profileArn,

 // buildUserMessageStruct builds a user message and extracts tool results
 // origin parameter determines which quota to use: "CLI" for Amazon Q, "AI_EDITOR" for Kiro IDE.
+// IMPORTANT: Kiro API does not accept duplicate toolUseIds, so we deduplicate here.
 func (e *KiroExecutor) buildUserMessageStruct(msg gjson.Result, modelID, origin string) (kiroUserInputMessage, []kiroToolResult) {
 	content := msg.Get("content")
 	var contentBuilder strings.Builder
 	var toolResults []kiroToolResult
 	var images []kiroImage
+	
+	// Track seen toolUseIds to deduplicate - Kiro API rejects duplicate toolUseIds
+	seenToolUseIDs := make(map[string]bool)

 	if content.IsArray() {
 		for _, part := range content.Array() {
@@ -864,6 +961,14 @@ func (e *KiroExecutor) buildUserMessageStruct(msg gjson.Result, modelID, origin
 			case "tool_result":
 				// Extract tool result for API
 				toolUseID := part.Get("tool_use_id").String()
+				
+				// Skip duplicate toolUseIds - Kiro API does not accept duplicates
+				if seenToolUseIDs[toolUseID] {
+					log.Debugf("kiro: skipping duplicate tool_result with toolUseId: %s", toolUseID)
+					continue
+				}
+				seenToolUseIDs[toolUseID] = true
+				
 				isError := part.Get("is_error").Bool()
 				resultContent := part.Get("content")
 				
@@ -1001,6 +1106,12 @@ func (e *KiroExecutor) parseEventStream(body io.Reader) (string, []kiroToolUse,
 			return content.String(), toolUses, usageInfo, fmt.Errorf("failed to read message: %w", err)
 		}

+		// Validate headersLen to prevent slice out of bounds
+		if headersLen+4 > uint32(len(remaining)) {
+			log.Warnf("kiro: invalid headersLen %d exceeds remaining buffer %d", headersLen, len(remaining))
+			continue
+		}
+
 		// Extract event type from headers
 		eventType := e.extractEventType(remaining[:headersLen+4])

@@ -1018,6 +1129,37 @@ func (e *KiroExecutor) parseEventStream(body io.Reader) (string, []kiroToolUse,
 			continue
 		}

+		// DIAGNOSTIC: Log all received event types for debugging
+		log.Debugf("kiro: parseEventStream received event type: %s", eventType)
+		if log.IsLevelEnabled(log.TraceLevel) {
+			log.Tracef("kiro: parseEventStream event payload: %s", string(payload))
+		}
+
+		// Check for error/exception events in the payload (Kiro API may return errors with HTTP 200)
+		// These can appear as top-level fields or nested within the event
+		if errType, hasErrType := event["_type"].(string); hasErrType {
+			// AWS-style error: {"_type": "com.amazon.aws.codewhisperer#ValidationException", "message": "..."}
+			errMsg := ""
+			if msg, ok := event["message"].(string); ok {
+				errMsg = msg
+			}
+			log.Errorf("kiro: received AWS error in event stream: type=%s, message=%s", errType, errMsg)
+			return "", nil, usageInfo, fmt.Errorf("kiro API error: %s - %s", errType, errMsg)
+		}
+		if errType, hasErrType := event["type"].(string); hasErrType && (errType == "error" || errType == "exception") {
+			// Generic error event
+			errMsg := ""
+			if msg, ok := event["message"].(string); ok {
+				errMsg = msg
+			} else if errObj, ok := event["error"].(map[string]interface{}); ok {
+				if msg, ok := errObj["message"].(string); ok {
+					errMsg = msg
+				}
+			}
+			log.Errorf("kiro: received error event in stream: type=%s, message=%s", errType, errMsg)
+			return "", nil, usageInfo, fmt.Errorf("kiro API error: %s", errMsg)
+		}
+
 		// Handle different event types
 		switch eventType {
 		case "assistantResponseEvent":
@@ -1231,8 +1373,9 @@ func (e *KiroExecutor) buildClaudeResponse(content string, toolUses []kiroToolUs
 // streamToChannel converts AWS Event Stream to channel-based streaming.
 // Supports tool calling - emits tool_use content blocks when tools are used.
 // Includes embedded [Called ...] tool call parsing and input buffering for toolUseEvent.
+// Implements duplicate content filtering using lastContentEvent detection (based on AIClient-2-API).
 func (e *KiroExecutor) streamToChannel(ctx context.Context, body io.Reader, out chan<- cliproxyexecutor.StreamChunk, targetFormat sdktranslator.Format, model string, originalReq, claudeBody []byte, reporter *usageReporter) {
-	reader := bufio.NewReader(body)
+	reader := bufio.NewReaderSize(body, 20*1024*1024) // 20MB buffer to match other providers
 	var totalUsage usage.Detail
 	var hasToolUses bool // Track if any tool uses were emitted

@@ -1240,6 +1383,15 @@ func (e *KiroExecutor) streamToChannel(ctx context.Context, body io.Reader, out
 	processedIDs := make(map[string]bool)
 	var currentToolUse *toolUseState

+	// Duplicate content detection - tracks last content event to filter duplicates
+	// Based on AIClient-2-API implementation for Kiro
+	var lastContentEvent string
+
+	// Streaming token calculation - accumulate content for real-time token counting
+	// Based on AIClient-2-API implementation
+	var accumulatedContent strings.Builder
+	accumulatedContent.Grow(4096) // Pre-allocate 4KB capacity to reduce reallocations
+
 	// Translator param for maintaining tool call state across streaming events
 	// IMPORTANT: This must persist across all TranslateStream calls
 	var translatorParam any
@@ -1279,6 +1431,51 @@ func (e *KiroExecutor) streamToChannel(ctx context.Context, body io.Reader, out
 		prelude := make([]byte, 8)
 		_, err := io.ReadFull(reader, prelude)
 		if err == io.EOF {
+			// Flush any incomplete tool use before ending stream
+			if currentToolUse != nil && !processedIDs[currentToolUse.toolUseID] {
+				log.Warnf("kiro: flushing incomplete tool use at EOF: %s (ID: %s)", currentToolUse.name, currentToolUse.toolUseID)
+				fullInput := currentToolUse.inputBuffer.String()
+				repairedJSON := repairJSON(fullInput)
+				var finalInput map[string]interface{}
+				if err := json.Unmarshal([]byte(repairedJSON), &finalInput); err != nil {
+					log.Warnf("kiro: failed to parse incomplete tool input at EOF: %v", err)
+					finalInput = make(map[string]interface{})
+				}
+				
+				processedIDs[currentToolUse.toolUseID] = true
+				contentBlockIndex++
+				
+				// Send tool_use content block
+				blockStart := e.buildClaudeContentBlockStartEvent(contentBlockIndex, "tool_use", currentToolUse.toolUseID, currentToolUse.name)
+				sseData := sdktranslator.TranslateStream(ctx, sdktranslator.FromString("kiro"), targetFormat, model, originalReq, claudeBody, blockStart, &translatorParam)
+				for _, chunk := range sseData {
+					if chunk != "" {
+						out <- cliproxyexecutor.StreamChunk{Payload: []byte(chunk + "\n\n")}
+					}
+				}
+				
+				// Send tool input as delta
+				inputBytes, _ := json.Marshal(finalInput)
+				inputDelta := e.buildClaudeInputJsonDeltaEvent(string(inputBytes), contentBlockIndex)
+				sseData = sdktranslator.TranslateStream(ctx, sdktranslator.FromString("kiro"), targetFormat, model, originalReq, claudeBody, inputDelta, &translatorParam)
+				for _, chunk := range sseData {
+					if chunk != "" {
+						out <- cliproxyexecutor.StreamChunk{Payload: []byte(chunk + "\n\n")}
+					}
+				}
+				
+				// Close block
+				blockStop := e.buildClaudeContentBlockStopEvent(contentBlockIndex)
+				sseData = sdktranslator.TranslateStream(ctx, sdktranslator.FromString("kiro"), targetFormat, model, originalReq, claudeBody, blockStop, &translatorParam)
+				for _, chunk := range sseData {
+					if chunk != "" {
+						out <- cliproxyexecutor.StreamChunk{Payload: []byte(chunk + "\n\n")}
+					}
+				}
+				
+				hasToolUses = true
+				currentToolUse = nil
+			}
 			break
 		}
 		if err != nil {
@@ -1304,6 +1501,12 @@ func (e *KiroExecutor) streamToChannel(ctx context.Context, body io.Reader, out
 			return
 		}

+		// Validate headersLen to prevent slice out of bounds
+		if headersLen+4 > uint32(len(remaining)) {
+			log.Warnf("kiro: invalid headersLen %d exceeds remaining buffer %d", headersLen, len(remaining))
+			continue
+		}
+
 		eventType := e.extractEventType(remaining[:headersLen+4])

 		payloadStart := 4 + headersLen
@@ -1317,9 +1520,43 @@ func (e *KiroExecutor) streamToChannel(ctx context.Context, body io.Reader, out

 		var event map[string]interface{}
 		if err := json.Unmarshal(payload, &event); err != nil {
+			log.Warnf("kiro: failed to unmarshal event payload: %v, raw: %s", err, string(payload))
 			continue
 		}

+		// DIAGNOSTIC: Log all received event types for debugging
+		log.Debugf("kiro: streamToChannel received event type: %s", eventType)
+		if log.IsLevelEnabled(log.TraceLevel) {
+			log.Tracef("kiro: streamToChannel event payload: %s", string(payload))
+		}
+
+		// Check for error/exception events in the payload (Kiro API may return errors with HTTP 200)
+		// These can appear as top-level fields or nested within the event
+		if errType, hasErrType := event["_type"].(string); hasErrType {
+			// AWS-style error: {"_type": "com.amazon.aws.codewhisperer#ValidationException", "message": "..."}
+			errMsg := ""
+			if msg, ok := event["message"].(string); ok {
+				errMsg = msg
+			}
+			log.Errorf("kiro: received AWS error in stream: type=%s, message=%s", errType, errMsg)
+			out <- cliproxyexecutor.StreamChunk{Err: fmt.Errorf("kiro API error: %s - %s", errType, errMsg)}
+			return
+		}
+		if errType, hasErrType := event["type"].(string); hasErrType && (errType == "error" || errType == "exception") {
+			// Generic error event
+			errMsg := ""
+			if msg, ok := event["message"].(string); ok {
+				errMsg = msg
+			} else if errObj, ok := event["error"].(map[string]interface{}); ok {
+				if msg, ok := errObj["message"].(string); ok {
+					errMsg = msg
+				}
+			}
+			log.Errorf("kiro: received error event in stream: type=%s, message=%s", errType, errMsg)
+			out <- cliproxyexecutor.StreamChunk{Err: fmt.Errorf("kiro API error: %s", errMsg)}
+			return
+		}
+
 		// Send message_start on first event
 		if !messageStartSent {
 			msgStart := e.buildClaudeMessageStartEvent(model, totalUsage.InputTokens)
@@ -1364,9 +1601,19 @@ func (e *KiroExecutor) streamToChannel(ctx context.Context, body io.Reader, out
 				}
 			}

-			// Handle text content
+			// Handle text content with duplicate detection
 			if contentDelta != "" {
+				// Check for duplicate content - skip if identical to last content event
+				// Based on AIClient-2-API implementation for Kiro
+				if contentDelta == lastContentEvent {
+					log.Debugf("kiro: skipping duplicate content event (len: %d)", len(contentDelta))
+					continue
+				}
+				lastContentEvent = contentDelta
+
 				outputLen += len(contentDelta)
+				// Accumulate content for streaming token calculation
+				accumulatedContent.WriteString(contentDelta)
 				// Start text content block if needed
 				if !isTextBlockOpen {
 					contentBlockIndex++
@@ -1538,8 +1785,32 @@ func (e *KiroExecutor) streamToChannel(ctx context.Context, body io.Reader, out
 		}
 	}

-	// Fallback for output tokens if not received from upstream
-	if totalUsage.OutputTokens == 0 && outputLen > 0 {
+	// Streaming token calculation - calculate output tokens from accumulated content
+	// This provides more accurate token counting than simple character division
+	if totalUsage.OutputTokens == 0 && accumulatedContent.Len() > 0 {
+		// Try to use tiktoken for accurate counting
+		if enc, err := tokenizerForModel(model); err == nil {
+			if tokenCount, countErr := enc.Count(accumulatedContent.String()); countErr == nil {
+				totalUsage.OutputTokens = int64(tokenCount)
+				log.Debugf("kiro: streamToChannel calculated output tokens using tiktoken: %d", totalUsage.OutputTokens)
+			} else {
+				// Fallback on count error: estimate from character count
+				totalUsage.OutputTokens = int64(accumulatedContent.Len() / 4)
+				if totalUsage.OutputTokens == 0 {
+					totalUsage.OutputTokens = 1
+				}
+				log.Debugf("kiro: streamToChannel tiktoken count failed, estimated from chars: %d", totalUsage.OutputTokens)
+			}
+		} else {
+			// Fallback: estimate from character count (roughly 4 chars per token)
+			totalUsage.OutputTokens = int64(accumulatedContent.Len() / 4)
+			if totalUsage.OutputTokens == 0 {
+				totalUsage.OutputTokens = 1
+			}
+			log.Debugf("kiro: streamToChannel estimated output tokens from chars: %d (content len: %d)", totalUsage.OutputTokens, accumulatedContent.Len())
+		}
+	} else if totalUsage.OutputTokens == 0 && outputLen > 0 {
+		// Legacy fallback using outputLen
 		totalUsage.OutputTokens = int64(outputLen / 4)
 		if totalUsage.OutputTokens == 0 {
 			totalUsage.OutputTokens = 1
@@ -1553,9 +1824,18 @@ func (e *KiroExecutor) streamToChannel(ctx context.Context, body io.Reader, out
 		stopReason = "tool_use"
 	}

-	// Send message_delta and message_stop
-	msgStop := e.buildClaudeMessageStopEvent(stopReason, totalUsage)
-	sseData := sdktranslator.TranslateStream(ctx, sdktranslator.FromString("kiro"), targetFormat, model, originalReq, claudeBody, msgStop, &translatorParam)
+	// Send message_delta event
+	msgDelta := e.buildClaudeMessageDeltaEvent(stopReason, totalUsage)
+	sseData := sdktranslator.TranslateStream(ctx, sdktranslator.FromString("kiro"), targetFormat, model, originalReq, claudeBody, msgDelta, &translatorParam)
+	for _, chunk := range sseData {
+		if chunk != "" {
+			out <- cliproxyexecutor.StreamChunk{Payload: []byte(chunk + "\n\n")}
+		}
+	}
+
+	// Send message_stop event separately
+	msgStop := e.buildClaudeMessageStopOnlyEvent()
+	sseData = sdktranslator.TranslateStream(ctx, sdktranslator.FromString("kiro"), targetFormat, model, originalReq, claudeBody, msgStop, &translatorParam)
 	for _, chunk := range sseData {
 		if chunk != "" {
 			out <- cliproxyexecutor.StreamChunk{Payload: []byte(chunk + "\n\n")}
@@ -1646,8 +1926,8 @@ func (e *KiroExecutor) buildClaudeContentBlockStopEvent(index int) []byte {
 	return []byte("event: content_block_stop\ndata: " + string(result))
 }

-func (e *KiroExecutor) buildClaudeMessageStopEvent(stopReason string, usageInfo usage.Detail) []byte {
-	// First message_delta
+// buildClaudeMessageDeltaEvent creates the message_delta event with stop_reason and usage.
+func (e *KiroExecutor) buildClaudeMessageDeltaEvent(stopReason string, usageInfo usage.Detail) []byte {
 	deltaEvent := map[string]interface{}{
 		"type": "message_delta",
 		"delta": map[string]interface{}{
@@ -1660,14 +1940,16 @@ func (e *KiroExecutor) buildClaudeMessageStopEvent(stopReason string, usageInfo
 		},
 	}
 	deltaResult, _ := json.Marshal(deltaEvent)
+	return []byte("event: message_delta\ndata: " + string(deltaResult))
+}

-	// Then message_stop
+// buildClaudeMessageStopOnlyEvent creates only the message_stop event.
+func (e *KiroExecutor) buildClaudeMessageStopOnlyEvent() []byte {
 	stopEvent := map[string]interface{}{
 		"type": "message_stop",
 	}
 	stopResult, _ := json.Marshal(stopEvent)
-
-	return []byte("event: message_delta\ndata: " + string(deltaResult) + "\n\nevent: message_stop\ndata: " + string(stopResult))
+	return []byte("event: message_stop\ndata: " + string(stopResult))
 }

 // buildClaudeFinalEvent constructs the final Claude-style event.
@@ -1873,6 +2155,12 @@ func (e *KiroExecutor) streamEventStream(ctx context.Context, body io.Reader, c
 			return fmt.Errorf("failed to read message: %w", err)
 		}

+		// Validate headersLen to prevent slice out of bounds
+		if headersLen+4 > uint32(len(remaining)) {
+			log.Warnf("kiro: invalid headersLen %d exceeds remaining buffer %d", headersLen, len(remaining))
+			continue
+		}
+
 		eventType := e.extractEventType(remaining[:headersLen+4])

 		payloadStart := 4 + headersLen
@@ -1886,6 +2174,7 @@ func (e *KiroExecutor) streamEventStream(ctx context.Context, body io.Reader, c

 		var event map[string]interface{}
 		if err := json.Unmarshal(payload, &event); err != nil {
+			log.Warnf("kiro: failed to unmarshal event payload: %v, raw: %s", err, string(payload))
 			continue
 		}

@@ -1983,9 +2272,19 @@ func (e *KiroExecutor) streamEventStream(ctx context.Context, body io.Reader, c
 	}
 	totalUsage.TotalTokens = totalUsage.InputTokens + totalUsage.OutputTokens

-	// Always use end_turn (no tool_use support)
-	msgStop := e.buildClaudeMessageStopEvent("end_turn", totalUsage)
-	sseData := sdktranslator.TranslateStream(ctx, sdktranslator.FromString("kiro"), targetFormat, model, originalReq, claudeBody, msgStop, &translatorParam)
+	// Send message_delta event
+	msgDelta := e.buildClaudeMessageDeltaEvent("end_turn", totalUsage)
+	sseData := sdktranslator.TranslateStream(ctx, sdktranslator.FromString("kiro"), targetFormat, model, originalReq, claudeBody, msgDelta, &translatorParam)
+	for _, chunk := range sseData {
+		if chunk != "" {
+			c.Writer.Write([]byte(chunk + "\n\n"))
+		}
+	}
+	c.Writer.Flush()
+
+	// Send message_stop event separately
+	msgStop := e.buildClaudeMessageStopOnlyEvent()
+	sseData = sdktranslator.TranslateStream(ctx, sdktranslator.FromString("kiro"), targetFormat, model, originalReq, claudeBody, msgStop, &translatorParam)
 	for _, chunk := range sseData {
 		if chunk != "" {
 			c.Writer.Write([]byte(chunk + "\n\n"))
@@ -2057,6 +2356,128 @@ func (e *KiroExecutor) isTokenExpired(accessToken string) bool {
 	return isExpired
 }

+// ============================================================================
+// Message Merging Support - Merge adjacent messages with the same role
+// Based on AIClient-2-API implementation
+// ============================================================================
+
+// mergeAdjacentMessages merges adjacent messages with the same role.
+// This reduces API call complexity and improves compatibility.
+// Based on AIClient-2-API implementation.
+func mergeAdjacentMessages(messages []gjson.Result) []gjson.Result {
+	if len(messages) <= 1 {
+		return messages
+	}
+
+	var merged []gjson.Result
+	for _, msg := range messages {
+		if len(merged) == 0 {
+			merged = append(merged, msg)
+			continue
+		}
+
+		lastMsg := merged[len(merged)-1]
+		currentRole := msg.Get("role").String()
+		lastRole := lastMsg.Get("role").String()
+
+		if currentRole == lastRole {
+			// Merge content from current message into last message
+			mergedContent := mergeMessageContent(lastMsg, msg)
+			// Create a new merged message JSON
+			mergedMsg := createMergedMessage(lastRole, mergedContent)
+			merged[len(merged)-1] = gjson.Parse(mergedMsg)
+		} else {
+			merged = append(merged, msg)
+		}
+	}
+
+	return merged
+}
+
+// mergeMessageContent merges the content of two messages with the same role.
+// Handles both string content and array content (with text, tool_use, tool_result blocks).
+func mergeMessageContent(msg1, msg2 gjson.Result) string {
+	content1 := msg1.Get("content")
+	content2 := msg2.Get("content")
+
+	// Extract content blocks from both messages
+	var blocks1, blocks2 []map[string]interface{}
+
+	if content1.IsArray() {
+		for _, block := range content1.Array() {
+			blocks1 = append(blocks1, blockToMap(block))
+		}
+	} else if content1.Type == gjson.String {
+		blocks1 = append(blocks1, map[string]interface{}{
+			"type": "text",
+			"text": content1.String(),
+		})
+	}
+
+	if content2.IsArray() {
+		for _, block := range content2.Array() {
+			blocks2 = append(blocks2, blockToMap(block))
+		}
+	} else if content2.Type == gjson.String {
+		blocks2 = append(blocks2, map[string]interface{}{
+			"type": "text",
+			"text": content2.String(),
+		})
+	}
+
+	// Merge text blocks if both end/start with text
+	if len(blocks1) > 0 && len(blocks2) > 0 {
+		if blocks1[len(blocks1)-1]["type"] == "text" && blocks2[0]["type"] == "text" {
+			// Merge the last text block of msg1 with the first text block of msg2
+			text1 := blocks1[len(blocks1)-1]["text"].(string)
+			text2 := blocks2[0]["text"].(string)
+			blocks1[len(blocks1)-1]["text"] = text1 + "\n" + text2
+			blocks2 = blocks2[1:] // Remove the merged block from blocks2
+		}
+	}
+
+	// Combine all blocks
+	allBlocks := append(blocks1, blocks2...)
+
+	// Convert to JSON
+	result, _ := json.Marshal(allBlocks)
+	return string(result)
+}
+
+// blockToMap converts a gjson.Result block to a map[string]interface{}
+func blockToMap(block gjson.Result) map[string]interface{} {
+	result := make(map[string]interface{})
+	block.ForEach(func(key, value gjson.Result) bool {
+		if value.IsObject() {
+			result[key.String()] = blockToMap(value)
+		} else if value.IsArray() {
+			var arr []interface{}
+			for _, item := range value.Array() {
+				if item.IsObject() {
+					arr = append(arr, blockToMap(item))
+				} else {
+					arr = append(arr, item.Value())
+				}
+			}
+			result[key.String()] = arr
+		} else {
+			result[key.String()] = value.Value()
+		}
+		return true
+	})
+	return result
+}
+
+// createMergedMessage creates a JSON string for a merged message
+func createMergedMessage(role string, content string) string {
+	msg := map[string]interface{}{
+		"role":    role,
+		"content": json.RawMessage(content),
+	}
+	result, _ := json.Marshal(msg)
+	return string(result)
+}
+
 // ============================================================================
 // Tool Calling Support - Embedded tool call parsing and input buffering
 // Based on amq2api and AIClient-2-API implementations
@@ -2079,8 +2500,6 @@ var (
 	whitespaceCollapsePattern = regexp.MustCompile(`\s+`)
 	// trailingCommaPattern matches trailing commas before closing braces/brackets
 	trailingCommaPattern = regexp.MustCompile(`,\s*([}\]])`)
-	// unquotedKeyPattern matches unquoted JSON keys that need quoting
-	unquotedKeyPattern = regexp.MustCompile(`([{,]\s*)([a-zA-Z_][a-zA-Z0-9_]*)\s*:`)
 )

 // parseEmbeddedToolCalls extracts [Called tool_name with args: {...}] format from text.
@@ -2246,14 +2665,208 @@ func findMatchingBracket(text string, startPos int) int {
 }

 // repairJSON attempts to fix common JSON issues that may occur in tool call arguments.
-// Based on AIClient-2-API's JSON repair implementation.
+// Based on AIClient-2-API's JSON repair implementation with a more conservative strategy.
+//
+// Conservative repair strategy:
+// 1. First try to parse JSON directly - if valid, return as-is
+// 2. Only attempt repair if parsing fails
+// 3. After repair, validate the result - if still invalid, return original
+//
+// Handles incomplete JSON by balancing brackets and removing trailing incomplete content.
 // Uses pre-compiled regex patterns for performance.
-func repairJSON(raw string) string {
+func repairJSON(jsonString string) string {
+	// Handle empty or invalid input
+	if jsonString == "" {
+		return "{}"
+	}
+	
+	str := strings.TrimSpace(jsonString)
+	if str == "" {
+		return "{}"
+	}
+	
+	// CONSERVATIVE STRATEGY: First try to parse directly
+	// If the JSON is already valid, return it unchanged
+	var testParse interface{}
+	if err := json.Unmarshal([]byte(str), &testParse); err == nil {
+		log.Debugf("kiro: repairJSON - JSON is already valid, returning unchanged")
+		return str
+	}
+	
+	log.Debugf("kiro: repairJSON - JSON parse failed, attempting repair")
+	originalStr := str // Keep original for fallback
+	
+	// First, escape unescaped newlines/tabs within JSON string values
+	str = escapeNewlinesInStrings(str)
 	// Remove trailing commas before closing braces/brackets
-	repaired := trailingCommaPattern.ReplaceAllString(raw, "$1")
-	// Fix unquoted keys (basic attempt - handles simple cases)
-	repaired = unquotedKeyPattern.ReplaceAllString(repaired, `$1"$2":`)
-	return repaired
+	str = trailingCommaPattern.ReplaceAllString(str, "$1")
+	
+	// Calculate bracket balance to detect incomplete JSON
+	braceCount := 0    // {} balance
+	bracketCount := 0  // [] balance
+	inString := false
+	escape := false
+	lastValidIndex := -1
+	
+	for i := 0; i < len(str); i++ {
+		char := str[i]
+		
+		// Handle escape sequences
+		if escape {
+			escape = false
+			continue
+		}
+		
+		if char == '\\' {
+			escape = true
+			continue
+		}
+		
+		// Handle string boundaries
+		if char == '"' {
+			inString = !inString
+			continue
+		}
+		
+		// Skip characters inside strings (they don't affect bracket balance)
+		if inString {
+			continue
+		}
+		
+		// Track bracket balance
+		switch char {
+		case '{':
+			braceCount++
+		case '}':
+			braceCount--
+		case '[':
+			bracketCount++
+		case ']':
+			bracketCount--
+		}
+		
+		// Record last valid position (where brackets are balanced or positive)
+		if braceCount >= 0 && bracketCount >= 0 {
+			lastValidIndex = i
+		}
+	}
+	
+	// If brackets are unbalanced, try to repair
+	if braceCount > 0 || bracketCount > 0 {
+		// Truncate to last valid position if we have incomplete content
+		if lastValidIndex > 0 && lastValidIndex < len(str)-1 {
+			// Check if truncation would help (only truncate if there's trailing garbage)
+			truncated := str[:lastValidIndex+1]
+			// Recount brackets after truncation
+			braceCount = 0
+			bracketCount = 0
+			inString = false
+			escape = false
+			for i := 0; i < len(truncated); i++ {
+				char := truncated[i]
+				if escape {
+					escape = false
+					continue
+				}
+				if char == '\\' {
+					escape = true
+					continue
+				}
+				if char == '"' {
+					inString = !inString
+					continue
+				}
+				if inString {
+					continue
+				}
+				switch char {
+				case '{':
+					braceCount++
+				case '}':
+					braceCount--
+				case '[':
+					bracketCount++
+				case ']':
+					bracketCount--
+				}
+			}
+			str = truncated
+		}
+		
+		// Add missing closing brackets
+		for braceCount > 0 {
+			str += "}"
+			braceCount--
+		}
+		for bracketCount > 0 {
+			str += "]"
+			bracketCount--
+		}
+	}
+	
+	// CONSERVATIVE STRATEGY: Validate repaired JSON
+	// If repair didn't produce valid JSON, return original string
+	if err := json.Unmarshal([]byte(str), &testParse); err != nil {
+		log.Warnf("kiro: repairJSON - repair failed to produce valid JSON, returning original")
+		return originalStr
+	}
+	
+	log.Debugf("kiro: repairJSON - successfully repaired JSON")
+	return str
+}
+
+// escapeNewlinesInStrings escapes literal newlines, tabs, and other control characters
+// that appear inside JSON string values. This handles cases where streaming fragments
+// contain unescaped control characters within string content.
+func escapeNewlinesInStrings(raw string) string {
+	var result strings.Builder
+	result.Grow(len(raw) + 100) // Pre-allocate with some extra space
+
+	inString := false
+	escaped := false
+
+	for i := 0; i < len(raw); i++ {
+		c := raw[i]
+
+		if escaped {
+			// Previous character was backslash, this is an escape sequence
+			result.WriteByte(c)
+			escaped = false
+			continue
+		}
+
+		if c == '\\' && inString {
+			// Start of escape sequence
+			result.WriteByte(c)
+			escaped = true
+			continue
+		}
+
+		if c == '"' {
+			// Toggle string state
+			inString = !inString
+			result.WriteByte(c)
+			continue
+		}
+
+		if inString {
+			// Inside a string, escape control characters
+			switch c {
+			case '\n':
+				result.WriteString("\\n")
+			case '\r':
+				result.WriteString("\\r")
+			case '\t':
+				result.WriteString("\\t")
+			default:
+				result.WriteByte(c)
+			}
+		} else {
+			result.WriteByte(c)
+		}
+	}
+
+	return result.String()
 }

 // processToolUseEvent handles a toolUseEvent from the Kiro stream.
@@ -2330,6 +2943,8 @@ func (e *KiroExecutor) processToolUseEvent(event map[string]interface{}, current

 	// Accumulate input fragments
 	if currentToolUse != nil && inputFragment != "" {
+		// Accumulate fragments directly - they form valid JSON when combined
+		// The fragments are already decoded from JSON, so we just concatenate them
 		currentToolUse.inputBuffer.WriteString(inputFragment)
 		log.Debugf("kiro: accumulated input fragment, total length: %d", currentToolUse.inputBuffer.Len())
 	}
--- a/internal/runtime/executor/openai_compat_executor.go
+++ b/internal/runtime/executor/openai_compat_executor.go
@@ -58,10 +58,15 @@ func (e *OpenAICompatExecutor) Execute(ctx context.Context, auth *cliproxyauth.A
 		translated = e.overrideModel(translated, modelOverride)
 	}
 	translated = applyPayloadConfigWithRoot(e.cfg, req.Model, to.String(), "", translated)
-	translated = applyReasoningEffortMetadataChatCompletions(translated, req.Metadata, req.Model)
-	if upstreamModel := util.ResolveOriginalModel(req.Model, req.Metadata); upstreamModel != "" {
+	translated = applyReasoningEffortMetadata(translated, req.Metadata, req.Model, "reasoning_effort")
+	upstreamModel := util.ResolveOriginalModel(req.Model, req.Metadata)
+	if upstreamModel != "" {
 		translated, _ = sjson.SetBytes(translated, "model", upstreamModel)
 	}
+	translated = normalizeThinkingConfig(translated, upstreamModel)
+	if errValidate := validateThinkingConfig(translated, upstreamModel); errValidate != nil {
+		return resp, errValidate
+	}

 	url := strings.TrimSuffix(baseURL, "/") + "/chat/completions"
 	httpReq, err := http.NewRequestWithContext(ctx, http.MethodPost, url, bytes.NewReader(translated))
@@ -147,10 +152,15 @@ func (e *OpenAICompatExecutor) ExecuteStream(ctx context.Context, auth *cliproxy
 		translated = e.overrideModel(translated, modelOverride)
 	}
 	translated = applyPayloadConfigWithRoot(e.cfg, req.Model, to.String(), "", translated)
-	translated = applyReasoningEffortMetadataChatCompletions(translated, req.Metadata, req.Model)
-	if upstreamModel := util.ResolveOriginalModel(req.Model, req.Metadata); upstreamModel != "" {
+	translated = applyReasoningEffortMetadata(translated, req.Metadata, req.Model, "reasoning_effort")
+	upstreamModel := util.ResolveOriginalModel(req.Model, req.Metadata)
+	if upstreamModel != "" {
 		translated, _ = sjson.SetBytes(translated, "model", upstreamModel)
 	}
+	translated = normalizeThinkingConfig(translated, upstreamModel)
+	if errValidate := validateThinkingConfig(translated, upstreamModel); errValidate != nil {
+		return nil, errValidate
+	}

 	url := strings.TrimSuffix(baseURL, "/") + "/chat/completions"
 	httpReq, err := http.NewRequestWithContext(ctx, http.MethodPost, url, bytes.NewReader(translated))
@@ -214,7 +224,7 @@ func (e *OpenAICompatExecutor) ExecuteStream(ctx context.Context, auth *cliproxy
 			}
 		}()
 		scanner := bufio.NewScanner(httpResp.Body)
-		scanner.Buffer(nil, 20_971_520)
+		scanner.Buffer(nil, 52_428_800) // 50MB
 		var param any
 		for scanner.Scan() {
 			line := scanner.Bytes()
--- a/internal/runtime/executor/payload_helpers.go
+++ b/internal/runtime/executor/payload_helpers.go
@@ -1,6 +1,8 @@
 package executor

 import (
+	"fmt"
+	"net/http"
 	"strings"

 	"github.com/router-for-me/CLIProxyAPI/v6/internal/config"
@@ -9,7 +11,7 @@ import (
 	"github.com/tidwall/sjson"
 )

-// applyThinkingMetadata applies thinking config from model suffix metadata (e.g., -reasoning, -thinking-N)
+// applyThinkingMetadata applies thinking config from model suffix metadata (e.g., (high), (8192))
 // for standard Gemini format payloads. It normalizes the budget when the model supports thinking.
 func applyThinkingMetadata(payload []byte, metadata map[string]any, model string) []byte {
 	budgetOverride, includeOverride, ok := util.ResolveThinkingConfigFromMetadata(model, metadata)
@@ -26,7 +28,7 @@ func applyThinkingMetadata(payload []byte, metadata map[string]any, model string
 	return util.ApplyGeminiThinkingConfig(payload, budgetOverride, includeOverride)
 }

-// applyThinkingMetadataCLI applies thinking config from model suffix metadata (e.g., -reasoning, -thinking-N)
+// applyThinkingMetadataCLI applies thinking config from model suffix metadata (e.g., (high), (8192))
 // for Gemini CLI format payloads (nested under "request"). It normalizes the budget when the model supports thinking.
 func applyThinkingMetadataCLI(payload []byte, metadata map[string]any, model string) []byte {
 	budgetOverride, includeOverride, ok := util.ResolveThinkingConfigFromMetadata(model, metadata)
@@ -43,40 +45,21 @@ func applyThinkingMetadataCLI(payload []byte, metadata map[string]any, model str
 	return util.ApplyGeminiCLIThinkingConfig(payload, budgetOverride, includeOverride)
 }

-// applyReasoningEffortMetadata applies reasoning effort overrides (reasoning.effort) when present in metadata.
-// It avoids overwriting an existing reasoning.effort field and only applies to models that support thinking.
-func applyReasoningEffortMetadata(payload []byte, metadata map[string]any, model string) []byte {
+// applyReasoningEffortMetadata applies reasoning effort overrides from metadata to the given JSON path.
+// Metadata values take precedence over any existing field when the model supports thinking, intentionally
+// overwriting caller-provided values to honor suffix/default metadata priority.
+func applyReasoningEffortMetadata(payload []byte, metadata map[string]any, model, field string) []byte {
 	if len(metadata) == 0 {
 		return payload
 	}
 	if !util.ModelSupportsThinking(model) {
 		return payload
 	}
-	if gjson.GetBytes(payload, "reasoning.effort").Exists() {
+	if field == "" {
 		return payload
 	}
 	if effort, ok := util.ReasoningEffortFromMetadata(metadata); ok && effort != "" {
-		if updated, err := sjson.SetBytes(payload, "reasoning.effort", effort); err == nil {
-			return updated
-		}
-	}
-	return payload
-}
-
-// applyReasoningEffortMetadataChatCompletions applies reasoning_effort (OpenAI chat completions field)
-// when present in metadata. It avoids overwriting an existing reasoning_effort field.
-func applyReasoningEffortMetadataChatCompletions(payload []byte, metadata map[string]any, model string) []byte {
-	if len(metadata) == 0 {
-		return payload
-	}
-	if !util.ModelSupportsThinking(model) {
-		return payload
-	}
-	if gjson.GetBytes(payload, "reasoning_effort").Exists() {
-		return payload
-	}
-	if effort, ok := util.ReasoningEffortFromMetadata(metadata); ok && effort != "" {
-		if updated, err := sjson.SetBytes(payload, "reasoning_effort", effort); err == nil {
+		if updated, err := sjson.SetBytes(payload, field, effort); err == nil {
 			return updated
 		}
 	}
@@ -232,3 +215,93 @@ func matchModelPattern(pattern, model string) bool {
 	}
 	return pi == len(pattern)
 }
+
+// normalizeThinkingConfig normalizes thinking-related fields in the payload
+// based on model capabilities. For models without thinking support, it strips
+// reasoning fields. For models with level-based thinking, it validates and
+// normalizes the reasoning effort level.
+func normalizeThinkingConfig(payload []byte, model string) []byte {
+	if len(payload) == 0 || model == "" {
+		return payload
+	}
+
+	if !util.ModelSupportsThinking(model) {
+		return stripThinkingFields(payload)
+	}
+
+	if util.ModelUsesThinkingLevels(model) {
+		return normalizeReasoningEffortLevel(payload, model)
+	}
+
+	return payload
+}
+
+// stripThinkingFields removes thinking-related fields from the payload for
+// models that do not support thinking.
+func stripThinkingFields(payload []byte) []byte {
+	fieldsToRemove := []string{
+		"reasoning",
+		"reasoning_effort",
+		"reasoning.effort",
+	}
+	out := payload
+	for _, field := range fieldsToRemove {
+		if gjson.GetBytes(out, field).Exists() {
+			out, _ = sjson.DeleteBytes(out, field)
+		}
+	}
+	return out
+}
+
+// normalizeReasoningEffortLevel validates and normalizes the reasoning_effort
+// or reasoning.effort field for level-based thinking models.
+func normalizeReasoningEffortLevel(payload []byte, model string) []byte {
+	out := payload
+
+	if effort := gjson.GetBytes(out, "reasoning_effort"); effort.Exists() {
+		if normalized, ok := util.NormalizeReasoningEffortLevel(model, effort.String()); ok {
+			out, _ = sjson.SetBytes(out, "reasoning_effort", normalized)
+		}
+	}
+
+	if effort := gjson.GetBytes(out, "reasoning.effort"); effort.Exists() {
+		if normalized, ok := util.NormalizeReasoningEffortLevel(model, effort.String()); ok {
+			out, _ = sjson.SetBytes(out, "reasoning.effort", normalized)
+		}
+	}
+
+	return out
+}
+
+// validateThinkingConfig checks for unsupported reasoning levels on level-based models.
+// Returns a statusErr with 400 when an unsupported level is supplied to avoid silently
+// downgrading requests.
+func validateThinkingConfig(payload []byte, model string) error {
+	if len(payload) == 0 || model == "" {
+		return nil
+	}
+	if !util.ModelSupportsThinking(model) || !util.ModelUsesThinkingLevels(model) {
+		return nil
+	}
+
+	levels := util.GetModelThinkingLevels(model)
+	checkField := func(path string) error {
+		if effort := gjson.GetBytes(payload, path); effort.Exists() {
+			if _, ok := util.NormalizeReasoningEffortLevel(model, effort.String()); !ok {
+				return statusErr{
+					code: http.StatusBadRequest,
+					msg:  fmt.Sprintf("unsupported reasoning effort level %q for model %s (supported: %s)", effort.String(), model, strings.Join(levels, ", ")),
+				}
+			}
+		}
+		return nil
+	}
+
+	if err := checkField("reasoning_effort"); err != nil {
+		return err
+	}
+	if err := checkField("reasoning.effort"); err != nil {
+		return err
+	}
+	return nil
+}
--- a/internal/runtime/executor/qwen_executor.go
+++ b/internal/runtime/executor/qwen_executor.go
@@ -51,10 +51,15 @@ func (e *QwenExecutor) Execute(ctx context.Context, auth *cliproxyauth.Auth, req
 	from := opts.SourceFormat
 	to := sdktranslator.FromString("openai")
 	body := sdktranslator.TranslateRequest(from, to, req.Model, bytes.Clone(req.Payload), false)
-	body = applyReasoningEffortMetadataChatCompletions(body, req.Metadata, req.Model)
-	if upstreamModel := util.ResolveOriginalModel(req.Model, req.Metadata); upstreamModel != "" {
+	body = applyReasoningEffortMetadata(body, req.Metadata, req.Model, "reasoning_effort")
+	upstreamModel := util.ResolveOriginalModel(req.Model, req.Metadata)
+	if upstreamModel != "" {
 		body, _ = sjson.SetBytes(body, "model", upstreamModel)
 	}
+	body = normalizeThinkingConfig(body, upstreamModel)
+	if errValidate := validateThinkingConfig(body, upstreamModel); errValidate != nil {
+		return resp, errValidate
+	}
 	body = applyPayloadConfig(e.cfg, req.Model, body)

 	url := strings.TrimSuffix(baseURL, "/") + "/chat/completions"
@@ -126,10 +131,15 @@ func (e *QwenExecutor) ExecuteStream(ctx context.Context, auth *cliproxyauth.Aut
 	to := sdktranslator.FromString("openai")
 	body := sdktranslator.TranslateRequest(from, to, req.Model, bytes.Clone(req.Payload), true)

-	body = applyReasoningEffortMetadataChatCompletions(body, req.Metadata, req.Model)
-	if upstreamModel := util.ResolveOriginalModel(req.Model, req.Metadata); upstreamModel != "" {
+	body = applyReasoningEffortMetadata(body, req.Metadata, req.Model, "reasoning_effort")
+	upstreamModel := util.ResolveOriginalModel(req.Model, req.Metadata)
+	if upstreamModel != "" {
 		body, _ = sjson.SetBytes(body, "model", upstreamModel)
 	}
+	body = normalizeThinkingConfig(body, upstreamModel)
+	if errValidate := validateThinkingConfig(body, upstreamModel); errValidate != nil {
+		return nil, errValidate
+	}
 	toolsResult := gjson.GetBytes(body, "tools")
 	// I'm addressing the Qwen3 "poisoning" issue, which is caused by the model needing a tool to be defined. If no tool is defined, it randomly inserts tokens into its streaming response.
 	// This will have no real consequences. It's just to scare Qwen3.
@@ -190,7 +200,7 @@ func (e *QwenExecutor) ExecuteStream(ctx context.Context, auth *cliproxyauth.Aut
 			}
 		}()
 		scanner := bufio.NewScanner(httpResp.Body)
-		scanner.Buffer(nil, 20_971_520)
+		scanner.Buffer(nil, 52_428_800) // 50MB
 		var param any
 		for scanner.Scan() {
 			line := scanner.Bytes()
--- a/internal/translator/antigravity/claude/antigravity_claude_response.go
+++ b/internal/translator/antigravity/claude/antigravity_claude_response.go
@@ -35,6 +35,7 @@ type Params struct {
 	TotalTokenCount      int64  // Cached total token count from usage metadata
 	HasSentFinalEvents   bool   // Indicates if final content/message events have been sent
 	HasToolUse           bool   // Indicates if tool use was observed in the stream
+	HasContent           bool   // Tracks whether any content (text, thinking, or tool use) has been output
 }

 // toolUseIDCounter provides a process-wide unique counter for tool use identifiers.
@@ -69,11 +70,14 @@ func ConvertAntigravityResponseToClaude(_ context.Context, _ string, originalReq

 	if bytes.Equal(rawJSON, []byte("[DONE]")) {
 		output := ""
-		appendFinalEvents(params, &output, true)
-
-		return []string{
-			output + "event: message_stop\ndata: {\"type\":\"message_stop\"}\n\n\n",
+		// Only send final events if we have actually output content
+		if params.HasContent {
+			appendFinalEvents(params, &output, true)
+			return []string{
+				output + "event: message_stop\ndata: {\"type\":\"message_stop\"}\n\n\n",
+			}
 		}
+		return []string{}
 	}

 	output := ""
@@ -119,10 +123,12 @@ func ConvertAntigravityResponseToClaude(_ context.Context, _ string, originalReq
 						output = output + "event: content_block_delta\n"
 						data, _ := sjson.Set(fmt.Sprintf(`{"type":"content_block_delta","index":%d,"delta":{"type":"signature_delta","signature":""}}`, params.ResponseIndex), "delta.signature", thoughtSignature.String())
 						output = output + fmt.Sprintf("data: %s\n\n\n", data)
+						params.HasContent = true
 					} else if params.ResponseType == 2 { // Continue existing thinking block if already in thinking state
 						output = output + "event: content_block_delta\n"
 						data, _ := sjson.Set(fmt.Sprintf(`{"type":"content_block_delta","index":%d,"delta":{"type":"thinking_delta","thinking":""}}`, params.ResponseIndex), "delta.thinking", partTextResult.String())
 						output = output + fmt.Sprintf("data: %s\n\n\n", data)
+						params.HasContent = true
 					} else {
 						// Transition from another state to thinking
 						// First, close any existing content block
@@ -146,6 +152,7 @@ func ConvertAntigravityResponseToClaude(_ context.Context, _ string, originalReq
 						data, _ := sjson.Set(fmt.Sprintf(`{"type":"content_block_delta","index":%d,"delta":{"type":"thinking_delta","thinking":""}}`, params.ResponseIndex), "delta.thinking", partTextResult.String())
 						output = output + fmt.Sprintf("data: %s\n\n\n", data)
 						params.ResponseType = 2 // Set state to thinking
+						params.HasContent = true
 					}
 				} else {
 					finishReasonResult := gjson.GetBytes(rawJSON, "response.candidates.0.finishReason")
@@ -156,6 +163,7 @@ func ConvertAntigravityResponseToClaude(_ context.Context, _ string, originalReq
 							output = output + "event: content_block_delta\n"
 							data, _ := sjson.Set(fmt.Sprintf(`{"type":"content_block_delta","index":%d,"delta":{"type":"text_delta","text":""}}`, params.ResponseIndex), "delta.text", partTextResult.String())
 							output = output + fmt.Sprintf("data: %s\n\n\n", data)
+							params.HasContent = true
 						} else {
 							// Transition from another state to text content
 							// First, close any existing content block
@@ -179,6 +187,7 @@ func ConvertAntigravityResponseToClaude(_ context.Context, _ string, originalReq
 								data, _ := sjson.Set(fmt.Sprintf(`{"type":"content_block_delta","index":%d,"delta":{"type":"text_delta","text":""}}`, params.ResponseIndex), "delta.text", partTextResult.String())
 								output = output + fmt.Sprintf("data: %s\n\n\n", data)
 								params.ResponseType = 1 // Set state to content
+								params.HasContent = true
 							}
 						}
 					}
@@ -230,6 +239,7 @@ func ConvertAntigravityResponseToClaude(_ context.Context, _ string, originalReq
 					output = output + fmt.Sprintf("data: %s\n\n\n", data)
 				}
 				params.ResponseType = 3
+				params.HasContent = true
 			}
 		}
 	}
@@ -269,6 +279,11 @@ func appendFinalEvents(params *Params, output *string, force bool) {
 		return
 	}

+	// Only send final events if we have actually output content
+	if !params.HasContent {
+		return
+	}
+
 	if params.ResponseType != 0 {
 		*output = *output + "event: content_block_stop\n"
 		*output = *output + fmt.Sprintf(`data: {"type":"content_block_stop","index":%d}`, params.ResponseIndex)
--- a/internal/translator/claude/gemini/claude_gemini_response.go
+++ b/internal/translator/claude/gemini/claude_gemini_response.go
@@ -331,9 +331,8 @@ func ConvertClaudeResponseToGeminiNonStream(_ context.Context, modelName string,
 	streamingEvents := make([][]byte, 0)

 	scanner := bufio.NewScanner(bytes.NewReader(rawJSON))
-	// Use a smaller initial buffer (64KB) that can grow up to 20MB if needed
-	// This prevents allocating 20MB for every request regardless of size
-	scanner.Buffer(make([]byte, 64*1024), 20_971_520)
+	buffer := make([]byte, 52_428_800) // 50MB
+	scanner.Buffer(buffer, 52_428_800)
 	for scanner.Scan() {
 		line := scanner.Bytes()
 		// log.Debug(string(line))
--- a/internal/translator/claude/openai/responses/claude_openai-responses_response.go
+++ b/internal/translator/claude/openai/responses/claude_openai-responses_response.go
@@ -445,8 +445,8 @@ func ConvertClaudeResponseToOpenAIResponsesNonStream(_ context.Context, _ string
 		// Use a simple scanner to iterate through raw bytes
 		// Note: extremely large responses may require increasing the buffer
 		scanner := bufio.NewScanner(bytes.NewReader(rawJSON))
-		buf := make([]byte, 20_971_520)
-		scanner.Buffer(buf, 20_971_520)
+		buf := make([]byte, 52_428_800) // 50MB
+		scanner.Buffer(buf, 52_428_800)
 		for scanner.Scan() {
 			line := scanner.Bytes()
 			if !bytes.HasPrefix(line, dataTag) {
--- a/internal/translator/gemini-cli/claude/gemini-cli_claude_response.go
+++ b/internal/translator/gemini-cli/claude/gemini-cli_claude_response.go
@@ -26,6 +26,7 @@ type Params struct {
 	HasFirstResponse bool // Indicates if the initial message_start event has been sent
 	ResponseType     int  // Current response type: 0=none, 1=content, 2=thinking, 3=function
 	ResponseIndex    int  // Index counter for content blocks in the streaming response
+	HasContent       bool // Tracks whether any content (text, thinking, or tool use) has been output
 }

 // toolUseIDCounter provides a process-wide unique counter for tool use identifiers.
@@ -57,19 +58,23 @@ func ConvertGeminiCLIResponseToClaude(_ context.Context, _ string, originalReque
 	}

 	if bytes.Equal(rawJSON, []byte("[DONE]")) {
-		return []string{
-			"event: message_stop\ndata: {\"type\":\"message_stop\"}\n\n\n",
+		// Only send message_stop if we have actually output content
+		if (*param).(*Params).HasContent {
+			return []string{
+				"event: message_stop\ndata: {\"type\":\"message_stop\"}\n\n\n",
+			}
 		}
+		return []string{}
 	}

 	// Track whether tools are being used in this response chunk
 	usedTool := false
-	var sb strings.Builder
+	output := ""

 	// Initialize the streaming session with a message_start event
 	// This is only sent for the very first response chunk to establish the streaming session
 	if !(*param).(*Params).HasFirstResponse {
-		sb.WriteString("event: message_start\n")
+		output = "event: message_start\n"

 		// Create the initial message structure with default values according to Claude Code API specification
 		// This follows the Claude Code API specification for streaming message initialization
@@ -82,7 +87,7 @@ func ConvertGeminiCLIResponseToClaude(_ context.Context, _ string, originalReque
 		if responseIDResult := gjson.GetBytes(rawJSON, "response.responseId"); responseIDResult.Exists() {
 			messageStartTemplate, _ = sjson.Set(messageStartTemplate, "message.id", responseIDResult.String())
 		}
-		sb.WriteString(fmt.Sprintf("data: %s\n\n\n", messageStartTemplate))
+		output = output + fmt.Sprintf("data: %s\n\n\n", messageStartTemplate)

 		(*param).(*Params).HasFirstResponse = true
 	}
@@ -105,53 +110,67 @@ func ConvertGeminiCLIResponseToClaude(_ context.Context, _ string, originalReque
 				if partResult.Get("thought").Bool() {
 					// Continue existing thinking block if already in thinking state
 					if (*param).(*Params).ResponseType == 2 {
-						sb.WriteString("event: content_block_delta\n")
+						output = output + "event: content_block_delta\n"
 						data, _ := sjson.Set(fmt.Sprintf(`{"type":"content_block_delta","index":%d,"delta":{"type":"thinking_delta","thinking":""}}`, (*param).(*Params).ResponseIndex), "delta.thinking", partTextResult.String())
-						sb.WriteString(fmt.Sprintf("data: %s\n\n\n", data))
+						output = output + fmt.Sprintf("data: %s\n\n\n", data)
+						(*param).(*Params).HasContent = true
 					} else {
 						// Transition from another state to thinking
 						// First, close any existing content block
 						if (*param).(*Params).ResponseType != 0 {
-							sb.WriteString("event: content_block_stop\n")
-							sb.WriteString(fmt.Sprintf(`data: {"type":"content_block_stop","index":%d}`, (*param).(*Params).ResponseIndex))
-							sb.WriteString("\n\n\n")
+							if (*param).(*Params).ResponseType == 2 {
+								// output = output + "event: content_block_delta\n"
+								// output = output + fmt.Sprintf(`data: {"type":"content_block_delta","index":%d,"delta":{"type":"signature_delta","signature":null}}`, (*param).(*Params).ResponseIndex)
+								// output = output + "\n\n\n"
+							}
+							output = output + "event: content_block_stop\n"
+							output = output + fmt.Sprintf(`data: {"type":"content_block_stop","index":%d}`, (*param).(*Params).ResponseIndex)
+							output = output + "\n\n\n"
 							(*param).(*Params).ResponseIndex++
 						}

 						// Start a new thinking content block
-						sb.WriteString("event: content_block_start\n")
-						sb.WriteString(fmt.Sprintf(`data: {"type":"content_block_start","index":%d,"content_block":{"type":"thinking","thinking":""}}`, (*param).(*Params).ResponseIndex))
-						sb.WriteString("\n\n\n")
-						sb.WriteString("event: content_block_delta\n")
+						output = output + "event: content_block_start\n"
+						output = output + fmt.Sprintf(`data: {"type":"content_block_start","index":%d,"content_block":{"type":"thinking","thinking":""}}`, (*param).(*Params).ResponseIndex)
+						output = output + "\n\n\n"
+						output = output + "event: content_block_delta\n"
 						data, _ := sjson.Set(fmt.Sprintf(`{"type":"content_block_delta","index":%d,"delta":{"type":"thinking_delta","thinking":""}}`, (*param).(*Params).ResponseIndex), "delta.thinking", partTextResult.String())
-						sb.WriteString(fmt.Sprintf("data: %s\n\n\n", data))
+						output = output + fmt.Sprintf("data: %s\n\n\n", data)
 						(*param).(*Params).ResponseType = 2 // Set state to thinking
+						(*param).(*Params).HasContent = true
 					}
 				} else {
 					// Process regular text content (user-visible output)
 					// Continue existing text block if already in content state
 					if (*param).(*Params).ResponseType == 1 {
-						sb.WriteString("event: content_block_delta\n")
+						output = output + "event: content_block_delta\n"
 						data, _ := sjson.Set(fmt.Sprintf(`{"type":"content_block_delta","index":%d,"delta":{"type":"text_delta","text":""}}`, (*param).(*Params).ResponseIndex), "delta.text", partTextResult.String())
-						sb.WriteString(fmt.Sprintf("data: %s\n\n\n", data))
+						output = output + fmt.Sprintf("data: %s\n\n\n", data)
+						(*param).(*Params).HasContent = true
 					} else {
 						// Transition from another state to text content
 						// First, close any existing content block
 						if (*param).(*Params).ResponseType != 0 {
-							sb.WriteString("event: content_block_stop\n")
-							sb.WriteString(fmt.Sprintf(`data: {"type":"content_block_stop","index":%d}`, (*param).(*Params).ResponseIndex))
-							sb.WriteString("\n\n\n")
+							if (*param).(*Params).ResponseType == 2 {
+								// output = output + "event: content_block_delta\n"
+								// output = output + fmt.Sprintf(`data: {"type":"content_block_delta","index":%d,"delta":{"type":"signature_delta","signature":null}}`, (*param).(*Params).ResponseIndex)
+								// output = output + "\n\n\n"
+							}
+							output = output + "event: content_block_stop\n"
+							output = output + fmt.Sprintf(`data: {"type":"content_block_stop","index":%d}`, (*param).(*Params).ResponseIndex)
+							output = output + "\n\n\n"
 							(*param).(*Params).ResponseIndex++
 						}

 						// Start a new text content block
-						sb.WriteString("event: content_block_start\n")
-						sb.WriteString(fmt.Sprintf(`data: {"type":"content_block_start","index":%d,"content_block":{"type":"text","text":""}}`, (*param).(*Params).ResponseIndex))
-						sb.WriteString("\n\n\n")
-						sb.WriteString("event: content_block_delta\n")
+						output = output + "event: content_block_start\n"
+						output = output + fmt.Sprintf(`data: {"type":"content_block_start","index":%d,"content_block":{"type":"text","text":""}}`, (*param).(*Params).ResponseIndex)
+						output = output + "\n\n\n"
+						output = output + "event: content_block_delta\n"
 						data, _ := sjson.Set(fmt.Sprintf(`{"type":"content_block_delta","index":%d,"delta":{"type":"text_delta","text":""}}`, (*param).(*Params).ResponseIndex), "delta.text", partTextResult.String())
-						sb.WriteString(fmt.Sprintf("data: %s\n\n\n", data))
+						output = output + fmt.Sprintf("data: %s\n\n\n", data)
 						(*param).(*Params).ResponseType = 1 // Set state to content
+						(*param).(*Params).HasContent = true
 					}
 				}
 			} else if functionCallResult.Exists() {
@@ -163,37 +182,45 @@ func ConvertGeminiCLIResponseToClaude(_ context.Context, _ string, originalReque
 				// Handle state transitions when switching to function calls
 				// Close any existing function call block first
 				if (*param).(*Params).ResponseType == 3 {
-					sb.WriteString("event: content_block_stop\n")
-					sb.WriteString(fmt.Sprintf(`data: {"type":"content_block_stop","index":%d}`, (*param).(*Params).ResponseIndex))
-					sb.WriteString("\n\n\n")
+					output = output + "event: content_block_stop\n"
+					output = output + fmt.Sprintf(`data: {"type":"content_block_stop","index":%d}`, (*param).(*Params).ResponseIndex)
+					output = output + "\n\n\n"
 					(*param).(*Params).ResponseIndex++
 					(*param).(*Params).ResponseType = 0
 				}

+				// Special handling for thinking state transition
+				if (*param).(*Params).ResponseType == 2 {
+					// output = output + "event: content_block_delta\n"
+					// output = output + fmt.Sprintf(`data: {"type":"content_block_delta","index":%d,"delta":{"type":"signature_delta","signature":null}}`, (*param).(*Params).ResponseIndex)
+					// output = output + "\n\n\n"
+				}
+
 				// Close any other existing content block
 				if (*param).(*Params).ResponseType != 0 {
-					sb.WriteString("event: content_block_stop\n")
-					sb.WriteString(fmt.Sprintf(`data: {"type":"content_block_stop","index":%d}`, (*param).(*Params).ResponseIndex))
-					sb.WriteString("\n\n\n")
+					output = output + "event: content_block_stop\n"
+					output = output + fmt.Sprintf(`data: {"type":"content_block_stop","index":%d}`, (*param).(*Params).ResponseIndex)
+					output = output + "\n\n\n"
 					(*param).(*Params).ResponseIndex++
 				}

 				// Start a new tool use content block
 				// This creates the structure for a function call in Claude Code format
-				sb.WriteString("event: content_block_start\n")
+				output = output + "event: content_block_start\n"

 				// Create the tool use block with unique ID and function details
 				data := fmt.Sprintf(`{"type":"content_block_start","index":%d,"content_block":{"type":"tool_use","id":"","name":"","input":{}}}`, (*param).(*Params).ResponseIndex)
 				data, _ = sjson.Set(data, "content_block.id", fmt.Sprintf("%s-%d-%d", fcName, time.Now().UnixNano(), atomic.AddUint64(&toolUseIDCounter, 1)))
 				data, _ = sjson.Set(data, "content_block.name", fcName)
-				sb.WriteString(fmt.Sprintf("data: %s\n\n\n", data))
+				output = output + fmt.Sprintf("data: %s\n\n\n", data)

 				if fcArgsResult := functionCallResult.Get("args"); fcArgsResult.Exists() {
-					sb.WriteString("event: content_block_delta\n")
+					output = output + "event: content_block_delta\n"
 					data, _ = sjson.Set(fmt.Sprintf(`{"type":"content_block_delta","index":%d,"delta":{"type":"input_json_delta","partial_json":""}}`, (*param).(*Params).ResponseIndex), "delta.partial_json", fcArgsResult.Raw)
-					sb.WriteString(fmt.Sprintf("data: %s\n\n\n", data))
+					output = output + fmt.Sprintf("data: %s\n\n\n", data)
 				}
 				(*param).(*Params).ResponseType = 3
+				(*param).(*Params).HasContent = true
 			}
 		}
 	}
@@ -202,32 +229,35 @@ func ConvertGeminiCLIResponseToClaude(_ context.Context, _ string, originalReque
 	// Process usage metadata and finish reason when present in the response
 	if usageResult.Exists() && bytes.Contains(rawJSON, []byte(`"finishReason"`)) {
 		if candidatesTokenCountResult := usageResult.Get("candidatesTokenCount"); candidatesTokenCountResult.Exists() {
-			// Close the final content block
-			sb.WriteString("event: content_block_stop\n")
-			sb.WriteString(fmt.Sprintf(`data: {"type":"content_block_stop","index":%d}`, (*param).(*Params).ResponseIndex))
-			sb.WriteString("\n\n\n")
+			// Only send final events if we have actually output content
+			if (*param).(*Params).HasContent {
+				// Close the final content block
+				output = output + "event: content_block_stop\n"
+				output = output + fmt.Sprintf(`data: {"type":"content_block_stop","index":%d}`, (*param).(*Params).ResponseIndex)
+				output = output + "\n\n\n"

-			// Send the final message delta with usage information and stop reason
-			sb.WriteString("event: message_delta\n")
-			sb.WriteString(`data: `)
+				// Send the final message delta with usage information and stop reason
+				output = output + "event: message_delta\n"
+				output = output + `data: `

-			// Create the message delta template with appropriate stop reason
-			template := `{"type":"message_delta","delta":{"stop_reason":"end_turn","stop_sequence":null},"usage":{"input_tokens":0,"output_tokens":0}}`
-			// Set tool_use stop reason if tools were used in this response
-			if usedTool {
-				template = `{"type":"message_delta","delta":{"stop_reason":"tool_use","stop_sequence":null},"usage":{"input_tokens":0,"output_tokens":0}}`
+				// Create the message delta template with appropriate stop reason
+				template := `{"type":"message_delta","delta":{"stop_reason":"end_turn","stop_sequence":null},"usage":{"input_tokens":0,"output_tokens":0}}`
+				// Set tool_use stop reason if tools were used in this response
+				if usedTool {
+					template = `{"type":"message_delta","delta":{"stop_reason":"tool_use","stop_sequence":null},"usage":{"input_tokens":0,"output_tokens":0}}`
+				}
+
+				// Include thinking tokens in output token count if present
+				thoughtsTokenCount := usageResult.Get("thoughtsTokenCount").Int()
+				template, _ = sjson.Set(template, "usage.output_tokens", candidatesTokenCountResult.Int()+thoughtsTokenCount)
+				template, _ = sjson.Set(template, "usage.input_tokens", usageResult.Get("promptTokenCount").Int())
+
+				output = output + template + "\n\n\n"
 			}
-
-			// Include thinking tokens in output token count if present
-			thoughtsTokenCount := usageResult.Get("thoughtsTokenCount").Int()
-			template, _ = sjson.Set(template, "usage.output_tokens", candidatesTokenCountResult.Int()+thoughtsTokenCount)
-			template, _ = sjson.Set(template, "usage.input_tokens", usageResult.Get("promptTokenCount").Int())
-
-			sb.WriteString(template + "\n\n\n")
 		}
 	}

-	return []string{sb.String()}
+	return []string{output}
 }

 // ConvertGeminiCLIResponseToClaudeNonStream converts a non-streaming Gemini CLI response to a non-streaming Claude response.
--- a/internal/translator/gemini/claude/gemini_claude_response.go
+++ b/internal/translator/gemini/claude/gemini_claude_response.go
@@ -25,6 +25,7 @@ type Params struct {
 	HasFirstResponse bool
 	ResponseType     int
 	ResponseIndex    int
+	HasContent       bool // Tracks whether any content (text, thinking, or tool use) has been output
 }

 // toolUseIDCounter provides a process-wide unique counter for tool use identifiers.
@@ -57,9 +58,13 @@ func ConvertGeminiResponseToClaude(_ context.Context, _ string, originalRequestR
 	}

 	if bytes.Equal(rawJSON, []byte("[DONE]")) {
-		return []string{
-			"event: message_stop\ndata: {\"type\":\"message_stop\"}\n\n\n",
+		// Only send message_stop if we have actually output content
+		if (*param).(*Params).HasContent {
+			return []string{
+				"event: message_stop\ndata: {\"type\":\"message_stop\"}\n\n\n",
+			}
 		}
+		return []string{}
 	}

 	// Track whether tools are being used in this response chunk
@@ -108,6 +113,7 @@ func ConvertGeminiResponseToClaude(_ context.Context, _ string, originalRequestR
 						output = output + "event: content_block_delta\n"
 						data, _ := sjson.Set(fmt.Sprintf(`{"type":"content_block_delta","index":%d,"delta":{"type":"thinking_delta","thinking":""}}`, (*param).(*Params).ResponseIndex), "delta.thinking", partTextResult.String())
 						output = output + fmt.Sprintf("data: %s\n\n\n", data)
+						(*param).(*Params).HasContent = true
 					} else {
 						// Transition from another state to thinking
 						// First, close any existing content block
@@ -131,6 +137,7 @@ func ConvertGeminiResponseToClaude(_ context.Context, _ string, originalRequestR
 						data, _ := sjson.Set(fmt.Sprintf(`{"type":"content_block_delta","index":%d,"delta":{"type":"thinking_delta","thinking":""}}`, (*param).(*Params).ResponseIndex), "delta.thinking", partTextResult.String())
 						output = output + fmt.Sprintf("data: %s\n\n\n", data)
 						(*param).(*Params).ResponseType = 2 // Set state to thinking
+						(*param).(*Params).HasContent = true
 					}
 				} else {
 					// Process regular text content (user-visible output)
@@ -139,6 +146,7 @@ func ConvertGeminiResponseToClaude(_ context.Context, _ string, originalRequestR
 						output = output + "event: content_block_delta\n"
 						data, _ := sjson.Set(fmt.Sprintf(`{"type":"content_block_delta","index":%d,"delta":{"type":"text_delta","text":""}}`, (*param).(*Params).ResponseIndex), "delta.text", partTextResult.String())
 						output = output + fmt.Sprintf("data: %s\n\n\n", data)
+						(*param).(*Params).HasContent = true
 					} else {
 						// Transition from another state to text content
 						// First, close any existing content block
@@ -162,6 +170,7 @@ func ConvertGeminiResponseToClaude(_ context.Context, _ string, originalRequestR
 						data, _ := sjson.Set(fmt.Sprintf(`{"type":"content_block_delta","index":%d,"delta":{"type":"text_delta","text":""}}`, (*param).(*Params).ResponseIndex), "delta.text", partTextResult.String())
 						output = output + fmt.Sprintf("data: %s\n\n\n", data)
 						(*param).(*Params).ResponseType = 1 // Set state to content
+						(*param).(*Params).HasContent = true
 					}
 				}
 			} else if functionCallResult.Exists() {
@@ -211,6 +220,7 @@ func ConvertGeminiResponseToClaude(_ context.Context, _ string, originalRequestR
 					output = output + fmt.Sprintf("data: %s\n\n\n", data)
 				}
 				(*param).(*Params).ResponseType = 3
+				(*param).(*Params).HasContent = true
 			}
 		}
 	}
@@ -218,23 +228,26 @@ func ConvertGeminiResponseToClaude(_ context.Context, _ string, originalRequestR
 	usageResult := gjson.GetBytes(rawJSON, "usageMetadata")
 	if usageResult.Exists() && bytes.Contains(rawJSON, []byte(`"finishReason"`)) {
 		if candidatesTokenCountResult := usageResult.Get("candidatesTokenCount"); candidatesTokenCountResult.Exists() {
-			output = output + "event: content_block_stop\n"
-			output = output + fmt.Sprintf(`data: {"type":"content_block_stop","index":%d}`, (*param).(*Params).ResponseIndex)
-			output = output + "\n\n\n"
+			// Only send final events if we have actually output content
+			if (*param).(*Params).HasContent {
+				output = output + "event: content_block_stop\n"
+				output = output + fmt.Sprintf(`data: {"type":"content_block_stop","index":%d}`, (*param).(*Params).ResponseIndex)
+				output = output + "\n\n\n"

-			output = output + "event: message_delta\n"
-			output = output + `data: `
+				output = output + "event: message_delta\n"
+				output = output + `data: `

-			template := `{"type":"message_delta","delta":{"stop_reason":"end_turn","stop_sequence":null},"usage":{"input_tokens":0,"output_tokens":0}}`
-			if usedTool {
-				template = `{"type":"message_delta","delta":{"stop_reason":"tool_use","stop_sequence":null},"usage":{"input_tokens":0,"output_tokens":0}}`
+				template := `{"type":"message_delta","delta":{"stop_reason":"end_turn","stop_sequence":null},"usage":{"input_tokens":0,"output_tokens":0}}`
+				if usedTool {
+					template = `{"type":"message_delta","delta":{"stop_reason":"tool_use","stop_sequence":null},"usage":{"input_tokens":0,"output_tokens":0}}`
+				}
+
+				thoughtsTokenCount := usageResult.Get("thoughtsTokenCount").Int()
+				template, _ = sjson.Set(template, "usage.output_tokens", candidatesTokenCountResult.Int()+thoughtsTokenCount)
+				template, _ = sjson.Set(template, "usage.input_tokens", usageResult.Get("promptTokenCount").Int())
+
+				output = output + template + "\n\n\n"
 			}
-
-			thoughtsTokenCount := usageResult.Get("thoughtsTokenCount").Int()
-			template, _ = sjson.Set(template, "usage.output_tokens", candidatesTokenCountResult.Int()+thoughtsTokenCount)
-			template, _ = sjson.Set(template, "usage.input_tokens", usageResult.Get("promptTokenCount").Int())
-
-			output = output + template + "\n\n\n"
 		}
 	}

--- a/internal/translator/kiro/openai/chat-completions/kiro_openai_response.go
+++ b/internal/translator/kiro/openai/chat-completions/kiro_openai_response.go
@@ -171,7 +171,7 @@ func convertClaudeEventToOpenAI(jsonStr string, model string) []string {
 		return results

 	case "message_delta":
-		// Final message delta with stop_reason
+		// Final message delta with stop_reason and usage
 		stopReason := root.Get("delta.stop_reason").String()
 		if stopReason != "" {
 			finishReason := "stop"
@@ -196,6 +196,19 @@ func convertClaudeEventToOpenAI(jsonStr string, model string) []string {
 					},
 				},
 			}
+
+			// Extract and include usage information from message_delta event
+			usage := root.Get("usage")
+			if usage.Exists() {
+				inputTokens := usage.Get("input_tokens").Int()
+				outputTokens := usage.Get("output_tokens").Int()
+				response["usage"] = map[string]interface{}{
+					"prompt_tokens":     inputTokens,
+					"completion_tokens": outputTokens,
+					"total_tokens":      inputTokens + outputTokens,
+				}
+			}
+
 			result, _ := json.Marshal(response)
 			results = append(results, string(result))
 		}
--- a/internal/util/claude_thinking.go
+++ b/internal/util/claude_thinking.go
@@ -0,0 +1,46 @@
+package util
+
+import (
+	"github.com/tidwall/gjson"
+	"github.com/tidwall/sjson"
+)
+
+// ApplyClaudeThinkingConfig applies thinking configuration to a Claude API request payload.
+// It sets the thinking.type to "enabled" and thinking.budget_tokens to the specified budget.
+// If budget is nil or the payload already has thinking config, it returns the payload unchanged.
+func ApplyClaudeThinkingConfig(body []byte, budget *int) []byte {
+	if budget == nil {
+		return body
+	}
+	if gjson.GetBytes(body, "thinking").Exists() {
+		return body
+	}
+	if *budget <= 0 {
+		return body
+	}
+	updated := body
+	updated, _ = sjson.SetBytes(updated, "thinking.type", "enabled")
+	updated, _ = sjson.SetBytes(updated, "thinking.budget_tokens", *budget)
+	return updated
+}
+
+// ResolveClaudeThinkingConfig resolves thinking configuration from metadata for Claude models.
+// It uses the unified ResolveThinkingConfigFromMetadata and normalizes the budget.
+// Returns the normalized budget (nil if thinking should not be enabled) and whether it matched.
+func ResolveClaudeThinkingConfig(modelName string, metadata map[string]any) (*int, bool) {
+	budget, include, matched := ResolveThinkingConfigFromMetadata(modelName, metadata)
+	if !matched {
+		return nil, false
+	}
+	if include != nil && !*include {
+		return nil, true
+	}
+	if budget == nil {
+		return nil, true
+	}
+	normalized := NormalizeThinkingBudget(modelName, *budget)
+	if normalized <= 0 {
+		return nil, true
+	}
+	return &normalized, true
+}
--- a/internal/util/thinking.go
+++ b/internal/util/thinking.go
@@ -1,6 +1,8 @@
 package util

 import (
+	"strings"
+
 	"github.com/router-for-me/CLIProxyAPI/v6/internal/registry"
 )

@@ -67,3 +69,39 @@ func thinkingRangeFromRegistry(model string) (found bool, min int, max int, zero
 	}
 	return true, info.Thinking.Min, info.Thinking.Max, info.Thinking.ZeroAllowed, info.Thinking.DynamicAllowed
 }
+
+// GetModelThinkingLevels returns the discrete reasoning effort levels for the model.
+// Returns nil if the model has no thinking support or no levels defined.
+func GetModelThinkingLevels(model string) []string {
+	if model == "" {
+		return nil
+	}
+	info := registry.GetGlobalRegistry().GetModelInfo(model)
+	if info == nil || info.Thinking == nil {
+		return nil
+	}
+	return info.Thinking.Levels
+}
+
+// ModelUsesThinkingLevels reports whether the model uses discrete reasoning
+// effort levels instead of numeric budgets.
+func ModelUsesThinkingLevels(model string) bool {
+	levels := GetModelThinkingLevels(model)
+	return len(levels) > 0
+}
+
+// NormalizeReasoningEffortLevel validates and normalizes a reasoning effort
+// level for the given model. Returns false when the level is not supported.
+func NormalizeReasoningEffortLevel(model, effort string) (string, bool) {
+	levels := GetModelThinkingLevels(model)
+	if len(levels) == 0 {
+		return "", false
+	}
+	loweredEffort := strings.ToLower(strings.TrimSpace(effort))
+	for _, lvl := range levels {
+		if strings.ToLower(lvl) == loweredEffort {
+			return lvl, true
+		}
+	}
+	return "", false
+}
--- a/internal/util/thinking_suffix.go
+++ b/internal/util/thinking_suffix.go
@@ -14,61 +14,59 @@ const (
 )

 // NormalizeThinkingModel parses dynamic thinking suffixes on model names and returns
-// the normalized base model with extracted metadata. Supported patterns:
-//   - "-thinking-<number>" extracts a numeric budget
-//   - "-thinking-<level>" extracts a reasoning effort level (minimal/low/medium/high/xhigh/auto/none)
-//   - "-thinking" maps to a default reasoning effort of "medium"
-//   - "-reasoning" maps to dynamic budget (-1) and include_thoughts=true
-//   - "-nothinking" maps to budget=0 and include_thoughts=false
+// the normalized base model with extracted metadata. Supported pattern:
+//   - "(<value>)" where value can be:
+//   - A numeric budget (e.g., "(8192)", "(16384)")
+//   - A reasoning effort level (e.g., "(high)", "(medium)", "(low)")
+//
+// Examples:
+//   - "claude-sonnet-4-5-20250929(16384)" → budget=16384
+//   - "gpt-5.1(high)" → reasoning_effort="high"
+//   - "gemini-2.5-pro(32768)" → budget=32768
+//
+// Note: Empty parentheses "()" are not supported and will be ignored.
 func NormalizeThinkingModel(modelName string) (string, map[string]any) {
 	if modelName == "" {
 		return modelName, nil
 	}

-	lower := strings.ToLower(modelName)
 	baseModel := modelName

 	var (
 		budgetOverride  *int
-		includeThoughts *bool
 		reasoningEffort *string
 		matched         bool
 	)

-	switch {
-	case strings.HasSuffix(lower, "-nothinking"):
-		baseModel = modelName[:len(modelName)-len("-nothinking")]
-		budget := 0
-		include := false
-		budgetOverride = &budget
-		includeThoughts = &include
-		matched = true
-	case strings.HasSuffix(lower, "-reasoning"):
-		baseModel = modelName[:len(modelName)-len("-reasoning")]
-		budget := -1
-		include := true
-		budgetOverride = &budget
-		includeThoughts = &include
-		matched = true
-	default:
-		if idx := strings.LastIndex(lower, "-thinking-"); idx != -1 {
-			value := modelName[idx+len("-thinking-"):]
-			if value != "" {
-				if parsed, ok := parseIntPrefix(value); ok {
-					baseModel = modelName[:idx]
-					budgetOverride = &parsed
-					matched = true
-				} else if effort, okEffort := normalizeReasoningEffort(value); okEffort {
-					baseModel = modelName[:idx]
-					reasoningEffort = &effort
-					matched = true
-				}
-			}
-		} else if strings.HasSuffix(lower, "-thinking") {
-			baseModel = modelName[:len(modelName)-len("-thinking")]
-			effort := "medium"
-			reasoningEffort = &effort
+	// Match "(<value>)" pattern at the end of the model name
+	if idx := strings.LastIndex(modelName, "("); idx != -1 {
+		if !strings.HasSuffix(modelName, ")") {
+			// Incomplete parenthesis, ignore
+			return baseModel, nil
+		}
+
+		value := modelName[idx+1 : len(modelName)-1] // Extract content between ( and )
+		if value == "" {
+			// Empty parentheses not supported
+			return baseModel, nil
+		}
+
+		candidateBase := modelName[:idx]
+
+		// Auto-detect: pure numeric → budget, string → reasoning effort level
+		if parsed, ok := parseIntPrefix(value); ok {
+			// Numeric value: treat as thinking budget
+			baseModel = candidateBase
+			budgetOverride = &parsed
 			matched = true
+		} else {
+			// String value: treat as reasoning effort level
+			baseModel = candidateBase
+			raw := strings.ToLower(strings.TrimSpace(value))
+			if raw != "" {
+				reasoningEffort = &raw
+				matched = true
+			}
 		}
 	}

@@ -82,9 +80,6 @@ func NormalizeThinkingModel(modelName string) (string, map[string]any) {
 	if budgetOverride != nil {
 		metadata[ThinkingBudgetMetadataKey] = *budgetOverride
 	}
-	if includeThoughts != nil {
-		metadata[ThinkingIncludeThoughtsMetadataKey] = *includeThoughts
-	}
 	if reasoningEffort != nil {
 		metadata[ReasoningEffortMetadataKey] = *reasoningEffort
 	}
@@ -185,7 +180,7 @@ func ReasoningEffortFromMetadata(metadata map[string]any) (string, bool) {
 		return "", false
 	}
 	if effort != nil && *effort != "" {
-		return *effort, true
+		return strings.ToLower(strings.TrimSpace(*effort)), true
 	}
 	if budget != nil {
 		switch *budget {
@@ -207,7 +202,11 @@ func ThinkingEffortToBudget(model, effort string) (int, bool) {
 	if effort == "" {
 		return 0, false
 	}
-	switch strings.ToLower(effort) {
+	normalized, ok := NormalizeReasoningEffortLevel(model, effort)
+	if !ok {
+		normalized = strings.ToLower(strings.TrimSpace(effort))
+	}
+	switch normalized {
 	case "none":
 		return 0, true
 	case "auto":
@@ -312,16 +311,3 @@ func parseNumberToInt(raw any) (int, bool) {
 	}
 	return 0, false
 }
-
-func normalizeReasoningEffort(value string) (string, bool) {
-	if value == "" {
-		return "", false
-	}
-	effort := strings.ToLower(strings.TrimSpace(value))
-	switch effort {
-	case "minimal", "low", "medium", "high", "xhigh", "auto", "none":
-		return effort, true
-	default:
-		return "", false
-	}
-}
--- a/sdk/api/handlers/claude/code_handlers.go
+++ b/sdk/api/handlers/claude/code_handlers.go
@@ -271,6 +271,11 @@ func (h *ClaudeCodeAPIHandler) forwardClaudeStream(c *gin.Context, flusher http.
 				continue
 			}
 			if errMsg != nil {
+				status := http.StatusInternalServerError
+				if errMsg.StatusCode > 0 {
+					status = errMsg.StatusCode
+				}
+				c.Status(status)
 				// An error occurred: emit as a proper SSE error event
 				errorBytes, _ := json.Marshal(h.toClaudeError(errMsg))
 				_, _ = writer.WriteString("event: error\n")
@@ -278,6 +283,7 @@ func (h *ClaudeCodeAPIHandler) forwardClaudeStream(c *gin.Context, flusher http.
 				_, _ = writer.Write(errorBytes)
 				_, _ = writer.WriteString("\n\n")
 				_ = writer.Flush()
+				flusher.Flush()
 			}
 			var execErr error
 			if errMsg != nil {
Author	SHA1	Message	Date
Luis Pater	28469576bf	Merge branch 'router-for-me:main' into main	2025-12-12 02:41:05 +08:00
Luis Pater	ef0edbfe69	refactor(claude): replace `strings.Builder` with simpler `output` string concatenation	2025-12-11 22:34:06 +08:00
Luis Pater	bb6312b4fc	Merge pull request #488 from router-for-me/gemini Unify the Gemini executor style	2025-12-11 22:14:17 +08:00
hkfires	3c315551b0	refactor(executor): relocate gemini token counters	2025-12-11 21:56:44 +08:00
hkfires	27c9c5c4da	refactor(executor): clarify executor comments and oauth names	2025-12-11 21:56:44 +08:00
hkfires	fc9f6c974a	refactor(executor): clarify providers and streams Add package and constructor documentation for AI Studio, Antigravity, Gemini CLI, Gemini API, and Vertex executors to describe their roles and inputs. Introduce a shared stream scanner buffer constant in the Gemini API executor and reuse it in Gemini CLI and Vertex streaming code so stream handling uses a consistent configuration. Update Refresh implementations for AI Studio, Gemini CLI, Gemini API (API key), and Vertex executors to short‑circuit and simply return the incoming auth object, while keeping Antigravity token renewal as the only executor that performs OAuth refresh. Remove OAuth2-based token refresh logic and related dependencies from the Gemini API executor, since it now operates strictly with API key credentials.	2025-12-11 21:56:43 +08:00
Luis Pater	242b4d5754	Merge pull request #19 from router-for-me/plus v6.6.1	2025-12-11 21:35:16 +08:00
Luis Pater	4ce7c61a17	Merge branch 'main' into plus	2025-12-11 21:33:49 +08:00
Luis Pater	a74ee3f319	Merge pull request #481 from sususu98/fix/increase-buffer-size fix: increase buffer size for stream scanners to 50MB across multiple executors	2025-12-11 21:20:54 +08:00
Luis Pater	564bcbaa54	Merge pull request #487 from router-for-me/amp fix(amp): set status on claude stream errors	2025-12-11 21:18:19 +08:00
hkfires	88bdd25f06	fix(amp): set status on claude stream errors	2025-12-11 20:12:06 +08:00
hkfires	e79f65fd8e	refactor(thinking): use parentheses for metadata suffix	2025-12-11 18:39:07 +08:00
Luis Pater	2760989401	Merge pull request #485 from router-for-me/think Think	2025-12-11 18:27:00 +08:00
hkfires	facfe7c518	refactor(thinking): use bracket tags for thinking meta Align thinking suffix handling on a single bracket-style marker. NormalizeThinkingModel strips a terminal `[value]` segment from model identifiers and turns it into either a thinking budget (for numeric values) or a reasoning effort hint (for strings). Emission of `ThinkingIncludeThoughtsMetadataKey` is removed. Executor helpers and the example config are updated so their comments reference the new `[value]` suffix format instead of the legacy dash variants. BREAKING CHANGE: dash-based thinking suffixes (`-thinking`, `-thinking-N`, `-reasoning`, `-nothinking`) are no longer parsed for thinking metadata; only `[value]` annotations are recognized.	2025-12-11 18:17:28 +08:00
hkfires	6285459c08	fix(runtime): unify claude thinking config resolution	2025-12-11 17:20:44 +08:00
hkfires	21bbceca0c	docs(runtime): document reasoning effort precedence	2025-12-11 16:35:36 +08:00
hkfires	f6300c72b7	fix(runtime): validate thinking config in iflow and qwen	2025-12-11 16:21:50 +08:00
hkfires	007572b58e	fix(util): do not strip thinking suffix on registered models NormalizeThinkingModel now checks ModelSupportsThinking before removing "-thinking" or "-thinking-<ver>", avoiding accidental parsing of model names where the suffix is part of the official id (e.g., kimi-k2-thinking, qwen3-235b-a22b-thinking-2507). The registry adds ThinkingSupport metadata for several models and propagates it via ModelInfo (e.g., kimi-k2-thinking, deepseek-r1, qwen3-235b-a22b-thinking-2507, minimax-m2), enabling accurate detection of thinking-capable models and correcting base model inference.	2025-12-11 15:52:14 +08:00
hkfires	3a81ab22fd	fix(runtime): unify reasoning effort metadata overrides	2025-12-11 14:35:05 +08:00
hkfires	519da2e042	fix(runtime): validate reasoning effort levels	2025-12-11 12:36:54 +08:00
hkfires	169f4295d0	fix(util): align reasoning effort handling with registry	2025-12-11 12:20:12 +08:00
hkfires	d06d0eab2f	fix(util): centralize reasoning effort normalization	2025-12-11 12:14:51 +08:00
hkfires	3ffd120ae9	feat(runtime): add thinking config normalization	2025-12-11 11:51:33 +08:00
hkfires	a03d514095	feat(registry): add thinking metadata for models	2025-12-11 11:28:44 +08:00
Luis Pater	69fccf0015	Merge pull request #18 from Ravens2121/master Kiro Executor Stability and API Compatibility Improvements	2025-12-11 08:35:22 +08:00
Luis Pater	1da03bfe15	Merge pull request #479 from router-for-me/claude fix(claude): prevent final events when no content streamed	2025-12-11 08:18:59 +08:00
Ravens2121	6133bac226	feat(kiro): enhance Kiro executor stability and compatibility ## Changes Overview This commit includes multiple improvements to the Kiro executor for better stability, API compatibility, and code quality. ## Detailed Changes ### 1. Output Token Calculation Improvement (lines 317-330) - Replace simple len(content)/4 estimation with tiktoken-based calculation - Add fallback to character count estimation if tiktoken fails - Improves token counting accuracy for usage tracking ### 2. Stream Handler Panic Recovery (lines 528-533) - Add defer/recover block in streamToChannel goroutine - Prevents single request crashes from affecting the entire service ### 3. Struct Field Reordering (lines 670-673) - Reorder kiroToolResult struct fields: Content, Status, ToolUseID - Ensures consistency with API expectations ### 4. Message Merging Function (lines 778-780, 2356-2483) - Add mergeAdjacentMessages() to combine consecutive messages with same role - Add helper functions: mergeMessageContent(), blockToMap(), createMergedMessage() - Required by Kiro API which doesn't allow adjacent messages from same role ### 5. Empty Content Handling (lines 791-800) - Add default content for empty history messages - User messages with tool results: "Tool results provided." - User messages without tool results: "Continue" ### 6. Assistant Last Message Handling (lines 811-830) - Detect when last message is from assistant - Create synthetic "Continue" user message to satisfy Kiro API requirements - Kiro API requires currentMessage to be userInputMessage type ### 7. Duplicate Content Event Detection (lines 1650-1660) - Track lastContentEvent to detect duplicate streaming events - Skip redundant events to prevent duplicate content in responses - Based on AIClient-2-API implementation for Kiro ### 8. Streaming Token Calculation Enhancement (lines 1785-1817) - Add accumulatedContent buffer for streaming token calculation - Use tiktoken for accurate output token counting during streaming - Add fallback to character count estimation with proper logging ### 9. JSON Repair Enhancement (lines 2665-2818) - Implement conservative JSON repair strategy - First try to parse JSON directly - if valid, return unchanged - Add bracket balancing detection and repair - Only repair when necessary to avoid corrupting valid JSON - Validate repaired JSON before returning ### 10. HELIOS_CHK Filtering Removal (lines 2500-2504, 3004-3039) - Remove filterHeliosDebugInfo function - Remove heliosDebugPattern regex - HELIOS_CHK fields now handled by client-side processing ### 11. Comment Translation - Translate Chinese comments to English for code consistency - Affected areas: token calculation, buffer handling, message processing	2025-12-11 08:13:33 +08:00
Ravens2121	f302be5ce6	Merge branch 'router-for-me:main' into master	2025-12-11 05:40:10 +08:00
Ravens2121	cd4e84a360	feat(kiro): enhance request format, stream handling, and usage tracking ## English Description ### Request Format Fixes - Fix conversationState field order (chatTriggerType must be first) - Add conditional profileArn inclusion based on auth method - builder-id auth (AWS SSO) doesn't require profileArn - social auth (Google OAuth) requires profileArn ### Stream Processing Enhancements - Add headersLen boundary validation to prevent slice out of bounds - Handle incomplete tool use at EOF by flushing pending data - Separate message_delta and message_stop events for proper streaming - Add error logging for JSON unmarshal failures ### JSON Repair Improvements - Add escapeNewlinesInStrings() to handle control characters in JSON strings - Remove incorrect unquotedKeyPattern that broke valid JSON content - Fix handling of streaming fragments with embedded newlines/tabs ### Debug Info Filtering (Optional) - Add filterHeliosDebugInfo() to remove [HELIOS_CHK] blocks - Pattern matches internal state tracking from Kiro/Amazon Q - Currently disabled pending further testing ### Usage Tracking - Add usage information extraction in message_delta response - Include prompt_tokens, completion_tokens, total_tokens in OpenAI format --- ## 中文描述 ### 请求格式修复 - 修复 conversationState 字段顺序（chatTriggerType 必须在第一位） - 根据认证方式条件性包含 profileArn - builder-id 认证（AWS SSO）不需要 profileArn - social 认证（Google OAuth）需要 profileArn ### 流处理增强 - 添加 headersLen 边界验证，防止切片越界 - 在 EOF 时处理未完成的工具调用，刷新待处理数据 - 分离 message_delta 和 message_stop 事件以实现正确的流式传输 - 添加 JSON 反序列化失败的错误日志 ### JSON 修复改进 - 添加 escapeNewlinesInStrings() 处理 JSON 字符串中的控制字符 - 移除错误的 unquotedKeyPattern，该模式会破坏有效的 JSON 内容 - 修复包含嵌入换行符/制表符的流式片段处理 ### 调试信息过滤（可选） - 添加 filterHeliosDebugInfo() 移除 [HELIOS_CHK] 块 - 模式匹配来自 Kiro/Amazon Q 的内部状态跟踪信息 - 目前已禁用，等待进一步测试 ### 使用量跟踪 - 在 message_delta 响应中添加 usage 信息提取 - 以 OpenAI 格式包含 prompt_tokens、completion_tokens、total_tokens	2025-12-11 05:37:22 +08:00
sususu	76c563d161	fix(executor): increase buffer size for stream scanners to 50MB across multiple executors	2025-12-10 23:20:04 +08:00
hkfires	a89514951f	fix(claude): prevent final events when no content streamed	2025-12-10 22:19:55 +08:00