feat(kiro): Add token usage cross-validation and simplify thinking mode handling

2026-03-09 15:25:17 +00:00 · 2025-12-14 16:37:08 +08:00
parent e73b9e10a6
commit c3ed3b40ea
3 changed files with 78 additions and 27 deletions
--- a/internal/runtime/executor/kiro_executor.go
+++ b/internal/runtime/executor/kiro_executor.go
@@ -56,6 +56,34 @@ var (
 	usageUpdateTimeInterval  = 15 * time.Second // Or every 15 seconds, whichever comes first
 )

+// kiroRateMultipliers maps Kiro model IDs to their credit multipliers.
+// Used for cross-validation of token estimates against upstream credit usage.
+// Source: Kiro API model definitions
+var kiroRateMultipliers = map[string]float64{
+	"claude-haiku-4.5":  0.4,
+	"claude-sonnet-4":   1.3,
+	"claude-sonnet-4.5": 1.3,
+	"claude-opus-4.5":   2.2,
+	"auto":              1.0, // Default multiplier for auto model selection
+}
+
+// getKiroRateMultiplier returns the credit multiplier for a given model ID.
+// Returns 1.0 as default if model is not found.
+func getKiroRateMultiplier(modelID string) float64 {
+	if multiplier, ok := kiroRateMultipliers[modelID]; ok {
+		return multiplier
+	}
+	return 1.0
+}
+
+// abs64 returns the absolute value of an int64.
+func abs64(x int64) int64 {
+	if x < 0 {
+		return -x
+	}
+	return x
+}
+
 // kiroEndpointConfig bundles endpoint URL with its compatible Origin and AmzTarget values.
 // This solves the "triple mismatch" problem where different endpoints require matching
 // Origin and X-Amz-Target header values.
@@ -166,7 +194,8 @@ type KiroExecutor struct {
 // This is critical because OpenAI and Claude formats have different tool structures:
 // - OpenAI: tools[].function.name, tools[].function.description
 // - Claude: tools[].name, tools[].description
-func buildKiroPayloadForFormat(body []byte, modelID, profileArn, origin string, isAgentic, isChatOnly bool, sourceFormat sdktranslator.Format) []byte {
+// Returns the serialized JSON payload and a boolean indicating whether thinking mode was injected.
+func buildKiroPayloadForFormat(body []byte, modelID, profileArn, origin string, isAgentic, isChatOnly bool, sourceFormat sdktranslator.Format) ([]byte, bool) {
 	switch sourceFormat.String() {
 	case "openai":
 		log.Debugf("kiro: using OpenAI payload builder for source format: %s", sourceFormat.String())
@@ -248,7 +277,7 @@ func (e *KiroExecutor) executeWithRetry(ctx context.Context, auth *cliproxyauth.

 		// Rebuild payload with the correct origin for this endpoint
 		// Each endpoint requires its matching Origin value in the request body
-		kiroPayload = buildKiroPayloadForFormat(body, kiroModelID, profileArn, currentOrigin, isAgentic, isChatOnly, from)
+		kiroPayload, _ = buildKiroPayloadForFormat(body, kiroModelID, profileArn, currentOrigin, isAgentic, isChatOnly, from)

 		log.Debugf("kiro: trying endpoint %d/%d: %s (Name: %s, Origin: %s)",
 			endpointIdx+1, len(endpointConfigs), url, endpointConfig.Name, currentOrigin)
@@ -358,7 +387,7 @@ func (e *KiroExecutor) executeWithRetry(ctx context.Context, auth *cliproxyauth.
 						auth = refreshedAuth
 						accessToken, profileArn = kiroCredentials(auth)
 						// Rebuild payload with new profile ARN if changed
-						kiroPayload = buildKiroPayloadForFormat(body, kiroModelID, profileArn, currentOrigin, isAgentic, isChatOnly, from)
+						kiroPayload, _ = buildKiroPayloadForFormat(body, kiroModelID, profileArn, currentOrigin, isAgentic, isChatOnly, from)
 						log.Infof("kiro: token refreshed successfully, retrying request")
 						continue
 					}
@@ -415,7 +444,7 @@ func (e *KiroExecutor) executeWithRetry(ctx context.Context, auth *cliproxyauth.
 					if refreshedAuth != nil {
 						auth = refreshedAuth
 						accessToken, profileArn = kiroCredentials(auth)
-						kiroPayload = buildKiroPayloadForFormat(body, kiroModelID, profileArn, currentOrigin, isAgentic, isChatOnly, from)
+						kiroPayload, _ = buildKiroPayloadForFormat(body, kiroModelID, profileArn, currentOrigin, isAgentic, isChatOnly, from)
 						log.Infof("kiro: token refreshed for 403, retrying request")
 						continue
 					}
@@ -554,7 +583,10 @@ func (e *KiroExecutor) executeStreamWithRetry(ctx context.Context, auth *cliprox

 		// Rebuild payload with the correct origin for this endpoint
 		// Each endpoint requires its matching Origin value in the request body
-		kiroPayload = buildKiroPayloadForFormat(body, kiroModelID, profileArn, currentOrigin, isAgentic, isChatOnly, from)
+		kiroPayload, _ = buildKiroPayloadForFormat(body, kiroModelID, profileArn, currentOrigin, isAgentic, isChatOnly, from)
+		// Kiro API always returns <thinking> tags regardless of whether thinking mode was requested
+		// So we always enable thinking parsing for Kiro responses
+		thinkingEnabled := true

 		log.Debugf("kiro: stream trying endpoint %d/%d: %s (Name: %s, Origin: %s)",
 			endpointIdx+1, len(endpointConfigs), url, endpointConfig.Name, currentOrigin)
@@ -677,7 +709,7 @@ func (e *KiroExecutor) executeStreamWithRetry(ctx context.Context, auth *cliprox
 						auth = refreshedAuth
 						accessToken, profileArn = kiroCredentials(auth)
 						// Rebuild payload with new profile ARN if changed
-						kiroPayload = buildKiroPayloadForFormat(body, kiroModelID, profileArn, currentOrigin, isAgentic, isChatOnly, from)
+						kiroPayload, _ = buildKiroPayloadForFormat(body, kiroModelID, profileArn, currentOrigin, isAgentic, isChatOnly, from)
 						log.Infof("kiro: token refreshed successfully, retrying stream request")
 						continue
 					}
@@ -734,7 +766,7 @@ func (e *KiroExecutor) executeStreamWithRetry(ctx context.Context, auth *cliprox
 					if refreshedAuth != nil {
 						auth = refreshedAuth
 						accessToken, profileArn = kiroCredentials(auth)
-						kiroPayload = buildKiroPayloadForFormat(body, kiroModelID, profileArn, currentOrigin, isAgentic, isChatOnly, from)
+						kiroPayload, _ = buildKiroPayloadForFormat(body, kiroModelID, profileArn, currentOrigin, isAgentic, isChatOnly, from)
 						log.Infof("kiro: token refreshed for 403, retrying stream request")
 						continue
 					}
@@ -758,7 +790,7 @@ func (e *KiroExecutor) executeStreamWithRetry(ctx context.Context, auth *cliprox

 			out := make(chan cliproxyexecutor.StreamChunk)

-			go func(resp *http.Response) {
+			go func(resp *http.Response, thinkingEnabled bool) {
 				defer close(out)
 				defer func() {
 					if r := recover(); r != nil {
@@ -772,21 +804,12 @@ func (e *KiroExecutor) executeStreamWithRetry(ctx context.Context, auth *cliprox
 					}
 				}()

-				// Check if thinking mode was enabled in the original request
-				// Only parse <thinking> tags when thinking was explicitly requested
-				// Check multiple sources: original request, pre-translation payload, and translated body
-				// This handles different client formats (Claude API, OpenAI, AMP/Cursor)
-				thinkingEnabled := kiroclaude.IsThinkingEnabled(opts.OriginalRequest)
-				if !thinkingEnabled {
-					thinkingEnabled = kiroclaude.IsThinkingEnabled(req.Payload)
-				}
-				if !thinkingEnabled {
-					thinkingEnabled = kiroclaude.IsThinkingEnabled(body)
-				}
-				log.Debugf("kiro: stream thinkingEnabled = %v", thinkingEnabled)
+				// Kiro API always returns <thinking> tags regardless of request parameters
+				// So we always enable thinking parsing for Kiro responses
+				log.Debugf("kiro: stream thinkingEnabled = %v (always true for Kiro)", thinkingEnabled)

 				e.streamToChannel(ctx, resp.Body, out, from, req.Model, opts.OriginalRequest, body, reporter, thinkingEnabled)
-			}(httpResp)
+			}(httpResp, thinkingEnabled)

 			return out, nil
 		}
@@ -2907,6 +2930,32 @@ func (e *KiroExecutor) streamToChannel(ctx context.Context, body io.Reader, out
 			totalUsage.InputTokens, totalUsage.OutputTokens, totalUsage.TotalTokens)
 	}

+	// Cross-validate token estimates using creditUsage if available
+	// This helps detect estimation errors and provides insight into actual usage
+	if upstreamCreditUsage > 0 {
+		rateMultiplier := getKiroRateMultiplier(model)
+		// Credit usage represents total tokens (input + output) * multiplier / 1000
+		// Formula: total_tokens = creditUsage * 1000 / rateMultiplier
+		creditBasedTotal := int64(upstreamCreditUsage * 1000 / rateMultiplier)
+		localTotal := totalUsage.InputTokens + totalUsage.OutputTokens
+
+		// Calculate difference percentage
+		var diffPercent float64
+		if localTotal > 0 {
+			diffPercent = float64(abs64(creditBasedTotal-localTotal)) / float64(localTotal) * 100
+		}
+
+		// Log cross-validation result
+		if diffPercent > 20 {
+			// Significant mismatch - may indicate thinking tokens or estimation error
+			log.Warnf("kiro: token estimation mismatch > 20%% - local: %d, credit-based: %d (credits: %.4f, multiplier: %.1f, diff: %.1f%%)",
+				localTotal, creditBasedTotal, upstreamCreditUsage, rateMultiplier, diffPercent)
+		} else {
+			log.Debugf("kiro: token cross-validation - local: %d, credit-based: %d (credits: %.4f, multiplier: %.1f, diff: %.1f%%)",
+				localTotal, creditBasedTotal, upstreamCreditUsage, rateMultiplier, diffPercent)
+		}
+	}
+
 	// Determine stop reason: prefer upstream, then detect tool_use, default to end_turn
 	stopReason := upstreamStopReason
 	if stopReason == "" {
--- a/internal/translator/kiro/claude/kiro_claude_request.go
+++ b/internal/translator/kiro/claude/kiro_claude_request.go
@@ -135,7 +135,8 @@ func ConvertClaudeRequestToKiro(modelName string, inputRawJSON []byte, stream bo
 // isAgentic parameter enables chunked write optimization prompt for -agentic model variants.
 // isChatOnly parameter disables tool calling for -chat model variants (pure conversation mode).
 // Supports thinking mode - when Claude API thinking parameter is present, injects thinkingHint.
-func BuildKiroPayload(claudeBody []byte, modelID, profileArn, origin string, isAgentic, isChatOnly bool) []byte {
+// Returns the payload and a boolean indicating whether thinking mode was injected.
+func BuildKiroPayload(claudeBody []byte, modelID, profileArn, origin string, isAgentic, isChatOnly bool) ([]byte, bool) {
 	// Extract max_tokens for potential use in inferenceConfig
 	// Handle -1 as "use maximum" (Kiro max output is ~32000 tokens)
 	const kiroMaxOutputTokens = 32000
@@ -307,10 +308,10 @@ func BuildKiroPayload(claudeBody []byte, modelID, profileArn, origin string, isA
 	result, err := json.Marshal(payload)
 	if err != nil {
 		log.Debugf("kiro: failed to marshal payload: %v", err)
-		return nil
+		return nil, false
 	}

-	return result
+	return result, thinkingEnabled
 }

 // normalizeOrigin normalizes origin value for Kiro API compatibility
--- a/internal/translator/kiro/openai/kiro_openai_request.go
+++ b/internal/translator/kiro/openai/kiro_openai_request.go
@@ -133,7 +133,8 @@ func ConvertOpenAIRequestToKiro(modelName string, inputRawJSON []byte, stream bo
 // origin parameter determines which quota to use: "CLI" for Amazon Q, "AI_EDITOR" for Kiro IDE.
 // isAgentic parameter enables chunked write optimization prompt for -agentic model variants.
 // isChatOnly parameter disables tool calling for -chat model variants (pure conversation mode).
-func BuildKiroPayloadFromOpenAI(openaiBody []byte, modelID, profileArn, origin string, isAgentic, isChatOnly bool) []byte {
+// Returns the payload and a boolean indicating whether thinking mode was injected.
+func BuildKiroPayloadFromOpenAI(openaiBody []byte, modelID, profileArn, origin string, isAgentic, isChatOnly bool) ([]byte, bool) {
 	// Extract max_tokens for potential use in inferenceConfig
 	// Handle -1 as "use maximum" (Kiro max output is ~32000 tokens)
 	const kiroMaxOutputTokens = 32000
@@ -311,10 +312,10 @@ func BuildKiroPayloadFromOpenAI(openaiBody []byte, modelID, profileArn, origin s
 	result, err := json.Marshal(payload)
 	if err != nil {
 		log.Debugf("kiro-openai: failed to marshal payload: %v", err)
-		return nil
+		return nil, false
 	}

-	return result
+	return result, thinkingEnabled
 }

 // normalizeOrigin normalizes origin value for Kiro API compatibility