diff --git a/internal/runtime/executor/kiro_executor.go b/internal/runtime/executor/kiro_executor.go index b9a38272..e1d752a8 100644 --- a/internal/runtime/executor/kiro_executor.go +++ b/internal/runtime/executor/kiro_executor.go @@ -56,6 +56,34 @@ var ( usageUpdateTimeInterval = 15 * time.Second // Or every 15 seconds, whichever comes first ) +// kiroRateMultipliers maps Kiro model IDs to their credit multipliers. +// Used for cross-validation of token estimates against upstream credit usage. +// Source: Kiro API model definitions +var kiroRateMultipliers = map[string]float64{ + "claude-haiku-4.5": 0.4, + "claude-sonnet-4": 1.3, + "claude-sonnet-4.5": 1.3, + "claude-opus-4.5": 2.2, + "auto": 1.0, // Default multiplier for auto model selection +} + +// getKiroRateMultiplier returns the credit multiplier for a given model ID. +// Returns 1.0 as default if model is not found. +func getKiroRateMultiplier(modelID string) float64 { + if multiplier, ok := kiroRateMultipliers[modelID]; ok { + return multiplier + } + return 1.0 +} + +// abs64 returns the absolute value of an int64. +func abs64(x int64) int64 { + if x < 0 { + return -x + } + return x +} + // kiroEndpointConfig bundles endpoint URL with its compatible Origin and AmzTarget values. // This solves the "triple mismatch" problem where different endpoints require matching // Origin and X-Amz-Target header values. @@ -166,7 +194,8 @@ type KiroExecutor struct { // This is critical because OpenAI and Claude formats have different tool structures: // - OpenAI: tools[].function.name, tools[].function.description // - Claude: tools[].name, tools[].description -func buildKiroPayloadForFormat(body []byte, modelID, profileArn, origin string, isAgentic, isChatOnly bool, sourceFormat sdktranslator.Format) []byte { +// Returns the serialized JSON payload and a boolean indicating whether thinking mode was injected. +func buildKiroPayloadForFormat(body []byte, modelID, profileArn, origin string, isAgentic, isChatOnly bool, sourceFormat sdktranslator.Format) ([]byte, bool) { switch sourceFormat.String() { case "openai": log.Debugf("kiro: using OpenAI payload builder for source format: %s", sourceFormat.String()) @@ -248,7 +277,7 @@ func (e *KiroExecutor) executeWithRetry(ctx context.Context, auth *cliproxyauth. // Rebuild payload with the correct origin for this endpoint // Each endpoint requires its matching Origin value in the request body - kiroPayload = buildKiroPayloadForFormat(body, kiroModelID, profileArn, currentOrigin, isAgentic, isChatOnly, from) + kiroPayload, _ = buildKiroPayloadForFormat(body, kiroModelID, profileArn, currentOrigin, isAgentic, isChatOnly, from) log.Debugf("kiro: trying endpoint %d/%d: %s (Name: %s, Origin: %s)", endpointIdx+1, len(endpointConfigs), url, endpointConfig.Name, currentOrigin) @@ -358,7 +387,7 @@ func (e *KiroExecutor) executeWithRetry(ctx context.Context, auth *cliproxyauth. auth = refreshedAuth accessToken, profileArn = kiroCredentials(auth) // Rebuild payload with new profile ARN if changed - kiroPayload = buildKiroPayloadForFormat(body, kiroModelID, profileArn, currentOrigin, isAgentic, isChatOnly, from) + kiroPayload, _ = buildKiroPayloadForFormat(body, kiroModelID, profileArn, currentOrigin, isAgentic, isChatOnly, from) log.Infof("kiro: token refreshed successfully, retrying request") continue } @@ -415,7 +444,7 @@ func (e *KiroExecutor) executeWithRetry(ctx context.Context, auth *cliproxyauth. if refreshedAuth != nil { auth = refreshedAuth accessToken, profileArn = kiroCredentials(auth) - kiroPayload = buildKiroPayloadForFormat(body, kiroModelID, profileArn, currentOrigin, isAgentic, isChatOnly, from) + kiroPayload, _ = buildKiroPayloadForFormat(body, kiroModelID, profileArn, currentOrigin, isAgentic, isChatOnly, from) log.Infof("kiro: token refreshed for 403, retrying request") continue } @@ -554,7 +583,10 @@ func (e *KiroExecutor) executeStreamWithRetry(ctx context.Context, auth *cliprox // Rebuild payload with the correct origin for this endpoint // Each endpoint requires its matching Origin value in the request body - kiroPayload = buildKiroPayloadForFormat(body, kiroModelID, profileArn, currentOrigin, isAgentic, isChatOnly, from) + kiroPayload, _ = buildKiroPayloadForFormat(body, kiroModelID, profileArn, currentOrigin, isAgentic, isChatOnly, from) + // Kiro API always returns tags regardless of whether thinking mode was requested + // So we always enable thinking parsing for Kiro responses + thinkingEnabled := true log.Debugf("kiro: stream trying endpoint %d/%d: %s (Name: %s, Origin: %s)", endpointIdx+1, len(endpointConfigs), url, endpointConfig.Name, currentOrigin) @@ -677,7 +709,7 @@ func (e *KiroExecutor) executeStreamWithRetry(ctx context.Context, auth *cliprox auth = refreshedAuth accessToken, profileArn = kiroCredentials(auth) // Rebuild payload with new profile ARN if changed - kiroPayload = buildKiroPayloadForFormat(body, kiroModelID, profileArn, currentOrigin, isAgentic, isChatOnly, from) + kiroPayload, _ = buildKiroPayloadForFormat(body, kiroModelID, profileArn, currentOrigin, isAgentic, isChatOnly, from) log.Infof("kiro: token refreshed successfully, retrying stream request") continue } @@ -734,7 +766,7 @@ func (e *KiroExecutor) executeStreamWithRetry(ctx context.Context, auth *cliprox if refreshedAuth != nil { auth = refreshedAuth accessToken, profileArn = kiroCredentials(auth) - kiroPayload = buildKiroPayloadForFormat(body, kiroModelID, profileArn, currentOrigin, isAgentic, isChatOnly, from) + kiroPayload, _ = buildKiroPayloadForFormat(body, kiroModelID, profileArn, currentOrigin, isAgentic, isChatOnly, from) log.Infof("kiro: token refreshed for 403, retrying stream request") continue } @@ -758,7 +790,7 @@ func (e *KiroExecutor) executeStreamWithRetry(ctx context.Context, auth *cliprox out := make(chan cliproxyexecutor.StreamChunk) - go func(resp *http.Response) { + go func(resp *http.Response, thinkingEnabled bool) { defer close(out) defer func() { if r := recover(); r != nil { @@ -772,21 +804,12 @@ func (e *KiroExecutor) executeStreamWithRetry(ctx context.Context, auth *cliprox } }() - // Check if thinking mode was enabled in the original request - // Only parse tags when thinking was explicitly requested - // Check multiple sources: original request, pre-translation payload, and translated body - // This handles different client formats (Claude API, OpenAI, AMP/Cursor) - thinkingEnabled := kiroclaude.IsThinkingEnabled(opts.OriginalRequest) - if !thinkingEnabled { - thinkingEnabled = kiroclaude.IsThinkingEnabled(req.Payload) - } - if !thinkingEnabled { - thinkingEnabled = kiroclaude.IsThinkingEnabled(body) - } - log.Debugf("kiro: stream thinkingEnabled = %v", thinkingEnabled) + // Kiro API always returns tags regardless of request parameters + // So we always enable thinking parsing for Kiro responses + log.Debugf("kiro: stream thinkingEnabled = %v (always true for Kiro)", thinkingEnabled) e.streamToChannel(ctx, resp.Body, out, from, req.Model, opts.OriginalRequest, body, reporter, thinkingEnabled) - }(httpResp) + }(httpResp, thinkingEnabled) return out, nil } @@ -2907,6 +2930,32 @@ func (e *KiroExecutor) streamToChannel(ctx context.Context, body io.Reader, out totalUsage.InputTokens, totalUsage.OutputTokens, totalUsage.TotalTokens) } + // Cross-validate token estimates using creditUsage if available + // This helps detect estimation errors and provides insight into actual usage + if upstreamCreditUsage > 0 { + rateMultiplier := getKiroRateMultiplier(model) + // Credit usage represents total tokens (input + output) * multiplier / 1000 + // Formula: total_tokens = creditUsage * 1000 / rateMultiplier + creditBasedTotal := int64(upstreamCreditUsage * 1000 / rateMultiplier) + localTotal := totalUsage.InputTokens + totalUsage.OutputTokens + + // Calculate difference percentage + var diffPercent float64 + if localTotal > 0 { + diffPercent = float64(abs64(creditBasedTotal-localTotal)) / float64(localTotal) * 100 + } + + // Log cross-validation result + if diffPercent > 20 { + // Significant mismatch - may indicate thinking tokens or estimation error + log.Warnf("kiro: token estimation mismatch > 20%% - local: %d, credit-based: %d (credits: %.4f, multiplier: %.1f, diff: %.1f%%)", + localTotal, creditBasedTotal, upstreamCreditUsage, rateMultiplier, diffPercent) + } else { + log.Debugf("kiro: token cross-validation - local: %d, credit-based: %d (credits: %.4f, multiplier: %.1f, diff: %.1f%%)", + localTotal, creditBasedTotal, upstreamCreditUsage, rateMultiplier, diffPercent) + } + } + // Determine stop reason: prefer upstream, then detect tool_use, default to end_turn stopReason := upstreamStopReason if stopReason == "" { diff --git a/internal/translator/kiro/claude/kiro_claude_request.go b/internal/translator/kiro/claude/kiro_claude_request.go index ae42b186..052d671c 100644 --- a/internal/translator/kiro/claude/kiro_claude_request.go +++ b/internal/translator/kiro/claude/kiro_claude_request.go @@ -135,7 +135,8 @@ func ConvertClaudeRequestToKiro(modelName string, inputRawJSON []byte, stream bo // isAgentic parameter enables chunked write optimization prompt for -agentic model variants. // isChatOnly parameter disables tool calling for -chat model variants (pure conversation mode). // Supports thinking mode - when Claude API thinking parameter is present, injects thinkingHint. -func BuildKiroPayload(claudeBody []byte, modelID, profileArn, origin string, isAgentic, isChatOnly bool) []byte { +// Returns the payload and a boolean indicating whether thinking mode was injected. +func BuildKiroPayload(claudeBody []byte, modelID, profileArn, origin string, isAgentic, isChatOnly bool) ([]byte, bool) { // Extract max_tokens for potential use in inferenceConfig // Handle -1 as "use maximum" (Kiro max output is ~32000 tokens) const kiroMaxOutputTokens = 32000 @@ -307,10 +308,10 @@ func BuildKiroPayload(claudeBody []byte, modelID, profileArn, origin string, isA result, err := json.Marshal(payload) if err != nil { log.Debugf("kiro: failed to marshal payload: %v", err) - return nil + return nil, false } - return result + return result, thinkingEnabled } // normalizeOrigin normalizes origin value for Kiro API compatibility diff --git a/internal/translator/kiro/openai/kiro_openai_request.go b/internal/translator/kiro/openai/kiro_openai_request.go index 21b15aa0..cb97a340 100644 --- a/internal/translator/kiro/openai/kiro_openai_request.go +++ b/internal/translator/kiro/openai/kiro_openai_request.go @@ -133,7 +133,8 @@ func ConvertOpenAIRequestToKiro(modelName string, inputRawJSON []byte, stream bo // origin parameter determines which quota to use: "CLI" for Amazon Q, "AI_EDITOR" for Kiro IDE. // isAgentic parameter enables chunked write optimization prompt for -agentic model variants. // isChatOnly parameter disables tool calling for -chat model variants (pure conversation mode). -func BuildKiroPayloadFromOpenAI(openaiBody []byte, modelID, profileArn, origin string, isAgentic, isChatOnly bool) []byte { +// Returns the payload and a boolean indicating whether thinking mode was injected. +func BuildKiroPayloadFromOpenAI(openaiBody []byte, modelID, profileArn, origin string, isAgentic, isChatOnly bool) ([]byte, bool) { // Extract max_tokens for potential use in inferenceConfig // Handle -1 as "use maximum" (Kiro max output is ~32000 tokens) const kiroMaxOutputTokens = 32000 @@ -311,10 +312,10 @@ func BuildKiroPayloadFromOpenAI(openaiBody []byte, modelID, profileArn, origin s result, err := json.Marshal(payload) if err != nil { log.Debugf("kiro-openai: failed to marshal payload: %v", err) - return nil + return nil, false } - return result + return result, thinkingEnabled } // normalizeOrigin normalizes origin value for Kiro API compatibility