refactor(vllm): own nemotron thinking payloads

Peter Steinberger
2026-04-27 12:13:37 +01:00
parent 22bb53ac9a
commit da822a56d8
11 changed files with 244 additions and 160 deletions

View File

@@ -660,7 +660,7 @@ Example (OpenAI-compatible):
- For `api: "openai-completions"` on non-native endpoints (any non-empty `baseUrl` whose host is not `api.openai.com`), OpenClaw forces `compat.supportsDeveloperRole: false` to avoid provider 400 errors for unsupported `developer` roles.
- Proxy-style OpenAI-compatible routes also skip native OpenAI-only request shaping: no `service_tier`, no Responses `store`, no Completions `store`, no prompt-cache hints, no OpenAI reasoning-compat payload shaping, and no hidden OpenClaw attribution headers.
- For OpenAI-compatible Completions proxies that need vendor-specific fields, set `agents.defaults.models["provider/model"].params.extra_body` (or `extraBody`) to merge extra JSON into the outbound request body (see the hedged config sketch after this list).
- For vLLM chat-template controls, set `agents.defaults.models["provider/model"].params.chat_template_kwargs`. OpenClaw automatically sends `enable_thinking: false` and `force_nonempty_content: true` for `vllm/nemotron-3-*` when the session thinking level is off.
- For vLLM chat-template controls, set `agents.defaults.models["provider/model"].params.chat_template_kwargs`. The bundled vLLM plugin automatically sends `enable_thinking: false` and `force_nonempty_content: true` for `vllm/nemotron-3-*` when the session thinking level is off.
- For slow local models or remote LAN/tailnet hosts, set `models.providers.<id>.timeoutSeconds`. This extends the HTTP timeouts for provider model requests, covering connect, headers, body streaming, and the total guarded-fetch abort, without increasing the whole agent runtime timeout.
- If `baseUrl` is empty/omitted, OpenClaw keeps the default OpenAI behavior (which resolves to `api.openai.com`).
- For safety, an explicit `compat.supportsDeveloperRole: true` is still overridden on non-native `openai-completions` endpoints.
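A minimal sketch combining the options above, written as JSON with comments only for annotation. The provider id `vllm`, the model id `nemotron-3-super`, the local `baseUrl`, and the `repetition_penalty` field are illustrative assumptions, not required values; only the key paths come from the bullets in this list.

```jsonc
{
  "models": {
    "providers": {
      "vllm": {
        // Assumed local vLLM endpoint; any non-api.openai.com host counts as non-native.
        "baseUrl": "http://127.0.0.1:8000/v1",
        // Extends provider HTTP timeouts for slow local/LAN models.
        "timeoutSeconds": 300
      }
    }
  },
  "agents": {
    "defaults": {
      "models": {
        "vllm/nemotron-3-super": {
          "params": {
            // Explicit chat-template kwargs; these override plugin-generated defaults.
            "chat_template_kwargs": { "enable_thinking": false, "force_nonempty_content": true },
            // Vendor-specific pass-through; "repetition_penalty" is only an example field.
            "extra_body": { "repetition_penalty": 1.05 }
          }
        }
      }
    }
  }
}
```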

View File

@@ -371,7 +371,7 @@ Time format in system prompt. Default: `auto` (OS preference).
- `params`: global default provider parameters applied to all models. Set at `agents.defaults.params` (e.g. `{ cacheRetention: "long" }`).
- `params` merge precedence (config): `agents.defaults.params` (global base) is overridden by `agents.defaults.models["provider/model"].params` (per-model), then `agents.list[].params` (matching agent id) overrides by key. See [Prompt Caching](/reference/prompt-caching) for details. A hedged sketch of these layers follows this list.
- `params.extra_body`/`params.extraBody`: advanced pass-through JSON merged into `api: "openai-completions"` request bodies for OpenAI-compatible proxies. If it collides with generated request keys, the extra body wins; non-native completions routes still strip OpenAI-only `store` afterward.
- `params.chat_template_kwargs`: vLLM/OpenAI-compatible chat-template arguments merged into top-level `api: "openai-completions"` request bodies. For `vllm/nemotron-3-*` with thinking off, OpenClaw automatically sends `enable_thinking: false` and `force_nonempty_content: true`; explicit `chat_template_kwargs` override generated defaults, and `extra_body.chat_template_kwargs` still has final precedence. For vLLM Qwen thinking controls, set `params.qwenThinkingFormat` to `"chat-template"` or `"top-level"` on that model entry.
- `params.chat_template_kwargs`: vLLM/OpenAI-compatible chat-template arguments merged into top-level `api: "openai-completions"` request bodies. For `vllm/nemotron-3-*` with thinking off, the bundled vLLM plugin automatically sends `enable_thinking: false` and `force_nonempty_content: true`; explicit `chat_template_kwargs` override generated defaults, and `extra_body.chat_template_kwargs` still has final precedence. For vLLM Qwen thinking controls, set `params.qwenThinkingFormat` to `"chat-template"` or `"top-level"` on that model entry.
- `params.preserveThinking`: Z.AI-only opt-in for preserved thinking. When enabled and thinking is on, OpenClaw sends `thinking.clear_thinking: false` and replays prior `reasoning_content`; see [Z.AI thinking and preserved thinking](/providers/zai#thinking-and-preserved-thinking).
- `agentRuntime`: default low-level agent runtime policy. Omitted id defaults to OpenClaw Pi. Use `id: "pi"` to force the built-in PI harness, `id: "auto"` to let registered plugin harnesses claim supported models, a registered harness id such as `id: "codex"`, or a supported CLI backend alias such as `id: "claude-cli"`. Set `fallback: "none"` to disable automatic PI fallback. Explicit plugin runtimes such as `codex` fail closed by default unless you set `fallback: "pi"` in the same override scope. Keep model refs canonical as `provider/model`; select Codex, Claude CLI, Gemini CLI, and other execution backends through runtime config instead of legacy runtime provider prefixes. See [Agent runtimes](/concepts/agent-runtimes) for how this differs from provider/model selection.
- Config writers that mutate these fields (for example `/models set`, `/models set-image`, and fallback add/remove commands) save canonical object form and preserve existing fallback lists when possible.
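A hedged sketch of the `params` layering described above, as JSON with comments only for annotation. The agent id `coder`, the Qwen model ref `vllm/Qwen3-8B`, and the override value are placeholders; only `cacheRetention: "long"`, `qwenThinkingFormat`, and the key paths come from this list.

```jsonc
{
  "agents": {
    "defaults": {
      // Global base: applies to all models unless overridden below.
      "params": { "cacheRetention": "long" },
      "models": {
        // Per-model layer: overrides the global base by key for this model ref.
        "vllm/Qwen3-8B": {
          "params": { "qwenThinkingFormat": "chat-template" }
        }
      }
    },
    "list": [
      {
        // Per-agent layer: for this agent id, these keys win over the layers above.
        "id": "coder",
        // Placeholder override value, shown only to illustrate key-level precedence.
        "params": { "cacheRetention": "short" }
      }
    ]
  }
}
```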

View File

@@ -153,7 +153,7 @@ Use explicit config when:
<Accordion title="Nemotron 3 thinking controls">
vLLM/Nemotron 3 can use chat-template kwargs to control whether reasoning is
returned as hidden reasoning content or as visible answer text. When an OpenClaw session
uses `vllm/nemotron-3-*` with thinking off, OpenClaw sends:
uses `vllm/nemotron-3-*` with thinking off, the bundled vLLM plugin sends:
```json
{

View File

@@ -1,7 +1,10 @@
import type { StreamFn } from "@mariozechner/pi-agent-core";
import type { ProviderWrapStreamFnContext } from "openclaw/plugin-sdk/plugin-entry";
import { normalizeProviderId } from "openclaw/plugin-sdk/provider-model-shared";
import { createPayloadPatchStreamWrapper } from "openclaw/plugin-sdk/provider-stream-shared";
import {
createPayloadPatchStreamWrapper,
isOpenAICompatibleThinkingEnabled,
} from "openclaw/plugin-sdk/provider-stream-shared";
type QwenThinkingLevel = ProviderWrapStreamFnContext["thinkingLevel"];
@@ -15,19 +18,6 @@ function isQwenProviderId(providerId: string): boolean {
);
}
function resolveOpenAICompatibleThinkingEnabled(params: {
thinkingLevel: QwenThinkingLevel;
options: Parameters<StreamFn>[2];
}): boolean {
const options = (params.options ?? {}) as { reasoningEffort?: unknown; reasoning?: unknown };
const raw = options.reasoningEffort ?? options.reasoning ?? params.thinkingLevel ?? "high";
if (typeof raw !== "string") {
return true;
}
const normalized = raw.trim().toLowerCase();
return normalized !== "off" && normalized !== "none";
}
export function createQwenThinkingWrapper(
baseStreamFn: StreamFn | undefined,
thinkingLevel: QwenThinkingLevel,
@@ -35,7 +25,7 @@ export function createQwenThinkingWrapper(
return createPayloadPatchStreamWrapper(
baseStreamFn,
({ payload: payloadObj, options }) => {
const enableThinking = resolveOpenAICompatibleThinkingEnabled({ thinkingLevel, options });
const enableThinking = isOpenAICompatibleThinkingEnabled({ thinkingLevel, options });
payloadObj.enable_thinking = enableThinking;
delete payloadObj.reasoning_effort;
delete payloadObj.reasoningEffort;

View File

@@ -1,7 +1,11 @@
import type { StreamFn } from "@mariozechner/pi-agent-core";
import type { Context, Model } from "@mariozechner/pi-ai";
import { describe, expect, it } from "vitest";
import { createVllmQwenThinkingWrapper, wrapVllmProviderStream } from "./stream.js";
import {
createVllmProviderThinkingWrapper,
createVllmQwenThinkingWrapper,
wrapVllmProviderStream,
} from "./stream.js";
function capturePayload(params: {
format: "chat-template" | "top-level";
@@ -105,6 +109,80 @@ describe("createVllmQwenThinkingWrapper", () => {
});
});
describe("createVllmProviderThinkingWrapper", () => {
function captureProviderPayload(params: {
thinkingLevel?: "off" | "low" | "medium" | "high" | "xhigh" | "max";
initialPayload?: Record<string, unknown>;
model?: Partial<Model<"openai-completions">>;
}): Record<string, unknown> {
let captured: Record<string, unknown> = {};
const baseStreamFn: StreamFn = (_model, _context, options) => {
const payload = { ...params.initialPayload };
options?.onPayload?.(payload, _model);
captured = payload;
return {} as ReturnType<StreamFn>;
};
const wrapped = createVllmProviderThinkingWrapper({
baseStreamFn,
thinkingLevel: params.thinkingLevel ?? "high",
});
void wrapped(
{
api: "openai-completions",
provider: "vllm",
id: "nemotron-3-super",
reasoning: true,
...params.model,
} as Model<"openai-completions">,
{ messages: [] } as Context,
{},
);
return captured;
}
it("injects Nemotron 3 chat-template kwargs when thinking is off", () => {
expect(captureProviderPayload({ thinkingLevel: "off" })).toEqual({
chat_template_kwargs: {
enable_thinking: false,
force_nonempty_content: true,
},
});
});
it("does not inject Nemotron 3 chat-template kwargs when thinking is enabled", () => {
expect(captureProviderPayload({ thinkingLevel: "low" })).toEqual({});
});
it("preserves existing Nemotron 3 chat-template kwargs over defaults", () => {
expect(
captureProviderPayload({
thinkingLevel: "off",
initialPayload: {
chat_template_kwargs: {
enable_thinking: true,
},
},
}),
).toEqual({
chat_template_kwargs: {
enable_thinking: true,
force_nonempty_content: true,
},
});
});
it("skips non-Nemotron vLLM models", () => {
expect(
captureProviderPayload({
thinkingLevel: "off",
model: { id: "Qwen/Qwen3-8B" },
}),
).toEqual({});
});
});
describe("wrapVllmProviderStream", () => {
it("registers when vLLM Qwen thinking format params are configured", () => {
expect(
@@ -167,4 +245,36 @@ describe("wrapVllmProviderStream", () => {
} as never),
).toBeUndefined();
});
it("registers for vLLM Nemotron when thinking is off", () => {
expect(
wrapVllmProviderStream({
provider: "vllm",
modelId: "nemotron-3-super",
extraParams: {},
thinkingLevel: "off",
model: {
api: "openai-completions",
provider: "vllm",
id: "nemotron-3-super",
} as Model<"openai-completions">,
streamFn: undefined,
} as never),
).toBeTypeOf("function");
expect(
wrapVllmProviderStream({
provider: "vllm",
modelId: "nemotron-3-super",
extraParams: {},
thinkingLevel: "low",
model: {
api: "openai-completions",
provider: "vllm",
id: "nemotron-3-super",
} as Model<"openai-completions">,
streamFn: undefined,
} as never),
).toBeUndefined();
});
});

View File

@@ -1,7 +1,10 @@
import type { StreamFn } from "@mariozechner/pi-agent-core";
import type { ProviderWrapStreamFnContext } from "openclaw/plugin-sdk/plugin-entry";
import { normalizeProviderId } from "openclaw/plugin-sdk/provider-model-shared";
import { createPayloadPatchStreamWrapper } from "openclaw/plugin-sdk/provider-stream-shared";
import {
createPayloadPatchStreamWrapper,
isOpenAICompatibleThinkingEnabled,
} from "openclaw/plugin-sdk/provider-stream-shared";
type VllmThinkingLevel = ProviderWrapStreamFnContext["thinkingLevel"];
type VllmQwenThinkingFormat = "chat-template" | "top-level";
@@ -41,19 +44,6 @@ function resolveVllmQwenThinkingFormat(
);
}
function resolveOpenAICompatibleThinkingEnabled(params: {
thinkingLevel: VllmThinkingLevel;
options: Parameters<StreamFn>[2];
}): boolean {
const options = (params.options ?? {}) as { reasoningEffort?: unknown; reasoning?: unknown };
const raw = options.reasoningEffort ?? options.reasoning ?? params.thinkingLevel ?? "high";
if (typeof raw !== "string") {
return true;
}
const normalized = raw.trim().toLowerCase();
return normalized !== "off" && normalized !== "none";
}
function setQwenChatTemplateThinking(payload: Record<string, unknown>, enabled: boolean): void {
const existing = payload.chat_template_kwargs;
if (existing && typeof existing === "object" && !Array.isArray(existing)) {
@@ -73,6 +63,31 @@ function setQwenChatTemplateThinking(payload: Record<string, unknown>, enabled:
};
}
function isVllmNemotronModel(model: { api?: unknown; provider?: unknown; id?: unknown }): boolean {
return (
model.api === "openai-completions" &&
typeof model.provider === "string" &&
normalizeProviderId(model.provider) === "vllm" &&
typeof model.id === "string" &&
/\bnemotron-3(?:[-_](?:nano|super|ultra))?\b/i.test(model.id)
);
}
function setNemotronThinkingOffChatTemplateKwargs(payload: Record<string, unknown>): void {
const defaults = {
enable_thinking: false,
force_nonempty_content: true,
};
const existing = payload.chat_template_kwargs;
payload.chat_template_kwargs =
existing && typeof existing === "object" && !Array.isArray(existing)
? {
...defaults,
...(existing as Record<string, unknown>),
}
: defaults;
}
export function createVllmQwenThinkingWrapper(params: {
baseStreamFn: StreamFn | undefined;
format: VllmQwenThinkingFormat;
@@ -81,7 +96,7 @@ export function createVllmQwenThinkingWrapper(params: {
return createPayloadPatchStreamWrapper(
params.baseStreamFn,
({ payload: payloadObj, options }) => {
const enableThinking = resolveOpenAICompatibleThinkingEnabled({
const enableThinking = isOpenAICompatibleThinkingEnabled({
thinkingLevel: params.thinkingLevel,
options,
});
@@ -100,17 +115,50 @@ export function createVllmQwenThinkingWrapper(params: {
);
}
export function createVllmProviderThinkingWrapper(params: {
baseStreamFn: StreamFn | undefined;
qwenFormat?: VllmQwenThinkingFormat;
thinkingLevel: VllmThinkingLevel;
}): StreamFn {
const qwenWrapped = params.qwenFormat
? createVllmQwenThinkingWrapper({
baseStreamFn: params.baseStreamFn,
format: params.qwenFormat,
thinkingLevel: params.thinkingLevel,
})
: params.baseStreamFn;
return createPayloadPatchStreamWrapper(
qwenWrapped,
({ payload: payloadObj }) => {
setNemotronThinkingOffChatTemplateKwargs(payloadObj);
},
{
shouldPatch: ({ model }) =>
model.api === "openai-completions" &&
params.thinkingLevel === "off" &&
isVllmNemotronModel(model),
},
);
}
export function wrapVllmProviderStream(ctx: ProviderWrapStreamFnContext): StreamFn | undefined {
if (!isVllmProviderId(ctx.provider) || (ctx.model && ctx.model.api !== "openai-completions")) {
return undefined;
}
const format = resolveVllmQwenThinkingFormat(ctx.extraParams);
if (!format) {
const qwenFormat = resolveVllmQwenThinkingFormat(ctx.extraParams);
const shouldHandleNemotron =
ctx.thinkingLevel === "off" &&
isVllmNemotronModel({
api: "openai-completions",
provider: ctx.provider,
id: ctx.modelId,
});
if (!qwenFormat && !shouldHandleNemotron) {
return undefined;
}
return createVllmQwenThinkingWrapper({
return createVllmProviderThinkingWrapper({
baseStreamFn: ctx.streamFn,
format,
qwenFormat,
thinkingLevel: ctx.thinkingLevel,
});
}

View File

@@ -894,85 +894,6 @@ describe("applyExtraParamsToAgent", () => {
});
});
it("injects vLLM Nemotron chat_template_kwargs when thinking is off", () => {
const payload = runResponsesPayloadMutationCase({
applyProvider: "vllm",
applyModelId: "nemotron-3-super",
model: {
api: "openai-completions",
provider: "vllm",
id: "nemotron-3-super",
baseUrl: "http://127.0.0.1:8000/v1",
} as Model<"openai-completions">,
payload: {
messages: [],
},
thinkingLevel: "off",
});
expect(payload.chat_template_kwargs).toEqual({
enable_thinking: false,
force_nonempty_content: true,
});
});
it("does not inject vLLM Nemotron chat_template_kwargs when thinking is enabled", () => {
const payload = runResponsesPayloadMutationCase({
applyProvider: "vllm",
applyModelId: "nemotron-3-super",
model: {
api: "openai-completions",
provider: "vllm",
id: "nemotron-3-super",
baseUrl: "http://127.0.0.1:8000/v1",
} as Model<"openai-completions">,
payload: {
messages: [],
},
thinkingLevel: "low",
});
expect(payload).not.toHaveProperty("chat_template_kwargs");
});
it("lets extra_body override generated vLLM Nemotron chat_template_kwargs", () => {
const payload = runResponsesPayloadMutationCase({
applyProvider: "vllm",
applyModelId: "nemotron-3-super",
cfg: {
agents: {
defaults: {
models: {
"vllm/nemotron-3-super": {
params: {
extra_body: {
chat_template_kwargs: {
enable_thinking: true,
},
},
},
},
},
},
},
},
model: {
api: "openai-completions",
provider: "vllm",
id: "nemotron-3-super",
baseUrl: "http://127.0.0.1:8000/v1",
} as Model<"openai-completions">,
payload: {
messages: [],
},
thinkingLevel: "off",
});
expect(payload.chat_template_kwargs).toEqual({
enable_thinking: true,
});
});
it("warns and skips invalid chat_template_kwargs params", () => {
const warnSpy = vi.spyOn(log, "warn").mockImplementation(() => {});
try {

View File

@@ -462,63 +462,25 @@ function resolveChatTemplateKwargsParam(
return Object.keys(chatTemplateKwargs).length > 0 ? chatTemplateKwargs : undefined;
}
function isVllmNemotronModel(model: ProviderRuntimeModel): boolean {
return (
model.api === "openai-completions" &&
typeof model.provider === "string" &&
model.provider.toLowerCase() === "vllm" &&
typeof model.id === "string" &&
/\bnemotron-3(?:[-_](?:nano|super|ultra))?\b/i.test(model.id)
);
}
function resolveOpenAICompletionsChatTemplateKwargs(params: {
model: ProviderRuntimeModel;
thinkingLevel?: ThinkLevel;
configured?: Record<string, unknown>;
}): Record<string, unknown> | undefined {
const defaults =
params.thinkingLevel === "off" && isVllmNemotronModel(params.model)
? {
enable_thinking: false,
force_nonempty_content: true,
}
: undefined;
const merged = {
...defaults,
...params.configured,
};
return Object.keys(merged).length > 0 ? merged : undefined;
}
function createOpenAICompletionsChatTemplateKwargsWrapper(params: {
baseStreamFn: StreamFn | undefined;
configured?: Record<string, unknown>;
thinkingLevel?: ThinkLevel;
configured: Record<string, unknown>;
}): StreamFn {
const underlying = params.baseStreamFn ?? streamSimple;
return (model, context, options) => {
if (model.api !== "openai-completions") {
return underlying(model, context, options);
}
const chatTemplateKwargs = resolveOpenAICompletionsChatTemplateKwargs({
model: model as ProviderRuntimeModel,
thinkingLevel: params.thinkingLevel,
configured: params.configured,
});
if (!chatTemplateKwargs) {
return underlying(model, context, options);
}
return streamWithPayloadPatch(underlying, model, context, options, (payloadObj) => {
const existing = payloadObj.chat_template_kwargs;
if (existing && typeof existing === "object" && !Array.isArray(existing)) {
payloadObj.chat_template_kwargs = {
...(existing as Record<string, unknown>),
...chatTemplateKwargs,
...params.configured,
};
return;
}
payloadObj.chat_template_kwargs = chatTemplateKwargs;
payloadObj.chat_template_kwargs = params.configured;
});
};
}
@@ -614,11 +576,10 @@ function applyPostPluginStreamWrappers(
"chatTemplateKwargs",
);
const configuredChatTemplateKwargs = resolveChatTemplateKwargsParam(rawChatTemplateKwargs);
if (configuredChatTemplateKwargs || ctx.thinkingLevel === "off") {
if (configuredChatTemplateKwargs) {
ctx.agent.streamFn = createOpenAICompletionsChatTemplateKwargsWrapper({
baseStreamFn: ctx.agent.streamFn,
configured: configuredChatTemplateKwargs,
thinkingLevel: ctx.thinkingLevel,
});
}

View File

@@ -7,6 +7,7 @@ import {
defaultToolStreamExtraParams,
decodeHtmlEntitiesInObject,
hasCopilotVisionInput,
isOpenAICompatibleThinkingEnabled,
} from "./provider-stream-shared.js";
type FakeWrappedStream = {
@@ -64,6 +65,43 @@ describe("defaultToolStreamExtraParams", () => {
});
});
describe("isOpenAICompatibleThinkingEnabled", () => {
it("uses explicit request reasoning before session thinking level", () => {
expect(
isOpenAICompatibleThinkingEnabled({
thinkingLevel: "high",
options: { reasoning: "none" } as never,
}),
).toBe(false);
expect(
isOpenAICompatibleThinkingEnabled({
thinkingLevel: "off",
options: { reasoningEffort: "medium" } as never,
}),
).toBe(true);
});
it("treats off and none as disabled", () => {
expect(isOpenAICompatibleThinkingEnabled({ thinkingLevel: "off", options: {} })).toBe(false);
expect(
isOpenAICompatibleThinkingEnabled({
thinkingLevel: "high",
options: { reasoning: "none" } as never,
}),
).toBe(false);
});
it("defaults to enabled for missing or non-string values", () => {
expect(isOpenAICompatibleThinkingEnabled({ thinkingLevel: undefined, options: {} })).toBe(true);
expect(
isOpenAICompatibleThinkingEnabled({
thinkingLevel: "off",
options: { reasoning: { effort: "off" } } as never,
}),
).toBe(true);
});
});
describe("buildCopilotDynamicHeaders", () => {
it("matches Copilot IDE-style request headers without the legacy Openai-Intent", () => {
expect(

View File

@@ -154,6 +154,21 @@ export function createPayloadPatchStreamWrapper(
};
}
export type OpenAICompatibleThinkingLevel = ProviderWrapStreamFnContext["thinkingLevel"];
export function isOpenAICompatibleThinkingEnabled(params: {
thinkingLevel: OpenAICompatibleThinkingLevel;
options: Parameters<StreamFn>[2];
}): boolean {
const options = (params.options ?? {}) as { reasoningEffort?: unknown; reasoning?: unknown };
const raw = options.reasoningEffort ?? options.reasoning ?? params.thinkingLevel ?? "high";
if (typeof raw !== "string") {
return true;
}
const normalized = raw.trim().toLowerCase();
return normalized !== "off" && normalized !== "none";
}
export type DeepSeekV4ThinkingLevel = ProviderWrapStreamFnContext["thinkingLevel"];
function isDisabledDeepSeekV4ThinkingLevel(thinkingLevel: DeepSeekV4ThinkingLevel): boolean {

View File

@@ -43,6 +43,7 @@ export {
defaultToolStreamExtraParams,
hasCopilotVisionInput,
isAnthropicBedrockModel,
isOpenAICompatibleThinkingEnabled,
type ProviderStreamWrapperFactory,
resolveAnthropicPayloadPolicy,
resolveMoonshotThinkingType,