refactor(vllm): own nemotron thinking payloads
@@ -660,7 +660,7 @@ Example (OpenAI‑compatible):
 - For `api: "openai-completions"` on non-native endpoints (any non-empty `baseUrl` whose host is not `api.openai.com`), OpenClaw forces `compat.supportsDeveloperRole: false` to avoid provider 400 errors for unsupported `developer` roles.
 - Proxy-style OpenAI-compatible routes also skip native OpenAI-only request shaping: no `service_tier`, no Responses `store`, no Completions `store`, no prompt-cache hints, no OpenAI reasoning-compat payload shaping, and no hidden OpenClaw attribution headers.
 - For OpenAI-compatible Completions proxies that need vendor-specific fields, set `agents.defaults.models["provider/model"].params.extra_body` (or `extraBody`) to merge extra JSON into the outbound request body.
-- For vLLM chat-template controls, set `agents.defaults.models["provider/model"].params.chat_template_kwargs`. OpenClaw automatically sends `enable_thinking: false` and `force_nonempty_content: true` for `vllm/nemotron-3-*` when the session thinking level is off.
+- For vLLM chat-template controls, set `agents.defaults.models["provider/model"].params.chat_template_kwargs`. The bundled vLLM plugin automatically sends `enable_thinking: false` and `force_nonempty_content: true` for `vllm/nemotron-3-*` when the session thinking level is off.
 - For slow local models or remote LAN/tailnet hosts, set `models.providers.<id>.timeoutSeconds`. This extends provider model HTTP request handling, including connect, headers, body streaming, and the total guarded-fetch abort, without increasing the whole agent runtime timeout.
 - If `baseUrl` is empty/omitted, OpenClaw keeps the default OpenAI behavior (which resolves to `api.openai.com`).
 - For safety, an explicit `compat.supportsDeveloperRole: true` is still overridden on non-native `openai-completions` endpoints.
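As a worked example of the `extra_body` and `timeoutSeconds` knobs above, a minimal config sketch. The `vllm` provider id and `nemotron-3-super` model name come from this diff; the `timeoutSeconds` value and the `top_k` vendor field are illustrative assumptions, not values from the docs:

```json
{
  "models": {
    "providers": {
      "vllm": {
        "timeoutSeconds": 300
      }
    }
  },
  "agents": {
    "defaults": {
      "models": {
        "vllm/nemotron-3-super": {
          "params": {
            "extra_body": {
              "top_k": 20
            }
          }
        }
      }
    }
  }
}
```

Per the bullets above, the `extra_body` JSON is merged last into the outbound Completions body (so it wins over generated keys), while `timeoutSeconds` stretches only the provider HTTP budget, not the whole agent runtime timeout.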
@@ -371,7 +371,7 @@ Time format in system prompt. Default: `auto` (OS preference).
 - `params`: global default provider parameters applied to all models. Set at `agents.defaults.params` (e.g. `{ cacheRetention: "long" }`).
 - `params` merge precedence (config): `agents.defaults.params` (global base) is overridden by `agents.defaults.models["provider/model"].params` (per-model), then `agents.list[].params` (matching agent id) overrides by key. See [Prompt Caching](/reference/prompt-caching) for details.
 - `params.extra_body`/`params.extraBody`: advanced pass-through JSON merged into `api: "openai-completions"` request bodies for OpenAI-compatible proxies. If it collides with generated request keys, the extra body wins; non-native completions routes still strip OpenAI-only `store` afterward.
-- `params.chat_template_kwargs`: vLLM/OpenAI-compatible chat-template arguments merged into top-level `api: "openai-completions"` request bodies. For `vllm/nemotron-3-*` with thinking off, OpenClaw automatically sends `enable_thinking: false` and `force_nonempty_content: true`; explicit `chat_template_kwargs` override generated defaults, and `extra_body.chat_template_kwargs` still has final precedence. For vLLM Qwen thinking controls, set `params.qwenThinkingFormat` to `"chat-template"` or `"top-level"` on that model entry.
+- `params.chat_template_kwargs`: vLLM/OpenAI-compatible chat-template arguments merged into top-level `api: "openai-completions"` request bodies. For `vllm/nemotron-3-*` with thinking off, the bundled vLLM plugin automatically sends `enable_thinking: false` and `force_nonempty_content: true`; explicit `chat_template_kwargs` override generated defaults, and `extra_body.chat_template_kwargs` still has final precedence. For vLLM Qwen thinking controls, set `params.qwenThinkingFormat` to `"chat-template"` or `"top-level"` on that model entry.
 - `params.preserveThinking`: Z.AI-only opt-in for preserved thinking. When enabled and thinking is on, OpenClaw sends `thinking.clear_thinking: false` and replays prior `reasoning_content`; see [Z.AI thinking and preserved thinking](/providers/zai#thinking-and-preserved-thinking).
 - `agentRuntime`: default low-level agent runtime policy. Omitted id defaults to OpenClaw Pi. Use `id: "pi"` to force the built-in PI harness, `id: "auto"` to let registered plugin harnesses claim supported models, a registered harness id such as `id: "codex"`, or a supported CLI backend alias such as `id: "claude-cli"`. Set `fallback: "none"` to disable automatic PI fallback. Explicit plugin runtimes such as `codex` fail closed by default unless you set `fallback: "pi"` in the same override scope. Keep model refs canonical as `provider/model`; select Codex, Claude CLI, Gemini CLI, and other execution backends through runtime config instead of legacy runtime provider prefixes. See [Agent runtimes](/concepts/agent-runtimes) for how this differs from provider/model selection.
 - Config writers that mutate these fields (for example `/models set`, `/models set-image`, and fallback add/remove commands) save canonical object form and preserve existing fallback lists when possible.
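A minimal sketch of the three `params` precedence layers described above, lowest precedence first. The agent id `main` and the `"short"` override value are hypothetical placeholders; the key paths and the other values come from this diff:

```json
{
  "agents": {
    "defaults": {
      "params": { "cacheRetention": "long" },
      "models": {
        "vllm/nemotron-3-super": {
          "params": { "chat_template_kwargs": { "enable_thinking": false } }
        }
      }
    },
    "list": [
      { "id": "main", "params": { "cacheRetention": "short" } }
    ]
  }
}
```

Here the per-model block overrides `agents.defaults.params` key by key, and a matching `agents.list[].params` entry overrides both.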
@@ -153,7 +153,7 @@ Use explicit config when:
 <Accordion title="Nemotron 3 thinking controls">
 vLLM/Nemotron 3 can use chat-template kwargs to control whether reasoning is
 returned as hidden reasoning or visible answer text. When an OpenClaw session
-uses `vllm/nemotron-3-*` with thinking off, OpenClaw sends:
+uses `vllm/nemotron-3-*` with thinking off, the bundled vLLM plugin sends:
 
 ```json
 {
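The fenced payload above is cut off at the hunk boundary. Based on the defaults documented earlier, and pinned by the plugin tests later in this diff, it presumably continues as:

```json
{
  "chat_template_kwargs": {
    "enable_thinking": false,
    "force_nonempty_content": true
  }
}
```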
@@ -1,7 +1,10 @@
 import type { StreamFn } from "@mariozechner/pi-agent-core";
 import type { ProviderWrapStreamFnContext } from "openclaw/plugin-sdk/plugin-entry";
 import { normalizeProviderId } from "openclaw/plugin-sdk/provider-model-shared";
-import { createPayloadPatchStreamWrapper } from "openclaw/plugin-sdk/provider-stream-shared";
+import {
+  createPayloadPatchStreamWrapper,
+  isOpenAICompatibleThinkingEnabled,
+} from "openclaw/plugin-sdk/provider-stream-shared";
 
 type QwenThinkingLevel = ProviderWrapStreamFnContext["thinkingLevel"];
 
@@ -15,19 +18,6 @@ function isQwenProviderId(providerId: string): boolean {
   );
 }
 
-function resolveOpenAICompatibleThinkingEnabled(params: {
-  thinkingLevel: QwenThinkingLevel;
-  options: Parameters<StreamFn>[2];
-}): boolean {
-  const options = (params.options ?? {}) as { reasoningEffort?: unknown; reasoning?: unknown };
-  const raw = options.reasoningEffort ?? options.reasoning ?? params.thinkingLevel ?? "high";
-  if (typeof raw !== "string") {
-    return true;
-  }
-  const normalized = raw.trim().toLowerCase();
-  return normalized !== "off" && normalized !== "none";
-}
-
 export function createQwenThinkingWrapper(
   baseStreamFn: StreamFn | undefined,
   thinkingLevel: QwenThinkingLevel,
@@ -35,7 +25,7 @@ export function createQwenThinkingWrapper(
   return createPayloadPatchStreamWrapper(
     baseStreamFn,
     ({ payload: payloadObj, options }) => {
-      const enableThinking = resolveOpenAICompatibleThinkingEnabled({ thinkingLevel, options });
+      const enableThinking = isOpenAICompatibleThinkingEnabled({ thinkingLevel, options });
       payloadObj.enable_thinking = enableThinking;
       delete payloadObj.reasoning_effort;
       delete payloadObj.reasoningEffort;
@@ -1,7 +1,11 @@
 import type { StreamFn } from "@mariozechner/pi-agent-core";
 import type { Context, Model } from "@mariozechner/pi-ai";
 import { describe, expect, it } from "vitest";
-import { createVllmQwenThinkingWrapper, wrapVllmProviderStream } from "./stream.js";
+import {
+  createVllmProviderThinkingWrapper,
+  createVllmQwenThinkingWrapper,
+  wrapVllmProviderStream,
+} from "./stream.js";
 
 function capturePayload(params: {
   format: "chat-template" | "top-level";
@@ -105,6 +109,80 @@ describe("createVllmQwenThinkingWrapper", () => {
   });
 });
 
+describe("createVllmProviderThinkingWrapper", () => {
+  function captureProviderPayload(params: {
+    thinkingLevel?: "off" | "low" | "medium" | "high" | "xhigh" | "max";
+    initialPayload?: Record<string, unknown>;
+    model?: Partial<Model<"openai-completions">>;
+  }): Record<string, unknown> {
+    let captured: Record<string, unknown> = {};
+    const baseStreamFn: StreamFn = (_model, _context, options) => {
+      const payload = { ...params.initialPayload };
+      options?.onPayload?.(payload, _model);
+      captured = payload;
+      return {} as ReturnType<StreamFn>;
+    };
+
+    const wrapped = createVllmProviderThinkingWrapper({
+      baseStreamFn,
+      thinkingLevel: params.thinkingLevel ?? "high",
+    });
+    void wrapped(
+      {
+        api: "openai-completions",
+        provider: "vllm",
+        id: "nemotron-3-super",
+        reasoning: true,
+        ...params.model,
+      } as Model<"openai-completions">,
+      { messages: [] } as Context,
+      {},
+    );
+
+    return captured;
+  }
+
+  it("injects Nemotron 3 chat-template kwargs when thinking is off", () => {
+    expect(captureProviderPayload({ thinkingLevel: "off" })).toEqual({
+      chat_template_kwargs: {
+        enable_thinking: false,
+        force_nonempty_content: true,
+      },
+    });
+  });
+
+  it("does not inject Nemotron 3 chat-template kwargs when thinking is enabled", () => {
+    expect(captureProviderPayload({ thinkingLevel: "low" })).toEqual({});
+  });
+
+  it("preserves existing Nemotron 3 chat-template kwargs over defaults", () => {
+    expect(
+      captureProviderPayload({
+        thinkingLevel: "off",
+        initialPayload: {
+          chat_template_kwargs: {
+            enable_thinking: true,
+          },
+        },
+      }),
+    ).toEqual({
+      chat_template_kwargs: {
+        enable_thinking: true,
+        force_nonempty_content: true,
+      },
+    });
+  });
+
+  it("skips non-Nemotron vLLM models", () => {
+    expect(
+      captureProviderPayload({
+        thinkingLevel: "off",
+        model: { id: "Qwen/Qwen3-8B" },
+      }),
+    ).toEqual({});
+  });
+});
+
 describe("wrapVllmProviderStream", () => {
   it("registers when vLLM Qwen thinking format params are configured", () => {
     expect(
@@ -167,4 +245,36 @@ describe("wrapVllmProviderStream", () => {
       } as never),
     ).toBeUndefined();
   });
+
+  it("registers for vLLM Nemotron when thinking is off", () => {
+    expect(
+      wrapVllmProviderStream({
+        provider: "vllm",
+        modelId: "nemotron-3-super",
+        extraParams: {},
+        thinkingLevel: "off",
+        model: {
+          api: "openai-completions",
+          provider: "vllm",
+          id: "nemotron-3-super",
+        } as Model<"openai-completions">,
+        streamFn: undefined,
+      } as never),
+    ).toBeTypeOf("function");
+
+    expect(
+      wrapVllmProviderStream({
+        provider: "vllm",
+        modelId: "nemotron-3-super",
+        extraParams: {},
+        thinkingLevel: "low",
+        model: {
+          api: "openai-completions",
+          provider: "vllm",
+          id: "nemotron-3-super",
+        } as Model<"openai-completions">,
+        streamFn: undefined,
+      } as never),
+    ).toBeUndefined();
+  });
 });
@@ -1,7 +1,10 @@
 import type { StreamFn } from "@mariozechner/pi-agent-core";
 import type { ProviderWrapStreamFnContext } from "openclaw/plugin-sdk/plugin-entry";
 import { normalizeProviderId } from "openclaw/plugin-sdk/provider-model-shared";
-import { createPayloadPatchStreamWrapper } from "openclaw/plugin-sdk/provider-stream-shared";
+import {
+  createPayloadPatchStreamWrapper,
+  isOpenAICompatibleThinkingEnabled,
+} from "openclaw/plugin-sdk/provider-stream-shared";
 
 type VllmThinkingLevel = ProviderWrapStreamFnContext["thinkingLevel"];
 type VllmQwenThinkingFormat = "chat-template" | "top-level";
@@ -41,19 +44,6 @@ function resolveVllmQwenThinkingFormat(
   );
 }
 
-function resolveOpenAICompatibleThinkingEnabled(params: {
-  thinkingLevel: VllmThinkingLevel;
-  options: Parameters<StreamFn>[2];
-}): boolean {
-  const options = (params.options ?? {}) as { reasoningEffort?: unknown; reasoning?: unknown };
-  const raw = options.reasoningEffort ?? options.reasoning ?? params.thinkingLevel ?? "high";
-  if (typeof raw !== "string") {
-    return true;
-  }
-  const normalized = raw.trim().toLowerCase();
-  return normalized !== "off" && normalized !== "none";
-}
-
 function setQwenChatTemplateThinking(payload: Record<string, unknown>, enabled: boolean): void {
   const existing = payload.chat_template_kwargs;
   if (existing && typeof existing === "object" && !Array.isArray(existing)) {
@@ -73,6 +63,31 @@ function setQwenChatTemplateThinking(payload: Record<string, unknown>, enabled:
   };
 }
 
+function isVllmNemotronModel(model: { api?: unknown; provider?: unknown; id?: unknown }): boolean {
+  return (
+    model.api === "openai-completions" &&
+    typeof model.provider === "string" &&
+    normalizeProviderId(model.provider) === "vllm" &&
+    typeof model.id === "string" &&
+    /\bnemotron-3(?:[-_](?:nano|super|ultra))?\b/i.test(model.id)
+  );
+}
+
+function setNemotronThinkingOffChatTemplateKwargs(payload: Record<string, unknown>): void {
+  const defaults = {
+    enable_thinking: false,
+    force_nonempty_content: true,
+  };
+  const existing = payload.chat_template_kwargs;
+  payload.chat_template_kwargs =
+    existing && typeof existing === "object" && !Array.isArray(existing)
+      ? {
+          ...defaults,
+          ...(existing as Record<string, unknown>),
+        }
+      : defaults;
+}
+
 export function createVllmQwenThinkingWrapper(params: {
   baseStreamFn: StreamFn | undefined;
   format: VllmQwenThinkingFormat;
@@ -81,7 +96,7 @@ export function createVllmQwenThinkingWrapper(params: {
   return createPayloadPatchStreamWrapper(
     params.baseStreamFn,
     ({ payload: payloadObj, options }) => {
-      const enableThinking = resolveOpenAICompatibleThinkingEnabled({
+      const enableThinking = isOpenAICompatibleThinkingEnabled({
        thinkingLevel: params.thinkingLevel,
        options,
      });
@@ -100,17 +115,50 @@ export function createVllmQwenThinkingWrapper(params: {
   );
 }
 
+export function createVllmProviderThinkingWrapper(params: {
+  baseStreamFn: StreamFn | undefined;
+  qwenFormat?: VllmQwenThinkingFormat;
+  thinkingLevel: VllmThinkingLevel;
+}): StreamFn {
+  const qwenWrapped = params.qwenFormat
+    ? createVllmQwenThinkingWrapper({
+        baseStreamFn: params.baseStreamFn,
+        format: params.qwenFormat,
+        thinkingLevel: params.thinkingLevel,
+      })
+    : params.baseStreamFn;
+  return createPayloadPatchStreamWrapper(
+    qwenWrapped,
+    ({ payload: payloadObj }) => {
+      setNemotronThinkingOffChatTemplateKwargs(payloadObj);
+    },
+    {
+      shouldPatch: ({ model }) =>
+        model.api === "openai-completions" &&
+        params.thinkingLevel === "off" &&
+        isVllmNemotronModel(model),
+    },
+  );
+}
+
 export function wrapVllmProviderStream(ctx: ProviderWrapStreamFnContext): StreamFn | undefined {
   if (!isVllmProviderId(ctx.provider) || (ctx.model && ctx.model.api !== "openai-completions")) {
     return undefined;
   }
-  const format = resolveVllmQwenThinkingFormat(ctx.extraParams);
-  if (!format) {
+  const qwenFormat = resolveVllmQwenThinkingFormat(ctx.extraParams);
+  const shouldHandleNemotron =
+    ctx.thinkingLevel === "off" &&
+    isVllmNemotronModel({
+      api: "openai-completions",
+      provider: ctx.provider,
+      id: ctx.modelId,
+    });
+  if (!qwenFormat && !shouldHandleNemotron) {
     return undefined;
   }
-  return createVllmQwenThinkingWrapper({
+  return createVllmProviderThinkingWrapper({
     baseStreamFn: ctx.streamFn,
-    format,
+    qwenFormat,
     thinkingLevel: ctx.thinkingLevel,
   });
 }
@@ -894,85 +894,6 @@ describe("applyExtraParamsToAgent", () => {
     });
   });
 
-  it("injects vLLM Nemotron chat_template_kwargs when thinking is off", () => {
-    const payload = runResponsesPayloadMutationCase({
-      applyProvider: "vllm",
-      applyModelId: "nemotron-3-super",
-      model: {
-        api: "openai-completions",
-        provider: "vllm",
-        id: "nemotron-3-super",
-        baseUrl: "http://127.0.0.1:8000/v1",
-      } as Model<"openai-completions">,
-      payload: {
-        messages: [],
-      },
-      thinkingLevel: "off",
-    });
-
-    expect(payload.chat_template_kwargs).toEqual({
-      enable_thinking: false,
-      force_nonempty_content: true,
-    });
-  });
-
-  it("does not inject vLLM Nemotron chat_template_kwargs when thinking is enabled", () => {
-    const payload = runResponsesPayloadMutationCase({
-      applyProvider: "vllm",
-      applyModelId: "nemotron-3-super",
-      model: {
-        api: "openai-completions",
-        provider: "vllm",
-        id: "nemotron-3-super",
-        baseUrl: "http://127.0.0.1:8000/v1",
-      } as Model<"openai-completions">,
-      payload: {
-        messages: [],
-      },
-      thinkingLevel: "low",
-    });
-
-    expect(payload).not.toHaveProperty("chat_template_kwargs");
-  });
-
-  it("lets extra_body override generated vLLM Nemotron chat_template_kwargs", () => {
-    const payload = runResponsesPayloadMutationCase({
-      applyProvider: "vllm",
-      applyModelId: "nemotron-3-super",
-      cfg: {
-        agents: {
-          defaults: {
-            models: {
-              "vllm/nemotron-3-super": {
-                params: {
-                  extra_body: {
-                    chat_template_kwargs: {
-                      enable_thinking: true,
-                    },
-                  },
-                },
-              },
-            },
-          },
-        },
-      },
-      model: {
-        api: "openai-completions",
-        provider: "vllm",
-        id: "nemotron-3-super",
-        baseUrl: "http://127.0.0.1:8000/v1",
-      } as Model<"openai-completions">,
-      payload: {
-        messages: [],
-      },
-      thinkingLevel: "off",
-    });
-
-    expect(payload.chat_template_kwargs).toEqual({
-      enable_thinking: true,
-    });
-  });
-
   it("warns and skips invalid chat_template_kwargs params", () => {
     const warnSpy = vi.spyOn(log, "warn").mockImplementation(() => {});
     try {
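The `extra_body` precedence these removed tests exercised (now owned by the vLLM plugin) is still documented above. As a sketch, the config the removed override test used was:

```json
{
  "agents": {
    "defaults": {
      "models": {
        "vllm/nemotron-3-super": {
          "params": {
            "extra_body": {
              "chat_template_kwargs": { "enable_thinking": true }
            }
          }
        }
      }
    }
  }
}
```

With thinking off, the expected outbound body carried only `{"enable_thinking": true}`: the `extra_body` value replaced, rather than merged with, the generated Nemotron defaults.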
@@ -462,63 +462,25 @@ function resolveChatTemplateKwargsParam(
   return Object.keys(chatTemplateKwargs).length > 0 ? chatTemplateKwargs : undefined;
 }
 
-function isVllmNemotronModel(model: ProviderRuntimeModel): boolean {
-  return (
-    model.api === "openai-completions" &&
-    typeof model.provider === "string" &&
-    model.provider.toLowerCase() === "vllm" &&
-    typeof model.id === "string" &&
-    /\bnemotron-3(?:[-_](?:nano|super|ultra))?\b/i.test(model.id)
-  );
-}
-
-function resolveOpenAICompletionsChatTemplateKwargs(params: {
-  model: ProviderRuntimeModel;
-  thinkingLevel?: ThinkLevel;
-  configured?: Record<string, unknown>;
-}): Record<string, unknown> | undefined {
-  const defaults =
-    params.thinkingLevel === "off" && isVllmNemotronModel(params.model)
-      ? {
-          enable_thinking: false,
-          force_nonempty_content: true,
-        }
-      : undefined;
-  const merged = {
-    ...defaults,
-    ...params.configured,
-  };
-  return Object.keys(merged).length > 0 ? merged : undefined;
-}
-
 function createOpenAICompletionsChatTemplateKwargsWrapper(params: {
   baseStreamFn: StreamFn | undefined;
-  configured?: Record<string, unknown>;
-  thinkingLevel?: ThinkLevel;
+  configured: Record<string, unknown>;
 }): StreamFn {
   const underlying = params.baseStreamFn ?? streamSimple;
   return (model, context, options) => {
     if (model.api !== "openai-completions") {
       return underlying(model, context, options);
     }
-    const chatTemplateKwargs = resolveOpenAICompletionsChatTemplateKwargs({
-      model: model as ProviderRuntimeModel,
-      thinkingLevel: params.thinkingLevel,
-      configured: params.configured,
-    });
-    if (!chatTemplateKwargs) {
-      return underlying(model, context, options);
-    }
     return streamWithPayloadPatch(underlying, model, context, options, (payloadObj) => {
       const existing = payloadObj.chat_template_kwargs;
       if (existing && typeof existing === "object" && !Array.isArray(existing)) {
         payloadObj.chat_template_kwargs = {
           ...(existing as Record<string, unknown>),
-          ...chatTemplateKwargs,
+          ...params.configured,
         };
         return;
       }
-      payloadObj.chat_template_kwargs = chatTemplateKwargs;
+      payloadObj.chat_template_kwargs = params.configured;
     });
   };
 }
@@ -614,11 +576,10 @@ function applyPostPluginStreamWrappers(
     "chatTemplateKwargs",
   );
   const configuredChatTemplateKwargs = resolveChatTemplateKwargsParam(rawChatTemplateKwargs);
-  if (configuredChatTemplateKwargs || ctx.thinkingLevel === "off") {
+  if (configuredChatTemplateKwargs) {
     ctx.agent.streamFn = createOpenAICompletionsChatTemplateKwargsWrapper({
       baseStreamFn: ctx.agent.streamFn,
       configured: configuredChatTemplateKwargs,
-      thinkingLevel: ctx.thinkingLevel,
     });
   }
 
@@ -7,6 +7,7 @@ import {
   defaultToolStreamExtraParams,
   decodeHtmlEntitiesInObject,
   hasCopilotVisionInput,
+  isOpenAICompatibleThinkingEnabled,
 } from "./provider-stream-shared.js";
 
 type FakeWrappedStream = {
@@ -64,6 +65,43 @@ describe("defaultToolStreamExtraParams", () => {
   });
 });
 
+describe("isOpenAICompatibleThinkingEnabled", () => {
+  it("uses explicit request reasoning before session thinking level", () => {
+    expect(
+      isOpenAICompatibleThinkingEnabled({
+        thinkingLevel: "high",
+        options: { reasoning: "none" } as never,
+      }),
+    ).toBe(false);
+    expect(
+      isOpenAICompatibleThinkingEnabled({
+        thinkingLevel: "off",
+        options: { reasoningEffort: "medium" } as never,
+      }),
+    ).toBe(true);
+  });
+
+  it("treats off and none as disabled", () => {
+    expect(isOpenAICompatibleThinkingEnabled({ thinkingLevel: "off", options: {} })).toBe(false);
+    expect(
+      isOpenAICompatibleThinkingEnabled({
+        thinkingLevel: "high",
+        options: { reasoning: "none" } as never,
+      }),
+    ).toBe(false);
+  });
+
+  it("defaults to enabled for missing or non-string values", () => {
+    expect(isOpenAICompatibleThinkingEnabled({ thinkingLevel: undefined, options: {} })).toBe(true);
+    expect(
+      isOpenAICompatibleThinkingEnabled({
+        thinkingLevel: "off",
+        options: { reasoning: { effort: "off" } } as never,
+      }),
+    ).toBe(true);
+  });
+});
+
 describe("buildCopilotDynamicHeaders", () => {
   it("matches Copilot IDE-style request headers without the legacy Openai-Intent", () => {
     expect(
@@ -154,6 +154,21 @@ export function createPayloadPatchStreamWrapper(
   };
 }
 
+export type OpenAICompatibleThinkingLevel = ProviderWrapStreamFnContext["thinkingLevel"];
+
+export function isOpenAICompatibleThinkingEnabled(params: {
+  thinkingLevel: OpenAICompatibleThinkingLevel;
+  options: Parameters<StreamFn>[2];
+}): boolean {
+  const options = (params.options ?? {}) as { reasoningEffort?: unknown; reasoning?: unknown };
+  const raw = options.reasoningEffort ?? options.reasoning ?? params.thinkingLevel ?? "high";
+  if (typeof raw !== "string") {
+    return true;
+  }
+  const normalized = raw.trim().toLowerCase();
+  return normalized !== "off" && normalized !== "none";
+}
+
 export type DeepSeekV4ThinkingLevel = ProviderWrapStreamFnContext["thinkingLevel"];
 
 function isDisabledDeepSeekV4ThinkingLevel(thinkingLevel: DeepSeekV4ThinkingLevel): boolean {
@@ -43,6 +43,7 @@ export {
   defaultToolStreamExtraParams,
   hasCopilotVisionInput,
   isAnthropicBedrockModel,
+  isOpenAICompatibleThinkingEnabled,
   type ProviderStreamWrapperFactory,
   resolveAnthropicPayloadPolicy,
   resolveMoonshotThinkingType,