fix(gateway): support image_url in OpenAI chat completions (#34068)

* fix(gateway): parse image_url in openai chat completions

* test(gateway): cover openai chat completions image_url flows

* docs(changelog): note openai image_url chat completions fix (#17685)

* fix(gateway): harden openai image_url parsing and limits

* test(gateway): add openai image_url regression coverage

* docs(changelog): expand #17685 openai chat completions note

* Gateway: make OpenAI image_url URL fetch opt-in and configurable

* Diagnostics: redact image base64 payload data in trace logs

* Changelog: note OpenAI image_url hardening follow-ups

* Gateway: enforce OpenAI image_url total budget incrementally

* Gateway: scope OpenAI image_url extraction to the active turn

* Update CHANGELOG.md
This commit is contained in:
Vincent Koc
2026-03-06 00:35:50 -05:00
committed by GitHub
parent 36e2e04a32
commit 9c86a9fd23
16 changed files with 764 additions and 18 deletions

View File

@@ -149,6 +149,7 @@ Docs: https://docs.openclaw.ai
- Telegram/multi-account default routing clarity: warn only for ambiguous (2+) account setups without an explicit default, add `openclaw doctor` warnings for missing/invalid multi-account defaults across channels, and document explicit-default guidance for channel routing and Telegram config. (#32544) thanks @Sid-Qin.
- Telegram/plugin outbound hook parity: run `message_sending` + `message_sent` in Telegram reply delivery, include reply-path hook metadata (`mediaUrls`, `threadId`), and report `message_sent.success=false` when hooks blank text and no outbound message is delivered. (#32649) Thanks @KimGLee.
- CLI/Coding-agent reliability: switch default `claude-cli` non-interactive args to `--permission-mode bypassPermissions`, auto-normalize legacy `--dangerously-skip-permissions` backend overrides to the modern permission-mode form, align coding-agent + live-test docs with the non-PTY Claude path, and emit session system-event heartbeat notices when CLI watchdog no-output timeouts terminate runs. (#28610, #31149, #34055). Thanks @niceysam, @cryptomaltese and @vincentkoc.
- Gateway/OpenAI chat completions: parse active-turn `image_url` content parts (including parameterized data URIs and guarded URL sources), forward them as multimodal `images`, accept image-only user turns, enforce per-request image-part/byte budgets, default URL-based image fetches to disabled unless explicitly enabled by config, and redact image base64 data in cache-trace/provider payload diagnostics. (#17685) Thanks @vincentkoc.
- ACP/ACPX session bootstrap: retry with `sessions new` when `sessions ensure` returns no session identifiers so ACP spawns avoid `NO_SESSION`/`ACP_TURN_FAILED` failures on affected agents. (#28786, #31338, #34055). Thanks @Sid-Qin and @vincentkoc.
- ACP/sessions_spawn parent stream visibility: add `streamTo: "parent"` for `runtime: "acp"` to forward initial child-run progress/no-output/completion updates back into the requester session as system events (instead of direct child delivery), and emit a tail-able session-scoped relay log (`<sessionId>.acp-stream.jsonl`, returned as `streamLogPath` when available), improving orchestrator visibility for blocked or long-running harness turns. (#34310, #29909; reopened from #34055). Thanks @vincentkoc.
- Agents/bootstrap truncation warning handling: unify bootstrap budget/truncation analysis across embedded + CLI runtime, `/context`, and `openclaw doctor`; add `agents.defaults.bootstrapPromptTruncationWarning` (`off|once|always`, default `once`) and persist warning-signature metadata so truncation warnings are consistent and deduped across turns. (#32769) Thanks @gumadeiras.

View File

@@ -0,0 +1,49 @@
import crypto from "node:crypto";
import type { StreamFn } from "@mariozechner/pi-agent-core";
import { describe, expect, it } from "vitest";
import { createAnthropicPayloadLogger } from "./anthropic-payload-log.js";
// Verifies that the Anthropic payload logger redacts base64 image bytes before
// a log line is written, while keeping correlating metadata (estimated decoded
// size + sha256 of the base64 text) and still emitting a payload digest.
describe("createAnthropicPayloadLogger", () => {
  it("redacts image base64 payload data before writing logs", async () => {
    // Collect written log lines in memory instead of touching the filesystem.
    const lines: string[] = [];
    const logger = createAnthropicPayloadLogger({
      env: { OPENCLAW_ANTHROPIC_PAYLOAD_LOG: "1" }, // opt in to payload logging
      writer: {
        filePath: "memory",
        write: (line) => lines.push(line),
      },
    });
    expect(logger).not.toBeNull();
    // "QUJDRA==" is base64 for the 4 bytes "ABCD".
    const payload = {
      messages: [
        {
          role: "user",
          content: [
            {
              type: "image",
              source: { type: "base64", media_type: "image/png", data: "QUJDRA==" },
            },
          ],
        },
      ],
    };
    // Fake stream fn that immediately reports the payload to the logger hook.
    const streamFn: StreamFn = ((_, __, options) => {
      options?.onPayload?.(payload);
      return {} as never;
    }) as StreamFn;
    const wrapped = logger?.wrapStreamFn(streamFn);
    await wrapped?.({ api: "anthropic-messages" } as never, { messages: [] } as never, {});
    // First written line is the "request"-stage event; dig out the image source.
    const event = JSON.parse(lines[0]?.trim() ?? "{}") as Record<string, unknown>;
    const message = ((event.payload as { messages?: unknown[] } | undefined)?.messages ??
      []) as Array<Record<string, unknown>>;
    const source = (((message[0]?.content as Array<Record<string, unknown>> | undefined) ?? [])[0]
      ?.source ?? {}) as Record<string, unknown>;
    expect(source.data).toBe("<redacted>");
    expect(source.bytes).toBe(4); // 8 base64 chars with "==" padding -> 4 decoded bytes
    expect(source.sha256).toBe(crypto.createHash("sha256").update("QUJDRA==").digest("hex"));
    expect(event.payloadDigest).toBeDefined();
  });
});

View File

@@ -7,6 +7,7 @@ import { createSubsystemLogger } from "../logging/subsystem.js";
import { resolveUserPath } from "../utils.js";
import { parseBooleanValue } from "../utils/boolean.js";
import { safeJsonStringify } from "../utils/safe-json.js";
import { redactImageDataForDiagnostics } from "./payload-redaction.js";
import { getQueuedFileWriter, type QueuedFileWriter } from "./queued-file-writer.js";
type PayloadLogStage = "request" | "usage";
@@ -103,6 +104,7 @@ export function createAnthropicPayloadLogger(params: {
modelId?: string;
modelApi?: string | null;
workspaceDir?: string;
writer?: PayloadLogWriter;
}): AnthropicPayloadLogger | null {
const env = params.env ?? process.env;
const cfg = resolvePayloadLogConfig(env);
@@ -110,7 +112,7 @@ export function createAnthropicPayloadLogger(params: {
return null;
}
const writer = getWriter(cfg.filePath);
const writer = params.writer ?? getWriter(cfg.filePath);
const base: Omit<PayloadLogEvent, "ts" | "stage"> = {
runId: params.runId,
sessionId: params.sessionId,
@@ -135,12 +137,13 @@ export function createAnthropicPayloadLogger(params: {
return streamFn(model, context, options);
}
const nextOnPayload = (payload: unknown) => {
const redactedPayload = redactImageDataForDiagnostics(payload);
record({
...base,
ts: new Date().toISOString(),
stage: "request",
payload,
payloadDigest: digest(payload),
payload: redactedPayload,
payloadDigest: digest(redactedPayload),
});
options?.onPayload?.(payload);
};

View File

@@ -1,3 +1,4 @@
import crypto from "node:crypto";
import { describe, expect, it } from "vitest";
import type { OpenClawConfig } from "../config/config.js";
import { resolveUserPath } from "../utils.js";
@@ -89,4 +90,58 @@ describe("createCacheTrace", () => {
expect(trace).toBeNull();
});
  // Verifies cache-trace redaction: image base64 data is stripped from both
  // `options.images` and message content blocks before any line is persisted,
  // replaced with "<redacted>" plus size/sha256 correlation metadata.
  it("redacts image data from options and messages before writing", () => {
    // Capture written trace lines in memory instead of touching the filesystem.
    const lines: string[] = [];
    const trace = createCacheTrace({
      cfg: {
        diagnostics: {
          cacheTrace: {
            enabled: true,
          },
        },
      },
      env: {},
      writer: {
        filePath: "memory",
        write: (line) => lines.push(line),
      },
    });
    trace?.recordStage("stream:context", {
      options: {
        // "QUJDRA==" decodes to 4 bytes ("ABCD").
        images: [{ type: "image", mimeType: "image/png", data: "QUJDRA==" }],
      },
      messages: [
        {
          role: "user",
          content: [
            {
              // "U0VDUkVU" decodes to 6 bytes ("SECRET").
              type: "image",
              source: { type: "base64", media_type: "image/jpeg", data: "U0VDUkVU" },
            },
          ],
        },
      ] as unknown as [],
    });
    const event = JSON.parse(lines[0]?.trim() ?? "{}") as Record<string, unknown>;
    const optionsImages = (
      ((event.options as { images?: unknown[] } | undefined)?.images ?? []) as Array<
        Record<string, unknown>
      >
    )[0];
    expect(optionsImages?.data).toBe("<redacted>");
    expect(optionsImages?.bytes).toBe(4);
    expect(optionsImages?.sha256).toBe(
      crypto.createHash("sha256").update("QUJDRA==").digest("hex"),
    );
    const firstMessage = ((event.messages as Array<Record<string, unknown>> | undefined) ?? [])[0];
    const source = (((firstMessage?.content as Array<Record<string, unknown>> | undefined) ?? [])[0]
      ?.source ?? {}) as Record<string, unknown>;
    expect(source.data).toBe("<redacted>");
    expect(source.bytes).toBe(6);
    expect(source.sha256).toBe(crypto.createHash("sha256").update("U0VDUkVU").digest("hex"));
  });
});

View File

@@ -6,6 +6,7 @@ import { resolveStateDir } from "../config/paths.js";
import { resolveUserPath } from "../utils.js";
import { parseBooleanValue } from "../utils/boolean.js";
import { safeJsonStringify } from "../utils/safe-json.js";
import { redactImageDataForDiagnostics } from "./payload-redaction.js";
import { getQueuedFileWriter, type QueuedFileWriter } from "./queued-file-writer.js";
export type CacheTraceStage =
@@ -198,7 +199,7 @@ export function createCacheTrace(params: CacheTraceInit): CacheTrace | null {
event.systemDigest = digest(payload.system);
}
if (payload.options) {
event.options = payload.options;
event.options = redactImageDataForDiagnostics(payload.options) as Record<string, unknown>;
}
if (payload.model) {
event.model = payload.model;
@@ -212,7 +213,7 @@ export function createCacheTrace(params: CacheTraceInit): CacheTrace | null {
event.messageFingerprints = summary.messageFingerprints;
event.messagesDigest = summary.messagesDigest;
if (cfg.includeMessages) {
event.messages = messages;
event.messages = redactImageDataForDiagnostics(messages) as AgentMessage[];
}
}

View File

@@ -0,0 +1,64 @@
import crypto from "node:crypto";
import { estimateBase64DecodedBytes } from "../media/base64.js";
export const REDACTED_IMAGE_DATA = "<redacted>";

/** Returns the trimmed, lowercased form of a string value, or "" otherwise. */
function toLowerTrimmed(value: unknown): string {
  return typeof value === "string" ? value.trim().toLowerCase() : "";
}

/** True when any recognized mime-type field on the record starts with `image/`. */
function hasImageMime(record: Record<string, unknown>): boolean {
  const candidates = [
    toLowerTrimmed(record.mimeType),
    toLowerTrimmed(record.media_type),
    toLowerTrimmed(record.mime_type),
  ];
  return candidates.some((value) => value.startsWith("image/"));
}

/**
 * True when the record carries a string `data` payload that looks like image
 * content (`type: "image"` or an `image/*` mime field).
 *
 * The predicate only asserts what it actually checked — `data` is a string —
 * rather than claiming every property is a string.
 */
function shouldRedactImageData(
  record: Record<string, unknown>,
): record is Record<string, unknown> & { data: string } {
  if (typeof record.data !== "string") {
    return false;
  }
  const type = toLowerTrimmed(record.type);
  return type === "image" || hasImageMime(record);
}

/** SHA-256 hex digest of the (still base64-encoded) payload string. */
function digestBase64Payload(data: string): string {
  return crypto.createHash("sha256").update(data).digest("hex");
}

/**
 * Redacts image/base64 payload data from diagnostic objects before persistence.
 *
 * Returns a deep copy in which every image-bearing record's `data` field is
 * replaced by `REDACTED_IMAGE_DATA` and augmented with `bytes` (estimated
 * decoded size) and `sha256` (digest of the base64 text) so payloads remain
 * correlatable without leaking content.
 *
 * Cycle handling tracks only the ancestors of the node currently being
 * visited: genuinely cyclic references become "[Circular]" while shared
 * (diamond) references are expanded normally. Arrays participate in cycle
 * detection too (previously a self-referencing array recursed forever, and a
 * shared non-cyclic object was mislabeled "[Circular]").
 */
export function redactImageDataForDiagnostics(value: unknown): unknown {
  const ancestors = new WeakSet<object>();
  const visit = (input: unknown): unknown => {
    if (!input || typeof input !== "object") {
      return input;
    }
    if (ancestors.has(input)) {
      return "[Circular]";
    }
    ancestors.add(input);
    try {
      if (Array.isArray(input)) {
        return input.map((entry) => visit(entry));
      }
      const record = input as Record<string, unknown>;
      const out: Record<string, unknown> = {};
      for (const [key, val] of Object.entries(record)) {
        out[key] = visit(val);
      }
      if (shouldRedactImageData(record)) {
        out.data = REDACTED_IMAGE_DATA;
        out.bytes = estimateBase64DecodedBytes(record.data);
        out.sha256 = digestBase64Payload(record.data);
      }
      return out;
    } finally {
      // Pop from the ancestor path so siblings may legitimately revisit the
      // same shared object without being flagged as circular.
      ancestors.delete(input);
    }
  };
  return visit(value);
}

View File

@@ -384,6 +384,26 @@ export const FIELD_HELP: Record<string, string> = {
"Disables Control UI device identity checks and relies on token/password only. Use only for short-lived debugging on trusted networks, then turn it off immediately.",
"gateway.http.endpoints.chatCompletions.enabled":
"Enable the OpenAI-compatible `POST /v1/chat/completions` endpoint (default: false).",
"gateway.http.endpoints.chatCompletions.maxBodyBytes":
"Max request body size in bytes for `/v1/chat/completions` (default: 20MB).",
"gateway.http.endpoints.chatCompletions.maxImageParts":
"Max number of `image_url` parts accepted from the latest user message (default: 8).",
"gateway.http.endpoints.chatCompletions.maxTotalImageBytes":
"Max cumulative decoded bytes across all `image_url` parts in one request (default: 20MB).",
"gateway.http.endpoints.chatCompletions.images":
"Image fetch/validation controls for OpenAI-compatible `image_url` parts.",
"gateway.http.endpoints.chatCompletions.images.allowUrl":
"Allow server-side URL fetches for `image_url` parts (default: false; data URIs remain supported).",
"gateway.http.endpoints.chatCompletions.images.urlAllowlist":
"Optional hostname allowlist for `image_url` URL fetches; supports exact hosts and `*.example.com` wildcards.",
"gateway.http.endpoints.chatCompletions.images.allowedMimes":
"Allowed MIME types for `image_url` parts (case-insensitive list).",
"gateway.http.endpoints.chatCompletions.images.maxBytes":
"Max bytes per fetched/decoded `image_url` image (default: 10MB).",
"gateway.http.endpoints.chatCompletions.images.maxRedirects":
"Max HTTP redirects allowed when fetching `image_url` URLs (default: 3).",
"gateway.http.endpoints.chatCompletions.images.timeoutMs":
"Timeout in milliseconds for `image_url` URL fetches (default: 10000).",
"gateway.reload.mode":
'Controls how config edits are applied: "off" ignores live edits, "restart" always restarts, "hot" applies in-process, and "hybrid" tries hot then restarts if required. Keep "hybrid" for safest routine updates.',
"gateway.reload.debounceMs": "Debounce window (ms) before applying config changes.",

View File

@@ -249,6 +249,23 @@ export const FIELD_LABELS: Record<string, string> = {
"gateway.controlUi.allowInsecureAuth": "Insecure Control UI Auth Toggle",
"gateway.controlUi.dangerouslyDisableDeviceAuth": "Dangerously Disable Control UI Device Auth",
"gateway.http.endpoints.chatCompletions.enabled": "OpenAI Chat Completions Endpoint",
"gateway.http.endpoints.chatCompletions.maxBodyBytes": "OpenAI Chat Completions Max Body Bytes",
"gateway.http.endpoints.chatCompletions.maxImageParts": "OpenAI Chat Completions Max Image Parts",
"gateway.http.endpoints.chatCompletions.maxTotalImageBytes":
"OpenAI Chat Completions Max Total Image Bytes",
"gateway.http.endpoints.chatCompletions.images": "OpenAI Chat Completions Image Limits",
"gateway.http.endpoints.chatCompletions.images.allowUrl":
"OpenAI Chat Completions Allow Image URLs",
"gateway.http.endpoints.chatCompletions.images.urlAllowlist":
"OpenAI Chat Completions Image URL Allowlist",
"gateway.http.endpoints.chatCompletions.images.allowedMimes":
"OpenAI Chat Completions Image MIME Allowlist",
"gateway.http.endpoints.chatCompletions.images.maxBytes":
"OpenAI Chat Completions Image Max Bytes",
"gateway.http.endpoints.chatCompletions.images.maxRedirects":
"OpenAI Chat Completions Image Max Redirects",
"gateway.http.endpoints.chatCompletions.images.timeoutMs":
"OpenAI Chat Completions Image Timeout (ms)",
"gateway.reload.mode": "Config Reload Mode",
"gateway.reload.debounceMs": "Config Reload Debounce (ms)",
"gateway.nodes.browser.mode": "Gateway Node Browser Mode",

View File

@@ -203,6 +203,41 @@ export type GatewayHttpChatCompletionsConfig = {
* Default: false when absent.
*/
enabled?: boolean;
/**
* Max request body size in bytes for `/v1/chat/completions`.
* Default: 20MB.
*/
maxBodyBytes?: number;
/**
* Max number of `image_url` parts processed from the latest user message.
* Default: 8.
*/
maxImageParts?: number;
/**
* Max cumulative decoded image bytes for all `image_url` parts in one request.
* Default: 20MB.
*/
maxTotalImageBytes?: number;
/** Image input controls for `image_url` parts. */
images?: GatewayHttpChatCompletionsImagesConfig;
};
export type GatewayHttpChatCompletionsImagesConfig = {
  /**
   * Allow server-side URL fetches for `image_url` parts. Default: false.
   * Data-URI image parts are accepted regardless of this flag.
   */
  allowUrl?: boolean;
  /**
   * Optional hostname allowlist for `image_url` URL fetches.
   * Supports exact hosts and `*.example.com` wildcards.
   */
  urlAllowlist?: string[];
  /** Allowed MIME types for image parts (matched case-insensitively). */
  allowedMimes?: string[];
  /** Max bytes per fetched/decoded image. Default: 10MB. */
  maxBytes?: number;
  /** Max HTTP redirects followed when fetching a URL. Default: 3. */
  maxRedirects?: number;
  /** URL fetch timeout in ms. Default: 10s. */
  timeoutMs?: number;
};
export type GatewayHttpResponsesConfig = {

View File

@@ -708,6 +708,15 @@ export const OpenClawSchema = z
chatCompletions: z
.object({
enabled: z.boolean().optional(),
maxBodyBytes: z.number().int().positive().optional(),
maxImageParts: z.number().int().nonnegative().optional(),
maxTotalImageBytes: z.number().int().positive().optional(),
images: z
.object({
...ResponsesEndpointUrlFetchShape,
})
.strict()
.optional(),
})
.strict()
.optional(),

View File

@@ -133,6 +133,7 @@ describe("OpenAI-compatible HTTP API (e2e)", () => {
sessionKey?: string;
message?: string;
extraSystemPrompt?: string;
images?: Array<{ type: string; data: string; mimeType: string }>;
}
| undefined;
const getFirstAgentMessage = () => getFirstAgentCall()?.message ?? "";
@@ -251,6 +252,223 @@ describe("OpenAI-compatible HTTP API (e2e)", () => {
await res.text();
}
{
const imageData = "iVBORw0KGgoAAAANSUhEUgAAAAEAAAABCAYAAAAfFcSJAAAA";
mockAgentOnce([{ text: "looks good" }]);
const res = await postChatCompletions(port, {
model: "openclaw",
messages: [
{
role: "user",
content: [
{ type: "text", text: "describe this" },
{
type: "image_url",
image_url: { url: `data:image/png;base64,${imageData}` },
},
],
},
],
});
expect(res.status).toBe(200);
const firstCall = getFirstAgentCall();
expect(firstCall?.message).toBe("describe this");
expect(firstCall?.images).toEqual([
{ type: "image", data: imageData, mimeType: "image/png" },
]);
await res.text();
}
{
const imageData = "QUJDRA==";
mockAgentOnce([{ text: "supports data-uri params" }]);
const res = await postChatCompletions(port, {
model: "openclaw",
messages: [
{
role: "user",
content: [
{ type: "text", text: "with metadata params" },
{
type: "image_url",
image_url: { url: `data:image/png;charset=utf-8;base64,${imageData}` },
},
],
},
],
});
expect(res.status).toBe(200);
const firstCall = getFirstAgentCall();
expect(firstCall?.images).toEqual([
{ type: "image", data: imageData, mimeType: "image/png" },
]);
await res.text();
}
{
agentCommand.mockClear();
const res = await postChatCompletions(port, {
model: "openclaw",
messages: [
{
role: "user",
content: [
{
type: "image_url",
image_url: { url: "https://example.com/image.png" },
},
],
},
],
});
expect(res.status).toBe(400);
const json = (await res.json()) as Record<string, unknown>;
expect((json.error as Record<string, unknown> | undefined)?.type).toBe(
"invalid_request_error",
);
expect(agentCommand).toHaveBeenCalledTimes(0);
}
{
mockAgentOnce([{ text: "I can see the image" }]);
const res = await postChatCompletions(port, {
model: "openclaw",
messages: [
{
role: "user",
content: [
{
type: "image_url",
image_url: { url: "data:image/jpeg;base64,QUJDRA==" },
},
],
},
],
});
expect(res.status).toBe(200);
const firstCall = getFirstAgentCall();
expect(firstCall?.message).toContain("User sent image(s) with no text.");
expect(firstCall?.images).toEqual([
{ type: "image", data: "QUJDRA==", mimeType: "image/jpeg" },
]);
await res.text();
}
{
mockAgentOnce([{ text: "follow up answer" }]);
const res = await postChatCompletions(port, {
model: "openclaw",
messages: [
{
role: "user",
content: [
{ type: "image_url", image_url: { url: "data:image/png;base64,QUJDRA==" } },
],
},
{ role: "assistant", content: "I can see it." },
{ role: "user", content: "What color was it?" },
],
});
expect(res.status).toBe(200);
const firstCall = getFirstAgentCall();
expect(firstCall?.images).toBeUndefined();
expect(firstCall?.message ?? "").not.toContain("User sent image(s) with no text.");
await res.text();
}
{
mockAgentOnce([{ text: "latest image only" }]);
const res = await postChatCompletions(port, {
model: "openclaw",
messages: [
{
role: "user",
content: [
{ type: "text", text: "first" },
{ type: "image_url", image_url: { url: "data:image/png;base64,QUFBQQ==" } },
],
},
{ role: "assistant", content: "noted" },
{
role: "user",
content: [
{ type: "text", text: "second" },
{ type: "image_url", image_url: { url: "data:image/png;base64,QkJCQg==" } },
],
},
],
});
expect(res.status).toBe(200);
const firstCall = getFirstAgentCall();
expect(firstCall?.images).toEqual([
{ type: "image", data: "QkJCQg==", mimeType: "image/png" },
]);
await res.text();
}
{
const largeMessage = "x".repeat(1_200_000);
mockAgentOnce([{ text: "accepted" }]);
const res = await postChatCompletions(port, {
model: "openclaw",
messages: [{ role: "user", content: largeMessage }],
});
expect(res.status).toBe(200);
await res.text();
}
{
agentCommand.mockClear();
const res = await postChatCompletions(port, {
model: "openclaw",
messages: [
{
role: "user",
content: [
{
type: "image_url",
image_url: { url: "data:application/pdf;base64,QUJDRA==" },
},
],
},
],
});
expect(res.status).toBe(400);
const json = (await res.json()) as Record<string, unknown>;
expect((json.error as Record<string, unknown> | undefined)?.type).toBe(
"invalid_request_error",
);
expect(agentCommand).toHaveBeenCalledTimes(0);
}
{
agentCommand.mockClear();
const manyImageParts = Array.from({ length: 9 }).map(() => ({
type: "image_url",
image_url: { url: "data:image/png;base64,QUJDRA==" },
}));
const res = await postChatCompletions(port, {
model: "openclaw",
messages: [
{
role: "user",
content: manyImageParts,
},
],
});
expect(res.status).toBe(400);
const json = (await res.json()) as Record<string, unknown>;
expect((json.error as Record<string, unknown> | undefined)?.type).toBe(
"invalid_request_error",
);
expect(agentCommand).toHaveBeenCalledTimes(0);
}
{
mockAgentOnce([{ text: "I am Claude" }]);
const res = await postChatCompletions(port, {
@@ -327,6 +545,35 @@ describe("OpenAI-compatible HTTP API (e2e)", () => {
await res.text();
}
{
mockAgentOnce([{ text: "tool follow-up ok" }]);
const res = await postChatCompletions(port, {
model: "openclaw",
messages: [
{
role: "user",
content: [
{ type: "text", text: "look at this" },
{ type: "image_url", image_url: { url: "https://example.com/image.png" } },
],
},
{ role: "assistant", content: "Checking the image." },
{ role: "tool", content: "Vision tool says it is blue." },
],
});
expect(res.status).toBe(200);
const firstCall = getFirstAgentCall();
expect(firstCall?.images).toBeUndefined();
const message = getFirstAgentMessage();
expectMessageContext(message, {
history: ["User: look at this", "Assistant: Checking the image."],
current: ["Tool: Vision tool says it is blue."],
});
expect(message).not.toContain("User sent image(s) with no text.");
await res.text();
}
{
mockAgentOnce([{ text: "hello" }]);
const json = await postSyncUserMessage("hi");

View File

@@ -2,8 +2,21 @@ import { randomUUID } from "node:crypto";
import type { IncomingMessage, ServerResponse } from "node:http";
import { createDefaultDeps } from "../cli/deps.js";
import { agentCommandFromIngress } from "../commands/agent.js";
import type { ImageContent } from "../commands/agent/types.js";
import type { GatewayHttpChatCompletionsConfig } from "../config/types.gateway.js";
import { emitAgentEvent, onAgentEvent } from "../infra/agent-events.js";
import { logWarn } from "../logger.js";
import { estimateBase64DecodedBytes } from "../media/base64.js";
import {
DEFAULT_INPUT_IMAGE_MAX_BYTES,
DEFAULT_INPUT_IMAGE_MIMES,
DEFAULT_INPUT_MAX_REDIRECTS,
DEFAULT_INPUT_TIMEOUT_MS,
extractImageContentFromSource,
normalizeMimeList,
type InputImageLimits,
type InputImageSource,
} from "../media/input-files.js";
import { defaultRuntime } from "../runtime.js";
import { resolveAssistantStreamDeltaText } from "./agent-event-assistant-text.js";
import {
@@ -18,6 +31,7 @@ import { resolveGatewayRequestContext } from "./http-utils.js";
type OpenAiHttpOptions = {
auth: ResolvedGatewayAuth;
config?: GatewayHttpChatCompletionsConfig;
maxBodyBytes?: number;
trustedProxies?: string[];
allowRealIpFallback?: boolean;
@@ -37,12 +51,64 @@ type OpenAiChatCompletionRequest = {
user?: unknown;
};
const DEFAULT_OPENAI_CHAT_COMPLETIONS_BODY_BYTES = 20 * 1024 * 1024;
const IMAGE_ONLY_USER_MESSAGE = "User sent image(s) with no text.";
const DEFAULT_OPENAI_MAX_IMAGE_PARTS = 8;
const DEFAULT_OPENAI_MAX_TOTAL_IMAGE_BYTES = 20 * 1024 * 1024;
const DEFAULT_OPENAI_IMAGE_LIMITS: InputImageLimits = {
allowUrl: false,
allowedMimes: new Set(DEFAULT_INPUT_IMAGE_MIMES),
maxBytes: DEFAULT_INPUT_IMAGE_MAX_BYTES,
maxRedirects: DEFAULT_INPUT_MAX_REDIRECTS,
timeoutMs: DEFAULT_INPUT_TIMEOUT_MS,
};
type ResolvedOpenAiChatCompletionsLimits = {
maxBodyBytes: number;
maxImageParts: number;
maxTotalImageBytes: number;
images: InputImageLimits;
};
/**
 * Trims allowlist entries and drops blanks. A missing, empty, or
 * all-whitespace list collapses to `undefined` so downstream code can treat
 * "no allowlist configured" uniformly.
 */
function normalizeHostnameAllowlist(values: string[] | undefined): string[] | undefined {
  const hosts: string[] = [];
  for (const raw of values ?? []) {
    const host = raw.trim();
    if (host) {
      hosts.push(host);
    }
  }
  return hosts.length === 0 ? undefined : hosts;
}
function resolveOpenAiChatCompletionsLimits(
config: GatewayHttpChatCompletionsConfig | undefined,
): ResolvedOpenAiChatCompletionsLimits {
const imageConfig = config?.images;
return {
maxBodyBytes: config?.maxBodyBytes ?? DEFAULT_OPENAI_CHAT_COMPLETIONS_BODY_BYTES,
maxImageParts:
typeof config?.maxImageParts === "number"
? Math.max(0, Math.floor(config.maxImageParts))
: DEFAULT_OPENAI_MAX_IMAGE_PARTS,
maxTotalImageBytes:
typeof config?.maxTotalImageBytes === "number"
? Math.max(1, Math.floor(config.maxTotalImageBytes))
: DEFAULT_OPENAI_MAX_TOTAL_IMAGE_BYTES,
images: {
allowUrl: imageConfig?.allowUrl ?? DEFAULT_OPENAI_IMAGE_LIMITS.allowUrl,
urlAllowlist: normalizeHostnameAllowlist(imageConfig?.urlAllowlist),
allowedMimes: normalizeMimeList(imageConfig?.allowedMimes, DEFAULT_INPUT_IMAGE_MIMES),
maxBytes: imageConfig?.maxBytes ?? DEFAULT_INPUT_IMAGE_MAX_BYTES,
maxRedirects: imageConfig?.maxRedirects ?? DEFAULT_INPUT_MAX_REDIRECTS,
timeoutMs: imageConfig?.timeoutMs ?? DEFAULT_INPUT_TIMEOUT_MS,
},
};
}
/** Writes one Server-Sent Events frame: a single `data:` field terminated by a blank line. */
function writeSse(res: ServerResponse, data: unknown) {
  const serialized = JSON.stringify(data);
  res.write("data: " + serialized + "\n\n");
}
function buildAgentCommandInput(params: {
prompt: { message: string; extraSystemPrompt?: string };
prompt: { message: string; extraSystemPrompt?: string; images?: ImageContent[] };
sessionKey: string;
runId: string;
messageChannel: string;
@@ -50,6 +116,7 @@ function buildAgentCommandInput(params: {
return {
message: params.prompt.message,
extraSystemPrompt: params.prompt.extraSystemPrompt,
images: params.prompt.images,
sessionKey: params.sessionKey,
runId: params.runId,
deliver: false as const,
@@ -123,7 +190,142 @@ function extractTextContent(content: unknown): string {
return "";
}
function buildAgentPrompt(messagesUnknown: unknown): {
/**
 * Extracts the URL string from an OpenAI `image_url` content part.
 * Accepts both the string shorthand (`image_url: "..."`) and the object form
 * (`image_url: { url: "..." }`). Returns `undefined` for anything else,
 * including blank/whitespace-only URLs.
 */
function resolveImageUrlPart(part: unknown): string | undefined {
  if (!part || typeof part !== "object") {
    return undefined;
  }
  const candidate = (part as { image_url?: unknown }).image_url;
  let raw: unknown;
  if (typeof candidate === "string") {
    raw = candidate;
  } else if (candidate && typeof candidate === "object") {
    raw = (candidate as { url?: unknown }).url;
  }
  if (typeof raw !== "string") {
    return undefined;
  }
  const url = raw.trim();
  return url === "" ? undefined : url;
}

/**
 * Collects every non-empty `image_url` URL from a message's content array,
 * preserving order. Non-array content yields no URLs.
 */
function extractImageUrls(content: unknown): string[] {
  if (!Array.isArray(content)) {
    return [];
  }
  const urls: string[] = [];
  for (const part of content) {
    const isImagePart =
      !!part && typeof part === "object" && (part as { type?: unknown }).type === "image_url";
    if (!isImagePart) {
      continue;
    }
    const url = resolveImageUrlPart(part);
    if (url !== undefined) {
      urls.push(url);
    }
  }
  return urls;
}
type ActiveTurnContext = {
activeTurnIndex: number;
activeUserMessageIndex: number;
urls: string[];
};
/**
 * Converts an `image_url` value into an `InputImageSource`.
 *
 * Data URIs must be base64-encoded (`data:[<mediatype>][;params];base64,<data>`)
 * and carry a non-empty payload; any non-data-URI string becomes a plain URL
 * source for the (config-gated) fetch path.
 *
 * Per RFC 2397 the media type, when present, is the leading `;`-delimited
 * component of the metadata; later components are parameters such as
 * `charset=utf-8`. We therefore only accept the first component as a media
 * type — the previous `.find(part => part.includes("/"))` could mistake a
 * parameter value containing "/" (e.g. `charset=a/b`) for a media type.
 *
 * @throws Error when a data URI is not base64-encoded or has no payload data.
 */
function parseImageUrlToSource(url: string): InputImageSource {
  const dataUriMatch = /^data:([^,]*?),(.*)$/is.exec(url);
  if (!dataUriMatch) {
    return { type: "url", url };
  }
  const metadata = dataUriMatch[1]?.trim() ?? "";
  const data = dataUriMatch[2] ?? "";
  const metadataParts = metadata
    .split(";")
    .map((part) => part.trim())
    .filter(Boolean);
  const isBase64 = metadataParts.some((part) => part.toLowerCase() === "base64");
  if (!isBase64) {
    throw new Error("image_url data URI must be base64 encoded");
  }
  if (!data.trim()) {
    throw new Error("image_url data URI is missing payload data");
  }
  // Media type is only ever the leading component and never a key=value param.
  const first = metadataParts[0];
  const mediaTypeRaw = first && first.includes("/") && !first.includes("=") ? first : undefined;
  return {
    type: "base64",
    mediaType: mediaTypeRaw,
    data,
  };
}
/**
 * Finds the active turn: the most recent user or tool message in the
 * conversation ("function" is treated as an alias of "tool"). Image URLs are
 * gathered only when that message is a user turn. When no user/tool message
 * exists, both indexes are -1 and no URLs are returned.
 */
function resolveActiveTurnContext(messagesUnknown: unknown): ActiveTurnContext {
  const messages = asMessages(messagesUnknown);
  for (let index = messages.length - 1; index >= 0; index -= 1) {
    const candidate = messages[index];
    if (!candidate || typeof candidate !== "object") {
      continue;
    }
    const rawRole = typeof candidate.role === "string" ? candidate.role.trim() : "";
    const role = rawRole === "function" ? "tool" : rawRole;
    if (role === "user") {
      return {
        activeTurnIndex: index,
        activeUserMessageIndex: index,
        urls: extractImageUrls(candidate.content),
      };
    }
    if (role === "tool") {
      return { activeTurnIndex: index, activeUserMessageIndex: -1, urls: [] };
    }
  }
  return { activeTurnIndex: -1, activeUserMessageIndex: -1, urls: [] };
}
/**
 * Resolves the active turn's `image_url` entries into `ImageContent` values,
 * enforcing the per-request part count and cumulative byte budget.
 *
 * Budget accounting is incremental: base64 sources are charged (via their
 * estimated decoded size) *before* extraction so oversized payloads fail fast
 * without being processed; URL sources can only be measured after the fetch,
 * so they are charged immediately afterwards. In both cases the running total
 * is checked as soon as it can grow.
 *
 * @throws Error when there are too many image parts, the cumulative budget is
 *   exceeded, or `extractImageContentFromSource` rejects a source (e.g.
 *   disallowed URL fetch, bad mime, per-image size limit, timeout).
 */
async function resolveImagesForRequest(
  activeTurnContext: Pick<ActiveTurnContext, "urls">,
  limits: ResolvedOpenAiChatCompletionsLimits,
): Promise<ImageContent[]> {
  const urls = activeTurnContext.urls;
  if (urls.length === 0) {
    return [];
  }
  if (urls.length > limits.maxImageParts) {
    throw new Error(`Too many image_url parts (${urls.length}; limit ${limits.maxImageParts})`);
  }
  const images: ImageContent[] = [];
  let totalBytes = 0;
  for (const url of urls) {
    const source = parseImageUrlToSource(url);
    if (source.type === "base64") {
      // Charge the budget before extraction so we never process bytes that
      // already blow the cumulative limit.
      totalBytes += estimateBase64DecodedBytes(source.data);
      if (totalBytes > limits.maxTotalImageBytes) {
        throw new Error(
          `Total image payload too large (${totalBytes}; limit ${limits.maxTotalImageBytes})`,
        );
      }
    }
    const image = await extractImageContentFromSource(source, limits.images);
    if (source.type !== "base64") {
      // URL sources are measurable only after the fetch; charge them now.
      totalBytes += estimateBase64DecodedBytes(image.data);
    }
    if (totalBytes > limits.maxTotalImageBytes) {
      throw new Error(
        `Total image payload too large (${totalBytes}; limit ${limits.maxTotalImageBytes})`,
      );
    }
    images.push(image);
  }
  return images;
}
function buildAgentPrompt(
messagesUnknown: unknown,
activeUserMessageIndex: number,
): {
message: string;
extraSystemPrompt?: string;
} {
@@ -132,17 +334,20 @@ function buildAgentPrompt(messagesUnknown: unknown): {
const systemParts: string[] = [];
const conversationEntries: ConversationEntry[] = [];
for (const msg of messages) {
for (const [i, msg] of messages.entries()) {
if (!msg || typeof msg !== "object") {
continue;
}
const role = typeof msg.role === "string" ? msg.role.trim() : "";
const content = extractTextContent(msg.content).trim();
if (!role || !content) {
const hasImage = extractImageUrls(msg.content).length > 0;
if (!role) {
continue;
}
if (role === "system" || role === "developer") {
systemParts.push(content);
if (content) {
systemParts.push(content);
}
continue;
}
@@ -151,6 +356,16 @@ function buildAgentPrompt(messagesUnknown: unknown): {
continue;
}
// Keep the image-only placeholder scoped to the active user turn so we don't
// mention historical image-only turns whose bytes are intentionally not replayed.
const messageContent =
normalizedRole === "user" && !content && hasImage && i === activeUserMessageIndex
? IMAGE_ONLY_USER_MESSAGE
: content;
if (!messageContent) {
continue;
}
const name = typeof msg.name === "string" ? msg.name.trim() : "";
const sender =
normalizedRole === "assistant"
@@ -163,7 +378,7 @@ function buildAgentPrompt(messagesUnknown: unknown): {
conversationEntries.push({
role: normalizedRole,
entry: { sender, body: content },
entry: { sender, body: messageContent },
});
}
@@ -199,13 +414,14 @@ export async function handleOpenAiHttpRequest(
res: ServerResponse,
opts: OpenAiHttpOptions,
): Promise<boolean> {
const limits = resolveOpenAiChatCompletionsLimits(opts.config);
const handled = await handleGatewayPostJsonEndpoint(req, res, {
pathname: "/v1/chat/completions",
auth: opts.auth,
trustedProxies: opts.trustedProxies,
allowRealIpFallback: opts.allowRealIpFallback,
rateLimiter: opts.rateLimiter,
maxBodyBytes: opts.maxBodyBytes ?? 1024 * 1024,
maxBodyBytes: opts.maxBodyBytes ?? limits.maxBodyBytes,
});
if (handled === false) {
return false;
@@ -227,8 +443,23 @@ export async function handleOpenAiHttpRequest(
defaultMessageChannel: "webchat",
useMessageChannelHeader: true,
});
const prompt = buildAgentPrompt(payload.messages);
if (!prompt.message) {
const activeTurnContext = resolveActiveTurnContext(payload.messages);
const prompt = buildAgentPrompt(payload.messages, activeTurnContext.activeUserMessageIndex);
let images: ImageContent[] = [];
try {
images = await resolveImagesForRequest(activeTurnContext, limits);
} catch (err) {
logWarn(`openai-compat: invalid image_url content: ${String(err)}`);
sendJson(res, 400, {
error: {
message: "Invalid image_url content in `messages`.",
type: "invalid_request_error",
},
});
return true;
}
if (!prompt.message && images.length === 0) {
sendJson(res, 400, {
error: {
message: "Missing user message in `messages`.",
@@ -241,7 +472,11 @@ export async function handleOpenAiHttpRequest(
const runId = `chatcmpl_${randomUUID()}`;
const deps = createDefaultDeps();
const commandInput = buildAgentCommandInput({
prompt,
prompt: {
message: prompt.message,
extraSystemPrompt: prompt.extraSystemPrompt,
images: images.length > 0 ? images : undefined,
},
sessionKey,
runId,
messageChannel,

View File

@@ -509,6 +509,7 @@ export function createGatewayHttpServer(opts: {
controlUiBasePath: string;
controlUiRoot?: ControlUiRootState;
openAiChatCompletionsEnabled: boolean;
openAiChatCompletionsConfig?: import("../config/types.gateway.js").GatewayHttpChatCompletionsConfig;
openResponsesEnabled: boolean;
openResponsesConfig?: import("../config/types.gateway.js").GatewayHttpResponsesConfig;
strictTransportSecurityHeader?: string;
@@ -527,6 +528,7 @@ export function createGatewayHttpServer(opts: {
controlUiBasePath,
controlUiRoot,
openAiChatCompletionsEnabled,
openAiChatCompletionsConfig,
openResponsesEnabled,
openResponsesConfig,
strictTransportSecurityHeader,
@@ -610,6 +612,7 @@ export function createGatewayHttpServer(opts: {
run: () =>
handleOpenAiHttpRequest(req, res, {
auth: resolvedAuth,
config: openAiChatCompletionsConfig,
trustedProxies,
allowRealIpFallback,
rateLimiter,

View File

@@ -23,6 +23,7 @@ export type GatewayRuntimeConfig = {
bindHost: string;
controlUiEnabled: boolean;
openAiChatCompletionsEnabled: boolean;
openAiChatCompletionsConfig?: import("../config/types.gateway.js").GatewayHttpChatCompletionsConfig;
openResponsesEnabled: boolean;
openResponsesConfig?: import("../config/types.gateway.js").GatewayHttpResponsesConfig;
strictTransportSecurityHeader?: string;
@@ -73,10 +74,9 @@ export async function resolveGatewayRuntimeConfig(params: {
}
const controlUiEnabled =
params.controlUiEnabled ?? params.cfg.gateway?.controlUi?.enabled ?? true;
const openAiChatCompletionsConfig = params.cfg.gateway?.http?.endpoints?.chatCompletions;
const openAiChatCompletionsEnabled =
params.openAiChatCompletionsEnabled ??
params.cfg.gateway?.http?.endpoints?.chatCompletions?.enabled ??
false;
params.openAiChatCompletionsEnabled ?? openAiChatCompletionsConfig?.enabled ?? false;
const openResponsesConfig = params.cfg.gateway?.http?.endpoints?.responses;
const openResponsesEnabled = params.openResponsesEnabled ?? openResponsesConfig?.enabled ?? false;
const strictTransportSecurityConfig =
@@ -168,6 +168,9 @@ export async function resolveGatewayRuntimeConfig(params: {
bindHost,
controlUiEnabled,
openAiChatCompletionsEnabled,
openAiChatCompletionsConfig: openAiChatCompletionsConfig
? { ...openAiChatCompletionsConfig, enabled: openAiChatCompletionsEnabled }
: undefined,
openResponsesEnabled,
openResponsesConfig: openResponsesConfig
? { ...openResponsesConfig, enabled: openResponsesEnabled }

View File

@@ -43,6 +43,7 @@ export async function createGatewayRuntimeState(params: {
controlUiBasePath: string;
controlUiRoot?: ControlUiRootState;
openAiChatCompletionsEnabled: boolean;
openAiChatCompletionsConfig?: import("../config/types.gateway.js").GatewayHttpChatCompletionsConfig;
openResponsesEnabled: boolean;
openResponsesConfig?: import("../config/types.gateway.js").GatewayHttpResponsesConfig;
strictTransportSecurityHeader?: string;
@@ -146,6 +147,7 @@ export async function createGatewayRuntimeState(params: {
controlUiBasePath: params.controlUiBasePath,
controlUiRoot: params.controlUiRoot,
openAiChatCompletionsEnabled: params.openAiChatCompletionsEnabled,
openAiChatCompletionsConfig: params.openAiChatCompletionsConfig,
openResponsesEnabled: params.openResponsesEnabled,
openResponsesConfig: params.openResponsesConfig,
strictTransportSecurityHeader: params.strictTransportSecurityHeader,

View File

@@ -487,6 +487,7 @@ export async function startGatewayServer(
bindHost,
controlUiEnabled,
openAiChatCompletionsEnabled,
openAiChatCompletionsConfig,
openResponsesEnabled,
openResponsesConfig,
strictTransportSecurityHeader,
@@ -571,6 +572,7 @@ export async function startGatewayServer(
controlUiBasePath,
controlUiRoot: controlUiRootState,
openAiChatCompletionsEnabled,
openAiChatCompletionsConfig,
openResponsesEnabled,
openResponsesConfig,
strictTransportSecurityHeader,