From 85ded4d444cbbd3b144d939fbf5d4c0824e820e1 Mon Sep 17 00:00:00 2001 From: JC Date: Wed, 6 May 2026 01:34:42 -0700 Subject: [PATCH] pdf: add Codex instructions for extraction fallback (#51329) * Fix Codex PDF extraction fallback missing instructions - add a Codex-specific systemPrompt on the PDF extraction fallback path - keep non-Codex PDF fallback requests unchanged - add regression coverage proving openai-codex-responses requests include instructions for PDF tool calls * test: cover Codex text-only extraction fallback - add regression coverage for the branch where PDF extraction includes images but the selected Codex model only accepts text input - assert Codex-specific extraction instructions are still attached in that path * test: fix extracted image mock shape - add the required `type: "image"` field to the text-only fallback regression mock - keep the new Codex coverage test aligned with PdfExtractedImage * test: align Codex PDF fallback tests * docs(changelog): note PDF Codex fallback fix --------- Co-authored-by: Dr JCai Co-authored-by: anyech <8743351+anyech@users.noreply.github.com> --- CHANGELOG.md | 1 + src/agents/tools/pdf-tool.test.ts | 85 +++++++++++++++++++++++++++++++ src/agents/tools/pdf-tool.ts | 16 ++++-- 3 files changed, 99 insertions(+), 3 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 2b6706401fe..2519c3f2fe3 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -111,6 +111,7 @@ Docs: https://docs.openclaw.ai ### Fixes - Google Meet/Voice Call: wait longer before playing PIN-derived Twilio DTMF for Meet dial-in prompts and retire stale delegated phone sessions instead of reusing completed calls. +- PDF/Codex: include extraction-fallback instructions for `openai-codex/*` PDF tool requests so Codex Responses receives its required system prompt. Fixes #77872. Thanks @anyech. - Onboard/channels: recover externalized channel plugins from stale `channels.` config by falling back to `ensureChannelSetupPluginInstalled` via the trusted catalog when the plugin is missing on disk, so leftover `appId`/token entries no longer dead-end onboard with " plugin not available." (#78328) Thanks @sliverp. - Codex/app-server: forward the OpenClaw workspace bootstrap block through Codex `developerInstructions` instead of `config.instructions`, so persona/style guidance reaches the behavior-shaping app-server lane. Fixes #77363. Thanks @lonexreb. - CLI/infer: pass minimal instructions to local `openai-codex/*` model probes and surface provider error details when `infer model run` returns no text. Fixes #76464. Thanks @lilesjtu. diff --git a/src/agents/tools/pdf-tool.test.ts b/src/agents/tools/pdf-tool.test.ts index 77a30edd71e..5e5306482bf 100644 --- a/src/agents/tools/pdf-tool.test.ts +++ b/src/agents/tools/pdf-tool.test.ts @@ -36,6 +36,7 @@ async function loadCreatePdfTool() { const ANTHROPIC_PDF_MODEL = "anthropic/claude-opus-4-6"; const OPENAI_PDF_MODEL = "openai/gpt-5.4-mini"; +const CODEX_PDF_MODEL = "openai-codex/gpt-5.4"; const FAKE_PDF_MEDIA = { kind: "document", buffer: Buffer.from("%PDF-1.4 fake"), @@ -85,6 +86,7 @@ async function stubPdfToolInfra( mockLoad?: boolean; provider?: string; input?: string[]; + api?: string; modelFound?: boolean; }, ) { @@ -102,6 +104,13 @@ async function stubPdfToolInfra( : () => ({ provider: params?.provider ?? "anthropic", + api: + params?.api ?? + (params?.provider === "openai-codex" + ? "openai-codex-responses" + : params?.provider === "openai" + ? "openai-responses" + : "anthropic-messages"), maxTokens: 8192, input: params?.input ?? ["text", "document"], }) as never; @@ -469,6 +478,82 @@ describe("createPdfTool", () => { content: [{ type: "text", text: "fallback summary" }], details: { native: false, model: OPENAI_PDF_MODEL }, }); + const [, context] = completeMock.mock.calls[0] ?? []; + expect(context?.systemPrompt).toBeUndefined(); + }); + }); + + it("adds Codex instructions for PDF extraction fallback requests", async () => { + await withTempPdfAgentDir(async (agentDir) => { + await stubPdfToolInfra(agentDir, { + provider: "openai-codex", + api: "openai-codex-responses", + input: ["text", "image"], + }); + + vi.spyOn(pdfExtractModule, "extractPdfContent").mockResolvedValue({ + text: "Extracted content", + images: [], + }); + + completeMock.mockResolvedValue({ + role: "assistant", + stopReason: "stop", + content: [{ type: "text", text: "codex summary" }], + } as never); + + const cfg = withPdfModel(CODEX_PDF_MODEL); + const tool = requirePdfTool((await loadCreatePdfTool())({ config: cfg, agentDir })); + + const result = await tool.execute("t1", { + prompt: "summarize", + pdf: "/tmp/doc.pdf", + }); + + expect(result).toMatchObject({ + content: [{ type: "text", text: "codex summary" }], + details: { native: false, model: CODEX_PDF_MODEL }, + }); + expect(completeMock).toHaveBeenCalledTimes(1); + const [, context] = completeMock.mock.calls[0] ?? []; + expect(context?.systemPrompt).toContain("Analyze the provided PDF content"); + }); + }); + + it("adds Codex instructions when extraction has images but the model only accepts text", async () => { + await withTempPdfAgentDir(async (agentDir) => { + await stubPdfToolInfra(agentDir, { + provider: "openai-codex", + api: "openai-codex-responses", + input: ["text"], + }); + + vi.spyOn(pdfExtractModule, "extractPdfContent").mockResolvedValue({ + text: "Extracted content", + images: [{ type: "image", data: "base64img", mimeType: "image/png" }], + }); + + completeMock.mockResolvedValue({ + role: "assistant", + stopReason: "stop", + content: [{ type: "text", text: "codex summary" }], + } as never); + + const cfg = withPdfModel(CODEX_PDF_MODEL); + const tool = requirePdfTool((await loadCreatePdfTool())({ config: cfg, agentDir })); + + const result = await tool.execute("t1", { + prompt: "summarize", + pdf: "/tmp/doc.pdf", + }); + + expect(result).toMatchObject({ + content: [{ type: "text", text: "codex summary" }], + details: { native: false, model: CODEX_PDF_MODEL }, + }); + expect(completeMock).toHaveBeenCalledTimes(1); + const [, context] = completeMock.mock.calls[0] ?? []; + expect(context?.systemPrompt).toContain("Analyze the provided PDF content"); }); }); diff --git a/src/agents/tools/pdf-tool.ts b/src/agents/tools/pdf-tool.ts index 65fbac9af24..32e83e41aa6 100644 --- a/src/agents/tools/pdf-tool.ts +++ b/src/agents/tools/pdf-tool.ts @@ -90,7 +90,14 @@ function hasExplicitPdfToolModelConfig(config?: OpenClawConfig): boolean { // Build context for extraction fallback path // --------------------------------------------------------------------------- -function buildPdfExtractionContext(prompt: string, extractions: PdfExtractedContent[]): Context { +const CODEX_PDF_INSTRUCTIONS = + "Analyze the provided PDF content and answer the user's request accurately."; + +function buildPdfExtractionContext( + prompt: string, + extractions: PdfExtractedContent[], + model?: { api?: string }, +): Context { const content: Array< { type: "text"; text: string } | { type: "image"; data: string; mimeType: string } > = []; @@ -110,7 +117,10 @@ function buildPdfExtractionContext(prompt: string, extractions: PdfExtractedCont // Add the user prompt content.push({ type: "text", text: prompt }); + const systemPrompt = model?.api === "openai-codex-responses" ? CODEX_PDF_INSTRUCTIONS : undefined; + return { + ...(systemPrompt ? { systemPrompt } : {}), messages: [{ role: "user", content, timestamp: Date.now() }], }; } @@ -217,7 +227,7 @@ async function runPdfPrompt(params: { text: e.text, images: [], })); - const context = buildPdfExtractionContext(params.prompt, textOnlyExtractions); + const context = buildPdfExtractionContext(params.prompt, textOnlyExtractions, model); const message = await complete(model, context, { apiKey, maxTokens: resolvePdfToolMaxTokens(model.maxTokens), @@ -226,7 +236,7 @@ async function runPdfPrompt(params: { return { text, provider, model: modelId, native: false }; } - const context = buildPdfExtractionContext(params.prompt, extractions); + const context = buildPdfExtractionContext(params.prompt, extractions, model); const message = await complete(model, context, { apiKey, maxTokens: resolvePdfToolMaxTokens(model.maxTokens),