From 85ded4d444cbbd3b144d939fbf5d4c0824e820e1 Mon Sep 17 00:00:00 2001
From: JC <anyech@users.noreply.github.com>
Date: Wed, 6 May 2026 01:34:42 -0700
Subject: [PATCH] pdf: add Codex instructions for extraction fallback (#51329)

* Fix missing instructions on Codex PDF extraction fallback requests

- add a Codex-specific systemPrompt on the PDF extraction fallback path
- keep non-Codex PDF fallback requests unchanged
- add regression coverage proving openai-codex-responses requests include instructions for PDF tool calls

* test: cover Codex text-only extraction fallback

- add regression coverage for the branch where PDF extraction includes images
  but the selected Codex model only accepts text input
- assert Codex-specific extraction instructions are still attached in that path

* test: fix extracted image mock shape

- add the required `type: "image"` field to the text-only fallback regression mock
- keep the new Codex coverage test aligned with PdfExtractedImage

* test: align Codex PDF fallback tests

* docs(changelog): note PDF Codex fallback fix

---------

Co-authored-by: Dr JCai <jingxiao.cai@gmail.com>
Co-authored-by: anyech <8743351+anyech@users.noreply.github.com>
---
 CHANGELOG.md                      |  1 +
 src/agents/tools/pdf-tool.test.ts | 85 +++++++++++++++++++++++++++++++
 src/agents/tools/pdf-tool.ts      | 16 ++++--
 3 files changed, 99 insertions(+), 3 deletions(-)
diff --git a/CHANGELOG.md b/CHANGELOG.md
index 2b6706401fe..2519c3f2fe3 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -111,6 +111,7 @@ Docs: https://docs.openclaw.ai
 ### Fixes
 
 - Google Meet/Voice Call: wait longer before playing PIN-derived Twilio DTMF for Meet dial-in prompts and retire stale delegated phone sessions instead of reusing completed calls.
+- PDF/Codex: include extraction-fallback instructions for `openai-codex/*` PDF tool requests so Codex Responses receives its required system prompt. Fixes #77872. Thanks @anyech.
 - Onboard/channels: recover externalized channel plugins from stale `channels.<id>` config by falling back to `ensureChannelSetupPluginInstalled` via the trusted catalog when the plugin is missing on disk, so leftover `appId`/token entries no longer dead-end onboard with "<channel> plugin not available." (#78328) Thanks @sliverp.
 - Codex/app-server: forward the OpenClaw workspace bootstrap block through Codex `developerInstructions` instead of `config.instructions`, so persona/style guidance reaches the behavior-shaping app-server lane. Fixes #77363. Thanks @lonexreb.
 - CLI/infer: pass minimal instructions to local `openai-codex/*` model probes and surface provider error details when `infer model run` returns no text. Fixes #76464. Thanks @lilesjtu.
diff --git a/src/agents/tools/pdf-tool.test.ts b/src/agents/tools/pdf-tool.test.ts
index 77a30edd71e..5e5306482bf 100644
--- a/src/agents/tools/pdf-tool.test.ts
+++ b/src/agents/tools/pdf-tool.test.ts
@@ -36,6 +36,7 @@ async function loadCreatePdfTool() {
 
 const ANTHROPIC_PDF_MODEL = "anthropic/claude-opus-4-6";
 const OPENAI_PDF_MODEL = "openai/gpt-5.4-mini";
+const CODEX_PDF_MODEL = "openai-codex/gpt-5.4";
 const FAKE_PDF_MEDIA = {
   kind: "document",
   buffer: Buffer.from("%PDF-1.4 fake"),
@@ -85,6 +86,7 @@ async function stubPdfToolInfra(
     mockLoad?: boolean;
     provider?: string;
     input?: string[];
+    api?: string;
     modelFound?: boolean;
   },
 ) {
@@ -102,6 +104,13 @@ async function stubPdfToolInfra(
       : () =>
           ({
             provider: params?.provider ?? "anthropic",
+            api:
+              params?.api ??
+              (params?.provider === "openai-codex"
+                ? "openai-codex-responses"
+                : params?.provider === "openai"
+                  ? "openai-responses"
+                  : "anthropic-messages"),
             maxTokens: 8192,
             input: params?.input ?? ["text", "document"],
           }) as never;
@@ -469,6 +478,82 @@ describe("createPdfTool", () => {
         content: [{ type: "text", text: "fallback summary" }],
         details: { native: false, model: OPENAI_PDF_MODEL },
       });
+      const [, context] = completeMock.mock.calls[0] ?? [];
+      expect(context?.systemPrompt).toBeUndefined();
+    });
+  });
+
+  it("adds Codex instructions for PDF extraction fallback requests", async () => {
+    await withTempPdfAgentDir(async (agentDir) => {
+      await stubPdfToolInfra(agentDir, {
+        provider: "openai-codex",
+        api: "openai-codex-responses",
+        input: ["text", "image"],
+      });
+
+      vi.spyOn(pdfExtractModule, "extractPdfContent").mockResolvedValue({
+        text: "Extracted content",
+        images: [],
+      });
+
+      completeMock.mockResolvedValue({
+        role: "assistant",
+        stopReason: "stop",
+        content: [{ type: "text", text: "codex summary" }],
+      } as never);
+
+      const cfg = withPdfModel(CODEX_PDF_MODEL);
+      const tool = requirePdfTool((await loadCreatePdfTool())({ config: cfg, agentDir }));
+
+      const result = await tool.execute("t1", {
+        prompt: "summarize",
+        pdf: "/tmp/doc.pdf",
+      });
+
+      expect(result).toMatchObject({
+        content: [{ type: "text", text: "codex summary" }],
+        details: { native: false, model: CODEX_PDF_MODEL },
+      });
+      expect(completeMock).toHaveBeenCalledTimes(1);
+      const [, context] = completeMock.mock.calls[0] ?? [];
+      expect(context?.systemPrompt).toContain("Analyze the provided PDF content");
+    });
+  });
+
+  it("adds Codex instructions when extraction has images but the model only accepts text", async () => {
+    await withTempPdfAgentDir(async (agentDir) => {
+      await stubPdfToolInfra(agentDir, {
+        provider: "openai-codex",
+        api: "openai-codex-responses",
+        input: ["text"],
+      });
+
+      vi.spyOn(pdfExtractModule, "extractPdfContent").mockResolvedValue({
+        text: "Extracted content",
+        images: [{ type: "image", data: "base64img", mimeType: "image/png" }],
+      });
+
+      completeMock.mockResolvedValue({
+        role: "assistant",
+        stopReason: "stop",
+        content: [{ type: "text", text: "codex summary" }],
+      } as never);
+
+      const cfg = withPdfModel(CODEX_PDF_MODEL);
+      const tool = requirePdfTool((await loadCreatePdfTool())({ config: cfg, agentDir }));
+
+      const result = await tool.execute("t1", {
+        prompt: "summarize",
+        pdf: "/tmp/doc.pdf",
+      });
+
+      expect(result).toMatchObject({
+        content: [{ type: "text", text: "codex summary" }],
+        details: { native: false, model: CODEX_PDF_MODEL },
+      });
+      expect(completeMock).toHaveBeenCalledTimes(1);
+      const [, context] = completeMock.mock.calls[0] ?? [];
+      expect(context?.systemPrompt).toContain("Analyze the provided PDF content");
     });
   });
 
diff --git a/src/agents/tools/pdf-tool.ts b/src/agents/tools/pdf-tool.ts
index 65fbac9af24..32e83e41aa6 100644
--- a/src/agents/tools/pdf-tool.ts
+++ b/src/agents/tools/pdf-tool.ts
@@ -90,7 +90,14 @@ function hasExplicitPdfToolModelConfig(config?: OpenClawConfig): boolean {
 // Build context for extraction fallback path
 // ---------------------------------------------------------------------------
 
-function buildPdfExtractionContext(prompt: string, extractions: PdfExtractedContent[]): Context {
+const CODEX_PDF_INSTRUCTIONS =
+  "Analyze the provided PDF content and answer the user's request accurately.";
+
+function buildPdfExtractionContext(
+  prompt: string,
+  extractions: PdfExtractedContent[],
+  model?: { api?: string },
+): Context {
   const content: Array<
     { type: "text"; text: string } | { type: "image"; data: string; mimeType: string }
   > = [];
@@ -110,7 +117,10 @@ function buildPdfExtractionContext(prompt: string, extractions: PdfExtractedCont
   // Add the user prompt
   content.push({ type: "text", text: prompt });
 
+  const systemPrompt = model?.api === "openai-codex-responses" ? CODEX_PDF_INSTRUCTIONS : undefined;
+
   return {
+    ...(systemPrompt ? { systemPrompt } : {}),
     messages: [{ role: "user", content, timestamp: Date.now() }],
   };
 }
@@ -217,7 +227,7 @@ async function runPdfPrompt(params: {
           text: e.text,
           images: [],
         }));
-        const context = buildPdfExtractionContext(params.prompt, textOnlyExtractions);
+        const context = buildPdfExtractionContext(params.prompt, textOnlyExtractions, model);
         const message = await complete(model, context, {
           apiKey,
           maxTokens: resolvePdfToolMaxTokens(model.maxTokens),
@@ -226,7 +236,7 @@ async function runPdfPrompt(params: {
         return { text, provider, model: modelId, native: false };
       }
 
-      const context = buildPdfExtractionContext(params.prompt, extractions);
+      const context = buildPdfExtractionContext(params.prompt, extractions, model);
       const message = await complete(model, context, {
         apiKey,
         maxTokens: resolvePdfToolMaxTokens(model.maxTokens),