diff --git a/CHANGELOG.md b/CHANGELOG.md index a950f1c7672..89d5367de0c 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -392,6 +392,7 @@ Docs: https://docs.openclaw.ai ### Fixes +- Security: strip hidden text from `web_fetch` extracted content to prevent indirect prompt injection, covering CSS-hidden elements, class-based hiding (sr-only, d-none, etc.), invisible Unicode, color:transparent, offscreen transforms, and non-content tags. (#8027, #21074) Thanks @hydro13 for the fix and @LucasAIBuilder for reporting. - Agents/Streaming: keep assistant partial streaming active during reasoning streams, handle native `thinking_*` stream events consistently, dedupe mixed reasoning-end signals, and clear stale mutating tool errors after same-target retry success. (#20635) Thanks @obviyus. - iOS/Chat: use a dedicated iOS chat session key for ChatSheet routing to avoid cross-client session collisions with main-session traffic. (#21139) thanks @mbelinky. - iOS/Chat: auto-resync chat history after reconnect sequence gaps, clear stale pending runs, and avoid dead-end manual refresh errors after transient disconnects. (#21135) thanks @mbelinky. 
diff --git a/src/agents/tools/web-fetch-utils.ts b/src/agents/tools/web-fetch-utils.ts index a9ef9d5ba45..4dc57abf80d 100644 --- a/src/agents/tools/web-fetch-utils.ts +++ b/src/agents/tools/web-fetch-utils.ts @@ -1,3 +1,5 @@ +import { sanitizeHtml, stripInvisibleUnicode } from "./web-fetch-visibility.js"; + export type ExtractMode = "markdown" | "text"; const READABILITY_MAX_HTML_CHARS = 1_000_000; @@ -209,23 +211,26 @@ export async function extractReadableContent(params: { url: string; extractMode: ExtractMode; }): Promise<{ text: string; title?: string } | null> { + const cleanHtml = await sanitizeHtml(params.html); const fallback = (): { text: string; title?: string } => { - const rendered = htmlToMarkdown(params.html); + const rendered = htmlToMarkdown(cleanHtml); if (params.extractMode === "text") { - const text = markdownToText(rendered.text) || normalizeWhitespace(stripTags(params.html)); + const text = + stripInvisibleUnicode(markdownToText(rendered.text)) || + stripInvisibleUnicode(normalizeWhitespace(stripTags(cleanHtml))); return { text, title: rendered.title }; } - return rendered; + return { text: stripInvisibleUnicode(rendered.text), title: rendered.title }; }; if ( - params.html.length > READABILITY_MAX_HTML_CHARS || - exceedsEstimatedHtmlNestingDepth(params.html, READABILITY_MAX_ESTIMATED_NESTING_DEPTH) + cleanHtml.length > READABILITY_MAX_HTML_CHARS || + exceedsEstimatedHtmlNestingDepth(cleanHtml, READABILITY_MAX_ESTIMATED_NESTING_DEPTH) ) { return fallback(); } try { const { Readability, parseHTML } = await loadReadabilityDeps(); - const { document } = parseHTML(params.html); + const { document } = parseHTML(cleanHtml); try { (document as { baseURI?: string }).baseURI = params.url; } catch { @@ -238,11 +243,11 @@ export async function extractReadableContent(params: { } const title = parsed.title || undefined; if (params.extractMode === "text") { - const text = normalizeWhitespace(parsed.textContent ?? 
""); + const text = stripInvisibleUnicode(normalizeWhitespace(parsed.textContent ?? "")); return text ? { text, title } : fallback(); } const rendered = htmlToMarkdown(parsed.content); - return { text: rendered.text, title: title ?? rendered.title }; + return { text: stripInvisibleUnicode(rendered.text), title: title ?? rendered.title }; } catch { return fallback(); } diff --git a/src/agents/tools/web-fetch-visibility.test.ts b/src/agents/tools/web-fetch-visibility.test.ts new file mode 100644 index 00000000000..a1bf7f18f8f --- /dev/null +++ b/src/agents/tools/web-fetch-visibility.test.ts @@ -0,0 +1,246 @@ +import { describe, expect, it } from "vitest"; +import { sanitizeHtml, stripInvisibleUnicode } from "./web-fetch-visibility.js"; + +describe("sanitizeHtml", () => { + it("strips display:none elements", async () => { + const html = '

Visible

Hidden

'; + const result = await sanitizeHtml(html); + expect(result).toContain("Visible"); + expect(result).not.toContain("Hidden"); + }); + + it("strips visibility:hidden elements", async () => { + const html = '

Visible

Secret'; + const result = await sanitizeHtml(html); + expect(result).not.toContain("Secret"); + }); + + it("strips opacity:0 elements", async () => { + const html = '

Show

Invisible
'; + const result = await sanitizeHtml(html); + expect(result).not.toContain("Invisible"); + }); + + it("strips font-size:0 elements", async () => { + const html = '

Normal

Tiny'; + const result = await sanitizeHtml(html); + expect(result).not.toContain("Tiny"); + }); + + it("strips text-indent far-offscreen elements", async () => { + const html = '

Normal

Offscreen

'; + const result = await sanitizeHtml(html); + expect(result).not.toContain("Offscreen"); + }); + + it("strips color:transparent elements", async () => { + const html = '

Visible

Ghost

'; + const result = await sanitizeHtml(html); + expect(result).not.toContain("Ghost"); + }); + + it("strips color:rgba with zero alpha elements", async () => { + const html = '

Visible

Invisible

'; + const result = await sanitizeHtml(html); + expect(result).not.toContain("Invisible"); + }); + + it("strips color:rgba with zero decimal alpha elements", async () => { + const html = '

Visible

Invisible

'; + const result = await sanitizeHtml(html); + expect(result).not.toContain("Invisible"); + }); + + it("strips color:hsla with zero alpha elements", async () => { + const html = '

Visible

Invisible

'; + const result = await sanitizeHtml(html); + expect(result).not.toContain("Invisible"); + }); + + it("strips transform:scale(0) elements", async () => { + const html = '

Show

Scaled
'; + const result = await sanitizeHtml(html); + expect(result).not.toContain("Scaled"); + }); + + it("strips transform:translateX far-offscreen elements", async () => { + const html = '

Show

Translated
'; + const result = await sanitizeHtml(html); + expect(result).not.toContain("Translated"); + }); + + it("strips width:0 height:0 overflow:hidden elements", async () => { + const html = '

Show

Zero
'; + const result = await sanitizeHtml(html); + expect(result).not.toContain("Zero"); + }); + + it("strips left far-offscreen positioned elements", async () => { + const html = '

Show

Offscreen
'; + const result = await sanitizeHtml(html); + expect(result).not.toContain("Offscreen"); + }); + + it("strips clip-path:inset(100%) elements", async () => { + const html = '

Show

Clipped
'; + const result = await sanitizeHtml(html); + expect(result).not.toContain("Clipped"); + }); + + it("strips clip-path:inset(50%) elements", async () => { + const html = '

Show

Clipped
'; + const result = await sanitizeHtml(html); + expect(result).not.toContain("Clipped"); + }); + + it("does not strip clip-path:inset(0%) elements", async () => { + const html = '

Show

Visible
'; + const result = await sanitizeHtml(html); + expect(result).toContain("Visible"); + }); + + it("strips sr-only class elements", async () => { + const html = '

Main

Screen reader only'; + const result = await sanitizeHtml(html); + expect(result).not.toContain("Screen reader only"); + }); + + it("strips visually-hidden class elements", async () => { + const html = '

Main

Hidden visually'; + const result = await sanitizeHtml(html); + expect(result).not.toContain("Hidden visually"); + }); + + it("strips d-none class elements", async () => { + const html = '

Main

Bootstrap hidden
'; + const result = await sanitizeHtml(html); + expect(result).not.toContain("Bootstrap hidden"); + }); + + it("strips hidden class elements", async () => { + const html = '

Main

'; + const result = await sanitizeHtml(html); + expect(result).not.toContain("Class hidden"); + }); + + it("does not strip elements with hidden as substring of class name", async () => { + const html = '

Main

Should be visible
'; + const result = await sanitizeHtml(html); + expect(result).toContain("Should be visible"); + }); + + it("strips aria-hidden=true elements", async () => { + const html = '

Visible

'; + const result = await sanitizeHtml(html); + expect(result).not.toContain("Aria hidden"); + }); + + it("strips elements with hidden attribute", async () => { + const html = "

Visible

"; + const result = await sanitizeHtml(html); + expect(result).not.toContain("HTML hidden"); + }); + + it("strips input type=hidden", async () => { + const html = '
'; + const result = await sanitizeHtml(html); + expect(result).not.toContain("csrf-token-secret"); + }); + + it("strips HTML comments", async () => { + const html = "

Visible

"; + const result = await sanitizeHtml(html); + expect(result).not.toContain("inject"); + expect(result).not.toContain("ignore previous instructions"); + }); + + it("strips meta tags", async () => { + const html = '

Body

'; + const result = await sanitizeHtml(html); + expect(result).not.toContain("prompt payload"); + }); + + it("strips template tags", async () => { + const html = "

Visible

"; + const result = await sanitizeHtml(html); + expect(result).not.toContain("Hidden template content"); + }); + + it("strips iframe tags", async () => { + const html = "

Visible

"; + const result = await sanitizeHtml(html); + expect(result).not.toContain("Iframe content"); + }); + + it("preserves visible content", async () => { + const html = "

Hello world

Title

Link"; + const result = await sanitizeHtml(html); + expect(result).toContain("Hello world"); + expect(result).toContain("Title"); + }); + + it("handles nested hidden elements without removing visible siblings", async () => { + const html = + '

Visible

Hidden

Also visible

'; + const result = await sanitizeHtml(html); + expect(result).toContain("Visible"); + expect(result).toContain("Also visible"); + expect(result).not.toContain("Hidden"); + }); + + it("handles malformed HTML gracefully", async () => { + const html = "

Unclosed

Nested"; + await expect(sanitizeHtml(html)).resolves.toBeDefined(); + }); +}); + +describe("stripInvisibleUnicode", () => { + it("strips zero-width space", () => { + const text = "Hello\u200BWorld"; + expect(stripInvisibleUnicode(text)).toBe("HelloWorld"); + }); + + it("strips zero-width non-joiner", () => { + const text = "Hello\u200CWorld"; + expect(stripInvisibleUnicode(text)).toBe("HelloWorld"); + }); + + it("strips zero-width joiner", () => { + const text = "Hello\u200DWorld"; + expect(stripInvisibleUnicode(text)).toBe("HelloWorld"); + }); + + it("strips left-to-right mark", () => { + const text = "Hello\u200EWorld"; + expect(stripInvisibleUnicode(text)).toBe("HelloWorld"); + }); + + it("strips right-to-left mark", () => { + const text = "Hello\u200FWorld"; + expect(stripInvisibleUnicode(text)).toBe("HelloWorld"); + }); + + it("strips directional overrides (LRO, RLO, PDF, etc.)", () => { + const text = "\u202AHello\u202E"; + expect(stripInvisibleUnicode(text)).toBe("Hello"); + }); + + it("strips word joiner and other formatting chars", () => { + const text = "Hello\u2060World\uFEFF"; + expect(stripInvisibleUnicode(text)).toBe("HelloWorld"); + }); + + it("preserves normal text unchanged", () => { + const text = "Hello, World! 
/**
 * Visibility filtering for fetched HTML.
 *
 * Removes elements that carry markers of being hidden (inline styles, well-known
 * utility classes, `hidden` / `aria-hidden` attributes, non-content tags) and
 * strips invisible Unicode code points from extracted text.
 *
 * Detection is deliberately conservative: when an inline style declares a
 * property more than once, the element is treated as hidden if ANY of the
 * declarations matches a hiding pattern.
 */

/** A CSS number (optionally signed, optionally a percentage) whose value is zero. */
const ZERO_NUMBER_RE = /^[+-]?(?:0+(?:\.0*)?|\.0+)%?$/;

/**
 * `[property, pattern]` pairs; a declaration of `property` whose trimmed value
 * (with any trailing `!important` removed) matches `pattern` hides the element.
 */
const HIDDEN_STYLE_PATTERNS: ReadonlyArray<readonly [string, RegExp]> = [
  ["display", /^none$/i],
  ["visibility", /^hidden$/i],
  ["opacity", ZERO_NUMBER_RE],
  ["font-size", /^(?:0+(?:\.0*)?|\.0+)(?:px|em|rem|pt|%)?$/i],
  ["text-indent", /^-\d{4,}px$/i],
  ["color", /^transparent$/i],
  ["color", /^rgba\s*\(\s*\d+\s*,\s*\d+\s*,\s*\d+\s*,\s*0(?:\.0+)?\s*\)$/i],
  ["color", /^hsla\s*\(\s*[\d.]+\s*,\s*[\d.]+%?\s*,\s*[\d.]+%?\s*,\s*0(?:\.0+)?\s*\)$/i],
];

/** Class names associated with visually hidden content. */
const HIDDEN_CLASS_NAMES = new Set([
  "sr-only",
  "visually-hidden",
  "d-none",
  "hidden",
  "invisible",
  "screen-reader-only",
  "offscreen",
]);

/** Tags that are always removed, regardless of attributes. */
const ALWAYS_REMOVED_TAGS = new Set([
  "meta",
  "template",
  "svg",
  "canvas",
  "iframe",
  "object",
  "embed",
]);

/** Trailing `!important` flag on a declaration value. */
const IMPORTANT_SUFFIX_RE = /\s*!\s*important\s*$/i;
/** Leading percentage argument of a `clip-path: inset(...)` value. */
const CLIP_INSET_PERCENT_RE = /inset\s*\(\s*(\d*\.?\d+)%/i;
/** `scale(...)`, `scaleX(...)` or `scaleY(...)`; group 1 is the argument list. */
const SCALE_FUNCTION_RE = /\bscale[xy]?\s*\(([^)]*)\)/gi;
/** `translateX` / `translateY` by a negative offset of at least four digits of pixels. */
const FAR_TRANSLATE_RE = /translate[xy]\s*\(\s*-\d{4,}px\s*\)/i;
/** A negative pixel offset of at least four digits (e.g. `-9999px`). */
const FAR_NEGATIVE_PX_RE = /^-\d{4,}px$/i;
/** Zero length, with or without a `px` unit. */
const ZERO_LENGTH_RE = /^0(?:px)?$/i;
const OVERFLOW_HIDDEN_RE = /^hidden$/i;

/**
 * Returns true when any whitespace-separated token of `className` is a known
 * hidden-content class. Matching is whole-token and case-insensitive, so a class
 * that merely contains "hidden" as a substring does not match.
 */
function hasHiddenClass(className: string): boolean {
  const classes = className.toLowerCase().split(/\s+/);
  return classes.some((cls) => HIDDEN_CLASS_NAMES.has(cls));
}

/**
 * Splits an inline `style` attribute into a map of lower-cased property name to
 * every value declared for it, in source order. Values are trimmed and have a
 * trailing `!important` removed; declarations without a name or value are skipped.
 */
function parseStyleDeclarations(style: string): Map<string, string[]> {
  const declarations = new Map<string, string[]>();
  for (const part of style.split(";")) {
    const colon = part.indexOf(":");
    if (colon === -1) {
      continue;
    }
    const prop = part.slice(0, colon).trim().toLowerCase();
    const value = part
      .slice(colon + 1)
      .replace(IMPORTANT_SUFFIX_RE, "")
      .trim();
    if (!prop || !value) {
      continue;
    }
    const existing = declarations.get(prop);
    if (existing) {
      existing.push(value);
    } else {
      declarations.set(prop, [value]);
    }
  }
  return declarations;
}

/**
 * Returns true when the inline style text contains a declaration that hides the
 * element: one of HIDDEN_STYLE_PATTERNS, a positive-percentage `clip-path: inset()`,
 * a zero scale or far-negative translate transform, zero width + zero height +
 * `overflow: hidden`, or a far-negative `left` / `top`.
 */
function isStyleHidden(style: string): boolean {
  const declarations = parseStyleDeclarations(style);
  const valuesOf = (prop: string): string[] => declarations.get(prop) ?? [];
  const anyMatch = (prop: string, pattern: RegExp): boolean =>
    valuesOf(prop).some((value) => pattern.test(value));

  if (HIDDEN_STYLE_PATTERNS.some(([prop, pattern]) => anyMatch(prop, pattern))) {
    return true;
  }

  // clip-path: a positive percentage inset() clips content; inset(0%) and `none` do not.
  for (const value of valuesOf("clip-path")) {
    const percent = CLIP_INSET_PERCENT_RE.exec(value)?.[1];
    if (percent !== undefined && Number.parseFloat(percent) > 0) {
      return true;
    }
  }

  for (const value of valuesOf("transform")) {
    // A zero factor in scale()/scaleX()/scaleY() collapses the element.
    for (const scale of value.matchAll(SCALE_FUNCTION_RE)) {
      const args = (scale[1] ?? "").split(",");
      if (args.some((arg) => ZERO_NUMBER_RE.test(arg.trim()))) {
        return true;
      }
    }
    if (FAR_TRANSLATE_RE.test(value)) {
      return true;
    }
  }

  // width:0 + height:0 + overflow:hidden
  if (
    anyMatch("width", ZERO_LENGTH_RE) &&
    anyMatch("height", ZERO_LENGTH_RE) &&
    anyMatch("overflow", OVERFLOW_HIDDEN_RE)
  ) {
    return true;
  }

  // Offscreen positioning: left/top far negative
  return anyMatch("left", FAR_NEGATIVE_PX_RE) || anyMatch("top", FAR_NEGATIVE_PX_RE);
}

/**
 * Decides whether `element` should be dropped from the sanitized document, based
 * on its tag name, `type`, `aria-hidden`, `hidden`, `class` and `style` attributes.
 */
function shouldRemoveElement(element: Element): boolean {
  const tagName = element.tagName.toLowerCase();

  // Always-remove tags
  if (ALWAYS_REMOVED_TAGS.has(tagName)) {
    return true;
  }

  // input type=hidden
  if (tagName === "input" && element.getAttribute("type")?.toLowerCase() === "hidden") {
    return true;
  }

  // aria-hidden=true, compared leniently so that casing/whitespace variants are also removed
  if (element.getAttribute("aria-hidden")?.trim().toLowerCase() === "true") {
    return true;
  }

  // hidden attribute
  if (element.hasAttribute("hidden")) {
    return true;
  }

  // class-based hiding
  const className = element.getAttribute("class") ?? "";
  if (hasHiddenClass(className)) {
    return true;
  }

  // inline style-based hiding
  const style = element.getAttribute("style") ?? "";
  return style !== "" && isStyleHidden(style);
}

/**
 * Removes HTML comments and hidden / non-content elements from `html`.
 *
 * @param html - Raw HTML markup.
 * @returns The serialized document after removal. If `linkedom` cannot be loaded
 *   or parsing throws, the markup is returned with only comments stripped.
 */
export async function sanitizeHtml(html: string): Promise<string> {
  // Strip HTML comments
  const sanitized = html.replace(/<!--[\s\S]*?-->/g, "");

  let document: Document;
  try {
    const { parseHTML } = await import("linkedom");
    ({ document } = parseHTML(sanitized) as { document: Document });
  } catch {
    // Best effort: fall back to the comment-stripped markup.
    return sanitized;
  }

  // Walk all elements and remove hidden ones (bottom-up to avoid re-walking removed subtrees)
  const all = Array.from(document.querySelectorAll("*"));
  for (let i = all.length - 1; i >= 0; i--) {
    const el = all[i];
    if (el && shouldRemoveElement(el)) {
      el.parentNode?.removeChild(el);
    }
  }

  return (document as unknown as { toString(): string }).toString();
}

// Zero-width and invisible Unicode characters used in prompt injection attacks
const INVISIBLE_UNICODE_RE =
  /[\u200B-\u200F\u202A-\u202E\u2060-\u2064\u206A-\u206F\uFEFF\u{E0000}-\u{E007F}]/gu;

/**
 * Removes every code point matched by INVISIBLE_UNICODE_RE from `text`.
 *
 * @param text - Text to clean.
 * @returns `text` with the matched code points deleted; all other characters are unchanged.
 */
export function stripInvisibleUnicode(text: string): string {
  return text.replace(INVISIBLE_UNICODE_RE, "");
}