mirror of
https://github.com/moltbot/moltbot.git
synced 2026-05-06 23:55:12 +00:00
fix(reply): parse markdown image replies as media
* fix(reply): parse markdown image replies as media * fix(reply): preserve inline markdown image captions * fix(reply): harden markdown image parsing
This commit is contained in:
@@ -24,6 +24,7 @@ Docs: https://docs.openclaw.ai
|
||||
### Fixes
|
||||
|
||||
- Agents/replay: stop OpenAI/Codex transcript replay from synthesizing missing tool results while still preserving synthetic repair on Anthropic, Gemini, and Bedrock transport-owned sessions. (#61556) Thanks @VictorJeon and @vincentkoc.
|
||||
- Telegram/media replies: parse remote markdown image syntax into outbound media payloads on the final reply path, so Telegram group chats stop falling back to plain-text image URLs when the model or a tool emits `![alt](https://...)` instead of a `MEDIA:` token. (#66191) Thanks @apezam and @vincentkoc.
|
||||
- Agents/WebChat: surface non-retryable provider failures such as billing, auth, and rate-limit errors from the embedded runner instead of logging `surface_error` and leaving webchat with no rendered error. Fixes #70124. (#70848) Thanks @truffle-dev.
|
||||
- Memory/CLI: declare the built-in `local` embedding provider in the memory-core manifest, so standalone `openclaw memory status`, `index`, and `search` can resolve local embeddings just like the gateway runtime. Fixes #70836. (#70873) Thanks @mattznojassist.
|
||||
- Gateway/WebChat: preserve image attachments for text-only primary models by offloading them as media refs instead of dropping them, so configured image tools can still inspect the original file. Fixes #68513, #44276, #51656, #70212.
|
||||
|
||||
@@ -246,6 +246,49 @@ describe("buildReplyPayloads media filter integration", () => {
|
||||
expect(replyPayloads).toHaveLength(0);
|
||||
});
|
||||
|
||||
it("extracts markdown image replies into final payload media urls", async () => {
|
||||
const { replyPayloads } = await buildReplyPayloads({
|
||||
...baseParams,
|
||||
payloads: [{ text: "Here you go\n\n" }],
|
||||
});
|
||||
|
||||
expect(replyPayloads).toHaveLength(1);
|
||||
expect(replyPayloads[0]).toMatchObject({
|
||||
text: "Here you go",
|
||||
mediaUrl: "https://example.com/chart.png",
|
||||
mediaUrls: ["https://example.com/chart.png"],
|
||||
});
|
||||
});
|
||||
|
||||
it("preserves inline caption text when lifting markdown image replies into media", async () => {
|
||||
const { replyPayloads } = await buildReplyPayloads({
|
||||
...baseParams,
|
||||
payloads: [{ text: 'Look  now' }],
|
||||
});
|
||||
|
||||
expect(replyPayloads).toHaveLength(1);
|
||||
expect(replyPayloads[0]).toMatchObject({
|
||||
text: "Look now",
|
||||
mediaUrl: "https://example.com/chart.png",
|
||||
mediaUrls: ["https://example.com/chart.png"],
|
||||
});
|
||||
});
|
||||
|
||||
it("keeps markdown local file images as plain text in final replies", async () => {
|
||||
const text = "Look  now";
|
||||
const { replyPayloads } = await buildReplyPayloads({
|
||||
...baseParams,
|
||||
payloads: [{ text }],
|
||||
});
|
||||
|
||||
expect(replyPayloads).toHaveLength(1);
|
||||
expect(replyPayloads[0]).toMatchObject({
|
||||
text,
|
||||
});
|
||||
expect(replyPayloads[0]?.mediaUrl).toBeUndefined();
|
||||
expect(replyPayloads[0]?.mediaUrls).toBeUndefined();
|
||||
});
|
||||
|
||||
it("deduplicates final payloads against directly sent block keys regardless of replyToId", async () => {
|
||||
// When block streaming is not active but directlySentBlockKeys has entries
|
||||
// (e.g. from pre-tool flush), the key should match even if replyToId differs.
|
||||
|
||||
@@ -103,4 +103,74 @@ describe("splitMediaFromOutput", () => {
|
||||
{ type: "text", text: "```text\nMEDIA:https://example.com/ignored.png\n```\nAfter" },
|
||||
]);
|
||||
});
|
||||
|
||||
it("extracts markdown image urls while keeping surrounding caption text", () => {
|
||||
expectParsedMediaOutputCase("Caption\n\n", {
|
||||
text: "Caption",
|
||||
mediaUrls: ["https://example.com/chart.png"],
|
||||
});
|
||||
});
|
||||
|
||||
it("keeps inline caption text around markdown images", () => {
|
||||
expectParsedMediaOutputCase("Look  now", {
|
||||
text: "Look now",
|
||||
mediaUrls: ["https://example.com/chart.png"],
|
||||
});
|
||||
});
|
||||
|
||||
it("extracts multiple markdown image urls in order", () => {
|
||||
expectParsedMediaOutputCase(
|
||||
"Before\n\nMiddle\n\nAfter",
|
||||
{
|
||||
text: "Before\nMiddle\nAfter",
|
||||
mediaUrls: ["https://example.com/one.png", "https://example.com/two.png"],
|
||||
},
|
||||
);
|
||||
});
|
||||
|
||||
it("strips markdown image title suffixes from extracted urls", () => {
|
||||
expectParsedMediaOutputCase(
|
||||
'Caption ',
|
||||
{
|
||||
text: "Caption",
|
||||
mediaUrls: ["https://example.com/chart.png"],
|
||||
},
|
||||
);
|
||||
});
|
||||
|
||||
it("keeps balanced parentheses inside markdown image urls", () => {
|
||||
expectParsedMediaOutputCase("Chart .png) now", {
|
||||
text: "Chart now",
|
||||
mediaUrls: ["https://example.com/a_(1).png"],
|
||||
});
|
||||
});
|
||||
|
||||
it.each([
|
||||
"",
|
||||
"",
|
||||
"",
|
||||
] as const)("does not lift local markdown image target: %s", (input) => {
|
||||
expectParsedMediaOutputCase(input, {
|
||||
text: input,
|
||||
mediaUrls: undefined,
|
||||
});
|
||||
});
|
||||
|
||||
it("does not lift markdown image urls that fail media validation", () => {
|
||||
const longUrl = `}.png)`;
|
||||
|
||||
expectParsedMediaOutputCase(longUrl, {
|
||||
text: longUrl,
|
||||
mediaUrls: undefined,
|
||||
});
|
||||
});
|
||||
|
||||
it("leaves very long markdown-image candidate lines as text", () => {
|
||||
const input = `${"prefix ".repeat(3000)}`;
|
||||
|
||||
expectParsedMediaOutputCase(input, {
|
||||
text: input,
|
||||
mediaUrls: undefined,
|
||||
});
|
||||
});
|
||||
});
|
||||
|
||||
@@ -125,6 +125,265 @@ function mayContainFenceMarkers(input: string): boolean {
|
||||
return input.includes("```") || input.includes("~~~");
|
||||
}
|
||||
|
||||
/**
 * Normalizes a line of visible text: every run of two or more spaces/tabs is
 * collapsed into a single space, then surrounding whitespace is stripped.
 * A lone tab or a lone space between words is left untouched.
 */
function cleanLineText(text: string): string {
  const collapsed = text.replace(/[ \t]{2,}/g, " ");
  return collapsed.trim();
}
|
||||
|
||||
/** One `![alt](destination)` occurrence located inside a single line. */
type MarkdownImageMatch = {
  /** Index of the leading `!` of the image syntax within the line. */
  start: number;
  /** Index just past the closing `)` of the image syntax (exclusive). */
  end: number;
  /** Raw link destination as written, with any title suffix excluded. */
  destination: string;
};
|
||||
|
||||
// Hard limits that bound the work done per line by the markdown-image scan.

/** Lines longer than this many characters are never scanned for images. */
const MAX_MARKDOWN_IMAGE_LINE_LENGTH = 20_000;

/** Maximum number of `![` openers examined on one line, matched or not. */
const MAX_MARKDOWN_IMAGE_ATTEMPTS_PER_LINE = 80;

/** Maximum number of successfully parsed images collected from one line. */
const MAX_MARKDOWN_IMAGE_MATCHES_PER_LINE = 50;
|
||||
|
||||
/**
 * Finds the bracket that closes an already-opened group.
 *
 * Scanning begins at `start`, which is expected to be the first character
 * after the opening bracket (the initial nesting depth is 1). A backslash
 * escapes the character that follows it, so escaped brackets are ignored.
 *
 * @param input Text to scan.
 * @param start Index of the first character inside the group.
 * @param open  Character that opens a nested group.
 * @param close Character that closes a group.
 * @returns Index of the matching `close` character, or `undefined` when the
 *          group is never closed.
 */
function findMatchingBracket(
  input: string,
  start: number,
  open: string,
  close: string,
): number | undefined {
  let nesting = 1;
  let pos = start;
  while (pos < input.length) {
    const current = input[pos];
    if (current === "\\") {
      // Skip the backslash and the character it escapes.
      pos += 2;
      continue;
    }
    if (current === open) {
      nesting += 1;
    } else if (current === close) {
      nesting -= 1;
      if (nesting === 0) {
        return pos;
      }
    }
    pos += 1;
  }
  return undefined;
}
|
||||
|
||||
/**
 * Decides whether a markdown image target may be lifted into outbound media.
 * Only `http://` / `https://` targets (case-insensitive scheme) are eligible,
 * and they must additionally pass `isValidMedia`.
 */
function isRemoteMarkdownImageMedia(candidate: string): boolean {
  if (!/^https?:\/\//i.test(candidate)) {
    return false;
  }
  return isValidMedia(candidate);
}
|
||||
|
||||
/**
 * Parses an optional markdown link title followed by the closing `)` of the
 * link, starting at `start` (leading whitespace is skipped).
 *
 * Accepted title forms are `"..."`, `'...'` and `(...)`; a backslash escapes
 * the following character inside the title.
 *
 * @returns Index just past the link's closing `)`, or `undefined` when no
 *          well-formed title followed by `)` is present.
 */
function parseMarkdownTitle(input: string, start: number): number | undefined {
  const skipBlanks = (from: number): number => {
    let pos = from;
    while (pos < input.length && /\s/.test(input[pos] ?? "")) {
      pos += 1;
    }
    return pos;
  };

  const openerIndex = skipBlanks(start);
  const opener = input[openerIndex];

  let closingIndex: number | undefined;
  if (opener === "(") {
    // Parenthesized titles may nest; reuse the bracket matcher.
    closingIndex = findMatchingBracket(input, openerIndex + 1, "(", ")");
  } else if (opener === '"' || opener === "'") {
    // Quoted titles end at the first unescaped matching quote.
    let pos = openerIndex + 1;
    while (pos < input.length) {
      const current = input[pos];
      if (current === "\\") {
        pos += 2;
        continue;
      }
      if (current === opener) {
        closingIndex = pos;
        break;
      }
      pos += 1;
    }
  } else {
    // End of input or a character that cannot start a title.
    return undefined;
  }

  if (closingIndex == null) {
    return undefined;
  }

  const afterTitle = skipBlanks(closingIndex + 1);
  if (input[afterTitle] !== ")") {
    return undefined;
  }
  return afterTitle + 1;
}
|
||||
|
||||
/**
 * Parses the `(destination "optional title")` part of a markdown image,
 * starting just after the opening `(`.
 *
 * Two destination forms are supported:
 * - `<...>`: everything up to the first unescaped `>`;
 * - bare: runs until whitespace or an unbalanced `)`; balanced parentheses
 *   inside the destination are kept (e.g. `a_(1).png`).
 *
 * After the destination, optional whitespace and an optional title
 * (see `parseMarkdownTitle`) may precede the closing `)`. Whitespace directly
 * before the closing `)` is accepted for both destination forms.
 *
 * @param input Line being scanned.
 * @param start Index of the first character after the opening `(`.
 * @returns The trimmed destination and the index just past the closing `)`,
 *          or `undefined` when the text is not a well-formed destination.
 */
function parseMarkdownImageDestination(
  input: string,
  start: number,
): { destination: string; end: number } | undefined {
  const skipWhitespace = (from: number): number => {
    let pos = from;
    while (pos < input.length && /\s/.test(input.charAt(pos))) {
      pos += 1;
    }
    return pos;
  };

  // Shared tail handling: optional whitespace, then either `)` or a title + `)`.
  const finish = (
    destination: string,
    from: number,
  ): { destination: string; end: number } | undefined => {
    if (!destination) {
      return undefined;
    }
    const tailIndex = skipWhitespace(from);
    if (input[tailIndex] === ")") {
      return { destination, end: tailIndex + 1 };
    }
    const titledEnd = parseMarkdownTitle(input, tailIndex);
    return titledEnd ? { destination, end: titledEnd } : undefined;
  };

  let index = skipWhitespace(start);
  if (index >= input.length) {
    return undefined;
  }

  if (input[index] === "<") {
    let closing = index + 1;
    while (closing < input.length) {
      const ch = input[closing];
      if (ch === "\\") {
        // Skip the backslash and the character it escapes.
        closing += 2;
        continue;
      }
      if (ch === ">") {
        return finish(input.slice(index + 1, closing).trim(), closing + 1);
      }
      closing += 1;
    }
    return undefined;
  }

  const destinationStart = index;
  let parenDepth = 0;
  while (index < input.length) {
    const ch = input.charAt(index);
    if (ch === "\\") {
      index += 2;
      continue;
    }
    if (ch === "(") {
      parenDepth += 1;
      index += 1;
      continue;
    }
    if (ch === ")") {
      if (parenDepth === 0) {
        // Unbalanced `)` closes the link itself.
        return finish(input.slice(destinationStart, index).trim(), index);
      }
      parenDepth -= 1;
      index += 1;
      continue;
    }
    if (parenDepth === 0 && /\s/.test(ch)) {
      // Whitespace ends a bare destination; what follows is either the
      // closing `)` or a title.
      return finish(input.slice(destinationStart, index).trim(), index);
    }
    index += 1;
  }
  return undefined;
}
|
||||
|
||||
/**
 * Locates every well-formed `![alt](destination)` occurrence on one line, in
 * order of appearance.
 *
 * The scan is bounded: lines longer than `MAX_MARKDOWN_IMAGE_LINE_LENGTH`
 * yield no matches, at most `MAX_MARKDOWN_IMAGE_ATTEMPTS_PER_LINE` `![`
 * openers are examined, and at most `MAX_MARKDOWN_IMAGE_MATCHES_PER_LINE`
 * matches are returned.
 */
function findMarkdownImageMatches(line: string): MarkdownImageMatch[] {
  const found: MarkdownImageMatch[] = [];
  if (line.length > MAX_MARKDOWN_IMAGE_LINE_LENGTH) {
    return found;
  }

  let resumeAt = 0;
  for (
    let attempt = 0;
    attempt < MAX_MARKDOWN_IMAGE_ATTEMPTS_PER_LINE &&
    found.length < MAX_MARKDOWN_IMAGE_MATCHES_PER_LINE;
    attempt += 1
  ) {
    const opener = line.indexOf("![", resumeAt);
    if (opener < 0) {
      break;
    }
    // Unless a full image parses, resume right after this `![`.
    resumeAt = opener + 2;

    const altClose = findMatchingBracket(line, opener + 2, "[", "]");
    if (altClose === undefined || line[altClose + 1] !== "(") {
      continue;
    }

    const parsed = parseMarkdownImageDestination(line, altClose + 2);
    if (!parsed) {
      continue;
    }

    found.push({ start: opener, end: parsed.end, destination: parsed.destination });
    resumeAt = parsed.end;
  }
  return found;
}
|
||||
|
||||
/**
 * Splits one line into text and media segments by lifting remote markdown
 * images out of it.
 *
 * Images whose target passes `isRemoteMarkdownImageMedia` are removed from the
 * visible text, appended to `params.media`, and emitted as `media` segments.
 * All other image syntax is left in the text verbatim.
 *
 * @returns `cleanedLine`: the line without lifted images (omitted when
 *          nothing visible remains); `lineSegments`: ordered text/media
 *          segments; `foundMedia`: whether at least one image was lifted.
 */
function collectMarkdownImageSegments(params: { line: string; media: string[] }): {
  cleanedLine?: string;
  lineSegments: ParsedMediaOutputSegment[];
  foundMedia: boolean;
} {
  const { line, media } = params;
  const imageMatches = findMarkdownImageMatches(line);
  if (imageMatches.length === 0) {
    return { lineSegments: [], foundMedia: false };
  }

  const lineSegments: ParsedMediaOutputSegment[] = [];
  // Text accumulated since the last lifted image (becomes one text segment).
  let pendingText = "";
  // The whole line with lifted images removed.
  let visibleText = "";
  let foundMedia = false;
  let cursor = 0;

  const keepText = (chunk: string): void => {
    pendingText += chunk;
    visibleText += chunk;
  };
  const flushPendingText = (): void => {
    const text = cleanLineText(pendingText);
    if (text) {
      lineSegments.push({ type: "text", text });
    }
    pendingText = "";
  };

  for (const { start, end, destination } of imageMatches) {
    keepText(line.slice(cursor, start));

    const target = normalizeMediaSource(
      cleanCandidate(unwrapQuoted(destination) ?? destination),
    );
    if (isRemoteMarkdownImageMedia(target)) {
      flushPendingText();
      media.push(target);
      lineSegments.push({ type: "media", url: target });
      foundMedia = true;
    } else {
      // Not liftable: keep the original markdown as plain text.
      keepText(line.slice(start, end));
    }

    cursor = end;
  }

  keepText(line.slice(cursor));
  flushPendingText();

  const cleanedLine = cleanLineText(visibleText);
  return {
    cleanedLine: cleanedLine || undefined,
    lineSegments,
    foundMedia,
  };
}
|
||||
|
||||
// Check if a character offset is inside any fenced code block
|
||||
function isInsideFence(fenceSpans: Array<{ start: number; end: number }>, offset: number): boolean {
|
||||
return fenceSpans.some((span) => offset >= span.start && offset < span.end);
|
||||
@@ -144,8 +403,9 @@ export function splitMediaFromOutput(raw: string): {
|
||||
return { text: "" };
|
||||
}
|
||||
const mayContainMediaToken = /media:/i.test(trimmedRaw);
|
||||
const mayContainMarkdownImage = /!\[[^\]]*]\(/.test(trimmedRaw);
|
||||
const mayContainAudioTag = trimmedRaw.includes("[[");
|
||||
if (!mayContainMediaToken && !mayContainAudioTag) {
|
||||
if (!mayContainMediaToken && !mayContainMarkdownImage && !mayContainAudioTag) {
|
||||
return { text: trimmedRaw };
|
||||
}
|
||||
|
||||
@@ -185,8 +445,23 @@ export function splitMediaFromOutput(raw: string): {
|
||||
|
||||
const trimmedStart = line.trimStart();
|
||||
if (!trimmedStart.toUpperCase().startsWith("MEDIA:")) {
|
||||
keptLines.push(line);
|
||||
pushTextSegment(line);
|
||||
const markdownImageResult = collectMarkdownImageSegments({ line, media });
|
||||
if (!markdownImageResult.foundMedia) {
|
||||
keptLines.push(line);
|
||||
pushTextSegment(line);
|
||||
} else {
|
||||
foundMediaToken = true;
|
||||
if (markdownImageResult.cleanedLine) {
|
||||
keptLines.push(markdownImageResult.cleanedLine);
|
||||
}
|
||||
for (const segment of markdownImageResult.lineSegments) {
|
||||
if (segment.type === "text") {
|
||||
pushTextSegment(segment.text);
|
||||
continue;
|
||||
}
|
||||
segments.push(segment);
|
||||
}
|
||||
}
|
||||
lineOffset += line.length + 1; // +1 for newline
|
||||
continue;
|
||||
}
|
||||
@@ -269,10 +544,7 @@ export function splitMediaFromOutput(raw: string): {
|
||||
}
|
||||
|
||||
if (hasValidMedia) {
|
||||
const beforeText = pieces
|
||||
.join("")
|
||||
.replace(/[ \t]{2,}/g, " ")
|
||||
.trim();
|
||||
const beforeText = cleanLineText(pieces.join(""));
|
||||
if (beforeText) {
|
||||
lineSegments.push({ type: "text", text: beforeText });
|
||||
}
|
||||
@@ -297,10 +569,7 @@ export function splitMediaFromOutput(raw: string): {
|
||||
|
||||
pieces.push(line.slice(cursor));
|
||||
|
||||
const cleanedLine = pieces
|
||||
.join("")
|
||||
.replace(/[ \t]{2,}/g, " ")
|
||||
.trim();
|
||||
const cleanedLine = cleanLineText(pieces.join(""));
|
||||
|
||||
// If the line becomes empty, drop it.
|
||||
if (cleanedLine) {
|
||||
|
||||
Reference in New Issue
Block a user