Fix text attachment MIME misclassification (#3628)

* Fix text file attachment detection
* Add file attachment extraction tests
2026-03-08 06:54:24 +00:00 · 2026-01-28 18:33:03 -08:00
parent a109b7f1a9
commit cb18ce7a85
4 changed files with 364 additions and 13 deletions
--- a/src/media-understanding/apply.test.ts
+++ b/src/media-understanding/apply.test.ts
@@ -41,7 +41,7 @@ describe("applyMediaUnderstanding", () => {
    mockedResolveApiKey.mockClear();
    mockedFetchRemoteMedia.mockReset();
    mockedFetchRemoteMedia.mockResolvedValue({
-      buffer: Buffer.from("audio-bytes"),
+      buffer: Buffer.from([0, 255, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12]),
      contentType: "audio/ogg",
      fileName: "note.ogg",
    });
@@ -51,7 +51,7 @@ describe("applyMediaUnderstanding", () => {
    const { applyMediaUnderstanding } = await loadApply();
    const dir = await fs.mkdtemp(path.join(os.tmpdir(), "moltbot-media-"));
    const audioPath = path.join(dir, "note.ogg");
-    await fs.writeFile(audioPath, "hello");
+    await fs.writeFile(audioPath, Buffer.from([0, 255, 0, 1, 2, 3, 4, 5, 6, 7, 8]));

    const ctx: MsgContext = {
      Body: "<media:audio>",
@@ -94,7 +94,7 @@ describe("applyMediaUnderstanding", () => {
    const { applyMediaUnderstanding } = await loadApply();
    const dir = await fs.mkdtemp(path.join(os.tmpdir(), "moltbot-media-"));
    const audioPath = path.join(dir, "note.ogg");
-    await fs.writeFile(audioPath, "hello");
+    await fs.writeFile(audioPath, Buffer.from([0, 255, 0, 1, 2, 3, 4, 5, 6, 7, 8]));

    const ctx: MsgContext = {
      Body: "<media:audio> /capture status",
@@ -176,7 +176,7 @@ describe("applyMediaUnderstanding", () => {
    const { applyMediaUnderstanding } = await loadApply();
    const dir = await fs.mkdtemp(path.join(os.tmpdir(), "moltbot-media-"));
    const audioPath = path.join(dir, "large.wav");
-    await fs.writeFile(audioPath, "0123456789");
+    await fs.writeFile(audioPath, Buffer.from([0, 255, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10]));

    const ctx: MsgContext = {
      Body: "<media:audio>",
@@ -211,7 +211,7 @@ describe("applyMediaUnderstanding", () => {
    const { applyMediaUnderstanding } = await loadApply();
    const dir = await fs.mkdtemp(path.join(os.tmpdir(), "moltbot-media-"));
    const audioPath = path.join(dir, "note.ogg");
-    await fs.writeFile(audioPath, "hello");
+    await fs.writeFile(audioPath, Buffer.from([0, 255, 0, 1, 2, 3, 4, 5, 6, 7, 8]));

    const ctx: MsgContext = {
      Body: "<media:audio>",
@@ -352,7 +352,7 @@ describe("applyMediaUnderstanding", () => {
    const { applyMediaUnderstanding } = await loadApply();
    const dir = await fs.mkdtemp(path.join(os.tmpdir(), "moltbot-media-"));
    const audioPath = path.join(dir, "fallback.ogg");
-    await fs.writeFile(audioPath, "hello");
+    await fs.writeFile(audioPath, Buffer.from([0, 255, 0, 1, 2, 3, 4, 5, 6]));

    const ctx: MsgContext = {
      Body: "<media:audio>",
@@ -390,8 +390,8 @@ describe("applyMediaUnderstanding", () => {
    const dir = await fs.mkdtemp(path.join(os.tmpdir(), "moltbot-media-"));
    const audioPathA = path.join(dir, "note-a.ogg");
    const audioPathB = path.join(dir, "note-b.ogg");
-    await fs.writeFile(audioPathA, "hello");
-    await fs.writeFile(audioPathB, "world");
+    await fs.writeFile(audioPathA, Buffer.from([200, 201, 202, 203, 204, 205, 206, 207, 208]));
+    await fs.writeFile(audioPathB, Buffer.from([200, 201, 202, 203, 204, 205, 206, 207, 208]));

    const ctx: MsgContext = {
      Body: "<media:audio>",
@@ -435,7 +435,7 @@ describe("applyMediaUnderstanding", () => {
    const audioPath = path.join(dir, "note.ogg");
    const videoPath = path.join(dir, "clip.mp4");
    await fs.writeFile(imagePath, "image-bytes");
-    await fs.writeFile(audioPath, "audio-bytes");
+    await fs.writeFile(audioPath, Buffer.from([200, 201, 202, 203, 204, 205, 206, 207, 208]));
    await fs.writeFile(videoPath, "video-bytes");

    const ctx: MsgContext = {
@@ -487,4 +487,63 @@ describe("applyMediaUnderstanding", () => {
    expect(ctx.CommandBody).toBe("audio ok");
    expect(ctx.BodyForCommands).toBe("audio ok");
  });
+
+  it("treats text-like audio attachments as CSV (comma wins over tabs)", async () => {
+    // Arrange: a UTF-16LE (BOM-prefixed) delimited payload stored under an .mp3 name
+    // and announced as audio/mpeg; the first line contains both commas and a tab.
+    const { applyMediaUnderstanding } = await loadApply();
+    const tmpDir = await fs.mkdtemp(path.join(os.tmpdir(), "moltbot-media-"));
+    const attachmentPath = path.join(tmpDir, "data.mp3");
+    const bom = Buffer.from([0xff, 0xfe]);
+    const payload = Buffer.from('"a","b"\t"c"\n"1","2"\t"3"', "utf16le");
+    await fs.writeFile(attachmentPath, Buffer.concat([bom, payload]));
+
+    // All media capabilities are disabled so only file extraction can touch the body.
+    const disabled = { enabled: false };
+    const cfg: MoltbotConfig = {
+      tools: {
+        media: {
+          audio: disabled,
+          image: disabled,
+          video: disabled,
+        },
+      },
+    };
+    const ctx: MsgContext = {
+      Body: "<media:audio>",
+      MediaPath: attachmentPath,
+      MediaType: "audio/mpeg",
+    };
+
+    // Act
+    const outcome = await applyMediaUnderstanding({ ctx, cfg });
+
+    // Assert: the attachment is inlined as CSV and its text is preserved.
+    expect(outcome.appliedFile).toBe(true);
+    expect(ctx.Body).toContain('<file name="data.mp3" mime="text/csv">');
+    expect(ctx.Body).toContain('"a","b"\t"c"');
+  });
+
+  it("infers TSV when tabs are present without commas", async () => {
+    // Arrange: plain tab-separated text stored under an .mp3 name and announced as audio/mpeg.
+    const { applyMediaUnderstanding } = await loadApply();
+    const tmpDir = await fs.mkdtemp(path.join(os.tmpdir(), "moltbot-media-"));
+    const attachmentPath = path.join(tmpDir, "report.mp3");
+    await fs.writeFile(attachmentPath, "a\tb\tc\n1\t2\t3");
+
+    // All media capabilities are disabled so only file extraction can touch the body.
+    const disabled = { enabled: false };
+    const cfg: MoltbotConfig = {
+      tools: {
+        media: {
+          audio: disabled,
+          image: disabled,
+          video: disabled,
+        },
+      },
+    };
+    const ctx: MsgContext = {
+      Body: "<media:audio>",
+      MediaPath: attachmentPath,
+      MediaType: "audio/mpeg",
+    };
+
+    // Act
+    const outcome = await applyMediaUnderstanding({ ctx, cfg });
+
+    // Assert: the attachment is inlined as TSV and its text is preserved.
+    expect(outcome.appliedFile).toBe(true);
+    expect(ctx.Body).toContain('<file name="report.mp3" mime="text/tab-separated-values">');
+    expect(ctx.Body).toContain("a\tb\tc");
+  });
 });
--- a/src/media-understanding/apply.ts
+++ b/src/media-understanding/apply.ts
@@ -1,6 +1,22 @@
+import path from "node:path";
+
 import type { MoltbotConfig } from "../config/config.js";
 import type { MsgContext } from "../auto-reply/templating.js";
 import { finalizeInboundContext } from "../auto-reply/reply/inbound-context.js";
+import { logVerbose, shouldLogVerbose } from "../globals.js";
+import {
+  DEFAULT_INPUT_FILE_MAX_BYTES,
+  DEFAULT_INPUT_FILE_MAX_CHARS,
+  DEFAULT_INPUT_FILE_MIMES,
+  DEFAULT_INPUT_MAX_REDIRECTS,
+  DEFAULT_INPUT_PDF_MAX_PAGES,
+  DEFAULT_INPUT_PDF_MAX_PIXELS,
+  DEFAULT_INPUT_PDF_MIN_TEXT_CHARS,
+  DEFAULT_INPUT_TIMEOUT_MS,
+  extractFileContentFromSource,
+  normalizeMimeList,
+  normalizeMimeType,
+} from "../media/input-files.js";
 import {
  extractMediaUserText,
  formatAudioTranscripts,
@@ -14,6 +30,7 @@ import type {
 } from "./types.js";
 import { runWithConcurrency } from "./concurrency.js";
 import { resolveConcurrency } from "./resolve.js";
+import { resolveAttachmentKind } from "./attachments.js";
 import {
  type ActiveMediaModel,
  buildProviderRegistry,
@@ -28,9 +45,255 @@ export type ApplyMediaUnderstandingResult = {
  appliedImage: boolean;
  appliedAudio: boolean;
  appliedVideo: boolean;
+  appliedFile: boolean;
 };

 const CAPABILITY_ORDER: MediaUnderstandingCapability[] = ["image", "audio", "video"];
+// MIME types that extractFileBlocks always adds to the allow-list it derives from
+// the configured limits, so these text formats can be inlined as file blocks.
+const EXTRA_TEXT_MIMES = [
+  "application/xml",
+  "text/xml",
+  "application/x-yaml",
+  "text/yaml",
+  "application/yaml",
+  "application/javascript",
+  "text/javascript",
+  "text/tab-separated-values",
+];
+// Lookup from file extension (lowercase, including the leading dot) to the MIME type
+// that resolveTextMimeFromName returns for an attachment name with that extension.
+const TEXT_EXT_MIME = new Map<string, string>([
+  [".csv", "text/csv"],
+  [".tsv", "text/tab-separated-values"],
+  [".txt", "text/plain"],
+  [".md", "text/markdown"],
+  [".log", "text/plain"],
+  [".ini", "text/plain"],
+  [".cfg", "text/plain"],
+  [".conf", "text/plain"],
+  [".env", "text/plain"],
+  [".json", "application/json"],
+  [".yaml", "text/yaml"],
+  [".yml", "text/yaml"],
+  [".xml", "application/xml"],
+]);
+
+/**
+ * Builds the effective limits used when inlining file attachments.
+ *
+ * Values are read from `cfg.gateway.http.endpoints.responses.files`; any setting that
+ * is missing falls back to the matching `DEFAULT_INPUT_*` constant (`allowUrl`
+ * defaults to `true`).
+ *
+ * @param cfg - Application configuration.
+ * @returns Fully-populated limits, including nested PDF limits.
+ */
+function resolveFileLimits(cfg: MoltbotConfig) {
+  const overrides = cfg.gateway?.http?.endpoints?.responses?.files;
+  const pdfOverrides = overrides?.pdf;
+
+  const allowedMimes = normalizeMimeList(overrides?.allowedMimes, DEFAULT_INPUT_FILE_MIMES);
+  const pdf = {
+    maxPages: pdfOverrides?.maxPages ?? DEFAULT_INPUT_PDF_MAX_PAGES,
+    maxPixels: pdfOverrides?.maxPixels ?? DEFAULT_INPUT_PDF_MAX_PIXELS,
+    minTextChars: pdfOverrides?.minTextChars ?? DEFAULT_INPUT_PDF_MIN_TEXT_CHARS,
+  };
+
+  return {
+    allowUrl: overrides?.allowUrl ?? true,
+    allowedMimes,
+    maxBytes: overrides?.maxBytes ?? DEFAULT_INPUT_FILE_MAX_BYTES,
+    maxChars: overrides?.maxChars ?? DEFAULT_INPUT_FILE_MAX_CHARS,
+    maxRedirects: overrides?.maxRedirects ?? DEFAULT_INPUT_MAX_REDIRECTS,
+    timeoutMs: overrides?.timeoutMs ?? DEFAULT_INPUT_TIMEOUT_MS,
+    pdf,
+  };
+}
+
+/**
+ * Appends rendered file blocks to a message body.
+ *
+ * With no blocks the body is returned untouched (or `""` when undefined). Otherwise
+ * the trimmed body and the blocks (joined by blank lines) are combined, separated by
+ * one blank line, and the result is trimmed.
+ *
+ * @param body - Existing message body, if any.
+ * @param blocks - Rendered file blocks to append.
+ * @returns The combined body text.
+ */
+function appendFileBlocks(body: string | undefined, blocks: string[]): string {
+  const hasBlocks = Boolean(blocks) && blocks.length > 0;
+  if (!hasBlocks) {
+    return body ?? "";
+  }
+  const head = typeof body === "string" ? body.trim() : "";
+  const tail = blocks.join("\n\n").trim();
+  return head ? `${head}\n\n${tail}`.trim() : tail;
+}
+
+/**
+ * Detects whether a buffer is likely UTF-16 text and, if so, its byte order.
+ *
+ * A byte-order mark wins when present. Without one, up to the first 2048 bytes are
+ * sampled: if more than 20% of them are NUL the data is treated as UTF-16, and the
+ * byte order is inferred from where the NULs sit. For ASCII-range characters the NUL
+ * is the second (odd-offset) byte in little-endian and the first (even-offset) byte
+ * in big-endian.
+ *
+ * @param buffer - Raw attachment bytes; may be undefined.
+ * @returns The detected charset, or `undefined` when the data does not look like UTF-16.
+ */
+function resolveUtf16Charset(buffer?: Buffer): "utf-16le" | "utf-16be" | undefined {
+  if (!buffer || buffer.length < 2) return undefined;
+  const b0 = buffer[0];
+  const b1 = buffer[1];
+  if (b0 === 0xff && b1 === 0xfe) {
+    return "utf-16le";
+  }
+  if (b0 === 0xfe && b1 === 0xff) {
+    return "utf-16be";
+  }
+  const sampleLen = Math.min(buffer.length, 2048);
+  let zeroEven = 0;
+  let zeroOdd = 0;
+  for (let i = 0; i < sampleLen; i += 1) {
+    if (buffer[i] !== 0) continue;
+    if (i % 2 === 0) {
+      zeroEven += 1;
+    } else {
+      zeroOdd += 1;
+    }
+  }
+  if ((zeroEven + zeroOdd) / sampleLen > 0.2) {
+    // Ties fall back to little-endian, the previous unconditional answer.
+    return zeroEven > zeroOdd ? "utf-16be" : "utf-16le";
+  }
+  return undefined;
+}
+
+/**
+ * Heuristically decides whether a buffer holds UTF-8 (or ASCII-compatible) text.
+ *
+ * Up to the first 4096 bytes are decoded as UTF-8 and each decoded character is
+ * classified. Tab, line feed, carriage return and every non-control character count
+ * as printable. NUL, other C0/C1 control characters, DEL and U+FFFD (emitted by the
+ * decoder for invalid byte sequences) count as "other". The buffer is text-like when
+ * more than 85% of the characters are printable.
+ *
+ * Classifying decoded characters rather than raw bytes lets valid multi-byte UTF-8
+ * text (Cyrillic, CJK, accented Latin, ...) be recognised as text, while the result
+ * for pure-ASCII input is the same as a per-byte ASCII check.
+ *
+ * @param buffer - Raw attachment bytes; may be undefined.
+ * @returns `true` when the sample looks like text, otherwise `false`.
+ */
+function looksLikeUtf8Text(buffer?: Buffer): boolean {
+  if (!buffer || buffer.length === 0) return false;
+  const sample = buffer.subarray(0, Math.min(buffer.length, 4096));
+  // Non-fatal decoding: each invalid sequence becomes U+FFFD instead of throwing.
+  const decoded = new TextDecoder("utf-8").decode(sample);
+  let printable = 0;
+  let other = 0;
+  for (const char of decoded) {
+    const code = char.codePointAt(0) ?? 0;
+    const isAllowedWhitespace = code === 9 || code === 10 || code === 13;
+    const isControl = code < 32 || (code >= 0x7f && code <= 0x9f);
+    if (code === 0xfffd || (isControl && !isAllowedWhitespace)) {
+      other += 1;
+    } else {
+      printable += 1;
+    }
+  }
+  const total = printable + other;
+  if (total === 0) return false;
+  return printable / total > 0.85;
+}
+
+/**
+ * Decodes the leading part of a buffer (at most 8192 bytes) to a string for sniffing.
+ *
+ * The charset comes from `resolveUtf16Charset`: UTF-16LE is decoded directly,
+ * UTF-16BE is byte-swapped into little-endian order first, and anything else is
+ * decoded as UTF-8.
+ *
+ * @param buffer - Raw attachment bytes; may be undefined.
+ * @returns The decoded sample, or `""` for a missing or empty buffer.
+ */
+function decodeTextSample(buffer?: Buffer): string {
+  if (!buffer || buffer.length === 0) return "";
+  const head = buffer.subarray(0, Math.min(buffer.length, 8192));
+  switch (resolveUtf16Charset(head)) {
+    case "utf-16le":
+      return new TextDecoder("utf-16le").decode(head);
+    case "utf-16be": {
+      // Swap each byte pair; a trailing unpaired byte stays zero in the copy.
+      const littleEndian = Buffer.alloc(head.length);
+      for (let hi = 0, lo = 1; lo < head.length; hi += 2, lo += 2) {
+        littleEndian[hi] = head[lo];
+        littleEndian[lo] = head[hi];
+      }
+      return new TextDecoder("utf-16le").decode(littleEndian);
+    }
+    default:
+      return new TextDecoder("utf-8").decode(head);
+  }
+}
+
+/**
+ * Guesses a delimited-text MIME type from the first line of a text sample.
+ *
+ * Any comma on the first line means CSV; otherwise any tab means TSV. Commas take
+ * precedence when both are present.
+ *
+ * @param text - Decoded text sample.
+ * @returns `"text/csv"`, `"text/tab-separated-values"`, or `undefined` when the first
+ *   line contains neither delimiter (or the text is empty).
+ */
+function guessDelimitedMime(text: string): string | undefined {
+  if (!text) return undefined;
+  const [firstLine = ""] = text.split(/\r?\n/);
+  if (firstLine.includes(",")) {
+    return "text/csv";
+  }
+  return firstLine.includes("\t") ? "text/tab-separated-values" : undefined;
+}
+
+/**
+ * Maps an attachment name, path or URL to a text MIME type based on its extension.
+ *
+ * When the name is an absolute URL (`scheme://...`), the extension is taken from the
+ * URL's pathname so a query string or fragment (for example `data.csv?sig=abc`) does
+ * not hide it. Plain file names and paths are used as-is, so local names containing
+ * `?` or `#` keep resolving exactly as before.
+ *
+ * @param name - File name, local path, or URL of the attachment.
+ * @returns The MIME type registered in `TEXT_EXT_MIME` for the (lowercased) extension,
+ *   or `undefined` when there is no name or no matching extension.
+ */
+function resolveTextMimeFromName(name?: string): string | undefined {
+  if (!name) return undefined;
+  let target = name;
+  if (/^[a-z][a-z0-9+.-]*:\/\//i.test(name)) {
+    try {
+      target = new URL(name).pathname;
+    } catch {
+      // Not parseable as a URL after all; fall back to the raw name.
+    }
+  }
+  const ext = path.extname(target).toLowerCase();
+  return TEXT_EXT_MIME.get(ext);
+}
+
+/** Escapes a value for use inside a double-quoted attribute of a `<file>` block. */
+function escapeFileBlockAttr(value: string): string {
+  return value
+    .replace(/&/g, "&amp;")
+    .replace(/</g, "&lt;")
+    .replace(/>/g, "&gt;")
+    .replace(/"/g, "&quot;");
+}
+
+/** Neutralizes `<file ...>` / `</file>` look-alikes so extracted text cannot open or close a block. */
+function escapeFileBlockContent(value: string): string {
+  return value
+    .replace(/<\s*\/\s*file\s*>/gi, "&lt;/file&gt;")
+    .replace(/<\s*file\b/gi, "&lt;file");
+}
+
+/**
+ * Extracts text from non-image, non-video attachments and renders each one as a
+ * `<file name="..." mime="...">...</file>` block.
+ *
+ * For every attachment the function:
+ * 1. forces text handling when the path/URL (or fetched file name) has a known text
+ *    extension, and otherwise skips images and videos;
+ * 2. loads the bytes through the attachment cache, within the size/time limits;
+ * 3. sniffs the bytes (UTF-16 detection, text heuristic, CSV/TSV guess) so text that
+ *    was mislabelled as audio is still picked up, while real audio is skipped;
+ * 4. checks the resulting MIME type against the allow-list (configured MIME types,
+ *    `EXTRA_TEXT_MIMES`, and any `text/*` type) and extracts the content.
+ *
+ * Failures to load or extract an attachment are logged (verbose only) and skip that
+ * attachment; they never reject the returned promise.
+ *
+ * File names, MIME types and extracted text originate from untrusted input, so the
+ * attributes are escaped and file-tag look-alikes in the content are neutralized
+ * before being embedded in the block markup.
+ *
+ * @param params.attachments - Normalized attachments of the inbound message.
+ * @param params.cache - Attachment cache used to obtain the raw bytes.
+ * @param params.limits - Effective limits from `resolveFileLimits`.
+ * @returns One rendered block per successfully processed attachment, in input order.
+ */
+async function extractFileBlocks(params: {
+  attachments: ReturnType<typeof normalizeMediaAttachments>;
+  cache: ReturnType<typeof createMediaAttachmentCache>;
+  limits: ReturnType<typeof resolveFileLimits>;
+}): Promise<string[]> {
+  const { attachments, cache, limits } = params;
+  if (!attachments || attachments.length === 0) {
+    return [];
+  }
+  // The configured list plus the extra text types is the same for every attachment.
+  const baseAllowedMimes = new Set(limits.allowedMimes);
+  for (const extra of EXTRA_TEXT_MIMES) {
+    baseAllowedMimes.add(extra);
+  }
+  const blocks: string[] = [];
+  for (const attachment of attachments) {
+    if (!attachment) {
+      continue;
+    }
+    const forcedTextMime = resolveTextMimeFromName(attachment.path ?? attachment.url ?? "");
+    const kind = forcedTextMime ? "document" : resolveAttachmentKind(attachment);
+    // A forced text MIME already maps to "document", so this only skips real media.
+    if (kind === "image" || kind === "video") {
+      continue;
+    }
+    if (!limits.allowUrl && attachment.url && !attachment.path) {
+      if (shouldLogVerbose()) {
+        logVerbose(`media: file attachment skipped (url disabled) index=${attachment.index}`);
+      }
+      continue;
+    }
+    let bufferResult: Awaited<ReturnType<typeof cache.getBuffer>>;
+    try {
+      bufferResult = await cache.getBuffer({
+        attachmentIndex: attachment.index,
+        maxBytes: limits.maxBytes,
+        timeoutMs: limits.timeoutMs,
+      });
+    } catch (err) {
+      if (shouldLogVerbose()) {
+        logVerbose(`media: file attachment skipped (buffer): ${String(err)}`);
+      }
+      continue;
+    }
+    const nameHint = bufferResult?.fileName ?? attachment.path ?? attachment.url;
+    const forcedTextMimeResolved = forcedTextMime ?? resolveTextMimeFromName(nameHint ?? "");
+    const utf16Charset = resolveUtf16Charset(bufferResult?.buffer);
+    const textLike = Boolean(utf16Charset) || looksLikeUtf8Text(bufferResult?.buffer);
+    // Genuine (binary) audio is left to the audio capability.
+    if (!forcedTextMimeResolved && kind === "audio" && !textLike) {
+      continue;
+    }
+    // Only decode a sample when it will actually be inspected.
+    const guessedDelimited = textLike
+      ? guessDelimitedMime(decodeTextSample(bufferResult?.buffer))
+      : undefined;
+    const textHint =
+      forcedTextMimeResolved ?? guessedDelimited ?? (textLike ? "text/plain" : undefined);
+    const rawMime = bufferResult?.mime ?? attachment.mime;
+    const mimeType = textHint ?? normalizeMimeType(rawMime);
+    if (!mimeType) {
+      if (shouldLogVerbose()) {
+        logVerbose(`media: file attachment skipped (unknown mime) index=${attachment.index}`);
+      }
+      continue;
+    }
+    const allowedMimes = new Set(baseAllowedMimes);
+    if (mimeType.startsWith("text/")) {
+      allowedMimes.add(mimeType);
+    }
+    if (!allowedMimes.has(mimeType)) {
+      if (shouldLogVerbose()) {
+        logVerbose(
+          `media: file attachment skipped (unsupported mime ${mimeType}) index=${attachment.index}`,
+        );
+      }
+      continue;
+    }
+    let extracted: Awaited<ReturnType<typeof extractFileContentFromSource>>;
+    try {
+      // Pass the detected charset along so UTF-16 content is decoded correctly.
+      const mediaType = utf16Charset ? `${mimeType}; charset=${utf16Charset}` : mimeType;
+      extracted = await extractFileContentFromSource({
+        source: {
+          type: "base64",
+          data: bufferResult.buffer.toString("base64"),
+          mediaType,
+          filename: bufferResult.fileName,
+        },
+        limits: {
+          ...limits,
+          allowedMimes,
+        },
+      });
+    } catch (err) {
+      if (shouldLogVerbose()) {
+        logVerbose(`media: file attachment skipped (extract): ${String(err)}`);
+      }
+      continue;
+    }
+    const text = extracted?.text?.trim() ?? "";
+    let blockText = escapeFileBlockContent(text);
+    if (!blockText) {
+      if (extracted?.images && extracted.images.length > 0) {
+        blockText = "[PDF content rendered to images; images not forwarded to model]";
+      } else {
+        blockText = "[No extractable text]";
+      }
+    }
+    const safeName = (bufferResult.fileName ?? `file-${attachment.index + 1}`)
+      .replace(/[\r\n\t]+/g, " ")
+      .trim();
+    blocks.push(
+      `<file name="${escapeFileBlockAttr(safeName)}" mime="${escapeFileBlockAttr(mimeType)}">\n${blockText}\n</file>`,
+    );
+  }
+  return blocks;
+}

 export async function applyMediaUnderstanding(params: {
  ctx: MsgContext;
@@ -51,6 +314,12 @@ export async function applyMediaUnderstanding(params: {
  const cache = createMediaAttachmentCache(attachments);

  try {
+    const fileBlocks = await extractFileBlocks({
+      attachments,
+      cache,
+      limits: resolveFileLimits(cfg),
+    });
+
    const tasks = CAPABILITY_ORDER.map((capability) => async () => {
      const config = cfg.tools?.media?.[capability];
      return await runCapability({
@@ -99,7 +368,15 @@ export async function applyMediaUnderstanding(params: {
        ctx.RawBody = originalUserText;
      }
      ctx.MediaUnderstanding = [...(ctx.MediaUnderstanding ?? []), ...outputs];
-      finalizeInboundContext(ctx, { forceBodyForAgent: true, forceBodyForCommands: true });
+    }
+    if (fileBlocks.length > 0) {
+      ctx.Body = appendFileBlocks(ctx.Body, fileBlocks);
+    }
+    if (outputs.length > 0 || fileBlocks.length > 0) {
+      finalizeInboundContext(ctx, {
+        forceBodyForAgent: true,
+        forceBodyForCommands: outputs.length > 0,
+      });
    }

    return {
@@ -108,6 +385,7 @@ export async function applyMediaUnderstanding(params: {
      appliedImage: outputs.some((output) => output.kind === "image.description"),
      appliedAudio: outputs.some((output) => output.kind === "audio.transcription"),
      appliedVideo: outputs.some((output) => output.kind === "video.description"),
+      appliedFile: fileBlocks.length > 0,
    };
  } finally {
    await cache.cleanup();
--- a/src/telegram/bot/delivery.ts
+++ b/src/telegram/bot/delivery.ts
@@ -310,7 +310,14 @@ export async function resolveMedia(
        fetchImpl,
        filePathHint: file.file_path,
      });
-      const saved = await saveMediaBuffer(fetched.buffer, fetched.contentType, "inbound", maxBytes);
+      const originalName = fetched.fileName ?? file.file_path;
+      const saved = await saveMediaBuffer(
+        fetched.buffer,
+        fetched.contentType,
+        "inbound",
+        maxBytes,
+        originalName,
+      );

      // Check sticker cache for existing description
      const cached = sticker.file_unique_id ? getCachedSticker(sticker.file_unique_id) : null;
@@ -377,7 +384,14 @@ export async function resolveMedia(
    fetchImpl,
    filePathHint: file.file_path,
  });
-  const saved = await saveMediaBuffer(fetched.buffer, fetched.contentType, "inbound", maxBytes);
+  const originalName = fetched.fileName ?? file.file_path;
+  const saved = await saveMediaBuffer(
+    fetched.buffer,
+    fetched.contentType,
+    "inbound",
+    maxBytes,
+    originalName,
+  );
  let placeholder = "<media:document>";
  if (msg.photo) placeholder = "<media:image>";
  else if (msg.video) placeholder = "<media:video>";
--- a/src/telegram/download.ts
+++ b/src/telegram/download.ts
@@ -40,7 +40,7 @@ export async function downloadTelegramFile(
    filePath: info.file_path,
  });
  // save with inbound subdir
-  const saved = await saveMediaBuffer(array, mime, "inbound", maxBytes);
+  const saved = await saveMediaBuffer(array, mime, "inbound", maxBytes, info.file_path);
  // Ensure extension matches mime if possible
  if (!saved.contentType && mime) saved.contentType = mime;
  return saved;