feat(media): add moonshot video provider and wiring

Co-authored-by: xiaoyaner0201 <xiaoyaner0201@users.noreply.github.com>
This commit is contained in:
Peter Steinberger
2026-02-23 18:24:50 +00:00
parent e02c470d5e
commit 7837d23103
10 changed files with 385 additions and 4 deletions

View File

@@ -16,6 +16,7 @@ Docs: https://docs.openclaw.ai
- Agents/Context pruning: extend `cache-ttl` eligibility to Moonshot/Kimi and ZAI/GLM providers (including OpenRouter model refs), so `contextPruning.mode: "cache-ttl"` is no longer silently skipped for those sessions. (#24497) Thanks @lailoo.
- Tools/web_search: add `provider: "kimi"` (Moonshot) support with key/config schema wiring and a corrected two-step `$web_search` tool flow that echoes tool results before final synthesis, including citation extraction from search results. (#18822) Thanks @adshine.
- Media understanding/Video: add a native Moonshot video provider and include Moonshot in auto video key detection, plus refactor video execution to honor `entry/config/provider` baseUrl+header precedence (matching audio behavior). (#16616) Thanks @xiaoyaner0201.
- Sessions/Store: canonicalize inbound mixed-case session keys for metadata and route updates, and migrate legacy case-variant entries to a single lowercase key to prevent duplicate sessions and missing TUI/WebUI history. (#9561) Thanks @hillghost86.
- Telegram/Reactions: soft-fail reaction action errors (policy/token/emoji/API), accept snake_case `message_id`, and fallback to inbound message-id context when explicit `messageId` is omitted so DM reactions stay stable without regeneration loops. (#20236, #21001) Thanks @PeterShanxin and @vincentkoc.
- Telegram/Polling: scope persisted polling offsets to bot identity and reuse a single awaited runner-stop path on abort/retry, preventing cross-token offset bleed and overlapping pollers during restart/error recovery. (#10850, #11347) Thanks @talhaorak, @anooprdawar, and @vincentkoc.

View File

@@ -1,5 +1,9 @@
import { describe, expect, it } from "vitest";
import { AUTO_AUDIO_KEY_PROVIDERS, DEFAULT_AUDIO_MODELS } from "./defaults.js";
import {
AUTO_AUDIO_KEY_PROVIDERS,
AUTO_VIDEO_KEY_PROVIDERS,
DEFAULT_AUDIO_MODELS,
} from "./defaults.js";
describe("DEFAULT_AUDIO_MODELS", () => {
it("includes Mistral Voxtral default", () => {
@@ -12,3 +16,9 @@ describe("AUTO_AUDIO_KEY_PROVIDERS", () => {
expect(AUTO_AUDIO_KEY_PROVIDERS).toContain("mistral");
});
});
// Guards the auto key-detection list: "moonshot" must stay eligible for
// automatic video API-key resolution (see AUTO_VIDEO_KEY_PROVIDERS in defaults).
describe("AUTO_VIDEO_KEY_PROVIDERS", () => {
it("includes moonshot auto key resolution", () => {
expect(AUTO_VIDEO_KEY_PROVIDERS).toContain("moonshot");
});
});

View File

@@ -48,7 +48,7 @@ export const AUTO_IMAGE_KEY_PROVIDERS = [
"minimax",
"zai",
] as const;
export const AUTO_VIDEO_KEY_PROVIDERS = ["google"] as const;
export const AUTO_VIDEO_KEY_PROVIDERS = ["google", "moonshot"] as const;
export const DEFAULT_IMAGE_MODELS: Record<string, string> = {
openai: "gpt-5-mini",
anthropic: "claude-opus-4-6",

View File

@@ -16,4 +16,12 @@ describe("media-understanding provider registry", () => {
expect(provider?.id).toBe("google");
});
// The built registry must expose the Moonshot provider under id "moonshot"
// with exactly the image + video capabilities (order-sensitive via toEqual).
it("registers the Moonshot provider", () => {
const registry = buildMediaUnderstandingRegistry();
const provider = getMediaUnderstandingProvider("moonshot", registry);
expect(provider?.id).toBe("moonshot");
expect(provider?.capabilities).toEqual(["image", "video"]);
});
});

View File

@@ -6,6 +6,7 @@ import { googleProvider } from "./google/index.js";
import { groqProvider } from "./groq/index.js";
import { minimaxProvider } from "./minimax/index.js";
import { mistralProvider } from "./mistral/index.js";
import { moonshotProvider } from "./moonshot/index.js";
import { openaiProvider } from "./openai/index.js";
import { zaiProvider } from "./zai/index.js";
@@ -15,6 +16,7 @@ const PROVIDERS: MediaUnderstandingProvider[] = [
googleProvider,
anthropicProvider,
minimaxProvider,
moonshotProvider,
mistralProvider,
zaiProvider,
deepgramProvider,

View File

@@ -0,0 +1,10 @@
import type { MediaUnderstandingProvider } from "../../types.js";
import { describeImageWithModel } from "../image.js";
import { describeMoonshotVideo } from "./video.js";
// Moonshot media-understanding provider registration.
// Images reuse the shared model-based image describer; video goes through
// the Moonshot-specific chat-completions flow in ./video.js.
export const moonshotProvider: MediaUnderstandingProvider = {
id: "moonshot",
capabilities: ["image", "video"],
describeImage: describeImageWithModel,
describeVideo: describeMoonshotVideo,
};

View File

@@ -0,0 +1,72 @@
import { describe, expect, it } from "vitest";
import {
createRequestCaptureJsonFetch,
installPinnedHostnameTestHooks,
} from "../audio.test-helpers.js";
import { describeMoonshotVideo } from "./video.js";
// Installs test hooks for pinned-hostname fetches — presumably prevents real
// network access in these tests; see audio.test-helpers.js — TODO confirm.
installPinnedHostnameTestHooks();
// Unit tests for the Moonshot video describer: outgoing request shape,
// header precedence, and reasoning_content fallback extraction.
describe("describeMoonshotVideo", () => {
// Captures the request made through the injected fetch and checks:
// endpoint URL (trailing slash on baseUrl normalized away), POST + abort
// signal, default authorization/content-type headers merged with caller
// headers, and an OpenAI-style messages payload carrying the clip as a
// base64 data URL.
it("builds an OpenAI-compatible video request", async () => {
const { fetchFn, getRequest } = createRequestCaptureJsonFetch({
choices: [{ message: { content: "video ok" } }],
});
const result = await describeMoonshotVideo({
buffer: Buffer.from("video-bytes"),
fileName: "clip.mp4",
apiKey: "moonshot-test",
timeoutMs: 1500,
baseUrl: "https://api.moonshot.ai/v1/",
model: "kimi-k2.5",
headers: { "X-Trace": "1" },
fetchFn,
});
const { url, init } = getRequest();
expect(result.text).toBe("video ok");
expect(result.model).toBe("kimi-k2.5");
expect(url).toBe("https://api.moonshot.ai/v1/chat/completions");
expect(init?.method).toBe("POST");
expect(init?.signal).toBeInstanceOf(AbortSignal);
// Headers lookups are case-insensitive; caller's X-Trace must survive the merge.
const headers = new Headers(init?.headers);
expect(headers.get("authorization")).toBe("Bearer moonshot-test");
expect(headers.get("content-type")).toBe("application/json");
expect(headers.get("x-trace")).toBe("1");
const body = JSON.parse(typeof init?.body === "string" ? init.body : "{}") as {
model?: string;
messages?: Array<{
content?: Array<{ type?: string; text?: string; video_url?: { url?: string } }>;
}>;
};
expect(body.model).toBe("kimi-k2.5");
// Default prompt is used since none was supplied above.
expect(body.messages?.[0]?.content?.[0]).toMatchObject({
type: "text",
text: "Describe the video.",
});
expect(body.messages?.[0]?.content?.[1]?.type).toBe("video_url");
expect(body.messages?.[0]?.content?.[1]?.video_url?.url).toBe(
`data:video/mp4;base64,${Buffer.from("video-bytes").toString("base64")}`,
);
});
// When choices[0].message.content is an empty string, the describer must
// fall back to reasoning_content, and the default model id is reported.
it("falls back to reasoning_content when content is empty", async () => {
const { fetchFn } = createRequestCaptureJsonFetch({
choices: [{ message: { content: "", reasoning_content: "reasoned answer" } }],
});
const result = await describeMoonshotVideo({
buffer: Buffer.from("video"),
fileName: "clip.mp4",
apiKey: "moonshot-test",
timeoutMs: 1000,
fetchFn,
});
expect(result.text).toBe("reasoned answer");
expect(result.model).toBe("kimi-k2.5");
});
});

View File

@@ -0,0 +1,109 @@
import type { VideoDescriptionRequest, VideoDescriptionResult } from "../../types.js";
import { assertOkOrThrowHttpError, fetchWithTimeoutGuarded, normalizeBaseUrl } from "../shared.js";
/** Default Moonshot OpenAI-compatible API root, used when no baseUrl override is supplied. */
export const DEFAULT_MOONSHOT_VIDEO_BASE_URL = "https://api.moonshot.ai/v1";
/** Fallback model id for video description requests. */
const DEFAULT_MOONSHOT_VIDEO_MODEL = "kimi-k2.5";
/** Fallback user prompt when the caller supplies none. */
const DEFAULT_MOONSHOT_VIDEO_PROMPT = "Describe the video.";

/** Minimal shape of the Moonshot chat-completions response that we read from. */
type MoonshotVideoPayload = {
  choices?: Array<{
    message?: {
      content?: string | Array<{ text?: string }>;
      reasoning_content?: string;
    };
  }>;
};

/** Returns the trimmed caller-supplied model id, or the Moonshot default when blank/undefined. */
function resolveModel(model?: string): string {
  const candidate = (model ?? "").trim();
  return candidate === "" ? DEFAULT_MOONSHOT_VIDEO_MODEL : candidate;
}

/** Returns the trimmed caller-supplied prompt, or the default prompt when blank/undefined. */
function resolvePrompt(prompt?: string): string {
  const candidate = (prompt ?? "").trim();
  return candidate === "" ? DEFAULT_MOONSHOT_VIDEO_PROMPT : candidate;
}

/**
 * Extracts the description text from a Moonshot chat-completions payload.
 * Precedence: string content → array-of-parts content (non-empty parts joined
 * with newlines) → reasoning_content. Returns null when no non-empty text exists.
 */
function coerceMoonshotText(payload: MoonshotVideoPayload): string | null {
  const message = payload.choices?.[0]?.message;
  if (!message) {
    return null;
  }
  const { content, reasoning_content: reasoning } = message;
  if (typeof content === "string") {
    const direct = content.trim();
    if (direct) {
      return direct;
    }
  }
  if (Array.isArray(content)) {
    const pieces: string[] = [];
    for (const part of content) {
      const piece = typeof part.text === "string" ? part.text.trim() : "";
      if (piece) {
        pieces.push(piece);
      }
    }
    const joined = pieces.join("\n").trim();
    if (joined) {
      return joined;
    }
  }
  if (typeof reasoning === "string") {
    const fallback = reasoning.trim();
    if (fallback) {
      return fallback;
    }
  }
  return null;
}
/**
 * Describes a video via Moonshot's OpenAI-compatible chat-completions endpoint.
 *
 * The clip is inlined as a base64 data URL inside a `video_url` content part.
 * Caller-supplied headers take precedence; `content-type` and `authorization`
 * are only filled in when the caller did not already set them.
 *
 * @param params buffer/mime of the clip plus apiKey, optional baseUrl, model,
 *   prompt, headers, timeout, and an injectable fetch for testing.
 * @returns the extracted description text and the model id that was used.
 * @throws when the HTTP response is not OK, or the payload carries no text.
 */
export async function describeMoonshotVideo(
  params: VideoDescriptionRequest,
): Promise<VideoDescriptionResult> {
  const doFetch = params.fetchFn ?? fetch;
  const root = normalizeBaseUrl(params.baseUrl, DEFAULT_MOONSHOT_VIDEO_BASE_URL);
  const model = resolveModel(params.model);
  const mimeType = params.mime ?? "video/mp4";
  const endpoint = `${root}/chat/completions`;

  // Start from the caller's headers so explicit overrides always win.
  const requestHeaders = new Headers(params.headers);
  if (!requestHeaders.has("content-type")) {
    requestHeaders.set("content-type", "application/json");
  }
  if (!requestHeaders.has("authorization")) {
    requestHeaders.set("authorization", `Bearer ${params.apiKey}`);
  }

  // Whole clip travels inline as a data URL — no upload step.
  const dataUrl = `data:${mimeType};base64,${params.buffer.toString("base64")}`;
  const requestBody = JSON.stringify({
    model,
    messages: [
      {
        role: "user",
        content: [
          { type: "text", text: resolvePrompt(params.prompt) },
          { type: "video_url", video_url: { url: dataUrl } },
        ],
      },
    ],
  });

  const { response: res, release } = await fetchWithTimeoutGuarded(
    endpoint,
    { method: "POST", headers: requestHeaders, body: requestBody },
    params.timeoutMs,
    doFetch,
  );
  try {
    await assertOkOrThrowHttpError(res, "Moonshot video description failed");
    const payload = (await res.json()) as MoonshotVideoPayload;
    const text = coerceMoonshotText(payload);
    if (!text) {
      throw new Error("Moonshot video description response missing content");
    }
    return { text, model };
  } finally {
    // Always release the timeout guard, even on error paths.
    await release();
  }
}

View File

@@ -497,6 +497,13 @@ export async function runProviderEntry(params: {
entry,
agentDir: params.agentDir,
});
const baseUrl = entry.baseUrl ?? params.config?.baseUrl ?? providerConfig?.baseUrl;
const mergedHeaders = {
...providerConfig?.headers,
...params.config?.headers,
...entry.headers,
};
const headers = Object.keys(mergedHeaders).length > 0 ? mergedHeaders : undefined;
const result = await executeWithApiKeyRotation({
provider: providerId,
apiKeys,
@@ -506,8 +513,8 @@ export async function runProviderEntry(params: {
fileName: media.fileName,
mime: media.mime,
apiKey,
baseUrl: providerConfig?.baseUrl,
headers: providerConfig?.headers,
baseUrl,
headers,
model: entry.model,
prompt,
timeoutMs,

View File

@@ -0,0 +1,162 @@
import fs from "node:fs/promises";
import os from "node:os";
import path from "node:path";
import { describe, expect, it } from "vitest";
import type { OpenClawConfig } from "../config/config.js";
import { withEnvAsync } from "../test-utils/env.js";
import { createMediaAttachmentCache, normalizeMediaAttachments, runCapability } from "./runner.js";
// Test harness: writes a throwaway .mp4 fixture to the OS temp dir, builds the
// media attachment context/cache for it, runs the supplied callback, and always
// cleans up the cache and the temp file afterwards.
async function withVideoFixture(
filePrefix: string,
run: (params: {
ctx: { MediaPath: string; MediaType: string };
media: ReturnType<typeof normalizeMediaAttachments>;
cache: ReturnType<typeof createMediaAttachmentCache>;
}) => Promise<void>,
) {
// Date.now() suffix keeps concurrent test runs from colliding on the same path.
const tmpPath = path.join(os.tmpdir(), `${filePrefix}-${Date.now().toString()}.mp4`);
await fs.writeFile(tmpPath, Buffer.from("video"));
const ctx = { MediaPath: tmpPath, MediaType: "video/mp4" };
const media = normalizeMediaAttachments(ctx);
const cache = createMediaAttachmentCache(media, {
localPathRoots: [path.dirname(tmpPath)],
});
try {
// Empty PATH — presumably blocks the runner from shelling out to external
// tools (e.g. ffmpeg) during the test; TODO confirm against runner internals.
await withEnvAsync({ PATH: "" }, async () => {
await run({ ctx, media, cache });
});
} finally {
await cache.cleanup();
// Best-effort removal; ignore failures if the file is already gone.
await fs.unlink(tmpPath).catch(() => {});
}
}
// Integration-style tests for video provider wiring in runCapability:
// baseUrl/header merge precedence and automatic provider selection.
describe("runCapability video provider wiring", () => {
// Configures the same settings at provider, tools.media.video (config), and
// model-entry level, then asserts the entry's baseUrl wins and all three
// header layers are merged into the request seen by the provider.
it("merges video baseUrl and headers with entry precedence", async () => {
let seenBaseUrl: string | undefined;
let seenHeaders: Record<string, string> | undefined;
await withVideoFixture("openclaw-video-merge", async ({ ctx, media, cache }) => {
const cfg = {
models: {
providers: {
moonshot: {
apiKey: "provider-key",
baseUrl: "https://provider.example/v1",
headers: { "X-Provider": "1" },
models: [],
},
},
},
tools: {
media: {
video: {
enabled: true,
baseUrl: "https://config.example/v1",
headers: { "X-Config": "2" },
models: [
{
provider: "moonshot",
model: "kimi-k2.5",
baseUrl: "https://entry.example/v1",
headers: { "X-Entry": "3" },
},
],
},
},
},
} as unknown as OpenClawConfig;
const result = await runCapability({
capability: "video",
cfg,
ctx,
attachments: cache,
media,
// Stub registry: captures the request the runner hands to the provider.
providerRegistry: new Map([
[
"moonshot",
{
id: "moonshot",
capabilities: ["video"],
describeVideo: async (req) => {
seenBaseUrl = req.baseUrl;
seenHeaders = req.headers;
return { text: "video ok", model: req.model };
},
},
],
]),
});
expect(result.outputs[0]?.text).toBe("video ok");
expect(result.outputs[0]?.provider).toBe("moonshot");
// Entry-level baseUrl overrides config- and provider-level values.
expect(seenBaseUrl).toBe("https://entry.example/v1");
// All three header layers are merged into the outgoing request.
expect(seenHeaders).toMatchObject({
"X-Provider": "1",
"X-Config": "2",
"X-Entry": "3",
});
});
});
// With no explicit model entries, no Gemini/Moonshot env keys, and only a
// moonshot provider apiKey configured, the runner should auto-select the
// moonshot provider over google for the video capability.
it("auto-selects moonshot for video when google is unavailable", async () => {
await withEnvAsync(
{
GEMINI_API_KEY: undefined,
MOONSHOT_API_KEY: undefined,
},
async () => {
await withVideoFixture("openclaw-video-auto-moonshot", async ({ ctx, media, cache }) => {
const cfg = {
models: {
providers: {
moonshot: {
apiKey: "moonshot-key",
models: [],
},
},
},
tools: {
media: {
video: {
enabled: true,
},
},
},
} as unknown as OpenClawConfig;
const result = await runCapability({
capability: "video",
cfg,
ctx,
attachments: cache,
media,
// Both providers are registered; selection must fall to moonshot
// because google has no usable key in this environment.
providerRegistry: new Map([
[
"google",
{
id: "google",
capabilities: ["video"],
describeVideo: async () => ({ text: "google" }),
},
],
[
"moonshot",
{
id: "moonshot",
capabilities: ["video"],
describeVideo: async () => ({ text: "moonshot", model: "kimi-k2.5" }),
},
],
]),
});
expect(result.decision.outcome).toBe("success");
expect(result.outputs[0]?.provider).toBe("moonshot");
expect(result.outputs[0]?.text).toBe("moonshot");
});
},
);
});
});