feat(media): add moonshot video provider and wiring

Co-authored-by: xiaoyaner0201 <xiaoyaner0201@users.noreply.github.com>
This commit is contained in:
Peter Steinberger
2026-02-23 18:24:50 +00:00
parent e02c470d5e
commit 7837d23103
10 changed files with 385 additions and 4 deletions

View File

@@ -16,6 +16,7 @@ Docs: https://docs.openclaw.ai
- Agents/Context pruning: extend `cache-ttl` eligibility to Moonshot/Kimi and ZAI/GLM providers (including OpenRouter model refs), so `contextPruning.mode: "cache-ttl"` is no longer silently skipped for those sessions. (#24497) Thanks @lailoo.
- Tools/web_search: add `provider: "kimi"` (Moonshot) support with key/config schema wiring and a corrected two-step `$web_search` tool flow that echoes tool results before final synthesis, including citation extraction from search results. (#18822) Thanks @adshine.
- Media understanding/Video: add a native Moonshot video provider and include Moonshot in auto video key detection, plus refactor video execution to honor `entry/config/provider` baseUrl+header precedence (matching audio behavior). (#16616) Thanks @xiaoyaner0201.
- Sessions/Store: canonicalize inbound mixed-case session keys for metadata and route updates, and migrate legacy case-variant entries to a single lowercase key to prevent duplicate sessions and missing TUI/WebUI history. (#9561) Thanks @hillghost86.
- Telegram/Reactions: soft-fail reaction action errors (policy/token/emoji/API), accept snake_case `message_id`, and fallback to inbound message-id context when explicit `messageId` is omitted so DM reactions stay stable without regeneration loops. (#20236, #21001) Thanks @PeterShanxin and @vincentkoc.
- Telegram/Polling: scope persisted polling offsets to bot identity and reuse a single awaited runner-stop path on abort/retry, preventing cross-token offset bleed and overlapping pollers during restart/error recovery. (#10850, #11347) Thanks @talhaorak, @anooprdawar, and @vincentkoc.

View File

@@ -1,5 +1,9 @@
import { describe, expect, it } from "vitest";
import { AUTO_AUDIO_KEY_PROVIDERS, DEFAULT_AUDIO_MODELS } from "./defaults.js";
import {
AUTO_AUDIO_KEY_PROVIDERS,
AUTO_VIDEO_KEY_PROVIDERS,
DEFAULT_AUDIO_MODELS,
} from "./defaults.js";
describe("DEFAULT_AUDIO_MODELS", () => {
it("includes Mistral Voxtral default", () => {
@@ -12,3 +16,9 @@ describe("AUTO_AUDIO_KEY_PROVIDERS", () => {
expect(AUTO_AUDIO_KEY_PROVIDERS).toContain("mistral");
});
});
// Guards the auto key-detection list: "moonshot" must stay eligible for
// automatic video API-key resolution (see AUTO_VIDEO_KEY_PROVIDERS in defaults).
describe("AUTO_VIDEO_KEY_PROVIDERS", () => {
it("includes moonshot auto key resolution", () => {
expect(AUTO_VIDEO_KEY_PROVIDERS).toContain("moonshot");
});
});

View File

@@ -48,7 +48,7 @@ export const AUTO_IMAGE_KEY_PROVIDERS = [
"minimax",
"zai",
] as const;
export const AUTO_VIDEO_KEY_PROVIDERS = ["google"] as const;
export const AUTO_VIDEO_KEY_PROVIDERS = ["google", "moonshot"] as const;
export const DEFAULT_IMAGE_MODELS: Record<string, string> = {
openai: "gpt-5-mini",
anthropic: "claude-opus-4-6",

View File

@@ -16,4 +16,12 @@ describe("media-understanding provider registry", () => {
expect(provider?.id).toBe("google");
});
// The built registry must expose the Moonshot provider under id "moonshot"
// with exactly the image + video capabilities (order-sensitive via toEqual).
it("registers the Moonshot provider", () => {
const registry = buildMediaUnderstandingRegistry();
const provider = getMediaUnderstandingProvider("moonshot", registry);
expect(provider?.id).toBe("moonshot");
expect(provider?.capabilities).toEqual(["image", "video"]);
});
});

View File

@@ -6,6 +6,7 @@ import { googleProvider } from "./google/index.js";
import { groqProvider } from "./groq/index.js";
import { minimaxProvider } from "./minimax/index.js";
import { mistralProvider } from "./mistral/index.js";
import { moonshotProvider } from "./moonshot/index.js";
import { openaiProvider } from "./openai/index.js";
import { zaiProvider } from "./zai/index.js";
@@ -15,6 +16,7 @@ const PROVIDERS: MediaUnderstandingProvider[] = [
googleProvider,
anthropicProvider,
minimaxProvider,
moonshotProvider,
mistralProvider,
zaiProvider,
deepgramProvider,

View File

@@ -0,0 +1,10 @@
import type { MediaUnderstandingProvider } from "../../types.js";
import { describeImageWithModel } from "../image.js";
import { describeMoonshotVideo } from "./video.js";
// Moonshot media-understanding provider registration.
// Images reuse the shared model-based image describer; video goes through
// the Moonshot-specific chat-completions flow in ./video.js.
export const moonshotProvider: MediaUnderstandingProvider = {
id: "moonshot",
capabilities: ["image", "video"],
describeImage: describeImageWithModel,
describeVideo: describeMoonshotVideo,
};

View File

@@ -0,0 +1,72 @@
import { describe, expect, it } from "vitest";
import {
createRequestCaptureJsonFetch,
installPinnedHostnameTestHooks,
} from "../audio.test-helpers.js";
import { describeMoonshotVideo } from "./video.js";
// Installs test hooks for pinned-hostname fetches — presumably prevents real
// network access in these tests; see audio.test-helpers.js — TODO confirm.
installPinnedHostnameTestHooks();
// Unit tests for the Moonshot video describer: outgoing request shape,
// header precedence, and reasoning_content fallback extraction.
describe("describeMoonshotVideo", () => {
// Captures the request made through the injected fetch and checks:
// endpoint URL (trailing slash on baseUrl normalized away), POST + abort
// signal, default authorization/content-type headers merged with caller
// headers, and an OpenAI-style messages payload carrying the clip as a
// base64 data URL.
it("builds an OpenAI-compatible video request", async () => {
const { fetchFn, getRequest } = createRequestCaptureJsonFetch({
choices: [{ message: { content: "video ok" } }],
});
const result = await describeMoonshotVideo({
buffer: Buffer.from("video-bytes"),
fileName: "clip.mp4",
apiKey: "moonshot-test",
timeoutMs: 1500,
baseUrl: "https://api.moonshot.ai/v1/",
model: "kimi-k2.5",
headers: { "X-Trace": "1" },
fetchFn,
});
const { url, init } = getRequest();
expect(result.text).toBe("video ok");
expect(result.model).toBe("kimi-k2.5");
expect(url).toBe("https://api.moonshot.ai/v1/chat/completions");
expect(init?.method).toBe("POST");
expect(init?.signal).toBeInstanceOf(AbortSignal);
// Headers lookups are case-insensitive; caller's X-Trace must survive the merge.
const headers = new Headers(init?.headers);
expect(headers.get("authorization")).toBe("Bearer moonshot-test");
expect(headers.get("content-type")).toBe("application/json");
expect(headers.get("x-trace")).toBe("1");
const body = JSON.parse(typeof init?.body === "string" ? init.body : "{}") as {
model?: string;
messages?: Array<{
content?: Array<{ type?: string; text?: string; video_url?: { url?: string } }>;
}>;
};
expect(body.model).toBe("kimi-k2.5");
// Default prompt is used since none was supplied above.
expect(body.messages?.[0]?.content?.[0]).toMatchObject({
type: "text",
text: "Describe the video.",
});
expect(body.messages?.[0]?.content?.[1]?.type).toBe("video_url");
expect(body.messages?.[0]?.content?.[1]?.video_url?.url).toBe(
`data:video/mp4;base64,${Buffer.from("video-bytes").toString("base64")}`,
);
});
// When choices[0].message.content is an empty string, the describer must
// fall back to reasoning_content, and the default model id is reported.
it("falls back to reasoning_content when content is empty", async () => {
const { fetchFn } = createRequestCaptureJsonFetch({
choices: [{ message: { content: "", reasoning_content: "reasoned answer" } }],
});
const result = await describeMoonshotVideo({
buffer: Buffer.from("video"),
fileName: "clip.mp4",
apiKey: "moonshot-test",
timeoutMs: 1000,
fetchFn,
});
expect(result.text).toBe("reasoned answer");
expect(result.model).toBe("kimi-k2.5");
});
});

View File

@@ -0,0 +1,109 @@
import type { VideoDescriptionRequest, VideoDescriptionResult } from "../../types.js";
import { assertOkOrThrowHttpError, fetchWithTimeoutGuarded, normalizeBaseUrl } from "../shared.js";
/** Default Moonshot OpenAI-compatible API root, used when no baseUrl override is supplied. */
export const DEFAULT_MOONSHOT_VIDEO_BASE_URL = "https://api.moonshot.ai/v1";
/** Fallback model id for video description requests. */
const DEFAULT_MOONSHOT_VIDEO_MODEL = "kimi-k2.5";
/** Fallback user prompt when the caller supplies none. */
const DEFAULT_MOONSHOT_VIDEO_PROMPT = "Describe the video.";

/** Minimal shape of the Moonshot chat-completions response that we read from. */
type MoonshotVideoPayload = {
  choices?: Array<{
    message?: {
      content?: string | Array<{ text?: string }>;
      reasoning_content?: string;
    };
  }>;
};

/** Returns the trimmed caller-supplied model id, or the Moonshot default when blank/undefined. */
function resolveModel(model?: string): string {
  const candidate = (model ?? "").trim();
  return candidate === "" ? DEFAULT_MOONSHOT_VIDEO_MODEL : candidate;
}

/** Returns the trimmed caller-supplied prompt, or the default prompt when blank/undefined. */
function resolvePrompt(prompt?: string): string {
  const candidate = (prompt ?? "").trim();
  return candidate === "" ? DEFAULT_MOONSHOT_VIDEO_PROMPT : candidate;
}

/**
 * Extracts the description text from a Moonshot chat-completions payload.
 * Precedence: string content → array-of-parts content (non-empty parts joined
 * with newlines) → reasoning_content. Returns null when no non-empty text exists.
 */
function coerceMoonshotText(payload: MoonshotVideoPayload): string | null {
  const message = payload.choices?.[0]?.message;
  if (!message) {
    return null;
  }
  const { content, reasoning_content: reasoning } = message;
  if (typeof content === "string") {
    const direct = content.trim();
    if (direct) {
      return direct;
    }
  }
  if (Array.isArray(content)) {
    const pieces: string[] = [];
    for (const part of content) {
      const piece = typeof part.text === "string" ? part.text.trim() : "";
      if (piece) {
        pieces.push(piece);
      }
    }
    const joined = pieces.join("\n").trim();
    if (joined) {
      return joined;
    }
  }
  if (typeof reasoning === "string") {
    const fallback = reasoning.trim();
    if (fallback) {
      return fallback;
    }
  }
  return null;
}
/**
 * Describes a video via Moonshot's OpenAI-compatible chat-completions endpoint.
 *
 * The clip is inlined as a base64 data URL inside a `video_url` content part.
 * Caller-supplied headers take precedence; `content-type` and `authorization`
 * are only filled in when the caller did not already set them.
 *
 * @param params buffer/mime of the clip plus apiKey, optional baseUrl, model,
 *   prompt, headers, timeout, and an injectable fetch for testing.
 * @returns the extracted description text and the model id that was used.
 * @throws when the HTTP response is not OK, or the payload carries no text.
 */
export async function describeMoonshotVideo(
  params: VideoDescriptionRequest,
): Promise<VideoDescriptionResult> {
  const doFetch = params.fetchFn ?? fetch;
  const root = normalizeBaseUrl(params.baseUrl, DEFAULT_MOONSHOT_VIDEO_BASE_URL);
  const model = resolveModel(params.model);
  const mimeType = params.mime ?? "video/mp4";
  const endpoint = `${root}/chat/completions`;

  // Start from the caller's headers so explicit overrides always win.
  const requestHeaders = new Headers(params.headers);
  if (!requestHeaders.has("content-type")) {
    requestHeaders.set("content-type", "application/json");
  }
  if (!requestHeaders.has("authorization")) {
    requestHeaders.set("authorization", `Bearer ${params.apiKey}`);
  }

  // Whole clip travels inline as a data URL — no upload step.
  const dataUrl = `data:${mimeType};base64,${params.buffer.toString("base64")}`;
  const requestBody = JSON.stringify({
    model,
    messages: [
      {
        role: "user",
        content: [
          { type: "text", text: resolvePrompt(params.prompt) },
          { type: "video_url", video_url: { url: dataUrl } },
        ],
      },
    ],
  });

  const { response: res, release } = await fetchWithTimeoutGuarded(
    endpoint,
    { method: "POST", headers: requestHeaders, body: requestBody },
    params.timeoutMs,
    doFetch,
  );
  try {
    await assertOkOrThrowHttpError(res, "Moonshot video description failed");
    const payload = (await res.json()) as MoonshotVideoPayload;
    const text = coerceMoonshotText(payload);
    if (!text) {
      throw new Error("Moonshot video description response missing content");
    }
    return { text, model };
  } finally {
    // Always release the timeout guard, even on error paths.
    await release();
  }
}

View File

@@ -497,6 +497,13 @@ export async function runProviderEntry(params: {
entry,
agentDir: params.agentDir,
});
const baseUrl = entry.baseUrl ?? params.config?.baseUrl ?? providerConfig?.baseUrl;
const mergedHeaders = {
...providerConfig?.headers,
...params.config?.headers,
...entry.headers,
};
const headers = Object.keys(mergedHeaders).length > 0 ? mergedHeaders : undefined;
const result = await executeWithApiKeyRotation({
provider: providerId,
apiKeys,
@@ -506,8 +513,8 @@ export async function runProviderEntry(params: {
fileName: media.fileName,
mime: media.mime,
apiKey,
baseUrl: providerConfig?.baseUrl,
headers: providerConfig?.headers,
baseUrl,
headers,
model: entry.model,
prompt,
timeoutMs,

View File

@@ -0,0 +1,162 @@
import fs from "node:fs/promises";
import os from "node:os";
import path from "node:path";
import { describe, expect, it } from "vitest";
import type { OpenClawConfig } from "../config/config.js";
import { withEnvAsync } from "../test-utils/env.js";
import { createMediaAttachmentCache, normalizeMediaAttachments, runCapability } from "./runner.js";
// Test harness: writes a throwaway .mp4 fixture to the OS temp dir, builds the
// media attachment context/cache for it, runs the supplied callback, and always
// cleans up the cache and the temp file afterwards.
async function withVideoFixture(
filePrefix: string,
run: (params: {
ctx: { MediaPath: string; MediaType: string };
media: ReturnType<typeof normalizeMediaAttachments>;
cache: ReturnType<typeof createMediaAttachmentCache>;
}) => Promise<void>,
) {
// Date.now() suffix keeps concurrent test runs from colliding on the same path.
const tmpPath = path.join(os.tmpdir(), `${filePrefix}-${Date.now().toString()}.mp4`);
await fs.writeFile(tmpPath, Buffer.from("video"));
const ctx = { MediaPath: tmpPath, MediaType: "video/mp4" };
const media = normalizeMediaAttachments(ctx);
const cache = createMediaAttachmentCache(media, {
localPathRoots: [path.dirname(tmpPath)],
});
try {
// Empty PATH — presumably blocks the runner from shelling out to external
// tools (e.g. ffmpeg) during the test; TODO confirm against runner internals.
await withEnvAsync({ PATH: "" }, async () => {
await run({ ctx, media, cache });
});
} finally {
await cache.cleanup();
// Best-effort removal; ignore failures if the file is already gone.
await fs.unlink(tmpPath).catch(() => {});
}
}
// Integration-style tests for video provider wiring in runCapability:
// baseUrl/header merge precedence and automatic provider selection.
describe("runCapability video provider wiring", () => {
// Configures the same settings at provider, tools.media.video (config), and
// model-entry level, then asserts the entry's baseUrl wins and all three
// header layers are merged into the request seen by the provider.
it("merges video baseUrl and headers with entry precedence", async () => {
let seenBaseUrl: string | undefined;
let seenHeaders: Record<string, string> | undefined;
await withVideoFixture("openclaw-video-merge", async ({ ctx, media, cache }) => {
const cfg = {
models: {
providers: {
moonshot: {
apiKey: "provider-key",
baseUrl: "https://provider.example/v1",
headers: { "X-Provider": "1" },
models: [],
},
},
},
tools: {
media: {
video: {
enabled: true,
baseUrl: "https://config.example/v1",
headers: { "X-Config": "2" },
models: [
{
provider: "moonshot",
model: "kimi-k2.5",
baseUrl: "https://entry.example/v1",
headers: { "X-Entry": "3" },
},
],
},
},
},
} as unknown as OpenClawConfig;
const result = await runCapability({
capability: "video",
cfg,
ctx,
attachments: cache,
media,
// Stub registry: captures the request the runner hands to the provider.
providerRegistry: new Map([
[
"moonshot",
{
id: "moonshot",
capabilities: ["video"],
describeVideo: async (req) => {
seenBaseUrl = req.baseUrl;
seenHeaders = req.headers;
return { text: "video ok", model: req.model };
},
},
],
]),
});
expect(result.outputs[0]?.text).toBe("video ok");
expect(result.outputs[0]?.provider).toBe("moonshot");
// Entry-level baseUrl overrides config- and provider-level values.
expect(seenBaseUrl).toBe("https://entry.example/v1");
// All three header layers are merged into the outgoing request.
expect(seenHeaders).toMatchObject({
"X-Provider": "1",
"X-Config": "2",
"X-Entry": "3",
});
});
});
// With no explicit model entries, no Gemini/Moonshot env keys, and only a
// moonshot provider apiKey configured, the runner should auto-select the
// moonshot provider over google for the video capability.
it("auto-selects moonshot for video when google is unavailable", async () => {
await withEnvAsync(
{
GEMINI_API_KEY: undefined,
MOONSHOT_API_KEY: undefined,
},
async () => {
await withVideoFixture("openclaw-video-auto-moonshot", async ({ ctx, media, cache }) => {
const cfg = {
models: {
providers: {
moonshot: {
apiKey: "moonshot-key",
models: [],
},
},
},
tools: {
media: {
video: {
enabled: true,
},
},
},
} as unknown as OpenClawConfig;
const result = await runCapability({
capability: "video",
cfg,
ctx,
attachments: cache,
media,
// Both providers are registered; selection must fall to moonshot
// because google has no usable key in this environment.
providerRegistry: new Map([
[
"google",
{
id: "google",
capabilities: ["video"],
describeVideo: async () => ({ text: "google" }),
},
],
[
"moonshot",
{
id: "moonshot",
capabilities: ["video"],
describeVideo: async () => ({ text: "moonshot", model: "kimi-k2.5" }),
},
],
]),
});
expect(result.decision.outcome).toBe("success");
expect(result.outputs[0]?.provider).toBe("moonshot");
expect(result.outputs[0]?.text).toBe("moonshot");
});
},
);
});
});