mirror of
https://github.com/moltbot/moltbot.git
synced 2026-04-23 22:55:24 +00:00
1025 lines
34 KiB
TypeScript
1025 lines
34 KiB
TypeScript
import { Type } from "@sinclair/typebox";
|
|
import type { OpenClawConfig } from "../../config/config.js";
|
|
import { loadConfig } from "../../config/config.js";
|
|
import { createSubsystemLogger } from "../../logging/subsystem.js";
|
|
import { saveMediaBuffer } from "../../media/store.js";
|
|
import { loadWebMedia } from "../../media/web-media.js";
|
|
import { readSnakeCaseParamRaw } from "../../param-key.js";
|
|
import { getProviderEnvVars } from "../../secrets/provider-env-vars.js";
|
|
import { resolveUserPath } from "../../utils.js";
|
|
import type { DeliveryContext } from "../../utils/delivery-context.js";
|
|
import { resolveVideoGenerationSupportedDurations } from "../../video-generation/duration-support.js";
|
|
import { parseVideoGenerationModelRef } from "../../video-generation/model-ref.js";
|
|
import {
|
|
generateVideo,
|
|
listRuntimeVideoGenerationProviders,
|
|
} from "../../video-generation/runtime.js";
|
|
import type {
|
|
VideoGenerationProvider,
|
|
VideoGenerationResolution,
|
|
VideoGenerationSourceAsset,
|
|
} from "../../video-generation/types.js";
|
|
import { normalizeProviderId } from "../provider-id.js";
|
|
import {
|
|
ToolInputError,
|
|
readNumberParam,
|
|
readStringArrayParam,
|
|
readStringParam,
|
|
} from "./common.js";
|
|
import { decodeDataUrl } from "./image-tool.helpers.js";
|
|
import {
|
|
applyVideoGenerationModelConfigDefaults,
|
|
resolveMediaToolLocalRoots,
|
|
} from "./media-tool-shared.js";
|
|
import {
|
|
buildToolModelConfigFromCandidates,
|
|
coerceToolModelConfig,
|
|
hasAuthForProvider,
|
|
hasToolModelConfig,
|
|
resolveDefaultModelRef,
|
|
type ToolModelConfig,
|
|
} from "./model-config.helpers.js";
|
|
import {
|
|
createSandboxBridgeReadFile,
|
|
resolveSandboxedBridgeMediaPath,
|
|
type AnyAgentTool,
|
|
type SandboxFsBridge,
|
|
type ToolFsPolicy,
|
|
} from "./tool-runtime.helpers.js";
|
|
import {
|
|
completeVideoGenerationTaskRun,
|
|
createVideoGenerationTaskRun,
|
|
failVideoGenerationTaskRun,
|
|
recordVideoGenerationTaskProgress,
|
|
type VideoGenerationTaskHandle,
|
|
wakeVideoGenerationTaskCompletion,
|
|
} from "./video-generate-background.js";
|
|
|
|
const log = createSubsystemLogger("agents/tools/video-generate");
|
|
const MAX_INPUT_IMAGES = 5;
|
|
const MAX_INPUT_VIDEOS = 4;
|
|
const SUPPORTED_ASPECT_RATIOS = new Set([
|
|
"1:1",
|
|
"2:3",
|
|
"3:2",
|
|
"3:4",
|
|
"4:3",
|
|
"4:5",
|
|
"5:4",
|
|
"9:16",
|
|
"16:9",
|
|
"21:9",
|
|
]);
|
|
|
|
const VideoGenerateToolSchema = Type.Object({
|
|
action: Type.Optional(
|
|
Type.String({
|
|
description:
|
|
'Optional action: "generate" (default) or "list" to inspect available providers/models.',
|
|
}),
|
|
),
|
|
prompt: Type.Optional(Type.String({ description: "Video generation prompt." })),
|
|
image: Type.Optional(
|
|
Type.String({
|
|
description: "Optional single reference image path or URL.",
|
|
}),
|
|
),
|
|
images: Type.Optional(
|
|
Type.Array(Type.String(), {
|
|
description: `Optional reference images (up to ${MAX_INPUT_IMAGES}).`,
|
|
}),
|
|
),
|
|
video: Type.Optional(
|
|
Type.String({
|
|
description: "Optional single reference video path or URL.",
|
|
}),
|
|
),
|
|
videos: Type.Optional(
|
|
Type.Array(Type.String(), {
|
|
description: `Optional reference videos (up to ${MAX_INPUT_VIDEOS}).`,
|
|
}),
|
|
),
|
|
model: Type.Optional(
|
|
Type.String({ description: "Optional provider/model override, e.g. qwen/wan2.6-t2v." }),
|
|
),
|
|
filename: Type.Optional(
|
|
Type.String({
|
|
description:
|
|
"Optional output filename hint. OpenClaw preserves the basename and saves under its managed media directory.",
|
|
}),
|
|
),
|
|
size: Type.Optional(
|
|
Type.String({
|
|
description: "Optional size hint like 1280x720 or 1920x1080 when the provider supports it.",
|
|
}),
|
|
),
|
|
aspectRatio: Type.Optional(
|
|
Type.String({
|
|
description:
|
|
"Optional aspect ratio hint: 1:1, 2:3, 3:2, 3:4, 4:3, 4:5, 5:4, 9:16, 16:9, or 21:9.",
|
|
}),
|
|
),
|
|
resolution: Type.Optional(
|
|
Type.String({
|
|
description: "Optional resolution hint: 480P, 720P, or 1080P.",
|
|
}),
|
|
),
|
|
durationSeconds: Type.Optional(
|
|
Type.Number({
|
|
description:
|
|
"Optional target duration in seconds. OpenClaw may round this to the nearest provider-supported duration.",
|
|
minimum: 1,
|
|
}),
|
|
),
|
|
audio: Type.Optional(
|
|
Type.Boolean({
|
|
description: "Optional audio toggle when the provider supports generated audio.",
|
|
}),
|
|
),
|
|
watermark: Type.Optional(
|
|
Type.Boolean({
|
|
description: "Optional watermark toggle when the provider supports it.",
|
|
}),
|
|
),
|
|
});
|
|
|
|
function getVideoGenerationProviderAuthEnvVars(providerId: string): string[] {
|
|
return getProviderEnvVars(providerId);
|
|
}
|
|
|
|
function resolveVideoGenerationModelCandidates(params: {
|
|
cfg?: OpenClawConfig;
|
|
agentDir?: string;
|
|
}): Array<string | undefined> {
|
|
const providerDefaults = new Map<string, string>();
|
|
for (const provider of listRuntimeVideoGenerationProviders({ config: params.cfg })) {
|
|
const providerId = provider.id.trim();
|
|
const modelId = provider.defaultModel?.trim();
|
|
if (
|
|
!providerId ||
|
|
!modelId ||
|
|
providerDefaults.has(providerId) ||
|
|
!isVideoGenerationProviderConfigured({
|
|
provider,
|
|
cfg: params.cfg,
|
|
agentDir: params.agentDir,
|
|
})
|
|
) {
|
|
continue;
|
|
}
|
|
providerDefaults.set(providerId, `${providerId}/${modelId}`);
|
|
}
|
|
|
|
const primaryProvider = resolveDefaultModelRef(params.cfg).provider;
|
|
const orderedProviders = [
|
|
primaryProvider,
|
|
...[...providerDefaults.keys()]
|
|
.filter((providerId) => providerId !== primaryProvider)
|
|
.toSorted(),
|
|
];
|
|
const orderedRefs: string[] = [];
|
|
const seen = new Set<string>();
|
|
for (const providerId of orderedProviders) {
|
|
const ref = providerDefaults.get(providerId);
|
|
if (!ref || seen.has(ref)) {
|
|
continue;
|
|
}
|
|
seen.add(ref);
|
|
orderedRefs.push(ref);
|
|
}
|
|
return orderedRefs;
|
|
}
|
|
|
|
export function resolveVideoGenerationModelConfigForTool(params: {
|
|
cfg?: OpenClawConfig;
|
|
agentDir?: string;
|
|
}): ToolModelConfig | null {
|
|
const explicit = coerceToolModelConfig(params.cfg?.agents?.defaults?.videoGenerationModel);
|
|
if (hasToolModelConfig(explicit)) {
|
|
return explicit;
|
|
}
|
|
return buildToolModelConfigFromCandidates({
|
|
explicit,
|
|
agentDir: params.agentDir,
|
|
candidates: resolveVideoGenerationModelCandidates(params),
|
|
isProviderConfigured: (providerId) =>
|
|
isVideoGenerationProviderConfigured({
|
|
providerId,
|
|
cfg: params.cfg,
|
|
agentDir: params.agentDir,
|
|
}),
|
|
});
|
|
}
|
|
|
|
function isVideoGenerationProviderConfigured(params: {
|
|
provider?: VideoGenerationProvider;
|
|
providerId?: string;
|
|
cfg?: OpenClawConfig;
|
|
agentDir?: string;
|
|
}): boolean {
|
|
const provider =
|
|
params.provider ??
|
|
listRuntimeVideoGenerationProviders({ config: params.cfg }).find((candidate) => {
|
|
const normalizedId = normalizeProviderId(params.providerId ?? "");
|
|
return (
|
|
normalizeProviderId(candidate.id) === normalizedId ||
|
|
(candidate.aliases ?? []).some((alias) => normalizeProviderId(alias) === normalizedId)
|
|
);
|
|
});
|
|
if (!provider) {
|
|
return params.providerId
|
|
? hasAuthForProvider({ provider: params.providerId, agentDir: params.agentDir })
|
|
: false;
|
|
}
|
|
if (provider.isConfigured) {
|
|
return provider.isConfigured({
|
|
cfg: params.cfg,
|
|
agentDir: params.agentDir,
|
|
});
|
|
}
|
|
return hasAuthForProvider({ provider: provider.id, agentDir: params.agentDir });
|
|
}
|
|
|
|
function resolveAction(args: Record<string, unknown>): "generate" | "list" {
|
|
const raw = readStringParam(args, "action");
|
|
if (!raw) {
|
|
return "generate";
|
|
}
|
|
const normalized = raw.trim().toLowerCase();
|
|
if (normalized === "generate" || normalized === "list") {
|
|
return normalized;
|
|
}
|
|
throw new ToolInputError('action must be "generate" or "list"');
|
|
}
|
|
|
|
function normalizeResolution(raw: string | undefined): VideoGenerationResolution | undefined {
|
|
const normalized = raw?.trim().toUpperCase();
|
|
if (!normalized) {
|
|
return undefined;
|
|
}
|
|
if (normalized === "480P" || normalized === "720P" || normalized === "1080P") {
|
|
return normalized;
|
|
}
|
|
throw new ToolInputError("resolution must be one of 480P, 720P, or 1080P");
|
|
}
|
|
|
|
function normalizeAspectRatio(raw: string | undefined): string | undefined {
|
|
const normalized = raw?.trim();
|
|
if (!normalized) {
|
|
return undefined;
|
|
}
|
|
if (SUPPORTED_ASPECT_RATIOS.has(normalized)) {
|
|
return normalized;
|
|
}
|
|
throw new ToolInputError(
|
|
"aspectRatio must be one of 1:1, 2:3, 3:2, 3:4, 4:3, 4:5, 5:4, 9:16, 16:9, or 21:9",
|
|
);
|
|
}
|
|
|
|
function readBooleanParam(params: Record<string, unknown>, key: string): boolean | undefined {
|
|
const raw = readSnakeCaseParamRaw(params, key);
|
|
if (typeof raw === "boolean") {
|
|
return raw;
|
|
}
|
|
if (typeof raw === "string") {
|
|
const normalized = raw.trim().toLowerCase();
|
|
if (normalized === "true") {
|
|
return true;
|
|
}
|
|
if (normalized === "false") {
|
|
return false;
|
|
}
|
|
}
|
|
return undefined;
|
|
}
|
|
|
|
function normalizeReferenceInputs(params: {
|
|
args: Record<string, unknown>;
|
|
singularKey: "image" | "video";
|
|
pluralKey: "images" | "videos";
|
|
maxCount: number;
|
|
}): string[] {
|
|
const single = readStringParam(params.args, params.singularKey);
|
|
const multiple = readStringArrayParam(params.args, params.pluralKey);
|
|
const combined = [...(single ? [single] : []), ...(multiple ?? [])];
|
|
const deduped: string[] = [];
|
|
const seen = new Set<string>();
|
|
for (const candidate of combined) {
|
|
const trimmed = candidate.trim();
|
|
const dedupe = trimmed.startsWith("@") ? trimmed.slice(1).trim() : trimmed;
|
|
if (!dedupe || seen.has(dedupe)) {
|
|
continue;
|
|
}
|
|
seen.add(dedupe);
|
|
deduped.push(trimmed);
|
|
}
|
|
if (deduped.length > params.maxCount) {
|
|
throw new ToolInputError(
|
|
`Too many reference ${params.pluralKey}: ${deduped.length} provided, maximum is ${params.maxCount}.`,
|
|
);
|
|
}
|
|
return deduped;
|
|
}
|
|
|
|
function resolveSelectedVideoGenerationProvider(params: {
|
|
config?: OpenClawConfig;
|
|
videoGenerationModelConfig: ToolModelConfig;
|
|
modelOverride?: string;
|
|
}): VideoGenerationProvider | undefined {
|
|
const selectedRef =
|
|
parseVideoGenerationModelRef(params.modelOverride) ??
|
|
parseVideoGenerationModelRef(params.videoGenerationModelConfig.primary);
|
|
if (!selectedRef) {
|
|
return undefined;
|
|
}
|
|
const selectedProvider = normalizeProviderId(selectedRef.provider);
|
|
return listRuntimeVideoGenerationProviders({ config: params.config }).find(
|
|
(provider) =>
|
|
normalizeProviderId(provider.id) === selectedProvider ||
|
|
(provider.aliases ?? []).some((alias) => normalizeProviderId(alias) === selectedProvider),
|
|
);
|
|
}
|
|
|
|
function validateVideoGenerationCapabilities(params: {
|
|
provider: VideoGenerationProvider | undefined;
|
|
model?: string;
|
|
inputImageCount: number;
|
|
inputVideoCount: number;
|
|
size?: string;
|
|
aspectRatio?: string;
|
|
resolution?: VideoGenerationResolution;
|
|
durationSeconds?: number;
|
|
audio?: boolean;
|
|
watermark?: boolean;
|
|
}) {
|
|
const provider = params.provider;
|
|
if (!provider) {
|
|
return;
|
|
}
|
|
const caps = provider.capabilities;
|
|
if (params.inputImageCount > 0) {
|
|
const maxInputImages = caps.maxInputImages ?? MAX_INPUT_IMAGES;
|
|
if (params.inputImageCount > maxInputImages) {
|
|
throw new ToolInputError(
|
|
`${provider.id} supports at most ${maxInputImages} reference image${maxInputImages === 1 ? "" : "s"}.`,
|
|
);
|
|
}
|
|
}
|
|
if (params.inputVideoCount > 0) {
|
|
const maxInputVideos = caps.maxInputVideos ?? MAX_INPUT_VIDEOS;
|
|
if (params.inputVideoCount > maxInputVideos) {
|
|
throw new ToolInputError(
|
|
`${provider.id} supports at most ${maxInputVideos} reference video${maxInputVideos === 1 ? "" : "s"}.`,
|
|
);
|
|
}
|
|
}
|
|
if (params.size && !caps.supportsSize) {
|
|
throw new ToolInputError(`${provider.id} does not support size overrides.`);
|
|
}
|
|
if (params.aspectRatio && !caps.supportsAspectRatio) {
|
|
throw new ToolInputError(`${provider.id} does not support aspectRatio overrides.`);
|
|
}
|
|
if (params.resolution && !caps.supportsResolution) {
|
|
throw new ToolInputError(`${provider.id} does not support resolution overrides.`);
|
|
}
|
|
if (
|
|
typeof params.durationSeconds === "number" &&
|
|
Number.isFinite(params.durationSeconds) &&
|
|
!resolveVideoGenerationSupportedDurations({
|
|
provider,
|
|
model: params.model,
|
|
}) &&
|
|
typeof caps.maxDurationSeconds === "number" &&
|
|
params.durationSeconds > caps.maxDurationSeconds
|
|
) {
|
|
throw new ToolInputError(
|
|
`${provider.id} supports at most ${caps.maxDurationSeconds} seconds per video.`,
|
|
);
|
|
}
|
|
if (typeof params.audio === "boolean" && !caps.supportsAudio) {
|
|
throw new ToolInputError(`${provider.id} does not support audio toggles.`);
|
|
}
|
|
if (typeof params.watermark === "boolean" && !caps.supportsWatermark) {
|
|
throw new ToolInputError(`${provider.id} does not support watermark toggles.`);
|
|
}
|
|
}
|
|
|
|
type VideoGenerateSandboxConfig = {
|
|
root: string;
|
|
bridge: SandboxFsBridge;
|
|
};
|
|
|
|
type VideoGenerateBackgroundScheduler = (work: () => Promise<void>) => void;
|
|
|
|
function defaultScheduleVideoGenerateBackgroundWork(work: () => Promise<void>) {
|
|
queueMicrotask(() => {
|
|
void work().catch((error) => {
|
|
log.error("Detached video generation job crashed", {
|
|
error,
|
|
});
|
|
});
|
|
});
|
|
}
|
|
|
|
async function loadReferenceAssets(params: {
|
|
inputs: string[];
|
|
expectedKind: "image" | "video";
|
|
maxBytes?: number;
|
|
workspaceDir?: string;
|
|
sandboxConfig: { root: string; bridge: SandboxFsBridge; workspaceOnly: boolean } | null;
|
|
}): Promise<
|
|
Array<{
|
|
sourceAsset: VideoGenerationSourceAsset;
|
|
resolvedInput: string;
|
|
rewrittenFrom?: string;
|
|
}>
|
|
> {
|
|
const loaded: Array<{
|
|
sourceAsset: VideoGenerationSourceAsset;
|
|
resolvedInput: string;
|
|
rewrittenFrom?: string;
|
|
}> = [];
|
|
|
|
for (const rawInput of params.inputs) {
|
|
const trimmed = rawInput.trim();
|
|
const inputRaw = trimmed.startsWith("@") ? trimmed.slice(1).trim() : trimmed;
|
|
if (!inputRaw) {
|
|
throw new ToolInputError(`${params.expectedKind} required (empty string in array)`);
|
|
}
|
|
const looksLikeWindowsDrivePath = /^[a-zA-Z]:[\\/]/.test(inputRaw);
|
|
const hasScheme = /^[a-z][a-z0-9+.-]*:/i.test(inputRaw);
|
|
const isFileUrl = /^file:/i.test(inputRaw);
|
|
const isHttpUrl = /^https?:\/\//i.test(inputRaw);
|
|
const isDataUrl = /^data:/i.test(inputRaw);
|
|
if (hasScheme && !looksLikeWindowsDrivePath && !isFileUrl && !isHttpUrl && !isDataUrl) {
|
|
throw new ToolInputError(
|
|
`Unsupported ${params.expectedKind} reference: ${rawInput}. Use a file path, a file:// URL, a data: URL, or an http(s) URL.`,
|
|
);
|
|
}
|
|
if (params.sandboxConfig && isHttpUrl) {
|
|
throw new ToolInputError(
|
|
`Sandboxed video_generate does not allow remote ${params.expectedKind} URLs.`,
|
|
);
|
|
}
|
|
|
|
const resolvedInput = (() => {
|
|
if (params.sandboxConfig) {
|
|
return inputRaw;
|
|
}
|
|
if (inputRaw.startsWith("~")) {
|
|
return resolveUserPath(inputRaw);
|
|
}
|
|
return inputRaw;
|
|
})();
|
|
|
|
if (isHttpUrl && !params.sandboxConfig) {
|
|
loaded.push({
|
|
sourceAsset: { url: resolvedInput },
|
|
resolvedInput,
|
|
});
|
|
continue;
|
|
}
|
|
|
|
const resolvedPathInfo: { resolved: string; rewrittenFrom?: string } = isDataUrl
|
|
? { resolved: "" }
|
|
: params.sandboxConfig
|
|
? await resolveSandboxedBridgeMediaPath({
|
|
sandbox: params.sandboxConfig,
|
|
mediaPath: resolvedInput,
|
|
inboundFallbackDir: "media/inbound",
|
|
})
|
|
: {
|
|
resolved: resolvedInput.startsWith("file://")
|
|
? resolvedInput.slice("file://".length)
|
|
: resolvedInput,
|
|
};
|
|
const resolvedPath = isDataUrl ? null : resolvedPathInfo.resolved;
|
|
const localRoots = resolveMediaToolLocalRoots(
|
|
params.workspaceDir,
|
|
{
|
|
workspaceOnly: params.sandboxConfig?.workspaceOnly === true,
|
|
},
|
|
resolvedPath ? [resolvedPath] : undefined,
|
|
);
|
|
const media = isDataUrl
|
|
? params.expectedKind === "image"
|
|
? decodeDataUrl(resolvedInput)
|
|
: (() => {
|
|
throw new ToolInputError("Video data: URLs are not supported for video_generate.");
|
|
})()
|
|
: params.sandboxConfig
|
|
? await loadWebMedia(resolvedPath ?? resolvedInput, {
|
|
maxBytes: params.maxBytes,
|
|
sandboxValidated: true,
|
|
readFile: createSandboxBridgeReadFile({ sandbox: params.sandboxConfig }),
|
|
})
|
|
: await loadWebMedia(resolvedPath ?? resolvedInput, {
|
|
maxBytes: params.maxBytes,
|
|
localRoots,
|
|
});
|
|
if (media.kind !== params.expectedKind) {
|
|
throw new ToolInputError(`Unsupported media type: ${media.kind ?? "unknown"}`);
|
|
}
|
|
const mimeType = "mimeType" in media ? media.mimeType : media.contentType;
|
|
const fileName = "fileName" in media ? media.fileName : undefined;
|
|
loaded.push({
|
|
sourceAsset: {
|
|
buffer: media.buffer,
|
|
mimeType,
|
|
fileName,
|
|
},
|
|
resolvedInput,
|
|
...(resolvedPathInfo.rewrittenFrom ? { rewrittenFrom: resolvedPathInfo.rewrittenFrom } : {}),
|
|
});
|
|
}
|
|
|
|
return loaded;
|
|
}
|
|
|
|
type LoadedReferenceAsset = Awaited<ReturnType<typeof loadReferenceAssets>>[number];
|
|
|
|
type ExecutedVideoGeneration = {
|
|
provider: string;
|
|
model: string;
|
|
savedPaths: string[];
|
|
contentText: string;
|
|
details: Record<string, unknown>;
|
|
wakeResult: string;
|
|
};
|
|
|
|
async function executeVideoGenerationJob(params: {
|
|
effectiveCfg: OpenClawConfig;
|
|
prompt: string;
|
|
agentDir?: string;
|
|
model?: string;
|
|
size?: string;
|
|
aspectRatio?: string;
|
|
resolution?: VideoGenerationResolution;
|
|
durationSeconds?: number;
|
|
audio?: boolean;
|
|
watermark?: boolean;
|
|
filename?: string;
|
|
loadedReferenceImages: LoadedReferenceAsset[];
|
|
loadedReferenceVideos: LoadedReferenceAsset[];
|
|
taskHandle?: VideoGenerationTaskHandle | null;
|
|
}): Promise<ExecutedVideoGeneration> {
|
|
if (params.taskHandle) {
|
|
recordVideoGenerationTaskProgress({
|
|
handle: params.taskHandle,
|
|
progressSummary: "Generating video",
|
|
});
|
|
}
|
|
const result = await generateVideo({
|
|
cfg: params.effectiveCfg,
|
|
prompt: params.prompt,
|
|
agentDir: params.agentDir,
|
|
modelOverride: params.model,
|
|
size: params.size,
|
|
aspectRatio: params.aspectRatio,
|
|
resolution: params.resolution,
|
|
durationSeconds: params.durationSeconds,
|
|
audio: params.audio,
|
|
watermark: params.watermark,
|
|
inputImages: params.loadedReferenceImages.map((entry) => entry.sourceAsset),
|
|
inputVideos: params.loadedReferenceVideos.map((entry) => entry.sourceAsset),
|
|
});
|
|
if (params.taskHandle) {
|
|
recordVideoGenerationTaskProgress({
|
|
handle: params.taskHandle,
|
|
progressSummary: "Saving generated video",
|
|
});
|
|
}
|
|
const savedVideos = await Promise.all(
|
|
result.videos.map((video) =>
|
|
saveMediaBuffer(
|
|
video.buffer,
|
|
video.mimeType,
|
|
"tool-video-generation",
|
|
undefined,
|
|
params.filename || video.fileName,
|
|
),
|
|
),
|
|
);
|
|
const requestedDurationSeconds =
|
|
typeof result.metadata?.requestedDurationSeconds === "number" &&
|
|
Number.isFinite(result.metadata.requestedDurationSeconds)
|
|
? result.metadata.requestedDurationSeconds
|
|
: params.durationSeconds;
|
|
const normalizedDurationSeconds =
|
|
typeof result.metadata?.normalizedDurationSeconds === "number" &&
|
|
Number.isFinite(result.metadata.normalizedDurationSeconds)
|
|
? result.metadata.normalizedDurationSeconds
|
|
: requestedDurationSeconds;
|
|
const supportedDurationSeconds = Array.isArray(result.metadata?.supportedDurationSeconds)
|
|
? result.metadata.supportedDurationSeconds.filter(
|
|
(entry): entry is number => typeof entry === "number" && Number.isFinite(entry),
|
|
)
|
|
: undefined;
|
|
const lines = [
|
|
`Generated ${savedVideos.length} video${savedVideos.length === 1 ? "" : "s"} with ${result.provider}/${result.model}.`,
|
|
typeof requestedDurationSeconds === "number" &&
|
|
typeof normalizedDurationSeconds === "number" &&
|
|
requestedDurationSeconds !== normalizedDurationSeconds
|
|
? `Duration normalized: requested ${requestedDurationSeconds}s; used ${normalizedDurationSeconds}s.`
|
|
: null,
|
|
...savedVideos.map((video) => `MEDIA:${video.path}`),
|
|
].filter((entry): entry is string => Boolean(entry));
|
|
|
|
return {
|
|
provider: result.provider,
|
|
model: result.model,
|
|
savedPaths: savedVideos.map((video) => video.path),
|
|
contentText: lines.join("\n"),
|
|
wakeResult: lines.join("\n"),
|
|
details: {
|
|
provider: result.provider,
|
|
model: result.model,
|
|
count: savedVideos.length,
|
|
media: {
|
|
mediaUrls: savedVideos.map((video) => video.path),
|
|
},
|
|
paths: savedVideos.map((video) => video.path),
|
|
...(params.taskHandle
|
|
? {
|
|
task: {
|
|
taskId: params.taskHandle.taskId,
|
|
runId: params.taskHandle.runId,
|
|
},
|
|
}
|
|
: {}),
|
|
...(params.loadedReferenceImages.length === 1
|
|
? {
|
|
image: params.loadedReferenceImages[0]?.resolvedInput,
|
|
...(params.loadedReferenceImages[0]?.rewrittenFrom
|
|
? { rewrittenFrom: params.loadedReferenceImages[0].rewrittenFrom }
|
|
: {}),
|
|
}
|
|
: params.loadedReferenceImages.length > 1
|
|
? {
|
|
images: params.loadedReferenceImages.map((entry) => ({
|
|
image: entry.resolvedInput,
|
|
...(entry.rewrittenFrom ? { rewrittenFrom: entry.rewrittenFrom } : {}),
|
|
})),
|
|
}
|
|
: {}),
|
|
...(params.loadedReferenceVideos.length === 1
|
|
? {
|
|
video: params.loadedReferenceVideos[0]?.resolvedInput,
|
|
...(params.loadedReferenceVideos[0]?.rewrittenFrom
|
|
? { videoRewrittenFrom: params.loadedReferenceVideos[0].rewrittenFrom }
|
|
: {}),
|
|
}
|
|
: params.loadedReferenceVideos.length > 1
|
|
? {
|
|
videos: params.loadedReferenceVideos.map((entry) => ({
|
|
video: entry.resolvedInput,
|
|
...(entry.rewrittenFrom ? { rewrittenFrom: entry.rewrittenFrom } : {}),
|
|
})),
|
|
}
|
|
: {}),
|
|
...(params.size ? { size: params.size } : {}),
|
|
...(params.aspectRatio ? { aspectRatio: params.aspectRatio } : {}),
|
|
...(params.resolution ? { resolution: params.resolution } : {}),
|
|
...(typeof normalizedDurationSeconds === "number"
|
|
? { durationSeconds: normalizedDurationSeconds }
|
|
: {}),
|
|
...(typeof requestedDurationSeconds === "number" &&
|
|
typeof normalizedDurationSeconds === "number" &&
|
|
requestedDurationSeconds !== normalizedDurationSeconds
|
|
? { requestedDurationSeconds }
|
|
: {}),
|
|
...(supportedDurationSeconds && supportedDurationSeconds.length > 0
|
|
? { supportedDurationSeconds }
|
|
: {}),
|
|
...(typeof params.audio === "boolean" ? { audio: params.audio } : {}),
|
|
...(typeof params.watermark === "boolean" ? { watermark: params.watermark } : {}),
|
|
...(params.filename ? { filename: params.filename } : {}),
|
|
attempts: result.attempts,
|
|
metadata: result.metadata,
|
|
},
|
|
};
|
|
}
|
|
|
|
export function createVideoGenerateTool(options?: {
|
|
config?: OpenClawConfig;
|
|
agentDir?: string;
|
|
agentSessionKey?: string;
|
|
requesterOrigin?: DeliveryContext;
|
|
workspaceDir?: string;
|
|
sandbox?: VideoGenerateSandboxConfig;
|
|
fsPolicy?: ToolFsPolicy;
|
|
scheduleBackgroundWork?: VideoGenerateBackgroundScheduler;
|
|
}): AnyAgentTool | null {
|
|
const cfg: OpenClawConfig = options?.config ?? loadConfig();
|
|
const videoGenerationModelConfig = resolveVideoGenerationModelConfigForTool({
|
|
cfg,
|
|
agentDir: options?.agentDir,
|
|
});
|
|
if (!videoGenerationModelConfig) {
|
|
return null;
|
|
}
|
|
|
|
const sandboxConfig = options?.sandbox
|
|
? {
|
|
root: options.sandbox.root,
|
|
bridge: options.sandbox.bridge,
|
|
workspaceOnly: options.fsPolicy?.workspaceOnly === true,
|
|
}
|
|
: null;
|
|
const scheduleBackgroundWork =
|
|
options?.scheduleBackgroundWork ?? defaultScheduleVideoGenerateBackgroundWork;
|
|
|
|
return {
|
|
label: "Video Generation",
|
|
name: "video_generate",
|
|
displaySummary: "Generate videos",
|
|
description:
|
|
"Generate videos using configured providers. Generated videos are saved under OpenClaw-managed media storage and delivered automatically as attachments. Duration requests may be rounded to the nearest provider-supported value.",
|
|
parameters: VideoGenerateToolSchema,
|
|
execute: async (_toolCallId, rawArgs) => {
|
|
const args = rawArgs as Record<string, unknown>;
|
|
const action = resolveAction(args);
|
|
const effectiveCfg =
|
|
applyVideoGenerationModelConfigDefaults(cfg, videoGenerationModelConfig) ?? cfg;
|
|
|
|
if (action === "list") {
|
|
const providers = listRuntimeVideoGenerationProviders({ config: effectiveCfg });
|
|
if (providers.length === 0) {
|
|
return {
|
|
content: [{ type: "text", text: "No video-generation providers are registered." }],
|
|
details: { providers: [] },
|
|
};
|
|
}
|
|
const lines = providers.map((provider) => {
|
|
const authHints = getVideoGenerationProviderAuthEnvVars(provider.id);
|
|
const capabilities = [
|
|
provider.capabilities.maxVideos ? `maxVideos=${provider.capabilities.maxVideos}` : null,
|
|
provider.capabilities.maxInputImages
|
|
? `maxInputImages=${provider.capabilities.maxInputImages}`
|
|
: null,
|
|
provider.capabilities.maxInputVideos
|
|
? `maxInputVideos=${provider.capabilities.maxInputVideos}`
|
|
: null,
|
|
provider.capabilities.maxDurationSeconds
|
|
? `maxDurationSeconds=${provider.capabilities.maxDurationSeconds}`
|
|
: null,
|
|
provider.capabilities.supportedDurationSeconds?.length
|
|
? `supportedDurationSeconds=${provider.capabilities.supportedDurationSeconds.join("/")}`
|
|
: null,
|
|
provider.capabilities.supportedDurationSecondsByModel &&
|
|
Object.keys(provider.capabilities.supportedDurationSecondsByModel).length > 0
|
|
? `supportedDurationSecondsByModel=${Object.entries(
|
|
provider.capabilities.supportedDurationSecondsByModel,
|
|
)
|
|
.map(([modelId, durations]) => `${modelId}:${durations.join("/")}`)
|
|
.join("; ")}`
|
|
: null,
|
|
provider.capabilities.supportsResolution ? "resolution" : null,
|
|
provider.capabilities.supportsAspectRatio ? "aspectRatio" : null,
|
|
provider.capabilities.supportsSize ? "size" : null,
|
|
provider.capabilities.supportsAudio ? "audio" : null,
|
|
provider.capabilities.supportsWatermark ? "watermark" : null,
|
|
]
|
|
.filter((entry): entry is string => Boolean(entry))
|
|
.join(", ");
|
|
return [
|
|
`${provider.id}: default=${provider.defaultModel ?? "none"}`,
|
|
provider.models?.length ? `models=${provider.models.join(", ")}` : null,
|
|
capabilities ? `capabilities=${capabilities}` : null,
|
|
authHints.length > 0 ? `auth=${authHints.join(" / ")}` : null,
|
|
]
|
|
.filter((entry): entry is string => Boolean(entry))
|
|
.join(" | ");
|
|
});
|
|
return {
|
|
content: [{ type: "text", text: lines.join("\n") }],
|
|
details: {
|
|
providers: providers.map((provider) => ({
|
|
id: provider.id,
|
|
defaultModel: provider.defaultModel,
|
|
models: provider.models ?? [],
|
|
authEnvVars: getVideoGenerationProviderAuthEnvVars(provider.id),
|
|
capabilities: provider.capabilities,
|
|
})),
|
|
},
|
|
};
|
|
}
|
|
|
|
const prompt = readStringParam(args, "prompt", { required: true });
|
|
const model = readStringParam(args, "model");
|
|
const filename = readStringParam(args, "filename");
|
|
const size = readStringParam(args, "size");
|
|
const aspectRatio = normalizeAspectRatio(readStringParam(args, "aspectRatio"));
|
|
const resolution = normalizeResolution(readStringParam(args, "resolution"));
|
|
const durationSeconds = readNumberParam(args, "durationSeconds", {
|
|
integer: true,
|
|
strict: true,
|
|
});
|
|
const audio = readBooleanParam(args, "audio");
|
|
const watermark = readBooleanParam(args, "watermark");
|
|
const imageInputs = normalizeReferenceInputs({
|
|
args,
|
|
singularKey: "image",
|
|
pluralKey: "images",
|
|
maxCount: MAX_INPUT_IMAGES,
|
|
});
|
|
const videoInputs = normalizeReferenceInputs({
|
|
args,
|
|
singularKey: "video",
|
|
pluralKey: "videos",
|
|
maxCount: MAX_INPUT_VIDEOS,
|
|
});
|
|
|
|
const selectedProvider = resolveSelectedVideoGenerationProvider({
|
|
config: effectiveCfg,
|
|
videoGenerationModelConfig,
|
|
modelOverride: model,
|
|
});
|
|
const loadedReferenceImages = await loadReferenceAssets({
|
|
inputs: imageInputs,
|
|
expectedKind: "image",
|
|
workspaceDir: options?.workspaceDir,
|
|
sandboxConfig,
|
|
});
|
|
const loadedReferenceVideos = await loadReferenceAssets({
|
|
inputs: videoInputs,
|
|
expectedKind: "video",
|
|
workspaceDir: options?.workspaceDir,
|
|
sandboxConfig,
|
|
});
|
|
validateVideoGenerationCapabilities({
|
|
provider: selectedProvider,
|
|
model:
|
|
parseVideoGenerationModelRef(model)?.model ?? model ?? selectedProvider?.defaultModel,
|
|
inputImageCount: loadedReferenceImages.length,
|
|
inputVideoCount: loadedReferenceVideos.length,
|
|
size,
|
|
aspectRatio,
|
|
resolution,
|
|
durationSeconds,
|
|
audio,
|
|
watermark,
|
|
});
|
|
const taskHandle = createVideoGenerationTaskRun({
|
|
sessionKey: options?.agentSessionKey,
|
|
requesterOrigin: options?.requesterOrigin,
|
|
prompt,
|
|
providerId: selectedProvider?.id,
|
|
});
|
|
const shouldDetach = Boolean(taskHandle && options?.agentSessionKey?.trim());
|
|
|
|
if (shouldDetach) {
|
|
scheduleBackgroundWork(async () => {
|
|
try {
|
|
const executed = await executeVideoGenerationJob({
|
|
effectiveCfg,
|
|
prompt,
|
|
agentDir: options?.agentDir,
|
|
model,
|
|
size,
|
|
aspectRatio,
|
|
resolution,
|
|
durationSeconds,
|
|
audio,
|
|
watermark,
|
|
filename,
|
|
loadedReferenceImages,
|
|
loadedReferenceVideos,
|
|
taskHandle,
|
|
});
|
|
completeVideoGenerationTaskRun({
|
|
handle: taskHandle,
|
|
provider: executed.provider,
|
|
model: executed.model,
|
|
count: executed.savedPaths.length,
|
|
paths: executed.savedPaths,
|
|
});
|
|
try {
|
|
await wakeVideoGenerationTaskCompletion({
|
|
handle: taskHandle,
|
|
status: "ok",
|
|
statusLabel: "completed successfully",
|
|
result: executed.wakeResult,
|
|
});
|
|
} catch (error) {
|
|
log.warn("Video generation completion wake failed after successful generation", {
|
|
taskId: taskHandle?.taskId,
|
|
runId: taskHandle?.runId,
|
|
error,
|
|
});
|
|
}
|
|
} catch (error) {
|
|
failVideoGenerationTaskRun({
|
|
handle: taskHandle,
|
|
error,
|
|
});
|
|
await wakeVideoGenerationTaskCompletion({
|
|
handle: taskHandle,
|
|
status: "error",
|
|
statusLabel: "failed",
|
|
result: error instanceof Error ? error.message : String(error),
|
|
});
|
|
return;
|
|
}
|
|
});
|
|
|
|
return {
|
|
content: [
|
|
{
|
|
type: "text",
|
|
text: `Started video generation task ${taskHandle?.taskId ?? "unknown"} in the background. I'll post the finished video here when it's ready.`,
|
|
},
|
|
],
|
|
details: {
|
|
async: true,
|
|
status: "started",
|
|
...(taskHandle
|
|
? {
|
|
task: {
|
|
taskId: taskHandle.taskId,
|
|
runId: taskHandle.runId,
|
|
},
|
|
}
|
|
: {}),
|
|
...(loadedReferenceImages.length === 1
|
|
? {
|
|
image: loadedReferenceImages[0]?.resolvedInput,
|
|
...(loadedReferenceImages[0]?.rewrittenFrom
|
|
? { rewrittenFrom: loadedReferenceImages[0].rewrittenFrom }
|
|
: {}),
|
|
}
|
|
: loadedReferenceImages.length > 1
|
|
? {
|
|
images: loadedReferenceImages.map((entry) => ({
|
|
image: entry.resolvedInput,
|
|
...(entry.rewrittenFrom ? { rewrittenFrom: entry.rewrittenFrom } : {}),
|
|
})),
|
|
}
|
|
: {}),
|
|
...(loadedReferenceVideos.length === 1
|
|
? {
|
|
video: loadedReferenceVideos[0]?.resolvedInput,
|
|
...(loadedReferenceVideos[0]?.rewrittenFrom
|
|
? { videoRewrittenFrom: loadedReferenceVideos[0].rewrittenFrom }
|
|
: {}),
|
|
}
|
|
: loadedReferenceVideos.length > 1
|
|
? {
|
|
videos: loadedReferenceVideos.map((entry) => ({
|
|
video: entry.resolvedInput,
|
|
...(entry.rewrittenFrom ? { rewrittenFrom: entry.rewrittenFrom } : {}),
|
|
})),
|
|
}
|
|
: {}),
|
|
...(model ? { model } : {}),
|
|
...(size ? { size } : {}),
|
|
...(aspectRatio ? { aspectRatio } : {}),
|
|
...(resolution ? { resolution } : {}),
|
|
...(typeof durationSeconds === "number" ? { durationSeconds } : {}),
|
|
...(typeof audio === "boolean" ? { audio } : {}),
|
|
...(typeof watermark === "boolean" ? { watermark } : {}),
|
|
...(filename ? { filename } : {}),
|
|
},
|
|
};
|
|
}
|
|
|
|
try {
|
|
const executed = await executeVideoGenerationJob({
|
|
effectiveCfg,
|
|
prompt,
|
|
agentDir: options?.agentDir,
|
|
model,
|
|
size,
|
|
aspectRatio,
|
|
resolution,
|
|
durationSeconds,
|
|
audio,
|
|
watermark,
|
|
filename,
|
|
loadedReferenceImages,
|
|
loadedReferenceVideos,
|
|
taskHandle,
|
|
});
|
|
completeVideoGenerationTaskRun({
|
|
handle: taskHandle,
|
|
provider: executed.provider,
|
|
model: executed.model,
|
|
count: executed.savedPaths.length,
|
|
paths: executed.savedPaths,
|
|
});
|
|
|
|
return {
|
|
content: [{ type: "text", text: executed.contentText }],
|
|
details: executed.details,
|
|
};
|
|
} catch (error) {
|
|
failVideoGenerationTaskRun({
|
|
handle: taskHandle,
|
|
error,
|
|
});
|
|
throw error;
|
|
}
|
|
},
|
|
};
|
|
}
|