mirror of
https://github.com/moltbot/moltbot.git
synced 2026-04-21 05:32:53 +00:00
fix: surface normalized video durations
This commit is contained in:
@@ -58,24 +58,24 @@ Use `action: "list"` to inspect available providers and models at runtime:
|
||||
|
||||
## Tool parameters
|
||||
|
||||
| Parameter | Type | Description |
|
||||
| ----------------- | -------- | ------------------------------------------------------------------------------------- |
|
||||
| `prompt` | string | Video generation prompt (required for `action: "generate"`) |
|
||||
| `action` | string | `"generate"` (default) or `"list"` to inspect providers |
|
||||
| `model` | string | Provider/model override, e.g. `qwen/wan2.6-t2v` |
|
||||
| `image` | string | Single reference image path or URL |
|
||||
| `images` | string[] | Multiple reference images (up to 5) |
|
||||
| `video` | string | Single reference video path or URL |
|
||||
| `videos` | string[] | Multiple reference videos (up to 4) |
|
||||
| `size` | string | Size hint when the provider supports it |
|
||||
| `aspectRatio` | string | Aspect ratio: `1:1`, `2:3`, `3:2`, `3:4`, `4:3`, `4:5`, `5:4`, `9:16`, `16:9`, `21:9` |
|
||||
| `resolution` | string | Resolution hint: `480P`, `720P`, or `1080P` |
|
||||
| `durationSeconds` | number | Target duration in seconds |
|
||||
| `audio` | boolean | Enable generated audio when the provider supports it |
|
||||
| `watermark` | boolean | Toggle provider watermarking when supported |
|
||||
| `filename` | string | Output filename hint |
|
||||
| Parameter | Type | Description |
|
||||
| ----------------- | -------- | -------------------------------------------------------------------------------------- |
|
||||
| `prompt` | string | Video generation prompt (required for `action: "generate"`) |
|
||||
| `action` | string | `"generate"` (default) or `"list"` to inspect providers |
|
||||
| `model` | string | Provider/model override, e.g. `qwen/wan2.6-t2v` |
|
||||
| `image` | string | Single reference image path or URL |
|
||||
| `images` | string[] | Multiple reference images (up to 5) |
|
||||
| `video` | string | Single reference video path or URL |
|
||||
| `videos` | string[] | Multiple reference videos (up to 4) |
|
||||
| `size` | string | Size hint when the provider supports it |
|
||||
| `aspectRatio` | string | Aspect ratio: `1:1`, `2:3`, `3:2`, `3:4`, `4:3`, `4:5`, `5:4`, `9:16`, `16:9`, `21:9` |
|
||||
| `resolution` | string | Resolution hint: `480P`, `720P`, or `1080P` |
|
||||
| `durationSeconds` | number | Target duration in seconds. OpenClaw may round to the nearest provider-supported value |
|
||||
| `audio` | boolean | Enable generated audio when the provider supports it |
|
||||
| `watermark` | boolean | Toggle provider watermarking when supported |
|
||||
| `filename` | string | Output filename hint |
|
||||
|
||||
Not all providers support all parameters. The tool validates provider capability limits before it submits the request.
|
||||
Not all providers support all parameters. The tool validates provider capability limits before it submits the request. When a provider or model only supports a discrete set of video lengths, OpenClaw rounds `durationSeconds` to the nearest supported value and reports the normalized duration in the tool result.
|
||||
|
||||
## Configuration
|
||||
|
||||
|
||||
@@ -162,6 +162,7 @@ export function buildGoogleVideoGenerationProvider(): VideoGenerationProvider {
|
||||
maxInputImages: 1,
|
||||
maxInputVideos: 1,
|
||||
maxDurationSeconds: GOOGLE_VIDEO_MAX_DURATION_SECONDS,
|
||||
supportedDurationSeconds: GOOGLE_VIDEO_ALLOWED_DURATION_SECONDS,
|
||||
supportsAspectRatio: true,
|
||||
supportsResolution: true,
|
||||
supportsSize: true,
|
||||
|
||||
@@ -232,6 +232,7 @@ export function buildMinimaxVideoGenerationProvider(): VideoGenerationProvider {
|
||||
maxInputImages: 1,
|
||||
maxInputVideos: 0,
|
||||
maxDurationSeconds: 10,
|
||||
supportedDurationSecondsByModel: MINIMAX_MODEL_ALLOWED_DURATIONS,
|
||||
supportsResolution: true,
|
||||
supportsWatermark: false,
|
||||
},
|
||||
|
||||
@@ -194,6 +194,7 @@ export function buildOpenAIVideoGenerationProvider(): VideoGenerationProvider {
|
||||
maxInputImages: 1,
|
||||
maxInputVideos: 1,
|
||||
maxDurationSeconds: 12,
|
||||
supportedDurationSeconds: OPENAI_VIDEO_SECONDS,
|
||||
supportsSize: true,
|
||||
},
|
||||
async generateVideo(req) {
|
||||
|
||||
@@ -11,6 +11,7 @@ function asConfig(value: unknown): OpenClawConfig {
|
||||
describe("createVideoGenerateTool", () => {
|
||||
beforeEach(() => {
|
||||
vi.restoreAllMocks();
|
||||
vi.spyOn(videoGenerationRuntime, "listRuntimeVideoGenerationProviders").mockReturnValue([]);
|
||||
});
|
||||
|
||||
afterEach(() => {
|
||||
@@ -88,4 +89,90 @@ describe("createVideoGenerateTool", () => {
|
||||
metadata: { taskId: "task-1" },
|
||||
});
|
||||
});
|
||||
|
||||
it("shows duration normalization details from runtime metadata", async () => {
|
||||
vi.spyOn(videoGenerationRuntime, "generateVideo").mockResolvedValue({
|
||||
provider: "google",
|
||||
model: "veo-3.1-fast-generate-preview",
|
||||
attempts: [],
|
||||
videos: [
|
||||
{
|
||||
buffer: Buffer.from("video-bytes"),
|
||||
mimeType: "video/mp4",
|
||||
fileName: "lobster.mp4",
|
||||
},
|
||||
],
|
||||
metadata: {
|
||||
requestedDurationSeconds: 5,
|
||||
normalizedDurationSeconds: 6,
|
||||
supportedDurationSeconds: [4, 6, 8],
|
||||
},
|
||||
});
|
||||
vi.spyOn(mediaStore, "saveMediaBuffer").mockResolvedValueOnce({
|
||||
path: "/tmp/generated-lobster.mp4",
|
||||
id: "generated-lobster.mp4",
|
||||
size: 11,
|
||||
contentType: "video/mp4",
|
||||
});
|
||||
|
||||
const tool = createVideoGenerateTool({
|
||||
config: asConfig({
|
||||
agents: {
|
||||
defaults: {
|
||||
videoGenerationModel: { primary: "google/veo-3.1-fast-generate-preview" },
|
||||
},
|
||||
},
|
||||
}),
|
||||
});
|
||||
if (!tool) {
|
||||
throw new Error("expected video_generate tool");
|
||||
}
|
||||
|
||||
const result = await tool.execute("call-1", {
|
||||
prompt: "friendly lobster surfing",
|
||||
durationSeconds: 5,
|
||||
});
|
||||
const text = (result.content?.[0] as { text: string } | undefined)?.text ?? "";
|
||||
|
||||
expect(text).toContain("Duration normalized: requested 5s; used 6s.");
|
||||
expect(result.details).toMatchObject({
|
||||
durationSeconds: 6,
|
||||
requestedDurationSeconds: 5,
|
||||
supportedDurationSeconds: [4, 6, 8],
|
||||
});
|
||||
});
|
||||
|
||||
it("lists supported provider durations when advertised", async () => {
|
||||
vi.spyOn(videoGenerationRuntime, "listRuntimeVideoGenerationProviders").mockReturnValue([
|
||||
{
|
||||
id: "google",
|
||||
defaultModel: "veo-3.1-fast-generate-preview",
|
||||
models: ["veo-3.1-fast-generate-preview"],
|
||||
capabilities: {
|
||||
maxDurationSeconds: 8,
|
||||
supportedDurationSeconds: [4, 6, 8],
|
||||
},
|
||||
generateVideo: vi.fn(async () => {
|
||||
throw new Error("not used");
|
||||
}),
|
||||
},
|
||||
]);
|
||||
|
||||
const tool = createVideoGenerateTool({
|
||||
config: asConfig({
|
||||
agents: {
|
||||
defaults: {
|
||||
videoGenerationModel: { primary: "google/veo-3.1-fast-generate-preview" },
|
||||
},
|
||||
},
|
||||
}),
|
||||
});
|
||||
if (!tool) {
|
||||
throw new Error("expected video_generate tool");
|
||||
}
|
||||
|
||||
const result = await tool.execute("call-1", { action: "list" });
|
||||
const text = (result.content?.[0] as { text: string } | undefined)?.text ?? "";
|
||||
expect(text).toContain("supportedDurationSeconds=4/6/8");
|
||||
});
|
||||
});
|
||||
|
||||
@@ -6,6 +6,7 @@ import { loadWebMedia } from "../../media/web-media.js";
|
||||
import { readSnakeCaseParamRaw } from "../../param-key.js";
|
||||
import { getProviderEnvVars } from "../../secrets/provider-env-vars.js";
|
||||
import { resolveUserPath } from "../../utils.js";
|
||||
import { resolveVideoGenerationSupportedDurations } from "../../video-generation/duration-support.js";
|
||||
import { parseVideoGenerationModelRef } from "../../video-generation/model-ref.js";
|
||||
import {
|
||||
generateVideo,
|
||||
@@ -114,7 +115,8 @@ const VideoGenerateToolSchema = Type.Object({
|
||||
),
|
||||
durationSeconds: Type.Optional(
|
||||
Type.Number({
|
||||
description: "Optional target duration in seconds.",
|
||||
description:
|
||||
"Optional target duration in seconds. OpenClaw may round this to the nearest provider-supported duration.",
|
||||
minimum: 1,
|
||||
}),
|
||||
),
|
||||
@@ -329,6 +331,7 @@ function resolveSelectedVideoGenerationProvider(params: {
|
||||
|
||||
function validateVideoGenerationCapabilities(params: {
|
||||
provider: VideoGenerationProvider | undefined;
|
||||
model?: string;
|
||||
inputImageCount: number;
|
||||
inputVideoCount: number;
|
||||
size?: string;
|
||||
@@ -371,6 +374,10 @@ function validateVideoGenerationCapabilities(params: {
|
||||
if (
|
||||
typeof params.durationSeconds === "number" &&
|
||||
Number.isFinite(params.durationSeconds) &&
|
||||
!resolveVideoGenerationSupportedDurations({
|
||||
provider,
|
||||
model: params.model,
|
||||
}) &&
|
||||
typeof caps.maxDurationSeconds === "number" &&
|
||||
params.durationSeconds > caps.maxDurationSeconds
|
||||
) {
|
||||
@@ -535,7 +542,7 @@ export function createVideoGenerateTool(options?: {
|
||||
name: "video_generate",
|
||||
displaySummary: "Generate videos",
|
||||
description:
|
||||
"Generate videos using configured providers. Generated videos are saved under OpenClaw-managed media storage and delivered automatically as attachments.",
|
||||
"Generate videos using configured providers. Generated videos are saved under OpenClaw-managed media storage and delivered automatically as attachments. Duration requests may be rounded to the nearest provider-supported value.",
|
||||
parameters: VideoGenerateToolSchema,
|
||||
execute: async (_toolCallId, rawArgs) => {
|
||||
const args = rawArgs as Record<string, unknown>;
|
||||
@@ -564,6 +571,17 @@ export function createVideoGenerateTool(options?: {
|
||||
provider.capabilities.maxDurationSeconds
|
||||
? `maxDurationSeconds=${provider.capabilities.maxDurationSeconds}`
|
||||
: null,
|
||||
provider.capabilities.supportedDurationSeconds?.length
|
||||
? `supportedDurationSeconds=${provider.capabilities.supportedDurationSeconds.join("/")}`
|
||||
: null,
|
||||
provider.capabilities.supportedDurationSecondsByModel &&
|
||||
Object.keys(provider.capabilities.supportedDurationSecondsByModel).length > 0
|
||||
? `supportedDurationSecondsByModel=${Object.entries(
|
||||
provider.capabilities.supportedDurationSecondsByModel,
|
||||
)
|
||||
.map(([modelId, durations]) => `${modelId}:${durations.join("/")}`)
|
||||
.join("; ")}`
|
||||
: null,
|
||||
provider.capabilities.supportsResolution ? "resolution" : null,
|
||||
provider.capabilities.supportsAspectRatio ? "aspectRatio" : null,
|
||||
provider.capabilities.supportsSize ? "size" : null,
|
||||
@@ -639,6 +657,8 @@ export function createVideoGenerateTool(options?: {
|
||||
});
|
||||
validateVideoGenerationCapabilities({
|
||||
provider: selectedProvider,
|
||||
model:
|
||||
parseVideoGenerationModelRef(model)?.model ?? model ?? selectedProvider?.defaultModel,
|
||||
inputImageCount: loadedReferenceImages.length,
|
||||
inputVideoCount: loadedReferenceVideos.length,
|
||||
size,
|
||||
@@ -674,10 +694,30 @@ export function createVideoGenerateTool(options?: {
|
||||
),
|
||||
),
|
||||
);
|
||||
const requestedDurationSeconds =
|
||||
typeof result.metadata?.requestedDurationSeconds === "number" &&
|
||||
Number.isFinite(result.metadata.requestedDurationSeconds)
|
||||
? result.metadata.requestedDurationSeconds
|
||||
: durationSeconds;
|
||||
const normalizedDurationSeconds =
|
||||
typeof result.metadata?.normalizedDurationSeconds === "number" &&
|
||||
Number.isFinite(result.metadata.normalizedDurationSeconds)
|
||||
? result.metadata.normalizedDurationSeconds
|
||||
: requestedDurationSeconds;
|
||||
const supportedDurationSeconds = Array.isArray(result.metadata?.supportedDurationSeconds)
|
||||
? result.metadata.supportedDurationSeconds.filter(
|
||||
(entry): entry is number => typeof entry === "number" && Number.isFinite(entry),
|
||||
)
|
||||
: undefined;
|
||||
const lines = [
|
||||
`Generated ${savedVideos.length} video${savedVideos.length === 1 ? "" : "s"} with ${result.provider}/${result.model}.`,
|
||||
typeof requestedDurationSeconds === "number" &&
|
||||
typeof normalizedDurationSeconds === "number" &&
|
||||
requestedDurationSeconds !== normalizedDurationSeconds
|
||||
? `Duration normalized: requested ${requestedDurationSeconds}s; used ${normalizedDurationSeconds}s.`
|
||||
: null,
|
||||
...savedVideos.map((video) => `MEDIA:${video.path}`),
|
||||
];
|
||||
].filter((entry): entry is string => Boolean(entry));
|
||||
|
||||
return {
|
||||
content: [{ type: "text", text: lines.join("\n") }],
|
||||
@@ -722,7 +762,17 @@ export function createVideoGenerateTool(options?: {
|
||||
...(size ? { size } : {}),
|
||||
...(aspectRatio ? { aspectRatio } : {}),
|
||||
...(resolution ? { resolution } : {}),
|
||||
...(typeof durationSeconds === "number" ? { durationSeconds } : {}),
|
||||
...(typeof normalizedDurationSeconds === "number"
|
||||
? { durationSeconds: normalizedDurationSeconds }
|
||||
: {}),
|
||||
...(typeof requestedDurationSeconds === "number" &&
|
||||
typeof normalizedDurationSeconds === "number" &&
|
||||
requestedDurationSeconds !== normalizedDurationSeconds
|
||||
? { requestedDurationSeconds }
|
||||
: {}),
|
||||
...(supportedDurationSeconds && supportedDurationSeconds.length > 0
|
||||
? { supportedDurationSeconds }
|
||||
: {}),
|
||||
...(typeof audio === "boolean" ? { audio } : {}),
|
||||
...(typeof watermark === "boolean" ? { watermark } : {}),
|
||||
...(filename ? { filename } : {}),
|
||||
|
||||
54
src/video-generation/duration-support.ts
Normal file
54
src/video-generation/duration-support.ts
Normal file
@@ -0,0 +1,54 @@
|
||||
import type { VideoGenerationProvider } from "./types.js";
|
||||
|
||||
/**
 * Cleans a provider-advertised list of supported video durations.
 *
 * Non-finite and non-positive entries are dropped, the remaining values are
 * rounded to whole seconds, duplicates are removed, and the result is sorted
 * in ascending order.
 *
 * @param values - Raw duration list in seconds; may be undefined or empty.
 * @returns The cleaned ascending list, or `undefined` when nothing usable remains.
 */
function normalizeSupportedDurationValues(
  values: readonly number[] | undefined,
): number[] | undefined {
  if (!Array.isArray(values) || values.length === 0) {
    return undefined;
  }
  const wholeSeconds = values
    .filter((value: number) => Number.isFinite(value) && value > 0)
    .map((value: number) => Math.round(value))
    // Rounding can collapse small fractions (e.g. 0.2) to 0; drop those too.
    .filter((value: number) => value > 0);
  // De-duplicate AFTER rounding so inputs such as 3.8 and 4.2 do not both
  // survive as 4. The spread creates a fresh array, so in-place sort is safe.
  const normalized = [...new Set(wholeSeconds)].sort((left, right) => left - right);
  return normalized.length > 0 ? normalized : undefined;
}
|
||||
|
||||
/**
 * Looks up the discrete set of video durations a provider advertises.
 *
 * A per-model entry in `supportedDurationSecondsByModel` (keyed by the trimmed
 * model id) takes precedence; otherwise the provider-wide
 * `supportedDurationSeconds` list is used. The chosen list is passed through
 * `normalizeSupportedDurationValues` before being returned.
 *
 * @param params.provider - Provider whose capabilities are inspected.
 * @param params.model - Optional model id; surrounding whitespace is ignored.
 * @returns The normalized duration list, or `undefined` when none is advertised.
 */
export function resolveVideoGenerationSupportedDurations(params: {
  provider?: VideoGenerationProvider;
  model?: string;
}): number[] | undefined {
  const capabilities = params.provider?.capabilities;
  const modelId = params.model?.trim();
  const perModel = capabilities?.supportedDurationSecondsByModel;

  // Start from the provider-wide list and override it only when a
  // model-specific list actually exists for the requested model.
  let candidates = capabilities?.supportedDurationSeconds;
  if (modelId && perModel) {
    candidates = perModel[modelId] ?? candidates;
  }
  return normalizeSupportedDurationValues(candidates);
}
|
||||
|
||||
/**
 * Maps a requested duration onto a value the provider/model can produce.
 *
 * The request is rounded to whole seconds and clamped to at least 1. When the
 * provider advertises a discrete set of durations, the closest one is chosen;
 * if two are equally close, the larger one wins.
 *
 * @param params.provider - Provider whose supported durations are consulted.
 * @param params.model - Optional model id used for model-specific lists.
 * @param params.durationSeconds - Requested duration in seconds.
 * @returns The normalized duration, or `undefined` when the request is missing
 *   or not a finite number.
 */
export function normalizeVideoGenerationDuration(params: {
  provider?: VideoGenerationProvider;
  model?: string;
  durationSeconds?: number;
}): number | undefined {
  const requested = params.durationSeconds;
  if (typeof requested !== "number" || !Number.isFinite(requested)) {
    return undefined;
  }

  const target = Math.max(1, Math.round(requested));
  const allowed = resolveVideoGenerationSupportedDurations(params);
  if (!allowed || allowed.length === 0) {
    // No discrete list advertised: the rounded request is used as-is.
    return target;
  }

  let closest: number | undefined;
  for (const candidate of allowed) {
    if (closest === undefined) {
      closest = candidate;
      continue;
    }
    const candidateGap = Math.abs(candidate - target);
    const closestGap = Math.abs(closest - target);
    // Strictly closer wins; on an exact tie prefer the longer duration.
    if (candidateGap < closestGap || (candidateGap === closestGap && candidate > closest)) {
      closest = candidate;
    }
  }
  // `allowed` is non-empty here, so `closest` is always set; the fallback only
  // satisfies the type checker.
  return closest ?? target;
}
|
||||
@@ -150,6 +150,43 @@ describe("video-generation runtime", () => {
|
||||
expect(mocks.listVideoGenerationProviders).toHaveBeenCalledWith({} as OpenClawConfig);
|
||||
});
|
||||
|
||||
it("normalizes requested durations to supported provider values", async () => {
|
||||
let seenDurationSeconds: number | undefined;
|
||||
mocks.resolveAgentModelPrimaryValue.mockReturnValue("video-plugin/vid-v1");
|
||||
mocks.getVideoGenerationProvider.mockReturnValue({
|
||||
id: "video-plugin",
|
||||
capabilities: {
|
||||
supportedDurationSeconds: [4, 6, 8],
|
||||
},
|
||||
generateVideo: async (req) => {
|
||||
seenDurationSeconds = req.durationSeconds;
|
||||
return {
|
||||
videos: [{ buffer: Buffer.from("mp4-bytes"), mimeType: "video/mp4" }],
|
||||
model: "vid-v1",
|
||||
};
|
||||
},
|
||||
});
|
||||
|
||||
const result = await generateVideo({
|
||||
cfg: {
|
||||
agents: {
|
||||
defaults: {
|
||||
videoGenerationModel: { primary: "video-plugin/vid-v1" },
|
||||
},
|
||||
},
|
||||
} as OpenClawConfig,
|
||||
prompt: "animate a cat",
|
||||
durationSeconds: 5,
|
||||
});
|
||||
|
||||
expect(seenDurationSeconds).toBe(6);
|
||||
expect(result.metadata).toMatchObject({
|
||||
requestedDurationSeconds: 5,
|
||||
normalizedDurationSeconds: 6,
|
||||
supportedDurationSeconds: [4, 6, 8],
|
||||
});
|
||||
});
|
||||
|
||||
it("builds a generic config hint without hardcoded provider ids", async () => {
|
||||
mocks.listVideoGenerationProviders.mockReturnValue([
|
||||
{
|
||||
|
||||
@@ -8,6 +8,10 @@ import {
|
||||
} from "../config/model-input.js";
|
||||
import { createSubsystemLogger } from "../logging/subsystem.js";
|
||||
import { getProviderEnvVars } from "../secrets/provider-env-vars.js";
|
||||
import {
|
||||
normalizeVideoGenerationDuration,
|
||||
resolveVideoGenerationSupportedDurations,
|
||||
} from "./duration-support.js";
|
||||
import { parseVideoGenerationModelRef } from "./model-ref.js";
|
||||
import { getVideoGenerationProvider, listVideoGenerationProviders } from "./provider-registry.js";
|
||||
import type {
|
||||
@@ -147,6 +151,19 @@ export async function generateVideo(
|
||||
}
|
||||
|
||||
try {
|
||||
const requestedDurationSeconds =
|
||||
typeof params.durationSeconds === "number" && Number.isFinite(params.durationSeconds)
|
||||
? Math.max(1, Math.round(params.durationSeconds))
|
||||
: undefined;
|
||||
const normalizedDurationSeconds = normalizeVideoGenerationDuration({
|
||||
provider,
|
||||
model: candidate.model,
|
||||
durationSeconds: requestedDurationSeconds,
|
||||
});
|
||||
const supportedDurationSeconds = resolveVideoGenerationSupportedDurations({
|
||||
provider,
|
||||
model: candidate.model,
|
||||
});
|
||||
const result: VideoGenerationResult = await provider.generateVideo({
|
||||
provider: candidate.provider,
|
||||
model: candidate.model,
|
||||
@@ -157,7 +174,7 @@ export async function generateVideo(
|
||||
size: params.size,
|
||||
aspectRatio: params.aspectRatio,
|
||||
resolution: params.resolution,
|
||||
durationSeconds: params.durationSeconds,
|
||||
durationSeconds: normalizedDurationSeconds,
|
||||
audio: params.audio,
|
||||
watermark: params.watermark,
|
||||
inputImages: params.inputImages,
|
||||
@@ -171,7 +188,17 @@ export async function generateVideo(
|
||||
provider: candidate.provider,
|
||||
model: result.model ?? candidate.model,
|
||||
attempts,
|
||||
metadata: result.metadata,
|
||||
metadata:
|
||||
typeof requestedDurationSeconds === "number" &&
|
||||
typeof normalizedDurationSeconds === "number" &&
|
||||
requestedDurationSeconds !== normalizedDurationSeconds
|
||||
? {
|
||||
...result.metadata,
|
||||
requestedDurationSeconds,
|
||||
normalizedDurationSeconds,
|
||||
...(supportedDurationSeconds ? { supportedDurationSeconds } : {}),
|
||||
}
|
||||
: result.metadata,
|
||||
};
|
||||
} catch (err) {
|
||||
lastError = err;
|
||||
|
||||
@@ -52,6 +52,8 @@ export type VideoGenerationProviderCapabilities = {
|
||||
maxInputImages?: number;
|
||||
maxInputVideos?: number;
|
||||
maxDurationSeconds?: number;
|
||||
supportedDurationSeconds?: readonly number[];
|
||||
supportedDurationSecondsByModel?: Readonly<Record<string, readonly number[]>>;
|
||||
supportsSize?: boolean;
|
||||
supportsAspectRatio?: boolean;
|
||||
supportsResolution?: boolean;
|
||||
|
||||
Reference in New Issue
Block a user