fix: surface normalized video durations

2026-04-21 05:32:53 +00:00 · 2026-04-05 23:57:18 +01:00
parent 09fe144e52
commit b5ade7b629
10 changed files with 283 additions and 23 deletions
--- a/docs/tools/video-generation.md
+++ b/docs/tools/video-generation.md
@@ -58,24 +58,24 @@ Use `action: "list"` to inspect available providers and models at runtime:

 ## Tool parameters

-| Parameter         | Type     | Description                                                                           |
-| ----------------- | -------- | ------------------------------------------------------------------------------------- |
-| `prompt`          | string   | Video generation prompt (required for `action: "generate"`)                           |
-| `action`          | string   | `"generate"` (default) or `"list"` to inspect providers                               |
-| `model`           | string   | Provider/model override, e.g. `qwen/wan2.6-t2v`                                       |
-| `image`           | string   | Single reference image path or URL                                                    |
-| `images`          | string[] | Multiple reference images (up to 5)                                                   |
-| `video`           | string   | Single reference video path or URL                                                    |
-| `videos`          | string[] | Multiple reference videos (up to 4)                                                   |
-| `size`            | string   | Size hint when the provider supports it                                               |
-| `aspectRatio`     | string   | Aspect ratio: `1:1`, `2:3`, `3:2`, `3:4`, `4:3`, `4:5`, `5:4`, `9:16`, `16:9`, `21:9` |
-| `resolution`      | string   | Resolution hint: `480P`, `720P`, or `1080P`                                           |
-| `durationSeconds` | number   | Target duration in seconds                                                            |
-| `audio`           | boolean  | Enable generated audio when the provider supports it                                  |
-| `watermark`       | boolean  | Toggle provider watermarking when supported                                           |
-| `filename`        | string   | Output filename hint                                                                  |
+| Parameter         | Type     | Description                                                                            |
+| ----------------- | -------- | -------------------------------------------------------------------------------------- |
+| `prompt`          | string   | Video generation prompt (required for `action: "generate"`)                            |
+| `action`          | string   | `"generate"` (default) or `"list"` to inspect providers                                |
+| `model`           | string   | Provider/model override, e.g. `qwen/wan2.6-t2v`                                        |
+| `image`           | string   | Single reference image path or URL                                                     |
+| `images`          | string[] | Multiple reference images (up to 5)                                                    |
+| `video`           | string   | Single reference video path or URL                                                     |
+| `videos`          | string[] | Multiple reference videos (up to 4)                                                    |
+| `size`            | string   | Size hint when the provider supports it                                                |
+| `aspectRatio`     | string   | Aspect ratio: `1:1`, `2:3`, `3:2`, `3:4`, `4:3`, `4:5`, `5:4`, `9:16`, `16:9`, `21:9`  |
+| `resolution`      | string   | Resolution hint: `480P`, `720P`, or `1080P`                                            |
+| `durationSeconds` | number   | Target duration in seconds; OpenClaw may round to the nearest provider-supported value |
+| `audio`           | boolean  | Enable generated audio when the provider supports it                                   |
+| `watermark`       | boolean  | Toggle provider watermarking when supported                                            |
+| `filename`        | string   | Output filename hint                                                                   |

-Not all providers support all parameters. The tool validates provider capability limits before it submits the request.
+Not all providers support all parameters. The tool validates provider capability limits before it submits the request. When a provider or model only supports a discrete set of video lengths, OpenClaw rounds `durationSeconds` to the nearest supported value and reports the normalized duration in the tool result.

 ## Configuration

--- a/extensions/google/video-generation-provider.ts
+++ b/extensions/google/video-generation-provider.ts
@@ -162,6 +162,7 @@ export function buildGoogleVideoGenerationProvider(): VideoGenerationProvider {
      maxInputImages: 1,
      maxInputVideos: 1,
      maxDurationSeconds: GOOGLE_VIDEO_MAX_DURATION_SECONDS,
+      supportedDurationSeconds: GOOGLE_VIDEO_ALLOWED_DURATION_SECONDS,
      supportsAspectRatio: true,
      supportsResolution: true,
      supportsSize: true,
--- a/extensions/minimax/video-generation-provider.ts
+++ b/extensions/minimax/video-generation-provider.ts
@@ -232,6 +232,7 @@ export function buildMinimaxVideoGenerationProvider(): VideoGenerationProvider {
      maxInputImages: 1,
      maxInputVideos: 0,
      maxDurationSeconds: 10,
+      supportedDurationSecondsByModel: MINIMAX_MODEL_ALLOWED_DURATIONS,
      supportsResolution: true,
      supportsWatermark: false,
    },
--- a/extensions/openai/video-generation-provider.ts
+++ b/extensions/openai/video-generation-provider.ts
@@ -194,6 +194,7 @@ export function buildOpenAIVideoGenerationProvider(): VideoGenerationProvider {
      maxInputImages: 1,
      maxInputVideos: 1,
      maxDurationSeconds: 12,
+      supportedDurationSeconds: OPENAI_VIDEO_SECONDS,
      supportsSize: true,
    },
    async generateVideo(req) {
--- a/src/agents/tools/video-generate-tool.test.ts
+++ b/src/agents/tools/video-generate-tool.test.ts
@@ -11,6 +11,7 @@ function asConfig(value: unknown): OpenClawConfig {
 describe("createVideoGenerateTool", () => {
  beforeEach(() => {
    vi.restoreAllMocks();
+    vi.spyOn(videoGenerationRuntime, "listRuntimeVideoGenerationProviders").mockReturnValue([]);
  });

  afterEach(() => {
@@ -88,4 +89,90 @@ describe("createVideoGenerateTool", () => {
      metadata: { taskId: "task-1" },
    });
  });
+
+  it("shows duration normalization details from runtime metadata", async () => {
+    vi.spyOn(videoGenerationRuntime, "generateVideo").mockResolvedValue({
+      provider: "google",
+      model: "veo-3.1-fast-generate-preview",
+      attempts: [],
+      videos: [
+        {
+          buffer: Buffer.from("video-bytes"),
+          mimeType: "video/mp4",
+          fileName: "lobster.mp4",
+        },
+      ],
+      metadata: {
+        requestedDurationSeconds: 5,
+        normalizedDurationSeconds: 6,
+        supportedDurationSeconds: [4, 6, 8],
+      },
+    });
+    vi.spyOn(mediaStore, "saveMediaBuffer").mockResolvedValueOnce({
+      path: "/tmp/generated-lobster.mp4",
+      id: "generated-lobster.mp4",
+      size: 11,
+      contentType: "video/mp4",
+    });
+
+    const tool = createVideoGenerateTool({
+      config: asConfig({
+        agents: {
+          defaults: {
+            videoGenerationModel: { primary: "google/veo-3.1-fast-generate-preview" },
+          },
+        },
+      }),
+    });
+    if (!tool) {
+      throw new Error("expected video_generate tool");
+    }
+
+    const result = await tool.execute("call-1", {
+      prompt: "friendly lobster surfing",
+      durationSeconds: 5,
+    });
+    const text = (result.content?.[0] as { text: string } | undefined)?.text ?? "";
+
+    expect(text).toContain("Duration normalized: requested 5s; used 6s.");
+    expect(result.details).toMatchObject({
+      durationSeconds: 6,
+      requestedDurationSeconds: 5,
+      supportedDurationSeconds: [4, 6, 8],
+    });
+  });
+
+  it("lists supported provider durations when advertised", async () => {
+    vi.spyOn(videoGenerationRuntime, "listRuntimeVideoGenerationProviders").mockReturnValue([
+      {
+        id: "google",
+        defaultModel: "veo-3.1-fast-generate-preview",
+        models: ["veo-3.1-fast-generate-preview"],
+        capabilities: {
+          maxDurationSeconds: 8,
+          supportedDurationSeconds: [4, 6, 8],
+        },
+        generateVideo: vi.fn(async () => {
+          throw new Error("not used");
+        }),
+      },
+    ]);
+
+    const tool = createVideoGenerateTool({
+      config: asConfig({
+        agents: {
+          defaults: {
+            videoGenerationModel: { primary: "google/veo-3.1-fast-generate-preview" },
+          },
+        },
+      }),
+    });
+    if (!tool) {
+      throw new Error("expected video_generate tool");
+    }
+
+    const result = await tool.execute("call-1", { action: "list" });
+    const text = (result.content?.[0] as { text: string } | undefined)?.text ?? "";
+    expect(text).toContain("supportedDurationSeconds=4/6/8");
+  });
 });
--- a/src/agents/tools/video-generate-tool.ts
+++ b/src/agents/tools/video-generate-tool.ts
@@ -6,6 +6,7 @@ import { loadWebMedia } from "../../media/web-media.js";
 import { readSnakeCaseParamRaw } from "../../param-key.js";
 import { getProviderEnvVars } from "../../secrets/provider-env-vars.js";
 import { resolveUserPath } from "../../utils.js";
+import { resolveVideoGenerationSupportedDurations } from "../../video-generation/duration-support.js";
 import { parseVideoGenerationModelRef } from "../../video-generation/model-ref.js";
 import {
  generateVideo,
@@ -114,7 +115,8 @@ const VideoGenerateToolSchema = Type.Object({
  ),
  durationSeconds: Type.Optional(
    Type.Number({
-      description: "Optional target duration in seconds.",
+      description:
+        "Optional target duration in seconds. OpenClaw may round this to the nearest provider-supported duration.",
      minimum: 1,
    }),
  ),
@@ -329,6 +331,7 @@ function resolveSelectedVideoGenerationProvider(params: {

 function validateVideoGenerationCapabilities(params: {
  provider: VideoGenerationProvider | undefined;
+  model?: string;
  inputImageCount: number;
  inputVideoCount: number;
  size?: string;
@@ -371,6 +374,10 @@ function validateVideoGenerationCapabilities(params: {
  if (
    typeof params.durationSeconds === "number" &&
    Number.isFinite(params.durationSeconds) &&
+    !resolveVideoGenerationSupportedDurations({
+      provider,
+      model: params.model,
+    }) &&
    typeof caps.maxDurationSeconds === "number" &&
    params.durationSeconds > caps.maxDurationSeconds
  ) {
@@ -535,7 +542,7 @@ export function createVideoGenerateTool(options?: {
    name: "video_generate",
    displaySummary: "Generate videos",
    description:
-      "Generate videos using configured providers. Generated videos are saved under OpenClaw-managed media storage and delivered automatically as attachments.",
+      "Generate videos using configured providers. Generated videos are saved under OpenClaw-managed media storage and delivered automatically as attachments. Duration requests may be rounded to the nearest provider-supported value.",
    parameters: VideoGenerateToolSchema,
    execute: async (_toolCallId, rawArgs) => {
      const args = rawArgs as Record<string, unknown>;
@@ -564,6 +571,17 @@ export function createVideoGenerateTool(options?: {
            provider.capabilities.maxDurationSeconds
              ? `maxDurationSeconds=${provider.capabilities.maxDurationSeconds}`
              : null,
+            provider.capabilities.supportedDurationSeconds?.length
+              ? `supportedDurationSeconds=${provider.capabilities.supportedDurationSeconds.join("/")}`
+              : null,
+            provider.capabilities.supportedDurationSecondsByModel &&
+            Object.keys(provider.capabilities.supportedDurationSecondsByModel).length > 0
+              ? `supportedDurationSecondsByModel=${Object.entries(
+                  provider.capabilities.supportedDurationSecondsByModel,
+                )
+                  .map(([modelId, durations]) => `${modelId}:${durations.join("/")}`)
+                  .join("; ")}`
+              : null,
            provider.capabilities.supportsResolution ? "resolution" : null,
            provider.capabilities.supportsAspectRatio ? "aspectRatio" : null,
            provider.capabilities.supportsSize ? "size" : null,
@@ -639,6 +657,8 @@ export function createVideoGenerateTool(options?: {
      });
      validateVideoGenerationCapabilities({
        provider: selectedProvider,
+        model:
+          parseVideoGenerationModelRef(model)?.model ?? model ?? selectedProvider?.defaultModel,
        inputImageCount: loadedReferenceImages.length,
        inputVideoCount: loadedReferenceVideos.length,
        size,
@@ -674,10 +694,30 @@ export function createVideoGenerateTool(options?: {
          ),
        ),
      );
+      const requestedDurationSeconds =
+        typeof result.metadata?.requestedDurationSeconds === "number" &&
+        Number.isFinite(result.metadata.requestedDurationSeconds)
+          ? result.metadata.requestedDurationSeconds
+          : durationSeconds;
+      const normalizedDurationSeconds =
+        typeof result.metadata?.normalizedDurationSeconds === "number" &&
+        Number.isFinite(result.metadata.normalizedDurationSeconds)
+          ? result.metadata.normalizedDurationSeconds
+          : requestedDurationSeconds;
+      const supportedDurationSeconds = Array.isArray(result.metadata?.supportedDurationSeconds)
+        ? result.metadata.supportedDurationSeconds.filter(
+            (entry): entry is number => typeof entry === "number" && Number.isFinite(entry),
+          )
+        : undefined;
      const lines = [
        `Generated ${savedVideos.length} video${savedVideos.length === 1 ? "" : "s"} with ${result.provider}/${result.model}.`,
+        typeof requestedDurationSeconds === "number" &&
+        typeof normalizedDurationSeconds === "number" &&
+        requestedDurationSeconds !== normalizedDurationSeconds
+          ? `Duration normalized: requested ${requestedDurationSeconds}s; used ${normalizedDurationSeconds}s.`
+          : null,
        ...savedVideos.map((video) => `MEDIA:${video.path}`),
-      ];
+      ].filter((entry): entry is string => Boolean(entry));

      return {
        content: [{ type: "text", text: lines.join("\n") }],
@@ -722,7 +762,17 @@ export function createVideoGenerateTool(options?: {
          ...(size ? { size } : {}),
          ...(aspectRatio ? { aspectRatio } : {}),
          ...(resolution ? { resolution } : {}),
-          ...(typeof durationSeconds === "number" ? { durationSeconds } : {}),
+          ...(typeof normalizedDurationSeconds === "number"
+            ? { durationSeconds: normalizedDurationSeconds }
+            : {}),
+          ...(typeof requestedDurationSeconds === "number" &&
+          typeof normalizedDurationSeconds === "number" &&
+          requestedDurationSeconds !== normalizedDurationSeconds
+            ? { requestedDurationSeconds }
+            : {}),
+          ...(supportedDurationSeconds && supportedDurationSeconds.length > 0
+            ? { supportedDurationSeconds }
+            : {}),
          ...(typeof audio === "boolean" ? { audio } : {}),
          ...(typeof watermark === "boolean" ? { watermark } : {}),
          ...(filename ? { filename } : {}),
--- a/src/video-generation/duration-support.ts
+++ b/src/video-generation/duration-support.ts
@@ -0,0 +1,54 @@
+import type { VideoGenerationProvider } from "./types.js";
+
+/** Normalizes durations to unique, ascending, positive whole seconds; undefined when none remain. */
+function normalizeSupportedDurationValues(
+  values: readonly number[] | undefined,
+): number[] | undefined {
+  if (!Array.isArray(values) || values.length === 0) {
+    return undefined;
+  }
+  // Round before de-duplicating: 4.2 and 4.4 both become 4 and must collapse into one entry.
+  const rounded = values.filter((value) => Number.isFinite(value)).map((v) => Math.round(v));
+  // Inputs below half a second round to 0 and negatives stay non-positive; drop both.
+  const normalized = [...new Set(rounded)].filter((value) => value > 0).toSorted((a, b) => a - b);
+  return normalized.length > 0 ? normalized : undefined;
+}
+
+/** Supported durations for a provider/model; an own per-model entry overrides the provider list. */
+export function resolveVideoGenerationSupportedDurations(params: {
+  provider?: VideoGenerationProvider;
+  model?: string;
+}): number[] | undefined {
+  const caps = params.provider?.capabilities;
+  const modelId = params.model?.trim();
+  const byModel = caps?.supportedDurationSecondsByModel;
+  // Own keys only, so an id such as "constructor" cannot hit an inherited Object.prototype member.
+  const own = modelId && byModel && Object.hasOwn(byModel, modelId) ? byModel[modelId] : undefined;
+  return normalizeSupportedDurationValues(own ?? caps?.supportedDurationSeconds);
+}
+
+/**
+ * Maps a requested duration onto one the provider/model can produce: undefined for a missing or
+ * non-finite request, otherwise whole seconds (minimum 1) snapped to the closest supported value
+ * when the provider/model advertises a discrete list.
+ */
+export function normalizeVideoGenerationDuration(params: {
+  provider?: VideoGenerationProvider;
+  model?: string;
+  durationSeconds?: number;
+}): number | undefined {
+  const { durationSeconds } = params;
+  if (typeof durationSeconds !== "number" || !Number.isFinite(durationSeconds)) {
+    return undefined;
+  }
+  const target = Math.max(1, Math.round(durationSeconds));
+  let chosen: number | undefined;
+  for (const candidate of resolveVideoGenerationSupportedDurations(params) ?? []) {
+    const gap = Math.abs(candidate - target) - Math.abs((chosen ?? candidate) - target);
+    // Strictly closer wins; an exact tie goes to the longer duration.
+    if (chosen === undefined || gap < 0 || (gap === 0 && candidate > chosen)) {
+      chosen = candidate;
+    }
+  }
+  return chosen ?? target;
+}
--- a/src/video-generation/runtime.test.ts
+++ b/src/video-generation/runtime.test.ts
@@ -150,6 +150,43 @@ describe("video-generation runtime", () => {
    expect(mocks.listVideoGenerationProviders).toHaveBeenCalledWith({} as OpenClawConfig);
  });

+  it("normalizes requested durations to supported provider values", async () => {
+    let seenDurationSeconds: number | undefined;
+    mocks.resolveAgentModelPrimaryValue.mockReturnValue("video-plugin/vid-v1");
+    mocks.getVideoGenerationProvider.mockReturnValue({
+      id: "video-plugin",
+      capabilities: {
+        supportedDurationSeconds: [4, 6, 8],
+      },
+      generateVideo: async (req) => {
+        seenDurationSeconds = req.durationSeconds;
+        return {
+          videos: [{ buffer: Buffer.from("mp4-bytes"), mimeType: "video/mp4" }],
+          model: "vid-v1",
+        };
+      },
+    });
+
+    const result = await generateVideo({
+      cfg: {
+        agents: {
+          defaults: {
+            videoGenerationModel: { primary: "video-plugin/vid-v1" },
+          },
+        },
+      } as OpenClawConfig,
+      prompt: "animate a cat",
+      durationSeconds: 5,
+    });
+
+    expect(seenDurationSeconds).toBe(6);
+    expect(result.metadata).toMatchObject({
+      requestedDurationSeconds: 5,
+      normalizedDurationSeconds: 6,
+      supportedDurationSeconds: [4, 6, 8],
+    });
+  });
+
  it("builds a generic config hint without hardcoded provider ids", async () => {
    mocks.listVideoGenerationProviders.mockReturnValue([
      {
--- a/src/video-generation/runtime.ts
+++ b/src/video-generation/runtime.ts
@@ -8,6 +8,10 @@ import {
 } from "../config/model-input.js";
 import { createSubsystemLogger } from "../logging/subsystem.js";
 import { getProviderEnvVars } from "../secrets/provider-env-vars.js";
+import {
+  normalizeVideoGenerationDuration,
+  resolveVideoGenerationSupportedDurations,
+} from "./duration-support.js";
 import { parseVideoGenerationModelRef } from "./model-ref.js";
 import { getVideoGenerationProvider, listVideoGenerationProviders } from "./provider-registry.js";
 import type {
@@ -147,6 +151,19 @@ export async function generateVideo(
    }

    try {
+      const requestedDurationSeconds =
+        typeof params.durationSeconds === "number" && Number.isFinite(params.durationSeconds)
+          ? Math.max(1, Math.round(params.durationSeconds))
+          : undefined;
+      const normalizedDurationSeconds = normalizeVideoGenerationDuration({
+        provider,
+        model: candidate.model,
+        durationSeconds: requestedDurationSeconds,
+      });
+      const supportedDurationSeconds = resolveVideoGenerationSupportedDurations({
+        provider,
+        model: candidate.model,
+      });
      const result: VideoGenerationResult = await provider.generateVideo({
        provider: candidate.provider,
        model: candidate.model,
@@ -157,7 +174,7 @@ export async function generateVideo(
        size: params.size,
        aspectRatio: params.aspectRatio,
        resolution: params.resolution,
-        durationSeconds: params.durationSeconds,
+        durationSeconds: normalizedDurationSeconds,
        audio: params.audio,
        watermark: params.watermark,
        inputImages: params.inputImages,
@@ -171,7 +188,17 @@ export async function generateVideo(
        provider: candidate.provider,
        model: result.model ?? candidate.model,
        attempts,
-        metadata: result.metadata,
+        metadata:
+          typeof requestedDurationSeconds === "number" &&
+          typeof normalizedDurationSeconds === "number" &&
+          requestedDurationSeconds !== normalizedDurationSeconds
+            ? {
+                ...result.metadata,
+                requestedDurationSeconds,
+                normalizedDurationSeconds,
+                ...(supportedDurationSeconds ? { supportedDurationSeconds } : {}),
+              }
+            : result.metadata,
      };
    } catch (err) {
      lastError = err;
--- a/src/video-generation/types.ts
+++ b/src/video-generation/types.ts
@@ -52,6 +52,8 @@ export type VideoGenerationProviderCapabilities = {
  maxInputImages?: number;
  maxInputVideos?: number;
  maxDurationSeconds?: number;
+  supportedDurationSeconds?: readonly number[];
+  supportedDurationSecondsByModel?: Readonly<Record<string, readonly number[]>>;
  supportsSize?: boolean;
  supportsAspectRatio?: boolean;
  supportsResolution?: boolean;