fix(agents): enforce idle timeout during stream setup

Peter Steinberger
2026-05-10 09:39:23 +01:00
parent 72ffd3a464
commit 7d5cccaef4
3 changed files with 82 additions and 25 deletions

View File

@@ -176,6 +176,7 @@ Docs: https://docs.openclaw.ai
 - Plugins/doctor: invalidate persisted plugin registry snapshots when plugin diagnostics point at deleted source paths, so `openclaw doctor` stops repeating stale warnings after a local extension is replaced by a managed npm plugin. Fixes #80087. (#80134) Thanks @hclsys.
 - Doctor/OpenAI Codex: preserve Codex auth intent when auto-repairing legacy `openai-codex/*` model refs to canonical `openai/*` by adding provider/model-scoped Codex runtime policy, preventing repaired configs from falling through to direct OpenAI API-key auth. Fixes #78533 and #78570. Thanks @superck110 and @Azmodump.
 - CLI/agents: surface durable message delivery status from `sendDurableMessageBatch` in `deliverAgentCommandResult` and `openclaw agent --json --deliver`, preserving suppressed hook outcomes as terminal no-retry results while exposing partial and failed sends for automation. Supersedes #53961 and #57755. Thanks @Kaspre.
+- Agents: apply the LLM idle watchdog while provider stream setup is still pending, preventing silent pre-stream model hangs from waiting for the full agent timeout.
 - Cron: let isolated self-cleanup runs inspect their own job run history while keeping other cron jobs and mutation actions blocked. Fixes #80019. Thanks @hclsys.
 - Cron: report isolated agent-turn setup and pre-model stalls with phase-specific timeout errors instead of waiting for the full job budget when no model call starts. Fixes #74803. Thanks @jeffsteinbok-openclaw and @dgkim311.
 - CLI/plugins: treat arbitrary unknown subcommands outside plugin CLI metadata as normal unknown commands instead of suggesting `plugins.allow`, while preserving allowlist guidance for real plugin command roots. Fixes #80109. (#80123) Thanks @kagura-agent.
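
(Sketch, not part of the commit.) The new Agents entry above amounts to racing provider stream setup against the same idle watchdog that already guards chunk delivery. A minimal standalone TypeScript sketch of that pattern, using illustrative names rather than the actual openclaw API:

function raceWithIdleTimeout<T>(setup: Promise<T>, timeoutMs: number): Promise<T> {
  // Illustrative helper, assuming Node.js timers; not the function this commit adds.
  let timer: NodeJS.Timeout | undefined;
  const timeout = new Promise<never>((_, reject) => {
    timer = setTimeout(() => {
      reject(new Error(`LLM idle timeout (${Math.floor(timeoutMs / 1000)}s): no response from model`));
    }, timeoutMs);
    timer.unref?.(); // don't keep the process alive just for the watchdog
  });
  // Whichever settles first wins; always clear the timer afterwards.
  return Promise.race([setup, timeout]).finally(() => {
    if (timer) clearTimeout(timer);
  });
}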

View File

@@ -42,12 +42,20 @@ describe("resolveLlmIdleTimeoutMs", () => {
     expect(resolveLlmIdleTimeoutMs({ runTimeoutMs: 2_147_000_000 })).toBe(0);
   });
-  it("uses the provider request timeout as the model idle watchdog", () => {
-    expect(resolveLlmIdleTimeoutMs({ modelRequestTimeoutMs: 300_000 })).toBe(300_000);
+  it("caps remote provider request timeouts at the default idle watchdog", () => {
+    expect(resolveLlmIdleTimeoutMs({ modelRequestTimeoutMs: 300_000 })).toBe(
+      DEFAULT_LLM_IDLE_TIMEOUT_MS,
+    );
   });
+  it("uses remote provider request timeouts when shorter than the default idle watchdog", () => {
+    expect(resolveLlmIdleTimeoutMs({ modelRequestTimeoutMs: 30_000 })).toBe(30_000);
+  });
   it("caps provider request timeout at the max safe timeout", () => {
-    expect(resolveLlmIdleTimeoutMs({ modelRequestTimeoutMs: 10_000_000_000 })).toBe(2_147_000_000);
+    expect(
+      resolveLlmIdleTimeoutMs({ trigger: "cron", modelRequestTimeoutMs: 10_000_000_000 }),
+    ).toBe(2_147_000_000);
   });
   it("ignores invalid provider request timeout values", () => {
@@ -296,6 +304,23 @@ describe("streamWithIdleTimeout", () => {
     await next;
   });
+  it("throws when a promise stream never resolves", async () => {
+    vi.useFakeTimers();
+    const baseFn = vi.fn().mockReturnValue(new Promise<AsyncIterable<unknown>>(() => {}));
+    const onIdleTimeout = vi.fn();
+    const wrapped = streamWithIdleTimeout(baseFn, 50, onIdleTimeout);
+    const model = {} as Parameters<typeof baseFn>[0];
+    const context = {} as Parameters<typeof baseFn>[1];
+    const options = {} as Parameters<typeof baseFn>[2];
+    const stream = expect(wrapped(model, context, options)).rejects.toThrow(/LLM idle timeout/);
+    await vi.advanceTimersByTimeAsync(50);
+    await stream;
+    expect(onIdleTimeout).toHaveBeenCalledTimes(1);
+  });
   it("resets timer on each chunk", async () => {
     const chunks = [{ text: "a" }, { text: "b" }, { text: "c" }];
     const mockStream = createMockAsyncIterable(chunks);
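
The "resets timer on each chunk" test above leans on a createMockAsyncIterable helper, presumably defined elsewhere in the test file and not shown in this diff. A plausible minimal sketch, assuming it simply yields the given chunks in order:

function createMockAsyncIterable<T>(chunks: T[]): AsyncIterable<T> {
  // Assumed shape of the test helper; the real implementation may differ.
  return {
    async *[Symbol.asyncIterator]() {
      for (const chunk of chunks) {
        yield chunk; // one chunk per next() call, in order
      }
    },
  };
}

Wrapped with streamWithIdleTimeout, each yielded chunk wins its Promise.race against the idle timer, which is what lets the watchdog reset chunk by chunk.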

View File

@@ -144,6 +144,9 @@ export function resolveLlmIdleTimeoutMs(params?: {
       value > 0 &&
       value < MAX_SAFE_TIMEOUT_MS,
   );
+  const baseUrl = params?.model?.baseUrl;
+  const isLocalProvider =
+    typeof baseUrl === "string" && baseUrl.length > 0 && isLocalProviderBaseUrl(baseUrl);
   const modelRequestTimeoutMs = params?.modelRequestTimeoutMs;
   if (
@@ -151,7 +154,11 @@
     Number.isFinite(modelRequestTimeoutMs) &&
     modelRequestTimeoutMs > 0
   ) {
-    return clampTimeoutMs(Math.min(modelRequestTimeoutMs, ...timeoutBounds));
+    const boundedTimeoutMs = Math.min(modelRequestTimeoutMs, ...timeoutBounds);
+    if (params?.trigger === "cron" || isLocalProvider) {
+      return clampTimeoutMs(boundedTimeoutMs);
+    }
+    return clampImplicitTimeoutMs(boundedTimeoutMs);
   }
   if (typeof runTimeoutMs === "number" && Number.isFinite(runTimeoutMs) && runTimeoutMs > 0) {
@@ -176,13 +183,7 @@
   // baseUrl pointing at loopback / private-network / `.local`. Ollama cloud
   // models are still hosted remotely even when proxied through local Ollama, so
   // keep the cloud watchdog for `*:cloud` model ids.
-  const baseUrl = params?.model?.baseUrl;
-  if (
-    typeof baseUrl === "string" &&
-    baseUrl.length > 0 &&
-    isLocalProviderBaseUrl(baseUrl) &&
-    !isOllamaCloudModel(params?.model)
-  ) {
+  if (isLocalProvider && !isOllamaCloudModel(params?.model)) {
     return 0;
   }
@@ -206,6 +207,21 @@
   return (model, context, options) => {
     const maybeStream = baseFn(model, context, options);
+    const createIdleTimeoutError = () =>
+      new Error(`LLM idle timeout (${Math.floor(timeoutMs / 1000)}s): no response from model`);
+    const createTimeoutPromise = (setTimer: (timer: NodeJS.Timeout) => void): Promise<never> => {
+      return new Promise((_, reject) => {
+        const timer = setTimeout(() => {
+          const error = createIdleTimeoutError();
+          onIdleTimeout?.(error);
+          reject(error);
+        }, timeoutMs);
+        timer.unref?.();
+        setTimer(timer);
+      });
+    };
     const wrapStream = (stream: ReturnType<typeof streamSimple>) => {
       const originalAsyncIterator = stream[Symbol.asyncIterator].bind(stream);
       (stream as { [Symbol.asyncIterator]: typeof originalAsyncIterator })[Symbol.asyncIterator] =
@@ -213,18 +229,6 @@
         const iterator = originalAsyncIterator();
         let idleTimer: NodeJS.Timeout | null = null;
-        const createTimeoutPromise = (): Promise<never> => {
-          return new Promise((_, reject) => {
-            idleTimer = setTimeout(() => {
-              const error = new Error(
-                `LLM idle timeout (${Math.floor(timeoutMs / 1000)}s): no response from model`,
-              );
-              onIdleTimeout?.(error);
-              reject(error);
-            }, timeoutMs);
-          });
-        };
         const clearTimer = () => {
           if (idleTimer) {
             clearTimeout(idleTimer);
@@ -239,7 +243,12 @@
           try {
             // Race between the actual next() and the timeout
-            const result = await Promise.race([streamIterator.next(), createTimeoutPromise()]);
+            const result = await Promise.race([
+              streamIterator.next(),
+              createTimeoutPromise((timer) => {
+                idleTimer = timer;
+              }),
+            ]);
             if (result.done) {
               clearTimer();
@@ -268,7 +277,29 @@
     };
     if (maybeStream && typeof maybeStream === "object" && "then" in maybeStream) {
-      return Promise.resolve(maybeStream).then(wrapStream);
+      let streamPromiseTimer: NodeJS.Timeout | null = null;
+      const clearStreamPromiseTimer = () => {
+        if (streamPromiseTimer) {
+          clearTimeout(streamPromiseTimer);
+          streamPromiseTimer = null;
+        }
+      };
+      return Promise.race([
+        Promise.resolve(maybeStream),
+        createTimeoutPromise((timer) => {
+          streamPromiseTimer = timer;
+        }),
+      ]).then(
+        (stream) => {
+          clearStreamPromiseTimer();
+          return wrapStream(stream);
+        },
+        (error) => {
+          clearStreamPromiseTimer();
+          throw error;
+        },
+      );
     }
     return wrapStream(maybeStream);
  };
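
Note that DEFAULT_LLM_IDLE_TIMEOUT_MS and clampImplicitTimeoutMs are referenced above without their definitions appearing in this diff. Inferring from the test expectations (a 300_000 ms remote request timeout is capped at the default watchdog while 30_000 ms passes through, and cron-triggered timeouts are only clamped to the max safe value), a sketch consistent with that behavior might look like this; the default value itself is an assumption:

const DEFAULT_LLM_IDLE_TIMEOUT_MS = 120_000; // assumed: somewhere between 30s and 300s
const MAX_SAFE_TIMEOUT_MS = 2_147_000_000; // just under the 32-bit setTimeout limit

function clampTimeoutMs(value: number): number {
  // Assumed helper: bound a timeout to the safe setTimeout range.
  return Math.min(Math.max(0, Math.floor(value)), MAX_SAFE_TIMEOUT_MS);
}

function clampImplicitTimeoutMs(value: number): number {
  // Assumed helper: implicit (config-derived) remote timeouts are additionally
  // capped at the default idle watchdog; the explicit cron/local paths above
  // use clampTimeoutMs and skip this cap.
  return clampTimeoutMs(Math.min(value, DEFAULT_LLM_IDLE_TIMEOUT_MS));
}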