fix: harden feishu startup probe sequencing (#29941) (thanks @bmendonca3)

This commit is contained in:
Peter Steinberger
2026-03-02 03:58:34 +00:00
parent abc7b6fbec
commit 3fc19ed7d7
5 changed files with 170 additions and 10 deletions

View File

@@ -267,7 +267,7 @@ Docs: https://docs.openclaw.ai
- Agents/Ollama discovery: skip Ollama discovery when explicit models are configured. (#28827) Thanks @Kansodata and @vincentkoc.
- Issues/triage labeling: consolidate bug intake to a single bug issue form with required bug-type classification (regression/crash/behavior), auto-apply matching subtype labels from issue form content, and retire the separate regression template to reduce misfiled issue types and improve queue filtering. Thanks @vincentkoc.
- Android/Onboarding + voice reliability: request per-toggle onboarding permissions, update pairing guidance to `openclaw devices list/approve`, restore assistant speech playback in mic capture flow, cancel superseded in-flight speech (mute + per-reply token rotation), and keep `talk.config` loads retryable after transient failures. (#29796) Thanks @obviyus.
- Feishu/Startup probes: serialize multi-account bot-info probes during monitor startup so large Feishu account sets do not burst `/open-apis/bot/v3/info` and trigger avoidable rate limits. (#26685)
- Feishu/Startup probes: serialize multi-account bot-info probes during monitor startup so large Feishu account sets do not burst `/open-apis/bot/v3/info`, bound startup probe latency/abort handling to avoid head-of-line stalls, and avoid triggering rate limits. (#26685, #29941) Thanks @bmendonca3.
- FS/Sandbox workspace boundaries: add a dedicated `outside-workspace` safe-open error code for root-escape checks, and propagate specific outside-workspace messages across edit/browser/media consumers instead of generic not-found/invalid-path fallbacks. (#29715) Thanks @YuzuruS.
- Config/Doctor group allowlist diagnostics: align `groupPolicy: "allowlist"` warnings with per-channel runtime semantics by excluding Google Chat sender-list checks and by warning when no-fallback channels (for example iMessage) omit `groupAllowFrom`, with regression coverage. (#28477) Thanks @tonydehnke.
- Slack/Disabled channel startup: skip Slack monitor socket startup entirely when `channels.slack.enabled=false` (including configs that still contain valid tokens), preventing disabled accounts from opening websocket connections. (#30586) Thanks @liuxiaopai-ai.

View File

@@ -34,6 +34,9 @@ const botOpenIds = new Map<string, string>();
const FEISHU_WEBHOOK_MAX_BODY_BYTES = 1024 * 1024;
const FEISHU_WEBHOOK_BODY_TIMEOUT_MS = 30_000;
const FEISHU_REACTION_VERIFY_TIMEOUT_MS = 1_500;
const FEISHU_STARTUP_BOT_INFO_TIMEOUT_MS = 10_000;
const FEISHU_BOT_INFO_FETCH_ABORTED = Symbol("feishu-bot-info-fetch-aborted");
const FEISHU_BOT_INFO_FETCH_TIMED_OUT = Symbol("feishu-bot-info-fetch-timed-out");
export type FeishuReactionCreatedEvent = {
message_id: string;
@@ -188,12 +191,68 @@ export async function resolveReactionSyntheticEvent(
};
}
async function fetchBotOpenId(account: ResolvedFeishuAccount): Promise<string | undefined> {
type FetchBotOpenIdOptions = {
runtime?: RuntimeEnv;
abortSignal?: AbortSignal;
timeoutMs?: number;
};
async function fetchBotOpenId(
account: ResolvedFeishuAccount,
options: FetchBotOpenIdOptions = {},
): Promise<string | undefined> {
if (options.abortSignal?.aborted) {
return undefined;
}
const timeoutMs = options.timeoutMs ?? FEISHU_STARTUP_BOT_INFO_TIMEOUT_MS;
let timeoutHandle: ReturnType<typeof setTimeout> | undefined;
let abortHandler: (() => void) | undefined;
try {
const result = await probeFeishu(account);
return result.ok ? result.botOpenId : undefined;
const contenders: Array<
Promise<
| string
| undefined
| typeof FEISHU_BOT_INFO_FETCH_ABORTED
| typeof FEISHU_BOT_INFO_FETCH_TIMED_OUT
>
> = [
probeFeishu(account)
.then((result) => (result.ok ? result.botOpenId : undefined))
.catch(() => undefined),
new Promise((resolve) => {
timeoutHandle = setTimeout(() => resolve(FEISHU_BOT_INFO_FETCH_TIMED_OUT), timeoutMs);
}),
];
if (options.abortSignal) {
contenders.push(
new Promise((resolve) => {
abortHandler = () => resolve(FEISHU_BOT_INFO_FETCH_ABORTED);
options.abortSignal?.addEventListener("abort", abortHandler, { once: true });
}),
);
}
const outcome = await Promise.race(contenders);
if (outcome === FEISHU_BOT_INFO_FETCH_ABORTED) {
return undefined;
}
if (outcome === FEISHU_BOT_INFO_FETCH_TIMED_OUT) {
const error = options.runtime?.error ?? console.error;
error(
`feishu[${account.accountId}]: bot info probe timed out after ${timeoutMs}ms; continuing startup`,
);
return undefined;
}
return outcome;
} catch {
return undefined;
} finally {
if (timeoutHandle) {
clearTimeout(timeoutHandle);
}
if (abortHandler) {
options.abortSignal?.removeEventListener("abort", abortHandler);
}
}
}
@@ -347,7 +406,9 @@ async function monitorSingleAccount(params: MonitorAccountParams): Promise<void>
const log = runtime?.log ?? console.log;
// Fetch bot open_id
const botOpenId = params.botOpenIdPrefetched ? params.botOpenId : await fetchBotOpenId(account);
const botOpenId = params.botOpenIdPrefetched
? params.botOpenId
: await fetchBotOpenId(account, { runtime, abortSignal });
botOpenIds.set(accountId, botOpenId ?? "");
log(`feishu[${accountId}]: bot open_id resolved: ${botOpenId ?? "unknown"}`);
@@ -550,8 +611,19 @@ export async function monitorFeishuProvider(opts: MonitorFeishuOpts = {}): Promi
const monitorPromises: Promise<void>[] = [];
for (const account of accounts) {
if (opts.abortSignal?.aborted) {
log("feishu: abort signal received during startup preflight; stopping startup");
break;
}
// Probe sequentially so large multi-account startups do not burst Feishu's bot-info endpoint.
const botOpenId = await fetchBotOpenId(account);
const botOpenId = await fetchBotOpenId(account, {
runtime: opts.runtime,
abortSignal: opts.abortSignal,
});
if (opts.abortSignal?.aborted) {
log("feishu: abort signal received during startup preflight; stopping startup");
break;
}
monitorPromises.push(
monitorSingleAccount({
cfg,

View File

@@ -287,9 +287,9 @@ describe("Feishu webhook security hardening", () => {
});
try {
await Promise.resolve();
await Promise.resolve();
await Promise.resolve();
for (let i = 0; i < 10 && !started.includes("beta"); i += 1) {
await Promise.resolve();
}
expect(started).toEqual(["alpha", "beta"]);
expect(started.filter((accountId) => accountId === "alpha")).toHaveLength(1);
@@ -299,4 +299,73 @@ describe("Feishu webhook security hardening", () => {
await monitorPromise;
}
});
it("continues startup when a sequential preflight probe times out", async () => {
vi.useFakeTimers();
const started: string[] = [];
let releaseBetaProbe!: () => void;
const betaProbeReleased = new Promise<void>((resolve) => {
releaseBetaProbe = () => resolve();
});
probeFeishuMock.mockImplementation((account: { accountId: string }) => {
started.push(account.accountId);
if (account.accountId === "alpha") {
return new Promise<never>(() => {});
}
return betaProbeReleased.then(() => ({ ok: true, botOpenId: `bot_${account.accountId}` }));
});
const abortController = new AbortController();
const runtime = { log: vi.fn(), error: vi.fn(), exit: vi.fn() };
const monitorPromise = monitorFeishuProvider({
config: buildMultiAccountWebsocketConfig(["alpha", "beta"]),
runtime,
abortSignal: abortController.signal,
});
try {
await Promise.resolve();
expect(started).toEqual(["alpha"]);
await vi.advanceTimersByTimeAsync(10_000);
await Promise.resolve();
expect(started).toEqual(["alpha", "beta"]);
expect(runtime.error).toHaveBeenCalledWith(
expect.stringContaining("bot info probe timed out"),
);
} finally {
releaseBetaProbe();
abortController.abort();
await monitorPromise;
vi.useRealTimers();
}
});
it("stops sequential preflight when aborted during a stuck probe", async () => {
const started: string[] = [];
probeFeishuMock.mockImplementation((account: { accountId: string }) => {
started.push(account.accountId);
return new Promise<never>(() => {});
});
const abortController = new AbortController();
const monitorPromise = monitorFeishuProvider({
config: buildMultiAccountWebsocketConfig(["alpha", "beta"]),
abortSignal: abortController.signal,
});
try {
await Promise.resolve();
expect(started).toEqual(["alpha"]);
abortController.abort();
await monitorPromise;
expect(started).toEqual(["alpha"]);
} finally {
abortController.abort();
}
});
});

View File

@@ -6,7 +6,7 @@ vi.mock("./client.js", () => ({
createFeishuClient: createFeishuClientMock,
}));
import { probeFeishu, clearProbeCache } from "./probe.js";
import { FEISHU_PROBE_REQUEST_TIMEOUT_MS, probeFeishu, clearProbeCache } from "./probe.js";
function makeRequestFn(response: Record<string, unknown>) {
return vi.fn().mockResolvedValue(response);
@@ -59,6 +59,23 @@ describe("probeFeishu", () => {
expect(requestFn).toHaveBeenCalledTimes(1);
});
it("uses explicit timeout for bot info request", async () => {
const requestFn = setupClient({
code: 0,
bot: { bot_name: "TestBot", open_id: "ou_abc123" },
});
await probeFeishu({ appId: "cli_123", appSecret: "secret" });
expect(requestFn).toHaveBeenCalledWith(
expect.objectContaining({
method: "GET",
url: "/open-apis/bot/v3/info",
timeout: FEISHU_PROBE_REQUEST_TIMEOUT_MS,
}),
);
});
it("returns cached result on subsequent calls within TTL", async () => {
const requestFn = setupClient({
code: 0,

View File

@@ -8,6 +8,7 @@ import type { FeishuProbeResult } from "./types.js";
const probeCache = new Map<string, { result: FeishuProbeResult; expiresAt: number }>();
const PROBE_CACHE_TTL_MS = 10 * 60 * 1000; // 10 minutes
const MAX_PROBE_CACHE_SIZE = 64;
export const FEISHU_PROBE_REQUEST_TIMEOUT_MS = 10_000;
export async function probeFeishu(creds?: FeishuClientCredentials): Promise<FeishuProbeResult> {
if (!creds?.appId || !creds?.appSecret) {
@@ -35,6 +36,7 @@ export async function probeFeishu(creds?: FeishuClientCredentials): Promise<Feis
method: "GET",
url: "/open-apis/bot/v3/info",
data: {},
timeout: FEISHU_PROBE_REQUEST_TIMEOUT_MS,
});
if (response.code !== 0) {