fix(gateway): enforce singleton lock
This commit is contained in:
@@ -20,6 +20,7 @@ Docs: https://docs.clawd.bot
|
|||||||
- Nodes tool: include agent/node/gateway context in tool failure logs to speed approval debugging.
|
- Nodes tool: include agent/node/gateway context in tool failure logs to speed approval debugging.
|
||||||
- macOS: exec approvals now respect wildcard agent allowlists (`*`).
|
- macOS: exec approvals now respect wildcard agent allowlists (`*`).
|
||||||
- macOS: allow SSH agent auth when no identity file is set. (#1384) Thanks @ameno-.
|
- macOS: allow SSH agent auth when no identity file is set. (#1384) Thanks @ameno-.
|
||||||
|
- Gateway: prevent multiple gateways from sharing the same config/state at once (singleton lock).
|
||||||
- UI: remove the chat stop button and keep the composer aligned to the bottom edge.
|
- UI: remove the chat stop button and keep the composer aligned to the bottom edge.
|
||||||
- Typing: start instant typing indicators at run start so DMs and mentions show immediately.
|
- Typing: start instant typing indicators at run start so DMs and mentions show immediately.
|
||||||
- Configure: restrict the model allowlist picker to OAuth-compatible Anthropic models and preselect Opus 4.5.
|
- Configure: restrict the model allowlist picker to OAuth-compatible Anthropic models and preselect Opus 4.5.
|
||||||
|
|||||||
@@ -1,4 +1,5 @@
|
|||||||
import type { startGatewayServer } from "../../gateway/server.js";
|
import type { startGatewayServer } from "../../gateway/server.js";
|
||||||
|
import { acquireGatewayLock } from "../../infra/gateway-lock.js";
|
||||||
import {
|
import {
|
||||||
consumeGatewaySigusr1RestartAuthorization,
|
consumeGatewaySigusr1RestartAuthorization,
|
||||||
isGatewaySigusr1RestartExternallyAllowed,
|
isGatewaySigusr1RestartExternallyAllowed,
|
||||||
@@ -14,6 +15,7 @@ export async function runGatewayLoop(params: {
|
|||||||
start: () => Promise<Awaited<ReturnType<typeof startGatewayServer>>>;
|
start: () => Promise<Awaited<ReturnType<typeof startGatewayServer>>>;
|
||||||
runtime: typeof defaultRuntime;
|
runtime: typeof defaultRuntime;
|
||||||
}) {
|
}) {
|
||||||
|
const lock = await acquireGatewayLock();
|
||||||
let server: Awaited<ReturnType<typeof startGatewayServer>> | null = null;
|
let server: Awaited<ReturnType<typeof startGatewayServer>> | null = null;
|
||||||
let shuttingDown = false;
|
let shuttingDown = false;
|
||||||
let restartResolver: (() => void) | null = null;
|
let restartResolver: (() => void) | null = null;
|
||||||
@@ -96,6 +98,7 @@ export async function runGatewayLoop(params: {
|
|||||||
});
|
});
|
||||||
}
|
}
|
||||||
} finally {
|
} finally {
|
||||||
|
await lock?.release();
|
||||||
cleanupSignals();
|
cleanupSignals();
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
55
src/infra/gateway-lock.test.ts
Normal file
55
src/infra/gateway-lock.test.ts
Normal file
@@ -0,0 +1,55 @@
|
|||||||
|
import fs from "node:fs/promises";
|
||||||
|
import os from "node:os";
|
||||||
|
import path from "node:path";
|
||||||
|
|
||||||
|
import { describe, expect, it } from "vitest";
|
||||||
|
|
||||||
|
import { acquireGatewayLock, GatewayLockError } from "./gateway-lock.js";
|
||||||
|
|
||||||
|
/**
 * Build an isolated gateway environment for a single test.
 *
 * Creates a fresh temp directory containing an empty `clawdbot.json`, and
 * returns a copy of `process.env` whose `CLAWDBOT_STATE_DIR` and
 * `CLAWDBOT_CONFIG_PATH` point into it, plus a `cleanup` callback that
 * removes the directory again.
 */
async function makeEnv() {
  const tmpPrefix = path.join(os.tmpdir(), "clawdbot-gateway-lock-");
  const stateDir = await fs.mkdtemp(tmpPrefix);
  const configFile = path.join(stateDir, "clawdbot.json");
  await fs.writeFile(configFile, "{}", "utf8");

  const env = {
    ...process.env,
    CLAWDBOT_STATE_DIR: stateDir,
    CLAWDBOT_CONFIG_PATH: configFile,
  };

  // Recursive + force so cleanup succeeds even if lock files were left behind.
  async function cleanup() {
    await fs.rm(stateDir, { recursive: true, force: true });
  }

  return { env, cleanup };
}
|
||||||
|
|
||||||
|
describe("gateway lock", () => {
  it("blocks concurrent acquisition until release", async () => {
    const { env, cleanup } = await makeEnv();
    // Short timeout/poll so the contended attempt fails quickly.
    const lockOpts = { env, allowInTests: true, timeoutMs: 200, pollIntervalMs: 20 };

    try {
      const lock = await acquireGatewayLock(lockOpts);
      try {
        expect(lock).not.toBeNull();

        // While the first lock is held, a second acquisition must time out.
        await expect(acquireGatewayLock(lockOpts)).rejects.toBeInstanceOf(GatewayLockError);
      } finally {
        // Release even when an assertion fails so the lock file is not left behind.
        await lock?.release();
      }

      // After release the lock must be obtainable again.
      const lock2 = await acquireGatewayLock(lockOpts);
      try {
        expect(lock2).not.toBeNull();
      } finally {
        await lock2?.release();
      }
    } finally {
      // Always remove the temp state dir, including on test failure.
      await cleanup();
    }
  });
});
|
||||||
@@ -1,3 +1,33 @@
|
|||||||
|
import { createHash } from "node:crypto";
|
||||||
|
import fs from "node:fs/promises";
|
||||||
|
import path from "node:path";
|
||||||
|
|
||||||
|
import { resolveConfigPath, resolveStateDir } from "../config/paths.js";
|
||||||
|
|
||||||
|
// Fallback for `timeoutMs`: how long acquisition keeps retrying before giving up (ms).
const DEFAULT_TIMEOUT_MS = 5000;
|
||||||
|
// Fallback for `pollIntervalMs`: delay between acquisition attempts (ms).
const DEFAULT_POLL_INTERVAL_MS = 100;
|
||||||
|
// Fallback for `staleMs`: age after which a lock without a live owner is treated as stale (ms).
const DEFAULT_STALE_MS = 30_000;
|
||||||
|
|
||||||
|
/** JSON document stored inside the gateway lock file. */
type LockPayload = {
  /** PID of the process that created the lock. */
  pid: number;
  /** ISO-8601 timestamp recorded when the lock was created. */
  createdAt: string;
  /** Config path the lock was derived from. */
  configPath: string;
};
|
||||||
|
|
||||||
|
/** Handle returned to the holder of the gateway lock. */
export type GatewayLockHandle = {
  /** Absolute path of the lock file on disk. */
  lockPath: string;
  /** Config path the lock protects. */
  configPath: string;
  /** Closes the lock file handle and removes the lock file. */
  release: () => Promise<void>;
};
|
||||||
|
|
||||||
|
/** Tuning knobs for `acquireGatewayLock`; every field is optional. */
export type GatewayLockOptions = {
  /** Environment used to resolve state/config paths; defaults to `process.env`. */
  env?: NodeJS.ProcessEnv;
  /** Total time to keep retrying before failing, in milliseconds. */
  timeoutMs?: number;
  /** Delay between acquisition attempts, in milliseconds. */
  pollIntervalMs?: number;
  /** Age after which a lock with no live owner is considered stale, in milliseconds. */
  staleMs?: number;
  /** When `true`, locking stays active even under VITEST / NODE_ENV=test. */
  allowInTests?: boolean;
};
|
||||||
|
|
||||||
export class GatewayLockError extends Error {
|
export class GatewayLockError extends Error {
|
||||||
constructor(
|
constructor(
|
||||||
message: string,
|
message: string,
|
||||||
@@ -7,3 +37,117 @@ export class GatewayLockError extends Error {
|
|||||||
this.name = "GatewayLockError";
|
this.name = "GatewayLockError";
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
 * Report whether a process with the given PID currently exists.
 *
 * Uses the signal-0 probe (`process.kill(pid, 0)`), which performs the
 * existence/permission check without delivering a signal.
 *
 * @param pid Candidate process id; non-finite or non-positive values are
 *   rejected without probing.
 * @returns `true` when the process exists, including when it belongs to
 *   another user and the probe fails with `EPERM`; `false` otherwise.
 */
function isAlive(pid: number): boolean {
  if (!Number.isFinite(pid) || pid <= 0) return false;
  try {
    process.kill(pid, 0);
    return true;
  } catch (err) {
    // EPERM means the process exists but we may not signal it; treating that
    // as "dead" would let us delete a lock held by a live gateway.
    return (err as { code?: unknown } | null)?.code === "EPERM";
  }
}
|
||||||
|
|
||||||
|
/**
 * Read and validate the JSON payload stored in a lock file.
 *
 * @param lockPath Path of the lock file to read.
 * @returns The payload (only the known fields) when the file is readable,
 *   parses as JSON and carries a numeric `pid` plus string `createdAt` and
 *   `configPath`; `null` in every other case, including read/parse errors.
 */
async function readLockPayload(lockPath: string): Promise<LockPayload | null> {
  try {
    const text = await fs.readFile(lockPath, "utf8");
    // Destructuring a JSON `null` throws and is handled by the catch below.
    const { pid, createdAt, configPath } = JSON.parse(text) as Partial<LockPayload>;
    if (
      typeof pid !== "number" ||
      typeof createdAt !== "string" ||
      typeof configPath !== "string"
    ) {
      return null;
    }
    return { pid, createdAt, configPath };
  } catch {
    return null;
  }
}
|
||||||
|
|
||||||
|
/**
 * Work out where the gateway lock file lives for the given environment.
 *
 * The lock file is placed in the resolved state directory and named
 * `gateway.<hash>.lock`, where `<hash>` is the first 8 hex characters of the
 * SHA-1 of the resolved config path.
 *
 * @param env Environment used to resolve the state directory and config path.
 * @returns The lock file path together with the resolved config path.
 */
function resolveGatewayLockPath(env: NodeJS.ProcessEnv) {
  const stateDir = resolveStateDir(env);
  const configPath = resolveConfigPath(env, stateDir);

  const digest = createHash("sha1").update(configPath).digest("hex");
  const lockFileName = `gateway.${digest.slice(0, 8)}.lock`;

  return { lockPath: path.join(stateDir, lockFileName), configPath };
}
|
||||||
|
|
||||||
|
/**
 * Acquire the singleton gateway lock for the resolved config/state pair.
 *
 * The lock is a file created exclusively (`wx`) at the path computed by
 * `resolveGatewayLockPath`; it stores the owner's pid, creation time and
 * config path. While the file exists, acquisition polls until `timeoutMs`
 * elapses. A lock whose recorded owner is no longer alive is removed
 * immediately; a lock with no usable owner pid is removed once its
 * `createdAt` (or, failing that, the file mtime) is older than `staleMs`.
 *
 * At least one acquisition attempt is always made, even when `timeoutMs`
 * is zero or negative.
 *
 * @param opts Optional overrides for environment, timing and test bypass.
 * @returns A handle whose `release()` closes and deletes the lock file
 *   (safe to call more than once), or `null` when locking is bypassed
 *   (`CLAWDBOT_ALLOW_MULTI_GATEWAY=1`, or a test environment without
 *   `allowInTests`).
 * @throws GatewayLockError when the lock file cannot be created or written
 *   for a reason other than already existing, or when the timeout elapses
 *   while the lock is still held.
 */
export async function acquireGatewayLock(
  opts: GatewayLockOptions = {},
): Promise<GatewayLockHandle | null> {
  const env = opts.env ?? process.env;
  const allowInTests = opts.allowInTests === true;
  if (
    env.CLAWDBOT_ALLOW_MULTI_GATEWAY === "1" ||
    (!allowInTests && (env.VITEST || env.NODE_ENV === "test"))
  ) {
    return null;
  }

  const timeoutMs = opts.timeoutMs ?? DEFAULT_TIMEOUT_MS;
  const pollIntervalMs = opts.pollIntervalMs ?? DEFAULT_POLL_INTERVAL_MS;
  const staleMs = opts.staleMs ?? DEFAULT_STALE_MS;
  const { lockPath, configPath } = resolveGatewayLockPath(env);
  await fs.mkdir(path.dirname(lockPath), { recursive: true });

  const startedAt = Date.now();
  let lastPayload: LockPayload | null = null;

  // do/while so a zero or negative timeout still gets one real attempt.
  do {
    let opened: Awaited<ReturnType<typeof fs.open>> | null = null;
    try {
      // "wx" fails with EEXIST when the file already exists, which is what
      // makes creation of the lock file exclusive.
      opened = await fs.open(lockPath, "wx");
    } catch (err) {
      const code = (err as { code?: unknown }).code;
      if (code !== "EEXIST") {
        throw new GatewayLockError(`failed to acquire gateway lock at ${lockPath}`, err);
      }
    }

    if (opened) {
      const handle = opened;
      const payload: LockPayload = {
        pid: process.pid,
        createdAt: new Date().toISOString(),
        configPath,
      };
      try {
        await handle.writeFile(JSON.stringify(payload), "utf8");
      } catch (err) {
        // Do not leak the descriptor or leave an empty lock file behind: it
        // would block other starters until it ages past the stale threshold.
        await handle.close().catch(() => undefined);
        await fs.rm(lockPath, { force: true }).catch(() => undefined);
        throw new GatewayLockError(`failed to acquire gateway lock at ${lockPath}`, err);
      }

      let released = false;
      return {
        lockPath,
        configPath,
        release: async () => {
          // Idempotent: a second call must not delete a lock file that
          // another process may have created after the first release.
          if (released) return;
          released = true;
          await handle.close().catch(() => undefined);
          await fs.rm(lockPath, { force: true });
        },
      };
    }

    // The lock file already exists: inspect its owner.
    lastPayload = await readLockPayload(lockPath);
    const ownerPid = lastPayload?.pid;
    const ownerAlive = ownerPid ? isAlive(ownerPid) : false;

    if (!ownerAlive && ownerPid) {
      // Recorded owner is gone; reclaim the lock right away.
      await fs.rm(lockPath, { force: true });
      continue;
    }

    if (!ownerAlive) {
      // No usable owner pid (e.g. payload unreadable or still being written):
      // fall back to age-based staleness.
      let stale = false;
      if (lastPayload?.createdAt) {
        const createdAt = Date.parse(lastPayload.createdAt);
        stale = Number.isFinite(createdAt) ? Date.now() - createdAt > staleMs : false;
      }
      if (!stale) {
        try {
          const st = await fs.stat(lockPath);
          stale = Date.now() - st.mtimeMs > staleMs;
        } catch {
          // The file could not be stat'ed (e.g. it vanished); retry immediately.
          stale = true;
        }
      }
      if (stale) {
        await fs.rm(lockPath, { force: true });
        continue;
      }
    }

    await new Promise<void>((resolve) => setTimeout(resolve, pollIntervalMs));
  } while (Date.now() - startedAt < timeoutMs);

  const owner = lastPayload?.pid ? ` (pid ${lastPayload.pid})` : "";
  throw new GatewayLockError(`gateway already running${owner}; lock timeout after ${timeoutMs}ms`);
}
|
||||||
|
|||||||
@@ -45,6 +45,7 @@ async function main() {
|
|||||||
{ startGatewayServer },
|
{ startGatewayServer },
|
||||||
{ setGatewayWsLogStyle },
|
{ setGatewayWsLogStyle },
|
||||||
{ setVerbose },
|
{ setVerbose },
|
||||||
|
{ acquireGatewayLock, GatewayLockError },
|
||||||
{ consumeGatewaySigusr1RestartAuthorization, isGatewaySigusr1RestartExternallyAllowed },
|
{ consumeGatewaySigusr1RestartAuthorization, isGatewaySigusr1RestartExternallyAllowed },
|
||||||
{ defaultRuntime },
|
{ defaultRuntime },
|
||||||
{ enableConsoleCapture, setConsoleTimestampPrefix },
|
{ enableConsoleCapture, setConsoleTimestampPrefix },
|
||||||
@@ -53,6 +54,7 @@ async function main() {
|
|||||||
import("../gateway/server.js"),
|
import("../gateway/server.js"),
|
||||||
import("../gateway/ws-logging.js"),
|
import("../gateway/ws-logging.js"),
|
||||||
import("../globals.js"),
|
import("../globals.js"),
|
||||||
|
import("../infra/gateway-lock.js"),
|
||||||
import("../infra/restart.js"),
|
import("../infra/restart.js"),
|
||||||
import("../runtime.js"),
|
import("../runtime.js"),
|
||||||
import("../logging.js"),
|
import("../logging.js"),
|
||||||
@@ -103,6 +105,7 @@ async function main() {
|
|||||||
if (token) process.env.CLAWDBOT_GATEWAY_TOKEN = token;
|
if (token) process.env.CLAWDBOT_GATEWAY_TOKEN = token;
|
||||||
|
|
||||||
let server: Awaited<ReturnType<typeof startGatewayServer>> | null = null;
|
let server: Awaited<ReturnType<typeof startGatewayServer>> | null = null;
|
||||||
|
let lock: Awaited<ReturnType<typeof acquireGatewayLock>> | null = null;
|
||||||
let shuttingDown = false;
|
let shuttingDown = false;
|
||||||
let forceExitTimer: ReturnType<typeof setTimeout> | null = null;
|
let forceExitTimer: ReturnType<typeof setTimeout> | null = null;
|
||||||
let restartResolver: (() => void) | null = null;
|
let restartResolver: (() => void) | null = null;
|
||||||
@@ -177,6 +180,15 @@ async function main() {
|
|||||||
process.on("SIGUSR1", onSigusr1);
|
process.on("SIGUSR1", onSigusr1);
|
||||||
|
|
||||||
try {
|
try {
|
||||||
|
try {
|
||||||
|
lock = await acquireGatewayLock();
|
||||||
|
} catch (err) {
|
||||||
|
if (err instanceof GatewayLockError) {
|
||||||
|
defaultRuntime.error(`Gateway start blocked: ${err.message}`);
|
||||||
|
process.exit(1);
|
||||||
|
}
|
||||||
|
throw err;
|
||||||
|
}
|
||||||
// eslint-disable-next-line no-constant-condition
|
// eslint-disable-next-line no-constant-condition
|
||||||
while (true) {
|
while (true) {
|
||||||
try {
|
try {
|
||||||
@@ -191,6 +203,7 @@ async function main() {
|
|||||||
});
|
});
|
||||||
}
|
}
|
||||||
} finally {
|
} finally {
|
||||||
|
await lock?.release();
|
||||||
cleanupSignals();
|
cleanupSignals();
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
Reference in New Issue
Block a user