fix(gateway): enforce singleton lock

This commit is contained in:
Peter Steinberger
2026-01-21 22:46:19 +00:00
parent 529372f762
commit 05a254746e
5 changed files with 216 additions and 0 deletions

View File

@@ -20,6 +20,7 @@ Docs: https://docs.clawd.bot
- Nodes tool: include agent/node/gateway context in tool failure logs to speed approval debugging.
- macOS: exec approvals now respect wildcard agent allowlists (`*`).
- macOS: allow SSH agent auth when no identity file is set. (#1384) Thanks @ameno-.
- Gateway: prevent multiple gateways from sharing the same config/state at once (singleton lock).
- UI: remove the chat stop button and keep the composer aligned to the bottom edge.
- Typing: start instant typing indicators at run start so DMs and mentions show immediately.
- Configure: restrict the model allowlist picker to OAuth-compatible Anthropic models and preselect Opus 4.5.

View File

@@ -1,4 +1,5 @@
import type { startGatewayServer } from "../../gateway/server.js";
import { acquireGatewayLock } from "../../infra/gateway-lock.js";
import {
consumeGatewaySigusr1RestartAuthorization,
isGatewaySigusr1RestartExternallyAllowed,
@@ -14,6 +15,7 @@ export async function runGatewayLoop(params: {
start: () => Promise<Awaited<ReturnType<typeof startGatewayServer>>>;
runtime: typeof defaultRuntime;
}) {
const lock = await acquireGatewayLock();
let server: Awaited<ReturnType<typeof startGatewayServer>> | null = null;
let shuttingDown = false;
let restartResolver: (() => void) | null = null;
@@ -96,6 +98,7 @@ export async function runGatewayLoop(params: {
});
}
} finally {
await lock?.release();
cleanupSignals();
}
}

View File

@@ -0,0 +1,55 @@
import fs from "node:fs/promises";
import os from "node:os";
import path from "node:path";
import { describe, expect, it } from "vitest";
import { acquireGatewayLock, GatewayLockError } from "./gateway-lock.js";
async function makeEnv() {
const dir = await fs.mkdtemp(path.join(os.tmpdir(), "clawdbot-gateway-lock-"));
const configPath = path.join(dir, "clawdbot.json");
await fs.writeFile(configPath, "{}", "utf8");
return {
env: {
...process.env,
CLAWDBOT_STATE_DIR: dir,
CLAWDBOT_CONFIG_PATH: configPath,
},
cleanup: async () => {
await fs.rm(dir, { recursive: true, force: true });
},
};
}
describe("gateway lock", () => {
it("blocks concurrent acquisition until release", async () => {
const { env, cleanup } = await makeEnv();
const lock = await acquireGatewayLock({
env,
allowInTests: true,
timeoutMs: 200,
pollIntervalMs: 20,
});
expect(lock).not.toBeNull();
await expect(
acquireGatewayLock({
env,
allowInTests: true,
timeoutMs: 200,
pollIntervalMs: 20,
}),
).rejects.toBeInstanceOf(GatewayLockError);
await lock?.release();
const lock2 = await acquireGatewayLock({
env,
allowInTests: true,
timeoutMs: 200,
pollIntervalMs: 20,
});
await lock2?.release();
await cleanup();
});
});

View File

@@ -1,3 +1,33 @@
import { createHash } from "node:crypto";
import fs from "node:fs/promises";
import path from "node:path";
import { resolveConfigPath, resolveStateDir } from "../config/paths.js";
const DEFAULT_TIMEOUT_MS = 5000;
const DEFAULT_POLL_INTERVAL_MS = 100;
const DEFAULT_STALE_MS = 30_000;
type LockPayload = {
pid: number;
createdAt: string;
configPath: string;
};
export type GatewayLockHandle = {
lockPath: string;
configPath: string;
release: () => Promise<void>;
};
export type GatewayLockOptions = {
env?: NodeJS.ProcessEnv;
timeoutMs?: number;
pollIntervalMs?: number;
staleMs?: number;
allowInTests?: boolean;
};
export class GatewayLockError extends Error {
constructor(
message: string,
@@ -7,3 +37,117 @@ export class GatewayLockError extends Error {
this.name = "GatewayLockError";
}
}
function isAlive(pid: number): boolean {
if (!Number.isFinite(pid) || pid <= 0) return false;
try {
process.kill(pid, 0);
return true;
} catch {
return false;
}
}
async function readLockPayload(lockPath: string): Promise<LockPayload | null> {
try {
const raw = await fs.readFile(lockPath, "utf8");
const parsed = JSON.parse(raw) as Partial<LockPayload>;
if (typeof parsed.pid !== "number") return null;
if (typeof parsed.createdAt !== "string") return null;
if (typeof parsed.configPath !== "string") return null;
return {
pid: parsed.pid,
createdAt: parsed.createdAt,
configPath: parsed.configPath,
};
} catch {
return null;
}
}
function resolveGatewayLockPath(env: NodeJS.ProcessEnv) {
const stateDir = resolveStateDir(env);
const configPath = resolveConfigPath(env, stateDir);
const hash = createHash("sha1").update(configPath).digest("hex").slice(0, 8);
const lockPath = path.join(stateDir, `gateway.${hash}.lock`);
return { lockPath, configPath };
}
export async function acquireGatewayLock(
opts: GatewayLockOptions = {},
): Promise<GatewayLockHandle | null> {
const env = opts.env ?? process.env;
const allowInTests = opts.allowInTests === true;
if (
env.CLAWDBOT_ALLOW_MULTI_GATEWAY === "1" ||
(!allowInTests && (env.VITEST || env.NODE_ENV === "test"))
) {
return null;
}
const timeoutMs = opts.timeoutMs ?? DEFAULT_TIMEOUT_MS;
const pollIntervalMs = opts.pollIntervalMs ?? DEFAULT_POLL_INTERVAL_MS;
const staleMs = opts.staleMs ?? DEFAULT_STALE_MS;
const { lockPath, configPath } = resolveGatewayLockPath(env);
await fs.mkdir(path.dirname(lockPath), { recursive: true });
const startedAt = Date.now();
let lastPayload: LockPayload | null = null;
while (Date.now() - startedAt < timeoutMs) {
try {
const handle = await fs.open(lockPath, "wx");
const payload: LockPayload = {
pid: process.pid,
createdAt: new Date().toISOString(),
configPath,
};
await handle.writeFile(JSON.stringify(payload), "utf8");
return {
lockPath,
configPath,
release: async () => {
await handle.close().catch(() => undefined);
await fs.rm(lockPath, { force: true });
},
};
} catch (err) {
const code = (err as { code?: unknown }).code;
if (code !== "EEXIST") {
throw new GatewayLockError(`failed to acquire gateway lock at ${lockPath}`, err);
}
lastPayload = await readLockPayload(lockPath);
const ownerPid = lastPayload?.pid;
const ownerAlive = ownerPid ? isAlive(ownerPid) : false;
if (!ownerAlive && ownerPid) {
await fs.rm(lockPath, { force: true });
continue;
}
if (!ownerAlive) {
let stale = false;
if (lastPayload?.createdAt) {
const createdAt = Date.parse(lastPayload.createdAt);
stale = Number.isFinite(createdAt) ? Date.now() - createdAt > staleMs : false;
}
if (!stale) {
try {
const st = await fs.stat(lockPath);
stale = Date.now() - st.mtimeMs > staleMs;
} catch {
stale = true;
}
}
if (stale) {
await fs.rm(lockPath, { force: true });
continue;
}
}
await new Promise((r) => setTimeout(r, pollIntervalMs));
}
}
const owner = lastPayload?.pid ? ` (pid ${lastPayload.pid})` : "";
throw new GatewayLockError(`gateway already running${owner}; lock timeout after ${timeoutMs}ms`);
}

View File

@@ -45,6 +45,7 @@ async function main() {
{ startGatewayServer },
{ setGatewayWsLogStyle },
{ setVerbose },
{ acquireGatewayLock, GatewayLockError },
{ consumeGatewaySigusr1RestartAuthorization, isGatewaySigusr1RestartExternallyAllowed },
{ defaultRuntime },
{ enableConsoleCapture, setConsoleTimestampPrefix },
@@ -53,6 +54,7 @@ async function main() {
import("../gateway/server.js"),
import("../gateway/ws-logging.js"),
import("../globals.js"),
import("../infra/gateway-lock.js"),
import("../infra/restart.js"),
import("../runtime.js"),
import("../logging.js"),
@@ -103,6 +105,7 @@ async function main() {
if (token) process.env.CLAWDBOT_GATEWAY_TOKEN = token;
let server: Awaited<ReturnType<typeof startGatewayServer>> | null = null;
let lock: Awaited<ReturnType<typeof acquireGatewayLock>> | null = null;
let shuttingDown = false;
let forceExitTimer: ReturnType<typeof setTimeout> | null = null;
let restartResolver: (() => void) | null = null;
@@ -177,6 +180,15 @@ async function main() {
process.on("SIGUSR1", onSigusr1);
try {
try {
lock = await acquireGatewayLock();
} catch (err) {
if (err instanceof GatewayLockError) {
defaultRuntime.error(`Gateway start blocked: ${err.message}`);
process.exit(1);
}
throw err;
}
// eslint-disable-next-line no-constant-condition
while (true) {
try {
@@ -191,6 +203,7 @@ async function main() {
});
}
} finally {
await lock?.release();
cleanupSignals();
}
}