fix: harden gateway lock validation (#1572) (thanks @steipete)

This commit is contained in:
Peter Steinberger
2026-01-24 08:14:30 +00:00
parent 90685ef814
commit 3fff943ba1
5 changed files with 221 additions and 28 deletions

View File

@@ -1,10 +1,13 @@
import { createHash } from "node:crypto";
import fsSync from "node:fs";
import fs from "node:fs/promises";
import os from "node:os";
import path from "node:path";
import { describe, expect, it } from "vitest";
import { describe, expect, it, vi } from "vitest";
import { acquireGatewayLock, GatewayLockError } from "./gateway-lock.js";
import { resolveConfigPath, resolveStateDir } from "../config/paths.js";
async function makeEnv() {
const dir = await fs.mkdtemp(path.join(os.tmpdir(), "clawdbot-gateway-lock-"));
@@ -22,6 +25,41 @@ async function makeEnv() {
};
}
function resolveLockPath(env: NodeJS.ProcessEnv) {
const stateDir = resolveStateDir(env);
const configPath = resolveConfigPath(env, stateDir);
const hash = createHash("sha1").update(configPath).digest("hex").slice(0, 8);
return { lockPath: path.join(stateDir, `gateway.${hash}.lock`), configPath };
}
function makeProcStat(pid: number, startTime: number) {
const fields = [
"R",
"1",
"1",
"1",
"1",
"1",
"1",
"1",
"1",
"1",
"1",
"1",
"1",
"1",
"1",
"1",
"1",
"1",
"1",
String(startTime),
"1",
"1",
];
return `${pid} (node) ${fields.join(" ")}`;
}
describe("gateway lock", () => {
it("blocks concurrent acquisition until release", async () => {
const { env, cleanup } = await makeEnv();
@@ -52,4 +90,98 @@ describe("gateway lock", () => {
await lock2?.release();
await cleanup();
});
it("treats recycled linux pid as stale when start time mismatches", async () => {
const { env, cleanup } = await makeEnv();
const { lockPath, configPath } = resolveLockPath(env);
const payload = {
pid: process.pid,
createdAt: new Date().toISOString(),
configPath,
startTime: 111,
};
await fs.writeFile(lockPath, JSON.stringify(payload), "utf8");
const readFileSync = fsSync.readFileSync;
const statValue = makeProcStat(process.pid, 222);
const spy = vi.spyOn(fsSync, "readFileSync").mockImplementation((filePath, encoding) => {
if (filePath === `/proc/${process.pid}/stat`) {
return statValue;
}
return readFileSync(filePath as never, encoding as never) as never;
});
const lock = await acquireGatewayLock({
env,
allowInTests: true,
timeoutMs: 200,
pollIntervalMs: 20,
platform: "linux",
});
expect(lock).not.toBeNull();
await lock?.release();
spy.mockRestore();
await cleanup();
});
it("keeps lock on linux when proc access fails unless stale", async () => {
const { env, cleanup } = await makeEnv();
const { lockPath, configPath } = resolveLockPath(env);
const payload = {
pid: process.pid,
createdAt: new Date().toISOString(),
configPath,
startTime: 111,
};
await fs.writeFile(lockPath, JSON.stringify(payload), "utf8");
const readFileSync = fsSync.readFileSync;
const spy = vi.spyOn(fsSync, "readFileSync").mockImplementation((filePath, encoding) => {
if (filePath === `/proc/${process.pid}/stat`) {
throw new Error("EACCES");
}
return readFileSync(filePath as never, encoding as never) as never;
});
await expect(
acquireGatewayLock({
env,
allowInTests: true,
timeoutMs: 120,
pollIntervalMs: 20,
staleMs: 10_000,
platform: "linux",
}),
).rejects.toBeInstanceOf(GatewayLockError);
spy.mockRestore();
const stalePayload = {
...payload,
createdAt: new Date(0).toISOString(),
};
await fs.writeFile(lockPath, JSON.stringify(stalePayload), "utf8");
const staleSpy = vi.spyOn(fsSync, "readFileSync").mockImplementation((filePath, encoding) => {
if (filePath === `/proc/${process.pid}/stat`) {
throw new Error("EACCES");
}
return readFileSync(filePath as never, encoding as never) as never;
});
const lock = await acquireGatewayLock({
env,
allowInTests: true,
timeoutMs: 200,
pollIntervalMs: 20,
staleMs: 1,
platform: "linux",
});
expect(lock).not.toBeNull();
await lock?.release();
staleSpy.mockRestore();
await cleanup();
});
});

View File

@@ -13,6 +13,7 @@ type LockPayload = {
pid: number;
createdAt: string;
configPath: string;
startTime?: number;
};
export type GatewayLockHandle = {
@@ -27,6 +28,7 @@ export type GatewayLockOptions = {
pollIntervalMs?: number;
staleMs?: number;
allowInTests?: boolean;
platform?: NodeJS.Platform;
};
export class GatewayLockError extends Error {
@@ -39,6 +41,8 @@ export class GatewayLockError extends Error {
}
}
type LockOwnerStatus = "alive" | "dead" | "unknown";
function isAlive(pid: number): boolean {
if (!Number.isFinite(pid) || pid <= 0) return false;
try {
@@ -49,32 +53,78 @@ function isAlive(pid: number): boolean {
}
}
/**
* Check if a PID is actually a clawdbot gateway process.
* This handles PID recycling in containers where a different process
* might have the same PID after a restart.
*/
function isGatewayProcess(pid: number): boolean {
if (!isAlive(pid)) return false;
function normalizeProcArg(arg: string): string {
return arg.replaceAll("\\", "/").toLowerCase();
}
// On Linux, check /proc/PID/cmdline to verify it's actually clawdbot
if (process.platform === "linux") {
try {
const cmdline = fsSync.readFileSync(`/proc/${pid}/cmdline`, "utf8");
// cmdline uses null bytes as separators
const args = cmdline.split("\0").join(" ").toLowerCase();
// Check if this is actually a clawdbot gateway process
return args.includes("clawdbot") || args.includes("gateway");
} catch {
// Can't read cmdline - process might have exited or we lack permissions
// Fall back to assuming it's not our process (safer in containers)
return false;
}
function parseProcCmdline(raw: string): string[] {
return raw
.split("\0")
.map((entry) => entry.trim())
.filter(Boolean);
}
function isGatewayArgv(args: string[]): boolean {
const normalized = args.map(normalizeProcArg);
if (!normalized.includes("gateway")) return false;
const entryCandidates = [
"dist/index.js",
"dist/index.mjs",
"dist/entry.js",
"dist/entry.mjs",
"scripts/run-node.mjs",
"src/index.ts",
];
if (normalized.some((arg) => entryCandidates.some((entry) => arg.endsWith(entry)))) {
return true;
}
// On non-Linux (macOS, Windows), trust the PID check
// PID recycling is less of an issue outside containers
return true;
const exe = normalized[0] ?? "";
return exe.endsWith("/clawdbot") || exe === "clawdbot";
}
function readLinuxCmdline(pid: number): string[] | null {
try {
const raw = fsSync.readFileSync(`/proc/${pid}/cmdline`, "utf8");
return parseProcCmdline(raw);
} catch {
return null;
}
}
function readLinuxStartTime(pid: number): number | null {
try {
const raw = fsSync.readFileSync(`/proc/${pid}/stat`, "utf8").trim();
const closeParen = raw.lastIndexOf(")");
if (closeParen < 0) return null;
const rest = raw.slice(closeParen + 1).trim();
const fields = rest.split(/\s+/);
const startTime = Number.parseInt(fields[19] ?? "", 10);
return Number.isFinite(startTime) ? startTime : null;
} catch {
return null;
}
}
function resolveGatewayOwnerStatus(
pid: number,
payload: LockPayload | null,
platform: NodeJS.Platform,
): LockOwnerStatus {
if (!isAlive(pid)) return "dead";
if (platform !== "linux") return "alive";
const payloadStartTime = payload?.startTime;
if (Number.isFinite(payloadStartTime)) {
const currentStartTime = readLinuxStartTime(pid);
if (currentStartTime == null) return "unknown";
return currentStartTime === payloadStartTime ? "alive" : "dead";
}
const args = readLinuxCmdline(pid);
if (!args) return "unknown";
return isGatewayArgv(args) ? "alive" : "dead";
}
async function readLockPayload(lockPath: string): Promise<LockPayload | null> {
@@ -84,10 +134,12 @@ async function readLockPayload(lockPath: string): Promise<LockPayload | null> {
if (typeof parsed.pid !== "number") return null;
if (typeof parsed.createdAt !== "string") return null;
if (typeof parsed.configPath !== "string") return null;
const startTime = typeof parsed.startTime === "number" ? parsed.startTime : undefined;
return {
pid: parsed.pid,
createdAt: parsed.createdAt,
configPath: parsed.configPath,
startTime,
};
} catch {
return null;
@@ -117,6 +169,7 @@ export async function acquireGatewayLock(
const timeoutMs = opts.timeoutMs ?? DEFAULT_TIMEOUT_MS;
const pollIntervalMs = opts.pollIntervalMs ?? DEFAULT_POLL_INTERVAL_MS;
const staleMs = opts.staleMs ?? DEFAULT_STALE_MS;
const platform = opts.platform ?? process.platform;
const { lockPath, configPath } = resolveGatewayLockPath(env);
await fs.mkdir(path.dirname(lockPath), { recursive: true });
@@ -126,11 +179,15 @@ export async function acquireGatewayLock(
while (Date.now() - startedAt < timeoutMs) {
try {
const handle = await fs.open(lockPath, "wx");
const startTime = platform === "linux" ? readLinuxStartTime(process.pid) : null;
const payload: LockPayload = {
pid: process.pid,
createdAt: new Date().toISOString(),
configPath,
};
if (typeof startTime === "number" && Number.isFinite(startTime)) {
payload.startTime = startTime;
}
await handle.writeFile(JSON.stringify(payload), "utf8");
return {
lockPath,
@@ -148,13 +205,14 @@ export async function acquireGatewayLock(
lastPayload = await readLockPayload(lockPath);
const ownerPid = lastPayload?.pid;
// Use isGatewayProcess to handle PID recycling in containers
const ownerAlive = ownerPid ? isGatewayProcess(ownerPid) : false;
if (!ownerAlive && ownerPid) {
const ownerStatus = ownerPid
? resolveGatewayOwnerStatus(ownerPid, lastPayload, platform)
: "unknown";
if (ownerStatus === "dead" && ownerPid) {
await fs.rm(lockPath, { force: true });
continue;
}
if (!ownerAlive) {
if (ownerStatus !== "alive") {
let stale = false;
if (lastPayload?.createdAt) {
const createdAt = Date.parse(lastPayload.createdAt);