fix: handle PID recycling in container gateway lock

In containers, PIDs can be recycled quickly after restarts. When a container
restarts, a different process might get the same PID as the previous gateway,
causing the lock check to incorrectly think the old gateway is still running.

This fix adds isGatewayProcess(), which on Linux verifies that the PID actually
belongs to a clawdbot gateway by reading /proc/PID/cmdline. If the cmdline
contains neither 'clawdbot' nor 'gateway' (or cannot be read), we assume the
lock is stale. On other platforms the existing liveness check is used as-is.

Fixes gateway boot-loop in Docker/Fly.io deployments.
This commit is contained in:
Peter Steinberger
2026-01-24 07:46:14 +00:00
parent 90ae2f541c
commit dea96a2c3d

View File

@@ -1,5 +1,6 @@
import { createHash } from "node:crypto";
import fs from "node:fs/promises";
import fsSync from "node:fs";
import path from "node:path";
import { resolveConfigPath, resolveStateDir } from "../config/paths.js";
@@ -48,6 +49,34 @@ function isAlive(pid: number): boolean {
}
}
/**
 * Decide whether `pid` refers to a live process that looks like a clawdbot
 * gateway, so that a recycled PID (e.g. after a container restart) is not
 * mistaken for the previous lock owner.
 *
 * @param pid - Process id recorded as the lock owner.
 * @returns `false` if the process is not alive. On Linux, `true` only when the
 *   process command line (read from `/proc/<pid>/cmdline`) contains
 *   "clawdbot" or "gateway", compared case-insensitively; `false` if that
 *   file cannot be read. On every other platform, `true` for any live PID.
 */
function isGatewayProcess(pid: number): boolean {
  // A PID that is not alive can never be the gateway.
  if (!isAlive(pid)) {
    return false;
  }

  // Outside Linux there is no /proc inspection here; liveness alone is trusted.
  if (process.platform !== "linux") {
    return true;
  }

  let rawCmdline: string;
  try {
    rawCmdline = fsSync.readFileSync(`/proc/${pid}/cmdline`, "utf8");
  } catch {
    // Unreadable cmdline (process exited, or insufficient permissions):
    // treat it as "not our process" so the caller considers the lock stale.
    return false;
  }

  // Arguments in cmdline are separated by null bytes; flatten them into one
  // lowercase string so the marker search is case-insensitive.
  const flattened = rawCmdline.split("\0").join(" ").toLowerCase();
  return ["clawdbot", "gateway"].some((marker) => flattened.includes(marker));
}
async function readLockPayload(lockPath: string): Promise<LockPayload | null> {
try {
const raw = await fs.readFile(lockPath, "utf8");
@@ -119,7 +148,8 @@ export async function acquireGatewayLock(
lastPayload = await readLockPayload(lockPath);
const ownerPid = lastPayload?.pid;
const ownerAlive = ownerPid ? isAlive(ownerPid) : false;
// Use isGatewayProcess to handle PID recycling in containers
const ownerAlive = ownerPid ? isGatewayProcess(ownerPid) : false;
if (!ownerAlive && ownerPid) {
await fs.rm(lockPath, { force: true });
continue;