fix: handle PID recycling in container gateway lock
In containers, PIDs can be recycled quickly after restarts. When a container restarts, a different process might get the same PID as the previous gateway, causing the lock check to incorrectly think the old gateway is still running. This fix adds isGatewayProcess() which verifies on Linux that the PID actually belongs to a clawdbot gateway by checking /proc/PID/cmdline. If the cmdline doesn't contain 'clawdbot' or 'gateway', we assume the lock is stale. Fixes gateway boot-loop in Docker/Fly.io deployments.
This commit is contained in:
@@ -1,5 +1,6 @@
|
||||
import { createHash } from "node:crypto";
|
||||
import fs from "node:fs/promises";
|
||||
import fsSync from "node:fs";
|
||||
import path from "node:path";
|
||||
|
||||
import { resolveConfigPath, resolveStateDir } from "../config/paths.js";
|
||||
@@ -48,6 +49,34 @@ function isAlive(pid: number): boolean {
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Check if a PID is actually a clawdbot gateway process.
|
||||
* This handles PID recycling in containers where a different process
|
||||
* might have the same PID after a restart.
|
||||
*/
|
||||
function isGatewayProcess(pid: number): boolean {
|
||||
if (!isAlive(pid)) return false;
|
||||
|
||||
// On Linux, check /proc/PID/cmdline to verify it's actually clawdbot
|
||||
if (process.platform === "linux") {
|
||||
try {
|
||||
const cmdline = fsSync.readFileSync(`/proc/${pid}/cmdline`, "utf8");
|
||||
// cmdline uses null bytes as separators
|
||||
const args = cmdline.split("\0").join(" ").toLowerCase();
|
||||
// Check if this is actually a clawdbot gateway process
|
||||
return args.includes("clawdbot") || args.includes("gateway");
|
||||
} catch {
|
||||
// Can't read cmdline - process might have exited or we lack permissions
|
||||
// Fall back to assuming it's not our process (safer in containers)
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
// On non-Linux (macOS, Windows), trust the PID check
|
||||
// PID recycling is less of an issue outside containers
|
||||
return true;
|
||||
}
|
||||
|
||||
async function readLockPayload(lockPath: string): Promise<LockPayload | null> {
|
||||
try {
|
||||
const raw = await fs.readFile(lockPath, "utf8");
|
||||
@@ -119,7 +148,8 @@ export async function acquireGatewayLock(
|
||||
|
||||
lastPayload = await readLockPayload(lockPath);
|
||||
const ownerPid = lastPayload?.pid;
|
||||
const ownerAlive = ownerPid ? isAlive(ownerPid) : false;
|
||||
// Use isGatewayProcess to handle PID recycling in containers
|
||||
const ownerAlive = ownerPid ? isGatewayProcess(ownerPid) : false;
|
||||
if (!ownerAlive && ownerPid) {
|
||||
await fs.rm(lockPath, { force: true });
|
||||
continue;
|
||||
|
||||
Reference in New Issue
Block a user