diff --git a/CHANGELOG.md b/CHANGELOG.md index a0c945eea..c833d1e1b 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -21,6 +21,7 @@ Docs: https://docs.clawd.bot - TTS: move Telegram TTS into core with auto-replies, commands, and gateway methods. (#1559) Thanks @Glucksberg. ### Fixes +- Gateway: compare Linux process start time to avoid PID recycling lock loops; keep locks unless stale. (#1572) Thanks @steipete. - Skills: gate bird Homebrew install to macOS. (#1569) Thanks @bradleypriest. - Agents: ignore IDENTITY.md template placeholders when parsing identity to avoid placeholder replies. (#1556) - Docker: update gateway command in docker-compose and Hetzner guide. (#1514) diff --git a/docs/docs.json b/docs/docs.json index ea097b430..2e48322ad 100644 --- a/docs/docs.json +++ b/docs/docs.json @@ -1045,6 +1045,7 @@ "platforms/android", "platforms/windows", "platforms/linux", + "platforms/fly", "platforms/hetzner", "platforms/exe-dev" ] diff --git a/docs/platforms/index.md b/docs/platforms/index.md index d646b2026..bc721db8e 100644 --- a/docs/platforms/index.md +++ b/docs/platforms/index.md @@ -23,6 +23,7 @@ Native companion apps for Windows are also planned; the Gateway is recommended v ## VPS & hosting +- Fly.io: [Fly.io](/platforms/fly) - Hetzner (Docker): [Hetzner](/platforms/hetzner) - exe.dev (VM + HTTPS proxy): [exe.dev](/platforms/exe-dev) diff --git a/src/infra/gateway-lock.test.ts b/src/infra/gateway-lock.test.ts index 1e4e90cac..04b402b27 100644 --- a/src/infra/gateway-lock.test.ts +++ b/src/infra/gateway-lock.test.ts @@ -1,10 +1,13 @@ +import { createHash } from "node:crypto"; +import fsSync from "node:fs"; import fs from "node:fs/promises"; import os from "node:os"; import path from "node:path"; -import { describe, expect, it } from "vitest"; +import { describe, expect, it, vi } from "vitest"; import { acquireGatewayLock, GatewayLockError } from "./gateway-lock.js"; +import { resolveConfigPath, resolveStateDir } from "../config/paths.js"; async function makeEnv() { const dir = await fs.mkdtemp(path.join(os.tmpdir(), "clawdbot-gateway-lock-")); @@ -22,6 +25,41 @@ async function makeEnv() { }; } +function resolveLockPath(env: NodeJS.ProcessEnv) { + const stateDir = resolveStateDir(env); + const configPath = resolveConfigPath(env, stateDir); + const hash = createHash("sha1").update(configPath).digest("hex").slice(0, 8); + return { lockPath: path.join(stateDir, `gateway.${hash}.lock`), configPath }; +} + +function makeProcStat(pid: number, startTime: number) { + const fields = [ + "R", + "1", + "1", + "1", + "1", + "1", + "1", + "1", + "1", + "1", + "1", + "1", + "1", + "1", + "1", + "1", + "1", + "1", + "1", + String(startTime), + "1", + "1", + ]; + return `${pid} (node) ${fields.join(" ")}`; +} + describe("gateway lock", () => { it("blocks concurrent acquisition until release", async () => { const { env, cleanup } = await makeEnv(); @@ -52,4 +90,98 @@ describe("gateway lock", () => { await lock2?.release(); await cleanup(); }); + + it("treats recycled linux pid as stale when start time mismatches", async () => { + const { env, cleanup } = await makeEnv(); + const { lockPath, configPath } = resolveLockPath(env); + const payload = { + pid: process.pid, + createdAt: new Date().toISOString(), + configPath, + startTime: 111, + }; + await fs.writeFile(lockPath, JSON.stringify(payload), "utf8"); + + const readFileSync = fsSync.readFileSync; + const statValue = makeProcStat(process.pid, 222); + const spy = vi.spyOn(fsSync, "readFileSync").mockImplementation((filePath, encoding) => { + if (filePath === `/proc/${process.pid}/stat`) { + return statValue; + } + return readFileSync(filePath as never, encoding as never) as never; + }); + + const lock = await acquireGatewayLock({ + env, + allowInTests: true, + timeoutMs: 200, + pollIntervalMs: 20, + platform: "linux", + }); + expect(lock).not.toBeNull(); + + await lock?.release(); + spy.mockRestore(); + await cleanup(); + }); + + it("keeps lock on linux when proc access fails unless stale", async () => { + const { env, cleanup } = await makeEnv(); + const { lockPath, configPath } = resolveLockPath(env); + const payload = { + pid: process.pid, + createdAt: new Date().toISOString(), + configPath, + startTime: 111, + }; + await fs.writeFile(lockPath, JSON.stringify(payload), "utf8"); + + const readFileSync = fsSync.readFileSync; + const spy = vi.spyOn(fsSync, "readFileSync").mockImplementation((filePath, encoding) => { + if (filePath === `/proc/${process.pid}/stat`) { + throw new Error("EACCES"); + } + return readFileSync(filePath as never, encoding as never) as never; + }); + + await expect( + acquireGatewayLock({ + env, + allowInTests: true, + timeoutMs: 120, + pollIntervalMs: 20, + staleMs: 10_000, + platform: "linux", + }), + ).rejects.toBeInstanceOf(GatewayLockError); + + spy.mockRestore(); + + const stalePayload = { + ...payload, + createdAt: new Date(0).toISOString(), + }; + await fs.writeFile(lockPath, JSON.stringify(stalePayload), "utf8"); + + const staleSpy = vi.spyOn(fsSync, "readFileSync").mockImplementation((filePath, encoding) => { + if (filePath === `/proc/${process.pid}/stat`) { + throw new Error("EACCES"); + } + return readFileSync(filePath as never, encoding as never) as never; + }); + + const lock = await acquireGatewayLock({ + env, + allowInTests: true, + timeoutMs: 200, + pollIntervalMs: 20, + staleMs: 1, + platform: "linux", + }); + expect(lock).not.toBeNull(); + + await lock?.release(); + staleSpy.mockRestore(); + await cleanup(); + }); }); diff --git a/src/infra/gateway-lock.ts b/src/infra/gateway-lock.ts index 4aad93acd..8a84c5e2f 100644 --- a/src/infra/gateway-lock.ts +++ b/src/infra/gateway-lock.ts @@ -13,6 +13,7 @@ type LockPayload = { pid: number; createdAt: string; configPath: string; + startTime?: number; }; export type GatewayLockHandle = { @@ -27,6 +28,7 @@ export type GatewayLockOptions = { pollIntervalMs?: number; staleMs?: number; allowInTests?: boolean; + platform?: NodeJS.Platform; }; export class GatewayLockError extends Error { @@ -39,6 +41,8 @@ export class GatewayLockError extends Error { } } +type LockOwnerStatus = "alive" | "dead" | "unknown"; + function isAlive(pid: number): boolean { if (!Number.isFinite(pid) || pid <= 0) return false; try { @@ -49,32 +53,78 @@ function isAlive(pid: number): boolean { } } -/** - * Check if a PID is actually a clawdbot gateway process. - * This handles PID recycling in containers where a different process - * might have the same PID after a restart. - */ -function isGatewayProcess(pid: number): boolean { - if (!isAlive(pid)) return false; +function normalizeProcArg(arg: string): string { + return arg.replaceAll("\\", "/").toLowerCase(); +} - // On Linux, check /proc/PID/cmdline to verify it's actually clawdbot - if (process.platform === "linux") { - try { - const cmdline = fsSync.readFileSync(`/proc/${pid}/cmdline`, "utf8"); - // cmdline uses null bytes as separators - const args = cmdline.split("\0").join(" ").toLowerCase(); - // Check if this is actually a clawdbot gateway process - return args.includes("clawdbot") || args.includes("gateway"); - } catch { - // Can't read cmdline - process might have exited or we lack permissions - // Fall back to assuming it's not our process (safer in containers) - return false; - } +function parseProcCmdline(raw: string): string[] { + return raw + .split("\0") + .map((entry) => entry.trim()) + .filter(Boolean); +} + +function isGatewayArgv(args: string[]): boolean { + const normalized = args.map(normalizeProcArg); + if (!normalized.includes("gateway")) return false; + + const entryCandidates = [ + "dist/index.js", + "dist/index.mjs", + "dist/entry.js", + "dist/entry.mjs", + "scripts/run-node.mjs", + "src/index.ts", + ]; + if (normalized.some((arg) => entryCandidates.some((entry) => arg.endsWith(entry)))) { + return true; } - // On non-Linux (macOS, Windows), trust the PID check - // PID recycling is less of an issue outside containers - return true; + const exe = normalized[0] ?? ""; + return exe.endsWith("/clawdbot") || exe === "clawdbot"; +} + +function readLinuxCmdline(pid: number): string[] | null { + try { + const raw = fsSync.readFileSync(`/proc/${pid}/cmdline`, "utf8"); + return parseProcCmdline(raw); + } catch { + return null; + } +} + +function readLinuxStartTime(pid: number): number | null { + try { + const raw = fsSync.readFileSync(`/proc/${pid}/stat`, "utf8").trim(); + const closeParen = raw.lastIndexOf(")"); + if (closeParen < 0) return null; + const rest = raw.slice(closeParen + 1).trim(); + const fields = rest.split(/\s+/); + const startTime = Number.parseInt(fields[19] ?? "", 10); + return Number.isFinite(startTime) ? startTime : null; + } catch { + return null; + } +} + +function resolveGatewayOwnerStatus( + pid: number, + payload: LockPayload | null, + platform: NodeJS.Platform, +): LockOwnerStatus { + if (!isAlive(pid)) return "dead"; + if (platform !== "linux") return "alive"; + + const payloadStartTime = payload?.startTime; + if (Number.isFinite(payloadStartTime)) { + const currentStartTime = readLinuxStartTime(pid); + if (currentStartTime == null) return "unknown"; + return currentStartTime === payloadStartTime ? "alive" : "dead"; + } + + const args = readLinuxCmdline(pid); + if (!args) return "unknown"; + return isGatewayArgv(args) ? "alive" : "dead"; } async function readLockPayload(lockPath: string): Promise { @@ -84,10 +134,12 @@ async function readLockPayload(lockPath: string): Promise { if (typeof parsed.pid !== "number") return null; if (typeof parsed.createdAt !== "string") return null; if (typeof parsed.configPath !== "string") return null; + const startTime = typeof parsed.startTime === "number" ? parsed.startTime : undefined; return { pid: parsed.pid, createdAt: parsed.createdAt, configPath: parsed.configPath, + startTime, }; } catch { return null; @@ -117,6 +169,7 @@ export async function acquireGatewayLock( const timeoutMs = opts.timeoutMs ?? DEFAULT_TIMEOUT_MS; const pollIntervalMs = opts.pollIntervalMs ?? DEFAULT_POLL_INTERVAL_MS; const staleMs = opts.staleMs ?? DEFAULT_STALE_MS; + const platform = opts.platform ?? process.platform; const { lockPath, configPath } = resolveGatewayLockPath(env); await fs.mkdir(path.dirname(lockPath), { recursive: true }); @@ -126,11 +179,15 @@ export async function acquireGatewayLock( while (Date.now() - startedAt < timeoutMs) { try { const handle = await fs.open(lockPath, "wx"); + const startTime = platform === "linux" ? readLinuxStartTime(process.pid) : null; const payload: LockPayload = { pid: process.pid, createdAt: new Date().toISOString(), configPath, }; + if (typeof startTime === "number" && Number.isFinite(startTime)) { + payload.startTime = startTime; + } await handle.writeFile(JSON.stringify(payload), "utf8"); return { lockPath, @@ -148,13 +205,14 @@ export async function acquireGatewayLock( lastPayload = await readLockPayload(lockPath); const ownerPid = lastPayload?.pid; - // Use isGatewayProcess to handle PID recycling in containers - const ownerAlive = ownerPid ? isGatewayProcess(ownerPid) : false; - if (!ownerAlive && ownerPid) { + const ownerStatus = ownerPid + ? resolveGatewayOwnerStatus(ownerPid, lastPayload, platform) + : "unknown"; + if (ownerStatus === "dead" && ownerPid) { await fs.rm(lockPath, { force: true }); continue; } - if (!ownerAlive) { + if (ownerStatus !== "alive") { let stale = false; if (lastPayload?.createdAt) { const createdAt = Date.parse(lastPayload.createdAt);