infra: use flock gateway lock

This commit is contained in:
Peter Steinberger
2025-12-10 00:46:50 +00:00
parent b1834b7cf8
commit 426503e062
5 changed files with 75 additions and 65 deletions

2
.npmrc
View File

@@ -1 +1 @@
allow-build-scripts=@whiskeysockets/baileys,sharp,esbuild,protobufjs allow-build-scripts=@whiskeysockets/baileys,sharp,esbuild,protobufjs,fs-ext

28
docs/gateway-lock.md Normal file
View File

@@ -0,0 +1,28 @@
---
summary: "Gateway lock strategy using POSIX flock and PID file"
read_when:
- Running or debugging the gateway process
- Investigating single-instance enforcement
---
# Gateway lock
Last updated: 2025-12-10
## Why
- Ensure only one gateway instance runs per host.
- Survive crashes/SIGKILL without leaving a blocking stale lock.
- Keep the PID visible for observability and manual debugging.
## Mechanism
- Uses a single lock file (default `${os.tmpdir()}/clawdis-gateway.lock`, e.g. `/var/folders/.../clawdis-gateway.lock` on macOS) opened once per process.
- An exclusive, non-blocking POSIX `flock` is taken on the file descriptor. The kernel releases the lock automatically on any process exit, including crashes and SIGKILL.
- The PID is written into the same file after locking; the lock (not file existence) is the source of truth.
- On graceful shutdown, we best-effort unlock, close, and unlink the file to avoid leaving a stale file on disk, but correctness does not rely on cleanup.
## Error surface
- If another instance holds the lock, startup throws `GatewayLockError("another gateway instance is already running")`.
- Unexpected `flock` failures surface as `GatewayLockError("failed to acquire gateway lock: …")`.
## Operational notes
- The lock file may remain on disk after abnormal exits; this is expected and harmless because the kernel lock is gone.
- If you need to inspect, `cat` the lock file at its resolved path — `"$TMPDIR/clawdis-gateway.lock"` on macOS, `/tmp/clawdis-gateway.lock` on Linux — to see the last PID. Do not delete the file while a process is running—you would only remove the convenience marker, not the lock itself.

View File

@@ -45,6 +45,7 @@
"detect-libc": "^2.1.2", "detect-libc": "^2.1.2",
"dotenv": "^17.2.3", "dotenv": "^17.2.3",
"express": "^5.2.1", "express": "^5.2.1",
"fs-ext": "^2.1.1",
"grammy": "^1.38.4", "grammy": "^1.38.4",
"json5": "^2.2.3", "json5": "^2.2.3",
"qrcode-terminal": "^0.12.0", "qrcode-terminal": "^0.12.0",
@@ -59,6 +60,7 @@
"@mariozechner/mini-lit": "0.2.1", "@mariozechner/mini-lit": "0.2.1",
"@types/body-parser": "^1.19.6", "@types/body-parser": "^1.19.6",
"@types/express": "^5.0.6", "@types/express": "^5.0.6",
"@types/fs-ext": "^2.0.3",
"@types/node": "^24.10.2", "@types/node": "^24.10.2",
"@types/qrcode-terminal": "^0.12.2", "@types/qrcode-terminal": "^0.12.2",
"@types/ws": "^8.18.1", "@types/ws": "^8.18.1",

View File

@@ -667,7 +667,9 @@ export async function startGatewayServer(port = 18789): Promise<GatewayServer> {
if (cached && now - cached.ts < HEALTH_REFRESH_INTERVAL_MS) { if (cached && now - cached.ts < HEALTH_REFRESH_INTERVAL_MS) {
respond(true, cached, undefined, { cached: true }); respond(true, cached, undefined, { cached: true });
void refreshHealthSnapshot({ probe: true }).catch((err) => void refreshHealthSnapshot({ probe: true }).catch((err) =>
logError(`background health refresh failed: ${formatError(err)}`), logError(
`background health refresh failed: ${formatError(err)}`,
),
); );
break; break;
} }

View File

@@ -1,100 +1,78 @@
import fs from "node:fs"; import fs from "node:fs";
import net from "node:net";
import os from "node:os"; import os from "node:os";
import path from "node:path"; import path from "node:path";
import { flockSync } from "fs-ext";
const DEFAULT_LOCK_PATH = path.join(os.tmpdir(), "clawdis-gateway.lock"); const DEFAULT_LOCK_PATH = path.join(os.tmpdir(), "clawdis-gateway.lock");
export class GatewayLockError extends Error {} export class GatewayLockError extends Error {}
type ReleaseFn = () => Promise<void>; type ReleaseFn = () => Promise<void>;
const SIGNALS: NodeJS.Signals[] = ["SIGINT", "SIGTERM", "SIGHUP"];
/** /**
* Acquire an exclusive single-instance lock for the gateway using a Unix domain socket. * Acquire an exclusive gateway lock using POSIX flock and write the PID into the same file.
* *
* Why a socket? If the process crashes or is SIGKILLed, the socket file remains but * Kernel locks are released automatically when the process exits or is SIGKILLed, so the
* the next start will detect ECONNREFUSED when connecting and clean the stale path * lock cannot become stale. A best-effort unlink on shutdown keeps the path clean, but
* before retrying. This keeps the lock self-healing without manual pidfile cleanup. * correctness relies solely on the kernel lock.
*/ */
export async function acquireGatewayLock( export async function acquireGatewayLock(
lockPath = DEFAULT_LOCK_PATH, lockPath = DEFAULT_LOCK_PATH,
): Promise<ReleaseFn> { ): Promise<ReleaseFn> {
// Fast path: try to listen on the lock path. fs.mkdirSync(path.dirname(lockPath), { recursive: true });
const attemptListen = (): Promise<net.Server> =>
new Promise((resolve, reject) => {
const server = net.createServer();
server.once("error", async (err: NodeJS.ErrnoException) => { const fd = fs.openSync(lockPath, "w+");
if (err.code !== "EADDRINUSE") { try {
reject(new GatewayLockError(`lock listen failed: ${err.message}`)); flockSync(fd, "exnb");
return; } catch (err) {
} fs.closeSync(fd);
const code = (err as NodeJS.ErrnoException).code;
if (code === "EWOULDBLOCK" || code === "EAGAIN") {
throw new GatewayLockError("another gateway instance is already running");
}
throw new GatewayLockError(
`failed to acquire gateway lock: ${(err as Error).message}`,
);
}
// Something is already bound. Try to connect to see if it is alive. fs.ftruncateSync(fd, 0);
const client = net.connect({ path: lockPath }); fs.writeSync(fd, `${process.pid}\n`, 0, "utf8");
fs.fsyncSync(fd);
client.once("connect", () => {
client.destroy();
reject(
new GatewayLockError("another gateway instance is already running"),
);
});
client.once("error", (connErr: NodeJS.ErrnoException) => {
// Nothing is listening -> stale socket file. Remove and retry once.
if (connErr.code === "ECONNREFUSED" || connErr.code === "ENOENT") {
try {
fs.rmSync(lockPath, { force: true });
} catch (rmErr) {
reject(
new GatewayLockError(
`failed to clean stale lock at ${lockPath}: ${String(rmErr)}`,
),
);
return;
}
attemptListen().then(resolve, reject);
return;
}
reject(
new GatewayLockError(
`failed to connect to existing lock (${lockPath}): ${connErr.message}`,
),
);
});
});
server.listen(lockPath, () => resolve(server));
});
const server = await attemptListen();
let released = false; let released = false;
const release = async (): Promise<void> => { const release = async (): Promise<void> => {
if (released) return; if (released) return;
released = true; released = true;
await new Promise<void>((resolve) => server.close(() => resolve())); try {
flockSync(fd, "un");
} catch {
/* ignore unlock errors */
}
try {
fs.closeSync(fd);
} catch {
/* ignore close errors */
}
try { try {
fs.rmSync(lockPath, { force: true }); fs.rmSync(lockPath, { force: true });
} catch { } catch {
/* ignore */ /* ignore unlink errors */
} }
}; };
const cleanupSignals: NodeJS.Signals[] = ["SIGINT", "SIGTERM", "SIGHUP"]; const handleSignal = () => {
const handleSignal = async () => { void release();
await release();
process.exit(0); process.exit(0);
}; };
for (const sig of cleanupSignals) { for (const sig of SIGNALS) {
process.once(sig, () => { process.once(sig, handleSignal);
void handleSignal();
});
} }
process.once("exit", () => { process.once("exit", () => {
// Exit handler must be sync-safe; release is async but close+rm are fast.
void release(); void release();
}); });