infra: use flock gateway lock
This commit is contained in:
2
.npmrc
2
.npmrc
@@ -1 +1 @@
|
|||||||
allow-build-scripts=@whiskeysockets/baileys,sharp,esbuild,protobufjs
|
allow-build-scripts=@whiskeysockets/baileys,sharp,esbuild,protobufjs,fs-ext
|
||||||
|
|||||||
28
docs/gateway-lock.md
Normal file
28
docs/gateway-lock.md
Normal file
@@ -0,0 +1,28 @@
|
|||||||
|
---
|
||||||
|
summary: "Gateway lock strategy using POSIX flock and PID file"
|
||||||
|
read_when:
|
||||||
|
- Running or debugging the gateway process
|
||||||
|
- Investigating single-instance enforcement
|
||||||
|
---
|
||||||
|
# Gateway lock
|
||||||
|
|
||||||
|
Last updated: 2025-12-10
|
||||||
|
|
||||||
|
## Why
|
||||||
|
- Ensure only one gateway instance runs per host.
|
||||||
|
- Survive crashes/SIGKILL without leaving a blocking stale lock.
|
||||||
|
- Keep the PID visible for observability and manual debugging.
|
||||||
|
|
||||||
|
## Mechanism
|
||||||
|
- Uses a single lock file (default `${os.tmpdir()}/clawdis-gateway.lock`, e.g. `/var/folders/.../clawdis-gateway.lock` on macOS) opened once per process.
|
||||||
|
- An exclusive, non-blocking POSIX `flock` is taken on the file descriptor. The kernel releases the lock automatically on any process exit, including crashes and SIGKILL.
|
||||||
|
- The PID is written into the same file after locking; the lock (not file existence) is the source of truth.
|
||||||
|
- On graceful shutdown, we best-effort unlock, close, and unlink the file to reduce crumbs, but correctness does not rely on cleanup.
|
||||||
|
|
||||||
|
## Error surface
|
||||||
|
- If another instance holds the lock, startup throws `GatewayLockError("another gateway instance is already running")`.
|
||||||
|
- Unexpected `flock` failures surface as `GatewayLockError("failed to acquire gateway lock: …")`.
|
||||||
|
|
||||||
|
## Operational notes
|
||||||
|
- The lock file may remain on disk after abnormal exits; this is expected and harmless because the kernel lock is gone.
|
||||||
|
- If you need to inspect, `cat /tmp/clawdis-gateway.lock` shows the last PID. Do not delete the file while a process is running—you would only remove the convenience marker, not the lock itself.
|
||||||
@@ -45,6 +45,7 @@
|
|||||||
"detect-libc": "^2.1.2",
|
"detect-libc": "^2.1.2",
|
||||||
"dotenv": "^17.2.3",
|
"dotenv": "^17.2.3",
|
||||||
"express": "^5.2.1",
|
"express": "^5.2.1",
|
||||||
|
"fs-ext": "^2.1.1",
|
||||||
"grammy": "^1.38.4",
|
"grammy": "^1.38.4",
|
||||||
"json5": "^2.2.3",
|
"json5": "^2.2.3",
|
||||||
"qrcode-terminal": "^0.12.0",
|
"qrcode-terminal": "^0.12.0",
|
||||||
@@ -59,6 +60,7 @@
|
|||||||
"@mariozechner/mini-lit": "0.2.1",
|
"@mariozechner/mini-lit": "0.2.1",
|
||||||
"@types/body-parser": "^1.19.6",
|
"@types/body-parser": "^1.19.6",
|
||||||
"@types/express": "^5.0.6",
|
"@types/express": "^5.0.6",
|
||||||
|
"@types/fs-ext": "^2.0.3",
|
||||||
"@types/node": "^24.10.2",
|
"@types/node": "^24.10.2",
|
||||||
"@types/qrcode-terminal": "^0.12.2",
|
"@types/qrcode-terminal": "^0.12.2",
|
||||||
"@types/ws": "^8.18.1",
|
"@types/ws": "^8.18.1",
|
||||||
|
|||||||
@@ -667,7 +667,9 @@ export async function startGatewayServer(port = 18789): Promise<GatewayServer> {
|
|||||||
if (cached && now - cached.ts < HEALTH_REFRESH_INTERVAL_MS) {
|
if (cached && now - cached.ts < HEALTH_REFRESH_INTERVAL_MS) {
|
||||||
respond(true, cached, undefined, { cached: true });
|
respond(true, cached, undefined, { cached: true });
|
||||||
void refreshHealthSnapshot({ probe: true }).catch((err) =>
|
void refreshHealthSnapshot({ probe: true }).catch((err) =>
|
||||||
logError(`background health refresh failed: ${formatError(err)}`),
|
logError(
|
||||||
|
`background health refresh failed: ${formatError(err)}`,
|
||||||
|
),
|
||||||
);
|
);
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -1,100 +1,78 @@
|
|||||||
import fs from "node:fs";
|
import fs from "node:fs";
|
||||||
import net from "node:net";
|
|
||||||
import os from "node:os";
|
import os from "node:os";
|
||||||
import path from "node:path";
|
import path from "node:path";
|
||||||
|
|
||||||
|
import { flockSync } from "fs-ext";
|
||||||
|
|
||||||
const DEFAULT_LOCK_PATH = path.join(os.tmpdir(), "clawdis-gateway.lock");
|
const DEFAULT_LOCK_PATH = path.join(os.tmpdir(), "clawdis-gateway.lock");
|
||||||
|
|
||||||
export class GatewayLockError extends Error {}
|
export class GatewayLockError extends Error {}
|
||||||
|
|
||||||
type ReleaseFn = () => Promise<void>;
|
type ReleaseFn = () => Promise<void>;
|
||||||
|
|
||||||
|
const SIGNALS: NodeJS.Signals[] = ["SIGINT", "SIGTERM", "SIGHUP"];
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Acquire an exclusive single-instance lock for the gateway using a Unix domain socket.
|
* Acquire an exclusive gateway lock using POSIX flock and write the PID into the same file.
|
||||||
*
|
*
|
||||||
* Why a socket? If the process crashes or is SIGKILLed, the socket file remains but
|
* Kernel locks are released automatically when the process exits or is SIGKILLed, so the
|
||||||
* the next start will detect ECONNREFUSED when connecting and clean the stale path
|
* lock cannot become stale. A best-effort unlink on shutdown keeps the path clean, but
|
||||||
* before retrying. This keeps the lock self-healing without manual pidfile cleanup.
|
* correctness relies solely on the kernel lock.
|
||||||
*/
|
*/
|
||||||
export async function acquireGatewayLock(
|
export async function acquireGatewayLock(
|
||||||
lockPath = DEFAULT_LOCK_PATH,
|
lockPath = DEFAULT_LOCK_PATH,
|
||||||
): Promise<ReleaseFn> {
|
): Promise<ReleaseFn> {
|
||||||
// Fast path: try to listen on the lock path.
|
fs.mkdirSync(path.dirname(lockPath), { recursive: true });
|
||||||
const attemptListen = (): Promise<net.Server> =>
|
|
||||||
new Promise((resolve, reject) => {
|
|
||||||
const server = net.createServer();
|
|
||||||
|
|
||||||
server.once("error", async (err: NodeJS.ErrnoException) => {
|
const fd = fs.openSync(lockPath, "w+");
|
||||||
if (err.code !== "EADDRINUSE") {
|
|
||||||
reject(new GatewayLockError(`lock listen failed: ${err.message}`));
|
|
||||||
return;
|
|
||||||
}
|
|
||||||
|
|
||||||
// Something is already bound. Try to connect to see if it is alive.
|
|
||||||
const client = net.connect({ path: lockPath });
|
|
||||||
|
|
||||||
client.once("connect", () => {
|
|
||||||
client.destroy();
|
|
||||||
reject(
|
|
||||||
new GatewayLockError("another gateway instance is already running"),
|
|
||||||
);
|
|
||||||
});
|
|
||||||
|
|
||||||
client.once("error", (connErr: NodeJS.ErrnoException) => {
|
|
||||||
// Nothing is listening -> stale socket file. Remove and retry once.
|
|
||||||
if (connErr.code === "ECONNREFUSED" || connErr.code === "ENOENT") {
|
|
||||||
try {
|
try {
|
||||||
fs.rmSync(lockPath, { force: true });
|
flockSync(fd, "exnb");
|
||||||
} catch (rmErr) {
|
} catch (err) {
|
||||||
reject(
|
fs.closeSync(fd);
|
||||||
new GatewayLockError(
|
const code = (err as NodeJS.ErrnoException).code;
|
||||||
`failed to clean stale lock at ${lockPath}: ${String(rmErr)}`,
|
if (code === "EWOULDBLOCK" || code === "EAGAIN") {
|
||||||
),
|
throw new GatewayLockError("another gateway instance is already running");
|
||||||
);
|
|
||||||
return;
|
|
||||||
}
|
}
|
||||||
attemptListen().then(resolve, reject);
|
throw new GatewayLockError(
|
||||||
return;
|
`failed to acquire gateway lock: ${(err as Error).message}`,
|
||||||
|
);
|
||||||
}
|
}
|
||||||
|
|
||||||
reject(
|
fs.ftruncateSync(fd, 0);
|
||||||
new GatewayLockError(
|
fs.writeSync(fd, `${process.pid}\n`, 0, "utf8");
|
||||||
`failed to connect to existing lock (${lockPath}): ${connErr.message}`,
|
fs.fsyncSync(fd);
|
||||||
),
|
|
||||||
);
|
|
||||||
});
|
|
||||||
});
|
|
||||||
|
|
||||||
server.listen(lockPath, () => resolve(server));
|
|
||||||
});
|
|
||||||
|
|
||||||
const server = await attemptListen();
|
|
||||||
|
|
||||||
let released = false;
|
let released = false;
|
||||||
const release = async (): Promise<void> => {
|
const release = async (): Promise<void> => {
|
||||||
if (released) return;
|
if (released) return;
|
||||||
released = true;
|
released = true;
|
||||||
await new Promise<void>((resolve) => server.close(() => resolve()));
|
try {
|
||||||
|
flockSync(fd, "un");
|
||||||
|
} catch {
|
||||||
|
/* ignore unlock errors */
|
||||||
|
}
|
||||||
|
try {
|
||||||
|
fs.closeSync(fd);
|
||||||
|
} catch {
|
||||||
|
/* ignore close errors */
|
||||||
|
}
|
||||||
try {
|
try {
|
||||||
fs.rmSync(lockPath, { force: true });
|
fs.rmSync(lockPath, { force: true });
|
||||||
} catch {
|
} catch {
|
||||||
/* ignore */
|
/* ignore unlink errors */
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
const cleanupSignals: NodeJS.Signals[] = ["SIGINT", "SIGTERM", "SIGHUP"];
|
const handleSignal = () => {
|
||||||
const handleSignal = async () => {
|
void release();
|
||||||
await release();
|
|
||||||
process.exit(0);
|
process.exit(0);
|
||||||
};
|
};
|
||||||
|
|
||||||
for (const sig of cleanupSignals) {
|
for (const sig of SIGNALS) {
|
||||||
process.once(sig, () => {
|
process.once(sig, handleSignal);
|
||||||
void handleSignal();
|
|
||||||
});
|
|
||||||
}
|
}
|
||||||
|
|
||||||
process.once("exit", () => {
|
process.once("exit", () => {
|
||||||
// Exit handler must be sync-safe; release is async but close+rm are fast.
|
|
||||||
void release();
|
void release();
|
||||||
});
|
});
|
||||||
|
|
||||||
|
|||||||
Reference in New Issue
Block a user