fix: improve app restart and gateway logs

This commit is contained in:
Peter Steinberger
2025-12-09 18:36:49 +00:00
parent 0d4bf1c15a
commit 2adb14c320
6 changed files with 610 additions and 3 deletions

View File

@@ -0,0 +1,34 @@
import fs from "node:fs";
import os from "node:os";
import path from "node:path";
import { describe, expect, it } from "vitest";
import { acquireGatewayLock, GatewayLockError } from "./gateway-lock.js";
/**
 * Build a throwaway lock-socket path inside the OS temp directory.
 * The file name embeds the current pid plus a random hex suffix so separate
 * calls do not collide.
 */
const newLockPath = () => {
  const randomSuffix = Math.random().toString(16).slice(2);
  const fileName = `clawdis-gateway-lock-test-${process.pid}-${randomSuffix}.sock`;
  return path.join(os.tmpdir(), fileName);
};
// Exercises the full lifecycle of the lock: acquire, contend, release, reacquire.
describe("gateway-lock", () => {
  it("prevents concurrent gateway instances and releases cleanly", async () => {
    const socketPath = newLockPath();
    const socketExists = (): boolean => fs.existsSync(socketPath);

    // First holder wins and the socket file shows up on disk.
    const releaseFirst = await acquireGatewayLock(socketPath);
    expect(socketExists()).toBe(true);

    // A second attempt on the same path must be refused while the lock is held.
    const contender = acquireGatewayLock(socketPath);
    await expect(contender).rejects.toBeInstanceOf(GatewayLockError);

    // Releasing removes the socket file.
    await releaseFirst();
    expect(socketExists()).toBe(false);

    // Once released, the same path can be locked again and cleaned up again.
    const releaseSecond = await acquireGatewayLock(socketPath);
    await releaseSecond();
    expect(socketExists()).toBe(false);
  });
});

102
src/infra/gateway-lock.ts Normal file
View File

@@ -0,0 +1,102 @@
import fs from "node:fs";
import net from "node:net";
import os from "node:os";
import path from "node:path";
/** Lock socket path used by acquireGatewayLock when the caller passes none; located in the OS temp directory. */
const DEFAULT_LOCK_PATH = path.join(os.tmpdir(), "clawdis-gateway.lock");
/**
 * Error raised by acquireGatewayLock when the lock cannot be taken: another
 * instance is running, the socket cannot be bound or probed, or a stale lock
 * file cannot be removed.
 */
export class GatewayLockError extends Error {
  constructor(message?: string) {
    super(message);
    // Without an explicit name, logs and stack traces show a generic "Error".
    this.name = "GatewayLockError";
  }
}
/** Async callback returned by acquireGatewayLock; calling it releases the held lock. */
type ReleaseFn = () => Promise<void>;
/**
 * Acquire an exclusive single-instance lock for the gateway using a Unix domain socket.
 *
 * Why a socket? If the process crashes or is SIGKILLed, the socket file remains but
 * the next start will detect ECONNREFUSED when connecting and clean the stale path
 * before retrying. This keeps the lock self-healing without manual pidfile cleanup.
 *
 * While the lock is held, SIGINT/SIGTERM/SIGHUP release it and then exit the process
 * with code 0, and an `exit` listener cleans up synchronously. These process listeners
 * are detached again once the lock has been released, so repeated acquire/release
 * cycles do not accumulate handlers.
 *
 * @param lockPath - Path of the lock socket; defaults to `DEFAULT_LOCK_PATH`.
 * @returns A release function. It may be called more than once; every call resolves
 *   after the server has closed and the socket file has been removed.
 * @throws GatewayLockError if another instance answers on the socket, if listening or
 *   probing fails for any other reason, or if a stale socket cannot be cleaned up.
 */
export async function acquireGatewayLock(
  lockPath = DEFAULT_LOCK_PATH,
): Promise<ReleaseFn> {
  // Upper bound on stale-file cleanups so a path that keeps reappearing cannot loop forever.
  const maxStaleCleanups = 2;

  // Bind a fresh server to lockPath; settles with the listening server or the raw bind error.
  const listenOnce = (): Promise<net.Server> =>
    new Promise((resolve, reject) => {
      const candidate = net.createServer();
      candidate.once("error", reject);
      candidate.listen(lockPath, () => resolve(candidate));
    });

  // Connect to the existing socket to tell a live owner (true) from a stale file (false).
  const ownerIsAlive = (): Promise<boolean> =>
    new Promise((resolve, reject) => {
      const probe = net.connect({ path: lockPath });
      probe.once("connect", () => {
        probe.destroy();
        resolve(true);
      });
      probe.once("error", (connErr: NodeJS.ErrnoException) => {
        // Nothing is listening -> stale socket file.
        if (connErr.code === "ECONNREFUSED" || connErr.code === "ENOENT") {
          resolve(false);
          return;
        }
        reject(
          new GatewayLockError(
            `failed to connect to existing lock (${lockPath}): ${connErr.message}`,
          ),
        );
      });
    });

  // Listen, and on EADDRINUSE probe the owner; remove a stale file and try again (bounded).
  const bindLock = async (): Promise<net.Server> => {
    for (let cleanups = 0; ; cleanups += 1) {
      try {
        return await listenOnce();
      } catch (err: unknown) {
        const listenErr = err as NodeJS.ErrnoException;
        if (listenErr.code !== "EADDRINUSE") {
          throw new GatewayLockError(`lock listen failed: ${listenErr.message}`);
        }
      }
      // Something is already bound. Try to connect to see if it is alive.
      if (await ownerIsAlive()) {
        throw new GatewayLockError("another gateway instance is already running");
      }
      if (cleanups >= maxStaleCleanups) {
        throw new GatewayLockError(
          `failed to acquire lock at ${lockPath}: path still in use after stale cleanup`,
        );
      }
      try {
        fs.rmSync(lockPath, { force: true });
      } catch (rmErr) {
        throw new GatewayLockError(
          `failed to clean stale lock at ${lockPath}: ${String(rmErr)}`,
        );
      }
    }
  };

  const server = await bindLock();
  const cleanupSignals: NodeJS.Signals[] = ["SIGINT", "SIGTERM", "SIGHUP"];
  let releasing: Promise<void> | undefined;

  // Best-effort unlink; a failed removal must never break shutdown.
  function removeLockFile(): void {
    try {
      fs.rmSync(lockPath, { force: true });
    } catch {
      /* ignore */
    }
  }

  // Remove every process listener this lock installed.
  function detachProcessHooks(): void {
    for (const sig of cleanupSignals) {
      process.removeListener(sig, onSignal);
    }
    process.removeListener("exit", onExit);
  }

  // Close the server and remove the socket file. The in-flight promise is shared so
  // concurrent or repeated calls all wait for the same cleanup.
  function release(): Promise<void> {
    if (!releasing) {
      releasing = (async () => {
        await new Promise<void>((resolve) => server.close(() => resolve()));
        removeLockFile();
        detachProcessHooks();
      })();
    }
    return releasing;
  }

  // Signal path: release the lock, then exit cleanly.
  function onSignal(): void {
    void release().then(() => process.exit(0));
  }

  // Exit path: only synchronous work runs here, so do not wait for the close callback.
  function onExit(): void {
    server.close();
    removeLockFile();
  }

  for (const sig of cleanupSignals) {
    process.once(sig, onSignal);
  }
  process.once("exit", onExit);

  return release;
}