diff --git a/docs/gateway-lock.md b/docs/gateway-lock.md index 97ee6d061..4bf619354 100644 --- a/docs/gateway-lock.md +++ b/docs/gateway-lock.md @@ -1,28 +1,28 @@ --- -summary: "Gateway lock strategy using POSIX flock and PID file" +summary: "Gateway singleton guard using the WebSocket listener bind" read_when: - Running or debugging the gateway process - Investigating single-instance enforcement --- # Gateway lock -Last updated: 2025-12-10 +Last updated: 2025-12-11 ## Why - Ensure only one gateway instance runs per host. -- Survive crashes/SIGKILL without leaving a blocking stale lock. -- Keep the PID visible for observability and manual debugging. +- Survive crashes/SIGKILL without leaving stale lock files. +- Fail fast with a clear error when the control port is already occupied. ## Mechanism -- Uses a single lock file (default `${os.tmpdir()}/clawdis-gateway.lock`, e.g. `/var/folders/.../clawdis-gateway.lock` on macOS) opened once per process. -- An exclusive, non-blocking POSIX `flock` is taken on the file descriptor. The kernel releases the lock automatically on any process exit, including crashes and SIGKILL. -- The PID is written into the same file after locking; the lock (not file existence) is the source of truth. -- On graceful shutdown, we best-effort unlock, close, and unlink the file to reduce crumbs, but correctness does not rely on cleanup. +- The gateway binds the WebSocket listener (default `ws://127.0.0.1:18789`) immediately on startup using an exclusive TCP listener. +- If the bind fails with `EADDRINUSE`, startup throws `GatewayLockError("another gateway instance is already listening on ws://127.0.0.1:")`. +- The OS releases the listener automatically on any process exit, including crashes and SIGKILL—no separate lock file or cleanup step is needed. +- On shutdown the gateway closes the WebSocket server and underlying HTTP server to free the port promptly. ## Error surface -- If another instance holds the lock, startup throws `GatewayLockError("another gateway instance is already running")`. -- Unexpected `flock` failures surface as `GatewayLockError("failed to acquire gateway lock: …")`. +- If another process holds the port, startup throws `GatewayLockError("another gateway instance is already listening on ws://127.0.0.1:")`. +- Other bind failures surface as `GatewayLockError("failed to bind gateway socket on ws://127.0.0.1:: …")`. ## Operational notes -- The lock file may remain on disk after abnormal exits; this is expected and harmless because the kernel lock is gone. -- If you need to inspect, `cat /tmp/clawdis-gateway.lock` shows the last PID. Do not delete the file while a process is running—you would only remove the convenience marker, not the lock itself. +- If the port is occupied by *another* process, the error is the same; free the port or choose another with `clawdis gateway --port `. +- The macOS app still maintains its own lightweight PID guard before spawning the gateway; the runtime lock is enforced by the WebSocket bind. diff --git a/docs/test.md b/docs/test.md index 6893a208e..c1f85b536 100644 --- a/docs/test.md +++ b/docs/test.md @@ -1,4 +1,4 @@ ## Tests -- `pnpm test:force`: Kills any lingering gateway process holding the default lock/port, removes stale lock files, runs the full Vitest suite with an isolated temporary gateway lock path so gateway server tests don’t collide with a running instance. Use this when a prior gateway run left `/tmp/clawdis-gateway.lock` or port 18789 occupied. +- `pnpm test:force`: Kills any lingering gateway process holding the default control port, then runs the full Vitest suite with an isolated gateway port so server tests don’t collide with a running instance. Use this when a prior gateway run left port 18789 occupied. - `pnpm test:coverage`: Runs Vitest with V8 coverage. Global thresholds are 70% lines/functions/statements and 55% branches. Coverage excludes integration-heavy entrypoints (CLI wiring, gateway/telegram bridges, webchat static server) to keep the target focused on unit-testable logic. diff --git a/package.json b/package.json index a11dbca1b..13efbd407 100644 --- a/package.json +++ b/package.json @@ -46,7 +46,6 @@ "detect-libc": "^2.1.2", "dotenv": "^17.2.3", "express": "^5.2.1", - "fs-ext": "^2.1.1", "grammy": "^1.38.4", "json5": "^2.2.3", "qrcode-terminal": "^0.12.0", @@ -61,7 +60,6 @@ "@mariozechner/mini-lit": "0.2.1", "@types/body-parser": "^1.19.6", "@types/express": "^5.0.6", - "@types/fs-ext": "^2.0.3", "@types/node": "^24.10.2", "@types/qrcode-terminal": "^0.12.2", "@types/ws": "^8.18.1", diff --git a/scripts/test-force.ts b/scripts/test-force.ts index df7ec2af4..3eeb0fc52 100644 --- a/scripts/test-force.ts +++ b/scripts/test-force.ts @@ -1,48 +1,10 @@ #!/usr/bin/env tsx -import fs from "node:fs"; import os from "node:os"; import path from "node:path"; import { spawnSync } from "node:child_process"; import { forceFreePort, type PortProcess } from "../src/cli/ports.js"; const DEFAULT_PORT = 18789; -const DEFAULT_LOCK = path.join(os.tmpdir(), "clawdis-gateway.lock"); - -function killPid(pid: number, reason: string) { - try { - process.kill(pid, "SIGTERM"); - console.log(`sent SIGTERM to ${pid} (${reason})`); - } catch (err) { - if ((err as NodeJS.ErrnoException).code === "ESRCH") { - console.log(`pid ${pid} (${reason}) not running`); - } else { - console.error(`failed to kill ${pid} (${reason}): ${String(err)}`); - } - } -} - -function killLockHolder(lockPath: string) { - if (!fs.existsSync(lockPath)) return; - try { - const contents = fs.readFileSync(lockPath, "utf8").trim(); - const pid = Number.parseInt(contents.split("\n")[0] ?? "", 10); - if (Number.isFinite(pid)) { - killPid(pid, "gateway lock holder"); - } - } catch (err) { - console.error(`failed to read lock ${lockPath}: ${String(err)}`); - } -} - -function cleanupLock(lockPath: string) { - if (!fs.existsSync(lockPath)) return; - try { - fs.rmSync(lockPath, { force: true }); - console.log(`removed gateway lock: ${lockPath}`); - } catch (err) { - console.error(`failed to remove lock ${lockPath}: ${String(err)}`); - } -} function killGatewayListeners(port: number): PortProcess[] { try { @@ -86,16 +48,13 @@ function main() { process.env.CLAWDIS_GATEWAY_PORT ?? `${DEFAULT_PORT}`, 10, ); - const lockPath = process.env.CLAWDIS_GATEWAY_LOCK ?? DEFAULT_LOCK; console.log(`🧹 test:force - clearing gateway on port ${port}`); - killLockHolder(lockPath); const killed = killGatewayListeners(port); if (killed.length === 0) { console.log("no listeners to kill"); } - cleanupLock(lockPath); console.log("running pnpm test…"); runTests(); } diff --git a/src/gateway/server.test.ts b/src/gateway/server.test.ts index 630c4549d..04aabf76e 100644 --- a/src/gateway/server.test.ts +++ b/src/gateway/server.test.ts @@ -1,11 +1,9 @@ -import { randomUUID } from "node:crypto"; import { type AddressInfo, createServer } from "node:net"; -import os from "node:os"; -import path from "node:path"; -import { afterEach, beforeEach, describe, expect, test, vi } from "vitest"; +import { describe, expect, test, vi } from "vitest"; import { WebSocket } from "ws"; import { emitAgentEvent } from "../infra/agent-events.js"; import { startGatewayServer } from "./server.js"; +import { GatewayLockError } from "../infra/gateway-lock.js"; vi.mock("../commands/health.js", () => ({ getHealthSnapshot: vi.fn().mockResolvedValue({ ok: true, stub: true }), @@ -27,19 +25,6 @@ vi.mock("../commands/agent.js", () => ({ process.env.CLAWDIS_SKIP_PROVIDERS = "1"; -const originalLockPath = process.env.CLAWDIS_GATEWAY_LOCK_PATH; - -beforeEach(() => { - process.env.CLAWDIS_GATEWAY_LOCK_PATH = path.join( - os.tmpdir(), - `clawdis-gateway-${randomUUID()}.lock`, - ); -}); - -afterEach(() => { - process.env.CLAWDIS_GATEWAY_LOCK_PATH = originalLockPath; -}); - async function getFreePort(): Promise { return await new Promise((resolve, reject) => { const server = createServer(); @@ -50,6 +35,17 @@ async function getFreePort(): Promise { }); } +async function occupyPort(): Promise<{ server: ReturnType; port: number }> { + return await new Promise((resolve, reject) => { + const server = createServer(); + server.once("error", reject); + server.listen(0, "127.0.0.1", () => { + const port = (server.address() as AddressInfo).port; + resolve({ server, port }); + }); + }); +} + function onceMessage( ws: WebSocket, filter: (obj: unknown) => boolean, @@ -654,4 +650,12 @@ describe("gateway server", () => { ws.close(); await server.close(); }); + + test("refuses to start when port already bound", async () => { + const { server: blocker, port } = await occupyPort(); + await expect(startGatewayServer(port)).rejects.toBeInstanceOf( + GatewayLockError, + ); + blocker.close(); + }); }); diff --git a/src/gateway/server.ts b/src/gateway/server.ts index ec92b9ae9..49f8b409f 100644 --- a/src/gateway/server.ts +++ b/src/gateway/server.ts @@ -2,6 +2,7 @@ import { randomUUID } from "node:crypto"; import fs from "node:fs"; import os from "node:os"; import path from "node:path"; +import { createServer as createHttpServer, type Server as HttpServer } from "node:http"; import chalk from "chalk"; import { type WebSocket, WebSocketServer } from "ws"; import { createDefaultDeps } from "../cli/deps.js"; @@ -17,7 +18,7 @@ import { } from "../config/sessions.js"; import { isVerbose } from "../globals.js"; import { onAgentEvent } from "../infra/agent-events.js"; -import { acquireGatewayLock, GatewayLockError } from "../infra/gateway-lock.js"; +import { GatewayLockError } from "../infra/gateway-lock.js"; import { enqueueSystemEvent } from "../infra/system-events.js"; import { listSystemPresence, @@ -255,15 +256,38 @@ export async function startGatewayServer( port = 18789, opts?: { webchatPort?: number }, ): Promise { - const releaseLock = await acquireGatewayLock().catch((err) => { - // Bubble known lock errors so callers can present a nice message. - if (err instanceof GatewayLockError) throw err; - throw new GatewayLockError(String(err)); - }); + const host = "127.0.0.1"; + const httpServer: HttpServer = createHttpServer(); + try { + await new Promise((resolve, reject) => { + const onError = (err: NodeJS.ErrnoException) => { + httpServer.off("listening", onListening); + reject(err); + }; + const onListening = () => { + httpServer.off("error", onError); + resolve(); + }; + httpServer.once("error", onError); + httpServer.once("listening", onListening); + httpServer.listen(port, host); + }); + } catch (err) { + const code = (err as NodeJS.ErrnoException).code; + if (code === "EADDRINUSE") { + throw new GatewayLockError( + `another gateway instance is already listening on ws://${host}:${port}`, + err, + ); + } + throw new GatewayLockError( + `failed to bind gateway socket on ws://${host}:${port}: ${String(err)}`, + err, + ); + } const wss = new WebSocketServer({ - port, - host: "127.0.0.1", + server: httpServer, maxPayload: MAX_PAYLOAD_BYTES, }); const providerAbort = new AbortController(); @@ -1183,7 +1207,6 @@ export async function startGatewayServer( return { close: async () => { - await releaseLock(); providerAbort.abort(); broadcast("shutdown", { reason: "gateway stopping", @@ -1211,6 +1234,9 @@ export async function startGatewayServer( clients.clear(); await Promise.allSettled(providerTasks); await new Promise((resolve) => wss.close(() => resolve())); + await new Promise((resolve, reject) => + httpServer.close((err) => (err ? reject(err) : resolve())), + ); }, }; } diff --git a/src/infra/gateway-lock.test.ts b/src/infra/gateway-lock.test.ts deleted file mode 100644 index 9a67d7861..000000000 --- a/src/infra/gateway-lock.test.ts +++ /dev/null @@ -1,34 +0,0 @@ -import fs from "node:fs"; -import os from "node:os"; -import path from "node:path"; - -import { describe, expect, it } from "vitest"; - -import { acquireGatewayLock, GatewayLockError } from "./gateway-lock.js"; - -const newLockPath = () => - path.join( - os.tmpdir(), - `clawdis-gateway-lock-test-${process.pid}-${Math.random().toString(16).slice(2)}.sock`, - ); - -describe("gateway-lock", () => { - it("prevents concurrent gateway instances and releases cleanly", async () => { - const lockPath = newLockPath(); - - const release1 = await acquireGatewayLock(lockPath); - expect(fs.existsSync(lockPath)).toBe(true); - - await expect(acquireGatewayLock(lockPath)).rejects.toBeInstanceOf( - GatewayLockError, - ); - - await release1(); - expect(fs.existsSync(lockPath)).toBe(false); - - // After release, lock can be reacquired. - const release2 = await acquireGatewayLock(lockPath); - await release2(); - expect(fs.existsSync(lockPath)).toBe(false); - }); -}); diff --git a/src/infra/gateway-lock.ts b/src/infra/gateway-lock.ts index 6e6e0b50f..a959c17d2 100644 --- a/src/infra/gateway-lock.ts +++ b/src/infra/gateway-lock.ts @@ -1,84 +1,6 @@ -import fs from "node:fs"; -import os from "node:os"; -import path from "node:path"; - -import { flockSync } from "fs-ext"; -import { getLogger } from "../logging.js"; - -const defaultLockPath = () => - process.env.CLAWDIS_GATEWAY_LOCK_PATH ?? - path.join(os.tmpdir(), "clawdis-gateway.lock"); - -export class GatewayLockError extends Error {} - -type ReleaseFn = () => Promise; - -const SIGNALS: NodeJS.Signals[] = ["SIGINT", "SIGTERM", "SIGHUP"]; - -/** - * Acquire an exclusive gateway lock using POSIX flock and write the PID into the same file. - * - * Kernel locks are released automatically when the process exits or is SIGKILLed, so the - * lock cannot become stale. A best-effort unlink on shutdown keeps the path clean, but - * correctness relies solely on the kernel lock. - */ -export async function acquireGatewayLock( - lockPath = defaultLockPath(), -): Promise { - fs.mkdirSync(path.dirname(lockPath), { recursive: true }); - - const fd = fs.openSync(lockPath, "w+"); - try { - flockSync(fd, "exnb"); - } catch (err) { - fs.closeSync(fd); - const code = (err as NodeJS.ErrnoException).code; - if (code === "EWOULDBLOCK" || code === "EAGAIN") { - throw new GatewayLockError("another gateway instance is already running"); - } - throw new GatewayLockError( - `failed to acquire gateway lock: ${(err as Error).message}`, - ); +export class GatewayLockError extends Error { + constructor(message: string, public readonly cause?: unknown) { + super(message); + this.name = "GatewayLockError"; } - - fs.ftruncateSync(fd, 0); - fs.writeSync(fd, `${process.pid}\n`, 0, "utf8"); - fs.fsyncSync(fd); - getLogger().info({ pid: process.pid, lockPath }, "gateway lock acquired"); - - let released = false; - const release = async (): Promise => { - if (released) return; - released = true; - try { - flockSync(fd, "un"); - } catch { - /* ignore unlock errors */ - } - try { - fs.closeSync(fd); - } catch { - /* ignore close errors */ - } - try { - fs.rmSync(lockPath, { force: true }); - } catch { - /* ignore unlink errors */ - } - }; - - const handleSignal = () => { - void release(); - process.exit(0); - }; - - for (const sig of SIGNALS) { - process.once(sig, handleSignal); - } - - process.once("exit", () => { - void release(); - }); - - return release; }