fix(gateway): handle SIGTERM shutdown cleanly

This commit is contained in:
Peter Steinberger
2025-12-12 18:28:08 +00:00
parent c86cb4e9a5
commit 8fb3aef917
6 changed files with 72 additions and 7 deletions

View File

@@ -24,6 +24,7 @@ pnpm clawdis gateway --force
- Logs to stdout; use launchd/systemd to keep it alive and rotate logs.
- Pass `--verbose` to mirror debug logging (handshakes, req/res, events) from the log file into stdio when troubleshooting.
- `--force` uses `lsof` to find listeners on the chosen port, sends SIGTERM, logs what it killed, then starts the gateway (fails fast if `lsof` is missing).
- If you run under a supervisor (launchd/systemd/mac app child-process mode), a stop/restart typically sends **SIGTERM**; older builds may surface this as `pnpm` `ELIFECYCLE` exit code **143** (SIGTERM), which is a normal shutdown, not a crash.
- Optional shared secret: pass `--token <value>` or set `CLAWDIS_GATEWAY_TOKEN` to require clients to send `hello.auth.token`.
## Remote access

View File

@@ -269,8 +269,53 @@ Examples:
if (opts.token) {
process.env.CLAWDIS_GATEWAY_TOKEN = String(opts.token);
}
let server: Awaited<ReturnType<typeof startGatewayServer>> | null = null;
let shuttingDown = false;
let forceExitTimer: ReturnType<typeof setTimeout> | null = null;
const onSigterm = () => shutdown("SIGTERM");
const onSigint = () => shutdown("SIGINT");
const shutdown = (signal: string) => {
// Ensure we don't leak listeners across restarts/tests.
process.removeListener("SIGTERM", onSigterm);
process.removeListener("SIGINT", onSigint);
if (shuttingDown) {
defaultRuntime.log(
info(`gateway: received ${signal} during shutdown; exiting now`),
);
defaultRuntime.exit(0);
}
shuttingDown = true;
defaultRuntime.log(info(`gateway: received ${signal}; shutting down`));
// Avoid hanging forever if a provider task ignores abort.
forceExitTimer = setTimeout(() => {
defaultRuntime.error(
"gateway: shutdown timed out; exiting without full cleanup",
);
defaultRuntime.exit(0);
}, 5000);
void (async () => {
try {
await server?.close();
} catch (err) {
defaultRuntime.error(`gateway: shutdown error: ${String(err)}`);
} finally {
if (forceExitTimer) clearTimeout(forceExitTimer);
defaultRuntime.exit(0);
}
})();
};
process.once("SIGTERM", onSigterm);
process.once("SIGINT", onSigint);
try {
await startGatewayServer(port, { webchatPort });
server = await startGatewayServer(port, { webchatPort });
} catch (err) {
if (err instanceof GatewayLockError) {
defaultRuntime.error(`Gateway failed to start: ${err.message}`);

View File

@@ -157,7 +157,10 @@ describe("gateway server", () => {
},
}),
);
await onceMessage(ws, (o) => o.type === "res" && o.id === "agent-last-stale");
await onceMessage(
ws,
(o) => o.type === "res" && o.id === "agent-last-stale",
);
const spy = vi.mocked(agentCommand);
expect(spy).toHaveBeenCalled();

View File

@@ -34,9 +34,9 @@ import { monitorWebProvider, webAuthExists } from "../providers/web/index.js";
import { defaultRuntime } from "../runtime.js";
import { monitorTelegramProvider } from "../telegram/monitor.js";
import { sendMessageTelegram } from "../telegram/send.js";
import { normalizeE164 } from "../utils.js";
import { sendMessageWhatsApp } from "../web/outbound.js";
import { ensureWebChatServerFromConfig } from "../webchat/server.js";
import { normalizeE164 } from "../utils.js";
import { buildMessageWithAttachments } from "./chat-attachments.js";
import {
ErrorCodes,

View File

@@ -244,9 +244,16 @@ describe("partial reply gating", () => {
replyResolver,
);
const stored = JSON.parse(await fs.readFile(store.storePath, "utf8")) as {
main?: { lastChannel?: string; lastTo?: string };
};
let stored: { main?: { lastChannel?: string; lastTo?: string } } | null =
null;
for (let attempt = 0; attempt < 50; attempt += 1) {
stored = JSON.parse(await fs.readFile(store.storePath, "utf8")) as {
main?: { lastChannel?: string; lastTo?: string };
};
if (stored.main?.lastChannel && stored.main?.lastTo) break;
await new Promise((resolve) => setTimeout(resolve, 5));
}
if (!stored) throw new Error("store not loaded");
expect(stored.main?.lastChannel).toBe("whatsapp");
expect(stored.main?.lastTo).toBe("+1000");

View File

@@ -752,6 +752,7 @@ export async function monitorWebProvider(
// Batch inbound messages per conversation while command queue is busy.
type PendingBatch = { messages: WebInboundMsg[]; timer?: NodeJS.Timeout };
const pendingBatches = new Map<string, PendingBatch>();
const backgroundTasks = new Set<Promise<unknown>>();
const buildLine = (msg: WebInboundMsg) => {
// Build message prefix: explicit config > default based on allowFrom
@@ -863,7 +864,7 @@ export async function monitorWebProvider(
return normalizeE164(latest.from);
})();
if (to) {
void updateLastRoute({
const task = updateLastRoute({
storePath,
sessionKey: mainKey,
channel: "whatsapp",
@@ -874,6 +875,10 @@ export async function monitorWebProvider(
"failed updating last route",
);
});
backgroundTasks.add(task);
task.finally(() => {
backgroundTasks.delete(task);
});
}
}
@@ -1053,6 +1058,10 @@ export async function monitorWebProvider(
if (heartbeat) clearInterval(heartbeat);
if (replyHeartbeatTimer) clearInterval(replyHeartbeatTimer);
if (watchdogTimer) clearInterval(watchdogTimer);
if (backgroundTasks.size > 0) {
await Promise.allSettled(backgroundTasks);
backgroundTasks.clear();
}
try {
await listener.close();
} catch (err) {