fix: gate gateway restarts and discord abort reconnects

This commit is contained in:
Peter Steinberger
2026-01-18 23:25:04 +00:00
parent e97bcf4dae
commit d802844bd6
11 changed files with 143 additions and 8 deletions

View File

@@ -28,6 +28,8 @@ Docs: https://docs.clawd.bot
- TUI: highlight model search matches and stabilize search ordering.
- CLI: keep banners on routed commands, restore config guarding outside fast-path routing, and tighten fast-path flag parsing while skipping console capture for extra speed. (#1195) — thanks @gumadeiras.
- Slack: resolve Bolt import interop for Bun + Node. (#1191) — thanks @CoreyH.
- Gateway: require authorized restarts for SIGUSR1 (restart/apply/update) so config gating can't be bypassed.
- Discord: stop reconnecting the gateway after aborts to prevent duplicate listeners.
## 2026.1.18-4

View File

@@ -28,7 +28,7 @@ clawdbot gateway
Notes:
- By default, the Gateway refuses to start unless `gateway.mode=local` is set in `~/.clawdbot/clawdbot.json`. Use `--allow-unconfigured` for ad-hoc/dev runs.
- Binding beyond loopback without auth is blocked (safety guardrail).
- `SIGUSR1` triggers an in-process restart (useful without a supervisor).
- `SIGUSR1` triggers an in-process restart when authorized (enable `commands.restart` or use the gateway tool/config apply/update).
- `SIGINT`/`SIGTERM` handlers stop the gateway process, but they don't restore any custom terminal state. If you wrap the CLI with a TUI or raw-mode input, restore the terminal before exit.
### Options

View File

@@ -34,7 +34,7 @@ pnpm gateway:watch
- Pass `--verbose` to mirror debug logging (handshakes, req/res, events) from the log file into stdio when troubleshooting.
- `--force` uses `lsof` to find listeners on the chosen port, sends SIGTERM, logs what it killed, then starts the gateway (fails fast if `lsof` is missing).
- If you run under a supervisor (launchd/systemd/mac app child-process mode), a stop/restart typically sends **SIGTERM**; older builds may surface this as `pnpm` `ELIFECYCLE` exit code **143** (SIGTERM), which is a normal shutdown, not a crash.
- **SIGUSR1** triggers an in-process restart (no external supervisor required). This is what the `gateway` agent tool uses.
- **SIGUSR1** triggers an in-process restart when authorized (gateway tool/config apply/update, or enable `commands.restart` for manual restarts).
- Gateway auth: set `gateway.auth.mode=token` + `gateway.auth.token` (or pass `--token <value>` / `CLAWDBOT_GATEWAY_TOKEN`) to require clients to send `connect.params.auth.token`.
- The wizard now generates a token by default, even on loopback.
- Port precedence: `--port` > `CLAWDBOT_GATEWAY_PORT` > `gateway.port` > default `18789`.

View File

@@ -356,7 +356,7 @@ Notes:
Restart or apply updates to the running Gateway process (in-place).
Core actions:
- `restart` (sends `SIGUSR1` to the current process; `clawdbot gateway` restart in-place)
- `restart` (authorizes + sends `SIGUSR1` for in-process restart; `clawdbot gateway` restart in-place)
- `config.get` / `config.schema`
- `config.apply` (validate + write config + restart + wake)
- `update.run` (run update + restart + wake)

View File

@@ -1,5 +1,9 @@
import type { startGatewayServer } from "../../gateway/server.js";
import { createSubsystemLogger } from "../../logging.js";
import {
consumeGatewaySigusr1RestartAuthorization,
isGatewaySigusr1RestartExternallyAllowed,
} from "../../infra/restart.js";
import { createSubsystemLogger } from "../../logging/subsystem.js";
import type { defaultRuntime } from "../../runtime.js";
const gatewayLog = createSubsystemLogger("gateway");
@@ -67,6 +71,13 @@ export async function runGatewayLoop(params: {
};
// SIGUSR1 handler: request a restart only when a pending in-process
// authorization can be consumed or the external-restart policy allows it;
// otherwise log a warning and ignore the signal.
const onSigusr1 = () => {
  gatewayLog.info("signal SIGUSR1 received");
  // The consume call runs first and unconditionally, so a pending
  // authorization is always used up by the signal that arrives.
  const restartPermitted =
    consumeGatewaySigusr1RestartAuthorization() || isGatewaySigusr1RestartExternallyAllowed();
  if (restartPermitted) {
    request("restart", "SIGUSR1");
    return;
  }
  gatewayLog.warn(
    "SIGUSR1 restart ignored (not authorized; enable commands.restart or use gateway tool).",
  );
};

View File

@@ -14,7 +14,7 @@ import {
import type { ClawdbotConfig, ReplyToMode } from "../../config/config.js";
import { loadConfig } from "../../config/config.js";
import { danger, logVerbose, shouldLogVerbose, warn } from "../../globals.js";
import { createSubsystemLogger } from "../../logging.js";
import { createSubsystemLogger } from "../../logging/subsystem.js";
import type { RuntimeEnv } from "../../runtime.js";
import { resolveDiscordAccount } from "../accounts.js";
import { attachDiscordGatewayLogging } from "../gateway-logging.js";
@@ -443,6 +443,17 @@ export async function monitorDiscordProvider(opts: MonitorDiscordOpts = {}) {
emitter: gatewayEmitter,
runtime,
});
// Wire the caller's abort signal to the Discord gateway: on abort, turn off
// reconnect attempts and disconnect.
const abortSignal = opts.abortSignal;
const onAbort = () => {
  if (gateway) {
    // Zero reconnect attempts before disconnecting so the gateway stays down.
    gateway.options.reconnect = { maxAttempts: 0 };
    gateway.disconnect();
  }
};
if (abortSignal) {
  if (abortSignal.aborted) {
    // Already aborted before we got here: tear down immediately.
    onAbort();
  } else {
    abortSignal.addEventListener("abort", onAbort, { once: true });
  }
}
// Timeout to detect zombie connections where HELLO is never received.
const HELLO_TIMEOUT_MS = 30000;
let helloTimeoutId: ReturnType<typeof setTimeout> | undefined;
@@ -472,7 +483,7 @@ export async function monitorDiscordProvider(opts: MonitorDiscordOpts = {}) {
disconnect: () => gateway.disconnect(),
}
: undefined,
abortSignal: opts.abortSignal,
abortSignal,
onGatewayError: (err) => {
runtime.error?.(danger(`discord gateway error: ${String(err)}`));
},
@@ -487,6 +498,7 @@ export async function monitorDiscordProvider(opts: MonitorDiscordOpts = {}) {
stopGatewayLogging();
if (helloTimeoutId) clearTimeout(helloTimeoutId);
gatewayEmitter?.removeListener("debug", onGatewayDebug);
abortSignal?.removeEventListener("abort", onAbort);
}
}

View File

@@ -3,6 +3,10 @@ import type { loadConfig } from "../config/config.js";
import { startGmailWatcher, stopGmailWatcher } from "../hooks/gmail-watcher.js";
import { startHeartbeatRunner } from "../infra/heartbeat-runner.js";
import { resetDirectoryCache } from "../infra/outbound/target-resolver.js";
import {
authorizeGatewaySigusr1Restart,
setGatewaySigusr1RestartPolicy,
} from "../infra/restart.js";
import { setCommandLaneConcurrency } from "../process/command-queue.js";
import { isTruthyEnvValue } from "../infra/env.js";
import type { ChannelKind, GatewayReloadPlan } from "./config-reload.js";
@@ -38,6 +42,7 @@ export function createGatewayReloadHandlers(params: {
plan: GatewayReloadPlan,
nextConfig: ReturnType<typeof loadConfig>,
) => {
setGatewaySigusr1RestartPolicy({ allowExternal: nextConfig.commands?.restart === true });
const state = params.getState();
const nextState = { ...state };
@@ -139,8 +144,9 @@ export function createGatewayReloadHandlers(params: {
const requestGatewayRestart = (
plan: GatewayReloadPlan,
_nextConfig: ReturnType<typeof loadConfig>,
nextConfig: ReturnType<typeof loadConfig>,
) => {
setGatewaySigusr1RestartPolicy({ allowExternal: nextConfig.commands?.restart === true });
const reasons = plan.restartReasons.length
? plan.restartReasons.join(", ")
: plan.changedPaths.join(", ");
@@ -149,6 +155,7 @@ export function createGatewayReloadHandlers(params: {
params.logReload.warn("no SIGUSR1 listener found; restart skipped");
return;
}
authorizeGatewaySigusr1Restart();
process.emit("SIGUSR1");
};

View File

@@ -23,8 +23,9 @@ import {
setSkillsRemoteBridge,
} from "../infra/skills-remote.js";
import { scheduleGatewayUpdateCheck } from "../infra/update-startup.js";
import { setGatewaySigusr1RestartPolicy } from "../infra/restart.js";
import { autoMigrateLegacyState } from "../infra/state-migrations.js";
import { createSubsystemLogger, runtimeForLogger } from "../logging.js";
import { createSubsystemLogger, runtimeForLogger } from "../logging/subsystem.js";
import type { PluginServicesHandle } from "../plugins/services.js";
import type { RuntimeEnv } from "../runtime.js";
import { runOnboardingWizard } from "../wizard/onboarding.js";
@@ -172,6 +173,7 @@ export async function startGatewayServer(
}
const cfgAtStart = loadConfig();
setGatewaySigusr1RestartPolicy({ allowExternal: cfgAtStart.commands?.restart === true });
initSubagentRegistry();
await autoMigrateLegacyState({ cfg: cfgAtStart, log });
const defaultAgentId = resolveDefaultAgentId(cfgAtStart);

41
src/infra/restart.test.ts Normal file
View File

@@ -0,0 +1,41 @@
import { afterEach, beforeEach, describe, expect, it, vi } from "vitest";
import {
__testing,
consumeGatewaySigusr1RestartAuthorization,
isGatewaySigusr1RestartExternallyAllowed,
scheduleGatewaySigusr1Restart,
setGatewaySigusr1RestartPolicy,
} from "./restart.js";
// Covers the SIGUSR1 restart authorization helpers: one-shot consumption of a
// scheduled authorization and the external-restart policy flag.
describe("restart authorization", () => {
beforeEach(() => {
// Start every test from cleared authorization/policy state.
__testing.resetSigusr1State();
vi.useFakeTimers();
// Stub process.kill (always returns true) — presumably so a scheduled restart
// cannot signal the real test process; confirm against restart.ts.
vi.spyOn(process, "kill").mockImplementation(() => true);
});
afterEach(async () => {
// Flush timers that are still pending before switching back to real timers.
await vi.runOnlyPendingTimersAsync();
vi.useRealTimers();
vi.restoreAllMocks();
__testing.resetSigusr1State();
});
it("consumes a scheduled authorization once", async () => {
// Nothing has been authorized yet.
expect(consumeGatewaySigusr1RestartAuthorization()).toBe(false);
scheduleGatewaySigusr1Restart({ delayMs: 0 });
// Scheduling grants exactly one authorization: first consume succeeds, second fails.
expect(consumeGatewaySigusr1RestartAuthorization()).toBe(true);
expect(consumeGatewaySigusr1RestartAuthorization()).toBe(false);
await vi.runAllTimersAsync();
});
it("tracks external restart policy", () => {
// After a state reset, external restarts are not allowed.
expect(isGatewaySigusr1RestartExternallyAllowed()).toBe(false);
setGatewaySigusr1RestartPolicy({ allowExternal: true });
expect(isGatewaySigusr1RestartExternallyAllowed()).toBe(true);
});
});

View File

@@ -12,6 +12,45 @@ export type RestartAttempt = {
};
const SPAWN_TIMEOUT_MS = 2000;
/** Extra time (ms) after the requested delay during which an authorization stays valid. */
const SIGUSR1_AUTH_GRACE_MS = 5000;
/** Number of SIGUSR1 restarts currently authorized and not yet consumed. */
let sigusr1AuthorizedCount = 0;
/** Epoch-ms deadline shared by all pending authorizations (0 when none are pending). */
let sigusr1AuthorizedUntil = 0;
/** Whether SIGUSR1 restarts without a pending authorization are accepted. */
let sigusr1ExternalAllowed = false;

/**
 * Drops every pending authorization once the shared deadline has passed.
 *
 * @param now - Current time in epoch milliseconds (defaults to `Date.now()`).
 */
function resetSigusr1AuthorizationIfExpired(now = Date.now()) {
  if (sigusr1AuthorizedCount <= 0) return;
  if (now <= sigusr1AuthorizedUntil) return;
  sigusr1AuthorizedCount = 0;
  sigusr1AuthorizedUntil = 0;
}

/**
 * Sets whether SIGUSR1 restarts are accepted without a prior authorization.
 *
 * @param opts - `allowExternal` must be exactly `true` to enable; anything
 *   else (including omitting `opts`) disables it.
 */
export function setGatewaySigusr1RestartPolicy(opts?: { allowExternal?: boolean }) {
  sigusr1ExternalAllowed = opts?.allowExternal === true;
}

/** @returns `true` when the current policy accepts unauthorized SIGUSR1 restarts. */
export function isGatewaySigusr1RestartExternallyAllowed() {
  return sigusr1ExternalAllowed;
}

/**
 * Authorizes one upcoming SIGUSR1 restart. The authorization is valid until
 * `delayMs` plus the grace period from now.
 *
 * @param delayMs - Expected delay before the signal is sent. Negative values
 *   are clamped to 0 and fractions are floored; non-finite values (NaN,
 *   ±Infinity) are treated as 0 so the authorization neither vanishes
 *   immediately nor lasts forever.
 */
export function authorizeGatewaySigusr1Restart(delayMs = 0) {
  const now = Date.now();
  // Expire stale authorizations first; otherwise an old, never-consumed
  // authorization would be revived by the fresh deadline set below.
  resetSigusr1AuthorizationIfExpired(now);
  const delay = Number.isFinite(delayMs) ? Math.max(0, Math.floor(delayMs)) : 0;
  const expiresAt = now + delay + SIGUSR1_AUTH_GRACE_MS;
  sigusr1AuthorizedCount += 1;
  // Pending authorizations share one deadline: keep the latest.
  if (expiresAt > sigusr1AuthorizedUntil) {
    sigusr1AuthorizedUntil = expiresAt;
  }
}

/**
 * Consumes one pending SIGUSR1 restart authorization, if any is still valid.
 *
 * @returns `true` when an unexpired authorization was consumed, `false` otherwise.
 */
export function consumeGatewaySigusr1RestartAuthorization(): boolean {
  resetSigusr1AuthorizationIfExpired();
  if (sigusr1AuthorizedCount <= 0) return false;
  sigusr1AuthorizedCount -= 1;
  if (sigusr1AuthorizedCount <= 0) {
    // Last authorization used: clear the deadline as well.
    sigusr1AuthorizedUntil = 0;
  }
  return true;
}
function formatSpawnDetail(result: {
error?: unknown;
@@ -134,6 +173,7 @@ export function scheduleGatewaySigusr1Restart(opts?: {
typeof opts?.reason === "string" && opts.reason.trim()
? opts.reason.trim().slice(0, 200)
: undefined;
authorizeGatewaySigusr1Restart(delayMs);
const pid = process.pid;
const hasListener = process.listenerCount("SIGUSR1") > 0;
setTimeout(() => {
@@ -156,3 +196,11 @@ export function scheduleGatewaySigusr1Restart(opts?: {
mode: hasListener ? "emit" : "signal",
};
}
/** Test-only hooks for resetting the module-level SIGUSR1 state. */
export const __testing = {
  /** Clears the external-restart policy flag and any pending authorizations. */
  resetSigusr1State: () => {
    sigusr1ExternalAllowed = false;
    sigusr1AuthorizedUntil = 0;
    sigusr1AuthorizedCount = 0;
  },
};

View File

@@ -45,6 +45,10 @@ async function main() {
{ startGatewayServer },
{ setGatewayWsLogStyle },
{ setVerbose },
{
consumeGatewaySigusr1RestartAuthorization,
isGatewaySigusr1RestartExternallyAllowed,
},
{ defaultRuntime },
{ enableConsoleCapture, setConsoleTimestampPrefix },
] = await Promise.all([
@@ -52,6 +56,7 @@ async function main() {
import("../gateway/server.js"),
import("../gateway/ws-logging.js"),
import("../globals.js"),
import("../infra/restart.js"),
import("../runtime.js"),
import("../logging.js"),
]);
@@ -156,6 +161,13 @@ async function main() {
};
// SIGUSR1 handler: request a restart only when a pending authorization can be
// consumed or the external-restart policy allows it; otherwise log and ignore.
const onSigusr1 = () => {
  defaultRuntime.log("gateway: signal SIGUSR1 received");
  // The consume call runs first and unconditionally, so a pending
  // authorization is always used up by the signal that arrives.
  const restartPermitted =
    consumeGatewaySigusr1RestartAuthorization() || isGatewaySigusr1RestartExternallyAllowed();
  if (restartPermitted) {
    request("restart", "SIGUSR1");
    return;
  }
  defaultRuntime.log(
    "gateway: SIGUSR1 restart ignored (not authorized; enable commands.restart or use gateway tool).",
  );
};