feat(doctor): repair launch agent bootstrap

Co-authored-by: Dr Alexander Mikhalev <alex@metacortex.engineer>
2026-01-18 16:09:55 +00:00
parent d024dceef7
commit 1db0384090
4 changed files with 270 additions and 31 deletions
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -2,15 +2,9 @@

 Docs: https://docs.clawd.bot

-## 2026.1.17-7
+## 2026.1.18-4

 ### Changes
- Exec approvals: add `clawdbot approvals` CLI for viewing and updating gateway/node allowlists.
- CLI: add `clawdbot service` gateway/node management and a `clawdbot node status` alias.
- Status: show gateway + node service summaries in `clawdbot status` and `status --all`.
- Control UI: add gateway/node target selector for exec approvals.
- Docs: add approvals/service references and refresh node/control UI docs.
- Dependencies: update core + plugin deps (grammy, vitest, openai, Microsoft agents hosting, etc.).
 - macOS: switch PeekabooBridge integration to the tagged Swift Package Manager release (no submodule).
 - macOS: stop syncing Peekaboo as a git submodule in postinstall.
 - Swabble: use the tagged Commander Swift package release.
@@ -19,14 +13,29 @@ Docs: https://docs.clawd.bot
 - Memory: add native Gemini embeddings provider for memory search. (#1151)
 - Agents: add local docs path resolution and include docs/mirror/source/community pointers in the system prompt.
 - Slack: add HTTP webhook mode via Bolt HTTP receiver for Events API deployments. (#1143) — thanks @jdrhyne.
+
+### Fixes
+- Auth profiles: keep auto-pinned preference while allowing rotation on failover; user pins stay locked. (#1138) — thanks @cheeeee.
+- Agents: sanitize oversized image payloads before send and surface image-dimension errors.
+- macOS: Doctor repairs LaunchAgent bootstrap issues for Gateway + Node when listed but not loaded. (#1166) — thanks @AlexMikhalev.
+- macOS: avoid touching launchd in Remote over SSH so quitting the app no longer disables the remote gateway. (#1105)
+- Memory: index atomically so failed reindex preserves the previous memory database. (#1151)
+- Memory: avoid sqlite-vec unique constraint failures when reindexing duplicate chunk ids. (#1151)
+
+## 2026.1.18-5
+
+### Changes
+- Dependencies: update core + plugin deps (grammy, vitest, openai, Microsoft agents hosting, etc.).
+
+## 2026.1.18-3
+
+### Changes
 - Exec: add host/security/ask routing for gateway + node exec.
 - Exec: add `/exec` directive for per-session exec defaults (host/security/ask/node).
 - macOS: migrate exec approvals to `~/.clawdbot/exec-approvals.json` with per-agent allowlists and skill auto-allow toggle.
 - macOS: add approvals socket UI server + node exec lifecycle events.
 - Nodes: add headless node host (`clawdbot node start`) for `system.run`/`system.which`.
 - Nodes: add node daemon service install/status/start/stop/restart.
- Hooks: run `BOOT.md` on gateway startup with the boot-md hook. (#1164) — thanks @ngutman.
- Onboarding: list all eligible hooks with one-line descriptions in the enable step.
 - Bridge: add `skills.bins` RPC to support node host auto-allow skill bins.
 - Slash commands: replace `/cost` with `/usage off|tokens|full` to control per-response usage footer; `/usage` no longer aliases `/status`. (Supersedes #1140) — thanks @Nachx639.
 - Sessions: add daily reset policy with per-type overrides and idle windows (default 4am local), preserving legacy idle-only configs. (#1146) — thanks @austinm911.
@@ -34,29 +43,17 @@ Docs: https://docs.clawd.bot
 - Docs: refresh exec/elevated/exec-approvals docs for the new flow. https://docs.clawd.bot/tools/exec-approvals
 - Docs: add node host CLI + update exec approvals/bridge protocol docs. https://docs.clawd.bot/cli/node
 - ACP: add experimental ACP support for IDE integrations (`clawdbot acp`). Thanks @visionik.
- Tools: allow `sessions_spawn` to override thinking level for sub-agent runs.
- Channels: unify thread/topic allowlist matching + command/mention gating helpers across core providers.
- Models: add Qwen Portal OAuth provider support. (#1120) — thanks @mukhtharcm.
- Memory: add `--verbose` logging for memory status + batch indexing details.
- Memory: allow parallel OpenAI batch indexing jobs (default concurrency: 2).
- macOS: add per-agent exec approvals with allowlists, skill CLI auto-allow, and settings UI.
- Docs: add exec approvals guide and link from tools index. https://docs.clawd.bot/tools/exec-approvals

 ### Fixes
- Auth profiles: keep auto-pinned preference while allowing rotation on failover; user pins stay locked. (#1138) — thanks @cheeeee.
- Agents: sanitize oversized image payloads before send and surface image-dimension errors.
- macOS: avoid touching launchd in Remote over SSH so quitting the app no longer disables the remote gateway. (#1105)
- Memory: index atomically so failed reindex preserves the previous memory database. (#1151)
- Memory: avoid sqlite-vec unique constraint failures when reindexing duplicate chunk ids. (#1151)
 - Exec approvals: enforce allowlist when ask is off; prefer raw command for node approvals/events.
- Exec approvals: parse command tokens correctly for PATH and relative resolution.
 - Tools: return a companion-app-required message when node exec is requested with no paired node.
 - Streaming: emit assistant deltas for OpenAI-compatible SSE chunks. (#1147) — thanks @alauppe.
 - Model fallback: treat timeout aborts as failover while preserving user aborts. (#1137) — thanks @cheeeee.
+
+## 2026.1.18-2
+
+### Fixes
 - Tests: stabilize plugin SDK resolution and embedded agent timeouts.
- Memory: apply OpenAI batch defaults even without explicit remote config.
- macOS: bundle Textual resources in packaged app builds to avoid code block crashes. (#1006)
- Discord: only emit slow listener warnings after 30s.

 ## 2026.1.17-6

@@ -84,6 +81,22 @@ Docs: https://docs.clawd.bot
 ### Fixes
 - Voice call: include request query in Twilio webhook verification when publicUrl is set. (#864)

+## 2026.1.18-1
+
+### Changes
+- Tools: allow `sessions_spawn` to override thinking level for sub-agent runs.
+- Channels: unify thread/topic allowlist matching + command/mention gating helpers across core providers.
+- Models: add Qwen Portal OAuth provider support. (#1120) — thanks @mukhtharcm.
+- Memory: add `--verbose` logging for memory status + batch indexing details.
+- Memory: allow parallel OpenAI batch indexing jobs (default concurrency: 2).
+- macOS: add per-agent exec approvals with allowlists, skill CLI auto-allow, and settings UI.
+- Docs: add exec approvals guide and link from tools index. https://docs.clawd.bot/tools/exec-approvals
+
+### Fixes
+- Memory: apply OpenAI batch defaults even without explicit remote config.
+- macOS: bundle Textual resources in packaged app builds to avoid code block crashes. (#1006)
+- Tools: return a companion-app-required message when `system.run` is requested without a supporting node.
+- Discord: only emit slow listener warnings after 30s.
 ## 2026.1.17-3

 ### Changes
--- a/src/commands/doctor-gateway-daemon-flow.ts
+++ b/src/commands/doctor-gateway-daemon-flow.ts
@@ -1,7 +1,13 @@
 import type { ClawdbotConfig } from "../config/config.js";
 import { resolveGatewayPort } from "../config/config.js";
-import { resolveGatewayLaunchAgentLabel } from "../daemon/constants.js";
+import { resolveGatewayLaunchAgentLabel, resolveNodeLaunchAgentLabel } from "../daemon/constants.js";
 import { readLastGatewayErrorLine } from "../daemon/diagnostics.js";
+import {
+  isLaunchAgentListed,
+  isLaunchAgentLoaded,
+  launchAgentPlistExists,
+  repairLaunchAgentBootstrap,
+} from "../daemon/launchd.js";
 import { resolveGatewayService } from "../daemon/service.js";
 import { isSystemdUserServiceAvailable } from "../daemon/systemd.js";
 import { renderSystemdUnavailableHints } from "../daemon/systemd-hints.js";
@@ -21,6 +27,53 @@ import type { DoctorOptions, DoctorPrompter } from "./doctor-prompter.js";
 import { healthCommand } from "./health.js";
 import { formatHealthCheckFailure } from "./health-format.js";

+/**
+ * Doctor step (macOS only): offers to re-bootstrap a LaunchAgent that shows up in the launchctl
+ * listing, is not reported as loaded, and still has a plist on disk.
+ *
+ * @returns true only when the repair ran and the agent is reported loaded afterwards.
+ */
+async function maybeRepairLaunchAgentBootstrap(params: {
+  env: Record<string, string | undefined>;
+  title: string;
+  runtime: RuntimeEnv;
+  prompter: DoctorPrompter;
+}): Promise<boolean> {
+  const { env, title, runtime, prompter } = params;
+  const noteTitle = `${title} LaunchAgent`;
+  if (process.platform !== "darwin") return false;
+
+  // Only the "listed, not loaded, plist present" combination is handled; anything else bails out.
+  if (!(await isLaunchAgentListed({ env }))) return false;
+  if (await isLaunchAgentLoaded({ env })) return false;
+  if (!(await launchAgentPlistExists(env))) return false;
+
+  note("LaunchAgent is listed but not loaded in launchd.", noteTitle);
+
+  const confirmed = await prompter.confirmSkipInNonInteractive({
+    message: `Repair ${title} LaunchAgent bootstrap now?`,
+    initialValue: true,
+  });
+  if (!confirmed) return false;
+
+  runtime.log(`Bootstrapping ${title} LaunchAgent...`);
+  const outcome = await repairLaunchAgentBootstrap({ env });
+  if (!outcome.ok) {
+    const reason = outcome.detail ?? "unknown error";
+    runtime.error(`${title} LaunchAgent bootstrap failed: ${reason}`);
+    return false;
+  }
+
+  // Re-query the loaded state instead of trusting the launchctl exit codes alone.
+  if (!(await isLaunchAgentLoaded({ env }))) {
+    runtime.error(`${title} LaunchAgent still not loaded after repair.`);
+    return false;
+  }
+
+  note(`${title} LaunchAgent repaired.`, noteTitle);
+  return true;
+}
+
 export async function maybeRepairGatewayDaemon(params: {
  cfg: ClawdbotConfig;
  runtime: RuntimeEnv;
@@ -32,12 +85,33 @@ export async function maybeRepairGatewayDaemon(params: {
  if (params.healthOk) return;

  const service = resolveGatewayService();
-  const loaded = await service.isLoaded({ env: process.env });
+  let loaded = await service.isLoaded({ env: process.env });
  let serviceRuntime: Awaited<ReturnType<typeof service.readRuntime>> | undefined;
  if (loaded) {
    serviceRuntime = await service.readRuntime(process.env).catch(() => undefined);
  }

+  if (process.platform === "darwin" && params.cfg.gateway?.mode !== "remote") {
+    const gatewayRepaired = await maybeRepairLaunchAgentBootstrap({
+      env: process.env,
+      title: "Gateway",
+      runtime: params.runtime,
+      prompter: params.prompter,
+    });
+    await maybeRepairLaunchAgentBootstrap({
+      env: { ...process.env, CLAWDBOT_LAUNCHD_LABEL: resolveNodeLaunchAgentLabel() },
+      title: "Node",
+      runtime: params.runtime,
+      prompter: params.prompter,
+    });
+    if (gatewayRepaired) {
+      loaded = await service.isLoaded({ env: process.env });
+      if (loaded) {
+        serviceRuntime = await service.readRuntime(process.env).catch(() => undefined);
+      }
+    }
+  }
+
  if (params.cfg.gateway?.mode !== "remote") {
    const port = resolveGatewayPort(params.cfg, process.env);
    const diagnostics = await inspectPortUsage(port);
--- a/src/daemon/launchd.test.ts
+++ b/src/daemon/launchd.test.ts
@@ -5,7 +5,88 @@ import { PassThrough } from "node:stream";

 import { describe, expect, it } from "vitest";

-import { installLaunchAgent, parseLaunchctlPrint, resolveLaunchAgentPlistPath } from "./launchd.js";
+import {
+  installLaunchAgent,
+  isLaunchAgentListed,
+  parseLaunchctlPrint,
+  repairLaunchAgentBootstrap,
+  resolveLaunchAgentPlistPath,
+} from "./launchd.js";
+
+/**
+ * Runs `run` with a fake `launchctl` first on PATH. The stub appends each argv as a JSON line to
+ * `logPath` and echoes `options.listOutput` for `list`; env vars and temp files are restored after.
+ */
+async function withLaunchctlStub(
+  options: { listOutput?: string },
+  run: (context: { env: Record<string, string | undefined>; logPath: string }) => Promise<void>,
+) {
+  const savedEnv = {
+    PATH: process.env.PATH,
+    CLAWDBOT_TEST_LAUNCHCTL_LOG: process.env.CLAWDBOT_TEST_LAUNCHCTL_LOG,
+    CLAWDBOT_TEST_LAUNCHCTL_LIST_OUTPUT: process.env.CLAWDBOT_TEST_LAUNCHCTL_LIST_OUTPUT,
+  };
+
+  const tmpDir = await fs.mkdtemp(path.join(os.tmpdir(), "clawdbot-launchctl-test-"));
+  try {
+    const binDir = path.join(tmpDir, "bin");
+    const homeDir = path.join(tmpDir, "home");
+    const logPath = path.join(tmpDir, "launchctl.log");
+    await fs.mkdir(binDir, { recursive: true });
+    await fs.mkdir(homeDir, { recursive: true });
+
+    const stubJsPath = path.join(binDir, "launchctl.js");
+    await fs.writeFile(
+      stubJsPath,
+      [
+        'import fs from "node:fs";',
+        "const args = process.argv.slice(2);",
+        "const logPath = process.env.CLAWDBOT_TEST_LAUNCHCTL_LOG;",
+        "if (logPath) {",
+        '  fs.appendFileSync(logPath, JSON.stringify(args) + "\\n", "utf8");',
+        "}",
+        'if (args[0] === "list") {',
+        "  const output = process.env.CLAWDBOT_TEST_LAUNCHCTL_LIST_OUTPUT || \"\";",
+        "  process.stdout.write(output);",
+        "}",
+        "process.exit(0);",
+        "",
+      ].join("\n"),
+      "utf8",
+    );
+
+    if (process.platform === "win32") {
+      await fs.writeFile(
+        path.join(binDir, "launchctl.cmd"),
+        `@echo off\r\nnode "%~dp0\\launchctl.js" %*\r\n`,
+        "utf8",
+      );
+    } else {
+      const shPath = path.join(binDir, "launchctl");
+      await fs.writeFile(shPath, `#!/bin/sh\nnode "$(dirname "$0")/launchctl.js" "$@"\n`, "utf8");
+      await fs.chmod(shPath, 0o755);
+    }
+
+    process.env.CLAWDBOT_TEST_LAUNCHCTL_LOG = logPath;
+    process.env.CLAWDBOT_TEST_LAUNCHCTL_LIST_OUTPUT = options.listOutput ?? "";
+    process.env.PATH = `${binDir}${path.delimiter}${savedEnv.PATH ?? ""}`;
+
+    await run({
+      env: {
+        HOME: homeDir,
+        CLAWDBOT_PROFILE: "default",
+      },
+      logPath,
+    });
+  } finally {
+    // Assigning undefined to process.env stores the string "undefined", so unset keys are deleted.
+    for (const [key, value] of Object.entries(savedEnv)) {
+      if (value === undefined) delete process.env[key];
+      else process.env[key] = value;
+    }
+    await fs.rm(tmpDir, { recursive: true, force: true });
+  }
+}

 describe("launchd runtime parsing", () => {
  it("parses state, pid, and exit status", () => {
@@ -24,6 +105,46 @@ describe("launchd runtime parsing", () => {
  });
 });

+// The stub echoes a canned `launchctl list` row whose last whitespace-separated field is the
+// label. One case uses the label the tests expect for the default-profile env, the other a
+// foreign label that must not count as listed.
+describe("launchctl list detection", () => {
+  it("detects the resolved label in launchctl list", async () => {
+    const listOutput = "123 0 com.clawdbot.gateway\n";
+    await withLaunchctlStub({ listOutput }, async ({ env }) => {
+      expect(await isLaunchAgentListed({ env })).toBe(true);
+    });
+  });
+
+  it("returns false when the label is missing", async () => {
+    const listOutput = "123 0 com.other.service\n";
+    await withLaunchctlStub({ listOutput }, async ({ env }) => {
+      expect(await isLaunchAgentListed({ env })).toBe(false);
+    });
+  });
+});
+
+describe("launchd bootstrap repair", () => {
+  it("bootstraps and kickstarts the resolved label", async () => {
+    await withLaunchctlStub({}, async ({ env, logPath }) => {
+      const result = await repairLaunchAgentBootstrap({ env });
+      expect(result.ok).toBe(true);
+
+      // The stub logs one JSON-encoded argv array per launchctl invocation.
+      const logText = await fs.readFile(logPath, "utf8");
+      const invocations: string[][] = [];
+      for (const line of logText.split("\n")) {
+        if (line) invocations.push(JSON.parse(line) as string[]);
+      }
+
+      const uid = typeof process.getuid === "function" ? process.getuid() : 501;
+      const domain = `gui/${uid}`;
+      expect(invocations).toContainEqual(["bootstrap", domain, resolveLaunchAgentPlistPath(env)]);
+      expect(invocations).toContainEqual(["kickstart", "-k", `${domain}/com.clawdbot.gateway`]);
+    });
+  });
+});
+
 describe("launchd install", () => {
  it("enables service before bootstrap (clears persisted disabled state)", async () => {
    const originalPath = process.env.PATH;
--- a/src/daemon/launchd.ts
+++ b/src/daemon/launchd.ts
@@ -170,9 +170,22 @@ export async function isLaunchAgentLoaded(args: {
  return res.code === 0;
 }

-async function hasLaunchAgentPlist(env: Record<string, string | undefined>): Promise<boolean> {
-  const plistPath = resolveLaunchAgentPlistPath(env);
+/** Reports whether some row of `launchctl list` output ends with the resolved LaunchAgent label. */
+export async function isLaunchAgentListed(args: {
+  env?: Record<string, string | undefined>;
+}): Promise<boolean> {
+  const wanted = resolveLaunchAgentLabel({ env: args.env });
+  const { code, stdout } = await execLaunchctl(["list"]);
+  if (code !== 0) return false; // a failed `launchctl list` is reported as "not listed"
+  const lastFields = stdout.split(/\r?\n/).map((row) => row.trim().split(/\s+/).at(-1));
+  return lastFields.includes(wanted);
+}
+
+export async function launchAgentPlistExists(
+  env: Record<string, string | undefined>,
+): Promise<boolean> {
  try {
+    const plistPath = resolveLaunchAgentPlistPath(env);
    await fs.access(plistPath);
    return true;
  } catch {
@@ -194,7 +207,7 @@ export async function readLaunchAgentRuntime(
    };
  }
  const parsed = parseLaunchctlPrint(res.stdout || res.stderr || "");
-  const plistExists = await hasLaunchAgentPlist(env);
+  const plistExists = await launchAgentPlistExists(env);
  const state = parsed.state?.toLowerCase();
  const status = state === "running" || parsed.pid ? "running" : state ? "stopped" : "unknown";
  return {
@@ -207,6 +220,24 @@ export async function readLaunchAgentRuntime(
  };
 }

+/** Enables, bootstraps, then kickstarts the LaunchAgent; `detail` is launchctl output on failure. */
+export async function repairLaunchAgentBootstrap(args: {
+  env?: Record<string, string | undefined>;
+}): Promise<{ ok: boolean; detail?: string }> {
+  const env = args.env ?? (process.env as Record<string, string | undefined>);
+  const domain = resolveGuiDomain();
+  const target = `${domain}/${resolveLaunchAgentLabel({ env })}`;
+  const plistPath = resolveLaunchAgentPlistPath(env);
+  // A persisted "disabled" override can make bootstrap fail, so clear it first. The result is
+  // not checked: if enabling did not help, the bootstrap step below reports the real failure.
+  await execLaunchctl(["enable", target]);
+  for (const argv of [["bootstrap", domain, plistPath], ["kickstart", "-k", target]]) {
+    const res = await execLaunchctl(argv);
+    if (res.code !== 0) return { ok: false, detail: (res.stderr || res.stdout).trim() || undefined };
+  }
+  return { ok: true };
+}
+
 export type LegacyLaunchAgent = {
  label: string;
  plistPath: string;