fix: cleanup suspended CLI processes (#978) (thanks @Nachx639)

2026-01-16 01:37:50 +00:00
parent 8befe7f8a7
commit 8e2707e232
4 changed files with 147 additions and 57 deletions
--- a/src/agents/cli-runner.test.ts
+++ b/src/agents/cli-runner.test.ts
@@ -1,6 +1,8 @@
 import { beforeEach, describe, expect, it, vi } from "vitest";

+import type { CliBackendConfig } from "../config/types.js";
 import { runCliAgent } from "./cli-runner.js";
+import { cleanupSuspendedCliProcesses } from "./cli-runner/helpers.js";

 const runCommandWithTimeoutMock = vi.fn();
 const runExecMock = vi.fn();
@@ -17,9 +19,12 @@ describe("runCliAgent resume cleanup", () => {
  });

  it("kills stale resume processes for codex sessions", async () => {
-    // First call is for cleanupSuspendedCliProcesses (returns count 0)
-    // Second call is for cleanupResumeProcesses (pkill)
-    runExecMock.mockResolvedValue({ stdout: "0", stderr: "" });
+    runExecMock
+      .mockResolvedValueOnce({
+        stdout: "  1 S /bin/launchd\n",
+        stderr: "",
+      }) // cleanupSuspendedCliProcesses (ps)
+      .mockResolvedValueOnce({ stdout: "", stderr: "" }); // cleanupResumeProcesses (pkill)
    runCommandWithTimeoutMock.mockResolvedValueOnce({
      stdout: "ok",
      stderr: "",
@@ -45,11 +50,7 @@ describe("runCliAgent resume cleanup", () => {
      return;
    }

-    // First call: cleanupSuspendedCliProcesses (bash to count)
-    // Second call: cleanupResumeProcesses (pkill)
    expect(runExecMock).toHaveBeenCalledTimes(2);
-
-    // Verify the pkill call for resume cleanup
    const pkillCall = runExecMock.mock.calls[1] ?? [];
    expect(pkillCall[0]).toBe("pkill");
    const pkillArgs = pkillCall[1] as string[];
@@ -58,45 +59,94 @@ describe("runCliAgent resume cleanup", () => {
    expect(pkillArgs[1]).toContain("resume");
    expect(pkillArgs[1]).toContain("thread-123");
  });
+});

-  it("cleans up suspended processes when threshold exceeded", async () => {
-    // Return count > 10 to trigger cleanup
-    runExecMock
-      .mockResolvedValueOnce({ stdout: "15", stderr: "" }) // count suspended
-      .mockResolvedValueOnce({ stdout: "", stderr: "" }); // kill command
-    runCommandWithTimeoutMock.mockResolvedValueOnce({
-      stdout: "ok",
-      stderr: "",
-      code: 0,
-      signal: null,
-      killed: false,
-    });
+describe("cleanupSuspendedCliProcesses", () => {
+  beforeEach(() => {
+    runExecMock.mockReset();
+  });

-    await runCliAgent({
-      sessionId: "s1",
-      sessionFile: "/tmp/session.jsonl",
-      workspaceDir: "/tmp",
-      prompt: "hi",
-      provider: "claude-cli",
-      timeoutMs: 1_000,
-      runId: "run-1",
-    });
+  it("skips when no session tokens are configured", async () => {
+    await cleanupSuspendedCliProcesses(
+      {
+        command: "tool",
+      } as CliBackendConfig,
+      0,
+    );
+
+    if (process.platform === "win32") {
+      expect(runExecMock).not.toHaveBeenCalled();
+      return;
+    }
+
+    expect(runExecMock).not.toHaveBeenCalled();
+  });
+
+  it("matches sessionArg-based commands", async () => {
+    runExecMock
+      .mockResolvedValueOnce({
+        stdout: [
+          "  40 T+ claude --session-id thread-1 -p",
+          "  41 S  claude --session-id thread-2 -p",
+        ].join("\n"),
+        stderr: "",
+      })
+      .mockResolvedValueOnce({ stdout: "", stderr: "" });
+
+    await cleanupSuspendedCliProcesses(
+      {
+        command: "claude",
+        sessionArg: "--session-id",
+      } as CliBackendConfig,
+      0,
+    );

    if (process.platform === "win32") {
      expect(runExecMock).not.toHaveBeenCalled();
      return;
    }

-    // cleanupSuspendedCliProcesses: count + kill (2 calls)
-    // cleanupResumeProcesses: not called for claude-cli (no resumeArgs)
    expect(runExecMock).toHaveBeenCalledTimes(2);
-
-    // First call: count suspended processes
-    const countCall = runExecMock.mock.calls[0] ?? [];
-    expect(countCall[0]).toBe("bash");
-
-    // Second call: kill suspended processes
    const killCall = runExecMock.mock.calls[1] ?? [];
-    expect(killCall[0]).toBe("bash");
+    expect(killCall[0]).toBe("kill");
+    expect(killCall[1]).toEqual(["-9", "40"]);
+  });
+
+  it("matches resumeArgs with positional session id", async () => {
+    runExecMock
+      .mockResolvedValueOnce({
+        stdout: [
+          "  50 T  codex exec resume thread-99 --color never --sandbox read-only",
+          "  51 T  codex exec resume other --color never --sandbox read-only",
+        ].join("\n"),
+        stderr: "",
+      })
+      .mockResolvedValueOnce({ stdout: "", stderr: "" });
+
+    await cleanupSuspendedCliProcesses(
+      {
+        command: "codex",
+        resumeArgs: [
+          "exec",
+          "resume",
+          "{sessionId}",
+          "--color",
+          "never",
+          "--sandbox",
+          "read-only",
+        ],
+      } as CliBackendConfig,
+      1,
+    );
+
+    if (process.platform === "win32") {
+      expect(runExecMock).not.toHaveBeenCalled();
+      return;
+    }
+
+    expect(runExecMock).toHaveBeenCalledTimes(2);
+    const killCall = runExecMock.mock.calls[1] ?? [];
+    expect(killCall[0]).toBe("kill");
+    expect(killCall[1]).toEqual(["-9", "50", "51"]);
  });
 });
--- a/src/agents/cli-runner.ts
+++ b/src/agents/cli-runner.ts
@@ -209,7 +209,6 @@ export async function runCliAgent(params: {

      // Cleanup suspended processes that have accumulated (regardless of sessionId)
      await cleanupSuspendedCliProcesses(backend);
-
      if (useResume && cliSessionIdToSend) {
        await cleanupResumeProcesses(backend, cliSessionIdToSend);
      }
--- a/src/agents/cli-runner/helpers.ts
+++ b/src/agents/cli-runner/helpers.ts
@@ -44,42 +44,82 @@ export async function cleanupResumeProcesses(
  }
 }

+function buildSessionMatchers(backend: CliBackendConfig): RegExp[] {
+  const commandToken = path.basename(backend.command ?? "").trim();
+  if (!commandToken) return [];
+  const matchers: RegExp[] = [];
+  const sessionArg = backend.sessionArg?.trim();
+  const sessionArgs = backend.sessionArgs ?? [];
+  const resumeArgs = backend.resumeArgs ?? [];
+
+  const addMatcher = (args: string[]) => {
+    if (args.length === 0) return;
+    const tokens = [commandToken, ...args];
+    const pattern = tokens
+      .map((token, index) => {
+        const tokenPattern = tokenToRegex(token);
+        return index === 0 ? `(?:^|\\s)${tokenPattern}` : `\\s+${tokenPattern}`;
+      })
+      .join("");
+    matchers.push(new RegExp(pattern));
+  };
+
+  if (sessionArgs.some((arg) => arg.includes("{sessionId}"))) {
+    addMatcher(sessionArgs);
+  } else if (sessionArg) {
+    addMatcher([sessionArg, "{sessionId}"]);
+  }
+
+  if (resumeArgs.some((arg) => arg.includes("{sessionId}"))) {
+    addMatcher(resumeArgs);
+  }
+
+  return matchers;
+}
+
+function tokenToRegex(token: string): string {
+  if (!token.includes("{sessionId}")) return escapeRegex(token);
+  const parts = token.split("{sessionId}").map((part) => escapeRegex(part));
+  return parts.join("\\S+");
+}
+
 /**
 * Cleanup suspended Clawdbot CLI processes that have accumulated.
 * Only cleans up if there are more than the threshold (default: 10).
- * Uses --session-id pattern to only target Clawdbot processes, not user's Claude Code sessions.
 */
 export async function cleanupSuspendedCliProcesses(
  backend: CliBackendConfig,
  threshold = 10,
 ): Promise<void> {
  if (process.platform === "win32") return;
-  const commandToken = path.basename(backend.command ?? "").trim();
-  if (!commandToken) return;
-
-  // Pattern includes --session-id to only match Clawdbot processes
-  const pattern = `[${commandToken[0]}]${commandToken.slice(1)}.*--session-id`;
+  const matchers = buildSessionMatchers(backend);
+  if (matchers.length === 0) return;

  try {
-    // Count suspended Clawdbot processes
-    const { stdout } = await runExec("bash", [
-      "-c",
-      `ps aux | grep -E '${pattern}' | grep -E '\\s+T\\s+' | wc -l`,
-    ]);
-    const count = parseInt(stdout.trim(), 10) || 0;
+    const { stdout } = await runExec("ps", ["-ax", "-o", "pid=,stat=,command="]);
+    const suspended: number[] = [];
+    for (const line of stdout.split("\n")) {
+      const trimmed = line.trim();
+      if (!trimmed) continue;
+      const match = /^(\d+)\s+(\S+)\s+(.*)$/.exec(trimmed);
+      if (!match) continue;
+      const pid = Number(match[1]);
+      const stat = match[2] ?? "";
+      const command = match[3] ?? "";
+      if (!Number.isFinite(pid)) continue;
+      if (!stat.includes("T")) continue;
+      if (!matchers.some((matcher) => matcher.test(command))) continue;
+      suspended.push(pid);
+    }

-    if (count > threshold) {
-      // Kill suspended Clawdbot processes only
-      await runExec("bash", [
-        "-c",
-        `ps aux | grep -E '${pattern}' | grep -E '\\s+T\\s+' | awk '{print $2}' | xargs kill -9 2>/dev/null`,
-      ]);
+    if (suspended.length > threshold) {
+      // Verified locally: stopped (T) processes ignore SIGTERM, so use SIGKILL.
+      await runExec("kill", ["-9", ...suspended.map((pid) => String(pid))]);
    }
  } catch {
    // ignore errors - best effort cleanup
  }
 }
-
 export function enqueueCliRun<T>(key: string, task: () => Promise<T>): Promise<T> {
  const prior = CLI_RUN_QUEUE.get(key) ?? Promise.resolve();
  const chained = prior.catch(() => undefined).then(task);