diff --git a/CHANGELOG.md b/CHANGELOG.md index 570878ebe..ab2509ae9 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -12,7 +12,7 @@ Docs: https://docs.clawd.bot ### Fixes - Web UI: hide internal `message_id` hints in chat bubbles. - Heartbeat: normalize target identifiers for consistent routing. -- TUI: unify reasoning tag stripping so `` wrappers stay hidden. (#1613) Thanks @kyleok. +- Gateway: reduce log noise for late invokes + remote node probes; debounce skills refresh. (#1607) Thanks @petter-b. ## 2026.1.23-1 diff --git a/src/gateway/server-methods/nodes.ts b/src/gateway/server-methods/nodes.ts index d8787cea1..0ae80dd9c 100644 --- a/src/gateway/server-methods/nodes.ts +++ b/src/gateway/server-methods/nodes.ts @@ -468,7 +468,10 @@ export const nodeHandlers: GatewayRequestHandlers = { error: p.error ?? null, }); if (!ok) { - respond(false, undefined, errorShape(ErrorCodes.INVALID_REQUEST, "unknown invoke id")); + // Late-arriving results (after invoke timeout) are expected and harmless. + // Return success instead of error to reduce log noise; client can discard. + context.logGateway.debug(`late invoke result ignored: id=${p.id} node=${p.nodeId}`); + respond(true, { ok: true, ignored: true }, undefined); return; } respond(true, { ok: true }, undefined); diff --git a/src/gateway/server.impl.ts b/src/gateway/server.impl.ts index d2242fa2a..a3759d183 100644 --- a/src/gateway/server.impl.ts +++ b/src/gateway/server.impl.ts @@ -344,9 +344,19 @@ export async function startGatewayServer( setSkillsRemoteRegistry(nodeRegistry); void primeRemoteSkillsCache(); - registerSkillsChangeListener(() => { - const latest = loadConfig(); - void refreshRemoteBinsForConnectedNodes(latest); + // Debounce skills-triggered node probes to avoid feedback loops and rapid-fire invokes. + // Skills changes can happen in bursts (e.g., file watcher events), and each probe + // takes time to complete. A 30-second delay ensures we batch changes together. + let skillsRefreshTimer: ReturnType | null = null; + const skillsRefreshDelayMs = 30_000; + const skillsChangeUnsub = registerSkillsChangeListener((event) => { + if (event.reason === "remote-node") return; + if (skillsRefreshTimer) clearTimeout(skillsRefreshTimer); + skillsRefreshTimer = setTimeout(() => { + skillsRefreshTimer = null; + const latest = loadConfig(); + void refreshRemoteBinsForConnectedNodes(latest); + }, skillsRefreshDelayMs); }); const { tickInterval, healthInterval, dedupeCleanup } = startGatewayMaintenanceTimers({ @@ -544,6 +554,11 @@ export async function startGatewayServer( if (diagnosticsEnabled) { stopDiagnosticHeartbeat(); } + if (skillsRefreshTimer) { + clearTimeout(skillsRefreshTimer); + skillsRefreshTimer = null; + } + skillsChangeUnsub(); await close(opts); }, }; diff --git a/src/gateway/server.nodes.late-invoke.test.ts b/src/gateway/server.nodes.late-invoke.test.ts new file mode 100644 index 000000000..50801583d --- /dev/null +++ b/src/gateway/server.nodes.late-invoke.test.ts @@ -0,0 +1,125 @@ +import { afterAll, beforeAll, describe, expect, test, vi } from "vitest"; +import { WebSocket } from "ws"; + +import { GATEWAY_CLIENT_MODES, GATEWAY_CLIENT_NAMES } from "../utils/message-channel.js"; +import { loadOrCreateDeviceIdentity } from "../infra/device-identity.js"; + +vi.mock("../infra/update-runner.js", () => ({ + runGatewayUpdate: vi.fn(async () => ({ + status: "ok", + mode: "git", + root: "/repo", + steps: [], + durationMs: 12, + })), +})); + +import { + connectOk, + installGatewayTestHooks, + rpcReq, + startServerWithClient, +} from "./test-helpers.js"; + +installGatewayTestHooks({ scope: "suite" }); + +let server: Awaited>["server"]; +let ws: WebSocket; +let port: number; + +beforeAll(async () => { + const started = await startServerWithClient(); + server = started.server; + ws = started.ws; + port = started.port; + await connectOk(ws); +}); + +afterAll(async () => { + ws.close(); + await server.close(); +}); + +describe("late-arriving invoke results", () => { + test("returns success for unknown invoke id (late arrival after timeout)", async () => { + // Create a node client WebSocket + const nodeWs = new WebSocket(`ws://127.0.0.1:${port}`); + await new Promise((resolve) => nodeWs.once("open", resolve)); + + try { + // Connect as a node with device identity + const identity = loadOrCreateDeviceIdentity(); + const nodeId = identity.deviceId; + + await connectOk(nodeWs, { + role: "node", + client: { + id: GATEWAY_CLIENT_NAMES.NODE_HOST, + version: "1.0.0", + platform: "ios", + mode: GATEWAY_CLIENT_MODES.NODE, + }, + commands: ["canvas.snapshot"], + }); + + // Send an invoke result with an unknown ID (simulating late arrival after timeout) + const result = await rpcReq<{ ok?: boolean; ignored?: boolean }>( + nodeWs, + "node.invoke.result", + { + id: "unknown-invoke-id-12345", + nodeId, + ok: true, + payloadJSON: JSON.stringify({ result: "late" }), + }, + ); + + // Late-arriving results return success instead of error to reduce log noise + expect(result.ok).toBe(true); + expect(result.payload?.ok).toBe(true); + expect(result.payload?.ignored).toBe(true); + } finally { + nodeWs.close(); + } + }); + + test("returns success for unknown invoke id with error payload", async () => { + // Verifies late results are accepted regardless of their ok/error status + const nodeWs = new WebSocket(`ws://127.0.0.1:${port}`); + await new Promise((resolve) => nodeWs.once("open", resolve)); + + try { + await connectOk(nodeWs, { + role: "node", + client: { + id: GATEWAY_CLIENT_NAMES.NODE_HOST, + version: "1.0.0", + platform: "darwin", + mode: GATEWAY_CLIENT_MODES.NODE, + }, + commands: [], + }); + + const identity = loadOrCreateDeviceIdentity(); + const nodeId = identity.deviceId; + + // Late invoke result with error payload - should still return success + const result = await rpcReq<{ ok?: boolean; ignored?: boolean }>( + nodeWs, + "node.invoke.result", + { + id: "another-unknown-invoke-id", + nodeId, + ok: false, + error: { code: "FAILED", message: "test error" }, + }, + ); + + expect(result.ok).toBe(true); + expect(result.payload?.ok).toBe(true); + expect(result.payload?.ignored).toBe(true); + } finally { + nodeWs.close(); + } + }); +}); diff --git a/src/infra/skills-remote.ts b/src/infra/skills-remote.ts index 2c907dbed..1efb08459 100644 --- a/src/infra/skills-remote.ts +++ b/src/infra/skills-remote.ts @@ -55,10 +55,10 @@ function extractErrorMessage(err: unknown): string | undefined { function logRemoteBinProbeFailure(nodeId: string, err: unknown) { const message = extractErrorMessage(err); const label = describeNode(nodeId); - if (message?.includes("node not connected")) { - log.info( - `remote bin probe skipped: node not connected (${label}); check nodes list/status for ${label}`, - ); + // Node unavailable errors (not connected or disconnected mid-operation) are expected + // when nodes have transient connections - log at info level instead of warn + if (message?.includes("node not connected") || message?.includes("node disconnected")) { + log.info(`remote bin probe skipped: node unavailable (${label})`); return; } if (message?.includes("invoke timed out") || message?.includes("timeout")) { @@ -213,6 +213,15 @@ function parseBinProbePayload(payloadJSON: string | null | undefined, payload?: return []; } +function areBinSetsEqual(a: Set | undefined, b: Set): boolean { + if (!a) return false; + if (a.size !== b.size) return false; + for (const bin of b) { + if (!a.has(bin)) return false; + } + return true; +} + export async function refreshRemoteNodeBins(params: { nodeId: string; platform?: string; @@ -261,7 +270,11 @@ export async function refreshRemoteNodeBins(params: { return; } const bins = parseBinProbePayload(res.payloadJSON, res.payload); + const existingBins = remoteNodes.get(params.nodeId)?.bins; + const nextBins = new Set(bins); + const hasChanged = !areBinSetsEqual(existingBins, nextBins); recordRemoteNodeBins(params.nodeId, bins); + if (!hasChanged) return; await updatePairedNodeMetadata(params.nodeId, { bins }); bumpSkillsSnapshotVersion({ reason: "remote-node" }); } catch (err) {