From 6550e7d5626183404bfc491d1932f1a5b266a43d Mon Sep 17 00:00:00 2001 From: Peter Steinberger Date: Tue, 23 Dec 2025 02:26:11 +0100 Subject: [PATCH] fix: add whatsapp reply context --- CHANGELOG.md | 5 ++++ docs/surface.md | 1 + docs/telegram.md | 4 +-- src/web/auto-reply.test.ts | 41 ++++++++++++++++++++++++++ src/web/auto-reply.ts | 3 ++ src/web/inbound.ts | 43 +++++++++++++++++++++++++-- src/web/monitor-inbox.test.ts | 55 +++++++++++++++++++++++++++++++++-- 7 files changed, 145 insertions(+), 7 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index c97226db8..d848133fd 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,10 @@ # Changelog +## Unreleased — 2025-12-23 + +### Fixes +- Telegram/WhatsApp: native replies now target the original inbound message; reply context is captured in `ReplyTo*` fields for templates. (Thanks @joshp123 for the PR and follow-up question.) + ## 2.0.0-beta2 — 2025-12-21 Second beta focused on bundled gateway packaging, skills management, onboarding polish, and provider reliability. diff --git a/docs/surface.md b/docs/surface.md index 80ac85985..86107a79a 100644 --- a/docs/surface.md +++ b/docs/surface.md @@ -10,6 +10,7 @@ Updated: 2025-12-07 Goal: make replies deterministic per channel while keeping one shared context for direct chats. - **Surfaces** (channel labels): `whatsapp`, `webchat`, `telegram`, `voice`, etc. Add `Surface` to inbound `MsgContext` so templates/agents can log which channel a turn came from. Routing is fixed: replies go back to the origin surface; the model doesn’t choose. +- **Reply context (optional):** inbound replies may include `ReplyToId`, `ReplyToBody`, and `ReplyToSender` so templates can surface the quoted context when needed. - **Canonical direct session:** All direct chats collapse into the single `main` session by default (no config needed). Groups stay `group:<jid>`, so they remain isolated.
- **Session store:** Keys are resolved via `resolveSessionKey(scope, ctx, mainKey)`; the agent JSONL path lives under `~/.clawdis/sessions/<SessionId>.jsonl`. - **WebChat:** Always attaches to `main`, loads the full session transcript so desktop reflects cross-surface history, and writes new turns back to the same session. diff --git a/docs/telegram.md b/docs/telegram.md index 94ad4bc68..03c25dad2 100644 --- a/docs/telegram.md +++ b/docs/telegram.md @@ -30,11 +30,11 @@ Status: ready for bot-mode use with grammY (long-polling by default; webhook sup - Sees only messages sent after it’s added to a chat; no pre-history access. - Cannot DM users first; they must initiate. Channels are receive-only unless the bot is an admin poster. - File size caps follow Telegram Bot API (up to 2 GB for documents; smaller for some media types). -- Typing indicators (`sendChatAction`) supported; inline reply/threading supported where Telegram allows. +- Typing indicators (`sendChatAction`) supported; outbound replies are sent as native replies to the triggering message (threaded where Telegram allows). ## Planned implementation details - Library: grammY is the only client for send + gateway (fetch fallback removed); grammY throttler is enabled by default to stay under Bot API limits. -- Inbound normalization: maps Bot API updates to `MsgContext` with `Surface: "telegram"`, `ChatType: direct|group`, `SenderName`, `MediaPath`/`MediaType` when attachments arrive, and `Timestamp`; groups require @bot mention by default. +- Inbound normalization: maps Bot API updates to `MsgContext` with `Surface: "telegram"`, `ChatType: direct|group`, `SenderName`, `MediaPath`/`MediaType` when attachments arrive, `Timestamp`, and reply-to metadata (`ReplyToId`, `ReplyToBody`, `ReplyToSender`) when the user replies; groups require @bot mention by default. - Outbound: text and media (photo/video/audio/document) with optional caption; chunked to limits. Typing cue sent best-effort.
- Config: `TELEGRAM_BOT_TOKEN` env or `telegram.botToken` required; `telegram.requireMention`, `telegram.allowFrom`, `telegram.mediaMaxMb`, `telegram.proxy`, `telegram.webhookSecret`, `telegram.webhookUrl`, `telegram.webhookPath` supported. diff --git a/src/web/auto-reply.test.ts b/src/web/auto-reply.test.ts index 18277331e..31bb65ce6 100644 --- a/src/web/auto-reply.test.ts +++ b/src/web/auto-reply.test.ts @@ -1751,6 +1751,47 @@ describe("web auto-reply", () => { expect(callArg?.Body).toContain("hello"); }); + it("forwards reply-to context to resolver", async () => { + let capturedOnMessage: + | ((msg: import("./inbound.js").WebInboundMessage) => Promise) + | undefined; + const listenerFactory = async (opts: { + onMessage: ( + msg: import("./inbound.js").WebInboundMessage, + ) => Promise; + }) => { + capturedOnMessage = opts.onMessage; + return { close: vi.fn() }; + }; + + const resolver = vi.fn().mockResolvedValue({ text: "reply" }); + + await monitorWebProvider(false, listenerFactory, false, resolver); + expect(capturedOnMessage).toBeDefined(); + + await capturedOnMessage?.({ + body: "hello", + from: "+1555", + to: "+2666", + id: "msg1", + replyToId: "q1", + replyToBody: "original", + replyToSender: "+1999", + sendComposing: vi.fn(), + reply: vi.fn(), + sendMedia: vi.fn(), + }); + + const callArg = resolver.mock.calls[0]?.[0] as { + ReplyToId?: string; + ReplyToBody?: string; + ReplyToSender?: string; + }; + expect(callArg.ReplyToId).toBe("q1"); + expect(callArg.ReplyToBody).toBe("original"); + expect(callArg.ReplyToSender).toBe("+1999"); + }); + it("applies responsePrefix to regular replies", async () => { setLoadConfigMock(() => ({ inbound: { diff --git a/src/web/auto-reply.ts b/src/web/auto-reply.ts index cfb7200df..029292a8f 100644 --- a/src/web/auto-reply.ts +++ b/src/web/auto-reply.ts @@ -1107,6 +1107,9 @@ export async function monitorWebProvider( From: msg.from, To: msg.to, MessageSid: msg.id, + ReplyToId: msg.replyToId, + ReplyToBody: msg.replyToBody, + 
ReplyToSender: msg.replyToSender, MediaPath: msg.mediaPath, MediaUrl: msg.mediaUrl, MediaType: msg.mediaType, diff --git a/src/web/inbound.ts b/src/web/inbound.ts index 488a8bc04..9d95d26db 100644 --- a/src/web/inbound.ts +++ b/src/web/inbound.ts @@ -39,6 +39,9 @@ export type WebInboundMessage = { senderJid?: string; senderE164?: string; senderName?: string; + replyToId?: string; + replyToBody?: string; + replyToSender?: string; groupSubject?: string; groupParticipants?: string[]; mentionedJids?: string[]; @@ -187,6 +190,9 @@ export async function monitorWebInbox(options: { body = extractMediaPlaceholder(msg.message ?? undefined); if (!body) continue; } + const replyContext = describeReplyContext( + msg.message as proto.IMessage | undefined, + ); let mediaPath: string | undefined; let mediaType: string | undefined; try { @@ -211,10 +217,10 @@ export async function monitorWebInbox(options: { } }; const reply = async (text: string) => { - await sock.sendMessage(chatJid, { text }); + await sock.sendMessage(chatJid, { text }, { quoted: msg }); }; const sendMedia = async (payload: AnyMessageContent) => { - await sock.sendMessage(chatJid, payload); + await sock.sendMessage(chatJid, payload, { quoted: msg }); }; const timestamp = msg.messageTimestamp ? Number(msg.messageTimestamp) * 1000 @@ -249,6 +255,9 @@ export async function monitorWebInbox(options: { senderJid: participantJid, senderE164: senderE164 ?? undefined, senderName, + replyToId: replyContext?.id, + replyToBody: replyContext?.body, + replyToSender: replyContext?.sender, groupSubject, groupParticipants, mentionedJids: mentionedJids ?? undefined, @@ -443,6 +452,36 @@ export function extractMediaPlaceholder( return undefined; } +function describeReplyContext(rawMessage: proto.IMessage | undefined): { + id?: string; + body: string; + sender: string; +} | null { + const message = unwrapMessage(rawMessage); + if (!message) return null; + const contextInfo = + message.extendedTextMessage?.contextInfo ?? 
+ message.imageMessage?.contextInfo ?? + message.videoMessage?.contextInfo ?? + message.documentMessage?.contextInfo ?? + message.audioMessage?.contextInfo ?? + message.stickerMessage?.contextInfo ?? + message.buttonsResponseMessage?.contextInfo ?? + message.listResponseMessage?.contextInfo; + const quoted = contextInfo?.quotedMessage as proto.IMessage | undefined; + if (!quoted) return null; + const body = extractText(quoted) ?? extractMediaPlaceholder(quoted); + if (!body) return null; + const senderJid = contextInfo?.participant ?? undefined; + const senderE164 = senderJid ? jidToE164(senderJid) ?? senderJid : undefined; + const sender = senderE164 ?? "unknown sender"; + return { + id: contextInfo?.stanzaId ? String(contextInfo.stanzaId) : undefined, + body, + sender, + }; +} + async function downloadInboundMedia( msg: proto.IWebMessageInfo, sock: Awaited>, diff --git a/src/web/monitor-inbox.test.ts b/src/web/monitor-inbox.test.ts index 130fd6e6f..f8ce48d2c 100644 --- a/src/web/monitor-inbox.test.ts +++ b/src/web/monitor-inbox.test.ts @@ -107,9 +107,11 @@ describe("web monitor inbox", () => { "composing", "999@s.whatsapp.net", ); - expect(sock.sendMessage).toHaveBeenCalledWith("999@s.whatsapp.net", { - text: "pong", - }); + expect(sock.sendMessage).toHaveBeenCalledWith( + "999@s.whatsapp.net", + { text: "pong" }, + { quoted: expect.objectContaining({ key: { id: "abc" } }) }, + ); await listener.close(); }); @@ -151,6 +153,53 @@ describe("web monitor inbox", () => { await listener.close(); }); + it("captures reply context from quoted messages", async () => { + const onMessage = vi.fn(async (msg) => { + await msg.reply("pong"); + }); + + const listener = await monitorWebInbox({ verbose: false, onMessage }); + const sock = await createWaSocket(); + const upsert = { + type: "notify", + messages: [ + { + key: { id: "abc", fromMe: false, remoteJid: "999@s.whatsapp.net" }, + message: { + extendedTextMessage: { + text: "reply", + contextInfo: { + stanzaId: "q1", + 
participant: "111@s.whatsapp.net", + quotedMessage: { conversation: "original" }, + }, + }, + }, + messageTimestamp: 1_700_000_000, + pushName: "Tester", + }, + ], + }; + + sock.ev.emit("messages.upsert", upsert); + await new Promise((resolve) => setImmediate(resolve)); + + expect(onMessage).toHaveBeenCalledWith( + expect.objectContaining({ + replyToId: "q1", + replyToBody: "original", + replyToSender: "+111", + }), + ); + expect(sock.sendMessage).toHaveBeenCalledWith( + "999@s.whatsapp.net", + { text: "pong" }, + { quoted: expect.objectContaining({ key: { id: "abc" } }) }, + ); + + await listener.close(); + }); + it("captures media path for image messages", async () => { const onMessage = vi.fn(); const listener = await monitorWebInbox({ verbose: false, onMessage });