From 6550e7d5626183404bfc491d1932f1a5b266a43d Mon Sep 17 00:00:00 2001
From: Peter Steinberger <steipete@gmail.com>
Date: Tue, 23 Dec 2025 02:26:11 +0100
Subject: [PATCH] fix: add whatsapp reply context

---
 CHANGELOG.md                  |  5 ++++
 docs/surface.md               |  1 +
 docs/telegram.md              |  4 +--
 src/web/auto-reply.test.ts    | 41 ++++++++++++++++++++++++++
 src/web/auto-reply.ts         |  3 ++
 src/web/inbound.ts            | 43 +++++++++++++++++++++++++--
 src/web/monitor-inbox.test.ts | 55 +++++++++++++++++++++++++++++++++--
 7 files changed, 145 insertions(+), 7 deletions(-)
diff --git a/CHANGELOG.md b/CHANGELOG.md
index c97226db8..d848133fd 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,5 +1,10 @@
 # Changelog
 
+## Unreleased — 2025-12-23
+
+### Fixes
+- Telegram/WhatsApp: native replies now target the original inbound message; reply context is captured in `ReplyTo*` fields for templates. (Thanks @joshp123 for the PR and follow-up question.)
+
 ## 2.0.0-beta2 — 2025-12-21
 
 Second beta focused on bundled gateway packaging, skills management, onboarding polish, and provider reliability.
diff --git a/docs/surface.md b/docs/surface.md
index 80ac85985..86107a79a 100644
--- a/docs/surface.md
+++ b/docs/surface.md
@@ -10,6 +10,7 @@ Updated: 2025-12-07
 Goal: make replies deterministic per channel while keeping one shared context for direct chats.
 
 - **Surfaces** (channel labels): `whatsapp`, `webchat`, `telegram`, `voice`, etc. Add `Surface` to inbound `MsgContext` so templates/agents can log which channel a turn came from. Routing is fixed: replies go back to the origin surface; the model doesn’t choose.
+- **Reply context (optional):** inbound replies may include `ReplyToId`, `ReplyToBody`, and `ReplyToSender` so templates can surface the quoted context when needed.
 - **Canonical direct session:** All direct chats collapse into the single `main` session by default (no config needed). Groups stay `group:<jid>`, so they remain isolated.
 - **Session store:** Keys are resolved via `resolveSessionKey(scope, ctx, mainKey)`; the agent JSONL path lives under `~/.clawdis/sessions/<SessionId>.jsonl`.
 - **WebChat:** Always attaches to `main`, loads the full session transcript so desktop reflects cross-surface history, and writes new turns back to the same session.
diff --git a/docs/telegram.md b/docs/telegram.md
index 94ad4bc68..03c25dad2 100644
--- a/docs/telegram.md
+++ b/docs/telegram.md
@@ -30,11 +30,11 @@ Status: ready for bot-mode use with grammY (long-polling by default; webhook sup
 - Sees only messages sent after it’s added to a chat; no pre-history access.
 - Cannot DM users first; they must initiate. Channels are receive-only unless the bot is an admin poster.
 - File size caps follow Telegram Bot API (up to 2 GB for documents; smaller for some media types).
-- Typing indicators (`sendChatAction`) supported; inline reply/threading supported where Telegram allows.
+- Typing indicators (`sendChatAction`) supported; outbound replies are sent as native replies to the triggering message (threaded where Telegram allows).
 
 ## Planned implementation details
 - Library: grammY is the only client for send + gateway (fetch fallback removed); grammY throttler is enabled by default to stay under Bot API limits.
-- Inbound normalization: maps Bot API updates to `MsgContext` with `Surface: "telegram"`, `ChatType: direct|group`, `SenderName`, `MediaPath`/`MediaType` when attachments arrive, and `Timestamp`; groups require @bot mention by default.
+- Inbound normalization: maps Bot API updates to `MsgContext` with `Surface: "telegram"`, `ChatType: direct|group`, `SenderName`, `MediaPath`/`MediaType` when attachments arrive, `Timestamp`, and reply-to metadata (`ReplyToId`, `ReplyToBody`, `ReplyToSender`) when the user replies; groups require @bot mention by default.
 - Outbound: text and media (photo/video/audio/document) with optional caption; chunked to limits. Typing cue sent best-effort.
 - Config: `TELEGRAM_BOT_TOKEN` env or `telegram.botToken` required; `telegram.requireMention`, `telegram.allowFrom`, `telegram.mediaMaxMb`, `telegram.proxy`, `telegram.webhookSecret`, `telegram.webhookUrl`, `telegram.webhookPath` supported.
 
diff --git a/src/web/auto-reply.test.ts b/src/web/auto-reply.test.ts
index 18277331e..31bb65ce6 100644
--- a/src/web/auto-reply.test.ts
+++ b/src/web/auto-reply.test.ts
@@ -1751,6 +1751,47 @@ describe("web auto-reply", () => {
     expect(callArg?.Body).toContain("hello");
   });
 
+  it("forwards reply-to context to resolver", async () => { // replyTo* on the inbound message must reach the resolver as ReplyTo*
+    let capturedOnMessage:
+      | ((msg: import("./inbound.js").WebInboundMessage) => Promise<void>)
+      | undefined;
+    const listenerFactory = async (opts: { // stub listener: records the handler it is given instead of opening a socket
+      onMessage: (
+        msg: import("./inbound.js").WebInboundMessage,
+      ) => Promise<void>;
+    }) => {
+      capturedOnMessage = opts.onMessage;
+      return { close: vi.fn() };
+    };
+    // Stub resolver; the first argument of its first call is inspected at the end of the test.
+    const resolver = vi.fn().mockResolvedValue({ text: "reply" });
+    // Start the provider with the stub factory so the onMessage handler gets captured.
+    await monitorWebProvider(false, listenerFactory, false, resolver);
+    expect(capturedOnMessage).toBeDefined();
+    // Feed the captured handler an inbound message that carries reply-to fields.
+    await capturedOnMessage?.({
+      body: "hello",
+      from: "+1555",
+      to: "+2666",
+      id: "msg1",
+      replyToId: "q1",
+      replyToBody: "original",
+      replyToSender: "+1999",
+      sendComposing: vi.fn(),
+      reply: vi.fn(),
+      sendMedia: vi.fn(),
+    });
+    // Only the reply-to fields of the resolver's context argument are typed and asserted here.
+    const callArg = resolver.mock.calls[0]?.[0] as {
+      ReplyToId?: string;
+      ReplyToBody?: string;
+      ReplyToSender?: string;
+    };
+    expect(callArg.ReplyToId).toBe("q1");
+    expect(callArg.ReplyToBody).toBe("original");
+    expect(callArg.ReplyToSender).toBe("+1999");
+  });
+
   it("applies responsePrefix to regular replies", async () => {
     setLoadConfigMock(() => ({
       inbound: {
diff --git a/src/web/auto-reply.ts b/src/web/auto-reply.ts
index cfb7200df..029292a8f 100644
--- a/src/web/auto-reply.ts
+++ b/src/web/auto-reply.ts
@@ -1107,6 +1107,9 @@ export async function monitorWebProvider(
           From: msg.from,
           To: msg.to,
           MessageSid: msg.id,
+          ReplyToId: msg.replyToId,
+          ReplyToBody: msg.replyToBody,
+          ReplyToSender: msg.replyToSender,
           MediaPath: msg.mediaPath,
           MediaUrl: msg.mediaUrl,
           MediaType: msg.mediaType,
diff --git a/src/web/inbound.ts b/src/web/inbound.ts
index 488a8bc04..9d95d26db 100644
--- a/src/web/inbound.ts
+++ b/src/web/inbound.ts
@@ -39,6 +39,9 @@ export type WebInboundMessage = {
   senderJid?: string;
   senderE164?: string;
   senderName?: string;
+  replyToId?: string;
+  replyToBody?: string;
+  replyToSender?: string;
   groupSubject?: string;
   groupParticipants?: string[];
   mentionedJids?: string[];
@@ -187,6 +190,9 @@ export async function monitorWebInbox(options: {
         body = extractMediaPlaceholder(msg.message ?? undefined);
         if (!body) continue;
       }
+      const replyContext = describeReplyContext(
+        msg.message as proto.IMessage | undefined,
+      );
       let mediaPath: string | undefined;
       let mediaType: string | undefined;
       try {
@@ -211,10 +217,10 @@ export async function monitorWebInbox(options: {
         }
       };
       const reply = async (text: string) => {
-        await sock.sendMessage(chatJid, { text });
+        await sock.sendMessage(chatJid, { text }, { quoted: msg });
       };
       const sendMedia = async (payload: AnyMessageContent) => {
-        await sock.sendMessage(chatJid, payload);
+        await sock.sendMessage(chatJid, payload, { quoted: msg });
       };
       const timestamp = msg.messageTimestamp
         ? Number(msg.messageTimestamp) * 1000
@@ -249,6 +255,9 @@ export async function monitorWebInbox(options: {
             senderJid: participantJid,
             senderE164: senderE164 ?? undefined,
             senderName,
+            replyToId: replyContext?.id,
+            replyToBody: replyContext?.body,
+            replyToSender: replyContext?.sender,
             groupSubject,
             groupParticipants,
             mentionedJids: mentionedJids ?? undefined,
@@ -443,6 +452,36 @@ export function extractMediaPlaceholder(
   return undefined;
 }
 
+function describeReplyContext(rawMessage: proto.IMessage | undefined): { // describes the message quoted by an inbound reply
+  id?: string; // contextInfo.stanzaId as a string; undefined when it is missing or empty
+  body: string; // extractText(quoted) ?? extractMediaPlaceholder(quoted); never empty
+  sender: string; // jidToE164(participant), else the raw participant JID, else "unknown sender"
+} | null { // null when there is no message, no quoted message, or no usable body
+  const message = unwrapMessage(rawMessage); // NOTE(review): presumably peels wrapper message types — confirm against unwrapMessage
+  if (!message) return null;
+  const contextInfo = // first non-nullish contextInfo among the message kinds below, probed in this order
+    message.extendedTextMessage?.contextInfo ??
+    message.imageMessage?.contextInfo ??
+    message.videoMessage?.contextInfo ??
+    message.documentMessage?.contextInfo ??
+    message.audioMessage?.contextInfo ??
+    message.stickerMessage?.contextInfo ??
+    message.buttonsResponseMessage?.contextInfo ??
+    message.listResponseMessage?.contextInfo;
+  const quoted = contextInfo?.quotedMessage as proto.IMessage | undefined;
+  if (!quoted) return null; // no quoted message found on any of the kinds probed above
+  const body = extractText(quoted) ?? extractMediaPlaceholder(quoted);
+  if (!body) return null; // the id and sender are discarded too when the quoted message has no usable body
+  const senderJid = contextInfo?.participant ?? undefined; // normalises null to undefined
+  const senderE164 = senderJid ? jidToE164(senderJid) ?? senderJid : undefined; // keeps the raw JID when jidToE164 returns nullish
+  const sender = senderE164 ?? "unknown sender";
+  return {
+    id: contextInfo?.stanzaId ? String(contextInfo.stanzaId) : undefined,
+    body,
+    sender,
+  };
+}
+
 async function downloadInboundMedia(
   msg: proto.IWebMessageInfo,
   sock: Awaited<ReturnType<typeof createWaSocket>>,
diff --git a/src/web/monitor-inbox.test.ts b/src/web/monitor-inbox.test.ts
index 130fd6e6f..f8ce48d2c 100644
--- a/src/web/monitor-inbox.test.ts
+++ b/src/web/monitor-inbox.test.ts
@@ -107,9 +107,11 @@ describe("web monitor inbox", () => {
       "composing",
       "999@s.whatsapp.net",
     );
-    expect(sock.sendMessage).toHaveBeenCalledWith("999@s.whatsapp.net", {
-      text: "pong",
-    });
+    expect(sock.sendMessage).toHaveBeenCalledWith(
+      "999@s.whatsapp.net",
+      { text: "pong" },
+      { quoted: expect.objectContaining({ key: { id: "abc" } }) },
+    );
 
     await listener.close();
   });
@@ -151,6 +153,53 @@ describe("web monitor inbox", () => {
     await listener.close();
   });
 
+  it("captures reply context from quoted messages", async () => { // covers both reply-context capture and quoting on reply
+    const onMessage = vi.fn(async (msg) => {
+      await msg.reply("pong");
+    });
+    // NOTE(review): createWaSocket is presumably mocked to hand back the socket the listener uses — confirm in the test setup.
+    const listener = await monitorWebInbox({ verbose: false, onMessage });
+    const sock = await createWaSocket();
+    const upsert = {
+      type: "notify",
+      messages: [
+        {
+          key: { id: "abc", fromMe: false, remoteJid: "999@s.whatsapp.net" },
+          message: {
+            extendedTextMessage: {
+              text: "reply",
+              contextInfo: { // reply metadata: quoted stanza id, quoted author and quoted content
+                stanzaId: "q1",
+                participant: "111@s.whatsapp.net",
+                quotedMessage: { conversation: "original" },
+              },
+            },
+          },
+          messageTimestamp: 1_700_000_000,
+          pushName: "Tester",
+        },
+      ],
+    };
+    // Emit the upsert, then wait one setImmediate turn before asserting on the handler.
+    sock.ev.emit("messages.upsert", upsert);
+    await new Promise((resolve) => setImmediate(resolve));
+    // The quoted stanza id, text and participant (expected as "+111") must be on the inbound message.
+    expect(onMessage).toHaveBeenCalledWith(
+      expect.objectContaining({
+        replyToId: "q1",
+        replyToBody: "original",
+        replyToSender: "+111",
+      }),
+    );
+    expect(sock.sendMessage).toHaveBeenCalledWith( // the "pong" reply must quote the triggering message "abc"
+      "999@s.whatsapp.net",
+      { text: "pong" },
+      { quoted: expect.objectContaining({ key: { id: "abc" } }) },
+    );
+
+    await listener.close();
+  });
+
   it("captures media path for image messages", async () => {
     const onMessage = vi.fn();
     const listener = await monitorWebInbox({ verbose: false, onMessage });