diff --git a/CHANGELOG.md b/CHANGELOG.md index c69779cac..658392067 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -95,6 +95,7 @@ - Telegram: honor `/activation` session mode for group mention gating and clarify group activation docs. Thanks @julianengel for PR #377. - Telegram: isolate forum topic transcripts per thread and validate Gemini turn ordering in multi-topic sessions. Thanks @hsrvc for PR #407. - Telegram: render Telegram-safe HTML for outbound formatting and fall back to plain text on parse errors. Thanks @RandyVentures for PR #435. +- Telegram: add `[[audio_as_voice]]` tag to send audio as voice notes (audio files remain default); docs updated. Thanks @manmal for PR #188. - iMessage: ignore disconnect errors during shutdown (avoid unhandled promise rejections). Thanks @antons for PR #359. - Messages: stop defaulting ack reactions to 👀 when identity emoji is missing. - Auto-reply: require slash for control commands to avoid false triggers in normal text. diff --git a/docs/providers/telegram.md b/docs/providers/telegram.md index 77705b6c2..bcb59733b 100644 --- a/docs/providers/telegram.md +++ b/docs/providers/telegram.md @@ -153,6 +153,15 @@ Telegram supports optional threaded replies via tags: Controlled by `telegram.replyToMode`: - `first` (default), `all`, `off`. +## Audio messages (voice vs file) +Telegram distinguishes **voice notes** (round bubble) from **audio files** (metadata card). +Clawdbot defaults to audio files for backward compatibility. + +To force a voice note bubble in agent replies, include this tag anywhere in the reply: +- `[[audio_as_voice]]` — send audio as a voice note instead of a file. + +The tag is stripped from the delivered text. Other providers ignore this tag. + ## Streaming (drafts) Telegram can stream **draft bubbles** while the agent is generating a response. Clawdbot uses Bot API `sendMessageDraft` (not real messages) and then sends the diff --git a/src/auto-reply/reply/agent-runner.ts b/src/auto-reply/reply/agent-runner.ts index 97ff55243..fb711ccdf 100644 --- a/src/auto-reply/reply/agent-runner.ts +++ b/src/auto-reply/reply/agent-runner.ts @@ -23,6 +23,7 @@ import type { OriginatingChannelType, TemplateContext } from "../templating.js"; import { normalizeVerboseLevel, type VerboseLevel } from "../thinking.js"; import { SILENT_REPLY_TOKEN } from "../tokens.js"; import type { GetReplyOptions, ReplyPayload } from "../types.js"; +import { extractAudioTag } from "./audio-tags.js"; import { createFollowupRunner } from "./followup-runner.js"; import { enqueueFollowupRun, @@ -30,14 +31,12 @@ import { type QueueSettings, scheduleFollowupDrain, } from "./queue.js"; -import { extractAudioTag } from "./audio-tags.js"; import { applyReplyTagsToPayload, applyReplyThreading, filterMessagingToolDuplicates, isRenderablePayload, } from "./reply-payloads.js"; -import { extractReplyToTag } from "./reply-tags.js"; import { createReplyToModeFilter, resolveReplyToMode, @@ -341,6 +340,7 @@ export async function runReplyAgent(params: { const hasMedia = Boolean(taggedPayload.mediaUrl) || (taggedPayload.mediaUrls?.length ?? 0) > 0; + if (!cleaned && !hasMedia) return; if (cleaned?.trim() === SILENT_REPLY_TOKEN && !hasMedia) return; const blockPayload: ReplyPayload = applyReplyToMode({ diff --git a/src/auto-reply/reply/audio-tags.test.ts b/src/auto-reply/reply/audio-tags.test.ts new file mode 100644 index 000000000..4a1b8d16b --- /dev/null +++ b/src/auto-reply/reply/audio-tags.test.ts @@ -0,0 +1,25 @@ +import { describe, expect, it } from "vitest"; + +import { extractAudioTag } from "./audio-tags.js"; + +describe("extractAudioTag", () => { + it("detects audio_as_voice and strips the tag", () => { + const result = extractAudioTag("Hello [[audio_as_voice]] world"); + expect(result.audioAsVoice).toBe(true); + expect(result.hasTag).toBe(true); + expect(result.cleaned).toBe("Hello world"); + }); + + it("returns empty output for missing text", () => { + const result = extractAudioTag(undefined); + expect(result.audioAsVoice).toBe(false); + expect(result.hasTag).toBe(false); + expect(result.cleaned).toBe(""); + }); + + it("removes tag-only messages", () => { + const result = extractAudioTag("[[audio_as_voice]]"); + expect(result.audioAsVoice).toBe(true); + expect(result.cleaned).toBe(""); + }); +}); diff --git a/src/telegram/send.test.ts b/src/telegram/send.test.ts index 823d641eb..fc50cd669 100644 --- a/src/telegram/send.test.ts +++ b/src/telegram/send.test.ts @@ -158,6 +158,77 @@ describe("sendMessageTelegram", () => { expect(res.messageId).toBe("9"); }); + it("sends audio media as files by default", async () => { + const chatId = "123"; + const sendAudio = vi.fn().mockResolvedValue({ + message_id: 10, + chat: { id: chatId }, + }); + const sendVoice = vi.fn().mockResolvedValue({ + message_id: 11, + chat: { id: chatId }, + }); + const api = { sendAudio, sendVoice } as unknown as { + sendAudio: typeof sendAudio; + sendVoice: typeof sendVoice; + }; + + loadWebMedia.mockResolvedValueOnce({ + buffer: Buffer.from("audio"), + contentType: "audio/mpeg", + fileName: "clip.mp3", + }); + + await sendMessageTelegram(chatId, "caption", { + token: "tok", + api, + mediaUrl: "https://example.com/clip.mp3", + }); + + expect(sendAudio).toHaveBeenCalledWith(chatId, expect.anything(), { + caption: "caption", + }); + expect(sendVoice).not.toHaveBeenCalled(); + }); + + it("sends voice messages when asVoice is true and preserves thread params", async () => { + const chatId = "-1001234567890"; + const sendAudio = vi.fn().mockResolvedValue({ + message_id: 12, + chat: { id: chatId }, + }); + const sendVoice = vi.fn().mockResolvedValue({ + message_id: 13, + chat: { id: chatId }, + }); + const api = { sendAudio, sendVoice } as unknown as { + sendAudio: typeof sendAudio; + sendVoice: typeof sendVoice; + }; + + loadWebMedia.mockResolvedValueOnce({ + buffer: Buffer.from("voice"), + contentType: "audio/ogg", + fileName: "note.ogg", + }); + + await sendMessageTelegram(chatId, "voice note", { + token: "tok", + api, + mediaUrl: "https://example.com/note.ogg", + asVoice: true, + messageThreadId: 271, + replyToMessageId: 500, + }); + + expect(sendVoice).toHaveBeenCalledWith(chatId, expect.anything(), { + caption: "voice note", + message_thread_id: 271, + reply_to_message_id: 500, + }); + expect(sendAudio).not.toHaveBeenCalled(); + }); + it("includes message_thread_id for forum topic messages", async () => { const chatId = "-1001234567890"; const sendMessage = vi.fn().mockResolvedValue({ diff --git a/src/web/auto-reply.test.ts b/src/web/auto-reply.test.ts index 4cf7ec565..c5cf7544c 100644 --- a/src/web/auto-reply.test.ts +++ b/src/web/auto-reply.test.ts @@ -317,7 +317,7 @@ describe("partial reply gating", () => { undefined, {}, ); - expect(allowed).toEqual({ text: "ok" }); + expect(allowed).toMatchObject({ text: "ok", audioAsVoice: false }); expect(runEmbeddedPiAgent).toHaveBeenCalledOnce(); }); });