diff --git a/CHANGELOG.md b/CHANGELOG.md index 698220d11..bb5e2b9de 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -20,6 +20,7 @@ - Tools: normalize Slack/Discord message timestamps with `timestampMs`/`timestampUtc` while keeping raw provider fields. - Docs: add Date & Time guide and update prompt/timezone configuration docs. - Messages: debounce rapid inbound messages across channels with per-connector overrides. (#971) — thanks @juanpablodlc. +- Messages: allow media-only sends (CLI/tool) and show Telegram voice recording status for voice notes. (#957) — thanks @rdev. - Auth/Status: keep auth profiles sticky per session (rotate on compaction/new), surface provider usage headers in `/status` and `clawdbot models status`, and update docs. - Fix: guard model fallback against undefined provider/model values. (#954) — thanks @roshanasingh4. - Fix: refactor session store updates, add chat.inject, and harden subagent cleanup flow. (#944) — thanks @tyler6204. diff --git a/docs/cli/message.md b/docs/cli/message.md index ceea3b94d..f887602c8 100644 --- a/docs/cli/message.md +++ b/docs/cli/message.md @@ -44,7 +44,7 @@ Target formats (`--to`): - `send` - Channels: WhatsApp/Telegram/Discord/Slack/Signal/iMessage/MS Teams - - Required: `--to`, `--message` + - Required: `--to`, plus `--message` or `--media` - Optional: `--media`, `--reply-to`, `--thread-id`, `--gif-playback` - Telegram only: `--buttons` (requires `"inlineButtons"` in `channels.telegram.capabilities` or `channels.telegram.accounts..capabilities`) - Telegram only: `--thread-id` (forum topic id) diff --git a/src/agents/tools/telegram-actions.test.ts b/src/agents/tools/telegram-actions.test.ts index 8f4bb2a9b..1465b932f 100644 --- a/src/agents/tools/telegram-actions.test.ts +++ b/src/agents/tools/telegram-actions.test.ts @@ -221,6 +221,43 @@ describe("handleTelegramAction", () => { ); }); + it("allows media-only messages without content", async () => { + const cfg = { + channels: { telegram: { botToken: "tok" } }, + } as ClawdbotConfig; + await handleTelegramAction( + { + action: "sendMessage", + to: "123456", + mediaUrl: "https://example.com/note.ogg", + }, + cfg, + ); + expect(sendMessageTelegram).toHaveBeenCalledWith( + "123456", + "", + expect.objectContaining({ + token: "tok", + mediaUrl: "https://example.com/note.ogg", + }), + ); + }); + + it("requires content when no mediaUrl is provided", async () => { + const cfg = { + channels: { telegram: { botToken: "tok" } }, + } as ClawdbotConfig; + await expect( + handleTelegramAction( + { + action: "sendMessage", + to: "123456", + }, + cfg, + ), + ).rejects.toThrow(/content required/i); + }); + it("respects sendMessage gating", async () => { const cfg = { channels: { diff --git a/src/agents/tools/telegram-actions.ts b/src/agents/tools/telegram-actions.ts index 5ffc52515..47d617990 100644 --- a/src/agents/tools/telegram-actions.ts +++ b/src/agents/tools/telegram-actions.ts @@ -130,8 +130,13 @@ export async function handleTelegramAction( throw new Error("Telegram sendMessage is disabled."); } const to = readStringParam(params, "to", { required: true }); - const content = readStringParam(params, "content", { required: true }); const mediaUrl = readStringParam(params, "mediaUrl"); + // Allow content to be omitted when sending media-only (e.g., voice notes) + const content = + readStringParam(params, "content", { + required: !mediaUrl, + allowEmpty: true, + }) ?? ""; const buttons = readTelegramButtons(params); if (buttons && !hasInlineButtonsCapability({ cfg, accountId: accountId ?? undefined })) { throw new Error( diff --git a/src/auto-reply/reply.triggers.trigger-handling.filters-usage-summary-current-model-provider.test.ts b/src/auto-reply/reply.triggers.trigger-handling.filters-usage-summary-current-model-provider.test.ts index 73b5e5656..fd43f3a0c 100644 --- a/src/auto-reply/reply.triggers.trigger-handling.filters-usage-summary-current-model-provider.test.ts +++ b/src/auto-reply/reply.triggers.trigger-handling.filters-usage-summary-current-model-provider.test.ts @@ -19,6 +19,7 @@ const usageMocks = vi.hoisted(() => ({ providers: [], }), formatUsageSummaryLine: vi.fn().mockReturnValue("📊 Usage: Claude 80% left"), + formatUsageWindowSummary: vi.fn().mockReturnValue("Claude 80% left"), resolveUsageProviderId: vi.fn((provider: string) => provider.split("/")[0]), })); @@ -97,6 +98,16 @@ describe("trigger handling", () => { it("filters usage summary to the current model provider", async () => { await withTempHome(async (home) => { usageMocks.loadProviderUsageSummary.mockClear(); + usageMocks.loadProviderUsageSummary.mockResolvedValue({ + updatedAt: 0, + providers: [ + { + provider: "anthropic", + displayName: "Anthropic", + windows: [], + }, + ], + }); const res = await getReplyFromConfig( { diff --git a/src/auto-reply/reply/reply-payloads.ts b/src/auto-reply/reply/reply-payloads.ts index 94551fed6..f9b212377 100644 --- a/src/auto-reply/reply/reply-payloads.ts +++ b/src/auto-reply/reply/reply-payloads.ts @@ -42,7 +42,10 @@ export function applyReplyTagsToPayload( export function isRenderablePayload(payload: ReplyPayload): boolean { return Boolean( - payload.text || payload.mediaUrl || (payload.mediaUrls && payload.mediaUrls.length > 0), + payload.text || + payload.mediaUrl || + (payload.mediaUrls && payload.mediaUrls.length > 0) || + payload.audioAsVoice, ); } diff --git a/src/cli/program/message/register.send.ts b/src/cli/program/message/register.send.ts index 890da58b9..014c7395b 100644 --- a/src/cli/program/message/register.send.ts +++ b/src/cli/program/message/register.send.ts @@ -9,7 +9,7 @@ export function registerMessageSendCommand(message: Command, helpers: MessageCli message .command("send") .description("Send a message") - .requiredOption("-m, --message ", "Message body"), + .option("-m, --message ", "Message body (required unless --media is set)"), ) .option( "--media ", diff --git a/src/gateway/server/__tests__/test-utils.ts b/src/gateway/server/__tests__/test-utils.ts index 2cb403a05..5d8fac524 100644 --- a/src/gateway/server/__tests__/test-utils.ts +++ b/src/gateway/server/__tests__/test-utils.ts @@ -4,8 +4,8 @@ export const createTestRegistry = (overrides: Partial = {}): Plu const base: PluginRegistry = { plugins: [], tools: [], - providers: [], channels: [], + providers: [], gatewayHandlers: {}, httpHandlers: [], cliRegistrars: [], diff --git a/src/infra/outbound/message-action-runner.test.ts b/src/infra/outbound/message-action-runner.test.ts index c3b027107..4723b34aa 100644 --- a/src/infra/outbound/message-action-runner.test.ts +++ b/src/infra/outbound/message-action-runner.test.ts @@ -37,6 +37,37 @@ describe("runMessageAction context isolation", () => { expect(result.kind).toBe("send"); }); + it("allows media-only send when target matches current channel", async () => { + const result = await runMessageAction({ + cfg: slackConfig, + action: "send", + params: { + channel: "slack", + to: "#C123", + media: "https://example.com/note.ogg", + }, + toolContext: { currentChannelId: "C123" }, + dryRun: true, + }); + + expect(result.kind).toBe("send"); + }); + + it("requires message when no media hint is provided", async () => { + await expect( + runMessageAction({ + cfg: slackConfig, + action: "send", + params: { + channel: "slack", + to: "#C123", + }, + toolContext: { currentChannelId: "C123" }, + dryRun: true, + }), + ).rejects.toThrow(/message required/i); + }); + it("blocks send when target differs from current channel", async () => { await expect( runMessageAction({ diff --git a/src/infra/outbound/message-action-runner.ts b/src/infra/outbound/message-action-runner.ts index bb8066747..809b3fd30 100644 --- a/src/infra/outbound/message-action-runner.ts +++ b/src/infra/outbound/message-action-runner.ts @@ -208,10 +208,12 @@ export async function runMessageAction( if (action === "send") { const to = readStringParam(params, "to", { required: true }); + // Allow message to be omitted when sending media-only (e.g., voice notes) + const mediaHint = readStringParam(params, "media", { trim: false }); let message = readStringParam(params, "message", { - required: true, + required: !mediaHint, // Only require message if no media hint allowEmpty: true, - }); + }) ?? ""; const parsed = parseReplyDirectives(message); message = parsed.text; diff --git a/src/plugins/loader.ts b/src/plugins/loader.ts index 70e70a4f9..942406e86 100644 --- a/src/plugins/loader.ts +++ b/src/plugins/loader.ts @@ -189,8 +189,8 @@ function createPluginRecord(params: { enabled: params.enabled, status: params.enabled ? "loaded" : "disabled", toolNames: [], - providerIds: [], channelIds: [], + providerIds: [], gatewayMethods: [], cliCommands: [], services: [], diff --git a/src/telegram/bot-message-context.ts b/src/telegram/bot-message-context.ts index 8b0279e5c..ab9f21d58 100644 --- a/src/telegram/bot-message-context.ts +++ b/src/telegram/bot-message-context.ts @@ -98,6 +98,18 @@ export const buildTelegramMessageContext = async ({ } }; + const sendRecordVoice = async () => { + try { + await bot.api.sendChatAction( + chatId, + "record_voice", + buildTypingThreadParams(resolvedThreadId), + ); + } catch (err) { + logVerbose(`telegram record_voice cue failed for chat ${chatId}: ${String(err)}`); + } + }; + // DM access control (secure defaults): "pairing" (default) / "allowlist" / "open" / "disabled" if (!isGroup) { if (dmPolicy === "disabled") return null; @@ -408,6 +420,7 @@ export const buildTelegramMessageContext = async ({ route, skillFilter, sendTyping, + sendRecordVoice, ackReactionPromise, reactionApi, removeAckAfterReply, diff --git a/src/telegram/bot-message-dispatch.ts b/src/telegram/bot-message-dispatch.ts index c609a07c2..cbcf321e1 100644 --- a/src/telegram/bot-message-dispatch.ts +++ b/src/telegram/bot-message-dispatch.ts @@ -37,6 +37,7 @@ export const dispatchTelegramMessage = async ({ route, skillFilter, sendTyping, + sendRecordVoice, ackReactionPromise, reactionApi, removeAckAfterReply, @@ -144,6 +145,7 @@ export const dispatchTelegramMessage = async ({ replyToMode, textLimit, messageThreadId: resolvedThreadId, + onVoiceRecording: sendRecordVoice, }); didSendReply = true; }, diff --git a/src/telegram/bot/delivery.test.ts b/src/telegram/bot/delivery.test.ts new file mode 100644 index 000000000..65328af90 --- /dev/null +++ b/src/telegram/bot/delivery.test.ts @@ -0,0 +1,77 @@ +import { beforeEach, describe, expect, it, vi } from "vitest"; + +import type { Bot } from "grammy"; + +import { deliverReplies } from "./delivery.js"; + +const loadWebMedia = vi.fn(); + +vi.mock("../../web/media.js", () => ({ + loadWebMedia: (...args: unknown[]) => loadWebMedia(...args), +})); + +vi.mock("grammy", () => ({ + InputFile: class { + constructor( + public buffer: Buffer, + public fileName?: string, + ) {} + }, +})); + +describe("deliverReplies", () => { + beforeEach(() => { + loadWebMedia.mockReset(); + }); + + it("skips audioAsVoice-only payloads without logging an error", async () => { + const runtime = { error: vi.fn() }; + const bot = { api: {} } as unknown as Bot; + + await deliverReplies({ + replies: [{ audioAsVoice: true }], + chatId: "123", + token: "tok", + runtime, + bot, + replyToMode: "off", + textLimit: 4000, + }); + + expect(runtime.error).not.toHaveBeenCalled(); + }); + + it("invokes onVoiceRecording before sending a voice note", async () => { + const events: string[] = []; + const runtime = { error: vi.fn() }; + const sendVoice = vi.fn(async () => { + events.push("sendVoice"); + return { message_id: 1, chat: { id: "123" } }; + }); + const bot = { api: { sendVoice } } as unknown as Bot; + const onVoiceRecording = vi.fn(async () => { + events.push("recordVoice"); + }); + + loadWebMedia.mockResolvedValueOnce({ + buffer: Buffer.from("voice"), + contentType: "audio/ogg", + fileName: "note.ogg", + }); + + await deliverReplies({ + replies: [{ mediaUrl: "https://example.com/note.ogg", audioAsVoice: true }], + chatId: "123", + token: "tok", + runtime, + bot, + replyToMode: "off", + textLimit: 4000, + onVoiceRecording, + }); + + expect(onVoiceRecording).toHaveBeenCalledTimes(1); + expect(sendVoice).toHaveBeenCalledTimes(1); + expect(events).toEqual(["recordVoice", "sendVoice"]); + }); +}); diff --git a/src/telegram/bot/delivery.ts b/src/telegram/bot/delivery.ts index 0f01bd41c..bd787803f 100644 --- a/src/telegram/bot/delivery.ts +++ b/src/telegram/bot/delivery.ts @@ -25,12 +25,19 @@ export async function deliverReplies(params: { replyToMode: ReplyToMode; textLimit: number; messageThreadId?: number; + /** Callback invoked before sending a voice message to switch typing indicator. */ + onVoiceRecording?: () => Promise | void; }) { const { replies, chatId, runtime, bot, replyToMode, textLimit, messageThreadId } = params; const threadParams = buildTelegramThreadParams(messageThreadId); let hasReplied = false; for (const reply of replies) { - if (!reply?.text && !reply?.mediaUrl && !(reply?.mediaUrls?.length ?? 0)) { + const hasMedia = Boolean(reply?.mediaUrl) || (reply?.mediaUrls?.length ?? 0) > 0; + if (!reply?.text && !hasMedia) { + if (reply?.audioAsVoice) { + logVerbose("telegram reply has audioAsVoice without media/text; skipping"); + continue; + } runtime.error?.(danger("reply missing text/media")); continue; } @@ -99,6 +106,8 @@ export async function deliverReplies(params: { }); if (useVoice) { // Voice message - displays as round playable bubble (opt-in via [[audio_as_voice]]) + // Switch typing indicator to record_voice before sending. + await params.onVoiceRecording?.(); await bot.api.sendVoice(chatId, file, { ...mediaParams, });