fix: allow media-only sends

2026-01-16 03:15:07 +00:00
parent f449115ec5
commit a0d2a7232e
15 changed files with 200 additions and 9 deletions
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -20,6 +20,7 @@
 - Tools: normalize Slack/Discord message timestamps with `timestampMs`/`timestampUtc` while keeping raw provider fields.
 - Docs: add Date & Time guide and update prompt/timezone configuration docs.
 - Messages: debounce rapid inbound messages across channels with per-connector overrides. (#971) — thanks @juanpablodlc.
+- Messages: allow media-only sends (CLI/tool) and show Telegram voice recording status for voice notes. (#957) — thanks @rdev.
 - Auth/Status: keep auth profiles sticky per session (rotate on compaction/new), surface provider usage headers in `/status` and `clawdbot models status`, and update docs.
 - Fix: guard model fallback against undefined provider/model values. (#954) — thanks @roshanasingh4.
 - Fix: refactor session store updates, add chat.inject, and harden subagent cleanup flow. (#944) — thanks @tyler6204.
--- a/docs/cli/message.md
+++ b/docs/cli/message.md
@@ -44,7 +44,7 @@ Target formats (`--to`):

 - `send`
  - Channels: WhatsApp/Telegram/Discord/Slack/Signal/iMessage/MS Teams
-  - Required: `--to`, `--message`
+  - Required: `--to`, plus at least one of `--message` or `--media`
  - Optional: `--media`, `--reply-to`, `--thread-id`, `--gif-playback`
  - Telegram only: `--buttons` (requires `"inlineButtons"` in `channels.telegram.capabilities` or `channels.telegram.accounts.<id>.capabilities`)
  - Telegram only: `--thread-id` (forum topic id)
--- a/src/agents/tools/telegram-actions.test.ts
+++ b/src/agents/tools/telegram-actions.test.ts
@@ -221,6 +221,43 @@ describe("handleTelegramAction", () => {
    );
  });

+  it("allows media-only messages without content", async () => {
+    const cfg = {
+      channels: { telegram: { botToken: "tok" } },
+    } as ClawdbotConfig;
+    await handleTelegramAction(
+      {
+        action: "sendMessage",
+        to: "123456",
+        mediaUrl: "https://example.com/note.ogg",
+      },
+      cfg,
+    );
+    expect(sendMessageTelegram).toHaveBeenCalledWith(
+      "123456",
+      "",
+      expect.objectContaining({
+        token: "tok",
+        mediaUrl: "https://example.com/note.ogg",
+      }),
+    );
+  });
+
+  it("requires content when no mediaUrl is provided", async () => {
+    const cfg = {
+      channels: { telegram: { botToken: "tok" } },
+    } as ClawdbotConfig;
+    await expect(
+      handleTelegramAction(
+        {
+          action: "sendMessage",
+          to: "123456",
+        },
+        cfg,
+      ),
+    ).rejects.toThrow(/content required/i);
+  });
+
  it("respects sendMessage gating", async () => {
    const cfg = {
      channels: {
--- a/src/agents/tools/telegram-actions.ts
+++ b/src/agents/tools/telegram-actions.ts
@@ -130,8 +130,13 @@ export async function handleTelegramAction(
      throw new Error("Telegram sendMessage is disabled.");
    }
    const to = readStringParam(params, "to", { required: true });
-    const content = readStringParam(params, "content", { required: true });
    const mediaUrl = readStringParam(params, "mediaUrl");
+    // Allow content to be omitted when sending media-only (e.g., voice notes)
+    const content =
+      readStringParam(params, "content", {
+        required: !mediaUrl,
+        allowEmpty: true,
+      }) ?? "";
    const buttons = readTelegramButtons(params);
    if (buttons && !hasInlineButtonsCapability({ cfg, accountId: accountId ?? undefined })) {
      throw new Error(
--- a/src/auto-reply/reply.triggers.trigger-handling.filters-usage-summary-current-model-provider.test.ts
+++ b/src/auto-reply/reply.triggers.trigger-handling.filters-usage-summary-current-model-provider.test.ts
@@ -19,6 +19,7 @@ const usageMocks = vi.hoisted(() => ({
    providers: [],
  }),
  formatUsageSummaryLine: vi.fn().mockReturnValue("📊 Usage: Claude 80% left"),
+  formatUsageWindowSummary: vi.fn().mockReturnValue("Claude 80% left"),
  resolveUsageProviderId: vi.fn((provider: string) => provider.split("/")[0]),
 }));

@@ -97,6 +98,16 @@ describe("trigger handling", () => {
  it("filters usage summary to the current model provider", async () => {
    await withTempHome(async (home) => {
      usageMocks.loadProviderUsageSummary.mockClear();
+      usageMocks.loadProviderUsageSummary.mockResolvedValue({
+        updatedAt: 0,
+        providers: [
+          {
+            provider: "anthropic",
+            displayName: "Anthropic",
+            windows: [],
+          },
+        ],
+      });

      const res = await getReplyFromConfig(
        {
--- a/src/auto-reply/reply/reply-payloads.ts
+++ b/src/auto-reply/reply/reply-payloads.ts
@@ -42,7 +42,10 @@ export function applyReplyTagsToPayload(

 export function isRenderablePayload(payload: ReplyPayload): boolean {
  return Boolean(
-    payload.text || payload.mediaUrl || (payload.mediaUrls && payload.mediaUrls.length > 0),
+    payload.text ||
+      payload.mediaUrl ||
+      (payload.mediaUrls && payload.mediaUrls.length > 0) ||
+      payload.audioAsVoice,
  );
 }

--- a/src/cli/program/message/register.send.ts
+++ b/src/cli/program/message/register.send.ts
@@ -9,7 +9,7 @@ export function registerMessageSendCommand(message: Command, helpers: MessageCli
          message
            .command("send")
            .description("Send a message")
-            .requiredOption("-m, --message <text>", "Message body"),
+            .option("-m, --message <text>", "Message body (required unless --media is set)"),
        )
        .option(
          "--media <path-or-url>",
--- a/src/gateway/server/tests/test-utils.ts
+++ b/src/gateway/server/tests/test-utils.ts
@@ -4,8 +4,8 @@ export const createTestRegistry = (overrides: Partial<PluginRegistry> = {}): Plu
  const base: PluginRegistry = {
    plugins: [],
    tools: [],
-    providers: [],
    channels: [],
+    providers: [],
    gatewayHandlers: {},
    httpHandlers: [],
    cliRegistrars: [],
--- a/src/infra/outbound/message-action-runner.test.ts
+++ b/src/infra/outbound/message-action-runner.test.ts
@@ -37,6 +37,37 @@ describe("runMessageAction context isolation", () => {
    expect(result.kind).toBe("send");
  });

+  it("allows media-only send when target matches current channel", async () => {
+    const result = await runMessageAction({
+      cfg: slackConfig,
+      action: "send",
+      params: {
+        channel: "slack",
+        to: "#C123",
+        media: "https://example.com/note.ogg",
+      },
+      toolContext: { currentChannelId: "C123" },
+      dryRun: true,
+    });
+
+    expect(result.kind).toBe("send");
+  });
+
+  it("requires message when no media hint is provided", async () => {
+    await expect(
+      runMessageAction({
+        cfg: slackConfig,
+        action: "send",
+        params: {
+          channel: "slack",
+          to: "#C123",
+        },
+        toolContext: { currentChannelId: "C123" },
+        dryRun: true,
+      }),
+    ).rejects.toThrow(/message required/i);
+  });
+
  it("blocks send when target differs from current channel", async () => {
    await expect(
      runMessageAction({
--- a/src/infra/outbound/message-action-runner.ts
+++ b/src/infra/outbound/message-action-runner.ts
@@ -208,10 +208,12 @@ export async function runMessageAction(

  if (action === "send") {
    const to = readStringParam(params, "to", { required: true });
+    // Allow message to be omitted when sending media-only (e.g., voice notes)
+    const mediaHint = readStringParam(params, "media", { trim: false });
    let message = readStringParam(params, "message", {
-      required: true,
+      required: !mediaHint, // Only require message if no media hint
      allowEmpty: true,
-    });
+    }) ?? "";

    const parsed = parseReplyDirectives(message);
    message = parsed.text;
--- a/src/plugins/loader.ts
+++ b/src/plugins/loader.ts
@@ -189,8 +189,8 @@ function createPluginRecord(params: {
    enabled: params.enabled,
    status: params.enabled ? "loaded" : "disabled",
    toolNames: [],
-    providerIds: [],
    channelIds: [],
+    providerIds: [],
    gatewayMethods: [],
    cliCommands: [],
    services: [],
--- a/src/telegram/bot-message-context.ts
+++ b/src/telegram/bot-message-context.ts
@@ -98,6 +98,18 @@ export const buildTelegramMessageContext = async ({
    }
  };

+  const sendRecordVoice = async () => { // best-effort "record_voice" chat action for this chat
+    try {
+      await bot.api.sendChatAction(
+        chatId,
+        "record_voice",
+        buildTypingThreadParams(resolvedThreadId),
+      );
+    } catch (err) {
+      logVerbose(`telegram record_voice cue failed for chat ${chatId}: ${String(err)}`); // logged only, never rethrown
+    }
+  };
+
  // DM access control (secure defaults): "pairing" (default) / "allowlist" / "open" / "disabled"
  if (!isGroup) {
    if (dmPolicy === "disabled") return null;
@@ -408,6 +420,7 @@ export const buildTelegramMessageContext = async ({
    route,
    skillFilter,
    sendTyping,
+    sendRecordVoice,
    ackReactionPromise,
    reactionApi,
    removeAckAfterReply,
--- a/src/telegram/bot-message-dispatch.ts
+++ b/src/telegram/bot-message-dispatch.ts
@@ -37,6 +37,7 @@ export const dispatchTelegramMessage = async ({
    route,
    skillFilter,
    sendTyping,
+    sendRecordVoice,
    ackReactionPromise,
    reactionApi,
    removeAckAfterReply,
@@ -144,6 +145,7 @@ export const dispatchTelegramMessage = async ({
          replyToMode,
          textLimit,
          messageThreadId: resolvedThreadId,
+          onVoiceRecording: sendRecordVoice,
        });
        didSendReply = true;
      },
--- a/src/telegram/bot/delivery.test.ts
+++ b/src/telegram/bot/delivery.test.ts
@@ -0,0 +1,77 @@
+import { beforeEach, describe, expect, it, vi } from "vitest";
+
+import type { Bot } from "grammy";
+
+import { deliverReplies } from "./delivery.js";
+
+const loadWebMedia = vi.fn();
+
+vi.mock("../../web/media.js", () => ({
+  loadWebMedia: (...args: unknown[]) => loadWebMedia(...args),
+}));
+
+vi.mock("grammy", () => ({
+  InputFile: class {
+    constructor(
+      public buffer: Buffer,
+      public fileName?: string,
+    ) {}
+  },
+}));
+
+describe("deliverReplies", () => {
+  beforeEach(() => {
+    loadWebMedia.mockReset();
+  });
+
+  it("skips audioAsVoice-only payloads without logging an error", async () => {
+    const runtime = { error: vi.fn() };
+    const bot = { api: {} } as unknown as Bot;
+
+    await deliverReplies({
+      replies: [{ audioAsVoice: true }],
+      chatId: "123",
+      token: "tok",
+      runtime,
+      bot,
+      replyToMode: "off",
+      textLimit: 4000,
+    });
+
+    expect(runtime.error).not.toHaveBeenCalled();
+  });
+
+  it("invokes onVoiceRecording before sending a voice note", async () => {
+    const events: string[] = [];
+    const runtime = { error: vi.fn() };
+    const sendVoice = vi.fn(async () => {
+      events.push("sendVoice");
+      return { message_id: 1, chat: { id: "123" } };
+    });
+    const bot = { api: { sendVoice } } as unknown as Bot;
+    const onVoiceRecording = vi.fn(async () => {
+      events.push("recordVoice");
+    });
+
+    loadWebMedia.mockResolvedValueOnce({
+      buffer: Buffer.from("voice"),
+      contentType: "audio/ogg",
+      fileName: "note.ogg",
+    });
+
+    await deliverReplies({
+      replies: [{ mediaUrl: "https://example.com/note.ogg", audioAsVoice: true }],
+      chatId: "123",
+      token: "tok",
+      runtime,
+      bot,
+      replyToMode: "off",
+      textLimit: 4000,
+      onVoiceRecording,
+    });
+
+    expect(onVoiceRecording).toHaveBeenCalledTimes(1);
+    expect(sendVoice).toHaveBeenCalledTimes(1);
+    expect(events).toEqual(["recordVoice", "sendVoice"]);
+  });
+});
--- a/src/telegram/bot/delivery.ts
+++ b/src/telegram/bot/delivery.ts
@@ -25,12 +25,19 @@ export async function deliverReplies(params: {
  replyToMode: ReplyToMode;
  textLimit: number;
  messageThreadId?: number;
+  /** Awaited right before each voice note is sent, e.g. to show a record_voice chat action. */
+  onVoiceRecording?: () => Promise<void> | void;
 }) {
  const { replies, chatId, runtime, bot, replyToMode, textLimit, messageThreadId } = params;
  const threadParams = buildTelegramThreadParams(messageThreadId);
  let hasReplied = false;
  for (const reply of replies) {
-    if (!reply?.text && !reply?.mediaUrl && !(reply?.mediaUrls?.length ?? 0)) {
+    const hasMedia = Boolean(reply?.mediaUrl) || (reply?.mediaUrls?.length ?? 0) > 0;
+    if (!reply?.text && !hasMedia) {
+      if (reply?.audioAsVoice) {
+        logVerbose("telegram reply has audioAsVoice without media/text; skipping");
+        continue;
+      }
      runtime.error?.(danger("reply missing text/media"));
      continue;
    }
@@ -99,6 +106,8 @@ export async function deliverReplies(params: {
        });
        if (useVoice) {
          // Voice message - displays as round playable bubble (opt-in via [[audio_as_voice]])
+          // Let the caller's optional hook show record_voice before the voice note is sent.
+          await params.onVoiceRecording?.();
          await bot.api.sendVoice(chatId, file, {
            ...mediaParams,
          });