diff --git a/CHANGELOG.md b/CHANGELOG.md index 239240014..e72e9b7ef 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -11,6 +11,7 @@ Docs: https://docs.clawd.bot ### Changes - TTS: add Edge TTS provider fallback, defaulting to keyless Edge with MP3 retry on format failures. (#1668) Thanks @steipete. https://docs.clawd.bot/tts - Web search: add Brave freshness filter parameter for time-scoped results. (#1688) Thanks @JonUleis. https://docs.clawd.bot/tools/web +- TTS: add auto mode enum (off/always/inbound/tagged) with per-session `/tts` override. (#1667) Thanks @sebslight. https://docs.clawd.bot/tts - Docs: expand FAQ (migration, scheduling, concurrency, model recommendations, OpenAI subscription auth, Pi sizing, hackable install, docs SSL workaround). - Docs: add verbose installer troubleshooting guidance. - Docs: update Fly.io guide notes. diff --git a/docs/gateway/configuration.md b/docs/gateway/configuration.md index 67701f946..12226e1f3 100644 --- a/docs/gateway/configuration.md +++ b/docs/gateway/configuration.md @@ -1509,7 +1509,7 @@ voice notes; other channels send MP3 audio. { messages: { tts: { - enabled: true, + auto: "always", // off | always | inbound | tagged mode: "final", // final | all (include tool/block replies) provider: "elevenlabs", summaryModel: "openai/gpt-4.1-mini", @@ -1546,8 +1546,10 @@ voice notes; other channels send MP3 audio. ``` Notes: -- `messages.tts.enabled` can be overridden by local user prefs (see `/tts on`, `/tts off`). -- `prefsPath` stores local overrides (enabled/provider/limit/summarize). +- `messages.tts.auto` controls auto‑TTS (`off`, `always`, `inbound`, `tagged`). +- `/tts off|always|inbound|tagged` sets the per‑session auto mode (overrides config). +- `messages.tts.enabled` is legacy; doctor migrates it to `messages.tts.auto`. +- `prefsPath` stores local overrides (provider/limit/summarize). - `maxTextLength` is a hard cap for TTS input; summaries are truncated to fit. - `summaryModel` overrides `agents.defaults.model.primary` for auto-summary. - Accepts `provider/model` or an alias from `agents.defaults.models`. diff --git a/docs/tools/slash-commands.md b/docs/tools/slash-commands.md index 1c45fe95b..84a087dba 100644 --- a/docs/tools/slash-commands.md +++ b/docs/tools/slash-commands.md @@ -68,7 +68,7 @@ Text + native (when enabled): - `/config show|get|set|unset` (persist config to disk, owner-only; requires `commands.config: true`) - `/debug show|set|unset|reset` (runtime overrides, owner-only; requires `commands.debug: true`) - `/usage off|tokens|full|cost` (per-response usage footer or local cost summary) -- `/tts on|off|status|provider|limit|summary|audio` (control TTS; see [/tts](/tts)) +- `/tts off|always|inbound|tagged|status|provider|limit|summary|audio` (control TTS; see [/tts](/tts)) - Discord: native command is `/voice` (Discord reserves `/tts`); text `/tts` still works. - `/stop` - `/restart` diff --git a/docs/tts.md b/docs/tts.md index 61da1f0dc..22dacd611 100644 --- a/docs/tts.md +++ b/docs/tts.md @@ -53,8 +53,8 @@ so that provider must also be authenticated if you enable summaries. ## Is it enabled by default? -No. TTS is **disabled** by default. Enable it in config or with `/tts on`, -which writes a local preference override. +No. Auto‑TTS is **off** by default. Enable it in config with +`messages.tts.auto` or per session with `/tts always` (alias: `/tts on`). Edge TTS **is** enabled by default once TTS is on, and is used automatically when no OpenAI or ElevenLabs API keys are available. @@ -70,7 +70,7 @@ Full schema is in [Gateway configuration](/gateway/configuration). { messages: { tts: { - enabled: true, + auto: "always", provider: "elevenlabs" } } @@ -83,7 +83,7 @@ Full schema is in [Gateway configuration](/gateway/configuration). { messages: { tts: { - enabled: true, + auto: "always", provider: "openai", summaryModel: "openai/gpt-4.1-mini", modelOverrides: { @@ -121,7 +121,7 @@ Full schema is in [Gateway configuration](/gateway/configuration). { messages: { tts: { - enabled: true, + auto: "always", provider: "edge", edge: { enabled: true, @@ -156,7 +156,7 @@ Full schema is in [Gateway configuration](/gateway/configuration). { messages: { tts: { - enabled: true, + auto: "always", maxTextLength: 4000, timeoutMs: 30000, prefsPath: "~/.clawdbot/settings/tts.json" @@ -165,13 +165,25 @@ Full schema is in [Gateway configuration](/gateway/configuration). } ``` +### Only reply with audio after an inbound voice note + +```json5 +{ + messages: { + tts: { + auto: "inbound" + } + } +} +``` + ### Disable auto-summary for long replies ```json5 { messages: { tts: { - enabled: true + auto: "always" } } } @@ -185,7 +197,10 @@ Then run: ### Notes on fields -- `enabled`: master toggle (default `false`; local prefs can override). +- `auto`: auto‑TTS mode (`off`, `always`, `inbound`, `tagged`). + - `inbound` only sends audio after an inbound voice note. + - `tagged` only sends audio when the reply includes `[[tts]]` tags. +- `enabled`: legacy toggle (doctor migrates this to `auto`). - `mode`: `"final"` (default) or `"all"` (includes tool/block replies). - `provider`: `"elevenlabs"`, `"openai"`, or `"edge"` (fallback is automatic). - If `provider` is **unset**, Clawdbot prefers `openai` (if key), then `elevenlabs` (if key), @@ -195,7 +210,7 @@ Then run: - `modelOverrides`: allow the model to emit TTS directives (on by default). - `maxTextLength`: hard cap for TTS input (chars). `/tts audio` fails if exceeded. - `timeoutMs`: request timeout (ms). -- `prefsPath`: override the local prefs JSON path. +- `prefsPath`: override the local prefs JSON path (provider/limit/summary). - `apiKey` values fall back to env vars (`ELEVENLABS_API_KEY`/`XI_API_KEY`, `OPENAI_API_KEY`). - `elevenlabs.baseUrl`: override ElevenLabs API base URL. - `elevenlabs.voiceSettings`: @@ -218,6 +233,7 @@ Then run: ## Model-driven overrides (default on) By default, the model **can** emit TTS directives for a single reply. +When `messages.tts.auto` is `tagged`, these directives are required to trigger audio. When enabled, the model can emit `[[tts:...]]` directives to override the voice for a single reply, plus an optional `[[tts:text]]...[[/tts:text]]` block to @@ -338,8 +354,10 @@ Discord note: `/tts` is a built-in Discord command, so Clawdbot registers `/voice` as the native command there. Text `/tts ...` still works. ``` -/tts on /tts off +/tts always +/tts inbound +/tts tagged /tts status /tts provider openai /tts limit 2000 @@ -350,6 +368,7 @@ Discord note: `/tts` is a built-in Discord command, so Clawdbot registers Notes: - Commands require an authorized sender (allowlist/owner rules still apply). - `commands.text` or native command registration must be enabled. +- `off|always|inbound|tagged` are per‑session toggles (`/tts on` is an alias for `/tts always`). - `limit` and `summary` are stored in local prefs, not the main config. - `/tts audio` generates a one-off audio reply (does not toggle TTS on). diff --git a/src/auto-reply/reply/commands-tts.ts b/src/auto-reply/reply/commands-tts.ts index 3e8c71288..5c65fb94c 100644 --- a/src/auto-reply/reply/commands-tts.ts +++ b/src/auto-reply/reply/commands-tts.ts @@ -6,19 +6,20 @@ import { getTtsMaxLength, getTtsProvider, isSummarizationEnabled, - isTtsEnabled, isTtsProviderConfigured, + normalizeTtsAutoMode, + resolveTtsAutoMode, resolveTtsApiKey, resolveTtsConfig, resolveTtsPrefsPath, resolveTtsProviderOrder, setLastTtsAttempt, setSummarizationEnabled, - setTtsEnabled, setTtsMaxLength, setTtsProvider, textToSpeech, } from "../../tts/tts.js"; +import { updateSessionStore } from "../../config/sessions.js"; type ParsedTtsCommand = { action: string; @@ -39,9 +40,9 @@ function ttsUsage(): ReplyPayload { // Keep usage in one place so help/validation stays consistent. return { text: - "⚙️ Usage: /tts [value]" + + "⚙️ Usage: /tts [value]" + "\nExamples:\n" + - "/tts on\n" + + "/tts always\n" + "/tts provider openai\n" + "/tts provider edge\n" + "/tts limit 2000\n" + @@ -71,14 +72,30 @@ export const handleTtsCommands: CommandHandler = async (params, allowTextCommand return { shouldContinue: false, reply: ttsUsage() }; } - if (action === "on") { - setTtsEnabled(prefsPath, true); - return { shouldContinue: false, reply: { text: "🔊 TTS enabled." } }; - } - - if (action === "off") { - setTtsEnabled(prefsPath, false); - return { shouldContinue: false, reply: { text: "🔇 TTS disabled." } }; + const requestedAuto = normalizeTtsAutoMode( + action === "on" ? "always" : action === "off" ? "off" : action, + ); + if (requestedAuto) { + const entry = params.sessionEntry; + const sessionKey = params.sessionKey; + const store = params.sessionStore; + if (entry && store && sessionKey) { + entry.ttsAuto = requestedAuto; + entry.updatedAt = Date.now(); + store[sessionKey] = entry; + if (params.storePath) { + await updateSessionStore(params.storePath, (store) => { + store[sessionKey] = entry; + }); + } + } + const label = requestedAuto === "always" ? "enabled (always)" : requestedAuto; + return { + shouldContinue: false, + reply: { + text: requestedAuto === "off" ? "🔇 TTS disabled." : `🔊 TTS ${label}.`, + }, + }; } if (action === "audio") { @@ -212,7 +229,9 @@ export const handleTtsCommands: CommandHandler = async (params, allowTextCommand } if (action === "status") { - const enabled = isTtsEnabled(config, prefsPath); + const sessionAuto = params.sessionEntry?.ttsAuto; + const autoMode = resolveTtsAutoMode({ config, prefsPath, sessionAuto }); + const enabled = autoMode !== "off"; const provider = getTtsProvider(config, prefsPath); const hasKey = isTtsProviderConfigured(config, provider); const providerStatus = @@ -226,9 +245,10 @@ export const handleTtsCommands: CommandHandler = async (params, allowTextCommand const maxLength = getTtsMaxLength(prefsPath); const summarize = isSummarizationEnabled(prefsPath); const last = getLastTtsAttempt(); + const autoLabel = sessionAuto ? `${autoMode} (session)` : autoMode; const lines = [ "📊 TTS status", - `State: ${enabled ? "✅ enabled" : "❌ disabled"}`, + `Auto: ${enabled ? autoLabel : "off"}`, `Provider: ${provider} (${providerStatus})`, `Text limit: ${maxLength} chars`, `Auto-summary: ${summarize ? "on" : "off"}`, diff --git a/src/auto-reply/reply/dispatch-from-config.ts b/src/auto-reply/reply/dispatch-from-config.ts index 5885d729e..16c83bf30 100644 --- a/src/auto-reply/reply/dispatch-from-config.ts +++ b/src/auto-reply/reply/dispatch-from-config.ts @@ -1,4 +1,6 @@ import type { ClawdbotConfig } from "../../config/config.js"; +import { resolveSessionAgentId } from "../../agents/agent-scope.js"; +import { loadSessionStore, resolveStorePath } from "../../config/sessions.js"; import { logVerbose } from "../../globals.js"; import { isDiagnosticsEnabled } from "../../infra/diagnostic-events.js"; import { @@ -14,7 +16,55 @@ import { formatAbortReplyText, tryFastAbortFromMessage } from "./abort.js"; import { shouldSkipDuplicateInbound } from "./inbound-dedupe.js"; import type { ReplyDispatcher, ReplyDispatchKind } from "./reply-dispatcher.js"; import { isRoutableChannel, routeReply } from "./route-reply.js"; -import { maybeApplyTtsToPayload } from "../../tts/tts.js"; +import { maybeApplyTtsToPayload, normalizeTtsAutoMode } from "../../tts/tts.js"; + +const AUDIO_PLACEHOLDER_RE = /^(\s*\([^)]*\))?$/i; +const AUDIO_HEADER_RE = /^\[Audio\b/i; + +const normalizeMediaType = (value: string): string => value.split(";")[0]?.trim().toLowerCase(); + +const isInboundAudioContext = (ctx: FinalizedMsgContext): boolean => { + const rawTypes = [ + typeof ctx.MediaType === "string" ? ctx.MediaType : undefined, + ...(Array.isArray(ctx.MediaTypes) ? ctx.MediaTypes : []), + ].filter(Boolean) as string[]; + const types = rawTypes.map((type) => normalizeMediaType(type)); + if (types.some((type) => type === "audio" || type.startsWith("audio/"))) return true; + + const body = + typeof ctx.BodyForCommands === "string" + ? ctx.BodyForCommands + : typeof ctx.CommandBody === "string" + ? ctx.CommandBody + : typeof ctx.RawBody === "string" + ? ctx.RawBody + : typeof ctx.Body === "string" + ? ctx.Body + : ""; + const trimmed = body.trim(); + if (!trimmed) return false; + if (AUDIO_PLACEHOLDER_RE.test(trimmed)) return true; + return AUDIO_HEADER_RE.test(trimmed); +}; + +const resolveSessionTtsAuto = ( + ctx: FinalizedMsgContext, + cfg: ClawdbotConfig, +): string | undefined => { + const targetSessionKey = + ctx.CommandSource === "native" ? ctx.CommandTargetSessionKey?.trim() : undefined; + const sessionKey = (targetSessionKey ?? ctx.SessionKey)?.trim(); + if (!sessionKey) return undefined; + const agentId = resolveSessionAgentId({ sessionKey, config: cfg }); + const storePath = resolveStorePath(cfg.session?.store, { agentId }); + try { + const store = loadSessionStore(storePath); + const entry = store[sessionKey.toLowerCase()] ?? store[sessionKey]; + return normalizeTtsAutoMode(entry?.ttsAuto); + } catch { + return undefined; + } +}; export type DispatchFromConfigResult = { queuedFinal: boolean; @@ -81,6 +131,8 @@ export async function dispatchReplyFromConfig(params: { return { queuedFinal: false, counts: dispatcher.getQueuedCounts() }; } + const inboundAudio = isInboundAudioContext(ctx); + const sessionTtsAuto = resolveSessionTtsAuto(ctx, cfg); const hookRunner = getGlobalHookRunner(); if (hookRunner?.hasHooks("message_received")) { const timestamp = @@ -223,6 +275,8 @@ export async function dispatchReplyFromConfig(params: { cfg, channel: ttsChannel, kind: "tool", + inboundAudio, + ttsAuto: sessionTtsAuto, }); if (shouldRouteToOriginating) { await sendPayloadAsync(ttsPayload); @@ -239,6 +293,8 @@ export async function dispatchReplyFromConfig(params: { cfg, channel: ttsChannel, kind: "block", + inboundAudio, + ttsAuto: sessionTtsAuto, }); if (shouldRouteToOriginating) { await sendPayloadAsync(ttsPayload, context?.abortSignal); @@ -262,6 +318,8 @@ export async function dispatchReplyFromConfig(params: { cfg, channel: ttsChannel, kind: "final", + inboundAudio, + ttsAuto: sessionTtsAuto, }); if (shouldRouteToOriginating && originatingChannel && originatingTo) { // Route final reply to originating channel. diff --git a/src/auto-reply/reply/session.ts b/src/auto-reply/reply/session.ts index da8ca8acf..45f37afdb 100644 --- a/src/auto-reply/reply/session.ts +++ b/src/auto-reply/reply/session.ts @@ -5,6 +5,7 @@ import path from "node:path"; import { CURRENT_SESSION_VERSION, SessionManager } from "@mariozechner/pi-coding-agent"; import { resolveSessionAgentId } from "../../agents/agent-scope.js"; import type { ClawdbotConfig } from "../../config/config.js"; +import type { TtsAutoMode } from "../../config/types.tts.js"; import { DEFAULT_RESET_TRIGGERS, deriveSessionMetaPatch, @@ -128,6 +129,7 @@ export async function initSessionState(params: { let persistedThinking: string | undefined; let persistedVerbose: string | undefined; let persistedReasoning: string | undefined; + let persistedTtsAuto: TtsAutoMode | undefined; let persistedModelOverride: string | undefined; let persistedProviderOverride: string | undefined; @@ -220,6 +222,7 @@ export async function initSessionState(params: { persistedThinking = entry.thinkingLevel; persistedVerbose = entry.verboseLevel; persistedReasoning = entry.reasoningLevel; + persistedTtsAuto = entry.ttsAuto; persistedModelOverride = entry.modelOverride; persistedProviderOverride = entry.providerOverride; } else { @@ -258,6 +261,7 @@ export async function initSessionState(params: { thinkingLevel: persistedThinking ?? baseEntry?.thinkingLevel, verboseLevel: persistedVerbose ?? baseEntry?.verboseLevel, reasoningLevel: persistedReasoning ?? baseEntry?.reasoningLevel, + ttsAuto: persistedTtsAuto ?? baseEntry?.ttsAuto, responseUsage: baseEntry?.responseUsage, modelOverride: persistedModelOverride ?? baseEntry?.modelOverride, providerOverride: persistedProviderOverride ?? baseEntry?.providerOverride, diff --git a/src/auto-reply/status.ts b/src/auto-reply/status.ts index 410e2f38c..b6f0f44db 100644 --- a/src/auto-reply/status.ts +++ b/src/auto-reply/status.ts @@ -17,7 +17,7 @@ import { getTtsMaxLength, getTtsProvider, isSummarizationEnabled, - isTtsEnabled, + resolveTtsAutoMode, resolveTtsConfig, resolveTtsPrefsPath, } from "../tts/tts.js"; @@ -252,15 +252,23 @@ const formatMediaUnderstandingLine = (decisions?: MediaUnderstandingDecision[]) return `📎 Media: ${parts.join(" · ")}`; }; -const formatVoiceModeLine = (config?: ClawdbotConfig): string | null => { +const formatVoiceModeLine = ( + config?: ClawdbotConfig, + sessionEntry?: SessionEntry, +): string | null => { if (!config) return null; const ttsConfig = resolveTtsConfig(config); const prefsPath = resolveTtsPrefsPath(ttsConfig); - if (!isTtsEnabled(ttsConfig, prefsPath)) return null; + const autoMode = resolveTtsAutoMode({ + config: ttsConfig, + prefsPath, + sessionAuto: sessionEntry?.ttsAuto, + }); + if (autoMode === "off") return null; const provider = getTtsProvider(ttsConfig, prefsPath); const maxLength = getTtsMaxLength(prefsPath); const summarize = isSummarizationEnabled(prefsPath) ? "on" : "off"; - return `🔊 Voice: on · provider=${provider} · limit=${maxLength} · summary=${summarize}`; + return `🔊 Voice: ${autoMode} · provider=${provider} · limit=${maxLength} · summary=${summarize}`; }; export function buildStatusMessage(args: StatusArgs): string { @@ -398,7 +406,7 @@ export function buildStatusMessage(args: StatusArgs): string { const usageCostLine = usagePair && costLine ? `${usagePair} · ${costLine}` : (usagePair ?? costLine); const mediaLine = formatMediaUnderstandingLine(args.mediaDecisions); - const voiceLine = formatVoiceModeLine(args.config); + const voiceLine = formatVoiceModeLine(args.config, args.sessionEntry); return [ versionLine, diff --git a/src/config/config.legacy-config-detection.accepts-imessage-dmpolicy.test.ts b/src/config/config.legacy-config-detection.accepts-imessage-dmpolicy.test.ts index 58bf62425..8abd285ee 100644 --- a/src/config/config.legacy-config-detection.accepts-imessage-dmpolicy.test.ts +++ b/src/config/config.legacy-config-detection.accepts-imessage-dmpolicy.test.ts @@ -138,6 +138,16 @@ describe("legacy config detection", () => { expect(res.config?.channels?.telegram?.groups?.["*"]?.requireMention).toBe(false); expect(res.config?.channels?.telegram?.requireMention).toBeUndefined(); }); + it("migrates messages.tts.enabled to messages.tts.auto", async () => { + vi.resetModules(); + const { migrateLegacyConfig } = await import("./config.js"); + const res = migrateLegacyConfig({ + messages: { tts: { enabled: true } }, + }); + expect(res.changes).toContain("Moved messages.tts.enabled → messages.tts.auto (always)."); + expect(res.config?.messages?.tts?.auto).toBe("always"); + expect(res.config?.messages?.tts?.enabled).toBeUndefined(); + }); it("migrates legacy model config to agent.models + model lists", async () => { vi.resetModules(); const { migrateLegacyConfig } = await import("./config.js"); diff --git a/src/config/legacy.migrations.part-3.ts b/src/config/legacy.migrations.part-3.ts index fc34b1768..9db9e3ede 100644 --- a/src/config/legacy.migrations.part-3.ts +++ b/src/config/legacy.migrations.part-3.ts @@ -40,6 +40,26 @@ export const LEGACY_CONFIG_MIGRATIONS_PART_3: LegacyConfigMigration[] = [ delete tools.bash; }, }, + { + id: "messages.tts.enabled->auto", + describe: "Move messages.tts.enabled to messages.tts.auto", + apply: (raw, changes) => { + const messages = getRecord(raw.messages); + const tts = getRecord(messages?.tts); + if (!tts) return; + if (tts.auto !== undefined) { + if ("enabled" in tts) { + delete tts.enabled; + changes.push("Removed messages.tts.enabled (messages.tts.auto already set)."); + } + return; + } + if (typeof tts.enabled !== "boolean") return; + tts.auto = tts.enabled ? "always" : "off"; + delete tts.enabled; + changes.push(`Moved messages.tts.enabled → messages.tts.auto (${String(tts.auto)}).`); + }, + }, { id: "agent.defaults-v2", describe: "Move agent config to agents.defaults and tools", diff --git a/src/config/legacy.rules.ts b/src/config/legacy.rules.ts index 1ec76bc79..4de788a69 100644 --- a/src/config/legacy.rules.ts +++ b/src/config/legacy.rules.ts @@ -120,6 +120,10 @@ export const LEGACY_CONFIG_RULES: LegacyConfigRule[] = [ message: "agent.imageModelFallbacks was replaced by agents.defaults.imageModel.fallbacks (auto-migrated on load).", }, + { + path: ["messages", "tts", "enabled"], + message: "messages.tts.enabled was replaced by messages.tts.auto (auto-migrated on load).", + }, { path: ["gateway", "token"], message: "gateway.token is ignored; use gateway.auth.token instead (auto-migrated on load).", diff --git a/src/config/sessions/types.ts b/src/config/sessions/types.ts index f7ed268ec..48ce428c1 100644 --- a/src/config/sessions/types.ts +++ b/src/config/sessions/types.ts @@ -4,6 +4,7 @@ import type { Skill } from "@mariozechner/pi-coding-agent"; import type { NormalizedChatType } from "../../channels/chat-type.js"; import type { ChannelId } from "../../channels/plugins/types.js"; import type { DeliveryContext } from "../../utils/delivery-context.js"; +import type { TtsAutoMode } from "../types.tts.js"; export type SessionScope = "per-sender" | "global"; @@ -42,6 +43,7 @@ export type SessionEntry = { verboseLevel?: string; reasoningLevel?: string; elevatedLevel?: string; + ttsAuto?: TtsAutoMode; execHost?: string; execSecurity?: string; execAsk?: string; diff --git a/src/config/types.tts.ts b/src/config/types.tts.ts index 28b65c96d..4eb4989b9 100644 --- a/src/config/types.tts.ts +++ b/src/config/types.tts.ts @@ -2,6 +2,8 @@ export type TtsProvider = "elevenlabs" | "openai" | "edge"; export type TtsMode = "final" | "all"; +export type TtsAutoMode = "off" | "always" | "inbound" | "tagged"; + export type TtsModelOverrideConfig = { /** Enable model-provided overrides for TTS. */ enabled?: boolean; @@ -22,7 +24,9 @@ export type TtsModelOverrideConfig = { }; export type TtsConfig = { - /** Enable auto-TTS (can be overridden by local prefs). */ + /** Auto-TTS mode (preferred). */ + auto?: TtsAutoMode; + /** Legacy: enable auto-TTS when `auto` is not set. */ enabled?: boolean; /** Apply TTS to final replies only or to all replies (tool/block/final). */ mode?: TtsMode; diff --git a/src/config/zod-schema.core.ts b/src/config/zod-schema.core.ts index bcf769b67..4a8c80bcc 100644 --- a/src/config/zod-schema.core.ts +++ b/src/config/zod-schema.core.ts @@ -158,8 +158,10 @@ export const MarkdownConfigSchema = z export const TtsProviderSchema = z.enum(["elevenlabs", "openai", "edge"]); export const TtsModeSchema = z.enum(["final", "all"]); +export const TtsAutoSchema = z.enum(["off", "always", "inbound", "tagged"]); export const TtsConfigSchema = z .object({ + auto: TtsAutoSchema.optional(), enabled: z.boolean().optional(), mode: TtsModeSchema.optional(), provider: TtsProviderSchema.optional(), diff --git a/src/discord/monitor/message-handler.process.ts b/src/discord/monitor/message-handler.process.ts index 0be4b6d84..6d502be21 100644 --- a/src/discord/monitor/message-handler.process.ts +++ b/src/discord/monitor/message-handler.process.ts @@ -136,9 +136,8 @@ export async function processDiscordMessage(ctx: DiscordMessagePreflightContext) const forumParentSlug = isForumParent && threadParentName ? normalizeDiscordSlug(threadParentName) : ""; const threadChannelId = threadChannel?.id; - const isForumStarter = Boolean( - threadChannelId && isForumParent && forumParentSlug && message.id === threadChannelId, - ); + const isForumStarter = + Boolean(threadChannelId && isForumParent && forumParentSlug) && message.id === threadChannelId; const forumContextLine = isForumStarter ? `[Forum parent: #${forumParentSlug}]` : null; const groupChannel = isGuildMessage && displayChannelSlug ? `#${displayChannelSlug}` : undefined; const groupSubject = isDirectMessage ? undefined : groupChannel; diff --git a/src/gateway/server-methods/tts.ts b/src/gateway/server-methods/tts.ts index e70fb112f..5e4e8254e 100644 --- a/src/gateway/server-methods/tts.ts +++ b/src/gateway/server-methods/tts.ts @@ -5,6 +5,7 @@ import { getTtsProvider, isTtsEnabled, isTtsProviderConfigured, + resolveTtsAutoMode, resolveTtsApiKey, resolveTtsConfig, resolveTtsPrefsPath, @@ -24,11 +25,13 @@ export const ttsHandlers: GatewayRequestHandlers = { const config = resolveTtsConfig(cfg); const prefsPath = resolveTtsPrefsPath(config); const provider = getTtsProvider(config, prefsPath); + const autoMode = resolveTtsAutoMode({ config, prefsPath }); const fallbackProviders = resolveTtsProviderOrder(provider) .slice(1) .filter((candidate) => isTtsProviderConfigured(config, candidate)); respond(true, { enabled: isTtsEnabled(config, prefsPath), + auto: autoMode, provider, fallbackProvider: fallbackProviders[0] ?? null, fallbackProviders, diff --git a/src/tts/tts.test.ts b/src/tts/tts.test.ts index fafe3bbdf..a8c9dce9c 100644 --- a/src/tts/tts.test.ts +++ b/src/tts/tts.test.ts @@ -4,7 +4,7 @@ import { completeSimple } from "@mariozechner/pi-ai"; import { getApiKeyForModel } from "../agents/model-auth.js"; import { resolveModel } from "../agents/pi-embedded-runner/model.js"; -import { _test, getTtsProvider, resolveTtsConfig } from "./tts.js"; +import * as tts from "./tts.js"; vi.mock("@mariozechner/pi-ai", () => ({ completeSimple: vi.fn(), @@ -37,6 +37,8 @@ vi.mock("../agents/model-auth.js", () => ({ requireApiKey: vi.fn((auth: { apiKey?: string }) => auth.apiKey ?? ""), })); +const { _test, resolveTtsConfig, maybeApplyTtsToPayload, getTtsProvider } = tts; + const { isValidVoiceId, isValidOpenAIVoice, @@ -431,4 +433,129 @@ describe("tts", () => { ); }); }); + + describe("maybeApplyTtsToPayload", () => { + const baseCfg = { + agents: { defaults: { model: { primary: "openai/gpt-4o-mini" } } }, + messages: { + tts: { + auto: "inbound", + provider: "openai", + openai: { apiKey: "test-key", model: "gpt-4o-mini-tts", voice: "alloy" }, + }, + }, + }; + + it("skips auto-TTS when inbound audio gating is on and the message is not audio", async () => { + const prevPrefs = process.env.CLAWDBOT_TTS_PREFS; + process.env.CLAWDBOT_TTS_PREFS = `/tmp/tts-test-${Date.now()}.json`; + const originalFetch = globalThis.fetch; + const fetchMock = vi.fn(async () => ({ + ok: true, + arrayBuffer: async () => new ArrayBuffer(1), + })); + globalThis.fetch = fetchMock as unknown as typeof fetch; + + const payload = { text: "Hello world" }; + const result = await maybeApplyTtsToPayload({ + payload, + cfg: baseCfg, + kind: "final", + inboundAudio: false, + }); + + expect(result).toBe(payload); + expect(fetchMock).not.toHaveBeenCalled(); + + globalThis.fetch = originalFetch; + process.env.CLAWDBOT_TTS_PREFS = prevPrefs; + }); + + it("attempts auto-TTS when inbound audio gating is on and the message is audio", async () => { + const prevPrefs = process.env.CLAWDBOT_TTS_PREFS; + process.env.CLAWDBOT_TTS_PREFS = `/tmp/tts-test-${Date.now()}.json`; + const originalFetch = globalThis.fetch; + const fetchMock = vi.fn(async () => ({ + ok: true, + arrayBuffer: async () => new ArrayBuffer(1), + })); + globalThis.fetch = fetchMock as unknown as typeof fetch; + + const result = await maybeApplyTtsToPayload({ + payload: { text: "Hello world" }, + cfg: baseCfg, + kind: "final", + inboundAudio: true, + }); + + expect(result.mediaUrl).toBeDefined(); + expect(fetchMock).toHaveBeenCalledTimes(1); + + globalThis.fetch = originalFetch; + process.env.CLAWDBOT_TTS_PREFS = prevPrefs; + }); + + it("skips auto-TTS in tagged mode unless a tts tag is present", async () => { + const prevPrefs = process.env.CLAWDBOT_TTS_PREFS; + process.env.CLAWDBOT_TTS_PREFS = `/tmp/tts-test-${Date.now()}.json`; + const originalFetch = globalThis.fetch; + const fetchMock = vi.fn(async () => ({ + ok: true, + arrayBuffer: async () => new ArrayBuffer(1), + })); + globalThis.fetch = fetchMock as unknown as typeof fetch; + + const cfg = { + ...baseCfg, + messages: { + ...baseCfg.messages, + tts: { ...baseCfg.messages.tts, auto: "tagged" }, + }, + }; + + const payload = { text: "Hello world" }; + const result = await maybeApplyTtsToPayload({ + payload, + cfg, + kind: "final", + }); + + expect(result).toBe(payload); + expect(fetchMock).not.toHaveBeenCalled(); + + globalThis.fetch = originalFetch; + process.env.CLAWDBOT_TTS_PREFS = prevPrefs; + }); + + it("runs auto-TTS in tagged mode when tags are present", async () => { + const prevPrefs = process.env.CLAWDBOT_TTS_PREFS; + process.env.CLAWDBOT_TTS_PREFS = `/tmp/tts-test-${Date.now()}.json`; + const originalFetch = globalThis.fetch; + const fetchMock = vi.fn(async () => ({ + ok: true, + arrayBuffer: async () => new ArrayBuffer(1), + })); + globalThis.fetch = fetchMock as unknown as typeof fetch; + + const cfg = { + ...baseCfg, + messages: { + ...baseCfg.messages, + tts: { ...baseCfg.messages.tts, auto: "tagged" }, + }, + }; + + const result = await maybeApplyTtsToPayload({ + payload: { text: "[[tts:text]]Hello world[[/tts:text]]" }, + cfg, + kind: "final", + }); + + expect(result.mediaUrl).toBeDefined(); + expect(fetchMock).toHaveBeenCalledTimes(1); + + globalThis.fetch = originalFetch; + process.env.CLAWDBOT_TTS_PREFS = prevPrefs; + }); + }); }); diff --git a/src/tts/tts.ts b/src/tts/tts.ts index cf2823f95..5fa06f8d4 100644 --- a/src/tts/tts.ts +++ b/src/tts/tts.ts @@ -20,6 +20,7 @@ import type { ChannelId } from "../channels/plugins/types.js"; import type { ClawdbotConfig } from "../config/config.js"; import type { TtsConfig, + TtsAutoMode, TtsMode, TtsProvider, TtsModelOverrideConfig, @@ -75,8 +76,10 @@ const DEFAULT_OUTPUT = { voiceCompatible: false, }; +const TTS_AUTO_MODES = new Set(["off", "always", "inbound", "tagged"]); + export type ResolvedTtsConfig = { - enabled: boolean; + auto: TtsAutoMode; mode: TtsMode; provider: TtsProvider; providerSource: "config" | "default"; @@ -123,6 +126,7 @@ export type ResolvedTtsConfig = { type TtsUserPrefs = { tts?: { + auto?: TtsAutoMode; enabled?: boolean; provider?: TtsProvider; maxLength?: number; @@ -161,6 +165,7 @@ type TtsDirectiveOverrides = { type TtsDirectiveParseResult = { cleanedText: string; ttsText?: string; + hasDirective: boolean; overrides: TtsDirectiveOverrides; warnings: string[]; }; @@ -187,6 +192,15 @@ type TtsStatusEntry = { let lastTtsAttempt: TtsStatusEntry | undefined; +export function normalizeTtsAutoMode(value: unknown): TtsAutoMode | undefined { + if (typeof value !== "string") return undefined; + const normalized = value.trim().toLowerCase(); + if (TTS_AUTO_MODES.has(normalized as TtsAutoMode)) { + return normalized as TtsAutoMode; + } + return undefined; +} + function resolveModelOverridePolicy( overrides: TtsModelOverrideConfig | undefined, ): ResolvedTtsModelOverrides { @@ -220,8 +234,9 @@ export function resolveTtsConfig(cfg: ClawdbotConfig): ResolvedTtsConfig { const raw: TtsConfig = cfg.messages?.tts ?? {}; const providerSource = raw.provider ? "config" : "default"; const edgeOutputFormat = raw.edge?.outputFormat?.trim(); + const auto = normalizeTtsAutoMode(raw.auto) ?? (raw.enabled ? "always" : "off"); return { - enabled: raw.enabled ?? false, + auto, mode: raw.mode ?? "final", provider: raw.provider ?? "edge", providerSource, @@ -279,17 +294,48 @@ export function resolveTtsPrefsPath(config: ResolvedTtsConfig): string { return path.join(CONFIG_DIR, "settings", "tts.json"); } +function resolveTtsAutoModeFromPrefs(prefs: TtsUserPrefs): TtsAutoMode | undefined { + const auto = normalizeTtsAutoMode(prefs.tts?.auto); + if (auto) return auto; + if (typeof prefs.tts?.enabled === "boolean") { + return prefs.tts.enabled ? "always" : "off"; + } + return undefined; +} + +export function resolveTtsAutoMode(params: { + config: ResolvedTtsConfig; + prefsPath: string; + sessionAuto?: string; +}): TtsAutoMode { + const sessionAuto = normalizeTtsAutoMode(params.sessionAuto); + if (sessionAuto) return sessionAuto; + const prefsAuto = resolveTtsAutoModeFromPrefs(readPrefs(params.prefsPath)); + if (prefsAuto) return prefsAuto; + return params.config.auto; +} + export function buildTtsSystemPromptHint(cfg: ClawdbotConfig): string | undefined { const config = resolveTtsConfig(cfg); const prefsPath = resolveTtsPrefsPath(config); - if (!isTtsEnabled(config, prefsPath)) return undefined; + const autoMode = resolveTtsAutoMode({ config, prefsPath }); + if (autoMode === "off") return undefined; const maxLength = getTtsMaxLength(prefsPath); const summarize = isSummarizationEnabled(prefsPath) ? "on" : "off"; + const autoHint = + autoMode === "inbound" + ? "Only use TTS when the user's last message includes audio/voice." + : autoMode === "tagged" + ? "Only use TTS when you include [[tts]] or [[tts:text]] tags." + : undefined; return [ "Voice (TTS) is enabled.", + autoHint, `Keep spoken text ≤${maxLength} chars to avoid auto-summary (summary ${summarize}).`, "Use [[tts:...]] and optional [[tts:text]]...[[/tts:text]] to control voice/expressiveness.", - ].join("\n"); + ] + .filter(Boolean) + .join("\n"); } function readPrefs(prefsPath: string): TtsUserPrefs { @@ -323,16 +369,25 @@ function updatePrefs(prefsPath: string, update: (prefs: TtsUserPrefs) => void): atomicWriteFileSync(prefsPath, JSON.stringify(prefs, null, 2)); } -export function isTtsEnabled(config: ResolvedTtsConfig, prefsPath: string): boolean { - const prefs = readPrefs(prefsPath); - if (prefs.tts?.enabled !== undefined) return prefs.tts.enabled === true; - return config.enabled; +export function isTtsEnabled( + config: ResolvedTtsConfig, + prefsPath: string, + sessionAuto?: string, +): boolean { + return resolveTtsAutoMode({ config, prefsPath, sessionAuto }) !== "off"; +} + +export function setTtsAutoMode(prefsPath: string, mode: TtsAutoMode): void { + updatePrefs(prefsPath, (prefs) => { + const next = { ...prefs.tts }; + delete next.enabled; + next.auto = mode; + prefs.tts = next; + }); } export function setTtsEnabled(prefsPath: string, enabled: boolean): void { - updatePrefs(prefsPath, (prefs) => { - prefs.tts = { ...prefs.tts, enabled }; - }); + setTtsAutoMode(prefsPath, enabled ? "always" : "off"); } export function getTtsProvider(config: ResolvedTtsConfig, prefsPath: string): TtsProvider { @@ -485,15 +540,17 @@ function parseTtsDirectives( policy: ResolvedTtsModelOverrides, ): TtsDirectiveParseResult { if (!policy.enabled) { - return { cleanedText: text, overrides: {}, warnings: [] }; + return { cleanedText: text, overrides: {}, warnings: [], hasDirective: false }; } const overrides: TtsDirectiveOverrides = {}; const warnings: string[] = []; let cleanedText = text; + let hasDirective = false; const blockRegex = /\[\[tts:text\]\]([\s\S]*?)\[\[\/tts:text\]\]/gi; cleanedText = cleanedText.replace(blockRegex, (_match, inner: string) => { + hasDirective = true; if (policy.allowText && overrides.ttsText == null) { overrides.ttsText = inner.trim(); } @@ -502,6 +559,7 @@ function parseTtsDirectives( const directiveRegex = /\[\[tts:([^\]]+)\]\]/gi; cleanedText = cleanedText.replace(directiveRegex, (_match, body: string) => { + hasDirective = true; const tokens = body.split(/\s+/).filter(Boolean); for (const token of tokens) { const eqIndex = token.indexOf("="); @@ -672,6 +730,7 @@ function parseTtsDirectives( return { cleanedText, ttsText: overrides.ttsText, + hasDirective, overrides, warnings, }; @@ -1156,13 +1215,17 @@ export async function maybeApplyTtsToPayload(params: { cfg: ClawdbotConfig; channel?: string; kind?: "tool" | "block" | "final"; + inboundAudio?: boolean; + ttsAuto?: string; }): Promise { const config = resolveTtsConfig(params.cfg); const prefsPath = resolveTtsPrefsPath(config); - if (!isTtsEnabled(config, prefsPath)) return params.payload; - - const mode = config.mode ?? "final"; - if (mode === "final" && params.kind && params.kind !== "final") return params.payload; + const autoMode = resolveTtsAutoMode({ + config, + prefsPath, + sessionAuto: params.ttsAuto, + }); + if (autoMode === "off") return params.payload; const text = params.payload.text ?? ""; const directives = parseTtsDirectives(text, config.modelOverrides); @@ -1183,6 +1246,12 @@ export async function maybeApplyTtsToPayload(params: { text: visibleText.length > 0 ? visibleText : undefined, }; + if (autoMode === "tagged" && !directives.hasDirective) return nextPayload; + if (autoMode === "inbound" && params.inboundAudio !== true) return nextPayload; + + const mode = config.mode ?? "final"; + if (mode === "final" && params.kind && params.kind !== "final") return nextPayload; + if (!ttsText.trim()) return nextPayload; if (params.payload.mediaUrl || (params.payload.mediaUrls?.length ?? 0) > 0) return nextPayload; if (text.includes("MEDIA:")) return nextPayload; @@ -1197,7 +1266,7 @@ export async function maybeApplyTtsToPayload(params: { logVerbose( `TTS: skipping long text (${textForAudio.length} > ${maxLength}), summarization disabled.`, ); - return params.payload; + return nextPayload; } try { @@ -1219,7 +1288,7 @@ export async function maybeApplyTtsToPayload(params: { } catch (err) { const error = err as Error; logVerbose(`TTS: summarization failed: ${error.message}`); - return params.payload; + return nextPayload; } } diff --git a/src/types/node-edge-tts.d.ts b/src/types/node-edge-tts.d.ts new file mode 100644 index 000000000..eaaaa9cdf --- /dev/null +++ b/src/types/node-edge-tts.d.ts @@ -0,0 +1,18 @@ +declare module "node-edge-tts" { + export type EdgeTTSOptions = { + voice?: string; + lang?: string; + outputFormat?: string; + saveSubtitles?: boolean; + proxy?: string; + rate?: string; + pitch?: string; + volume?: string; + timeout?: number; + }; + + export class EdgeTTS { + constructor(options?: EdgeTTSOptions); + ttsPromise(text: string, outputPath: string): Promise; + } +} diff --git a/src/web/inbound.media.test.ts b/src/web/inbound.media.test.ts index fcd53a68b..de23f10a9 100644 --- a/src/web/inbound.media.test.ts +++ b/src/web/inbound.media.test.ts @@ -127,9 +127,9 @@ describe("web inbound media saves with extension", () => { realSock.ev.emit("messages.upsert", upsert); // Allow a brief window for the async handler to fire on slower hosts. - for (let i = 0; i < 10; i++) { + for (let i = 0; i < 50; i++) { if (onMessage.mock.calls.length > 0) break; - await new Promise((resolve) => setTimeout(resolve, 5)); + await new Promise((resolve) => setTimeout(resolve, 10)); } expect(onMessage).toHaveBeenCalledTimes(1); @@ -178,9 +178,9 @@ describe("web inbound media saves with extension", () => { realSock.ev.emit("messages.upsert", upsert); - for (let i = 0; i < 10; i++) { + for (let i = 0; i < 50; i++) { if (onMessage.mock.calls.length > 0) break; - await new Promise((resolve) => setTimeout(resolve, 5)); + await new Promise((resolve) => setTimeout(resolve, 10)); } expect(onMessage).toHaveBeenCalledTimes(1); @@ -218,9 +218,9 @@ describe("web inbound media saves with extension", () => { realSock.ev.emit("messages.upsert", upsert); - for (let i = 0; i < 10; i++) { + for (let i = 0; i < 50; i++) { if (onMessage.mock.calls.length > 0) break; - await new Promise((resolve) => setTimeout(resolve, 5)); + await new Promise((resolve) => setTimeout(resolve, 10)); } expect(onMessage).toHaveBeenCalledTimes(1);