TTS: gate auto audio on inbound voice notes (#1667)

Co-authored-by: Sebastian <sebslight@gmail.com>
This commit is contained in:
Seb Slight
2026-01-24 23:35:20 -05:00
committed by GitHub
parent ede5145191
commit d4f60bf16a
20 changed files with 433 additions and 63 deletions

View File

@@ -6,19 +6,20 @@ import {
getTtsMaxLength,
getTtsProvider,
isSummarizationEnabled,
isTtsEnabled,
isTtsProviderConfigured,
normalizeTtsAutoMode,
resolveTtsAutoMode,
resolveTtsApiKey,
resolveTtsConfig,
resolveTtsPrefsPath,
resolveTtsProviderOrder,
setLastTtsAttempt,
setSummarizationEnabled,
setTtsEnabled,
setTtsMaxLength,
setTtsProvider,
textToSpeech,
} from "../../tts/tts.js";
import { updateSessionStore } from "../../config/sessions.js";
type ParsedTtsCommand = {
action: string;
@@ -39,9 +40,9 @@ function ttsUsage(): ReplyPayload {
// Keep usage in one place so help/validation stays consistent.
return {
text:
"⚙️ Usage: /tts <on|off|status|provider|limit|summary|audio> [value]" +
"⚙️ Usage: /tts <off|always|inbound|tagged|status|provider|limit|summary|audio> [value]" +
"\nExamples:\n" +
"/tts on\n" +
"/tts always\n" +
"/tts provider openai\n" +
"/tts provider edge\n" +
"/tts limit 2000\n" +
@@ -71,14 +72,30 @@ export const handleTtsCommands: CommandHandler = async (params, allowTextCommand
return { shouldContinue: false, reply: ttsUsage() };
}
if (action === "on") {
setTtsEnabled(prefsPath, true);
return { shouldContinue: false, reply: { text: "🔊 TTS enabled." } };
}
if (action === "off") {
setTtsEnabled(prefsPath, false);
return { shouldContinue: false, reply: { text: "🔇 TTS disabled." } };
const requestedAuto = normalizeTtsAutoMode(
action === "on" ? "always" : action === "off" ? "off" : action,
);
if (requestedAuto) {
const entry = params.sessionEntry;
const sessionKey = params.sessionKey;
const store = params.sessionStore;
if (entry && store && sessionKey) {
entry.ttsAuto = requestedAuto;
entry.updatedAt = Date.now();
store[sessionKey] = entry;
if (params.storePath) {
await updateSessionStore(params.storePath, (store) => {
store[sessionKey] = entry;
});
}
}
const label = requestedAuto === "always" ? "enabled (always)" : requestedAuto;
return {
shouldContinue: false,
reply: {
text: requestedAuto === "off" ? "🔇 TTS disabled." : `🔊 TTS ${label}.`,
},
};
}
if (action === "audio") {
@@ -212,7 +229,9 @@ export const handleTtsCommands: CommandHandler = async (params, allowTextCommand
}
if (action === "status") {
const enabled = isTtsEnabled(config, prefsPath);
const sessionAuto = params.sessionEntry?.ttsAuto;
const autoMode = resolveTtsAutoMode({ config, prefsPath, sessionAuto });
const enabled = autoMode !== "off";
const provider = getTtsProvider(config, prefsPath);
const hasKey = isTtsProviderConfigured(config, provider);
const providerStatus =
@@ -226,9 +245,10 @@ export const handleTtsCommands: CommandHandler = async (params, allowTextCommand
const maxLength = getTtsMaxLength(prefsPath);
const summarize = isSummarizationEnabled(prefsPath);
const last = getLastTtsAttempt();
const autoLabel = sessionAuto ? `${autoMode} (session)` : autoMode;
const lines = [
"📊 TTS status",
`State: ${enabled ? "✅ enabled" : "❌ disabled"}`,
`Auto: ${enabled ? autoLabel : "off"}`,
`Provider: ${provider} (${providerStatus})`,
`Text limit: ${maxLength} chars`,
`Auto-summary: ${summarize ? "on" : "off"}`,

View File

@@ -1,4 +1,6 @@
import type { ClawdbotConfig } from "../../config/config.js";
import { resolveSessionAgentId } from "../../agents/agent-scope.js";
import { loadSessionStore, resolveStorePath } from "../../config/sessions.js";
import { logVerbose } from "../../globals.js";
import { isDiagnosticsEnabled } from "../../infra/diagnostic-events.js";
import {
@@ -14,7 +16,55 @@ import { formatAbortReplyText, tryFastAbortFromMessage } from "./abort.js";
import { shouldSkipDuplicateInbound } from "./inbound-dedupe.js";
import type { ReplyDispatcher, ReplyDispatchKind } from "./reply-dispatcher.js";
import { isRoutableChannel, routeReply } from "./route-reply.js";
import { maybeApplyTtsToPayload } from "../../tts/tts.js";
import { maybeApplyTtsToPayload, normalizeTtsAutoMode } from "../../tts/tts.js";
/** Matches a body that consists solely of `<media:audio>`, optionally followed by one parenthesised note. */
const AUDIO_PLACEHOLDER_RE = /^<media:audio>(\s*\([^)]*\))?$/i;
/** Matches a body that starts with an `[Audio` header token (case-insensitive). */
const AUDIO_HEADER_RE = /^\[Audio\b/i;

/**
 * Reduces a media type to its bare, lower-cased form by dropping everything
 * from the first `;` onward (e.g. `"Audio/OGG; codecs=opus"` -> `"audio/ogg"`).
 */
const normalizeMediaType = (value: string): string =>
  // `split` always yields at least one element; the `?? ""` keeps the declared
  // `string` return type sound under `noUncheckedIndexedAccess`.
  (value.split(";")[0] ?? "").trim().toLowerCase();

/**
 * Reports whether the inbound message context looks like an audio message.
 *
 * Two signals are checked, in order:
 *  1. any declared media type (`MediaType` / `MediaTypes`) that is `audio` or `audio/*`;
 *  2. otherwise, the first string-valued body field (`BodyForCommands`, `CommandBody`,
 *     `RawBody`, `Body`) matching the audio placeholder or the `[Audio` header pattern.
 *
 * @param ctx Finalized inbound message context.
 * @returns `true` when either signal indicates audio, `false` otherwise.
 */
const isInboundAudioContext = (ctx: FinalizedMsgContext): boolean => {
  const declaredTypes: unknown[] = [
    ctx.MediaType,
    ...(Array.isArray(ctx.MediaTypes) ? ctx.MediaTypes : []),
  ];
  const hasAudioType = declaredTypes
    // Real runtime guard instead of `filter(Boolean) as string[]`: a non-string
    // entry is skipped rather than crashing inside `normalizeMediaType`.
    .filter((type): type is string => typeof type === "string" && type.length > 0)
    .map((type) => normalizeMediaType(type))
    .some((type) => type === "audio" || type.startsWith("audio/"));
  if (hasAudioType) return true;

  // The first field that is a string wins, even when it is empty; later fields
  // are not consulted in that case.
  const bodyCandidates: unknown[] = [ctx.BodyForCommands, ctx.CommandBody, ctx.RawBody, ctx.Body];
  const body =
    bodyCandidates.find((candidate): candidate is string => typeof candidate === "string") ?? "";

  const trimmed = body.trim();
  if (!trimmed) return false;
  return AUDIO_PLACEHOLDER_RE.test(trimmed) || AUDIO_HEADER_RE.test(trimmed);
};
/**
 * Best-effort lookup of the `ttsAuto` value stored on the session entry for
 * this message.
 *
 * The session key is `ctx.CommandTargetSessionKey` when the command source is
 * `"native"` and that key is non-blank; otherwise `ctx.SessionKey`. The entry
 * is looked up under the lower-cased key first, then under the key as given.
 *
 * @param ctx Finalized inbound message context.
 * @param cfg Loaded configuration, used to resolve the agent id and store path.
 * @returns The stored `ttsAuto` passed through `normalizeTtsAutoMode`, or
 *          `undefined` when no session key is available or loading the store throws.
 */
const resolveSessionTtsAuto = (
  ctx: FinalizedMsgContext,
  cfg: ClawdbotConfig,
): string | undefined => {
  const targetSessionKey =
    ctx.CommandSource === "native" ? ctx.CommandTargetSessionKey?.trim() : undefined;
  // `||` rather than `??`: a target key that trims to "" must still fall back
  // to the message's own session key instead of aborting the lookup.
  const sessionKey = (targetSessionKey || ctx.SessionKey)?.trim();
  if (!sessionKey) return undefined;

  const agentId = resolveSessionAgentId({ sessionKey, config: cfg });
  const storePath = resolveStorePath(cfg.session?.store, { agentId });
  try {
    const store = loadSessionStore(storePath);
    const entry = store[sessionKey.toLowerCase()] ?? store[sessionKey];
    return normalizeTtsAutoMode(entry?.ttsAuto);
  } catch {
    // Deliberately best-effort: a failure while reading the store is treated
    // as "no session override".
    return undefined;
  }
};
export type DispatchFromConfigResult = {
queuedFinal: boolean;
@@ -81,6 +131,8 @@ export async function dispatchReplyFromConfig(params: {
return { queuedFinal: false, counts: dispatcher.getQueuedCounts() };
}
const inboundAudio = isInboundAudioContext(ctx);
const sessionTtsAuto = resolveSessionTtsAuto(ctx, cfg);
const hookRunner = getGlobalHookRunner();
if (hookRunner?.hasHooks("message_received")) {
const timestamp =
@@ -223,6 +275,8 @@ export async function dispatchReplyFromConfig(params: {
cfg,
channel: ttsChannel,
kind: "tool",
inboundAudio,
ttsAuto: sessionTtsAuto,
});
if (shouldRouteToOriginating) {
await sendPayloadAsync(ttsPayload);
@@ -239,6 +293,8 @@ export async function dispatchReplyFromConfig(params: {
cfg,
channel: ttsChannel,
kind: "block",
inboundAudio,
ttsAuto: sessionTtsAuto,
});
if (shouldRouteToOriginating) {
await sendPayloadAsync(ttsPayload, context?.abortSignal);
@@ -262,6 +318,8 @@ export async function dispatchReplyFromConfig(params: {
cfg,
channel: ttsChannel,
kind: "final",
inboundAudio,
ttsAuto: sessionTtsAuto,
});
if (shouldRouteToOriginating && originatingChannel && originatingTo) {
// Route final reply to originating channel.

View File

@@ -5,6 +5,7 @@ import path from "node:path";
import { CURRENT_SESSION_VERSION, SessionManager } from "@mariozechner/pi-coding-agent";
import { resolveSessionAgentId } from "../../agents/agent-scope.js";
import type { ClawdbotConfig } from "../../config/config.js";
import type { TtsAutoMode } from "../../config/types.tts.js";
import {
DEFAULT_RESET_TRIGGERS,
deriveSessionMetaPatch,
@@ -128,6 +129,7 @@ export async function initSessionState(params: {
let persistedThinking: string | undefined;
let persistedVerbose: string | undefined;
let persistedReasoning: string | undefined;
let persistedTtsAuto: TtsAutoMode | undefined;
let persistedModelOverride: string | undefined;
let persistedProviderOverride: string | undefined;
@@ -220,6 +222,7 @@ export async function initSessionState(params: {
persistedThinking = entry.thinkingLevel;
persistedVerbose = entry.verboseLevel;
persistedReasoning = entry.reasoningLevel;
persistedTtsAuto = entry.ttsAuto;
persistedModelOverride = entry.modelOverride;
persistedProviderOverride = entry.providerOverride;
} else {
@@ -258,6 +261,7 @@ export async function initSessionState(params: {
thinkingLevel: persistedThinking ?? baseEntry?.thinkingLevel,
verboseLevel: persistedVerbose ?? baseEntry?.verboseLevel,
reasoningLevel: persistedReasoning ?? baseEntry?.reasoningLevel,
ttsAuto: persistedTtsAuto ?? baseEntry?.ttsAuto,
responseUsage: baseEntry?.responseUsage,
modelOverride: persistedModelOverride ?? baseEntry?.modelOverride,
providerOverride: persistedProviderOverride ?? baseEntry?.providerOverride,

View File

@@ -17,7 +17,7 @@ import {
getTtsMaxLength,
getTtsProvider,
isSummarizationEnabled,
isTtsEnabled,
resolveTtsAutoMode,
resolveTtsConfig,
resolveTtsPrefsPath,
} from "../tts/tts.js";
@@ -252,15 +252,23 @@ const formatMediaUnderstandingLine = (decisions?: MediaUnderstandingDecision[])
return `📎 Media: ${parts.join(" · ")}`;
};
/**
 * Builds the one-line voice (TTS) summary shown in the status message.
 *
 * Returns `null` when no config is supplied or when TTS resolves to off;
 * otherwise returns a line listing the provider, text limit and summary setting.
 */
const formatVoiceModeLine = (config?: ClawdbotConfig): string | null => {
const formatVoiceModeLine = (
config?: ClawdbotConfig,
sessionEntry?: SessionEntry,
): string | null => {
if (!config) return null;
const ttsConfig = resolveTtsConfig(config);
const prefsPath = resolveTtsPrefsPath(ttsConfig);
if (!isTtsEnabled(ttsConfig, prefsPath)) return null;
// The session entry's `ttsAuto` (if any) is handed to the resolver as `sessionAuto`.
const autoMode = resolveTtsAutoMode({
config: ttsConfig,
prefsPath,
sessionAuto: sessionEntry?.ttsAuto,
});
if (autoMode === "off") return null;
const provider = getTtsProvider(ttsConfig, prefsPath);
const maxLength = getTtsMaxLength(prefsPath);
const summarize = isSummarizationEnabled(prefsPath) ? "on" : "off";
return `🔊 Voice: on · provider=${provider} · limit=${maxLength} · summary=${summarize}`;
return `🔊 Voice: ${autoMode} · provider=${provider} · limit=${maxLength} · summary=${summarize}`;
};
export function buildStatusMessage(args: StatusArgs): string {
@@ -398,7 +406,7 @@ export function buildStatusMessage(args: StatusArgs): string {
const usageCostLine =
usagePair && costLine ? `${usagePair} · ${costLine}` : (usagePair ?? costLine);
const mediaLine = formatMediaUnderstandingLine(args.mediaDecisions);
const voiceLine = formatVoiceModeLine(args.config);
const voiceLine = formatVoiceModeLine(args.config, args.sessionEntry);
return [
versionLine,