TTS: gate auto audio on inbound voice notes (#1667)
Co-authored-by: Sebastian <sebslight@gmail.com>
This commit is contained in:
@@ -6,19 +6,20 @@ import {
|
||||
getTtsMaxLength,
|
||||
getTtsProvider,
|
||||
isSummarizationEnabled,
|
||||
isTtsEnabled,
|
||||
isTtsProviderConfigured,
|
||||
normalizeTtsAutoMode,
|
||||
resolveTtsAutoMode,
|
||||
resolveTtsApiKey,
|
||||
resolveTtsConfig,
|
||||
resolveTtsPrefsPath,
|
||||
resolveTtsProviderOrder,
|
||||
setLastTtsAttempt,
|
||||
setSummarizationEnabled,
|
||||
setTtsEnabled,
|
||||
setTtsMaxLength,
|
||||
setTtsProvider,
|
||||
textToSpeech,
|
||||
} from "../../tts/tts.js";
|
||||
import { updateSessionStore } from "../../config/sessions.js";
|
||||
|
||||
type ParsedTtsCommand = {
|
||||
action: string;
|
||||
@@ -39,9 +40,9 @@ function ttsUsage(): ReplyPayload {
|
||||
// Keep usage in one place so help/validation stays consistent.
|
||||
return {
|
||||
text:
|
||||
"⚙️ Usage: /tts <on|off|status|provider|limit|summary|audio> [value]" +
|
||||
"⚙️ Usage: /tts <off|always|inbound|tagged|status|provider|limit|summary|audio> [value]" +
|
||||
"\nExamples:\n" +
|
||||
"/tts on\n" +
|
||||
"/tts always\n" +
|
||||
"/tts provider openai\n" +
|
||||
"/tts provider edge\n" +
|
||||
"/tts limit 2000\n" +
|
||||
@@ -71,14 +72,30 @@ export const handleTtsCommands: CommandHandler = async (params, allowTextCommand
|
||||
return { shouldContinue: false, reply: ttsUsage() };
|
||||
}
|
||||
|
||||
if (action === "on") {
|
||||
setTtsEnabled(prefsPath, true);
|
||||
return { shouldContinue: false, reply: { text: "🔊 TTS enabled." } };
|
||||
}
|
||||
|
||||
if (action === "off") {
|
||||
setTtsEnabled(prefsPath, false);
|
||||
return { shouldContinue: false, reply: { text: "🔇 TTS disabled." } };
|
||||
const requestedAuto = normalizeTtsAutoMode(
|
||||
action === "on" ? "always" : action === "off" ? "off" : action,
|
||||
);
|
||||
if (requestedAuto) {
|
||||
const entry = params.sessionEntry;
|
||||
const sessionKey = params.sessionKey;
|
||||
const store = params.sessionStore;
|
||||
if (entry && store && sessionKey) {
|
||||
entry.ttsAuto = requestedAuto;
|
||||
entry.updatedAt = Date.now();
|
||||
store[sessionKey] = entry;
|
||||
if (params.storePath) {
|
||||
await updateSessionStore(params.storePath, (store) => {
|
||||
store[sessionKey] = entry;
|
||||
});
|
||||
}
|
||||
}
|
||||
const label = requestedAuto === "always" ? "enabled (always)" : requestedAuto;
|
||||
return {
|
||||
shouldContinue: false,
|
||||
reply: {
|
||||
text: requestedAuto === "off" ? "🔇 TTS disabled." : `🔊 TTS ${label}.`,
|
||||
},
|
||||
};
|
||||
}
|
||||
|
||||
if (action === "audio") {
|
||||
@@ -212,7 +229,9 @@ export const handleTtsCommands: CommandHandler = async (params, allowTextCommand
|
||||
}
|
||||
|
||||
if (action === "status") {
|
||||
const enabled = isTtsEnabled(config, prefsPath);
|
||||
const sessionAuto = params.sessionEntry?.ttsAuto;
|
||||
const autoMode = resolveTtsAutoMode({ config, prefsPath, sessionAuto });
|
||||
const enabled = autoMode !== "off";
|
||||
const provider = getTtsProvider(config, prefsPath);
|
||||
const hasKey = isTtsProviderConfigured(config, provider);
|
||||
const providerStatus =
|
||||
@@ -226,9 +245,10 @@ export const handleTtsCommands: CommandHandler = async (params, allowTextCommand
|
||||
const maxLength = getTtsMaxLength(prefsPath);
|
||||
const summarize = isSummarizationEnabled(prefsPath);
|
||||
const last = getLastTtsAttempt();
|
||||
const autoLabel = sessionAuto ? `${autoMode} (session)` : autoMode;
|
||||
const lines = [
|
||||
"📊 TTS status",
|
||||
`State: ${enabled ? "✅ enabled" : "❌ disabled"}`,
|
||||
`Auto: ${enabled ? autoLabel : "off"}`,
|
||||
`Provider: ${provider} (${providerStatus})`,
|
||||
`Text limit: ${maxLength} chars`,
|
||||
`Auto-summary: ${summarize ? "on" : "off"}`,
|
||||
|
||||
@@ -1,4 +1,6 @@
|
||||
import type { ClawdbotConfig } from "../../config/config.js";
|
||||
import { resolveSessionAgentId } from "../../agents/agent-scope.js";
|
||||
import { loadSessionStore, resolveStorePath } from "../../config/sessions.js";
|
||||
import { logVerbose } from "../../globals.js";
|
||||
import { isDiagnosticsEnabled } from "../../infra/diagnostic-events.js";
|
||||
import {
|
||||
@@ -14,7 +16,55 @@ import { formatAbortReplyText, tryFastAbortFromMessage } from "./abort.js";
|
||||
import { shouldSkipDuplicateInbound } from "./inbound-dedupe.js";
|
||||
import type { ReplyDispatcher, ReplyDispatchKind } from "./reply-dispatcher.js";
|
||||
import { isRoutableChannel, routeReply } from "./route-reply.js";
|
||||
import { maybeApplyTtsToPayload } from "../../tts/tts.js";
|
||||
import { maybeApplyTtsToPayload, normalizeTtsAutoMode } from "../../tts/tts.js";
|
||||
|
||||
const AUDIO_PLACEHOLDER_RE = /^<media:audio>(\s*\([^)]*\))?$/i;
|
||||
const AUDIO_HEADER_RE = /^\[Audio\b/i;
|
||||
|
||||
/**
 * Reduces a MIME-style media type to its bare, lowercase type by dropping any
 * `;`-separated parameters and surrounding whitespace
 * (e.g. `"Audio/OGG; codecs=opus"` becomes `"audio/ogg"`).
 *
 * @param value - Raw media type string.
 * @returns The text before the first `;`, trimmed and lowercased.
 */
const normalizeMediaType = (value: string): string => {
  // `split` always yields at least one element; the default keeps the result
  // typed as `string` even when indexed access is checked strictly.
  const [mediaType = ""] = value.split(";");
  return mediaType.trim().toLowerCase();
};
|
||||
|
||||
/**
 * Decides whether an inbound message context looks like an audio message.
 *
 * Two signals are checked, in order:
 *  1. Media types: `ctx.MediaType` plus every entry of `ctx.MediaTypes`. The
 *     message counts as audio when any normalized type is exactly `audio` or
 *     starts with `audio/`.
 *  2. Body text: the first string-valued field among `BodyForCommands`,
 *     `CommandBody`, `RawBody` and `Body`. After trimming, it counts as audio
 *     when it matches the `<media:audio>` placeholder pattern or the `[Audio`
 *     header pattern.
 *
 * @param ctx - Finalized inbound message context.
 * @returns `true` when either signal indicates audio, otherwise `false`.
 */
const isInboundAudioContext = (ctx: FinalizedMsgContext): boolean => {
  const declaredTypes: unknown[] = [
    ctx.MediaType,
    ...(Array.isArray(ctx.MediaTypes) ? ctx.MediaTypes : []),
  ];
  // Narrow with a real type guard instead of asserting `as string[]`, so a
  // non-string entry can never reach `normalizeMediaType` and throw on `.split`.
  const hasAudioType = declaredTypes
    .filter((entry): entry is string => typeof entry === "string" && entry.length > 0)
    .map((entry) => normalizeMediaType(entry))
    .some((entry) => entry === "audio" || entry.startsWith("audio/"));
  if (hasAudioType) {
    return true;
  }

  // The first field that is a string wins, even when it is empty.
  const bodyCandidates: unknown[] = [ctx.BodyForCommands, ctx.CommandBody, ctx.RawBody, ctx.Body];
  const body = bodyCandidates.find((field): field is string => typeof field === "string") ?? "";
  const text = body.trim();
  if (!text) {
    return false;
  }
  return AUDIO_PLACEHOLDER_RE.test(text) || AUDIO_HEADER_RE.test(text);
};
|
||||
|
||||
/**
 * Reads the TTS auto-mode value stored on the session entry for this message.
 *
 * The session key is `ctx.CommandTargetSessionKey` when the command source is
 * `"native"` and that key is set, otherwise `ctx.SessionKey`. The entry is
 * looked up under the lowercased key first, then under the key as given.
 *
 * @param ctx - Finalized inbound message context.
 * @param cfg - Loaded configuration, used to locate the session store.
 * @returns The stored `ttsAuto` value passed through `normalizeTtsAutoMode`,
 *   or `undefined` when there is no usable session key or when loading the
 *   store throws.
 */
const resolveSessionTtsAuto = (
  ctx: FinalizedMsgContext,
  cfg: ClawdbotConfig,
): string | undefined => {
  const nativeTargetKey =
    ctx.CommandSource === "native" ? ctx.CommandTargetSessionKey?.trim() : undefined;
  const sessionKey = (nativeTargetKey ?? ctx.SessionKey)?.trim();
  if (!sessionKey) {
    return undefined;
  }

  const owningAgentId = resolveSessionAgentId({ sessionKey, config: cfg });
  const sessionStorePath = resolveStorePath(cfg.session?.store, { agentId: owningAgentId });

  try {
    const sessions = loadSessionStore(sessionStorePath);
    const matchedEntry = sessions[sessionKey.toLowerCase()] ?? sessions[sessionKey];
    return normalizeTtsAutoMode(matchedEntry?.ttsAuto);
  } catch {
    // Best effort: a store that cannot be read means no session override.
    return undefined;
  }
};
|
||||
|
||||
export type DispatchFromConfigResult = {
|
||||
queuedFinal: boolean;
|
||||
@@ -81,6 +131,8 @@ export async function dispatchReplyFromConfig(params: {
|
||||
return { queuedFinal: false, counts: dispatcher.getQueuedCounts() };
|
||||
}
|
||||
|
||||
const inboundAudio = isInboundAudioContext(ctx);
|
||||
const sessionTtsAuto = resolveSessionTtsAuto(ctx, cfg);
|
||||
const hookRunner = getGlobalHookRunner();
|
||||
if (hookRunner?.hasHooks("message_received")) {
|
||||
const timestamp =
|
||||
@@ -223,6 +275,8 @@ export async function dispatchReplyFromConfig(params: {
|
||||
cfg,
|
||||
channel: ttsChannel,
|
||||
kind: "tool",
|
||||
inboundAudio,
|
||||
ttsAuto: sessionTtsAuto,
|
||||
});
|
||||
if (shouldRouteToOriginating) {
|
||||
await sendPayloadAsync(ttsPayload);
|
||||
@@ -239,6 +293,8 @@ export async function dispatchReplyFromConfig(params: {
|
||||
cfg,
|
||||
channel: ttsChannel,
|
||||
kind: "block",
|
||||
inboundAudio,
|
||||
ttsAuto: sessionTtsAuto,
|
||||
});
|
||||
if (shouldRouteToOriginating) {
|
||||
await sendPayloadAsync(ttsPayload, context?.abortSignal);
|
||||
@@ -262,6 +318,8 @@ export async function dispatchReplyFromConfig(params: {
|
||||
cfg,
|
||||
channel: ttsChannel,
|
||||
kind: "final",
|
||||
inboundAudio,
|
||||
ttsAuto: sessionTtsAuto,
|
||||
});
|
||||
if (shouldRouteToOriginating && originatingChannel && originatingTo) {
|
||||
// Route final reply to originating channel.
|
||||
|
||||
@@ -5,6 +5,7 @@ import path from "node:path";
|
||||
import { CURRENT_SESSION_VERSION, SessionManager } from "@mariozechner/pi-coding-agent";
|
||||
import { resolveSessionAgentId } from "../../agents/agent-scope.js";
|
||||
import type { ClawdbotConfig } from "../../config/config.js";
|
||||
import type { TtsAutoMode } from "../../config/types.tts.js";
|
||||
import {
|
||||
DEFAULT_RESET_TRIGGERS,
|
||||
deriveSessionMetaPatch,
|
||||
@@ -128,6 +129,7 @@ export async function initSessionState(params: {
|
||||
let persistedThinking: string | undefined;
|
||||
let persistedVerbose: string | undefined;
|
||||
let persistedReasoning: string | undefined;
|
||||
let persistedTtsAuto: TtsAutoMode | undefined;
|
||||
let persistedModelOverride: string | undefined;
|
||||
let persistedProviderOverride: string | undefined;
|
||||
|
||||
@@ -220,6 +222,7 @@ export async function initSessionState(params: {
|
||||
persistedThinking = entry.thinkingLevel;
|
||||
persistedVerbose = entry.verboseLevel;
|
||||
persistedReasoning = entry.reasoningLevel;
|
||||
persistedTtsAuto = entry.ttsAuto;
|
||||
persistedModelOverride = entry.modelOverride;
|
||||
persistedProviderOverride = entry.providerOverride;
|
||||
} else {
|
||||
@@ -258,6 +261,7 @@ export async function initSessionState(params: {
|
||||
thinkingLevel: persistedThinking ?? baseEntry?.thinkingLevel,
|
||||
verboseLevel: persistedVerbose ?? baseEntry?.verboseLevel,
|
||||
reasoningLevel: persistedReasoning ?? baseEntry?.reasoningLevel,
|
||||
ttsAuto: persistedTtsAuto ?? baseEntry?.ttsAuto,
|
||||
responseUsage: baseEntry?.responseUsage,
|
||||
modelOverride: persistedModelOverride ?? baseEntry?.modelOverride,
|
||||
providerOverride: persistedProviderOverride ?? baseEntry?.providerOverride,
|
||||
|
||||
@@ -17,7 +17,7 @@ import {
|
||||
getTtsMaxLength,
|
||||
getTtsProvider,
|
||||
isSummarizationEnabled,
|
||||
isTtsEnabled,
|
||||
resolveTtsAutoMode,
|
||||
resolveTtsConfig,
|
||||
resolveTtsPrefsPath,
|
||||
} from "../tts/tts.js";
|
||||
@@ -252,15 +252,23 @@ const formatMediaUnderstandingLine = (decisions?: MediaUnderstandingDecision[])
|
||||
return `📎 Media: ${parts.join(" · ")}`;
|
||||
};
|
||||
|
||||
const formatVoiceModeLine = (config?: ClawdbotConfig): string | null => {
|
||||
const formatVoiceModeLine = (
|
||||
config?: ClawdbotConfig,
|
||||
sessionEntry?: SessionEntry,
|
||||
): string | null => {
|
||||
if (!config) return null;
|
||||
const ttsConfig = resolveTtsConfig(config);
|
||||
const prefsPath = resolveTtsPrefsPath(ttsConfig);
|
||||
if (!isTtsEnabled(ttsConfig, prefsPath)) return null;
|
||||
const autoMode = resolveTtsAutoMode({
|
||||
config: ttsConfig,
|
||||
prefsPath,
|
||||
sessionAuto: sessionEntry?.ttsAuto,
|
||||
});
|
||||
if (autoMode === "off") return null;
|
||||
const provider = getTtsProvider(ttsConfig, prefsPath);
|
||||
const maxLength = getTtsMaxLength(prefsPath);
|
||||
const summarize = isSummarizationEnabled(prefsPath) ? "on" : "off";
|
||||
return `🔊 Voice: on · provider=${provider} · limit=${maxLength} · summary=${summarize}`;
|
||||
return `🔊 Voice: ${autoMode} · provider=${provider} · limit=${maxLength} · summary=${summarize}`;
|
||||
};
|
||||
|
||||
export function buildStatusMessage(args: StatusArgs): string {
|
||||
@@ -398,7 +406,7 @@ export function buildStatusMessage(args: StatusArgs): string {
|
||||
const usageCostLine =
|
||||
usagePair && costLine ? `${usagePair} · ${costLine}` : (usagePair ?? costLine);
|
||||
const mediaLine = formatMediaUnderstandingLine(args.mediaDecisions);
|
||||
const voiceLine = formatVoiceModeLine(args.config);
|
||||
const voiceLine = formatVoiceModeLine(args.config, args.sessionEntry);
|
||||
|
||||
return [
|
||||
versionLine,
|
||||
|
||||
Reference in New Issue
Block a user