refactor: unify media understanding pipeline

This commit is contained in:
Peter Steinberger
2026-01-17 04:38:20 +00:00
parent 49ecbd8fea
commit fcb7c9ff65
24 changed files with 1250 additions and 643 deletions

View File

@@ -107,14 +107,18 @@ const FIELD_LABELS: Record<string, string> = {
"tools.media.image.maxChars": "Image Understanding Max Chars",
"tools.media.image.prompt": "Image Understanding Prompt",
"tools.media.image.timeoutSeconds": "Image Understanding Timeout (sec)",
"tools.media.image.attachments": "Image Understanding Attachment Policy",
"tools.media.image.models": "Image Understanding Models",
"tools.media.image.scope": "Image Understanding Scope",
"tools.media.models": "Media Understanding Shared Models",
"tools.media.concurrency": "Media Understanding Concurrency",
"tools.media.audio.enabled": "Enable Audio Understanding",
"tools.media.audio.maxBytes": "Audio Understanding Max Bytes",
"tools.media.audio.maxChars": "Audio Understanding Max Chars",
"tools.media.audio.prompt": "Audio Understanding Prompt",
"tools.media.audio.timeoutSeconds": "Audio Understanding Timeout (sec)",
"tools.media.audio.language": "Audio Understanding Language",
"tools.media.audio.attachments": "Audio Understanding Attachment Policy",
"tools.media.audio.models": "Audio Understanding Models",
"tools.media.audio.scope": "Audio Understanding Scope",
"tools.media.video.enabled": "Enable Video Understanding",
@@ -122,6 +126,7 @@ const FIELD_LABELS: Record<string, string> = {
"tools.media.video.maxChars": "Video Understanding Max Chars",
"tools.media.video.prompt": "Video Understanding Prompt",
"tools.media.video.timeoutSeconds": "Video Understanding Timeout (sec)",
"tools.media.video.attachments": "Video Understanding Attachment Policy",
"tools.media.video.models": "Video Understanding Models",
"tools.media.video.scope": "Video Understanding Scope",
"tools.profile": "Tool Profile",

View File

@@ -18,6 +18,15 @@ export type MediaUnderstandingScopeConfig = {
/** Kind of media a media-understanding capability applies to. */
export type MediaUnderstandingCapability =
  | "image"
  | "audio"
  | "video";
/**
 * Attachment selection policy for media understanding.
 * Every field is optional.
 */
export type MediaUnderstandingAttachmentsConfig = {
  /** `"first"` selects the first matching attachment; `"all"` processes multiple. */
  mode?: "first" | "all";
  /** Upper bound on the number of attachments to process (default: 1). */
  maxAttachments?: number;
  /** Ordering preference applied to attachments. */
  prefer?:
    | "first"
    | "last"
    | "path"
    | "url";
};
export type MediaUnderstandingModelConfig = {
/** provider API id (e.g. openai, google). */
provider?: string;
@@ -62,11 +71,17 @@ export type MediaUnderstandingConfig = {
timeoutSeconds?: number;
/** Default language hint (audio). */
language?: string;
/** Attachment selection policy. */
attachments?: MediaUnderstandingAttachmentsConfig;
/** Ordered model list (fallbacks in order). */
models?: MediaUnderstandingModelConfig[];
};
export type MediaToolsConfig = {
/** Shared model list applied across image/audio/video. */
models?: MediaUnderstandingModelConfig[];
/** Max concurrent media understanding runs. */
concurrency?: number;
image?: MediaUnderstandingConfig;
audio?: MediaUnderstandingConfig;
video?: MediaUnderstandingConfig;

View File

@@ -271,6 +271,14 @@ export const MediaUnderstandingCapabilitiesSchema = z
.array(z.union([z.literal("image"), z.literal("audio"), z.literal("video")]))
.optional();
/**
 * Zod schema for an attachment selection policy.
 * The object as a whole may be omitted, and so may each of its fields.
 */
export const MediaUnderstandingAttachmentsSchema = z
  .object({
    // Either "first" or "all".
    mode: z
      .union([z.literal("first"), z.literal("all")])
      .optional(),
    // Must be a positive integer when present.
    maxAttachments: z.number().int().positive().optional(),
    // One of "first", "last", "path", or "url".
    prefer: z
      .union([
        z.literal("first"),
        z.literal("last"),
        z.literal("path"),
        z.literal("url"),
      ])
      .optional(),
  })
  .optional();
export const MediaUnderstandingModelSchema = z
.object({
provider: z.string().optional(),
@@ -298,12 +306,15 @@ export const ToolsMediaUnderstandingSchema = z
prompt: z.string().optional(),
timeoutSeconds: z.number().int().positive().optional(),
language: z.string().optional(),
attachments: MediaUnderstandingAttachmentsSchema,
models: z.array(MediaUnderstandingModelSchema).optional(),
})
.optional();
export const ToolsMediaSchema = z
.object({
models: z.array(MediaUnderstandingModelSchema).optional(),
concurrency: z.number().int().positive().optional(),
image: ToolsMediaUnderstandingSchema.optional(),
audio: ToolsMediaUnderstandingSchema.optional(),
video: ToolsMediaUnderstandingSchema.optional(),