refactor: unify media understanding pipeline

2026-01-17 04:38:20 +00:00
parent 49ecbd8fea
commit fcb7c9ff65
24 changed files with 1250 additions and 643 deletions
--- a/src/config/schema.ts
+++ b/src/config/schema.ts
@@ -107,14 +107,18 @@ const FIELD_LABELS: Record<string, string> = {
  "tools.media.image.maxChars": "Image Understanding Max Chars",
  "tools.media.image.prompt": "Image Understanding Prompt",
  "tools.media.image.timeoutSeconds": "Image Understanding Timeout (sec)",
+  "tools.media.image.attachments": "Image Understanding Attachment Policy",
  "tools.media.image.models": "Image Understanding Models",
  "tools.media.image.scope": "Image Understanding Scope",
+  "tools.media.models": "Media Understanding Shared Models",
+  "tools.media.concurrency": "Media Understanding Concurrency",
  "tools.media.audio.enabled": "Enable Audio Understanding",
  "tools.media.audio.maxBytes": "Audio Understanding Max Bytes",
  "tools.media.audio.maxChars": "Audio Understanding Max Chars",
  "tools.media.audio.prompt": "Audio Understanding Prompt",
  "tools.media.audio.timeoutSeconds": "Audio Understanding Timeout (sec)",
  "tools.media.audio.language": "Audio Understanding Language",
+  "tools.media.audio.attachments": "Audio Understanding Attachment Policy",
  "tools.media.audio.models": "Audio Understanding Models",
  "tools.media.audio.scope": "Audio Understanding Scope",
  "tools.media.video.enabled": "Enable Video Understanding",
@@ -122,6 +126,7 @@ const FIELD_LABELS: Record<string, string> = {
  "tools.media.video.maxChars": "Video Understanding Max Chars",
  "tools.media.video.prompt": "Video Understanding Prompt",
  "tools.media.video.timeoutSeconds": "Video Understanding Timeout (sec)",
+  "tools.media.video.attachments": "Video Understanding Attachment Policy",
  "tools.media.video.models": "Video Understanding Models",
  "tools.media.video.scope": "Video Understanding Scope",
  "tools.profile": "Tool Profile",
--- a/src/config/types.tools.ts
+++ b/src/config/types.tools.ts
@@ -18,6 +18,15 @@ export type MediaUnderstandingScopeConfig = {

 export type MediaUnderstandingCapability = "image" | "audio" | "video";

+export type MediaUnderstandingAttachmentsConfig = {
+  /** Selection mode: "first" picks the first matching attachment; "all" processes multiple. */
+  mode?: "first" | "all";
+  /** Upper bound on the number of attachments to process (default: 1). */
+  maxAttachments?: number;
+  /** Attachment ordering preference; how "path"/"url" rank attachments is decided by the consumer (not shown here). */
+  prefer?: "first" | "last" | "path" | "url";
+};
+
 export type MediaUnderstandingModelConfig = {
  /** provider API id (e.g. openai, google). */
  provider?: string;
@@ -62,11 +71,17 @@ export type MediaUnderstandingConfig = {
  timeoutSeconds?: number;
  /** Default language hint (audio). */
  language?: string;
+  /** Attachment selection policy. */
+  attachments?: MediaUnderstandingAttachmentsConfig;
  /** Ordered model list (fallbacks in order). */
  models?: MediaUnderstandingModelConfig[];
 };

 export type MediaToolsConfig = {
+  /** Shared model list applied across image/audio/video. */
+  models?: MediaUnderstandingModelConfig[];
+  /** Max concurrent media understanding runs. */
+  concurrency?: number;
  image?: MediaUnderstandingConfig;
  audio?: MediaUnderstandingConfig;
  video?: MediaUnderstandingConfig;
--- a/src/config/zod-schema.core.ts
+++ b/src/config/zod-schema.core.ts
@@ -271,6 +271,14 @@ export const MediaUnderstandingCapabilitiesSchema = z
  .array(z.union([z.literal("image"), z.literal("audio"), z.literal("video")]))
  .optional();

+export const MediaUnderstandingAttachmentsSchema = z
+  .object({
+    mode: z.union([z.literal("first"), z.literal("all")]).optional(),
+    maxAttachments: z.number().int().positive().optional(), // when set, must be an integer greater than zero
+    prefer: z.union([z.literal("first"), z.literal("last"), z.literal("path"), z.literal("url")]).optional(),
+  })
+  .optional(); // the whole attachments policy object may be omitted
+
 export const MediaUnderstandingModelSchema = z
  .object({
    provider: z.string().optional(),
@@ -298,12 +306,15 @@ export const ToolsMediaUnderstandingSchema = z
    prompt: z.string().optional(),
    timeoutSeconds: z.number().int().positive().optional(),
    language: z.string().optional(),
+    attachments: MediaUnderstandingAttachmentsSchema,
    models: z.array(MediaUnderstandingModelSchema).optional(),
  })
  .optional();

 export const ToolsMediaSchema = z
  .object({
+    models: z.array(MediaUnderstandingModelSchema).optional(),
+    concurrency: z.number().int().positive().optional(),
    image: ToolsMediaUnderstandingSchema.optional(),
    audio: ToolsMediaUnderstandingSchema.optional(),
    video: ToolsMediaUnderstandingSchema.optional(),