refactor: unify media understanding pipeline

This commit is contained in:
Peter Steinberger
2026-01-17 04:38:20 +00:00
parent 49ecbd8fea
commit fcb7c9ff65
24 changed files with 1250 additions and 643 deletions

View File

@@ -0,0 +1,7 @@
import type { MediaUnderstandingProvider } from "../../types.js";
import { describeImageWithModel } from "../image.js";
/**
 * Media-understanding provider entry registered under the id "anthropic".
 *
 * Only image description is wired up here; it is delegated to the shared
 * `describeImageWithModel` helper.
 */
export const anthropicProvider: MediaUnderstandingProvider = {
  id: "anthropic",
  describeImage: describeImageWithModel,
};

View File

@@ -1,7 +1,9 @@
import type { MediaUnderstandingProvider } from "../../types.js";
import { describeImageWithModel } from "../image.js";
import { describeGeminiVideo } from "./video.js";
/**
 * Media-understanding provider entry registered under the id "google".
 *
 * Image description goes through the shared `describeImageWithModel` helper,
 * while video description is handled by `describeGeminiVideo`.
 */
export const googleProvider: MediaUnderstandingProvider = {
  id: "google",
  describeImage: describeImageWithModel,
  describeVideo: describeGeminiVideo,
};

View File

@@ -0,0 +1,66 @@
import type { Api, AssistantMessage, Context, Model } from "@mariozechner/pi-ai";
import { complete } from "@mariozechner/pi-ai";
import { discoverAuthStorage, discoverModels } from "@mariozechner/pi-coding-agent";
import { getApiKeyForModel } from "../../agents/model-auth.js";
import { ensureClawdbotModelsJson } from "../../agents/models-config.js";
import { minimaxUnderstandImage } from "../../agents/minimax-vlm.js";
import { coerceImageAssistantText } from "../../agents/tools/image-tool.helpers.js";
import type { ImageDescriptionRequest, ImageDescriptionResult } from "../types.js";
/**
 * Produces a text description of an image using the model named by
 * `params.provider` / `params.model`.
 *
 * Steps, in order:
 * 1. Ensure the models JSON exists, then discover auth storage and the model registry.
 * 2. Look the model up and verify that it lists "image" among its inputs.
 * 3. Resolve an API key for the model and register it as the runtime key.
 * 4. Send the base64-encoded image plus a prompt: models whose provider is
 *    "minimax" go through `minimaxUnderstandImage`, every other model goes
 *    through `complete`.
 *
 * Defaults: the prompt falls back to "Describe the image.", the MIME type to
 * "image/jpeg", and `maxTokens` (used on the `complete` path only) to 512.
 *
 * @param params - Image buffer together with model, auth and prompt options.
 * @returns The description text and the id of the model that produced it.
 * @throws Error when the model is not found in the registry or does not accept images.
 */
export async function describeImageWithModel(
  params: ImageDescriptionRequest,
): Promise<ImageDescriptionResult> {
  await ensureClawdbotModelsJson(params.cfg, params.agentDir);
  const authStorage = discoverAuthStorage(params.agentDir);
  const registry = discoverModels(authStorage, params.agentDir);

  const resolved = registry.find(params.provider, params.model) as Model<Api> | null;
  if (!resolved) {
    throw new Error(`Unknown model: ${params.provider}/${params.model}`);
  }
  const acceptsImages = resolved.input?.includes("image");
  if (!acceptsImages) {
    throw new Error(`Model does not support images: ${params.provider}/${params.model}`);
  }

  // Resolve credentials and expose them to the auth storage before any request is made.
  const { apiKey } = await getApiKeyForModel({
    model: resolved,
    cfg: params.cfg,
    agentDir: params.agentDir,
    profileId: params.profile,
    preferredProfile: params.preferredProfile,
  });
  authStorage.setRuntimeApiKey(resolved.provider, apiKey);

  const imageBase64 = params.buffer.toString("base64");
  const prompt = params.prompt ?? "Describe the image.";
  const mimeType = params.mime ?? "image/jpeg";

  if (resolved.provider === "minimax") {
    // This path takes the image as a data URL rather than a structured image part.
    const minimaxText = await minimaxUnderstandImage({
      apiKey,
      prompt,
      imageDataUrl: `data:${mimeType};base64,${imageBase64}`,
      modelBaseUrl: resolved.baseUrl,
    });
    return { text: minimaxText, model: resolved.id };
  }

  // Generic path: a single user message carrying the prompt and the image.
  const context: Context = {
    messages: [
      {
        role: "user",
        content: [
          { type: "text", text: prompt },
          { type: "image", data: imageBase64, mimeType },
        ],
        timestamp: Date.now(),
      },
    ],
  };
  const reply = (await complete(resolved, context, {
    apiKey,
    maxTokens: params.maxTokens ?? 512,
  })) as AssistantMessage;

  const text = coerceImageAssistantText({
    message: reply,
    provider: resolved.provider,
    model: resolved.id,
  });
  return { text, model: resolved.id };
}

View File

@@ -1,10 +1,18 @@
import { normalizeProviderId } from "../../agents/model-selection.js";
import type { MediaUnderstandingProvider } from "../types.js";
import { anthropicProvider } from "./anthropic/index.js";
import { googleProvider } from "./google/index.js";
import { groqProvider } from "./groq/index.js";
import { minimaxProvider } from "./minimax/index.js";
import { openaiProvider } from "./openai/index.js";
/**
 * Media-understanding providers known to this module.
 *
 * Declared exactly once: a `const` binding cannot be redeclared in the same
 * scope, so the list lives in a single array literal.
 */
const PROVIDERS: MediaUnderstandingProvider[] = [
  groqProvider,
  openaiProvider,
  googleProvider,
  anthropicProvider,
  minimaxProvider,
];
export function normalizeMediaProviderId(id: string): string {
const normalized = normalizeProviderId(id);

View File

@@ -0,0 +1,7 @@
import type { MediaUnderstandingProvider } from "../../types.js";
import { describeImageWithModel } from "../image.js";
/**
 * Media-understanding provider entry registered under the id "minimax".
 *
 * Only image description is wired up here; it is delegated to the shared
 * `describeImageWithModel` helper.
 */
export const minimaxProvider: MediaUnderstandingProvider = {
  id: "minimax",
  describeImage: describeImageWithModel,
};

View File

@@ -1,7 +1,9 @@
import type { MediaUnderstandingProvider } from "../../types.js";
import { describeImageWithModel } from "../image.js";
import { transcribeOpenAiCompatibleAudio } from "./audio.js";
/**
 * Media-understanding provider entry registered under the id "openai".
 *
 * Image description goes through the shared `describeImageWithModel` helper,
 * while audio transcription is handled by `transcribeOpenAiCompatibleAudio`.
 */
export const openaiProvider: MediaUnderstandingProvider = {
  id: "openai",
  describeImage: describeImageWithModel,
  transcribeAudio: transcribeOpenAiCompatibleAudio,
};