refactor: unify media understanding pipeline
This commit is contained in:
7
src/media-understanding/providers/anthropic/index.ts
Normal file
7
src/media-understanding/providers/anthropic/index.ts
Normal file
@@ -0,0 +1,7 @@
|
||||
import type { MediaUnderstandingProvider } from "../../types.js";
|
||||
import { describeImageWithModel } from "../image.js";
|
||||
|
||||
/**
 * Media-understanding provider entry registered under the id "anthropic".
 *
 * Only image description is wired up here; it is handled by the shared
 * `describeImageWithModel` implementation imported from `../image.js`.
 */
export const anthropicProvider: MediaUnderstandingProvider = {
  id: "anthropic",
  describeImage: describeImageWithModel,
};
|
||||
@@ -1,7 +1,9 @@
|
||||
import type { MediaUnderstandingProvider } from "../../types.js";
|
||||
import { describeImageWithModel } from "../image.js";
|
||||
import { describeGeminiVideo } from "./video.js";
|
||||
|
||||
/**
 * Media-understanding provider entry registered under the id "google".
 *
 * - `describeImage` is handled by the shared `describeImageWithModel`
 *   implementation imported from `../image.js`.
 * - `describeVideo` is handled by `describeGeminiVideo` from `./video.js`.
 */
export const googleProvider: MediaUnderstandingProvider = {
  id: "google",
  describeImage: describeImageWithModel,
  describeVideo: describeGeminiVideo,
};
|
||||
|
||||
66
src/media-understanding/providers/image.ts
Normal file
66
src/media-understanding/providers/image.ts
Normal file
@@ -0,0 +1,66 @@
|
||||
import type { Api, AssistantMessage, Context, Model } from "@mariozechner/pi-ai";
|
||||
import { complete } from "@mariozechner/pi-ai";
|
||||
import { discoverAuthStorage, discoverModels } from "@mariozechner/pi-coding-agent";
|
||||
|
||||
import { getApiKeyForModel } from "../../agents/model-auth.js";
|
||||
import { ensureClawdbotModelsJson } from "../../agents/models-config.js";
|
||||
import { minimaxUnderstandImage } from "../../agents/minimax-vlm.js";
|
||||
import { coerceImageAssistantText } from "../../agents/tools/image-tool.helpers.js";
|
||||
import type { ImageDescriptionRequest, ImageDescriptionResult } from "../types.js";
|
||||
|
||||
/**
 * Produces a text description of an image with the model named by
 * `params.provider` / `params.model`.
 *
 * Flow:
 * 1. Awaits `ensureClawdbotModelsJson`, then looks the model up in the
 *    registry returned by `discoverModels`.
 * 2. Rejects models that are missing or whose `input` list lacks "image".
 * 3. Obtains an API key via `getApiKeyForModel` and stores it on the auth
 *    storage as the runtime key for the model's provider.
 * 4. Sends the base64-encoded image: models whose provider is "minimax" go
 *    through `minimaxUnderstandImage` with a data URL; all others go through
 *    `complete` with a single user message holding the prompt and the image.
 *
 * Defaults applied when the request leaves them unset: prompt
 * "Describe the image.", MIME type "image/jpeg", and (on the `complete` path
 * only) `maxTokens` of 512.
 *
 * @param params - Image bytes plus model selection, auth profile hints and
 *   optional prompt / MIME type / token limit.
 * @returns The description text and the id of the model that was used.
 * @throws Error if the model is not in the registry or does not list "image"
 *   among its inputs. Errors from the called helpers are not caught here.
 */
export async function describeImageWithModel(
  params: ImageDescriptionRequest,
): Promise<ImageDescriptionResult> {
  const { cfg, agentDir } = params;

  await ensureClawdbotModelsJson(cfg, agentDir);
  const authStorage = discoverAuthStorage(agentDir);
  const registry = discoverModels(authStorage, agentDir);

  const resolved = registry.find(params.provider, params.model) as Model<Api> | null;
  if (!resolved) {
    throw new Error(`Unknown model: ${params.provider}/${params.model}`);
  }
  const acceptsImages = resolved.input?.includes("image") ?? false;
  if (!acceptsImages) {
    throw new Error(`Model does not support images: ${params.provider}/${params.model}`);
  }

  const { apiKey } = await getApiKeyForModel({
    model: resolved,
    cfg,
    agentDir,
    profileId: params.profile,
    preferredProfile: params.preferredProfile,
  });
  // Register the key on the shared auth storage under the model's provider.
  authStorage.setRuntimeApiKey(resolved.provider, apiKey);

  // Both request paths share the same encoded payload and fallbacks.
  const encoded = params.buffer.toString("base64");
  const mimeType = params.mime ?? "image/jpeg";
  const prompt = params.prompt ?? "Describe the image.";

  if (resolved.provider === "minimax") {
    // MiniMax takes the image as a data URL through its own helper.
    const text = await minimaxUnderstandImage({
      apiKey,
      prompt,
      imageDataUrl: `data:${mimeType};base64,${encoded}`,
      modelBaseUrl: resolved.baseUrl,
    });
    return { text, model: resolved.id };
  }

  const context: Context = {
    messages: [
      {
        role: "user",
        content: [
          { type: "text", text: prompt },
          { type: "image", data: encoded, mimeType },
        ],
        timestamp: Date.now(),
      },
    ],
  };
  const reply = (await complete(resolved, context, {
    apiKey,
    maxTokens: params.maxTokens ?? 512,
  })) as AssistantMessage;

  return {
    text: coerceImageAssistantText({
      message: reply,
      provider: resolved.provider,
      model: resolved.id,
    }),
    model: resolved.id,
  };
}
|
||||
@@ -1,10 +1,18 @@
|
||||
import { normalizeProviderId } from "../../agents/model-selection.js";
|
||||
import type { MediaUnderstandingProvider } from "../types.js";
|
||||
import { anthropicProvider } from "./anthropic/index.js";
|
||||
import { googleProvider } from "./google/index.js";
|
||||
import { groqProvider } from "./groq/index.js";
|
||||
import { minimaxProvider } from "./minimax/index.js";
|
||||
import { openaiProvider } from "./openai/index.js";
|
||||
|
||||
const PROVIDERS: MediaUnderstandingProvider[] = [groqProvider, openaiProvider, googleProvider];
|
||||
/**
 * Provider objects known to this module, in declaration order:
 * groq, openai, google, anthropic, minimax.
 */
const PROVIDERS: MediaUnderstandingProvider[] = [
  groqProvider,
  openaiProvider,
  googleProvider,
  anthropicProvider,
  minimaxProvider,
];
|
||||
|
||||
export function normalizeMediaProviderId(id: string): string {
|
||||
const normalized = normalizeProviderId(id);
|
||||
|
||||
7
src/media-understanding/providers/minimax/index.ts
Normal file
7
src/media-understanding/providers/minimax/index.ts
Normal file
@@ -0,0 +1,7 @@
|
||||
import type { MediaUnderstandingProvider } from "../../types.js";
|
||||
import { describeImageWithModel } from "../image.js";
|
||||
|
||||
/**
 * Media-understanding provider entry registered under the id "minimax".
 *
 * Only image description is wired up here; it is handled by the shared
 * `describeImageWithModel` implementation imported from `../image.js`.
 */
export const minimaxProvider: MediaUnderstandingProvider = {
  id: "minimax",
  describeImage: describeImageWithModel,
};
|
||||
@@ -1,7 +1,9 @@
|
||||
import type { MediaUnderstandingProvider } from "../../types.js";
|
||||
import { describeImageWithModel } from "../image.js";
|
||||
import { transcribeOpenAiCompatibleAudio } from "./audio.js";
|
||||
|
||||
/**
 * Media-understanding provider entry registered under the id "openai".
 *
 * - `describeImage` is handled by the shared `describeImageWithModel`
 *   implementation imported from `../image.js`.
 * - `transcribeAudio` is handled by `transcribeOpenAiCompatibleAudio` from
 *   `./audio.js`.
 */
export const openaiProvider: MediaUnderstandingProvider = {
  id: "openai",
  describeImage: describeImageWithModel,
  transcribeAudio: transcribeOpenAiCompatibleAudio,
};
|
||||
|
||||
Reference in New Issue
Block a user