mirror of
https://github.com/moltbot/moltbot.git
synced 2026-03-30 01:06:11 +00:00
feat(plugins): move media understanding into vendor plugins
This commit is contained in:
@@ -23,10 +23,10 @@ import {
|
||||
import { buildTokenProfileId, validateAnthropicSetupToken } from "../../src/commands/auth-token.js";
|
||||
import { applyAuthProfileConfig } from "../../src/commands/onboard-auth.js";
|
||||
import { fetchClaudeUsage } from "../../src/infra/provider-usage.fetch.js";
|
||||
import { anthropicProvider } from "../../src/media-understanding/providers/anthropic/index.js";
|
||||
import { createProviderApiKeyAuthMethod } from "../../src/plugins/provider-api-key-auth.js";
|
||||
import type { ProviderAuthResult } from "../../src/plugins/types.js";
|
||||
import { normalizeSecretInput } from "../../src/utils/normalize-secret-input.js";
|
||||
import { anthropicMediaUnderstandingProvider } from "./media-understanding-provider.js";
|
||||
|
||||
const PROVIDER_ID = "anthropic";
|
||||
const DEFAULT_ANTHROPIC_MODEL = "anthropic/claude-sonnet-4-6";
|
||||
@@ -395,7 +395,7 @@ const anthropicPlugin = {
|
||||
profileId: ctx.profileId,
|
||||
}),
|
||||
});
|
||||
api.registerMediaUnderstandingProvider(anthropicProvider);
|
||||
api.registerMediaUnderstandingProvider(anthropicMediaUnderstandingProvider);
|
||||
},
|
||||
};
|
||||
|
||||
|
||||
8
extensions/anthropic/media-understanding-provider.ts
Normal file
8
extensions/anthropic/media-understanding-provider.ts
Normal file
@@ -0,0 +1,8 @@
|
||||
import { describeImageWithModel } from "../../src/media-understanding/providers/image.js";
|
||||
import type { MediaUnderstandingProvider } from "../../src/media-understanding/types.js";
|
||||
|
||||
/**
 * Media-understanding provider descriptor for the "anthropic" provider id.
 *
 * Advertises the "image" capability only; image description is delegated to
 * the shared `describeImageWithModel` helper.
 */
export const anthropicMediaUnderstandingProvider: MediaUnderstandingProvider = {
  id: "anthropic",
  capabilities: ["image"],
  describeImage: describeImageWithModel,
};
|
||||
@@ -7,11 +7,11 @@ import {
|
||||
GOOGLE_GEMINI_DEFAULT_MODEL,
|
||||
applyGoogleGeminiModelDefault,
|
||||
} from "../../src/commands/google-gemini-model-default.js";
|
||||
import { googleProvider } from "../../src/media-understanding/providers/google/index.js";
|
||||
import { emptyPluginConfigSchema } from "../../src/plugins/config-schema.js";
|
||||
import { createProviderApiKeyAuthMethod } from "../../src/plugins/provider-api-key-auth.js";
|
||||
import type { OpenClawPluginApi } from "../../src/plugins/types.js";
|
||||
import { registerGoogleGeminiCliProvider } from "./gemini-cli-provider.js";
|
||||
import { googleMediaUnderstandingProvider } from "./media-understanding-provider.js";
|
||||
import { isModernGoogleModel, resolveGoogle31ForwardCompatModel } from "./provider-models.js";
|
||||
|
||||
const googlePlugin = {
|
||||
@@ -52,7 +52,7 @@ const googlePlugin = {
|
||||
isModernModelRef: ({ modelId }) => isModernGoogleModel(modelId),
|
||||
});
|
||||
registerGoogleGeminiCliProvider(api);
|
||||
api.registerMediaUnderstandingProvider(googleProvider);
|
||||
api.registerMediaUnderstandingProvider(googleMediaUnderstandingProvider);
|
||||
api.registerWebSearchProvider(
|
||||
createPluginBackedWebSearchProvider({
|
||||
id: "gemini",
|
||||
|
||||
150
extensions/google/media-understanding-provider.ts
Normal file
150
extensions/google/media-understanding-provider.ts
Normal file
@@ -0,0 +1,150 @@
|
||||
import { normalizeGoogleModelId } from "../../src/agents/model-id-normalization.js";
|
||||
import { parseGeminiAuth } from "../../src/infra/gemini-auth.js";
|
||||
import { describeImageWithModel } from "../../src/media-understanding/providers/image.js";
|
||||
import {
|
||||
assertOkOrThrowHttpError,
|
||||
normalizeBaseUrl,
|
||||
postJsonRequest,
|
||||
} from "../../src/media-understanding/providers/shared.js";
|
||||
import type {
|
||||
AudioTranscriptionRequest,
|
||||
AudioTranscriptionResult,
|
||||
MediaUnderstandingProvider,
|
||||
VideoDescriptionRequest,
|
||||
VideoDescriptionResult,
|
||||
} from "../../src/media-understanding/types.js";
|
||||
|
||||
// Audio and video requests currently share the same v1beta endpoint root.
const GOOGLE_GENERATIVE_LANGUAGE_V1BETA_URL = "https://generativelanguage.googleapis.com/v1beta";

/** Fallback base URL handed to the request helper for audio transcription. */
export const DEFAULT_GOOGLE_AUDIO_BASE_URL = GOOGLE_GENERATIVE_LANGUAGE_V1BETA_URL;

/** Fallback base URL handed to the request helper for video description. */
export const DEFAULT_GOOGLE_VIDEO_BASE_URL = GOOGLE_GENERATIVE_LANGUAGE_V1BETA_URL;

/** Model id used when an audio request does not name a model. */
const DEFAULT_GOOGLE_AUDIO_MODEL = "gemini-3-flash-preview";

/** Model id used when a video request does not name a model. */
const DEFAULT_GOOGLE_VIDEO_MODEL = "gemini-3-flash-preview";

/** Prompt sent with audio when the request has no (non-blank) prompt. */
const DEFAULT_GOOGLE_AUDIO_PROMPT = "Transcribe the audio.";

/** Prompt sent with video when the request has no (non-blank) prompt. */
const DEFAULT_GOOGLE_VIDEO_PROMPT = "Describe the video.";
|
||||
|
||||
/**
 * Posts one media buffer to a Gemini `:generateContent` endpoint as base64
 * `inline_data` next to a text prompt, and returns the text of the first
 * candidate.
 *
 * Optional request fields (`model`, `prompt`, `mime`) are trimmed; a missing or
 * blank value falls back to the matching `default*` field.
 *
 * @param params - Request data (`buffer`, `mime`, `apiKey`, `baseUrl`,
 *   `headers`, `model`, `prompt`, `timeoutMs`, `fetchFn`) plus the
 *   per-modality fallbacks (`defaultBaseUrl`, `defaultModel`, `defaultPrompt`,
 *   `defaultMime`) and error strings (`httpErrorLabel`, `missingTextError`).
 * @returns The trimmed, newline-joined text parts of the first candidate and
 *   the model id that was placed in the request URL.
 * @throws Error with `missingTextError` when the first candidate carries no
 *   non-empty text. HTTP failures are delegated to `assertOkOrThrowHttpError`
 *   (presumably throws on a non-OK response — confirm in shared.js).
 */
async function generateGeminiInlineDataText(params: {
  buffer: Buffer;
  mime?: string;
  apiKey: string;
  baseUrl?: string;
  headers?: Record<string, string>;
  model?: string;
  prompt?: string;
  timeoutMs: number;
  fetchFn?: typeof fetch;
  defaultBaseUrl: string;
  defaultModel: string;
  defaultPrompt: string;
  defaultMime: string;
  httpErrorLabel: string;
  missingTextError: string;
}): Promise<{ text: string; model: string }> {
  const fetchFn = params.fetchFn ?? fetch;
  const baseUrl = normalizeBaseUrl(params.baseUrl, params.defaultBaseUrl);
  // Private-network access is requested only when the caller supplied a
  // non-blank base URL of their own.
  const allowPrivate = Boolean(params.baseUrl?.trim());

  const requestedModel = params.model?.trim();
  const model = requestedModel ? normalizeGoogleModelId(requestedModel) : params.defaultModel;
  const url = `${baseUrl}/models/${model}:generateContent`;

  // Caller-supplied headers win; auth headers only fill in what is missing.
  const authHeaders = parseGeminiAuth(params.apiKey);
  const headers = new Headers(params.headers);
  for (const [key, value] of Object.entries(authHeaders.headers)) {
    if (!headers.has(key)) {
      headers.set(key, value);
    }
  }

  const prompt = params.prompt?.trim() || params.defaultPrompt;
  // Treat a blank MIME type like a missing one so an empty `mime_type` is
  // never sent (mirrors the model/prompt fallbacks above).
  const mimeType = params.mime?.trim() || params.defaultMime;

  const body = {
    contents: [
      {
        role: "user",
        parts: [
          { text: prompt },
          {
            inline_data: {
              mime_type: mimeType,
              data: params.buffer.toString("base64"),
            },
          },
        ],
      },
    ],
  };

  const { response: res, release } = await postJsonRequest({
    url,
    headers,
    body,
    timeoutMs: params.timeoutMs,
    fetchFn,
    allowPrivateNetwork: allowPrivate,
  });

  try {
    await assertOkOrThrowHttpError(res, params.httpErrorLabel);

    const payload = (await res.json()) as {
      candidates?: Array<{
        content?: { parts?: Array<{ text?: string }> };
      }>;
    };
    // Only the first candidate is consulted.
    const parts = payload.candidates?.[0]?.content?.parts ?? [];
    const text = parts
      .map((part) => part?.text?.trim())
      .filter(Boolean)
      .join("\n");
    if (!text) {
      throw new Error(params.missingTextError);
    }
    return { text, model };
  } finally {
    // Always hand back whatever postJsonRequest acquired, on success or failure.
    await release();
  }
}
|
||||
|
||||
/**
 * Transcribes audio through `generateGeminiInlineDataText` using the
 * audio-specific defaults (base URL, model, prompt, `audio/wav` MIME type) and
 * audio-specific error strings.
 *
 * @param params - The audio transcription request; its fields are forwarded
 *   as-is, with the audio defaults layered on top.
 * @returns The transcribed text and the model id that was used.
 */
export async function transcribeGeminiAudio(
  params: AudioTranscriptionRequest,
): Promise<AudioTranscriptionResult> {
  const audioDefaults = {
    defaultBaseUrl: DEFAULT_GOOGLE_AUDIO_BASE_URL,
    defaultModel: DEFAULT_GOOGLE_AUDIO_MODEL,
    defaultPrompt: DEFAULT_GOOGLE_AUDIO_PROMPT,
    defaultMime: "audio/wav",
    httpErrorLabel: "Audio transcription failed",
    missingTextError: "Audio transcription response missing text",
  };
  const generated = await generateGeminiInlineDataText({ ...params, ...audioDefaults });
  return { text: generated.text, model: generated.model };
}
|
||||
|
||||
/**
 * Describes a video through `generateGeminiInlineDataText` using the
 * video-specific defaults (base URL, model, prompt, `video/mp4` MIME type) and
 * video-specific error strings.
 *
 * @param params - The video description request; its fields are forwarded
 *   as-is, with the video defaults layered on top.
 * @returns The description text and the model id that was used.
 */
export async function describeGeminiVideo(
  params: VideoDescriptionRequest,
): Promise<VideoDescriptionResult> {
  const videoDefaults = {
    defaultBaseUrl: DEFAULT_GOOGLE_VIDEO_BASE_URL,
    defaultModel: DEFAULT_GOOGLE_VIDEO_MODEL,
    defaultPrompt: DEFAULT_GOOGLE_VIDEO_PROMPT,
    defaultMime: "video/mp4",
    httpErrorLabel: "Video description failed",
    missingTextError: "Video description response missing text",
  };
  const generated = await generateGeminiInlineDataText({ ...params, ...videoDefaults });
  return { text: generated.text, model: generated.model };
}
|
||||
|
||||
/**
 * Media-understanding provider descriptor for the "google" provider id.
 *
 * Covers all three capabilities: images go through the shared
 * `describeImageWithModel` helper, audio through `transcribeGeminiAudio`, and
 * video through `describeGeminiVideo`.
 */
export const googleMediaUnderstandingProvider: MediaUnderstandingProvider = {
  id: "google",
  capabilities: ["image", "audio", "video"],
  describeImage: describeImageWithModel,
  transcribeAudio: transcribeGeminiAudio,
  describeVideo: describeGeminiVideo,
};
|
||||
@@ -9,11 +9,11 @@ import {
|
||||
import { ensureAuthProfileStore, listProfilesForProvider } from "../../src/agents/auth-profiles.js";
|
||||
import { MINIMAX_OAUTH_MARKER } from "../../src/agents/model-auth-markers.js";
|
||||
import { fetchMinimaxUsage } from "../../src/infra/provider-usage.fetch.js";
|
||||
import {
|
||||
minimaxPortalProvider,
|
||||
minimaxProvider,
|
||||
} from "../../src/media-understanding/providers/minimax/index.js";
|
||||
import { createProviderApiKeyAuthMethod } from "../../src/plugins/provider-api-key-auth.js";
|
||||
import {
|
||||
minimaxMediaUnderstandingProvider,
|
||||
minimaxPortalMediaUnderstandingProvider,
|
||||
} from "./media-understanding-provider.js";
|
||||
import { loginMiniMaxPortalOAuth, type MiniMaxRegion } from "./oauth.js";
|
||||
import { applyMinimaxApiConfig, applyMinimaxApiConfigCn } from "./onboard.js";
|
||||
import { buildMinimaxPortalProvider, buildMinimaxProvider } from "./provider-catalog.js";
|
||||
@@ -274,8 +274,8 @@ const minimaxPlugin = {
|
||||
],
|
||||
isModernModelRef: ({ modelId }) => isModernMiniMaxModel(modelId),
|
||||
});
|
||||
api.registerMediaUnderstandingProvider(minimaxProvider);
|
||||
api.registerMediaUnderstandingProvider(minimaxPortalProvider);
|
||||
api.registerMediaUnderstandingProvider(minimaxMediaUnderstandingProvider);
|
||||
api.registerMediaUnderstandingProvider(minimaxPortalMediaUnderstandingProvider);
|
||||
},
|
||||
};
|
||||
|
||||
|
||||
14
extensions/minimax/media-understanding-provider.ts
Normal file
14
extensions/minimax/media-understanding-provider.ts
Normal file
@@ -0,0 +1,14 @@
|
||||
import { describeImageWithModel } from "../../src/media-understanding/providers/image.js";
|
||||
import type { MediaUnderstandingProvider } from "../../src/media-understanding/types.js";
|
||||
|
||||
/**
 * Media-understanding provider descriptor for the "minimax" provider id.
 * Image-only; description is delegated to `describeImageWithModel`.
 */
export const minimaxMediaUnderstandingProvider: MediaUnderstandingProvider = {
  id: "minimax",
  capabilities: ["image"],
  describeImage: describeImageWithModel,
};
|
||||
|
||||
/**
 * Media-understanding provider descriptor for the "minimax-portal" provider
 * id. Image-only; description is delegated to `describeImageWithModel`.
 */
export const minimaxPortalMediaUnderstandingProvider: MediaUnderstandingProvider = {
  id: "minimax-portal",
  capabilities: ["image"],
  describeImage: describeImageWithModel,
};
|
||||
@@ -1,6 +1,6 @@
|
||||
import { emptyPluginConfigSchema, type OpenClawPluginApi } from "openclaw/plugin-sdk/core";
|
||||
import { mistralProvider } from "../../src/media-understanding/providers/mistral/index.js";
|
||||
import { createProviderApiKeyAuthMethod } from "../../src/plugins/provider-api-key-auth.js";
|
||||
import { mistralMediaUnderstandingProvider } from "./media-understanding-provider.js";
|
||||
import { applyMistralConfig, MISTRAL_DEFAULT_MODEL_REF } from "./onboard.js";
|
||||
|
||||
const PROVIDER_ID = "mistral";
|
||||
@@ -51,7 +51,7 @@ const mistralPlugin = {
|
||||
],
|
||||
},
|
||||
});
|
||||
api.registerMediaUnderstandingProvider(mistralProvider);
|
||||
api.registerMediaUnderstandingProvider(mistralMediaUnderstandingProvider);
|
||||
},
|
||||
};
|
||||
|
||||
|
||||
17
extensions/mistral/media-understanding-provider.ts
Normal file
17
extensions/mistral/media-understanding-provider.ts
Normal file
@@ -0,0 +1,17 @@
|
||||
import { transcribeOpenAiCompatibleAudio } from "../../src/media-understanding/providers/openai-compatible-audio.js";
|
||||
import type { MediaUnderstandingProvider } from "../../src/media-understanding/types.js";
|
||||
|
||||
/** Base URL used when a Mistral audio request does not carry its own `baseUrl`. */
const DEFAULT_MISTRAL_AUDIO_BASE_URL = "https://api.mistral.ai/v1";

/** Model id passed as the default for Mistral audio transcription. */
const DEFAULT_MISTRAL_AUDIO_MODEL = "voxtral-mini-latest";

/**
 * Media-understanding provider descriptor for the "mistral" provider id.
 *
 * Audio-only: transcription is forwarded to the OpenAI-compatible audio
 * helper with the Mistral base URL and model defaults.
 */
export const mistralMediaUnderstandingProvider: MediaUnderstandingProvider = {
  id: "mistral",
  capabilities: ["audio"],
  transcribeAudio: async (req) => {
    // The resolved base URL is passed explicitly in addition to the default.
    const baseUrl = req.baseUrl ?? DEFAULT_MISTRAL_AUDIO_BASE_URL;
    return await transcribeOpenAiCompatibleAudio({
      ...req,
      baseUrl,
      defaultBaseUrl: DEFAULT_MISTRAL_AUDIO_BASE_URL,
      defaultModel: DEFAULT_MISTRAL_AUDIO_MODEL,
    });
  },
};
|
||||
@@ -7,10 +7,10 @@ import {
|
||||
getScopedCredentialValue,
|
||||
setScopedCredentialValue,
|
||||
} from "../../src/agents/tools/web-search-plugin-factory.js";
|
||||
import { moonshotProvider } from "../../src/media-understanding/providers/moonshot/index.js";
|
||||
import { emptyPluginConfigSchema } from "../../src/plugins/config-schema.js";
|
||||
import { createProviderApiKeyAuthMethod } from "../../src/plugins/provider-api-key-auth.js";
|
||||
import type { OpenClawPluginApi } from "../../src/plugins/types.js";
|
||||
import { moonshotMediaUnderstandingProvider } from "./media-understanding-provider.js";
|
||||
import {
|
||||
applyMoonshotConfig,
|
||||
applyMoonshotConfigCn,
|
||||
@@ -100,7 +100,7 @@ const moonshotPlugin = {
|
||||
return createMoonshotThinkingWrapper(ctx.streamFn, thinkingType);
|
||||
},
|
||||
});
|
||||
api.registerMediaUnderstandingProvider(moonshotProvider);
|
||||
api.registerMediaUnderstandingProvider(moonshotMediaUnderstandingProvider);
|
||||
api.registerWebSearchProvider(
|
||||
createPluginBackedWebSearchProvider({
|
||||
id: "kimi",
|
||||
|
||||
@@ -1,5 +1,14 @@
|
||||
import type { VideoDescriptionRequest, VideoDescriptionResult } from "../../types.js";
|
||||
import { assertOkOrThrowHttpError, normalizeBaseUrl, postJsonRequest } from "../shared.js";
|
||||
import { describeImageWithModel } from "../../src/media-understanding/providers/image.js";
|
||||
import {
|
||||
assertOkOrThrowHttpError,
|
||||
normalizeBaseUrl,
|
||||
postJsonRequest,
|
||||
} from "../../src/media-understanding/providers/shared.js";
|
||||
import type {
|
||||
MediaUnderstandingProvider,
|
||||
VideoDescriptionRequest,
|
||||
VideoDescriptionResult,
|
||||
} from "../../src/media-understanding/types.js";
|
||||
|
||||
export const DEFAULT_MOONSHOT_VIDEO_BASE_URL = "https://api.moonshot.ai/v1";
|
||||
const DEFAULT_MOONSHOT_VIDEO_MODEL = "kimi-k2.5";
|
||||
@@ -104,3 +113,10 @@ export async function describeMoonshotVideo(
|
||||
await release();
|
||||
}
|
||||
}
|
||||
|
||||
/**
 * Media-understanding provider descriptor for the "moonshot" provider id.
 *
 * Images are handled by the shared `describeImageWithModel` helper and video
 * by `describeMoonshotVideo`.
 */
export const moonshotMediaUnderstandingProvider: MediaUnderstandingProvider = {
  id: "moonshot",
  capabilities: ["image", "video"],
  describeImage: describeImageWithModel,
  describeVideo: describeMoonshotVideo,
};
|
||||
@@ -1,6 +1,6 @@
|
||||
import { emptyPluginConfigSchema, type OpenClawPluginApi } from "openclaw/plugin-sdk/core";
|
||||
import { openaiProvider } from "../../src/media-understanding/providers/openai/index.js";
|
||||
import { buildOpenAISpeechProvider } from "../../src/tts/providers/openai.js";
|
||||
import { openaiMediaUnderstandingProvider } from "./media-understanding-provider.js";
|
||||
import { buildOpenAICodexProviderPlugin } from "./openai-codex-provider.js";
|
||||
import { buildOpenAIProvider } from "./openai-provider.js";
|
||||
|
||||
@@ -13,7 +13,7 @@ const openAIPlugin = {
|
||||
api.registerProvider(buildOpenAIProvider());
|
||||
api.registerProvider(buildOpenAICodexProviderPlugin());
|
||||
api.registerSpeechProvider(buildOpenAISpeechProvider());
|
||||
api.registerMediaUnderstandingProvider(openaiProvider);
|
||||
api.registerMediaUnderstandingProvider(openaiMediaUnderstandingProvider);
|
||||
},
|
||||
};
|
||||
|
||||
|
||||
23
extensions/openai/media-understanding-provider.ts
Normal file
23
extensions/openai/media-understanding-provider.ts
Normal file
@@ -0,0 +1,23 @@
|
||||
import { describeImageWithModel } from "../../src/media-understanding/providers/image.js";
|
||||
import { transcribeOpenAiCompatibleAudio } from "../../src/media-understanding/providers/openai-compatible-audio.js";
|
||||
import type { MediaUnderstandingProvider } from "../../src/media-understanding/types.js";
|
||||
|
||||
/** Base URL passed as the default for OpenAI audio transcription requests. */
export const DEFAULT_OPENAI_AUDIO_BASE_URL = "https://api.openai.com/v1";

/** Model id passed as the default for OpenAI audio transcription. */
const DEFAULT_OPENAI_AUDIO_MODEL = "gpt-4o-mini-transcribe";

/**
 * Transcribes audio via the OpenAI-compatible audio helper, supplying the
 * OpenAI base URL and model as defaults.
 *
 * @param params - The audio transcription request, forwarded unchanged.
 * @returns The result produced by `transcribeOpenAiCompatibleAudio`.
 */
export async function transcribeOpenAiAudio(
  params: import("../../src/media-understanding/types.js").AudioTranscriptionRequest,
): Promise<import("../../src/media-understanding/types.js").AudioTranscriptionResult> {
  return await transcribeOpenAiCompatibleAudio({
    ...params,
    defaultBaseUrl: DEFAULT_OPENAI_AUDIO_BASE_URL,
    defaultModel: DEFAULT_OPENAI_AUDIO_MODEL,
  });
}
|
||||
|
||||
/**
 * Media-understanding provider descriptor for the "openai" provider id.
 *
 * Images are handled by the shared `describeImageWithModel` helper and audio
 * by `transcribeOpenAiAudio`.
 */
export const openaiMediaUnderstandingProvider: MediaUnderstandingProvider = {
  id: "openai",
  capabilities: ["image", "audio"],
  describeImage: describeImageWithModel,
  transcribeAudio: transcribeOpenAiAudio,
};
|
||||
@@ -106,6 +106,15 @@ export function createPluginRuntimeMock(overrides: DeepPartial<PluginRuntime> =
|
||||
textToSpeechTelephony: vi.fn() as unknown as PluginRuntime["tts"]["textToSpeechTelephony"],
|
||||
listVoices: vi.fn() as unknown as PluginRuntime["tts"]["listVoices"],
|
||||
},
|
||||
mediaUnderstanding: {
|
||||
runFile: vi.fn() as unknown as PluginRuntime["mediaUnderstanding"]["runFile"],
|
||||
describeImageFile:
|
||||
vi.fn() as unknown as PluginRuntime["mediaUnderstanding"]["describeImageFile"],
|
||||
describeVideoFile:
|
||||
vi.fn() as unknown as PluginRuntime["mediaUnderstanding"]["describeVideoFile"],
|
||||
transcribeAudioFile:
|
||||
vi.fn() as unknown as PluginRuntime["mediaUnderstanding"]["transcribeAudioFile"],
|
||||
},
|
||||
stt: {
|
||||
transcribeAudioFile: vi.fn() as unknown as PluginRuntime["stt"]["transcribeAudioFile"],
|
||||
},
|
||||
|
||||
@@ -24,9 +24,9 @@ import { applyAuthProfileConfig } from "../../src/commands/onboard-auth.js";
|
||||
import type { SecretInput } from "../../src/config/types.secrets.js";
|
||||
import { resolveRequiredHomeDir } from "../../src/infra/home-dir.js";
|
||||
import { fetchZaiUsage } from "../../src/infra/provider-usage.fetch.js";
|
||||
import { zaiProvider } from "../../src/media-understanding/providers/zai/index.js";
|
||||
import { normalizeOptionalSecretInput } from "../../src/utils/normalize-secret-input.js";
|
||||
import { detectZaiEndpoint, type ZaiEndpointId } from "./detect.js";
|
||||
import { zaiMediaUnderstandingProvider } from "./media-understanding-provider.js";
|
||||
import { applyZaiConfig, applyZaiProviderConfig, ZAI_DEFAULT_MODEL_REF } from "./onboard.js";
|
||||
|
||||
const PROVIDER_ID = "zai";
|
||||
@@ -335,7 +335,7 @@ const zaiPlugin = {
|
||||
fetchUsageSnapshot: async (ctx) => await fetchZaiUsage(ctx.token, ctx.timeoutMs, ctx.fetchFn),
|
||||
isCacheTtlEligible: () => true,
|
||||
});
|
||||
api.registerMediaUnderstandingProvider(zaiProvider);
|
||||
api.registerMediaUnderstandingProvider(zaiMediaUnderstandingProvider);
|
||||
},
|
||||
};
|
||||
|
||||
|
||||
8
extensions/zai/media-understanding-provider.ts
Normal file
8
extensions/zai/media-understanding-provider.ts
Normal file
@@ -0,0 +1,8 @@
|
||||
import { describeImageWithModel } from "../../src/media-understanding/providers/image.js";
|
||||
import type { MediaUnderstandingProvider } from "../../src/media-understanding/types.js";
|
||||
|
||||
/**
 * Media-understanding provider descriptor for the "zai" provider id.
 * Image-only; description is delegated to `describeImageWithModel`.
 */
export const zaiMediaUnderstandingProvider: MediaUnderstandingProvider = {
  id: "zai",
  capabilities: ["image"],
  describeImage: describeImageWithModel,
};
|
||||
@@ -1,8 +0,0 @@
|
||||
import type { MediaUnderstandingProvider } from "../../types.js";
|
||||
import { describeImageWithModel } from "../image.js";
|
||||
|
||||
/**
 * Media-understanding provider descriptor for the "anthropic" provider id.
 * Image-only; description is delegated to `describeImageWithModel`.
 */
export const anthropicProvider: MediaUnderstandingProvider = {
  id: "anthropic",
  capabilities: ["image"],
  describeImage: describeImageWithModel,
};
|
||||
@@ -1,21 +0,0 @@
|
||||
import type { AudioTranscriptionRequest, AudioTranscriptionResult } from "../../types.js";
|
||||
import { generateGeminiInlineDataText } from "./inline-data.js";
|
||||
|
||||
/** Fallback base URL handed to the request helper for audio transcription. */
export const DEFAULT_GOOGLE_AUDIO_BASE_URL = "https://generativelanguage.googleapis.com/v1beta";

/** Model id used when an audio request does not name a model. */
const DEFAULT_GOOGLE_AUDIO_MODEL = "gemini-3-flash-preview";

/** Prompt sent with audio when the request has no (non-blank) prompt. */
const DEFAULT_GOOGLE_AUDIO_PROMPT = "Transcribe the audio.";

/**
 * Transcribes audio through `generateGeminiInlineDataText` using the
 * audio-specific defaults and error strings.
 *
 * @param params - The audio transcription request; forwarded as-is with the
 *   audio defaults layered on top.
 * @returns The transcribed text and the model id that was used.
 */
export async function transcribeGeminiAudio(
  params: AudioTranscriptionRequest,
): Promise<AudioTranscriptionResult> {
  const audioDefaults = {
    defaultBaseUrl: DEFAULT_GOOGLE_AUDIO_BASE_URL,
    defaultModel: DEFAULT_GOOGLE_AUDIO_MODEL,
    defaultPrompt: DEFAULT_GOOGLE_AUDIO_PROMPT,
    defaultMime: "audio/wav",
    httpErrorLabel: "Audio transcription failed",
    missingTextError: "Audio transcription response missing text",
  };
  const generated = await generateGeminiInlineDataText({ ...params, ...audioDefaults });
  return { text: generated.text, model: generated.model };
}
|
||||
@@ -1,12 +0,0 @@
|
||||
import type { MediaUnderstandingProvider } from "../../types.js";
|
||||
import { describeImageWithModel } from "../image.js";
|
||||
import { transcribeGeminiAudio } from "./audio.js";
|
||||
import { describeGeminiVideo } from "./video.js";
|
||||
|
||||
/**
 * Media-understanding provider descriptor for the "google" provider id.
 *
 * Images go through `describeImageWithModel`, audio through
 * `transcribeGeminiAudio`, and video through `describeGeminiVideo`.
 */
export const googleProvider: MediaUnderstandingProvider = {
  id: "google",
  capabilities: ["image", "audio", "video"],
  describeImage: describeImageWithModel,
  transcribeAudio: transcribeGeminiAudio,
  describeVideo: describeGeminiVideo,
};
|
||||
@@ -1,93 +0,0 @@
|
||||
import { normalizeGoogleModelId } from "../../../agents/model-id-normalization.js";
|
||||
import { parseGeminiAuth } from "../../../infra/gemini-auth.js";
|
||||
import { assertOkOrThrowHttpError, normalizeBaseUrl, postJsonRequest } from "../shared.js";
|
||||
|
||||
/**
 * Posts one media buffer to a Gemini `:generateContent` endpoint as base64
 * `inline_data` next to a text prompt, and returns the text of the first
 * candidate.
 *
 * Optional request fields (`model`, `prompt`, `mime`) are trimmed; a missing or
 * blank value falls back to the matching `default*` field.
 *
 * @param params - Request data plus the per-modality fallbacks
 *   (`defaultBaseUrl`, `defaultModel`, `defaultPrompt`, `defaultMime`) and
 *   error strings (`httpErrorLabel`, `missingTextError`).
 * @returns The trimmed, newline-joined text parts of the first candidate and
 *   the model id that was placed in the request URL.
 * @throws Error with `missingTextError` when the first candidate carries no
 *   non-empty text. HTTP failures are delegated to `assertOkOrThrowHttpError`
 *   (presumably throws on a non-OK response — confirm in shared.js).
 */
export async function generateGeminiInlineDataText(params: {
  buffer: Buffer;
  mime?: string;
  apiKey: string;
  baseUrl?: string;
  headers?: Record<string, string>;
  model?: string;
  prompt?: string;
  timeoutMs: number;
  fetchFn?: typeof fetch;
  defaultBaseUrl: string;
  defaultModel: string;
  defaultPrompt: string;
  defaultMime: string;
  httpErrorLabel: string;
  missingTextError: string;
}): Promise<{ text: string; model: string }> {
  const fetchFn = params.fetchFn ?? fetch;
  const baseUrl = normalizeBaseUrl(params.baseUrl, params.defaultBaseUrl);
  // Private-network access is requested only when the caller supplied a
  // non-blank base URL of their own.
  const allowPrivate = Boolean(params.baseUrl?.trim());

  const requestedModel = params.model?.trim();
  const model = requestedModel ? normalizeGoogleModelId(requestedModel) : params.defaultModel;
  const url = `${baseUrl}/models/${model}:generateContent`;

  // Caller-supplied headers win; auth headers only fill in what is missing.
  const authHeaders = parseGeminiAuth(params.apiKey);
  const headers = new Headers(params.headers);
  for (const [key, value] of Object.entries(authHeaders.headers)) {
    if (!headers.has(key)) {
      headers.set(key, value);
    }
  }

  const prompt = params.prompt?.trim() || params.defaultPrompt;
  // Treat a blank MIME type like a missing one so an empty `mime_type` is
  // never sent (mirrors the model/prompt fallbacks above).
  const mimeType = params.mime?.trim() || params.defaultMime;

  const body = {
    contents: [
      {
        role: "user",
        parts: [
          { text: prompt },
          {
            inline_data: {
              mime_type: mimeType,
              data: params.buffer.toString("base64"),
            },
          },
        ],
      },
    ],
  };

  const { response: res, release } = await postJsonRequest({
    url,
    headers,
    body,
    timeoutMs: params.timeoutMs,
    fetchFn,
    allowPrivateNetwork: allowPrivate,
  });

  try {
    await assertOkOrThrowHttpError(res, params.httpErrorLabel);

    const payload = (await res.json()) as {
      candidates?: Array<{
        content?: { parts?: Array<{ text?: string }> };
      }>;
    };
    // Only the first candidate is consulted.
    const parts = payload.candidates?.[0]?.content?.parts ?? [];
    const text = parts
      .map((part) => part?.text?.trim())
      .filter(Boolean)
      .join("\n");
    if (!text) {
      throw new Error(params.missingTextError);
    }
    return { text, model };
  } finally {
    // Always hand back whatever postJsonRequest acquired, on success or failure.
    await release();
  }
}
|
||||
@@ -1,8 +1,8 @@
|
||||
import { afterEach, beforeEach, describe, expect, it, vi } from "vitest";
|
||||
import { describeGeminiVideo } from "../../../../extensions/google/media-understanding-provider.js";
|
||||
import * as ssrf from "../../../infra/net/ssrf.js";
|
||||
import { withFetchPreconnect } from "../../../test-utils/fetch-mock.js";
|
||||
import { createRequestCaptureJsonFetch } from "../audio.test-helpers.js";
|
||||
import { describeGeminiVideo } from "./video.js";
|
||||
|
||||
const TEST_NET_IP = "203.0.113.10";
|
||||
|
||||
|
||||
@@ -1,21 +0,0 @@
|
||||
import type { VideoDescriptionRequest, VideoDescriptionResult } from "../../types.js";
|
||||
import { generateGeminiInlineDataText } from "./inline-data.js";
|
||||
|
||||
/** Fallback base URL handed to the request helper for video description. */
export const DEFAULT_GOOGLE_VIDEO_BASE_URL = "https://generativelanguage.googleapis.com/v1beta";

/** Model id used when a video request does not name a model. */
const DEFAULT_GOOGLE_VIDEO_MODEL = "gemini-3-flash-preview";

/** Prompt sent with video when the request has no (non-blank) prompt. */
const DEFAULT_GOOGLE_VIDEO_PROMPT = "Describe the video.";

/**
 * Describes a video through `generateGeminiInlineDataText` using the
 * video-specific defaults and error strings.
 *
 * @param params - The video description request; forwarded as-is with the
 *   video defaults layered on top.
 * @returns The description text and the model id that was used.
 */
export async function describeGeminiVideo(
  params: VideoDescriptionRequest,
): Promise<VideoDescriptionResult> {
  const videoDefaults = {
    defaultBaseUrl: DEFAULT_GOOGLE_VIDEO_BASE_URL,
    defaultModel: DEFAULT_GOOGLE_VIDEO_MODEL,
    defaultPrompt: DEFAULT_GOOGLE_VIDEO_PROMPT,
    defaultMime: "video/mp4",
    httpErrorLabel: "Video description failed",
    missingTextError: "Video description response missing text",
  };
  const generated = await generateGeminiInlineDataText({ ...params, ...videoDefaults });
  return { text: generated.text, model: generated.model };
}
|
||||
@@ -1,7 +1,8 @@
|
||||
import type { MediaUnderstandingProvider } from "../../types.js";
|
||||
import { transcribeOpenAiCompatibleAudio } from "../openai/audio.js";
|
||||
import { transcribeOpenAiCompatibleAudio } from "../openai-compatible-audio.js";
|
||||
|
||||
/** Base URL used when a Groq audio request does not carry its own `baseUrl`. */
const DEFAULT_GROQ_AUDIO_BASE_URL = "https://api.groq.com/openai/v1";

/** Model id passed as the default for Groq audio transcription. */
const DEFAULT_GROQ_AUDIO_MODEL = "whisper-large-v3-turbo";
|
||||
|
||||
export const groqProvider: MediaUnderstandingProvider = {
|
||||
id: "groq",
|
||||
@@ -10,5 +11,7 @@ export const groqProvider: MediaUnderstandingProvider = {
|
||||
transcribeOpenAiCompatibleAudio({
|
||||
...req,
|
||||
baseUrl: req.baseUrl ?? DEFAULT_GROQ_AUDIO_BASE_URL,
|
||||
defaultBaseUrl: DEFAULT_GROQ_AUDIO_BASE_URL,
|
||||
defaultModel: DEFAULT_GROQ_AUDIO_MODEL,
|
||||
}),
|
||||
};
|
||||
|
||||
@@ -8,35 +8,15 @@ describe("media-understanding provider registry", () => {
|
||||
setActivePluginRegistry(createEmptyPluginRegistry());
|
||||
});
|
||||
|
||||
it("registers the Mistral provider", () => {
|
||||
it("keeps core-owned fallback providers registered by default", () => {
|
||||
const registry = buildMediaUnderstandingRegistry();
|
||||
const provider = getMediaUnderstandingProvider("mistral", registry);
|
||||
const groqProvider = getMediaUnderstandingProvider("groq", registry);
|
||||
const deepgramProvider = getMediaUnderstandingProvider("deepgram", registry);
|
||||
|
||||
expect(provider?.id).toBe("mistral");
|
||||
expect(provider?.capabilities).toEqual(["audio"]);
|
||||
});
|
||||
|
||||
it("keeps provider id normalization behavior", () => {
|
||||
const registry = buildMediaUnderstandingRegistry();
|
||||
const provider = getMediaUnderstandingProvider("gemini", registry);
|
||||
|
||||
expect(provider?.id).toBe("google");
|
||||
});
|
||||
|
||||
it("registers the Moonshot provider", () => {
|
||||
const registry = buildMediaUnderstandingRegistry();
|
||||
const provider = getMediaUnderstandingProvider("moonshot", registry);
|
||||
|
||||
expect(provider?.id).toBe("moonshot");
|
||||
expect(provider?.capabilities).toEqual(["image", "video"]);
|
||||
});
|
||||
|
||||
it("registers the minimax portal provider", () => {
|
||||
const registry = buildMediaUnderstandingRegistry();
|
||||
const provider = getMediaUnderstandingProvider("minimax-portal", registry);
|
||||
|
||||
expect(provider?.id).toBe("minimax-portal");
|
||||
expect(provider?.capabilities).toEqual(["image"]);
|
||||
expect(groqProvider?.id).toBe("groq");
|
||||
expect(groqProvider?.capabilities).toEqual(["audio"]);
|
||||
expect(deepgramProvider?.id).toBe("deepgram");
|
||||
expect(deepgramProvider?.capabilities).toEqual(["audio"]);
|
||||
});
|
||||
|
||||
it("merges plugin-registered media providers into the active registry", async () => {
|
||||
@@ -61,4 +41,23 @@ describe("media-understanding provider registry", () => {
|
||||
expect(provider?.id).toBe("google");
|
||||
expect(await provider?.describeVideo?.({} as never)).toEqual({ text: "plugin video" });
|
||||
});
|
||||
|
||||
it("keeps provider id normalization behavior for plugin-owned providers", () => {
|
||||
const pluginRegistry = createEmptyPluginRegistry();
|
||||
pluginRegistry.mediaUnderstandingProviders.push({
|
||||
pluginId: "google",
|
||||
pluginName: "Google Plugin",
|
||||
source: "test",
|
||||
provider: {
|
||||
id: "google",
|
||||
capabilities: ["image", "audio", "video"],
|
||||
},
|
||||
});
|
||||
setActivePluginRegistry(pluginRegistry);
|
||||
|
||||
const registry = buildMediaUnderstandingRegistry();
|
||||
const provider = getMediaUnderstandingProvider("gemini", registry);
|
||||
|
||||
expect(provider?.id).toBe("google");
|
||||
});
|
||||
});
|
||||
|
||||
@@ -1,28 +1,10 @@
|
||||
import { normalizeProviderId } from "../../agents/model-selection.js";
|
||||
import { getActivePluginRegistry } from "../../plugins/runtime.js";
|
||||
import type { MediaUnderstandingProvider } from "../types.js";
|
||||
import { anthropicProvider } from "./anthropic/index.js";
|
||||
import { deepgramProvider } from "./deepgram/index.js";
|
||||
import { googleProvider } from "./google/index.js";
|
||||
import { groqProvider } from "./groq/index.js";
|
||||
import { minimaxPortalProvider, minimaxProvider } from "./minimax/index.js";
|
||||
import { mistralProvider } from "./mistral/index.js";
|
||||
import { moonshotProvider } from "./moonshot/index.js";
|
||||
import { openaiProvider } from "./openai/index.js";
|
||||
import { zaiProvider } from "./zai/index.js";
|
||||
|
||||
const PROVIDERS: MediaUnderstandingProvider[] = [
|
||||
groqProvider,
|
||||
openaiProvider,
|
||||
googleProvider,
|
||||
anthropicProvider,
|
||||
minimaxProvider,
|
||||
minimaxPortalProvider,
|
||||
moonshotProvider,
|
||||
mistralProvider,
|
||||
zaiProvider,
|
||||
deepgramProvider,
|
||||
];
|
||||
const PROVIDERS: MediaUnderstandingProvider[] = [groqProvider, deepgramProvider];
|
||||
|
||||
function mergeProviderIntoRegistry(
|
||||
registry: Map<string, MediaUnderstandingProvider>,
|
||||
|
||||
@@ -1,14 +0,0 @@
|
||||
import type { MediaUnderstandingProvider } from "../../types.js";
|
||||
import { describeImageWithModel } from "../image.js";
|
||||
|
||||
/**
 * Media-understanding provider descriptor for the "minimax" provider id.
 * Image-only; description is delegated to `describeImageWithModel`.
 */
export const minimaxProvider: MediaUnderstandingProvider = {
  id: "minimax",
  capabilities: ["image"],
  describeImage: describeImageWithModel,
};
|
||||
|
||||
/**
 * Media-understanding provider descriptor for the "minimax-portal" provider
 * id. Image-only; description is delegated to `describeImageWithModel`.
 */
export const minimaxPortalProvider: MediaUnderstandingProvider = {
  id: "minimax-portal",
  capabilities: ["image"],
  describeImage: describeImageWithModel,
};
|
||||
@@ -1,23 +1,23 @@
|
||||
import { describe, expect, it } from "vitest";
|
||||
import { mistralMediaUnderstandingProvider } from "../../../../extensions/mistral/media-understanding-provider.js";
|
||||
import {
|
||||
createRequestCaptureJsonFetch,
|
||||
installPinnedHostnameTestHooks,
|
||||
} from "../audio.test-helpers.js";
|
||||
import { mistralProvider } from "./index.js";
|
||||
|
||||
installPinnedHostnameTestHooks();
|
||||
|
||||
describe("mistralProvider", () => {
|
||||
describe("mistralMediaUnderstandingProvider", () => {
|
||||
it("has expected provider metadata", () => {
|
||||
expect(mistralProvider.id).toBe("mistral");
|
||||
expect(mistralProvider.capabilities).toEqual(["audio"]);
|
||||
expect(mistralProvider.transcribeAudio).toBeDefined();
|
||||
expect(mistralMediaUnderstandingProvider.id).toBe("mistral");
|
||||
expect(mistralMediaUnderstandingProvider.capabilities).toEqual(["audio"]);
|
||||
expect(mistralMediaUnderstandingProvider.transcribeAudio).toBeDefined();
|
||||
});
|
||||
|
||||
it("uses Mistral base URL by default", async () => {
|
||||
const { fetchFn, getRequest } = createRequestCaptureJsonFetch({ text: "bonjour" });
|
||||
|
||||
const result = await mistralProvider.transcribeAudio!({
|
||||
const result = await mistralMediaUnderstandingProvider.transcribeAudio!({
|
||||
buffer: Buffer.from("audio-bytes"),
|
||||
fileName: "voice.ogg",
|
||||
apiKey: "test-mistral-key", // pragma: allowlist secret
|
||||
@@ -32,7 +32,7 @@ describe("mistralProvider", () => {
|
||||
it("allows overriding baseUrl", async () => {
|
||||
const { fetchFn, getRequest } = createRequestCaptureJsonFetch({ text: "ok" });
|
||||
|
||||
await mistralProvider.transcribeAudio!({
|
||||
await mistralMediaUnderstandingProvider.transcribeAudio!({
|
||||
buffer: Buffer.from("audio"),
|
||||
fileName: "note.mp3",
|
||||
apiKey: "key", // pragma: allowlist secret
|
||||
|
||||
@@ -1,14 +0,0 @@
|
||||
import type { MediaUnderstandingProvider } from "../../types.js";
|
||||
import { transcribeOpenAiCompatibleAudio } from "../openai/audio.js";
|
||||
|
||||
const DEFAULT_MISTRAL_AUDIO_BASE_URL = "https://api.mistral.ai/v1";
|
||||
|
||||
export const mistralProvider: MediaUnderstandingProvider = {
|
||||
id: "mistral",
|
||||
capabilities: ["audio"],
|
||||
transcribeAudio: (req) =>
|
||||
transcribeOpenAiCompatibleAudio({
|
||||
...req,
|
||||
baseUrl: req.baseUrl ?? DEFAULT_MISTRAL_AUDIO_BASE_URL,
|
||||
}),
|
||||
};
|
||||
@@ -1,10 +0,0 @@
|
||||
import type { MediaUnderstandingProvider } from "../../types.js";
|
||||
import { describeImageWithModel } from "../image.js";
|
||||
import { describeMoonshotVideo } from "./video.js";
|
||||
|
||||
export const moonshotProvider: MediaUnderstandingProvider = {
|
||||
id: "moonshot",
|
||||
capabilities: ["image", "video"],
|
||||
describeImage: describeImageWithModel,
|
||||
describeVideo: describeMoonshotVideo,
|
||||
};
|
||||
@@ -1,9 +1,9 @@
|
||||
import { describe, expect, it } from "vitest";
|
||||
import { describeMoonshotVideo } from "../../../../extensions/moonshot/media-understanding-provider.js";
|
||||
import {
|
||||
createRequestCaptureJsonFetch,
|
||||
installPinnedHostnameTestHooks,
|
||||
} from "../audio.test-helpers.js";
|
||||
import { describeMoonshotVideo } from "./video.js";
|
||||
|
||||
installPinnedHostnameTestHooks();
|
||||
|
||||
|
||||
@@ -1,29 +1,31 @@
|
||||
import path from "node:path";
|
||||
import type { AudioTranscriptionRequest, AudioTranscriptionResult } from "../../types.js";
|
||||
import type { AudioTranscriptionRequest, AudioTranscriptionResult } from "../types.js";
|
||||
import {
|
||||
assertOkOrThrowHttpError,
|
||||
normalizeBaseUrl,
|
||||
postTranscriptionRequest,
|
||||
requireTranscriptionText,
|
||||
} from "../shared.js";
|
||||
} from "./shared.js";
|
||||
|
||||
export const DEFAULT_OPENAI_AUDIO_BASE_URL = "https://api.openai.com/v1";
|
||||
const DEFAULT_OPENAI_AUDIO_MODEL = "gpt-4o-mini-transcribe";
|
||||
type OpenAiCompatibleAudioParams = AudioTranscriptionRequest & {
|
||||
defaultBaseUrl: string;
|
||||
defaultModel: string;
|
||||
};
|
||||
|
||||
function resolveModel(model?: string): string {
|
||||
function resolveModel(model: string | undefined, fallback: string): string {
|
||||
const trimmed = model?.trim();
|
||||
return trimmed || DEFAULT_OPENAI_AUDIO_MODEL;
|
||||
return trimmed || fallback;
|
||||
}
|
||||
|
||||
export async function transcribeOpenAiCompatibleAudio(
|
||||
params: AudioTranscriptionRequest,
|
||||
params: OpenAiCompatibleAudioParams,
|
||||
): Promise<AudioTranscriptionResult> {
|
||||
const fetchFn = params.fetchFn ?? fetch;
|
||||
const baseUrl = normalizeBaseUrl(params.baseUrl, DEFAULT_OPENAI_AUDIO_BASE_URL);
|
||||
const baseUrl = normalizeBaseUrl(params.baseUrl, params.defaultBaseUrl);
|
||||
const allowPrivate = Boolean(params.baseUrl?.trim());
|
||||
const url = `${baseUrl}/audio/transcriptions`;
|
||||
|
||||
const model = resolveModel(params.model);
|
||||
const model = resolveModel(params.model, params.defaultModel);
|
||||
const form = new FormData();
|
||||
const fileName = params.fileName?.trim() || path.basename(params.fileName) || "audio";
|
||||
const bytes = new Uint8Array(params.buffer);
|
||||
@@ -1,18 +1,18 @@
|
||||
import { describe, expect, it } from "vitest";
|
||||
import { transcribeOpenAiAudio } from "../../../../extensions/openai/media-understanding-provider.js";
|
||||
import {
|
||||
createAuthCaptureJsonFetch,
|
||||
createRequestCaptureJsonFetch,
|
||||
installPinnedHostnameTestHooks,
|
||||
} from "../audio.test-helpers.js";
|
||||
import { transcribeOpenAiCompatibleAudio } from "./audio.js";
|
||||
|
||||
installPinnedHostnameTestHooks();
|
||||
|
||||
describe("transcribeOpenAiCompatibleAudio", () => {
|
||||
describe("transcribeOpenAiAudio", () => {
|
||||
it("respects lowercase authorization header overrides", async () => {
|
||||
const { fetchFn, getAuthHeader } = createAuthCaptureJsonFetch({ text: "ok" });
|
||||
|
||||
const result = await transcribeOpenAiCompatibleAudio({
|
||||
const result = await transcribeOpenAiAudio({
|
||||
buffer: Buffer.from("audio"),
|
||||
fileName: "note.mp3",
|
||||
apiKey: "test-key",
|
||||
@@ -28,7 +28,7 @@ describe("transcribeOpenAiCompatibleAudio", () => {
|
||||
it("builds the expected request payload", async () => {
|
||||
const { fetchFn, getRequest } = createRequestCaptureJsonFetch({ text: "hello" });
|
||||
|
||||
const result = await transcribeOpenAiCompatibleAudio({
|
||||
const result = await transcribeOpenAiAudio({
|
||||
buffer: Buffer.from("audio-bytes"),
|
||||
fileName: "voice.wav",
|
||||
apiKey: "test-key",
|
||||
@@ -72,7 +72,7 @@ describe("transcribeOpenAiCompatibleAudio", () => {
|
||||
const { fetchFn } = createRequestCaptureJsonFetch({});
|
||||
|
||||
await expect(
|
||||
transcribeOpenAiCompatibleAudio({
|
||||
transcribeOpenAiAudio({
|
||||
buffer: Buffer.from("audio-bytes"),
|
||||
fileName: "voice.wav",
|
||||
apiKey: "test-key",
|
||||
|
||||
@@ -1,10 +0,0 @@
|
||||
import type { MediaUnderstandingProvider } from "../../types.js";
|
||||
import { describeImageWithModel } from "../image.js";
|
||||
import { transcribeOpenAiCompatibleAudio } from "./audio.js";
|
||||
|
||||
export const openaiProvider: MediaUnderstandingProvider = {
|
||||
id: "openai",
|
||||
capabilities: ["image", "audio"],
|
||||
describeImage: describeImageWithModel,
|
||||
transcribeAudio: transcribeOpenAiCompatibleAudio,
|
||||
};
|
||||
@@ -1,8 +0,0 @@
|
||||
import type { MediaUnderstandingProvider } from "../../types.js";
|
||||
import { describeImageWithModel } from "../image.js";
|
||||
|
||||
export const zaiProvider: MediaUnderstandingProvider = {
|
||||
id: "zai",
|
||||
capabilities: ["image"],
|
||||
describeImage: describeImageWithModel,
|
||||
};
|
||||
92
src/media-understanding/runtime.test.ts
Normal file
92
src/media-understanding/runtime.test.ts
Normal file
@@ -0,0 +1,92 @@
|
||||
import fs from "node:fs/promises";
|
||||
import os from "node:os";
|
||||
import path from "node:path";
|
||||
import { afterEach, describe, expect, it } from "vitest";
|
||||
import type { OpenClawConfig } from "../config/config.js";
|
||||
import { createEmptyPluginRegistry } from "../plugins/registry.js";
|
||||
import { setActivePluginRegistry } from "../plugins/runtime.js";
|
||||
import { describeImageFile, runMediaUnderstandingFile } from "./runtime.js";
|
||||
|
||||
describe("media-understanding runtime helpers", () => {
|
||||
// Reset the active plugin registry to an empty one after every test so a
// provider registered by one test is not visible to the next.
afterEach(() => {
|
||||
setActivePluginRegistry(createEmptyPluginRegistry());
|
||||
});
|
||||
|
||||
// Happy path: a stub image provider is pushed into a fresh plugin registry,
// that registry is made active, and describeImageFile is expected to return
// the stub's text/model together with the matching "image.description" output.
// NOTE(review): the temp directory created with mkdtemp is not removed
// anywhere in this test.
it("describes images through the active media-understanding registry", async () => {
|
||||
const tempDir = await fs.mkdtemp(path.join(os.tmpdir(), "openclaw-media-runtime-"));
|
||||
const imagePath = path.join(tempDir, "sample.jpg");
|
||||
await fs.writeFile(imagePath, Buffer.from("image-bytes"));
|
||||
|
||||
// Stub provider: advertises only the "image" capability and always answers
// with a fixed description.
const pluginRegistry = createEmptyPluginRegistry();
|
||||
pluginRegistry.mediaUnderstandingProviders.push({
|
||||
pluginId: "vision-plugin",
|
||||
pluginName: "Vision Plugin",
|
||||
source: "test",
|
||||
provider: {
|
||||
id: "vision-plugin",
|
||||
capabilities: ["image"],
|
||||
describeImage: async () => ({ text: "image ok", model: "vision-v1" }),
|
||||
},
|
||||
});
|
||||
setActivePluginRegistry(pluginRegistry);
|
||||
|
||||
// Config names the stub provider/model as the only image model entry.
const cfg = {
|
||||
tools: {
|
||||
media: {
|
||||
image: {
|
||||
models: [{ provider: "vision-plugin", model: "vision-v1" }],
|
||||
},
|
||||
},
|
||||
},
|
||||
} as OpenClawConfig;
|
||||
|
||||
const result = await describeImageFile({
|
||||
filePath: imagePath,
|
||||
mime: "image/jpeg",
|
||||
cfg,
|
||||
agentDir: "/tmp/agent",
|
||||
});
|
||||
|
||||
// The top-level text/provider/model mirror the fields of the nested output.
expect(result).toEqual({
|
||||
text: "image ok",
|
||||
provider: "vision-plugin",
|
||||
model: "vision-v1",
|
||||
output: {
|
||||
kind: "image.description",
|
||||
attachmentIndex: 0,
|
||||
text: "image ok",
|
||||
provider: "vision-plugin",
|
||||
model: "vision-v1",
|
||||
},
|
||||
});
|
||||
});
|
||||
|
||||
// Disabled path: the image capability is switched off in config
// (`enabled: false`) and no provider is registered in this test.
// NOTE(review): despite the title, the assertion expects a result object whose
// fields are all undefined, not an undefined return value.
// NOTE(review): the temp directory created with mkdtemp is not removed
// anywhere in this test.
it("returns undefined when no media output is produced", async () => {
|
||||
const tempDir = await fs.mkdtemp(path.join(os.tmpdir(), "openclaw-media-runtime-"));
|
||||
const imagePath = path.join(tempDir, "sample.jpg");
|
||||
await fs.writeFile(imagePath, Buffer.from("image-bytes"));
|
||||
|
||||
const result = await runMediaUnderstandingFile({
|
||||
capability: "image",
|
||||
filePath: imagePath,
|
||||
mime: "image/jpeg",
|
||||
cfg: {
|
||||
tools: {
|
||||
media: {
|
||||
image: {
|
||||
enabled: false,
|
||||
},
|
||||
},
|
||||
},
|
||||
} as OpenClawConfig,
|
||||
agentDir: "/tmp/agent",
|
||||
});
|
||||
|
||||
// Per the Vitest docs, toEqual ignores properties whose value is undefined,
// so this also matches a result that omits some of these keys — TODO confirm
// if exact key presence matters here.
expect(result).toEqual({
|
||||
text: undefined,
|
||||
provider: undefined,
|
||||
model: undefined,
|
||||
output: undefined,
|
||||
});
|
||||
});
|
||||
});
|
||||
112
src/media-understanding/runtime.ts
Normal file
112
src/media-understanding/runtime.ts
Normal file
@@ -0,0 +1,112 @@
|
||||
import path from "node:path";
|
||||
import type { MsgContext } from "../auto-reply/templating.js";
|
||||
import type { OpenClawConfig } from "../config/config.js";
|
||||
import {
|
||||
buildProviderRegistry,
|
||||
createMediaAttachmentCache,
|
||||
normalizeMediaAttachments,
|
||||
runCapability,
|
||||
type ActiveMediaModel,
|
||||
} from "./runner.js";
|
||||
import type { MediaUnderstandingCapability, MediaUnderstandingOutput } from "./types.js";
|
||||
|
||||
/**
 * Maps each media-understanding capability to the output `kind` that
 * runMediaUnderstandingFile searches for among the capability's outputs.
 */
const KIND_BY_CAPABILITY: Record<MediaUnderstandingCapability, MediaUnderstandingOutput["kind"]> = {
|
||||
audio: "audio.transcription",
|
||||
image: "image.description",
|
||||
video: "video.description",
|
||||
};
|
||||
|
||||
/** Input for running a single media-understanding capability against one file. */
export type RunMediaUnderstandingFileParams = {
|
||||
// Which capability to run; also selects `cfg.tools.media[capability]` as the
// capability config and the output kind that is picked from the results.
capability: MediaUnderstandingCapability;
|
||||
// Path of the media file; used as the context's MediaPath, and its directory
// is the only entry passed as `localPathRoots` to the attachment cache.
filePath: string;
|
||||
cfg: OpenClawConfig;
|
||||
// Forwarded unchanged to runCapability.
agentDir?: string;
|
||||
// Used as the context's MediaType; may be left undefined.
mime?: string;
|
||||
// Forwarded unchanged to runCapability.
activeModel?: ActiveMediaModel;
|
||||
};
|
||||
|
||||
/** Result of runMediaUnderstandingFile. */
export type RunMediaUnderstandingFileResult = {
|
||||
// Trimmed text of the matched output; undefined when there is no matching
// output or its text is missing or empty after trimming.
text: string | undefined;
|
||||
// Copied from the matched output, when there is one.
provider?: string;
|
||||
// Copied from the matched output, when there is one.
model?: string;
|
||||
// The matched output entry itself, when there is one.
output?: MediaUnderstandingOutput;
|
||||
};
|
||||
|
||||
/**
 * Builds a message context that carries only the file path (as `MediaPath`)
 * and the optional MIME type (as `MediaType`).
 */
function buildFileContext(params: { filePath: string; mime?: string }): MsgContext {
|
||||
return {
|
||||
MediaPath: params.filePath,
|
||||
MediaType: params.mime,
|
||||
};
|
||||
}
|
||||
|
||||
/**
 * Runs one media-understanding capability against a single file.
 *
 * Builds a file-only message context, normalizes it into attachments, runs the
 * requested capability via runCapability, and returns the first output whose
 * kind matches the capability (see KIND_BY_CAPABILITY).
 *
 * @param params - Capability, file path, config and optional agent dir, MIME
 *   type and active model.
 * @returns The matched output plus its trimmed text, provider and model.
 *   Returns `{ text: undefined }` when normalization yields no attachments;
 *   when no output matches, every field of the result is undefined.
 */
export async function runMediaUnderstandingFile(
|
||||
params: RunMediaUnderstandingFileParams,
|
||||
): Promise<RunMediaUnderstandingFileResult> {
|
||||
const ctx = buildFileContext(params);
|
||||
const attachments = normalizeMediaAttachments(ctx);
|
||||
// Nothing to process: return before a cache is created, so no cleanup is needed.
if (attachments.length === 0) {
|
||||
return { text: undefined };
|
||||
}
|
||||
|
||||
const providerRegistry = buildProviderRegistry();
|
||||
// The file's own directory is the only local path root given to the cache.
const cache = createMediaAttachmentCache(attachments, {
|
||||
localPathRoots: [path.dirname(params.filePath)],
|
||||
});
|
||||
|
||||
try {
|
||||
const result = await runCapability({
|
||||
capability: params.capability,
|
||||
cfg: params.cfg,
|
||||
ctx,
|
||||
attachments: cache,
|
||||
media: attachments,
|
||||
agentDir: params.agentDir,
|
||||
providerRegistry,
|
||||
// Per-capability config section; undefined when tools/media is not configured.
config: params.cfg.tools?.media?.[params.capability],
|
||||
activeModel: params.activeModel,
|
||||
});
|
||||
// Pick the first output whose kind corresponds to the requested capability.
const output = result.outputs.find(
|
||||
(entry) => entry.kind === KIND_BY_CAPABILITY[params.capability],
|
||||
);
|
||||
const text = output?.text?.trim();
|
||||
return {
|
||||
// An empty string after trimming is reported as undefined.
text: text || undefined,
|
||||
provider: output?.provider,
|
||||
model: output?.model,
|
||||
output,
|
||||
};
|
||||
} finally {
|
||||
// Runs whether runCapability resolved or threw.
await cache.cleanup();
|
||||
}
|
||||
}
|
||||
|
||||
/**
 * Convenience wrapper: calls runMediaUnderstandingFile with the capability
 * fixed to "image" and returns its full result.
 */
export async function describeImageFile(params: {
|
||||
filePath: string;
|
||||
cfg: OpenClawConfig;
|
||||
agentDir?: string;
|
||||
mime?: string;
|
||||
activeModel?: ActiveMediaModel;
|
||||
}): Promise<RunMediaUnderstandingFileResult> {
|
||||
return await runMediaUnderstandingFile({ ...params, capability: "image" });
|
||||
}
|
||||
|
||||
/**
 * Convenience wrapper: calls runMediaUnderstandingFile with the capability
 * fixed to "video" and returns its full result.
 */
export async function describeVideoFile(params: {
|
||||
filePath: string;
|
||||
cfg: OpenClawConfig;
|
||||
agentDir?: string;
|
||||
mime?: string;
|
||||
activeModel?: ActiveMediaModel;
|
||||
}): Promise<RunMediaUnderstandingFileResult> {
|
||||
return await runMediaUnderstandingFile({ ...params, capability: "video" });
|
||||
}
|
||||
|
||||
/**
 * Convenience wrapper: calls runMediaUnderstandingFile with the capability
 * fixed to "audio". Unlike the image/video wrappers it returns only the
 * `text` field of the result; provider, model and output are dropped.
 */
export async function transcribeAudioFile(params: {
|
||||
filePath: string;
|
||||
cfg: OpenClawConfig;
|
||||
agentDir?: string;
|
||||
mime?: string;
|
||||
activeModel?: ActiveMediaModel;
|
||||
}): Promise<{ text: string | undefined }> {
|
||||
const result = await runMediaUnderstandingFile({ ...params, capability: "audio" });
|
||||
return { text: result.text };
|
||||
}
|
||||
@@ -1,13 +1,13 @@
|
||||
import { beforeEach, describe, expect, it, vi } from "vitest";
|
||||
import type { OpenClawConfig } from "../config/config.js";
|
||||
|
||||
const { runAudioTranscription } = vi.hoisted(() => {
|
||||
const runAudioTranscription = vi.fn();
|
||||
return { runAudioTranscription };
|
||||
const { transcribeAudioFileFromRuntime } = vi.hoisted(() => {
|
||||
const transcribeAudioFileFromRuntime = vi.fn();
|
||||
return { transcribeAudioFileFromRuntime };
|
||||
});
|
||||
|
||||
vi.mock("./audio-transcription-runner.js", () => ({
|
||||
runAudioTranscription,
|
||||
vi.mock("./runtime.js", () => ({
|
||||
transcribeAudioFile: transcribeAudioFileFromRuntime,
|
||||
}));
|
||||
|
||||
import { transcribeAudioFile } from "./transcribe-audio.js";
|
||||
@@ -17,27 +17,23 @@ describe("transcribeAudioFile", () => {
|
||||
vi.clearAllMocks();
|
||||
});
|
||||
|
||||
it("does not force audio/wav when mime is omitted", async () => {
|
||||
runAudioTranscription.mockResolvedValue({ transcript: "hello", attachments: [] });
|
||||
it("forwards file transcription requests to the shared runtime helper", async () => {
|
||||
transcribeAudioFileFromRuntime.mockResolvedValue({ text: "hello" });
|
||||
|
||||
const result = await transcribeAudioFile({
|
||||
filePath: "/tmp/note.mp3",
|
||||
cfg: {} as OpenClawConfig,
|
||||
});
|
||||
|
||||
expect(runAudioTranscription).toHaveBeenCalledWith({
|
||||
ctx: {
|
||||
MediaPath: "/tmp/note.mp3",
|
||||
MediaType: undefined,
|
||||
},
|
||||
expect(transcribeAudioFileFromRuntime).toHaveBeenCalledWith({
|
||||
filePath: "/tmp/note.mp3",
|
||||
cfg: {} as OpenClawConfig,
|
||||
agentDir: undefined,
|
||||
});
|
||||
expect(result).toEqual({ text: "hello" });
|
||||
});
|
||||
|
||||
it("returns undefined when helper returns no transcript", async () => {
|
||||
runAudioTranscription.mockResolvedValue({ transcript: undefined, attachments: [] });
|
||||
it("returns undefined when the runtime helper returns no transcript", async () => {
|
||||
transcribeAudioFileFromRuntime.mockResolvedValue({ text: undefined });
|
||||
|
||||
const result = await transcribeAudioFile({
|
||||
filePath: "/tmp/missing.wav",
|
||||
@@ -51,7 +47,7 @@ describe("transcribeAudioFile", () => {
|
||||
const cfg = {
|
||||
tools: { media: { audio: { timeoutSeconds: 10 } } },
|
||||
} as unknown as OpenClawConfig;
|
||||
runAudioTranscription.mockRejectedValue(new Error("boom"));
|
||||
transcribeAudioFileFromRuntime.mockRejectedValue(new Error("boom"));
|
||||
|
||||
await expect(
|
||||
transcribeAudioFile({
|
||||
|
||||
@@ -1,29 +1 @@
|
||||
import type { OpenClawConfig } from "../config/config.js";
|
||||
import { runAudioTranscription } from "./audio-transcription-runner.js";
|
||||
|
||||
/**
|
||||
* Transcribe an audio file using the configured media-understanding provider.
|
||||
*
|
||||
* Reads provider/model/apiKey from `tools.media.audio` in the openclaw config,
|
||||
* falling back through configured models until one succeeds.
|
||||
*
|
||||
* This is the runtime-exposed entry point for external plugins (e.g. marmot)
|
||||
* that need STT without importing internal media-understanding modules directly.
|
||||
*/
|
||||
export async function transcribeAudioFile(params: {
|
||||
filePath: string;
|
||||
cfg: OpenClawConfig;
|
||||
agentDir?: string;
|
||||
mime?: string;
|
||||
}): Promise<{ text: string | undefined }> {
|
||||
const ctx = {
|
||||
MediaPath: params.filePath,
|
||||
MediaType: params.mime,
|
||||
};
|
||||
const { transcript } = await runAudioTranscription({
|
||||
ctx,
|
||||
cfg: params.cfg,
|
||||
agentDir: params.agentDir,
|
||||
});
|
||||
return { text: transcript };
|
||||
}
|
||||
export { transcribeAudioFile } from "./runtime.js";
|
||||
|
||||
@@ -574,34 +574,62 @@ export function createPluginRegistry(registryParams: PluginRegistryParams) {
|
||||
});
|
||||
};
|
||||
|
||||
const registerSpeechProvider = (record: PluginRecord, provider: SpeechProviderPlugin) => {
|
||||
const id = provider.id.trim();
|
||||
const registerUniqueProviderLike = <
|
||||
T extends { id: string },
|
||||
R extends {
|
||||
pluginId: string;
|
||||
pluginName?: string;
|
||||
provider: T;
|
||||
source: string;
|
||||
rootDir?: string;
|
||||
},
|
||||
>(params: {
|
||||
record: PluginRecord;
|
||||
provider: T;
|
||||
kindLabel: string;
|
||||
registrations: R[];
|
||||
ownedIds: string[];
|
||||
}) => {
|
||||
const id = params.provider.id.trim();
|
||||
const { record, kindLabel } = params;
|
||||
const missingLabel = `${kindLabel} registration missing id`;
|
||||
const duplicateLabel = `${kindLabel} already registered: ${id}`;
|
||||
if (!id) {
|
||||
pushDiagnostic({
|
||||
level: "error",
|
||||
pluginId: record.id,
|
||||
source: record.source,
|
||||
message: "speech provider registration missing id",
|
||||
message: missingLabel,
|
||||
});
|
||||
return;
|
||||
}
|
||||
const existing = registry.speechProviders.find((entry) => entry.provider.id === id);
|
||||
const existing = params.registrations.find((entry) => entry.provider.id === id);
|
||||
if (existing) {
|
||||
pushDiagnostic({
|
||||
level: "error",
|
||||
pluginId: record.id,
|
||||
source: record.source,
|
||||
message: `speech provider already registered: ${id} (${existing.pluginId})`,
|
||||
message: `${duplicateLabel} (${existing.pluginId})`,
|
||||
});
|
||||
return;
|
||||
}
|
||||
record.speechProviderIds.push(id);
|
||||
registry.speechProviders.push({
|
||||
params.ownedIds.push(id);
|
||||
params.registrations.push({
|
||||
pluginId: record.id,
|
||||
pluginName: record.name,
|
||||
provider,
|
||||
provider: params.provider,
|
||||
source: record.source,
|
||||
rootDir: record.rootDir,
|
||||
} as R);
|
||||
};
|
||||
|
||||
const registerSpeechProvider = (record: PluginRecord, provider: SpeechProviderPlugin) => {
|
||||
registerUniqueProviderLike({
|
||||
record,
|
||||
provider,
|
||||
kindLabel: "speech provider",
|
||||
registrations: registry.speechProviders,
|
||||
ownedIds: record.speechProviderIds,
|
||||
});
|
||||
};
|
||||
|
||||
@@ -609,64 +637,22 @@ export function createPluginRegistry(registryParams: PluginRegistryParams) {
|
||||
record: PluginRecord,
|
||||
provider: MediaUnderstandingProviderPlugin,
|
||||
) => {
|
||||
const id = provider.id.trim();
|
||||
if (!id) {
|
||||
pushDiagnostic({
|
||||
level: "error",
|
||||
pluginId: record.id,
|
||||
source: record.source,
|
||||
message: "media provider registration missing id",
|
||||
});
|
||||
return;
|
||||
}
|
||||
const existing = registry.mediaUnderstandingProviders.find((entry) => entry.provider.id === id);
|
||||
if (existing) {
|
||||
pushDiagnostic({
|
||||
level: "error",
|
||||
pluginId: record.id,
|
||||
source: record.source,
|
||||
message: `media provider already registered: ${id} (${existing.pluginId})`,
|
||||
});
|
||||
return;
|
||||
}
|
||||
record.mediaUnderstandingProviderIds.push(id);
|
||||
registry.mediaUnderstandingProviders.push({
|
||||
pluginId: record.id,
|
||||
pluginName: record.name,
|
||||
registerUniqueProviderLike({
|
||||
record,
|
||||
provider,
|
||||
source: record.source,
|
||||
rootDir: record.rootDir,
|
||||
kindLabel: "media provider",
|
||||
registrations: registry.mediaUnderstandingProviders,
|
||||
ownedIds: record.mediaUnderstandingProviderIds,
|
||||
});
|
||||
};
|
||||
|
||||
const registerWebSearchProvider = (record: PluginRecord, provider: WebSearchProviderPlugin) => {
|
||||
const id = provider.id.trim();
|
||||
if (!id) {
|
||||
pushDiagnostic({
|
||||
level: "error",
|
||||
pluginId: record.id,
|
||||
source: record.source,
|
||||
message: "web search provider registration missing id",
|
||||
});
|
||||
return;
|
||||
}
|
||||
const existing = registry.webSearchProviders.find((entry) => entry.provider.id === id);
|
||||
if (existing) {
|
||||
pushDiagnostic({
|
||||
level: "error",
|
||||
pluginId: record.id,
|
||||
source: record.source,
|
||||
message: `web search provider already registered: ${id} (${existing.pluginId})`,
|
||||
});
|
||||
return;
|
||||
}
|
||||
record.webSearchProviderIds.push(id);
|
||||
registry.webSearchProviders.push({
|
||||
pluginId: record.id,
|
||||
pluginName: record.name,
|
||||
registerUniqueProviderLike({
|
||||
record,
|
||||
provider,
|
||||
source: record.source,
|
||||
rootDir: record.rootDir,
|
||||
kindLabel: "web search provider",
|
||||
registrations: registry.webSearchProviders,
|
||||
ownedIds: record.webSearchProviderIds,
|
||||
});
|
||||
};
|
||||
|
||||
|
||||
@@ -4,7 +4,12 @@ import {
|
||||
resolveApiKeyForProvider as resolveApiKeyForProviderRaw,
|
||||
} from "../../agents/model-auth.js";
|
||||
import { resolveStateDir } from "../../config/paths.js";
|
||||
import { transcribeAudioFile } from "../../media-understanding/transcribe-audio.js";
|
||||
import {
|
||||
describeImageFile,
|
||||
describeVideoFile,
|
||||
runMediaUnderstandingFile,
|
||||
transcribeAudioFile,
|
||||
} from "../../media-understanding/runtime.js";
|
||||
import { listSpeechVoices, textToSpeech, textToSpeechTelephony } from "../../tts/tts.js";
|
||||
import { createRuntimeAgent } from "./runtime-agent.js";
|
||||
import { createRuntimeChannel } from "./runtime-channel.js";
|
||||
@@ -136,6 +141,12 @@ export function createPluginRuntime(_options: CreatePluginRuntimeOptions = {}):
|
||||
system: createRuntimeSystem(),
|
||||
media: createRuntimeMedia(),
|
||||
tts: { textToSpeech, textToSpeechTelephony, listVoices: listSpeechVoices },
|
||||
mediaUnderstanding: {
|
||||
runFile: runMediaUnderstandingFile,
|
||||
describeImageFile,
|
||||
describeVideoFile,
|
||||
transcribeAudioFile,
|
||||
},
|
||||
stt: { transcribeAudioFile },
|
||||
tools: createRuntimeTools(),
|
||||
channel: createRuntimeChannel(),
|
||||
|
||||
@@ -51,6 +51,12 @@ export type PluginRuntimeCore = {
|
||||
textToSpeechTelephony: typeof import("../../tts/tts.js").textToSpeechTelephony;
|
||||
listVoices: typeof import("../../tts/tts.js").listSpeechVoices;
|
||||
};
|
||||
mediaUnderstanding: {
|
||||
runFile: typeof import("../../media-understanding/runtime.js").runMediaUnderstandingFile;
|
||||
describeImageFile: typeof import("../../media-understanding/runtime.js").describeImageFile;
|
||||
describeVideoFile: typeof import("../../media-understanding/runtime.js").describeVideoFile;
|
||||
transcribeAudioFile: typeof import("../../media-understanding/runtime.js").transcribeAudioFile;
|
||||
};
|
||||
stt: {
|
||||
transcribeAudioFile: typeof import("../../media-understanding/transcribe-audio.js").transcribeAudioFile;
|
||||
};
|
||||
|
||||
Reference in New Issue
Block a user