feat(plugins): move media understanding into vendor plugins

This commit is contained in:
Peter Steinberger
2026-03-16 20:58:22 -07:00
parent e064c1198e
commit c081dc52b7
40 changed files with 602 additions and 407 deletions

View File

@@ -23,10 +23,10 @@ import {
import { buildTokenProfileId, validateAnthropicSetupToken } from "../../src/commands/auth-token.js";
import { applyAuthProfileConfig } from "../../src/commands/onboard-auth.js";
import { fetchClaudeUsage } from "../../src/infra/provider-usage.fetch.js";
import { anthropicProvider } from "../../src/media-understanding/providers/anthropic/index.js";
import { createProviderApiKeyAuthMethod } from "../../src/plugins/provider-api-key-auth.js";
import type { ProviderAuthResult } from "../../src/plugins/types.js";
import { normalizeSecretInput } from "../../src/utils/normalize-secret-input.js";
import { anthropicMediaUnderstandingProvider } from "./media-understanding-provider.js";
const PROVIDER_ID = "anthropic";
const DEFAULT_ANTHROPIC_MODEL = "anthropic/claude-sonnet-4-6";
@@ -395,7 +395,7 @@ const anthropicPlugin = {
profileId: ctx.profileId,
}),
});
api.registerMediaUnderstandingProvider(anthropicProvider);
api.registerMediaUnderstandingProvider(anthropicMediaUnderstandingProvider);
},
};

View File

@@ -0,0 +1,8 @@
import { describeImageWithModel } from "../../src/media-understanding/providers/image.js";
import type { MediaUnderstandingProvider } from "../../src/media-understanding/types.js";
// Anthropic media-understanding provider: image-only, delegating image
// description to the shared model-backed helper.
export const anthropicMediaUnderstandingProvider: MediaUnderstandingProvider = {
  id: "anthropic",
  capabilities: ["image"],
  describeImage: describeImageWithModel,
};

View File

@@ -7,11 +7,11 @@ import {
GOOGLE_GEMINI_DEFAULT_MODEL,
applyGoogleGeminiModelDefault,
} from "../../src/commands/google-gemini-model-default.js";
import { googleProvider } from "../../src/media-understanding/providers/google/index.js";
import { emptyPluginConfigSchema } from "../../src/plugins/config-schema.js";
import { createProviderApiKeyAuthMethod } from "../../src/plugins/provider-api-key-auth.js";
import type { OpenClawPluginApi } from "../../src/plugins/types.js";
import { registerGoogleGeminiCliProvider } from "./gemini-cli-provider.js";
import { googleMediaUnderstandingProvider } from "./media-understanding-provider.js";
import { isModernGoogleModel, resolveGoogle31ForwardCompatModel } from "./provider-models.js";
const googlePlugin = {
@@ -52,7 +52,7 @@ const googlePlugin = {
isModernModelRef: ({ modelId }) => isModernGoogleModel(modelId),
});
registerGoogleGeminiCliProvider(api);
api.registerMediaUnderstandingProvider(googleProvider);
api.registerMediaUnderstandingProvider(googleMediaUnderstandingProvider);
api.registerWebSearchProvider(
createPluginBackedWebSearchProvider({
id: "gemini",

View File

@@ -0,0 +1,150 @@
import { normalizeGoogleModelId } from "../../src/agents/model-id-normalization.js";
import { parseGeminiAuth } from "../../src/infra/gemini-auth.js";
import { describeImageWithModel } from "../../src/media-understanding/providers/image.js";
import {
assertOkOrThrowHttpError,
normalizeBaseUrl,
postJsonRequest,
} from "../../src/media-understanding/providers/shared.js";
import type {
AudioTranscriptionRequest,
AudioTranscriptionResult,
MediaUnderstandingProvider,
VideoDescriptionRequest,
VideoDescriptionResult,
} from "../../src/media-understanding/types.js";
// Gemini REST endpoints (v1beta) used for the inline-data generateContent calls below.
export const DEFAULT_GOOGLE_AUDIO_BASE_URL = "https://generativelanguage.googleapis.com/v1beta";
export const DEFAULT_GOOGLE_VIDEO_BASE_URL = "https://generativelanguage.googleapis.com/v1beta";
// Fallback model and prompt applied when the request does not specify them.
const DEFAULT_GOOGLE_AUDIO_MODEL = "gemini-3-flash-preview";
const DEFAULT_GOOGLE_VIDEO_MODEL = "gemini-3-flash-preview";
const DEFAULT_GOOGLE_AUDIO_PROMPT = "Transcribe the audio.";
const DEFAULT_GOOGLE_VIDEO_PROMPT = "Describe the video.";
/**
 * Send one user turn (text prompt + inline base64 media part) to the Gemini
 * `models/{model}:generateContent` endpoint and return the concatenated text
 * of the first candidate's parts.
 *
 * Caller-supplied `model`/`prompt`/`mime`/`baseUrl` override the provider
 * defaults; auth headers derived from `apiKey` never clobber explicit headers.
 * Throws with `httpErrorLabel` context on non-OK responses and with
 * `missingTextError` when the response contains no text parts.
 */
async function generateGeminiInlineDataText(params: {
  buffer: Buffer;
  mime?: string;
  apiKey: string;
  baseUrl?: string;
  headers?: Record<string, string>;
  model?: string;
  prompt?: string;
  timeoutMs: number;
  fetchFn?: typeof fetch;
  defaultBaseUrl: string;
  defaultModel: string;
  defaultPrompt: string;
  defaultMime: string;
  httpErrorLabel: string;
  missingTextError: string;
}): Promise<{ text: string; model: string }> {
  const fetchFn = params.fetchFn ?? fetch;
  const baseUrl = normalizeBaseUrl(params.baseUrl, params.defaultBaseUrl);
  // Private-network access is only allowed when the caller explicitly
  // overrode the base URL; default public endpoints keep the SSRF guard on.
  const allowPrivateNetwork = Boolean(params.baseUrl?.trim());

  const requestedModel = params.model?.trim();
  const model = requestedModel ? normalizeGoogleModelId(requestedModel) : params.defaultModel;
  const promptText = params.prompt?.trim() || params.defaultPrompt;

  // Merge auth headers without overwriting any caller-provided header.
  const headers = new Headers(params.headers);
  const auth = parseGeminiAuth(params.apiKey);
  for (const [name, value] of Object.entries(auth.headers)) {
    if (!headers.has(name)) {
      headers.set(name, value);
    }
  }

  const requestBody = {
    contents: [
      {
        role: "user",
        parts: [
          { text: promptText },
          {
            inline_data: {
              mime_type: params.mime ?? params.defaultMime,
              data: params.buffer.toString("base64"),
            },
          },
        ],
      },
    ],
  };

  const { response, release } = await postJsonRequest({
    url: `${baseUrl}/models/${model}:generateContent`,
    headers,
    body: requestBody,
    timeoutMs: params.timeoutMs,
    fetchFn,
    allowPrivateNetwork,
  });
  try {
    await assertOkOrThrowHttpError(response, params.httpErrorLabel);
    const payload = (await response.json()) as {
      candidates?: Array<{
        content?: { parts?: Array<{ text?: string }> };
      }>;
    };
    const text = (payload.candidates?.[0]?.content?.parts ?? [])
      .map((part) => part?.text?.trim())
      .filter(Boolean)
      .join("\n");
    if (!text) {
      throw new Error(params.missingTextError);
    }
    return { text, model };
  } finally {
    await release();
  }
}
/**
 * Transcribe an audio buffer with Gemini, applying the Google audio defaults
 * (base URL, model, prompt, WAV mime) unless the request overrides them.
 */
export async function transcribeGeminiAudio(
  params: AudioTranscriptionRequest,
): Promise<AudioTranscriptionResult> {
  const result = await generateGeminiInlineDataText({
    ...params,
    defaultBaseUrl: DEFAULT_GOOGLE_AUDIO_BASE_URL,
    defaultModel: DEFAULT_GOOGLE_AUDIO_MODEL,
    defaultPrompt: DEFAULT_GOOGLE_AUDIO_PROMPT,
    defaultMime: "audio/wav",
    httpErrorLabel: "Audio transcription failed",
    missingTextError: "Audio transcription response missing text",
  });
  return { text: result.text, model: result.model };
}
/**
 * Describe a video buffer with Gemini, applying the Google video defaults
 * (base URL, model, prompt, MP4 mime) unless the request overrides them.
 */
export async function describeGeminiVideo(
  params: VideoDescriptionRequest,
): Promise<VideoDescriptionResult> {
  const result = await generateGeminiInlineDataText({
    ...params,
    defaultBaseUrl: DEFAULT_GOOGLE_VIDEO_BASE_URL,
    defaultModel: DEFAULT_GOOGLE_VIDEO_MODEL,
    defaultPrompt: DEFAULT_GOOGLE_VIDEO_PROMPT,
    defaultMime: "video/mp4",
    httpErrorLabel: "Video description failed",
    missingTextError: "Video description response missing text",
  });
  return { text: result.text, model: result.model };
}
// Google media-understanding provider: images go through the shared
// model-backed helper; audio and video use the Gemini inline-data calls above.
export const googleMediaUnderstandingProvider: MediaUnderstandingProvider = {
  id: "google",
  capabilities: ["image", "audio", "video"],
  describeImage: describeImageWithModel,
  transcribeAudio: transcribeGeminiAudio,
  describeVideo: describeGeminiVideo,
};

View File

@@ -9,11 +9,11 @@ import {
import { ensureAuthProfileStore, listProfilesForProvider } from "../../src/agents/auth-profiles.js";
import { MINIMAX_OAUTH_MARKER } from "../../src/agents/model-auth-markers.js";
import { fetchMinimaxUsage } from "../../src/infra/provider-usage.fetch.js";
import {
minimaxPortalProvider,
minimaxProvider,
} from "../../src/media-understanding/providers/minimax/index.js";
import { createProviderApiKeyAuthMethod } from "../../src/plugins/provider-api-key-auth.js";
import {
minimaxMediaUnderstandingProvider,
minimaxPortalMediaUnderstandingProvider,
} from "./media-understanding-provider.js";
import { loginMiniMaxPortalOAuth, type MiniMaxRegion } from "./oauth.js";
import { applyMinimaxApiConfig, applyMinimaxApiConfigCn } from "./onboard.js";
import { buildMinimaxPortalProvider, buildMinimaxProvider } from "./provider-catalog.js";
@@ -274,8 +274,8 @@ const minimaxPlugin = {
],
isModernModelRef: ({ modelId }) => isModernMiniMaxModel(modelId),
});
api.registerMediaUnderstandingProvider(minimaxProvider);
api.registerMediaUnderstandingProvider(minimaxPortalProvider);
api.registerMediaUnderstandingProvider(minimaxMediaUnderstandingProvider);
api.registerMediaUnderstandingProvider(minimaxPortalMediaUnderstandingProvider);
},
};

View File

@@ -0,0 +1,14 @@
import { describeImageWithModel } from "../../src/media-understanding/providers/image.js";
import type { MediaUnderstandingProvider } from "../../src/media-understanding/types.js";
// MiniMax media-understanding provider (direct API): image-only, delegating
// to the shared model-backed image description helper.
export const minimaxMediaUnderstandingProvider: MediaUnderstandingProvider = {
  id: "minimax",
  capabilities: ["image"],
  describeImage: describeImageWithModel,
};

// MiniMax portal variant: identical image-only capability registered under a
// distinct provider id so portal-based configurations resolve separately.
export const minimaxPortalMediaUnderstandingProvider: MediaUnderstandingProvider = {
  id: "minimax-portal",
  capabilities: ["image"],
  describeImage: describeImageWithModel,
};

View File

@@ -1,6 +1,6 @@
import { emptyPluginConfigSchema, type OpenClawPluginApi } from "openclaw/plugin-sdk/core";
import { mistralProvider } from "../../src/media-understanding/providers/mistral/index.js";
import { createProviderApiKeyAuthMethod } from "../../src/plugins/provider-api-key-auth.js";
import { mistralMediaUnderstandingProvider } from "./media-understanding-provider.js";
import { applyMistralConfig, MISTRAL_DEFAULT_MODEL_REF } from "./onboard.js";
const PROVIDER_ID = "mistral";
@@ -51,7 +51,7 @@ const mistralPlugin = {
],
},
});
api.registerMediaUnderstandingProvider(mistralProvider);
api.registerMediaUnderstandingProvider(mistralMediaUnderstandingProvider);
},
};

View File

@@ -0,0 +1,17 @@
import { transcribeOpenAiCompatibleAudio } from "../../src/media-understanding/providers/openai-compatible-audio.js";
import type { MediaUnderstandingProvider } from "../../src/media-understanding/types.js";
const DEFAULT_MISTRAL_AUDIO_BASE_URL = "https://api.mistral.ai/v1";
const DEFAULT_MISTRAL_AUDIO_MODEL = "voxtral-mini-latest";

/**
 * Mistral media-understanding provider (audio-only).
 *
 * Delegates to the OpenAI-compatible transcription client with Mistral's
 * endpoint and Voxtral model as defaults. We intentionally do NOT pre-fill
 * `baseUrl` with the default here: `transcribeOpenAiCompatibleAudio` relaxes
 * its private-network (SSRF) guard only when the caller explicitly supplies
 * a base URL, so forwarding the default as `baseUrl` would have disabled
 * that guard for ordinary requests. `defaultBaseUrl` yields the same
 * resolved endpoint while keeping the guard active.
 */
export const mistralMediaUnderstandingProvider: MediaUnderstandingProvider = {
  id: "mistral",
  capabilities: ["audio"],
  transcribeAudio: async (req) =>
    await transcribeOpenAiCompatibleAudio({
      ...req,
      defaultBaseUrl: DEFAULT_MISTRAL_AUDIO_BASE_URL,
      defaultModel: DEFAULT_MISTRAL_AUDIO_MODEL,
    }),
};

View File

@@ -7,10 +7,10 @@ import {
getScopedCredentialValue,
setScopedCredentialValue,
} from "../../src/agents/tools/web-search-plugin-factory.js";
import { moonshotProvider } from "../../src/media-understanding/providers/moonshot/index.js";
import { emptyPluginConfigSchema } from "../../src/plugins/config-schema.js";
import { createProviderApiKeyAuthMethod } from "../../src/plugins/provider-api-key-auth.js";
import type { OpenClawPluginApi } from "../../src/plugins/types.js";
import { moonshotMediaUnderstandingProvider } from "./media-understanding-provider.js";
import {
applyMoonshotConfig,
applyMoonshotConfigCn,
@@ -100,7 +100,7 @@ const moonshotPlugin = {
return createMoonshotThinkingWrapper(ctx.streamFn, thinkingType);
},
});
api.registerMediaUnderstandingProvider(moonshotProvider);
api.registerMediaUnderstandingProvider(moonshotMediaUnderstandingProvider);
api.registerWebSearchProvider(
createPluginBackedWebSearchProvider({
id: "kimi",

View File

@@ -1,5 +1,14 @@
import type { VideoDescriptionRequest, VideoDescriptionResult } from "../../types.js";
import { assertOkOrThrowHttpError, normalizeBaseUrl, postJsonRequest } from "../shared.js";
import { describeImageWithModel } from "../../src/media-understanding/providers/image.js";
import {
assertOkOrThrowHttpError,
normalizeBaseUrl,
postJsonRequest,
} from "../../src/media-understanding/providers/shared.js";
import type {
MediaUnderstandingProvider,
VideoDescriptionRequest,
VideoDescriptionResult,
} from "../../src/media-understanding/types.js";
export const DEFAULT_MOONSHOT_VIDEO_BASE_URL = "https://api.moonshot.ai/v1";
const DEFAULT_MOONSHOT_VIDEO_MODEL = "kimi-k2.5";
@@ -104,3 +113,10 @@ export async function describeMoonshotVideo(
await release();
}
}
export const moonshotMediaUnderstandingProvider: MediaUnderstandingProvider = {
id: "moonshot",
capabilities: ["image", "video"],
describeImage: describeImageWithModel,
describeVideo: describeMoonshotVideo,
};

View File

@@ -1,6 +1,6 @@
import { emptyPluginConfigSchema, type OpenClawPluginApi } from "openclaw/plugin-sdk/core";
import { openaiProvider } from "../../src/media-understanding/providers/openai/index.js";
import { buildOpenAISpeechProvider } from "../../src/tts/providers/openai.js";
import { openaiMediaUnderstandingProvider } from "./media-understanding-provider.js";
import { buildOpenAICodexProviderPlugin } from "./openai-codex-provider.js";
import { buildOpenAIProvider } from "./openai-provider.js";
@@ -13,7 +13,7 @@ const openAIPlugin = {
api.registerProvider(buildOpenAIProvider());
api.registerProvider(buildOpenAICodexProviderPlugin());
api.registerSpeechProvider(buildOpenAISpeechProvider());
api.registerMediaUnderstandingProvider(openaiProvider);
api.registerMediaUnderstandingProvider(openaiMediaUnderstandingProvider);
},
};

View File

@@ -0,0 +1,23 @@
import { describeImageWithModel } from "../../src/media-understanding/providers/image.js";
import { transcribeOpenAiCompatibleAudio } from "../../src/media-understanding/providers/openai-compatible-audio.js";
import type {
  AudioTranscriptionRequest,
  AudioTranscriptionResult,
  MediaUnderstandingProvider,
} from "../../src/media-understanding/types.js";
export const DEFAULT_OPENAI_AUDIO_BASE_URL = "https://api.openai.com/v1";
const DEFAULT_OPENAI_AUDIO_MODEL = "gpt-4o-mini-transcribe";
export async function transcribeOpenAiAudio(
params: import("../../src/media-understanding/types.js").AudioTranscriptionRequest,
) {
return await transcribeOpenAiCompatibleAudio({
...params,
defaultBaseUrl: DEFAULT_OPENAI_AUDIO_BASE_URL,
defaultModel: DEFAULT_OPENAI_AUDIO_MODEL,
});
}
// OpenAI media-understanding provider: images via the shared model-backed
// helper, audio via OpenAI's transcription endpoint with OpenAI defaults.
export const openaiMediaUnderstandingProvider: MediaUnderstandingProvider = {
  id: "openai",
  capabilities: ["image", "audio"],
  describeImage: describeImageWithModel,
  transcribeAudio: transcribeOpenAiAudio,
};

View File

@@ -106,6 +106,15 @@ export function createPluginRuntimeMock(overrides: DeepPartial<PluginRuntime> =
textToSpeechTelephony: vi.fn() as unknown as PluginRuntime["tts"]["textToSpeechTelephony"],
listVoices: vi.fn() as unknown as PluginRuntime["tts"]["listVoices"],
},
mediaUnderstanding: {
runFile: vi.fn() as unknown as PluginRuntime["mediaUnderstanding"]["runFile"],
describeImageFile:
vi.fn() as unknown as PluginRuntime["mediaUnderstanding"]["describeImageFile"],
describeVideoFile:
vi.fn() as unknown as PluginRuntime["mediaUnderstanding"]["describeVideoFile"],
transcribeAudioFile:
vi.fn() as unknown as PluginRuntime["mediaUnderstanding"]["transcribeAudioFile"],
},
stt: {
transcribeAudioFile: vi.fn() as unknown as PluginRuntime["stt"]["transcribeAudioFile"],
},

View File

@@ -24,9 +24,9 @@ import { applyAuthProfileConfig } from "../../src/commands/onboard-auth.js";
import type { SecretInput } from "../../src/config/types.secrets.js";
import { resolveRequiredHomeDir } from "../../src/infra/home-dir.js";
import { fetchZaiUsage } from "../../src/infra/provider-usage.fetch.js";
import { zaiProvider } from "../../src/media-understanding/providers/zai/index.js";
import { normalizeOptionalSecretInput } from "../../src/utils/normalize-secret-input.js";
import { detectZaiEndpoint, type ZaiEndpointId } from "./detect.js";
import { zaiMediaUnderstandingProvider } from "./media-understanding-provider.js";
import { applyZaiConfig, applyZaiProviderConfig, ZAI_DEFAULT_MODEL_REF } from "./onboard.js";
const PROVIDER_ID = "zai";
@@ -335,7 +335,7 @@ const zaiPlugin = {
fetchUsageSnapshot: async (ctx) => await fetchZaiUsage(ctx.token, ctx.timeoutMs, ctx.fetchFn),
isCacheTtlEligible: () => true,
});
api.registerMediaUnderstandingProvider(zaiProvider);
api.registerMediaUnderstandingProvider(zaiMediaUnderstandingProvider);
},
};

View File

@@ -0,0 +1,8 @@
import { describeImageWithModel } from "../../src/media-understanding/providers/image.js";
import type { MediaUnderstandingProvider } from "../../src/media-understanding/types.js";
// Z.AI media-understanding provider: image-only, delegating image
// description to the shared model-backed helper.
export const zaiMediaUnderstandingProvider: MediaUnderstandingProvider = {
  id: "zai",
  capabilities: ["image"],
  describeImage: describeImageWithModel,
};

View File

@@ -1,8 +0,0 @@
import type { MediaUnderstandingProvider } from "../../types.js";
import { describeImageWithModel } from "../image.js";
export const anthropicProvider: MediaUnderstandingProvider = {
id: "anthropic",
capabilities: ["image"],
describeImage: describeImageWithModel,
};

View File

@@ -1,21 +0,0 @@
import type { AudioTranscriptionRequest, AudioTranscriptionResult } from "../../types.js";
import { generateGeminiInlineDataText } from "./inline-data.js";
export const DEFAULT_GOOGLE_AUDIO_BASE_URL = "https://generativelanguage.googleapis.com/v1beta";
const DEFAULT_GOOGLE_AUDIO_MODEL = "gemini-3-flash-preview";
const DEFAULT_GOOGLE_AUDIO_PROMPT = "Transcribe the audio.";
export async function transcribeGeminiAudio(
params: AudioTranscriptionRequest,
): Promise<AudioTranscriptionResult> {
const { text, model } = await generateGeminiInlineDataText({
...params,
defaultBaseUrl: DEFAULT_GOOGLE_AUDIO_BASE_URL,
defaultModel: DEFAULT_GOOGLE_AUDIO_MODEL,
defaultPrompt: DEFAULT_GOOGLE_AUDIO_PROMPT,
defaultMime: "audio/wav",
httpErrorLabel: "Audio transcription failed",
missingTextError: "Audio transcription response missing text",
});
return { text, model };
}

View File

@@ -1,12 +0,0 @@
import type { MediaUnderstandingProvider } from "../../types.js";
import { describeImageWithModel } from "../image.js";
import { transcribeGeminiAudio } from "./audio.js";
import { describeGeminiVideo } from "./video.js";
export const googleProvider: MediaUnderstandingProvider = {
id: "google",
capabilities: ["image", "audio", "video"],
describeImage: describeImageWithModel,
transcribeAudio: transcribeGeminiAudio,
describeVideo: describeGeminiVideo,
};

View File

@@ -1,93 +0,0 @@
import { normalizeGoogleModelId } from "../../../agents/model-id-normalization.js";
import { parseGeminiAuth } from "../../../infra/gemini-auth.js";
import { assertOkOrThrowHttpError, normalizeBaseUrl, postJsonRequest } from "../shared.js";
export async function generateGeminiInlineDataText(params: {
buffer: Buffer;
mime?: string;
apiKey: string;
baseUrl?: string;
headers?: Record<string, string>;
model?: string;
prompt?: string;
timeoutMs: number;
fetchFn?: typeof fetch;
defaultBaseUrl: string;
defaultModel: string;
defaultPrompt: string;
defaultMime: string;
httpErrorLabel: string;
missingTextError: string;
}): Promise<{ text: string; model: string }> {
const fetchFn = params.fetchFn ?? fetch;
const baseUrl = normalizeBaseUrl(params.baseUrl, params.defaultBaseUrl);
const allowPrivate = Boolean(params.baseUrl?.trim());
const model = (() => {
const trimmed = params.model?.trim();
if (!trimmed) {
return params.defaultModel;
}
return normalizeGoogleModelId(trimmed);
})();
const url = `${baseUrl}/models/${model}:generateContent`;
const authHeaders = parseGeminiAuth(params.apiKey);
const headers = new Headers(params.headers);
for (const [key, value] of Object.entries(authHeaders.headers)) {
if (!headers.has(key)) {
headers.set(key, value);
}
}
const prompt = (() => {
const trimmed = params.prompt?.trim();
return trimmed || params.defaultPrompt;
})();
const body = {
contents: [
{
role: "user",
parts: [
{ text: prompt },
{
inline_data: {
mime_type: params.mime ?? params.defaultMime,
data: params.buffer.toString("base64"),
},
},
],
},
],
};
const { response: res, release } = await postJsonRequest({
url,
headers,
body,
timeoutMs: params.timeoutMs,
fetchFn,
allowPrivateNetwork: allowPrivate,
});
try {
await assertOkOrThrowHttpError(res, params.httpErrorLabel);
const payload = (await res.json()) as {
candidates?: Array<{
content?: { parts?: Array<{ text?: string }> };
}>;
};
const parts = payload.candidates?.[0]?.content?.parts ?? [];
const text = parts
.map((part) => part?.text?.trim())
.filter(Boolean)
.join("\n");
if (!text) {
throw new Error(params.missingTextError);
}
return { text, model };
} finally {
await release();
}
}

View File

@@ -1,8 +1,8 @@
import { afterEach, beforeEach, describe, expect, it, vi } from "vitest";
import { describeGeminiVideo } from "../../../../extensions/google/media-understanding-provider.js";
import * as ssrf from "../../../infra/net/ssrf.js";
import { withFetchPreconnect } from "../../../test-utils/fetch-mock.js";
import { createRequestCaptureJsonFetch } from "../audio.test-helpers.js";
import { describeGeminiVideo } from "./video.js";
const TEST_NET_IP = "203.0.113.10";

View File

@@ -1,21 +0,0 @@
import type { VideoDescriptionRequest, VideoDescriptionResult } from "../../types.js";
import { generateGeminiInlineDataText } from "./inline-data.js";
export const DEFAULT_GOOGLE_VIDEO_BASE_URL = "https://generativelanguage.googleapis.com/v1beta";
const DEFAULT_GOOGLE_VIDEO_MODEL = "gemini-3-flash-preview";
const DEFAULT_GOOGLE_VIDEO_PROMPT = "Describe the video.";
export async function describeGeminiVideo(
params: VideoDescriptionRequest,
): Promise<VideoDescriptionResult> {
const { text, model } = await generateGeminiInlineDataText({
...params,
defaultBaseUrl: DEFAULT_GOOGLE_VIDEO_BASE_URL,
defaultModel: DEFAULT_GOOGLE_VIDEO_MODEL,
defaultPrompt: DEFAULT_GOOGLE_VIDEO_PROMPT,
defaultMime: "video/mp4",
httpErrorLabel: "Video description failed",
missingTextError: "Video description response missing text",
});
return { text, model };
}

View File

@@ -1,7 +1,8 @@
import type { MediaUnderstandingProvider } from "../../types.js";
import { transcribeOpenAiCompatibleAudio } from "../openai/audio.js";
import { transcribeOpenAiCompatibleAudio } from "../openai-compatible-audio.js";
const DEFAULT_GROQ_AUDIO_BASE_URL = "https://api.groq.com/openai/v1";
const DEFAULT_GROQ_AUDIO_MODEL = "whisper-large-v3-turbo";
export const groqProvider: MediaUnderstandingProvider = {
id: "groq",
@@ -10,5 +11,7 @@ export const groqProvider: MediaUnderstandingProvider = {
transcribeOpenAiCompatibleAudio({
...req,
baseUrl: req.baseUrl ?? DEFAULT_GROQ_AUDIO_BASE_URL,
defaultBaseUrl: DEFAULT_GROQ_AUDIO_BASE_URL,
defaultModel: DEFAULT_GROQ_AUDIO_MODEL,
}),
};

View File

@@ -8,35 +8,15 @@ describe("media-understanding provider registry", () => {
setActivePluginRegistry(createEmptyPluginRegistry());
});
it("registers the Mistral provider", () => {
it("keeps core-owned fallback providers registered by default", () => {
const registry = buildMediaUnderstandingRegistry();
const provider = getMediaUnderstandingProvider("mistral", registry);
const groqProvider = getMediaUnderstandingProvider("groq", registry);
const deepgramProvider = getMediaUnderstandingProvider("deepgram", registry);
expect(provider?.id).toBe("mistral");
expect(provider?.capabilities).toEqual(["audio"]);
});
it("keeps provider id normalization behavior", () => {
const registry = buildMediaUnderstandingRegistry();
const provider = getMediaUnderstandingProvider("gemini", registry);
expect(provider?.id).toBe("google");
});
it("registers the Moonshot provider", () => {
const registry = buildMediaUnderstandingRegistry();
const provider = getMediaUnderstandingProvider("moonshot", registry);
expect(provider?.id).toBe("moonshot");
expect(provider?.capabilities).toEqual(["image", "video"]);
});
it("registers the minimax portal provider", () => {
const registry = buildMediaUnderstandingRegistry();
const provider = getMediaUnderstandingProvider("minimax-portal", registry);
expect(provider?.id).toBe("minimax-portal");
expect(provider?.capabilities).toEqual(["image"]);
expect(groqProvider?.id).toBe("groq");
expect(groqProvider?.capabilities).toEqual(["audio"]);
expect(deepgramProvider?.id).toBe("deepgram");
expect(deepgramProvider?.capabilities).toEqual(["audio"]);
});
it("merges plugin-registered media providers into the active registry", async () => {
@@ -61,4 +41,23 @@ describe("media-understanding provider registry", () => {
expect(provider?.id).toBe("google");
expect(await provider?.describeVideo?.({} as never)).toEqual({ text: "plugin video" });
});
it("keeps provider id normalization behavior for plugin-owned providers", () => {
const pluginRegistry = createEmptyPluginRegistry();
pluginRegistry.mediaUnderstandingProviders.push({
pluginId: "google",
pluginName: "Google Plugin",
source: "test",
provider: {
id: "google",
capabilities: ["image", "audio", "video"],
},
});
setActivePluginRegistry(pluginRegistry);
const registry = buildMediaUnderstandingRegistry();
const provider = getMediaUnderstandingProvider("gemini", registry);
expect(provider?.id).toBe("google");
});
});

View File

@@ -1,28 +1,10 @@
import { normalizeProviderId } from "../../agents/model-selection.js";
import { getActivePluginRegistry } from "../../plugins/runtime.js";
import type { MediaUnderstandingProvider } from "../types.js";
import { anthropicProvider } from "./anthropic/index.js";
import { deepgramProvider } from "./deepgram/index.js";
import { googleProvider } from "./google/index.js";
import { groqProvider } from "./groq/index.js";
import { minimaxPortalProvider, minimaxProvider } from "./minimax/index.js";
import { mistralProvider } from "./mistral/index.js";
import { moonshotProvider } from "./moonshot/index.js";
import { openaiProvider } from "./openai/index.js";
import { zaiProvider } from "./zai/index.js";
const PROVIDERS: MediaUnderstandingProvider[] = [
groqProvider,
openaiProvider,
googleProvider,
anthropicProvider,
minimaxProvider,
minimaxPortalProvider,
moonshotProvider,
mistralProvider,
zaiProvider,
deepgramProvider,
];
const PROVIDERS: MediaUnderstandingProvider[] = [groqProvider, deepgramProvider];
function mergeProviderIntoRegistry(
registry: Map<string, MediaUnderstandingProvider>,

View File

@@ -1,14 +0,0 @@
import type { MediaUnderstandingProvider } from "../../types.js";
import { describeImageWithModel } from "../image.js";
export const minimaxProvider: MediaUnderstandingProvider = {
id: "minimax",
capabilities: ["image"],
describeImage: describeImageWithModel,
};
export const minimaxPortalProvider: MediaUnderstandingProvider = {
id: "minimax-portal",
capabilities: ["image"],
describeImage: describeImageWithModel,
};

View File

@@ -1,23 +1,23 @@
import { describe, expect, it } from "vitest";
import { mistralMediaUnderstandingProvider } from "../../../../extensions/mistral/media-understanding-provider.js";
import {
createRequestCaptureJsonFetch,
installPinnedHostnameTestHooks,
} from "../audio.test-helpers.js";
import { mistralProvider } from "./index.js";
installPinnedHostnameTestHooks();
describe("mistralProvider", () => {
describe("mistralMediaUnderstandingProvider", () => {
it("has expected provider metadata", () => {
expect(mistralProvider.id).toBe("mistral");
expect(mistralProvider.capabilities).toEqual(["audio"]);
expect(mistralProvider.transcribeAudio).toBeDefined();
expect(mistralMediaUnderstandingProvider.id).toBe("mistral");
expect(mistralMediaUnderstandingProvider.capabilities).toEqual(["audio"]);
expect(mistralMediaUnderstandingProvider.transcribeAudio).toBeDefined();
});
it("uses Mistral base URL by default", async () => {
const { fetchFn, getRequest } = createRequestCaptureJsonFetch({ text: "bonjour" });
const result = await mistralProvider.transcribeAudio!({
const result = await mistralMediaUnderstandingProvider.transcribeAudio!({
buffer: Buffer.from("audio-bytes"),
fileName: "voice.ogg",
apiKey: "test-mistral-key", // pragma: allowlist secret
@@ -32,7 +32,7 @@ describe("mistralProvider", () => {
it("allows overriding baseUrl", async () => {
const { fetchFn, getRequest } = createRequestCaptureJsonFetch({ text: "ok" });
await mistralProvider.transcribeAudio!({
await mistralMediaUnderstandingProvider.transcribeAudio!({
buffer: Buffer.from("audio"),
fileName: "note.mp3",
apiKey: "key", // pragma: allowlist secret

View File

@@ -1,14 +0,0 @@
import type { MediaUnderstandingProvider } from "../../types.js";
import { transcribeOpenAiCompatibleAudio } from "../openai/audio.js";
const DEFAULT_MISTRAL_AUDIO_BASE_URL = "https://api.mistral.ai/v1";
export const mistralProvider: MediaUnderstandingProvider = {
id: "mistral",
capabilities: ["audio"],
transcribeAudio: (req) =>
transcribeOpenAiCompatibleAudio({
...req,
baseUrl: req.baseUrl ?? DEFAULT_MISTRAL_AUDIO_BASE_URL,
}),
};

View File

@@ -1,10 +0,0 @@
import type { MediaUnderstandingProvider } from "../../types.js";
import { describeImageWithModel } from "../image.js";
import { describeMoonshotVideo } from "./video.js";
export const moonshotProvider: MediaUnderstandingProvider = {
id: "moonshot",
capabilities: ["image", "video"],
describeImage: describeImageWithModel,
describeVideo: describeMoonshotVideo,
};

View File

@@ -1,9 +1,9 @@
import { describe, expect, it } from "vitest";
import { describeMoonshotVideo } from "../../../../extensions/moonshot/media-understanding-provider.js";
import {
createRequestCaptureJsonFetch,
installPinnedHostnameTestHooks,
} from "../audio.test-helpers.js";
import { describeMoonshotVideo } from "./video.js";
installPinnedHostnameTestHooks();

View File

@@ -1,29 +1,31 @@
import path from "node:path";
import type { AudioTranscriptionRequest, AudioTranscriptionResult } from "../../types.js";
import type { AudioTranscriptionRequest, AudioTranscriptionResult } from "../types.js";
import {
assertOkOrThrowHttpError,
normalizeBaseUrl,
postTranscriptionRequest,
requireTranscriptionText,
} from "../shared.js";
} from "./shared.js";
export const DEFAULT_OPENAI_AUDIO_BASE_URL = "https://api.openai.com/v1";
const DEFAULT_OPENAI_AUDIO_MODEL = "gpt-4o-mini-transcribe";
type OpenAiCompatibleAudioParams = AudioTranscriptionRequest & {
defaultBaseUrl: string;
defaultModel: string;
};
function resolveModel(model?: string): string {
function resolveModel(model: string | undefined, fallback: string): string {
const trimmed = model?.trim();
return trimmed || DEFAULT_OPENAI_AUDIO_MODEL;
return trimmed || fallback;
}
export async function transcribeOpenAiCompatibleAudio(
params: AudioTranscriptionRequest,
params: OpenAiCompatibleAudioParams,
): Promise<AudioTranscriptionResult> {
const fetchFn = params.fetchFn ?? fetch;
const baseUrl = normalizeBaseUrl(params.baseUrl, DEFAULT_OPENAI_AUDIO_BASE_URL);
const baseUrl = normalizeBaseUrl(params.baseUrl, params.defaultBaseUrl);
const allowPrivate = Boolean(params.baseUrl?.trim());
const url = `${baseUrl}/audio/transcriptions`;
const model = resolveModel(params.model);
const model = resolveModel(params.model, params.defaultModel);
const form = new FormData();
const fileName = params.fileName?.trim() || path.basename(params.fileName) || "audio";
const bytes = new Uint8Array(params.buffer);

View File

@@ -1,18 +1,18 @@
import { describe, expect, it } from "vitest";
import { transcribeOpenAiAudio } from "../../../../extensions/openai/media-understanding-provider.js";
import {
createAuthCaptureJsonFetch,
createRequestCaptureJsonFetch,
installPinnedHostnameTestHooks,
} from "../audio.test-helpers.js";
import { transcribeOpenAiCompatibleAudio } from "./audio.js";
installPinnedHostnameTestHooks();
describe("transcribeOpenAiCompatibleAudio", () => {
describe("transcribeOpenAiAudio", () => {
it("respects lowercase authorization header overrides", async () => {
const { fetchFn, getAuthHeader } = createAuthCaptureJsonFetch({ text: "ok" });
const result = await transcribeOpenAiCompatibleAudio({
const result = await transcribeOpenAiAudio({
buffer: Buffer.from("audio"),
fileName: "note.mp3",
apiKey: "test-key",
@@ -28,7 +28,7 @@ describe("transcribeOpenAiCompatibleAudio", () => {
it("builds the expected request payload", async () => {
const { fetchFn, getRequest } = createRequestCaptureJsonFetch({ text: "hello" });
const result = await transcribeOpenAiCompatibleAudio({
const result = await transcribeOpenAiAudio({
buffer: Buffer.from("audio-bytes"),
fileName: "voice.wav",
apiKey: "test-key",
@@ -72,7 +72,7 @@ describe("transcribeOpenAiCompatibleAudio", () => {
const { fetchFn } = createRequestCaptureJsonFetch({});
await expect(
transcribeOpenAiCompatibleAudio({
transcribeOpenAiAudio({
buffer: Buffer.from("audio-bytes"),
fileName: "voice.wav",
apiKey: "test-key",

View File

@@ -1,10 +0,0 @@
import type { MediaUnderstandingProvider } from "../../types.js";
import { describeImageWithModel } from "../image.js";
import { transcribeOpenAiCompatibleAudio } from "./audio.js";
// OpenAI media-understanding provider: images are described via the shared
// model-based describer; audio is transcribed via the OpenAI-compatible
// /audio/transcriptions endpoint.
export const openaiProvider: MediaUnderstandingProvider = {
  id: "openai",
  capabilities: ["image", "audio"],
  describeImage: describeImageWithModel,
  transcribeAudio: transcribeOpenAiCompatibleAudio,
};

View File

@@ -1,8 +0,0 @@
import type { MediaUnderstandingProvider } from "../../types.js";
import { describeImageWithModel } from "../image.js";
// Z.AI media-understanding provider: image description only, delegating to
// the shared model-based describer.
export const zaiProvider: MediaUnderstandingProvider = {
  id: "zai",
  capabilities: ["image"],
  describeImage: describeImageWithModel,
};

View File

@@ -0,0 +1,92 @@
import fs from "node:fs/promises";
import os from "node:os";
import path from "node:path";
import { afterEach, describe, expect, it } from "vitest";
import type { OpenClawConfig } from "../config/config.js";
import { createEmptyPluginRegistry } from "../plugins/registry.js";
import { setActivePluginRegistry } from "../plugins/runtime.js";
import { describeImageFile, runMediaUnderstandingFile } from "./runtime.js";
describe("media-understanding runtime helpers", () => {
  // Temp directories created during a test; removed in afterEach. The
  // previous version leaked every mkdtemp directory on disk.
  const tempDirs: string[] = [];

  /** Create a throwaway dir containing a fake JPEG and return the file path. */
  async function writeTempImage(): Promise<string> {
    const tempDir = await fs.mkdtemp(path.join(os.tmpdir(), "openclaw-media-runtime-"));
    tempDirs.push(tempDir);
    const imagePath = path.join(tempDir, "sample.jpg");
    await fs.writeFile(imagePath, Buffer.from("image-bytes"));
    return imagePath;
  }

  afterEach(async () => {
    // Reset the global plugin registry so tests stay isolated.
    setActivePluginRegistry(createEmptyPluginRegistry());
    // Remove any temp directories the test created.
    await Promise.all(
      tempDirs.splice(0).map((dir) => fs.rm(dir, { recursive: true, force: true })),
    );
  });

  it("describes images through the active media-understanding registry", async () => {
    const imagePath = await writeTempImage();
    const pluginRegistry = createEmptyPluginRegistry();
    pluginRegistry.mediaUnderstandingProviders.push({
      pluginId: "vision-plugin",
      pluginName: "Vision Plugin",
      source: "test",
      provider: {
        id: "vision-plugin",
        capabilities: ["image"],
        describeImage: async () => ({ text: "image ok", model: "vision-v1" }),
      },
    });
    setActivePluginRegistry(pluginRegistry);
    const cfg = {
      tools: {
        media: {
          image: {
            models: [{ provider: "vision-plugin", model: "vision-v1" }],
          },
        },
      },
    } as OpenClawConfig;
    const result = await describeImageFile({
      filePath: imagePath,
      mime: "image/jpeg",
      cfg,
      agentDir: "/tmp/agent",
    });
    expect(result).toEqual({
      text: "image ok",
      provider: "vision-plugin",
      model: "vision-v1",
      output: {
        kind: "image.description",
        attachmentIndex: 0,
        text: "image ok",
        provider: "vision-plugin",
        model: "vision-v1",
      },
    });
  });

  it("returns undefined when no media output is produced", async () => {
    const imagePath = await writeTempImage();
    const result = await runMediaUnderstandingFile({
      capability: "image",
      filePath: imagePath,
      mime: "image/jpeg",
      cfg: {
        tools: {
          media: {
            image: {
              enabled: false,
            },
          },
        },
      } as OpenClawConfig,
      agentDir: "/tmp/agent",
    });
    expect(result).toEqual({
      text: undefined,
      provider: undefined,
      model: undefined,
      output: undefined,
    });
  });
});

View File

@@ -0,0 +1,112 @@
import path from "node:path";
import type { MsgContext } from "../auto-reply/templating.js";
import type { OpenClawConfig } from "../config/config.js";
import {
buildProviderRegistry,
createMediaAttachmentCache,
normalizeMediaAttachments,
runCapability,
type ActiveMediaModel,
} from "./runner.js";
import type { MediaUnderstandingCapability, MediaUnderstandingOutput } from "./types.js";
// Maps each capability to the output `kind` its runner emits, used to pick
// the relevant entry out of runCapability's outputs.
const KIND_BY_CAPABILITY: Record<MediaUnderstandingCapability, MediaUnderstandingOutput["kind"]> = {
  audio: "audio.transcription",
  image: "image.description",
  video: "video.description",
};
// Request for running one media-understanding capability over a local file.
export type RunMediaUnderstandingFileParams = {
  capability: MediaUnderstandingCapability;
  filePath: string;
  cfg: OpenClawConfig;
  agentDir?: string;
  mime?: string;
  activeModel?: ActiveMediaModel;
};
// Result: trimmed output text (or undefined when nothing was produced),
// plus the provider/model that produced it and the raw output entry.
export type RunMediaUnderstandingFileResult = {
  text: string | undefined;
  provider?: string;
  model?: string;
  output?: MediaUnderstandingOutput;
};
/** Build the minimal message context describing one local media file. */
function buildFileContext(params: { filePath: string; mime?: string }): MsgContext {
  const { filePath, mime } = params;
  return { MediaPath: filePath, MediaType: mime };
}
/**
 * Run one media-understanding capability (audio/image/video) against a single
 * local file and return the first matching output's text, provider, and model.
 * Temp files materialized by the attachment cache are always cleaned up.
 */
export async function runMediaUnderstandingFile(
  params: RunMediaUnderstandingFileParams,
): Promise<RunMediaUnderstandingFileResult> {
  const fileCtx = buildFileContext(params);
  const mediaAttachments = normalizeMediaAttachments(fileCtx);
  // Nothing to process when the context yields no attachments.
  if (!mediaAttachments.length) {
    return { text: undefined };
  }
  const providerRegistry = buildProviderRegistry();
  const attachmentCache = createMediaAttachmentCache(mediaAttachments, {
    localPathRoots: [path.dirname(params.filePath)],
  });
  try {
    const capabilityResult = await runCapability({
      capability: params.capability,
      cfg: params.cfg,
      ctx: fileCtx,
      attachments: attachmentCache,
      media: mediaAttachments,
      agentDir: params.agentDir,
      providerRegistry,
      config: params.cfg.tools?.media?.[params.capability],
      activeModel: params.activeModel,
    });
    const wantedKind = KIND_BY_CAPABILITY[params.capability];
    const matched = capabilityResult.outputs.find((entry) => entry.kind === wantedKind);
    const trimmed = matched?.text?.trim();
    return {
      // Empty/whitespace-only text collapses to undefined.
      text: trimmed || undefined,
      provider: matched?.provider,
      model: matched?.model,
      output: matched,
    };
  } finally {
    await attachmentCache.cleanup();
  }
}
export async function describeImageFile(params: {
filePath: string;
cfg: OpenClawConfig;
agentDir?: string;
mime?: string;
activeModel?: ActiveMediaModel;
}): Promise<RunMediaUnderstandingFileResult> {
return await runMediaUnderstandingFile({ ...params, capability: "image" });
}
export async function describeVideoFile(params: {
filePath: string;
cfg: OpenClawConfig;
agentDir?: string;
mime?: string;
activeModel?: ActiveMediaModel;
}): Promise<RunMediaUnderstandingFileResult> {
return await runMediaUnderstandingFile({ ...params, capability: "video" });
}
/** Convenience wrapper: run the "audio" capability, exposing only the transcript text. */
export async function transcribeAudioFile(params: {
  filePath: string;
  cfg: OpenClawConfig;
  agentDir?: string;
  mime?: string;
  activeModel?: ActiveMediaModel;
}): Promise<{ text: string | undefined }> {
  const outcome = await runMediaUnderstandingFile({ ...params, capability: "audio" });
  return { text: outcome.text };
}

View File

@@ -1,13 +1,13 @@
import { beforeEach, describe, expect, it, vi } from "vitest";
import type { OpenClawConfig } from "../config/config.js";
const { runAudioTranscription } = vi.hoisted(() => {
const runAudioTranscription = vi.fn();
return { runAudioTranscription };
const { transcribeAudioFileFromRuntime } = vi.hoisted(() => {
const transcribeAudioFileFromRuntime = vi.fn();
return { transcribeAudioFileFromRuntime };
});
vi.mock("./audio-transcription-runner.js", () => ({
runAudioTranscription,
vi.mock("./runtime.js", () => ({
transcribeAudioFile: transcribeAudioFileFromRuntime,
}));
import { transcribeAudioFile } from "./transcribe-audio.js";
@@ -17,27 +17,23 @@ describe("transcribeAudioFile", () => {
vi.clearAllMocks();
});
it("does not force audio/wav when mime is omitted", async () => {
runAudioTranscription.mockResolvedValue({ transcript: "hello", attachments: [] });
it("forwards file transcription requests to the shared runtime helper", async () => {
transcribeAudioFileFromRuntime.mockResolvedValue({ text: "hello" });
const result = await transcribeAudioFile({
filePath: "/tmp/note.mp3",
cfg: {} as OpenClawConfig,
});
expect(runAudioTranscription).toHaveBeenCalledWith({
ctx: {
MediaPath: "/tmp/note.mp3",
MediaType: undefined,
},
expect(transcribeAudioFileFromRuntime).toHaveBeenCalledWith({
filePath: "/tmp/note.mp3",
cfg: {} as OpenClawConfig,
agentDir: undefined,
});
expect(result).toEqual({ text: "hello" });
});
it("returns undefined when helper returns no transcript", async () => {
runAudioTranscription.mockResolvedValue({ transcript: undefined, attachments: [] });
it("returns undefined when the runtime helper returns no transcript", async () => {
transcribeAudioFileFromRuntime.mockResolvedValue({ text: undefined });
const result = await transcribeAudioFile({
filePath: "/tmp/missing.wav",
@@ -51,7 +47,7 @@ describe("transcribeAudioFile", () => {
const cfg = {
tools: { media: { audio: { timeoutSeconds: 10 } } },
} as unknown as OpenClawConfig;
runAudioTranscription.mockRejectedValue(new Error("boom"));
transcribeAudioFileFromRuntime.mockRejectedValue(new Error("boom"));
await expect(
transcribeAudioFile({

View File

@@ -1,29 +1 @@
import type { OpenClawConfig } from "../config/config.js";
import { runAudioTranscription } from "./audio-transcription-runner.js";
/**
* Transcribe an audio file using the configured media-understanding provider.
*
* Reads provider/model/apiKey from `tools.media.audio` in the openclaw config,
* falling back through configured models until one succeeds.
*
* This is the runtime-exposed entry point for external plugins (e.g. marmot)
* that need STT without importing internal media-understanding modules directly.
*/
export async function transcribeAudioFile(params: {
  filePath: string;
  cfg: OpenClawConfig;
  agentDir?: string;
  mime?: string;
}): Promise<{ text: string | undefined }> {
  // Build the minimal media context the shared transcription runner expects.
  const mediaCtx = {
    MediaPath: params.filePath,
    MediaType: params.mime,
  };
  const outcome = await runAudioTranscription({
    ctx: mediaCtx,
    cfg: params.cfg,
    agentDir: params.agentDir,
  });
  return { text: outcome.transcript };
}
export { transcribeAudioFile } from "./runtime.js";

View File

@@ -574,34 +574,62 @@ export function createPluginRegistry(registryParams: PluginRegistryParams) {
});
};
const registerSpeechProvider = (record: PluginRecord, provider: SpeechProviderPlugin) => {
const id = provider.id.trim();
const registerUniqueProviderLike = <
T extends { id: string },
R extends {
pluginId: string;
pluginName?: string;
provider: T;
source: string;
rootDir?: string;
},
>(params: {
record: PluginRecord;
provider: T;
kindLabel: string;
registrations: R[];
ownedIds: string[];
}) => {
const id = params.provider.id.trim();
const { record, kindLabel } = params;
const missingLabel = `${kindLabel} registration missing id`;
const duplicateLabel = `${kindLabel} already registered: ${id}`;
if (!id) {
pushDiagnostic({
level: "error",
pluginId: record.id,
source: record.source,
message: "speech provider registration missing id",
message: missingLabel,
});
return;
}
const existing = registry.speechProviders.find((entry) => entry.provider.id === id);
const existing = params.registrations.find((entry) => entry.provider.id === id);
if (existing) {
pushDiagnostic({
level: "error",
pluginId: record.id,
source: record.source,
message: `speech provider already registered: ${id} (${existing.pluginId})`,
message: `${duplicateLabel} (${existing.pluginId})`,
});
return;
}
record.speechProviderIds.push(id);
registry.speechProviders.push({
params.ownedIds.push(id);
params.registrations.push({
pluginId: record.id,
pluginName: record.name,
provider,
provider: params.provider,
source: record.source,
rootDir: record.rootDir,
} as R);
};
const registerSpeechProvider = (record: PluginRecord, provider: SpeechProviderPlugin) => {
registerUniqueProviderLike({
record,
provider,
kindLabel: "speech provider",
registrations: registry.speechProviders,
ownedIds: record.speechProviderIds,
});
};
@@ -609,64 +637,22 @@ export function createPluginRegistry(registryParams: PluginRegistryParams) {
record: PluginRecord,
provider: MediaUnderstandingProviderPlugin,
) => {
const id = provider.id.trim();
if (!id) {
pushDiagnostic({
level: "error",
pluginId: record.id,
source: record.source,
message: "media provider registration missing id",
});
return;
}
const existing = registry.mediaUnderstandingProviders.find((entry) => entry.provider.id === id);
if (existing) {
pushDiagnostic({
level: "error",
pluginId: record.id,
source: record.source,
message: `media provider already registered: ${id} (${existing.pluginId})`,
});
return;
}
record.mediaUnderstandingProviderIds.push(id);
registry.mediaUnderstandingProviders.push({
pluginId: record.id,
pluginName: record.name,
registerUniqueProviderLike({
record,
provider,
source: record.source,
rootDir: record.rootDir,
kindLabel: "media provider",
registrations: registry.mediaUnderstandingProviders,
ownedIds: record.mediaUnderstandingProviderIds,
});
};
const registerWebSearchProvider = (record: PluginRecord, provider: WebSearchProviderPlugin) => {
const id = provider.id.trim();
if (!id) {
pushDiagnostic({
level: "error",
pluginId: record.id,
source: record.source,
message: "web search provider registration missing id",
});
return;
}
const existing = registry.webSearchProviders.find((entry) => entry.provider.id === id);
if (existing) {
pushDiagnostic({
level: "error",
pluginId: record.id,
source: record.source,
message: `web search provider already registered: ${id} (${existing.pluginId})`,
});
return;
}
record.webSearchProviderIds.push(id);
registry.webSearchProviders.push({
pluginId: record.id,
pluginName: record.name,
registerUniqueProviderLike({
record,
provider,
source: record.source,
rootDir: record.rootDir,
kindLabel: "web search provider",
registrations: registry.webSearchProviders,
ownedIds: record.webSearchProviderIds,
});
};

View File

@@ -4,7 +4,12 @@ import {
resolveApiKeyForProvider as resolveApiKeyForProviderRaw,
} from "../../agents/model-auth.js";
import { resolveStateDir } from "../../config/paths.js";
import { transcribeAudioFile } from "../../media-understanding/transcribe-audio.js";
import {
describeImageFile,
describeVideoFile,
runMediaUnderstandingFile,
transcribeAudioFile,
} from "../../media-understanding/runtime.js";
import { listSpeechVoices, textToSpeech, textToSpeechTelephony } from "../../tts/tts.js";
import { createRuntimeAgent } from "./runtime-agent.js";
import { createRuntimeChannel } from "./runtime-channel.js";
@@ -136,6 +141,12 @@ export function createPluginRuntime(_options: CreatePluginRuntimeOptions = {}):
system: createRuntimeSystem(),
media: createRuntimeMedia(),
tts: { textToSpeech, textToSpeechTelephony, listVoices: listSpeechVoices },
mediaUnderstanding: {
runFile: runMediaUnderstandingFile,
describeImageFile,
describeVideoFile,
transcribeAudioFile,
},
stt: { transcribeAudioFile },
tools: createRuntimeTools(),
channel: createRuntimeChannel(),

View File

@@ -51,6 +51,12 @@ export type PluginRuntimeCore = {
textToSpeechTelephony: typeof import("../../tts/tts.js").textToSpeechTelephony;
listVoices: typeof import("../../tts/tts.js").listSpeechVoices;
};
mediaUnderstanding: {
runFile: typeof import("../../media-understanding/runtime.js").runMediaUnderstandingFile;
describeImageFile: typeof import("../../media-understanding/runtime.js").describeImageFile;
describeVideoFile: typeof import("../../media-understanding/runtime.js").describeVideoFile;
transcribeAudioFile: typeof import("../../media-understanding/runtime.js").transcribeAudioFile;
};
stt: {
transcribeAudioFile: typeof import("../../media-understanding/transcribe-audio.js").transcribeAudioFile;
};