Skip to content
Prev Previous commit
Next Next commit
feat: add multimodal embeddings (text + image + video) to VoyageAI integration

- New tool: voyageai_multimodal_embeddings using voyage-multimodal-3.5 model
- New API route: /api/tools/voyageai/multimodal-embeddings for server-side file handling
- Supports text, image files/URLs, video files/URLs in a single embedding
- Uses file-upload subBlocks with basic/advanced mode for images and video
- Internal proxy pattern: downloads UserFiles via downloadFileFromStorage, converts to base64
- URL validation via validateUrlWithDNS for SSRF protection
- 14 new unit tests (tool metadata, body, response transform)
- 5 new integration tests (text-only, image URL, text+image, dimensions, auth)
- 8 new block tests (multimodal operation, params, subBlocks)
  • Loading branch information
fzowl committed Mar 26, 2026
commit e729a82dad823eb97cf7d2b029c40eedb5cd15ab
211 changes: 211 additions & 0 deletions apps/sim/app/api/tools/voyageai/multimodal-embeddings/route.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,211 @@
import { createLogger } from '@sim/logger'
import { type NextRequest, NextResponse } from 'next/server'
import { z } from 'zod'
import { checkInternalAuth } from '@/lib/auth/hybrid'
import { validateUrlWithDNS } from '@/lib/core/security/input-validation.server'
import { generateRequestId } from '@/lib/core/utils/request'
import { RawFileInputArraySchema, RawFileInputSchema } from '@/lib/uploads/utils/file-schemas'
import { processSingleFileToUserFile } from '@/lib/uploads/utils/file-utils'
import { downloadFileFromStorage } from '@/lib/uploads/utils/file-utils.server'

// Next.js route segment config: always run this handler per request instead of
// letting the framework statically optimize or cache it.
export const dynamic = 'force-dynamic'

// Module-level logger for this route; the handler prefixes each message with the request ID.
const logger = createLogger('VoyageAIMultimodalAPI')

/** Validator for string fields the caller may omit entirely or send as `null`. */
const optionalNullableString = z.string().optional().nullable()

/**
 * Shape of the JSON body accepted by the multimodal embeddings route.
 *
 * - `apiKey` is the only required field and must be non-empty.
 * - `input` is the text part; `imageFiles` / `videoFile` are uploaded-file
 *   descriptors; `imageUrls` / `videoUrl` are remote references.
 * - `imageUrls` is a single string: the handler parses it either as JSON or
 *   as a comma/newline separated list.
 * - `model` falls back to `voyage-multimodal-3.5` when omitted.
 * - `inputType`, when given, must be `query` or `document`.
 */
const MultimodalEmbeddingsSchema = z.object({
  apiKey: z.string().min(1, 'API key is required'),
  input: optionalNullableString,
  imageFiles: z.union([RawFileInputSchema, RawFileInputArraySchema]).optional().nullable(),
  imageUrls: optionalNullableString,
  videoFile: RawFileInputSchema.optional().nullable(),
  videoUrl: optionalNullableString,
  model: z.string().optional().default('voyage-multimodal-3.5'),
  inputType: z.enum(['query', 'document']).optional().nullable(),
})

/**
 * Parses the `imageUrls` request field into a clean list of URL strings.
 *
 * Accepts either a JSON value (an array of strings, or a single string) or a
 * plain comma/newline separated list. Every entry is trimmed and empty
 * entries are dropped.
 *
 * @param raw - The raw `imageUrls` string from the request body.
 * @returns The individual URL strings, in input order.
 * @throws Error when `raw` is valid JSON but is neither a string nor an array
 *   of strings (for example a number, an object, or `null`).
 */
function parseImageUrls(raw: string): string[] {
  let parsed: unknown
  try {
    parsed = JSON.parse(raw)
  } catch {
    // Not JSON: treat the value as a comma/newline separated list.
    return raw
      .split(/[,\n]/)
      .map((u) => u.trim())
      .filter(Boolean)
  }

  // JSON.parse succeeds on scalars too, so the result must be checked before
  // iterating: a bare string would otherwise be walked character by character
  // and a number/object/null would throw inside `for...of`.
  const entries: unknown[] = Array.isArray(parsed) ? parsed : [parsed]
  const urls: string[] = []
  for (const entry of entries) {
    if (typeof entry !== 'string') {
      throw new Error('imageUrls must be a URL string or an array of URL strings')
    }
    const trimmed = entry.trim()
    if (trimmed) {
      urls.push(trimmed)
    }
  }
  return urls
}

/**
 * Converts an uploaded file descriptor into a `data:` URI.
 *
 * Uses the base64 payload already attached to the file when present and only
 * downloads the file from storage when it is missing.
 *
 * @param rawFile - The raw file input taken from the request body.
 * @param requestId - Request ID used to tag log lines.
 * @param fallbackMimeType - MIME type to use when the file does not declare one.
 * @param label - Human-readable media kind (`image` / `video`) for the log line.
 * @returns A `data:<mime>;base64,<payload>` string.
 */
async function fileToDataUri(
  rawFile: Parameters<typeof processSingleFileToUserFile>[0],
  requestId: string,
  fallbackMimeType: string,
  label: string
): Promise<string> {
  const userFile = processSingleFileToUserFile(rawFile, requestId, logger)
  let base64 = userFile.base64
  if (!base64) {
    const buffer = await downloadFileFromStorage(userFile, requestId, logger)
    base64 = buffer.toString('base64')
    logger.info(`[${requestId}] Converted ${label} to base64 (${buffer.length} bytes)`)
  }
  const mimeType = userFile.type || fallbackMimeType
  return `data:${mimeType};base64,${base64}`
}

/**
 * POST handler that builds a single multimodal input (text, images, video)
 * and forwards it to the VoyageAI multimodal embeddings endpoint.
 *
 * Flow: authenticate the internal caller, validate the body against
 * `MultimodalEmbeddingsSchema`, assemble the `content` array (uploaded files
 * become base64 data URIs, remote URLs are checked with `validateUrlWithDNS`),
 * call VoyageAI, and reshape the response.
 *
 * Responses:
 * - 401 when internal auth fails.
 * - 400 for an invalid body, an unprocessable file, a rejected URL, or when
 *   no input of any kind was supplied.
 * - The upstream status code when VoyageAI answers with a non-2xx status.
 * - 502 when VoyageAI answers 2xx but without a `data` array.
 * - 500 for any other unexpected failure.
 *
 * @param request - Incoming request carrying the JSON body.
 * @returns A JSON response with `success` and either `output` or `error`.
 */
export async function POST(request: NextRequest) {
  const requestId = generateRequestId()

  try {
    const authResult = await checkInternalAuth(request, { requireWorkflowId: false })
    if (!authResult.success) {
      logger.warn(`[${requestId}] Unauthorized multimodal embeddings attempt`)
      return NextResponse.json(
        { success: false, error: authResult.error || 'Authentication required' },
        { status: 401 }
      )
    }

    // A malformed body is a client error, so report it as 400 rather than
    // letting the parse failure fall through to the generic 500 handler.
    let body: unknown
    try {
      body = await request.json()
    } catch {
      logger.warn(`[${requestId}] Request body is not valid JSON`)
      return NextResponse.json(
        { success: false, error: 'Invalid request data: body is not valid JSON' },
        { status: 400 }
      )
    }
    const params = MultimodalEmbeddingsSchema.parse(body)

    const content: Array<Record<string, string>> = []

    // Add text content
    if (params.input?.trim()) {
      content.push({ type: 'text', text: params.input })
    }

    // Process image files → base64 (sequentially, stopping at the first failure)
    if (params.imageFiles) {
      const files = Array.isArray(params.imageFiles) ? params.imageFiles : [params.imageFiles]
      for (const rawFile of files) {
        try {
          const dataUri = await fileToDataUri(rawFile, requestId, 'image/jpeg', 'image')
          content.push({ type: 'image_base64', image_base64: dataUri })
        } catch (error) {
          logger.error(`[${requestId}] Failed to process image file:`, error)
          return NextResponse.json(
            {
              success: false,
              error: `Failed to process image file: ${error instanceof Error ? error.message : 'Unknown error'}`,
            },
            { status: 400 }
          )
        }
      }
    }

    // Process image URLs
    if (params.imageUrls?.trim()) {
      let urls: string[]
      try {
        urls = parseImageUrls(params.imageUrls)
      } catch (error) {
        return NextResponse.json(
          {
            success: false,
            error: `Invalid image URL: ${error instanceof Error ? error.message : 'Unknown error'}`,
          },
          { status: 400 }
        )
      }

      for (const url of urls) {
        const validation = await validateUrlWithDNS(url, 'imageUrl')
        if (!validation.isValid) {
          return NextResponse.json(
            { success: false, error: `Invalid image URL: ${validation.error}` },
            { status: 400 }
          )
        }
        content.push({ type: 'image_url', image_url: url })
      }
    }

    // Process video file → base64
    if (params.videoFile) {
      try {
        const dataUri = await fileToDataUri(params.videoFile, requestId, 'video/mp4', 'video')
        content.push({ type: 'video_base64', video_base64: dataUri })
      } catch (error) {
        logger.error(`[${requestId}] Failed to process video file:`, error)
        return NextResponse.json(
          {
            success: false,
            error: `Failed to process video file: ${error instanceof Error ? error.message : 'Unknown error'}`,
          },
          { status: 400 }
        )
      }
    }

    // Process video URL (validate and forward the trimmed value the guard checked)
    const videoUrl = params.videoUrl?.trim()
    if (videoUrl) {
      const validation = await validateUrlWithDNS(videoUrl, 'videoUrl')
      if (!validation.isValid) {
        return NextResponse.json(
          { success: false, error: `Invalid video URL: ${validation.error}` },
          { status: 400 }
        )
      }
      content.push({ type: 'video_url', video_url: videoUrl })
    }

    if (content.length === 0) {
      return NextResponse.json(
        { success: false, error: 'At least one input (text, image, or video) is required' },
        { status: 400 }
      )
    }

    logger.info(`[${requestId}] Calling VoyageAI multimodal embeddings`, {
      contentTypes: content.map((c) => c.type),
      model: params.model,
    })

    // Build VoyageAI request: all parts are sent as one input, i.e. one embedding
    const voyageBody: Record<string, unknown> = {
      inputs: [{ content }],
      model: params.model,
    }
    if (params.inputType) {
      voyageBody.input_type = params.inputType
    }

    const voyageResponse = await fetch('https://api.voyageai.com/v1/multimodalembeddings', {
      method: 'POST',
      headers: {
        Authorization: `Bearer ${params.apiKey}`,
        'Content-Type': 'application/json',
      },
      body: JSON.stringify(voyageBody),
    })

    if (!voyageResponse.ok) {
      const errorText = await voyageResponse.text()
      logger.error(`[${requestId}] VoyageAI API error: ${voyageResponse.status}`, { errorText })
      return NextResponse.json(
        { success: false, error: `VoyageAI API error: ${voyageResponse.status} - ${errorText}` },
        { status: voyageResponse.status }
      )
    }

    const data = await voyageResponse.json()

    // Guard the response shape so a malformed upstream body produces a clear
    // error instead of a TypeError from `.map` on undefined.
    if (!Array.isArray(data?.data)) {
      logger.error(`[${requestId}] VoyageAI returned an unexpected response shape`)
      return NextResponse.json(
        { success: false, error: 'VoyageAI API returned an unexpected response format' },
        { status: 502 }
      )
    }

    logger.info(`[${requestId}] Multimodal embeddings generated successfully`, {
      embeddingsCount: data.data.length,
      totalTokens: data.usage?.total_tokens,
    })

    return NextResponse.json({
      success: true,
      output: {
        embeddings: data.data.map((item: { embedding: number[] }) => item.embedding),
        model: data.model,
        usage: {
          text_tokens: data.usage?.text_tokens,
          image_pixels: data.usage?.image_pixels,
          video_pixels: data.usage?.video_pixels,
          total_tokens: data.usage?.total_tokens,
        },
      },
    })
  } catch (error) {
    if (error instanceof z.ZodError) {
      logger.warn(`[${requestId}] Invalid request data`, { errors: error.errors })
      return NextResponse.json(
        { success: false, error: 'Invalid request data', details: error.errors },
        { status: 400 }
      )
    }

    const errorMessage = error instanceof Error ? error.message : 'Unknown error'
    logger.error(`[${requestId}] Multimodal embeddings failed:`, error)
    return NextResponse.json(
      { success: false, error: `Multimodal embeddings failed: ${errorMessage}` },
      { status: 500 }
    )
  }
}
109 changes: 107 additions & 2 deletions apps/sim/blocks/blocks/voyageai.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -46,8 +46,12 @@ describe('VoyageAIBlock', () => {
expect(VoyageAIBlock.bgColor).toBe('#1A1A2E')
})

it('should list both tool IDs in access', () => {
expect(VoyageAIBlock.tools.access).toEqual(['voyageai_embeddings', 'voyageai_rerank'])
// The block's tool access list is compared against exactly these three IDs, in this order.
it('should list all tool IDs in access', () => {
  const expectedToolIds = [
    'voyageai_embeddings',
    'voyageai_multimodal_embeddings',
    'voyageai_rerank',
  ]
  expect(VoyageAIBlock.tools.access).toEqual(expectedToolIds)
})

it('should have tools.config.tool and tools.config.params functions', () => {
Expand Down Expand Up @@ -159,6 +163,31 @@ describe('VoyageAIBlock', () => {
expect(modelIds).toContain('voyage-law-2')
})

// Every multimodal input control must be gated on the `multimodal_embeddings` operation.
it('should have multimodal-specific subBlocks with correct conditions', () => {
  // Collect the IDs of all subBlocks whose condition object targets the multimodal operation.
  const multimodalIds = VoyageAIBlock.subBlocks
    .filter((subBlock) => {
      const { condition } = subBlock
      return (
        condition &&
        typeof condition === 'object' &&
        'value' in condition &&
        condition.value === 'multimodal_embeddings'
      )
    })
    .map((subBlock) => subBlock.id)

  const requiredIds = [
    'multimodalInput',
    'imageFiles',
    'imageFilesRef',
    'videoFile',
    'videoFileRef',
    'multimodalModel',
  ]
  for (const requiredId of requiredIds) {
    expect(multimodalIds).toContain(requiredId)
  }
})

// The multimodal model dropdown must offer both supported multimodal models.
it('should have multimodal models in the dropdown', () => {
  const dropdown = VoyageAIBlock.subBlocks.find((sb) => sb.id === 'multimodalModel') as any
  expect(dropdown).toBeDefined()

  const optionIds = dropdown.options.map((option: any) => option.id)
  for (const expectedModel of ['voyage-multimodal-3.5', 'voyage-multimodal-3']) {
    expect(optionIds).toContain(expectedModel)
  }
})

it('should have all rerank models in the dropdown', () => {
const modelBlock = VoyageAIBlock.subBlocks.find((sb) => sb.id === 'rerankModel') as any
expect(modelBlock).toBeDefined()
Expand Down Expand Up @@ -200,6 +229,12 @@ describe('VoyageAIBlock', () => {
expect(toolFunction({ operation: 'embeddings' })).toBe('voyageai_embeddings')
})

// Tool selection: the multimodal operation maps to the multimodal embeddings tool ID.
it('should return voyageai_multimodal_embeddings for multimodal_embeddings operation', () => {
  const selectedTool = toolFunction({ operation: 'multimodal_embeddings' })
  expect(selectedTool).toBe('voyageai_multimodal_embeddings')
})

// Tool selection: the rerank operation maps to the rerank tool ID.
it('should return voyageai_rerank for rerank operation', () => {
  const selectedTool = toolFunction({ operation: 'rerank' })
  expect(selectedTool).toBe('voyageai_rerank')
})
Expand Down Expand Up @@ -398,5 +433,75 @@ describe('VoyageAIBlock', () => {
expect(result.embeddingModel).toBeUndefined()
})
})

// Parameter mapping for the multimodal operation: block field names
// (multimodalInput, multimodalModel, multimodalInputType) are translated to the
// tool's parameter names, and fields from other operations are not forwarded.
describe('multimodal_embeddings operation', () => {
  it('should pass text input and model', () => {
    const { apiKey, input, model } = paramsFunction({
      operation: 'multimodal_embeddings',
      apiKey: 'va-key',
      multimodalInput: 'describe this image',
      multimodalModel: 'voyage-multimodal-3.5',
    })

    expect(apiKey).toBe('va-key')
    expect(input).toBe('describe this image')
    expect(model).toBe('voyage-multimodal-3.5')
  })

  it('should pass image URLs', () => {
    const { imageUrls } = paramsFunction({
      operation: 'multimodal_embeddings',
      apiKey: 'va-key',
      imageUrls: 'https://example.com/img.jpg',
      multimodalModel: 'voyage-multimodal-3.5',
    })

    expect(imageUrls).toBe('https://example.com/img.jpg')
  })

  it('should pass video URL', () => {
    const { videoUrl } = paramsFunction({
      operation: 'multimodal_embeddings',
      apiKey: 'va-key',
      videoUrl: 'https://example.com/video.mp4',
      multimodalModel: 'voyage-multimodal-3.5',
    })

    expect(videoUrl).toBe('https://example.com/video.mp4')
  })

  it('should pass inputType for multimodal', () => {
    const { inputType } = paramsFunction({
      operation: 'multimodal_embeddings',
      apiKey: 'va-key',
      multimodalInput: 'test',
      multimodalModel: 'voyage-multimodal-3.5',
      multimodalInputType: 'query',
    })

    expect(inputType).toBe('query')
  })

  it('should omit empty optional fields', () => {
    // Only the required fields are supplied, so every optional input must be absent.
    const { input, imageFiles, imageUrls, videoFile, videoUrl } = paramsFunction({
      operation: 'multimodal_embeddings',
      apiKey: 'va-key',
      multimodalModel: 'voyage-multimodal-3.5',
    })

    expect(input).toBeUndefined()
    expect(imageFiles).toBeUndefined()
    expect(imageUrls).toBeUndefined()
    expect(videoFile).toBeUndefined()
    expect(videoUrl).toBeUndefined()
  })

  it('should not include text embedding or rerank fields', () => {
    // Stray fields belonging to the other operations must not appear in the result.
    const { embeddingModel, query } = paramsFunction({
      operation: 'multimodal_embeddings',
      apiKey: 'va-key',
      multimodalModel: 'voyage-multimodal-3.5',
      embeddingModel: 'should not appear',
      query: 'should not appear',
    })

    expect(embeddingModel).toBeUndefined()
    expect(query).toBeUndefined()
  })
})
})
})
Loading