Initial commit

This commit is contained in:
Zhongwei Li
2025-11-30 08:24:51 +08:00
commit 8aebb293cd
31 changed files with 7386 additions and 0 deletions

View File

@@ -0,0 +1,152 @@
/**
* Multimodal Video and Audio Understanding with Gemini API
*
* Demonstrates:
* - Video analysis (what happens in the video)
* - Audio transcription and understanding
* - PDF document parsing
* - Combining multiple modalities
*
* Prerequisites:
* - npm install @google/genai@1.27.0
* - export GEMINI_API_KEY="..."
*/
import { GoogleGenAI } from '@google/genai';
import fs from 'fs';
/** Model used by every example in this file. */
const GEMINI_MODEL = 'gemini-2.5-flash';

/** A request part carrying a file inline as base64 text plus its MIME type. */
interface InlineMediaPart {
  inlineData: { data: string; mimeType: string };
}

/**
 * Reads a local file synchronously and wraps it as an inline base64 request part.
 *
 * @param filePath - Path of the file to read.
 * @param mimeType - MIME type to declare for the file contents.
 * @returns The part object to place in a `parts` array.
 * @throws The underlying `fs` error (e.g. `code === 'ENOENT'`) when the file cannot be read.
 */
function loadInlinePart(filePath: string, mimeType: string): InlineMediaPart {
  return {
    inlineData: {
      data: fs.readFileSync(filePath).toString('base64'),
      mimeType,
    },
  };
}

/**
 * Produces a printable message for any thrown value, not just `Error` instances.
 *
 * @param error - Whatever was caught.
 * @returns `error.message` when it is a string, otherwise `String(error)`.
 */
function describeError(error: unknown): string {
  if (typeof error === 'object' && error !== null && 'message' in error) {
    const { message } = error;
    if (typeof message === 'string') {
      return message;
    }
  }
  return String(error);
}

/**
 * Tells whether a caught value represents a missing-file failure.
 *
 * Checks the `code` field that Node sets on fs errors first; falls back to
 * looking for "ENOENT" in the message so wrapped errors are still recognised.
 */
function isFileNotFound(error: unknown): boolean {
  if (typeof error === 'object' && error !== null && 'code' in error && error.code === 'ENOENT') {
    return true;
  }
  return describeError(error).includes('ENOENT');
}

/**
 * Sends one text prompt together with one inline media part to the model.
 *
 * @param ai - Client used to issue the request.
 * @param prompt - Instruction text placed before the media part.
 * @param media - Inline file part produced by `loadInlinePart`.
 * @returns The `text` of the response (may be `undefined`).
 */
async function askAboutMedia(
  ai: GoogleGenAI,
  prompt: string,
  media: InlineMediaPart,
): Promise<string | undefined> {
  const response = await ai.models.generateContent({
    model: GEMINI_MODEL,
    contents: [{ parts: [{ text: prompt }, media] }],
  });
  return response.text;
}

/**
 * Runs the four multimodal examples in order: video analysis, audio
 * transcription, PDF parsing, and a second set of questions about the video.
 *
 * Each file is read immediately before its example, so earlier examples still
 * run when a later file is missing. Any failure is printed to stderr and the
 * process exit code is set to 1; the returned promise itself never rejects.
 */
async function main(): Promise<void> {
  try {
    // Constructed inside the try so a client-setup failure is reported like any other error.
    const ai = new GoogleGenAI({
      apiKey: process.env.GEMINI_API_KEY,
    });

    // Example 1: Analyze video
    console.log('Example 1: Video Analysis\n');
    const videoPath = '/path/to/video.mp4'; // Replace with actual path
    const videoPart = loadInlinePart(videoPath, 'video/mp4');
    console.log(
      await askAboutMedia(
        ai,
        'Describe what happens in this video. Summarize the key events.',
        videoPart,
      ),
    );
    console.log('\n---\n');

    // Example 2: Transcribe and analyze audio
    console.log('Example 2: Audio Transcription and Analysis\n');
    const audioPath = '/path/to/audio.mp3'; // Replace with actual path
    const audioPart = loadInlinePart(audioPath, 'audio/mp3');
    console.log(
      await askAboutMedia(
        ai,
        'Transcribe this audio and provide a summary of the main points discussed.',
        audioPart,
      ),
    );
    console.log('\n---\n');

    // Example 3: Parse PDF document
    console.log('Example 3: PDF Document Parsing\n');
    const pdfPath = '/path/to/document.pdf'; // Replace with actual path
    const pdfPart = loadInlinePart(pdfPath, 'application/pdf');
    console.log(
      await askAboutMedia(
        ai,
        'Summarize the key points in this PDF document. Extract any important data or conclusions.',
        pdfPart,
      ),
    );
    console.log('\n---\n');

    // Example 4: Combine multiple modalities (reuses the video already loaded for Example 1)
    console.log('Example 4: Multiple Modalities (Video + Text Questions)\n');
    console.log(
      await askAboutMedia(
        ai,
        'Based on this video, answer these questions:\n1. How many people appear?\n2. What is the main activity?\n3. Where does this take place?',
        videoPart,
      ),
    );
  } catch (error: unknown) {
    console.error('Error:', describeError(error));
    if (isFileNotFound(error)) {
      console.error('\n⚠ File not found. Update the file path variables with valid paths.');
    }
    // Signal failure to the caller/shell without cutting off pending output.
    process.exitCode = 1;
  }
}
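/**
* Input limits and tips.
*
* NOTE: the figures below are the author's notes and may be out of date; check
* the current Gemini API documentation before relying on exact numbers. Every
* example in this file sends its file inline as base64, so the whole request
* must also fit within the API's inline request size limit. Upload larger
* files with the File API instead (Phase 2).
*
* Supported Video Formats:
* - MP4, MPEG, MOV, AVI, FLV, MPG, WebM, WMV
* Max size: 2GB (use File API for larger - Phase 2)
* Max length (inline): 2 minutes
*
* Supported Audio Formats:
* - MP3, WAV, FLAC, AAC, OGG, OPUS
* Max size: 20MB
*
* PDF:
* - Max size: 30MB
* - Text-based PDFs work best
* - Scanned images may have lower accuracy
*
* Tips:
* - For videos > 2 minutes, use the File API (Phase 2)
* - Specific prompts yield better results
* - You can combine text, images, video, audio, and PDFs in one request
*/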
// Script entry point. Report any rejection that escapes main() instead of
// leaving it as an unhandled promise rejection, and exit with a failing status.
main().catch((error: unknown) => {
  console.error('Fatal error:', error instanceof Error ? error.message : error);
  process.exitCode = 1;
});