Initial commit
This commit is contained in:
97
skills/ai-multimodal/.env.example
Normal file
97
skills/ai-multimodal/.env.example
Normal file
@@ -0,0 +1,97 @@
|
||||
# Google Gemini API Configuration
|
||||
|
||||
# ============================================================================
|
||||
# OPTION 1: Google AI Studio (Default - Recommended for most users)
|
||||
# ============================================================================
|
||||
# Get your API key: https://aistudio.google.com/apikey
|
||||
GEMINI_API_KEY=your_api_key_here
|
||||
|
||||
# ============================================================================
|
||||
# OPTION 2: Vertex AI (Google Cloud Platform)
|
||||
# ============================================================================
|
||||
# Uncomment these lines to use Vertex AI instead of Google AI Studio
|
||||
# GEMINI_USE_VERTEX=true
|
||||
# VERTEX_PROJECT_ID=your-gcp-project-id
|
||||
# VERTEX_LOCATION=us-central1
|
||||
|
||||
# ============================================================================
|
||||
# Model Selection (Optional)
|
||||
# ============================================================================
|
||||
# Override default model for specific tasks
|
||||
# Default: gemini-2.5-flash for most tasks
|
||||
# GEMINI_MODEL=gemini-2.5-flash
|
||||
# GEMINI_IMAGE_GEN_MODEL=gemini-2.5-flash-image
|
||||
|
||||
# ============================================================================
|
||||
# Rate Limiting Configuration (Optional)
|
||||
# ============================================================================
|
||||
# Requests per minute limit (adjust based on your tier)
|
||||
# GEMINI_RPM_LIMIT=15
|
||||
|
||||
# Tokens per minute limit
|
||||
# GEMINI_TPM_LIMIT=4000000
|
||||
|
||||
# Requests per day limit
|
||||
# GEMINI_RPD_LIMIT=1500
|
||||
|
||||
# ============================================================================
|
||||
# Processing Options (Optional)
|
||||
# ============================================================================
|
||||
# Video resolution mode: default or low-res
|
||||
# low-res uses ~100 tokens/second vs ~300 for default
|
||||
# GEMINI_VIDEO_RESOLUTION=default
|
||||
|
||||
# Audio quality: default (16 Kbps mono, auto-downsampled)
|
||||
# GEMINI_AUDIO_QUALITY=default
|
||||
|
||||
# PDF processing mode: inline (<20MB), file-api (>20MB), or auto (selects automatically by size)
|
||||
# GEMINI_PDF_MODE=auto
|
||||
|
||||
# ============================================================================
|
||||
# Retry Configuration (Optional)
|
||||
# ============================================================================
|
||||
# Maximum retry attempts for failed requests
|
||||
# GEMINI_MAX_RETRIES=3
|
||||
|
||||
# Initial retry delay in seconds (uses exponential backoff)
|
||||
# GEMINI_RETRY_DELAY=1
|
||||
|
||||
# ============================================================================
|
||||
# Output Configuration (Optional)
|
||||
# ============================================================================
|
||||
# Default output directory for generated images
|
||||
# OUTPUT_DIR=./output
|
||||
|
||||
# Image output format (png or jpeg)
|
||||
# IMAGE_FORMAT=png
|
||||
|
||||
# Image quality for JPEG (1-100)
|
||||
# IMAGE_QUALITY=95
|
||||
|
||||
# ============================================================================
|
||||
# Context Caching (Optional)
|
||||
# ============================================================================
|
||||
# Enable context caching for repeated queries on same file
|
||||
# GEMINI_ENABLE_CACHING=true
|
||||
|
||||
# Cache TTL in seconds (default: 1800 = 30 minutes)
|
||||
# GEMINI_CACHE_TTL=1800
|
||||
|
||||
# ============================================================================
|
||||
# Logging (Optional)
|
||||
# ============================================================================
|
||||
# Log level: DEBUG, INFO, WARNING, ERROR, CRITICAL
|
||||
# LOG_LEVEL=INFO
|
||||
|
||||
# Log file path
|
||||
# LOG_FILE=./logs/gemini.log
|
||||
|
||||
# ============================================================================
|
||||
# Notes
|
||||
# ============================================================================
|
||||
# 1. Never commit API keys to version control
|
||||
# 2. Add .env to .gitignore
|
||||
# 3. API keys can be restricted in Google Cloud Console
|
||||
# 4. Monitor usage at: https://aistudio.google.com/apikey
|
||||
# 5. Free tier limits: 15 RPM, 1M-4M TPM, 1,500 RPD
|
||||
# 6. Vertex AI requires GCP authentication via gcloud CLI
|
||||
357
skills/ai-multimodal/SKILL.md
Normal file
357
skills/ai-multimodal/SKILL.md
Normal file
@@ -0,0 +1,357 @@
|
||||
---
|
||||
name: ai-multimodal
|
||||
description: Process and generate multimedia content using Google Gemini API. Capabilities include analyze audio files (transcription with timestamps, summarization, speech understanding, music/sound analysis up to 9.5 hours), understand images (captioning, object detection, OCR, visual Q&A, segmentation), process videos (scene detection, Q&A, temporal analysis, YouTube URLs, up to 6 hours), extract from documents (PDF tables, forms, charts, diagrams, multi-page), generate images (text-to-image, editing, composition, refinement). Use when working with audio/video files, analyzing images or screenshots, processing PDF documents, extracting structured data from media, creating images from text prompts, or implementing multimodal AI features. Supports multiple models (Gemini 2.5/2.0) with context windows up to 2M tokens.
|
||||
license: MIT
|
||||
allowed-tools:
|
||||
- Bash
|
||||
- Read
|
||||
- Write
|
||||
- Edit
|
||||
---
|
||||
|
||||
# AI Multimodal Processing Skill
|
||||
|
||||
Process audio, images, videos, documents, and generate images using Google Gemini's multimodal API. Unified interface for all multimedia content understanding and generation.
|
||||
|
||||
## Core Capabilities
|
||||
|
||||
### Audio Processing
|
||||
- Transcription with timestamps (up to 9.5 hours)
|
||||
- Audio summarization and analysis
|
||||
- Speech understanding and speaker identification
|
||||
- Music and environmental sound analysis
|
||||
- Text-to-speech generation with controllable voice
|
||||
|
||||
### Image Understanding
|
||||
- Image captioning and description
|
||||
- Object detection with bounding boxes (2.0+)
|
||||
- Pixel-level segmentation (2.5+)
|
||||
- Visual question answering
|
||||
- Multi-image comparison (up to 3,600 images)
|
||||
- OCR and text extraction
|
||||
|
||||
### Video Analysis
|
||||
- Scene detection and summarization
|
||||
- Video Q&A with temporal understanding
|
||||
- Transcription with visual descriptions
|
||||
- YouTube URL support
|
||||
- Long video processing (up to 6 hours)
|
||||
- Frame-level analysis
|
||||
|
||||
### Document Extraction
|
||||
- Native PDF vision processing (up to 1,000 pages)
|
||||
- Table and form extraction
|
||||
- Chart and diagram analysis
|
||||
- Multi-page document understanding
|
||||
- Structured data output (JSON schema)
|
||||
- Format conversion (PDF to HTML/JSON)
|
||||
|
||||
### Image Generation
|
||||
- Text-to-image generation
|
||||
- Image editing and modification
|
||||
- Multi-image composition (up to 3 images)
|
||||
- Iterative refinement
|
||||
- Multiple aspect ratios (1:1, 16:9, 9:16, 4:3, 3:4)
|
||||
- Controllable style and quality
|
||||
|
||||
## Capability Matrix
|
||||
|
||||
| Task | Audio | Image | Video | Document | Generation |
|
||||
|------|:-----:|:-----:|:-----:|:--------:|:----------:|
|
||||
| Transcription | ✓ | - | ✓ | - | - |
|
||||
| Summarization | ✓ | ✓ | ✓ | ✓ | - |
|
||||
| Q&A | ✓ | ✓ | ✓ | ✓ | - |
|
||||
| Object Detection | - | ✓ | ✓ | - | - |
|
||||
| Text Extraction | - | ✓ | - | ✓ | - |
|
||||
| Structured Output | ✓ | ✓ | ✓ | ✓ | - |
|
||||
| Creation | TTS | - | - | - | ✓ |
|
||||
| Timestamps | ✓ | - | ✓ | - | - |
|
||||
| Segmentation | - | ✓ | - | - | - |
|
||||
|
||||
## Model Selection Guide
|
||||
|
||||
### Gemini 2.5 Series (Recommended)
|
||||
- **gemini-2.5-pro**: Highest quality, all features, 1M-2M context
|
||||
- **gemini-2.5-flash**: Best balance, all features, 1M-2M context
|
||||
- **gemini-2.5-flash-lite**: Lightweight, segmentation support
|
||||
- **gemini-2.5-flash-image**: Image generation only
|
||||
|
||||
### Gemini 2.0 Series
|
||||
- **gemini-2.0-flash**: Fast processing, object detection
|
||||
- **gemini-2.0-flash-lite**: Lightweight option
|
||||
|
||||
### Feature Requirements
|
||||
- **Segmentation**: Requires 2.5+ models
|
||||
- **Object Detection**: Requires 2.0+ models
|
||||
- **Multi-video**: Requires 2.5+ models
|
||||
- **Image Generation**: Requires flash-image model
|
||||
|
||||
### Context Windows
|
||||
- **2M tokens**: ~6 hours video (low-res) or ~2 hours (default)
|
||||
- **1M tokens**: ~3 hours video (low-res) or ~1 hour (default)
|
||||
- **Audio**: 32 tokens/second (1 min = 1,920 tokens)
|
||||
- **PDF**: 258 tokens/page (fixed)
|
||||
- **Image**: 258-1,548 tokens based on size
|
||||
|
||||
## Quick Start
|
||||
|
||||
### Prerequisites
|
||||
|
||||
**API Key Setup**: Supports both Google AI Studio and Vertex AI.
|
||||
|
||||
The skill checks for `GEMINI_API_KEY` in this order:
|
||||
1. Process environment: `export GEMINI_API_KEY="your-key"`
|
||||
2. Project root: `.env`
|
||||
3. `.claude/.env`
|
||||
4. `.claude/skills/.env`
|
||||
5. `.claude/skills/ai-multimodal/.env`
|
||||
|
||||
**Get API key**: https://aistudio.google.com/apikey
|
||||
|
||||
**For Vertex AI**:
|
||||
```bash
|
||||
export GEMINI_USE_VERTEX=true
|
||||
export VERTEX_PROJECT_ID=your-gcp-project-id
|
||||
export VERTEX_LOCATION=us-central1 # Optional
|
||||
```
|
||||
|
||||
**Install SDK**:
|
||||
```bash
|
||||
pip install google-genai python-dotenv pillow
|
||||
```
|
||||
|
||||
### Common Patterns
|
||||
|
||||
**Transcribe Audio**:
|
||||
```bash
|
||||
python scripts/gemini_batch_process.py \
|
||||
--files audio.mp3 \
|
||||
--task transcribe \
|
||||
--model gemini-2.5-flash
|
||||
```
|
||||
|
||||
**Analyze Image**:
|
||||
```bash
|
||||
python scripts/gemini_batch_process.py \
|
||||
--files image.jpg \
|
||||
--task analyze \
|
||||
--prompt "Describe this image" \
|
||||
--output docs/assets/<output-name>.md \
|
||||
--model gemini-2.5-flash
|
||||
```
|
||||
|
||||
**Process Video**:
|
||||
```bash
|
||||
python scripts/gemini_batch_process.py \
|
||||
--files video.mp4 \
|
||||
--task analyze \
|
||||
--prompt "Summarize key points with timestamps" \
|
||||
--output docs/assets/<output-name>.md \
|
||||
--model gemini-2.5-flash
|
||||
```
|
||||
|
||||
**Extract from PDF**:
|
||||
```bash
|
||||
python scripts/gemini_batch_process.py \
|
||||
--files document.pdf \
|
||||
--task extract \
|
||||
--prompt "Extract table data as JSON" \
|
||||
--output docs/assets/<output-name>.md \
|
||||
--format json
|
||||
```
|
||||
|
||||
**Generate Image**:
|
||||
```bash
|
||||
python scripts/gemini_batch_process.py \
|
||||
--task generate \
|
||||
--prompt "A futuristic city at sunset" \
|
||||
--output docs/assets/<output-file-name> \
|
||||
--model gemini-2.5-flash-image \
|
||||
--aspect-ratio 16:9
|
||||
```
|
||||
|
||||
**Optimize Media**:
|
||||
```bash
|
||||
# Prepare large video for processing
|
||||
python scripts/media_optimizer.py \
|
||||
--input large-video.mp4 \
|
||||
--output docs/assets/<output-file-name> \
|
||||
--target-size 100MB
|
||||
|
||||
# Batch optimize multiple files
|
||||
python scripts/media_optimizer.py \
|
||||
--input-dir ./videos \
|
||||
--output-dir docs/assets/optimized \
|
||||
--quality 85
|
||||
```
|
||||
|
||||
**Convert Documents to Markdown**:
|
||||
```bash
|
||||
# Convert DOCX to Markdown
|
||||
python scripts/document_converter.py \
|
||||
--input document.docx \
|
||||
--output docs/assets/document.md
|
||||
|
||||
# Extract pages
|
||||
python scripts/document_converter.py \
|
||||
--input large.pdf \
|
||||
--output docs/assets/chapter1.md \
|
||||
--pages 1-20
|
||||
```
|
||||
|
||||
## Supported Formats
|
||||
|
||||
### Audio
|
||||
- WAV, MP3, AAC, FLAC, OGG Vorbis, AIFF
|
||||
- Max 9.5 hours per request
|
||||
- Auto-downsampled to 16 Kbps mono
|
||||
|
||||
### Images
|
||||
- PNG, JPEG, WEBP, HEIC, HEIF
|
||||
- Max 3,600 images per request
|
||||
- Resolution: ≤384px = 258 tokens, larger = tiled
|
||||
|
||||
### Video
|
||||
- MP4, MPEG, MOV, AVI, FLV, MPG, WebM, WMV, 3GPP
|
||||
- Max 6 hours (low-res) or 2 hours (default)
|
||||
- YouTube URLs supported (public only)
|
||||
|
||||
### Documents
|
||||
- PDF only for vision processing
|
||||
- Max 1,000 pages
|
||||
- TXT, HTML, Markdown supported (text-only)
|
||||
|
||||
### Size Limits
|
||||
- **Inline**: <20MB total request
|
||||
- **File API**: 2GB per file, 20GB project quota
|
||||
- **Retention**: 48 hours auto-delete
|
||||
|
||||
## Reference Navigation
|
||||
|
||||
For detailed implementation guidance, see:
|
||||
|
||||
### Audio Processing
|
||||
- `references/audio-processing.md` - Transcription, analysis, TTS
|
||||
- Timestamp handling and segment analysis
|
||||
- Multi-speaker identification
|
||||
- Non-speech audio analysis
|
||||
- Text-to-speech generation
|
||||
|
||||
### Image Understanding
|
||||
- `references/vision-understanding.md` - Captioning, detection, OCR
|
||||
- Object detection and localization
|
||||
- Pixel-level segmentation
|
||||
- Visual question answering
|
||||
- Multi-image comparison
|
||||
|
||||
### Video Analysis
|
||||
- `references/video-analysis.md` - Scene detection, temporal understanding
|
||||
- YouTube URL processing
|
||||
- Timestamp-based queries
|
||||
- Video clipping and FPS control
|
||||
- Long video optimization
|
||||
|
||||
### Document Extraction
|
||||
- `references/document-extraction.md` - PDF processing, structured output
|
||||
- Table and form extraction
|
||||
- Chart and diagram analysis
|
||||
- JSON schema validation
|
||||
- Multi-page handling
|
||||
|
||||
### Image Generation
|
||||
- `references/image-generation.md` - Text-to-image, editing
|
||||
- Prompt engineering strategies
|
||||
- Image editing and composition
|
||||
- Aspect ratio selection
|
||||
- Safety settings
|
||||
|
||||
## Cost Optimization
|
||||
|
||||
### Token Costs
|
||||
**Input Pricing**:
|
||||
- Gemini 2.5 Flash: $1.00/1M input, $0.10/1M output
|
||||
- Gemini 2.5 Pro: $3.00/1M input, $12.00/1M output
|
||||
- Gemini 1.5 Flash: $0.70/1M input, $0.175/1M output
|
||||
|
||||
**Token Rates**:
|
||||
- Audio: 32 tokens/second (1 min = 1,920 tokens)
|
||||
- Video: ~300 tokens/second (default) or ~100 (low-res)
|
||||
- PDF: 258 tokens/page (fixed)
|
||||
- Image: 258-1,548 tokens based on size
|
||||
|
||||
**TTS Pricing**:
|
||||
- Flash TTS: $10/1M tokens
|
||||
- Pro TTS: $20/1M tokens
|
||||
|
||||
### Best Practices
|
||||
1. Use `gemini-2.5-flash` for most tasks (best price/performance)
|
||||
2. Use File API for files >20MB or repeated queries
|
||||
3. Optimize media before upload (see `media_optimizer.py`)
|
||||
4. Process specific segments instead of full videos
|
||||
5. Use lower FPS for static content
|
||||
6. Implement context caching for repeated queries
|
||||
7. Batch process multiple files in parallel
|
||||
|
||||
## Rate Limits
|
||||
|
||||
**Free Tier**:
|
||||
- 10-15 RPM (requests per minute)
|
||||
- 1M-4M TPM (tokens per minute)
|
||||
- 1,500 RPD (requests per day)
|
||||
|
||||
**YouTube Limits**:
|
||||
- Free tier: 8 hours/day
|
||||
- Paid tier: No length limits
|
||||
- Public videos only
|
||||
|
||||
**Storage Limits**:
|
||||
- 20GB per project
|
||||
- 2GB per file
|
||||
- 48-hour retention
|
||||
|
||||
## Error Handling
|
||||
|
||||
Common errors and solutions:
|
||||
- **400**: Invalid format/size - validate before upload
|
||||
- **401**: Invalid API key - check configuration
|
||||
- **403**: Permission denied - verify API key restrictions
|
||||
- **404**: File not found - ensure file uploaded and active
|
||||
- **429**: Rate limit exceeded - implement exponential backoff
|
||||
- **500**: Server error - retry with backoff
|
||||
|
||||
## Scripts Overview
|
||||
|
||||
All scripts support unified API key detection and error handling:
|
||||
|
||||
**gemini_batch_process.py**: Batch process multiple media files
|
||||
- Supports all modalities (audio, image, video, PDF)
|
||||
- Progress tracking and error recovery
|
||||
- Output formats: JSON, Markdown, CSV
|
||||
- Rate limiting and retry logic
|
||||
- Dry-run mode
|
||||
|
||||
**media_optimizer.py**: Prepare media for Gemini API
|
||||
- Compress videos/audio for size limits
|
||||
- Resize images appropriately
|
||||
- Split long videos into chunks
|
||||
- Format conversion
|
||||
- Quality vs size optimization
|
||||
|
||||
**document_converter.py**: Convert documents to PDF
|
||||
- Convert DOCX, XLSX, PPTX to PDF
|
||||
- Extract page ranges
|
||||
- Optimize PDFs for Gemini
|
||||
- Extract images from PDFs
|
||||
- Batch conversion support
|
||||
|
||||
Run any script with `--help` for detailed usage.
|
||||
|
||||
## Resources
|
||||
|
||||
- [Audio API Docs](https://ai.google.dev/gemini-api/docs/audio)
|
||||
- [Image API Docs](https://ai.google.dev/gemini-api/docs/image-understanding)
|
||||
- [Video API Docs](https://ai.google.dev/gemini-api/docs/video-understanding)
|
||||
- [Document API Docs](https://ai.google.dev/gemini-api/docs/document-processing)
|
||||
- [Image Gen Docs](https://ai.google.dev/gemini-api/docs/image-generation)
|
||||
- [Get API Key](https://aistudio.google.com/apikey)
|
||||
- [Pricing](https://ai.google.dev/pricing)
|
||||
373
skills/ai-multimodal/references/audio-processing.md
Normal file
373
skills/ai-multimodal/references/audio-processing.md
Normal file
@@ -0,0 +1,373 @@
|
||||
# Audio Processing Reference
|
||||
|
||||
Comprehensive guide for audio analysis and speech generation using Gemini API.
|
||||
|
||||
## Audio Understanding
|
||||
|
||||
### Supported Formats
|
||||
|
||||
| Format | MIME Type | Best Use |
|
||||
|--------|-----------|----------|
|
||||
| WAV | `audio/wav` | Uncompressed, highest quality |
|
||||
| MP3 | `audio/mp3` | Compressed, widely compatible |
|
||||
| AAC | `audio/aac` | Compressed, good quality |
|
||||
| FLAC | `audio/flac` | Lossless compression |
|
||||
| OGG Vorbis | `audio/ogg` | Open format |
|
||||
| AIFF | `audio/aiff` | Apple format |
|
||||
|
||||
### Specifications
|
||||
|
||||
- **Maximum length**: 9.5 hours per request
|
||||
- **Multiple files**: Unlimited count, combined max 9.5 hours
|
||||
- **Token rate**: 32 tokens/second (1 minute = 1,920 tokens)
|
||||
- **Processing**: Auto-downsampled to 16 Kbps mono
|
||||
- **File size limits**:
|
||||
- Inline: 20 MB max total request
|
||||
- File API: 2 GB per file, 20 GB project quota
|
||||
- Retention: 48 hours auto-delete
|
||||
|
||||
## Transcription
|
||||
|
||||
### Basic Transcription
|
||||
|
||||
```python
|
||||
from google import genai
|
||||
import os
|
||||
|
||||
client = genai.Client(api_key=os.getenv('GEMINI_API_KEY'))
|
||||
|
||||
# Upload audio
|
||||
myfile = client.files.upload(file='meeting.mp3')
|
||||
|
||||
# Transcribe
|
||||
response = client.models.generate_content(
|
||||
model='gemini-2.5-flash',
|
||||
contents=['Generate a transcript of the speech.', myfile]
|
||||
)
|
||||
print(response.text)
|
||||
```
|
||||
|
||||
### With Timestamps
|
||||
|
||||
```python
|
||||
response = client.models.generate_content(
|
||||
model='gemini-2.5-flash',
|
||||
contents=['Generate transcript with timestamps in MM:SS format.', myfile]
|
||||
)
|
||||
```
|
||||
|
||||
### Multi-Speaker Identification
|
||||
|
||||
```python
|
||||
response = client.models.generate_content(
|
||||
model='gemini-2.5-flash',
|
||||
contents=['Transcribe with speaker labels. Format: [Speaker 1], [Speaker 2], etc.', myfile]
|
||||
)
|
||||
```
|
||||
|
||||
### Segment-Specific Transcription
|
||||
|
||||
```python
|
||||
response = client.models.generate_content(
|
||||
model='gemini-2.5-flash',
|
||||
contents=['Transcribe only the segment from 02:30 to 05:15.', myfile]
|
||||
)
|
||||
```
|
||||
|
||||
## Audio Analysis
|
||||
|
||||
### Summarization
|
||||
|
||||
```python
|
||||
response = client.models.generate_content(
|
||||
model='gemini-2.5-flash',
|
||||
contents=['Summarize key points in 5 bullets with timestamps.', myfile]
|
||||
)
|
||||
```
|
||||
|
||||
### Non-Speech Audio Analysis
|
||||
|
||||
```python
|
||||
# Music analysis
|
||||
response = client.models.generate_content(
|
||||
model='gemini-2.5-flash',
|
||||
contents=['Identify the musical instruments and genre.', myfile]
|
||||
)
|
||||
|
||||
# Environmental sounds
|
||||
response = client.models.generate_content(
|
||||
model='gemini-2.5-flash',
|
||||
contents=['Identify all sounds: voices, music, ambient noise.', myfile]
|
||||
)
|
||||
|
||||
# Birdsong identification
|
||||
response = client.models.generate_content(
|
||||
model='gemini-2.5-flash',
|
||||
contents=['Identify bird species based on their calls.', myfile]
|
||||
)
|
||||
```
|
||||
|
||||
### Timestamp-Based Analysis
|
||||
|
||||
```python
|
||||
response = client.models.generate_content(
|
||||
model='gemini-2.5-flash',
|
||||
contents=['What is discussed from 10:30 to 15:45? Provide key points.', myfile]
|
||||
)
|
||||
```
|
||||
|
||||
## Input Methods
|
||||
|
||||
### File Upload (>20MB or Reuse)
|
||||
|
||||
```python
|
||||
# Upload once, use multiple times
|
||||
myfile = client.files.upload(file='large-audio.mp3')
|
||||
|
||||
# First query
|
||||
response1 = client.models.generate_content(
|
||||
model='gemini-2.5-flash',
|
||||
contents=['Transcribe this', myfile]
|
||||
)
|
||||
|
||||
# Second query (reuses same file)
|
||||
response2 = client.models.generate_content(
|
||||
model='gemini-2.5-flash',
|
||||
contents=['Summarize this', myfile]
|
||||
)
|
||||
```
|
||||
|
||||
### Inline Data (<20MB)
|
||||
|
||||
```python
|
||||
from google.genai import types
|
||||
|
||||
with open('small-audio.mp3', 'rb') as f:
|
||||
audio_bytes = f.read()
|
||||
|
||||
response = client.models.generate_content(
|
||||
model='gemini-2.5-flash',
|
||||
contents=[
|
||||
'Describe this audio',
|
||||
types.Part.from_bytes(data=audio_bytes, mime_type='audio/mp3')
|
||||
]
|
||||
)
|
||||
```
|
||||
|
||||
## Speech Generation (TTS)
|
||||
|
||||
### Available Models
|
||||
|
||||
| Model | Quality | Speed | Cost/1M tokens |
|
||||
|-------|---------|-------|----------------|
|
||||
| `gemini-2.5-flash-native-audio-preview-09-2025` | High | Fast | $10 |
|
||||
| `gemini-2.5-pro` TTS mode | Premium | Slower | $20 |
|
||||
|
||||
### Basic TTS
|
||||
|
||||
```python
|
||||
response = client.models.generate_content(
|
||||
model='gemini-2.5-flash-native-audio-preview-09-2025',
|
||||
contents='Generate audio: Welcome to today\'s episode.'
|
||||
)
|
||||
|
||||
# Save audio
|
||||
with open('output.wav', 'wb') as f:
|
||||
f.write(response.audio_data)
|
||||
```
|
||||
|
||||
### Controllable Voice Style
|
||||
|
||||
```python
|
||||
# Professional tone
|
||||
response = client.models.generate_content(
|
||||
model='gemini-2.5-flash-native-audio-preview-09-2025',
|
||||
contents='Generate audio in a professional, clear tone: Welcome to our quarterly earnings call.'
|
||||
)
|
||||
|
||||
# Casual and friendly
|
||||
response = client.models.generate_content(
|
||||
model='gemini-2.5-flash-native-audio-preview-09-2025',
|
||||
contents='Generate audio in a friendly, conversational tone: Hey there! Let\'s dive into today\'s topic.'
|
||||
)
|
||||
|
||||
# Narrative style
|
||||
response = client.models.generate_content(
|
||||
model='gemini-2.5-flash-native-audio-preview-09-2025',
|
||||
contents='Generate audio in a narrative, storytelling tone: Once upon a time, in a land far away...'
|
||||
)
|
||||
```
|
||||
|
||||
### Voice Control Parameters
|
||||
|
||||
- **Style**: Professional, casual, narrative, conversational
|
||||
- **Pace**: Slow, normal, fast
|
||||
- **Tone**: Friendly, serious, enthusiastic
|
||||
- **Accent**: Natural language control (e.g., "British accent", "Southern drawl")
|
||||
|
||||
## Best Practices
|
||||
|
||||
### File Management
|
||||
|
||||
1. Use File API for files >20MB
|
||||
2. Use File API for repeated queries (saves tokens)
|
||||
3. Files auto-delete after 48 hours
|
||||
4. Clean up manually when done:
|
||||
```python
|
||||
client.files.delete(name=myfile.name)
|
||||
```
|
||||
|
||||
### Prompt Engineering
|
||||
|
||||
**Effective prompts**:
|
||||
- "Transcribe from 02:30 to 03:29 in MM:SS format"
|
||||
- "Identify speakers and extract dialogue with timestamps"
|
||||
- "Summarize key points with relevant timestamps"
|
||||
- "Transcribe and analyze sentiment for each speaker"
|
||||
|
||||
**Context improves accuracy**:
|
||||
- "This is a medical interview - use appropriate terminology"
|
||||
- "Transcribe this legal deposition with precise terminology"
|
||||
- "This is a technical podcast about machine learning"
|
||||
|
||||
**Combined tasks**:
|
||||
- "Transcribe and summarize in bullet points"
|
||||
- "Extract key quotes with timestamps and speaker labels"
|
||||
- "Transcribe and identify action items with timestamps"
|
||||
|
||||
### Cost Optimization
|
||||
|
||||
**Token calculation**:
|
||||
- 1 minute audio = 1,920 tokens
|
||||
- 1 hour audio = 115,200 tokens
|
||||
- 9.5 hours = 1,094,400 tokens
|
||||
|
||||
**Model selection**:
|
||||
- Use `gemini-2.5-flash` ($1/1M tokens) for most tasks
|
||||
- Upgrade to `gemini-2.5-pro` ($3/1M tokens) for complex analysis
|
||||
- For high-volume: `gemini-1.5-flash` ($0.70/1M tokens)
|
||||
|
||||
**Reduce costs**:
|
||||
- Process only relevant segments using timestamps
|
||||
- Use lower-quality audio when possible
|
||||
- Batch multiple short files in one request
|
||||
- Cache context for repeated queries
|
||||
|
||||
### Error Handling
|
||||
|
||||
```python
|
||||
import time
|
||||
|
||||
def transcribe_with_retry(file_path, max_retries=3):
|
||||
"""Transcribe audio with exponential backoff retry"""
|
||||
for attempt in range(max_retries):
|
||||
try:
|
||||
myfile = client.files.upload(file=file_path)
|
||||
response = client.models.generate_content(
|
||||
model='gemini-2.5-flash',
|
||||
contents=['Transcribe with timestamps', myfile]
|
||||
)
|
||||
return response.text
|
||||
except Exception as e:
|
||||
if attempt == max_retries - 1:
|
||||
raise
|
||||
wait_time = 2 ** attempt
|
||||
print(f"Retry {attempt + 1} after {wait_time}s")
|
||||
time.sleep(wait_time)
|
||||
```
|
||||
|
||||
## Common Use Cases
|
||||
|
||||
### 1. Meeting Transcription
|
||||
|
||||
```python
|
||||
response = client.models.generate_content(
|
||||
model='gemini-2.5-flash',
|
||||
contents=[
|
||||
'''Transcribe this meeting with:
|
||||
1. Speaker labels
|
||||
2. Timestamps for topic changes
|
||||
3. Action items highlighted
|
||||
''',
|
||||
myfile
|
||||
]
|
||||
)
|
||||
```
|
||||
|
||||
### 2. Podcast Summary
|
||||
|
||||
```python
|
||||
response = client.models.generate_content(
|
||||
model='gemini-2.5-flash',
|
||||
contents=[
|
||||
'''Create podcast summary with:
|
||||
1. Main topics with timestamps
|
||||
2. Key quotes from each speaker
|
||||
3. Recommended episode highlights
|
||||
''',
|
||||
myfile
|
||||
]
|
||||
)
|
||||
```
|
||||
|
||||
### 3. Interview Analysis
|
||||
|
||||
```python
|
||||
response = client.models.generate_content(
|
||||
model='gemini-2.5-flash',
|
||||
contents=[
|
||||
'''Analyze interview:
|
||||
1. Questions asked with timestamps
|
||||
2. Key responses from interviewee
|
||||
3. Overall sentiment and tone
|
||||
''',
|
||||
myfile
|
||||
]
|
||||
)
|
||||
```
|
||||
|
||||
### 4. Content Verification
|
||||
|
||||
```python
|
||||
response = client.models.generate_content(
|
||||
model='gemini-2.5-flash',
|
||||
contents=[
|
||||
'''Verify audio content:
|
||||
1. Check for specific keywords or phrases
|
||||
2. Identify any compliance issues
|
||||
3. Note any concerning statements with timestamps
|
||||
''',
|
||||
myfile
|
||||
]
|
||||
)
|
||||
```
|
||||
|
||||
### 5. Multilingual Transcription
|
||||
|
||||
```python
|
||||
# Gemini auto-detects language
|
||||
response = client.models.generate_content(
|
||||
model='gemini-2.5-flash',
|
||||
contents=['Transcribe this audio and translate to English if needed.', myfile]
|
||||
)
|
||||
```
|
||||
|
||||
## Token Costs
|
||||
|
||||
**Audio Input** (32 tokens/second):
|
||||
- 1 minute = 1,920 tokens
|
||||
- 10 minutes = 19,200 tokens
|
||||
- 1 hour = 115,200 tokens
|
||||
- 9.5 hours = 1,094,400 tokens
|
||||
|
||||
**Example costs** (Gemini 2.5 Flash at $1/1M):
|
||||
- 1 hour audio: 115,200 tokens = $0.12
|
||||
- Full day podcast (8 hours): 921,600 tokens = $0.92
|
||||
|
||||
## Limitations
|
||||
|
||||
- Maximum 9.5 hours per request
|
||||
- Auto-downsampled to 16 Kbps mono (quality loss)
|
||||
- Files expire after 48 hours
|
||||
- No real-time streaming support
|
||||
- Non-speech audio less accurate than speech
|
||||
558
skills/ai-multimodal/references/image-generation.md
Normal file
558
skills/ai-multimodal/references/image-generation.md
Normal file
@@ -0,0 +1,558 @@
|
||||
# Image Generation Reference
|
||||
|
||||
Comprehensive guide for image creation, editing, and composition using Gemini API.
|
||||
|
||||
## Core Capabilities
|
||||
|
||||
- **Text-to-Image**: Generate images from text prompts
|
||||
- **Image Editing**: Modify existing images with text instructions
|
||||
- **Multi-Image Composition**: Combine up to 3 images
|
||||
- **Iterative Refinement**: Refine images conversationally
|
||||
- **Aspect Ratios**: Multiple formats (1:1, 16:9, 9:16, 4:3, 3:4)
|
||||
- **Style Control**: Control artistic style and quality
|
||||
- **Text in Images**: Limited text rendering (max 25 chars)
|
||||
|
||||
## Model
|
||||
|
||||
**gemini-2.5-flash-image** - Specialized for image generation
|
||||
- Input tokens: 65,536
|
||||
- Output tokens: 32,768
|
||||
- Knowledge cutoff: June 2025
|
||||
- Supports: Text and image inputs, image outputs
|
||||
|
||||
## Quick Start
|
||||
|
||||
### Basic Generation
|
||||
|
||||
```python
|
||||
from google import genai
|
||||
from google.genai import types
|
||||
import os
|
||||
|
||||
client = genai.Client(api_key=os.getenv('GEMINI_API_KEY'))
|
||||
|
||||
response = client.models.generate_content(
|
||||
model='gemini-2.5-flash-image',
|
||||
contents='A serene mountain landscape at sunset with snow-capped peaks',
|
||||
config=types.GenerateContentConfig(
|
||||
response_modalities=['image'],
|
||||
aspect_ratio='16:9'
|
||||
)
|
||||
)
|
||||
|
||||
# Save image
|
||||
for i, part in enumerate(response.candidates[0].content.parts):
|
||||
if part.inline_data:
|
||||
with open(f'output-{i}.png', 'wb') as f:
|
||||
f.write(part.inline_data.data)
|
||||
```
|
||||
|
||||
## Aspect Ratios
|
||||
|
||||
| Ratio | Resolution | Use Case | Token Cost |
|
||||
|-------|-----------|----------|------------|
|
||||
| 1:1 | 1024×1024 | Social media, avatars | 1290 |
|
||||
| 16:9 | 1344×768 | Landscapes, banners | 1290 |
|
||||
| 9:16 | 768×1344 | Mobile, portraits | 1290 |
|
||||
| 4:3 | 1152×896 | Traditional media | 1290 |
|
||||
| 3:4 | 896×1152 | Vertical posters | 1290 |
|
||||
|
||||
All ratios cost the same: 1,290 tokens per image.
|
||||
|
||||
## Response Modalities
|
||||
|
||||
### Image Only
|
||||
|
||||
```python
|
||||
config = types.GenerateContentConfig(
|
||||
response_modalities=['image'],
|
||||
aspect_ratio='1:1'
|
||||
)
|
||||
```
|
||||
|
||||
### Text Only (No Image)
|
||||
|
||||
```python
|
||||
config = types.GenerateContentConfig(
|
||||
response_modalities=['text']
|
||||
)
|
||||
# Returns text description instead of generating image
|
||||
```
|
||||
|
||||
### Both Image and Text
|
||||
|
||||
```python
|
||||
config = types.GenerateContentConfig(
|
||||
response_modalities=['image', 'text'],
|
||||
aspect_ratio='16:9'
|
||||
)
|
||||
# Returns both generated image and description
|
||||
```
|
||||
|
||||
## Image Editing
|
||||
|
||||
### Modify Existing Image
|
||||
|
||||
```python
|
||||
import PIL.Image
|
||||
|
||||
# Load original
|
||||
img = PIL.Image.open('original.png')
|
||||
|
||||
# Edit with instructions
|
||||
response = client.models.generate_content(
|
||||
model='gemini-2.5-flash-image',
|
||||
contents=[
|
||||
'Add a red balloon floating in the sky',
|
||||
img
|
||||
],
|
||||
    config=types.GenerateContentConfig(
        response_modalities=['Image'],
        image_config=types.ImageConfig(
            aspect_ratio='16:9'
        )
    )
|
||||
)
|
||||
```
|
||||
|
||||
### Style Transfer
|
||||
|
||||
```python
|
||||
img = PIL.Image.open('photo.jpg')
|
||||
|
||||
response = client.models.generate_content(
|
||||
model='gemini-2.5-flash-image',
|
||||
contents=[
|
||||
'Transform this into an oil painting style',
|
||||
img
|
||||
]
|
||||
)
|
||||
```
|
||||
|
||||
### Object Addition/Removal
|
||||
|
||||
```python
|
||||
# Add object
|
||||
response = client.models.generate_content(
|
||||
model='gemini-2.5-flash-image',
|
||||
contents=[
|
||||
'Add a vintage car parked on the street',
|
||||
img
|
||||
]
|
||||
)
|
||||
|
||||
# Remove object
|
||||
response = client.models.generate_content(
|
||||
model='gemini-2.5-flash-image',
|
||||
contents=[
|
||||
'Remove the person on the left side',
|
||||
img
|
||||
]
|
||||
)
|
||||
```
|
||||
|
||||
## Multi-Image Composition
|
||||
|
||||
### Combine Multiple Images
|
||||
|
||||
```python
|
||||
img1 = PIL.Image.open('background.png')
|
||||
img2 = PIL.Image.open('foreground.png')
|
||||
img3 = PIL.Image.open('overlay.png')
|
||||
|
||||
response = client.models.generate_content(
|
||||
model='gemini-2.5-flash-image',
|
||||
contents=[
|
||||
'Combine these images into a cohesive scene',
|
||||
img1,
|
||||
img2,
|
||||
img3
|
||||
],
|
||||
    config=types.GenerateContentConfig(
        response_modalities=['Image'],
        image_config=types.ImageConfig(
            aspect_ratio='16:9'
        )
    )
|
||||
)
|
||||
```
|
||||
|
||||
**Note**: Recommended maximum 3 input images for best results.
|
||||
|
||||
## Prompt Engineering
|
||||
|
||||
### Effective Prompt Structure
|
||||
|
||||
**Three key elements**:
|
||||
1. **Subject**: What to generate
|
||||
2. **Context**: Environmental setting
|
||||
3. **Style**: Artistic treatment
|
||||
|
||||
**Example**: "A robot [subject] in a futuristic city [context], cyberpunk style with neon lighting [style]"
|
||||
|
||||
### Quality Modifiers
|
||||
|
||||
**Technical terms**:
|
||||
- "4K", "8K", "high resolution"
|
||||
- "HDR", "high dynamic range"
|
||||
- "professional photography"
|
||||
- "studio lighting"
|
||||
- "ultra detailed"
|
||||
|
||||
**Camera settings**:
|
||||
- "35mm lens", "50mm lens"
|
||||
- "shallow depth of field"
|
||||
- "wide angle shot"
|
||||
- "macro photography"
|
||||
- "golden hour lighting"
|
||||
|
||||
### Style Keywords
|
||||
|
||||
**Art styles**:
|
||||
- "oil painting", "watercolor", "sketch"
|
||||
- "digital art", "concept art"
|
||||
- "photorealistic", "hyperrealistic"
|
||||
- "minimalist", "abstract"
|
||||
- "cyberpunk", "steampunk", "fantasy"
|
||||
|
||||
**Mood and atmosphere**:
|
||||
- "dramatic lighting", "soft lighting"
|
||||
- "moody", "bright and cheerful"
|
||||
- "mysterious", "whimsical"
|
||||
- "dark and gritty", "pastel colors"
|
||||
|
||||
### Subject Description
|
||||
|
||||
**Be specific**:
|
||||
- ❌ "A cat"
|
||||
- ✅ "A fluffy orange tabby cat with green eyes"
|
||||
|
||||
**Add context**:
|
||||
- ❌ "A building"
|
||||
- ✅ "A modern glass skyscraper reflecting sunset clouds"
|
||||
|
||||
**Include details**:
|
||||
- ❌ "A person"
|
||||
- ✅ "A young woman in a red dress holding an umbrella"
|
||||
|
||||
### Composition and Framing
|
||||
|
||||
**Camera angles**:
|
||||
- "bird's eye view", "aerial shot"
|
||||
- "low angle", "high angle"
|
||||
- "close-up", "wide shot"
|
||||
- "centered composition"
|
||||
- "rule of thirds"
|
||||
|
||||
**Perspective**:
|
||||
- "first person view"
|
||||
- "third person perspective"
|
||||
- "isometric view"
|
||||
- "forced perspective"
|
||||
|
||||
### Text in Images
|
||||
|
||||
**Limitations**:
|
||||
- Maximum 25 characters total
|
||||
- Up to 3 distinct text phrases
|
||||
- Works best with simple text
|
||||
|
||||
**Best practices**:
|
||||
```python
|
||||
response = client.models.generate_content(
|
||||
model='gemini-2.5-flash-image',
|
||||
contents='A vintage poster with bold text "EXPLORE" at the top, mountain landscape, retro 1950s style'
|
||||
)
|
||||
```
|
||||
|
||||
**Font control**:
|
||||
- "bold sans-serif title"
|
||||
- "handwritten script"
|
||||
- "vintage letterpress"
|
||||
- "modern minimalist font"
|
||||
|
||||
## Advanced Techniques
|
||||
|
||||
### Iterative Refinement
|
||||
|
||||
```python
|
||||
# Initial generation
|
||||
response1 = client.models.generate_content(
|
||||
model='gemini-2.5-flash-image',
|
||||
contents='A futuristic city skyline'
|
||||
)
|
||||
|
||||
# Save first version
|
||||
with open('v1.png', 'wb') as f:
|
||||
f.write(response1.candidates[0].content.parts[0].inline_data.data)
|
||||
|
||||
# Refine
|
||||
img = PIL.Image.open('v1.png')
|
||||
response2 = client.models.generate_content(
|
||||
model='gemini-2.5-flash-image',
|
||||
contents=[
|
||||
'Add flying vehicles and neon signs',
|
||||
img
|
||||
]
|
||||
)
|
||||
```
|
||||
|
||||
### Negative Prompts (Indirect)
|
||||
|
||||
```python
|
||||
# Instead of "no blur", be specific about what you want
|
||||
response = client.models.generate_content(
|
||||
model='gemini-2.5-flash-image',
|
||||
contents='A crystal clear, sharp photograph of a diamond ring with perfect focus and high detail'
|
||||
)
|
||||
```
|
||||
|
||||
### Consistent Style Across Images
|
||||
|
||||
```python
|
||||
base_prompt = "Digital art, vibrant colors, cel-shaded style, clean lines"
|
||||
|
||||
prompts = [
|
||||
f"{base_prompt}, a warrior character",
|
||||
f"{base_prompt}, a mage character",
|
||||
f"{base_prompt}, a rogue character"
|
||||
]
|
||||
|
||||
for i, prompt in enumerate(prompts):
|
||||
response = client.models.generate_content(
|
||||
model='gemini-2.5-flash-image',
|
||||
contents=prompt
|
||||
)
|
||||
# Save each character
|
||||
```
|
||||
|
||||
## Safety Settings
|
||||
|
||||
### Configure Safety Filters
|
||||
|
||||
```python
|
||||
config = types.GenerateContentConfig(
|
||||
    response_modalities=['Image'],
|
||||
safety_settings=[
|
||||
types.SafetySetting(
|
||||
category=types.HarmCategory.HARM_CATEGORY_HATE_SPEECH,
|
||||
threshold=types.HarmBlockThreshold.BLOCK_MEDIUM_AND_ABOVE
|
||||
),
|
||||
types.SafetySetting(
|
||||
category=types.HarmCategory.HARM_CATEGORY_SEXUALLY_EXPLICIT,
|
||||
threshold=types.HarmBlockThreshold.BLOCK_MEDIUM_AND_ABOVE
|
||||
)
|
||||
]
|
||||
)
|
||||
```
|
||||
|
||||
### Available Categories
|
||||
|
||||
- `HARM_CATEGORY_HATE_SPEECH`
|
||||
- `HARM_CATEGORY_DANGEROUS_CONTENT`
|
||||
- `HARM_CATEGORY_HARASSMENT`
|
||||
- `HARM_CATEGORY_SEXUALLY_EXPLICIT`
|
||||
|
||||
### Thresholds
|
||||
|
||||
- `BLOCK_NONE`: No blocking
|
||||
- `BLOCK_LOW_AND_ABOVE`: Block low probability and above
|
||||
- `BLOCK_MEDIUM_AND_ABOVE`: Block medium and above (default)
|
||||
- `BLOCK_ONLY_HIGH`: Block only high probability
|
||||
|
||||
## Common Use Cases
|
||||
|
||||
### 1. Marketing Assets
|
||||
|
||||
```python
|
||||
response = client.models.generate_content(
|
||||
model='gemini-2.5-flash-image',
|
||||
contents='''Professional product photography:
|
||||
- Sleek smartphone on minimalist white surface
|
||||
- Dramatic side lighting creating subtle shadows
|
||||
- Shallow depth of field, crisp focus
|
||||
- Clean, modern aesthetic
|
||||
- 4K quality
|
||||
''',
|
||||
    config=types.GenerateContentConfig(
        response_modalities=['Image'],
        image_config=types.ImageConfig(
            aspect_ratio='4:3'
        )
    )
|
||||
)
|
||||
```
|
||||
|
||||
### 2. Concept Art
|
||||
|
||||
```python
|
||||
response = client.models.generate_content(
|
||||
model='gemini-2.5-flash-image',
|
||||
contents='''Fantasy concept art:
|
||||
- Ancient floating islands connected by chains
|
||||
- Waterfalls cascading into clouds below
|
||||
- Magical crystals glowing on the islands
|
||||
- Epic scale, dramatic lighting
|
||||
- Detailed digital painting style
|
||||
''',
|
||||
    config=types.GenerateContentConfig(
        response_modalities=['Image'],
        image_config=types.ImageConfig(
            aspect_ratio='16:9'
        )
    )
|
||||
)
|
||||
```
|
||||
|
||||
### 3. Social Media Graphics
|
||||
|
||||
```python
|
||||
response = client.models.generate_content(
|
||||
model='gemini-2.5-flash-image',
|
||||
contents='''Instagram post design:
|
||||
- Pastel gradient background (pink to blue)
|
||||
- Motivational quote layout
|
||||
- Modern minimalist style
|
||||
- Clean typography
|
||||
- Mobile-friendly composition
|
||||
''',
|
||||
    config=types.GenerateContentConfig(
        response_modalities=['Image'],
        image_config=types.ImageConfig(
            aspect_ratio='1:1'
        )
    )
|
||||
)
|
||||
```
|
||||
|
||||
### 4. Illustration
|
||||
|
||||
```python
|
||||
response = client.models.generate_content(
|
||||
model='gemini-2.5-flash-image',
|
||||
contents='''Children's book illustration:
|
||||
- Friendly cartoon dragon reading a book
|
||||
- Bright, cheerful colors
|
||||
- Soft, rounded shapes
|
||||
- Whimsical forest background
|
||||
- Warm, inviting atmosphere
|
||||
''',
|
||||
    config=types.GenerateContentConfig(
        response_modalities=['Image'],
        image_config=types.ImageConfig(
            aspect_ratio='4:3'
        )
    )
|
||||
)
|
||||
```
|
||||
|
||||
### 5. UI/UX Mockups
|
||||
|
||||
```python
|
||||
response = client.models.generate_content(
|
||||
model='gemini-2.5-flash-image',
|
||||
contents='''Modern mobile app interface:
|
||||
- Clean dashboard design
|
||||
- Card-based layout
|
||||
- Soft shadows and gradients
|
||||
- Contemporary color scheme (blue and white)
|
||||
- Professional fintech aesthetic
|
||||
''',
|
||||
    config=types.GenerateContentConfig(
        response_modalities=['Image'],
        image_config=types.ImageConfig(
            aspect_ratio='9:16'
        )
    )
|
||||
)
|
||||
```
|
||||
|
||||
## Best Practices
|
||||
|
||||
### Prompt Quality
|
||||
|
||||
1. **Be specific**: More detail = better results
|
||||
2. **Order matters**: Most important elements first
|
||||
3. **Use examples**: Reference known styles or artists
|
||||
4. **Avoid contradictions**: Don't ask for opposing styles
|
||||
5. **Test and iterate**: Refine prompts based on results
|
||||
|
||||
### File Management
|
||||
|
||||
```python
|
||||
# Save with descriptive names
|
||||
timestamp = int(time.time())
|
||||
filename = f'generated_{timestamp}_{aspect_ratio}.png'
|
||||
|
||||
with open(filename, 'wb') as f:
|
||||
f.write(image_data)
|
||||
```
|
||||
|
||||
### Cost Optimization
|
||||
|
||||
**Token costs**:
|
||||
- 1 image: 1,290 tokens = $0.00129 (Flash Image at $1/1M)
|
||||
- 10 images: 12,900 tokens = $0.0129
|
||||
- 100 images: 129,000 tokens = $0.129
|
||||
|
||||
**Strategies**:
|
||||
- Generate fewer iterations
|
||||
- Use text modality first to validate concept
|
||||
- Batch similar requests
|
||||
- Cache prompts for consistent style
|
||||
|
||||
## Error Handling
|
||||
|
||||
### Safety Filter Blocking
|
||||
|
||||
```python
|
||||
try:
|
||||
response = client.models.generate_content(
|
||||
model='gemini-2.5-flash-image',
|
||||
contents=prompt
|
||||
)
|
||||
except Exception as e:
|
||||
# Check block reason
|
||||
if hasattr(e, 'prompt_feedback'):
|
||||
print(f"Blocked: {e.prompt_feedback.block_reason}")
|
||||
# Modify prompt and retry
|
||||
```
|
||||
|
||||
### Token Limit Exceeded
|
||||
|
||||
```python
|
||||
# Keep prompts concise
|
||||
if len(prompt) > 1000:
|
||||
# Truncate or simplify
|
||||
prompt = prompt[:1000]
|
||||
```
|
||||
|
||||
## Limitations
|
||||
|
||||
- Maximum 3 input images for composition
|
||||
- Text rendering limited (25 chars max)
|
||||
- No video or animation generation
|
||||
- Regional restrictions (child images in EEA, CH, UK)
|
||||
- Optimal language support: English, Spanish (Mexico), Japanese, Mandarin, Hindi
|
||||
- No real-time generation
|
||||
- Cannot perfectly replicate specific people or copyrighted characters
|
||||
|
||||
## Troubleshooting
|
||||
|
||||
### aspect_ratio Parameter Error
|
||||
|
||||
**Error**: `Extra inputs are not permitted [type=extra_forbidden, input_value='1:1', input_type=str]`
|
||||
|
||||
**Cause**: The `aspect_ratio` parameter must be nested inside an `image_config` object, not passed directly to `GenerateContentConfig`.
|
||||
|
||||
**Incorrect Usage**:
|
||||
```python
|
||||
# ❌ This will fail
|
||||
config = types.GenerateContentConfig(
|
||||
response_modalities=['image'],
|
||||
aspect_ratio='16:9' # Wrong - not a direct parameter
|
||||
)
|
||||
```
|
||||
|
||||
**Correct Usage**:
|
||||
```python
|
||||
# ✅ Correct implementation
|
||||
config = types.GenerateContentConfig(
|
||||
response_modalities=['Image'], # Note: Capital 'I'
|
||||
image_config=types.ImageConfig(
|
||||
aspect_ratio='16:9'
|
||||
)
|
||||
)
|
||||
```
|
||||
|
||||
### Response Modality Case Sensitivity
|
||||
|
||||
The `response_modalities` parameter expects capital case values:
|
||||
- ✅ Correct: `['Image']`, `['Text']`, `['Image', 'Text']`
|
||||
- ❌ Wrong: `['image']`, `['text']`
|
||||
502
skills/ai-multimodal/references/video-analysis.md
Normal file
502
skills/ai-multimodal/references/video-analysis.md
Normal file
@@ -0,0 +1,502 @@
|
||||
# Video Analysis Reference
|
||||
|
||||
Comprehensive guide for video understanding, temporal analysis, and YouTube processing using Gemini API.
|
||||
|
||||
## Core Capabilities
|
||||
|
||||
- **Video Summarization**: Create concise summaries
|
||||
- **Question Answering**: Answer specific questions about content
|
||||
- **Transcription**: Audio transcription with visual descriptions
|
||||
- **Timestamp References**: Query specific moments (MM:SS format)
|
||||
- **Video Clipping**: Process specific segments
|
||||
- **Scene Detection**: Identify scene changes and transitions
|
||||
- **Multiple Videos**: Compare up to 10 videos (2.5+)
|
||||
- **YouTube Support**: Analyze YouTube videos directly
|
||||
- **Custom Frame Rate**: Adjust FPS sampling
|
||||
|
||||
## Supported Formats
|
||||
|
||||
- MP4, MPEG, MOV, AVI, FLV, MPG, WebM, WMV, 3GPP
|
||||
|
||||
## Model Selection
|
||||
|
||||
### Gemini 2.5 Series
|
||||
- **gemini-2.5-pro**: Best quality, 1M-2M context
|
||||
- **gemini-2.5-flash**: Balanced, 1M-2M context
|
||||
- **gemini-2.5-flash-preview-09-2025**: Preview features, 1M context
|
||||
|
||||
### Gemini 2.0 Series
|
||||
- **gemini-2.0-flash**: Fast processing
|
||||
- **gemini-2.0-flash-lite**: Lightweight option
|
||||
|
||||
### Context Windows
|
||||
- **2M token models**: ~2 hours (default) or ~6 hours (low-res)
|
||||
- **1M token models**: ~1 hour (default) or ~3 hours (low-res)
|
||||
|
||||
## Basic Video Analysis
|
||||
|
||||
### Local Video
|
||||
|
||||
```python
|
||||
from google import genai
|
||||
import os
|
||||
|
||||
client = genai.Client(api_key=os.getenv('GEMINI_API_KEY'))
|
||||
|
||||
# Upload video (File API for >20MB)
|
||||
myfile = client.files.upload(file='video.mp4')
|
||||
|
||||
# Wait for processing
|
||||
import time
|
||||
while myfile.state.name == 'PROCESSING':
|
||||
time.sleep(1)
|
||||
myfile = client.files.get(name=myfile.name)
|
||||
|
||||
if myfile.state.name == 'FAILED':
|
||||
raise ValueError('Video processing failed')
|
||||
|
||||
# Analyze
|
||||
response = client.models.generate_content(
|
||||
model='gemini-2.5-flash',
|
||||
contents=['Summarize this video in 3 key points', myfile]
|
||||
)
|
||||
print(response.text)
|
||||
```
|
||||
|
||||
### YouTube Video
|
||||
|
||||
```python
|
||||
from google.genai import types
|
||||
|
||||
response = client.models.generate_content(
|
||||
model='gemini-2.5-flash',
|
||||
contents=[
|
||||
'Summarize the main topics discussed',
|
||||
types.Part.from_uri(
|
||||
uri='https://www.youtube.com/watch?v=VIDEO_ID',
|
||||
mime_type='video/mp4'
|
||||
)
|
||||
]
|
||||
)
|
||||
```
|
||||
|
||||
### Inline Video (<20MB)
|
||||
|
||||
```python
|
||||
with open('short-clip.mp4', 'rb') as f:
|
||||
video_bytes = f.read()
|
||||
|
||||
response = client.models.generate_content(
|
||||
model='gemini-2.5-flash',
|
||||
contents=[
|
||||
'What happens in this video?',
|
||||
types.Part.from_bytes(data=video_bytes, mime_type='video/mp4')
|
||||
]
|
||||
)
|
||||
```
|
||||
|
||||
## Advanced Features
|
||||
|
||||
### Video Clipping
|
||||
|
||||
```python
|
||||
# Analyze specific time range
|
||||
response = client.models.generate_content(
|
||||
model='gemini-2.5-flash',
|
||||
contents=[
|
||||
'Summarize this segment',
|
||||
types.Part.from_video_metadata(
|
||||
file_uri=myfile.uri,
|
||||
start_offset='40s',
|
||||
end_offset='80s'
|
||||
)
|
||||
]
|
||||
)
|
||||
```
|
||||
|
||||
### Custom Frame Rate
|
||||
|
||||
```python
|
||||
# Lower FPS for static content (saves tokens)
|
||||
response = client.models.generate_content(
|
||||
model='gemini-2.5-flash',
|
||||
contents=[
|
||||
'Analyze this presentation',
|
||||
types.Part.from_video_metadata(
|
||||
file_uri=myfile.uri,
|
||||
fps=0.5 # Sample every 2 seconds
|
||||
)
|
||||
]
|
||||
)
|
||||
|
||||
# Higher FPS for fast-moving content
|
||||
response = client.models.generate_content(
|
||||
model='gemini-2.5-flash',
|
||||
contents=[
|
||||
'Analyze rapid movements in this sports video',
|
||||
types.Part.from_video_metadata(
|
||||
file_uri=myfile.uri,
|
||||
fps=5 # Sample 5 times per second
|
||||
)
|
||||
]
|
||||
)
|
||||
```
|
||||
|
||||
### Multiple Videos (2.5+)
|
||||
|
||||
```python
|
||||
video1 = client.files.upload(file='demo1.mp4')
|
||||
video2 = client.files.upload(file='demo2.mp4')
|
||||
|
||||
# Wait for processing
|
||||
for video in [video1, video2]:
|
||||
while video.state.name == 'PROCESSING':
|
||||
time.sleep(1)
|
||||
video = client.files.get(name=video.name)
|
||||
|
||||
response = client.models.generate_content(
|
||||
model='gemini-2.5-pro',
|
||||
contents=[
|
||||
'Compare these two product demos. Which explains features better?',
|
||||
video1,
|
||||
video2
|
||||
]
|
||||
)
|
||||
```
|
||||
|
||||
## Temporal Understanding
|
||||
|
||||
### Timestamp-Based Questions
|
||||
|
||||
```python
|
||||
response = client.models.generate_content(
|
||||
model='gemini-2.5-flash',
|
||||
contents=[
|
||||
'What happens at 01:15 and how does it relate to 02:30?',
|
||||
myfile
|
||||
]
|
||||
)
|
||||
```
|
||||
|
||||
### Timeline Creation
|
||||
|
||||
```python
|
||||
response = client.models.generate_content(
|
||||
model='gemini-2.5-flash',
|
||||
contents=[
|
||||
'''Create a timeline with timestamps:
|
||||
- Key events
|
||||
- Scene changes
|
||||
- Important moments
|
||||
Format: MM:SS - Description
|
||||
''',
|
||||
myfile
|
||||
]
|
||||
)
|
||||
```
|
||||
|
||||
### Scene Detection
|
||||
|
||||
```python
|
||||
response = client.models.generate_content(
|
||||
model='gemini-2.5-flash',
|
||||
contents=[
|
||||
'Identify all scene changes with timestamps and describe each scene',
|
||||
myfile
|
||||
]
|
||||
)
|
||||
```
|
||||
|
||||
## Transcription
|
||||
|
||||
### Basic Transcription
|
||||
|
||||
```python
|
||||
response = client.models.generate_content(
|
||||
model='gemini-2.5-flash',
|
||||
contents=[
|
||||
'Transcribe the audio from this video',
|
||||
myfile
|
||||
]
|
||||
)
|
||||
```
|
||||
|
||||
### With Visual Descriptions
|
||||
|
||||
```python
|
||||
response = client.models.generate_content(
|
||||
model='gemini-2.5-flash',
|
||||
contents=[
|
||||
'''Transcribe with visual context:
|
||||
- Audio transcription
|
||||
- Visual descriptions of important moments
|
||||
- Timestamps for salient events
|
||||
''',
|
||||
myfile
|
||||
]
|
||||
)
|
||||
```
|
||||
|
||||
### Speaker Identification
|
||||
|
||||
```python
|
||||
response = client.models.generate_content(
|
||||
model='gemini-2.5-flash',
|
||||
contents=[
|
||||
'Transcribe with speaker labels and timestamps',
|
||||
myfile
|
||||
]
|
||||
)
|
||||
```
|
||||
|
||||
## Common Use Cases
|
||||
|
||||
### 1. Video Summarization
|
||||
|
||||
```python
|
||||
response = client.models.generate_content(
|
||||
model='gemini-2.5-flash',
|
||||
contents=[
|
||||
'''Summarize this video:
|
||||
1. Main topic and purpose
|
||||
2. Key points with timestamps
|
||||
3. Conclusion or call-to-action
|
||||
''',
|
||||
myfile
|
||||
]
|
||||
)
|
||||
```
|
||||
|
||||
### 2. Educational Content
|
||||
|
||||
```python
|
||||
response = client.models.generate_content(
|
||||
model='gemini-2.5-flash',
|
||||
contents=[
|
||||
'''Create educational materials:
|
||||
1. List key concepts taught
|
||||
2. Create 5 quiz questions with answers
|
||||
3. Provide timestamp for each concept
|
||||
''',
|
||||
myfile
|
||||
]
|
||||
)
|
||||
```
|
||||
|
||||
### 3. Action Detection
|
||||
|
||||
```python
|
||||
response = client.models.generate_content(
|
||||
model='gemini-2.5-flash',
|
||||
contents=[
|
||||
'List all actions performed in this tutorial with timestamps',
|
||||
myfile
|
||||
]
|
||||
)
|
||||
```
|
||||
|
||||
### 4. Content Moderation
|
||||
|
||||
```python
|
||||
response = client.models.generate_content(
|
||||
model='gemini-2.5-flash',
|
||||
contents=[
|
||||
'''Review video content:
|
||||
1. Identify any problematic content
|
||||
2. Note timestamps of concerns
|
||||
3. Provide content rating recommendation
|
||||
''',
|
||||
myfile
|
||||
]
|
||||
)
|
||||
```
|
||||
|
||||
### 5. Interview Analysis
|
||||
|
||||
```python
|
||||
response = client.models.generate_content(
|
||||
model='gemini-2.5-flash',
|
||||
contents=[
|
||||
'''Analyze interview:
|
||||
1. Questions asked (timestamps)
|
||||
2. Key responses
|
||||
3. Candidate body language and demeanor
|
||||
4. Overall assessment
|
||||
''',
|
||||
myfile
|
||||
]
|
||||
)
|
||||
```
|
||||
|
||||
### 6. Sports Analysis
|
||||
|
||||
```python
|
||||
response = client.models.generate_content(
|
||||
model='gemini-2.5-flash',
|
||||
contents=[
|
||||
'''Analyze sports video:
|
||||
1. Key plays with timestamps
|
||||
2. Player movements and positioning
|
||||
3. Game strategy observations
|
||||
''',
|
||||
types.Part.from_video_metadata(
|
||||
file_uri=myfile.uri,
|
||||
fps=5 # Higher FPS for fast action
|
||||
)
|
||||
]
|
||||
)
|
||||
```
|
||||
|
||||
## YouTube Specific Features
|
||||
|
||||
### Public Video Requirements
|
||||
|
||||
- Video must be public (not private or unlisted)
|
||||
- No age-restricted content
|
||||
- Valid video ID required
|
||||
|
||||
### Usage Example
|
||||
|
||||
```python
|
||||
# YouTube URL
|
||||
youtube_uri = 'https://www.youtube.com/watch?v=dQw4w9WgXcQ'
|
||||
|
||||
response = client.models.generate_content(
|
||||
model='gemini-2.5-flash',
|
||||
contents=[
|
||||
'Create chapter markers with timestamps',
|
||||
types.Part.from_uri(uri=youtube_uri, mime_type='video/mp4')
|
||||
]
|
||||
)
|
||||
```
|
||||
|
||||
### Rate Limits
|
||||
|
||||
- **Free tier**: 8 hours of YouTube video per day
|
||||
- **Paid tier**: No length-based limits
|
||||
- Public videos only
|
||||
|
||||
## Token Calculation
|
||||
|
||||
Video tokens depend on resolution and FPS:
|
||||
|
||||
**Default resolution** (~300 tokens/second):
|
||||
- 1 minute = 18,000 tokens
|
||||
- 10 minutes = 180,000 tokens
|
||||
- 1 hour = 1,080,000 tokens
|
||||
|
||||
**Low resolution** (~100 tokens/second):
|
||||
- 1 minute = 6,000 tokens
|
||||
- 10 minutes = 60,000 tokens
|
||||
- 1 hour = 360,000 tokens
|
||||
|
||||
**Context windows**:
|
||||
- 2M tokens ≈ 2 hours (default) or 6 hours (low-res)
|
||||
- 1M tokens ≈ 1 hour (default) or 3 hours (low-res)
|
||||
|
||||
## Best Practices
|
||||
|
||||
### File Management
|
||||
|
||||
1. Use File API for videos >20MB (most videos)
|
||||
2. Wait for ACTIVE state before analysis
|
||||
3. Files auto-delete after 48 hours
|
||||
4. Clean up manually:
|
||||
```python
|
||||
client.files.delete(name=myfile.name)
|
||||
```
|
||||
|
||||
### Optimization Strategies
|
||||
|
||||
**Reduce token usage**:
|
||||
- Process specific segments using start/end offsets
|
||||
- Use lower FPS for static content
|
||||
- Use low-resolution mode for long videos
|
||||
- Split very long videos into chunks
|
||||
|
||||
**Improve accuracy**:
|
||||
- Provide context in prompts
|
||||
- Use higher FPS for fast-moving content
|
||||
- Use Pro model for complex analysis
|
||||
- Be specific about what to extract
|
||||
|
||||
### Prompt Engineering
|
||||
|
||||
**Effective prompts**:
|
||||
- "Summarize key points with timestamps in MM:SS format"
|
||||
- "Identify all scene changes and describe each scene"
|
||||
- "Extract action items mentioned with timestamps"
|
||||
- "Compare these two videos on: X, Y, Z criteria"
|
||||
|
||||
**Structured output**:
|
||||
```python
|
||||
from pydantic import BaseModel
|
||||
from typing import List
|
||||
|
||||
class VideoEvent(BaseModel):
|
||||
timestamp: str # MM:SS format
|
||||
description: str
|
||||
category: str
|
||||
|
||||
class VideoAnalysis(BaseModel):
|
||||
summary: str
|
||||
events: List[VideoEvent]
|
||||
duration: str
|
||||
|
||||
response = client.models.generate_content(
|
||||
model='gemini-2.5-flash',
|
||||
contents=['Analyze this video', myfile],
|
||||
config=genai.types.GenerateContentConfig(
|
||||
response_mime_type='application/json',
|
||||
response_schema=VideoAnalysis
|
||||
)
|
||||
)
|
||||
```
|
||||
|
||||
### Error Handling
|
||||
|
||||
```python
|
||||
import time
|
||||
|
||||
def upload_and_process_video(file_path, max_wait=300):
|
||||
"""Upload video and wait for processing"""
|
||||
myfile = client.files.upload(file=file_path)
|
||||
|
||||
elapsed = 0
|
||||
while myfile.state.name == 'PROCESSING' and elapsed < max_wait:
|
||||
time.sleep(5)
|
||||
myfile = client.files.get(name=myfile.name)
|
||||
elapsed += 5
|
||||
|
||||
if myfile.state.name == 'FAILED':
|
||||
raise ValueError(f'Video processing failed: {myfile.state.name}')
|
||||
|
||||
if myfile.state.name == 'PROCESSING':
|
||||
raise TimeoutError(f'Processing timeout after {max_wait}s')
|
||||
|
||||
return myfile
|
||||
```
|
||||
|
||||
## Cost Optimization
|
||||
|
||||
**Token costs** (Gemini 2.5 Flash at $1/1M):
|
||||
- 1 minute video (default): 18,000 tokens = $0.018
|
||||
- 10 minute video: 180,000 tokens = $0.18
|
||||
- 1 hour video: 1,080,000 tokens = $1.08
|
||||
|
||||
**Strategies**:
|
||||
- Use video clipping for specific segments
|
||||
- Lower FPS for static content
|
||||
- Use low-resolution mode for long videos
|
||||
- Batch related queries on same video
|
||||
- Use context caching for repeated queries
|
||||
|
||||
## Limitations
|
||||
|
||||
- Maximum 6 hours (low-res) or 2 hours (default)
|
||||
- YouTube videos must be public
|
||||
- No live streaming analysis
|
||||
- Files expire after 48 hours
|
||||
- Processing time varies by video length
|
||||
- No real-time processing
|
||||
- Limited to 10 videos per request (2.5+)
|
||||
483
skills/ai-multimodal/references/vision-understanding.md
Normal file
483
skills/ai-multimodal/references/vision-understanding.md
Normal file
@@ -0,0 +1,483 @@
|
||||
# Vision Understanding Reference
|
||||
|
||||
Comprehensive guide for image analysis, object detection, and visual understanding using Gemini API.
|
||||
|
||||
## Core Capabilities
|
||||
|
||||
- **Captioning**: Generate descriptive text for images
|
||||
- **Classification**: Categorize and identify content
|
||||
- **Visual Q&A**: Answer questions about images
|
||||
- **Object Detection**: Locate objects with bounding boxes (2.0+)
|
||||
- **Segmentation**: Create pixel-level masks (2.5+)
|
||||
- **Multi-image**: Compare up to 3,600 images
|
||||
- **OCR**: Extract text from images
|
||||
- **Document Understanding**: Process PDFs with vision
|
||||
|
||||
## Supported Formats
|
||||
|
||||
- **Images**: PNG, JPEG, WEBP, HEIC, HEIF
|
||||
- **Documents**: PDF (up to 1,000 pages)
|
||||
- **Size Limits**:
|
||||
- Inline: 20MB max total request
|
||||
- File API: 2GB per file
|
||||
- Max images: 3,600 per request
|
||||
|
||||
## Model Selection
|
||||
|
||||
### Gemini 2.5 Series
|
||||
- **gemini-2.5-pro**: Best quality, segmentation + detection
|
||||
- **gemini-2.5-flash**: Fast, efficient, all features
|
||||
- **gemini-2.5-flash-lite**: Lightweight, all features
|
||||
|
||||
### Gemini 2.0 Series
|
||||
- **gemini-2.0-flash**: Object detection support
|
||||
- **gemini-2.0-flash-lite**: Lightweight detection
|
||||
|
||||
### Feature Requirements
|
||||
- **Segmentation**: Requires 2.5+ models
|
||||
- **Object Detection**: Requires 2.0+ models
|
||||
- **Multi-image**: All models (up to 3,600 images)
|
||||
|
||||
## Basic Image Analysis
|
||||
|
||||
### Image Captioning
|
||||
|
||||
```python
|
||||
from google import genai
|
||||
import os
|
||||
|
||||
client = genai.Client(api_key=os.getenv('GEMINI_API_KEY'))
|
||||
|
||||
# Local file
|
||||
with open('image.jpg', 'rb') as f:
|
||||
img_bytes = f.read()
|
||||
|
||||
response = client.models.generate_content(
|
||||
model='gemini-2.5-flash',
|
||||
contents=[
|
||||
'Describe this image in detail',
|
||||
genai.types.Part.from_bytes(data=img_bytes, mime_type='image/jpeg')
|
||||
]
|
||||
)
|
||||
print(response.text)
|
||||
```
|
||||
|
||||
### Image Classification
|
||||
|
||||
```python
|
||||
response = client.models.generate_content(
|
||||
model='gemini-2.5-flash',
|
||||
contents=[
|
||||
'Classify this image. Provide category and confidence level.',
|
||||
img_part
|
||||
]
|
||||
)
|
||||
```
|
||||
|
||||
### Visual Question Answering
|
||||
|
||||
```python
|
||||
response = client.models.generate_content(
|
||||
model='gemini-2.5-flash',
|
||||
contents=[
|
||||
'How many people are in this image and what are they doing?',
|
||||
img_part
|
||||
]
|
||||
)
|
||||
```
|
||||
|
||||
## Advanced Features
|
||||
|
||||
### Object Detection (2.0+)
|
||||
|
||||
```python
|
||||
response = client.models.generate_content(
|
||||
model='gemini-2.0-flash',
|
||||
contents=[
|
||||
'Detect all objects in this image and provide bounding boxes',
|
||||
img_part
|
||||
]
|
||||
)
|
||||
|
||||
# Returns bounding box coordinates: [ymin, xmin, ymax, xmax]
|
||||
# Normalized to [0, 1000] range
|
||||
```
|
||||
|
||||
### Segmentation (2.5+)
|
||||
|
||||
```python
|
||||
response = client.models.generate_content(
|
||||
model='gemini-2.5-flash',
|
||||
contents=[
|
||||
'Create a segmentation mask for all people in this image',
|
||||
img_part
|
||||
]
|
||||
)
|
||||
|
||||
# Returns pixel-level masks for requested objects
|
||||
```
|
||||
|
||||
### Multi-Image Comparison
|
||||
|
||||
```python
|
||||
import PIL.Image
|
||||
|
||||
img1 = PIL.Image.open('photo1.jpg')
|
||||
img2 = PIL.Image.open('photo2.jpg')
|
||||
|
||||
response = client.models.generate_content(
|
||||
model='gemini-2.5-flash',
|
||||
contents=[
|
||||
'Compare these two images. What are the differences?',
|
||||
img1,
|
||||
img2
|
||||
]
|
||||
)
|
||||
```
|
||||
|
||||
### OCR and Text Extraction
|
||||
|
||||
```python
|
||||
response = client.models.generate_content(
|
||||
model='gemini-2.5-flash',
|
||||
contents=[
|
||||
'Extract all visible text from this image',
|
||||
img_part
|
||||
]
|
||||
)
|
||||
```
|
||||
|
||||
## Input Methods
|
||||
|
||||
### Inline Data (<20MB)
|
||||
|
||||
```python
|
||||
from google.genai import types
|
||||
|
||||
# From file
|
||||
with open('image.jpg', 'rb') as f:
|
||||
img_bytes = f.read()
|
||||
|
||||
response = client.models.generate_content(
|
||||
model='gemini-2.5-flash',
|
||||
contents=[
|
||||
'Analyze this image',
|
||||
types.Part.from_bytes(data=img_bytes, mime_type='image/jpeg')
|
||||
]
|
||||
)
|
||||
```
|
||||
|
||||
### PIL Image
|
||||
|
||||
```python
|
||||
import PIL.Image
|
||||
|
||||
img = PIL.Image.open('photo.jpg')
|
||||
|
||||
response = client.models.generate_content(
|
||||
model='gemini-2.5-flash',
|
||||
contents=['What is in this image?', img]
|
||||
)
|
||||
```
|
||||
|
||||
### File API (>20MB or Reuse)
|
||||
|
||||
```python
|
||||
# Upload once
|
||||
myfile = client.files.upload(file='large-image.jpg')
|
||||
|
||||
# Use multiple times
|
||||
response1 = client.models.generate_content(
|
||||
model='gemini-2.5-flash',
|
||||
contents=['Describe this image', myfile]
|
||||
)
|
||||
|
||||
response2 = client.models.generate_content(
|
||||
model='gemini-2.5-flash',
|
||||
contents=['What colors dominate this image?', myfile]
|
||||
)
|
||||
```
|
||||
|
||||
### URL (Public Images)
|
||||
|
||||
```python
|
||||
response = client.models.generate_content(
|
||||
model='gemini-2.5-flash',
|
||||
contents=[
|
||||
'Analyze this image',
|
||||
types.Part.from_uri(
|
||||
uri='https://example.com/image.jpg',
|
||||
mime_type='image/jpeg'
|
||||
)
|
||||
]
|
||||
)
|
||||
```
|
||||
|
||||
## Token Calculation
|
||||
|
||||
Images consume tokens based on size:
|
||||
|
||||
**Small images** (≤384px both dimensions): 258 tokens
|
||||
|
||||
**Large images**: Tiled into 768×768 chunks, 258 tokens each
|
||||
|
||||
**Formula**:
|
||||
```
|
||||
crop_unit = floor(min(width, height) / 1.5)
|
||||
tiles = ceil(width / crop_unit) × ceil(height / crop_unit)
|
||||
total_tokens = tiles × 258
|
||||
```
|
||||
|
||||
**Examples**:
|
||||
- 256×256: 258 tokens (small)
|
||||
- 512×512: 258 tokens (small)
|
||||
- 960×540: 6 tiles = 1,548 tokens
|
||||
- 1920×1080: 6 tiles = 1,548 tokens
|
||||
- 3840×2160 (4K): 24 tiles = 6,192 tokens
|
||||
|
||||
## Structured Output
|
||||
|
||||
### JSON Schema Output
|
||||
|
||||
```python
|
||||
from pydantic import BaseModel
|
||||
from typing import List
|
||||
|
||||
class ObjectDetection(BaseModel):
|
||||
object_name: str
|
||||
confidence: float
|
||||
bounding_box: List[int] # [ymin, xmin, ymax, xmax]
|
||||
|
||||
class ImageAnalysis(BaseModel):
|
||||
description: str
|
||||
objects: List[ObjectDetection]
|
||||
scene_type: str
|
||||
|
||||
response = client.models.generate_content(
|
||||
model='gemini-2.5-flash',
|
||||
contents=['Analyze this image', img_part],
|
||||
config=genai.types.GenerateContentConfig(
|
||||
response_mime_type='application/json',
|
||||
response_schema=ImageAnalysis
|
||||
)
|
||||
)
|
||||
|
||||
result = ImageAnalysis.model_validate_json(response.text)
|
||||
```
|
||||
|
||||
## Multi-Image Analysis
|
||||
|
||||
### Batch Processing
|
||||
|
||||
```python
|
||||
images = [
|
||||
PIL.Image.open(f'image{i}.jpg')
|
||||
for i in range(10)
|
||||
]
|
||||
|
||||
response = client.models.generate_content(
|
||||
model='gemini-2.5-flash',
|
||||
contents=['Analyze these images and find common themes'] + images
|
||||
)
|
||||
```
|
||||
|
||||
### Image Comparison
|
||||
|
||||
```python
|
||||
before = PIL.Image.open('before.jpg')
|
||||
after = PIL.Image.open('after.jpg')
|
||||
|
||||
response = client.models.generate_content(
|
||||
model='gemini-2.5-flash',
|
||||
contents=[
|
||||
'Compare before and after. List all visible changes.',
|
||||
before,
|
||||
after
|
||||
]
|
||||
)
|
||||
```
|
||||
|
||||
### Visual Search
|
||||
|
||||
```python
|
||||
reference = PIL.Image.open('target.jpg')
|
||||
candidates = [PIL.Image.open(f'option{i}.jpg') for i in range(5)]
|
||||
|
||||
response = client.models.generate_content(
|
||||
model='gemini-2.5-flash',
|
||||
contents=[
|
||||
'Find which candidate images contain objects similar to the reference',
|
||||
reference
|
||||
] + candidates
|
||||
)
|
||||
```
|
||||
|
||||
## Best Practices
|
||||
|
||||
### Image Quality
|
||||
|
||||
1. **Resolution**: Use clear, non-blurry images
|
||||
2. **Rotation**: Verify correct orientation
|
||||
3. **Lighting**: Ensure good contrast and lighting
|
||||
4. **Size optimization**: Balance quality vs token cost
|
||||
5. **Format**: JPEG for photos, PNG for graphics
|
||||
|
||||
### Prompt Engineering
|
||||
|
||||
**Specific instructions**:
|
||||
- "Identify all vehicles with their colors and positions"
|
||||
- "Count people wearing blue shirts"
|
||||
- "Extract text from the sign in the top-left corner"
|
||||
|
||||
**Output format**:
|
||||
- "Return results as JSON with fields: category, count, description"
|
||||
- "Format as markdown table"
|
||||
- "List findings as numbered items"
|
||||
|
||||
**Few-shot examples**:
|
||||
```python
|
||||
response = client.models.generate_content(
|
||||
model='gemini-2.5-flash',
|
||||
contents=[
|
||||
'Example: For an image of a cat on a sofa, respond: "Object: cat, Location: sofa"',
|
||||
'Now analyze this image:',
|
||||
img_part
|
||||
]
|
||||
)
|
||||
```
|
||||
|
||||
### File Management
|
||||
|
||||
1. Use File API for images >20MB
|
||||
2. Use File API for repeated queries (saves tokens)
|
||||
3. Files auto-delete after 48 hours
|
||||
4. Clean up manually:
|
||||
```python
|
||||
client.files.delete(name=myfile.name)
|
||||
```
|
||||
|
||||
### Cost Optimization
|
||||
|
||||
**Token-efficient strategies**:
|
||||
- Resize large images before upload
|
||||
- Use File API for repeated queries
|
||||
- Batch multiple images when related
|
||||
- Use appropriate model (Flash vs Pro)
|
||||
|
||||
**Token costs** (Gemini 2.5 Flash at $1/1M):
|
||||
- Small image (258 tokens): $0.000258
|
||||
- HD image (1,548 tokens): $0.001548
|
||||
- 4K image (6,192 tokens): $0.006192
|
||||
|
||||
## Common Use Cases
|
||||
|
||||
### 1. Product Analysis
|
||||
|
||||
```python
|
||||
response = client.models.generate_content(
|
||||
model='gemini-2.5-flash',
|
||||
contents=[
|
||||
'''Analyze this product image:
|
||||
1. Identify the product
|
||||
2. List visible features
|
||||
3. Assess condition
|
||||
4. Estimate value range
|
||||
''',
|
||||
img_part
|
||||
]
|
||||
)
|
||||
```
|
||||
|
||||
### 2. Screenshot Analysis
|
||||
|
||||
```python
|
||||
response = client.models.generate_content(
|
||||
model='gemini-2.5-flash',
|
||||
contents=[
|
||||
'Extract all text and UI elements from this screenshot',
|
||||
img_part
|
||||
]
|
||||
)
|
||||
```
|
||||
|
||||
### 3. Medical Imaging (Informational Only)
|
||||
|
||||
```python
|
||||
response = client.models.generate_content(
|
||||
model='gemini-2.5-pro',
|
||||
contents=[
|
||||
'Describe visible features in this medical image. Note: This is for informational purposes only.',
|
||||
img_part
|
||||
]
|
||||
)
|
||||
```
|
||||
|
||||
### 4. Chart/Graph Reading
|
||||
|
||||
```python
|
||||
response = client.models.generate_content(
|
||||
model='gemini-2.5-flash',
|
||||
contents=[
|
||||
'Extract data from this chart and format as JSON',
|
||||
img_part
|
||||
]
|
||||
)
|
||||
```
|
||||
|
||||
### 5. Scene Understanding
|
||||
|
||||
```python
|
||||
response = client.models.generate_content(
|
||||
model='gemini-2.5-flash',
|
||||
contents=[
|
||||
'''Analyze this scene:
|
||||
1. Location type
|
||||
2. Time of day
|
||||
3. Weather conditions
|
||||
4. Activities happening
|
||||
5. Mood/atmosphere
|
||||
''',
|
||||
img_part
|
||||
]
|
||||
)
|
||||
```
|
||||
|
||||
## Error Handling
|
||||
|
||||
```python
|
||||
import time
|
||||
|
||||
def analyze_image_with_retry(image_path, prompt, max_retries=3):
|
||||
"""Analyze image with exponential backoff retry"""
|
||||
for attempt in range(max_retries):
|
||||
try:
|
||||
with open(image_path, 'rb') as f:
|
||||
img_bytes = f.read()
|
||||
|
||||
response = client.models.generate_content(
|
||||
model='gemini-2.5-flash',
|
||||
contents=[
|
||||
prompt,
|
||||
genai.types.Part.from_bytes(
|
||||
data=img_bytes,
|
||||
mime_type='image/jpeg'
|
||||
)
|
||||
]
|
||||
)
|
||||
return response.text
|
||||
except Exception as e:
|
||||
if attempt == max_retries - 1:
|
||||
raise
|
||||
wait_time = 2 ** attempt
|
||||
print(f"Retry {attempt + 1} after {wait_time}s: {e}")
|
||||
time.sleep(wait_time)
|
||||
```
|
||||
|
||||
## Limitations
|
||||
|
||||
- Maximum 3,600 images per request
|
||||
- OCR accuracy varies with text quality
|
||||
- Object detection requires 2.0+ models
|
||||
- Segmentation requires 2.5+ models
|
||||
- No video frame extraction (use video API)
|
||||
- Regional restrictions on child images (EEA, CH, UK)
|
||||
395
skills/ai-multimodal/scripts/document_converter.py
Normal file
395
skills/ai-multimodal/scripts/document_converter.py
Normal file
@@ -0,0 +1,395 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Convert documents to Markdown using Gemini API.
|
||||
|
||||
Supports all document types:
|
||||
- PDF documents (native vision processing)
|
||||
- Images (JPEG, PNG, WEBP, HEIC)
|
||||
- Office documents (DOCX, XLSX, PPTX)
|
||||
- HTML, TXT, and other text formats
|
||||
|
||||
Features:
|
||||
- Converts to clean markdown format
|
||||
- Preserves structure, tables, and formatting
|
||||
- Extracts text from images and scanned documents
|
||||
- Batch conversion support
|
||||
- Saves to docs/assets/document-extraction.md by default
|
||||
"""
|
||||
|
||||
import argparse
|
||||
import os
|
||||
import sys
|
||||
import time
|
||||
from pathlib import Path
|
||||
from typing import Optional, List, Dict, Any
|
||||
|
||||
try:
|
||||
from google import genai
|
||||
from google.genai import types
|
||||
except ImportError:
|
||||
print("Error: google-genai package not installed")
|
||||
print("Install with: pip install google-genai")
|
||||
sys.exit(1)
|
||||
|
||||
try:
|
||||
from dotenv import load_dotenv
|
||||
except ImportError:
|
||||
load_dotenv = None
|
||||
|
||||
|
||||
def find_api_key() -> Optional[str]:
    """Find the Gemini API key, checking the environment first, then .env files.

    Priority order (highest to lowest):
    1. process.env (runtime environment variables)
    2. .claude/skills/ai-multimodal/.env (skill-specific config)
    3. .claude/skills/.env (shared skills config)
    4. .claude/.env (Claude global config)

    Returns:
        The API key string, or None if it was not found anywhere.
    """
    # Priority 1: a key already present in the runtime environment always wins.
    api_key = os.getenv('GEMINI_API_KEY')
    if api_key:
        return api_key

    # Fall back to .env files only when python-dotenv is installed
    # (load_dotenv is None when the optional dependency is missing).
    if load_dotenv:
        skill_dir = Path(__file__).parent.parent  # .claude/skills/ai-multimodal
        # Priorities 2-4: walk outward from the skill directory to .claude.
        search_dirs = (
            skill_dir,                 # .claude/skills/ai-multimodal
            skill_dir.parent,          # .claude/skills
            skill_dir.parent.parent,   # .claude
        )
        for env_dir in search_dirs:
            env_file = env_dir / '.env'
            if env_file.exists():
                load_dotenv(env_file)
                api_key = os.getenv('GEMINI_API_KEY')
                if api_key:
                    return api_key

    return None
|
||||
|
||||
|
||||
def find_project_root() -> Path:
    """Locate the project root by walking upward from this script's directory.

    The first ancestor containing a ``.git`` or ``.claude`` directory is taken
    as the root; if none is found, the script's own directory is returned.
    """
    start = Path(__file__).parent

    # Check the starting directory itself, then every ancestor in order.
    for directory in (start, *start.parents):
        has_git = (directory / '.git').exists()
        has_claude = (directory / '.claude').exists()
        if has_git or has_claude:
            return directory

    # No marker found anywhere up the tree: fall back to where we started.
    return start
|
||||
|
||||
|
||||
def get_mime_type(file_path: str) -> str:
    """Map a file's extension to its MIME type.

    Returns 'application/octet-stream' for any extension not in the
    supported set (documents, images, Office formats).
    """
    extension_to_mime = {
        # Documents
        '.pdf': 'application/pdf',
        '.txt': 'text/plain',
        '.html': 'text/html',
        '.htm': 'text/html',
        '.md': 'text/markdown',
        '.csv': 'text/csv',
        # Images
        '.jpg': 'image/jpeg',
        '.jpeg': 'image/jpeg',
        '.png': 'image/png',
        '.webp': 'image/webp',
        '.heic': 'image/heic',
        '.heif': 'image/heif',
        # Office (need to be uploaded as binary)
        '.docx': 'application/vnd.openxmlformats-officedocument.wordprocessingml.document',
        '.xlsx': 'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet',
        '.pptx': 'application/vnd.openxmlformats-officedocument.presentationml.presentation',
    }

    suffix = Path(file_path).suffix.lower()
    return extension_to_mime.get(suffix, 'application/octet-stream')
|
||||
|
||||
|
||||
def upload_file(client: genai.Client, file_path: str, verbose: bool = False) -> Any:
    """Upload a file via the Gemini File API and wait until it is ready.

    Polls every 2 seconds, for at most 5 minutes, while the remote file is
    in the PROCESSING state.

    Raises:
        ValueError: if the service reports the upload as FAILED.
        TimeoutError: if processing does not finish within the wait budget.
    """
    if verbose:
        print(f"Uploading {file_path}...")

    uploaded = client.files.upload(file=file_path)

    # Poll until the service finishes processing or the time budget runs out.
    poll_interval = 2
    max_wait = 300  # 5 minutes
    waited = 0
    while uploaded.state.name == 'PROCESSING' and waited < max_wait:
        time.sleep(poll_interval)
        uploaded = client.files.get(name=uploaded.name)
        waited += poll_interval
        if verbose and waited % 10 == 0:
            print(f" Processing... {waited}s")

    if uploaded.state.name == 'FAILED':
        raise ValueError(f"File processing failed: {file_path}")

    if uploaded.state.name == 'PROCESSING':
        raise TimeoutError(f"Processing timeout after {max_wait}s: {file_path}")

    if verbose:
        print(f" Uploaded: {uploaded.name}")

    return uploaded
|
||||
|
||||
|
||||
def convert_to_markdown(
    client: genai.Client,
    file_path: str,
    model: str = 'gemini-2.5-flash',
    custom_prompt: Optional[str] = None,
    verbose: bool = False,
    max_retries: int = 3
) -> Dict[str, Any]:
    """Convert a single document to markdown using Gemini.

    Retries with exponential backoff (2**attempt seconds) on any exception.

    Args:
        client: Initialized Gemini client.
        file_path: Path to the document to convert.
        model: Gemini model name to use for the conversion.
        custom_prompt: Optional prompt overriding the default conversion prompt.
        verbose: Print retry progress messages.
        max_retries: Number of attempts before giving up.

    Returns:
        Dict with keys 'file', 'status' ('success'/'error'), 'markdown'
        (None on error), and 'error' (only on error).
    """

    for attempt in range(max_retries):
        try:
            file_path_obj = Path(file_path)
            file_size = file_path_obj.stat().st_size
            # Inline data is limited to 20MB; larger files go via the File API.
            use_file_api = file_size > 20 * 1024 * 1024  # >20MB

            # Default prompt for markdown conversion
            if custom_prompt:
                prompt = custom_prompt
            else:
                prompt = """Convert this document to clean, well-formatted Markdown.

Requirements:
- Preserve all content, structure, and formatting
- Convert tables to markdown table format
- Maintain heading hierarchy (# ## ### etc)
- Preserve lists, code blocks, and quotes
- Extract text from images if present
- Keep formatting consistent and readable

Output only the markdown content without any preamble or explanation."""

            # Upload or inline the file
            if use_file_api:
                myfile = upload_file(client, str(file_path), verbose)
                content = [prompt, myfile]
            else:
                with open(file_path, 'rb') as f:
                    file_bytes = f.read()

                mime_type = get_mime_type(str(file_path))
                content = [
                    prompt,
                    types.Part.from_bytes(data=file_bytes, mime_type=mime_type)
                ]

            # Generate markdown
            response = client.models.generate_content(
                model=model,
                contents=content
            )

            # Empty string (not None) when the response carries no text.
            markdown_content = response.text if hasattr(response, 'text') else ''

            return {
                'file': str(file_path),
                'status': 'success',
                'markdown': markdown_content
            }

        except Exception as e:
            # Last attempt: return an error record instead of raising,
            # so batch processing can continue with the remaining files.
            if attempt == max_retries - 1:
                return {
                    'file': str(file_path),
                    'status': 'error',
                    'error': str(e),
                    'markdown': None
                }

            # Exponential backoff: 1s, 2s, 4s, ...
            wait_time = 2 ** attempt
            if verbose:
                print(f" Retry {attempt + 1} after {wait_time}s: {e}")
            time.sleep(wait_time)
|
||||
|
||||
|
||||
def batch_convert(
    files: List[str],
    output_file: Optional[str] = None,
    auto_name: bool = False,
    model: str = 'gemini-2.5-flash',
    custom_prompt: Optional[str] = None,
    verbose: bool = False
) -> List[Dict[str, Any]]:
    """Batch convert multiple files to markdown and write a combined report.

    Args:
        files: Paths of the documents to convert.
        output_file: Destination markdown file; defaults to
            <project-root>/docs/assets/document-extraction.md.
        auto_name: With a single input and no explicit output, derive the
            output name from the input stem (<stem>-extraction.md).
        model: Gemini model to use.
        custom_prompt: Optional prompt overriding the default conversion prompt.
        verbose: Print per-file progress.

    Returns:
        Per-file result dicts as produced by convert_to_markdown().

    Exits the process (code 1) when no API key can be found.
    """
    api_key = find_api_key()
    if not api_key:
        print("Error: GEMINI_API_KEY not found")
        print("Set via: export GEMINI_API_KEY='your-key'")
        print("Or create .env file with: GEMINI_API_KEY=your-key")
        sys.exit(1)

    client = genai.Client(api_key=api_key)
    results = []

    # Determine output path
    if not output_file:
        project_root = find_project_root()
        output_dir = project_root / 'docs' / 'assets'

        if auto_name and len(files) == 1:
            # Auto-generate meaningful filename from input
            input_path = Path(files[0])
            base_name = input_path.stem
            output_file = str(output_dir / f"{base_name}-extraction.md")
        else:
            output_file = str(output_dir / 'document-extraction.md')

    output_path = Path(output_file)
    output_path.parent.mkdir(parents=True, exist_ok=True)

    # Process each file
    for i, file_path in enumerate(files, 1):
        if verbose:
            print(f"\n[{i}/{len(files)}] Converting: {file_path}")

        result = convert_to_markdown(
            client=client,
            file_path=file_path,
            model=model,
            custom_prompt=custom_prompt,
            verbose=verbose
        )

        results.append(result)

        if verbose:
            status = result.get('status', 'unknown')
            print(f" Status: {status}")

    # Save combined markdown
    with open(output_path, 'w', encoding='utf-8') as f:
        f.write("# Document Extraction Results\n\n")
        f.write(f"Converted {len(files)} document(s) to markdown.\n\n")
        f.write("---\n\n")

        for result in results:
            f.write(f"## {Path(result['file']).name}\n\n")

            if result['status'] == 'success' and result.get('markdown'):
                f.write(result['markdown'])
                f.write("\n\n")
            elif result['status'] == 'success':
                f.write("**Note**: Conversion succeeded but no content was returned.\n\n")
            else:
                f.write(f"**Error**: {result.get('error', 'Unknown error')}\n\n")

            f.write("---\n\n")

    # Always show the summary and output location
    # (was `if verbose or True:` — the dead condition is removed).
    print(f"\n{'='*50}")
    print(f"Converted: {len(results)} file(s)")
    print(f"Success: {sum(1 for r in results if r['status'] == 'success')}")
    print(f"Failed: {sum(1 for r in results if r['status'] == 'error')}")
    print(f"Output saved to: {output_path}")

    return results
|
||||
|
||||
|
||||
def main():
    """CLI entry point: parse arguments, resolve input files, run conversion."""
    # Hoisted out of the per-file validation loop below, where the original
    # re-executed `import glob` on every non-existent path.
    import glob

    parser = argparse.ArgumentParser(
        description='Convert documents to Markdown using Gemini API',
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Examples:
  # Convert single PDF to markdown (default name)
  %(prog)s --input document.pdf

  # Auto-generate meaningful filename
  %(prog)s --input testpdf.pdf --auto-name
  # Output: docs/assets/testpdf-extraction.md

  # Convert multiple files
  %(prog)s --input doc1.pdf doc2.docx image.png

  # Specify custom output location
  %(prog)s --input document.pdf --output ./output.md

  # Use custom prompt
  %(prog)s --input document.pdf --prompt "Extract only the tables as markdown"

  # Batch convert directory
  %(prog)s --input ./documents/*.pdf --verbose

Supported formats:
  - PDF documents (up to 1,000 pages)
  - Images (JPEG, PNG, WEBP, HEIC)
  - Office documents (DOCX, XLSX, PPTX)
  - Text formats (TXT, HTML, Markdown, CSV)

Default output: <project-root>/docs/assets/document-extraction.md
"""
    )

    parser.add_argument('--input', '-i', nargs='+', required=True,
                        help='Input file(s) to convert')
    parser.add_argument('--output', '-o',
                        help='Output markdown file (default: docs/assets/document-extraction.md)')
    parser.add_argument('--auto-name', '-a', action='store_true',
                        help='Auto-generate meaningful output filename from input (e.g. document.pdf -> document-extraction.md)')
    parser.add_argument('--model', default='gemini-2.5-flash',
                        help='Gemini model to use (default: gemini-2.5-flash)')
    parser.add_argument('--prompt', '-p',
                        help='Custom prompt for conversion')
    parser.add_argument('--verbose', '-v', action='store_true',
                        help='Verbose output')

    args = parser.parse_args()

    # Validate input files: accept literal paths or glob patterns.
    files = []
    for file_pattern in args.input:
        file_path = Path(file_pattern)
        if file_path.exists() and file_path.is_file():
            files.append(str(file_path))
        else:
            # Try glob pattern (e.g. when the shell did not expand it)
            matched = glob.glob(file_pattern)
            files.extend([f for f in matched if Path(f).is_file()])

    if not files:
        print("Error: No valid input files found")
        sys.exit(1)

    # Convert files
    batch_convert(
        files=files,
        output_file=args.output,
        auto_name=args.auto_name,
        model=args.model,
        custom_prompt=args.prompt,
        verbose=args.verbose
    )


if __name__ == '__main__':
    main()
|
||||
480
skills/ai-multimodal/scripts/gemini_batch_process.py
Normal file
480
skills/ai-multimodal/scripts/gemini_batch_process.py
Normal file
@@ -0,0 +1,480 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Batch process multiple media files using Gemini API.
|
||||
|
||||
Supports all Gemini modalities:
|
||||
- Audio: Transcription, analysis, summarization
|
||||
- Image: Captioning, detection, OCR, analysis
|
||||
- Video: Summarization, Q&A, scene detection
|
||||
- Document: PDF extraction, structured output
|
||||
- Generation: Image creation from text prompts
|
||||
"""
|
||||
|
||||
import argparse
|
||||
import json
|
||||
import os
|
||||
import sys
|
||||
import time
|
||||
from pathlib import Path
|
||||
from typing import List, Dict, Any, Optional
|
||||
import csv
|
||||
import shutil
|
||||
|
||||
try:
|
||||
from google import genai
|
||||
from google.genai import types
|
||||
except ImportError:
|
||||
print("Error: google-genai package not installed")
|
||||
print("Install with: pip install google-genai")
|
||||
sys.exit(1)
|
||||
|
||||
try:
|
||||
from dotenv import load_dotenv
|
||||
except ImportError:
|
||||
load_dotenv = None
|
||||
|
||||
|
||||
def find_api_key() -> Optional[str]:
    """Find the Gemini API key, checking the environment first, then .env files.

    Priority order (highest to lowest):
    1. process.env (runtime environment variables)
    2. .claude/skills/ai-multimodal/.env (skill-specific config)
    3. .claude/skills/.env (shared skills config)
    4. .claude/.env (Claude global config)

    Returns:
        The API key string, or None if it was not found anywhere.
    """
    # Priority 1: a key already present in the runtime environment always wins.
    api_key = os.getenv('GEMINI_API_KEY')
    if api_key:
        return api_key

    # Fall back to .env files only when python-dotenv is installed
    # (load_dotenv is None when the optional dependency is missing).
    if load_dotenv:
        skill_dir = Path(__file__).parent.parent  # .claude/skills/ai-multimodal
        # Priorities 2-4: walk outward from the skill directory to .claude.
        search_dirs = (
            skill_dir,                 # .claude/skills/ai-multimodal
            skill_dir.parent,          # .claude/skills
            skill_dir.parent.parent,   # .claude
        )
        for env_dir in search_dirs:
            env_file = env_dir / '.env'
            if env_file.exists():
                load_dotenv(env_file)
                api_key = os.getenv('GEMINI_API_KEY')
                if api_key:
                    return api_key

    return None
|
||||
|
||||
|
||||
def get_mime_type(file_path: str) -> str:
    """Map a file's extension to its MIME type.

    Covers the audio, image, video, and document formats accepted by the
    Gemini API; anything else maps to 'application/octet-stream'.
    """
    extension_to_mime = {
        # Audio
        '.mp3': 'audio/mp3',
        '.wav': 'audio/wav',
        '.aac': 'audio/aac',
        '.flac': 'audio/flac',
        '.ogg': 'audio/ogg',
        '.aiff': 'audio/aiff',
        # Image
        '.jpg': 'image/jpeg',
        '.jpeg': 'image/jpeg',
        '.png': 'image/png',
        '.webp': 'image/webp',
        '.heic': 'image/heic',
        '.heif': 'image/heif',
        # Video
        '.mp4': 'video/mp4',
        '.mpeg': 'video/mpeg',
        '.mov': 'video/quicktime',
        '.avi': 'video/x-msvideo',
        '.flv': 'video/x-flv',
        '.mpg': 'video/mpeg',
        '.webm': 'video/webm',
        '.wmv': 'video/x-ms-wmv',
        '.3gpp': 'video/3gpp',
        # Document
        '.pdf': 'application/pdf',
        '.txt': 'text/plain',
        '.html': 'text/html',
        '.md': 'text/markdown',
    }

    suffix = Path(file_path).suffix.lower()
    return extension_to_mime.get(suffix, 'application/octet-stream')
|
||||
|
||||
|
||||
def upload_file(client: genai.Client, file_path: str, verbose: bool = False) -> Any:
    """Upload a file via the Gemini File API, waiting for media processing.

    Video and audio uploads are polled every 2 seconds (up to 5 minutes)
    until the service leaves the PROCESSING state; other file types are
    returned immediately after upload.

    Raises:
        ValueError: if the service reports the upload as FAILED.
        TimeoutError: if processing does not finish within the wait budget.
    """
    if verbose:
        print(f"Uploading {file_path}...")

    myfile = client.files.upload(file=file_path)

    # Wait for processing (video/audio files need processing)
    mime_type = get_mime_type(file_path)
    if mime_type.startswith('video/') or mime_type.startswith('audio/'):
        max_wait = 300  # 5 minutes
        elapsed = 0
        while myfile.state.name == 'PROCESSING' and elapsed < max_wait:
            time.sleep(2)
            # Re-fetch to observe the latest server-side state.
            myfile = client.files.get(name=myfile.name)
            elapsed += 2
            if verbose and elapsed % 10 == 0:
                print(f" Processing... {elapsed}s")

        if myfile.state.name == 'FAILED':
            raise ValueError(f"File processing failed: {file_path}")

        # Still PROCESSING after the loop means we hit the time budget.
        if myfile.state.name == 'PROCESSING':
            raise TimeoutError(f"Processing timeout after {max_wait}s: {file_path}")

    if verbose:
        print(f" Uploaded: {myfile.name}")

    return myfile
|
||||
|
||||
|
||||
def process_file(
    client: genai.Client,
    file_path: Optional[str],
    prompt: str,
    model: str,
    task: str,
    format_output: str,
    aspect_ratio: Optional[str] = None,
    verbose: bool = False,
    max_retries: int = 3
) -> Dict[str, Any]:
    """Process a single file (or a pure-generation prompt) with retry logic.

    Retries with exponential backoff (2**attempt seconds) on any exception.

    Args:
        client: Initialized Gemini client.
        file_path: Input media path, or None for text-to-image generation.
        prompt: Instruction sent alongside the media (or alone for generation).
        model: Gemini model name.
        task: Task label; 'generate' enables image-output handling.
        format_output: 'json' requests a JSON response MIME type.
        aspect_ratio: Optional aspect ratio for generated images.
        verbose: Print upload/retry progress.
        max_retries: Number of attempts before returning an error record.

    Returns:
        Dict with 'file', 'status' ('success'/'error'), 'response', and
        optionally 'generated_image' (path) or 'error'.
    """

    for attempt in range(max_retries):
        try:
            # For generation tasks without input files
            if task == 'generate' and not file_path:
                content = [prompt]
            else:
                # Process input file
                file_path = Path(file_path)
                # Determine if we need File API
                # (inline data is limited to 20MB per request)
                file_size = file_path.stat().st_size
                use_file_api = file_size > 20 * 1024 * 1024  # >20MB

                if use_file_api:
                    # Upload to File API
                    myfile = upload_file(client, str(file_path), verbose)
                    content = [prompt, myfile]
                else:
                    # Inline data
                    with open(file_path, 'rb') as f:
                        file_bytes = f.read()

                    mime_type = get_mime_type(str(file_path))
                    content = [
                        prompt,
                        types.Part.from_bytes(data=file_bytes, mime_type=mime_type)
                    ]

            # Configure request
            config_args = {}
            if task == 'generate':
                config_args['response_modalities'] = ['Image']  # Capital I per API spec
                if aspect_ratio:
                    # Nest aspect_ratio in image_config per API spec
                    config_args['image_config'] = types.ImageConfig(
                        aspect_ratio=aspect_ratio
                    )

            if format_output == 'json':
                config_args['response_mime_type'] = 'application/json'

            config = types.GenerateContentConfig(**config_args) if config_args else None

            # Generate content
            response = client.models.generate_content(
                model=model,
                contents=content,
                config=config
            )

            # Extract response
            result = {
                'file': str(file_path) if file_path else 'generated',
                'status': 'success',
                'response': response.text if hasattr(response, 'text') else None
            }

            # Handle image output: persist each inline image part to disk.
            if task == 'generate' and hasattr(response, 'candidates'):
                for i, part in enumerate(response.candidates[0].content.parts):
                    if part.inline_data:
                        # Determine output directory - use project root docs/assets
                        if file_path:
                            # Save next to the input file, named after it.
                            output_dir = Path(file_path).parent
                            base_name = Path(file_path).stem
                        else:
                            # Find project root (look for .git or .claude directory)
                            script_dir = Path(__file__).parent
                            project_root = script_dir
                            for parent in [script_dir] + list(script_dir.parents):
                                if (parent / '.git').exists() or (parent / '.claude').exists():
                                    project_root = parent
                                    break

                            output_dir = project_root / 'docs' / 'assets'
                            output_dir.mkdir(parents=True, exist_ok=True)
                            base_name = "generated"

                        output_file = output_dir / f"{base_name}_generated_{i}.png"
                        with open(output_file, 'wb') as f:
                            f.write(part.inline_data.data)
                        # NOTE: with multiple image parts, only the last path
                        # is retained in the result.
                        result['generated_image'] = str(output_file)
                        if verbose:
                            print(f" Saved image to: {output_file}")

            return result

        except Exception as e:
            # Last attempt: return an error record instead of raising,
            # so batch processing can continue with the remaining files.
            if attempt == max_retries - 1:
                return {
                    'file': str(file_path) if file_path else 'generated',
                    'status': 'error',
                    'error': str(e)
                }

            # Exponential backoff: 1s, 2s, 4s, ...
            wait_time = 2 ** attempt
            if verbose:
                print(f" Retry {attempt + 1} after {wait_time}s: {e}")
            time.sleep(wait_time)
|
||||
|
||||
|
||||
def batch_process(
    files: List[str],
    prompt: str,
    model: str,
    task: str,
    format_output: str,
    aspect_ratio: Optional[str] = None,
    output_file: Optional[str] = None,
    verbose: bool = False,
    dry_run: bool = False
) -> List[Dict[str, Any]]:
    """Batch process multiple media files (or a single generation prompt).

    Args:
        files: Input media paths; may be empty for 'generate' tasks.
        prompt: Instruction applied to every file (or used alone to generate).
        model: Gemini model name.
        task: Task label; 'generate' with no files triggers one prompt-only run.
        format_output: 'json', 'csv', or markdown (anything else).
        aspect_ratio: Optional aspect ratio for generated images.
        output_file: If given, results are saved there via save_results().
        verbose: Print per-file progress.
        dry_run: Print the plan and return [] without making API calls.

    Returns:
        List of per-file result dicts from process_file() ([] on dry run).

    Exits the process (code 1) when no API key can be found.
    """
    # API key is required even for dry runs, so misconfiguration fails early.
    api_key = find_api_key()
    if not api_key:
        print("Error: GEMINI_API_KEY not found")
        print("Set via: export GEMINI_API_KEY='your-key'")
        print("Or create .env file with: GEMINI_API_KEY=your-key")
        sys.exit(1)

    if dry_run:
        print("DRY RUN MODE - No API calls will be made")
        print(f"Files to process: {len(files)}")
        print(f"Model: {model}")
        print(f"Task: {task}")
        print(f"Prompt: {prompt}")
        return []

    client = genai.Client(api_key=api_key)
    results = []

    # For generation tasks without input files, process once
    if task == 'generate' and not files:
        if verbose:
            print(f"\nGenerating image from prompt...")

        result = process_file(
            client=client,
            file_path=None,
            prompt=prompt,
            model=model,
            task=task,
            format_output=format_output,
            aspect_ratio=aspect_ratio,
            verbose=verbose
        )

        results.append(result)

        if verbose:
            status = result.get('status', 'unknown')
            print(f" Status: {status}")
    else:
        # Process input files
        for i, file_path in enumerate(files, 1):
            if verbose:
                print(f"\n[{i}/{len(files)}] Processing: {file_path}")

            result = process_file(
                client=client,
                file_path=file_path,
                prompt=prompt,
                model=model,
                task=task,
                format_output=format_output,
                aspect_ratio=aspect_ratio,
                verbose=verbose
            )

            results.append(result)

            if verbose:
                status = result.get('status', 'unknown')
                print(f" Status: {status}")

    # Save results
    if output_file:
        save_results(results, output_file, format_output)

    return results
|
||||
|
||||
|
||||
def save_results(results: List[Dict[str, Any]], output_file: str, format_output: str):
    """Persist batch-processing results to disk.

    Special case: when ``output_file`` has an image extension and there is
    exactly one result, the generated image is copied to that path; if
    generation failed, the report is redirected to a sibling ``.error.txt``
    so an image path never receives text.  Otherwise a JSON / CSV /
    Markdown report is written (``format_output`` selects which; any other
    value falls through to Markdown).

    Args:
        results: One dict per processed file (keys like 'file', 'status',
            'response', 'error', 'generated_image').
        output_file: Destination path; overwritten if it exists.
        format_output: 'json', 'csv', or anything else for Markdown.
    """
    output_path = Path(output_file)

    # Special handling for image generation - if output has image extension, copy the generated image
    image_extensions = {'.png', '.jpg', '.jpeg', '.webp', '.gif', '.bmp'}
    if output_path.suffix.lower() in image_extensions and len(results) == 1:
        generated_image = results[0].get('generated_image')
        if generated_image:
            # Copy the generated image to the specified output location
            shutil.copy2(generated_image, output_path)
            return
        else:
            # Don't write text reports to image files - save error as .txt instead
            output_path = output_path.with_suffix('.error.txt')
            print(f"Warning: Generation failed, saving error report to: {output_path}")

    if format_output == 'json':
        # Explicit UTF-8 avoids platform-default-codec failures on model
        # output; ensure_ascii=False keeps non-ASCII text readable.
        with open(output_path, 'w', encoding='utf-8') as f:
            json.dump(results, f, indent=2, ensure_ascii=False)
    elif format_output == 'csv':
        with open(output_path, 'w', newline='', encoding='utf-8') as f:
            fieldnames = ['file', 'status', 'response', 'error']
            writer = csv.DictWriter(f, fieldnames=fieldnames)
            writer.writeheader()
            for result in results:
                # Restrict each row to the declared columns; missing keys become ''.
                writer.writerow({key: result.get(key, '') for key in fieldnames})
    else:  # markdown
        with open(output_path, 'w', encoding='utf-8') as f:
            f.write("# Batch Processing Results\n\n")
            for i, result in enumerate(results, 1):
                f.write(f"## {i}. {result.get('file', 'Unknown')}\n\n")
                f.write(f"**Status**: {result.get('status', 'unknown')}\n\n")
                if result.get('response'):
                    f.write(f"**Response**:\n\n{result['response']}\n\n")
                if result.get('error'):
                    f.write(f"**Error**: {result['error']}\n\n")
|
||||
|
||||
|
||||
def main():
    """CLI entry point: parse arguments, validate them, run the batch, and print a summary."""
    parser = argparse.ArgumentParser(
        description='Batch process media files with Gemini API',
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Examples:
  # Transcribe multiple audio files
  %(prog)s --files *.mp3 --task transcribe --model gemini-2.5-flash

  # Analyze images
  %(prog)s --files *.jpg --task analyze --prompt "Describe this image" \\
    --model gemini-2.5-flash

  # Process PDFs to JSON
  %(prog)s --files *.pdf --task extract --prompt "Extract data as JSON" \\
    --format json --output results.json

  # Generate images
  %(prog)s --task generate --prompt "A mountain landscape" \\
    --model gemini-2.5-flash-image --aspect-ratio 16:9
"""
    )

    parser.add_argument('--files', nargs='*', help='Input files to process')
    parser.add_argument('--task', required=True,
                        choices=['transcribe', 'analyze', 'extract', 'generate'],
                        help='Task to perform')
    parser.add_argument('--prompt', help='Prompt for analysis/generation')
    parser.add_argument('--model', default='gemini-2.5-flash',
                        help='Gemini model to use (default: gemini-2.5-flash)')
    parser.add_argument('--format', dest='format_output', default='text',
                        choices=['text', 'json', 'csv', 'markdown'],
                        help='Output format (default: text)')
    parser.add_argument('--aspect-ratio', choices=['1:1', '16:9', '9:16', '4:3', '3:4'],
                        help='Aspect ratio for image generation')
    parser.add_argument('--output', help='Output file for results')
    parser.add_argument('--verbose', '-v', action='store_true',
                        help='Verbose output')
    parser.add_argument('--dry-run', action='store_true',
                        help='Show what would be done without making API calls')

    args = parser.parse_args()

    # Validate arguments
    # Generation works from a prompt alone; every other task reads input files.
    if args.task != 'generate' and not args.files:
        parser.error("--files required for non-generation tasks")

    if args.task == 'generate' and not args.prompt:
        parser.error("--prompt required for generation task")

    if args.task != 'generate' and not args.prompt:
        # Set default prompts
        if args.task == 'transcribe':
            args.prompt = 'Generate a transcript with timestamps'
        elif args.task == 'analyze':
            args.prompt = 'Analyze this content'
        elif args.task == 'extract':
            args.prompt = 'Extract key information'

    # Process files
    # batch_process handles the dry-run short-circuit and optional result saving.
    files = args.files or []
    results = batch_process(
        files=files,
        prompt=args.prompt,
        model=args.model,
        task=args.task,
        format_output=args.format_output,
        aspect_ratio=args.aspect_ratio,
        output_file=args.output,
        verbose=args.verbose,
        dry_run=args.dry_run
    )

    # Print summary
    # Dry runs return [] so the summary is skipped for them.
    if not args.dry_run and results:
        success = sum(1 for r in results if r.get('status') == 'success')
        failed = len(results) - success
        print(f"\n{'='*50}")
        print(f"Processed: {len(results)} files")
        print(f"Success: {success}")
        print(f"Failed: {failed}")
        if args.output:
            print(f"Results saved to: {args.output}")


if __name__ == '__main__':
    main()
|
||||
506
skills/ai-multimodal/scripts/media_optimizer.py
Normal file
506
skills/ai-multimodal/scripts/media_optimizer.py
Normal file
@@ -0,0 +1,506 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Optimize media files for Gemini API processing.
|
||||
|
||||
Features:
|
||||
- Compress videos/audio for size limits
|
||||
- Resize images appropriately
|
||||
- Split long videos into chunks
|
||||
- Format conversion
|
||||
- Quality vs size optimization
|
||||
- Validation before upload
|
||||
"""
|
||||
|
||||
import argparse
|
||||
import json
|
||||
import os
|
||||
import subprocess
|
||||
import sys
|
||||
from pathlib import Path
|
||||
from typing import Optional, Dict, Any, List
|
||||
|
||||
try:
|
||||
from dotenv import load_dotenv
|
||||
except ImportError:
|
||||
load_dotenv = None
|
||||
|
||||
|
||||
def load_env_files():
    """Load .env files in correct priority order.

    Priority order (highest to lowest):
    1. process.env (runtime environment variables)
    2. .claude/skills/ai-multimodal/.env (skill-specific config)
    3. .claude/skills/.env (shared skills config)
    4. .claude/.env (Claude global config)
    """
    if not load_dotenv:
        return

    # Walk up from this script: skill dir, skills dir, then .claude dir.
    skill_root = Path(__file__).parent.parent
    search_dirs = (skill_root, skill_root.parent, skill_root.parent.parent)

    # Most specific first; python-dotenv does not override values that are
    # already set, so earlier (more specific) files win.
    for directory in search_dirs:
        candidate = directory / '.env'
        if candidate.exists():
            load_dotenv(candidate)


# Load environment variables at module level
load_env_files()
|
||||
|
||||
|
||||
def check_ffmpeg() -> bool:
    """Return True if the ffmpeg binary is runnable on PATH."""
    try:
        subprocess.run(['ffmpeg', '-version'],
                       stdout=subprocess.DEVNULL,
                       stderr=subprocess.DEVNULL,
                       check=True)
        return True
    except (subprocess.CalledProcessError, OSError):
        # OSError covers FileNotFoundError (binary missing) and
        # PermissionError.  The old tuple ended in a blanket `Exception`,
        # which made the specific entries pointless and hid real bugs.
        return False
|
||||
|
||||
|
||||
def get_media_info(file_path: str) -> Dict[str, Any]:
    """Probe a media file with ffprobe and return summary information.

    Returns a dict with 'size', 'duration' and 'bit_rate', plus
    'width'/'height'/'fps' for video streams and 'sample_rate'/'channels'
    for audio streams.  Returns {} when ffmpeg/ffprobe is unavailable or
    probing fails for any reason.
    """
    if not check_ffmpeg():
        return {}

    try:
        cmd = [
            'ffprobe',
            '-v', 'quiet',
            '-print_format', 'json',
            '-show_format',
            '-show_streams',
            file_path
        ]

        result = subprocess.run(cmd, capture_output=True, text=True, check=True)
        data = json.loads(result.stdout)

        info = {
            'size': int(data['format'].get('size', 0)),
            'duration': float(data['format'].get('duration', 0)),
            'bit_rate': int(data['format'].get('bit_rate', 0)),
        }

        # Get video/audio specific info
        for stream in data.get('streams', []):
            if stream['codec_type'] == 'video':
                info['width'] = stream.get('width', 0)
                info['height'] = stream.get('height', 0)
                # Security/robustness fix: the old code ran eval() on the
                # ffprobe output string; parse the "num/den" rational safely.
                info['fps'] = _parse_frame_rate(stream.get('r_frame_rate', '0/1'))
            elif stream['codec_type'] == 'audio':
                info['sample_rate'] = int(stream.get('sample_rate', 0))
                info['channels'] = stream.get('channels', 0)

        return info

    except (subprocess.CalledProcessError, json.JSONDecodeError,
            KeyError, ValueError, TypeError, OSError):
        # Narrowed from a blanket `Exception`: these cover ffprobe failure,
        # malformed JSON, missing keys and bad numeric fields.
        return {}


def _parse_frame_rate(rate: str) -> float:
    """Convert an ffprobe rational like '30000/1001' to fps as a float.

    Returns 0.0 for malformed input or a zero denominator.
    """
    num_str, sep, den_str = rate.partition('/')
    try:
        num = float(num_str)
        den = float(den_str) if sep else 1.0
    except ValueError:
        return 0.0
    return num / den if den else 0.0
|
||||
|
||||
|
||||
def optimize_video(
    input_path: str,
    output_path: str,
    target_size_mb: Optional[int] = None,
    max_duration: Optional[int] = None,
    quality: int = 23,
    resolution: Optional[str] = None,
    verbose: bool = False
) -> bool:
    """Re-encode a video with x264/AAC so it fits Gemini API limits.

    Args:
        input_path: Source video path.
        output_path: Destination path (overwritten via ffmpeg -y).
        target_size_mb: Approximate size cap in MB; when set, a video
            bitrate is derived from the (possibly truncated) duration.
        max_duration: Trim the output to this many seconds when the
            source runs longer.
        quality: x264 CRF value (lower = better quality / larger file).
        resolution: Explicit ffmpeg scale target (e.g. '1280x720'); when
            omitted, sources wider than 1920px are scaled down to 1920.
        verbose: Print input/output stats and the ffmpeg command line.

    Returns:
        True when ffmpeg succeeded, False otherwise.
    """
    if not check_ffmpeg():
        print("Error: ffmpeg not installed")
        print("Install: apt-get install ffmpeg (Linux) or brew install ffmpeg (Mac)")
        return False

    info = get_media_info(input_path)
    if not info:
        print(f"Error: Could not read media info from {input_path}")
        return False

    if verbose:
        print(f"Input: {Path(input_path).name}")
        print(f"  Size: {info['size'] / (1024*1024):.2f} MB")
        print(f"  Duration: {info['duration']:.2f}s")
        if 'width' in info:
            print(f"  Resolution: {info['width']}x{info['height']}")
        print(f"  Bit rate: {info['bit_rate'] / 1000:.0f} kbps")

    # Build ffmpeg command
    cmd = ['ffmpeg', '-i', input_path, '-y']

    # Video codec
    cmd.extend(['-c:v', 'libx264', '-crf', str(quality)])

    # Resolution
    if resolution:
        cmd.extend(['-vf', f'scale={resolution}'])
    elif 'width' in info and info['width'] > 1920:
        cmd.extend(['-vf', 'scale=1920:-2'])  # Max 1080p

    # Audio codec
    cmd.extend(['-c:a', 'aac', '-b:a', '128k', '-ac', '2'])

    # Duration limit
    if max_duration and info['duration'] > max_duration:
        cmd.extend(['-t', str(max_duration)])

    # Target size (rough estimate using bitrate)
    # NOTE(review): when target_size_mb is set, both -crf and -b:v end up on
    # the command line; x264 then effectively runs in bitrate (ABR) mode and
    # the CRF is ignored — confirm this is intended.
    if target_size_mb:
        # assumes info['duration'] > 0 — a zero duration would raise
        # ZeroDivisionError here; TODO confirm ffprobe always reports one.
        target_bits = target_size_mb * 8 * 1024 * 1024
        duration = min(info['duration'], max_duration) if max_duration else info['duration']
        target_bitrate = int(target_bits / duration)
        # Reserve some for audio (128kbps)
        video_bitrate = max(target_bitrate - 128000, 500000)
        cmd.extend(['-b:v', str(video_bitrate)])

    cmd.append(output_path)

    if verbose:
        print(f"\nOptimizing...")
        print(f"  Command: {' '.join(cmd)}")

    try:
        subprocess.run(cmd, check=True, capture_output=not verbose)

        # Check output
        output_info = get_media_info(output_path)
        if output_info and verbose:
            print(f"\nOutput: {Path(output_path).name}")
            print(f"  Size: {output_info['size'] / (1024*1024):.2f} MB")
            print(f"  Duration: {output_info['duration']:.2f}s")
            if 'width' in output_info:
                print(f"  Resolution: {output_info['width']}x{output_info['height']}")
            compression = (1 - output_info['size'] / info['size']) * 100
            print(f"  Compression: {compression:.1f}%")

        return True

    except subprocess.CalledProcessError as e:
        print(f"Error optimizing video: {e}")
        return False
|
||||
|
||||
|
||||
def optimize_audio(
    input_path: str,
    output_path: str,
    target_size_mb: Optional[int] = None,
    bitrate: str = '64k',
    sample_rate: int = 16000,
    verbose: bool = False
) -> bool:
    """Re-encode audio as mono AAC for Gemini API upload.

    Args:
        input_path: Source audio path.
        output_path: Destination path (overwritten via ffmpeg -y).
        target_size_mb: Approximate size cap in MB; when set, a bitrate is
            derived from the source duration and overrides ``bitrate``.
        bitrate: AAC bitrate passed to ffmpeg (e.g. '64k').
        sample_rate: Output sample rate in Hz.
        verbose: Print input/output stats.

    Returns:
        True when ffmpeg succeeded, False otherwise.
    """
    if not check_ffmpeg():
        print("Error: ffmpeg not installed")
        return False

    info = get_media_info(input_path)
    if not info:
        print(f"Error: Could not read media info from {input_path}")
        return False

    if verbose:
        print(f"Input: {Path(input_path).name}")
        print(f"  Size: {info['size'] / (1024*1024):.2f} MB")
        print(f"  Duration: {info['duration']:.2f}s")

    # Bug fix: target_size_mb was previously accepted but silently ignored.
    # Derive a bitrate from the requested size when it is given.
    if target_size_mb and info['duration'] > 0:
        target_bits = target_size_mb * 8 * 1024 * 1024
        # Floor at 32 kbps so speech remains intelligible.
        bitrate = str(max(int(target_bits / info['duration']), 32000))

    # Build command
    cmd = [
        'ffmpeg', '-i', input_path, '-y',
        '-c:a', 'aac',
        '-b:a', bitrate,
        '-ar', str(sample_rate),
        '-ac', '1',  # Mono (Gemini uses mono anyway)
        output_path
    ]

    if verbose:
        print(f"\nOptimizing...")

    try:
        subprocess.run(cmd, check=True, capture_output=not verbose)

        output_info = get_media_info(output_path)
        if output_info and verbose:
            print(f"\nOutput: {Path(output_path).name}")
            print(f"  Size: {output_info['size'] / (1024*1024):.2f} MB")
            compression = (1 - output_info['size'] / info['size']) * 100
            print(f"  Compression: {compression:.1f}%")

        return True

    except subprocess.CalledProcessError as e:
        print(f"Error optimizing audio: {e}")
        return False
|
||||
|
||||
|
||||
def optimize_image(
    input_path: str,
    output_path: str,
    max_width: int = 1920,
    quality: int = 85,
    verbose: bool = False
) -> bool:
    """Downscale and re-encode an image for Gemini API upload.

    Images wider than ``max_width`` are resized (aspect ratio preserved)
    and saved at the given quality.  Alpha/palette images are flattened
    or converted when the target is JPEG.

    Args:
        input_path: Source image path.
        output_path: Destination path; its extension selects the format.
        max_width: Maximum output width in pixels.
        quality: Encoder quality (1-100; meaningful for JPEG/WebP).
        verbose: Print input/output stats.

    Returns:
        True on success, False otherwise (including when Pillow is missing).
    """
    try:
        from PIL import Image
    except ImportError:
        print("Error: Pillow not installed")
        print("Install with: pip install pillow")
        return False

    try:
        # Context manager ensures the source file handle is released
        # even when resizing or saving raises.
        with Image.open(input_path) as src:
            img = src

            if verbose:
                print(f"Input: {Path(input_path).name}")
                print(f"  Size: {Path(input_path).stat().st_size / 1024:.2f} KB")
                print(f"  Resolution: {img.width}x{img.height}")

            # Resize if needed
            if img.width > max_width:
                ratio = max_width / img.width
                new_height = int(img.height * ratio)
                img = img.resize((max_width, new_height), Image.Resampling.LANCZOS)
                if verbose:
                    print(f"  Resized to: {img.width}x{img.height}")

            # JPEG cannot store alpha or palette data; flatten/convert first.
            # (The old code only handled RGBA; 'P' or 'LA' inputs crashed.)
            if output_path.lower().endswith(('.jpg', '.jpeg')):
                if img.mode in ('RGBA', 'LA'):
                    rgb_img = Image.new('RGB', img.size, (255, 255, 255))
                    rgb_img.paste(img, mask=img.getchannel('A'))
                    img = rgb_img
                elif img.mode not in ('RGB', 'L'):
                    img = img.convert('RGB')

            # Save
            img.save(output_path, quality=quality, optimize=True)

        if verbose:
            print(f"\nOutput: {Path(output_path).name}")
            print(f"  Size: {Path(output_path).stat().st_size / 1024:.2f} KB")
            compression = (1 - Path(output_path).stat().st_size / Path(input_path).stat().st_size) * 100
            print(f"  Compression: {compression:.1f}%")

        return True

    except Exception as e:
        print(f"Error optimizing image: {e}")
        return False
|
||||
|
||||
|
||||
def split_video(
    input_path: str,
    output_dir: str,
    chunk_duration: int = 3600,
    verbose: bool = False
) -> List[str]:
    """Split a long video into fixed-length chunks via stream copy.

    Args:
        input_path: Source video path.
        output_dir: Directory for the chunk files (created if missing).
        chunk_duration: Maximum chunk length in seconds.
        verbose: Print progress messages.

    Returns:
        Paths of the created chunks; [input_path] when the video already
        fits in one chunk; [] when ffmpeg is missing or probing fails.
    """
    if not check_ffmpeg():
        print("Error: ffmpeg not installed")
        return []

    info = get_media_info(input_path)
    if not info:
        return []

    total_duration = info['duration']
    # Bug fix: the old `int(total / chunk) + 1` produced an extra, empty
    # chunk whenever the duration was an exact multiple of chunk_duration.
    # Compute a true ceiling instead (minimum of one chunk).
    full_chunks, remainder = divmod(total_duration, chunk_duration)
    num_chunks = max(int(full_chunks) + (1 if remainder > 0 else 0), 1)

    if num_chunks == 1:
        if verbose:
            print("Video is short enough, no splitting needed")
        return [input_path]

    Path(output_dir).mkdir(parents=True, exist_ok=True)
    output_files = []

    for i in range(num_chunks):
        start_time = i * chunk_duration
        output_file = Path(output_dir) / f"{Path(input_path).stem}_chunk_{i+1}.mp4"

        # '-c copy' avoids re-encoding; cuts land on the nearest keyframes.
        cmd = [
            'ffmpeg', '-i', input_path, '-y',
            '-ss', str(start_time),
            '-t', str(chunk_duration),
            '-c', 'copy',
            str(output_file)
        ]

        if verbose:
            print(f"Creating chunk {i+1}/{num_chunks}...")

        try:
            subprocess.run(cmd, check=True, capture_output=not verbose)
            output_files.append(str(output_file))
        except subprocess.CalledProcessError as e:
            # Keep going: a failed chunk should not abort the remaining ones.
            print(f"Error creating chunk {i+1}: {e}")

    return output_files
|
||||
|
||||
|
||||
def main():
    """CLI entry point: optimize one file, split a video, or batch-optimize a directory.

    Exits with a non-zero status on validation failure or when a
    single-file optimization fails.
    """
    parser = argparse.ArgumentParser(
        description='Optimize media files for Gemini API',
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Examples:
  # Optimize video to 100MB
  %(prog)s --input video.mp4 --output optimized.mp4 --target-size 100

  # Optimize audio
  %(prog)s --input audio.mp3 --output optimized.m4a --bitrate 64k

  # Resize image
  %(prog)s --input image.jpg --output resized.jpg --max-width 1920

  # Split long video
  %(prog)s --input long-video.mp4 --split --chunk-duration 3600 --output-dir ./chunks

  # Batch optimize directory
  %(prog)s --input-dir ./videos --output-dir ./optimized --quality 85
"""
    )

    parser.add_argument('--input', help='Input file')
    parser.add_argument('--output', help='Output file')
    parser.add_argument('--input-dir', help='Input directory for batch processing')
    parser.add_argument('--output-dir', help='Output directory for batch processing')
    parser.add_argument('--target-size', type=int, help='Target size in MB')
    parser.add_argument('--quality', type=int, default=None,
                        help='Quality (video: 0-51 CRF, default 23; image: 1-100, default 85)')
    parser.add_argument('--max-width', type=int, default=1920,
                        help='Max image width (default: 1920)')
    parser.add_argument('--bitrate', default='64k',
                        help='Audio bitrate (default: 64k)')
    parser.add_argument('--resolution', help='Video resolution (e.g., 1920x1080)')
    parser.add_argument('--split', action='store_true', help='Split long video into chunks')
    parser.add_argument('--chunk-duration', type=int, default=3600,
                        help='Chunk duration in seconds (default: 3600 = 1 hour)')
    parser.add_argument('--verbose', '-v', action='store_true', help='Verbose output')

    args = parser.parse_args()

    # Validate arguments
    if not args.input and not args.input_dir:
        parser.error("Either --input or --input-dir required")

    # Bug fix: the old single default of 85 was handed to x264 as a CRF
    # value, which only accepts 0-51, so default-quality video runs always
    # failed.  Resolve a per-media-type default instead; an explicit
    # --quality is still honoured verbatim for both types.
    video_quality = args.quality if args.quality is not None else 23
    image_quality = args.quality if args.quality is not None else 85

    # One source of truth for the supported extensions (the old batch glob
    # list was missing .flv and .aac even though the dispatch accepted them).
    video_exts = ['.mp4', '.mov', '.avi', '.mkv', '.webm', '.flv']
    audio_exts = ['.mp3', '.wav', '.m4a', '.flac', '.aac']
    image_exts = ['.jpg', '.jpeg', '.png', '.webp']

    # Single file processing
    if args.input:
        input_path = Path(args.input)
        if not input_path.exists():
            print(f"Error: Input file not found: {input_path}")
            sys.exit(1)

        if args.split:
            output_dir = args.output_dir or './chunks'
            chunks = split_video(str(input_path), output_dir, args.chunk_duration, args.verbose)
            print(f"\nCreated {len(chunks)} chunks in {output_dir}")
            sys.exit(0)

        if not args.output:
            parser.error("--output required for single file processing")

        output_path = Path(args.output)
        output_path.parent.mkdir(parents=True, exist_ok=True)

        # Determine file type
        ext = input_path.suffix.lower()

        if ext in video_exts:
            success = optimize_video(
                str(input_path),
                str(output_path),
                target_size_mb=args.target_size,
                quality=video_quality,
                resolution=args.resolution,
                verbose=args.verbose
            )
        elif ext in audio_exts:
            success = optimize_audio(
                str(input_path),
                str(output_path),
                target_size_mb=args.target_size,
                bitrate=args.bitrate,
                verbose=args.verbose
            )
        elif ext in image_exts:
            success = optimize_image(
                str(input_path),
                str(output_path),
                max_width=args.max_width,
                quality=image_quality,
                verbose=args.verbose
            )
        else:
            print(f"Error: Unsupported file type: {ext}")
            sys.exit(1)

        sys.exit(0 if success else 1)

    # Batch processing
    if args.input_dir:
        if not args.output_dir:
            parser.error("--output-dir required for batch processing")

        input_dir = Path(args.input_dir)
        output_dir = Path(args.output_dir)
        output_dir.mkdir(parents=True, exist_ok=True)

        # Find all media files
        files = []
        for extension in video_exts + audio_exts + image_exts:
            files.extend(input_dir.glob('*' + extension))

        if not files:
            print(f"No media files found in {input_dir}")
            sys.exit(1)

        print(f"Found {len(files)} files to process")

        success_count = 0
        for input_file in files:
            ext = input_file.suffix.lower()
            success = False

            if ext in video_exts:
                output_file = output_dir / input_file.name
                success = optimize_video(str(input_file), str(output_file),
                                         quality=video_quality, verbose=args.verbose)
            elif ext in audio_exts:
                # Bug fix: audio is re-encoded to AAC, which cannot be muxed
                # into .mp3/.wav/.flac containers — keeping the original
                # extension made ffmpeg fail for those inputs.  Write .m4a.
                output_file = output_dir / (input_file.stem + '.m4a')
                success = optimize_audio(str(input_file), str(output_file),
                                         bitrate=args.bitrate, verbose=args.verbose)
            elif ext in image_exts:
                output_file = output_dir / input_file.name
                success = optimize_image(str(input_file), str(output_file),
                                         max_width=args.max_width, quality=image_quality,
                                         verbose=args.verbose)

            if success:
                success_count += 1

        print(f"\nProcessed: {success_count}/{len(files)} files")


if __name__ == '__main__':
    main()
|
||||
26
skills/ai-multimodal/scripts/requirements.txt
Normal file
26
skills/ai-multimodal/scripts/requirements.txt
Normal file
@@ -0,0 +1,26 @@
|
||||
# AI Multimodal Skill Dependencies
|
||||
# Python 3.10+ required
|
||||
|
||||
# Google Gemini API
|
||||
google-genai>=0.1.0
|
||||
|
||||
# PDF processing
|
||||
pypdf>=4.0.0
|
||||
|
||||
# Document conversion
|
||||
python-docx>=1.0.0
|
||||
docx2pdf>=0.1.8 # Optional: only functional on Windows; safe to omit on Linux/macOS
|
||||
|
||||
# Markdown processing
|
||||
markdown>=3.5.0
|
||||
|
||||
# Image processing
|
||||
Pillow>=10.0.0
|
||||
|
||||
# Environment variable management
|
||||
python-dotenv>=1.0.0
|
||||
|
||||
# Testing dependencies (dev)
|
||||
pytest>=8.0.0
|
||||
pytest-cov>=4.1.0
|
||||
pytest-mock>=3.12.0
|
||||
20
skills/ai-multimodal/scripts/tests/requirements.txt
Normal file
20
skills/ai-multimodal/scripts/tests/requirements.txt
Normal file
@@ -0,0 +1,20 @@
|
||||
# Core dependencies
|
||||
google-genai>=0.2.0
|
||||
python-dotenv>=1.0.0
|
||||
|
||||
# Image processing
|
||||
pillow>=10.0.0
|
||||
|
||||
# PDF processing
|
||||
pypdf>=3.0.0
|
||||
|
||||
# Document conversion
|
||||
markdown>=3.5
|
||||
|
||||
# Testing
|
||||
pytest>=7.4.0
|
||||
pytest-cov>=4.1.0
|
||||
pytest-mock>=3.12.0
|
||||
|
||||
# Optional dependencies for full functionality
|
||||
# ffmpeg-python>=0.2.0 # For media optimization (requires ffmpeg installed)
|
||||
299
skills/ai-multimodal/scripts/tests/test_document_converter.py
Normal file
299
skills/ai-multimodal/scripts/tests/test_document_converter.py
Normal file
@@ -0,0 +1,299 @@
|
||||
"""
|
||||
Tests for document_converter.py
|
||||
"""
|
||||
|
||||
import pytest
|
||||
import sys
|
||||
from pathlib import Path
|
||||
from unittest.mock import Mock, patch, MagicMock
|
||||
|
||||
sys.path.insert(0, str(Path(__file__).parent.parent))
|
||||
|
||||
import document_converter as dc
|
||||
|
||||
|
||||
class TestEnvLoading:
    """Test environment variable loading."""

    @patch('document_converter.load_dotenv')
    @patch('pathlib.Path.exists')
    def test_load_env_files_success(self, mock_exists, mock_load_dotenv):
        """Test successful .env file loading."""
        # Every candidate .env path reports present, so load_dotenv runs.
        mock_exists.return_value = True
        dc.load_env_files()
        # Should be called for skill, skills, and claude dirs
        assert mock_load_dotenv.call_count >= 1

    @patch('document_converter.load_dotenv', None)
    def test_load_env_files_no_dotenv(self):
        """Test when dotenv is not available."""
        # Should not raise an error
        # load_env_files guards on a falsy load_dotenv and returns early.
        dc.load_env_files()
|
||||
|
||||
|
||||
class TestDependencyCheck:
    """Test dependency checking."""

    @patch('builtins.__import__')
    def test_check_all_dependencies_available(self, mock_import):
        """Test when all dependencies are available."""
        # Every import succeeds, so each dependency should be reported.
        mock_import.return_value = Mock()

        deps = dc.check_dependencies()

        assert 'pypdf' in deps
        assert 'markdown' in deps
        assert 'pillow' in deps

    @patch('builtins.__import__')
    def test_check_dependencies_missing(self, mock_import):
        """Test when dependencies are missing."""
        def import_side_effect(name, *args, **kwargs):
            if name == 'pypdf':
                raise ImportError()
            return Mock()

        mock_import.side_effect = import_side_effect

        # The function uses try/except, so we test the actual function
        # NOTE(review): this test currently asserts nothing — the body is a
        # placeholder `pass`; it only documents the intended scenario.
        with patch('document_converter.sys.modules', {}):
            # This is tricky to test due to import handling
            pass
|
||||
|
||||
|
||||
class TestPDFPageExtraction:
    """Test PDF page extraction."""

    # Decorators apply bottom-up: the last decorator (open) maps to the
    # first mock parameter in each test method.

    @patch('pypdf.PdfReader')
    @patch('pypdf.PdfWriter')
    @patch('builtins.open', create=True)
    def test_extract_single_page(self, mock_open, mock_writer_class, mock_reader_class):
        """Test extracting a single page."""
        # Mock reader
        mock_reader = Mock()
        mock_page = Mock()
        # Page '2' (1-based) is the middle element of this three-page doc.
        mock_reader.pages = [Mock(), mock_page, Mock()]
        mock_reader_class.return_value = mock_reader

        # Mock writer
        mock_writer = Mock()
        mock_writer.pages = [mock_page]
        mock_writer_class.return_value = mock_writer

        result = dc.extract_pdf_pages(
            'input.pdf',
            'output.pdf',
            page_range='2',
            verbose=False
        )

        assert result is True
        mock_writer.add_page.assert_called_once_with(mock_page)

    @patch('pypdf.PdfReader')
    @patch('pypdf.PdfWriter')
    @patch('builtins.open', create=True)
    def test_extract_page_range(self, mock_open, mock_writer_class, mock_reader_class):
        """Test extracting a range of pages."""
        mock_reader = Mock()
        mock_reader.pages = [Mock() for _ in range(10)]
        mock_reader_class.return_value = mock_reader

        mock_writer = Mock()
        mock_writer.pages = []
        mock_writer_class.return_value = mock_writer

        result = dc.extract_pdf_pages(
            'input.pdf',
            'output.pdf',
            page_range='2-5',
            verbose=False
        )

        assert result is True
        assert mock_writer.add_page.call_count == 4  # Pages 2-5 (4 pages)

    def test_extract_pages_no_pypdf(self):
        """Test page extraction without pypdf."""
        # A None entry in sys.modules makes `import pypdf` raise ImportError.
        with patch.dict('sys.modules', {'pypdf': None}):
            result = dc.extract_pdf_pages('input.pdf', 'output.pdf', '1-10')
            assert result is False
|
||||
|
||||
|
||||
class TestPDFOptimization:
    """Test PDF optimization."""

    @patch('pypdf.PdfReader')
    @patch('pypdf.PdfWriter')
    @patch('builtins.open', create=True)
    @patch('pathlib.Path.stat')
    def test_optimize_pdf_success(self, mock_stat, mock_open, mock_writer_class, mock_reader_class):
        """Test successful PDF optimization."""
        # Mock reader
        mock_reader = Mock()
        mock_page = Mock()
        mock_reader.pages = [mock_page, mock_page]
        mock_reader_class.return_value = mock_reader

        # Mock writer
        mock_writer = Mock()
        mock_writer.pages = [mock_page, mock_page]
        mock_writer_class.return_value = mock_writer

        # Mock file sizes
        # Identical before/after sizes (1 MiB) keep any ratio math trivial.
        mock_stat.return_value.st_size = 1024 * 1024

        result = dc.optimize_pdf('input.pdf', 'output.pdf', verbose=False)

        assert result is True
        mock_page.compress_content_streams.assert_called()

    def test_optimize_pdf_no_pypdf(self):
        """Test PDF optimization without pypdf."""
        with patch.dict('sys.modules', {'pypdf': None}):
            result = dc.optimize_pdf('input.pdf', 'output.pdf')
            assert result is False
|
||||
|
||||
|
||||
class TestImageExtraction:
    """Test image extraction from PDFs."""

    @patch('pypdf.PdfReader')
    @patch('PIL.Image')
    @patch('pathlib.Path.mkdir')
    @patch('builtins.open', create=True)
    def test_extract_images_success(self, mock_open, mock_mkdir, mock_image, mock_reader_class):
        """Test successful image extraction."""
        # Mock PDF reader
        mock_reader = Mock()
        mock_page = MagicMock()

        # Mock XObject with image
        # One DCTDecode (JPEG) XObject so the extractor sees a single image.
        mock_obj = MagicMock()
        mock_obj.__getitem__.side_effect = lambda k: {
            '/Subtype': '/Image',
            '/Width': 100,
            '/Height': 100,
            '/Filter': '/DCTDecode'
        }[k]
        mock_obj.get_data.return_value = b'image_data'

        mock_xobjects = MagicMock()
        mock_xobjects.__iter__.return_value = ['img1']
        mock_xobjects.__getitem__.return_value = mock_obj

        # Page resources resolve to the XObject dictionary above.
        mock_resources = MagicMock()
        mock_resources.get_object.return_value = mock_xobjects
        mock_page.__getitem__.side_effect = lambda k: {
            '/Resources': {'/XObject': mock_resources}
        }[k]

        mock_reader.pages = [mock_page]
        mock_reader_class.return_value = mock_reader

        result = dc.extract_images_from_pdf('input.pdf', './output', verbose=False)

        assert len(result) > 0

    def test_extract_images_no_dependencies(self):
        """Test image extraction without required dependencies."""
        with patch.dict('sys.modules', {'pypdf': None}):
            result = dc.extract_images_from_pdf('input.pdf', './output')
            assert result == []
|
||||
|
||||
|
||||
class TestMarkdownConversion:
    """Test Markdown to PDF conversion."""

    @patch('markdown.markdown')
    @patch('builtins.open', create=True)
    @patch('subprocess.run')
    @patch('pathlib.Path.unlink')
    def test_convert_markdown_success(self, mock_unlink, mock_run, mock_open, mock_markdown):
        """Test successful Markdown to PDF conversion."""
        mock_markdown.return_value = '<h1>Test</h1>'

        # Mock file reading and writing
        mock_file = MagicMock()
        mock_file.__enter__.return_value.read.return_value = '# Test'
        mock_open.return_value = mock_file

        result = dc.convert_markdown_to_pdf('input.md', 'output.pdf', verbose=False)

        assert result is True
        # One subprocess call: presumably the wkhtmltopdf invocation.
        mock_run.assert_called_once()

    @patch('markdown.markdown')
    @patch('builtins.open', create=True)
    @patch('subprocess.run')
    def test_convert_markdown_no_wkhtmltopdf(self, mock_run, mock_open, mock_markdown):
        """Test Markdown conversion without wkhtmltopdf."""
        mock_markdown.return_value = '<h1>Test</h1>'

        mock_file = MagicMock()
        mock_file.__enter__.return_value.read.return_value = '# Test'
        mock_open.return_value = mock_file

        # A missing binary surfaces as FileNotFoundError from subprocess.run.
        mock_run.side_effect = FileNotFoundError()

        result = dc.convert_markdown_to_pdf('input.md', 'output.pdf', verbose=False)

        assert result is False

    def test_convert_markdown_no_markdown_lib(self):
        """Test Markdown conversion without markdown library."""
        with patch.dict('sys.modules', {'markdown': None}):
            result = dc.convert_markdown_to_pdf('input.md', 'output.pdf')
            assert result is False
|
||||
|
||||
|
||||
class TestHTMLConversion:
    """Tests for HTML-to-PDF conversion."""

    @patch('subprocess.run')
    def test_convert_html_success(self, mock_run):
        """Conversion succeeds when wkhtmltopdf invocation runs cleanly."""
        ok = dc.convert_html_to_pdf('input.html', 'output.pdf', verbose=False)

        assert ok is True
        mock_run.assert_called_once()

    @patch('subprocess.run')
    def test_convert_html_no_wkhtmltopdf(self, mock_run):
        """Conversion reports failure when the wkhtmltopdf binary is missing."""
        mock_run.side_effect = FileNotFoundError()

        ok = dc.convert_html_to_pdf('input.html', 'output.pdf', verbose=False)

        assert ok is False
|
||||
|
||||
|
||||
class TestIntegration:
    """Integration-style checks across helpers."""

    @patch('pathlib.Path.exists')
    def test_file_not_found(self, mock_exists):
        """A missing input path reports as non-existent."""
        mock_exists.return_value = False

        # main() would reject such a path; here we verify the predicate itself.
        assert not Path('nonexistent.pdf').exists()

    @patch('document_converter.check_dependencies')
    def test_check_dependencies_integration(self, mock_check):
        """check_dependencies reports the status of every optional library."""
        mock_check.return_value = {
            'pypdf': True,
            'markdown': True,
            'pillow': True,
        }

        deps = dc.check_dependencies()

        for lib in ('pypdf', 'markdown', 'pillow'):
            assert deps[lib] is True
|
||||
|
||||
|
||||
if __name__ == '__main__':
    # Allow running this test module directly, with coverage reporting.
    args = [__file__, '-v', '--cov=document_converter', '--cov-report=term-missing']
    pytest.main(args)
|
||||
362
skills/ai-multimodal/scripts/tests/test_gemini_batch_process.py
Normal file
362
skills/ai-multimodal/scripts/tests/test_gemini_batch_process.py
Normal file
@@ -0,0 +1,362 @@
|
||||
"""
|
||||
Tests for gemini_batch_process.py
|
||||
"""
|
||||
|
||||
import pytest
|
||||
import sys
|
||||
from pathlib import Path
|
||||
from unittest.mock import Mock, patch, MagicMock
|
||||
|
||||
# Add parent directory to path
|
||||
sys.path.insert(0, str(Path(__file__).parent.parent))
|
||||
|
||||
import gemini_batch_process as gbp
|
||||
|
||||
|
||||
class TestAPIKeyFinder:
    """Tests for API-key discovery."""

    def test_find_api_key_from_env(self, monkeypatch):
        """The key is read from the GEMINI_API_KEY environment variable."""
        monkeypatch.setenv('GEMINI_API_KEY', 'test_key_123')
        assert gbp.find_api_key() == 'test_key_123'

    @patch('gemini_batch_process.load_dotenv')
    def test_find_api_key_not_found(self, mock_load_dotenv, monkeypatch):
        """None is returned when no key exists in the environment."""
        monkeypatch.delenv('GEMINI_API_KEY', raising=False)
        # Keep load_dotenv from pulling a key in from real .env files on disk.
        mock_load_dotenv.return_value = None
        assert gbp.find_api_key() is None
|
||||
|
||||
|
||||
class TestMimeTypeDetection:
    """Tests for file-extension to MIME-type mapping."""

    def test_audio_mime_types(self):
        """Audio extensions map to their audio/* MIME types."""
        expected = {
            'test.mp3': 'audio/mp3',
            'test.wav': 'audio/wav',
            'test.aac': 'audio/aac',
            'test.flac': 'audio/flac',
        }
        for filename, mime in expected.items():
            assert gbp.get_mime_type(filename) == mime

    def test_image_mime_types(self):
        """Image extensions map to their image/* MIME types."""
        expected = {
            'test.jpg': 'image/jpeg',
            'test.jpeg': 'image/jpeg',
            'test.png': 'image/png',
            'test.webp': 'image/webp',
        }
        for filename, mime in expected.items():
            assert gbp.get_mime_type(filename) == mime

    def test_video_mime_types(self):
        """Video extensions map to their video/* MIME types."""
        expected = {
            'test.mp4': 'video/mp4',
            'test.mov': 'video/quicktime',
            'test.avi': 'video/x-msvideo',
        }
        for filename, mime in expected.items():
            assert gbp.get_mime_type(filename) == mime

    def test_document_mime_types(self):
        """Document extensions map to their document MIME types."""
        assert gbp.get_mime_type('test.pdf') == 'application/pdf'
        assert gbp.get_mime_type('test.txt') == 'text/plain'

    def test_unknown_mime_type(self):
        """An unrecognized extension falls back to the generic octet-stream."""
        assert gbp.get_mime_type('test.xyz') == 'application/octet-stream'

    def test_case_insensitive(self):
        """Extension matching ignores letter case."""
        assert gbp.get_mime_type('TEST.MP3') == 'audio/mp3'
        assert gbp.get_mime_type('Test.JPG') == 'image/jpeg'
|
||||
|
||||
|
||||
class TestFileUpload:
    """Tests for the upload_file helper."""

    @patch('gemini_batch_process.genai.Client')
    def test_upload_file_success(self, mock_client_class):
        """An already-ACTIVE file is returned straight after upload."""
        client = Mock()
        uploaded = Mock()
        uploaded.state.name = 'ACTIVE'
        uploaded.name = 'test_file'
        client.files.upload.return_value = uploaded

        result = gbp.upload_file(client, 'test.jpg', verbose=False)

        assert result == uploaded
        client.files.upload.assert_called_once_with(file='test.jpg')

    @patch('gemini_batch_process.genai.Client')
    @patch('gemini_batch_process.time.sleep')
    def test_upload_video_with_processing(self, mock_sleep, mock_client_class):
        """Upload polls until a PROCESSING video transitions to ACTIVE."""
        client = Mock()

        # First poll sees PROCESSING; the follow-up files.get sees ACTIVE.
        processing = Mock()
        processing.state.name = 'PROCESSING'
        processing.name = 'test_video'

        active = Mock()
        active.state.name = 'ACTIVE'
        active.name = 'test_video'

        client.files.upload.return_value = processing
        client.files.get.return_value = active

        result = gbp.upload_file(client, 'test.mp4', verbose=False)

        assert result.state.name == 'ACTIVE'

    @patch('gemini_batch_process.genai.Client')
    def test_upload_file_failed(self, mock_client_class):
        """A FAILED processing state raises ValueError."""
        client = Mock()
        failed = Mock()
        failed.state.name = 'FAILED'
        client.files.upload.return_value = failed
        client.files.get.return_value = failed

        with pytest.raises(ValueError, match="File processing failed"):
            gbp.upload_file(client, 'test.mp4', verbose=False)
|
||||
|
||||
|
||||
class TestProcessFile:
    """Tests for single-file processing."""

    @patch('gemini_batch_process.genai.Client')
    @patch('builtins.open', create=True)
    @patch('pathlib.Path.stat')
    def test_process_small_file_inline(self, mock_stat, mock_open, mock_client_class):
        """A small file is sent inline rather than via the File API."""
        # 10MB is below the inline-upload threshold.
        mock_stat.return_value.st_size = 10 * 1024 * 1024

        # Raw bytes returned when the file is read for inline embedding.
        mock_open.return_value.__enter__.return_value.read.return_value = b'test_data'

        client = Mock()
        response = Mock()
        response.text = 'Test response'
        client.models.generate_content.return_value = response

        outcome = gbp.process_file(
            client=client,
            file_path='test.jpg',
            prompt='Describe this image',
            model='gemini-2.5-flash',
            task='analyze',
            format_output='text',
            verbose=False,
        )

        assert outcome['status'] == 'success'
        assert outcome['response'] == 'Test response'

    @patch('gemini_batch_process.upload_file')
    @patch('gemini_batch_process.genai.Client')
    @patch('pathlib.Path.stat')
    def test_process_large_file_api(self, mock_stat, mock_client_class, mock_upload):
        """A large file goes through the File API upload path."""
        # 50MB exceeds the inline threshold, forcing an upload.
        mock_stat.return_value.st_size = 50 * 1024 * 1024

        uploaded = Mock()
        mock_upload.return_value = uploaded

        client = Mock()
        response = Mock()
        response.text = 'Test response'
        client.models.generate_content.return_value = response

        outcome = gbp.process_file(
            client=client,
            file_path='test.mp4',
            prompt='Summarize this video',
            model='gemini-2.5-flash',
            task='analyze',
            format_output='text',
            verbose=False,
        )

        assert outcome['status'] == 'success'
        mock_upload.assert_called_once()

    @patch('gemini_batch_process.genai.Client')
    @patch('builtins.open', create=True)
    @patch('pathlib.Path.stat')
    def test_process_file_error_handling(self, mock_stat, mock_open, mock_client_class):
        """API errors are captured in the result instead of propagating."""
        mock_stat.return_value.st_size = 1024

        fake_handle = MagicMock()
        fake_handle.__enter__.return_value.read.return_value = b'test_data'
        mock_open.return_value = fake_handle

        client = Mock()
        client.models.generate_content.side_effect = Exception("API Error")

        outcome = gbp.process_file(
            client=client,
            file_path='test.jpg',
            prompt='Test',
            model='gemini-2.5-flash',
            task='analyze',
            format_output='text',
            verbose=False,
            max_retries=1,
        )

        assert outcome['status'] == 'error'
        assert 'API Error' in outcome['error']

    @patch('gemini_batch_process.genai.Client')
    @patch('builtins.open', create=True)
    @patch('pathlib.Path.stat')
    def test_image_generation_with_aspect_ratio(self, mock_stat, mock_open, mock_client_class):
        """Image generation passes an aspect-ratio config to the model."""
        mock_stat.return_value.st_size = 1024

        fake_handle = MagicMock()
        fake_handle.__enter__.return_value.read.return_value = b'test'
        mock_open.return_value = fake_handle

        # Response carries inline image bytes in its first candidate part.
        client = Mock()
        response = Mock()
        response.candidates = [Mock()]
        response.candidates[0].content.parts = [
            Mock(inline_data=Mock(data=b'fake_image_data'))
        ]
        client.models.generate_content.return_value = response

        outcome = gbp.process_file(
            client=client,
            file_path='test.txt',
            prompt='Generate mountain landscape',
            model='gemini-2.5-flash-image',
            task='generate',
            format_output='text',
            aspect_ratio='16:9',
            verbose=False,
        )

        # The call must have received a non-None config carrying the ratio.
        call_args = client.models.generate_content.call_args
        assert call_args.kwargs.get('config') is not None
        assert outcome['status'] == 'success'
        assert 'generated_image' in outcome
|
||||
|
||||
|
||||
class TestBatchProcessing:
    """Tests for multi-file batch processing."""

    @patch('gemini_batch_process.find_api_key')
    @patch('gemini_batch_process.process_file')
    @patch('gemini_batch_process.genai.Client')
    def test_batch_process_success(self, mock_client_class, mock_process, mock_find_key):
        """Every input file yields a success result."""
        mock_find_key.return_value = 'test_key'
        mock_process.return_value = {'status': 'success', 'response': 'Test'}

        outcomes = gbp.batch_process(
            files=['test1.jpg', 'test2.jpg'],
            prompt='Analyze',
            model='gemini-2.5-flash',
            task='analyze',
            format_output='text',
            verbose=False,
            dry_run=False,
        )

        assert len(outcomes) == 2
        assert all(entry['status'] == 'success' for entry in outcomes)

    @patch('gemini_batch_process.find_api_key')
    def test_batch_process_no_api_key(self, mock_find_key):
        """A missing API key aborts with SystemExit."""
        mock_find_key.return_value = None

        with pytest.raises(SystemExit):
            gbp.batch_process(
                files=['test.jpg'],
                prompt='Test',
                model='gemini-2.5-flash',
                task='analyze',
                format_output='text',
                verbose=False,
                dry_run=False,
            )

    @patch('gemini_batch_process.find_api_key')
    def test_batch_process_dry_run(self, mock_find_key):
        """Dry-run mode does no processing and returns an empty list."""
        # The key is not used in a dry run, but mocking avoids sys.exit.
        mock_find_key.return_value = 'test_key'

        outcomes = gbp.batch_process(
            files=['test1.jpg', 'test2.jpg'],
            prompt='Test',
            model='gemini-2.5-flash',
            task='analyze',
            format_output='text',
            verbose=False,
            dry_run=True,
        )

        assert outcomes == []
|
||||
|
||||
|
||||
class TestResultsSaving:
    """Tests for persisting batch results."""

    @patch('builtins.open', create=True)
    @patch('json.dump')
    def test_save_results_json(self, mock_json_dump, mock_open):
        """JSON output goes through json.dump exactly once."""
        payload = [
            {'file': 'test1.jpg', 'status': 'success', 'response': 'Test1'},
            {'file': 'test2.jpg', 'status': 'success', 'response': 'Test2'},
        ]

        gbp.save_results(payload, 'output.json', 'json')

        mock_json_dump.assert_called_once()

    @patch('builtins.open', create=True)
    @patch('csv.DictWriter')
    def test_save_results_csv(self, mock_csv_writer, mock_open):
        """CSV output is produced via csv.DictWriter."""
        payload = [
            {'file': 'test1.jpg', 'status': 'success', 'response': 'Test1'},
            {'file': 'test2.jpg', 'status': 'success', 'response': 'Test2'},
        ]

        gbp.save_results(payload, 'output.csv', 'csv')

        mock_csv_writer.assert_called_once()

    @patch('builtins.open', create=True)
    def test_save_results_markdown(self, mock_open):
        """Markdown output writes at least one chunk to the file handle."""
        handle = MagicMock()
        mock_open.return_value.__enter__.return_value = handle

        payload = [
            {'file': 'test1.jpg', 'status': 'success', 'response': 'Test1'},
            {'file': 'test2.jpg', 'status': 'error', 'error': 'Failed'},
        ]

        gbp.save_results(payload, 'output.md', 'markdown')

        assert handle.write.call_count > 0
|
||||
|
||||
|
||||
if __name__ == '__main__':
    # Allow running this test module directly, with coverage reporting.
    args = [__file__, '-v', '--cov=gemini_batch_process', '--cov-report=term-missing']
    pytest.main(args)
|
||||
373
skills/ai-multimodal/scripts/tests/test_media_optimizer.py
Normal file
373
skills/ai-multimodal/scripts/tests/test_media_optimizer.py
Normal file
@@ -0,0 +1,373 @@
|
||||
"""
|
||||
Tests for media_optimizer.py
|
||||
"""
|
||||
|
||||
import pytest
|
||||
import sys
|
||||
from pathlib import Path
|
||||
from unittest.mock import Mock, patch, MagicMock
|
||||
import json
|
||||
|
||||
sys.path.insert(0, str(Path(__file__).parent.parent))
|
||||
|
||||
import media_optimizer as mo
|
||||
|
||||
|
||||
class TestEnvLoading:
    """Tests for .env discovery and loading."""

    @patch('media_optimizer.load_dotenv')
    @patch('pathlib.Path.exists')
    def test_load_env_files_success(self, mock_exists, mock_load_dotenv):
        """Existing .env files are loaded at least once."""
        mock_exists.return_value = True
        mo.load_env_files()
        # Candidate locations include the skill, skills, and claude dirs.
        assert mock_load_dotenv.call_count >= 1

    @patch('media_optimizer.load_dotenv', None)
    def test_load_env_files_no_dotenv(self):
        """Loading is a silent no-op when python-dotenv is unavailable."""
        mo.load_env_files()
|
||||
|
||||
|
||||
class TestFFmpegCheck:
    """Tests for ffmpeg availability detection."""

    @patch('subprocess.run')
    def test_ffmpeg_installed(self, mock_run):
        """A clean subprocess call means ffmpeg is present."""
        mock_run.return_value = Mock()
        assert mo.check_ffmpeg() is True

    @patch('subprocess.run')
    def test_ffmpeg_not_installed(self, mock_run):
        """A missing binary (FileNotFoundError) means ffmpeg is absent."""
        mock_run.side_effect = FileNotFoundError()
        assert mo.check_ffmpeg() is False

    @patch('subprocess.run')
    def test_ffmpeg_error(self, mock_run):
        """Any other subprocess failure also reports ffmpeg as unusable."""
        mock_run.side_effect = Exception("Error")
        assert mo.check_ffmpeg() is False
|
||||
|
||||
|
||||
class TestMediaInfo:
    """Tests for ffprobe-based media info extraction."""

    @patch('media_optimizer.check_ffmpeg')
    @patch('subprocess.run')
    def test_get_video_info(self, mock_run, mock_check):
        """Format and per-stream fields are parsed from the probe JSON."""
        mock_check.return_value = True

        # Canned ffprobe output with one video and one audio stream.
        probe = Mock()
        probe.stdout = json.dumps({
            'format': {
                'size': '10485760',
                'duration': '120.5',
                'bit_rate': '691200'
            },
            'streams': [
                {
                    'codec_type': 'video',
                    'width': 1920,
                    'height': 1080,
                    'r_frame_rate': '30/1'
                },
                {
                    'codec_type': 'audio',
                    'sample_rate': '48000',
                    'channels': 2
                }
            ]
        })
        mock_run.return_value = probe

        info = mo.get_media_info('test.mp4')

        assert info['size'] == 10485760
        assert info['duration'] == 120.5
        assert info['width'] == 1920
        assert info['height'] == 1080
        assert info['sample_rate'] == 48000

    @patch('media_optimizer.check_ffmpeg')
    def test_get_media_info_no_ffmpeg(self, mock_check):
        """An empty dict is returned when ffmpeg is unavailable."""
        mock_check.return_value = False
        assert mo.get_media_info('test.mp4') == {}

    @patch('media_optimizer.check_ffmpeg')
    @patch('subprocess.run')
    def test_get_media_info_error(self, mock_run, mock_check):
        """Probe failures degrade to an empty dict rather than raising."""
        mock_check.return_value = True
        mock_run.side_effect = Exception("Error")

        assert mo.get_media_info('test.mp4') == {}
|
||||
|
||||
|
||||
class TestVideoOptimization:
    """Tests for video optimization."""

    @patch('media_optimizer.check_ffmpeg')
    @patch('media_optimizer.get_media_info')
    @patch('subprocess.run')
    def test_optimize_video_success(self, mock_run, mock_info, mock_check):
        """A standard optimization pass completes and invokes ffmpeg once."""
        mock_check.return_value = True
        # get_media_info is consulted twice: input first, then output.
        mock_info.side_effect = [
            {
                'size': 50 * 1024 * 1024,
                'duration': 120.0,
                'bit_rate': 3500000,
                'width': 1920,
                'height': 1080
            },
            {
                'size': 25 * 1024 * 1024,
                'duration': 120.0,
                'width': 1920,
                'height': 1080
            },
        ]

        ok = mo.optimize_video(
            'input.mp4',
            'output.mp4',
            quality=23,
            verbose=False,
        )

        assert ok is True
        mock_run.assert_called_once()

    @patch('media_optimizer.check_ffmpeg')
    def test_optimize_video_no_ffmpeg(self, mock_check):
        """Optimization fails fast when ffmpeg is unavailable."""
        mock_check.return_value = False

        assert mo.optimize_video('input.mp4', 'output.mp4') is False

    @patch('media_optimizer.check_ffmpeg')
    @patch('media_optimizer.get_media_info')
    def test_optimize_video_no_info(self, mock_info, mock_check):
        """Optimization fails when the input cannot be probed."""
        mock_check.return_value = True
        mock_info.return_value = {}

        assert mo.optimize_video('input.mp4', 'output.mp4') is False

    @patch('media_optimizer.check_ffmpeg')
    @patch('media_optimizer.get_media_info')
    @patch('subprocess.run')
    def test_optimize_video_with_target_size(self, mock_run, mock_info, mock_check):
        """A target size in MB drives bitrate selection and still succeeds."""
        mock_check.return_value = True
        mock_info.side_effect = [
            {'size': 100 * 1024 * 1024, 'duration': 60.0, 'bit_rate': 3500000},
            {'size': 50 * 1024 * 1024, 'duration': 60.0},
        ]

        ok = mo.optimize_video(
            'input.mp4',
            'output.mp4',
            target_size_mb=50,
            verbose=False,
        )

        assert ok is True

    @patch('media_optimizer.check_ffmpeg')
    @patch('media_optimizer.get_media_info')
    @patch('subprocess.run')
    def test_optimize_video_with_resolution(self, mock_run, mock_info, mock_check):
        """An explicit output resolution is accepted."""
        mock_check.return_value = True
        mock_info.side_effect = [
            {'size': 50 * 1024 * 1024, 'duration': 120.0, 'bit_rate': 3500000},
            {'size': 25 * 1024 * 1024, 'duration': 120.0},
        ]

        ok = mo.optimize_video(
            'input.mp4',
            'output.mp4',
            resolution='1280x720',
            verbose=False,
        )

        assert ok is True
|
||||
|
||||
|
||||
class TestAudioOptimization:
    """Tests for audio optimization."""

    @patch('media_optimizer.check_ffmpeg')
    @patch('media_optimizer.get_media_info')
    @patch('subprocess.run')
    def test_optimize_audio_success(self, mock_run, mock_info, mock_check):
        """Re-encoding with a target bitrate completes via one ffmpeg call."""
        mock_check.return_value = True
        # Input info first, output info second.
        mock_info.side_effect = [
            {'size': 10 * 1024 * 1024, 'duration': 300.0},
            {'size': 5 * 1024 * 1024, 'duration': 300.0},
        ]

        ok = mo.optimize_audio(
            'input.mp3',
            'output.m4a',
            bitrate='64k',
            verbose=False,
        )

        assert ok is True
        mock_run.assert_called_once()

    @patch('media_optimizer.check_ffmpeg')
    def test_optimize_audio_no_ffmpeg(self, mock_check):
        """Optimization fails fast when ffmpeg is unavailable."""
        mock_check.return_value = False

        assert mo.optimize_audio('input.mp3', 'output.m4a') is False
|
||||
|
||||
|
||||
class TestImageOptimization:
    """Tests for Pillow-based image optimization."""

    @patch('PIL.Image.open')
    @patch('pathlib.Path.stat')
    def test_optimize_image_success(self, mock_stat, mock_image_open):
        """An oversized RGB image is resized and the resized copy is saved."""
        resized = Mock()
        resized.mode = 'RGB'

        source = Mock()
        source.width = 3840
        source.height = 2160
        source.mode = 'RGB'
        source.resize.return_value = resized
        mock_image_open.return_value = source

        mock_stat.return_value.st_size = 5 * 1024 * 1024

        ok = mo.optimize_image(
            'input.jpg',
            'output.jpg',
            max_width=1920,
            quality=85,
            verbose=False,
        )

        assert ok is True
        # The width exceeded max_width, so save happens on the resized image.
        resized.save.assert_called_once()

    @patch('PIL.Image.open')
    @patch('pathlib.Path.stat')
    def test_optimize_image_resize(self, mock_stat, mock_image_open):
        """Images wider than max_width are resized exactly once."""
        source = Mock()
        source.width = 3840
        source.height = 2160
        source.mode = 'RGB'
        resized = Mock()
        source.resize.return_value = resized
        mock_image_open.return_value = source

        mock_stat.return_value.st_size = 5 * 1024 * 1024

        mo.optimize_image('input.jpg', 'output.jpg', max_width=1920, verbose=False)

        source.resize.assert_called_once()

    @patch('PIL.Image.open')
    @patch('pathlib.Path.stat')
    def test_optimize_image_rgba_to_jpg(self, mock_stat, mock_image_open):
        """RGBA input is flattened onto a new RGB canvas for JPEG output."""
        source = Mock()
        source.width = 1920
        source.height = 1080
        source.mode = 'RGBA'
        # split() yields the R, G, B, A bands; A is used as the paste mask.
        source.split.return_value = [Mock(), Mock(), Mock(), Mock()]
        mock_image_open.return_value = source

        mock_stat.return_value.st_size = 1024 * 1024

        with patch('PIL.Image.new') as mock_new:
            canvas = Mock()
            mock_new.return_value = canvas

            mo.optimize_image('input.png', 'output.jpg', verbose=False)

            mock_new.assert_called_once()

    def test_optimize_image_no_pillow(self):
        """Optimization reports failure when Pillow cannot be imported."""
        with patch.dict('sys.modules', {'PIL': None}):
            ok = mo.optimize_image('input.jpg', 'output.jpg')
            # The failed import is handled internally; no exception escapes.
            assert ok is False
|
||||
|
||||
|
||||
class TestVideoSplitting:
    """Tests for chunked video splitting."""

    @patch('media_optimizer.check_ffmpeg')
    @patch('media_optimizer.get_media_info')
    @patch('subprocess.run')
    @patch('pathlib.Path.mkdir')
    def test_split_video_success(self, mock_mkdir, mock_run, mock_info, mock_check):
        """A 2-hour video splits into hourly chunks (plus one safety chunk)."""
        mock_check.return_value = True
        mock_info.return_value = {'duration': 7200.0}  # 2 hours

        chunks = mo.split_video(
            'input.mp4',
            './chunks',
            chunk_duration=3600,  # 1-hour chunks
            verbose=False,
        )

        # 7200s / 3600s = 2 chunks, +1 safety chunk = 3 ffmpeg invocations.
        assert len(chunks) == 3
        assert mock_run.call_count == 3

    @patch('media_optimizer.check_ffmpeg')
    @patch('media_optimizer.get_media_info')
    def test_split_video_short_duration(self, mock_info, mock_check):
        """A video shorter than one chunk is returned unsplit."""
        mock_check.return_value = True
        mock_info.return_value = {'duration': 1800.0}  # 30 minutes

        chunks = mo.split_video(
            'input.mp4',
            './chunks',
            chunk_duration=3600,  # 1 hour
            verbose=False,
        )

        assert chunks == ['input.mp4']

    @patch('media_optimizer.check_ffmpeg')
    def test_split_video_no_ffmpeg(self, mock_check):
        """Splitting fails fast (empty list) when ffmpeg is unavailable."""
        mock_check.return_value = False

        assert mo.split_video('input.mp4', './chunks') == []
|
||||
|
||||
|
||||
if __name__ == '__main__':
    # Allow running this test module directly, with coverage reporting.
    args = [__file__, '-v', '--cov=media_optimizer', '--cov-report=term-missing']
    pytest.main(args)
|
||||
Reference in New Issue
Block a user