Initial commit
This commit is contained in:
97
skills/ai-multimodal/.env.example
Normal file
97
skills/ai-multimodal/.env.example
Normal file
@@ -0,0 +1,97 @@
|
||||
# Google Gemini API Configuration
|
||||
|
||||
# ============================================================================
|
||||
# OPTION 1: Google AI Studio (Default - Recommended for most users)
|
||||
# ============================================================================
|
||||
# Get your API key: https://aistudio.google.com/apikey
|
||||
GEMINI_API_KEY=your_api_key_here
|
||||
|
||||
# ============================================================================
|
||||
# OPTION 2: Vertex AI (Google Cloud Platform)
|
||||
# ============================================================================
|
||||
# Uncomment these lines to use Vertex AI instead of Google AI Studio
|
||||
# GEMINI_USE_VERTEX=true
|
||||
# VERTEX_PROJECT_ID=your-gcp-project-id
|
||||
# VERTEX_LOCATION=us-central1
|
||||
|
||||
# ============================================================================
|
||||
# Model Selection (Optional)
|
||||
# ============================================================================
|
||||
# Override default model for specific tasks
|
||||
# Default: gemini-2.5-flash for most tasks
|
||||
# GEMINI_MODEL=gemini-2.5-flash
|
||||
# GEMINI_IMAGE_GEN_MODEL=gemini-2.5-flash-image
|
||||
|
||||
# ============================================================================
|
||||
# Rate Limiting Configuration (Optional)
|
||||
# ============================================================================
|
||||
# Requests per minute limit (adjust based on your tier)
|
||||
# GEMINI_RPM_LIMIT=15
|
||||
|
||||
# Tokens per minute limit
|
||||
# GEMINI_TPM_LIMIT=4000000
|
||||
|
||||
# Requests per day limit
|
||||
# GEMINI_RPD_LIMIT=1500
|
||||
|
||||
# ============================================================================
|
||||
# Processing Options (Optional)
|
||||
# ============================================================================
|
||||
# Video resolution mode: default or low-res
|
||||
# low-res uses ~100 tokens/second vs ~300 for default
|
||||
# GEMINI_VIDEO_RESOLUTION=default
|
||||
|
||||
# Audio quality: default (16 Kbps mono, auto-downsampled)
|
||||
# GEMINI_AUDIO_QUALITY=default
|
||||
|
||||
# PDF processing mode: inline (<20MB), file-api (>20MB), or auto (selects automatically by size)
|
||||
# GEMINI_PDF_MODE=auto
|
||||
|
||||
# ============================================================================
|
||||
# Retry Configuration (Optional)
|
||||
# ============================================================================
|
||||
# Maximum retry attempts for failed requests
|
||||
# GEMINI_MAX_RETRIES=3
|
||||
|
||||
# Initial retry delay in seconds (uses exponential backoff)
|
||||
# GEMINI_RETRY_DELAY=1
|
||||
|
||||
# ============================================================================
|
||||
# Output Configuration (Optional)
|
||||
# ============================================================================
|
||||
# Default output directory for generated images
|
||||
# OUTPUT_DIR=./output
|
||||
|
||||
# Image output format (png or jpeg)
|
||||
# IMAGE_FORMAT=png
|
||||
|
||||
# Image quality for JPEG (1-100)
|
||||
# IMAGE_QUALITY=95
|
||||
|
||||
# ============================================================================
|
||||
# Context Caching (Optional)
|
||||
# ============================================================================
|
||||
# Enable context caching for repeated queries on same file
|
||||
# GEMINI_ENABLE_CACHING=true
|
||||
|
||||
# Cache TTL in seconds (default: 1800 = 30 minutes)
|
||||
# GEMINI_CACHE_TTL=1800
|
||||
|
||||
# ============================================================================
|
||||
# Logging (Optional)
|
||||
# ============================================================================
|
||||
# Log level: DEBUG, INFO, WARNING, ERROR, CRITICAL
|
||||
# LOG_LEVEL=INFO
|
||||
|
||||
# Log file path
|
||||
# LOG_FILE=./logs/gemini.log
|
||||
|
||||
# ============================================================================
|
||||
# Notes
|
||||
# ============================================================================
|
||||
# 1. Never commit API keys to version control
|
||||
# 2. Add .env to .gitignore
|
||||
# 3. API keys can be restricted in Google Cloud Console
|
||||
# 4. Monitor usage at: https://aistudio.google.com/apikey
|
||||
# 5. Free tier limits: 15 RPM, 1M-4M TPM, 1,500 RPD
|
||||
# 6. Vertex AI requires GCP authentication via gcloud CLI
|
||||
357
skills/ai-multimodal/SKILL.md
Normal file
357
skills/ai-multimodal/SKILL.md
Normal file
@@ -0,0 +1,357 @@
|
||||
---
|
||||
name: ai-multimodal
|
||||
description: Process and generate multimedia content using Google Gemini API. Capabilities include analyze audio files (transcription with timestamps, summarization, speech understanding, music/sound analysis up to 9.5 hours), understand images (captioning, object detection, OCR, visual Q&A, segmentation), process videos (scene detection, Q&A, temporal analysis, YouTube URLs, up to 6 hours), extract from documents (PDF tables, forms, charts, diagrams, multi-page), generate images (text-to-image, editing, composition, refinement). Use when working with audio/video files, analyzing images or screenshots, processing PDF documents, extracting structured data from media, creating images from text prompts, or implementing multimodal AI features. Supports multiple models (Gemini 2.5/2.0) with context windows up to 2M tokens.
|
||||
license: MIT
|
||||
allowed-tools:
|
||||
- Bash
|
||||
- Read
|
||||
- Write
|
||||
- Edit
|
||||
---
|
||||
|
||||
# AI Multimodal Processing Skill
|
||||
|
||||
Process audio, images, videos, documents, and generate images using Google Gemini's multimodal API. Unified interface for all multimedia content understanding and generation.
|
||||
|
||||
## Core Capabilities
|
||||
|
||||
### Audio Processing
|
||||
- Transcription with timestamps (up to 9.5 hours)
|
||||
- Audio summarization and analysis
|
||||
- Speech understanding and speaker identification
|
||||
- Music and environmental sound analysis
|
||||
- Text-to-speech generation with controllable voice
|
||||
|
||||
### Image Understanding
|
||||
- Image captioning and description
|
||||
- Object detection with bounding boxes (2.0+)
|
||||
- Pixel-level segmentation (2.5+)
|
||||
- Visual question answering
|
||||
- Multi-image comparison (up to 3,600 images)
|
||||
- OCR and text extraction
|
||||
|
||||
### Video Analysis
|
||||
- Scene detection and summarization
|
||||
- Video Q&A with temporal understanding
|
||||
- Transcription with visual descriptions
|
||||
- YouTube URL support
|
||||
- Long video processing (up to 6 hours)
|
||||
- Frame-level analysis
|
||||
|
||||
### Document Extraction
|
||||
- Native PDF vision processing (up to 1,000 pages)
|
||||
- Table and form extraction
|
||||
- Chart and diagram analysis
|
||||
- Multi-page document understanding
|
||||
- Structured data output (JSON schema)
|
||||
- Format conversion (PDF to HTML/JSON)
|
||||
|
||||
### Image Generation
|
||||
- Text-to-image generation
|
||||
- Image editing and modification
|
||||
- Multi-image composition (up to 3 images)
|
||||
- Iterative refinement
|
||||
- Multiple aspect ratios (1:1, 16:9, 9:16, 4:3, 3:4)
|
||||
- Controllable style and quality
|
||||
|
||||
## Capability Matrix
|
||||
|
||||
| Task | Audio | Image | Video | Document | Generation |
|
||||
|------|:-----:|:-----:|:-----:|:--------:|:----------:|
|
||||
| Transcription | ✓ | - | ✓ | - | - |
|
||||
| Summarization | ✓ | ✓ | ✓ | ✓ | - |
|
||||
| Q&A | ✓ | ✓ | ✓ | ✓ | - |
|
||||
| Object Detection | - | ✓ | ✓ | - | - |
|
||||
| Text Extraction | - | ✓ | - | ✓ | - |
|
||||
| Structured Output | ✓ | ✓ | ✓ | ✓ | - |
|
||||
| Creation | TTS | - | - | - | ✓ |
|
||||
| Timestamps | ✓ | - | ✓ | - | - |
|
||||
| Segmentation | - | ✓ | - | - | - |
|
||||
|
||||
## Model Selection Guide
|
||||
|
||||
### Gemini 2.5 Series (Recommended)
|
||||
- **gemini-2.5-pro**: Highest quality, all features, 1M-2M context
|
||||
- **gemini-2.5-flash**: Best balance, all features, 1M-2M context
|
||||
- **gemini-2.5-flash-lite**: Lightweight, segmentation support
|
||||
- **gemini-2.5-flash-image**: Image generation only
|
||||
|
||||
### Gemini 2.0 Series
|
||||
- **gemini-2.0-flash**: Fast processing, object detection
|
||||
- **gemini-2.0-flash-lite**: Lightweight option
|
||||
|
||||
### Feature Requirements
|
||||
- **Segmentation**: Requires 2.5+ models
|
||||
- **Object Detection**: Requires 2.0+ models
|
||||
- **Multi-video**: Requires 2.5+ models
|
||||
- **Image Generation**: Requires flash-image model
|
||||
|
||||
### Context Windows
|
||||
- **2M tokens**: ~6 hours video (low-res) or ~2 hours (default)
|
||||
- **1M tokens**: ~3 hours video (low-res) or ~1 hour (default)
|
||||
- **Audio**: 32 tokens/second (1 min = 1,920 tokens)
|
||||
- **PDF**: 258 tokens/page (fixed)
|
||||
- **Image**: 258-1,548 tokens based on size
|
||||
|
||||
## Quick Start
|
||||
|
||||
### Prerequisites
|
||||
|
||||
**API Key Setup**: Supports both Google AI Studio and Vertex AI.
|
||||
|
||||
The skill checks for `GEMINI_API_KEY` in this order:
|
||||
1. Process environment: `export GEMINI_API_KEY="your-key"`
|
||||
2. Project root: `.env`
|
||||
3. `.claude/.env`
|
||||
4. `.claude/skills/.env`
|
||||
5. `.claude/skills/ai-multimodal/.env`
|
||||
|
||||
**Get API key**: https://aistudio.google.com/apikey
|
||||
|
||||
**For Vertex AI**:
|
||||
```bash
|
||||
export GEMINI_USE_VERTEX=true
|
||||
export VERTEX_PROJECT_ID=your-gcp-project-id
|
||||
export VERTEX_LOCATION=us-central1 # Optional
|
||||
```
|
||||
|
||||
**Install SDK**:
|
||||
```bash
|
||||
pip install google-genai python-dotenv pillow
|
||||
```
|
||||
|
||||
### Common Patterns
|
||||
|
||||
**Transcribe Audio**:
|
||||
```bash
|
||||
python scripts/gemini_batch_process.py \
|
||||
--files audio.mp3 \
|
||||
--task transcribe \
|
||||
--model gemini-2.5-flash
|
||||
```
|
||||
|
||||
**Analyze Image**:
|
||||
```bash
|
||||
python scripts/gemini_batch_process.py \
|
||||
--files image.jpg \
|
||||
--task analyze \
|
||||
--prompt "Describe this image" \
|
||||
--output docs/assets/<output-name>.md \
|
||||
--model gemini-2.5-flash
|
||||
```
|
||||
|
||||
**Process Video**:
|
||||
```bash
|
||||
python scripts/gemini_batch_process.py \
|
||||
--files video.mp4 \
|
||||
--task analyze \
|
||||
--prompt "Summarize key points with timestamps" \
|
||||
--output docs/assets/<output-name>.md \
|
||||
--model gemini-2.5-flash
|
||||
```
|
||||
|
||||
**Extract from PDF**:
|
||||
```bash
|
||||
python scripts/gemini_batch_process.py \
|
||||
--files document.pdf \
|
||||
--task extract \
|
||||
--prompt "Extract table data as JSON" \
|
||||
--output docs/assets/<output-name>.md \
|
||||
--format json
|
||||
```
|
||||
|
||||
**Generate Image**:
|
||||
```bash
|
||||
python scripts/gemini_batch_process.py \
|
||||
--task generate \
|
||||
--prompt "A futuristic city at sunset" \
|
||||
--output docs/assets/<output-file-name> \
|
||||
--model gemini-2.5-flash-image \
|
||||
--aspect-ratio 16:9
|
||||
```
|
||||
|
||||
**Optimize Media**:
|
||||
```bash
|
||||
# Prepare large video for processing
|
||||
python scripts/media_optimizer.py \
|
||||
--input large-video.mp4 \
|
||||
--output docs/assets/<output-file-name> \
|
||||
--target-size 100MB
|
||||
|
||||
# Batch optimize multiple files
|
||||
python scripts/media_optimizer.py \
|
||||
--input-dir ./videos \
|
||||
--output-dir docs/assets/optimized \
|
||||
--quality 85
|
||||
```
|
||||
|
||||
**Convert Documents to Markdown**:
|
||||
```bash
|
||||
# Convert DOCX to Markdown
|
||||
python scripts/document_converter.py \
|
||||
--input document.docx \
|
||||
--output docs/assets/document.md
|
||||
|
||||
# Extract pages
|
||||
python scripts/document_converter.py \
|
||||
--input large.pdf \
|
||||
--output docs/assets/chapter1.md \
|
||||
--pages 1-20
|
||||
```
|
||||
|
||||
## Supported Formats
|
||||
|
||||
### Audio
|
||||
- WAV, MP3, AAC, FLAC, OGG Vorbis, AIFF
|
||||
- Max 9.5 hours per request
|
||||
- Auto-downsampled to 16 Kbps mono
|
||||
|
||||
### Images
|
||||
- PNG, JPEG, WEBP, HEIC, HEIF
|
||||
- Max 3,600 images per request
|
||||
- Resolution: ≤384px = 258 tokens, larger = tiled
|
||||
|
||||
### Video
|
||||
- MP4, MPEG, MOV, AVI, FLV, MPG, WebM, WMV, 3GPP
|
||||
- Max 6 hours (low-res) or 2 hours (default)
|
||||
- YouTube URLs supported (public only)
|
||||
|
||||
### Documents
|
||||
- PDF only for vision processing
|
||||
- Max 1,000 pages
|
||||
- TXT, HTML, Markdown supported (text-only)
|
||||
|
||||
### Size Limits
|
||||
- **Inline**: <20MB total request
|
||||
- **File API**: 2GB per file, 20GB project quota
|
||||
- **Retention**: 48 hours auto-delete
|
||||
|
||||
## Reference Navigation
|
||||
|
||||
For detailed implementation guidance, see:
|
||||
|
||||
### Audio Processing
|
||||
- `references/audio-processing.md` - Transcription, analysis, TTS
|
||||
- Timestamp handling and segment analysis
|
||||
- Multi-speaker identification
|
||||
- Non-speech audio analysis
|
||||
- Text-to-speech generation
|
||||
|
||||
### Image Understanding
|
||||
- `references/vision-understanding.md` - Captioning, detection, OCR
|
||||
- Object detection and localization
|
||||
- Pixel-level segmentation
|
||||
- Visual question answering
|
||||
- Multi-image comparison
|
||||
|
||||
### Video Analysis
|
||||
- `references/video-analysis.md` - Scene detection, temporal understanding
|
||||
- YouTube URL processing
|
||||
- Timestamp-based queries
|
||||
- Video clipping and FPS control
|
||||
- Long video optimization
|
||||
|
||||
### Document Extraction
|
||||
- `references/document-extraction.md` - PDF processing, structured output
|
||||
- Table and form extraction
|
||||
- Chart and diagram analysis
|
||||
- JSON schema validation
|
||||
- Multi-page handling
|
||||
|
||||
### Image Generation
|
||||
- `references/image-generation.md` - Text-to-image, editing
|
||||
- Prompt engineering strategies
|
||||
- Image editing and composition
|
||||
- Aspect ratio selection
|
||||
- Safety settings
|
||||
|
||||
## Cost Optimization
|
||||
|
||||
### Token Costs
|
||||
**Input Pricing**:
|
||||
- Gemini 2.5 Flash: $1.00/1M input, $0.10/1M output
|
||||
- Gemini 2.5 Pro: $3.00/1M input, $12.00/1M output
|
||||
- Gemini 1.5 Flash: $0.70/1M input, $0.175/1M output
|
||||
|
||||
**Token Rates**:
|
||||
- Audio: 32 tokens/second (1 min = 1,920 tokens)
|
||||
- Video: ~300 tokens/second (default) or ~100 (low-res)
|
||||
- PDF: 258 tokens/page (fixed)
|
||||
- Image: 258-1,548 tokens based on size
|
||||
|
||||
**TTS Pricing**:
|
||||
- Flash TTS: $10/1M tokens
|
||||
- Pro TTS: $20/1M tokens
|
||||
|
||||
### Best Practices
|
||||
1. Use `gemini-2.5-flash` for most tasks (best price/performance)
|
||||
2. Use File API for files >20MB or repeated queries
|
||||
3. Optimize media before upload (see `media_optimizer.py`)
|
||||
4. Process specific segments instead of full videos
|
||||
5. Use lower FPS for static content
|
||||
6. Implement context caching for repeated queries
|
||||
7. Batch process multiple files in parallel
|
||||
|
||||
## Rate Limits
|
||||
|
||||
**Free Tier**:
|
||||
- 10-15 RPM (requests per minute)
|
||||
- 1M-4M TPM (tokens per minute)
|
||||
- 1,500 RPD (requests per day)
|
||||
|
||||
**YouTube Limits**:
|
||||
- Free tier: 8 hours/day
|
||||
- Paid tier: No length limits
|
||||
- Public videos only
|
||||
|
||||
**Storage Limits**:
|
||||
- 20GB per project
|
||||
- 2GB per file
|
||||
- 48-hour retention
|
||||
|
||||
## Error Handling
|
||||
|
||||
Common errors and solutions:
|
||||
- **400**: Invalid format/size - validate before upload
|
||||
- **401**: Invalid API key - check configuration
|
||||
- **403**: Permission denied - verify API key restrictions
|
||||
- **404**: File not found - ensure file uploaded and active
|
||||
- **429**: Rate limit exceeded - implement exponential backoff
|
||||
- **500**: Server error - retry with backoff
|
||||
|
||||
## Scripts Overview
|
||||
|
||||
All scripts support unified API key detection and error handling:
|
||||
|
||||
**gemini_batch_process.py**: Batch process multiple media files
|
||||
- Supports all modalities (audio, image, video, PDF)
|
||||
- Progress tracking and error recovery
|
||||
- Output formats: JSON, Markdown, CSV
|
||||
- Rate limiting and retry logic
|
||||
- Dry-run mode
|
||||
|
||||
**media_optimizer.py**: Prepare media for Gemini API
|
||||
- Compress videos/audio for size limits
|
||||
- Resize images appropriately
|
||||
- Split long videos into chunks
|
||||
- Format conversion
|
||||
- Quality vs size optimization
|
||||
|
||||
**document_converter.py**: Convert documents to PDF
|
||||
- Convert DOCX, XLSX, PPTX to PDF
|
||||
- Extract page ranges
|
||||
- Optimize PDFs for Gemini
|
||||
- Extract images from PDFs
|
||||
- Batch conversion support
|
||||
|
||||
Run any script with `--help` for detailed usage.
|
||||
|
||||
## Resources
|
||||
|
||||
- [Audio API Docs](https://ai.google.dev/gemini-api/docs/audio)
|
||||
- [Image API Docs](https://ai.google.dev/gemini-api/docs/image-understanding)
|
||||
- [Video API Docs](https://ai.google.dev/gemini-api/docs/video-understanding)
|
||||
- [Document API Docs](https://ai.google.dev/gemini-api/docs/document-processing)
|
||||
- [Image Gen Docs](https://ai.google.dev/gemini-api/docs/image-generation)
|
||||
- [Get API Key](https://aistudio.google.com/apikey)
|
||||
- [Pricing](https://ai.google.dev/pricing)
|
||||
373
skills/ai-multimodal/references/audio-processing.md
Normal file
373
skills/ai-multimodal/references/audio-processing.md
Normal file
@@ -0,0 +1,373 @@
|
||||
# Audio Processing Reference
|
||||
|
||||
Comprehensive guide for audio analysis and speech generation using Gemini API.
|
||||
|
||||
## Audio Understanding
|
||||
|
||||
### Supported Formats
|
||||
|
||||
| Format | MIME Type | Best Use |
|
||||
|--------|-----------|----------|
|
||||
| WAV | `audio/wav` | Uncompressed, highest quality |
|
||||
| MP3 | `audio/mp3` | Compressed, widely compatible |
|
||||
| AAC | `audio/aac` | Compressed, good quality |
|
||||
| FLAC | `audio/flac` | Lossless compression |
|
||||
| OGG Vorbis | `audio/ogg` | Open format |
|
||||
| AIFF | `audio/aiff` | Apple format |
|
||||
|
||||
### Specifications
|
||||
|
||||
- **Maximum length**: 9.5 hours per request
|
||||
- **Multiple files**: Unlimited count, combined max 9.5 hours
|
||||
- **Token rate**: 32 tokens/second (1 minute = 1,920 tokens)
|
||||
- **Processing**: Auto-downsampled to 16 Kbps mono
|
||||
- **File size limits**:
|
||||
- Inline: 20 MB max total request
|
||||
- File API: 2 GB per file, 20 GB project quota
|
||||
- Retention: 48 hours auto-delete
|
||||
|
||||
## Transcription
|
||||
|
||||
### Basic Transcription
|
||||
|
||||
```python
|
||||
from google import genai
|
||||
import os
|
||||
|
||||
client = genai.Client(api_key=os.getenv('GEMINI_API_KEY'))
|
||||
|
||||
# Upload audio
|
||||
myfile = client.files.upload(file='meeting.mp3')
|
||||
|
||||
# Transcribe
|
||||
response = client.models.generate_content(
|
||||
model='gemini-2.5-flash',
|
||||
contents=['Generate a transcript of the speech.', myfile]
|
||||
)
|
||||
print(response.text)
|
||||
```
|
||||
|
||||
### With Timestamps
|
||||
|
||||
```python
|
||||
response = client.models.generate_content(
|
||||
model='gemini-2.5-flash',
|
||||
contents=['Generate transcript with timestamps in MM:SS format.', myfile]
|
||||
)
|
||||
```
|
||||
|
||||
### Multi-Speaker Identification
|
||||
|
||||
```python
|
||||
response = client.models.generate_content(
|
||||
model='gemini-2.5-flash',
|
||||
contents=['Transcribe with speaker labels. Format: [Speaker 1], [Speaker 2], etc.', myfile]
|
||||
)
|
||||
```
|
||||
|
||||
### Segment-Specific Transcription
|
||||
|
||||
```python
|
||||
response = client.models.generate_content(
|
||||
model='gemini-2.5-flash',
|
||||
contents=['Transcribe only the segment from 02:30 to 05:15.', myfile]
|
||||
)
|
||||
```
|
||||
|
||||
## Audio Analysis
|
||||
|
||||
### Summarization
|
||||
|
||||
```python
|
||||
response = client.models.generate_content(
|
||||
model='gemini-2.5-flash',
|
||||
contents=['Summarize key points in 5 bullets with timestamps.', myfile]
|
||||
)
|
||||
```
|
||||
|
||||
### Non-Speech Audio Analysis
|
||||
|
||||
```python
|
||||
# Music analysis
|
||||
response = client.models.generate_content(
|
||||
model='gemini-2.5-flash',
|
||||
contents=['Identify the musical instruments and genre.', myfile]
|
||||
)
|
||||
|
||||
# Environmental sounds
|
||||
response = client.models.generate_content(
|
||||
model='gemini-2.5-flash',
|
||||
contents=['Identify all sounds: voices, music, ambient noise.', myfile]
|
||||
)
|
||||
|
||||
# Birdsong identification
|
||||
response = client.models.generate_content(
|
||||
model='gemini-2.5-flash',
|
||||
contents=['Identify bird species based on their calls.', myfile]
|
||||
)
|
||||
```
|
||||
|
||||
### Timestamp-Based Analysis
|
||||
|
||||
```python
|
||||
response = client.models.generate_content(
|
||||
model='gemini-2.5-flash',
|
||||
contents=['What is discussed from 10:30 to 15:45? Provide key points.', myfile]
|
||||
)
|
||||
```
|
||||
|
||||
## Input Methods
|
||||
|
||||
### File Upload (>20MB or Reuse)
|
||||
|
||||
```python
|
||||
# Upload once, use multiple times
|
||||
myfile = client.files.upload(file='large-audio.mp3')
|
||||
|
||||
# First query
|
||||
response1 = client.models.generate_content(
|
||||
model='gemini-2.5-flash',
|
||||
contents=['Transcribe this', myfile]
|
||||
)
|
||||
|
||||
# Second query (reuses same file)
|
||||
response2 = client.models.generate_content(
|
||||
model='gemini-2.5-flash',
|
||||
contents=['Summarize this', myfile]
|
||||
)
|
||||
```
|
||||
|
||||
### Inline Data (<20MB)
|
||||
|
||||
```python
|
||||
from google.genai import types
|
||||
|
||||
with open('small-audio.mp3', 'rb') as f:
|
||||
audio_bytes = f.read()
|
||||
|
||||
response = client.models.generate_content(
|
||||
model='gemini-2.5-flash',
|
||||
contents=[
|
||||
'Describe this audio',
|
||||
types.Part.from_bytes(data=audio_bytes, mime_type='audio/mp3')
|
||||
]
|
||||
)
|
||||
```
|
||||
|
||||
## Speech Generation (TTS)
|
||||
|
||||
### Available Models
|
||||
|
||||
| Model | Quality | Speed | Cost/1M tokens |
|
||||
|-------|---------|-------|----------------|
|
||||
| `gemini-2.5-flash-native-audio-preview-09-2025` | High | Fast | $10 |
|
||||
| `gemini-2.5-pro` TTS mode | Premium | Slower | $20 |
|
||||
|
||||
### Basic TTS
|
||||
|
||||
```python
|
||||
response = client.models.generate_content(
|
||||
model='gemini-2.5-flash-native-audio-preview-09-2025',
|
||||
contents='Generate audio: Welcome to today\'s episode.'
|
||||
)
|
||||
|
||||
# Save audio
|
||||
with open('output.wav', 'wb') as f:
|
||||
f.write(response.audio_data)
|
||||
```
|
||||
|
||||
### Controllable Voice Style
|
||||
|
||||
```python
|
||||
# Professional tone
|
||||
response = client.models.generate_content(
|
||||
model='gemini-2.5-flash-native-audio-preview-09-2025',
|
||||
contents='Generate audio in a professional, clear tone: Welcome to our quarterly earnings call.'
|
||||
)
|
||||
|
||||
# Casual and friendly
|
||||
response = client.models.generate_content(
|
||||
model='gemini-2.5-flash-native-audio-preview-09-2025',
|
||||
contents='Generate audio in a friendly, conversational tone: Hey there! Let\'s dive into today\'s topic.'
|
||||
)
|
||||
|
||||
# Narrative style
|
||||
response = client.models.generate_content(
|
||||
model='gemini-2.5-flash-native-audio-preview-09-2025',
|
||||
contents='Generate audio in a narrative, storytelling tone: Once upon a time, in a land far away...'
|
||||
)
|
||||
```
|
||||
|
||||
### Voice Control Parameters
|
||||
|
||||
- **Style**: Professional, casual, narrative, conversational
|
||||
- **Pace**: Slow, normal, fast
|
||||
- **Tone**: Friendly, serious, enthusiastic
|
||||
- **Accent**: Natural language control (e.g., "British accent", "Southern drawl")
|
||||
|
||||
## Best Practices
|
||||
|
||||
### File Management
|
||||
|
||||
1. Use File API for files >20MB
|
||||
2. Use File API for repeated queries (saves tokens)
|
||||
3. Files auto-delete after 48 hours
|
||||
4. Clean up manually when done:
|
||||
```python
|
||||
client.files.delete(name=myfile.name)
|
||||
```
|
||||
|
||||
### Prompt Engineering
|
||||
|
||||
**Effective prompts**:
|
||||
- "Transcribe from 02:30 to 03:29 in MM:SS format"
|
||||
- "Identify speakers and extract dialogue with timestamps"
|
||||
- "Summarize key points with relevant timestamps"
|
||||
- "Transcribe and analyze sentiment for each speaker"
|
||||
|
||||
**Context improves accuracy**:
|
||||
- "This is a medical interview - use appropriate terminology"
|
||||
- "Transcribe this legal deposition with precise terminology"
|
||||
- "This is a technical podcast about machine learning"
|
||||
|
||||
**Combined tasks**:
|
||||
- "Transcribe and summarize in bullet points"
|
||||
- "Extract key quotes with timestamps and speaker labels"
|
||||
- "Transcribe and identify action items with timestamps"
|
||||
|
||||
### Cost Optimization
|
||||
|
||||
**Token calculation**:
|
||||
- 1 minute audio = 1,920 tokens
|
||||
- 1 hour audio = 115,200 tokens
|
||||
- 9.5 hours = 1,094,400 tokens
|
||||
|
||||
**Model selection**:
|
||||
- Use `gemini-2.5-flash` ($1/1M tokens) for most tasks
|
||||
- Upgrade to `gemini-2.5-pro` ($3/1M tokens) for complex analysis
|
||||
- For high-volume: `gemini-1.5-flash` ($0.70/1M tokens)
|
||||
|
||||
**Reduce costs**:
|
||||
- Process only relevant segments using timestamps
|
||||
- Use lower-quality audio when possible
|
||||
- Batch multiple short files in one request
|
||||
- Cache context for repeated queries
|
||||
|
||||
### Error Handling
|
||||
|
||||
```python
|
||||
import time
|
||||
|
||||
def transcribe_with_retry(file_path, max_retries=3):
|
||||
"""Transcribe audio with exponential backoff retry"""
|
||||
for attempt in range(max_retries):
|
||||
try:
|
||||
myfile = client.files.upload(file=file_path)
|
||||
response = client.models.generate_content(
|
||||
model='gemini-2.5-flash',
|
||||
contents=['Transcribe with timestamps', myfile]
|
||||
)
|
||||
return response.text
|
||||
except Exception as e:
|
||||
if attempt == max_retries - 1:
|
||||
raise
|
||||
wait_time = 2 ** attempt
|
||||
print(f"Retry {attempt + 1} after {wait_time}s")
|
||||
time.sleep(wait_time)
|
||||
```
|
||||
|
||||
## Common Use Cases
|
||||
|
||||
### 1. Meeting Transcription
|
||||
|
||||
```python
|
||||
response = client.models.generate_content(
|
||||
model='gemini-2.5-flash',
|
||||
contents=[
|
||||
'''Transcribe this meeting with:
|
||||
1. Speaker labels
|
||||
2. Timestamps for topic changes
|
||||
3. Action items highlighted
|
||||
''',
|
||||
myfile
|
||||
]
|
||||
)
|
||||
```
|
||||
|
||||
### 2. Podcast Summary
|
||||
|
||||
```python
|
||||
response = client.models.generate_content(
|
||||
model='gemini-2.5-flash',
|
||||
contents=[
|
||||
'''Create podcast summary with:
|
||||
1. Main topics with timestamps
|
||||
2. Key quotes from each speaker
|
||||
3. Recommended episode highlights
|
||||
''',
|
||||
myfile
|
||||
]
|
||||
)
|
||||
```
|
||||
|
||||
### 3. Interview Analysis
|
||||
|
||||
```python
|
||||
response = client.models.generate_content(
|
||||
model='gemini-2.5-flash',
|
||||
contents=[
|
||||
'''Analyze interview:
|
||||
1. Questions asked with timestamps
|
||||
2. Key responses from interviewee
|
||||
3. Overall sentiment and tone
|
||||
''',
|
||||
myfile
|
||||
]
|
||||
)
|
||||
```
|
||||
|
||||
### 4. Content Verification
|
||||
|
||||
```python
|
||||
response = client.models.generate_content(
|
||||
model='gemini-2.5-flash',
|
||||
contents=[
|
||||
'''Verify audio content:
|
||||
1. Check for specific keywords or phrases
|
||||
2. Identify any compliance issues
|
||||
3. Note any concerning statements with timestamps
|
||||
''',
|
||||
myfile
|
||||
]
|
||||
)
|
||||
```
|
||||
|
||||
### 5. Multilingual Transcription
|
||||
|
||||
```python
|
||||
# Gemini auto-detects language
|
||||
response = client.models.generate_content(
|
||||
model='gemini-2.5-flash',
|
||||
contents=['Transcribe this audio and translate to English if needed.', myfile]
|
||||
)
|
||||
```
|
||||
|
||||
## Token Costs
|
||||
|
||||
**Audio Input** (32 tokens/second):
|
||||
- 1 minute = 1,920 tokens
|
||||
- 10 minutes = 19,200 tokens
|
||||
- 1 hour = 115,200 tokens
|
||||
- 9.5 hours = 1,094,400 tokens
|
||||
|
||||
**Example costs** (Gemini 2.5 Flash at $1/1M):
|
||||
- 1 hour audio: 115,200 tokens = $0.12
|
||||
- Full day podcast (8 hours): 921,600 tokens = $0.92
|
||||
|
||||
## Limitations
|
||||
|
||||
- Maximum 9.5 hours per request
|
||||
- Auto-downsampled to 16 Kbps mono (quality loss)
|
||||
- Files expire after 48 hours
|
||||
- No real-time streaming support
|
||||
- Non-speech audio less accurate than speech
|
||||
558
skills/ai-multimodal/references/image-generation.md
Normal file
558
skills/ai-multimodal/references/image-generation.md
Normal file
@@ -0,0 +1,558 @@
|
||||
# Image Generation Reference
|
||||
|
||||
Comprehensive guide for image creation, editing, and composition using Gemini API.
|
||||
|
||||
## Core Capabilities
|
||||
|
||||
- **Text-to-Image**: Generate images from text prompts
|
||||
- **Image Editing**: Modify existing images with text instructions
|
||||
- **Multi-Image Composition**: Combine up to 3 images
|
||||
- **Iterative Refinement**: Refine images conversationally
|
||||
- **Aspect Ratios**: Multiple formats (1:1, 16:9, 9:16, 4:3, 3:4)
|
||||
- **Style Control**: Control artistic style and quality
|
||||
- **Text in Images**: Limited text rendering (max 25 chars)
|
||||
|
||||
## Model
|
||||
|
||||
**gemini-2.5-flash-image** - Specialized for image generation
|
||||
- Input tokens: 65,536
|
||||
- Output tokens: 32,768
|
||||
- Knowledge cutoff: June 2025
|
||||
- Supports: Text and image inputs, image outputs
|
||||
|
||||
## Quick Start
|
||||
|
||||
### Basic Generation
|
||||
|
||||
```python
|
||||
from google import genai
|
||||
from google.genai import types
|
||||
import os
|
||||
|
||||
client = genai.Client(api_key=os.getenv('GEMINI_API_KEY'))
|
||||
|
||||
response = client.models.generate_content(
|
||||
model='gemini-2.5-flash-image',
|
||||
contents='A serene mountain landscape at sunset with snow-capped peaks',
|
||||
config=types.GenerateContentConfig(
|
||||
response_modalities=['image'],
|
||||
aspect_ratio='16:9'
|
||||
)
|
||||
)
|
||||
|
||||
# Save image
|
||||
for i, part in enumerate(response.candidates[0].content.parts):
|
||||
if part.inline_data:
|
||||
with open(f'output-{i}.png', 'wb') as f:
|
||||
f.write(part.inline_data.data)
|
||||
```
|
||||
|
||||
## Aspect Ratios
|
||||
|
||||
| Ratio | Resolution | Use Case | Token Cost |
|
||||
|-------|-----------|----------|------------|
|
||||
| 1:1 | 1024×1024 | Social media, avatars | 1290 |
|
||||
| 16:9 | 1344×768 | Landscapes, banners | 1290 |
|
||||
| 9:16 | 768×1344 | Mobile, portraits | 1290 |
|
||||
| 4:3 | 1152×896 | Traditional media | 1290 |
|
||||
| 3:4 | 896×1152 | Vertical posters | 1290 |
|
||||
|
||||
All ratios cost the same: 1,290 tokens per image.
|
||||
|
||||
## Response Modalities
|
||||
|
||||
### Image Only
|
||||
|
||||
```python
|
||||
config = types.GenerateContentConfig(
|
||||
response_modalities=['image'],
|
||||
aspect_ratio='1:1'
|
||||
)
|
||||
```
|
||||
|
||||
### Text Only (No Image)
|
||||
|
||||
```python
|
||||
config = types.GenerateContentConfig(
|
||||
response_modalities=['text']
|
||||
)
|
||||
# Returns text description instead of generating image
|
||||
```
|
||||
|
||||
### Both Image and Text
|
||||
|
||||
```python
|
||||
config = types.GenerateContentConfig(
|
||||
response_modalities=['image', 'text'],
|
||||
aspect_ratio='16:9'
|
||||
)
|
||||
# Returns both generated image and description
|
||||
```
|
||||
|
||||
## Image Editing
|
||||
|
||||
### Modify Existing Image
|
||||
|
||||
```python
|
||||
import PIL.Image
|
||||
|
||||
# Load original
|
||||
img = PIL.Image.open('original.png')
|
||||
|
||||
# Edit with instructions
|
||||
response = client.models.generate_content(
|
||||
model='gemini-2.5-flash-image',
|
||||
contents=[
|
||||
'Add a red balloon floating in the sky',
|
||||
img
|
||||
],
|
||||
    config=types.GenerateContentConfig(
        response_modalities=['Image'],
        image_config=types.ImageConfig(
            aspect_ratio='16:9'
        )
    )
|
||||
)
|
||||
```
|
||||
|
||||
### Style Transfer
|
||||
|
||||
```python
|
||||
img = PIL.Image.open('photo.jpg')
|
||||
|
||||
response = client.models.generate_content(
|
||||
model='gemini-2.5-flash-image',
|
||||
contents=[
|
||||
'Transform this into an oil painting style',
|
||||
img
|
||||
]
|
||||
)
|
||||
```
|
||||
|
||||
### Object Addition/Removal
|
||||
|
||||
```python
|
||||
# Add object
|
||||
response = client.models.generate_content(
|
||||
model='gemini-2.5-flash-image',
|
||||
contents=[
|
||||
'Add a vintage car parked on the street',
|
||||
img
|
||||
]
|
||||
)
|
||||
|
||||
# Remove object
|
||||
response = client.models.generate_content(
|
||||
model='gemini-2.5-flash-image',
|
||||
contents=[
|
||||
'Remove the person on the left side',
|
||||
img
|
||||
]
|
||||
)
|
||||
```
|
||||
|
||||
## Multi-Image Composition
|
||||
|
||||
### Combine Multiple Images
|
||||
|
||||
```python
|
||||
img1 = PIL.Image.open('background.png')
|
||||
img2 = PIL.Image.open('foreground.png')
|
||||
img3 = PIL.Image.open('overlay.png')
|
||||
|
||||
response = client.models.generate_content(
|
||||
model='gemini-2.5-flash-image',
|
||||
contents=[
|
||||
'Combine these images into a cohesive scene',
|
||||
img1,
|
||||
img2,
|
||||
img3
|
||||
],
|
||||
    config=types.GenerateContentConfig(
        response_modalities=['Image'],
        image_config=types.ImageConfig(
            aspect_ratio='16:9'
        )
    )
|
||||
)
|
||||
```
|
||||
|
||||
**Note**: Recommended maximum 3 input images for best results.
|
||||
|
||||
## Prompt Engineering
|
||||
|
||||
### Effective Prompt Structure
|
||||
|
||||
**Three key elements**:
|
||||
1. **Subject**: What to generate
|
||||
2. **Context**: Environmental setting
|
||||
3. **Style**: Artistic treatment
|
||||
|
||||
**Example**: "A robot [subject] in a futuristic city [context], cyberpunk style with neon lighting [style]"
|
||||
|
||||
### Quality Modifiers
|
||||
|
||||
**Technical terms**:
|
||||
- "4K", "8K", "high resolution"
|
||||
- "HDR", "high dynamic range"
|
||||
- "professional photography"
|
||||
- "studio lighting"
|
||||
- "ultra detailed"
|
||||
|
||||
**Camera settings**:
|
||||
- "35mm lens", "50mm lens"
|
||||
- "shallow depth of field"
|
||||
- "wide angle shot"
|
||||
- "macro photography"
|
||||
- "golden hour lighting"
|
||||
|
||||
### Style Keywords
|
||||
|
||||
**Art styles**:
|
||||
- "oil painting", "watercolor", "sketch"
|
||||
- "digital art", "concept art"
|
||||
- "photorealistic", "hyperrealistic"
|
||||
- "minimalist", "abstract"
|
||||
- "cyberpunk", "steampunk", "fantasy"
|
||||
|
||||
**Mood and atmosphere**:
|
||||
- "dramatic lighting", "soft lighting"
|
||||
- "moody", "bright and cheerful"
|
||||
- "mysterious", "whimsical"
|
||||
- "dark and gritty", "pastel colors"
|
||||
|
||||
### Subject Description
|
||||
|
||||
**Be specific**:
|
||||
- ❌ "A cat"
|
||||
- ✅ "A fluffy orange tabby cat with green eyes"
|
||||
|
||||
**Add context**:
|
||||
- ❌ "A building"
|
||||
- ✅ "A modern glass skyscraper reflecting sunset clouds"
|
||||
|
||||
**Include details**:
|
||||
- ❌ "A person"
|
||||
- ✅ "A young woman in a red dress holding an umbrella"
|
||||
|
||||
### Composition and Framing
|
||||
|
||||
**Camera angles**:
|
||||
- "bird's eye view", "aerial shot"
|
||||
- "low angle", "high angle"
|
||||
- "close-up", "wide shot"
|
||||
- "centered composition"
|
||||
- "rule of thirds"
|
||||
|
||||
**Perspective**:
|
||||
- "first person view"
|
||||
- "third person perspective"
|
||||
- "isometric view"
|
||||
- "forced perspective"
|
||||
|
||||
### Text in Images
|
||||
|
||||
**Limitations**:
|
||||
- Maximum 25 characters total
|
||||
- Up to 3 distinct text phrases
|
||||
- Works best with simple text
|
||||
|
||||
**Best practices**:
|
||||
```python
|
||||
response = client.models.generate_content(
|
||||
model='gemini-2.5-flash-image',
|
||||
contents='A vintage poster with bold text "EXPLORE" at the top, mountain landscape, retro 1950s style'
|
||||
)
|
||||
```
|
||||
|
||||
**Font control**:
|
||||
- "bold sans-serif title"
|
||||
- "handwritten script"
|
||||
- "vintage letterpress"
|
||||
- "modern minimalist font"
|
||||
|
||||
## Advanced Techniques
|
||||
|
||||
### Iterative Refinement
|
||||
|
||||
```python
|
||||
# Initial generation
|
||||
response1 = client.models.generate_content(
|
||||
model='gemini-2.5-flash-image',
|
||||
contents='A futuristic city skyline'
|
||||
)
|
||||
|
||||
# Save first version
|
||||
with open('v1.png', 'wb') as f:
|
||||
f.write(response1.candidates[0].content.parts[0].inline_data.data)
|
||||
|
||||
# Refine
|
||||
img = PIL.Image.open('v1.png')
|
||||
response2 = client.models.generate_content(
|
||||
model='gemini-2.5-flash-image',
|
||||
contents=[
|
||||
'Add flying vehicles and neon signs',
|
||||
img
|
||||
]
|
||||
)
|
||||
```
|
||||
|
||||
### Negative Prompts (Indirect)
|
||||
|
||||
```python
|
||||
# Instead of "no blur", be specific about what you want
|
||||
response = client.models.generate_content(
|
||||
model='gemini-2.5-flash-image',
|
||||
contents='A crystal clear, sharp photograph of a diamond ring with perfect focus and high detail'
|
||||
)
|
||||
```
|
||||
|
||||
### Consistent Style Across Images
|
||||
|
||||
```python
|
||||
base_prompt = "Digital art, vibrant colors, cel-shaded style, clean lines"
|
||||
|
||||
prompts = [
|
||||
f"{base_prompt}, a warrior character",
|
||||
f"{base_prompt}, a mage character",
|
||||
f"{base_prompt}, a rogue character"
|
||||
]
|
||||
|
||||
for i, prompt in enumerate(prompts):
|
||||
response = client.models.generate_content(
|
||||
model='gemini-2.5-flash-image',
|
||||
contents=prompt
|
||||
)
|
||||
# Save each character
|
||||
```
|
||||
|
||||
## Safety Settings
|
||||
|
||||
### Configure Safety Filters
|
||||
|
||||
```python
|
||||
config = types.GenerateContentConfig(
|
||||
    response_modalities=['Image'],
|
||||
safety_settings=[
|
||||
types.SafetySetting(
|
||||
category=types.HarmCategory.HARM_CATEGORY_HATE_SPEECH,
|
||||
threshold=types.HarmBlockThreshold.BLOCK_MEDIUM_AND_ABOVE
|
||||
),
|
||||
types.SafetySetting(
|
||||
category=types.HarmCategory.HARM_CATEGORY_SEXUALLY_EXPLICIT,
|
||||
threshold=types.HarmBlockThreshold.BLOCK_MEDIUM_AND_ABOVE
|
||||
)
|
||||
]
|
||||
)
|
||||
```
|
||||
|
||||
### Available Categories
|
||||
|
||||
- `HARM_CATEGORY_HATE_SPEECH`
|
||||
- `HARM_CATEGORY_DANGEROUS_CONTENT`
|
||||
- `HARM_CATEGORY_HARASSMENT`
|
||||
- `HARM_CATEGORY_SEXUALLY_EXPLICIT`
|
||||
|
||||
### Thresholds
|
||||
|
||||
- `BLOCK_NONE`: No blocking
|
||||
- `BLOCK_LOW_AND_ABOVE`: Block low probability and above
|
||||
- `BLOCK_MEDIUM_AND_ABOVE`: Block medium and above (default)
|
||||
- `BLOCK_ONLY_HIGH`: Block only high probability
|
||||
|
||||
## Common Use Cases
|
||||
|
||||
### 1. Marketing Assets
|
||||
|
||||
```python
|
||||
response = client.models.generate_content(
|
||||
model='gemini-2.5-flash-image',
|
||||
contents='''Professional product photography:
|
||||
- Sleek smartphone on minimalist white surface
|
||||
- Dramatic side lighting creating subtle shadows
|
||||
- Shallow depth of field, crisp focus
|
||||
- Clean, modern aesthetic
|
||||
- 4K quality
|
||||
''',
|
||||
    config=types.GenerateContentConfig(
        response_modalities=['Image'],
        image_config=types.ImageConfig(
            aspect_ratio='4:3'
        )
    )
|
||||
)
|
||||
```
|
||||
|
||||
### 2. Concept Art
|
||||
|
||||
```python
|
||||
response = client.models.generate_content(
|
||||
model='gemini-2.5-flash-image',
|
||||
contents='''Fantasy concept art:
|
||||
- Ancient floating islands connected by chains
|
||||
- Waterfalls cascading into clouds below
|
||||
- Magical crystals glowing on the islands
|
||||
- Epic scale, dramatic lighting
|
||||
- Detailed digital painting style
|
||||
''',
|
||||
    config=types.GenerateContentConfig(
        response_modalities=['Image'],
        image_config=types.ImageConfig(
            aspect_ratio='16:9'
        )
    )
|
||||
)
|
||||
```
|
||||
|
||||
### 3. Social Media Graphics
|
||||
|
||||
```python
|
||||
response = client.models.generate_content(
|
||||
model='gemini-2.5-flash-image',
|
||||
contents='''Instagram post design:
|
||||
- Pastel gradient background (pink to blue)
|
||||
- Motivational quote layout
|
||||
- Modern minimalist style
|
||||
- Clean typography
|
||||
- Mobile-friendly composition
|
||||
''',
|
||||
    config=types.GenerateContentConfig(
        response_modalities=['Image'],
        image_config=types.ImageConfig(
            aspect_ratio='1:1'
        )
    )
|
||||
)
|
||||
```
|
||||
|
||||
### 4. Illustration
|
||||
|
||||
```python
|
||||
response = client.models.generate_content(
|
||||
model='gemini-2.5-flash-image',
|
||||
contents='''Children's book illustration:
|
||||
- Friendly cartoon dragon reading a book
|
||||
- Bright, cheerful colors
|
||||
- Soft, rounded shapes
|
||||
- Whimsical forest background
|
||||
- Warm, inviting atmosphere
|
||||
''',
|
||||
    config=types.GenerateContentConfig(
        response_modalities=['Image'],
        image_config=types.ImageConfig(
            aspect_ratio='4:3'
        )
    )
|
||||
)
|
||||
```
|
||||
|
||||
### 5. UI/UX Mockups
|
||||
|
||||
```python
|
||||
response = client.models.generate_content(
|
||||
model='gemini-2.5-flash-image',
|
||||
contents='''Modern mobile app interface:
|
||||
- Clean dashboard design
|
||||
- Card-based layout
|
||||
- Soft shadows and gradients
|
||||
- Contemporary color scheme (blue and white)
|
||||
- Professional fintech aesthetic
|
||||
''',
|
||||
    config=types.GenerateContentConfig(
        response_modalities=['Image'],
        image_config=types.ImageConfig(
            aspect_ratio='9:16'
        )
    )
|
||||
)
|
||||
```
|
||||
|
||||
## Best Practices
|
||||
|
||||
### Prompt Quality
|
||||
|
||||
1. **Be specific**: More detail = better results
|
||||
2. **Order matters**: Most important elements first
|
||||
3. **Use examples**: Reference known styles or artists
|
||||
4. **Avoid contradictions**: Don't ask for opposing styles
|
||||
5. **Test and iterate**: Refine prompts based on results
|
||||
|
||||
### File Management
|
||||
|
||||
```python
|
||||
# Save with descriptive names
|
||||
timestamp = int(time.time())
|
||||
filename = f'generated_{timestamp}_{aspect_ratio}.png'
|
||||
|
||||
with open(filename, 'wb') as f:
|
||||
f.write(image_data)
|
||||
```
|
||||
|
||||
### Cost Optimization
|
||||
|
||||
**Token costs**:
|
||||
- 1 image: 1,290 tokens = $0.00129 (Flash Image at $1/1M)
|
||||
- 10 images: 12,900 tokens = $0.0129
|
||||
- 100 images: 129,000 tokens = $0.129
|
||||
|
||||
**Strategies**:
|
||||
- Generate fewer iterations
|
||||
- Use text modality first to validate concept
|
||||
- Batch similar requests
|
||||
- Cache prompts for consistent style
|
||||
|
||||
## Error Handling
|
||||
|
||||
### Safety Filter Blocking
|
||||
|
||||
```python
|
||||
try:
|
||||
response = client.models.generate_content(
|
||||
model='gemini-2.5-flash-image',
|
||||
contents=prompt
|
||||
)
|
||||
except Exception as e:
|
||||
# Check block reason
|
||||
if hasattr(e, 'prompt_feedback'):
|
||||
print(f"Blocked: {e.prompt_feedback.block_reason}")
|
||||
# Modify prompt and retry
|
||||
```
|
||||
|
||||
### Token Limit Exceeded
|
||||
|
||||
```python
|
||||
# Keep prompts concise
|
||||
if len(prompt) > 1000:
|
||||
# Truncate or simplify
|
||||
prompt = prompt[:1000]
|
||||
```
|
||||
|
||||
## Limitations
|
||||
|
||||
- Maximum 3 input images for composition
|
||||
- Text rendering limited (25 chars max)
|
||||
- No video or animation generation
|
||||
- Regional restrictions (child images in EEA, CH, UK)
|
||||
- Optimal language support: English, Spanish (Mexico), Japanese, Mandarin, Hindi
|
||||
- No real-time generation
|
||||
- Cannot perfectly replicate specific people or copyrighted characters
|
||||
|
||||
## Troubleshooting
|
||||
|
||||
### aspect_ratio Parameter Error
|
||||
|
||||
**Error**: `Extra inputs are not permitted [type=extra_forbidden, input_value='1:1', input_type=str]`
|
||||
|
||||
**Cause**: The `aspect_ratio` parameter must be nested inside an `image_config` object, not passed directly to `GenerateContentConfig`.
|
||||
|
||||
**Incorrect Usage**:
|
||||
```python
|
||||
# ❌ This will fail
|
||||
config = types.GenerateContentConfig(
|
||||
response_modalities=['image'],
|
||||
aspect_ratio='16:9' # Wrong - not a direct parameter
|
||||
)
|
||||
```
|
||||
|
||||
**Correct Usage**:
|
||||
```python
|
||||
# ✅ Correct implementation
|
||||
config = types.GenerateContentConfig(
|
||||
response_modalities=['Image'], # Note: Capital 'I'
|
||||
image_config=types.ImageConfig(
|
||||
aspect_ratio='16:9'
|
||||
)
|
||||
)
|
||||
```
|
||||
|
||||
### Response Modality Case Sensitivity
|
||||
|
||||
The `response_modalities` parameter expects capital case values:
|
||||
- ✅ Correct: `['Image']`, `['Text']`, `['Image', 'Text']`
|
||||
- ❌ Wrong: `['image']`, `['text']`
|
||||
502
skills/ai-multimodal/references/video-analysis.md
Normal file
502
skills/ai-multimodal/references/video-analysis.md
Normal file
@@ -0,0 +1,502 @@
|
||||
# Video Analysis Reference
|
||||
|
||||
Comprehensive guide for video understanding, temporal analysis, and YouTube processing using Gemini API.
|
||||
|
||||
## Core Capabilities
|
||||
|
||||
- **Video Summarization**: Create concise summaries
|
||||
- **Question Answering**: Answer specific questions about content
|
||||
- **Transcription**: Audio transcription with visual descriptions
|
||||
- **Timestamp References**: Query specific moments (MM:SS format)
|
||||
- **Video Clipping**: Process specific segments
|
||||
- **Scene Detection**: Identify scene changes and transitions
|
||||
- **Multiple Videos**: Compare up to 10 videos (2.5+)
|
||||
- **YouTube Support**: Analyze YouTube videos directly
|
||||
- **Custom Frame Rate**: Adjust FPS sampling
|
||||
|
||||
## Supported Formats
|
||||
|
||||
- MP4, MPEG, MOV, AVI, FLV, MPG, WebM, WMV, 3GPP
|
||||
|
||||
## Model Selection
|
||||
|
||||
### Gemini 2.5 Series
|
||||
- **gemini-2.5-pro**: Best quality, 1M-2M context
|
||||
- **gemini-2.5-flash**: Balanced, 1M-2M context
|
||||
- **gemini-2.5-flash-preview-09-2025**: Preview features, 1M context
|
||||
|
||||
### Gemini 2.0 Series
|
||||
- **gemini-2.0-flash**: Fast processing
|
||||
- **gemini-2.0-flash-lite**: Lightweight option
|
||||
|
||||
### Context Windows
|
||||
- **2M token models**: ~2 hours (default) or ~6 hours (low-res)
|
||||
- **1M token models**: ~1 hour (default) or ~3 hours (low-res)
|
||||
|
||||
## Basic Video Analysis
|
||||
|
||||
### Local Video
|
||||
|
||||
```python
|
||||
from google import genai
|
||||
import os
|
||||
|
||||
client = genai.Client(api_key=os.getenv('GEMINI_API_KEY'))
|
||||
|
||||
# Upload video (File API for >20MB)
|
||||
myfile = client.files.upload(file='video.mp4')
|
||||
|
||||
# Wait for processing
|
||||
import time
|
||||
while myfile.state.name == 'PROCESSING':
|
||||
time.sleep(1)
|
||||
myfile = client.files.get(name=myfile.name)
|
||||
|
||||
if myfile.state.name == 'FAILED':
|
||||
raise ValueError('Video processing failed')
|
||||
|
||||
# Analyze
|
||||
response = client.models.generate_content(
|
||||
model='gemini-2.5-flash',
|
||||
contents=['Summarize this video in 3 key points', myfile]
|
||||
)
|
||||
print(response.text)
|
||||
```
|
||||
|
||||
### YouTube Video
|
||||
|
||||
```python
|
||||
from google.genai import types
|
||||
|
||||
response = client.models.generate_content(
|
||||
model='gemini-2.5-flash',
|
||||
contents=[
|
||||
'Summarize the main topics discussed',
|
||||
types.Part.from_uri(
|
||||
uri='https://www.youtube.com/watch?v=VIDEO_ID',
|
||||
mime_type='video/mp4'
|
||||
)
|
||||
]
|
||||
)
|
||||
```
|
||||
|
||||
### Inline Video (<20MB)
|
||||
|
||||
```python
|
||||
with open('short-clip.mp4', 'rb') as f:
|
||||
video_bytes = f.read()
|
||||
|
||||
response = client.models.generate_content(
|
||||
model='gemini-2.5-flash',
|
||||
contents=[
|
||||
'What happens in this video?',
|
||||
types.Part.from_bytes(data=video_bytes, mime_type='video/mp4')
|
||||
]
|
||||
)
|
||||
```
|
||||
|
||||
## Advanced Features
|
||||
|
||||
### Video Clipping
|
||||
|
||||
```python
|
||||
# Analyze specific time range
|
||||
response = client.models.generate_content(
|
||||
model='gemini-2.5-flash',
|
||||
contents=[
|
||||
'Summarize this segment',
|
||||
types.Part.from_video_metadata(
|
||||
file_uri=myfile.uri,
|
||||
start_offset='40s',
|
||||
end_offset='80s'
|
||||
)
|
||||
]
|
||||
)
|
||||
```
|
||||
|
||||
### Custom Frame Rate
|
||||
|
||||
```python
|
||||
# Lower FPS for static content (saves tokens)
|
||||
response = client.models.generate_content(
|
||||
model='gemini-2.5-flash',
|
||||
contents=[
|
||||
'Analyze this presentation',
|
||||
types.Part.from_video_metadata(
|
||||
file_uri=myfile.uri,
|
||||
fps=0.5 # Sample every 2 seconds
|
||||
)
|
||||
]
|
||||
)
|
||||
|
||||
# Higher FPS for fast-moving content
|
||||
response = client.models.generate_content(
|
||||
model='gemini-2.5-flash',
|
||||
contents=[
|
||||
'Analyze rapid movements in this sports video',
|
||||
types.Part.from_video_metadata(
|
||||
file_uri=myfile.uri,
|
||||
fps=5 # Sample 5 times per second
|
||||
)
|
||||
]
|
||||
)
|
||||
```
|
||||
|
||||
### Multiple Videos (2.5+)
|
||||
|
||||
```python
|
||||
video1 = client.files.upload(file='demo1.mp4')
|
||||
video2 = client.files.upload(file='demo2.mp4')
|
||||
|
||||
# Wait for processing
|
||||
for video in [video1, video2]:
|
||||
while video.state.name == 'PROCESSING':
|
||||
time.sleep(1)
|
||||
video = client.files.get(name=video.name)
|
||||
|
||||
response = client.models.generate_content(
|
||||
model='gemini-2.5-pro',
|
||||
contents=[
|
||||
'Compare these two product demos. Which explains features better?',
|
||||
video1,
|
||||
video2
|
||||
]
|
||||
)
|
||||
```
|
||||
|
||||
## Temporal Understanding
|
||||
|
||||
### Timestamp-Based Questions
|
||||
|
||||
```python
|
||||
response = client.models.generate_content(
|
||||
model='gemini-2.5-flash',
|
||||
contents=[
|
||||
'What happens at 01:15 and how does it relate to 02:30?',
|
||||
myfile
|
||||
]
|
||||
)
|
||||
```
|
||||
|
||||
### Timeline Creation
|
||||
|
||||
```python
|
||||
response = client.models.generate_content(
|
||||
model='gemini-2.5-flash',
|
||||
contents=[
|
||||
'''Create a timeline with timestamps:
|
||||
- Key events
|
||||
- Scene changes
|
||||
- Important moments
|
||||
Format: MM:SS - Description
|
||||
''',
|
||||
myfile
|
||||
]
|
||||
)
|
||||
```
|
||||
|
||||
### Scene Detection
|
||||
|
||||
```python
|
||||
response = client.models.generate_content(
|
||||
model='gemini-2.5-flash',
|
||||
contents=[
|
||||
'Identify all scene changes with timestamps and describe each scene',
|
||||
myfile
|
||||
]
|
||||
)
|
||||
```
|
||||
|
||||
## Transcription
|
||||
|
||||
### Basic Transcription
|
||||
|
||||
```python
|
||||
response = client.models.generate_content(
|
||||
model='gemini-2.5-flash',
|
||||
contents=[
|
||||
'Transcribe the audio from this video',
|
||||
myfile
|
||||
]
|
||||
)
|
||||
```
|
||||
|
||||
### With Visual Descriptions
|
||||
|
||||
```python
|
||||
response = client.models.generate_content(
|
||||
model='gemini-2.5-flash',
|
||||
contents=[
|
||||
'''Transcribe with visual context:
|
||||
- Audio transcription
|
||||
- Visual descriptions of important moments
|
||||
- Timestamps for salient events
|
||||
''',
|
||||
myfile
|
||||
]
|
||||
)
|
||||
```
|
||||
|
||||
### Speaker Identification
|
||||
|
||||
```python
|
||||
response = client.models.generate_content(
|
||||
model='gemini-2.5-flash',
|
||||
contents=[
|
||||
'Transcribe with speaker labels and timestamps',
|
||||
myfile
|
||||
]
|
||||
)
|
||||
```
|
||||
|
||||
## Common Use Cases
|
||||
|
||||
### 1. Video Summarization
|
||||
|
||||
```python
|
||||
response = client.models.generate_content(
|
||||
model='gemini-2.5-flash',
|
||||
contents=[
|
||||
'''Summarize this video:
|
||||
1. Main topic and purpose
|
||||
2. Key points with timestamps
|
||||
3. Conclusion or call-to-action
|
||||
''',
|
||||
myfile
|
||||
]
|
||||
)
|
||||
```
|
||||
|
||||
### 2. Educational Content
|
||||
|
||||
```python
|
||||
response = client.models.generate_content(
|
||||
model='gemini-2.5-flash',
|
||||
contents=[
|
||||
'''Create educational materials:
|
||||
1. List key concepts taught
|
||||
2. Create 5 quiz questions with answers
|
||||
3. Provide timestamp for each concept
|
||||
''',
|
||||
myfile
|
||||
]
|
||||
)
|
||||
```
|
||||
|
||||
### 3. Action Detection
|
||||
|
||||
```python
|
||||
response = client.models.generate_content(
|
||||
model='gemini-2.5-flash',
|
||||
contents=[
|
||||
'List all actions performed in this tutorial with timestamps',
|
||||
myfile
|
||||
]
|
||||
)
|
||||
```
|
||||
|
||||
### 4. Content Moderation
|
||||
|
||||
```python
|
||||
response = client.models.generate_content(
|
||||
model='gemini-2.5-flash',
|
||||
contents=[
|
||||
'''Review video content:
|
||||
1. Identify any problematic content
|
||||
2. Note timestamps of concerns
|
||||
3. Provide content rating recommendation
|
||||
''',
|
||||
myfile
|
||||
]
|
||||
)
|
||||
```
|
||||
|
||||
### 5. Interview Analysis
|
||||
|
||||
```python
|
||||
response = client.models.generate_content(
|
||||
model='gemini-2.5-flash',
|
||||
contents=[
|
||||
'''Analyze interview:
|
||||
1. Questions asked (timestamps)
|
||||
2. Key responses
|
||||
3. Candidate body language and demeanor
|
||||
4. Overall assessment
|
||||
''',
|
||||
myfile
|
||||
]
|
||||
)
|
||||
```
|
||||
|
||||
### 6. Sports Analysis
|
||||
|
||||
```python
|
||||
response = client.models.generate_content(
|
||||
model='gemini-2.5-flash',
|
||||
contents=[
|
||||
'''Analyze sports video:
|
||||
1. Key plays with timestamps
|
||||
2. Player movements and positioning
|
||||
3. Game strategy observations
|
||||
''',
|
||||
types.Part.from_video_metadata(
|
||||
file_uri=myfile.uri,
|
||||
fps=5 # Higher FPS for fast action
|
||||
)
|
||||
]
|
||||
)
|
||||
```
|
||||
|
||||
## YouTube Specific Features
|
||||
|
||||
### Public Video Requirements
|
||||
|
||||
- Video must be public (not private or unlisted)
|
||||
- No age-restricted content
|
||||
- Valid video ID required
|
||||
|
||||
### Usage Example
|
||||
|
||||
```python
|
||||
# YouTube URL
|
||||
youtube_uri = 'https://www.youtube.com/watch?v=dQw4w9WgXcQ'
|
||||
|
||||
response = client.models.generate_content(
|
||||
model='gemini-2.5-flash',
|
||||
contents=[
|
||||
'Create chapter markers with timestamps',
|
||||
types.Part.from_uri(uri=youtube_uri, mime_type='video/mp4')
|
||||
]
|
||||
)
|
||||
```
|
||||
|
||||
### Rate Limits
|
||||
|
||||
- **Free tier**: 8 hours of YouTube video per day
|
||||
- **Paid tier**: No length-based limits
|
||||
- Public videos only
|
||||
|
||||
## Token Calculation
|
||||
|
||||
Video tokens depend on resolution and FPS:
|
||||
|
||||
**Default resolution** (~300 tokens/second):
|
||||
- 1 minute = 18,000 tokens
|
||||
- 10 minutes = 180,000 tokens
|
||||
- 1 hour = 1,080,000 tokens
|
||||
|
||||
**Low resolution** (~100 tokens/second):
|
||||
- 1 minute = 6,000 tokens
|
||||
- 10 minutes = 60,000 tokens
|
||||
- 1 hour = 360,000 tokens
|
||||
|
||||
**Context windows**:
|
||||
- 2M tokens ≈ 2 hours (default) or 6 hours (low-res)
|
||||
- 1M tokens ≈ 1 hour (default) or 3 hours (low-res)
|
||||
|
||||
## Best Practices
|
||||
|
||||
### File Management
|
||||
|
||||
1. Use File API for videos >20MB (most videos)
|
||||
2. Wait for ACTIVE state before analysis
|
||||
3. Files auto-delete after 48 hours
|
||||
4. Clean up manually:
|
||||
```python
|
||||
client.files.delete(name=myfile.name)
|
||||
```
|
||||
|
||||
### Optimization Strategies
|
||||
|
||||
**Reduce token usage**:
|
||||
- Process specific segments using start/end offsets
|
||||
- Use lower FPS for static content
|
||||
- Use low-resolution mode for long videos
|
||||
- Split very long videos into chunks
|
||||
|
||||
**Improve accuracy**:
|
||||
- Provide context in prompts
|
||||
- Use higher FPS for fast-moving content
|
||||
- Use Pro model for complex analysis
|
||||
- Be specific about what to extract
|
||||
|
||||
### Prompt Engineering
|
||||
|
||||
**Effective prompts**:
|
||||
- "Summarize key points with timestamps in MM:SS format"
|
||||
- "Identify all scene changes and describe each scene"
|
||||
- "Extract action items mentioned with timestamps"
|
||||
- "Compare these two videos on: X, Y, Z criteria"
|
||||
|
||||
**Structured output**:
|
||||
```python
|
||||
from pydantic import BaseModel
|
||||
from typing import List
|
||||
|
||||
class VideoEvent(BaseModel):
|
||||
timestamp: str # MM:SS format
|
||||
description: str
|
||||
category: str
|
||||
|
||||
class VideoAnalysis(BaseModel):
|
||||
summary: str
|
||||
events: List[VideoEvent]
|
||||
duration: str
|
||||
|
||||
response = client.models.generate_content(
|
||||
model='gemini-2.5-flash',
|
||||
contents=['Analyze this video', myfile],
|
||||
config=genai.types.GenerateContentConfig(
|
||||
response_mime_type='application/json',
|
||||
response_schema=VideoAnalysis
|
||||
)
|
||||
)
|
||||
```
|
||||
|
||||
### Error Handling
|
||||
|
||||
```python
|
||||
import time
|
||||
|
||||
def upload_and_process_video(file_path, max_wait=300):
|
||||
"""Upload video and wait for processing"""
|
||||
myfile = client.files.upload(file=file_path)
|
||||
|
||||
elapsed = 0
|
||||
while myfile.state.name == 'PROCESSING' and elapsed < max_wait:
|
||||
time.sleep(5)
|
||||
myfile = client.files.get(name=myfile.name)
|
||||
elapsed += 5
|
||||
|
||||
if myfile.state.name == 'FAILED':
|
||||
raise ValueError(f'Video processing failed: {myfile.state.name}')
|
||||
|
||||
if myfile.state.name == 'PROCESSING':
|
||||
raise TimeoutError(f'Processing timeout after {max_wait}s')
|
||||
|
||||
return myfile
|
||||
```
|
||||
|
||||
## Cost Optimization
|
||||
|
||||
**Token costs** (Gemini 2.5 Flash at $1/1M):
|
||||
- 1 minute video (default): 18,000 tokens = $0.018
|
||||
- 10 minute video: 180,000 tokens = $0.18
|
||||
- 1 hour video: 1,080,000 tokens = $1.08
|
||||
|
||||
**Strategies**:
|
||||
- Use video clipping for specific segments
|
||||
- Lower FPS for static content
|
||||
- Use low-resolution mode for long videos
|
||||
- Batch related queries on same video
|
||||
- Use context caching for repeated queries
|
||||
|
||||
## Limitations
|
||||
|
||||
- Maximum 6 hours (low-res) or 2 hours (default)
|
||||
- YouTube videos must be public
|
||||
- No live streaming analysis
|
||||
- Files expire after 48 hours
|
||||
- Processing time varies by video length
|
||||
- No real-time processing
|
||||
- Limited to 10 videos per request (2.5+)
|
||||
483
skills/ai-multimodal/references/vision-understanding.md
Normal file
483
skills/ai-multimodal/references/vision-understanding.md
Normal file
@@ -0,0 +1,483 @@
|
||||
# Vision Understanding Reference
|
||||
|
||||
Comprehensive guide for image analysis, object detection, and visual understanding using Gemini API.
|
||||
|
||||
## Core Capabilities
|
||||
|
||||
- **Captioning**: Generate descriptive text for images
|
||||
- **Classification**: Categorize and identify content
|
||||
- **Visual Q&A**: Answer questions about images
|
||||
- **Object Detection**: Locate objects with bounding boxes (2.0+)
|
||||
- **Segmentation**: Create pixel-level masks (2.5+)
|
||||
- **Multi-image**: Compare up to 3,600 images
|
||||
- **OCR**: Extract text from images
|
||||
- **Document Understanding**: Process PDFs with vision
|
||||
|
||||
## Supported Formats
|
||||
|
||||
- **Images**: PNG, JPEG, WEBP, HEIC, HEIF
|
||||
- **Documents**: PDF (up to 1,000 pages)
|
||||
- **Size Limits**:
|
||||
- Inline: 20MB max total request
|
||||
- File API: 2GB per file
|
||||
- Max images: 3,600 per request
|
||||
|
||||
## Model Selection
|
||||
|
||||
### Gemini 2.5 Series
|
||||
- **gemini-2.5-pro**: Best quality, segmentation + detection
|
||||
- **gemini-2.5-flash**: Fast, efficient, all features
|
||||
- **gemini-2.5-flash-lite**: Lightweight, all features
|
||||
|
||||
### Gemini 2.0 Series
|
||||
- **gemini-2.0-flash**: Object detection support
|
||||
- **gemini-2.0-flash-lite**: Lightweight detection
|
||||
|
||||
### Feature Requirements
|
||||
- **Segmentation**: Requires 2.5+ models
|
||||
- **Object Detection**: Requires 2.0+ models
|
||||
- **Multi-image**: All models (up to 3,600 images)
|
||||
|
||||
## Basic Image Analysis
|
||||
|
||||
### Image Captioning
|
||||
|
||||
```python
|
||||
from google import genai
|
||||
import os
|
||||
|
||||
client = genai.Client(api_key=os.getenv('GEMINI_API_KEY'))
|
||||
|
||||
# Local file
|
||||
with open('image.jpg', 'rb') as f:
|
||||
img_bytes = f.read()
|
||||
|
||||
response = client.models.generate_content(
|
||||
model='gemini-2.5-flash',
|
||||
contents=[
|
||||
'Describe this image in detail',
|
||||
genai.types.Part.from_bytes(data=img_bytes, mime_type='image/jpeg')
|
||||
]
|
||||
)
|
||||
print(response.text)
|
||||
```
|
||||
|
||||
### Image Classification
|
||||
|
||||
```python
|
||||
response = client.models.generate_content(
|
||||
model='gemini-2.5-flash',
|
||||
contents=[
|
||||
'Classify this image. Provide category and confidence level.',
|
||||
img_part
|
||||
]
|
||||
)
|
||||
```
|
||||
|
||||
### Visual Question Answering
|
||||
|
||||
```python
|
||||
response = client.models.generate_content(
|
||||
model='gemini-2.5-flash',
|
||||
contents=[
|
||||
'How many people are in this image and what are they doing?',
|
||||
img_part
|
||||
]
|
||||
)
|
||||
```
|
||||
|
||||
## Advanced Features
|
||||
|
||||
### Object Detection (2.0+)
|
||||
|
||||
```python
|
||||
response = client.models.generate_content(
|
||||
model='gemini-2.0-flash',
|
||||
contents=[
|
||||
'Detect all objects in this image and provide bounding boxes',
|
||||
img_part
|
||||
]
|
||||
)
|
||||
|
||||
# Returns bounding box coordinates: [ymin, xmin, ymax, xmax]
|
||||
# Normalized to [0, 1000] range
|
||||
```
|
||||
|
||||
### Segmentation (2.5+)
|
||||
|
||||
```python
|
||||
response = client.models.generate_content(
|
||||
model='gemini-2.5-flash',
|
||||
contents=[
|
||||
'Create a segmentation mask for all people in this image',
|
||||
img_part
|
||||
]
|
||||
)
|
||||
|
||||
# Returns pixel-level masks for requested objects
|
||||
```
|
||||
|
||||
### Multi-Image Comparison
|
||||
|
||||
```python
|
||||
import PIL.Image
|
||||
|
||||
img1 = PIL.Image.open('photo1.jpg')
|
||||
img2 = PIL.Image.open('photo2.jpg')
|
||||
|
||||
response = client.models.generate_content(
|
||||
model='gemini-2.5-flash',
|
||||
contents=[
|
||||
'Compare these two images. What are the differences?',
|
||||
img1,
|
||||
img2
|
||||
]
|
||||
)
|
||||
```
|
||||
|
||||
### OCR and Text Extraction
|
||||
|
||||
```python
|
||||
response = client.models.generate_content(
|
||||
model='gemini-2.5-flash',
|
||||
contents=[
|
||||
'Extract all visible text from this image',
|
||||
img_part
|
||||
]
|
||||
)
|
||||
```
|
||||
|
||||
## Input Methods
|
||||
|
||||
### Inline Data (<20MB)
|
||||
|
||||
```python
|
||||
from google.genai import types
|
||||
|
||||
# From file
|
||||
with open('image.jpg', 'rb') as f:
|
||||
img_bytes = f.read()
|
||||
|
||||
response = client.models.generate_content(
|
||||
model='gemini-2.5-flash',
|
||||
contents=[
|
||||
'Analyze this image',
|
||||
types.Part.from_bytes(data=img_bytes, mime_type='image/jpeg')
|
||||
]
|
||||
)
|
||||
```
|
||||
|
||||
### PIL Image
|
||||
|
||||
```python
|
||||
import PIL.Image
|
||||
|
||||
img = PIL.Image.open('photo.jpg')
|
||||
|
||||
response = client.models.generate_content(
|
||||
model='gemini-2.5-flash',
|
||||
contents=['What is in this image?', img]
|
||||
)
|
||||
```
|
||||
|
||||
### File API (>20MB or Reuse)
|
||||
|
||||
```python
|
||||
# Upload once
|
||||
myfile = client.files.upload(file='large-image.jpg')
|
||||
|
||||
# Use multiple times
|
||||
response1 = client.models.generate_content(
|
||||
model='gemini-2.5-flash',
|
||||
contents=['Describe this image', myfile]
|
||||
)
|
||||
|
||||
response2 = client.models.generate_content(
|
||||
model='gemini-2.5-flash',
|
||||
contents=['What colors dominate this image?', myfile]
|
||||
)
|
||||
```
|
||||
|
||||
### URL (Public Images)
|
||||
|
||||
```python
|
||||
response = client.models.generate_content(
|
||||
model='gemini-2.5-flash',
|
||||
contents=[
|
||||
'Analyze this image',
|
||||
types.Part.from_uri(
|
||||
uri='https://example.com/image.jpg',
|
||||
mime_type='image/jpeg'
|
||||
)
|
||||
]
|
||||
)
|
||||
```
|
||||
|
||||
## Token Calculation
|
||||
|
||||
Images consume tokens based on size:
|
||||
|
||||
**Small images** (≤384px both dimensions): 258 tokens
|
||||
|
||||
**Large images**: Tiled into 768×768 chunks, 258 tokens each
|
||||
|
||||
**Formula**:
|
||||
```
|
||||
crop_unit = floor(min(width, height) / 1.5)
|
||||
tiles = ceil(width / crop_unit) × ceil(height / crop_unit)
|
||||
total_tokens = tiles × 258
|
||||
```
|
||||
|
||||
**Examples**:
|
||||
- 256×256: 258 tokens (small)
|
||||
- 512×512: 258 tokens (small)
|
||||
- 960×540: 6 tiles = 1,548 tokens
|
||||
- 1920×1080: 6 tiles = 1,548 tokens
|
||||
- 3840×2160 (4K): 24 tiles = 6,192 tokens
|
||||
|
||||
## Structured Output
|
||||
|
||||
### JSON Schema Output
|
||||
|
||||
```python
|
||||
from pydantic import BaseModel
|
||||
from typing import List
|
||||
|
||||
class ObjectDetection(BaseModel):
|
||||
object_name: str
|
||||
confidence: float
|
||||
bounding_box: List[int] # [ymin, xmin, ymax, xmax]
|
||||
|
||||
class ImageAnalysis(BaseModel):
|
||||
description: str
|
||||
objects: List[ObjectDetection]
|
||||
scene_type: str
|
||||
|
||||
response = client.models.generate_content(
|
||||
model='gemini-2.5-flash',
|
||||
contents=['Analyze this image', img_part],
|
||||
config=genai.types.GenerateContentConfig(
|
||||
response_mime_type='application/json',
|
||||
response_schema=ImageAnalysis
|
||||
)
|
||||
)
|
||||
|
||||
result = ImageAnalysis.model_validate_json(response.text)
|
||||
```
|
||||
|
||||
## Multi-Image Analysis
|
||||
|
||||
### Batch Processing
|
||||
|
||||
```python
|
||||
images = [
|
||||
PIL.Image.open(f'image{i}.jpg')
|
||||
for i in range(10)
|
||||
]
|
||||
|
||||
response = client.models.generate_content(
|
||||
model='gemini-2.5-flash',
|
||||
contents=['Analyze these images and find common themes'] + images
|
||||
)
|
||||
```
|
||||
|
||||
### Image Comparison
|
||||
|
||||
```python
|
||||
before = PIL.Image.open('before.jpg')
|
||||
after = PIL.Image.open('after.jpg')
|
||||
|
||||
response = client.models.generate_content(
|
||||
model='gemini-2.5-flash',
|
||||
contents=[
|
||||
'Compare before and after. List all visible changes.',
|
||||
before,
|
||||
after
|
||||
]
|
||||
)
|
||||
```
|
||||
|
||||
### Visual Search
|
||||
|
||||
```python
|
||||
reference = PIL.Image.open('target.jpg')
|
||||
candidates = [PIL.Image.open(f'option{i}.jpg') for i in range(5)]
|
||||
|
||||
response = client.models.generate_content(
|
||||
model='gemini-2.5-flash',
|
||||
contents=[
|
||||
'Find which candidate images contain objects similar to the reference',
|
||||
reference
|
||||
] + candidates
|
||||
)
|
||||
```
|
||||
|
||||
## Best Practices
|
||||
|
||||
### Image Quality
|
||||
|
||||
1. **Resolution**: Use clear, non-blurry images
|
||||
2. **Rotation**: Verify correct orientation
|
||||
3. **Lighting**: Ensure good contrast and lighting
|
||||
4. **Size optimization**: Balance quality vs token cost
|
||||
5. **Format**: JPEG for photos, PNG for graphics
|
||||
|
||||
### Prompt Engineering
|
||||
|
||||
**Specific instructions**:
|
||||
- "Identify all vehicles with their colors and positions"
|
||||
- "Count people wearing blue shirts"
|
||||
- "Extract text from the sign in the top-left corner"
|
||||
|
||||
**Output format**:
|
||||
- "Return results as JSON with fields: category, count, description"
|
||||
- "Format as markdown table"
|
||||
- "List findings as numbered items"
|
||||
|
||||
**Few-shot examples**:
|
||||
```python
|
||||
response = client.models.generate_content(
|
||||
model='gemini-2.5-flash',
|
||||
contents=[
|
||||
'Example: For an image of a cat on a sofa, respond: "Object: cat, Location: sofa"',
|
||||
'Now analyze this image:',
|
||||
img_part
|
||||
]
|
||||
)
|
||||
```
|
||||
|
||||
### File Management
|
||||
|
||||
1. Use File API for images >20MB
|
||||
2. Use File API for repeated queries (saves tokens)
|
||||
3. Files auto-delete after 48 hours
|
||||
4. Clean up manually:
|
||||
```python
|
||||
client.files.delete(name=myfile.name)
|
||||
```
|
||||
|
||||
### Cost Optimization
|
||||
|
||||
**Token-efficient strategies**:
|
||||
- Resize large images before upload
|
||||
- Use File API for repeated queries
|
||||
- Batch multiple images when related
|
||||
- Use appropriate model (Flash vs Pro)
|
||||
|
||||
**Token costs** (Gemini 2.5 Flash at $1/1M):
|
||||
- Small image (258 tokens): $0.000258
|
||||
- HD image (1,548 tokens): $0.001548
|
||||
- 4K image (6,192 tokens): $0.006192
|
||||
|
||||
## Common Use Cases
|
||||
|
||||
### 1. Product Analysis
|
||||
|
||||
```python
|
||||
response = client.models.generate_content(
|
||||
model='gemini-2.5-flash',
|
||||
contents=[
|
||||
'''Analyze this product image:
|
||||
1. Identify the product
|
||||
2. List visible features
|
||||
3. Assess condition
|
||||
4. Estimate value range
|
||||
''',
|
||||
img_part
|
||||
]
|
||||
)
|
||||
```
|
||||
|
||||
### 2. Screenshot Analysis
|
||||
|
||||
```python
|
||||
response = client.models.generate_content(
|
||||
model='gemini-2.5-flash',
|
||||
contents=[
|
||||
'Extract all text and UI elements from this screenshot',
|
||||
img_part
|
||||
]
|
||||
)
|
||||
```
|
||||
|
||||
### 3. Medical Imaging (Informational Only)
|
||||
|
||||
```python
|
||||
response = client.models.generate_content(
|
||||
model='gemini-2.5-pro',
|
||||
contents=[
|
||||
'Describe visible features in this medical image. Note: This is for informational purposes only.',
|
||||
img_part
|
||||
]
|
||||
)
|
||||
```
|
||||
|
||||
### 4. Chart/Graph Reading
|
||||
|
||||
```python
|
||||
response = client.models.generate_content(
|
||||
model='gemini-2.5-flash',
|
||||
contents=[
|
||||
'Extract data from this chart and format as JSON',
|
||||
img_part
|
||||
]
|
||||
)
|
||||
```
|
||||
|
||||
### 5. Scene Understanding
|
||||
|
||||
```python
|
||||
response = client.models.generate_content(
|
||||
model='gemini-2.5-flash',
|
||||
contents=[
|
||||
'''Analyze this scene:
|
||||
1. Location type
|
||||
2. Time of day
|
||||
3. Weather conditions
|
||||
4. Activities happening
|
||||
5. Mood/atmosphere
|
||||
''',
|
||||
img_part
|
||||
]
|
||||
)
|
||||
```
|
||||
|
||||
## Error Handling
|
||||
|
||||
```python
|
||||
import time
|
||||
|
||||
def analyze_image_with_retry(image_path, prompt, max_retries=3):
|
||||
"""Analyze image with exponential backoff retry"""
|
||||
for attempt in range(max_retries):
|
||||
try:
|
||||
with open(image_path, 'rb') as f:
|
||||
img_bytes = f.read()
|
||||
|
||||
response = client.models.generate_content(
|
||||
model='gemini-2.5-flash',
|
||||
contents=[
|
||||
prompt,
|
||||
genai.types.Part.from_bytes(
|
||||
data=img_bytes,
|
||||
mime_type='image/jpeg'
|
||||
)
|
||||
]
|
||||
)
|
||||
return response.text
|
||||
except Exception as e:
|
||||
if attempt == max_retries - 1:
|
||||
raise
|
||||
wait_time = 2 ** attempt
|
||||
print(f"Retry {attempt + 1} after {wait_time}s: {e}")
|
||||
time.sleep(wait_time)
|
||||
```
|
||||
|
||||
## Limitations
|
||||
|
||||
- Maximum 3,600 images per request
|
||||
- OCR accuracy varies with text quality
|
||||
- Object detection requires 2.0+ models
|
||||
- Segmentation requires 2.5+ models
|
||||
- No video frame extraction (use video API)
|
||||
- Regional restrictions on child images (EEA, CH, UK)
|
||||
395
skills/ai-multimodal/scripts/document_converter.py
Normal file
395
skills/ai-multimodal/scripts/document_converter.py
Normal file
@@ -0,0 +1,395 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Convert documents to Markdown using Gemini API.
|
||||
|
||||
Supports all document types:
|
||||
- PDF documents (native vision processing)
|
||||
- Images (JPEG, PNG, WEBP, HEIC)
|
||||
- Office documents (DOCX, XLSX, PPTX)
|
||||
- HTML, TXT, and other text formats
|
||||
|
||||
Features:
|
||||
- Converts to clean markdown format
|
||||
- Preserves structure, tables, and formatting
|
||||
- Extracts text from images and scanned documents
|
||||
- Batch conversion support
|
||||
- Saves to docs/assets/document-extraction.md by default
|
||||
"""
|
||||
|
||||
import argparse
|
||||
import os
|
||||
import sys
|
||||
import time
|
||||
from pathlib import Path
|
||||
from typing import Optional, List, Dict, Any
|
||||
|
||||
try:
|
||||
from google import genai
|
||||
from google.genai import types
|
||||
except ImportError:
|
||||
print("Error: google-genai package not installed")
|
||||
print("Install with: pip install google-genai")
|
||||
sys.exit(1)
|
||||
|
||||
try:
|
||||
from dotenv import load_dotenv
|
||||
except ImportError:
|
||||
load_dotenv = None
|
||||
|
||||
|
||||
def find_api_key() -> Optional[str]:
    """Find the Gemini API key, checking the environment first, then .env files.

    Priority order (highest to lowest):
    1. process.env (runtime environment variables)
    2. .claude/skills/ai-multimodal/.env (skill-specific config)
    3. .claude/skills/.env (shared skills config)
    4. .claude/.env (Claude global config)

    Returns:
        The API key string, or None if it was not found anywhere.
    """
    # Priority 1: a key already present in the runtime environment always wins.
    api_key = os.getenv('GEMINI_API_KEY')
    if api_key:
        return api_key

    # Fall back to .env files only when python-dotenv is installed
    # (load_dotenv is None when the optional dependency is missing).
    if load_dotenv:
        skill_dir = Path(__file__).parent.parent  # .claude/skills/ai-multimodal
        # Priorities 2-4: walk outward from the skill directory to .claude.
        search_dirs = (
            skill_dir,                 # .claude/skills/ai-multimodal
            skill_dir.parent,          # .claude/skills
            skill_dir.parent.parent,   # .claude
        )
        for env_dir in search_dirs:
            env_file = env_dir / '.env'
            if env_file.exists():
                load_dotenv(env_file)
                api_key = os.getenv('GEMINI_API_KEY')
                if api_key:
                    return api_key

    return None
|
||||
|
||||
|
||||
def find_project_root() -> Path:
    """Locate the project root by walking upward from this script's directory.

    The first ancestor containing a ``.git`` or ``.claude`` directory is taken
    as the root; if none is found, the script's own directory is returned.
    """
    start = Path(__file__).parent

    # Check the starting directory itself, then every ancestor in order.
    for directory in (start, *start.parents):
        has_git = (directory / '.git').exists()
        has_claude = (directory / '.claude').exists()
        if has_git or has_claude:
            return directory

    # No marker found anywhere up the tree: fall back to where we started.
    return start
|
||||
|
||||
|
||||
def get_mime_type(file_path: str) -> str:
    """Map a file's extension to its MIME type.

    Returns 'application/octet-stream' for any extension not in the
    supported set (documents, images, Office formats).
    """
    extension_to_mime = {
        # Documents
        '.pdf': 'application/pdf',
        '.txt': 'text/plain',
        '.html': 'text/html',
        '.htm': 'text/html',
        '.md': 'text/markdown',
        '.csv': 'text/csv',
        # Images
        '.jpg': 'image/jpeg',
        '.jpeg': 'image/jpeg',
        '.png': 'image/png',
        '.webp': 'image/webp',
        '.heic': 'image/heic',
        '.heif': 'image/heif',
        # Office (need to be uploaded as binary)
        '.docx': 'application/vnd.openxmlformats-officedocument.wordprocessingml.document',
        '.xlsx': 'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet',
        '.pptx': 'application/vnd.openxmlformats-officedocument.presentationml.presentation',
    }

    suffix = Path(file_path).suffix.lower()
    return extension_to_mime.get(suffix, 'application/octet-stream')
|
||||
|
||||
|
||||
def upload_file(client: genai.Client, file_path: str, verbose: bool = False) -> Any:
    """Upload a file via the Gemini File API and wait until it is ready.

    Polls every 2 seconds, for at most 5 minutes, while the remote file is
    in the PROCESSING state.

    Raises:
        ValueError: if the service reports the upload as FAILED.
        TimeoutError: if processing does not finish within the wait budget.
    """
    if verbose:
        print(f"Uploading {file_path}...")

    uploaded = client.files.upload(file=file_path)

    # Poll until the service finishes processing or the time budget runs out.
    poll_interval = 2
    max_wait = 300  # 5 minutes
    waited = 0
    while uploaded.state.name == 'PROCESSING' and waited < max_wait:
        time.sleep(poll_interval)
        uploaded = client.files.get(name=uploaded.name)
        waited += poll_interval
        if verbose and waited % 10 == 0:
            print(f" Processing... {waited}s")

    if uploaded.state.name == 'FAILED':
        raise ValueError(f"File processing failed: {file_path}")

    if uploaded.state.name == 'PROCESSING':
        raise TimeoutError(f"Processing timeout after {max_wait}s: {file_path}")

    if verbose:
        print(f" Uploaded: {uploaded.name}")

    return uploaded
|
||||
|
||||
|
||||
def convert_to_markdown(
    client: genai.Client,
    file_path: str,
    model: str = 'gemini-2.5-flash',
    custom_prompt: Optional[str] = None,
    verbose: bool = False,
    max_retries: int = 3
) -> Dict[str, Any]:
    """Convert a single document to markdown using Gemini.

    Retries with exponential backoff (2**attempt seconds) on any exception.

    Args:
        client: Initialized Gemini client.
        file_path: Path to the document to convert.
        model: Gemini model name to use for the conversion.
        custom_prompt: Optional prompt overriding the default conversion prompt.
        verbose: Print retry progress messages.
        max_retries: Number of attempts before giving up.

    Returns:
        Dict with keys 'file', 'status' ('success'/'error'), 'markdown'
        (None on error), and 'error' (only on error).
    """

    for attempt in range(max_retries):
        try:
            file_path_obj = Path(file_path)
            file_size = file_path_obj.stat().st_size
            # Inline data is limited to 20MB; larger files go via the File API.
            use_file_api = file_size > 20 * 1024 * 1024  # >20MB

            # Default prompt for markdown conversion
            if custom_prompt:
                prompt = custom_prompt
            else:
                prompt = """Convert this document to clean, well-formatted Markdown.

Requirements:
- Preserve all content, structure, and formatting
- Convert tables to markdown table format
- Maintain heading hierarchy (# ## ### etc)
- Preserve lists, code blocks, and quotes
- Extract text from images if present
- Keep formatting consistent and readable

Output only the markdown content without any preamble or explanation."""

            # Upload or inline the file
            if use_file_api:
                myfile = upload_file(client, str(file_path), verbose)
                content = [prompt, myfile]
            else:
                with open(file_path, 'rb') as f:
                    file_bytes = f.read()

                mime_type = get_mime_type(str(file_path))
                content = [
                    prompt,
                    types.Part.from_bytes(data=file_bytes, mime_type=mime_type)
                ]

            # Generate markdown
            response = client.models.generate_content(
                model=model,
                contents=content
            )

            # Empty string (not None) when the response carries no text.
            markdown_content = response.text if hasattr(response, 'text') else ''

            return {
                'file': str(file_path),
                'status': 'success',
                'markdown': markdown_content
            }

        except Exception as e:
            # Last attempt: return an error record instead of raising,
            # so batch processing can continue with the remaining files.
            if attempt == max_retries - 1:
                return {
                    'file': str(file_path),
                    'status': 'error',
                    'error': str(e),
                    'markdown': None
                }

            # Exponential backoff: 1s, 2s, 4s, ...
            wait_time = 2 ** attempt
            if verbose:
                print(f" Retry {attempt + 1} after {wait_time}s: {e}")
            time.sleep(wait_time)
|
||||
|
||||
|
||||
def batch_convert(
    files: List[str],
    output_file: Optional[str] = None,
    auto_name: bool = False,
    model: str = 'gemini-2.5-flash',
    custom_prompt: Optional[str] = None,
    verbose: bool = False
) -> List[Dict[str, Any]]:
    """Batch convert multiple files to markdown and write a combined report.

    Args:
        files: Paths of the documents to convert.
        output_file: Destination markdown file; defaults to
            <project-root>/docs/assets/document-extraction.md.
        auto_name: With a single input and no explicit output, derive the
            output name from the input stem (<stem>-extraction.md).
        model: Gemini model to use.
        custom_prompt: Optional prompt overriding the default conversion prompt.
        verbose: Print per-file progress.

    Returns:
        Per-file result dicts as produced by convert_to_markdown().

    Exits the process (code 1) when no API key can be found.
    """
    api_key = find_api_key()
    if not api_key:
        print("Error: GEMINI_API_KEY not found")
        print("Set via: export GEMINI_API_KEY='your-key'")
        print("Or create .env file with: GEMINI_API_KEY=your-key")
        sys.exit(1)

    client = genai.Client(api_key=api_key)
    results = []

    # Determine output path
    if not output_file:
        project_root = find_project_root()
        output_dir = project_root / 'docs' / 'assets'

        if auto_name and len(files) == 1:
            # Auto-generate meaningful filename from input
            input_path = Path(files[0])
            base_name = input_path.stem
            output_file = str(output_dir / f"{base_name}-extraction.md")
        else:
            output_file = str(output_dir / 'document-extraction.md')

    output_path = Path(output_file)
    output_path.parent.mkdir(parents=True, exist_ok=True)

    # Process each file
    for i, file_path in enumerate(files, 1):
        if verbose:
            print(f"\n[{i}/{len(files)}] Converting: {file_path}")

        result = convert_to_markdown(
            client=client,
            file_path=file_path,
            model=model,
            custom_prompt=custom_prompt,
            verbose=verbose
        )

        results.append(result)

        if verbose:
            status = result.get('status', 'unknown')
            print(f" Status: {status}")

    # Save combined markdown
    with open(output_path, 'w', encoding='utf-8') as f:
        f.write("# Document Extraction Results\n\n")
        f.write(f"Converted {len(files)} document(s) to markdown.\n\n")
        f.write("---\n\n")

        for result in results:
            f.write(f"## {Path(result['file']).name}\n\n")

            if result['status'] == 'success' and result.get('markdown'):
                f.write(result['markdown'])
                f.write("\n\n")
            elif result['status'] == 'success':
                f.write("**Note**: Conversion succeeded but no content was returned.\n\n")
            else:
                f.write(f"**Error**: {result.get('error', 'Unknown error')}\n\n")

            f.write("---\n\n")

    # Always show the summary and output location
    # (was `if verbose or True:` — the dead condition is removed).
    print(f"\n{'='*50}")
    print(f"Converted: {len(results)} file(s)")
    print(f"Success: {sum(1 for r in results if r['status'] == 'success')}")
    print(f"Failed: {sum(1 for r in results if r['status'] == 'error')}")
    print(f"Output saved to: {output_path}")

    return results
|
||||
|
||||
|
||||
def main():
    """CLI entry point: parse arguments, resolve input files, run conversion."""
    # Hoisted out of the per-file validation loop below, where the original
    # re-executed `import glob` on every non-existent path.
    import glob

    parser = argparse.ArgumentParser(
        description='Convert documents to Markdown using Gemini API',
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Examples:
  # Convert single PDF to markdown (default name)
  %(prog)s --input document.pdf

  # Auto-generate meaningful filename
  %(prog)s --input testpdf.pdf --auto-name
  # Output: docs/assets/testpdf-extraction.md

  # Convert multiple files
  %(prog)s --input doc1.pdf doc2.docx image.png

  # Specify custom output location
  %(prog)s --input document.pdf --output ./output.md

  # Use custom prompt
  %(prog)s --input document.pdf --prompt "Extract only the tables as markdown"

  # Batch convert directory
  %(prog)s --input ./documents/*.pdf --verbose

Supported formats:
  - PDF documents (up to 1,000 pages)
  - Images (JPEG, PNG, WEBP, HEIC)
  - Office documents (DOCX, XLSX, PPTX)
  - Text formats (TXT, HTML, Markdown, CSV)

Default output: <project-root>/docs/assets/document-extraction.md
"""
    )

    parser.add_argument('--input', '-i', nargs='+', required=True,
                        help='Input file(s) to convert')
    parser.add_argument('--output', '-o',
                        help='Output markdown file (default: docs/assets/document-extraction.md)')
    parser.add_argument('--auto-name', '-a', action='store_true',
                        help='Auto-generate meaningful output filename from input (e.g. document.pdf -> document-extraction.md)')
    parser.add_argument('--model', default='gemini-2.5-flash',
                        help='Gemini model to use (default: gemini-2.5-flash)')
    parser.add_argument('--prompt', '-p',
                        help='Custom prompt for conversion')
    parser.add_argument('--verbose', '-v', action='store_true',
                        help='Verbose output')

    args = parser.parse_args()

    # Validate input files: accept literal paths or glob patterns.
    files = []
    for file_pattern in args.input:
        file_path = Path(file_pattern)
        if file_path.exists() and file_path.is_file():
            files.append(str(file_path))
        else:
            # Try glob pattern (e.g. when the shell did not expand it)
            matched = glob.glob(file_pattern)
            files.extend([f for f in matched if Path(f).is_file()])

    if not files:
        print("Error: No valid input files found")
        sys.exit(1)

    # Convert files
    batch_convert(
        files=files,
        output_file=args.output,
        auto_name=args.auto_name,
        model=args.model,
        custom_prompt=args.prompt,
        verbose=args.verbose
    )


if __name__ == '__main__':
    main()
|
||||
480
skills/ai-multimodal/scripts/gemini_batch_process.py
Normal file
480
skills/ai-multimodal/scripts/gemini_batch_process.py
Normal file
@@ -0,0 +1,480 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Batch process multiple media files using Gemini API.
|
||||
|
||||
Supports all Gemini modalities:
|
||||
- Audio: Transcription, analysis, summarization
|
||||
- Image: Captioning, detection, OCR, analysis
|
||||
- Video: Summarization, Q&A, scene detection
|
||||
- Document: PDF extraction, structured output
|
||||
- Generation: Image creation from text prompts
|
||||
"""
|
||||
|
||||
import argparse
|
||||
import json
|
||||
import os
|
||||
import sys
|
||||
import time
|
||||
from pathlib import Path
|
||||
from typing import List, Dict, Any, Optional
|
||||
import csv
|
||||
import shutil
|
||||
|
||||
try:
|
||||
from google import genai
|
||||
from google.genai import types
|
||||
except ImportError:
|
||||
print("Error: google-genai package not installed")
|
||||
print("Install with: pip install google-genai")
|
||||
sys.exit(1)
|
||||
|
||||
try:
|
||||
from dotenv import load_dotenv
|
||||
except ImportError:
|
||||
load_dotenv = None
|
||||
|
||||
|
||||
def find_api_key() -> Optional[str]:
    """Find the Gemini API key, checking the environment first, then .env files.

    Priority order (highest to lowest):
    1. process.env (runtime environment variables)
    2. .claude/skills/ai-multimodal/.env (skill-specific config)
    3. .claude/skills/.env (shared skills config)
    4. .claude/.env (Claude global config)

    Returns:
        The API key string, or None if it was not found anywhere.
    """
    # Priority 1: a key already present in the runtime environment always wins.
    api_key = os.getenv('GEMINI_API_KEY')
    if api_key:
        return api_key

    # Fall back to .env files only when python-dotenv is installed
    # (load_dotenv is None when the optional dependency is missing).
    if load_dotenv:
        skill_dir = Path(__file__).parent.parent  # .claude/skills/ai-multimodal
        # Priorities 2-4: walk outward from the skill directory to .claude.
        search_dirs = (
            skill_dir,                 # .claude/skills/ai-multimodal
            skill_dir.parent,          # .claude/skills
            skill_dir.parent.parent,   # .claude
        )
        for env_dir in search_dirs:
            env_file = env_dir / '.env'
            if env_file.exists():
                load_dotenv(env_file)
                api_key = os.getenv('GEMINI_API_KEY')
                if api_key:
                    return api_key

    return None
|
||||
|
||||
|
||||
def get_mime_type(file_path: str) -> str:
    """Map a file's extension to its MIME type.

    Covers the audio, image, video, and document formats accepted by the
    Gemini API; anything else maps to 'application/octet-stream'.
    """
    extension_to_mime = {
        # Audio
        '.mp3': 'audio/mp3',
        '.wav': 'audio/wav',
        '.aac': 'audio/aac',
        '.flac': 'audio/flac',
        '.ogg': 'audio/ogg',
        '.aiff': 'audio/aiff',
        # Image
        '.jpg': 'image/jpeg',
        '.jpeg': 'image/jpeg',
        '.png': 'image/png',
        '.webp': 'image/webp',
        '.heic': 'image/heic',
        '.heif': 'image/heif',
        # Video
        '.mp4': 'video/mp4',
        '.mpeg': 'video/mpeg',
        '.mov': 'video/quicktime',
        '.avi': 'video/x-msvideo',
        '.flv': 'video/x-flv',
        '.mpg': 'video/mpeg',
        '.webm': 'video/webm',
        '.wmv': 'video/x-ms-wmv',
        '.3gpp': 'video/3gpp',
        # Document
        '.pdf': 'application/pdf',
        '.txt': 'text/plain',
        '.html': 'text/html',
        '.md': 'text/markdown',
    }

    suffix = Path(file_path).suffix.lower()
    return extension_to_mime.get(suffix, 'application/octet-stream')
|
||||
|
||||
|
||||
def upload_file(client: genai.Client, file_path: str, verbose: bool = False) -> Any:
    """Upload a file via the Gemini File API, waiting for media processing.

    Video and audio uploads are polled every 2 seconds (up to 5 minutes)
    until the service leaves the PROCESSING state; other file types are
    returned immediately after upload.

    Raises:
        ValueError: if the service reports the upload as FAILED.
        TimeoutError: if processing does not finish within the wait budget.
    """
    if verbose:
        print(f"Uploading {file_path}...")

    myfile = client.files.upload(file=file_path)

    # Wait for processing (video/audio files need processing)
    mime_type = get_mime_type(file_path)
    if mime_type.startswith('video/') or mime_type.startswith('audio/'):
        max_wait = 300  # 5 minutes
        elapsed = 0
        while myfile.state.name == 'PROCESSING' and elapsed < max_wait:
            time.sleep(2)
            # Re-fetch to observe the latest server-side state.
            myfile = client.files.get(name=myfile.name)
            elapsed += 2
            if verbose and elapsed % 10 == 0:
                print(f" Processing... {elapsed}s")

        if myfile.state.name == 'FAILED':
            raise ValueError(f"File processing failed: {file_path}")

        # Still PROCESSING after the loop means we hit the time budget.
        if myfile.state.name == 'PROCESSING':
            raise TimeoutError(f"Processing timeout after {max_wait}s: {file_path}")

    if verbose:
        print(f" Uploaded: {myfile.name}")

    return myfile
|
||||
|
||||
|
||||
def process_file(
    client: genai.Client,
    file_path: Optional[str],
    prompt: str,
    model: str,
    task: str,
    format_output: str,
    aspect_ratio: Optional[str] = None,
    verbose: bool = False,
    max_retries: int = 3
) -> Dict[str, Any]:
    """Process a single file (or a pure-generation prompt) with retry logic.

    Retries with exponential backoff (2**attempt seconds) on any exception.

    Args:
        client: Initialized Gemini client.
        file_path: Input media path, or None for text-to-image generation.
        prompt: Instruction sent alongside the media (or alone for generation).
        model: Gemini model name.
        task: Task label; 'generate' enables image-output handling.
        format_output: 'json' requests a JSON response MIME type.
        aspect_ratio: Optional aspect ratio for generated images.
        verbose: Print upload/retry progress.
        max_retries: Number of attempts before returning an error record.

    Returns:
        Dict with 'file', 'status' ('success'/'error'), 'response', and
        optionally 'generated_image' (path) or 'error'.
    """

    for attempt in range(max_retries):
        try:
            # For generation tasks without input files
            if task == 'generate' and not file_path:
                content = [prompt]
            else:
                # Process input file
                file_path = Path(file_path)
                # Determine if we need File API
                # (inline data is limited to 20MB per request)
                file_size = file_path.stat().st_size
                use_file_api = file_size > 20 * 1024 * 1024  # >20MB

                if use_file_api:
                    # Upload to File API
                    myfile = upload_file(client, str(file_path), verbose)
                    content = [prompt, myfile]
                else:
                    # Inline data
                    with open(file_path, 'rb') as f:
                        file_bytes = f.read()

                    mime_type = get_mime_type(str(file_path))
                    content = [
                        prompt,
                        types.Part.from_bytes(data=file_bytes, mime_type=mime_type)
                    ]

            # Configure request
            config_args = {}
            if task == 'generate':
                config_args['response_modalities'] = ['Image']  # Capital I per API spec
                if aspect_ratio:
                    # Nest aspect_ratio in image_config per API spec
                    config_args['image_config'] = types.ImageConfig(
                        aspect_ratio=aspect_ratio
                    )

            if format_output == 'json':
                config_args['response_mime_type'] = 'application/json'

            config = types.GenerateContentConfig(**config_args) if config_args else None

            # Generate content
            response = client.models.generate_content(
                model=model,
                contents=content,
                config=config
            )

            # Extract response
            result = {
                'file': str(file_path) if file_path else 'generated',
                'status': 'success',
                'response': response.text if hasattr(response, 'text') else None
            }

            # Handle image output: persist each inline image part to disk.
            if task == 'generate' and hasattr(response, 'candidates'):
                for i, part in enumerate(response.candidates[0].content.parts):
                    if part.inline_data:
                        # Determine output directory - use project root docs/assets
                        if file_path:
                            # Save next to the input file, named after it.
                            output_dir = Path(file_path).parent
                            base_name = Path(file_path).stem
                        else:
                            # Find project root (look for .git or .claude directory)
                            script_dir = Path(__file__).parent
                            project_root = script_dir
                            for parent in [script_dir] + list(script_dir.parents):
                                if (parent / '.git').exists() or (parent / '.claude').exists():
                                    project_root = parent
                                    break

                            output_dir = project_root / 'docs' / 'assets'
                            output_dir.mkdir(parents=True, exist_ok=True)
                            base_name = "generated"

                        output_file = output_dir / f"{base_name}_generated_{i}.png"
                        with open(output_file, 'wb') as f:
                            f.write(part.inline_data.data)
                        # NOTE: with multiple image parts, only the last path
                        # is retained in the result.
                        result['generated_image'] = str(output_file)
                        if verbose:
                            print(f" Saved image to: {output_file}")

            return result

        except Exception as e:
            # Last attempt: return an error record instead of raising,
            # so batch processing can continue with the remaining files.
            if attempt == max_retries - 1:
                return {
                    'file': str(file_path) if file_path else 'generated',
                    'status': 'error',
                    'error': str(e)
                }

            # Exponential backoff: 1s, 2s, 4s, ...
            wait_time = 2 ** attempt
            if verbose:
                print(f" Retry {attempt + 1} after {wait_time}s: {e}")
            time.sleep(wait_time)
|
||||
|
||||
|
||||
def batch_process(
    files: List[str],
    prompt: str,
    model: str,
    task: str,
    format_output: str,
    aspect_ratio: Optional[str] = None,
    output_file: Optional[str] = None,
    verbose: bool = False,
    dry_run: bool = False
) -> List[Dict[str, Any]]:
    """Batch process multiple media files (or a single generation prompt).

    Args:
        files: Input media paths; may be empty for 'generate' tasks.
        prompt: Instruction applied to every file (or used alone to generate).
        model: Gemini model name.
        task: Task label; 'generate' with no files triggers one prompt-only run.
        format_output: 'json', 'csv', or markdown (anything else).
        aspect_ratio: Optional aspect ratio for generated images.
        output_file: If given, results are saved there via save_results().
        verbose: Print per-file progress.
        dry_run: Print the plan and return [] without making API calls.

    Returns:
        List of per-file result dicts from process_file() ([] on dry run).

    Exits the process (code 1) when no API key can be found.
    """
    # API key is required even for dry runs, so misconfiguration fails early.
    api_key = find_api_key()
    if not api_key:
        print("Error: GEMINI_API_KEY not found")
        print("Set via: export GEMINI_API_KEY='your-key'")
        print("Or create .env file with: GEMINI_API_KEY=your-key")
        sys.exit(1)

    if dry_run:
        print("DRY RUN MODE - No API calls will be made")
        print(f"Files to process: {len(files)}")
        print(f"Model: {model}")
        print(f"Task: {task}")
        print(f"Prompt: {prompt}")
        return []

    client = genai.Client(api_key=api_key)
    results = []

    # For generation tasks without input files, process once
    if task == 'generate' and not files:
        if verbose:
            print(f"\nGenerating image from prompt...")

        result = process_file(
            client=client,
            file_path=None,
            prompt=prompt,
            model=model,
            task=task,
            format_output=format_output,
            aspect_ratio=aspect_ratio,
            verbose=verbose
        )

        results.append(result)

        if verbose:
            status = result.get('status', 'unknown')
            print(f" Status: {status}")
    else:
        # Process input files
        for i, file_path in enumerate(files, 1):
            if verbose:
                print(f"\n[{i}/{len(files)}] Processing: {file_path}")

            result = process_file(
                client=client,
                file_path=file_path,
                prompt=prompt,
                model=model,
                task=task,
                format_output=format_output,
                aspect_ratio=aspect_ratio,
                verbose=verbose
            )

            results.append(result)

            if verbose:
                status = result.get('status', 'unknown')
                print(f" Status: {status}")

    # Save results
    if output_file:
        save_results(results, output_file, format_output)

    return results
|
||||
|
||||
|
||||
def save_results(results: List[Dict[str, Any]], output_file: str, format_output: str):
    """Persist batch-processing results to disk.

    Special case: when ``output_file`` has an image extension and there is
    exactly one result, the generated image is copied to that path; if
    generation failed, the report is redirected to a sibling ``.error.txt``
    so an image path never receives text.  Otherwise a JSON / CSV /
    Markdown report is written (``format_output`` selects which; any other
    value falls through to Markdown).

    Args:
        results: One dict per processed file (keys like 'file', 'status',
            'response', 'error', 'generated_image').
        output_file: Destination path; overwritten if it exists.
        format_output: 'json', 'csv', or anything else for Markdown.
    """
    output_path = Path(output_file)

    # Special handling for image generation - if output has image extension, copy the generated image
    image_extensions = {'.png', '.jpg', '.jpeg', '.webp', '.gif', '.bmp'}
    if output_path.suffix.lower() in image_extensions and len(results) == 1:
        generated_image = results[0].get('generated_image')
        if generated_image:
            # Copy the generated image to the specified output location
            shutil.copy2(generated_image, output_path)
            return
        else:
            # Don't write text reports to image files - save error as .txt instead
            output_path = output_path.with_suffix('.error.txt')
            print(f"Warning: Generation failed, saving error report to: {output_path}")

    if format_output == 'json':
        # Explicit UTF-8 avoids platform-default-codec failures on model
        # output; ensure_ascii=False keeps non-ASCII text readable.
        with open(output_path, 'w', encoding='utf-8') as f:
            json.dump(results, f, indent=2, ensure_ascii=False)
    elif format_output == 'csv':
        with open(output_path, 'w', newline='', encoding='utf-8') as f:
            fieldnames = ['file', 'status', 'response', 'error']
            writer = csv.DictWriter(f, fieldnames=fieldnames)
            writer.writeheader()
            for result in results:
                # Restrict each row to the declared columns; missing keys become ''.
                writer.writerow({key: result.get(key, '') for key in fieldnames})
    else:  # markdown
        with open(output_path, 'w', encoding='utf-8') as f:
            f.write("# Batch Processing Results\n\n")
            for i, result in enumerate(results, 1):
                f.write(f"## {i}. {result.get('file', 'Unknown')}\n\n")
                f.write(f"**Status**: {result.get('status', 'unknown')}\n\n")
                if result.get('response'):
                    f.write(f"**Response**:\n\n{result['response']}\n\n")
                if result.get('error'):
                    f.write(f"**Error**: {result['error']}\n\n")
|
||||
|
||||
|
||||
def main():
    """CLI entry point: parse arguments, validate them, run the batch, and print a summary."""
    parser = argparse.ArgumentParser(
        description='Batch process media files with Gemini API',
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Examples:
  # Transcribe multiple audio files
  %(prog)s --files *.mp3 --task transcribe --model gemini-2.5-flash

  # Analyze images
  %(prog)s --files *.jpg --task analyze --prompt "Describe this image" \\
    --model gemini-2.5-flash

  # Process PDFs to JSON
  %(prog)s --files *.pdf --task extract --prompt "Extract data as JSON" \\
    --format json --output results.json

  # Generate images
  %(prog)s --task generate --prompt "A mountain landscape" \\
    --model gemini-2.5-flash-image --aspect-ratio 16:9
"""
    )

    parser.add_argument('--files', nargs='*', help='Input files to process')
    parser.add_argument('--task', required=True,
                        choices=['transcribe', 'analyze', 'extract', 'generate'],
                        help='Task to perform')
    parser.add_argument('--prompt', help='Prompt for analysis/generation')
    parser.add_argument('--model', default='gemini-2.5-flash',
                        help='Gemini model to use (default: gemini-2.5-flash)')
    parser.add_argument('--format', dest='format_output', default='text',
                        choices=['text', 'json', 'csv', 'markdown'],
                        help='Output format (default: text)')
    parser.add_argument('--aspect-ratio', choices=['1:1', '16:9', '9:16', '4:3', '3:4'],
                        help='Aspect ratio for image generation')
    parser.add_argument('--output', help='Output file for results')
    parser.add_argument('--verbose', '-v', action='store_true',
                        help='Verbose output')
    parser.add_argument('--dry-run', action='store_true',
                        help='Show what would be done without making API calls')

    args = parser.parse_args()

    # Validate arguments
    # Generation works from a prompt alone; every other task reads input files.
    if args.task != 'generate' and not args.files:
        parser.error("--files required for non-generation tasks")

    if args.task == 'generate' and not args.prompt:
        parser.error("--prompt required for generation task")

    if args.task != 'generate' and not args.prompt:
        # Set default prompts
        if args.task == 'transcribe':
            args.prompt = 'Generate a transcript with timestamps'
        elif args.task == 'analyze':
            args.prompt = 'Analyze this content'
        elif args.task == 'extract':
            args.prompt = 'Extract key information'

    # Process files
    # batch_process handles the dry-run short-circuit and optional result saving.
    files = args.files or []
    results = batch_process(
        files=files,
        prompt=args.prompt,
        model=args.model,
        task=args.task,
        format_output=args.format_output,
        aspect_ratio=args.aspect_ratio,
        output_file=args.output,
        verbose=args.verbose,
        dry_run=args.dry_run
    )

    # Print summary
    # Dry runs return [] so the summary is skipped for them.
    if not args.dry_run and results:
        success = sum(1 for r in results if r.get('status') == 'success')
        failed = len(results) - success
        print(f"\n{'='*50}")
        print(f"Processed: {len(results)} files")
        print(f"Success: {success}")
        print(f"Failed: {failed}")
        if args.output:
            print(f"Results saved to: {args.output}")


if __name__ == '__main__':
    main()
|
||||
506
skills/ai-multimodal/scripts/media_optimizer.py
Normal file
506
skills/ai-multimodal/scripts/media_optimizer.py
Normal file
@@ -0,0 +1,506 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Optimize media files for Gemini API processing.
|
||||
|
||||
Features:
|
||||
- Compress videos/audio for size limits
|
||||
- Resize images appropriately
|
||||
- Split long videos into chunks
|
||||
- Format conversion
|
||||
- Quality vs size optimization
|
||||
- Validation before upload
|
||||
"""
|
||||
|
||||
import argparse
|
||||
import json
|
||||
import os
|
||||
import subprocess
|
||||
import sys
|
||||
from pathlib import Path
|
||||
from typing import Optional, Dict, Any, List
|
||||
|
||||
try:
|
||||
from dotenv import load_dotenv
|
||||
except ImportError:
|
||||
load_dotenv = None
|
||||
|
||||
|
||||
def load_env_files():
    """Load .env files in correct priority order.

    Priority order (highest to lowest):
    1. process.env (runtime environment variables)
    2. .claude/skills/ai-multimodal/.env (skill-specific config)
    3. .claude/skills/.env (shared skills config)
    4. .claude/.env (Claude global config)
    """
    if not load_dotenv:
        return

    # Walk up from this script: skill dir, skills dir, then .claude dir.
    skill_root = Path(__file__).parent.parent
    search_dirs = (skill_root, skill_root.parent, skill_root.parent.parent)

    # Most specific first; python-dotenv does not override values that are
    # already set, so earlier (more specific) files win.
    for directory in search_dirs:
        candidate = directory / '.env'
        if candidate.exists():
            load_dotenv(candidate)


# Load environment variables at module level
load_env_files()
|
||||
|
||||
|
||||
def check_ffmpeg() -> bool:
    """Return True if the ffmpeg binary is runnable on PATH."""
    try:
        subprocess.run(['ffmpeg', '-version'],
                       stdout=subprocess.DEVNULL,
                       stderr=subprocess.DEVNULL,
                       check=True)
        return True
    except (subprocess.CalledProcessError, OSError):
        # OSError covers FileNotFoundError (binary missing) and
        # PermissionError.  The old tuple ended in a blanket `Exception`,
        # which made the specific entries pointless and hid real bugs.
        return False
|
||||
|
||||
|
||||
def get_media_info(file_path: str) -> Dict[str, Any]:
    """Probe a media file with ffprobe and return summary information.

    Returns a dict with 'size', 'duration' and 'bit_rate', plus
    'width'/'height'/'fps' for video streams and 'sample_rate'/'channels'
    for audio streams.  Returns {} when ffmpeg/ffprobe is unavailable or
    probing fails for any reason.
    """
    if not check_ffmpeg():
        return {}

    try:
        cmd = [
            'ffprobe',
            '-v', 'quiet',
            '-print_format', 'json',
            '-show_format',
            '-show_streams',
            file_path
        ]

        result = subprocess.run(cmd, capture_output=True, text=True, check=True)
        data = json.loads(result.stdout)

        info = {
            'size': int(data['format'].get('size', 0)),
            'duration': float(data['format'].get('duration', 0)),
            'bit_rate': int(data['format'].get('bit_rate', 0)),
        }

        # Get video/audio specific info
        for stream in data.get('streams', []):
            if stream['codec_type'] == 'video':
                info['width'] = stream.get('width', 0)
                info['height'] = stream.get('height', 0)
                # Security/robustness fix: the old code ran eval() on the
                # ffprobe output string; parse the "num/den" rational safely.
                info['fps'] = _parse_frame_rate(stream.get('r_frame_rate', '0/1'))
            elif stream['codec_type'] == 'audio':
                info['sample_rate'] = int(stream.get('sample_rate', 0))
                info['channels'] = stream.get('channels', 0)

        return info

    except (subprocess.CalledProcessError, json.JSONDecodeError,
            KeyError, ValueError, TypeError, OSError):
        # Narrowed from a blanket `Exception`: these cover ffprobe failure,
        # malformed JSON, missing keys and bad numeric fields.
        return {}


def _parse_frame_rate(rate: str) -> float:
    """Convert an ffprobe rational like '30000/1001' to fps as a float.

    Returns 0.0 for malformed input or a zero denominator.
    """
    num_str, sep, den_str = rate.partition('/')
    try:
        num = float(num_str)
        den = float(den_str) if sep else 1.0
    except ValueError:
        return 0.0
    return num / den if den else 0.0
|
||||
|
||||
|
||||
def optimize_video(
    input_path: str,
    output_path: str,
    target_size_mb: Optional[int] = None,
    max_duration: Optional[int] = None,
    quality: int = 23,
    resolution: Optional[str] = None,
    verbose: bool = False
) -> bool:
    """Re-encode a video with x264/AAC so it fits Gemini API limits.

    Args:
        input_path: Source video path.
        output_path: Destination path (overwritten via ffmpeg -y).
        target_size_mb: Approximate size cap in MB; when set, a video
            bitrate is derived from the (possibly truncated) duration.
        max_duration: Trim the output to this many seconds when the
            source runs longer.
        quality: x264 CRF value (lower = better quality / larger file).
        resolution: Explicit ffmpeg scale target (e.g. '1280x720'); when
            omitted, sources wider than 1920px are scaled down to 1920.
        verbose: Print input/output stats and the ffmpeg command line.

    Returns:
        True when ffmpeg succeeded, False otherwise.
    """
    if not check_ffmpeg():
        print("Error: ffmpeg not installed")
        print("Install: apt-get install ffmpeg (Linux) or brew install ffmpeg (Mac)")
        return False

    info = get_media_info(input_path)
    if not info:
        print(f"Error: Could not read media info from {input_path}")
        return False

    if verbose:
        print(f"Input: {Path(input_path).name}")
        print(f"  Size: {info['size'] / (1024*1024):.2f} MB")
        print(f"  Duration: {info['duration']:.2f}s")
        if 'width' in info:
            print(f"  Resolution: {info['width']}x{info['height']}")
        print(f"  Bit rate: {info['bit_rate'] / 1000:.0f} kbps")

    # Build ffmpeg command
    cmd = ['ffmpeg', '-i', input_path, '-y']

    # Video codec
    cmd.extend(['-c:v', 'libx264', '-crf', str(quality)])

    # Resolution
    if resolution:
        cmd.extend(['-vf', f'scale={resolution}'])
    elif 'width' in info and info['width'] > 1920:
        cmd.extend(['-vf', 'scale=1920:-2'])  # Max 1080p

    # Audio codec
    cmd.extend(['-c:a', 'aac', '-b:a', '128k', '-ac', '2'])

    # Duration limit
    if max_duration and info['duration'] > max_duration:
        cmd.extend(['-t', str(max_duration)])

    # Target size (rough estimate using bitrate)
    # NOTE(review): when target_size_mb is set, both -crf and -b:v end up on
    # the command line; x264 then effectively runs in bitrate (ABR) mode and
    # the CRF is ignored — confirm this is intended.
    if target_size_mb:
        # assumes info['duration'] > 0 — a zero duration would raise
        # ZeroDivisionError here; TODO confirm ffprobe always reports one.
        target_bits = target_size_mb * 8 * 1024 * 1024
        duration = min(info['duration'], max_duration) if max_duration else info['duration']
        target_bitrate = int(target_bits / duration)
        # Reserve some for audio (128kbps)
        video_bitrate = max(target_bitrate - 128000, 500000)
        cmd.extend(['-b:v', str(video_bitrate)])

    cmd.append(output_path)

    if verbose:
        print(f"\nOptimizing...")
        print(f"  Command: {' '.join(cmd)}")

    try:
        subprocess.run(cmd, check=True, capture_output=not verbose)

        # Check output
        output_info = get_media_info(output_path)
        if output_info and verbose:
            print(f"\nOutput: {Path(output_path).name}")
            print(f"  Size: {output_info['size'] / (1024*1024):.2f} MB")
            print(f"  Duration: {output_info['duration']:.2f}s")
            if 'width' in output_info:
                print(f"  Resolution: {output_info['width']}x{output_info['height']}")
            compression = (1 - output_info['size'] / info['size']) * 100
            print(f"  Compression: {compression:.1f}%")

        return True

    except subprocess.CalledProcessError as e:
        print(f"Error optimizing video: {e}")
        return False
|
||||
|
||||
|
||||
def optimize_audio(
    input_path: str,
    output_path: str,
    target_size_mb: Optional[int] = None,
    bitrate: str = '64k',
    sample_rate: int = 16000,
    verbose: bool = False
) -> bool:
    """Re-encode audio as mono AAC for Gemini API upload.

    Args:
        input_path: Source audio path.
        output_path: Destination path (overwritten via ffmpeg -y).
        target_size_mb: Approximate size cap in MB; when set, a bitrate is
            derived from the source duration and overrides ``bitrate``.
        bitrate: AAC bitrate passed to ffmpeg (e.g. '64k').
        sample_rate: Output sample rate in Hz.
        verbose: Print input/output stats.

    Returns:
        True when ffmpeg succeeded, False otherwise.
    """
    if not check_ffmpeg():
        print("Error: ffmpeg not installed")
        return False

    info = get_media_info(input_path)
    if not info:
        print(f"Error: Could not read media info from {input_path}")
        return False

    if verbose:
        print(f"Input: {Path(input_path).name}")
        print(f"  Size: {info['size'] / (1024*1024):.2f} MB")
        print(f"  Duration: {info['duration']:.2f}s")

    # Bug fix: target_size_mb was previously accepted but silently ignored.
    # Derive a bitrate from the requested size when it is given.
    if target_size_mb and info['duration'] > 0:
        target_bits = target_size_mb * 8 * 1024 * 1024
        # Floor at 32 kbps so speech remains intelligible.
        bitrate = str(max(int(target_bits / info['duration']), 32000))

    # Build command
    cmd = [
        'ffmpeg', '-i', input_path, '-y',
        '-c:a', 'aac',
        '-b:a', bitrate,
        '-ar', str(sample_rate),
        '-ac', '1',  # Mono (Gemini uses mono anyway)
        output_path
    ]

    if verbose:
        print(f"\nOptimizing...")

    try:
        subprocess.run(cmd, check=True, capture_output=not verbose)

        output_info = get_media_info(output_path)
        if output_info and verbose:
            print(f"\nOutput: {Path(output_path).name}")
            print(f"  Size: {output_info['size'] / (1024*1024):.2f} MB")
            compression = (1 - output_info['size'] / info['size']) * 100
            print(f"  Compression: {compression:.1f}%")

        return True

    except subprocess.CalledProcessError as e:
        print(f"Error optimizing audio: {e}")
        return False
|
||||
|
||||
|
||||
def optimize_image(
    input_path: str,
    output_path: str,
    max_width: int = 1920,
    quality: int = 85,
    verbose: bool = False
) -> bool:
    """Downscale and re-encode an image for Gemini API upload.

    Images wider than ``max_width`` are resized (aspect ratio preserved)
    and saved at the given quality.  Alpha/palette images are flattened
    or converted when the target is JPEG.

    Args:
        input_path: Source image path.
        output_path: Destination path; its extension selects the format.
        max_width: Maximum output width in pixels.
        quality: Encoder quality (1-100; meaningful for JPEG/WebP).
        verbose: Print input/output stats.

    Returns:
        True on success, False otherwise (including when Pillow is missing).
    """
    try:
        from PIL import Image
    except ImportError:
        print("Error: Pillow not installed")
        print("Install with: pip install pillow")
        return False

    try:
        # Context manager ensures the source file handle is released
        # even when resizing or saving raises.
        with Image.open(input_path) as src:
            img = src

            if verbose:
                print(f"Input: {Path(input_path).name}")
                print(f"  Size: {Path(input_path).stat().st_size / 1024:.2f} KB")
                print(f"  Resolution: {img.width}x{img.height}")

            # Resize if needed
            if img.width > max_width:
                ratio = max_width / img.width
                new_height = int(img.height * ratio)
                img = img.resize((max_width, new_height), Image.Resampling.LANCZOS)
                if verbose:
                    print(f"  Resized to: {img.width}x{img.height}")

            # JPEG cannot store alpha or palette data; flatten/convert first.
            # (The old code only handled RGBA; 'P' or 'LA' inputs crashed.)
            if output_path.lower().endswith(('.jpg', '.jpeg')):
                if img.mode in ('RGBA', 'LA'):
                    rgb_img = Image.new('RGB', img.size, (255, 255, 255))
                    rgb_img.paste(img, mask=img.getchannel('A'))
                    img = rgb_img
                elif img.mode not in ('RGB', 'L'):
                    img = img.convert('RGB')

            # Save
            img.save(output_path, quality=quality, optimize=True)

        if verbose:
            print(f"\nOutput: {Path(output_path).name}")
            print(f"  Size: {Path(output_path).stat().st_size / 1024:.2f} KB")
            compression = (1 - Path(output_path).stat().st_size / Path(input_path).stat().st_size) * 100
            print(f"  Compression: {compression:.1f}%")

        return True

    except Exception as e:
        print(f"Error optimizing image: {e}")
        return False
|
||||
|
||||
|
||||
def split_video(
    input_path: str,
    output_dir: str,
    chunk_duration: int = 3600,
    verbose: bool = False
) -> List[str]:
    """Split a long video into fixed-length chunks via stream copy.

    Args:
        input_path: Source video path.
        output_dir: Directory for the chunk files (created if missing).
        chunk_duration: Maximum chunk length in seconds.
        verbose: Print progress messages.

    Returns:
        Paths of the created chunks; [input_path] when the video already
        fits in one chunk; [] when ffmpeg is missing or probing fails.
    """
    if not check_ffmpeg():
        print("Error: ffmpeg not installed")
        return []

    info = get_media_info(input_path)
    if not info:
        return []

    total_duration = info['duration']
    # Bug fix: the old `int(total / chunk) + 1` produced an extra, empty
    # chunk whenever the duration was an exact multiple of chunk_duration.
    # Compute a true ceiling instead (minimum of one chunk).
    full_chunks, remainder = divmod(total_duration, chunk_duration)
    num_chunks = max(int(full_chunks) + (1 if remainder > 0 else 0), 1)

    if num_chunks == 1:
        if verbose:
            print("Video is short enough, no splitting needed")
        return [input_path]

    Path(output_dir).mkdir(parents=True, exist_ok=True)
    output_files = []

    for i in range(num_chunks):
        start_time = i * chunk_duration
        output_file = Path(output_dir) / f"{Path(input_path).stem}_chunk_{i+1}.mp4"

        # '-c copy' avoids re-encoding; cuts land on the nearest keyframes.
        cmd = [
            'ffmpeg', '-i', input_path, '-y',
            '-ss', str(start_time),
            '-t', str(chunk_duration),
            '-c', 'copy',
            str(output_file)
        ]

        if verbose:
            print(f"Creating chunk {i+1}/{num_chunks}...")

        try:
            subprocess.run(cmd, check=True, capture_output=not verbose)
            output_files.append(str(output_file))
        except subprocess.CalledProcessError as e:
            # Keep going: a failed chunk should not abort the remaining ones.
            print(f"Error creating chunk {i+1}: {e}")

    return output_files
|
||||
|
||||
|
||||
def main():
    """CLI entry point: optimize one file, split a video, or batch-optimize a directory.

    Exits with a non-zero status on validation failure or when a
    single-file optimization fails.
    """
    parser = argparse.ArgumentParser(
        description='Optimize media files for Gemini API',
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Examples:
  # Optimize video to 100MB
  %(prog)s --input video.mp4 --output optimized.mp4 --target-size 100

  # Optimize audio
  %(prog)s --input audio.mp3 --output optimized.m4a --bitrate 64k

  # Resize image
  %(prog)s --input image.jpg --output resized.jpg --max-width 1920

  # Split long video
  %(prog)s --input long-video.mp4 --split --chunk-duration 3600 --output-dir ./chunks

  # Batch optimize directory
  %(prog)s --input-dir ./videos --output-dir ./optimized --quality 85
"""
    )

    parser.add_argument('--input', help='Input file')
    parser.add_argument('--output', help='Output file')
    parser.add_argument('--input-dir', help='Input directory for batch processing')
    parser.add_argument('--output-dir', help='Output directory for batch processing')
    parser.add_argument('--target-size', type=int, help='Target size in MB')
    parser.add_argument('--quality', type=int, default=None,
                        help='Quality (video: 0-51 CRF, default 23; image: 1-100, default 85)')
    parser.add_argument('--max-width', type=int, default=1920,
                        help='Max image width (default: 1920)')
    parser.add_argument('--bitrate', default='64k',
                        help='Audio bitrate (default: 64k)')
    parser.add_argument('--resolution', help='Video resolution (e.g., 1920x1080)')
    parser.add_argument('--split', action='store_true', help='Split long video into chunks')
    parser.add_argument('--chunk-duration', type=int, default=3600,
                        help='Chunk duration in seconds (default: 3600 = 1 hour)')
    parser.add_argument('--verbose', '-v', action='store_true', help='Verbose output')

    args = parser.parse_args()

    # Validate arguments
    if not args.input and not args.input_dir:
        parser.error("Either --input or --input-dir required")

    # Bug fix: the old single default of 85 was handed to x264 as a CRF
    # value, which only accepts 0-51, so default-quality video runs always
    # failed.  Resolve a per-media-type default instead; an explicit
    # --quality is still honoured verbatim for both types.
    video_quality = args.quality if args.quality is not None else 23
    image_quality = args.quality if args.quality is not None else 85

    # One source of truth for the supported extensions (the old batch glob
    # list was missing .flv and .aac even though the dispatch accepted them).
    video_exts = ['.mp4', '.mov', '.avi', '.mkv', '.webm', '.flv']
    audio_exts = ['.mp3', '.wav', '.m4a', '.flac', '.aac']
    image_exts = ['.jpg', '.jpeg', '.png', '.webp']

    # Single file processing
    if args.input:
        input_path = Path(args.input)
        if not input_path.exists():
            print(f"Error: Input file not found: {input_path}")
            sys.exit(1)

        if args.split:
            output_dir = args.output_dir or './chunks'
            chunks = split_video(str(input_path), output_dir, args.chunk_duration, args.verbose)
            print(f"\nCreated {len(chunks)} chunks in {output_dir}")
            sys.exit(0)

        if not args.output:
            parser.error("--output required for single file processing")

        output_path = Path(args.output)
        output_path.parent.mkdir(parents=True, exist_ok=True)

        # Determine file type
        ext = input_path.suffix.lower()

        if ext in video_exts:
            success = optimize_video(
                str(input_path),
                str(output_path),
                target_size_mb=args.target_size,
                quality=video_quality,
                resolution=args.resolution,
                verbose=args.verbose
            )
        elif ext in audio_exts:
            success = optimize_audio(
                str(input_path),
                str(output_path),
                target_size_mb=args.target_size,
                bitrate=args.bitrate,
                verbose=args.verbose
            )
        elif ext in image_exts:
            success = optimize_image(
                str(input_path),
                str(output_path),
                max_width=args.max_width,
                quality=image_quality,
                verbose=args.verbose
            )
        else:
            print(f"Error: Unsupported file type: {ext}")
            sys.exit(1)

        sys.exit(0 if success else 1)

    # Batch processing
    if args.input_dir:
        if not args.output_dir:
            parser.error("--output-dir required for batch processing")

        input_dir = Path(args.input_dir)
        output_dir = Path(args.output_dir)
        output_dir.mkdir(parents=True, exist_ok=True)

        # Find all media files
        files = []
        for extension in video_exts + audio_exts + image_exts:
            files.extend(input_dir.glob('*' + extension))

        if not files:
            print(f"No media files found in {input_dir}")
            sys.exit(1)

        print(f"Found {len(files)} files to process")

        success_count = 0
        for input_file in files:
            ext = input_file.suffix.lower()
            success = False

            if ext in video_exts:
                output_file = output_dir / input_file.name
                success = optimize_video(str(input_file), str(output_file),
                                         quality=video_quality, verbose=args.verbose)
            elif ext in audio_exts:
                # Bug fix: audio is re-encoded to AAC, which cannot be muxed
                # into .mp3/.wav/.flac containers — keeping the original
                # extension made ffmpeg fail for those inputs.  Write .m4a.
                output_file = output_dir / (input_file.stem + '.m4a')
                success = optimize_audio(str(input_file), str(output_file),
                                         bitrate=args.bitrate, verbose=args.verbose)
            elif ext in image_exts:
                output_file = output_dir / input_file.name
                success = optimize_image(str(input_file), str(output_file),
                                         max_width=args.max_width, quality=image_quality,
                                         verbose=args.verbose)

            if success:
                success_count += 1

        print(f"\nProcessed: {success_count}/{len(files)} files")


if __name__ == '__main__':
    main()
|
||||
26
skills/ai-multimodal/scripts/requirements.txt
Normal file
26
skills/ai-multimodal/scripts/requirements.txt
Normal file
@@ -0,0 +1,26 @@
|
||||
# AI Multimodal Skill Dependencies
|
||||
# Python 3.10+ required
|
||||
|
||||
# Google Gemini API
|
||||
google-genai>=0.1.0
|
||||
|
||||
# PDF processing
|
||||
pypdf>=4.0.0
|
||||
|
||||
# Document conversion
|
||||
python-docx>=1.0.0
|
||||
docx2pdf>=0.1.8 # Optional: only functional on Windows; safe to omit on Linux/macOS
|
||||
|
||||
# Markdown processing
|
||||
markdown>=3.5.0
|
||||
|
||||
# Image processing
|
||||
Pillow>=10.0.0
|
||||
|
||||
# Environment variable management
|
||||
python-dotenv>=1.0.0
|
||||
|
||||
# Testing dependencies (dev)
|
||||
pytest>=8.0.0
|
||||
pytest-cov>=4.1.0
|
||||
pytest-mock>=3.12.0
|
||||
20
skills/ai-multimodal/scripts/tests/requirements.txt
Normal file
20
skills/ai-multimodal/scripts/tests/requirements.txt
Normal file
@@ -0,0 +1,20 @@
|
||||
# Core dependencies
|
||||
google-genai>=0.2.0
|
||||
python-dotenv>=1.0.0
|
||||
|
||||
# Image processing
|
||||
pillow>=10.0.0
|
||||
|
||||
# PDF processing
|
||||
pypdf>=3.0.0
|
||||
|
||||
# Document conversion
|
||||
markdown>=3.5
|
||||
|
||||
# Testing
|
||||
pytest>=7.4.0
|
||||
pytest-cov>=4.1.0
|
||||
pytest-mock>=3.12.0
|
||||
|
||||
# Optional dependencies for full functionality
|
||||
# ffmpeg-python>=0.2.0 # For media optimization (requires ffmpeg installed)
|
||||
299
skills/ai-multimodal/scripts/tests/test_document_converter.py
Normal file
299
skills/ai-multimodal/scripts/tests/test_document_converter.py
Normal file
@@ -0,0 +1,299 @@
|
||||
"""
|
||||
Tests for document_converter.py
|
||||
"""
|
||||
|
||||
import pytest
|
||||
import sys
|
||||
from pathlib import Path
|
||||
from unittest.mock import Mock, patch, MagicMock
|
||||
|
||||
sys.path.insert(0, str(Path(__file__).parent.parent))
|
||||
|
||||
import document_converter as dc
|
||||
|
||||
|
||||
class TestEnvLoading:
    """Test environment variable loading."""

    @patch('document_converter.load_dotenv')
    @patch('pathlib.Path.exists')
    def test_load_env_files_success(self, mock_exists, mock_load_dotenv):
        """Test successful .env file loading."""
        # Every candidate .env path reports present, so load_dotenv runs.
        mock_exists.return_value = True
        dc.load_env_files()
        # Should be called for skill, skills, and claude dirs
        assert mock_load_dotenv.call_count >= 1

    @patch('document_converter.load_dotenv', None)
    def test_load_env_files_no_dotenv(self):
        """Test when dotenv is not available."""
        # Should not raise an error
        # load_env_files guards on a falsy load_dotenv and returns early.
        dc.load_env_files()
|
||||
|
||||
|
||||
class TestDependencyCheck:
    """Test dependency checking."""

    @patch('builtins.__import__')
    def test_check_all_dependencies_available(self, mock_import):
        """Test when all dependencies are available."""
        # Every import succeeds, so each dependency should be reported.
        mock_import.return_value = Mock()

        deps = dc.check_dependencies()

        assert 'pypdf' in deps
        assert 'markdown' in deps
        assert 'pillow' in deps

    @patch('builtins.__import__')
    def test_check_dependencies_missing(self, mock_import):
        """Test when dependencies are missing."""
        def import_side_effect(name, *args, **kwargs):
            if name == 'pypdf':
                raise ImportError()
            return Mock()

        mock_import.side_effect = import_side_effect

        # The function uses try/except, so we test the actual function
        # NOTE(review): this test currently asserts nothing — the body is a
        # placeholder `pass`; it only documents the intended scenario.
        with patch('document_converter.sys.modules', {}):
            # This is tricky to test due to import handling
            pass
|
||||
|
||||
|
||||
class TestPDFPageExtraction:
    """Test PDF page extraction."""

    # Decorators apply bottom-up: the last decorator (open) maps to the
    # first mock parameter in each test method.

    @patch('pypdf.PdfReader')
    @patch('pypdf.PdfWriter')
    @patch('builtins.open', create=True)
    def test_extract_single_page(self, mock_open, mock_writer_class, mock_reader_class):
        """Test extracting a single page."""
        # Mock reader
        mock_reader = Mock()
        mock_page = Mock()
        # Page '2' (1-based) is the middle element of this three-page doc.
        mock_reader.pages = [Mock(), mock_page, Mock()]
        mock_reader_class.return_value = mock_reader

        # Mock writer
        mock_writer = Mock()
        mock_writer.pages = [mock_page]
        mock_writer_class.return_value = mock_writer

        result = dc.extract_pdf_pages(
            'input.pdf',
            'output.pdf',
            page_range='2',
            verbose=False
        )

        assert result is True
        mock_writer.add_page.assert_called_once_with(mock_page)

    @patch('pypdf.PdfReader')
    @patch('pypdf.PdfWriter')
    @patch('builtins.open', create=True)
    def test_extract_page_range(self, mock_open, mock_writer_class, mock_reader_class):
        """Test extracting a range of pages."""
        mock_reader = Mock()
        mock_reader.pages = [Mock() for _ in range(10)]
        mock_reader_class.return_value = mock_reader

        mock_writer = Mock()
        mock_writer.pages = []
        mock_writer_class.return_value = mock_writer

        result = dc.extract_pdf_pages(
            'input.pdf',
            'output.pdf',
            page_range='2-5',
            verbose=False
        )

        assert result is True
        assert mock_writer.add_page.call_count == 4  # Pages 2-5 (4 pages)

    def test_extract_pages_no_pypdf(self):
        """Test page extraction without pypdf."""
        # A None entry in sys.modules makes `import pypdf` raise ImportError.
        with patch.dict('sys.modules', {'pypdf': None}):
            result = dc.extract_pdf_pages('input.pdf', 'output.pdf', '1-10')
            assert result is False
|
||||
|
||||
|
||||
class TestPDFOptimization:
    """Test PDF optimization."""

    @patch('pypdf.PdfReader')
    @patch('pypdf.PdfWriter')
    @patch('builtins.open', create=True)
    @patch('pathlib.Path.stat')
    def test_optimize_pdf_success(self, mock_stat, mock_open, mock_writer_class, mock_reader_class):
        """Test successful PDF optimization."""
        # Mock reader
        mock_reader = Mock()
        mock_page = Mock()
        mock_reader.pages = [mock_page, mock_page]
        mock_reader_class.return_value = mock_reader

        # Mock writer
        mock_writer = Mock()
        mock_writer.pages = [mock_page, mock_page]
        mock_writer_class.return_value = mock_writer

        # Mock file sizes
        # Identical before/after sizes (1 MiB) keep any ratio math trivial.
        mock_stat.return_value.st_size = 1024 * 1024

        result = dc.optimize_pdf('input.pdf', 'output.pdf', verbose=False)

        assert result is True
        mock_page.compress_content_streams.assert_called()

    def test_optimize_pdf_no_pypdf(self):
        """Test PDF optimization without pypdf."""
        with patch.dict('sys.modules', {'pypdf': None}):
            result = dc.optimize_pdf('input.pdf', 'output.pdf')
            assert result is False
|
||||
|
||||
|
||||
class TestImageExtraction:
    """Test image extraction from PDFs."""

    @patch('pypdf.PdfReader')
    @patch('PIL.Image')
    @patch('pathlib.Path.mkdir')
    @patch('builtins.open', create=True)
    def test_extract_images_success(self, mock_open, mock_mkdir, mock_image, mock_reader_class):
        """Test successful image extraction."""
        # Mock PDF reader
        mock_reader = Mock()
        mock_page = MagicMock()

        # Mock XObject with image
        # One DCTDecode (JPEG) XObject so the extractor sees a single image.
        mock_obj = MagicMock()
        mock_obj.__getitem__.side_effect = lambda k: {
            '/Subtype': '/Image',
            '/Width': 100,
            '/Height': 100,
            '/Filter': '/DCTDecode'
        }[k]
        mock_obj.get_data.return_value = b'image_data'

        mock_xobjects = MagicMock()
        mock_xobjects.__iter__.return_value = ['img1']
        mock_xobjects.__getitem__.return_value = mock_obj

        # Page resources resolve to the XObject dictionary above.
        mock_resources = MagicMock()
        mock_resources.get_object.return_value = mock_xobjects
        mock_page.__getitem__.side_effect = lambda k: {
            '/Resources': {'/XObject': mock_resources}
        }[k]

        mock_reader.pages = [mock_page]
        mock_reader_class.return_value = mock_reader

        result = dc.extract_images_from_pdf('input.pdf', './output', verbose=False)

        assert len(result) > 0

    def test_extract_images_no_dependencies(self):
        """Test image extraction without required dependencies."""
        with patch.dict('sys.modules', {'pypdf': None}):
            result = dc.extract_images_from_pdf('input.pdf', './output')
            assert result == []
|
||||
|
||||
|
||||
class TestMarkdownConversion:
    """Test Markdown to PDF conversion."""

    @patch('markdown.markdown')
    @patch('builtins.open', create=True)
    @patch('subprocess.run')
    @patch('pathlib.Path.unlink')
    def test_convert_markdown_success(self, mock_unlink, mock_run, mock_open, mock_markdown):
        """Test successful Markdown to PDF conversion."""
        mock_markdown.return_value = '<h1>Test</h1>'

        # Mock file reading and writing
        mock_file = MagicMock()
        mock_file.__enter__.return_value.read.return_value = '# Test'
        mock_open.return_value = mock_file

        result = dc.convert_markdown_to_pdf('input.md', 'output.pdf', verbose=False)

        assert result is True
        # One subprocess call: presumably the wkhtmltopdf invocation.
        mock_run.assert_called_once()

    @patch('markdown.markdown')
    @patch('builtins.open', create=True)
    @patch('subprocess.run')
    def test_convert_markdown_no_wkhtmltopdf(self, mock_run, mock_open, mock_markdown):
        """Test Markdown conversion without wkhtmltopdf."""
        mock_markdown.return_value = '<h1>Test</h1>'

        mock_file = MagicMock()
        mock_file.__enter__.return_value.read.return_value = '# Test'
        mock_open.return_value = mock_file

        # A missing binary surfaces as FileNotFoundError from subprocess.run.
        mock_run.side_effect = FileNotFoundError()

        result = dc.convert_markdown_to_pdf('input.md', 'output.pdf', verbose=False)

        assert result is False

    def test_convert_markdown_no_markdown_lib(self):
        """Test Markdown conversion without markdown library."""
        with patch.dict('sys.modules', {'markdown': None}):
            result = dc.convert_markdown_to_pdf('input.md', 'output.pdf')
            assert result is False
|
||||
|
||||
|
||||
class TestHTMLConversion:
    """Tests for HTML-to-PDF conversion."""

    @patch('subprocess.run')
    def test_convert_html_success(self, mock_run):
        """Conversion succeeds when wkhtmltopdf invocation runs cleanly."""
        ok = dc.convert_html_to_pdf('input.html', 'output.pdf', verbose=False)

        assert ok is True
        mock_run.assert_called_once()

    @patch('subprocess.run')
    def test_convert_html_no_wkhtmltopdf(self, mock_run):
        """Conversion reports failure when the wkhtmltopdf binary is missing."""
        mock_run.side_effect = FileNotFoundError()

        ok = dc.convert_html_to_pdf('input.html', 'output.pdf', verbose=False)

        assert ok is False
|
||||
|
||||
|
||||
class TestIntegration:
    """Integration-style checks across helpers."""

    @patch('pathlib.Path.exists')
    def test_file_not_found(self, mock_exists):
        """A missing input path reports as non-existent."""
        mock_exists.return_value = False

        # main() would reject such a path; here we verify the predicate itself.
        assert not Path('nonexistent.pdf').exists()

    @patch('document_converter.check_dependencies')
    def test_check_dependencies_integration(self, mock_check):
        """check_dependencies reports the status of every optional library."""
        mock_check.return_value = {
            'pypdf': True,
            'markdown': True,
            'pillow': True,
        }

        deps = dc.check_dependencies()

        for lib in ('pypdf', 'markdown', 'pillow'):
            assert deps[lib] is True
|
||||
|
||||
|
||||
if __name__ == '__main__':
    # Allow running this test module directly, with coverage reporting.
    args = [__file__, '-v', '--cov=document_converter', '--cov-report=term-missing']
    pytest.main(args)
|
||||
362
skills/ai-multimodal/scripts/tests/test_gemini_batch_process.py
Normal file
362
skills/ai-multimodal/scripts/tests/test_gemini_batch_process.py
Normal file
@@ -0,0 +1,362 @@
|
||||
"""
|
||||
Tests for gemini_batch_process.py
|
||||
"""
|
||||
|
||||
import pytest
|
||||
import sys
|
||||
from pathlib import Path
|
||||
from unittest.mock import Mock, patch, MagicMock
|
||||
|
||||
# Add parent directory to path
|
||||
sys.path.insert(0, str(Path(__file__).parent.parent))
|
||||
|
||||
import gemini_batch_process as gbp
|
||||
|
||||
|
||||
class TestAPIKeyFinder:
    """Tests for API-key discovery."""

    def test_find_api_key_from_env(self, monkeypatch):
        """The key is read from the GEMINI_API_KEY environment variable."""
        monkeypatch.setenv('GEMINI_API_KEY', 'test_key_123')
        assert gbp.find_api_key() == 'test_key_123'

    @patch('gemini_batch_process.load_dotenv')
    def test_find_api_key_not_found(self, mock_load_dotenv, monkeypatch):
        """None is returned when no key exists in the environment."""
        monkeypatch.delenv('GEMINI_API_KEY', raising=False)
        # Keep load_dotenv from pulling a key in from real .env files on disk.
        mock_load_dotenv.return_value = None
        assert gbp.find_api_key() is None
|
||||
|
||||
|
||||
class TestMimeTypeDetection:
    """Tests for file-extension to MIME-type mapping."""

    def test_audio_mime_types(self):
        """Audio extensions map to their audio/* MIME types."""
        expected = {
            'test.mp3': 'audio/mp3',
            'test.wav': 'audio/wav',
            'test.aac': 'audio/aac',
            'test.flac': 'audio/flac',
        }
        for filename, mime in expected.items():
            assert gbp.get_mime_type(filename) == mime

    def test_image_mime_types(self):
        """Image extensions map to their image/* MIME types."""
        expected = {
            'test.jpg': 'image/jpeg',
            'test.jpeg': 'image/jpeg',
            'test.png': 'image/png',
            'test.webp': 'image/webp',
        }
        for filename, mime in expected.items():
            assert gbp.get_mime_type(filename) == mime

    def test_video_mime_types(self):
        """Video extensions map to their video/* MIME types."""
        expected = {
            'test.mp4': 'video/mp4',
            'test.mov': 'video/quicktime',
            'test.avi': 'video/x-msvideo',
        }
        for filename, mime in expected.items():
            assert gbp.get_mime_type(filename) == mime

    def test_document_mime_types(self):
        """Document extensions map to their document MIME types."""
        assert gbp.get_mime_type('test.pdf') == 'application/pdf'
        assert gbp.get_mime_type('test.txt') == 'text/plain'

    def test_unknown_mime_type(self):
        """An unrecognized extension falls back to the generic octet-stream."""
        assert gbp.get_mime_type('test.xyz') == 'application/octet-stream'

    def test_case_insensitive(self):
        """Extension matching ignores letter case."""
        assert gbp.get_mime_type('TEST.MP3') == 'audio/mp3'
        assert gbp.get_mime_type('Test.JPG') == 'image/jpeg'
|
||||
|
||||
|
||||
class TestFileUpload:
    """Tests for the upload_file helper."""

    @patch('gemini_batch_process.genai.Client')
    def test_upload_file_success(self, mock_client_class):
        """An already-ACTIVE file is returned straight after upload."""
        client = Mock()
        uploaded = Mock()
        uploaded.state.name = 'ACTIVE'
        uploaded.name = 'test_file'
        client.files.upload.return_value = uploaded

        result = gbp.upload_file(client, 'test.jpg', verbose=False)

        assert result == uploaded
        client.files.upload.assert_called_once_with(file='test.jpg')

    @patch('gemini_batch_process.genai.Client')
    @patch('gemini_batch_process.time.sleep')
    def test_upload_video_with_processing(self, mock_sleep, mock_client_class):
        """Upload polls until a PROCESSING video transitions to ACTIVE."""
        client = Mock()

        # First poll sees PROCESSING; the follow-up files.get sees ACTIVE.
        processing = Mock()
        processing.state.name = 'PROCESSING'
        processing.name = 'test_video'

        active = Mock()
        active.state.name = 'ACTIVE'
        active.name = 'test_video'

        client.files.upload.return_value = processing
        client.files.get.return_value = active

        result = gbp.upload_file(client, 'test.mp4', verbose=False)

        assert result.state.name == 'ACTIVE'

    @patch('gemini_batch_process.genai.Client')
    def test_upload_file_failed(self, mock_client_class):
        """A FAILED processing state raises ValueError."""
        client = Mock()
        failed = Mock()
        failed.state.name = 'FAILED'
        client.files.upload.return_value = failed
        client.files.get.return_value = failed

        with pytest.raises(ValueError, match="File processing failed"):
            gbp.upload_file(client, 'test.mp4', verbose=False)
|
||||
|
||||
|
||||
class TestProcessFile:
    """Tests for single-file processing."""

    @patch('gemini_batch_process.genai.Client')
    @patch('builtins.open', create=True)
    @patch('pathlib.Path.stat')
    def test_process_small_file_inline(self, mock_stat, mock_open, mock_client_class):
        """A small file is sent inline rather than via the File API."""
        # 10MB is below the inline-upload threshold.
        mock_stat.return_value.st_size = 10 * 1024 * 1024

        # Raw bytes returned when the file is read for inline embedding.
        mock_open.return_value.__enter__.return_value.read.return_value = b'test_data'

        client = Mock()
        response = Mock()
        response.text = 'Test response'
        client.models.generate_content.return_value = response

        outcome = gbp.process_file(
            client=client,
            file_path='test.jpg',
            prompt='Describe this image',
            model='gemini-2.5-flash',
            task='analyze',
            format_output='text',
            verbose=False,
        )

        assert outcome['status'] == 'success'
        assert outcome['response'] == 'Test response'

    @patch('gemini_batch_process.upload_file')
    @patch('gemini_batch_process.genai.Client')
    @patch('pathlib.Path.stat')
    def test_process_large_file_api(self, mock_stat, mock_client_class, mock_upload):
        """A large file goes through the File API upload path."""
        # 50MB exceeds the inline threshold, forcing an upload.
        mock_stat.return_value.st_size = 50 * 1024 * 1024

        uploaded = Mock()
        mock_upload.return_value = uploaded

        client = Mock()
        response = Mock()
        response.text = 'Test response'
        client.models.generate_content.return_value = response

        outcome = gbp.process_file(
            client=client,
            file_path='test.mp4',
            prompt='Summarize this video',
            model='gemini-2.5-flash',
            task='analyze',
            format_output='text',
            verbose=False,
        )

        assert outcome['status'] == 'success'
        mock_upload.assert_called_once()

    @patch('gemini_batch_process.genai.Client')
    @patch('builtins.open', create=True)
    @patch('pathlib.Path.stat')
    def test_process_file_error_handling(self, mock_stat, mock_open, mock_client_class):
        """API errors are captured in the result instead of propagating."""
        mock_stat.return_value.st_size = 1024

        fake_handle = MagicMock()
        fake_handle.__enter__.return_value.read.return_value = b'test_data'
        mock_open.return_value = fake_handle

        client = Mock()
        client.models.generate_content.side_effect = Exception("API Error")

        outcome = gbp.process_file(
            client=client,
            file_path='test.jpg',
            prompt='Test',
            model='gemini-2.5-flash',
            task='analyze',
            format_output='text',
            verbose=False,
            max_retries=1,
        )

        assert outcome['status'] == 'error'
        assert 'API Error' in outcome['error']

    @patch('gemini_batch_process.genai.Client')
    @patch('builtins.open', create=True)
    @patch('pathlib.Path.stat')
    def test_image_generation_with_aspect_ratio(self, mock_stat, mock_open, mock_client_class):
        """Image generation passes an aspect-ratio config to the model."""
        mock_stat.return_value.st_size = 1024

        fake_handle = MagicMock()
        fake_handle.__enter__.return_value.read.return_value = b'test'
        mock_open.return_value = fake_handle

        # Response carries inline image bytes in its first candidate part.
        client = Mock()
        response = Mock()
        response.candidates = [Mock()]
        response.candidates[0].content.parts = [
            Mock(inline_data=Mock(data=b'fake_image_data'))
        ]
        client.models.generate_content.return_value = response

        outcome = gbp.process_file(
            client=client,
            file_path='test.txt',
            prompt='Generate mountain landscape',
            model='gemini-2.5-flash-image',
            task='generate',
            format_output='text',
            aspect_ratio='16:9',
            verbose=False,
        )

        # The call must have received a non-None config carrying the ratio.
        call_args = client.models.generate_content.call_args
        assert call_args.kwargs.get('config') is not None
        assert outcome['status'] == 'success'
        assert 'generated_image' in outcome
|
||||
|
||||
|
||||
class TestBatchProcessing:
    """Tests for multi-file batch processing."""

    @patch('gemini_batch_process.find_api_key')
    @patch('gemini_batch_process.process_file')
    @patch('gemini_batch_process.genai.Client')
    def test_batch_process_success(self, mock_client_class, mock_process, mock_find_key):
        """Every input file yields a success result."""
        mock_find_key.return_value = 'test_key'
        mock_process.return_value = {'status': 'success', 'response': 'Test'}

        outcomes = gbp.batch_process(
            files=['test1.jpg', 'test2.jpg'],
            prompt='Analyze',
            model='gemini-2.5-flash',
            task='analyze',
            format_output='text',
            verbose=False,
            dry_run=False,
        )

        assert len(outcomes) == 2
        assert all(entry['status'] == 'success' for entry in outcomes)

    @patch('gemini_batch_process.find_api_key')
    def test_batch_process_no_api_key(self, mock_find_key):
        """A missing API key aborts with SystemExit."""
        mock_find_key.return_value = None

        with pytest.raises(SystemExit):
            gbp.batch_process(
                files=['test.jpg'],
                prompt='Test',
                model='gemini-2.5-flash',
                task='analyze',
                format_output='text',
                verbose=False,
                dry_run=False,
            )

    @patch('gemini_batch_process.find_api_key')
    def test_batch_process_dry_run(self, mock_find_key):
        """Dry-run mode does no processing and returns an empty list."""
        # The key is not used in a dry run, but mocking avoids sys.exit.
        mock_find_key.return_value = 'test_key'

        outcomes = gbp.batch_process(
            files=['test1.jpg', 'test2.jpg'],
            prompt='Test',
            model='gemini-2.5-flash',
            task='analyze',
            format_output='text',
            verbose=False,
            dry_run=True,
        )

        assert outcomes == []
|
||||
|
||||
|
||||
class TestResultsSaving:
    """Tests for persisting batch results."""

    @patch('builtins.open', create=True)
    @patch('json.dump')
    def test_save_results_json(self, mock_json_dump, mock_open):
        """JSON output goes through json.dump exactly once."""
        payload = [
            {'file': 'test1.jpg', 'status': 'success', 'response': 'Test1'},
            {'file': 'test2.jpg', 'status': 'success', 'response': 'Test2'},
        ]

        gbp.save_results(payload, 'output.json', 'json')

        mock_json_dump.assert_called_once()

    @patch('builtins.open', create=True)
    @patch('csv.DictWriter')
    def test_save_results_csv(self, mock_csv_writer, mock_open):
        """CSV output is produced via csv.DictWriter."""
        payload = [
            {'file': 'test1.jpg', 'status': 'success', 'response': 'Test1'},
            {'file': 'test2.jpg', 'status': 'success', 'response': 'Test2'},
        ]

        gbp.save_results(payload, 'output.csv', 'csv')

        mock_csv_writer.assert_called_once()

    @patch('builtins.open', create=True)
    def test_save_results_markdown(self, mock_open):
        """Markdown output writes at least one chunk to the file handle."""
        handle = MagicMock()
        mock_open.return_value.__enter__.return_value = handle

        payload = [
            {'file': 'test1.jpg', 'status': 'success', 'response': 'Test1'},
            {'file': 'test2.jpg', 'status': 'error', 'error': 'Failed'},
        ]

        gbp.save_results(payload, 'output.md', 'markdown')

        assert handle.write.call_count > 0
|
||||
|
||||
|
||||
if __name__ == '__main__':
    # Allow running this test module directly, with coverage reporting.
    args = [__file__, '-v', '--cov=gemini_batch_process', '--cov-report=term-missing']
    pytest.main(args)
|
||||
373
skills/ai-multimodal/scripts/tests/test_media_optimizer.py
Normal file
373
skills/ai-multimodal/scripts/tests/test_media_optimizer.py
Normal file
@@ -0,0 +1,373 @@
|
||||
"""
|
||||
Tests for media_optimizer.py
|
||||
"""
|
||||
|
||||
import pytest
|
||||
import sys
|
||||
from pathlib import Path
|
||||
from unittest.mock import Mock, patch, MagicMock
|
||||
import json
|
||||
|
||||
sys.path.insert(0, str(Path(__file__).parent.parent))
|
||||
|
||||
import media_optimizer as mo
|
||||
|
||||
|
||||
class TestEnvLoading:
    """Tests for .env discovery and loading."""

    @patch('media_optimizer.load_dotenv')
    @patch('pathlib.Path.exists')
    def test_load_env_files_success(self, mock_exists, mock_load_dotenv):
        """Existing .env files are loaded at least once."""
        mock_exists.return_value = True
        mo.load_env_files()
        # Candidate locations include the skill, skills, and claude dirs.
        assert mock_load_dotenv.call_count >= 1

    @patch('media_optimizer.load_dotenv', None)
    def test_load_env_files_no_dotenv(self):
        """Loading is a silent no-op when python-dotenv is unavailable."""
        mo.load_env_files()
|
||||
|
||||
|
||||
class TestFFmpegCheck:
    """Tests for ffmpeg availability detection."""

    @patch('subprocess.run')
    def test_ffmpeg_installed(self, mock_run):
        """A clean subprocess call means ffmpeg is present."""
        mock_run.return_value = Mock()
        assert mo.check_ffmpeg() is True

    @patch('subprocess.run')
    def test_ffmpeg_not_installed(self, mock_run):
        """A missing binary (FileNotFoundError) means ffmpeg is absent."""
        mock_run.side_effect = FileNotFoundError()
        assert mo.check_ffmpeg() is False

    @patch('subprocess.run')
    def test_ffmpeg_error(self, mock_run):
        """Any other subprocess failure also reports ffmpeg as unusable."""
        mock_run.side_effect = Exception("Error")
        assert mo.check_ffmpeg() is False
|
||||
|
||||
|
||||
class TestMediaInfo:
    """Tests for ffprobe-based media info extraction."""

    @patch('media_optimizer.check_ffmpeg')
    @patch('subprocess.run')
    def test_get_video_info(self, mock_run, mock_check):
        """Format and per-stream fields are parsed from the probe JSON."""
        mock_check.return_value = True

        # Canned ffprobe output with one video and one audio stream.
        probe = Mock()
        probe.stdout = json.dumps({
            'format': {
                'size': '10485760',
                'duration': '120.5',
                'bit_rate': '691200'
            },
            'streams': [
                {
                    'codec_type': 'video',
                    'width': 1920,
                    'height': 1080,
                    'r_frame_rate': '30/1'
                },
                {
                    'codec_type': 'audio',
                    'sample_rate': '48000',
                    'channels': 2
                }
            ]
        })
        mock_run.return_value = probe

        info = mo.get_media_info('test.mp4')

        assert info['size'] == 10485760
        assert info['duration'] == 120.5
        assert info['width'] == 1920
        assert info['height'] == 1080
        assert info['sample_rate'] == 48000

    @patch('media_optimizer.check_ffmpeg')
    def test_get_media_info_no_ffmpeg(self, mock_check):
        """An empty dict is returned when ffmpeg is unavailable."""
        mock_check.return_value = False
        assert mo.get_media_info('test.mp4') == {}

    @patch('media_optimizer.check_ffmpeg')
    @patch('subprocess.run')
    def test_get_media_info_error(self, mock_run, mock_check):
        """Probe failures degrade to an empty dict rather than raising."""
        mock_check.return_value = True
        mock_run.side_effect = Exception("Error")

        assert mo.get_media_info('test.mp4') == {}
|
||||
|
||||
|
||||
class TestVideoOptimization:
    """Tests for video optimization."""

    @patch('media_optimizer.check_ffmpeg')
    @patch('media_optimizer.get_media_info')
    @patch('subprocess.run')
    def test_optimize_video_success(self, mock_run, mock_info, mock_check):
        """A standard optimization pass completes and invokes ffmpeg once."""
        mock_check.return_value = True
        # get_media_info is consulted twice: input first, then output.
        mock_info.side_effect = [
            {
                'size': 50 * 1024 * 1024,
                'duration': 120.0,
                'bit_rate': 3500000,
                'width': 1920,
                'height': 1080
            },
            {
                'size': 25 * 1024 * 1024,
                'duration': 120.0,
                'width': 1920,
                'height': 1080
            },
        ]

        ok = mo.optimize_video(
            'input.mp4',
            'output.mp4',
            quality=23,
            verbose=False,
        )

        assert ok is True
        mock_run.assert_called_once()

    @patch('media_optimizer.check_ffmpeg')
    def test_optimize_video_no_ffmpeg(self, mock_check):
        """Optimization fails fast when ffmpeg is unavailable."""
        mock_check.return_value = False

        assert mo.optimize_video('input.mp4', 'output.mp4') is False

    @patch('media_optimizer.check_ffmpeg')
    @patch('media_optimizer.get_media_info')
    def test_optimize_video_no_info(self, mock_info, mock_check):
        """Optimization fails when the input cannot be probed."""
        mock_check.return_value = True
        mock_info.return_value = {}

        assert mo.optimize_video('input.mp4', 'output.mp4') is False

    @patch('media_optimizer.check_ffmpeg')
    @patch('media_optimizer.get_media_info')
    @patch('subprocess.run')
    def test_optimize_video_with_target_size(self, mock_run, mock_info, mock_check):
        """A target size in MB drives bitrate selection and still succeeds."""
        mock_check.return_value = True
        mock_info.side_effect = [
            {'size': 100 * 1024 * 1024, 'duration': 60.0, 'bit_rate': 3500000},
            {'size': 50 * 1024 * 1024, 'duration': 60.0},
        ]

        ok = mo.optimize_video(
            'input.mp4',
            'output.mp4',
            target_size_mb=50,
            verbose=False,
        )

        assert ok is True

    @patch('media_optimizer.check_ffmpeg')
    @patch('media_optimizer.get_media_info')
    @patch('subprocess.run')
    def test_optimize_video_with_resolution(self, mock_run, mock_info, mock_check):
        """An explicit output resolution is accepted."""
        mock_check.return_value = True
        mock_info.side_effect = [
            {'size': 50 * 1024 * 1024, 'duration': 120.0, 'bit_rate': 3500000},
            {'size': 25 * 1024 * 1024, 'duration': 120.0},
        ]

        ok = mo.optimize_video(
            'input.mp4',
            'output.mp4',
            resolution='1280x720',
            verbose=False,
        )

        assert ok is True
|
||||
|
||||
|
||||
class TestAudioOptimization:
    """Tests for audio optimization."""

    @patch('media_optimizer.check_ffmpeg')
    @patch('media_optimizer.get_media_info')
    @patch('subprocess.run')
    def test_optimize_audio_success(self, mock_run, mock_info, mock_check):
        """Re-encoding with a target bitrate completes via one ffmpeg call."""
        mock_check.return_value = True
        # Input info first, output info second.
        mock_info.side_effect = [
            {'size': 10 * 1024 * 1024, 'duration': 300.0},
            {'size': 5 * 1024 * 1024, 'duration': 300.0},
        ]

        ok = mo.optimize_audio(
            'input.mp3',
            'output.m4a',
            bitrate='64k',
            verbose=False,
        )

        assert ok is True
        mock_run.assert_called_once()

    @patch('media_optimizer.check_ffmpeg')
    def test_optimize_audio_no_ffmpeg(self, mock_check):
        """Optimization fails fast when ffmpeg is unavailable."""
        mock_check.return_value = False

        assert mo.optimize_audio('input.mp3', 'output.m4a') is False
|
||||
|
||||
|
||||
class TestImageOptimization:
    """Tests for Pillow-based image optimization."""

    @patch('PIL.Image.open')
    @patch('pathlib.Path.stat')
    def test_optimize_image_success(self, mock_stat, mock_image_open):
        """An oversized RGB image is resized and the resized copy is saved."""
        resized = Mock()
        resized.mode = 'RGB'

        source = Mock()
        source.width = 3840
        source.height = 2160
        source.mode = 'RGB'
        source.resize.return_value = resized
        mock_image_open.return_value = source

        mock_stat.return_value.st_size = 5 * 1024 * 1024

        ok = mo.optimize_image(
            'input.jpg',
            'output.jpg',
            max_width=1920,
            quality=85,
            verbose=False,
        )

        assert ok is True
        # The width exceeded max_width, so save happens on the resized image.
        resized.save.assert_called_once()

    @patch('PIL.Image.open')
    @patch('pathlib.Path.stat')
    def test_optimize_image_resize(self, mock_stat, mock_image_open):
        """Images wider than max_width are resized exactly once."""
        source = Mock()
        source.width = 3840
        source.height = 2160
        source.mode = 'RGB'
        resized = Mock()
        source.resize.return_value = resized
        mock_image_open.return_value = source

        mock_stat.return_value.st_size = 5 * 1024 * 1024

        mo.optimize_image('input.jpg', 'output.jpg', max_width=1920, verbose=False)

        source.resize.assert_called_once()

    @patch('PIL.Image.open')
    @patch('pathlib.Path.stat')
    def test_optimize_image_rgba_to_jpg(self, mock_stat, mock_image_open):
        """RGBA input is flattened onto a new RGB canvas for JPEG output."""
        source = Mock()
        source.width = 1920
        source.height = 1080
        source.mode = 'RGBA'
        # split() yields the R, G, B, A bands; A is used as the paste mask.
        source.split.return_value = [Mock(), Mock(), Mock(), Mock()]
        mock_image_open.return_value = source

        mock_stat.return_value.st_size = 1024 * 1024

        with patch('PIL.Image.new') as mock_new:
            canvas = Mock()
            mock_new.return_value = canvas

            mo.optimize_image('input.png', 'output.jpg', verbose=False)

            mock_new.assert_called_once()

    def test_optimize_image_no_pillow(self):
        """Optimization reports failure when Pillow cannot be imported."""
        with patch.dict('sys.modules', {'PIL': None}):
            ok = mo.optimize_image('input.jpg', 'output.jpg')
            # The failed import is handled internally; no exception escapes.
            assert ok is False
|
||||
|
||||
|
||||
class TestVideoSplitting:
    """Tests for chunked video splitting."""

    @patch('media_optimizer.check_ffmpeg')
    @patch('media_optimizer.get_media_info')
    @patch('subprocess.run')
    @patch('pathlib.Path.mkdir')
    def test_split_video_success(self, mock_mkdir, mock_run, mock_info, mock_check):
        """A 2-hour video splits into hourly chunks (plus one safety chunk)."""
        mock_check.return_value = True
        mock_info.return_value = {'duration': 7200.0}  # 2 hours

        chunks = mo.split_video(
            'input.mp4',
            './chunks',
            chunk_duration=3600,  # 1-hour chunks
            verbose=False,
        )

        # 7200s / 3600s = 2 chunks, +1 safety chunk = 3 ffmpeg invocations.
        assert len(chunks) == 3
        assert mock_run.call_count == 3

    @patch('media_optimizer.check_ffmpeg')
    @patch('media_optimizer.get_media_info')
    def test_split_video_short_duration(self, mock_info, mock_check):
        """A video shorter than one chunk is returned unsplit."""
        mock_check.return_value = True
        mock_info.return_value = {'duration': 1800.0}  # 30 minutes

        chunks = mo.split_video(
            'input.mp4',
            './chunks',
            chunk_duration=3600,  # 1 hour
            verbose=False,
        )

        assert chunks == ['input.mp4']

    @patch('media_optimizer.check_ffmpeg')
    def test_split_video_no_ffmpeg(self, mock_check):
        """Splitting fails fast (empty list) when ffmpeg is unavailable."""
        mock_check.return_value = False

        assert mo.split_video('input.mp4', './chunks') == []
|
||||
|
||||
|
||||
if __name__ == '__main__':
    # Allow running this test module directly, with coverage reporting.
    args = [__file__, '-v', '--cov=media_optimizer', '--cov-report=term-missing']
    pytest.main(args)
|
||||
Reference in New Issue
Block a user