Initial commit

This commit is contained in:
Zhongwei Li
2025-11-30 08:48:52 +08:00
commit 6ec3196ecc
434 changed files with 125248 additions and 0 deletions

View File

@@ -0,0 +1,480 @@
#!/usr/bin/env python3
"""
Batch process multiple media files using Gemini API.
Supports all Gemini modalities:
- Audio: Transcription, analysis, summarization
- Image: Captioning, detection, OCR, analysis
- Video: Summarization, Q&A, scene detection
- Document: PDF extraction, structured output
- Generation: Image creation from text prompts
"""
import argparse
import json
import os
import sys
import time
from pathlib import Path
from typing import List, Dict, Any, Optional
import csv
import shutil
try:
from google import genai
from google.genai import types
except ImportError:
print("Error: google-genai package not installed")
print("Install with: pip install google-genai")
sys.exit(1)
try:
from dotenv import load_dotenv
except ImportError:
load_dotenv = None
def find_api_key() -> Optional[str]:
    """Find the Gemini API key, checking sources from highest to lowest priority.

    Priority order:
        1. ``GEMINI_API_KEY`` already present in the process environment
        2. .claude/skills/ai-multimodal/.env (skill-specific config)
        3. .claude/skills/.env (shared skills config)
        4. .claude/.env (Claude global config)

    The ``.env`` files are only consulted when ``load_dotenv`` is available
    (the optional ``python-dotenv`` import succeeded). Each existing file is
    loaded in turn and the environment is re-checked after every load.

    Returns:
        The API key, or None when no source provides a non-empty value.
    """
    # Priority 1: value already exported in the environment.
    api_key = os.getenv('GEMINI_API_KEY')
    if api_key:
        return api_key

    # Without python-dotenv there is nothing else to consult.
    if not load_dotenv:
        return None

    # Anchor on the absolute script path. With a relative __file__ (for
    # example "script.py"), Path.parent collapses to "." at every level and
    # all three lookups would silently target the current working directory.
    script_dir = Path(os.path.abspath(__file__)).parent
    skill_dir = script_dir.parent    # .claude/skills/ai-multimodal
    skills_dir = skill_dir.parent    # .claude/skills
    claude_dir = skills_dir.parent   # .claude

    # Priorities 2-4: most specific config directory first.
    for config_dir in (skill_dir, skills_dir, claude_dir):
        env_file = config_dir / '.env'
        if not env_file.exists():
            continue
        load_dotenv(env_file)
        api_key = os.getenv('GEMINI_API_KEY')
        if api_key:
            return api_key

    return None
def get_mime_type(file_path: str) -> str:
    """Map a file's extension (case-insensitive) to a MIME type string.

    Extensions that are not in the table, including a missing extension,
    yield 'application/octet-stream'.
    """
    audio_types = {
        '.mp3': 'audio/mp3',
        '.wav': 'audio/wav',
        '.aac': 'audio/aac',
        '.flac': 'audio/flac',
        '.ogg': 'audio/ogg',
        '.aiff': 'audio/aiff',
    }
    image_types = {
        '.jpg': 'image/jpeg',
        '.jpeg': 'image/jpeg',
        '.png': 'image/png',
        '.webp': 'image/webp',
        '.heic': 'image/heic',
        '.heif': 'image/heif',
    }
    video_types = {
        '.mp4': 'video/mp4',
        '.mpeg': 'video/mpeg',
        '.mov': 'video/quicktime',
        '.avi': 'video/x-msvideo',
        '.flv': 'video/x-flv',
        '.mpg': 'video/mpeg',
        '.webm': 'video/webm',
        '.wmv': 'video/x-ms-wmv',
        '.3gpp': 'video/3gpp',
    }
    document_types = {
        '.pdf': 'application/pdf',
        '.txt': 'text/plain',
        '.html': 'text/html',
        '.md': 'text/markdown',
    }
    # The categories share no keys, so merge order does not matter.
    lookup = {**audio_types, **image_types, **video_types, **document_types}

    suffix = Path(file_path).suffix.lower()
    if suffix in lookup:
        return lookup[suffix]
    return 'application/octet-stream'
def upload_file(client: genai.Client, file_path: str, verbose: bool = False) -> Any:
    """Upload a file through the client's Files API and return the file handle.

    For paths whose extension maps to a video/* or audio/* MIME type, the
    file state is polled every 2 seconds while it reports 'PROCESSING', for
    up to 300 seconds of accumulated sleep time.

    Raises:
        ValueError: the polled state ended as 'FAILED'.
        TimeoutError: the state is still 'PROCESSING' after the wait budget.
    """
    if verbose:
        print(f"Uploading {file_path}...")
    uploaded = client.files.upload(file=file_path)

    # Only audio/video uploads are polled; other types are returned as-is.
    kind = get_mime_type(file_path)
    if kind.startswith(('video/', 'audio/')):
        poll_interval = 2
        max_wait = 300  # 5 minutes
        waited = 0
        while True:
            if uploaded.state.name != 'PROCESSING' or waited >= max_wait:
                break
            time.sleep(poll_interval)
            uploaded = client.files.get(name=uploaded.name)
            waited += poll_interval
            if verbose and waited % 10 == 0:
                print(f" Processing... {waited}s")

        if uploaded.state.name == 'FAILED':
            raise ValueError(f"File processing failed: {file_path}")
        if uploaded.state.name == 'PROCESSING':
            raise TimeoutError(f"Processing timeout after {max_wait}s: {file_path}")

    if verbose:
        print(f" Uploaded: {uploaded.name}")
    return uploaded
def _build_request_config(
    task: str,
    format_output: str,
    aspect_ratio: Optional[str],
) -> Any:
    """Return the GenerateContentConfig for a request, or None if no option applies."""
    config_args: Dict[str, Any] = {}
    if task == 'generate':
        config_args['response_modalities'] = ['Image']  # Capital I per API spec
        if aspect_ratio:
            # aspect_ratio is nested inside image_config per API spec
            config_args['image_config'] = types.ImageConfig(
                aspect_ratio=aspect_ratio
            )
    if format_output == 'json':
        config_args['response_mime_type'] = 'application/json'
    return types.GenerateContentConfig(**config_args) if config_args else None


def _default_image_output_dir() -> Path:
    """Return ``<project root>/docs/assets``, creating the directory if missing.

    The project root is the nearest directory, starting at this script's own
    directory and walking upward, that contains ``.git`` or ``.claude``. The
    script directory itself is used when no such marker is found.
    """
    script_dir = Path(__file__).parent
    project_root = script_dir
    for candidate in [script_dir, *script_dir.parents]:
        if (candidate / '.git').exists() or (candidate / '.claude').exists():
            project_root = candidate
            break
    output_dir = project_root / 'docs' / 'assets'
    output_dir.mkdir(parents=True, exist_ok=True)
    return output_dir


def _save_generated_images(
    response: Any,
    file_path: Optional[str],
    verbose: bool,
) -> Optional[str]:
    """Write each inline-data part of the first candidate to a PNG file.

    Images go next to the input file when one was given, otherwise under the
    default docs/assets directory. Returns the path of the last image
    written, or None when no part carried inline data.
    """
    last_saved = None
    for index, part in enumerate(response.candidates[0].content.parts):
        if not part.inline_data:
            continue
        if file_path:
            output_dir = Path(file_path).parent
            base_name = Path(file_path).stem
        else:
            output_dir = _default_image_output_dir()
            base_name = "generated"
        output_file = output_dir / f"{base_name}_generated_{index}.png"
        with open(output_file, 'wb') as f:
            f.write(part.inline_data.data)
        last_saved = str(output_file)
        if verbose:
            print(f" Saved image to: {output_file}")
    return last_saved


def process_file(
    client: genai.Client,
    file_path: Optional[str],
    prompt: str,
    model: str,
    task: str,
    format_output: str,
    aspect_ratio: Optional[str] = None,
    verbose: bool = False,
    max_retries: int = 3
) -> Dict[str, Any]:
    """Process a single file (or a prompt-only generation) with retry logic.

    Args:
        client: Gemini client used for uploads and content generation.
        file_path: Input file, or None for a 'generate' task without input.
        prompt: Text prompt sent with the request.
        model: Model name passed to ``generate_content``.
        task: Task name; 'generate' enables image output handling.
        format_output: Output format; 'json' requests a JSON response.
        aspect_ratio: Optional aspect ratio for image generation.
        verbose: Print progress and retry messages.
        max_retries: Maximum number of attempts; values below 1 are treated
            as 1 so a result dict is always returned.

    Returns:
        A dict with 'file' and 'status'. On success it also has 'response'
        and, when an image was written, 'generated_image'. On failure it has
        'error' with the exception text.
    """
    label = str(Path(file_path)) if file_path else 'generated'

    def failure(message: str) -> Dict[str, Any]:
        return {'file': label, 'status': 'error', 'error': message}

    # Only prompt-only generation may run without an input file.
    if not file_path and task != 'generate':
        return failure('No input file provided')

    inline_limit = 20 * 1024 * 1024  # larger files go through the File API
    attempts = max(1, max_retries)

    for attempt in range(attempts):
        try:
            if not file_path:
                # Generation task without an input file
                content = [prompt]
            else:
                source = Path(file_path)
                if source.stat().st_size > inline_limit:
                    uploaded = upload_file(client, str(source), verbose)
                    content = [prompt, uploaded]
                else:
                    with open(source, 'rb') as f:
                        file_bytes = f.read()
                    mime_type = get_mime_type(str(source))
                    content = [
                        prompt,
                        types.Part.from_bytes(data=file_bytes, mime_type=mime_type)
                    ]

            response = client.models.generate_content(
                model=model,
                contents=content,
                config=_build_request_config(task, format_output, aspect_ratio)
            )

            result = {
                'file': label,
                'status': 'success',
                'response': getattr(response, 'text', None)
            }

            if task == 'generate' and hasattr(response, 'candidates'):
                saved = _save_generated_images(response, file_path, verbose)
                if saved is not None:
                    result['generated_image'] = saved
            return result
        except (FileNotFoundError, IsADirectoryError, PermissionError) as e:
            # Local filesystem problems will not resolve on retry; fail fast
            # instead of sleeping through the backoff schedule.
            return failure(str(e))
        except Exception as e:
            if attempt == attempts - 1:
                return failure(str(e))
            wait_time = 2 ** attempt  # exponential backoff: 1s, 2s, 4s, ...
            if verbose:
                print(f" Retry {attempt + 1} after {wait_time}s: {e}")
            time.sleep(wait_time)

    # Unreachable because attempts >= 1; kept so every path returns a dict.
    return failure('No attempts were made')
def batch_process(
    files: List[str],
    prompt: str,
    model: str,
    task: str,
    format_output: str,
    aspect_ratio: Optional[str] = None,
    output_file: Optional[str] = None,
    verbose: bool = False,
    dry_run: bool = False
) -> List[Dict[str, Any]]:
    """Batch process multiple files, or run one prompt-only generation.

    Args:
        files: Input file paths; may be empty for a 'generate' task.
        prompt: Prompt sent with every request.
        model: Model name to use.
        task: Task name ('generate' with no files runs a single generation).
        format_output: Output format, forwarded to processing and saving.
        aspect_ratio: Optional aspect ratio for image generation.
        output_file: When set, results are written there via save_results.
        verbose: Print per-item progress.
        dry_run: Print the plan and return without any API key lookup or
            API call.

    Returns:
        One result dict per processed item; an empty list for a dry run.

    Exits the process with status 1 when no API key can be found.
    """
    # A dry run makes no API calls, so it must not require an API key.
    if dry_run:
        print("DRY RUN MODE - No API calls will be made")
        print(f"Files to process: {len(files)}")
        print(f"Model: {model}")
        print(f"Task: {task}")
        print(f"Prompt: {prompt}")
        return []

    api_key = find_api_key()
    if not api_key:
        print("Error: GEMINI_API_KEY not found")
        print("Set via: export GEMINI_API_KEY='your-key'")
        print("Or create .env file with: GEMINI_API_KEY=your-key")
        sys.exit(1)

    client = genai.Client(api_key=api_key)

    # Each job is (banner shown in verbose mode, input path or None).
    if task == 'generate' and not files:
        jobs = [("\nGenerating image from prompt...", None)]
    else:
        total = len(files)
        jobs = [
            (f"\n[{index}/{total}] Processing: {path}", path)
            for index, path in enumerate(files, 1)
        ]

    results = []
    for banner, path in jobs:
        if verbose:
            print(banner)
        result = process_file(
            client=client,
            file_path=path,
            prompt=prompt,
            model=model,
            task=task,
            format_output=format_output,
            aspect_ratio=aspect_ratio,
            verbose=verbose
        )
        results.append(result)
        if verbose:
            status = result.get('status', 'unknown')
            print(f" Status: {status}")

    if output_file:
        save_results(results, output_file, format_output)
    return results
def save_results(results: List[Dict[str, Any]], output_file: str, format_output: str) -> None:
    """Save results to a file as JSON, CSV, or a Markdown report.

    Special case: when ``output_file`` has an image extension and there is
    exactly one result, the result's 'generated_image' is copied to
    ``output_file`` and nothing else is written. If that single result has no
    generated image, the report is redirected to ``<stem>.error.txt`` so text
    is never written into an image-named file.

    Args:
        results: Result dicts as produced by process_file.
        output_file: Destination path.
        format_output: 'json' or 'csv'; any other value writes Markdown.
    """
    output_path = Path(output_file)

    image_extensions = {'.png', '.jpg', '.jpeg', '.webp', '.gif', '.bmp'}
    if output_path.suffix.lower() in image_extensions and len(results) == 1:
        generated_image = results[0].get('generated_image')
        if generated_image:
            shutil.copy2(generated_image, output_path)
            return
        # Don't write text reports to image files - save error as .txt instead
        output_path = output_path.with_suffix('.error.txt')
        print(f"Warning: Generation failed, saving error report to: {output_path}")

    # Always write UTF-8: model output routinely contains non-ASCII text,
    # which the platform default encoding may be unable to represent.
    if format_output == 'json':
        with open(output_path, 'w', encoding='utf-8') as f:
            json.dump(results, f, indent=2)
    elif format_output == 'csv':
        fieldnames = ['file', 'status', 'response', 'error']
        with open(output_path, 'w', newline='', encoding='utf-8') as f:
            writer = csv.DictWriter(f, fieldnames=fieldnames)
            writer.writeheader()
            for result in results:
                writer.writerow({
                    'file': result.get('file', ''),
                    'status': result.get('status', ''),
                    'response': result.get('response', ''),
                    'error': result.get('error', '')
                })
    else:  # markdown
        with open(output_path, 'w', encoding='utf-8') as f:
            f.write("# Batch Processing Results\n\n")
            for i, result in enumerate(results, 1):
                f.write(f"## {i}. {result.get('file', 'Unknown')}\n\n")
                f.write(f"**Status**: {result.get('status', 'unknown')}\n\n")
                if result.get('response'):
                    f.write(f"**Response**:\n\n{result['response']}\n\n")
                if result.get('error'):
                    f.write(f"**Error**: {result['error']}\n\n")
def main():
    """Command-line entry point: parse arguments, run the batch, print a summary.

    Non-generation tasks require --files; the generation task requires
    --prompt. Other tasks fall back to a built-in default prompt when none
    is given.
    """
    usage_examples = """
Examples:
# Transcribe multiple audio files
%(prog)s --files *.mp3 --task transcribe --model gemini-2.5-flash
# Analyze images
%(prog)s --files *.jpg --task analyze --prompt "Describe this image" \\
--model gemini-2.5-flash
# Process PDFs to JSON
%(prog)s --files *.pdf --task extract --prompt "Extract data as JSON" \\
--format json --output results.json
# Generate images
%(prog)s --task generate --prompt "A mountain landscape" \\
--model gemini-2.5-flash-image --aspect-ratio 16:9
"""
    parser = argparse.ArgumentParser(
        description='Batch process media files with Gemini API',
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog=usage_examples,
    )
    add = parser.add_argument
    add('--files', nargs='*', help='Input files to process')
    add('--task', required=True,
        choices=['transcribe', 'analyze', 'extract', 'generate'],
        help='Task to perform')
    add('--prompt', help='Prompt for analysis/generation')
    add('--model', default='gemini-2.5-flash',
        help='Gemini model to use (default: gemini-2.5-flash)')
    add('--format', dest='format_output', default='text',
        choices=['text', 'json', 'csv', 'markdown'],
        help='Output format (default: text)')
    add('--aspect-ratio', choices=['1:1', '16:9', '9:16', '4:3', '3:4'],
        help='Aspect ratio for image generation')
    add('--output', help='Output file for results')
    add('--verbose', '-v', action='store_true',
        help='Verbose output')
    add('--dry-run', action='store_true',
        help='Show what would be done without making API calls')
    args = parser.parse_args()

    # Validate the argument combination for the chosen task.
    generating = args.task == 'generate'
    if not generating and not args.files:
        parser.error("--files required for non-generation tasks")
    if generating and not args.prompt:
        parser.error("--prompt required for generation task")

    # Non-generation tasks get a task-specific default prompt.
    if not generating and not args.prompt:
        default_prompts = {
            'transcribe': 'Generate a transcript with timestamps',
            'analyze': 'Analyze this content',
            'extract': 'Extract key information',
        }
        args.prompt = default_prompts.get(args.task, args.prompt)

    results = batch_process(
        files=args.files or [],
        prompt=args.prompt,
        model=args.model,
        task=args.task,
        format_output=args.format_output,
        aspect_ratio=args.aspect_ratio,
        output_file=args.output,
        verbose=args.verbose,
        dry_run=args.dry_run
    )

    # No summary for dry runs or when nothing was processed.
    if args.dry_run or not results:
        return

    total = len(results)
    success = sum(1 for entry in results if entry.get('status') == 'success')
    print(f"\n{'='*50}")
    print(f"Processed: {total} files")
    print(f"Success: {success}")
    print(f"Failed: {total - success}")
    if args.output:
        print(f"Results saved to: {args.output}")
# Run the CLI only when this file is executed as a script, not on import.
if __name__ == '__main__':
    main()