Initial commit

2025-11-30 08:48:52 +08:00
commit 6ec3196ecc
434 changed files with 125248 additions and 0 deletions
--- a/skills/ai-multimodal/scripts/gemini_batch_process.py
+++ b/skills/ai-multimodal/scripts/gemini_batch_process.py
@@ -0,0 +1,480 @@
+#!/usr/bin/env python3
+"""
+Batch process multiple media files using Gemini API.
+
+Supports all Gemini modalities:
+- Audio: Transcription, analysis, summarization
+- Image: Captioning, detection, OCR, analysis
+- Video: Summarization, Q&A, scene detection
+- Document: PDF extraction, structured output
+- Generation: Image creation from text prompts
+"""
+
+import argparse
+import json
+import os
+import sys
+import time
+from pathlib import Path
+from typing import List, Dict, Any, Optional
+import csv
+import shutil
+
+try:
+    from google import genai
+    from google.genai import types
+except ImportError:
+    print("Error: google-genai package not installed")
+    print("Install with: pip install google-genai")
+    sys.exit(1)
+
+try:
+    from dotenv import load_dotenv
+except ImportError:
+    load_dotenv = None
+
+
+def find_api_key() -> Optional[str]:
+    """Find Gemini API key using correct priority order.
+
+    Priority order (highest to lowest):
+    1. process.env (runtime environment variables)
+    2. .claude/skills/ai-multimodal/.env (skill-specific config)
+    3. .claude/skills/.env (shared skills config)
+    4. .claude/.env (Claude global config)
+    """
+    # Priority 1: Already in process.env (highest)
+    api_key = os.getenv('GEMINI_API_KEY')
+    if api_key:
+        return api_key
+
+    # Load .env files if dotenv available
+    if load_dotenv:
+        # Determine base paths
+        script_dir = Path(__file__).parent
+        skill_dir = script_dir.parent  # .claude/skills/ai-multimodal
+        skills_dir = skill_dir.parent   # .claude/skills
+        claude_dir = skills_dir.parent  # .claude
+
+        # Priority 2: Skill-specific .env
+        env_file = skill_dir / '.env'
+        if env_file.exists():
+            load_dotenv(env_file)
+            api_key = os.getenv('GEMINI_API_KEY')
+            if api_key:
+                return api_key
+
+        # Priority 3: Shared skills .env
+        env_file = skills_dir / '.env'
+        if env_file.exists():
+            load_dotenv(env_file)
+            api_key = os.getenv('GEMINI_API_KEY')
+            if api_key:
+                return api_key
+
+        # Priority 4: Claude global .env
+        env_file = claude_dir / '.env'
+        if env_file.exists():
+            load_dotenv(env_file)
+            api_key = os.getenv('GEMINI_API_KEY')
+            if api_key:
+                return api_key
+
+    return None
+
+
+def get_mime_type(file_path: str) -> str:
+    """Determine MIME type from file extension."""
+    ext = Path(file_path).suffix.lower()
+
+    mime_types = {
+        # Audio
+        '.mp3': 'audio/mp3',
+        '.wav': 'audio/wav',
+        '.aac': 'audio/aac',
+        '.flac': 'audio/flac',
+        '.ogg': 'audio/ogg',
+        '.aiff': 'audio/aiff',
+        # Image
+        '.jpg': 'image/jpeg',
+        '.jpeg': 'image/jpeg',
+        '.png': 'image/png',
+        '.webp': 'image/webp',
+        '.heic': 'image/heic',
+        '.heif': 'image/heif',
+        # Video
+        '.mp4': 'video/mp4',
+        '.mpeg': 'video/mpeg',
+        '.mov': 'video/quicktime',
+        '.avi': 'video/x-msvideo',
+        '.flv': 'video/x-flv',
+        '.mpg': 'video/mpeg',
+        '.webm': 'video/webm',
+        '.wmv': 'video/x-ms-wmv',
+        '.3gpp': 'video/3gpp',
+        # Document
+        '.pdf': 'application/pdf',
+        '.txt': 'text/plain',
+        '.html': 'text/html',
+        '.md': 'text/markdown',
+    }
+
+    return mime_types.get(ext, 'application/octet-stream')
+
+
+def upload_file(client: genai.Client, file_path: str, verbose: bool = False) -> Any:
+    """Upload file to Gemini File API."""
+    if verbose:
+        print(f"Uploading {file_path}...")
+
+    myfile = client.files.upload(file=file_path)
+
+    # Wait for processing (video/audio files need processing)
+    mime_type = get_mime_type(file_path)
+    if mime_type.startswith('video/') or mime_type.startswith('audio/'):
+        max_wait = 300  # 5 minutes
+        elapsed = 0
+        while myfile.state.name == 'PROCESSING' and elapsed < max_wait:
+            time.sleep(2)
+            myfile = client.files.get(name=myfile.name)
+            elapsed += 2
+            if verbose and elapsed % 10 == 0:
+                print(f"  Processing... {elapsed}s")
+
+        if myfile.state.name == 'FAILED':
+            raise ValueError(f"File processing failed: {file_path}")
+
+        if myfile.state.name == 'PROCESSING':
+            raise TimeoutError(f"Processing timeout after {max_wait}s: {file_path}")
+
+    if verbose:
+        print(f"  Uploaded: {myfile.name}")
+
+    return myfile
+
+
+def process_file(
+    client: genai.Client,
+    file_path: Optional[str],
+    prompt: str,
+    model: str,
+    task: str,
+    format_output: str,
+    aspect_ratio: Optional[str] = None,
+    verbose: bool = False,
+    max_retries: int = 3
+) -> Dict[str, Any]:
+    """Process a single file with retry logic."""
+
+    for attempt in range(max_retries):
+        try:
+            # For generation tasks without input files
+            if task == 'generate' and not file_path:
+                content = [prompt]
+            else:
+                # Process input file
+                file_path = Path(file_path)
+                # Determine if we need File API
+                file_size = file_path.stat().st_size
+                use_file_api = file_size > 20 * 1024 * 1024  # >20MB
+
+                if use_file_api:
+                    # Upload to File API
+                    myfile = upload_file(client, str(file_path), verbose)
+                    content = [prompt, myfile]
+                else:
+                    # Inline data
+                    with open(file_path, 'rb') as f:
+                        file_bytes = f.read()
+
+                    mime_type = get_mime_type(str(file_path))
+                    content = [
+                        prompt,
+                        types.Part.from_bytes(data=file_bytes, mime_type=mime_type)
+                    ]
+
+            # Configure request
+            config_args = {}
+            if task == 'generate':
+                config_args['response_modalities'] = ['Image']  # Capital I per API spec
+                if aspect_ratio:
+                    # Nest aspect_ratio in image_config per API spec
+                    config_args['image_config'] = types.ImageConfig(
+                        aspect_ratio=aspect_ratio
+                    )
+
+            if format_output == 'json':
+                config_args['response_mime_type'] = 'application/json'
+
+            config = types.GenerateContentConfig(**config_args) if config_args else None
+
+            # Generate content
+            response = client.models.generate_content(
+                model=model,
+                contents=content,
+                config=config
+            )
+
+            # Extract response
+            result = {
+                'file': str(file_path) if file_path else 'generated',
+                'status': 'success',
+                'response': response.text if hasattr(response, 'text') else None
+            }
+
+            # Handle image output
+            if task == 'generate' and hasattr(response, 'candidates'):
+                for i, part in enumerate(response.candidates[0].content.parts):
+                    if part.inline_data:
+                        # Determine output directory - use project root docs/assets
+                        if file_path:
+                            output_dir = Path(file_path).parent
+                            base_name = Path(file_path).stem
+                        else:
+                            # Find project root (look for .git or .claude directory)
+                            script_dir = Path(__file__).parent
+                            project_root = script_dir
+                            for parent in [script_dir] + list(script_dir.parents):
+                                if (parent / '.git').exists() or (parent / '.claude').exists():
+                                    project_root = parent
+                                    break
+
+                            output_dir = project_root / 'docs' / 'assets'
+                            output_dir.mkdir(parents=True, exist_ok=True)
+                            base_name = "generated"
+
+                        output_file = output_dir / f"{base_name}_generated_{i}.png"
+                        with open(output_file, 'wb') as f:
+                            f.write(part.inline_data.data)
+                        result['generated_image'] = str(output_file)
+                        if verbose:
+                            print(f"  Saved image to: {output_file}")
+
+            return result
+
+        except Exception as e:
+            if attempt == max_retries - 1:
+                return {
+                    'file': str(file_path) if file_path else 'generated',
+                    'status': 'error',
+                    'error': str(e)
+                }
+
+            wait_time = 2 ** attempt
+            if verbose:
+                print(f"  Retry {attempt + 1} after {wait_time}s: {e}")
+            time.sleep(wait_time)
+
+
+def batch_process(
+    files: List[str],
+    prompt: str,
+    model: str,
+    task: str,
+    format_output: str,
+    aspect_ratio: Optional[str] = None,
+    output_file: Optional[str] = None,
+    verbose: bool = False,
+    dry_run: bool = False
+) -> List[Dict[str, Any]]:
+    """Batch process multiple files."""
+    api_key = find_api_key()
+    if not api_key:
+        print("Error: GEMINI_API_KEY not found")
+        print("Set via: export GEMINI_API_KEY='your-key'")
+        print("Or create .env file with: GEMINI_API_KEY=your-key")
+        sys.exit(1)
+
+    if dry_run:
+        print("DRY RUN MODE - No API calls will be made")
+        print(f"Files to process: {len(files)}")
+        print(f"Model: {model}")
+        print(f"Task: {task}")
+        print(f"Prompt: {prompt}")
+        return []
+
+    client = genai.Client(api_key=api_key)
+    results = []
+
+    # For generation tasks without input files, process once
+    if task == 'generate' and not files:
+        if verbose:
+            print(f"\nGenerating image from prompt...")
+
+        result = process_file(
+            client=client,
+            file_path=None,
+            prompt=prompt,
+            model=model,
+            task=task,
+            format_output=format_output,
+            aspect_ratio=aspect_ratio,
+            verbose=verbose
+        )
+
+        results.append(result)
+
+        if verbose:
+            status = result.get('status', 'unknown')
+            print(f"  Status: {status}")
+    else:
+        # Process input files
+        for i, file_path in enumerate(files, 1):
+            if verbose:
+                print(f"\n[{i}/{len(files)}] Processing: {file_path}")
+
+            result = process_file(
+                client=client,
+                file_path=file_path,
+                prompt=prompt,
+                model=model,
+                task=task,
+                format_output=format_output,
+                aspect_ratio=aspect_ratio,
+                verbose=verbose
+            )
+
+            results.append(result)
+
+            if verbose:
+                status = result.get('status', 'unknown')
+                print(f"  Status: {status}")
+
+    # Save results
+    if output_file:
+        save_results(results, output_file, format_output)
+
+    return results
+
+
+def save_results(results: List[Dict[str, Any]], output_file: str, format_output: str):
+    """Save results to file."""
+    output_path = Path(output_file)
+
+    # Special handling for image generation - if output has image extension, copy the generated image
+    image_extensions = {'.png', '.jpg', '.jpeg', '.webp', '.gif', '.bmp'}
+    if output_path.suffix.lower() in image_extensions and len(results) == 1:
+        generated_image = results[0].get('generated_image')
+        if generated_image:
+            # Copy the generated image to the specified output location
+            shutil.copy2(generated_image, output_path)
+            return
+        else:
+            # Don't write text reports to image files - save error as .txt instead
+            output_path = output_path.with_suffix('.error.txt')
+            print(f"Warning: Generation failed, saving error report to: {output_path}")
+
+    if format_output == 'json':
+        with open(output_path, 'w') as f:
+            json.dump(results, f, indent=2)
+    elif format_output == 'csv':
+        with open(output_path, 'w', newline='') as f:
+            fieldnames = ['file', 'status', 'response', 'error']
+            writer = csv.DictWriter(f, fieldnames=fieldnames)
+            writer.writeheader()
+            for result in results:
+                writer.writerow({
+                    'file': result.get('file', ''),
+                    'status': result.get('status', ''),
+                    'response': result.get('response', ''),
+                    'error': result.get('error', '')
+                })
+    else:  # markdown
+        with open(output_path, 'w') as f:
+            f.write("# Batch Processing Results\n\n")
+            for i, result in enumerate(results, 1):
+                f.write(f"## {i}. {result.get('file', 'Unknown')}\n\n")
+                f.write(f"**Status**: {result.get('status', 'unknown')}\n\n")
+                if result.get('response'):
+                    f.write(f"**Response**:\n\n{result['response']}\n\n")
+                if result.get('error'):
+                    f.write(f"**Error**: {result['error']}\n\n")
+
+
+def main():
+    parser = argparse.ArgumentParser(
+        description='Batch process media files with Gemini API',
+        formatter_class=argparse.RawDescriptionHelpFormatter,
+        epilog="""
+Examples:
+  # Transcribe multiple audio files
+  %(prog)s --files *.mp3 --task transcribe --model gemini-2.5-flash
+
+  # Analyze images
+  %(prog)s --files *.jpg --task analyze --prompt "Describe this image" \\
+    --model gemini-2.5-flash
+
+  # Process PDFs to JSON
+  %(prog)s --files *.pdf --task extract --prompt "Extract data as JSON" \\
+    --format json --output results.json
+
+  # Generate images
+  %(prog)s --task generate --prompt "A mountain landscape" \\
+    --model gemini-2.5-flash-image --aspect-ratio 16:9
+        """
+    )
+
+    parser.add_argument('--files', nargs='*', help='Input files to process')
+    parser.add_argument('--task', required=True,
+                       choices=['transcribe', 'analyze', 'extract', 'generate'],
+                       help='Task to perform')
+    parser.add_argument('--prompt', help='Prompt for analysis/generation')
+    parser.add_argument('--model', default='gemini-2.5-flash',
+                       help='Gemini model to use (default: gemini-2.5-flash)')
+    parser.add_argument('--format', dest='format_output', default='text',
+                       choices=['text', 'json', 'csv', 'markdown'],
+                       help='Output format (default: text)')
+    parser.add_argument('--aspect-ratio', choices=['1:1', '16:9', '9:16', '4:3', '3:4'],
+                       help='Aspect ratio for image generation')
+    parser.add_argument('--output', help='Output file for results')
+    parser.add_argument('--verbose', '-v', action='store_true',
+                       help='Verbose output')
+    parser.add_argument('--dry-run', action='store_true',
+                       help='Show what would be done without making API calls')
+
+    args = parser.parse_args()
+
+    # Validate arguments
+    if args.task != 'generate' and not args.files:
+        parser.error("--files required for non-generation tasks")
+
+    if args.task == 'generate' and not args.prompt:
+        parser.error("--prompt required for generation task")
+
+    if args.task != 'generate' and not args.prompt:
+        # Set default prompts
+        if args.task == 'transcribe':
+            args.prompt = 'Generate a transcript with timestamps'
+        elif args.task == 'analyze':
+            args.prompt = 'Analyze this content'
+        elif args.task == 'extract':
+            args.prompt = 'Extract key information'
+
+    # Process files
+    files = args.files or []
+    results = batch_process(
+        files=files,
+        prompt=args.prompt,
+        model=args.model,
+        task=args.task,
+        format_output=args.format_output,
+        aspect_ratio=args.aspect_ratio,
+        output_file=args.output,
+        verbose=args.verbose,
+        dry_run=args.dry_run
+    )
+
+    # Print summary
+    if not args.dry_run and results:
+        success = sum(1 for r in results if r.get('status') == 'success')
+        failed = len(results) - success
+        print(f"\n{'='*50}")
+        print(f"Processed: {len(results)} files")
+        print(f"Success: {success}")
+        print(f"Failed: {failed}")
+        if args.output:
+            print(f"Results saved to: {args.output}")
+
+
+# Run the command-line interface only when this file is executed as a
+# script, not when it is imported as a module.
+if __name__ == '__main__':
+    main()