gh-rafaelcalleja-claude-mar…/skills/ai-multimodal/scripts/document_converter.py

#!/usr/bin/env python3
"""
Convert documents to Markdown using Gemini API.

Supported document types:
- PDF documents (native vision processing)
- Images (JPEG, PNG, WEBP, HEIC)
- Office documents (DOCX, XLSX, PPTX)
- HTML, TXT, and other text formats

Features:
- Converts to clean markdown format
- Preserves structure, tables, and formatting
- Extracts text from images and scanned documents
- Batch conversion support
- Saves to docs/assets/document-extraction.md by default
"""

import argparse
import os
import sys
import time
from pathlib import Path
from typing import Optional, List, Dict, Any

try:
    from google import genai
    from google.genai import types
except ImportError:
    print("Error: google-genai package not installed")
    print("Install with: pip install google-genai")
    sys.exit(1)

try:
    from dotenv import load_dotenv
except ImportError:
    load_dotenv = None


def find_api_key() -> Optional[str]:
    """Locate the Gemini API key, honouring a fixed lookup order.

    Lookup order (the first non-empty value wins):
    1. ``GEMINI_API_KEY`` already present in the process environment
    2. ``.env`` in the skill directory (the parent of this script's directory)
    3. ``.env`` one level above the skill directory (shared skills config)
    4. ``.env`` two levels above the skill directory (Claude global config)

    The ``.env`` files are only consulted when ``load_dotenv`` was importable.
    Each file that exists is handed to ``load_dotenv`` and the variable is
    then re-read from the environment.

    Returns:
        The API key, or ``None`` when none of the sources provides one.
    """
    key_name = 'GEMINI_API_KEY'

    from_environment = os.getenv(key_name)
    if from_environment:
        return from_environment

    # Without python-dotenv there is nothing else to look at.
    if not load_dotenv:
        return None

    skill_dir = Path(__file__).parent.parent  # .claude/skills/ai-multimodal
    skills_dir = skill_dir.parent             # .claude/skills
    claude_dir = skills_dir.parent            # .claude

    # Most specific configuration first.
    for config_dir in (skill_dir, skills_dir, claude_dir):
        dotenv_path = config_dir / '.env'
        if not dotenv_path.exists():
            continue

        load_dotenv(dotenv_path)
        loaded_key = os.getenv(key_name)
        if loaded_key:
            return loaded_key

    return None


def find_project_root() -> Path:
    """Return the nearest directory that looks like the project root.

    Starting at this script's directory and walking up through its parents,
    the first directory containing a ``.git`` or ``.claude`` entry is
    returned. When no such directory exists, the script's own directory is
    returned instead.
    """
    here = Path(__file__).parent
    markers = ('.git', '.claude')

    candidates = (here, *here.parents)
    return next(
        (
            directory
            for directory in candidates
            if any((directory / marker).exists() for marker in markers)
        ),
        here,
    )


def get_mime_type(file_path: str) -> str:
    """Map a file's extension to a MIME type.

    The extension is compared case-insensitively. Extensions that are not in
    the table below yield ``application/octet-stream``.

    Args:
        file_path: Path or file name whose suffix is inspected.

    Returns:
        The MIME type string for the extension.
    """
    # Each MIME type is listed once with every extension that maps to it.
    extensions_by_mime = (
        # Documents
        ('application/pdf', ('.pdf',)),
        ('text/plain', ('.txt',)),
        ('text/html', ('.html', '.htm')),
        ('text/markdown', ('.md',)),
        ('text/csv', ('.csv',)),
        # Images
        ('image/jpeg', ('.jpg', '.jpeg')),
        ('image/png', ('.png',)),
        ('image/webp', ('.webp',)),
        ('image/heic', ('.heic',)),
        ('image/heif', ('.heif',)),
        # Office (need to be uploaded as binary)
        ('application/vnd.openxmlformats-officedocument.wordprocessingml.document', ('.docx',)),
        ('application/vnd.openxmlformats-officedocument.spreadsheetml.sheet', ('.xlsx',)),
        ('application/vnd.openxmlformats-officedocument.presentationml.presentation', ('.pptx',)),
    )
    lookup = {
        extension: mime
        for mime, extensions in extensions_by_mime
        for extension in extensions
    }

    suffix = Path(file_path).suffix.lower()
    try:
        return lookup[suffix]
    except KeyError:
        return 'application/octet-stream'


def upload_file(client: genai.Client, file_path: str, verbose: bool = False) -> Any:
    """Upload a file via ``client.files`` and wait for it to finish processing.

    After the upload, the file is re-fetched every 2 seconds for as long as
    its state name is ``PROCESSING``, up to 300 seconds of accumulated sleep
    time.

    Args:
        client: Gemini client whose ``files`` API is used.
        file_path: Path of the local file to upload.
        verbose: When true, print upload and progress messages.

    Returns:
        The file object returned by the client once it has left the
        ``PROCESSING`` state.

    Raises:
        ValueError: The file ended in the ``FAILED`` state.
        TimeoutError: The file was still ``PROCESSING`` when the wait budget
            ran out.
    """
    poll_interval = 2  # seconds between status checks
    max_wait = 300     # 5 minutes

    if verbose:
        print(f"Uploading {file_path}...")

    remote = client.files.upload(file=file_path)

    waited = 0
    while True:
        if remote.state.name != 'PROCESSING' or waited >= max_wait:
            break

        time.sleep(poll_interval)
        remote = client.files.get(name=remote.name)
        waited += poll_interval

        if verbose and waited % 10 == 0:
            print(f"  Processing... {waited}s")

    if remote.state.name == 'FAILED':
        raise ValueError(f"File processing failed: {file_path}")

    if remote.state.name == 'PROCESSING':
        raise TimeoutError(f"Processing timeout after {max_wait}s: {file_path}")

    if verbose:
        print(f"  Uploaded: {remote.name}")

    return remote


def convert_to_markdown(
    client: genai.Client,
    file_path: str,
    model: str = 'gemini-2.5-flash',
    custom_prompt: Optional[str] = None,
    verbose: bool = False,
    max_retries: int = 3
) -> Dict[str, Any]:
    """Convert a single document to Markdown with a Gemini model.

    Files larger than 20 MB are sent through ``upload_file``; smaller files
    are read once and sent inline as bytes, tagged with the MIME type from
    ``get_mime_type``. Failures of the remote calls are retried with an
    exponential back-off (1s, 2s, 4s, ...).

    Args:
        client: Gemini client used for the upload and generation calls.
        file_path: Path of the document to convert.
        model: Name of the Gemini model to call.
        custom_prompt: Prompt to use instead of the built-in conversion
            prompt; an empty value selects the built-in prompt.
        verbose: When true, print upload and retry progress.
        max_retries: Maximum number of attempts. Values below 1 are treated
            as 1 so the function always returns a result dict.

    Returns:
        A dict with the keys ``file`` and ``status`` (``'success'`` or
        ``'error'``) and ``markdown`` (the generated text, or ``None`` on
        error). Error results additionally carry an ``error`` message.
        This function reports failures through the dict rather than raising.
    """
    source = str(file_path)

    def _failure(message: str) -> Dict[str, Any]:
        # Single place that defines the shape of an error result.
        return {
            'file': source,
            'status': 'error',
            'error': message,
            'markdown': None
        }

    default_prompt = """Convert this document to clean, well-formatted Markdown.

Requirements:
- Preserve all content, structure, and formatting
- Convert tables to markdown table format
- Maintain heading hierarchy (# ## ### etc)
- Preserve lists, code blocks, and quotes
- Extract text from images if present
- Keep formatting consistent and readable

Output only the markdown content without any preamble or explanation."""
    prompt = custom_prompt or default_prompt

    # A missing or unreadable input will not fix itself, so report it right
    # away instead of sleeping through the retry back-off.
    try:
        use_file_api = Path(file_path).stat().st_size > 20 * 1024 * 1024  # >20MB
        file_bytes = None
        if not use_file_api:
            with open(file_path, 'rb') as f:
                file_bytes = f.read()
    except OSError as e:
        return _failure(str(e))

    # Always make at least one attempt; otherwise the loop below would be
    # skipped and the function would fall through returning None.
    attempts = max(1, max_retries)

    # Built on first use and kept across retries, so a large file that was
    # uploaded successfully is not uploaded again when only generation fails.
    document_part = None

    for attempt in range(attempts):
        try:
            if document_part is None:
                if use_file_api:
                    document_part = upload_file(client, source, verbose)
                else:
                    document_part = types.Part.from_bytes(
                        data=file_bytes,
                        mime_type=get_mime_type(source)
                    )

            response = client.models.generate_content(
                model=model,
                contents=[prompt, document_part]
            )

            markdown_content = response.text if hasattr(response, 'text') else ''

            return {
                'file': source,
                'status': 'success',
                'markdown': markdown_content
            }

        except Exception as e:
            # Deliberately broad: this is the boundary to the remote service
            # and any failure is reported to the caller via the result dict.
            if attempt == attempts - 1:
                return _failure(str(e))

            wait_time = 2 ** attempt
            if verbose:
                print(f"  Retry {attempt + 1} after {wait_time}s: {e}")
            time.sleep(wait_time)

    # Not reachable (the final attempt always returns); kept so every path
    # visibly yields a dict.
    return _failure('Conversion did not produce a result')


def batch_convert(
    files: List[str],
    output_file: Optional[str] = None,
    auto_name: bool = False,
    model: str = 'gemini-2.5-flash',
    custom_prompt: Optional[str] = None,
    verbose: bool = False
) -> List[Dict[str, Any]]:
    """Convert several files and write one combined Markdown report.

    Args:
        files: Paths of the documents to convert, processed in order.
        output_file: Destination of the combined report. When empty, the
            report goes to ``docs/assets`` under the directory returned by
            ``find_project_root``.
        auto_name: With exactly one input and no ``output_file``, name the
            report ``<input stem>-extraction.md`` instead of
            ``document-extraction.md``.
        model: Gemini model name forwarded to ``convert_to_markdown``.
        custom_prompt: Optional prompt forwarded to ``convert_to_markdown``.
        verbose: When true, print per-file progress.

    Returns:
        The result dicts from ``convert_to_markdown``, one per input file.

    Exits the process with status 1 when no API key can be found. The
    summary with the output location is always printed.
    """
    api_key = find_api_key()
    if not api_key:
        for line in (
            "Error: GEMINI_API_KEY not found",
            "Set via: export GEMINI_API_KEY='your-key'",
            "Or create .env file with: GEMINI_API_KEY=your-key",
        ):
            print(line)
        sys.exit(1)

    client = genai.Client(api_key=api_key)

    # Work out where the report goes and make sure its directory exists
    # before any conversion starts.
    if not output_file:
        assets_dir = find_project_root() / 'docs' / 'assets'
        if auto_name and len(files) == 1:
            report_name = f"{Path(files[0]).stem}-extraction.md"
        else:
            report_name = 'document-extraction.md'
        output_file = str(assets_dir / report_name)

    output_path = Path(output_file)
    output_path.parent.mkdir(parents=True, exist_ok=True)

    total = len(files)
    results: List[Dict[str, Any]] = []
    for position, source in enumerate(files, start=1):
        if verbose:
            print(f"\n[{position}/{total}] Converting: {source}")

        outcome = convert_to_markdown(
            client=client,
            file_path=source,
            model=model,
            custom_prompt=custom_prompt,
            verbose=verbose
        )
        results.append(outcome)

        if verbose:
            print(f"  Status: {outcome.get('status', 'unknown')}")

    # One "##" section per input, separated by horizontal rules.
    separator = "---\n\n"
    with open(output_path, 'w', encoding='utf-8') as report:
        report.write("# Document Extraction Results\n\n")
        report.write(f"Converted {total} document(s) to markdown.\n\n")
        report.write(separator)

        for entry in results:
            report.write(f"## {Path(entry['file']).name}\n\n")

            if entry['status'] != 'success':
                body = f"**Error**: {entry.get('error', 'Unknown error')}\n\n"
            elif entry.get('markdown'):
                body = entry['markdown'] + "\n\n"
            else:
                body = "**Note**: Conversion succeeded but no content was returned.\n\n"

            report.write(body)
            report.write(separator)

    # The summary is shown regardless of the verbose flag.
    succeeded = sum(entry['status'] == 'success' for entry in results)
    failed = sum(entry['status'] == 'error' for entry in results)
    print("\n" + "=" * 50)
    print(f"Converted: {len(results)} file(s)")
    print(f"Success: {succeeded}")
    print(f"Failed: {failed}")
    print(f"Output saved to: {output_path}")

    return results


def main():
    """Command-line entry point: parse arguments and run the batch conversion.

    Each ``--input`` value is taken as a file path when it names an existing
    file and is otherwise expanded as a glob pattern. Glob matches are added
    in sorted order so the report layout is reproducible. Inputs that match
    no file produce a warning on stderr instead of being dropped silently.

    Exits with status 1 when no input resolves to an existing file.
    """
    import glob

    parser = argparse.ArgumentParser(
        description='Convert documents to Markdown using Gemini API',
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Examples:
  # Convert single PDF to markdown (default name)
  %(prog)s --input document.pdf

  # Auto-generate meaningful filename
  %(prog)s --input testpdf.pdf --auto-name
  # Output: docs/assets/testpdf-extraction.md

  # Convert multiple files
  %(prog)s --input doc1.pdf doc2.docx image.png

  # Specify custom output location
  %(prog)s --input document.pdf --output ./output.md

  # Use custom prompt
  %(prog)s --input document.pdf --prompt "Extract only the tables as markdown"

  # Batch convert directory
  %(prog)s --input ./documents/*.pdf --verbose

Supported formats:
  - PDF documents (up to 1,000 pages)
  - Images (JPEG, PNG, WEBP, HEIC)
  - Office documents (DOCX, XLSX, PPTX)
  - Text formats (TXT, HTML, Markdown, CSV)

Default output: <project-root>/docs/assets/document-extraction.md
        """
    )

    parser.add_argument('--input', '-i', nargs='+', required=True,
                        help='Input file(s) to convert')
    parser.add_argument('--output', '-o',
                        help='Output markdown file (default: docs/assets/document-extraction.md)')
    parser.add_argument('--auto-name', '-a', action='store_true',
                        help='Auto-generate meaningful output filename from input (e.g., document.pdf -> document-extraction.md)')
    parser.add_argument('--model', default='gemini-2.5-flash',
                        help='Gemini model to use (default: gemini-2.5-flash)')
    parser.add_argument('--prompt', '-p',
                        help='Custom prompt for conversion')
    parser.add_argument('--verbose', '-v', action='store_true',
                        help='Verbose output')

    args = parser.parse_args()

    # Resolve every --input value to concrete files.
    files = []
    unmatched = []
    for file_pattern in args.input:
        file_path = Path(file_pattern)
        if file_path.is_file():
            files.append(str(file_path))
            continue

        # Not a plain file: treat the value as a glob pattern. glob.glob
        # gives no ordering guarantee, so sort for a stable result.
        matched = sorted(f for f in glob.glob(file_pattern) if Path(f).is_file())
        if matched:
            files.extend(matched)
        else:
            unmatched.append(file_pattern)

    # Tell the user about inputs that were skipped; otherwise a typo in one
    # of several paths would go unnoticed.
    for file_pattern in unmatched:
        print(f"Warning: no file matches input '{file_pattern}', skipping",
              file=sys.stderr)

    if not files:
        print("Error: No valid input files found")
        sys.exit(1)

    # Convert files
    batch_convert(
        files=files,
        output_file=args.output,
        auto_name=args.auto_name,
        model=args.model,
        custom_prompt=args.prompt,
        verbose=args.verbose
    )


# Run the command-line interface only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == '__main__':
    main()