#!/usr/bin/env python3
"""
Convert documents to Markdown using Gemini API.

Supports all document types:
- PDF documents (native vision processing)
- Images (JPEG, PNG, WEBP, HEIC)
- Office documents (DOCX, XLSX, PPTX)
- HTML, TXT, and other text formats

Features:
- Converts to clean markdown format
- Preserves structure, tables, and formatting
- Extracts text from images and scanned documents
- Batch conversion support
- Saves to docs/assets/document-extraction.md by default
"""

import argparse
import glob
import os
import sys
import time
from pathlib import Path
from typing import Optional, List, Dict, Any

try:
    from google import genai
    from google.genai import types
except ImportError:
    # Do not exit at import time: the helpers below that do not touch the SDK
    # stay importable. The entry points call _require_genai(), which prints
    # the install hint and exits before any work is done.
    genai = None
    types = None

try:
    from dotenv import load_dotenv
except ImportError:
    load_dotenv = None


# Files larger than this are sent through the File API instead of inline.
_INLINE_SIZE_LIMIT = 20 * 1024 * 1024  # 20MB

# Upload polling: give up after _UPLOAD_MAX_WAIT seconds, checking every
# _UPLOAD_POLL_INTERVAL seconds.
_UPLOAD_MAX_WAIT = 300  # 5 minutes
_UPLOAD_POLL_INTERVAL = 2

# Extension -> MIME type lookup used for inline uploads.
_MIME_TYPES = {
    # Documents
    '.pdf': 'application/pdf',
    '.txt': 'text/plain',
    '.html': 'text/html',
    '.htm': 'text/html',
    '.md': 'text/markdown',
    '.csv': 'text/csv',
    # Images
    '.jpg': 'image/jpeg',
    '.jpeg': 'image/jpeg',
    '.png': 'image/png',
    '.webp': 'image/webp',
    '.heic': 'image/heic',
    '.heif': 'image/heif',
    # Office (need to be uploaded as binary)
    '.docx': 'application/vnd.openxmlformats-officedocument.wordprocessingml.document',
    '.xlsx': 'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet',
    '.pptx': 'application/vnd.openxmlformats-officedocument.presentationml.presentation',
}

# Prompt used when the caller does not supply one.
_DEFAULT_PROMPT = """Convert this document to clean, well-formatted Markdown.

Requirements:
- Preserve all content, structure, and formatting
- Convert tables to markdown table format
- Maintain heading hierarchy (# ## ### etc)
- Preserve lists, code blocks, and quotes
- Extract text from images if present
- Keep formatting consistent and readable

Output only the markdown content without any preamble or explanation."""


def _require_genai() -> None:
    """Exit with an install hint when the google-genai package is missing."""
    if genai is None:
        print("Error: google-genai package not installed")
        print("Install with: pip install google-genai")
        sys.exit(1)


def find_api_key() -> Optional[str]:
    """Find Gemini API key using correct priority order.

    Priority order (highest to lowest):
    1. os.environ (runtime environment variables)
    2. .claude/skills/ai-multimodal/.env (skill-specific config)
    3. .claude/skills/.env (shared skills config)
    4. .claude/.env (Claude global config)

    The .env files are only consulted when python-dotenv is installed.

    Returns:
        The value of GEMINI_API_KEY, or None when it cannot be found.
    """
    # Priority 1: Already in the process environment (highest)
    api_key = os.getenv('GEMINI_API_KEY')
    if api_key:
        return api_key

    if not load_dotenv:
        return None

    # Directories are derived from this script's location.
    script_dir = Path(__file__).parent
    skill_dir = script_dir.parent    # .claude/skills/ai-multimodal
    skills_dir = skill_dir.parent    # .claude/skills
    claude_dir = skills_dir.parent   # .claude

    # Priorities 2-4, checked in order; the first file that yields a key wins.
    for config_dir in (skill_dir, skills_dir, claude_dir):
        env_file = config_dir / '.env'
        if not env_file.exists():
            continue
        load_dotenv(env_file)
        api_key = os.getenv('GEMINI_API_KEY')
        if api_key:
            return api_key

    return None


def find_project_root() -> Path:
    """Find project root directory.

    Walks upward from this script's directory and returns the first directory
    containing a `.git` or `.claude` entry. Falls back to the script's own
    directory when none is found.
    """
    script_dir = Path(__file__).parent

    for candidate in [script_dir] + list(script_dir.parents):
        if (candidate / '.git').exists() or (candidate / '.claude').exists():
            return candidate

    return script_dir


def get_mime_type(file_path: str) -> str:
    """Determine MIME type from file extension.

    The extension match is case-insensitive. Unknown or missing extensions
    map to 'application/octet-stream'.
    """
    ext = Path(file_path).suffix.lower()
    return _MIME_TYPES.get(ext, 'application/octet-stream')


def upload_file(client: "genai.Client", file_path: str, verbose: bool = False) -> Any:
    """Upload file to Gemini File API and wait until it leaves PROCESSING.

    Args:
        client: Gemini client used for the upload and status polling.
        file_path: Path of the file to upload.
        verbose: Print progress messages when True.

    Returns:
        The file object returned by the File API.

    Raises:
        ValueError: The File API reported state FAILED.
        TimeoutError: The file was still PROCESSING after the wait limit.
    """
    if verbose:
        print(f"Uploading {file_path}...")

    myfile = client.files.upload(file=file_path)

    # Poll until processing finishes or the wait budget is used up. Elapsed
    # time is counted in sleep intervals, not wall-clock time.
    elapsed = 0
    while myfile.state.name == 'PROCESSING' and elapsed < _UPLOAD_MAX_WAIT:
        time.sleep(_UPLOAD_POLL_INTERVAL)
        myfile = client.files.get(name=myfile.name)
        elapsed += _UPLOAD_POLL_INTERVAL
        if verbose and elapsed % 10 == 0:
            print(f" Processing... {elapsed}s")

    if myfile.state.name == 'FAILED':
        raise ValueError(f"File processing failed: {file_path}")

    if myfile.state.name == 'PROCESSING':
        raise TimeoutError(f"Processing timeout after {_UPLOAD_MAX_WAIT}s: {file_path}")

    if verbose:
        print(f" Uploaded: {myfile.name}")

    return myfile


def _error_result(file_path: str, error: Exception) -> Dict[str, Any]:
    """Build the result dict reported for a failed conversion."""
    return {
        'file': str(file_path),
        'status': 'error',
        'error': str(error),
        'markdown': None
    }


def convert_to_markdown(
    client: "genai.Client",
    file_path: str,
    model: str = 'gemini-2.5-flash',
    custom_prompt: Optional[str] = None,
    verbose: bool = False,
    max_retries: int = 3
) -> Dict[str, Any]:
    """Convert a document to markdown using Gemini.

    Files above 20MB are uploaded through the File API; smaller files are
    sent inline with a MIME type derived from the extension.

    Args:
        client: Gemini client used for upload and generation.
        file_path: Path of the document to convert.
        model: Gemini model name.
        custom_prompt: Prompt to use instead of the default conversion prompt.
        verbose: Print retry messages when True.
        max_retries: Maximum number of attempts; values below 1 are treated
            as a single attempt.

    Returns:
        A dict with keys 'file', 'status' ('success' or 'error') and
        'markdown'; error results also carry an 'error' message. The function
        never raises for conversion failures.
    """
    prompt = custom_prompt if custom_prompt else _DEFAULT_PROMPT
    path = Path(file_path)

    # Local file problems are deterministic, so report them right away
    # instead of sleeping through the retry backoff.
    try:
        use_file_api = path.stat().st_size > _INLINE_SIZE_LIMIT
        file_bytes = None if use_file_api else path.read_bytes()
    except OSError as e:
        return _error_result(file_path, e)

    # Always make at least one attempt so a result dict is always returned.
    attempts = max(1, max_retries)
    content = None

    for attempt in range(attempts):
        try:
            # Build the request payload once; after a successful upload or
            # read, later retries reuse it and only repeat the generation.
            if content is None:
                if use_file_api:
                    content = [prompt, upload_file(client, str(file_path), verbose)]
                else:
                    content = [
                        prompt,
                        types.Part.from_bytes(
                            data=file_bytes,
                            mime_type=get_mime_type(str(file_path))
                        )
                    ]

            response = client.models.generate_content(
                model=model,
                contents=content
            )

            # Normalise a missing or None text to an empty string.
            markdown_content = getattr(response, 'text', None) or ''

            return {
                'file': str(file_path),
                'status': 'success',
                'markdown': markdown_content
            }

        except Exception as e:
            # Boundary handler: any SDK or network failure is retried, then
            # reported through the result dict on the last attempt.
            if attempt == attempts - 1:
                return _error_result(file_path, e)

            wait_time = 2 ** attempt  # exponential backoff: 1s, 2s, 4s, ...
            if verbose:
                print(f" Retry {attempt + 1} after {wait_time}s: {e}")
            time.sleep(wait_time)


def _resolve_output_path(
    files: List[str],
    output_file: Optional[str],
    auto_name: bool
) -> Path:
    """Return the output path, defaulting to <project root>/docs/assets."""
    if output_file:
        return Path(output_file)

    output_dir = find_project_root() / 'docs' / 'assets'
    if auto_name and len(files) == 1:
        # Auto-generate a meaningful filename from the single input.
        return output_dir / f"{Path(files[0]).stem}-extraction.md"
    return output_dir / 'document-extraction.md'


def _write_combined_markdown(output_path: Path, results: List[Dict[str, Any]]) -> None:
    """Write all conversion results into one markdown report."""
    with open(output_path, 'w', encoding='utf-8') as f:
        f.write("# Document Extraction Results\n\n")
        f.write(f"Converted {len(results)} document(s) to markdown.\n\n")
        f.write("---\n\n")

        for result in results:
            f.write(f"## {Path(result['file']).name}\n\n")

            if result['status'] == 'success' and result.get('markdown'):
                f.write(result['markdown'])
                f.write("\n\n")
            elif result['status'] == 'success':
                f.write("**Note**: Conversion succeeded but no content was returned.\n\n")
            else:
                f.write(f"**Error**: {result.get('error', 'Unknown error')}\n\n")

            f.write("---\n\n")


def batch_convert(
    files: List[str],
    output_file: Optional[str] = None,
    auto_name: bool = False,
    model: str = 'gemini-2.5-flash',
    custom_prompt: Optional[str] = None,
    verbose: bool = False
) -> List[Dict[str, Any]]:
    """Batch convert multiple files to markdown.

    Converts every file in order, writes one combined markdown report and
    prints a summary. Exits the process with status 1 when the google-genai
    package or the GEMINI_API_KEY is missing.

    Args:
        files: Paths of the documents to convert.
        output_file: Report path; defaults to a file under
            <project root>/docs/assets.
        auto_name: With a single input and no output_file, name the report
            '<input stem>-extraction.md'.
        model: Gemini model name.
        custom_prompt: Prompt to use instead of the default conversion prompt.
        verbose: Print per-file progress when True.

    Returns:
        One result dict per input file, as produced by convert_to_markdown.
    """
    _require_genai()

    api_key = find_api_key()
    if not api_key:
        print("Error: GEMINI_API_KEY not found")
        print("Set via: export GEMINI_API_KEY='your-key'")
        print("Or create .env file with: GEMINI_API_KEY=your-key")
        sys.exit(1)

    client = genai.Client(api_key=api_key)

    output_path = _resolve_output_path(files, output_file, auto_name)
    output_path.parent.mkdir(parents=True, exist_ok=True)

    results = []
    for i, file_path in enumerate(files, 1):
        if verbose:
            print(f"\n[{i}/{len(files)}] Converting: {file_path}")

        result = convert_to_markdown(
            client=client,
            file_path=file_path,
            model=model,
            custom_prompt=custom_prompt,
            verbose=verbose
        )
        results.append(result)

        if verbose:
            status = result.get('status', 'unknown')
            print(f" Status: {status}")

    _write_combined_markdown(output_path, results)

    # The summary is always shown so the user knows where the output went.
    print(f"\n{'='*50}")
    print(f"Converted: {len(results)} file(s)")
    print(f"Success: {sum(1 for r in results if r['status'] == 'success')}")
    print(f"Failed: {sum(1 for r in results if r['status'] == 'error')}")
    print(f"Output saved to: {output_path}")

    return results


def _expand_inputs(patterns: List[str]) -> List[str]:
    """Resolve CLI inputs (literal paths or glob patterns) to existing files."""
    files = []
    for pattern in patterns:
        path = Path(pattern)
        if path.is_file():
            files.append(str(path))
            continue

        # Not a literal file: treat it as a glob pattern. Sorting keeps the
        # processing order stable across runs.
        matched = sorted(f for f in glob.glob(pattern) if Path(f).is_file())
        if not matched:
            print(f"Warning: no files matched '{pattern}'", file=sys.stderr)
        files.extend(matched)
    return files


def main():
    """Parse command-line arguments and run the batch conversion."""
    _require_genai()

    parser = argparse.ArgumentParser(
        description='Convert documents to Markdown using Gemini API',
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Examples:
  # Convert single PDF to markdown (default name)
  %(prog)s --input document.pdf

  # Auto-generate meaningful filename
  %(prog)s --input testpdf.pdf --auto-name
  # Output: docs/assets/testpdf-extraction.md

  # Convert multiple files
  %(prog)s --input doc1.pdf doc2.docx image.png

  # Specify custom output location
  %(prog)s --input document.pdf --output ./output.md

  # Use custom prompt
  %(prog)s --input document.pdf --prompt "Extract only the tables as markdown"

  # Batch convert directory
  %(prog)s --input ./documents/*.pdf --verbose

Supported formats:
  - PDF documents (up to 1,000 pages)
  - Images (JPEG, PNG, WEBP, HEIC)
  - Office documents (DOCX, XLSX, PPTX)
  - Text formats (TXT, HTML, Markdown, CSV)

Default output: <project-root>/docs/assets/document-extraction.md
"""
    )

    parser.add_argument('--input', '-i', nargs='+', required=True,
                        help='Input file(s) to convert')
    parser.add_argument('--output', '-o',
                        help='Output markdown file (default: docs/assets/document-extraction.md)')
    parser.add_argument('--auto-name', '-a', action='store_true',
                        help='Auto-generate meaningful output filename from input (e.g., document.pdf -> document-extraction.md)')
    parser.add_argument('--model', default='gemini-2.5-flash',
                        help='Gemini model to use (default: gemini-2.5-flash)')
    parser.add_argument('--prompt', '-p',
                        help='Custom prompt for conversion')
    parser.add_argument('--verbose', '-v', action='store_true',
                        help='Verbose output')

    args = parser.parse_args()

    # Validate input files
    files = _expand_inputs(args.input)
    if not files:
        print("Error: No valid input files found")
        sys.exit(1)

    # Convert files
    batch_convert(
        files=files,
        output_file=args.output,
        auto_name=args.auto_name,
        model=args.model,
        custom_prompt=args.prompt,
        verbose=args.verbose
    )


if __name__ == '__main__':
    main()