#!/usr/bin/env python3
"""
Convert documents to Markdown with AI-enhanced image descriptions.

This script demonstrates how to use MarkItDown with OpenRouter to generate
detailed descriptions of images in documents (PowerPoint, PDFs with images, etc.)
"""

import argparse
import os
import sys
from pathlib import Path
from typing import Optional

from markitdown import MarkItDown
from openai import OpenAI


# Predefined prompts for different use cases.
# Keys double as the valid values of the --prompt-type CLI option.
PROMPTS = {
    'scientific': """
Analyze this scientific image or diagram. Provide:
1. Type of visualization (graph, chart, microscopy, diagram, etc.)
2. Key data points, trends, or patterns
3. Axes labels, legends, and scales
4. Notable features or findings
5. Scientific context and significance
Be precise, technical, and detailed.
    """.strip(),

    'presentation': """
Describe this presentation slide image. Include:
1. Main visual elements and their arrangement
2. Key points or messages conveyed
3. Data or information presented
4. Visual hierarchy and emphasis
Keep the description clear and informative.
    """.strip(),

    'general': """
Describe this image in detail. Include:
1. Main subjects and objects
2. Visual composition and layout
3. Text content (if any)
4. Notable details
5. Overall context and purpose
Be comprehensive and accurate.
    """.strip(),

    'data_viz': """
Analyze this data visualization. Provide:
1. Type of chart/graph (bar, line, scatter, pie, etc.)
2. Variables and axes
3. Data ranges and scales
4. Key patterns, trends, or outliers
5. Statistical insights
Focus on quantitative accuracy.
    """.strip(),

    'medical': """
Describe this medical image. Include:
1. Type of medical imaging (X-ray, MRI, CT, microscopy, etc.)
2. Anatomical structures visible
3. Notable findings or abnormalities
4. Image quality and contrast
5. Clinical relevance
Be professional and precise.
    """.strip()
}


def convert_with_ai(
    input_file: Path,
    output_file: Path,
    api_key: str,
    model: str = "anthropic/claude-sonnet-4.5",
    prompt_type: str = "general",
    custom_prompt: Optional[str] = None
) -> bool:
    """
    Convert a file to Markdown with AI image descriptions.

    The Markdown written to ``output_file`` starts with a small metadata
    header (title, source name, format, model, prompt type) followed by the
    converted document text. Parent directories of ``output_file`` are
    created if they do not exist.

    Args:
        input_file: Path to input file
        output_file: Path to output Markdown file
        api_key: OpenRouter API key
        model: Model name (default: anthropic/claude-sonnet-4.5)
        prompt_type: Key into PROMPTS; unknown keys fall back to 'general'
        custom_prompt: Custom prompt (overrides prompt_type when non-empty)

    Returns:
        True if successful, False otherwise. Failures are reported on
        stderr rather than raised.
    """
    # A non-empty custom prompt wins; otherwise use the predefined one.
    if custom_prompt:
        prompt = custom_prompt
        prompt_label = 'custom'
    else:
        prompt = PROMPTS.get(prompt_type, PROMPTS['general'])
        prompt_label = prompt_type

    try:
        # Initialize OpenRouter client (OpenAI-compatible)
        client = OpenAI(
            api_key=api_key,
            base_url="https://openrouter.ai/api/v1"
        )

        print(f"Using model: {model}")
        print(f"Prompt type: {prompt_label}")
        print(f"Converting: {input_file}")

        # Create MarkItDown with AI support
        md = MarkItDown(
            llm_client=client,
            llm_model=model,
            llm_prompt=prompt
        )

        # Convert file
        result = md.convert(str(input_file))

        # Create output with metadata
        content = "".join([
            f"# {result.title or input_file.stem}\n\n",
            f"**Source**: {input_file.name}\n",
            f"**Format**: {input_file.suffix}\n",
            f"**AI Model**: {model}\n",
            f"**Prompt Type**: {prompt_label}\n\n",
            "---\n\n",
            result.text_content,
        ])

        # Write output
        output_file.parent.mkdir(parents=True, exist_ok=True)
        output_file.write_text(content, encoding='utf-8')

        print(f"✓ Successfully converted to: {output_file}")
        return True

    except Exception as e:
        # Top-level boundary for the conversion: any client, conversion or
        # filesystem failure is reported and turned into a False result so
        # the CLI can map it to a non-zero exit status.
        print(f"✗ Error: {e}", file=sys.stderr)
        return False


class _ListPromptsAction(argparse.Action):
    """
    Print all predefined prompts and exit as soon as the flag is parsed.

    Implemented as an argparse action (like the built-in --help) so that
    --list-prompts works without supplying the required positional
    arguments.
    """

    def __init__(self, option_strings, dest=argparse.SUPPRESS,
                 default=argparse.SUPPRESS, help=None):
        super().__init__(
            option_strings=option_strings,
            dest=dest,
            default=default,
            nargs=0,
            help=help
        )

    def __call__(self, parser, namespace, values, option_string=None):
        print("Available prompt types:\n")
        for name, prompt in PROMPTS.items():
            print(f"[{name}]")
            print(prompt)
            print("\n" + "="*60 + "\n")
        parser.exit(0)


def main():
    """
    Command-line entry point.

    Parses arguments, resolves the API key (--api-key takes precedence over
    the OPENROUTER_API_KEY environment variable), validates the input path
    and runs the conversion. Exits with status 0 on success and 1 on
    failure; error messages are written to stderr.
    """
    parser = argparse.ArgumentParser(
        description="Convert documents to Markdown with AI-enhanced image descriptions",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Available prompt types:
  scientific    - For scientific diagrams, graphs, and charts
  presentation  - For presentation slides
  general       - General-purpose image description
  data_viz      - For data visualizations and charts
  medical       - For medical imaging

Examples:
  # Convert a scientific paper
  python convert_with_ai.py paper.pdf output.md --prompt-type scientific

  # Convert a presentation with custom model
  python convert_with_ai.py slides.pptx slides.md --model anthropic/claude-sonnet-4.5 --prompt-type presentation

  # Use custom prompt with Claude Sonnet 4.5
  python convert_with_ai.py diagram.png diagram.md --model anthropic/claude-sonnet-4.5 --custom-prompt "Describe this technical diagram"

  # Set API key via environment variable
  export OPENROUTER_API_KEY="sk-or-v1-..."
  python convert_with_ai.py image.jpg image.md

Environment Variables:
  OPENROUTER_API_KEY    OpenRouter API key (required if not passed via --api-key)

Popular Models (use with --model):
  anthropic/claude-sonnet-4.5  - Claude Sonnet 4.5 (recommended, vision support)
  anthropic/claude-3.5-sonnet  - Claude 3.5 Sonnet (vision support)
  openai/gpt-4o                - GPT-4 Omni (vision support)
  openai/gpt-4-vision          - GPT-4 Vision
  google/gemini-pro-vision     - Gemini Pro Vision
        """
    )

    parser.add_argument('input', type=Path, help='Input file')
    parser.add_argument('output', type=Path, help='Output Markdown file')
    parser.add_argument(
        '--api-key', '-k',
        help='OpenRouter API key (or set OPENROUTER_API_KEY env var)'
    )
    parser.add_argument(
        '--model', '-m',
        default='anthropic/claude-sonnet-4.5',
        help='Model to use via OpenRouter (default: anthropic/claude-sonnet-4.5)'
    )
    parser.add_argument(
        '--prompt-type', '-t',
        choices=list(PROMPTS.keys()),
        default='general',
        help='Type of prompt to use (default: general)'
    )
    parser.add_argument(
        '--custom-prompt', '-p',
        help='Custom prompt (overrides --prompt-type)'
    )
    # Handled during parsing so it does not require input/output.
    parser.add_argument(
        '--list-prompts', '-l',
        action=_ListPromptsAction,
        help='List available prompt types and exit'
    )

    args = parser.parse_args()

    # Get API key
    api_key = args.api_key or os.environ.get('OPENROUTER_API_KEY')
    if not api_key:
        print(
            "Error: OpenRouter API key required. Set OPENROUTER_API_KEY environment variable or use --api-key",
            file=sys.stderr
        )
        print("Get your API key at: https://openrouter.ai/keys", file=sys.stderr)
        sys.exit(1)

    # Validate input file
    if not args.input.exists():
        print(f"Error: Input file '{args.input}' does not exist", file=sys.stderr)
        sys.exit(1)
    if args.input.is_dir():
        # Fail early with a clear message instead of deep inside conversion.
        print(f"Error: Input path '{args.input}' is a directory, not a file", file=sys.stderr)
        sys.exit(1)

    # Convert file
    success = convert_with_ai(
        input_file=args.input,
        output_file=args.output,
        api_key=api_key,
        model=args.model,
        prompt_type=args.prompt_type,
        custom_prompt=args.custom_prompt
    )

    sys.exit(0 if success else 1)


if __name__ == '__main__':
    main()