Initial commit

2025-11-30 08:30:14 +08:00
commit 1dd5bee3b4
335 changed files with 147360 additions and 0 deletions
--- a/skills/markitdown/scripts/convert_with_ai.py
+++ b/skills/markitdown/scripts/convert_with_ai.py
@@ -0,0 +1,243 @@
+#!/usr/bin/env python3
+"""
+Convert documents to Markdown with AI-enhanced image descriptions.
+
+This script demonstrates how to use MarkItDown with OpenRouter to generate
+detailed descriptions of images in documents (PowerPoint, PDFs with images, etc.)
+"""
+
+import argparse
+import os
+import sys
+from pathlib import Path
+from markitdown import MarkItDown
+from openai import OpenAI
+
+
+# Predefined prompts for different use cases
+PROMPTS = {
+    'scientific': """
+Analyze this scientific image or diagram. Provide:
+1. Type of visualization (graph, chart, microscopy, diagram, etc.)
+2. Key data points, trends, or patterns
+3. Axes labels, legends, and scales
+4. Notable features or findings
+5. Scientific context and significance
+Be precise, technical, and detailed.
+    """.strip(),
+    
+    'presentation': """
+Describe this presentation slide image. Include:
+1. Main visual elements and their arrangement
+2. Key points or messages conveyed
+3. Data or information presented
+4. Visual hierarchy and emphasis
+Keep the description clear and informative.
+    """.strip(),
+    
+    'general': """
+Describe this image in detail. Include:
+1. Main subjects and objects
+2. Visual composition and layout
+3. Text content (if any)
+4. Notable details
+5. Overall context and purpose
+Be comprehensive and accurate.
+    """.strip(),
+    
+    'data_viz': """
+Analyze this data visualization. Provide:
+1. Type of chart/graph (bar, line, scatter, pie, etc.)
+2. Variables and axes
+3. Data ranges and scales
+4. Key patterns, trends, or outliers
+5. Statistical insights
+Focus on quantitative accuracy.
+    """.strip(),
+    
+    'medical': """
+Describe this medical image. Include:
+1. Type of medical imaging (X-ray, MRI, CT, microscopy, etc.)
+2. Anatomical structures visible
+3. Notable findings or abnormalities
+4. Image quality and contrast
+5. Clinical relevance
+Be professional and precise.
+    """.strip()
+}
+
+
+def convert_with_ai(
+    input_file: Path,
+    output_file: Path,
+    api_key: str,
+    model: str = "anthropic/claude-sonnet-4.5",
+    prompt_type: str = "general",
+    custom_prompt: str = None
+) -> bool:
+    """
+    Convert a file to Markdown with AI image descriptions.
+    
+    Args:
+        input_file: Path to input file
+        output_file: Path to output Markdown file
+        api_key: OpenRouter API key
+        model: Model name (default: anthropic/claude-sonnet-4.5)
+        prompt_type: Type of prompt to use
+        custom_prompt: Custom prompt (overrides prompt_type)
+        
+    Returns:
+        True if successful, False otherwise
+    """
+    try:
+        # Initialize OpenRouter client (OpenAI-compatible)
+        client = OpenAI(
+            api_key=api_key,
+            base_url="https://openrouter.ai/api/v1"
+        )
+        
+        # Select prompt
+        if custom_prompt:
+            prompt = custom_prompt
+        else:
+            prompt = PROMPTS.get(prompt_type, PROMPTS['general'])
+        
+        print(f"Using model: {model}")
+        print(f"Prompt type: {prompt_type if not custom_prompt else 'custom'}")
+        print(f"Converting: {input_file}")
+        
+        # Create MarkItDown with AI support
+        md = MarkItDown(
+            llm_client=client,
+            llm_model=model,
+            llm_prompt=prompt
+        )
+        
+        # Convert file
+        result = md.convert(str(input_file))
+        
+        # Create output with metadata
+        content = f"# {result.title or input_file.stem}\n\n"
+        content += f"**Source**: {input_file.name}\n"
+        content += f"**Format**: {input_file.suffix}\n"
+        content += f"**AI Model**: {model}\n"
+        content += f"**Prompt Type**: {prompt_type if not custom_prompt else 'custom'}\n\n"
+        content += "---\n\n"
+        content += result.text_content
+        
+        # Write output
+        output_file.parent.mkdir(parents=True, exist_ok=True)
+        output_file.write_text(content, encoding='utf-8')
+        
+        print(f"✓ Successfully converted to: {output_file}")
+        return True
+        
+    except Exception as e:
+        print(f"✗ Error: {str(e)}", file=sys.stderr)
+        return False
+
+
+def main():
+    parser = argparse.ArgumentParser(
+        description="Convert documents to Markdown with AI-enhanced image descriptions",
+        formatter_class=argparse.RawDescriptionHelpFormatter,
+        epilog=f"""
+Available prompt types:
+  scientific    - For scientific diagrams, graphs, and charts
+  presentation  - For presentation slides
+  general       - General-purpose image description
+  data_viz      - For data visualizations and charts
+  medical       - For medical imaging
+
+Examples:
+  # Convert a scientific paper
+  python convert_with_ai.py paper.pdf output.md --prompt-type scientific
+  
+  # Convert a presentation with custom model
+  python convert_with_ai.py slides.pptx slides.md --model anthropic/claude-sonnet-4.5 --prompt-type presentation
+  
+  # Use custom prompt with Claude Sonnet 4.5
+  python convert_with_ai.py diagram.png diagram.md --model anthropic/claude-sonnet-4.5 --custom-prompt "Describe this technical diagram"
+  
+  # Set API key via environment variable
+  export OPENROUTER_API_KEY="sk-or-v1-..."
+  python convert_with_ai.py image.jpg image.md
+
+Environment Variables:
+  OPENROUTER_API_KEY    OpenRouter API key (required if not passed via --api-key)
+
+Popular Models (use with --model):
+  anthropic/claude-sonnet-4.5 - Claude Sonnet 4.5 (recommended, vision support)
+  anthropic/claude-3.5-sonnet - Claude 3.5 Sonnet (vision support)
+  openai/gpt-4o              - GPT-4 Omni (vision support)
+  openai/gpt-4-vision        - GPT-4 Vision
+  google/gemini-pro-vision   - Gemini Pro Vision
+        """
+    )
+    
+    parser.add_argument('input', type=Path, help='Input file')
+    parser.add_argument('output', type=Path, help='Output Markdown file')
+    parser.add_argument(
+        '--api-key', '-k',
+        help='OpenRouter API key (or set OPENROUTER_API_KEY env var)'
+    )
+    parser.add_argument(
+        '--model', '-m',
+        default='anthropic/claude-sonnet-4.5',
+        help='Model to use via OpenRouter (default: anthropic/claude-sonnet-4.5)'
+    )
+    parser.add_argument(
+        '--prompt-type', '-t',
+        choices=list(PROMPTS.keys()),
+        default='general',
+        help='Type of prompt to use (default: general)'
+    )
+    parser.add_argument(
+        '--custom-prompt', '-p',
+        help='Custom prompt (overrides --prompt-type)'
+    )
+    parser.add_argument(
+        '--list-prompts', '-l',
+        action='store_true',
+        help='List available prompt types and exit'
+    )
+    
+    args = parser.parse_args()
+    
+    # List prompts and exit
+    if args.list_prompts:
+        print("Available prompt types:\n")
+        for name, prompt in PROMPTS.items():
+            print(f"[{name}]")
+            print(prompt)
+            print("\n" + "="*60 + "\n")
+        sys.exit(0)
+    
+    # Get API key
+    api_key = args.api_key or os.environ.get('OPENROUTER_API_KEY')
+    if not api_key:
+        print("Error: OpenRouter API key required. Set OPENROUTER_API_KEY environment variable or use --api-key")
+        print("Get your API key at: https://openrouter.ai/keys")
+        sys.exit(1)
+    
+    # Validate input file
+    if not args.input.exists():
+        print(f"Error: Input file '{args.input}' does not exist")
+        sys.exit(1)
+    
+    # Convert file
+    success = convert_with_ai(
+        input_file=args.input,
+        output_file=args.output,
+        api_key=api_key,
+        model=args.model,
+        prompt_type=args.prompt_type,
+        custom_prompt=args.custom_prompt
+    )
+    
+    sys.exit(0 if success else 1)
+
+
+if __name__ == '__main__':
+    main()
+