Initial commit

2025-11-30 08:30:10 +08:00
commit f0bd18fb4e
824 changed files with 331919 additions and 0 deletions
--- a/skills/markitdown/scripts/batch_convert.py
+++ b/skills/markitdown/scripts/batch_convert.py
@@ -0,0 +1,317 @@
+#!/usr/bin/env python3
+"""
+Batch conversion utility for MarkItDown.
+
+Converts all supported files in a directory to Markdown format.
+"""
+
+import os
+import sys
+from pathlib import Path
+from markitdown import MarkItDown
+from typing import Optional, List
+import argparse
+
+
+# Supported file extensions (lowercase, with the leading dot). find_files()
+# builds its glob patterns from this set, and batch_convert() prints it when
+# no matching file is found.
+SUPPORTED_EXTENSIONS = {
+    # Documents
+    '.pdf', '.docx', '.pptx', '.xlsx', '.xls',
+    # Images
+    '.jpg', '.jpeg', '.png', '.gif', '.bmp', '.tiff',
+    # Audio
+    '.wav', '.mp3', '.flac', '.ogg', '.aiff',
+    # Web and e-books
+    '.html', '.htm', '.epub',
+    # Structured data
+    '.csv', '.json', '.xml',
+    # Archives
+    '.zip'
+}
+
+
+def setup_markitdown(
+    use_llm: bool = False,
+    llm_model: str = "gpt-4o",
+    use_azure_di: bool = False,
+    azure_endpoint: Optional[str] = None,
+    azure_key: Optional[str] = None
+) -> MarkItDown:
+    """
+    Setup MarkItDown instance with optional advanced features.
+
+    Each optional feature degrades gracefully: when one of its prerequisites
+    is missing, a warning is printed and the instance is built without that
+    feature instead of aborting the whole run.
+
+    Args:
+        use_llm: Enable LLM-powered image descriptions
+        llm_model: LLM model to use (default: gpt-4o)
+        use_azure_di: Enable Azure Document Intelligence
+        azure_endpoint: Azure Document Intelligence endpoint
+        azure_key: Azure Document Intelligence API key
+
+    Returns:
+        Configured MarkItDown instance
+    """
+    kwargs = {}
+
+    if use_llm:
+        try:
+            # Imported lazily so the openai package is only required when
+            # the LLM feature is actually requested.
+            from openai import OpenAI, OpenAIError
+        except ImportError:
+            print("✗ Warning: OpenAI not installed, LLM features disabled")
+            print("  Install with: pip install openai")
+        else:
+            try:
+                # The client constructor raises OpenAIError when no API key
+                # is configured (e.g. OPENAI_API_KEY is unset).
+                kwargs['llm_client'] = OpenAI()
+            except OpenAIError as e:
+                print(f"✗ Warning: Could not create OpenAI client, LLM features disabled: {e}")
+            else:
+                kwargs['llm_model'] = llm_model
+                print(f"✓ LLM integration enabled ({llm_model})")
+
+    if use_azure_di:
+        # Both the endpoint and the key are needed; one without the other
+        # is treated the same as providing neither.
+        if azure_endpoint and azure_key:
+            kwargs['docintel_endpoint'] = azure_endpoint
+            kwargs['docintel_key'] = azure_key
+            print("✓ Azure Document Intelligence enabled")
+        else:
+            print("✗ Warning: Azure credentials not provided, Azure DI disabled")
+
+    return MarkItDown(**kwargs)
+
+
+def convert_file(
+    md: MarkItDown,
+    input_path: Path,
+    output_dir: Path,
+    verbose: bool = False
+) -> bool:
+    """
+    Convert one input file and write the result as Markdown.
+
+    The output is written to ``output_dir`` under the input's stem with a
+    ``.md`` suffix (``report.pdf`` becomes ``report.md``). Inputs that share
+    a stem therefore map to the same output file, and the later one
+    overwrites the earlier one.
+
+    Any exception raised during conversion or writing is reported on stdout
+    and turned into a ``False`` return value, so one bad file does not stop
+    a batch.
+
+    Args:
+        md: MarkItDown instance used for the conversion
+        input_path: Path to input file
+        output_dir: Existing directory that receives the Markdown file
+        verbose: Print a line before and after the conversion
+
+    Returns:
+        True if the file was converted and written, False otherwise
+    """
+    try:
+        if verbose:
+            print(f"  Processing: {input_path.name}")
+
+        converted = md.convert(str(input_path))
+
+        # Same base name as the input, with the extension swapped for .md
+        target_name = input_path.stem + '.md'
+        target = output_dir / target_name
+
+        with target.open('w', encoding='utf-8') as handle:
+            handle.write(converted.text_content)
+
+        if verbose:
+            print(f"  ✓ Converted: {input_path.name} → {target_name}")
+    except Exception as exc:
+        # Per-file boundary: report the failure and let the caller continue.
+        print(f"  ✗ Error converting {input_path.name}: {exc}")
+        return False
+    else:
+        return True
+
+
+def find_files(input_dir: Path, recursive: bool = False) -> List[Path]:
+    """
+    Find all supported files in directory.
+
+    A path is selected when it is a regular file (directories are skipped,
+    even if their name ends in a supported extension) and its name ends
+    with one of SUPPORTED_EXTENSIONS. The comparison is case-insensitive,
+    so ``REPORT.PDF`` and ``photo.JPG`` are found on case-sensitive file
+    systems as well.
+
+    Args:
+        input_dir: Directory to search
+        recursive: Search subdirectories
+
+    Returns:
+        Sorted list of file paths
+    """
+    # str.endswith accepts a tuple, so one call tests every extension.
+    extensions = tuple(SUPPORTED_EXTENSIONS)
+
+    # Walk the tree once rather than once per extension.
+    candidates = input_dir.rglob('*') if recursive else input_dir.glob('*')
+
+    return sorted(
+        path for path in candidates
+        if path.is_file() and path.name.lower().endswith(extensions)
+    )
+
+
+def batch_convert(
+    input_dir: str,
+    output_dir: str,
+    recursive: bool = False,
+    use_llm: bool = False,
+    llm_model: str = "gpt-4o",
+    use_azure_di: bool = False,
+    azure_endpoint: Optional[str] = None,
+    azure_key: Optional[str] = None,
+    verbose: bool = False
+) -> None:
+    """
+    Batch convert all supported files in a directory.
+
+    Progress, warnings and a final summary are printed to stdout. The
+    process exits with status 1 when the input directory is missing or is
+    not a directory, and with status 0 when no supported file is found.
+
+    All results are written directly into ``output_dir`` as ``<stem>.md``.
+    Inputs that share a stem (``report.pdf`` and ``report.docx``, or files
+    with the same name in different subdirectories) map to the same output
+    file; a warning is printed for each such overwrite.
+
+    Args:
+        input_dir: Input directory containing files
+        output_dir: Output directory for Markdown files (created if missing)
+        recursive: Search subdirectories
+        use_llm: Enable LLM-powered descriptions
+        llm_model: LLM model to use
+        use_azure_di: Enable Azure Document Intelligence
+        azure_endpoint: Azure DI endpoint
+        azure_key: Azure DI API key
+        verbose: Print detailed progress
+    """
+    input_path = Path(input_dir)
+    output_path = Path(output_dir)
+
+    # Validate input directory
+    if not input_path.exists():
+        print(f"✗ Error: Input directory '{input_dir}' does not exist")
+        sys.exit(1)
+
+    if not input_path.is_dir():
+        print(f"✗ Error: '{input_dir}' is not a directory")
+        sys.exit(1)
+
+    # Create output directory
+    output_path.mkdir(parents=True, exist_ok=True)
+
+    # Setup MarkItDown
+    print("Setting up MarkItDown...")
+    md = setup_markitdown(
+        use_llm=use_llm,
+        llm_model=llm_model,
+        use_azure_di=use_azure_di,
+        azure_endpoint=azure_endpoint,
+        azure_key=azure_key
+    )
+
+    # Find files
+    print(f"\nScanning directory: {input_dir}")
+    if recursive:
+        print("  (including subdirectories)")
+
+    files = find_files(input_path, recursive)
+
+    if not files:
+        print("✗ No supported files found")
+        print(f"  Supported extensions: {', '.join(sorted(SUPPORTED_EXTENSIONS))}")
+        sys.exit(0)
+
+    print(f"✓ Found {len(files)} file(s) to convert\n")
+
+    # Convert files
+    successful = 0
+
+    # Maps each output file name to the first input that produces it.
+    # convert_file() names its output "<stem>.md" inside output_path, so a
+    # second input with the same stem would silently replace the first
+    # result; surface that instead of losing data without notice.
+    first_source = {}
+
+    for file_path in files:
+        target_name = file_path.stem + '.md'
+        earlier = first_source.setdefault(target_name, file_path)
+        if earlier is not file_path:
+            print(
+                f"  ✗ Warning: {file_path} overwrites {target_name} "
+                f"(already written from {earlier})"
+            )
+
+        if convert_file(md, file_path, output_path, verbose):
+            successful += 1
+
+    failed = len(files) - successful
+
+    # Summary
+    print(f"\n{'='*60}")
+    print("Conversion complete!")
+    print(f"  Successful: {successful}")
+    print(f"  Failed:     {failed}")
+    print(f"  Output:     {output_dir}")
+    print(f"{'='*60}")
+
+
+def main():
+    """Parse the command line and run the batch conversion."""
+    # Shown verbatim below the option list (RawDescriptionHelpFormatter
+    # keeps the line breaks and indentation as written).
+    usage_examples = """
+Examples:
+  # Basic usage
+  python batch_convert.py documents/ output/
+
+  # Recursive conversion
+  python batch_convert.py documents/ output/ --recursive
+
+  # With LLM-powered image descriptions
+  python batch_convert.py documents/ output/ --llm
+
+  # With Azure Document Intelligence
+  python batch_convert.py documents/ output/ --azure \\
+      --azure-endpoint https://example.cognitiveservices.azure.com/ \\
+      --azure-key YOUR-KEY
+
+  # All features enabled
+  python batch_convert.py documents/ output/ --llm --azure \\
+      --azure-endpoint $AZURE_ENDPOINT --azure-key $AZURE_KEY
+
+Supported file types:
+  Documents: PDF, DOCX, PPTX, XLSX, XLS
+  Images:    JPG, PNG, GIF, BMP, TIFF
+  Audio:     WAV, MP3, FLAC, OGG, AIFF
+  Web:       HTML, EPUB
+  Data:      CSV, JSON, XML
+  Archives:  ZIP
+        """
+
+    cli = argparse.ArgumentParser(
+        description="Batch convert files to Markdown using MarkItDown",
+        formatter_class=argparse.RawDescriptionHelpFormatter,
+        epilog=usage_examples
+    )
+
+    # (names, keyword arguments) for every argument, in the order they
+    # should appear in the help output.
+    argument_specs = [
+        (('input_dir',),
+         {'help': 'Input directory containing files to convert'}),
+        (('output_dir',),
+         {'help': 'Output directory for Markdown files'}),
+        (('-r', '--recursive'),
+         {'action': 'store_true',
+          'help': 'Recursively search subdirectories'}),
+        (('--llm',),
+         {'action': 'store_true',
+          'help': 'Enable LLM-powered image descriptions (requires OpenAI API key)'}),
+        (('--llm-model',),
+         {'default': 'gpt-4o',
+          'help': 'LLM model to use (default: gpt-4o)'}),
+        (('--azure',),
+         {'action': 'store_true',
+          'help': 'Enable Azure Document Intelligence for PDFs'}),
+        (('--azure-endpoint',),
+         {'help': 'Azure Document Intelligence endpoint URL'}),
+        (('--azure-key',),
+         {'help': 'Azure Document Intelligence API key'}),
+        (('-v', '--verbose'),
+         {'action': 'store_true',
+          'help': 'Print detailed progress'}),
+    ]
+    for names, options in argument_specs:
+        cli.add_argument(*names, **options)
+
+    opts = cli.parse_args()
+
+    # Command-line values win; the environment is only a fallback.
+    endpoint = opts.azure_endpoint or os.environ.get('AZURE_DOCUMENT_INTELLIGENCE_ENDPOINT')
+    key = opts.azure_key or os.environ.get('AZURE_DOCUMENT_INTELLIGENCE_KEY')
+
+    batch_convert(
+        input_dir=opts.input_dir,
+        output_dir=opts.output_dir,
+        recursive=opts.recursive,
+        use_llm=opts.llm,
+        llm_model=opts.llm_model,
+        use_azure_di=opts.azure,
+        azure_endpoint=endpoint,
+        azure_key=key,
+        verbose=opts.verbose
+    )
+
+
+# Run the command-line interface only when this file is executed as a
+# script, not when it is imported as a module.
+if __name__ == '__main__':
+    main()