Initial commit
This commit is contained in:
317
skills/markitdown/scripts/batch_convert.py
Executable file
317
skills/markitdown/scripts/batch_convert.py
Executable file
@@ -0,0 +1,317 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Batch conversion utility for MarkItDown.
|
||||
|
||||
Converts all supported files in a directory to Markdown format.
|
||||
"""
|
||||
|
||||
import os
|
||||
import sys
|
||||
from pathlib import Path
|
||||
from markitdown import MarkItDown
|
||||
from typing import Optional, List
|
||||
import argparse
|
||||
|
||||
|
||||
# Supported file extensions
|
||||
SUPPORTED_EXTENSIONS = {
|
||||
'.pdf', '.docx', '.pptx', '.xlsx', '.xls',
|
||||
'.jpg', '.jpeg', '.png', '.gif', '.bmp', '.tiff',
|
||||
'.wav', '.mp3', '.flac', '.ogg', '.aiff',
|
||||
'.html', '.htm', '.epub',
|
||||
'.csv', '.json', '.xml',
|
||||
'.zip'
|
||||
}
|
||||
|
||||
|
||||
def setup_markitdown(
|
||||
use_llm: bool = False,
|
||||
llm_model: str = "gpt-4o",
|
||||
use_azure_di: bool = False,
|
||||
azure_endpoint: Optional[str] = None,
|
||||
azure_key: Optional[str] = None
|
||||
) -> MarkItDown:
|
||||
"""
|
||||
Setup MarkItDown instance with optional advanced features.
|
||||
|
||||
Args:
|
||||
use_llm: Enable LLM-powered image descriptions
|
||||
llm_model: LLM model to use (default: gpt-4o)
|
||||
use_azure_di: Enable Azure Document Intelligence
|
||||
azure_endpoint: Azure Document Intelligence endpoint
|
||||
azure_key: Azure Document Intelligence API key
|
||||
|
||||
Returns:
|
||||
Configured MarkItDown instance
|
||||
"""
|
||||
kwargs = {}
|
||||
|
||||
if use_llm:
|
||||
try:
|
||||
from openai import OpenAI
|
||||
client = OpenAI()
|
||||
kwargs['llm_client'] = client
|
||||
kwargs['llm_model'] = llm_model
|
||||
print(f"✓ LLM integration enabled ({llm_model})")
|
||||
except ImportError:
|
||||
print("✗ Warning: OpenAI not installed, LLM features disabled")
|
||||
print(" Install with: pip install openai")
|
||||
|
||||
if use_azure_di:
|
||||
if azure_endpoint and azure_key:
|
||||
kwargs['docintel_endpoint'] = azure_endpoint
|
||||
kwargs['docintel_key'] = azure_key
|
||||
print("✓ Azure Document Intelligence enabled")
|
||||
else:
|
||||
print("✗ Warning: Azure credentials not provided, Azure DI disabled")
|
||||
|
||||
return MarkItDown(**kwargs)
|
||||
|
||||
|
||||
def convert_file(
|
||||
md: MarkItDown,
|
||||
input_path: Path,
|
||||
output_dir: Path,
|
||||
verbose: bool = False
|
||||
) -> bool:
|
||||
"""
|
||||
Convert a single file to Markdown.
|
||||
|
||||
Args:
|
||||
md: MarkItDown instance
|
||||
input_path: Path to input file
|
||||
output_dir: Directory for output files
|
||||
verbose: Print detailed progress
|
||||
|
||||
Returns:
|
||||
True if successful, False otherwise
|
||||
"""
|
||||
try:
|
||||
if verbose:
|
||||
print(f" Processing: {input_path.name}")
|
||||
|
||||
# Convert file
|
||||
result = md.convert(str(input_path))
|
||||
|
||||
# Create output filename
|
||||
output_filename = input_path.stem + '.md'
|
||||
output_path = output_dir / output_filename
|
||||
|
||||
# Write output
|
||||
with open(output_path, 'w', encoding='utf-8') as f:
|
||||
f.write(result.text_content)
|
||||
|
||||
if verbose:
|
||||
print(f" ✓ Converted: {input_path.name} → {output_filename}")
|
||||
|
||||
return True
|
||||
|
||||
except Exception as e:
|
||||
print(f" ✗ Error converting {input_path.name}: {e}")
|
||||
return False
|
||||
|
||||
|
||||
def find_files(input_dir: Path, recursive: bool = False) -> List[Path]:
|
||||
"""
|
||||
Find all supported files in directory.
|
||||
|
||||
Args:
|
||||
input_dir: Directory to search
|
||||
recursive: Search subdirectories
|
||||
|
||||
Returns:
|
||||
List of file paths
|
||||
"""
|
||||
files = []
|
||||
|
||||
if recursive:
|
||||
for ext in SUPPORTED_EXTENSIONS:
|
||||
files.extend(input_dir.rglob(f"*{ext}"))
|
||||
else:
|
||||
for ext in SUPPORTED_EXTENSIONS:
|
||||
files.extend(input_dir.glob(f"*{ext}"))
|
||||
|
||||
return sorted(files)
|
||||
|
||||
|
||||
def batch_convert(
|
||||
input_dir: str,
|
||||
output_dir: str,
|
||||
recursive: bool = False,
|
||||
use_llm: bool = False,
|
||||
llm_model: str = "gpt-4o",
|
||||
use_azure_di: bool = False,
|
||||
azure_endpoint: Optional[str] = None,
|
||||
azure_key: Optional[str] = None,
|
||||
verbose: bool = False
|
||||
) -> None:
|
||||
"""
|
||||
Batch convert all supported files in a directory.
|
||||
|
||||
Args:
|
||||
input_dir: Input directory containing files
|
||||
output_dir: Output directory for Markdown files
|
||||
recursive: Search subdirectories
|
||||
use_llm: Enable LLM-powered descriptions
|
||||
llm_model: LLM model to use
|
||||
use_azure_di: Enable Azure Document Intelligence
|
||||
azure_endpoint: Azure DI endpoint
|
||||
azure_key: Azure DI API key
|
||||
verbose: Print detailed progress
|
||||
"""
|
||||
input_path = Path(input_dir)
|
||||
output_path = Path(output_dir)
|
||||
|
||||
# Validate input directory
|
||||
if not input_path.exists():
|
||||
print(f"✗ Error: Input directory '{input_dir}' does not exist")
|
||||
sys.exit(1)
|
||||
|
||||
if not input_path.is_dir():
|
||||
print(f"✗ Error: '{input_dir}' is not a directory")
|
||||
sys.exit(1)
|
||||
|
||||
# Create output directory
|
||||
output_path.mkdir(parents=True, exist_ok=True)
|
||||
|
||||
# Setup MarkItDown
|
||||
print("Setting up MarkItDown...")
|
||||
md = setup_markitdown(
|
||||
use_llm=use_llm,
|
||||
llm_model=llm_model,
|
||||
use_azure_di=use_azure_di,
|
||||
azure_endpoint=azure_endpoint,
|
||||
azure_key=azure_key
|
||||
)
|
||||
|
||||
# Find files
|
||||
print(f"\nScanning directory: {input_dir}")
|
||||
if recursive:
|
||||
print(" (including subdirectories)")
|
||||
|
||||
files = find_files(input_path, recursive)
|
||||
|
||||
if not files:
|
||||
print("✗ No supported files found")
|
||||
print(f" Supported extensions: {', '.join(sorted(SUPPORTED_EXTENSIONS))}")
|
||||
sys.exit(0)
|
||||
|
||||
print(f"✓ Found {len(files)} file(s) to convert\n")
|
||||
|
||||
# Convert files
|
||||
successful = 0
|
||||
failed = 0
|
||||
|
||||
for file_path in files:
|
||||
if convert_file(md, file_path, output_path, verbose):
|
||||
successful += 1
|
||||
else:
|
||||
failed += 1
|
||||
|
||||
# Summary
|
||||
print(f"\n{'='*60}")
|
||||
print(f"Conversion complete!")
|
||||
print(f" Successful: {successful}")
|
||||
print(f" Failed: {failed}")
|
||||
print(f" Output: {output_dir}")
|
||||
print(f"{'='*60}")
|
||||
|
||||
|
||||
def main():
|
||||
"""Main entry point."""
|
||||
parser = argparse.ArgumentParser(
|
||||
description="Batch convert files to Markdown using MarkItDown",
|
||||
formatter_class=argparse.RawDescriptionHelpFormatter,
|
||||
epilog="""
|
||||
Examples:
|
||||
# Basic usage
|
||||
python batch_convert.py documents/ output/
|
||||
|
||||
# Recursive conversion
|
||||
python batch_convert.py documents/ output/ --recursive
|
||||
|
||||
# With LLM-powered image descriptions
|
||||
python batch_convert.py documents/ output/ --llm
|
||||
|
||||
# With Azure Document Intelligence
|
||||
python batch_convert.py documents/ output/ --azure \\
|
||||
--azure-endpoint https://example.cognitiveservices.azure.com/ \\
|
||||
--azure-key YOUR-KEY
|
||||
|
||||
# All features enabled
|
||||
python batch_convert.py documents/ output/ --llm --azure \\
|
||||
--azure-endpoint $AZURE_ENDPOINT --azure-key $AZURE_KEY
|
||||
|
||||
Supported file types:
|
||||
Documents: PDF, DOCX, PPTX, XLSX, XLS
|
||||
Images: JPG, PNG, GIF, BMP, TIFF
|
||||
Audio: WAV, MP3, FLAC, OGG, AIFF
|
||||
Web: HTML, EPUB
|
||||
Data: CSV, JSON, XML
|
||||
Archives: ZIP
|
||||
"""
|
||||
)
|
||||
|
||||
parser.add_argument(
|
||||
'input_dir',
|
||||
help='Input directory containing files to convert'
|
||||
)
|
||||
parser.add_argument(
|
||||
'output_dir',
|
||||
help='Output directory for Markdown files'
|
||||
)
|
||||
parser.add_argument(
|
||||
'-r', '--recursive',
|
||||
action='store_true',
|
||||
help='Recursively search subdirectories'
|
||||
)
|
||||
parser.add_argument(
|
||||
'--llm',
|
||||
action='store_true',
|
||||
help='Enable LLM-powered image descriptions (requires OpenAI API key)'
|
||||
)
|
||||
parser.add_argument(
|
||||
'--llm-model',
|
||||
default='gpt-4o',
|
||||
help='LLM model to use (default: gpt-4o)'
|
||||
)
|
||||
parser.add_argument(
|
||||
'--azure',
|
||||
action='store_true',
|
||||
help='Enable Azure Document Intelligence for PDFs'
|
||||
)
|
||||
parser.add_argument(
|
||||
'--azure-endpoint',
|
||||
help='Azure Document Intelligence endpoint URL'
|
||||
)
|
||||
parser.add_argument(
|
||||
'--azure-key',
|
||||
help='Azure Document Intelligence API key'
|
||||
)
|
||||
parser.add_argument(
|
||||
'-v', '--verbose',
|
||||
action='store_true',
|
||||
help='Print detailed progress'
|
||||
)
|
||||
|
||||
args = parser.parse_args()
|
||||
|
||||
# Environment variable fallbacks for Azure
|
||||
azure_endpoint = args.azure_endpoint or os.getenv('AZURE_DOCUMENT_INTELLIGENCE_ENDPOINT')
|
||||
azure_key = args.azure_key or os.getenv('AZURE_DOCUMENT_INTELLIGENCE_KEY')
|
||||
|
||||
batch_convert(
|
||||
input_dir=args.input_dir,
|
||||
output_dir=args.output_dir,
|
||||
recursive=args.recursive,
|
||||
use_llm=args.llm,
|
||||
llm_model=args.llm_model,
|
||||
use_azure_di=args.azure,
|
||||
azure_endpoint=azure_endpoint,
|
||||
azure_key=azure_key,
|
||||
verbose=args.verbose
|
||||
)
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
main()
|
||||
Reference in New Issue
Block a user