Files
gh-k-dense-ai-claude-scient…/skills/markitdown/assets/example_usage.md
2025-11-30 08:30:14 +08:00

10 KiB

MarkItDown Example Usage

This document provides practical examples of using MarkItDown in various scenarios.

Basic Examples

1. Simple File Conversion

from markitdown import MarkItDown

md = MarkItDown()

# A PDF, a Word document, and a PowerPoint all go through the same convert() call
for source in ("research_paper.pdf", "manuscript.docx", "presentation.pptx"):
    result = md.convert(source)
    print(result.text_content)

2. Save to File

from markitdown import MarkItDown

md = MarkItDown()
markdown_text = md.convert("document.pdf").text_content

# Write the converted Markdown out as UTF-8
with open("output.md", "w", encoding="utf-8") as out_file:
    out_file.write(markdown_text)

3. Convert from Stream

from markitdown import MarkItDown

md = MarkItDown()

# The stream is opened in binary mode; file_extension passes the format hint
# alongside the stream
with open("document.pdf", "rb") as stream:
    result = md.convert_stream(stream, file_extension=".pdf")

# The conversion result is already in hand, so the file can be closed first
print(result.text_content)

Scientific Workflows

Convert Research Papers

from markitdown import MarkItDown
from pathlib import Path

md = MarkItDown()

# Convert all papers in a directory
papers_dir = Path("research_papers/")
output_dir = Path("markdown_papers/")
output_dir.mkdir(exist_ok=True)

# sorted() makes the processing (and log) order deterministic across runs
for paper in sorted(papers_dir.glob("*.pdf")):
    result = md.convert(str(paper))

    # Save with original filename. Write UTF-8 explicitly: the platform default
    # encoding may be unable to represent symbols found in papers.
    output_file = output_dir / f"{paper.stem}.md"
    output_file.write_text(result.text_content, encoding="utf-8")

    print(f"Converted: {paper.name}")

Extract Tables from Excel

from markitdown import MarkItDown

md = MarkItDown()

# Convert Excel to Markdown tables
result = md.convert("experimental_data.xlsx")

# The result contains Markdown-formatted tables
print(result.text_content)

# Save for further processing. UTF-8 is set explicitly so cell values with
# non-ASCII characters do not depend on the platform default encoding.
with open("data_tables.md", "w", encoding="utf-8") as f:
    f.write(result.text_content)

Process Presentation Slides

from markitdown import MarkItDown
from openai import OpenAI

# With AI descriptions for images.
# The model ID used below is an OpenRouter-style identifier, so the client is
# pointed at the OpenRouter endpoint rather than the default OpenAI one.
client = OpenAI(
    api_key="your-openrouter-api-key",
    base_url="https://openrouter.ai/api/v1"
)
md = MarkItDown(
    llm_client=client,
    llm_model="anthropic/claude-sonnet-4.5",
    llm_prompt="Describe this scientific slide, focusing on data and key findings"
)

result = md.convert("conference_talk.pptx")

# Wrap the converted slides under a top-level heading
output = f"""# Conference Talk

{result.text_content}
"""

# Explicit UTF-8 keeps the write independent of the platform default encoding
with open("talk_notes.md", "w", encoding="utf-8") as f:
    f.write(output)

AI-Enhanced Conversions

Detailed Image Descriptions

from markitdown import MarkItDown
from openai import OpenAI

# Prompt sent with each image: asks for a technical reading of the figure
scientific_prompt = """
Analyze this scientific figure. Describe:
- Type of visualization (graph, microscopy, diagram, etc.)
- Key data points and trends
- Axes, labels, and legends
- Scientific significance
Be technical and precise.
"""

# OpenAI-compatible client pointed at the OpenRouter endpoint
client = OpenAI(
    base_url="https://openrouter.ai/api/v1",
    api_key="your-openrouter-api-key"
)

md = MarkItDown(
    llm_prompt=scientific_prompt,
    llm_model="anthropic/claude-sonnet-4.5",  # recommended for scientific vision
    llm_client=client
)

# Convert paper with figures
# NOTE(review): confirm that the PDF converter actually routes embedded figures
# through the LLM; if not, this example needs an image or PPTX input instead.
result = md.convert("paper_with_figures.pdf")
print(result.text_content)

Different Prompts for Different Files

from markitdown import MarkItDown
from openai import OpenAI

# Initialize OpenRouter client
client = OpenAI(
    api_key="your-openrouter-api-key",
    base_url="https://openrouter.ai/api/v1"
)

# Scientific papers - prompt tuned for technical analysis of figures
scientific_md = MarkItDown(
    llm_client=client,
    llm_model="anthropic/claude-sonnet-4.5",
    llm_prompt="Describe scientific figures with technical precision"
)

# Presentations - same client and model, but a prompt tuned for summarizing slides
presentation_md = MarkItDown(
    llm_client=client,
    llm_model="anthropic/claude-sonnet-4.5",
    llm_prompt="Summarize slide content and key visual elements"
)

# Use appropriate instance for each file
paper_result = scientific_md.convert("research.pdf")
slides_result = presentation_md.convert("talk.pptx")

Batch Processing

Process Multiple Files

from markitdown import MarkItDown
from pathlib import Path

md = MarkItDown()

files_to_convert = [
    "paper1.pdf",
    "data.xlsx",
    "presentation.pptx",
    "notes.docx"
]

# Best-effort batch: one failing file is reported and the loop moves on
for file in files_to_convert:
    try:
        result = md.convert(file)
        # Output lands in the current directory, named after the input's stem
        output = Path(file).stem + ".md"

        # Explicit UTF-8 so the write does not depend on the platform default
        with open(output, "w", encoding="utf-8") as f:
            f.write(result.text_content)

        print(f"✓ {file} -> {output}")
    except Exception as e:
        print(f"✗ Error converting {file}: {e}")

Parallel Processing

from markitdown import MarkItDown
from pathlib import Path
from concurrent.futures import ThreadPoolExecutor

def convert_file(filepath):
    """Convert one file to Markdown and write it next to the working directory.

    Args:
        filepath: Path of the input document, as a string.

    Returns:
        A ``(filepath, output)`` tuple, where ``output`` is the name of the
        ``.md`` file written to the current directory.

    Note:
        The output name is built from the input's stem only, so two inputs
        with the same stem in different directories overwrite each other.
    """
    # A fresh converter per call, so worker threads do not share an instance
    md = MarkItDown()
    result = md.convert(filepath)

    output = Path(filepath).stem + ".md"
    # Explicit UTF-8 so the write does not depend on the platform default
    with open(output, "w", encoding="utf-8") as f:
        f.write(result.text_content)

    return filepath, output

files = list(Path("documents/").glob("*.pdf"))
# convert_file is handed plain string paths
path_args = [str(f) for f in files]

with ThreadPoolExecutor(max_workers=4) as pool:
    # map() yields results in input order
    for input_file, output_file in pool.map(convert_file, path_args):
        print(f"Converted: {input_file} -> {output_file}")

Integration Examples

Literature Review Pipeline

from markitdown import MarkItDown
from pathlib import Path
import json

md = MarkItDown()

# Convert papers and create metadata
papers_dir = Path("literature/")
output_dir = Path("literature_markdown/")
output_dir.mkdir(exist_ok=True)

catalog = []

for paper in papers_dir.glob("*.pdf"):
    result = md.convert(str(paper))

    # Save Markdown (explicit UTF-8: independent of the platform default)
    md_file = output_dir / f"{paper.stem}.md"
    md_file.write_text(result.text_content, encoding="utf-8")

    # Store metadata; fall back to the file stem when no title was extracted
    catalog.append({
        "title": result.title or paper.stem,
        "source": paper.name,
        "markdown": str(md_file),
        "word_count": len(result.text_content.split())
    })

# Save catalog
with open(output_dir / "catalog.json", "w", encoding="utf-8") as f:
    json.dump(catalog, f, indent=2)

Data Extraction Pipeline

from markitdown import MarkItDown
import re

md = MarkItDown()

# Convert Excel data to Markdown
result = md.convert("experimental_results.xlsx")

# Extract tables: a table is a run of consecutive lines starting with "|"
tables = []
current_table = []

for line in result.text_content.split('\n'):
    if line.strip().startswith('|'):
        current_table.append(line)
    elif current_table:
        # First non-table line after a run of rows closes the table
        tables.append('\n'.join(current_table))
        current_table = []

# Flush the final table: when the text ends on a table row there is no
# trailing non-table line to close it, and it would otherwise be lost
if current_table:
    tables.append('\n'.join(current_table))

# Process each table
for i, table in enumerate(tables, start=1):
    print(f"Table {i}:")
    print(table)
    print("\n" + "="*50 + "\n")

YouTube Transcript Analysis

from markitdown import MarkItDown

md = MarkItDown()

# Get transcript
video_url = "https://www.youtube.com/watch?v=VIDEO_ID"
result = md.convert(video_url)

# Save transcript. UTF-8 is set explicitly so non-ASCII transcript text does
# not depend on the platform default encoding.
with open("lecture_transcript.md", "w", encoding="utf-8") as f:
    f.write("# Lecture Transcript\n\n")
    f.write(f"**Source**: {video_url}\n\n")
    f.write(result.text_content)

Error Handling

Robust Conversion

from markitdown import MarkItDown
from pathlib import Path
import logging

# Configure the root logger so INFO-and-above records are emitted
logging.basicConfig(level=logging.INFO)
# Module-scoped logger used by safe_convert for its success/error messages
logger = logging.getLogger(__name__)

# Single converter instance, read as a global by safe_convert
md = MarkItDown()

def safe_convert(filepath):
    """Convert a file to Markdown, logging failures instead of raising.

    The Markdown is written to ``<stem>.md`` in the current directory.

    Args:
        filepath: Path of the input document, as a string.

    Returns:
        True if the file was converted and written, False on any error.
    """
    try:
        result = md.convert(filepath)
        output = Path(filepath).stem + ".md"

        # Explicit UTF-8 so the write does not depend on the platform default
        with open(output, "w", encoding="utf-8") as f:
            f.write(result.text_content)

        logger.info(f"Successfully converted {filepath}")
        return True

    except FileNotFoundError:
        logger.error(f"File not found: {filepath}")
        return False

    # NOTE(review): confirm which exception types md.convert raises for
    # unsupported formats; any that do not derive from ValueError are reported
    # by the generic handler below instead of this one.
    except ValueError as e:
        logger.error(f"Invalid file format for {filepath}: {e}")
        return False

    # Deliberate catch-all: a batch run should survive any single bad file
    except Exception as e:
        logger.error(f"Unexpected error converting {filepath}: {e}")
        return False

# Run safe_convert over a small batch and report the success ratio
files = ["paper.pdf", "data.xlsx", "slides.pptx"]
results = list(map(safe_convert, files))

# Each result is a bool, so sum() counts the successes
print(f"Successfully converted {sum(results)}/{len(files)} files")

Advanced Use Cases

Custom Metadata Extraction

from markitdown import MarkItDown
import re
from datetime import datetime

# Module-level converter instance; convert_with_metadata reads it as a global
md = MarkItDown()

def convert_with_metadata(filepath):
    """Convert a file and prepend a front-matter block built from its metadata.

    Args:
        filepath: Path of the input document, as a string.

    Returns:
        A ``(output, metadata)`` tuple. ``output`` is the Markdown text
        preceded by a ``---`` delimited header; ``metadata`` is a dict with
        the keys ``file``, ``title``, ``converted_at``, ``word_count`` and
        ``char_count``, plus ``author`` when one is found in the text.
    """
    from pathlib import Path

    result = md.convert(filepath)

    # Extract metadata from content
    metadata = {
        "file": filepath,
        # Fall back to the file stem so the header never reads "title: None"
        "title": result.title or Path(str(filepath)).stem,
        "converted_at": datetime.now().isoformat(),
        "word_count": len(result.text_content.split()),
        "char_count": len(result.text_content)
    }

    # Try to find author: first "Author:" or "By:" label, up to end of line
    author_match = re.search(r'(?:Author|By):\s*(.+?)(?:\n|$)', result.text_content)
    if author_match:
        metadata["author"] = author_match.group(1).strip()

    # Create formatted output; the literal is flush-left on purpose, since any
    # indentation here would become part of the emitted text
    output = f"""---
title: {metadata['title']}
author: {metadata.get('author', 'Unknown')}
source: {metadata['file']}
converted: {metadata['converted_at']}
words: {metadata['word_count']}
---

{result.text_content}
"""

    return output, metadata

# Use it: the function returns (formatted_text, metadata_dict)
content, meta = convert_with_metadata("paper.pdf")
print(meta)

Format-Specific Processing

from markitdown import MarkItDown
from pathlib import Path

# Module-level converter instance; process_by_format reads it as a global
md = MarkItDown()

def _count_markdown_tables(text):
    """Count Markdown tables by counting header-separator rows.

    A separator row starts with "|" and is made up solely of pipes, dashes,
    colons and spaces, e.g. ``|---|---|`` or ``| --- | :-: |``. Each table has
    exactly one such row, so this counts tables rather than columns.
    """
    separator_chars = set('|-: ')
    count = 0
    for line in text.splitlines():
        row = line.strip()
        if row.startswith('|') and '---' in row and set(row) <= separator_chars:
            count += 1
    return count


def process_by_format(filepath):
    """Convert a file and prepend a heading chosen by its extension.

    Args:
        filepath: Path of the input document, as a string.

    Returns:
        The converted Markdown. ``.pdf``, ``.xlsx`` and ``.pptx`` inputs
        (matched case-insensitively) get a title heading, and the latter two
        also get a table or slide count; any other extension is returned
        unchanged.
    """
    path = Path(filepath)
    result = md.convert(filepath)
    text = result.text_content
    # Compare case-insensitively so "TALK.PPTX" is treated like "talk.pptx"
    suffix = path.suffix.lower()

    if suffix == '.pdf':
        # Add PDF-specific heading
        output = f"# PDF Document: {path.stem}\n\n"
        output += text

    elif suffix == '.xlsx':
        # Add table count (one separator row per table)
        table_count = _count_markdown_tables(text)
        output = f"# Excel Data: {path.stem}\n\n"
        output += f"**Tables**: {table_count}\n\n"
        output += text

    elif suffix == '.pptx':
        # Add slide count
        # NOTE(review): this assumes each slide is introduced by a "## Slide"
        # heading; verify against the converter's actual PPTX output.
        slide_count = text.count('## Slide')
        output = f"# Presentation: {path.stem}\n\n"
        output += f"**Slides**: {slide_count}\n\n"
        output += text

    else:
        output = text

    return output

# Use it: returns the converted Markdown with a format-specific heading prepended
content = process_by_format("presentation.pptx")
print(content)