464 lines
10 KiB
Markdown
464 lines
10 KiB
Markdown
# MarkItDown Example Usage
|
|
|
|
This document provides practical examples of using MarkItDown in various scenarios.
|
|
|
|
## Basic Examples
|
|
|
|
### 1. Simple File Conversion
|
|
|
|
```python
|
|
from markitdown import MarkItDown
|
|
|
|
md = MarkItDown()
|
|
|
|
# Convert a PDF
|
|
result = md.convert("research_paper.pdf")
|
|
print(result.text_content)
|
|
|
|
# Convert a Word document
|
|
result = md.convert("manuscript.docx")
|
|
print(result.text_content)
|
|
|
|
# Convert a PowerPoint
|
|
result = md.convert("presentation.pptx")
|
|
print(result.text_content)
|
|
```
|
|
|
|
### 2. Save to File
|
|
|
|
```python
|
|
from markitdown import MarkItDown
|
|
|
|
md = MarkItDown()
|
|
result = md.convert("document.pdf")
|
|
|
|
with open("output.md", "w", encoding="utf-8") as f:
|
|
f.write(result.text_content)
|
|
```
|
|
|
|
### 3. Convert from Stream
|
|
|
|
```python
|
|
from markitdown import MarkItDown
|
|
|
|
md = MarkItDown()
|
|
|
|
with open("document.pdf", "rb") as f:
|
|
result = md.convert_stream(f, file_extension=".pdf")
|
|
print(result.text_content)
|
|
```
|
|
|
|
## Scientific Workflows
|
|
|
|
### Convert Research Papers
|
|
|
|
```python
|
|
from markitdown import MarkItDown
|
|
from pathlib import Path
|
|
|
|
md = MarkItDown()
|
|
|
|
# Convert all papers in a directory
|
|
papers_dir = Path("research_papers/")
|
|
output_dir = Path("markdown_papers/")
|
|
output_dir.mkdir(exist_ok=True)
|
|
|
|
for paper in papers_dir.glob("*.pdf"):
|
|
result = md.convert(str(paper))
|
|
|
|
# Save with original filename
|
|
output_file = output_dir / f"{paper.stem}.md"
|
|
output_file.write_text(result.text_content)
|
|
|
|
print(f"Converted: {paper.name}")
|
|
```
|
|
|
|
### Extract Tables from Excel
|
|
|
|
```python
|
|
from markitdown import MarkItDown
|
|
|
|
md = MarkItDown()
|
|
|
|
# Convert Excel to Markdown tables
|
|
result = md.convert("experimental_data.xlsx")
|
|
|
|
# The result contains Markdown-formatted tables
|
|
print(result.text_content)
|
|
|
|
# Save for further processing
|
|
with open("data_tables.md", "w") as f:
|
|
f.write(result.text_content)
|
|
```
|
|
|
|
### Process Presentation Slides
|
|
|
|
```python
|
|
from markitdown import MarkItDown
|
|
from openai import OpenAI
|
|
|
|
# With AI descriptions for images
|
|
client = OpenAI()
|
|
md = MarkItDown(
|
|
llm_client=client,
|
|
llm_model="anthropic/claude-sonnet-4.5",
|
|
llm_prompt="Describe this scientific slide, focusing on data and key findings"
|
|
)
|
|
|
|
result = md.convert("conference_talk.pptx")
|
|
|
|
# Save with metadata
|
|
output = f"""# Conference Talk
|
|
|
|
{result.text_content}
|
|
"""
|
|
|
|
with open("talk_notes.md", "w") as f:
|
|
f.write(output)
|
|
```
|
|
|
|
## AI-Enhanced Conversions
|
|
|
|
### Detailed Image Descriptions
|
|
|
|
```python
|
|
from markitdown import MarkItDown
|
|
from openai import OpenAI
|
|
|
|
# Initialize OpenRouter client
|
|
client = OpenAI(
|
|
api_key="your-openrouter-api-key",
|
|
base_url="https://openrouter.ai/api/v1"
|
|
)
|
|
|
|
# Scientific diagram analysis
|
|
scientific_prompt = """
|
|
Analyze this scientific figure. Describe:
|
|
- Type of visualization (graph, microscopy, diagram, etc.)
|
|
- Key data points and trends
|
|
- Axes, labels, and legends
|
|
- Scientific significance
|
|
Be technical and precise.
|
|
"""
|
|
|
|
md = MarkItDown(
|
|
llm_client=client,
|
|
llm_model="anthropic/claude-sonnet-4.5", # recommended for scientific vision
|
|
llm_prompt=scientific_prompt
|
|
)
|
|
|
|
# Convert paper with figures
|
|
result = md.convert("paper_with_figures.pdf")
|
|
print(result.text_content)
|
|
```
|
|
|
|
### Different Prompts for Different Files
|
|
|
|
```python
|
|
from markitdown import MarkItDown
|
|
from openai import OpenAI
|
|
|
|
# Initialize OpenRouter client
|
|
client = OpenAI(
|
|
api_key="your-openrouter-api-key",
|
|
base_url="https://openrouter.ai/api/v1"
|
|
)
|
|
|
|
# Scientific papers - use Claude for technical analysis
|
|
scientific_md = MarkItDown(
|
|
llm_client=client,
|
|
llm_model="anthropic/claude-sonnet-4.5",
|
|
llm_prompt="Describe scientific figures with technical precision"
|
|
)
|
|
|
|
# Presentations - use GPT-4o for visual understanding
|
|
presentation_md = MarkItDown(
|
|
llm_client=client,
|
|
llm_model="anthropic/claude-sonnet-4.5",
|
|
llm_prompt="Summarize slide content and key visual elements"
|
|
)
|
|
|
|
# Use appropriate instance for each file
|
|
paper_result = scientific_md.convert("research.pdf")
|
|
slides_result = presentation_md.convert("talk.pptx")
|
|
```
|
|
|
|
## Batch Processing
|
|
|
|
### Process Multiple Files
|
|
|
|
```python
|
|
from markitdown import MarkItDown
|
|
from pathlib import Path
|
|
|
|
md = MarkItDown()
|
|
|
|
files_to_convert = [
|
|
"paper1.pdf",
|
|
"data.xlsx",
|
|
"presentation.pptx",
|
|
"notes.docx"
|
|
]
|
|
|
|
for file in files_to_convert:
|
|
try:
|
|
result = md.convert(file)
|
|
output = Path(file).stem + ".md"
|
|
|
|
with open(output, "w") as f:
|
|
f.write(result.text_content)
|
|
|
|
print(f"✓ {file} -> {output}")
|
|
except Exception as e:
|
|
print(f"✗ Error converting {file}: {e}")
|
|
```
|
|
|
|
### Parallel Processing
|
|
|
|
```python
|
|
from markitdown import MarkItDown
|
|
from pathlib import Path
|
|
from concurrent.futures import ThreadPoolExecutor
|
|
|
|
def convert_file(filepath):
|
|
md = MarkItDown()
|
|
result = md.convert(filepath)
|
|
|
|
output = Path(filepath).stem + ".md"
|
|
with open(output, "w") as f:
|
|
f.write(result.text_content)
|
|
|
|
return filepath, output
|
|
|
|
files = list(Path("documents/").glob("*.pdf"))
|
|
|
|
with ThreadPoolExecutor(max_workers=4) as executor:
|
|
results = executor.map(convert_file, [str(f) for f in files])
|
|
|
|
for input_file, output_file in results:
|
|
print(f"Converted: {input_file} -> {output_file}")
|
|
```
|
|
|
|
## Integration Examples
|
|
|
|
### Literature Review Pipeline
|
|
|
|
```python
|
|
from markitdown import MarkItDown
|
|
from pathlib import Path
|
|
import json
|
|
|
|
md = MarkItDown()
|
|
|
|
# Convert papers and create metadata
|
|
papers_dir = Path("literature/")
|
|
output_dir = Path("literature_markdown/")
|
|
output_dir.mkdir(exist_ok=True)
|
|
|
|
catalog = []
|
|
|
|
for paper in papers_dir.glob("*.pdf"):
|
|
result = md.convert(str(paper))
|
|
|
|
# Save Markdown
|
|
md_file = output_dir / f"{paper.stem}.md"
|
|
md_file.write_text(result.text_content)
|
|
|
|
# Store metadata
|
|
catalog.append({
|
|
"title": result.title or paper.stem,
|
|
"source": paper.name,
|
|
"markdown": str(md_file),
|
|
"word_count": len(result.text_content.split())
|
|
})
|
|
|
|
# Save catalog
|
|
with open(output_dir / "catalog.json", "w") as f:
|
|
json.dump(catalog, f, indent=2)
|
|
```
|
|
|
|
### Data Extraction Pipeline
|
|
|
|
```python
|
|
from markitdown import MarkItDown
|
|
import re
|
|
|
|
md = MarkItDown()
|
|
|
|
# Convert Excel data to Markdown
|
|
result = md.convert("experimental_results.xlsx")
|
|
|
|
# Extract tables (Markdown tables start with |)
|
|
tables = []
|
|
current_table = []
|
|
in_table = False
|
|
|
|
for line in result.text_content.split('\n'):
|
|
if line.strip().startswith('|'):
|
|
in_table = True
|
|
current_table.append(line)
|
|
elif in_table:
|
|
if current_table:
|
|
tables.append('\n'.join(current_table))
|
|
current_table = []
|
|
in_table = False
|
|
|
|
# Process each table
|
|
for i, table in enumerate(tables):
|
|
print(f"Table {i+1}:")
|
|
print(table)
|
|
print("\n" + "="*50 + "\n")
|
|
```
|
|
|
|
### YouTube Transcript Analysis
|
|
|
|
```python
|
|
from markitdown import MarkItDown
|
|
|
|
md = MarkItDown()
|
|
|
|
# Get transcript
|
|
video_url = "https://www.youtube.com/watch?v=VIDEO_ID"
|
|
result = md.convert(video_url)
|
|
|
|
# Save transcript
|
|
with open("lecture_transcript.md", "w") as f:
|
|
f.write(f"# Lecture Transcript\n\n")
|
|
f.write(f"**Source**: {video_url}\n\n")
|
|
f.write(result.text_content)
|
|
```
|
|
|
|
## Error Handling
|
|
|
|
### Robust Conversion
|
|
|
|
```python
|
|
from markitdown import MarkItDown
|
|
from pathlib import Path
|
|
import logging
|
|
|
|
logging.basicConfig(level=logging.INFO)
|
|
logger = logging.getLogger(__name__)
|
|
|
|
md = MarkItDown()
|
|
|
|
def safe_convert(filepath):
|
|
"""Convert file with error handling."""
|
|
try:
|
|
result = md.convert(filepath)
|
|
output = Path(filepath).stem + ".md"
|
|
|
|
with open(output, "w") as f:
|
|
f.write(result.text_content)
|
|
|
|
logger.info(f"Successfully converted {filepath}")
|
|
return True
|
|
|
|
except FileNotFoundError:
|
|
logger.error(f"File not found: {filepath}")
|
|
return False
|
|
|
|
except ValueError as e:
|
|
logger.error(f"Invalid file format for {filepath}: {e}")
|
|
return False
|
|
|
|
except Exception as e:
|
|
logger.error(f"Unexpected error converting {filepath}: {e}")
|
|
return False
|
|
|
|
# Use it
|
|
files = ["paper.pdf", "data.xlsx", "slides.pptx"]
|
|
results = [safe_convert(f) for f in files]
|
|
|
|
print(f"Successfully converted {sum(results)}/{len(files)} files")
|
|
```
|
|
|
|
## Advanced Use Cases
|
|
|
|
### Custom Metadata Extraction
|
|
|
|
```python
|
|
from markitdown import MarkItDown
|
|
import re
|
|
from datetime import datetime
|
|
|
|
md = MarkItDown()
|
|
|
|
def convert_with_metadata(filepath):
|
|
result = md.convert(filepath)
|
|
|
|
# Extract metadata from content
|
|
metadata = {
|
|
"file": filepath,
|
|
"title": result.title,
|
|
"converted_at": datetime.now().isoformat(),
|
|
"word_count": len(result.text_content.split()),
|
|
"char_count": len(result.text_content)
|
|
}
|
|
|
|
# Try to find author
|
|
author_match = re.search(r'(?:Author|By):\s*(.+?)(?:\n|$)', result.text_content)
|
|
if author_match:
|
|
metadata["author"] = author_match.group(1).strip()
|
|
|
|
# Create formatted output
|
|
output = f"""---
|
|
title: {metadata['title']}
|
|
author: {metadata.get('author', 'Unknown')}
|
|
source: {metadata['file']}
|
|
converted: {metadata['converted_at']}
|
|
words: {metadata['word_count']}
|
|
---
|
|
|
|
{result.text_content}
|
|
"""
|
|
|
|
return output, metadata
|
|
|
|
# Use it
|
|
content, meta = convert_with_metadata("paper.pdf")
|
|
print(meta)
|
|
```
|
|
|
|
### Format-Specific Processing
|
|
|
|
```python
|
|
from markitdown import MarkItDown
|
|
from pathlib import Path
|
|
|
|
md = MarkItDown()
|
|
|
|
def process_by_format(filepath):
|
|
path = Path(filepath)
|
|
result = md.convert(filepath)
|
|
|
|
if path.suffix == '.pdf':
|
|
# Add PDF-specific metadata
|
|
output = f"# PDF Document: {path.stem}\n\n"
|
|
output += result.text_content
|
|
|
|
elif path.suffix == '.xlsx':
|
|
# Add table count
|
|
table_count = result.text_content.count('|---')
|
|
output = f"# Excel Data: {path.stem}\n\n"
|
|
output += f"**Tables**: {table_count}\n\n"
|
|
output += result.text_content
|
|
|
|
elif path.suffix == '.pptx':
|
|
# Add slide count
|
|
slide_count = result.text_content.count('## Slide')
|
|
output = f"# Presentation: {path.stem}\n\n"
|
|
output += f"**Slides**: {slide_count}\n\n"
|
|
output += result.text_content
|
|
|
|
else:
|
|
output = result.text_content
|
|
|
|
return output
|
|
|
|
# Use it
|
|
content = process_by_format("presentation.pptx")
|
|
print(content)
|
|
```
|
|
|