Files
gh-k-dense-ai-claude-scient…/skills/markitdown/scripts/convert_with_ai.py
2025-11-30 08:30:14 +08:00

244 lines
7.5 KiB
Python
Executable File

#!/usr/bin/env python3
"""
Convert documents to Markdown with AI-enhanced image descriptions.
This script demonstrates how to use MarkItDown with OpenRouter to generate
detailed descriptions of images in documents (PowerPoint, PDFs with images, etc.)
"""
import argparse
import os
import sys
from pathlib import Path
from markitdown import MarkItDown
from openai import OpenAI
# Predefined prompts for different use cases
PROMPTS = {
'scientific': """
Analyze this scientific image or diagram. Provide:
1. Type of visualization (graph, chart, microscopy, diagram, etc.)
2. Key data points, trends, or patterns
3. Axes labels, legends, and scales
4. Notable features or findings
5. Scientific context and significance
Be precise, technical, and detailed.
""".strip(),
'presentation': """
Describe this presentation slide image. Include:
1. Main visual elements and their arrangement
2. Key points or messages conveyed
3. Data or information presented
4. Visual hierarchy and emphasis
Keep the description clear and informative.
""".strip(),
'general': """
Describe this image in detail. Include:
1. Main subjects and objects
2. Visual composition and layout
3. Text content (if any)
4. Notable details
5. Overall context and purpose
Be comprehensive and accurate.
""".strip(),
'data_viz': """
Analyze this data visualization. Provide:
1. Type of chart/graph (bar, line, scatter, pie, etc.)
2. Variables and axes
3. Data ranges and scales
4. Key patterns, trends, or outliers
5. Statistical insights
Focus on quantitative accuracy.
""".strip(),
'medical': """
Describe this medical image. Include:
1. Type of medical imaging (X-ray, MRI, CT, microscopy, etc.)
2. Anatomical structures visible
3. Notable findings or abnormalities
4. Image quality and contrast
5. Clinical relevance
Be professional and precise.
""".strip()
}
def convert_with_ai(
input_file: Path,
output_file: Path,
api_key: str,
model: str = "anthropic/claude-sonnet-4.5",
prompt_type: str = "general",
custom_prompt: str = None
) -> bool:
"""
Convert a file to Markdown with AI image descriptions.
Args:
input_file: Path to input file
output_file: Path to output Markdown file
api_key: OpenRouter API key
model: Model name (default: anthropic/claude-sonnet-4.5)
prompt_type: Type of prompt to use
custom_prompt: Custom prompt (overrides prompt_type)
Returns:
True if successful, False otherwise
"""
try:
# Initialize OpenRouter client (OpenAI-compatible)
client = OpenAI(
api_key=api_key,
base_url="https://openrouter.ai/api/v1"
)
# Select prompt
if custom_prompt:
prompt = custom_prompt
else:
prompt = PROMPTS.get(prompt_type, PROMPTS['general'])
print(f"Using model: {model}")
print(f"Prompt type: {prompt_type if not custom_prompt else 'custom'}")
print(f"Converting: {input_file}")
# Create MarkItDown with AI support
md = MarkItDown(
llm_client=client,
llm_model=model,
llm_prompt=prompt
)
# Convert file
result = md.convert(str(input_file))
# Create output with metadata
content = f"# {result.title or input_file.stem}\n\n"
content += f"**Source**: {input_file.name}\n"
content += f"**Format**: {input_file.suffix}\n"
content += f"**AI Model**: {model}\n"
content += f"**Prompt Type**: {prompt_type if not custom_prompt else 'custom'}\n\n"
content += "---\n\n"
content += result.text_content
# Write output
output_file.parent.mkdir(parents=True, exist_ok=True)
output_file.write_text(content, encoding='utf-8')
print(f"✓ Successfully converted to: {output_file}")
return True
except Exception as e:
print(f"✗ Error: {str(e)}", file=sys.stderr)
return False
def main():
parser = argparse.ArgumentParser(
description="Convert documents to Markdown with AI-enhanced image descriptions",
formatter_class=argparse.RawDescriptionHelpFormatter,
epilog=f"""
Available prompt types:
scientific - For scientific diagrams, graphs, and charts
presentation - For presentation slides
general - General-purpose image description
data_viz - For data visualizations and charts
medical - For medical imaging
Examples:
# Convert a scientific paper
python convert_with_ai.py paper.pdf output.md --prompt-type scientific
# Convert a presentation with custom model
python convert_with_ai.py slides.pptx slides.md --model anthropic/claude-sonnet-4.5 --prompt-type presentation
# Use custom prompt with Claude Sonnet 4.5
python convert_with_ai.py diagram.png diagram.md --model anthropic/claude-sonnet-4.5 --custom-prompt "Describe this technical diagram"
# Set API key via environment variable
export OPENROUTER_API_KEY="sk-or-v1-..."
python convert_with_ai.py image.jpg image.md
Environment Variables:
OPENROUTER_API_KEY OpenRouter API key (required if not passed via --api-key)
Popular Models (use with --model):
anthropic/claude-sonnet-4.5 - Claude Sonnet 4.5 (recommended, vision support)
anthropic/claude-3.5-sonnet - Claude 3.5 Sonnet (vision support)
openai/gpt-4o - GPT-4 Omni (vision support)
openai/gpt-4-vision - GPT-4 Vision
google/gemini-pro-vision - Gemini Pro Vision
"""
)
parser.add_argument('input', type=Path, help='Input file')
parser.add_argument('output', type=Path, help='Output Markdown file')
parser.add_argument(
'--api-key', '-k',
help='OpenRouter API key (or set OPENROUTER_API_KEY env var)'
)
parser.add_argument(
'--model', '-m',
default='anthropic/claude-sonnet-4.5',
help='Model to use via OpenRouter (default: anthropic/claude-sonnet-4.5)'
)
parser.add_argument(
'--prompt-type', '-t',
choices=list(PROMPTS.keys()),
default='general',
help='Type of prompt to use (default: general)'
)
parser.add_argument(
'--custom-prompt', '-p',
help='Custom prompt (overrides --prompt-type)'
)
parser.add_argument(
'--list-prompts', '-l',
action='store_true',
help='List available prompt types and exit'
)
args = parser.parse_args()
# List prompts and exit
if args.list_prompts:
print("Available prompt types:\n")
for name, prompt in PROMPTS.items():
print(f"[{name}]")
print(prompt)
print("\n" + "="*60 + "\n")
sys.exit(0)
# Get API key
api_key = args.api_key or os.environ.get('OPENROUTER_API_KEY')
if not api_key:
print("Error: OpenRouter API key required. Set OPENROUTER_API_KEY environment variable or use --api-key")
print("Get your API key at: https://openrouter.ai/keys")
sys.exit(1)
# Validate input file
if not args.input.exists():
print(f"Error: Input file '{args.input}' does not exist")
sys.exit(1)
# Convert file
success = convert_with_ai(
input_file=args.input,
output_file=args.output,
api_key=api_key,
model=args.model,
prompt_type=args.prompt_type,
custom_prompt=args.custom_prompt
)
sys.exit(0 if success else 1)
if __name__ == '__main__':
main()