Initial commit

skills/markdown-optimizer/scripts/optimize_markdown.py (executable file, 311 lines)
@@ -0,0 +1,311 @@
#!/usr/bin/env python3
"""
Markdown Optimizer for LLM Consumption

Optimizes markdown files by:
- Adding YAML front-matter with metadata
- Creating a TOC in the front-matter
- Normalizing heading hierarchy
- Removing redundant content and noise
- Converting verbose prose to structured formats
- Identifying diagram opportunities
- Calculating token estimates
"""
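
# Typical invocation (mirrors the usage text in main() below); the file
# names are illustrative:
#   python3 optimize_markdown.py notes.md notes.optimized.md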

import re
import sys
from pathlib import Path
from typing import List, Dict
from collections import Counter


class MarkdownOptimizer:
    def __init__(self, content: str, source_path: str = ""):
        self.original_content = content
        self.source_path = source_path
        self.lines = content.split('\n')
        self.headings = []
        self.metadata = {}

    def extract_headings(self) -> List[Dict]:
        """Extract all headings with their levels and content."""
        headings = []
        for i, line in enumerate(self.lines):
            match = re.match(r'^(#{1,6})\s+(.+)$', line)
            if match:
                level = len(match.group(1))
                text = match.group(2).strip()
                headings.append({
                    'level': level,
                    'text': text,
                    'line': i
                })
        return headings

    def normalize_heading_hierarchy(self) -> str:
        """Ensure logical heading progression (no skipped levels)."""
        content = self.original_content
        headings = self.extract_headings()

        if not headings:
            return content
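
        # Example: heading levels [1, 3] are rewritten to [1, 2], since the
        # H3 skipped a level; an H2 directly after an H1 is left unchanged.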

        # Start from H1
        expected_level = 1
        adjustments = {}

        for heading in headings:
            current_level = heading['level']

            # If this heading skips levels, clamp it to one level below
            # the previous heading
            if current_level > expected_level + 1:
                adjustments[heading['line']] = expected_level + 1
                expected_level = expected_level + 1
            else:
                adjustments[heading['line']] = current_level
                expected_level = current_level

        # Apply adjustments
        lines = content.split('\n')
        for line_num, new_level in adjustments.items():
            old_line = lines[line_num]
            match = re.match(r'^(#{1,6})\s+(.+)$', old_line)
            if match:
                lines[line_num] = '#' * new_level + ' ' + match.group(2)

        return '\n'.join(lines)

    def generate_toc(self) -> List[Dict]:
        """Generate table of contents from headings."""
        headings = self.extract_headings()
        toc = []

        for heading in headings:
            # Create an anchor-style reference (roughly GitHub's slug rules):
            # lowercase, drop punctuation, collapse spaces/hyphens into '-'
            anchor = heading['text'].lower()
            anchor = re.sub(r'[^\w\s-]', '', anchor)
            anchor = re.sub(r'[-\s]+', '-', anchor)
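            # e.g. "Step 1: Setup & Install" -> "step-1-setup-install"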

            toc.append({
                'level': heading['level'],
                'text': heading['text'],
                'anchor': anchor
            })

        return toc

    def extract_key_concepts(self) -> List[str]:
        """Extract key concepts/topics from the document."""
        # Remove markdown syntax and extract meaningful words
        text = re.sub(r'[#*`_\[\]()]', '', self.original_content)
        words = re.findall(r'\b[A-Z][a-z]+(?:[A-Z][a-z]+)*\b', text)  # Capitalized/CamelCase words
        words += re.findall(r'\b[A-Z]{2,}\b', text)  # ACRONYMS

        # Count and return top concepts
        word_counts = Counter(words)
        return [word for word, _ in word_counts.most_common(10)]

    def estimate_tokens(self, text: str) -> int:
        """Rough token estimation (1 token ≈ 4 characters)."""
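        # A common rule of thumb for English text; actual counts depend on
        # the tokenizer. e.g. a 2,000-character document -> ~500 tokens.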
        return len(text) // 4

    def identify_diagram_opportunities(self) -> List[Dict]:
        """Identify sections that could benefit from Mermaid diagrams."""
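        # Heuristic keyword matching: e.g. a section whose text contains
        # "first ... then ... finally" is flagged as a flowchart candidate.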
        opportunities = []

        # Process flow indicators
        process_indicators = [
            'step 1', 'step 2', 'first', 'then', 'next', 'finally',
            'process:', 'workflow:', 'procedure:'
        ]

        # Relationship indicators
        relationship_indicators = [
            'depends on', 'related to', 'connects to', 'inherits from',
            'composed of', 'hierarchy', 'relationship between'
        ]

        # Architecture/structure indicators
        architecture_indicators = [
            'architecture', 'component', 'system design', 'structure',
            'module', 'layer', 'interface'
        ]

        headings = self.extract_headings()
        for heading in headings:
            section_start = heading['line']
            # Find the next heading or the end of the document
            next_heading_line = None
            for next_h in headings:
                if next_h['line'] > section_start:
                    next_heading_line = next_h['line']
                    break

            section_end = next_heading_line if next_heading_line else len(self.lines)
            section_text = '\n'.join(self.lines[section_start:section_end]).lower()

            diagram_type = None
            if any(ind in section_text for ind in process_indicators):
                diagram_type = 'flowchart'
            elif any(ind in section_text for ind in relationship_indicators):
                diagram_type = 'graph'
            elif any(ind in section_text for ind in architecture_indicators):
                diagram_type = 'architecture'

            if diagram_type:
                opportunities.append({
                    'heading': heading['text'],
                    'type': diagram_type,
                    'line': section_start
                })

        return opportunities

    def remove_noise(self, content: str) -> str:
        """Remove common noise patterns in markdown."""
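        # e.g. '***' separator rules and runs of blank lines are dropped,
        # while the two '---' lines delimiting front-matter are preserved.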
        lines = content.split('\n')
        cleaned = []

        # Patterns to remove
        noise_patterns = [
            r'^\s*---+\s*$',     # Horizontal rules (unless in front-matter)
            r'^\s*\*\*\*+\s*$',  # Alternative horizontal rules
        ]

        in_frontmatter = False
        frontmatter_count = 0

        for i, line in enumerate(lines):
            # Track front-matter boundaries: front-matter must open on the
            # first line, and only its two '---' delimiters are kept; any
            # later '---' falls through to the noise check below
            if (line.strip() == '---' and frontmatter_count < 2
                    and (i == 0 or in_frontmatter)):
                frontmatter_count += 1
                in_frontmatter = not in_frontmatter
                cleaned.append(line)
                continue

            # Skip noise patterns (but not in front-matter)
            if not in_frontmatter:
                is_noise = any(re.match(pattern, line) for pattern in noise_patterns)
                if is_noise:
                    continue

            # Collapse consecutive empty lines into one
            if not line.strip():
                if cleaned and not cleaned[-1].strip():
                    continue

            cleaned.append(line)

        return '\n'.join(cleaned)

    def generate_frontmatter(self) -> str:
        """Generate YAML front-matter with metadata."""
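        # Illustrative output (all field values here are hypothetical):
        #   ---
        #   title: "My Doc"
        #   tokens: 1234
        #   optimized_for_llm: true
        #   concepts:
        #     - Markdown
        #   toc:
        #   - My Doc
        #   suggested_diagrams:
        #     - section: "Build Workflow"
        #       type: flowchart
        #   ---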
        headings = self.extract_headings()
        toc = self.generate_toc()
        concepts = self.extract_key_concepts()
        diagrams = self.identify_diagram_opportunities()

        # Extract title (first H1, else filename, else "Untitled")
        title = next((h['text'] for h in headings if h['level'] == 1),
                     Path(self.source_path).stem if self.source_path else "Untitled")

        # Build front-matter
        fm_lines = ['---']
        fm_lines.append(f'title: "{title}"')

        # Token estimate
        token_count = self.estimate_tokens(self.original_content)
        fm_lines.append(f'tokens: {token_count}')

        # Optimized flag
        fm_lines.append('optimized_for_llm: true')

        # Key concepts
        if concepts:
            fm_lines.append('concepts:')
            for concept in concepts[:5]:  # Top 5
                fm_lines.append(f'  - {concept}')

        # TOC, indented by heading level
        if toc:
            fm_lines.append('toc:')
            for item in toc:
                indent = '  ' * (item['level'] - 1)
                fm_lines.append(f'{indent}- {item["text"]}')

        # Diagram suggestions
        if diagrams:
            fm_lines.append('suggested_diagrams:')
            for diag in diagrams:
                fm_lines.append(f'  - section: "{diag["heading"]}"')
                fm_lines.append(f'    type: {diag["type"]}')

        fm_lines.append('---')

        return '\n'.join(fm_lines)

    def optimize(self) -> str:
        """Run full optimization pipeline."""
        # 1. Normalize heading hierarchy
        content = self.normalize_heading_hierarchy()

        # 2. Remove noise
        content = self.remove_noise(content)

        # 3. Remove existing front-matter if present
        if content.startswith('---'):
            parts = content.split('---', 2)
            if len(parts) >= 3:
                content = parts[2].lstrip('\n')

        # 4. Generate new front-matter
        self.original_content = content  # Update for metadata generation
        self.lines = content.split('\n')
        frontmatter = self.generate_frontmatter()

        # 5. Combine
        optimized = frontmatter + '\n\n' + content

        return optimized


def main():
    if len(sys.argv) < 2:
        print("Usage: optimize_markdown.py <input_file> [output_file]")
        print("\nOptimizes markdown files for LLM consumption.")
        print("If output_file is not specified, prints to stdout.")
        sys.exit(1)

    input_path = sys.argv[1]
    output_path = sys.argv[2] if len(sys.argv) > 2 else None

    # Read input
    with open(input_path, 'r', encoding='utf-8') as f:
        content = f.read()

    # Optimize
    optimizer = MarkdownOptimizer(content, input_path)
    optimized = optimizer.optimize()

    # Output
    if output_path:
        with open(output_path, 'w', encoding='utf-8') as f:
            f.write(optimized)
        print(f"✅ Optimized markdown written to: {output_path}")

        # Print stats
        original_tokens = optimizer.estimate_tokens(content)
        new_tokens = optimizer.estimate_tokens(optimized)
        print("\n📊 Statistics:")
        print(f"   Original:  ~{original_tokens:,} tokens")
        print(f"   Optimized: ~{new_tokens:,} tokens")
        print(f"   Change:    {new_tokens - original_tokens:+,} tokens")
    else:
        print(optimized)


if __name__ == '__main__':
    main()