Initial commit

This commit is contained in:
Zhongwei Li
2025-11-30 08:41:42 +08:00
commit 735685a38f
7 changed files with 998 additions and 0 deletions

View File

@@ -0,0 +1,311 @@
#!/usr/bin/env python3
"""
Markdown Optimizer for LLM Consumption
Optimizes markdown files by:
- Adding YAML front-matter with metadata
- Creating TOC in front-matter
- Normalizing heading hierarchy
- Removing redundant content and noise
- Converting verbose prose to structured formats
- Identifying diagram opportunities
- Calculating token estimates
"""
import re
import sys
from pathlib import Path
from typing import List, Dict, Tuple
from collections import Counter
class MarkdownOptimizer:
    """Rewrite a markdown document into a compact, metadata-rich form.

    The optimizer normalizes the heading hierarchy, strips visual noise
    (horizontal rules, runs of blank lines), and prepends a generated YAML
    front-matter block holding a title, a rough token estimate, key
    concepts, a table of contents and diagram suggestions.

    Lines inside fenced code blocks and inside a leading YAML front-matter
    block are never interpreted as markdown structure and are never
    rewritten by the cleaning passes.
    """

    # ATX heading: one to six '#', whitespace, then the heading text.
    _HEADING_RE = re.compile(r'^(#{1,6})\s+(.+)$')
    # Opening code fence: 3+ backticks (the info string may not contain a
    # backtick, which rules out inline ```code``` spans) or 3+ tildes.
    _FENCE_OPEN_RE = re.compile(r'^\s*(`{3,}(?!.*`)|~{3,})')
    # Closing code fence: a bare run of fence characters.
    _FENCE_CLOSE_RE = re.compile(r'^\s*(`{3,}|~{3,})\s*$')
    # Horizontal rules made of dashes / asterisks.
    _DASH_RULE_RE = re.compile(r'^\s*---+\s*$')
    _STAR_RULE_RE = re.compile(r'^\s*\*\*\*+\s*$')

    # Diagram type -> phrases suggesting it; earlier entries take priority.
    _DIAGRAM_INDICATORS = (
        ('flowchart', (
            'step 1', 'step 2', 'first', 'then', 'next', 'finally',
            'process:', 'workflow:', 'procedure:',
        )),
        ('graph', (
            'depends on', 'related to', 'connects to', 'inherits from',
            'composed of', 'hierarchy', 'relationship between',
        )),
        ('architecture', (
            'architecture', 'component', 'system design', 'structure',
            'module', 'layer', 'interface',
        )),
    )

    def __init__(self, content: str, source_path: str = ""):
        """Store the document text and the path it came from.

        Args:
            content: Full markdown text.
            source_path: Optional path of the source file; its stem is the
                fallback title when the document has no H1.
        """
        self.original_content = content
        self.source_path = source_path
        self.lines = content.split('\n')
        self.headings = []
        self.metadata = {}

    @staticmethod
    def _frontmatter_end(lines: List[str]) -> int:
        """Return the index of the first line after a leading front-matter.

        Front-matter is only recognized when the very first line is ``---``
        and a later line closes it with another ``---``. Returns 0 when the
        document has no front-matter.
        """
        if lines and lines[0].strip() == '---':
            for index in range(1, len(lines)):
                if lines[index].strip() == '---':
                    return index + 1
        return 0

    @classmethod
    def _body_line_flags(cls, lines: List[str]) -> List[bool]:
        """Flag which lines are ordinary markdown body text.

        A line is flagged ``False`` when it belongs to the leading
        front-matter block or to a fenced code block (fence delimiters
        included). An unclosed fence extends to the end of the document.
        """
        flags = [True] * len(lines)
        body_start = cls._frontmatter_end(lines)
        for index in range(body_start):
            flags[index] = False

        open_fence = None  # (fence character, fence length) while inside code
        for index in range(body_start, len(lines)):
            line = lines[index]
            if open_fence is None:
                opener = cls._FENCE_OPEN_RE.match(line)
                if opener:
                    run = opener.group(1)
                    open_fence = (run[0], len(run))
                    flags[index] = False
                continue
            flags[index] = False
            closer = cls._FENCE_CLOSE_RE.match(line)
            # A fence is closed only by the same character, at least as long.
            if (closer and closer.group(1)[0] == open_fence[0]
                    and len(closer.group(1)) >= open_fence[1]):
                open_fence = None
        return flags

    @staticmethod
    def _yaml_escape(text: str) -> str:
        """Escape *text* for use inside a double-quoted YAML scalar."""
        return text.replace('\\', '\\\\').replace('"', '\\"')

    def extract_headings(self) -> List[Dict]:
        """Extract all ATX headings with their level, text and line index.

        Lines inside fenced code blocks or the leading front-matter are
        skipped, so shell/Python/YAML comments are not mistaken for headings.

        Returns:
            A list of ``{'level', 'text', 'line'}`` dicts in document order;
            ``line`` is a 0-based index into ``self.lines``.
        """
        headings = []
        flags = self._body_line_flags(self.lines)
        for index, (line, is_body) in enumerate(zip(self.lines, flags)):
            if not is_body:
                continue
            match = self._HEADING_RE.match(line)
            if match:
                headings.append({
                    'level': len(match.group(1)),
                    'text': match.group(2).strip(),
                    'line': index,
                })
        return headings

    def normalize_heading_hierarchy(self) -> str:
        """Return the document with skipped heading levels closed up.

        Each heading is placed at most one level below its nearest
        enclosing heading. Headings that were siblings in the input (same
        original level under the same parent) stay siblings in the output.
        Every heading line is re-emitted as ``'#' * level + ' ' + text``.
        """
        content = self.original_content
        headings = self.extract_headings()
        if not headings:
            return content

        lines = content.split('\n')
        # Stack of (original_level, normalized_level) for enclosing headings.
        ancestors = []
        for heading in headings:
            level = heading['level']
            # Anything at the same or a deeper original level is not a parent.
            while ancestors and ancestors[-1][0] >= level:
                ancestors.pop()
            # With no enclosing heading the document root counts as level 1,
            # so a document that opens with '##' is left as it is.
            parent_level = ancestors[-1][1] if ancestors else 1
            new_level = min(level, parent_level + 1)
            ancestors.append((level, new_level))

            line_num = heading['line']
            match = self._HEADING_RE.match(lines[line_num])
            if match:
                lines[line_num] = '#' * new_level + ' ' + match.group(2)
        return '\n'.join(lines)

    def generate_toc(self) -> List[Dict]:
        """Generate table-of-contents entries from the headings.

        Returns:
            A list of ``{'level', 'text', 'anchor'}`` dicts, where the
            anchor is the lower-cased text with punctuation removed and
            runs of whitespace/hyphens collapsed to a single hyphen.
        """
        toc = []
        for heading in self.extract_headings():
            anchor = re.sub(r'[^\w\s-]', '', heading['text'].lower())
            anchor = re.sub(r'[-\s]+', '-', anchor)
            toc.append({
                'level': heading['level'],
                'text': heading['text'],
                'anchor': anchor,
            })
        return toc

    def extract_key_concepts(self) -> List[str]:
        """Return up to ten of the most frequent capitalized terms.

        Markdown punctuation is stripped first. Candidates are capitalized
        or CamelCase words plus all-caps acronyms of two or more letters.
        """
        text = re.sub(r'[#*`_\[\]()]', '', self.original_content)
        words = re.findall(r'\b[A-Z][a-z]+(?:[A-Z][a-z]+)*\b', text)
        words += re.findall(r'\b[A-Z]{2,}\b', text)
        return [word for word, _ in Counter(words).most_common(10)]

    def estimate_tokens(self, text: str) -> int:
        """Rough token estimation (1 token ≈ 4 characters)."""
        return len(text) // 4

    def identify_diagram_opportunities(self) -> List[Dict]:
        """Identify sections that could benefit from Mermaid diagrams.

        A section runs from a heading up to the next heading (or the end of
        the document). Its lower-cased text is checked against the
        indicator phrases in priority order: flowchart, graph, architecture.
        A phrase only counts when it starts a word, so ``then`` does not
        fire on "authentication" and ``layer`` does not fire on "player".

        Returns:
            A list of ``{'heading', 'type', 'line'}`` dicts, one per
            section that matched.
        """
        patterns = [
            (kind, re.compile(
                r'(?<!\w)(?:' + '|'.join(map(re.escape, phrases)) + ')'))
            for kind, phrases in self._DIAGRAM_INDICATORS
        ]

        headings = self.extract_headings()
        # Each section ends where the following heading starts.
        section_ends = [h['line'] for h in headings[1:]] + [len(self.lines)]

        opportunities = []
        for heading, section_end in zip(headings, section_ends):
            section_start = heading['line']
            section_text = '\n'.join(
                self.lines[section_start:section_end]).lower()
            for kind, pattern in patterns:
                if pattern.search(section_text):
                    opportunities.append({
                        'heading': heading['text'],
                        'type': kind,
                        'line': section_start,
                    })
                    break
        return opportunities

    def remove_noise(self, content: str) -> str:
        """Remove horizontal rules and collapse runs of blank lines.

        Front-matter (only when it opens the document) and fenced code
        blocks are copied through untouched. A dash rule directly under a
        paragraph line is kept, because that is a setext heading underline
        rather than a horizontal rule.

        Args:
            content: Markdown text to clean.

        Returns:
            The cleaned text.
        """
        lines = content.split('\n')
        cleaned = []
        prev_is_paragraph = False  # previous line was non-blank body text
        for line, is_body in zip(lines, self._body_line_flags(lines)):
            if not is_body:
                cleaned.append(line)
                prev_is_paragraph = False
                continue

            if not line.strip():
                # Keep a blank line only if the previous kept line is not blank.
                if not (cleaned and not cleaned[-1].strip()):
                    cleaned.append(line)
                prev_is_paragraph = False
                continue

            if self._STAR_RULE_RE.match(line):
                prev_is_paragraph = False
                continue

            if self._DASH_RULE_RE.match(line):
                if prev_is_paragraph:
                    cleaned.append(line)  # setext underline: keep the heading
                prev_is_paragraph = False
                continue

            cleaned.append(line)
            # An ATX heading cannot be continued by a setext underline.
            prev_is_paragraph = self._HEADING_RE.match(line) is None
        return '\n'.join(cleaned)

    def generate_frontmatter(self) -> str:
        """Generate the YAML front-matter block for the current content.

        The block holds the title (first H1, else the source file stem,
        else ``Untitled``), the token estimate, an ``optimized_for_llm``
        flag, up to five key concepts, the table of contents and any
        diagram suggestions. Quoted values have ``"`` and ``\\`` escaped.
        """
        headings = self.extract_headings()
        toc = self.generate_toc()
        concepts = self.extract_key_concepts()
        diagrams = self.identify_diagram_opportunities()

        fallback = Path(self.source_path).stem if self.source_path else "Untitled"
        title = next((h['text'] for h in headings if h['level'] == 1), fallback)

        fm_lines = ['---']
        fm_lines.append(f'title: "{self._yaml_escape(title)}"')
        fm_lines.append(f'tokens: {self.estimate_tokens(self.original_content)}')
        fm_lines.append('optimized_for_llm: true')

        if concepts:
            fm_lines.append('concepts:')
            fm_lines.extend(f' - {concept}' for concept in concepts[:5])

        if toc:
            fm_lines.append('toc:')
            # NOTE(review): entries are indented by heading level for
            # readability and left unquoted; a strict YAML parser may not
            # read this as a nested list - confirm whether any consumer
            # parses this block.
            for item in toc:
                indent = ' ' * (item['level'] - 1)
                fm_lines.append(f'{indent}- {item["text"]}')

        if diagrams:
            fm_lines.append('suggested_diagrams:')
            for diag in diagrams:
                section = self._yaml_escape(diag["heading"])
                fm_lines.append(f' - section: "{section}"')
                # 'type' must line up with 'section' to stay in the same mapping.
                fm_lines.append(f'   type: {diag["type"]}')

        fm_lines.append('---')
        return '\n'.join(fm_lines)

    def optimize(self) -> str:
        """Run the full optimization pipeline and return the new document.

        Steps: normalize headings, remove noise, drop any existing leading
        front-matter, then prepend freshly generated front-matter. The
        instance's ``original_content`` and ``lines`` are updated to the
        cleaned body so the metadata describes the final text.
        """
        content = self.normalize_heading_hierarchy()
        content = self.remove_noise(content)

        # Drop existing front-matter by whole delimiter lines, so a '---'
        # inside the text (e.g. a table separator row) is never a boundary.
        body_lines = content.split('\n')
        body_start = self._frontmatter_end(body_lines)
        if body_start:
            content = '\n'.join(body_lines[body_start:]).lstrip('\n')

        self.original_content = content
        self.lines = content.split('\n')
        return self.generate_frontmatter() + '\n\n' + content
def main():
    """Command-line entry point: optimize one markdown file.

    Usage: ``optimize_markdown.py <input_file> [output_file]``.

    With an output file, the optimized text is written there and token
    statistics are printed; otherwise the optimized text goes to stdout.
    Exits with status 1 when no input file is given, and exits with an
    error message when the input cannot be read or the output cannot be
    written.
    """
    if len(sys.argv) < 2:
        print("Usage: optimize_markdown.py <input_file> [output_file]")
        print("\nOptimizes markdown files for LLM consumption.")
        print("If output_file is not specified, prints to stdout.")
        sys.exit(1)

    input_path = sys.argv[1]
    output_path = sys.argv[2] if len(sys.argv) > 2 else None

    # Read input; report I/O and decoding problems instead of a traceback.
    try:
        with open(input_path, 'r', encoding='utf-8') as f:
            content = f.read()
    except (OSError, UnicodeDecodeError) as exc:
        sys.exit(f"Error: cannot read {input_path}: {exc}")

    optimizer = MarkdownOptimizer(content, input_path)
    optimized = optimizer.optimize()

    if not output_path:
        print(optimized)
        return

    try:
        with open(output_path, 'w', encoding='utf-8') as f:
            f.write(optimized)
    except OSError as exc:
        sys.exit(f"Error: cannot write {output_path}: {exc}")
    print(f"✅ Optimized markdown written to: {output_path}")

    # Print stats
    original_tokens = optimizer.estimate_tokens(content)
    new_tokens = optimizer.estimate_tokens(optimized)
    print("\n📊 Statistics:")
    print(f" Original: ~{original_tokens:,} tokens")
    print(f" Optimized: ~{new_tokens:,} tokens")
    print(f" Change: {new_tokens - original_tokens:+,} tokens")


if __name__ == '__main__':
    main()