#!/usr/bin/env python3
"""
Markdown Optimizer for LLM Consumption

Optimizes markdown files by:
- Adding YAML front-matter with metadata
- Creating a TOC in the front-matter
- Normalizing heading hierarchy
- Removing redundant content and noise
- Converting verbose prose to structured formats
- Identifying diagram opportunities
- Calculating token estimates
"""

import re
import sys
from pathlib import Path
from typing import List, Dict
from collections import Counter


class MarkdownOptimizer:
    def __init__(self, content: str, source_path: str = ""):
        self.original_content = content
        self.source_path = source_path
        self.lines = content.split('\n')
        self.headings = []
        self.metadata = {}

    def extract_headings(self) -> List[Dict]:
        """Extract all headings with their levels and content."""
        headings = []
        for i, line in enumerate(self.lines):
            match = re.match(r'^(#{1,6})\s+(.+)$', line)
            if match:
                level = len(match.group(1))
                text = match.group(2).strip()
                headings.append({
                    'level': level,
                    'text': text,
                    'line': i
                })
        return headings
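
    # Illustrative example of extract_headings() output (not from a real run):
    # a document with "# Intro" on line 0 and "## Setup" on line 4 yields
    # [{'level': 1, 'text': 'Intro', 'line': 0},
    #  {'level': 2, 'text': 'Setup', 'line': 4}].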

    def normalize_heading_hierarchy(self) -> str:
        """Ensure logical heading progression (no skipped levels)."""
        content = self.original_content
        headings = self.extract_headings()

        if not headings:
            return content

        # Start from H1
        expected_level = 1
        adjustments = {}

        for heading in headings:
            current_level = heading['level']

            # If we skip levels, normalize
            if current_level > expected_level + 1:
                adjustments[heading['line']] = expected_level + 1
                expected_level = expected_level + 1
            else:
                adjustments[heading['line']] = current_level
                expected_level = current_level

        # Apply adjustments
        lines = content.split('\n')
        for line_num, new_level in adjustments.items():
            old_line = lines[line_num]
            match = re.match(r'^(#{1,6})\s+(.+)$', old_line)
            if match:
                lines[line_num] = '#' * new_level + ' ' + match.group(2)

        return '\n'.join(lines)
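
    # Worked example (illustrative): a heading sequence H1 -> H4 -> H2 comes
    # out as H1 -> H2 -> H2, because a jump deeper than one level is pulled up
    # to expected_level + 1, while jumps back toward H1 are left alone.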

    def generate_toc(self) -> List[Dict]:
        """Generate table of contents from headings."""
        headings = self.extract_headings()
        toc = []

        for heading in headings:
            # Create anchor-style reference
            anchor = heading['text'].lower()
            anchor = re.sub(r'[^\w\s-]', '', anchor)
            anchor = re.sub(r'[-\s]+', '-', anchor)

            toc.append({
                'level': heading['level'],
                'text': heading['text'],
                'anchor': anchor
            })

        return toc
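
    # Anchor example (illustrative): "Step 1: Setup & Install" lowercases to
    # "step 1: setup & install", the first sub drops the ':' and '&', and the
    # second collapses the whitespace runs, giving "step-1-setup-install".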

    def extract_key_concepts(self) -> List[str]:
        """Extract key concepts/topics from the document."""
        # Remove markdown syntax and extract meaningful words
        text = re.sub(r'[#*`_\[\]()]', '', self.original_content)
        words = re.findall(r'\b[A-Z][a-z]+(?:[A-Z][a-z]+)*\b', text)  # Capitalized/CamelCase words
        words += re.findall(r'\b[A-Z]{2,}\b', text)  # ACRONYMS

        # Count and return top concepts
        word_counts = Counter(words)
        return [word for word, _ in word_counts.most_common(10)]
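
    # Illustrative behavior: in a document mentioning "API" three times and
    # "MarkdownOptimizer" twice, extract_key_concepts() ranks 'API' ahead of
    # 'MarkdownOptimizer'. Note the first regex also matches ordinary
    # capitalized words such as sentence starters, which can dilute the list.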

    def estimate_tokens(self, text: str) -> int:
        """Rough token estimation (1 token ≈ 4 characters)."""
        return len(text) // 4
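
    # Worked example: a 2,000-character document estimates to 2000 // 4 = 500
    # tokens. This is a crude heuristic; actual counts vary by tokenizer.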

    def identify_diagram_opportunities(self) -> List[Dict]:
        """Identify sections that could benefit from Mermaid diagrams."""
        opportunities = []

        # Process flow indicators
        process_indicators = [
            'step 1', 'step 2', 'first', 'then', 'next', 'finally',
            'process:', 'workflow:', 'procedure:'
        ]

        # Relationship indicators
        relationship_indicators = [
            'depends on', 'related to', 'connects to', 'inherits from',
            'composed of', 'hierarchy', 'relationship between'
        ]

        # Architecture/structure indicators
        architecture_indicators = [
            'architecture', 'component', 'system design', 'structure',
            'module', 'layer', 'interface'
        ]

        headings = self.extract_headings()
        for heading in headings:
            section_start = heading['line']
            # Find next heading or end of document
            next_heading_line = None
            for next_h in headings:
                if next_h['line'] > section_start:
                    next_heading_line = next_h['line']
                    break

            section_end = next_heading_line if next_heading_line else len(self.lines)
            section_text = '\n'.join(self.lines[section_start:section_end]).lower()

            diagram_type = None
            if any(ind in section_text for ind in process_indicators):
                diagram_type = 'flowchart'
            elif any(ind in section_text for ind in relationship_indicators):
                diagram_type = 'graph'
            elif any(ind in section_text for ind in architecture_indicators):
                diagram_type = 'architecture'

            if diagram_type:
                opportunities.append({
                    'heading': heading['text'],
                    'type': diagram_type,
                    'line': section_start
                })

        return opportunities
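
    # Illustrative match: a "## Deployment" section whose body contains
    # "first ... then ... finally" is tagged {'type': 'flowchart'}. The
    # indicator groups are checked in order, so process cues win over
    # relationship and architecture cues when a section contains several.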

    def remove_noise(self, content: str) -> str:
        """Remove common noise patterns in markdown."""
        lines = content.split('\n')
        cleaned = []

        # Patterns to remove
        noise_patterns = [
            r'^\s*---+\s*$',     # Horizontal rules (unless in front-matter)
            r'^\s*\*\*\*+\s*$',  # Alternative horizontal rules
        ]

        in_frontmatter = False

        for i, line in enumerate(lines):
            # Track front-matter boundaries: only a '---' on the first line
            # opens front-matter, and the next '---' closes it. Any other
            # '---' is a horizontal rule and falls through to noise removal.
            if line.strip() == '---' and (i == 0 or in_frontmatter):
                in_frontmatter = not in_frontmatter
                cleaned.append(line)
                continue

            # Skip noise patterns (but not in front-matter)
            if not in_frontmatter:
                is_noise = any(re.match(pattern, line) for pattern in noise_patterns)
                if is_noise:
                    continue

            # Remove excessive empty lines
            if not line.strip():
                if cleaned and not cleaned[-1].strip():
                    continue  # Skip consecutive empty lines

            cleaned.append(line)

        return '\n'.join(cleaned)
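
    # Example effect (illustrative): '***' and mid-document '---' divider
    # lines are dropped, runs of blank lines collapse to a single blank, and
    # the two '---' lines fencing YAML front-matter at the top are preserved.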

    def generate_frontmatter(self) -> str:
        """Generate YAML front-matter with metadata."""
        headings = self.extract_headings()
        toc = self.generate_toc()
        concepts = self.extract_key_concepts()
        diagrams = self.identify_diagram_opportunities()

        # Extract title (first H1 or filename)
        title = next((h['text'] for h in headings if h['level'] == 1),
                     Path(self.source_path).stem if self.source_path else "Untitled")

        # Build front-matter
        fm_lines = ['---']
        fm_lines.append(f'title: "{title}"')

        # Token estimate
        token_count = self.estimate_tokens(self.original_content)
        fm_lines.append(f'tokens: {token_count}')

        # Optimized flag
        fm_lines.append('optimized_for_llm: true')

        # Key concepts
        if concepts:
            fm_lines.append('concepts:')
            for concept in concepts[:5]:  # Top 5
                fm_lines.append(f'  - {concept}')

        # TOC
        if toc:
            fm_lines.append('toc:')
            for item in toc:
                indent = '  ' * (item['level'] - 1)
                fm_lines.append(f'{indent}- {item["text"]}')

        # Diagram suggestions
        if diagrams:
            fm_lines.append('suggested_diagrams:')
            for diag in diagrams:
                fm_lines.append(f'  - section: "{diag["heading"]}"')
                fm_lines.append(f'    type: {diag["type"]}')

        fm_lines.append('---')

        return '\n'.join(fm_lines)
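
    # Shape of the generated block (values are illustrative):
    #
    #   ---
    #   title: "My Doc"
    #   tokens: 1234
    #   optimized_for_llm: true
    #   concepts:
    #     - API
    #   toc:
    #   - My Doc
    #     - Setup
    #   suggested_diagrams:
    #     - section: "Pipeline"
    #       type: flowchart
    #   ---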

    def optimize(self) -> str:
        """Run full optimization pipeline."""
        # 1. Normalize heading hierarchy
        content = self.normalize_heading_hierarchy()

        # 2. Remove noise
        content = self.remove_noise(content)

        # 3. Remove existing front-matter if present
        if content.startswith('---'):
            parts = content.split('---', 2)
            if len(parts) >= 3:
                content = parts[2].lstrip('\n')

        # 4. Generate new front-matter
        self.original_content = content  # Update for metadata generation
        self.lines = content.split('\n')
        frontmatter = self.generate_frontmatter()

        # 5. Combine
        optimized = frontmatter + '\n\n' + content

        return optimized
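

# Minimal usage sketch (the file name below is illustrative):
#
#     text = Path("README.md").read_text(encoding="utf-8")
#     print(MarkdownOptimizer(text, "README.md").optimize())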


def main():
    if len(sys.argv) < 2:
        print("Usage: optimize_markdown.py <input_file> [output_file]")
        print("\nOptimizes markdown files for LLM consumption.")
        print("If output_file is not specified, prints to stdout.")
        sys.exit(1)

    input_path = sys.argv[1]
    output_path = sys.argv[2] if len(sys.argv) > 2 else None

    # Read input
    with open(input_path, 'r', encoding='utf-8') as f:
        content = f.read()

    # Optimize
    optimizer = MarkdownOptimizer(content, input_path)
    optimized = optimizer.optimize()

    # Output
    if output_path:
        with open(output_path, 'w', encoding='utf-8') as f:
            f.write(optimized)
        print(f"✅ Optimized markdown written to: {output_path}")

        # Print stats
        original_tokens = optimizer.estimate_tokens(content)
        new_tokens = optimizer.estimate_tokens(optimized)
        print("\n📊 Statistics:")
        print(f"   Original:  ~{original_tokens:,} tokens")
        print(f"   Optimized: ~{new_tokens:,} tokens")
        print(f"   Change:    {new_tokens - original_tokens:+,} tokens")
    else:
        print(optimized)


if __name__ == '__main__':
    main()