# Source listing metadata (from repository browser):
#   gh-alonw0-llm-docs-optimizer/skills/llm-docs-optimizer/scripts/analyze_docs.py
#   Retrieved 2025-11-29 17:52:13 +08:00 — 344 lines, 12 KiB, Python
#!/usr/bin/env python3
"""
Documentation Analyzer for C7Score Optimization
Analyzes README and documentation files to identify:
- Snippets that are import-only or installation-only
- Potential formatting issues
- Metadata content (licensing, citations, directory structures)
- Duplicate or near-duplicate code blocks
- Missing question-answering examples
"""
import re
import sys
from pathlib import Path
from typing import List, Dict, Tuple
from collections import Counter
class CodeSnippet:
    """A fenced code block pulled out of a markdown document."""

    def __init__(self, language: str, code: str, context: str, line_num: int):
        # language: tag after the opening fence ('unknown' when absent)
        # code: block body with leading/trailing whitespace stripped
        # context: prose immediately preceding the fence
        # line_num: 1-based line number of the opening fence
        # issues: filled in later by the analysis pass
        self.language = language
        self.code = code.strip()
        self.context = context
        self.line_num = line_num
        self.issues = []

    def __repr__(self):
        line_count = len(self.code.splitlines())
        return f"CodeSnippet(lang={self.language}, lines={line_count}, line={self.line_num})"
def extract_code_snippets(content: str) -> List[CodeSnippet]:
    """Extract all fenced code blocks from markdown *content*.

    For each block, also captures up to three non-empty lines of preceding
    prose (scanning at most five lines back) as context. Snippet line
    numbers are 1-based and point at the opening fence.
    """
    all_lines = content.split('\n')
    found = []
    idx = 0
    total = len(all_lines)
    while idx < total:
        stripped = all_lines[idx].strip()
        if not stripped.startswith('```'):
            idx += 1
            continue
        # Language tag follows the opening fence; default to 'unknown'.
        lang_tag = stripped[3:].strip() or 'unknown'
        fence_line = idx
        # Context: the last 3 non-empty lines of the 5 preceding the fence.
        preceding = [all_lines[k].strip()
                     for k in range(max(0, idx - 5), idx)
                     if all_lines[k].strip()]
        context_text = ' '.join(preceding[-3:])
        # Gather the body until the closing fence (or end of file).
        idx += 1
        body = []
        while idx < total and not all_lines[idx].strip().startswith('```'):
            body.append(all_lines[idx])
            idx += 1
        found.append(CodeSnippet(lang_tag, '\n'.join(body), context_text, fence_line + 1))
        idx += 1  # skip the closing fence
    return found
def analyze_snippet(snippet: CodeSnippet) -> List[str]:
    """Analyze a single code snippet for c7score issues.

    Reads only ``snippet.code`` and ``snippet.language`` and returns a list
    of human-readable issue strings (empty when no issue is detected), each
    tagged with the c7score metric it affects.
    """
    issues = []
    code = snippet.code.strip()
    lines = [l.strip() for l in code.split('\n') if l.strip()]

    # Check 1: Import-only snippets -- every non-empty line is an import
    # (Python / Node / C# styles) and the snippet is short.
    if lines:
        import_patterns = [
            r'^import\s+',
            r'^from\s+\S+\s+import\s+',
            r'^require\s*\(',
            r'^const\s+\S+\s*=\s*require',
            r'^using\s+',
        ]
        import_count = sum(1 for line in lines if any(re.match(p, line) for p in import_patterns))
        if import_count == len(lines) and len(lines) <= 5:
            issues.append("⚠️ Import-only snippet (Metric 5: Initialization)")

    # Check 2: Installation-only snippets (1-2 lines of package-manager commands).
    install_patterns = [
        r'pip install',
        r'npm install',
        r'yarn add',
        r'cargo install',
        r'go get',
        r'gem install',
    ]
    if len(lines) <= 2 and any(any(pattern in line for pattern in install_patterns) for line in lines):
        issues.append("⚠️ Installation-only snippet (Metric 5: Initialization)")

    # Check 3: Snippet length extremes.
    if len(lines) < 3:
        issues.append("⚠️ Very short snippet (<3 lines) (Metric 3: Formatting)")
    elif len(lines) > 100:
        issues.append("⚠️ Very long snippet (>100 lines) (Metric 3: Formatting)")

    # Check 4: Language tags that describe content rather than a language.
    problematic_languages = [
        'configuration', 'config', 'cli arguments', 'arguments',
        'none', 'console', 'output', 'text', 'plaintext'
    ]
    if snippet.language.lower() in problematic_languages:
        issues.append(f"⚠️ Problematic language tag: '{snippet.language}' (Metric 3: Formatting)")

    # Check 5: Mostly bullet/numbered lines -> probably a list, not code.
    if len(lines) > 3:
        list_markers = sum(1 for line in lines if re.match(r'^\s*[-*\d.]+\s', line))
        if list_markers / len(lines) > 0.5:
            issues.append("⚠️ Appears to be a list, not code (Metric 3: Formatting)")

    # Check 6: Directory-structure / tree output.
    # BUG FIX: the original tested membership of four EMPTY strings (the
    # box-drawing glyphs were lost to an encoding error); '' is a substring
    # of every string, so the check fired for every non-empty snippet.
    # Restored to look for the usual tree-drawing characters.
    tree_chars = ('├', '└', '│', '─')
    if any(any(ch in line for ch in tree_chars) for line in lines):
        issues.append("⚠️ Directory structure detected (Metric 4: Project Metadata)")

    # Check 7: License or citation content embedded in a code block.
    # NOTE(review): these are plain substring matches ('mit' also matches
    # words like 'limit'); the length guard keeps false positives down.
    license_markers = ['license', 'copyright', 'mit', 'apache', 'gpl', 'bsd']
    citation_markers = ['@article', '@book', 'bibtex', 'doi:', 'citation']
    code_lower = code.lower()
    if any(marker in code_lower for marker in license_markers) and len(code) > 100:
        issues.append("⚠️ License content detected (Metric 4: Project Metadata)")
    if any(marker in code_lower for marker in citation_markers):
        issues.append("⚠️ Citation content detected (Metric 4: Project Metadata)")

    return issues
def find_duplicates(snippets: List[CodeSnippet]) -> List[Tuple[int, int]]:
    """Find duplicate or near-duplicate snippets.

    Two snippets are exact duplicates when their whitespace-collapsed,
    lower-cased code is identical; near-duplicates when both normalized
    forms exceed 20 chars, their lengths are within 80% of each other,
    and one contains the other. Returns index pairs (i, j) with i < j.
    """
    # PERF: hoist normalization out of the O(n^2) pair loop -- the original
    # re-ran re.sub/lower on both snippets for every single pair.
    normalized = [re.sub(r'\s+', ' ', s.code.lower()).strip() for s in snippets]

    duplicates = []
    for i, code1 in enumerate(normalized):
        for j in range(i + 1, len(normalized)):
            code2 = normalized[j]
            if code1 == code2:
                # Exact duplicate after normalization.
                duplicates.append((i, j))
            elif len(code1) > 20 and len(code2) > 20:
                # Near-duplicate: similar length AND one contains the other.
                min_len = min(len(code1), len(code2))
                max_len = max(len(code1), len(code2))
                if min_len / max_len > 0.8 and (code1 in code2 or code2 in code1):
                    duplicates.append((i, j))
    return duplicates
def generate_question_suggestions(content: str) -> List[str]:
    """Suggest questions the documentation ought to answer.

    The project name is taken from the first markdown H1 heading in
    *content*; when no heading exists, a generic placeholder is used.
    """
    heading = re.search(r'^#\s+(.+)$', content, re.MULTILINE)
    name = heading.group(1) if heading else "this library"
    return [
        f"How do I install {name}?",
        f"How do I get started with {name}?",
        f"How do I initialize/configure {name}?",
        f"How do I authenticate with {name}?",
        "What are the main features and how do I use them?",
        f"How do I handle errors in {name}?",
        "How do I perform [common operation]?",
        "What are common configuration options?",
        f"How do I integrate {name} with [common tools]?",
        f"How do I test code using {name}?",
    ]
def analyze_documentation(file_path: str) -> Dict:
    """Analyze a documentation file for c7score optimization opportunities.

    Returns a result dict with summary statistics, per-snippet issues,
    duplicate pairs and suggested questions, or ``{'error': ...}`` when
    the file does not exist.
    """
    path = Path(file_path)
    if not path.exists():
        return {"error": f"File not found: {file_path}"}

    content = path.read_text(encoding='utf-8')
    snippets = extract_code_snippets(content)

    # Analyze each snippet, keeping only those with findings.
    snippet_issues = []
    for snippet in snippets:
        issues = analyze_snippet(snippet)
        if issues:
            snippet_issues.append({
                'snippet': snippet,
                'issues': issues
            })

    duplicates = find_duplicates(snippets)

    # Language distribution across all snippets.
    language_dist = Counter(s.language for s in snippets)

    # Bucket issues by the c7score metric named in each message.
    issue_types = Counter()
    for item in snippet_issues:
        for issue in item['issues']:
            if "Metric 3" in issue:
                issue_types["Formatting (M3)"] += 1
            elif "Metric 4" in issue:
                issue_types["Metadata (M4)"] += 1
            elif "Metric 5" in issue:
                issue_types["Initialization (M5)"] += 1

    return {
        'file': file_path,
        'total_snippets': len(snippets),
        'snippets_with_issues': len(snippet_issues),
        'issue_breakdown': dict(issue_types),
        'duplicates': len(duplicates),
        'language_distribution': dict(language_dist),
        'detailed_issues': snippet_issues,
        'duplicate_pairs': duplicates,
        'question_suggestions': generate_question_suggestions(content),
        # New, backward-compatible key: expose the snippet list so that
        # consumers can resolve 'duplicate_pairs' indices to snippets.
        'snippets': snippets,
    }
def print_report(analysis: Dict):
    """Print a formatted report for an analyze_documentation() result dict."""
    if 'error' in analysis:
        print(f"{analysis['error']}")
        return

    sep = '=' * 70
    # NOTE(review): the original printed ''*70 here (the rule glyph was
    # lost to an encoding error), producing blank lines; restored as '-'.
    rule = '-' * 70

    print(f"\n{sep}")
    print(f"C7Score Documentation Analysis: {analysis['file']}")
    print(f"{sep}\n")

    print("📊 Summary Statistics")
    print(rule)
    print(f"Total code snippets: {analysis['total_snippets']}")
    print(f"Snippets with issues: {analysis['snippets_with_issues']}")
    print(f"Duplicate snippets: {analysis['duplicates']}")
    if analysis['total_snippets'] > 0:
        issue_rate = (analysis['snippets_with_issues'] / analysis['total_snippets']) * 100
        print(f"Issue rate: {issue_rate:.1f}%")

    print("\n📝 Language Distribution")
    print(rule)
    for lang, count in sorted(analysis['language_distribution'].items(), key=lambda x: x[1], reverse=True):
        print(f"  {lang}: {count}")

    if analysis['issue_breakdown']:
        print("\n⚠️ Issue Breakdown by Metric")
        print(rule)
        for issue_type, count in sorted(analysis['issue_breakdown'].items(), key=lambda x: x[1], reverse=True):
            print(f"  {issue_type}: {count}")

    if analysis['detailed_issues']:
        print("\n🔍 Detailed Issues (Showing first 10)")
        print(rule)
        for i, item in enumerate(analysis['detailed_issues'][:10], 1):
            snippet = item['snippet']
            print(f"\n{i}. Line {snippet.line_num} [{snippet.language}] ({len(snippet.code.splitlines())} lines)")
            for issue in item['issues']:
                print(f"   {issue}")
            # Show the first 2 lines of code as a preview.
            code_preview = '\n'.join(snippet.code.split('\n')[:2])
            print(f"   Preview: {code_preview[:80]}...")

    if analysis['duplicate_pairs']:
        print("\n🔄 Duplicate Snippets")
        print(rule)
        # BUG FIX: the original referenced an undefined name `snippets`
        # here, raising NameError whenever duplicates existed. Resolve
        # indices via the optional 'snippets' key when present; otherwise
        # fall back to reporting the raw indices.
        snippet_list = analysis.get('snippets')
        for i, (idx1, idx2) in enumerate(analysis['duplicate_pairs'][:5], 1):
            if snippet_list:
                print(f"{i}. Snippets at lines {snippet_list[idx1].line_num} and {snippet_list[idx2].line_num} are duplicates")
            else:
                print(f"{i}. Snippets #{idx1} and #{idx2} are duplicates")

    print("\n💡 Suggested Questions to Answer")
    print(rule)
    for i, question in enumerate(analysis['question_suggestions'], 1):
        print(f"{i}. {question}")

    print("\n✅ Recommendations")
    print(rule)
    recommendations = []
    if analysis['issue_breakdown'].get('Initialization (M5)', 0) > 0:
        recommendations.append(
            "• Combine import-only and installation-only snippets with actual usage examples"
        )
    if analysis['issue_breakdown'].get('Formatting (M3)', 0) > 0:
        recommendations.append(
            "• Fix formatting issues: use proper language tags, avoid very short/long snippets"
        )
    if analysis['issue_breakdown'].get('Metadata (M4)', 0) > 0:
        recommendations.append(
            "• Remove or relocate metadata content (licensing, citations, directory trees)"
        )
    if analysis['duplicates'] > 0:
        recommendations.append(
            f"• Remove or consolidate {analysis['duplicates']} duplicate snippets (reduces LLM score)"
        )
    if analysis['total_snippets'] < 10:
        recommendations.append(
            "• Add more comprehensive code examples answering common developer questions"
        )
    if not recommendations:
        recommendations.append("• Documentation looks good! Consider running actual c7score for detailed metrics")
    for rec in recommendations:
        print(rec)

    print(f"\n{sep}\n")
if __name__ == "__main__":
    # CLI entry point: analyze the single markdown file given as argv[1].
    if len(sys.argv) < 2:
        print("Usage: python analyze_docs.py <path-to-readme-or-doc.md>")
        sys.exit(1)
    target = sys.argv[1]
    print_report(analyze_documentation(target))