Initial commit

2025-11-29 17:52:13 +08:00
commit 4b20ee9596
10 changed files with 3079 additions and 0 deletions
--- a/skills/llm-docs-optimizer/scripts/analyze_docs.py
+++ b/skills/llm-docs-optimizer/scripts/analyze_docs.py
@@ -0,0 +1,343 @@
+#!/usr/bin/env python3
+"""
+Documentation Analyzer for C7Score Optimization
+
+Analyzes README and documentation files to identify:
+- Snippets that are import-only or installation-only
+- Potential formatting issues
+- Metadata content (licensing, citations, directory structures)
+- Duplicate or near-duplicate code blocks
+- Missing question-answering examples
+"""
+
+import re
+import sys
+from pathlib import Path
+from typing import List, Dict, Tuple
+from collections import Counter
+
+
+class CodeSnippet:
+    """One code block taken from a document, plus bookkeeping about it.
+
+    Instance fields:
+        language: language label supplied for the block.
+        code: the block's text with leading/trailing whitespace stripped.
+        context: text that came before the code block.
+        line_num: line number supplied for the block.
+        issues: list that starts out empty.
+    """
+
+    def __init__(self, language: str, code: str, context: str, line_num: int):
+        self.language = language
+        # Surrounding whitespace is dropped once here so that every consumer
+        # sees the same trimmed text.
+        self.code = code.strip()
+        self.context = context  # Text before the code block
+        self.line_num = line_num
+        self.issues = []
+
+    def __repr__(self):
+        # Summarise by size rather than dumping the whole code body.
+        line_count = len(self.code.splitlines())
+        return f"CodeSnippet(lang={self.language}, lines={line_count}, line={self.line_num})"
+
+
+def extract_code_snippets(content: str) -> List[CodeSnippet]:
+    """Extract all backtick-fenced code blocks from markdown content.
+
+    Fences are recognised the way CommonMark defines them:
+
+    * an opening fence is a line that (ignoring surrounding whitespace) is
+      three or more backticks followed by an optional info string that
+      contains no backtick;
+    * a closing fence is a line made only of backticks, at least as many as
+      the opening fence.
+
+    This keeps the language tag clean for four-backtick fences and stops a
+    nested "```python" line or an inline "```x```" span from being mistaken
+    for a fence boundary.
+
+    Args:
+        content: Full markdown text.
+
+    Returns:
+        One CodeSnippet per fenced block, in document order. Each snippet
+        carries the info string as its language ('unknown' when empty), up
+        to three non-empty lines taken from the five lines preceding the
+        fence as context, and the 1-based line number of the opening fence.
+        A block whose closing fence is missing runs to the end of the text.
+    """
+    snippets = []
+    lines = content.split('\n')
+    i = 0
+
+    while i < len(lines):
+        opening = re.match(r'^(`{3,})([^`]*)$', lines[i].strip())
+        if opening is None:
+            i += 1
+            continue
+
+        fence_len = len(opening.group(1))
+        language = opening.group(2).strip() or 'unknown'
+        start_line = i
+
+        # Context: non-empty lines among the five before the fence, keeping
+        # only the last three of them.
+        context_lines = [
+            prev.strip() for prev in lines[max(0, i - 5):i] if prev.strip()
+        ]
+        context = ' '.join(context_lines[-3:])
+
+        # Collect the body until a closing fence of sufficient length.
+        i += 1
+        code_lines = []
+        while i < len(lines):
+            stripped = lines[i].strip()
+            is_closing = len(stripped) >= fence_len and stripped.strip('`') == ''
+            if is_closing:
+                break
+            code_lines.append(lines[i])
+            i += 1
+
+        code = '\n'.join(code_lines)
+        snippets.append(CodeSnippet(language, code, context, start_line + 1))
+
+        # Step past the closing fence (or past the end when unclosed).
+        i += 1
+
+    return snippets
+
+
+def analyze_snippet(snippet: CodeSnippet) -> List[str]:
+    """Analyze a single code snippet for c7score issues.
+
+    Runs seven independent heuristics over the snippet's code and language
+    tag. Each heuristic that fires contributes one warning string that names
+    the c7score metric it relates to (Metric 3: Formatting, Metric 4:
+    Project Metadata, Metric 5: Initialization).
+
+    Args:
+        snippet: The snippet to inspect; only ``code`` and ``language`` are
+            read.
+
+    Returns:
+        The list of warning strings, empty when nothing was flagged.
+    """
+    issues = []
+    code = snippet.code.strip()
+    # Work on non-blank lines with indentation removed.
+    lines = [raw.strip() for raw in code.split('\n') if raw.strip()]
+
+    # Check 1: Import-only snippets
+    if lines:
+        import_patterns = [
+            r'^import\s+',
+            r'^from\s+\S+\s+import\s+',
+            r'^require\s*\(',
+            r'^const\s+\S+\s*=\s*require',
+            r'^using\s+',
+        ]
+
+        import_count = sum(1 for line in lines if any(re.match(p, line) for p in import_patterns))
+        if import_count == len(lines) and len(lines) <= 5:
+            issues.append("⚠️  Import-only snippet (Metric 5: Initialization)")
+
+    # Check 2: Installation-only snippets
+    install_patterns = [
+        r'pip install',
+        r'npm install',
+        r'yarn add',
+        r'cargo install',
+        r'go get',
+        r'gem install',
+    ]
+
+    if len(lines) <= 2 and any(any(pattern in line for pattern in install_patterns) for line in lines):
+        issues.append("⚠️  Installation-only snippet (Metric 5: Initialization)")
+
+    # Check 3: Snippet length
+    if len(lines) < 3:
+        issues.append("⚠️  Very short snippet (<3 lines) (Metric 3: Formatting)")
+    elif len(lines) > 100:
+        issues.append("⚠️  Very long snippet (>100 lines) (Metric 3: Formatting)")
+
+    # Check 4: Language tag issues
+    problematic_languages = [
+        'configuration', 'config', 'cli arguments', 'arguments',
+        'none', 'console', 'output', 'text', 'plaintext'
+    ]
+
+    if snippet.language.lower() in problematic_languages:
+        issues.append(f"⚠️  Problematic language tag: '{snippet.language}' (Metric 3: Formatting)")
+
+    # Check 5: Looks like a list
+    if len(lines) > 3:
+        list_markers = sum(1 for line in lines if re.match(r'^\s*[-*\d.]+\s', line))
+        if list_markers / len(lines) > 0.5:
+            issues.append("⚠️  Appears to be a list, not code (Metric 3: Formatting)")
+
+    # Check 6: Directory structure
+    # A tree listing puts a branch connector ('├' or '└') on its entry lines.
+    # Requiring every box-drawing character on the same line would miss
+    # ordinary entries such as "├── src", so one connector is enough.
+    if any('├' in line or '└' in line for line in lines):
+        issues.append("⚠️  Directory structure detected (Metric 4: Project Metadata)")
+
+    # Check 7: License or citation markers
+    # Long words are matched anywhere; the short acronyms are matched as
+    # whole words only, otherwise "mit" fires on submit/commit/limit and
+    # "bsd" on FreeBSD.
+    license_pattern = r'licen[cs]e|copyright|apache|gpl|\bmit\b|\bbsd\b'
+    citation_markers = ['@article', '@book', 'bibtex', 'doi:', 'citation']
+
+    code_lower = code.lower()
+    if len(code) > 100 and re.search(license_pattern, code_lower):
+        issues.append("⚠️  License content detected (Metric 4: Project Metadata)")
+
+    if any(marker in code_lower for marker in citation_markers):
+        issues.append("⚠️  Citation content detected (Metric 4: Project Metadata)")
+
+    return issues
+
+
+def find_duplicates(snippets: List[CodeSnippet]) -> List[Tuple[int, int]]:
+    """Find duplicate or near-duplicate snippets.
+
+    Snippets are compared after lower-casing and collapsing every run of
+    whitespace to a single space. Two snippets form a pair when either
+
+    * their normalised text is identical, or
+    * both are longer than 20 characters, the shorter is more than 80% of
+      the longer's length, and the shorter occurs verbatim inside the
+      longer.
+
+    Args:
+        snippets: Snippets to compare; only ``code`` is read.
+
+    Returns:
+        ``(i, j)`` index pairs into ``snippets`` with ``i < j``, ordered by
+        ``i`` and then ``j``.
+    """
+    # Normalise each snippet once; doing it inside the pair loop would repeat
+    # the same regex substitution for every comparison.
+    normalized = [
+        re.sub(r'\s+', ' ', snippet.code.lower()).strip() for snippet in snippets
+    ]
+
+    duplicates = []
+    for i, code1 in enumerate(normalized):
+        for j, code2 in enumerate(normalized[i + 1:], start=i + 1):
+            # Exact duplicate
+            if code1 == code2:
+                duplicates.append((i, j))
+                continue
+
+            # Very short snippets are too generic for a containment match.
+            if len(code1) <= 20 or len(code2) <= 20:
+                continue
+
+            # Near duplicate: similar length and one contains the other.
+            # Only the shorter can be contained in the longer; for equal
+            # lengths containment would mean equality, handled above.
+            shorter, longer = sorted((code1, code2), key=len)
+            if len(shorter) / len(longer) > 0.8 and shorter in longer:
+                duplicates.append((i, j))
+
+    return duplicates
+
+
+def generate_question_suggestions(content: str) -> List[str]:
+    """Suggest questions that should be answered in the documentation.
+
+    The project name is taken from the first level-one markdown heading
+    (``# Title``) found in ``content``. Surrounding whitespace and an
+    optional closing run of ``#`` are removed from it. When there is no such
+    heading, or it is empty, the phrase "this library" is used instead.
+
+    Args:
+        content: Full markdown text of the document.
+
+    Returns:
+        Ten generic question strings, most of them mentioning the project
+        name.
+    """
+    # Only spaces/tabs may separate '#' from the title: with a bare '\s+' a
+    # lone '#' line would swallow the newline and capture the next line.
+    title_match = re.search(r'^#[ \t]+(.+)$', content, re.MULTILINE)
+    project_name = title_match.group(1).strip() if title_match else ""
+    # Drop an optional closing hash sequence, as in "# Title #".
+    project_name = re.sub(r'\s+#+$', '', project_name)
+    if not project_name:
+        project_name = "this library"
+
+    questions = [
+        f"How do I install {project_name}?",
+        f"How do I get started with {project_name}?",
+        f"How do I initialize/configure {project_name}?",
+        f"How do I authenticate with {project_name}?",
+        "What are the main features and how do I use them?",
+        f"How do I handle errors in {project_name}?",
+        "How do I perform [common operation]?",
+        "What are common configuration options?",
+        f"How do I integrate {project_name} with [common tools]?",
+        f"How do I test code using {project_name}?",
+    ]
+
+    return questions
+
+
+def analyze_documentation(file_path: str) -> Dict:
+    """Analyze documentation file for c7score optimization opportunities.
+
+    Reads the file as UTF-8, extracts its code snippets, runs the per-snippet
+    checks and the duplicate search, and gathers the results.
+
+    Args:
+        file_path: Path to the markdown/README file to analyse.
+
+    Returns:
+        On failure, a dict with a single ``'error'`` key. This covers a
+        missing path as well as a path that cannot be read as UTF-8 text
+        (a directory, a permission problem, undecodable bytes).
+
+        On success, a dict with these keys:
+        ``file``, ``total_snippets``, ``snippets_with_issues``,
+        ``issue_breakdown`` (count per metric label), ``duplicates`` (number
+        of duplicate pairs), ``language_distribution``, ``detailed_issues``
+        (list of ``{'snippet', 'issues'}`` dicts), ``duplicate_pairs``
+        (index pairs into ``snippets``), ``snippets`` (every extracted
+        snippet, in document order) and ``question_suggestions``.
+    """
+    path = Path(file_path)
+
+    if not path.exists():
+        return {"error": f"File not found: {file_path}"}
+
+    # Failures are reported through the same error-dict channel as a missing
+    # file instead of escaping as a traceback.
+    try:
+        content = path.read_text(encoding='utf-8')
+    except (OSError, UnicodeDecodeError) as exc:
+        return {"error": f"Could not read {file_path}: {exc}"}
+
+    snippets = extract_code_snippets(content)
+
+    # Analyze each snippet, keeping only those with at least one issue
+    snippet_issues = []
+    for snippet in snippets:
+        issues = analyze_snippet(snippet)
+        if issues:
+            snippet_issues.append({
+                'snippet': snippet,
+                'issues': issues
+            })
+
+    # Find duplicates
+    duplicates = find_duplicates(snippets)
+
+    # Language distribution
+    language_dist = Counter(s.language for s in snippets)
+
+    # Issue type counts: each warning string names its metric, and the first
+    # marker found decides which bucket it is counted in.
+    metric_labels = (
+        ("Metric 3", "Formatting (M3)"),
+        ("Metric 4", "Metadata (M4)"),
+        ("Metric 5", "Initialization (M5)"),
+    )
+    issue_types = Counter()
+    for item in snippet_issues:
+        for issue in item['issues']:
+            for marker, label in metric_labels:
+                if marker in issue:
+                    issue_types[label] += 1
+                    break
+
+    return {
+        'file': file_path,
+        'total_snippets': len(snippets),
+        'snippets_with_issues': len(snippet_issues),
+        'issue_breakdown': dict(issue_types),
+        'duplicates': len(duplicates),
+        'language_distribution': dict(language_dist),
+        'detailed_issues': snippet_issues,
+        'duplicate_pairs': duplicates,
+        # Returned so the indices in 'duplicate_pairs' can be resolved.
+        'snippets': snippets,
+        'question_suggestions': generate_question_suggestions(content),
+    }
+
+
+def print_report(analysis: Dict):
+    """Print a formatted analysis report to standard output.
+
+    Args:
+        analysis: Either a dict with an ``'error'`` key, in which case only
+            the error is printed, or a result dict with the keys ``file``,
+            ``total_snippets``, ``snippets_with_issues``, ``duplicates``,
+            ``language_distribution``, ``issue_breakdown``,
+            ``detailed_issues``, ``duplicate_pairs`` and
+            ``question_suggestions``. An optional ``snippets`` list (objects
+            with a ``line_num`` attribute, indexed by the values in
+            ``duplicate_pairs``) lets duplicates be reported by line number;
+            without it they are reported by 1-based snippet position.
+    """
+    if 'error' in analysis:
+        print(f"❌ {analysis['error']}")
+        return
+
+    print(f"\n{'='*70}")
+    print(f"C7Score Documentation Analysis: {analysis['file']}")
+    print(f"{'='*70}\n")
+
+    print("📊 Summary Statistics")
+    print(f"{'─'*70}")
+    print(f"Total code snippets: {analysis['total_snippets']}")
+    print(f"Snippets with issues: {analysis['snippets_with_issues']}")
+    print(f"Duplicate snippets: {analysis['duplicates']}")
+
+    # Guard against division by zero for documents without code blocks.
+    if analysis['total_snippets'] > 0:
+        issue_rate = (analysis['snippets_with_issues'] / analysis['total_snippets']) * 100
+        print(f"Issue rate: {issue_rate:.1f}%")
+
+    print("\n📝 Language Distribution")
+    print(f"{'─'*70}")
+    for lang, count in sorted(analysis['language_distribution'].items(), key=lambda x: x[1], reverse=True):
+        print(f"  {lang}: {count}")
+
+    if analysis['issue_breakdown']:
+        print("\n⚠️  Issue Breakdown by Metric")
+        print(f"{'─'*70}")
+        for issue_type, count in sorted(analysis['issue_breakdown'].items(), key=lambda x: x[1], reverse=True):
+            print(f"  {issue_type}: {count}")
+
+    if analysis['detailed_issues']:
+        print("\n🔍 Detailed Issues (Showing first 10)")
+        print(f"{'─'*70}")
+        for i, item in enumerate(analysis['detailed_issues'][:10], 1):
+            snippet = item['snippet']
+            print(f"\n{i}. Line {snippet.line_num} [{snippet.language}] ({len(snippet.code.splitlines())} lines)")
+            for issue in item['issues']:
+                print(f"   {issue}")
+            # Show first 2 lines of code
+            code_preview = '\n'.join(snippet.code.split('\n')[:2])
+            print(f"   Preview: {code_preview[:80]}...")
+
+    if analysis['duplicate_pairs']:
+        print("\n🔄 Duplicate Snippets")
+        print(f"{'─'*70}")
+        # The pairs are indices into the snippet list. That list is looked up
+        # in the analysis dict; referring to a bare `snippets` name here
+        # would raise NameError because no such variable exists in this
+        # function.
+        snippets = analysis.get('snippets')
+        for i, (idx1, idx2) in enumerate(analysis['duplicate_pairs'][:5], 1):
+            if snippets:
+                print(f"{i}. Snippets at lines {snippets[idx1].line_num} and {snippets[idx2].line_num} are duplicates")
+            else:
+                print(f"{i}. Snippets #{idx1 + 1} and #{idx2 + 1} are duplicates")
+
+    print("\n💡 Suggested Questions to Answer")
+    print(f"{'─'*70}")
+    for i, question in enumerate(analysis['question_suggestions'], 1):
+        print(f"{i}. {question}")
+
+    print("\n✅ Recommendations")
+    print(f"{'─'*70}")
+
+    recommendations = []
+
+    if analysis['issue_breakdown'].get('Initialization (M5)', 0) > 0:
+        recommendations.append(
+            "• Combine import-only and installation-only snippets with actual usage examples"
+        )
+
+    if analysis['issue_breakdown'].get('Formatting (M3)', 0) > 0:
+        recommendations.append(
+            "• Fix formatting issues: use proper language tags, avoid very short/long snippets"
+        )
+
+    if analysis['issue_breakdown'].get('Metadata (M4)', 0) > 0:
+        recommendations.append(
+            "• Remove or relocate metadata content (licensing, citations, directory trees)"
+        )
+
+    if analysis['duplicates'] > 0:
+        recommendations.append(
+            f"• Remove or consolidate {analysis['duplicates']} duplicate snippets (reduces LLM score)"
+        )
+
+    if analysis['total_snippets'] < 10:
+        recommendations.append(
+            "• Add more comprehensive code examples answering common developer questions"
+        )
+
+    if not recommendations:
+        recommendations.append("• Documentation looks good! Consider running actual c7score for detailed metrics")
+
+    for rec in recommendations:
+        print(rec)
+
+    print(f"\n{'='*70}\n")
+
+
+if __name__ == "__main__":
+    # Command-line entry point: analyse the file named by the first argument
+    # and print the report.
+    if len(sys.argv) < 2:
+        print("Usage: python analyze_docs.py <path-to-readme-or-doc.md>")
+        sys.exit(1)
+
+    file_path = sys.argv[1]
+    analysis = analyze_documentation(file_path)
+    print_report(analysis)
+
+    # Exit non-zero when the analysis itself failed (for example the file
+    # was not found) so calling scripts can detect the failure.
+    if 'error' in analysis:
+        sys.exit(1)