#!/usr/bin/env python3 """ Documentation Analyzer for C7Score Optimization Analyzes README and documentation files to identify: - Snippets that are import-only or installation-only - Potential formatting issues - Metadata content (licensing, citations, directory structures) - Duplicate or near-duplicate code blocks - Missing question-answering examples """ import re import sys from pathlib import Path from typing import List, Dict, Tuple from collections import Counter class CodeSnippet: def __init__(self, language: str, code: str, context: str, line_num: int): self.language = language self.code = code.strip() self.context = context # Text before the code block self.line_num = line_num self.issues = [] def __repr__(self): return f"CodeSnippet(lang={self.language}, lines={len(self.code.splitlines())}, line={self.line_num})" def extract_code_snippets(content: str) -> List[CodeSnippet]: """Extract all code blocks from markdown content.""" snippets = [] lines = content.split('\n') i = 0 while i < len(lines): line = lines[i] # Match code block start if line.strip().startswith('```'): # Extract language language = line.strip()[3:].strip() or 'unknown' start_line = i # Get context (previous non-empty lines up to 5) context_lines = [] for j in range(max(0, i-5), i): if lines[j].strip(): context_lines.append(lines[j].strip()) context = ' '.join(context_lines[-3:]) # Last 3 lines of context # Collect code until end marker i += 1 code_lines = [] while i < len(lines) and not lines[i].strip().startswith('```'): code_lines.append(lines[i]) i += 1 code = '\n'.join(code_lines) snippets.append(CodeSnippet(language, code, context, start_line + 1)) i += 1 return snippets def analyze_snippet(snippet: CodeSnippet) -> List[str]: """Analyze a single code snippet for c7score issues.""" issues = [] code = snippet.code.strip() lines = [l.strip() for l in code.split('\n') if l.strip()] # Check 1: Import-only snippets if lines: import_patterns = [ r'^import\s+', r'^from\s+\S+\s+import\s+', r'^require\s*\(', r'^const\s+\S+\s*=\s*require', r'^using\s+', ] import_count = sum(1 for line in lines if any(re.match(p, line) for p in import_patterns)) if import_count == len(lines) and len(lines) <= 5: issues.append("⚠️ Import-only snippet (Metric 5: Initialization)") # Check 2: Installation-only snippets install_patterns = [ r'pip install', r'npm install', r'yarn add', r'cargo install', r'go get', r'gem install', ] if len(lines) <= 2 and any(any(pattern in line for pattern in install_patterns) for line in lines): issues.append("⚠️ Installation-only snippet (Metric 5: Initialization)") # Check 3: Snippet length if len(lines) < 3: issues.append("⚠️ Very short snippet (<3 lines) (Metric 3: Formatting)") elif len(lines) > 100: issues.append("⚠️ Very long snippet (>100 lines) (Metric 3: Formatting)") # Check 4: Language tag issues problematic_languages = [ 'configuration', 'config', 'cli arguments', 'arguments', 'none', 'console', 'output', 'text', 'plaintext' ] if snippet.language.lower() in problematic_languages: issues.append(f"⚠️ Problematic language tag: '{snippet.language}' (Metric 3: Formatting)") # Check 5: Looks like a list if len(lines) > 3: list_markers = sum(1 for line in lines if re.match(r'^\s*[-*\d.]+\s', line)) if list_markers / len(lines) > 0.5: issues.append("⚠️ Appears to be a list, not code (Metric 3: Formatting)") # Check 6: Directory structure if any(all(char in line for char in ['├', '│', '─', '└']) for line in lines): issues.append("⚠️ Directory structure detected (Metric 4: Project Metadata)") # Check 7: License or citation markers license_markers = ['license', 'copyright', 'mit', 'apache', 'gpl', 'bsd'] citation_markers = ['@article', '@book', 'bibtex', 'doi:', 'citation'] code_lower = code.lower() if any(marker in code_lower for marker in license_markers) and len(code) > 100: issues.append("⚠️ License content detected (Metric 4: Project Metadata)") if any(marker in code_lower for marker in citation_markers): issues.append("⚠️ Citation content detected (Metric 4: Project Metadata)") return issues def find_duplicates(snippets: List[CodeSnippet]) -> List[Tuple[int, int]]: """Find duplicate or near-duplicate snippets.""" duplicates = [] for i, snippet1 in enumerate(snippets): for j, snippet2 in enumerate(snippets[i+1:], start=i+1): # Normalize for comparison code1 = re.sub(r'\s+', ' ', snippet1.code.lower()).strip() code2 = re.sub(r'\s+', ' ', snippet2.code.lower()).strip() # Exact duplicate if code1 == code2: duplicates.append((i, j)) # Near duplicate (>80% similar) elif len(code1) > 20 and len(code2) > 20: # Simple similarity check min_len = min(len(code1), len(code2)) max_len = max(len(code1), len(code2)) if min_len / max_len > 0.8: # Check if one contains most of the other if code1 in code2 or code2 in code1: duplicates.append((i, j)) return duplicates def generate_question_suggestions(content: str) -> List[str]: """Suggest questions that should be answered in the documentation.""" # Extract apparent project name title_match = re.search(r'^#\s+(.+)$', content, re.MULTILINE) project_name = title_match.group(1) if title_match else "this library" questions = [ f"How do I install {project_name}?", f"How do I get started with {project_name}?", f"How do I initialize/configure {project_name}?", f"How do I authenticate with {project_name}?", f"What are the main features and how do I use them?", f"How do I handle errors in {project_name}?", f"How do I perform [common operation]?", f"What are common configuration options?", f"How do I integrate {project_name} with [common tools]?", f"How do I test code using {project_name}?", ] return questions def analyze_documentation(file_path: str) -> Dict: """Analyze documentation file for c7score optimization opportunities.""" path = Path(file_path) if not path.exists(): return {"error": f"File not found: {file_path}"} content = path.read_text(encoding='utf-8') snippets = extract_code_snippets(content) # Analyze each snippet snippet_issues = [] for snippet in snippets: issues = analyze_snippet(snippet) if issues: snippet_issues.append({ 'snippet': snippet, 'issues': issues }) # Find duplicates duplicates = find_duplicates(snippets) # Calculate statistics total_snippets = len(snippets) snippets_with_issues = len(snippet_issues) # Language distribution language_dist = Counter(s.language for s in snippets) # Issue type counts issue_types = Counter() for item in snippet_issues: for issue in item['issues']: # Extract metric number if "Metric 3" in issue: issue_types["Formatting (M3)"] += 1 elif "Metric 4" in issue: issue_types["Metadata (M4)"] += 1 elif "Metric 5" in issue: issue_types["Initialization (M5)"] += 1 return { 'file': file_path, 'total_snippets': total_snippets, 'snippets_with_issues': snippets_with_issues, 'issue_breakdown': dict(issue_types), 'duplicates': len(duplicates), 'language_distribution': dict(language_dist), 'detailed_issues': snippet_issues, 'duplicate_pairs': duplicates, 'question_suggestions': generate_question_suggestions(content), } def print_report(analysis: Dict): """Print a formatted analysis report.""" if 'error' in analysis: print(f"❌ {analysis['error']}") return print(f"\n{'='*70}") print(f"C7Score Documentation Analysis: {analysis['file']}") print(f"{'='*70}\n") print(f"📊 Summary Statistics") print(f"{'─'*70}") print(f"Total code snippets: {analysis['total_snippets']}") print(f"Snippets with issues: {analysis['snippets_with_issues']}") print(f"Duplicate snippets: {analysis['duplicates']}") if analysis['total_snippets'] > 0: issue_rate = (analysis['snippets_with_issues'] / analysis['total_snippets']) * 100 print(f"Issue rate: {issue_rate:.1f}%") print(f"\n📝 Language Distribution") print(f"{'─'*70}") for lang, count in sorted(analysis['language_distribution'].items(), key=lambda x: x[1], reverse=True): print(f" {lang}: {count}") if analysis['issue_breakdown']: print(f"\n⚠️ Issue Breakdown by Metric") print(f"{'─'*70}") for issue_type, count in sorted(analysis['issue_breakdown'].items(), key=lambda x: x[1], reverse=True): print(f" {issue_type}: {count}") if analysis['detailed_issues']: print(f"\n🔍 Detailed Issues (Showing first 10)") print(f"{'─'*70}") for i, item in enumerate(analysis['detailed_issues'][:10], 1): snippet = item['snippet'] print(f"\n{i}. Line {snippet.line_num} [{snippet.language}] ({len(snippet.code.splitlines())} lines)") for issue in item['issues']: print(f" {issue}") # Show first 2 lines of code code_preview = '\n'.join(snippet.code.split('\n')[:2]) print(f" Preview: {code_preview[:80]}...") if analysis['duplicate_pairs']: print(f"\n🔄 Duplicate Snippets") print(f"{'─'*70}") for i, (idx1, idx2) in enumerate(analysis['duplicate_pairs'][:5], 1): print(f"{i}. Snippets at lines {snippets[idx1].line_num} and {snippets[idx2].line_num} are duplicates") print(f"\n💡 Suggested Questions to Answer") print(f"{'─'*70}") for i, question in enumerate(analysis['question_suggestions'], 1): print(f"{i}. {question}") print(f"\n✅ Recommendations") print(f"{'─'*70}") recommendations = [] if analysis['issue_breakdown'].get('Initialization (M5)', 0) > 0: recommendations.append( "• Combine import-only and installation-only snippets with actual usage examples" ) if analysis['issue_breakdown'].get('Formatting (M3)', 0) > 0: recommendations.append( "• Fix formatting issues: use proper language tags, avoid very short/long snippets" ) if analysis['issue_breakdown'].get('Metadata (M4)', 0) > 0: recommendations.append( "• Remove or relocate metadata content (licensing, citations, directory trees)" ) if analysis['duplicates'] > 0: recommendations.append( f"• Remove or consolidate {analysis['duplicates']} duplicate snippets (reduces LLM score)" ) if analysis['total_snippets'] < 10: recommendations.append( "• Add more comprehensive code examples answering common developer questions" ) if not recommendations: recommendations.append("• Documentation looks good! Consider running actual c7score for detailed metrics") for rec in recommendations: print(rec) print(f"\n{'='*70}\n") if __name__ == "__main__": if len(sys.argv) < 2: print("Usage: python analyze_docs.py ") sys.exit(1) file_path = sys.argv[1] analysis = analyze_documentation(file_path) print_report(analysis)