#!/usr/bin/env python3
"""
Concept Extraction Script for Ontological Documentation

This script analyzes codebases to extract domain concepts, entities, and
relationships for building ontological documentation. It supports multiple
programming languages and can identify inheritance hierarchies, composition
patterns, and semantic relationships.
"""

import ast
import re
import json
import sys
from pathlib import Path
from typing import Dict, List, Set, Tuple, Any, Optional
from collections import defaultdict

# File suffixes routed to the regex-based JavaScript/TypeScript extractor.
_JS_SUFFIXES = ('.js', '.ts', '.jsx', '.tsx')


class ConceptExtractor:
    """Extracts ontological concepts from source code."""

    # The keyword is captured (group 1) so the declaration kind is read from
    # the keyword itself; the leading \b stops identifiers that merely end in
    # "class" (e.g. "subclass Foo") from matching.
    _CLASS_RE = re.compile(
        r'\b(class|interface)\s+(\w+)(?:\s+extends\s+(\w+))?'
    )
    _FUNC_RE = re.compile(
        r'(?:function\s+(\w+)|const\s+(\w+)\s*=\s*(?:\([^)]*\)\s*)?=>|(\w+)\s*:\s*\([^)]*\)\s*=>)'
    )
    _IMPORT_RE = re.compile(r'import\s+.*?from\s+["\']([^"\']+)["\']')

    def __init__(self):
        self.concepts = defaultdict(dict)
        self.relationships = defaultdict(list)
        self.taxonomies = defaultdict(list)

    def extract_from_python(self, file_path: Path) -> Dict[str, Any]:
        """Extract concepts from Python source code.

        Args:
            file_path: Path of the Python file to parse.

        Returns:
            A dict with the keys ``classes``, ``inheritance``, ``imports``
            and ``functions`` as collected by ``ClassVisitor``. If reading or
            parsing fails, a dict with the single key ``error`` holding the
            exception message is returned instead.
        """
        # Best-effort boundary: one unreadable or unparsable file must not
        # abort a whole directory scan, so any failure is reported as data.
        try:
            with open(file_path, 'r', encoding='utf-8') as f:
                tree = ast.parse(f.read())

            visitor = ClassVisitor()
            visitor.visit(tree)

            return {
                'classes': visitor.classes,
                'inheritance': visitor.inheritance,
                'imports': visitor.imports,
                'functions': visitor.functions
            }
        except Exception as e:
            return {'error': str(e)}

    def extract_from_javascript(self, file_path: Path) -> Dict[str, Any]:
        """Extract concepts from JavaScript/TypeScript source code.

        The file is scanned with regular expressions, not parsed, so
        declarations that appear inside comments or string literals are
        matched as well.

        Args:
            file_path: Path of the JavaScript/TypeScript file to scan.

        Returns:
            A dict with the keys ``classes``, ``interfaces``, ``functions``
            and ``imports``. Both class and interface declarations are
            recorded under ``classes`` and told apart by their ``type``
            field; ``interfaces`` is left empty. If an exception occurs, its
            message is stored under ``error`` next to whatever was collected
            before the failure.
        """
        concepts = {
            'classes': [],
            'interfaces': [],
            'functions': [],
            'imports': []
        }

        try:
            with open(file_path, 'r', encoding='utf-8') as f:
                content = f.read()

            # Extract class and interface declarations
            for match in self._CLASS_RE.finditer(content):
                keyword, class_name, parent_class = match.groups()
                concepts['classes'].append({
                    'name': class_name,
                    'parent': parent_class,
                    'type': keyword
                })

            # Extract function declarations; exactly one of the three
            # alternatives captures the name.
            for match in self._FUNC_RE.finditer(content):
                func_name = match.group(1) or match.group(2) or match.group(3)
                if func_name:
                    concepts['functions'].append({'name': func_name})

            # Extract imports
            for match in self._IMPORT_RE.finditer(content):
                concepts['imports'].append({'source': match.group(1)})

        except Exception as e:
            concepts['error'] = str(e)

        return concepts

    def build_ontology(self, extracted_data: List[Dict[str, Any]]) -> Dict[str, Any]:
        """Build ontological structure from extracted data.

        Args:
            extracted_data: Per-file results as returned by the
                ``extract_from_*`` methods. Entries without ``classes`` or
                ``functions`` keys (for example error results) contribute
                nothing.

        Returns:
            A dict with ``concepts`` (name -> ``{'type', 'properties'}``),
            ``relationships`` (lists keyed by ``is_a``, ``part_of``,
            ``depends_on`` and ``associates_with``) and ``taxonomies``. Only
            ``concepts`` and ``is_a`` are populated here.
        """
        ontology = {
            'concepts': {},
            'relationships': {
                'is_a': [],  # inheritance
                'part_of': [],  # composition
                'depends_on': [],  # dependencies
                'associates_with': []  # loose associations
            },
            'taxonomies': {}
        }

        all_classes = []
        all_functions = []

        # Collect all entities. Imports are not mapped to relationships, so
        # they are not gathered.
        for data in extracted_data:
            if 'classes' in data:
                all_classes.extend(data['classes'])
            if 'functions' in data:
                all_functions.extend(data['functions'])

        # Build concepts
        for cls in all_classes:
            if not isinstance(cls, dict):
                continue
            concept_name = cls.get('name', str(cls))
            ontology['concepts'][concept_name] = {
                'type': 'class',
                'properties': cls
            }

            # Build inheritance relationships. JavaScript records carry a
            # single 'parent'; Python records carry a 'bases' list.
            parents = []
            parent = cls.get('parent')
            if parent:
                parents.append(parent)
            parents.extend(cls.get('bases') or [])
            for parent_name in parents:
                ontology['relationships']['is_a'].append({
                    'subject': concept_name,
                    'object': parent_name
                })

        # Functions are added after classes, so a function sharing a name
        # with a class replaces that class's concept entry.
        for func in all_functions:
            if isinstance(func, dict):
                func_name = func.get('name', str(func))
                ontology['concepts'][func_name] = {
                    'type': 'function',
                    'properties': func
                }

        return ontology

    def generate_mermaid_diagram(self, ontology: Dict[str, Any]) -> str:
        """Generate Mermaid diagram from ontology.

        Args:
            ontology: Structure as returned by ``build_ontology``; its
                ``concepts`` and ``relationships`` entries are read.

        Returns:
            The text of a Mermaid ``graph TD`` flowchart, one statement per
            line.
        """
        lines = ["graph TD"]

        # Add concepts as nodes; classes use the "[(...)]" node shape,
        # everything else a plain box.
        for concept_name, concept_data in ontology['concepts'].items():
            concept_type = concept_data.get('type', 'concept')
            if concept_type == 'class':
                lines.append(f" {concept_name}[({concept_name})]")
            else:
                lines.append(f" {concept_name}[{concept_name}]")

        # Add relationships. Flowcharts do not accept the classDiagram
        # relation tokens ("--|>", "--*"), so inheritance and composition are
        # drawn as labelled arrows instead.
        for rel_type, relationships in ontology['relationships'].items():
            for rel in relationships:
                subject = rel['subject']
                obj = rel['object']
                if rel_type == 'is_a':
                    lines.append(f" {subject} -->|is_a| {obj}")
                elif rel_type == 'part_of':
                    lines.append(f" {subject} -->|part_of| {obj}")
                elif rel_type == 'depends_on':
                    lines.append(f" {subject} -.-> {obj}")
                else:
                    lines.append(f" {subject} --- {obj}")

        return "\n".join(lines)


class ClassVisitor(ast.NodeVisitor):
    """AST visitor for Python class analysis.

    After ``visit(tree)`` the instance holds four lists: ``classes``,
    ``inheritance``, ``imports`` and ``functions``. Methods are listed both
    under their class and in ``functions``, because class bodies are
    traversed like any other node.
    """

    # Both sync and async definitions count as functions/methods.
    _FUNCTION_NODES = (ast.FunctionDef, ast.AsyncFunctionDef)

    def __init__(self):
        self.classes = []
        self.inheritance = []
        self.imports = []
        self.functions = []

    @staticmethod
    def _describe_function(node) -> Dict[str, Any]:
        """Return the name, positional argument names and line of a def."""
        return {
            'name': node.name,
            'args': [arg.arg for arg in node.args.args],
            'line_number': node.lineno
        }

    def visit_ClassDef(self, node):
        """Record a class, its directly defined methods and its bases."""
        # Only bases written as plain names are recorded; dotted or
        # subscripted bases (e.g. "abc.ABC") are skipped.
        bases = [base.id for base in node.bases if isinstance(base, ast.Name)]

        class_info = {
            'name': node.name,
            'bases': bases,
            'methods': [
                self._describe_function(item)
                for item in node.body
                if isinstance(item, self._FUNCTION_NODES)
            ],
            'line_number': node.lineno
        }
        self.classes.append(class_info)

        # Track inheritance
        for base_name in bases:
            self.inheritance.append({
                'child': node.name,
                'parent': base_name
            })

        self.generic_visit(node)

    def visit_Import(self, node):
        """Record each module named in an ``import a, b as c`` statement."""
        for alias in node.names:
            self.imports.append({
                'module': alias.name,
                'alias': alias.asname
            })
        self.generic_visit(node)

    def visit_ImportFrom(self, node):
        """Record each name pulled in by a ``from module import name``."""
        # Relative imports keep their leading dots so the source stays
        # identifiable ("from . import x" has module None and level 1).
        module = '.' * (node.level or 0) + (node.module or '')
        for alias in node.names:
            self.imports.append({
                'module': module,
                'name': alias.name,
                'alias': alias.asname
            })
        self.generic_visit(node)

    def visit_FunctionDef(self, node):
        """Record a function or method definition, then visit its body."""
        self.functions.append(self._describe_function(node))
        self.generic_visit(node)

    # "async def" shares the attributes read above, so reuse the handler.
    visit_AsyncFunctionDef = visit_FunctionDef


def _extract_file(extractor: ConceptExtractor,
                  file_path: Path) -> Optional[Dict[str, Any]]:
    """Run the extractor matching the file suffix, or return None."""
    if file_path.suffix == '.py':
        return extractor.extract_from_python(file_path)
    if file_path.suffix in _JS_SUFFIXES:
        return extractor.extract_from_javascript(file_path)
    return None


def main():
    """Main function to run concept extraction.

    Reads a file or directory path from ``sys.argv[1]``, extracts concepts
    from every supported source file found there, and prints the ontology as
    JSON followed by a Mermaid diagram. Exits with status 1 when no path is
    given.
    """
    if len(sys.argv) < 2:
        print("Usage: python extract_concepts.py <path>")
        sys.exit(1)

    path = Path(sys.argv[1])
    extractor = ConceptExtractor()

    if path.is_file():
        candidates = [path]
    elif path.is_dir():
        candidates = (p for p in path.rglob('*') if p.is_file())
    else:
        # Neither a file nor a directory: nothing to scan.
        candidates = []

    extracted_data = []
    for file_path in candidates:
        data = _extract_file(extractor, file_path)
        if data is not None:
            extracted_data.append(data)

    ontology = extractor.build_ontology(extracted_data)

    # Output as JSON
    print(json.dumps(ontology, indent=2))

    # Also generate Mermaid diagram
    diagram = extractor.generate_mermaid_diagram(ontology)
    print("\n--- Mermaid Diagram ---")
    print(diagram)


if __name__ == "__main__":
    main()