#!/usr/bin/env python3
"""
Smart Context Generator for Oracle

Enhances context generation by analyzing:
- Current git status (files changed, branch name)
- File patterns and paths in knowledge tags
- Time-decay for older knowledge
- Relevance scoring based on current work

Usage:
    python smart_context.py [--format text|json] [--max-length 5000] [--min-score 0.3]

This can be used standalone or integrated into generate_context.py

Examples:
    python smart_context.py
    python smart_context.py --format json --max-length 10000
"""

import os
import sys
import json
import subprocess
from datetime import datetime, timedelta
from pathlib import Path
from typing import List, Dict, Optional, Any, Tuple
import re


# Seconds to wait for any single git invocation before giving up.
_GIT_TIMEOUT_SECONDS = 5

# (git_info key, git arguments) for every query that returns a list of paths.
_GIT_FILE_QUERIES = (
    ('modified_files', ['diff', '--name-only']),
    ('staged_files', ['diff', '--staged', '--name-only']),
    ('untracked_files', ['ls-files', '--others', '--exclude-standard']),
)

# Knowledge files looked up as <oracle>/knowledge/<category>.json.
_KNOWLEDGE_CATEGORIES = ('patterns', 'preferences', 'gotchas', 'solutions', 'corrections')

# Base score per priority label; unknown labels fall back to 0.5.
_PRIORITY_SCORES = {
    'critical': 1.0,
    'high': 0.8,
    'medium': 0.5,
    'low': 0.2,
}

# Section headings used when rendering each knowledge category.
_CATEGORY_LABELS = {
    'patterns': 'Patterns',
    'preferences': 'Preferences',
    'gotchas': 'Gotchas (Watch Out!)',
    'solutions': 'Solutions',
    'corrections': 'Corrections',
}


def find_oracle_root() -> Optional[Path]:
    """Find the .oracle directory.

    Searches the current working directory and then each of its ancestors,
    including the filesystem root.

    Returns:
        Path to the first `.oracle` entry found, or None if there is none.
    """
    start = Path.cwd()
    # `parents` ends at the filesystem root, so the root itself is checked too.
    for directory in (start, *start.parents):
        candidate = directory / '.oracle'
        if candidate.exists():
            return candidate
    return None


def _run_git(args: List[str], check: bool = False) -> 'subprocess.CompletedProcess':
    """Run `git <args>` with captured UTF-8 output and the shared timeout."""
    return subprocess.run(
        ['git'] + args,
        check=check,
        capture_output=True,
        # Decode explicitly so an unexpected locale or odd bytes in a path
        # cannot raise UnicodeDecodeError.
        encoding='utf-8',
        errors='replace',
        timeout=_GIT_TIMEOUT_SECONDS,
    )


def _git_path_list(args: List[str]) -> Optional[List[str]]:
    """Return the paths printed by a git command, or None if it failed.

    `-z` asks git for NUL-separated, unquoted paths; without it git quotes
    and octal-escapes non-ASCII file names.
    """
    result = _run_git(args + ['-z'])
    if result.returncode != 0:
        return None
    return [path for path in result.stdout.split('\0') if path]


def get_git_status() -> Dict[str, Any]:
    """Get current git status information.

    Returns:
        Dictionary with keys `branch` (str or None), `modified_files`,
        `staged_files`, `untracked_files` (lists of paths) and `is_repo`
        (bool). If git is missing, the directory is not a repository, or a
        command times out, the values gathered so far are returned and the
        rest keep their defaults.
    """
    git_info: Dict[str, Any] = {
        'branch': None,
        'modified_files': [],
        'staged_files': [],
        'untracked_files': [],
        'is_repo': False,
    }

    try:
        # Raises CalledProcessError when we are not inside a git repo.
        _run_git(['rev-parse', '--git-dir'], check=True)
        git_info['is_repo'] = True

        branch_result = _run_git(['branch', '--show-current'])
        if branch_result.returncode == 0:
            git_info['branch'] = branch_result.stdout.strip()

        for key, args in _GIT_FILE_QUERIES:
            paths = _git_path_list(args)
            if paths is not None:
                git_info[key] = paths
    except (subprocess.CalledProcessError, OSError, subprocess.TimeoutExpired):
        # Not a git repo, git not available/executable, or git timed out.
        pass

    return git_info


def extract_file_patterns(files: List[str]) -> List[str]:
    """Extract patterns from file paths for matching knowledge.

    For each path this collects the extension (without the dot), every
    directory component, and the file name without its extension.

    Args:
        files: List of file paths

    Returns:
        Sorted list of unique patterns (file types, directory names, etc.)
    """
    patterns = set()

    for file_path in files:
        path = Path(file_path)

        if path.suffix:
            patterns.add(path.suffix[1:])  # Remove the dot

        # Directory components only; the last part is the file name.
        patterns.update(part for part in path.parts[:-1] if part and part != '.')

        if path.stem:
            patterns.add(path.stem)

    # Sorted so the result does not depend on set iteration order.
    return sorted(patterns)


def load_all_knowledge(oracle_path: Path) -> List[Dict[str, Any]]:
    """Load all knowledge from Oracle.

    Reads `<oracle_path>/knowledge/<category>.json` for each known category.
    Every dict entry is tagged in place with `_category`. Unreadable or
    malformed files are reported on stderr and skipped.

    Args:
        oracle_path: Path to .oracle directory

    Returns:
        List of knowledge entries
    """
    knowledge_dir = oracle_path / 'knowledge'
    all_knowledge: List[Dict[str, Any]] = []

    for category in _KNOWLEDGE_CATEGORIES:
        file_path = knowledge_dir / f'{category}.json'
        if not file_path.exists():
            continue

        try:
            with open(file_path, 'r', encoding='utf-8') as f:
                entries = json.load(f)
        except ValueError as e:
            # json.JSONDecodeError and UnicodeDecodeError are both ValueErrors.
            print(f"Warning: Failed to parse {file_path}: {e}", file=sys.stderr)
            continue
        except OSError as e:
            print(f"Warning: Cannot read {file_path}: {e}", file=sys.stderr)
            continue

        if not isinstance(entries, list):
            # Valid JSON but not the expected array of entries.
            print(
                f"Warning: Expected a list of entries in {file_path}, "
                f"got {type(entries).__name__}",
                file=sys.stderr,
            )
            continue

        for entry in entries:
            if isinstance(entry, dict):
                entry['_category'] = category
                all_knowledge.append(entry)

    return all_knowledge


def calculate_time_decay_score(created_date: str, days_half_life: int = 30) -> float:
    """Calculate time decay score for knowledge based on age.

    Args:
        created_date: ISO format date string
        days_half_life: Number of days for score to decay to 0.5 (must be positive)

    Returns:
        Score between 0 and 1 (1 = created now or in the future, decays over
        time). Returns a neutral 0.5 when the date cannot be parsed.

    Raises:
        ValueError: If days_half_life is not positive
    """
    if days_half_life <= 0:
        raise ValueError(f"days_half_life must be positive, got {days_half_life}")

    try:
        # datetime.fromisoformat only accepts a trailing "Z" from Python 3.11,
        # so normalise it to an explicit UTC offset first.
        if isinstance(created_date, str) and created_date[-1:] in ('Z', 'z'):
            created_date = created_date[:-1] + '+00:00'

        created = datetime.fromisoformat(created_date)
    except (ValueError, TypeError):
        # If date parsing fails, return neutral score
        return 0.5

    # Compare like with like: an aware timestamp against "now" in the same
    # timezone, a naive timestamp against naive local time.
    now = datetime.now(created.tzinfo) if created.tzinfo else datetime.now()

    # total_seconds keeps sub-day precision (hours/minutes).
    age_days = (now - created).total_seconds() / (24 * 3600)

    # Future dates count as "just created". Without the clamp a far-future
    # date makes the exponent hugely negative and 0.5 ** x overflows.
    age_days = max(0.0, age_days)

    # Exponential decay: score = 0.5 ^ (days_old / half_life)
    score = 0.5 ** (age_days / days_half_life)
    return max(0.0, min(1.0, score))


def _entry_tags(entry: Dict[str, Any]) -> List[str]:
    """Return the entry's tags as a list of strings, tolerating odd shapes."""
    tags = entry.get('tags')
    if isinstance(tags, str):
        # A bare string is one tag, not a sequence of characters.
        return [tags]
    if isinstance(tags, (list, tuple)):
        return [str(tag) for tag in tags if tag is not None]
    return []


def calculate_relevance_score(
    entry: Dict[str, Any],
    file_patterns: List[str],
    branch: Optional[str] = None
) -> float:
    """Calculate relevance score for a knowledge entry.

    The score is a weighted sum: 30% priority, 40% share of file patterns
    matching a tag, 20% share of file patterns found in title/content/context,
    and 10% recency. Matching is case-insensitive and on word boundaries.

    Args:
        entry: Knowledge entry dictionary
        file_patterns: List of file patterns from current work
        branch: Current git branch name (accepted for API compatibility;
            not currently used in the score)

    Returns:
        Relevance score (0.0 to 1.0)
    """
    priority = entry.get('priority', 'medium')
    # Guard against non-string (possibly unhashable) priorities from bad data.
    priority_weight = _PRIORITY_SCORES.get(priority, 0.5) if isinstance(priority, str) else 0.5
    score = priority_weight * 0.3  # 30% weight to priority

    # With no file patterns there is nothing to match (and nothing to divide by).
    if file_patterns:
        # Compile each pattern once instead of once per tag.
        pattern_regexes = [
            re.compile(r'\b' + re.escape(pattern.lower()) + r'\b')
            for pattern in file_patterns
        ]

        tags = [tag.lower() for tag in _entry_tags(entry)]
        if tags:
            tag_matches = sum(
                1 for regex in pattern_regexes
                if any(regex.search(tag) for tag in tags)
            )
            tag_score = tag_matches / len(pattern_regexes)
            score += min(1.0, tag_score) * 0.4  # 40% weight to tag matching

        content = f"{entry.get('title', '')} {entry.get('content', '')} {entry.get('context', '')}".lower()
        keyword_matches = sum(1 for regex in pattern_regexes if regex.search(content))
        keyword_score = keyword_matches / len(pattern_regexes)
        score += min(1.0, keyword_score) * 0.2  # 20% weight to keyword matching

    time_score = calculate_time_decay_score(entry.get('created', ''))
    score += time_score * 0.1  # 10% weight to recency

    return min(1.0, score)


def score_and_rank_knowledge(
    knowledge: List[Dict[str, Any]],
    git_info: Dict[str, Any]
) -> List[Tuple[Dict[str, Any], float]]:
    """Score and rank knowledge entries by relevance.

    Args:
        knowledge: List of knowledge entries
        git_info: Git status information

    Returns:
        List of tuples (entry, score) sorted by score descending; entries
        with equal scores keep their input order.
    """
    # Patterns come from every changed file: modified, staged and untracked.
    all_files = (
        list(git_info.get('modified_files') or [])
        + list(git_info.get('staged_files') or [])
        + list(git_info.get('untracked_files') or [])
    )
    file_patterns = extract_file_patterns(all_files)
    branch = git_info.get('branch')

    scored_entries = [
        (entry, calculate_relevance_score(entry, file_patterns, branch))
        for entry in knowledge
    ]
    scored_entries.sort(key=lambda pair: pair[1], reverse=True)
    return scored_entries


def _work_context_lines(git_info: Dict[str, Any]) -> List[str]:
    """Render the "Current Work Context" section (empty outside a git repo)."""
    if not git_info['is_repo']:
        return []

    lines = ["## Current Work Context"]
    if git_info['branch']:
        lines.append(f"Branch: `{git_info['branch']}`")

    # A file with both staged and unstaged changes appears in both lists;
    # count it once.
    total_files = len(set(git_info['modified_files']) | set(git_info['staged_files']))
    if total_files > 0:
        lines.append(f"Files being worked on: {total_files}")

    lines.append("")
    return lines


def _relevant_knowledge_lines(
    relevant_knowledge: List[Tuple[Dict[str, Any], float]]
) -> List[str]:
    """Render the top-scoring entries grouped by category."""
    lines = ["## Relevant Knowledge", ""]

    # Group the top 20 by category; dict insertion order keeps categories in
    # order of their best-scoring entry.
    by_category: Dict[str, List[Tuple[Dict[str, Any], float]]] = {}
    for entry, score in relevant_knowledge[:20]:
        by_category.setdefault(entry['_category'], []).append((entry, score))

    for category, items in by_category.items():
        label = _CATEGORY_LABELS.get(category, category.capitalize())
        lines.append(f"### {label}")
        lines.append("")

        for entry, score in items[:10]:  # Top 10 per category
            priority = entry.get('priority', 'medium')
            title = entry.get('title', 'Untitled')
            content = entry.get('content', '')

            # Bold anything critical or scoring very high.
            if priority == 'critical' or score >= 0.8:
                lines.append(f"- **[{score:.1f}] {title}**")
            else:
                lines.append(f"- [{score:.1f}] {title}")

            # Only brief content is inlined, to keep the context compact.
            if content and len(content) < 200:
                lines.append(f" {content}")

            # Show up to five of the entry's tags, if it has any.
            tags = _entry_tags(entry)
            if tags:
                lines.append(f" *Tags: {', '.join(tags[:5])}*")

        lines.append("")

    return lines


def _fallback_lines(knowledge: List[Dict[str, Any]]) -> List[str]:
    """Render up to ten critical/high-priority titles when nothing scored well."""
    lines = [
        "No highly relevant knowledge found for current work.",
        "",
        "Showing high-priority items:",
        "",
    ]

    high_priority = [e for e in knowledge if e.get('priority') in ['critical', 'high']]
    for entry in high_priority[:10]:
        title = entry.get('title', 'Untitled')
        lines.append(f"- {title}")

    lines.append("")
    return lines


def _truncate_context(text: str, max_length: int) -> str:
    """Cut `text` so the result, including the notice, fits in `max_length`."""
    if len(text) <= max_length:
        return text

    notice = f"\n\n*[Context truncated to {max_length} chars]*"
    budget = max_length - len(notice)
    if budget <= 0:
        # No room for the notice at all; a hard cut still honours the limit.
        return text[:max_length]

    clipped = text[:budget]
    # Back up to the last newline to avoid breaking mid-line.
    last_newline = clipped.rfind('\n')
    if last_newline != -1:
        clipped = clipped[:last_newline]
    return clipped + notice


def generate_smart_context(
    oracle_path: Path,
    max_length: int = 5000,
    min_score: float = 0.3
) -> str:
    """Generate smart context based on current git status.

    Args:
        oracle_path: Path to .oracle directory
        max_length: Maximum context length (must be > 0)
        min_score: Minimum relevance score to include (0.0-1.0)

    Returns:
        Formatted context string, at most `max_length` characters long when
        knowledge is available. When no knowledge could be loaded a fixed
        short message is returned instead.

    Raises:
        ValueError: If parameters are invalid
    """
    if not 0.0 <= min_score <= 1.0:
        raise ValueError(f"min_score must be in [0.0, 1.0], got {min_score}")
    if max_length <= 0:
        raise ValueError(f"max_length must be positive, got {max_length}")

    git_info = get_git_status()
    knowledge = load_all_knowledge(oracle_path)

    if not knowledge:
        return "Oracle: No knowledge base found."

    scored_knowledge = score_and_rank_knowledge(knowledge, git_info)
    relevant_knowledge = [
        (entry, score) for entry, score in scored_knowledge if score >= min_score
    ]

    lines = ["# Oracle Smart Context", ""]
    lines.extend(_work_context_lines(git_info))

    if relevant_knowledge:
        lines.extend(_relevant_knowledge_lines(relevant_knowledge))
    else:
        lines.extend(_fallback_lines(knowledge))

    return _truncate_context("\n".join(lines), max_length)


def main():
    """Command-line entry point: print the smart context as text or JSON.

    Exits with status 1 when no `.oracle` directory is found or when context
    generation fails.
    """
    import argparse

    parser = argparse.ArgumentParser(
        description='Generate smart context from Oracle knowledge',
        formatter_class=argparse.RawDescriptionHelpFormatter
    )

    parser.add_argument(
        '--format',
        choices=['text', 'json'],
        default='text',
        help='Output format (text or json)'
    )

    parser.add_argument(
        '--max-length',
        type=int,
        default=5000,
        help='Maximum context length'
    )

    parser.add_argument(
        '--min-score',
        type=float,
        default=0.3,
        help='Minimum relevance score (0.0-1.0)'
    )

    args = parser.parse_args()

    # Find Oracle
    oracle_path = find_oracle_root()

    if not oracle_path:
        if args.format == 'json':
            print(json.dumps({'error': 'Oracle not initialized'}))
        else:
            print("[ERROR] .oracle directory not found.")
        sys.exit(1)

    # Generate context
    try:
        context = generate_smart_context(oracle_path, args.max_length, args.min_score)

        if args.format == 'json':
            output = {
                'context': context,
                'git_status': get_git_status()
            }
            print(json.dumps(output, indent=2))
        else:
            print(context)

    except Exception as e:
        # Top-level boundary: report any failure in the requested format.
        if args.format == 'json':
            print(json.dumps({'error': str(e)}))
        else:
            print(f"[ERROR] {e}")
        sys.exit(1)


if __name__ == '__main__':
    main()