Initial commit

Zhongwei Li
2025-11-30 08:51:34 +08:00
commit acde81dcfe
59 changed files with 22282 additions and 0 deletions


@@ -0,0 +1,574 @@
#!/usr/bin/env python3
"""
PRISM Context Memory - Intelligence Layer
Implements memory decay, self-evaluation, and learning over time.
Based on research in persistent memory systems with confidence scoring.
Key Concepts:
- Memory Decay: Confidence scores decay following Ebbinghaus curve unless reinforced
- Self-Evaluation: Track retrieval success and relevance
- Upsert Logic: Update existing knowledge rather than duplicate
- Confidence Scoring: Increases with successful usage, decays over time
"""
import os
import sys
from pathlib import Path
from datetime import datetime, timedelta
from typing import Dict, List, Optional, Tuple
import math
import re
sys.path.insert(0, str(Path(__file__).parent))
try:
    import frontmatter
except ImportError:
    print("[ERROR] python-frontmatter not installed")
    sys.exit(1)
# Lazy import to avoid circular dependency
# storage_obsidian imports from this file, so we can't import it at module level
_storage_obsidian = None

def _get_storage():
    """Lazy load storage_obsidian to avoid circular import."""
    global _storage_obsidian
    if _storage_obsidian is None:
        from storage_obsidian import get_vault_path, get_folder_paths, ensure_folder
        _storage_obsidian = {
            'get_vault_path': get_vault_path,
            'get_folder_paths': get_folder_paths,
            'ensure_folder': ensure_folder
        }
    return _storage_obsidian
# ============================================================================
# MEMORY DECAY & CONFIDENCE SCORING
# ============================================================================
def calculate_decay(
    confidence: float,
    last_accessed: datetime,
    half_life_days: int = 30
) -> float:
    """
    Calculate memory decay using an exponential decay model (Ebbinghaus curve).
    Confidence decays unless the memory is reinforced through successful retrieval.

    Args:
        confidence: Current confidence score (0-1)
        last_accessed: When the memory was last accessed
        half_life_days: Days for confidence to decay to 50%

    Returns:
        Decayed confidence score
    """
    days_since_access = (datetime.now() - last_accessed).days
    if days_since_access == 0:
        return confidence
    # Exponential decay: C(t) = C₀ * (0.5)^(t/h), where h is the half-life
    decay_factor = math.pow(0.5, days_since_access / half_life_days)
    decayed_confidence = confidence * decay_factor
    # Don't decay below the minimum retention threshold
    return max(decayed_confidence, 0.1)
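
# Worked example (illustrative, using the default 30-day half-life): with
# confidence 0.8 and a 30-day gap, decay_factor = 0.5 ** 1 = 0.5, so the
# returned score is 0.8 * 0.5 = 0.4; after 45 days it is 0.8 * 0.5 ** 1.5 ≈ 0.283.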

def reinforce_confidence(
    current_confidence: float,
    retrieval_success: bool,
    learning_rate: float = 0.1
) -> float:
    """
    Reinforce or weaken confidence based on retrieval outcome.
    Successful retrievals increase confidence; failures decrease it.

    Args:
        current_confidence: Current score (0-1)
        retrieval_success: Whether retrieval was successful/relevant
        learning_rate: How quickly confidence adjusts (0-1)

    Returns:
        Updated confidence score
    """
    if retrieval_success:
        # Increase confidence, with diminishing returns as it approaches 1
        delta = learning_rate * (1 - current_confidence)
        return min(current_confidence + delta, 1.0)
    else:
        # Decrease confidence proportionally, floored at 0.1
        delta = learning_rate * current_confidence
        return max(current_confidence - delta, 0.1)
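
# Worked example (illustrative, default learning rate 0.1): a score of 0.5
# moves to 0.5 + 0.1 * (1 - 0.5) = 0.55 on a successful retrieval, and to
# 0.5 - 0.1 * 0.5 = 0.45 on a failure; updates shrink as the score nears 1.0.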

def calculate_relevance_score(
    access_count: int,
    last_accessed: datetime,
    confidence: float,
    recency_weight: float = 0.3,
    frequency_weight: float = 0.3,
    confidence_weight: float = 0.4
) -> float:
    """
    Calculate an overall relevance score combining multiple factors.

    Args:
        access_count: Number of times accessed
        last_accessed: Most recent access time
        confidence: Current confidence score
        recency_weight: Weight for recency (default 0.3)
        frequency_weight: Weight for frequency (default 0.3)
        confidence_weight: Weight for confidence (default 0.4)

    Returns:
        Relevance score (0-1)
    """
    # Recency score (exponential decay with a 30-day time constant)
    days_since = (datetime.now() - last_accessed).days
    recency = math.exp(-days_since / 30)
    # Frequency score (logarithmic scaling; reaches 1.0 at 100 accesses)
    frequency = math.log(1 + access_count) / math.log(101)
    # Weighted combination, clamped to 1.0
    relevance = (
        recency * recency_weight +
        frequency * frequency_weight +
        confidence * confidence_weight
    )
    return min(relevance, 1.0)
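
# Worked example (illustrative, default weights): a note accessed today
# (recency = 1.0) with 10 accesses (frequency = ln(11) / ln(101) ≈ 0.52) and
# confidence 0.8 scores 1.0 * 0.3 + 0.52 * 0.3 + 0.8 * 0.4 ≈ 0.78.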
# ============================================================================
# INTELLIGENT TAGGING
# ============================================================================
def extract_tags_from_content(content: str, existing_tags: Optional[List[str]] = None) -> List[str]:
    """
    Extract intelligent tags from content.

    Generates:
    - Concept tags (from domain terms)
    - Entity tags (specific technologies)
    - Action tags (verbs describing operations)

    Args:
        content: Note content
        existing_tags: Tags already assigned

    Returns:
        Sorted list of extracted tags
    """
    existing_tags = existing_tags or []
    extracted = set(existing_tags)
    content_lower = content.lower()
    # Common concept tags
    concept_map = {
        'authentication': ['auth', 'login', 'oauth', 'jwt', 'token'],
        'database': ['sql', 'query', 'schema', 'migration', 'postgresql', 'mongodb'],
        'testing': ['test', 'spec', 'assert', 'mock', 'fixture'],
        'api': ['endpoint', 'route', 'request', 'response', 'rest'],
        'security': ['encrypt', 'hash', 'secure', 'vulnerable', 'xss', 'csrf'],
        'performance': ['optimize', 'cache', 'latency', 'throughput'],
        'architecture': ['pattern', 'design', 'structure', 'component'],
    }
    for concept, keywords in concept_map.items():
        if any(kw in content_lower for kw in keywords):
            extracted.add(concept)
    # Technology entity tags (content is already lowercased, so no IGNORECASE needed)
    tech_patterns = [
        r'\b(react|vue|angular|svelte)\b',
        r'\b(python|javascript|typescript|java|go|rust)\b',
        r'\b(postgres|mysql|mongodb|redis|elasticsearch)\b',
        r'\b(docker|kubernetes|aws|azure|gcp)\b',
        r'\b(jwt|oauth|saml|ldap)\b',
    ]
    for pattern in tech_patterns:
        extracted.update(re.findall(pattern, content_lower))
    return sorted(extracted)
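
# Illustrative call: extract_tags_from_content("Implement JWT authentication
# for secure API access") returns ['authentication', 'jwt', 'security'] —
# 'jwt' fires both the concept map and the entity regex, 'secure' the security list.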

def generate_tag_hierarchy(tags: List[str]) -> Dict[str, List[str]]:
    """
    Organize tags into a hierarchical structure.

    Returns:
        Dict mapping parent categories to child tags
    """
    hierarchy = {
        'technology': [],
        'concept': [],
        'domain': [],
        'pattern': []
    }
    # Categorize tags by keyword family
    tech_keywords = ['python', 'javascript', 'typescript', 'react', 'postgres', 'docker']
    concept_keywords = ['authentication', 'testing', 'security', 'performance']
    pattern_keywords = ['repository', 'service', 'factory', 'singleton']
    for tag in tags:
        tag_lower = tag.lower()
        if any(tech in tag_lower for tech in tech_keywords):
            hierarchy['technology'].append(tag)
        elif any(concept in tag_lower for concept in concept_keywords):
            hierarchy['concept'].append(tag)
        elif any(pattern in tag_lower for pattern in pattern_keywords):
            hierarchy['pattern'].append(tag)
        else:
            hierarchy['domain'].append(tag)
    # Remove empty categories
    return {k: v for k, v in hierarchy.items() if v}
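
# Illustrative call: generate_tag_hierarchy(['python', 'authentication', 'jwt'])
# returns {'technology': ['python'], 'concept': ['authentication'],
# 'domain': ['jwt']} — tags that match no keyword family fall through to 'domain'.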
# ============================================================================
# UPSERT LOGIC - UPDATE EXISTING KNOWLEDGE
# ============================================================================
def find_similar_notes(
    title: str,
    content: str,
    note_type: str,
    threshold: float = 0.7
) -> List[Tuple[Path, float]]:
    """
    Find existing notes that might be duplicates or updates.
    Uses title similarity and content overlap to identify candidates.

    Args:
        title: Note title
        content: Note content
        note_type: Type of note (file-analysis, pattern, decision)
        threshold: Similarity threshold (0-1)

    Returns:
        List of (path, similarity_score) tuples, highest similarity first
    """
    storage = _get_storage()
    folders = storage['get_folder_paths']()
    # Map note type to folder
    folder_map = {
        'file-analysis': folders['files'],
        'pattern': folders['patterns'],
        'decision': folders['decisions'],
        'interaction': folders['interactions']
    }
    search_folder = folder_map.get(note_type)
    if not search_folder or not search_folder.exists():
        return []
    candidates = []
    title_lower = title.lower()
    content_words = set(re.findall(r'\w+', content.lower()))
    for note_file in search_folder.rglob("*.md"):
        try:
            # Cheap title check first; skip clearly unrelated notes
            note_title = note_file.stem.lower()
            title_similarity = compute_string_similarity(title_lower, note_title)
            if title_similarity < 0.5:
                continue
            # Check content overlap
            post = frontmatter.load(note_file)
            note_content_words = set(re.findall(r'\w+', post.content.lower()))
            # Jaccard similarity
            intersection = len(content_words & note_content_words)
            union = len(content_words | note_content_words)
            content_similarity = intersection / union if union > 0 else 0
            # Combined score (weighted average)
            overall_similarity = title_similarity * 0.6 + content_similarity * 0.4
            if overall_similarity >= threshold:
                candidates.append((note_file, overall_similarity))
        except Exception:
            continue
    # Sort by similarity (highest first)
    candidates.sort(key=lambda x: x[1], reverse=True)
    return candidates
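
# Worked example (illustrative): with title similarity 0.8 and content (Jaccard)
# similarity 0.5, the combined score is 0.8 * 0.6 + 0.5 * 0.4 = 0.68, just below
# the default 0.7 threshold, so that note would not be returned as a candidate.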

def compute_string_similarity(s1: str, s2: str) -> float:
    """
    Compute similarity between two strings using word-overlap (Jaccard) similarity.

    Returns:
        Similarity score (0-1)
    """
    words1 = set(s1.split())
    words2 = set(s2.split())
    if not words1 or not words2:
        return 0.0
    intersection = len(words1 & words2)
    union = len(words1 | words2)
    return intersection / union if union > 0 else 0.0
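
# Worked example (illustrative): compute_string_similarity("jwt auth guide",
# "jwt auth notes") shares 2 of 4 distinct words, so it returns 0.5.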

def should_update_existing(
    existing_path: Path,
    new_content: str,
    similarity_score: float
) -> bool:
    """
    Decide whether to update an existing note or create a new one.

    Args:
        existing_path: Path to existing note
        new_content: New content to potentially add
        similarity_score: How similar the notes are (0-1)

    Returns:
        True if the existing note should be updated, False if a new one should be created
    """
    # High similarity -> update existing
    if similarity_score >= 0.85:
        return True
    # Medium similarity -> check whether the new content adds enough to stand alone
    if similarity_score >= 0.7:
        post = frontmatter.load(existing_path)
        existing_length = len(post.content)
        new_length = len(new_content)
        # If the new content is substantially different/longer, keep it separate
        if new_length > existing_length * 1.5:
            return False
        return True
    # Low similarity -> create new
    return False
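
# Illustrative decisions: similarity 0.9 -> update in place; 0.75 with new
# content under 1.5x the existing length -> update; 0.75 with much longer new
# content, or anything below 0.7 -> create a separate note.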

def merge_note_content(
    existing_content: str,
    new_content: str,
    merge_strategy: str = "append"
) -> str:
    """
    Merge new content into an existing note.

    Args:
        existing_content: Current note content
        new_content: New information to add
        merge_strategy: How to merge ("append", "replace", "sections")

    Returns:
        Merged content
    """
    if merge_strategy == "replace":
        return new_content
    elif merge_strategy == "append":
        # Add new content at the end under a separator heading
        return f"{existing_content}\n\n## Updated Information\n\n{new_content}"
    elif merge_strategy == "sections":
        # Section-aware merging is planned; for now, append under a dated heading
        timestamp = datetime.now().strftime("%Y-%m-%d")
        return f"{existing_content}\n\n## Update - {timestamp}\n\n{new_content}"
    # Unknown strategy: leave the existing content untouched
    return existing_content
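
# Illustrative result: merge_note_content(old, new, "sections") yields the old
# body followed by "## Update - YYYY-MM-DD" and the new body, so each merge
# remains visible as a dated section in the note's history.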
# ============================================================================
# SELF-EVALUATION & MAINTENANCE
# ============================================================================
def evaluate_memory_health(vault_path: Optional[Path] = None) -> Dict:
    """
    Evaluate overall memory system health.

    Checks:
    - Low-confidence memories
    - Stale memories (not accessed recently)
    - Duplicate candidates
    - Tag consistency

    Returns:
        Health report dictionary
    """
    if vault_path is None:
        storage = _get_storage()
        vault_path = storage['get_vault_path']()
    report = {
        'total_notes': 0,
        'low_confidence': [],
        'stale_memories': [],
        'duplicate_candidates': [],
        'tag_issues': [],
        'avg_confidence': 0.0,
        'avg_relevance': 0.0
    }
    confidences = []
    relevances = []
    for note_file in vault_path.rglob("*.md"):
        try:
            post = frontmatter.load(note_file)
            if post.get('type') not in ['file-analysis', 'pattern', 'decision']:
                continue
            report['total_notes'] += 1
            # Check confidence
            confidence = post.get('confidence_score', 0.5)
            confidences.append(confidence)
            if confidence < 0.3:
                report['low_confidence'].append(str(note_file.relative_to(vault_path)))
            # Check staleness (frontmatter may parse dates as datetime or leave them as strings)
            last_accessed_val = post.get('last_accessed')
            if last_accessed_val:
                if isinstance(last_accessed_val, datetime):
                    last_accessed = last_accessed_val
                else:
                    last_accessed = datetime.fromisoformat(str(last_accessed_val))
                days_stale = (datetime.now() - last_accessed).days
                if days_stale > 90:
                    report['stale_memories'].append({
                        'path': str(note_file.relative_to(vault_path)),
                        'days_stale': days_stale
                    })
                # Track relevance alongside confidence ('access_count' frontmatter
                # field assumed; defaults to 0 when absent)
                relevances.append(calculate_relevance_score(
                    post.get('access_count', 0), last_accessed, confidence
                ))
            # Check tags
            tags = post.get('tags', [])
            if not tags:
                report['tag_issues'].append(str(note_file.relative_to(vault_path)))
        except Exception:
            continue
    if confidences:
        report['avg_confidence'] = sum(confidences) / len(confidences)
    if relevances:
        report['avg_relevance'] = sum(relevances) / len(relevances)
    return report
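
# Illustrative usage: evaluate_memory_health() walks the vault and returns a
# dict like {'total_notes': 120, 'low_confidence': [...], 'avg_confidence': 0.64,
# ...}; a maintenance job can then re-verify low-confidence or stale entries.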

def consolidate_duplicates(
    duplicate_candidates: List[Tuple[Path, Path, float]],
    auto_merge_threshold: float = 0.95
) -> List[Dict]:
    """
    Consolidate duplicate or near-duplicate memories.

    Args:
        duplicate_candidates: List of (path1, path2, similarity) tuples
        auto_merge_threshold: Automatically merge if similarity is above this

    Returns:
        List of consolidation actions taken
    """
    actions = []
    for path1, path2, similarity in duplicate_candidates:
        if similarity >= auto_merge_threshold:
            # Auto-merge high-similarity duplicates
            try:
                post1 = frontmatter.load(path1)
                post2 = frontmatter.load(path2)
                # Keep the note with the higher confidence as the primary
                conf1 = post1.get('confidence_score', 0.5)
                conf2 = post2.get('confidence_score', 0.5)
                if conf1 >= conf2:
                    primary, post_primary = path1, post1
                    secondary, post_secondary = path2, post2
                else:
                    primary, post_primary = path2, post2
                    secondary, post_secondary = path1, post1
                # Merge the secondary's content into the primary under a dated section
                post_primary.content = merge_note_content(
                    post_primary.content,
                    post_secondary.content,
                    "sections"
                )
                # Update primary
                with open(primary, 'w', encoding='utf-8') as f:
                    f.write(frontmatter.dumps(post_primary))
                # Delete the secondary; its content now lives in the primary
                secondary.unlink()
                actions.append({
                    'action': 'merged',
                    'primary': str(primary),
                    'secondary': str(secondary),
                    'similarity': similarity
                })
            except Exception as e:
                actions.append({
                    'action': 'error',
                    'files': [str(path1), str(path2)],
                    'error': str(e)
                })
    return actions
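
# Illustrative usage: given [(Path('a.md'), Path('b.md'), 0.97)], the lower-
# confidence note is merged into the higher-confidence one under a dated
# "## Update" section and then deleted, and the returned list records
# [{'action': 'merged', ...}]; pairs below 0.95 are left untouched.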
if __name__ == "__main__":
print("Memory Intelligence System")
print("=" * 60)
# Test decay calculation
confidence = 0.8
last_access = datetime.now() - timedelta(days=45)
decayed = calculate_decay(confidence, last_access)
print(f"\nDecay Test:")
print(f" Initial confidence: {confidence}")
print(f" Days since access: 45")
print(f" Decayed confidence: {decayed:.3f}")
# Test reinforcement
reinforced = reinforce_confidence(decayed, True)
print(f"\nReinforcement Test:")
print(f" After successful retrieval: {reinforced:.3f}")
# Test tag extraction
sample = "Implement JWT authentication using jsonwebtoken library for secure API access"
tags = extract_tags_from_content(sample)
print(f"\nTag Extraction Test:")
print(f" Content: {sample}")
print(f" Tags: {tags}")
print("\n[OK] Memory intelligence layer operational")