#!/usr/bin/env python3
"""
PRISM Context Memory - Intelligence Layer

Implements memory decay, self-evaluation, and learning over time.
Based on research in persistent memory systems with confidence scoring.

Key Concepts:
- Memory Decay: Confidence scores decay following Ebbinghaus curve unless reinforced
- Self-Evaluation: Track retrieval success and relevance
- Upsert Logic: Update existing knowledge rather than duplicate
- Confidence Scoring: Increases with successful usage, decays over time
"""

import os
import sys
from pathlib import Path
from datetime import datetime, timedelta
from typing import Dict, List, Optional, Tuple
import math
import re

sys.path.insert(0, str(Path(__file__).parent))

try:
    import frontmatter
except ImportError:
    print("[ERROR] python-frontmatter not installed")
    sys.exit(1)

# Lazy import to avoid circular dependency
# storage_obsidian imports from this file, so we can't import it at module level
_storage_obsidian = None


def _get_storage():
    """Lazy load storage_obsidian to avoid circular import."""
    global _storage_obsidian
    if _storage_obsidian is None:
        from storage_obsidian import get_vault_path, get_folder_paths, ensure_folder
        _storage_obsidian = {
            'get_vault_path': get_vault_path,
            'get_folder_paths': get_folder_paths,
            'ensure_folder': ensure_folder
        }
    return _storage_obsidian


# ============================================================================
# MEMORY DECAY & CONFIDENCE SCORING
# ============================================================================

def calculate_decay(
    confidence: float,
    last_accessed: datetime,
    half_life_days: int = 30
) -> float:
    """
    Calculate memory decay using exponential decay model (Ebbinghaus curve).

    Confidence decays unless memory is reinforced through successful retrieval.

    Args:
        confidence: Current confidence score (0-1)
        last_accessed: When memory was last accessed
        half_life_days: Days for confidence to decay to 50%

    Returns:
        Decayed confidence score
    """
    days_since_access = (datetime.now() - last_accessed).days

    if days_since_access == 0:
        return confidence

    # Exponential decay: C(t) = C₀ * (0.5)^(t/h)
    # where h is half-life
    decay_factor = math.pow(0.5, days_since_access / half_life_days)
    decayed_confidence = confidence * decay_factor

    # Don't decay below minimum threshold
    return max(decayed_confidence, 0.1)
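

# Worked example (illustrative values, not from production data): a memory at
# confidence 0.8 that has not been reinforced for 45 days, with the default
# 30-day half-life, decays to 0.8 * 0.5**(45/30) ≈ 0.283.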


def reinforce_confidence(
    current_confidence: float,
    retrieval_success: bool,
    learning_rate: float = 0.1
) -> float:
    """
    Reinforce or weaken confidence based on retrieval outcome.

    Successful retrievals increase confidence; failures decrease it.

    Args:
        current_confidence: Current score (0-1)
        retrieval_success: Whether retrieval was successful/relevant
        learning_rate: How quickly confidence adjusts (0-1)

    Returns:
        Updated confidence score
    """
    if retrieval_success:
        # Increase confidence, with diminishing returns as it approaches 1
        delta = learning_rate * (1 - current_confidence)
        return min(current_confidence + delta, 1.0)
    else:
        # Decrease confidence
        delta = learning_rate * current_confidence
        return max(current_confidence - delta, 0.1)
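

# Worked example (illustrative): continuing from the decayed value above, one
# successful retrieval at the default learning rate of 0.1 raises 0.283 to
# 0.283 + 0.1 * (1 - 0.283) ≈ 0.355; a failed retrieval would instead drop it
# to 0.283 - 0.1 * 0.283 ≈ 0.255.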


def calculate_relevance_score(
    access_count: int,
    last_accessed: datetime,
    confidence: float,
    recency_weight: float = 0.3,
    frequency_weight: float = 0.3,
    confidence_weight: float = 0.4
) -> float:
    """
    Calculate overall relevance score combining multiple factors.

    Args:
        access_count: Number of times accessed
        last_accessed: Most recent access time
        confidence: Current confidence score
        recency_weight: Weight for recency (default 0.3)
        frequency_weight: Weight for frequency (default 0.3)
        confidence_weight: Weight for confidence (default 0.4)

    Returns:
        Relevance score (0-1)
    """
    # Recency score (exponential decay with a 30-day time constant,
    # i.e. ~37% remaining after 30 days without access)
    days_since = (datetime.now() - last_accessed).days
    recency = math.exp(-days_since / 30)

    # Frequency score (logarithmic scaling; reaches 1.0 at 100 accesses)
    frequency = math.log(1 + access_count) / math.log(101)

    # Weighted combination
    relevance = (
        recency * recency_weight +
        frequency * frequency_weight +
        confidence * confidence_weight
    )

    return min(relevance, 1.0)
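

# Worked example (illustrative, assumed values): a note accessed 10 times,
# last touched 5 days ago, at confidence 0.7:
#   recency   = exp(-5/30)        ≈ 0.846
#   frequency = ln(11) / ln(101)  ≈ 0.520
#   relevance = 0.846*0.3 + 0.520*0.3 + 0.7*0.4 ≈ 0.690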


# ============================================================================
# INTELLIGENT TAGGING
# ============================================================================

def extract_tags_from_content(content: str, existing_tags: List[str] = None) -> List[str]:
    """
    Extract intelligent tags from content.

    Generates:
    - Concept tags (from domain terms)
    - Entity tags (specific technologies)
    - Action tags (verbs describing operations)

    Args:
        content: Note content
        existing_tags: Tags already assigned

    Returns:
        List of extracted tags
    """
    existing_tags = existing_tags or []
    extracted = set(existing_tags)

    content_lower = content.lower()

    # Common concept tags
    concept_map = {
        'authentication': ['auth', 'login', 'oauth', 'jwt', 'token'],
        'database': ['sql', 'query', 'schema', 'migration', 'postgresql', 'mongodb'],
        'testing': ['test', 'spec', 'assert', 'mock', 'fixture'],
        'api': ['endpoint', 'route', 'request', 'response', 'rest'],
        'security': ['encrypt', 'hash', 'secure', 'vulnerable', 'xss', 'csrf'],
        'performance': ['optimize', 'cache', 'latency', 'throughput'],
        'architecture': ['pattern', 'design', 'structure', 'component'],
    }

    for concept, keywords in concept_map.items():
        if any(kw in content_lower for kw in keywords):
            extracted.add(concept)

    # Technology entity tags
    tech_patterns = [
        r'\b(react|vue|angular|svelte)\b',
        r'\b(python|javascript|typescript|java|go|rust)\b',
        r'\b(postgres|mysql|mongodb|redis|elasticsearch)\b',
        r'\b(docker|kubernetes|aws|azure|gcp)\b',
        r'\b(jwt|oauth|saml|ldap)\b',
    ]

    for pattern in tech_patterns:
        matches = re.findall(pattern, content_lower, re.IGNORECASE)
        extracted.update(matches)

    return sorted(list(extracted))
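

# Example (hedged, assumed input): extract_tags_from_content(
#     "Add a Redis cache to cut API endpoint latency")
# matches 'cache'/'latency' -> performance, 'endpoint' -> api, and the
# technology pattern for 'redis', yielding ['api', 'performance', 'redis'].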


def generate_tag_hierarchy(tags: List[str]) -> Dict[str, List[str]]:
    """
    Organize tags into hierarchical structure.

    Returns:
        Dict mapping parent categories to child tags
    """
    hierarchy = {
        'technology': [],
        'concept': [],
        'domain': [],
        'pattern': []
    }

    # Categorize tags
    tech_keywords = ['python', 'javascript', 'typescript', 'react', 'postgres', 'docker']
    concept_keywords = ['authentication', 'testing', 'security', 'performance']
    pattern_keywords = ['repository', 'service', 'factory', 'singleton']

    for tag in tags:
        tag_lower = tag.lower()
        if any(tech in tag_lower for tech in tech_keywords):
            hierarchy['technology'].append(tag)
        elif any(concept in tag_lower for concept in concept_keywords):
            hierarchy['concept'].append(tag)
        elif any(pattern in tag_lower for pattern in pattern_keywords):
            hierarchy['pattern'].append(tag)
        else:
            hierarchy['domain'].append(tag)

    # Remove empty categories
    return {k: v for k, v in hierarchy.items() if v}
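

# Example (hedged): generate_tag_hierarchy(['python', 'authentication', 'jwt'])
# returns {'technology': ['python'], 'concept': ['authentication'],
# 'domain': ['jwt']}; 'jwt' falls through to 'domain' because it matches
# neither the technology, concept, nor pattern keyword lists.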


# ============================================================================
# UPSERT LOGIC - UPDATE EXISTING KNOWLEDGE
# ============================================================================

def find_similar_notes(
    title: str,
    content: str,
    note_type: str,
    threshold: float = 0.7
) -> List[Tuple[Path, float]]:
    """
    Find existing notes that might be duplicates or updates.

    Uses title similarity and content overlap to identify candidates.

    Args:
        title: Note title
        content: Note content
        note_type: Type of note (file-analysis, pattern, decision)
        threshold: Similarity threshold (0-1)

    Returns:
        List of (path, similarity_score) tuples
    """
    storage = _get_storage()
    folders = storage['get_folder_paths']()
    vault = storage['get_vault_path']()

    # Map note type to folder
    folder_map = {
        'file-analysis': folders['files'],
        'pattern': folders['patterns'],
        'decision': folders['decisions'],
        'interaction': folders['interactions']
    }

    search_folder = folder_map.get(note_type)
    if not search_folder or not search_folder.exists():
        return []

    candidates = []
    title_lower = title.lower()
    content_words = set(re.findall(r'\w+', content.lower()))

    for note_file in search_folder.rglob("*.md"):
        try:
            # Check title similarity
            note_title = note_file.stem.lower()
            title_similarity = compute_string_similarity(title_lower, note_title)

            if title_similarity < 0.5:
                continue

            # Check content overlap
            post = frontmatter.load(note_file)
            note_content_words = set(re.findall(r'\w+', post.content.lower()))

            # Jaccard similarity
            intersection = len(content_words & note_content_words)
            union = len(content_words | note_content_words)
            content_similarity = intersection / union if union > 0 else 0

            # Combined score (weighted average)
            overall_similarity = (title_similarity * 0.6 + content_similarity * 0.4)

            if overall_similarity >= threshold:
                candidates.append((note_file, overall_similarity))

        except Exception:
            continue

    # Sort by similarity (highest first)
    candidates.sort(key=lambda x: x[1], reverse=True)
    return candidates
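

# Scoring example (illustrative): a candidate with title similarity 0.8 and
# content (Jaccard) similarity 0.5 scores 0.8*0.6 + 0.5*0.4 = 0.68, which is
# below the default threshold of 0.7 and is therefore not returned.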


def compute_string_similarity(s1: str, s2: str) -> float:
    """
    Compute similarity between two strings using word-overlap (Jaccard) similarity.

    Returns:
        Similarity score (0-1)
    """
    # Simple word overlap method
    words1 = set(s1.split())
    words2 = set(s2.split())

    if not words1 or not words2:
        return 0.0

    intersection = len(words1 & words2)
    union = len(words1 | words2)

    return intersection / union if union > 0 else 0.0
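

# Example (hedged): compute_string_similarity("jwt auth middleware",
# "auth middleware jwt refactor") shares 3 words out of 4 unique words,
# giving 3/4 = 0.75.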


def should_update_existing(
    existing_path: Path,
    new_content: str,
    similarity_score: float
) -> bool:
    """
    Decide whether to update an existing note or create a new one.

    Args:
        existing_path: Path to existing note
        new_content: New content to potentially add
        similarity_score: How similar the notes are (0-1)

    Returns:
        True if we should update, False if we should create a new note
    """
    # High similarity -> update existing
    if similarity_score >= 0.85:
        return True

    # Medium similarity -> check if new content adds value
    if similarity_score >= 0.7:
        post = frontmatter.load(existing_path)
        existing_length = len(post.content)
        new_length = len(new_content)

        # If the new content is substantially longer, keep it as a separate note
        if new_length > existing_length * 1.5:
            return False

        return True

    # Low similarity -> create new
    return False
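

# Decision example (illustrative): at similarity 0.75, an existing 800-character
# note and a 1500-character update stay as separate notes (1500 > 800 * 1.5);
# a 900-character update would be merged into the existing note instead.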


def merge_note_content(
    existing_content: str,
    new_content: str,
    merge_strategy: str = "append"
) -> str:
    """
    Intelligently merge new content into existing note.

    Args:
        existing_content: Current note content
        new_content: New information to add
        merge_strategy: How to merge ("append", "replace", "sections")

    Returns:
        Merged content
    """
    if merge_strategy == "replace":
        return new_content

    elif merge_strategy == "append":
        # Add new content at end with separator
        return f"{existing_content}\n\n## Updated Information\n\n{new_content}"

    elif merge_strategy == "sections":
        # Merge by sections (smarter merging)
        # For now, append with date
        timestamp = datetime.now().strftime("%Y-%m-%d")
        return f"{existing_content}\n\n## Update - {timestamp}\n\n{new_content}"

    return existing_content
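

# Typical upsert flow (sketch; assumes the caller persists notes itself, and
# the write/create steps below are placeholders, not functions in this module):
#
#   candidates = find_similar_notes(title, content, "pattern")
#   if candidates and should_update_existing(candidates[0][0], content, candidates[0][1]):
#       post = frontmatter.load(candidates[0][0])
#       post.content = merge_note_content(post.content, content, "sections")
#       # ... write post back to candidates[0][0]
#   else:
#       # ... create a new note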


# ============================================================================
# SELF-EVALUATION & MAINTENANCE
# ============================================================================

def evaluate_memory_health(vault_path: Path = None) -> Dict:
    """
    Evaluate overall memory system health.

    Checks:
    - Low-confidence memories
    - Stale memories (not accessed recently)
    - Duplicate candidates
    - Tag consistency

    Returns:
        Health report dictionary
    """
    if vault_path is None:
        storage = _get_storage()
        vault_path = storage['get_vault_path']()

    report = {
        'total_notes': 0,
        'low_confidence': [],
        'stale_memories': [],
        'duplicate_candidates': [],
        'tag_issues': [],
        'avg_confidence': 0.0,
        'avg_relevance': 0.0
    }

    confidences = []
    relevances = []

    for note_file in vault_path.rglob("*.md"):
        try:
            post = frontmatter.load(note_file)

            if post.get('type') not in ['file-analysis', 'pattern', 'decision']:
                continue

            report['total_notes'] += 1

            # Check confidence
            confidence = post.get('confidence_score', 0.5)
            confidences.append(confidence)

            if confidence < 0.3:
                report['low_confidence'].append(str(note_file.relative_to(vault_path)))

            # Check staleness
            last_accessed_str = post.get('last_accessed')
            if last_accessed_str:
                last_accessed = datetime.fromisoformat(last_accessed_str)
                days_stale = (datetime.now() - last_accessed).days

                if days_stale > 90:
                    report['stale_memories'].append({
                        'path': str(note_file.relative_to(vault_path)),
                        'days_stale': days_stale
                    })

                # Track relevance (assumes an optional 'access_count' field in
                # the frontmatter; defaults to 0 if it was never recorded)
                relevances.append(calculate_relevance_score(
                    post.get('access_count', 0), last_accessed, confidence
                ))

            # Check tags
            tags = post.get('tags', [])
            if not tags:
                report['tag_issues'].append(str(note_file.relative_to(vault_path)))

        except Exception:
            continue

    if confidences:
        report['avg_confidence'] = sum(confidences) / len(confidences)
    if relevances:
        report['avg_relevance'] = sum(relevances) / len(relevances)

    return report
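

# Maintenance sketch (hedged): a periodic job could combine this report with
# decay, e.g. re-score every entry in report['stale_memories'] via
# calculate_decay() and flag report['low_confidence'] entries for review.
# Scheduling and persistence are left to the caller.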


def consolidate_duplicates(
    duplicate_candidates: List[Tuple[Path, Path, float]],
    auto_merge_threshold: float = 0.95
) -> List[Dict]:
    """
    Consolidate duplicate or near-duplicate memories.

    Args:
        duplicate_candidates: List of (path1, path2, similarity) tuples
        auto_merge_threshold: Automatically merge if similarity is above this

    Returns:
        List of consolidation actions taken
    """
    actions = []

    for path1, path2, similarity in duplicate_candidates:
        if similarity >= auto_merge_threshold:
            # Auto-merge high-similarity duplicates
            try:
                post1 = frontmatter.load(path1)
                post2 = frontmatter.load(path2)

                # Keep the note with higher confidence as the primary
                conf1 = post1.get('confidence_score', 0.5)
                conf2 = post2.get('confidence_score', 0.5)

                if conf1 >= conf2:
                    primary, secondary = path1, path2
                    post_primary, post_secondary = post1, post2
                else:
                    primary, secondary = path2, path1
                    post_primary, post_secondary = post2, post1

                # Merge content into the primary note
                merged_content = merge_note_content(
                    post_primary.content,
                    post_secondary.content,
                    "sections"
                )

                # Update primary
                post_primary.content = merged_content

                with open(primary, 'w', encoding='utf-8') as f:
                    f.write(frontmatter.dumps(post_primary))

                # Delete secondary; its content now lives in the primary note
                secondary.unlink()

                actions.append({
                    'action': 'merged',
                    'primary': str(primary),
                    'secondary': str(secondary),
                    'similarity': similarity
                })

            except Exception as e:
                actions.append({
                    'action': 'error',
                    'files': [str(path1), str(path2)],
                    'error': str(e)
                })

    return actions
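

# Usage sketch (hedged): duplicate_candidates is expected as
# [(path_a, path_b, similarity), ...], e.g. built by pairing results from
# find_similar_notes(); only pairs at or above auto_merge_threshold (default
# 0.95) are merged, everything else is left untouched for manual review.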


if __name__ == "__main__":
    print("Memory Intelligence System")
    print("=" * 60)

    # Test decay calculation
    confidence = 0.8
    last_access = datetime.now() - timedelta(days=45)
    decayed = calculate_decay(confidence, last_access)
    print(f"\nDecay Test:")
    print(f" Initial confidence: {confidence}")
    print(f" Days since access: 45")
    print(f" Decayed confidence: {decayed:.3f}")

    # Test reinforcement
    reinforced = reinforce_confidence(decayed, True)
    print(f"\nReinforcement Test:")
    print(f" After successful retrieval: {reinforced:.3f}")

    # Test tag extraction
    sample = "Implement JWT authentication using jsonwebtoken library for secure API access"
    tags = extract_tags_from_content(sample)
    print(f"\nTag Extraction Test:")
    print(f" Content: {sample}")
    print(f" Tags: {tags}")
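
    # Additional illustrative checks (values assumed for demonstration only)
    relevance = calculate_relevance_score(
        access_count=10,
        last_accessed=datetime.now() - timedelta(days=5),
        confidence=0.7
    )
    print(f"\nRelevance Test:")
    print(f" 10 accesses, 5 days old, confidence 0.7 -> {relevance:.3f}")

    similarity = compute_string_similarity("jwt auth middleware", "auth middleware jwt refactor")
    print(f"\nSimilarity Test:")
    print(f" Word-overlap similarity -> {similarity:.2f}")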

    print("\n[OK] Memory intelligence layer operational")