Initial commit

Zhongwei Li
2025-11-30 08:51:34 +08:00
commit acde81dcfe
59 changed files with 22282 additions and 0 deletions


@@ -0,0 +1,574 @@
#!/usr/bin/env python3
"""
PRISM Context Memory - Intelligence Layer
Implements memory decay, self-evaluation, and learning over time.
Based on research in persistent memory systems with confidence scoring.
Key Concepts:
- Memory Decay: Confidence scores decay following Ebbinghaus curve unless reinforced
- Self-Evaluation: Track retrieval success and relevance
- Upsert Logic: Update existing knowledge rather than duplicate
- Confidence Scoring: Increases with successful usage, decays over time
"""
import os
import sys
from pathlib import Path
from datetime import datetime, timedelta
from typing import Dict, List, Optional, Tuple
import math
import re
sys.path.insert(0, str(Path(__file__).parent))
try:
    import frontmatter
except ImportError:
    print("[ERROR] python-frontmatter not installed")
    sys.exit(1)
# Lazy import to avoid circular dependency
# storage_obsidian imports from this file, so we can't import it at module level
_storage_obsidian = None

def _get_storage():
    """Lazy load storage_obsidian to avoid circular import."""
    global _storage_obsidian
    if _storage_obsidian is None:
        from storage_obsidian import get_vault_path, get_folder_paths, ensure_folder
        _storage_obsidian = {
            'get_vault_path': get_vault_path,
            'get_folder_paths': get_folder_paths,
            'ensure_folder': ensure_folder
        }
    return _storage_obsidian
# ============================================================================
# MEMORY DECAY & CONFIDENCE SCORING
# ============================================================================
def calculate_decay(
    confidence: float,
    last_accessed: datetime,
    half_life_days: int = 30
) -> float:
    """
    Calculate memory decay using an exponential decay model (Ebbinghaus curve).
    Confidence decays unless the memory is reinforced through successful retrieval.

    Args:
        confidence: Current confidence score (0-1)
        last_accessed: When the memory was last accessed
        half_life_days: Days for confidence to decay to 50%

    Returns:
        Decayed confidence score
    """
    days_since_access = (datetime.now() - last_accessed).days
    if days_since_access == 0:
        return confidence
    # Exponential decay: C(t) = C₀ * (0.5)^(t/h), where h is the half-life
    decay_factor = math.pow(0.5, days_since_access / half_life_days)
    decayed_confidence = confidence * decay_factor
    # Don't decay below the minimum retention threshold
    return max(decayed_confidence, 0.1)
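
# Worked example (illustrative, using the default 30-day half-life): with
# confidence 0.8 and a 30-day gap, decay_factor = 0.5 ** 1 = 0.5, so the
# returned score is 0.8 * 0.5 = 0.4; after 45 days it is 0.8 * 0.5 ** 1.5 ≈ 0.283.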

def reinforce_confidence(
    current_confidence: float,
    retrieval_success: bool,
    learning_rate: float = 0.1
) -> float:
    """
    Reinforce or weaken confidence based on retrieval outcome.
    Successful retrievals increase confidence; failures decrease it.

    Args:
        current_confidence: Current score (0-1)
        retrieval_success: Whether retrieval was successful/relevant
        learning_rate: How quickly confidence adjusts (0-1)

    Returns:
        Updated confidence score
    """
    if retrieval_success:
        # Increase confidence, with diminishing returns as it approaches 1
        delta = learning_rate * (1 - current_confidence)
        return min(current_confidence + delta, 1.0)
    else:
        # Decrease confidence proportionally, floored at 0.1
        delta = learning_rate * current_confidence
        return max(current_confidence - delta, 0.1)
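
# Worked example (illustrative, default learning rate 0.1): a score of 0.5
# moves to 0.5 + 0.1 * (1 - 0.5) = 0.55 on a successful retrieval, and to
# 0.5 - 0.1 * 0.5 = 0.45 on a failure; updates shrink as the score nears 1.0.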

def calculate_relevance_score(
    access_count: int,
    last_accessed: datetime,
    confidence: float,
    recency_weight: float = 0.3,
    frequency_weight: float = 0.3,
    confidence_weight: float = 0.4
) -> float:
    """
    Calculate an overall relevance score combining multiple factors.

    Args:
        access_count: Number of times accessed
        last_accessed: Most recent access time
        confidence: Current confidence score
        recency_weight: Weight for recency (default 0.3)
        frequency_weight: Weight for frequency (default 0.3)
        confidence_weight: Weight for confidence (default 0.4)

    Returns:
        Relevance score (0-1)
    """
    # Recency score (exponential decay with a 30-day time constant)
    days_since = (datetime.now() - last_accessed).days
    recency = math.exp(-days_since / 30)
    # Frequency score (logarithmic scaling; reaches 1.0 at 100 accesses)
    frequency = math.log(1 + access_count) / math.log(101)
    # Weighted combination, clamped to 1.0
    relevance = (
        recency * recency_weight +
        frequency * frequency_weight +
        confidence * confidence_weight
    )
    return min(relevance, 1.0)
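
# Worked example (illustrative, default weights): a note accessed today
# (recency = 1.0) with 10 accesses (frequency = ln(11) / ln(101) ≈ 0.52) and
# confidence 0.8 scores 1.0 * 0.3 + 0.52 * 0.3 + 0.8 * 0.4 ≈ 0.78.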
# ============================================================================
# INTELLIGENT TAGGING
# ============================================================================
def extract_tags_from_content(content: str, existing_tags: Optional[List[str]] = None) -> List[str]:
    """
    Extract intelligent tags from content.

    Generates:
    - Concept tags (from domain terms)
    - Entity tags (specific technologies)
    - Action tags (verbs describing operations)

    Args:
        content: Note content
        existing_tags: Tags already assigned

    Returns:
        Sorted list of extracted tags
    """
    existing_tags = existing_tags or []
    extracted = set(existing_tags)
    content_lower = content.lower()
    # Common concept tags
    concept_map = {
        'authentication': ['auth', 'login', 'oauth', 'jwt', 'token'],
        'database': ['sql', 'query', 'schema', 'migration', 'postgresql', 'mongodb'],
        'testing': ['test', 'spec', 'assert', 'mock', 'fixture'],
        'api': ['endpoint', 'route', 'request', 'response', 'rest'],
        'security': ['encrypt', 'hash', 'secure', 'vulnerable', 'xss', 'csrf'],
        'performance': ['optimize', 'cache', 'latency', 'throughput'],
        'architecture': ['pattern', 'design', 'structure', 'component'],
    }
    for concept, keywords in concept_map.items():
        if any(kw in content_lower for kw in keywords):
            extracted.add(concept)
    # Technology entity tags (content is already lowercased, so no IGNORECASE needed)
    tech_patterns = [
        r'\b(react|vue|angular|svelte)\b',
        r'\b(python|javascript|typescript|java|go|rust)\b',
        r'\b(postgres|mysql|mongodb|redis|elasticsearch)\b',
        r'\b(docker|kubernetes|aws|azure|gcp)\b',
        r'\b(jwt|oauth|saml|ldap)\b',
    ]
    for pattern in tech_patterns:
        extracted.update(re.findall(pattern, content_lower))
    return sorted(extracted)
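
# Illustrative call: extract_tags_from_content("Implement JWT authentication
# for secure API access") returns ['authentication', 'jwt', 'security'] —
# 'jwt' fires both the concept map and the entity regex, 'secure' the security list.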

def generate_tag_hierarchy(tags: List[str]) -> Dict[str, List[str]]:
    """
    Organize tags into a hierarchical structure.

    Returns:
        Dict mapping parent categories to child tags
    """
    hierarchy = {
        'technology': [],
        'concept': [],
        'domain': [],
        'pattern': []
    }
    # Categorize tags by keyword family
    tech_keywords = ['python', 'javascript', 'typescript', 'react', 'postgres', 'docker']
    concept_keywords = ['authentication', 'testing', 'security', 'performance']
    pattern_keywords = ['repository', 'service', 'factory', 'singleton']
    for tag in tags:
        tag_lower = tag.lower()
        if any(tech in tag_lower for tech in tech_keywords):
            hierarchy['technology'].append(tag)
        elif any(concept in tag_lower for concept in concept_keywords):
            hierarchy['concept'].append(tag)
        elif any(pattern in tag_lower for pattern in pattern_keywords):
            hierarchy['pattern'].append(tag)
        else:
            hierarchy['domain'].append(tag)
    # Remove empty categories
    return {k: v for k, v in hierarchy.items() if v}
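
# Illustrative call: generate_tag_hierarchy(['python', 'authentication', 'jwt'])
# returns {'technology': ['python'], 'concept': ['authentication'],
# 'domain': ['jwt']} — tags that match no keyword family fall through to 'domain'.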
# ============================================================================
# UPSERT LOGIC - UPDATE EXISTING KNOWLEDGE
# ============================================================================
def find_similar_notes(
    title: str,
    content: str,
    note_type: str,
    threshold: float = 0.7
) -> List[Tuple[Path, float]]:
    """
    Find existing notes that might be duplicates or updates.
    Uses title similarity and content overlap to identify candidates.

    Args:
        title: Note title
        content: Note content
        note_type: Type of note (file-analysis, pattern, decision)
        threshold: Similarity threshold (0-1)

    Returns:
        List of (path, similarity_score) tuples, highest similarity first
    """
    storage = _get_storage()
    folders = storage['get_folder_paths']()
    # Map note type to folder
    folder_map = {
        'file-analysis': folders['files'],
        'pattern': folders['patterns'],
        'decision': folders['decisions'],
        'interaction': folders['interactions']
    }
    search_folder = folder_map.get(note_type)
    if not search_folder or not search_folder.exists():
        return []
    candidates = []
    title_lower = title.lower()
    content_words = set(re.findall(r'\w+', content.lower()))
    for note_file in search_folder.rglob("*.md"):
        try:
            # Cheap title check first; skip clearly unrelated notes
            note_title = note_file.stem.lower()
            title_similarity = compute_string_similarity(title_lower, note_title)
            if title_similarity < 0.5:
                continue
            # Check content overlap
            post = frontmatter.load(note_file)
            note_content_words = set(re.findall(r'\w+', post.content.lower()))
            # Jaccard similarity
            intersection = len(content_words & note_content_words)
            union = len(content_words | note_content_words)
            content_similarity = intersection / union if union > 0 else 0
            # Combined score (weighted average)
            overall_similarity = title_similarity * 0.6 + content_similarity * 0.4
            if overall_similarity >= threshold:
                candidates.append((note_file, overall_similarity))
        except Exception:
            continue
    # Sort by similarity (highest first)
    candidates.sort(key=lambda x: x[1], reverse=True)
    return candidates
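
# Worked example (illustrative): with title similarity 0.8 and content (Jaccard)
# similarity 0.5, the combined score is 0.8 * 0.6 + 0.5 * 0.4 = 0.68, just below
# the default 0.7 threshold, so that note would not be returned as a candidate.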

def compute_string_similarity(s1: str, s2: str) -> float:
    """
    Compute similarity between two strings using word-overlap (Jaccard) similarity.

    Returns:
        Similarity score (0-1)
    """
    words1 = set(s1.split())
    words2 = set(s2.split())
    if not words1 or not words2:
        return 0.0
    intersection = len(words1 & words2)
    union = len(words1 | words2)
    return intersection / union if union > 0 else 0.0
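
# Worked example (illustrative): compute_string_similarity("jwt auth guide",
# "jwt auth notes") shares 2 of 4 distinct words, so it returns 0.5.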

def should_update_existing(
    existing_path: Path,
    new_content: str,
    similarity_score: float
) -> bool:
    """
    Decide whether to update an existing note or create a new one.

    Args:
        existing_path: Path to existing note
        new_content: New content to potentially add
        similarity_score: How similar the notes are (0-1)

    Returns:
        True if the existing note should be updated, False if a new one should be created
    """
    # High similarity -> update existing
    if similarity_score >= 0.85:
        return True
    # Medium similarity -> check whether the new content adds enough to stand alone
    if similarity_score >= 0.7:
        post = frontmatter.load(existing_path)
        existing_length = len(post.content)
        new_length = len(new_content)
        # If the new content is substantially different/longer, keep it separate
        if new_length > existing_length * 1.5:
            return False
        return True
    # Low similarity -> create new
    return False
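
# Illustrative decisions: similarity 0.9 -> update in place; 0.75 with new
# content under 1.5x the existing length -> update; 0.75 with much longer new
# content, or anything below 0.7 -> create a separate note.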

def merge_note_content(
    existing_content: str,
    new_content: str,
    merge_strategy: str = "append"
) -> str:
    """
    Merge new content into an existing note.

    Args:
        existing_content: Current note content
        new_content: New information to add
        merge_strategy: How to merge ("append", "replace", "sections")

    Returns:
        Merged content
    """
    if merge_strategy == "replace":
        return new_content
    elif merge_strategy == "append":
        # Add new content at the end under a separator heading
        return f"{existing_content}\n\n## Updated Information\n\n{new_content}"
    elif merge_strategy == "sections":
        # Section-aware merging is planned; for now, append under a dated heading
        timestamp = datetime.now().strftime("%Y-%m-%d")
        return f"{existing_content}\n\n## Update - {timestamp}\n\n{new_content}"
    # Unknown strategy: leave the existing content untouched
    return existing_content
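
# Illustrative result: merge_note_content(old, new, "sections") yields the old
# body followed by "## Update - YYYY-MM-DD" and the new body, so each merge
# remains visible as a dated section in the note's history.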
# ============================================================================
# SELF-EVALUATION & MAINTENANCE
# ============================================================================
def evaluate_memory_health(vault_path: Optional[Path] = None) -> Dict:
    """
    Evaluate overall memory system health.

    Checks:
    - Low-confidence memories
    - Stale memories (not accessed recently)
    - Duplicate candidates
    - Tag consistency

    Returns:
        Health report dictionary
    """
    if vault_path is None:
        storage = _get_storage()
        vault_path = storage['get_vault_path']()
    report = {
        'total_notes': 0,
        'low_confidence': [],
        'stale_memories': [],
        'duplicate_candidates': [],
        'tag_issues': [],
        'avg_confidence': 0.0,
        'avg_relevance': 0.0
    }
    confidences = []
    relevances = []
    for note_file in vault_path.rglob("*.md"):
        try:
            post = frontmatter.load(note_file)
            if post.get('type') not in ['file-analysis', 'pattern', 'decision']:
                continue
            report['total_notes'] += 1
            # Check confidence
            confidence = post.get('confidence_score', 0.5)
            confidences.append(confidence)
            if confidence < 0.3:
                report['low_confidence'].append(str(note_file.relative_to(vault_path)))
            # Check staleness (frontmatter may parse dates as datetime or leave them as strings)
            last_accessed_val = post.get('last_accessed')
            if last_accessed_val:
                if isinstance(last_accessed_val, datetime):
                    last_accessed = last_accessed_val
                else:
                    last_accessed = datetime.fromisoformat(str(last_accessed_val))
                days_stale = (datetime.now() - last_accessed).days
                if days_stale > 90:
                    report['stale_memories'].append({
                        'path': str(note_file.relative_to(vault_path)),
                        'days_stale': days_stale
                    })
                # Track relevance alongside confidence ('access_count' frontmatter
                # field assumed; defaults to 0 when absent)
                relevances.append(calculate_relevance_score(
                    post.get('access_count', 0), last_accessed, confidence
                ))
            # Check tags
            tags = post.get('tags', [])
            if not tags:
                report['tag_issues'].append(str(note_file.relative_to(vault_path)))
        except Exception:
            continue
    if confidences:
        report['avg_confidence'] = sum(confidences) / len(confidences)
    if relevances:
        report['avg_relevance'] = sum(relevances) / len(relevances)
    return report
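
# Illustrative usage: evaluate_memory_health() walks the vault and returns a
# dict like {'total_notes': 120, 'low_confidence': [...], 'avg_confidence': 0.64,
# ...}; a maintenance job can then re-verify low-confidence or stale entries.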

def consolidate_duplicates(
    duplicate_candidates: List[Tuple[Path, Path, float]],
    auto_merge_threshold: float = 0.95
) -> List[Dict]:
    """
    Consolidate duplicate or near-duplicate memories.

    Args:
        duplicate_candidates: List of (path1, path2, similarity) tuples
        auto_merge_threshold: Automatically merge if similarity is above this

    Returns:
        List of consolidation actions taken
    """
    actions = []
    for path1, path2, similarity in duplicate_candidates:
        if similarity >= auto_merge_threshold:
            # Auto-merge high-similarity duplicates
            try:
                post1 = frontmatter.load(path1)
                post2 = frontmatter.load(path2)
                # Keep the note with the higher confidence as the primary
                conf1 = post1.get('confidence_score', 0.5)
                conf2 = post2.get('confidence_score', 0.5)
                if conf1 >= conf2:
                    primary, post_primary = path1, post1
                    secondary, post_secondary = path2, post2
                else:
                    primary, post_primary = path2, post2
                    secondary, post_secondary = path1, post1
                # Merge the secondary's content into the primary under a dated section
                post_primary.content = merge_note_content(
                    post_primary.content,
                    post_secondary.content,
                    "sections"
                )
                # Update primary
                with open(primary, 'w', encoding='utf-8') as f:
                    f.write(frontmatter.dumps(post_primary))
                # Delete the secondary; its content now lives in the primary
                secondary.unlink()
                actions.append({
                    'action': 'merged',
                    'primary': str(primary),
                    'secondary': str(secondary),
                    'similarity': similarity
                })
            except Exception as e:
                actions.append({
                    'action': 'error',
                    'files': [str(path1), str(path2)],
                    'error': str(e)
                })
    return actions
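
# Illustrative usage: given [(Path('a.md'), Path('b.md'), 0.97)], the lower-
# confidence note is merged into the higher-confidence one under a dated
# "## Update" section and then deleted, and the returned list records
# [{'action': 'merged', ...}]; pairs below 0.95 are left untouched.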
if __name__ == "__main__":
print("Memory Intelligence System")
print("=" * 60)
# Test decay calculation
confidence = 0.8
last_access = datetime.now() - timedelta(days=45)
decayed = calculate_decay(confidence, last_access)
print(f"\nDecay Test:")
print(f" Initial confidence: {confidence}")
print(f" Days since access: 45")
print(f" Decayed confidence: {decayed:.3f}")
# Test reinforcement
reinforced = reinforce_confidence(decayed, True)
print(f"\nReinforcement Test:")
print(f" After successful retrieval: {reinforced:.3f}")
# Test tag extraction
sample = "Implement JWT authentication using jsonwebtoken library for secure API access"
tags = extract_tags_from_content(sample)
print(f"\nTag Extraction Test:")
print(f" Content: {sample}")
print(f" Tags: {tags}")
print("\n[OK] Memory intelligence layer operational")