From 6a89ebc4a7f046edee364b7588e6abc1de64a599 Mon Sep 17 00:00:00 2001
From: Zhongwei Li
Date: Sun, 30 Nov 2025 08:46:56 +0800
Subject: [PATCH] Initial commit

---
 .claude-plugin/plugin.json              |  12 +
 README.md                               |   3 +
 plugin.lock.json                        |  48 +++
 skills/evaluator/SKILL.md               | 375 +++++++++++++++++++++
 skills/evaluator/scripts/track_event.py | 412 ++++++++++++++++++++++++
 5 files changed, 850 insertions(+)
 create mode 100644 .claude-plugin/plugin.json
 create mode 100644 README.md
 create mode 100644 plugin.lock.json
 create mode 100644 skills/evaluator/SKILL.md
 create mode 100644 skills/evaluator/scripts/track_event.py

diff --git a/.claude-plugin/plugin.json b/.claude-plugin/plugin.json
new file mode 100644
index 0000000..720b79b
--- /dev/null
+++ b/.claude-plugin/plugin.json
@@ -0,0 +1,12 @@
+{
+  "name": "evaluator",
+  "description": "Skill evaluation and telemetry framework. Collects anonymous usage data and feedback via GitHub Issues and Projects. Privacy-first, opt-in, transparent. Helps improve ClaudeShack skills based on real-world usage.",
+  "version": "0.0.0-2025.11.28",
+  "author": {
+    "name": "Overlord-Z",
+    "email": "[email protected]"
+  },
+  "skills": [
+    "./skills/evaluator"
+  ]
+}
\ No newline at end of file
diff --git a/README.md b/README.md
new file mode 100644
index 0000000..e78ec81
--- /dev/null
+++ b/README.md
@@ -0,0 +1,3 @@
+# evaluator
+
+Skill evaluation and telemetry framework. Collects anonymous usage data and feedback via GitHub Issues and Projects. Privacy-first, opt-in, transparent. Helps improve ClaudeShack skills based on real-world usage.
diff --git a/plugin.lock.json b/plugin.lock.json
new file mode 100644
index 0000000..75f8650
--- /dev/null
+++ b/plugin.lock.json
@@ -0,0 +1,48 @@
+{
+  "$schema": "internal://schemas/plugin.lock.v1.json",
+  "pluginId": "gh:Overlord-Z/ClaudeShack:evaluator",
+  "normalized": {
+    "repo": null,
+    "ref": "refs/tags/v20251128.0",
+    "commit": "94c55623aab7fa19ae35e42ef99a0ecbfe4f7076",
+    "treeHash": "0469bc5d6836792b05db84ece16735b5ae9e2f4232eac94a5373976e77612916",
+    "generatedAt": "2025-11-28T10:12:23.865375Z",
+    "toolVersion": "publish_plugins.py@0.2.0"
+  },
+  "origin": {
+    "remote": "git@github.com:zhongweili/42plugin-data.git",
+    "branch": "master",
+    "commit": "aa1497ed0949fd50e99e70d6324a29c5b34f9390",
+    "repoRoot": "/Users/zhongweili/projects/openmind/42plugin-data"
+  },
+  "manifest": {
+    "name": "evaluator",
+    "description": "Skill evaluation and telemetry framework. Collects anonymous usage data and feedback via GitHub Issues and Projects. Privacy-first, opt-in, transparent. Helps improve ClaudeShack skills based on real-world usage."
+  },
+  "content": {
+    "files": [
+      {
+        "path": "README.md",
+        "sha256": "197002bfe8849dbe0ca5e87e24a41f5a47e877de947a1a4df41e7384735bcd3d"
+      },
+      {
+        "path": ".claude-plugin/plugin.json",
+        "sha256": "5583be9c595cb5f3a128d6204a882a13503771db5f97b11ac5ef929ea753ba0f"
+      },
+      {
+        "path": "skills/evaluator/SKILL.md",
+        "sha256": "bad017d01abd9fd91534c328aad41d5f43f1b33448238f86d95ed9cd52f24e17"
+      },
+      {
+        "path": "skills/evaluator/scripts/track_event.py",
+        "sha256": "f20b72cccb991433d78ff2993e788e43875737bf524d4472fe11845689f5adab"
+      }
+    ],
+    "dirSha256": "0469bc5d6836792b05db84ece16735b5ae9e2f4232eac94a5373976e77612916"
+  },
+  "security": {
+    "scannedAt": null,
+    "scannerVersion": null,
+    "flags": []
+  }
+}
\ No newline at end of file
diff --git a/skills/evaluator/SKILL.md b/skills/evaluator/SKILL.md
new file mode 100644
index 0000000..67c41d3
--- /dev/null
+++ b/skills/evaluator/SKILL.md
@@ -0,0 +1,375 @@
+---
+name: evaluator
+description: Skill evaluation and telemetry framework. Collects anonymous usage data and feedback via GitHub Issues and Projects. Privacy-first, opt-in, transparent. Helps improve ClaudeShack skills based on real-world usage. Integrates with oracle and guardian.
+allowed-tools: Read, Write, Bash, Glob
+---
+
+# Evaluator: Skill Evaluation & Telemetry Framework
+
+You are the **Evaluator** - a privacy-first telemetry and feedback collection system for ClaudeShack skills.
+
+## Core Principles
+
+1. **Privacy First**: All telemetry is anonymous and opt-in
+2. **Transparency**: Users know exactly what data is collected
+3. **Easy Opt-Out**: Single command to disable telemetry
+4. **No PII**: Never collect personally identifiable information
+5. **GitHub-Native**: Uses GitHub Issues and Projects for feedback
+6. **Community Benefit**: Collected data improves skills for everyone
+7. **Open Data**: Aggregate statistics are public (not individual events)
+
+## Why Telemetry?
+
+Based on research (OpenTelemetry 2025 best practices):
+
+> "Telemetry features are different because they can offer continuous, unfiltered insight into a user's experiences" - unlike manual surveys or issue reports.
+
+However, we follow the consensus:
+
+> "The data needs to be anonymous, it should be clearly documented and it must be able to be switched off easily (or opt-in if possible)."
+
+## What We Collect (Opt-In)
+
+### Skill Usage Events (Anonymous)
+
+```json
+{
+  "event_type": "skill_invoked",
+  "skill_name": "oracle",
+  "timestamp": "2025-01-15T10:30:00Z",
+  "session_id": "anonymous_hash",
+  "success": true,
+  "error_type": null,
+  "duration_ms": 1250
+}
+```
+
+**What we DON'T collect:**
+- ❌ User identity (name, email, IP address)
+- ❌ File paths or code content
+- ❌ Conversation history
+- ❌ Project names
+- ❌ Any personally identifiable information
+
+**What we DO collect:**
+- ✅ Skill name and success/failure
+- ✅ Anonymous session ID (daily-rotating hash)
+- ✅ Error types (for debugging)
+- ✅ Performance metrics (duration)
+- ✅ Skill-specific metrics (e.g., Oracle query count)
+
+### Skill-Specific Metrics
+
+**Oracle Skill:**
+- Query success rate
+- Average query duration
+- Most common query types
+- Cache hit rate
+
+**Guardian Skill:**
+- Trigger frequency (code volume, errors, churn)
+- Suggestion acceptance rate (aggregate)
+- Most common review categories
+- Average confidence scores
+
+**Summoner Skill:**
+- Subagent spawn frequency
+- Model distribution (haiku vs sonnet)
+- Average task duration
+- Success rates
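+
+Each of these maps onto the generic `--metric` interface of the bundled
+`track_event.py` script (see its usage notes below). For example, an Oracle
+cache-hit-rate sample could be recorded like this (metric name and value are
+illustrative):
+
+```bash
+python skills/evaluator/scripts/track_event.py \
+  --skill oracle --metric cache_hit_rate --value 0.82
+```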
+
+## Feedback Collection Methods
+
+### 1. GitHub Issues (Manual Feedback)
+
+Users can provide feedback via issue templates:
+
+**Templates:**
+- `skill_feedback.yml` - General skill feedback
+- `skill_bug.yml` - Bug reports
+- `skill_improvement.yml` - Improvement suggestions
+- `skill_request.yml` - New skill requests
+
+**Example:**
+
+```yaml
+name: Skill Feedback
+description: Provide feedback on ClaudeShack skills
+labels: ["feedback", "skill"]
+body:
+  - type: dropdown
+    id: skill
+    attributes:
+      label: Which skill?
+      options:
+        - Oracle
+        - Guardian
+        - Summoner
+        - Evaluator
+        - Other
+  - type: dropdown
+    id: rating
+    attributes:
+      label: How useful is this skill?
+      options:
+        - Very useful
+        - Somewhat useful
+        - Not useful
+  - type: textarea
+    id: what-works
+    attributes:
+      label: What works well?
+  - type: textarea
+    id: what-doesnt
+    attributes:
+      label: What could be improved?
+```
+
+### 2. GitHub Projects (Feedback Dashboard)
+
+We use GitHub Projects to track and prioritize feedback:
+
+**Project Columns:**
+- 📥 New Feedback (Triage)
+- 🔍 Investigating
+- 📋 Planned
+- 🚧 In Progress
+- ✅ Completed
+- 🚫 Won't Fix
+
+**Metrics Tracked:**
+- Issue velocity (feedback → resolution time)
+- Top requested improvements
+- Most reported bugs
+- Skill satisfaction ratings
+
+### 3. Anonymous Telemetry (Opt-In)
+
+**How It Works:**
+
+1. User opts in: `/evaluator enable`
+2. Events are collected locally in `.evaluator/events.jsonl`
+3. Periodically (weekly by default; see `aggregation_interval_days` in the configuration), events are aggregated into summary stats - see the sketch after the example report below
+4. Summary stats are optionally sent to GitHub Discussions as anonymous metrics
+5. Individual events are never sent (only aggregates)
+
+**Example Aggregate Report (posted to GitHub Discussions):**
+
+```markdown
+## Weekly Skill Usage Report (Anonymous)
+
+**Oracle Skill:**
+- Total queries: 1,250 (across all users)
+- Success rate: 94.2%
+- Average duration: 850ms
+- Most common queries: "pattern search" (45%), "gotcha lookup" (30%)
+
+**Guardian Skill:**
+- Reviews triggered: 320
+- Suggestion acceptance: 72%
+- Most common categories: security (40%), performance (25%), style (20%)
+
+**Summoner Skill:**
+- Subagents spawned: 580
+- Haiku: 85%, Sonnet: 15%
+- Success rate: 88%
+
+**Top User Feedback Themes:**
+1. "Oracle needs better search filters" (12 mentions)
+2. "Guardian triggers too frequently" (8 mentions)
+3. "Love the minimal context passing!" (15 mentions)
+```
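+
+The aggregation engine itself is still in progress (see Implementation
+Status below). As a minimal sketch, assuming the event schema above and the
+local `.evaluator/events.jsonl` store, the weekly reduction step could look
+like this (hypothetical script, not shipped with this plugin):
+
+```python
+# Hypothetical weekly aggregation step: reduce local events to per-skill
+# summary stats. Timestamps and session IDs never survive the reduction.
+import json
+from collections import defaultdict
+from pathlib import Path
+
+
+def aggregate_events(events_file: Path) -> dict:
+    stats = defaultdict(lambda: {"total": 0, "success": 0, "durations": []})
+    with open(events_file, encoding="utf-8") as f:
+        for line in f:
+            if not line.strip():
+                continue
+            event = json.loads(line)
+            s = stats[event["skill_name"]]
+            s["total"] += 1
+            if event.get("success"):
+                s["success"] += 1
+            if event.get("duration_ms") is not None:
+                s["durations"].append(event["duration_ms"])
+    # Only counts and rates leave this function - nothing event-level
+    return {
+        skill: {
+            "total": s["total"],
+            "success_rate_pct": round(100 * s["success"] / s["total"], 1),
+            "avg_duration_ms": (round(sum(s["durations"]) / len(s["durations"]))
+                                if s["durations"] else None),
+        }
+        for skill, s in stats.items()
+    }
+
+
+if __name__ == "__main__":
+    print(json.dumps(aggregate_events(Path(".evaluator/events.jsonl")), indent=2))
+```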
+
+## How to Use Evaluator
+
+### Enable Telemetry (Opt-In)
+
+```bash
+# Enable anonymous telemetry
+/evaluator enable
+
+# Confirm telemetry is enabled
+/evaluator status
+
+# View what will be collected
+/evaluator show-sample
+```
+
+### Disable Telemetry
+
+```bash
+# Disable telemetry
+/evaluator disable
+
+# Delete all local telemetry data
+/evaluator purge
+```
+
+### View Local Telemetry
+
+```bash
+# View local event summary (never leaves your machine)
+/evaluator summary
+
+# View local events (for transparency)
+/evaluator show-events
+
+# Export events to JSON
+/evaluator export --output telemetry.json
+```
+
+### Submit Manual Feedback
+
+```bash
+# Open feedback form in browser
+/evaluator feedback
+
+# Submit quick rating
+/evaluator rate oracle 5 "Love the pattern search!"
+
+# Report a bug
+/evaluator bug guardian "Triggers too often on test files"
+```
+
+## Privacy Guarantees
+
+### What We Guarantee:
+
+1. **Opt-In Only**: Telemetry is disabled by default
+2. **No PII**: We never collect personal information
+3. **Local First**: Events stored locally, you control when/if they're sent
+4. **Aggregate Only**: Only summary statistics are sent (not individual events)
+5. **Easy Deletion**: One command to delete all local data
+6. **Transparent**: Source code is open, you can audit what's collected
+7. **No Tracking**: No cookies, no fingerprinting, no cross-site tracking
+
+### Data Lifecycle:
+
+```
+1. Event occurs → 2. Stored locally → 3. Aggregated weekly →
+4. [Optional] Send aggregate → 5. Auto-delete events >30 days old
+```
+
+**You control steps 4 and 5.**
+
+## Configuration
+
+`.evaluator/config.json`:
+
+```json
+{
+  "enabled": false,
+  "anonymous_id": "randomly-generated-daily-rotating-hash",
+  "send_aggregates": false,
+  "retention_days": 30,
+  "aggregation_interval_days": 7,
+  "collect": {
+    "skill_usage": true,
+    "performance_metrics": true,
+    "error_types": true,
+    "success_rates": true
+  },
+  "exclude_skills": [],
+  "github": {
+    "repo": "Overlord-Z/ClaudeShack",
+    "discussions_category": "Telemetry",
+    "issue_labels": ["feedback", "telemetry"]
+  }
+}
+```
+
+## For Skill Developers
+
+### Instrumenting Your Skill
+
+Add telemetry hooks to your skill:
+
+```python
+from evaluator import track_event, track_metric, track_error
+
+# Track skill invocation
+with track_event('my_skill_invoked'):
+    result = my_skill.execute()
+
+# Track custom metric
+track_metric('my_skill_success_rate', success_rate)
+
+# Track error (error type only, not message)
+track_error('my_skill_error', error_type='ValueError')
+```
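+
+The `with track_event(...)` form above implies a context manager; the
+instrumentation helpers are still listed as planned below. A minimal sketch of
+how the helper could work, assuming a `_record()` hook that appends to
+`.evaluator/events.jsonl` (both names are illustrative, not part of this
+plugin yet):
+
+```python
+# Sketch of a planned instrumentation helper - illustrative only.
+import json
+import time
+from contextlib import contextmanager
+from datetime import datetime
+from pathlib import Path
+
+
+def _record(event_name, **fields):
+    # Append one anonymous event, in the same JSONL shape track_event.py writes.
+    path = Path('.evaluator/events.jsonl')
+    path.parent.mkdir(parents=True, exist_ok=True)
+    event = {"event_type": event_name, "timestamp": datetime.now().isoformat(), **fields}
+    with open(path, 'a', encoding='utf-8') as f:
+        f.write(json.dumps(event) + '\n')
+
+
+@contextmanager
+def track_event(event_name):
+    # Time the wrapped block; record success, or only the error *type* on failure.
+    start = time.monotonic()
+    try:
+        yield
+    except Exception as exc:
+        _record(event_name, success=False, error_type=type(exc).__name__,
+                duration_ms=int((time.monotonic() - start) * 1000))
+        raise
+    else:
+        _record(event_name, success=True, error_type=None,
+                duration_ms=int((time.monotonic() - start) * 1000))
+```
+
+Recording `type(exc).__name__` rather than the exception message keeps error
+telemetry inside the "error types only" rule above.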
+
+### Viewing Skill Analytics
+
+```bash
+# View analytics for your skill
+/evaluator analytics my_skill
+
+# Compare with other skills
+/evaluator compare oracle guardian summoner
+```
+
+## Benefits to Users
+
+### Why Share Telemetry?
+
+1. **Better Skills**: Identify which features are most useful
+2. **Faster Bug Fixes**: Know which bugs affect the most users
+3. **Prioritized Features**: Build what users actually want
+4. **Performance Improvements**: Optimize based on real usage patterns
+5. **Community Growth**: Demonstrate value to attract contributors
+
+### What You Get Back:
+
+- Public aggregate metrics (see how you compare)
+- Priority bug fixes for highly-used features
+- Better documentation based on common questions
+- Skills optimized for real-world usage patterns
+
+## Implementation Status
+
+**Current:**
+- ✅ Privacy-first design
+- ✅ GitHub Issues templates designed
+- ✅ Configuration schema
+- ✅ Opt-in/opt-out framework
+
+**In Progress:**
+- 🚧 Event collection scripts
+- 🚧 Aggregation engine
+- 🚧 GitHub Projects integration
+- 🚧 Analytics dashboard
+
+**Planned:**
+- 📋 Skill instrumentation helpers
+- 📋 Automated weekly reports
+- 📋 Community analytics page
+
+## Transparency Report
+
+We commit to publishing quarterly transparency reports:
+
+**Metrics Reported:**
+- Total opt-in users (approximate)
+- Total events collected
+- Top skills by usage
+- Top feedback themes
+- Privacy incidents (if any)
+
+**Example:**
+
+> "Q1 2025: 45 users opted in, 12,500 events collected, 0 privacy incidents, 23 bugs fixed based on feedback"
+
+## Anti-Patterns (What We Won't Do)
+
+- ❌ Collect data without consent
+- ❌ Sell or share data with third parties
+- ❌ Track individual users
+- ❌ Collect code or file contents
+- ❌ Use data for advertising
+- ❌ Make telemetry difficult to disable
+- ❌ Hide what we collect
+
+## References
+
+Based on 2025 best practices:
+
+- OpenTelemetry standards for instrumentation
+- GitHub Copilot's feedback collection model
+- VSCode extension telemetry guidelines
+- Open source community consensus on privacy
diff --git a/skills/evaluator/scripts/track_event.py b/skills/evaluator/scripts/track_event.py
new file mode 100644
index 0000000..7b6001c
--- /dev/null
+++ b/skills/evaluator/scripts/track_event.py
@@ -0,0 +1,412 @@
+#!/usr/bin/env python3
+"""
+Evaluator Event Tracking
+
+Privacy-first anonymous telemetry for ClaudeShack skills.
+
+Usage:
+    # Track a skill invocation
+    python track_event.py --skill oracle --event invoked --success true
+
+    # Track a metric
+    python track_event.py --skill guardian --metric acceptance_rate --value 0.75
+
+    # Track an error (type only, no message)
+    python track_event.py --skill summoner --event error --error-type FileNotFoundError
+
+    # Enable/disable telemetry
+    python track_event.py --enable
+    python track_event.py --disable
+
+    # View local events
+    python track_event.py --show-events
+    python track_event.py --summary
+"""
+
+import sys
+import json
+import argparse
+import hashlib
+from datetime import datetime, timedelta
+from pathlib import Path
+from typing import Dict, List, Optional, Any
+
+
+def find_evaluator_root() -> Path:
+    """Find or create the .evaluator directory."""
+    current = Path.cwd()
+
+    while current != current.parent:
+        evaluator_path = current / '.evaluator'
+        if evaluator_path.exists():
+            return evaluator_path
+        current = current.parent
+
+    # Not found; create one in the current working directory
+    evaluator_path = Path.cwd() / '.evaluator'
+    evaluator_path.mkdir(parents=True, exist_ok=True)
+
+    return evaluator_path
+
+
+def load_config(evaluator_path: Path) -> Dict[str, Any]:
+    """Load Evaluator configuration."""
+    config_file = evaluator_path / 'config.json'
+
+    if not config_file.exists():
+        # Create default config (telemetry DISABLED by default)
+        default_config = {
+            "enabled": False,
+            "anonymous_id": generate_anonymous_id(),
+            "send_aggregates": False,
+            "retention_days": 30,
+            "aggregation_interval_days": 7,
+            "collect": {
+                "skill_usage": True,
+                "performance_metrics": True,
+                "error_types": True,
+                "success_rates": True
+            },
+            "exclude_skills": [],
+            "github": {
+                "repo": "Overlord-Z/ClaudeShack",
+                "discussions_category": "Telemetry",
+                "issue_labels": ["feedback", "telemetry"]
+            }
+        }
+
+        with open(config_file, 'w', encoding='utf-8') as f:
+            json.dump(default_config, f, indent=2)
+
+        return default_config
+
+    try:
+        with open(config_file, 'r', encoding='utf-8') as f:
+            return json.load(f)
+    except (json.JSONDecodeError, OSError, IOError):
+        return {"enabled": False}
+
+
+def save_config(evaluator_path: Path, config: Dict[str, Any]) -> None:
+    """Save Evaluator configuration."""
+    config_file = evaluator_path / 'config.json'
+
+    try:
+        with open(config_file, 'w', encoding='utf-8') as f:
+            json.dump(config, f, indent=2)
+    except (OSError, IOError) as e:
+        print(f"Error: Failed to save config: {e}", file=sys.stderr)
+        sys.exit(1)
+
+
+def generate_anonymous_id() -> str:
+    """Generate a daily-rotating anonymous ID.
+
+    Returns:
+        Anonymous hash that rotates daily
+    """
+    # Use the date as the only input so the ID rotates daily. No system or
+    # user identifier is mixed in - all users on the same date share the
+    # same ID, which makes it truly anonymous.
+    date_salt = datetime.now().strftime('%Y-%m-%d')
+
+    return hashlib.sha256(date_salt.encode()).hexdigest()[:16]
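+
+
+# Concrete example of the property above: on 2025-01-15 every opted-in install
+# derives the same ID, hashlib.sha256(b'2025-01-15').hexdigest()[:16], so
+# events can be grouped by day but never traced to a user or machine.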
+
+
+def track_event(
+    evaluator_path: Path,
+    config: Dict[str, Any],
+    skill_name: str,
+    event_type: str,
+    success: Optional[bool] = None,
+    error_type: Optional[str] = None,
+    duration_ms: Optional[int] = None,
+    metadata: Optional[Dict[str, Any]] = None
+) -> None:
+    """Track a skill usage event.
+
+    Args:
+        evaluator_path: Path to .evaluator directory
+        config: Evaluator configuration
+        skill_name: Name of the skill
+        event_type: Type of event (invoked, error, etc.)
+        success: Whether the operation succeeded
+        error_type: Type of error (if applicable)
+        duration_ms: Duration in milliseconds
+        metadata: Additional anonymous metadata
+    """
+    if not config.get('enabled', False):
+        # Telemetry disabled, skip silently
+        return
+
+    # Check if skill is excluded
+    if skill_name in config.get('exclude_skills', []):
+        return
+
+    # Build event
+    event = {
+        "event_type": f"{skill_name}_{event_type}",
+        "skill_name": skill_name,
+        "timestamp": datetime.now().isoformat(),
+        "session_id": config.get('anonymous_id', 'unknown'),
+        "success": success,
+        "error_type": error_type,  # Type only, never error message
+        "duration_ms": duration_ms
+    }
+
+    # Add anonymous metadata if provided
+    if metadata:
+        event["metadata"] = metadata
+
+    # Append to events file (JSONL format)
+    events_file = evaluator_path / 'events.jsonl'
+
+    try:
+        with open(events_file, 'a', encoding='utf-8') as f:
+            f.write(json.dumps(event) + '\n')
+    except (OSError, IOError):
+        # Fail silently - telemetry should never break the workflow
+        pass
+
+
+def track_metric(
+    evaluator_path: Path,
+    config: Dict[str, Any],
+    skill_name: str,
+    metric_name: str,
+    value: float,
+    metadata: Optional[Dict[str, Any]] = None
+) -> None:
+    """Track a skill metric.
+
+    Args:
+        evaluator_path: Path to .evaluator directory
+        config: Evaluator configuration
+        skill_name: Name of the skill
+        metric_name: Name of the metric
+        value: Metric value
+        metadata: Additional anonymous metadata
+    """
+    track_event(
+        evaluator_path,
+        config,
+        skill_name,
+        "metric",
+        metadata={
+            "metric_name": metric_name,
+            "value": value,
+            **(metadata or {})
+        }
+    )
+
+
+def load_events(evaluator_path: Path, days: Optional[int] = None) -> List[Dict[str, Any]]:
+    """Load events from local storage.
+
+    Args:
+        evaluator_path: Path to .evaluator directory
+        days: Optional number of days to look back
+
+    Returns:
+        List of events
+    """
+    events_file = evaluator_path / 'events.jsonl'
+
+    if not events_file.exists():
+        return []
+
+    events = []
+    cutoff = None
+
+    if days:
+        cutoff = datetime.now() - timedelta(days=days)
+
+    try:
+        with open(events_file, 'r', encoding='utf-8') as f:
+            for line in f:
+                try:
+                    event = json.loads(line.strip())
+
+                    # Filter by date if cutoff specified
+                    if cutoff:
+                        event_time = datetime.fromisoformat(event['timestamp'])
+                        if event_time < cutoff:
+                            continue
+
+                    events.append(event)
+                except (json.JSONDecodeError, KeyError, ValueError):
+                    # Skip malformed lines or timestamps instead of failing
+                    continue
+    except (OSError, IOError):
+        return []
+
+    return events
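+
+
+# Each stored line is one JSON object in the shape written by track_event(),
+# for example:
+#   {"event_type": "oracle_invoked", "skill_name": "oracle",
+#    "timestamp": "2025-01-15T10:30:00.123456", "session_id": "<16 hex chars>",
+#    "success": true, "error_type": null, "duration_ms": 1250}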
+
+
+def show_summary(events: List[Dict[str, Any]]) -> None:
+    """Show summary of local events.
+
+    Args:
+        events: List of events
+    """
+    if not events:
+        print("No telemetry events recorded")
+        return
+
+    print("=" * 60)
+    print("LOCAL TELEMETRY SUMMARY (Never Sent Anywhere)")
+    print("=" * 60)
+    print()
+
+    # Count by skill
+    by_skill = {}
+    for event in events:
+        skill = event.get('skill_name', 'unknown')
+        if skill not in by_skill:
+            by_skill[skill] = {'total': 0, 'success': 0, 'errors': 0}
+
+        by_skill[skill]['total'] += 1
+
+        if event.get('success') is True:
+            by_skill[skill]['success'] += 1
+        elif event.get('error_type'):
+            by_skill[skill]['errors'] += 1
+
+    # Print summary
+    for skill, stats in sorted(by_skill.items()):
+        print(f"{skill}:")
+        print(f"  Total events: {stats['total']}")
+        print(f"  Successes: {stats['success']}")
+        print(f"  Errors: {stats['errors']}")
+
+        if stats['total'] > 0:
+            success_rate = (stats['success'] / stats['total']) * 100
+            print(f"  Success rate: {success_rate:.1f}%")
+
+        print()
+
+    print("=" * 60)
+    print(f"Total events: {len(events)}")
+    print("=" * 60)
+
+
+def main():
+    parser = argparse.ArgumentParser(
+        description='Privacy-first anonymous telemetry for ClaudeShack',
+        formatter_class=argparse.RawDescriptionHelpFormatter
+    )
+
+    parser.add_argument('--skill', help='Skill name')
+    parser.add_argument('--event', help='Event type (invoked, error, etc.)')
+    # NOTE: argparse's type=bool treats any non-empty string (even "false") as
+    # True, so accept explicit 'true'/'false' and convert at the call site.
+    parser.add_argument('--success', choices=['true', 'false'],
+                        help='Whether operation succeeded')
+    parser.add_argument('--error-type', help='Error type (not message)')
+    parser.add_argument('--duration', type=int, help='Duration in milliseconds')
+
+    parser.add_argument('--metric', help='Metric name')
+    parser.add_argument('--value', type=float, help='Metric value')
+
+    parser.add_argument('--enable', action='store_true', help='Enable telemetry (opt-in)')
+    parser.add_argument('--disable', action='store_true', help='Disable telemetry')
+    parser.add_argument('--status', action='store_true', help='Show telemetry status')
+
+    parser.add_argument('--show-events', action='store_true', help='Show local events')
+    parser.add_argument('--summary', action='store_true', help='Show event summary')
+    parser.add_argument('--days', type=int, help='Days to look back (default: all)')
+
+    parser.add_argument('--purge', action='store_true', help='Delete all local telemetry data')
+
+    args = parser.parse_args()
+
+    # Find evaluator directory
+    evaluator_path = find_evaluator_root()
+    config = load_config(evaluator_path)
+
+    # Handle enable/disable
+    if args.enable:
+        config['enabled'] = True
+        config['anonymous_id'] = generate_anonymous_id()
+        save_config(evaluator_path, config)
+        print("✓ Telemetry enabled (anonymous, opt-in)")
+        print(f"  Anonymous ID: {config['anonymous_id']}")
+        print("  No personally identifiable information is collected")
+        print("  You can disable anytime with: --disable")
+        sys.exit(0)
+
+    if args.disable:
+        config['enabled'] = False
+        save_config(evaluator_path, config)
+        print("✓ Telemetry disabled")
+        print("  Run with --purge to delete all local data")
+        sys.exit(0)
+
+    # Handle status
+    if args.status:
+        print("Evaluator Telemetry Status:")
+        print("=" * 60)
+        print(f"Enabled: {config.get('enabled', False)}")
+        print(f"Anonymous ID: {config.get('anonymous_id', 'Not set')}")
+        print(f"Send aggregates: {config.get('send_aggregates', False)}")
+        print(f"Retention: {config.get('retention_days', 30)} days")
+
+        # Count events
+        events = load_events(evaluator_path)
+        print(f"Local events: {len(events)}")
+        print("=" * 60)
+        sys.exit(0)
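+
+    # The inspection/maintenance commands below intentionally run even when
+    # telemetry is disabled, so local data can always be reviewed and purged.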
+
+    # Handle purge
+    if args.purge:
+        events_file = evaluator_path / 'events.jsonl'
+        if events_file.exists():
+            events_file.unlink()
+            print("✓ All local telemetry data deleted")
+        else:
+            print("No telemetry data to delete")
+        sys.exit(0)
+
+    # Handle show events
+    if args.show_events:
+        events = load_events(evaluator_path, args.days)
+        print(json.dumps(events, indent=2))
+        sys.exit(0)
+
+    # Handle summary
+    if args.summary:
+        events = load_events(evaluator_path, args.days)
+        show_summary(events)
+        sys.exit(0)
+
+    # Track event
+    if args.skill and args.event:
+        # Convert the 'true'/'false' CLI string to a real bool (or None if omitted)
+        success = None if args.success is None else args.success == 'true'
+        track_event(
+            evaluator_path,
+            config,
+            args.skill,
+            args.event,
+            success,
+            args.error_type,
+            args.duration
+        )
+        # Silent success (telemetry should be invisible)
+        sys.exit(0)
+
+    # Track metric
+    if args.skill and args.metric and args.value is not None:
+        track_metric(
+            evaluator_path,
+            config,
+            args.skill,
+            args.metric,
+            args.value
+        )
+        # Silent success
+        sys.exit(0)
+
+    parser.print_help()
+    sys.exit(1)
+
+
+if __name__ == '__main__':
+    main()