From 57790ee7119bb5b50fe8dfa99182bc6bd89033e9 Mon Sep 17 00:00:00 2001 From: Zhongwei Li Date: Sun, 30 Nov 2025 08:59:38 +0800 Subject: [PATCH] Initial commit --- .claude-plugin/plugin.json | 12 + README.md | 3 + plugin.lock.json | 61 ++++ skills/using-skillpack-maintenance/SKILL.md | 249 +++++++++++++ .../analyzing-pack-domain.md | 168 +++++++++ .../implementing-fixes.md | 334 +++++++++++++++++ .../reviewing-pack-structure.md | 252 +++++++++++++ .../testing-skill-quality.md | 340 ++++++++++++++++++ 8 files changed, 1419 insertions(+) create mode 100644 .claude-plugin/plugin.json create mode 100644 README.md create mode 100644 plugin.lock.json create mode 100644 skills/using-skillpack-maintenance/SKILL.md create mode 100644 skills/using-skillpack-maintenance/analyzing-pack-domain.md create mode 100644 skills/using-skillpack-maintenance/implementing-fixes.md create mode 100644 skills/using-skillpack-maintenance/reviewing-pack-structure.md create mode 100644 skills/using-skillpack-maintenance/testing-skill-quality.md diff --git a/.claude-plugin/plugin.json b/.claude-plugin/plugin.json new file mode 100644 index 0000000..00533d9 --- /dev/null +++ b/.claude-plugin/plugin.json @@ -0,0 +1,12 @@ +{ + "name": "meta-skillpack-maintenance", + "description": "Systematic maintenance and enhancement of skill packs through investigative domain analysis, RED-GREEN-REFACTOR testing, and automated quality improvements", + "version": "1.0.1", + "author": { + "name": "tachyon-beep", + "url": "https://github.com/tachyon-beep" + }, + "skills": [ + "./skills" + ] +} \ No newline at end of file diff --git a/README.md b/README.md new file mode 100644 index 0000000..d2b7a36 --- /dev/null +++ b/README.md @@ -0,0 +1,3 @@ +# meta-skillpack-maintenance + +Systematic maintenance and enhancement of skill packs through investigative domain analysis, RED-GREEN-REFACTOR testing, and automated quality improvements diff --git a/plugin.lock.json b/plugin.lock.json new file mode 100644 index 0000000..5d5aa7d --- /dev/null +++ b/plugin.lock.json @@ -0,0 +1,61 @@ +{ + "$schema": "internal://schemas/plugin.lock.v1.json", + "pluginId": "gh:tachyon-beep/skillpacks:plugins/meta-skillpack-maintenance", + "normalized": { + "repo": null, + "ref": "refs/tags/v20251128.0", + "commit": "1ffe74e4e56301106f09a070ee6a9e734c2d1c07", + "treeHash": "2910222b93e5901ee7d84c06321e85bffe738a1ae8c9c8f45c14cfd72fc8db3a", + "generatedAt": "2025-11-28T10:28:32.564768Z", + "toolVersion": "publish_plugins.py@0.2.0" + }, + "origin": { + "remote": "git@github.com:zhongweili/42plugin-data.git", + "branch": "master", + "commit": "aa1497ed0949fd50e99e70d6324a29c5b34f9390", + "repoRoot": "/Users/zhongweili/projects/openmind/42plugin-data" + }, + "manifest": { + "name": "meta-skillpack-maintenance", + "description": "Systematic maintenance and enhancement of skill packs through investigative domain analysis, RED-GREEN-REFACTOR testing, and automated quality improvements", + "version": "1.0.1" + }, + "content": { + "files": [ + { + "path": "README.md", + "sha256": "1fa862d1c9c47cb95ff42f9ff2d8fb9f91fd6393cc69fdef3fc0434c27fff89a" + }, + { + "path": ".claude-plugin/plugin.json", + "sha256": "a048f5922206151fa4c6d2dd231fa248becd7bffe8a697ea7af35078f398a1b6" + }, + { + "path": "skills/using-skillpack-maintenance/testing-skill-quality.md", + "sha256": "d7eb57a8b3535a1bcd2f258fb1deb30e1dfa94d7c9106bf7397b681fb9a515ac" + }, + { + "path": "skills/using-skillpack-maintenance/implementing-fixes.md", + "sha256": 
"eb4b3bf93588611f8535dd091364ee2b41fe0213f6621d4b92a9043ea5f0606a" + }, + { + "path": "skills/using-skillpack-maintenance/analyzing-pack-domain.md", + "sha256": "d9ca2f161084cce7dc53ec5b867a6a989c9422544e761a7a65be47d487157d8c" + }, + { + "path": "skills/using-skillpack-maintenance/reviewing-pack-structure.md", + "sha256": "d463c97b3df928f10669db08e22445426cb20e259561919c86725fe160d59d24" + }, + { + "path": "skills/using-skillpack-maintenance/SKILL.md", + "sha256": "eebaa2637db750749e36f1ea5188791f82dd19f2bcfb2b783fc223c1e0dececc" + } + ], + "dirSha256": "2910222b93e5901ee7d84c06321e85bffe738a1ae8c9c8f45c14cfd72fc8db3a" + }, + "security": { + "scannedAt": null, + "scannerVersion": null, + "flags": [] + } +} \ No newline at end of file diff --git a/skills/using-skillpack-maintenance/SKILL.md b/skills/using-skillpack-maintenance/SKILL.md new file mode 100644 index 0000000..d7e983e --- /dev/null +++ b/skills/using-skillpack-maintenance/SKILL.md @@ -0,0 +1,249 @@ +--- +name: using-skillpack-maintenance +description: Use when maintaining or enhancing existing skill packs in the skillpacks repository - systematic pack refresh through domain analysis, structure review, RED-GREEN-REFACTOR gauntlet testing, and automated quality improvements +--- + +# Skillpack Maintenance + +## Overview + +Systematic maintenance and enhancement of existing skill packs using investigative domain analysis, RED-GREEN-REFACTOR testing, and automated improvements. + +**Core principle:** Maintenance uses behavioral testing (gauntlet with subagents), not syntactic validation. Skills are process documentation - test if they guide agents correctly, not if they parse correctly. + +## When to Use + +Use when: +- Enhancing an existing skill pack (e.g., "refresh yzmir-deep-rl") +- Improving existing SKILL.md files +- Identifying gaps in pack coverage +- Validating skill quality through testing + +**Do NOT use for:** +- Creating new skills from scratch (use superpowers:writing-skills) +- Creating new packs from scratch (design first, then use creation workflow) + +## The Iron Law + +**NO SKILL CHANGES WITHOUT BEHAVIORAL TESTING** + +Syntactic validation (does it parse?) ≠ Behavioral testing (does it work?) + +## Common Rationalizations (from baseline testing) + +| Excuse | Reality | +|--------|---------| +| "Syntactic validation is sufficient" | Parsing ≠ effectiveness. Test with subagents. | +| "Quality benchmarking = effectiveness" | Comparing structure ≠ testing behavior. Run gauntlet. | +| "Comprehensive coverage = working skill" | Coverage ≠ guidance quality. Test if agents follow it. | +| "Following patterns = success" | Pattern-matching ≠ validation. Behavioral testing required. | +| "I'll test if issues emerge" | Issues = broken skills in production. Test BEFORE deploying. | + +**All of these mean: Run behavioral tests with subagents. No exceptions.** + +## Workflow Overview + +**Review → Discuss → [Create New Skills if Needed] → Execute** + +1. **Investigation & Scorecard** → Load `analyzing-pack-domain.md` +2. **Structure Review (Pass 1)** → Load `reviewing-pack-structure.md` +3. **Content Testing (Pass 2)** → Load `testing-skill-quality.md` +4. **Coherence Check (Pass 3)** → Validate cross-skill consistency +5. **Discussion** → Present findings, get approval +6. **[CONDITIONAL] Create New Skills** → If gaps identified, use `superpowers:writing-skills` for EACH gap (RED-GREEN-REFACTOR) +7. **Execution** → Load `implementing-fixes.md`, enhance existing skills only +8. 
**Commit** → Single commit with version bump + +## Stage 1: Investigation & Scorecard + +**Load briefing:** `analyzing-pack-domain.md` + +**Purpose:** Establish "what this pack should cover" from first principles. + +**Adaptive investigation (D→B→C→A):** +1. **User-guided scope (D)** - Ask user about pack intent and boundaries +2. **LLM knowledge analysis (B)** - Map domain comprehensively, flag if research needed +3. **Existing pack audit (C)** - Compare current state vs. coverage map +4. **Research if needed (A)** - Conditional: only if domain is rapidly evolving + +**Output:** Domain coverage map, gap analysis, research currency flag + +**Then: Load `reviewing-pack-structure.md` for scorecard** + +**Scorecard levels:** +- **Critical** - Pack unusable, recommend rebuild vs. enhance +- **Major** - Significant gaps or duplicates +- **Minor** - Organizational improvements +- **Pass** - Structurally sound + +**Decision gate:** Present scorecard → User decides: Proceed / Rebuild / Cancel + +## Stage 2: Comprehensive Review + +### Pass 1: Structure (from reviewing-pack-structure.md) + +**Analyze:** +- Gaps (missing skills based on coverage map) +- Duplicates (overlapping coverage - merge/specialize/remove) +- Organization (router accuracy, faction alignment, metadata sync) + +**Output:** Structural issues with priorities (critical/major/minor) + +### Pass 2: Content Quality (from testing-skill-quality.md) + +**CRITICAL:** This is behavioral testing with subagents, not syntactic validation. + +**Gauntlet design (A→C→B priority):** + +**A. Pressure scenarios** - Catch rationalizations: +- Time pressure: "This is urgent, just do it quickly" +- Simplicity temptation: "Too simple to need the skill" +- Overkill perception: "Skill is for complex cases, this is straightforward" + +**C. Adversarial edge cases** - Test robustness: +- Corner cases where skill principles conflict +- Situations where naive application fails + +**B. Real-world complexity** - Validate utility: +- Messy requirements, unclear constraints +- Multiple valid approaches + +**Testing process per skill:** +1. Design challenging scenario from gauntlet categories +2. **Run subagent WITH current skill** (behavioral test) +3. Observe: Does it follow? Where does it rationalize/fail? +4. Document failure modes +5. Result: Pass OR Fix needed (with specific issues listed) + +**Philosophy:** D as gauntlet to identify issues, B for targeted fixes. If skill passes gauntlet, no changes needed. + +**Output:** Per-skill test results (Pass / Fix needed + priorities) + +### Pass 3: Coherence + +**After structure/content analysis, validate pack-level coherence:** + +1. **Cross-skill consistency** - Terminology, examples, cross-references +2. **Router accuracy** - Does using-X router reflect current specialists? +3. **Faction alignment** - Check FACTIONS.md, flag drift, suggest rehoming if needed +4. **Metadata sync** - plugin.json description, skill count +5. **Navigation** - Can users find skills easily? 
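Parts of this pass can be mechanized. The sketch below is a minimal illustration, assuming the conventional `skills/<skill-name>/SKILL.md` layout; the pack path, router name, and the token heuristic are illustrative assumptions, not part of the workflow, and such checks supplement (never replace) behavioral testing:

```python
from pathlib import Path

def check_router_coverage(pack_dir: str, router_skill: str) -> None:
    """Flag specialists missing from the router and router references to absent files."""
    pack = Path(pack_dir)
    skill_dirs = {p.parent.name for p in pack.glob("skills/*/SKILL.md")}
    router_text = (pack / "skills" / router_skill / "SKILL.md").read_text(encoding="utf-8")

    # Specialists the router never mentions (navigation / router-accuracy gap)
    for name in sorted(skill_dirs - {router_skill}):
        if name not in router_text:
            print(f"Router does not mention specialist: {name}")

    # Rough heuristic: backticked *.md references that have no matching briefing file
    for token in set(router_text.split()):
        candidate = token.strip("`.,()[]")
        if candidate.endswith(".md") and not (pack / "skills" / router_skill / candidate).exists():
            print(f"Router references missing file: {candidate}")

# Hypothetical pack and router names for illustration only
check_router_coverage("plugins/example-pack", "using-example-pack")
```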
+ +**CRITICAL:** Update skills to reference new/enhanced skills (post-update hygiene) + +**Output:** Coherence issues, faction drift flags + +## Stage 3: Interactive Discussion + +**Present findings conversationally:** + +**Structural category:** +- **Gaps requiring superpowers:writing-skills** (new skills needed - each requires RED-GREEN-REFACTOR) +- Duplicates to remove/merge +- Organization issues + +**Content category:** +- Skills needing enhancement (from gauntlet failures) +- Severity levels (critical/major/minor) +- Specific failure modes identified + +**Coherence category:** +- Cross-reference updates needed +- Faction alignment issues +- Metadata corrections + +**Get user approval for scope of work** + +**CRITICAL DECISION POINT:** If gaps (new skills) were identified: +- User approves → **IMMEDIATELY use superpowers:writing-skills for EACH gap** +- Do NOT proceed to Stage 4 until ALL new skills are created and tested +- Each gap = separate RED-GREEN-REFACTOR cycle +- Return to Stage 4 only after ALL gaps are filled + +## Stage 4: Autonomous Execution + +**Load briefing:** `implementing-fixes.md` + +**PREREQUISITE CHECK:** +- ✓ Zero gaps identified, OR +- ✓ All gaps already filled using superpowers:writing-skills (each skill individually tested) + +**If gaps exist and you haven't used writing-skills:** STOP. Return to Stage 3. + +**Execute approved changes:** + +1. **Structural fixes** - Remove/merge duplicate skills, update router +2. **Content enhancements** - Fix gauntlet failures, add missing guidance to existing skills +3. **Coherence improvements** - Cross-references, terminology alignment, faction voice +4. **Version management** - Apply impact-based bump (patch/minor/major) +5. **Git commit** - Single commit with all changes + +**Version bump rules (impact-based):** +- **Patch (x.y.Z)** - Low-impact: typos, formatting, minor clarifications +- **Minor (x.Y.0)** - Medium-impact: enhanced guidance, new skills, better examples (DEFAULT) +- **Major (X.0.0)** - High-impact: skills removed, structural changes, philosophy shifts (RARE) + +**Commit format:** +``` +feat(meta): enhance [pack-name] - [summary] + +[Detailed list of changes by category] +- Structure: [changes] +- Content: [changes] +- Coherence: [changes] + +Version bump: [reason for patch/minor/major] +``` + +**Output:** Enhanced pack, commit created, summary report + +## Briefing Files Reference + +All briefing files are in this skill directory: + +- `analyzing-pack-domain.md` - Investigative domain analysis (D→B→C→A) +- `reviewing-pack-structure.md` - Structure review, scorecard, gap/duplicate analysis +- `testing-skill-quality.md` - Gauntlet testing methodology with subagents +- `implementing-fixes.md` - Autonomous execution, version management, git commit + +**Load appropriate briefing at each stage.** + +## Critical Distinctions + +**Behavioral vs. Syntactic Testing:** +- ❌ **Syntactic:** "Does Python code parse?" → ast.parse() +- ✅ **Behavioral:** "Does skill guide agents correctly?" → Subagent gauntlet + +**This workflow requires BEHAVIORAL testing.** + +**Maintenance vs. Creation:** +- **Maintenance** (this skill): Enhancing existing SKILL.md files +- **Creation** (superpowers:writing-skills): Writing new skills from scratch + +**Use the right tool for the task.** + +## Red Flags - STOP and Switch Tools + +If you catch yourself thinking ANY of these: +- "I'll write the new skills during execution" → NO. Use superpowers:writing-skills for EACH gap +- "implementing-fixes.md says to create skills" → NO. 
That section was REMOVED. Exit and use writing-skills +- "Token efficiency - I can just write good skills" → NO. Untested skills = broken skills +- "I see the pattern, I can replicate it" → NO. Pattern-matching ≠ behavioral testing +- "User wants this done quickly" → NO. Fast + untested = waste of time fixing later +- "I'm competent, testing is overkill" → NO. Competence = following the process +- "Gaps were approved, so I should fill them" → YES, but using writing-skills, not here +- Validating syntax instead of behavior → Load testing-skill-quality.md +- Skipping gauntlet testing → You're violating the Iron Law +- Making changes without user approval → Follow Review→Discuss→Execute + +**All of these mean: STOP. Exit workflow. Use superpowers:writing-skills.** + +## The Bottom Line + +**Maintaining skills requires behavioral testing, not syntactic validation.** + +Same principle as code: you test behavior, not syntax. + +Load briefings at each stage. Test with subagents. Get approval. Execute. + +No shortcuts. No rationalizations. diff --git a/skills/using-skillpack-maintenance/analyzing-pack-domain.md b/skills/using-skillpack-maintenance/analyzing-pack-domain.md new file mode 100644 index 0000000..b7f64bc --- /dev/null +++ b/skills/using-skillpack-maintenance/analyzing-pack-domain.md @@ -0,0 +1,168 @@ +# Analyzing Pack Domain + +**Purpose:** Investigative process to establish "what this pack should cover" from first principles. + +## Adaptive Investigation Workflow + +**Sequence: D → B → C → A (conditional)** + +### Phase D: User-Guided Scope + +**Ask user:** +- "What is the intended scope and purpose of [pack-name]?" +- "What boundaries should this pack respect?" +- "Who is the target audience? (beginners / practitioners / experts)" +- "What depth of coverage is expected? (overview / comprehensive / exhaustive)" + +**Document:** +- User's vision as baseline +- Explicit boundaries (what's IN scope, what's OUT of scope) +- Success criteria (what makes this pack "complete"?) + +### Phase B: LLM Knowledge-Based Analysis + +**Leverage model knowledge to map the domain:** + +1. **Generate comprehensive coverage map:** + - What are the major concepts/algorithms/techniques in this domain? + - What are the core patterns practitioners need to know? + - What are common implementation challenges? + +2. **Identify structure:** + - Foundational concepts (must understand first) + - Core techniques (bread-and-butter patterns) + - Advanced topics (expert-level material) + - Cross-cutting concerns (testing, debugging, optimization) + +3. **Flag research currency:** + - Is this domain stable or rapidly evolving? + - Stable examples: Design patterns, basic algorithms, established protocols + - Evolving examples: AI/ML, security, modern web frameworks + - If evolving → Flag for Phase A research + +**Output:** Coverage map with categorization (foundational/core/advanced) + +### Phase C: Existing Pack Audit + +**Read all current skills in the pack:** + +1. **Inventory:** + - List all SKILL.md files + - Note skill names and descriptions + - Check router skill (if exists) for specialist list + +2. **Compare against coverage map:** + - What's covered? (existing skills matching coverage areas) + - What's missing? (gaps in foundational/core/advanced areas) + - What overlaps? (multiple skills covering same concept) + - What's obsolete? (outdated approaches, deprecated patterns) + +3. **Quality check:** + - Are descriptions accurate? + - Do skills match their descriptions? + - Are there broken cross-references? 
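A minimal sketch of how the inventory step could be scripted, assuming each skill lives at `skills/<skill-name>/SKILL.md` with YAML front matter; the pack path and helper name are illustrative only, and the naive parser is a starting point rather than a full YAML implementation:

```python
from pathlib import Path

def inventory_skills(pack_dir: str) -> list[dict]:
    """Collect name/description from each SKILL.md front matter (naive parser)."""
    skills = []
    for skill_file in sorted(Path(pack_dir).glob("skills/*/SKILL.md")):
        meta = {"path": str(skill_file), "name": None, "description": None}
        lines = skill_file.read_text(encoding="utf-8").splitlines()
        if lines and lines[0].strip() == "---":          # front matter opens
            for line in lines[1:]:
                if line.strip() == "---":                # front matter closes
                    break
                key, _, value = line.partition(":")
                if key.strip() in ("name", "description"):
                    meta[key.strip()] = value.strip()
        skills.append(meta)
    return skills

if __name__ == "__main__":
    # Hypothetical pack path for illustration
    for skill in inventory_skills("plugins/example-pack"):
        print(f"{skill['name']}: {skill['description']}")
```

The resulting list feeds directly into the coverage-map comparison above.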
+ +**Output:** Gap list, duplicate list, obsolescence flags + +### Phase A: Research (Conditional) + +**ONLY if Phase B flagged domain as rapidly evolving.** + +**Research authoritative sources:** + +1. **For AI/ML domains:** + - Latest survey papers (search: "[domain] survey 2024/2025") + - Current textbooks (check publication dates) + - Official library documentation (PyTorch, TensorFlow, etc.) + - Research benchmarks (Papers with Code, etc.) + +2. **For security domains:** + - OWASP guidelines + - NIST standards + - Recent CVE patterns + - Current threat landscape + +3. **For framework domains:** + - Official documentation (latest version) + - Migration guides (breaking changes) + - Best practices (official recommendations) + +**Update coverage map:** +- Add new techniques/patterns +- Flag deprecated approaches in existing skills +- Note version-specific considerations + +**Decision criteria for Phase A:** +- **Skip research** for: Math, algorithms, design patterns, established protocols +- **Run research** for: AI/ML, security, modern frameworks, evolving standards + +## Outputs + +Generate comprehensive report: + +### 1. Domain Coverage Map + +``` +Foundational: +- [Concept 1] - [Status: Exists / Missing / Needs enhancement] +- [Concept 2] - [Status: Exists / Missing / Needs enhancement] + +Core Techniques: +- [Technique 1] - [Status: ...] +- [Technique 2] - [Status: ...] + +Advanced Topics: +- [Topic 1] - [Status: ...] +- [Topic 2] - [Status: ...] + +Cross-Cutting: +- [Concern 1] - [Status: ...] +``` + +### 2. Current State Assessment + +``` +Existing skills: [count] +- [Skill 1 name] - covers [domain area] +- [Skill 2 name] - covers [domain area] +... +``` + +### 3. Gap Analysis + +``` +Missing (High Priority): +- [Gap 1] - foundational concept not covered +- [Gap 2] - core technique missing + +Missing (Medium Priority): +- [Gap 3] - advanced topic not covered + +Duplicates: +- [Skill A] and [Skill B] overlap on [topic] + +Obsolete: +- [Skill C] uses deprecated approach [old pattern] +``` + +### 4. Research Currency Flag + +``` +Domain stability: [Stable / Evolving] +Research conducted: [Yes / No / Not needed] +Currency concerns: [None / List specific areas needing updates] +``` + +## Proceeding to Next Stage + +After completing investigation, hand off to `reviewing-pack-structure.md` for scorecard generation. + +## Common Mistakes + +| Mistake | Fix | +|---------|-----| +| Skipping user input (Phase D) | Always start with user vision - they define scope | +| Over-relying on LLM knowledge | For evolving domains, run research (Phase A) | +| Skipping gap analysis | Compare coverage map vs. existing skills systematically | +| Treating all domains as stable | Flag AI/ML/security/frameworks for research | +| Vague gap descriptions | Be specific: "Missing TaskGroup patterns" not "async needs work" | diff --git a/skills/using-skillpack-maintenance/implementing-fixes.md b/skills/using-skillpack-maintenance/implementing-fixes.md new file mode 100644 index 0000000..4508020 --- /dev/null +++ b/skills/using-skillpack-maintenance/implementing-fixes.md @@ -0,0 +1,334 @@ +# Implementing Fixes + +**Purpose:** Autonomous execution of approved changes with version management and git commit. 
+ +## Prerequisites + +You should have completed and gotten approval for: +- Pass 1: Structure review (gaps, duplicates, organization) +- Pass 2: Content testing (gauntlet results, fix priorities) +- Pass 3: Coherence validation (cross-skill consistency, faction alignment) +- User discussion and approval of scope + +**Do NOT proceed without user approval of the scope of work.** + +## Execution Workflow + +### 1. Structural Fixes (from Pass 1) + +**CRITICAL CHECKPOINT - New Skills:** + +**STOP:** Did you identify gaps (new skills needed) in Pass 1? + +**If YES → You MUST exit this workflow NOW:** + +1. **DO NOT proceed to execution** +2. **For EACH gap identified:** + - Use `superpowers:writing-skills` skill + - RED: Test scenario WITHOUT the skill + - GREEN: Write skill addressing gaps + - REFACTOR: Close loopholes + - **Commit that ONE skill** +3. **Repeat for ALL gaps** (each skill = separate RED-GREEN-REFACTOR cycle) +4. **AFTER all new skills are tested and committed:** + - Return to meta-skillpack-maintenance + - Load this briefing again + - Continue with other structural fixes below + +**Proceeding past this checkpoint assumes:** +- ✓ Zero new skills needed, OR +- ✓ All new skills already created via superpowers:writing-skills +- ✓ You are ONLY enhancing existing skills, removing duplicates, updating router/metadata + +**If you identified ANY gaps and haven't used superpowers:writing-skills for each:** +**STOP. Exit now. You're violating the Iron Law: NO SKILL WITHOUT BEHAVIORAL TESTING.** + +--- + +**Remove duplicate skills:** + +For skills marked for removal: +1. Identify unique value in skill being removed +2. Merge unique value into kept skill (if any) +3. Delete duplicate SKILL.md and directory +4. Remove references from router (if exists) +5. Update cross-references in other skills + +**Merge overlapping skills:** + +For partial duplicates: +1. Identify all unique content from both skills +2. Create merged skill with comprehensive coverage +3. Reorganize structure if needed +4. Delete original skills +5. Update router and cross-references +6. Update skill name/description if needed + +**Update router skill:** + +If pack has using-X router: +1. Update specialist list to reflect adds/removes +2. Update descriptions to match current skills +3. Verify routing logic makes sense +4. Add cross-references as needed + +### 2. Content Enhancements (from Pass 2) + +**For each skill marked "Fix needed" in gauntlet testing:** + +**Fix rationalizations (A-type issues):** +1. Add explicit counter for each identified rationalization +2. Update "Common Rationalizations" table +3. Add to "Red Flags" list if applicable +4. Strengthen "No exceptions" language + +**Fill edge case gaps (C-type issues):** +1. Add guidance for identified corner cases +2. Document when/how to adapt core principles +3. Add examples for edge case handling +4. Cross-reference related skills if needed + +**Enhance real-world guidance (B-type issues):** +1. Add examples from realistic scenarios +2. Clarify ambiguous instructions +3. Add decision frameworks where needed +4. Update "When to Use" section if unclear + +**Add anti-patterns:** +1. Document observed failure modes from testing +2. Add ❌ WRONG / ✅ CORRECT examples +3. Update "Common Mistakes" section +4. Add warnings for subtle pitfalls + +**Improve examples:** +1. Replace weak examples with tested scenarios +2. Ensure examples are complete and runnable +3. Add comments explaining WHY, not just WHAT +4. Use realistic domain context + +### 3. 
Coherence Improvements (from Pass 3) + +**Cross-reference updates:** + +**CRITICAL:** This is post-update hygiene - ensure skills reference new/enhanced skills. + +For each skill in pack: +1. Identify related skills (related concepts, prerequisites, follow-ups) +2. Add cross-references where helpful: + - "See [skill-name] for [related concept]" + - "**REQUIRED BACKGROUND:** [skill-name]" + - "After mastering this, see [skill-name]" +3. Update router cross-references +4. Ensure bidirectional links (if A references B, should B reference A?) + +**Terminology alignment:** + +1. Identify terminology inconsistencies across skills +2. Choose canonical terms (most clear/standard) +3. Update all skills to use canonical terms +4. Add glossary to router if needed + +**Faction voice adjustment:** + +For skills flagged with faction drift: +1. Read FACTIONS.md for faction principles +2. Adjust language/tone to match faction +3. Realign examples with faction philosophy +4. If severe drift: Flag for potential rehoming + +**If rehoming recommended:** +- Document which faction skill should move to +- Note in commit message for manual handling +- Don't move skills automatically (requires marketplace changes) + +**Metadata synchronization:** + +Update `plugin.json`: +1. Description - ensure it matches enhanced pack content +2. Count skills if tool supports it +3. Verify category is appropriate + +### 4. Version Management (Impact-Based) + +**Assess impact of all changes:** + +**Patch bump (x.y.Z) - Low impact:** +- Typos fixed +- Formatting improvements +- Minor clarifications (< 50 words added) +- Small example corrections +- No new skills, no skills removed + +**Minor bump (x.Y.0) - Medium impact (DEFAULT):** +- Enhanced guidance (added sections, better examples) +- New skills added +- Improved existing skills significantly +- Better anti-pattern coverage +- Fixed gauntlet failures +- Updated for current best practices + +**Major bump (X.0.0) - High impact (RARE, use sparingly):** +- Skills removed entirely +- Structural reorganization +- Philosophy shifts +- Breaking changes to how skills work +- Deprecated major patterns + +**Decision logic:** +1. Any new skills added? → Minor minimum +2. Any skills removed? → Consider major +3. Only fixes/clarifications? → Patch +4. Enhanced multiple skills significantly? → Minor +5. Changed pack philosophy? → Major + +**Default for maintenance reviews: Minor bump** + +**Update version in plugin.json:** +```json +{ + "version": "[new-version]" +} +``` + +### 5. Git Commit + +**Single commit with all changes:** + +**Commit message format:** + +``` +feat(meta): enhance [pack-name] - [one-line summary] + +Structure changes: +- Added [count] new skills: [skill-1], [skill-2], ... +- Removed [count] duplicate skills: [skill-1], [skill-2], ... 
+- Merged [skill-a] + [skill-b] into [skill-merged] +- Updated router to reflect new structure + +Content improvements: +- Enhanced [skill-1]: [specific improvements] +- Enhanced [skill-2]: [specific improvements] +- Fixed gauntlet failures in [skill-3]: [issues addressed] +- Added anti-patterns to [skill-4] + +Coherence updates: +- Added cross-references between [count] skills +- Aligned terminology across pack +- Adjusted faction voice in [skill-name] +- Updated plugin.json metadata + +Version: [old-version] → [new-version] ([patch/minor/major]) +Rationale: [reason for version bump type] +``` + +**Commit command:** + +```bash +git add plugins/[pack-name]/ +git commit -m "$(cat <<'EOF' +feat(meta): enhance [pack-name] - [summary] + +[Full message body as above] +EOF +)" +``` + +**Do NOT push** - let user decide when to push. + +## Execution Principles + +**Autonomous within approved scope:** +- Execute all approved changes without asking again +- Follow user's approved plan exactly +- Make editorial decisions within scope +- Ask only if something unexpected blocks progress + +**Quality standards:** +- All new skills follow CSO guidelines (name/description format) +- All code examples are complete and appropriate to domain +- All cross-references are accurate +- Faction voice is maintained + +**Verification before commit:** +- Verify YAML front matter syntax in all modified skills +- Check that all cross-references point to existing skills +- Ensure router (if exists) references all current skills +- Verify plugin.json has valid JSON syntax + +## Output After Completion + +Provide comprehensive summary: + +``` +# Pack Enhancement Complete: [pack-name] + +## Version: [old] → [new] ([type]) + +## Summary Statistics + +- Skills added: [count] +- Skills removed: [count] +- Skills enhanced: [count] +- Skills tested and passed: [count] + +## Changes by Category + +### Structure +[List of structural changes] + +### Content +[List of content improvements] + +### Coherence +[List of coherence updates] + +## Git Commit + +Created commit: [commit-hash if available] +Message: [first line of commit] + +Ready to push: [Yes] +``` + +## Common Mistakes + +| Mistake | Fix | +|---------|-----| +| Proceeding without approval | Always get user approval before executing | +| Batch changes across passes | Complete one pass fully before next | +| Inconsistent faction voice | Read FACTIONS.md, maintain voice throughout | +| Broken cross-references | Verify all referenced skills exist | +| Invalid YAML | Check syntax before committing | +| Pushing automatically | Let user decide when to push | +| Vague commit messages | Be specific about what changed and why | +| Wrong version bump | Follow impact-based rules, default to minor | + +## Anti-Patterns + +**❌ Changing scope during execution:** +- Don't add extra improvements not discussed +- Don't skip approved changes because "not needed" +- Stick to approved scope exactly + +**❌ Sub-optimal quality:** +- Don't write quick/dirty skills to fill gaps +- Don't copy-paste without adapting to faction +- Don't skip cross-references to save time + +**❌ Incomplete commits:** +- Don't commit partial work +- Don't split into multiple commits +- Single commit with all changes + +**❌ No verification:** +- Don't assume syntax is correct +- Don't skip cross-reference checking +- Verify before committing + +## The Bottom Line + +**Execute approved changes autonomously with high quality standards.** + +One commit. Proper versioning. Complete summary. + +No shortcuts. No scope creep. 
Professional execution. diff --git a/skills/using-skillpack-maintenance/reviewing-pack-structure.md b/skills/using-skillpack-maintenance/reviewing-pack-structure.md new file mode 100644 index 0000000..73d5d82 --- /dev/null +++ b/skills/using-skillpack-maintenance/reviewing-pack-structure.md @@ -0,0 +1,252 @@ +# Reviewing Pack Structure + +**Purpose:** Pass 1 - Analyze pack organization, identify structural issues, generate fitness scorecard. + +## Inputs + +From `analyzing-pack-domain.md`: +- Domain coverage map (what should exist) +- Current skill inventory (what does exist) +- Gap analysis (missing/duplicate/obsolete) +- Research currency flag + +## Analysis Tasks + +### 1. Fitness Scorecard + +Generate scorecard with risk-driven prioritization: + +**Critical Issues** - Pack unusable or fundamentally broken: +- Missing core foundational concepts (users can't understand basics) +- Major gaps in essential coverage (50%+ of core techniques missing) +- Router completely inaccurate or missing when needed +- Multiple skills broken or contradictory + +**Decision:** Critical issues → Recommend "Rebuild from scratch" vs. "Enhance existing" + +Rebuild if: +- More skills missing than exist +- Fundamental philosophy is wrong +- Faction mismatch is severe + +Enhance if: +- Core structure is sound, just needs additions/fixes +- Most existing skills are salvageable + +**Major Issues** - Significant effectiveness reduction: +- Important gaps in core coverage (20-50% of core techniques missing) +- Multiple duplicate skills causing confusion +- Obsolete skills teaching deprecated patterns +- Faction drift across multiple skills +- Metadata significantly out of sync + +**Minor Issues** - Polish and improvements: +- Small gaps in advanced topics +- Minor organizational inconsistencies +- Router descriptions slightly outdated +- Small metadata corrections needed + +**Pass** - Structurally sound: +- Comprehensive coverage of foundational and core areas +- No major gaps or duplicates +- Router (if exists) is accurate +- Faction alignment is good +- Metadata is current + +**Output:** Scorecard with category and specific issues listed + +### 2. Gap Identification + +From coverage map, identify missing skills: + +**Prioritize by importance:** + +**High priority (foundational/core):** +- Foundational concepts users must understand +- Core techniques used frequently +- Common patterns missing from basics + +**Medium priority (advanced):** +- Advanced topics for expert users +- Specialized techniques +- Edge case handling + +**Low priority (nice-to-have):** +- Rare patterns +- Future-looking topics +- Experimental techniques + +**For each gap:** +- Draft skill name (following naming conventions) +- Write description (following CSO guidelines) +- Estimate scope (small/medium/large skill) +- Note dependencies (what skills should be read first) + +**Output:** Prioritized list of gaps with draft names/descriptions + +### 3. Duplicate Detection + +Find skills with overlapping coverage: + +**Analysis process:** +1. Read all skill descriptions +2. Identify content overlap (skills covering same concepts) +3. Read overlapping skills to assess actual content +4. 
Determine relationship + +**Duplicate types:** + +**Complete duplicates** - Same content, different names: +- **Action:** Remove one, preserve unique value from both + +**Partial overlap** - Significant shared content: +- **Action:** Merge into single comprehensive skill + +**Specialization** - One general, one specific: +- **Action:** Keep both, clarify relationship via cross-references +- Example: "async-patterns" (general) + "asyncio-taskgroup" (specific) + +**Complementary** - Different angles on same topic: +- **Action:** Keep both, strengthen cross-references +- Example: "testing-async-code" + "async-patterns-and-concurrency" + +**False positive** - Similar names, different content: +- **Action:** No change, maybe clarify descriptions + +**For each duplicate pair:** +- Classification (complete/partial/specialization/complementary/false) +- Recommendation (remove/merge/keep with cross-refs) +- Preserve unique value from each + +**Output:** Duplicate analysis with recommendations + +### 4. Organization Validation + +Check pack-level organization: + +**Router skill validation (if exists):** +- Does router list all current specialist skills? +- Are descriptions in router accurate? +- Does routing logic make sense? +- Are there skills NOT mentioned in router? +- Are there router entries for NON-EXISTENT skills? + +**Faction alignment:** +- Read FACTIONS.md for this pack's faction principles +- Check 3-5 representative skills for voice/philosophy +- Identify drift patterns +- Severity: Minor (style drift) / Major (wrong philosophy) + +**Metadata validation:** +- plugin.json description matches actual content? +- Skill count is accurate? +- Category is appropriate? +- Version reflects current state? + +**Navigation experience:** +- Can users find appropriate skills easily? +- Are skill names descriptive? +- Are descriptions helpful for discovery? 
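The metadata portion of this check can also be spot-checked mechanically. A minimal sketch, assuming the manifest sits at `.claude-plugin/plugin.json` as in this repository; the pack path and the description-length threshold are illustrative assumptions:

```python
import json
from pathlib import Path

def validate_metadata(pack_dir: str) -> list[str]:
    """Compare plugin.json against the skills actually present on disk."""
    pack = Path(pack_dir)
    issues = []
    manifest = json.loads((pack / ".claude-plugin" / "plugin.json").read_text(encoding="utf-8"))

    if not list(pack.glob("skills/*/SKILL.md")):
        issues.append("No SKILL.md files found - pack layout may have changed")

    for field in ("name", "description", "version"):
        if not manifest.get(field):
            issues.append(f"plugin.json missing or empty field: {field}")

    # Arbitrary threshold: a very short description rarely supports discovery
    if manifest.get("description") and len(manifest["description"]) < 40:
        issues.append("plugin.json description looks too thin to support discovery")

    return issues

# Hypothetical pack path for illustration
for issue in validate_metadata("plugins/example-pack"):
    print(issue)
```

Anything the script flags still needs human judgment before it becomes a scorecard item.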
+ +**Output:** Organization issues with severity + +## Generate Complete Report + +Combine all analyses: + +``` +# Structural Review: [pack-name] + +## Scorecard: [Critical / Major / Minor / Pass] + +[If Critical] +Recommendation: [Rebuild from scratch / Enhance existing] +Rationale: [Specific reasons] + +## Issues by Priority + +### Critical Issues ([count]) +- [Issue 1] - [Description] +- [Issue 2] - [Description] + +### Major Issues ([count]) +- [Issue 1] - [Description] +- [Issue 2] - [Description] + +### Minor Issues ([count]) +- [Issue 1] - [Description] +- [Issue 2] - [Description] + +## Gap Analysis + +### High Priority Gaps ([count]) +- [Gap 1] + - Skill name: [proposed-name] + - Description: [draft description] + - Scope: [small/medium/large] + - Dependencies: [prerequisites] + +### Medium Priority Gaps ([count]) +[Same format] + +### Low Priority Gaps ([count]) +[Same format] + +## Duplicate Analysis + +- [Skill A] + [Skill B] + - Type: [complete/partial/specialization/complementary] + - Recommendation: [remove/merge/keep with cross-refs] + - Rationale: [why] + +## Organization Issues + +### Router ([issues count]) +- [Issue description] + +### Faction Alignment ([severity]) +- [Drift pattern] +- Affected skills: [list] + +### Metadata ([issues count]) +- [Issue description] + +## Recommended Actions + +**Gaps requiring superpowers:writing-skills:** +- [count] new skills needed (RED-GREEN-REFACTOR for each, outside this workflow) + +**Structure fixes (for later execution phase):** +- Remove: [count] duplicate skills +- Merge: [count] partial duplicates +- Update router: [Yes/No] +``` + +## Decision Gate + +Present scorecard and report to user: + +**If Critical:** +- Explain rebuild vs. enhance trade-offs +- Get user decision before proceeding + +**If Major/Minor/Pass:** +- Present findings +- Confirm user wants to proceed with Pass 2 (content testing) + +## Proceeding to Next Stage + +After scorecard approval: +- If proceeding → Move to `testing-skill-quality.md` (Pass 2) +- If rebuilding → Stop maintenance workflow, switch to creation workflow +- If canceling → Stop workflow + +## Common Mistakes + +| Mistake | Fix | +|---------|-----| +| Scorecard too lenient | Be honest: missing 50% of core = Critical | +| Vague gap descriptions | Draft actual skill names and descriptions | +| Keeping all duplicates | Duplicates confuse users - merge or remove | +| Ignoring faction drift | Faction identity matters - flag misalignment | +| Skipping metadata check | Inaccurate metadata breaks discovery | diff --git a/skills/using-skillpack-maintenance/testing-skill-quality.md b/skills/using-skillpack-maintenance/testing-skill-quality.md new file mode 100644 index 0000000..d5c1bf9 --- /dev/null +++ b/skills/using-skillpack-maintenance/testing-skill-quality.md @@ -0,0 +1,340 @@ +# Testing Skill Quality + +**Purpose:** Pass 2 - Run gauntlet tests on each skill using subagents to identify issues requiring fixes. + +## Core Principle + +**Behavioral testing, NOT syntactic validation.** + +Skills are process documentation. Test if they guide agents correctly, not if they parse correctly. + +## What We're Testing + +**Effectiveness questions:** +- Does the skill actually guide agents correctly? +- Do agents follow the skill under pressure? +- Does the skill handle edge cases? +- Are there gaps in guidance that leave agents stranded? 
+ +**What we're NOT testing:** +- Syntax (markdown parsing, code syntax) - syntactic, not behavioral +- Coverage (already done in Pass 1) - structural, not behavioral +- Quality benchmarking (comparing to other skills) - comparative, not behavioral + +## Gauntlet Design + +**Priority: A → C → B** + +### A. Pressure Scenarios (Catch Rationalizations) + +**Purpose:** Test if skill holds up when agents want to skip it. + +**Pressure types:** + +**1. Time pressure:** +- "This is urgent, we need it done quickly" +- "Just get it working, we can improve it later" +- "The deadline is in an hour" + +**2. Simplicity temptation:** +- "This seems too simple to need [skill pattern]" +- "The example is straightforward, no need to overthink" +- "This is a trivial case" + +**3. Overkill perception:** +- "The skill is designed for complex cases, this is basic" +- "We don't need the full process for this small change" +- "That's way more than necessary" + +**4. Sunk cost:** +- "I already wrote most of the code" +- "We've invested time in this approach" +- "Just need to finish this last part" + +**Design approach:** +- Combine 2-3 pressures for maximum effect +- Example: Time pressure + simplicity + sunk cost +- Watch for rationalizations (verbatim documentation critical) + +### C. Adversarial Edge Cases (Test Robustness) + +**Purpose:** Test if skill provides guidance for corner cases. + +**Edge case types:** + +**1. Principle conflicts:** +- When skill's guidelines conflict with each other +- Example: "DRY vs. explicit" or "test-first vs. prototyping" +- Does skill help resolve conflict? + +**2. Naive application failures:** +- Cases where following skill literally doesn't work +- Example: TDD for exploratory research code +- Does skill explain when/how to adapt? + +**3. Missing information:** +- Scenarios requiring knowledge skill doesn't provide +- Does skill reference other resources? +- Does it leave agent completely stuck? + +**4. Tool limitations:** +- When environment doesn't support skill's approach +- Example: No test framework available +- Does skill have fallback guidance? + +**Design approach:** +- Identify skill's core principles +- Find situations where they conflict or fail +- Test if skill handles gracefully + +### B. Real-World Complexity (Validate Utility) + +**Purpose:** Test if skill guides toward best practices in realistic scenarios. + +**Complexity types:** + +**1. Messy requirements:** +- Unclear specifications +- Conflicting stakeholder needs +- Evolving requirements mid-task + +**2. Multiple valid approaches:** +- Several solutions, all reasonable +- Trade-offs between options +- Does skill help choose? + +**3. Integration constraints:** +- Existing codebase patterns +- Team conventions +- Technical debt + +**4. Incomplete information:** +- Missing context +- Unknown dependencies +- Undocumented behavior + +**Design approach:** +- Use realistic scenarios from the domain +- Include ambiguity and messiness +- Test if skill provides actionable guidance + +## Testing Process (Per Skill) + +**D - Iterative Hardening:** + +### 1. 
Design Challenging Scenario + +Pick from gauntlet categories (prioritize A → C → B): + +**For discipline-enforcing skills** (TDD, verification-before-completion): +- Focus on **A (pressure)** scenarios +- Combine multiple pressures +- Test rationalization resistance + +**For technique skills** (condition-based-waiting, root-cause-tracing): +- Focus on **C (edge cases)** and **B (real-world)** +- Test application correctness +- Test gap identification + +**For pattern skills** (reducing-complexity, information-hiding): +- Focus on **C (edge cases)** and **B (real-world)** +- Test recognition and application +- Test when NOT to apply + +**For reference skills** (API docs, command references): +- Focus on **B (real-world)** +- Test information retrieval +- Test application of retrieved info + +### 2. Run Subagent with Current Skill + +**Critical:** Use the Task tool to dispatch subagent. + +**Provide to subagent:** +- The scenario (task description) +- Access to the skill being tested +- Any necessary context (codebase, tools) + +**What NOT to provide:** +- Meta-testing instructions (don't tell them they're being tested) +- Expected behavior (let them apply skill naturally) +- Hints about what you're looking for + +### 3. Observe and Document + +**Watch for:** + +**Compliance:** +- Did agent follow the skill? +- Did they reference it explicitly? +- Did they apply patterns correctly? + +**Rationalizations (verbatim):** +- Exact words used to skip steps +- Justifications for shortcuts +- "Spirit vs. letter" arguments + +**Failure modes:** +- Where did skill guidance fail? +- Where was agent left without guidance? +- Where did naive application break? + +**Edge case handling:** +- Did skill provide guidance for corner cases? +- Did agent get stuck? +- Did they improvise (potentially incorrectly)? + +### 4. Assess Result + +**Pass criteria:** +- Agent followed skill correctly +- Skill provided sufficient guidance +- No significant rationalizations +- Edge cases handled appropriately + +**Fix needed criteria:** +- Agent skipped skill steps (with rationalization) +- Skill had gaps leaving agent stuck +- Edge cases not covered +- Naive application failed + +### 5. Document Issues + +**If fix needed, document specifically:** + +**Issue category:** +- Rationalization vulnerability (A) +- Edge case gap (C) +- Real-world guidance gap (B) +- Missing anti-pattern warning +- Unclear instructions +- Missing cross-reference + +**Priority:** +- **Critical** - Skill fails basic use cases, agents skip it consistently +- **Major** - Edge cases fail, significant gaps in guidance +- **Minor** - Clarity improvements, additional examples needed + +**Specific fixes needed:** +- "Add explicit counter for rationalization: [quote]" +- "Add guidance for edge case: [description]" +- "Add example for scenario: [description]" +- "Clarify instruction: [which section]" + +## Testing Multiple Skills + +**Strategy:** + +**Priority order:** +1. Router skills first (affects all specialist discovery) +2. Foundational skills (prerequisites for others) +3. Core technique skills (most frequently used) +4. 
Advanced skills (expert-level) + +**Batch approach:** +- Test 3-5 skills at a time +- Document results before moving to next batch +- Allows pattern recognition across skills + +**Efficiency:** +- Skills that passed in previous maintenance cycles: Spot-check only +- New skills or significantly changed: Full gauntlet +- Minor edits: Targeted testing of changed sections + +## Output Format + +Generate per-skill report: + +``` +# Quality Testing Results: [pack-name] + +## Summary + +- Total skills tested: [count] +- Passed: [count] +- Fix needed: [count] + - Critical: [count] + - Major: [count] + - Minor: [count] + +## Detailed Results + +### [Skill 1 Name] + +**Result:** [Pass / Fix needed] + +[If Fix needed] + +**Priority:** [Critical / Major / Minor] + +**Test scenario used:** [Brief description] + +**Issues identified:** + +1. **Issue:** [Description] + - **Category:** [Rationalization / Edge case / Real-world gap / etc.] + - **Evidence:** "[Verbatim quote from subagent if applicable]" + - **Fix needed:** [Specific action] + +2. **Issue:** [Description] + [Same format] + +**Test transcript:** [Link or summary of subagent behavior] + +--- + +### [Skill 2 Name] + +**Result:** Pass + +**Test scenario used:** [Brief description] + +**Notes:** Skill performed well, no issues identified. + +--- + +[Repeat for all skills] +``` + +## Common Rationalizations (Meta-Testing) + +When YOU are doing the testing, watch for these rationalizations: + +| Excuse | Reality | +|--------|---------| +| "Skill looks good, no need to test" | Looking ≠ testing. Run gauntlet. | +| "I'll just check the syntax" | Syntactic validation ≠ behavioral. Use subagents. | +| "Testing is overkill for small changes" | Small changes can break guidance. Test anyway. | +| "I'm confident this works" | Confidence ≠ validation. Test behavior. | +| "Quality benchmarking is enough" | Comparison ≠ effectiveness. Test with scenarios. | + +**If you catch yourself thinking these → STOP. Run gauntlet with subagents.** + +## Philosophy + +**D as gauntlet + B for fixes:** + +- **D (iterative hardening):** Run challenging scenarios to identify issues +- **B (targeted fixes):** Fix specific identified problems + +If skill passes gauntlet → No changes needed. + +The LLM is both author and judge of skill fitness. Trust the testing process. + +## Proceeding to Next Stage + +After testing all skills: +- Compile complete test report +- Proceed to Pass 3 (coherence validation) +- Test results will inform implementation fixes in Stage 4 + +## Anti-Patterns + +| Anti-Pattern | Why Bad | Instead | +|--------------|---------|---------| +| Syntactic validation only | Doesn't test if skill actually works | Run behavioral tests with subagents | +| Self-assessment | You can't objectively test your own work | Dispatch subagents for testing | +| "Looks good" review | Visual inspection ≠ behavioral testing | Run gauntlet scenarios | +| Skipping pressure tests | Miss rationalization vulnerabilities | Use A-priority pressure scenarios | +| Generic test scenarios | Don't reveal real issues | Use domain-specific, realistic scenarios | +| Testing without documenting | Can't track patterns or close loops | Document verbatim rationalizations |