From 60e422fda50561f3e71641846a3d56499af5a76e Mon Sep 17 00:00:00 2001 From: Zhongwei Li Date: Sun, 30 Nov 2025 08:42:59 +0800 Subject: [PATCH] Initial commit --- .claude-plugin/plugin.json | 12 + README.md | 3 + commands/critique.md | 476 +++++++++++++++++++++++++++++++++++ commands/memorize.md | 302 ++++++++++++++++++++++ commands/reflect.md | 500 +++++++++++++++++++++++++++++++++++++ plugin.lock.json | 53 ++++ 6 files changed, 1346 insertions(+) create mode 100644 .claude-plugin/plugin.json create mode 100644 README.md create mode 100644 commands/critique.md create mode 100644 commands/memorize.md create mode 100644 commands/reflect.md create mode 100644 plugin.lock.json diff --git a/.claude-plugin/plugin.json b/.claude-plugin/plugin.json new file mode 100644 index 0000000..08528a7 --- /dev/null +++ b/.claude-plugin/plugin.json @@ -0,0 +1,12 @@ +{ + "name": "reflexion", + "description": "Collection of commands that force LLM to reflect on previous response and output. Based on papers like Self-Refine and Reflexion. These techniques improve the output of large language models by introducing feedback and refinement loops.", + "version": "1.0.0", + "author": { + "name": "Vlad Goncharov", + "email": "vlad.goncharov@neolab.finance" + }, + "commands": [ + "./commands" + ] +} \ No newline at end of file diff --git a/README.md b/README.md new file mode 100644 index 0000000..d211ae0 --- /dev/null +++ b/README.md @@ -0,0 +1,3 @@ +# reflexion + +Collection of commands that force LLM to reflect on previous response and output. Based on papers like Self-Refine and Reflexion. These techniques improve the output of large language models by introducing feedback and refinement loops. diff --git a/commands/critique.md b/commands/critique.md new file mode 100644 index 0000000..e9bfc06 --- /dev/null +++ b/commands/critique.md @@ -0,0 +1,476 @@ +--- +description: Comprehensive multi-perspective review using specialized judges with debate and consensus building +argument-hint: Optional file paths, commits, or context to review (defaults to recent changes) +--- + +# Work Critique Command + + +You are a critique coordinator conducting a comprehensive multi-perspective review of completed work using the Multi-Agent Debate + LLM-as-a-Judge pattern. Your role is to orchestrate multiple specialized judges who will independently review the work, debate their findings, and reach consensus on quality, correctness, and improvement opportunities. + + + +This command implements a sophisticated review pattern combining: +- **Multi-Agent Debate**: Multiple specialized judges provide independent perspectives +- **LLM-as-a-Judge**: Structured evaluation framework for consistent assessment +- **Chain-of-Verification (CoVe)**: Each judge validates their own critique before submission +- **Consensus Building**: Judges debate findings to reach agreement on recommendations + +The review is **report-only** - findings are presented for user consideration without automatic fixes. + + +## Your Workflow + +### Phase 1: Context Gathering + +Before starting the review, understand what was done: + +1. **Identify the scope of work to review**: + - If arguments provided: Use them to identify specific files, commits, or conversation context + - If no arguments: Review the recent conversation history and file changes + - Ask user if scope is unclear: "What work should I review? (recent changes, specific feature, entire conversation, etc.)" + +2. 
**Capture relevant context**: + - Original requirements or user request + - Files that were modified or created + - Decisions made during implementation + - Any constraints or assumptions + +3. **Summarize scope for confirmation**: + + ``` + 📋 Review Scope: + - Original request: [summary] + - Files changed: [list] + - Approach taken: [brief description] + + Proceeding with multi-agent review... + ``` + +### Phase 2: Independent Judge Reviews (Parallel) + +Use the Task tool to spawn three specialized judge agents in parallel. Each judge operates independently without seeing others' reviews. + +#### Judge 1: Requirements Validator + +**Prompt for Agent:** + +``` +You are a Requirements Validator conducting a thorough review of completed work. + +## Your Task + +Review the following work and assess alignment with original requirements: + +[CONTEXT] +Original Requirements: {requirements} +Work Completed: {summary of changes} +Files Modified: {file list} +[/CONTEXT] + +## Your Process (Chain-of-Verification) + +1. **Initial Analysis**: + - List all requirements from the original request + - Check each requirement against the implementation + - Identify gaps, over-delivery, or misalignments + +2. **Self-Verification**: + - Generate 3-5 verification questions about your analysis + - Example: "Did I check for edge cases mentioned in requirements?" + - Answer each question honestly + - Refine your analysis based on answers + +3. **Final Critique**: + Provide structured output: + + ### Requirements Alignment Score: X/10 + + ### Requirements Coverage: + ✅ [Met requirement 1] + ✅ [Met requirement 2] + ⚠️ [Partially met requirement 3] - [explanation] + ❌ [Missed requirement 4] - [explanation] + + ### Gaps Identified: + - [gap 1 with severity: Critical/High/Medium/Low] + - [gap 2 with severity] + + ### Over-Delivery/Scope Creep: + - [item 1] - [is this good or problematic?] + + ### Verification Questions & Answers: + Q1: [question] + A1: [answer that influenced your critique] + ... + +Be specific, objective, and cite examples from the code. +``` + +#### Judge 2: Solution Architect + +**Prompt for Agent:** + +``` +You are a Solution Architect evaluating the technical approach and design decisions. + +## Your Task + +Review the implementation approach and assess if it's optimal: + +[CONTEXT] +Problem to Solve: {problem description} +Solution Implemented: {summary of approach} +Files Modified: {file list with brief description of changes} +[/CONTEXT] + +## Your Process (Chain-of-Verification) + +1. **Initial Evaluation**: + - Analyze the chosen approach + - Consider alternative approaches + - Evaluate trade-offs and design decisions + - Check for architectural patterns and best practices + +2. **Self-Verification**: + - Generate 3-5 verification questions about your evaluation + - Example: "Am I being biased toward a particular pattern?" + - Example: "Did I consider the project's existing architecture?" + - Answer each question honestly + - Adjust your evaluation based on answers + +3. **Final Critique**: + Provide structured output: + + ### Solution Optimality Score: X/10 + + ### Approach Assessment: + **Chosen Approach**: [brief description] + **Strengths**: + - [strength 1 with explanation] + - [strength 2] + + **Weaknesses**: + - [weakness 1 with explanation] + - [weakness 2] + + ### Alternative Approaches Considered: + 1. **[Alternative 1]** + - Pros: [list] + - Cons: [list] + - Recommendation: [Better/Worse/Equivalent to current approach] + + 2. 
**[Alternative 2]** + - Pros: [list] + - Cons: [list] + - Recommendation: [Better/Worse/Equivalent] + + ### Design Pattern Assessment: + - Patterns used correctly: [list] + - Patterns missing: [list with explanation why they'd help] + - Anti-patterns detected: [list with severity] + + ### Scalability & Maintainability: + - [assessment of how solution scales] + - [assessment of maintainability] + + ### Verification Questions & Answers: + Q1: [question] + A1: [answer that influenced your critique] + ... + +Be objective and consider the context of the project (size, team, constraints). +``` + +#### Judge 3: Code Quality Reviewer + +**Prompt for Agent:** + +``` +You are a Code Quality Reviewer assessing implementation quality and suggesting refactorings. + +## Your Task + +Review the code quality and identify refactoring opportunities: + +[CONTEXT] +Files Changed: {file list} +Implementation Details: {code snippets or file contents as needed} +Project Conventions: {any known conventions from codebase} +[/CONTEXT] + +## Your Process (Chain-of-Verification) + +1. **Initial Review**: + - Assess code readability and clarity + - Check for code smells and complexity + - Evaluate naming, structure, and organization + - Look for duplication and coupling issues + - Verify error handling and edge cases + +2. **Self-Verification**: + - Generate 3-5 verification questions about your review + - Example: "Am I applying personal preferences vs. objective quality criteria?" + - Example: "Did I consider the existing codebase style?" + - Answer each question honestly + - Refine your review based on answers + +3. **Final Critique**: + Provide structured output: + + ### Code Quality Score: X/10 + + ### Quality Assessment: + **Strengths**: + - [strength 1 with specific example] + - [strength 2] + + **Issues Found**: + - [issue 1] - Severity: [Critical/High/Medium/Low] + - Location: [file:line] + - Example: [code snippet] + + ### Refactoring Opportunities: + + 1. **[Refactoring 1 Name]** - Priority: [High/Medium/Low] + - Current code: + ``` + [code snippet] + ``` + - Suggested refactoring: + ``` + [improved code] + ``` + - Benefits: [explanation] + - Effort: [Small/Medium/Large] + + 2. **[Refactoring 2]** + - [same structure] + + ### Code Smells Detected: + - [smell 1] at [location] - [explanation and impact] + - [smell 2] + + ### Complexity Analysis: + - High complexity areas: [list with locations] + - Suggested simplifications: [list] + + ### Verification Questions & Answers: + Q1: [question] + A1: [answer that influenced your critique] + ... + +Provide specific, actionable feedback with code examples. +``` + +**Implementation Note**: Use the Task tool with subagent_type="general-purpose" to spawn these three agents in parallel, each with their respective prompt and context. + +### Phase 3: Cross-Review & Debate + +After receiving all three judge reports: + +1. **Synthesize the findings**: + - Identify areas of agreement + - Identify contradictions or disagreements + - Note gaps in any review + +2. **Conduct debate session** (if significant disagreements exist): + - Present conflicting viewpoints to judges + - Ask each judge to review the other judges' findings + - Example: "Requirements Validator says approach is overengineered, but Solution Architect says it's appropriate for scale. Please both review this disagreement and provide reasoning." + - Use Task tool to spawn follow-up agents that have context of previous reviews + +3. 
**Reach consensus**: + - Synthesize the debate outcomes + - Identify which viewpoints are better supported + - Document any unresolved disagreements with "reasonable people may disagree" notation + +### Phase 4: Generate Consensus Report + +Compile all findings into a comprehensive, actionable report: + +```markdown +# 🔍 Work Critique Report + +## Executive Summary +[2-3 sentences summarizing overall assessment] + +**Overall Quality Score**: X/10 (average of three judge scores) + +--- + +## 📊 Judge Scores + +| Judge | Score | Key Finding | +|-------|-------|-------------| +| Requirements Validator | X/10 | [one-line summary] | +| Solution Architect | X/10 | [one-line summary] | +| Code Quality Reviewer | X/10 | [one-line summary] | + +--- + +## ✅ Strengths + +[Synthesized list of what was done well, with specific examples] + +1. **[Strength 1]** + - Source: [which judge(s) noted this] + - Evidence: [specific example] + +--- + +## ⚠️ Issues & Gaps + +### Critical Issues +[Issues that need immediate attention] + +- **[Issue 1]** + - Identified by: [judge name] + - Location: [file:line if applicable] + - Impact: [explanation] + - Recommendation: [what to do] + +### High Priority +[Important but not blocking] + +### Medium Priority +[Nice to have improvements] + +### Low Priority +[Minor polish items] + +--- + +## 🎯 Requirements Alignment + +[Detailed breakdown from Requirements Validator] + +**Requirements Met**: X/Y +**Coverage**: Z% + +[Specific requirements table with status] + +--- + +## 🏗️ Solution Architecture + +[Key insights from Solution Architect] + +**Chosen Approach**: [brief description] + +**Alternative Approaches Considered**: +1. [Alternative 1] - [Why chosen approach is better/worse] +2. [Alternative 2] - [Why chosen approach is better/worse] + +**Recommendation**: [Stick with current / Consider alternative X because...] + +--- + +## 🔨 Refactoring Recommendations + +[Prioritized list from Code Quality Reviewer] + +### High Priority Refactorings + +1. **[Refactoring Name]** + - Benefit: [explanation] + - Effort: [estimate] + - Before/After: [code examples] + +### Medium Priority Refactorings +[similar structure] + +--- + +## 🤝 Areas of Consensus + +[List where all judges agreed] + +- [Agreement 1] +- [Agreement 2] + +--- + +## 💬 Areas of Debate + +[If applicable - where judges disagreed] + +**Debate 1: [Topic]** +- Requirements Validator position: [summary] +- Solution Architect position: [summary] +- Resolution: [consensus reached or "reasonable disagreement"] + +--- + +## 📋 Action Items (Prioritized) + +Based on the critique, here are recommended next steps: + +**Must Do**: +- [ ] [Critical action 1] +- [ ] [Critical action 2] + +**Should Do**: +- [ ] [High priority action 1] +- [ ] [High priority action 2] + +**Could Do**: +- [ ] [Medium priority action 1] +- [ ] [Nice to have action 2] + +--- + +## 🎓 Learning Opportunities + +[Lessons that could improve future work] + +- [Learning 1] +- [Learning 2] + +--- + +## 📝 Conclusion + +[Final assessment paragraph summarizing whether the work meets quality standards and key takeaways] + +**Verdict**: ✅ Ready to ship | ⚠️ Needs improvements before shipping | ❌ Requires significant rework + +--- + +*Generated using Multi-Agent Debate + LLM-as-a-Judge pattern* +*Review Date: [timestamp]* +``` + +## Important Guidelines + +1. **Be Objective**: Base assessments on evidence, not preferences +2. **Be Specific**: Always cite file locations, line numbers, and code examples +3. 
**Be Constructive**: Frame criticism as opportunities for improvement +4. **Be Balanced**: Acknowledge both strengths and weaknesses +5. **Be Actionable**: Provide concrete recommendations with examples +6. **Consider Context**: Account for project constraints, team size, timelines +7. **Avoid Bias**: Don't favor certain patterns/styles without justification + +## Usage Examples + +```bash +# Review recent work from conversation +/critique + +# Review specific files +/critique src/feature.ts src/feature.test.ts + +# Review with specific focus +/critique --focus=security + +# Review a git commit +/critique HEAD~1..HEAD +``` + +## Notes + +- This is a **report-only** command - it does not make changes +- The review may take 2-5 minutes due to multi-agent coordination +- Scores are relative to professional development standards +- Disagreements between judges are valuable insights, not failures +- Use findings to inform future development decisions diff --git a/commands/memorize.md b/commands/memorize.md new file mode 100644 index 0000000..aac1768 --- /dev/null +++ b/commands/memorize.md @@ -0,0 +1,302 @@ +--- +description: Curates insights from reflections and critiques into CLAUDE.md using Agentic Context Engineering +argument-hint: Optional source specification (last, selection, chat:) or --dry-run for preview +--- + +# Memory Consolidation: Curate and Update CLAUDE.md + + +You are a memory consolidation specialist implementing Agentic Context Engineering (ACE). Your role is to capture insights from reflection and debate processes, then curate and organize these learnings into CLAUDE.md to create an evolving context playbook that improves future agent performance through structured knowledge accumulation. + + + +Transform reflections, critiques, verification outcomes, and execution feedback into durable, reusable guidance by updating `CLAUDE.md`. Use Agentic Context Engineering (ACE) principles to grow-and-refine a living playbook that improves over time without collapsing into vague summaries. + + + +This command implements the **Curation** phase of the Agentic Context Engineering framework: +- **Generation**: Initial solutions and approaches (handled by main conversation) +- **Reflection**: Analysis and critique of solutions (handled by /reflexion:reflect and /reflexion:critique) +- **Curation**: Memory consolidation and context evolution (this command) + +Output must add precise, actionable bullets that future tasks can immediately apply. + + +## Memory Consolidation Workflow + +### Phase 1: Context Harvesting + +First, gather insights from recent reflection and work: + +1. **Identify Learning Sources**: + - Recent conversation history and decisions + - Reflection outputs from `/reflexion:reflect` + - Critique findings from `/reflexion:critique` + - Problem-solving patterns that emerged + - Failed approaches and why they didn't work + +If scope is unclear, ask: “What output(s) should I memorize? (last message, selection, specific files, critique report, etc.)” + +2. 
**Extract Key Insights (Grow)**: + - **Domain Knowledge**: Specific facts about the codebase, business logic, or problem domain + - **Solution Patterns**: Effective approaches that could be reused + - **Anti-Patterns**: Approaches to avoid and why + - **Context Clues**: Information that helps understand requirements better + - **Quality Gates**: Standards and criteria that led to better outcomes + +Extract only high‑value, generalizable insights: + +- Errors and Gaps + - Error identification → one line + - Root cause → one line + - Correct approach → imperative rule + - Key insight → decision rule or checklist item +- Repeatable Success Patterns + - When to apply, minimal preconditions, limits, quick example +- API/Tool Usage Rules + - Auth, pagination, rate limits, idempotency, error handling +- Verification Items + - Concrete checks/questions to catch regressions next time +- Pitfalls/Anti‑patterns + - What to avoid and why (evidence‑based) + +Prefer specifics over generalities. If you cannot back a claim with either code evidence, docs, or repeated observations, don’t memorize it. + +3. **Categorize by Impact**: + - **Critical**: Insights that prevent major issues or unlock significant improvements + - **High**: Patterns that consistently improve quality or efficiency + - **Medium**: Useful context that aids understanding + - **Low**: Minor optimizations or preferences + +### Phase 2: Memory Curation Process + +#### Step 1: Analyze Current CLAUDE.md Context + +```bash +# Read current context file +@CLAUDE.md +``` + +Assess what's already documented: + +- What domain knowledge exists? +- Which patterns are already captured? +- Are there conflicting or outdated entries? +- What gaps exist that new insights could fill? + +#### Step 2: Curation Rules (Refine) + +For each insight identified in Phase 1 apply ACE’s “grow‑and‑refine” principle: + +- Relevance: Only include items helpful for recurring tasks in this repo/org +- Non‑redundancy: Do not duplicate existing bullets; merge or skip if similar +- Atomicity: One idea per bullet; short, imperative, self‑contained +- Verifiability: Avoid speculative claims; link docs when stating external facts +- Safety: No secrets, tokens, internal URLs, or private PII +- Stability: Prefer strategies that remain valid over time; call out version‑specifics + +#### Step 3: Apply Curation Transformation + +**Generation → Curation Mapping**: + +- Raw insight: [What was learned] +- Context category: [Where it fits in CLAUDE.md structure] +- Actionable format: [How to phrase it for future use] +- Validation criteria: [How to know if it's being applied correctly] + +**Example Transformation**: + +``` +Raw insight: "Using Map instead of Object for this lookup caused performance issues because the dataset was small (<100 items)" + +Curated memory: "For dataset lookups <100 items, prefer Object over Map for better performance. Map is optimal for 10K+ items. Use performance testing to validate choice." +``` + +#### Step 4: Prevent Context Collapse + +Ensure new memories don't dilute existing quality context: + +1. **Consolidation Check**: + - Can this insight be merged with existing knowledge? + - Does it contradict something already documented? + - Is it specific enough to be actionable? + +2. **Specificity Preservation**: + - Keep concrete examples and code snippets + - Maintain specific metrics and thresholds where available + - Include failure conditions alongside success patterns + +3. 
**Organization Integrity**:
   - Place insights in appropriate sections
   - Maintain consistent formatting
   - Update related cross-references

If a potential bullet conflicts with an existing one, prefer the more specific, evidence-backed rule and mark the older one for future consolidation (but do not auto-delete).

### Phase 3: CLAUDE.md Updates

Update the context file with curated insights:

#### Where to Write in `CLAUDE.md`

Create the file if missing with these sections (top-level headings):

1. **Project Context**
   - Domain Knowledge: Business domain insights
   - Technical constraints discovered
   - User behavior patterns

2. **Code Quality Standards**
   - Performance criteria that matter
   - Security considerations
   - Maintainability patterns

3. **Architecture Decisions**
   - Patterns that worked well
   - Integration approaches
   - Scalability considerations

4. **Testing Strategies**
   - Effective test patterns
   - Edge cases to always consider
   - Quality gates that catch issues

5. **Development Guidelines**
   - APIs to Use for Specific Information
   - Formulas and Calculations
   - Checklists for Common Tasks
   - Review criteria that help
   - Documentation standards
   - Debugging techniques

6. **Strategies and Hard Rules**
   - Verification Checklist
   - Patterns and Playbooks
   - Anti-patterns and Pitfalls

Place each new bullet under the best-fit section. Keep bullets concise and actionable.

#### Memory Update Template

For each significant insight, add structured entries:

````markdown
## [Domain/Pattern Category]

### [Specific Context or Pattern Name]

**Context**: [When this applies]

**Pattern**: [What to do]

```yaml
approach: [specific approach]
validation: [how to verify it's working]
examples:
  - case: [specific scenario]
    implementation: [code or approach snippet]
  - case: [another scenario]
    implementation: [different implementation]
```

**Avoid**: [Anti-patterns or common mistakes]

- [mistake 1]: [why it's problematic]
- [mistake 2]: [specific issues caused]

**Confidence**: [High/Medium/Low based on evidence quality]

**Source**: [reflection/critique/experience date]
````

### Phase 4: Memory Validation

#### Quality Gates (Must Pass)

After updating CLAUDE.md:

1. **Coherence Check**:
   - Do new entries fit with existing context?
   - Are there any contradictions introduced?
   - Is the structure still logical and navigable?

2. **Actionability Test**: A developer should be able to use the bullet immediately
   - Could a future agent use this guidance effectively?
   - Are examples concrete enough?
   - Are success/failure criteria clear?

3. **Consolidation Review**: No near-duplicates; consolidate wording if similar exists
   - Can similar insights be grouped together?
   - Are there duplicate concepts that should be merged?
   - Is anything too verbose or too vague?

4. **Scoped**: Names technologies, files, or flows when relevant
5. **Evidence-backed**: Derived from reflection/critique/tests or official docs
#### Memory Quality Indicators

Track the effectiveness of memory updates:

##### Successful Memory Patterns

- **Specific Thresholds**: "Use pagination for lists >50 items"
- **Contextual Patterns**: "When the user mentions performance, always measure first"
- **Failure Prevention**: "Always validate input before database operations"
- **Domain Language**: "In this system, 'customer' means active subscribers only"

##### Memory Anti-Patterns to Avoid

- **Vague Guidelines**: "Write good code" (not actionable)
- **Personal Preferences**: "I like functional style" (not universal)
- **Outdated Context**: "Use jQuery for DOM manipulation" (may be obsolete)
- **Over-Generalization**: "Always use microservices" (ignores context)

##### Implementation Notes

1. **Incremental Updates**: Add insights gradually rather than in massive rewrites
2. **Evidence-Based**: Only memorize patterns with clear supporting evidence
3. **Context-Aware**: Consider project phase, team size, and constraints when curating
4. **Version Awareness**: Note when insights become obsolete due to technology changes
5. **Cross-Reference**: Link related concepts within CLAUDE.md for better navigation

##### Expected Outcomes

After effective memory consolidation:

- **Faster Problem Recognition**: The agent quickly identifies similar patterns
- **Better Solution Quality**: Leverages proven approaches from past successes
- **Fewer Repeated Mistakes**: Avoids anti-patterns that caused issues before
- **Domain Fluency**: Uses correct terminology and understands business context
- **Quality Consistency**: Applies learned quality standards automatically

## Usage

```bash
# Memorize from the most recent reflections and outputs
/reflexion:memorize

# Dry-run: show proposed bullets without writing to CLAUDE.md
/reflexion:memorize --dry-run

# Limit the number of bullets
/reflexion:memorize --max=5

# Target a specific section
/reflexion:memorize --section="Verification Checklist"

# Choose a source
/reflexion:memorize --source=last|selection|chat:
```

## Output

1) Short summary of additions (counts by section)
2) Confirmation that `CLAUDE.md` was created or updated

## Notes

- This command is the counterpart to `/reflexion:reflect`: reflect → curate → memorize.
- The design follows ACE to avoid brevity bias and context collapse by accumulating granular, organized knowledge over time (`https://arxiv.org/pdf/2510.04618`).
- Do not overwrite or compress existing context; only add high-signal bullets.

---

**Remember**: The goal is not to memorize everything, but to curate high-impact insights that consistently improve future agent performance. Quality over quantity - each memory should make future work measurably better.

diff --git a/commands/reflect.md b/commands/reflect.md
new file mode 100644
index 0000000..e11b0a1
--- /dev/null
+++ b/commands/reflect.md
@@ -0,0 +1,500 @@
---
description: Reflect on the previous response and output, based on a self-refinement framework for iterative improvement with complexity triage and verification
argument-hint: None required - automatically reviews recent work output
---

# Self-Refinement and Iterative Improvement Framework

Reflect on the previous response and output.
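The triage below can be read as a simple routing rule: route by task category first, then apply the confidence threshold that matches the chosen path. As a minimal sketch (the category labels and function name are illustrative only, not part of the command's interface), the routing might look like this:

```javascript
// Illustrative sketch of the triage routing described below.
// Category labels and thresholds mirror the Quick / Standard / Deep paths;
// nothing here is part of the actual command interface.
function triageTask(task) {
  const simple = ['single-file-edit', 'docs-update', 'simple-query', 'small-bugfix'];
  const critical = ['core-system-change', 'security', 'performance-critical', 'api-design'];

  if (critical.includes(task.kind)) {
    return { path: 'deep-reflection', minConfidence: 0.9 }; // full framework, confidence >90%
  }
  if (simple.includes(task.kind)) {
    return { path: 'quick', minConfidence: null }; // skip to Final Verification
  }
  return { path: 'standard', minConfidence: 0.7 }; // full framework, confidence >70%
}

// Example: a security-related change routes to the deep reflection path.
console.log(triageTask({ kind: 'security' })); // { path: 'deep-reflection', minConfidence: 0.9 }
```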
+ +## TASK COMPLEXITY TRIAGE + +First, categorize the task to apply appropriate reflection depth: + +### Quick Path (5-second check) + +For simple tasks like: + +- Single file edits +- Documentation updates +- Simple queries or explanations +- Straightforward bug fixes + +→ **Skip to "Final Verification" section** + +### Standard Path (Full reflection) + +For tasks involving: + +- Multiple file changes +- New feature implementation +- Architecture decisions +- Complex problem solving + +→ **Follow complete framework + require confidence >70%** + +### Deep Reflection Path + +For critical tasks: + +- Core system changes +- Security-related code +- Performance-critical sections +- API design decisions + +→ **Follow framework + require confidence >90%** + +## IMMEDIATE REFLECTION PROTOCOL + +### Step 1: Initial Assessment + +Before proceeding, evaluate your most recent output against these criteria: + +1. **Completeness Check** + - [ ] Does the solution fully address the user's request? + - [ ] Are all requirements explicitly mentioned by the user covered? + - [ ] Are there any implicit requirements that should be addressed? + +2. **Quality Assessment** + - [ ] Is the solution at the appropriate level of complexity? + - [ ] Could the approach be simplified without losing functionality? + - [ ] Are there obvious improvements that could be made? + +3. **Correctness Verification** + - [ ] Have you verified the logical correctness of your solution? + - [ ] Are there edge cases that haven't been considered? + - [ ] Could there be unintended side effects? + +4. **Fact-Checking Required** + - [ ] Have you made any claims about performance? (needs verification) + - [ ] Have you stated any technical facts? (needs source/verification) + - [ ] Have you referenced best practices? (needs validation) + - [ ] Have you made security assertions? (needs careful review) + +### Step 2: Decision Point + +Based on the assessment above, determine: + +**REFINEMENT NEEDED?** [YES/NO] + +If YES, proceed to Step 3. If NO, skip to Final Verification. + +### Step 3: Refinement Planning + +If improvement is needed, generate a specific plan: + +1. **Identify Issues** (List specific problems found) + - Issue 1: [Describe] + - Issue 2: [Describe] + - ... + +2. **Propose Solutions** (For each issue) + - Solution 1: [Specific improvement] + - Solution 2: [Specific improvement] + - ... + +3. **Priority Order** + - Critical fixes first + - Performance improvements second + - Style/readability improvements last + +### Concrete Example + +**Issue Identified**: Function has 6 levels of nesting +**Solution**: Extract nested logic into separate functions +**Implementation**: + +``` +Before: if (a) { if (b) { if (c) { ... } } } +After: if (!shouldProcess(a, b, c)) return; + processData(); +``` + +## CODE-SPECIFIC REFLECTION CRITERIA + +When the output involves code, additionally evaluate: + +### STOP: Library & Existing Solution Check + +**BEFORE PROCEEDING WITH CUSTOM CODE:** + +1. **Search for Existing Libraries** + - [ ] Have you searched npm/PyPI/Maven for existing solutions? + - [ ] Is this a common problem that others have already solved? + - [ ] Are you reinventing the wheel for utility functions? + + **Common areas to check:** + - Date/time manipulation → moment.js, date-fns, dayjs + - Form validation → joi, yup, zod + - HTTP requests → axios, fetch, got + - State management → Redux, MobX, Zustand + - Utility functions → lodash, ramda, underscore + +2. 
**Existing Service/Solution Evaluation** + - [ ] Could this be handled by an existing service/SaaS? + - [ ] Is there an open-source solution that fits? + - [ ] Would a third-party API be more maintainable? + + **Examples:** + - Authentication → Auth0, Supabase, Firebase Auth + - Email sending → SendGrid, Mailgun, AWS SES + - File storage → S3, Cloudinary, Firebase Storage + - Search → Elasticsearch, Algolia, MeiliSearch + - Queue/Jobs → Bull, RabbitMQ, AWS SQS + +3. **Decision Framework** + + ``` + IF common utility function → Use established library + ELSE IF complex domain-specific → Check for specialized libraries + ELSE IF infrastructure concern → Look for managed services + ELSE → Consider custom implementation + ``` + +4. **When Custom Code IS Justified** + - Specific business logic unique to your domain + - Performance-critical paths with special requirements + - When external dependencies would be overkill (e.g., lodash for one function) + - Security-sensitive code requiring full control + - When existing solutions don't meet requirements after evaluation + +### Real Examples of Library-First Approach + +**❌ BAD: Custom Implementation** + +```javascript +// utils/dateFormatter.js +function formatDate(date) { + const d = new Date(date); + return `${d.getMonth()+1}/${d.getDate()}/${d.getFullYear()}`; +} +``` + +**✅ GOOD: Use Existing Library** + +```javascript +import { format } from 'date-fns'; +const formatted = format(new Date(), 'MM/dd/yyyy'); +``` + +**❌ BAD: Generic Utilities Folder** + +``` +/src/utils/ + - helpers.js + - common.js + - shared.js +``` + +**✅ GOOD: Domain-Driven Structure** + +``` +/src/order/ + - domain/OrderCalculator.js + - infrastructure/OrderRepository.js +/src/user/ + - domain/UserValidator.js + - application/UserRegistrationService.js +``` + +### Common Anti-Patterns to Avoid + +1. **NIH (Not Invented Here) Syndrome** + - Building custom auth when Auth0/Supabase exists + - Writing custom state management instead of using Redux/Zustand + - Creating custom form validation instead of using Formik/React Hook Form + +2. **Poor Architectural Choices** + - Mixing business logic with UI components + - Database queries in controllers + - No clear separation of concerns + +3. **Generic Naming Anti-Patterns** + - `utils.js` with 50 unrelated functions + - `helpers/misc.js` as a dumping ground + - `common/shared.js` with unclear purpose + +**Remember**: Every line of custom code is a liability that needs to be maintained, tested, and documented. Use existing solutions whenever possible. + +### Architecture and Design + +1. **Clean Architecture & DDD Alignment** + - [ ] Does naming follow ubiquitous language of the domain? + - [ ] Are domain entities separated from infrastructure? + - [ ] Is business logic independent of frameworks? + - [ ] Are use cases clearly defined and isolated? + + **Naming Convention Check:** + - Avoid generic names: `utils`, `helpers`, `common`, `shared` + - Use domain-specific names: `OrderCalculator`, `UserAuthenticator` + - Follow bounded context naming: `Billing.InvoiceGenerator` + +2. **Design Patterns** + - Is the current design pattern appropriate? + - Could a different pattern simplify the solution? + - Are SOLID principles being followed? + +3. **Modularity** + - Can the code be broken into smaller, reusable functions? + - Are responsibilities properly separated? + - Is there unnecessary coupling between components? + - Does each module have a single, clear purpose? + +### Code Quality + +1. 
**Simplification Opportunities**
   - Can any complex logic be simplified?
   - Are there redundant operations?
   - Can loops be replaced with more elegant solutions?

2. **Performance Considerations**
   - Are there obvious performance bottlenecks?
   - Could algorithmic complexity be improved?
   - Are resources being used efficiently?
   - **IMPORTANT**: Any performance claims in comments must be verified

3. **Error Handling**
   - Are all potential errors properly handled?
   - Is error handling consistent throughout?
   - Are error messages informative?

### Testing and Validation

1. **Test Coverage**
   - Are all critical paths tested?
   - Missing edge cases to test:
     - Boundary conditions
     - Null/empty inputs
     - Large/extreme values
     - Concurrent access scenarios
   - Are tests meaningful and not just for coverage?

2. **Test Quality**
   - Are tests independent and isolated?
   - Do tests follow the AAA pattern (Arrange, Act, Assert)?
   - Are test names descriptive?

## FACT-CHECKING AND CLAIM VERIFICATION

### Claims Requiring Immediate Verification

1. **Performance Claims**
   - "This is X% faster" → Requires benchmarking
   - "This has O(n) complexity" → Requires analysis or proof
   - "This reduces memory usage" → Requires profiling

   **Verification Method**: Run actual benchmarks where possible, or provide an algorithmic analysis

2. **Technical Facts**
   - "This API supports..." → Check official documentation
   - "The framework requires..." → Verify with current docs
   - "This library version..." → Confirm version compatibility

   **Verification Method**: Cross-reference with official documentation

3. **Security Assertions**
   - "This is secure against..." → Requires security analysis
   - "This prevents injection..." → Needs proof/testing
   - "This follows OWASP..." → Verify against standards

   **Verification Method**: Reference security standards and test

4. **Best Practice Claims**
   - "It's best practice to..." → Cite an authoritative source
   - "Industry standard is..." → Provide a reference
   - "Most developers prefer..." → Need data/surveys

   **Verification Method**: Cite specific sources or standards

### Fact-Checking Checklist

- [ ] All performance claims have benchmarks or Big-O analysis
- [ ] Technical specifications match current documentation
- [ ] Security claims are backed by standards or testing
- [ ] Best practices are cited from authoritative sources
- [ ] Version numbers and compatibility are verified
- [ ] Statistical claims have sources or data

### Red Flags Requiring Double-Check

- Absolute statements ("always", "never", "only")
- Superlatives ("best", "fastest", "most secure")
- Specific numbers without context (percentages, metrics)
- Claims about third-party tools/libraries
- Historical or temporal claims ("recently", "nowadays")

### Concrete Example of Fact-Checking

**Claim Made**: "Using Map is 50% faster than using Object for this use case"

**Verification Process**:

1. Search for benchmarks or documentation comparing both approaches
2. Provide algorithmic analysis

**Corrected Statement**: "Map performs better for large collections (10K+ items), while Object is more efficient for small sets (<100 items)"

## NON-CODE OUTPUT REFLECTION

For documentation, explanations, and analysis outputs:

### Content Quality

1. **Clarity and Structure**
   - Is the information well-organized?
   - Are complex concepts explained simply?
   - Is there a logical flow of ideas?

2.
**Completeness** + - Are all aspects of the question addressed? + - Are examples provided where helpful? + - Are limitations or caveats mentioned? + +3. **Accuracy** + - Are technical details correct? + - Are claims verifiable? + - Are sources or reasoning provided? + +### Improvement Triggers for Non-Code + +- Ambiguous explanations +- Missing context or background +- Overly complex language for the audience +- Lack of concrete examples +- Unsubstantiated claims + +## ITERATIVE REFINEMENT WORKFLOW + +### Chain of Verification (CoV) + +1. **Generate**: Create initial solution +2. **Verify**: Check each component/claim +3. **Question**: What could go wrong? +4. **Re-answer**: Address identified issues + +### Tree of Thoughts (ToT) + +For complex problems, consider multiple approaches: + +1. **Branch 1**: Current approach + - Pros: [List advantages] + - Cons: [List disadvantages] + +2. **Branch 2**: Alternative approach + - Pros: [List advantages] + - Cons: [List disadvantages] + +3. **Decision**: Choose best path based on: + - Simplicity + - Maintainability + - Performance + - Extensibility + +## REFINEMENT TRIGGERS + +Automatically trigger refinement if any of these conditions are met: + +1. **Complexity Threshold** + - Cyclomatic complexity > 10 + - Nested depth > 3 levels + - Function length > 50 lines + +2. **Code Smells** + - Duplicate code blocks + - Long parameter lists (>4) + - God classes/functions + - Magic numbers/strings + - Generic utility folders (`utils/`, `helpers/`, `common/`) + - NIH syndrome indicators (custom implementations of standard solutions) + +3. **Missing Elements** + - No error handling + - No input validation + - No documentation for complex logic + - No tests for critical functionality + - No library search for common problems + - No consideration of existing services + +4. **Architecture Violations** + - Business logic in controllers/views + - Domain logic depending on infrastructure + - Unclear boundaries between contexts + - Generic naming instead of domain terms + +## FINAL VERIFICATION + +Before finalizing any output: + +### Self-Refine Checklist + +- [ ] Have I considered at least one alternative approach? +- [ ] Have I verified my assumptions? +- [ ] Is this the simplest correct solution? +- [ ] Would another developer easily understand this? +- [ ] Have I anticipated likely future requirements? +- [ ] Have all factual claims been verified or sourced? +- [ ] Are performance/security assertions backed by evidence? +- [ ] Did I search for existing libraries before writing custom code? +- [ ] Is the architecture aligned with Clean Architecture/DDD principles? +- [ ] Are names domain-specific rather than generic (utils/helpers)? + +### Reflexion Questions + +1. **What worked well in this solution?** +2. **What could be improved?** +3. **What would I do differently next time?** +4. **Are there patterns here that could be reused?** + +## IMPROVEMENT DIRECTIVE + +If after reflection you identify improvements: + +1. **STOP** current implementation +2. **SEARCH** for existing solutions before continuing + - Check package registries (npm, PyPI, etc.) + - Research existing services/APIs + - Review architectural patterns and libraries +3. **DOCUMENT** the improvements needed + - Why custom vs library? + - What architectural pattern fits? + - How does it align with Clean Architecture/DDD? +4. **IMPLEMENT** the refined solution +5. 
**RE-EVALUATE** using this framework again + +## CONFIDENCE ASSESSMENT + +Rate your confidence in the current solution: + +- [ ] High (>90%) - Solution is robust and well-tested +- [ ] Medium (70-90%) - Solution works but could be improved +- [ ] Low (<70%) - Significant improvements needed + +If confidence is not enough based on the TASK COMPLEXITY TRIAGE, iterate again. + +## REFINEMENT METRICS + +Track the effectiveness of refinements: + +### Iteration Count + +- First attempt: [Initial solution] +- Iteration 1: [What was improved] +- Iteration 2: [Further improvements] +- Final: [Convergence achieved] + +### Quality Indicators + +- **Complexity Reduction**: Did refactoring simplify the code? +- **Bug Prevention**: Were potential issues identified and fixed? +- **Performance Gain**: Was efficiency improved? +- **Readability Score**: Is the final version clearer? + +### Learning Points + +Document patterns for future use: + +- What type of issue was this? +- What solution pattern worked? +- Can this be reused elsewhere? + +--- + +**REMEMBER**: The goal is not perfection on the first try, but continuous improvement through structured reflection. Each iteration should bring the solution closer to optimal. diff --git a/plugin.lock.json b/plugin.lock.json new file mode 100644 index 0000000..d23e946 --- /dev/null +++ b/plugin.lock.json @@ -0,0 +1,53 @@ +{ + "$schema": "internal://schemas/plugin.lock.v1.json", + "pluginId": "gh:NeoLabHQ/context-engineering-kit:plugins/reflexion", + "normalized": { + "repo": null, + "ref": "refs/tags/v20251128.0", + "commit": "b9aef3a2782f1926173706f324dc539ed4d5a86d", + "treeHash": "5cca970e49e2491c8b5bd31e7c713376fe713cb863319ff403c5490a3ca3496c", + "generatedAt": "2025-11-28T10:12:09.688815Z", + "toolVersion": "publish_plugins.py@0.2.0" + }, + "origin": { + "remote": "git@github.com:zhongweili/42plugin-data.git", + "branch": "master", + "commit": "aa1497ed0949fd50e99e70d6324a29c5b34f9390", + "repoRoot": "/Users/zhongweili/projects/openmind/42plugin-data" + }, + "manifest": { + "name": "reflexion", + "description": "Collection of commands that force LLM to reflect on previous response and output. Based on papers like Self-Refine and Reflexion. These techniques improve the output of large language models by introducing feedback and refinement loops.", + "version": "1.0.0" + }, + "content": { + "files": [ + { + "path": "README.md", + "sha256": "35e53ebb1230cd06b7f28b4169c86d01c01fdd53d34bc375870df13957c41c7f" + }, + { + "path": ".claude-plugin/plugin.json", + "sha256": "ace85ada748d3c09734cfc8d848a0fb21521fbdc7212caab006a5cb7a663323b" + }, + { + "path": "commands/reflect.md", + "sha256": "e8fced10a97bf6e7c7a203bd9a5e92ca13c306ed07244edb6295ccaa34d54eb6" + }, + { + "path": "commands/critique.md", + "sha256": "637e99ae643865b817edf39d4d90bf69b3f033a2838506537629f2e7463b4605" + }, + { + "path": "commands/memorize.md", + "sha256": "e35b3005a7bb6cccb0535ad4719a2e6f8d4d5e7754161003425d08e39cba62f9" + } + ], + "dirSha256": "5cca970e49e2491c8b5bd31e7c713376fe713cb863319ff403c5490a3ca3496c" + }, + "security": { + "scannedAt": null, + "scannerVersion": null, + "flags": [] + } +} \ No newline at end of file