Initial commit

Zhongwei Li
2025-11-30 08:38:26 +08:00
commit 41d9f6b189
304 changed files with 98322 additions and 0 deletions

@@ -0,0 +1,253 @@
{
"criteria": [
{
"name": "Criteria Clarity",
"1": "Criteria vague or subjective (e.g., 'good work', 'shows effort'), no definitions, overlapping dimensions",
"3": "Criteria defined but some ambiguity, mostly distinct dimensions, some examples provided",
"5": "Criteria crystal clear with precise definitions, completely distinct non-overlapping dimensions, explicit boundaries (what is/isn't included), examples for each criterion"
},
{
"name": "Scale Appropriateness",
"1": "Scale granularity mismatched to context (10-point scale for subjective judgment or 3-point for fine distinctions), inconsistent level labels",
"3": "Scale granularity reasonable, levels labeled consistently, appropriate for most criteria",
"5": "Scale granularity perfectly matched to observable differences and evaluator expertise, level labels clear and consistent (numeric + qualitative), forced-choice or neutral middle justified by context"
},
{
"name": "Descriptor Specificity",
"1": "Descriptors use subjective language ('excellent', 'creative', 'professional'), no observable features, comparative only ('better than', 'more')",
"3": "Descriptors mostly observable, some quantification (numbers, counts), some comparative language, parallel structure attempted",
"5": "Descriptors 100% observable and measurable (could two reviewers score consistently), quantified where possible (specific numbers, percentages), parallel structure across levels (same aspects at each level), concrete examples or anchors provided"
},
{
"name": "Observability",
"1": "Criteria require mind-reading or assumptions about process ('worked hard', 'creative thinking'), no evidence trail",
"3": "Most criteria observable from artifact, some behavioral indicators, evidence trail for key criteria",
"5": "All criteria directly observable from artifact or documented process, behavioral indicators specified, clear evidence trail (where to look, what counts), two reviewers could independently verify"
},
{
"name": "Inter-Rater Reliability Plan",
"1": "No calibration plan, no IRR measurement, assumes reviewers will 'just know', no anchors",
"3": "Basic calibration mentioned, some anchors or examples, IRR measurement method identified",
"5": "Comprehensive calibration plan (pre/during/post steps), specific IRR target (e.g., Kappa ≥0.70), anchor examples at each level for each criterion, ongoing calibration schedule (quarterly), discrepancy resolution protocol"
},
{
"name": "Comprehensiveness",
"1": "Missing critical quality dimensions, <3 criteria (too sparse) or >12 criteria (too complex), no coverage of must-haves",
"3": "Covers main quality dimensions, 4-8 criteria, may miss some edge cases or secondary aspects",
"5": "Comprehensive coverage of all important quality dimensions (product, process, impact as relevant), 4-8 criteria (balanced coverage vs. usability), addresses must-haves and quality gradations, no hidden expectations"
},
{
"name": "Actionability",
"1": "Descriptors don't guide improvement (says 'poor' but not what's wrong), no feedback mechanism, evaluatees don't see rubric until scored",
"3": "Descriptors somewhat actionable, feedback template exists, rubric shared before evaluation",
"5": "Descriptors explicitly actionable (clear what to change to improve level), feedback template tied to criteria with strengths/improvements, rubric shared upfront so evaluatees can self-assess, examples show what 'good' looks like"
},
{
"name": "Weighting Justification",
"1": "All criteria weighted equally despite different importance, or weights arbitrary (no justification), critical criteria not flagged",
"3": "Some criteria weighted or flagged as critical, basic justification provided, threshold mentioned",
"5": "Weighting system explicit and justified (multiplicative or percentage), critical criteria have thresholds (must score ≥X to pass), compensatory vs. non-compensatory trade-offs acknowledged, scoring calculation clear"
},
{
"name": "Bias Mitigation",
"1": "No acknowledgment of potential biases (halo, leniency, central tendency, anchoring), no mitigation strategies",
"3": "Bias types mentioned, some mitigation (e.g., randomize order, blind scoring), training mentioned",
"5": "Comprehensive bias mitigation: Halo (vertical scoring, blind scoring), central tendency (even-number scale or anchors), leniency/severity (calibration, normalization), order effects (randomization), explicit reviewer training, audit plan for detecting bias"
},
{
"name": "Usability",
"1": "Rubric overly complex (takes >30 min to score one item), no guidance for reviewers, format hard to use (wall of text)",
"3": "Reasonable time to use (<15 min per item), basic reviewer guidance, clear format (table or structured)",
"5": "Efficient to use (target time specified and achievable, <10 min for simple rubrics), comprehensive reviewer guidance (instructions, training materials, FAQs), format optimized for use (table, clear layout, easy to reference), accessible to both evaluators and evaluatees"
}
],
"guidance_by_type": {
"Analytic Rubric": {
"target_score": 4.2,
"key_requirements": [
"Descriptor Specificity (score ≥5): Each criterion × level cell has observable descriptor, parallel structure across levels",
"Comprehensiveness (≥4): 4-8 criteria covering key quality dimensions without overlap",
"Observability (≥5): All criteria measurable from artifact, two reviewers could score consistently",
"Inter-Rater Reliability Plan (≥4): Calibration sessions, anchors, IRR measurement (Kappa ≥0.70 target)"
],
"common_pitfalls": [
"Too many criteria (>10) → time-consuming, overwhelming",
"Overlapping criteria ('Clarity' and 'Organization' conflated)",
"Descriptors use comparative language only ('better than Level 3') without absolute description"
]
},
"Holistic Rubric": {
"target_score": 3.8,
"key_requirements": [
"Descriptor Specificity (≥4): Each level integrates multiple criteria, clear gestalt description, concrete examples",
"Comprehensiveness (≥3): All important quality aspects mentioned in descriptors (even if not separate criteria)",
"Observability (≥4): Overall judgment observable, descriptors reference concrete features",
"Inter-Rater Reliability Plan (≥4): Critical for holistic (lower IRR expected), extensive calibration, many anchors"
],
"common_pitfalls": [
"Descriptors too vague ('excellent overall quality') without specifics",
"No examples or anchors (reviewers have widely different standards)",
"Lower IRR than analytic (expect Kappa 0.60-0.70, not 0.80+)"
]
},
"Single-Point Rubric": {
"target_score": 3.7,
"key_requirements": [
"Descriptor Specificity (≥4): 'Meets standard' descriptor crystal clear, observable, quantified",
"Comprehensiveness (≥4): All critical quality dimensions listed as criteria",
"Actionability (≥5): Strengths/concerns space encourages specific feedback, not just checkmarks",
"Usability (≥5): Fast to use, less intimidating than analytic, encourages dialogue"
],
"common_pitfalls": [
"'Meets standard' too vague (what exactly is the standard?)",
"Used as checklist (just check yes/no) rather than noting specific strengths/concerns",
"No guidance for what 'exceeds' or 'below' means (reviewers inconsistent)"
]
},
"Checklist": {
"target_score": 3.5,
"key_requirements": [
"Descriptor Specificity (≥5): Each item binary, observable, verifiable (yes/no clear)",
"Comprehensiveness (≥5): All must-haves listed, nothing critical missing",
"Observability (≥5): 100% verifiable (can literally check off each item)",
"Usability (≥5): Fast to use, unambiguous, minimal judgment required"
],
"common_pitfalls": [
"Items require judgment ('code is clean') → not truly binary",
"Missing critical items (assumes 'everyone knows' but not documented)",
"Used alone for quality assessment (checklists ensure minimums, don't capture quality gradations)"
]
},
"Standards-Based Rubric": {
"target_score": 4.0,
"key_requirements": [
"Criteria Clarity (≥5): Criteria explicitly tied to learning objectives/competencies/standards",
"Descriptor Specificity (≥5): Levels represent mastery progression (Novice/Competent/Expert with clear differences)",
"Comprehensiveness (≥5): All relevant standards/competencies covered, none missing",
"Actionability (≥5): Descriptors show developmental path, clear how to progress from one level to next"
],
"common_pitfalls": [
"Standards not clearly defined (rubric references 'Standard 3.2' but doesn't explain what it is)",
"Levels don't represent true developmental progression (arbitrary distinctions)",
"Rubric divorced from instruction (students never taught what's in rubric)"
]
}
},
"guidance_by_complexity": {
"Simple Rubric": {
"target_score": 3.5,
"description": "3-5 criteria, 3-4 scale levels, straightforward domain, single evaluator or small team",
"key_requirements": [
"Criteria Clarity (≥3): Criteria defined, mostly distinct, examples for key criteria",
"Descriptor Specificity (≥3): Observable language, some quantification, basic parallel structure",
"Observability (≥3): Criteria observable from artifact, reasonable agreement expected",
"Usability (≥4): Fast to create and use, minimal training needed, <5 min to score"
],
"time_estimate": "2-4 hours to develop, 1 hour calibration",
"examples": [
"Internal code review (3 criteria: Correctness, Readability, Tests)",
"Student homework (4 criteria: Completeness, Accuracy, Clarity, Timeliness)",
"Design critique (3 criteria: Visual hierarchy, Consistency, Accessibility basics)"
]
},
"Standard Rubric": {
"target_score": 4.0,
"description": "5-7 criteria, 4-5 scale levels, moderate complexity, multiple evaluators, some stakes",
"key_requirements": [
"Criteria Clarity (≥4): Precise definitions, distinct dimensions, boundaries explicit, examples for all criteria",
"Descriptor Specificity (≥4): Observable and quantified, parallel structure, concrete examples at each level",
"Inter-Rater Reliability Plan (≥4): Calibration sessions (3-5 samples), IRR measurement (Kappa ≥0.70), anchors at all levels",
"Bias Mitigation (≥3): Acknowledge key biases (halo, central tendency), basic mitigation (randomize, calibration)",
"Actionability (≥4): Clear feedback mechanism, rubric shared upfront, descriptors guide improvement"
],
"time_estimate": "6-10 hours to develop, 2-3 calibration sessions",
"examples": [
"Essay grading (6 criteria: Argument, Evidence, Organization, Clarity, Mechanics, Originality)",
"Product launch review (5 criteria: User value, Technical quality, Market fit, Risk mitigation, Metrics)",
"Vendor selection (7 criteria: Functionality, Cost, Support, Integration, Scalability, Security, Track record)"
]
},
"Complex Rubric": {
"target_score": 4.3,
"description": "6-10 criteria, 5-10 scale levels, high complexity/novelty, many evaluators, high stakes, need for consistency and defensibility",
"key_requirements": [
"Criteria Clarity (≥5): Crystal clear definitions, completely distinct, explicit boundaries, comprehensive examples",
"Descriptor Specificity (≥5): 100% observable/measurable, fully quantified, perfect parallel structure, anchors at all levels",
"Observability (≥5): All criteria independently verifiable, evidence trail documented, IRR target >80%",
"Inter-Rater Reliability Plan (≥5): Extensive calibration (5+ sessions), IRR measurement (Kappa or ICC), ongoing calibration schedule (quarterly), discrepancy protocol, anchor library",
"Weighting Justification (≥5): Explicit weighting or thresholds, justified by context, compensatory vs. non-compensatory clear",
"Bias Mitigation (≥5): Comprehensive mitigation for all bias types, reviewer training program, audit plan, normalization procedures",
"Actionability (≥5): Detailed feedback template, rubric shapes instruction/preparation, multiple examples of work at each level"
],
"time_estimate": "15-25 hours to develop, 5-8 calibration sessions, ongoing maintenance",
"examples": [
"Grant proposal review (10 criteria across significance, innovation, approach, team, environment)",
"Hiring rubric (8 criteria: Technical skills, Problem-solving, Communication, Culture fit, Leadership, Growth mindset, Domain expertise, References)",
"Clinical competency assessment (9 criteria across knowledge, skills, attitudes, professionalism)",
"Algorithmic fairness audit rubric (7 criteria: Accuracy, Disparate impact, Equalized odds, Calibration, Explainability, Recourse, Monitoring)"
]
}
},
"common_failure_modes": [
{
"failure": "Subjective criteria without operationalization",
"symptom": "Criteria like 'creativity', 'professionalism', 'good attitude', 'shows effort' without observable indicators",
"detection": "Ask 'Could two reviewers score this consistently without discussing?' If no → subjective",
"fix": "Define observable behaviors: 'Creativity = uses 2+ techniques not taught, novel combination'. Test with calibration samples."
},
{
"failure": "Overlapping criteria inflating scores",
"symptom": "Criteria like 'Clarity' and 'Organization' or 'Quality' and 'Professionalism' that measure same underlying dimension",
"detection": "High correlation between criteria scores (always move together), difficulty explaining difference between criteria",
"fix": "Define explicit boundaries ('Clarity = language. Organization = structure.'), combine overlapping criteria, or split into distinct fine-grained criteria"
},
{
"failure": "Descriptors use only comparative language",
"symptom": "Level 4 described as 'better than Level 3', 'more sophisticated than Level 2', without absolute description of what Level 4 IS",
"detection": "Read descriptor for Level 4 alone (without seeing other levels). Is it clear what constitutes Level 4? If no → comparative only.",
"fix": "Write absolute descriptors: 'Level 4 = Zero bugs, meets all 5 requirements, performance <100ms'. Each level stands alone."
},
{
"failure": "Scale granularity mismatched to observable differences",
"symptom": "10-point scale for subjective judgment (reviewers can't distinguish 7 vs 8), or 3-point scale for objective dimensions with clear gradations",
"detection": "Low IRR (reviewers disagree), or reviewers never use parts of scale (everyone scores 6-8 on 10-point scale)",
"fix": "Match granularity to real observable differences. If can only distinguish 'poor/adequate/good', use 3-point. If 5 clear levels, use 5-point. Test with calibration."
},
{
"failure": "No parallel structure across levels",
"symptom": "Level 5 mentions A, B, C. Level 3 mentions D, E. Level 1 mentions F. Can't compare what changes between levels.",
"detection": "Try to explain what someone must improve to go from Level 3 → Level 4. If unclear → no parallel structure.",
"fix": "Create table with dimensions (columns) and levels (rows). Ensure each dimension addressed at each level. E.g., 'Variable names | Comments | Complexity' assessed at all 5 levels."
},
{
"failure": "Hidden expectations not in rubric",
"symptom": "Reviewers penalize for things not mentioned in rubric (e.g., rubric doesn't mention formatting but reviewer scores down for poor formatting)",
"detection": "Compare rubric criteria to actual feedback given. Feedback mentions dimensions not in rubric → hidden expectations.",
"fix": "Make all expectations explicit. If it matters enough to penalize, include it. If not in rubric, don't penalize (can suggest, but doesn't affect score)."
},
{
"failure": "No calibration or IRR measurement",
"symptom": "Rubric deployed without testing if reviewers score consistently, no anchor examples, no calibration sessions, 'we trust our reviewers'",
"detection": "Ask 'What's the Kappa or ICC?' If answer is blank stare → no IRR measurement.",
"fix": "Before full deployment: Select 3-5 samples, have all reviewers score independently, calculate IRR (Kappa, ICC), discuss discrepancies, refine rubric, re-test. Target: Kappa ≥0.70 or ICC ≥0.75."
},
{
"failure": "Central tendency bias (everyone scores 3/5)",
"symptom": "Distribution of scores heavily clustered around middle (80% of scores are 3 on 1-5 scale), extremes (1 or 5) almost never used",
"detection": "Plot score distribution. If normal curve centered on middle with narrow spread → central tendency bias.",
"fix": "Even-number scale (1-4, no middle), anchor examples at extremes (show what 1 and 5 look like), forced distribution (controversial), calibration sessions where reviewers practice using full range."
},
{
"failure": "Weighting doesn't reflect importance",
"symptom": "All criteria weighted equally (or no weights) despite some being critical (Security) and others nice-to-have (Code style), or high Style score can compensate for low Security",
"detection": "Ask 'If Security=1 but all other criteria=5, should this pass?' If no, but rubric allows it → weighting problem.",
"fix": "Explicitly weight critical criteria (Security ×3, Style ×1) OR use thresholds (must score ≥4 on Security to pass, regardless of other scores). Document rationale."
},
{
"failure": "Rubric not shared with evaluatees upfront",
"symptom": "Rubric used only by reviewers, evaluatees see rubric for first time when scored, can't self-assess or prepare",
"detection": "Ask evaluatees 'Did you see the rubric before submitting work?' If no → transparency problem.",
"fix": "Share rubric when assignment/project given. Rubric serves as guide and quality standard, not just grading tool. Provide anchor examples so people know what 'good' looks like."
}
]
}

@@ -0,0 +1,365 @@
# Evaluation Rubrics Methodology
Comprehensive guidance on scale design, descriptor writing, calibration, bias mitigation, and advanced rubric design techniques.
## Workflow
```
Rubric Development Progress:
- [ ] Step 1: Define purpose and scope
- [ ] Step 2: Identify evaluation criteria
- [ ] Step 3: Design the scale
- [ ] Step 4: Write performance descriptors
- [ ] Step 5: Test and calibrate
- [ ] Step 6: Use and iterate
```
**Step 1: Define purpose and scope** → See [resources/template.md](template.md#purpose-definition-template)
**Step 2: Identify evaluation criteria** → See [resources/template.md](template.md#criteria-identification-template)
**Step 3: Design the scale** → See [1. Scale Design Principles](#1-scale-design-principles)
**Step 4: Write performance descriptors** → See [2. Descriptor Writing Techniques](#2-descriptor-writing-techniques)
**Step 5: Test and calibrate** → See [3. Calibration Techniques](#3-calibration-techniques)
**Step 6: Use and iterate** → See [4. Bias Mitigation](#4-bias-mitigation) and [6. Common Pitfalls](#6-common-pitfalls)
---
## 1. Scale Design Principles
### Choosing Appropriate Granularity
**The granularity dilemma**: Too few levels (1-3) miss meaningful distinctions; too many levels (1-10) create false precision and inconsistency.
| Factor | Favors Fewer Levels (1-3, 1-4) | Favors More Levels (1-5, 1-10) |
|--------|--------------------------------|--------------------------------|
| Evaluator expertise | Novice reviewers, unfamiliar domain | Expert reviewers, deep domain knowledge |
| Observable differences | Hard to distinguish subtle differences | Clear gradations exist |
| Stakes | High-stakes binary decisions (pass/fail) | Developmental feedback, rankings |
| Sample size | Small samples (< 20 items) | Large samples (100+, statistical analysis) |
| Time available | Quick screening, time pressure | Detailed assessment, ample time |
| Consistency priority | Inter-rater reliability critical | Differentiation more important |
**Scale characteristics** (See SKILL.md Quick Reference for detailed comparison):
- **1-3**: Fast, coarse, high reliability. Use for quick screening.
- **1-4**: Forces choice (no middle), avoids central tendency. Use when bias observed.
- **1-5**: Most common, allows neutral, good balance. General purpose.
- **1-10**: Fine gradations, statistical analysis. Use for large samples (100+), expert reviewers.
- **Qualitative** (Novice/Proficient/Expert): Intuitive for skills, growth-oriented. Educational contexts.
### Central Tendency and Response Biases
**Central tendency bias**: Reviewers avoid extremes, cluster around middle (most get 3/5).
**Causes**: Uncertainty, social pressure, lack of calibration.
**Mitigations**:
1. **Even-number scales** (1-4, 1-6) force choice above/below standard
2. **Anchor examples** at each level (what does 1 vs 5 look like?)
3. **Calibration sessions** where reviewers score same work, discuss discrepancies
4. **Forced distributions** (controversial): Require X% in each category. Use sparingly.
**Other response biases**:
- **Halo effect**: Overall impression biases individual criteria scores.
  - **Mitigation**: Vertical scoring (all work on Criterion 1, then Criterion 2), blind scoring.
- **Leniency/severity bias**: Reviewer consistently scores higher/lower than others.
  - **Mitigation**: Calibration sessions, normalization across reviewers.
- **Range restriction**: Reviewer uses only part of scale (always 3-4, never 1-2 or 5).
  - **Mitigation**: Anchor examples at extremes, forced distribution (cautiously).
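Central tendency and range restriction show up directly in the raw score distributions. A minimal sketch of a detection pass (Python; the reviewer names, scores, and flag thresholds are illustrative assumptions, not standards from this guide):
```python
from statistics import mean, stdev

def distribution_report(scores_by_reviewer, scale_min=1, scale_max=5):
    """Summarize each reviewer's score distribution and flag suspicious patterns."""
    middle = (scale_min + scale_max) / 2
    report = {}
    for reviewer, scores in scores_by_reviewer.items():
        pct_middle = sum(s == middle for s in scores) / len(scores)
        report[reviewer] = {
            "mean": round(mean(scores), 2),
            "spread": round(stdev(scores), 2) if len(set(scores)) > 1 else 0.0,
            "range_used": (min(scores), max(scores)),
            # Heuristic flags (thresholds are assumptions for illustration):
            "flag_central_tendency": pct_middle > 0.6,
            "flag_range_restriction": max(scores) - min(scores) <= 1,
        }
    return report

print(distribution_report({"Reviewer A": [3, 3, 4, 3, 3, 2, 3],
                           "Reviewer B": [1, 5, 4, 2, 3, 5, 2]}))
```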
### Numeric vs. Qualitative Scales
**Numeric** (1-5, 1-10): Easy to aggregate, quantitative comparison, ranking. Numbers feel precise but may be arbitrary.
**Qualitative** (Novice/Proficient/Expert, Below/Meets/Exceeds): Intuitive labels, less false precision. Harder to aggregate, ordinal only.
**Hybrid approach** (best of both): Numeric with labels (1=Poor, 2=Fair, 3=Adequate, 4=Good, 5=Excellent). Labels anchor meaning, numbers enable analysis.
**Unipolar vs. Bipolar**:
- **Unipolar**: 1 (None) → 5 (Maximum). Measures amount or quality. **Use for rubrics.**
- **Bipolar**: 1 (Strongly Disagree) → 5 (Strongly Agree), 3=Neutral. Measures agreement.
---
## 2. Descriptor Writing Techniques
### Observable, Measurable Language
**Core principle**: Two independent reviewers should score the same work consistently based on descriptors alone.
| ❌ Subjective (Avoid) | ✓ Observable (Use) |
|----------------------|-------------------|
| "Shows effort" | "Submitted 3 drafts, incorporated 80%+ of feedback" |
| "Creative" | "Uses 2+ techniques not taught, novel combination of concepts" |
| "Professional quality" | "Zero typos, consistent formatting, APA citations correct" |
| "Good understanding" | "Correctly applies 4/5 key concepts, explains mechanisms" |
| "Needs improvement" | "Contains 5+ bugs, missing 2 required features, <100ms target" |
**Test for observability**: Could two reviewers count/measure this? (Yes → observable). Does this require mind-reading? (Yes → subjective).
**Techniques**:
1. **Quantification**: "All 5 requirements met" vs. "Most requirements met"
2. **Explicit features**: "Includes abstract, intro, methods, results, discussion" vs. "Complete structure"
3. **Behavioral indicators**: "Asks clarifying questions, proposes alternatives" vs. "Critical thinking"
4. **Comparison to standards**: "WCAG AA compliant" vs. "Accessible"
### Parallel Structure Across Levels
**Parallel structure**: Each level addresses the same aspects, making differences clear.
**Example: Code Review, "Readability" criterion**
| Level | Variable Names | Comments/Docs | Code Complexity |
|-------|---------------|---------------|-----------------|
| **5** | Descriptive, domain-appropriate | Comprehensive docs, all functions commented | Simple, DRY, single responsibility |
| **3** | Mostly clear, some abbreviations | Key functions documented, some comments | Moderate complexity, some duplication |
| **1** | Cryptic abbreviations, unclear | No documentation, no comments | Highly complex, nested logic, duplication |
**Benefits**: Easy comparison (what changes 3→5?), diagnostic (pinpoint weakness), fair (same dimensions).
### Examples and Anchors at Each Level
**Anchor**: Concrete example of work at a specific level, calibrates reviewers.
**Types**:
1. **Exemplar work samples**: Actual submissions scored at each level (authentic, requires permission)
2. **Synthetic examples**: Crafted to demonstrate each level (controlled, no permission needed)
3. **Annotated excerpts**: Sections highlighting what merits that score (focused, may miss holistic quality)
**Best practices**:
- Anchor at extremes and middle (minimum: 1, 3, 5)
- Diversity of anchors (different ways to achieve a level)
- Update anchors as rubric evolves
- Make accessible to evaluators and evaluatees
### Avoiding Hidden Expectations
**Hidden expectation**: Quality dimension reviewers penalize but isn't in rubric.
**Example**: Rubric has "Technical Accuracy", "Clarity", "Practical Value". Reviewer scores down for "poor visual design" (not a criterion). **Problem**: Evaluatee had no way to know design mattered.
**Mitigation**:
1. **Comprehensive criteria**: If it matters, include it. If not in rubric, don't penalize.
2. **Criterion definitions**: Explicitly state what is/isn't included.
3. **Feedback constraints**: Suggestions outside rubric don't affect score.
4. **Rubric review**: Ask evaluatees what's missing, update accordingly.
---
## 3. Calibration Techniques
### Inter-Rater Reliability Measurement
**Inter-rater reliability (IRR)**: Degree to which independent reviewers give consistent scores.
**Target IRR thresholds**:
- <50%: Unreliable, major revision needed
- 50-70%: Marginal, refine descriptors, more calibration
- 70-85%: Good, acceptable for most uses
- >85%: Excellent, highly reliable
**Measurement methods**:
**1. Percent Agreement**
- **Calculation**: (# items where reviewers agree exactly) / (total items)
- **Pros**: Simple, intuitive. **Cons**: Inflated by chance agreement.
- **Variant: Within-1 agreement**: Scores within 1 point count as agree. Target: ≥80%.
**2. Cohen's Kappa (κ)**
- **Calculation**: (Observed agreement - Expected by chance) / (1 - Expected by chance)
- **Range**: -1 to 1 (0=chance, 1=perfect agreement)
- **Interpretation**: <0.20 Poor, 0.21-0.40 Fair, 0.41-0.60 Moderate, 0.61-0.80 Substantial, 0.81-1.00 Almost perfect
- **Pros**: Corrects for chance. **Cons**: Only 2 raters, affected by prevalence.
**3. Intraclass Correlation Coefficient (ICC)**
- **Use when**: More than 2 raters, continuous scores
- **Range**: 0 to 1. **Interpretation**: <0.50 Poor, 0.50-0.75 Moderate, 0.75-0.90 Good, >0.90 Excellent
- **Pros**: Handles multiple raters, gold standard. **Cons**: Requires statistical software.
**4. Krippendorff's Alpha**
- **Use when**: Multiple raters, missing data, various data types
- **Range**: ≤1 (0 = chance-level agreement; can be negative). **Interpretation**: α≥0.80 acceptable, ≥0.67 tentatively acceptable
- **Pros**: Most flexible, robust to missing data. **Cons**: Less familiar.
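Percent agreement, the within-1 variant, and Cohen's kappa can be computed by hand; ICC and Krippendorff's alpha are usually left to a statistics package. A minimal sketch (Python; the two rating lists are invented for illustration):
```python
from collections import Counter

def percent_agreement(r1, r2):
    """Exact agreement: share of items where both reviewers give the same score."""
    return sum(a == b for a, b in zip(r1, r2)) / len(r1)

def within_one_agreement(r1, r2):
    """Within-1 variant: scores differing by at most 1 point count as agreement."""
    return sum(abs(a - b) <= 1 for a, b in zip(r1, r2)) / len(r1)

def cohens_kappa(r1, r2):
    """Cohen's kappa for two raters: (observed - chance) / (1 - chance)."""
    n = len(r1)
    p_o = percent_agreement(r1, r2)
    c1, c2 = Counter(r1), Counter(r2)
    p_e = sum((c1[k] / n) * (c2[k] / n) for k in set(r1) | set(r2))
    return (p_o - p_e) / (1 - p_e) if p_e < 1 else 1.0

rater_1 = [4, 3, 5, 2, 4, 3, 4, 5]
rater_2 = [4, 3, 4, 2, 5, 3, 4, 4]
print(percent_agreement(rater_1, rater_2))          # 0.625
print(within_one_agreement(rater_1, rater_2))       # 1.0
print(round(cohens_kappa(rater_1, rater_2), 2))     # ~0.47 (moderate)
```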
### Calibration Session Design
**Pre-calibration**:
1. **Select 3-5 samples** spanning quality range (low, medium, high, edge cases)
2. **Independent scoring**: Each reviewer scores all samples alone, no discussion
3. **Calculate IRR**: Baseline reliability (percent agreement, Kappa)
**During calibration**:
4. **Discuss discrepancies** (focus on differences >1 point): "I scored Sample 1 as 4 because... What led you to 3?"
5. **Identify ambiguities**: Descriptor unclear? Criterion boundaries fuzzy? Missing cases?
6. **Refine rubric**: Clarify descriptors (add specificity, numbers, examples), add anchors, revise criteria
7. **Re-score**: Independently re-score same samples using refined rubric
**Post-calibration**:
8. **Calculate final IRR**: If ≥70%, proceed. If <70%, iterate (more refinement + re-calibration).
9. **Document**: Date, participants, samples, IRR metrics (before/after), rubric changes, scoring decisions
10. **Schedule ongoing calibration**: Monthly or quarterly check-ins (prevents rubric drift)
### Resolving Discrepancies
**When reviewers disagree**:
- **Option 1: Discussion to consensus**: Reviewers discuss, agree on final score. Ensures consistency but time-consuming.
- **Option 2: Averaged scores**: Mean of reviewers' scores. Fast but can mask disagreement (a 4 and a 2 average to an unremarkable 3).
- **Option 3: Third reviewer**: If A and B differ by >1, C scores as tie-breaker. Resolves impasse but requires extra reviewer.
- **Option 4: Escalation**: Discrepancies >1 escalated to lead reviewer or committee. Quality control but bottleneck.
**Recommended**: Average for small discrepancies (1 point), discussion for large (2+ points), escalate if unresolved.
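The recommended policy fits in a small helper; a sketch (Python; the function name and return shape are illustrative):
```python
def resolve(score_a, score_b):
    """Average 1-point gaps; flag 2+ point gaps for discussion (escalate if unresolved)."""
    if abs(score_a - score_b) <= 1:
        return {"final": (score_a + score_b) / 2, "action": "averaged"}
    return {"final": None, "action": "discuss; escalate if unresolved"}

print(resolve(4, 3))  # {'final': 3.5, 'action': 'averaged'}
print(resolve(5, 2))  # {'final': None, 'action': 'discuss; escalate if unresolved'}
```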
---
## 4. Bias Mitigation
### Halo Effect
**Halo effect**: Overall impression biases individual criterion scores. "Excellent work" → all criteria high, or "poor work" → all low.
**Example**: Code has excellent documentation (5/5) but poor performance (should be 2/5). Halo: Reviewer scores performance 4/5 due to overall positive impression.
**Mitigation**:
1. **Vertical scoring**: Score all submissions on Criterion 1, then all on Criterion 2 (focus on one criterion at a time)
2. **Blind scoring**: Reviewers don't see previous scores when scoring new criterion
3. **Separate passes**: First pass for overall sense (don't score), second pass to score each criterion
4. **Criterion definitions**: Clear, narrow definitions reduce bleed-over
### Anchoring and Order Effects
**Anchoring**: First information biases subsequent judgments. First essay scored 5/5 → second (objectively 4/5) feels worse → scored 3/5.
**Mitigation**:
1. **Randomize order**: Review in random order, not alphabetical or submission time
2. **Calibration anchors**: Review rubric and anchors before scoring (resets mental baseline)
3. **Batch scoring**: Score all on one criterion at once (easier to compare)
**Order effects**: Position in sequence affects score (first/last reviewed scored differently).
**Mitigation**: Multiple reviewers score in different random orders (order effect averages out).
### Leniency and Severity Bias
**Leniency**: Reviewer consistently scores higher than others (generous). **Severity**: Consistently scores lower (harsh).
**Detection**: Calculate mean score per reviewer. If Reviewer A averages 4.2 and Reviewer B averages 2.8 on same work → bias present.
**Mitigation**:
1. **Calibration sessions**: Show reviewers their bias, discuss differences
2. **Normalization** (controversial): Convert to z-scores (adjust for reviewer's mean). Changes scores, may feel unfair.
3. **Multiple reviewers**: Average scores (bias cancels out)
4. **Threshold-based**: Focus on "meets standard" (yes/no) vs numeric score
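Detection and the (controversial) z-score normalization take only a few lines; a sketch (Python; the reviewer data are invented):
```python
from statistics import mean, stdev

def leniency_report(scores_by_reviewer):
    """Compare each reviewer's mean to the grand mean; a positive offset suggests leniency."""
    all_scores = [s for scores in scores_by_reviewer.values() for s in scores]
    grand_mean = mean(all_scores)
    report = {}
    for reviewer, scores in scores_by_reviewer.items():
        m = mean(scores)
        sd = stdev(scores) if len(set(scores)) > 1 else 1.0  # avoid division by zero
        report[reviewer] = {
            "mean": round(m, 2),
            "offset": round(m - grand_mean, 2),  # >0 lenient, <0 severe
            "z_scores": [round((s - m) / sd, 2) for s in scores],  # normalized (controversial)
        }
    return report

print(leniency_report({"A": [4, 5, 4, 4], "B": [3, 2, 3, 3]}))
```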
---
## 5. Advanced Rubric Design
### Weighted Criteria
**Weighting approaches**:
**1. Multiplicative weights**:
- Score × weight, sum weighted scores, divide by sum of weights
- Example: Security (4×3=12), Performance (3×2=6), Style (5×1=5). Total: 23/6 = 3.83
**2. Percentage weights**:
- Assign % to each criterion (sum to 100%)
- Example: Security 4×50%=2.0, Performance 3×30%=0.9, Style 5×20%=1.0. Total: 3.9/5.0
**When to weight**: Criteria have different importance, regulatory/compliance criteria, developmental priorities.
**Cautions**: Adds complexity, can obscure deficiencies (low critical score hidden in average). Alternative: Threshold scoring.
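Both weighting schemes reduce to a couple of lines. A sketch (Python) reproducing the two worked examples above:
```python
def multiplicative_weighted(scores, weights):
    """Sum of score x weight, divided by the sum of weights."""
    return sum(scores[c] * weights[c] for c in scores) / sum(weights.values())

def percentage_weighted(scores, pct):
    """Sum of score x percentage weight (percentages should total 1.0)."""
    return sum(scores[c] * pct[c] for c in scores)

scores = {"Security": 4, "Performance": 3, "Style": 5}
print(round(multiplicative_weighted(scores, {"Security": 3, "Performance": 2, "Style": 1}), 2))  # 3.83
print(round(percentage_weighted(scores, {"Security": 0.5, "Performance": 0.3, "Style": 0.2}), 2))  # 3.9
```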
### Threshold Scoring
**Threshold**: Minimum score required on specific criteria regardless of overall average.
**Example**:
- Overall average ≥3.0 to pass
- **AND** Security ≥4.0 (critical threshold)
- **AND** No criterion <2.0 (floor threshold)
**Benefits**: Ensures critical criteria meet standard, prevents "compensation" (high Style masking low Security), clear requirements.
**Use cases**: Safety-critical systems, compliance requirements, competency gatekeeping.
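A threshold check that encodes the example rules above; a sketch (Python; criterion names and the default limits mirror the example, not a fixed standard):
```python
def passes(scores, overall_min=3.0, critical=None, floor=2.0):
    """Pass only if the average, every critical threshold, and the per-criterion floor are all met."""
    critical = critical or {}
    avg = sum(scores.values()) / len(scores)
    meets_critical = all(scores.get(c, 0) >= t for c, t in critical.items())
    above_floor = all(s >= floor for s in scores.values())
    return avg >= overall_min and meets_critical and above_floor

# High Style cannot compensate for low Security:
print(passes({"Security": 3, "Performance": 5, "Style": 5}, critical={"Security": 4.0}))  # False
```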
### Combination Rubrics
**Hybrid approaches**:
- **Analytic + Holistic**: Analytic for diagnostic detail, holistic for overall judgment. Use when want both.
- **Checklist + Rubric**: Checklist for must-haves (gatekeeping), rubric for quality gradations (among passing). Use for gatekeeping then ranking.
- **Self-Assessment + Peer + Instructor**: Same rubric used by student, peers, instructor. Compare scores, discuss. Use for metacognitive learning.
---
## 6. Common Pitfalls
### Overlapping Criteria
**Problem**: Criteria not distinct, same dimension scored multiple times.
**Example**: "Organization" (structure, flow, coherence) + "Clarity" (easy to understand, well-structured, logical). **Overlap**: "well-structured" in both.
**Detection**: High correlation between criteria scores. Difficulty explaining difference.
**Fix**: Define boundaries explicitly ("Organization = structure. Clarity = language."), combine overlapping criteria, or split into finer-grained distinct criteria.
### Rubric Drift
**Problem**: Over time, reviewers interpret descriptors differently, rubric meaning changes.
**Causes**: No ongoing calibration, staff turnover, system changes.
**Detection**: IRR declines (was 80%, now 60%), scores inflate/deflate (average was 3.5, now 4.2 with no quality change), inconsistency complaints.
**Prevention**:
1. **Periodic calibration**: Quarterly sessions even with experienced reviewers
2. **Anchor examples**: Maintain library, use same anchors over time
3. **Documentation**: Record scoring decisions, accessible to new reviewers
4. **Version control**: Date rubric versions, note changes, communicate updates
### False Precision
**Problem**: Numeric scores imply precision that doesn't exist. 10-point scale but difference between 7 vs 8 arbitrary.
**Fix**:
- Reduce granularity (10→5 or 3 categories)
- Add descriptors for each level
- Report confidence intervals (Score = 3.5 ± 0.5)
- Be transparent: "Scores are informed judgments, not objective measurements"
### No Consequences for Ignoring Rubric
**Problem**: Rubric exists but reviewers don't use it or override scores based on gut feeling. Rubric becomes meaningless.
**Fix**:
1. **Require justification**: Reviewers must cite rubric descriptors when scoring
2. **Audit scores**: Spot-check scores against rubric, challenge unjustified deviations
3. **Training**: Emphasize rubric as contract (if wrong, change rubric, don't ignore)
4. **Accountability**: Reviewers who consistently deviate lose review privileges
---
## Summary
**Scale design**: Choose granularity matching observable differences. Mitigate central tendency with even-number scales or anchors.
**Descriptor writing**: Use observable language, parallel structure, examples at each level. Test: Can two reviewers score consistently?
**Calibration**: Measure IRR (Kappa, ICC), conduct calibration sessions, refine rubric, prevent drift with ongoing calibration.
**Bias mitigation**: Vertical scoring for halo effect, randomize order for anchoring, normalize or average for leniency/severity.
**Advanced design**: Weight critical criteria, use thresholds to prevent compensation, combine rubric types.
**Pitfalls**: Define distinct criteria, prevent drift with documentation and re-calibration, avoid false precision, ensure rubric has teeth.
**Final principle**: Rubrics structure judgment, not replace it. Use to increase consistency and transparency, not mechanize evaluation.

@@ -0,0 +1,414 @@
# Evaluation Rubrics Templates
Quick-start templates for purpose definition, criteria selection, scale design, descriptor writing, and rubric formats.
## Workflow
```
Rubric Development Progress:
- [ ] Step 1: Define purpose and scope
- [ ] Step 2: Identify evaluation criteria
- [ ] Step 3: Design the scale
- [ ] Step 4: Write performance descriptors
- [ ] Step 5: Test and calibrate
- [ ] Step 6: Use and iterate
```
**Step 1: Define purpose and scope**
Use [Purpose Definition Template](#purpose-definition-template) to clarify evaluation context and constraints.
**Step 2: Identify evaluation criteria**
Brainstorm and prioritize quality dimensions using [Criteria Identification Template](#criteria-identification-template).
**Step 3: Design the scale**
Select scale type and levels using [Scale Selection Template](#scale-selection-template).
**Step 4: Write performance descriptors**
Write clear, observable descriptors using [Descriptor Writing Template](#descriptor-writing-template).
**Step 5: Test and calibrate**
Conduct inter-rater reliability testing and refine rubric.
**Step 6: Use and iterate**
Apply rubric, collect feedback, revise as needed.
---
## Purpose Definition Template
**What are we evaluating?**
- Artifact type: [e.g., code pull requests, research proposals, design mockups, student essays]
- Specific context: [e.g., internal code review, grant competition, course assignment]
**Who will evaluate?**
- Number of evaluators: [Single reviewer or multiple?]
- Evaluator expertise: [Subject matter experts, peers, instructors, automated systems]
- Evaluator availability: [Time per evaluation? Total volume?]
**Who are the evaluatees?**
- Audience: [Students, employees, vendors, applicants]
- Skill level: [Novice, intermediate, expert]
- Will they see rubric before evaluation? [Yes/No - if yes, rubric serves as guide]
**What decisions depend on scores?**
- High stakes: [Pass/fail, hiring, funding, promotion, grades]
- Medium stakes: [Feedback for improvement, prioritization, awards]
- Low stakes: [Self-assessment, informal feedback]
**Success criteria for rubric:**
- [ ] Enables consistent scoring across evaluators (inter-rater reliability >70%)
- [ ] Provides actionable feedback for improvement
- [ ] Takes reasonable time to use (target: X minutes per evaluation)
- [ ] Acceptable to evaluators (not overly complex or rigid)
- [ ] Acceptable to evaluatees (perceived as fair and transparent)
---
## Criteria Identification Template
### Brainstorming Quality Dimensions
**Product criteria** (artifact itself):
- Correctness/Accuracy: [Is it right? Factually accurate? Meets requirements?]
- Completeness: [Covers all necessary elements? No major gaps?]
- Clarity: [Easy to understand? Well-organized? Clear communication?]
- Quality/Craftsmanship: [Attention to detail? Polished? Professional?]
- Originality/Creativity: [Novel approach? Innovative? Goes beyond expected?]
- Performance: [Fast? Efficient? Scalable? Meets technical specs?]
**Process criteria** (how it was made):
- Methodology: [Followed appropriate process? Research methods sound?]
- Collaboration: [Teamwork? Communication? Used feedback?]
- Iteration: [Multiple drafts? Refinement? Responsiveness to critique?]
- Time management: [Completed on time? Paced work appropriately?]
**Impact criteria** (effects/outcomes):
- Usability: [User-friendly? Accessible? Intuitive?]
- Value: [Solves problem? Addresses need? Business impact?]
- Learning demonstrated: [Shows understanding? Growth from previous work?]
**Meta criteria** (quality of quality):
- Maintainability: [Can others work with this? Documented? Modular?]
- Testability: [Can be verified? Validated? Measured?]
- Extensibility: [Can be built upon? Flexible? Adaptable?]
### Prioritization
**Rate each candidate criterion:**
| Criterion | Importance (H/M/L) | Observable (Y/N) | Distinct from others (Y/N) | Include? |
|-----------|-------------------|------------------|---------------------------|----------|
| [Criterion 1] | | | | |
| [Criterion 2] | | | | |
| [Criterion 3] | | | | |
**Selection rules:**
- Must be High or Medium importance
- Must be Observable (can two reviewers score consistently?)
- Must be Distinct (not overlapping with other criteria)
- Aim for 4-8 criteria (balance coverage vs. simplicity)
**Final criteria** (4-8 selected):
1. [Criterion]: [Brief definition]
2. [Criterion]: [Brief definition]
3. [Criterion]: [Brief definition]
4. [Criterion]: [Brief definition]
---
## Scale Selection Template
**Scale type options:**
### Numeric Scales
**1-3 scale** (Low/Medium/High)
- Use when: Quick categorization, clear tiers sufficient
- Levels: 1=Below standard, 2=Meets standard, 3=Exceeds standard
**1-4 scale** (Forced choice, no middle)
- Use when: Want to avoid central tendency, need clear differentiation
- Levels: 1=Poor, 2=Fair, 3=Good, 4=Excellent
**1-5 scale** (Most common, allows neutral)
- Use when: General purpose, familiar to evaluators
- Levels: 1=Poor, 2=Fair, 3=Adequate, 4=Good, 5=Excellent
**1-10 scale** (Fine gradations)
- Use when: Large sample, need statistical analysis, can distinguish subtle differences
- Levels: 1-2=Poor, 3-4=Fair, 5-6=Adequate, 7-8=Good, 9-10=Excellent
### Qualitative Scales
**Developmental**: Novice → Developing → Proficient → Expert
**Standards-based**: Below Standard → Approaching → Meets → Exceeds
**Competency**: Not Yet Competent → Partially Competent → Competent → Highly Competent
### Binary
**Pass/Fail, Yes/No, Present/Absent**
- Use when: Compliance checks, minimum thresholds, clear criteria
**Selected scale for this rubric**: [Choose one]
- **Type**: [Numeric 1-5, Qualitative, etc.]
- **Levels**: [List with labels]
- **Rationale**: [Why this scale fits purpose]
---
## Descriptor Writing Template
For each criterion, write descriptors at each scale level.
### Criterion: [Name]
**Definition**: [What does this criterion assess? 1-2 sentences]
**Why it matters**: [Importance to overall quality]
**Scale descriptors:**
#### Level 5 (or highest): [Label]
**Observable characteristics**:
- [Concrete, observable feature 1]
- [Concrete, observable feature 2]
- [Concrete, observable feature 3]
**Example**: [Specific instance of work at this level]
#### Level 4: [Label]
**Observable characteristics**:
- [How this differs from Level 5 - what's missing or less strong]
- [Concrete observable feature]
**Example**: [Specific instance]
#### Level 3: [Label] (Baseline/Adequate)
**Observable characteristics**:
- [Minimum acceptable performance]
- [Observable feature]
**Example**: [Specific instance]
#### Level 2: [Label]
**Observable characteristics**:
- [What's lacking compared to Level 3]
- [Observable deficiency]
**Example**: [Specific instance]
#### Level 1 (or lowest): [Label]
**Observable characteristics**:
- [Significant deficiencies]
- [Observable problems]
**Example**: [Specific instance]
---
### Descriptor Writing Guidelines
**DO:**
- Use observable, measurable language ("Contains 3+ bugs" not "poor quality")
- Provide concrete examples or anchors for each level
- Focus on what IS present at each level, not just "less than" higher level
- Use parallel structure across levels (same aspects addressed at each level)
- Specify quantities when possible ("All 5 requirements met" vs "Most requirements met")
**DON'T:**
- Use subjective terms without definition ("creative", "professional", "excellent effort")
- Rely on comparative language only ("better than", "more sophisticated")
- Make assumptions about process ("spent time", "worked hard" - unless observable)
- Penalize for things not mentioned in descriptor (hidden expectations)
---
## Analytic Rubric Template
Most common format: Multiple criteria (rows) × Multiple levels (columns)
### Rubric for: [Artifact Type]
**Purpose**: [Brief description]
**Scale**: [1-5, 1-4, etc. with labels]
| Criterion | 1 | 2 | 3 | 4 | 5 | Weight |
|-----------|---|---|---|---|---|--------|
| **[Criterion 1]** | [Descriptor] | [Descriptor] | [Descriptor] | [Descriptor] | [Descriptor] | [×N or %] |
| **[Criterion 2]** | [Descriptor] | [Descriptor] | [Descriptor] | [Descriptor] | [Descriptor] | [×N or %] |
| **[Criterion 3]** | [Descriptor] | [Descriptor] | [Descriptor] | [Descriptor] | [Descriptor] | [×N or %] |
| **[Criterion 4]** | [Descriptor] | [Descriptor] | [Descriptor] | [Descriptor] | [Descriptor] | [×N or %] |
**Scoring:**
- Calculate: [(Score1 × Weight1) + (Score2 × Weight2) + ...] / (Weight1 + Weight2 + ...)
- Threshold: [e.g., Must average ≥3.0 to pass, ≥4 on critical criteria]
**Usage notes:**
- Score each criterion independently before looking at others (avoid halo effect)
- Provide brief justification for each score
- Flag areas for improvement in feedback
---
## Holistic Rubric Template
Single overall score integrating multiple criteria.
### Rubric for: [Artifact Type]
**Purpose**: [Brief description]
#### Level 5: Excellent
**Overall quality**: [Integrated description touching all important aspects]
- Criterion A: [How it manifests at this level]
- Criterion B: [How it manifests at this level]
- Criterion C: [How it manifests at this level]
**Example**: [Work that exemplifies this level]
#### Level 4: Good
**Overall quality**: [Integrated description]
- Differences from Level 5: [What's less strong]
- Key characteristics: [Observable features]
**Example**: [Work that exemplifies this level]
#### Level 3: Adequate
**Overall quality**: [Integrated description of baseline acceptable]
- Meets minimum standards: [What's required]
- May have: [Acceptable weaknesses]
**Example**: [Work that exemplifies this level]
#### Level 2: Weak
**Overall quality**: [Integrated description of below standard]
- Falls short because: [Key deficiencies]
- Problems include: [Observable issues]
**Example**: [Work that exemplifies this level]
#### Level 1: Poor
**Overall quality**: [Integrated description of unacceptable]
- Major problems: [Significant deficiencies across multiple aspects]
**Example**: [Work that exemplifies this level]
---
## Single-Point Rubric Template
Lists criteria with "meets standard" description only, space to note exceeds/below.
### Rubric for: [Artifact Type]
| Criterion | Concerns (Below Standard) | Meets Standard | Advanced (Exceeds Standard) |
|-----------|---------------------------|----------------|----------------------------|
| **[Criterion 1]** | | [Clear description of standard] | |
| **[Criterion 2]** | | [Clear description of standard] | |
| **[Criterion 3]** | | [Clear description of standard] | |
| **[Criterion 4]** | | [Clear description of standard] | |
**Usage:**
- Check if work meets standard for each criterion
- Note specific strengths in "Advanced" column (e.g., "+Exceptionally clear examples")
- Note specific areas for improvement in "Concerns" column (e.g., "-Missing citations for 3 claims")
---
## Checklist Template
Binary yes/no for must-have requirements.
### Checklist for: [Artifact Type]
#### Category 1: [e.g., Completeness]
- [ ] [Specific requirement 1]
- [ ] [Specific requirement 2]
- [ ] [Specific requirement 3]
#### Category 2: [e.g., Quality]
- [ ] [Specific requirement 4]
- [ ] [Specific requirement 5]
#### Category 3: [e.g., Compliance]
- [ ] [Specific requirement 6]
- [ ] [Specific requirement 7]
**Pass/Fail Criteria:**
- **Pass**: All items checked OR All items in critical categories + X% of others
- **Fail**: Any critical item unchecked OR <Y% total items checked
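A sketch of this pass/fail logic (Python; the 80% cutoff stands in for the X%/Y% placeholders above and is an assumption, not a recommendation):
```python
def checklist_pass(items, other_pct=0.80):
    """items: list of (label, is_critical, checked) tuples.
    Pass if every critical item is checked and at least other_pct of the remaining items are checked."""
    critical = [checked for _, is_critical, checked in items if is_critical]
    others = [checked for _, is_critical, checked in items if not is_critical]
    if not all(critical):
        return False
    return (sum(others) / len(others) if others else 1.0) >= other_pct
```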
---
## Weighted Scoring Template
When criteria have different importance.
### Weighted Rubric for: [Artifact Type]
| Criterion | Score (1-5) | Weight | Weighted Score |
|-----------|-------------|--------|----------------|
| [Criterion 1] | | ×3 (Critical) | Score × 3 = |
| [Criterion 2] | | ×2 (Important) | Score × 2 = |
| [Criterion 3] | | ×2 (Important) | Score × 2 = |
| [Criterion 4] | | ×1 (Desirable) | Score × 1 = |
| **Total** | | **8** | **[Sum] / 8 =** |
**Weight categories:**
- ×3 = Critical (must be strong, threshold: ≥4 required)
- ×2 = Important (significant impact on overall quality)
- ×1 = Desirable (nice to have, less critical)
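The scoring arithmetic for this template, as a sketch (Python) using the ×3/×2/×2/×1 weights shown and enforcing the ≥4 threshold on critical criteria; the placeholder criterion names and sample scores are illustrative:
```python
WEIGHTS = {"Criterion 1": 3, "Criterion 2": 2, "Criterion 3": 2, "Criterion 4": 1}  # total = 8

def weighted_result(scores):
    """Return the weighted total (out of 5) and whether every critical (x3) criterion scored >= 4."""
    total = sum(scores[c] * w for c, w in WEIGHTS.items()) / sum(WEIGHTS.values())
    critical_ok = all(scores[c] >= 4 for c, w in WEIGHTS.items() if w == 3)
    return round(total, 2), critical_ok

print(weighted_result({"Criterion 1": 4, "Criterion 2": 3, "Criterion 3": 5, "Criterion 4": 2}))  # (3.75, True)
```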
---
## Calibration Session Template
**Pre-calibration:**
1. Select 3-5 sample works spanning quality range (low, medium, high)
2. Have each reviewer independently score samples using rubric
3. Record scores without discussion
**During calibration:**
4. Compare scores across reviewers for each sample
5. For discrepancies (>1 point difference):
- Discuss what each reviewer saw
- Identify ambiguous descriptors
- Clarify criterion boundaries
- Refine rubric language
6. Re-score samples using refined rubric
**Post-calibration:**
7. Calculate inter-rater reliability (% agreement, Kappa)
8. Target: ≥70% exact agreement (≥80% within 1 point) or Kappa ≥0.70
9. If below target: Iterate with more refinement + calibration
10. Document calibration decisions and rubric changes
---
## Feedback Template
**For: [Evaluatee Name]**
**Overall Score**: [X.X / 5.0 or Level]
**Criterion-by-Criterion Scores:**
| Criterion | Score | Feedback |
|-----------|-------|----------|
| [Criterion 1] | X/5 | **Strengths**: [What was done well]<br>**Areas for improvement**: [Specific suggestions] |
| [Criterion 2] | X/5 | **Strengths**: [What was done well]<br>**Areas for improvement**: [Specific suggestions] |
| [Criterion 3] | X/5 | **Strengths**: [What was done well]<br>**Areas for improvement**: [Specific suggestions] |
**Summary:**
- **Greatest strengths**: [2-3 specific strengths]
- **Priority improvements**: [2-3 most important areas to address]
- **Next steps**: [Actionable recommendations]
**Overall assessment**: [Pass/Fail or qualitative judgment]