Initial commit

Zhongwei Li
2025-11-30 08:38:26 +08:00
commit 41d9f6b189
304 changed files with 98322 additions and 0 deletions

@@ -0,0 +1,253 @@
{
"criteria": [
{
"name": "Criteria Clarity",
"1": "Criteria vague or subjective (e.g., 'good work', 'shows effort'), no definitions, overlapping dimensions",
"3": "Criteria defined but some ambiguity, mostly distinct dimensions, some examples provided",
"5": "Criteria crystal clear with precise definitions, completely distinct non-overlapping dimensions, explicit boundaries (what is/isn't included), examples for each criterion"
},
{
"name": "Scale Appropriateness",
"1": "Scale granularity mismatched to context (10-point scale for subjective judgment or 3-point for fine distinctions), inconsistent level labels",
"3": "Scale granularity reasonable, levels labeled consistently, appropriate for most criteria",
"5": "Scale granularity perfectly matched to observable differences and evaluator expertise, level labels clear and consistent (numeric + qualitative), forced-choice or neutral middle justified by context"
},
{
"name": "Descriptor Specificity",
"1": "Descriptors use subjective language ('excellent', 'creative', 'professional'), no observable features, comparative only ('better than', 'more')",
"3": "Descriptors mostly observable, some quantification (numbers, counts), some comparative language, parallel structure attempted",
"5": "Descriptors 100% observable and measurable (could two reviewers score consistently), quantified where possible (specific numbers, percentages), parallel structure across levels (same aspects at each level), concrete examples or anchors provided"
},
{
"name": "Observability",
"1": "Criteria require mind-reading or assumptions about process ('worked hard', 'creative thinking'), no evidence trail",
"3": "Most criteria observable from artifact, some behavioral indicators, evidence trail for key criteria",
"5": "All criteria directly observable from artifact or documented process, behavioral indicators specified, clear evidence trail (where to look, what counts), two reviewers could independently verify"
},
{
"name": "Inter-Rater Reliability Plan",
"1": "No calibration plan, no IRR measurement, assumes reviewers will 'just know', no anchors",
"3": "Basic calibration mentioned, some anchors or examples, IRR measurement method identified",
"5": "Comprehensive calibration plan (pre/during/post steps), specific IRR target (e.g., Kappa ≥0.70), anchor examples at each level for each criterion, ongoing calibration schedule (quarterly), discrepancy resolution protocol"
},
{
"name": "Comprehensiveness",
"1": "Missing critical quality dimensions, <3 criteria (too sparse) or >12 criteria (too complex), no coverage of must-haves",
"3": "Covers main quality dimensions, 4-8 criteria, may miss some edge cases or secondary aspects",
"5": "Comprehensive coverage of all important quality dimensions (product, process, impact as relevant), 4-8 criteria (balanced coverage vs. usability), addresses must-haves and quality gradations, no hidden expectations"
},
{
"name": "Actionability",
"1": "Descriptors don't guide improvement (says 'poor' but not what's wrong), no feedback mechanism, evaluatees don't see rubric until scored",
"3": "Descriptors somewhat actionable, feedback template exists, rubric shared before evaluation",
"5": "Descriptors explicitly actionable (clear what to change to improve level), feedback template tied to criteria with strengths/improvements, rubric shared upfront so evaluatees can self-assess, examples show what 'good' looks like"
},
{
"name": "Weighting Justification",
"1": "All criteria weighted equally despite different importance, or weights arbitrary (no justification), critical criteria not flagged",
"3": "Some criteria weighted or flagged as critical, basic justification provided, threshold mentioned",
"5": "Weighting system explicit and justified (multiplicative or percentage), critical criteria have thresholds (must score ≥X to pass), compensatory vs. non-compensatory trade-offs acknowledged, scoring calculation clear"
},
{
"name": "Bias Mitigation",
"1": "No acknowledgment of potential biases (halo, leniency, central tendency, anchoring), no mitigation strategies",
"3": "Bias types mentioned, some mitigation (e.g., randomize order, blind scoring), training mentioned",
"5": "Comprehensive bias mitigation: Halo (vertical scoring, blind scoring), central tendency (even-number scale or anchors), leniency/severity (calibration, normalization), order effects (randomization), explicit reviewer training, audit plan for detecting bias"
},
{
"name": "Usability",
"1": "Rubric overly complex (takes >30 min to score one item), no guidance for reviewers, format hard to use (wall of text)",
"3": "Reasonable time to use (<15 min per item), basic reviewer guidance, clear format (table or structured)",
"5": "Efficient to use (target time specified and achievable, <10 min for simple rubrics), comprehensive reviewer guidance (instructions, training materials, FAQs), format optimized for use (table, clear layout, easy to reference), accessible to both evaluators and evaluatees"
}
],
"guidance_by_type": {
"Analytic Rubric": {
"target_score": 4.2,
"key_requirements": [
"Descriptor Specificity (score ≥5): Each criterion × level cell has observable descriptor, parallel structure across levels",
"Comprehensiveness (≥4): 4-8 criteria covering key quality dimensions without overlap",
"Observability (≥5): All criteria measurable from artifact, two reviewers could score consistently",
"Inter-Rater Reliability Plan (≥4): Calibration sessions, anchors, IRR measurement (Kappa ≥0.70 target)"
],
"common_pitfalls": [
"Too many criteria (>10) → time-consuming, overwhelming",
"Overlapping criteria ('Clarity' and 'Organization' conflated)",
"Descriptors use comparative language only ('better than Level 3') without absolute description"
]
},
"Holistic Rubric": {
"target_score": 3.8,
"key_requirements": [
"Descriptor Specificity (≥4): Each level integrates multiple criteria, clear gestalt description, concrete examples",
"Comprehensiveness (≥3): All important quality aspects mentioned in descriptors (even if not separate criteria)",
"Observability (≥4): Overall judgment observable, descriptors reference concrete features",
"Inter-Rater Reliability Plan (≥4): Critical for holistic (lower IRR expected), extensive calibration, many anchors"
],
"common_pitfalls": [
"Descriptors too vague ('excellent overall quality') without specifics",
"No examples or anchors (reviewers have widely different standards)",
"Lower IRR than analytic (expect Kappa 0.60-0.70, not 0.80+)"
]
},
"Single-Point Rubric": {
"target_score": 3.7,
"key_requirements": [
"Descriptor Specificity (≥4): 'Meets standard' descriptor crystal clear, observable, quantified",
"Comprehensiveness (≥4): All critical quality dimensions listed as criteria",
"Actionability (≥5): Strengths/concerns space encourages specific feedback, not just checkmarks",
"Usability (≥5): Fast to use, less intimidating than analytic, encourages dialogue"
],
"common_pitfalls": [
"'Meets standard' too vague (what exactly is the standard?)",
"Used as checklist (just check yes/no) rather than noting specific strengths/concerns",
"No guidance for what 'exceeds' or 'below' means (reviewers inconsistent)"
]
},
"Checklist": {
"target_score": 3.5,
"key_requirements": [
"Descriptor Specificity (≥5): Each item binary, observable, verifiable (yes/no clear)",
"Comprehensiveness (≥5): All must-haves listed, nothing critical missing",
"Observability (≥5): 100% verifiable (can literally check off each item)",
"Usability (≥5): Fast to use, unambiguous, minimal judgment required"
],
"common_pitfalls": [
"Items require judgment ('code is clean') → not truly binary",
"Missing critical items (assumes 'everyone knows' but not documented)",
"Used alone for quality assessment (checklists ensure minimums, don't capture quality gradations)"
]
},
"Standards-Based Rubric": {
"target_score": 4.0,
"key_requirements": [
"Criteria Clarity (≥5): Criteria explicitly tied to learning objectives/competencies/standards",
"Descriptor Specificity (≥5): Levels represent mastery progression (Novice/Competent/Expert with clear differences)",
"Comprehensiveness (≥5): All relevant standards/competencies covered, none missing",
"Actionability (≥5): Descriptors show developmental path, clear how to progress from one level to next"
],
"common_pitfalls": [
"Standards not clearly defined (rubric references 'Standard 3.2' but doesn't explain what it is)",
"Levels don't represent true developmental progression (arbitrary distinctions)",
"Rubric divorced from instruction (students never taught what's in rubric)"
]
}
},
"guidance_by_complexity": {
"Simple Rubric": {
"target_score": 3.5,
"description": "3-5 criteria, 3-4 scale levels, straightforward domain, single evaluator or small team",
"key_requirements": [
"Criteria Clarity (≥3): Criteria defined, mostly distinct, examples for key criteria",
"Descriptor Specificity (≥3): Observable language, some quantification, basic parallel structure",
"Observability (≥3): Criteria observable from artifact, reasonable agreement expected",
"Usability (≥4): Fast to create and use, minimal training needed, <5 min to score"
],
"time_estimate": "2-4 hours to develop, 1 hour calibration",
"examples": [
"Internal code review (3 criteria: Correctness, Readability, Tests)",
"Student homework (4 criteria: Completeness, Accuracy, Clarity, Timeliness)",
"Design critique (3 criteria: Visual hierarchy, Consistency, Accessibility basics)"
]
},
"Standard Rubric": {
"target_score": 4.0,
"description": "5-7 criteria, 4-5 scale levels, moderate complexity, multiple evaluators, some stakes",
"key_requirements": [
"Criteria Clarity (≥4): Precise definitions, distinct dimensions, boundaries explicit, examples for all criteria",
"Descriptor Specificity (≥4): Observable and quantified, parallel structure, concrete examples at each level",
"Inter-Rater Reliability Plan (≥4): Calibration sessions (3-5 samples), IRR measurement (Kappa ≥0.70), anchors at all levels",
"Bias Mitigation (≥3): Acknowledge key biases (halo, central tendency), basic mitigation (randomize, calibration)",
"Actionability (≥4): Clear feedback mechanism, rubric shared upfront, descriptors guide improvement"
],
"time_estimate": "6-10 hours to develop, 2-3 calibration sessions",
"examples": [
"Essay grading (6 criteria: Argument, Evidence, Organization, Clarity, Mechanics, Originality)",
"Product launch review (5 criteria: User value, Technical quality, Market fit, Risk mitigation, Metrics)",
"Vendor selection (7 criteria: Functionality, Cost, Support, Integration, Scalability, Security, Track record)"
]
},
"Complex Rubric": {
"target_score": 4.3,
"description": "6-10 criteria, 5-10 scale levels, high complexity/novelty, many evaluators, high stakes, need for consistency and defensibility",
"key_requirements": [
"Criteria Clarity (≥5): Crystal clear definitions, completely distinct, explicit boundaries, comprehensive examples",
"Descriptor Specificity (≥5): 100% observable/measurable, fully quantified, perfect parallel structure, anchors at all levels",
"Observability (≥5): All criteria independently verifiable, evidence trail documented, IRR target >80%",
"Inter-Rater Reliability Plan (≥5): Extensive calibration (5+ sessions), IRR measurement (Kappa or ICC), ongoing calibration schedule (quarterly), discrepancy protocol, anchor library",
"Weighting Justification (≥5): Explicit weighting or thresholds, justified by context, compensatory vs. non-compensatory clear",
"Bias Mitigation (≥5): Comprehensive mitigation for all bias types, reviewer training program, audit plan, normalization procedures",
"Actionability (≥5): Detailed feedback template, rubric shapes instruction/preparation, multiple examples of work at each level"
],
"time_estimate": "15-25 hours to develop, 5-8 calibration sessions, ongoing maintenance",
"examples": [
"Grant proposal review (10 criteria across significance, innovation, approach, team, environment)",
"Hiring rubric (8 criteria: Technical skills, Problem-solving, Communication, Culture fit, Leadership, Growth mindset, Domain expertise, References)",
"Clinical competency assessment (9 criteria across knowledge, skills, attitudes, professionalism)",
"Algorithmic fairness audit rubric (7 criteria: Accuracy, Disparate impact, Equalized odds, Calibration, Explainability, Recourse, Monitoring)"
]
}
},
"common_failure_modes": [
{
"failure": "Subjective criteria without operationalization",
"symptom": "Criteria like 'creativity', 'professionalism', 'good attitude', 'shows effort' without observable indicators",
"detection": "Ask 'Could two reviewers score this consistently without discussing?' If no → subjective",
"fix": "Define observable behaviors: 'Creativity = uses 2+ techniques not taught, novel combination'. Test with calibration samples."
},
{
"failure": "Overlapping criteria inflating scores",
"symptom": "Criteria like 'Clarity' and 'Organization' or 'Quality' and 'Professionalism' that measure same underlying dimension",
"detection": "High correlation between criteria scores (always move together), difficulty explaining difference between criteria",
"fix": "Define explicit boundaries ('Clarity = language. Organization = structure.'), combine overlapping criteria, or split into distinct fine-grained criteria"
},
{
"failure": "Descriptors use only comparative language",
"symptom": "Level 4 described as 'better than Level 3', 'more sophisticated than Level 2', without absolute description of what Level 4 IS",
"detection": "Read descriptor for Level 4 alone (without seeing other levels). Is it clear what constitutes Level 4? If no → comparative only.",
"fix": "Write absolute descriptors: 'Level 4 = Zero bugs, meets all 5 requirements, performance <100ms'. Each level stands alone."
},
{
"failure": "Scale granularity mismatched to observable differences",
"symptom": "10-point scale for subjective judgment (reviewers can't distinguish 7 vs 8), or 3-point scale for objective dimensions with clear gradations",
"detection": "Low IRR (reviewers disagree), or reviewers never use parts of scale (everyone scores 6-8 on 10-point scale)",
"fix": "Match granularity to real observable differences. If can only distinguish 'poor/adequate/good', use 3-point. If 5 clear levels, use 5-point. Test with calibration."
},
{
"failure": "No parallel structure across levels",
"symptom": "Level 5 mentions A, B, C. Level 3 mentions D, E. Level 1 mentions F. Can't compare what changes between levels.",
"detection": "Try to explain what someone must improve to go from Level 3 → Level 4. If unclear → no parallel structure.",
"fix": "Create table with dimensions (columns) and levels (rows). Ensure each dimension addressed at each level. E.g., 'Variable names | Comments | Complexity' assessed at all 5 levels."
},
{
"failure": "Hidden expectations not in rubric",
"symptom": "Reviewers penalize for things not mentioned in rubric (e.g., rubric doesn't mention formatting but reviewer scores down for poor formatting)",
"detection": "Compare rubric criteria to actual feedback given. Feedback mentions dimensions not in rubric → hidden expectations.",
"fix": "Make all expectations explicit. If it matters enough to penalize, include it. If not in rubric, don't penalize (can suggest, but doesn't affect score)."
},
{
"failure": "No calibration or IRR measurement",
"symptom": "Rubric deployed without testing if reviewers score consistently, no anchor examples, no calibration sessions, 'we trust our reviewers'",
"detection": "Ask 'What's the Kappa or ICC?' If answer is blank stare → no IRR measurement.",
"fix": "Before full deployment: Select 3-5 samples, have all reviewers score independently, calculate IRR (Kappa, ICC), discuss discrepancies, refine rubric, re-test. Target: Kappa ≥0.70 or ICC ≥0.75."
},
{
"failure": "Central tendency bias (everyone scores 3/5)",
"symptom": "Distribution of scores heavily clustered around middle (80% of scores are 3 on 1-5 scale), extremes (1 or 5) almost never used",
"detection": "Plot score distribution. If normal curve centered on middle with narrow spread → central tendency bias.",
"fix": "Even-number scale (1-4, no middle), anchor examples at extremes (show what 1 and 5 look like), forced distribution (controversial), calibration sessions where reviewers practice using full range."
},
{
"failure": "Weighting doesn't reflect importance",
"symptom": "All criteria weighted equally (or no weights) despite some being critical (Security) and others nice-to-have (Code style), or high Style score can compensate for low Security",
"detection": "Ask 'If Security=1 but all other criteria=5, should this pass?' If no, but rubric allows it → weighting problem.",
"fix": "Explicitly weight critical criteria (Security ×3, Style ×1) OR use thresholds (must score ≥4 on Security to pass, regardless of other scores). Document rationale."
},
{
"failure": "Rubric not shared with evaluatees upfront",
"symptom": "Rubric used only by reviewers, evaluatees see rubric for first time when scored, can't self-assess or prepare",
"detection": "Ask evaluatees 'Did you see the rubric before submitting work?' If no → transparency problem.",
"fix": "Share rubric when assignment/project given. Rubric serves as guide and quality standard, not just grading tool. Provide anchor examples so people know what 'good' looks like."
}
]
}

@@ -0,0 +1,365 @@
# Evaluation Rubrics Methodology
Comprehensive guidance on scale design, descriptor writing, calibration, bias mitigation, and advanced rubric design techniques.
## Workflow
```
Rubric Development Progress:
- [ ] Step 1: Define purpose and scope
- [ ] Step 2: Identify evaluation criteria
- [ ] Step 3: Design the scale
- [ ] Step 4: Write performance descriptors
- [ ] Step 5: Test and calibrate
- [ ] Step 6: Use and iterate
```
**Step 1: Define purpose and scope** → See [resources/template.md](template.md#purpose-definition-template)
**Step 2: Identify evaluation criteria** → See [resources/template.md](template.md#criteria-identification-template)
**Step 3: Design the scale** → See [1. Scale Design Principles](#1-scale-design-principles)
**Step 4: Write performance descriptors** → See [2. Descriptor Writing Techniques](#2-descriptor-writing-techniques)
**Step 5: Test and calibrate** → See [3. Calibration Techniques](#3-calibration-techniques)
**Step 6: Use and iterate** → See [4. Bias Mitigation](#4-bias-mitigation) and [6. Common Pitfalls](#6-common-pitfalls)
---
## 1. Scale Design Principles
### Choosing Appropriate Granularity
**The granularity dilemma**: Too few levels (1-3) miss meaningful distinctions; too many levels (1-10) create false precision and inconsistency.
| Factor | Favors Fewer Levels (1-3, 1-4) | Favors More Levels (1-5, 1-10) |
|--------|--------------------------------|--------------------------------|
| Evaluator expertise | Novice reviewers, unfamiliar domain | Expert reviewers, deep domain knowledge |
| Observable differences | Hard to distinguish subtle differences | Clear gradations exist |
| Stakes | High-stakes binary decisions (pass/fail) | Developmental feedback, rankings |
| Sample size | Small samples (< 20 items) | Large samples (100+, statistical analysis) |
| Time available | Quick screening, time pressure | Detailed assessment, ample time |
| Consistency priority | Inter-rater reliability critical | Differentiation more important |
**Scale characteristics** (See SKILL.md Quick Reference for detailed comparison):
- **1-3**: Fast, coarse, high reliability. Use for quick screening.
- **1-4**: Forces choice (no middle), avoids central tendency. Use when bias observed.
- **1-5**: Most common, allows neutral, good balance. General purpose.
- **1-10**: Fine gradations, statistical analysis. Use for large samples (100+), expert reviewers.
- **Qualitative** (Novice/Proficient/Expert): Intuitive for skills, growth-oriented. Educational contexts.
### Central Tendency and Response Biases
**Central tendency bias**: Reviewers avoid extremes, cluster around middle (most get 3/5).
**Causes**: Uncertainty, social pressure, lack of calibration.
**Mitigations**:
1. **Even-number scales** (1-4, 1-6) force choice above/below standard
2. **Anchor examples** at each level (what does 1 vs 5 look like?)
3. **Calibration sessions** where reviewers score same work, discuss discrepancies
4. **Forced distributions** (controversial): Require X% in each category. Use sparingly.
**Other response biases**:
- **Halo effect**: Overall impression biases individual criteria scores.
  - **Mitigation**: Vertical scoring (all work on Criterion 1, then Criterion 2), blind scoring.
- **Leniency/severity bias**: Reviewer consistently scores higher/lower than others.
  - **Mitigation**: Calibration sessions, normalization across reviewers.
- **Range restriction**: Reviewer uses only part of scale (always 3-4, never 1-2 or 5).
  - **Mitigation**: Anchor examples at extremes, forced distribution (cautiously).
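Central tendency and range restriction show up directly in the raw score distributions. A minimal sketch of a detection pass (Python; the reviewer names, scores, and flag thresholds are illustrative assumptions, not standards from this guide):
```python
from statistics import mean, stdev

def distribution_report(scores_by_reviewer, scale_min=1, scale_max=5):
    """Summarize each reviewer's score distribution and flag suspicious patterns."""
    middle = (scale_min + scale_max) / 2
    report = {}
    for reviewer, scores in scores_by_reviewer.items():
        pct_middle = sum(s == middle for s in scores) / len(scores)
        report[reviewer] = {
            "mean": round(mean(scores), 2),
            "spread": round(stdev(scores), 2) if len(set(scores)) > 1 else 0.0,
            "range_used": (min(scores), max(scores)),
            # Heuristic flags (thresholds are assumptions for illustration):
            "flag_central_tendency": pct_middle > 0.6,
            "flag_range_restriction": max(scores) - min(scores) <= 1,
        }
    return report

print(distribution_report({"Reviewer A": [3, 3, 4, 3, 3, 2, 3],
                           "Reviewer B": [1, 5, 4, 2, 3, 5, 2]}))
```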
### Numeric vs. Qualitative Scales
**Numeric** (1-5, 1-10): Easy to aggregate, quantitative comparison, ranking. Numbers feel precise but may be arbitrary.
**Qualitative** (Novice/Proficient/Expert, Below/Meets/Exceeds): Intuitive labels, less false precision. Harder to aggregate, ordinal only.
**Hybrid approach** (best of both): Numeric with labels (1=Poor, 2=Fair, 3=Adequate, 4=Good, 5=Excellent). Labels anchor meaning, numbers enable analysis.
**Unipolar vs. Bipolar**:
- **Unipolar**: 1 (None) → 5 (Maximum). Measures amount or quality. **Use for rubrics.**
- **Bipolar**: 1 (Strongly Disagree) → 5 (Strongly Agree), 3=Neutral. Measures agreement.
---
## 2. Descriptor Writing Techniques
### Observable, Measurable Language
**Core principle**: Two independent reviewers should score the same work consistently based on descriptors alone.
| ❌ Subjective (Avoid) | ✓ Observable (Use) |
|----------------------|-------------------|
| "Shows effort" | "Submitted 3 drafts, incorporated 80%+ of feedback" |
| "Creative" | "Uses 2+ techniques not taught, novel combination of concepts" |
| "Professional quality" | "Zero typos, consistent formatting, APA citations correct" |
| "Good understanding" | "Correctly applies 4/5 key concepts, explains mechanisms" |
| "Needs improvement" | "Contains 5+ bugs, missing 2 required features, <100ms target" |
**Test for observability**: Could two reviewers count/measure this? (Yes → observable). Does this require mind-reading? (Yes → subjective).
**Techniques**:
1. **Quantification**: "All 5 requirements met" vs. "Most requirements met"
2. **Explicit features**: "Includes abstract, intro, methods, results, discussion" vs. "Complete structure"
3. **Behavioral indicators**: "Asks clarifying questions, proposes alternatives" vs. "Critical thinking"
4. **Comparison to standards**: "WCAG AA compliant" vs. "Accessible"
### Parallel Structure Across Levels
**Parallel structure**: Each level addresses the same aspects, making differences clear.
**Example: Code Review, "Readability" criterion**
| Level | Variable Names | Comments/Docs | Code Complexity |
|-------|---------------|---------------|-----------------|
| **5** | Descriptive, domain-appropriate | Comprehensive docs, all functions commented | Simple, DRY, single responsibility |
| **3** | Mostly clear, some abbreviations | Key functions documented, some comments | Moderate complexity, some duplication |
| **1** | Cryptic abbreviations, unclear | No documentation, no comments | Highly complex, nested logic, duplication |
**Benefits**: Easy comparison (what changes 3→5?), diagnostic (pinpoint weakness), fair (same dimensions).
### Examples and Anchors at Each Level
**Anchor**: Concrete example of work at a specific level, calibrates reviewers.
**Types**:
1. **Exemplar work samples**: Actual submissions scored at each level (authentic, requires permission)
2. **Synthetic examples**: Crafted to demonstrate each level (controlled, no permission needed)
3. **Annotated excerpts**: Sections highlighting what merits that score (focused, may miss holistic quality)
**Best practices**:
- Anchor at extremes and middle (minimum: 1, 3, 5)
- Diversity of anchors (different ways to achieve a level)
- Update anchors as rubric evolves
- Make accessible to evaluators and evaluatees
### Avoiding Hidden Expectations
**Hidden expectation**: Quality dimension reviewers penalize but isn't in rubric.
**Example**: Rubric has "Technical Accuracy", "Clarity", "Practical Value". Reviewer scores down for "poor visual design" (not a criterion). **Problem**: Evaluatee had no way to know design mattered.
**Mitigation**:
1. **Comprehensive criteria**: If it matters, include it. If not in rubric, don't penalize.
2. **Criterion definitions**: Explicitly state what is/isn't included.
3. **Feedback constraints**: Suggestions outside rubric don't affect score.
4. **Rubric review**: Ask evaluatees what's missing, update accordingly.
---
## 3. Calibration Techniques
### Inter-Rater Reliability Measurement
**Inter-rater reliability (IRR)**: Degree to which independent reviewers give consistent scores.
**Target IRR thresholds**:
- <50%: Unreliable, major revision needed
- 50-70%: Marginal, refine descriptors, more calibration
- 70-85%: Good, acceptable for most uses
- >85%: Excellent, highly reliable
**Measurement methods**:
**1. Percent Agreement**
- **Calculation**: (# items where reviewers agree exactly) / (total items)
- **Pros**: Simple, intuitive. **Cons**: Inflated by chance agreement.
- **Variant: Within-1 agreement**: Scores within 1 point count as agree. Target: ≥80%.
**2. Cohen's Kappa (κ)**
- **Calculation**: (Observed agreement - Expected by chance) / (1 - Expected by chance)
- **Range**: -1 to 1 (0=chance, 1=perfect agreement)
- **Interpretation**: <0.20 Poor, 0.21-0.40 Fair, 0.41-0.60 Moderate, 0.61-0.80 Substantial, 0.81-1.00 Almost perfect
- **Pros**: Corrects for chance. **Cons**: Only 2 raters, affected by prevalence.
**3. Intraclass Correlation Coefficient (ICC)**
- **Use when**: More than 2 raters, continuous scores
- **Range**: 0 to 1. **Interpretation**: <0.50 Poor, 0.50-0.75 Moderate, 0.75-0.90 Good, >0.90 Excellent
- **Pros**: Handles multiple raters, gold standard. **Cons**: Requires statistical software.
**4. Krippendorff's Alpha**
- **Use when**: Multiple raters, missing data, various data types
- **Range**: ≤1 (0 = chance-level agreement; can be negative). **Interpretation**: α≥0.80 acceptable, ≥0.67 tentatively acceptable
- **Pros**: Most flexible, robust to missing data. **Cons**: Less familiar.
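Percent agreement, the within-1 variant, and Cohen's kappa can be computed by hand; ICC and Krippendorff's alpha are usually left to a statistics package. A minimal sketch (Python; the two rating lists are invented for illustration):
```python
from collections import Counter

def percent_agreement(r1, r2):
    """Exact agreement: share of items where both reviewers give the same score."""
    return sum(a == b for a, b in zip(r1, r2)) / len(r1)

def within_one_agreement(r1, r2):
    """Within-1 variant: scores differing by at most 1 point count as agreement."""
    return sum(abs(a - b) <= 1 for a, b in zip(r1, r2)) / len(r1)

def cohens_kappa(r1, r2):
    """Cohen's kappa for two raters: (observed - chance) / (1 - chance)."""
    n = len(r1)
    p_o = percent_agreement(r1, r2)
    c1, c2 = Counter(r1), Counter(r2)
    p_e = sum((c1[k] / n) * (c2[k] / n) for k in set(r1) | set(r2))
    return (p_o - p_e) / (1 - p_e) if p_e < 1 else 1.0

rater_1 = [4, 3, 5, 2, 4, 3, 4, 5]
rater_2 = [4, 3, 4, 2, 5, 3, 4, 4]
print(percent_agreement(rater_1, rater_2))          # 0.625
print(within_one_agreement(rater_1, rater_2))       # 1.0
print(round(cohens_kappa(rater_1, rater_2), 2))     # ~0.47 (moderate)
```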
### Calibration Session Design
**Pre-calibration**:
1. **Select 3-5 samples** spanning quality range (low, medium, high, edge cases)
2. **Independent scoring**: Each reviewer scores all samples alone, no discussion
3. **Calculate IRR**: Baseline reliability (percent agreement, Kappa)
**During calibration**:
4. **Discuss discrepancies** (focus on differences >1 point): "I scored Sample 1 as 4 because... What led you to 3?"
5. **Identify ambiguities**: Descriptor unclear? Criterion boundaries fuzzy? Missing cases?
6. **Refine rubric**: Clarify descriptors (add specificity, numbers, examples), add anchors, revise criteria
7. **Re-score**: Independently re-score same samples using refined rubric
**Post-calibration**:
8. **Calculate final IRR**: If ≥70%, proceed. If <70%, iterate (more refinement + re-calibration).
9. **Document**: Date, participants, samples, IRR metrics (before/after), rubric changes, scoring decisions
10. **Schedule ongoing calibration**: Monthly or quarterly check-ins (prevents rubric drift)
### Resolving Discrepancies
**When reviewers disagree**:
- **Option 1: Discussion to consensus**: Reviewers discuss, agree on final score. Ensures consistency but time-consuming.
- **Option 2: Averaged scores**: Mean of reviewers' scores. Fast but can mask disagreement (a 4 and a 2 average to an unremarkable 3).
- **Option 3: Third reviewer**: If A and B differ by >1, C scores as tie-breaker. Resolves impasse but requires extra reviewer.
- **Option 4: Escalation**: Discrepancies >1 escalated to lead reviewer or committee. Quality control but bottleneck.
**Recommended**: Average for small discrepancies (1 point), discussion for large (2+ points), escalate if unresolved.
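The recommended policy fits in a small helper; a sketch (Python; the function name and return shape are illustrative):
```python
def resolve(score_a, score_b):
    """Average 1-point gaps; flag 2+ point gaps for discussion (escalate if unresolved)."""
    if abs(score_a - score_b) <= 1:
        return {"final": (score_a + score_b) / 2, "action": "averaged"}
    return {"final": None, "action": "discuss; escalate if unresolved"}

print(resolve(4, 3))  # {'final': 3.5, 'action': 'averaged'}
print(resolve(5, 2))  # {'final': None, 'action': 'discuss; escalate if unresolved'}
```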
---
## 4. Bias Mitigation
### Halo Effect
**Halo effect**: Overall impression biases individual criterion scores. "Excellent work" → all criteria high, or "poor work" → all low.
**Example**: Code has excellent documentation (5/5) but poor performance (should be 2/5). Halo: Reviewer scores performance 4/5 due to overall positive impression.
**Mitigation**:
1. **Vertical scoring**: Score all submissions on Criterion 1, then all on Criterion 2 (focus on one criterion at a time)
2. **Blind scoring**: Reviewers don't see previous scores when scoring new criterion
3. **Separate passes**: First pass for overall sense (don't score), second pass to score each criterion
4. **Criterion definitions**: Clear, narrow definitions reduce bleed-over
### Anchoring and Order Effects
**Anchoring**: First information biases subsequent judgments. First essay scored 5/5 → second (objectively 4/5) feels worse → scored 3/5.
**Mitigation**:
1. **Randomize order**: Review in random order, not alphabetical or submission time
2. **Calibration anchors**: Review rubric and anchors before scoring (resets mental baseline)
3. **Batch scoring**: Score all on one criterion at once (easier to compare)
**Order effects**: Position in sequence affects score (first/last reviewed scored differently).
**Mitigation**: Multiple reviewers score in different random orders (order effect averages out).
### Leniency and Severity Bias
**Leniency**: Reviewer consistently scores higher than others (generous). **Severity**: Consistently scores lower (harsh).
**Detection**: Calculate mean score per reviewer. If Reviewer A averages 4.2 and Reviewer B averages 2.8 on same work → bias present.
**Mitigation**:
1. **Calibration sessions**: Show reviewers their bias, discuss differences
2. **Normalization** (controversial): Convert to z-scores (adjust for reviewer's mean). Changes scores, may feel unfair.
3. **Multiple reviewers**: Average scores (bias cancels out)
4. **Threshold-based**: Focus on "meets standard" (yes/no) vs numeric score
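Detection and the (controversial) z-score normalization take only a few lines; a sketch (Python; the reviewer data are invented):
```python
from statistics import mean, stdev

def leniency_report(scores_by_reviewer):
    """Compare each reviewer's mean to the grand mean; a positive offset suggests leniency."""
    all_scores = [s for scores in scores_by_reviewer.values() for s in scores]
    grand_mean = mean(all_scores)
    report = {}
    for reviewer, scores in scores_by_reviewer.items():
        m = mean(scores)
        sd = stdev(scores) if len(set(scores)) > 1 else 1.0  # avoid division by zero
        report[reviewer] = {
            "mean": round(m, 2),
            "offset": round(m - grand_mean, 2),  # >0 lenient, <0 severe
            "z_scores": [round((s - m) / sd, 2) for s in scores],  # normalized (controversial)
        }
    return report

print(leniency_report({"A": [4, 5, 4, 4], "B": [3, 2, 3, 3]}))
```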
---
## 5. Advanced Rubric Design
### Weighted Criteria
**Weighting approaches**:
**1. Multiplicative weights**:
- Score × weight, sum weighted scores, divide by sum of weights
- Example: Security (4×3=12), Performance (3×2=6), Style (5×1=5). Total: 23/6 = 3.83
**2. Percentage weights**:
- Assign % to each criterion (sum to 100%)
- Example: Security 4×50%=2.0, Performance 3×30%=0.9, Style 5×20%=1.0. Total: 3.9/5.0
**When to weight**: Criteria have different importance, regulatory/compliance criteria, developmental priorities.
**Cautions**: Adds complexity, can obscure deficiencies (low critical score hidden in average). Alternative: Threshold scoring.
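Both weighting schemes reduce to a couple of lines. A sketch (Python) reproducing the two worked examples above:
```python
def multiplicative_weighted(scores, weights):
    """Sum of score x weight, divided by the sum of weights."""
    return sum(scores[c] * weights[c] for c in scores) / sum(weights.values())

def percentage_weighted(scores, pct):
    """Sum of score x percentage weight (percentages should total 1.0)."""
    return sum(scores[c] * pct[c] for c in scores)

scores = {"Security": 4, "Performance": 3, "Style": 5}
print(round(multiplicative_weighted(scores, {"Security": 3, "Performance": 2, "Style": 1}), 2))  # 3.83
print(round(percentage_weighted(scores, {"Security": 0.5, "Performance": 0.3, "Style": 0.2}), 2))  # 3.9
```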
### Threshold Scoring
**Threshold**: Minimum score required on specific criteria regardless of overall average.
**Example**:
- Overall average ≥3.0 to pass
- **AND** Security ≥4.0 (critical threshold)
- **AND** No criterion <2.0 (floor threshold)
**Benefits**: Ensures critical criteria meet standard, prevents "compensation" (high Style masking low Security), clear requirements.
**Use cases**: Safety-critical systems, compliance requirements, competency gatekeeping.
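A threshold check that encodes the example rules above; a sketch (Python; criterion names and the default limits mirror the example, not a fixed standard):
```python
def passes(scores, overall_min=3.0, critical=None, floor=2.0):
    """Pass only if the average, every critical threshold, and the per-criterion floor are all met."""
    critical = critical or {}
    avg = sum(scores.values()) / len(scores)
    meets_critical = all(scores.get(c, 0) >= t for c, t in critical.items())
    above_floor = all(s >= floor for s in scores.values())
    return avg >= overall_min and meets_critical and above_floor

# High Style cannot compensate for low Security:
print(passes({"Security": 3, "Performance": 5, "Style": 5}, critical={"Security": 4.0}))  # False
```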
### Combination Rubrics
**Hybrid approaches**:
- **Analytic + Holistic**: Analytic for diagnostic detail, holistic for overall judgment. Use when want both.
- **Checklist + Rubric**: Checklist for must-haves (gatekeeping), rubric for quality gradations (among passing). Use for gatekeeping then ranking.
- **Self-Assessment + Peer + Instructor**: Same rubric used by student, peers, instructor. Compare scores, discuss. Use for metacognitive learning.
---
## 6. Common Pitfalls
### Overlapping Criteria
**Problem**: Criteria not distinct, same dimension scored multiple times.
**Example**: "Organization" (structure, flow, coherence) + "Clarity" (easy to understand, well-structured, logical). **Overlap**: "well-structured" in both.
**Detection**: High correlation between criteria scores. Difficulty explaining difference.
**Fix**: Define boundaries explicitly ("Organization = structure. Clarity = language."), combine overlapping criteria, or split into finer-grained distinct criteria.
### Rubric Drift
**Problem**: Over time, reviewers interpret descriptors differently, rubric meaning changes.
**Causes**: No ongoing calibration, staff turnover, system changes.
**Detection**: IRR declines (was 80%, now 60%), scores inflate/deflate (average was 3.5, now 4.2 with no quality change), inconsistency complaints.
**Prevention**:
1. **Periodic calibration**: Quarterly sessions even with experienced reviewers
2. **Anchor examples**: Maintain library, use same anchors over time
3. **Documentation**: Record scoring decisions, accessible to new reviewers
4. **Version control**: Date rubric versions, note changes, communicate updates
### False Precision
**Problem**: Numeric scores imply precision that doesn't exist. 10-point scale but difference between 7 vs 8 arbitrary.
**Fix**:
- Reduce granularity (10→5 or 3 categories)
- Add descriptors for each level
- Report confidence intervals (Score = 3.5 ± 0.5)
- Be transparent: "Scores are informed judgments, not objective measurements"
### No Consequences for Ignoring Rubric
**Problem**: Rubric exists but reviewers don't use it or override scores based on gut feeling. Rubric becomes meaningless.
**Fix**:
1. **Require justification**: Reviewers must cite rubric descriptors when scoring
2. **Audit scores**: Spot-check scores against rubric, challenge unjustified deviations
3. **Training**: Emphasize rubric as contract (if wrong, change rubric, don't ignore)
4. **Accountability**: Reviewers who consistently deviate lose review privileges
---
## Summary
**Scale design**: Choose granularity matching observable differences. Mitigate central tendency with even-number scales or anchors.
**Descriptor writing**: Use observable language, parallel structure, examples at each level. Test: Can two reviewers score consistently?
**Calibration**: Measure IRR (Kappa, ICC), conduct calibration sessions, refine rubric, prevent drift with ongoing calibration.
**Bias mitigation**: Vertical scoring for halo effect, randomize order for anchoring, normalize or average for leniency/severity.
**Advanced design**: Weight critical criteria, use thresholds to prevent compensation, combine rubric types.
**Pitfalls**: Define distinct criteria, prevent drift with documentation and re-calibration, avoid false precision, ensure rubric has teeth.
**Final principle**: Rubrics structure judgment, not replace it. Use to increase consistency and transparency, not mechanize evaluation.

@@ -0,0 +1,414 @@
# Evaluation Rubrics Templates
Quick-start templates for purpose definition, criteria selection, scale design, descriptor writing, and rubric formats.
## Workflow
```
Rubric Development Progress:
- [ ] Step 1: Define purpose and scope
- [ ] Step 2: Identify evaluation criteria
- [ ] Step 3: Design the scale
- [ ] Step 4: Write performance descriptors
- [ ] Step 5: Test and calibrate
- [ ] Step 6: Use and iterate
```
**Step 1: Define purpose and scope**
Use [Purpose Definition Template](#purpose-definition-template) to clarify evaluation context and constraints.
**Step 2: Identify evaluation criteria**
Brainstorm and prioritize quality dimensions using [Criteria Identification Template](#criteria-identification-template).
**Step 3: Design the scale**
Select scale type and levels using [Scale Selection Template](#scale-selection-template).
**Step 4: Write performance descriptors**
Write clear, observable descriptors using [Descriptor Writing Template](#descriptor-writing-template).
**Step 5: Test and calibrate**
Conduct inter-rater reliability testing and refine rubric.
**Step 6: Use and iterate**
Apply rubric, collect feedback, revise as needed.
---
## Purpose Definition Template
**What are we evaluating?**
- Artifact type: [e.g., code pull requests, research proposals, design mockups, student essays]
- Specific context: [e.g., internal code review, grant competition, course assignment]
**Who will evaluate?**
- Number of evaluators: [Single reviewer or multiple?]
- Evaluator expertise: [Subject matter experts, peers, instructors, automated systems]
- Evaluator availability: [Time per evaluation? Total volume?]
**Who are the evaluatees?**
- Audience: [Students, employees, vendors, applicants]
- Skill level: [Novice, intermediate, expert]
- Will they see rubric before evaluation? [Yes/No - if yes, rubric serves as guide]
**What decisions depend on scores?**
- High stakes: [Pass/fail, hiring, funding, promotion, grades]
- Medium stakes: [Feedback for improvement, prioritization, awards]
- Low stakes: [Self-assessment, informal feedback]
**Success criteria for rubric:**
- [ ] Enables consistent scoring across evaluators (inter-rater reliability >70%)
- [ ] Provides actionable feedback for improvement
- [ ] Takes reasonable time to use (target: X minutes per evaluation)
- [ ] Acceptable to evaluators (not overly complex or rigid)
- [ ] Acceptable to evaluatees (perceived as fair and transparent)
---
## Criteria Identification Template
### Brainstorming Quality Dimensions
**Product criteria** (artifact itself):
- Correctness/Accuracy: [Is it right? Factually accurate? Meets requirements?]
- Completeness: [Covers all necessary elements? No major gaps?]
- Clarity: [Easy to understand? Well-organized? Clear communication?]
- Quality/Craftsmanship: [Attention to detail? Polished? Professional?]
- Originality/Creativity: [Novel approach? Innovative? Goes beyond expected?]
- Performance: [Fast? Efficient? Scalable? Meets technical specs?]
**Process criteria** (how it was made):
- Methodology: [Followed appropriate process? Research methods sound?]
- Collaboration: [Teamwork? Communication? Used feedback?]
- Iteration: [Multiple drafts? Refinement? Responsiveness to critique?]
- Time management: [Completed on time? Paced work appropriately?]
**Impact criteria** (effects/outcomes):
- Usability: [User-friendly? Accessible? Intuitive?]
- Value: [Solves problem? Addresses need? Business impact?]
- Learning demonstrated: [Shows understanding? Growth from previous work?]
**Meta criteria** (quality of quality):
- Maintainability: [Can others work with this? Documented? Modular?]
- Testability: [Can be verified? Validated? Measured?]
- Extensibility: [Can be built upon? Flexible? Adaptable?]
### Prioritization
**Rate each candidate criterion:**
| Criterion | Importance (H/M/L) | Observable (Y/N) | Distinct from others (Y/N) | Include? |
|-----------|-------------------|------------------|---------------------------|----------|
| [Criterion 1] | | | | |
| [Criterion 2] | | | | |
| [Criterion 3] | | | | |
**Selection rules:**
- Must be High or Medium importance
- Must be Observable (can two reviewers score consistently?)
- Must be Distinct (not overlapping with other criteria)
- Aim for 4-8 criteria (balance coverage vs. simplicity)
**Final criteria** (4-8 selected):
1. [Criterion]: [Brief definition]
2. [Criterion]: [Brief definition]
3. [Criterion]: [Brief definition]
4. [Criterion]: [Brief definition]
---
## Scale Selection Template
**Scale type options:**
### Numeric Scales
**1-3 scale** (Low/Medium/High)
- Use when: Quick categorization, clear tiers sufficient
- Levels: 1=Below standard, 2=Meets standard, 3=Exceeds standard
**1-4 scale** (Forced choice, no middle)
- Use when: Want to avoid central tendency, need clear differentiation
- Levels: 1=Poor, 2=Fair, 3=Good, 4=Excellent
**1-5 scale** (Most common, allows neutral)
- Use when: General purpose, familiar to evaluators
- Levels: 1=Poor, 2=Fair, 3=Adequate, 4=Good, 5=Excellent
**1-10 scale** (Fine gradations)
- Use when: Large sample, need statistical analysis, can distinguish subtle differences
- Levels: 1-2=Poor, 3-4=Fair, 5-6=Adequate, 7-8=Good, 9-10=Excellent
### Qualitative Scales
**Developmental**: Novice → Developing → Proficient → Expert
**Standards-based**: Below Standard → Approaching → Meets → Exceeds
**Competency**: Not Yet Competent → Partially Competent → Competent → Highly Competent
### Binary
**Pass/Fail, Yes/No, Present/Absent**
- Use when: Compliance checks, minimum thresholds, clear criteria
**Selected scale for this rubric**: [Choose one]
- **Type**: [Numeric 1-5, Qualitative, etc.]
- **Levels**: [List with labels]
- **Rationale**: [Why this scale fits purpose]
---
## Descriptor Writing Template
For each criterion, write descriptors at each scale level.
### Criterion: [Name]
**Definition**: [What does this criterion assess? 1-2 sentences]
**Why it matters**: [Importance to overall quality]
**Scale descriptors:**
#### Level 5 (or highest): [Label]
**Observable characteristics**:
- [Concrete, observable feature 1]
- [Concrete, observable feature 2]
- [Concrete, observable feature 3]
**Example**: [Specific instance of work at this level]
#### Level 4: [Label]
**Observable characteristics**:
- [How this differs from Level 5 - what's missing or less strong]
- [Concrete observable feature]
**Example**: [Specific instance]
#### Level 3: [Label] (Baseline/Adequate)
**Observable characteristics**:
- [Minimum acceptable performance]
- [Observable feature]
**Example**: [Specific instance]
#### Level 2: [Label]
**Observable characteristics**:
- [What's lacking compared to Level 3]
- [Observable deficiency]
**Example**: [Specific instance]
#### Level 1 (or lowest): [Label]
**Observable characteristics**:
- [Significant deficiencies]
- [Observable problems]
**Example**: [Specific instance]
---
### Descriptor Writing Guidelines
**DO:**
- Use observable, measurable language ("Contains 3+ bugs" not "poor quality")
- Provide concrete examples or anchors for each level
- Focus on what IS present at each level, not just "less than" higher level
- Use parallel structure across levels (same aspects addressed at each level)
- Specify quantities when possible ("All 5 requirements met" vs "Most requirements met")
**DON'T:**
- Use subjective terms without definition ("creative", "professional", "excellent effort")
- Rely on comparative language only ("better than", "more sophisticated")
- Make assumptions about process ("spent time", "worked hard" - unless observable)
- Penalize for things not mentioned in descriptor (hidden expectations)
---
## Analytic Rubric Template
Most common format: Multiple criteria (rows) × Multiple levels (columns)
### Rubric for: [Artifact Type]
**Purpose**: [Brief description]
**Scale**: [1-5, 1-4, etc. with labels]
| Criterion | 1 | 2 | 3 | 4 | 5 | Weight |
|-----------|---|---|---|---|---|--------|
| **[Criterion 1]** | [Descriptor] | [Descriptor] | [Descriptor] | [Descriptor] | [Descriptor] | [×N or %] |
| **[Criterion 2]** | [Descriptor] | [Descriptor] | [Descriptor] | [Descriptor] | [Descriptor] | [×N or %] |
| **[Criterion 3]** | [Descriptor] | [Descriptor] | [Descriptor] | [Descriptor] | [Descriptor] | [×N or %] |
| **[Criterion 4]** | [Descriptor] | [Descriptor] | [Descriptor] | [Descriptor] | [Descriptor] | [×N or %] |
**Scoring:**
- Calculate: [(Score1 × Weight1) + (Score2 × Weight2) + ...] / (Weight1 + Weight2 + ...)
- Threshold: [e.g., Must average ≥3.0 to pass, ≥4 on critical criteria]
**Usage notes:**
- Score each criterion independently before looking at others (avoid halo effect)
- Provide brief justification for each score
- Flag areas for improvement in feedback
---
## Holistic Rubric Template
Single overall score integrating multiple criteria.
### Rubric for: [Artifact Type]
**Purpose**: [Brief description]
#### Level 5: Excellent
**Overall quality**: [Integrated description touching all important aspects]
- Criterion A: [How it manifests at this level]
- Criterion B: [How it manifests at this level]
- Criterion C: [How it manifests at this level]
**Example**: [Work that exemplifies this level]
#### Level 4: Good
**Overall quality**: [Integrated description]
- Differences from Level 5: [What's less strong]
- Key characteristics: [Observable features]
**Example**: [Work that exemplifies this level]
#### Level 3: Adequate
**Overall quality**: [Integrated description of baseline acceptable]
- Meets minimum standards: [What's required]
- May have: [Acceptable weaknesses]
**Example**: [Work that exemplifies this level]
#### Level 2: Weak
**Overall quality**: [Integrated description of below standard]
- Falls short because: [Key deficiencies]
- Problems include: [Observable issues]
**Example**: [Work that exemplifies this level]
#### Level 1: Poor
**Overall quality**: [Integrated description of unacceptable]
- Major problems: [Significant deficiencies across multiple aspects]
**Example**: [Work that exemplifies this level]
---
## Single-Point Rubric Template
Lists criteria with "meets standard" description only, space to note exceeds/below.
### Rubric for: [Artifact Type]
| Criterion | Concerns (Below Standard) | Meets Standard | Advanced (Exceeds Standard) |
|-----------|---------------------------|----------------|----------------------------|
| **[Criterion 1]** | | [Clear description of standard] | |
| **[Criterion 2]** | | [Clear description of standard] | |
| **[Criterion 3]** | | [Clear description of standard] | |
| **[Criterion 4]** | | [Clear description of standard] | |
**Usage:**
- Check if work meets standard for each criterion
- Note specific strengths in "Advanced" column (e.g., "+Exceptionally clear examples")
- Note specific areas for improvement in "Concerns" column (e.g., "-Missing citations for 3 claims")
---
## Checklist Template
Binary yes/no for must-have requirements.
### Checklist for: [Artifact Type]
#### Category 1: [e.g., Completeness]
- [ ] [Specific requirement 1]
- [ ] [Specific requirement 2]
- [ ] [Specific requirement 3]
#### Category 2: [e.g., Quality]
- [ ] [Specific requirement 4]
- [ ] [Specific requirement 5]
#### Category 3: [e.g., Compliance]
- [ ] [Specific requirement 6]
- [ ] [Specific requirement 7]
**Pass/Fail Criteria:**
- **Pass**: All items checked OR All items in critical categories + X% of others
- **Fail**: Any critical item unchecked OR <Y% total items checked
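A sketch of this pass/fail logic (Python; the 80% cutoff stands in for the X%/Y% placeholders above and is an assumption, not a recommendation):
```python
def checklist_pass(items, other_pct=0.80):
    """items: list of (label, is_critical, checked) tuples.
    Pass if every critical item is checked and at least other_pct of the remaining items are checked."""
    critical = [checked for _, is_critical, checked in items if is_critical]
    others = [checked for _, is_critical, checked in items if not is_critical]
    if not all(critical):
        return False
    return (sum(others) / len(others) if others else 1.0) >= other_pct
```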
---
## Weighted Scoring Template
When criteria have different importance.
### Weighted Rubric for: [Artifact Type]
| Criterion | Score (1-5) | Weight | Weighted Score |
|-----------|-------------|--------|----------------|
| [Criterion 1] | | ×3 (Critical) | Score × 3 = |
| [Criterion 2] | | ×2 (Important) | Score × 2 = |
| [Criterion 3] | | ×2 (Important) | Score × 2 = |
| [Criterion 4] | | ×1 (Desirable) | Score × 1 = |
| **Total** | | **8** | **[Sum] / 8 =** |
**Weight categories:**
- ×3 = Critical (must be strong, threshold: ≥4 required)
- ×2 = Important (significant impact on overall quality)
- ×1 = Desirable (nice to have, less critical)
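The scoring arithmetic for this template, as a sketch (Python) using the ×3/×2/×2/×1 weights shown and enforcing the ≥4 threshold on critical criteria; the placeholder criterion names and sample scores are illustrative:
```python
WEIGHTS = {"Criterion 1": 3, "Criterion 2": 2, "Criterion 3": 2, "Criterion 4": 1}  # total = 8

def weighted_result(scores):
    """Return the weighted total (out of 5) and whether every critical (x3) criterion scored >= 4."""
    total = sum(scores[c] * w for c, w in WEIGHTS.items()) / sum(WEIGHTS.values())
    critical_ok = all(scores[c] >= 4 for c, w in WEIGHTS.items() if w == 3)
    return round(total, 2), critical_ok

print(weighted_result({"Criterion 1": 4, "Criterion 2": 3, "Criterion 3": 5, "Criterion 4": 2}))  # (3.75, True)
```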
---
## Calibration Session Template
**Pre-calibration:**
1. Select 3-5 sample works spanning quality range (low, medium, high)
2. Have each reviewer independently score samples using rubric
3. Record scores without discussion
**During calibration:**
4. Compare scores across reviewers for each sample
5. For discrepancies (>1 point difference):
- Discuss what each reviewer saw
- Identify ambiguous descriptors
- Clarify criterion boundaries
- Refine rubric language
6. Re-score samples using refined rubric
**Post-calibration:**
7. Calculate inter-rater reliability (% agreement, Kappa)
8. Target: ≥70% exact agreement (≥80% within 1 point) or Kappa ≥0.70
9. If below target: Iterate with more refinement + calibration
10. Document calibration decisions and rubric changes
---
## Feedback Template
**For: [Evaluatee Name]**
**Overall Score**: [X.X / 5.0 or Level]
**Criterion-by-Criterion Scores:**
| Criterion | Score | Feedback |
|-----------|-------|----------|
| [Criterion 1] | X/5 | **Strengths**: [What was done well]<br>**Areas for improvement**: [Specific suggestions] |
| [Criterion 2] | X/5 | **Strengths**: [What was done well]<br>**Areas for improvement**: [Specific suggestions] |
| [Criterion 3] | X/5 | **Strengths**: [What was done well]<br>**Areas for improvement**: [Specific suggestions] |
**Summary:**
- **Greatest strengths**: [2-3 specific strengths]
- **Priority improvements**: [2-3 most important areas to address]
- **Next steps**: [Actionable recommendations]
**Overall assessment**: [Pass/Fail or qualitative judgment]