Files
gh-lyndonkl-claude/skills/bayesian-reasoning-calibration/resources/evaluators/rubric_bayesian_reasoning_calibration.json
2025-11-30 08:38:26 +08:00

136 lines
5.8 KiB
JSON

{
"name": "Bayesian Reasoning Quality Rubric",
"scale": {
"min": 1,
"max": 5,
"description": "1=Poor, 2=Fair, 3=Good, 4=Very Good, 5=Excellent"
},
"criteria": [
{
"name": "Prior Quality",
"description": "Prior is based on base rates and reference classes, not just intuition",
"scoring": {
"1": "No prior stated or purely intuitive guess",
"2": "Prior stated but ignores base rates entirely",
"3": "Prior considers base rates with some adjustment",
"4": "Prior well-grounded in base rates with justified adjustments",
"5": "Exceptional prior with multiple reference classes and clear reasoning"
}
},
{
"name": "Likelihood Justification",
"description": "Likelihoods P(E|H) and P(E|¬H) are estimated with clear reasoning",
"scoring": {
"1": "No likelihoods or purely guessed",
"2": "Likelihoods given but no justification",
"3": "Likelihoods have basic reasoning",
"4": "Likelihoods well-justified with clear logic",
"5": "Exceptional likelihood estimates with empirical grounding or detailed reasoning"
}
},
{
"name": "Evidence Diagnosticity",
"description": "Evidence meaningfully distinguishes between hypotheses (LR ≠ 1)",
"scoring": {
"1": "Evidence is not diagnostic at all (LR ≈ 1)",
"2": "Evidence is weakly diagnostic (LR = 1-2)",
"3": "Evidence is moderately diagnostic (LR = 2-5)",
"4": "Evidence is strongly diagnostic (LR = 5-10)",
"5": "Evidence is very strongly diagnostic (LR > 10)"
}
},
{
"name": "Calculation Correctness",
"description": "Bayesian calculation is mathematically correct",
"scoring": {
"1": "Major calculation errors",
"2": "Some calculation errors",
"3": "Calculation is correct with minor issues",
"4": "Calculation is fully correct",
"5": "Perfect calculation with both probability and odds forms shown"
}
},
{
"name": "Calibration & Realism",
"description": "Posterior is calibrated, not overconfident (avoids extremes without justification)",
"scoring": {
"1": "Posterior is 0% or 100% without extreme evidence",
"2": "Posterior is very extreme (>95% or <5%) with weak evidence",
"3": "Posterior is reasonable but might be slightly overconfident",
"4": "Well-calibrated posterior with appropriate uncertainty",
"5": "Exceptional calibration with explicit confidence bounds"
}
},
{
"name": "Assumption Transparency",
"description": "Key assumptions and limitations are stated explicitly",
"scoring": {
"1": "No assumptions stated",
"2": "Few assumptions mentioned vaguely",
"3": "Key assumptions stated",
"4": "Comprehensive assumption documentation",
"5": "Exceptional transparency with sensitivity analysis showing assumption impact"
}
},
{
"name": "Base Rate Usage",
"description": "Analysis uses base rates appropriately (avoids base rate neglect)",
"scoring": {
"1": "Completely ignores base rates",
"2": "Acknowledges base rates but doesn't use them",
"3": "Uses base rates for prior",
"4": "Properly incorporates base rates with adjustments",
"5": "Exceptional use of multiple base rates and reference classes"
}
},
{
"name": "Sensitivity Analysis",
"description": "Tests how sensitive conclusion is to input assumptions",
"scoring": {
"1": "No sensitivity analysis",
"2": "Minimal sensitivity check",
"3": "Basic sensitivity analysis on key inputs",
"4": "Comprehensive sensitivity analysis",
"5": "Exceptional sensitivity analysis showing robustness or fragility clearly"
}
},
{
"name": "Interpretation Quality",
"description": "Posterior is interpreted correctly with decision implications",
"scoring": {
"1": "Misinterprets posterior or no interpretation",
"2": "Basic interpretation but lacks context",
"3": "Good interpretation with some decision guidance",
"4": "Clear interpretation with actionable decision implications",
"5": "Exceptional interpretation linking probability to specific actions and thresholds"
}
},
{
"name": "Avoidance of Common Errors",
"description": "Avoids prosecutor's fallacy, base rate neglect, and other Bayesian errors",
"scoring": {
"1": "Multiple major errors (confusing P(E|H) with P(H|E), ignoring base rates)",
"2": "One major error present",
"3": "Mostly avoids common errors",
"4": "Cleanly avoids all common errors",
"5": "Exceptional awareness with explicit checks against common errors"
}
}
],
"overall_assessment": {
"thresholds": {
"excellent": "Average score ≥ 4.5 (publication quality)",
"very_good": "Average score ≥ 4.0 (most forecasts should aim for this)",
"good": "Average score ≥ 3.5 (minimum for important decisions)",
"acceptable": "Average score ≥ 3.0 (workable for low-stakes predictions)",
"needs_rework": "Average score < 3.0 (redo before using)"
},
"stakes_guidance": {
"low_stakes": "Personal predictions, low-cost decisions: aim for ≥ 3.0",
"medium_stakes": "Business decisions, moderate cost: aim for ≥ 3.5",
"high_stakes": "Major decisions, high cost of error: aim for ≥ 4.0"
}
},
"usage_instructions": "Rate each criterion on 1-5 scale. Calculate average. For important forecasts or decisions, minimum score is 3.5. For high-stakes decisions where cost of error is high, aim for ≥4.0. Check especially for base rate neglect, prosecutor's fallacy, and overconfidence - these are the most common errors."
}