gh-lyndonkl-claude/skills/hypotheticals-counterfactuals/resources/evaluators/rubric_hypotheticals_counterfactuals.json
{
"criteria": [
{
"name": "Scenario Plausibility",
"description": "Scenarios are possible given current knowledge, not fantasy. Counterfactuals were realistic alternatives at decision time.",
"scale": {
"1": "Implausible scenarios (magic, impossible foreknowledge). Counterfactuals couldn't have been chosen at the time.",
"3": "Mostly plausible but some unrealistic assumptions. Counterfactuals stretch believability.",
"5": "All scenarios plausible given what was/is known. Counterfactuals were genuine alternatives available at decision time."
}
},
{
"name": "Minimal Rewrite Principle (Counterfactuals)",
"description": "Counterfactuals change as little as possible to isolate causal factor. Not multiple changes bundled together.",
"scale": {
"1": "Many factors changed simultaneously. Can't tell which caused different outcome. 'What if X AND Y AND Z...'",
"3": "Some attempt at isolation but still multiple changes. Primary factor identified but confounded.",
"5": "Perfect isolation: single factor changed, all else held constant. Causal factor clearly identified."
}
},
{
"name": "Causal Mechanism Specification",
"description": "Explains HOW change leads to different outcome. Not just stating result but tracing causal chain.",
"scale": {
"1": "No mechanism specified. Just outcome stated ('sales would be higher') without explanation.",
"3": "Partial mechanism. Some causal steps identified but incomplete chain.",
"5": "Complete causal chain: initial change → immediate effect → secondary effects → final outcome. Each step explained."
}
},
{
"name": "Probability Calibration",
"description": "Scenarios assigned probabilities based on evidence, base rates, analogies. Not all weighted equally.",
"scale": {
"1": "No probabilities assigned, or all scenarios treated as equally likely. No base rate consideration.",
"3": "Rough probabilities assigned but weak justification. Some consideration of likelihood.",
"5": "Well-calibrated probabilities using base rates, analogies, expert judgment. Sum to 100%. Clear reasoning for each."
}
},
{
"name": "Pre-Mortem Rigor",
"description": "For pre-mortems: follows 6-step process, generates novel failure modes specific to context, not just generic risks.",
"scale": {
"1": "Generic risk list copied from elsewhere. Hindsight bias ('obvious' failures). No structured process.",
"3": "Some specific risks but mixed with generic ones. Process partially followed.",
"5": "Rigorous 6-step process: silent brainstorm, round-robin, voting, mitigations. Context-specific failure modes identified."
}
},
{
"name": "Action Extraction",
"description": "Clear extraction of common actions, hedges, options, and decision points from scenarios. Not just stories.",
"scale": {
"1": "Scenarios developed but no actions extracted. 'Interesting stories' with no operational implications.",
"3": "Some actions identified but incomplete. Missing hedges or options.",
"5": "Comprehensive action plan: common actions (all scenarios), hedges (downside), options (upside), decision triggers clearly specified."
}
},
{
"name": "Leading Indicator Quality",
"description": "Indicators are observable, early (6+ months advance), and actionable. Clear thresholds defined.",
"scale": {
"1": "No indicators, or lagging indicators (show scenario after it's happened). No thresholds.",
"3": "Some leading indicators but vague thresholds or not truly early signals.",
"5": "High-quality leading indicators: observable metrics, 6+ months advance notice, clear thresholds, trigger specific actions."
}
},
{
"name": "Scenario Diversity",
"description": "Scenarios are qualitatively different, not just magnitude variations. Cover meaningful range of futures.",
"scale": {
"1": "Scenarios differ only in magnitude (10% growth vs 15% vs 20%). Basically same story.",
"3": "Some qualitative differences but scenarios too similar or narrow range.",
"5": "Meaningfully different scenarios: qualitative distinctions, broad range captured, distinct strategic implications for each."
}
},
{
"name": "Bias Avoidance (Hindsight/Confirmation)",
"description": "Avoids hindsight bias in counterfactuals, confirmation bias in scenario selection, anchoring on current trends.",
"scale": {
"1": "Strong hindsight bias ('we should have known'). Only scenarios confirming current view. Anchored on status quo.",
"3": "Some bias awareness but incomplete mitigation. Mostly avoids obvious biases.",
"5": "Rigorous bias mitigation: re-inhabits decision context, considers disconfirming scenarios, challenges assumptions, uses base rates."
}
},
{
"name": "Monitoring and Adaptation Plan",
"description": "Defined monitoring cadence (quarterly), indicator tracking, scenario probability updates, adaptation triggers.",
"scale": {
"1": "No monitoring plan. Set-it-and-forget-it scenarios. No updates planned.",
"3": "Informal plan to review occasionally. No specific cadence or triggers.",
"5": "Detailed monitoring: quarterly reviews, indicator dashboard, probability updates, clear adaptation triggers and owner."
}
}
],
"guidance_by_type": {
"Strategic Planning (1-3 year horizon)": {
"target_score": 4.2,
"key_criteria": ["Scenario Diversity", "Probability Calibration", "Action Extraction"],
"common_pitfalls": ["Too narrow scenario range", "No hedges against downside", "Monitoring plan missing"],
"specific_guidance": "Use three-scenario framework (optimistic/baseline/pessimistic) or 2×2 matrix. Assign probabilities (optimistic 15-30%, baseline 40-60%, pessimistic 15-30%). Extract common actions that work across all scenarios, plus hedges for downside. Quarterly monitoring."
},
"Pre-Mortem (Project Risk Identification)": {
"target_score": 4.0,
"key_criteria": ["Pre-Mortem Rigor", "Action Extraction", "Bias Avoidance"],
"common_pitfalls": ["Generic risks (not context-specific)", "Hindsight bias ('obvious' failures)", "No mitigations assigned"],
"specific_guidance": "Follow 6-step process rigorously. Silent brainstorm 5-10 min to prevent groupthink. Generate context-specific failure modes. Vote on top 5-7 risks. Assign mitigation owner and deadline for each."
},
"Counterfactual Learning (Post-Decision Analysis)": {
"target_score": 3.8,
"key_criteria": ["Minimal Rewrite Principle", "Causal Mechanism Specification", "Bias Avoidance"],
"common_pitfalls": ["Changing multiple factors (can't isolate cause)", "No causal mechanism (just outcome)", "Hindsight bias ('knew it all along')"],
"specific_guidance": "Change single factor, hold all else constant. Trace complete causal chain (change → immediate effect → secondary effects → outcome). Re-inhabit decision context to avoid hindsight. Use base rates and analogies to estimate counterfactual probability."
},
"Stress Testing (Decision Robustness)": {
"target_score": 4.0,
"key_criteria": ["Scenario Diversity", "Causal Mechanism Specification", "Action Extraction"],
"common_pitfalls": ["Only optimistic/pessimistic (no black swan)", "No mechanism for how extremes occur", "Decision not actually tested"],
"specific_guidance": "Test decision against optimistic, pessimistic, AND black swan scenarios. Specify HOW extreme outcomes occur. Ask 'Does decision still hold?' for each scenario. Extract hedges to protect against downside extremes."
},
"Assumption Reversal (Innovation/Pivots)": {
"target_score": 3.5,
"key_criteria": ["Scenario Plausibility", "Action Extraction", "Bias Avoidance"],
"common_pitfalls": ["Reversed assumptions implausible", "Interesting but no experiments", "Confirmation bias (only reverse convenient assumptions)"],
"specific_guidance": "Reverse core assumptions ('customers want more features' → 'want fewer'). Test plausibility (could reversal be true?). Design small experiments to test reversal. Challenge assumptions that support current strategy, not just peripheral ones."
}
},
"guidance_by_complexity": {
"Simple (Routine Decisions, Short-Term)": {
"target_score": 3.5,
"focus_areas": ["Scenario Plausibility", "Causal Mechanism Specification", "Action Extraction"],
"acceptable_shortcuts": ["Informal probabilities", "Two scenarios instead of three", "Simple pre-mortem (no voting)"],
"specific_guidance": "Quick pre-mortem (30 min) or simple counterfactual analysis. Two scenarios (optimistic/pessimistic). Extract 2-3 key actions. Informal monitoring acceptable."
},
"Standard (Strategic Decisions, 1-2 year horizon)": {
"target_score": 4.0,
"focus_areas": ["Probability Calibration", "Scenario Diversity", "Leading Indicator Quality", "Monitoring and Adaptation"],
"acceptable_shortcuts": ["Three scenarios (not full 2×2 matrix)", "Quarterly vs monthly monitoring"],
"specific_guidance": "Three-scenario framework with probabilities. Extract common actions, hedges, options. Define 5-7 leading indicators. Quarterly scenario reviews and updates. Assign owners for monitoring."
},
"Complex (High Stakes, Multi-Year, High Uncertainty)": {
"target_score": 4.5,
"focus_areas": ["All criteria", "Rigorous validation", "Comprehensive monitoring"],
"acceptable_shortcuts": ["None - full rigor required"],
"specific_guidance": "Full 2×2 scenario matrix or cone of uncertainty. Rigorous probability calibration using base rates and expert judgment. Comprehensive pre-mortem with cross-functional team. Leading indicators with clear thresholds and decision triggers. Monthly monitoring, quarterly deep reviews. All mitigations assigned with owners and deadlines."
}
},
"common_failure_modes": [
{
"name": "Implausible Counterfactuals (Fantasy)",
"symptom": "Counterfactuals require magic, impossible foreknowledge, or weren't real options at decision time. 'What if we had known pandemic was coming?'",
"detection": "Ask: 'Could a reasonable decision-maker have chosen this alternative given information available then?' If no, implausible.",
"fix": "Restrict to alternatives actually available at decision time. Use 'what was on the table?' test. Avoid hindsight-dependent counterfactuals."
},
{
"name": "Multiple Changes (Can't Isolate Cause)",
"symptom": "Counterfactual changes many factors: 'What if we had raised $3M AND launched EU AND hired different CEO...' Can't tell which mattered.",
"detection": "Count changes. If >1 factor changed, causal isolation violated.",
"fix": "Minimal rewrite: change ONE factor, hold all else constant. Want to test funding? Change funding only. Want to test geography? Change geography only."
},
{
"name": "No Causal Mechanism",
"symptom": "Outcome stated without explanation. 'Sales would be 2× higher' but no WHY or HOW.",
"detection": "Ask 'How does change lead to outcome?' If answer vague or missing, no mechanism.",
"fix": "Trace causal chain: initial change → immediate effect → secondary effects → final outcome. Each step must be explained with logic or evidence."
},
{
"name": "Scenarios Too Similar",
"symptom": "Three scenarios differ only in magnitude (10% growth vs 15% vs 20%). Same story, different numbers.",
"detection": "Read scenarios. Do they describe qualitatively different worlds? If no, too similar.",
"fix": "Make scenarios qualitatively distinct. Different drivers, different strategic implications. Use 2×2 matrix to force diversity via two independent uncertainties."
},
{
"name": "No Probabilities Assigned",
"symptom": "All scenarios treated as equally likely, or no probabilities given. Implies 33% each for three scenarios regardless of plausibility.",
"detection": "Check if probabilities assigned and justified. If missing or all equal, red flag.",
"fix": "Assign probabilities using base rates, analogies, expert judgment. Baseline typically 40-60%, optimistic/pessimistic 15-30% each. Justify each estimate."
},
{
"name": "Hindsight Bias in Counterfactuals",
"symptom": "'Obviously we should have done X' - outcome seems inevitable in retrospect. Overconfidence counterfactual would have succeeded.",
"detection": "Ask: 'Was outcome predictable given information at decision time?' If reasoning depends on information learned after, hindsight bias.",
"fix": "Re-inhabit decision context: what was known/unknown then? What uncertainty existed? Acknowledge alternative could have failed too. Use base rates to calibrate confidence."
},
{
"name": "Generic Pre-Mortem Risks",
"symptom": "Pre-mortem lists generic risks ('ran out of money', 'competition', 'tech didn't work') not specific to this project.",
"detection": "Could these risks apply to any project? If yes, too generic.",
"fix": "Push for context-specific failure modes. What's unique about THIS project? What specific technical challenges? Which specific competitors? What particular market risks?"
},
{
"name": "Scenarios Without Actions",
"symptom": "Interesting stories developed but no operational implications. 'So what should we do?' question unanswered.",
"detection": "Read scenario analysis. Is there action plan with common actions, hedges, options? If no, incomplete.",
"fix": "Always end with action extraction: (1) Common actions (all scenarios), (2) Hedges (downside protection), (3) Options (upside preparation), (4) Leading indicators (monitoring)."
},
{
"name": "Lagging Indicators (Not Leading)",
"symptom": "Indicators show scenario after it's happened. 'Revenue collapse' indicates pessimistic scenario, but too late to act.",
"detection": "Ask: 'Does this indicator give 6+ months advance notice?' If no, it's lagging.",
"fix": "Find early signals: regulatory votes (before law passed), competitor funding rounds (before product launched), adoption rate trends (before market share shift). Leading indicators are predictive, not reactive."
},
{
"name": "No Monitoring Plan",
"symptom": "Scenarios developed, actions defined, then filed away. No one tracking which scenario unfolding or updating probabilities.",
"detection": "Ask: 'Who monitors? How often? What triggers update?' If no answers, no plan.",
"fix": "Define: (1) Owner responsible for monitoring, (2) Cadence (monthly/quarterly reviews), (3) Indicator dashboard, (4) Decision triggers ('If X crosses threshold Y, then action Z'), (5) Scenario probability update process."
}
],
"minimum_standard": 3.5,
"target_score": 4.0,
"excellence_threshold": 4.5
}