commit dccefb2050ee35abee8f1d9e4d09092fe700ec01 Author: Zhongwei Li Date: Sun Nov 30 09:00:08 2025 +0800 Initial commit diff --git a/.claude-plugin/plugin.json b/.claude-plugin/plugin.json new file mode 100644 index 0000000..60db491 --- /dev/null +++ b/.claude-plugin/plugin.json @@ -0,0 +1,12 @@ +{ + "name": "yzmir-systems-thinking", + "description": "Systems thinking methodology - patterns, leverage points, archetypes, modeling, visualization - 6 skills", + "version": "1.0.0", + "author": { + "name": "tachyon-beep", + "url": "https://github.com/tachyon-beep" + }, + "skills": [ + "./skills" + ] +} \ No newline at end of file diff --git a/README.md b/README.md new file mode 100644 index 0000000..a08f8d7 --- /dev/null +++ b/README.md @@ -0,0 +1,3 @@ +# yzmir-systems-thinking + +Systems thinking methodology - patterns, leverage points, archetypes, modeling, visualization - 6 skills diff --git a/plugin.lock.json b/plugin.lock.json new file mode 100644 index 0000000..b823d07 --- /dev/null +++ b/plugin.lock.json @@ -0,0 +1,69 @@ +{ + "$schema": "internal://schemas/plugin.lock.v1.json", + "pluginId": "gh:tachyon-beep/skillpacks:plugins/yzmir-systems-thinking", + "normalized": { + "repo": null, + "ref": "refs/tags/v20251128.0", + "commit": "4db384d5dd4d82e2cf019a9652b84dafb8c1ed3c", + "treeHash": "27e603df8c86b7aa080e1608b0585b68ba9c069917a3dc31330386999bfa9682", + "generatedAt": "2025-11-28T10:28:34.840800Z", + "toolVersion": "publish_plugins.py@0.2.0" + }, + "origin": { + "remote": "git@github.com:zhongweili/42plugin-data.git", + "branch": "master", + "commit": "aa1497ed0949fd50e99e70d6324a29c5b34f9390", + "repoRoot": "/Users/zhongweili/projects/openmind/42plugin-data" + }, + "manifest": { + "name": "yzmir-systems-thinking", + "description": "Systems thinking methodology - patterns, leverage points, archetypes, modeling, visualization - 6 skills", + "version": "1.0.0" + }, + "content": { + "files": [ + { + "path": "README.md", + "sha256": 
"90c2f1ac64eb3cb3edac37aecd1999496a650365361d46c188a4c5c99e219949" + }, + { + "path": ".claude-plugin/plugin.json", + "sha256": "3c0524d25b066f8cb3d30845be51958f84d64e24d303e5e9b154bdd351387cc8" + }, + { + "path": "skills/using-systems-thinking/causal-loop-diagramming.md", + "sha256": "df6c6f38815af334dfd4e87aac793960c9f6ec44abfc5ceae7391e565d90e3dd" + }, + { + "path": "skills/using-systems-thinking/systems-archetypes-reference.md", + "sha256": "e5db64246c014391fb09c2a975d0512b24cf6bf5d697d9aa0557c26e82604d6d" + }, + { + "path": "skills/using-systems-thinking/stocks-and-flows-modeling.md", + "sha256": "ffcbd51a40bd6e38c2d8b8bdee0724250da3bb63705484c2de4dc83c6e373d32" + }, + { + "path": "skills/using-systems-thinking/behavior-over-time-graphs.md", + "sha256": "12246eb7b09499f8db499dea192a62d9cf25994c871ed3243a68db77d0403940" + }, + { + "path": "skills/using-systems-thinking/leverage-points-mastery.md", + "sha256": "f79c95aee764f165334729b5f04a8d643a75ed7d4fc9162d3fc192313884fcfd" + }, + { + "path": "skills/using-systems-thinking/SKILL.md", + "sha256": "27a57c6162af5c4e8eab0ed671f1c57f98cecb527231d19367d32b77f2e3a4bb" + }, + { + "path": "skills/using-systems-thinking/recognizing-system-patterns.md", + "sha256": "71c8f98d702c4eb93462550ed3ab18a11d153e7e54a19180a76f8eaba05b931b" + } + ], + "dirSha256": "27e603df8c86b7aa080e1608b0585b68ba9c069917a3dc31330386999bfa9682" + }, + "security": { + "scannedAt": null, + "scannerVersion": null, + "flags": [] + } +} \ No newline at end of file diff --git a/skills/using-systems-thinking/SKILL.md b/skills/using-systems-thinking/SKILL.md new file mode 100644 index 0000000..7b6ddeb --- /dev/null +++ b/skills/using-systems-thinking/SKILL.md @@ -0,0 +1,483 @@ +--- +name: using-systems-thinking +description: Router for systems thinking methodology - patterns, leverage points, archetypes, stocks-flows, causal loops, BOT graphs +mode: true +pack: yzmir/systems-thinking +faction: yzmir +skill_type: meta_router +dependencies: + - 
yzmir/systems-thinking/recognizing-system-patterns + - yzmir/systems-thinking/leverage-points-mastery + - yzmir/systems-thinking/systems-archetypes-reference + - yzmir/systems-thinking/stocks-and-flows-modeling + - yzmir/systems-thinking/causal-loop-diagramming + - yzmir/systems-thinking/behavior-over-time-graphs +estimated_time_hours: 0.5 +--- + +# Using Systems-Thinking (Meta-Skill Router) + +**Your entry point to systems thinking methodology.** This skill routes you to the right combination of systems analysis skills for understanding complex, interconnected problems. + +## Purpose + +This is a **meta-skill** that: +1. ✅ **Routes** you to the correct systems thinking skills +2. ✅ **Combines** multiple skills for comprehensive analysis +3. ✅ **Provides** workflows for common problem types +4. ✅ **Explains** when to use systems thinking vs other approaches + +**You should use this skill:** When facing complex problems with feedback loops, delays, unintended consequences, or persistent failures despite interventions. 
+ +--- + +## Core Philosophy: Think in Systems + +### The Central Idea + +**Linear Thinking**: Problem → Solution → Fixed +- Assumes cause and effect are close in time and space +- Ignores feedback loops and delays +- Leads to "fixes that fail" and escalation +- Symptoms return or move elsewhere + +**Systems Thinking**: Structure → Behavior → Intervention +- Recognizes feedback loops create behavior +- Delays cause intuition failures +- Small interventions at leverage points beat brute force +- Address root causes, not symptoms + +### When This Pack Applies + +**✅ Use systems-thinking when:** +- Problems persist despite repeated fixes +- Solutions create new problems (unintended consequences) +- System behavior is counter-intuitive +- Multiple stakeholders with conflicting incentives +- Long delays between action and result +- "The harder we push, the harder the system pushes back" + +**❌ Don't use systems-thinking when:** +- Simple, isolated problems with clear cause-effect +- One-time decisions with immediate results +- Pure optimization (no feedback dynamics) +- Well-understood linear processes + +--- + +## Pack Overview: 6 Core Skills + +### Wave 1: Foundation and Pattern Recognition + +#### 1. recognizing-system-patterns +**When to use:** ANY complex problem - start here +**Teaches:** S-curves, feedback loops (reinforcing/balancing), delays, stock-flow thinking +**Examples:** Viral growth, technical debt, burnout spirals +**Time:** 45-60 min +**Key insight:** Behavior patterns reveal underlying structure + +#### 2. systems-archetypes-reference +**When to use:** Recognize recurring problem patterns +**Teaches:** 10 classic archetypes (Fixes that Fail, Shifting the Burden, Escalation, etc.) +**Examples:** Feature factory, hero culture, arms race +**Time:** 60-90 min +**Key insight:** Most problems match known patterns with known solutions + +#### 3. 
leverage-points-mastery +**When to use:** Design interventions, prioritize where to act +**Teaches:** Donella Meadows' 12 leverage points hierarchy +**Examples:** Constants (weak) vs rules vs paradigms (powerful) +**Time:** 60-75 min +**Key insight:** Small changes at high leverage points beat large changes at low points + +### Wave 2: Quantitative Analysis + +#### 4. stocks-and-flows-modeling +**When to use:** Predict future states, calculate equilibrium, analyze accumulation dynamics +**Teaches:** Formal notation, equilibrium analysis, time constants, delay analysis +**Examples:** Customer churn, bug backlog, burnout accumulation +**Time:** 75-90 min +**Key insight:** Quantification elevates from "will get worse" to "6.7 weeks to crisis" + +#### 5. causal-loop-diagramming +**When to use:** Map system structure, communicate feedback dynamics, find root causes +**Teaches:** 6-step construction process, polarity testing, loop identification +**Examples:** Death spirals, virtuous cycles, balancing processes +**Time:** 60-75 min +**Key insight:** Systematic construction prevents polarity errors that change diagnosis + +#### 6. behavior-over-time-graphs +**When to use:** Show trajectories, compare scenarios, communicate dynamics over time +**Teaches:** 7-step construction, 70-80% scale rule, ASCII standards, validation +**Examples:** S-curve adoption, crisis timing, intervention impact +**Time:** 60-75 min +**Key insight:** "What happens over time" with concrete numbers and dates + +--- + +## Routing Logic: Which Skills Do I Need? + +### Decision Tree + +``` +START: What's your goal? + +├─ UNDERSTAND A PROBLEM (First time encountering complexity) +│ ├─ Start here → recognizing-system-patterns +│ ├─ Does it match a known pattern? → systems-archetypes-reference +│ └─ What behavior over time? → behavior-over-time-graphs +│ +├─ MAP SYSTEM STRUCTURE (How does this work?) 
+│ ├─ Identify feedback loops → causal-loop-diagramming +│ ├─ Calculate accumulation → stocks-and-flows-modeling +│ └─ Show dynamics → behavior-over-time-graphs +│ +├─ DESIGN INTERVENTIONS (What should we do?) +│ ├─ Identify leverage points → leverage-points-mastery +│ ├─ Predict outcomes → stocks-and-flows-modeling + behavior-over-time-graphs +│ └─ Match to archetype solution → systems-archetypes-reference +│ +├─ COMMUNICATE TO STAKEHOLDERS (Convince others) +│ ├─ Executive version → behavior-over-time-graphs (with $ impacts) +│ ├─ Technical version → causal-loop-diagramming + stocks-and-flows-modeling +│ └─ Pattern recognition → systems-archetypes-reference ("We're in Fixes that Fail") +│ +└─ QUANTITATIVE PREDICTION (When will crisis hit? How many?) + ├─ Calculate trajectory → stocks-and-flows-modeling + ├─ Visualize scenarios → behavior-over-time-graphs + └─ Validate structure → causal-loop-diagramming +``` + +--- + +## Common Problem Types and Skill Combinations + +### Scenario 1: "Our Solution Keeps Failing" + +**Symptoms:** +- Applied fix multiple times +- Problem returns or gets worse +- "We tried everything" + +**Routing Sequence:** +1. **systems-archetypes-reference** → Recognize "Fixes that Fail" or "Shifting the Burden" +2. **causal-loop-diagramming** → Map the reinforcing loop keeping problem alive +3. **leverage-points-mastery** → Find intervention point (probably addressing root cause, not symptom) +4. **behavior-over-time-graphs** → Show "with fix" vs "without fix" vs "address root cause" + +**Why this sequence:** +- Archetypes give you the pattern (quick recognition) +- CLD maps the specific instance +- Leverage points guide where to intervene +- BOT graphs communicate to stakeholders + +### Scenario 2: "Growth is Slowing / Hitting Limits" + +**Symptoms:** +- Initial success, now plateauing +- S-curve behavior +- Limits to growth + +**Routing Sequence:** +1. **recognizing-system-patterns** → Identify S-curve, find the balancing loop +2. 
**stocks-and-flows-modeling** → Calculate time to saturation, equilibrium capacity +3. **systems-archetypes-reference** → "Limits to Growth" archetype +4. **leverage-points-mastery** → Options: expand limit, find new growth, stabilize +5. **behavior-over-time-graphs** → Show trajectory with/without limit expansion + +**Why this sequence:** +- Pattern recognition confirms S-curve +- Stock-flow gives you numbers (when hits limit?) +- Archetype provides intervention options +- Leverage points prioritize options +- BOT graphs show impact + +### Scenario 3: "We're in a Vicious Spiral" + +**Symptoms:** +- Self-reinforcing decline +- "The harder we work, the worse it gets" +- Death spiral, burnout, quality collapse + +**Routing Sequence:** +1. **recognizing-system-patterns** → Identify reinforcing loop (R) +2. **causal-loop-diagramming** → Map the specific reinforcing structure +3. **systems-archetypes-reference** → Match to "Escalation" or "Success to the Successful" +4. **stocks-and-flows-modeling** → Calculate time to crisis (when does morale hit 0?) +5. **leverage-points-mastery** → Break the loop (add balancing feedback) +6. **behavior-over-time-graphs** → Show crisis timing + intervention impact + +**Why this sequence:** +- Pattern recognition identifies reinforcing dynamic +- CLD maps exact structure (what's reinforcing what?) +- Archetype gives tested solutions +- Stock-flow calculates urgency +- Leverage points find where to break loop +- BOT graphs communicate stakes + +### Scenario 4: "Delay Between Action and Result" + +**Symptoms:** +- Decisions based on old information +- Overshooting, oscillation +- "We keep over/under correcting" + +**Routing Sequence:** +1. **recognizing-system-patterns** → Identify delay, oscillation pattern +2. **stocks-and-flows-modeling** → Calculate delay time constant, D/R ratio +3. **causal-loop-diagramming** → Mark delays on causal links (||) +4. 
**systems-archetypes-reference** → "Shifting the Burden to the Intervenor" (long-term fix delayed) +5. **behavior-over-time-graphs** → Show overshoot/undershoot pattern + +**Why this sequence:** +- Pattern recognition confirms delay issue +- Stock-flow quantifies delay danger (D/R > 0.5 = crisis) +- CLD visualizes where delays are +- Archetype matches delay-based patterns +- BOT graphs show oscillation + +### Scenario 5: "Presenting to Executives" + +**Goal:** Get buy-in for systems-based solution + +**Routing Sequence:** +1. **behavior-over-time-graphs** → LEAD with this (clear, visual, $$ impacts) + - "Current trajectory: 6.7 weeks to crisis" + - "With intervention: stabilizes at 80% capacity" +2. **systems-archetypes-reference** → Frame as known pattern ("We're in Fixes that Fail") +3. **leverage-points-mastery** → Justify intervention choice ("This is a high-leverage point") +4. **causal-loop-diagramming** → BACKUP ONLY (if asked "why does this happen?") + +**Why this sequence:** +- Executives want impact first (BOT graphs) +- Pattern names create shared language (archetypes) +- Leverage points justify resource allocation +- CLDs available if deep dive needed + +### Scenario 6: "Multi-Variable System (Technical Debt, Velocity, Morale)" + +**Symptoms:** +- Many interacting variables +- Hard to see connections +- Non-obvious causality + +**Routing Sequence:** +1. **causal-loop-diagramming** → Map all variables and causal links +2. **stocks-and-flows-modeling** → Calculate multi-stock dynamics (debt, morale, velocity all accumulate) +3. **behavior-over-time-graphs** → Show multi-variable trajectories (separate panels or dual-axis) +4. 
**leverage-points-mastery** → Identify highest leverage variable + +**Why this sequence:** +- CLD handles many variables well +- Stock-flow models accumulation of each +- BOT graphs show multiple trajectories +- Leverage points prioritize where to act + +--- + +## Step-by-Step Workflows + +### Workflow 1: Systematic Problem Analysis (80% of use cases) + +**Process:** +1. **recognizing-system-patterns** (15 min) - What patterns appear? S-curve? Reinforcing loop? +2. **systems-archetypes-reference** (20 min) - Does this match a known archetype? +3. **causal-loop-diagramming** (30 min) - Map the specific structure +4. **stocks-and-flows-modeling** (45 min) - Quantify key stocks, calculate time constants +5. **leverage-points-mastery** (20 min) - Identify high-leverage interventions +6. **behavior-over-time-graphs** (30 min) - Show current trajectory + intervention scenarios + +**Total time:** ~2.5-3 hours +**Output:** Complete systems analysis with quantitative predictions and intervention design + +### Workflow 2: Quick Pattern Recognition (When time-limited) + +**Process:** +1. **recognizing-system-patterns** (15 min) - Quick pattern ID +2. **systems-archetypes-reference** (20 min) - Match to archetype, use archetype's standard solution + +**Total time:** ~35 min +**Output:** Pattern diagnosis + known solution approach +**Trade-off:** No quantification, no custom structure mapping + +### Workflow 3: Executive Presentation Prep + +**Process:** +1. **stocks-and-flows-modeling** (45 min) - Calculate key numbers (crisis timing, costs, ROI) +2. **behavior-over-time-graphs** (40 min) - Create executive-friendly graphs ($$ impact) +3. **leverage-points-mastery** (15 min) - Justify intervention choice +4. **systems-archetypes-reference** (10 min) - Frame with archetype name + +**Total time:** ~110 min +**Output:** Executive presentation with quantified impact + +### Workflow 4: Deep Technical Analysis + +**Process:** +1. 
**recognizing-system-patterns** (15 min) - Pattern confirmation +2. **causal-loop-diagramming** (60 min) - Detailed structure mapping, polarity validation +3. **stocks-and-flows-modeling** (90 min) - Multi-stock equations, sensitivity analysis +4. **behavior-over-time-graphs** (45 min) - Multi-scenario comparison +5. **leverage-points-mastery** (30 min) - Evaluate intervention points + +**Total time:** ~4 hours +**Output:** Comprehensive technical analysis with validated structure and quantified scenarios + +--- + +## Skill Dependencies and Learning Path + +### Learning Path for Beginners + +**Start here if new to systems thinking:** + +1. **recognizing-system-patterns** (REQUIRED FIRST) + - Foundation for all other skills + - Teaches core concepts: stocks, flows, feedback, delays + - Builds intuition for system behavior + +2. **systems-archetypes-reference** (LEARN SECOND) + - Pattern library accelerates analysis + - Provides vocabulary (names for patterns) + - Gives tested solutions + +3. 
Choose path based on needs: + - **Quantitative path** → stocks-and-flows-modeling → behavior-over-time-graphs + - **Structural path** → causal-loop-diagramming → leverage-points-mastery + +### Skill Dependencies + +**No prerequisites:** +- recognizing-system-patterns (START HERE) + +**Requires recognizing-system-patterns:** +- systems-archetypes-reference (builds on patterns) +- causal-loop-diagramming (uses feedback loop concepts) +- stocks-and-flows-modeling (uses stock/flow distinction) +- leverage-points-mastery (uses system structure concepts) +- behavior-over-time-graphs (uses pattern recognition) + +**Works better together:** +- stocks-and-flows-modeling + behavior-over-time-graphs (calculate, then visualize) +- causal-loop-diagramming + leverage-points-mastery (map structure, find intervention) +- systems-archetypes-reference + leverage-points-mastery (pattern → known leverage points) + +--- + +## Rationalization Resistance Table + +| Rationalization | Reality | Counter-Guidance | Red Flag | +|-----------------|---------|------------------|----------| +| "Just add more resources" | Resource additions often activate balancing loops | "Route to leverage-points-mastery - this is lowest-leverage point (constants)" | Ignoring system structure | +| "This isn't a system, it's a simple bug" | Bugs that persist are symptoms of system structure | "Route to systems-archetypes-reference - likely 'Fixes that Fail'" | Linear thinking on complex problems | +| "We don't have time for analysis" | Crisis timing requires stock-flow calculation | "Route to stocks-and-flows-modeling - 15 min calculation vs wrong 6-month commitment" | Analysis paralysis fear | +| "Our situation is unique" | 90% match archetypes | "Route to systems-archetypes-reference - most 'unique' problems aren't" | Not invented here syndrome | +| "Just draw a quick diagram" | Polarity errors change diagnosis (R vs B) | "Route to causal-loop-diagramming - use systematic 6-step process" | Skipping validation | 
+| "Intuition says it will get worse" | Intuition fails on delays, non-linear dynamics | "Route to stocks-and-flows-modeling - calculate, don't guess" | Overconfidence in intuition | +| "We need to act NOW" | Acting without understanding wastes resources | "Route to recognizing-system-patterns - 15 min pattern ID prevents months of wrong solution" | Action bias | +| "Too complicated to model" | Most systems can be modeled simply | "Route to stocks-and-flows-modeling - start with 1-2 stocks" | Complexity avoidance | +| "Graphs are for presentations, not analysis" | Graphs reveal patterns invisible in tables | "Route to behavior-over-time-graphs - construction process IS analysis" | Separating analysis from communication | + +--- + +## Red Flags Checklist + +Watch for these signs of incorrect approach: + +- [ ] **Linear Thinking**: "X causes Y, so fix X" (ignoring feedback loops) +- [ ] **Symptom Treating**: Addressing symptoms without mapping structure +- [ ] **Resource Obsession**: Only considering "add more people/money" solutions +- [ ] **Analysis Paralysis**: Trying to model everything instead of starting simple +- [ ] **Skipping Validation**: Drawing CLDs without polarity double-test +- [ ] **Gut-Feel Quantification**: "Probably double in 6 months" without calculation +- [ ] **Graph Manipulation**: Tweaking scale to make problems look bigger/smaller +- [ ] **Archetype Forcing**: Forcing problem into wrong archetype +- [ ] **Ignoring Delays**: Not marking delays on CLDs or calculating time constants +- [ ] **Single-Skill Fixation**: Using only one tool (e.g., only CLDs, no quantification) + +**If any red flag triggered → STOP → Route to appropriate skill(s)** + +--- + +## When NOT to Use This Pack + +Clarify boundaries with other approaches: + +| Problem Type | Use Instead | Reason | +|--------------|-------------|--------| +| Well-understood algorithm optimization | Standard profiling/optimization | No feedback dynamics | +| One-time decision with immediate 
result | Decision analysis, expected value | No time dynamics | +| Pure data analysis / statistics | Data science methods | Not about system structure | +| Legal/compliance requirements | Ordis security-architect | Different domain | +| Pure UX research | Lyra ux-designer | Different methodology | +| Code architecture | Axiom system-architect | Code structure, not system dynamics | + +**Edge case:** Software architecture CAN have systems dynamics (technical debt accumulation, team coordination). Use **both** system-architect (structure) AND systems-thinking (dynamics). + +--- + +## Integration with Other Packs + +### Simulation-Foundations (Yzmir) +- **Use together when:** Need to implement simulation based on systems model +- **Workflow:** systems-thinking (design) → simulation-foundations (implementation) +- **Example:** Model ecosystem with stocks-and-flows → implement with differential-equations-for-games + +### System-Architect (Axiom) +- **Use together when:** Software architecture decisions have feedback dynamics +- **Workflow:** system-architect (code structure) + systems-thinking (team/process dynamics) +- **Example:** Microservices architecture (static) + team coordination dynamics (systems thinking) + +### Deep-RL (Yzmir) +- **Use together when:** Training RL agents in systems with feedback +- **Workflow:** systems-thinking (environment analysis) → deep-rl (agent design) +- **Example:** Understand ecosystem dynamics with causal-loop-diagramming → train agents with actor-critic-methods + +--- + +## Summary: Start Here + +**First time with systems thinking?** +→ **recognizing-system-patterns** (foundation skill, 45-60 min) + +**Problem keeps returning despite fixes?** +→ **systems-archetypes-reference** → Find "Fixes that Fail" or "Shifting the Burden" + +**Need to predict future states?** +→ **stocks-and-flows-modeling** → Calculate time to crisis, equilibrium + +**Need to map system structure?** +→ **causal-loop-diagramming** → Visualize feedback loops + 
+**Need to design intervention?** +→ **leverage-points-mastery** → Find high-leverage points + +**Need to communicate dynamics?** +→ **behavior-over-time-graphs** → Show trajectories over time + +**Not sure where to start?** +→ Use this router skill! Ask diagnostic questions: +- "Is this problem persisting despite fixes?" +- "Are there delays between action and result?" +- "Do we understand the feedback loops?" +- "What's the goal: understand, map, intervene, or communicate?" + +**Most common workflow:** +recognizing-system-patterns → systems-archetypes-reference → causal-loop-diagramming → stocks-and-flows-modeling → leverage-points-mastery → behavior-over-time-graphs + +**Time for complete analysis:** 2.5-4 hours (depending on complexity) + +**Key principle:** Start with patterns, match to archetypes, map structure, quantify dynamics, find leverage, visualize scenarios. + +--- + +## Systems Thinking Specialist Skills Catalog + +After routing, load the appropriate specialist skill for detailed guidance: + +1. [recognizing-system-patterns.md](recognizing-system-patterns.md) - Foundation: S-curves, feedback loops, delays, stock-flow thinking, pattern recognition +2. [systems-archetypes-reference.md](systems-archetypes-reference.md) - 10 classic archetypes: Fixes that Fail, Shifting the Burden, Escalation, recurring patterns +3. [leverage-points-mastery.md](leverage-points-mastery.md) - Donella Meadows' 12 leverage points, intervention design, prioritization +4. [stocks-and-flows-modeling.md](stocks-and-flows-modeling.md) - Quantitative modeling: equilibrium analysis, time constants, accumulation dynamics +5. [causal-loop-diagramming.md](causal-loop-diagramming.md) - Structure mapping: 6-step construction, polarity testing, loop identification +6. 
[behavior-over-time-graphs.md](behavior-over-time-graphs.md) - Trajectory visualization: 7-step construction, scenario comparison, communication diff --git a/skills/using-systems-thinking/behavior-over-time-graphs.md b/skills/using-systems-thinking/behavior-over-time-graphs.md new file mode 100644 index 0000000..1097682 --- /dev/null +++ b/skills/using-systems-thinking/behavior-over-time-graphs.md @@ -0,0 +1,843 @@ + +# Behavior-Over-Time Graphs + +## When to Use This Skill + +Use behavior-over-time (BOT) graphs when: +- **Predicting future states**: "What will customer count be in 6 months?" +- **Comparing scenarios**: "With intervention vs without intervention" +- **Communicating urgency**: "Look how fast debt is growing!" +- **Demonstrating time-to-crisis**: "We have 14 months before capacity saturated" +- **Validating models**: Overlay actual vs predicted behavior +- **Explaining delays**: "Why solutions take 3 months to show results" + +**Don't use BOT graphs when**: +- You don't know the structure yet → Start with causal loop diagram (CLD) +- You need to show feedback loops → Use CLD with polarity markers +- You want current state only (no trajectory) → Use stock-flow diagram +- Data too uncertain to plot → Use qualitative archetype analysis +- Audience needs WHY not WHEN → Use CLD to show causal logic + +**Key insight**: BOT graphs answer "What happens over time?" with concrete numbers and dates. Use them AFTER you've mapped structure (CLD) and calculated values (stock-flow), to communicate dynamics visually. + + +## The 7-Step Construction Process + +**Build BOT graphs systematically. Never jump to the final graph without validating each step.** + +### Step 1: Identify What to Plot (Stock vs Flow) + +**Rule**: BOT graphs typically show STOCKS (accumulated quantities), not flows (rates). + +**Why**: Stakeholders care about "How bad is the problem?" (stock level) more than "How fast is it changing?" (flow rate). 
+ +**Test**: Can you measure this at a single instant without reference to time? +- YES → Stock (plot it) +- NO → Flow (consider plotting the stock it affects instead) + +**Examples**: +- ✅ Plot: Customer Count (stock) +- ❌ Not: Customer Acquisition Rate (flow) - unless specifically analyzing flow behavior +- ✅ Plot: Bug Backlog (stock) +- ❌ Not: Bug Arrival Rate (flow) +- ✅ Plot: Technical Debt Points (stock) +- ❌ Not: Debt Accumulation Rate (flow) + +**Exception**: Plot flows when analyzing flow behavior itself (e.g., "Development Velocity over time") + + +### Step 2: Determine Time Scale (Granularity and Range) + +**Two decisions**: How fine-grained? How far forward? + +**Granularity** (X-axis intervals): +- **Hourly**: Real-time monitoring, very fast dynamics +- **Daily**: Operational metrics (deployments, incidents) +- **Weekly**: Sprint-level analysis +- **Monthly**: Business metrics (MRR, customer count) +- **Quarterly**: Strategic planning +- **Yearly**: Long-term trends + +**Decision criteria**: +- Match measurement frequency (if customers tracked monthly, use monthly) +- Show intervention timeframe (if intervention monthly, don't use yearly) +- Avoid unnecessary noise (daily SaaS revenue too volatile, use monthly) + +**Range** (how far forward to project): + +**Rule of thumb**: Show **2-3× the time constant** of the system + +**Time constant (τ)** = Time for system to reach ~63% of equilibrium (from stocks-and-flows-modeling) + +**Examples**: +- Customer growth τ = 8 months → Plot 16-24 months +- Bug backlog τ = 2 weeks → Plot 4-6 weeks +- Technical debt τ = infinity (unbounded) → Plot until crisis or intervention + +**Practical**: +- Show intervention point + outcome period (decide at month 3, show months 0-12) +- Include phase transitions (growth → crisis → stabilization) +- Don't over-extend (24 months for 2-week problem dilutes insight) + + +### Step 3: Calculate Values (Using Stock-Flow Equations) + +**Never eyeball the curve**. 
Calculate stock levels using formal equations from stocks-and-flows-modeling. + +**Standard formula**: +``` +Stock(t+1) = Stock(t) + Δt × (Inflow - Outflow) +``` + +**Process**: +1. Identify initial condition: Stock(0) = ? +2. Calculate flows for each time period +3. Apply formula iteratively +4. Verify units: Stock in [X], Flows in [X/time], Δt in [time] +5. Validate: Does equilibrium match calculation? (Set Inflow = Outflow) + +**Example - Bug Backlog**: +``` +Backlog(0) = 50 bugs +Inflow = 30 bugs/month (constant) +Outflow = 0.8 × Velocity (bugs/month, stock-dependent) +Velocity = 40 points/sprint, 2 sprints/month + +Month 0: 50 bugs +Month 1: 50 + (30 - 0.8×40×2) = 50 + (30 - 64) = 16 bugs +Month 2: 16 + (30 - 0.8×40×2) = -18 bugs → Floor at 0 bugs +Equilibrium: Inflow < Outflow, backlog drains to 0 +``` + +**Common mistake**: Guessing values instead of calculating. If stakeholders question, you must defend with math. + + +### Step 4: Select Graph Type + +**Decision tree**: + +**Is the data continuous or discrete?** +- **Continuous** (smooth accumulation) → **Line graph** ✓ (default) +- **Discrete** (step changes) → **Step function** + +**Do you want to emphasize magnitude?** +- **YES** → **Area chart** (fills area under line) +- **NO** → **Line graph** + +**Are you comparing discrete time periods?** +- **YES** → **Bar chart** +- **NO** → **Line graph** + +**Examples**: +- Customer growth over time: **Line graph** (continuous accumulation) +- Headcount changes (hire whole people): **Step function** (discrete jumps) +- Quarterly revenue comparison: **Bar chart** (discrete periods) +- Technical debt accumulation: **Area chart** or **Line** (either works, area emphasizes magnitude) + +**Default**: When unsure, use **line graph**. It's the most versatile and widely understood. + + +### Step 5: Choose Scale (Y-Axis Range) + +**The 70-80% Rule**: Maximum value in your data should occupy **70-80% of the Y-axis range**. 
+ +**Formula**: +``` +Y_max = Data_max / 0.75 +``` + +**Example**: +- Data maximum: 60 debt points +- Y-axis max: 60 / 0.75 = 80 points ✓ + +**Why 70-80%?** +- Provides visual buffer (not cramped at top) +- Makes growth impactful (not tiny slope in vast space) +- Industry standard for clear visualization + +**Common mistakes**: +- ❌ Y-axis = 120 when data max = 60 (only 50% occupied, wastes space) +- ❌ Y-axis = 65 when data max = 60 (92% occupied, cramped, hard to see trend) +- ✅ Y-axis = 80 when data max = 60 (75% occupied, perfect) + +**When to start Y-axis at non-zero**: +- **Use 0 baseline** when showing absolute change (customer count growth 0 → 7,000) +- **Use non-zero** when showing small variations around large baseline (server uptime 98.5% → 99.2%) +- **Warning**: Non-zero baselines can mislead. If using, annotate clearly. + +**Logarithmic scale**: +- Use when data spans multiple orders of magnitude (1 → 1,000 → 1,000,000) +- Use when exponential growth makes linear scale unreadable +- **Always label** "logarithmic scale" explicitly + + +### Step 6: Add Annotations (Events, Phases, Thresholds) + +**Annotations reveal WHY the curve behaves the way it does.** + +**Types of annotations**: + +**1. Event markers** (vertical lines at intervention points): +``` + │ + ↓ +[INTERVENTION] +``` +- Product launch, infrastructure investment, policy change +- Mark the TIME of the decision/event + +**2. Phase labels** (text for regions): +``` +[GROWTH PHASE] [CRISIS] [STABILIZATION] +``` +- Mark distinct system behaviors over time periods + +**3. Threshold lines** (horizontal lines for critical values): +``` +─────────────── Capacity Limit (100 customers/month) +─────────────── Crisis Threshold (200 bugs) +``` +- Show when system crosses critical boundaries + +**4. 
Annotation density limit**: **Max 5-7 annotations per graph**
+- More than 7 → Cluttered, unreadable
+- If you need more, split into multiple graphs
+
+**Placement**:
+- Events: Vertical line at X position, label above or below
+- Phases: Text box or bracket spanning time period
+- Thresholds: Horizontal line with label at end or middle
+
+**Priority**: Annotate the 3 most important events/thresholds, not everything.
+
+
+### Step 7: Validate (Quality Checklist)
+
+**Before presenting any BOT graph, check**:
+
+✅ **Units clearly labeled on both axes?**
+- Y-axis: "Technical Debt (story points)"
+- X-axis: "Time (months)"
+
+✅ **Scale follows 70-80% rule?**
+- Data_max / Y_max between 0.70 and 0.80?
+
+✅ **Time range shows full story?**
+- Intervention point + enough time to see outcome?
+- Shows equilibrium or steady state if system reaches it?
+
+✅ **Annotations clear and not cluttered?**
+- ≤7 annotations total?
+- Labels don't overlap?
+
+✅ **Graph type appropriate for data?**
+- Continuous data → Line
+- Discrete changes → Step function
+- Time period comparison → Bar
+
+✅ **Readable at presentation size?**
+- Can you read axis labels from 10 feet away?
+- Are data lines thick enough?
+
+✅ **Validated against stock-flow calculations?**
+- Do plotted values match your calculated spreadsheet?
+- Did you verify equilibrium point?
+
+✅ **Comparison method clear (if multiple scenarios)?**
+- Different line styles (solid vs dashed)?
+- Legend shows which line is which?
+
+**If any check fails, FIX before presenting.** Wrong scale or missing units destroys credibility.
+ + +## ASCII/Text Visualization Standards + +**Character set for text-based graphs**: +``` +│ ─ ┌ ┐ └ ┘ ╱ ╲ ● ○ ▲ ▼ ┼ ├ ┤ +``` + +**Axis notation**: +``` +Y-Axis Label (units) +│ +80│ +│ +60│ +│ +40│ +│ +20│ +│ +0└───┬───┬───┬───┬───┬───┬─── + 0 2 4 6 8 10 12 + X-Axis Label (units) +``` + +**Data line styles**: +- **Solid line**: ─── (primary scenario, baseline) +- **Dashed line**: ╌╌╌ or - - - (alternative scenario, comparison) +- **Markers**: ● (data points), ▲ (intervention), ▼ (crisis event) + +**Multiple scenarios on same graph**: +``` +80│ ┌───●─── Scenario A (solid) + │ ┌─○┤ +60│ ╌─┘ │ ○╌╌╌ Scenario B (dashed) + │ ╌─┘ │ +40│ ╌─┘ │ + │╌─┘ │ +20│ │ + └──────────────┼────────── + 0 3 6 9 12 months + ▲ + INTERVENTION +``` + +**Spacing and readability**: +- Leave 2-3 character spaces between axis ticks +- Align numbers right-justified on Y-axis +- Keep X-axis labels centered under tick marks + +**Template** (copy and modify): +``` +[Y-AXIS LABEL] (units) +│ +MAX│ + │ +75%│ ┌─── + │ ┌─┘ +50%│ ┌─┘ + │ ┌─┘ +25%│ ┌─┘ + │ ┌─┘ +0 └───┬───┬───┬───┬───┬───┬─── + 0 1 2 3 4 5 6 + [X-AXIS LABEL] (units) +``` + + +## Multi-Variable Framework + +**When you need to plot multiple variables**, choose strategy systematically: + +### Strategy 1: Dual Y-Axis (Same Graph, Two Scales) + +**When to use**: +- ✅ Variables have **causal relationship** (team size drives velocity) +- ✅ Different units (engineers vs story points) +- ✅ Similar time dynamics (both change over same period) +- ✅ Viewer needs to see correlation visually + +**Example**: Team Size (left axis: engineers) + Velocity (right axis: points/sprint) + +**Limitations**: +- Hard in ASCII (need clear labeling) +- Max 2 variables (more is confusing) + + +### Strategy 2: Separate Panels (Stacked, Shared X-Axis) + +**When to use**: +- ✅ Variables from **different domains** (technical vs human) +- ✅ Very different scales (0-100 bugs vs 1-10 morale) +- ✅ Want independent Y-axes for clarity +- ✅ More than 2 variables + 
+**Example**: +``` +Bug Backlog (bugs) +200│ ╱─── + │ ╱── +100│╱── +0 └─────────── + +Morale (1-10) +10│────╲ + │ ╲ +5 │ ──╲ +0 └─────────── + 0 3 6 months +``` + +**Benefit**: Each variable has appropriate scale, viewer can cross-reference via shared time axis + + +### Strategy 3: Normalized 0-100% (Same Scale) + +**When to use**: +- ✅ Relative trends matter more than absolute values +- ✅ Comparing variables with very different units +- ✅ Showing patterns, not magnitudes + +**Example**: Customer % vs Revenue % vs Team % (all normalized to 0-100%) + +**Warning**: Loses actionability. "Customer % = 75%" doesn't tell stakeholder "we have 7,500 customers." + +**Use sparingly**: Only when pattern visualization is the goal, not decision-making. + + +### Decision Matrix: + +| Variables | Strategy | Example | +|-----------|----------|---------| +| 2 related, different units | Dual Y-axis | Team Size + Velocity | +| 3+ from different domains | Separate panels | Bugs + Morale + Debt | +| Need pattern, not magnitude | Normalized 0-100% | Multi-metric dashboard | +| 2 same units | Single axis, overlay | Scenario A vs B customers | + + +## Comparison Strategies + +**Showing "with intervention vs without intervention":** + +### Method 1: Overlay (Same Graph) + +**Best for**: +- Similar scales (both scenarios fit 70-80% rule on same Y-axis) +- Direct visual comparison +- 2-3 scenarios maximum + +**Technique**: +- Solid line = Baseline +- Dashed line = Alternative +- Markers differentiate: ● vs ○ +- Legend shows which is which + +**Example**: +``` +7000│ ○╌╌╌ With Investment (+5%) + │ ╌─┤ +6000│ ╌─┘ │ ●── Baseline + │ ╌─┘ ●─┘ +5000│ ╌─┘ ●─┘ + │ ╌──●─┘ +4000│●─┘ +``` + + +### Method 2: Side-by-Side (Separate Graphs) + +**Best for**: +- Different scales (Scenario A: 0-100, Scenario B: 0-500) +- Many scenarios (4+) +- Independent analysis + +**Technique**: +- Graph 1: Scenario A +- Graph 2: Scenario B +- Shared time axis +- Separate Y-axis scales + +**Use**: When overlay would be 
cluttered or scales incompatible + + +### Method 3: Stacked Panels (Vertically Aligned) + +**Best for**: +- Showing multiple aspects of same scenario +- Different variables (customers, revenue, cost) +- Aligned time for cross-reference + +**Technique**: +- Panel 1: Primary metric +- Panel 2: Secondary metric +- Panel 3: Tertiary metric +- Shared X-axis, independent Y-axes + + +## Phase/Region Marking + +**Showing "crisis zone" or "stable region":** + +**Technique 1: Vertical bands** (time periods): +``` +│ [GROWTH] [CRISIS] [STABLE] +│ ╱──────╲ ────────── +│ ╱ ╲ +│╱ ╲──────── +└───────────────────── + 0 3 6 9 12 +``` + +**Technique 2: Horizontal regions** (threshold bands): +``` +│ ───────── 200 bugs ←─── CRISIS THRESHOLD +│ ╱────── +│ ╱── [SAFE ZONE] +│╱── +└──────── +``` + +**Technique 3: Text labels with brackets**: +``` +│ ╱────── +│ ╱── └──[Peak: Crisis Mode] +│╱── +└───── +``` + +**When to use**: +- Complex dynamics with distinct phases (growth, plateau, decline) +- Critical thresholds (capacity limits, SLA boundaries) +- Multi-phase interventions (before, during, after) + + +## Common Mistakes Catalog + +### 1. Y-Axis Too Large + +**Mistake**: +``` +120│ + │ ┌───── (Data only reaches 60) +60│ ┌─┘ + │ ╱ +0 └────────── +``` +**Problem**: Wastes 50% of space, minimizes visual impact +**Fix**: Apply 70-80% rule → Y-max = 80 + + +### 2. Y-Axis Too Small + +**Mistake**: +``` +65│┌───────── (Data hits 60, cramped!) + │││ +60││ + └────── +``` +**Problem**: Exaggerates tiny changes, looks volatile +**Fix**: Provide 20-30% buffer above max value + + +### 3. Missing Units on Axes + +**Mistake**: +``` +│ "Technical Debt" ← What units? Story points? Hours? $$? +└── "Time" ← Days? Weeks? Months? +``` +**Fix**: Always label with units: "Technical Debt (story points)", "Time (months)" + + +### 4. Time Range Too Short + +**Mistake**: Showing months 0-3 when intervention at month 3 (cuts off outcome) +**Fix**: Extend to month 6-12 to show result of intervention + + +### 5. 
Time Range Too Long + +**Mistake**: Showing 24 months for 2-week bug fix project (dilutes insight) +**Fix**: Match time range to problem scale (weeks for bugs, months for customers, years for strategy) + + +### 6. Too Many Annotations + +**Mistake**: 15 labels, arrows, boxes → Unreadable clutter +**Fix**: Limit to 5-7 most important events/thresholds + + +### 7. Wrong Graph Type + +**Mistake**: Bar chart for continuous accumulation (treats smooth growth as discrete jumps) +**Fix**: Use line graph for continuous, step function for discrete, bar for period comparison + + +### 8. Misleading Non-Zero Baseline + +**Mistake**: +``` +99.5│ ╱─── (Looks like 10× growth!) + │ ╱ +99.0│╱ +``` +**Reality**: 99.0% → 99.5% is only +0.5% absolute change +**Fix**: Either use 0 baseline OR annotate "Y-axis starts at 99%" prominently + + +### 9. Overlaying Incompatible Scales + +**Mistake**: Plotting Customers (0-10,000) and Revenue ($0-$100) on same Y-axis without dual-axis +**Fix**: Use dual Y-axis (left: customers, right: revenue) or separate panels + + +### 10. 
Missing Key Events + +**Mistake**: Curve changes slope at month 6, no annotation explaining why +**Fix**: Mark event: "▲ Infrastructure Investment" at month 6 + + +## Audience Adaptation Template + +**Create different versions for different audiences systematically.** + +### Technical Version (Engineers, Analysts) + +**Language**: +- Use precise terms: "Equilibrium", "Time constant", "Stock-dependent outflow" +- Show equations: `Debt(t+1) = Debt(t) + 15 - 5` +- Include units: "story points", "bugs/week" + +**Detail level**: +- All calculations shown +- Validation checks documented +- Alternative scenarios with sensitivity analysis +- Limitations and assumptions listed + +**Visual complexity**: +- Multi-panel graphs acceptable +- Dual Y-axes if needed +- Detailed annotations (formulas, thresholds) + +**Focus**: HOW and WHY (mechanics, validation, replication) + + +### Executive Version (Board, C-Suite) + +**Language**: +- Use business terms: "Debt stabilizes", "Crisis trajectory", "ROI" +- Hide equations (show result only) +- Use business units: "% of team capacity", "months to crisis" + +**Detail level**: +- Key insights only (no intermediate calculations) +- Single clear recommendation +- ROI or cost-benefit comparison +- Risk framing ("Without action, we reach crisis in 6 months") + +**Visual complexity**: +- Single clean graph (not multi-panel) +- Simple annotations (plain English, no jargon) +- Clear comparison (with vs without intervention) + +**Focus**: WHAT and SO WHAT (outcomes, decisions, impact) + + +### General Audience (Team, Stakeholders) + +**Language**: +- Minimal jargon +- Clear labels ("Bug Count", not "Defect Density") +- Intuitive units (days/months, not time constants) + +**Detail level**: +- Enough to understand trend, not full derivation +- Key events marked +- Why it matters explained in one sentence + +**Visual complexity**: +- Simple line graph +- 3-5 annotations maximum +- Pattern should be obvious (up, down, stable) + +**Focus**: 
UNDERSTANDING (what's happening, why it matters) + + +### Systematic Translation Process: + +| Aspect | Technical | Executive | General | +|--------|-----------|-----------|---------| +| **Language** | Equilibrium, τ, ΔS | Stabilizes, timeline, change | Levels off, when, difference | +| **Detail** | All calculations | Key insights | Main pattern | +| **Visual** | Multi-panel, dual-axis | Single clean graph | Simple line | +| **Equations** | Show formulas | Hide formulas | Hide formulas | +| **Units** | Precise (story points) | Business (% capacity) | Intuitive (days) | +| **Focus** | How/Why | What/So What | What/Why it matters | + +**Process**: Create technical version first (complete), then simplify for executive/general by removing detail and translating language. + + +## Integration with Other Skills + +### BOT + Stock-Flow Modeling + +**Workflow**: +1. **Stock-Flow**: Build equations, calculate values, find equilibrium +2. **BOT Graph**: Visualize those values over time +3. **BOT Graph**: Show trajectory toward (or away from) equilibrium + +**Example**: Stock-flow calculates "Bug backlog drains to 0 in 4 weeks", BOT graph shows the decline curve + + +### BOT + Causal Loop Diagrams + +**Workflow**: +1. **CLD**: Map feedback loops, identify reinforcing vs balancing +2. **Stock-Flow**: Quantify the stocks and flows in loops +3. **BOT Graph**: Show how loops create growth, decline, or oscillation over time + +**Example**: CLD shows "Debt → Slow Velocity → Pressure → Shortcuts → Debt (R loop)", BOT graph shows exponential debt growth + + +### BOT + System Archetypes + +**Workflow**: +1. **Archetype**: Recognize pattern (Fixes that Fail, Escalation) +2. **Stock-Flow**: Model the specific instance +3. 
**BOT Graph**: Show characteristic behavior (symptom relief then return worse) + +**Example**: "Fixes that Fail" archetype → BOT shows quick fix working temporarily (months 1-3), then problem returning worse (months 4-6) + + +### BOT + Leverage Points + +**Workflow**: +1. **Leverage Points**: Identify intervention options (parameter vs structure change) +2. **Stock-Flow**: Model each intervention's impact +3. **BOT Graph**: Compare scenarios visually (intervention A vs B vs do nothing) + +**Example**: BOT shows "Hiring (Level 12): Small improvement, Quality (Level 10): Reaches equilibrium" + + +### Complete Workflow: + +1. **Unknown problem** → Start with **Causal Loop Diagram** (map structure) +2. **Familiar pattern** → Match to **System Archetype** (leverage known interventions) +3. **Need numbers** → Build **Stock-Flow Model** (quantify stocks, flows, equilibrium) +4. **Show dynamics** → Create **BOT Graph** (visualize trajectory over time) +5. **Choose intervention** → Apply **Leverage Points** (rank options) +6. **Communicate decision** → Use **BOT Graph** + **Leverage Points** (show impact of choice) + +**BOT graphs are communication and prediction tools** - use them AFTER structure (CLD) and calculation (Stock-Flow) to show "what happens over time." + + +## Red Flags: Rationalizations to Resist + +### "I can eyeball the curve" + +**Reality**: Intuition fails on non-linear dynamics, delays, equilibrium points. + +**Counter**: +- Exponential growth looks slow until it's not (then it's too late) +- Delays create overshoot your intuition won't predict +- Equilibrium isn't obvious (is it at 5,000 customers or 20,000?) + +**Test**: Sketch your intuitive curve, then calculate. If they match, calculation was quick confirmation. If they don't, your intuition would have misled stakeholders. + + +### "Math takes too long" + +**Reality**: 10 minutes of calculation vs months of wrong decisions. 
+ +**Counter**: +- Stock-flow calculation: 10-15 minutes in spreadsheet +- Drawing wrong curve: Stakeholders make $100K decisions based on it +- Wrong trajectory = wrong intervention = wasted resources + +**Test**: Time to calculate vs cost of error. If error >$10K and decision not easily reversed, CALCULATE. + + +### "Let's make it look dramatic for the board" + +**Reality**: Manipulated graphs destroy credibility permanently. + +**Counter**: +- Non-zero baseline tricks can be spotted (lost trust forever) +- Exaggerated Y-axis makes real data look silly when revealed +- Board members aren't stupid - they'll ask questions + +**Test**: If your graph would look different with accurate scale, you're manipulating. Use honest scale, let the real data speak. + + +### "Too many details, keep it clean" + +**Reality**: "Clean" without context is ambiguous; "simple" ≠ "simplistic" + +**Counter**: +- Removing intervention annotation: Now curve's slope change is mysterious +- Removing threshold: Now viewer doesn't know when crisis hits +- Removing units: Now "60" means nothing + +**Test**: Can stakeholder make correct decision with this graph? If annotations are needed for that, they stay. + + +### "It's obvious what will happen" + +**Reality**: Equilibrium points, overshoot, phase transitions are NOT obvious. + +**Counter**: +- "Obviously grows forever" → Actually stabilizes at equilibrium +- "Obviously stabilizes" → Actually oscillates due to delays +- "Obviously smooth curve" → Actually has crisis dip (infrastructure limit) + +**Test**: Ask three people to sketch their mental model. If they draw different curves, it's NOT obvious. Model it. + + +### "We don't have time to calculate" + +**Reality**: Presenting wrong trajectory wastes everyone's time. + +**Counter**: +- Meeting starts in 30 min → 15 min to calculate, 15 min to draw +- Presenting without calculation → "How did you get these numbers?" 
→ Credibility lost +- Stakeholders make multi-month plans based on your graph → Worth getting right + +**Test**: Is this graph for decision-making or just discussion? If decision-making, calculate. Always. + + +### "The actual data won't match anyway" + +**Reality**: Models predict DYNAMICS (trends), not exact values. + +**Counter**: +- You're right absolute numbers may be off ±20% +- But DYNAMICS are accurate: "Growth then plateau" vs "Unbounded growth" +- Overlay actual data when available, refine model +- Imperfect model > no model > wrong intuition + +**Test**: Model shows "stabilizes at 5,000-7,000 customers in 12-18 months" - even if exact is 6,200 customers at 14 months, you captured the right behavior for decision-making. + + +## Summary + +**Behavior-over-time graphs** visualize system dynamics over time: + +**7-step construction process**: +1. Identify what to plot (stocks, not flows) +2. Determine time scale (granularity and range) +3. Calculate values (using stock-flow equations) +4. Select graph type (line, area, step, bar) +5. Choose scale (70-80% rule) +6. Add annotations (events, phases, thresholds, max 5-7) +7. 
Validate (checklist before presenting) + +**ASCII standards**: +- Consistent character set: │ ─ ┌ ┐ └ ┘ ╱ ╲ ● ○ +- Clear axis labels with units +- Templates for common patterns + +**Key rules**: +- 70-80% scale rule (data_max = 70-80% of Y-axis) +- 2-3× time constant for range +- <7 annotations maximum +- Always calculate, never eyeball + +**Multi-variable strategies**: +- Dual Y-axis: Related variables, different units +- Separate panels: Different domains, independent scales +- Normalized: Pattern focus, not magnitude + +**Audience adaptation**: +- Technical: All details, equations, validation +- Executive: Key insights, business language, ROI +- General: Main pattern, minimal jargon, why it matters + +**Integration**: +- BOT + Stock-Flow: Calculate then visualize +- BOT + CLD: Structure then dynamics +- BOT + Archetypes: Pattern then trajectory +- BOT + Leverage Points: Compare interventions + +**Resist rationalizations**: +- "Eyeball it" → Intuition fails on non-linear systems +- "No time" → 15 min calculation vs wrong decisions +- "Make it dramatic" → Manipulation destroys credibility +- "Keep it clean" → Context matters for decisions +- "It's obvious" → Equilibrium, overshoot, phases aren't obvious + +**The discipline**: Calculate values, choose scale systematically, validate before presenting, adapt to audience. + +**The payoff**: Show concrete predictions with timelines, compare scenarios visually, communicate urgency effectively, enable data-driven decisions. diff --git a/skills/using-systems-thinking/causal-loop-diagramming.md b/skills/using-systems-thinking/causal-loop-diagramming.md new file mode 100644 index 0000000..93908f4 --- /dev/null +++ b/skills/using-systems-thinking/causal-loop-diagramming.md @@ -0,0 +1,781 @@ + +# Causal Loop Diagramming + +## When to Use This Skill + +Use causal loop diagrams (CLDs) when: +- **Exploring problem structure**: "Why does this keep happening?" 
+- **Identifying feedback loops**: Finding vicious cycles and virtuous circles +- **Communicating to stakeholders**: Showing system dynamics simply +- **Pattern matching**: Recognizing archetypes (Fixes that Fail, Escalation, etc.) +- **Early-stage analysis**: Don't have data yet, exploring relationships +- **Building shared understanding**: Team has different mental models + +**Don't use CLDs when**: +- Need specific numbers ("how many?", "when?") → Use stock-flow models +- Problem is well-understood → Use archetypes directly +- System is trivial (one cause, one effect, no feedback) +- Audience needs quantitative proof → Stock-flow first, then CLD to communicate + +**Key insight**: CLDs reveal STRUCTURE (feedback loops), not MAGNITUDE (numbers). Use them to understand "why", then quantify with stock-flow if needed. + + +## The Incremental Construction Process + +**Build CLDs step-by-step to catch errors early. Never jump to the complex final diagram.** + +### Step 1: Identify Variables (States, Not Actions) + +**Rule**: Variables must be STATES (nouns) that can increase or decrease, not ACTIONS (verbs). + +**Test**: "How much X do we have right now?" If answerable, it's a valid variable. + +**Examples**: +- ✅ GOOD: "Technical Debt" (can measure in story points) +- ❌ BAD: "Refactoring" (this is an action, not a state) +- ✅ GOOD: "Team Morale" (can measure on 1-10 scale) +- ❌ BAD: "Improving morale" (action, not state) +- ✅ GOOD: "Manual Process Burden" (hours/week spent on manual work) +- ❌ BAD: "Automating processes" (action, not state) + +**Measurability test**: Can you track this variable over time? If not, it's probably not a good variable. + +**Common error**: Using symptoms instead of root states. +- ❌ "Frustration with deployments" → ✅ "Developer Frustration" + "Deployment Frequency" + +**From scenario to variables**: +1. Underline every noun phrase in the problem description +2. Ask: "Can this increase or decrease?" +3. Ask: "Can we measure this?" +4. 
Rename to be clearly a state (if needed) + +**Audience-appropriate naming**: +- **Technical**: "Code Complexity", "Test Coverage %", "Deployment Frequency" +- **Executive**: "Product Value", "Customer Satisfaction", "Market Position" +- **Both**: "Revenue", "Team Size", "Customer Count" + +**Pick names your audience will understand immediately**. You can translate later, but diagram readability matters. + + +### Step 2: Map Causal Links (Test Mechanism and Direction) + +**For each potential connection, ask THREE questions**: + +**Q1: If A changes, does B change?** +- Not just correlation - is there a MECHANISM? +- Example: "Customers" → "Revenue" (yes, customers pay money - direct mechanism) +- Example: "Customers" → "Stock Price" (indirect through revenue, earnings, etc. - don't link directly) + +**Q2: Which direction does causality flow?** +- A → B (A causes B) +- Not: A ← B (avoid bidirectional arrows) +- Pick the PRIMARY causal direction + +**Example**: +- Revenue enables hiring: Revenue → Team Size ✓ +- Not: Team Size → Revenue (though more team eventually leads to more features → revenue, that's a longer path) + +**Q3: Is this link strong, weak, or conditional?** +- Strong: Direct, immediate, clear +- Weak: Indirect, long delay, many mediating factors +- Conditional: Only happens under certain circumstances + +**Mark weak or conditional links later** (after basic structure is clear). Start with strong, direct links. + +**The mechanism test**: +State the link in a sentence: "When [A] increases, [B] changes because [mechanism]." + +**Example**: +- "When Technical Debt increases, Development Velocity decreases because complexity slows down coding" ✓ +- "When Team Size increases, Bugs decrease because..." (wait, do more people reduce bugs? Or increase coordination overhead? This link might be wrong!) + +**Common mistake**: Assuming "more X is better" without testing the mechanism. 
+ + +### Step 3: Assign Polarities (Test Both Directions) + +**Polarity indicates whether A and B move in the same direction or opposite directions.** + +**Same direction (+, S)**: +- A ↑ → B ↑ (more A causes more B) +- A ↓ → B ↓ (less A causes less B) +- Example: Features ↑ → Revenue ↑ (more features, more value, more revenue) + +**Opposite direction (o, −)**: +- A ↑ → B ↓ (more A causes less B) +- A ↓ → B ↑ (less A causes more B) +- Example: Technical Debt ↑ → Velocity ↓ (more debt slows development) + +**THE DOUBLE TEST (prevents 90% of polarity errors)**: + +1. Test increase: "If A INCREASES, does B increase or decrease?" +2. Test decrease: "If A DECREASES, does B increase or decrease?" +3. Verify both give consistent polarity + +**Example - Testing "Budget Pressure → Automation Investment"**: +- If budget pressure INCREASES → Investment DECREASES (CFO cuts spending) → Opposite (o) +- If budget pressure DECREASES → Investment INCREASES (more slack, can invest) → Opposite (o) +- **Consistent**: Both tests show opposite direction ✓ + +**Common mistake**: "More pressure should drive more investment" (confusing "pressure to invest" with "financial pressure to cut"). **ALWAYS test the actual mechanism**, not what "should" happen. + +**Negative words ≠ negative polarity**: +- "Technical Debt" (sounds bad) → "Velocity" (slower is bad) = OPPOSITE polarity (o) +- Don't confuse "bad thing" with polarity direction + +**State the relationship in words** before marking polarity: +- "More debt makes development slower" → OPPOSITE (o) +- "More customers brings more revenue" → SAME (+) + +**Notation**: +- Use `--+-->` or `→+` for same direction +- Use `--o-->` or `→o` (or `→−`) for opposite direction +- Be consistent throughout diagram + + +### Step 4: Find Loops (Trace Until You Return) + +**Algorithm**: + +1. **Pick any variable** (ideally one you think is important) +2. **Follow the arrows** until you return to the starting variable +3. 
**Mark the loop** with a label (R1, B1, R2, etc.) +4. **Repeat** from different starting points until no new loops found + +**Example**: +``` +Start: Manual Process Burden + → (o) → Release Frequency + → (o) → Developer Frustration + → (+) → Automation Investment + → (o) → Manual Process Burden +(returned to start = LOOP FOUND) +``` + +**Loop type determination**: + +**Count the number of OPPOSITE (o) polarities in the loop**: +- **Even number (including 0)** = **Reinforcing (R)** (amplifies change) +- **Odd number** = **Balancing (B)** (resists change, seeks equilibrium) + +**Example above**: +- Opposite links: 3 (odd number) +- **Loop type**: Balancing (B1) + +**Why this works**: +- Each opposite link "flips" the direction +- Odd number of flips = net opposite = balancing (brings you back) +- Even number of flips = net same = reinforcing (amplifies) + +**Multiple loops**: +Complex systems have many loops. Label them: +- R1, R2, R3... (reinforcing) +- B1, B2, B3... (balancing) + +**Dominant loop**: Which loop drives the system? +- **Shortest delay**: Faster loops dominate early +- **Strongest amplification**: Which grows/shrinks fastest? +- **Phase-dependent**: R1 might dominate early, B1 later + +**Nested loops**: +Some loops share variables. This creates complex dynamics where loops amplify or counteract each other. + + +### Step 5: Mark Delays (Where Significant) + +**Delay notation**: `||delay time||` on the link where delay occurs + +**When is delay significant?** +- Delay / Response time > 0.2 (20% of cycle time) → Mark it +- If delay > 50% of response time → VERY significant, double-mark or bold + +**Types of delays**: + +1. **Information delay**: Time to notice the change + - Example: Performance degrades → 2 weeks → Customers complain + - Mark: Performance → ||2 weeks|| → Customer Complaints + +2. 
**Material delay**: Time to implement solution + - Example: Decide to hire → 3 months → New engineer productive + - Mark: Hiring Decision → ||3 months|| → Team Capacity + +3. **Perception delay**: Time to believe/accept + - Example: Metrics improve → 1 month → Team believes it's real + - Mark: Metrics → ||1 month|| → Team Confidence + +**Why delays matter**: +- Create overshoot (solution arrives too late) +- Enable oscillation (system bounces past equilibrium) +- Hide causality (cause and effect separated in time) + +**Impact on loops**: +- Balancing loop with long delay → Oscillates around target +- Reinforcing loop with long delay → Problem invisible until crisis + +**Example**: +``` +Hiring → ||4 months|| → Team Capacity → (+) → Features → (+) → Revenue + +By the time new hires are productive (4 months), the market has changed. +Decision made in Q1 affects outcomes in Q2 - causality is hidden. +``` + + +### Step 6: Validate Your Diagram (Checklist) + +**Before presenting any CLD, check these items**: + +✅ **All variables are states** (nouns), not actions (verbs)? +- "Investment Level" ✓ not "Investing" ✗ + +✅ **All variables are measurable**? +- Can you track this over time? +- "Quality" is vague → "Bug Density" or "Test Coverage %" ✓ + +✅ **All links are truly causal**? +- Is there a MECHANISM connecting A to B? +- Not just correlation or "feels related" + +✅ **Polarities tested both directions**? +- If A ↑ → B? AND If A ↓ → B? +- Both tests give consistent polarity? + +✅ **Loops correctly identified**? +- Counted opposite links? +- Even count = R, Odd count = B? + +✅ **Delays marked where significant**? +- Delay > 20% of cycle time? +- Marked on correct link? + +✅ **No bidirectional arrows**? +- Picked PRIMARY causal direction? +- (If truly bidirectional, it's two separate loops) + +✅ **Variables are independent concepts**? +- Not circular definitions (A defined by B, B defined by A) +- Each variable has clear meaning on its own + +✅ **Diagram is readable**? 
+- Can your audience follow the arrows? +- Variables clearly labeled? +- Loops labeled (R1, B1, etc.)? + +**If any check fails, FIX before presenting**. Polarity errors change diagnosis completely. + + +## Diagram Simplification Techniques + +**When diagram has >4-5 loops, it's too complex to communicate effectively.** + +### Technique 1: Split by Time Phase + +**Early stage** vs **Mature stage** dynamics: +- Draw two diagrams showing which loops dominate when +- Example: R1 (Growth) dominates months 0-12, B1 (Capacity limits) dominates months 12-24 + +### Technique 2: Split by Subsystem + +**Growth dynamics** vs **Sustainability dynamics**: +- One diagram: Customer acquisition loops +- Second diagram: Technical debt and capacity loops +- Third diagram: How they interact + +### Technique 3: Aggregate Variables + +**Combine related variables**: +- "Bug Backlog" + "Tech Debt" + "Code Complexity" → "Technical Health" +- Simplifies diagram, loses some detail +- Good for executive audiences + +### Technique 4: Hide Secondary Loops + +**Show only dominant loop(s)**: +- For initial presentation, show R1 (main driver) +- Add B1 (constraint) after audience grasps R1 +- Full diagram as appendix for detailed analysis + +### Technique 5: Progressive Disclosure + +**Build complexity layer by layer**: +- Slide 1: Show simplest loop (just 3-4 variables) +- Slide 2: Add balancing constraint +- Slide 3: Add delays and secondary loops +- Slide 4: Complete diagram + +**Decision rule**: If you can't explain the diagram in 90 seconds, it's too complex. Simplify. + + +## Audience Adaptation Templates + +### Template A: Technical Diagram (Engineers, Analysts) + +**Include**: +- All loops (R1, R2, B1, B2, etc.) 
+- Specific variable names ("Cyclomatic Complexity", "Code Coverage %") +- Delays marked precisely ("||4.2 weeks||") +- Leverage points annotated +- Integration with stock-flow model notes + +**Example variable names**: +- "Deployment Frequency" (releases/week) +- "Technical Debt" (story points) +- "Test Suite Runtime" (minutes) +- "Mean Time to Recovery" (hours) + +**Purpose**: Detailed analysis, finding leverage points, building interventions + + +### Template B: Executive Diagram (Board, C-Suite) + +**Include**: +- 1-2 dominant loops only +- Business-level variable names ("Customer Satisfaction", "Market Share") +- Delays in business terms ("||1 quarter||") +- Clear "what drives growth" and "what limits it" labels +- One-sentence insight per loop + +**Example variable names**: +- "Revenue Growth" +- "Product Value" +- "Customer Satisfaction" +- "Market Position" + +**Simplifications**: +- Aggregate technical details ("Complexity" instead of listing 5 types) +- Focus on strategic dynamics, not tactical +- Use analogies ("Vicious cycle", "Virtuous circle") + +**Purpose**: Strategic decision-making, resource allocation, communicate "why we're stuck" + + +### Template C: Workshop Diagram (Collaborative Teams) + +**Include**: +- Simple starting loop (draw live with participants) +- Add variables as team suggests them +- Test links together ("If A increases, what happens to B?") +- Build shared mental model interactively + +**Process**: +1. Start with key variable (e.g., "Customer Churn") +2. Ask: "What causes this?" +3. Draw links as team suggests +4. Trace back to original variable → Loop found! +5. 
Validate together + +**Purpose**: Alignment, shared understanding, buy-in for interventions + + +## Visual Layout Best Practices + +**ASCII/Text conventions**: +``` +Variable A --+--> Variable B (same direction +) +Variable C --o--> Variable D (opposite direction o) +Variable E --|delay|--> Variable F (with delay marking) +``` + +**Circular vs linear layout**: +- **Circular**: Good for showing single clear loop +- **Linear**: Good for showing cause → effect chains +- **Nested**: Good for showing multiple interacting loops + +**Minimize crossing arrows**: +- Hard to follow if arrows cross frequently +- Rearrange variables to reduce crossings +- Or split into multiple diagrams + +**Group related variables**: +- Cluster customer-related variables together +- Cluster technical variables together +- Cluster financial variables together +- Makes structure more obvious + +**Loop flow direction**: +- **Reinforcing loops**: Often drawn clockwise +- **Balancing loops**: Often drawn showing the goal/target +- No strict rule, just be consistent + +**Annotations**: +- Loop labels: (R1), (B1) near the loop +- Time constants: "Loop completes in 3 months" +- Leverage points: Mark with ⭐ or "HIGH LEVERAGE" +- Delays: ||time|| on the link + +**Color coding** (if not ASCII): +- Reinforcing loops: Red (danger/amplification) +- Balancing loops: Blue (stability/control) +- High-leverage points: Green or gold +- Delays: Orange or yellow markers + + +## Common Mistakes Catalog + +### 1. Confusing Symptoms with Root Causes + +❌ **Mistake**: "Problem: Slow releases. Cause: Slow releases." + +✅ **Fix**: Dig deeper. What CAUSES slow releases? Manual processes, testing bottlenecks, approval chains? + +**Test**: Can you intervene on this variable? If "fix slow releases" is the answer, you're describing the symptom, not the cause. + + +### 2. 
Mixing Actions and States + +❌ **Mistake**: "Refactoring" → "Code Quality" + +✅ **Fix**: "Refactoring Time Allocated" (state) → "Code Quality" + +**Rule**: If it's something you DO, it's an action. Convert to the LEVEL or RATE of doing it. + + +### 3. Wrong Polarity (Most Common!) + +❌ **Mistake**: "Budget Pressure → (+) → Automation Investment" + +**Reasoning**: "Pressure drives investment" + +**Reality**: Financial pressure causes CUTS, not increases + +✅ **Fix**: "Budget Pressure → (o) → Automation Investment" + +**Prevention**: ALWAYS test both directions (A↑ and A↓) + + +### 4. Missing Key Delays + +❌ **Mistake**: Draw link without delay: "Hire Engineers → Team Capacity" + +**Reality**: 3-6 month delay (recruiting + onboarding) + +✅ **Fix**: "Hire Engineers → ||4 months|| → Team Capacity" + +**Impact**: Without delay, you'll think hiring solves problems instantly. With delay, you see why solutions arrive too late. + + +### 5. Bidirectional Arrows + +❌ **Mistake**: Revenue ↔ Features (both directions) + +**Reality**: This creates confusion - which is the PRIMARY driver? + +✅ **Fix**: Pick dominant direction: Features → Revenue (features enable sales). The reverse is a separate loop through Budget → Hiring → Engineering → Features. + + +### 6. Vague Variables + +❌ **Mistake**: "Quality" (quality of what? measured how?) + +✅ **Fix**: "Code Quality (bug density)" or "Product Quality (NPS score)" + +**Test**: Can you measure this? If not, it's too vague. + + +### 7. Circular Definitions + +❌ **Mistake**: +- Variable A: "Developer Productivity" +- Variable B: "Features Shipped" +- Link: Productivity → Features + +**Problem**: Productivity IS features shipped - same thing! + +✅ **Fix**: Break into: "Developer Experience" (satisfaction, tools, focus time) → "Development Velocity" (story points/sprint) → "Features Shipped" + + +### 8. 
Ignoring Negative Consequences + +❌ **Mistake**: Only show positive loops (growth, success) + +✅ **Fix**: Add balancing loops showing limits, degradation, costs + +**Example**: Show growth loop R1, BUT ALSO show capacity limit B1, technical debt R2 (negative reinforcing), budget pressure B2. + +**Reality**: All systems have BOTH growth and limits. If you only show growth, diagram is incomplete. + + +### 9. Overcomplication + +❌ **Mistake**: Single diagram with 8 loops, 25 variables, impossible to follow + +✅ **Fix**: Split into multiple diagrams or simplify by aggregating variables + +**Rule of thumb**: If you can't explain it in 90 seconds, it's too complex. + + +### 10. Presenting Without Validation + +❌ **Mistake**: Draw diagram, immediately present to stakeholders, polarity error discovered during meeting + +✅ **Fix**: Run validation checklist (above) before any presentation + +**Result of skipping validation**: Wrong diagnosis → wrong intervention → problem persists or worsens + + +## Integration with Other Skills + +### Causal Loop + Archetypes + +**Use CLD to verify archetype diagnosis**: +1. Suspect "Fixes that Fail" pattern +2. Draw CLD to confirm structure: Quick fix → Symptom relief → Side effect → Problem returns worse +3. CLD validates or refutes archetype guess + +**Use archetype to simplify CLD**: +1. Draw complex CLD with multiple loops +2. Recognize archetype pattern (e.g., "Escalation") +3. Use archetype name as shorthand: "This is Escalation between Engineering and Product" +4. Leverage known interventions from archetype library + + +### Causal Loop + Stock-Flow + +**Workflow**: +1. **Start with CLD**: Explore structure, identify loops +2. **Identify key stocks**: Which variables accumulate? (Customers, Debt, Capacity) +3. **Build stock-flow model**: Quantify accumulation, equilibrium, time constants +4. 
**Return to CLD**: Communicate insights to stakeholders + +**Example**: +- CLD reveals: Technical Debt → Velocity → Pressure → Shortcuts → Debt (R loop) +- Stock-flow quantifies: Debt grows 15 points/sprint, reaches critical mass at 180 points, crisis in 12 sprints +- CLD communicates: "This is a vicious cycle that will crash us in 6 months unless we break it" + +**When to use which**: +- **CLD first**: Unknown problem, exploring dynamics +- **Stock-flow first**: Known problem, need numbers/timing +- **Both**: Complex problem needing analysis AND communication + + +### Causal Loop + Leverage Points + +**CLDs show WHERE to intervene**: +- **Loop structure** = Meadows' Level 10, 9, 8, 7 (structure) +- **Information flows** = Level 6 (what info affects decisions) +- **Rules** = Level 5 (policies that govern links) +- **Goals** = Level 3 (what loops optimize for) + +**Example**: +- CLD shows: Budget Pressure → (o) → Automation Investment (weak link, gets cut easily) +- Leverage Point (Level 5 - Rules): "Automation budget ring-fenced, immune to quarterly cuts" +- Intervention: Change rules to protect high-leverage investment from short-term pressure + +**High-leverage points in CLDs**: +- **Break reinforcing loops**: Interrupt vicious cycles +- **Strengthen balancing loops**: Enhance stabilizing feedback +- **Shorten delays**: Make feedback faster +- **Change goals**: Redefine what success means + + +## Decision Framework: Which Tool When? + +**Start here**: + +**Unknown problem, exploring dynamics** → Causal Loop Diagram +- "Why does this keep happening?" +- "What's driving this behavior?" + +**Familiar pattern, quick diagnosis** → System Archetypes +- "I've seen this before" +- Pattern matches known archetype +- Leverage standard interventions + +**Need specific numbers or timing** → Stock-Flow Model +- "When will we hit capacity?" +- "How many customers at equilibrium?" +- "How fast is debt growing?" 
+ +**Need to show change over time** → Behavior-Over-Time Graph +- "What will this look like in 6 months?" +- Compare scenarios (with intervention vs without) + +**Multiple stocks interacting** → Phase Diagram (advanced) +- Two stocks plotted against each other +- Shows equilibrium points, trajectories + +**Typical workflow**: +1. **CLD**: Explore structure, find loops → Identify archetype +2. **Archetype**: Apply known interventions → Choose strategy +3. **Stock-Flow**: Quantify impact → Validate timing and magnitude +4. **BOT Graph**: Show predicted future → Communicate to stakeholders +5. **CLD** (again): Present structure and recommendation + + +## Real-World Example Patterns + +### Pattern 1: "Fixes That Fail" Structure + +``` +Problem Symptom + ↓ (o) +Quick Fix Applied + ↓ (+) +Symptom Relief (SHORT TERM) + ↓ (+) +Unintended Consequence + ↓ (+) +Problem Symptom (LONG TERM, WORSE) + +Example: Hire more engineers (fix) → Lower quality (consequence) → More bugs → More pressure → Hire more (makes it worse) +``` + +**CLD insight**: Quick fix creates balancing loop (symptom relief), BUT also creates reinforcing loop (side effects worsen root cause). The reinforcing loop dominates long-term. + + +### Pattern 2: "Escalation" Structure + +``` +Party A's Actions + ↓ (+) +Party B's Perceived Threat + ↓ (+) +Party B's Actions + ↓ (+) +Party A's Perceived Threat + ↓ (+) +Party A's Actions (cycle repeats) + +Example: Engineering cuts corners → Product demands faster delivery → Engineering cuts more corners → Product demands even faster → Escalation +``` + +**CLD insight**: Two reinforcing loops feeding each other. Each side's response amplifies the other's reaction. No natural limit (balancing loop absent). 
+ + +### Pattern 3: "Growth and Underinvestment" + +``` +R1: GROWTH ENGINE +Performance → Demand → Resources → Investment → Capacity → Performance + +B1: CAPACITY CONSTRAINT +Demand → Load on Capacity → Performance Degradation → Demand + +Gap: Investment should match growth, but often lags (underinvestment) +Result: B1 eventually overpowers R1, growth stalls +``` + +**CLD insight**: Growth creates need for capacity investment. If investment lags (due to short-term focus), performance degrades, limiting growth. Self-fulfilling: "Growth slowed, we didn't need that investment" (but underinvestment CAUSED the slowdown). + + +## Red Flags: Rationalizations to Resist + +### "Everyone already knows this structure" + +**Reality**: Different people have different mental models. Drawing it aligns them. + +**Counter**: "Let's draw it to verify we agree. 5 minutes to draw, saves 2 hours of talking past each other." + +**Test**: Ask three people to describe the problem. If explanations differ, you NEED the diagram. + + +### "We don't have time for diagramming" + +**Reality**: Meeting starts in 1 hour, temptation to skip validation. + +**Counter**: +- 15 minutes to draw correctly > 2-hour confused debate +- Present wrong diagram → Wrong intervention → Weeks of wasted work + +**Test**: Can you afford to be wrong? If cost of error >$5K, take 15 minutes to validate. + + +### "I can explain this verbally" + +**Reality**: Verbal explanations fade, diagrams persist. Verbal misses feedback loops. + +**Counter**: +- Diagrams reveal structure that verbal descriptions miss +- Loops are invisible in linear narrative +- Diagram becomes shared reference for future discussions + +**Test**: Try explaining "R1 amplifies while B1 constrains until R2 dominates" verbally. Now show the diagram - which is clearer? + + +### "This diagram is close enough" + +**Reality**: Polarity error or missing loop changes diagnosis completely. 
+ +**Counter**: +- Wrong polarity = wrong loop type (R vs B) = wrong intervention +- "Close enough" in diagnosis → Completely wrong in prescription + +**Test**: Run validation checklist. Takes 3 minutes. If error found, diagram ISN'T close enough. + + +### "The problem is too simple to diagram" + +**Reality**: "Simple" problems often have hidden feedback loops. + +**Counter**: +- Simple problems with surprising persistence = Hidden loop +- If it's truly simple, diagram takes 5 minutes +- If diagram reveals complexity, it WASN'T simple + +**Test**: If problem was simple, it would be solved. Persistence suggests feedback loop - diagram it. + + +### "My audience won't understand diagrams" + +**Reality**: Audiences understand pictures better than equations or walls of text. + +**Counter**: +- Use executive template (simple, business language) +- Walk through diagram with them: "More customers → More revenue → More hiring" +- Diagrams are EASIER than verbal for many people (visual learners) + +**Test**: Try explaining multi-loop system verbally vs showing simplified CLD. Which leads to "aha!" moments faster? + + +### "I'll just sketch it quickly without validating" + +**Reality**: Quick sketch presented as analysis → Stakeholders trust it → Wrong intervention + +**Counter**: +- Polarity errors are EASY to make and HARD to spot without systematic check +- Validation checklist takes 3 minutes +- Presenting wrong structure has long-term consequences (months of wrong decisions) + +**Test**: How much time to fix wrong diagnosis and reverse bad intervention? Hours/weeks. How much time to validate before presenting? 3 minutes. Do the math. + + +## Summary + +**Causal loop diagrams** reveal the feedback structure driving system behavior: + +**Construction process** (step-by-step): +1. Identify variables (states, measurable, audience-appropriate names) +2. Map causal links (test mechanism, pick direction) +3. Assign polarities (double-test: A↑ and A↓) +4. 
Find loops (trace until return) +5. Identify loop types (count opposite links: even = R, odd = B) +6. Mark delays (where significant: D/R > 0.2) +7. Validate (checklist before presenting) +8. Simplify (for audience readability) + +**Error prevention**: +- Double-test polarities (prevents most common mistake) +- Validation checklist (catches errors before presentation) +- Common mistakes catalog (avoid known pitfalls) + +**Audience adaptation**: +- Technical: All loops, specific variables, detailed analysis +- Executive: 1-2 dominant loops, business language, strategic insight +- Workshop: Build together, simple starting point, progressive complexity + +**Integration**: +- CLD + Archetypes: Verify pattern, leverage known interventions +- CLD + Stock-Flow: Structure first, quantify second +- CLD + Leverage Points: Loops show where to intervene + +**Resist rationalizations**: +- "Everyone knows this" → Draw it to align mental models +- "No time" → 15 min now vs hours of confused debate +- "I can explain verbally" → Diagrams persist, reveal loops verbal misses +- "Close enough" → Polarity error = wrong diagnosis +- "Too simple" → Persistent "simple" problems have hidden loops +- "Audience won't understand" → Use executive template, walk through it + +**The discipline**: Build incrementally, test polarities twice, validate before presenting, simplify for audience. + +**The payoff**: Reveal feedback loops driving persistence, align stakeholder mental models, identify high-leverage intervention points, communicate system structure clearly. 
diff --git a/skills/using-systems-thinking/leverage-points-mastery.md b/skills/using-systems-thinking/leverage-points-mastery.md
new file mode 100644
index 0000000..6a73548
--- /dev/null
+++ b/skills/using-systems-thinking/leverage-points-mastery.md
@@ -0,0 +1,500 @@
+
+# Leverage Points Mastery
+
+## Overview
+
+**Most people intervene at the weakest points in a system because they're obvious and easy.** Donella Meadows identified 12 places to intervene in systems, ranked by leverage (power to change system behavior). The counterintuitive truth: **highest leverage points seem wrong, dangerous, or too soft** at first - yet they create the most fundamental change with least effort.
+
+**Core principle:** Small shifts at high leverage points beat massive efforts at low leverage points.
+
+**Required foundation:** Understanding of system structure (stocks, flows, feedback loops). See recognizing-system-patterns skill for basics.
+
+## The 12 Places to Intervene (Weakest to Strongest)
+
+### 12. Constants, Parameters, Numbers (WEAKEST)
+
+**What:** Changing quantities without changing structure (subsidies, taxes, standards, quotas, budget allocations, salaries, prices)
+
+**Why weak:** System structure stays intact; other forces adapt to offset your change
+
+**Software examples:**
+- Increasing server count without fixing query inefficiency
+- Raising salaries without addressing retention root causes
+- Adding engineers without improving development process
+- Setting code coverage targets without improving testing culture
+
+**When it works:** When structure is already optimal and you just need fine-tuning
+
+**When it fails:** When structure itself is the problem (most cases)
+
+
+### 11. 
Buffers (Size of Stabilizing Stocks) + +**What:** Reserve capacity that absorbs fluctuations and smooths variability + +**Why stronger:** Prevents cascade failures, buys time for adaptation, reduces brittleness + +**Software examples:** +- Connection pool size (absorbs traffic spikes) +- Retry queues with backoff (buffer failed requests) +- Feature flags (buffer risky deployments) +- Incident response team capacity (buffer for unexpected load) +- Cash runway (financial buffer for startups) + +**When it works:** When variability is the problem, not average load + +**When it fails:** When used to hide structural inefficiency instead of fixing it + +**Design principle:** Right-size buffers - too small = brittle, too large = inefficient and masks problems + + +### 10. Stock-and-Flow Structures (Physical Systems) + +**What:** The plumbing - who's connected to what, what can flow where, physical constraints + +**Why stronger:** Changes what's physically possible, not just incentivized + +**Software examples:** +- Microservices vs monolith (changes possible communication patterns) +- Database sharding (changes possible query patterns) +- Service mesh (changes how services can discover/communicate) +- Consolidating repositories (changes possible code reuse) +- Network topology (what can talk to what) + +**When it works:** When the current structure makes desired behavior impossible + +**When it fails:** When behavior issues, not capability issues, are the problem + +**Warning:** Expensive and slow to change; make sure higher leverage points won't work first + + +### 9. 
Delays (Length of Time Relative to Rate of Change) + +**What:** Time between action and consequence; how long feedback takes + +**Why stronger:** Delays determine stability - too long and you overshoot/oscillate + +**Software examples:** +- CI/CD pipeline speed (delay from code to production feedback) +- Monitoring alert latency (delay from problem to notification) +- Onboarding duration (delay from hire to productivity) +- Release cycles (delay from idea to user feedback) +- Code review turnaround (delay in feedback loop) + +**When it works:** Shortening delays in negative feedback loops improves stability + +**When it fails:** Shortening delays in positive (reinforcing) loops accelerates problems + +**Critical insight:** Not all delays are bad - some stabilize systems. Diagnose which loop you're in first. + + +### 8. Balancing Feedback Loops (Strength of Negative Feedback) + +**What:** Mechanisms that bring system back toward target (error-correction, stabilization) + +**Why stronger:** Determines how fast the system self-corrects + +**Software examples:** +- Automated rollback on error rate spike (fast correction) +- Auto-scaling based on load metrics (correction strength) +- Test failures blocking deployment (correction mechanism) +- Pre-commit hooks preventing bad code (early correction) +- Rate limiters preventing overload (protection mechanism) + +**When it works:** When you want stability and error-correction + +**When it fails:** When balancing loop fights a reinforcing loop (you're treating symptoms) + +**Design principle:** Strengthen balancing loops that address root causes, not symptoms + + +### 7. 
Reinforcing Feedback Loops (Strength of Positive Feedback) + +**What:** Mechanisms that amplify change (growth, collapse, virtuous/vicious cycles) + +**Why stronger:** Determines rate of exponential growth or decline + +**Software examples:** +- Network effects (more users → more value → more users) +- Technical debt (debt → slower → pressure → shortcuts → more debt) +- Knowledge sharing (documentation → easier onboarding → more contributors → more docs) +- Code quality (good tests → confidence → refactoring → better design → easier testing) + +**When it works:** Amplify virtuous cycles, dampen vicious ones + +**When it fails:** When you amplify the wrong loop or can't identify which loop dominates + +**Critical skill:** Recognize which reinforcing loop you're in - this determines whether to amplify or dampen + + +### 6. Information Flows (Structure of Who Gets What Info When) + +**What:** Adding, removing, or changing availability of information; making visible what was invisible + +**Why stronger:** Can't respond to what you can't see; information changes behavior without forcing it + +**Software examples:** +- Real-time dashboards (make system state visible) +- Transparent incident reports company-wide (distribute awareness) +- Public API usage/costs (help users self-optimize) +- Test coverage visible to all (creates quality awareness) +- Tech debt made visible to product managers (enables informed trade-offs) +- Blameless post-mortems (share learning, not just outcomes) + +**When it works:** When people would do the right thing if they had the information + +**When it fails:** When incentives oppose desired behavior regardless of information + +**Why counterintuitive:** Seems passive ("just sharing info") but often more powerful than mandates + + +### 5. 
Rules (Incentives, Constraints, Feedback) + +**What:** Formal and informal rules determining scope, boundaries, permissions, consequences + +**Why stronger:** Changes what's rewarded/punished, allowed/forbidden + +**Software examples:** +- Deployment windows (constraint rules) +- Code review required before merge (process rules) +- On-call rotation (accountability rules) +- Blameless culture for incidents (incentive structure) +- "You build it, you run it" (ownership rules) +- Budget authority levels (decision rights) + +**When it works:** When structure and information exist but incentives misalign behavior + +**When it fails:** When rules are gamed, or structure makes compliance impossible + +**Common mistake:** Adding rules to fix problems caused by misaligned goals or bad information + + +### 4. Self-Organization (Power to Add/Change System Structure) + +**What:** System's ability to evolve its own structure, learn, diversify, complexify + +**Why stronger:** System can adapt to unforeseen circumstances without external intervention + +**Software examples:** +- Evolutionary architecture (system can reshape itself) +- Engineer-driven RFC process (system can propose its own changes) +- Hackathons and innovation time (system experiments with new structures) +- Open source contributions (system attracts external evolution) +- Autonomous teams with decision authority (system components self-optimize) +- Automated refactoring tools (code structure self-improves) + +**When it works:** In complex, changing environments where central planning fails + +**When it fails:** When self-organization optimizes locally at expense of global optimum + +**How to enable:** Create conditions for experimentation, learning, and bounded autonomy + + +### 3. 
Goals (Purpose or Function of the System) + +**What:** The explicit objective the system is designed to achieve + +**Why stronger:** Everything else serves the goal; change goal, everything changes + +**Software examples:** +- "Prevent all incidents" → "Learn from every incident" (changes entire security posture) +- "Ship features fast" → "Maintain sustainable pace" (changes quality/velocity trade-offs) +- "Maximize uptime" → "Maximize learning velocity" (changes risk tolerance) +- "Minimize costs" → "Maximize customer value" (changes architecture decisions) +- "Individual performance" → "Team outcomes" (changes collaboration patterns) + +**When it works:** When current goal creates perverse incentives or misses the real purpose + +**When it fails:** When goals change but structure/rules/information stay aligned to old goal + +**Why counterintuitive:** Seems abstract or "soft" but fundamentally reorients the entire system + + +### 2. Paradigms (Mindset, Model, or Perception of the System) + +**What:** The mental model, shared assumptions, or worldview that gives rise to goals and structures + +**Why stronger:** Changes how we see the system, which changes everything we do + +**Software examples:** +- "Engineers as resources" → "Engineers as investors" (changes retention approach) +- "Bugs are failures" → "Bugs are learning opportunities" (changes quality culture) +- "Requests are tasks" → "Requests are relationships" (changes API design) +- "Code is liability" → "Code is asset" (changes deletion vs preservation) +- "Users consume features" → "Users solve problems" (changes product thinking) +- "Synchronous by default" → "Async by default" (changes entire architecture) + +**When it works:** When system can't reach desired state because mental model constrains thinking + +**When it fails:** When paradigm shifts without organizational readiness (resistance, confusion) + +**How to shift:** Question assumptions, study systems that work differently, name current paradigm 
explicitly + + +### 1. Transcending Paradigms (STRONGEST) + +**What:** Ability to step outside any paradigm, hold multiple paradigms, recognize all paradigms as provisional + +**Why strongest:** Not attached to any one way of seeing; can choose appropriate paradigm for context + +**Software examples:** +- Recognizing "all models are wrong but some are useful" (doesn't cling to one approach) +- Polyglot programming (uses paradigm appropriate to problem) +- "Strong opinions, weakly held" (updates worldview with new evidence) +- Switching between optimizing for different constraints (speed/cost/quality) based on context +- Recognizing trade-offs as fundamental, not problems to eliminate + +**When it works:** In environments requiring navigation of multiple conflicting paradigms + +**When it fails:** Can seem wishy-washy or uncommitted if not grounded in principles + +**How to practice:** Study diverse systems, question your own assumptions, practice "Yes, AND" thinking + + +## Why This Order? The Underlying Theory + +**Counterintuitive principle:** Higher leverage points are **more abstract, slower-changing, and harder to see** - yet they control everything below them. 
+ +### The Hierarchy of Influence + +``` +Paradigm (how we see reality) + ↓ determines +Goals (what we optimize for) + ↓ determines +Self-organization (how system evolves) + ↓ determines +Rules (what's rewarded/punished) + ↓ determines +Information flows (what's visible) + ↓ determines +Feedback loops (what's amplified/dampened) + ↓ determines +Delays (system responsiveness) + ↓ determines +Structure (what's physically possible) + ↓ determines +Buffers (how much variability is tolerated) + ↓ determines +Parameters (the actual numbers) +``` + +**Why parameters are weak:** Changing a number doesn't change the structure generating the problem + +**Why paradigms are strong:** Changing how you see the system changes which goals you pursue, which rules you create, which information you share, and ultimately which parameters you adjust + +### The Resistance Principle + +**Leverage is inversely proportional to ease:** +- Parameters: Easy to change, little resistance, little impact +- Rules: Harder to change, some resistance, moderate impact +- Goals: Hard to change, strong resistance, large impact +- Paradigms: Very hard to change, massive resistance, fundamental impact + +**Why high leverage feels wrong:** You're challenging deeply held assumptions and threatening existing power structures. + + +## Quick Identification: What Level Are You At? + +| If your solution... | You're likely at level... 
| +|---------------------|---------------------------| +| Adjusts a number, budget, quantity | 12 (Parameters) | +| Adds capacity, reserves, slack | 11 (Buffers) | +| Redesigns architecture, topology | 10 (Structure) | +| Speeds up or slows down a process | 9 (Delays) | +| Adds monitoring, alerts, auto-scaling | 8 (Balancing loops) | +| Amplifies network effects or growth | 7 (Reinforcing loops) | +| Makes something visible, adds transparency | 6 (Information) | +| Changes policies, mandates, incentives | 5 (Rules) | +| Enables teams to self-organize, experiment | 4 (Self-organization) | +| Redefines what success means | 3 (Goals) | +| Changes fundamental assumptions | 2 (Paradigm) | +| Questions whether the problem is real | 1 (Transcending) | + +**Red flag:** If your first 3 solutions are levels 12-10, you're stuck in "parameter tweaking" mode + + +## Generating Higher-Leverage Alternatives + +**Heuristic: Ask "Why?" three times, then intervene there** + +Example: "We need more servers" +- Why? Because response time is slow +- Why is response time slow? Because we have 20 serial service calls +- Why do we have 20 serial calls? Because we designed for strong consistency everywhere +- **Intervention:** Question paradigm of "sync by default" → move to async/eventual consistency (Level 2) + +**Heuristic: Move up the hierarchy systematically** + +For any proposed solution at level N, ask: +- Level N+1: "What rule/incentive would make this parameter self-adjust?" +- Level N+2: "What information would make people want this outcome?" +- Level N+3: "What goal would make this rule unnecessary?" +- Level N+4: "What paradigm shift would make this goal obvious?" 
+ +**Example: "Raise salaries to retain engineers" (Level 12)** +- Level 11: Add buffer (retention bonuses, unvested stock) +- Level 10: Change structure (career paths, project diversity) +- Level 9: Speed feedback (monthly check-ins vs annual reviews) +- Level 6: Add information (transparent growth paths, impact visibility) +- Level 5: Change rules (promotion criteria value mentorship) +- Level 3: Change goal ("Retain engineers" → "Be worth staying for") +- Level 2: Change paradigm ("Engineers as resources" → "Engineers as investors") + + +## Risks and Prerequisites by Level + +### Low Leverage (12-10): Low Risk, Low Reward +**Risk:** Wasted effort, treats symptoms +**Prerequisites:** None, safe to experiment +**When to use:** Quick wins to buy time for deeper fixes + +### Medium Leverage (9-7): Moderate Risk and Reward +**Risk:** Unintended consequences if feedback loops misunderstood +**Prerequisites:** Map system structure first +**When to use:** When structure is sound but dynamics are problematic + +### High Leverage (6-5): High Reward, Moderate-High Risk +**Risk:** Gaming, resistance, backfire if incentives misaligned +**Prerequisites:** +- Leadership buy-in for information transparency +- Understand current incentives and power structures +**When to use:** When structure is right but behavior is wrong + +### Highest Leverage (4-1): Highest Reward, Highest Risk +**Risk:** Massive resistance, confusion, destabilization during transition +**Prerequisites:** +- Psychological safety (especially for goal/paradigm shifts) +- Organizational readiness for fundamental change +- Clear communication of "why" and "how" +- Patience for long time horizons (6-18 months) + +**When to use:** When lower leverage points have failed repeatedly, or starting fresh + +**Critical warning:** Don't shift paradigms or goals under extreme time pressure - you'll get compliance without commitment, and revert as soon as pressure eases. 
+ + +## Red Flags - Rationalizations for Avoiding High Leverage + +If you catch yourself saying ANY of these, you're optimizing for ease over impact: + +| Rationalization | Reality | Response | +|-----------------|---------|----------| +| "Too urgent for high-leverage thinking" | Urgency is exactly when leverage matters most | Use parameters tactically while addressing root cause | +| "High-leverage is too slow" | Low-leverage that fails is slower (months of firefighting) | Multi-level: immediate + high-leverage in parallel | +| "High-leverage is too risky" | Repeating failed low-leverage attempts is riskier | Assess prerequisites, mitigate risks, start with pilots | +| "I don't have authority for this" | Confusing authority with influence | Build case through information, demonstration, evidence | +| "Let's just do what we can control" | You're self-limiting your sphere of influence | Senior ICs can influence goals via information and pilots | +| "Leadership won't listen to this" | You haven't made the cost visible yet | Level 6 first (information), then propose change | +| "This is too academic for real world" | Systems thinking IS pragmatic - it fixes root causes | Show evidence from companies that solved similar problems | + +**The pattern:** Rationalizations always push toward low-leverage interventions because they feel safer and more controllable. Recognize this as a cognitive bias, not a valid reason. 
+ +## Common Mistakes + +### ❌ Parameter Tweaking Marathon + +**Symptom:** Adjusting numbers repeatedly without improvement + +**Why:** The structure generating the problem remains unchanged + +**Fix:** Map system structure, identify which feedback loop or rule is actually causing behavior + + +### ❌ High-Leverage Intervention Without Foundation + +**Symptom:** Changed goal/paradigm but nothing else changed + +**Example:** Announced "blameless culture" but still punish people for mistakes + +**Why:** Goals and paradigms need supporting information, rules, and structure + +**Fix:** Work down from high-leverage point - align rules, information, and structure to new goal + + +### ❌ Ignoring Resistance as Signal + +**Symptom:** People resist high-leverage change, so you double down with mandates + +**Why:** Resistance often indicates misaligned incentives or missing prerequisites + +**Fix:** Listen to resistance, identify what needs to change first (usually rules or information) + + +### ❌ Confusing Effectiveness with Feasibility + +**Symptom:** "Changing paradigm is too hard, let's just adjust parameters" + +**Why:** You've optimized for ease, not impact + +**Fix:** Be honest - are you avoiding high-leverage because it's hard, or because it's genuinely wrong? 
+ + +### ❌ One-Level Thinking + +**Symptom:** All your solutions at same level (usually parameters or rules) + +**Why:** Stuck in habitual mode of thinking + +**Fix:** Force yourself to generate one solution at each level before choosing + + +## Real-World Impact + +**Example: Reducing Deployment Risk** + +| Level | Intervention | Result | +|-------|--------------|--------| +| 12 (Parameters) | Require 3 approvers instead of 2 | Slower deploys, same risk | +| 10 (Structure) | Add staging environment | Catches some issues, adds delay | +| 9 (Delays) | Faster CI/CD | Faster feedback, same quality | +| 8 (Balancing) | Automated rollback on errors | Limits blast radius | +| 7 (Reinforcing) | Feature flags enable gradual rollout | Compounds learning | +| 6 (Information) | Real-time impact metrics visible | Teams self-correct faster | +| 5 (Rules) | Deploy on-call engineer's code first | Aligns incentives with quality | +| 4 (Self-org) | Teams choose deploy frequency | Adapts to team maturity | +| 3 (Goals) | "Maximize learning velocity" → "Sustainable pace" | Changes risk tolerance | +| 2 (Paradigm) | "Deploys are risky" → "Deploys are learning" | Fundamental reframe | + +**Outcome:** Level 2 change (paradigm) with Level 6 (information) and Level 5 (rules) support achieved 10x deploy frequency with 50% fewer incidents. Parameter tweaking (Level 12) would have achieved nothing. + + +## When Lower Leverage Is Actually Right + +**Paradox:** Sometimes parameters ARE the right intervention. 
+ +**When to use low-leverage points:** +- **Emergency situations:** Parameters are fastest (add servers NOW to handle load spike) +- **Well-designed systems:** Structure is already optimal, just needs tuning +- **Experimentation:** Cheap to test parameters before committing to structural changes +- **Buying time:** Quick parameter fix creates space to work on high-leverage changes +- **Constraint satisfaction:** You must hit a number (compliance, SLA) regardless of philosophy + +**Key distinction:** Using parameters **tactically** (temporary, buying time) vs **strategically** (thinking it's the real solution) + + +## Integration with Other Patterns + +**Leverage points + System Archetypes:** +- Archetypes reveal WHICH leverage point to target +- "Fixes that Fail" → intervention is too low-leverage +- "Shifting the Burden" → symptom relief (low leverage) prevents root cause fix (high leverage) + +**Leverage points + Unintended Consequences:** +- Higher leverage = more widespread effects +- Always trace 2nd/3rd order effects for levels 4-1 + +**Leverage points + Pre-mortem:** +- "Our high-leverage intervention failed spectacularly. Why?" +- Usually: Didn't align supporting levels, or lacked prerequisites + + +## The Bottom Line + +**Most people solve problems at level 12 (parameters).** It's obvious, feels productive, and rarely works. + +**Systems thinkers intervene at levels 6-3** (information, rules, goals). It's counterintuitive, seems too soft or abstract, and transforms systems. + +**The skill:** Recognize what level you're at, generate alternatives at higher levels, choose based on leverage vs. readiness, then align all supporting levels. + +**The discipline:** Resist the urge to tweak parameters when structure is the problem. 
diff --git a/skills/using-systems-thinking/recognizing-system-patterns.md b/skills/using-systems-thinking/recognizing-system-patterns.md new file mode 100644 index 0000000..ffd02de --- /dev/null +++ b/skills/using-systems-thinking/recognizing-system-patterns.md @@ -0,0 +1,226 @@ + +# Recognizing System Patterns + +## Overview + +**Systems thinking reveals invisible structures causing visible behaviors.** Most problems arise from system structure (feedback loops, delays, stocks/flows) rather than external forces or individual actions. Interventions failing despite good intentions signal systemic causes. + +## When to Use + +```dot +digraph when { + problem [label="Problem to solve" shape=box]; + recurring [label="Same issue\nkeeps returning?" shape=diamond]; + fix_failed [label="Fix worked\nthen stopped?" shape=diamond]; + unintended [label="Solutions create\nnew problems?" shape=diamond]; + complex [label="Many interconnected\nfactors?" shape=diamond]; + + apply [label="Apply systems thinking" shape=box style=filled fillcolor=lightgreen]; + local [label="Likely local/isolated issue" shape=box]; + + problem -> recurring; + recurring -> fix_failed [label="yes"]; + recurring -> local [label="no"]; + fix_failed -> unintended [label="yes"]; + fix_failed -> local [label="no"]; + unintended -> complex [label="yes"]; + unintended -> local [label="no"]; + complex -> apply [label="yes"]; + complex -> local [label="no"]; +} +``` + +**Use when you see:** +- "It helped for a while, then got worse" (reinforcing loop) +- "We keep fighting the same problem" (symptom treatment) +- "The obvious solution made it worse" (unintended consequences) +- "Everything affects everything" (interconnected system) +- "No single cause" (emergent behavior) + +**Don't use for:** +- Simple cause-and-effect problems +- Isolated technical bugs +- Problems with clear external causes + +## System Archetypes (Pattern Matching) + +Recognizing common structures saves analysis time: + +| Archetype | Symptom 
| Structure | Example | +|-----------|---------|-----------|---------| +| **Fixes that Fail** | Solution works temporarily, problem returns | Reinforcing loop eventually dominates | Database indexes help, then data growth overwhelms them | +| **Shifting the Burden** | Symptom relief prevents real solution | Quick fix reduces pressure to solve root cause | Quality team fixes bugs so devs never improve | +| **Accidental Adversaries** | Well-intentioned actions hurt each other | Each side's solution worsens other's problem | API rate limits → users create multiple accounts → stricter limits | +| **Escalation** | Both sides increase efforts, making it worse | Reinforcing competition loop | Tech debt → pressure to go faster → more debt | +| **Tragedy of the Commons** | Individual optimization degrades shared resource | Many users, one depleting resource | Every team adds database queries, DB slows for all | + +**When you recognize an archetype:** Jump directly to known leverage points for that pattern. + +## Quick Reference: Analysis Checklist + +When facing a complex problem: + +- [ ] **Map causal structure** - What causes what? Draw the loops +- [ ] **Identify stocks and flows** - What accumulates? What's the rate of change? +- [ ] **Find feedback loops** - Reinforcing (amplifying)? Balancing (stabilizing)? +- [ ] **Spot delays** - How long between cause and effect? +- [ ] **Check for archetypes** - Does this match a common pattern? +- [ ] **Trace 2nd/3rd order effects** - Then what happens? And then? +- [ ] **Find leverage points** - Where does small effort create large change? 
+
## Causal Loop Diagrams

**Simple notation for showing structure:**

```
[A] --+--> [B] "A increases B" (same direction)
[A] ----> [B]

[A] ---o> [C] "A decreases C" (opposite direction)

[B] --+--> [D] --+--> [B] Reinforcing loop (R)
 (more B → more D → more B)

[E] --+--> [F] ---o> [E] Balancing loop (B)
 (more E → more F → less E)
```

**Example: Code Quality Decline**

```
Technical Debt --+--> Time to Add Features ---o> Feature Velocity
 ^ |
 | |
 +------------o---------------------------------+
 (Pressure to Ship Faster)

 R: ESCALATION LOOP
```

This shows: Low velocity → pressure → cut corners → more debt → slower velocity → more pressure (reinforcing)

**Leverage point:** Break the loop by making debt VISIBLE and protecting time for reduction (see Meadows' "Information flows" leverage point).

## Stocks, Flows, and Delays

**Stocks** = What accumulates (technical debt, data size, user trust)
**Flows** = Rate of change (bug creation rate, data growth rate, churn rate)
**Delays** = Time between action and result

**Why this matters:**
- Stocks can't change instantly (you can't fix all tech debt today)
- Flows determine stock direction (reduce bug creation rate > fix existing bugs)
- Delays hide consequences (hire now, onboarding overhead hits in 3 months)

**Example: Performance Problem**

```
Stock: Total Data in Database
Inflow: Records Added per Day (growing)
Outflow: Records Archived per Day (zero)
Result: Database size grows unbounded → performance degrades

Leverage: Implement outflow (archival strategy)
```

## Leverage Points (Meadows' Hierarchy)

**Where to intervene** (most to least effective):

1. **Change paradigms** - How do we think about this?
2. **Change system goals** - What is the system trying to do?
3. **Add/change information flows** - Who knows what, when?
4. **Change rules** - Incentives, constraints, feedback
5.
**Change structure** - Physical/organizational relationships +6. **Adjust flows** - Rates of change +7. **Adjust parameters** - Numbers in the system (least effective) + +**Most people start at #7 (parameters) - least effective!** + +**Example Application:** + +Problem: API making too many requests + +| Level | Intervention | Effectiveness | +|-------|--------------|---------------| +| Parameter | Set rate limit to 100/hour | Low - treats symptom | +| Flow | Add caching to reduce request rate | Medium | +| Structure | Add webhooks so clients don't need to poll | High | +| Information | Show users their call patterns/costs | High | +| Rules | Charge per API call above threshold | High | +| Paradigm | Rethink: "API is request-response" → "API is event-driven" | Highest | + +## Predicting Unintended Consequences + +**Ask three levels deep:** + +1. **First order**: What happens immediately? +2. **Second order**: Then how do people change behavior? +3. **Third order**: Then what else changes? + +**Example: Strict Code Review Requirements** + +| Order | Effect | +|-------|--------| +| 1st | Code quality improves, fewer bugs | +| 2nd | Developers split PRs smaller to get faster reviews | +| 3rd | PRs become too small to understand context, review quality drops | + +**Technique: Pre-mortem** +"It's 6 months from now. Our solution failed spectacularly. Why?" 
+ +## Red Flags - When to STOP and Apply Systems Thinking + +If you catch yourself saying or thinking ANY of these, STOP immediately and map the system: + +| Rationalization | Reality | What to Do | +|-----------------|---------|------------| +| "Too simple for systems thinking" | Simple-seeming problems often have systemic roots | Spend 5 minutes checking for archetypes | +| "I already know the answer" | Expertise creates blind spots to structure | Map one causal loop to verify | +| "No time for analysis" | Fast wrong action wastes more time than analysis | 10-minute system check prevents hours of rework | +| "Boss wants solution X" | Authority doesn't override system structure | Show data: "X addresses symptom, Y addresses cause" | +| "The obvious solution" | Obvious solutions often treat parameters not structure | Check leverage point level before implementing | +| "This worked before" | Systems change; yesterday's solution may worsen today's problem | Verify: What's different now? | +| "We just need to ship something" | Shipping wrong fix loses more time/trust than delay | Propose: "15 min diagnosis, then ship correct fix" | + +**When under time pressure, systems thinking becomes MORE critical, not less.** + +Quick wrong action compounds problems. Spending 10 minutes mapping the system often reveals 30-minute fixes that would have been overlooked in favor of 2-week wrong solutions. + +## Common Mistakes + +### ❌ Treating Symptoms Instead of Structure + +**Symptom:** "Database is slow" +**Symptom treatment:** Add indexes, more RAM, faster hardware +**Structural fix:** Why is data growing unbounded? 
Add archival, change query patterns + +### ❌ Optimizing Parts, Not System + +**Example:** Each team optimizes their service latency, but system latency increases due to more service-to-service calls + +### ❌ Missing Delays + +**Example:** Hiring looks great month 1 (more hands), terrible months 2-4 (onboarding burden), good month 6+ (productivity) + +### ❌ Fighting Feedback Loops + +**Example:** More pressure to go fast → lower quality → slower velocity → more pressure (reinforcing). You can't win by "trying harder" - must break the loop + +### ❌ Solving Problems at Wrong Level + +**Example:** Adjusting rate limit numbers (parameter) instead of adding webhooks (structure change) + +## Real-World Impact + +**Performance debugging:** Recognizing "unbounded growth" pattern (stock with inflow, no outflow) immediately points to archival strategies instead of hardware scaling. + +**Team dynamics:** Seeing "Shifting the Burden" archetype (quality team) reveals why feature quality never improves - the quick fix prevents real solution. + +**Architecture decisions:** Using leverage point hierarchy shows why "add caching" (flow adjustment) is less effective than "add webhooks" (structure change). + +## Related Patterns + +- **Iceberg Model**: Events (what happened) → Patterns (trends) → Structure (system dynamics) → Mental Models (beliefs/assumptions) +- **Feedback dominance**: Systems shift which loop dominates over time +- **Emergence**: System behavior not predictable from individual parts diff --git a/skills/using-systems-thinking/stocks-and-flows-modeling.md b/skills/using-systems-thinking/stocks-and-flows-modeling.md new file mode 100644 index 0000000..4af9640 --- /dev/null +++ b/skills/using-systems-thinking/stocks-and-flows-modeling.md @@ -0,0 +1,1251 @@ + +# Stocks and Flows Modeling + +## When to Use This Skill + +Use stocks-and-flows modeling when: +- **Predicting future states**: "How many customers will we have in 6 months?" 
+- **Finding equilibrium**: "At what backlog size does the queue stabilize?" +- **Analyzing delays**: "Why does auto-scaling overshoot?" +- **Quantifying accumulation**: "How fast does technical debt grow?" +- **Validating intuition**: "Will doubling capacity solve this?" +- **Making decisions with cost of error**: Production incidents, capacity planning, resource allocation + +Skip quantitative modeling when: +- System is very simple (single stock, obvious dynamics) +- Exploratory thinking (just brainstorming archetypes) +- No one will act on precise numbers +- Parameters are completely unknown (no way to estimate) + +**Key insight**: Most management mistakes come from confusing stocks with flows. This skill provides frameworks to avoid that trap. + + +## Fundamentals: Stocks vs Flows + +### Definition + +**Stock**: A quantity that accumulates over time. You can measure it at a single instant. +- Examples: Bug count, cache entries, customers, technical debt, memory used, inventory +- Units: Things (customers, bugs, GB, etc.) +- Test: "How many X do we have RIGHT NOW?" → If answerable, it's a stock + +**Flow**: A rate of change per unit time. It's an action happening continuously. +- Examples: Bug arrival rate, churn rate, requests/sec, memory leak rate +- Units: Things per time (customers/month, bugs/week, MB/sec) +- Test: "How fast is X changing?" → If that's the question, it's a flow + +**Derived metric**: Neither stock nor flow, but calculated from them. 
+- Examples: Cache hit rate (hits/requests), utilization (used/capacity), velocity (story points/sprint) +- These are ratios or percentages, not accumulations + +### The Bathtub Metaphor + +``` + INFLOW (faucet) + ↓ + ┌─────────────────────┐ + │ │ ← STOCK (water level) + │ ~~~~~~~~~~~~~~~ │ + │ │ + └──────────┬──────────┘ + ↓ + OUTFLOW (drain) +``` + +**Stock changes by**: Inflow - Outflow +- If Inflow > Outflow: Stock rises +- If Inflow < Outflow: Stock falls +- If Inflow = Outflow: Equilibrium (stock constant) + +**Why this matters**: You can't change the stock level instantly. You can only adjust the faucets and drains. The stock responds with a delay determined by flow rates. + +### Units Discipline + +**Iron rule**: Check dimensional consistency in every equation. + +``` +CORRECT: + ΔCustomers = (150 customers/month) - (0.05 × customers × 1/month) + Units: customers = customers/month × month ✓ + +WRONG: + ΔRevenue = Customers + Churn + Units: $/month ≠ customers + customers/month ✗ +``` + +**Practice**: Write units next to every number. If units don't match across an equation, you've made a conceptual error. 
+ + +## Formal Notation + +### Basic Stock-Flow Equation + +**Discrete time** (month-by-month, day-by-day): +``` +S(t+1) = S(t) + Δt × (Inflow - Outflow) + +Where: + S(t) = Stock at time t + Inflow = Rate coming in (units/time) + Outflow = Rate going out (units/time) + Δt = Time step (usually 1 if you match units) +``` + +**Example - Bug Backlog**: +``` +Backlog(tomorrow) = Backlog(today) + (Bugs reported) - (Bugs fixed) +B(t+1) = B(t) + R - F + +If R = 40 bugs/day, F = 25 bugs/day, B(0) = 100: + B(1) = 100 + 40 - 25 = 115 bugs + B(2) = 115 + 40 - 25 = 130 bugs + B(3) = 130 + 40 - 25 = 145 bugs +``` + +### Flows Depending on Stocks + +Often flows aren't constant—they depend on stock levels: + +``` +Outflow = Rate × Stock + +Examples: + Churn = 0.05/month × Customers + Cache evictions = New entries (only when cache is full) + Bug fix rate = Engineer capacity × (Bugs / Bugs per engineer-day) +``` + +**Bug backlog with stock-dependent fixing**: +``` +F = min(Team_capacity, 0.5 × B) ← More bugs → faster fixing (to a limit) + +If B is small: Team isn't working at capacity +If B is large: Team is saturated at max throughput +``` + +### Multi-Stock Systems + +When stocks transfer between states: + +``` +BASIC CUSTOMERS (B): + ΔB = +Acquisitions - Upgrades + Downgrades - Churn_B + +PREMIUM CUSTOMERS (P): + ΔP = +Upgrades - Downgrades - Churn_P + +Note: Upgrades leave B and enter P (transfer flow) + Acquisitions only enter B (source flow) + Churn leaves system entirely (sink flow) +``` + +**Template for multi-stock**: +``` +Stock_A(t+1) = Stock_A(t) + Sources_A + Transfers_to_A - Transfers_from_A - Sinks_A +Stock_B(t+1) = Stock_B(t) + Sources_B + Transfers_to_B - Transfers_from_B - Sinks_B +``` + + +## Stock vs Flow Identification + +**Decision tree**: + +1. **Can you measure it at a single instant without reference to time?** + - YES → It's a stock (or derived metric) + - NO → It's a flow + +2. 
**If YES, does it accumulate based on past activity?** + - YES → Stock (customers accumulate from past acquisitions) + - NO → Derived metric (hit rate = hits/requests right now) + +3. **What are the units?** + - Things (GB, customers, bugs) → Stock + - Things/time (GB/sec, customers/month) → Flow + - Dimensionless (%, ratio) → Derived metric + +**Common ambiguities**: + +| Concept | Stock or Flow? | Why | +|---------|---------------|-----| +| **Technical debt** | Stock | Accumulates over time, measured in "story points of debt" | +| **Debt accumulation** | Flow | Rate at which debt is added (points/sprint) | +| **Velocity** | Derived metric | Story points/sprint (ratio of two flows) | +| **Morale** | Stock | Current team morale level (1-10 scale at instant) | +| **Morale erosion** | Flow | Rate of morale decline (points/month) | +| **Cache hit rate** | Derived metric | Hits/Requests (ratio, not accumulation) | +| **Response time** | Derived metric | Total time / Requests (average at instant) | +| **Bug count** | Stock | Number of open bugs right now | +| **Bug arrival rate** | Flow | New bugs per week | + +**Red flag**: If you're tempted to say "we need more velocity", stop. You can't "have" velocity—it's a measurement of throughput. You need more **throughput capacity** (stock: engineer hours) or better **process efficiency** (affects flow rate). + + +## When to Model Quantitatively + +### Decision Criteria + +**Build a quantitative model when**: + +1. **Equilibrium is non-obvious** + - "Will the queue ever stabilize?" + - Multi-stock systems with transfers (churn + upgrades + downgrades) + - Need to know: "At what size?" + +2. **Delays are significant** + - Delay > 50% of desired response time → Danger zone + - Auto-scaling with 4-minute cold start for 5-minute traffic spike + - Information travels slower than problem evolves + +3. 
**Non-linear relationships** + - Performance cliffs (CPU 80% → 95% causes 10× slowdown) + - Network effects (value per user increases with user count) + - Saturation (hiring more doesn't help past some point) + +4. **Cost of error is high** + - Production capacity planning + - Financial projections + - SLA compliance decisions + - Cost: "If we're wrong, we lose $X or reputation" + +5. **Intuition conflicts** + - Team disagrees on what will happen + - "Common sense" says one thing, someone suspects otherwise + - Model adjudicates + +6. **Validation needed** + - Need to convince stakeholders with numbers + - Compliance or audit requirement + - Building confidence before expensive commitment + +**Stay qualitative when**: +- Brainstorming phase (exploring problem space) +- System is trivial (one stock, constant flows, obvious outcome) +- Parameters are completely unknown (garbage in, garbage out) +- Decision won't change regardless of numbers +- Time to model > time to just try it + +**Rule of thumb**: If you're about to make a decision that takes >1 week to reverse and costs >$10K if wrong, spend 30 minutes building a spreadsheet model. + + +## Equilibrium Analysis + +### Finding Steady States + +**Equilibrium** = Stock levels where nothing changes (ΔS = 0) + +**Method**: +1. Write stock-flow equations +2. Set ΔS = 0 (no change) +3. Solve for stock levels algebraically + +**Example - Bug Backlog Equilibrium**: +``` +ΔB = R - F +Set ΔB = 0: + 0 = R - F + F = R + +If R = 40 bugs/day: + Equilibrium when F = 40 bugs/day + +If fixing rate depends on backlog: F = min(50, 0.5 × B) + 0 = 40 - 0.5 × B + B = 80 bugs ← Equilibrium backlog +``` + +**Interpretation**: System will settle at 80-bug backlog where team fixes 40/day. + +### Multi-Stock Equilibrium + +**SaaS customer example**: +``` +ΔB = 150 - 0.15×B + 0.08×P = 0 ... (1) +ΔP = 0.10×B - 0.13×P = 0 ... 
(2)

From (2): P = (0.10/0.13) × B = 0.769 × B

Substitute into (1):
 150 - 0.15×B + 0.08×(0.769×B) = 0
 150 = 0.15×B - 0.0615×B
 150 = 0.0885×B
 B = 1,695 customers
 P = 1,304 customers
 Total equilibrium = 2,999 customers
```

**Validation**:
- Check (Basic stock balance): inflows 150 acquisitions + 0.08 × 1,304 ≈ 254 = outflows 0.15 × 1,695 ≈ 254 exits ✓
- Sanity: Total grows from 1,000 → ~3,000 over ~18 months ✓

### Stable vs Unstable Equilibria

**Stable**: Perturbations decay back to equilibrium
- Bug backlog with stock-dependent fixing
- Customer base with constant churn %
- Cache at capacity (every new entry evicts old)

**Unstable**: Small perturbations grow exponentially
- Bug backlog where fixing gets SLOWER as backlog grows (team overwhelmed)
- Product with negative word-of-mouth (more users → worse experience → churn accelerates)
- Memory leak (usage grows unbounded)

**Test**:
- Increase stock slightly above equilibrium
- Do flows push it back down? → Stable
- Do flows push it further up? → Unstable (runaway)

**No equilibrium**:
- ΔS = constant > 0 → Unbounded growth (venture-backed startup in growth mode)
- ΔS = constant < 0 → Runaway collapse (company in death spiral)
- These systems don't have steady states, only trajectories


## Time Constants and Dynamics

### How Fast to Equilibrium?
+ +**Time constant (τ)**: Characteristic time for system to respond + +**For simple balancing loop**: +``` +τ = Stock_equilibrium / Outflow_rate + +Example - Filling cache: + Capacity: 1,000 entries + Miss rate: 8,000 unique requests/hour (when mostly empty) + τ = 1,000 / 8,000 = 0.125 hours = 7.5 minutes +``` + +**Exponential approach**: Stock approaches equilibrium like: +``` +S(t) = S_eq - (S_eq - S_0) × e^(-t/τ) + +Where: + S_eq = Equilibrium level + S_0 = Starting level + τ = Time constant +``` + +**Useful milestones**: +- After 1τ: 63% of the way to equilibrium +- After 2τ: 86% there +- After 3τ: 95% there +- After 5τ: 99% there (effectively "done") + +**Practical**: "90% there" ≈ 2.3 × τ + +**Example - Customer growth**: +``` +Current: 1,000 customers +Equilibrium: 3,000 customers +Time constant: τ = 8 months (calculated from acquisition/churn rates) + +When will we hit 2,700 customers (90% of growth)? + t = 2.3 × 8 = 18.4 months +``` + +### Multi-Stock Time Constants + +Different stocks approach equilibrium at different rates: + +**SaaS example**: +- Basic customer base: τ_B ≈ 10 months (slow growth due to upgrades) +- Premium customer base: τ_P ≈ 5 months (faster growth from upgrade flow) +- MRR: Tracks premium customers, so τ_MRR ≈ 5 months + +**System reaches overall equilibrium** when the SLOWEST stock stabilizes. + +**Implication**: Revenue growth will plateau before customer count does (because premium customers equilibrate faster, and they drive revenue). 
+ + +## Modeling Delays + +### Types of Delays + +**Information delay**: Time between event and awareness +- Monitoring lag: 5 minutes to detect CPU spike +- Reporting lag: Bug discovered 2 weeks after code shipped +- Metric delay: Dashboard updates every hour + +**Material delay**: Time between decision and physical result +- Provisioning: 4 minutes to start new instance +- Hiring: 3 months to recruit and onboard engineer +- Training: 6 months for new team member to be fully productive + +**Pipeline delay**: Work in progress +- Deployment pipeline: 20 minutes CI/CD +- Manufacturing: Parts in assembly +- Support tickets: Acknowledged but not resolved + +### Delay Notation + +``` +Event → [Information Delay] → Detection → [Decision Time] → Action → [Material Delay] → Effect + +Example - Auto-scaling: +CPU spike → [5 min monitoring] → Alert → [instant] → Add instances → [4 min startup] → Capacity + +Total delay: 9 minutes from problem to solution +``` + +### Delay-Induced Failure Modes + +**1. Prolonged degradation**: Solution arrives too late +``` +Problem at t=0 +Solution effective at t=9 +If problem only lasts 5 minutes → Wasted scaling +If problem lasts 15 minutes → 60% of duration in pain +``` + +**2. Overshoot**: Multiple decisions made during delay +``` +t=0: CPU spikes to 95% +t=5: Decision #1: Add 10 instances (not aware of in-flight) +t=9: Decision #1 takes effect, CPU drops to 60% +t=10: Decision #2: Add 10 more (based on stale data at t=5) +t=14: Decision #2 takes effect, CPU at 30%, massive overcapacity +``` + +**3. Oscillation**: System bounces around equilibrium +``` +Undercapacity → Scale up → [delay] → Overcapacity → Scale down → [delay] → Undercapacity → ... +``` + +### Delay Analysis Framework + +**Question 1**: What is the delay magnitude (D)? +- Sum information + decision + material delays + +**Question 2**: What is the desired response time (R)? +- How fast does the problem evolve? +- How quickly do we need the solution? 
+ +**Question 3**: What is the delay ratio (D/R)? + +**Rules of thumb**: +- **D/R < 0.2**: Delay negligible, can treat as instant +- **0.2 < D/R < 0.5**: Delay noticeable, may cause slight overshoot +- **0.5 < D/R < 1.0**: Danger zone, significant overshoot/oscillation risk +- **D/R > 1.0**: Solution arrives after problem evolved, high risk of wrong action + +**Auto-scaling example**: +- D = 9 minutes (5 + 4) +- R = 5 minutes (traffic spike duration) +- D/R = 1.8 → **HIGH RISK** + +**Implications**: +- Need faster provisioning (reduce D) +- Need earlier warning (increase R by predicting) +- Need feedforward control (preemptive scaling) + +### Addressing Delays: Leverage Points + +**Level 12 (weakest)**: Tune parameters +- Adjust scaling thresholds (70% vs 80% CPU) +- Helps marginally, doesn't eliminate delay + +**Level 11**: Add buffers +- Keep warm pool of pre-started instances +- Reduces material delay, still has information delay + +**Level 6**: Change information flow +- Predictive auto-scaling (ML forecasting) +- Eliminates information delay by anticipating + +**Level 10 (stronger)**: Change system structure +- Scheduled scaling for known patterns +- Feedforward control (bypass feedback loop entirely) + +**Key insight**: Delays in balancing loops create most of the problem. Fixing delays is high-leverage. + + +## Non-Linear Dynamics + +### When Linear Intuition Fails + +**Linear thinking**: "Double the input, double the output" +- Works for: Simple arithmetic, direct proportions +- Fails for: Real systems with constraints, thresholds, interactions + +**Signs of non-linearity**: +1. **Diminishing returns**: Adding more stops helping (hiring past team size 50) +2. **Accelerating returns**: More begets more (network effects) +3. **Thresholds/cliffs**: Small change causes regime shift (cache 95% → 100% full) +4. **Saturation**: Can't grow past ceiling (CPU can't exceed 100%) + +### Common Non-Linear Patterns + +**1. 
S-Curve (Logistic Growth)**: +``` +Slow start → Exponential growth → Saturation + +Example: Product adoption + Early: Few users, slow growth (no network effects yet) + Middle: Rapid growth (word of mouth kicks in) + Late: Market saturated, growth slows +``` + +**Formula**: +``` +S(t) = K / (1 + e^(-r(t - t0))) + +Where: + K = Carrying capacity (max possible) + r = Growth rate + t0 = Inflection point +``` + +**2. Performance Cliffs**: +``` +CPU Utilization vs Response Time (typical web server): + 0-70%: 50ms (constant) + 70-85%: 80ms (slight increase) + 85-95%: 200ms (degraded) + 95-98%: 800ms (severe degradation) + 98%+: 5000ms (collapse) +``` + +**Why**: Queuing theory—small increases in utilization cause exponential increases in wait time near saturation. + +**Implication**: "We're at 90% CPU, let's add 20% capacity" → Only brings you to 75%, still in degraded zone. Need 2× capacity to get to safe 45%. + +**3. Tipping Points**: +``` +Small change crosses threshold → Large regime shift + +Examples: + - Technical debt reaches point where all time spent fixing, no features + - Team morale drops below threshold → Attrition spiral + - Cache eviction rate exceeds insertion rate → Thrashing +``` + +**Modeling**: Need to identify the threshold and model behavior on each side separately. + +**4. Reinforcing Loops (Exponential)**: +``` +Compound growth: S(t) = S(0) × (1 + r)^t + +Examples: + - Viral growth: Each user brings k friends (k > 1) + - Technical debt: Slows development → More shortcuts → More debt + - Attrition: People leave → Remaining overworked → More leave +``` + +**Danger**: Exponentials seem slow at first, then explode. By the time you notice, system is in crisis. + +### Identifying Non-Linearities + +**Method 1**: Plot the relationship +- Graph flow vs stock (e.g., fix rate vs backlog) +- Linear: Straight line +- Non-linear: Curve, bend, cliff + +**Method 2**: Test extremes +- What happens at stock = 0? +- What happens at stock = very large? 
+
- If behavior changes qualitatively, it's non-linear

**Method 3**: Look for limits
- Physical limits (100% CPU, 24 hours/day)
- Economic limits (budget constraints)
- Social limits (team coordination breaks down past 50 people)

**Method 4**: Check for interactions
- Does flow depend on MULTIPLE stocks?
- Does one stock's growth affect another's?
- Interactions create non-linearities

### Modeling Non-Linear Systems

**Piecewise linear**:
```
Fix_rate = 
  if B < 50: 25 bugs/day (constant)
  if 50 <= B <= 100: 0.5 × B bugs/day (linear in B)
  if B > 100: 50 bugs/day (saturated)
```

**Lookup tables**:
```
CPU% | Response_ms
-----|------------
60 | 50
70 | 60
80 | 90
90 | 200
95 | 800
98 | 5000
```

Interpolate between values for model.

**Functional forms**:
- Exponential saturation: `F = F_max × (1 - e^(-k×S))`
- Power law: `F = a × S^b`
- Logistic: `F = K / (1 + e^(-r×S))`

**Practical advice**: Start simple (linear), add non-linearity only where it matters for the question you're answering. 
+ + +## Visualization Techniques + +### Bathtub Diagrams + +**Purpose**: Communicate stock-flow structure to non-technical audiences + +**Format**: +``` + Acquisitions + 150/month + ↓ + ┌──────────────────┐ + │ │ + │ CUSTOMERS │ ← Stock (current: 1,000) + │ │ + └────────┬─────────┘ + ↓ + Churn + 5% × Customers + = 50/month +``` + +**When to use**: Explaining accumulation dynamics to executives, stakeholders, non-engineers + +**Key**: Label flows with rates, stock with current level and units + +### Stock-Flow Diagrams + +**Purpose**: Technical analysis, show equations visually + +**Notation**: +- Rectangle = Stock +- Valve = Flow +- Cloud = Source/Sink (outside system boundary) +- Arrow = Information link (affects flow) + +**Example**: +``` + ☁ → [Acquisition] → |BASIC| → [Upgrade] → |PREMIUM| → [Churn] → ☁ + ↑ ↓ + └──── [Downgrade] ────┘ + + [Flow] affects rate + |Stock| accumulates + ☁ = External source/sink +``` + +**When to use**: Detailed analysis, documenting model structure, team discussion + +### Behavior Over Time (BOT) Graphs + +**Purpose**: Show how stocks and flows change dynamically + +**Format**: Time series plots +``` +Customers + │ ┌─────── Equilibrium (3,000) +3000│ / + │ / +2000│ / + │ / +1000├/─────────────────── + └─┴─┴─┴─┴─┴─┴─┴─┴─┴─ + 0 3 6 9 12 15 18 Months +``` + +**When to use**: +- Demonstrating "what happens over time" +- Comparing scenarios ("with churn reduction vs without") +- Showing approach to equilibrium + +**Best practice**: Plot both stocks and key flows on same graph with dual y-axes if needed + +### Phase Diagrams (Advanced) + +**Purpose**: Visualize multi-stock systems + +**Format**: Plot Stock A vs Stock B +``` +Premium + │ + │ / ← Equilibrium point (1,695 B, 1,304 P) + │ / + │ / ← Trajectory from start + │ / + │● + └────────── Basic + + Arrow shows direction of movement over time +``` + +**When to use**: Complex systems with 2-3 interacting stocks + +### Choosing Visualization + +| Audience | Purpose | Best Visualization | 
+
|----------|---------|-------------------|
| Executive | Explain problem | Bathtub diagram |
| Engineer | Analyze dynamics | Stock-flow diagram + BOT graph |
| Stakeholder | Compare options | Multiple BOT graphs (scenarios) |
| Team | Build shared model | Whiteboard stock-flow diagram |
| Self | Understand system | All of the above iteratively |


## Model Validation

### Units Check (Dimensional Analysis)

**Every equation must have consistent units on both sides.**

**Process**:
1. Write units next to every variable
2. Check each term in equation has same units
3. If units don't match, you've made a conceptual error

**Example**:
```
WRONG:
  MRR = Basic_customers + Premium_revenue
  [$/month] ≠ [customers] + [$/month] ✗

RIGHT:
  MRR = (Basic_customers × $100/customer/month) + (Premium_customers × $150/customer/month)
  [$/month] = [customers × $/customer/month] + [customers × $/customer/month] ✓
```

**Common errors caught by units**:
- Adding stock to flow
- Multiplying when you should divide
- Forgetting time scale (monthly vs annual rates)

### Boundary Testing

**Test extreme values** to catch nonsensical model behavior:

**What if stock = 0?**
```
Bug backlog = 0 bugs
Fix rate = 0.5 × 0 = 0 bugs/day ✓ (Can't fix non-existent bugs)
```

**What if flow = 0?**
```
Churn = 0%
Equilibrium customers = ∞ ✗ (Unbounded growth is unrealistic)

Insight: Need to add market saturation limit
```

**What if stock = very large?**
```
Backlog = 10,000 bugs
Fix rate = 0.5 × 10,000 = 5,000 bugs/day ✗ (Team of 5 can't fix 5,000/day)

Insight: Need to cap fix rate at team capacity
```

**What if flow is negative?**
```
Acquisition rate = -50 customers/month ✗ (Negative acquisition is nonsense)

Insight: Model might produce negative flows in edge cases, need floor at 0
```

### Assumptions Documentation

**State every assumption explicitly**:

**Example - Cache model assumptions**:
1. Request distribution is stable (20/80 hot/cold)
2. 
FIFO eviction (not LRU or LFU) +3. Cache lookup time is negligible +4. No cache invalidation (entries only evicted, not deleted) +5. Hot resources are accessed frequently enough to never evict + +**Why this matters**: +- Identify where model breaks if reality differs +- Communicate limitations to stakeholders +- Know where to improve model if predictions fail + +**Template**: +``` +## Model Assumptions +1. [Physical]: What are we assuming about the system? +2. [Behavioral]: What are we assuming about users/actors? +3. [Parameter]: What values are we guessing? +4. [Scope]: What are we deliberately ignoring? +``` + +### Sensitivity Analysis + +**Question**: How robust is the conclusion to parameter uncertainty? + +**Method**: Vary parameters ±20% or ±50%, see if conclusion changes + +**Example - Churn reduction ROI**: +``` +Base case: 5% → 3% churn = +$98K MRR at 12 months + +Sensitivity: + Acquisition rate ±20%: +$85K to +$112K (Conclusion robust ✓) + Upgrade rate ±20%: +$92K to +$104K (Conclusion robust ✓) + Initial customers ±20%: +$88K to +$108K (Conclusion robust ✓) +``` + +**If conclusion changes sign** (e.g., ROI goes negative), the model is sensitive to that parameter. You need better data for that parameter or acknowledge high uncertainty. + +**Traffic light test**: +- Green: Conclusion unchanged across plausible range +- Yellow: Magnitude changes but direction same +- Red: Conclusion flips (positive to negative) + +### Calibration: Simple to Complex + +**Start simple**: +- Constant flows +- Linear relationships +- Single stock + +**Add complexity only if**: +- Simple model predictions don't match reality +- Non-linearity matters for your question +- Stakeholders won't accept simple model + +**Iterative refinement**: +1. Build simplest model +2. Compare to real data (if available) +3. Identify largest discrepancy +4. Add ONE complexity to address it +5. Repeat + +**Warning**: Complex models have more parameters → More ways to be wrong. 
Prefer simple models that are "approximately right" over complex models that are "precisely wrong." + + +## Common Patterns in Software + +### 1. Technical Debt Accumulation +``` +STOCK: Technical Debt (story points) +INFLOWS: + - Shortcuts taken: 5 points/sprint (pressure to ship) + - Dependencies decaying: 2 points/sprint (libraries age) +OUTFLOWS: + - Refactoring: 3 points/sprint (allocated capacity) + +ΔDebt = 5 + 2 - 3 = +4 points/sprint + +Equilibrium: Never (unbounded growth) +Time to crisis: When debt > team capacity to understand codebase +``` + +**Interventions**: +- Level 12: Increase refactoring allocation (3 → 5 points/sprint) +- Level 8: Change process to prevent shortcuts (balancing loop) +- Level 3: Change goal from "ship fast" to "ship sustainable" + +### 2. Queue Dynamics +``` +STOCK: Backlog (tickets, bugs, support requests) +INFLOW: Arrival rate (requests/day) +OUTFLOW: Service rate (resolved/day) + +Special cases: + - Arrivals > Service: Queue grows unbounded (hire more or reduce demand) + - Arrivals < Service: Queue drains (over-capacity) + - Arrivals = Service: Equilibrium, but queue length depends on variability + +Note: Even at equilibrium, queue has non-zero size due to randomness (queuing theory) +``` + +### 3. Resource Depletion +``` +STOCK: Available Resources (DB connections, memory, file handles) +INFLOWS: + - Release: Connections closed, memory freed +OUTFLOWS: + - Allocation: Connections opened, memory allocated + +Leak: Outflow > Inflow (allocate but don't release) + → Stock depletes to 0 + → System fails + +Time to failure: Initial_stock / Net_outflow +``` + +### 4. Capacity Planning +``` +STOCK: Capacity (servers, bandwidth, storage) +DEMAND: Usage (request rate, data size) + +Key question: When does demand exceed capacity? 
+ +Model demand growth: + D(t) = D(0) × (1 + growth_rate)^t + +Solve for t when D(t) = Capacity: + t = log(Capacity / D(0)) / log(1 + growth_rate) + +Example: + Current: 1,000 req/sec, Capacity: 2,000 req/sec + Growth: 5%/month + t = log(2) / log(1.05) = 14.2 months until saturation +``` + +### 5. Customer Dynamics +``` +STOCK: Active Customers +INFLOWS: + - Acquisition: Marketing spend → New customers + - Reactivation: Win-back campaigns +OUTFLOWS: + - Churn: % leaving per month + - Downgrades: Moving to free tier (if that's outside system boundary) + +Equilibrium: Acquisition = Churn + A = c × C (where c = churn rate) + C_eq = A / c + +If A = 150/month, c = 5%: + C_eq = 150 / 0.05 = 3,000 customers +``` + +### 6. Cache Behavior +``` +STOCK: Cache Entries (current: E, max: E_max) +INFLOWS: + - Cache misses for new resources +OUTFLOWS: + - Evictions (when cache is full) + +Phases: + 1. Fill (E < E_max): Inflow > 0, Outflow = 0 + 2. Equilibrium (E = E_max): Inflow = Outflow (every new entry evicts one) + +Hit rate at equilibrium: + Depends on request distribution vs cache size + - Perfect: Hot set < E_max → 100% hit rate + - Reality: Long tail → Partial hit rate +``` + + +## Integration with Other Skills + +### Stock-Flow + Archetypes + +**Archetypes are patterns of stock-flow structure**: + +**Fixes that Fail**: +``` +STOCK: Problem Symptom +Quick fix reduces symptom (outflow) but adds to root cause (inflow to different stock) +Result: Symptom returns worse + +Example: + Stock 1: Bug Backlog + Stock 2: Technical Debt + Quick fix: Hack patches (reduces backlog, increases debt) + Debt → Slower development → More bugs → Backlog returns +``` + +**Use stock-flow to quantify archetypes**: +- How fast does the symptom return? +- What's the equilibrium after fix? +- How much worse is long-term state? 
+ +### Stock-Flow + Leverage Points + +**Map leverage points to stock-flow structure**: + +- **Level 12 (Parameters)**: Change flow rates (increase acquisition budget) +- **Level 11 (Buffers)**: Change stock capacity (bigger cache, more servers) +- **Level 10 (Structure)**: Add/remove stocks or flows (new customer tier) +- **Level 8 (Balancing loops)**: Change outflow relationships (reduce churn) +- **Level 7 (Reinforcing loops)**: Change inflow relationships (viral growth) +- **Level 6 (Information)**: Change what affects flows (predictive scaling) +- **Level 3 (Goals)**: Change target equilibrium (growth vs profitability) + +**Quantitative modeling helps evaluate leverage**: +- Calculate impact of 20% parameter change (Level 12) +- Compare to impact of structural change (Level 10) +- See that structural change is often 5-10× more effective + +### Stock-Flow + Causal Loops + +**Causal loops show feedback structure**: +``` +Customers → Revenue → Marketing → Customers (reinforcing) +``` + +**Stock-flow quantifies the loops**: +``` +C(t+1) = C(t) + M(t) - 0.05×C(t) (customers) +R(t) = $100 × C(t) (revenue) +M(t) = 0.10 × R(t) / $500 (marketing converts revenue to customers) +``` + +**Use stock-flow to**: +- Calculate loop strength (how fast does reinforcing loop accelerate growth?) +- Find equilibrium (where do balancing loops stabilize system?) +- Identify delays (how long before marketing investment shows up in customers?) + +### Decision Framework: Which Skill When? + +**Start with Archetypes** when: +- Problem seems familiar ("we've seen this before") +- Need quick pattern matching +- Communicating to non-technical audience + +**Add Stock-Flow** when: +- Need to quantify ("how fast?", "how much?", "when?") +- Archetype diagnosis unclear (need to map structure first) +- Validating intuition with numbers + +**Use Leverage Points** when: +- Evaluating interventions (which fix is highest impact?) +- Communicating strategy (where should we focus?) 
+
- Already have stock-flow model, need to decide what to change

**Typical workflow**:
1. Sketch causal loops (quick structure)
2. Identify archetype (pattern matching)
3. Build stock-flow model (quantify)
4. Evaluate interventions with leverage points (decide)


## Common Mistakes

### 1. Confusing Stocks with Flows

**Mistake**: "We need more velocity"
- Velocity is a flow (story points/sprint), not a stock you can "have"

**Correct**: "We need more capacity" (engineer hours, a stock) or "We need better process efficiency" (affects velocity, a flow rate)

**Test**: Can you measure it at a single instant without time reference?

### 2. Forgetting Delays

**Mistake**: "Just add more servers, problem solved"
- Ignores 4-minute cold start
- Ignores 5-minute detection lag
- By the time servers are online, spike is over

**Correct**: "9-minute total delay means we'll be overloaded for most of the spike. Need faster provisioning or predictive scaling."

**Test**: What is delay / response_time? If >0.5, delay dominates.

### 3. Linear Thinking in Non-Linear Systems

**Mistake**: "We're at 90% CPU, add 20% more servers → 75% CPU, problem solved"
- Queuing theory: Response time is non-linear near saturation
- 90% → 75% keeps you in degraded performance zone

**Correct**: "Need to get below 70% CPU to escape performance cliff. Requires 2× capacity, not 1.2×."

**Test**: Plot performance vs utilization. If it curves, it's non-linear.

### 4. Ignoring Units

**Mistake**:
```
Total_cost = Customers + (Revenue × 0.3)
[units?] = [customers] + [$/month × dimensionless] ✗
```

**Correct**: Write units, check consistency
```
Total_cost [$/month] = (Customers [count] × $100/customer/month) + ...
```

### 5. Over-Modeling

**Mistake**: Building 500-line Python simulation for simple question
- "How many customers at equilibrium?"
- Could solve with 2-line algebra

**Correct**: Start simple. Add complexity only if simple model fails. 
+ +**Test**: Can you answer the question with envelope math? If yes, do that first. + +### 6. Under-Modeling + +**Mistake**: Guessing at capacity needs for $100K infrastructure investment +- "Seems like we need 50 servers" +- No model, no calculation + +**Correct**: 30 minutes in Excel to model growth, calculate breakpoint, sensitivity test + +**Test**: Cost of error >$10K and decision takes >1 week to reverse? Build a model. + +### 7. Snapshot Thinking + +**Mistake**: "We have 100 bugs right now, that's manageable" +- Ignores accumulation: 40/day in, 25/day out +- In 30 days: 100 + (40-25)×30 = 550 bugs + +**Correct**: "Backlog is growing 15 bugs/day. At this rate, we'll have 550 bugs in a month. Need to increase fix rate or reduce inflow." + +**Test**: Are flows balanced? If not, stock will change dramatically. + +### 8. Equilibrium Blindness + +**Mistake**: "Let's hire our way out of tech debt" +- More engineers → More code → More debt +- Doesn't change debt/code ratio (the equilibrium structure) + +**Correct**: "Hiring changes throughput but not debt accumulation rate. Need to change development process (reduce debt inflow) or allocate refactoring time (increase debt outflow)." + +**Test**: Does the intervention change the equilibrium, or just the time to get there? + +### 9. Ignoring Delays in Feedback Loops + +**Mistake**: "We shipped the performance fix, why are users still complaining?" +- Fix deployed today +- Users notice over next 2 weeks +- Reviews/sentiment update over next month +- Information delay is 30+ days + +**Correct**: "Fix will take 4-6 weeks to show up in sentiment metrics. Don't panic if next week's NPS is still low." + +### 10. Treating Symptoms vs Stocks + +**Mistake**: "Add more servers every time we get slow" +- Symptom: Slow response +- Stock: Request rate growth +- Treating symptom (capacity) not root cause (demand) + +**Correct**: "Why is request rate growing? Can we cache, optimize queries, or rate-limit to reduce inflow? 
Then add capacity if structural changes aren't enough." + + +## Red Flags: Rationalizations to Resist + +When you're tempted to skip quantitative modeling, watch for these rationalizations: + +### "This is too simple to model" + +**Reality**: Simple systems often have non-obvious equilibria. +- Bug backlog seems simple, but when does it stabilize? +- Customer churn seems obvious, but what's equilibrium size? + +**Counter**: If it's simple, the model takes 5 minutes. If it's not simple, you NEED the model. + +**Test**: Can you predict the equilibrium and time constant in your head? If not, it's not simple. + +### "We don't have time for spreadsheets" + +**Reality**: 30 minutes modeling vs 3 months living with wrong decision. + +**Counter**: +- Production incident? Model delay dynamics in 10 minutes to pick right intervention. +- Capacity planning? 1 hour in Excel saves $50K in overprovisioning. + +**Test**: Time to model vs time to reverse decision. If model_time < 0.01 × reversal_time, model it. + +### "I can estimate this in my head" + +**Reality**: Human intuition fails on: +- Exponential growth (seems slow then explodes) +- Delays (underestimate overshoot) +- Non-linearities (performance cliffs) +- Multi-stock systems (competing flows) + +**Counter**: Write down your mental estimate, build model, compare. You'll be surprised how often your intuition is 2-5× off. + +**Test**: If you're confident, the model will be quick confirmation. If you're uncertain, you need the model. + +### "We don't have data for parameters" + +**Reality**: You know more than you think. +- "Churn is somewhere between 3% and 7%" is enough for sensitivity analysis +- Rough estimates reveal qualitative insights (growing vs shrinking) + +**Counter**: Build model with plausible ranges, test sensitivity. If conclusion is robust across range, you don't need exact data. If it's sensitive, THEN invest in measurement. + +**Test**: Can you bound parameters to ±50%? 
If yes, model it and check sensitivity. + +### "Math is overkill for this decision" + +**Reality**: +- "Add 20% capacity" seems like common sense +- Model reveals: Need 2× due to performance cliff +- Math just prevented $40K waste + +**Counter**: Engineering decisions deserve engineering rigor. You wouldn't deploy code without testing; don't make capacity decisions without modeling. + +**Test**: Cost of error >$5K? Use math. + +### "The system is too complex to model" + +**Reality**: All models are simplifications. That's the point. +- Don't need to model every detail +- Model the parts that matter for your decision + +**Counter**: Start with simplest model that addresses your question. Three stocks and five flows captures 80% of systems. + +**Test**: What's the ONE question you need to answer? Build minimal model for that question only. + +### "We'll just monitor and adjust" + +**Reality**: By the time you see the problem, it may be too late. +- Delays mean problem is bigger than it appears +- Exponential growth hides until crisis +- Prevention is easier than cure + +**Counter**: Model predicts WHEN you'll hit the wall. "Monitor and adjust" becomes "monitor for predicted warning signs and execute prepared plan." + +**Test**: What's the delay between problem and solution? If >50% of problem duration, you need prediction, not reaction. + +### "This is a special case, stock-flow doesn't apply" + +**Reality**: If something accumulates or depletes, it's a stock-flow system. +- Queues (tickets, requests, bugs) +- Resources (memory, connections, capacity) +- People (customers, users, employees) +- Intangibles (morale, technical debt, knowledge) + +**Counter**: Describe the system. If you can identify what's accumulating and what's flowing, stock-flow applies. + +**Test**: Is there something that can grow or shrink? That's a stock. What changes it? Those are flows. + + +## Summary + +**Stocks and flows modeling** is the quantitative backbone of systems thinking: + +1. 
**Stocks** accumulate (measurable at an instant) +2. **Flows** change stocks (rates per unit time) +3. **Equilibrium** = where flows balance (ΔS = 0) +4. **Delays** create overshoot, oscillation, and failure +5. **Non-linearities** break linear intuition (cliffs, S-curves, exponentials) +6. **Validation** = units check, boundary test, sensitivity analysis + +**When to use**: +- Predicting future states +- Finding equilibrium +- Quantifying delays +- Validating intuition +- Making high-stakes decisions + +**Key techniques**: +- Formal notation: S(t+1) = S(t) + (Inflow - Outflow) +- Equilibrium: Set ΔS = 0, solve algebraically +- Time constants: τ = Stock / Flow +- Delay analysis: D/R ratio (danger when >0.5) +- Visualization: Bathtub diagrams, stock-flow diagrams, BOT graphs + +**Integration**: +- Archetypes = patterns of stock-flow structure +- Leverage points = where to intervene in stock-flow system +- Causal loops = qualitative preview of stock-flow dynamics + +**Resist rationalizations**: +- "Too simple" → Simple models take 5 minutes +- "No time" → 30 min modeling vs 3 months of wrong decision +- "I can estimate" → Intuition fails on delays, exponentials, non-linearities +- "No data" → Sensitivity analysis works with ranges +- "Too complex" → Start simple, add complexity only if needed + +**The discipline**: Check units, test boundaries, state assumptions, validate with sensitivity analysis. + +**The payoff**: Predict system behavior, avoid crises, choose high-leverage interventions, make decisions with confidence instead of guessing. 
diff --git a/skills/using-systems-thinking/systems-archetypes-reference.md b/skills/using-systems-thinking/systems-archetypes-reference.md new file mode 100644 index 0000000..f822375 --- /dev/null +++ b/skills/using-systems-thinking/systems-archetypes-reference.md @@ -0,0 +1,918 @@ + +# Systems Archetypes Reference + +## Overview + +**System archetypes are recurring structural patterns that produce characteristic behaviors.** Recognizing the archetype reveals the intervention strategy - you don't need to re-solve the problem, you can apply known solutions. + +**Core principle:** Systems are governed by archetypal structures. The same 10 patterns appear across domains. Once you recognize the pattern, you know how to intervene. + +**Required foundation:** Understanding of feedback loops, stocks/flows, and leverage points. See recognizing-system-patterns and leverage-points-mastery skills. + +## The 10 System Archetypes + +Quick reference table - detailed explanations follow: + +| Archetype | Signature Pattern | Primary Loop | Key Intervention | +|-----------|-------------------|--------------|------------------| +| 1. Fixes that Fail | Solution works temporarily, then problem returns worse | Reinforcing (symptom relief → side effect → worse problem) | Address root cause, not symptom | +| 2. Shifting the Burden | Symptom relief prevents fundamental solution | Balancing (quick fix) overpowers balancing (real fix) | Make quick fix difficult or undesirable | +| 3. Escalation | Two parties each escalate responses to each other | Reinforcing (A→B→A) | Unilateral de-escalation or shared goal | +| 4. Success to the Successful | Winner gets more resources, creates brittleness | Reinforcing (success → resources → more success) | Level the playing field or diversify | +| 5. Tragedy of the Commons | Individual optimization degrades shared resource | Reinforcing (individual gain → commons depletion → less for all) | Regulate commons or create feedback | +| 6. 
Accidental Adversaries | Well-intentioned actions hurt each other | Reinforcing (A helps self, hurts B; B helps self, hurts A) | Align incentives or coordinate | +| 7. Drifting Goals | Standards erode gradually from complacency | Balancing (gap → lower standard rather than improve) | Make gap visible, fix standards | +| 8. Limits to Growth | Growth slows despite effort, hits ceiling | Balancing (growth → constraint → slow growth) | Remove constraint or shift focus | +| 9. Growth and Underinvestment | Growth creates need for capacity, underfunded | Reinforcing (growth → insufficient capacity → quality drops → growth slows) | Invest ahead of demand | +| 10. Eroding Goals (Pessimistic) | Standards lower in response to performance pressure | Reinforcing (pressure → lower standards → worse performance → more pressure) | Break cycle, re-establish standards | + + +## 1. Fixes that Fail + +### Structure + +``` +Problem Symptom + ↓ + Quick Fix Applied + ↓ +Symptom Relieved (temporarily) + ↓ +Unintended Side Effect + ↓ +Problem Returns Worse + ↓ +Apply More of Same Fix + ↓ +[REINFORCING LOOP - Gets Worse Over Time] +``` + +**Causal Loop Diagram:** +``` +Problem --+--> Quick Fix --+--> Symptom Relief + ^ | + | ↓ + +------o----- Unintended Side Effect (delay) + +R: Fix amplifies problem via side effects +``` + +### Software Engineering Examples + +**Database Performance** +- Problem: Slow queries +- Fix: Add indexes +- Works temporarily: Queries faster +- Side effect: Data grows, indexes can't keep up, worse than before +- Root cause unaddressed: Unbounded data growth, no archival + +**Alert Fatigue** +- Problem: Missing incidents +- Fix: Add more alerts +- Works temporarily: Catch more issues +- Side effect: Alert fatigue, engineers ignore alerts +- Root cause unaddressed: Incident rate, system reliability + +**Hiring for Velocity** +- Problem: Team too slow +- Fix: Hire more engineers +- Works temporarily: More hands +- Side effect: Onboarding burden, communication overhead, 
slower overall +- Root cause unaddressed: Process inefficiency, tech debt + +### Diagnostic Questions + +- Does the solution work at first, then stop working? +- Are you applying more of the same solution repeatedly? +- Is there a delay between fix and side effect appearing? +- Are side effects making the original problem worse? + +**If YES to these:** Likely Fixes that Fail + +### Intervention Strategy + +**Level 3 (Goals):** Change goal from "relieve symptom" to "solve root cause" + +**Level 6 (Information):** Make side effects visible early (before they dominate) + +**Level 5 (Rules):** Prohibit applying the same fix more than twice without root cause analysis + +**What NOT to do:** +- ❌ Apply more of the failing fix +- ❌ Ignore the side effects as "unrelated" +- ❌ Speed up the fix (makes side effects appear faster) + +**What to DO:** +- ✅ Identify the root cause being masked +- ✅ Trace the path from fix → side effect → worsened problem +- ✅ Solve root cause OR accept living with symptom + + +## 2. Shifting the Burden + +### Structure + +``` + Problem Symptom + ↓ + ┌─────────Quick Fix (Path A) + │ ↓ + │ Symptom Relieved + │ ↓ + │ Side Effect: Fundamental + │ Solution Never Pursued + │ ↓ + └── Problem Returns → Quick Fix Again + + Fundamental Solution (Path B) ← Never taken +``` + +**Key difference from Fixes that Fail:** Two pathways compete - symptom relief vs. fundamental solution. Quick fix actively prevents fundamental solution by reducing pressure. + +### Software Engineering Examples + +**QA Team vs. Quality Culture** +- Symptom: Bugs in production +- Quick fix: Add QA team to catch bugs +- Fundamental: Teams build quality in +- Burden shift: Dev teams never learn quality practices, depend on QA +- Result: QA becomes bottleneck, teams can't ship without them + +**Outsourcing vs. 
Skill Building** +- Symptom: Team lacks skill X +- Quick fix: Outsource or hire contractor +- Fundamental: Train existing team +- Burden shift: Team never gains capability, permanent dependency +- Result: Can't maintain what contractors build + +**Framework vs. Understanding** +- Symptom: Complex problem +- Quick fix: Import framework/library +- Fundamental: Understand and solve directly +- Burden shift: Team never learns underlying concepts +- Result: Can't debug framework issues, framework lock-in + +### Diagnostic Questions + +- Is there a "quick fix" and a "fundamental solution" to the same problem? +- Does the quick fix reduce pressure to pursue fundamental solution? +- Is the team becoming dependent on the quick fix? +- Does the quick fix have ongoing costs (time, money, capability drain)? + +**If YES:** Likely Shifting the Burden + +### Intervention Strategy + +**Level 5 (Rules):** Make quick fix expensive or inconvenient (force fundamental solution) + +**Level 6 (Information):** Track total cost of quick fix over time, make dependency visible + +**Level 3 (Goals):** Prioritize capability building over symptom relief + +**What NOT to do:** +- ❌ Make quick fix easier/cheaper (strengthens burden shift) +- ❌ Remove fundamental solution resources (makes shift permanent) +- ❌ Accept "this is just how we work" (normalization of dependency) + +**What to DO:** +- ✅ Simultaneously apply quick fix AND start fundamental solution +- ✅ Set sunset date for quick fix +- ✅ Measure capability growth, not just symptom relief + + +## 3. Escalation + +### Structure + +``` +Party A's Action --+--> Threat to Party B + ↓ + Party B's Response --+--> Threat to Party A + ↓ ↓ + More Aggressive Response (loop continues) + ↓ + [REINFORCING LOOP - Accelerating Conflict] +``` + +**Characteristic:** Both parties think they're being defensive, both are actually escalating. + +### Software Engineering Examples + +**Tech Debt vs. 
Feature Pressure** +- Party A (Management): Pressure to ship features faster +- Party B (Engineering): Take shortcuts, accumulate debt +- Escalation: Debt → slower velocity → more pressure → more shortcuts → more debt +- Result: Velocity approaches zero, both sides blame each other + +**Security vs. Usability** +- Party A (Security): Add restrictions (2FA, password policies, access controls) +- Party B (Users): Find workarounds (shared passwords, written down, disabled 2FA) +- Escalation: Workarounds → more restrictions → more creative workarounds +- Result: Security theater, actual security compromised + +**Performance Team vs. Feature Team** +- Party A (Features): Add features that slow system +- Party B (Performance): Add rules/gates that slow feature delivery +- Escalation: Slower features → pressure to bypass gates → worse performance → stricter gates +- Result: Gridlock, both teams frustrated + +### Diagnostic Questions + +- Are two parties each making the other's problem worse? +- Does each side think they're being defensive/reasonable? +- Is the conflict intensifying despite both sides "trying harder"? +- Would unilateral de-escalation feel like "giving up"? + +**If YES:** Likely Escalation + +### Intervention Strategy + +**Level 3 (Goals):** Create shared goal that supersedes individual goals + +**Level 6 (Information):** Make each party's actions visible to the other (break assumptions) + +**Level 2 (Paradigm):** Shift from "zero-sum" to "collaborative" mindset + +**What NOT to do:** +- ❌ Escalate further ("fight fire with fire") +- ❌ Blame one party (both are trapped in system) +- ❌ Split the difference (doesn't break the loop) + +**What to DO:** +- ✅ Unilateral de-escalation by one party (breaks cycle) +- ✅ Create joint accountability (merge teams, shared metrics) +- ✅ Make escalation cost visible to both parties + + +## 4. 
Success to the Successful + +### Structure + +``` +Team A's Success --+--> More Resources to Team A + ↓ + Team A More Successful + ↓ + Even More Resources to Team A + +Team B Struggles ---o-> Fewer Resources to Team B + ↓ + Team B Struggles More + ↓ + Even Fewer Resources + +[REINFORCING LOOP - Rich Get Richer, Poor Get Poorer] +``` + +**Result:** Concentration risk - over-dependence on "winner" + +### Software Engineering Examples + +**Enterprise vs. SMB Product** +- Winner: Enterprise team (big deals) +- Gets: Custom features, eng resources, exec attention +- Result: 90% revenue from 5 customers, SMB product dies +- Risk: One enterprise customer leaves = company crisis + +**Popular Service Gets Resources** +- Winner: Service A (high traffic) +- Gets: More engineers, better infra, attention +- Result: Service A dominates, Service B atrophies +- Risk: Service B fails, takes down Service A (hidden dependency) + +**Star Developer Effect** +- Winner: Senior dev who delivers fast +- Gets: Best projects, promotions, resources +- Result: Junior devs never get growth opportunities, team dependent on one person +- Risk: Star leaves = team collapses + +### Diagnostic Questions + +- Is one team/project/person getting disproportionate resources? +- Is the gap between "winner" and "loser" widening? +- Is the organization becoming dependent on the winner? +- Would the loser's failure create cascading problems? 
+ +**If YES:** Likely Success to the Successful + +### Intervention Strategy + +**Level 5 (Rules):** Resource allocation must consider portfolio balance, not just current ROI + +**Level 6 (Information):** Make concentration risk visible (dependency graphs, customer concentration) + +**Level 4 (Self-organization):** Let "losers" experiment with different approaches + +**What NOT to do:** +- ❌ "Double down" on winners exclusively +- ❌ Let losers die without understanding systemic value +- ❌ Assume current success predicts future success + +**What to DO:** +- ✅ Limit maximum resource allocation to any single entity (diversify) +- ✅ Invest in "losers" as strategic options (portfolio thinking) +- ✅ Rotate resources to prevent permanent advantage + + +## 5. Tragedy of the Commons + +### Structure + +``` +Individual A Optimizes for Self --+--> Uses Shared Resource +Individual B Optimizes for Self --+--> Uses Shared Resource +Individual C Optimizes for Self --+--> Uses Shared Resource + ↓ + Shared Resource Degrades + ↓ + Less Available for All + ↓ + Individuals Use MORE (scarcity response) + ↓ + [REINFORCING LOOP - Accelerating Depletion] +``` + +### Software Engineering Examples + +**Database Connection Pool** +- Shared resource: DB connections +- Individual optimization: Each service opens more connections for speed +- Result: Pool exhausted, all services slow +- Commons degraded: Database becomes bottleneck for everyone + +**Production Deployment Windows** +- Shared resource: Production stability +- Individual optimization: Each team deploys whenever ready +- Result: Too many changes, hard to debug issues +- Commons degraded: Production unstable for all teams + +**Shared Codebase Quality** +- Shared resource: Code maintainability +- Individual optimization: Each team ships fast without refactoring +- Result: Tech debt accumulates, codebase unmaintainable +- Commons degraded: Everyone slowed by poor code quality + +### Diagnostic Questions + +- Is there a shared resource 
that multiple parties use? +- Does each party optimize individually without considering others? +- Is the resource degrading over time despite (or because of) individual optimization? +- Would regulation/limits be resisted as "unfair" by individuals? + +**If YES:** Likely Tragedy of the Commons + +### Intervention Strategy + +**Level 5 (Rules):** Regulate access to commons (quotas, rate limits, governance) + +**Level 6 (Information):** Make individual impact on commons visible (usage dashboards) + +**Level 3 (Goals):** Align individual goals with commons health (team incentives) + +**What NOT to do:** +- ❌ Appeal to good behavior without enforcement +- ❌ Wait for commons to collapse before acting +- ❌ Blame individuals (system incentivizes this) + +**What to DO:** +- ✅ Create feedback loop: usage → visible cost → self-regulation +- ✅ Privatize commons OR enforce collective management +- ✅ Charge for usage (make externalities internal) + + +## 6. Accidental Adversaries + +### Structure + +``` +Party A Takes Action to Help Self + ↓ +Action Inadvertently Hurts Party B + ↓ +Party B Takes Action to Help Self + ↓ +Action Inadvertently Hurts Party A + ↓ +[REINFORCING LOOP - Mutual Harm Despite Good Intentions] +``` + +**Key difference from Escalation:** Not intentional conflict - each party solving own problem, unaware they're hurting the other. 
+ +### Software Engineering Examples + +**API Rate Limiting** +- Party A (Platform): Add rate limits to protect servers +- Hurts B: Users hit limits, break integrations +- Party B (Users): Create multiple accounts to bypass limits +- Hurts A: More load, harder to detect abuse, stricter limits needed +- Result: Arms race, both worse off + +**Microservices Boundaries** +- Party A (Team A): Optimizes their service, changes API frequently +- Hurts B: Team B's service breaks from API changes +- Party B (Team B): Adds defensive caching, duplicates data +- Hurts A: Team A can't deploy changes, data consistency issues +- Result: Tight coupling despite microservices + +**Oncall Rotation** +- Party A (Oncall eng): Deploys quickly to reduce queue, incomplete testing +- Hurts B: Next oncall gets incidents from rushed deploy +- Party B (Next oncall): Adds deployment gates and approvals +- Hurts A: Original oncall's deploys now blocked, queue grows +- Result: Slower deploys, more incidents + +### Diagnostic Questions + +- Are two parties pursuing legitimate goals? +- Do their solutions inadvertently harm each other? +- Is neither party trying to cause harm? +- Is the relationship deteriorating despite good intentions? + +**If YES:** Likely Accidental Adversaries + +### Intervention Strategy + +**Level 6 (Information):** Make impact visible - A sees how they hurt B, B sees how they hurt A + +**Level 5 (Rules):** Coordinate actions (shared calendar, RFC process, communication protocols) + +**Level 3 (Goals):** Create joint success metric that requires cooperation + +**What NOT to do:** +- ❌ Blame either party (both acting rationally) +- ❌ Let them "work it out" without structural change +- ❌ Optimize one party at expense of other + +**What to DO:** +- ✅ Joint planning sessions, shared visibility +- ✅ Align incentives (both rewarded for cooperation) +- ✅ Create shared ownership or merge teams + + +## 7. 
Drifting Goals (Complacency-Driven) + +### Structure + +``` +Target Goal: 95% +Actual Performance: 94.8% + ↓ +Small Gap - "Close Enough" + ↓ +Lower Target to 94% + ↓ +Actual Drops to 93% + ↓ +Lower Target to 93% - "Be Realistic" + ↓ +[REINFORCING LOOP - Standards Erode Gradually] +``` + +**Key characteristic:** Driven by complacency, not necessity. Team CAN achieve target but chooses not to. + +### Software Engineering Examples + +**Test Coverage Erosion** +- Started: 90% coverage standard +- "Just this once": 70% for urgent feature +- New normal: 75% "is realistic" +- Current: 60%, bugs increasing +- Team accepts: "Given constraints, 60% is good" + +**Code Review Standards** +- Started: 2 reviewers, thorough feedback +- Drift: 1 reviewer "to move faster" +- Current: Rubber-stamp reviews +- Result: Quality declined, but normalized + +**Deployment Frequency** +- Started: Deploy daily +- Drift: Deploy weekly "to reduce risk" +- Current: Deploy monthly +- Result: Releases become risky big-bang events, confirming fear + +### Diagnostic Questions + +- Did standards start higher? +- Was there a gradual lowering over time? +- Are current standards justified by "being realistic" rather than necessity? +- Can the team achieve original standards with current resources? + +**Critical test:** "If we gave team 2 more weeks, could they hit original target?" +- **If YES:** Drifting Goals (capability exists, will doesn't) +- **If NO:** Different archetype (resource constraint exists) + +### Intervention Strategy + +**Level 6 (Information - Highest leverage for this archetype):** +- Make drift visible: Historical trend chart, original vs. 
current standard +- Customer impact metrics tied to lowered standards +- Public commitment to original standard + +**Level 3 (Goals):** +- Re-establish non-negotiable minimum standards +- Remove authority to lower standards without explicit approval +- Tie consequences to meeting original target + +**Level 5 (Rules):** +- Automatic escalation when standards not met +- Blameless post-mortems for "what would this look like at 95%?" + +**What NOT to do:** +- ❌ Accept "constraints" without evidence (often post-hoc justification) +- ❌ Add resources (no resource gap exists) +- ❌ Negotiate standards based on convenience + +**What to DO:** +- ✅ Make gap painfully visible +- ✅ Celebrate meeting original standard, don't accept "close enough" +- ✅ Re-commit publicly to original goal + + +## 8. Limits to Growth + +### Structure + +``` +Growth Action --+--> Success/Growth + ↓ + Growth Continues + ↓ + Hits Limiting Constraint + ↓ + Growth Slows Despite Effort + ↓ + More Effort → Still Can't Grow + ↓ + [BALANCING LOOP - Constraint Dominates] +``` + +**Characteristic:** Growth works until it doesn't. Constraint kicks in, effort becomes futile. 
+ +### Software Engineering Examples + +**Traffic Growth Hits Infrastructure** +- Growth: User acquisition working, doubling every 6 months +- Constraint: Infrastructure can't scale fast enough +- Limit: At 180K users, app crashes under load +- Result: Growth stops, users churn, opportunity lost + +**Team Growth Hits Communication Overhead** +- Growth: Hiring velocity high, team growing fast +- Constraint: Communication overhead grows exponentially (n² problem) +- Limit: Coordination cost exceeds productivity gain +- Result: Bigger team, slower delivery + +**Feature Growth Hits Cognitive Load** +- Growth: Shipping features rapidly +- Constraint: User cognitive overload, can't find anything +- Limit: More features make product HARDER to use +- Result: User satisfaction drops despite more features + +### Diagnostic Questions + +- Was growth working well, then suddenly stopped? +- Are you applying more effort but seeing diminishing returns? +- Is there an identifiable constraint that wasn't a problem before? +- Does "trying harder" feel increasingly futile? + +**If YES:** Likely Limits to Growth + +### Intervention Strategy + +**Level 10 (Structure - Highest leverage for this archetype):** +- Remove or redesign the constraint +- Examples: Rearchitect for scale, restructure team, simplify product + +**Level 3 (Goals):** +- Change growth target to different dimension where constraint doesn't apply +- Example: Growth in user engagement instead of user count + +**Level 11 (Buffers):** +- Anticipate constraint, build capacity BEFORE hitting limit + +**What NOT to do:** +- ❌ Apply more growth effort (won't work, constraint dominates) +- ❌ Ignore constraint hoping it resolves itself +- ❌ Treat constraint as temporary obstacle + +**What to DO:** +- ✅ Identify the limiting constraint explicitly +- ✅ Remove constraint OR pivot to different growth strategy +- ✅ Invest in constraint removal before restarting growth + + +## 9. 
Growth and Underinvestment + +### Structure + +``` +Growth --+--> Demand Increases + ↓ + Need for Capacity Investment + ↓ + Underinvest (short-term thinking) + ↓ + Quality/Performance Degrades + ↓ + Growth Slows + ↓ + "See? Didn't need that investment" + ↓ + [REINFORCING LOOP - Self-Fulfilling Prophecy] +``` + +**Key difference from Limits to Growth:** Constraint is CREATED by underinvestment, not inherent. + +### Software Engineering Examples + +**Infrastructure Underinvestment** +- Growth: Traffic increasing +- Need: Scale infrastructure proactively +- Underinvest: "Wait until we need it" +- Result: Performance degrades → users leave → "see, didn't need more servers" +- Self-fulfilling: Underinvestment killed growth + +**Technical Debt Underinvestment** +- Growth: Feature demand high +- Need: Pay down tech debt to maintain velocity +- Underinvest: "Features first, debt later" +- Result: Velocity drops → fewer features shipped → "see, we can ship with this debt" +- Self-fulfilling: Debt accumulation slowed growth + +**Team Capability Underinvestment** +- Growth: Business expanding +- Need: Train team on new technologies +- Underinvest: "No time for training, ship features" +- Result: Quality drops → customers churn → "see, training wouldn't have helped" +- Self-fulfilling: Lack of training killed growth + +### Diagnostic Questions + +- Is there growth potential being unrealized? +- Was there a decision to delay investment? +- Did performance degrade, causing growth to slow? +- Is the slowdown being used to justify the underinvestment? 
+ +**Critical tell:** "We didn't need X after all" - but slowdown was CAUSED by not having X + +### Intervention Strategy + +**Level 3 (Goals):** +- Measure long-term capacity, not just short-term delivery +- Goal: "Sustainable growth" not "maximize short-term growth" + +**Level 6 (Information):** +- Model growth scenarios with/without investment +- Make opportunity cost of underinvestment visible + +**Level 5 (Rules):** +- Mandatory investment allocation (% of resources to capacity/capability) +- Investment cannot be deferred without explicit growth target reduction + +**What NOT to do:** +- ❌ Defer investment "until we're sure we need it" +- ❌ Use growth slowdown to justify underinvestment +- ❌ Optimize for short-term metrics at expense of capacity + +**What to DO:** +- ✅ Invest AHEAD of demand (leading indicator) +- ✅ Track capacity utilization, invest before hitting 80% +- ✅ Make investment non-negotiable part of growth strategy + + +## 10. Eroding Goals (Pressure-Driven) + +### Structure + +``` +Performance Gap (can't meet target) + ↓ +Pressure to Improve + ↓ +Can't Improve (resource constrained) + ↓ +Lower Standards to "Be Realistic" + ↓ +Pressure Temporarily Reduced + ↓ +Performance Drops Further (no standards to meet) + ↓ +Lower Standards Again + ↓ +[REINFORCING LOOP - Death Spiral] +``` + +**Key difference from Drifting Goals (#7):** Driven by necessity, not complacency. Team CANNOT meet target with current resources. 
+ +### Software Engineering Examples + +**Uptime SLA Erosion** +- Target: 95% uptime +- Reality: Team achieves 92-93%, burning out +- Pressure: Management demands 95% +- Can't achieve: Insufficient resources/tooling +- Lower standards: "92% is realistic given constraints" +- Result: Team delivers 89%, standards lowered again → death spiral + +**Velocity Pressure** +- Target: 50 story points/sprint +- Reality: Team delivers 35, working overtime +- Pressure: "Try harder" +- Can't achieve: Structural bottlenecks +- Lower expectations: "35 is the new normal" +- Result: Team delivers 28, morale collapses + +**Security Compliance** +- Target: Pass all security audits +- Reality: Team fixes 70% of issues +- Pressure: Must pass audit +- Can't achieve: Not enough security expertise +- Lower standards: Accept "known risks" +- Result: More issues next audit, lower bar again + +### Diagnostic Questions + +**Critical test:** "If we gave team 2 more weeks, could they hit original target?" +- **If NO:** Eroding Goals (structural constraint) +- **If YES:** Drifting Goals (capability exists) + +**Other signs:** +- Is the team burning out trying to meet targets? +- Are resources insufficient for stated goals? +- Is lowering standards framed as "being realistic" given constraints? +- Is performance declining DESPITE effort increase? 
+ +### Intervention Strategy + +**Level 5 (Rules - Force Honest Choice):** +- "Goals must match resources OR resources must match goals - pick one" +- Cannot demand outcomes without providing means +- Sustainable pace is non-negotiable + +**Level 11 (Buffers):** +- Add slack/capacity to stop the death spiral +- Provide recovery time for burned-out team + +**Level 2 (Paradigm Shift):** +- From: "Try harder" → "Performance is emergent from system capacity" +- From: "Pressure produces results" → "Burnout produces collapse" + +**What NOT to do:** +- ❌ Just lower standards (doesn't address root cause) +- ❌ Add pressure (accelerates death spiral) +- ❌ Accept "try harder" as strategy + +**What to DO:** +- ✅ Force explicit choice: Add resources OR lower goals (and own it) +- ✅ Make current gap between goals and resources visible +- ✅ Break the cycle with capacity addition or scope reduction + + +## Distinguishing Similar Archetypes + +### Drifting Goals (#7) vs. Eroding Goals (#10) + +**Both:** Standards lower over time +**Key difference:** WHY standards are lowered + +| Dimension | Drifting Goals | Eroding Goals | +|-----------|----------------|---------------| +| **Driver** | Complacency | Resource pressure | +| **Team capability** | CAN achieve, chooses not to | CANNOT with current resources | +| **Diagnostic test** | "2 more weeks?" → YES | "2 more weeks?" → NO | +| **Pressure level** | Low, comfortable | High, burning out | +| **Justification** | "Close enough" | "Realistic given constraints" | +| **Intervention** | Make gap visible, recommit to standards | Add resources OR lower goals officially | + +### Fixes that Fail (#1) vs. 
Shifting the Burden (#2) + +**Both:** Symptomatic solution, problem returns +**Key difference:** Competing pathways + +| Dimension | Fixes that Fail | Shifting the Burden | +|-----------|-----------------|---------------------| +| **Structure** | One pathway with side effects | Two pathways (quick vs fundamental) | +| **What happens** | Fix creates side effect that worsens problem | Fix prevents pursuit of real solution | +| **Dependency** | Not necessarily | Creates addiction to quick fix | +| **Example** | Adding alerts creates alert fatigue | QA team prevents quality culture | + +### Escalation (#3) vs. Accidental Adversaries (#6) + +**Both:** Two parties harming each other +**Key difference:** Intent + +| Dimension | Escalation | Accidental Adversaries | +|-----------|-----------|------------------------| +| **Intent** | Deliberate response to threat | Solving own problem, unaware of harm | +| **Awareness** | Both know they're in conflict | Neither realizes they're hurting other | +| **Example** | Tech debt vs feature pressure (both aware) | Rate limits → multi-accounts (unaware impact) | + +### Limits to Growth (#8) vs. 
Growth and Underinvestment (#9) + +**Both:** Growth stops +**Key difference:** Source of constraint + +| Dimension | Limits to Growth | Growth and Underinvestment | +|-----------|------------------|----------------------------| +| **Constraint** | Inherent limit (user cognitive load) | Created by underinvestment (infrastructure) | +| **Timing** | Hits suddenly | Degradation visible in advance | +| **Prevention** | Hard (inherent to system) | Easy (invest proactively) | + + +## Archetype Combinations + +**Systems often exhibit multiple archetypes simultaneously.** Recognize the pattern: + +### Example: Feature Factory Disaster + +**Primary: Shifting the Burden** +- Quick fix: QA team catches bugs +- Fundamental: Dev teams build quality in +- Burden shifted: Devs never learn quality + +**Secondary: Escalation** +- Management: Pressure to ship faster +- Engineering: Cut more corners +- Both escalate: More pressure ↔ worse quality + +**Tertiary: Tragedy of the Commons** +- Commons: Codebase quality +- Individual optimization: Each team ships fast +- Commons degraded: Everyone slowed + +**Intervention strategy for combinations:** +1. **Identify primary archetype** (drives the system) +2. **Address secondary archetypes** that reinforce primary +3. **Use highest-leverage intervention** that addresses multiple archetypes + +Example: Level 2 (Paradigm shift) to "quality is built in, not inspected in" addresses all three. + + +## Quick Recognition Guide + +**Start here when analyzing a problem:** + +1. **Map the feedback loops** - Reinforcing or balancing? +2. **Identify the parties/stocks** - Who/what is involved? +3. **Check the signature patterns:** + - Problem returns after fix? → Fixes that Fail (#1) + - Symptom relief + fundamental solution ignored? → Shifting Burden (#2) + - Two parties making it worse? → Escalation (#3) or Adversaries (#6) + - Winner gets more resources? → Success to Successful (#4) + - Shared resource degrading? 
→ Tragedy of Commons (#5) + - Standards lowering from complacency? → Drifting Goals (#7) + - Standards lowering from pressure? → Eroding Goals (#10) + - Growth stopped suddenly? → Limits to Growth (#8) + - Growth stopped from underinvestment? → Growth/Underinvestment (#9) + +4. **Use diagnostic questions** from each archetype section +5. **Check for archetype combinations** (multiple patterns may apply) + + +## Integration with Leverage Points + +**Each archetype has characteristic high-leverage interventions:** + +| Archetype | Highest-Leverage Intervention Level | +|-----------|--------------------------------------| +| Fixes that Fail | #3 (Goals) - Focus on root cause not symptom | +| Shifting the Burden | #5 (Rules) - Make quick fix expensive/difficult | +| Escalation | #3 (Goals) - Create shared goal | +| Success to Successful | #5 (Rules) - Regulate resource allocation | +| Tragedy of Commons | #6 (Information) + #5 (Rules) - Feedback + regulation | +| Accidental Adversaries | #6 (Information) - Make impact visible | +| Drifting Goals | #6 (Information) - Make drift visible | +| Limits to Growth | #10 (Structure) - Remove constraint | +| Growth/Underinvestment | #3 (Goals) - Measure long-term capacity | +| Eroding Goals | #5 (Rules) - Force resource/goal alignment | + +**Pattern:** Most archetypes respond to Levels 3-6 (Goals, Self-organization, Rules, Information) + + +## Red Flags - Rationalizations for Skipping Archetype Analysis + +If you catch yourself saying ANY of these, STOP and identify the archetype first: + +| Rationalization | Reality | Response | +|-----------------|---------|----------| +| "No time for archetype analysis in crisis" | 10 minutes of pattern matching saves weeks of wrong fixes | Crisis is EXACTLY when archetypes matter most - prevents accelerating the problem | +| "My situation is unique, doesn't fit neat categories" | Uniqueness is in details, not structure - archetypes describe feedback loops | Test archetype predictions - if they match,
it's the same structure | +| "This fits multiple archetypes, any intervention works" | Multiple archetypes require identifying PRIMARY one first | Address dominant archetype first, then secondary reinforcing patterns | +| "Archetypes are too academic/theoretical for real engineering" | Every archetype has software examples from production systems | This is pattern recognition, not theory - pragmatic shortcut to solutions | +| "I already know the solution, archetype is overhead" | If solution is obvious, archetype confirms it in 2 minutes | Unknown solutions become obvious once archetype identified | +| "We need action, not analysis" | Wrong action makes crisis worse (see: Fixes that Fail, Escalation) | Archetype analysis IS action - it prevents implementing failed patterns | + +**The pattern:** All rationalizations push you toward repeating known failure modes. The archetypes catalog exists because these patterns have been solved before. + +**The meta-trap:** "We're unique" is itself predicted by several archetypes (Shifting the Burden creates belief that quick fix is necessary, Drifting Goals creates post-hoc justification for lowered standards). + +## The Bottom Line + +**Don't reinvent solutions to archetypal problems.** + +The same patterns recur across systems. Recognize the archetype, apply the known intervention strategy, save time. + +**The skill:** Pattern matching speed. Experienced systems thinkers recognize archetypes in minutes, know immediately where to intervene. + +**The discipline:** Don't jump to solutions before identifying the archetype. Taking 15 minutes to recognize the pattern saves hours implementing the wrong fix.