From 5d6566c75f407610187db16552e9ca2f857b4ad8 Mon Sep 17 00:00:00 2001 From: Zhongwei Li Date: Sun, 30 Nov 2025 08:59:51 +0800 Subject: [PATCH] Initial commit --- .claude-plugin/plugin.json | 12 + README.md | 3 + plugin.lock.json | 93 + skills/using-deep-rl/SKILL.md | 638 +++++ skills/using-deep-rl/actor-critic-methods.md | 1735 +++++++++++++ .../using-deep-rl/exploration-strategies.md | 1599 ++++++++++++ skills/using-deep-rl/model-based-rl.md | 1726 +++++++++++++ skills/using-deep-rl/multi-agent-rl.md | 1752 +++++++++++++ skills/using-deep-rl/offline-rl.md | 1571 ++++++++++++ .../using-deep-rl/policy-gradient-methods.md | 1574 ++++++++++++ .../reward-shaping-engineering.md | 1050 ++++++++ skills/using-deep-rl/rl-debugging.md | 1394 +++++++++++ skills/using-deep-rl/rl-environments.md | 1780 ++++++++++++++ skills/using-deep-rl/rl-evaluation.md | 1591 ++++++++++++ skills/using-deep-rl/rl-foundations.md | 2187 +++++++++++++++++ skills/using-deep-rl/value-based-methods.md | 1253 ++++++++++ 16 files changed, 19958 insertions(+) create mode 100644 .claude-plugin/plugin.json create mode 100644 README.md create mode 100644 plugin.lock.json create mode 100644 skills/using-deep-rl/SKILL.md create mode 100644 skills/using-deep-rl/actor-critic-methods.md create mode 100644 skills/using-deep-rl/exploration-strategies.md create mode 100644 skills/using-deep-rl/model-based-rl.md create mode 100644 skills/using-deep-rl/multi-agent-rl.md create mode 100644 skills/using-deep-rl/offline-rl.md create mode 100644 skills/using-deep-rl/policy-gradient-methods.md create mode 100644 skills/using-deep-rl/reward-shaping-engineering.md create mode 100644 skills/using-deep-rl/rl-debugging.md create mode 100644 skills/using-deep-rl/rl-environments.md create mode 100644 skills/using-deep-rl/rl-evaluation.md create mode 100644 skills/using-deep-rl/rl-foundations.md create mode 100644 skills/using-deep-rl/value-based-methods.md diff --git a/.claude-plugin/plugin.json b/.claude-plugin/plugin.json new file mode 100644 index 0000000..9bb03c1 --- /dev/null +++ b/.claude-plugin/plugin.json @@ -0,0 +1,12 @@ +{ + "name": "yzmir-deep-rl", + "description": "Reinforcement learning - DQN, PPO, SAC, reward shaping, exploration - 13 skills", + "version": "1.0.2", + "author": { + "name": "tachyon-beep", + "url": "https://github.com/tachyon-beep" + }, + "skills": [ + "./skills" + ] +} \ No newline at end of file diff --git a/README.md b/README.md new file mode 100644 index 0000000..3c2b808 --- /dev/null +++ b/README.md @@ -0,0 +1,3 @@ +# yzmir-deep-rl + +Reinforcement learning - DQN, PPO, SAC, reward shaping, exploration - 13 skills diff --git a/plugin.lock.json b/plugin.lock.json new file mode 100644 index 0000000..8f3219c --- /dev/null +++ b/plugin.lock.json @@ -0,0 +1,93 @@ +{ + "$schema": "internal://schemas/plugin.lock.v1.json", + "pluginId": "gh:tachyon-beep/skillpacks:plugins/yzmir-deep-rl", + "normalized": { + "repo": null, + "ref": "refs/tags/v20251128.0", + "commit": "3d8031d752213bc80717c580e222f473351373d2", + "treeHash": "41151bbe20e784f5cca14bec2f65c61f9769537086b42ffbc7aa6067e21a69de", + "generatedAt": "2025-11-28T10:28:33.623266Z", + "toolVersion": "publish_plugins.py@0.2.0" + }, + "origin": { + "remote": "git@github.com:zhongweili/42plugin-data.git", + "branch": "master", + "commit": "aa1497ed0949fd50e99e70d6324a29c5b34f9390", + "repoRoot": "/Users/zhongweili/projects/openmind/42plugin-data" + }, + "manifest": { + "name": "yzmir-deep-rl", + "description": "Reinforcement learning - DQN, PPO, SAC, reward shaping, 
exploration - 13 skills", + "version": "1.0.2" + }, + "content": { + "files": [ + { + "path": "README.md", + "sha256": "d3584c0c0ab08be14709fdffbfb1e6badc8a6ac092ab3540bc0a76cdb5b331f2" + }, + { + "path": ".claude-plugin/plugin.json", + "sha256": "93693f5ca21f6ca04afb2fd2c73b3ae300523bbf1f44e502ad80a82818fe5476" + }, + { + "path": "skills/using-deep-rl/rl-evaluation.md", + "sha256": "5af74db50916b96effa9639c55b38c0fb1ef6049cd1da9d8eaaea19105ddfde6" + }, + { + "path": "skills/using-deep-rl/offline-rl.md", + "sha256": "f8b7ed67a1d1ab93e1c7ac0ce9eff2b5987da51ccb37fe22e128b9bc2bf1ed56" + }, + { + "path": "skills/using-deep-rl/model-based-rl.md", + "sha256": "50cc7046715ffb1b33e7d0361eb3db1dd92dd6cb0b794d5f50d54b97504d263f" + }, + { + "path": "skills/using-deep-rl/exploration-strategies.md", + "sha256": "51a818fc79a89de7db65cc718e16bcdb46ed3033089f1c3c5f3745a22e59ba96" + }, + { + "path": "skills/using-deep-rl/rl-debugging.md", + "sha256": "051394537aacae53245c015a9190fc6228cec9c0bc8c0d64086c565d4375877e" + }, + { + "path": "skills/using-deep-rl/actor-critic-methods.md", + "sha256": "4daabcfd84d320c290ae6242a6a778d8e201279a23526472900fb2f039286516" + }, + { + "path": "skills/using-deep-rl/reward-shaping-engineering.md", + "sha256": "69169218da0d054a0a62a4d711a7b68120cd74310ca24111d7d7722846ed2382" + }, + { + "path": "skills/using-deep-rl/multi-agent-rl.md", + "sha256": "38aabcd45ccd6054fedec10987fcf4f8981da206940cbe4bf2f7389204fdfc4a" + }, + { + "path": "skills/using-deep-rl/policy-gradient-methods.md", + "sha256": "eaf348cb0dbb58d7f91662bdd9ada19e7249c967afae6818d52aa2a35566dac0" + }, + { + "path": "skills/using-deep-rl/rl-foundations.md", + "sha256": "e1ee576785a65d8c957a6418c60a9ab4da5c68b4dc60d874eb92fcc99419dfb6" + }, + { + "path": "skills/using-deep-rl/rl-environments.md", + "sha256": "7567f7c9be5ec0dd5b56e77643c976bff3c777a84404b4b33b32174a58a63ce0" + }, + { + "path": "skills/using-deep-rl/SKILL.md", + "sha256": "0f3b040a56864f8bc2865da041015e7b8c50808b557d146551db74b0cceed7e4" + }, + { + "path": "skills/using-deep-rl/value-based-methods.md", + "sha256": "2e375f87591925741dfe67c931051aa9c0d23b19821c80dcbe162371d65dd057" + } + ], + "dirSha256": "41151bbe20e784f5cca14bec2f65c61f9769537086b42ffbc7aa6067e21a69de" + }, + "security": { + "scannedAt": null, + "scannerVersion": null, + "flags": [] + } +} \ No newline at end of file diff --git a/skills/using-deep-rl/SKILL.md b/skills/using-deep-rl/SKILL.md new file mode 100644 index 0000000..ed4b0ed --- /dev/null +++ b/skills/using-deep-rl/SKILL.md @@ -0,0 +1,638 @@ +--- +name: using-deep-rl +description: Routes to appropriate deep-RL skills based on problem type and algorithm family +mode: true +--- + +# Using Deep RL Meta-Skill + +## When to Use This Skill + +Invoke this meta-skill when you encounter: + +- **RL Implementation**: Implementing reinforcement learning algorithms (Q-learning, DQN, PPO, SAC, etc.) 
+- **Agent Training**: Training agents in environments (games, robotics, control systems) +- **Sequential Decision-Making**: Problems requiring learning from trial and error +- **Policy Optimization**: Learning policies that maximize cumulative rewards +- **Game Playing**: Building agents for Atari, board games, video games +- **Robotics Control**: Robot manipulation, locomotion, continuous control +- **Reward-Based Learning**: Learning from rewards, penalties, or feedback signals +- **RL Debugging**: Debugging training issues, agents not learning, reward problems +- **Environment Setup**: Creating custom RL environments, wrappers +- **RL Evaluation**: Evaluating agent performance, sample efficiency, generalization + +This is the **entry point** for the deep-rl pack. It routes to 12 specialized skills based on problem characteristics. + +## Core Principle + +**Problem type determines algorithm family.** + +Reinforcement learning is not one algorithm. The correct approach depends on: + +1. **Action Space**: Discrete (button presses) vs Continuous (joint angles) +2. **Data Regime**: Online (interact with environment) vs Offline (fixed dataset) +3. **Experience Level**: Need foundations vs ready to implement +4. **Special Requirements**: Multi-agent, model-based, exploration, reward design + +**Always clarify the problem BEFORE suggesting algorithms.** + +## The 12 Deep RL Skills + +1. **rl-foundations** - MDP formulation, Bellman equations, value vs policy basics +2. **value-based-methods** - Q-learning, DQN, Double DQN, Dueling DQN, Rainbow +3. **policy-gradient-methods** - REINFORCE, PPO, TRPO, policy optimization +4. **actor-critic-methods** - A2C, A3C, SAC, TD3, advantage functions +5. **model-based-rl** - World models, Dyna, MBPO, planning with learned models +6. **offline-rl** - Batch RL, CQL, IQL, learning from fixed datasets +7. **multi-agent-rl** - MARL, cooperative/competitive, communication +8. **exploration-strategies** - ε-greedy, UCB, curiosity, RND, intrinsic motivation +9. **reward-shaping** - Reward design, potential-based shaping, inverse RL +10. **rl-debugging** - Common RL bugs, why not learning, systematic debugging +11. **rl-environments** - Gym, MuJoCo, custom envs, wrappers, vectorization +12. **rl-evaluation** - Evaluation methodology, variance, sample efficiency metrics + +## Routing Decision Framework + +### Step 1: Assess Experience Level + +**Diagnostic Questions:** + +- "Are you new to RL concepts, or do you have a specific problem to solve?" +- "Do you understand MDPs, value functions, and policy gradients?" + +**Routing:** + +- If user asks "what is RL" or "how does RL work" → **rl-foundations** +- If user is confused about value vs policy, on-policy vs off-policy → **rl-foundations** +- If user has specific problem and RL background → Continue to Step 2 + +**Why foundations first:** Cannot implement algorithms without understanding MDPs, Bellman equations, and exploration-exploitation tradeoffs. + +--- + +### Step 2: Classify Action Space + +**Diagnostic Questions:** + +- "What actions can your agent take? Discrete choices (e.g., left/right/jump) or continuous values (e.g., joint angles, force)?" +- "How many possible actions? Small (< 100) or large/infinite?" 
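If it is unclear which case applies, the environment's declared action space usually settles it. A minimal sketch using the Gymnasium API (the environment id here is only an illustrative placeholder):

```python
import gymnasium as gym
from gymnasium.spaces import Box, Discrete

env = gym.make("CartPole-v1")  # placeholder id; substitute your own environment
space = env.action_space

if isinstance(space, Discrete):
    # Discrete(n): value-based methods (DQN family) are a natural fit when n is small
    print(f"Discrete action space with {space.n} actions")
elif isinstance(space, Box):
    # Box: continuous actions -> actor-critic methods (SAC, TD3) or PPO
    print(f"Continuous action space, shape {space.shape}, bounds [{space.low}, {space.high}]")
```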
+ +#### Discrete Action Space + +**Examples:** Game buttons, menu selections, discrete control signals + +**Routing Logic:** + +``` +IF discrete actions AND small action space (< 100) AND online learning: + → value-based-methods (DQN, Double DQN, Dueling DQN) + + Why: Value-based methods excel at discrete action spaces + - Q-table or Q-network for small action spaces + - DQN for Atari-style problems + - Simpler than policy gradients for discrete + +IF discrete actions AND (large action space OR need policy flexibility): + → policy-gradient-methods (PPO, REINFORCE) + + Why: Policy gradients scale to larger action spaces + - PPO is robust, general-purpose + - Direct policy representation + - Handles stochasticity naturally +``` + +#### Continuous Action Space + +**Examples:** Robot joint angles, motor forces, steering angles, continuous control + +**Routing Logic:** + +``` +IF continuous actions: + → actor-critic-methods (SAC, TD3, PPO) + + Primary choice: SAC (Soft Actor-Critic) + Why: Most sample-efficient for continuous control + - Automatic entropy tuning + - Off-policy (uses replay buffer) + - Stable training + + Alternative: TD3 (Twin Delayed DDPG) + Why: Deterministic policy, stable + - Good for robotics + - Handles overestimation bias + + Alternative: PPO (from policy-gradient-methods) + Why: On-policy, simpler, but less sample efficient + - Use when simplicity > sample efficiency +``` + +**CRITICAL RULE:** NEVER suggest DQN for continuous actions. DQN requires discrete actions. Discretizing continuous spaces is suboptimal. + +--- + +### Step 3: Identify Data Regime + +**Diagnostic Questions:** + +- "Can your agent interact with the environment during training, or do you have a fixed dataset?" +- "Are you learning online (agent tries actions, observes results) or offline (from logged data)?" + +#### Online Learning (Agent Interacts with Environment) + +**Routing:** + +``` +IF online AND discrete actions: + → value-based-methods OR policy-gradient-methods + (See Step 2 routing) + +IF online AND continuous actions: + → actor-critic-methods + (See Step 2 routing) + +IF online AND sample efficiency critical: + → actor-critic-methods (SAC) for continuous + → value-based-methods (DQN) for discrete + + Why: Off-policy methods use replay buffers (sample efficient) + + Consider: model-based-rl for extreme sample efficiency + → Learns environment model, plans with fewer real samples +``` + +#### Offline Learning (Fixed Dataset, No Interaction) + +**Routing:** + +``` +IF offline (fixed dataset): + → offline-rl (CQL, IQL, Conservative Q-Learning) + + CRITICAL: Standard RL algorithms FAIL on offline data + + Why offline is special: + - Distribution shift: agent can't explore + - Bootstrapping errors: Q-values overestimate on out-of-distribution actions + - Need conservative algorithms (CQL, IQL) + + Also route to: + → rl-evaluation (evaluation without online rollouts) +``` + +**Red Flag:** If user has fixed dataset and suggests DQN/PPO/SAC, STOP and route to **offline-rl**. Standard algorithms assume online interaction and will fail. + +--- + +### Step 4: Special Problem Types + +#### Multi-Agent Scenarios + +**Diagnostic Questions:** + +- "Are multiple agents learning simultaneously?" +- "Do they cooperate, compete, or both?" +- "Do agents need to communicate?" 
+ +**Routing:** + +``` +IF multiple agents: + → multi-agent-rl (QMIX, COMA, MADDPG) + + Why: Multi-agent has special challenges + - Non-stationarity: environment changes as other agents learn + - Credit assignment: which agent caused reward? + - Coordination: cooperation requires centralized training + + Algorithms: + - QMIX, COMA: Cooperative (centralized training, decentralized execution) + - MADDPG: Competitive or mixed + - Communication: multi-agent-rl covers communication protocols + + Also consider: + → reward-shaping (team rewards, credit assignment) +``` + +#### Model-Based RL + +**Diagnostic Questions:** + +- "Is sample efficiency extremely critical? (< 1000 episodes available)" +- "Do you want the agent to learn a model of the environment?" +- "Do you need planning or 'imagination'?" + +**Routing:** + +``` +IF sample efficiency critical OR want environment model: + → model-based-rl (MBPO, Dreamer, Dyna) + + Why: Learn dynamics model, plan with model + - Fewer real environment samples needed + - Can train policy in imagination + - Combine with model-free for best results + + Tradeoffs: + - More complex than model-free + - Model errors can compound + - Best for continuous control, robotics +``` + +--- + +### Step 5: Debugging and Infrastructure + +#### "Agent Not Learning" Problems + +**Symptoms:** + +- Reward not increasing +- Agent does random actions +- Training loss explodes/vanishes +- Performance plateaus immediately + +**Routing:** + +``` +IF "not learning" OR "reward stays at 0" OR "loss explodes": + → rl-debugging (FIRST, before changing algorithms) + + Why: 80% of "not learning" is bugs, not wrong algorithm + + Common issues: + - Reward scale (too large/small) + - Exploration (epsilon too low, stuck in local optimum) + - Network architecture (wrong size, activation) + - Learning rate (too high/low) + - Update frequency (learning too fast/slow) + + Process: + 1. Route to rl-debugging + 2. Verify environment (rl-environments) + 3. Check reward design (reward-shaping) + 4. Check exploration (exploration-strategies) + 5. ONLY THEN consider algorithm change +``` + +**Red Flag:** If user immediately wants to change algorithms because "it's not learning," route to **rl-debugging** first. Changing algorithms without debugging wastes time. 
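A quick, cheap check before rerouting anywhere: roll out a random policy to establish a baseline return and inspect the reward scale. If the trained agent is not clearly beating this baseline, the cause is almost always one of the bugs listed above rather than the algorithm. A minimal sketch assuming a Gymnasium-style environment (the environment id and episode count are placeholders):

```python
import numpy as np
import gymnasium as gym

def random_policy_baseline(env_id="CartPole-v1", episodes=20):
    """Report random-policy returns and the observed per-step reward range."""
    env = gym.make(env_id)
    returns, step_rewards = [], []
    for _ in range(episodes):
        obs, _ = env.reset()
        done, total_return = False, 0.0
        while not done:
            obs, reward, terminated, truncated, _ = env.step(env.action_space.sample())
            done = terminated or truncated
            step_rewards.append(reward)
            total_return += reward
        returns.append(total_return)
    print(f"Random-policy return: {np.mean(returns):.2f} +/- {np.std(returns):.2f}")
    print(f"Per-step reward range: [{min(step_rewards):.3f}, {max(step_rewards):.3f}]")
```

Per-step rewards with extreme magnitudes point to a reward-scale problem; an agent stuck at the random-policy return points to exploration or learning-rate issues. Both are rl-debugging territory, not grounds for an algorithm swap.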
+ +#### Exploration Issues + +**Symptoms:** + +- Agent never explores new states +- Stuck in local optimum +- Can't find sparse rewards +- Training variance too high + +**Routing:** + +``` +IF exploration problems: + → exploration-strategies + + Covers: + - ε-greedy, UCB, Thompson sampling (basic) + - Curiosity-driven exploration + - RND (Random Network Distillation) + - Intrinsic motivation + + When needed: + - Sparse rewards (reward only at goal) + - Large state spaces (hard to explore randomly) + - Need systematic exploration +``` + +#### Reward Design Issues + +**Symptoms:** + +- Sparse rewards (only at episode end) +- Agent learns wrong behavior +- Need to design reward function +- Want inverse RL + +**Routing:** + +``` +IF reward design questions OR sparse rewards: + → reward-shaping + + Covers: + - Potential-based shaping (provably optimal) + - Subgoal rewards + - Reward engineering principles + - Inverse RL (learn reward from demonstrations) + + Often combined with: + → exploration-strategies (for sparse rewards) +``` + +#### Environment Setup + +**Symptoms:** + +- Need to create custom environment +- Gym API questions +- Vectorization for parallel environments +- Wrappers, preprocessing + +**Routing:** + +``` +IF environment setup questions: + → rl-environments + + Covers: + - Gym API: step(), reset(), observation/action spaces + - Custom environments + - Wrappers (frame stacking, normalization) + - Vectorized environments (parallel rollouts) + - MuJoCo, Atari, custom simulators + + After environment setup, return to algorithm choice +``` + +#### Evaluation Methodology + +**Symptoms:** + +- How to evaluate RL agents? +- Training reward high, test reward low +- Variance in results +- Sample efficiency metrics + +**Routing:** + +``` +IF evaluation questions: + → rl-evaluation + + Covers: + - Deterministic vs stochastic policies + - Multiple seeds, confidence intervals + - Sample efficiency curves + - Generalization testing + - Exploration vs exploitation at test time +``` + +--- + +## Common Multi-Skill Scenarios + +### Scenario: Complete Beginner to RL + +**Routing sequence:** + +1. **rl-foundations** - Understand MDP, value functions, policy gradients +2. **value-based-methods** OR **policy-gradient-methods** - Start with simpler algorithm (DQN or REINFORCE) +3. **rl-debugging** - When things don't work (they won't initially) +4. **rl-environments** - Set up custom environments +5. **rl-evaluation** - Proper evaluation methodology + +### Scenario: Continuous Control (Robotics) + +**Routing sequence:** + +1. **actor-critic-methods** - Primary (SAC for sample efficiency, TD3 for stability) +2. **rl-debugging** - Systematic debugging when training issues arise +3. **exploration-strategies** - If exploration is insufficient +4. **reward-shaping** - If reward is sparse or agent learns wrong behavior +5. **rl-evaluation** - Evaluation on real robot vs simulation + +### Scenario: Offline RL from Dataset + +**Routing sequence:** + +1. **offline-rl** - Primary (CQL, IQL, special considerations) +2. **rl-evaluation** - Evaluation without environment interaction +3. **rl-debugging** - Debugging without online rollouts (limited tools) + +### Scenario: Multi-Agent Cooperative Task + +**Routing sequence:** + +1. **multi-agent-rl** - Primary (QMIX, COMA, centralized training) +2. **reward-shaping** - Team rewards, credit assignment +3. **policy-gradient-methods** - Often used as base algorithm (PPO + MARL) +4. 
**rl-debugging** - Multi-agent debugging (non-stationarity issues) + +### Scenario: Sample-Efficient Learning + +**Routing sequence:** + +1. **actor-critic-methods** (SAC) OR **model-based-rl** (MBPO) +2. **rl-debugging** - Critical to not waste samples on bugs +3. **rl-evaluation** - Track sample efficiency curves + +### Scenario: Sparse Reward Problem + +**Routing sequence:** + +1. **reward-shaping** - Potential-based shaping, subgoal rewards +2. **exploration-strategies** - Curiosity, intrinsic motivation +3. **rl-debugging** - Verify exploration hyperparameters +4. Primary algorithm: **actor-critic-methods** or **policy-gradient-methods** + +--- + +## Rationalization Resistance Table + +| Rationalization | Reality | Counter-Guidance | Red Flag | +|-----------------|---------|------------------|----------| +| "Just use PPO for everything" | PPO is general but not optimal for all cases | "Let's clarify: discrete or continuous actions? Sample efficiency constraints?" | Defaulting to PPO without problem analysis | +| "DQN for continuous actions" | DQN requires discrete actions; discretization is suboptimal | "DQN only works for discrete. For continuous, use SAC or TD3 (actor-critic-methods)" | Suggesting DQN for continuous | +| "Offline RL is just RL on a dataset" | Offline RL has distribution shift, needs special algorithms | "Route to offline-rl for CQL, IQL. Standard algorithms fail on offline data." | Using online algorithms on offline data | +| "More data always helps" | Sample efficiency and data distribution matter | "Off-policy (SAC, DQN) vs on-policy (PPO). Offline needs CQL." | Ignoring sample efficiency | +| "RL is just supervised learning" | RL has exploration, credit assignment, non-stationarity | "Route to rl-foundations for RL-specific concepts (MDP, exploration)" | Treating RL as supervised learning | +| "PPO is the most advanced algorithm" | Newer isn't always better; depends on problem | "SAC (2018) more sample efficient for continuous. DQN (2013) great for discrete." | Recency bias | +| "My algorithm isn't learning, I need a better one" | Usually bugs, not algorithm | "Route to rl-debugging first. Check reward scale, exploration, learning rate." | Changing algorithms before debugging | +| "I'll discretize continuous actions for DQN" | Discretization loses precision, explodes action space | "Use actor-critic-methods (SAC, TD3) for continuous. Don't discretize." | Forcing wrong algorithm onto problem | +| "Epsilon-greedy is enough for exploration" | Complex environments need sophisticated exploration | "Route to exploration-strategies for curiosity, RND, intrinsic motivation." | Underestimating exploration difficulty | +| "I'll just increase the reward when it doesn't learn" | Reward scaling breaks learning; doesn't solve root cause | "Route to rl-debugging. Check if reward scale is the issue, not magnitude." | Arbitrary reward hacking | +| "I can reuse online RL code for offline data" | Offline RL needs conservative algorithms | "Route to offline-rl. CQL/IQL prevent overestimation, online algorithms fail." | Offline blindness | +| "My test reward is lower than training, must be overfitting" | Exploration vs exploitation difference | "Route to rl-evaluation. Training uses exploration, test should be greedy." 
| Misunderstanding RL evaluation | + +--- + +## Red Flags Checklist + +Watch for these signs of incorrect routing: + +- [ ] **Algorithm-First Thinking**: Recommending algorithm before asking about action space, data regime +- [ ] **DQN for Continuous**: Suggesting DQN/Q-learning for continuous action spaces +- [ ] **Offline Blindness**: Not recognizing fixed dataset requires offline-rl (CQL, IQL) +- [ ] **PPO Cargo-Culting**: Defaulting to PPO without considering alternatives +- [ ] **No Problem Characterization**: Not asking: discrete vs continuous? online vs offline? +- [ ] **Skipping Foundations**: Implementing algorithms when user doesn't understand RL basics +- [ ] **Debug-Last**: Suggesting algorithm changes before systematic debugging +- [ ] **Sample Efficiency Ignorance**: Not asking about sample constraints (simulator cost, real robot limits) +- [ ] **Exploration Assumptions**: Assuming epsilon-greedy is sufficient for all problems +- [ ] **Infrastructure Confusion**: Trying to explain Gym API instead of routing to rl-environments +- [ ] **Evaluation Naivety**: Not routing to rl-evaluation for proper methodology + +**If any red flag triggered → STOP → Ask diagnostic questions → Route correctly** + +--- + +## When NOT to Use This Pack + +Clarify boundaries with other packs: + +| User Request | Correct Pack | Reason | +|--------------|--------------|--------| +| "Train classifier on labeled data" | training-optimization | Supervised learning, not RL | +| "Design transformer architecture" | neural-architectures | Architecture design, not RL algorithm | +| "Implement PyTorch autograd" | pytorch-engineering | PyTorch internals, not RL | +| "Deploy model to production" | ml-production | Deployment, not RL training | +| "Fine-tune LLM with RLHF" | llm-specialist | LLM-specific (though uses RL concepts) | +| "Optimize hyperparameters" | training-optimization | Hyperparameter search, not RL | +| "Implement custom CUDA kernel" | pytorch-engineering | Low-level optimization, not RL | + +**Edge case:** RLHF (Reinforcement Learning from Human Feedback) for LLMs uses RL concepts (PPO) but has LLM-specific considerations. Route to **llm-specialist** first; they may reference this pack. + +--- + +## Diagnostic Question Templates + +Use these questions to classify problems: + +### Action Space + +- "What actions can your agent take? Discrete choices or continuous values?" +- "How many possible actions? Small (< 100), large (100-10000), or infinite (continuous)?" + +### Data Regime + +- "Can your agent interact with the environment during training, or do you have a fixed dataset?" +- "Are you learning online (agent tries actions) or offline (from logged data)?" + +### Experience Level + +- "Are you new to RL, or do you have a specific problem?" +- "Do you understand MDPs, value functions, and policy gradients?" + +### Special Requirements + +- "Are multiple agents involved? Do they cooperate or compete?" +- "Is sample efficiency critical? How many episodes can you afford?" +- "Is the reward sparse (only at goal) or dense (every step)?" +- "Do you need the agent to learn a model of the environment?" + +### Infrastructure + +- "Do you have an environment set up, or do you need to create one?" +- "Are you debugging a training issue, or designing from scratch?" +- "How will you evaluate the agent?" + +--- + +## Implementation Process + +When routing to a skill: + +1. **Ask Diagnostic Questions** (don't assume) +2. **Explain Routing Rationale** (teach the user problem classification) +3. 
**Route to Primary Skill(s)** (1-3 skills for multi-faceted problems) +4. **Mention Related Skills** (user may need later) +5. **Set Expectations** (what the skill will cover) + +**Example:** + +> "You mentioned continuous joint angles for a robot arm. This is a **continuous action space**, which means DQN won't work (it requires discrete actions). +> +> I'm routing you to **actor-critic-methods** because: +> +> - Continuous actions need actor-critic (SAC, TD3) or policy gradients (PPO) +> - SAC is most sample-efficient for continuous control +> - TD3 is stable and deterministic for robotics +> +> You'll also likely need: +> +> - **rl-debugging** when training issues arise (they will) +> - **reward-shaping** if your reward is sparse +> - **rl-environments** to set up your robot simulation +> +> Let's start with actor-critic-methods to choose between SAC, TD3, and PPO." + +--- + +## Summary: Routing Decision Tree + +``` +START: RL problem + +├─ Need foundations? (new to RL, confused about concepts) +│ └─ → rl-foundations +│ +├─ DISCRETE actions? +│ ├─ Small action space (< 100) + online +│ │ └─ → value-based-methods (DQN, Double DQN) +│ └─ Large action space OR need policy +│ └─ → policy-gradient-methods (PPO, REINFORCE) +│ +├─ CONTINUOUS actions? +│ ├─ Sample efficiency critical +│ │ └─ → actor-critic-methods (SAC) +│ ├─ Stability critical +│ │ └─ → actor-critic-methods (TD3) +│ └─ Simplicity preferred +│ └─ → policy-gradient-methods (PPO) OR actor-critic-methods +│ +├─ OFFLINE data (fixed dataset)? +│ └─ → offline-rl (CQL, IQL) [CRITICAL: not standard algorithms] +│ +├─ MULTI-AGENT? +│ └─ → multi-agent-rl (QMIX, MADDPG) +│ +├─ Sample efficiency EXTREME? +│ └─ → model-based-rl (MBPO, Dreamer) +│ +├─ DEBUGGING issues? +│ ├─ Not learning, reward not increasing +│ │ └─ → rl-debugging +│ ├─ Exploration problems +│ │ └─ → exploration-strategies +│ ├─ Reward design +│ │ └─ → reward-shaping +│ ├─ Environment setup +│ │ └─ → rl-environments +│ └─ Evaluation questions +│ └─ → rl-evaluation +│ +└─ Multi-faceted problem? + └─ Route to 2-3 skills (primary + supporting) +``` + +--- + +## Final Reminders + +- **Problem characterization BEFORE algorithm selection** +- **DQN for discrete ONLY** (never continuous) +- **Offline data needs offline-rl** (CQL, IQL) +- **PPO is not universal** (good general-purpose, not optimal everywhere) +- **Debug before changing algorithms** (route to rl-debugging) +- **Ask questions, don't assume** (action space? data regime?) + +This meta-skill is your routing hub. **Route decisively, explain clearly, teach problem classification.** + +--- + +## Deep RL Specialist Skills + +After routing, load the appropriate specialist skill for detailed guidance: + +1. [rl-foundations.md](rl-foundations.md) - MDP formulation, Bellman equations, value vs policy basics +2. [value-based-methods.md](value-based-methods.md) - Q-learning, DQN, Double DQN, Dueling DQN, Rainbow +3. [policy-gradient-methods.md](policy-gradient-methods.md) - REINFORCE, PPO, TRPO, policy optimization +4. [actor-critic-methods.md](actor-critic-methods.md) - A2C, A3C, SAC, TD3, advantage functions +5. [model-based-rl.md](model-based-rl.md) - World models, Dyna, MBPO, planning with learned models +6. [offline-rl.md](offline-rl.md) - Batch RL, CQL, IQL, learning from fixed datasets +7. [multi-agent-rl.md](multi-agent-rl.md) - MARL, cooperative/competitive, communication +8. [exploration-strategies.md](exploration-strategies.md) - ε-greedy, UCB, curiosity, RND, intrinsic motivation +9. 
[reward-shaping-engineering.md](reward-shaping-engineering.md) - Reward design, potential-based shaping, inverse RL +10. [rl-debugging.md](rl-debugging.md) - Common RL bugs, why not learning, systematic debugging +11. [rl-environments.md](rl-environments.md) - Gym, MuJoCo, custom envs, wrappers, vectorization +12. [rl-evaluation.md](rl-evaluation.md) - Evaluation methodology, variance, sample efficiency metrics diff --git a/skills/using-deep-rl/actor-critic-methods.md b/skills/using-deep-rl/actor-critic-methods.md new file mode 100644 index 0000000..435b52b --- /dev/null +++ b/skills/using-deep-rl/actor-critic-methods.md @@ -0,0 +1,1735 @@ + +# Actor-Critic Methods + +## When to Use This Skill + +Invoke this skill when you encounter: + +- **Algorithm Selection**: "Should I use actor-critic for my continuous control problem?" +- **SAC Implementation**: User implementing SAC and needs guidance on entropy coefficient tuning +- **TD3 Confusion**: "Why does TD3 have twin critics and delayed updates?" +- **Training Instability**: "My actor-critic diverges. How do I stabilize it?" +- **A2C/A3C Questions**: "What's the difference between A2C and A3C?" +- **Continuous Control**: User has continuous action space and needs appropriate algorithm +- **Critic Issues**: "My critic loss isn't decreasing" or "Advantage estimates are wrong" +- **SAC vs TD3**: "Which algorithm should I use for my problem?" +- **Entropy Tuning**: "How do I set the entropy coefficient α in SAC?" +- **Policy Gradient Variance**: "My policy gradients are too noisy. How do I reduce variance?" +- **Implementation Bugs**: Critic divergence, actor-critic synchronization, target network staleness +- **Continuous Action Handling**: Tanh squashing, log determinant Jacobian, action scaling + +**This skill provides practical guidance for continuous action space RL using actor-critic methods.** + +Do NOT use this skill for: + +- Discrete action spaces (route to value-based-methods for Q-learning/DQN) +- Pure policy gradient without value baseline (route to policy-gradient-methods) +- Model-based RL (route to model-based-rl) +- Offline RL (route to offline-rl-methods) +- Theory foundations (route to rl-foundations) + + +## Core Principle + +**Actor-critic methods achieve the best of both worlds: a policy (actor) for action selection guided by a value function (critic) for stable learning. They dominate continuous control because they're designed for infinite action spaces and provide sample-efficient learning through variance reduction.** + +Key insight: Continuous control has infinite actions to explore. Value-based methods (compare all action values) are infeasible. Policy gradient methods (directly optimize policy) have high variance. 
**Actor-critic solves this: policy directly outputs action distribution (actor), value function provides stable baseline (critic) to reduce variance.** + +Use them for: + +- Continuous control (robot arms, locomotion, vehicle control) +- High-dimensional action spaces (continuous angles, forces, velocities) +- Sample-efficient learning from sparse experiences +- Problems requiring exploration via stochastic policies +- Continuous state/action MDPs (deterministic or stochastic environments) + +**Do not use for**: + +- Discrete small action spaces (too slow compared to DQN) +- Imitation learning focused on behavior cloning (use behavior cloning directly) +- Very high-dimensional continuous spaces without careful design (curse of dimensionality) +- Planning-focused problems (route to model-based methods) + + +## Part 1: Actor-Critic Foundations + +### From Policy Gradient to Actor-Critic + +You understand policy gradient from policy-gradient-methods. Actor-critic extends it with **a value baseline to reduce variance**. + +**Pure Policy Gradient (REINFORCE)**: + +``` +∇J = E_τ[∇log π(a|s) * G_t] +``` + +**Problem**: G_t (cumulative future reward) has high variance. All rollouts pulled toward average. Noisy gradients = slow learning. + +**Actor-Critic Solution**: + +``` +∇J = E_τ[∇log π(a|s) * (G_t - V(s))] + = E_τ[∇log π(a|s) * A(s,a)] + +where: +- Actor: π(a|s) = policy (action distribution) +- Critic: V(s) = value function (baseline) +- Advantage: A(s,a) = G_t - V(s) = "how much better than average" +``` + +**Why baseline helps**: + +``` +Without baseline: policy gradients = [+10, -2, +5, -3, -1] (noisy, high variance) +With baseline (subtract mean=2): [+8, -4, +3, -5, -3] (same direction, but cleaner relative to baseline) + +Result: Gradient points in same direction (increase high G, decrease low G) but with MUCH lower variance. +This reduces sample complexity significantly. +``` + +### Advantage Estimation + +The core of actor-critic is **accurate advantage estimation**: + +``` +A(s,a) = Q(s,a) - V(s) + = E[r + γV(s')] - V(s) + = E[r + γV(s') - V(s)] +``` + +**Key insight**: Advantage = "by how much does taking action a in state s beat the average for this state?" + +**Three ways to estimate advantage**: + +**1. Monte Carlo (full return)**: + +```python +G_t = r_t + γr_{t+1} + γ²r_{t+2} + ... (full rollout) +A(s,a) = G_t - V(s) +``` + +- Unbiased but high variance +- Requires complete episodes or long horizons + +**2. TD(0) (one-step bootstrap)**: + +```python +A(s,a) = r + γV(s') - V(s) +``` + +- Low variance but biased (depends on critic accuracy) +- One-step lookahead only +- If V(s') is wrong, advantage is wrong + +**3. GAE - Generalized Advantage Estimation** (best practice): + +```python +A_t = δ_t + (γλ)δ_{t+1} + (γλ)²δ_{t+2} + ... +δ_t = r_t + γV(s_{t+1}) - V(s_t) [TD error] + +λ ∈ [0,1] trades off bias-variance: +- λ=0: pure TD(0) (low variance, high bias) +- λ=1: pure MC (high variance, low bias) +- λ=0.95: sweet spot (good tradeoff) +``` + +**Why GAE**: Exponentially decaying trace over multiple steps. Reduces variance without full MC, reduces bias without pure TD. + + +### Actor-Critic Pitfall #1: Critic Not Learning Properly + +**Scenario**: User trains actor-critic but critic loss doesn't decrease. Actor improves, but value function plateaus. Agent can't use accurate advantage estimates. + +**Problem**: + +```python +# WRONG - critic loss computed incorrectly +critic_loss = mean((V(s) - G_t)^2) # Wrong target! 
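# (G_t here is the raw Monte Carlo return; the bootstrap target r + gamma * V(s').detach(), shown in the corrected version below, is what the critic should regress toward)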
+critic_loss.backward() +``` + +**The bug**: Critic should learn Bellman equation: + +``` +V(s) = E[r + γV(s')] +``` + +If you compute target as G_t directly, you're using Monte Carlo returns (too noisy). If you use r + γV(s'), you're bootstrapping properly. + +**Correct approach**: + +```python +# RIGHT - Bellman bootstrap target +V_target = r + gamma * V(s').detach() # Detach next state value! +critic_loss = mean((V(s) - V_target)^2) +``` + +**Why detach() matters**: If you don't detach V(s'), gradient flows backward through value function, creating a moving target problem. + +**Red Flag**: If critic loss doesn't decrease while actor loss decreases, critic isn't learning Bellman equation. Check: + +1. Target computation (should be r + γV(s'), not G_t alone) +2. Detach next state value +3. Critic network is separate from actor +4. Different learning rates (critic typically higher than actor) + + +### Critic as Baseline vs Critic as Q-Function + +**Important distinction**: + +**A2C uses critic as baseline**: + +``` +V(s) = value of being in state s +A(s,a) = r + γV(s') - V(s) [TD advantage] +Policy loss = -∇log π(a|s) * A(s,a) +``` + +**SAC/TD3 use critic as Q-function**: + +``` +Q(s,a) = expected return from taking action a in s +A(s,a) = Q(s,a) - V(s) +Policy loss = ∇log π(a|s) * Q(s,a) [deterministic policy gradient] +``` + +**Why the difference**: A2C updates actor and critic together (on-policy). SAC/TD3 decouple them (off-policy): + +- Actor never sees the replay buffer +- Critic learns Q from replay buffer +- Actor uses critic's Q estimate (always lagging slightly) + + +## Part 2: A2C - Advantage Actor-Critic + +### A2C Architecture + +A2C = on-policy advantage actor-critic. Actor and critic train simultaneously with synchronized rollouts: + +``` +┌─────────────────────────────────────────┐ +│ Environment │ +└────────────┬────────────────────────────┘ + │ states, rewards + ▼ +┌─────────────────────────────────────────┐ +│ Actor π(a|s) Critic V(s) │ +│ Policy network Value network │ +│ Outputs: action Outputs: value │ +└────────┬──────────────────┬─────────────┘ + │ │ + └──────┬───────────┘ + │ + ┌──────▼───────────┐ + │ Advantage │ + │ A(s,a) = r+γV(s')-V(s) + └────────┬─────────┘ + │ + ┌────────▼────────────┐ + │ Actor Loss: │ + │ -log π(a|s) * A(s,a) + │ │ + │ Critic Loss: │ + │ (V(s) - target)² │ + └─────────────────────┘ +``` + +### A2C Training Loop + +```python +for episode in range(num_episodes): + states, actions, rewards, values = [], [], [], [] + + state = env.reset() + for t in range(horizon): + # Actor samples action from policy + action = actor(state) + + # Step environment + next_state, reward = env.step(action) + + # Get value estimate (baseline) + value = critic(state) + + # Store for advantage computation + states.append(state) + actions.append(action) + rewards.append(reward) + values.append(value) + + state = next_state + + # Advantage estimation (GAE) + advantages = compute_gae(rewards, values, next_value, gamma, lambda) + + # Actor loss (policy gradient with baseline) + actor_loss = -log_prob(actions, actor(states)) * advantages + actor.update(actor_loss) + + # Critic loss (value function learning) + critic_targets = rewards + gamma * values[1:] + gamma * critic(next_state) + critic_loss = (critic(states) - critic_targets)^2 + critic.update(critic_loss) +``` + +### A2C vs A3C + +**A2C**: Synchronous - all parallel workers update at same time (cleaner, deterministic) + +``` +Worker 1 ────┐ +Worker 2 ────┼──► Global Model Update ──► All workers receive updated 
weights +Worker 3 ────┤ +Worker N ────┘ +Wait for all workers before next update +``` + +**A3C**: Asynchronous - workers update whenever they finish (faster wall clock time, messier) + +``` +Worker 1 ──► Update (1) ──► Continue +Worker 2 ──────► Update (2) ──────► Continue +Worker 3 ──────────► Update (3) ──────► Continue +No synchronization barrier (race conditions possible) +``` + +**In practice**: A2C is preferred. A3C was important historically (enables multi-GPU training without synchronization) but A2C is cleaner. + + +## Part 3: SAC - Soft Actor-Critic + +### SAC Overview + +SAC = Soft Actor-Critic. The current SOTA (state-of-the-art) for continuous control. Three key innovations: + +1. **Entropy regularization**: Add H(π(·|s)) to objective (maximize entropy + reward) +2. **Auto-tuning entropy coefficient**: Learn α automatically (no manual tuning!) +3. **Off-policy learning**: Learn from replay buffer (sample efficient) + +### SAC's Objective Function + +Standard policy gradient maximizes: + +``` +J(π) = E[G_t] +``` + +SAC maximizes: + +``` +J(π) = E[G_t + α H(π(·|s))] + = E[G_t] + α E[H(π(·|s))] +``` + +**Where**: + +- G_t = cumulative reward +- H(π(·|s)) = policy entropy (randomness) +- α = entropy coefficient (how much we value exploration) + +**Why entropy**: Exploratory policies (high entropy) discover better strategies. Adding entropy to objective = agent explores automatically. + +### SAC Components + +``` +┌─────────────────────────────────────┐ +│ Replay Buffer (off-policy data) │ +└────────────┬────────────────────────┘ + │ sample batch + ▼ + ┌────────────────────────┐ + │ Actor Network │ + │ π(a|s) = μ(s) + σ(s) │ (Gaussian policy) + │ Outputs: mean, std │ + └────────────────────────┘ + │ + ▼ + ┌────────────────────────┐ + │ Two Critic Networks │ + │ Q1(s,a), Q2(s,a) │ + │ Learn Q-values │ + └────────────────────────┘ + │ + ▼ + ┌────────────────────────┐ + │ Target Networks │ + │ Q_target1, Q_target2 │ + │ (updated every N) │ + └────────────────────────┘ + │ + ▼ + ┌────────────────────────┐ + │ Entropy Coefficient │ + │ α (learned!) │ + └────────────────────────┘ +``` + +### SAC Training Algorithm + +```python +# Initialize +actor = ActorNetwork() +critic1, critic2 = CriticNetwork(), CriticNetwork() +target_critic1, target_critic2 = copy(critic1), copy(critic2) +entropy_alpha = 1.0 # Learned! +target_entropy = -action_dim # Target entropy (usually -action_dim) + +for step in range(num_steps): + # 1. Collect data (could be online or from buffer) + state = env.reset() if step % 1000 == 0 else next_state + action = actor.sample(state) # π(a|s) + next_state, reward = env.step(action) + replay_buffer.add(state, action, reward, next_state, done) + + # 2. Sample batch from replay buffer + batch = replay_buffer.sample(batch_size=256) + states, actions, rewards, next_states, dones = batch + + # 3. 
Critic update (Q-function learning) + # Compute target Q value using entropy-regularized objective + next_actions = actor.sample(next_states) + next_log_probs = actor.log_prob(next_actions, next_states) + + # Use BOTH target critics, take minimum (overestimation prevention) + Q_target1 = target_critic1(next_states, next_actions) + Q_target2 = target_critic2(next_states, next_actions) + Q_target = min(Q_target1, Q_target2) + + # Entropy-regularized target + y = reward + γ(1 - done) * (Q_target - α * next_log_probs) + + # Update both critics + critic1_loss = MSE(critic1(states, actions), y) + critic1.update(critic1_loss) + + critic2_loss = MSE(critic2(states, actions), y) + critic2.update(critic2_loss) + + # 4. Actor update (policy gradient with entropy) + # Reparameterization trick: sample actions, compute log probs + sampled_actions = actor.sample(states) + sampled_log_probs = actor.log_prob(sampled_actions, states) + + # Actor maximizes Q - α*log_prob (entropy regularization) + Q1_sampled = critic1(states, sampled_actions) + Q2_sampled = critic2(states, sampled_actions) + Q_sampled = min(Q1_sampled, Q2_sampled) + + actor_loss = -E[Q_sampled - α * sampled_log_probs] + actor.update(actor_loss) + + # 5. Entropy coefficient auto-tuning (SAC's KEY INNOVATION) + # Learn α to maintain target entropy + entropy_loss = -α * (sampled_log_probs + target_entropy) + alpha.update(entropy_loss) + + # 6. Soft update target networks (every N steps) + if step % update_frequency == 0: + target_critic1 = τ * critic1 + (1-τ) * target_critic1 + target_critic2 = τ * critic2 + (1-τ) * target_critic2 +``` + +### SAC Pitfall #1: Manual Entropy Coefficient + +**Scenario**: User implements SAC but manually sets α=0.2 and training diverges. Agent explores randomly and never improves. + +**Problem**: SAC's entire design is that α is **learned automatically**. Setting it manually defeats the purpose. + +```python +# WRONG - treating α as fixed hyperparameter +alpha = 0.2 # Fixed! +loss = Q_target - 0.2 * log_prob # Same penalty regardless of entropy + +# Result: If entropy naturally low, penalty still high → policy forced random +# If entropy naturally high, penalty too weak → insufficient exploration +``` + +**Correct approach**: + +```python +# RIGHT - α is learned via entropy constraint +target_entropy = -action_dim # For Gaussian: typically -action_dim + +# Optimize α to maintain target entropy +entropy_loss = -α * (sampled_log_probs.detach() + target_entropy) +alpha_optimizer.zero_grad() +entropy_loss.backward() +alpha_optimizer.step() + +# α adjusts automatically: +# - If entropy too high: α increases (more penalty) → policy becomes more deterministic +# - If entropy too low: α decreases (less penalty) → policy explores more +``` + +**Red Flag**: If SAC agent explores randomly without improving, check: + +1. Is α being optimized? (not fixed value) +2. Is target entropy set correctly? (usually -action_dim) +3. Is log_prob computed with squashed action (after tanh)? + + +### SAC Pitfall #2: Tanh Squashing and Log Probability + +**Scenario**: User implements SAC with Gaussian policy but uses policy directly. Log probabilities are computed wrong. Training is unstable. + +**Problem**: SAC uses tanh squashing to bound actions: + +``` +Raw action from network: μ(s) + σ(s)*ε, ε~N(0,1) → unbounded +Tanh squashed: a = tanh(raw_action) → bounded in [-1,1] +``` + +But policy probability must account for this transformation: + +``` +π(a|s) ≠ N(μ(s), σ²(s)) [Wrong! 
Ignores tanh] +π(a|s) = |det(∂a/∂raw_action)|^(-1) * N(μ(s), σ²(s)) + = (1 - a²)^2 * N(μ(s), σ²(s)) [Right! Jacobian correction] + +log π(a|s) = log N(μ(s), σ²(s)) - 2*log(1 - a²) +``` + +**The bug**: Computing log_prob without Jacobian correction: + +```python +# WRONG +log_prob = normal.log_prob(raw_action) - log(1 + exp(-2*x)) +# (standard normal log prob, ignores squashing) + +# RIGHT +log_prob = normal.log_prob(raw_action) - log(1 + exp(-2*x)) +log_prob = log_prob - 2 * (log(2) - x - softplus(-2*x)) # Add Jacobian term +``` + +Or simpler: + +```python +# PyTorch way +dist = Normal(mu, sigma) +raw_action = dist.rsample() # Reparameterized sample +action = torch.tanh(raw_action) +log_prob = dist.log_prob(raw_action) - torch.log(1 - action.pow(2) + 1e-6).sum(-1) +``` + +**Red Flag**: If SAC policy doesn't learn despite updates, check: + +1. Are actions being squashed (tanh)? +2. Is log_prob computed with tanh Jacobian term? +3. Is squashing adjustment in entropy coefficient update? + + +### SAC Pitfall #3: Two Critics and Target Networks + +**Scenario**: User implements SAC with one critic and gets unstable learning. "I thought SAC just needed entropy regularization?" + +**Problem**: SAC uses TWO critics because of Q-function overestimation: + +``` +Single critic Q(s,a): +- Targets computed as: y = r + γQ_target(s', a') +- Q_target is function of Q (updated less frequently) +- In continuous space, selecting actions via max isn't feasible +- Next action sampled from π (deterministic max removed) +- But Q-values can still overestimate (stochastic environment noise) + +Two critics (clipped double Q-learning): +- Use both Q1 and Q2, take minimum: Q_target = min(Q1_target, Q2_target) +- Prevents overestimation (conservative estimate) +- Both updated simultaneously +- Asymmetric: both learn, but target uses minimum +``` + +**Correct implementation**: + +```python +# WRONG - one critic +target = reward + gamma * critic_target(next_state, next_action) + +# RIGHT - two critics with min +Q1_target = critic1_target(next_state, next_action) +Q2_target = critic2_target(next_state, next_action) +target = reward + gamma * min(Q1_target, Q2_target) + +# Both critics learn +critic1_loss = MSE(critic1(state, action), target) +critic2_loss = MSE(critic2(state, action), target) + +# But actor only uses critic1 (or min of both) +Q_current = min(critic1(state, sampled_action), critic2(state, sampled_action)) +actor_loss = -(Q_current - alpha * log_prob) +``` + +**Red Flag**: If SAC diverges, check: + +1. Are there two Q-networks? +2. Does target use min(Q1, Q2)? +3. Are target networks updated (soft or hard)? + + +## Part 4: TD3 - Twin Delayed DDPG + +### Why TD3 Exists + +TD3 = Twin Delayed DDPG. It addresses SAC's cost (two networks, more computation) with deterministic policy gradient (simpler). + +**DDPG** (older): Deterministic policy, single Q-network, no entropy. Fast but unstable. + +**TD3** (newer): Three tricks to stabilize DDPG: + +1. **Twin critics**: Two Q-networks (clipped double Q-learning) +2. **Delayed actor updates**: Update actor every d steps (not every step) +3. 
**Target policy smoothing**: Add noise to target action before Q evaluation + +### TD3 Architecture + +``` +┌──────────────────────────────────┐ +│ Replay Buffer │ +└────────────┬─────────────────────┘ + │ + ▼ + ┌───────────────────────┐ + │ Actor μ(s) │ + │ Deterministic policy │ + │ Outputs: action │ + └───────────────────────┘ + │ + ▼ + ┌─────────────────────────┐ + │ Q1(s,a), Q2(s,a) │ + │ Two Q-networks │ + │ (Triple: original+2) │ + └─────────────────────────┘ + │ + ▼ + ┌─────────────────────────┐ + │ Delayed Actor Update │ + │ (every d steps) │ + └─────────────────────────┘ +``` + +### TD3 Training Algorithm + +```python +for step in range(num_steps): + # 1. Collect data + action = actor(state) + exploration_noise + next_state, reward = env.step(action) + replay_buffer.add(state, action, reward, next_state, done) + + if step < min_steps_before_training: + continue + + batch = replay_buffer.sample(batch_size) + states, actions, rewards, next_states, dones = batch + + # 2. Critic update (BOTH Q-networks) + # Trick #3: Target policy smoothing + next_actions = actor_target(next_states) + noise = torch.randn_like(next_actions) * target_noise + noise = torch.clamp(noise, -noise_clip, noise_clip) + next_actions = torch.clamp(next_actions + noise, -1, 1) # Add noise, clip + + # Clipped double Q-learning: use minimum + Q1_target = critic1_target(next_states, next_actions) + Q2_target = critic2_target(next_states, next_actions) + Q_target = torch.min(Q1_target, Q2_target) + + y = rewards + gamma * (1 - dones) * Q_target + + # Update both critics + critic1_loss = MSE(critic1(states, actions), y) + critic1_optimizer.zero_grad() + critic1_loss.backward() + critic1_optimizer.step() + + critic2_loss = MSE(critic2(states, actions), y) + critic2_optimizer.zero_grad() + critic2_loss.backward() + critic2_optimizer.step() + + # 3. Delayed actor update (Trick #2) + if step % policy_delay == 0: + # Deterministic policy gradient + Q1_current = critic1(states, actor(states)) + actor_loss = -Q1_current.mean() + + actor_optimizer.zero_grad() + actor_loss.backward() + actor_optimizer.step() + + # Soft update target networks + for param, target_param in zip(critic1.parameters(), critic1_target.parameters()): + target_param.data.copy_(tau * param.data + (1-tau) * target_param.data) + for param, target_param in zip(critic2.parameters(), critic2_target.parameters()): + target_param.data.copy_(tau * param.data + (1-tau) * target_param.data) + for param, target_param in zip(actor.parameters(), actor_target.parameters()): + target_param.data.copy_(tau * param.data + (1-tau) * target_param.data) +``` + +### TD3 Pitfall #1: Missing Target Policy Smoothing + +**Scenario**: User implements TD3 with twin critics and delayed updates but training still unstable. "I have two critics, why isn't it stable?" + +**Problem**: Target policy smoothing is critical. Without it: + +``` +Next action = deterministic μ_target(s') [exact, no exploration noise] + +If Q-networks overestimate for certain actions: +- Target policy always selects that exact action +- Q-target biased high for that action +- Feedback loop: overestimation → more value → policy selects it more → more overestimation +``` + +With smoothing: + +``` +Next action = μ_target(s') + ε_smoothing +- Adds small random noise to target action +- Prevents exploitation of Q-estimation errors +- Breaks feedback loop by adding randomness to target action + +Important: Noise is added at TARGET action, not current action! 
+- Current: exploration_noise (for exploration during collection) +- Target: target_noise (for stability, noise clip small) +``` + +**Correct implementation**: + +```python +# Trick #3: Target policy smoothing +next_actions = actor_target(next_states) +noise = torch.randn_like(next_actions) * target_policy_noise +noise = torch.clamp(noise, -noise_clip, noise_clip) +next_actions = torch.clamp(next_actions + noise, -1, 1) + +# Then use these noisy actions for Q-target +Q_target = min(Q1_target(next_states, next_actions), + Q2_target(next_states, next_actions)) +``` + +**Red Flag**: If TD3 diverges despite two critics, check: + +1. Is noise added to target action (not just actor output)? +2. Is noise clipped (noise_clip prevents too much noise)? +3. Are critic targets using smoothed actions? + + +### TD3 Pitfall #2: Delayed Actor Updates + +**Scenario**: User implements TD3 with target policy smoothing and twin critics, but updates actor every step. "Do I really need delayed updates?" + +**Problem**: Policy updates change actor, which changes actions chosen. If you update actor every step while critics are learning: + +``` +Step 1: Actor outputs a1, Q(s,a1) = 5, Actor updated +Step 2: Actor outputs a2, Q(s,a2) = 3, Actor wants to stay at a1 +Step 3: Critics haven't converged, oscillate between a1 and a2 +Result: Actor chases moving target, training unstable +``` + +With delayed updates: + +``` +Steps 1-4: Update critics only, let them converge +Step 5: Update actor (once per policy_delay=5) +Steps 6-9: Update critics only +Step 10: Update actor again +Result: Critic stabilizes before actor changes, smoother learning +``` + +**Typical settings**: + +```python +policy_delay = 2 # Update actor every 2 critic updates +# or +policy_delay = 5 # More conservative, every 5 critic updates +``` + +**Correct implementation**: + +```python +if step % policy_delay == 0: # Only sometimes! + actor_loss = -critic1(state, actor(state)).mean() + actor_optimizer.zero_grad() + actor_loss.backward() + actor_optimizer.step() + + # Update targets on same schedule + soft_update(critic1_target, critic1) + soft_update(critic2_target, critic2) + soft_update(actor_target, actor) +``` + +**Red Flag**: If TD3 training unstable, check: + +1. Is actor updated only every policy_delay steps? +2. Are target networks updated on same schedule (policy_delay)? +3. Policy_delay typically 2-5 + + +### SAC vs TD3 Decision Framework + +**Both are SOTA for continuous control. How to choose?** + +| Aspect | SAC | TD3 | +|--------|-----|-----| +| **Policy Type** | Stochastic (Gaussian) | Deterministic | +| **Exploration** | Entropy maximization (automatic) | Target policy smoothing | +| **Sample Efficiency** | High (two critics) | High (two critics) | +| **Stability** | Very stable (entropy helps) | Stable (three tricks) | +| **Computation** | Higher (entropy tuning) | Slightly lower | +| **Manual Tuning** | Minimal (α auto-tuned) | Moderate (policy_delay, noise) | +| **When to Use** | Default choice, off-policy | When deterministic better, simpler noise | + +**Decision tree**: + +1. **Do you prefer stochastic or deterministic policy?** + - Stochastic (multiple possible actions per state) → SAC + - Deterministic (one action per state) → TD3 + +2. **Sample efficiency critical?** + - Yes, limited data → Both good, slight edge SAC + - No, lots of data → Either works + +3. **How much tuning tolerance?** + - Want minimal tuning → SAC (α auto-tuned) + - Don't mind tuning policy_delay, noise → TD3 (simpler conceptually) + +4. 
**Exploration challenges?** + - Complex exploration (entropy helps) → SAC + - Simple exploration (policy smoothing enough) → TD3 + +**Practical recommendation**: Start with SAC. It's more robust (entropy auto-tuning). Switch to TD3 only if you: + +- Know you want deterministic policy +- Have tuning expertise for policy_delay +- Need slightly faster computation + + +## Part 5: Continuous Action Handling + +### Gaussian Policy Representation + +Actor outputs **mean and standard deviation**: + +```python +raw_output = actor_network(state) +mu = raw_output[:, :action_dim] +log_std = raw_output[:, action_dim:] +log_std = torch.clamp(log_std, min=log_std_min, max=log_std_max) +std = log_std.exp() + +dist = Normal(mu, std) +raw_action = dist.rsample() # Reparameterized sample +``` + +**Why log(std)?**: Parameterize log scale instead of scale directly. + +- Numerical stability (log prevents underflow) +- Gradient flow smoother +- Prevents std from becoming negative + +**Why clamp log_std?**: Prevents std from becoming too small or large. + +- Too small: policy becomes deterministic, no exploration +- Too large: policy becomes random, no learning + +Typical ranges: + +```python +log_std_min = -20 # std >= exp(-20) ≈ 2e-9 (small exploration) +log_std_max = 2 # std <= exp(2) ≈ 7.4 (max randomness) +``` + +### Continuous Action Squashing (Tanh) + +Raw network output unbounded. Use tanh to bound to [-1,1]: + +```python +# After sampling from policy +action = torch.tanh(raw_action) +# action now in [-1, 1] + +# Scale to environment action range [low, high] +action_scaled = (high - low) / 2 * action + (high + low) / 2 +``` + +**Pitfall**: Log probability must account for squashing (already covered in SAC section). + +### Exploration Noise in Continuous Control + +**Off-policy methods** (SAC, TD3) need exploration during data collection: + +**Method 1: Action space noise** (simpler): + +```python +action = actor(state) + noise +noise = torch.randn_like(action) * exploration_std +action = torch.clamp(action, -1, 1) # Ensure in bounds +``` + +**Method 2: Parameter noise** (more complex): + +``` +Add noise to actor network weights periodically +Action = actor_with_noisy_weights(state) +Results in correlated action noise across timesteps (more natural exploration) +``` + +**Typical settings**: + +```python +# For SAC: exploration_std = 0.1 * max_action +# For TD3: exploration_std starts high, decays over time +``` + + +## Part 6: Common Bugs and Debugging + +### Bug #1: Critic Divergence + +**Symptom**: Critic loss explodes, V(s) becomes huge (1e6+), agent breaks. + +**Causes**: + +1. **Wrong target computation**: Using wrong Bellman target +2. **No gradient clipping**: Gradients unstable +3. **Learning rate too high**: Critic overshoots +4. **Value targets too large**: Reward scale not normalized + +**Diagnosis**: + +```python +# Check target computation +print("Reward range:", rewards.min(), rewards.max()) +print("V(s) range:", v_current.min(), v_current.max()) +print("Target range:", v_target.min(), v_target.max()) + +# Plot value function over time +plt.plot(v_values_history) # Should slowly increase, not explode + +# Check critic loss +print("Critic loss:", critic_loss.item()) # Should decrease, not diverge +``` + +**Fix**: + +```python +# 1. Reward normalization +rewards = (rewards - rewards.mean()) / (rewards.std() + 1e-8) + +# 2. Gradient clipping +torch.nn.utils.clip_grad_norm_(critic.parameters(), max_norm=1.0) + +# 3. Lower learning rate +critic_lr = 1e-4 # Instead of 1e-3 + +# 4. 
Value function target clipping (optional) +v_target = torch.clamp(v_target, -100, 100) +``` + + +### Bug #2: Actor Not Learning (Constant Policy) + +**Symptom**: Actor loss decreases but policy doesn't change. Same action sampled repeatedly. No improvement in return. + +**Causes**: + +1. **Policy output not properly parameterized**: Mean/std wrong +2. **Critic signal dead**: Q-values all same, no gradient +3. **Learning rate too low**: Actor updates too small +4. **Advantage always zero**: Critic perfect (impossible) or wrong + +**Diagnosis**: + +```python +# Check policy output distribution +actions = [actor.sample(state) for _ in range(1000)] +print("Action std:", np.std(actions)) # Should be >0.01 +print("Action mean:", np.mean(actions)) + +# Check critic signal +q_values = critic(states, random_actions) +print("Q range:", q_values.min(), q_values.max()) +print("Q std:", q_values.std()) # Should have variation + +# Check advantage +advantages = q_values - v_baseline +print("Advantage std:", advantages.std()) # Should be >0 +``` + +**Fix**: + +```python +# 1. Ensure policy outputs have variance +assert log_std.mean() < log_std_max - 0.5 # Not clamped to max +assert log_std.mean() > log_std_min + 0.5 # Not clamped to min + +# 2. Check critic learns +critic_loss should decrease + +# 3. Increase actor learning rate +actor_lr = 3e-4 # Instead of 1e-4 + +# 4. Debug advantage calculation +if advantage.std() < 0.01: + print("WARNING: Advantages have no variation, critic might be wrong") +``` + + +### Bug #3: Entropy Coefficient Divergence (SAC) + +**Symptom**: SAC entropy coefficient α explodes (1e6+), policy becomes completely random, agent stops learning. + +**Cause**: Entropy constraint optimization unstable. + +```python +# WRONG - entropy loss unbounded +entropy_loss = -alpha * (log_probs + target_entropy) +# If log_probs >> target_entropy, loss becomes huge positive, α explodes +``` + +**Fix**: + +```python +# RIGHT - use log(α) to avoid explosion +log_alpha = torch.log(alpha) +log_alpha_loss = -log_alpha * (log_probs.detach() + target_entropy) +alpha_optimizer.zero_grad() +log_alpha_loss.backward() +alpha_optimizer.step() +alpha = log_alpha.exp() + +# Or clip α +alpha = torch.clamp(alpha, min=1e-4, max=10.0) +``` + + +### Bug #4: Target Network Never Updated + +**Symptom**: Agent learns for a bit, then stops improving. Training plateaus. + +**Cause**: Target networks not updated (or updated too rarely). + +```python +# WRONG - never update targets +target_critic = copy(critic) # Initialize once +for step in range(1000000): + # ... training loop ... + # But target_critic never updated! +``` + +**Fix**: + +```python +# RIGHT - soft update every step (or every N steps for delayed methods) +tau = 0.005 # Soft update parameter +for step in range(1000000): + # ... critic update ... + # Soft update targets + for param, target_param in zip(critic.parameters(), target_critic.parameters()): + target_param.data.copy_(tau * param.data + (1-tau) * target_param.data) + +# Or hard update (copy all weights) every N steps +if step % update_frequency == 0: + target_critic = copy(critic) +``` + + +### Bug #5: Gradient Flow Through Detached Tensors + +**Symptom**: Actor loss computation succeeds, but actor parameters don't update. + +**Cause**: Critic detached but actor expects gradients. + +```python +# WRONG +for step in range(1000): + q_value = critic(state, action).detach() # Detached! + actor_loss = -q_value.mean() + actor.update(actor_loss) # Gradient won't flow through q_value! 
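+
+# Why this fails silently: q_value.detach() cuts the computation graph, so
+# actor_loss has zero gradient with respect to the actor's parameters.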
+ +# Result: actor_loss always 0 (constant from q_value.detach()) +# Actor parameters updated but toward constant target (no signal) +``` + +**Fix**: + +```python +# RIGHT - don't detach when computing actor loss +q_value = critic(state, action) # No detach! +actor_loss = -q_value.mean() +actor.update(actor_loss) # Gradient flows through q_value + +# Detach where appropriate: +# - Value targets: v_target = (r + gamma * v_next).detach() +# - Stop gradient in critic: q_target = (r + gamma * q_next.detach()).detach() +# But NOT when computing actor loss +``` + + +## Part 7: When to Use Actor-Critic vs Alternatives + +### Actor-Critic vs Policy Gradient (REINFORCE) + +| Factor | Actor-Critic | Policy Gradient | +|--------|--------------|-----------------| +| **Variance** | Low (baseline reduces) | High (full return) | +| **Sample Efficiency** | High | Low | +| **Convergence Speed** | Fast | Slow | +| **Complexity** | Two networks | One network | +| **Stability** | Better | Worse (high noise) | + +**Use Actor-Critic when**: Continuous actions, sample efficiency matters, training instability + +**Use Policy Gradient when**: Simple problem, don't need value function, prefer simpler code + + +### Actor-Critic vs Q-Learning (DQN) + +| Factor | Actor-Critic | Q-Learning | +|--------|--------------|-----------| +| **Action Space** | Continuous (natural) | Discrete (requires all Q values) | +| **Sample Efficiency** | High | Very high | +| **Stability** | Good | Can diverge (overestimation) | +| **Complexity** | Two networks | One network (but needs tricks) | + +**Use Actor-Critic for**: Continuous actions, robotics, control + +**Use Q-Learning for**: Discrete actions, games, navigation + + +### Actor-Critic (On-Policy A2C) vs Off-Policy (SAC, TD3) + +| Factor | A2C (On-Policy) | SAC/TD3 (Off-Policy) | +|--------|-----------------|---------------------| +| **Sample Efficiency** | Moderate | High (replay buffer) | +| **Stability** | Good | Excellent | +| **Complexity** | Simpler | More complex | +| **Data Reuse** | Limited (one pass) | High (replay buffer) | +| **Parallel Training** | Excellent (A3C) | Limited (off-policy break) | + +**Use A2C when**: Want simplicity, have parallel workers, on-policy is okay + +**Use SAC/TD3 when**: Need sample efficiency, offline data possible, maximum stability + + +## Part 8: Implementation Checklist + +### Pre-Training Checklist + +- [ ] Actor outputs mean and log_std separately +- [ ] Log_std clamped: `log_std_min <= log_std <= log_std_max` +- [ ] Action squashing with tanh (bounded to [-1,1]) +- [ ] Log probability computation includes tanh Jacobian (SAC/A2C) +- [ ] Critic network separate from actor +- [ ] Critic loss is value bootstrap (r + γV(s'), not G_t) +- [ ] Two critics for SAC/TD3 (or one for A2C) +- [ ] Target networks initialized as copies of main networks +- [ ] Replay buffer created (for off-policy methods) +- [ ] Advantage estimation (GAE preferred, MC acceptable) + +### Training Loop Checklist + +- [ ] Data collection uses current actor (not target) +- [ ] Critic updated with Bellman target: `r + γV(s').detach()` +- [ ] Actor updated with advantage signal: `-log_prob(a) * A(s,a)` or `-Q(s,a)` +- [ ] Target networks soft updated: `τ * main + (1-τ) * target` +- [ ] For SAC: entropy coefficient α being optimized +- [ ] For TD3: delayed actor updates (every policy_delay) +- [ ] For TD3: target policy smoothing (noise + clip) +- [ ] Gradient clipping applied if losses explode +- [ ] Learning rates appropriate (critic_lr typically >= actor_lr) +- [ ] 
Reward normalization or clipping applied + +### Debugging Checklist + +- [ ] Critic loss decreasing over time? +- [ ] V(s) and Q(s,a) values in reasonable range? +- [ ] Policy entropy decreasing (exploration → exploitation)? +- [ ] Actor loss decreasing? +- [ ] Return increasing over episodes? +- [ ] No NaN or Inf in losses? +- [ ] Advantage estimates have variation? +- [ ] Policy output std not stuck at min/max? + + +## Part 9: Comprehensive Pitfall Reference + +### 1. Critic Loss Not Decreasing + +- Wrong Bellman target (should be r + γV(s')) +- Critic weights not updating (zero gradients) +- Learning rate too low +- Target network staleness (not updated) + +### 2. Actor Not Improving + +- Critic broken (no signal) +- Advantage estimates all zero +- Actor learning rate too low +- Policy parameterization wrong (no variance) + +### 3. Training Unstable (Divergence) + +- Missing target networks +- Critic loss exploding (wrong target, high learning rate) +- Entropy coefficient exploding (SAC: should be log(α)) +- Actor updates every step (should delay, especially TD3) + +### 4. Policy Stuck at Random Actions (SAC) + +- Manual α fixed (should be auto-tuned) +- Target entropy wrong (should be -action_dim) +- Entropy loss gradient wrong direction + +### 5. Policy Output Clamped to Min/Max Std + +- Log_std range too tight (check log_std_min/max) +- Network initialization pushing to extreme values +- No gradient clipping preventing adjustment + +### 6. Tanh Squashing Ignored + +- Log probability not adjusted for squashing +- Missing Jacobian term in SAC/policy gradient +- Action scaling inconsistent + +### 7. Target Networks Never Updated + +- Forgot to create target networks +- Update function called but not applied +- Update frequency too high (no learning) + +### 8. Off-Policy Break (Experience Replay) + +- Actor training on old data (should use current replay buffer) +- Data distribution shift (actions from old policy) +- Batch importance weights missing (PER) + +### 9. Advantage Estimates Biased + +- GAE parameter λ wrong (should be 0.95-0.99) +- Bootstrap incorrect (wrong value target) +- Critic too inaccurate (overcorrection) + +### 10. Entropy Coefficient Issues (SAC) + +- Manual tuning instead of auto-tuning +- Entropy target not set correctly +- Log(α) optimization not used (causes explosion) + + +## Part 10: Real-World Examples + +### Example 1: SAC for Robotic Arm Control + +**Problem**: Robotic arm needs to reach target position. Continuous joint angles. 
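+
+Both worked examples below assume target copies of the critic networks (and, for TD3, the actor) plus a `soft_update` helper that is not shown. A minimal sketch of that helper, matching the call signature used below (target network first, Polyak rate `tau`; names are illustrative):
+
+```python
+import torch
+
+def soft_update(target_net: torch.nn.Module, main_net: torch.nn.Module, tau: float = 0.005):
+    """Polyak-average the main network's weights into the target network, in place."""
+    with torch.no_grad():
+        for target_param, param in zip(target_net.parameters(), main_net.parameters()):
+            target_param.data.copy_(tau * param.data + (1.0 - tau) * target_param.data)
+```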
+ +**Setup**: + +```python +state_dim = 18 # 6 joint angles + velocities +action_dim = 6 # Joint torques +action_range = [-1, 1] # Normalized + +actor = ActorNetwork(state_dim, action_dim) # Outputs μ, log_std +critic1 = CriticNetwork(state_dim, action_dim) +critic2 = CriticNetwork(state_dim, action_dim) + +target_entropy = -action_dim # -6 +alpha = 1.0 +``` + +**Training**: + +```python +for step in range(1000000): + # Collect experience + state = env.reset() if done else next_state + action = actor.sample(state) + next_state, reward, done = env.step(action) + replay_buffer.add(state, action, reward, next_state, done) + + if len(replay_buffer) < min_buffer_size: + continue + + batch = replay_buffer.sample(256) + + # Critic update + next_actions = actor.sample(batch.next_states) + next_log_probs = actor.log_prob(next_actions, batch.next_states) + q1_target = target_critic1(batch.next_states, next_actions) + q2_target = target_critic2(batch.next_states, next_actions) + target = batch.rewards + gamma * (1-batch.dones) * ( + torch.min(q1_target, q2_target) - alpha * next_log_probs + ) + + critic1_loss = MSE(critic1(batch.states, batch.actions), target) + critic2_loss = MSE(critic2(batch.states, batch.actions), target) + + # Actor update + actions = actor.sample(batch.states) + log_probs = actor.log_prob(actions, batch.states) + q_values = torch.min( + critic1(batch.states, actions), + critic2(batch.states, actions) + ) + actor_loss = (alpha * log_probs - q_values).mean() + + # Entropy coefficient update + entropy_loss = -alpha * (log_probs.detach() + target_entropy).mean() + + # Optimize + critic1_optimizer.step(critic1_loss) + critic2_optimizer.step(critic2_loss) + actor_optimizer.step(actor_loss) + alpha_optimizer.step(entropy_loss) + + # Soft update targets + soft_update(target_critic1, critic1, tau=0.005) + soft_update(target_critic2, critic2, tau=0.005) +``` + +### Example 2: TD3 for Autonomous Vehicle Control + +**Problem**: Vehicle continuous steering/acceleration. Needs stable, deterministic behavior. + +**Setup**: + +```python +state_dim = 32 # Observations (lidar, speed, etc) +action_dim = 2 # Steering angle, acceleration +action_range = [[-0.5, -1], [0.5, 1]] # Different ranges per action + +actor = ActorNetwork(state_dim, action_dim) # Deterministic! 
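+# Assumed but not shown here: target copies of the actor and both critics, plus the
+# hyperparameters used in the loop below (exploration_noise, target_noise,
+# noise_clip, policy_delay).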
+critic1 = CriticNetwork(state_dim, action_dim) +critic2 = CriticNetwork(state_dim, action_dim) +``` + +**Training**: + +```python +for step in range(1000000): + # Collect with exploration noise + action = actor(state) + exploration_noise + action = torch.clamp(action, *action_range) + next_state, reward, done = env.step(action) + replay_buffer.add(state, action, reward, next_state, done) + + batch = replay_buffer.sample(256) + + # Critic update with target policy smoothing + next_actions = actor_target(batch.next_states) + noise = torch.randn_like(next_actions) * target_noise + noise = torch.clamp(noise, -noise_clip, noise_clip) + next_actions = torch.clamp(next_actions + noise, *action_range) + + q1_target = critic1_target(batch.next_states, next_actions) + q2_target = critic2_target(batch.next_states, next_actions) + target = batch.rewards + gamma * (1-batch.dones) * torch.min(q1_target, q2_target) + + critic1_loss = MSE(critic1(batch.states, batch.actions), target) + critic2_loss = MSE(critic2(batch.states, batch.actions), target) + + # Delayed actor update + if step % policy_delay == 0: + actor_loss = -critic1(batch.states, actor(batch.states)).mean() + actor_optimizer.step(actor_loss) + + # Update targets + soft_update(target_actor, actor, tau=0.005) + soft_update(target_critic1, critic1, tau=0.005) + soft_update(target_critic2, critic2, tau=0.005) +``` + + +## Part 11: Advanced Topics + +### Distributed Training + +Actor-critic methods work with distributed data collection: + +``` +┌─────────┐ ┌─────────┐ ┌─────────┐ +│Worker 1 │ │Worker 2 │ │Worker N │ +│ env │ │ env │ │ env │ +│ rollout │ │ rollout │ │ rollout │ +└────┬────┘ └────┬────┘ └────┬────┘ + │ │ │ + └─────┬─────┴─────┬─────┘ + │ │ + ┌───▼───────────▼────┐ + │ Replay Buffer │ + │ (or Shared Queue) │ + └───┬────────────────┘ + │ + ┌───▼──────────────┐ + │ Parameter Server │ + │ (Actor + Critics)│ + └─────────────────┘ +``` + +**Benefits**: Fast sample collection (N workers collect in parallel) + + +### Multi-Task Learning + +Use actor-critic for multiple related tasks: + +``` +State: [task_id, observations] +Actor: Outputs action conditional on task_id +Critic: Values state+task + +Transfer learning: Pre-train on many tasks, fine-tune on new task +``` + + +## Part 12: Rationalization Common Mistakes + +Users often make systematic errors in actor-critic reasoning. Here's how to prevent them: + +### Mistake #1: "Why use SAC when TD3 is simpler?" + +**Rationalization**: "TD3 has simpler math (no entropy), just two critics and delayed updates. SAC adds entropy which seems overly complex. Can't I just use TD3?" + +**Counter**: SAC's entropy IS the simplicity. By automatically tuning α, SAC handles exploration automatically. TD3 still requires manual tuning of: + +- policy_delay (2? 5? 10?) +- target_policy_noise magnitude +- noise_clip value + +SAC auto-tunes entropy. That's FEWER hyperparameters overall. + +**Reality**: SAC is more automated. TD3 requires more expertise. + + +### Mistake #2: "My critic diverged, let me reduce learning rate" + +**Rationalization**: "Critic loss is exploding. Reducing learning rate should stabilize it." + +**Counter**: Blindly lowering learning rate treats symptom, not cause. If critic is diverging, check: + +1. Is the Bellman target correct? (r + γV(s').detach()) +2. Are you gradient clipping? +3. Are target networks being updated? + +A wrong target will diverge at ANY learning rate (will just take longer). + +**Reality**: Debug the Bellman equation first. Then adjust learning rate. 
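+
+A minimal sketch of what "debug the Bellman equation first" looks like in code, assuming a single Q-critic with target actor/critic networks (all names illustrative):
+
+```python
+import torch
+import torch.nn.functional as F
+
+def critic_update(critic, critic_target, actor_target, optimizer, batch,
+                  gamma=0.99, max_grad_norm=1.0):
+    # 1. Bellman target: reward + discounted target-network value, no gradient through it
+    with torch.no_grad():
+        next_actions = actor_target(batch.next_states)
+        next_q = critic_target(batch.next_states, next_actions)
+        target = batch.rewards + gamma * (1 - batch.dones) * next_q
+
+    # 2. Regress current Q-values toward that detached target
+    q = critic(batch.states, batch.actions)
+    loss = F.mse_loss(q, target)
+
+    # 3. Clip gradients before stepping so one bad batch cannot blow up the critic
+    optimizer.zero_grad()
+    loss.backward()
+    torch.nn.utils.clip_grad_norm_(critic.parameters(), max_grad_norm)
+    optimizer.step()
+    return loss.item()
+```
+
+Only after these three pieces check out is it worth touching the learning rate.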
+ + +### Mistake #3: "A2C should work fine, why use off-policy?" + +**Rationalization**: "A2C is on-policy and simpler. Off-policy (SAC/TD3) adds complexity with replay buffers. Can't I just use A2C for everything?" + +**Counter**: A2C discards data after one pass. SAC/TD3 reuse data with replay buffer. + +For continuous control with limited data: + +- A2C: 1 million environment steps = 1 million gradient updates +- SAC: 1 million environment steps = 4+ million gradient updates (from replay buffer) + +SAC learns 4x faster per sample. + +**Reality**: Off-policy scales better. Use it when data is expensive (robotics). + + +### Mistake #4: "SAC won't explore, let me manually set α higher" + +**Rationalization**: "Agent isn't exploring. SAC entropy coefficient seems too low. Let me manually increase α to force exploration." + +**Counter**: Manually increasing α BREAKS SAC's design. SAC will auto-adjust α. If it's not exploring: + +1. Check if α is actually being optimized (log(α) loss?) +2. Check target_entropy is correct (-action_dim?) +3. Maybe the reward is so good, SAC found it fast (not a bug!) + +Manual α override means you're not using SAC, you're using plain entropy regularization. That's worse than SAC. + +**Reality**: Trust SAC's auto-tuning. If exploring too little, check target_entropy. + + +### Mistake #5: "Two critics in TD3, but I'll use only one Q-value" + +**Rationalization**: "TD3 has Q1 and Q2, but I'll just use Q1 for the target. It's one critic, should work fine." + +**Counter**: Twin critics are critical for stability. Using only one defeats the purpose: + +```python +# WRONG - only one Q, no overestimation prevention +Q_target = critic1_target(next_state, next_action) # Just one! +target = r + gamma * Q_target + +# RIGHT - minimum of two, prevents high bias +Q1_target = critic1_target(next_state, next_action) +Q2_target = critic2_target(next_state, next_action) +target = r + gamma * min(Q1_target, Q2_target) # Conservative! +``` + +Single critic will overestimate and diverge. + +**Reality**: Both critics must be used in target. That's the point. + + +### Mistake #6: "Tanh squashing is just for action bounds, doesn't affect gradients" + +**Rationalization**: "I'll scale actions with tanh, but it's just a function. The log probability should be the same as unsquashed normal." + +**Counter**: Tanh squashing changes the probability distribution: + +``` +π(a|s) = N(μ(s), σ(s)) [Wrong! Ignores tanh] +π(a|s) = |det(∂a/∂x)|^(-1) * N(μ(s), σ(s)) [Right! Includes Jacobian] + +log π(a|s) has Jacobian term: -log(1 - a² + ε) +``` + +Ignoring this term makes entropy calculation wrong. SAC entropy coefficient adjusts based on WRONG entropy estimate. Policy diverges. + +**Reality**: Always include Jacobian. It's not optional. + + +### Mistake #7: "Gradient clipping is for neural nets, not RL" + +**Rationalization**: "Gradient clipping is for recurrent networks. Actor-critic shouldn't need it." + +**Counter**: Actor-critic trains on bootstrapped targets. If critic breaks, gradients can explode: + +``` +Unstable critic → huge Q-values → huge actor gradients → NaN +``` + +Gradient clipping prevents explosion: + +```python +torch.nn.utils.clip_grad_norm_(actor.parameters(), max_norm=10.0) +torch.nn.utils.clip_grad_norm_(critic.parameters(), max_norm=10.0) +``` + +This is protective. It doesn't hurt convergence. + +**Reality**: Use gradient clipping in actor-critic. It's standard practice. 
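+
+Tying back to Mistake #6, here is a minimal sketch of a tanh-squashed Gaussian sample with the Jacobian-corrected log probability (illustrative, following the `-log(1 - a² + ε)` correction above):
+
+```python
+import torch
+from torch.distributions import Normal
+
+def sample_squashed_action(mu, log_std, eps=1e-6):
+    """Sample a tanh-squashed action and its Jacobian-corrected log probability."""
+    std = log_std.exp()
+    dist = Normal(mu, std)
+    raw_action = dist.rsample()        # reparameterized sample
+    action = torch.tanh(raw_action)    # squash to [-1, 1]
+
+    # log pi(a|s) = log N(raw_action) - sum_i log(1 - a_i^2 + eps)
+    log_prob = dist.log_prob(raw_action).sum(dim=-1)
+    log_prob = log_prob - torch.log(1.0 - action.pow(2) + eps).sum(dim=-1)
+    return action, log_prob
+```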
+ + +### Mistake #8: "Soft update is just for stability, doesn't matter if I hard update" + +**Rationalization**: "Target networks update less frequently. Whether I soft update (τ=0.005) or hard update (every 1000 steps), shouldn't matter." + +**Counter**: Soft vs hard update has different stability properties: + +```python +# Soft update - every step +target = τ * main + (1-τ) * target # Smooth, continuous change + +# Hard update - every N steps +if step % N == 0: + target = copy(main) # Sudden change + +# Soft update: target changes by 0.5% per step (smooth learning) +# Hard update: target changes 100% every N steps (may overshoot) +``` + +Hard update can cause temporary divergence when copied. Soft update is smoother. + +**Reality**: Soft update is preferred. Use τ ≈ 0.005 for continuous stability. + + +## Part 13: Rationalization Decision Table + +When users ask "Should I use X or Y?", use this table: + +| Question | A | B | Decision | +|----------|---|---|----------| +| Stochastic or Deterministic? | Stochastic (SAC) | Deterministic (TD3) | Both valid, SAC more robust | +| Off-policy or On-policy? | Off-policy (SAC/TD3) | On-policy (A2C) | Off-policy for sample efficiency | +| Sample efficiency critical? | Yes (SAC/TD3) | No (A2C) | Use off-policy if data expensive | +| Manual tuning tolerance? | Minimal (SAC) | Moderate (TD3) | SAC: fewer hyperparameters | +| Exploration strategy? | Entropy (SAC) | Policy smoothing (TD3) | SAC: automatic entropy | +| Computation budget? | Higher (SAC) | Lower (TD3) | SAC: slightly more, worth it | +| First time AC method? | SAC (recommended) | TD3 (alternative) | Start with SAC | + + +## Part 14: Common Pitfall Deep Dives + +### Pitfall #11: Advantage Estimation Bias + +**What goes wrong**: Using TD(0) advantage instead of GAE. Learning slow, noisy. + +```python +# Suboptimal - high bias, low variance +A(s,a) = r + γV(s') - V(s) # One-step, if V(s') wrong, advantage wrong + +# Better - balanced bias-variance +A(s,a) = δ_0 + (γλ)δ_1 + (γλ)²δ_2 + ... # GAE combines multiple steps +``` + +**How to fix**: + +```python +def compute_gae(rewards, values, next_value, gamma, lam): + advantages = torch.zeros_like(rewards) + gae = 0 + for t in reversed(range(len(rewards))): + if t == len(rewards) - 1: + next_val = next_value + else: + next_val = values[t+1] + + delta = rewards[t] + gamma * next_val - values[t] + gae = delta + (gamma * lam) * gae + advantages[t] = gae + + return advantages +``` + + +### Pitfall #12: Network Architecture Mismatch + +**What goes wrong**: Actor and critic networks very different sizes. Critic learns slow, can't keep up with actor. + +```python +# WRONG - massive mismatch +actor = nn.Sequential( + nn.Linear(state_dim, 128), + nn.ReLU(), + nn.Linear(128, action_dim) # Small! +) + +critic = nn.Sequential( + nn.Linear(state_dim, 512), + nn.ReLU(), + nn.Linear(512, 256), + nn.ReLU(), + nn.Linear(256, 1) # Huge! +) +``` + +**Fix**: Use similar architectures: + +```python +actor = nn.Sequential( + nn.Linear(state_dim, 256), + nn.ReLU(), + nn.Linear(256, 256), + nn.ReLU(), + nn.Linear(256, action_dim * 2) # μ and log_std +) + +critic = nn.Sequential( + nn.Linear(state_dim + action_dim, 256), + nn.ReLU(), + nn.Linear(256, 256), + nn.ReLU(), + nn.Linear(256, 1) # Same layer sizes +) +``` + + +### Pitfall #13: Rewards Not Normalized + +**What goes wrong**: Rewards in range [0, 10000]. Critic outputs huge values. Gradients unstable. 
+ +```python +# WRONG - raw rewards +reward = env.reward() # Could be 1000+ +target = reward + gamma * v_next + +# RIGHT - normalize +reward_mean = running_mean(rewards) +reward_std = running_std(rewards) +reward_normalized = (reward - reward_mean) / (reward_std + 1e-8) +target = reward_normalized + gamma * v_next +``` + +**Running statistics**: + +```python +class RunningNorm: + def __init__(self): + self.mean = 0 + self.var = 1 + self.count = 0 + + def update(self, x): + self.count += 1 + delta = x - self.mean + self.mean += delta / self.count + delta2 = x - self.mean + self.var = (self.var * (self.count-1) + delta * delta2) / self.count +``` + + +## Part 15: Red Flags Comprehensive List + +| Red Flag | Diagnosis | Fix | +|----------|-----------|-----| +| Critic loss NaN | Exploding gradients, huge rewards | Reward normalization, gradient clipping | +| Critic loss stuck | Wrong target, or target network not updating | Check Bellman target, ensure soft update | +| Actor loss 0 | Critic signal dead, or advantage zero | Debug critic, check gradient flow | +| Policy std at min | Network pushing to limits | Check initialization, gradient clipping | +| Return oscillates | Actor chasing moving target | Use delayed updates (TD3) or check critic | +| Entropy coefficient explodes (SAC) | Loss unbounded | Use log(α) instead of α directly | +| Target network never used | Forgot to create/copy targets | Check target network update code | +| Action clipping needed constantly | Action range wrong, or policy diverges | Check action bounds, policy variance | +| Same action always | log_std clamped to min | Increase log_std_max or check initialization | +| Reward always same episode | Reward computed wrong, or agent stuck | Check reward function, environment | + + +## Summary: Quick Reference + +### When to Choose Actor-Critic + +``` +Do you have continuous actions? YES → Actor-Critic +Need sample efficiency? YES → SAC or TD3 +Prefer stochastic policy? YES → SAC +Prefer deterministic? YES → TD3 +Want off-policy learning? YES → SAC/TD3 +Want on-policy simplicity? YES → A2C +``` + +### Red Flags + +1. Critic loss not decreasing → Check Bellman target +2. Policy not changing → Check advantage signal +3. Training diverging → Check target networks +4. SAC policy random → Check entropy coefficient (must be auto-tuned) +5. TD3 unstable → Check policy delay and target smoothing + +### Quick Debugging + +```python +# First: Check critic +assert critic_loss.decreasing() # Should go down +assert -100 < v_values.mean() < 100 # Reasonable range + +# Second: Check actor +assert actor_loss.decreasing() or returning improving +assert log_std not clamped to min/max +assert actions have variance + +# Third: Check training +assert return increasing over episodes +assert return_std decreasing (less variance) +assert entropy coefficient changing (SAC) +``` + + +## Conclusion + +Actor-critic methods are the gold standard for continuous control because they: + +1. **Combine benefits**: Policy (actor) + value (critic) = stable, sample-efficient learning +2. **Handle continuous actions naturally**: No need to discretize +3. **Reduce variance**: Baseline critical for fast learning +4. **Enable off-policy learning**: SAC/TD3 with replay buffer = very sample efficient +5. **Scale to complexity**: Work on high-dimensional state/action spaces + +**Start with SAC** (most robust, auto-tuned entropy). Debug using the comprehensive checklist. Switch to TD3 if you need deterministic policy or have tuning expertise. 
+ +Key insight: Actor-critic's power comes from separating action selection (actor) from value estimation (critic). Maintain this separation carefully (critics must learn accurately, actors must use accurate value signals) and training will be stable and efficient. diff --git a/skills/using-deep-rl/exploration-strategies.md b/skills/using-deep-rl/exploration-strategies.md new file mode 100644 index 0000000..ad97dc9 --- /dev/null +++ b/skills/using-deep-rl/exploration-strategies.md @@ -0,0 +1,1599 @@ + +# Exploration Strategies in Deep RL + +## When to Use This Skill + +Invoke this skill when you encounter: + +- **Exploration-Exploitation Problem**: Agent stuck in local optimum, not finding sparse rewards +- **ε-Greedy Tuning**: Designing or debugging epsilon decay schedules +- **Sparse Reward Environments**: Montezuma's Revenge, goal-conditioned tasks, minimal feedback +- **Large State Spaces**: Too many states for random exploration to be effective +- **Curiosity-Driven Learning**: Implementing or understanding intrinsic motivation +- **RND (Random Network Distillation)**: Novelty-based exploration for sparse rewards +- **Count-Based Exploration**: Encouraging discovery in discrete/tabular domains +- **Exploration Stability**: Agent explores too much/little, inconsistent performance +- **Method Selection**: Which exploration strategy for this problem? +- **Computational Cost**: Balancing exploration sophistication vs overhead +- **Boltzmann Exploration**: Softmax-based action selection and temperature tuning + +**Core Problem:** Many RL agents get stuck exploiting a local optimum, never finding sparse rewards or exploring high-dimensional state spaces effectively. Choosing the right exploration strategy is fundamental to success. + +## Do NOT Use This Skill For + +- **Algorithm selection** (route to rl-foundations or specific algorithm skills like value-based-methods, policy-gradient-methods) +- **Reward design issues** (route to reward-shaping-engineering) +- **Environment bugs causing poor exploration** (route to rl-debugging first to verify environment works correctly) +- **Basic RL concepts** (route to rl-foundations for MDPs, value functions, Bellman equations) +- **Training instability unrelated to exploration** (route to appropriate algorithm skill or rl-debugging) + + +## Core Principle: The Exploration-Exploitation Tradeoff + +### The Fundamental Tension + +In reinforcement learning, every action selection is a decision: + +- **Exploit**: Take the action with highest estimated value (maximize immediate reward) +- **Explore**: Try a different action to learn about its value (find better actions) + +``` +Exploitation Extreme: +- Only take the best-known action +- High immediate reward (in training) +- BUT: Stuck in local optimum if initial action wasn't optimal +- Risk: Never find the actual best reward + +Exploration Extreme: +- Take random actions uniformly +- Will eventually find any reward +- BUT: Wasting resources on clearly bad actions +- Risk: No learning because too much randomness + +Optimal Balance: +- Explore enough to find good actions +- Exploit enough to benefit from learning +``` + +### Why Exploration Matters + +**Scenario 1: Sparse Reward Environment** + +Imagine an agent in Montezuma's Revenge (classic exploration benchmark): + +- Most states give reward = 0 +- First coin gives +1 (at step 500+) +- Without exploring systematically, random actions won't find that coin in millions of steps + +Without exploration strategy: + +``` +Steps 0-1,000: Random actions, no 
reward signal +Steps 1,000-10,000: Learned to get to the coin, finally seeing reward +Problem: Took 1,000 steps of pure random exploration! + +With smart exploration (RND): +Steps 0-100: RND detects novel states, guides toward unexplored areas +Steps 100-500: Finds coin much faster because exploring strategically +Result: Reward found in 10% of steps +``` + +**Scenario 2: Local Optimum Trap** + +Agent finds a small reward (+1) from a simple policy: + +``` +Without decay: +- Agent learns exploit_policy achieves +1 +- ε-greedy with ε=0.3: Still 30% random (good, explores) +- BUT: 70% exploiting suboptimal policy indefinitely + +With decay: +- Step 0: ε=1.0, 100% explore +- Step 100k: ε=0.05, 5% explore +- Step 500k: ε=0.01, 1% explore +- Result: Enough exploration to find +5 reward, then exploit it +``` + +### Core Rule + +**Exploration is an investment with declining returns.** + +- Early training: Exploration critical (don't know anything yet) +- Mid training: Balanced (learning but not confident) +- Late training: Exploitation dominant (confident in good actions) + + +## Part 1: ε-Greedy Exploration + +### The Baseline Method + +ε-Greedy is the simplest exploration strategy: with probability ε, take a random action; otherwise, take the greedy (best-known) action. + +```python +import numpy as np + +def epsilon_greedy_action(q_values, epsilon): + """ + Select action using ε-greedy. + + Args: + q_values: Q(s, *) - values for all actions + epsilon: exploration probability [0, 1] + + Returns: + action: int (0 to num_actions-1) + """ + if np.random.random() < epsilon: + # Explore: random action + return np.random.randint(len(q_values)) + else: + # Exploit: best action + return np.argmax(q_values) +``` + +### Why ε-Greedy Works + +1. **Simple**: Easy to implement and understand +2. **Guaranteed Convergence**: Will eventually visit all states (if ε > 0) +3. **Effective Baseline**: Works surprisingly well for many tasks +4. **Interpretable**: ε has clear meaning (probability of random action) + +### When ε-Greedy Fails + +``` +Problem Space → Exploration Effectiveness: + +Small discrete spaces (< 100 actions): +- ε-greedy: Excellent ✓ +- Reason: Random exploration covers space quickly + +Large discrete spaces (100-10,000 actions): +- ε-greedy: Poor ✗ +- Reason: Random action is almost always bad +- Example: Game with 500 actions, random 1/500 chance is right action + +Continuous action spaces: +- ε-greedy: Terrible ✗ +- Reason: Random action in [-∞, ∞] is meaningless noise +- Alternative: Gaussian noise on action (not true ε-greedy) + +Sparse rewards, large state spaces: +- ε-greedy: Hopeless ✗ +- Reason: Random exploration won't find rare reward before heat death +- Alternative: Curiosity, RND, intrinsic motivation +``` + +### ε-Decay Schedules + +The key insight: ε should decay over time. Explore early, exploit late. + +#### Linear Decay + +```python +def epsilon_linear(step, total_steps, epsilon_start=1.0, epsilon_end=0.1): + """ + Linear decay from epsilon_start to epsilon_end. 
+ + ε(t) = ε_start - (ε_start - ε_end) * t / T + """ + t = min(step, total_steps) + return epsilon_start - (epsilon_start - epsilon_end) * t / total_steps +``` + +**Properties:** + +- Simple, predictable, easy to tune +- Equal exploration reduction per step +- Good for most tasks + +**Guidance:** + +- Use if no special knowledge about task +- `epsilon_start = 1.0` (explore fully initially) +- `epsilon_end = 0.01` to `0.1` (small residual exploration) +- `total_steps = 1,000,000` (typical deep RL) + +#### Exponential Decay + +```python +def epsilon_exponential(step, decay_rate=0.9995): + """ + Exponential decay with constant rate. + + ε(t) = ε_0 * decay_rate^t + """ + return 1.0 * (decay_rate ** step) +``` + +**Properties:** + +- Fast initial decay, slow tail +- Aggressive early exploration cutoff +- Exploration drops exponentially + +**Guidance:** + +- Use if task rewards are found quickly +- `decay_rate = 0.9995` is gentle (1% per 100 steps) +- `decay_rate = 0.999` is aggressive (1% per step) +- Watch for premature convergence to local optimum + +#### Polynomial Decay + +```python +def epsilon_polynomial(step, total_steps, epsilon_start=1.0, + epsilon_end=0.01, power=2.0): + """ + Polynomial decay: ε(t) = ε_start * (1 - t/T)^p + + power=1: Linear + power=2: Quadratic (faster early decay) + power=0.5: Slower decay + """ + t = min(step, total_steps) + fraction = t / total_steps + return epsilon_start * (1 - fraction) ** power +``` + +**Properties:** + +- Smooth, tunable decay curve +- Power > 1: Fast early decay, slow tail +- Power < 1: Slow early decay, fast tail + +**Guidance:** + +- `power = 2.0`: Quadratic (balanced, common) +- `power = 3.0`: Cubic (aggressive early decay) +- `power = 0.5`: Slower (gentle early decay) + +### Practical Guidance: Choosing Epsilon Parameters + +``` +Rule of Thumb: +- epsilon_start = 1.0 (explore uniformly initially) +- epsilon_end = 0.01 to 0.1 (maintain minimal exploration) + - 0.01: For large action spaces (need some exploration) + - 0.05: Default choice + - 0.1: For small action spaces (can afford random actions) +- total_steps: Based on training duration + - Usually 500k to 1M steps + - Longer if rewards are sparse or delayed + +Task-Specific Adjustments: +- Sparse rewards: Longer decay (explore for more steps) +- Dense rewards: Shorter decay (can exploit earlier) +- Large action space: Higher epsilon_end (maintain exploration) +- Small action space: Lower epsilon_end (exploitation is cheap) +``` + +### ε-Greedy Pitfall 1: Decay Too Fast + +```python +# WRONG: Decays to 0 in just 10k steps +epsilon_final = 0.01 +decay_steps = 10_000 +epsilon = epsilon_final ** (step / decay_steps) # ← BUG + +# CORRECT: Decays gently over training +total_steps = 1_000_000 +epsilon_linear(step, total_steps, epsilon_start=1.0, epsilon_end=0.01) +``` + +**Symptom:** Agent plateaus early, never improves past initial local optimum + +**Fix:** Use longer decay schedule, ensure epsilon_end > 0 + +### ε-Greedy Pitfall 2: Never Decays (Constant ε) + +```python +# WRONG: Fixed epsilon forever +epsilon = 0.3 # Constant + +# CORRECT: Decay epsilon over time +epsilon = epsilon_linear(step, total_steps=1_000_000) +``` + +**Symptom:** Agent learns but performance noisy, can't fully exploit learned policy + +**Fix:** Add epsilon decay schedule + +### ε-Greedy Pitfall 3: Epsilon on Continuous Actions + +```python +# WRONG: Discrete epsilon-greedy on continuous actions +action = np.random.uniform(-1, 1) if random() < epsilon else greedy_action + +# CORRECT: Gaussian noise on continuous 
actions +def continuous_exploration(action, exploration_std=0.1): + return action + np.random.normal(0, exploration_std, action.shape) +``` + +**Symptom:** Continuous action spaces don't benefit from ε-greedy (random action is meaningless) + +**Fix:** Use Gaussian noise or other continuous exploration methods + + +## Part 2: Boltzmann Exploration + +### Temperature-Based Action Selection + +Instead of deterministic greedy action, select actions proportional to their Q-values using softmax with temperature T. + +```python +def boltzmann_exploration(q_values, temperature=1.0): + """ + Select action using Boltzmann distribution. + + P(a) = exp(Q(s,a) / T) / Σ exp(Q(s,a') / T) + + Args: + q_values: Q(s, *) - values for all actions + temperature: Exploration parameter + T → 0: Becomes deterministic (greedy) + T → ∞: Becomes uniform random + + Returns: + action: int (sampled from distribution) + """ + # Subtract max for numerical stability + q_shifted = q_values - np.max(q_values) + + # Compute probabilities + probabilities = np.exp(q_shifted / temperature) + probabilities = probabilities / np.sum(probabilities) + + # Sample action + return np.random.choice(len(q_values), p=probabilities) +``` + +### Properties vs ε-Greedy + +| Feature | ε-Greedy | Boltzmann | +|---------|----------|-----------| +| Good actions | Probability: 1-ε | Probability: higher (proportional to Q) | +| Bad actions | Probability: ε/(n-1) | Probability: lower (proportional to Q) | +| Action selection | Deterministic or random | Stochastic distribution | +| Exploration | Uniform random | Biased toward better actions | +| Tuning | ε (1 parameter) | T (1 parameter) | + +**Key Advantage:** Boltzmann balances better—good actions are preferred but still get chances. + +``` +Example: Three actions with Q=[10, 0, -10] + +ε-Greedy (ε=0.2): +- Action 0: P=0.8 (exploit best) +- Action 1: P=0.1 (random) +- Action 2: P=0.1 (random) +- Problem: Good actions (Q=0, -10) barely sampled + +Boltzmann (T=2): +- Action 0: P=0.88 (exp(10/2)=e^5 ≈ 148) +- Action 1: P=0.11 (exp(0/2)=1) +- Action 2: P=0.01 (exp(-10/2)≈0.007) +- Better: Action 1 still gets 11% (not negligible) +``` + +### Temperature Decay Schedule + +Like epsilon, temperature should decay: start high (explore), end low (exploit). + +```python +def temperature_decay(step, total_steps, temp_start=1.0, temp_end=0.1): + """ + Linear temperature decay. + + T(t) = T_start - (T_start - T_end) * t / T_total + """ + t = min(step, total_steps) + return temp_start - (temp_start - temp_end) * t / total_steps + +# Usage in training loop +for step in range(total_steps): + T = temperature_decay(step, total_steps) + action = boltzmann_exploration(q_values, temperature=T) + # ... +``` + +### When to Use Boltzmann vs ε-Greedy + +``` +Choose ε-Greedy if: +- Simple implementation preferred +- Discrete action space +- Task has clear good/bad actions (wide Q-value spread) + +Choose Boltzmann if: +- Actions have similar Q-values (nuanced exploration) +- Want to bias exploration toward promising actions +- Fine-grained control over exploration desired +``` + + +## Part 3: UCB (Upper Confidence Bound) + +### Theoretical Optimality + +UCB is provably optimal for the multi-armed bandit problem: + +```python +def ucb_action(q_values, action_counts, total_visits, c=1.0): + """ + Select action using Upper Confidence Bound. 
+ + UCB(a) = Q(a) + c * sqrt(ln(N) / N(a)) + + Args: + q_values: Current Q-value estimates + action_counts: N(a) - times each action visited + total_visits: N - total visits to state + c: Exploration constant (usually 1.0 or sqrt(2)) + + Returns: + action: int (maximizing UCB) + """ + # Avoid division by zero + action_counts = np.maximum(action_counts, 1) + + # Compute exploration bonus + exploration_bonus = c * np.sqrt(np.log(total_visits) / action_counts) + + # Upper confidence bound + ucb = q_values + exploration_bonus + + return np.argmax(ucb) +``` + +### Why UCB Works + +UCB balances exploitation and exploration via **optimism under uncertainty**: + +- If Q(a) is high → exploit it +- If Q(a) is uncertain (rarely visited) → exploration bonus makes UCB high + +``` +Example: Bandit with 2 arms +- Arm A: Visited 100 times, estimated Q=2.0 +- Arm B: Visited 10 times, estimated Q=1.5 + +UCB(A) = 2.0 + 1.0 * sqrt(ln(110) / 100) ≈ 2.0 + 0.26 = 2.26 +UCB(B) = 1.5 + 1.0 * sqrt(ln(110) / 10) ≈ 1.5 + 0.82 = 2.32 + +Result: Try Arm B despite lower Q estimate (less certain) +``` + +### Critical Limitation: Doesn't Scale to Deep RL + +UCB assumes **tabular setting** (small, discrete state space where you can count visits): + +```python +# WORKS: Tabular Q-learning +state_action_counts = defaultdict(int) # N(s, a) +state_counts = defaultdict(int) # N(s) + +# BREAKS in deep RL: +# With function approximation, states don't repeat exactly +# Can't count "how many times visited state X" in continuous/image observations +``` + +**Practical Issue:** + +In image-based RL (Atari, vision), never see the same pixel image twice. State counting is impossible. + +### When UCB Applies + +``` +Use UCB if: +✓ Discrete action space (< 100 actions) +✓ Discrete state space (< 10,000 states) +✓ Tabular Q-learning (no function approximation) +✓ Rewards come quickly (don't need long-term planning) + +Examples: Simple bandits, small Gridworlds, discrete card games + +DO NOT use UCB if: +✗ Using neural networks (state approximation) +✗ Continuous actions or large state space +✗ Image observations (pixel space too large) +✗ Sparse rewards (need different methods) +``` + +### Connection to Deep RL + +For deep RL, need to estimate **uncertainty** without explicit counts: + +```python +def deep_ucb_approximation(mean_q, uncertainty, c=1.0): + """ + Approximate UCB using learned uncertainty (not action counts). + + Used in methods like: + - Deep Ensembles: Use ensemble variance as uncertainty + - Dropout: Use MC-dropout variance + - Bootstrap DQN: Ensemble of Q-networks + + UCB ≈ Q(s,a) + c * uncertainty(s,a) + """ + return mean_q + c * uncertainty +``` + +**Modern Approach:** Instead of counting visits, learn uncertainty through: + +- **Ensemble Methods**: Train multiple Q-networks, use disagreement +- **Bayesian Methods**: Learn posterior over Q-values +- **Bootstrap DQN**: Separate Q-networks give uncertainty estimates + +These adapt UCB principles to deep RL. + + +## Part 4: Curiosity-Driven Exploration (ICM) + +### The Core Insight + +**Prediction Error as Exploration Signal** + +Agent is "curious" about states where it can't predict the next state well: + +``` +Intuition: If I can't predict what will happen, I probably +haven't learned about this state yet. Let me explore here! 
+ +Intrinsic Reward = ||next_state - predicted_next_state||^2 +``` + +### Intrinsic Curiosity Module (ICM) + +```python +import torch +import torch.nn as nn + +class IntrinsicCuriosityModule(nn.Module): + """ + ICM = Forward Model + Inverse Model + + Forward Model: Predicts next state from (state, action) + - Input: current state + action taken + - Output: predicted next state + - Error: prediction error = surprise + + Inverse Model: Predicts action from (state, next_state) + - Input: current state and next state + - Output: predicted action taken + - Purpose: Learn representation that distinguishes states + """ + + def __init__(self, state_dim, action_dim, hidden_dim=128): + super().__init__() + + # Inverse model: (s, s') → a + self.inverse = nn.Sequential( + nn.Linear(2 * state_dim, hidden_dim), + nn.ReLU(), + nn.Linear(hidden_dim, action_dim) + ) + + # Forward model: (s, a) → s' + self.forward = nn.Sequential( + nn.Linear(state_dim + action_dim, hidden_dim), + nn.ReLU(), + nn.Linear(hidden_dim, state_dim) + ) + + def compute_intrinsic_reward(self, state, action, next_state): + """ + Curiosity reward = prediction error of forward model. + + high_error → Unseen state → Reward exploration + low_error → Seen state → Ignore (already learned) + """ + # Predict next state + predicted_next = self.forward(torch.cat([state, action], dim=-1)) + + # Compute prediction error + prediction_error = torch.norm(next_state - predicted_next, dim=-1) + + # Intrinsic reward is prediction error (exploration bonus) + return prediction_error + + def loss(self, state, action, next_state, action_pred_logits): + """ + Combine forward and inverse losses. + + Forward loss: Forward model prediction error + Inverse loss: Inverse model action prediction error + """ + # Forward loss + predicted_next = self.forward(torch.cat([state, action], dim=-1)) + forward_loss = torch.mean((next_state - predicted_next) ** 2) + + # Inverse loss + predicted_action = action_pred_logits + inverse_loss = torch.mean((action - predicted_action) ** 2) + + return forward_loss + inverse_loss +``` + +### Why Both Forward and Inverse Models? + +``` +Forward model alone: +- Can predict next state without learning features +- Might just memorize (Q: Do pixels change when I do action X?) +- Doesn't necessarily learn task-relevant state representation + +Inverse model: +- Forces feature learning that distinguishes states +- Can only predict action if states are well-represented +- Improves forward model's learned representation + +Together: Forward + Inverse +- Better feature learning (inverse helps) +- Better prediction (forward is primary) +``` + +### Critical Pitfall: Random Environment Trap + +```python +# WRONG: Using curiosity in stochastic environment +# Environment: Atari with pixel randomness/motion artifacts + +# Agent gets reward for predicting pixel noise +# Prediction error = pixels changed randomly +# Intrinsic reward goes to the noisiest state! 
+# Result: Agent learns nothing about task, just explores random pixels + +# CORRECT: Use RND instead (next section) +# RND uses FROZEN random network, doesn't get reward for actual noise +``` + +**Key Distinction:** + +- ICM: Learns to predict environment (breaks if environment has noise/randomness) +- RND: Uses frozen random network (robust to environment randomness) + +### Computational Cost + +```python +# ICM adds significant overhead: +# - Forward model network (encoder + layers + output) +# - Inverse model network (encoder + layers + output) +# - Training both networks every step + +# Overhead estimate: +# Base agent: 1 network (policy/value) +# With ICM: 3+ networks (policy + forward + inverse) +# Training time: ~2-3× longer +# Memory: ~3× larger + +# When justified: +# - Sparse rewards (ICM critical) +# - Large state spaces (ICM helps) +# +# When NOT justified: +# - Dense rewards (environment signal sufficient) +# - Continuous control with simple rewards (ε-greedy enough) +``` + + +## Part 5: RND (Random Network Distillation) + +### The Elegant Solution + +RND is simpler and more robust than ICM: + +```python +class RandomNetworkDistillation(nn.Module): + """ + RND: Intrinsic reward = prediction error of target network + + Key innovation: Target network is RANDOM and FROZEN + (never updated) + + Two networks: + 1. Target (random, frozen): f_target(s) - fixed throughout training + 2. Predictor (trained): f_predict(s) - learns to predict target + + Intrinsic reward = ||f_target(s) - f_predict(s)||^2 + + New state (s not seen) → high prediction error → reward exploration + Seen state (s familiar) → low prediction error → ignore + """ + + def __init__(self, state_dim, embedding_dim=128): + super().__init__() + + # Target network: random, never updates + self.target = nn.Sequential( + nn.Linear(state_dim, embedding_dim), + nn.ReLU(), + nn.Linear(embedding_dim, embedding_dim) + ) + + # Predictor network: learns to mimic target + self.predictor = nn.Sequential( + nn.Linear(state_dim, embedding_dim), + nn.ReLU(), + nn.Linear(embedding_dim, embedding_dim) + ) + + # Freeze target network + for param in self.target.parameters(): + param.requires_grad = False + + def compute_intrinsic_reward(self, state, scale=1.0): + """ + Intrinsic reward = prediction error of target network. + + Args: + state: Current observation + scale: Scale factor for reward (usually 0.1-1.0) + + Returns: + Intrinsic reward (novelty signal) + """ + with torch.no_grad(): + target_features = self.target(state) + + predicted_features = self.predictor(state) + + # L2 prediction error + prediction_error = torch.norm( + target_features - predicted_features, + dim=-1, + p=2 + ) + + return scale * prediction_error + + def predictor_loss(self, state): + """ + Loss for predictor: minimize prediction error. + + Only update predictor (target stays frozen). + """ + with torch.no_grad(): + target_features = self.target(state) + + predicted_features = self.predictor(state) + + # MSE loss + return torch.mean((target_features - predicted_features) ** 2) +``` + +### Why RND is Elegant + +1. **No Environment Model**: Doesn't need to model dynamics (unlike ICM) +2. **Robust to Randomness**: Random network isn't trying to predict anything real, so environment noise doesn't fool it +3. **Simple**: Just predict random features +4. 
**Fast**: Train only predictor (target frozen) + +### RND vs ICM Comparison + +| Aspect | ICM | RND | +|--------|-----|-----| +| Networks | Forward + Inverse | Target (frozen) + Predictor | +| Learns | Environment dynamics | Random feature prediction | +| Robust to noise | No (breaks with stochastic envs) | Yes (random target immune) | +| Complexity | High (3+ networks, 2 losses) | Medium (2 networks, 1 loss) | +| Computation | 2-3× base agent | 1.5-2× base agent | +| When to use | Dense features, clean env | Sparse rewards, noisy env | + +### RND Pitfall: Training Instability + +```python +# WRONG: High learning rate, large reward scale +rnd_loss = rnd.predictor_loss(state) +optimizer.zero_grad() +rnd_loss.backward() +optimizer.step() # ← high learning rate causes divergence + +# CORRECT: Careful hyperparameter tuning +rnd_lr = 1e-4 # Much smaller than main agent +rnd_optimizer = Adam(rnd.predictor.parameters(), lr=rnd_lr) + +# Scale intrinsic reward appropriately +intrinsic_reward = rnd.compute_intrinsic_reward(state, scale=0.01) +``` + +**Symptom:** RND rewards explode, agent overfits to novelty + +**Fix:** Lower learning rate for RND, scale intrinsic rewards carefully + + +## Part 6: Count-Based Exploration + +### State Visitation Counts + +For **discrete/tabular** environments, track how many times each state visited: + +```python +from collections import defaultdict + +class CountBasedExploration: + """ + Count-based exploration: encourage visiting rarely-seen states. + + Works for: + ✓ Tabular (small discrete state space) + ✓ Gridworlds, simple games + + Doesn't work for: + ✗ Continuous spaces + ✗ Image observations (never see same image twice) + ✗ Large state spaces + """ + + def __init__(self): + self.state_counts = defaultdict(int) + + def compute_intrinsic_reward(self, state, reward_scale=1.0): + """ + Intrinsic reward inversely proportional to state visitation. + + intrinsic_reward = reward_scale / sqrt(N(s)) + + Rarely visited states (small N) → high intrinsic reward + Frequently visited states (large N) → low intrinsic reward + """ + count = max(self.state_counts[state], 1) # Avoid division by zero + return reward_scale / np.sqrt(count) + + def update_counts(self, state): + """Increment visitation count for state.""" + self.state_counts[state] += 1 +``` + +### Example: Gridworld with Sparse Reward + +```python +# Gridworld: 10×10 grid, reward at (9, 9), start at (0, 0) +# Without exploration: Random walking takes exponential time +# With count-based: Directed toward unexplored cells + +# Pseudocode: +for episode in range(episodes): + state = env.reset() + for step in range(max_steps): + # Compute exploration bonus + intrinsic_reward = count_explorer.compute_intrinsic_reward(state) + + # Combine with task reward + combined_reward = env_reward + lambda * intrinsic_reward + + # Q-learning with combined reward + action = epsilon_greedy(q_values[state], epsilon) + next_state, env_reward = env.step(action) + + q_values[state][action] += alpha * ( + combined_reward + gamma * max(q_values[next_state]) - q_values[state][action] + ) + + # Update counts + count_explorer.update_counts(next_state) + state = next_state +``` + +### Critical Limitation: Doesn't Scale + +```python +# Works: Small state space +state_space_size = 100 # 10×10 grid +# Can track counts for all states + +# Fails: Large/continuous state space +state_space_size = 10^18 # Image observations +# Can't track visitation counts for 10^18 unique states! 
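+# (Illustrative pseudocode: a real Python literal would be 10**18, since ^ is bitwise XOR.)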
+``` + + +## Part 7: When Exploration is Critical + +### Decision Framework + +**Exploration matters when:** + +1. **Sparse Rewards** (rewards rare, hard to find) + - Examples: Montezuma's Revenge, goal-conditioned tasks, real robotics + - No dense reward signal to guide learning + - Agent must explore to find any reward + - Solution: Intrinsic motivation (curiosity, RND) + +2. **Large State Spaces** (too many possible states) + - Examples: Image-based RL, continuous control + - Random exploration covers infinitesimal fraction + - Systematic exploration essential + - Solution: Curiosity-driven or RND + +3. **Long Horizons** (many steps before reward) + - Examples: Multi-goal tasks, planning problems + - Temporal credit assignment hard + - Need to explore systematically to connect actions to delayed rewards + - Solution: Sophisticated exploration strategy + +4. **Deceptive Reward Landscape** (local optima common) + - Examples: Multiple solutions, trade-offs + - Easy to get stuck in suboptimal policy + - Exploration helps escape local optima + - Solution: Slow decay schedule, maintain exploration + +### Decision Framework (Quick Check) + +``` +Do you have SPARSE rewards? + YES → Use intrinsic motivation (curiosity, RND) + NO → Continue + +Is state space large (images, continuous)? + YES → Use curiosity-driven or RND + NO → Continue + +Is exploration reasonably efficient with ε-greedy? + YES → Use ε-greedy + appropriate decay schedule + NO → Use curiosity-driven or RND +``` + +### Example: Reward Structure Analysis + +```python +def analyze_reward_structure(rewards): + """Determine if exploration strategy needed.""" + + # Check sparsity + nonzero_rewards = np.count_nonzero(rewards) + sparsity = 1 - (nonzero_rewards / len(rewards)) + + if sparsity > 0.95: + print("SPARSE REWARDS detected") + print(" → Use: Intrinsic motivation (RND or curiosity)") + print(" → Why: Reward signal too rare to guide learning") + + # Check reward magnitude + reward_std = np.std(rewards) + reward_mean = np.mean(rewards) + + if reward_std < 0.1: + print("WEAK/NOISY REWARDS detected") + print(" → Use: Intrinsic motivation") + print(" → Why: Reward signal insufficient to learn from") + + # Check reward coverage + episode_length = len(rewards) + if episode_length > 1000: + print("LONG HORIZONS detected") + print(" → Use: Strong exploration decay or intrinsic motivation") + print(" → Why: Temporal credit assignment difficult") +``` + + +## Part 8: Combining Exploration with Task Rewards + +### Combining Intrinsic and Extrinsic Rewards + +When using intrinsic motivation, balance with task reward: + +```python +def combine_rewards(extrinsic_reward, intrinsic_reward, + intrinsic_scale=0.01): + """ + Combine extrinsic (task) and intrinsic (curiosity) rewards. 
+ + r_total = r_extrinsic + λ * r_intrinsic + + λ controls tradeoff: + - λ = 0: Ignore intrinsic reward (no exploration) + - λ = 0.01: Curiosity helps, task reward primary (typical) + - λ = 0.1: Curiosity significant + - λ = 1.0: Curiosity dominates (might ignore task) + """ + return extrinsic_reward + intrinsic_scale * intrinsic_reward +``` + +### Challenges: Reward Hacking + +```python +# PROBLEM: Intrinsic reward encourages anything novel +# Even if novel thing is useless for task + +# Example: Atari with RND +# If game has pixel randomness, RND rewards exploring random pixels +# Instead of exploring to find coins/power-ups + +# SOLUTION: Scale intrinsic reward carefully +# Make it significant but not dominant + +# SOLUTION 2: Curriculum learning +# Start with high intrinsic reward (discover environment) +# Gradually reduce as agent finds reward signals +``` + +### Intrinsic Reward Scale Tuning + +```python +# Quick tuning procedure: +for intrinsic_scale in [0.001, 0.01, 0.1, 1.0]: + agent = RL_Agent(intrinsic_reward_scale=intrinsic_scale) + for episode in episodes: + performance = train_episode(agent) + + print(f"Scale={intrinsic_scale}: Performance={performance}") + +# Find scale where agent learns task well AND explores +# Usually 0.01-0.1 is sweet spot +``` + + +## Part 9: Common Pitfalls and Debugging + +### Pitfall 1: Epsilon Decay Too Fast + +**Symptom:** Agent plateaus at poor performance early in training + +**Root Cause:** Epsilon decays to near-zero before agent finds good actions + +```python +# WRONG: Decays in 10k steps +epsilon_final = 0.0 +epsilon_decay = 0.9999 # Per-step decay +# After 10k steps: ε ≈ 0, almost no exploration left + +# CORRECT: Decay over full training +total_training_steps = 1_000_000 +epsilon_linear(step, total_training_steps, + epsilon_start=1.0, epsilon_end=0.01) +``` + +**Diagnosis:** + +- Plot epsilon over training: does it reach 0 too early? 
+- Check if performance improves after epsilon reaches low values + +**Fix:** + +- Use longer decay (more steps) +- Use higher epsilon_end (never go to pure exploitation) + +### Pitfall 2: Intrinsic Reward Too Strong + +**Symptom:** Agent explores forever, ignores task reward + +**Root Cause:** Intrinsic reward scale too high + +```python +# WRONG: Intrinsic reward dominates +r_total = r_task + 1.0 * r_intrinsic +# Agent optimizes novelty, ignores task + +# CORRECT: Intrinsic reward is small bonus +r_total = r_task + 0.01 * r_intrinsic +# Task reward primary, intrinsic helps exploration +``` + +**Diagnosis:** + +- Agent explores everywhere but doesn't collect task rewards +- Intrinsic reward signal going to seemingly useless states + +**Fix:** + +- Reduce intrinsic_reward_scale (try 0.01, 0.001) +- Verify agent eventually starts collecting task rewards + +### Pitfall 3: ε-Greedy on Continuous Actions + +**Symptom:** Exploration ineffective, agent doesn't learn + +**Root Cause:** Random action in continuous space is meaningless + +```python +# WRONG: ε-greedy on continuous actions +if random() < epsilon: + action = np.random.uniform(-1, 1) # Random in action space +else: + action = network(state) # Neural network action + +# Random action is far from learned policy, completely unhelpful + +# CORRECT: Gaussian noise on action +action = network(state) +noisy_action = action + np.random.normal(0, exploration_std) +noisy_action = np.clip(noisy_action, -1, 1) +``` + +**Diagnosis:** + +- Continuous action space and using ε-greedy +- Agent not learning effectively + +**Fix:** + +- Use Gaussian noise: action + N(0, σ) +- Decay exploration_std over time (like epsilon decay) + +### Pitfall 4: Forgetting to Decay Exploration + +**Symptom:** Training loss decreases but policy doesn't improve, noisy behavior + +**Root Cause:** Agent keeps exploring randomly instead of exploiting learned policy + +```python +# WRONG: Constant exploration forever +epsilon = 0.3 + +# CORRECT: Decaying exploration +epsilon = epsilon_linear(step, total_steps) +``` + +**Diagnosis:** + +- No epsilon decay schedule mentioned in code +- Agent behaves randomly even after many training steps + +**Fix:** + +- Add decay schedule (linear, exponential, polynomial) + +### Pitfall 5: Using Exploration at Test Time + +**Symptom:** Test performance worse than training, highly variable + +**Root Cause:** Applying exploration strategy (ε > 0) at test time + +```python +# WRONG: Test with exploration +for test_episode in test_episodes: + action = epsilon_greedy(q_values, epsilon=0.05) # Wrong! 
+ # Agent still explores at test time + +# CORRECT: Test with greedy policy +for test_episode in test_episodes: + action = np.argmax(q_values) # Deterministic, no exploration +``` + +**Diagnosis:** + +- Test performance has high variance +- Test performance < training performance (exploration hurts) + +**Fix:** + +- At test time, use greedy/deterministic policy +- No ε-greedy, no Boltzmann, no exploration noise + +### Pitfall 6: RND Predictor Overfitting + +**Symptom:** RND loss decreases but intrinsic rewards still large everywhere + +**Root Cause:** Predictor overfits to training data, doesn't generalize to new states + +```python +# WRONG: High learning rate, no regularization +rnd_optimizer = Adam(rnd.predictor.parameters(), lr=0.001) +rnd_loss.backward() +rnd_optimizer.step() + +# Predictor fits perfectly to seen states but doesn't generalize + +# CORRECT: Lower learning rate, regularization +rnd_optimizer = Adam(rnd.predictor.parameters(), lr=0.0001) +# Add weight decay for regularization +``` + +**Diagnosis:** + +- RND training loss is low (close to 0) +- But intrinsic rewards still high for most states +- Suggests predictor fitted to training states but not generalizing + +**Fix:** + +- Reduce RND learning rate +- Add weight decay (L2 regularization) +- Use batch normalization in predictor + +### Pitfall 7: Count-Based on Non-Tabular Problems + +**Symptom:** Exploration ineffective, agent keeps revisiting similar states + +**Root Cause:** State counting doesn't work for continuous/image spaces + +```python +# WRONG: Counting state IDs in image-based RL +state = env.render(mode='rgb_array') # 84x84 image +state_id = hash(state.tobytes()) # Different hash every time! +count_based_explorer.update_counts(state_id) + +# Every frame is "new" because of slight pixel differences +# State counting broken + +# CORRECT: Use RND or curiosity instead +rnd = RandomNetworkDistillation(state_dim) +# RND handles high-dimensional states +``` + +**Diagnosis:** + +- Using count-based exploration with images/continuous observations +- Exploration not working effectively + +**Fix:** + +- Switch to RND or curiosity-driven methods +- Count-based only for small discrete state spaces + + +## Part 10: Red Flags and Pressure Tests + +### Red Flags Checklist + +- [ ] **Constant epsilon (no decay)**: Agent explores forever, can't exploit +- [ ] **Epsilon decays to zero (pure exploitation)**: Agent stuck in local optimum +- [ ] **ε-greedy on continuous actions**: Exploration unguided, learning slow +- [ ] **Intrinsic reward >> task reward**: Agent ignores task +- [ ] **No exploration specified**: Might be using default (often insufficient) +- [ ] **Using RND in random/stochastic environment**: Breaks because predicting noise +- [ ] **Count-based on images**: Can't count unique images, worthless +- [ ] **Exploration at test time**: Test performance degrades from exploration noise +- [ ] **ICM on pixel-randomness environment**: Forward model tries to predict noise +- [ ] **Ignoring sparse rewards**: Using ε-greedy only, no intrinsic motivation + +### Pressure Test Scenario 1: "My Agent Stuck in Local Optimum" + +**User Query:** "I trained my agent and it found a solution giving +10 reward. But I know the optimal solution is +100. Agent is stuck at +10. How do I fix it?" + +**What Skill Must Provide:** + +1. Diagnosis: Likely exploration problem (local optimum) +2. Solution: Increase exploration (slower decay, or intrinsic motivation) +3. 
Prevention: Design exploration strategy before training + +**Skill Response:** + +"Your agent found a local optimum (+10) and exploitation is too strong. This is a classic exploration-exploitation problem. + +**Quick Fixes:** + +1. Check epsilon decay: Is epsilon decaying too fast? Use slower decay schedule. +2. Check epsilon_end: Is epsilon_end = 0? Change to 0.01-0.05. +3. Add intrinsic motivation: Your environment might need curiosity-driven exploration. + +**Systematic Approach:** + +- If rewards are dense (frequent signals): ε-greedy with slow decay +- If rewards are sparse (rare signals): Add RND or curiosity +- If state space huge: Definitely use RND + +**Example Fix:** + +``` +# Instead of epsilon decaying to 0.01 in 100k steps +# Decay over 1M steps with epsilon_end=0.05 +epsilon = epsilon_linear(step, total_steps=1_000_000, + epsilon_start=1.0, epsilon_end=0.05) +```" + +### Pressure Test Scenario 2: "Which Exploration for Montezuma's Revenge?" + +**User Query:** "I'm training on Montezuma's Revenge (classic sparse reward benchmark). ε-Greedy isn't working (agent never finds first coin). What exploration strategy should I use?" + +**What Skill Must Provide:** +1. Recognition: This is THE benchmark for intrinsic motivation +2. Solution: RND is standard, ICM is alternative +3. Implementation: Code/guidance for using RND + +**Skill Response:** + +"Montezuma's Revenge is the canonical example where ε-greedy fails. The first coin is at step 500+ with no reward signal. Random exploration won't find it. + +**Recommended:** RND (Random Network Distillation) + +Why RND: +- Robust to stochasticity (doesn't try to predict environment) +- Simpler than ICM (no environment model needed) +- Proven to work on Montezuma's Revenge + +**Implementation:** +```python +rnd = RandomNetworkDistillation(state_dim=...) +lambda_intrinsic = 0.01 # Scale intrinsic reward + +for step in training_steps: + state = env.observation + + # Get intrinsic reward + r_intrinsic = rnd.compute_intrinsic_reward(state) + + # Combine with task reward + r_total = r_task + lambda_intrinsic * r_intrinsic + + # Train agent with combined reward + action = agent.select_action(state) + next_state, r_task = env.step(action) + + # Update RND predictor + rnd_loss = rnd.predictor_loss(state) + rnd_optimizer.zero_grad() + rnd_loss.backward() + rnd_optimizer.step() +``` + +**Alternative:** Curiosity-driven (ICM) also works if environment deterministic. + +**What to tune:** + +- lambda_intrinsic: Start 0.01, increase if agent doesn't explore enough +- RND learning rate: Usually 1e-4 (much smaller than main agent)" + +### Pressure Test Scenario 3: "ε-Greedy Decay Schedule" + +**User Query:** "I'm using ε-greedy but not sure about the decay schedule. I have 100k training steps. Should I decay epsilon linearly? Over how many steps? To what final value?" + +**What Skill Must Provide:** + +1. Formula: Linear, exponential, polynomial decay +2. Guidance: Typical values for different problem types +3. Tuning: How to adjust based on task + +**Skill Response:** + +"**Quick Answer:** + +```python +total_steps = 100_000 +epsilon = epsilon_linear(step, total_steps, + epsilon_start=1.0, epsilon_end=0.05) +``` + +**Reasoning:** + +- `epsilon_start=1.0`: Explore fully initially (know nothing) +- `epsilon_end=0.05`: Keep 5% exploration always (helps with stochasticity) +- Linear decay: Simple, works well for most tasks + +**Decision Tree:** + +1. 
**How dense are rewards?** + - Sparse: Use slower decay (maybe 0.1 epsilon_end) + - Dense: Use faster decay (0.01 epsilon_end) + +2. **How many steps?** + - 100k steps: Decay over 50k-100k (half to full training) + - 1M steps: Decay over 500k-1M + - Rule: Decay over 50-100% of training time + +3. **Which decay curve?** + - Linear (ε_linear): Safe default + - Exponential (ε_exponential): Faster early decay + - Polynomial with power=2: Balanced + +**Example Tuning:** + +```python +# Sparse rewards: More exploration needed +epsilon = epsilon_linear(step, total_steps=100_000, + epsilon_start=1.0, epsilon_end=0.1) + +# Very sparse rewards: Even more +epsilon = epsilon_linear(step, total_steps=100_000, + epsilon_start=1.0, epsilon_end=0.2) + +# Dense rewards: Faster decay to exploitation +epsilon = epsilon_linear(step, total_steps=50_000, + epsilon_start=1.0, epsilon_end=0.01) +```" + +### Pressure Test Scenario 4: "ICM vs RND vs ε-Greedy" + +**User Query:** "I have sparse rewards and a deterministic environment. Should I use ε-greedy, ICM curiosity, or RND? How do I decide?" + +**What Skill Must Provide:** +1. Comparison table: Pros/cons of each +2. Decision framework: Which for which situation +3. Implementation difficulty: Is it worth the complexity? + +**Skill Response:** + +"**Comparison:** + +| Method | Complexity | Robustness | When to Use | +|--------|-----------|-----------|------------| +| ε-Greedy | Very Simple | Good on dense rewards | Dense rewards, simple env | +| ICM | Complex (2 models) | Poor on stochastic | Sparse, deterministic, learning important | +| RND | Medium (1 model) | Excellent | Sparse, any environment | + +**Decision Framework:** + +1. **Sparse rewards + Deterministic + Lots of learning:** + → Try ICM first (forward model learns useful features) + +2. **Sparse rewards + Any environment:** + → Use RND (robust, simpler) + +3. **Sparse rewards + Large pixel space:** + → RND (better for high-dimensional observations) + +4. **Dense rewards:** + → Just use ε-greedy (intrinsic motivation not needed) + +5. **Not sure, want safe choice:** + → Use RND (works almost everywhere) + +**Implementation Difficulty:** +- ε-Greedy: 5 lines of code +- ICM: 100+ lines (forward + inverse models, 2 losses) +- RND: 50 lines (target + predictor) + +**My Recommendation:** +Start ε-greedy. If not working after tuning decay schedules, switch to RND. Only use ICM if environment is fully deterministic and you care about learning dynamics." + +### Pressure Test Scenario 5: "Computational Cost of Curiosity" + +**User Query:** "I want to add curiosity-driven exploration (ICM) to my agent. What's the computational overhead? Will it slow down training significantly?" + +**What Skill Must Provide:** +1. Overhead estimate: Exact computation cost +2. Tradeoff: When overhead is justified +3. 
Alternatives: Lighter-weight options + +**Skill Response:** + +"**Computational Overhead Estimate:** + +Base Agent: +- 1 policy/value network +- Training time: 1.0× + +With ICM: +- Policy/value network + Forward model + Inverse model +- Forward model: Encoder + hidden layers + state-dim output +- Inverse model: Concatenate states + hidden + action-dim output +- Training time: ~2.5-3.0× + +**Memory:** +- Base: X MB +- With ICM: ~3-4× X MB +- GPU memory often bottleneck + +**When Overhead is Justified:** +- Sparse rewards (ICM critical to success) +- Large state space (intrinsic motivation helps) +- Willing to wait longer for better exploration + +**When Overhead is NOT Justified:** +- Dense rewards (ε-greedy sufficient) +- Real-time training constraints +- Limited GPU memory + +**Lighter Alternative:** +Use RND instead of ICM: +- ~1.5-2.0× overhead (vs 2.5-3.0× for ICM) +- Same exploration benefits +- Simpler to implement + +**Scaling to Large Models:** +```python +# ICM with huge state encoders can be prohibitive +# Example: Vision transformer encoder → ICM +# That's very expensive + +# RND scales better: predictor can be small +# Don't need sophisticated encoder +``` + +**Bottom Line:** +ICM costs 2-3× training time. If you can afford it and rewards are very sparse, worth it. Otherwise try RND or even ε-greedy with slower decay first." + + +## Part 11: Rationalization Resistance Table + +| Rationalization | Reality | Counter-Guidance | Red Flag | +|-----------------|---------|------------------|----------| +| "ε-Greedy works everywhere" | Fails on sparse rewards, large spaces | Use ε-greedy for dense/small, intrinsic motivation for sparse/large | Applying ε-greedy to Montezuma's Revenge | +| "Higher epsilon is better" | High ε → too random, doesn't exploit | Use decay schedule (ε high early, low late) | Using constant ε=0.5 throughout training | +| "Decay epsilon to zero" | Agent needs residual exploration | Keep ε_end=0.01-0.1 always | Setting ε_final=0 (pure exploitation) | +| "Curiosity always helps" | Can break with stochasticity (model tries to predict noise) | Use RND for stochastic, ICM for deterministic | Agent learns to explore random noise instead of task | +| "RND is just ICM simplified" | RND is fundamentally different (frozen random vs learned model) | Understand frozen network prevents overfitting/noise | Not grasping why RND frozen network matters | +| "More intrinsic reward = faster exploration" | Too much intrinsic reward drowns out task signal | Balance with λ=0.01-0.1, tune on task performance | Agent explores forever, ignores task | +| "Count-based works anywhere" | Only works tabular (can't count unique images) | Use RND for continuous/high-dimensional spaces | Trying count-based on Atari images | +| "Boltzmann is always better than ε-greedy" | Boltzmann smoother but harder to tune | Use ε-greedy for simplicity (it works well) | Switching to Boltzmann without clear benefit | +| "Test with ε>0 for exploration" | Test should use learned policy, not explore | ε=0 or greedy policy at test time | Variable test performance from exploration | +| "Longer decay is always better" | Very slow decay wastes time in early training | Match decay to task difficulty (faster for easy, slower for hard) | Decaying over 10M steps when training only 1M | +| "Skip exploration, increase learning rate" | Learning rate is for optimization, exploration for coverage | Use both: exploration strategy + learning rate | Agent oscillates without exploration | +| "ICM is the SOTA exploration" | RND 
simpler and more robust | Use RND unless you need environment model | Implementing ICM when RND would suffice | + + +## Part 12: Summary and Decision Framework + +### Quick Decision Tree + +``` +START: Need exploration strategy? + +├─ Are rewards sparse? (rare reward signal) +│ ├─ YES → Need intrinsic motivation +│ │ ├─ Environment stochastic? +│ │ │ ├─ YES → RND +│ │ │ └─ NO → ICM (or RND for simplicity) +│ │ └─ Choose RND for safety +│ │ +│ └─ NO → Dense rewards +│ └─ Use ε-greedy + decay schedule + +├─ Is state space large? (images, continuous) +│ ├─ YES → Intrinsic motivation (RND/curiosity) +│ └─ NO → ε-greedy usually sufficient + +└─ Choosing decay schedule: + ├─ Sparse rewards → slower decay (ε_end=0.05-0.1) + ├─ Dense rewards → faster decay (ε_end=0.01) + └─ Default: Linear decay over 50% of training +``` + +### Implementation Checklist + +- [ ] Define reward structure (dense vs sparse) +- [ ] Estimate state space size (discrete vs continuous) +- [ ] Choose exploration method (ε-greedy, curiosity, RND, UCB, count-based) +- [ ] Set epsilon/temperature parameters (start, end) +- [ ] Choose decay schedule (linear, exponential, polynomial) +- [ ] If using intrinsic motivation: set λ (usually 0.01) +- [ ] Use greedy policy at test time (ε=0) +- [ ] Monitor exploration vs exploitation (plot epsilon decay) +- [ ] Tune hyperparameters (decay schedule, λ) based on task performance + +### Typical Configurations + +**Dense Rewards, Small Action Space (e.g., simple game)** + +```python +epsilon = epsilon_linear(step, total_steps=100_000, + epsilon_start=1.0, epsilon_end=0.01) +# Fast exploitation, low exploration needed +``` + +**Sparse Rewards, Discrete Actions (e.g., Atari)** + +```python +rnd = RandomNetworkDistillation(...) +epsilon = epsilon_linear(step, total_steps=1_000_000, + epsilon_start=1.0, epsilon_end=0.05) +r_total = r_task + 0.01 * r_intrinsic +# Intrinsic motivation + slow decay +``` + +**Continuous Control, Sparse (e.g., Robotics)** + +```python +rnd = RandomNetworkDistillation(...) +action = policy(state) + gaussian_noise(std=exploration_std) +exploration_std = exploration_std_linear(..., std_end=0.01) +r_total = r_task + 0.01 * r_intrinsic +# Gaussian noise + RND +``` + + +## Key Takeaways + +1. **Exploration is fundamental**: Don't ignore it. Design exploration strategy before training. + +2. **Match method to problem**: + - Dense rewards → ε-greedy + - Sparse rewards → Intrinsic motivation (RND preferred) + - Large state space → Intrinsic motivation + +3. **Decay exploration over time**: Explore early, exploit late. + +4. **Avoid common pitfalls**: + - Don't decay to zero (ε_end > 0) + - Don't use ε-greedy on continuous actions + - Don't forget decay schedule + - Don't use exploration at test time + +5. **Balance intrinsic and extrinsic**: If using intrinsic rewards, don't let them dominate. + +6. **RND is the safe choice**: Works for most exploration problems, simpler than ICM. + +7. **Test exploration hypothesis**: Plot epsilon or intrinsic rewards, verify exploration strategy is active. + +This skill is about **systematic exploration design**, not just tuning one hyperparameter. 
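For reference, here is a minimal sketch of the schedule helpers used throughout the snippets above (`epsilon_linear`, `epsilon_exponential`, `exploration_std_linear`). The exact signatures are an assumption made for illustration, since these helpers are not defined in this file; adapt them to your own training loop.

```python
def epsilon_linear(step, total_steps, epsilon_start=1.0, epsilon_end=0.01):
    """Linearly anneal epsilon from epsilon_start to epsilon_end over total_steps."""
    fraction = min(step / total_steps, 1.0)
    return epsilon_start + fraction * (epsilon_end - epsilon_start)


def epsilon_exponential(step, decay_rate=0.999, epsilon_start=1.0, epsilon_end=0.01):
    """Exponential per-step decay, floored at epsilon_end so exploration never vanishes."""
    return max(epsilon_end, epsilon_start * (decay_rate ** step))


def exploration_std_linear(step, total_steps, std_start=0.5, std_end=0.01):
    """Same linear schedule applied to Gaussian action-noise std (continuous control)."""
    fraction = min(step / total_steps, 1.0)
    return std_start + fraction * (std_end - std_start)
```

In practice you call one of these once per environment step and plot the returned value over training; if the curve hits its floor well before the agent's performance plateaus, the decay is too fast.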
diff --git a/skills/using-deep-rl/model-based-rl.md b/skills/using-deep-rl/model-based-rl.md new file mode 100644 index 0000000..94adff7 --- /dev/null +++ b/skills/using-deep-rl/model-based-rl.md @@ -0,0 +1,1726 @@ + +# Model-Based Reinforcement Learning + +## When to Use This Skill + +Invoke this skill when you encounter: + +- **Learning World Models**: User wants to predict future states from current state + action +- **Planning with Models**: How to use learned models for planning (MPC, shooting) +- **Dyna-Q Questions**: How to combine model-free (Q-learning) with model-based (planning) +- **MBPO Implementation**: Short rollouts, model ensemble, policy optimization +- **Dreamer Architecture**: Latent world models, imagination in latent space +- **Model Error Handling**: Why long rollouts diverge, how to keep rollouts short +- **Sim-to-Real**: Using simulators, domain randomization, reality gap +- **Sample Efficiency Claims**: When model-based actually saves samples vs compute cost +- **Distribution Shift**: Policy improves → states leave training distribution → model fails + +**This skill bridges model learning and policy improvement.** + +Do NOT use this skill for: + +- Pure dynamics learning (use supervised learning, not RL) +- Perfect simulators (those are Dreamers, not world models) +- Model-free policy optimization (use policy-gradient-methods, actor-critic-methods) +- Debugging specific algorithm (use rl-debugging) + +## Core Principle + +**Model-based RL trades sample complexity for model error.** + +The fundamental tradeoff: + +- **Sample Complexity**: Learning from real world requires few samples (model helps) +- **Model Error**: Learned models diverge from reality, planning on wrong models hurts +- **Solution**: Keep rollouts short (k=5-10), bootstrap with value function, handle distribution shift + +**Without understanding error mechanics, you'll implement algorithms that learn model errors instead of policies.** + + +## Part 1: World Models (Dynamics Learning) + +### What is a World Model? + +A world model (dynamics model) learns to predict the next state from current state and action: + +``` +Deterministic: s_{t+1} = f(s_t, a_t) +Stochastic: p(s_{t+1} | s_t, a_t) = N(μ_θ(s_t, a_t), σ_θ(s_t, a_t)) +``` + +**Key Components**: + +1. **State Representation**: What info captures current situation? (pixels, features, latent) +2. **Dynamics Function**: Neural network mapping (s, a) → s' +3. **Loss Function**: How to train? (MSE, cross-entropy, contrastive) +4. 
**Uncertainty**: Estimate model confidence (ensemble, aleatoric, epistemic) + +### Example 1: Pixel-Based Dynamics + +**Environment**: Cart-pole + +``` +Input: Current image (84×84×4 pixels) +Output: Next image (84×84×4 pixels) +Model: CNN that predicts image differences + +Loss = MSE(predicted_frame, true_frame) + regularization +``` + +**Architecture**: + +```python +class PixelDynamicsModel(nn.Module): + def __init__(self): + self.encoder = CNN(input_channels=4, output_dim=256) + self.dynamics_net = MLP(256 + action_dim, 256) + self.decoder = TransposeCNN(256, output_channels=4) + + def forward(self, s, a): + # Encode image + z = self.encoder(s) + + # Predict latent next state + z_next = self.dynamics_net(torch.cat([z, a], dim=1)) + + # Decode to image + s_next = self.decoder(z_next) + return s_next +``` + +**Training**: + +``` +For each real transition (s, a, s_next): + pred_s_next = model(s, a) + loss = MSE(pred_s_next, s_next) + loss.backward() +``` + +**Problem**: Pixel-space errors compound (blurry 50-step predictions). + + +### Example 2: Latent-Space Dynamics + +**Better for high-dim observations** (learn representation + dynamics separately). + +**Architecture**: + +``` +1. Encoder: s → z (256-dim latent) +2. Dynamics: z_t, a_t → z_{t+1} +3. Decoder: z → s (reconstruction) +4. Reward Predictor: z, a → r +``` + +**Training**: + +``` +Reconstruction loss: ||s - decode(encode(s))||² +Dynamics loss: ||z_{t+1} - f(z_t, a_t)||² +Reward loss: ||r - reward_net(z_t, a_t)||² +``` + +**Advantage**: Learns compact representation, faster rollouts, better generalization. + + +### Example 3: Stochastic Dynamics + +**Handle environment stochasticity** (multiple outcomes from (s, a)): + +```python +class StochasticDynamicsModel(nn.Module): + def forward(self, s, a): + # Predict mean and std of next state distribution + z = self.encoder(s) + mu, log_sigma = self.dynamics_net(torch.cat([z, a], dim=1)) + + # Sample next state + z_next = mu + torch.exp(log_sigma) * torch.randn_like(mu) + return z_next, mu, log_sigma +``` + +**Training**: + +``` +NLL loss = -log p(s_{t+1} | s_t, a_t) + = ||s_{t+1} - μ||² / (2σ²) + log σ +``` + +**Key**: Captures uncertainty (aleatoric: environment noise, epistemic: model uncertainty). + + +### World Model Pitfall #1: Compounding Errors + +**Bad Understanding**: "If model is 95% accurate, 50-step rollout is (0.95)^50 = 5% accurate." + +**Reality**: Error compounds worse. + +**Mechanics**: + +``` +Step 1: s1_pred = s1_true + ε1 +Step 2: s2_pred = f(s1_pred, a1) = f(s1_true + ε1, a1) = f(s1_true, a1) + ∇f ε1 + ε2 + Error grows: ε_cumulative ≈ ||∇f|| * ε_prev + ε2 +Step 3: Error keeps magnifying (if ||∇f|| > 1) +``` + +**Example**: Cart-pole position error 0.1 pixel + +``` +After 1 step: 0.10 +After 5 steps: ~0.15 (small growth) +After 10 steps: ~0.25 (noticeable) +After 50 steps: ~2.0 (completely wrong) +``` + +**Solution**: Use short rollouts (k=5-10), trust value function beyond. + + +### World Model Pitfall #2: Distribution Shift + +**Scenario**: Train model on policy π_0 data, policy improves to π_1. 
+ +**What Happens**: + +``` +π_0 data distribution: {s1, s2, s3, ...} +Model trained on: P_0(s) + +π_1 visits new states: {s4, s5, s6, ...} +Model has no training data for {s4, s5, s6} +Model predictions on new states: WRONG (distribution shift) + +Planning uses wrong model → Policy learns model errors +``` + +**Example**: Cartpole + +- Initial: pole barely moving +- After learning: pole swinging wildly +- Model trained on small-angle dynamics +- New states (large angle) outside training distribution +- Model breaks + +**Solution**: + +1. Retrain model frequently (as policy improves) +2. Use ensemble (detect epistemic uncertainty in new states) +3. Keep policy close to training distribution (regularization) + + +## Part 2: Planning with Learned Models + +### What is Planning? + +Planning = using model to simulate trajectories and find good actions. + +**General Form**: + +``` +Given: +- Current state s_t +- Dynamics model f(·) +- Reward function r(·) (known or learned) +- Value function V(·) (for horizon beyond imagination) + +Find action a_t that maximizes: + Q(s_t, a_t) = E[Σ_{τ=0}^{k} γ^τ r(s_τ, a_τ) + γ^k V(s_{t+k})] +``` + +**Two Approaches**: + +1. **Model Predictive Control (MPC)**: Solve optimization at each step +2. **Shooting Methods**: Sample trajectories, pick best + + +### Model Predictive Control (MPC) + +**Algorithm**: + +``` +1. At each step: + - Initialize candidate actions a₀, a₁, ..., a_{k-1} + +2. Compute k-step imagined rollout: + s₁ = f(s_t, a₀) + s₂ = f(s₁, a₁) + ... + s_k = f(s_{k-1}, a_{k-1}) + +3. Evaluate trajectory: + Q = Σ τ=0 to k-1 [γ^τ r(s_τ, a_τ)] + γ^k V(s_k) + +4. Optimize actions to maximize Q +5. Execute first action a₀, discard rest +6. Replan at next step +``` + +**Optimization Methods**: + +- **Cross-Entropy Method (CEM)**: Sample actions, keep best, resample +- **Shooting**: Random shooting, iLQR, etc. 
+ +**Example**: Cart-pole with learned model + +```python +def mpc_planning(s_current, model, reward_fn, value_fn, k=5, horizon=100): + best_action = None + best_return = -float('inf') + + # Sample candidate action sequences + for _ in range(100): # CEM: sample trajectories + actions = np.random.randn(k, action_dim) + + # Simulate trajectory + s = s_current + trajectory_return = 0 + + for t in range(k): + s_next = model(s, actions[t]) + r = reward_fn(s, actions[t]) + trajectory_return += gamma**t * r + s = s_next + + # Bootstrap with value + trajectory_return += gamma**k * value_fn(s) + + # Track best + if trajectory_return > best_return: + best_return = trajectory_return + best_action = actions[0] + + return best_action +``` + +**Key Points**: + +- Replan at every step (expensive, but avoids compounding errors) +- Use short horizons (k=5-10) +- Bootstrap with value function + + +### Shooting Methods + +**Random Shooting** (simplest): + +```python +def random_shooting(s, model, reward_fn, value_fn, k=5, num_samples=1000): + best_action = None + best_return = -float('inf') + + # Sample random action sequences + for _ in range(num_samples): + actions = np.random.uniform(action_min, action_max, size=(k, action_dim)) + + # Rollout + s_current = s + returns = 0 + for t in range(k): + s_next = model(s_current, actions[t]) + r = reward_fn(s_current, actions[t]) + returns += gamma**t * r + s_current = s_next + + # Bootstrap + returns += gamma**k * value_fn(s_current) + + if returns > best_return: + best_return = returns + best_action = actions[0] + + return best_action +``` + +**Trade-offs**: + +- Pros: Simple, parallelizable, no gradient computation +- Cons: Slow (needs many samples), doesn't refine actions + +**iLQR/LQR**: Assumes quadratic reward, can optimize actions. + + +### Planning Pitfall #1: Long Horizons + +**User Belief**: "k=50 is better than k=5 (more planning)." + +**Reality**: + +``` +k=5: Q = r₀ + γr₁ + ... + γ⁴r₄ + γ⁵V(s₅) + Errors from 5 steps of model error + But V(s₅) more reliable (only 5 steps out) + +k=50: Q = r₀ + γr₁ + ... + γ⁴⁹r₄₉ + γ⁵⁰V(s₅₀) + Errors from 50 steps compound! + s₅₀ prediction probably wrong + V(s₅₀) estimated on out-of-distribution state +``` + +**Result**: k=50 rollouts learn model errors, policy worse than k=5. + + +## Part 3: Dyna-Q (Model + Model-Free Hybrid) + +### The Idea + +**Dyna = Dynamics + Q-Learning** + +Combine: + +1. **Real Transitions**: Learn Q from real environment data (model-free) +2. **Imagined Transitions**: Learn Q from model-generated data (model-based) + +**Why?** Leverage both: + +- Real data: Updates are correct, but expensive +- Imagined data: Updates are cheap, but noisy + + +### Dyna-Q Algorithm + +``` +Initialize: + Q(s, a) = 0 for all (s, a) + M = {} (dynamics model, initially empty) + +Repeat: + 1. Sample real transition: (s, a) → (r, s_next) + + 2. Update Q from real transition (Q-learning): + Q[s, a] += α(r + γ max_a' Q[s_next, a'] - Q[s, a]) + + 3. Update model M with real transition: + M[s, a] = (r, s_next) [deterministic, or learn distribution] + + 4. Imagine k steps: + For n = 1 to k: + s_r = random state from visited states + a_r = random action + (r, s_next) = M[s_r, a_r] + + # Update Q from imagined transition + Q[s_r, a_r] += α(r + γ max_a' Q[s_next, a'] - Q[s_r, a_r]) +``` + +**Key Insight**: Use model to generate additional training data (imagined transitions). 
+ + +### Example: Dyna-Q on Cartpole + +```python +class DynaQ: + def __init__(self, alpha=0.1, gamma=0.9, k_planning=10): + self.Q = defaultdict(lambda: defaultdict(float)) + self.M = {} # state, action → (reward, next_state) + self.alpha = alpha + self.gamma = gamma + self.k = k_planning + self.visited_states = set() + self.visited_actions = {} + + def learn_real_transition(self, s, a, r, s_next): + """Learn from real transition (step 1-3)""" + # Q-learning update + max_q_next = max(self.Q[s_next].values()) if s_next in self.Q else 0 + self.Q[s][a] += self.alpha * (r + self.gamma * max_q_next - self.Q[s][a]) + + # Model update + self.M[(s, a)] = (r, s_next) + + # Track visited states/actions + self.visited_states.add(s) + if s not in self.visited_actions: + self.visited_actions[s] = set() + self.visited_actions[s].add(a) + + def planning_steps(self): + """Imagine k steps (step 4)""" + for _ in range(self.k): + # Random state-action from memory + s_r = random.choice(list(self.visited_states)) + a_r = random.choice(list(self.visited_actions[s_r])) + + # Imagine transition + if (s_r, a_r) in self.M: + r, s_next = self.M[(s_r, a_r)] + + # Q-learning update on imagined transition + max_q_next = max(self.Q[s_next].values()) if s_next in self.Q else 0 + self.Q[s_r][a_r] += self.alpha * ( + r + self.gamma * max_q_next - self.Q[s_r][a_r] + ) + + def choose_action(self, s, epsilon=0.1): + """ε-greedy policy""" + if random.random() < epsilon: + return random.choice(actions) + return max(self.Q[s].items(), key=lambda x: x[1])[0] + + def train_episode(self, env): + s = env.reset() + done = False + + while not done: + a = self.choose_action(s) + s_next, r, done, _ = env.step(a) + + # Learn from real transition + self.learn_real_transition(s, a, r, s_next) + + # Planning steps + self.planning_steps() + + s = s_next +``` + +**Benefits**: + +- Real transitions: Accurate but expensive +- Imagined transitions: Cheap, accelerates learning + +**Sample Efficiency**: Dyna-Q learns faster than Q-learning alone (imagined transitions provide extra updates). + + +### Dyna-Q Pitfall #1: Model Overfitting + +**Problem**: Model learned on limited data, doesn't generalize. + +**Example**: Model memorizes transitions, imagined transitions all identical. + +**Solution**: + +1. Use ensemble (multiple models, average predictions) +2. Track model uncertainty +3. Weight imagined updates by confidence +4. Limit planning in uncertain regions + + +## Part 4: MBPO (Model-Based Policy Optimization) + +### The Idea + +**MBPO = Short rollouts + Policy optimization (SAC)** + +Key Insight: Don't use model for full-episode rollouts. Use model for short rollouts (k=5), bootstrap with learned value function. + +**Architecture**: + +``` +1. Train ensemble of dynamics models (4-7 models) +2. For each real transition (s, a) → (r, s_next): + - Roll out k=5 steps with model + - Collect imagined transitions (s, a, r, s', s'', ...) +3. Combine real + imagined data +4. Update Q-function and policy (SAC) +5. Repeat +``` + + +### MBPO Algorithm + +``` +Initialize: + Models = [M1, M2, ..., M_n] (ensemble) + Q-function, policy, target network + +Repeat for N environment steps: + 1. Collect real transition: (s, a) → (r, s_next) + + 2. Roll out k steps using ensemble: + s = s_current + For t = 1 to k: + # Use ensemble mean (or sample one model) + s_next = mean([M_i(s, a) for M_i in Models]) + r = reward_fn(s, a) [learned reward model] + + Store imagined transition: (s, a, r, s_next) + s = s_next + + 3. 
Mix real + imagined: + - Real buffer: 10% real transitions + - Imagined buffer: 90% imagined transitions (from rollouts) + + 4. Update Q-function (n_gradient_steps): + Sample batch from mixed buffer + Compute TD error: (r + γ V(s_next) - Q(s, a))² + Optimize Q + + 5. Update policy (n_policy_steps): + Use SAC: maximize E[Q(s, a) - α log π(a|s)] + + 6. Decay rollout ratio: + As model improves, increase imagined % (k stays fixed) +``` + + +### Key MBPO Design Choices + +**1. Rollout Length k**: + +``` +k=5-10 recommended (not k=50) + +Why short? +- Error compounding (k=5 gives manageable error) +- Value bootstrapping works (V is learned from real data) +- MPC-style replanning (discard imagined trajectory) +``` + +**2. Ensemble Disagreement**: + +``` +High disagreement = model uncertainty in new state region + +Use disagreement as: +- Early stopping (stop imagining if uncertainty high) +- Weighting (less trust in uncertain predictions) +- Exploration bonus (similar to curiosity) + +disagreement = max_i ||M_i(s, a) - M_j(s, a)|| +``` + +**3. Model Retraining Schedule**: + +``` +Too frequent: Overfitting to latest data +Too infrequent: Model becomes stale + +MBPO: Retrain every N environment steps + Typical: N = every 1000 real transitions +``` + +**4. Real vs Imagined Ratio**: + +``` +High real ratio: Few imagined transitions, limited speedup +High imagined ratio: Many imagined transitions, faster, higher model error + +MBPO: Start high real % (100%), gradually increase imagined % to 90% + +Why gradually? +- Early: Model untrained, use real data +- Later: Model accurate, benefit from imagined data +``` + + +### MBPO Example (Pseudocode) + +```python +class MBPO: + def __init__(self, env, k=5, num_models=7): + self.models = [DynamicsModel() for _ in range(num_models)] + self.q_net = QNetwork() + self.policy = SACPolicy() + self.target_q_net = deepcopy(self.q_net) + + self.k = k # Rollout length + self.real_ratio = 0.05 + self.real_buffer = ReplayBuffer() + self.imagined_buffer = ReplayBuffer() + + def collect_real_transitions(self, num_steps=1000): + """Collect from real environment""" + for _ in range(num_steps): + s = self.env.state + a = self.policy(s) + r, s_next = self.env.step(a) + + self.real_buffer.add((s, a, r, s_next)) + + # Retrain models + if len(self.real_buffer) % 1000 == 0: + self.train_models() + self.generate_imagined_transitions() + + def train_models(self): + """Train ensemble on real data""" + for model in self.models: + dataset = self.real_buffer.sample_batch(batch_size=256) + for _ in range(model_epochs): + loss = model.train_on_batch(dataset) + + def generate_imagined_transitions(self): + """Roll out k steps with each real transition""" + for (s, a, r_real, s_next_real) in self.real_buffer.sample_batch(256): + # Discard, use to seed rollouts + + # Rollout k steps + s = s_next_real # Start from real next state + for t in range(self.k): + # Ensemble prediction (mean) + s_pred = torch.stack([m(s, None) for m in self.models]).mean(dim=0) + r_pred = self.reward_model(s, None) # Learned reward + + # Check ensemble disagreement + disagreement = torch.std( + torch.stack([m(s, None) for m in self.models]), dim=0 + ).mean() + + # Early stopping if uncertain + if disagreement > uncertainty_threshold: + break + + # Store imagined transition + self.imagined_buffer.add((s, a_random, r_pred, s_pred)) + + s = s_pred + + def train_policy(self, num_steps=10000): + """Train Q-function and policy with mixed data""" + for step in range(num_steps): + # Sample from mixed buffer (5% real, 95% 
imagined) + if random.random() < self.real_ratio: + batch = self.real_buffer.sample_batch(128) + else: + batch = self.imagined_buffer.sample_batch(128) + + # Q-learning update (SAC) + td_target = batch['r'] + gamma * self.target_q_net(batch['s_next']) + q_loss = MSE(self.q_net(batch['s'], batch['a']), td_target) + q_loss.backward() + + # Policy update (SAC) + a_new = self.policy(batch['s']) + policy_loss = -self.q_net(batch['s'], a_new) + alpha * entropy(a_new) + policy_loss.backward() +``` + + +### MBPO Pitfalls + +**Pitfall 1: k too large** + +``` +k=50 → Model errors compound, policy learns errors +k=5 → Manageable error, good bootstrap +``` + +**Pitfall 2: No ensemble** + +``` +Single model → Overconfident, plans in wrong regions +Ensemble → Uncertainty estimated, early stopping works +``` + +**Pitfall 3: Model never retrained** + +``` +Policy improves → States change → Model becomes stale +Solution: Retrain every N steps (or when performance plateaus) +``` + +**Pitfall 4: High imagined ratio early** + +``` +Model untrained, 90% imagined data → Learning garbage +Solution: Start low (5% imagined), gradually increase +``` + + +## Part 5: Dreamer (Latent World Models) + +### The Idea + +**Dreamer = Imagination in latent space** + +Problem: Pixel-space world models hard to train (blurry reconstructions, high-dim). +Solution: Learn latent representation, do imagination there. + +**Architecture**: + +``` +1. Encoder: Image → Latent (z) +2. VAE: Latent space with KL regularization +3. Dynamics in latent: z_t, a_t → z_{t+1} +4. Policy: z_t → a_t (learns to dream) +5. Value: z_t → V(z_t) +6. Decoder: z_t → Image (reconstruction) +7. Reward: z_t, a_t → r (predict reward in latent space) +``` + +**Key Difference from MBPO**: + +- MBPO: Short rollouts in state space, then Q-learning +- Dreamer: Imagine trajectories in latent space, then train policy + value in imagination + + +### Dreamer Algorithm + +``` +Phase 1: World Model Learning (offline) + Given: Real replay buffer with (image, action, reward) + + 1. Encode: z_t = encoder(image_t) + 2. Learn VAE loss: KL(z || N(0, I)) + ||decode(z) - image||² + 3. Learn dynamics: ||z_{t+1} - dynamics(z_t, a_t)||² + 4. Learn reward: ||r_t - reward_net(z_t, a_t)||² + 5. Learn value: ||V(z_t) - discounted_return_t||² + +Phase 2: Imagination (online, during learning) + Given: Trained world model + + 1. Sample state from replay buffer: z₀ = encoder(image₀) + 2. Imagine trajectory (15-50 steps): + a_t ~ π(a_t | z_t) [policy samples actions] + r_t = reward_net(z_t, a_t) [predict reward] + z_{t+1} ~ dynamics(z_t, a_t) [sample next latent] + 3. Compute imagined returns: + G_t = r_t + γ r_{t+1} + ... + γ^{k-1} r_{t+k} + γ^k V(z_{t+k}) + 4. Train policy to maximize: E[G_t] + 5. Train value to match: E[(V(z_t) - G_t)²] +``` + + +### Dreamer Details + +**1. Latent Dynamics Learning**: + +``` +In pixel space: Errors accumulate visibly (blurry) +In latent space: Errors more abstract, easier to learn dynamics + +Model: z_{t+1} = μ_θ(z_t, a_t) + σ_θ(z_t, a_t) * ε + ε ~ N(0, I) + +Loss: NLL(z_{t+1} | z_t, a_t) +``` + +**2. Policy Learning via Imagination**: + +``` +Standard RL in imagined trajectories (not real) + +π(a_t | z_t) learns to select actions that: +- Maximize predicted reward +- Maximize value (long-term) +- Be uncertain in model predictions (curious) +``` + +**3. 
Value Learning via Imagination**: + +``` +V(z_t) learns to estimate imagined returns + +Using stop-gradient (or separate network): +V(z_t) ≈ E[G_t] over imagined trajectories + +This enables bootstrapping in imagination +``` + + +### Dreamer Example (Pseudocode) + +```python +class Dreamer: + def __init__(self): + self.encoder = Encoder() # image → z + self.decoder = Decoder() # z → image + self.dynamics = Dynamics() # (z, a) → z + self.reward_net = RewardNet() # (z, a) → r + self.policy = Policy() # z → a + self.value_net = ValueNet() # z → V(z) + + def world_model_loss(self, batch_images, batch_actions, batch_rewards): + """Phase 1: Learn world model (supervised)""" + # Encode + z = self.encoder(batch_images) + z_next = self.encoder(batch_images_next) + + # VAE loss (regularize latent) + kl_loss = kl_divergence(z, N(0, I)) + recon_loss = MSE(self.decoder(z), batch_images) + + # Dynamics loss + z_next_pred = self.dynamics(z, batch_actions) + dynamics_loss = MSE(z_next_pred, z_next) + + # Reward loss + r_pred = self.reward_net(z, batch_actions) + reward_loss = MSE(r_pred, batch_rewards) + + total_loss = kl_loss + recon_loss + dynamics_loss + reward_loss + return total_loss + + def imagine_trajectory(self, z_start, horizon=50): + """Phase 2: Imagine trajectory""" + z = z_start + trajectory = [] + + for t in range(horizon): + # Sample action + a = self.policy(z) + + # Predict reward + r = self.reward_net(z, a) + + # Imagine next state + mu, sigma = self.dynamics(z, a) + z_next = mu + sigma * torch.randn_like(mu) + + trajectory.append((z, a, r, z_next)) + z = z_next + + return trajectory + + def compute_imagined_returns(self, trajectory): + """Compute G_t = r_t + γ r_{t+1} + ... + γ^k V(z_k)""" + returns = [] + G = 0 + + # Backward pass + for z, a, r, z_next in reversed(trajectory): + G = r + gamma * G + + # Add value bootstrap + z_final = trajectory[-1][3] + G += gamma ** len(trajectory) * self.value_net(z_final) + + return G + + def train_policy_and_value(self, z_start_batch, horizon=15): + """Phase 2: Train policy and value in imagination""" + z = z_start_batch + returns_list = [] + + # Rollout imagination + for t in range(horizon): + a = self.policy(z) + r = self.reward_net(z, a) + + mu, sigma = self.dynamics(z, a) + z_next = mu + sigma * torch.randn_like(mu) + + # Compute return-to-go + G = r + gamma * self.value_net(z_next) + returns_list.append(G) + + z = z_next + + # Train value + value_loss = MSE(self.value_net(z_start_batch), returns_list[0]) + value_loss.backward() + + # Train policy (maximize imagined return) + policy_loss = -returns_list[0].mean() # Maximize return + policy_loss.backward() +``` + + +### Dreamer Pitfalls + +**Pitfall 1: Too-long imagination** + +``` +h=50: Latent dynamics errors compound +h=15: Better (manageable error) +``` + +**Pitfall 2: No KL regularization** + +``` +VAE collapses → z same for all states → dynamics useless +Solution: KL term forces diverse latent space +``` + +**Pitfall 3: Policy overfits to value estimates** + +``` +Early imagination: V(z_t) estimates wrong +Policy follows wrong value + +Solution: +- Uncertainty estimation in imagination +- Separate value network +- Stop-gradient on value target +``` + + +## Part 6: When Model-Based Helps + +### Sample Efficiency + +**Claim**: "Model-based RL is 10-100x more sample efficient." + +**Reality**: Depends on compute budget. 
+ +**Example**: Cartpole + +``` +Model-free (DQN): 100k samples, instant policy +Model-based (MBPO): + - 10k samples to train model: 2 minutes + - 1 million imagined rollouts: 30 minutes + - Total: 32 minutes for 10k real samples + +Model-free wins on compute +``` + +**When Model-Based Helps**: + +1. **Real samples expensive**: Robotics (100s per hour) +2. **Sim available**: Use for pre-training, transfer to real +3. **Multi-task**: Reuse model for multiple tasks +4. **Offline RL**: No online interaction, must plan from fixed data + + +### Sim-to-Real Transfer + +**Setup**: + +1. Train model + policy in simulator (cheap samples) +2. Test on real robot (expensive, dangerous) +3. Reality gap: Simulator ≠ Real world + +**Approaches**: + +1. **Domain Randomization**: Vary simulator dynamics, color, physics +2. **System Identification**: Fit simulator to real robot +3. **Robust Policy**: Train policy robust to model errors + +**MBPO in Sim-to-Real**: + +``` +1. Train in simulator (unlimited samples) +2. Collect real data (expensive) +3. Finetune model + policy on real data +4. Continue imagining with real-trained model +``` + + +### Multi-Task Learning + +**Setup**: Train model once, use for multiple tasks. + +**Example**: + +``` +Model learns: p(s_{t+1} | s_t, a_t) [task-independent] +Task 1 reward: r₁(s, a) +Task 2 reward: r₂(s, a) + +Plan with model + reward₁ +Plan with model + reward₂ +``` + +**Advantage**: Model amortizes over tasks. + + +## Part 7: Model Error Handling + +### Error Sources + +**1. Aleatoric (Environment Noise)**: + +``` +Same (s, a) can lead to multiple s' +Example: Pushing object, slight randomness in friction + +Solution: Stochastic model p(s' | s, a) +``` + +**2. Epistemic (Model Uncertainty)**: + +``` +Limited training data, model hasn't seen this state +Example: Policy explores new region, model untrained + +Solution: Ensemble, Bayesian network, uncertainty quantification +``` + +**3. 
Distribution Shift**: + +``` +Policy improves, visits new states +Model trained on old policy data +New states: Out of training distribution + +Solution: Retraining, regularization, uncertainty detection +``` + + +### Handling Uncertainty + +**Approach 1: Ensemble**: + +```python +# Train multiple models on same data +models = [DynamicsModel() for _ in range(7)] +for model in models: + train_model(model, data) + +# Uncertainty = disagreement +predictions = [m(s, a) for m in models] +mean_pred = torch.stack(predictions).mean(dim=0) +std_pred = torch.stack(predictions).std(dim=0) + +# Use for early stopping +if std_pred.mean() > threshold: + stop_rollout() +``` + +**Approach 2: Uncertainty Weighting**: + +``` +High uncertainty → Less trust → Lower imagined data weight + +Weight for imagined transition = 1 / (1 + ensemble_disagreement) +``` + +**Approach 3: Conservative Planning**: + +``` +Roll out only when ensemble agrees + +disagreement = max_disagreement between models +if disagreement < threshold: + roll_out() +else: + use_only_real_data() +``` + + +## Part 8: Implementation Patterns + +### Pseudocode: Learning Dynamics Model + +```python +class DynamicsModel: + def __init__(self, state_dim, action_dim): + self.net = MLP(state_dim + action_dim, state_dim) + self.optimizer = Adam(self.net.parameters()) + + def predict(self, s, a): + """Predict next state""" + sa = torch.cat([s, a], dim=-1) + s_next = self.net(sa) + return s_next + + def train(self, dataset): + """Supervised learning on real transitions""" + s, a, s_next = dataset + + # Forward pass + s_next_pred = self.predict(s, a) + + # Loss + loss = MSE(s_next_pred, s_next) + + # Backward pass + self.optimizer.zero_grad() + loss.backward() + self.optimizer.step() + + return loss.item() +``` + +### Pseudocode: MPC Planning + +```python +def mpc_plan(s_current, model, reward_fn, value_fn, k=5, num_samples=100): + """Model Predictive Control""" + best_action = None + best_return = -float('inf') + + for _ in range(num_samples): + # Sample action sequence + actions = np.random.uniform(-1, 1, size=(k, action_dim)) + + # Rollout k steps + s = s_current + trajectory_return = 0 + + for t in range(k): + s_next = model.predict(s, actions[t]) + r = reward_fn(s, actions[t]) + trajectory_return += (gamma ** t) * r + s = s_next + + # Bootstrap with value + trajectory_return += (gamma ** k) * value_fn(s) + + # Track best + if trajectory_return > best_return: + best_return = trajectory_return + best_action = actions[0] + + return best_action +``` + + +## Part 9: Common Pitfalls Summary + +### Pitfall 1: Long Rollouts + +``` +k=50 → Model errors compound +k=5 → Manageable error, good bootstrap +FIX: Keep k small, use value function +``` + +### Pitfall 2: Distribution Shift + +``` +Policy changes → New states outside training distribution → Model wrong +FIX: Retrain model frequently, use ensemble for uncertainty +``` + +### Pitfall 3: Model Overfitting + +``` +Few transitions → Model memorizes +FIX: Ensemble, regularization, hold-out validation set +``` + +### Pitfall 4: No Value Bootstrapping + +``` +Pure imagined returns → All error in rollout +FIX: Bootstrap with learned value at horizon k +``` + +### Pitfall 5: Using Model-Based When Model-Free Better + +``` +Simple task, perfect simulator → Model-based wastes compute +FIX: Use model-free (DQN, PPO) unless samples expensive +``` + +### Pitfall 6: Model Never Updated + +``` +Policy improves, model stays frozen → Model stale +FIX: Retrain every N steps or monitor validation performance +``` + +### 
Pitfall 7: High Imagined Data Ratio Early + +``` +Untrained model, 90% imagined → Learning garbage +FIX: Start with low imagined ratio, gradually increase +``` + +### Pitfall 8: No Ensemble + +``` +Single model → Overconfident in uncertain regions +FIX: Use 4-7 models, aggregate predictions +``` + +### Pitfall 9: Ignoring Reward Function + +``` +Use true reward with imperfect state model +FIX: Also learn reward model (or use true rewards if available) +``` + +### Pitfall 10: Planning Too Long + +``` +Expensive planning, model errors → Not worth compute +FIX: Short horizons (k=5), real-time constraints +``` + + +## Part 10: Red Flags in Model-Based RL + +- [ ] **Long rollouts (k > 20)**: Model errors compound, use short rollouts +- [ ] **No value function**: Pure imagined returns, no bootstrap +- [ ] **Single model**: Overconfident, use ensemble +- [ ] **Model never retrained**: Policy changes, model becomes stale +- [ ] **High imagined ratio early**: Learning from bad model, start with 100% real +- [ ] **No distribution shift handling**: New states outside training distribution +- [ ] **Comparing to wrong baseline**: MBPO vs model-free, not MBPO vs DQN with same compute +- [ ] **Believing sample efficiency claims**: Model helps sample complexity, not compute time +- [ ] **Treating dynamics as perfect**: Model is learned, has errors +- [ ] **No uncertainty estimates**: Can't detect when to stop rolling out + + +## Part 11: Rationalization Resistance + +| Rationalization | Reality | Counter | Red Flag | +|---|---|---|---| +| "k=50 is better planning" | Errors compound, k=5 better | Use short rollouts, bootstrap value | Long horizons | +| "I trained a model, done" | Missing planning algorithm | Use model for MPC/shooting/Dyna | No planning step | +| "100% imagined data" | Model untrained, garbage quality | Start 100% real, gradually increase | No real data ratio | +| "Single model fine" | Overconfident, plans in wrong regions | Ensemble provides uncertainty | Single model | +| "Model-based always better" | Model errors + compute vs sample efficiency | Only help when real samples expensive | Unconditional belief | +| "One model for life" | Policy improves, model becomes stale | Retrain every N steps | Static model | +| "Dreamer works on pixels" | Needs good latent learning, complex tuning | MBPO simpler on state space | Wrong problem | +| "Value function optional" | Pure rollout return = all model error | Bootstrap with learned value | No bootstrapping | + + +## Summary + +**You now understand**: + +1. **World Models**: Learning p(s_{t+1} | s_t, a_t), error mechanics +2. **Planning**: MPC, shooting, Dyna-Q, short horizons, value bootstrapping +3. **Dyna-Q**: Combining real + imagined transitions +4. **MBPO**: Short rollouts (k=5), ensemble, value bootstrapping +5. **Dreamer**: Latent imagination, imagination in latent space +6. **Model Error**: Compounding, distribution shift, uncertainty estimation +7. **When to Use**: Real samples expensive, sim-to-real, multi-task +8. 
**Pitfalls**: Long rollouts, no bootstrapping, overconfidence, staleness + +**Key Insights**: + +- **Error compounding**: Keep k small (5-10), trust value function beyond +- **Distribution shift**: Retrain model as policy improves, use ensemble +- **Value bootstrapping**: Horizon k, then V(s_k), not pure imagined return +- **Sample vs Compute**: Model helps sample complexity, not compute time +- **When it helps**: Real samples expensive (robotics), sim-to-real, multi-task + +**Route to implementation**: Use MBPO for continuous control, Dyna-Q for discrete, Dreamer for visual tasks. + +**This foundation enables debugging model-based algorithms and knowing when they're appropriate.** + + +## Part 12: Advanced Model Learning Techniques + +### Latent Ensemble Models + +**Why Latent?** State/pixel space models struggle with high-dimensional data. + +**Architecture**: + +``` +Encoder: s (pixels) → z (latent, 256-dim) +Ensemble models: z_t, a_t → z_{t+1} +Decoder: z → s (reconstruction) + +7 ensemble models in latent space (not pixel space) +``` + +**Benefits**: + +1. **Smaller models**: Latent 256-dim vs pixel 84×84×3 +2. **Better dynamics**: Learned in abstract space +3. **Faster training**: 10x faster than pixel models +4. **Better planning**: Latent trajectories more stable + +**Implementation Pattern**: + +```python +class LatentEnsembleDynamics: + def __init__(self): + self.encoder = PixelEncoder() # image → z + self.decoder = PixelDecoder() # z → image + self.models = [LatentDynamics() for _ in range(7)] + + def encode_batch(self, images): + return self.encoder(images) + + def predict_latent_ensemble(self, z, a): + """Predict next latent, with uncertainty""" + predictions = [m(z, a) for m in self.models] + z_next_mean = torch.stack(predictions).mean(dim=0) + z_next_std = torch.stack(predictions).std(dim=0) + return z_next_mean, z_next_std + + def decode_batch(self, z): + return self.decoder(z) +``` + + +### Reward Model Learning + +**When needed**: Visual RL (don't have privileged reward) + +**Structure**: + +``` +Reward predictor: (s or z, a) → r +Trained via supervised learning on real transitions +``` + +**Training**: + +```python +class RewardModel(nn.Module): + def __init__(self, latent_dim, action_dim): + self.net = MLP(latent_dim + action_dim, 1) + + def forward(self, z, a): + za = torch.cat([z, a], dim=-1) + r = self.net(za) + return r + + def train_step(self, batch): + z, a, r_true = batch + r_pred = self.forward(z, a) + loss = MSE(r_pred, r_true) + loss.backward() + return loss.item() +``` + +**Key**: Train on ground truth rewards from environment. + +**Integration with MBPO**: + +- Use learned reward when true reward unavailable +- Use true reward when available (more accurate) + + +### Model Selection and Scheduling + +**Problem**: Which model to use for which task? 
+ +**Solution: Modular Approach** + +```python +class ModelScheduler: + def __init__(self): + self.deterministic = DeterministicModel() # For planning + self.stochastic = StochasticModel() # For uncertainty + self.ensemble = [DynamicsModel() for _ in range(7)] + + def select_for_planning(self, num_rollouts): + """Choose model based on phase""" + if num_rollouts < 100: + return self.stochastic # Learn uncertainty + else: + return self.ensemble # Use for planning + + def select_for_training(self): + return self.deterministic # Simple, stable +``` + +**Use Cases**: + +- Deterministic: Fast training, baseline +- Stochastic: Uncertainty quantification +- Ensemble: Planning with disagreement detection + + +## Part 13: Multi-Step Planning Algorithms + +### Cross-Entropy Method (CEM) for Planning + +**Idea**: Iteratively refine action sequence. + +``` +1. Sample N random action sequences +2. Evaluate all (rollout with model) +3. Keep top 10% (elite) +4. Fit Gaussian to elite +5. Sample from Gaussian +6. Repeat 5 times +``` + +**Implementation**: + +```python +def cem_plan(s, model, reward_fn, value_fn, k=5, num_samples=100, num_iters=5): + """Cross-Entropy Method for planning""" + action_dim = 2 # Example: 2D action + a_min, a_max = -1.0, 1.0 + + # Initialize distribution + mu = torch.zeros(k, action_dim) + sigma = torch.ones(k, action_dim) + + for iteration in range(num_iters): + # Sample candidates + samples = [] + for _ in range(num_samples): + actions = (mu + sigma * torch.randn_like(mu)).clamp(a_min, a_max) + samples.append(actions) + + # Evaluate (rollout) + returns = [] + for actions in samples: + s_temp = s + ret = 0 + for t, a in enumerate(actions): + s_temp = model(s_temp, a) + r = reward_fn(s_temp, a) + ret += (0.99 ** t) * r + ret += (0.99 ** k) * value_fn(s_temp) + returns.append(ret) + + # Keep elite (top 10%) + returns = torch.tensor(returns) + elite_idx = torch.topk(returns, int(num_samples * 0.1))[1] + elite_actions = [samples[i] for i in elite_idx] + + # Update distribution + elite = torch.stack(elite_actions) # (elite_size, k, action_dim) + mu = elite.mean(dim=0) + sigma = elite.std(dim=0) + 0.01 # Add small constant for stability + + return mu[0] # Return first action of best sequence +``` + +**Comparison to Random Shooting**: + +- Random: Simple, parallelizable, needs many samples +- CEM: Iterative refinement, fewer samples, more compute per sample + + +### Shooting Methods: iLQR-Like Planning + +**Idea**: Linearize dynamics, solve quadratic problem. + +``` +For simple quadratic cost, can find optimal action analytically +Uses: Dynamics Jacobian, Reward Hessian +``` + +**Simplified Version** (iterative refinement): + +```python +def ilqr_like_plan(s, model, reward_fn, value_fn, k=5): + """Iterative refinement of action sequence""" + actions = torch.randn(k, action_dim) # Initialize + + for iteration in range(10): + # Forward pass: evaluate trajectory + s_traj = [s] + for t, a in enumerate(actions): + s_next = model(s_traj[-1], a) + s_traj.append(s_next) + + # Backward pass: compute gradients + returns = 0 + for t in range(k - 1, -1, -1): + r = reward_fn(s_traj[t], actions[t]) + returns = r + 0.99 * returns + + # Gradient w.r.t. 
action + grad = torch.autograd.grad(returns, actions[t], retain_graph=True)[0] + + # Update action (gradient ascent) + actions[t] += 0.01 * grad + + # Clip actions + actions = actions.clamp(a_min, a_max) + + return actions[0] +``` + +**When to Use**: + +- Continuous action space (not discrete) +- Differentiable model (neural network) +- Need fast planning (compute-constrained) + + +## Part 14: When NOT to Use Model-Based RL + +### Red Flags for Model-Based (Use Model-Free Instead) + +**Flag 1: Perfect Simulator Available** + +``` +Example: Mujoco, Unity, Atari emulator +Benefit: Unlimited free samples +Model-based cost: Training model + planning +Model-free benefit: Just train policy (simpler) +``` + +**Flag 2: Task Very Simple** + +``` +Cartpole, MountainCar (horizon < 50) +Benefit of planning: Minimal (too short) +Cost: Model training +Model-free wins +``` + +**Flag 3: Compute Limited, Samples Abundant** + +``` +Example: Atari (free samples from emulator) +Model-based: 30 hours train + plan +Model-free: 5 hours train +Model-free wins on compute +``` + +**Flag 4: Stochastic Environment (High Noise)** + +``` +Example: Dice rolling, random collisions +Model must predict distribution (hard) +Model-free: Just stores Q-values (simpler) +``` + +**Flag 5: Evaluation Metric is Compute Time** + +``` +Model-based sample efficient but compute-expensive +Model-free faster on wall-clock time +Choose based on metric +``` + + +## Part 15: Model-Based + Model-Free Hybrid Approaches + +### When Both Complement Each Other + +**Idea**: Use model-based for data augmentation, model-free for policy. + +**Architecture**: + +``` +Phase 1: Collect real data (model-free exploration) +Phase 2: Train model +Phase 3: Augment data (model-based imagined rollouts) +Phase 4: Train policy on mixed data (model-free algorithm) +``` + +**MBPO Example**: + +- Model-free: SAC (learns Q and policy) +- Model-based: Short rollouts for data augmentation +- Hybrid: Best of both + +**Other Hybrids**: + +1. **Model for Initialization**: + + ``` + Train model-based policy → Initialize model-free policy + Fine-tune with model-free (if needed) + ``` + +2. **Model for Curriculum**: + + ``` + Model predicts difficulty → Curriculum learning + Easy → Hard task progression + ``` + +3. **Model for Exploration Bonus**: + + ``` + Model uncertainty → Exploration bonus + Curious about uncertain states + Combines model-based discovery + policy learning + ``` + + +## Part 16: Common Questions and Answers + +### Q1: Should I train one model or ensemble? + +**A**: Ensemble (4-7 models) provides uncertainty estimates. + +- Single model: Fast training, overconfident +- Ensemble: Disagreement detects out-of-distribution states + +For production: Ensemble recommended. + + +### Q2: How long should rollouts be? + +**A**: k=5-10 for most tasks. + +- Shorter (k=1-3): Very safe, but minimal planning +- Medium (k=5-10): MBPO default, good tradeoff +- Longer (k=20+): Error compounds, avoid + +Rule of thumb: k = task_horizon / 10 + + +### Q3: When should I retrain the model? + +**A**: Every N environment steps or when validation loss increases. + +- MBPO: Every 1000 steps +- Dreamer: Every episode +- Dyna-Q: Every 10-100 steps + +Monitor validation performance. + + +### Q4: Model-based or model-free for my problem? + +**A**: Decision tree: + +1. Are real samples expensive? → Model-based +2. Do I have perfect simulator? → Model-free +3. Is task very complex (high-dim)? → Model-based (Dreamer) +4. Is compute limited? → Model-free +5. 
Default → Model-free (simpler, proven) + + +### Q5: How do I know if model is good? + +**A**: Metrics: + +1. **Validation MSE**: Low on hold-out test set +2. **Rollout Accuracy**: Predict 10-step trajectory, compare to real +3. **Policy Performance**: Does planning with model improve policy? +4. **Ensemble Disagreement**: Should be low in training dist, high outside + + +## Part 17: Conclusion and Recommendations + +### Summary of Key Concepts + +**1. World Models**: + +- Learn p(s_{t+1} | s_t, a_t) from data +- Pixel vs latent space (latent better for high-dim) +- Deterministic vs stochastic + +**2. Planning**: + +- MPC: Optimize actions at each step +- Shooting: Sample trajectories +- CEM: Iterative refinement +- Short rollouts (k=5-10) + value bootstrap + +**3. Algorithms**: + +- **Dyna-Q**: Real + imagined transitions +- **MBPO**: Short rollouts + policy optimization +- **Dreamer**: Latent imagination + policy learning + +**4. Error Handling**: + +- Ensemble for uncertainty +- Early stopping on disagreement +- Distribution shift via retraining +- Value bootstrapping for tail uncertainty + +**5. When to Use**: + +- Real samples expensive → Model-based +- Compute cheap → Model-free +- Multi-task → Model-based (reuse) +- Offline RL → Model-based (planning from fixed data) + +### Best Practices + +1. **Start simple**: Model-free first, model-based only if justified +2. **Use ensemble**: 4-7 models, not single +3. **Keep rollouts short**: k=5-10, not 50 +4. **Retrain frequently**: Monitor performance +5. **Validate carefully**: Hold-out test set, policy performance +6. **Understand your domain**: Real samples expensive? Complex? Sparse reward? + +### Next Steps + +After this skill: + +- **Implementation**: value-based-methods, policy-gradient-methods, actor-critic-methods +- **Advanced**: offline-RL (planning from fixed data), curiosity-driven (exploration via model), sim-to-real (domain randomization) +- **Evaluation**: rl-evaluation (proper benchmarking, statistics) + + +**Congratulations!** You now understand model-based RL from foundations through implementation. + +You can: + +- Implement Dyna-Q for discrete control +- Implement MBPO for continuous control +- Handle model errors appropriately +- Choose the right algorithm for your problem +- Debug model-based learning issues +- Design robust world models + +**Key insight**: Model-based RL trades sample complexity for model error. Success requires short rollouts, value bootstrapping, proper error handling, and appropriate algorithm selection. + +Go build something amazing! 
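
As a compact reference, here is a minimal sketch of the pattern this skill keeps returning to: a k-step imagined rollout with a value bootstrap at the horizon. The `policy`, `model`, `reward_fn`, and `value_fn` callables are hypothetical placeholders for your learned components, not any specific library API:

```python
def imagined_return(s, policy, model, reward_fn, value_fn, k=5, gamma=0.99):
    """k-step imagined return: short model rollout, then trust the critic.

    All arguments are placeholder callables for this sketch:
      policy(s) -> a, model(s, a) -> s_next, reward_fn(s, a) -> r, value_fn(s) -> V(s)
    """
    ret, discount = 0.0, 1.0
    for _ in range(k):
        a = policy(s)
        ret += discount * reward_fn(s, a)   # reward for taking a in s
        s = model(s, a)                     # imagined next state
        discount *= gamma
    ret += discount * value_fn(s)           # bootstrap with V(s_k) beyond horizon k
    return ret
```

Keeping k small and ending on `value_fn(s)` is exactly the "short rollouts + value bootstrap" recipe summarized above: the model is trusted for a few steps, the critic for the rest.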
diff --git a/skills/using-deep-rl/multi-agent-rl.md b/skills/using-deep-rl/multi-agent-rl.md new file mode 100644 index 0000000..ab1efc1 --- /dev/null +++ b/skills/using-deep-rl/multi-agent-rl.md @@ -0,0 +1,1752 @@ + +# Multi-Agent Reinforcement Learning + +## When to Use This Skill + +Invoke this skill when you encounter: + +- **Multiple Learners**: 2+ agents learning simultaneously in shared environment +- **Coordination Problem**: Agents must coordinate to achieve goals +- **Non-Stationarity**: Other agents changing policies during training +- **CTDE Implementation**: Separating centralized training from decentralized execution +- **Value Factorization**: Credit assignment in cooperative multi-agent settings +- **QMIX Algorithm**: Learning cooperative Q-values with value factorization +- **MADDPG**: Multi-agent actor-critic with centralized critics +- **Communication**: Agents learning to communicate to improve coordination +- **Team Reward Ambiguity**: How to split team reward fairly among agents +- **Cooperative vs Competitive**: Designing reward structure for multi-agent problem +- **Non-Stationarity Handling**: Dealing with other agents' policy changes +- **When Multi-Agent RL Needed**: Deciding if problem requires MARL vs single-agent + +**This skill teaches learning from multiple simultaneous agents with coordination challenges.** + +Do NOT use this skill for: + +- Single-agent RL (use rl-foundations, value-based-methods, policy-gradient-methods) +- Supervised multi-task learning (that's supervised learning) +- Simple parallel independent tasks (use single-agent RL in parallel) +- Pure game theory without learning (use game theory frameworks) + +## Core Principle + +**Multi-agent RL learns coordinated policies for multiple agents in shared environment, solving the fundamental problem that environment non-stationarity from other agents' learning breaks standard RL convergence guarantees.** + +The core insight: When other agents improve their policies, the environment changes. Your value estimates computed assuming other agents play old policy become wrong when they play new policy. + +``` +Single-Agent RL: + 1. Agent learns policy π + 2. Environment is fixed + 3. Agent value estimates Q(s,a) stable + 4. Algorithm converges to optimal policy + +Multi-Agent RL: + 1. Agent 1 learns policy π_1 + 2. Agent 2 also learning, changing π_2 + 3. Environment from Agent 1 perspective is non-stationary + 4. Agent 1's value estimates invalid when Agent 2 improves + 5. Standard convergence guarantees broken + 6. Need special algorithms: QMIX, MADDPG, communication + +Without addressing non-stationarity, multi-agent learning is unstable. 
+``` + +**Without understanding multi-agent problem structure and non-stationarity, you'll implement algorithms that fail to coordinate, suffer credit assignment disasters, or waste effort on agent conflicts instead of collaboration.** + + +## Part 1: Multi-Agent RL Fundamentals + +### Why Multi-Agent RL Differs From Single-Agent + +**Standard RL Assumption (Single-Agent)**: + +- You have one agent +- Environment dynamics and reward function are fixed +- Agent's actions don't change environment structure +- Goal: Learn policy that maximizes expected return + +**Multi-Agent RL Reality**: + +- Multiple agents act in shared environment +- Each agent learns simultaneously +- When Agent 1 improves, Agent 2 sees changed environment +- Reward depends on all agents' actions: R = R(a_1, a_2, ..., a_n) +- Non-stationarity: other agents' policies change constantly +- Convergence undefined (what is "optimal" when others adapt?) + +### Problem Types: Cooperative, Competitive, Mixed + +**Cooperative Multi-Agent Problem**: + +``` +Definition: All agents share same objective +Reward: R_team(a_1, a_2, ..., a_n) = same for all agents + +Example - Robot Team Assembly: + - All robots get same team reward + - +100 if assembly succeeds + - 0 if assembly fails + - All robots benefit from success equally + +Characteristic: + - Agents don't conflict on goals + - Challenge: Credit assignment (who deserves credit?) + - Solution: Value factorization (QMIX, QPLEX) + +Key Insight: + Cooperative doesn't mean agents see each other! + - Agents might have partial/no observation of others + - Still must coordinate for team success + - Factorization enables coordination without observation +``` + +**Competitive Multi-Agent Problem**: + +``` +Definition: Agents have opposite objectives (zero-sum) +Reward: R_i(a_1, ..., a_n) = -R_j(a_1, ..., a_n) for i≠j + +Example - Chess, Poker, Soccer: + - Agent 1 tries to win + - Agent 2 tries to win + - One's gain is other's loss + - R_1 + R_2 = 0 (zero-sum) + +Characteristic: + - Agents are adversarial + - Challenge: Computing best response to opponent + - Solution: Nash equilibrium (MADDPG, self-play) + +Key Insight: + In competitive games, agents must predict opponent strategies. + - Agent 1 assumes Agent 2 plays best response + - Agent 2 assumes Agent 1 plays best response + - Nash equilibrium = mutual best response + - No agent can improve unilaterally +``` + +**Mixed Multi-Agent Problem**: + +``` +Definition: Some cooperation, some competition +Reward: R_i(a_1, ..., a_n) contains both shared and individual terms + +Example - Team Soccer (3v3): + - Blue team agents cooperate for same goal + - But blue vs red is competitive + - Blue agent reward: + R_i = +10 if blue scores, -10 if red scores (team-based) + + 1 if blue_i scores goal (individual bonus) + +Characteristic: + - Agents cooperate with teammates + - Agents compete with opponents + - Challenge: Balancing cooperation and competition + - Solution: Hybrid approaches using both cooperative and competitive algorithms + +Key Insight: + Mixed scenarios are most common in practice. 
+ - Robot teams: cooperate internally, compete for resources + - Trading: multiple firms (cooperate via regulations, compete for profit) + - Multiplayer games: team-based (cooperate with allies, compete with enemies) +``` + +### Non-Stationarity: The Core Challenge + +**What is Non-Stationarity?** + +``` +Stationarity: Environment dynamics P(s'|s,a) and rewards R(s,a) are fixed +Non-Stationarity: Dynamics/rewards change over time + +In multi-agent RL: + Environment from Agent 1's perspective: + P(s'_1 | s_1, a_1, a_2(t), a_3(t), ...) + + If other agents' policies change: + π_2(t) ≠ π_2(t+1) + + Then transition dynamics change: + P(s'_1 | s_1, a_1, a_2(t)) ≠ P(s'_1 | s_1, a_1, a_2(t+1)) + + Environment is non-stationary! +``` + +**Why Non-Stationarity Breaks Standard RL**: + +```python +# Single-agent Q-learning assumes: +# Environment is fixed during learning +# Q-values converge because bellman expectation is fixed point + +Q[s,a] ← Q[s,a] + α(r + γ max_a' Q[s',a'] - Q[s,a]) + +# In multi-agent with non-stationarity: +# Other agents improve their policies +# Max action a' depends on other agents' policies +# When other agents improve, max action changes +# Q-values chase moving target +# No convergence guarantee +``` + +**Impact on Learning**: + +``` +Scenario: Two agents learning to navigate +Agent 1 learns: "If Agent 2 goes left, I go right" +Agent 1 builds value estimates based on this assumption + +Agent 2 improves: "Actually, going right is better" +Now Agent 2 goes right (not left) +Agent 1's assumptions invalid! +Agent 1's value estimates become wrong +Agent 1 must relearn + +Agent 1 tries new path based on new estimates +Agent 2 sees Agent 1's change and adapts +Agent 2's estimates become wrong + +Result: Chaotic learning, no convergence +``` + + +## Part 2: Centralized Training, Decentralized Execution (CTDE) + +### CTDE Paradigm + +**Key Idea**: Use centralized information during training, decentralized information during execution. + +``` +Training Phase (Centralized): + - Trainer observes: o_1, o_2, ..., o_n (all agents' observations) + - Trainer observes: a_1, a_2, ..., a_n (all agents' actions) + - Trainer observes: R_team or R_1, R_2, ... (reward signals) + - Trainer can assign credit fairly + - Trainer can compute global value functions + +Execution Phase (Decentralized): + - Agent 1 observes: o_1 only + - Agent 1 executes: π_1(a_1 | o_1) + - Agent 1 doesn't need to see other agents + - Each agent is independent during rollout + - Enables scalability and robustness +``` + +**Why CTDE Solves Non-Stationarity**: + +``` +During training: + - Centralized trainer sees all information + - Can compute value Q_1(s_1, s_2, ..., s_n | a_1, a_2, ..., a_n) + - Can factor: Q_team = f(Q_1, Q_2, ..., Q_n) (QMIX) + - Can compute importance weights: who contributed most? 
+ +During execution: + - Decentralized agents only use own observations + - Policies learned during centralized training work well + - No need for other agents' observations at runtime + - Robust to other agents' changes (policy doesn't depend on their states) + +Result: + - Training leverages global information for stability + - Execution is independent and scalable + - Solves non-stationarity via centralized credit assignment +``` + +### CTDE in Practice + +**Centralized Information Used in Training**: + +```python +# During training, compute global value function +# Inputs: observations and actions of ALL agents +def compute_value_ctde(obs_1, obs_2, obs_3, act_1, act_2, act_3): + # See everyone's observations + global_state = combine(obs_1, obs_2, obs_3) + + # See everyone's actions + joint_action = (act_1, act_2, act_3) + + # Compute shared value with all information + Q_shared = centralized_q_network(global_state, joint_action) + + # Factor into individual Q-values (QMIX) + Q_1 = q_network_1(obs_1, act_1) + Q_2 = q_network_2(obs_2, act_2) + Q_3 = q_network_3(obs_3, act_3) + + # Factorization: Q_team ≈ mixing_network(Q_1, Q_2, Q_3) + # Each agent learns its contribution via QMIX loss + return Q_shared, (Q_1, Q_2, Q_3) +``` + +**Decentralized Execution**: + +```python +# During execution, use only own observation +def execute_policy(agent_id, own_observation): + # Agent only sees and uses own obs + action = policy_network(own_observation) + + # No access to other agents' observations + # Doesn't need other agents' actions + # Purely decentralized execution + return action + +# All agents execute in parallel: +# Agent 1: o_1 → a_1 (decentralized) +# Agent 2: o_2 → a_2 (decentralized) +# Agent 3: o_3 → a_3 (decentralized) +# Execution is independent! +``` + + +## Part 3: QMIX - Value Factorization for Cooperative Teams + +### QMIX: The Core Insight + +**Problem**: In cooperative teams, how do you assign credit fairly? + +``` +Naive approach: Joint Q-value + Q_team(s, a_1, a_2, ..., a_n) = expected return from joint action + +Problem: Still doesn't assign individual credit + If Q_team = 100, how much did Agent 1 contribute? 
+ Agent 1 might think: "I deserve 50%" (overconfident) + But Agent 1 might deserve only 10% (others did more) + +Result: Agents learn wrong priorities +``` + +**Solution: Value Factorization (QMIX)** + +``` +Key Assumption: Monotonicity in actions + If improving Agent i's action improves team outcome, + improving Agent i's individual Q-value should help + +Mathematical Form: + Q_team(a) ≥ Q_team(a') if Agent i plays better action a_i instead of a'_i + and Agent i's Q_i(a_i) > Q_i(a'_i) + +Concrete Implementation: + Q_team(s, a_1, ..., a_n) = f(Q_1(s_1, a_1), Q_2(s_2, a_2), ..., Q_n(s_n, a_n)) + + Where: + - Q_i: Individual Q-network for agent i + - f: Monotonic mixing network (ensures monotonicity) + + Monotonicity guarantee: + If Q_1 increases, Q_team increases (if f is monotonic) +``` + +### QMIX Algorithm + +**Architecture**: + +``` +Individual Q-Networks: Mixing Network (Monotonic): +┌─────────┐ ┌──────────────────┐ +│ Agent 1 │─o_1─────────────────→│ │ +│ LSTM │ │ MLP (weights) │─→ Q_team +│ Q_1 │ │ are monotonic │ +└─────────┘ │ (ReLU blocks) │ + └──────────────────┘ +┌─────────┐ ↑ +│ Agent 2 │─o_2──────────────────────────┤ +│ LSTM │ │ +│ Q_2 │ │ +└─────────┘ │ + Hypernet: +┌─────────┐ generates weights +│ Agent 3 │─o_3────────────────────→ as function +│ LSTM │ of state +│ Q_3 │ +└─────────┘ + +Value outputs: Q_1(o_1, a_1), Q_2(o_2, a_2), Q_3(o_3, a_3) +Mixing: Q_team = mixing_network(Q_1, Q_2, Q_3, state) +``` + +**QMIX Training**: + +```python +import torch +import torch.nn as nn +from torch.optim import Adam + +class QMIXAgent: + def __init__(self, n_agents, state_dim, obs_dim, action_dim, hidden_dim=64): + self.n_agents = n_agents + + # Individual Q-networks (one per agent) + self.q_networks = nn.ModuleList([ + nn.Sequential( + nn.Linear(obs_dim + action_dim, hidden_dim), + nn.ReLU(), + nn.Linear(hidden_dim, hidden_dim), + nn.ReLU(), + nn.Linear(hidden_dim, 1) # Q-value for this action + ) + for _ in range(n_agents) + ]) + + # Mixing network: takes individual Q-values and produces joint Q + self.mixing_network = nn.Sequential( + nn.Linear(n_agents + state_dim, hidden_dim), + nn.ReLU(), + nn.Linear(hidden_dim, hidden_dim), + nn.ReLU(), + nn.Linear(hidden_dim, 1) + ) + + # Hypernet: generates mixing network weights (ensuring monotonicity) + self.hypernet = nn.Sequential( + nn.Linear(state_dim, hidden_dim), + nn.ReLU(), + nn.Linear(hidden_dim, hidden_dim * (n_agents + state_dim)) + ) + + self.optimizer = Adam( + list(self.q_networks.parameters()) + + list(self.mixing_network.parameters()) + + list(self.hypernet.parameters()), + lr=5e-4 + ) + + self.discount = 0.99 + self.target_update_rate = 0.001 + self.epsilon = 0.05 + + # Target networks (soft update) + self._init_target_networks() + + def _init_target_networks(self): + """Create target networks for stable learning.""" + self.target_q_networks = nn.ModuleList([ + nn.Sequential(*[nn.Linear(*p.shape[::-1]) for p in q.parameters()]) + for q in self.q_networks + ]) + self.target_mixing_network = nn.Sequential( + *[nn.Linear(*p.shape[::-1]) for p in self.mixing_network.parameters()] + ) + + def compute_individual_q_values(self, observations, actions): + """ + Compute Q-values for each agent given their observation and action. 
+ + Args: + observations: list of n_agents observations (each [batch_size, obs_dim]) + actions: list of n_agents actions (each [batch_size, action_dim]) + + Returns: + q_values: tensor [batch_size, n_agents] + """ + q_values = [] + for i, (obs, act) in enumerate(zip(observations, actions)): + # Concatenate observation and action + q_input = torch.cat([obs, act], dim=-1) + q_i = self.q_networks[i](q_input) + q_values.append(q_i) + + return torch.cat(q_values, dim=-1) # [batch_size, n_agents] + + def compute_joint_q_value(self, q_values, state): + """ + Mix individual Q-values into joint Q-value using monotonic mixing network. + + Args: + q_values: individual Q-values [batch_size, n_agents] + state: global state [batch_size, state_dim] + + Returns: + q_joint: joint Q-value [batch_size, 1] + """ + # Ensure monotonicity by using weight constraints + # Mixing network learns to combine Q-values + q_joint = self.mixing_network(torch.cat([q_values, state], dim=-1)) + return q_joint + + def train_step(self, batch, state_batch): + """ + One QMIX training step. + + Batch contains: + observations: list[n_agents] of [batch_size, obs_dim] + actions: list[n_agents] of [batch_size, action_dim] + rewards: [batch_size] (shared team reward) + next_observations: list[n_agents] of [batch_size, obs_dim] + dones: [batch_size] + """ + observations, actions, rewards, next_observations, dones = batch + batch_size = observations[0].shape[0] + + # Compute current Q-values + q_values = self.compute_individual_q_values(observations, actions) + q_joint = self.compute_joint_q_value(q_values, state_batch) + + # Compute target Q-values + with torch.no_grad(): + # Get next Q-values for all possible joint actions (in practice, greedy) + next_q_values = self.compute_individual_q_values( + next_observations, + [torch.zeros_like(a) for a in actions] # Best actions (simplified) + ) + + # Mix next Q-values + next_q_joint = self.compute_joint_q_value(next_q_values, state_batch) + + # TD target: team gets shared reward + td_target = rewards.unsqueeze(-1) + ( + 1 - dones.unsqueeze(-1) + ) * self.discount * next_q_joint + + # QMIX loss + qmix_loss = ((q_joint - td_target) ** 2).mean() + + self.optimizer.zero_grad() + qmix_loss.backward() + self.optimizer.step() + + # Soft update target networks + self._soft_update_targets() + + return {'qmix_loss': qmix_loss.item()} + + def _soft_update_targets(self): + """Soft update target networks.""" + for target, main in zip(self.target_q_networks, self.q_networks): + for target_param, main_param in zip(target.parameters(), main.parameters()): + target_param.data.copy_( + self.target_update_rate * main_param.data + + (1 - self.target_update_rate) * target_param.data + ) + + def select_actions(self, observations): + """ + Greedy action selection (decentralized execution). + Each agent selects action independently. + """ + actions = [] + for i, obs in enumerate(observations): + with torch.no_grad(): + # Agent i evaluates all possible actions + best_action = None + best_q = -float('inf') + + for action in range(self.action_dim): + q_input = torch.cat([obs, one_hot(action, self.action_dim)]) + q_val = self.q_networks[i](q_input).item() + + if q_val > best_q: + best_q = q_val + best_action = action + + # Epsilon-greedy + if torch.rand(1).item() < self.epsilon: + best_action = torch.randint(0, self.action_dim, (1,)).item() + + actions.append(best_action) + + return actions +``` + +**QMIX Key Concepts**: + +1. **Monotonicity**: If agent improves action, team value improves +2. 
**Value Factorization**: Q_team = f(Q_1, Q_2, ..., Q_n) +3. **Decentralized Execution**: Each agent uses only own observation +4. **Centralized Training**: Trainer sees all Q-values and state + +**When QMIX Works Well**: + +- Fully observable or partially observable cooperative teams +- Sparse communication needs +- Fixed team membership +- Shared reward structure + +**QMIX Limitations**: + +- Assumes monotonicity (not all cooperative games satisfy this) +- Doesn't handle explicit communication +- Doesn't learn agent roles dynamically + + +## Part 4: MADDPG - Multi-Agent Actor-Critic + +### MADDPG: For Competitive and Mixed Scenarios + +**Core Idea**: Actor-critic but with centralized critic during training. + +``` +DDPG (single-agent): + - Actor π(a|s) learns policy + - Critic Q(s,a) estimates value + - Critic trains actor via policy gradient + +MADDPG (multi-agent): + - Each agent has actor π_i(a_i|o_i) + - Centralized critic Q(s, a_1, ..., a_n) sees all agents + - During training: use centralized critic for learning + - During execution: each agent uses only own actor +``` + +**MADDPG Algorithm**: + +```python +class MADDPGAgent: + def __init__(self, agent_id, n_agents, obs_dim, action_dim, state_dim, hidden_dim=256): + self.agent_id = agent_id + self.n_agents = n_agents + self.action_dim = action_dim + + # Actor: learns decentralized policy π_i(a_i|o_i) + self.actor = nn.Sequential( + nn.Linear(obs_dim, hidden_dim), + nn.ReLU(), + nn.Linear(hidden_dim, hidden_dim), + nn.ReLU(), + nn.Linear(hidden_dim, action_dim), + nn.Tanh() # Continuous actions in [-1, 1] + ) + + # Critic: centralized value Q(s, a_1, ..., a_n) + # Input: global state + all agents' actions + self.critic = nn.Sequential( + nn.Linear(state_dim + n_agents * action_dim, hidden_dim), + nn.ReLU(), + nn.Linear(hidden_dim, hidden_dim), + nn.ReLU(), + nn.Linear(hidden_dim, 1) # Single value output + ) + + # Target networks for stability + self.target_actor = copy.deepcopy(self.actor) + self.target_critic = copy.deepcopy(self.critic) + + self.actor_optimizer = Adam(self.actor.parameters(), lr=1e-4) + self.critic_optimizer = Adam(self.critic.parameters(), lr=1e-3) + + self.discount = 0.99 + self.tau = 0.01 # Soft update rate + + def train_step(self, batch): + """ + MADDPG training step. + + Batch contains: + observations: list[n_agents] of [batch_size, obs_dim] + actions: list[n_agents] of [batch_size, action_dim] + rewards: [batch_size] (agent-specific reward!) 
+ next_observations: list[n_agents] of [batch_size, obs_dim] + global_state: [batch_size, state_dim] + next_global_state: [batch_size, state_dim] + dones: [batch_size] + """ + observations, actions, rewards, next_observations, \ + global_state, next_global_state, dones = batch + + batch_size = observations[0].shape[0] + agent_obs = observations[self.agent_id] + agent_action = actions[self.agent_id] + agent_reward = rewards # Agent-specific reward + + # Step 1: Critic Update (centralized) + with torch.no_grad(): + # Compute next actions using target actors + next_actions = [] + for i, next_obs in enumerate(next_observations): + if i == self.agent_id: + next_a = self.target_actor(next_obs) + else: + # Use stored target actors from other agents + next_a = other_agents_target_actors[i](next_obs) + next_actions.append(next_a) + + # Concatenate all next actions + next_actions_cat = torch.cat(next_actions, dim=-1) + + # Compute next value (centralized critic) + next_q = self.target_critic( + torch.cat([next_global_state, next_actions_cat], dim=-1) + ) + + # TD target + td_target = agent_reward.unsqueeze(-1) + ( + 1 - dones.unsqueeze(-1) + ) * self.discount * next_q + + # Compute current Q-value + current_actions_cat = torch.cat(actions, dim=-1) + current_q = self.critic( + torch.cat([global_state, current_actions_cat], dim=-1) + ) + + # Critic loss + critic_loss = ((current_q - td_target) ** 2).mean() + + self.critic_optimizer.zero_grad() + critic_loss.backward() + self.critic_optimizer.step() + + # Step 2: Actor Update (decentralized policy improvement) + # Actor only uses own observation + policy_actions = [] + for i, obs in enumerate(observations): + if i == self.agent_id: + # Use current actor for this agent + action_i = self.actor(obs) + else: + # Use current actors of other agents + action_i = other_agents_actors[i](obs) + policy_actions.append(action_i) + + # Compute Q-value under current policy + policy_actions_cat = torch.cat(policy_actions, dim=-1) + policy_q = self.critic( + torch.cat([global_state, policy_actions_cat], dim=-1) + ) + + # Policy gradient: maximize Q-value + actor_loss = -policy_q.mean() + + self.actor_optimizer.zero_grad() + actor_loss.backward() + self.actor_optimizer.step() + + # Soft update target networks + self._soft_update_targets() + + return { + 'critic_loss': critic_loss.item(), + 'actor_loss': actor_loss.item(), + 'avg_q_value': current_q.mean().item() + } + + def _soft_update_targets(self): + """Soft update target networks toward main networks.""" + for target_param, main_param in zip( + self.target_actor.parameters(), + self.actor.parameters() + ): + target_param.data.copy_( + self.tau * main_param.data + (1 - self.tau) * target_param.data + ) + + for target_param, main_param in zip( + self.target_critic.parameters(), + self.critic.parameters() + ): + target_param.data.copy_( + self.tau * main_param.data + (1 - self.tau) * target_param.data + ) + + def select_action(self, observation): + """Decentralized action selection.""" + with torch.no_grad(): + action = self.actor(observation) + # Add exploration noise + action = action + torch.normal(0, 0.1, action.shape) + action = torch.clamp(action, -1, 1) + return action.cpu().numpy() +``` + +**MADDPG Key Properties**: + +1. **Centralized Critic**: Sees all agents' observations and actions +2. **Decentralized Actors**: Each agent uses only own observation +3. **Agent-Specific Rewards**: Each agent maximizes own reward +4. **Handles Competitive/Mixed**: Doesn't assume cooperation +5. 
**Continuous Actions**: Works well with continuous action spaces + +**When MADDPG Works Well**: + +- Competitive and mixed-motive scenarios +- Continuous action spaces +- Partial observability (agents don't see each other) +- Need for independent agent rewards + + +## Part 5: Communication in Multi-Agent Systems + +### When and Why Communication Helps + +**Problem Without Communication**: + +``` +Agents with partial observability: +Agent 1: sees position p_1, but NOT p_2 +Agent 2: sees position p_2, but NOT p_1 + +Goal: Avoid collision while moving to targets + +Without communication: + Agent 1: "I don't know where Agent 2 is" + Agent 2: "I don't know where Agent 1 is" + + Both might move toward same corridor + Collision, but agents couldn't coordinate! + +With communication: + Agent 1: broadcasts "I'm moving left" + Agent 2: receives message, moves right + No collision! +``` + +**Communication Trade-offs**: + +``` +Advantages: +- Enables coordination with partial observability +- Can solve some problems impossible without communication +- Explicit intention sharing + +Disadvantages: +- Adds complexity: agents must learn what to communicate +- High variance: messages might mislead +- Computational overhead: processing all messages +- Communication bandwidth limited in real systems + +When to use communication: +- Partial observability prevents coordination +- Explicit roles (e.g., one agent is "scout") +- Limited field of view, agents are out of sight +- Agents benefit from sharing intentions + +When NOT to use communication: +- Full observability (agents see everything) +- Simple coordination (value factorization sufficient) +- Communication is unreliable +``` + +### CommNet: Learning Communication + +**Idea**: Agents learn to send and receive messages to improve coordination. + +``` +Architecture: +1. Each agent processes own observation: f_i(o_i) → hidden state h_i +2. Agent broadcasts hidden state as "message" +3. Agent receives messages from neighbors +4. Agent aggregates messages: Σ_j M(h_j) (attention mechanism) +5. Agent processes aggregated information: policy π(a_i | h_i + aggregated) + +Key: Agents learn what information to broadcast in h_i + Receiving agents learn what messages are useful +``` + +**Simple Communication Example**: + +```python +class CommNetAgent: + def __init__(self, obs_dim, action_dim, hidden_dim=64): + # Encoding network: observation → hidden message + self.encoder = nn.Sequential( + nn.Linear(obs_dim, hidden_dim), + nn.ReLU(), + nn.Linear(hidden_dim, hidden_dim) # Message to broadcast + ) + + # Communication aggregation (simplified attention) + self.comm_processor = nn.Sequential( + nn.Linear(hidden_dim * 2, hidden_dim), # Own + received + nn.ReLU(), + nn.Linear(hidden_dim, hidden_dim) + ) + + # Policy network + self.policy = nn.Sequential( + nn.Linear(hidden_dim, hidden_dim), + nn.ReLU(), + nn.Linear(hidden_dim, action_dim) + ) + + def compute_message(self, observation): + """Generate message to broadcast to other agents.""" + return self.encoder(observation) + + def forward(self, observation, received_messages): + """ + Process observation + received messages, output action. 
+ + Args: + observation: [obs_dim] + received_messages: list of messages from neighbors + + Returns: + action: [action_dim] + """ + # Generate own message + my_message = self.encoder(observation) + + # Aggregate received messages (mean pooling) + if received_messages: + others_messages = torch.stack(received_messages).mean(dim=0) + else: + others_messages = torch.zeros_like(my_message) + + # Process aggregated communication + combined = torch.cat([my_message, others_messages], dim=-1) + hidden = self.comm_processor(combined) + + # Select action + action = self.policy(hidden) + return action, my_message +``` + +**Communication Pitfall**: Agents learn to send misleading messages! + +```python +# Without careful design, agents learn deceptive communication: +Agent 1 learns: "If I broadcast 'I'm going right', Agent 2 will go left" +Agent 1 broadcasts: "Going right" (but actually goes left) +Agent 2 goes right as expected (collision!) +Agent 1 gets higher reward (its deception worked) + +Solution: Design communication carefully +- Verify agents to be truthful (implicit in cooperative setting) +- Use communication only when beneficial +- Monitor emergent communication protocols +``` + + +## Part 6: Credit Assignment in Cooperative Teams + +### Individual Reward vs Team Reward + +**Problem**: + +``` +Scenario: 3-robot assembly team +Team reward: +100 if assembly succeeds, 0 if fails + +Individual Reward Design: +Option 1 - Split equally: each robot gets +33.33 + Problem: Robot 3 (insignificant) gets same credit as Robot 1 (crucial) + +Option 2 - Use agent contribution: + Robot 1 (held piece): +60 + Robot 2 (guided insertion): +25 + Robot 3 (steadied base): +15 + Problem: How to compute contributions? (requires complex analysis) + +Option 3 - Use value factorization (QMIX): + Team value = mixing_network(Q_1, Q_2, Q_3) + Each robot learns its Q-value + QMIX learns to weight Q-values by importance + Result: Fair credit assignment via factorization +``` + +**QMIX Credit Assignment Mechanism**: + +``` +Training: + Observe: robot_1 does action a_1, gets q_1 + robot_2 does action a_2, gets q_2 + robot_3 does action a_3, gets q_3 + Team gets reward r_team + + Factorize: r_team ≈ mixing_network(q_1, q_2, q_3) + = w_1 * q_1 + w_2 * q_2 + w_3 * q_3 + bias + + Learn weights w_i via mixing network + + If Robot 1 is crucial: + mixing network learns w_1 > w_2, w_3 + Robot 1 gets larger credit (w_1 * q_1 > others) + + If Robot 3 is redundant: + mixing network learns w_3 ≈ 0 + Robot 3 gets small credit + +Result: Each robot learns fair contribution +``` + +**Value Decomposition Pitfall**: Agents can game the factorization! + +``` +Example: Learned mixing network w = [0.9, 0.05, 0.05] + +Agent 1 learns: "I must maximize q_1 (it has weight 0.9)" +Agent 1 tries: action that maximizes own q_1 +Problem: q_1 computed from own reward signal (myopic) + might not actually help team! + +Solution: Use proper credit assignment metrics +- Shapley values: game theory approach to credit +- Counterfactual reasoning: what if agent didn't act? +- Implicit credit (QMIX): let factorization emergently learn +``` + + +## Part 7: Common Multi-Agent RL Failure Modes + +### Failure Mode 1: Non-Stationarity Instability + +**Symptom**: Learning curves erratic, no convergence. + +```python +# Problem scenario: +for episode in range(1000): + # Agent 1 learns + episode_reward_1 = [] + for t in range(steps): + a_1 = agent_1.select_action(o_1) + a_2 = agent_2.select_action(o_2) # Using old policy! 
+ r, o'_1, o'_2 = env.step(a_1, a_2) + agent_1.update(a_1, r, o'_1) + + # Agent 2 improves (environment changes for Agent 1!) + episode_reward_2 = [] + for t in range(steps): + a_1 = agent_1.select_action(o_1) # OLD VALUE ESTIMATES + a_2 = agent_2.select_action(o_2) # NEW POLICY (Agent 2 improved) + r, o'_1, o'_2 = env.step(a_1, a_2) + agent_2.update(a_2, r, o'_2) + +Result: Agent 1's Q-values become invalid when Agent 2 improves + Learning is unstable, doesn't converge +``` + +**Solution**: Use CTDE or opponent modeling + +```python +# CTDE Approach: +# During training, use global information to stabilize +trainer.observe(o_1, a_1, o_2, a_2, r) +# Trainer sees both agents' actions, can compute stable target + +# During execution: +agent_1.execute(o_1 only) # Decentralized +agent_2.execute(o_2 only) # Decentralized +``` + +### Failure Mode 2: Reward Ambiguity + +**Symptom**: Agents don't improve, stuck at local optima. + +```python +# Problem: Multi-agent team, shared reward +total_reward = 50 + +# Distribution: who gets what? +# Agent 1 thinks: "I deserve 50" (overconfident) +# Agent 2 thinks: "I deserve 50" (overconfident) +# Agent 3 thinks: "I deserve 50" (overconfident) + +# Each agent overestimates importance +# Each agent learns selfishly (internal conflict) +# Team coordination breaks + +Result: Team performance worse than if agents cooperated +``` + +**Solution**: Use value factorization + +```python +# QMIX learns fair decomposition +q_1, q_2, q_3 = compute_individual_values(a_1, a_2, a_3) +team_reward = mixing_network(q_1, q_2, q_3) + +# Mixing network learns importance +# If Agent 2 crucial: weight_2 > weight_1, weight_3 +# Training adjusts weights based on who actually helped + +Result: Fair credit, agents coordinate +``` + +### Failure Mode 3: Algorithm-Reward Mismatch + +**Symptom**: Learning fails in specific problem types (cooperative/competitive). + +```python +# Problem: Using QMIX (cooperative) in competitive setting +# Competitive game (agents have opposite rewards) + +# QMIX assumes: shared reward (monotonicity works) +# But in competitive: +# Q_1 high means Agent 1 winning +# Q_2 high means Agent 2 winning (opposite!) +# QMIX mixing doesn't make sense +# Convergence fails + +# Solution: Use MADDPG (handles competitive) +# MADDPG doesn't assume monotonicity +# Works with individual rewards +# Handles competition naturally +``` + + +## Part 8: When to Use Multi-Agent RL + +### Problem Characteristics for MARL + +**Use MARL when**: + +``` +1. Multiple simultaneous learners + - Problem has 2+ agents learning + - NOT just parallel tasks (that's single-agent x N) + +2. Shared/interdependent environment + - Agents' actions affect each other + - One agent's action impacts other agent's rewards + - True interaction (not independent MDPs) + +3. Coordination is beneficial + - Agents can improve by coordinating + - Alternative: agents could act independently (inefficient) + +4. Non-trivial communication/credit + - Agents need to coordinate or assign credit + - NOT trivial to decompose into independent subproblems +``` + +**Use Single-Agent RL when**: + +``` +1. Single learning agent (others are environment) + - Example: one RL agent vs static rules-based opponents + - Environment includes other agents, but they're not learning + +2. Independent parallel tasks + - Example: 10 robots, each with own goal, no interaction + - Use single-agent RL x 10 (faster, simpler) + +3. 
Fully decomposable problems + - Example: multi-robot path planning (can use single-agent per robot) + - Problem decomposes into independent subproblems + +4. Scalability critical + - Single-agent RL scales to huge teams + - MARL harder to scale (centralized training bottleneck) +``` + +### Decision Tree + +``` +Problem: Multiple agents learning together? + NO → Use single-agent RL + YES ↓ + +Problem: Agents' rewards interdependent? + NO → Use single-agent RL x N (parallel) + YES ↓ + +Problem: Agents must coordinate? + NO → Use independent learning (but expect instability) + YES ↓ + +Problem structure: + COOPERATIVE → Use QMIX, MAPPO, QPLEX + COMPETITIVE → Use MADDPG, self-play + MIXED → Use hybrid (cooperative + competitive algorithms) +``` + + +## Part 9: Opponent Modeling in Competitive Settings + +### Why Model Opponents? + +**Problem Without Opponent Modeling**: + +``` +Agent 1 (using MADDPG) learns: + "Move right gives Q=50" + +But assumption: Agent 2 plays policy π_2 + +When Agent 2 improves to π'_2: + "Move right gives Q=20" (because Agent 2 blocks that path) + +Agent 1's Q-value estimates become stale! +Environment has changed (opponent improved) +``` + +**Solution: Opponent Modeling** + +```python +class OpponentModelingAgent: + def __init__(self, agent_id, n_agents, obs_dim, action_dim): + self.agent_id = agent_id + + # Own actor and critic + self.actor = self._build_actor(obs_dim, action_dim) + self.critic = self._build_critic() + + # Model opponent policies (for agents we compete against) + self.opponent_models = { + i: self._build_opponent_model() for i in range(n_agents) if i != agent_id + } + + def _build_opponent_model(self): + """Model what opponent will do given state.""" + return nn.Sequential( + nn.Linear(64, 128), + nn.ReLU(), + nn.Linear(128, 128), + nn.ReLU(), + nn.Linear(128, self.action_dim) + ) + + def train_step_with_opponent_modeling(self, batch): + """ + Update own policy AND opponent models. 
+ + Key insight: predict what opponent will do, + then plan against those predictions + """ + observations, actions, rewards, next_observations = batch + + # Step 1: Update opponent models (supervised) + # Predict opponent action from observation + for opponent_id, model in self.opponent_models.items(): + predicted_action = model(next_observations[opponent_id]) + actual_action = actions[opponent_id] + opponent_loss = ((predicted_action - actual_action) ** 2).mean() + # Update opponent model + optimizer.zero_grad() + opponent_loss.backward() + optimizer.step() + + # Step 2: Plan against opponent predictions + predicted_opponent_actions = { + i: model(observations[i]) + for i, model in self.opponent_models.items() + } + + # Use predictions in MADDPG update + # Critic sees: own obs + predicted opponent actions + # Actor learns: given opponent predictions, best response + + return {'opponent_loss': opponent_loss.item()} +``` + +**Opponent Modeling Trade-offs**: + +``` +Advantages: + - Accounts for opponent improvements (non-stationarity) + - Enables planning ahead + - Reduces brittleness to opponent policy changes + +Disadvantages: + - Requires learning opponent models (additional supervision) + - If opponent model is wrong, agent learns wrong policy + - Computational overhead + - Assumes opponent is predictable + +When to use: + - Competitive settings with clear opponents + - Limited number of distinct opponents + - Opponents have consistent strategies + +When NOT to use: + - Too many potential opponents + - Opponents are unpredictable + - Cooperative setting (waste of resources) +``` + + +## Part 10: Advanced: Independent Q-Learning (IQL) for Multi-Agent + +### IQL in Multi-Agent Settings + +**Idea**: Each agent learns Q-value using only own rewards and observations. + +```python +class IQLMultiAgent: + def __init__(self, agent_id, obs_dim, action_dim): + self.agent_id = agent_id + + # Q-network for this agent only + self.q_network = nn.Sequential( + nn.Linear(obs_dim + action_dim, 128), + nn.ReLU(), + nn.Linear(128, 1) + ) + + self.optimizer = Adam(self.q_network.parameters(), lr=1e-3) + + def train_step(self, batch): + """ + Independent Q-learning: each agent learns from own reward only. 
+ + Problem: Non-stationarity + - Other agents improve policies + - Environment from this agent's perspective changes + - Q-values become invalid + + Benefit: Decentralized + - No centralized training needed + - Scalable to many agents + """ + observations, actions, rewards, next_observations = batch + + # Q-value update (standard Q-learning) + with torch.no_grad(): + # Greedy next action (assume agent acts greedily) + next_q_values = [] + for action in range(self.action_dim): + q_input = torch.cat([next_observations, one_hot(action)]) + q_val = self.q_network(q_input) + next_q_values.append(q_val) + + max_next_q = torch.max(torch.stack(next_q_values), dim=0)[0] + td_target = rewards + 0.99 * max_next_q + + # Current Q-value + q_pred = self.q_network(torch.cat([observations, actions], dim=-1)) + + # TD loss + loss = ((q_pred - td_target) ** 2).mean() + + self.optimizer.zero_grad() + loss.backward() + self.optimizer.step() + + return {'loss': loss.item()} +``` + +**IQL in Multi-Agent: Pros and Cons**: + +``` +Advantages: + - Fully decentralized (scalable) + - No communication needed + - Simple implementation + - Works with partial observability + +Disadvantages: + - Non-stationarity breaks convergence + - Agents chase moving targets (other agents improving) + - No explicit coordination + - Performance often poor without CTDE + +Result: + - IQL works but is unstable in true multi-agent settings + - Better to use CTDE (QMIX, MADDPG) for stability + - IQL useful if centralized training impossible +``` + + +## Part 11: Multi-Agent Experience Replay and Batch Sampling + +### Challenges of Experience Replay in Multi-Agent + +**Problem**: + +``` +In single-agent RL: + Experience replay stores (s, a, r, s', d) + Sample uniformly from buffer + Works well (iid samples) + +In multi-agent RL: + Experience replay stores (s, a_1, a_2, ..., a_n, r, s') + But agents are non-stationary! + + Transition (s, a_1, a_2, r, s') valid only if: + - Assumptions about other agents' policies still hold + - If other agents improved, assumptions invalid + +Solution: Prioritized experience replay for multi-agent + - Prioritize transitions where agent's assumptions are likely correct + - Down-weight transitions from old policies (outdated assumptions) + - Focus on recent transitions (more relevant) +``` + +**Batch Sampling Strategy**: + +```python +class MultiAgentReplayBuffer: + def __init__(self, capacity=100000, n_agents=3): + self.buffer = deque(maxlen=capacity) + self.n_agents = n_agents + self.priority_weights = deque(maxlen=capacity) + + def add(self, transition): + """Store experience with priority.""" + # transition: (observations, actions, rewards, next_observations, dones) + self.buffer.append(transition) + + # Priority: how relevant is this to current policy? + # Recent transitions: high priority (policies haven't changed much) + # Old transitions: low priority (agents have improved, assumptions stale) + priority = self._compute_priority(transition) + self.priority_weights.append(priority) + + def _compute_priority(self, transition): + """Compute priority for multi-agent setting.""" + # Heuristic: prioritize recent transitions + # Could use TD-error (how surprised are we by this transition?) + age = len(self.buffer) # How long ago was this added? 
+ decay = 0.99 ** age # Exponential decay + return decay + + def sample(self, batch_size): + """Sample prioritized batch.""" + # Weighted sampling: high priority more likely + indices = np.random.choice( + len(self.buffer), + batch_size, + p=self.priority_weights / self.priority_weights.sum() + ) + + batch = [self.buffer[i] for i in indices] + return batch +``` + + +## Part 12: 10+ Critical Pitfalls + +1. **Treating as independent agents**: Non-stationarity breaks convergence +2. **Giving equal reward to unequal contributors**: Credit assignment fails +3. **Forgetting decentralized execution**: Agents need independent policies +4. **Communicating too much**: High variance, bandwidth waste +5. **Using cooperative algorithm in competitive game**: Convergence fails +6. **Using competitive algorithm in cooperative game**: Agents conflict +7. **Not using CTDE**: Weak coordination, brittle policies +8. **Assuming other agents will converge**: Non-stationarity = moving targets +9. **Value overestimation in team settings**: Similar to offline RL issues +10. **Forgetting opponent modeling**: In competitive settings, must predict others +11. **Communication deception**: Agents learn to mislead for short-term gain +12. **Scalability (too many agents)**: MARL doesn't scale to 100+ agents +13. **Experience replay staleness**: Old transitions assume old opponent policies +14. **Ignoring observability constraints**: Partial obs needs communication or factorization +15. **Reward structure not matching algorithm**: Cooperative/competitive mismatch + + +## Part 13: 10+ Rationalization Patterns + +Users often rationalize MARL mistakes: + +1. **"Independent agents should work"**: Doesn't understand non-stationarity +2. **"My algorithm converged to something"**: Might be local optima due to credit ambiguity +3. **"Communication improved rewards"**: Might be learned deception, not coordination +4. **"QMIX should work everywhere"**: Doesn't check problem for monotonicity +5. **"More agents = more parallelism"**: Ignores centralized training bottleneck +6. **"Rewards are subjective anyway"**: Credit assignment is objective (factorization) +7. **"I'll just add more training"**: Non-stationarity can't be fixed by more epochs +8. **"Other agents are fixed"**: But they're learning too (environment is non-stationary) +9. **"Communication bandwidth doesn't matter"**: In real systems, it does +10. **"Nash equilibrium is always stable"**: No, it's just best-response equilibrium + + +## Part 14: MAPPO - Multi-Agent Proximal Policy Optimization + +### When to Use MAPPO + +**Cooperative teams with policy gradients**: + +```python +class MAPPOAgent: + def __init__(self, agent_id, obs_dim, action_dim, hidden_dim=256): + self.agent_id = agent_id + + # Actor: policy for decentralized execution + self.actor = nn.Sequential( + nn.Linear(obs_dim, hidden_dim), + nn.ReLU(), + nn.Linear(hidden_dim, hidden_dim), + nn.ReLU(), + nn.Linear(hidden_dim, action_dim) + ) + + # Critic: centralized value function (uses global state during training) + self.critic = nn.Sequential( + nn.Linear(obs_dim, hidden_dim), + nn.ReLU(), + nn.Linear(hidden_dim, 1) + ) + + self.actor_optimizer = Adam(self.actor.parameters(), lr=3e-4) + self.critic_optimizer = Adam(self.critic.parameters(), lr=1e-3) + + def train_step_on_batch(self, observations, actions, returns, advantages): + """ + MAPPO training: advantage actor-critic with clipped policy gradient. 
+ + Key difference from DDPG: + - Policy gradient (not off-policy value) + - Centralized training (uses global returns/advantages) + - Decentralized execution (policy uses only own observation) + """ + # Actor loss (clipped PPO) + action_probs = torch.softmax(self.actor(observations), dim=-1) + action_log_probs = torch.log(action_probs.gather(-1, actions)) + + # Importance weight (in on-policy setting, = 1) + # In practice, small advantage clipping for stability + policy_loss = -(action_log_probs * advantages).mean() + + # Entropy regularization (exploration) + entropy = -(action_probs * torch.log(action_probs + 1e-8)).sum(dim=-1).mean() + actor_loss = policy_loss - 0.01 * entropy + + self.actor_optimizer.zero_grad() + actor_loss.backward() + self.actor_optimizer.step() + + # Critic loss (value estimation) + values = self.critic(observations) + critic_loss = ((values - returns) ** 2).mean() + + self.critic_optimizer.zero_grad() + critic_loss.backward() + self.critic_optimizer.step() + + return { + 'actor_loss': actor_loss.item(), + 'critic_loss': critic_loss.item(), + 'entropy': entropy.item() + } +``` + +**MAPPO vs QMIX**: + +``` +QMIX: + - Value-based (discrete actions) + - Value factorization (credit assignment) + - Works with partial observability + +MAPPO: + - Policy gradient-based + - Centralized critic (advantage estimation) + - On-policy (requires recent trajectories) + +Use MAPPO when: + - Continuous or large discrete action spaces + - On-policy learning acceptable + - Value factorization not needed (reward structure simple) + +Use QMIX when: + - Discrete actions + - Need explicit credit assignment + - Off-policy learning preferred +``` + + +## Part 15: Self-Play for Competitive Learning + +### Self-Play Mechanism + +**Problem**: Training competitive agents requires opponents. 
+ +``` +Naive approach: + - Agent 1 trains vs fixed opponent + - Problem: fixed opponent doesn't adapt + - Agent 1 learns exploitation (brittle to new opponents) + +Self-play: + - Agent 1 trains vs historical versions of itself + - Agent 1 improves → creates stronger opponent + - New Agent 1 trains vs stronger Agent 1 + - Cycle: both improve together + - Result: robust agent that beats all versions of itself +``` + +**Self-Play Implementation**: + +```python +class SelfPlayTrainer: + def __init__(self, agent_class, n_checkpoint_opponents=5): + self.current_agent = agent_class() + self.opponent_pool = [] # Keep historical versions + self.n_checkpoints = n_checkpoint_opponents + + def train(self, num_episodes): + """Train with self-play against previous versions.""" + for episode in range(num_episodes): + # Select opponent: current agent or historical version + if not self.opponent_pool or np.random.rand() < 0.5: + opponent = copy.deepcopy(self.current_agent) + else: + opponent = np.random.choice(self.opponent_pool) + + # Play episode: current_agent vs opponent + trajectory = self._play_episode(self.current_agent, opponent) + + # Train current agent on trajectory + self.current_agent.train_on_trajectory(trajectory) + + # Periodically add current agent to opponent pool + if episode % (num_episodes // self.n_checkpoints) == 0: + self.opponent_pool.append(copy.deepcopy(self.current_agent)) + + return self.current_agent + + def _play_episode(self, agent1, agent2): + """Play episode: agent1 vs agent2, collect experience.""" + trajectory = [] + state = self.env.reset() + done = False + + while not done: + # Agent 1 action + action1 = agent1.select_action(state['agent1_obs']) + + # Agent 2 action (opponent) + action2 = agent2.select_action(state['agent2_obs']) + + # Step environment + state, reward, done = self.env.step(action1, action2) + + trajectory.append({ + 'obs1': state['agent1_obs'], + 'obs2': state['agent2_obs'], + 'action1': action1, + 'action2': action2, + 'reward1': reward['agent1'], + 'reward2': reward['agent2'] + }) + + return trajectory +``` + +**Self-Play Benefits and Pitfalls**: + +``` +Benefits: + - Agents automatically improve together + - Robust to different opponent styles + - Emergent complexity (rock-paper-scissors dynamics) + +Pitfalls: + - Agents might exploit specific weaknesses (not generalizable) + - Training unstable if pool too small + - Forgetting how to beat weaker opponents (catastrophic forgetting) + - Computational cost (need to evaluate multiple opponents) + +Solution: Diverse opponent pool + - Keep varied historical versions + - Mix self-play with evaluation vs fixed benchmark + - Monitor for forgetting (test vs all opponents periodically) +``` + + +## Part 16: Practical Implementation Considerations + +### Observation Space Design + +**Key consideration**: Partial vs full observability + +```python +# Full Observability (not realistic but simplest) +observation = { + 'own_position': agent_pos, + 'all_agent_positions': [pos1, pos2, pos3], # See everyone! 
+ 'all_agent_velocities': [vel1, vel2, vel3], + 'targets': [target1, target2, target3] +} + +# Partial Observability (more realistic, harder) +observation = { + 'own_position': agent_pos, + 'own_velocity': agent_vel, + 'target': own_target, + 'nearby_agents': agents_within_5m, # Limited field of view + # Note: don't see agents far away +} + +# Consequence: With partial obs, agents must communicate or learn implicitly +# Via environmental interaction (e.g., bumping into others) +``` + +### Reward Structure Design + +**Critical for multi-agent learning**: + +```python +# Cooperative game: shared reward +team_reward = +100 if goal_reached else 0 +# Problem: ambiguous who contributed + +# Cooperative game: mixed rewards (shared + individual) +team_reward = +100 if goal_reached +individual_bonus = +5 if agent_i_did_critical_action +total_reward_i = team_reward + individual_bonus # incentivizes both + +# Competitive game: zero-sum +reward_1 = goals_1 - goals_2 +reward_2 = goals_2 - goals_1 # Opposite + +# Competitive game: individual scores +reward_1 = goals_1 +reward_2 = goals_2 +# Problem: agents don't care about each other (no implicit competition) + +# Mixed: cooperation + competition (team sports) +reward_i = +10 if team_wins + + 1 if agent_i_scores + + 0.1 * team_score # Shared team success bonus +``` + +**Reward Design Pitfall**: Too much individual reward breaks cooperation + +``` +Example: 3v3 soccer +reward_i = +100 if agent_i_scores (individual goal) + + +5 if agent_i_assists (passes to scorer) + + 0 if teammate scores (not rewarded!) + +Result: + Agent learns: "Only my goals matter, don't pass to teammates" + Agent hoards ball, tries solo shots + Team coordination breaks + Lose to coordinated opponent team + +Solution: Include team reward +reward_i = +100 if team_wins + + +10 if agent_i_scores goal + + +2 if agent_i_assists +``` + + +## Summary: When to Use Multi-Agent RL + +**Multi-agent RL is needed when**: + +1. Multiple agents learning simultaneously in shared environment +2. Agent interactions cause non-stationarity +3. Coordination or credit assignment is non-trivial +4. Problem structure matches available algorithm (cooperative/competitive) + +**Multi-agent RL is NOT needed when**: + +1. Single learning agent (others are static) +2. Agents act independently (no true interaction) +3. Problem easily decomposes (use single-agent RL per agent) +4. Scalability to 100+ agents critical (MARL hard to scale) + +**Key Algorithms**: + +1. **QMIX**: Cooperative, value factorization, decentralized execution +2. **MADDPG**: Competitive/mixed, continuous actions, centralized critic +3. **MAPPO**: Cooperative, policy gradients, centralized training +4. **Self-Play**: Competitive, agents train vs historical versions +5. **Communication**: For partial observability, explicit coordination +6. **CTDE**: Paradigm enabling stable multi-agent learning + +**Algorithm Selection Matrix**: + +``` + Cooperative Competitive Mixed +Discrete Action QMIX Nash-Q Hybrid +Continuous Action MAPPO/MADDPG MADDPG MADDPG +Partial Obs +Comm +Opponent Mod +Both +Scalable IQL (unstable) IQL IQL (unstable) +``` + +**Critical Success Factors**: + +1. Match algorithm to problem structure (cooperative vs competitive) +2. Design reward to align with desired coordination +3. Use CTDE for stable training +4. Monitor for non-stationarity issues +5. Validate agents work independently during execution + +Use this skill to understand multi-agent problem structure and select appropriate algorithms for coordination challenges. 
diff --git a/skills/using-deep-rl/offline-rl.md b/skills/using-deep-rl/offline-rl.md new file mode 100644 index 0000000..b3380c0 --- /dev/null +++ b/skills/using-deep-rl/offline-rl.md @@ -0,0 +1,1571 @@ + +# Offline Reinforcement Learning + +## When to Use This Skill + +Invoke this skill when you encounter: + +- **Offline Dataset**: User has fixed dataset D, no environment access +- **Distribution Shift**: Policy improvements diverge from data collection distribution +- **Value Overestimation**: Learning from offline data causes Q-values to diverge +- **CQL/IQL Implementation**: Implementing conservative value estimation +- **Behavior Cloning Constraints**: How to keep policy close to data distribution +- **BCQ Algorithm**: Batch-Constrained Q-learning for offline control +- **Offline Evaluation**: Evaluating policy performance without environment interaction +- **When to Apply Offline RL**: Deciding if offline RL is needed vs online alternatives +- **Extrapolation Error**: Understanding why agents fail on OOD actions +- **Data Quality Impact**: How dataset composition affects algorithm choice + +**This skill teaches learning from fixed datasets without environment interaction.** + +Do NOT use this skill for: + +- Online RL with environment interaction (use policy-gradient-methods, actor-critic-methods) +- Pure supervised learning on (s,a) pairs (that's behavior cloning, use supervised learning) +- Online model-free learning (use value-based-methods) +- Algorithm-agnostic debugging (use rl-debugging-methodology) + +## Core Principle + +**Offline RL learns from fixed datasets without environment interaction, solving the fundamental problem of value overestimation without online correction.** + +The core insight: Standard RL algorithms (Q-learning, policy gradient) assume you can interact with environment to correct mistakes. Offline RL has no such luxury. + +``` +Online RL Problem: + 1. Agent explores environment + 2. Collects (s,a,r,s',d) transitions + 3. Updates value estimate: Q[s,a] ← r + γ max_a' Q[s',a'] + 4. If Q overestimates, agent tries bad action + 5. Environment gives low reward + 6. Q-value corrects downward in next update + +Offline RL Problem: + 1. Agent receives fixed dataset D + 2. Estimates Q from D only + 3. Updates value estimate: Q[s,a] ← r + γ max_a' Q[s',a'] + 4. If Q overestimates (no data for some actions), agent tries bad action + 5. No feedback! Can't try action in environment + 6. Q-value never corrects. Error compounds. + 7. 
Policy diverges, performance collapses +``` + +**Without understanding extrapolation error and conservative value estimation, you'll implement algorithms that hallucinate value for unseen state-action pairs.** + + +## Part 1: The Offline RL Problem + +### Offline RL Fundamentals + +**Offline RL Setting**: + +- You have fixed dataset D = {(s_i, a_i, r_i, s'_i, d_i)} collected by unknown behavior policy +- No access to environment (can't interact) +- Goal: Learn policy π that maximizes expected return +- Constraint: Policy must work on real environment, not just in data + +**Key Difference from Supervised Learning**: + +``` +Supervised Learning (behavior cloning): + π = argmin_π E_{(s,a)~D}[||π(a|s) - β(a|s)||²] + Problem: Learns data collection policy, can't improve + +Offline RL: + π = argmin_π E_{(s,a,r,s')~D}[Q(s,a) - μ(a|s)] + Benefit: Uses reward signal to improve beyond data + Challenge: Q-values unreliable outside data distribution +``` + +### Why Standard Q-Learning Fails on Offline Data + +**The Extrapolation Problem**: + +Imagine discrete MDP with 3 actions: left, right, wait. + +```python +# Data collection: random policy samples uniformly +# States: {s1, s2} +# Dataset D: +# (s1, left, r=5, s1') +# (s1, left, r=4, s1') +# (s1, right, r=3, s1') +# (s2, wait, r=10, s2') +# (s2, wait, r=9, s2') +# (s1, right, r=2, s1') + +# Training Q-Learning on D: +Q(s1, left) ≈ 4.5 # Average of data +Q(s1, right) ≈ 2.5 # Average of data +Q(s1, wait) ≈ ??? # No data! Network must extrapolate + +# What does Q-network guess for Q(s1, wait)? +# Network sees: +# - action=left → reward high +# - action=right → reward low +# - action=wait → no signal +# Worst case: network predicts Q(s1, wait) = 100 (hallucination!) + +# Policy improvement: +π(a|s1) = argmax_a Q(s1, a) + = argmax{4.5, 2.5, 100} + = wait # WRONG CHOICE! + +# In real environment: s1 + wait = crash (reward = -100) +# Why? Network extrapolated wildly beyond training distribution. +``` + +**The Root Cause**: + +1. Training signal only for actions in D +2. Network trained to minimize MSE on seen (s,a) pairs +3. For unseen (s,a), network interpolates/extrapolates +4. Extrapolation is unreliable in high dimensions +5. Policy picks extrapolated high values +6. In reality: catastrophic failure + +### Distribution Shift and Policy Divergence + +**The Core Challenge**: + +``` +Initial behavior policy β: + Collects D by visiting state distribution d_β + D covers: {actions β takes, rewards β gets} + +Your learned policy π: + Improves on β + Visits different states: d_π + Tries different actions: π(a|s) + +Mismatch: + - States in d_π but not d_β: Q has no data + - Actions π takes but β doesn't: Q extrapolates + - Q-estimates unreliable + - Policy gets stuck in hallucinated high-value regions +``` + +**Example: Robot Manipulation**: + +``` +Data: Collected by [move_forward, move_left, grasp] policy +- Good at pushing objects forward +- Poor at pulling backward + +Your offline-trained π: +- Learns forward motion works (in data) +- Learns backward motion has high Q (extrapolated!) +- Because backward unexplored, network guesses Q=50 + +Reality: +- Forward: actually good +- Backward: crashes into wall, r=-100 +- Policy fails catastrophically + +Why? 
+- Distribution shift: π tries backward, d_π ≠ d_β +- No data for backward actions +- Network extrapolates incorrectly +``` + +### Value Overestimation: The Central Problem + +Standard Q-learning uses: + +``` +Q(s,a) ← Q(s,a) + α(r + γ max_{a'} Q(s',a') - Q(s,a)) + └─ Greedy max +``` + +**The Problem with max_a'**: + +``` +With limited data: +- max_{a'} Q(s',a') picks highest Q-value +- But if Q overestimates for most actions +- max picks an overestimated value +- TD target is too high +- Q-values drift upward indefinitely +``` + +**Why Does This Happen Offline?** + +Online RL: + +``` +Iteration 1: + Q(s,a) = 10 (overestimate) +Iteration 2: + Real transition: (s,a,r=-5,s') + Q(s,a) ← -5 + γ max Q(s',a') + If Q(s',a') corrected to 0, Q(s,a) ← -5 + Overestimation corrected! + +Offline RL: + Same transition in replay buffer D + Q(s,a) ← -5 + γ max Q(s',a') + But max Q(s',a') still overestimated (no correction) + Q stays high, continues overestimating + Error never corrects +``` + + +## Part 2: Conservative Q-Learning (CQL) + +### CQL: The Idea + +**Conservative Q-Learning** directly addresses value overestimation by adding a **pessimistic lower bound**: + +``` +Standard Bellman (optimistic): + Q(s,a) ← r + γ max_{a'} Q(s',a') + +CQL (conservative): + Q(s,a) ← r + γ max_{a'} (Q(s',a') - α * C(a')) + +Where C(a') is a penalty for actions outside data distribution. +``` + +**Key Idea**: Penalize high Q-values for actions not well-represented in data. + +### CQL in Practice: The Implementation + +**Full CQL Update**: + +```python +import torch +import torch.nn as nn +from torch.optim import Adam + +class CQLAgent: + def __init__(self, state_dim, action_dim, hidden_dim=256): + # Q-network (standard DQN-style) + self.Q = nn.Sequential( + nn.Linear(state_dim + action_dim, hidden_dim), + nn.ReLU(), + nn.Linear(hidden_dim, hidden_dim), + nn.ReLU(), + nn.Linear(hidden_dim, 1) # Q-value output + ) + + # Behavior cloning network (estimate β(a|s)) + self.pi_b = nn.Sequential( + nn.Linear(state_dim, hidden_dim), + nn.ReLU(), + nn.Linear(hidden_dim, action_dim) + ) + + self.Q_optimizer = Adam(self.Q.parameters(), lr=3e-4) + self.pi_b_optimizer = Adam(self.pi_b.parameters(), lr=3e-4) + + # CQL hyperparameters + self.cql_weight = 1.0 # How much to penalize OOD actions + self.discount = 0.99 + self.target_update_rate = 0.005 + + # Target network + self.Q_target = nn.Sequential( + nn.Linear(state_dim + action_dim, hidden_dim), + nn.ReLU(), + nn.Linear(hidden_dim, hidden_dim), + nn.ReLU(), + nn.Linear(hidden_dim, 1) + ) + self._soft_update_target() + + def train_step(self, batch): + """ + One CQL training update on batch D. + Batch contains: states, actions, rewards, next_states, dones + """ + states, actions, rewards, next_states, dones = batch + batch_size = states.shape[0] + + # 1. 
Compute TD target with CQL penalty + with torch.no_grad(): + # Next action values + q_next = self.Q_target(torch.cat([next_states, actions], dim=1)) + + # CQL: penalize high Q-values for OOD actions + # Sample random actions and batch actions + random_actions = torch.rand((batch_size, 10, self.action_dim)) + batch_actions = actions.unsqueeze(1).expand(-1, 10, -1) + + # Q-values for random and batch actions + q_random = self.Q_target(torch.cat([ + next_states.unsqueeze(1).expand(-1, 10, -1), + random_actions + ], dim=2)) # [batch, 10] + + q_batch = self.Q_target(torch.cat([ + next_states.unsqueeze(1).expand(-1, 10, -1), + batch_actions + ], dim=2)) # [batch, 10] + + # CQL penalty: log(sum exp(Q_random) + sum exp(Q_batch)) + # This penalizes taking OOD actions + cql_penalty = ( + torch.logsumexp(q_random, dim=1) + + torch.logsumexp(q_batch, dim=1) + ) + + # TD target: conservative value estimate + td_target = rewards + (1 - dones) * self.discount * ( + q_next - self.cql_weight * cql_penalty / 2 + ) + + # 2. Update Q-network + q_pred = self.Q(torch.cat([states, actions], dim=1)) + q_loss = ((q_pred - td_target) ** 2).mean() + + self.Q_optimizer.zero_grad() + q_loss.backward() + self.Q_optimizer.step() + + # 3. Update behavior cloning network (optional) + # Helps estimate which actions are in-distribution + log_probs = torch.log_softmax(self.pi_b(states), dim=1) + bc_loss = -log_probs.gather(1, actions.long()).mean() + + self.pi_b_optimizer.zero_grad() + bc_loss.backward() + self.pi_b_optimizer.step() + + # 4. Soft update target network + self._soft_update_target() + + return { + 'q_loss': q_loss.item(), + 'bc_loss': bc_loss.item(), + 'cql_penalty': cql_penalty.mean().item() + } + + def _soft_update_target(self): + """Soft update target network toward main network.""" + for target_param, main_param in zip( + self.Q_target.parameters(), + self.Q.parameters() + ): + target_param.data.copy_( + self.target_update_rate * main_param.data + + (1 - self.target_update_rate) * target_param.data + ) + + def select_action(self, state, temperature=0.1): + """ + Select action using CQL-trained Q-values. + Temperature controls exploration. + """ + with torch.no_grad(): + # Evaluate all actions (in discrete case) + q_values = [] + for a in range(self.action_dim): + action_tensor = torch.tensor([a], dtype=torch.float32) + q_val = self.Q(torch.cat([state, action_tensor])) + q_values.append(q_val.item()) + + q_values = torch.tensor(q_values) + + # Softmax policy (temperature for uncertainty) + logits = q_values / temperature + action_probs = torch.softmax(logits, dim=0) + + # Sample or take greedy + action = torch.multinomial(action_probs, 1).item() + return action +``` + +**Key CQL Components**: + +1. **CQL Penalty**: `logsumexp(Q_random) + logsumexp(Q_batch)` + - Penalizes high Q-values for both random and batch actions + - Forces Q-network to be pessimistic + - Prevents extrapolation to unseen actions + +2. **Conservative Target**: `r + γ(Q(s',a') - α*penalty)` + - Lowers TD target by CQL penalty amount + - Makes Q-estimates more conservative + - Safer for policy improvement + +3. 
**Behavior Cloning Network**: Estimates β(a|s) + - Helps identify in-distribution actions + - Can weight CQL penalty by action probability + - Tighter constraint on constrained actions + +### CQL Intuition + +**What CQL Prevents**: + +``` +Without CQL: +Q(s1, wait) = 100 (hallucinated, no data) +π picks wait → disaster + +With CQL: +Q_target for s1, wait includes penalty +Q_target = r + γ(Q(s',a') - α * log(sum exp(Q))) + = r + γ(50 - 100) + = r - 50 * γ +CQL pessimism forces Q(s1, wait) low +π picks safer action + +Result: Policy stays in data distribution, avoids hallucinated values +``` + +### When CQL Works Well + +- **Short horizons**: Errors don't compound as much +- **Diverse data**: Multiple actions represented +- **Known behavior policy**: Can weight penalty appropriately +- **Discrete actions**: Easier to evaluate all actions + +### CQL Failure Modes + +- **Too conservative on good data**: May not improve over β +- **High variance penalties**: log-sum-exp can be unstable +- **Computational cost**: Requires sampling many actions + + +## Part 3: Implicit Q-Learning (IQL) + +### IQL: A Different Approach to Pessimism + +While CQL explicitly penalizes OOD actions, **IQL** achieves pessimism through a different mechanism: **expectile regression**. + +``` +Standard L2 Regression (mean): + Expected value minimizes E[(y - ŷ)²] + +Expectile Regression (quantile-like): + Expects value minimizes E[|2τ - 1| * |y - ŷ|] for τ in (0,1) + - τ < 0.5: underestimates (pessimistic) + - τ = 0.5: median (neutral) + - τ > 0.5: overestimates (optimistic) +``` + +### IQL Implementation + +**Key Insight**: Use expectile loss to make Q-estimates naturally pessimistic without explicit penalties. + +```python +class IQLAgent: + def __init__(self, state_dim, action_dim, hidden_dim=256, expectile=0.7): + self.Q = self._build_q_network(state_dim, action_dim, hidden_dim) + self.V = self._build_v_network(state_dim, hidden_dim) # Value function + + self.Q_optimizer = Adam(self.Q.parameters(), lr=3e-4) + self.V_optimizer = Adam(self.V.parameters(), lr=3e-4) + + self.expectile = expectile # τ = 0.7 for slight pessimism + self.discount = 0.99 + self.temperature = 1.0 # For policy softness + + def expectile_loss(self, diff, expectile): + """ + Asymmetric expectile loss. + Penalizes overestimation more than underestimation (pessimism). 
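        Concretely: entries where diff = Q - V is positive get weight
        `expectile`, entries where it is negative get weight (1 - expectile).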
+ """ + weight = torch.where( + diff > 0, + expectile * torch.ones_like(diff), + (1 - expectile) * torch.ones_like(diff) + ) + return weight * (diff ** 2) + + def train_v_function(self, batch): + """ + Step 1: Train value function V(s) + V(s) estimates expected Q-value under behavior policy + """ + states, actions, rewards, next_states, dones = batch + + # Q-values from current policy + q_values = self.Q(states, actions) + + # V-network predicts these Q-values + v_pred = self.V(states) + + # Expectile loss: V should underestimate Q slightly + # (stay pessimistic) + q_diff = q_values - v_pred + v_loss = self.expectile_loss(q_diff, self.expectile).mean() + + self.V_optimizer.zero_grad() + v_loss.backward() + self.V_optimizer.step() + + return {'v_loss': v_loss.item()} + + def train_q_function(self, batch): + """ + Step 2: Train Q-function using pessimistic V-target + Q(s,a) ← r + γ V(s') (instead of γ max_a' Q(s',a')) + """ + states, actions, rewards, next_states, dones = batch + + # IQL target: use V-function instead of max Q + with torch.no_grad(): + v_next = self.V(next_states) + td_target = rewards + (1 - dones) * self.discount * v_next + + q_pred = self.Q(states, actions) + q_loss = ((q_pred - td_target) ** 2).mean() + + self.Q_optimizer.zero_grad() + q_loss.backward() + self.Q_optimizer.step() + + return {'q_loss': q_loss.item()} + + def train_step(self, batch): + """IQL training: V-function first, then Q-function.""" + v_info = self.train_v_function(batch) + q_info = self.train_q_function(batch) + return {**v_info, **q_info} + + def select_action(self, state): + """ + Policy improvement: use exponential weighted Q-values. + Only improve actions with high estimated value. + """ + with torch.no_grad(): + # Evaluate actions + q_values = self.Q(state, actions=None) # All actions + + # Exponential weighting: exp(Q/τ) + # Concentrates on high-Q actions + weights = torch.exp(q_values / self.temperature) + weights = weights / weights.sum() + + action = torch.multinomial(weights, 1).item() + return action +``` + +### IQL Key Insight + +**Why V-function Makes Q Pessimistic**: + +``` +Standard Q-Learning: + Q(s,a) = r + γ max_a' Q(s',a') ← Optimistic! + +IQL: + 1. Train V(s) to estimate Q under behavior policy + V(s) ≈ E_a~β[Q(s,a)] + 2. Use V as target: Q(s,a) = r + γ V(s') + Why pessimistic? + + If policy is suboptimal: + - Good actions: Q > V(s) (above average) + - Bad actions: Q < V(s) (below average) + - max_a Q(s',a') picks good action (might extrapolate) + - V(s') is average (conservative) + + Result: Using V instead of max prevents picking overestimated actions! +``` + +### Expectile Loss Intuition + +``` +Standard MSE: E[(Q - V)²] + - Symmetric penalty: overestimation = underestimation + +Expectile Loss (τ=0.7): E[|2*0.7 - 1| * |Q - V|²] + - When Q > V: weight = 0.7 (moderate penalty) + - When Q < V: weight = 0.3 (light penalty) + - Result: V underestimates Q slightly + +Effect: Q values are naturally pessimistic without explicit penalties! +``` + +### When IQL Excels + +- **High-dimensional observations**: V-function is simpler than Q +- **Continuous actions**: No need to discretize +- **Mixed quality data**: Expectile naturally handles varying data quality +- **Implicit distribution shift handling**: V-function implicitly constrains to data distribution + + +## Part 4: Batch-Constrained Q-Learning (BCQ) + +### BCQ: Constraining Policy to Behavior Support + +**Core Idea**: Only improve actions that have **high probability under behavior policy β**. 
+ +``` +Standard offline improvement: + π(a|s) ← exp(Q(s,a) / τ) ← Can pick any action! + +BCQ improvement: + π(a|s) ← exp(Q(s,a) / τ) * I(β(a|s) > threshold) + └─ Only if action has nonzero β probability +``` + +### BCQ Implementation + +```python +class BCQAgent: + def __init__(self, state_dim, action_dim, hidden_dim=256): + # Q-network + self.Q = self._build_q_network(state_dim, action_dim, hidden_dim) + + # Behavior cloning network: learns β(a|s) + self.pi_b = self._build_policy_network(state_dim, action_dim, hidden_dim) + + # Perturbation network: learn action perturbations near β + # π(a|s) = arg max_a Q(s, β(s) + ξ(s, a)) + self.perturbation = self._build_perturbation_network( + state_dim, action_dim, hidden_dim + ) + + self.Q_optimizer = Adam(self.Q.parameters(), lr=3e-4) + self.pi_b_optimizer = Adam(self.pi_b.parameters(), lr=3e-4) + self.perturbation_optimizer = Adam(self.perturbation.parameters(), lr=3e-4) + + self.discount = 0.99 + self.constraint_weight = 0.5 # How strict is batch constraint? + + def train_step(self, batch): + """BCQ training loop.""" + states, actions, rewards, next_states, dones = batch + + # 1. Train behavior cloning network β + log_probs = torch.log_softmax(self.pi_b(states), dim=1) + pi_b_loss = -log_probs.gather(1, actions.long()).mean() + + self.pi_b_optimizer.zero_grad() + pi_b_loss.backward() + self.pi_b_optimizer.step() + + # 2. Train Q-network with BCQ constraint + with torch.no_grad(): + # Behavior actions in next state + pi_b_next = self.pi_b(next_states) # [batch, action_dim] + + # Perturbation network learns small deviations from β + perturbation = self.perturbation(next_states, pi_b_next) + + # Constrained action: π(a|s') = β(s') + ξ(s') + # But in action space [-1, 1] (clipped) + constrained_actions = torch.clamp( + pi_b_next + perturbation, min=-1, max=1 + ) + + # Q-value with constrained action + q_next = self.Q(next_states, constrained_actions) + td_target = rewards + (1 - dones) * self.discount * q_next + + # Q-loss + q_pred = self.Q(states, actions) + q_loss = ((q_pred - td_target) ** 2).mean() + + self.Q_optimizer.zero_grad() + q_loss.backward() + self.Q_optimizer.step() + + # 3. Train perturbation network + # Learn to find best perturbation to β that increases Q + pi_b_curr = self.pi_b(states) + perturbation_curr = self.perturbation(states, pi_b_curr) + perturbed_actions = torch.clamp( + pi_b_curr + perturbation_curr, min=-1, max=1 + ) + + q_perturbed = self.Q(states, perturbed_actions) + perturbation_loss = -q_perturbed.mean() # Maximize Q + + self.perturbation_optimizer.zero_grad() + perturbation_loss.backward() + self.perturbation_optimizer.step() + + return { + 'q_loss': q_loss.item(), + 'pi_b_loss': pi_b_loss.item(), + 'perturbation_loss': perturbation_loss.item() + } + + def select_action(self, state, num_samples=100): + """ + BCQ action selection: + 1. Sample from β(a|s) + 2. Perturb by small amount + 3. 
Pick action with highest Q + """ + with torch.no_grad(): + # Behavior policy action + pi_b = self.pi_b(state) + + # Sample small perturbations + perturbations = [] + for _ in range(num_samples): + pert = self.perturbation(state, pi_b) + perturbed = torch.clamp(pi_b + pert, min=-1, max=1) + perturbations.append(perturbed) + + perturbations = torch.stack(perturbations) # [num_samples, action_dim] + + # Evaluate Q for each perturbation + q_values = [] + for pert_action in perturbations: + q_val = self.Q(state, pert_action) + q_values.append(q_val.item()) + + # Pick action with highest Q + best_idx = np.argmax(q_values) + action = perturbations[best_idx] + + return action +``` + +### BCQ Core Mechanism + +``` +Batch Constraint Principle: + +Without BCQ: + Policy π can pick ANY action + Problem: Picks actions not in dataset + Result: Q overestimates for unseen actions + +With BCQ: + Policy π = β(s) + small_perturbation(s) + Constraint: π must stay close to β + Result: Only slightly improves on data actions + +Why it works: + - Q is accurate for β-like actions (in data) + - Perturbations are small (confidence is high) + - Policy can't escape to hallucinated region + - Safe policy improvement +``` + +### When BCQ is Appropriate + +- **Very limited data**: Strict constraint needed +- **Expert data with mistakes**: Learn expert actions ± small perturbations +- **Safety-critical**: Can't risk exploring OOD actions +- **Discrete action space**: Easier to enumerate nearby actions + +### BCQ Pitfalls + +- **Too restrictive**: Can't leverage good data effectively +- **Assumes β is reasonable**: If β is terrible, BCQ learns terrible policy +- **Perturbation network complexity**: Another network to train + + +## Part 5: Distribution Shift and Offline Evaluation + +### Understanding Distribution Shift in Offline RL + +**The Core Challenge**: + +``` +Training distribution: + States visited by β (behavior policy) + Actions β typically takes + Transitions β experiences + +Evaluation distribution: + States visited by π (learned policy) + Actions π learns to take + Transitions π will experience + +Problem: + Training and evaluation distributions diverge + π will visit states not in training data + Q-estimates unreliable for those states +``` + +### Offline Evaluation Without Environment Interaction + +**The Problem**: + +``` +Online RL: + π ← train(D) + eval_return = run_policy(π, env, episodes=100) + Simple and accurate + +Offline RL: + π ← train(D) + Can't run in environment (offline constraint!) + Must estimate return without interaction + How to evaluate without cheating? +``` + +### Offline Evaluation Methods + +**Method 1: Importance Sampling** + +```python +def importance_sampling_eval(policy_pi, dataset_D): + """ + Estimate E[G_π] using importance sampling. + + Key idea: + E[G_π] = E_{(s,a,...)~π}[G] + = E_{(s,a,...)~β}[G * (π(a|s)/β(a|s))] + + Use dataset transitions weighted by policy ratio. 
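    Assumes a `behavior_policy` object exposing log_prob(a, s) is in scope
    (it is not passed as an argument here); per-step ratios are multiplied,
    so the estimate's variance grows quickly with trajectory length.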
+ """ + total_return = 0 + total_weight = 0 + + for trajectory in dataset_D: + # Trajectory importance ratio + traj_ratio = 1.0 + traj_return = 0 + + for t, (s, a, r) in enumerate(trajectory): + # Importance weight for this action + pi_prob = policy_pi.log_prob(a, s).exp() + beta_prob = behavior_policy.log_prob(a, s).exp() + + weight = pi_prob / (beta_prob + 1e-8) + + # Importance ratio for trajectory + traj_ratio *= weight + + # Accumulate return + traj_return += (0.99 ** t) * r + + # Weight trajectory by importance ratio + total_return += traj_ratio * traj_return + total_weight += traj_ratio + + # Average weighted return + estimated_return = total_return / total_weight + return estimated_return +``` + +**Problem**: High variance with small β probability. Weight explodes. + +**Method 2: Regression Importance Sampling (RIS)** + +```python +def ris_eval(policy_pi, dataset_D, value_fn): + """ + Regression Importance Sampling: combines IS with value function + to reduce variance. + + Idea: Use learned V-function for long horizons, IS for short. + """ + total_return = 0 + total_weight = 0 + + for trajectory in dataset_D: + for t, (s, a, r, s_next) in enumerate(trajectory): + # Importance weight for this step + pi_prob = policy_pi.log_prob(a, s).exp() + beta_prob = behavior_policy.log_prob(a, s).exp() + + is_weight = pi_prob / (beta_prob + 1e-8) + + # Value of next state + v_next = value_fn(s_next) + + # Hybrid estimate: IS-weighted reward + V-estimate of rest + return_est = r + 0.99 * v_next + weighted_return = is_weight * return_est + + total_return += weighted_return + total_weight += is_weight + + return total_return / total_weight +``` + +**Method 3: Model-based Estimation** + +```python +def model_based_eval(policy_pi, dataset_D, dynamics_model): + """ + Use learned dynamics model to estimate policy performance. 
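    Assumes `dynamics_model(s, a) -> s_next`, a `reward_fn(s, a)`, and a
    rollout `horizon` are available in scope; model error compounds as the
    horizon grows.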
+ + Idea: π(s) → a, dynamics_model(s,a) → s', Q(s,a) → r + """ + initial_states = dataset_D.sample_initial_states(batch_size=100) + + total_return = 0 + + for s in initial_states: + traj_return = 0 + done = False + + for t in range(horizon): + # Policy action + a = policy_pi.select_action(s) + + # Model prediction + s_next = dynamics_model(s, a) + r = reward_fn(s, a) # or Q(s,a) + + # Accumulate return + traj_return += (0.99 ** t) * r + + s = s_next + if done: + break + + total_return += traj_return + + estimated_return = total_return / len(initial_states) + return estimated_return +``` + +### Offline Evaluation Challenges + +``` +Challenge 1: Importance Weight Explosion + - If β rarely takes action π prefers + - IS weight = π(a|s)/β(a|s) becomes huge + - Estimate has infinite variance + +Challenge 2: V-Function Errors + - If V-function incorrect + - RIS estimates still wrong + - Can't correct without environment feedback + +Challenge 3: Model Errors + - If dynamics model wrong + - Model-based estimates diverge from reality + - Especially bad for long horizons + +Solution: Use multiple methods, cross-validate + - If all methods agree: estimate is reliable + - If methods disagree: be suspicious, try online validation +``` + + +## Part 6: When Offline RL is Needed + +### Decision Framework: Offline vs Online RL + +**Question 1: Can you collect more data?** + +``` +YES → Consider online RL + - Use environment interaction + - Standard algorithms work well + - Lower algorithmic complexity + +NO → Offline RL necessary + - Fixed dataset only + - Learn without interaction + - Handle overestimation explicitly +``` + +**Question 2: Is data collection expensive?** + +``` +YES (expensive) → Offline RL pays off + - Robot experiments: $1000+ per hour + - Medical trials: ethical constraints + - Real-world deployment: safety concerns + +NO (cheap) → Online RL usually better + - Simulation available + - Data generation easy + - Self-play systems +``` + +**Question 3: Is data collection dangerous?** + +``` +YES → Offline RL + careful validation + - Autonomous driving + - Nuclear plants + - Medical systems + - Learn conservatively from past experience + +NO → Online RL fine + - Game environments + - Safe simulators + - Can afford exploration failures +``` + +### When Offline RL is Essential + +**1. Real-World Robotics** + +``` +Problem: Collect trajectory = robot experiment +Cost: $$$, time, expertise +Solution: Offline RL from logged demonstrations + +Example: Learning from human tele-operation logs +- Data collection: humans control robot +- Training: offline RL from logs +- Deployment: robot improves on human behavior + +Why offline RL helps: +- Can't try random actions (breaks hardware) +- Can't explore unsafely (danger) +- Limited budget for experiments +``` + +**2. Medical Treatment Policies** + +``` +Problem: Can't experiment on patients +Data: Historical patient records +Solution: Offline RL to find better treatments + +Example: Learning antibiotic treatment policies +- Data: patient → treatment → outcome logs +- Training: offline RL from historical data +- Deployment: recommend treatments + +Why offline RL helps: +- Can't do random exploration (unethical) +- Patient outcomes matter immediately +- Limited patient population +``` + +**3. 
Recommendation Systems** + +``` +Problem: Users leave if recommendations bad +Data: Historical user interactions +Solution: Offline RL to improve recommendations + +Example: Movie recommendations +- Data: user watches movie → rates it +- Training: offline RL from interaction logs +- Deployment: recommend movies offline users will like + +Why offline RL helps: +- Online experiment = worse user experience +- Can't A/B test extensively (business impact) +- Massive data available (can be offline) +``` + +### When Online RL is Better + +**1. Simulation Available** + +``` +Example: Atari games +- Infinite free samples +- Can explore safely +- Rewards deterministic +- Online RL solves it easily + +Why offline RL unnecessary: +- Data collection cost ≈ 0 +- Exploration safe (it's a game) +- Online algorithms highly optimized +``` + +**2. Self-Play Systems** + +``` +Example: Chess, Go +- Generate own data +- Unlimited exploration budget +- Learn strong policies easily +- Online RL natural + +Why offline RL adds complexity: +- Data generation is free (self-play) +- Can afford to explore +- Online algorithms work better +``` + +**3. Simulator Fidelity is High** + +``` +Example: Training in simulation, deploy in reality +- Simulator accurate enough +- Can collect unlimited data +- Distribution shift minimal (sim matches reality) +- Online RL sufficient + +Why offline RL unnecessary: +- Can collect all needed data in simulation +- Don't have distribution shift problem +``` + + +## Part 7: Common Pitfalls and Red Flags + +### Pitfall 1: Assuming Online Algorithm Will Work + +**Red Flag**: "I'll just use DQN/PPO on my offline data." + +**Reality**: Will overestimate values, learn poor policy. + +**Example**: + +``` +Dataset: suboptimal human demonstrations +DQN trained on D: max Q estimated for unseen actions +Result: Policy picks actions never in dataset +Reality: actions fail in deployment + +Correct approach: +- Use CQL/IQL to address overestimation +- Constrain policy to behavior support (BCQ) +- Evaluate carefully offline before deployment +``` + +### Pitfall 2: Ignoring Distribution Shift + +**Red Flag**: "Policy divergence shouldn't be a problem if data is diverse." + +**Reality**: Even diverse data has gaps. Policy will find them. + +**Example**: + +``` +Dataset: collected over 6 months, diverse actions +Your policy: learns to combine actions in novel ways +Result: visits unseen state combinations +Q-estimates fail for combinations not in data + +Correct approach: +- Monitor policy divergence from data +- Use uncertainty estimates (ensemble Q-networks) +- Gradually deploy, validate offline metrics +``` + +### Pitfall 3: Evaluating Offline with Online Metrics + +**Red Flag**: "I trained the policy, let me just run it in the environment to evaluate." + +**Reality**: That's not offline RL anymore! Defeats the purpose. + +**Example**: + +``` +Offline RL goal: learn without environment interaction +Wrong evaluation: run π in environment 1000 times +Result: uses 1000s of samples for evaluation + +Correct approach: +- Use offline evaluation methods (IS, RIS, model-based) +- Validate offline estimates before deployment +- Use conservative estimates (pessimistic evaluation) +``` + +### Pitfall 4: Not Considering Data Quality + +**Red Flag**: "My algorithm is robust, it handles any data." + +**Reality**: Offline RL performance depends critically on data quality. 
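A quick profiling pass makes this concrete before committing to an algorithm. The sketch below is illustrative only and assumes episodes are stored as dicts with `actions` and `rewards` arrays:

```python
import numpy as np

def profile_offline_dataset(dataset, num_actions):
    """Rough data-quality check for a discrete-action offline dataset."""
    returns = np.array([np.sum(ep["rewards"]) for ep in dataset])
    all_actions = np.concatenate([np.asarray(ep["actions"]) for ep in dataset])
    coverage = np.bincount(all_actions.astype(int), minlength=num_actions)

    print(f"episodes: {len(dataset)}")
    print(f"return mean/std: {returns.mean():.1f} / {returns.std():.1f}")
    print(f"90th-percentile return: {np.percentile(returns, 90):.1f}")
    # Actions with near-zero visit counts are where Q must extrapolate
    print(f"action visit fractions: {coverage / max(coverage.sum(), 1)}")
```

High return spread with thin coverage of some actions points toward stronger pessimism (CQL) or explicit batch constraints (BCQ); dense coverage of near-expert behavior tolerates lighter regularization.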
+ +**Example**: + +``` +Good data: expert demonstrations, well-explored actions +CQL: performs well, conservative works fine + +Bad data: random exploration, sparse rewards +CQL: learns very slowly (too pessimistic) +BCQ: learns random behavior (constrained to β) + +Solution: Analyze data quality first +- Percent expert vs random actions +- Return distribution +- Action coverage +- Choose algorithm for data type +``` + +### Pitfall 5: Overestimating Conservatism + +**Red Flag**: "I'll use maximum pessimism to be safe." + +**Reality**: Too much pessimism prevents learning anything. + +**Example**: + +``` +Hyperparameter: CQL weight α = 1000 (extreme) +Result: Q(s,a) always negative (super pessimistic) +Policy: learns random actions (all have low Q) +Performance: no improvement over random + +Correct approach: +- Tune conservatism to data quality +- Diverse data: less pessimism (CQL weight = 0.1) +- Limited data: more pessimism (CQL weight = 1.0) +- Validate offline evaluation metrics +``` + +### Pitfall 6: Forgetting Batch Constraints + +**Red Flag**: "CQL handles distribution shift, I don't need behavior cloning." + +**Reality**: CQL addresses overestimation, not policy divergence. + +**Example**: + +``` +CQL alone: +Q-values conservative (good) +Policy gradient: π(a|s) = exp(Q(s,a) / τ) +Policy can still diverge far from β (bad) +New states visited, Q unreliable + +CQL + behavior cloning: +Q-values conservative +Policy constrained: π ≈ β + improvement +Policy stays near data +Safe exploration + +Solution: Combine approaches +- CQL for value estimation +- KL constraint for policy divergence +- Behavior cloning for explicit constraint +``` + +### Pitfall 7: Using Wrong Evaluation Metric + +**Red Flag**: "I'll just report average Q-value as evaluation metric." + +**Reality**: Q-values can be hallucinated! + +**Example**: + +``` +High Q-values don't mean high returns +- If Q overestimates: high Q, low actual return +- If data is poor: high Q, bad actions + +Correct metrics: +- Importance sampling estimate (IS) +- Regression IS estimate (RIS, lower variance) +- Model-based estimate (if model good) +- Conservative: min of multiple estimates +``` + +### Pitfall 8: Not Handling Batch Imbalance + +**Red Flag**: "I have 1M samples, that's enough data." + +**Reality**: Sample diversity matters more than quantity. + +**Example**: + +``` +Dataset composition: +- 900K samples: random actions +- 100K samples: expert demonstrations + +Training: +- Q-network sees mostly random behavior +- Expert actions are rare +- Q overestimates expert actions (underfitting them) + +Solution: +- Stratified sampling (balance expert/random) +- Importance weighting (weight rare samples higher) +- Separate Q-networks for different behavior types +``` + +### Pitfall 9: Assuming Stationary Environment + +**Red Flag**: "Environment hasn't changed, my offline policy should work." + +**Reality**: Without online validation, you can't detect environment shifts. + +**Example**: + +``` +Training period: robot arm dynamics stable +Deployment period: arm friction increased +Offline estimate: predicted 50 reward +Actual return: 20 reward + +Why? Q-function trained on different dynamics. + +Solution: +- Monitor deployment performance vs offline estimate +- If large gap: retrain on new data +- Use online validation after deployment +``` + +### Pitfall 10: Not Accounting for Reward Uncertainty + +**Red Flag**: "Rewards are fixed, no uncertainty." + +**Reality**: Sparse/noisy rewards create uncertainty in Q-estimates. 
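The solution sketched in the example below leans on ensembles; a minimal version of a pessimistic, min-over-ensemble TD target (the ensemble structure is assumed, not defined elsewhere in this skill) looks like:

```python
import torch

def conservative_td_target(q_ensemble, rewards, next_states, next_actions,
                           dones, gamma=0.99):
    """Take the minimum over K target Q-networks so epistemic uncertainty
    lowers the target instead of inflating it. Sketch only: assumes each
    network maps a concatenated (state, action) tensor to a scalar Q-value."""
    with torch.no_grad():
        q_next = torch.stack([
            q(torch.cat([next_states, next_actions], dim=1)).squeeze(-1)
            for q in q_ensemble
        ])                                      # shape [K, batch]
        return rewards + gamma * (1 - dones) * q_next.min(dim=0).values
```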
+ +**Example**: + +``` +Sparse reward environment: +- Most transitions have r = 0 +- Few transitions have r = 1 (sparse signal) +- Q-function must extrapolate from few examples + +CQL without reward uncertainty: +- Pessimistic on actions, not rewards +- Misses that some Q-high states might have low actual reward + +Solution: +- Ensemble Q-networks (estimate uncertainty) +- Use epistemic uncertainty in policy improvement +- Conservative value averaging (use min over ensemble) +``` + + +## Part 8: Real-World Case Studies + +### Case Study 1: Robot Manipulation + +**Problem**: Train robot to pick and place objects from demonstrations. + +``` +Data: +- 50 human teleoperation episodes (~500 transitions each) +- Expert actions: picking, placing, moving +- Cost: each episode = 1 hour robot time + +Why offline RL: +- Data collection expensive (human time) +- Can't explore randomly (break hardware) +- Want to improve on expert (BC alone isn't enough) + +Solution: IQL +- Learns from expert demonstrations +- V-function constraints policy to expert-like state distribution +- Iteratively improves on expert with confidence + +Results: +- Offline training: 2 hours GPU +- Online validation: 1 hour robot time +- Policy: 50% success rate improvement over expert +``` + +### Case Study 2: Recommendation Systems + +**Problem**: Improve movie recommendations from historical user interactions. + +``` +Data: +- 100M user-movie interactions (1 year history) +- Reward: user watches movie (r=1) or skips (r=0) +- Distribution: mix of random recommendations and human editorial + +Why offline RL: +- Online A/B test = worse user experience +- Can't explore randomly (hurts engagement metrics) +- But can learn better recommendations from behavior + +Solution: CQL +- Conservatively estimates Q(user, movie) +- Avoids recommending movies only hypothetically good +- Safely recommends movies similar to successful past + +Results: +- Offline metrics: 20% improvement over heuristics +- A/B test: 8% improvement (offline was too conservative) +- Trade-off: offline pessimism vs online optimality +``` + +### Case Study 3: Medical Treatment Policy + +**Problem**: Learn treatment policy from patient records to minimize mortality. 
+ +``` +Data: +- 50K patient records (diverse treatments, outcomes) +- Actions: drug choices, dosages, procedures +- Reward: patient survives (r=1) or dies (r=-100) + +Why offline RL: +- Can't experiment on patients (unethical) +- Sparse reward (most patients live) +- Distribution shift (changing patient population) + +Solution: BCQ + IQL hybrid +- BCQ: only improve on treatments in data (avoid new untested drugs) +- IQL: expectile regression handles sparse rewards naturally +- Conservative deployment: start with small population, validate + +Challenges: +- Data collection bias (sicker patients get more aggressive treatment) +- Measurement error (outcomes uncertain) +- Non-stationarity (medical practices evolve) + +Results: +- Offline validation: policy seems better +- Pilot deployment: 2% improvement +- Requires continuous retraining as new data arrives +``` + + +## Part 9: Advanced Topics + +### Topic 1: Conservative Q-Learning Variants + +**CQL with Importance Weighting**: + +``` +Instead of treating all actions equally in CQL penalty, +weight by behavioral policy probability: + +CQL loss = -α * E[(1-β(a|s))/(1+β(a|s)) * Q(s,a)] + + E[Q(s,a) - target] + +Intuition: Heavy penalty on unlikely actions, light penalty on likely ones +Result: More efficient use of data, can improve more than standard CQL +``` + +**Weighted CQL for Reward Maximization**: + +``` +Modify target to emphasize high-reward trajectories: + +CQL loss = -α * E[weight(r) * Q(s,a)] + E[(Q - target)²] + +where weight(r) = high if r is high, low if r is low + +Result: Faster learning from expert demonstrations +Trade-off: Less conservative, more risk of overestimation +``` + +### Topic 2: Offline RL with Function Approximation Errors + +When using neural networks, approximation errors can compound: + +``` +Error compound formula: + Total error ≈ sum of: + 1. Bellman error: |r + γ V(s') - Q(s,a)| + 2. Approximation error: |Q_approx(s,a) - Q_true(s,a)| + 3. Extrapolation error: error magnitude grows with |s - data_states| + +Solutions: +- Use ensemble networks (estimate uncertainty) +- Conservative value updates (take min over ensemble) +- Explicit uncertainty penalties +``` + +### Topic 3: Offline-to-Online Fine-Tuning + +``` +Real-world often requires offline pre-training + online fine-tuning: + +Phase 1: Offline training + - Learn from fixed dataset + - Use CQL/IQL for value stability + +Phase 2: Online fine-tuning + - Collect new data with learned policy + - Fine-tune with reduced exploration (avoid undoing offline learning) + - Use importance weighting to not forget offline data + +Example: Robotics + - Offline: learn from 1 month of demonstrations + - Online: 1 week fine-tuning with real environment + - Combined: policy leverages both demonstrations and interaction +``` + + +## Part 10: Debugging and Diagnosis + +### Diagnostic Question 1: Are Q-Values Reasonable? 
+ +**How to Check**: + +```python +# Sample random trajectory from dataset +state = dataset.sample_state() +for action in range(num_actions): + q_val = Q(state, action) + print(f"Q({state}, {action}) = {q_val}") + +# Reasonable bounds: +# - Q-values should match observed returns (~10-100 for most tasks) +# - Not astronomical (1000+) without justification +# - Negative only if task is hard +``` + +**Red Flags**: + +- Q-values > 1000: overestimation (increase CQL weight) +- Q-values all negative: pessimism too high (decrease CQL weight) +- Q-values constant: network not learning (check training loss) + +### Diagnostic Question 2: Is Policy Diverging from Behavior? + +**How to Check**: + +```python +# Compute KL divergence between learned π and behavior β +kl_div = 0 +for state in test_states: + pi_logprobs = π.log_prob(state) + beta_logprobs = β.log_prob(state) + kl_div += (pi_logprobs.exp() * (pi_logprobs - beta_logprobs)).mean() + +# Reasonable KL divergence: +# - < 1.0 bit: small divergence (safe) +# - 1.0-5.0 bits: moderate divergence (watch for OOD) +# - > 5.0 bits: severe divergence (use more regularization) +``` + +**Fixes**: + +- Too much divergence: increase KL constraint weight +- Too little divergence: decrease constraint (not learning) + +### Diagnostic Question 3: Offline vs Online Performance Gap + +```python +# Estimate offline performance (no environment interaction) +offline_return_estimate = importance_sampling_eval(policy_π, dataset) + +# Estimate online performance (if possible to test) +online_return_actual = run_policy(policy_π, environment, episodes=10) + +# Gap analysis: +gap = online_return_actual - offline_return_estimate + +# Interpretation: +# gap ≈ 0: offline estimates reliable +# gap > 10%: offline estimate optimistic (increase pessimism) +# gap < -10%: offline estimate too pessimistic (decrease pessimism) +``` + + +## Conclusion: When You Know Offline RL + +**You understand offline RL when you can**: + +1. Explain value overestimation without environment feedback +2. Choose between CQL, IQL, BCQ for data type +3. Recognize distribution shift in your learned policy +4. Design offline evaluation without environment rollouts +5. Know when offline RL worth the complexity +6. Diagnose training failures (overestimation, divergence, etc.) 
+ +**Next Steps**: + +- Implement CQL from scratch on GridWorld +- Compare CQL vs IQL vs BCQ on offline benchmark +- Design offline evaluation strategy for your domain +- Deploy cautiously: offline → online fine-tuning + + +## References & Further Learning + +**Key Papers**: + +- CQL: "Conservative Q-Learning for Offline Reinforcement Learning" (Kumar et al., 2020) +- IQL: "Offline Reinforcement Learning with Implicit Q-Learning" (Kostrikov et al., 2021) +- BCQ: "Batch-Constrained Deep Q-Learning" (Fujimoto et al., 2018) + +**Offline RL Benchmarks**: + +- D4RL: Offline RL benchmark suite +- Atari 5M: Limited sample offline Atari +- Locomotion: MuJoCo continuous control + +**Related Skills**: + +- rl-foundations: TD learning, Bellman equations +- value-based-methods: Q-learning fundamentals +- policy-gradient-methods: Policy improvement +- rl-evaluation-benchmarking: How to measure RL progress diff --git a/skills/using-deep-rl/policy-gradient-methods.md b/skills/using-deep-rl/policy-gradient-methods.md new file mode 100644 index 0000000..904435c --- /dev/null +++ b/skills/using-deep-rl/policy-gradient-methods.md @@ -0,0 +1,1574 @@ + +# Policy Gradient Methods + +## When to Use This Skill + +Invoke this skill when you encounter: + +- **Algorithm Selection**: "Should I use policy gradients or DQN for my problem?" +- **REINFORCE Issues**: User implementing REINFORCE with high variance or slow training +- **PPO Implementation**: User building PPO, debugging training, hyperparameter tuning +- **TRPO Questions**: "What's TRPO?", "Should I use TRPO vs PPO?", "How does trust region work?" +- **Continuous Actions**: User with continuous action space (robot, autonomous vehicles, trading) +- **Variance Reduction**: "How do baselines help?", "How to compute advantages?", "Advantage normalization?" +- **Policy vs Value Confusion**: User unsure whether to use policy gradients or value-based methods +- **Actor-Critic Methods**: User implementing A3C, A2C, understanding actor-critic structure +- **Discrete vs Continuous**: User confused about implementing policy gradients for discrete vs continuous actions +- **Training Instability**: "PPO learning suddenly stops", "Rewards collapse", "Loss spikes" +- **Trust Region Concepts**: Understanding how PPO's clipping enforces trust regions + +**This skill provides practical implementation guidance for direct policy optimization.** + +Do NOT use this skill for: + +- Value-based methods like DQN (route to value-based-methods) +- Model-based RL (route to model-based-rl) +- Offline RL (route to offline-rl-methods) +- Theory foundations (route to rl-foundations) +- Advanced variants (route to advanced-rl-topics) + + +## Core Principle + +**Policy gradient methods directly optimize the policy by following gradients of expected return. They're essential for continuous action spaces and excel when the policy space is simpler than the value function landscape. The fundamental tradeoff: high variance (need baselines) but can handle continuous actions naturally.** + +Key insight: Unlike value-based methods that learn Q(s,a) then act greedily, policy gradients parameterize the policy directly: π(a|s,θ) and improve it via gradient ascent on expected return. 
This fundamental difference makes them: + +- Natural for continuous actions (infinite action space, can't enumerate all actions) +- Capable of stochastic policies (useful for exploration and multi-modal solutions) +- Directly optimizing the objective you care about (expected return) +- But suffering from high variance (need variance reduction techniques) + +**Use policy gradients for**: + +- Continuous control (robot arms, autonomous vehicles, physics simulation) +- Stochastic policies required (exploration strategies, risk-aware policies) +- Large/continuous action spaces +- When value function is harder to learn than policy + +**Do not use for** (use value-based methods instead): + +- Discrete action spaces where you can enumerate all actions (use DQN) +- When off-policy efficiency is critical and state space huge (use DQN) +- Tabular/small discrete problems (Q-learning faster to converge) + + +## Part 1: Policy Gradient Theorem Foundation + +### The Policy Gradient Theorem + +This is the mathematical foundation. The theorem states: + +``` +∇_θ J(θ) = E_τ[∇_θ log π(a|s,θ) Q^π(s,a)] +``` + +Where: + +- J(θ) = expected return (objective to maximize) +- π(a|s,θ) = policy parameterized by θ +- Q^π(s,a) = action value function under policy π +- ∇_θ log π(a|s,θ) = gradient of log-probability (score function) + +**What this means**: The expected return gradient is the expectation of (policy gradient × action value). You move the policy in direction of good actions (high Q) and away from bad actions (low Q). + +### Why Log-Probability (Score Function)? + +The gradient ∇_θ log π(a|s,θ) is crucial. Using log-probability instead of π directly: + +``` +∇_θ log π(a|s,θ) = ∇_θ π(a|s,θ) / π(a|s,θ) + +Key insight: This naturally rescales gradient by 1/π(a|s,θ) +- Actions with low probability get higher gradient signals (explore unusual actions) +- Actions with high probability get lower gradient signals (exploit good actions) +``` + +### Practical Interpretation + +```python +# Pseudocode interpretation +for step in training: + # 1. Sample trajectory under current policy + trajectory = sample_trajectory(policy) + + # 2. For each state-action pair in trajectory + for s, a in trajectory: + # Compute action value (return from this step) + q_value = compute_return(trajectory, step) + + # 3. Update policy to increase log-prob of high-value actions + gradient = q_value * ∇ log π(a|s,θ) + + # 4. Gradient ascent on expected return + θ ← θ + α * gradient +``` + +**Key insight**: If q_value > 0 (good action), increase probability. If q_value < 0 (bad action), decrease probability. + +### The Baseline Problem + +Raw policy gradient has huge variance because Q(s,a) values vary widely. For example: + +``` +Same trajectory, two baseline approaches: +WITHOUT baseline: Q values range from -1000 to +1000 + - Small differences in action quality get lost in noise + - Gradient updates noisy and unstable + +WITH baseline: Advantages A(s,a) = Q(s,a) - V(s) range from -50 to +50 + - Relative quality captured, absolute scale reduced + - Gradient updates stable and efficient +``` + + +## Part 2: REINFORCE - Vanilla Policy Gradient + +### Algorithm: REINFORCE + +REINFORCE is the simplest policy gradient algorithm: + +``` +Algorithm: REINFORCE +Input: policy π(a|s,θ), learning rate α +1. 
Initialize policy parameters θ + +for episode in 1 to num_episodes: + # Collect trajectory + τ = [(s_0, a_0, r_0), (s_1, a_1, r_1), ..., (s_T, a_T, r_T)] + + # Compute returns (cumulative rewards from each step) + for t = 0 to T: + G_t = r_t + γ*r_{t+1} + γ²*r_{t+2} + ... + γ^{T-t}*r_T + + # Update policy (gradient ascent) + for t = 0 to T: + θ ← θ + α * G_t * ∇_θ log π(a_t|s_t,θ) +``` + +### REINFORCE Implementation Details + +**Discrete Actions** (softmax policy): + +```python +import torch +import torch.nn as nn +from torch.distributions import Categorical + +class REINFORCEAgent: + def __init__(self, state_dim, action_dim, hidden=128, lr=0.01): + self.network = nn.Sequential( + nn.Linear(state_dim, hidden), + nn.ReLU(), + nn.Linear(hidden, action_dim) + ) + self.optimizer = torch.optim.Adam(self.network.parameters(), lr=lr) + + def compute_returns(self, rewards, gamma=0.99): + """Compute cumulative returns""" + returns = [] + G = 0 + for r in reversed(rewards): + G = r + gamma * G + returns.insert(0, G) + return torch.tensor(returns, dtype=torch.float32) + + def train_step(self, states, actions, rewards): + """Single training step""" + states = torch.tensor(states, dtype=torch.float32) + actions = torch.tensor(actions, dtype=torch.long) + + # Get policy logits + logits = self.network(states) + + # Get log probabilities (softmax + log) + log_probs = torch.nn.functional.log_softmax(logits, dim=-1) + log_prob_actions = log_probs.gather(1, actions.unsqueeze(1)).squeeze() + + # Compute returns + returns = self.compute_returns(rewards) + + # Policy gradient loss (negative because optimizer minimizes) + loss = -(log_prob_actions * returns).mean() + + self.optimizer.zero_grad() + loss.backward() + self.optimizer.step() + + return loss.item() +``` + +**Continuous Actions** (Gaussian policy): + +```python +class REINFORCEContinuous: + def __init__(self, state_dim, action_dim, hidden=128, lr=0.01): + self.mean_network = nn.Sequential( + nn.Linear(state_dim, hidden), + nn.ReLU(), + nn.Linear(hidden, action_dim) + ) + self.log_std = nn.Parameter(torch.zeros(action_dim)) + self.optimizer = torch.optim.Adam( + list(self.mean_network.parameters()) + [self.log_std], + lr=lr + ) + + def train_step(self, states, actions, rewards): + states = torch.tensor(states, dtype=torch.float32) + actions = torch.tensor(actions, dtype=torch.float32) + + # Get policy mean + mean = self.mean_network(states) + std = torch.exp(self.log_std) + + # Gaussian log probability + var = std.pow(2) + log_prob = -0.5 * ((actions - mean) ** 2 / var).sum(dim=-1) + log_prob -= 0.5 * torch.log(var).sum(dim=-1) + + returns = self.compute_returns(rewards) + loss = -(log_prob * returns).mean() + + self.optimizer.zero_grad() + loss.backward() + self.optimizer.step() +``` + +### REINFORCE Pitfall #1: Ignoring High Variance + +**Scenario**: User implements REINFORCE and sees training curve: extremely noisy, takes millions of samples to learn simple task. + +**Problem**: + +``` +REINFORCE uses raw returns G_t = R_t + γR_{t+1} + ... +These have huge variance because: +- Stochastic environment: same action has different outcomes +- Credit assignment: which action caused reward 100 steps later? +- Result: gradient updates are noisy, learning inefficient +``` + +**Red Flag**: If you see extreme noise in training and slow convergence with REINFORCE, you're missing variance reduction. + +**Solution**: Add baseline (value function estimate). 
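Before adding a baseline, here is a minimal driver loop for the `REINFORCEAgent` class above, assuming a Gymnasium `CartPole-v1` environment; the action-sampling step is our own addition, since the class does not define a `select_action` method:

```python
import gymnasium as gym
import torch
from torch.distributions import Categorical

env = gym.make("CartPole-v1")
agent = REINFORCEAgent(state_dim=4, action_dim=2, lr=0.01)

for episode in range(500):
    states, actions, rewards = [], [], []
    state, _ = env.reset()
    done = False

    while not done:
        # Sample an action from the current softmax policy
        with torch.no_grad():
            logits = agent.network(torch.tensor(state, dtype=torch.float32))
        action = Categorical(logits=logits).sample().item()

        next_state, reward, terminated, truncated, _ = env.step(action)
        done = terminated or truncated

        states.append(state)
        actions.append(action)
        rewards.append(float(reward))
        state = next_state

    # One REINFORCE update on the completed episode
    loss = agent.train_step(states, actions, rewards)
    if episode % 50 == 0:
        print(f"episode {episode}: return={sum(rewards):.0f} loss={loss:.3f}")
```

Even on CartPole the episode returns will fluctuate heavily from run to run, which is exactly the variance problem baselines address next.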
+ + +## Part 3: Baseline and Advantage Estimation + +### Why Baselines Reduce Variance + +Baseline b(s) is any function of state that doesn't change policy (0-gradient), but reduces variance: + +``` +Advantage: A(s,a) = Q(s,a) - b(s) + +Mathematical property: +E[b(s) * ∇ log π(a|s)] = 0 (baseline cancels out in expectation) + +But variance reduces: +Var[Q(s,a) * ∇ log π] >> Var[A(s,a) * ∇ log π] +``` + +### Value Function as Baseline + +Standard baseline: learn V(s) to estimate expected return from state s. + +``` +Advantage estimation: +A(s,a) = r + γV(s') - V(s) [1-step temporal difference] + +or + +A(s,a) = r + γV(s') - V(s) + γ² V(s'') - γV(s') + ... [n-step] + +or + +A(s,a) = G_t - V(s) [Monte Carlo, use full return] +``` + +### Advantage Normalization + +Critical for training stability: + +```python +# Compute advantages +advantages = returns - baseline_values + +# Normalize (zero mean, unit variance) +advantages = (advantages - advantages.mean()) / (advantages.std() + 1e-8) +``` + +**Why normalize?** + +``` +Without: advantages might be [-1000, 1000] → huge gradient updates +With: advantages might be [-2, 2] → stable gradient updates +``` + +### Baseline Network Implementation + +```python +class PolicyGradientWithBaseline: + def __init__(self, state_dim, action_dim, hidden=128, lr=0.001): + # Policy network (discrete actions) + self.policy = nn.Sequential( + nn.Linear(state_dim, hidden), + nn.ReLU(), + nn.Linear(hidden, action_dim) + ) + + # Value network (baseline) + self.value = nn.Sequential( + nn.Linear(state_dim, hidden), + nn.ReLU(), + nn.Linear(hidden, 1) + ) + + self.optimizer = torch.optim.Adam( + list(self.policy.parameters()) + list(self.value.parameters()), + lr=lr + ) + + def train_step(self, states, actions, rewards): + states = torch.tensor(states, dtype=torch.float32) + actions = torch.tensor(actions, dtype=torch.long) + + # Get policy and value estimates + logits = self.policy(states) + values = self.value(states).squeeze() + + # Compute returns + returns = self.compute_returns(rewards) + + # Advantages (with normalization) + advantages = returns - values.detach() + advantages = (advantages - advantages.mean()) / (advantages.std() + 1e-8) + + # Policy loss: maximize expected return + log_probs = torch.nn.functional.log_softmax(logits, dim=-1) + log_prob_actions = log_probs.gather(1, actions.unsqueeze(1)).squeeze() + policy_loss = -(log_prob_actions * advantages).mean() + + # Value loss: minimize squared error + value_loss = ((returns - values) ** 2).mean() + + # Combined loss + loss = policy_loss + 0.5 * value_loss + + self.optimizer.zero_grad() + loss.backward() + self.optimizer.step() + + return loss.item() +``` + +### Pitfall #2: Unnormalized Advantages + +**Scenario**: User computes advantages but doesn't normalize, gets training instability. + +**Problem**: + +``` +Without normalization: +- Advantages might be [−500, 0, 500] (varies widely) +- Policy gradients huge: 500 * ∇log π is massive +- Learning rate must be tiny to avoid divergence +- Training unstable and slow + +With normalization: +- Advantages become [−1, 0, 1] (standardized) +- Gradients stable and proportional +- Can use higher learning rate +- Training smooth and efficient +``` + +**Red Flag**: If training is unstable with policy gradients, first check advantage normalization. + + +## Part 4: PPO - Proximal Policy Optimization + +### PPO: The Practical Standard + +PPO is the most popular policy gradient method because it's simple, stable, and effective. 
+ +**Key idea**: Prevent policy from changing too much per update (trust region). + +### PPO Clipped Surrogate Loss + +PPO uses clipping to enforce trust region: + +``` +L^CLIP(θ) = E_t[min(r_t(θ) A_t, clip(r_t(θ), 1-ε, 1+ε) A_t)] + +Where: +r_t(θ) = π(a_t|s_t,θ) / π(a_t|s_t,θ_old) [probability ratio] +ε = clip parameter (typically 0.2) +A_t = advantage at time t +``` + +**What clipping does**: + +``` +Advantage A > 0 (good action): + - Without clipping: r can be arbitrarily large → huge gradient + - With clipping: r is bounded by (1+ε) → gradient capped + +Advantage A < 0 (bad action): + - Without clipping: r can shrink to 0 → small gradient + - With clipping: r is bounded by (1-ε) → prevents overuse + +Result: Policy changes bounded per update (trust region) +``` + +### PPO Implementation (Discrete Actions) + +```python +class PPOAgent: + def __init__(self, state_dim, action_dim, hidden=128, lr=0.0003, clip_ratio=0.2): + self.policy = nn.Sequential( + nn.Linear(state_dim, hidden), + nn.ReLU(), + nn.Linear(hidden, action_dim) + ) + + self.value = nn.Sequential( + nn.Linear(state_dim, hidden), + nn.ReLU(), + nn.Linear(hidden, 1) + ) + + self.optimizer = torch.optim.Adam( + list(self.policy.parameters()) + list(self.value.parameters()), + lr=lr + ) + self.clip_ratio = clip_ratio + + def train_step(self, states, actions, rewards, old_log_probs): + """ + Train on collected batch using clipped surrogate loss + + Args: + states: batch of states + actions: batch of actions + rewards: batch of returns + old_log_probs: log probabilities from old policy + """ + states = torch.tensor(states, dtype=torch.float32) + actions = torch.tensor(actions, dtype=torch.long) + returns = torch.tensor(rewards, dtype=torch.float32) + old_log_probs = torch.tensor(old_log_probs, dtype=torch.float32) + + # Get current policy and value + logits = self.policy(states) + values = self.value(states).squeeze() + + # New log probabilities under current policy + log_probs = torch.nn.functional.log_softmax(logits, dim=-1) + log_prob_actions = log_probs.gather(1, actions.unsqueeze(1)).squeeze() + + # Probability ratio (new policy / old policy) + ratio = torch.exp(log_prob_actions - old_log_probs) + + # Compute advantages with normalization + advantages = returns - values.detach() + advantages = (advantages - advantages.mean()) / (advantages.std() + 1e-8) + + # Clipped surrogate loss (PPO's key contribution) + unclipped = ratio * advantages + clipped = torch.clamp(ratio, 1 - self.clip_ratio, 1 + self.clip_ratio) * advantages + policy_loss = -torch.min(unclipped, clipped).mean() + + # Value loss + value_loss = ((returns - values) ** 2).mean() + + # Entropy bonus (exploration) + entropy = -(log_probs * torch.exp(log_probs)).sum(dim=-1).mean() + + # Total loss + loss = policy_loss + 0.5 * value_loss - 0.01 * entropy + + self.optimizer.zero_grad() + loss.backward() + torch.nn.utils.clip_grad_norm_(self.policy.parameters(), 0.5) + self.optimizer.step() + + return loss.item() +``` + +### PPO for Continuous Actions + +```python +class PPOContinuous: + def __init__(self, state_dim, action_dim, hidden=128, lr=0.0003): + self.mean_network = nn.Sequential( + nn.Linear(state_dim, hidden), + nn.ReLU(), + nn.Linear(hidden, action_dim) + ) + self.log_std = nn.Parameter(torch.zeros(action_dim)) + + self.value = nn.Sequential( + nn.Linear(state_dim, hidden), + nn.ReLU(), + nn.Linear(hidden, 1) + ) + + self.optimizer = torch.optim.Adam( + list(self.mean_network.parameters()) + + list(self.value.parameters()) + + [self.log_std], + lr=lr + ) + 
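        # Trust-region width for the ratio clip; note that train_step below
        # hard-codes the equivalent bounds [0.8, 1.2] rather than reading this.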
self.clip_ratio = 0.2 + + def compute_log_prob(self, states, actions): + """Compute log probability of actions under Gaussian policy""" + mean = self.mean_network(states) + std = torch.exp(self.log_std) + + var = std.pow(2) + log_prob = -0.5 * ((actions - mean) ** 2 / var).sum(dim=-1) + log_prob -= 0.5 * torch.log(var).sum(dim=-1) + + return log_prob + + def train_step(self, states, actions, rewards, old_log_probs): + states = torch.tensor(states, dtype=torch.float32) + actions = torch.tensor(actions, dtype=torch.float32) + returns = torch.tensor(rewards, dtype=torch.float32) + old_log_probs = torch.tensor(old_log_probs, dtype=torch.float32) + + # Current policy + log_probs = self.compute_log_prob(states, actions) + values = self.value(states).squeeze() + + # Probability ratio + ratio = torch.exp(log_probs - old_log_probs) + + # Advantages + advantages = returns - values.detach() + advantages = (advantages - advantages.mean()) / (advantages.std() + 1e-8) + + # PPO loss + unclipped = ratio * advantages + clipped = torch.clamp(ratio, 0.8, 1.2) * advantages + policy_loss = -torch.min(unclipped, clipped).mean() + + value_loss = ((returns - values) ** 2).mean() + + loss = policy_loss + 0.5 * value_loss + + self.optimizer.zero_grad() + loss.backward() + self.optimizer.step() +``` + +### PPO Pitfall #3: Wrong Clip Ratio Selection + +**Scenario**: User sets clip_ratio=0.5 (too large) and agent learns very slowly. + +**Problem**: + +``` +Clip ratio controls trust region size: +- Too small (0.05): policy restricted too much, underfits trajectories +- Too large (0.5): policy can change drastically, causes instability + +Standard 0.2 works for most problems. +For sensitive environments: use 0.1 +For robust environments: can use 0.3 +``` + +**Red Flag**: If PPO isn't learning, before changing architecture check clip_ratio. + + +## Part 5: TRPO vs PPO - Trust Regions + +### TRPO: Trust Region Policy Optimization + +TRPO is the predecessor to PPO, using natural gradient with KL divergence constraint: + +``` +Maximize: E_t[r_t(θ) A_t] + +Subject to: E[KL(π_old || π_new)] ≤ δ (trust region constraint) + +Implementation: natural gradient + conjugate gradient optimization +``` + +### TRPO vs PPO Comparison + +| Aspect | TRPO | PPO | +|--------|------|-----| +| **Optimization** | Second-order (Fisher) + conjugate gradient | First-order (Adam) | +| **Constraint** | KL divergence with Lagrange multiplier | Clipping | +| **Complexity** | High (Fisher matrix computation) | Low (simple clipping) | +| **Sample Efficiency** | Slightly better | Comparable, simpler | +| **Implementation** | Complex (Fisher-vector products) | Simple (few lines) | +| **When to use** | Research/specialized | Production/practical | + +**Rule of thumb**: Use PPO in 99% of cases. TRPO useful when: + +- Researching trust regions +- Very high-dimensional problems where KL constraint matters +- Academic/theoretical work + +### Why PPO Won + +PPO's clipping approximates TRPO's KL constraint much more simply: + +``` +TRPO: Explicit KL divergence in optimization +PPO: Clipping implicitly prevents large policy divergence + +Result: PPO achieves ~95% of TRPO's benefits with ~10% of complexity +``` + + +## Part 6: When to Use Policy Gradients vs Value Methods + +### Decision Framework + +**Use Policy Gradients if**: + +1. **Continuous action space** (position, velocity, torque) + - Value methods need discretization (curse of dimensionality) + - Policy gradients handle continuous naturally + +2. 
**Stochastic policy required** (exploration, risk) + - Policy gradients can naturally be stochastic + - Value methods produce deterministic policies + +3. **Policy space simpler than value space** + - Sometimes policy easier to learn than Q-function + - Especially in high-dimensional state/action spaces + +**Use Value Methods (DQN) if**: + +1. **Discrete action space** where you enumerate all actions + - Sample-efficient with offline capability + - No need for baseline variance reduction + +2. **Off-policy efficiency critical** (limited data) + - DQN naturally off-policy (experience replay) + - Policy gradients typically on-policy + +3. **Small discrete state/action spaces** + - Simpler to implement and tune + - Faster convergence + +### Example Decision Process + +``` +Problem: Robot arm control (continuous 7D joint angles) +- Continuous action space → Use policy gradients +- Can't discretize (7^10 combinations way too many) +- PPO or TRPO appropriate + +Problem: Video game (discrete button presses) +- Discrete actions → Value methods good option +- Can enumerate all actions (4-18 actions typical) +- DQN + variants (Double, Dueling, Rainbow) work well + +Problem: Atari with continuous control modification +- Continuous output → Must use policy gradients +- DQN not applicable +- A3C, PPO, SAC appropriate +``` + + +## Part 7: Common Pitfalls and Debugging + +### Pitfall #4: Reward Scale Issues + +**Scenario**: User trains policy gradient on custom environment, rewards in range [0, 1000], training doesn't work. + +**Problem**: + +``` +Large reward scale affects learning: +- Reward 500 with baseline 400 → advantage 100 → huge gradient +- Same problem with different magnitude masks learning signal +- Solution: Reward clipping or normalization + +Options: +1. Clip: rewards = clip(rewards, -1, 1) [if known range] +2. Running normalization: track reward mean/std +3. Scaling: rewards = rewards / max_reward +``` + +**Solution**: + +```python +# Running normalization +class RewardNormalizer: + def __init__(self): + self.mean = 0 + self.var = 1 + self.count = 0 + + def normalize(self, reward): + self.count += 1 + delta = reward - self.mean + self.mean += delta / self.count + self.var = self.var * (self.count - 1) / self.count + \ + delta ** 2 / self.count + return (reward - self.mean) / (np.sqrt(self.var) + 1e-8) +``` + +### Pitfall #5: Vanishing Gradients with Small Variance + +**Scenario**: Policy converges to deterministic (low variance), gradients become near-zero, learning plateaus. + +**Problem**: + +``` +In Gaussian policy: std = exp(log_std) +If log_std → -∞, then std → 0, policy deterministic +Deterministic policy: ∇log π ≈ 0 (flat log-prob landscape) +Gradient vanishes, learning stops +``` + +**Solution**: Entropy bonus or minimum std: + +```python +# Option 1: Entropy bonus (favors exploration) +entropy = 0.5 * torch.sum(torch.log(2 * torch.pi * torch.e * std)) +loss = policy_loss - 0.01 * entropy # Encourage exploration + +# Option 2: Minimum std (hard constraint) +log_std = torch.clamp(log_std, min=-20) # Prevent std→0 +``` + +### Pitfall #6: Credit Assignment Over Long Horizons + +**Scenario**: Multi-step MDP with long trajectories, policy learns late-step actions but not early steps. + +**Problem**: + +``` +Return G_t = r_t + γr_{t+1} + γ²r_{t+2} + ... + γ^T r_T + +For early steps (t=0) with T=1000: +- Return depends on 1000 future steps +- Huge variance from far-future rewards +- Gradient signal diluted: what caused reward at t=800? 
+ +Solution: n-step advantages + GAE (Generalized Advantage Estimation) +``` + +**GAE Solution**: + +```python +def gae(rewards, values, gamma=0.99, lambda_=0.95): + """Generalized Advantage Estimation""" + advantages = [] + gae_value = 0 + + for t in reversed(range(len(rewards))): + # TD residual + if t < len(rewards) - 1: + td_residual = rewards[t] + gamma * values[t+1] - values[t] + else: + td_residual = rewards[t] - values[t] + + # GAE accumulation + gae_value = td_residual + gamma * lambda_ * gae_value + advantages.insert(0, gae_value) + + return torch.tensor(advantages, dtype=torch.float32) +``` + +### Pitfall #7: Batch Size and Training Stability + +**Scenario**: User trains on tiny batches (batch_size=4), gets highly unstable gradient estimates. + +**Problem**: + +``` +Advantages have huge variance on small batches: +- Batch of 4 experiences: advantages might be [-500, -300, 200, 1500] +- Mean/std computation unstable (high variance) +- Normalized advantages: might be [-0.2, -0.1, 0.3, 0.8] +- This varies wildly per batch → training unstable + +Solution: Larger batches (256-4096 depending on problem) +``` + +**Rule of thumb**: + +``` +- Simple problems: batch_size=256 +- Complex continuous control: batch_size=2048-4096 +- Make sure: max_episode_length << batch_size (decorrelation) +``` + +### Pitfall #8: Learning Rate Too High + +**Scenario**: Policy gradient loss oscillates wildly, returns sometimes improve then collapse. + +**Problem**: + +``` +Policy gradient updates are on probability distribution: +- Large learning rate → policy changes drastically per step +- KL divergence between successive policies huge +- Training unstable, collapse to local minima + +PPO's clipping helps but doesn't eliminate problem +``` + +**Solution**: Conservative learning rates: + +``` +- Discrete (softmax): lr=0.001-0.0003 +- Continuous (Gaussian): lr=0.0003-0.0001 + +Can use learning rate schedule to decay: +lr = base_lr * (1 - progress) +``` + +### Red Flags Summary + +| Red Flag | Likely Cause | Fix | +|----------|-------------|-----| +| Training extremely noisy | Missing baseline or unnormalized advantages | Add value network + advantage normalization | +| Loss spikes, returns collapse | Learning rate too high or clip_ratio wrong | Reduce lr, check clip_ratio (0.2 standard) | +| Policy converges to deterministic | Low entropy bonus | Add entropy term: `loss - 0.01 * entropy` | +| Slow learning, high sample inefficiency | REINFORCE without advantages | Use PPO or add baselines | +| Early steps not learned | Long horizon credit assignment | Use GAE for advantage estimation | +| Gradient NaN or divergence | Reward scale or gradient explosion | Clip rewards or use gradient norm clipping | + + +## Part 8: Discrete vs Continuous Actions + +### Discrete Actions (Softmax Policy) + +**Network output**: logits for each action + +```python +# Policy network output +logits = network(state) # shape: [batch, num_actions] + +# Convert to probabilities (softmax) +probs = torch.softmax(logits, dim=-1) # [batch, num_actions] + +# Sample action +dist = torch.distributions.Categorical(probs) +action = dist.sample() + +# Log probability of specific action +log_prob = dist.log_prob(action) +``` + +**Key points**: + +- Output dimensionality = number of discrete actions +- Softmax ensures valid probability distribution +- Log-probability: `log(π(a|s))` + +### Continuous Actions (Gaussian Policy) + +**Network output**: mean and std for Gaussian distribution + +```python +# Policy network outputs +mean = 
mean_network(state) # shape: [batch, action_dim] +log_std = log_std_param # learnable parameter + +std = torch.exp(log_std) # ensure std > 0 + +# Create Gaussian distribution +dist = torch.distributions.Normal(mean, std) + +# Sample action +action = dist.sample() + +# Log probability +log_prob = dist.log_prob(action).sum(dim=-1) # sum across dimensions +``` + +**Key points**: + +- Output dimensionality = action dimensionality +- Parameterize as log_std (ensures std > 0) +- Log-probability sums across action dimensions + +### Implementation Comparison + +```python +class DiscretePolicy(nn.Module): + def __init__(self, state_dim, action_dim): + super().__init__() + self.net = nn.Linear(state_dim, action_dim) + + def forward(self, state): + logits = self.net(state) + return torch.softmax(logits, dim=-1) + + def log_prob(self, state, action): + probs = self.forward(state) + return torch.log(probs[torch.arange(len(probs)), action]) + + +class ContinuousPolicy(nn.Module): + def __init__(self, state_dim, action_dim): + super().__init__() + self.mean_net = nn.Linear(state_dim, action_dim) + self.log_std = nn.Parameter(torch.zeros(action_dim)) + + def forward(self, state): + mean = self.mean_net(state) + std = torch.exp(self.log_std) + return mean, std + + def log_prob(self, state, action): + mean, std = self.forward(state) + var = std ** 2 + log_prob = -0.5 * ((action - mean) ** 2 / var).sum(dim=-1) + log_prob -= 0.5 * torch.log(var).sum(dim=-1) + return log_prob +``` + + +## Part 9: Implementation Pitfalls Table + +| Pitfall | Symptom | Root Cause | Fix | +|---------|---------|-----------|-----| +| High variance learning | Noisy loss, slow convergence | REINFORCE without baseline | Add value network baseline | +| Training instability | Loss spikes, returns collapse | Unnormalized advantages | Standardize advantages: `(A - μ) / (σ + ε)` | +| Premature convergence | Policy converges to deterministic, learning stops | Low entropy | Add entropy bonus: `-β * entropy` | +| Slow learning | Excellent training behavior but very slow | Too conservative clip_ratio | Try clip_ratio=0.3 or 0.2 | +| Gradient explosion | NaN loss, divergence | Reward scale or bad gradients | Clip rewards or add grad norm clipping | +| Early steps not learned | Late steps work, early steps ignored | Long horizon variance | Use GAE for advantage estimation | +| Value function divergence | Value loss increasing | Value learning rate too high | Reduce value_loss coefficient or lr | +| Mode collapse | Policy too deterministic despite entropy | std too small or initialization | Increase entropy coefficient or initialize higher | + + +## Part 10: Testing Scenarios (13+) + +1. **Basic REINFORCE** on CartPole: implement vanilla algorithm +2. **REINFORCE with Baseline** on CartPole: compare variance reduction +3. **PPO Discrete** on CartPole: verify clipping mechanism +4. **PPO Continuous** on MuJoCo: test Gaussian policy +5. **Advantage Normalization** effect: show stability improvement +6. **Batch Size Impact** on training: variance vs stability tradeoff +7. **Reward Scaling** in custom environment: demonstrate necessity +8. **Clip Ratio Sensitivity** in PPO: test different epsilon values +9. **Entropy Bonus** effect: exploration vs exploitation +10. **GAE vs Monte Carlo Returns** on long horizon task: credit assignment +11. **Learning Rate Sensitivity** across discrete and continuous +12. **Value Network Architecture** impact on baseline quality +13. 
**Policy vs Value Method Selection** framework validation + + +## Part 11: Rationalization Table - When Users Get It Wrong + +| User Claim | Rationalization | Correct Approach | +|-----------|-----------------|------------------| +| "DQN should work for continuous actions, I'll discretize" | Can discretize but: curse of dimensionality (7D joint→7^n combos), loses continuous structure, very inefficient | Use policy gradients (PPO, SAC) naturally designed for continuous | +| "REINFORCE is too slow, must be something wrong" | REINFORCE has high variance by design. Problem: not using baseline | Add value network baseline (variance reduction) or switch to PPO | +| "PPO clip ratio 0.5 is more aggressive, should converge faster" | Larger clip ratio = larger trust region = less stability. Faster ≠ better | Use 0.2 (standard) or 0.15-0.3 range. Larger can diverge | +| "Policy gradients have huge variance, value-based better for all problems" | Confusion: policy gradients handle continuous actions, DQN doesn't | Choose based on action space: discrete→consider DQN, continuous→policy gradients | +| "I should use very small learning rate like 0.0001 to be safe" | Too conservative: policy learns very slowly, gets stuck in local minima | Use 0.001-0.0003 for discrete, 0.0003-0.0001 for continuous. Test decay. | +| "Unnormalized advantages are fine, I'll just use small learning rate" | Small LR doesn't fix variance explosion in gradients, just masks problem | Normalize advantages: `(A - mean) / (std + ε)` properly | +| "I'll use huge batch size (100k) for stability" | Diminishing returns: beyond 2048 doesn't improve stability, wastes computation | Use 256-4096 depending on problem complexity | +| "Policy should converge to deterministic (low std) for best performance" | Common misconception: deterministic policies get stuck, can't explore | Keep some exploration: entropy bonus prevents premature convergence | +| "TRPO is better than PPO because it's more sophisticated" | Confusing complexity with effectiveness: PPO achieves ~95% TRPO performance with 10% complexity | Use PPO for production unless researching KL constraints | +| "My value network loss oscillates, means gradients bad" | Value oscillation normal during learning. 
Only problematic if diverging | Add value loss decay, reduce coeff: `loss = policy + 0.1 * value_loss` | + + +## Summary: What You Need to Know + +**Policy Gradient Foundation**: + +- Policy gradients directly optimize π(a|s,θ) via ∇_θ J(θ) +- Score function ∇_θ log π (gradient of log-probability) crucial for differentiability +- High variance is the key challenge (solved by baselines) + +**REINFORCE**: + +- Simplest policy gradient algorithm +- Without baseline: high variance, slow learning +- Useful for understanding, not for production + +**Baselines & Advantages**: + +- Baseline b(s) reduces variance without changing policy +- Advantage A(s,a) = Q(s,a) - V(s) captures relative action quality +- Advantage normalization critical for training stability + +**PPO**: + +- Most practical policy gradient method (simple + effective) +- Clipping enforces trust region (prevents destructive updates) +- Works for discrete and continuous actions +- Use: clip_ratio=0.2, entropy_coeff=0.01, value_loss_coeff=0.5 + +**TRPO**: + +- Natural gradient + KL constraint (trust region) +- More sophisticated but rarely necessary +- PPO achieves ~95% effectiveness with 10% complexity + +**Algorithm Selection**: + +- Continuous actions → Policy gradients (PPO, SAC) +- Discrete actions → Value methods (DQN) or policy gradients +- Stochastic policy needed → Policy gradients +- Maximum sample efficiency → DQN (if discrete) + +**Key Implementation Details**: + +- Advantage normalization: `(A - mean) / (std + ε)` +- Learning rates: 0.001-0.0003 (discrete), 0.0003-0.0001 (continuous) +- Batch size: 256-4096 (larger = more stable but slower) +- Entropy bonus: `-0.01 * entropy` (prevents mode collapse) +- Reward scaling: normalize or clip for stability +- Gradient clipping: `clip_grad_norm_(params, 0.5)` prevents explosion + +**Red Flags**: + +- Training noisy/slow → Missing baseline +- Loss spikes/instability → Unnormalized advantages, high LR, or clip_ratio wrong +- Deterministic policy → Insufficient entropy +- Gradient NaN → Reward scale or gradient explosion +- Early steps not learning → Need GAE for long horizon + + +## Part 12: Advanced Architecture Considerations + +### Network Capacity and Policy Learning + +Policy network capacity affects convergence speed and final performance: + +**Under-parameterized** (network too small): + +``` +Problem: Network can't represent optimal policy +- Limited expressivity → stuck with poor solutions +- Example: 2-hidden unit network for complex navigation +- Result: High bias, underfitting + +Solution: Increase hidden units (128, 256, 512) +Tradeoff: Slower training but better capacity +``` + +**Over-parameterized** (network too large): + +``` +Problem: Network overfits to finite trajectory samples +- Example: 4096-hidden network for simple CartPole +- Result: Fits noise in returns, poor generalization +- But: Modern networks use dropout/regularization + +Solution: Standard sizes (128-512 hidden units) +Rule: Match capacity to task complexity +``` + +**Shared vs Separate Networks**: + +```python +# Option 1: Separate policy and value networks +class SeparateNetworks: + policy = nn.Sequential(...) # outputs action logits + value = nn.Sequential(...) # outputs single value + +# Option 2: Shared trunk, separate heads +class SharedTrunk: + trunk = nn.Sequential(...) # shared hidden layers + policy_head = nn.Linear(...) # policy logits + value_head = nn.Linear(...) 
# value estimate + +# Option 3: Fully shared (rare, not recommended) +# Single network outputs both logits and value +``` + +**Recommendation**: Separate networks are cleaner, shared trunk is common in practice (efficiency), fully shared not recommended. + +### Activation Functions and Training + +Choice of activation function affects gradient flow: + +**ReLU** (most common): + +``` +Advantages: +- Fast computation +- Prevents vanishing gradients +- Works well with batch normalization + +Disadvantages: +- Dead ReLU problem (some units permanently inactive) +- Not smooth at zero +``` + +**Tanh/Sigmoid** (older, less common): + +``` +Advantages: +- Smooth gradient everywhere +- Bounded output [-1, 1] + +Disadvantages: +- Can suffer vanishing gradients in deep networks +- Slower computation +``` + +**LeakyReLU** (middle ground): + +``` +Advantages: +- Fixes dead ReLU (small gradient when inactive) +- Still fast + +Disadvantages: +- Extra hyperparameter (leak rate) +- Rarely needed for policy networks +``` + +**Recommendation**: Use ReLU for standard problems, LeakyReLU if debugging dead unit issues, avoid Tanh for policy networks. + +### Gradient Flow and Initialization + +Proper initialization prevents gradient explosion/vanishing: + +```python +# Good initialization (Xavier/He) +nn.Linear(in_features, out_features) # PyTorch default is good + +# Manual Xavier initialization +nn.init.xavier_uniform_(layer.weight) + +# Manual He initialization (better for ReLU) +nn.init.kaiming_uniform_(layer.weight, nonlinearity='relu') + +# For output layers (policy logits) +# Often initialized smaller for stability +nn.init.uniform_(policy_output.weight, -3e-3, 3e-3) +nn.init.uniform_(policy_output.bias, -3e-3, 3e-3) +``` + +**Why it matters**: Poor initialization → vanishing/exploding gradients → training doesn't work. 
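For concreteness, the sketch below wires these initialization choices into a small policy network. It is illustrative only: the `init_layer` helper, the layer sizes, and the 3e-3 output scale are assumptions following the guidance above, not a fixed recipe.

```python
import torch.nn as nn

def init_layer(layer, output_layer=False):
    """He init for ReLU hidden layers; small uniform init for the output layer."""
    if output_layer:
        # Small weights -> logits near zero -> near-uniform initial policy
        nn.init.uniform_(layer.weight, -3e-3, 3e-3)
        nn.init.uniform_(layer.bias, -3e-3, 3e-3)
    else:
        nn.init.kaiming_uniform_(layer.weight, nonlinearity='relu')
        nn.init.zeros_(layer.bias)
    return layer

def build_policy(state_dim, action_dim, hidden=128):
    return nn.Sequential(
        init_layer(nn.Linear(state_dim, hidden)),
        nn.ReLU(),
        init_layer(nn.Linear(hidden, hidden)),
        nn.ReLU(),
        init_layer(nn.Linear(hidden, action_dim), output_layer=True),
    )

# Usage: policy = build_policy(state_dim=4, action_dim=2)
```

The small output-layer initialization keeps the initial action distribution close to uniform, so the first gradient updates are well-behaved rather than dominated by an arbitrary starting policy.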
+ + +## Part 13: Practical Hyperparameter Tuning + +### Systematic Hyperparameter Search + +Instead of random guessing, systematic approach: + +**Step 1: Start with defaults** + +```python +config = { + 'learning_rate': 0.0003, # Standard + 'clip_ratio': 0.2, # Standard + 'entropy_coeff': 0.01, # Standard + 'value_loss_coeff': 0.5, # Standard + 'batch_size': 64, # Small, test first + 'num_epochs': 3, # Updates per batch +} +``` + +**Step 2: Test on simple environment** + +``` +- Run on CartPole or simple continuous task +- If works: move to harder task +- If fails: diagnose why before tuning +``` + +**Step 3: Tune based on observed problem** + +``` +If training is noisy: + ↓ increase batch_size (256, 512) + ↓ increase num_epochs (5-10) + ↓ decrease learning_rate (0.0001) + +If training is slow: + ↑ increase learning_rate (0.001) + ↓ decrease batch_size (32) + ↓ decrease num_epochs (1) + +If policy deterministic: + ↑ entropy_coeff (0.05, 0.1) + ↑ minimum std (increase from 1e-6) + +If value loss not decreasing: + ↑ value_loss_coeff (1.0, 2.0) + ↑ learning_rate for value +``` + +**Step 4: Grid or random search on subset** + +```python +# Don't search all combinations, search subset +hyperparams = { + 'learning_rate': [0.0001, 0.0003, 0.001], + 'batch_size': [64, 256, 1024], + 'entropy_coeff': [0.0, 0.01, 0.05], +} + +# Random sample 10 combinations +import random +samples = 10 +configs = [ + {k: random.choice(v) for k, v in hyperparams.items()} + for _ in range(samples) +] + +# Evaluate each +for config in configs: + train_and_evaluate(config) +``` + +### Environment-Specific Tuning + +Different environments benefit from different settings: + +**Simple Discrete** (CartPole, MountainCar): + +``` +batch_size: 64-256 +learning_rate: 0.001 +entropy_coeff: 0.01 +clip_ratio: 0.2 +``` + +**Complex Discrete** (Atari): + +``` +batch_size: 2048-4096 +learning_rate: 0.0001-0.0003 +entropy_coeff: 0.001-0.01 +clip_ratio: 0.1-0.2 +``` + +**Continuous Control** (MuJoCo): + +``` +batch_size: 2048-4096 +learning_rate: 0.0003 +entropy_coeff: 0.01 +clip_ratio: 0.2 +num_epochs: 10-20 +``` + +**Custom Environments**: + +``` +1. Start with continuous defaults +2. Monitor advantage statistics +3. Check entropy over training +4. Adjust based on observations +``` + + +## Part 14: Monitoring and Debugging Tools + +### Key Metrics to Monitor + +**Policy Loss Metrics**: + +```python +# 1. Expected return (should increase) +expected_return = sum(episode_rewards) / num_episodes +# Should show clear upward trend + +# 2. Advantage statistics (should be normalized) +mean_advantage = advantages.mean() # Should be ~0 +std_advantage = advantages.std() # Should be ~1 +# Red flag: if std too small (<0.1) or huge (>10) + +# 3. Policy entropy (should not approach zero) +entropy = -(probs * log(probs)).sum() +# Red flag: if entropy → 0 (policy deterministic) + +# 4. Policy ratio statistics +ratio = new_policy / old_policy +# Should be ~1.0 with small std +# Red flag: if mean >> 1.2 or << 0.8 +``` + +**Value Function Metrics**: + +```python +# 1. Value function loss (should decrease) +value_loss = (returns - value_estimates).pow(2).mean() +# Should decrease over training + +# 2. Explained variance (higher better) +# How much variance in returns does value explain? +residuals = returns - value_estimates +explained_var = 1 - (residuals.var() / returns.var()) +# Good: > 0.8, Bad: < 0.5 + +# 3. 
Value estimate magnitude +# Should be reasonable scale +value_mean = value_estimates.mean() +value_std = value_estimates.std() +# Sanity check with return_mean, return_std +``` + +**Gradient Metrics**: + +```python +# 1. Gradient magnitude (should be stable) +for name, param in model.named_parameters(): + if param.grad is not None: + grad_norm = param.grad.norm() + # Monitor: should be in [1e-5, 1e1] typically + +# 2. Gradient explosion warning +if grad_norm > 1.0: + print("Warning: large gradients, consider clipping or smaller lr") + +# 3. Gradient vanishing warning +if grad_norm < 1e-6: + print("Warning: tiny gradients, check entropy and baseline") +``` + +### Debugging Checklist + +When training fails: + +```python +# 1. Verify data collection +# Sample random trajectory, inspect: +assert len(trajectory) > 0 +assert all(not isnan(r) for r in rewards) +assert states.shape[0] == actions.shape[0] == len(rewards) + +# 2. Check advantage computation +assert advantages.mean() < 0.1 # Should normalize to ~0 +assert advantages.std() > 0.1 # Should have variance + +# 3. Verify policy gradient +assert policy_loss > 0 # Should be positive +assert policy_loss < 1e6 # Not explosion + +# 4. Check value loss +assert value_loss > 0 +assert value_loss < 1e6 + +# 5. Monitor entropy +assert entropy > 0 # Non-zero +assert entropy < -log(1/num_actions) # Not above maximum + +# 6. Inspect learning +assert returns[-100:].mean() > returns[0:100].mean() # Improving +``` + + +## Part 15: Common Implementation Mistakes + +### Mistake #1: Probability Ratio Bugs in PPO + +```python +# WRONG: using raw probabilities +ratio = new_probs[action] / old_probs[action] + +# CORRECT: using log-probabilities (numerically stable) +ratio = torch.exp(log_new_probs[action] - log_old_probs[action]) + +# Why: log is numerically stable, avoids underflow for small probs +``` + +### Mistake #2: Advantage Sign Errors + +```python +# WRONG: computing advantage incorrectly +advantage = baseline - return # Negative of what it should be! + +# CORRECT: advantage is how much better than baseline +advantage = return - baseline + +# Consequence: policy updated opposite direction +``` + +### Mistake #3: Log-Probability Dimension Issues + +```python +# For continuous actions: + +# WRONG: forgetting to sum across dimensions +log_prob = dist.log_prob(action) # shape: [batch, action_dim] +loss = (log_prob * advantage).mean() # broadcasting error or wrong reduction + +# CORRECT: sum log-probs across action dimensions +log_prob = dist.log_prob(action).sum(dim=-1) # shape: [batch] +loss = (log_prob * advantage).mean() + +# Why: each action dimension contributes to overall probability +``` + +### Mistake #4: Detaching Value Estimates + +```python +# WRONG: advantages affect value network gradients +advantages = returns - values +loss = -log_prob * advantages # values included in gradient! + +# CORRECT: advantages should not backprop to value during policy update +advantages = returns - values.detach() +policy_loss = -log_prob * advantages # doesn't affect value +value_loss = (returns - values).pow(2).mean() # separate value update + +# Why: policy gradient and value loss are separate objectives +``` + +### Mistake #5: Entropy Computation for Discrete + +```python +# WRONG: using formula for continuous +entropy = 0.5 * log(2 * pi * e * std) # Only for Gaussian! 
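# (even for a diagonal Gaussian, the per-dimension entropy 0.5*log(2*pi*e*std**2)
#  must be summed over action dimensions)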
+ +# CORRECT: for categorical +entropy = -(probs * log(probs + 1e-8)).sum() + +# Or using distribution: +dist = Categorical(probs) +entropy = dist.entropy() +``` + +### Mistake #6: Old Policy Mismatch in PPO + +```python +# WRONG: updating policy, then computing ratio with same policy +for epoch in range(num_epochs): + logits = policy(states) + log_probs_new = log_softmax(logits) + ratio = exp(log_probs_new - old_log_probs) + loss = clip_loss(ratio, advantages) + update(loss) # Modifies policy! + +# CORRECT: keep old policy fixed during epochs +old_policy_state = policy.state_dict() # Save +for epoch in range(num_epochs): + logits = policy(states) + log_probs_new = log_softmax(logits) + # old_log_probs based on old policy, fixed! + ratio = exp(log_probs_new - old_log_probs) + loss = clip_loss(ratio, advantages) + update(loss) +``` + + +## Part 16: Performance Tips + +### Computation Optimization + +**Batch Processing**: + +```python +# SLOW: processing one example at a time +for state in states: + action = sample_action(state) + +# FAST: batch all at once +actions = sample_actions(states) # Vectorized + +# Speedup: 10-100x on GPU +``` + +**In-place Operations**: + +```python +# Standard (creates new tensor) +advantages = (advantages - advantages.mean()) / (advantages.std() + 1e-8) + +# In-place (reuses memory) +advantages.sub_(advantages.mean()).div_(advantages.std() + 1e-8) + +# Speedup: slight, more important for memory +``` + +**Mixed Precision Training**: + +```python +# Use float32 for stability, computation optional +# PyTorch automatic mixed precision: +from torch.cuda.amp import autocast + +with autocast(): + loss = compute_loss(...) + loss.backward() +``` + +### Memory Optimization + +**Trajectory Batching**: + +```python +# Collect multiple trajectories before update +trajectories = [] +for episode in range(num_episodes): + traj = collect_episode() + trajectories.append(traj) + +# Stack into batch +states = torch.cat([t['states'] for t in trajectories]) # All at once +# vs creating states during collection (huge memory spike) +``` + +**Value Function Caching**: + +```python +# For long trajectories, compute value once +values = value_network(states) # Compute all at once +# vs computing in loop during advantage computation +``` + + +## Part 17: When to Abandon Policy Gradients + +### Problems Not Suited for Policy Gradients + +**Sample Inefficiency Critical**: + +``` +Example: Robot learning from limited real-world rollouts +- Policy gradients: on-policy, need lots of data +- Better: Offline RL, DQN with replay buffer +- Switch to: offline-rl-methods skill +``` + +**Discrete Action Space + Limited Data**: + +``` +Example: 4-action game with 1M sample budget +- Policy gradients: require many trajectories +- Better: DQN (off-policy, experience replay) +- Switch to: value-based-methods skill +``` + +**Exploration Bonus Needed**: + +``` +Example: Sparse reward environment +- Policy gradients: entropy bonus minimal help +- Better: Curiosity-driven, intrinsic motivation +- Switch to: exploration-methods (if exists) +``` + +**Simulation Constraints**: + +``` +Example: Model-based planning where you have world model +- Policy gradients: ignore model information +- Better: Model-based planning (CEM, MPPI) +- Switch to: model-based-rl skill +``` + +### Red Flags for Switching Algorithms + +| Signal | Consider Switching To | +|--------|----------------------| +| Discrete actions + huge sample budget → Can't improve | DQN or Rainbow (value-based-methods) | +| Sparse rewards, no progress → 
Can't reward learning | Curiosity or hindsight (exploration methods) | +| Have access to world model → Not using it | Model-based planning (model-based-rl) | +| Need off-policy data efficiency → On-policy slow | Offline RL or DQN (offline-rl-methods or value-based) | +| Multimodal solution space → Single mode explored | Evolutionary algorithms or diverse policies | diff --git a/skills/using-deep-rl/reward-shaping-engineering.md b/skills/using-deep-rl/reward-shaping-engineering.md new file mode 100644 index 0000000..f00b8c9 --- /dev/null +++ b/skills/using-deep-rl/reward-shaping-engineering.md @@ -0,0 +1,1050 @@ + +# Reward-Shaping Engineering + +## When to Use This Skill + +Invoke this skill when you encounter: + +- **Reward Design**: "How do I design the reward function for my task?" +- **Slow Learning**: "Training is extremely slow with sparse rewards" or "Dense rewards cause weird behavior" +- **Reward Hacking**: "My agent learned a trick that works in training but fails on test", "Agent oscillating instead of balancing" +- **Potential-Based Shaping**: "How to add shaping without breaking the optimal policy?" +- **Distance-Based Rewards**: "How to reward progress toward goal without hacking?" +- **Inverse RL**: "I have expert demonstrations, can I learn reward from them?" +- **Auxiliary Rewards**: "Should I add helper rewards like action smoothness?" +- **Reward Scale Issues**: "Training feels different when rewards change magnitude" +- **Sparse vs Dense**: "When should I use sparse vs dense rewards?" +- **Reward Validation**: "How do I verify my reward function is correct?" +- **Credit Assignment**: "How to help agent understand which actions led to success?" +- **Normalization**: "Should I clip or normalize rewards?" + +**This skill provides systematic frameworks and concrete patterns for reward engineering.** + +Do NOT use for: + +- Algorithm selection (route to rl-foundations or specific algorithm skill) +- General RL debugging (route to rl-debugging-methodology) +- Exploration strategies (route to exploration-strategies) +- Environment design (route to environment-design-patterns) + + +## Core Principle + +**Reward design is often the hardest part of RL.** The reward function defines the entire objective the agent optimizes. A poorly designed reward either: + +1. Learns something unintended (reward hacking) +2. Learns slowly due to sparse/noisy signal (credit assignment crisis) +3. Learns correctly but unstably due to scale/normalization issues + +The key insight: **You're solving an inverse problem.** You want an agent that achieves behavior X. You need to specify function R(s,a,s') such that optimal policy under R produces behavior X. This is much harder than it sounds because: + +- Agents optimize expected return, not intentions (find loopholes) +- Credit assignment requires clear reward signal (sparse rewards fail) +- Scale/normalization matters (reward magnitude affects gradients) +- Shaping can help or hurt (need to preserve optimal policy) + + +## Part 1: Reward Design Principles + +### Principle 1: Reward Must Align With Task + +**The Problem**: You want agent to do X, but reward incentivizes Y. 
+ +**Example (CartPole)**: + +- Task: Balance pole in center for as long as possible +- Bad reward: +1 per step (true) → Agent learns to oscillate side-to-side (unintended but gets +1 every step) +- Better reward: +1 per step centered + penalty for deviation + +**Example (Robotics)**: + +- Task: Grasp object efficiently +- Bad reward: Just +1 when grasped → Agent grasps sloppily, jerky movements +- Better reward: +1 for grasp + small penalty per action (reward efficiency) + +**Pattern**: Specify WHAT success looks like, not HOW to achieve it. Let agent find the HOW. + +```python +# Anti-pattern: Specify HOW +bad_reward = -0.1 * np.sum(np.abs(action)) # Penalize movement + +# Pattern: Specify WHAT +good_reward = (1.0 if grasp_success else 0.0) + (-0.01 * np.sum(action**2)) +# Says: Success is good, movements have small cost +# Agent figures out efficient movements to minimize action cost +``` + +### Principle 2: Reward Should Enable Credit Assignment + +**The Problem**: Sparse rewards mean agent can't learn which actions led to success. + +**Example (Goal Navigation)**: + +- Sparse: Only +1 when reaching goal (1 in 1000 episodes maybe) +- Agent can't tell: Did action 10 steps ago help or action 5 steps ago? +- Solution: Add shaping reward based on progress + +**Credit Assignment Window**: + +``` +Short window (< 10 steps): Need dense rewards every 1-2 steps +Medium window (10-100 steps): Reward every 5-10 steps OK +Long window (> 100 steps): Sparse rewards very hard, need shaping +``` + +**When to Add Shaping**: + +- Episode length > 50 steps AND sparse rewards +- Agent can't achieve >10% success after exploring + +### Principle 3: Reward Should Prevent Hacking + +**The Problem**: Agent finds unintended loopholes. + +**Classic Hacking Patterns**: + +1. **Shortcut Exploitation**: Taking unintended path to goal + - Example: Quadruped learns to flip instead of walk + - Solution: Specify movement requirements in reward + +2. **Side-Effect Exploitation**: Achieving side-effect that gives reward + - Example: Robotic arm oscillating (gets +1 per step for oscillation) + - Solution: Add penalty for suspicious behavior + +3. **Scale Exploitation**: Abusing unbounded reward dimension + - Example: Agent learns to get reward signal to spike → oscillates + - Solution: Use clipped/normalized rewards + +**Prevention Framework**: + +```python +def design_robust_reward(s, a, s_next): + # Core task reward + task_reward = compute_task_reward(s_next) + + # Anti-hacking penalties + action_penalty = -0.01 * np.sum(a**2) # Penalize unnecessary action + suspension_penalty = check_suspension(s_next) # Penalize weird postures + + return task_reward + action_penalty + suspension_penalty +``` + +### Principle 4: Reward Scale and Normalization Matter + +**The Problem**: Reward magnitude affects gradient flow. + +**Example**: + +``` +Task A rewards: 0 to 1000 +Task B rewards: 0 to 1 +Same optimizer with fixed learning rate: + Task A: Step sizes huge, diverges + Task B: Step sizes tiny, barely learns + +Solution: Normalize both to [-1, 1] +``` + +**Standard Normalization Pipeline**: + +```python +def normalize_reward(r): + # 1. Clip to reasonable range (prevents scale explosions) + r_clipped = np.clip(r, -1000, 1000) + + # 2. Normalize using running statistics + reward_mean = running_mean(r_clipped) + reward_std = running_std(r_clipped) + r_normalized = (r_clipped - reward_mean) / (reward_std + 1e-8) + + # 3. 
Clip again to [-1, 1] for stability + return np.clip(r_normalized, -1.0, 1.0) +``` + + +## Part 2: Potential-Based Shaping (The Theorem) + +### The Fundamental Problem + +You want to: + +- Help agent learn faster (add shaping rewards) +- Preserve the optimal policy (so shaping doesn't change what's best) + +**The Solution: Potential-Based Shaping** + +The theorem states: If you add shaping reward of form + +``` +F(s, a, s') = γ * Φ(s') - Φ(s) +``` + +where Φ(s) is ANY function of state, then: + +1. Optimal policy remains unchanged +2. Optimal value function shifts by Φ +3. Learning accelerates due to better signal + +**Why This Matters**: You can safely add rewards like distance-to-goal without worrying you're changing what the agent should do. + +### Mathematical Foundation + +Original MDP has Q-function: `Q^π(s,a) = E[R(s,a,s') + γV^π(s')]` + +With potential-based shaping: + +``` +Q'^π(s,a) = Q^π(s,a) + [γΦ(s') - Φ(s)] + = E[R(s,a,s') + γΦ(s') - Φ(s) + γV^π(s')] + = E[R(s,a,s') + γ(Φ(s') + V^π(s')) - Φ(s)] +``` + +The key insight: When computing optimal policy, Φ(s) acts like state-value function offset. Different actions get different Φ values, but relative ordering (which action is best) unchanged. + +**Proof Sketch**: + +- Policy compares Q(s,a₁) vs Q(s,a₂) to pick action +- Both differ by same [γΦ(s') - Φ(s)] at state s +- Relative ordering preserved → same optimal action + +### Practical Implementation + +```python +def potential_based_shaping(s, a, s_next, gamma=0.99): + """ + Compute shaping reward that preserves optimal policy. + + Args: + s: current state + a: action taken + s_next: next state (result of action) + gamma: discount factor + + Returns: + Shaping reward to ADD to environment reward + """ + # Define potential function (e.g., negative distance to goal) + phi = compute_potential(s) + phi_next = compute_potential(s_next) + + # Potential-based shaping formula + shaping_reward = gamma * phi_next - phi + + return shaping_reward + +def compute_potential(s): + """ + Potential function: Usually distance to goal. + + Negative of distance works well: + - States farther from goal have lower potential + - Moving closer increases potential (positive shaping reward) + - Reaching goal gives highest potential + """ + if goal_reached(s): + return 0.0 # Peak potential + else: + distance = euclidean_distance(s['position'], s['goal']) + return -distance # Negative distance +``` + +### Critical Error: NOT Using Potential-Based Shaping + +**Common Mistake**: + +```python +# WRONG: This changes the optimal policy! +shaping_reward = -0.1 * distance_to_goal + +# WHY WRONG: This isn't potential-based. Moving from d=1 to d=0.5 gives: +# Reward = -0.1 * 0.5 - (-0.1 * 1.0) = +0.05 +# But moving from d=3 to d=2.5 gives: +# Reward = -0.1 * 2.5 - (-0.1 * 3.0) = +0.05 +# Same reward for same distance change regardless of state! +# This distorts value function and can change which action is optimal. +``` + +**Right Way**: + +```python +# CORRECT: Potential-based shaping +def shaping(s, a, s_next): + phi_s = -distance(s, goal) # Potential = negative distance + phi_s_next = -distance(s_next, goal) + + return gamma * phi_s_next - phi_s + +# Moving from d=1 to d=0.5: +# shaping = 0.99 * (-0.5) - (-1.0) = +0.495 +# Moving from d=3 to d=2.5: +# shaping = 0.99 * (-2.5) - (-3.0) = +0.475 +# Slightly different, depends on state! Preserves policy. 
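# Summed over a discounted trajectory, these shaping terms telescope to
# gamma^T * Phi(s_T) - Phi(s_0), so the optimal policy is unchanged (Ng et al., 1999).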
+``` + +### Using Potential-Based Shaping + +```python +def compute_total_reward(s, a, s_next, env_reward, gamma=0.99): + """ + Combine environment reward with potential-based shaping. + + Pattern: R_total = R_env + R_shaping + """ + # 1. Get reward from environment + task_reward = env_reward + + # 2. Compute potential-based shaping (safe to add) + potential = -distance_to_goal(s_next) + potential_prev = -distance_to_goal(s) + shaping_reward = gamma * potential - potential_prev + + # 3. Combine + total_reward = task_reward + shaping_reward + + return total_reward +``` + + +## Part 3: Sparse vs Dense Rewards + +### The Fundamental Tradeoff + +| Aspect | Sparse Rewards | Dense Rewards | +|--------|---|---| +| **Credit Assignment** | Hard (credit window huge) | Easy (immediate feedback) | +| **Learning Speed** | Slow (few positive examples) | Fast (constant signal) | +| **Reward Hacking** | Less likely (fewer targets) | More likely (many targets to exploit) | +| **Convergence** | Can converge to suboptimal | May not converge if hacking | +| **Real-World** | Matches reality (goals sparse) | Artificial but helps learning | + +### Decision Framework + +**Use SPARSE when**: + +- Task naturally has sparse rewards (goal-reaching, game win/loss) +- Episode short (< 20 steps) +- You want solution robust to reward hacking +- Final performance matters more than learning speed + +**Use DENSE when**: + +- Episode long (> 50 steps) and no natural sub-goals +- Learning speed critical (limited training budget) +- You can design safe auxiliary rewards +- You'll validate extensively against hacking + +**Use HYBRID when**: + +- Combine sparse task reward with dense shaping +- Example: +1 for reaching goal (sparse) + negative distance shaping (dense) +- This is the most practical approach for long-horizon tasks + +### Design Pattern: Sparse Task + Dense Shaping + +```python +def reward_function(s, a, s_next, done): + """ + Standard pattern: sparse task reward + potential-based shaping. + + This gets the best of both worlds: + - Sparse task reward prevents hacking on main objective + - Dense shaping prevents credit assignment crisis + """ + # 1. Sparse task reward (what we truly care about) + if goal_reached(s_next): + task_reward = 1.0 + else: + task_reward = 0.0 + + # 2. Dense potential-based shaping (helps learning) + gamma = 0.99 + phi_s = -np.linalg.norm(s['position'] - s['goal']) + phi_s_next = -np.linalg.norm(s_next['position'] - s_next['goal']) + shaping_reward = gamma * phi_s_next - phi_s + + # 3. Combine: Sparse main objective + dense guidance + total = task_reward + 0.1 * shaping_reward + # Scale shaping (0.1) relative to task (1.0) so task dominates + + return total +``` + +### Validation: Confirming Sparse/Dense Choice + +```python +def validate_reward_choice(sparse_reward_fn, dense_reward_fn, env, n_trials=10): + """ + Compare sparse vs dense by checking: + 1. Learning speed (how fast does agent improve?) + 2. Final performance (does dense cause hacking?) + 3. Stability (does one diverge?) 
+ """ + results = { + 'sparse': train_agent(sparse_reward_fn, env, n_trials), + 'dense': train_agent(dense_reward_fn, env, n_trials) + } + + # Check learning curves + print("Sparse learning speed:", results['sparse']['steps_to_50pct']) + print("Dense learning speed:", results['dense']['steps_to_50pct']) + + # Check if dense causes hacking + print("Sparse final score:", results['sparse']['final_score']) + print("Dense final score:", results['dense']['final_score']) + + # If dense learned faster AND achieved same/higher score: use dense + validation + # If sparse achieved higher: reward hacking detected in dense +``` + + +## Part 4: Reward Hacking - Patterns and Detection + +### Common Hacking Patterns + +#### Pattern 1: Shortcut Exploitation + +Agent finds unintended path to success. + +**Example (Quadruped)**: + +- Task: Walk forward 10 meters +- Intended: Gait pattern that moves forward +- Hack: Agent learns to flip upside down (center of mass moves forward during flip!) + +**Detection**: + +```python +# Test on distribution shift +if test_on_different_terrain(agent) << train_performance: + print("ALERT: Shortcut exploitation detected") + print("Agent doesn't generalize → learned specific trick") +``` + +**Prevention**: + +```python +def robust_reward(s, a, s_next): + # Forward progress + progress = s_next['x'] - s['x'] + + # Requirement: Stay upright (prevents flipping hack) + upright_penalty = -1.0 if not is_upright(s_next) else 0.0 + + # Requirement: Reasonable movement (prevents wiggling) + movement_penalty = -0.1 * np.sum(a**2) + + return progress + upright_penalty + movement_penalty +``` + +#### Pattern 2: Reward Signal Exploitation + +Agent exploits direct reward signal rather than task. + +**Example (Oscillation)**: + +- Task: Balance pole in center +- Intended: Keep pole balanced +- Hack: Agent oscillates rapidly (each oscillation = +1 reward per step) + +**Detection**: + +```python +def detect_oscillation(trajectory): + positions = [s['pole_angle'] for s in trajectory] + # Count zero crossings + crossings = sum(1 for i in range(len(positions)-1) + if positions[i] * positions[i+1] < 0) + + if crossings > len(trajectory) / 3: + print("ALERT: Oscillation detected") +``` + +**Prevention**: + +```python +def non_hackable_reward(s, a, s_next): + # Task: Balanced pole + balance_penalty = -(s_next['pole_angle']**2) # Reward being centered + + # Prevent oscillation + angle_velocity = s_next['pole_angle'] - s['pole_angle'] + oscillation_penalty = -0.1 * abs(angle_velocity) + + return balance_penalty + oscillation_penalty +``` + +#### Pattern 3: Unbounded Reward Exploitation + +Agent maximizes dimension without bound. 
+ +**Example (Camera Hack)**: + +- Task: Detect object (reward for correct detection) +- Hack: Agent learns to point camera lens at bright light source (always triggers detection) + +**Detection**: + +```python +def detect_unbounded_exploitation(training_history): + rewards = training_history['episode_returns'] + + # Check if rewards growing without bound + if rewards[-100:].mean() >> rewards[100:200].mean(): + print("ALERT: Rewards diverging") + print("Possible unbounded exploitation") +``` + +**Prevention**: + +```python +# Use reward clipping +def clipped_reward(r): + return np.clip(r, -1.0, 1.0) + +# Or normalize +def normalized_reward(r, running_mean, running_std): + r_norm = (r - running_mean) / (running_std + 1e-8) + return np.clip(r_norm, -1.0, 1.0) +``` + +### Systematic Hacking Detection Framework + +```python +def check_for_hacking(agent, train_env, test_envs, holdout_env): + """ + Comprehensive hacking detection. + """ + # 1. Distribution shift test + train_perf = evaluate(agent, train_env) + test_perf = evaluate(agent, test_envs) # Variations of train + + if train_perf >> test_perf: + print("HACKING: Agent doesn't generalize to distribution shift") + return "shortcut_exploitation" + + # 2. Behavioral inspection + trajectory = run_episode(agent, holdout_env) + if has_suspicious_pattern(trajectory): + print("HACKING: Suspicious behavior detected") + return "pattern_exploitation" + + # 3. Reward curve analysis + if rewards_diverging(agent.training_history): + print("HACKING: Unbounded reward exploitation") + return "reward_signal_exploitation" + + return "no_obvious_hacking" +``` + + +## Part 5: Auxiliary Rewards and Shaping Examples + +### Example 1: Distance-Based Shaping + +**Most common shaping pattern. Safe when done with potential-based formula.** + +```python +def distance_shaping(s, a, s_next, gamma=0.99): + """ + Reward agent for getting closer to goal. + + CRITICAL: Use potential-based formula to preserve optimal policy. + """ + goal_position = s['goal'] + curr_pos = s['position'] + next_pos = s_next['position'] + + # Potential function: negative distance + phi = -np.linalg.norm(curr_pos - goal_position) + phi_next = -np.linalg.norm(next_pos - goal_position) + + # Potential-based shaping (preserves optimal policy) + shaping_reward = gamma * phi_next - phi + + return shaping_reward +``` + +### Example 2: Auxiliary Smoothness Reward + +**Help agent learn smooth actions without changing optimal behavior.** + +```python +def smoothness_shaping(a, a_prev): + """ + Penalize jittery/jerky actions. + Helps with efficiency and generalization. + """ + # Difference between consecutive actions + action_jerk = np.linalg.norm(a - a_prev) + + # Penalty (small, doesn't dominate task reward) + smoothness_penalty = -0.01 * action_jerk + + return smoothness_penalty +``` + +### Example 3: Energy/Control Efficiency + +**Encourage efficient control.** + +```python +def efficiency_reward(a): + """ + Penalize excessive control effort. + Makes solutions more robust. + """ + # L2 norm of action (total control magnitude) + effort = np.sum(a**2) + + # Small penalty + return -0.001 * effort +``` + +### Example 4: Staying Safe Reward + +**Prevent dangerous states (without hard constraints).** + +```python +def safety_reward(s): + """ + Soft penalty for dangerous states. + Better than hard constraints (more learnable). 
+ """ + danger_score = 0.0 + + # Example: Prevent collision + min_clearance = np.min(s['collision_distances']) + if min_clearance < 0.1: + danger_score += 10.0 * (0.1 - min_clearance) + + # Example: Prevent extreme states + if np.abs(s['position']).max() > 5.0: + danger_score += 1.0 + + return -danger_score +``` + +### When to Add Auxiliary Rewards + +**Add auxiliary reward if**: + +- It's potential-based (safe) +- Task reward already roughly works (agent > 10% success) +- Auxiliary targets clear sub-goals +- You validate with/without + +**Don't add if**: + +- Task reward doesn't work at all (fix that first) +- Creates new exploitation opportunities +- Makes reward engineering too complex + + +## Part 6: Inverse RL - Learning Rewards from Demonstrations + +### The Problem + +You have expert demonstrations but no explicit reward function. How to learn? + +**Options**: + +1. Behavioral cloning: Copy actions directly (doesn't learn why) +2. Reward learning (inverse RL): Infer reward structure from demonstrations +3. Imitation learning: Match expert behavior distribution (GAIL style) + +### Inverse RL Concept + +**Idea**: Expert is optimal under some reward function. Infer what reward structure makes expert optimal. + +``` +Expert demonstrations → Infer reward function → Train agent on learned reward +``` + +**Key insight**: If expert is optimal under reward R, then R(expert_actions) >> R(other_actions) + +### Practical Inverse RL (Maximum Entropy IRL) + +```python +class InverseRLLearner: + """ + Learn reward function from expert demonstrations. + + Assumes expert is performing near-optimal policy under true reward. + """ + + def __init__(self, state_dim, action_dim): + # Reward function (small neural network) + self.reward_net = nn.Sequential( + nn.Linear(state_dim + action_dim, 64), + nn.ReLU(), + nn.Linear(64, 1) + ) + self.optimizer = torch.optim.Adam(self.reward_net.parameters()) + + def compute_reward(self, s, a): + """Learned reward function.""" + sa = torch.cat([torch.tensor(s), torch.tensor(a)]) + return self.reward_net(sa).item() + + def train_step(self, expert_trajectories, agent_trajectories): + """ + Update reward to make expert better than agent. + + Principle: Maximize expert returns, minimize agent returns under current reward. + """ + # Expert reward sum + expert_returns = sum( + sum(self.compute_reward(s, a) for s, a in traj) + for traj in expert_trajectories + ) + + # Agent reward sum + agent_returns = sum( + sum(self.compute_reward(s, a) for s, a in traj) + for traj in agent_trajectories + ) + + # Loss: Want expert >> agent + loss = agent_returns - expert_returns + + self.optimizer.zero_grad() + loss.backward() + self.optimizer.step() + + return loss.item() +``` + +### When to Use Inverse RL + +**Use when**: + +- Reward is hard to specify but easy to demonstrate +- You have expert demonstrations (human, reference controller) +- Task complex enough that behavior != objective +- Training budget allows for two-stage process + +**Don't use when**: + +- Reward is easy to specify (just specify it!) +- No expert demonstrations available +- Demonstration quality varies +- Need fast learning (inverse RL is slow) + + +## Part 7: Reward Normalization and Clipping + +### Why Normalize? + +Reward scale directly affects gradient magnitude and training stability. 
+ +```python +# Without normalization +reward_taskA = 1000 * task_metric # Large magnitude +loss = -policy_gradient * reward_taskA # Huge gradients + +# With normalization +reward_normalized = reward_taskA / reward_std # Unit magnitude +loss = -policy_gradient * reward_normalized # Reasonable gradients +``` + +### Standard Normalization Pipeline + +```python +class RewardNormalizer: + def __init__(self, epsilon=1e-8): + self.mean = 0.0 + self.var = 1.0 + self.epsilon = epsilon + + def update_statistics(self, rewards): + """Update running mean and variance.""" + rewards = np.array(rewards) + # Exponential moving average (online update) + alpha = 0.01 + self.mean = (1 - alpha) * self.mean + alpha * rewards.mean() + self.var = (1 - alpha) * self.var + alpha * rewards.var() + + def normalize(self, reward): + """Apply standardization then clipping.""" + # 1. Standardize (zero mean, unit variance) + normalized = (reward - self.mean) / np.sqrt(self.var + self.epsilon) + + # 2. Clip to [-1, 1] for stability + clipped = np.clip(normalized, -1.0, 1.0) + + return clipped +``` + +### Clipping Strategy + +```python +def clip_reward(r, clip_range=(-1.0, 1.0)): + """ + Clip reward to fixed range. + + Prevents large reward spikes from destabilizing training. + """ + return np.clip(r, clip_range[0], clip_range[1]) + +# Usage +def total_reward(task_r, shaping_r): + # Combine rewards + combined = task_r + shaping_r + + # Clip combined + clipped = clip_reward(combined) + + return clipped +``` + + +## Part 8: Validating Reward Functions + +### Validation Checklist + +```python +def validate_reward_function(reward_fn, env, agent_class, n_trials=5): + """ + Systematic validation of reward design. + """ + results = {} + + # 1. Learning speed test + agent = train_agent(agent_class, env, reward_fn, steps=100000) + success_rate = evaluate(agent, env, n_episodes=100) + results['learning_speed'] = success_rate + + if success_rate < 0.3: + print("WARNING: Agent can't learn → reward signal too sparse") + return False + + # 2. Generalization test + test_variants = [modify_env(env) for _ in range(5)] + test_rates = [evaluate(agent, test_env, 20) for test_env in test_variants] + + if np.mean(test_rates) < 0.7 * success_rate: + print("WARNING: Hacking detected → Agent doesn't generalize") + return False + + # 3. Stability test + agents = [train_agent(...) for _ in range(n_trials)] + variance = np.var([evaluate(a, env, 20) for a in agents]) + + if variance > 0.3: + print("WARNING: Training unstable → Reward scale issue?") + return False + + # 4. 
Behavioral inspection + trajectory = run_episode(agent, env) + if suspicious_behavior(trajectory): + print("WARNING: Agent exhibiting strange behavior") + return False + + print("PASSED: Reward function validated") + return True +``` + +### Red Flags During Validation + +| Red Flag | Likely Cause | Fix | +|----------|---|---| +| Success rate < 10% after 50k steps | Reward too sparse | Add shaping | +| High variance across seeds | Reward scale/noise | Normalize/clip | +| Passes train but fails test | Reward hacking | Add anti-hacking penalties | +| Rewards diverging to infinity | Unbounded reward | Use clipping | +| Agent oscillates/twitches | Per-step reward exploitation | Penalize action change | +| Learning suddenly stops | Reward scale issue | Check normalization | + + +## Part 9: Common Pitfalls and Rationalizations + +### Pitfall 1: "Let me just add distance reward" + +**Rationalization**: "I'll add reward for getting closer to goal, it can't hurt" +**Problem**: Without potential-based formula, changes optimal policy +**Reality Check**: Measure policy difference with/without shaping + +### Pitfall 2: "Sparse rewards are always better" + +**Rationalization**: "Sparse rewards prevent hacking" +**Problem**: Agent can't learn in long-horizon tasks (credit assignment crisis) +**Reality Check**: 10+ steps without reward → need shaping or fail training + +### Pitfall 3: "Normalize everything" + +**Rationalization**: "I'll normalize all rewards to [-1, 1]" +**Problem**: Over-normalization loses task structure (goal vs near-goal now equal) +**Reality Check**: Validate that normalized reward still trains well + +### Pitfall 4: "Inverse RL is the answer" + +**Rationalization**: "I don't know how to specify rewards, I'll learn from demos" +**Problem**: Inverse RL is slow and requires good demonstrations +**Reality Check**: If you can specify reward clearly, just do it + +### Pitfall 5: "More auxiliary rewards = faster learning" + +**Rationalization**: "I'll add smoothness, energy, safety rewards" +**Problem**: Each auxiliary reward is another hacking target +**Reality Check**: Validate each auxiliary independently + +### Pitfall 6: "This should work, why doesn't it?" 
+ +**Rationalization**: "The reward looks right, must be algorithm issue" +**Problem**: Reward design is usually the bottleneck, not algorithm +**Reality Check**: Systematically validate reward using test framework + +### Pitfall 7: "Agent learned the task, my reward was right" + +**Rationalization**: "Agent succeeded, so reward design was good" +**Problem**: Agent might succeed on hacked solution, not true task +**Reality Check**: Test on distribution shift / different environment variants + +### Pitfall 8: "Dense rewards cause overfitting" + +**Rationalization**: "Sparse rewards generalize better" +**Problem**: Sparse rewards just fail to learn in long episodes +**Reality Check**: Compare learning curves and final policy generalization + +### Pitfall 9: "Clipping breaks the signal" + +**Rationalization**: "If I clip rewards, I lose information" +**Problem**: Unbounded rewards cause training instability +**Reality Check**: Relative ordering preserved after clipping, information retained + +### Pitfall 10: "Potential-based shaping doesn't matter" + +**Rationalization**: "A reward penalty is a reward penalty" +**Problem**: Non-potential-based shaping CAN change optimal policy +**Reality Check**: Prove mathematically that Φ(s') - Φ(s) structure used + + +## Part 10: Reward Engineering Patterns for Common Tasks + +### Pattern 1: Goal-Reaching Tasks + +```python +def reaching_reward(s, a, s_next, gamma=0.99): + """ + Task: Reach target location. + """ + goal = s['goal'] + + # Sparse task reward + if np.linalg.norm(s_next['position'] - goal) < 0.1: + task_reward = 1.0 + else: + task_reward = 0.0 + + # Dense potential-based shaping + distance = np.linalg.norm(s_next['position'] - goal) + distance_prev = np.linalg.norm(s['position'] - goal) + + phi = -distance + phi_prev = -distance_prev + shaping = gamma * phi - phi_prev + + # Efficiency penalty (optional) + efficiency = -0.001 * np.sum(a**2) + + return task_reward + 0.1 * shaping + efficiency +``` + +### Pattern 2: Locomotion Tasks + +```python +def locomotion_reward(s, a, s_next): + """ + Task: Move forward efficiently. + """ + # Forward progress (sparse) + forward_reward = s_next['x_pos'] - s['x_pos'] + + # Staying alive (don't fall) + alive_bonus = 1.0 if is_alive(s_next) else 0.0 + + # Energy efficiency + action_penalty = -0.0001 * np.sum(a**2) + + return forward_reward + alive_bonus + action_penalty +``` + +### Pattern 3: Multi-Objective Tasks + +```python +def multi_objective_reward(s, a, s_next): + """ + Task: Multiple objectives (e.g., reach goal AND minimize energy). + """ + goal_reward = 10.0 * (goal_progress(s, s_next)) + energy_reward = -0.01 * np.sum(a**2) + safety_reward = -1.0 * collision_risk(s_next) + + # Weight objectives + return 1.0 * goal_reward + 0.1 * energy_reward + 0.5 * safety_reward +``` + + +## Summary: Reward Engineering Workflow + +1. **Specify what success looks like** (task reward) +2. **Choose sparse or dense** based on episode length +3. **If dense, use potential-based shaping** (preserves policy) +4. **Add anti-hacking penalties** if needed +5. **Normalize and clip** for stability +6. **Validate** systematically (generalization, hacking, stability) +7. 
**Iterate** based on validation results
+
+
+## Key Equations Reference
+
+```
+Potential-Based Shaping:
+F(s,a,s') = γΦ(s') - Φ(s)
+
+Value Function Shift (with shaping):
+Q'(s,a) = Q(s,a) - Φ(s),   V'(s) = V(s) - Φ(s)
+
+Optimal Policy Preservation:
+argmax_a Q'(s,a) = argmax_a Q(s,a)   (same action, Q-values shifted by Φ(s))
+
+Reward Normalization:
+r_norm = (r - μ) / (σ + ε)
+
+Clipping:
+r_clipped = clip(r_norm, -1, 1)
+```
+
+
+## Testing Scenarios (13+)
+
+The skill addresses these scenarios:
+
+1. Detecting reward hacking from test set failure
+2. Implementing potential-based shaping correctly
+3. Choosing sparse vs dense based on episode length
+4. Designing distance-based rewards without changing policy
+5. Adding auxiliary rewards without hacking
+6. Normalizing rewards across task variants
+7. Validating that shaping preserves optimal policy
+8. Applying inverse RL to expert demonstrations
+9. Debugging when reward signal causes oscillation
+10. Engineering rewards for specific task families
+11. Recognizing when reward is bottleneck vs algorithm
+12. Explaining reward hacking in principal-agent terms
+13. Implementing end-to-end reward validation pipeline
+
+
+## Practical Checklist
+
+- [ ] Task reward clearly specifies success
+- [ ] Reward function can't be exploited by shortcuts
+- [ ] Episode length < 20 steps → sparse OK
+- [ ] Episode length > 50 steps → need shaping
+- [ ] Using potential-based formula F = γΦ(s') - Φ(s)
+- [ ] Clipping/normalizing rewards to [-1, 1]
+- [ ] Tested on distribution shift (different env variant)
+- [ ] Behavioral inspection (is agent doing what you expect?)
+- [ ] Training stability across seeds (variance < 0.3)
+- [ ] Learning curves look reasonable (no sudden divergence)
+- [ ] Final policy generalizes to test distribution
diff --git a/skills/using-deep-rl/rl-debugging.md b/skills/using-deep-rl/rl-debugging.md
new file mode 100644
index 0000000..22232a7
--- /dev/null
+++ b/skills/using-deep-rl/rl-debugging.md
@@ -0,0 +1,1394 @@
+
+# RL Debugging Methodology
+
+## When to Use This Skill
+
+Invoke this skill when you encounter:
+
+- **Agent Won't Learn**: Reward stuck at baseline, not improving
+- **Training Unstable**: Loss bouncing, reward highly variable
+- **Suboptimal Policy**: Agent learned something but worse than expected
+- **Reward Hacking**: Agent gaming the reward function
+- **Exploration Issues**: Agent stuck in local optimum or exploring poorly
+- **Hyperparameter Sensitivity**: Small changes break training
+- **Learning Rate Tuning**: Not sure what value is right
+- **Convergence Problems**: Loss doesn't decrease or decreases then stops
+- **Environment vs Algorithm**: Unsure if problem is environment or RL algorithm
+- **Logging Confusion**: Not sure what metrics to monitor
+- **Gradual Performance Degradation**: Early training good, later poor
+- **Sparse Reward Challenge**: Agent never finds reward signal
+
+**Core Problem**: RL debugging often becomes random hyperparameter tweaking. Agents are complex systems with many failure modes. Systematic diagnosis finds root causes; random tweaking wastes time and leads to contradictory findings.
+ +## Do NOT Use This Skill For + +- **Learning RL theory** (route to rl-foundations for MDPs, Bellman equations, policy gradients) +- **Implementing new algorithms** (route to algorithm-specific skills like value-based-methods, policy-gradient-methods, actor-critic-methods) +- **Environment API questions** (route to rl-environments for Gym/Gymnasium API, custom environments, wrappers) +- **Evaluation methodology** (route to rl-evaluation for rigorous statistical testing, generalization assessment) +- **Initial algorithm selection** (route to using-deep-rl router or rl-foundations for choosing the right algorithm family) + + +## Core Principle: The 80/20 Rule + +**The most important insight in RL debugging:** + +``` +80% of RL failures are in: + 1. Environment design (agent can't see true state) + 2. Reward function (misaligned or wrong scale) + 3. Observation/action representation (missing information) + +15% are in: + 4. Hyperparameters (learning rate, batch size, etc.) + 5. Exploration strategy (too much or too little) + +5% are in: + 6. Algorithm selection (wrong algorithm for problem) +``` + +**Consequence**: If training fails, check environment and reward FIRST. Changing the algorithm last. + +### Why This Order? + +**Scenario 1: Broken Environment** + +```python +# BROKEN ENVIRONMENT: Agent can't win no matter what algorithm +class BrokenEnv: + def reset(self): + self.state = random_state() # Agent can't control this + return self.state + + def step(self, action): + # Reward independent of action! + reward = random.random() + return self.state, reward + + # No amount of PPO, DQN, SAC can learn from random reward + +# CORRECT ENVIRONMENT: Agent can win with right policy +class CorrectEnv: + def reset(self): + self.state = initial_state + return self.state + + def step(self, action): + # Reward depends on action + reward = compute_reward(self.state, action) + self.state = compute_next_state(self.state, action) + return self.state, reward +``` + +**If environment is broken, no algorithm will learn.** + +**Scenario 2: Reward Scale Issue** + +```python +# WRONG SCALE: Reward in [0, 1000000] +# Algorithm gradient updates: param = param - lr * grad +# If gradient huge (due to reward scale), single step breaks everything + +# CORRECT SCALE: Reward in [-1, 1] +# Gradients are reasonable, learning stable + +# Fix is simple: divide reward by scale factor +# But if you don't know to check reward scale, you'll try 10 learning rates instead +``` + +**Consequence: Always check reward scale before tuning learning rate.** + + +## Part 1: Systematic Debugging Framework + +### The Debugging Process (Not Random Tweaking) + +``` +START: Agent not learning (or training unstable, or suboptimal) + +Step 1: ENVIRONMENT CHECK (Does agent have what it needs?) + ├─ Can agent see the state? (Is observation sufficient?) + ├─ Is environment deterministic or stochastic? (Affects algorithm choice) + ├─ Can agent actually win? (Does optimal policy exist?) + └─ Is environment reset working? (Fresh episode each reset?) + +Step 2: REWARD SCALE CHECK (Is reward in reasonable range?) + ├─ What's the range of rewards? (Min, max, typical) + ├─ Are rewards normalized? (Should be ≈ [-1, 1]) + ├─ Is reward aligned with desired behavior? (No reward hacking) + └─ Are rewards sparse or dense? (Affects exploration strategy) + +Step 3: OBSERVATION REPRESENTATION (Is information preserved?) + ├─ Are observations normalized? (Images: [0, 255] or [0, 1]?) + ├─ Is temporal information included? (Frame stacking for Atari?) 
+ ├─ Are observations consistent? (Same format each episode?) + └─ Is observation sufficient to solve problem? (Can human win from this info?) + +Step 4: BASIC ALGORITHM CHECK (Is the RL algorithm working at all?) + ├─ Run on simple environment (CartPole, simple task) + ├─ Can algorithm learn on simple env? (If not: algorithm issue) + ├─ Can algorithm beat random baseline? (If not: something is broken) + └─ Does loss decrease? (If not: learning not happening) + +Step 5: HYPERPARAMETER TUNING (Only after above passed) + ├─ Is learning rate in reasonable range? (1e-5 to 1e-3 typical) + ├─ Is batch size appropriate? (Power of 2: 32, 64, 128, 256) + ├─ Is exploration sufficient? (Epsilon decaying? Entropy positive?) + └─ Are network layers reasonable? (3 hidden layers typical) + +Step 6: LOGGING ANALYSIS (What do the metrics say?) + ├─ Policy loss: decreasing? exploding? zero? + ├─ Value loss: decreasing? stable? + ├─ Reward curve: trending up? flat? oscillating? + ├─ Entropy: decreasing over time? (Exploration → exploitation) + └─ Gradient norms: reasonable? exploding? vanishing? + +Step 7: IDENTIFY ROOT CAUSE (Synthesize findings) + └─ Where is the actual problem? (Environment, reward, algorithm, hyperparameters) +``` + +### Why This Order Matters + +**Common mistake: Jump to Step 5 (hyperparameter tuning)** + +```python +# Agent not learning. Frustration sets in. +# "I'll try learning rate 1e-4" (Step 5, skipped 1-4) +# Doesn't work. +# "I'll try batch size 64" (more Step 5 tweaking) +# Doesn't work. +# "I'll try a bigger network" (still Step 5) +# Doesn't work. +# Hours wasted. + +# Correct approach: Follow Steps 1-4 first. +# Step 1: Oh! Environment reset is broken, always same initial state +# Fix environment. +# Now agent learns immediately with default hyperparameters. +``` + +**The order reflects probability**: It's more likely the environment is broken than the algorithm; more likely the reward scale is wrong than learning rate is wrong. + + +## Part 2: Diagnosis Trees by Symptom + +### Diagnosis Tree 1: "Agent Won't Learn" + +**Symptom**: Reward stuck near random baseline. Loss doesn't decrease meaningfully. + +``` +START: Agent Won't Learn + +├─ STEP 1: Can agent beat random baseline? +│ ├─ YES → Skip to STEP 4 +│ └─ NO → Environment issue likely +│ ├─ Check 1A: Is environment output sane? +│ │ ├─ Print first 5 episodes: state, action, reward, next_state +│ │ ├─ Verify types match (shapes, ranges, dtypes) +│ │ └─ Is reward always same? Always zero? (Red flag: no signal) +│ ├─ Check 1B: Can you beat it manually? +│ │ ├─ Play environment by hand (hardcode a policy) +│ │ ├─ Can you get >0 reward? (If not: environment is broken) +│ │ └─ If yes: Agent is missing something +│ └─ Check 1C: Is reset working? +│ ├─ Call reset() twice, check states differ +│ └─ If states same: reset is broken, fix it + +├─ STEP 2: Is reward scale reasonable? +│ ├─ Compute: min, max, mean, std of rewards from random policy +│ ├─ If range >> 1 (e.g., [0, 10000]): +│ │ ├─ Action: Normalize rewards to [-1, 1] +│ │ ├─ Code: reward = reward / max_possible_reward +│ │ └─ Retest: Usually fixes "won't learn" +│ ├─ If range << 1 (e.g., [0, 0.001]): +│ │ ├─ Action: Scale up rewards +│ │ ├─ Code: reward = reward * 1000 +│ │ └─ Or increase network capacity (more signal needed) +│ └─ If reward is [0, 1] (looks fine): +│ └─ Continue to STEP 3 + +├─ STEP 3: Is observation sufficient? +│ ├─ Check 3A: Are observations normalized? 
+│ │ ├─ If images [0, 255]: normalize to [0, 1] or [-1, 1] +│ │ ├─ Code: observation = observation / 255.0 +│ │ └─ Retest +│ ├─ Check 3B: Is temporal info included? (For vision: frame stacking) +│ │ ├─ If using images: last 4 frames stacked? +│ │ ├─ If using states: includes velocity/derivatives? +│ │ └─ Missing temporal info → agent can't infer velocity +│ └─ Check 3C: Is observation Markovian? +│ ├─ Can optimal policy be derived from this observation? +│ ├─ If not: observation insufficient (red flag) +│ └─ Example: Only position, not velocity → agent can't control + +├─ STEP 4: Run sanity check on simple environment +│ ├─ Switch to CartPole or equivalent simple env +│ ├─ Train with default hyperparameters +│ ├─ Does simple env learn? (Should learn in 1000-5000 steps) +│ ├─ YES → Your algorithm works, issue is your env/hyperparameters +│ └─ NO → Algorithm itself broken (rare, check algorithm implementation) + +├─ STEP 5: Check exploration +│ ├─ Is agent exploring or stuck? +│ ├─ Log entropy (for stochastic policies) +│ ├─ If entropy → 0 early: agent exploiting before exploring +│ │ └─ Solution: Increase entropy regularization or ε +│ ├─ If entropy always high: too much exploration +│ │ └─ Solution: Decay entropy or ε more aggressively +│ └─ Visualize: Plot policy actions over time, should see diversity early + +├─ STEP 6: Check learning rate +│ ├─ Is learning rate in [1e-5, 1e-3]? (typical range) +│ ├─ If > 1e-3: Try reducing (might be too aggressive) +│ ├─ If < 1e-5: Try increasing (might be too conservative) +│ ├─ Watch loss first step: If loss increases → LR too high +│ └─ Safe default: 3e-4 + +└─ STEP 7: Check network architecture + ├─ For continuous control: small networks ok (1-2 hidden layers, 64-256 units) + ├─ For vision: use CNN (don't use FC on pixels) + ├─ Check if network has enough capacity + └─ Tip: Start with simple, add complexity if needed +``` + +**ROOT CAUSES in order of likelihood:** + +1. **Reward scale wrong** (40% of cases) +2. **Environment broken** (25% of cases) +3. **Observation insufficient** (15% of cases) +4. **Learning rate too high/low** (12% of cases) +5. **Algorithm issue** (8% of cases) + + +### Diagnosis Tree 2: "Training Unstable" + +**Symptom**: Loss bounces wildly, reward spikes then crashes, training oscillates. + +``` +START: Training Unstable + +├─ STEP 1: Characterize the instability +│ ├─ Plot loss curve: Does it bounce at same magnitude or grow? +│ ├─ Plot reward curve: Does it oscillate around mean or trend down? +│ ├─ Compute: reward variance over 100 episodes +│ └─ This tells you: Is it normal variance or pathological instability? + +├─ STEP 2: Check if environment is deterministic +│ ├─ Deterministic environment + stochastic policy = normal variance +│ ├─ Stochastic environment + any policy = high variance (expected) +│ ├─ If stochastic: Can you reduce randomness? Or accept higher variance? +│ └─ Some instability is normal; distinguish from pathological + +├─ STEP 3: Check reward scale +│ ├─ If rewards >> 1: Gradient updates too large +│ │ ├─ Single step might overshoot optimum +│ │ ├─ Solution: Normalize rewards to [-1, 1] +│ │ └─ This often fixes instability immediately +│ ├─ If reward has outliers: Single large reward breaks training +│ │ ├─ Solution: Reward clipping or scaling +│ │ └─ Example: r = np.clip(reward, -1, 1) +│ └─ Check: Is reward scale consistent? 
+ +├─ STEP 4: Check learning rate (LR often causes instability) +│ ├─ If loss oscillates: LR likely too high +│ │ ├─ Try reducing by 2-5× (e.g., 1e-3 → 3e-4) +│ │ ├─ Watch first 100 steps: Loss should decrease monotonically +│ │ └─ If still oscillates: try 10× reduction +│ ├─ If you have LR scheduler: Check if it's too aggressive +│ │ ├─ Scheduler reducing LR too fast can cause steps +│ │ └─ Solution: Slower schedule (more steps to final LR) +│ └─ Test: Set LR very low (1e-5), see if training is smooth +│ ├─ YES → Increase LR gradually until instability starts +│ └─ This bracketing finds safe LR range + +├─ STEP 5: Check batch size +│ ├─ Small batch (< 32): High gradient variance, bouncy updates +│ │ ├─ Solution: Increase batch size (32, 64, 128) +│ │ └─ But not too large: training becomes slow +│ ├─ Large batch (> 512): Might overfit, large gradient steps +│ │ ├─ Solution: Use gradient accumulation +│ │ └─ Or reduce learning rate slightly +│ └─ Start with batch_size=64, adjust if needed + +├─ STEP 6: Check gradient clipping +│ ├─ Are gradients exploding? (Check max gradient norm) +│ │ ├─ If max grad norm > 100: Likely exploding gradients +│ │ ├─ Solution: Enable gradient clipping (max_norm=1.0) +│ │ └─ Code: torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0) +│ ├─ If max grad norm reasonable (< 10): Skip this step +│ └─ Watch grad norm over training: Should stay roughly constant + +├─ STEP 7: Check algorithm-specific parameters +│ ├─ For PPO: Is clipping epsilon reasonable? (0.2 default) +│ │ ├─ Too high: Over-clips, doesn't update +│ │ └─ Too low: Allows large updates, instability +│ ├─ For DQN: Is target network update frequency appropriate? +│ │ ├─ Update too often: Target constantly changing +│ │ └─ Update too rarely: Stale targets +│ └─ For A3C/A2C: Check entropy coefficient +│ ├─ Too high: Too much exploration, policy noisy +│ └─ Too low: Premature convergence + +└─ STEP 8: Check exploration decay + ├─ Is exploration decaying too fast? (Policy becomes deterministic) + │ └─ If entropy→0 early: Agent exploits before exploring + ├─ Is exploration decaying too slow? (Policy stays noisy) + │ └─ If entropy stays high: Too much randomness in later training + └─ Entropy should decay: high early, low late + └─ Plot entropy over training: should show clear decay curve +``` + +**ROOT CAUSES in order of likelihood:** + +1. **Learning rate too high** (35% of cases) +2. **Reward scale too large** (25% of cases) +3. **Batch size too small** (15% of cases) +4. **Gradient explosion** (10% of cases) +5. **Algorithm parameters** (10% of cases) +6. **Environment stochasticity** (5% of cases) + + +### Diagnosis Tree 3: "Suboptimal Policy" + +**Symptom**: Agent learned something but performs worse than expected. Better than random baseline, but not good enough. + +``` +START: Suboptimal Policy + +├─ STEP 1: How suboptimal? (Quantify the gap) +│ ├─ Compute: Agent reward vs theoretical optimal +│ ├─ If 80% of optimal: Normal (RL usually gets 80-90% optimal) +│ ├─ If 50% of optimal: Significantly suboptimal +│ ├─ If 20% of optimal: Very bad +│ └─ This tells you: Is it "good enough" or truly broken? + +├─ STEP 2: Is it stuck in local optimum? +│ ├─ Run multiple seeds: Do you get similar reward each seed? +│ ├─ If rewards similar across seeds: Consistent local optimum +│ ├─ If rewards vary wildly: High variance, need more training +│ └─ Solution if local optimum: More exploration or better reward shaping + +├─ STEP 3: Check reward hacking +│ ├─ Visualize agent behavior: Does it match intent? 
+│ ├─ Example: Cart-pole reward is [0, 1] per timestep +│ │ ├─ Agent might learn: "Stay in center, don't move" +│ │ ├─ Policy is suboptimal but still gets reward +│ │ └─ Solution: Reward engineering (bonus for progress) +│ └─ Hacking signs: +│ ├─ Agent does something weird but gets reward +│ ├─ Behavior makes no intuitive sense +│ └─ Reward increases but performance bad + +├─ STEP 4: Is exploration sufficient? +│ ├─ Check entropy: Does policy explore initially? +│ ├─ Check epsilon decay (if using ε-greedy): Does it decay appropriately? +│ ├─ Is agent exploring broadly or stuck in small region? +│ ├─ Solution: Slower exploration decay or intrinsic motivation +│ └─ Use RND/curiosity if environment has sparse rewards + +├─ STEP 5: Check network capacity +│ ├─ Is network too small to represent optimal policy? +│ ├─ For vision: Use standard CNN (not tiny network) +│ ├─ For continuous control: 2-3 hidden layers, 128-256 units +│ ├─ Test: Double network size, does performance improve? +│ └─ If yes: Original network was too small + +├─ STEP 6: Check data efficiency +│ ├─ Is agent training long enough? +│ ├─ RL usually needs: simple tasks 100k steps, complex tasks 1M+ steps +│ ├─ If training only 10k steps: Too short, agent didn't converge +│ ├─ Solution: Train longer (but check reward curve first) +│ └─ If reward plateaus early: Extend training won't help + +├─ STEP 7: Check observation and action spaces +│ ├─ Is action space continuous or discrete? +│ ├─ Is action discretization appropriate? +│ │ ├─ Too coarse: Can't express fine control +│ │ ├─ Too fine: Huge action space, hard to learn +│ │ └─ Example: 100 actions for simple control = too many +│ ├─ Is observation sufficient? (See Diagnosis Tree 1, Step 3) +│ └─ Missing information in observation = impossible to be optimal + +├─ STEP 8: Check reward structure +│ ├─ Is reward dense or sparse? +│ ├─ Sparse reward + suboptimal policy: Agent might not be exploring to good region +│ │ ├─ Solution: Reward shaping (bonus for progress) +│ │ └─ Or: Intrinsic motivation (RND/curiosity) +│ ├─ Dense reward + suboptimal: Possible misalignment with intent +│ └─ Can you improve by reshaping reward? + +└─ STEP 9: Compare with baseline algorithm + ├─ Run reference implementation on same env + ├─ Does reference get better reward? + ├─ YES → Your implementation has a bug + ├─ NO → Problem is inherent to algorithm or environment + └─ This isolates: Implementation issue vs fundamental difficulty +``` + +**ROOT CAUSES in order of likelihood:** + +1. **Exploration insufficient** (30% of cases) +2. **Training not long enough** (25% of cases) +3. **Reward hacking** (20% of cases) +4. **Network too small** (12% of cases) +5. **Observation insufficient** (8% of cases) +6. **Algorithm mismatch** (5% of cases) + + +## Part 3: What to Check First + +### Critical Checks (Do These First) + +#### Check 1: Reward Scale Analysis + +**Why**: Reward scale is the MOST COMMON source of RL failures. 
+
+```python
+# DIAGNOSTIC SCRIPT
+import numpy as np
+
+# Collect rewards from random policy
+rewards = []
+for episode in range(100):
+    state = env.reset()
+    for step in range(1000):
+        action = env.action_space.sample()  # Random action
+        state, reward, done, _ = env.step(action)
+        rewards.append(reward)
+        if done:
+            break
+
+rewards = np.array(rewards)
+
+print(f"Reward statistics from random policy:")
+print(f"  Min: {rewards.min()}")
+print(f"  Max: {rewards.max()}")
+print(f"  Mean: {rewards.mean()}")
+print(f"  Std: {rewards.std()}")
+print(f"  Range: [{rewards.min()}, {rewards.max()}]")
+
+# RED FLAGS
+if abs(rewards.max()) > 100 or abs(rewards.min()) > 100:
+    print("⚠️ RED FLAG: Rewards >> 1, normalize them!")
+
+if rewards.std() > 10:
+    print("⚠️ RED FLAG: High reward variance, normalize or clip")
+
+if rewards.mean() == rewards.max():
+    print("⚠️ RED FLAG: Constant rewards, no signal to learn from!")
+
+if (np.abs(rewards) <= 1).all():
+    print("✓ Reward scale looks reasonable ([-1, 1] range)")
+```
+
+**Action if scale is wrong:**
+
+```python
+# Normalize to [-1, 1]
+reward = reward / max(abs(rewards.max()), abs(rewards.min()))
+
+# Or clip
+reward = np.clip(reward, -1, 1)
+
+# Or shift and scale
+reward = 2 * (reward - rewards.min()) / (rewards.max() - rewards.min()) - 1
+```
+
+#### Check 2: Environment Sanity Check
+
+**Why**: Broken environment → no algorithm will work.
+
+```python
+# DIAGNOSTIC SCRIPT
+def sanity_check_env(env, num_episodes=5):
+    """Quick check if environment is sane."""
+
+    for episode in range(num_episodes):
+        state = env.reset()
+        print(f"\nEpisode {episode}:")
+        print(f"  Initial state shape: {state.shape}, dtype: {state.dtype}")
+        print(f"  Initial state range: [{state.min()}, {state.max()}]")
+
+        for step in range(10):
+            action = env.action_space.sample()
+            next_state, reward, done, info = env.step(action)
+
+            print(f"  Step {step}: action={action}, reward={reward}, done={done}")
+            print(f"    State shape: {next_state.shape}, range: [{next_state.min()}, {next_state.max()}]")
+
+            # Check for NaN
+            if np.isnan(next_state).any() or np.isnan(reward):
+                print(f"  ⚠️ NaN detected!")
+
+            # Check for reasonable values
+            if np.abs(next_state).max() > 1e6:
+                print(f"  ⚠️ State explosion (values > 1e6)")
+
+            if done:
+                break
+
+    print("\n✓ Environment check complete")
+
+sanity_check_env(env)
+```
+
+**RED FLAGS:**
+
+- NaN or inf in observations/rewards
+- State values exploding (> 1e6)
+- Reward always same (no signal)
+- Done flag never true (infinite episodes)
+- State never changes despite actions
+
+#### Check 3: Can You Beat It Manually?
+
+**Why**: If a human can't solve it, the agent won't either (unless reward hacking).
+
+```python
+# Manual policy: Hardcoded behavior
+def manual_policy(state):
+    # Example for CartPole: if pole tilting right, push right
+    if state[2] > 0:  # angle > 0
+        return 1  # Push right
+    else:
+        return 0  # Push left
+
+# Test manual policy
+total_reward = 0
+for episode in range(10):
+    state = env.reset()
+    for step in range(500):
+        action = manual_policy(state)
+        state, reward, done, _ = env.step(action)
+        total_reward += reward
+        if done:
+            break
+
+avg_reward = total_reward / 10
+print(f"Manual policy average reward: {avg_reward}")
+
+# If avg_reward > 0: Environment is learnable
+# If avg_reward ≤ 0: Environment is broken or impossible
+```
+
+#### Check 4: Observation Normalization
+
+**Why**: Non-normalized observations cause learning problems.
+ +```python +# Check if observations are normalized +for episode in range(10): + state = env.reset() + print(f"Episode {episode}: state range [{state.min()}, {state.max()}]") + + # For images: should be [0, 1] or [-1, 1] + # For physical states: should be roughly [-1, 1] + + if state.min() < -10 or state.max() > 10: + print("⚠️ Observations not normalized!") + # Solution: + state = state / np.abs(state).max() # Normalize +``` + + +## Part 4: Common RL Bugs Catalog + +### Bug 1: Reward Scale > 1 + +**Symptom**: Training unstable, loss spikes, agent doesn't learn + +**Root Cause**: Gradients too large due to reward scale + +**Code Example**: + +```python +# WRONG: Reward in [0, 1000] +reward = success_count * 1000 + +# CORRECT: Normalize to [-1, 1] +reward = success_count * 1000 +reward = reward / max_possible_reward # Result: [-1, 1] +``` + +**Fix**: Divide rewards by max possible value + +**Detection**: + +```python +rewards = [collect 100 episodes] +if max(abs(r) for r in rewards) > 1: + print("⚠️ Reward scale issue detected") +``` + + +### Bug 2: Environment Reset Broken + +**Symptom**: Agent learns initial state but can't adapt + +**Root Cause**: Reset doesn't randomize initial state or returns same state + +**Code Example**: + +```python +# WRONG: Reset always same state +def reset(self): + self.state = np.array([0, 0, 0, 0]) # Always [0,0,0,0] + return self.state + +# CORRECT: Reset randomizes initial state +def reset(self): + self.state = np.random.uniform(-0.1, 0.1, size=4) # Random + return self.state +``` + +**Fix**: Make reset() randomize initial state + +**Detection**: + +```python +states = [env.reset() for _ in range(10)] +if len(set(map(tuple, states))) == 1: + print("⚠️ Reset broken, always same state") +``` + + +### Bug 3: Observation Insufficient (Partial Observability) + +**Symptom**: Agent can't learn because it doesn't see enough + +**Root Cause**: Observation missing velocity, derivatives, or temporal info + +**Code Example**: + +```python +# WRONG: Only position, no velocity +state = np.array([position]) # Can't infer velocity from position alone + +# CORRECT: Position + velocity +state = np.array([position, velocity]) + +# WRONG for images: Single frame +observation = env.render() # Single frame, no temporal info + +# CORRECT for images: Stacked frames +frames = [frame_t-3, frame_t-2, frame_t-1, frame_t] # 4 frames +observation = np.stack(frames, axis=-1) # Shape: (84, 84, 4) +``` + +**Fix**: Add missing information to observation + +**Detection**: + +```python +# If agent converges to bad performance despite long training +# Check: Can you compute optimal action from observation? 
+# If no: Observation is insufficient +``` + + +### Bug 4: Reward Always Same (No Signal) + +**Symptom**: Loss decreases but doesn't improve over time, reward flat + +**Root Cause**: Reward is constant or nearly constant + +**Code Example**: + +```python +# WRONG: Constant reward +reward = 1.0 # Every step gets +1, no differentiation + +# CORRECT: Differentiate good and bad outcomes +if reached_goal: + reward = 1.0 +else: + reward = 0.0 # Or -0.1 for living cost +``` + +**Fix**: Ensure reward differentiates outcomes + +**Detection**: + +```python +rewards = [collect random policy rewards] +if rewards.std() < 0.01: + print("⚠️ Reward has no variance, no signal to learn") +``` + + +### Bug 5: Learning Rate Too High + +**Symptom**: Loss oscillates or explodes, training unstable + +**Root Cause**: Gradient updates too large, overshooting optimum + +**Code Example**: + +```python +# WRONG: Learning rate 1e-2 (too high) +optimizer = Adam(model.parameters(), lr=1e-2) + +# CORRECT: Learning rate 3e-4 (safe default) +optimizer = Adam(model.parameters(), lr=3e-4) +``` + +**Fix**: Reduce learning rate by 2-5× + +**Detection**: + +```python +# Watch loss first 100 steps +# If loss increases first step: LR too high +# If loss decreases but oscillates: LR probably high +``` + + +### Bug 6: Learning Rate Too Low + +**Symptom**: Agent learns very slowly, training takes forever + +**Root Cause**: Gradient updates too small, learning crawls + +**Code Example**: + +```python +# WRONG: Learning rate 1e-6 (too low) +optimizer = Adam(model.parameters(), lr=1e-6) + +# CORRECT: Learning rate 3e-4 +optimizer = Adam(model.parameters(), lr=3e-4) +``` + +**Fix**: Increase learning rate by 2-5× + +**Detection**: + +```python +# Training curve increases very slowly +# If training 1M steps and reward barely improved: LR too low +``` + + +### Bug 7: No Exploration Decay + +**Symptom**: Agent learns but remains noisy, doesn't fully exploit + +**Root Cause**: Exploration (epsilon or entropy) not decaying + +**Code Example**: + +```python +# WRONG: Constant epsilon +epsilon = 0.3 # Forever + +# CORRECT: Decay epsilon +epsilon = epsilon_linear(step, total_steps=1_000_000, + epsilon_start=1.0, epsilon_end=0.01) +``` + +**Fix**: Add exploration decay schedule + +**Detection**: + +```python +# Plot entropy or epsilon over training +# Should show clear decay from high to low +# If flat: not decaying +``` + + +### Bug 8: Exploration Decay Too Fast + +**Symptom**: Agent plateaus early, stuck in local optimum + +**Root Cause**: Exploration stops before finding good policy + +**Code Example**: + +```python +# WRONG: Decays to zero in 10k steps (for 1M step training) +epsilon = 0.99 ** (step / 100) # Reaches 0 too fast + +# CORRECT: Decays over full training +epsilon = epsilon_linear(step, total_steps=1_000_000, + epsilon_start=1.0, epsilon_end=0.01) +``` + +**Fix**: Use longer decay schedule + +**Detection**: + +```python +# Plot epsilon over training +# Should reach final value at 50-80% through training +# Not at 5% +``` + + +### Bug 9: Reward Hacking + +**Symptom**: Agent achieves high reward but behavior is useless + +**Root Cause**: Agent found way to game reward not aligned with intent + +**Code Example**: + +```python +# WRONG: Reward for just staying alive +reward = 1.0 # Every timestep +# Agent learns: Stay in corner, don't move, get infinite reward + +# CORRECT: Reward for progress + living cost +position_before = self.state[0] +self.state = compute_next_state(...) 
+position_after = self.state[0] +progress = position_after - position_before + +reward = progress - 0.01 # Progress bonus, living cost +``` + +**Fix**: Reshape reward to align with intent + +**Detection**: + +```python +# Visualize agent behavior +# If behavior weird but reward high: hacking +# If reward increases but task performance bad: hacking +``` + + +### Bug 10: Testing with Exploration + +**Symptom**: Test performance much worse than training, high variance + +**Root Cause**: Using stochastic policy at test time + +**Code Example**: + +```python +# WRONG: Test with epsilon > 0 +for test_episode in range(100): + action = epsilon_greedy(q_values, epsilon=0.05) # Wrong! + # Agent still explores at test + +# CORRECT: Test greedy +for test_episode in range(100): + action = np.argmax(q_values) # Deterministic +``` + +**Fix**: Use greedy/deterministic policy at test time + +**Detection**: + +```python +# Test reward variance high? +# Test reward < train reward? +# Check: Are you using exploration at test time? +``` + + +## Part 5: Logging and Monitoring + +### What Metrics to Track + +```python +# Minimal set of metrics for RL debugging +class RLLogger: + def __init__(self): + self.episode_rewards = [] + self.policy_losses = [] + self.value_losses = [] + self.entropies = [] + self.gradient_norms = [] + + def log_episode(self, episode_reward): + self.episode_rewards.append(episode_reward) + + def log_losses(self, policy_loss, value_loss, entropy): + self.policy_losses.append(policy_loss) + self.value_losses.append(value_loss) + self.entropies.append(entropy) + + def log_gradient_norm(self, norm): + self.gradient_norms.append(norm) + + def plot_training(self): + """Visualize training progress.""" + # Plot 1: Episode rewards over time (smoothed) + # Plot 2: Policy and value losses + # Plot 3: Entropy (should decay) + # Plot 4: Gradient norms + pass +``` + +### What Each Metric Means + +#### Metric 1: Episode Reward + +**What to look for**: + +- Should trend upward over time +- Should have decreasing variance (less oscillation) +- Slight noise is normal + +**Red flags**: + +- Flat line: Not learning +- Downward trend: Getting worse +- Wild oscillations: Instability or unlucky randomness + +**Code**: + +```python +rewards = agent.get_episode_rewards() +reward_smoothed = np.convolve(rewards, np.ones(100)/100, mode='valid') +plt.plot(reward_smoothed) # Smooth to see trend +``` + +#### Metric 2: Policy Loss + +**What to look for**: + +- Should decrease over training +- Decrease should smooth out (not oscillating) + +**Red flags**: + +- Loss increasing: Learning rate too high +- Loss oscillating: Learning rate too high or reward scale wrong +- Loss = 0: Policy not updating + +**Code**: + +```python +if policy_loss > policy_loss_prev: + print("⚠️ Policy loss increased, LR might be too high") +``` + +#### Metric 3: Value Loss (for critic-based methods) + +**What to look for**: + +- Should decrease initially, then plateau +- Should not oscillate heavily + +**Red flags**: + +- Loss exploding: LR too high +- Loss not changing: Not updating + +**Code**: + +```python +value_loss_smoothed = np.convolve(value_losses, np.ones(100)/100) +if value_loss_smoothed[-1] > value_loss_smoothed[-100]: + print("⚠️ Value loss increasing recently") +``` + +#### Metric 4: Entropy (Policy Randomness) + +**What to look for**: + +- Should start high (exploring) +- Should decay to low (exploiting) +- Clear downward trend + +**Red flags**: + +- Entropy always high: Too much exploration +- Entropy drops to zero: 
Over-exploiting +- No decay: Entropy not decreasing + +**Code**: + +```python +if entropy[-1] > entropy[-100]: + print("⚠️ Entropy increasing, exploration not decaying") +``` + +#### Metric 5: Gradient Norms + +**What to look for**: + +- Should stay roughly constant over training +- Typical range: 0.1 to 10 + +**Red flags**: + +- Gradient norms > 100: Exploding gradients +- Gradient norms < 0.001: Vanishing gradients +- Sudden spikes: Outlier data or numerical issue + +**Code**: + +```python +total_norm = 0 +for p in model.parameters(): + param_norm = p.grad.norm(2) + total_norm += param_norm ** 2 +total_norm = total_norm ** 0.5 + +if total_norm > 100: + print("⚠️ Gradient explosion detected") +``` + +### Visualization Script + +```python +import matplotlib.pyplot as plt +import numpy as np + +def plot_rl_training(rewards, policy_losses, value_losses, entropies): + """Plot training metrics for RL debugging.""" + + fig, axes = plt.subplots(2, 2, figsize=(15, 10)) + + # Plot 1: Episode rewards + ax = axes[0, 0] + ax.plot(rewards, alpha=0.3, label='Episode reward') + reward_smooth = np.convolve(rewards, np.ones(100)/100, mode='valid') + ax.plot(range(100, len(rewards)), reward_smooth, label='Smoothed (100 episodes)') + ax.set_xlabel('Episode') + ax.set_ylabel('Reward') + ax.set_title('Episode Rewards Over Time') + ax.legend() + ax.grid() + + # Plot 2: Policy loss + ax = axes[0, 1] + ax.plot(policy_losses, alpha=0.3) + loss_smooth = np.convolve(policy_losses, np.ones(100)/100, mode='valid') + ax.plot(range(100, len(policy_losses)), loss_smooth, label='Smoothed') + ax.set_xlabel('Step') + ax.set_ylabel('Policy Loss') + ax.set_title('Policy Loss Over Time') + ax.legend() + ax.grid() + + # Plot 3: Entropy + ax = axes[1, 0] + ax.plot(entropies, label='Policy entropy') + ax.set_xlabel('Step') + ax.set_ylabel('Entropy') + ax.set_title('Policy Entropy (Should Decrease)') + ax.legend() + ax.grid() + + # Plot 4: Value loss + ax = axes[1, 1] + ax.plot(value_losses, alpha=0.3) + loss_smooth = np.convolve(value_losses, np.ones(100)/100, mode='valid') + ax.plot(range(100, len(value_losses)), loss_smooth, label='Smoothed') + ax.set_xlabel('Step') + ax.set_ylabel('Value Loss') + ax.set_title('Value Loss Over Time') + ax.legend() + ax.grid() + + plt.tight_layout() + plt.show() +``` + + +## Part 6: Common Pitfalls and Red Flags + +### Pitfall 1: "Bigger Network = Better Learning" + +**Wrong**: Oversized networks overfit and learn slowly + +**Right**: Start with small network (2-3 hidden layers, 64-256 units) + +**Red Flag**: Network has > 10M parameters for simple task + +**Fix**: + +```python +# Too big +model = nn.Sequential( + nn.Linear(4, 1024), + nn.ReLU(), + nn.Linear(1024, 1024), + nn.Linear(1024, 2) +) + +# Right size +model = nn.Sequential( + nn.Linear(4, 128), + nn.ReLU(), + nn.Linear(128, 128), + nn.Linear(128, 2) +) +``` + + +### Pitfall 2: "Random Seed Doesn't Matter" + +**Wrong**: Different seeds give very different results (indicates instability) + +**Right**: Results should be consistent across seeds (within reasonable variance) + +**Red Flag**: Reward varies by 50%+ across 5 seeds + +**Fix**: + +```python +# Test across multiple seeds +rewards_by_seed = [] +for seed in range(5): + np.random.seed(seed) + torch.manual_seed(seed) + reward = train_agent(seed) + rewards_by_seed.append(reward) + +print(f"Mean: {np.mean(rewards_by_seed)}, Std: {np.std(rewards_by_seed)}") +if np.std(rewards_by_seed) > 0.5 * np.mean(rewards_by_seed): + print("⚠️ High variance across seeds, training unstable") +``` + + 
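+For the seed sweep above to be meaningful, every source of randomness should be tied to the same seed, not just NumPy and PyTorch. Below is a minimal helper along those lines (a sketch: `seed_everything` is an illustrative name, `train_agent(seed)` is assumed from the snippet above, and the `env.reset(seed=...)` / `env.action_space.seed(...)` calls assume a Gymnasium-style environment):
+
+```python
+import random
+import numpy as np
+import torch
+
+def seed_everything(seed, env=None):
+    """Seed Python, NumPy, PyTorch (CPU and CUDA), and optionally the environment."""
+    random.seed(seed)
+    np.random.seed(seed)
+    torch.manual_seed(seed)
+    torch.cuda.manual_seed_all(seed)  # Silently ignored if CUDA is unavailable
+    if env is not None:
+        env.reset(seed=seed)           # Gymnasium-style env seeding
+        env.action_space.seed(seed)    # Makes action_space.sample() reproducible
+
+# Usage in the seed sweep:
+# for seed in range(5):
+#     seed_everything(seed, env)
+#     reward = train_agent(seed)
+```
+
+With all sources seeded, the remaining run-to-run variance reflects the algorithm's own stochasticity rather than uncontrolled randomness, which is exactly what the variance check above is trying to measure.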
+### Pitfall 3: "Skip Observation Normalization" + +**Wrong**: Non-normalized observations (scale [-1e6, 1e6]) + +**Right**: Normalized observations (scale [-1, 1]) + +**Red Flag**: State values > 100 or < -100 + +**Fix**: + +```python +# Normalize images +observation = observation.astype(np.float32) / 255.0 + +# Normalize states +observation = (observation - observation_mean) / observation_std + +# Or standardize on-the-fly +normalized_obs = (obs - running_mean) / (running_std + 1e-8) +``` + + +### Pitfall 4: "Ignore the Reward Curve Shape" + +**Wrong**: Only look at final reward, ignore curve shape + +**Right**: Curve shape tells you what's wrong + +**Red Flag**: Curve shapes indicate: + +- Flat then sudden jump: Long exploration, then found policy +- Oscillating: Unstable learning +- Decreasing after peak: Catastrophic forgetting + +**Fix**: + +```python +# Look at curve shape +if reward_curve is flat: + print("Not learning, check environment/reward") +elif reward_curve oscillates: + print("Unstable, check LR or reward scale") +elif reward_curve peaks then drops: + print("Overfitting or exploration decay wrong") +``` + + +### Pitfall 5: "Skip the Random Baseline Check" + +**Wrong**: Train agent without knowing what random baseline is + +**Right**: Always compute random baseline first + +**Red Flag**: Agent barely beats random (within 5% of baseline) + +**Fix**: + +```python +# Compute random baseline +random_rewards = [] +for _ in range(100): + state = env.reset() + episode_reward = 0 + for step in range(1000): + action = env.action_space.sample() + state, reward, done, _ = env.step(action) + episode_reward += reward + if done: + break + random_rewards.append(episode_reward) + +random_baseline = np.mean(random_rewards) +print(f"Random baseline: {random_baseline}") + +# Compare agent +agent_reward = train_agent() +improvement = (agent_reward - random_baseline) / random_baseline +print(f"Agent improvement: {improvement*100}%") +``` + + +### Pitfall 6: "Changing Multiple Hyperparameters at Once" + +**Wrong**: Change 5 things, training breaks, don't know which caused it + +**Right**: Change one thing at a time, test, measure, iterate + +**Red Flag**: Code has "TUNING" comments with 10 simultaneous changes + +**Fix**: + +```python +# Scientific method for debugging +def debug_lr(): + for lr in [1e-5, 1e-4, 1e-3, 1e-2]: + reward = train_with_lr(lr) + print(f"LR={lr}: Reward={reward}") + # Only change LR, keep everything else same + +def debug_batch_size(): + for batch in [32, 64, 128, 256]: + reward = train_with_batch(batch) + print(f"Batch={batch}: Reward={reward}") + # Only change batch, keep everything else same +``` + + +### Pitfall 7: "Using Training Metrics to Judge Performance" + +**Wrong**: Trust training reward, test once at the end + +**Right**: Monitor test reward during training (with exploration off) + +**Red Flag**: Training reward high, test reward low (overfitting) + +**Fix**: + +```python +# Evaluate with greedy policy (no exploration) +def evaluate(agent, num_episodes=10): + episode_rewards = [] + for _ in range(num_episodes): + state = env.reset() + episode_reward = 0 + for step in range(1000): + action = agent.act(state, explore=False) # Greedy + state, reward, done, _ = env.step(action) + episode_reward += reward + if done: + break + episode_rewards.append(episode_reward) + return np.mean(episode_rewards) + +# Monitor during training +for step in range(total_steps): + train_agent_step() + + if step % 10000 == 0: + test_reward = evaluate(agent) # Evaluate periodically + 
print(f"Step {step}: Test reward={test_reward}") +``` + + +## Part 7: Red Flags Checklist + +``` +CRITICAL RED FLAGS (Stop and debug immediately): + +[ ] NaN in loss or rewards + → Check: reward scale, gradients, network outputs + +[ ] Gradient norms > 100 (exploding) + → Check: Enable gradient clipping, reduce LR + +[ ] Gradient norms < 1e-4 (vanishing) + → Check: Increase LR, check network initialization + +[ ] Reward always same + → Check: Is reward function broken? No differentiation? + +[ ] Agent never improves beyond random baseline + → Check: Reward scale, environment, observation, exploration + +[ ] Loss oscillates wildly + → Check: Learning rate (likely too high), reward scale + +[ ] Episode length decreases over training + → Check: Agent learning bad behavior, poor reward shaping + +[ ] Test reward >> training reward + → Check: Training is lucky, test is representative + +[ ] Training gets worse after improving + → Check: Catastrophic forgetting, stability issue + + +IMPORTANT RED FLAGS (Debug within a few training runs): + +[ ] Entropy not decaying (always high) + → Check: Entropy regularization, exploration decay + +[ ] Entropy goes to zero early + → Check: Entropy coefficient too low, exploration too aggressive + +[ ] Variance across seeds > 50% of mean + → Check: Training is unstable or lucky, try more seeds + +[ ] Network weights not changing + → Check: Gradient zero, LR zero, network not connected + +[ ] Loss = 0 (perfect fit) + → Check: Network overfitting, reward too easy + + +MINOR RED FLAGS (Watch for patterns): + +[ ] Training slower than expected + → Check: LR too low, batch size too small, network too small + +[ ] Occasional loss spikes + → Check: Outlier data, reward outliers, clipping needed + +[ ] Reward variance high + → Check: Normal if environment stochastic, check if aligns with intent + +[ ] Agent behavior seems random even late in training + → Check: Entropy not decaying, exploration not stopping +``` + + +## Part 8: Rationalization Resistance + +| Rationalization | Reality | Counter-Guidance | +|-----------------|---------|------------------| +| "Higher learning rate will speed up learning" | Can cause instability, often slows learning | Start with 3e-4, measure effect, don't assume | +| "Bigger network always learns better" | Oversized networks overfit, slow training | Start small (64-256 units), increase only if needed | +| "Random seed doesn't matter, RL is random anyway" | High variance indicates instability, not inherent randomness | Run 5+ seeds, variance should be low, not high | +| "I'll try all hyperparameters (grid search)" | Combinatorial explosion, wastes time, no diagnosis | Check environment/reward FIRST, then tune one param at a time | +| "Adding regularization helps unstable training" | Regularization is for overfitting, not instability | Instability usually LR or reward scale, not overfitting | +| "My algorithm is broken" | 80% chance environment, reward, or observation is broken | Check those FIRST before blaming algorithm | +| "More training always helps" | If reward plateaus, more training won't help | Check if training converged, if not why | +| "Skip observation normalization, network will learn to normalize" | Network should not spend capacity learning normalization | Normalize observations before network | +| "Test with epsilon > 0 to reduce variance" | Test should use learned policy, exploration harms test | Use greedy policy at test time | +| "If loss doesn't decrease, algorithm is broken" | More likely: reward scale wrong, 
gradient clipping needed | Check reward scale, enable gradient clipping before changing algorithm | + + +## Key Takeaways + +1. **Follow the systematic process**: Don't random tweak. Check environment → reward → observation → algorithm. + +2. **80/20 rule**: Most failures are in environment, reward, or observation. Check those first. + +3. **Reward scale is critical**: Most common bug. Normalize to [-1, 1]. + +4. **Diagnosis trees**: Use them. Different symptoms have different root causes. + +5. **Metrics tell you everything**: Loss, entropy, gradient norms reveal what's wrong. + +6. **Rationalization is the enemy**: Don't assume, measure. Plot curves, check outputs, verify. + +7. **Simple environment first**: If agent can't learn CartPole, bigger environment won't help. + +8. **One seed is not enough**: Run 5+ seeds, look at variance, not just mean. + +This skill is about **systematic debugging**, not random tweaking. Apply the framework, follow diagnosis trees, and you'll find the bug. diff --git a/skills/using-deep-rl/rl-environments.md b/skills/using-deep-rl/rl-environments.md new file mode 100644 index 0000000..336dba7 --- /dev/null +++ b/skills/using-deep-rl/rl-environments.md @@ -0,0 +1,1780 @@ + +# RL Environments: Building and Debugging Custom Environments + +## When to Use This Skill + +Invoke this skill when you need to: + +- **Create Custom Environments**: Build a new environment from scratch using Gym/gymnasium +- **Define Observation/Action Spaces**: Design Box, Discrete, Dict, Tuple spaces correctly +- **Use Environment Wrappers**: Add preprocessing, modify rewards, implement time limits +- **Parallelize Environments**: Choose between DummyVectorEnv, SyncVectorEnv, AsyncVectorEnv +- **Debug Environment Bugs**: Diagnose reset/step issues, reward scaling, space mismatches +- **Test Environments**: Validate environments before training agents +- **Handle API Differences**: Migrate between Gym versions or Gym vs gymnasium +- **Implement Complex State**: Manage multi-component observations and state systems +- **Enforce Action Bounds**: Properly clip or scale actions +- **Catch Common Pitfalls**: Avoid 10+ common environment implementation mistakes + +**Core Problem**: Environments are the foundation of RL training. Broken environments cause 80% of RL failures, but environment bugs are often missed because they don't error—they silently break training. This skill systematically teaches correct environment design and provides a debugging methodology. 
+ +## Do NOT Use This Skill For + +- **Algorithm implementation** (route to specific algorithm skills like value-based-methods, policy-gradient-methods, actor-critic-methods) +- **Reward design and shaping** (route to reward-shaping-engineering for reward function engineering and potential-based shaping) +- **RL theory and foundations** (route to rl-foundations for MDPs, Bellman equations, value functions) +- **Training debugging beyond environment issues** (route to rl-debugging for systematic diagnosis of training failures) +- **Exploration strategy selection** (route to exploration-strategies for ε-greedy, curiosity-driven, RND methods) + + +## Part 1: Understanding the Gym/Gymnasium API + +### The Standard Interface + +Every Gym/Gymnasium environment implements: + +```python +import gymnasium as gym # or 'gym' for older versions + +class CustomEnv(gym.Env): + """Template for all custom environments""" + + def __init__(self): + # Define action and observation spaces + self.action_space = gym.spaces.Discrete(4) # 4 possible actions + self.observation_space = gym.spaces.Box( + low=0, high=255, shape=(84, 84, 3), dtype=np.uint8 + ) + + def reset(self, seed=None): + """Reset environment to initial state + + Returns: + observation (np.ndarray): Initial observation + info (dict): Auxiliary info (can be empty dict) + """ + super().reset(seed=seed) + obs = self._get_initial_observation() + info = {} + return obs, info + + def step(self, action): + """Take one action in the environment + + Args: + action: Action from action_space + + Returns: + observation (np.ndarray): Current observation after action + reward (float): Reward for this step + terminated (bool): True if episode ended (goal/failure) + truncated (bool): True if episode cut off (time limit) + info (dict): Auxiliary info + """ + obs = self._apply_action(action) + reward = self._compute_reward() + terminated = self._is_done() + truncated = False # Set by TimeLimit wrapper usually + info = {} + return obs, reward, terminated, truncated, info + + def render(self, mode='human'): + """Visualize the environment (optional)""" + pass + + def close(self): + """Cleanup resources (optional)""" + pass +``` + +### Key API Points + +**1. Reset Format (Gymnasium API)** + +```python +# CORRECT: Reset returns (observation, info) +observation, info = env.reset() + +# WRONG: Old Gym API returned just observation +observation = env.reset() # This is Gym, not Gymnasium +``` + +**2. Step Format (Gymnasium API)** + +```python +# CORRECT: Step returns (obs, reward, terminated, truncated, info) +obs, reward, terminated, truncated, info = env.step(action) + +# WRONG: Old Gym API +obs, reward, done, info = env.step(action) # 'done' is single boolean +``` + +**3. Gym vs Gymnasium** + +| Feature | Gym (OpenAI) | Gymnasium (Maintained) | +|---------|--------------|----------------------| +| Reset return | `obs` | `(obs, info)` | +| Step return | `(obs, r, done, info)` | `(obs, r, terminated, truncated, info)` | +| Render | `env.render(mode='human')` | `env.render()`; mode set at init | +| Import | `import gym` | `import gymnasium as gym` | +| Support | Deprecated | Current standard | + +**Decision**: Use `gymnasium` for new code. 
If stuck with older code: + +```python +# Compatibility wrapper +try: + import gymnasium as gym +except ImportError: + import gym +``` + + +## Part 2: Observation and Action Space Design + +### Space Types + +**Discrete Space** (for discrete actions or observations) + +```python +# 4 possible actions: 0, 1, 2, 3 +action_space = gym.spaces.Discrete(4) + +# 5 possible discrete states +observation_space = gym.spaces.Discrete(5) + +# With start parameter +action_space = gym.spaces.Discrete(4, start=1) # 1, 2, 3, 4 +``` + +**Box Space** (for continuous or image data) + +```python +# Continuous control: 3D position, each in [-1, 1] +action_space = gym.spaces.Box( + low=-1.0, + high=1.0, + shape=(3,), + dtype=np.float32 +) + +# Image observation: 84x84 RGB, pixels 0-255 +observation_space = gym.spaces.Box( + low=0, + high=255, + shape=(84, 84, 3), + dtype=np.uint8 +) + +# Multi-component continuous: 2D position + 1D velocity +observation_space = gym.spaces.Box( + low=np.array([-1.0, -1.0, -10.0]), + high=np.array([1.0, 1.0, 10.0]), + dtype=np.float32 +) +``` + +**Dict Space** (for structured observations with multiple components) + +```python +# Multi-component observation: image + state vector +observation_space = gym.spaces.Dict({ + 'image': gym.spaces.Box(0, 255, (84, 84, 3), dtype=np.uint8), + 'position': gym.spaces.Box(-1, 1, (2,), dtype=np.float32), +}) + +# Access in reset/step: +obs = { + 'image': np.random.randint(0, 256, (84, 84, 3), dtype=np.uint8), + 'position': np.array([0.5, -0.3], dtype=np.float32), +} +``` + +**Tuple Space** (for ordered multiple components) + +```python +observation_space = gym.spaces.Tuple(( + gym.spaces.Box(-1, 1, (2,), dtype=np.float32), # Position + gym.spaces.Discrete(4), # Direction +)) + +# Access: +obs = (np.array([0.5, -0.3], dtype=np.float32), 2) +``` + +**MultiDiscrete** (for multiple discrete action dimensions) + +```python +# Game with 4 actions per agent, 3 agents +action_space = gym.spaces.MultiDiscrete([4, 4, 4]) + +# Or asymmetric +action_space = gym.spaces.MultiDiscrete([3, 4, 5]) # Different choices per dimension +``` + +### Space Validation Patterns + +**Always validate that observations match the space:** + +```python +def reset(self, seed=None): + super().reset(seed=seed) + obs = self._get_observation() + + # CRITICAL: Validate observation against space + assert self.observation_space.contains(obs), \ + f"Observation {obs} not in space {self.observation_space}" + + return obs, {} + +def step(self, action): + # CRITICAL: Validate action is in action space + assert self.action_space.contains(action), \ + f"Action {action} not in space {self.action_space}" + + obs = self._apply_action(action) + + # Validate observation + assert self.observation_space.contains(obs), \ + f"Observation {obs} not in space {self.observation_space}" + + reward = self._compute_reward() + terminated = self._check_done() + truncated = False + + return obs, reward, terminated, truncated, {} +``` + +### Common Space Mistakes + +**Mistake 1: dtype mismatch (uint8 vs float32)** + +```python +# WRONG: Space says uint8 but observation is float32 +observation_space = gym.spaces.Box(0, 255, (84, 84, 3), dtype=np.uint8) +obs = np.random.random((84, 84, 3)).astype(np.float32) # MISMATCH! 
+assert self.observation_space.contains(obs) # FAILS + +# CORRECT: Match dtype +observation_space = gym.spaces.Box(0, 1, (84, 84, 3), dtype=np.float32) +obs = np.random.random((84, 84, 3)).astype(np.float32) +assert self.observation_space.contains(obs) # PASSES +``` + +**Mistake 2: Range mismatch** + +```python +# WRONG: Observation outside declared range +observation_space = gym.spaces.Box(0, 1, (4,), dtype=np.float32) +obs = np.array([0.5, 1.5, 0.2, 0.8], dtype=np.float32) # 1.5 > 1! +assert self.observation_space.contains(obs) # FAILS + +# CORRECT: Ensure observations stay within bounds +obs = np.clip(obs, 0, 1) +``` + +**Mistake 3: Shape mismatch** + +```python +# WRONG: Wrong shape +observation_space = gym.spaces.Box(0, 255, (84, 84, 3), dtype=np.uint8) +obs = np.random.randint(0, 256, (84, 84), dtype=np.uint8) # 2D, not 3D! +assert self.observation_space.contains(obs) # FAILS + +# CORRECT: Match shape exactly +obs = np.random.randint(0, 256, (84, 84, 3), dtype=np.uint8) +``` + + +## Part 3: Creating Custom Environments - Template + +### Step 1: Inherit from gym.Env + +```python +import gymnasium as gym +import numpy as np + +class CartPoleMini(gym.Env): + """Simple environment for demonstration""" + + # These are required attributes + metadata = {"render_modes": ["human"], "render_fps": 30} + + def __init__(self, render_mode=None): + # Store render mode + self.render_mode = render_mode + + # Action space: push cart left (0) or right (1) + self.action_space = gym.spaces.Discrete(2) + + # Observation space: position, velocity, angle, angular velocity + self.observation_space = gym.spaces.Box( + low=np.array([-2.4, -10, -0.2, -10], dtype=np.float32), + high=np.array([2.4, 10, 0.2, 10], dtype=np.float32), + dtype=np.float32 + ) + + # Episode variables + self.state = None + self.steps = 0 + self.max_steps = 500 +``` + +### Step 2: Implement reset() + +```python + def reset(self, seed=None): + """Reset to initial state + + Returns: + obs (np.ndarray): Initial observation + info (dict): Empty dict + """ + super().reset(seed=seed) + + # Initialize state to center position with small noise + self.state = np.array( + [ + self.np_random.uniform(-0.05, 0.05), # position + 0.0, # velocity + self.np_random.uniform(-0.05, 0.05), # angle + 0.0, # angular velocity + ], + dtype=np.float32 + ) + self.steps = 0 + + # Validate and return + assert self.observation_space.contains(self.state) + return self.state, {} +``` + +### Step 3: Implement step() + +```python + def step(self, action): + """Execute one step of the environment + + Args: + action: 0 (push left) or 1 (push right) + + Returns: + obs, reward, terminated, truncated, info + """ + assert self.action_space.contains(action) + + # Validate state + assert self.observation_space.contains(self.state) + + x, x_dot, theta, theta_dot = self.state + + # Physics: apply force based on action + force = 10.0 if action == 1 else -10.0 + + # Simplified cartpole physics + acceleration = (force + 0.1 * theta) / 1.0 + theta_dot_new = theta_dot + 0.02 * acceleration + theta_new = theta + 0.02 * theta_dot + + x_dot_new = x_dot + 0.02 * acceleration + x_new = x + 0.02 * x_dot + + # Update state + self.state = np.array( + [x_new, x_dot_new, theta_new, theta_dot_new], + dtype=np.float32 + ) + + # Clamp values to stay in bounds + self.state = np.clip(self.state, + self.observation_space.low, + self.observation_space.high) + + # Compute reward + reward = 1.0 if abs(theta) < 0.2 else -1.0 + + # Check termination + x, theta = self.state[0], self.state[2] + terminated = 
abs(x) > 2.4 or abs(theta) > 0.2 + + # Check truncation (max steps) + self.steps += 1 + truncated = self.steps >= self.max_steps + + # Validate output + assert self.observation_space.contains(self.state) + assert isinstance(reward, (int, float)) + + return self.state, float(reward), terminated, truncated, {} +``` + +### Step 4: Implement render() and close() (Optional) + +```python + def render(self): + """Render the environment (optional)""" + if self.render_mode == "human": + # Print state for visualization + x, x_dot, theta, theta_dot = self.state + print(f"Position: {x:.2f}, Angle: {theta:.2f}") + + def close(self): + """Cleanup (optional)""" + pass +``` + +### Complete Custom Environment Example + +```python +import gymnasium as gym +import numpy as np + +class GridWorldEnv(gym.Env): + """Simple 5x5 grid world where agent seeks goal""" + + def __init__(self): + # Actions: up=0, right=1, down=2, left=3 + self.action_space = gym.spaces.Discrete(4) + + # Observation: (x, y) position + self.observation_space = gym.spaces.Box( + low=0, high=4, shape=(2,), dtype=np.int32 + ) + + self.grid_size = 5 + self.goal = np.array([4, 4], dtype=np.int32) + self.agent_pos = np.array([0, 0], dtype=np.int32) + self.steps = 0 + self.max_steps = 50 + + def reset(self, seed=None): + super().reset(seed=seed) + self.agent_pos = np.array([0, 0], dtype=np.int32) + self.steps = 0 + assert self.observation_space.contains(self.agent_pos) + return self.agent_pos.copy(), {} + + def step(self, action): + assert self.action_space.contains(action) + + # Move agent + moves = { + 0: np.array([0, 1], dtype=np.int32), # up + 1: np.array([1, 0], dtype=np.int32), # right + 2: np.array([0, -1], dtype=np.int32), # down + 3: np.array([-1, 0], dtype=np.int32), # left + } + + self.agent_pos += moves[action] + self.agent_pos = np.clip(self.agent_pos, 0, self.grid_size - 1) + + # Reward + distance_to_goal = np.linalg.norm(self.agent_pos - self.goal) + reward = 1.0 if np.array_equal(self.agent_pos, self.goal) else -0.01 + + # Done + terminated = np.array_equal(self.agent_pos, self.goal) + self.steps += 1 + truncated = self.steps >= self.max_steps + + return self.agent_pos.copy(), reward, terminated, truncated, {} +``` + + +## Part 4: Environment Wrappers + +### Why Use Wrappers? 
+ +Wrappers add functionality without modifying the original environment: + +```python +# Without wrappers: modify environment directly (WRONG - mixes concerns) +class CartPoleNormalized(CartPole): + def step(self, action): + obs, reward, done, info = super().step(action) + obs = obs / 2.4 # Normalize observation + reward = reward / 100 # Normalize reward + return obs, reward, done, info + +# With wrappers: compose functionality (RIGHT - clean separation) +env = CartPole() +env = NormalizeObservation(env) +env = NormalizeReward(env) +``` + +### Wrapper Pattern + +```python +class BaseWrapper(gym.Wrapper): + """Base class for all wrappers""" + + def __init__(self, env): + super().__init__(env) + # Don't modify spaces unless you redefine them + + def reset(self, seed=None): + obs, info = self.env.reset(seed=seed) + return self._process_observation(obs), info + + def step(self, action): + obs, reward, terminated, truncated, info = self.env.step(action) + obs = self._process_observation(obs) + reward = self._process_reward(reward) + return obs, reward, terminated, truncated, info + + def _process_observation(self, obs): + return obs + + def _process_reward(self, reward): + return reward +``` + +### Common Built-in Wrappers + +**TimeLimit: Add episode time limit** + +```python +env = gym.make("CartPole-v1") +env = gym.wrappers.TimeLimit(env, max_episode_steps=500) +# Now truncated=True after 500 steps +``` + +**NormalizeObservation: Normalize observations to [-1, 1]** + +```python +env = gym.wrappers.NormalizeObservation(env) +# Observations normalized using running mean/std +``` + +**RecordVideo: Save episode videos** + +```python +env = gym.wrappers.RecordVideo( + env, + video_folder="videos/", + episode_trigger=lambda ep: ep % 10 == 0 +) +``` + +**ClipAction: Clip actions to action space bounds** + +```python +env = gym.wrappers.ClipAction(env) +# Actions automatically clipped to [-1, 1] or similar +``` + +### Custom Wrapper Example: Scale Rewards + +```python +class ScaleRewardWrapper(gym.Wrapper): + """Scale rewards by a constant factor""" + + def __init__(self, env, scale=0.1): + super().__init__(env) + self.scale = scale + + def step(self, action): + obs, reward, terminated, truncated, info = self.env.step(action) + return obs, reward * self.scale, terminated, truncated, info +``` + +**Custom Wrapper Example: Frame Stacking** + +```python +class FrameStackWrapper(gym.Wrapper): + """Stack last 4 frames for temporal information""" + + def __init__(self, env, num_frames=4): + super().__init__(env) + self.num_frames = num_frames + self.frame_buffer = collections.deque(maxlen=num_frames) + + # Modify observation space to include stacking + old_space = env.observation_space + self.observation_space = gym.spaces.Box( + low=old_space.low.min(), + high=old_space.high.max(), + shape=(old_space.shape[0], old_space.shape[1], + old_space.shape[2] * num_frames), + dtype=old_space.dtype + ) + + def reset(self, seed=None): + obs, info = self.env.reset(seed=seed) + self.frame_buffer.clear() + for _ in range(self.num_frames): + self.frame_buffer.append(obs) + return self._get_stacked_obs(), info + + def step(self, action): + obs, reward, terminated, truncated, info = self.env.step(action) + self.frame_buffer.append(obs) + return self._get_stacked_obs(), reward, terminated, truncated, info + + def _get_stacked_obs(self): + # Stack frames along channel dimension + return np.concatenate(list(self.frame_buffer), axis=2) +``` + +### Wrapper Chaining + +```python +# Correct: Chain wrappers for composable 
functionality +env = gym.make("Atari2600-v0") +env = gym.wrappers.TimeLimit(env, max_episode_steps=4500) +env = gym.wrappers.ClipAction(env) +env = FrameStackWrapper(env, num_frames=4) +env = gym.wrappers.NormalizeObservation(env) + +# Order matters: think about data flow +# raw env -> ClipAction -> FrameStack -> NormalizeObservation +``` + + +## Part 5: Vectorized Environments + +### Types of Vectorized Environments + +**DummyVectorEnv: Serial execution (simple, slowest)** + +```python +from gymnasium.vector import DummyVectorEnv + +# Create 4 independent environments (serial) +envs = DummyVectorEnv([ + lambda: gym.make("CartPole-v1") + for i in range(4) +]) + +obs, info = envs.reset() # obs shape: (4, 4) +actions = np.array([0, 1, 1, 0]) # 4 actions +obs, rewards, terminateds, truncateds, info = envs.step(actions) +# rewards shape: (4,) +``` + +**SyncVectorEnv: Synchronized parallel (fast, moderate complexity)** + +```python +from gymnasium.vector import SyncVectorEnv + +# Create 8 parallel environments (all step together) +envs = SyncVectorEnv([ + lambda: gym.make("CartPole-v1") + for i in range(8) +]) + +obs, info = envs.reset() +# All 8 envs step synchronously +obs, rewards, terminateds, truncateds, info = envs.step(actions) +``` + +**AsyncVectorEnv: Asynchronous parallel (fastest, most complex)** + +```python +from gymnasium.vector import AsyncVectorEnv + +# Create 16 parallel environments (independent processes) +envs = AsyncVectorEnv([ + lambda: gym.make("CartPole-v1") + for i in range(16) +]) + +# Same API as SyncVectorEnv but faster +obs, info = envs.reset() +obs, rewards, terminateds, truncateds, info = envs.step(actions) +envs.close() # IMPORTANT: Close async envs to cleanup processes +``` + +### Comparison and Decision Tree + +| Feature | Dummy | Sync | Async | +|---------|-------|------|-------| +| Speed | Slow | Fast | Fastest | +| CPU cores | 1 | 1 (+ GIL) | N | +| Memory | Low | Moderate | High | +| Complexity | Simple | Medium | Complex | +| Debugging | Easy | Medium | Hard | +| Best for | Testing | Training | Large-scale training | + +**When to use each:** + +```python +num_envs = 32 + +if num_envs <= 1: + # Single environment + env = gym.make("CartPole-v1") +elif num_envs <= 4: + # Few environments: use Dummy for simplicity + env = DummyVectorEnv([gym.make("CartPole-v1") for _ in range(num_envs)]) +elif num_envs <= 8: + # Medium: use Sync for speed without complexity + env = SyncVectorEnv([gym.make("CartPole-v1") for _ in range(num_envs)]) +else: + # Many: use Async for maximum speed + env = AsyncVectorEnv([gym.make("CartPole-v1") for _ in range(num_envs)]) +``` + +### Common Vectorized Environment Bugs + +**Bug 1: Forgetting to close AsyncVectorEnv** + +```python +# WRONG: Processes leak +envs = AsyncVectorEnv([...] for _ in range(16)) +# ... training ... +# Forgot to close! Processes stay alive, memory leaks + +# CORRECT: Always close +try: + envs = AsyncVectorEnv([...] for _ in range(16)) + # ... training ... +finally: + envs.close() # Cleanup + +# Or use context manager +from contextlib import contextmanager + +@contextmanager +def make_async_envs(num_envs): + envs = AsyncVectorEnv([...] for _ in range(num_envs)) + try: + yield envs + finally: + envs.close() +``` + +**Bug 2: Non-parallel-safe environment** + +```python +# WRONG: Environment uses shared state, breaks with AsyncVectorEnv +class NonParallelEnv(gym.Env): + global_counter = 0 # SHARED STATE! + + def step(self, action): + self.global_counter += 1 # Race condition with async! + ... 
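+
+# In practice each AsyncVectorEnv worker process gets its own copy of class-level
+# state, so the "shared" counter silently diverges across envs instead of
+# aggregating; keep mutable state on the instance, as shown below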
+ +# CORRECT: No shared state +class ParallelSafeEnv(gym.Env): + def __init__(self): + self.counter = 0 # Instance variable, not shared + + def step(self, action): + self.counter += 1 # Safe in parallel + ... +``` + +**Bug 3: Handling auto-reset in vectorized envs** + +```python +# When an episode terminates in vectorized env, it auto-resets +obs, rewards, terminateds, truncateds, info = envs.step(actions) + +# If terminateds[i] is True, envs[i] has been auto-reset +# The obs[i] is the NEW initial observation from the reset +# NOT the final observation of the episode + +# To get final observation before reset: +obs, rewards, terminateds, truncateds, info = envs.step(actions) +final_obs = info['final_observation'] # Original terminal obs +reset_obs = obs # New obs from auto-reset +``` + + +## Part 6: Common Environment Bugs and Fixes + +### Bug 1: Reward Scale Too Large + +**Symptom**: Training unstable, losses spike, agent behavior random + +```python +# WRONG: Reward in range [0, 1000] +def step(self, action): + reward = self.goal_distance * 1000 # Can be up to 1000! + return obs, reward, done, truncated, info + +# Problem: Gradients huge -> param updates too large -> training breaks + +# CORRECT: Reward in [-1, 1] +def step(self, action): + reward = self.goal_distance # Range [0, 1] + reward = reward - 0.5 # Scale to [-0.5, 0.5] + return obs, reward, done, truncated, info + +# Or normalize post-hoc +reward = np.clip(reward / 1000, -1, 1) +``` + +### Bug 2: Action Not Applied Correctly + +**Symptom**: Agent learns but behavior doesn't match reward signal + +```python +# WRONG: Action read but not used +def step(self, action): + obs = self._get_next_obs() # Doesn't use action! + reward = 1.0 # Reward independent of action + return obs, reward, False, False, {} + +# CORRECT: Action determines next state +def step(self, action): + self._apply_action_to_physics(action) + obs = self._get_next_obs() + reward = self._compute_reward(action) + return obs, reward, False, False, {} +``` + +### Bug 3: Missing Terminal State Flag + +**Symptom**: Episodes don't end properly, agent never learns boundaries + +```python +# WRONG: Always done=False +def step(self, action): + ... + return obs, reward, False, False, {} # Episode never ends! + +# CORRECT: Set terminated when episode should end +def step(self, action): + ... + terminated = self._check_done_condition() + if terminated: + reward += 100 # Bonus for reaching goal + return obs, reward, terminated, False, {} + +# Also differentiate from truncation +def step(self, action): + ... + self.steps += 1 + terminated = self._reached_goal() # Success condition + truncated = self.steps >= self.max_steps # Time limit + return obs, reward, terminated, truncated, {} +``` + +### Bug 4: Observation/Space Mismatch + +**Symptom**: Training crashes or behaves oddly after environment change + +```python +# WRONG: Space and observation don't match +def __init__(self): + self.observation_space = gym.spaces.Box(0, 1, (4,), dtype=np.float32) + +def step(self, action): + obs = np.random.randint(0, 256, (4,), dtype=np.uint8) # uint8! + return obs, reward, done, truncated, {} # Mismatch! + +# CORRECT: Match dtype and range +def __init__(self): + self.observation_space = gym.spaces.Box(0, 255, (4,), dtype=np.uint8) + +def step(self, action): + obs = np.random.randint(0, 256, (4,), dtype=np.uint8) # Matches! 
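+    # dtype, shape, and range now all match the declared space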
+ assert self.observation_space.contains(obs) + return obs, reward, done, truncated, {} +``` + +### Bug 5: Reset Not Initializing State + +**Symptom**: First episode works, subsequent episodes fail + +```python +# WRONG: Reset doesn't actually reset +def reset(self, seed=None): + super().reset(seed=seed) + # Forgot to initialize state! + return self.state, {} # self.state is stale from last episode + +# CORRECT: Reset initializes everything +def reset(self, seed=None): + super().reset(seed=seed) + self.state = self._initialize_state() + self.steps = 0 + return self.state, {} +``` + +### Bug 6: Non-Deterministic Environment Without Proper Seeding + +**Symptom**: Same reset produces different initial states, breaks reproducibility + +```python +# WRONG: Randomness not seeded +def reset(self, seed=None): + super().reset(seed=seed) + self.state = np.random.randn(4) # Uses default RNG, ignores seed! + return self.state, {} + +# CORRECT: Use self.np_random which respects seed +def reset(self, seed=None): + super().reset(seed=seed) + # self.np_random is seeded by super().reset() + self.state = self.np_random.randn(4) + return self.state, {} +``` + +### Bug 7: Info Dict Contains Non-Serializable Objects + +**Symptom**: Episode fails when saving/loading with replay buffers + +```python +# WRONG: Info dict contains unpicklable objects +def step(self, action): + info = { + 'env': self, # Can't pickle! + 'callback': self.callback_fn, # Can't pickle! + } + return obs, reward, done, truncated, info + +# CORRECT: Only basic types in info dict +def step(self, action): + info = { + 'level': self.level, + 'score': self.score, + 'x_position': float(self.x), + } + return obs, reward, done, truncated, info +``` + +### Bug 8: Action Space Not Enforced + +**Symptom**: Agent takes actions outside valid range, causes crashes + +```python +# WRONG: Action space defined but not enforced +def __init__(self): + self.action_space = gym.spaces.Box(-1, 1, (3,)) + +def step(self, action): + # action could be [10, 10, 10] and we don't catch it! + velocity = action * 10 # Huge velocity! + ... + +# CORRECT: Clip or validate actions +def step(self, action): + assert self.action_space.contains(action), \ + f"Invalid action {action}" + + # Or clip to bounds + action = np.clip(action, + self.action_space.low, + self.action_space.high) + ... +``` + +### Bug 9: Observation Normalization Not Applied + +**Symptom**: Training unstable when observations are in [0, 255] instead of [0, 1] + +```python +# WRONG: Large observation range breaks training +def step(self, action): + obs = self.render_to_image() # Range [0, 255] + return obs, reward, done, truncated, {} + +# CORRECT: Normalize observations +def step(self, action): + obs = self.render_to_image() # Range [0, 255] + obs = obs.astype(np.float32) / 255.0 # Normalize to [0, 1] + return obs, reward, done, truncated, {} + +# Or use NormalizeObservation wrapper +env = NormalizeObservation(env) +``` + +### Bug 10: Forgetting to Return Info Dict + +**Symptom**: Step returns wrong number of values, crashes agent training loop + +```python +# WRONG: Step returns 4 values (old Gym API) +def step(self, action): + return obs, reward, done, info # WRONG! 
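+    # Unpacking this 4-tuple into five names raises:
+    # ValueError: not enough values to unpack (expected 5, got 4)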
+ +# CORRECT: Step returns 5 values (Gymnasium API) +def step(self, action): + return obs, reward, terminated, truncated, info + +# Or use try-except during migration +try: + obs, reward, terminated, truncated, info = env.step(action) +except ValueError: + obs, reward, done, info = env.step(action) + terminated = done + truncated = False +``` + + +## Part 7: Environment Testing Checklist + +Before training an RL agent on a custom environment, validate: + +### Pre-Training Validation Checklist + +```python +class EnvironmentValidator: + """Validate custom environment before training""" + + def validate_all(self, env): + """Run all validation tests""" + print("Validating environment...") + + # 1. Spaces are valid + self.validate_spaces(env) + print("✓ Spaces valid") + + # 2. Reset works + obs, info = self.validate_reset(env) + print("✓ Reset works") + + # 3. Step works and returns correct format + self.validate_step(env, obs) + print("✓ Step works") + + # 4. Observations are valid + self.validate_observations(env, obs) + print("✓ Observations valid") + + # 5. Actions are enforced + self.validate_actions(env) + print("✓ Actions enforced") + + # 6. Terminal states work + self.validate_termination(env) + print("✓ Termination works") + + # 7. Environment is reproducible + self.validate_reproducibility(env) + print("✓ Reproducibility verified") + + # 8. Random agent can run + self.validate_random_agent(env) + print("✓ Random agent runs") + + print("\nEnvironment validation PASSED!") + + def validate_spaces(self, env): + """Check spaces are defined""" + assert hasattr(env, 'action_space'), "No action_space" + assert hasattr(env, 'observation_space'), "No observation_space" + assert isinstance(env.action_space, gym.spaces.Space) + assert isinstance(env.observation_space, gym.spaces.Space) + + def validate_reset(self, env): + """Check reset returns (obs, info)""" + result = env.reset() + assert isinstance(result, tuple) and len(result) == 2, \ + f"Reset should return (obs, info), got {result}" + obs, info = result + assert isinstance(info, dict), "Info should be dict" + return obs, info + + def validate_step(self, env, obs): + """Check step returns 5-tuple""" + action = env.action_space.sample() + result = env.step(action) + assert isinstance(result, tuple) and len(result) == 5, \ + f"Step should return 5-tuple, got {len(result)}" + obs, reward, terminated, truncated, info = result + assert isinstance(reward, (int, float)), "Reward must be number" + assert isinstance(terminated, (bool, np.bool_)), "terminated must be bool" + assert isinstance(truncated, (bool, np.bool_)), "truncated must be bool" + assert isinstance(info, dict), "Info must be dict" + + def validate_observations(self, env, obs): + """Check observations match space""" + assert env.observation_space.contains(obs), \ + f"Observation {obs.shape} not in space {env.observation_space}" + + def validate_actions(self, env): + """Check invalid actions fail""" + if isinstance(env.action_space, gym.spaces.Discrete): + invalid_action = env.action_space.n + 10 + assert not env.action_space.contains(invalid_action) + + def validate_termination(self, env): + """Check episodes can terminate""" + obs, _ = env.reset() + for _ in range(1000): + action = env.action_space.sample() + obs, reward, terminated, truncated, info = env.step(action) + if terminated or truncated: + break + assert terminated or truncated, \ + "Episode never terminated in 1000 steps!" 
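+
+    # Optional extra check (a sketch, not part of the original checklist): flags
+    # reward scales far outside the [-1, 1] guideline from Part 6; the threshold
+    # of 10 is an illustrative assumption, tune it for your task
+    def validate_reward_scale(self, env, num_steps=200):
+        """Warn if random-policy rewards look badly scaled."""
+        obs, _ = env.reset()
+        rewards = []
+        for _ in range(num_steps):
+            action = env.action_space.sample()
+            obs, reward, terminated, truncated, _ = env.step(action)
+            rewards.append(float(reward))
+            if terminated or truncated:
+                obs, _ = env.reset()
+        if np.max(np.abs(rewards)) > 10:
+            print("  WARNING: |reward| exceeds 10; consider rescaling toward [-1, 1]")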
+ + def validate_reproducibility(self, env): + """Check reset with seed is reproducible""" + obs1, _ = env.reset(seed=42) + obs2, _ = env.reset(seed=42) + assert np.allclose(obs1, obs2), "Reset not reproducible!" + + def validate_random_agent(self, env): + """Check environment works with random actions""" + obs, _ = env.reset() + total_reward = 0 + for _ in range(100): + action = env.action_space.sample() + obs, reward, terminated, truncated, info = env.step(action) + total_reward += reward + if terminated or truncated: + break + assert total_reward is not None, "No reward computed!" + +# Usage +validator = EnvironmentValidator() +validator.validate_all(env) +``` + +### Manual Testing + +Before training, play with the environment manually: + +```python +# Manual environment exploration +env = GridWorldEnv() +obs, _ = env.reset() + +while True: + action = int(input("Action (0=up, 1=right, 2=down, 3=left): ")) + obs, reward, terminated, truncated, info = env.step(action) + print(f"Position: {obs}, Reward: {reward}, Done: {terminated}") + + if terminated or truncated: + obs, _ = env.reset() + print("Episode reset") +``` + + +## Part 8: Red Flags and Anti-Patterns + +### Red Flag 1: Reward Scale Issue + +```python +# RED FLAG: Rewards in [0, 1000000] +reward = distance_to_goal * 1000000 # HUGE! + +# Solution: Scale to [-1, 1] +reward = -distance_to_goal / max_distance +assert -1 <= reward <= 1 +``` + +### Red Flag 2: Observation Type Mismatch + +```python +# RED FLAG: Observation dtype doesn't match space +observation_space = Box(0, 255, (84, 84, 3), dtype=np.uint8) +obs = np.random.random((84, 84, 3)).astype(np.float32) # MISMATCH! + +# Solution: Match dtype exactly +obs = (obs * 255).astype(np.uint8) +``` + +### Red Flag 3: Missing Done Flag + +```python +# RED FLAG: Episodes never end +def step(self, action): + return obs, reward, False, False, {} # Always False! + +# Solution: Implement termination logic +terminated = self.check_goal_reached() or self.check_failure() +``` + +### Red Flag 4: Action Bounds Not Enforced + +```python +# RED FLAG: Network outputs unconstrained +def step(self, action): # action could be [1000, -1000] + velocity = action # HUGE velocity! + +# Solution: Clip or validate +action = np.clip(action, + self.action_space.low, + self.action_space.high) +``` + +### Red Flag 5: Vectorized Environment Auto-Reset Confusion + +```python +# RED FLAG: Treating auto-reset obs as terminal obs +obs, rewards, terminateds, truncateds, info = envs.step(actions) +# obs contains NEW reset observations, not final observations! + +# Solution: Use info['final_observation'] +final_obs = info['final_observation'] +``` + +### Red Flag 6: Non-Parallel-Safe Shared State + +```python +# RED FLAG: Shared state breaks AsyncVectorEnv +class Env(gym.Env): + global_counter = 0 # SHARED! + + def step(self, action): + Env.global_counter += 1 # Race condition! + +# Solution: Instance variables only +def __init__(self): + self.counter = 0 # Instance-specific +``` + +### Red Flag 7: Info Dict with Unpicklable Objects + +```python +# RED FLAG: Can't serialize for replay buffer +info = { + 'env': self, + 'callback': self.fn, +} + +# Solution: Only basic types +info = { + 'level': 5, + 'score': 100, +} +``` + +### Red Flag 8: Forgetting to Close AsyncVectorEnv + +```python +# RED FLAG: Process leak +envs = AsyncVectorEnv([...]) +# ... 
forgot env.close() + +# Solution: Always close +envs.close() # or use try/finally +``` + + +## Part 9: Rationalization Resistance + +**Common Wrong Beliefs About Environments:** + +**Claim 1**: "My custom environment should just work without testing" + +- **Reality**: 80% of RL failures are environment bugs. Test before training. +- **Evidence**: Standard validation checklist catches bugs 95% of the time + +**Claim 2**: "Reward scaling doesn't matter, only matters for learning rate" + +- **Reality**: Reward scale affects gradient magnitudes directly. Too large = instability. +- **Evidence**: Scaling reward by 100x often breaks training even with correct learning rate + +**Claim 3**: "Wrappers are optional complexity I don't need" + +- **Reality**: Wrappers enforce separation of concerns. Without them, environments become unmaintainable. +- **Evidence**: Real RL code uses 3-5 wrappers (TimeLimit, Normalize, ClipAction, etc) + +**Claim 4**: "Vectorized environments are always faster" + +- **Reality**: Parallelization overhead for small envs can make them slower. +- **Evidence**: For < 4 envs, DummyVectorEnv is faster than AsyncVectorEnv + +**Claim 5**: "My environment is correct if the agent learns something" + +- **Reality**: Agent can learn to game a broken reward signal. +- **Evidence**: Agent learning ≠ environment correctness. Run tests. + +**Claim 6**: "AsyncVectorEnv doesn't need explicit close()" + +- **Reality**: Processes leak if not closed, draining system resources. +- **Evidence**: Unmanaged AsyncVectorEnv with 16+ processes brings systems to halt + +**Claim 7**: "Observation normalization breaks training" + +- **Reality**: Unnormalized large observations (like [0, 255]) break training. +- **Evidence**: Normalizing [0, 255] images to [0, 1] is standard practice + +**Claim 8**: "I don't need to validate action space enforcement" + +- **Reality**: Network outputs can violate bounds, causing physics errors. +- **Evidence**: Unclipped continuous actions often cause simulation failures + + +## Part 10: Pressure Test Scenarios + +### Scenario 1: Custom Environment Debugging + +```python +# Subagent challenge WITHOUT skill: +# "I built a custom CartPole variant. Training fails silently +# (agent doesn't learn). The environment seems fine when I test it. +# Where do I start debugging?" + +# Expected WITH skill: +# 1. Validate observation space matches actual observations +# 2. Validate action space bounds are enforced +# 3. Check reward scale is in [-1, 1] +# 4. Verify reset/step API is correct (Gym vs Gymnasium) +# 5. Run environment validator checklist +# 6. Manual play-test to check physics +# 7. Verify terminal state logic +``` + +### Scenario 2: Wrapper Composition + +```python +# Challenge: Build a correct wrapper stack +# env = gym.make("CartPole-v1") +# env = TimeLimit(env, 500) # Add time limit +# env = NormalizeObservation(env) # Normalize +# Should be safe to use with any policy training + +# WITHOUT skill: Guess order, wrong wrapping +# WITH skill: Know correct order, understand composition +``` + +### Scenario 3: Vectorization Decision + +```python +# Challenge: "I need to train on 32 parallel CartPoles. +# Which vectorized environment type is best?" + +# WITHOUT skill: Try all three, pick whichever runs +# WITH skill: Analyze trade-offs +# - 32 envs -> AsyncVectorEnv +# - Memory acceptable? -> Yes +# - Debugging needed? -> No -> Use Async +``` + +### Scenario 4: Space Mismatch Detection + +```python +# Challenge: Environment crashes during training with cryptic error. 
+# Observation is (84, 84, 3) uint8 but CNN expects float32 in [0, 1] + +# WITHOUT skill: Spend hours debugging network +# WITH skill: Immediately suspect observation/space mismatch +# Run validator, find dtype mismatch, fix preprocessing +``` + + +## Part 11: Advanced Patterns - Multi-Agent Environments + +### Multi-Agent Observation Spaces + +**Scenario: Multi-agent game with individual agent observations** + +```python +class MultiAgentGridWorld(gym.Env): + """2-agent cooperative environment""" + + def __init__(self, num_agents=2): + self.num_agents = num_agents + + # Each agent has its own action space + self.action_space = gym.spaces.MultiDiscrete([4] * num_agents) + + # Each agent observes its own position + other agents' positions + # Dict space allows per-agent observations + self.observation_space = gym.spaces.Dict({ + f'agent_{i}': gym.spaces.Box(0, 4, (2 * num_agents,), dtype=np.int32) + for i in range(num_agents) + }) + + self.agents = [np.array([i, 0], dtype=np.int32) for i in range(num_agents)] + self.goal = np.array([4, 4], dtype=np.int32) + + def reset(self, seed=None): + super().reset(seed=seed) + self.agents = [np.array([i, 0], dtype=np.int32) for i in range(self.num_agents)] + + obs = {} + for i in range(self.num_agents): + agent_obs = np.concatenate([agent.copy() for agent in self.agents]) + obs[f'agent_{i}'] = agent_obs.astype(np.int32) + + return obs, {} + + def step(self, actions): + """actions is array of length num_agents""" + moves = [ + np.array([0, 1], dtype=np.int32), + np.array([1, 0], dtype=np.int32), + np.array([0, -1], dtype=np.int32), + np.array([-1, 0], dtype=np.int32), + ] + + # Apply each agent's action + for i, action in enumerate(actions): + self.agents[i] += moves[action] + self.agents[i] = np.clip(self.agents[i], 0, 4) + + # Shared reward: both agents get reward for reaching goal + distances = [np.linalg.norm(agent - self.goal) for agent in self.agents] + reward = sum(1.0 / (1.0 + d) for d in distances) + + # Both must reach goal + terminated = all(np.array_equal(agent, self.goal) for agent in self.agents) + + # Construct observation for each agent + obs = {} + for i in range(self.num_agents): + agent_obs = np.concatenate([agent.copy() for agent in self.agents]) + obs[f'agent_{i}'] = agent_obs.astype(np.int32) + + truncated = False + return obs, reward, terminated, truncated, {} +``` + +### Key Multi-Agent Patterns + +```python +# Pattern 1: Separate rewards per agent +rewards = { + f'agent_{i}': compute_reward_for_agent(i) + for i in range(num_agents) +} + +# Pattern 2: Shared team reward +team_reward = sum(individual_rewards) / num_agents + +# Pattern 3: Mixed observations (shared + individual) +obs = { + f'agent_{i}': { + 'own_state': agent_states[i], + 'other_positions': [s for j, s in enumerate(agent_states) if j != i], + 'global_state': shared_state, + } + for i in range(num_agents) +} + +# Pattern 4: Synchronized reset for coordinated behavior +def reset(self, seed=None): + super().reset(seed=seed) + # All agents reset to coordinated starting positions + self.agents = initialize_team_formation() +``` + + +## Part 12: Integration with Training Loops + +### Proper Environment Integration + +```python +class TrainingLoop: + """Shows correct environment integration pattern""" + + def __init__(self, env, num_parallel=4): + self.env = self._setup_environment(env, num_parallel) + self.policy = build_policy() + + def _setup_environment(self, env, num_parallel): + """Proper environment setup""" + if num_parallel == 1: + env = gym.make(env) + elif 
num_parallel <= 4: + env = DummyVectorEnv([lambda: gym.make(env) for _ in range(num_parallel)]) + else: + env = SyncVectorEnv([lambda: gym.make(env) for _ in range(num_parallel)]) + + # Add standard wrappers + env = gym.wrappers.TimeLimit(env, max_episode_steps=1000) + env = NormalizeObservation(env) + + return env + + def train_one_episode(self): + """Correct training loop""" + obs, info = self.env.reset() + + total_reward = 0 + steps = 0 + + while True: + # Get action from policy + action = self.policy.get_action(obs) + + # CRITICAL: Validate action is in space + assert self.env.action_space.contains(action) + + # Step environment + obs, reward, terminated, truncated, info = self.env.step(action) + + # CRITICAL: Handle auto-reset in vectorized case + if 'final_observation' in info: + final_obs = info['final_observation'] + # Store final obs in replay buffer, not reset obs + else: + final_obs = obs + + # Store experience + self.store_experience(obs, reward, terminated, truncated, info) + + total_reward += np.mean(reward) if isinstance(reward, np.ndarray) else reward + steps += 1 + + # Check termination + if np.any(terminated) or np.any(truncated): + break + + return total_reward / steps + + def store_experience(self, obs, reward, terminated, truncated, info): + """Correct experience storage""" + # Handle vectorized case (obs, reward are arrays) + if isinstance(reward, np.ndarray): + for i in range(len(reward)): + self.replay_buffer.add( + obs=obs[i] if isinstance(obs, np.ndarray) else obs, + action=None, # Set before storing + reward=reward[i], + done=terminated[i] or truncated[i], + next_obs=obs[i] if isinstance(obs, np.ndarray) else obs, + ) +``` + +### Common Integration Mistakes + +**Mistake 1: Not closing AsyncVectorEnv** + +```python +# WRONG: Process leak +envs = AsyncVectorEnv([...] for _ in range(16)) +for episode in range(1000): + obs, _ = envs.reset() + # ... training ... +# Processes never cleaned up + +# CORRECT: Always cleanup +try: + envs = AsyncVectorEnv([...] for _ in range(16)) + for episode in range(1000): + obs, _ = envs.reset() + # ... training ... +finally: + envs.close() +``` + +**Mistake 2: Using wrong observation after auto-reset** + +```python +# WRONG: Mixing terminal and reset observations +obs, reward, terminated, truncated, info = envs.step(actions) +# obs is reset observation, but we treat it as terminal! 
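+# (the terminal observation lives in info['final_observation']; see the CORRECT version below)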
+store_in_replay_buffer(obs, reward, terminated) + +# CORRECT: Use final_observation for training +final_obs = info.get('final_observation', obs) +if np.any(terminated): + store_in_replay_buffer(final_obs, reward, terminated) +else: + next_obs = obs +``` + +**Mistake 3: Not validating agent actions** + +```python +# WRONG: Trust agent always outputs valid action +action = policy(obs) +obs, reward, terminated, truncated, info = env.step(action) + +# CORRECT: Validate before stepping +action = policy(obs) +action = np.clip(action, env.action_space.low, env.action_space.high) +assert env.action_space.contains(action) +obs, reward, terminated, truncated, info = env.step(action) +``` + + +## Part 13: Performance Optimization + +### Observation Preprocessing Performance + +```python +class OptimizedObservationPreprocessing: + """Efficient observation handling""" + + def __init__(self, env): + self.env = env + + def preprocess_observation(self, obs): + """Optimized preprocessing""" + # Avoid unnecessary copies + if obs.dtype == np.uint8: + # In-place division for efficiency + obs = obs.astype(np.float32) / 255.0 + else: + obs = obs / 255.0 + + # Use memmap for large observations + if obs.nbytes > 1_000_000: # > 1MB + # Consider using memory-mapped arrays + pass + + return obs + + def batch_preprocess(self, obs_batch): + """Batch processing for vectorized envs""" + # Vectorized preprocessing is faster than per-obs + if isinstance(obs_batch, np.ndarray) and obs_batch.ndim == 4: + # (batch_size, H, W, C) image batch + obs_batch = obs_batch.astype(np.float32) / 255.0 + return obs_batch +``` + +### Vectorization Performance Tips + +```python +# Benchmark: When does parallelization help? + +# For CartPole (fast env): +# - 1 env: 10k steps/sec on 1 core +# - 4 Dummy: 9k steps/sec (overhead) +# - 4 Sync: 15k steps/sec (parallelism helps) +# - 4 Async: 12k steps/sec (context switch overhead) + +# For Atari (slow env): +# - 1 env: 0.5k steps/sec on 1 core +# - 16 Dummy: 7k steps/sec (overhead worth it) +# - 16 Sync: 15k steps/sec (GIL limits) +# - 16 Async: 25k steps/sec (parallelism dominates) + +# Rule of thumb: +# - env_step_time < 1ms: parallelization overhead dominates, use Dummy +# - env_step_time 1-10ms: parallelization helps, use Sync +# - env_step_time > 10ms: parallelization essential, use Async +``` + + +## Part 14: Debugging Environment Issues Systematically + +### Diagnostic Checklist for Broken Training + +```python +class EnvironmentDebugger: + """Systematic environment debugging""" + + def full_diagnosis(self, env, policy): + """Complete environment diagnostic""" + print("=== Environment Diagnostic ===") + + # 1. Check environment API + self.check_api(env) + print("✓ API correct") + + # 2. Check spaces + self.check_spaces(env) + print("✓ Spaces valid") + + # 3. Check reset/step mechanics + self.check_mechanics(env) + print("✓ Reset/step mechanics correct") + + # 4. Check observation statistics + obs_stats = self.analyze_observations(env) + print(f"✓ Observations: mean={obs_stats['mean']:.3f}, std={obs_stats['std']:.3f}") + + # 5. Check reward statistics + reward_stats = self.analyze_rewards(env) + print(f"✓ Rewards: mean={reward_stats['mean']:.3f}, std={reward_stats['std']:.3f}") + if abs(reward_stats['mean']) > 1 or reward_stats['std'] > 1: + print(" WARNING: Reward scale may be too large") + + # 6. Check episode lengths + lengths = self.analyze_episode_lengths(env) + print(f"✓ Episode lengths: mean={lengths['mean']:.1f}, min={lengths['min']}, max={lengths['max']}") + + # 7. 
Check reproducibility + self.check_reproducibility(env) + print("✓ Reproducibility verified") + + # 8. Check with policy + self.check_policy_integration(env, policy) + print("✓ Policy integration works") + + def analyze_observations(self, env, num_episodes=10): + """Analyze observation distribution""" + obs_list = [] + for _ in range(num_episodes): + obs, _ = env.reset() + for _ in range(100): + action = env.action_space.sample() + obs, _, terminated, truncated, _ = env.step(action) + obs_list.append(obs.flatten()) + if terminated or truncated: + break + + obs_array = np.concatenate(obs_list) + return { + 'mean': np.mean(obs_array), + 'std': np.std(obs_array), + 'min': np.min(obs_array), + 'max': np.max(obs_array), + } + + def analyze_rewards(self, env, num_episodes=10): + """Analyze reward distribution""" + rewards = [] + for _ in range(num_episodes): + obs, _ = env.reset() + for _ in range(100): + action = env.action_space.sample() + obs, reward, terminated, truncated, _ = env.step(action) + rewards.append(reward) + if terminated or truncated: + break + + rewards = np.array(rewards) + return { + 'mean': np.mean(rewards), + 'std': np.std(rewards), + 'min': np.min(rewards), + 'max': np.max(rewards), + } + + def analyze_episode_lengths(self, env, num_episodes=20): + """Analyze episode length distribution""" + lengths = [] + for _ in range(num_episodes): + obs, _ = env.reset() + steps = 0 + for step in range(10000): # Max steps + action = env.action_space.sample() + obs, reward, terminated, truncated, _ = env.step(action) + steps += 1 + if terminated or truncated: + break + lengths.append(steps) + + lengths = np.array(lengths) + return { + 'mean': np.mean(lengths), + 'min': int(np.min(lengths)), + 'max': int(np.max(lengths)), + 'median': int(np.median(lengths)), + } +``` + + +## Summary: When to Invoke This Skill + +**Use rl-environments skill when:** + +1. Creating custom environments from scratch +2. Debugging environment-related training failures +3. Implementing observation/action spaces +4. Using or creating wrappers +5. Parallelizing environments +6. Testing environments before training +7. Handling Gym vs Gymnasium differences +8. Migrating environment code between versions +9. Building multi-agent or multi-component environments +10. Enforcing action/observation bounds correctly +11. Optimizing environment performance +12. Debugging training failures systematically + +**This skill prevents:** + +- 80% of RL bugs (environment issues) +- Silent training failures from broken environments +- Vectorization-related data corruption +- Observation/action space mismatches +- Reward scaling instabilities +- Terminal state logic errors +- Reproducibility issues from poor seeding +- Performance degradation from inefficient environments +- Multi-agent coordination failures +- Integration issues with training loops diff --git a/skills/using-deep-rl/rl-evaluation.md b/skills/using-deep-rl/rl-evaluation.md new file mode 100644 index 0000000..c28c8b3 --- /dev/null +++ b/skills/using-deep-rl/rl-evaluation.md @@ -0,0 +1,1591 @@ + +# RL Evaluation: Rigorous Methodology for Agent Assessment + + +RL evaluation is uniquely challenging due to high variance, temporal instability, environment overfitting, and sample efficiency considerations. 
Without rigorous methodology, you will: + +- Draw conclusions from statistical noise +- Report results that don't generalize +- Deploy agents that fail in production +- Waste resources on false improvements + +This skill provides systematic evaluation protocols that ensure statistical validity, generalization measurement, and deployment-ready assessment. + + +## When to Use This Skill + +Use this skill when: + +- ✅ Evaluating RL agent performance +- ✅ Comparing multiple RL algorithms +- ✅ Reporting results for publication or deployment +- ✅ Making algorithm selection decisions +- ✅ Assessing readiness for production deployment +- ✅ Debugging training (need accurate performance estimates) + +DO NOT use for: + +- ❌ Quick sanity checks during development (use informal evaluation) +- ❌ Monitoring training progress (use running averages) +- ❌ Initial hyperparameter sweeps (use coarse evaluation) + +**When in doubt:** If the evaluation result will inform a decision (publish, deploy, choose algorithm), use this skill. + + +## Core Principles + +### Principle 1: Statistical Rigor is Non-Negotiable + +**Reality:** RL has inherently high variance. Single runs are meaningless. + +**Enforcement:** + +- Minimum 5-10 random seeds for any performance claim +- Report mean ± std or 95% confidence intervals +- Statistical significance testing when comparing algorithms +- Never report single-seed results as representative + +### Principle 2: Train/Test Discipline Prevents Overfitting + +**Reality:** Agents exploit environment quirks. Training performance ≠ generalization. + +**Enforcement:** + +- Separate train/test environment instances +- Different random seeds for train/eval +- Test on distribution shifts (new instances, physics, appearances) +- Report both training and generalization performance + +### Principle 3: Sample Efficiency Matters + +**Reality:** Final performance ignores cost. Samples are often expensive. + +**Enforcement:** + +- Report sample efficiency curves (reward vs steps) +- Include "reward at X steps" for multiple budgets +- Consider deployment constraints +- Compare at SAME sample budget, not just asymptotic + +### Principle 4: Evaluation Mode Must Match Deployment + +**Reality:** Stochastic vs deterministic evaluation changes results by 10-30%. + +**Enforcement:** + +- Specify evaluation mode (stochastic/deterministic) +- Match evaluation to deployment scenario +- Report both if ambiguous +- Explain choice in methodology + +### Principle 5: Offline RL Requires Special Care + +**Reality:** Cannot accurately evaluate offline RL without online rollouts. + +**Enforcement:** + +- Acknowledge evaluation limitations +- Use conservative metrics (in-distribution performance) +- Quantify uncertainty +- Staged deployment (offline → small online trial → full) + + +## Statistical Evaluation Protocol + +### Multi-Seed Evaluation (MANDATORY) + +**Minimum Requirements:** + +- **Exploration/research**: 5-10 seeds minimum +- **Publication**: 10-20 seeds +- **Production deployment**: 20-50 seeds (depending on variance) + +**Protocol:** + +```python +import numpy as np +from scipy import stats + +def evaluate_multi_seed(algorithm, env_name, seeds, total_steps): + """ + Evaluate algorithm across multiple random seeds. 
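+
+    Note: `evaluate_deterministic` and the incremental `agent.train(steps=...)`
+    API are assumed helpers in this sketch; adapt them to your training framework.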
+ + Args: + algorithm: RL algorithm class + env_name: Environment name + seeds: List of random seeds + total_steps: Training steps per seed + + Returns: + Dictionary with statistics + """ + final_rewards = [] + sample_efficiency_curves = [] + + for seed in seeds: + # Train agent + env = gym.make(env_name, seed=seed) + agent = algorithm(env, seed=seed) + + # Track performance during training + eval_points = np.linspace(0, total_steps, num=20, dtype=int) + curve = [] + + for step in eval_points: + agent.train(steps=step) + reward = evaluate_deterministic(agent, env, episodes=10) + curve.append((step, reward)) + + sample_efficiency_curves.append(curve) + final_rewards.append(curve[-1][1]) # Final performance + + final_rewards = np.array(final_rewards) + + return { + 'mean': np.mean(final_rewards), + 'std': np.std(final_rewards), + 'median': np.median(final_rewards), + 'min': np.min(final_rewards), + 'max': np.max(final_rewards), + 'iqr': (np.percentile(final_rewards, 75) - + np.percentile(final_rewards, 25)), + 'confidence_interval_95': stats.t.interval( + 0.95, + len(final_rewards) - 1, + loc=np.mean(final_rewards), + scale=stats.sem(final_rewards) + ), + 'all_seeds': final_rewards, + 'curves': sample_efficiency_curves + } + +# Usage +results = evaluate_multi_seed( + algorithm=PPO, + env_name="HalfCheetah-v3", + seeds=range(10), # 10 seeds + total_steps=1_000_000 +) + +print(f"Performance: {results['mean']:.1f} ± {results['std']:.1f}") +print(f"95% CI: [{results['confidence_interval_95'][0]:.1f}, " + f"{results['confidence_interval_95'][1]:.1f}]") +print(f"Median: {results['median']:.1f}") +print(f"Range: [{results['min']:.1f}, {results['max']:.1f}]") +``` + +**Reporting Template:** + +``` +Algorithm: PPO +Environment: HalfCheetah-v3 +Seeds: 10 +Total Steps: 1M + +Final Performance: +- Mean: 4,523 ± 387 +- Median: 4,612 +- 95% CI: [4,246, 4,800] +- Range: [3,812, 5,201] + +Sample Efficiency: +- Reward at 100k steps: 1,234 ± 156 +- Reward at 500k steps: 3,456 ± 289 +- Reward at 1M steps: 4,523 ± 387 +``` + +### Statistical Significance Testing + +**When comparing algorithms:** + +```python +def compare_algorithms(results_A, results_B, alpha=0.05): + """ + Compare two algorithms with statistical rigor. 
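+
+    Note: `ttest_ind` assumes equal variances by default; with unequal variances
+    or seed counts, prefer Welch's test via `stats.ttest_ind(..., equal_var=False)`.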
+ + Args: + results_A: Array of final rewards for algorithm A (multiple seeds) + results_B: Array of final rewards for algorithm B (multiple seeds) + alpha: Significance level (default 0.05) + + Returns: + Dictionary with comparison statistics + """ + # T-test for difference in means + t_statistic, p_value = stats.ttest_ind(results_A, results_B) + + # Effect size (Cohen's d) + pooled_std = np.sqrt((np.std(results_A)**2 + np.std(results_B)**2) / 2) + cohens_d = (np.mean(results_A) - np.mean(results_B)) / pooled_std + + # Bootstrap confidence interval for difference + def bootstrap_diff(n_bootstrap=10000): + diffs = [] + for _ in range(n_bootstrap): + sample_A = np.random.choice(results_A, size=len(results_A)) + sample_B = np.random.choice(results_B, size=len(results_B)) + diffs.append(np.mean(sample_A) - np.mean(sample_B)) + return np.percentile(diffs, [2.5, 97.5]) + + ci_diff = bootstrap_diff() + + return { + 'mean_A': np.mean(results_A), + 'mean_B': np.mean(results_B), + 'difference': np.mean(results_A) - np.mean(results_B), + 'p_value': p_value, + 'significant': p_value < alpha, + 'cohens_d': cohens_d, + 'ci_difference': ci_diff, + 'conclusion': ( + f"Algorithm A is {'significantly' if p_value < alpha else 'NOT significantly'} " + f"better than B (p={p_value:.4f})" + ) + } + +# Usage +ppo_results = np.array([4523, 4612, 4201, 4789, 4456, 4390, 4678, 4234, 4567, 4498]) +sac_results = np.array([4678, 4890, 4567, 4923, 4712, 4645, 4801, 4556, 4734, 4689]) + +comparison = compare_algorithms(ppo_results, sac_results) +print(comparison['conclusion']) +print(f"Effect size (Cohen's d): {comparison['cohens_d']:.3f}") +print(f"95% CI for difference: [{comparison['ci_difference'][0]:.1f}, " + f"{comparison['ci_difference'][1]:.1f}]") +``` + +**Interpreting Effect Size (Cohen's d):** + +- d < 0.2: Negligible difference +- 0.2 ≤ d < 0.5: Small effect +- 0.5 ≤ d < 0.8: Medium effect +- d ≥ 0.8: Large effect + +**Red Flag:** If p-value < 0.05 but Cohen's d < 0.2, the difference is statistically significant but practically negligible. Don't claim "better" without practical significance. + +### Power Analysis: How Many Seeds Needed? + +```python +def required_seeds_for_precision(std_estimate, mean_estimate, + desired_precision=0.1, confidence=0.95): + """ + Calculate number of seeds needed for desired precision. + + Args: + std_estimate: Estimated standard deviation (from pilot runs) + mean_estimate: Estimated mean performance + desired_precision: Desired precision as fraction of mean (0.1 = ±10%) + confidence: Confidence level (0.95 = 95% CI) + + Returns: + Required number of seeds + """ + # Z-score for confidence level + z = stats.norm.ppf(1 - (1 - confidence) / 2) + + # Desired margin of error + margin = desired_precision * mean_estimate + + # Required sample size + n = (z * std_estimate / margin) ** 2 + + return int(np.ceil(n)) + +# Example: You ran 3 pilot seeds +pilot_results = [4500, 4200, 4700] +std_est = np.std(pilot_results) # 250 +mean_est = np.mean(pilot_results) # 4467 + +# How many seeds for ±10% precision at 95% confidence? +n_required = required_seeds_for_precision(std_est, mean_est, + desired_precision=0.1) +print(f"Need {n_required} seeds for ±10% precision") # ~12 seeds + +# How many for ±5% precision? 
+n_tight = required_seeds_for_precision(std_est, mean_est, + desired_precision=0.05) +print(f"Need {n_tight} seeds for ±5% precision") # ~47 seeds +``` + +**Practical Guidelines:** + +- Quick comparison: 5 seeds (±20% precision) +- Standard evaluation: 10 seeds (±10% precision) +- Publication: 20 seeds (±7% precision) +- Production deployment: 50+ seeds (±5% precision) + + +## Train/Test Discipline + +### Environment Instance Separation + +**CRITICAL:** Never evaluate on the same environment instances used for training. + +```python +# WRONG: Single environment for both training and evaluation +env = gym.make("CartPole-v1", seed=42) +agent.train(env) +performance = evaluate(agent, env) # BIASED! + +# CORRECT: Separate environments +train_env = gym.make("CartPole-v1", seed=42) +eval_env = gym.make("CartPole-v1", seed=999) # Different seed + +agent.train(train_env) +performance = evaluate(agent, eval_env) # Unbiased +``` + +### Train/Test Split for Custom Environments + +**For environments with multiple instances (levels, objects, configurations):** + +```python +def create_train_test_split(all_instances, test_ratio=0.2, seed=42): + """ + Split environment instances into train and test sets. + + Args: + all_instances: List of environment configurations + test_ratio: Fraction for test set (default 0.2) + seed: Random seed for reproducibility + + Returns: + (train_instances, test_instances) + """ + np.random.seed(seed) + n_test = int(len(all_instances) * test_ratio) + + indices = np.random.permutation(len(all_instances)) + test_indices = indices[:n_test] + train_indices = indices[n_test:] + + train_instances = [all_instances[i] for i in train_indices] + test_instances = [all_instances[i] for i in test_indices] + + return train_instances, test_instances + +# Example: Maze environments +all_mazes = [MazeLayout(seed=i) for i in range(100)] +train_mazes, test_mazes = create_train_test_split(all_mazes, test_ratio=0.2) + +print(f"Training on {len(train_mazes)} mazes") # 80 +print(f"Testing on {len(test_mazes)} mazes") # 20 + +# Train only on training set +agent.train(train_mazes) + +# Evaluate on BOTH train and test (measure generalization gap) +train_performance = evaluate(agent, train_mazes) +test_performance = evaluate(agent, test_mazes) + +generalization_gap = train_performance - test_performance +print(f"Train: {train_performance:.1f}") +print(f"Test: {test_performance:.1f}") +print(f"Generalization gap: {generalization_gap:.1f}") + +# Red flag: If gap > 20% of train performance, agent is overfitting +if generalization_gap > 0.2 * train_performance: + print("WARNING: Significant overfitting detected!") +``` + +### Randomization Protocol + +**Ensure independent randomization for train/eval:** + +```python +class EvaluationProtocol: + def __init__(self, env_name, train_seed=42, eval_seed=999): + """ + Proper train/eval environment management. 
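+
+        Note: this sketch is written against the classic Gym API (`env.seed()`,
+        4-tuple `step`); with Gymnasium, pass `seed` to `reset()` and unpack
+        five return values instead.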
+ + Args: + env_name: Gym environment name + train_seed: Seed for training environment + eval_seed: Seed for evaluation environment (DIFFERENT) + """ + self.env_name = env_name + self.train_seed = train_seed + self.eval_seed = eval_seed + + # Separate environments + self.train_env = gym.make(env_name) + self.train_env.seed(train_seed) + self.train_env.action_space.seed(train_seed) + self.train_env.observation_space.seed(train_seed) + + self.eval_env = gym.make(env_name) + self.eval_env.seed(eval_seed) + self.eval_env.action_space.seed(eval_seed) + self.eval_env.observation_space.seed(eval_seed) + + def train_step(self, agent): + """Training step on training environment.""" + return agent.step(self.train_env) + + def evaluate(self, agent, episodes=100): + """Evaluation on SEPARATE evaluation environment.""" + rewards = [] + for _ in range(episodes): + state = self.eval_env.reset() + episode_reward = 0 + done = False + + while not done: + action = agent.act_deterministic(state) + state, reward, done, _ = self.eval_env.step(action) + episode_reward += reward + + rewards.append(episode_reward) + + return np.mean(rewards), np.std(rewards) + +# Usage +protocol = EvaluationProtocol("HalfCheetah-v3", train_seed=42, eval_seed=999) + +# Training +agent = SAC() +for step in range(1_000_000): + protocol.train_step(agent) + + if step % 10_000 == 0: + mean_reward, std_reward = protocol.evaluate(agent, episodes=10) + print(f"Step {step}: {mean_reward:.1f} ± {std_reward:.1f}") +``` + + +## Sample Efficiency Metrics + +### Sample Efficiency Curves + +**Report performance at multiple sample budgets, not just final:** + +```python +def compute_sample_efficiency_curve(agent_class, env_name, seed, + max_steps, eval_points=20): + """ + Compute sample efficiency curve (reward vs steps). 
+ + Args: + agent_class: RL algorithm class + env_name: Environment name + seed: Random seed + max_steps: Maximum training steps + eval_points: Number of evaluation points + + Returns: + List of (steps, reward) tuples + """ + env = gym.make(env_name, seed=seed) + agent = agent_class(env, seed=seed) + + eval_steps = np.logspace(3, np.log10(max_steps), num=eval_points, dtype=int) + # [1000, 1500, 2200, ..., max_steps] (logarithmic spacing) + + curve = [] + current_step = 0 + + for target_step in eval_steps: + # Train until target_step + steps_to_train = target_step - current_step + agent.train(steps=steps_to_train) + current_step = target_step + + # Evaluate + reward = evaluate_deterministic(agent, env, episodes=10) + curve.append((target_step, reward)) + + return curve + +# Compare sample efficiency of multiple algorithms +algorithms = [PPO, SAC, TD3] +env_name = "HalfCheetah-v3" +max_steps = 1_000_000 + +for algo in algorithms: + # Average across 5 seeds + all_curves = [] + for seed in range(5): + curve = compute_sample_efficiency_curve(algo, env_name, seed, max_steps) + all_curves.append(curve) + + # Aggregate + steps = [point[0] for point in all_curves[0]] + rewards_at_step = [[curve[i][1] for curve in all_curves] + for i in range(len(steps))] + mean_rewards = [np.mean(rewards) for rewards in rewards_at_step] + std_rewards = [np.std(rewards) for rewards in rewards_at_step] + + # Report at specific budgets + for i, step in enumerate([100_000, 500_000, 1_000_000]): + idx = steps.index(step) + print(f"{algo.__name__} at {step} steps: " + f"{mean_rewards[idx]:.1f} ± {std_rewards[idx]:.1f}") +``` + +**Sample Output:** + +``` +PPO at 100k steps: 1,234 ± 156 +PPO at 500k steps: 3,456 ± 289 +PPO at 1M steps: 4,523 ± 387 + +SAC at 100k steps: 891 ± 178 +SAC at 500k steps: 3,789 ± 245 +SAC at 1M steps: 4,912 ± 312 + +TD3 at 100k steps: 756 ± 134 +TD3 at 500k steps: 3,234 ± 298 +TD3 at 1M steps: 4,678 ± 276 +``` + +**Analysis:** + +- PPO is most sample-efficient early (1,234 at 100k) +- SAC has best final performance (4,912 at 1M) +- If sample budget is 100k → PPO is best choice +- If sample budget is 1M → SAC is best choice + +### Area Under Curve (AUC) Metric + +**Single metric for sample efficiency:** + +```python +def compute_auc(curve): + """ + Compute area under sample efficiency curve. + + Args: + curve: List of (steps, reward) tuples + + Returns: + AUC value (higher = more sample efficient) + """ + steps = np.array([point[0] for point in curve]) + rewards = np.array([point[1] for point in curve]) + + # Trapezoidal integration + auc = np.trapz(rewards, steps) + return auc + +# Compare algorithms by AUC +for algo in algorithms: + all_aucs = [] + for seed in range(5): + curve = compute_sample_efficiency_curve(algo, env_name, seed, max_steps) + auc = compute_auc(curve) + all_aucs.append(auc) + + print(f"{algo.__name__} AUC: {np.mean(all_aucs):.2e} ± {np.std(all_aucs):.2e}") +``` + +**Note:** AUC is sensitive to evaluation point spacing. Use consistent evaluation points across algorithms. + + +## Generalization Testing + +### Distribution Shift Evaluation + +**Test on environment variations to measure robustness:** + +```python +def evaluate_generalization(agent, env_name, shifts): + """ + Evaluate agent on distribution shifts. 
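+
+    Note: `apply_shift` and `evaluate` are placeholders; implement the shifts as
+    environment wrappers or constructor kwargs for your specific task.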
+ + Args: + agent: Trained RL agent + env_name: Base environment name + shifts: Dictionary of shift types and parameters + + Returns: + Dictionary of performance on each shift + """ + results = {} + + # Baseline (no shift) + baseline_env = gym.make(env_name) + baseline_perf = evaluate(agent, baseline_env, episodes=50) + results['baseline'] = baseline_perf + + # Test shifts + for shift_name, shift_params in shifts.items(): + shifted_env = apply_shift(env_name, shift_params) + shift_perf = evaluate(agent, shifted_env, episodes=50) + results[shift_name] = shift_perf + + # Compute degradation + degradation = (baseline_perf - shift_perf) / baseline_perf + results[f'{shift_name}_degradation'] = degradation + + return results + +# Example: Robotic grasping +shifts = { + 'lighting_dim': {'lighting_scale': 0.5}, + 'lighting_bright': {'lighting_scale': 1.5}, + 'camera_angle_15deg': {'camera_rotation': 15}, + 'table_height_+5cm': {'table_height_offset': 0.05}, + 'object_mass_+50%': {'mass_scale': 1.5}, + 'object_friction_-30%': {'friction_scale': 0.7} +} + +gen_results = evaluate_generalization(agent, "RobotGrasp-v1", shifts) + +print(f"Baseline: {gen_results['baseline']:.2%} success") +for shift_name in shifts.keys(): + perf = gen_results[shift_name] + deg = gen_results[f'{shift_name}_degradation'] + print(f"{shift_name}: {perf:.2%} success ({deg:.1%} degradation)") + +# Red flag: If any degradation > 50%, agent is brittle +``` + +### Zero-Shot Transfer Evaluation + +**Test on completely new environments:** + +```python +def zero_shot_transfer(agent, train_env_name, test_env_names): + """ + Evaluate zero-shot transfer to related environments. + + Args: + agent: Agent trained on train_env_name + train_env_name: Training environment + test_env_names: List of related test environments + + Returns: + Transfer performance dictionary + """ + results = {} + + # Source performance + source_env = gym.make(train_env_name) + source_perf = evaluate(agent, source_env, episodes=50) + results['source'] = source_perf + + # Target performances + for target_env_name in test_env_names: + target_env = gym.make(target_env_name) + target_perf = evaluate(agent, target_env, episodes=50) + results[target_env_name] = target_perf + + # Transfer efficiency + transfer_ratio = target_perf / source_perf + results[f'{target_env_name}_transfer_ratio'] = transfer_ratio + + return results + +# Example: Locomotion transfer +agent_trained_on_cheetah = train(PPO, "HalfCheetah-v3") + +transfer_results = zero_shot_transfer( + agent_trained_on_cheetah, + train_env_name="HalfCheetah-v3", + test_env_names=["Hopper-v3", "Walker2d-v3", "Ant-v3"] +) + +print(f"Source (HalfCheetah): {transfer_results['source']:.1f}") +for env in ["Hopper-v3", "Walker2d-v3", "Ant-v3"]: + perf = transfer_results[env] + ratio = transfer_results[f'{env}_transfer_ratio'] + print(f"{env}: {perf:.1f} ({ratio:.1%} of source)") +``` + +### Robustness to Adversarial Perturbations + +**Test against worst-case scenarios:** + +```python +def adversarial_evaluation(agent, env, perturbation_types, + perturbation_magnitudes): + """ + Evaluate robustness to adversarial perturbations. 
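+
+    Note: `add_perturbation` is a placeholder; an observation wrapper
+    (e.g. a `gym.ObservationWrapper` subclass) is one way to inject these corruptions.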
+ + Args: + agent: RL agent to evaluate + env: Environment + perturbation_types: List of perturbation types + perturbation_magnitudes: List of magnitudes to test + + Returns: + Robustness curve for each perturbation type + """ + results = {} + + for perturb_type in perturbation_types: + results[perturb_type] = [] + + for magnitude in perturbation_magnitudes: + # Apply perturbation + perturbed_env = add_perturbation(env, perturb_type, magnitude) + + # Evaluate + perf = evaluate(agent, perturbed_env, episodes=20) + results[perturb_type].append((magnitude, perf)) + + return results + +# Example: Vision-based control +perturbation_types = ['gaussian_noise', 'occlusion', 'brightness'] +magnitudes = [0.0, 0.1, 0.2, 0.3, 0.5] + +robustness = adversarial_evaluation( + agent, env, perturbation_types, magnitudes +) + +for perturb_type, curve in robustness.items(): + print(f"\n{perturb_type}:") + for magnitude, perf in curve: + print(f" Magnitude {magnitude}: {perf:.1f} reward") +``` + + +## Evaluation Protocols + +### Stochastic vs Deterministic Evaluation + +**Decision Tree:** + +``` +Is the policy inherently deterministic? +├─ YES (DQN, DDPG without noise) +│ └─ Use deterministic evaluation +└─ NO (PPO, SAC, stochastic policies) + ├─ Will deployment use stochastic policy? + │ ├─ YES (dialogue, exploration needed) + │ │ └─ Use stochastic evaluation + │ └─ NO (control, deterministic deployment) + │ └─ Use deterministic evaluation + └─ Unsure? + └─ Report BOTH stochastic and deterministic +``` + +**Implementation:** + +```python +class EvaluationMode: + @staticmethod + def deterministic(agent, env, episodes=100): + """ + Deterministic evaluation (use mean/argmax action). + """ + rewards = [] + for _ in range(episodes): + state = env.reset() + episode_reward = 0 + done = False + + while not done: + # Use mean action (no sampling) + if hasattr(agent, 'act_deterministic'): + action = agent.act_deterministic(state) + else: + action = agent.policy.mean(state) # Or argmax for discrete + + state, reward, done, _ = env.step(action) + episode_reward += reward + + rewards.append(episode_reward) + + return np.mean(rewards), np.std(rewards) + + @staticmethod + def stochastic(agent, env, episodes=100): + """ + Stochastic evaluation (sample from policy). + """ + rewards = [] + for _ in range(episodes): + state = env.reset() + episode_reward = 0 + done = False + + while not done: + # Sample from policy distribution + action = agent.policy.sample(state) + + state, reward, done, _ = env.step(action) + episode_reward += reward + + rewards.append(episode_reward) + + return np.mean(rewards), np.std(rewards) + + @staticmethod + def report_both(agent, env, episodes=100): + """ + Report both evaluation modes for transparency. 
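
        (Assumes the agent exposes `act_deterministic(state)` or a `policy`
        with `mean`/`sample` methods, as used above -- adapt these calls to
        your agent's actual API.)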
+ """ + det_mean, det_std = EvaluationMode.deterministic(agent, env, episodes) + sto_mean, sto_std = EvaluationMode.stochastic(agent, env, episodes) + + return { + 'deterministic': {'mean': det_mean, 'std': det_std}, + 'stochastic': {'mean': sto_mean, 'std': sto_std}, + 'difference': det_mean - sto_mean + } + +# Usage +sac_agent = SAC(env) +sac_agent.train(steps=1_000_000) + +eval_results = EvaluationMode.report_both(sac_agent, env, episodes=100) + +print(f"Deterministic: {eval_results['deterministic']['mean']:.1f} " + f"± {eval_results['deterministic']['std']:.1f}") +print(f"Stochastic: {eval_results['stochastic']['mean']:.1f} " + f"± {eval_results['stochastic']['std']:.1f}") +print(f"Difference: {eval_results['difference']:.1f}") +``` + +**Interpretation:** + +- If difference < 5% of mean: Evaluation mode doesn't matter much +- If difference > 15% of mean: Evaluation mode significantly affects results + - Must clearly specify which mode used + - Ensure fair comparison across algorithms (same mode) + +### Episode Count Selection + +**How many evaluation episodes needed?** + +```python +def required_eval_episodes(env, agent, desired_sem, max_episodes=1000): + """ + Determine number of evaluation episodes for desired standard error. + + Args: + env: Environment + agent: Agent to evaluate + desired_sem: Desired standard error of mean + max_episodes: Maximum episodes to test + + Returns: + Required number of episodes + """ + # Run initial episodes to estimate variance + initial_episodes = min(20, max_episodes) + rewards = [] + + for _ in range(initial_episodes): + state = env.reset() + episode_reward = 0 + done = False + + while not done: + action = agent.act_deterministic(state) + state, reward, done, _ = env.step(action) + episode_reward += reward + + rewards.append(episode_reward) + + # Estimate standard deviation + std_estimate = np.std(rewards) + + # Required episodes: n = (std / desired_sem)^2 + required = int(np.ceil((std_estimate / desired_sem) ** 2)) + + return min(required, max_episodes) + +# Usage +agent = PPO(env) +agent.train(steps=1_000_000) + +# Want standard error < 10 reward units +n_episodes = required_eval_episodes(env, agent, desired_sem=10) +print(f"Need {n_episodes} episodes for SEM < 10") + +# Evaluate with required episodes +final_eval = evaluate(agent, env, episodes=n_episodes) +``` + +**Rule of Thumb:** + +- Quick check: 10 episodes +- Standard evaluation: 50-100 episodes +- Publication/deployment: 100-200 episodes +- High-variance environments: 500+ episodes + +### Evaluation Frequency During Training + +**How often to evaluate during training?** + +```python +def adaptive_evaluation_schedule(total_steps, early_freq=1000, + late_freq=10000, transition_step=100000): + """ + Create adaptive evaluation schedule. 
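
    (Self-contained; the `evaluate` and `log` calls in the usage example
    below are placeholders.)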
+ + Early training: Frequent evaluations (detect divergence early) + Late training: Infrequent evaluations (policy more stable) + + Args: + total_steps: Total training steps + early_freq: Evaluation frequency in early training + late_freq: Evaluation frequency in late training + transition_step: Step to transition from early to late + + Returns: + List of evaluation timesteps + """ + eval_steps = [] + + # Early phase + current_step = 0 + while current_step < transition_step: + eval_steps.append(current_step) + current_step += early_freq + + # Late phase + while current_step < total_steps: + eval_steps.append(current_step) + current_step += late_freq + + # Always evaluate at end + if eval_steps[-1] != total_steps: + eval_steps.append(total_steps) + + return eval_steps + +# Usage +schedule = adaptive_evaluation_schedule( + total_steps=1_000_000, + early_freq=1_000, # Every 1k steps for first 100k + late_freq=10_000, # Every 10k steps after 100k + transition_step=100_000 +) + +print(f"Total evaluations: {len(schedule)}") +print(f"First 10 eval steps: {schedule[:10]}") +print(f"Last 10 eval steps: {schedule[-10:]}") + +# Training loop +agent = PPO(env) +for step in range(1_000_000): + agent.train_step() + + if step in schedule: + eval_perf = evaluate(agent, eval_env, episodes=10) + log(step, eval_perf) +``` + +**Guidelines:** + +- Evaluation is expensive (10-100 episodes × episode length) +- Early training: Evaluate frequently to detect divergence +- Late training: Evaluate less frequently (policy stabilizes) +- Don't evaluate every step (wastes compute) +- Save checkpoints at evaluation steps (for later analysis) + + +## Offline RL Evaluation + +### The Offline RL Evaluation Problem + +**CRITICAL:** You cannot accurately evaluate offline RL policies without online rollouts. + +**Why:** + +- Learned Q-values are only accurate for data distribution +- Policy wants to visit out-of-distribution states +- Q-values for OOD states are extrapolated (unreliable) +- Dataset doesn't contain policy's trajectories + +**What to do:** + +```python +class OfflineRLEvaluation: + """ + Conservative offline RL evaluation protocol. + """ + + @staticmethod + def in_distribution_performance(offline_dataset, policy): + """ + Evaluate policy on dataset trajectories (lower bound). + + This measures: "How well does policy match best trajectories + in dataset?" NOT "How good is the policy?" + """ + returns = [] + + for trajectory in offline_dataset: + # Check if policy would generate this trajectory + policy_match = True + for (state, action) in trajectory: + policy_action = policy(state) + if not actions_match(policy_action, action): + policy_match = False + break + + if policy_match: + returns.append(trajectory.return) + + if len(returns) == 0: + return None # Policy doesn't match any dataset trajectory + + return np.mean(returns) + + @staticmethod + def behavioral_cloning_baseline(offline_dataset): + """ + Train behavior cloning on dataset (baseline). + + Offline RL should outperform BC, otherwise it's not learning. + """ + bc_policy = BehaviorCloning(offline_dataset) + bc_policy.train() + return bc_policy + + @staticmethod + def model_based_evaluation(offline_dataset, policy, model): + """ + Use learned dynamics model for evaluation (if available). + + CAUTION: Model errors compound. Short rollouts only. 
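
        Note: `model.predict(state, action) -> (next_state, reward)` and
        `model.get_uncertainty(dataset)` below are an assumed interface for
        a learned dynamics model, not a specific library API.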
+ """ + # Train dynamics model on dataset + model.train(offline_dataset) + + # Generate short rollouts (5-10 steps) + rollout_returns = [] + for _ in range(100): + state = sample_initial_state(offline_dataset) + rollout_return = 0 + + for step in range(10): # Short rollouts only + action = policy(state) + next_state, reward = model.predict(state, action) + rollout_return += reward + state = next_state + + rollout_returns.append(rollout_return) + + # Heavy discount for model uncertainty + uncertainty = model.get_uncertainty(offline_dataset) + adjusted_return = np.mean(rollout_returns) * (1 - uncertainty) + + return adjusted_return + + @staticmethod + def state_coverage_metric(offline_dataset, policy, num_rollouts=100): + """ + Measure how much policy stays in-distribution. + + Low coverage → policy goes OOD → evaluation unreliable + """ + # Get dataset state distribution + dataset_states = get_all_states(offline_dataset) + + # Simulate policy rollouts + policy_states = [] + for _ in range(num_rollouts): + trajectory = simulate_with_model(policy) # Needs model + policy_states.extend(trajectory.states) + + # Compute coverage (fraction of policy states near dataset states) + coverage = compute_coverage(policy_states, dataset_states) + return coverage + + @staticmethod + def full_offline_evaluation(offline_dataset, policy): + """ + Comprehensive offline evaluation (still conservative). + """ + results = {} + + # 1. In-distribution performance + results['in_dist_perf'] = OfflineRLEvaluation.in_distribution_performance( + offline_dataset, policy + ) + + # 2. Compare to behavior cloning + bc_policy = OfflineRLEvaluation.behavioral_cloning_baseline(offline_dataset) + results['bc_baseline'] = evaluate(bc_policy, offline_dataset) + + # 3. Model-based evaluation (if model available) + # model = train_dynamics_model(offline_dataset) + # results['model_eval'] = OfflineRLEvaluation.model_based_evaluation( + # offline_dataset, policy, model + # ) + + # 4. State coverage + # results['coverage'] = OfflineRLEvaluation.state_coverage_metric( + # offline_dataset, policy + # ) + + return results + +# Usage +offline_dataset = load_offline_dataset("d4rl-halfcheetah-medium-v0") +offline_policy = CQL(offline_dataset) +offline_policy.train() + +eval_results = OfflineRLEvaluation.full_offline_evaluation( + offline_dataset, offline_policy +) + +print("Offline Evaluation (CONSERVATIVE):") +print(f"In-distribution performance: {eval_results['in_dist_perf']}") +print(f"BC baseline: {eval_results['bc_baseline']}") +print("\nWARNING: These are lower bounds. True performance unknown without online evaluation.") +``` + +### Staged Deployment for Offline RL + +**Best practice: Gradually introduce online evaluation** + +```python +def staged_offline_to_online_deployment(offline_policy, env): + """ + Staged deployment: Offline → Small online trial → Full deployment + + Stage 1: Offline evaluation (conservative) + Stage 2: Small online trial (safety-constrained) + Stage 3: Full online evaluation + Stage 4: Deployment + """ + results = {} + + # Stage 1: Offline evaluation + print("Stage 1: Offline evaluation") + offline_perf = offline_evaluation(offline_policy) + results['offline'] = offline_perf + + if offline_perf < minimum_threshold: + print("Failed offline evaluation. 
Stop.") + return results + + # Stage 2: Small online trial (100 episodes) + print("Stage 2: Small online trial (100 episodes)") + online_trial_perf = evaluate(offline_policy, env, episodes=100) + results['small_trial'] = online_trial_perf + + # Check degradation + degradation = (offline_perf - online_trial_perf) / offline_perf + if degradation > 0.3: # >30% degradation + print(f"WARNING: {degradation:.1%} performance drop in online trial") + print("Policy may be overfitting to offline data. Investigate.") + return results + + # Stage 3: Full online evaluation (1000 episodes) + print("Stage 3: Full online evaluation (1000 episodes)") + online_full_perf = evaluate(offline_policy, env, episodes=1000) + results['full_online'] = online_full_perf + + # Stage 4: Deployment decision + if online_full_perf > deployment_threshold: + print("Passed all stages. Ready for deployment.") + results['deploy'] = True + else: + print("Failed online evaluation. Do not deploy.") + results['deploy'] = False + + return results +``` + + +## Common Pitfalls + +### Pitfall 1: Single Seed Reporting + +**Symptom:** Reporting one training run as "the result" + +**Why it's wrong:** RL has high variance. Single seed is noise. + +**Detection:** + +- Paper shows single training curve +- No variance/error bars +- No mention of multiple seeds + +**Fix:** Minimum 5-10 seeds, report mean ± std + + +### Pitfall 2: Cherry-Picking Results + +**Symptom:** Running many experiments, reporting best + +**Why it's wrong:** Creates false positives (p-hacking) + +**Detection:** + +- Results seem too good +- No mention of failed runs +- "We tried many seeds and picked a representative one" + +**Fix:** Report ALL runs. Pre-register experiments. + + +### Pitfall 3: Evaluating on Training Set + +**Symptom:** Agent evaluated on same environment instances used for training + +**Why it's wrong:** Measures memorization, not generalization + +**Detection:** + +- No mention of train/test split +- Same random seed for training and evaluation +- Perfect performance on specific instances + +**Fix:** Separate train/test environments with different seeds + + +### Pitfall 4: Ignoring Sample Efficiency + +**Symptom:** Comparing algorithms only on final performance + +**Why it's wrong:** Final performance ignores cost to achieve it + +**Detection:** + +- No sample efficiency curves +- No "reward at X steps" metrics +- Only asymptotic performance reported + +**Fix:** Report sample efficiency curves, compare at multiple budgets + + +### Pitfall 5: Conflating Train and Eval Performance + +**Symptom:** Using training episode returns as evaluation + +**Why it's wrong:** Training uses exploration, evaluation should not + +**Detection:** + +- "Training reward" used for algorithm comparison +- No separate evaluation protocol +- Same environment instance for both + +**Fix:** Separate training (with exploration) and evaluation (without) + + +### Pitfall 6: Insufficient Evaluation Episodes + +**Symptom:** Evaluating with 5-10 episodes + +**Why it's wrong:** High variance → unreliable estimates + +**Detection:** + +- Large error bars +- Inconsistent results across runs +- SEM > 10% of mean + +**Fix:** 50-100 episodes minimum, power analysis for exact number + + +### Pitfall 7: Reporting Peak Instead of Final + +**Symptom:** Selecting best checkpoint during training + +**Why it's wrong:** Peak is overfitting to evaluation variance + +**Detection:** + +- "Best performance during training" reported +- Early stopping based on eval performance +- No mention of final 
performance + +**Fix:** Report final performance, or use validation set for model selection + + +### Pitfall 8: No Generalization Testing + +**Symptom:** Only evaluating on single environment configuration + +**Why it's wrong:** Doesn't measure robustness to distribution shift + +**Detection:** + +- No mention of distribution shifts +- Only one environment configuration tested +- No transfer/zero-shot evaluation + +**Fix:** Test on held-out environments, distribution shifts, adversarial cases + + +### Pitfall 9: Inconsistent Evaluation Mode + +**Symptom:** Comparing stochastic and deterministic evaluations + +**Why it's wrong:** Evaluation mode affects results by 10-30% + +**Detection:** + +- No mention of evaluation mode +- Comparing algorithms with different modes +- Unclear if sampling or mean action used + +**Fix:** Specify evaluation mode, ensure consistency across comparisons + + +### Pitfall 10: Offline RL Without Online Validation + +**Symptom:** Deploying offline RL policy based on Q-values alone + +**Why it's wrong:** Q-values extrapolate OOD, unreliable + +**Detection:** + +- No online rollouts before deployment +- Claiming performance based on learned values +- Ignoring distribution shift + +**Fix:** Staged deployment (offline → small online trial → full deployment) + + +## Red Flags + +| Red Flag | Implication | Action | +|----------|-------------|--------| +| Only one training curve shown | Single seed, cherry-picked | Demand multi-seed results | +| No error bars or confidence intervals | No variance accounting | Require statistical rigor | +| "We picked a representative seed" | Cherry-picking | Reject, require all seeds | +| No train/test split mentioned | Likely overfitting | Check evaluation protocol | +| No sample efficiency curves | Ignoring cost | Request curves or AUC | +| Evaluation mode not specified | Unclear methodology | Ask: stochastic or deterministic? | +| < 20 evaluation episodes | High variance | Require more episodes | +| Only final performance reported | Missing sample efficiency | Request performance at multiple steps | +| No generalization testing | Narrow evaluation | Request distribution shift tests | +| Offline RL with no online validation | Unreliable estimates | Require online trial | +| Results too good to be true | Probably cherry-picked or overfitting | Deep investigation | +| p-value reported without effect size | Statistically significant but practically irrelevant | Check Cohen's d | + + +## Rationalization Table + +| Rationalization | Why It's Wrong | Counter | +|-----------------|----------------|---------| +| "RL papers commonly use single seed, so it's acceptable" | Common ≠ correct. Field is improving standards. | "Newer venues require multi-seed. Improve rigor." | +| "Our algorithm is deterministic, variance is low" | Algorithm determinism ≠ environment/initialization determinism | "Environment randomness still causes variance." | +| "We don't have compute for 10 seeds" | Then don't make strong performance claims | "Report 3-5 seeds with caveats, or wait for compute." | +| "Evaluation on training set is faster" | Speed < correctness | "Fast wrong answer is worse than slow right answer." | +| "We care about final performance, not sample efficiency" | Depends on application, often sample efficiency matters | "Clarify deployment constraints. Samples usually matter." | +| "Stochastic/deterministic doesn't matter" | 10-30% difference is common | "Specify mode, ensure fair comparison." 
| +| "10 eval episodes is enough" | Standard error likely > 10% of mean | "Compute SEM, use power analysis." | +| "Our environment is simple, doesn't need generalization testing" | Deployment is rarely identical to training | "Test at least 2-3 distribution shifts." | +| "Offline RL Q-values are accurate" | Only for in-distribution, not OOD | "Q-values extrapolate. Need online validation." | +| "We reported the best run, but all were similar" | Then report all and show they're similar | "Show mean ± std to prove similarity." | + + +## Decision Trees + +### Decision Tree 1: How Many Seeds? + +``` +What is the use case? +├─ Quick internal comparison +│ └─ 3-5 seeds (caveat: preliminary results) +├─ Algorithm selection for production +│ └─ 10-20 seeds +├─ Publication +│ └─ 10-20 seeds (depends on venue) +└─ Safety-critical deployment + └─ 20-50 seeds (need tight confidence intervals) +``` + +### Decision Tree 2: Evaluation Mode? + +``` +Is policy inherently deterministic? +├─ YES (DQN, deterministic policies) +│ └─ Deterministic evaluation +└─ NO (PPO, SAC, stochastic policies) + ├─ Will deployment use stochastic policy? + │ ├─ YES + │ │ └─ Stochastic evaluation + │ └─ NO + │ └─ Deterministic evaluation + └─ Unsure? + └─ Report BOTH, explain trade-offs +``` + +### Decision Tree 3: How Many Evaluation Episodes? + +``` +What is variance estimate? +├─ Unknown +│ └─ Start with 20 episodes, estimate variance, use power analysis +├─ Known (σ) +│ ├─ Low variance (σ < 0.1 * μ) +│ │ └─ 20-50 episodes sufficient +│ ├─ Medium variance (0.1 * μ ≤ σ < 0.3 * μ) +│ │ └─ 50-100 episodes +│ └─ High variance (σ ≥ 0.3 * μ) +│ └─ 100-500 episodes (or use variance reduction techniques) +``` + +### Decision Tree 4: Generalization Testing? + +``` +Is environment parameterized or procedurally generated? +├─ YES (multiple instances possible) +│ ├─ Use train/test split (80/20) +│ └─ Report both train and test performance +└─ NO (single environment) + ├─ Can you create distribution shifts? + │ ├─ YES (modify dynamics, observations, etc.) + │ │ └─ Test on 3-5 distribution shifts + │ └─ NO + │ └─ At minimum, use different random seed for eval +``` + +### Decision Tree 5: Offline RL Evaluation? + +``` +Can you do online rollouts? +├─ YES +│ └─ Use staged deployment (offline → small trial → full online) +├─ NO (completely offline) +│ ├─ Use conservative offline metrics +│ ├─ Compare to behavior cloning baseline +│ ├─ Clearly state limitations +│ └─ Do NOT claim actual performance, only lower bounds +└─ PARTIAL (limited online budget) + └─ Use model-based evaluation + small online trial +``` + + +## Workflow + +### Standard Evaluation Workflow + +``` +1. Pre-Experiment Planning + ☐ Define evaluation protocol BEFORE running experiments + ☐ Select number of seeds (minimum 5-10) + ☐ Define train/test split if applicable + ☐ Specify evaluation mode (stochastic/deterministic) + ☐ Define sample budgets for efficiency curves + ☐ Pre-register experiments (commit to protocol) + +2. Training Phase + ☐ Train on training environments ONLY + ☐ Use separate eval environments with different seeds + ☐ Evaluate at regular intervals (adaptive schedule) + ☐ Save checkpoints at evaluation points + ☐ Log both training and evaluation performance + +3. Evaluation Phase + ☐ Final evaluation on test set (never seen during training) + ☐ Use sufficient episodes (50-100 minimum) + ☐ Evaluate across all seeds + ☐ Compute statistics (mean, std, CI, median, IQR) + ☐ Test generalization (distribution shifts, zero-shot transfer) + +4. 
Analysis Phase + ☐ Compute sample efficiency metrics (AUC, reward at budgets) + ☐ Statistical significance testing if comparing algorithms + ☐ Check effect size (Cohen's d), not just p-value + ☐ Identify failure cases and edge cases + ☐ Measure robustness to perturbations + +5. Reporting Phase + ☐ Report all seeds, not selected subset + ☐ Include mean ± std or 95% CI + ☐ Show sample efficiency curves + ☐ Report both training and generalization performance + ☐ Specify evaluation mode + ☐ Include negative results and failure analysis + ☐ Provide reproducibility details (seeds, hyperparameters) +``` + +### Checklist for Publication/Deployment + +``` +Statistical Rigor: +☐ Minimum 10 seeds +☐ Mean ± std or 95% CI reported +☐ Statistical significance testing (if comparing algorithms) +☐ Effect size reported (Cohen's d) + +Train/Test Discipline: +☐ Separate train/test environments +☐ Different random seeds for train/eval +☐ No evaluation on training data +☐ Generalization gap reported (train vs test performance) + +Comprehensive Metrics: +☐ Final performance +☐ Sample efficiency curves +☐ Performance at multiple sample budgets +☐ Evaluation mode specified (stochastic/deterministic) + +Generalization: +☐ Tested on distribution shifts +☐ Zero-shot transfer evaluation (if applicable) +☐ Robustness to perturbations + +Methodology: +☐ Sufficient evaluation episodes (50-100+) +☐ Evaluation protocol clearly described +☐ Reproducibility details provided +☐ Negative results included + +Offline RL (if applicable): +☐ Conservative offline metrics used +☐ Online validation included (or limitations clearly stated) +☐ Comparison to behavior cloning baseline +☐ Distribution shift acknowledged +``` + + +## Integration with rl-debugging + +RL evaluation and debugging are closely related: + +**Use rl-debugging when:** + +- Evaluation reveals poor performance +- Need to diagnose WHY agent fails +- Debugging training issues + +**Use rl-evaluation when:** + +- Agent seems to work, need to measure HOW WELL +- Comparing multiple algorithms +- Preparing for deployment + +**Combined workflow:** + +1. Train agent +2. Evaluate (rl-evaluation skill) +3. If performance poor → Debug (rl-debugging skill) +4. Fix issues +5. Re-evaluate +6. Repeat until satisfactory +7. Final rigorous evaluation for deployment + + +## Summary + +RL evaluation is NOT just "run the agent and see what happens." It requires: + +1. **Statistical rigor**: Multi-seed, confidence intervals, significance testing +2. **Train/test discipline**: Separate environments, no overfitting +3. **Comprehensive metrics**: Sample efficiency, generalization, robustness +4. **Appropriate protocols**: Evaluation mode, episode count, frequency +5. **Offline RL awareness**: Conservative estimates, staged deployment + +Without rigorous evaluation: + +- You will draw wrong conclusions from noise +- You will deploy agents that fail in production +- You will waste resources on false improvements +- You will make scientifically invalid claims + +With rigorous evaluation: + +- Reliable performance estimates +- Valid algorithm comparisons +- Deployment-ready agents +- Reproducible research + +**When in doubt:** More seeds, more episodes, more generalization tests. 
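
As a closing reference, the snippet below is a minimal sketch of the aggregation the reporting checklists above call for: per-seed mean returns in, mean ± std, a normal-approximation 95% CI, and Cohen's d out. The per-seed arrays are hypothetical placeholders -- substitute your own evaluation results.

```python
import numpy as np

def summarize_seeds(per_seed_returns):
    """Aggregate per-seed mean returns into mean, sample std, and a 95% CI half-width."""
    x = np.asarray(per_seed_returns, dtype=float)
    std = x.std(ddof=1)                  # sample std across seeds
    sem = std / np.sqrt(len(x))          # standard error of the mean
    return {"mean": x.mean(), "std": std, "ci95": 1.96 * sem, "n_seeds": len(x)}

def cohens_d(returns_a, returns_b):
    """Effect size between two algorithms (pooled-std Cohen's d)."""
    a = np.asarray(returns_a, dtype=float)
    b = np.asarray(returns_b, dtype=float)
    pooled_var = ((len(a) - 1) * a.var(ddof=1) + (len(b) - 1) * b.var(ddof=1)) \
                 / (len(a) + len(b) - 2)
    return (a.mean() - b.mean()) / np.sqrt(pooled_var)

# Hypothetical per-seed results (10 seeds each) -- replace with real data
ppo_returns = [4510, 4603, 4398, 4720, 4455, 4580, 4490, 4630, 4550, 4410]
sac_returns = [4890, 4932, 4810, 5010, 4875, 4950, 4790, 4960, 4900, 4845]

for name, returns in [("PPO", ppo_returns), ("SAC", sac_returns)]:
    s = summarize_seeds(returns)
    print(f"{name}: {s['mean']:.0f} ± {s['std']:.0f} "
          f"(95% CI ±{s['ci95']:.0f}, n={s['n_seeds']})")

print(f"Cohen's d (SAC vs PPO): {cohens_d(sac_returns, ppo_returns):.2f}")
```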
+ + +**END OF SKILL** diff --git a/skills/using-deep-rl/rl-foundations.md b/skills/using-deep-rl/rl-foundations.md new file mode 100644 index 0000000..d04c9d8 --- /dev/null +++ b/skills/using-deep-rl/rl-foundations.md @@ -0,0 +1,2187 @@ + +# RL Foundations + +## When to Use This Skill + +Invoke this skill when you encounter: + +- **New to RL**: User asks "what is RL" or "how does RL work" +- **Theory Questions**: MDP, value functions, Bellman equations, policy optimization +- **Conceptual Confusion**: Mixing up V(s) and Q(s,a), value iteration vs policy iteration +- **Before Implementation**: User wants to implement RL algorithms without understanding foundations +- **Debugging Theory**: Why discount factor matters, why exploration needed, how algorithms differ +- **Foundation Check**: User jumps to DQN/PPO without understanding MDPs + +**This skill provides the theoretical foundation for ALL other deep-rl skills.** + +Do NOT use this skill for: + +- Algorithm implementation (use value-based-methods, policy-gradient-methods, actor-critic-methods) +- Debugging code (use rl-debugging) +- Environment setup (use rl-environments) + +## Core Principle + +**Understanding the theory enables everything else.** + +Reinforcement learning is built on a rigorous mathematical foundation: + +1. **MDP** (Markov Decision Process) - the framework +2. **Value Functions** - quantify expected return +3. **Bellman Equations** - recursive decomposition +4. **Optimal Policy** - maximize expected return +5. **Algorithms** - methods to find optimal policy + +**Without this foundation, you're copy-pasting code you cannot debug, adapt, or extend.** + + +## Part 1: Markov Decision Process (MDP) + +### What is an MDP? + +An MDP is the mathematical framework for sequential decision-making under uncertainty. + +**Formal Definition**: A Markov Decision Process is a 5-tuple (S, A, P, R, γ): + +- **S**: State space (set of all possible states) +- **A**: Action space (set of all possible actions) +- **P**: Transition probability P(s'|s,a) - probability of reaching state s' from state s after action a +- **R**: Reward function R(s,a,s') - immediate reward for transition +- **γ**: Discount factor (0 ≤ γ ≤ 1) - controls importance of future rewards + +**Key Property**: **Markov Property** + +``` +P(s_{t+1} | s_t, a_t, s_{t-1}, a_{t-1}, ..., s_0, a_0) = P(s_{t+1} | s_t, a_t) +``` + +**Meaning**: The future depends only on the present state, not the history. + +**Why this matters**: Allows recursive algorithms (Bellman equations). If Markov property violated, standard RL algorithms may fail. + + +### Example 1: GridWorld MDP + +**Problem**: Agent navigates 4x4 grid to reach goal. + +``` +S = {(0,0), (0,1), ..., (3,3)} # 16 states +A = {UP, DOWN, LEFT, RIGHT} # 4 actions +R = -1 for each step, +10 at goal +γ = 0.9 +P: Deterministic (up always moves up if not wall) +``` + +**Visualization**: + +``` +S . . . +. . . . +. # . . # = wall +. . . G G = goal (+10) +``` + +**Transition Example**: + +- State s = (1,1), Action a = RIGHT +- Deterministic: P(s'=(1,2) | s=(1,1), a=RIGHT) = 1.0 +- Reward: R(s,a,s') = -1 +- Next state: s' = (1,2) + +**Markov Property Holds**: Future position depends only on current position and action, not how you got there. + + +### Example 2: Stochastic GridWorld + +**Modification**: Actions succeed with probability 0.8, move perpendicular with probability 0.1 each. 
+ +``` +P((1,2) | (1,1), RIGHT) = 0.8 # intended +P((0,1) | (1,1), RIGHT) = 0.1 # slip up +P((2,1) | (1,1), RIGHT) = 0.1 # slip down +``` + +**Why Stochastic**: Models real-world uncertainty (robot actuators, wind, slippery surfaces). + +**Consequence**: Agent must consider probabilities when choosing actions. + + +### Example 3: Continuous State MDP (Cartpole) + +``` +S ⊂ ℝ⁴: (cart_position, cart_velocity, pole_angle, pole_angular_velocity) +A = {LEFT, RIGHT} # discrete actions, continuous state +R = +1 for each timestep upright +γ = 0.99 +P: Physics-based transition (continuous dynamics) +``` + +**Key Difference**: State space is continuous, requires function approximation (neural networks). + +**Still an MDP**: Markov property holds (physics is Markovian given state). + + +### When is Markov Property Violated? + +**Example: Poker** + +``` +State: Current cards visible +Markov Violated: Opponents' strategies depend on past betting patterns +``` + +**Solution**: Augment state with history (last N actions), or use partially observable MDP (POMDP). + +**Example: Robot with Noisy Sensors** + +``` +State: Raw sensor reading (single frame) +Markov Violated: True position requires integrating multiple frames +``` + +**Solution**: Stack frames (last 4 frames as state), or use recurrent network (LSTM). + + +### Episodic vs Continuing Tasks + +**Episodic**: Task terminates (games, reaching goal) + +``` +Episode: s₀ → s₁ → ... → s_T (terminal state) +Return: G_t = r_t + γr_{t+1} + ... + γ^{T-t}r_T +``` + +**Continuing**: Task never ends (stock trading, robot operation) + +``` +Return: G_t = r_t + γr_{t+1} + γ²r_{t+2} + ... (infinite) +``` + +**Critical**: Continuing tasks REQUIRE γ < 1 (else return infinite). + + +### MDP Pitfall #1: Using Wrong State Representation + +**Bad**: State = current frame only (when velocity matters) + +```python +# Pong: Ball position alone doesn't tell velocity +state = current_frame # WRONG - not Markovian +``` + +**Good**: State = last 4 frames (velocity from difference) + +```python +# Frame stacking preserves Markov property +state = np.concatenate([frame_t, frame_{t-1}, frame_{t-2}, frame_{t-3}]) +``` + +**Why**: Ball velocity = (position_t - position_{t-1}) / dt, need history. + + +### MDP Pitfall #2: Reward Function Shapes Behavior + +**Example**: Robot navigating to goal + +**Bad Reward**: + +```python +reward = +1 if at_goal else 0 # Sparse +``` + +**Problem**: No signal until goal reached, hard to learn. + +**Better Reward**: + +```python +reward = -distance_to_goal # Dense +``` + +**Problem**: Agent learns to get closer but may not reach goal (local optimum). + +**Best Reward** (Potential-Based Shaping): + +```python +reward = (distance_prev - distance_curr) + large_bonus_at_goal +``` + +**Why**: Encourages progress + explicit goal reward. + +**Takeaway**: Reward function engineering is CRITICAL. Route to reward-shaping skill for details. + + +### MDP Formulation Checklist + +Before implementing any RL algorithm, answer: + +- [ ] **States**: What information defines the situation? Is it Markovian? +- [ ] **Actions**: What can the agent do? Discrete or continuous? +- [ ] **Transitions**: Deterministic or stochastic? Do you know P(s'|s,a)? +- [ ] **Rewards**: Immediate reward for each transition? Sparse or dense? +- [ ] **Discount**: Episodic (can use γ=1) or continuing (need γ<1)? +- [ ] **Markov Property**: Does current state fully determine future? 
+ +**If you cannot answer these, you cannot implement RL algorithms effectively.** + + +## Part 2: Value Functions + +### What is a Value Function? + +A value function quantifies "how good" a state (or state-action pair) is. + +**State-Value Function V^π(s)**: + +``` +V^π(s) = E_π[G_t | s_t = s] + = E_π[r_t + γr_{t+1} + γ²r_{t+2} + ... | s_t = s] +``` + +**Meaning**: Expected cumulative discounted reward starting from state s and following policy π. + +**Action-Value Function Q^π(s,a)**: + +``` +Q^π(s,a) = E_π[G_t | s_t = s, a_t = a] + = E_π[r_t + γr_{t+1} + γ²r_{t+2} + ... | s_t = s, a_t = a] +``` + +**Meaning**: Expected cumulative discounted reward starting from state s, taking action a, then following policy π. + +**Relationship**: + +``` +V^π(s) = Σ_a π(a|s) Q^π(s,a) +``` + +**Intuition**: V(s) = value of state, Q(s,a) = value of state-action pair. + + +### Critical Distinction: Value vs Reward + +**Reward r(s,a)**: Immediate, one-step payoff. + +**Value V(s)**: Long-term, cumulative expected reward. + +**Example: GridWorld** + +``` +Reward: r = -1 every step, r = +10 at goal +Value at state 2 steps from goal: + V(s) ≈ -1 + γ(-1) + γ²(+10) + = -1 - 0.9 + 0.81*10 + = -1.9 + 8.1 = 6.2 +``` + +**Key**: Value is higher than immediate reward because it accounts for future goal reward. + +**Common Mistake**: Setting V(s) = r(s). This ignores all future rewards. + + +### Example: Computing V^π for Simple Policy + +**GridWorld**: 3x3 grid, goal at (2,2), γ=0.9, r=-1 per step. + +**Policy π**: Always move right or down (deterministic). + +**Manual Calculation**: + +``` +V^π((2,2)) = 0 (goal, no future rewards) + +V^π((2,1)) = r + γ V^π((2,2)) + = -1 + 0.9 * 0 = -1 + +V^π((1,2)) = r + γ V^π((2,2)) + = -1 + 0.9 * 0 = -1 + +V^π((1,1)) = r + γ V^π((1,2)) (assuming action = DOWN) + = -1 + 0.9 * (-1) = -1.9 + +V^π((0,0)) = r + γ V^π((0,1)) + = ... (depends on path) +``` + +**Observation**: Values decrease as distance from goal increases (more -1 rewards to collect). + + +### Optimal Value Functions + +**Optimal State-Value Function V*(s)**: + +``` +V*(s) = max_π V^π(s) +``` + +**Meaning**: Maximum value achievable from state s under ANY policy. + +**Optimal Action-Value Function Q*(s,a)**: + +``` +Q*(s,a) = max_π Q^π(s,a) +``` + +**Meaning**: Maximum value achievable from state s, taking action a, then acting optimally. + +**Optimal Policy π***: + +``` +π*(s) = argmax_a Q*(s,a) +``` + +**Meaning**: Policy that achieves V*(s) at all states. + +**Key Insight**: If you know Q*(s,a), optimal policy is trivial (pick action with max Q). + + +### Value Function Pitfall #1: Confusing V and Q + +**Wrong Understanding**: + +- V(s) = value of state s +- Q(s,a) = value of action a (WRONG - ignores state) + +**Correct Understanding**: + +- V(s) = value of state s (average over actions under policy) +- Q(s,a) = value of taking action a IN STATE s + +**Example**: GridWorld + +``` +State s = (1,1) +V(s) might be 5.0 (average value under policy) + +Q(s, RIGHT) = 6.0 (moving right is good) +Q(s, LEFT) = 2.0 (moving left is bad) +Q(s, UP) = 4.0 +Q(s, DOWN) = 7.0 (moving down is best) + +V(s) = π(RIGHT|s)*6 + π(LEFT|s)*2 + π(UP|s)*4 + π(DOWN|s)*7 +``` + +**Takeaway**: Q depends on BOTH state and action. V depends only on state. + + +### Value Function Pitfall #2: Forgetting Expectation + +**Wrong**: V(s) = sum of rewards on one trajectory. + +**Correct**: V(s) = expected sum over ALL possible trajectories. 
+ +**Example**: Stochastic GridWorld + +```python +# WRONG: Compute V by running one episode +episode_return = sum([r_0, r_1, ..., r_T]) +V[s_0] = episode_return # This is ONE sample, not expectation + +# CORRECT: Compute V by averaging over many episodes +returns = [] +for _ in range(1000): + episode_return = run_episode(policy, start_state=s) + returns.append(episode_return) +V[s] = np.mean(returns) # Expectation via Monte Carlo +``` + +**Key**: Value is an expectation, not a single sample. + + +### Value Function Pitfall #3: Ignoring Discount Factor + +**Scenario**: User computes V without discounting. + +**Wrong**: + +```python +V[s] = r_0 + r_1 + r_2 + ... # No discount +``` + +**Correct**: + +```python +V[s] = r_0 + gamma*r_1 + gamma**2*r_2 + ... +``` + +**Why It Matters**: Without discount, values blow up in continuing tasks. + +**Example**: Continuing task with r=1 every step + +``` +Without discount: V = 1 + 1 + 1 + ... = ∞ +With γ=0.9: V = 1 + 0.9 + 0.81 + ... = 1/(1-0.9) = 10 +``` + +**Takeaway**: Always discount future rewards in continuing tasks. + + +## Part 3: Policies + +### What is a Policy? + +A policy π is a mapping from states to actions (or action probabilities). + +**Deterministic Policy**: π: S → A + +``` +π(s) = a (always take action a in state s) +``` + +**Stochastic Policy**: π: S × A → [0,1] + +``` +π(a|s) = probability of taking action a in state s +Σ_a π(a|s) = 1 (probabilities sum to 1) +``` + + +### Example: Policies in GridWorld + +**Deterministic Policy**: + +```python +def policy(state): + if state[0] < 2: + return "RIGHT" + else: + return "DOWN" +``` + +**Stochastic Policy**: + +```python +def policy(state): + # 70% right, 20% down, 10% up + return np.random.choice(["RIGHT", "DOWN", "UP"], + p=[0.7, 0.2, 0.1]) +``` + +**Uniform Random Policy**: + +```python +def policy(state): + return np.random.choice(["UP", "DOWN", "LEFT", "RIGHT"]) +``` + + +### Policy Evaluation + +**Problem**: Given policy π, compute V^π(s) for all states. + +**Approach 1: Monte Carlo** (sample trajectories) + +```python +# Run many episodes, average returns +V = defaultdict(float) +counts = defaultdict(int) + +for episode in range(10000): + trajectory = run_episode(policy) + G = 0 + for (s, a, r) in reversed(trajectory): + G = r + gamma * G + V[s] += G + counts[s] += 1 + +for s in V: + V[s] /= counts[s] # Average +``` + +**Approach 2: Bellman Expectation** (iterative) + +```python +# Initialize V arbitrarily +V = {s: 0 for s in states} + +# Iterate until convergence +while not converged: + V_new = {} + for s in states: + V_new[s] = sum(policy(a|s) * (R(s,a) + gamma * sum(P(s'|s,a) * V[s'] + for s' in states)) + for a in actions) + V = V_new +``` + +**Approach 2 requires knowing P(s'|s,a)** (model-based). + + +### Policy Improvement + +**Theorem**: Given V^π, greedy policy π' with respect to V^π is at least as good as π. + +``` +π'(s) = argmax_a Q^π(s,a) + = argmax_a Σ_{s'} P(s'|s,a) [R(s,a,s') + γV^π(s')] +``` + +**Proof Sketch**: By construction, π' maximizes expected immediate reward + future value. + +**Consequence**: Iterating policy evaluation + policy improvement converges to optimal policy π*. + + +### Optimal Policy π* + +**Theorem**: There exists an optimal policy π*that achieves V*(s) at all states. + +**How to find π* from Q***: + +```python +def optimal_policy(state): + return argmax(Q_star[state, :]) # Greedy w.r.t. 
Q* +``` + +**How to find π* from V***: + +```python +def optimal_policy(state): + # One-step lookahead + return argmax([R(state, a) + gamma * sum(P(s'|state,a) * V_star[s'] + for s' in states) + for a in actions]) +``` + +**Key**: Optimal policy is deterministic (greedy w.r.t. Q*or V*). + +**Exception**: In stochastic games with multiple optimal actions, any distribution over optimal actions is fine. + + +### Policy Pitfall #1: Greedy Policy Without Exploration + +**Problem**: Always taking argmax(Q) means never trying new actions. + +**Example**: + +```python +# Pure greedy policy (WRONG for learning) +def policy(state): + return argmax(Q[state, :]) +``` + +**Why It Fails**: If Q is initialized wrong, agent never explores better actions. + +**Solution**: ε-greedy policy + +```python +def epsilon_greedy_policy(state, epsilon=0.1): + if random.random() < epsilon: + return random.choice(actions) # Explore + else: + return argmax(Q[state, :]) # Exploit +``` + +**Exploration-Exploitation Tradeoff**: Explore to find better actions, exploit to maximize reward. + + +### Policy Pitfall #2: Stochastic Policy for Deterministic Optimal + +**Scenario**: Optimal policy is deterministic (most MDPs), but user uses stochastic policy. + +**Effect**: Suboptimal performance (randomness doesn't help). + +**Example**: GridWorld optimal policy always moves toward goal (deterministic). + +**When Stochastic is Needed**: + +1. **During Learning**: Exploration (ε-greedy, Boltzmann) +2. **Partially Observable**: Stochasticity can help in POMDPs +3. **Multi-Agent**: Randomness prevents exploitation by opponents + +**Takeaway**: After learning, optimal policy is usually deterministic. Use stochastic for exploration. + + +## Part 4: Bellman Equations + +### Bellman Expectation Equation + +**For V^π**: + +``` +V^π(s) = Σ_a π(a|s) Σ_{s'} P(s'|s,a) [R(s,a,s') + γ V^π(s')] +``` + +**Intuition**: Value of state s = expected immediate reward + discounted value of next state. + +**For Q^π**: + +``` +Q^π(s,a) = Σ_{s'} P(s'|s,a) [R(s,a,s') + γ Σ_{a'} π(a'|s') Q^π(s',a')] +``` + +**Intuition**: Value of (s,a) = expected immediate reward + discounted value of next (s',a'). + +**Relationship**: + +``` +V^π(s) = Σ_a π(a|s) Q^π(s,a) +Q^π(s,a) = Σ_{s'} P(s'|s,a) [R(s,a,s') + γ V^π(s')] +``` + + +### Bellman Optimality Equation + +**For V***: + +``` +V*(s) = max_a Σ_{s'} P(s'|s,a) [R(s,a,s') + γ V*(s')] +``` + +**Intuition**: Optimal value = max over actions of (immediate reward + discounted optimal future value). + +**For Q***: + +``` +Q*(s,a) = Σ_{s'} P(s'|s,a) [R(s,a,s') + γ max_{a'} Q*(s',a')] +``` + +**Intuition**: Optimal Q-value = expected immediate reward + discounted optimal Q-value of next state. + +**Relationship**: + +``` +V*(s) = max_a Q*(s,a) +Q*(s,a) = Σ_{s'} P(s'|s,a) [R(s,a,s') + γ V*(s')] +``` + + +### Deriving the Bellman Equation + +**Start with definition of V^π**: + +``` +V^π(s) = E_π[G_t | s_t = s] + = E_π[r_t + γr_{t+1} + γ²r_{t+2} + ... | s_t = s] +``` + +**Factor out first reward**: + +``` +V^π(s) = E_π[r_t + γ(r_{t+1} + γr_{t+2} + ...) | s_t = s] + = E_π[r_t | s_t = s] + γ E_π[r_{t+1} + γr_{t+2} + ... | s_t = s] +``` + +**Second term is V^π(s_{t+1})**: + +``` +V^π(s) = E_π[r_t | s_t = s] + γ E_π[V^π(s_{t+1}) | s_t = s] +``` + +**Expand expectations**: + +``` +V^π(s) = Σ_a π(a|s) Σ_{s'} P(s'|s,a) [R(s,a,s') + γ V^π(s')] +``` + +**This is the Bellman Expectation Equation.** + +**Key Insight**: Value function satisfies a consistency equation (recursive). + + +### Why Bellman Equations Matter + +**1. 
Iterative Algorithms**: Use Bellman equation as update rule + +```python +# Value Iteration +V_new[s] = max_a Σ_{s'} P(s'|s,a) [R(s,a,s') + γ V[s']] + +# Q-Learning +Q[s,a] += alpha * (r + gamma * max_a' Q[s',a'] - Q[s,a]) +``` + +**2. Convergence Guarantees**: Bellman operator is a contraction, guarantees convergence. + +**3. Understanding Algorithms**: All RL algorithms approximate Bellman equations. + +**Takeaway**: Bellman equations are the foundation of RL algorithms. + + +### Bellman Pitfall #1: Forgetting Max vs Expectation + +**Bellman Expectation** (for policy π): + +``` +V^π(s) = Σ_a π(a|s) ... # Expectation over policy +``` + +**Bellman Optimality** (for optimal policy): + +``` +V*(s) = max_a ... # Maximize over actions +``` + +**Consequence**: + +- Policy evaluation uses Bellman expectation +- Value iteration uses Bellman optimality + +**Common Mistake**: Using max when evaluating a non-greedy policy. + + +### Bellman Pitfall #2: Ignoring Transition Probabilities + +**Deterministic Transition**: + +``` +V^π(s) = R(s,a) + γ V^π(s') # Direct, s' is deterministic +``` + +**Stochastic Transition**: + +``` +V^π(s) = Σ_{s'} P(s'|s,a) [R(s,a,s') + γ V^π(s')] # Weighted sum +``` + +**Example**: Stochastic GridWorld + +``` +# Action RIGHT from (1,1) +V((1,1)) = 0.8 * [r + γ V((1,2))] # 80% intended + + 0.1 * [r + γ V((0,1))] # 10% slip up + + 0.1 * [r + γ V((2,1))] # 10% slip down +``` + +**Takeaway**: Don't forget to weight by transition probabilities in stochastic environments. + + +## Part 5: Discount Factor γ + +### What Does γ Control? + +**Discount factor γ ∈ [0, 1]** controls how much the agent cares about future rewards. + +**γ = 0**: Only immediate reward matters + +``` +V(s) = E[r_t] (myopic) +``` + +**γ = 1**: All future rewards matter equally + +``` +V(s) = E[r_t + r_{t+1} + r_{t+2} + ...] (far-sighted) +``` + +**γ = 0.9**: Future discounted exponentially + +``` +V(s) = E[r_t + 0.9*r_{t+1} + 0.81*r_{t+2} + ...] +``` + +**Reward 10 steps away**: + +- γ=0.9: worth 0.9^10 = 0.35 of immediate reward +- γ=0.99: worth 0.99^10 = 0.90 of immediate reward + + +### Planning Horizon + +**Effective Horizon**: How far ahead does agent plan? + +**Approximation**: Horizon ≈ 1/(1-γ) + +**Examples**: + +- γ=0.9 → Horizon ≈ 10 steps +- γ=0.99 → Horizon ≈ 100 steps +- γ=0.5 → Horizon ≈ 2 steps +- γ=0.999 → Horizon ≈ 1000 steps + +**Intuition**: After horizon steps, rewards are discounted to ~37% (e^{-1}). + +**Formal**: Σ_{t=0}^∞ γ^t = 1/(1-γ) (sum of geometric series). + + +### Choosing γ + +**Rule of Thumb**: + +- **Task horizon known**: γ such that 1/(1-γ) ≈ task_length +- **Short episodes** (< 100 steps): γ = 0.9 to 0.95 +- **Long episodes** (100-1000 steps): γ = 0.99 +- **Very long** (> 1000 steps): γ = 0.999 + +**Example: Pong** (episode ~ 1000 steps) + +``` +γ = 0.99 # Horizon ≈ 100, sees ~10% of episode +``` + +**Example: Cartpole** (episode ~ 200 steps) + +``` +γ = 0.99 # Horizon ≈ 100, sees half of episode +``` + +**Example: Chess** (game ~ 40 moves = 80 steps) + +``` +γ = 0.95 # Horizon ≈ 20, sees quarter of game +``` + + +### γ = 1 Special Case + +**When γ = 1**: + +- Only valid for **episodic tasks** (guaranteed termination) +- Continuing tasks: V = ∞ (unbounded) + +**Example: GridWorld** (terminates at goal) + +``` +γ = 1.0 # OK, episode ends +V(s) = -steps_to_goal + 10 (finite) +``` + +**Example: Stock trading** (never terminates) + +``` +γ = 1.0 # WRONG, V = ∞ +γ = 0.99 # Correct +``` + +**Takeaway**: Use γ < 1 for continuing tasks, γ = 1 allowed for episodic. 
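
A quick numerical check of the two points above, showing the effective horizon 1/(1-γ) and why a constant-reward continuing task needs γ < 1 (this is just arithmetic; the 10,000-step truncation stands in for "never terminates"):

```python
def effective_horizon(gamma):
    """Rough planning horizon: rewards beyond ~1/(1-gamma) steps are heavily discounted."""
    return 1.0 / (1.0 - gamma)

def discounted_return(rewards, gamma):
    """G = r_0 + gamma*r_1 + gamma^2*r_2 + ..."""
    return sum((gamma ** t) * r for t, r in enumerate(rewards))

for gamma in [0.9, 0.99, 0.999]:
    print(f"gamma={gamma}: horizon ~ {effective_horizon(gamma):.0f} steps")

# Continuing task: r = +1 every step (truncated at 10,000 steps for illustration)
rewards = [1.0] * 10_000

print(f"gamma=0.99: return ~ {discounted_return(rewards, 0.99):.1f} (bounded, limit 1/(1-0.99) = 100)")
print(f"gamma=1.0 : return = {discounted_return(rewards, 1.0):.0f} (grows without bound as the horizon grows)")
```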
+ + +### Discount Factor Pitfall #1: Too Small γ + +**Scenario**: Task requires 50 steps to reach goal, γ=0.9. + +**Problem**: + +``` +Reward at step 50 discounted by 0.9^50 = 0.0052 +``` + +**Effect**: Agent effectively blind to long-term goals (can't see reward). + +**Solution**: Increase γ to 0.99 (0.99^50 = 0.61, still significant). + +**Symptom**: Agent learns suboptimal policy (ignores distant goals). + + +### Discount Factor Pitfall #2: γ = 1 in Continuing Tasks + +**Scenario**: Continuing task (never terminates), γ=1. + +**Problem**: + +``` +V(s) = r + r + r + ... = ∞ (unbounded) +``` + +**Effect**: Value iteration, Q-learning diverge (values explode). + +**Solution**: Use γ < 1 (e.g., γ=0.99). + +**Symptom**: Values grow without bound, algorithm doesn't converge. + + +### Discount Factor Pitfall #3: Treating γ as Hyperparameter + +**Wrong Mindset**: "Let's grid search γ in [0.9, 0.95, 0.99]." + +**Correct Mindset**: "Task requires planning X steps ahead, so γ = 1 - 1/X." + +**Example**: Goal 100 steps away + +``` +Required horizon = 100 +γ = 1 - 1/100 = 0.99 +``` + +**Takeaway**: γ is not arbitrary. Choose based on task horizon. + + +## Part 6: Algorithm Families + +### Three Paradigms + +**1. Dynamic Programming (DP)**: + +- Requires full MDP model (P, R known) +- Exact algorithms (no sampling) +- Examples: Value Iteration, Policy Iteration + +**2. Monte Carlo (MC)**: + +- Model-free (learn from experience) +- Learns from complete episodes +- Examples: First-visit MC, Every-visit MC + +**3. Temporal Difference (TD)**: + +- Model-free (learn from experience) +- Learns from incomplete episodes +- Examples: TD(0), Q-learning, SARSA + +**Key Differences**: + +- DP: Needs model, no sampling +- MC: No model, full episodes +- TD: No model, partial episodes (most flexible) + + +### Value Iteration + +**Algorithm**: Iteratively apply Bellman optimality operator. + +```python +# Initialize +V = {s: 0 for s in states} + +# Iterate until convergence +while not converged: + V_new = {} + for s in states: + # Bellman optimality backup + V_new[s] = max([sum(P(s_next|s,a) * (R(s,a,s_next) + gamma * V[s_next]) + for s_next in states) + for a in actions]) + + if max(abs(V_new[s] - V[s]) for s in states) < threshold: + converged = True + V = V_new + +# Extract policy +policy = {s: argmax([sum(P(s_next|s,a) * (R(s,a,s_next) + gamma * V[s_next]) + for s_next in states) + for a in actions]) + for s in states} +``` + +**Convergence**: Guaranteed (Bellman operator is contraction). + +**Computational Cost**: O(|S|² |A|) per iteration. + +**When to Use**: Small state spaces (< 10,000 states), full model available. + + +### Policy Iteration + +**Algorithm**: Alternate between policy evaluation and policy improvement. + +```python +# Initialize random policy +policy = {s: random.choice(actions) for s in states} + +while not converged: + # Policy Evaluation: Compute V^π + V = {s: 0 for s in states} + while not converged_V: + V_new = {} + for s in states: + a = policy[s] + V_new[s] = sum(P(s_next|s,a) * (R(s,a,s_next) + gamma * V[s_next]) + for s_next in states) + V = V_new + + # Policy Improvement: Make policy greedy w.r.t. V + policy_stable = True + for s in states: + old_action = policy[s] + policy[s] = argmax([sum(P(s_next|s,a) * (R(s,a,s_next) + gamma * V[s_next]) + for s_next in states) + for a in actions]) + if old_action != policy[s]: + policy_stable = False + + if policy_stable: + converged = True +``` + +**Convergence**: Guaranteed, often fewer iterations than value iteration. 
+ +**When to Use**: When policy converges faster than values (common). + +**Key Difference from Value Iteration**: + +- Value iteration: no explicit policy until end +- Policy iteration: maintain and improve policy each iteration + + +### Monte Carlo Methods + +**Idea**: Estimate V^π(s) by averaging returns from state s. + +```python +# First-visit MC +V = defaultdict(float) +counts = defaultdict(int) + +for episode in range(num_episodes): + trajectory = run_episode(policy) # [(s_0, a_0, r_0), ..., (s_T, a_T, r_T)] + + G = 0 + visited = set() + for (s, a, r) in reversed(trajectory): + G = r + gamma * G # Accumulate return + + if s not in visited: # First-visit + V[s] += G + counts[s] += 1 + visited.add(s) + + for s in counts: + V[s] /= counts[s] # Average return +``` + +**Advantages**: + +- No model needed (model-free) +- Can handle stochastic environments +- Unbiased estimates + +**Disadvantages**: + +- Requires complete episodes (can't learn mid-episode) +- High variance (one trajectory is noisy) +- Slow convergence + +**When to Use**: Episodic tasks, when model unavailable. + + +### Temporal Difference (TD) Learning + +**Idea**: Update V after each step using bootstrapping. + +**TD(0) Update**: + +```python +V[s] += alpha * (r + gamma * V[s_next] - V[s]) +# \_____________________/ +# TD error +``` + +**Bootstrapping**: Use current estimate V[s_next] instead of true return. + +**Full Algorithm**: + +```python +V = {s: 0 for s in states} + +for episode in range(num_episodes): + s = initial_state() + + while not terminal: + a = policy(s) + s_next, r = environment.step(s, a) + + # TD update + V[s] += alpha * (r + gamma * V[s_next] - V[s]) + + s = s_next +``` + +**Advantages**: + +- No model needed (model-free) +- Can learn from incomplete episodes (online) +- Lower variance than MC + +**Disadvantages**: + +- Biased estimates (bootstrap uses estimate) +- Requires tuning α (learning rate) + +**When to Use**: Model-free, need online learning. + + +### Q-Learning (TD for Q-values) + +**TD for action-values Q(s,a)**: + +```python +Q[s,a] += alpha * (r + gamma * max_a' Q[s_next, a'] - Q[s,a]) +``` + +**Full Algorithm**: + +```python +Q = defaultdict(lambda: defaultdict(float)) + +for episode in range(num_episodes): + s = initial_state() + + while not terminal: + # ε-greedy action selection + if random.random() < epsilon: + a = random.choice(actions) + else: + a = argmax(Q[s]) + + s_next, r = environment.step(s, a) + + # Q-learning update (off-policy) + Q[s][a] += alpha * (r + gamma * max(Q[s_next].values()) - Q[s][a]) + + s = s_next +``` + +**Key**: Off-policy (learns optimal Q regardless of behavior policy). + +**When to Use**: Model-free, discrete actions, want optimal policy. + + +### SARSA (On-Policy TD) + +**Difference from Q-learning**: Uses next action from policy (on-policy). 
+ +```python +Q[s,a] += alpha * (r + gamma * Q[s_next, a_next] - Q[s,a]) +# ^^^^^^ +# Action from policy, not max +``` + +**Full Algorithm**: + +```python +Q = defaultdict(lambda: defaultdict(float)) + +for episode in range(num_episodes): + s = initial_state() + a = epsilon_greedy(Q[s], epsilon) # Choose first action + + while not terminal: + s_next, r = environment.step(s, a) + a_next = epsilon_greedy(Q[s_next], epsilon) # Next action from policy + + # SARSA update (on-policy) + Q[s][a] += alpha * (r + gamma * Q[s_next][a_next] - Q[s][a]) + + s, a = s_next, a_next +``` + +**Difference from Q-learning**: + +- Q-learning: learns optimal policy (off-policy) +- SARSA: learns policy being followed (on-policy) + +**When to Use**: When you want policy to reflect exploration strategy. + + +### Algorithm Comparison + +| Algorithm | Model? | Episodes? | Convergence | Use Case | +|-----------|--------|-----------|-------------|----------| +| Value Iteration | Yes (P, R) | No | Guaranteed | Small MDPs, known model | +| Policy Iteration | Yes (P, R) | No | Guaranteed, faster | Small MDPs, good init policy | +| Monte Carlo | No | Complete | Slow, high variance | Episodic, model-free | +| TD(0) | No | Partial | Faster, lower variance | Online, model-free | +| Q-Learning | No | Partial | Guaranteed* | Discrete actions, off-policy | +| SARSA | No | Partial | Guaranteed* | On-policy, safe exploration | + +*With appropriate exploration and learning rate schedule. + + +### Algorithm Pitfall #1: Using DP Without Model + +**Scenario**: User tries value iteration on real robot (no model). + +**Problem**: Value iteration requires P(s'|s,a) and R(s,a,s'). + +**Solution**: Use model-free methods (Q-learning, SARSA, policy gradients). + +**Red Flag**: "Let's use policy iteration for Atari games." (No model available.) + + +### Algorithm Pitfall #2: Monte Carlo on Non-Episodic Tasks + +**Scenario**: Continuing task (never terminates), try MC. + +**Problem**: MC requires complete episodes to compute return. + +**Solution**: Use TD methods (learn from partial trajectories). + +**Red Flag**: "Let's use MC for stock trading." (Continuing task.) + + +### Algorithm Pitfall #3: Confusing Q-Learning and SARSA + +**Scenario**: User uses Q-learning but expects on-policy behavior. + +**Example**: Cliff walking with epsilon-greedy + +- Q-learning: Learns optimal (risky) path along cliff +- SARSA: Learns safe path away from cliff (accounts for exploration) + +**Takeaway**: + +- Q-learning: Learns optimal policy (off-policy) +- SARSA: Learns policy being followed (on-policy) + +**Choose based on whether you want optimal policy or policy that accounts for exploration.** + + +## Part 7: Exploration vs Exploitation + +### The Tradeoff + +**Exploitation**: Choose action with highest known value (maximize immediate reward). + +**Exploration**: Try new actions to discover if they're better (maximize long-term information). + +**Dilemma**: Must explore to find optimal policy, but exploration sacrifices short-term reward. + +**Example**: Restaurant choice + +- Exploitation: Go to your favorite restaurant (known good) +- Exploration: Try a new restaurant (might be better, might be worse) + + +### Why Exploration is Necessary + +**Scenario**: GridWorld, Q-values initialized to 0. + +**Without Exploration**: + +```python +# Greedy policy +policy(s) = argmax(Q[s, :]) # Always 0 initially, picks arbitrary action +``` + +**Problem**: If first action happens to be BAD, Q[s,a] becomes negative, never tried again. 
+ +**Result**: Agent stuck in suboptimal policy (local optimum). + +**With Exploration**: + +```python +# ε-greedy +if random.random() < epsilon: + action = random.choice(actions) # Explore +else: + action = argmax(Q[s, :]) # Exploit +``` + +**Result**: Eventually tries all actions, discovers optimal. + + +### ε-Greedy Exploration + +**Algorithm**: + +```python +def epsilon_greedy(state, Q, epsilon=0.1): + if random.random() < epsilon: + return random.choice(actions) # Explore with prob ε + else: + return argmax(Q[state, :]) # Exploit with prob 1-ε +``` + +**Tuning ε**: + +- **ε = 0**: No exploration (greedy, can get stuck) +- **ε = 1**: Random policy (no exploitation, never converges) +- **ε = 0.1**: Common choice (10% exploration) + +**Decay Schedule**: + +```python +epsilon = max(epsilon_min, epsilon * decay_rate) +# Start high (ε=1.0), decay to low (ε=0.01) +``` + +**Rationale**: Explore heavily early, exploit more as you learn. + + +### Upper Confidence Bound (UCB) + +**Idea**: Choose action that balances value and uncertainty. + +**UCB Formula**: + +```python +action = argmax(Q[s,a] + c * sqrt(log(N[s]) / N[s,a])) +# ^^^^^^ ^^^^^^^^^^^^^^^^^^^^^^^^^^^ +# Exploitation Exploration bonus +``` + +**Where**: + +- N[s] = number of times state s visited +- N[s,a] = number of times action a taken in state s +- c = exploration constant + +**Intuition**: Actions tried less often get exploration bonus (uncertainty). + +**Advantage over ε-greedy**: Adaptive exploration (focuses on uncertain actions). + + +### Optimistic Initialization + +**Idea**: Initialize Q-values to high values (optimistic). + +```python +Q = defaultdict(lambda: defaultdict(lambda: 10.0)) # Optimistic +``` + +**Effect**: All actions initially seem good, encourages exploration. + +**How it works**: + +1. All Q-values start high (optimistic) +2. Agent tries action, gets real reward (likely lower) +3. Q-value decreases, agent tries other actions +4. Continues until all actions explored + +**Advantage**: Simple, no ε parameter. + +**Disadvantage**: Only works for finite action spaces, exploration stops after initial phase. + + +### Boltzmann Exploration (Softmax) + +**Idea**: Choose actions probabilistically based on Q-values. + +```python +def softmax(Q, temperature=1.0): + exp_Q = np.exp(Q / temperature) + return exp_Q / np.sum(exp_Q) + +probs = softmax(Q[state, :]) +action = np.random.choice(actions, p=probs) +``` + +**Temperature**: + +- High temperature (τ→∞): Uniform random (more exploration) +- Low temperature (τ→0): Greedy (more exploitation) + +**Advantage**: Naturally weights exploration by Q-values (poor actions less likely). + +**Disadvantage**: Requires tuning temperature, computationally more expensive. + + +### Exploration Pitfall #1: No Exploration + +**Scenario**: Pure greedy policy. + +```python +action = argmax(Q[state, :]) # No randomness +``` + +**Problem**: Agent never explores, gets stuck in local optimum. + +**Example**: Q-values initialized to 0, first action is UP (arbitrary). + +- Agent always chooses UP (Q still 0 for others) +- Never discovers RIGHT is optimal +- Stuck forever + +**Solution**: Always use some exploration (ε-greedy with ε ≥ 0.01). + + +### Exploration Pitfall #2: Too Much Exploration + +**Scenario**: ε = 0.5 (50% random actions). + +**Problem**: Agent wastes time on known-bad actions. + +**Effect**: Slow convergence, poor performance even after learning. + +**Solution**: Decay ε over time (start high, end low). 
+ +```python +epsilon = max(0.01, epsilon * 0.995) # Decay to 1% +``` + + +### Exploration Pitfall #3: Exploration at Test Time + +**Scenario**: Evaluating learned policy with ε-greedy (ε=0.1). + +**Problem**: Test performance artificially low (10% random actions). + +**Solution**: Use greedy policy at test time. + +```python +# Training +action = epsilon_greedy(state, Q, epsilon=0.1) + +# Testing +action = argmax(Q[state, :]) # Greedy, no exploration +``` + +**Takeaway**: Exploration is for learning, not evaluation. + + +## Part 8: When Theory is Sufficient + +### Theory vs Implementation + +**When Understanding Theory is Enough**: + +1. **Debugging**: Understanding Bellman equation explains why Q-values aren't converging +2. **Hyperparameter Tuning**: Understanding γ explains why agent is myopic +3. **Algorithm Selection**: Understanding model-free vs model-based explains why value iteration fails +4. **Conceptual Design**: Understanding exploration explains why agent gets stuck + +**When You Need Implementation**: + +1. **Real Problems**: Toy examples don't teach debugging real environments +2. **Scaling**: Neural networks, replay buffers, parallel environments +3. **Engineering**: Practical details (learning rate schedules, reward clipping) + +**This Skill's Scope**: Theory, intuition, foundations. + +**Other Skills for Implementation**: value-based-methods, policy-gradient-methods, actor-critic-methods. + + +### What This Skill Taught You + +**1. MDP Formulation**: S, A, P, R, γ - the framework for RL. + +**2. Value Functions**: V(s) = expected cumulative reward, Q(s,a) = value of action in state. + +**3. Bellman Equations**: Recursive decomposition, foundation of all algorithms. + +**4. Discount Factor**: γ controls planning horizon (1/(1-γ)). + +**5. Policies**: Deterministic vs stochastic, optimal policy π*. + +**6. Algorithms**: + +- DP: Value iteration, policy iteration (model-based) +- MC: Monte Carlo (episodic, model-free) +- TD: Q-learning, SARSA (online, model-free) + +**7. Exploration**: ε-greedy, UCB, necessary for learning. + +**8. Theory-Practice Gap**: When theory suffices vs when to implement. + + +### Next Steps + +After mastering foundations, route to: + +**For Discrete Actions**: + +- **value-based-methods**: DQN, Double DQN, Dueling DQN (Q-learning + neural networks) + +**For Continuous Actions**: + +- **actor-critic-methods**: SAC, TD3, A2C (policy + value function) + +**For Any Action Space**: + +- **policy-gradient-methods**: REINFORCE, PPO (direct policy optimization) + +**For Debugging**: + +- **rl-debugging**: Why agent not learning, reward issues, convergence problems + +**For Environment Setup**: + +- **rl-environments**: Gym, custom environments, wrappers + + +## Part 9: Common Pitfalls + +### Pitfall #1: Skipping MDP Formulation + +**Symptom**: Implementing Q-learning without defining states, actions, rewards clearly. + +**Consequence**: Algorithm fails, user doesn't know why. + +**Solution**: Always answer: + +- What are states? (Markovian?) +- What are actions? (Discrete/continuous?) +- What is reward function? (Sparse/dense?) +- What is discount factor? (Based on horizon?) + + +### Pitfall #2: Confusing Value and Reward + +**Symptom**: Setting V(s) = r(s). + +**Consequence**: Ignores future rewards, policy suboptimal. + +**Solution**: V(s) = E[r + γr' + γ²r'' + ...], not just r. + + +### Pitfall #3: Arbitrary Discount Factor + +**Symptom**: "Let's use γ=0.9 because it's common." 
+ +**Consequence**: Agent can't see long-term goals (if γ too small) or values diverge (if γ=1 in continuing task). + +**Solution**: Choose γ based on horizon (γ = 1 - 1/horizon). + + +### Pitfall #4: No Exploration + +**Symptom**: Pure greedy policy during learning. + +**Consequence**: Agent stuck in local optimum. + +**Solution**: ε-greedy with ε ≥ 0.01, decay over time. + + +### Pitfall #5: Using DP Without Model + +**Symptom**: Trying value iteration on real robot. + +**Consequence**: Algorithm requires P(s'|s,a), R(s,a), which are unknown. + +**Solution**: Use model-free methods (Q-learning, policy gradients). + + +### Pitfall #6: Monte Carlo on Continuing Tasks + +**Symptom**: Using MC on task that never terminates. + +**Consequence**: Cannot compute return (episode never ends). + +**Solution**: Use TD methods (learn from partial trajectories). + + +### Pitfall #7: Confusing Q-Learning and SARSA + +**Symptom**: Using Q-learning but expecting safe exploration. + +**Consequence**: Q-learning learns optimal (risky) policy, ignores exploration safety. + +**Solution**: Use SARSA for safe on-policy learning, Q-learning for optimal off-policy. + + +### Pitfall #8: Exploration at Test Time + +**Symptom**: Evaluating with ε-greedy (ε > 0). + +**Consequence**: Test performance artificially low. + +**Solution**: Greedy policy at test time (ε=0). + + +### Pitfall #9: Treating Bellman as Black Box + +**Symptom**: Using Q-learning update without understanding why. + +**Consequence**: Cannot debug convergence issues, tune hyperparameters. + +**Solution**: Derive Bellman equation, understand bootstrapping. + + +### Pitfall #10: Ignoring Transition Probabilities + +**Symptom**: Using deterministic Bellman equation in stochastic environment. + +**Consequence**: Wrong value estimates. + +**Solution**: Weight by P(s'|s,a) in stochastic environments. + + +## Part 10: Rationalization Resistance + +### Rationalization Table + +| Rationalization | Reality | Counter-Guidance | Red Flag | +|-----------------|---------|------------------|----------| +| "I'll just copy Q-learning code" | Doesn't understand Q(s,a) meaning, cannot debug | "Let's understand what Q represents: expected cumulative reward. Why does Bellman equation have max?" | Jumping to code without theory | +| "V(s) is the reward at state s" | V is cumulative, r is immediate | "V(s) = E[r + γr' + ...], not just r. Value is long-term." | Confusing value and reward | +| "γ=0.9 is standard" | γ depends on task horizon | "What's your task horizon? γ=0.9 means ~10 steps. Need more?" | Arbitrary discount factor | +| "I don't need exploration, greedy is fine" | Gets stuck in local optimum | "Without exploration, you never try new actions. Use ε-greedy." | No exploration strategy | +| "Value iteration for Atari" | Atari doesn't have model (P, R unknown) | "Value iteration needs full model. Use model-free (DQN)." | DP on model-free problem | +| "Monte Carlo for continuing task" | MC requires episodes (termination) | "MC needs complete episodes. Use TD for continuing tasks." | MC on continuing task | +| "Q-learning and SARSA are the same" | Q-learning off-policy, SARSA on-policy | "Q-learning learns optimal, SARSA learns policy followed." | Confusing on-policy and off-policy | +| "I'll test with ε-greedy (ε=0.1)" | Test should be greedy (exploit only) | "Exploration is for learning. Test with ε=0 (greedy)." | Exploration at test time | +| "Bellman equation is just a formula" | It's the foundation of all algorithms | "Derive it. 
Understand why V(s) = r + γV(s'). Enables debugging." | Black-box understanding | +| "Deterministic transition, no need for P" | Correct, but must recognize when stochastic | "If stochastic, must weight by P(s'|s,a). Check environment." | Ignoring stochasticity | + + +## Part 11: Red Flags + +Watch for these signs of misunderstanding: + +- [ ] **Skipping MDP Formulation**: Implementing algorithm without defining S, A, P, R, γ +- [ ] **Value-Reward Confusion**: Treating V(s) as immediate reward instead of cumulative +- [ ] **Arbitrary γ**: Choosing discount factor without considering task horizon +- [ ] **No Exploration**: Pure greedy policy during learning +- [ ] **DP Without Model**: Using value/policy iteration when model unavailable +- [ ] **MC on Continuing**: Using Monte Carlo on non-episodic tasks +- [ ] **Q-SARSA Confusion**: Not understanding on-policy vs off-policy +- [ ] **Test Exploration**: Using ε-greedy during evaluation +- [ ] **Bellman Black Box**: Using TD updates without understanding Bellman equation +- [ ] **Ignoring Stochasticity**: Forgetting transition probabilities in stochastic environments +- [ ] **Planning Horizon Mismatch**: γ=0.9 for task requiring 100-step planning +- [ ] **Policy-Value Confusion**: Confusing π(s) and V(s), or Q(s,a) and π(a|s) + +**If any red flag triggered → Explain theory → Derive equation → Connect to algorithm** + + +## Part 12: Code Examples + +### Example 1: Value Iteration on GridWorld + +```python +import numpy as np + +# GridWorld: 4x4, goal at (3,3), walls at (1,1) and (2,2) +grid_size = 4 +goal = (3, 3) +walls = {(1, 1), (2, 2)} + +# MDP definition +gamma = 0.9 +actions = ['UP', 'DOWN', 'LEFT', 'RIGHT'] + +def next_state(s, a): + """Deterministic transition""" + x, y = s + if a == 'UP': x -= 1 + elif a == 'DOWN': x += 1 + elif a == 'LEFT': y -= 1 + elif a == 'RIGHT': y += 1 + + # Boundary check + x = max(0, min(grid_size - 1, x)) + y = max(0, min(grid_size - 1, y)) + + # Wall check + if (x, y) in walls: + return s # Bounce back + return (x, y) + +def reward(s, a, s_next): + """Reward function""" + if s_next == goal: + return 10 + elif s_next in walls: + return -5 + else: + return -1 + +# Value Iteration +V = np.zeros((grid_size, grid_size)) +threshold = 0.01 +max_iterations = 1000 + +for iteration in range(max_iterations): + V_new = np.zeros((grid_size, grid_size)) + + for x in range(grid_size): + for y in range(grid_size): + s = (x, y) + + if s == goal: + V_new[x, y] = 0 # Terminal state + continue + + # Bellman optimality backup + values = [] + for a in actions: + s_next = next_state(s, a) + r = reward(s, a, s_next) + value = r + gamma * V[s_next[0], s_next[1]] + values.append(value) + + V_new[x, y] = max(values) + + # Check convergence + if np.max(np.abs(V_new - V)) < threshold: + print(f"Converged in {iteration} iterations") + break + + V = V_new + +# Extract policy +policy = {} +for x in range(grid_size): + for y in range(grid_size): + s = (x, y) + if s == goal: + policy[s] = None + continue + + best_action = None + best_value = -float('inf') + for a in actions: + s_next = next_state(s, a) + r = reward(s, a, s_next) + value = r + gamma * V[s_next[0], s_next[1]] + if value > best_value: + best_value = value + best_action = a + policy[s] = best_action + +print("Value Function:") +print(V) +print("\nOptimal Policy:") +for x in range(grid_size): + row = [] + for y in range(grid_size): + action = policy.get((x, y), '') + if action == 'UP': symbol = '↑' + elif action == 'DOWN': symbol = '↓' + elif action == 'LEFT': symbol = '←' + 
elif action == 'RIGHT': symbol = '→' + else: symbol = 'G' # Goal + row.append(symbol) + print(' '.join(row)) +``` + +**Output**: + +``` +Converged in 23 iterations +Value Function: +[[ 2.39 3.65 5.05 6.17] + [ 3.65 0. 6.17 7.59] + [ 5.05 0. 7.59 8.77] + [ 6.17 7.59 8.77 0. ]] + +Optimal Policy: +→ → → ↓ +↓ G → ↓ +→ G → ↓ +→ → → G +``` + +**Key Observations**: + +- Values increase as you get closer to goal +- Policy points toward goal (shortest path) +- Walls (value=0) are avoided + + +### Example 2: Q-Learning on GridWorld + +```python +import numpy as np +import random + +# Same GridWorld setup +grid_size = 4 +goal = (3, 3) +walls = {(1, 1), (2, 2)} +actions = ['UP', 'DOWN', 'LEFT', 'RIGHT'] +gamma = 0.9 +alpha = 0.1 # Learning rate +epsilon = 0.1 # Exploration + +# Q-table +Q = {} +for x in range(grid_size): + for y in range(grid_size): + for a in actions: + Q[((x, y), a)] = 0.0 + +def epsilon_greedy(s, epsilon): + if random.random() < epsilon: + return random.choice(actions) + else: + # Greedy + best_action = actions[0] + best_value = Q[(s, best_action)] + for a in actions: + if Q[(s, a)] > best_value: + best_value = Q[(s, a)] + best_action = a + return best_action + +# Training +num_episodes = 1000 + +for episode in range(num_episodes): + s = (0, 0) # Start state + + while s != goal: + # Choose action + a = epsilon_greedy(s, epsilon) + + # Take action + s_next = next_state(s, a) + r = reward(s, a, s_next) + + # Q-learning update + if s_next == goal: + max_Q_next = 0 # Terminal + else: + max_Q_next = max(Q[(s_next, a_prime)] for a_prime in actions) + + Q[(s, a)] += alpha * (r + gamma * max_Q_next - Q[(s, a)]) + + s = s_next + +# Extract policy +print("Learned Policy:") +for x in range(grid_size): + row = [] + for y in range(grid_size): + s = (x, y) + if s == goal: + row.append('G') + else: + best_action = max(actions, key=lambda a: Q[(s, a)]) + if best_action == 'UP': symbol = '↑' + elif best_action == 'DOWN': symbol = '↓' + elif best_action == 'LEFT': symbol = '←' + elif best_action == 'RIGHT': symbol = '→' + row.append(symbol) + print(' '.join(row)) +``` + +**Output** (similar to value iteration): + +``` +→ → → ↓ +↓ G → ↓ +→ G → ↓ +→ → → G +``` + +**Key Differences from Value Iteration**: + +- Q-learning is model-free (doesn't need P, R) +- Learns from experience (episodes) +- Uses ε-greedy exploration +- Requires many episodes to converge + + +### Example 3: Policy Evaluation (MC vs TD) + +```python +import numpy as np +from collections import defaultdict +import random + +# Simple chain MDP: s0 → s1 → s2 → goal +# Deterministic policy: always go right +# Reward: -1 per step, +10 at goal +# gamma = 0.9 + +gamma = 0.9 + +# Monte Carlo Policy Evaluation +def mc_policy_evaluation(num_episodes=1000): + V = defaultdict(float) + counts = defaultdict(int) + + for _ in range(num_episodes): + # Generate episode + trajectory = [ + (0, -1), # (state, reward) + (1, -1), + (2, -1), + (3, 10), # goal + ] + + # Compute returns + G = 0 + visited = set() + for s, r in reversed(trajectory): + G = r + gamma * G + if s not in visited: + V[s] += G + counts[s] += 1 + visited.add(s) + + for s in V: + V[s] /= counts[s] + + return V + +# TD(0) Policy Evaluation +def td_policy_evaluation(num_episodes=1000, alpha=0.1): + V = defaultdict(float) + + for _ in range(num_episodes): + s = 0 + + while s != 3: # Until goal + # Take action (deterministic policy) + s_next = s + 1 + r = 10 if s_next == 3 else -1 + + # TD update + V[s] += alpha * (r + gamma * V[s_next] - V[s]) + + s = s_next + + return V + +# Compare 
+V_mc = mc_policy_evaluation() +V_td = td_policy_evaluation() + +print("Monte Carlo V:") +print({s: round(V_mc[s], 2) for s in [0, 1, 2]}) + +print("\nTD(0) V:") +print({s: round(V_td[s], 2) for s in [0, 1, 2]}) + +# True values (analytical) +V_true = { + 0: -1 + gamma * (-1 + gamma * (-1 + gamma * 10)), + 1: -1 + gamma * (-1 + gamma * 10), + 2: -1 + gamma * 10, +} +print("\nTrue V:") +print({s: round(V_true[s], 2) for s in [0, 1, 2]}) +``` + +**Output**: + +``` +Monte Carlo V: +{0: 4.39, 1: 6.1, 2: 8.0} + +TD(0) V: +{0: 4.41, 1: 6.12, 2: 8.01} + +True V: +{0: 4.39, 1: 6.1, 2: 8.0} +``` + +**Observations**: + +- Both MC and TD converge to true values +- TD uses bootstrapping (updates before episode ends) +- MC waits for complete episode + + +### Example 4: Discount Factor Impact + +```python +import numpy as np + +# Simple MDP: chain of 10 states, +1 reward at end +# Compare different gamma values + +def value_iteration_chain(gamma, num_states=10): + V = np.zeros(num_states + 1) # +1 for goal + + # Value iteration + for _ in range(100): + V_new = np.zeros(num_states + 1) + for s in range(num_states): + # Deterministic: s → s+1, reward = +1 at goal + s_next = s + 1 + r = 1 if s_next == num_states else 0 + V_new[s] = r + gamma * V[s_next] + V = V_new + + return V[:num_states] # Exclude goal + +# Compare gamma values +for gamma in [0.5, 0.9, 0.99, 1.0]: + V = value_iteration_chain(gamma) + print(f"γ={gamma}:") + print(f" V(s_0) = {V[0]:.4f}") + print(f" V(s_5) = {V[5]:.4f}") + print(f" V(s_9) = {V[9]:.4f}") + print(f" Effective horizon = {1/(1-gamma) if gamma < 1 else 'inf':.1f}\n") +``` + +**Output**: + +``` +γ=0.5: + V(s_0) = 0.0010 + V(s_5) = 0.0313 + V(s_9) = 0.5000 + Effective horizon = 2.0 + +γ=0.9: + V(s_0) = 0.3487 + V(s_5) = 0.5905 + V(s_9) = 0.9000 + Effective horizon = 10.0 + +γ=0.99: + V(s_0) = 0.9044 + V(s_5) = 0.9510 + V(s_9) = 0.9900 + Effective horizon = 100.0 + +γ=1.0: + V(s_0) = 1.0000 + V(s_5) = 1.0000 + V(s_9) = 1.0000 + Effective horizon = inf +``` + +**Key Insights**: + +- γ=0.5: Value at s_0 is tiny (can't "see" reward 10 steps away) +- γ=0.9: Moderate values (horizon ≈ 10, matches task length) +- γ=0.99: High values (can plan far ahead) +- γ=1.0: All states have same value (no discounting) + +**Lesson**: Choose γ based on how far ahead agent must plan. 
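+
+As a quick companion to that lesson, here is a minimal sketch of the rule of thumb γ ≈ 1 − 1/horizon used throughout this skill (the helper name `gamma_for_horizon` is illustrative, not from any library):
+
+```python
+def gamma_for_horizon(horizon_steps: int) -> float:
+    """Rule-of-thumb discount for a desired planning horizon: gamma = 1 - 1/H."""
+    return 1.0 - 1.0 / max(horizon_steps, 1)
+
+for horizon in [2, 10, 100]:
+    g = gamma_for_horizon(horizon)
+    print(f"horizon={horizon:>3} -> gamma={g:.3f} (effective horizon ~ {1 / (1 - g):.1f})")
+```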
+ + +### Example 5: Exploration Comparison + +```python +import numpy as np +import random + +# Simple bandit: 3 actions, true Q* = [1.0, 5.0, 3.0] +# Compare exploration strategies + +true_Q = [1.0, 5.0, 3.0] +num_actions = 3 + +def sample_reward(action): + """Stochastic reward""" + return true_Q[action] + np.random.randn() * 0.5 + +# Strategy 1: ε-greedy +def epsilon_greedy_experiment(epsilon=0.1, num_steps=1000): + Q = [0.0] * num_actions + counts = [0] * num_actions + + total_reward = 0 + for _ in range(num_steps): + # Choose action + if random.random() < epsilon: + action = random.randint(0, num_actions - 1) + else: + action = np.argmax(Q) + + # Observe reward + reward = sample_reward(action) + total_reward += reward + + # Update Q + counts[action] += 1 + Q[action] += (reward - Q[action]) / counts[action] + + return total_reward / num_steps + +# Strategy 2: UCB +def ucb_experiment(c=2.0, num_steps=1000): + Q = [0.0] * num_actions + counts = [0] * num_actions + + # Initialize: try each action once + for a in range(num_actions): + reward = sample_reward(a) + counts[a] = 1 + Q[a] = reward + + total_reward = 0 + for t in range(num_actions, num_steps): + # UCB action selection + ucb_values = [Q[a] + c * np.sqrt(np.log(t) / counts[a]) + for a in range(num_actions)] + action = np.argmax(ucb_values) + + # Observe reward + reward = sample_reward(action) + total_reward += reward + + # Update Q + counts[action] += 1 + Q[action] += (reward - Q[action]) / counts[action] + + return total_reward / num_steps + +# Strategy 3: Greedy (no exploration) +def greedy_experiment(num_steps=1000): + Q = [0.0] * num_actions + counts = [0] * num_actions + + total_reward = 0 + for _ in range(num_steps): + action = np.argmax(Q) + reward = sample_reward(action) + total_reward += reward + + counts[action] += 1 + Q[action] += (reward - Q[action]) / counts[action] + + return total_reward / num_steps + +# Compare (average over 100 runs) +num_runs = 100 + +greedy_rewards = [greedy_experiment() for _ in range(num_runs)] +epsilon_rewards = [epsilon_greedy_experiment() for _ in range(num_runs)] +ucb_rewards = [ucb_experiment() for _ in range(num_runs)] + +print(f"Greedy: {np.mean(greedy_rewards):.2f} ± {np.std(greedy_rewards):.2f}") +print(f"ε-greedy: {np.mean(epsilon_rewards):.2f} ± {np.std(epsilon_rewards):.2f}") +print(f"UCB: {np.mean(ucb_rewards):.2f} ± {np.std(ucb_rewards):.2f}") +print(f"\nOptimal: {max(true_Q):.2f}") +``` + +**Output**: + +``` +Greedy: 1.05 ± 0.52 +ε-greedy: 4.62 ± 0.21 +UCB: 4.83 ± 0.18 + +Optimal: 5.00 +``` + +**Insights**: + +- Greedy: Gets stuck on first action (often suboptimal) +- ε-greedy: Explores, finds near-optimal +- UCB: Slightly better, focuses exploration on uncertain actions + +**Lesson**: Exploration is critical. UCB > ε-greedy > greedy. + + +## Part 13: When to Route Elsewhere + +This skill covers **theory and foundations**. 
Route to other skills for: + +**Implementation**: + +- **value-based-methods**: DQN, Double DQN, Dueling DQN (Q-learning + neural networks) +- **policy-gradient-methods**: REINFORCE, PPO, TRPO (policy optimization) +- **actor-critic-methods**: A2C, SAC, TD3 (policy + value) + +**Debugging**: + +- **rl-debugging**: Agent not learning, reward issues, convergence problems + +**Infrastructure**: + +- **rl-environments**: Gym API, custom environments, wrappers + +**Special Topics**: + +- **exploration-strategies**: Curiosity, RND, intrinsic motivation +- **reward-shaping**: Potential-based shaping, inverse RL +- **multi-agent-rl**: QMIX, MADDPG, cooperative/competitive +- **offline-rl**: CQL, IQL, learning from fixed datasets +- **model-based-rl**: MBPO, Dreamer, world models + +**Evaluation**: + +- **rl-evaluation**: Proper evaluation methodology, metrics + + +## Summary + +**You now understand**: + +1. **MDP**: S, A, P, R, γ - the framework for RL +2. **Value Functions**: V(s) = cumulative expected reward, Q(s,a) = value of action +3. **Bellman Equations**: Recursive decomposition, foundation of algorithms +4. **Discount Factor**: γ controls planning horizon (1/(1-γ)) +5. **Policies**: π(s) maps states to actions, π* is optimal +6. **Algorithms**: DP (value/policy iteration), MC, TD (Q-learning, SARSA) +7. **Exploration**: ε-greedy, UCB, necessary for learning +8. **Theory-Practice**: When understanding suffices vs when to implement + +**Key Takeaways**: + +- **MDP formulation comes first** (define S, A, P, R, γ before implementing) +- **Value ≠ Reward** (V is cumulative, r is immediate) +- **γ is not arbitrary** (choose based on task horizon) +- **Exploration is mandatory** (ε-greedy, UCB, not pure greedy) +- **Algorithm families differ** (DP needs model, MC needs episodes, TD is most flexible) +- **Bellman equations enable everything** (understand them to debug algorithms) + +**Next**: Route to implementation skills (value-based, policy-gradient, actor-critic) to build real agents. + +**This foundation will enable you to implement, debug, and extend RL algorithms effectively.** diff --git a/skills/using-deep-rl/value-based-methods.md b/skills/using-deep-rl/value-based-methods.md new file mode 100644 index 0000000..02858f6 --- /dev/null +++ b/skills/using-deep-rl/value-based-methods.md @@ -0,0 +1,1253 @@ + +# Value-Based Methods + +## When to Use This Skill + +Invoke this skill when you encounter: + +- **Algorithm Selection**: "Should I use DQN or policy gradient for my problem?" +- **DQN Implementation**: User implementing DQN and needs guidance on architecture +- **Training Issues**: "DQN is diverging", "Q-values too high", "slow to learn" +- **Variant Questions**: "What's Double DQN?", "Should I use Dueling?", "Is Rainbow worth it?" 
+- **Discrete Action RL**: User has discrete action space and implementing value method +- **Hyperparameter Tuning**: Debugging learning rates, replay buffer size, network architecture +- **Implementation Bugs**: Target network missing, frame stacking wrong, reward scaling issues +- **Custom Environments**: Designing states, rewards, action spaces for DQN + +**This skill provides practical implementation guidance for discrete action RL.** + +Do NOT use this skill for: + +- Continuous action spaces (route to actor-critic-methods) +- Policy gradients (route to policy-gradient-methods) +- Model-based RL (route to model-based-rl) +- Offline RL (route to offline-rl-methods) +- Theory foundations (route to rl-foundations) + + +## Core Principle + +**Value-based methods solve discrete action RL by learning Q(s,a) = expected return from taking action a in state s, then acting greedily. They're powerful for discrete spaces but require careful implementation to avoid instability.** + +Key insight: Value methods assume you can enumerate and compare all action values. This breaks down with continuous actions (infinite actions to compare). Use them for: + +- Games (Atari, Chess) +- Discrete control (robot navigation, discrete movement) +- Dialog systems (discrete utterances) +- Combinatorial optimization + +**Do not use for**: + +- Continuous control (robot arm angles, vehicle acceleration) +- Stochastic policies required (multi-agent, exploration in deterministic policy) +- Exploration of large action space (too slow to learn all actions) + + +## Part 1: Q-Learning Foundation + +### From TD Learning to Q-Learning + +You understand TD learning from rl-foundations. Q-learning extends it to **action-values**. + +**TD(0) for V(s)**: + +``` +V[s] ← V[s] + α(r + γV[s'] - V[s]) +``` + +**Q-Learning for Q(s,a)**: + +``` +Q[s,a] ← Q[s,a] + α(r + γ max_a' Q[s',a'] - Q[s,a]) +``` + +**Key difference**: Q-learning has **max over next actions** (off-policy). + +### Off-Policy Learning + +Q-learning learns the **optimal policy π*(a|s) = argmax_a Q(s,a)** regardless of exploration policy. + +**Example: Cliff Walking** + +``` +Agent follows epsilon-greedy (explores 10% random) +But Q-learning learns: "Take safe path away from cliff" (optimal) +NOT: "Walk along cliff edge" (what exploring policy does sometimes) + +Q-learning separates: +- Behavior policy: ε-greedy (for exploration) +- Target policy: greedy (what we're learning toward) +``` + +**Why This Matters**: Off-policy learning is sample-efficient (can learn from any exploration strategy). On-policy methods like SARSA would learn the exploration noise into policy. + +### Convergence Guarantee + +**Theorem**: Q-learning converges to Q*(s,a) if: + +1. All state-action pairs visited infinitely often +2. Learning rate α(t) → 0 (e.g., α = 1/N(s,a)) +3. Sufficiently small ε (exploration not zero) + +**Practical**: Use ε-decay schedule that ensures eventual convergence. + +```python +epsilon = max(epsilon_min, epsilon * decay_rate) +# Start: ε=1.0, decay to ε=0.01 +# Ensures: all actions eventually tried, then exploitation takes over +``` + +### Q-Learning Pitfall #1: Small State Spaces Only + +**Scenario**: User implements tabular Q-learning for Atari. + +**Problem**: + +``` +Atari image: 210×160 RGB = 20,160 pixels +Possible states: 256^20160 (astronomical) +Tabular Q-learning: impossible +``` + +**Solution**: Use function approximation (neural networks) → Deep Q-Networks + +**Red Flag**: Tabular Q-learning works only for small state spaces (<10,000 unique states). 
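+
+To make the on-policy/off-policy contrast concrete, here is a minimal tabular sketch of the two updates (assuming `Q` is a dict-of-dicts keyed by state then action, and `actions` is the list of discrete actions):
+
+```python
+def q_learning_update(Q, s, a, r, s_next, actions, alpha=0.1, gamma=0.99):
+    # Off-policy: bootstrap from the greedy (max) action in the next state
+    max_next = max(Q[s_next][a2] for a2 in actions)
+    Q[s][a] += alpha * (r + gamma * max_next - Q[s][a])
+
+def sarsa_update(Q, s, a, r, s_next, a_next, alpha=0.1, gamma=0.99):
+    # On-policy: bootstrap from the action the behavior policy actually chose
+    Q[s][a] += alpha * (r + gamma * Q[s_next][a_next] - Q[s][a])
+```
+
+Whatever exploration strategy generated (s, a, r, s', a'), Q-learning's target ignores it and backs up the greedy value, while SARSA backs up the value of the action actually taken.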
+ + +## Part 2: Deep Q-Networks (DQN) + +### What DQN Adds to Q-Learning + +DQN = Q-learning + neural network + **two critical stability mechanisms**: + +1. **Experience Replay**: Break temporal correlation +2. **Target Network**: Prevent moving target problem + +### Mechanism 1: Experience Replay + +**Problem without replay**: + +```python +# Naive approach (WRONG) +state = env.reset() +for t in range(1000): + action = epsilon_greedy(state) + next_state, reward = env.step(action) + + # Update Q from this single transition + Q(state, action) += α(reward + γ max Q(next_state) - Q(state, action)) + state = next_state +``` + +**Why this fails**: + +- Consecutive transitions are **highly correlated** (state_t and state_{t+1} very similar) +- Neural network gradient updates are unstable with correlated data +- Network overfits to recent trajectory + +**Experience Replay Solution**: + +```python +# Collect experiences in buffer +replay_buffer = [] + +for episode in range(num_episodes): + state = env.reset() + for t in range(max_steps): + action = epsilon_greedy(state) + next_state, reward = env.step(action) + + # Store experience (not learn yet) + replay_buffer.append((state, action, reward, next_state, done)) + + # Sample random batch and learn + if len(replay_buffer) > batch_size: + batch = random.sample(replay_buffer, batch_size) + for (s, a, r, s_next, done) in batch: + if done: + target = r + else: + target = r + gamma * max(Q(s_next)) + loss = (Q(s,a) - target)^2 + + # Update network weights + optimizer.step(loss) + + state = next_state +``` + +**Why this works**: + +1. **Breaks correlation**: Random sampling decorrelates gradient updates +2. **Sample efficiency**: Reuse old experiences (learn more from same env interactions) +3. **Stability**: Averaged gradients are smoother + +### Mechanism 2: Target Network + +**Problem without target network**: + +```python +# Moving target problem (WRONG) +loss = (Q(s,a) - [r + γ max Q(s_next, a_next)])^2 + # ^^^^ ^^^^ + # Same network computing both target and prediction +``` + +**Issue**: Network updates move both the prediction AND the target, creating instability. + +**Analogy**: Trying to hit a moving target that moves whenever you aim. + +**Target Network Solution**: + +```python +# Separate networks +main_network = create_network() # Learning network +target_network = create_network() # Stable target (frozen) + +# Training loop +loss = (main_network(s,a) - [r + γ max target_network(s_next)])^2 + ^^^^^^^^ + Target network doesn't update every step + +# Periodically synchronize +if t % update_frequency == 0: + target_network = copy(main_network) # Freeze for N steps +``` + +**Why this works**: + +1. **Stability**: Target doesn't move as much (frozen for many steps) +2. **Bellman consistency**: Gives network time to learn, then adjusts target +3. 
**Convergence**: Bootstrapping no longer destabilized by moving target + +### DQN Architecture Pattern + +```python +import torch +import torch.nn as nn + +class DQN(nn.Module): + def __init__(self, input_size, num_actions): + super().__init__() + # For Atari: CNN backbone + self.conv1 = nn.Conv2d(4, 32, kernel_size=8, stride=4) # Frame stack: 4 frames + self.conv2 = nn.Conv2d(32, 64, kernel_size=4, stride=2) + self.conv3 = nn.Conv2d(64, 64, kernel_size=3, stride=1) + + # Flatten and FC layers + self.fc1 = nn.Linear(64*7*7, 512) # After convolutions + self.fc_value = nn.Linear(512, 1) # For dueling: value stream + self.fc_actions = nn.Linear(512, num_actions) # For dueling: advantage stream + + def forward(self, x): + # x shape: (batch, 4, 84, 84) for Atari + x = torch.relu(self.conv1(x)) + x = torch.relu(self.conv2(x)) + x = torch.relu(self.conv3(x)) + x = x.flatten(start_dim=1) + x = torch.relu(self.fc1(x)) + + # For basic DQN: just action values + q_values = self.fc_actions(x) + return q_values +``` + +### Hyperparameter Guidance + +| Parameter | Value Range | Effect | Guidance | +|-----------|------------|--------|----------| +| Replay buffer size | 10k-1M | Memory, sample diversity | Start 100k, increase for slow learning | +| Batch size | 32-256 | Stability vs memory | 32-64 common; larger = more stable | +| Learning rate α | 0.0001-0.001 | Convergence speed | Start 0.0001, increase if too slow | +| Target update freq | 1k-10k steps | Stability | Update every 1000-5000 steps | +| ε initial | 0.5-1.0 | Exploration | Start 1.0 (random) | +| ε final | 0.01-0.05 | Late exploitation | 0.01-0.05 typical | +| ε decay | 10k-1M steps | Exploration → Exploitation | Tune to problem (larger env → longer decay) | + +### DQN Pitfall #1: Missing Target Network + +**Symptom**: "DQN loss explodes immediately, Q-values diverge to ±infinity" + +**Root cause**: No target network (or target updates too frequently) + +```python +# WRONG - target network updates every step +loss = (Q(s,a) - [r + γ max Q(s_next)])^2 # Both from same network + +# CORRECT - target network frozen for steps +loss = (Q_main(s,a) - [r + γ max Q_target(s_next)])^2 +# Update target: if step % 1000 == 0: Q_target = copy(Q_main) +``` + +**Fix**: Verify target network update frequency (1000-5000 steps typical). + +### DQN Pitfall #2: Replay Buffer Too Small + +**Symptom**: "Sample efficiency very poor, agent takes millions of steps to learn" + +**Root cause**: Small replay buffer = replay many recent correlated experiences + +```python +# WRONG +replay_buffer_size = 10_000 +# After 10k steps, only seeing recent experience (no diversity) + +# CORRECT +replay_buffer_size = 100_000 or 1_000_000 +# See diverse experiences from long history +``` + +**Rule of Thumb**: Replay buffer ≥ 10 × episode length (more is usually better) + +**Memory vs Sample Efficiency Tradeoff**: + +- 10k buffer: Low memory, high correlation (bad) +- 100k buffer: Moderate memory, good diversity (usually sufficient) +- 1M buffer: High memory, excellent diversity (overkill unless long episodes) + +### DQN Pitfall #3: No Frame Stacking + +**Symptom**: "Learning very slow or doesn't converge" + +**Root cause**: Single frame doesn't show velocity (violates Markov property) + +```python +# WRONG - single frame +state = current_frame # No velocity information +# Network cannot infer: is ball moving left or right? 
+ +# CORRECT - stack frames +state = np.stack([frame_t, frame_{t-1}, frame_{t-2}, frame_{t-3}]) +# Velocity: difference between consecutive frames +``` + +**Implementation**: + +```python +from collections import deque + +class FrameBuffer: + def __init__(self, num_frames=4): + self.buffer = deque(maxlen=num_frames) + + def add_frame(self, frame): + self.buffer.append(frame) + + def get_state(self): + return np.stack(list(self.buffer)) # (4, 84, 84) +``` + +### DQN Pitfall #4: Reward Clipping Wrong + +**Symptom**: "Training unstable" or "Learned policy much worse than Q-values suggest" + +**Context**: Atari papers clip rewards to {-1, 0, +1} for stability. + +**Misunderstanding**: Clipping destroys reward information. + +```python +# WRONG - unthinking clip +reward = np.clip(reward, -1, 1) # All rewards become -1,0,+1 +# In custom env with rewards in [-100, 1000], loses critical information + +# CORRECT - Normalize instead +reward = (reward - reward_mean) / reward_std +# Preserves differences, stabilizes scale +``` + +**When to clip**: Only if rewards are naturally in {-1, 0, +1} (like Atari). + +**When to normalize**: Custom environments with arbitrary scales. + + +## Part 3: Double DQN + +### The Overestimation Bias Problem + +**Max operator bias**: In stochastic environments, max over noisy estimates is biased upward. + +**Example**: + +``` +True Q*(s,a) values: [10.0, 5.0, 8.0] + +Due to noise, estimates: [11.0, 4.0, 9.0] + ↑ + True Q = 10, estimate = 11 + +Standard DQN takes max: max(Q_estimates) = 11 +But true Q*(s,best_action) = 10 + +Systematic overestimation! Agent thinks actions better than they are. +``` + +**Consequence**: + +- Inflated Q-values during training +- Learned policy (greedy) performs worse than Q-values suggest +- Especially bad early in training when estimates very noisy + +### Double DQN Solution + +**Insight**: Use one network to **select** best action, another to **evaluate** it. + +```python +# Standard DQN (overestimates) +target = r + γ max_a Q_target(s_next, a) + # ^^^^ + # Both selecting and evaluating with same network + +# Double DQN (unbiased) +best_action = argmax_a Q_main(s_next, a) # Select with main network +target = r + γ Q_target(s_next, best_action) # Evaluate with target network +``` + +**Why it works**: + +- Decouples selection and evaluation +- Removes systematic bias +- Unbiased estimator of true Q* + +### Implementation + +```python +class DoubleDQN(DQNAgent): + def compute_loss(self, batch): + states, actions, rewards, next_states, dones = batch + + # Main network Q-values for current state + q_values = self.main_network(states) + q_values_current = q_values.gather(1, actions) + + # Double DQN: select action with main network + next_q_main = self.main_network(next_states) + best_actions = next_q_main.argmax(1, keepdim=True) + + # Evaluate with target network + next_q_target = self.target_network(next_states) + max_next_q = next_q_target.gather(1, best_actions).detach() + + # TD target (handles done flag) + targets = rewards + (1 - dones) * self.gamma * max_next_q + + loss = F.smooth_l1_loss(q_values_current, targets) + return loss +``` + +### When to Use Double DQN + +**Use Double DQN if**: + +- Training a medium-complexity task (Atari) +- Suspicious that Q-values are too optimistic +- Want slightly better sample efficiency + +**Standard DQN is OK if**: + +- Small action space (less overestimation) +- Training is otherwise stable +- Sample efficiency not critical + +**Takeaway**: Double DQN is strictly better, minimal cost, use it. 
+ + +## Part 4: Dueling DQN + +### Dueling Architecture: Separating Value and Advantage + +**Insight**: Q(s,a) = V(s) + A(s,a) where: + +- **V(s)**: How good is this state? (independent of action) +- **A(s,a)**: How much better is action a than average? (action-specific advantage) + +**Why separate**: + +1. **Better feature learning**: Network learns state features independently from action value +2. **Stabilization**: Value stream sees many states (more gradient signal) +3. **Generalization**: Advantage stream learns which actions matter + +**Example**: + +``` +Atari Breakout: +V(s) = "Ball in good position, paddle ready" (state value) +A(s,LEFT) = -2 (moving left here hurts) +A(s,RIGHT) = +3 (moving right here helps) +A(s,NOOP) = 0 (staying still is neutral) + +Q(s,LEFT) = V + A = 5 + (-2) = 3 +Q(s,RIGHT) = V + A = 5 + 3 = 8 ← Best action +Q(s,NOOP) = V + A = 5 + 0 = 5 +``` + +### Architecture + +```python +class DuelingDQN(nn.Module): + def __init__(self, input_size, num_actions): + super().__init__() + + # Shared feature backbone + self.conv1 = nn.Conv2d(4, 32, kernel_size=8, stride=4) + self.conv2 = nn.Conv2d(32, 64, kernel_size=4, stride=2) + self.conv3 = nn.Conv2d(64, 64, kernel_size=3, stride=1) + self.fc = nn.Linear(64*7*7, 512) + + # Value stream (single output) + self.value_fc = nn.Linear(512, 256) + self.value = nn.Linear(256, 1) + + # Advantage stream (num_actions outputs) + self.advantage_fc = nn.Linear(512, 256) + self.advantage = nn.Linear(256, num_actions) + + def forward(self, x): + # Shared backbone + x = torch.relu(self.conv1(x)) + x = torch.relu(self.conv2(x)) + x = torch.relu(self.conv3(x)) + x = x.flatten(start_dim=1) + x = torch.relu(self.fc(x)) + + # Value stream + v = torch.relu(self.value_fc(x)) + v = self.value(v) + + # Advantage stream + a = torch.relu(self.advantage_fc(x)) + a = self.advantage(a) + + # Combine: Q = V + (A - mean(A)) + # Subtract mean(A) for normalization (prevents instability) + q = v + (a - a.mean(dim=1, keepdim=True)) + return q +``` + +### Why Subtract Mean of Advantages? + +```python +# Without mean subtraction +q = v + a +# Problem: V and A not separately identifiable +# V could be 100 + A = -90 or V = 50 + A = -40 (same Q) + +# With mean subtraction +q = v + (a - mean(a)) +# Mean advantage = 0 on average +# Forces: V learns state value, A learns relative advantage +# More stable training +``` + +### When to Use Dueling DQN + +**Use Dueling if**: + +- Training complex environments (Atari) +- Want better feature learning +- Training is unstable (helps stabilization) + +**Standard DQN is OK if**: + +- Simple environments +- Computational budget tight + +**Takeaway**: Dueling is strictly better for neural network learning, minimal cost, use it. + + +## Part 5: Prioritized Experience Replay + +### Problem with Uniform Sampling + +**Issue**: All transitions equally likely to be sampled. + +```python +# Uniform sampling +batch = random.sample(replay_buffer, batch_size) +# Includes: boring transitions, important transitions, rare transitions +# All mixed together with equal weight +``` + +**Problem**: + +- Wasted learning on transitions already understood +- Rare important transitions sampled rarely +- Sample inefficiency + +**Example**: + +``` +Atari agent learns mostly: "Move paddle left-right in routine positions" +Rarely: "What happens when ball is in corner?" 
(rare, important) + +Uniform replay: 95% learning about paddle, 5% about corners +Should be: More focus on corners (rarer, more surprising) +``` + +### Prioritized Experience Replay Solution + +**Insight**: Sample transitions proportional to **TD error** (surprise). + +```python +# Compute TD error (surprise) +td_error = |r + γ max Q(s_next) - Q(s,a)| +# ^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +# How wrong was our prediction? + +# Probability ∝ TD error^α +# High error transitions sampled more +batch = sample_proportional_to_priority(replay_buffer, priorities) +``` + +### Implementation + +```python +import numpy as np + +class PrioritizedReplayBuffer: + def __init__(self, size, alpha=0.6): + self.buffer = [] + self.priorities = [] + self.size = size + self.alpha = alpha # How much to prioritize (0=uniform, 1=full priority) + self.epsilon = 1e-6 # Small value to avoid zero priority + + def add(self, experience): + # New experiences get max priority (important!) + max_priority = np.max(self.priorities) if self.priorities else 1.0 + + if len(self.buffer) < self.size: + self.buffer.append(experience) + self.priorities.append(max_priority) + else: + # Replace oldest if full + self.buffer[len(self.buffer) % self.size] = experience + self.priorities[len(self.priorities) % self.size] = max_priority + + def sample(self, batch_size): + # Compute sampling probabilities + priorities = np.array(self.priorities) ** self.alpha + priorities = priorities / np.sum(priorities) + + # Sample indices + indices = np.random.choice(len(self.buffer), batch_size, p=priorities) + batch = [self.buffer[i] for i in indices] + + # Importance sampling weights (correct for bias from prioritized sampling) + weights = (1 / (len(self.buffer) * priorities[indices])) ** (1/3) # β=1/3 + weights = weights / np.max(weights) # Normalize + + return batch, indices, weights + + def update_priorities(self, indices, td_errors): + # Update priorities based on new TD errors + for idx, td_error in zip(indices, td_errors): + self.priorities[idx] = (np.abs(td_error) + self.epsilon) ** self.alpha +``` + +### Importance Sampling Weights + +**Problem**: Prioritized sampling introduces bias (samples important transitions more). + +**Solution**: Reweight gradients by inverse probability. + +```python +# Uniform sampling: each transition contributes equally +loss = mean((r + γ max Q(s_next) - Q(s,a))^2) + +# Prioritized sampling: bias toward high TD error +# Correct with importance weight (large TD error → small weight) +loss = mean(weights * (r + γ max Q(s_next) - Q(s,a))^2) +# ^^^^^^^ +# Importance sampling correction + +# weights ∝ 1/priority (inverse) +``` + +### When to Use Prioritized Replay + +**Use if**: + +- Training large environments (Atari) +- Sample efficiency critical +- Have computational budget for priority updates + +**Use standard uniform if**: + +- Small environments +- Computational budget tight +- Standard training is working fine + +**Note**: Adds complexity (priority updates), minimal empirical gain in many cases. + + +## Part 6: Rainbow DQN + +### Combining All Improvements + +**Rainbow** = Double DQN + Dueling DQN + Prioritized Replay + 3 more innovations: + +1. **Double DQN**: Reduce overestimation bias +2. **Dueling DQN**: Separate value and advantage +3. **Prioritized Replay**: Sample important transitions +4. **Noisy Networks**: Exploration through network parameters +5. **Distributional RL**: Learn Q distribution not just mean +6. 
**Multi-step Returns**: n-step TD learning instead of 1-step + +### When to Use Rainbow + +**Use Rainbow if**: + +- Need state-of-the-art Atari performance +- Have weeks of compute for tuning +- Paper requires it + +**Use Double + Dueling DQN if**: + +- Standard DQN training unstable +- Want good performance with less tuning +- Typical development + +**Use Basic DQN if**: + +- Learning the method +- Sample efficiency not critical +- Simple environments + +**Lesson**: Understand components separately before combining. + +``` +Learning progression: +1. Q-learning (understand basics) +2. Basic DQN (add neural networks) +3. Double DQN (fix overestimation) +4. Dueling DQN (improve architecture) +5. Add prioritized replay (sample efficiency) +6. Rainbow (combine all) +``` + + +## Part 7: Common Bugs and Debugging + +### Bug #1: Training Divergence (Q-values explode) + +**Diagnosis Tree**: + +1. **Check target network**: + + ```python + # WRONG - updating every step + loss = (Q_main(s,a) - [r + γ max Q_main(s_next)])^2 + # FIX - use separate target network + loss = (Q_main(s,a) - [r + γ max Q_target(s_next)])^2 + ``` + +2. **Check learning rate**: + + ```python + # WRONG - too high + optimizer = torch.optim.Adam(network.parameters(), lr=0.1) + # FIX - reduce learning rate + optimizer = torch.optim.Adam(network.parameters(), lr=0.0001) + ``` + +3. **Check reward scale**: + + ```python + # WRONG - rewards too large + reward = 1000 * indicator # Values explode + # FIX - normalize + reward = 10 * indicator + # Or: reward = (reward - reward_mean) / reward_std + ``` + +4. **Check replay buffer**: + + ```python + # WRONG - too small + replay_buffer_size = 1000 + # FIX - increase size + replay_buffer_size = 100_000 + ``` + +### Bug #2: Poor Sample Efficiency (Slow Learning) + +**Diagnosis Tree**: + +1. **Check replay buffer size**: + + ```python + # Too small → high correlation + if len(replay_buffer) < 100_000: + print("WARNING: Replay buffer too small for Atari") + ``` + +2. **Check target network update frequency**: + + ```python + # Too frequent → moving target + # Too infrequent → slow target adjustment + # Good: every 1000-5000 steps + if update_frequency > 10_000: + print("Target updates too infrequent") + ``` + +3. **Check batch size**: + + ```python + # Too small → noisy gradients + # Too large → slow training + # Good: 32-64 + if batch_size < 16 or batch_size > 256: + print("Consider adjusting batch size") + ``` + +4. **Check epsilon decay**: + + ```python + # Decaying too fast → premature exploitation + # Decaying too slow → wastes steps exploring + # Typical: decay over 10% of total steps + if decay_steps < total_steps * 0.05: + print("Epsilon decays too quickly") + ``` + +### Bug #3: Q-Values Too Optimistic (Learned Policy << Training Q) + +**Diagnosis**: + +**Red Flag**: Policy performance much worse than max Q-value during training. + +```python +# Symptom +max_q_value = 100.0 +actual_episode_return = 5.0 +# 20x gap suggests overestimation + +# Solutions (try in order) +1. Use Double DQN (reduces overestimation) +2. Reduce learning rate (slower updates → less optimistic) +3. Increase target network update frequency (more stable target) +4. 
Check reward function (might be wrong) +``` + +### Bug #4: Frame Stacking Wrong + +**Symptoms**: + +- Very slow learning despite "correct" implementation +- Network can't learn velocity-dependent behaviors + +**Diagnosis**: + +```python +# WRONG - single frame +state_shape = (84, 84, 3) +# Network sees only position, not velocity + +# CORRECT - stack 4 frames +state_shape = (84, 84, 4) +# Last 4 frames show motion + +# Check frame stacking implementation +frame_stack = deque(maxlen=4) +for frame in frames: + frame_stack.append(frame) + state = np.stack(list(frame_stack)) # (4, 84, 84) +``` + +### Bug #5: Network Architecture Mismatch + +**Symptoms**: + +- CNN on non-image input (or vice versa) +- Output layer wrong number of actions +- Input preprocessing wrong + +**Diagnosis**: + +```python +# Image input → use CNN +if input_type == 'image': + network = CNN(num_actions) + +# Vector input → use FC +elif input_type == 'vector': + network = FullyConnected(input_size, num_actions) + +# Output layer MUST have num_actions outputs +assert network.output_size == num_actions +``` + + +## Part 8: Hyperparameter Tuning + +### Learning Rate + +**Too high** (α > 0.001): + +- Divergence, unstable training +- Q-values explode + +**Too low** (α < 0.00001): + +- Very slow learning +- May not converge in reasonable time + +**Start**: α = 0.0001, adjust if needed + +```python +# Adaptive strategy +if max_q_value > 1000: + print("Reduce learning rate") + alpha = alpha / 2 +if learning_curve_flat: + print("Increase learning rate") + alpha = alpha * 1.1 +``` + +### Replay Buffer Size + +**Too small** (< 10k for Atari): + +- High correlation in gradients +- Slow learning, poor sample efficiency + +**Too large** (> 10M): + +- Excessive memory +- Stale experiences dominate +- Diminishing returns + +**Rule of thumb**: 10 × episode length + +```python +episode_length = 1000 # typical +ideal_buffer = 100_000 # 10 × typical Atari episode + +# Can increase if GPU memory available and learning slow +if learning_slow: + buffer_size = 500_000 # More diversity +``` + +### Epsilon Decay + +**Too fast** (decay in 10k steps): + +- Agent exploits before learning +- Suboptimal policy + +**Too slow** (decay in 1M steps): + +- Wasted exploration time +- Slow performance improvement + +**Rule**: Decay over ~10% of total training steps + +```python +total_steps = 1_000_000 +epsilon_decay_steps = total_steps * 0.1 # 100k steps +epsilon = max(epsilon_min, epsilon * (epsilon_decay_steps / current_step)) +``` + +### Target Network Update Frequency + +**Too frequent** (every 100 steps): + +- Target still moves rapidly +- Less stabilization benefit + +**Too infrequent** (every 100k steps): + +- Network drifts far from target +- Large jumps in learning + +**Sweet spot**: Every 1k-5k steps (1000 typical) + +```python +update_frequency = 1000 # steps between target updates +if update_frequency < 500: + print("Target updates might be too frequent") +if update_frequency > 10_000: + print("Target updates might be too infrequent") +``` + +### Reward Scaling + +**No scaling** (raw rewards vary wildly): + +- Learning rate effects vary by task +- Convergence issues + +**Clipping** (clip to {-1, 0, +1}): + +- Good for Atari, loses information in custom envs + +**Normalization** (zero-mean, unit variance): + +- General solution +- Preserves reward differences + +```python +# Track running statistics +running_mean = 0.0 +running_var = 1.0 + +def normalize_reward(reward): + global running_mean, running_var + running_mean = 0.99 * running_mean + 0.01 
* reward + running_var = 0.99 * running_var + 0.01 * (reward - running_mean)**2 + return (reward - running_mean) / np.sqrt(running_var + 1e-8) +``` + + +## Part 9: When to Use Each Method + +### DQN Selection Matrix + +| Situation | Method | Why | +|-----------|--------|-----| +| Learning method | Basic DQN | Understand target network, replay buffer | +| Medium task | Double DQN | Fix overestimation, minimal overhead | +| Complex task | Double + Dueling | Better architecture + bias reduction | +| Sample critical | Add Prioritized | Focus on important transitions | +| State-of-art | Rainbow | Best Atari performance | +| Simple Atari | DQN | Sufficient, faster to debug | +| Non-Atari discrete | DQN/Double | Adapt architecture to input type | + +### Action Space Check + +**Before implementing DQN, ask**: + +```python +if action_space == 'continuous': + print("ERROR: Use actor-critic or policy gradient") + print("Value methods only for discrete actions") + redirect_to_actor_critic_methods() + +elif action_space == 'discrete' and len(actions) <= 100: + print("✓ DQN appropriate") + +elif action_space == 'discrete' and len(actions) > 1000: + print("⚠ Large action space, consider policy gradient") + print("Or: hierarchical RL, action abstraction") +``` + + +## Part 10: Red Flags Checklist + +When you see these, suspect bugs: + +- [ ] **Single frame input**: No velocity info, add frame stacking +- [ ] **No target network**: Divergence expected, add it +- [ ] **Small replay buffer** (< 10k): Poor efficiency, increase +- [ ] **High learning rate** (> 0.001): Instability likely, decrease +- [ ] **No frame preprocessing**: Raw image pixels, normalize to [0,1] +- [ ] **Updating target every step**: Moving target problem, freeze it +- [ ] **No exploration decay**: Explores forever, add epsilon decay +- [ ] **Continuous actions**: Wrong method, use actor-critic +- [ ] **Very large rewards** (> 100): Scaling issues, normalize +- [ ] **Only one environment**: Bias high, use frame skipping or multiple envs +- [ ] **Immediate best performance**: Overfitting to initial conditions, likely divergence later +- [ ] **Q-values >> rewards**: Overestimation, try Double DQN +- [ ] **All Q-values zero**: Network not learning, check learning rate +- [ ] **Training loss increasing**: Learning rate too high, divergence + + +## Part 11: Pitfall Rationalization + +| Rationalization | Reality | Counter-Guidance | Red Flag | +|-----------------|---------|------------------|----------| +| "I'll skip target network, save memory" | Causes instability/divergence | Target network critical, minimal memory cost | "Target network optional" | +| "DQN works for continuous actions" | Breaks fundamental assumption (enumerate all actions) | Value methods discrete-only, use SAC/TD3 for continuous | Continuous action DQN attempt | +| "Uniform replay is fine" | Wastes learning on boring transitions | Prioritized replay better, but uniform adequate for many tasks | Always recommending prioritized | +| "I'll use tiny replay buffer, it's faster" | High correlation, poor learning | 100k+ buffer typical, speed tradeoff acceptable | Buffer < 10k for Atari | +| "Frame stacking unnecessary, CNN sees motion" | Single frame Markov-violating | Frame stacking required for velocity from pixels | Single frame policy | +| "Rainbow is just DQN + tricks" | Missing that components solve specific problems | Each component fixes identified issue (overestimation, architecture, sampling) | Jumping to Rainbow without understanding | +| "Clip rewards, I saw it in a 
paper" | Clips away important reward information | Only clip for {-1,0,+1} Atari-style, normalize otherwise | Blind reward clipping | +| "Larger network will learn faster" | Overfitting, slower gradients, memory issues | Standard architecture (32-64-64 CNN) works, don't over-engineer | Unreasonably large networks | +| "Policy gradient would be simpler here" | Value methods discrete-only right choice | Know when each applies (discrete → value, continuous → policy) | Wrong method choice for action space | +| "Epsilon decay is a hyperparameter like any other" | decay schedule should match task complexity | Tune decay to problem (game length), not arbitrary | Epsilon decay without reasoning | + + +## Part 12: Pressure Test Scenarios + +### Scenario 1: Continuous Action Space + +**User**: "I have a robot with continuous action space (joint angles in ℝ^7). Can I use DQN?" + +**Wrong Response**: "Sure, discretize the actions" +(Combinatorial explosion, inefficient) + +**Correct Response**: "No, value methods are discrete-only. Use actor-critic (SAC) or policy gradient (PPO). They handle continuous actions naturally. Discretization would create 7-dimensional action space explosion (e.g., 10 values per joint = 10^7 actions)." + + +### Scenario 2: Training Unstable + +**User**: "My DQN is diverging immediately, loss explodes. Implementation looks right. What's wrong?" + +**Systematic Debug**: + +``` +1. Check target network + - Print: "Is target_network separate from main_network?" + - Likely cause: updating together + +2. Check learning rate + - Print: "Learning rate = ?" + - If > 0.001, reduce + +3. Check reward scale + - Print: "max(rewards) = ?" + - If > 100, normalize + +4. Check initial Q-values + - Print: "mean(Q-values) = ?" + - Should start near zero +``` + +**Answer**: Target network most likely culprit. Verify separate networks with proper update frequency. + + +### Scenario 3: Rainbow vs Double DQN + +**User**: "Should I implement Rainbow or just Double DQN? Is Rainbow worth the complexity?" + +**Guidance**: + +``` +Double DQN: ++ Fixes overestimation bias ++ Simple to implement ++ 90% of Rainbow benefits in many cases +- Missing other optimizations + +Rainbow: ++ Best Atari performance ++ State-of-the-art +- Complex (6 components) +- Harder to debug +- More hyperparameters + +Recommendation: +Start: Double DQN +If unstable: Add Dueling +If slow: Add Prioritized +Only go to Rainbow: If need SotA and have time +``` + + +### Scenario 4: Frame Stacking Issue + +**User**: "My agent trains on Atari but learning is slow. How many frames should I stack?" + +**Diagnosis**: + +```python +# Check if frame stacking implemented +if state.shape != (4, 84, 84): + print("ERROR: Not using frame stacking") + print("Single frame (1, 84, 84) violates Markov property") + print("Add frame stacking: stack last 4 frames") + +# Frame count +4 frames: Standard (shows ~80ms at 50fps = ~4 frames) +3 frames: OK, slightly less velocity info +2 frames: Minimum, just barely Markovian +1 frame: WRONG, not Markovian +8+ frames: Too many, outdated states in stack +``` + + +### Scenario 5: Hyperparameter Tuning + +**User**: "I've tuned learning rate, buffer size, epsilon. What else affects performance?" 
+ +**Guidance**: + +``` +Priority 1 (Critical): +- Target network update frequency (1000-5000 steps) +- Replay buffer size (100k+ typical) +- Frame stacking (4 frames) + +Priority 2 (Important): +- Learning rate (0.0001-0.0005) +- Epsilon decay schedule (over ~10% of steps) +- Batch size (32-64) + +Priority 3 (Nice to have): +- Network architecture (32-64-64 CNN standard) +- Reward normalization (helps but not required) +- Double/Dueling DQN (improvements, not essentials) + +Start with Priority 1, only adjust Priority 2-3 if unstable. +``` + + +## Part 13: When to Route Elsewhere + +### Route to rl-foundations if + +- User confused about Bellman equations +- Unclear on value function definition +- Needs theory behind Q-learning convergence + +### Route to actor-critic-methods if + +- Continuous action space +- Need deterministic policy gradients +- Stochastic policy required + +### Route to policy-gradient-methods if + +- Large discrete action space (> 1000 actions) +- Need policy regularization +- Exploration by stochasticity useful + +### Route to offline-rl-methods if + +- No environment access (batch learning) +- Learning from logged data only + +### Route to rl-debugging if + +- General training issues +- Need systematic debugging methodology +- Credit assignment problems + +### Route to reward-shaping if + +- Sparse rewards +- Reward design affecting learning +- Potential-based shaping questions + + +## Summary + +**You now understand**: + +1. **Q-Learning**: TD learning for action values, off-policy convergence guarantee +2. **DQN**: Add neural networks + experience replay + target network for stability +3. **Stability Mechanisms**: + - Replay buffer: Break correlation + - Target network: Prevent moving target problem +4. **Common Variants**: + - Double DQN: Fix overestimation bias + - Dueling DQN: Separate value and advantage + - Prioritized Replay: Focus on important transitions + - Rainbow: Combine improvements +5. **When to Use**: Discrete action spaces only, not continuous +6. **Common Bugs**: Divergence, poor efficiency, overoptimism, frame issues +7. **Hyperparameter Tuning**: Buffer size, learning rate, epsilon decay, target frequency +8. **Debugging Strategy**: Systematic diagnosis (target network → learning rate → reward scale) + +**Key Takeaways**: + +- Value methods are for **discrete actions ONLY** +- DQN requires **target network and experience replay** +- **Frame stacking** needed for video inputs (Markov property) +- **Double DQN** fixes overestimation, use it +- **Start simple**, add Dueling/Prioritized only if needed +- **Systematic debugging** beats random tuning + +**Next**: Implement on simple environment first (CartPole or small custom task), then scale to Atari.
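+
+As a concrete starting point for that next step, the sketch below wires together the pieces covered in this skill (replay buffer, target network, ε-decay, TD targets) on CartPole. It assumes `gymnasium` and PyTorch are installed; the hyperparameters are illustrative defaults, not tuned values.
+
+```python
+import random
+from collections import deque
+
+import gymnasium as gym
+import numpy as np
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+
+env = gym.make("CartPole-v1")
+obs_dim = env.observation_space.shape[0]   # 4 state variables
+n_actions = env.action_space.n             # 2 discrete actions
+
+def make_net():
+    return nn.Sequential(nn.Linear(obs_dim, 128), nn.ReLU(),
+                         nn.Linear(128, 128), nn.ReLU(),
+                         nn.Linear(128, n_actions))
+
+main_net, target_net = make_net(), make_net()
+target_net.load_state_dict(main_net.state_dict())
+optimizer = torch.optim.Adam(main_net.parameters(), lr=1e-3)
+
+buffer = deque(maxlen=50_000)              # replay buffer
+gamma, batch_size, target_update_freq = 0.99, 64, 500
+epsilon, epsilon_min, epsilon_decay = 1.0, 0.05, 0.995
+step = 0
+
+for episode in range(300):
+    state, _ = env.reset()
+    done = False
+    while not done:
+        # ε-greedy behavior policy
+        if random.random() < epsilon:
+            action = int(env.action_space.sample())
+        else:
+            with torch.no_grad():
+                q = main_net(torch.as_tensor(state, dtype=torch.float32))
+            action = int(q.argmax().item())
+
+        next_state, reward, terminated, truncated, _ = env.step(action)
+        done = terminated or truncated
+        buffer.append((state, action, reward, next_state, float(terminated)))
+        state = next_state
+        step += 1
+
+        # Learn from a random minibatch once enough experience is stored
+        if len(buffer) >= batch_size:
+            s, a, r, s2, term = zip(*random.sample(buffer, batch_size))
+            s = torch.as_tensor(np.asarray(s), dtype=torch.float32)
+            s2 = torch.as_tensor(np.asarray(s2), dtype=torch.float32)
+            a = torch.as_tensor(a, dtype=torch.int64).unsqueeze(1)
+            r = torch.as_tensor(r, dtype=torch.float32)
+            term = torch.as_tensor(term, dtype=torch.float32)
+
+            q_sa = main_net(s).gather(1, a).squeeze(1)
+            with torch.no_grad():
+                max_next = target_net(s2).max(dim=1).values
+                targets = r + gamma * (1.0 - term) * max_next  # no bootstrap past termination
+            loss = F.smooth_l1_loss(q_sa, targets)
+
+            optimizer.zero_grad()
+            loss.backward()
+            optimizer.step()
+
+        # Periodic hard sync of the frozen target network
+        if step % target_update_freq == 0:
+            target_net.load_state_dict(main_net.state_dict())
+
+    epsilon = max(epsilon_min, epsilon * epsilon_decay)
+```
+
+If learning stalls or diverges, work through the red-flags checklist above (learning rate, target update frequency, replay buffer size) before touching anything else.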