# PufferLib Environments Guide

## Overview

PufferLib provides the PufferEnv API for creating high-performance custom environments, and the Ocean suite containing 20+ pre-built environments. Environments support both single-agent and multi-agent scenarios with native vectorization.

## PufferEnv API
### Core Characteristics

PufferEnv is designed for performance through in-place operations:

- Observations, actions, and rewards are initialized from a shared buffer object
- All operations happen in-place to avoid creating and copying arrays
- Native support for both single-agent and multi-agent environments
- Flat observation/action spaces for efficient vectorization
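To make the in-place pattern concrete, here is a minimal sketch in plain NumPy (not the actual PufferLib internals): the vectorizer pre-allocates shared arrays, and each environment writes its results into them instead of returning fresh copies.

```python
import numpy as np

# Hypothetical shared buffers, pre-allocated once by the vectorizer
num_envs, obs_size = 8, 16
observations = np.zeros((num_envs, obs_size), dtype=np.float32)
rewards = np.zeros(num_envs, dtype=np.float32)

def step_all(actions):
    """Write each environment's results in place -- no new arrays."""
    for i, action in enumerate(actions):
        observations[i] = action          # broadcast into the shared row
        rewards[i] = float(action) * 0.1  # overwrite the shared scalar

step_all(np.arange(num_envs))
```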
### Creating a PufferEnv

```python
import numpy as np
import pufferlib
from pufferlib import PufferEnv

class MyEnvironment(PufferEnv):
    def __init__(self, buf=None):
        super().__init__(buf)

        # Define observation and action spaces
        self.observation_space = self.make_space({
            'image': (84, 84, 3),
            'vector': (10,)
        })
        self.action_space = self.make_discrete(4)  # 4 discrete actions

        # Initialize state
        self.reset()

    def reset(self):
        """Reset environment to initial state."""
        # Reset internal state
        self.agent_pos = np.array([0, 0])
        self.step_count = 0
        self.total_reward = 0.0

        # Return initial observation
        return {
            'image': np.zeros((84, 84, 3), dtype=np.uint8),
            'vector': np.zeros(10, dtype=np.float32)
        }

    def step(self, action):
        """Execute one environment step."""
        # Update state based on action
        self.step_count += 1

        # Calculate reward
        reward = self._compute_reward()
        self.total_reward += reward

        # Check if episode is done
        done = self.step_count >= 1000

        # Generate observation
        obs = self._get_observation()

        # Report episode return and length when the episode ends
        info = {'episode': {'r': self.total_reward, 'l': self.step_count}} if done else {}

        return obs, reward, done, info

    def _compute_reward(self):
        """Compute reward for current state."""
        return 1.0

    def _get_observation(self):
        """Generate observation from current state."""
        return {
            'image': np.random.randint(0, 256, (84, 84, 3), dtype=np.uint8),
            'vector': np.random.randn(10).astype(np.float32)
        }
```
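A minimal rollout loop over the class above (assuming the Gym-style four-tuple `step` API shown, with a fixed action for brevity):

```python
env = MyEnvironment()
obs = env.reset()
done = False
while not done:
    obs, reward, done, info = env.step(0)  # always take action 0
print(info['episode'])  # {'r': 1000.0, 'l': 1000}
```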
### Observation Spaces

#### Discrete Spaces

```python
# Single discrete value
self.observation_space = self.make_discrete(10)  # Values 0-9

# Dict mixing continuous and discrete values
self.observation_space = self.make_space({
    'position': (1,),              # Continuous
    'type': self.make_discrete(5)  # Discrete
})
```
#### Continuous Spaces

```python
# Box space (continuous)
self.observation_space = self.make_space({
    'image': (84, 84, 3),  # Image
    'vector': (10,),       # Vector
    'scalar': (1,)         # Single value
})
```
#### Multi-Discrete Spaces

```python
# Multiple discrete values: three heads with 3, 5, and 2 options
self.observation_space = self.make_multi_discrete([3, 5, 2])
```
### Action Spaces

```python
# Discrete actions
self.action_space = self.make_discrete(4)  # 4 actions: 0, 1, 2, 3

# Continuous actions
self.action_space = self.make_space((3,))  # 3D continuous action

# Multi-discrete actions
self.action_space = self.make_multi_discrete([3, 3])  # Two 3-way discrete choices
```
## Multi-Agent Environments

PufferLib has native multi-agent support, treating single-agent and multi-agent environments uniformly.

### Multi-Agent PufferEnv
```python
class MultiAgentEnv(PufferEnv):
    def __init__(self, num_agents=4, buf=None):
        super().__init__(buf)
        self.num_agents = num_agents

        # Per-agent observation space
        self.single_observation_space = self.make_space({
            'position': (2,),
            'velocity': (2,),
            'global': (10,)
        })

        # Per-agent action space
        self.single_action_space = self.make_discrete(5)

        self.reset()

    def reset(self):
        """Reset all agents."""
        # Agent is a user-defined helper class (see the sketch below)
        self.agents = {f'agent_{i}': Agent(i) for i in range(self.num_agents)}

        # Return observations for all agents
        return {
            agent_id: self._get_obs(agent)
            for agent_id, agent in self.agents.items()
        }

    def step(self, actions):
        """Step all agents. `actions` is a dict: {agent_id: action}."""
        observations = {}
        rewards = {}
        dones = {}
        infos = {}

        for agent_id, action in actions.items():
            agent = self.agents[agent_id]

            # Update agent
            agent.update(action)

            # Generate results
            observations[agent_id] = self._get_obs(agent)
            rewards[agent_id] = self._compute_reward(agent)
            dones[agent_id] = agent.is_done()
            infos[agent_id] = {}

        # Global done condition: the episode ends when every agent is done
        dones['__all__'] = all(dones.values())

        return observations, rewards, dones, infos
```
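The example above references an `Agent` helper class that the guide never defines; a minimal stand-in (purely illustrative, matching only the methods used above) could look like this:

```python
class Agent:
    """Minimal per-agent state holder matching the calls in MultiAgentEnv."""

    def __init__(self, agent_id):
        self.agent_id = agent_id
        self.steps = 0

    def update(self, action):
        # Apply the action; here we only count steps
        self.steps += 1

    def is_done(self):
        return self.steps >= 1000
```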
## Ocean Environment Suite

PufferLib provides the Ocean suite with 20+ pre-built environments:

### Available Environments
#### Arcade Games

- **Atari**: Classic Atari 2600 games via the Arcade Learning Environment
- **Procgen**: Procedurally generated games for generalization testing

#### Grid-Based

- **Minigrid**: Partially observable gridworld environments
- **Crafter**: Open-ended survival crafting game
- **NetHack**: Classic roguelike dungeon crawler
- **MiniHack**: Simplified NetHack variants

#### Multi-Agent

- **PettingZoo**: Multi-agent environment suite (including Butterfly)
- **MAgent**: Large-scale multi-agent scenarios
- **Neural MMO**: Massively multi-agent survival game

#### Specialized

- **Pokemon Red**: Classic Pokemon game environment
- **GPUDrive**: High-performance driving simulator
- **Griddly**: Grid-based game engine
- **MicroRTS**: Real-time strategy game
### Using Ocean Environments

```python
import pufferlib

# Make environment
env = pufferlib.make('procgen-coinrun', num_envs=256)

# With custom configuration
env = pufferlib.make(
    'atari-pong',
    num_envs=128,
    frameskip=4,
    framestack=4
)

# Multi-agent environment
env = pufferlib.make('pettingzoo-knights-archers-zombies', num_agents=4)
```
## Custom Environment Development

### Development Workflow

1. **Prototype in Python**: Start with a pure Python PufferEnv
2. **Optimize Critical Paths**: Identify bottlenecks (see the profiling sketch below)
3. **Implement in C**: Rewrite performance-critical code in C
4. **Create Bindings**: Use the Python C API
5. **Compile**: Build as an extension module
6. **Register**: Add to the Ocean suite
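For step 2, a quick way to find bottlenecks is Python's built-in profiler. This sketch assumes the `MyEnvironment` class from earlier in this guide:

```python
import cProfile
import pstats

env = MyEnvironment()
env.reset()

# Profile a few thousand steps and print the ten hottest call sites
with cProfile.Profile() as profiler:
    for _ in range(5_000):
        obs, reward, done, info = env.step(0)
        if done:
            env.reset()
pstats.Stats(profiler).sort_stats('cumulative').print_stats(10)
```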
### Performance Benchmarks

- **Pure Python**: 100k-500k steps/second
- **C Implementation**: 100M+ steps/second
- **Training with Python env**: ~400k total SPS (steps per second)
- **Training with C env**: ~4M total SPS

A rough way to measure raw environment throughput is sketched below.
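This timing helper is a sketch, assuming the Gym-style `step`/`reset` API and a `sample()`-able action space as used throughout this guide:

```python
import time

def measure_sps(env, num_steps=10_000):
    """Rough steps-per-second estimate for one environment instance."""
    env.reset()
    start = time.perf_counter()
    for _ in range(num_steps):
        obs, reward, done, info = env.step(env.action_space.sample())
        if done:
            env.reset()
    return num_steps / (time.perf_counter() - start)

# print(f"{measure_sps(MyEnvironment()):,.0f} steps/second")
```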
### Python Optimization Tips

```python
# Use NumPy operations instead of Python loops
# Bad
for i in range(len(array)):
    array[i] = array[i] * 2

# Good
array *= 2

# Pre-allocate arrays instead of appending
# Bad
observations = []
for i in range(n):
    observations.append(generate_obs())

# Good (note the unpacking: obs_shape is a tuple such as (84, 84, 3))
observations = np.empty((n, *obs_shape), dtype=np.float32)
for i in range(n):
    observations[i] = generate_obs()

# Use in-place operations
# Bad
new_state = state + delta

# Good
state += delta
```
### C Extension Example

```c
// my_env.c
#include <Python.h>
#include <numpy/arrayobject.h>

// Fast environment step implementation
static PyObject* fast_step(PyObject* self, PyObject* args) {
    PyArrayObject* state;
    int action;

    if (!PyArg_ParseTuple(args, "O!i", &PyArray_Type, &state, &action)) {
        return NULL;
    }

    // High-performance C implementation:
    // update `state` in place, then compute reward and done
    // ...
    double reward = 0.0;
    int done = 0;

    // Return (obs, reward, done); here the state array doubles as the observation
    return Py_BuildValue("Odi", (PyObject*)state, reward, done);
}

static PyMethodDef methods[] = {
    {"fast_step", fast_step, METH_VARARGS, "Fast environment step"},
    {NULL, NULL, 0, NULL}
};

static struct PyModuleDef module = {
    PyModuleDef_HEAD_INIT,
    "my_env_c",
    NULL,
    -1,
    methods
};

PyMODINIT_FUNC PyInit_my_env_c(void) {
    import_array();
    return PyModule_Create(&module);
}
```
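To compile the module, a minimal build script might look like the following (a sketch: the module name matches `PyInit_my_env_c` above, and `numpy.get_include()` supplies the header path for `numpy/arrayobject.h`):

```python
# setup.py -- build with: python setup.py build_ext --inplace
import numpy as np
from setuptools import Extension, setup

setup(
    ext_modules=[
        Extension(
            'my_env_c',                       # must match PyInit_my_env_c
            sources=['my_env.c'],
            include_dirs=[np.get_include()],  # for numpy/arrayobject.h
        )
    ]
)
```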
## Third-Party Environment Integration

### Gymnasium Environments
```python
import gymnasium as gym
import pufferlib

# Wrap a Gymnasium environment
gym_env = gym.make('CartPole-v1')
puffer_env = pufferlib.emulate(gym_env, num_envs=256)

# Or use make directly
env = pufferlib.make('gym-CartPole-v1', num_envs=256)
```
### PettingZoo Environments

```python
from pettingzoo.butterfly import pistonball_v6
import pufferlib

# Wrap a PettingZoo environment
pz_env = pistonball_v6.env()
puffer_env = pufferlib.emulate(pz_env, num_envs=128)

# Or use make directly
env = pufferlib.make('pettingzoo-pistonball', num_envs=128)
```
### Custom Wrappers

```python
import pufferlib

class CustomWrapper(pufferlib.PufferEnv):
    """Wrapper to modify environment behavior."""

    def __init__(self, base_env, buf=None):
        super().__init__(buf)
        self.base_env = base_env
        self.observation_space = base_env.observation_space
        self.action_space = base_env.action_space

    def reset(self):
        obs = self.base_env.reset()
        # Modify observation
        return self._process_obs(obs)

    def step(self, action):
        # Modify action
        modified_action = self._process_action(action)

        obs, reward, done, info = self.base_env.step(modified_action)

        # Modify outputs
        obs = self._process_obs(obs)
        reward = self._process_reward(reward)

        return obs, reward, done, info

    # Override these hooks to implement the actual transformations
    def _process_obs(self, obs):
        return obs

    def _process_action(self, action):
        return action

    def _process_reward(self, reward):
        return reward
```
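As a usage sketch (the `RewardClipWrapper` name is hypothetical), a concrete subclass only needs to override the hooks it cares about:

```python
import numpy as np

class RewardClipWrapper(CustomWrapper):
    """Clips rewards to [-1, 1]; observations and actions pass through."""

    def _process_reward(self, reward):
        return float(np.clip(reward, -1.0, 1.0))

# env = RewardClipWrapper(MyEnvironment())
```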
## Environment Best Practices

### State Management
```python
# Store minimal state, compute on demand
class EfficientEnv(PufferEnv):
    def __init__(self, buf=None):
        super().__init__(buf)
        self.agent_pos = np.zeros(2)  # Minimal state

    def _get_observation(self):
        # Compute the full observation on demand
        observation = np.zeros((84, 84, 3), dtype=np.uint8)
        self._render_scene(observation, self.agent_pos)
        return observation
```
### Reward Scaling

```python
# Normalize rewards to a reasonable range
def step(self, action):
    # ... environment logic ...

    # Scale large rewards, then clip extreme values
    raw_reward = compute_raw_reward()
    reward = np.clip(raw_reward / 100.0, -10, 10)

    return obs, reward, done, info
```
### Episode Termination

```python
def step(self, action):
    # ... environment logic ...

    # Multiple termination conditions
    timeout = self.step_count >= self.max_steps
    success = self._check_success()
    failure = self._check_failure()

    done = timeout or success or failure

    info = {
        # Truncated only if the time limit hit before a terminal state
        'TimeLimit.truncated': timeout and not (success or failure),
        'success': success
    }

    return obs, reward, done, info
```
### Memory Efficiency

```python
# Reuse buffers instead of allocating new ones
class MemoryEfficientEnv(PufferEnv):
    def __init__(self, buf=None):
        super().__init__(buf)

        # Pre-allocate the observation buffer once
        self._obs_buffer = np.zeros((84, 84, 3), dtype=np.uint8)

    def _get_observation(self):
        # Reuse the buffer, modifying it in place
        self._render_scene(self._obs_buffer)
        return self._obs_buffer  # Return a view, not a copy
```
## Debugging Environments

### Validation Checks
```python
# Add assertions to catch bugs early
def step(self, action):
    assert self.action_space.contains(action), f"Invalid action: {action}"

    obs, reward, done, info = self._step_impl(action)

    assert self.observation_space.contains(obs), "Invalid observation"
    assert np.isfinite(reward), "Non-finite reward"

    return obs, reward, done, info
```
### Rendering

```python
class DebuggableEnv(PufferEnv):
    def __init__(self, buf=None, render_mode=None):
        super().__init__(buf)
        self.render_mode = render_mode

    def render(self):
        """Render environment for debugging."""
        if self.render_mode == 'human':
            # Display to screen
            self._display_scene()
        elif self.render_mode == 'rgb_array':
            # Return an image array
            return self._render_to_array()
```
### Logging

```python
import logging

logger = logging.getLogger(__name__)

def step(self, action):
    logger.debug(f"Step {self.step_count}: action={action}")

    obs, reward, done, info = self._step_impl(action)

    if done:
        logger.info(f"Episode finished: reward={self.total_reward}")

    return obs, reward, done, info
```