# PufferLib Environments Guide

## Overview

PufferLib provides the PufferEnv API for creating high-performance custom environments, and the Ocean suite containing 20+ pre-built environments. Environments support both single-agent and multi-agent scenarios with native vectorization.

## PufferEnv API
### Core Characteristics

PufferEnv is designed for performance through in-place operations:

- Observations, actions, and rewards are initialized from a shared buffer object
- All operations happen in-place to avoid creating and copying arrays
- Native support for both single-agent and multi-agent environments
- Flat observation/action spaces for efficient vectorization
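To make the in-place pattern concrete, here is a minimal sketch in plain NumPy (not the actual PufferLib internals): the vectorizer pre-allocates shared arrays, and each environment writes its results into them instead of returning fresh copies.

```python
import numpy as np

# Hypothetical shared buffers, pre-allocated once by the vectorizer
num_envs, obs_size = 8, 16
observations = np.zeros((num_envs, obs_size), dtype=np.float32)
rewards = np.zeros(num_envs, dtype=np.float32)

def step_all(actions):
    """Write each environment's results in place -- no new arrays."""
    for i, action in enumerate(actions):
        observations[i] = action          # broadcast into the shared row
        rewards[i] = float(action) * 0.1  # overwrite the shared scalar

step_all(np.arange(num_envs))
```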
### Creating a PufferEnv

```python
import numpy as np
import pufferlib
from pufferlib import PufferEnv

class MyEnvironment(PufferEnv):
    def __init__(self, buf=None):
        super().__init__(buf)

        # Define observation and action spaces
        self.observation_space = self.make_space({
            'image': (84, 84, 3),
            'vector': (10,)
        })
        self.action_space = self.make_discrete(4)  # 4 discrete actions

        # Initialize state
        self.reset()

    def reset(self):
        """Reset environment to initial state."""
        # Reset internal state
        self.agent_pos = np.array([0, 0])
        self.step_count = 0
        self.total_reward = 0.0

        # Return initial observation
        return {
            'image': np.zeros((84, 84, 3), dtype=np.uint8),
            'vector': np.zeros(10, dtype=np.float32)
        }

    def step(self, action):
        """Execute one environment step."""
        # Update state based on action
        self.step_count += 1

        # Calculate reward
        reward = self._compute_reward()
        self.total_reward += reward

        # Check if episode is done
        done = self.step_count >= 1000

        # Generate observation
        obs = self._get_observation()

        # Report episode return and length when the episode ends
        info = {'episode': {'r': self.total_reward, 'l': self.step_count}} if done else {}

        return obs, reward, done, info

    def _compute_reward(self):
        """Compute reward for current state."""
        return 1.0

    def _get_observation(self):
        """Generate observation from current state."""
        return {
            'image': np.random.randint(0, 256, (84, 84, 3), dtype=np.uint8),
            'vector': np.random.randn(10).astype(np.float32)
        }
```
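A minimal rollout loop over the class above (assuming the Gym-style four-tuple `step` API shown, with a fixed action for brevity):

```python
env = MyEnvironment()
obs = env.reset()
done = False
while not done:
    obs, reward, done, info = env.step(0)  # always take action 0
print(info['episode'])  # {'r': 1000.0, 'l': 1000}
```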
### Observation Spaces

#### Discrete Spaces

```python
# Single discrete value
self.observation_space = self.make_discrete(10)  # Values 0-9

# Dict mixing continuous and discrete values
self.observation_space = self.make_space({
    'position': (1,),              # Continuous
    'type': self.make_discrete(5)  # Discrete
})
```
#### Continuous Spaces

```python
# Box space (continuous)
self.observation_space = self.make_space({
    'image': (84, 84, 3),  # Image
    'vector': (10,),       # Vector
    'scalar': (1,)         # Single value
})
```
#### Multi-Discrete Spaces

```python
# Multiple discrete values: three heads with 3, 5, and 2 options
self.observation_space = self.make_multi_discrete([3, 5, 2])
```
### Action Spaces

```python
# Discrete actions
self.action_space = self.make_discrete(4)  # 4 actions: 0, 1, 2, 3

# Continuous actions
self.action_space = self.make_space((3,))  # 3D continuous action

# Multi-discrete actions
self.action_space = self.make_multi_discrete([3, 3])  # Two 3-way discrete choices
```
## Multi-Agent Environments

PufferLib has native multi-agent support, treating single-agent and multi-agent environments uniformly.

### Multi-Agent PufferEnv
```python
class MultiAgentEnv(PufferEnv):
    def __init__(self, num_agents=4, buf=None):
        super().__init__(buf)
        self.num_agents = num_agents

        # Per-agent observation space
        self.single_observation_space = self.make_space({
            'position': (2,),
            'velocity': (2,),
            'global': (10,)
        })

        # Per-agent action space
        self.single_action_space = self.make_discrete(5)

        self.reset()

    def reset(self):
        """Reset all agents."""
        # Agent is a user-defined helper class (see the sketch below)
        self.agents = {f'agent_{i}': Agent(i) for i in range(self.num_agents)}

        # Return observations for all agents
        return {
            agent_id: self._get_obs(agent)
            for agent_id, agent in self.agents.items()
        }

    def step(self, actions):
        """Step all agents. `actions` is a dict: {agent_id: action}."""
        observations = {}
        rewards = {}
        dones = {}
        infos = {}

        for agent_id, action in actions.items():
            agent = self.agents[agent_id]

            # Update agent
            agent.update(action)

            # Generate results
            observations[agent_id] = self._get_obs(agent)
            rewards[agent_id] = self._compute_reward(agent)
            dones[agent_id] = agent.is_done()
            infos[agent_id] = {}

        # Global done condition: the episode ends when every agent is done
        dones['__all__'] = all(dones.values())

        return observations, rewards, dones, infos
```
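The example above references an `Agent` helper class that the guide never defines; a minimal stand-in (purely illustrative, matching only the methods used above) could look like this:

```python
class Agent:
    """Minimal per-agent state holder matching the calls in MultiAgentEnv."""

    def __init__(self, agent_id):
        self.agent_id = agent_id
        self.steps = 0

    def update(self, action):
        # Apply the action; here we only count steps
        self.steps += 1

    def is_done(self):
        return self.steps >= 1000
```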
## Ocean Environment Suite

PufferLib provides the Ocean suite with 20+ pre-built environments:

### Available Environments
#### Arcade Games

- **Atari**: Classic Atari 2600 games via the Arcade Learning Environment
- **Procgen**: Procedurally generated games for generalization testing

#### Grid-Based

- **Minigrid**: Partially observable gridworld environments
- **Crafter**: Open-ended survival crafting game
- **NetHack**: Classic roguelike dungeon crawler
- **MiniHack**: Simplified NetHack variants

#### Multi-Agent

- **PettingZoo**: Multi-agent environment suite (including Butterfly)
- **MAgent**: Large-scale multi-agent scenarios
- **Neural MMO**: Massively multi-agent survival game

#### Specialized

- **Pokemon Red**: Classic Pokemon game environment
- **GPUDrive**: High-performance driving simulator
- **Griddly**: Grid-based game engine
- **MicroRTS**: Real-time strategy game
### Using Ocean Environments

```python
import pufferlib

# Make environment
env = pufferlib.make('procgen-coinrun', num_envs=256)

# With custom configuration
env = pufferlib.make(
    'atari-pong',
    num_envs=128,
    frameskip=4,
    framestack=4
)

# Multi-agent environment
env = pufferlib.make('pettingzoo-knights-archers-zombies', num_agents=4)
```
## Custom Environment Development

### Development Workflow

1. **Prototype in Python**: Start with a pure Python PufferEnv
2. **Optimize Critical Paths**: Identify bottlenecks (see the profiling sketch below)
3. **Implement in C**: Rewrite performance-critical code in C
4. **Create Bindings**: Use the Python C API
5. **Compile**: Build as an extension module
6. **Register**: Add to the Ocean suite
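For step 2, a quick way to find bottlenecks is Python's built-in profiler. This sketch assumes the `MyEnvironment` class from earlier in this guide:

```python
import cProfile
import pstats

env = MyEnvironment()
env.reset()

# Profile a few thousand steps and print the ten hottest call sites
with cProfile.Profile() as profiler:
    for _ in range(5_000):
        obs, reward, done, info = env.step(0)
        if done:
            env.reset()
pstats.Stats(profiler).sort_stats('cumulative').print_stats(10)
```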
### Performance Benchmarks

- **Pure Python**: 100k-500k steps/second
- **C Implementation**: 100M+ steps/second
- **Training with Python env**: ~400k total SPS (steps per second)
- **Training with C env**: ~4M total SPS

A rough way to measure raw environment throughput is sketched below.
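This timing helper is a sketch, assuming the Gym-style `step`/`reset` API and a `sample()`-able action space as used throughout this guide:

```python
import time

def measure_sps(env, num_steps=10_000):
    """Rough steps-per-second estimate for one environment instance."""
    env.reset()
    start = time.perf_counter()
    for _ in range(num_steps):
        obs, reward, done, info = env.step(env.action_space.sample())
        if done:
            env.reset()
    return num_steps / (time.perf_counter() - start)

# print(f"{measure_sps(MyEnvironment()):,.0f} steps/second")
```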
### Python Optimization Tips

```python
# Use NumPy operations instead of Python loops
# Bad
for i in range(len(array)):
    array[i] = array[i] * 2

# Good
array *= 2

# Pre-allocate arrays instead of appending
# Bad
observations = []
for i in range(n):
    observations.append(generate_obs())

# Good (note the unpacking: obs_shape is a tuple such as (84, 84, 3))
observations = np.empty((n, *obs_shape), dtype=np.float32)
for i in range(n):
    observations[i] = generate_obs()

# Use in-place operations
# Bad
new_state = state + delta

# Good
state += delta
```
### C Extension Example

```c
// my_env.c
#include <Python.h>
#include <numpy/arrayobject.h>

// Fast environment step implementation
static PyObject* fast_step(PyObject* self, PyObject* args) {
    PyArrayObject* state;
    int action;

    if (!PyArg_ParseTuple(args, "O!i", &PyArray_Type, &state, &action)) {
        return NULL;
    }

    // High-performance C implementation:
    // update `state` in place, then compute reward and done
    // ...
    double reward = 0.0;
    int done = 0;

    // Return (obs, reward, done); here the state array doubles as the observation
    return Py_BuildValue("Odi", (PyObject*)state, reward, done);
}

static PyMethodDef methods[] = {
    {"fast_step", fast_step, METH_VARARGS, "Fast environment step"},
    {NULL, NULL, 0, NULL}
};

static struct PyModuleDef module = {
    PyModuleDef_HEAD_INIT,
    "my_env_c",
    NULL,
    -1,
    methods
};

PyMODINIT_FUNC PyInit_my_env_c(void) {
    import_array();
    return PyModule_Create(&module);
}
```
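To compile the module, a minimal build script might look like the following (a sketch: the module name matches `PyInit_my_env_c` above, and `numpy.get_include()` supplies the header path for `numpy/arrayobject.h`):

```python
# setup.py -- build with: python setup.py build_ext --inplace
import numpy as np
from setuptools import Extension, setup

setup(
    ext_modules=[
        Extension(
            'my_env_c',                       # must match PyInit_my_env_c
            sources=['my_env.c'],
            include_dirs=[np.get_include()],  # for numpy/arrayobject.h
        )
    ]
)
```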
## Third-Party Environment Integration

### Gymnasium Environments
```python
import gymnasium as gym
import pufferlib

# Wrap a Gymnasium environment
gym_env = gym.make('CartPole-v1')
puffer_env = pufferlib.emulate(gym_env, num_envs=256)

# Or use make directly
env = pufferlib.make('gym-CartPole-v1', num_envs=256)
```
### PettingZoo Environments

```python
from pettingzoo.butterfly import pistonball_v6
import pufferlib

# Wrap a PettingZoo environment
pz_env = pistonball_v6.env()
puffer_env = pufferlib.emulate(pz_env, num_envs=128)

# Or use make directly
env = pufferlib.make('pettingzoo-pistonball', num_envs=128)
```
### Custom Wrappers

```python
import pufferlib

class CustomWrapper(pufferlib.PufferEnv):
    """Wrapper to modify environment behavior."""

    def __init__(self, base_env, buf=None):
        super().__init__(buf)
        self.base_env = base_env
        self.observation_space = base_env.observation_space
        self.action_space = base_env.action_space

    def reset(self):
        obs = self.base_env.reset()
        # Modify observation
        return self._process_obs(obs)

    def step(self, action):
        # Modify action
        modified_action = self._process_action(action)

        obs, reward, done, info = self.base_env.step(modified_action)

        # Modify outputs
        obs = self._process_obs(obs)
        reward = self._process_reward(reward)

        return obs, reward, done, info

    # Override these hooks to implement the actual transformations
    def _process_obs(self, obs):
        return obs

    def _process_action(self, action):
        return action

    def _process_reward(self, reward):
        return reward
```
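As a usage sketch (the `RewardClipWrapper` name is hypothetical), a concrete subclass only needs to override the hooks it cares about:

```python
import numpy as np

class RewardClipWrapper(CustomWrapper):
    """Clips rewards to [-1, 1]; observations and actions pass through."""

    def _process_reward(self, reward):
        return float(np.clip(reward, -1.0, 1.0))

# env = RewardClipWrapper(MyEnvironment())
```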
## Environment Best Practices

### State Management
```python
# Store minimal state, compute on demand
class EfficientEnv(PufferEnv):
    def __init__(self, buf=None):
        super().__init__(buf)
        self.agent_pos = np.zeros(2)  # Minimal state

    def _get_observation(self):
        # Compute the full observation on demand
        observation = np.zeros((84, 84, 3), dtype=np.uint8)
        self._render_scene(observation, self.agent_pos)
        return observation
```
### Reward Scaling

```python
# Normalize rewards to a reasonable range
def step(self, action):
    # ... environment logic ...

    # Scale large rewards, then clip extreme values
    raw_reward = compute_raw_reward()
    reward = np.clip(raw_reward / 100.0, -10, 10)

    return obs, reward, done, info
```
### Episode Termination

```python
def step(self, action):
    # ... environment logic ...

    # Multiple termination conditions
    timeout = self.step_count >= self.max_steps
    success = self._check_success()
    failure = self._check_failure()

    done = timeout or success or failure

    info = {
        # Truncated only if the time limit hit before a terminal state
        'TimeLimit.truncated': timeout and not (success or failure),
        'success': success
    }

    return obs, reward, done, info
```
### Memory Efficiency

```python
# Reuse buffers instead of allocating new ones
class MemoryEfficientEnv(PufferEnv):
    def __init__(self, buf=None):
        super().__init__(buf)

        # Pre-allocate the observation buffer once
        self._obs_buffer = np.zeros((84, 84, 3), dtype=np.uint8)

    def _get_observation(self):
        # Reuse the buffer, modifying it in place
        self._render_scene(self._obs_buffer)
        return self._obs_buffer  # Return a view, not a copy
```
## Debugging Environments

### Validation Checks
```python
# Add assertions to catch bugs early
def step(self, action):
    assert self.action_space.contains(action), f"Invalid action: {action}"

    obs, reward, done, info = self._step_impl(action)

    assert self.observation_space.contains(obs), "Invalid observation"
    assert np.isfinite(reward), "Non-finite reward"

    return obs, reward, done, info
```
### Rendering

```python
class DebuggableEnv(PufferEnv):
    def __init__(self, buf=None, render_mode=None):
        super().__init__(buf)
        self.render_mode = render_mode

    def render(self):
        """Render environment for debugging."""
        if self.render_mode == 'human':
            # Display to screen
            self._display_scene()
        elif self.render_mode == 'rgb_array':
            # Return an image array
            return self._render_to_array()
```
### Logging

```python
import logging

logger = logging.getLogger(__name__)

def step(self, action):
    logger.debug(f"Step {self.step_count}: action={action}")

    obs, reward, done, info = self._step_impl(action)

    if done:
        logger.info(f"Episode finished: reward={self.total_reward}")

    return obs, reward, done, info
```