Initial commit

Author: Zhongwei Li
Date: 2025-11-30 08:30:10 +08:00
Commit: f0bd18fb4e
824 changed files with 331919 additions and 0 deletions

@@ -0,0 +1,340 @@
#!/usr/bin/env python3
"""
PufferLib Environment Template
This template provides a starting point for creating custom PufferEnv environments.
Customize the observation space, action space, and environment logic for your task.
"""
import numpy as np
import pufferlib
from pufferlib import PufferEnv
class MyEnvironment(PufferEnv):
"""
Custom PufferLib environment template.
This is a simple grid world example. Customize it for your specific task.
"""
def __init__(self, buf=None, grid_size=10, max_steps=1000):
"""
Initialize environment.
Args:
buf: Shared memory buffer (managed by PufferLib)
grid_size: Size of the grid world
max_steps: Maximum steps per episode
"""
super().__init__(buf)
self.grid_size = grid_size
self.max_steps = max_steps
# Define observation space
# Option 1: Flat vector observation
self.observation_space = self.make_space((4,)) # [x, y, goal_x, goal_y]
# Option 2: Dict observation with multiple components
# self.observation_space = self.make_space({
# 'position': (2,),
# 'goal': (2,),
# 'grid': (grid_size, grid_size)
# })
# Option 3: Image observation
# self.observation_space = self.make_space((grid_size, grid_size, 3))
# Define action space
# Option 1: Discrete actions
self.action_space = self.make_discrete(4) # 0: up, 1: right, 2: down, 3: left
# Option 2: Continuous actions
# self.action_space = self.make_space((2,)) # [dx, dy]
# Option 3: Multi-discrete actions
# self.action_space = self.make_multi_discrete([3, 3]) # Two 3-way choices
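        # The rest of this template assumes Option 1 for both spaces:
        # a flat 4-vector observation and 4-way discrete actions.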
# Initialize state
self.agent_pos = None
self.goal_pos = None
self.step_count = 0
self.reset()
def reset(self):
"""
Reset environment to initial state.
Returns:
observation: Initial observation
"""
        # Reset state
        self.agent_pos = np.array([0, 0], dtype=np.float32)
        self.goal_pos = np.array([self.grid_size - 1, self.grid_size - 1], dtype=np.float32)
        self.step_count = 0
        self.episode_return = 0.0
# Return initial observation
return self._get_observation()
def step(self, action):
"""
Execute one environment step.
Args:
action: Action to take
Returns:
observation: New observation
reward: Reward for this step
done: Whether episode is complete
info: Additional information
"""
self.step_count += 1
# Execute action
self._apply_action(action)
# Compute reward
reward = self._compute_reward()
# Check if episode is done
done = self._is_done()
# Get new observation
observation = self._get_observation()
        # Track cumulative return for episode statistics
        self.episode_return += reward
        # Additional info
        info = {}
        if done:
            # Report the cumulative episode return and length, not just the final step reward
            info['episode'] = {
                'r': self.episode_return,
                'l': self.step_count
            }
return observation, reward, done, info
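    # Note: this template returns a single `done` flag (classic Gym-style 4-tuple).
    # Gymnasium-style APIs split it into terminated/truncated; under that
    # convention the same logic would look like (sketch only, not part of this template):
    #   terminated = goal_reached
    #   truncated  = self.step_count >= self.max_steps
    #   return observation, reward, terminated, truncated, info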
def _apply_action(self, action):
"""Apply action to update environment state."""
# Discrete actions: 0=up, 1=right, 2=down, 3=left
if action == 0: # Up
self.agent_pos[1] = min(self.agent_pos[1] + 1, self.grid_size - 1)
elif action == 1: # Right
self.agent_pos[0] = min(self.agent_pos[0] + 1, self.grid_size - 1)
elif action == 2: # Down
self.agent_pos[1] = max(self.agent_pos[1] - 1, 0)
elif action == 3: # Left
self.agent_pos[0] = max(self.agent_pos[0] - 1, 0)
def _compute_reward(self):
"""Compute reward for current state."""
# Distance to goal
distance = np.linalg.norm(self.agent_pos - self.goal_pos)
# Reward shaping: negative distance + bonus for reaching goal
reward = -distance / self.grid_size
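        # Example scale for grid_size=10: the start-to-goal distance is
        # 9 * sqrt(2) ≈ 12.73, so this term starts around -1.27 per step and
        # approaches 0 as the agent nears the goal; the +10 bonus below
        # dominates on success.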
# Goal reached
if distance < 0.5:
reward += 10.0
return reward
def _is_done(self):
"""Check if episode is complete."""
# Episode ends if goal reached or max steps exceeded
distance = np.linalg.norm(self.agent_pos - self.goal_pos)
goal_reached = distance < 0.5
timeout = self.step_count >= self.max_steps
return goal_reached or timeout
def _get_observation(self):
"""Generate observation from current state."""
# Return flat vector observation
observation = np.concatenate([
self.agent_pos,
self.goal_pos
]).astype(np.float32)
return observation
class MultiAgentEnvironment(PufferEnv):
"""
Multi-agent environment template.
Example: Cooperative navigation task where agents must reach individual goals.
"""
def __init__(self, buf=None, num_agents=4, grid_size=10, max_steps=1000):
super().__init__(buf)
self.num_agents = num_agents
self.grid_size = grid_size
self.max_steps = max_steps
# Per-agent observation space
self.single_observation_space = self.make_space({
'position': (2,),
'goal': (2,),
'others': (2 * (num_agents - 1),) # Positions of other agents
})
# Per-agent action space
self.single_action_space = self.make_discrete(5) # 4 directions + stay
# Initialize state
self.agent_positions = None
self.goal_positions = None
self.step_count = 0
self.reset()
def reset(self):
"""Reset all agents."""
# Random initial positions
self.agent_positions = np.random.rand(self.num_agents, 2) * self.grid_size
# Random goal positions
self.goal_positions = np.random.rand(self.num_agents, 2) * self.grid_size
self.step_count = 0
# Return observations for all agents
return {
f'agent_{i}': self._get_obs(i)
for i in range(self.num_agents)
}
def step(self, actions):
"""
Step all agents.
Args:
actions: Dict of {agent_id: action}
Returns:
observations: Dict of {agent_id: observation}
rewards: Dict of {agent_id: reward}
dones: Dict of {agent_id: done}
infos: Dict of {agent_id: info}
"""
self.step_count += 1
observations = {}
rewards = {}
dones = {}
infos = {}
# Update all agents
for agent_id, action in actions.items():
agent_idx = int(agent_id.split('_')[1])
# Apply action
self._apply_action(agent_idx, action)
# Generate outputs
observations[agent_id] = self._get_obs(agent_idx)
rewards[agent_id] = self._compute_reward(agent_idx)
dones[agent_id] = self._is_done(agent_idx)
infos[agent_id] = {}
# Global done condition
dones['__all__'] = all(dones.values()) or self.step_count >= self.max_steps
return observations, rewards, dones, infos
def _apply_action(self, agent_idx, action):
"""Apply action for specific agent."""
if action == 0: # Up
self.agent_positions[agent_idx, 1] += 1
elif action == 1: # Right
self.agent_positions[agent_idx, 0] += 1
elif action == 2: # Down
self.agent_positions[agent_idx, 1] -= 1
elif action == 3: # Left
self.agent_positions[agent_idx, 0] -= 1
# action == 4: Stay
# Clip to grid bounds
self.agent_positions[agent_idx] = np.clip(
self.agent_positions[agent_idx],
0,
self.grid_size - 1
)
def _compute_reward(self, agent_idx):
"""Compute reward for specific agent."""
distance = np.linalg.norm(
self.agent_positions[agent_idx] - self.goal_positions[agent_idx]
)
return -distance / self.grid_size
def _is_done(self, agent_idx):
"""Check if specific agent is done."""
distance = np.linalg.norm(
self.agent_positions[agent_idx] - self.goal_positions[agent_idx]
)
return distance < 0.5
def _get_obs(self, agent_idx):
"""Get observation for specific agent."""
# Get positions of other agents
other_positions = np.concatenate([
self.agent_positions[i]
for i in range(self.num_agents)
if i != agent_idx
])
return {
'position': self.agent_positions[agent_idx].astype(np.float32),
'goal': self.goal_positions[agent_idx].astype(np.float32),
'others': other_positions.astype(np.float32)
}
def test_environment():
"""Test environment to verify it works correctly."""
print("Testing single-agent environment...")
env = MyEnvironment()
obs = env.reset()
print(f"Initial observation shape: {obs.shape}")
for step in range(10):
action = env.action_space.sample()
obs, reward, done, info = env.step(action)
print(f"Step {step}: reward={reward:.3f}, done={done}")
if done:
obs = env.reset()
print("Episode finished, resetting...")
print("\nTesting multi-agent environment...")
multi_env = MultiAgentEnvironment(num_agents=4)
obs = multi_env.reset()
print(f"Number of agents: {len(obs)}")
for step in range(10):
actions = {
agent_id: multi_env.single_action_space.sample()
for agent_id in obs.keys()
}
obs, rewards, dones, infos = multi_env.step(actions)
print(f"Step {step}: mean_reward={np.mean(list(rewards.values())):.3f}")
if dones.get('__all__', False):
obs = multi_env.reset()
print("Episode finished, resetting...")
print("\n✓ Environment tests passed!")
if __name__ == '__main__':
test_environment()

@@ -0,0 +1,239 @@
#!/usr/bin/env python3
"""
PufferLib Training Template
This template provides a complete training script for reinforcement learning
with PufferLib. Customize the environment, policy, and training configuration
as needed for your use case.
"""
import argparse
import torch
import torch.nn as nn
import pufferlib
from pufferlib import PuffeRL
from pufferlib.pytorch import layer_init
class Policy(nn.Module):
"""Example policy network."""
def __init__(self, observation_space, action_space, hidden_size=256):
super().__init__()
self.observation_space = observation_space
self.action_space = action_space
# Encoder network
self.encoder = nn.Sequential(
layer_init(nn.Linear(observation_space.shape[0], hidden_size)),
nn.ReLU(),
layer_init(nn.Linear(hidden_size, hidden_size)),
nn.ReLU()
)
# Policy head (actor)
self.actor = layer_init(nn.Linear(hidden_size, action_space.n), std=0.01)
# Value head (critic)
self.critic = layer_init(nn.Linear(hidden_size, 1), std=1.0)
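        # The small std (0.01) on the actor head keeps initial logits near zero,
        # so the starting policy is roughly uniform over actions; the critic head
        # uses a standard-scale init (std=1.0). This mirrors common PPO practice.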
def forward(self, observations):
"""Forward pass through policy."""
features = self.encoder(observations)
logits = self.actor(features)
value = self.critic(features)
return logits, value
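# A minimal usage sketch for the Policy above (assumes a flat Box observation
# space and a Discrete action space; illustration only, not part of the trainer):
#
#   obs = torch.randn(8, env.observation_space.shape[0])  # batch of 8 observations
#   logits, value = policy(obs)
#   dist = torch.distributions.Categorical(logits=logits)
#   action = dist.sample()            # one sampled action per observation
#   log_prob = dist.log_prob(action)  # used by the PPO surrogate objective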
def make_env():
"""Create environment. Customize this for your task."""
# Option 1: Use Ocean environment
return pufferlib.make('procgen-coinrun', num_envs=256)
# Option 2: Use Gymnasium environment
# return pufferlib.make('gym-CartPole-v1', num_envs=256)
# Option 3: Use custom environment
# from my_envs import MyEnvironment
# return pufferlib.emulate(MyEnvironment, num_envs=256)
def create_policy(env):
"""Create policy network."""
return Policy(
observation_space=env.observation_space,
action_space=env.action_space,
hidden_size=256
)
def train(args):
"""Main training function."""
# Set random seeds
torch.manual_seed(args.seed)
# Create environment
print(f"Creating environment with {args.num_envs} parallel environments...")
env = pufferlib.make(
args.env_name,
num_envs=args.num_envs,
num_workers=args.num_workers
)
# Create policy
print("Initializing policy...")
policy = create_policy(env)
    if args.device == 'cuda' and torch.cuda.is_available():
        policy = policy.cuda()
        print(f"Using GPU: {torch.cuda.get_device_name(0)}")
    else:
        # Fall back to CPU so the trainer is not handed an unavailable device
        args.device = 'cpu'
        print("Using CPU")
# Create logger
if args.logger == 'wandb':
from pufferlib import WandbLogger
logger = WandbLogger(
project=args.project,
name=args.exp_name,
config=vars(args)
)
elif args.logger == 'neptune':
from pufferlib import NeptuneLogger
logger = NeptuneLogger(
project=args.project,
name=args.exp_name,
api_token=args.neptune_token
)
else:
from pufferlib import NoLogger
logger = NoLogger()
# Create trainer
print("Creating trainer...")
trainer = PuffeRL(
env=env,
policy=policy,
device=args.device,
learning_rate=args.learning_rate,
batch_size=args.batch_size,
n_epochs=args.n_epochs,
gamma=args.gamma,
gae_lambda=args.gae_lambda,
clip_coef=args.clip_coef,
ent_coef=args.ent_coef,
vf_coef=args.vf_coef,
max_grad_norm=args.max_grad_norm,
logger=logger,
compile=args.compile
)
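    # For reference, the standard PPO objective these coefficients parameterize
    # (textbook form, not a claim about PuffeRL internals):
    #   ratio  = exp(log_prob_new - log_prob_old)
    #   L_clip = -min(ratio * A, clip(ratio, 1 - clip_coef, 1 + clip_coef) * A)
    #   loss   = L_clip + vf_coef * value_loss - ent_coef * entropy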
# Training loop
print(f"Starting training for {args.num_iterations} iterations...")
for iteration in range(1, args.num_iterations + 1):
# Collect rollouts
rollout_data = trainer.evaluate()
# Train on batch
train_metrics = trainer.train()
# Log results
trainer.mean_and_log()
# Save checkpoint
if iteration % args.save_freq == 0:
checkpoint_path = f"{args.checkpoint_dir}/checkpoint_{iteration}.pt"
trainer.save_checkpoint(checkpoint_path)
print(f"Saved checkpoint to {checkpoint_path}")
# Print progress
if iteration % args.log_freq == 0:
mean_reward = rollout_data.get('mean_reward', 0)
sps = rollout_data.get('sps', 0)
print(f"Iteration {iteration}/{args.num_iterations} | "
f"Mean Reward: {mean_reward:.2f} | "
f"SPS: {sps:,.0f}")
print("Training complete!")
# Save final model
final_path = f"{args.checkpoint_dir}/final_model.pt"
trainer.save_checkpoint(final_path)
print(f"Saved final model to {final_path}")
def main():
parser = argparse.ArgumentParser(description='PufferLib Training')
# Environment
parser.add_argument('--env-name', type=str, default='procgen-coinrun',
help='Environment name')
parser.add_argument('--num-envs', type=int, default=256,
help='Number of parallel environments')
parser.add_argument('--num-workers', type=int, default=8,
help='Number of vectorization workers')
# Training
parser.add_argument('--num-iterations', type=int, default=10000,
help='Number of training iterations')
parser.add_argument('--learning-rate', type=float, default=3e-4,
help='Learning rate')
parser.add_argument('--batch-size', type=int, default=32768,
help='Batch size for training')
parser.add_argument('--n-epochs', type=int, default=4,
help='Number of training epochs per batch')
parser.add_argument('--device', type=str, default='cuda',
choices=['cuda', 'cpu'], help='Device to use')
# PPO Parameters
parser.add_argument('--gamma', type=float, default=0.99,
help='Discount factor')
parser.add_argument('--gae-lambda', type=float, default=0.95,
help='GAE lambda')
parser.add_argument('--clip-coef', type=float, default=0.2,
help='PPO clipping coefficient')
parser.add_argument('--ent-coef', type=float, default=0.01,
help='Entropy coefficient')
parser.add_argument('--vf-coef', type=float, default=0.5,
help='Value function coefficient')
parser.add_argument('--max-grad-norm', type=float, default=0.5,
help='Maximum gradient norm')
# Logging
parser.add_argument('--logger', type=str, default='none',
choices=['wandb', 'neptune', 'none'],
help='Logger to use')
parser.add_argument('--project', type=str, default='pufferlib-training',
help='Project name for logging')
parser.add_argument('--exp-name', type=str, default='experiment',
help='Experiment name')
parser.add_argument('--neptune-token', type=str, default=None,
help='Neptune API token')
parser.add_argument('--log-freq', type=int, default=10,
help='Logging frequency (iterations)')
# Checkpointing
parser.add_argument('--checkpoint-dir', type=str, default='checkpoints',
help='Directory to save checkpoints')
parser.add_argument('--save-freq', type=int, default=100,
help='Checkpoint save frequency (iterations)')
# Misc
parser.add_argument('--seed', type=int, default=42,
help='Random seed')
parser.add_argument('--compile', action='store_true',
help='Use torch.compile for faster training')
args = parser.parse_args()
# Create checkpoint directory
import os
os.makedirs(args.checkpoint_dir, exist_ok=True)
# Run training
train(args)
if __name__ == '__main__':
main()