Initial commit
skills/pufferlib/scripts/env_template.py (Normal file, 340 lines)
@@ -0,0 +1,340 @@
#!/usr/bin/env python3
"""
PufferLib Environment Template

This template provides a starting point for creating custom PufferEnv environments.
Customize the observation space, action space, and environment logic for your task.
"""

import numpy as np
import pufferlib
from pufferlib import PufferEnv


class MyEnvironment(PufferEnv):
    """
    Custom PufferLib environment template.

    This is a simple grid world example. Customize it for your specific task.
    """

    def __init__(self, buf=None, grid_size=10, max_steps=1000):
        """
        Initialize environment.

        Args:
            buf: Shared memory buffer (managed by PufferLib)
            grid_size: Size of the grid world
            max_steps: Maximum steps per episode
        """
        super().__init__(buf)

        self.grid_size = grid_size
        self.max_steps = max_steps

        # Define observation space
        # Option 1: Flat vector observation
        self.observation_space = self.make_space((4,))  # [x, y, goal_x, goal_y]

        # Option 2: Dict observation with multiple components
        # self.observation_space = self.make_space({
        #     'position': (2,),
        #     'goal': (2,),
        #     'grid': (grid_size, grid_size)
        # })

        # Option 3: Image observation
        # self.observation_space = self.make_space((grid_size, grid_size, 3))

        # Define action space
        # Option 1: Discrete actions
        self.action_space = self.make_discrete(4)  # 0: up, 1: right, 2: down, 3: left

        # Option 2: Continuous actions
        # self.action_space = self.make_space((2,))  # [dx, dy]

        # Option 3: Multi-discrete actions
        # self.action_space = self.make_multi_discrete([3, 3])  # Two 3-way choices

        # Initialize state
        self.agent_pos = None
        self.goal_pos = None
        self.step_count = 0

        self.reset()

    def reset(self):
        """
        Reset environment to initial state.

        Returns:
            observation: Initial observation
        """
        # Reset state
        self.agent_pos = np.array([0, 0], dtype=np.float32)
        self.goal_pos = np.array([self.grid_size - 1, self.grid_size - 1], dtype=np.float32)
        self.step_count = 0

        # Return initial observation
        return self._get_observation()

    def step(self, action):
        """
        Execute one environment step.

        Args:
            action: Action to take

        Returns:
            observation: New observation
            reward: Reward for this step
            done: Whether episode is complete
            info: Additional information
        """
        self.step_count += 1

        # Execute action
        self._apply_action(action)

        # Compute reward
        reward = self._compute_reward()

        # Check if episode is done
        done = self._is_done()

        # Get new observation
        observation = self._get_observation()

        # Additional info
        info = {}
        if done:
            # Include episode statistics when episode ends
            info['episode'] = {
                'r': reward,
                'l': self.step_count
            }

        return observation, reward, done, info
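    # Note on the return convention (a remark added for clarity, not required by
    # the template): this step() returns the classic 4-tuple
    # (observation, reward, done, info), whereas Gymnasium-style APIs split
    # `done` into `terminated` and `truncated`. Also, the 'r' entry above holds
    # only the final step's reward; track a running sum during the episode if
    # you need the full episode return.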

    def _apply_action(self, action):
        """Apply action to update environment state."""
        # Discrete actions: 0=up, 1=right, 2=down, 3=left
        if action == 0:  # Up
            self.agent_pos[1] = min(self.agent_pos[1] + 1, self.grid_size - 1)
        elif action == 1:  # Right
            self.agent_pos[0] = min(self.agent_pos[0] + 1, self.grid_size - 1)
        elif action == 2:  # Down
            self.agent_pos[1] = max(self.agent_pos[1] - 1, 0)
        elif action == 3:  # Left
            self.agent_pos[0] = max(self.agent_pos[0] - 1, 0)

    def _compute_reward(self):
        """Compute reward for current state."""
        # Distance to goal
        distance = np.linalg.norm(self.agent_pos - self.goal_pos)

        # Reward shaping: negative distance + bonus for reaching goal
        reward = -distance / self.grid_size

        # Goal reached
        if distance < 0.5:
            reward += 10.0

        return reward
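
    # Worked example of the shaping above: with the defaults (grid_size=10,
    # agent starting at (0, 0), goal at (9, 9)), the start-to-goal distance is
    # sqrt(9**2 + 9**2) ≈ 12.73, so early steps are penalized by roughly
    # -1.2 to -1.3, shrinking toward 0 as the agent approaches the goal.
    # The +10.0 bonus applies only once the agent is within 0.5 of the goal,
    # which also ends the episode via _is_done().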

    def _is_done(self):
        """Check if episode is complete."""
        # Episode ends if goal reached or max steps exceeded
        distance = np.linalg.norm(self.agent_pos - self.goal_pos)
        goal_reached = distance < 0.5
        timeout = self.step_count >= self.max_steps

        return goal_reached or timeout

    def _get_observation(self):
        """Generate observation from current state."""
        # Return flat vector observation
        observation = np.concatenate([
            self.agent_pos,
            self.goal_pos
        ]).astype(np.float32)

        return observation
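
    # For example, with the defaults the observation immediately after reset()
    # is np.array([0., 0., 9., 9.], dtype=np.float32): the agent position
    # followed by the goal position.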


class MultiAgentEnvironment(PufferEnv):
    """
    Multi-agent environment template.

    Example: Cooperative navigation task where agents must reach individual goals.
    """

    def __init__(self, buf=None, num_agents=4, grid_size=10, max_steps=1000):
        super().__init__(buf)

        self.num_agents = num_agents
        self.grid_size = grid_size
        self.max_steps = max_steps

        # Per-agent observation space
        self.single_observation_space = self.make_space({
            'position': (2,),
            'goal': (2,),
            'others': (2 * (num_agents - 1),)  # Positions of other agents
        })

        # Per-agent action space
        self.single_action_space = self.make_discrete(5)  # 4 directions + stay

        # Initialize state
        self.agent_positions = None
        self.goal_positions = None
        self.step_count = 0

        self.reset()

    def reset(self):
        """Reset all agents."""
        # Random initial positions
        self.agent_positions = np.random.rand(self.num_agents, 2) * self.grid_size

        # Random goal positions
        self.goal_positions = np.random.rand(self.num_agents, 2) * self.grid_size

        self.step_count = 0

        # Return observations for all agents
        return {
            f'agent_{i}': self._get_obs(i)
            for i in range(self.num_agents)
        }

    def step(self, actions):
        """
        Step all agents.

        Args:
            actions: Dict of {agent_id: action}

        Returns:
            observations: Dict of {agent_id: observation}
            rewards: Dict of {agent_id: reward}
            dones: Dict of {agent_id: done}
            infos: Dict of {agent_id: info}
        """
        self.step_count += 1

        observations = {}
        rewards = {}
        dones = {}
        infos = {}

        # Update all agents
        for agent_id, action in actions.items():
            agent_idx = int(agent_id.split('_')[1])

            # Apply action
            self._apply_action(agent_idx, action)

            # Generate outputs
            observations[agent_id] = self._get_obs(agent_idx)
            rewards[agent_id] = self._compute_reward(agent_idx)
            dones[agent_id] = self._is_done(agent_idx)
            infos[agent_id] = {}

        # Global done condition
        dones['__all__'] = all(dones.values()) or self.step_count >= self.max_steps

        return observations, rewards, dones, infos

    def _apply_action(self, agent_idx, action):
        """Apply action for specific agent."""
        if action == 0:  # Up
            self.agent_positions[agent_idx, 1] += 1
        elif action == 1:  # Right
            self.agent_positions[agent_idx, 0] += 1
        elif action == 2:  # Down
            self.agent_positions[agent_idx, 1] -= 1
        elif action == 3:  # Left
            self.agent_positions[agent_idx, 0] -= 1
        # action == 4: Stay

        # Clip to grid bounds
        self.agent_positions[agent_idx] = np.clip(
            self.agent_positions[agent_idx],
            0,
            self.grid_size - 1
        )

    def _compute_reward(self, agent_idx):
        """Compute reward for specific agent."""
        distance = np.linalg.norm(
            self.agent_positions[agent_idx] - self.goal_positions[agent_idx]
        )
        return -distance / self.grid_size

    def _is_done(self, agent_idx):
        """Check if specific agent is done."""
        distance = np.linalg.norm(
            self.agent_positions[agent_idx] - self.goal_positions[agent_idx]
        )
        return distance < 0.5

    def _get_obs(self, agent_idx):
        """Get observation for specific agent."""
        # Get positions of other agents
        other_positions = np.concatenate([
            self.agent_positions[i]
            for i in range(self.num_agents)
            if i != agent_idx
        ])

        return {
            'position': self.agent_positions[agent_idx].astype(np.float32),
            'goal': self.goal_positions[agent_idx].astype(np.float32),
            'others': other_positions.astype(np.float32)
        }


def test_environment():
    """Test environment to verify it works correctly."""
    print("Testing single-agent environment...")
    env = MyEnvironment()

    obs = env.reset()
    print(f"Initial observation shape: {obs.shape}")

    for step in range(10):
        action = env.action_space.sample()
        obs, reward, done, info = env.step(action)

        print(f"Step {step}: reward={reward:.3f}, done={done}")

        if done:
            obs = env.reset()
            print("Episode finished, resetting...")

    print("\nTesting multi-agent environment...")
    multi_env = MultiAgentEnvironment(num_agents=4)

    obs = multi_env.reset()
    print(f"Number of agents: {len(obs)}")

    for step in range(10):
        actions = {
            agent_id: multi_env.single_action_space.sample()
            for agent_id in obs.keys()
        }
        obs, rewards, dones, infos = multi_env.step(actions)

        print(f"Step {step}: mean_reward={np.mean(list(rewards.values())):.3f}")

        if dones.get('__all__', False):
            obs = multi_env.reset()
            print("Episode finished, resetting...")

    print("\n✓ Environment tests passed!")


if __name__ == '__main__':
    test_environment()
skills/pufferlib/scripts/train_template.py (Normal file, 239 lines)
@@ -0,0 +1,239 @@
#!/usr/bin/env python3
"""
PufferLib Training Template

This template provides a complete training script for reinforcement learning
with PufferLib. Customize the environment, policy, and training configuration
as needed for your use case.
"""

import argparse
import torch
import torch.nn as nn
import pufferlib
from pufferlib import PuffeRL
from pufferlib.pytorch import layer_init


class Policy(nn.Module):
    """Example policy network."""

    def __init__(self, observation_space, action_space, hidden_size=256):
        super().__init__()

        self.observation_space = observation_space
        self.action_space = action_space

        # Encoder network
        self.encoder = nn.Sequential(
            layer_init(nn.Linear(observation_space.shape[0], hidden_size)),
            nn.ReLU(),
            layer_init(nn.Linear(hidden_size, hidden_size)),
            nn.ReLU()
        )

        # Policy head (actor)
        self.actor = layer_init(nn.Linear(hidden_size, action_space.n), std=0.01)

        # Value head (critic)
        self.critic = layer_init(nn.Linear(hidden_size, 1), std=1.0)

    def forward(self, observations):
        """Forward pass through policy."""
        features = self.encoder(observations)
        logits = self.actor(features)
        value = self.critic(features)
        return logits, value
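

# Usage sketch (an illustration added to the template, not part of PufferLib's
# API): how the (logits, value) pair returned by Policy.forward can be turned
# into a sampled action, its log-probability, and a value estimate for a
# PPO-style update. Observations are assumed to be a float tensor of shape
# (batch, obs_dim).
def sample_action(policy, observations):
    """Sample discrete actions from the policy's logits."""
    logits, value = policy(observations)
    dist = torch.distributions.Categorical(logits=logits)
    action = dist.sample()
    return action, dist.log_prob(action), value.squeeze(-1)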


def make_env():
    """Create environment. Customize this for your task."""
    # Option 1: Use Ocean environment
    return pufferlib.make('procgen-coinrun', num_envs=256)

    # Option 2: Use Gymnasium environment
    # return pufferlib.make('gym-CartPole-v1', num_envs=256)

    # Option 3: Use custom environment
    # from my_envs import MyEnvironment
    # return pufferlib.emulate(MyEnvironment, num_envs=256)


def create_policy(env):
    """Create policy network."""
    return Policy(
        observation_space=env.observation_space,
        action_space=env.action_space,
        hidden_size=256
    )


def train(args):
    """Main training function."""
    # Set random seeds
    torch.manual_seed(args.seed)

    # Create environment
    print(f"Creating environment with {args.num_envs} parallel environments...")
    env = pufferlib.make(
        args.env_name,
        num_envs=args.num_envs,
        num_workers=args.num_workers
    )

    # Create policy
    print("Initializing policy...")
    policy = create_policy(env)

    if args.device == 'cuda' and torch.cuda.is_available():
        policy = policy.cuda()
        print(f"Using GPU: {torch.cuda.get_device_name(0)}")
    else:
        print("Using CPU")

    # Create logger
    if args.logger == 'wandb':
        from pufferlib import WandbLogger
        logger = WandbLogger(
            project=args.project,
            name=args.exp_name,
            config=vars(args)
        )
    elif args.logger == 'neptune':
        from pufferlib import NeptuneLogger
        logger = NeptuneLogger(
            project=args.project,
            name=args.exp_name,
            api_token=args.neptune_token
        )
    else:
        from pufferlib import NoLogger
        logger = NoLogger()

    # Create trainer
    print("Creating trainer...")
    trainer = PuffeRL(
        env=env,
        policy=policy,
        device=args.device,
        learning_rate=args.learning_rate,
        batch_size=args.batch_size,
        n_epochs=args.n_epochs,
        gamma=args.gamma,
        gae_lambda=args.gae_lambda,
        clip_coef=args.clip_coef,
        ent_coef=args.ent_coef,
        vf_coef=args.vf_coef,
        max_grad_norm=args.max_grad_norm,
        logger=logger,
        compile=args.compile
    )

    # Training loop
    print(f"Starting training for {args.num_iterations} iterations...")
    for iteration in range(1, args.num_iterations + 1):
        # Collect rollouts
        rollout_data = trainer.evaluate()

        # Train on batch
        train_metrics = trainer.train()

        # Log results
        trainer.mean_and_log()

        # Save checkpoint
        if iteration % args.save_freq == 0:
            checkpoint_path = f"{args.checkpoint_dir}/checkpoint_{iteration}.pt"
            trainer.save_checkpoint(checkpoint_path)
            print(f"Saved checkpoint to {checkpoint_path}")

        # Print progress
        if iteration % args.log_freq == 0:
            mean_reward = rollout_data.get('mean_reward', 0)
            sps = rollout_data.get('sps', 0)
            print(f"Iteration {iteration}/{args.num_iterations} | "
                  f"Mean Reward: {mean_reward:.2f} | "
                  f"SPS: {sps:,.0f}")

    print("Training complete!")

    # Save final model
    final_path = f"{args.checkpoint_dir}/final_model.pt"
    trainer.save_checkpoint(final_path)
    print(f"Saved final model to {final_path}")


def main():
    parser = argparse.ArgumentParser(description='PufferLib Training')

    # Environment
    parser.add_argument('--env-name', type=str, default='procgen-coinrun',
                        help='Environment name')
    parser.add_argument('--num-envs', type=int, default=256,
                        help='Number of parallel environments')
    parser.add_argument('--num-workers', type=int, default=8,
                        help='Number of vectorization workers')

    # Training
    parser.add_argument('--num-iterations', type=int, default=10000,
                        help='Number of training iterations')
    parser.add_argument('--learning-rate', type=float, default=3e-4,
                        help='Learning rate')
    parser.add_argument('--batch-size', type=int, default=32768,
                        help='Batch size for training')
    parser.add_argument('--n-epochs', type=int, default=4,
                        help='Number of training epochs per batch')
    parser.add_argument('--device', type=str, default='cuda',
                        choices=['cuda', 'cpu'], help='Device to use')

    # PPO Parameters
    parser.add_argument('--gamma', type=float, default=0.99,
                        help='Discount factor')
    parser.add_argument('--gae-lambda', type=float, default=0.95,
                        help='GAE lambda')
    parser.add_argument('--clip-coef', type=float, default=0.2,
                        help='PPO clipping coefficient')
    parser.add_argument('--ent-coef', type=float, default=0.01,
                        help='Entropy coefficient')
    parser.add_argument('--vf-coef', type=float, default=0.5,
                        help='Value function coefficient')
    parser.add_argument('--max-grad-norm', type=float, default=0.5,
                        help='Maximum gradient norm')
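
    # For reference (standard PPO/GAE definitions, not specific to PufferLib):
    # the two discount parameters above enter generalized advantage estimation as
    #   delta_t = r_t + gamma * V(s_{t+1}) - V(s_t)
    #   A_t     = sum_k (gamma * gae_lambda)**k * delta_{t+k}
    # and clip-coef bounds the PPO policy ratio to [1 - clip_coef, 1 + clip_coef].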

    # Logging
    parser.add_argument('--logger', type=str, default='none',
                        choices=['wandb', 'neptune', 'none'],
                        help='Logger to use')
    parser.add_argument('--project', type=str, default='pufferlib-training',
                        help='Project name for logging')
    parser.add_argument('--exp-name', type=str, default='experiment',
                        help='Experiment name')
    parser.add_argument('--neptune-token', type=str, default=None,
                        help='Neptune API token')
    parser.add_argument('--log-freq', type=int, default=10,
                        help='Logging frequency (iterations)')

    # Checkpointing
    parser.add_argument('--checkpoint-dir', type=str, default='checkpoints',
                        help='Directory to save checkpoints')
    parser.add_argument('--save-freq', type=int, default=100,
                        help='Checkpoint save frequency (iterations)')

    # Misc
    parser.add_argument('--seed', type=int, default=42,
                        help='Random seed')
    parser.add_argument('--compile', action='store_true',
                        help='Use torch.compile for faster training')

    args = parser.parse_args()

    # Create checkpoint directory
    import os
    os.makedirs(args.checkpoint_dir, exist_ok=True)

    # Run training
    train(args)


if __name__ == '__main__':
    main()