Current min version. Still lots to cut. Needs data to match momask
Joseph Suarez committed Feb 8, 2025
1 parent 26eb3f3 commit c238035
Showing 8 changed files with 5,013 additions and 15 deletions.
75 changes: 60 additions & 15 deletions clean_pufferl.py
@@ -89,7 +89,10 @@ def evaluate(data):
env_id = env_id.tolist()

with profile.eval_misc:
data.global_step += sum(mask)
if isinstance(mask, torch.Tensor):
data.global_step += mask.sum().item()
else:
data.global_step += sum(mask)

o = torch.as_tensor(o)
o_device = o.to(config.device)
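The global_step bookkeeping above only needs to handle masks that may arrive either as a torch tensor or as a plain list/array. A tiny equivalent helper (hypothetical name, not part of the library) capturing the same branch:

import torch

def count_valid(mask) -> int:
    # Number of unmasked (learner) steps; accepts a torch.Tensor or a
    # Python list / numpy array of 0/1 flags.
    if isinstance(mask, torch.Tensor):
        return int(mask.sum().item())
    return int(sum(mask))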
@@ -116,7 +119,10 @@ def evaluate(data):
actions = actions.cpu().numpy()
mask = torch.as_tensor(mask)# * policy.mask)
o = o if config.cpu_offload else o_device
experience.store(o, value, actions, logprob, r, d, env_id, mask)

state = data.vecenv.state
demo = data.vecenv.demo
experience.store(o, state, demo, value, actions, logprob, r, d, env_id, mask)

for i in info:
for k, v in pufferlib.utils.unroll_nested_dict(i):
@@ -156,21 +162,44 @@ def train(data):
dones_np = experience.dones_np[idxs]
values_np = experience.values_np[idxs]
rewards_np = experience.rewards_np[idxs]
# TODO: bootstrap between segment bounds
advantages_np = compute_gae(dones_np, values_np,
rewards_np, config.gamma, config.gae_lambda)
experience.flatten_batch(advantages_np)
experience.flatten_batch()


# Optimizing the policy and value network
total_minibatches = experience.num_minibatches * config.update_epochs
mean_pg_loss, mean_v_loss, mean_entropy_loss = 0, 0, 0
mean_old_kl, mean_kl, mean_clipfrac = 0, 0, 0

# Compute adversarial reward. Note: discriminator doesn't get
# updated as often this way, but GAE is more accurate
state = experience.state.view(experience.num_minibatches,
config.minibatch_size, experience.state.shape[-1])
adversarial_reward = torch.zeros(
experience.num_minibatches, config.minibatch_size).to(config.device)
with torch.no_grad():
for mb in range(experience.num_minibatches):
adversarial_reward[mb] = -torch.log(1 - data.policy.policy.discriminate(state[mb]).squeeze())

# TODO: NaNs in adversarial reward and GAE
adversarial_reward_np = adversarial_reward.cpu().numpy().ravel()
advantages_np = compute_gae(dones_np, values_np,
rewards_np + adversarial_reward_np, config.gamma, config.gae_lambda)
advantages = torch.as_tensor(advantages_np).to(config.device)
experience.b_advantages = advantages.reshape(experience.minibatch_rows,
experience.num_minibatches, experience.bptt_horizon).transpose(0, 1).reshape(
experience.num_minibatches, experience.minibatch_size)
experience.returns_np = advantages_np + experience.values_np
experience.b_returns = experience.b_advantages + experience.b_values

for epoch in range(config.update_epochs):
lstm_state = None
for mb in range(experience.num_minibatches):
# TODO: bootstrap between segment bounds
with profile.train_misc:
obs = experience.b_obs[mb]
obs = obs.to(config.device)
state = experience.b_state[mb]
demo = experience.b_demo[mb].to(config.device)
atn = experience.b_actions[mb]
log_probs = experience.b_logprobs[mb]
val = experience.b_values[mb]
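The adversarial reward computed above follows the GAIL/AMP convention r = -log(1 - D(s)), evaluated minibatch-by-minibatch before GAE so the imitation term is folded into the advantages. The TODO about NaNs is typical of this form: when the discriminator saturates near 1, log(1 - D) blows up. A minimal sketch of a numerically stable variant, assuming discriminate() returns raw logits (which the BCEWithLogitsLoss usage below suggests); the helper name is illustrative, not part of the codebase:

import torch
import torch.nn.functional as F

def adversarial_reward_from_logits(disc_logits: torch.Tensor) -> torch.Tensor:
    # -log(1 - sigmoid(x)) == softplus(x), so the reward stays finite even
    # when the discriminator is very confident the sample is "real".
    return F.softplus(disc_logits.squeeze(-1))

logits = torch.tensor([-20.0, 0.0, 20.0])
print(adversarial_reward_from_logits(logits))  # approximately [0.0, 0.693, 20.0]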
@@ -188,6 +217,14 @@ def train(data):
action=atn,
)

# Discriminator loss
# BUG: Data shape is wrong for morph. State should have same shape as demo
disc_state = data.policy.policy.discriminate(state)
disc_demo = data.policy.policy.discriminate(demo)
disc_loss_agent = torch.nn.BCEWithLogitsLoss()(disc_state, torch.zeros_like(disc_state))
disc_loss_demo = torch.nn.BCEWithLogitsLoss()(disc_demo, torch.ones_like(disc_demo))
disc_loss = 0.5 * (disc_loss_agent + disc_loss_demo)

if config.device == 'cuda':
torch.cuda.synchronize()
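The discriminator objective above is the standard GAIL/AMP classification loss: rollout states are labeled 0 and reference-motion (demo) features are labeled 1, both scored by the same discriminate() head, which is also why the BUG note says state and demo must share a feature layout. A self-contained toy version of the same objective; the MLP and the 358-dim feature size are placeholders mirroring the state buffer below, not the actual discriminate() architecture:

import torch
import torch.nn as nn

disc = nn.Sequential(nn.Linear(358, 256), nn.ReLU(), nn.Linear(256, 1))
bce = nn.BCEWithLogitsLoss()

agent_state = torch.randn(64, 358)  # rollout features -> target 0 ("generated")
demo_state = torch.randn(64, 358)   # reference-motion features -> target 1 ("real")

d_agent = disc(agent_state)
d_demo = disc(demo_state)
disc_loss = 0.5 * (bce(d_agent, torch.zeros_like(d_agent))
                   + bce(d_demo, torch.ones_like(d_demo)))
disc_loss.backward()  # gradients only reach the discriminator parameters here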

@@ -228,7 +265,7 @@ def train(data):
v_loss = 0.5 * ((newvalue - ret) ** 2).mean()

entropy_loss = entropy.mean()
loss = pg_loss - config.ent_coef * entropy_loss + v_loss * config.vf_coef
loss = pg_loss - config.ent_coef*entropy_loss + config.vf_coef*v_loss + config.disc_coef*disc_loss

with profile.learn:
data.optimizer.zero_grad()
@@ -399,6 +436,10 @@ def __init__(self, batch_size, bptt_horizon, minibatch_size, obs_shape, obs_dtyp
obs_device = device if not pin else 'cpu'
self.obs=torch.zeros(batch_size, *obs_shape, dtype=obs_dtype,
pin_memory=pin, device=device if not pin else 'cpu')
self.demo=torch.zeros(batch_size, 576, dtype=obs_dtype,
pin_memory=pin, device=device if not pin else 'cpu')
self.state=torch.zeros(batch_size, 358, dtype=obs_dtype,
pin_memory=pin, device=device if not pin else 'cpu')
self.actions=torch.zeros(batch_size, *atn_shape, dtype=atn_dtype, pin_memory=pin)
self.logprobs=torch.zeros(batch_size, pin_memory=pin)
self.rewards=torch.zeros(batch_size, pin_memory=pin)
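The new demo (576-dim) and state (358-dim) buffers follow the same pattern as obs: allocated once for the whole batch and, when pinning is enabled, kept in page-locked CPU memory so later host-to-device copies can be asynchronous. The feature sizes are hard-coded here and presumably correspond to the PHC demo/state observation widths. A toy illustration of the pinned-copy pattern (sizes made up):

import torch

if torch.cuda.is_available():
    staging = torch.zeros(1024, 358, pin_memory=True)  # page-locked CPU buffer
    on_gpu = staging.to('cuda', non_blocking=True)      # async copy, overlaps with compute
    torch.cuda.synchronize()                            # wait before reading on_gpu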
@@ -443,13 +484,15 @@ def __init__(self, batch_size, bptt_horizon, minibatch_size, obs_shape, obs_dtyp
def full(self):
return self.ptr >= self.batch_size

def store(self, obs, value, action, logprob, reward, done, env_id, mask):
def store(self, obs, state, demo, value, action, logprob, reward, done, env_id, mask):
# Mask learner and ensure indices do not exceed the batch size
ptr = self.ptr
indices = torch.where(mask)[0].numpy()[:self.batch_size - ptr]
end = ptr + len(indices)

self.obs[ptr:end] = obs.to(self.obs.device)[indices]
self.state[ptr:end] = state.to(self.state.device)[indices]
self.demo[ptr:end] = demo.to(self.demo.device)[indices]
self.values_np[ptr:end] = value.cpu().numpy()[indices]
self.actions_np[ptr:end] = action[indices]
self.logprobs_np[ptr:end] = logprob.cpu().numpy()[indices]
@@ -465,29 +508,31 @@ def sort_training_data(self):
self.b_idxs_obs = torch.as_tensor(idxs.reshape(
self.minibatch_rows, self.num_minibatches, self.bptt_horizon
).transpose(1,0,-1)).to(self.obs.device).long()
self.b_idxs_state = torch.as_tensor(idxs.reshape(
self.minibatch_rows, self.num_minibatches, self.bptt_horizon
).transpose(1,0,-1)).to(self.state.device).long()
self.b_idxs_demo = torch.as_tensor(idxs.reshape(
self.minibatch_rows, self.num_minibatches, self.bptt_horizon
).transpose(1,0,-1)).to(self.demo.device).long()
self.b_idxs = self.b_idxs_obs.to(self.device)
self.b_idxs_flat = self.b_idxs.reshape(
self.num_minibatches, self.minibatch_size)
self.sort_keys = []
return idxs

def flatten_batch(self, advantages_np):
advantages = torch.as_tensor(advantages_np).to(self.device)
def flatten_batch(self):
b_idxs, b_flat = self.b_idxs, self.b_idxs_flat
self.b_actions = self.actions.to(self.device, non_blocking=True)
self.b_logprobs = self.logprobs.to(self.device, non_blocking=True)
self.b_dones = self.dones.to(self.device, non_blocking=True)
self.b_values = self.values.to(self.device, non_blocking=True)
self.b_advantages = advantages.reshape(self.minibatch_rows,
self.num_minibatches, self.bptt_horizon).transpose(0, 1).reshape(
self.num_minibatches, self.minibatch_size)
self.returns_np = advantages_np + self.values_np
self.b_obs = self.obs[self.b_idxs_obs]
self.b_state = self.state[self.b_idxs_state]
self.b_demo = self.demo[self.b_idxs_demo]
self.b_actions = self.b_actions[b_idxs].contiguous()
self.b_logprobs = self.b_logprobs[b_idxs]
self.b_dones = self.b_dones[b_idxs]
self.b_values = self.b_values[b_flat]
self.b_returns = self.b_advantages + self.b_values

class Utilization(Thread):
def __init__(self, delay=1, maxlen=20):
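Both the b_advantages reshape in train() and the index tensors built in sort_training_data() rely on the same layout: the flat batch of batch_size = minibatch_rows * num_minibatches * bptt_horizon steps is viewed as (minibatch_rows, num_minibatches, bptt_horizon), transposed so minibatches lead, then flattened to (num_minibatches, minibatch_size) with each BPTT segment kept contiguous. A toy shape check with made-up sizes:

import torch

minibatch_rows, num_minibatches, bptt_horizon = 4, 3, 8
minibatch_size = minibatch_rows * bptt_horizon                # 32
batch_size = minibatch_rows * num_minibatches * bptt_horizon  # 96

advantages = torch.arange(batch_size, dtype=torch.float32)

# Same reshape/transpose used for b_advantages in train()
b_advantages = advantages.reshape(
    minibatch_rows, num_minibatches, bptt_horizon
).transpose(0, 1).reshape(num_minibatches, minibatch_size)

print(b_advantages.shape)   # torch.Size([3, 32])
print(b_advantages[0, :8])  # one contiguous BPTT segment: steps 0..7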
12 changes: 12 additions & 0 deletions pufferlib/environments/morph/__init__.py
@@ -0,0 +1,12 @@
from .environment import env_creator

try:
import torch
except ImportError:
pass
else:
from .torch import Policy
try:
from .torch import Recurrent
except:
Recurrent = None
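The module follows the usual pufferlib environment-package layout: env_creator returns a functools.partial of make, and make forwards its kwargs to the environment class. A usage sketch (the kwargs mirror the PHCPufferEnv constructor in environment.py below; actually running it requires the HumanoidPHC simulation backend and a CUDA device):

from pufferlib.environments.morph import env_creator

make_env = env_creator('morph')  # functools.partial(make, 'morph')
env = make_env(
    motion_file='sample_data/amass_train_take6_upright.pkl',
    has_self_collision=True,
    num_envs=32,
)
obs, _ = env.reset()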
139 changes: 139 additions & 0 deletions pufferlib/environments/morph/environment.py
@@ -0,0 +1,139 @@
import time
import argparse
import functools

from pufferlib.environments.morph.humanoid_phc import HumanoidPHC

import torch
import numpy as np

import pufferlib

def env_creator(name='morph'):
return functools.partial(make, name)

def make(name, **kwargs):
return PHCPufferEnv(**kwargs)

class PHCPufferEnv(pufferlib.PufferEnv):
def __init__(self, motion_file, has_self_collision, num_envs=32, device_type="cuda",
device_id=0, headless=True, log_interval=128):
cfg = {
'env': {
'num_envs': num_envs,
'motion_file': motion_file,
},
'robot': {
'has_self_collision': has_self_collision,
},
}
self.env = HumanoidPHC(cfg, device_type=device_type, device_id=device_id, headless=headless)
self.single_observation_space = self.env.single_observation_space
self.single_action_space = self.env.single_action_space
self.num_agents = self.num_envs = self.env.num_envs
self.device = self.env.device

# Check the buffer data types, match them to puffer
buffers = pufferlib.namespace(
observations=self.env.obs_buf,
rewards=self.env.rew_buf,
terminals=self.env.reset_buf,
truncations=torch.zeros_like(self.env.reset_buf),
masks=torch.ones_like(self.env.reset_buf),
actions=torch.zeros(
(self.num_agents, *self.single_action_space.shape), dtype=torch.float, device=self.device
),
)

super().__init__(buffers)

self.log_interval = log_interval
self.episode_returns = torch.zeros(self.num_envs, dtype=torch.float32, device=self.device)
self.episode_lengths = torch.zeros(self.num_envs, dtype=torch.int32, device=self.device)
self._infos = {
"episode_return": [],
"episode_length": [],
}

def reset(self, seed=None):
self.env.reset()
self.demo = self.env.demo
self.state = self.env.state
return self.observations, []

def step(self, actions_np):
self.actions[:] = torch.from_numpy(actions_np)

# obs, reward, done are put into the buffers
self.env.step(self.actions)
self.demo = self.env.demo
self.state = self.env.state

# NOTE: rl-games resets done envs in the training script. Keeping this here for now.
# TODO: Move this into the env
done_indices = torch.nonzero(self.terminals).squeeze(-1)
if len(done_indices) > 0:
self.observations[done_indices] = self.env.reset(done_indices)[done_indices]

self._infos["episode_return"] += self.episode_returns[done_indices].tolist()
self._infos["episode_length"] += self.episode_lengths[done_indices].tolist()
self.episode_returns[done_indices] = 0
self.episode_lengths[done_indices] = 0

self.episode_returns += self.rewards
self.episode_lengths += 1

# TODO: self.env.extras has infos. Extract useful info?
info = self.mean_and_log()

return self.observations, self.rewards, self.terminals, self.truncations, info

def close(self):
self.env.close()

def mean_and_log(self):
if len(self._infos["episode_return"]) < self.log_interval:
return []

info = {
"episode_return": np.mean(self._infos["episode_return"]),
"episode_length": np.mean(self._infos["episode_length"]),
}
self._infos["episode_return"].clear()
self._infos["episode_length"].clear()

return [info]


if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument("-n", "--num_envs", type=int, default=32)
parser.add_argument("-m", "--motion_file", type=str, default="sample_data/amass_train_take6_upright.pkl")
parser.add_argument("--disable_self_collision", action="store_true")
args = parser.parse_args()

def test_perf(env, timeout=10):
steps = 0
start = time.time()
env.reset()
actions = env.action_space.sample()

print("Starting perf test...")
while time.time() - start < timeout:
env.step(actions)
steps += env.num_agents

end = time.time()
sps = int(steps / (end - start))
print(f"Steps: {steps}, SPS: {sps}")

cfg = {
"env": {
"num_envs": args.num_envs,
"motion_file": args.motion_file,
},
"robot": {"has_self_collision": not args.disable_self_collision},
}

env = PHCPufferEnv(**cfg["env"], **cfg["robot"])
test_perf(env)
