Current version. Length stuck at 2?
Joseph Suarez committed Feb 10, 2025
1 parent bd8120a commit 2cdd770
Showing 6 changed files with 44 additions and 18 deletions.
3 changes: 3 additions & 0 deletions clean_pufferl.py
@@ -194,6 +194,9 @@ def train(data):
experience.returns_np = advantages_np + experience.values_np
experience.b_returns = experience.b_advantages + experience.b_values

+ # Clamp action to [-1, 1]
+ experience.b_actions = torch.clamp(experience.b_actions, -1, 1)

for epoch in range(config.update_epochs):
lstm_state = None
for mb in range(experience.num_minibatches):
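Context for the clamp above (a minimal sketch, not the repo's code): a diagonal-Gaussian policy can sample components outside the [-1, 1] range a bounded simulator accepts, so evaluating the update on the clamped batch keeps the stored actions consistent with what was actually executed. The names mu, sigma, and raw_actions below are illustrative assumptions.

import torch

mu = torch.zeros(4, 3)                      # batch of 4, action_dim of 3 (illustrative)
sigma = torch.full((4, 3), 0.5)
dist = torch.distributions.Normal(mu, sigma)

raw_actions = dist.sample()                 # may fall outside [-1, 1]
clamped = torch.clamp(raw_actions, -1, 1)   # what a [-1, 1]-bounded env effectively receives

# Once any component is clamped, the log-probs evaluated in the update differ,
# so clamping the stored batch keeps them consistent with the executed actions.
print(dist.log_prob(raw_actions).sum(-1))
print(dist.log_prob(clamped).sum(-1))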
12 changes: 6 additions & 6 deletions config/morph.ini
@@ -34,18 +34,18 @@ eval_timesteps = 100_000
num_workers = 1
num_envs = 1
batch_size = 65536
- minibatch_size = 4096
+ minibatch_size = 16384

disc_coef = 5.0

- update_epochs = 6
- bptt_horizon = 32
+ update_epochs = 4
+ bptt_horizon = 8
anneal_lr = False
- gae_lambda = 0.95
- gamma = 0.99
+ gae_lambda = 0.99
+ gamma = 0.95
clip_coef = 0.2
clip_vloss = True
- vf_coef = 0.5
+ vf_coef = 2.0
vf_clip_coef = 0.2
max_grad_norm = 1.0
ent_coef = 0.0
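For reference on the gamma/gae_lambda swap above, a minimal GAE sketch (standard formulation, not the repo's implementation): gamma discounts future rewards, while gae_lambda controls how far bootstrapped TD errors are mixed into the advantage estimate. Inputs are assumed to be 1-D float arrays of equal length.

import numpy as np

def gae(rewards, values, dones, last_value, gamma=0.95, gae_lambda=0.99):
    # dones[t] marks that the episode ended after step t.
    T = len(rewards)
    advantages = np.zeros(T, dtype=np.float32)
    last_gae = 0.0
    for t in reversed(range(T)):
        next_value = last_value if t == T - 1 else values[t + 1]
        next_nonterminal = 1.0 - dones[t]
        delta = rewards[t] + gamma * next_value * next_nonterminal - values[t]
        last_gae = delta + gamma * gae_lambda * next_nonterminal * last_gae
        advantages[t] = last_gae
    returns = advantages + values
    return advantages, returns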
2 changes: 1 addition & 1 deletion pufferlib/cleanrl.py
@@ -30,7 +30,7 @@ def sample_logits(logits: Union[torch.Tensor, List[torch.Tensor]],
if action is None:
action = logits.sample().view(batch, -1)

- log_probs = logits.log_prob(action.view(batch, -1)).sum(1)
+ log_probs = logits.log_prob(action.view(batch, -1)).mean(1)
logits_entropy = logits.entropy().view(batch, -1).sum(1)
return action, log_probs, logits_entropy
elif is_discrete:
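A small sketch of what the sum-to-mean switch above changes (illustrative only, not the repo's code): for a factorized Gaussian the joint log-likelihood of an action is the sum of per-dimension log-probs, and taking the mean rescales that quantity by 1/action_dim, which mainly rescales the policy-gradient term. The action_dim value below is an assumption for illustration.

import torch

action_dim = 69  # illustrative only
dist = torch.distributions.Normal(torch.zeros(action_dim), torch.ones(action_dim))
action = dist.sample()

joint = dist.log_prob(action).sum()    # joint log-likelihood of the sampled action
scaled = dist.log_prob(action).mean()  # same quantity divided by action_dim
assert torch.allclose(joint / action_dim, scaled)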
5 changes: 1 addition & 4 deletions pufferlib/environments/morph/environment.py
@@ -1,7 +1,6 @@
import time
import argparse
import functools

from pufferlib.environments.morph.humanoid_phc import HumanoidPHC
from pufferlib.environments.morph.render_env import HumanoidRenderEnv

@@ -76,8 +75,6 @@ def step(self, actions_np):
if self.clip_actions:
actions_np = np.clip(actions_np, -1, 1)

self.actions[:] = torch.from_numpy(actions_np)

# obs, reward, done are put into the buffers
self.env.step(self.actions)
self.demo = self.env.demo
@@ -86,7 +83,7 @@
self.terminals[:] = self.env.reset_buf
done_indices = torch.nonzero(self.terminals).squeeze(-1)
if len(done_indices) > 0:
- self.observations[done_indices] = self.env.reset(done_indices)[done_indices]
+ self.env.reset(done_indices)
self._infos["episode_return"] += self.episode_returns[done_indices].tolist()
self._infos["episode_length"] += self.episode_lengths[done_indices].tolist()
self.episode_returns[done_indices] = 0
8 changes: 5 additions & 3 deletions pufferlib/environments/morph/humanoid_phc.py
@@ -903,7 +903,9 @@ def _load_motion(self, motion_train_file, motion_test_file=None):
"device": self.device,
"fix_height": FixHeightMode.full_fix,
"min_length": self._min_motion_len,
"max_length": self._max_motion_len,
# NOTE: this max_length determines the training time, so using 300 for now
# TODO: find a way to evaluate full motion, probably not during training
"max_length": self.max_episode_length,
"im_eval": self.flag_im_eval,
"multi_thread": False, # CHECK ME: need to config?
"smpl_type": self.humanoid_type,
@@ -924,7 +926,7 @@
gender_betas=self.humanoid_shapes.cpu(),
limb_weights=self.humanoid_limb_and_weights.cpu(),
random_sample=(not self.flag_test) and (not self.seq_motions),
- max_len=-1 if self.flag_test else self.max_episode_length,
+ # max_len=-1 if self.flag_test else self.max_episode_length, # NOTE: this is ignored in motion lib
start_idx=self._motion_sample_start_idx,
)

@@ -1659,7 +1661,7 @@ def resample_motions(self):
limb_weights=self.humanoid_limb_and_weights.cpu(),
gender_betas=self.humanoid_shapes.cpu(),
random_sample=(not self.flag_test) and (not self.seq_motions),
- max_len=-1 if self.flag_test else self.max_episode_length,
+ # max_len=-1 if self.flag_test else self.max_episode_length, # NOTE: this is ignored in motion lib
)

time = self.progress_buf * self.dt + self._motion_start_times + self._motion_start_times_offset
32 changes: 28 additions & 4 deletions pufferlib/environments/morph/torch.py
@@ -4,7 +4,6 @@

import pufferlib.models


class Recurrent(pufferlib.models.LSTMWrapper):
def __init__(self, env, policy, input_size=512, hidden_size=512, num_layers=1):
super().__init__(env, policy, input_size, hidden_size, num_layers)
@@ -14,6 +13,12 @@ def __init__(self, env, input_dim, action_dim, demo_dim, hidden):
super().__init__()
self.is_continuous = True

+ self.actor_mlp = nn.Sequential(
+ layer_init(nn.Linear(input_dim, hidden)),
+ nn.ReLU(),
+ )
+
+ '''
self.actor_mlp = nn.Sequential(
layer_init(nn.Linear(input_dim, 2048)),
nn.SiLU(),
@@ -28,17 +33,26 @@ def __init__(self, env, input_dim, action_dim, demo_dim, hidden):
layer_init(nn.Linear(512, hidden)),
nn.SiLU(),
)
+ '''
+ '''
self.mu = nn.Linear(hidden, action_dim)
self.sigma = nn.Parameter(
torch.zeros(action_dim, requires_grad=False, dtype=torch.float32),
requires_grad=False,
)
nn.init.constant_(self.sigma, -2.9)
- #self.mu = pufferlib.pytorch.layer_init(
- # nn.Linear(hidden, action_dim), std=0.01)
- #self.sigma = nn.Parameter(torch.zeros(1, action_dim))
+ '''
+ self.mu = pufferlib.pytorch.layer_init(
+ nn.Linear(hidden, action_dim), std=0.01)
+ self.sigma = nn.Parameter(torch.zeros(1, action_dim))

### Separate Critic
+ self.critic_mlp = nn.Sequential(
+ layer_init(nn.Linear(input_dim, hidden)),
+ nn.ReLU(),
+ )
+
+ '''
self.critic_mlp = nn.Sequential(
layer_init(nn.Linear(input_dim, 2048)),
nn.SiLU(),
@@ -53,6 +67,7 @@ def __init__(self, env, input_dim, action_dim, demo_dim, hidden):
layer_init(nn.Linear(512, action_dim)),
nn.SiLU(),
)
+ '''
self.value = nn.Linear(hidden, 1)

### Discriminator
@@ -63,8 +78,17 @@ def __init__(self, env, input_dim, action_dim, demo_dim, hidden):
nn.ReLU(),
)
self._disc_logits = layer_init(torch.nn.Linear(hidden, 1))
+ self.obs_mean = None

def forward(self, observations):
+ if self.obs_mean is None:
+ self.obs_mean = torch.mean(observations, dim=0)
+ self.obs_std = torch.std(observations, dim=0)
+
+ observations = torch.clamp(
+ (observations - self.obs_mean) / self.obs_std,
+ -10.0, 10.0
+ )
hidden, lookup = self.encode_observations(observations)
actions, value = self.decode_actions(hidden, lookup)
return actions, value
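The normalization added to forward above freezes per-feature statistics from whichever batch arrives first. A standalone sketch of that pattern follows (illustrative class name, not the repo's API); the epsilon on the std guards against division by zero, and a running-mean variant is the usual alternative if the first batch is not representative.

import torch
import torch.nn as nn

class FirstBatchNormalizer(nn.Module):
    # Freeze per-feature mean/std from the first batch seen, then clamp outliers.
    def __init__(self, clip=10.0):
        super().__init__()
        self.clip = clip
        self.mean = None
        self.std = None

    def forward(self, obs):
        if self.mean is None:
            # Statistics are taken once and never updated afterwards.
            self.mean = obs.mean(dim=0).detach()
            self.std = obs.std(dim=0).detach().clamp_min(1e-5)
        return torch.clamp((obs - self.mean) / self.std, -self.clip, self.clip)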
