Current min version. Still lots to cut. Needs data to match momask
Joseph Suarez committed Feb 8, 2025
1 parent 26eb3f3 commit c238035
Showing 8 changed files with 5,013 additions and 15 deletions.
75 changes: 60 additions & 15 deletions clean_pufferl.py
@@ -89,7 +89,10 @@ def evaluate(data):
env_id = env_id.tolist()

with profile.eval_misc:
data.global_step += sum(mask)
if isinstance(mask, torch.Tensor):
data.global_step += mask.sum().item()
else:
data.global_step += sum(mask)

o = torch.as_tensor(o)
o_device = o.to(config.device)
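The global_step bookkeeping above only needs to handle masks that may arrive either as a torch tensor or as a plain list/array. A tiny equivalent helper (hypothetical name, not part of the library) capturing the same branch:

import torch

def count_valid(mask) -> int:
    # Number of unmasked (learner) steps; accepts a torch.Tensor or a
    # Python list / numpy array of 0/1 flags.
    if isinstance(mask, torch.Tensor):
        return int(mask.sum().item())
    return int(sum(mask))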
@@ -116,7 +119,10 @@ def evaluate(data):
actions = actions.cpu().numpy()
mask = torch.as_tensor(mask)# * policy.mask)
o = o if config.cpu_offload else o_device
experience.store(o, value, actions, logprob, r, d, env_id, mask)

state = data.vecenv.state
demo = data.vecenv.demo
experience.store(o, state, demo, value, actions, logprob, r, d, env_id, mask)

for i in info:
for k, v in pufferlib.utils.unroll_nested_dict(i):
@@ -156,21 +162,44 @@ def train(data):
dones_np = experience.dones_np[idxs]
values_np = experience.values_np[idxs]
rewards_np = experience.rewards_np[idxs]
# TODO: bootstrap between segment bounds
advantages_np = compute_gae(dones_np, values_np,
rewards_np, config.gamma, config.gae_lambda)
experience.flatten_batch(advantages_np)
experience.flatten_batch()


# Optimizing the policy and value network
total_minibatches = experience.num_minibatches * config.update_epochs
mean_pg_loss, mean_v_loss, mean_entropy_loss = 0, 0, 0
mean_old_kl, mean_kl, mean_clipfrac = 0, 0, 0

# Compute adversarial reward. Note: discriminator doesn't get
# updated as often this way, but GAE is more accurate
state = experience.state.view(experience.num_minibatches,
config.minibatch_size, experience.state.shape[-1])
adversarial_reward = torch.zeros(
experience.num_minibatches, config.minibatch_size).to(config.device)
with torch.no_grad():
for mb in range(experience.num_minibatches):
adversarial_reward[mb] = -torch.log(1 - data.policy.policy.discriminate(state[mb]).squeeze())

# TODO: NaNs in adversarial reward and GAE
adversarial_reward_np = adversarial_reward.cpu().numpy().ravel()
advantages_np = compute_gae(dones_np, values_np,
rewards_np + adversarial_reward_np, config.gamma, config.gae_lambda)
advantages = torch.as_tensor(advantages_np).to(config.device)
experience.b_advantages = advantages.reshape(experience.minibatch_rows,
experience.num_minibatches, experience.bptt_horizon).transpose(0, 1).reshape(
experience.num_minibatches, experience.minibatch_size)
experience.returns_np = advantages_np + experience.values_np
experience.b_returns = experience.b_advantages + experience.b_values

for epoch in range(config.update_epochs):
lstm_state = None
for mb in range(experience.num_minibatches):
# TODO: bootstrap between segment bounds
with profile.train_misc:
obs = experience.b_obs[mb]
obs = obs.to(config.device)
state = experience.b_state[mb]
demo = experience.b_demo[mb].to(config.device)
atn = experience.b_actions[mb]
log_probs = experience.b_logprobs[mb]
val = experience.b_values[mb]
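The adversarial reward computed above follows the GAIL/AMP convention r = -log(1 - D(s)), evaluated minibatch-by-minibatch before GAE so the imitation term is folded into the advantages. The TODO about NaNs is typical of this form: when the discriminator saturates near 1, log(1 - D) blows up. A minimal sketch of a numerically stable variant, assuming discriminate() returns raw logits (which the BCEWithLogitsLoss usage below suggests); the helper name is illustrative, not part of the codebase:

import torch
import torch.nn.functional as F

def adversarial_reward_from_logits(disc_logits: torch.Tensor) -> torch.Tensor:
    # -log(1 - sigmoid(x)) == softplus(x), so the reward stays finite even
    # when the discriminator is very confident the sample is "real".
    return F.softplus(disc_logits.squeeze(-1))

logits = torch.tensor([-20.0, 0.0, 20.0])
print(adversarial_reward_from_logits(logits))  # approximately [0.0, 0.693, 20.0]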
@@ -188,6 +217,14 @@ def train(data):
action=atn,
)

# Discriminator loss
# BUG: Data shape is wrong for morph. State should have same shape as demo
disc_state = data.policy.policy.discriminate(state)
disc_demo = data.policy.policy.discriminate(demo)
disc_loss_agent = torch.nn.BCEWithLogitsLoss()(disc_state, torch.zeros_like(disc_state))
disc_loss_demo = torch.nn.BCEWithLogitsLoss()(disc_demo, torch.ones_like(disc_demo))
disc_loss = 0.5 * (disc_loss_agent + disc_loss_demo)

if config.device == 'cuda':
torch.cuda.synchronize()
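The discriminator objective above is the standard GAIL/AMP classification loss: rollout states are labeled 0 and reference-motion (demo) features are labeled 1, both scored by the same discriminate() head, which is also why the BUG note says state and demo must share a feature layout. A self-contained toy version of the same objective; the MLP and the 358-dim feature size are placeholders mirroring the state buffer below, not the actual discriminate() architecture:

import torch
import torch.nn as nn

disc = nn.Sequential(nn.Linear(358, 256), nn.ReLU(), nn.Linear(256, 1))
bce = nn.BCEWithLogitsLoss()

agent_state = torch.randn(64, 358)  # rollout features -> target 0 ("generated")
demo_state = torch.randn(64, 358)   # reference-motion features -> target 1 ("real")

d_agent = disc(agent_state)
d_demo = disc(demo_state)
disc_loss = 0.5 * (bce(d_agent, torch.zeros_like(d_agent))
                   + bce(d_demo, torch.ones_like(d_demo)))
disc_loss.backward()  # gradients only reach the discriminator parameters here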

@@ -228,7 +265,7 @@ def train(data):
v_loss = 0.5 * ((newvalue - ret) ** 2).mean()

entropy_loss = entropy.mean()
loss = pg_loss - config.ent_coef * entropy_loss + v_loss * config.vf_coef
loss = pg_loss - config.ent_coef*entropy_loss + config.vf_coef*v_loss + config.disc_coef*disc_loss

with profile.learn:
data.optimizer.zero_grad()
@@ -399,6 +436,10 @@ def __init__(self, batch_size, bptt_horizon, minibatch_size, obs_shape, obs_dtyp
obs_device = device if not pin else 'cpu'
self.obs=torch.zeros(batch_size, *obs_shape, dtype=obs_dtype,
pin_memory=pin, device=device if not pin else 'cpu')
self.demo=torch.zeros(batch_size, 576, dtype=obs_dtype,
pin_memory=pin, device=device if not pin else 'cpu')
self.state=torch.zeros(batch_size, 358, dtype=obs_dtype,
pin_memory=pin, device=device if not pin else 'cpu')
self.actions=torch.zeros(batch_size, *atn_shape, dtype=atn_dtype, pin_memory=pin)
self.logprobs=torch.zeros(batch_size, pin_memory=pin)
self.rewards=torch.zeros(batch_size, pin_memory=pin)
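The new demo (576-dim) and state (358-dim) buffers follow the same pattern as obs: allocated once for the whole batch and, when pinning is enabled, kept in page-locked CPU memory so later host-to-device copies can be asynchronous. The feature sizes are hard-coded here and presumably correspond to the PHC demo/state observation widths. A toy illustration of the pinned-copy pattern (sizes made up):

import torch

if torch.cuda.is_available():
    staging = torch.zeros(1024, 358, pin_memory=True)  # page-locked CPU buffer
    on_gpu = staging.to('cuda', non_blocking=True)      # async copy, overlaps with compute
    torch.cuda.synchronize()                            # wait before reading on_gpu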
@@ -443,13 +484,15 @@ def __init__(self, batch_size, bptt_horizon, minibatch_size, obs_shape, obs_dtyp
def full(self):
return self.ptr >= self.batch_size

def store(self, obs, value, action, logprob, reward, done, env_id, mask):
def store(self, obs, state, demo, value, action, logprob, reward, done, env_id, mask):
# Mask learner and ensure indices do not exceed the batch size
ptr = self.ptr
indices = torch.where(mask)[0].numpy()[:self.batch_size - ptr]
end = ptr + len(indices)

self.obs[ptr:end] = obs.to(self.obs.device)[indices]
self.state[ptr:end] = state.to(self.state.device)[indices]
self.demo[ptr:end] = demo.to(self.demo.device)[indices]
self.values_np[ptr:end] = value.cpu().numpy()[indices]
self.actions_np[ptr:end] = action[indices]
self.logprobs_np[ptr:end] = logprob.cpu().numpy()[indices]
@@ -465,29 +508,31 @@ def sort_training_data(self):
self.b_idxs_obs = torch.as_tensor(idxs.reshape(
self.minibatch_rows, self.num_minibatches, self.bptt_horizon
).transpose(1,0,-1)).to(self.obs.device).long()
self.b_idxs_state = torch.as_tensor(idxs.reshape(
self.minibatch_rows, self.num_minibatches, self.bptt_horizon
).transpose(1,0,-1)).to(self.state.device).long()
self.b_idxs_demo = torch.as_tensor(idxs.reshape(
self.minibatch_rows, self.num_minibatches, self.bptt_horizon
).transpose(1,0,-1)).to(self.demo.device).long()
self.b_idxs = self.b_idxs_obs.to(self.device)
self.b_idxs_flat = self.b_idxs.reshape(
self.num_minibatches, self.minibatch_size)
self.sort_keys = []
return idxs

def flatten_batch(self, advantages_np):
advantages = torch.as_tensor(advantages_np).to(self.device)
def flatten_batch(self):
b_idxs, b_flat = self.b_idxs, self.b_idxs_flat
self.b_actions = self.actions.to(self.device, non_blocking=True)
self.b_logprobs = self.logprobs.to(self.device, non_blocking=True)
self.b_dones = self.dones.to(self.device, non_blocking=True)
self.b_values = self.values.to(self.device, non_blocking=True)
self.b_advantages = advantages.reshape(self.minibatch_rows,
self.num_minibatches, self.bptt_horizon).transpose(0, 1).reshape(
self.num_minibatches, self.minibatch_size)
self.returns_np = advantages_np + self.values_np
self.b_obs = self.obs[self.b_idxs_obs]
self.b_state = self.state[self.b_idxs_state]
self.b_demo = self.demo[self.b_idxs_demo]
self.b_actions = self.b_actions[b_idxs].contiguous()
self.b_logprobs = self.b_logprobs[b_idxs]
self.b_dones = self.b_dones[b_idxs]
self.b_values = self.b_values[b_flat]
self.b_returns = self.b_advantages + self.b_values

class Utilization(Thread):
def __init__(self, delay=1, maxlen=20):
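Both the b_advantages reshape in train() and the index tensors built in sort_training_data() rely on the same layout: the flat batch of batch_size = minibatch_rows * num_minibatches * bptt_horizon steps is viewed as (minibatch_rows, num_minibatches, bptt_horizon), transposed so minibatches lead, then flattened to (num_minibatches, minibatch_size) with each BPTT segment kept contiguous. A toy shape check with made-up sizes:

import torch

minibatch_rows, num_minibatches, bptt_horizon = 4, 3, 8
minibatch_size = minibatch_rows * bptt_horizon                # 32
batch_size = minibatch_rows * num_minibatches * bptt_horizon  # 96

advantages = torch.arange(batch_size, dtype=torch.float32)

# Same reshape/transpose used for b_advantages in train()
b_advantages = advantages.reshape(
    minibatch_rows, num_minibatches, bptt_horizon
).transpose(0, 1).reshape(num_minibatches, minibatch_size)

print(b_advantages.shape)   # torch.Size([3, 32])
print(b_advantages[0, :8])  # one contiguous BPTT segment: steps 0..7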
12 changes: 12 additions & 0 deletions pufferlib/environments/morph/__init__.py
@@ -0,0 +1,12 @@
from .environment import env_creator

try:
import torch
except ImportError:
pass
else:
from .torch import Policy
try:
from .torch import Recurrent
except:
Recurrent = None
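The module follows the usual pufferlib environment-package layout: env_creator returns a functools.partial of make, and make forwards its kwargs to the environment class. A usage sketch (the kwargs mirror the PHCPufferEnv constructor in environment.py below; actually running it requires the HumanoidPHC simulation backend and a CUDA device):

from pufferlib.environments.morph import env_creator

make_env = env_creator('morph')  # functools.partial(make, 'morph')
env = make_env(
    motion_file='sample_data/amass_train_take6_upright.pkl',
    has_self_collision=True,
    num_envs=32,
)
obs, _ = env.reset()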
139 changes: 139 additions & 0 deletions pufferlib/environments/morph/environment.py
@@ -0,0 +1,139 @@
import time
import argparse
import functools

from pufferlib.environments.morph.humanoid_phc import HumanoidPHC

import torch
import numpy as np

import pufferlib

def env_creator(name='morph'):
return functools.partial(make, name)

def make(name, **kwargs):
return PHCPufferEnv(**kwargs)

class PHCPufferEnv(pufferlib.PufferEnv):
def __init__(self, motion_file, has_self_collision, num_envs=32, device_type="cuda",
device_id=0, headless=True, log_interval=128):
cfg = {
'env': {
'num_envs': num_envs,
'motion_file': motion_file,
},
'robot': {
'has_self_collision': has_self_collision,
},
}
self.env = HumanoidPHC(cfg, device_type=device_type, device_id=device_id, headless=headless)
self.single_observation_space = self.env.single_observation_space
self.single_action_space = self.env.single_action_space
self.num_agents = self.num_envs = self.env.num_envs
self.device = self.env.device

# Check the buffer data types, match them to puffer
buffers = pufferlib.namespace(
observations=self.env.obs_buf,
rewards=self.env.rew_buf,
terminals=self.env.reset_buf,
truncations=torch.zeros_like(self.env.reset_buf),
masks=torch.ones_like(self.env.reset_buf),
actions=torch.zeros(
(self.num_agents, *self.single_action_space.shape), dtype=torch.float, device=self.device
),
)

super().__init__(buffers)

self.log_interval = log_interval
self.episode_returns = torch.zeros(self.num_envs, dtype=torch.float32, device=self.device)
self.episode_lengths = torch.zeros(self.num_envs, dtype=torch.int32, device=self.device)
self._infos = {
"episode_return": [],
"episode_length": [],
}

def reset(self, seed=None):
self.env.reset()
self.demo = self.env.demo
self.state = self.env.state
return self.observations, []

def step(self, actions_np):
self.actions[:] = torch.from_numpy(actions_np)

# obs, reward, done are put into the buffers
self.env.step(self.actions)
self.demo = self.env.demo
self.state = self.env.state

# NOTE: rl-games resets done envs in the training script. Keeping this here for now.
# TODO: Move this into the env
done_indices = torch.nonzero(self.terminals).squeeze(-1)
if len(done_indices) > 0:
self.observations[done_indices] = self.env.reset(done_indices)[done_indices]

self._infos["episode_return"] += self.episode_returns[done_indices].tolist()
self._infos["episode_length"] += self.episode_lengths[done_indices].tolist()
self.episode_returns[done_indices] = 0
self.episode_lengths[done_indices] = 0

self.episode_returns += self.rewards
self.episode_lengths += 1

# TODO: self.env.extras has infos. Extract useful info?
info = self.mean_and_log()

return self.observations, self.rewards, self.terminals, self.truncations, info

def close(self):
self.env.close()

def mean_and_log(self):
if len(self._infos["episode_return"]) < self.log_interval:
return []

info = {
"episode_return": np.mean(self._infos["episode_return"]),
"episode_length": np.mean(self._infos["episode_length"]),
}
self._infos["episode_return"].clear()
self._infos["episode_length"].clear()

return [info]


if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument("-n", "--num_envs", type=int, default=32)
parser.add_argument("-m", "--motion_file", type=str, default="sample_data/amass_train_take6_upright.pkl")
parser.add_argument("--disable_self_collision", action="store_true")
args = parser.parse_args()

def test_perf(env, timeout=10):
steps = 0
start = time.time()
env.reset()
actions = env.action_space.sample()

print("Starting perf test...")
while time.time() - start < timeout:
env.step(actions)
steps += env.num_agents

end = time.time()
sps = int(steps / (end - start))
print(f"Steps: {steps}, SPS: {sps}")

cfg = {
"env": {
"num_envs": args.num_envs,
"motion_file": args.motion_file,
},
"robot": {"has_self_collision": not args.disable_self_collision},
}

env = PHCPufferEnv(**cfg["env"], **cfg["robot"])
test_perf(env)
