Commit

some test with the hopper
valentincuzin committed Jan 6, 2025
1 parent 996e1cd commit 7e2e517
Showing 8 changed files with 323 additions and 32 deletions.
2 changes: 2 additions & 0 deletions src/Environments/Hopper.py
@@ -28,6 +28,8 @@ def success_func(self, env: gym.Env, info: dict) -> tuple[bool|bool]:

if info["terminated"]:
return False, True
elif info["x_position"] > 5.0:
return True, False
else:
return False, False

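For reference, a standalone sketch of how the new success rule can be exercised against Gymnasium's Hopper-v5 (the helper name hopper_success and the random-action rollout are illustrative only; info["terminated"] is set manually here, mirroring what test_policy_hf in PolicyTrainer.py does, and the 5.0 m threshold comes from the lines above):

import gymnasium as gym

def hopper_success(info: dict) -> tuple[bool, bool]:
    # Same rule as the diff above: failure when the episode terminated
    # (the hopper fell over), success once the torso has moved past x = 5.0 m.
    if info["terminated"]:
        return False, True
    elif info["x_position"] > 5.0:
        return True, False
    return False, False

env = gym.make("Hopper-v5")
obs, _ = env.reset(seed=0)
for _ in range(1000):
    obs, _, term, trunc, info = env.step(env.action_space.sample())
    info["terminated"] = term  # mirrors what test_policy_hf injects into info
    is_success, is_failure = hopper_success(info)
    if is_success or is_failure or trunc:
        break
env.close()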
2 changes: 1 addition & 1 deletion src/Environments/Prompt.py
@@ -27,7 +27,7 @@ class Prompt(Enum):
""",
}
HOPPER = {
"Goal": "move forward as quickly as possible",
"Goal": "Control the Hopper to move in the forward direction",
"Observation Space": """Box(-inf, inf, (11,), float64)
The observation space consists of the following parts (in order):
3 changes: 2 additions & 1 deletion src/PolicyTrainer/PolicyTrainer.py
@@ -66,7 +66,7 @@ def _learning(self, state: State, queue: Queue = None) -> None:
         )
         vec_env, model, numvenv = self._generate_env_model(state.reward_func, self.numenvs)
         training_callback = TrainingInfoCallback()
-        policy = model.learn(total_timesteps=self.timeout, callback=training_callback, progress_bar=True) # , progress_bar=True
+        policy = model.learn(total_timesteps=self.timeout, callback=training_callback) # , progress_bar=True
         policy.save(f"model/{self.env_name}_{state.idx}.pth")
         metrics = training_callback.get_metrics()
         #self.logger.debug(f"{state.idx} TRAINING METRICS: {metrics}")
@@ -212,6 +212,7 @@ def test_policy_hf(self, policy_path: str, nb_episodes: int = 5):
                 actions, _ = policy.predict(obs)
                 obs, _, term, trunc, info = env.step(actions)
                 if term or trunc:
+                    self.logger.info(info)
                     info["TimeLimit.truncated"] = trunc
                     info["terminated"] = term
                     is_success, _ = self.success_func(env, info)
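TrainingInfoCallback itself is not part of this diff; as a rough, hypothetical sketch of the pattern the _learning call above relies on (class and metric names here are illustrative, not the project's actual implementation), a stable-baselines3 callback that collects episode statistics looks like this:

import os
from stable_baselines3 import PPO
from stable_baselines3.common.callbacks import BaseCallback

class EpisodeMetricsCallback(BaseCallback):
    """Hypothetical stand-in for TrainingInfoCallback: records episode returns/lengths."""

    def __init__(self):
        super().__init__()
        self.episode_returns = []
        self.episode_lengths = []

    def _on_step(self) -> bool:
        # SB3 exposes the env infos of the current rollout step via self.locals;
        # Monitor-wrapped envs add an "episode" dict when an episode ends.
        for info in self.locals.get("infos", []):
            episode = info.get("episode")
            if episode is not None:
                self.episode_returns.append(episode["r"])
                self.episode_lengths.append(episode["l"])
        return True

    def get_metrics(self) -> dict:
        return {"returns": self.episode_returns, "lengths": self.episode_lengths}

os.makedirs("model", exist_ok=True)
model = PPO("MlpPolicy", "Hopper-v5", verbose=0)
callback = EpisodeMetricsCallback()
policy = model.learn(total_timesteps=10_000, callback=callback)  # progress bar left off, as in the diff
policy.save("model/Hopper-v5_1.pth")
print(callback.get_metrics())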
245 changes: 245 additions & 0 deletions src/log/Hopper_v5_log.csv

Large diffs are not rendered by default.

54 changes: 54 additions & 0 deletions src/log/LunarLander_v3_log.csv

Large diffs are not rendered by default.

6 changes: 3 additions & 3 deletions src/main.py
@@ -38,18 +38,18 @@ def main():
     memory.
     """
     parse_logger()
-    env_type = LunarLander(Algo.PPO)
+    env_type = Hopper(Algo.PPO)
     model = 'qwen2.5-coder'
     human_feedback = False
     LoggerCSV(env_type, model)
     viral = VIRAL(
         env_type=env_type, model=model, hf=human_feedback, training_time=300_000,
         numenvs=2, options=additional_options)
-    viral.generate_context(Prompt.LUNAR_LANDER.value)
+    viral.generate_context(Prompt.HOPPER.value)
     viral.generate_reward_function(n_init=1, n_refine=2)
     for state in viral.memory:
         viral.logger.info(state)

 if __name__ == "__main__":
-    for _ in range(10):
+    for _ in range(3):
         main()
39 changes: 14 additions & 25 deletions src/test_function.py
@@ -5,7 +5,7 @@
 from log.LoggerCSV import LoggerCSV
 from RLAlgo.DirectSearch import DirectSearch
 from RLAlgo.Reinforce import Reinforce
-from Environments import Prompt, Algo, CartPole, LunarLander
+from Environments import Prompt, Algo, CartPole, LunarLander, Hopper
 from VIRAL import VIRAL
 from LLM.LLMOptions import additional_options

@@ -37,36 +37,25 @@ def main():
     memory.
     """
     parse_logger()
-    env_type = LunarLander(Algo.PPO)
+    env_type = Hopper(Algo.PPO)
     model = 'qwen2.5-coder'
     human_feedback = True
     LoggerCSV(env_type, model)
     viral = VIRAL(
         env_type=env_type, model=model, hf=human_feedback, training_time=400_000, numenvs=2, options=additional_options)
     viral.test_reward_func("""
-def reward_func(observations: np.ndarray, is_success: bool, is_failure: bool) -> float:
-    x, y, v_x, v_y, theta, omega, leg_1, leg_2 = observations
-    # Penalty for altitude and horizontal distance
-    altitude_penalty = -abs(y) * 0.5 # Scaled down to emphasize landing stability over altitude
-    distance_penalty = abs(x)
-    # Reward for landing safely
-    landing_reward = 100 if is_success else -50 if is_failure else 0
-    # Penalty for angular deviation from vertical
-    angular_penalty = abs(theta) * 1.0 # Scaled down to make it less significant
-    # Penalize large angular velocity
-    angular_velocity_penalty = abs(omega)
-    # Reward for maintaining leg contact with the ground
-    leg_contact_reward = 5 if leg_1 == 1 and leg_2 == 1 else -2 # Adjusted weights to be more punitive
-    # Final reward calculation
-    total_reward = landing_reward + altitude_penalty + distance_penalty - angular_penalty - angular_velocity_penalty + leg_contact_reward
-    return max(total_reward, 0)""")
+def reward_func(observations:np.ndarray, is_success:bool, is_failure:bool) -> float:
+    if is_success:
+        return 10.0
+    elif is_failure:
+        return -5.0
+    else:
+        # Reward based on joint angles and torques to promote efficient movement
+        joint_angles = observations[:9]
+        torques = observations[9:18]
+        angle_reward = np.sum(np.exp(-np.abs(joint_angles)))
+        torque_reward = np.sum(np.exp(-np.abs(torques)))
+        return 0.5 * (angle_reward + torque_reward)""")
     viral.policy_trainer.test_policy_hf("model/LunarLander-v3_1.pth", 5)
     for state in viral.memory:
         viral.logger.info(state)
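A quick standalone harness for trying this generated reward function outside of VIRAL (a sketch only: reward_func is copied from the lines above with an extra comment; the observation-layout note assumes Gymnasium's default Hopper-v5 ordering of 5 positional values followed by 6 velocities):

import gymnasium as gym
import numpy as np

def reward_func(observations: np.ndarray, is_success: bool, is_failure: bool) -> float:
    if is_success:
        return 10.0
    elif is_failure:
        return -5.0
    else:
        # Reward based on joint angles and torques to promote efficient movement.
        # Note: Hopper-v5 observations have 11 entries (indices 0-4 are the torso
        # height/angle and joint angles, 5-10 are velocities), so observations[9:18]
        # only picks up the last two velocity terms.
        joint_angles = observations[:9]
        torques = observations[9:18]
        angle_reward = np.sum(np.exp(-np.abs(joint_angles)))
        torque_reward = np.sum(np.exp(-np.abs(torques)))
        return 0.5 * (angle_reward + torque_reward)

env = gym.make("Hopper-v5")
obs, _ = env.reset(seed=0)
obs, _, term, trunc, info = env.step(env.action_space.sample())
print(reward_func(obs, is_success=False, is_failure=term))
env.close()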
4 changes: 2 additions & 2 deletions src/visualise.py
@@ -43,9 +43,9 @@ def main():
     memory.
     """
     parse_logger()
-    env_type = LunarLander(Algo.PPO)
+    env_type = Hopper(Algo.PPO)
     p = PolicyTrainer([], env_type, 1, 2)
-    p.test_policy_hf("model/LunarLander-v3_1.pth", 5)
+    p.test_policy_hf("model/Hopper-v5_1.pth", 5)

 if __name__ == "__main__":
     main()
