Commit

some test with the hopper
valentincuzin committed Jan 6, 2025
1 parent 996e1cd commit 7e2e517
Showing 8 changed files with 323 additions and 32 deletions.
2 changes: 2 additions & 0 deletions src/Environments/Hopper.py
@@ -28,6 +28,8 @@ def success_func(self, env: gym.Env, info: dict) -> tuple[bool|bool]:

if info["terminated"]:
return False, True
elif info["x_position"] > 5.0:
return True, False
else:
return False, False

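For reference, a standalone sketch of how the new success rule can be exercised against Gymnasium's Hopper-v5 (the helper name hopper_success and the random-action rollout are illustrative only; info["terminated"] is set manually here, mirroring what test_policy_hf in PolicyTrainer.py does, and the 5.0 m threshold comes from the lines above):

import gymnasium as gym

def hopper_success(info: dict) -> tuple[bool, bool]:
    # Same rule as the diff above: failure when the episode terminated
    # (the hopper fell over), success once the torso has moved past x = 5.0 m.
    if info["terminated"]:
        return False, True
    elif info["x_position"] > 5.0:
        return True, False
    return False, False

env = gym.make("Hopper-v5")
obs, _ = env.reset(seed=0)
for _ in range(1000):
    obs, _, term, trunc, info = env.step(env.action_space.sample())
    info["terminated"] = term  # mirrors what test_policy_hf injects into info
    is_success, is_failure = hopper_success(info)
    if is_success or is_failure or trunc:
        break
env.close()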
2 changes: 1 addition & 1 deletion src/Environments/Prompt.py
@@ -27,7 +27,7 @@ class Prompt(Enum):
""",
}
HOPPER = {
"Goal": "move forward as quickly as possible",
"Goal": "Control the Hopper to move in the forward direction",
"Observation Space": """Box(-inf, inf, (11,), float64)
The observation space consists of the following parts (in order):
3 changes: 2 additions & 1 deletion src/PolicyTrainer/PolicyTrainer.py
@@ -66,7 +66,7 @@ def _learning(self, state: State, queue: Queue = None) -> None:
         )
         vec_env, model, numvenv = self._generate_env_model(state.reward_func, self.numenvs)
         training_callback = TrainingInfoCallback()
-        policy = model.learn(total_timesteps=self.timeout, callback=training_callback, progress_bar=True) # , progress_bar=True
+        policy = model.learn(total_timesteps=self.timeout, callback=training_callback) # , progress_bar=True
         policy.save(f"model/{self.env_name}_{state.idx}.pth")
         metrics = training_callback.get_metrics()
         #self.logger.debug(f"{state.idx} TRAINING METRICS: {metrics}")
@@ -212,6 +212,7 @@ def test_policy_hf(self, policy_path: str, nb_episodes: int = 5):
                 actions, _ = policy.predict(obs)
                 obs, _, term, trunc, info = env.step(actions)
                 if term or trunc:
+                    self.logger.info(info)
                     info["TimeLimit.truncated"] = trunc
                     info["terminated"] = term
                     is_success, _ = self.success_func(env, info)
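TrainingInfoCallback itself is not part of this diff; as a rough, hypothetical sketch of the pattern the _learning call above relies on (class and metric names here are illustrative, not the project's actual implementation), a stable-baselines3 callback that collects episode statistics looks like this:

import os
from stable_baselines3 import PPO
from stable_baselines3.common.callbacks import BaseCallback

class EpisodeMetricsCallback(BaseCallback):
    """Hypothetical stand-in for TrainingInfoCallback: records episode returns/lengths."""

    def __init__(self):
        super().__init__()
        self.episode_returns = []
        self.episode_lengths = []

    def _on_step(self) -> bool:
        # SB3 exposes the env infos of the current rollout step via self.locals;
        # Monitor-wrapped envs add an "episode" dict when an episode ends.
        for info in self.locals.get("infos", []):
            episode = info.get("episode")
            if episode is not None:
                self.episode_returns.append(episode["r"])
                self.episode_lengths.append(episode["l"])
        return True

    def get_metrics(self) -> dict:
        return {"returns": self.episode_returns, "lengths": self.episode_lengths}

os.makedirs("model", exist_ok=True)
model = PPO("MlpPolicy", "Hopper-v5", verbose=0)
callback = EpisodeMetricsCallback()
policy = model.learn(total_timesteps=10_000, callback=callback)  # progress bar left off, as in the diff
policy.save("model/Hopper-v5_1.pth")
print(callback.get_metrics())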
245 changes: 245 additions & 0 deletions src/log/Hopper_v5_log.csv

Large diffs are not rendered by default.

54 changes: 54 additions & 0 deletions src/log/LunarLander_v3_log.csv

Large diffs are not rendered by default.

6 changes: 3 additions & 3 deletions src/main.py
@@ -38,18 +38,18 @@ def main():
     memory.
     """
     parse_logger()
-    env_type = LunarLander(Algo.PPO)
+    env_type = Hopper(Algo.PPO)
     model = 'qwen2.5-coder'
     human_feedback = False
     LoggerCSV(env_type, model)
     viral = VIRAL(
         env_type=env_type, model=model, hf=human_feedback, training_time=300_000,
         numenvs=2, options=additional_options)
-    viral.generate_context(Prompt.LUNAR_LANDER.value)
+    viral.generate_context(Prompt.HOPPER.value)
     viral.generate_reward_function(n_init=1, n_refine=2)
     for state in viral.memory:
         viral.logger.info(state)

 if __name__ == "__main__":
-    for _ in range(10):
+    for _ in range(3):
         main()
39 changes: 14 additions & 25 deletions src/test_function.py
@@ -5,7 +5,7 @@
 from log.LoggerCSV import LoggerCSV
 from RLAlgo.DirectSearch import DirectSearch
 from RLAlgo.Reinforce import Reinforce
-from Environments import Prompt, Algo, CartPole, LunarLander
+from Environments import Prompt, Algo, CartPole, LunarLander, Hopper
 from VIRAL import VIRAL
 from LLM.LLMOptions import additional_options

@@ -37,36 +37,25 @@ def main():
     memory.
     """
     parse_logger()
-    env_type = LunarLander(Algo.PPO)
+    env_type = Hopper(Algo.PPO)
     model = 'qwen2.5-coder'
     human_feedback = True
     LoggerCSV(env_type, model)
     viral = VIRAL(
         env_type=env_type, model=model, hf=human_feedback, training_time=400_000, numenvs=2, options=additional_options)
     viral.test_reward_func("""
-def reward_func(observations: np.ndarray, is_success: bool, is_failure: bool) -> float:
-    x, y, v_x, v_y, theta, omega, leg_1, leg_2 = observations
-    # Penalty for altitude and horizontal distance
-    altitude_penalty = -abs(y) * 0.5 # Scaled down to emphasize landing stability over altitude
-    distance_penalty = abs(x)
-    # Reward for landing safely
-    landing_reward = 100 if is_success else -50 if is_failure else 0
-    # Penalty for angular deviation from vertical
-    angular_penalty = abs(theta) * 1.0 # Scaled down to make it less significant
-    # Penalize large angular velocity
-    angular_velocity_penalty = abs(omega)
-    # Reward for maintaining leg contact with the ground
-    leg_contact_reward = 5 if leg_1 == 1 and leg_2 == 1 else -2 # Adjusted weights to be more punitive
-    # Final reward calculation
-    total_reward = landing_reward + altitude_penalty + distance_penalty - angular_penalty - angular_velocity_penalty + leg_contact_reward
-    return max(total_reward, 0)""")
+def reward_func(observations:np.ndarray, is_success:bool, is_failure:bool) -> float:
+    if is_success:
+        return 10.0
+    elif is_failure:
+        return -5.0
+    else:
+        # Reward based on joint angles and torques to promote efficient movement
+        joint_angles = observations[:9]
+        torques = observations[9:18]
+        angle_reward = np.sum(np.exp(-np.abs(joint_angles)))
+        torque_reward = np.sum(np.exp(-np.abs(torques)))
+        return 0.5 * (angle_reward + torque_reward)""")
     viral.policy_trainer.test_policy_hf("model/LunarLander-v3_1.pth", 5)
     for state in viral.memory:
         viral.logger.info(state)
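A quick standalone harness for trying this generated reward function outside of VIRAL (a sketch only: reward_func is copied from the lines above with an extra comment; the observation-layout note assumes Gymnasium's default Hopper-v5 ordering of 5 positional values followed by 6 velocities):

import gymnasium as gym
import numpy as np

def reward_func(observations: np.ndarray, is_success: bool, is_failure: bool) -> float:
    if is_success:
        return 10.0
    elif is_failure:
        return -5.0
    else:
        # Reward based on joint angles and torques to promote efficient movement.
        # Note: Hopper-v5 observations have 11 entries (indices 0-4 are the torso
        # height/angle and joint angles, 5-10 are velocities), so observations[9:18]
        # only picks up the last two velocity terms.
        joint_angles = observations[:9]
        torques = observations[9:18]
        angle_reward = np.sum(np.exp(-np.abs(joint_angles)))
        torque_reward = np.sum(np.exp(-np.abs(torques)))
        return 0.5 * (angle_reward + torque_reward)

env = gym.make("Hopper-v5")
obs, _ = env.reset(seed=0)
obs, _, term, trunc, info = env.step(env.action_space.sample())
print(reward_func(obs, is_success=False, is_failure=term))
env.close()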
4 changes: 2 additions & 2 deletions src/visualise.py
@@ -43,9 +43,9 @@ def main():
     memory.
     """
     parse_logger()
-    env_type = LunarLander(Algo.PPO)
+    env_type = Hopper(Algo.PPO)
     p = PolicyTrainer([], env_type, 1, 2)
-    p.test_policy_hf("model/LunarLander-v3_1.pth", 5)
+    p.test_policy_hf("model/Hopper-v5_1.pth", 5)

 if __name__ == "__main__":
     main()
