From b7d982c0445ced3839d46f2195cc3dad8abe41e0 Mon Sep 17 00:00:00 2001 From: retinfai Date: Fri, 6 Oct 2023 09:45:49 +1300 Subject: [PATCH 01/38] feat: configify lidar --- .../environments/CarBeatEnvironment.py | 17 ++++++++++------- src/environments/environments/util.py | 10 ++++++++++ 2 files changed, 20 insertions(+), 7 deletions(-) diff --git a/src/environments/environments/CarBeatEnvironment.py b/src/environments/environments/CarBeatEnvironment.py index 8f2f6ab1..26fbf177 100644 --- a/src/environments/environments/CarBeatEnvironment.py +++ b/src/environments/environments/CarBeatEnvironment.py @@ -13,7 +13,7 @@ import numpy as np from environment_interfaces.srv import CarBeatReset from .termination import has_collided, has_flipped_over -from .util import process_odom, reduce_lidar, get_all_goals_and_waypoints_in_multi_tracks +from .util import process_odom, reduce_lidar_n, get_all_goals_and_waypoints_in_multi_tracks from .goal_positions import goal_positions from .waypoints import waypoints @@ -28,7 +28,9 @@ def __init__(self, step_length=0.5, track='track_1', observation_mode='lidar_only', - max_goals=500): + max_goals=500, + num_lidar_points=50 + ): super().__init__('car_beat_environment') # Environment Details ---------------------------------------- @@ -40,6 +42,7 @@ def __init__(self, self.MIN_ACTIONS = np.asarray([0, -3.14]) self.MAX_STEPS_PER_GOAL = max_steps self.OBSERVATION_MODE = observation_mode + self.LIDAR_NUM = num_lidar_points self.num_spawns = 0 self.MAX_GOALS = max_goals @@ -49,7 +52,7 @@ def __init__(self, case 'no_position': self.OBSERVATION_SIZE = 6 + 10 case 'lidar_only': - self.OBSERVATION_SIZE = 2 + 10 + self.OBSERVATION_SIZE = 2 + num_lidar_points case _: raise ValueError(f'Invalid observation mode: {observation_mode}') @@ -309,8 +312,8 @@ def get_observation(self): odom_one = process_odom(odom_one) odom_two = process_odom(odom_two) - lidar_one = reduce_lidar(lidar_one) - lidar_two = reduce_lidar(lidar_two) + lidar_one = reduce_lidar_n(lidar_one, self.LIDAR_NUM) + lidar_two = reduce_lidar_n(lidar_two, self.LIDAR_NUM) match self.OBSERVATION_MODE: case 'full': @@ -349,7 +352,7 @@ def compute_reward(self, state, next_state): self.steps_since_last_goal = 0 - ftg_current_distance = math.dist(self.all_goals[(self.ftg_start_goal_index + self.ftg_goals_reached) % len(self.all_goals)], next_state[18:20]) + ftg_current_distance = math.dist(self.all_goals[(self.ftg_start_goal_index + self.ftg_goals_reached) % len(self.all_goals)], next_state[8 + self.LIDAR_NUM:8 + self.LIDAR_NUM + 2]) # Keeping track of FTG car goal number if ftg_current_distance < self.REWARD_RANGE: @@ -362,7 +365,7 @@ def compute_reward(self, state, next_state): self.ftg_goals_reached += 500 - if has_collided(next_state[8:19], self.COLLISION_RANGE) or has_flipped_over(next_state[2:6]): + if has_collided(next_state[8:8 + self.LIDAR_NUM], self.COLLISION_RANGE) or has_flipped_over(next_state[2:6]): reward -= 25 # TODO: find optimal value for this return reward diff --git a/src/environments/environments/util.py b/src/environments/environments/util.py index 63ed00ea..5bda6704 100644 --- a/src/environments/environments/util.py +++ b/src/environments/environments/util.py @@ -81,6 +81,16 @@ def reduce_lidar(lidar: LaserScan): return reduced_range +def reduce_lidar_n(lidar: LaserScan, num_points: int): + ranges = lidar.ranges + ranges = np.nan_to_num(ranges, posinf=float(10), neginf=float(0)) + ranges = list(ranges) + + index = np.round(np.linspace(0, len(ranges) - 1, num_points)).astype(int) + reduced_range = 
np.array(ranges)[index] + + return list(reduced_range) + def get_all_goals_and_waypoints_in_multi_tracks(track_name): all_car_goals = {} From b47ac3ca718d15f70f71792d922fcde0eca779ef Mon Sep 17 00:00:00 2001 From: retinfai Date: Sat, 21 Oct 2023 09:20:50 +1300 Subject: [PATCH 02/38] chore: update train yaml --- src/reinforcement_learning/config/train.yaml | 8 +- .../reinforcement_learning/greatest_gap.py | 226 ------------------ 2 files changed, 4 insertions(+), 230 deletions(-) delete mode 100644 src/reinforcement_learning/reinforcement_learning/greatest_gap.py diff --git a/src/reinforcement_learning/config/train.yaml b/src/reinforcement_learning/config/train.yaml index 9687b2d8..a7c70ddf 100644 --- a/src/reinforcement_learning/config/train.yaml +++ b/src/reinforcement_learning/config/train.yaml @@ -1,15 +1,15 @@ train: ros__parameters: - environment: 'CarTrack' # CarGoal, CarWall, CarBlock, CarTrack, CarBeat + environment: 'CarBeat' # CarGoal, CarWall, CarBlock, CarTrack, CarBeat track: 'multi_track' # track_1, track_2, track_3, multi_track, multi_track_testing -> only applies for CarTrack - algorithm: 'TD3' + algorithm: 'SAC' max_steps_exploration: 1000 max_steps_training: 1000000 reward_range: 2.0 collision_range: 0.2 observation_mode: 'lidar_only' - evaluate_every_n_steps: 2000 - evaluate_for_m_episodes: 3 + evaluate_every_n_steps: 10000 + evaluate_for_m_episodes: 5 # actor_path & critic_path must exist for retrain.sh, it can't be commented # actor_path: 'rl_logs/23_08_02_17:59:13/models/actor_checkpoint.pht' # critic_path: 'rl_logs/23_08_02_17:59:13/models/critic_checkpoint.pht' diff --git a/src/reinforcement_learning/reinforcement_learning/greatest_gap.py b/src/reinforcement_learning/reinforcement_learning/greatest_gap.py deleted file mode 100644 index 2c393864..00000000 --- a/src/reinforcement_learning/reinforcement_learning/greatest_gap.py +++ /dev/null @@ -1,226 +0,0 @@ -import rclpy -import time -import torch -import random -import numpy as np - -from cares_reinforcement_learning.algorithm.policy import TD3 -from cares_reinforcement_learning.util import helpers as hlp -from cares_reinforcement_learning.memory import MemoryBuffer -from cares_reinforcement_learning.util import Record - -from environments.CarGoalEnvironment import CarGoalEnvironment -from environments.CarWallEnvironment import CarWallEnvironment -from environments.CarBlockEnvironment import CarBlockEnvironment -from environments.CarTrackEnvironment import CarTrackEnvironment -from environments.follow_the_gap import FollowTheGapNode - -def main(): - rclpy.init() - - params = get_params() - - global MAX_STEPS_TRAINING - global MAX_STEPS_EXPLORATION - global G - global BATCH_SIZE - - ENVIRONMENT,\ - MAX_STEPS_TRAINING,\ - MAX_STEPS_EXPLORATION,\ - GAMMA,\ - TAU,\ - G,\ - BATCH_SIZE,\ - BUFFER_SIZE,\ - SEED,\ - ACTOR_LR,\ - CRITIC_LR,\ - MAX_STEPS,\ - STEP_LENGTH,\ - REWARD_RANGE,\ - COLLISION_RANGE = [param.value for param in params] - - print( - f'---------------------------------------------\n' - f'Environment: {ENVIRONMENT}\n' - f'Exploration Steps: {MAX_STEPS_EXPLORATION}\n' - f'Training Steps: {MAX_STEPS_TRAINING}\n' - f'Gamma: {GAMMA}\n' - f'Tau: {TAU}\n' - f'G: {G}\n' - f'Batch Size: {BATCH_SIZE}\n' - f'Buffer Size: {BUFFER_SIZE}\n' - f'Seed: {SEED}\n' - f'Actor LR: {ACTOR_LR}\n' - f'Critic LR: {CRITIC_LR}\n' - f'Steps per Episode: {MAX_STEPS}\n' - f'Step Length: {STEP_LENGTH}\n' - f'Reward Range: {REWARD_RANGE}\n' - f'Collision Range: {COLLISION_RANGE}\n' - f'---------------------------------------------\n' - 
) - - DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu') - time.sleep(3) - - match(ENVIRONMENT): - case 'CarWall': - env = CarWallEnvironment('f1tenth', step_length=STEP_LENGTH, max_steps=MAX_STEPS, reward_range=REWARD_RANGE, collision_range=COLLISION_RANGE) - case 'CarBlock': - env = CarBlockEnvironment('f1tenth', step_length=STEP_LENGTH, max_steps=MAX_STEPS, reward_range=REWARD_RANGE, collision_range=COLLISION_RANGE) - case 'CarTrack': - env = CarTrackEnvironment('f1tenth', step_length=STEP_LENGTH, max_steps=MAX_STEPS, reward_range=REWARD_RANGE, collision_range=COLLISION_RANGE) - case _: - env = CarGoalEnvironment('f1tenth', step_length=STEP_LENGTH, max_steps=MAX_STEPS, reward_range=REWARD_RANGE) - - config = { - 'environment': ENVIRONMENT, - 'max_steps_training':MAX_STEPS_TRAINING, - 'max_steps_exploration': MAX_STEPS_EXPLORATION, - 'gamma': GAMMA, - 'tau': TAU, - 'g': G, - 'batch_size': BATCH_SIZE, - 'buffer_size': BUFFER_SIZE, - 'seed': SEED, - 'actor_lr': ACTOR_LR, - 'critic_lr': CRITIC_LR, - 'max_steps': MAX_STEPS, - 'step_length': STEP_LENGTH, - 'reward_range': REWARD_RANGE, - 'collision_range': COLLISION_RANGE - } - record = Record(checkpoint_freq=100, config=config) - - greatestGap(env) - -def greatestGap(env): - state, _ = env.reset() - - episode_timesteps = 0 - episode_reward = 0 - episode_num = 0 - calcMethod = FollowTheGapNode() - - for total_step_counter in range(int(MAX_STEPS_TRAINING)): - episode_timesteps += 1 - # print(f"LIDAR ranges: {state[8:18]}") - action_env = calcMethod.select_action(state, env.goal_position) # Select action - #print(f"Current action:{action_env[0]}, {action_env[1]}") - action = hlp.normalize(action_env, env.MAX_ACTIONS, env.MIN_ACTIONS) - next_state, reward, done, truncated, info = env.step(action_env) - state = next_state - - if done or truncated: - # Reset environment - state, _ = env.reset() - episode_reward = 0 - episode_timesteps = 0 - episode_num += 1 - - - -def train(env, agent: TD3, record: Record): - - memory = MemoryBuffer() - - episode_timesteps = 0 - episode_reward = 0 - episode_num = 0 - - state, _ = env.reset() - - historical_reward = {"step": [], "episode_reward": []} - - - for total_step_counter in range(int(MAX_STEPS_TRAINING)): - episode_timesteps += 1 - - if total_step_counter < MAX_STEPS_EXPLORATION: - print(f"Running Exploration Steps {total_step_counter}/{MAX_STEPS_EXPLORATION}") - action_env = np.asarray([random.uniform(env.MIN_ACTIONS[0], env.MAX_ACTIONS[0]), random.uniform(env.MIN_ACTIONS[1], env.MAX_ACTIONS[1])]) # action range the env uses [e.g. -2 , 2 for pendulum] - action = hlp.normalize(action_env, env.MAX_ACTIONS, env.MIN_ACTIONS) # algorithm range [-1, 1] - else: - action = agent.select_action_from_policy(state) # algorithm range [-1, 1] - action_env = hlp.denormalize(action, env.MAX_ACTIONS, env.MIN_ACTIONS) # mapping to env range [e.g. 
-2 , 2 for pendulum] - - next_state, reward, done, truncated, info = env.step(action_env) - memory.add(state=state, action=action, reward=reward, next_state=next_state, done=done) - - state = next_state - episode_reward += reward - - if total_step_counter >= MAX_STEPS_EXPLORATION: - for _ in range(G): - experiences = memory.sample(BATCH_SIZE) - experiences = (experiences['state'], experiences['action'], experiences['reward'], experiences['next_state'], experiences['done']) - agent.train_policy(experiences) - - record.log( - out=done or truncated, - Step=total_step_counter, - Episode=episode_num, - Step_Reward=reward, - Episode_Reward=episode_reward if (done or truncated) else None, - ) - - if done or truncated: - - historical_reward["step"].append(total_step_counter) - historical_reward["episode_reward"].append(episode_reward) - - # Reset environment - state, _ = env.reset() - episode_reward = 0 - episode_timesteps = 0 - episode_num += 1 - - -def get_params(): - ''' - This function fetches the hyperparameters passed in through the launch files - - The hyperparameters below are defaults, to change them, you should change the train.yaml config - ''' - param_node = rclpy.create_node('params') - param_node.declare_parameters( - '', - [ - ('environment', 'CarGoal'), - ('gamma', 0.95), - ('tau', 0.005), - ('g', 10), - ('batch_size', 32), - ('buffer_size', 1_000_000), - ('seed', 123), #TODO: This doesn't do anything yet - ('actor_lr', 1e-4), - ('critic_lr', 1e-3), - ('max_steps_training', 1_000_000), - ('max_steps_exploration', 1_000), - ('max_steps', 10000), - ('step_length', 0.25), - ('reward_range', 0.2), - ('collision_range', 0.2) - ] - ) - - return param_node.get_parameters([ - 'environment', - 'max_steps_training', - 'max_steps_exploration', - 'gamma', - 'tau', - 'g', - 'batch_size', - 'buffer_size', - 'seed', - 'actor_lr', - 'critic_lr', - 'max_steps', - 'step_length', - 'reward_range', - 'collision_range' - ]) - -if __name__ == '__main__': - main() \ No newline at end of file From d46ffd8db231d4d56b03401ac94c3ab4113008a1 Mon Sep 17 00:00:00 2001 From: retinfai Date: Sat, 21 Oct 2023 10:31:54 +1300 Subject: [PATCH 03/38] feat: add progress to carbeat reward --- src/environments/environments/CarBeatEnvironment.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/src/environments/environments/CarBeatEnvironment.py b/src/environments/environments/CarBeatEnvironment.py index 26fbf177..3044f048 100644 --- a/src/environments/environments/CarBeatEnvironment.py +++ b/src/environments/environments/CarBeatEnvironment.py @@ -336,7 +336,10 @@ def compute_reward(self, state, next_state): goal_position = self.goal_position current_distance = math.dist(goal_position, next_state[:2]) - + prev_distance = math.dist(goal_position, state[:2]) + + reward += prev_distance - current_distance + self.steps_since_last_goal += 1 if current_distance < self.REWARD_RANGE: @@ -351,7 +354,7 @@ def compute_reward(self, state, next_state): self.update_goal_service(new_x, new_y) self.steps_since_last_goal = 0 - + ftg_current_distance = math.dist(self.all_goals[(self.ftg_start_goal_index + self.ftg_goals_reached) % len(self.all_goals)], next_state[8 + self.LIDAR_NUM:8 + self.LIDAR_NUM + 2]) # Keeping track of FTG car goal number From 8c6d56e961bc82ad99bbb3f3127828c86b4d6a42 Mon Sep 17 00:00:00 2001 From: retinfai Date: Sat, 21 Oct 2023 11:00:40 +1300 Subject: [PATCH 04/38] feat: clean carbeat --- .../environments/CarBeatEnvironment.py | 123 ++++++++---------- 1 file changed, 57 insertions(+), 66 
deletions(-) diff --git a/src/environments/environments/CarBeatEnvironment.py b/src/environments/environments/CarBeatEnvironment.py index 3044f048..5acfa4ce 100644 --- a/src/environments/environments/CarBeatEnvironment.py +++ b/src/environments/environments/CarBeatEnvironment.py @@ -238,71 +238,7 @@ def is_terminated(self, state): return has_collided(state[8:19], self.COLLISION_RANGE) \ or has_flipped_over(state[2:6]) \ or self.goals_reached >= self.MAX_GOALS - - ''' - init ai car goal count 0 - init opponent goal count diffrenct between ai goal abs to oppent goal abs - - for each goal passed - ai car goal count ++ - oppoent goal count ++ - - if ai car goal count > oppent goal count: - passed - ''' - - def call_reset_service(self, - car_x, - car_y, - car_Y, - goal_x, - goal_y, - ftg_x, - ftg_y, - ftg_Y - ): - """ - Reset the car and goal position - """ - - request = CarBeatReset.Request() - - request.gx = goal_x - request.gy = goal_y - - request.cx_one = car_x - request.cy_one = car_y - request.cyaw_one = car_Y - - request.cx_two = ftg_x - request.cy_two = ftg_y - request.cyaw_two = ftg_Y - - request.flag = "car_and_goal" - - # Publish to reset Topic to reset other nodes - empty_msg = Empty() - self.reset_pub.publish(empty_msg) - - future = self.reset_client.call_async(request) - rclpy.spin_until_future_complete(self, future) - - return future.result() - - def update_goal_service(self, x, y): - """ - Reset the goal position - """ - - request = CarBeatReset.Request() - request.gx = x - request.gy = y - request.flag = "goal_only" - - future = self.reset_client.call_async(request) - rclpy.spin_until_future_complete(self, future) - - return future.result() + def get_observation(self): @@ -354,7 +290,7 @@ def compute_reward(self, state, next_state): self.update_goal_service(new_x, new_y) self.steps_since_last_goal = 0 - + ftg_current_distance = math.dist(self.all_goals[(self.ftg_start_goal_index + self.ftg_goals_reached) % len(self.all_goals)], next_state[8 + self.LIDAR_NUM:8 + self.LIDAR_NUM + 2]) # Keeping track of FTG car goal number @@ -365,6 +301,8 @@ def compute_reward(self, state, next_state): if self.goals_reached >= (self.ftg_goals_reached + self.ftg_offset + 3): print(f'RL Car has overtaken FTG Car') reward += 200 + + # Ensure overtaking won't happen again self.ftg_goals_reached += 500 @@ -372,4 +310,57 @@ def compute_reward(self, state, next_state): reward -= 25 # TODO: find optimal value for this return reward + + def call_reset_service(self, + car_x, + car_y, + car_Y, + goal_x, + goal_y, + ftg_x, + ftg_y, + ftg_Y + ): + """ + Reset the car and goal position + """ + + request = CarBeatReset.Request() + + request.gx = goal_x + request.gy = goal_y + + request.cx_one = car_x + request.cy_one = car_y + request.cyaw_one = car_Y + + request.cx_two = ftg_x + request.cy_two = ftg_y + request.cyaw_two = ftg_Y + + request.flag = "car_and_goal" + + # Publish to reset Topic to reset other nodes + empty_msg = Empty() + self.reset_pub.publish(empty_msg) + + future = self.reset_client.call_async(request) + rclpy.spin_until_future_complete(self, future) + + return future.result() + + def update_goal_service(self, x, y): + """ + Reset the goal position + """ + + request = CarBeatReset.Request() + request.gx = x + request.gy = y + request.flag = "goal_only" + + future = self.reset_client.call_async(request) + rclpy.spin_until_future_complete(self, future) + + return future.result() \ No newline at end of file From d7b34ef667035ad47ad2296d730d05f5d556e101 Mon Sep 17 00:00:00 2001 From: retinfai Date: 
Sat, 21 Oct 2023 13:21:29 +1300 Subject: [PATCH 05/38] chore: set up updated train --- .../reinforcement_learning/updated_train.py | 347 ++++++++++++++++++ .../reinforcement_learning/util.py | 66 ++++ src/reinforcement_learning/setup.py | 2 +- 3 files changed, 414 insertions(+), 1 deletion(-) create mode 100644 src/reinforcement_learning/reinforcement_learning/updated_train.py create mode 100644 src/reinforcement_learning/reinforcement_learning/util.py diff --git a/src/reinforcement_learning/reinforcement_learning/updated_train.py b/src/reinforcement_learning/reinforcement_learning/updated_train.py new file mode 100644 index 00000000..942a6347 --- /dev/null +++ b/src/reinforcement_learning/reinforcement_learning/updated_train.py @@ -0,0 +1,347 @@ +import random +import time + +import numpy as np +import rclpy +import torch + +from cares_reinforcement_learning.memory import MemoryBuffer +from cares_reinforcement_learning.util.Record import Record +from cares_reinforcement_learning.util.NetworkFactory import NetworkFactory +from cares_reinforcement_learning.util import helpers as hlp + +from environments.CarBlockEnvironment import CarBlockEnvironment +from environments.CarGoalEnvironment import CarGoalEnvironment +from environments.CarTrackEnvironment import CarTrackEnvironment +from environments.CarWallEnvironment import CarWallEnvironment +from environments.CarBeatEnvironment import CarBeatEnvironment + +from .util import parse_args +from rclpy.parameter import Parameter +def main(): + rclpy.init() + + params: list(Parameter) = parse_args() + for param in params: + print(param) + print(param.value) + print(param.name) + + + global MAX_STEPS_TRAINING + global MAX_STEPS_EXPLORATION + global MAX_STEPS_PER_BATCH + global G + global BATCH_SIZE + global EVALUATE_EVERY_N_STEPS + global EVALUATE_FOR_M_EPISODES + global ALGORITHM + + ENVIRONMENT, \ + ALGORITHM, \ + TRACK, \ + MAX_STEPS_TRAINING, \ + MAX_STEPS_EXPLORATION, \ + GAMMA, \ + TAU, \ + G, \ + BATCH_SIZE, \ + BUFFER_SIZE, \ + SEED, \ + ACTOR_LR, \ + CRITIC_LR, \ + MAX_STEPS, \ + STEP_LENGTH, \ + REWARD_RANGE, \ + COLLISION_RANGE, \ + ACTOR_PATH, \ + CRITIC_PATH, \ + MAX_STEPS_PER_BATCH, \ + OBSERVATION_MODE, \ + EVALUATE_EVERY_N_STEPS, \ + EVALUATE_FOR_M_EPISODES = [param.value for param in params] + + if ACTOR_PATH != '' and CRITIC_PATH != '': + MAX_STEPS_EXPLORATION = 0 + + torch.manual_seed(SEED) + torch.cuda.manual_seed_all(SEED) + np.random.seed(SEED) + random.seed(SEED) + + print( + f'---------------------------------------------\n' + f'Environment: {ENVIRONMENT}\n' + f'Algorithm: {ALGORITHM}\n' + f'Exploration Steps: {MAX_STEPS_EXPLORATION}\n' + f'Training Steps: {MAX_STEPS_TRAINING}\n' + f'Gamma: {GAMMA}\n' + f'Tau: {TAU}\n' + f'G: {G}\n' + f'Batch Size: {BATCH_SIZE}\n' + f'Buffer Size: {BUFFER_SIZE}\n' + f'Seed: {SEED}\n' + f'Actor LR: {ACTOR_LR}\n' + f'Critic LR: {CRITIC_LR}\n' + f'Steps per Episode: {MAX_STEPS}\n' + f'Step Length: {STEP_LENGTH}\n' + f'Reward Range: {REWARD_RANGE}\n' + f'Collision Range: {COLLISION_RANGE}\n' + f'Critic Path: {CRITIC_PATH}\n' + f'Actor Path: {ACTOR_PATH}\n' + f'Max Steps per Batch: {MAX_STEPS_PER_BATCH}\n' + f'Observation Mode: {OBSERVATION_MODE}\n' + f'Evaluate Every N Steps: {EVALUATE_EVERY_N_STEPS}\n' + f'Evaluate For M Episodes: {EVALUATE_FOR_M_EPISODES}\n' + f'---------------------------------------------\n' + ) + + DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu') + time.sleep(3) + + match ENVIRONMENT: + case 'CarWall': + env = CarWallEnvironment('f1tenth', 
step_length=STEP_LENGTH, max_steps=MAX_STEPS, reward_range=REWARD_RANGE, collision_range=COLLISION_RANGE) + case 'CarBlock': + env = CarBlockEnvironment('f1tenth', step_length=STEP_LENGTH, max_steps=MAX_STEPS, reward_range=REWARD_RANGE, collision_range=COLLISION_RANGE) + case 'CarTrack': + env = CarTrackEnvironment('f1tenth', step_length=STEP_LENGTH, max_steps=MAX_STEPS, reward_range=REWARD_RANGE, collision_range=COLLISION_RANGE, track=TRACK, observation_mode=OBSERVATION_MODE) + case 'CarBeat': + env = CarBeatEnvironment('f1tenth_one', 'f1tenth_two', step_length=STEP_LENGTH, max_steps=MAX_STEPS, reward_range=REWARD_RANGE, collision_range=COLLISION_RANGE, track=TRACK, observation_mode=OBSERVATION_MODE) + case _: + env = CarGoalEnvironment('f1tenth', step_length=STEP_LENGTH, max_steps=MAX_STEPS, reward_range=REWARD_RANGE) + + network_factory_args = { + 'observation_size': env.OBSERVATION_SIZE, + 'action_num': env.ACTION_NUM, + 'actor_lr': ACTOR_LR, + 'critic_lr': CRITIC_LR, + 'gamma': GAMMA, + 'tau': TAU, + 'device': DEVICE + } + + agent_factory = NetworkFactory() + agent = agent_factory.create_network(ALGORITHM, network_factory_args) + + if ACTOR_PATH != '' and CRITIC_PATH != '': + print('Reading saved models into actor and critic') + agent.actor_net.load_state_dict(torch.load(ACTOR_PATH)) + agent.critic_net.load_state_dict(torch.load(CRITIC_PATH)) + print('Successfully Loaded models') + + networks = {'actor': agent.actor_net, 'critic': agent.critic_net} + config = { + 'environment': ENVIRONMENT, + 'algorithm': ALGORITHM, + 'max_steps_training': MAX_STEPS_TRAINING, + 'max_steps_exploration': MAX_STEPS_EXPLORATION, + 'gamma': GAMMA, + 'tau': TAU, + 'g': G, + 'batch_size': BATCH_SIZE, + 'buffer_size': BUFFER_SIZE, + 'seed': SEED, + 'actor_lr': ACTOR_LR, + 'critic_lr': CRITIC_LR, + 'max_steps': MAX_STEPS, + 'step_length': STEP_LENGTH, + 'reward_range': REWARD_RANGE, + 'collision_range': COLLISION_RANGE, + 'max_steps_per_batch': MAX_STEPS_PER_BATCH, + 'observation_mode': OBSERVATION_MODE, + 'evaluate_every_n_steps': EVALUATE_EVERY_N_STEPS, + 'evaluate_for_m_episodes': EVALUATE_FOR_M_EPISODES + } + + if (ENVIRONMENT == 'CarTrack'): + config['track'] = TRACK + + record = Record(networks=networks, checkpoint_freq=100, config=config) + + if ALGORITHM == 'PPO': + train_ppo(env, agent, record=record) + else: + train(env=env, agent=agent, record=record) + + +def train(env, agent, record: Record): + memory = MemoryBuffer() + + episode_timesteps = 0 + episode_reward = 0 + episode_num = 0 + evaluation_reward = None + + state, _ = env.reset() + + evaluate = False + + print(f'Initial State: {state}') + historical_reward = {"step": [], "episode_reward": []} + + for total_step_counter in range(int(MAX_STEPS_TRAINING)): + episode_timesteps += 1 + + if total_step_counter < MAX_STEPS_EXPLORATION: + print(f"Running Exploration Steps {total_step_counter}/{MAX_STEPS_EXPLORATION}") + action_env = np.asarray([random.uniform(env.MIN_ACTIONS[0], env.MAX_ACTIONS[0]), random.uniform(env.MIN_ACTIONS[1], env.MAX_ACTIONS[1])]) # action range the env uses [e.g. -2 , 2 for pendulum] + action = hlp.normalize(action_env, env.MAX_ACTIONS, env.MIN_ACTIONS) # algorithm range [-1, 1] + else: + action = agent.select_action_from_policy(state) # algorithm range [-1, 1] + action_env = hlp.denormalize(action, env.MAX_ACTIONS, env.MIN_ACTIONS) # mapping to env range [e.g. 
-2 , 2 for pendulum] + + next_state, reward, done, truncated, info = env.step(action_env) + memory.add(state=state, action=action, reward=reward, next_state=next_state, done=done) + + state = next_state + episode_reward += reward + + if total_step_counter >= MAX_STEPS_EXPLORATION: + for _ in range(G): + experiences = memory.sample(BATCH_SIZE) + experiences = (experiences['state'], experiences['action'], experiences['reward'], experiences['next_state'], experiences['done']) + agent.train_policy(experiences) + + + + if total_step_counter % EVALUATE_EVERY_N_STEPS == 0: + evaluate = True + + + record.log( + out=done or truncated, + Step=total_step_counter, + Episode=episode_num, + Step_Reward=reward, + Episode_Reward=episode_reward if (done or truncated) else None, + Evaluation_Reward=evaluation_reward + ) + + evaluation_reward = None + + if done or truncated: + historical_reward["step"].append(total_step_counter) + historical_reward["episode_reward"].append(episode_reward) + + if evaluate: + evaluation_reward = evaluate_policy(env, agent, EVALUATE_FOR_M_EPISODES) + evaluate = False + + # Reset environment + state, _ = env.reset() + episode_reward = 0 + episode_timesteps = 0 + episode_num += 1 + +def train_ppo(env, agent, record): + max_steps_training = MAX_STEPS_TRAINING + max_steps_per_batch = MAX_STEPS_PER_BATCH + + min_action_value = env.MIN_ACTIONS + max_action_value = env.MAX_ACTIONS + + episode_timesteps = 0 + episode_num = 0 + episode_reward = 0 + time_step = 1 + evaluation_reward = None + evaluate = False + + memory = MemoryBuffer() + + state, _ = env.reset() + + for total_step_counter in range(int(max_steps_training)): + episode_timesteps += 1 + + action, log_prob = agent.select_action_from_policy(state) + action_env = hlp.denormalize(action, max_action_value, min_action_value) + + next_state, reward, done, truncated, _ = env.step(action_env) + memory.add(state=state, action=action, reward=reward, next_state=next_state, done=done, log_prob=log_prob) + + state = next_state + episode_reward += reward + + if time_step % max_steps_per_batch == 0: + for _ in range(G): + experience = memory.sample(BATCH_SIZE) + info = agent.train_policy(( + experience['state'], + experience['action'], + experience['reward'], + experience['next_state'], + experience['done'], + experience['log_prob'] + )) + + memory.clear() + + if total_step_counter % EVALUATE_EVERY_N_STEPS == 0: + evaluate = True + + record.log( + Steps = total_step_counter, + Episode= episode_num, + Step_reward= reward, + Episode_reward= episode_reward if (done or truncated) else None, + Evaluation_reward=evaluation_reward, + out=done or truncated, + ) + evaluation_reward = None + time_step += 1 + + if done or truncated: + print(f'Episode: {episode_num} | Reward: {episode_reward} | Steps: {time_step}') + + if evaluate: + evaluation_reward = evaluate_policy(env, agent, EVALUATE_FOR_M_EPISODES) + evaluate = False + + # Reset environment + state, _ = env.reset() + episode_reward = 0 + episode_timesteps = 0 + episode_num += 1 + +def evaluate_policy(env, agent, num_episodes): + + episode_reward_history = [] + + print('Beginning Evaluation----------------------------') + + for ep in range(num_episodes): + state, _ = env.reset() + + episode_timesteps = 0 + episode_reward = 0 + + truncated = False + terminated = False + + while not truncated and not terminated: + + if ALGORITHM == 'PPO': + action = agent.select_action_from_policy(state) + else: + action = agent.select_action_from_policy(state, evaluation=True) + + action = hlp.denormalize(action, 
env.MAX_ACTIONS, env.MIN_ACTIONS) + next_state, reward, terminated, truncated, _ = env.step(action) + + episode_reward += reward + state = next_state + + print(f'Evaluation Episode {ep + 1} Completed with a Reward of {episode_reward}') + episode_reward_history.append(episode_reward) + + avg_reward = sum(episode_reward_history) / len(episode_reward_history) + + print(f'Evaluation Completed: Avg Reward over {num_episodes} Episodes is {avg_reward} ----------------------------') + + return avg_reward + + +if __name__ == '__main__': + main() diff --git a/src/reinforcement_learning/reinforcement_learning/util.py b/src/reinforcement_learning/reinforcement_learning/util.py new file mode 100644 index 00000000..2b4c402b --- /dev/null +++ b/src/reinforcement_learning/reinforcement_learning/util.py @@ -0,0 +1,66 @@ +import rclpy + +def parse_args(): + params = _get_params() + return params + +def _get_params(): + ''' + This function fetches the hyperparameters passed in through the launch files + - The hyperparameters below are defaults, to change them, you should change the train.yaml config + ''' + param_node = rclpy.create_node('params') + param_node.declare_parameters( + '', + [ + ('environment', 'CarGoal'), + ('algorithm', 'TD3'), + ('track', 'track_1'), + ('gamma', 0.95), + ('tau', 0.005), + ('g', 10), + ('batch_size', 32), + ('buffer_size', 1_000_000), + ('seed', 123), # TODO: This doesn't do anything yet + ('actor_lr', 1e-4), + ('critic_lr', 1e-3), + ('max_steps_training', 1_000_000), + ('max_steps_exploration', 1_000), + ('max_steps', 100), + ('step_length', 0.25), + ('reward_range', 0.2), + ('collision_range', 0.2), + ('actor_path', ''), + ('critic_path', ''), + ('max_steps_per_batch', 5000), + ('observation_mode', 'no_position'), + ('evaluate_every_n_steps', 10000), + ('evaluate_for_m_episodes', 5) + ] + ) + + return param_node.get_parameters([ + 'environment', + 'algorithm', + 'track', + 'max_steps_training', + 'max_steps_exploration', + 'gamma', + 'tau', + 'g', + 'batch_size', + 'buffer_size', + 'seed', + 'actor_lr', + 'critic_lr', + 'max_steps', + 'step_length', + 'reward_range', + 'collision_range', + 'actor_path', + 'critic_path', + 'max_steps_per_batch', + 'observation_mode', + 'evaluate_every_n_steps', + 'evaluate_for_m_episodes' + ]) \ No newline at end of file diff --git a/src/reinforcement_learning/setup.py b/src/reinforcement_learning/setup.py index ccb4e262..93ed141e 100644 --- a/src/reinforcement_learning/setup.py +++ b/src/reinforcement_learning/setup.py @@ -25,7 +25,7 @@ entry_points={ 'console_scripts': [ 'sanity_check = reinforcement_learning.sanity_check:main', - 'train = reinforcement_learning.train:main', + 'train = reinforcement_learning.updated_train:main', 'test = reinforcement_learning.test:main', 'greatest_gap = reinforcement_learning.greatest_gap:main', ], From c8123d4f47a35f66bdb9c2cad27d9db82b0fc20a Mon Sep 17 00:00:00 2001 From: retinfai Date: Sat, 21 Oct 2023 14:08:02 +1300 Subject: [PATCH 06/38] feat: add args parser --- .../reinforcement_learning/parse_args.py | 108 ++++++ .../reinforcement_learning/updated_train.py | 337 +----------------- .../reinforcement_learning/util.py | 66 ---- 3 files changed, 125 insertions(+), 386 deletions(-) create mode 100644 src/reinforcement_learning/reinforcement_learning/parse_args.py delete mode 100644 src/reinforcement_learning/reinforcement_learning/util.py diff --git a/src/reinforcement_learning/reinforcement_learning/parse_args.py b/src/reinforcement_learning/reinforcement_learning/parse_args.py new file mode 100644 
index 00000000..3fa37fb1 --- /dev/null +++ b/src/reinforcement_learning/reinforcement_learning/parse_args.py @@ -0,0 +1,108 @@ +import rclpy +from rclpy.node import Node +from rclpy.parameter import Parameter + +def parse_args(): + param_node = __declare_params() + + env_params = __get_env_params(param_node) + algorithm_params = __get_algorithm_params(param_node) + network_params = __get_network_params(param_node) + + return env_params, algorithm_params, network_params + +def __declare_params(): + param_node = rclpy.create_node('params') + param_node.declare_parameters( + '', + [ + # Environment Parameters --------------------------- + ('environment', 'CarGoal'), + ('track', 'track_1'), + ('max_steps', 100), + ('step_length', 0.25), + ('reward_range', 0.2), + ('collision_range', 0.2), + ('observation_mode', 'no_position'), + + # Algorithm Parameters ----------------------------- + ('g', 10), + ('batch_size', 32), + ('buffer_size', 1_000_000), + ('seed', 123), + ('max_steps_training', 1_000_000), + ('max_steps_exploration', 1_000), + ('max_steps_per_batch', 5000), + ('evaluate_every_n_steps', 10000), + ('evaluate_for_m_episodes', 5), + + # Network Parameters ------------------------------- + ('actor_path', ''), + ('critic_path', ''), + ('algorithm', 'TD3'), + ('gamma', 0.95), + ('tau', 0.005), + ('actor_lr', 1e-4), + ('critic_lr', 1e-3), + + ] + ) + + return param_node + +def __get_env_params(param_node: Node): + + params: list(Parameter) = param_node.get_parameters([ + 'environment', + 'track', + 'max_steps', + 'step_length', + 'reward_range', + 'collision_range', + 'observation_mode' + ]) + + # Convert to Dictionary + params_dict = {} + for param in params: + params_dict[param.name] = param.value + + return params_dict + +def __get_algorithm_params(param_node: Node): + params = param_node.get_parameters([ + 'g', + 'batch_size', + 'buffer_size', + 'seed', + 'max_steps_training', + 'max_steps_exploration', + 'max_steps_per_batch', + 'evaluate_every_n_steps', + 'evaluate_for_m_episodes' + ]) + + # Convert to Dictionary + params_dict = {} + for param in params: + params_dict[param.name] = param.value + + return params_dict + +def __get_network_params(param_node: Node): + params = param_node.get_parameters([ + 'actor_path', + 'critic_path', + 'algorithm', + 'gamma', + 'tau', + 'actor_lr', + 'critic_lr' + ]) + + # Convert to Dictionary + params_dict = {} + for param in params: + params_dict[param.name] = param.value + + return params_dict diff --git a/src/reinforcement_learning/reinforcement_learning/updated_train.py b/src/reinforcement_learning/reinforcement_learning/updated_train.py index 942a6347..3fc5dd04 100644 --- a/src/reinforcement_learning/reinforcement_learning/updated_train.py +++ b/src/reinforcement_learning/reinforcement_learning/updated_train.py @@ -1,8 +1,10 @@ import random import time +import yaml import numpy as np import rclpy +from rclpy.parameter import Parameter import torch from cares_reinforcement_learning.memory import MemoryBuffer @@ -16,332 +18,27 @@ from environments.CarWallEnvironment import CarWallEnvironment from environments.CarBeatEnvironment import CarBeatEnvironment -from .util import parse_args -from rclpy.parameter import Parameter +from .parse_args import parse_args + def main(): rclpy.init() - params: list(Parameter) = parse_args() - for param in params: - print(param) - print(param.value) - print(param.name) - - - global MAX_STEPS_TRAINING - global MAX_STEPS_EXPLORATION - global MAX_STEPS_PER_BATCH - global G - global BATCH_SIZE - global 
EVALUATE_EVERY_N_STEPS - global EVALUATE_FOR_M_EPISODES - global ALGORITHM + env_config, algorithm_config, network_config = parse_args() - ENVIRONMENT, \ - ALGORITHM, \ - TRACK, \ - MAX_STEPS_TRAINING, \ - MAX_STEPS_EXPLORATION, \ - GAMMA, \ - TAU, \ - G, \ - BATCH_SIZE, \ - BUFFER_SIZE, \ - SEED, \ - ACTOR_LR, \ - CRITIC_LR, \ - MAX_STEPS, \ - STEP_LENGTH, \ - REWARD_RANGE, \ - COLLISION_RANGE, \ - ACTOR_PATH, \ - CRITIC_PATH, \ - MAX_STEPS_PER_BATCH, \ - OBSERVATION_MODE, \ - EVALUATE_EVERY_N_STEPS, \ - EVALUATE_FOR_M_EPISODES = [param.value for param in params] + # Set Seeds + torch.manual_seed(algorithm_config['seed']) + torch.cuda.manual_seed_all(algorithm_config['seed']) + np.random.seed(algorithm_config['seed']) + random.seed(algorithm_config['seed']) - if ACTOR_PATH != '' and CRITIC_PATH != '': - MAX_STEPS_EXPLORATION = 0 + DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu') - torch.manual_seed(SEED) - torch.cuda.manual_seed_all(SEED) - np.random.seed(SEED) - random.seed(SEED) - print( - f'---------------------------------------------\n' - f'Environment: {ENVIRONMENT}\n' - f'Algorithm: {ALGORITHM}\n' - f'Exploration Steps: {MAX_STEPS_EXPLORATION}\n' - f'Training Steps: {MAX_STEPS_TRAINING}\n' - f'Gamma: {GAMMA}\n' - f'Tau: {TAU}\n' - f'G: {G}\n' - f'Batch Size: {BATCH_SIZE}\n' - f'Buffer Size: {BUFFER_SIZE}\n' - f'Seed: {SEED}\n' - f'Actor LR: {ACTOR_LR}\n' - f'Critic LR: {CRITIC_LR}\n' - f'Steps per Episode: {MAX_STEPS}\n' - f'Step Length: {STEP_LENGTH}\n' - f'Reward Range: {REWARD_RANGE}\n' - f'Collision Range: {COLLISION_RANGE}\n' - f'Critic Path: {CRITIC_PATH}\n' - f'Actor Path: {ACTOR_PATH}\n' - f'Max Steps per Batch: {MAX_STEPS_PER_BATCH}\n' - f'Observation Mode: {OBSERVATION_MODE}\n' - f'Evaluate Every N Steps: {EVALUATE_EVERY_N_STEPS}\n' - f'Evaluate For M Episodes: {EVALUATE_FOR_M_EPISODES}\n' - f'---------------------------------------------\n' + f'Environment Config: ------------------------------------- \n' + f'{yaml.dump(env_config, default_flow_style=False)} \n' + f'Algorithm Config: ------------------------------------- \n' + f'{yaml.dump(algorithm_config, default_flow_style=False)} \n' + f'Network Config: ------------------------------------- \n' + f'{yaml.dump(network_config, default_flow_style=False)} \n' ) - DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu') - time.sleep(3) - - match ENVIRONMENT: - case 'CarWall': - env = CarWallEnvironment('f1tenth', step_length=STEP_LENGTH, max_steps=MAX_STEPS, reward_range=REWARD_RANGE, collision_range=COLLISION_RANGE) - case 'CarBlock': - env = CarBlockEnvironment('f1tenth', step_length=STEP_LENGTH, max_steps=MAX_STEPS, reward_range=REWARD_RANGE, collision_range=COLLISION_RANGE) - case 'CarTrack': - env = CarTrackEnvironment('f1tenth', step_length=STEP_LENGTH, max_steps=MAX_STEPS, reward_range=REWARD_RANGE, collision_range=COLLISION_RANGE, track=TRACK, observation_mode=OBSERVATION_MODE) - case 'CarBeat': - env = CarBeatEnvironment('f1tenth_one', 'f1tenth_two', step_length=STEP_LENGTH, max_steps=MAX_STEPS, reward_range=REWARD_RANGE, collision_range=COLLISION_RANGE, track=TRACK, observation_mode=OBSERVATION_MODE) - case _: - env = CarGoalEnvironment('f1tenth', step_length=STEP_LENGTH, max_steps=MAX_STEPS, reward_range=REWARD_RANGE) - - network_factory_args = { - 'observation_size': env.OBSERVATION_SIZE, - 'action_num': env.ACTION_NUM, - 'actor_lr': ACTOR_LR, - 'critic_lr': CRITIC_LR, - 'gamma': GAMMA, - 'tau': TAU, - 'device': DEVICE - } - - agent_factory = NetworkFactory() - agent = 
agent_factory.create_network(ALGORITHM, network_factory_args) - - if ACTOR_PATH != '' and CRITIC_PATH != '': - print('Reading saved models into actor and critic') - agent.actor_net.load_state_dict(torch.load(ACTOR_PATH)) - agent.critic_net.load_state_dict(torch.load(CRITIC_PATH)) - print('Successfully Loaded models') - - networks = {'actor': agent.actor_net, 'critic': agent.critic_net} - config = { - 'environment': ENVIRONMENT, - 'algorithm': ALGORITHM, - 'max_steps_training': MAX_STEPS_TRAINING, - 'max_steps_exploration': MAX_STEPS_EXPLORATION, - 'gamma': GAMMA, - 'tau': TAU, - 'g': G, - 'batch_size': BATCH_SIZE, - 'buffer_size': BUFFER_SIZE, - 'seed': SEED, - 'actor_lr': ACTOR_LR, - 'critic_lr': CRITIC_LR, - 'max_steps': MAX_STEPS, - 'step_length': STEP_LENGTH, - 'reward_range': REWARD_RANGE, - 'collision_range': COLLISION_RANGE, - 'max_steps_per_batch': MAX_STEPS_PER_BATCH, - 'observation_mode': OBSERVATION_MODE, - 'evaluate_every_n_steps': EVALUATE_EVERY_N_STEPS, - 'evaluate_for_m_episodes': EVALUATE_FOR_M_EPISODES - } - - if (ENVIRONMENT == 'CarTrack'): - config['track'] = TRACK - - record = Record(networks=networks, checkpoint_freq=100, config=config) - - if ALGORITHM == 'PPO': - train_ppo(env, agent, record=record) - else: - train(env=env, agent=agent, record=record) - - -def train(env, agent, record: Record): - memory = MemoryBuffer() - - episode_timesteps = 0 - episode_reward = 0 - episode_num = 0 - evaluation_reward = None - - state, _ = env.reset() - - evaluate = False - - print(f'Initial State: {state}') - historical_reward = {"step": [], "episode_reward": []} - - for total_step_counter in range(int(MAX_STEPS_TRAINING)): - episode_timesteps += 1 - - if total_step_counter < MAX_STEPS_EXPLORATION: - print(f"Running Exploration Steps {total_step_counter}/{MAX_STEPS_EXPLORATION}") - action_env = np.asarray([random.uniform(env.MIN_ACTIONS[0], env.MAX_ACTIONS[0]), random.uniform(env.MIN_ACTIONS[1], env.MAX_ACTIONS[1])]) # action range the env uses [e.g. -2 , 2 for pendulum] - action = hlp.normalize(action_env, env.MAX_ACTIONS, env.MIN_ACTIONS) # algorithm range [-1, 1] - else: - action = agent.select_action_from_policy(state) # algorithm range [-1, 1] - action_env = hlp.denormalize(action, env.MAX_ACTIONS, env.MIN_ACTIONS) # mapping to env range [e.g. 
-2 , 2 for pendulum] - - next_state, reward, done, truncated, info = env.step(action_env) - memory.add(state=state, action=action, reward=reward, next_state=next_state, done=done) - - state = next_state - episode_reward += reward - - if total_step_counter >= MAX_STEPS_EXPLORATION: - for _ in range(G): - experiences = memory.sample(BATCH_SIZE) - experiences = (experiences['state'], experiences['action'], experiences['reward'], experiences['next_state'], experiences['done']) - agent.train_policy(experiences) - - - - if total_step_counter % EVALUATE_EVERY_N_STEPS == 0: - evaluate = True - - - record.log( - out=done or truncated, - Step=total_step_counter, - Episode=episode_num, - Step_Reward=reward, - Episode_Reward=episode_reward if (done or truncated) else None, - Evaluation_Reward=evaluation_reward - ) - - evaluation_reward = None - - if done or truncated: - historical_reward["step"].append(total_step_counter) - historical_reward["episode_reward"].append(episode_reward) - - if evaluate: - evaluation_reward = evaluate_policy(env, agent, EVALUATE_FOR_M_EPISODES) - evaluate = False - - # Reset environment - state, _ = env.reset() - episode_reward = 0 - episode_timesteps = 0 - episode_num += 1 - -def train_ppo(env, agent, record): - max_steps_training = MAX_STEPS_TRAINING - max_steps_per_batch = MAX_STEPS_PER_BATCH - - min_action_value = env.MIN_ACTIONS - max_action_value = env.MAX_ACTIONS - - episode_timesteps = 0 - episode_num = 0 - episode_reward = 0 - time_step = 1 - evaluation_reward = None - evaluate = False - - memory = MemoryBuffer() - - state, _ = env.reset() - - for total_step_counter in range(int(max_steps_training)): - episode_timesteps += 1 - - action, log_prob = agent.select_action_from_policy(state) - action_env = hlp.denormalize(action, max_action_value, min_action_value) - - next_state, reward, done, truncated, _ = env.step(action_env) - memory.add(state=state, action=action, reward=reward, next_state=next_state, done=done, log_prob=log_prob) - - state = next_state - episode_reward += reward - - if time_step % max_steps_per_batch == 0: - for _ in range(G): - experience = memory.sample(BATCH_SIZE) - info = agent.train_policy(( - experience['state'], - experience['action'], - experience['reward'], - experience['next_state'], - experience['done'], - experience['log_prob'] - )) - - memory.clear() - - if total_step_counter % EVALUATE_EVERY_N_STEPS == 0: - evaluate = True - - record.log( - Steps = total_step_counter, - Episode= episode_num, - Step_reward= reward, - Episode_reward= episode_reward if (done or truncated) else None, - Evaluation_reward=evaluation_reward, - out=done or truncated, - ) - evaluation_reward = None - time_step += 1 - - if done or truncated: - print(f'Episode: {episode_num} | Reward: {episode_reward} | Steps: {time_step}') - - if evaluate: - evaluation_reward = evaluate_policy(env, agent, EVALUATE_FOR_M_EPISODES) - evaluate = False - - # Reset environment - state, _ = env.reset() - episode_reward = 0 - episode_timesteps = 0 - episode_num += 1 - -def evaluate_policy(env, agent, num_episodes): - - episode_reward_history = [] - - print('Beginning Evaluation----------------------------') - - for ep in range(num_episodes): - state, _ = env.reset() - - episode_timesteps = 0 - episode_reward = 0 - - truncated = False - terminated = False - - while not truncated and not terminated: - - if ALGORITHM == 'PPO': - action = agent.select_action_from_policy(state) - else: - action = agent.select_action_from_policy(state, evaluation=True) - - action = hlp.denormalize(action, 
env.MAX_ACTIONS, env.MIN_ACTIONS) - next_state, reward, terminated, truncated, _ = env.step(action) - - episode_reward += reward - state = next_state - - print(f'Evaluation Episode {ep + 1} Completed with a Reward of {episode_reward}') - episode_reward_history.append(episode_reward) - - avg_reward = sum(episode_reward_history) / len(episode_reward_history) - - print(f'Evaluation Completed: Avg Reward over {num_episodes} Episodes is {avg_reward} ----------------------------') - - return avg_reward - - -if __name__ == '__main__': - main() diff --git a/src/reinforcement_learning/reinforcement_learning/util.py b/src/reinforcement_learning/reinforcement_learning/util.py deleted file mode 100644 index 2b4c402b..00000000 --- a/src/reinforcement_learning/reinforcement_learning/util.py +++ /dev/null @@ -1,66 +0,0 @@ -import rclpy - -def parse_args(): - params = _get_params() - return params - -def _get_params(): - ''' - This function fetches the hyperparameters passed in through the launch files - - The hyperparameters below are defaults, to change them, you should change the train.yaml config - ''' - param_node = rclpy.create_node('params') - param_node.declare_parameters( - '', - [ - ('environment', 'CarGoal'), - ('algorithm', 'TD3'), - ('track', 'track_1'), - ('gamma', 0.95), - ('tau', 0.005), - ('g', 10), - ('batch_size', 32), - ('buffer_size', 1_000_000), - ('seed', 123), # TODO: This doesn't do anything yet - ('actor_lr', 1e-4), - ('critic_lr', 1e-3), - ('max_steps_training', 1_000_000), - ('max_steps_exploration', 1_000), - ('max_steps', 100), - ('step_length', 0.25), - ('reward_range', 0.2), - ('collision_range', 0.2), - ('actor_path', ''), - ('critic_path', ''), - ('max_steps_per_batch', 5000), - ('observation_mode', 'no_position'), - ('evaluate_every_n_steps', 10000), - ('evaluate_for_m_episodes', 5) - ] - ) - - return param_node.get_parameters([ - 'environment', - 'algorithm', - 'track', - 'max_steps_training', - 'max_steps_exploration', - 'gamma', - 'tau', - 'g', - 'batch_size', - 'buffer_size', - 'seed', - 'actor_lr', - 'critic_lr', - 'max_steps', - 'step_length', - 'reward_range', - 'collision_range', - 'actor_path', - 'critic_path', - 'max_steps_per_batch', - 'observation_mode', - 'evaluate_every_n_steps', - 'evaluate_for_m_episodes' - ]) \ No newline at end of file From b2ded1ba610b700b176ca54f3621df942e714c8a Mon Sep 17 00:00:00 2001 From: retinfai Date: Sat, 21 Oct 2023 15:06:03 +1300 Subject: [PATCH 07/38] feat: add CarTrack and CarBeat to env factory --- .../environments/CarBeatEnvironment.py | 10 +-- src/environments/launch/carbeat.launch.py | 78 ++++++++++--------- src/reinforcement_learning/config/train.yaml | 25 +----- .../launch/train.launch.py | 2 + .../EnvironmentFactory.py | 44 +++++++++++ .../reinforcement_learning/parse_args.py | 12 ++- .../reinforcement_learning/updated_train.py | 10 +-- 7 files changed, 110 insertions(+), 71 deletions(-) create mode 100644 src/reinforcement_learning/reinforcement_learning/EnvironmentFactory.py diff --git a/src/environments/environments/CarBeatEnvironment.py b/src/environments/environments/CarBeatEnvironment.py index 26fbf177..5d86c316 100644 --- a/src/environments/environments/CarBeatEnvironment.py +++ b/src/environments/environments/CarBeatEnvironment.py @@ -20,13 +20,13 @@ class CarBeatEnvironment(Node): def __init__(self, - car_one_name, - car_two_name, + rl_car_name, + ftg_car_name, reward_range=1, max_steps=50, collision_range=0.2, step_length=0.5, - track='track_1', + track='multi_track', observation_mode='lidar_only', 
max_goals=500, num_lidar_points=50 @@ -34,8 +34,8 @@ def __init__(self, super().__init__('car_beat_environment') # Environment Details ---------------------------------------- - self.NAME = car_one_name - self.OTHER_CAR_NAME = car_two_name + self.NAME = rl_car_name + self.OTHER_CAR_NAME = ftg_car_name self.MAX_STEPS = max_steps self.STEP_LENGTH = step_length self.MAX_ACTIONS = np.asarray([5, 3.14]) diff --git a/src/environments/launch/carbeat.launch.py b/src/environments/launch/carbeat.launch.py index fde5b9aa..d6aa957e 100644 --- a/src/environments/launch/carbeat.launch.py +++ b/src/environments/launch/carbeat.launch.py @@ -9,9 +9,12 @@ def launch(context, *args, **kwargs): pkg_ros_gz_sim = get_package_share_directory('ros_gz_sim') pkg_environments = get_package_share_directory('environments') + pkg_f1tenth_bringup = get_package_share_directory('f1tenth_bringup') track = LaunchConfiguration('track').perform(context) - + car_one = LaunchConfiguration('car_one').perform(context) + car_two = LaunchConfiguration('car_two').perform(context) + gz_sim = IncludeLaunchDescription( launch_description_source=PythonLaunchDescriptionSource( os.path.join(pkg_ros_gz_sim, 'launch', 'gz_sim.launch.py')), @@ -20,24 +23,56 @@ def launch(context, *args, **kwargs): }.items() ) + f1tenth_one = IncludeLaunchDescription( + launch_description_source=PythonLaunchDescriptionSource( + os.path.join(pkg_f1tenth_bringup, 'simulation_bringup.launch.py')), + launch_arguments={ + 'name': car_one, + 'world': 'empty', + 'x': '0', + 'y': '0', + 'z': '5', + }.items() + ) + + f1tenth_two = IncludeLaunchDescription( + launch_description_source=PythonLaunchDescriptionSource( + os.path.join(pkg_f1tenth_bringup, 'simulation_bringup.launch.py')), + launch_arguments={ + 'name': car_two, + 'world': 'empty', + 'x': '0', + 'y': '1', + 'z': '5', + }.items() + ) + controller = Node( package='controllers', executable='ftg_policy', output='screen', parameters=[ - {'car_name': 'f1tenth_two', 'track_name': track}, + {'car_name': car_two, 'track_name': track}, ], ) - return[gz_sim, controller] + return[gz_sim, controller, f1tenth_one, f1tenth_two] def generate_launch_description(): - pkg_f1tenth_bringup = get_package_share_directory('f1tenth_bringup') - track_arg = DeclareLaunchArgument( 'track', default_value='track_1' ) + car_one = DeclareLaunchArgument( + 'car_one', + default_value='f1tenth_one' + ) + + car_two = DeclareLaunchArgument( + 'car_two', + default_value='f1tenth_two' + ) + service_bridge = Node( package='ros_gz_bridge', executable='parameter_bridge', @@ -54,48 +89,19 @@ def generate_launch_description(): ], ) - f1tenth_one = IncludeLaunchDescription( - launch_description_source=PythonLaunchDescriptionSource( - os.path.join(pkg_f1tenth_bringup, 'simulation_bringup.launch.py')), - launch_arguments={ - 'name': 'f1tenth_one', - 'world': 'empty', - 'x': '0', - 'y': '0', - 'z': '5', - }.items() - ) - - f1tenth_two = IncludeLaunchDescription( - launch_description_source=PythonLaunchDescriptionSource( - os.path.join(pkg_f1tenth_bringup, 'simulation_bringup.launch.py')), - launch_arguments={ - 'name': 'f1tenth_two', - 'world': 'empty', - 'x': '0', - 'y': '1', - 'z': '5', - }.items() - ) - - #TODO: dynamically change car name - #TODO: This doesn't work yet - #TODO: Create CarWall Reset reset = Node( package='environments', executable='CarBeatReset', output='screen', ) - - ld = LaunchDescription([ track_arg, + car_one, + car_two, OpaqueFunction(function=launch), service_bridge, reset, - f1tenth_one, - f1tenth_two, ]) return ld \ No newline 
at end of file diff --git a/src/reinforcement_learning/config/train.yaml b/src/reinforcement_learning/config/train.yaml index 9687b2d8..bfefbafc 100644 --- a/src/reinforcement_learning/config/train.yaml +++ b/src/reinforcement_learning/config/train.yaml @@ -1,25 +1,6 @@ train: ros__parameters: - environment: 'CarTrack' # CarGoal, CarWall, CarBlock, CarTrack, CarBeat + environment: 'CarBeat' # CarGoal, CarWall, CarBlock, CarTrack, CarBeat track: 'multi_track' # track_1, track_2, track_3, multi_track, multi_track_testing -> only applies for CarTrack - algorithm: 'TD3' - max_steps_exploration: 1000 - max_steps_training: 1000000 - reward_range: 2.0 - collision_range: 0.2 - observation_mode: 'lidar_only' - evaluate_every_n_steps: 2000 - evaluate_for_m_episodes: 3 - # actor_path & critic_path must exist for retrain.sh, it can't be commented - # actor_path: 'rl_logs/23_08_02_17:59:13/models/actor_checkpoint.pht' - # critic_path: 'rl_logs/23_08_02_17:59:13/models/critic_checkpoint.pht' - gamma: 0.95 - # tau: 0.005 - g: 1 - # batch_size: 32 - # buffer_size: 1000000 - actor_lr: 0.0001 - critic_lr: 0.001 - # max_steps: 10 - step_length: 0.1 - # seed: 123 + car_name: 'New_Car' + ftg_car_name: 'New_Car_two' diff --git a/src/reinforcement_learning/launch/train.launch.py b/src/reinforcement_learning/launch/train.launch.py index 09d5d3d3..cb3c0345 100644 --- a/src/reinforcement_learning/launch/train.launch.py +++ b/src/reinforcement_learning/launch/train.launch.py @@ -32,6 +32,8 @@ def generate_launch_description(): os.path.join(pkg_environments, f'{env_launch[env]}.launch.py')), launch_arguments={ 'track': TextSubstitution(text=str(config['train']['ros__parameters']['track'])), + 'car_one': TextSubstitution(text=str(config['train']['ros__parameters']['car_name']) if 'car_name' in config['train']['ros__parameters'] else 'f1tenth'), + 'car_two': TextSubstitution(text=str(config['train']['ros__parameters']['ftg_car_name']) if 'ftg_car_name' in config['train']['ros__parameters'] else 'ftg_car'), }.items() #TODO: this doesn't do anything ) diff --git a/src/reinforcement_learning/reinforcement_learning/EnvironmentFactory.py b/src/reinforcement_learning/reinforcement_learning/EnvironmentFactory.py new file mode 100644 index 00000000..f726efc3 --- /dev/null +++ b/src/reinforcement_learning/reinforcement_learning/EnvironmentFactory.py @@ -0,0 +1,44 @@ +from environments.CarBlockEnvironment import CarBlockEnvironment +from environments.CarGoalEnvironment import CarGoalEnvironment +from environments.CarTrackEnvironment import CarTrackEnvironment +from environments.CarWallEnvironment import CarWallEnvironment +from environments.CarBeatEnvironment import CarBeatEnvironment + +class EnvironmentFactory: + def __init__(self): + pass + + def create(self, name, config): + print(config) + if name == 'CarBlock': + return CarBlockEnvironment() + elif name == 'CarGoal': + return CarGoalEnvironment() + elif name == 'CarTrack': + return CarTrackEnvironment( + config['car_name'], + config['reward_range'], + config['max_steps'], + config['collision_range'], + config['step_length'], + config['track'], + config['observation_mode'], + config['max_goals'] + ) + elif name == 'CarWall': + return CarWallEnvironment() + elif name == 'CarBeat': + return CarBeatEnvironment( + config['car_name'], + config['ftg_car_name'], + config['reward_range'], + config['max_steps'], + config['collision_range'], + config['step_length'], + config['track'], + config['observation_mode'], + config['max_goals'], + config['num_lidar_points'] + ) + else: + 
raise Exception('EnvironmentFactory: Environment not found') diff --git a/src/reinforcement_learning/reinforcement_learning/parse_args.py b/src/reinforcement_learning/reinforcement_learning/parse_args.py index 3fa37fb1..4e88be1e 100644 --- a/src/reinforcement_learning/reinforcement_learning/parse_args.py +++ b/src/reinforcement_learning/reinforcement_learning/parse_args.py @@ -18,12 +18,16 @@ def __declare_params(): [ # Environment Parameters --------------------------- ('environment', 'CarGoal'), - ('track', 'track_1'), + ('car_name', 'f1tenth'), + ('ftg_car_name', 'ftg_car'), + ('track', 'multi_track'), ('max_steps', 100), ('step_length', 0.25), ('reward_range', 0.2), ('collision_range', 0.2), ('observation_mode', 'no_position'), + ('max_goals', 500), + ('num_lidar_points', 10), # Algorithm Parameters ----------------------------- ('g', 10), @@ -54,12 +58,16 @@ def __get_env_params(param_node: Node): params: list(Parameter) = param_node.get_parameters([ 'environment', + 'car_name', 'track', 'max_steps', 'step_length', 'reward_range', 'collision_range', - 'observation_mode' + 'observation_mode', + 'max_goals', + 'ftg_car_name', + 'num_lidar_points' ]) # Convert to Dictionary diff --git a/src/reinforcement_learning/reinforcement_learning/updated_train.py b/src/reinforcement_learning/reinforcement_learning/updated_train.py index 3fc5dd04..3501f79d 100644 --- a/src/reinforcement_learning/reinforcement_learning/updated_train.py +++ b/src/reinforcement_learning/reinforcement_learning/updated_train.py @@ -12,13 +12,8 @@ from cares_reinforcement_learning.util.NetworkFactory import NetworkFactory from cares_reinforcement_learning.util import helpers as hlp -from environments.CarBlockEnvironment import CarBlockEnvironment -from environments.CarGoalEnvironment import CarGoalEnvironment -from environments.CarTrackEnvironment import CarTrackEnvironment -from environments.CarWallEnvironment import CarWallEnvironment -from environments.CarBeatEnvironment import CarBeatEnvironment - from .parse_args import parse_args +from .EnvironmentFactory import EnvironmentFactory def main(): rclpy.init() @@ -42,3 +37,6 @@ def main(): f'{yaml.dump(network_config, default_flow_style=False)} \n' ) + env_factory = EnvironmentFactory() + env = env_factory.create(env_config['environment'], env_config) + From 0e9ac4fe768def262efd84f0e0a9e9d3c6da969e Mon Sep 17 00:00:00 2001 From: retinfai Date: Sat, 21 Oct 2023 15:22:15 +1300 Subject: [PATCH 08/38] feat: add CarWall to env factory --- .../reinforcement_learning/EnvironmentFactory.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/src/reinforcement_learning/reinforcement_learning/EnvironmentFactory.py b/src/reinforcement_learning/reinforcement_learning/EnvironmentFactory.py index f726efc3..4953511e 100644 --- a/src/reinforcement_learning/reinforcement_learning/EnvironmentFactory.py +++ b/src/reinforcement_learning/reinforcement_learning/EnvironmentFactory.py @@ -26,7 +26,13 @@ def create(self, name, config): config['max_goals'] ) elif name == 'CarWall': - return CarWallEnvironment() + return CarWallEnvironment( + config['car_name'], + config['reward_range'], + config['max_steps'], + config['collision_range'], + config['step_length'] + ) elif name == 'CarBeat': return CarBeatEnvironment( config['car_name'], From 86106378adfe469ca326658c935af6e13ee0c5fb Mon Sep 17 00:00:00 2001 From: retinfai Date: Sat, 21 Oct 2023 15:29:13 +1300 Subject: [PATCH 09/38] feat: add CarGoal and CarBlock to env factory --- .../reinforcement_learning/EnvironmentFactory.py 
| 15 +++++++++++++-- 1 file changed, 13 insertions(+), 2 deletions(-) diff --git a/src/reinforcement_learning/reinforcement_learning/EnvironmentFactory.py b/src/reinforcement_learning/reinforcement_learning/EnvironmentFactory.py index 4953511e..30d1f94a 100644 --- a/src/reinforcement_learning/reinforcement_learning/EnvironmentFactory.py +++ b/src/reinforcement_learning/reinforcement_learning/EnvironmentFactory.py @@ -11,9 +11,20 @@ def __init__(self): def create(self, name, config): print(config) if name == 'CarBlock': - return CarBlockEnvironment() + return CarBlockEnvironment( + config['car_name'], + config['reward_range'], + config['max_steps'], + config['collision_range'], + config['step_length'] + ) elif name == 'CarGoal': - return CarGoalEnvironment() + return CarGoalEnvironment( + config['car_name'], + config['reward_range'], + config['max_steps'], + config['step_length'] + ) elif name == 'CarTrack': return CarTrackEnvironment( config['car_name'], From 1d7c56a8d17b4121d2ffcf968f158fb7ad4a8833 Mon Sep 17 00:00:00 2001 From: retinfai Date: Tue, 24 Oct 2023 08:07:04 +1300 Subject: [PATCH 10/38] feat: integrate Network and Memory factory --- .../reinforcement_learning/updated_train.py | 27 +++++++++++++++++-- 1 file changed, 25 insertions(+), 2 deletions(-) diff --git a/src/reinforcement_learning/reinforcement_learning/updated_train.py b/src/reinforcement_learning/reinforcement_learning/updated_train.py index 3501f79d..65bbd0e4 100644 --- a/src/reinforcement_learning/reinforcement_learning/updated_train.py +++ b/src/reinforcement_learning/reinforcement_learning/updated_train.py @@ -11,6 +11,7 @@ from cares_reinforcement_learning.util.Record import Record from cares_reinforcement_learning.util.NetworkFactory import NetworkFactory from cares_reinforcement_learning.util import helpers as hlp +import cares_reinforcement_learning.util.configurations as cfg from .parse_args import parse_args from .EnvironmentFactory import EnvironmentFactory @@ -26,8 +27,6 @@ def main(): np.random.seed(algorithm_config['seed']) random.seed(algorithm_config['seed']) - DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu') - print( f'Environment Config: ------------------------------------- \n' f'{yaml.dump(env_config, default_flow_style=False)} \n' @@ -38,5 +37,29 @@ def main(): ) env_factory = EnvironmentFactory() + network_factory = NetworkFactory() + + match network_config['algorithm']: + case 'PPO': + config = cfg.PPOConfig(**network_config) + case 'DDPG': + config = cfg.DDPGConfig(**network_config) + case 'SAC': + config = cfg.SACConfig(**network_config) + case 'TD3': + config = cfg.TD3Config(**network_config) + case _: + raise Exception(f'Algorithm {network_config["algorithm"]} not implemented') + + env = env_factory.create(env_config['environment'], env_config) + agent = network_factory.create_network(env.OBSERVATION_SIZE, env.ACTION_NUM, config=config) + memory = MemoryBuffer(algorithm_config['buffer_size'], env.OBSERVATION_SIZE, env.ACTION_NUM) + + # TODO: Load Actor and Critic if passed. 
Only load if both are passed + + + + + From 28b712c656292a6691c3d9a2bc03b2f7453b30d0 Mon Sep 17 00:00:00 2001 From: retinfai Date: Tue, 24 Oct 2023 08:37:01 +1300 Subject: [PATCH 11/38] feat: add off policy training loop --- .../reinforcement_learning/updated_train.py | 89 +++++++++++++++++-- 1 file changed, 83 insertions(+), 6 deletions(-) diff --git a/src/reinforcement_learning/reinforcement_learning/updated_train.py b/src/reinforcement_learning/reinforcement_learning/updated_train.py index 65bbd0e4..fa89691f 100644 --- a/src/reinforcement_learning/reinforcement_learning/updated_train.py +++ b/src/reinforcement_learning/reinforcement_learning/updated_train.py @@ -1,5 +1,5 @@ import random -import time +from datetime import datetime import yaml import numpy as np @@ -56,10 +56,87 @@ def main(): agent = network_factory.create_network(env.OBSERVATION_SIZE, env.ACTION_NUM, config=config) memory = MemoryBuffer(algorithm_config['buffer_size'], env.OBSERVATION_SIZE, env.ACTION_NUM) - # TODO: Load Actor and Critic if passed. Only load if both are passed - - - - + record = Record( + glob_log_dir='training_logs', + log_dir= f"{network_config['algorithm']}-{env_config['environment']}-{datetime.now().strftime('%y_%m_%d_%H:%M:%S')}", + algorithm=network_config['algorithm'], + task=env_config['environment'], + network=agent + ) + # TODO: Load Actor and Critic if passed. Only load if both are passed + off_policy_train(env, agent, memory, record, algorithm_config) + + +def off_policy_train(env, agent, memory, record, algorithm_config): + + max_steps_training = algorithm_config['max_steps_training'] + max_steps_exploration = algorithm_config['max_steps_exploration'] + num_eps_evaluation = algorithm_config['evaluate_for_m_episodes'] + evaluate_every_n_steps = algorithm_config['evaluate_every_n_steps'] + + batch_size = algorithm_config['batch_size'] + G = algorithm_config['g'] + + episode_timesteps = 0 + episode_reward = 0 + episode_num = 0 + + evaluate = False + + state, _ = env.reset() + + for step_counter in range(max_steps_training): + episode_timesteps += 1 + + if step_counter < max_steps_exploration: + env.get_logger().info(f'Exploration Step #{step_counter} out of {max_steps_exploration}') + action_env = np.random.uniform(env.MIN_ACTIONS, env.MAX_ACTIONS, env.ACTION_NUM) + action = hlp.normalize(action_env, env.MAX_ACTIONS, env.MIN_ACTIONS) + else: + action = agent.select_action_from_policy(state) + action_env = hlp.denormalize(action, env.MAX_ACTIONS, env.MIN_ACTIONS) + + next_state, reward, done, truncated, info = env.step(action_env) + memory.add(state=state, action=action, reward=reward, next_state=next_state, done=done) + + state = next_state + episode_reward += reward + + if step_counter >= max_steps_exploration: + for i in range(G): + experience = memory.sample(batch_size) + info = agent.train_policy(( + experience['state'], + experience['action'], + experience['reward'], + experience['next_state'], + experience['done'] + )) + memory.update_priorities(experience['indices'], info) + + if step_counter % evaluate_every_n_steps == 0: + evaluate = True + + if done or truncated: + env.get_logger().info(f'Episode #{episode_num} completed with {episode_timesteps} steps taken and a Reward= {episode_reward:.3f}') + + record.log_train( + total_steps = step_counter + 1, + episode = episode_num + 1, + episode_steps=episode_timesteps, + episode_reward = episode_reward, + display = True + ) + + if evaluate: + evaluate = False + # off_policy_evaluate(env, agent, record, algorithm_config, network_config) + 
print('Evaluated') + + # Reset environment + state, _ = env.reset() + episode_reward = 0 + episode_timesteps = 0 + episode_num += 1 From 20e7ffef122f0d0fd84438c3ac806b62482a209d Mon Sep 17 00:00:00 2001 From: retinfai Date: Tue, 24 Oct 2023 08:48:54 +1300 Subject: [PATCH 12/38] feat: add off policy evaluation loop --- .../reinforcement_learning/updated_train.py | 44 ++++++++++++++++++- 1 file changed, 42 insertions(+), 2 deletions(-) diff --git a/src/reinforcement_learning/reinforcement_learning/updated_train.py b/src/reinforcement_learning/reinforcement_learning/updated_train.py index fa89691f..42fb6ec6 100644 --- a/src/reinforcement_learning/reinforcement_learning/updated_train.py +++ b/src/reinforcement_learning/reinforcement_learning/updated_train.py @@ -132,11 +132,51 @@ def off_policy_train(env, agent, memory, record, algorithm_config): if evaluate: evaluate = False - # off_policy_evaluate(env, agent, record, algorithm_config, network_config) - print('Evaluated') + + env.get_logger().info(f'*************--Begin Evaluation Loop--*************') + off_policy_evaluate(env, agent, num_eps_evaluation, record, step_counter) + env.get_logger().info(f'*************--End Evaluation Loop--*************') + # Reset environment state, _ = env.reset() episode_reward = 0 episode_timesteps = 0 episode_num += 1 + +def off_policy_evaluate(env, agent, eval_episodes, record, steps_counter): + + episode_reward = 0 + episode_timesteps = 0 + episode_num = 0 + + for episode_num in range(eval_episodes): + state, _ = env.reset() + done = False + truncated = False + + while not done and not truncated: + episode_timesteps += 1 + action = agent.select_action_from_policy(state, evaluation=True) + action_env = hlp.denormalize(action, env.MAX_ACTIONS, env.MIN_ACTIONS) + + next_state, reward, done, truncated, _ = env.step(action_env) + + state = next_state + episode_reward += reward + + if done or truncated: + record.log_eval( + total_steps = steps_counter + 1, + episode = episode_num + 1, + episode_steps=episode_timesteps, + episode_reward = episode_reward, + display = True + ) + + # Reset environment + state, _ = env.reset() + episode_reward = 0 + episode_timesteps = 0 + episode_num += 1 + break From efee927faecf6bc7cba2b9fec444e876b2deae07 Mon Sep 17 00:00:00 2001 From: retinfai Date: Tue, 24 Oct 2023 08:54:47 +1300 Subject: [PATCH 13/38] refactor: move training loops to module --- .../reinforcement_learning/training_loops.py | 115 ++++++++++++++++++ .../reinforcement_learning/updated_train.py | 113 +---------------- 2 files changed, 117 insertions(+), 111 deletions(-) create mode 100644 src/reinforcement_learning/reinforcement_learning/training_loops.py diff --git a/src/reinforcement_learning/reinforcement_learning/training_loops.py b/src/reinforcement_learning/reinforcement_learning/training_loops.py new file mode 100644 index 00000000..9bd02986 --- /dev/null +++ b/src/reinforcement_learning/reinforcement_learning/training_loops.py @@ -0,0 +1,115 @@ +import numpy as np + +from cares_reinforcement_learning.util import helpers as hlp + +def off_policy_train(env, agent, memory, record, algorithm_config): + + max_steps_training = algorithm_config['max_steps_training'] + max_steps_exploration = algorithm_config['max_steps_exploration'] + num_eps_evaluation = algorithm_config['evaluate_for_m_episodes'] + evaluate_every_n_steps = algorithm_config['evaluate_every_n_steps'] + + batch_size = algorithm_config['batch_size'] + G = algorithm_config['g'] + + episode_timesteps = 0 + episode_reward = 0 + episode_num = 0 + + 
evaluate = False + + state, _ = env.reset() + + for step_counter in range(max_steps_training): + episode_timesteps += 1 + + if step_counter < max_steps_exploration: + env.get_logger().info(f'Exploration Step #{step_counter} out of {max_steps_exploration}') + action_env = np.random.uniform(env.MIN_ACTIONS, env.MAX_ACTIONS, env.ACTION_NUM) + action = hlp.normalize(action_env, env.MAX_ACTIONS, env.MIN_ACTIONS) + else: + action = agent.select_action_from_policy(state) + action_env = hlp.denormalize(action, env.MAX_ACTIONS, env.MIN_ACTIONS) + + next_state, reward, done, truncated, info = env.step(action_env) + memory.add(state=state, action=action, reward=reward, next_state=next_state, done=done) + + state = next_state + episode_reward += reward + + if step_counter >= max_steps_exploration: + for i in range(G): + experience = memory.sample(batch_size) + info = agent.train_policy(( + experience['state'], + experience['action'], + experience['reward'], + experience['next_state'], + experience['done'] + )) + memory.update_priorities(experience['indices'], info) + + if step_counter % evaluate_every_n_steps == 0: + evaluate = True + + if done or truncated: + env.get_logger().info(f'Episode #{episode_num} completed with {episode_timesteps} steps taken and a Reward= {episode_reward:.3f}') + + record.log_train( + total_steps = step_counter + 1, + episode = episode_num + 1, + episode_steps=episode_timesteps, + episode_reward = episode_reward, + display = True + ) + + if evaluate: + evaluate = False + + env.get_logger().info(f'*************--Begin Evaluation Loop--*************') + off_policy_evaluate(env, agent, num_eps_evaluation, record, step_counter) + env.get_logger().info(f'*************--End Evaluation Loop--*************') + + + # Reset environment + state, _ = env.reset() + episode_reward = 0 + episode_timesteps = 0 + episode_num += 1 + +def off_policy_evaluate(env, agent, eval_episodes, record, steps_counter): + + episode_reward = 0 + episode_timesteps = 0 + episode_num = 0 + + for episode_num in range(eval_episodes): + state, _ = env.reset() + done = False + truncated = False + + while not done and not truncated: + episode_timesteps += 1 + action = agent.select_action_from_policy(state, evaluation=True) + action_env = hlp.denormalize(action, env.MAX_ACTIONS, env.MIN_ACTIONS) + + next_state, reward, done, truncated, _ = env.step(action_env) + + state = next_state + episode_reward += reward + + if done or truncated: + record.log_eval( + total_steps = steps_counter + 1, + episode = episode_num + 1, + episode_steps=episode_timesteps, + episode_reward = episode_reward, + display = True + ) + + # Reset environment + state, _ = env.reset() + episode_reward = 0 + episode_timesteps = 0 + episode_num += 1 + break \ No newline at end of file diff --git a/src/reinforcement_learning/reinforcement_learning/updated_train.py b/src/reinforcement_learning/reinforcement_learning/updated_train.py index 42fb6ec6..4c9c9d8c 100644 --- a/src/reinforcement_learning/reinforcement_learning/updated_train.py +++ b/src/reinforcement_learning/reinforcement_learning/updated_train.py @@ -15,6 +15,7 @@ from .parse_args import parse_args from .EnvironmentFactory import EnvironmentFactory +from .training_loops import off_policy_train def main(): rclpy.init() @@ -69,114 +70,4 @@ def main(): off_policy_train(env, agent, memory, record, algorithm_config) -def off_policy_train(env, agent, memory, record, algorithm_config): - - max_steps_training = algorithm_config['max_steps_training'] - max_steps_exploration = 
algorithm_config['max_steps_exploration'] - num_eps_evaluation = algorithm_config['evaluate_for_m_episodes'] - evaluate_every_n_steps = algorithm_config['evaluate_every_n_steps'] - - batch_size = algorithm_config['batch_size'] - G = algorithm_config['g'] - - episode_timesteps = 0 - episode_reward = 0 - episode_num = 0 - - evaluate = False - - state, _ = env.reset() - - for step_counter in range(max_steps_training): - episode_timesteps += 1 - - if step_counter < max_steps_exploration: - env.get_logger().info(f'Exploration Step #{step_counter} out of {max_steps_exploration}') - action_env = np.random.uniform(env.MIN_ACTIONS, env.MAX_ACTIONS, env.ACTION_NUM) - action = hlp.normalize(action_env, env.MAX_ACTIONS, env.MIN_ACTIONS) - else: - action = agent.select_action_from_policy(state) - action_env = hlp.denormalize(action, env.MAX_ACTIONS, env.MIN_ACTIONS) - - next_state, reward, done, truncated, info = env.step(action_env) - memory.add(state=state, action=action, reward=reward, next_state=next_state, done=done) - - state = next_state - episode_reward += reward - - if step_counter >= max_steps_exploration: - for i in range(G): - experience = memory.sample(batch_size) - info = agent.train_policy(( - experience['state'], - experience['action'], - experience['reward'], - experience['next_state'], - experience['done'] - )) - memory.update_priorities(experience['indices'], info) - - if step_counter % evaluate_every_n_steps == 0: - evaluate = True - - if done or truncated: - env.get_logger().info(f'Episode #{episode_num} completed with {episode_timesteps} steps taken and a Reward= {episode_reward:.3f}') - - record.log_train( - total_steps = step_counter + 1, - episode = episode_num + 1, - episode_steps=episode_timesteps, - episode_reward = episode_reward, - display = True - ) - - if evaluate: - evaluate = False - - env.get_logger().info(f'*************--Begin Evaluation Loop--*************') - off_policy_evaluate(env, agent, num_eps_evaluation, record, step_counter) - env.get_logger().info(f'*************--End Evaluation Loop--*************') - - - # Reset environment - state, _ = env.reset() - episode_reward = 0 - episode_timesteps = 0 - episode_num += 1 - -def off_policy_evaluate(env, agent, eval_episodes, record, steps_counter): - - episode_reward = 0 - episode_timesteps = 0 - episode_num = 0 - - for episode_num in range(eval_episodes): - state, _ = env.reset() - done = False - truncated = False - - while not done and not truncated: - episode_timesteps += 1 - action = agent.select_action_from_policy(state, evaluation=True) - action_env = hlp.denormalize(action, env.MAX_ACTIONS, env.MIN_ACTIONS) - - next_state, reward, done, truncated, _ = env.step(action_env) - - state = next_state - episode_reward += reward - - if done or truncated: - record.log_eval( - total_steps = steps_counter + 1, - episode = episode_num + 1, - episode_steps=episode_timesteps, - episode_reward = episode_reward, - display = True - ) - - # Reset environment - state, _ = env.reset() - episode_reward = 0 - episode_timesteps = 0 - episode_num += 1 - break + From 8b302b41d0dea600cd57bf6ad38db0797de2145c Mon Sep 17 00:00:00 2001 From: retinfai Date: Tue, 24 Oct 2023 08:56:19 +1300 Subject: [PATCH 14/38] chore: add safe guards to algorithm training loop --- .../reinforcement_learning/updated_train.py | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/src/reinforcement_learning/reinforcement_learning/updated_train.py b/src/reinforcement_learning/reinforcement_learning/updated_train.py index 4c9c9d8c..55f42a3d 
100644 --- a/src/reinforcement_learning/reinforcement_learning/updated_train.py +++ b/src/reinforcement_learning/reinforcement_learning/updated_train.py @@ -67,7 +67,14 @@ def main(): ) # TODO: Load Actor and Critic if passed. Only load if both are passed - off_policy_train(env, agent, memory, record, algorithm_config) + + match agent.type: + case 'policy': + off_policy_train(env, agent, memory, record, algorithm_config) + case 'ppo': + raise Exception('PPO Training not implemented') + case _: + raise Exception(f'Agent type {agent.type} not supported') From 33ffc134d65cf7e68033b2036ad08a4f65e2f30e Mon Sep 17 00:00:00 2001 From: retinfai Date: Tue, 24 Oct 2023 09:15:55 +1300 Subject: [PATCH 15/38] feat: add ppo train and evaluate --- .../reinforcement_learning/training_loops.py | 105 +++++++++++++++++- .../reinforcement_learning/updated_train.py | 11 +- 2 files changed, 110 insertions(+), 6 deletions(-) diff --git a/src/reinforcement_learning/reinforcement_learning/training_loops.py b/src/reinforcement_learning/reinforcement_learning/training_loops.py index 9bd02986..35e5059d 100644 --- a/src/reinforcement_learning/reinforcement_learning/training_loops.py +++ b/src/reinforcement_learning/reinforcement_learning/training_loops.py @@ -49,7 +49,7 @@ def off_policy_train(env, agent, memory, record, algorithm_config): )) memory.update_priorities(experience['indices'], info) - if step_counter % evaluate_every_n_steps == 0: + if (step_counter+1) % evaluate_every_n_steps == 0: evaluate = True if done or truncated: @@ -98,6 +98,109 @@ def off_policy_evaluate(env, agent, eval_episodes, record, steps_counter): state = next_state episode_reward += reward + if done or truncated: + record.log_eval( + total_steps = steps_counter + 1, + episode = episode_num + 1, + episode_steps=episode_timesteps, + episode_reward = episode_reward, + display = True + ) + + # Reset environment + state, _ = env.reset() + episode_reward = 0 + episode_timesteps = 0 + episode_num += 1 + break + + + +def ppo_train(env, agent, memory, record, algorithm_config): + + max_steps_training = algorithm_config['max_steps_training'] + max_steps_per_batch = algorithm_config['max_steps_per_batch'] + num_eps_evaluation = algorithm_config['evaluate_for_m_episodes'] + evaluate_every_n_steps = algorithm_config['evaluate_every_n_steps'] + + episode_timesteps = 0 + episode_num = 0 + episode_reward = 0 + + evaluate = False + + state, _ = env.reset() + + for step_counter in range(max_steps_training): + episode_timesteps += 1 + + action, log_prob = agent.select_action_from_policy(state) + action_env = hlp.denormalize(action, env.MAX_ACTIONS, env.MIN_ACTIONS) + + next_state, reward, done, truncated, info = env.step(action_env) + memory.add(state=state, action=action, reward=reward, next_state=next_state, done=done, log_prob=log_prob) + + state = next_state + episode_reward += reward + + if (step_counter+1) % max_steps_per_batch == 0: + experience = memory.flush() + info = agent.train_policy(( + experience['state'], + experience['action'], + experience['reward'], + experience['next_state'], + experience['done'], + experience['log_prob'] + )) + + if (step_counter+1) % evaluate_every_n_steps == 0: + evaluate = True + + if done or truncated: + record.log_train( + total_steps = step_counter + 1, + episode = episode_num + 1, + episode_steps=episode_timesteps, + episode_reward = episode_reward, + display = True + ) + + if evaluate: + evaluate = False + + env.get_logger().info(f'*************--Begin Evaluation Loop--*************') + ppo_evaluate(env, agent, 
num_eps_evaluation, record, step_counter) + env.get_logger().info(f'*************--End Evaluation Loop--*************') + + + # Reset environment + state, _ = env.reset() + episode_reward = 0 + episode_timesteps = 0 + episode_num += 1 + +def ppo_evaluate(env, agent, eval_episodes, record, steps_counter): + + episode_reward = 0 + episode_timesteps = 0 + episode_num = 0 + + for episode_num in range(eval_episodes): + state, _ = env.reset() + done = False + truncated = False + + while not done and not truncated: + episode_timesteps += 1 + action, _ = agent.select_action_from_policy(state) + action_env = hlp.denormalize(action, env.MAX_ACTIONS, env.MIN_ACTIONS) + + next_state, reward, done, truncated, _ = env.step(action_env) + + state = next_state + episode_reward += reward + if done or truncated: record.log_eval( total_steps = steps_counter + 1, diff --git a/src/reinforcement_learning/reinforcement_learning/updated_train.py b/src/reinforcement_learning/reinforcement_learning/updated_train.py index 55f42a3d..60a1b9c2 100644 --- a/src/reinforcement_learning/reinforcement_learning/updated_train.py +++ b/src/reinforcement_learning/reinforcement_learning/updated_train.py @@ -10,12 +10,11 @@ from cares_reinforcement_learning.memory import MemoryBuffer from cares_reinforcement_learning.util.Record import Record from cares_reinforcement_learning.util.NetworkFactory import NetworkFactory -from cares_reinforcement_learning.util import helpers as hlp import cares_reinforcement_learning.util.configurations as cfg from .parse_args import parse_args from .EnvironmentFactory import EnvironmentFactory -from .training_loops import off_policy_train +from .training_loops import off_policy_train, ppo_train def main(): rclpy.init() @@ -70,9 +69,11 @@ def main(): match agent.type: case 'policy': - off_policy_train(env, agent, memory, record, algorithm_config) - case 'ppo': - raise Exception('PPO Training not implemented') + + if network_config['algorithm'] == 'PPO': + ppo_train(env, agent, memory, record, algorithm_config) + else: + off_policy_train(env, agent, memory, record, algorithm_config) case _: raise Exception(f'Agent type {agent.type} not supported') From a88e564efe599c809268e36be067f69bd595205b Mon Sep 17 00:00:00 2001 From: retinfai Date: Tue, 24 Oct 2023 09:39:24 +1300 Subject: [PATCH 16/38] feat: make carbeat car names dynamic --- src/environment_interfaces/srv/CarBeatReset.srv | 2 ++ src/environments/environments/CarBeatEnvironment.py | 2 ++ src/environments/environments/CarBeatReset.py | 4 ++-- 3 files changed, 6 insertions(+), 2 deletions(-) diff --git a/src/environment_interfaces/srv/CarBeatReset.srv b/src/environment_interfaces/srv/CarBeatReset.srv index 6d8d5b61..202f6a44 100644 --- a/src/environment_interfaces/srv/CarBeatReset.srv +++ b/src/environment_interfaces/srv/CarBeatReset.srv @@ -1,8 +1,10 @@ float64 gx # goal x float64 gy # goal y +string car_one float64 cx_one # car_one x float64 cy_one # car_one y float64 cyaw_one # car_one yaw +string car_two float64 cx_two # car_two x float64 cy_two # car_two y float64 cyaw_two # car_two yaw diff --git a/src/environments/environments/CarBeatEnvironment.py b/src/environments/environments/CarBeatEnvironment.py index 5d86c316..09f10b4c 100644 --- a/src/environments/environments/CarBeatEnvironment.py +++ b/src/environments/environments/CarBeatEnvironment.py @@ -270,10 +270,12 @@ def call_reset_service(self, request.gx = goal_x request.gy = goal_y + request.car_one = self.NAME request.cx_one = car_x request.cy_one = car_y request.cyaw_one = car_Y + 
request.car_two = self.OTHER_CAR_NAME request.cx_two = ftg_x request.cy_two = ftg_y request.cyaw_two = ftg_Y diff --git a/src/environments/environments/CarBeatReset.py b/src/environments/environments/CarBeatReset.py index fd6ae27a..b38ff404 100644 --- a/src/environments/environments/CarBeatReset.py +++ b/src/environments/environments/CarBeatReset.py @@ -37,8 +37,8 @@ def service_callback(self, request, response): # self.get_logger().info(f'Reset Service Request Received: relocating goal to x={request.cx_one} y={request.cy_one}') goal_req = self.create_request('goal', x=request.gx, y=request.gy, z=1) - car_req_one = self.create_request('f1tenth_one', x=request.cx_one, y=request.cy_one, z=0.1, yaw=request.cyaw_one) - car_req_two = self.create_request('f1tenth_two', x=request.cx_two, y=request.cy_two, z=0.1, yaw=request.cyaw_two) + car_req_one = self.create_request(request.car_one, x=request.cx_one, y=request.cy_one, z=0.1, yaw=request.cyaw_one) + car_req_two = self.create_request(request.car_two, x=request.cx_two, y=request.cy_two, z=0.1, yaw=request.cyaw_two) while not self.set_pose_client.wait_for_service(timeout_sec=1.0): self.get_logger().info('set_pose service not available, waiting again...') From 99d62fba5a506372ddaa0e487581003244a78d32 Mon Sep 17 00:00:00 2001 From: retinfai Date: Tue, 24 Oct 2023 10:00:48 +1300 Subject: [PATCH 17/38] feat: make car name dynamic CarTrack --- src/environment_interfaces/srv/Reset.srv | 1 + .../environments/CarTrackEnvironment.py | 7 ++-- .../environments/CarTrackReset.py | 6 +--- src/environments/launch/cartrack.launch.py | 33 +++++++++++-------- .../launch/train.launch.py | 1 + 5 files changed, 26 insertions(+), 22 deletions(-) diff --git a/src/environment_interfaces/srv/Reset.srv b/src/environment_interfaces/srv/Reset.srv index f8eb3eeb..8d6e4804 100644 --- a/src/environment_interfaces/srv/Reset.srv +++ b/src/environment_interfaces/srv/Reset.srv @@ -3,6 +3,7 @@ float64 gy # goal y float64 cx # car x float64 cy # car y float64 cyaw # car yaw +string car_name string flag # flag --- bool success \ No newline at end of file diff --git a/src/environments/environments/CarTrackEnvironment.py b/src/environments/environments/CarTrackEnvironment.py index cbc1703b..340ec1ac 100644 --- a/src/environments/environments/CarTrackEnvironment.py +++ b/src/environments/environments/CarTrackEnvironment.py @@ -26,7 +26,7 @@ def __init__(self, # Environment Details ---------------------------------------- self.MAX_STEPS_PER_GOAL = max_steps self.MAX_GOALS = max_goals - + match observation_mode: case 'no_position': self.OBSERVATION_SIZE = 6 + 10 @@ -81,7 +81,7 @@ def reset(self): self.sleep() goal_x, goal_y = self.goal_position - self.call_reset_service(car_x=car_x, car_y=car_y, car_Y=car_yaw, goal_x=goal_x, goal_y=goal_y) + self.call_reset_service(car_x=car_x, car_y=car_y, car_Y=car_yaw, goal_x=goal_x, goal_y=goal_y, car_name=self.NAME) observation, _ = self.get_observation() @@ -169,12 +169,13 @@ def compute_reward(self, state, next_state): # Utility Functions -------------------------------------------- - def call_reset_service(self, car_x, car_y, car_Y, goal_x, goal_y): + def call_reset_service(self, car_x, car_y, car_Y, goal_x, goal_y, car_name): """ Reset the car and goal position """ request = Reset.Request() + request.car_name = car_name request.gx = float(goal_x) request.gy = float(goal_y) request.cx = float(car_x) diff --git a/src/environments/environments/CarTrackReset.py b/src/environments/environments/CarTrackReset.py index bbd89e7d..0cd5f5dd 100644 --- 
a/src/environments/environments/CarTrackReset.py +++ b/src/environments/environments/CarTrackReset.py @@ -30,14 +30,11 @@ def __init__(self): while not self.set_pose_client.wait_for_service(timeout_sec=1.0): self.get_logger().info('set_pose service not available, waiting again...') - def service_callback(self, request, response): - # self.get_logger().info(f'Reset Service Request Received: relocating goal to x={request.x} y={request.y}') - goal_req = self.create_request('goal', x=request.gx, y=request.gy, z=1) - car_req = self.create_request('f1tenth', x=request.cx, y=request.cy, z=0, yaw=request.cyaw) + car_req = self.create_request(request.car_name, x=request.cx, y=request.cy, z=0, yaw=request.cyaw) while not self.set_pose_client.wait_for_service(timeout_sec=1.0): self.get_logger().info('set_pose service not available, waiting again...') @@ -49,7 +46,6 @@ def service_callback(self, request, response): self.set_pose_client.call(goal_req) self.set_pose_client.call(car_req) - # self.get_logger().info('Successfully Reset') response.success = True return response diff --git a/src/environments/launch/cartrack.launch.py b/src/environments/launch/cartrack.launch.py index be709d9c..5c26a544 100644 --- a/src/environments/launch/cartrack.launch.py +++ b/src/environments/launch/cartrack.launch.py @@ -9,8 +9,10 @@ def launch(context, *args, **kwargs): pkg_ros_gz_sim = get_package_share_directory('ros_gz_sim') pkg_environments = get_package_share_directory('environments') + pkg_f1tenth_bringup = get_package_share_directory('f1tenth_bringup') track = LaunchConfiguration('track').perform(context) + car_name = LaunchConfiguration('car_name').perform(context) gz_sim = IncludeLaunchDescription( launch_description_source=PythonLaunchDescriptionSource( @@ -20,16 +22,30 @@ def launch(context, *args, **kwargs): }.items() ) - return[ gz_sim] + f1tenth = IncludeLaunchDescription( + launch_description_source=PythonLaunchDescriptionSource( + os.path.join(pkg_f1tenth_bringup, 'simulation_bringup.launch.py')), + launch_arguments={ + 'name': car_name, + 'world': 'empty' + }.items() + ) + + + return[gz_sim, f1tenth] def generate_launch_description(): - pkg_f1tenth_bringup = get_package_share_directory('f1tenth_bringup') track_arg = DeclareLaunchArgument( 'track', default_value='track_1' ) + car_name = DeclareLaunchArgument( + 'car_name', + default_value='f1tenth' + ) + service_bridge = Node( package='ros_gz_bridge', executable='parameter_bridge', @@ -46,18 +62,7 @@ def generate_launch_description(): ], ) - f1tenth = IncludeLaunchDescription( - launch_description_source=PythonLaunchDescriptionSource( - os.path.join(pkg_f1tenth_bringup, 'simulation_bringup.launch.py')), - launch_arguments={ - 'name': 'f1tenth', - 'world': 'empty' - }.items() - ) - #TODO: dynamically change car name - #TODO: This doesn't work yet - #TODO: Create CarWall Reset reset = Node( package='environments', executable='CarTrackReset', @@ -69,5 +74,5 @@ def generate_launch_description(): OpaqueFunction(function=launch), service_bridge, reset, - f1tenth, + car_name, ]) \ No newline at end of file diff --git a/src/reinforcement_learning/launch/train.launch.py b/src/reinforcement_learning/launch/train.launch.py index cb3c0345..1af7937c 100644 --- a/src/reinforcement_learning/launch/train.launch.py +++ b/src/reinforcement_learning/launch/train.launch.py @@ -32,6 +32,7 @@ def generate_launch_description(): os.path.join(pkg_environments, f'{env_launch[env]}.launch.py')), launch_arguments={ 'track': 
TextSubstitution(text=str(config['train']['ros__parameters']['track'])), + 'car_name': TextSubstitution(text=str(config['train']['ros__parameters']['car_name'])), 'car_one': TextSubstitution(text=str(config['train']['ros__parameters']['car_name']) if 'car_name' in config['train']['ros__parameters'] else 'f1tenth'), 'car_two': TextSubstitution(text=str(config['train']['ros__parameters']['ftg_car_name']) if 'ftg_car_name' in config['train']['ros__parameters'] else 'ftg_car'), }.items() #TODO: this doesn't do anything From 44e103b475a82e7b8504e22fe3c0f492e197242a Mon Sep 17 00:00:00 2001 From: retinfai Date: Tue, 24 Oct 2023 10:08:10 +1300 Subject: [PATCH 18/38] feat: make CarBlock car name dynamic --- .../environments/CarBlockEnvironment.py | 2 +- .../environments/CarBlockReset.py | 2 +- src/environments/launch/carblock.launch.py | 39 +++++++++++++------ 3 files changed, 30 insertions(+), 13 deletions(-) diff --git a/src/environments/environments/CarBlockEnvironment.py b/src/environments/environments/CarBlockEnvironment.py index 47023106..100acbe8 100644 --- a/src/environments/environments/CarBlockEnvironment.py +++ b/src/environments/environments/CarBlockEnvironment.py @@ -7,7 +7,6 @@ from sensor_msgs.msg import LaserScan from environments.F1tenthEnvironment import F1tenthEnvironment -from environments.CarWallEnvironment import CarWallEnvironment from .util import reduce_lidar, process_odom, generate_position from .termination import reached_goal, has_collided, has_flipped_over @@ -82,6 +81,7 @@ def call_reset_service(self, goal_x, goal_y): req.gx = goal_x req.gy = goal_y + req.car_name = self.NAME future = self.reset_client.call_async(req) rclpy.spin_until_future_complete(future=future, node=self) diff --git a/src/environments/environments/CarBlockReset.py b/src/environments/environments/CarBlockReset.py index 39cfdd9a..f90a385d 100644 --- a/src/environments/environments/CarBlockReset.py +++ b/src/environments/environments/CarBlockReset.py @@ -43,7 +43,7 @@ def service_callback(self, request, response): # Move the goal to new position & car back to origin goal_req = self.create_request('goal', x=request.gx, y=request.gy, z=1) - car_req = self.create_request('f1tenth') + car_req = self.create_request(request.car_name) while not self.set_pose_client.wait_for_service(timeout_sec=1.0): self.get_logger().info('set_pose service not available, waiting again...') diff --git a/src/environments/launch/carblock.launch.py b/src/environments/launch/carblock.launch.py index a7a34be2..a762cb68 100644 --- a/src/environments/launch/carblock.launch.py +++ b/src/environments/launch/carblock.launch.py @@ -3,13 +3,38 @@ from launch_ros.actions import Node from launch import LaunchDescription from launch.launch_description_sources import PythonLaunchDescriptionSource -from launch.actions import IncludeLaunchDescription +from launch.actions import IncludeLaunchDescription, DeclareLaunchArgument, OpaqueFunction +from launch.substitutions import LaunchConfiguration + +def launch(context, *args, **kwargs): + pkg_f1tenth_bringup = get_package_share_directory('f1tenth_bringup') + + car_name = LaunchConfiguration('car_name').perform(context) + + f1tenth = IncludeLaunchDescription( + launch_description_source=PythonLaunchDescriptionSource( + os.path.join(pkg_f1tenth_bringup, 'simulation_bringup.launch.py')), + launch_arguments={ + 'name': car_name, + 'world': 'empty', + 'x': '0', + 'y': '0', + 'z': '5', + }.items() + ) + + return[f1tenth] def generate_launch_description(): pkg_ros_gz_sim = 
get_package_share_directory('ros_gz_sim') pkg_f1tenth_bringup = get_package_share_directory('f1tenth_bringup') pkg_environments = get_package_share_directory('environments') + car_name = DeclareLaunchArgument( + 'car_name', + default_value='f1tenth' + ) + service_bridge = Node( package='ros_gz_bridge', executable='parameter_bridge', @@ -26,15 +51,6 @@ def generate_launch_description(): ], ) - f1tenth = IncludeLaunchDescription( - launch_description_source=PythonLaunchDescriptionSource( - os.path.join(pkg_f1tenth_bringup, 'simulation_bringup.launch.py')), - launch_arguments={ - 'name': 'f1tenth', - 'world': 'empty' - }.items() - ) - gz_sim = IncludeLaunchDescription( launch_description_source=PythonLaunchDescriptionSource( os.path.join(pkg_ros_gz_sim, 'launch', 'gz_sim.launch.py')), @@ -51,7 +67,8 @@ def generate_launch_description(): return LaunchDescription([ gz_sim, - f1tenth, + car_name, + OpaqueFunction(function=launch), service_bridge, reset, ]) \ No newline at end of file From 832fe241a3a71f45381be4b6ad779a73913c9e98 Mon Sep 17 00:00:00 2001 From: retinfai Date: Tue, 24 Oct 2023 10:12:49 +1300 Subject: [PATCH 19/38] feat: make CarWall car name dynamic --- .../environments/CarWallEnvironment.py | 3 +- src/environments/environments/CarWallReset.py | 2 +- src/environments/launch/carwall.launch.py | 42 ++++++++++++------- 3 files changed, 31 insertions(+), 16 deletions(-) diff --git a/src/environments/environments/CarWallEnvironment.py b/src/environments/environments/CarWallEnvironment.py index 61c92c7e..9d3dc028 100644 --- a/src/environments/environments/CarWallEnvironment.py +++ b/src/environments/environments/CarWallEnvironment.py @@ -81,7 +81,8 @@ def call_reset_service(self, goal_x, goal_y): req.gx = goal_x req.gy = goal_y - + req.car_name = self.NAME + future = self.reset_client.call_async(req) rclpy.spin_until_future_complete(future=future, node=self) diff --git a/src/environments/environments/CarWallReset.py b/src/environments/environments/CarWallReset.py index c4d80b81..d10a7f02 100644 --- a/src/environments/environments/CarWallReset.py +++ b/src/environments/environments/CarWallReset.py @@ -37,7 +37,7 @@ def service_callback(self, request, response): # self.get_logger().info(f'Reset Service Request Received: relocating goal to x={request.x} y={request.y}') goal_req = self.create_request('goal', x=request.gx, y=request.gy, z=1) - car_req = self.create_request('f1tenth') + car_req = self.create_request(request.car_name, x=0, y=0, z=0) while not self.set_pose_client.wait_for_service(timeout_sec=1.0): self.get_logger().info('set_pose service not available, waiting again...') diff --git a/src/environments/launch/carwall.launch.py b/src/environments/launch/carwall.launch.py index 17587eb0..a82d4383 100644 --- a/src/environments/launch/carwall.launch.py +++ b/src/environments/launch/carwall.launch.py @@ -3,13 +3,38 @@ from launch_ros.actions import Node from launch import LaunchDescription from launch.launch_description_sources import PythonLaunchDescriptionSource -from launch.actions import IncludeLaunchDescription +from launch.actions import IncludeLaunchDescription, DeclareLaunchArgument, OpaqueFunction +from launch.substitutions import LaunchConfiguration + +def launch(context, *args, **kwargs): + pkg_f1tenth_bringup = get_package_share_directory('f1tenth_bringup') + + car_name = LaunchConfiguration('car_name').perform(context) + + f1tenth = IncludeLaunchDescription( + launch_description_source=PythonLaunchDescriptionSource( + os.path.join(pkg_f1tenth_bringup, 
'simulation_bringup.launch.py')), + launch_arguments={ + 'name': car_name, + 'world': 'empty', + 'x': '0', + 'y': '0', + 'z': '5', + }.items() + ) + + return[f1tenth] def generate_launch_description(): pkg_ros_gz_sim = get_package_share_directory('ros_gz_sim') pkg_environments = get_package_share_directory('environments') pkg_f1tenth_bringup = get_package_share_directory('f1tenth_bringup') + car_name = DeclareLaunchArgument( + 'car_name', + default_value='f1tenth' + ) + service_bridge = Node( package='ros_gz_bridge', executable='parameter_bridge', @@ -26,15 +51,6 @@ def generate_launch_description(): ], ) - f1tenth = IncludeLaunchDescription( - launch_description_source=PythonLaunchDescriptionSource( - os.path.join(pkg_f1tenth_bringup, 'simulation_bringup.launch.py')), - launch_arguments={ - 'name': 'f1tenth', - 'world': 'empty' - }.items() - ) - gz_sim = IncludeLaunchDescription( launch_description_source=PythonLaunchDescriptionSource( os.path.join(pkg_ros_gz_sim, 'launch', 'gz_sim.launch.py')), @@ -43,9 +59,6 @@ def generate_launch_description(): }.items() ) - #TODO: dynamically change car name - #TODO: This doesn't work yet - #TODO: Create CarWall Reset reset = Node( package='environments', executable='CarWallReset', @@ -55,7 +68,8 @@ def generate_launch_description(): return LaunchDescription([ gz_sim, - f1tenth, + car_name, + OpaqueFunction(function=launch), service_bridge, reset, ]) \ No newline at end of file From c9081264a2461ac44b365b73fdb7e6805b238e17 Mon Sep 17 00:00:00 2001 From: retinfai Date: Tue, 24 Oct 2023 10:16:30 +1300 Subject: [PATCH 20/38] feat: make CarGoal car name dynamic --- .../environments/CarGoalEnvironment.py | 1 + src/environments/environments/CarGoalReset.py | 2 +- src/environments/launch/cargoal.launch.py | 41 +++++++++++++------ 3 files changed, 30 insertions(+), 14 deletions(-) diff --git a/src/environments/environments/CarGoalEnvironment.py b/src/environments/environments/CarGoalEnvironment.py index 495bd899..015d255b 100644 --- a/src/environments/environments/CarGoalEnvironment.py +++ b/src/environments/environments/CarGoalEnvironment.py @@ -72,6 +72,7 @@ def call_reset_service(self, goal_x, goal_y): req.gx = goal_x req.gy = goal_y + req.car_name = self.NAME future = self.reset_client.call_async(req) rclpy.spin_until_future_complete(future=future, node=self) diff --git a/src/environments/environments/CarGoalReset.py b/src/environments/environments/CarGoalReset.py index c42802ff..85facc0d 100644 --- a/src/environments/environments/CarGoalReset.py +++ b/src/environments/environments/CarGoalReset.py @@ -35,7 +35,7 @@ def __init__(self): def service_callback(self, request, response): goal_req = self.create_request('goal', x=request.gx, y=request.gy, z=1) - car_req = self.create_request('f1tenth') + car_req = self.create_request(request.car_name) while not self.set_pose_client.wait_for_service(timeout_sec=1.0): self.get_logger().info('set_pose service not available, waiting again...') diff --git a/src/environments/launch/cargoal.launch.py b/src/environments/launch/cargoal.launch.py index 07692810..efcab148 100644 --- a/src/environments/launch/cargoal.launch.py +++ b/src/environments/launch/cargoal.launch.py @@ -3,13 +3,38 @@ from launch_ros.actions import Node from launch import LaunchDescription from launch.launch_description_sources import PythonLaunchDescriptionSource -from launch.actions import IncludeLaunchDescription +from launch.actions import IncludeLaunchDescription, DeclareLaunchArgument, OpaqueFunction +from launch.substitutions import 
LaunchConfiguration + +def launch(context, *args, **kwargs): + pkg_f1tenth_bringup = get_package_share_directory('f1tenth_bringup') + + car_name = LaunchConfiguration('car_name').perform(context) + + f1tenth = IncludeLaunchDescription( + launch_description_source=PythonLaunchDescriptionSource( + os.path.join(pkg_f1tenth_bringup, 'simulation_bringup.launch.py')), + launch_arguments={ + 'name': car_name, + 'world': 'empty', + 'x': '0', + 'y': '0', + 'z': '5', + }.items() + ) + + return[f1tenth] def generate_launch_description(): pkg_ros_gz_sim = get_package_share_directory('ros_gz_sim') pkg_f1tenth_bringup = get_package_share_directory('f1tenth_bringup') pkg_environments = get_package_share_directory('environments') + car_name = DeclareLaunchArgument( + 'car_name', + default_value='f1tenth' + ) + service_bridge = Node( package='ros_gz_bridge', executable='parameter_bridge', @@ -26,15 +51,6 @@ def generate_launch_description(): ], ) - f1tenth = IncludeLaunchDescription( - launch_description_source=PythonLaunchDescriptionSource( - os.path.join(pkg_f1tenth_bringup, 'simulation_bringup.launch.py')), - launch_arguments={ - 'name': 'f1tenth', - 'world': 'empty' - }.items() - ) - gz_sim = IncludeLaunchDescription( launch_description_source=PythonLaunchDescriptionSource( os.path.join(pkg_ros_gz_sim, 'launch', 'gz_sim.launch.py')), @@ -43,8 +59,6 @@ def generate_launch_description(): }.items() ) - #TODO: dynamically change car name - #TODO: This doesn't work yet reset = Node( package='environments', executable='CarGoalReset', @@ -54,7 +68,8 @@ def generate_launch_description(): return LaunchDescription([ gz_sim, - f1tenth, + car_name, + OpaqueFunction(function=launch), service_bridge, reset, ]) \ No newline at end of file From a818bd37b707d7ce57c73db1dbb80a45c1ec7ab2 Mon Sep 17 00:00:00 2001 From: retinfai Date: Tue, 24 Oct 2023 10:32:29 +1300 Subject: [PATCH 21/38] chore: remove bash scripts --- helperScripts/0kill.sh | 5 ---- helperScripts/1manual.sh | 28 ---------------------- helperScripts/2train.sh | 30 ------------------------ helperScripts/3test.sh | 28 ---------------------- helperScripts/4openGZ.sh | 12 ---------- helperScripts/retrain.sh | 50 ---------------------------------------- 6 files changed, 153 deletions(-) delete mode 100755 helperScripts/0kill.sh delete mode 100755 helperScripts/1manual.sh delete mode 100755 helperScripts/2train.sh delete mode 100755 helperScripts/3test.sh delete mode 100755 helperScripts/4openGZ.sh delete mode 100755 helperScripts/retrain.sh diff --git a/helperScripts/0kill.sh b/helperScripts/0kill.sh deleted file mode 100755 index 2b7804ca..00000000 --- a/helperScripts/0kill.sh +++ /dev/null @@ -1,5 +0,0 @@ -#!/bin/bash - -pkill -f "ros" -pkill -f "gz" -pkill -f "ruby" diff --git a/helperScripts/1manual.sh b/helperScripts/1manual.sh deleted file mode 100755 index b6940d9a..00000000 --- a/helperScripts/1manual.sh +++ /dev/null @@ -1,28 +0,0 @@ -#!/bin/bash - -# Ctrl + C to exit the program -function cleanup() { - return -} - -# trap cleanup SIGINT SIGTERM -trap cleanup SIGINT SIGTERM - -cd ~/autonomous_f1tenth/ - -if [ -n "$1" ]; then - export GZ_PARTITION=$1 - export ROS_DOMAIN_ID=$1 -else - export GZ_PARTITION=50 - export ROS_DOMAIN_ID=50 -fi - -. 
install/setup.bash - -colcon build -ros2 launch reinforcement_learning sanity_check.launch.py & - -gz sim -g & - -wait diff --git a/helperScripts/2train.sh b/helperScripts/2train.sh deleted file mode 100755 index 382e530d..00000000 --- a/helperScripts/2train.sh +++ /dev/null @@ -1,30 +0,0 @@ -#!/bin/bash - -# Ctrl + C to exit the program -function cleanup() { - return -} - -# trap cleanup SIGINT SIGTERM -trap cleanup SIGINT SIGTERM - -cd ~/autonomous_f1tenth/ - -if [ -n "$1" ]; then - export GZ_PARTITION=$1 - export ROS_DOMAIN_ID=$1 -else - export GZ_PARTITION=100 - export ROS_DOMAIN_ID=100 -fi - -. install/setup.bash - -colcon build -ros2 launch reinforcement_learning train.launch.py & - -if [ "$1" == "y" ] || [ "$1" == "Y" ]; then - gz sim -g & -fi - -wait diff --git a/helperScripts/3test.sh b/helperScripts/3test.sh deleted file mode 100755 index d675e273..00000000 --- a/helperScripts/3test.sh +++ /dev/null @@ -1,28 +0,0 @@ -#!/bin/bash - -# Ctrl + C to exit the program -function cleanup() { - return -} - -# trap cleanup SIGINT SIGTERM -trap cleanup SIGINT SIGTERM - -cd ~/autonomous_f1tenth/ - -if [ -n "$1" ]; then - export GZ_PARTITION=$1 - export ROS_DOMAIN_ID=$1 -else - export GZ_PARTITION=150 - export ROS_DOMAIN_ID=150 -fi - -. install/setup.bash - -colcon build -ros2 launch reinforcement_learning test.launch.py & - -gz sim -g & - -wait diff --git a/helperScripts/4openGZ.sh b/helperScripts/4openGZ.sh deleted file mode 100755 index 932d40bf..00000000 --- a/helperScripts/4openGZ.sh +++ /dev/null @@ -1,12 +0,0 @@ -#!/bin/bash - -cd ~/autonomous_f1tenth/ - -if [ -z "$1" ]; then - echo "Please enter the GZ_PARTITION number" - echo "Usage: ./4openGZ.sh " - return -fi - -export GZ_PARTITION=$1 -gz sim -g diff --git a/helperScripts/retrain.sh b/helperScripts/retrain.sh deleted file mode 100755 index 5b7caf1d..00000000 --- a/helperScripts/retrain.sh +++ /dev/null @@ -1,50 +0,0 @@ -#!/bin/bash - -# Ctrl + C to exit the program -function cleanup() { - return -} - -# trap cleanup SIGINT SIGTERM -trap cleanup SIGINT SIGTERM - -cd .. - -# Time limit for training -if [ -z "$1" ]; then - echo "Please enter the time limit for training" - echo "Usage: ./retrain.sh []" - echo "Eg: 10s, 6h" - echo "Usage: ./retrain.sh 10s" - return -fi - -# Set the partition number -if [ -n "$2" ]; then - export GZ_PARTITION="$2" -fi - -. install/setup.bash - -# Rerun every $1 time -while true; do - colcon build - timeout --signal=SIGKILL "$1" ros2 launch reinforcement_learning train.launch.py - - # Get the latest folder in rl_logs - latest_folder=$(ls -lt ./rl_logs | grep '^d' | head -n 1 | awk '{print $NF}') - - # Set the new paths for actor_path and critic_path - new_actor_path="rl_logs/$latest_folder/models/actor_checkpoint.pht" - new_critic_path="rl_logs/$latest_folder/models/critic_checkpoint.pht" - - # Check if the new paths exist - if [ ! -e "$new_actor_path" ] || [ ! 
-e "$new_critic_path" ]; then - echo "Error: $new_actor_path or $new_critic_path does not exist" - return - fi - - # Use sed to update the actor_path and critic_path in the YAML file - sed -i "s#actor_path: '.*'#actor_path: '$new_actor_path'#g" src/reinforcement_learning/config/train.yaml - sed -i "s#critic_path: '.*'#critic_path: '$new_critic_path'#g" src/reinforcement_learning/config/train.yaml -done From 542bb8551c6f4ffbcc82a12ecb48595db2f89ae9 Mon Sep 17 00:00:00 2001 From: retinfai Date: Tue, 24 Oct 2023 10:35:40 +1300 Subject: [PATCH 22/38] chore: fix imports controller package --- src/controllers/controllers/controller.py | 9 +++++---- src/controllers/controllers/ftg_controller.py | 4 ++-- src/controllers/controllers/ftg_policy.py | 2 +- src/controllers/controllers/random.py | 2 +- src/controllers/controllers/rl_controller.py | 4 ++-- src/controllers/controllers/rl_policy.py | 2 +- 6 files changed, 12 insertions(+), 11 deletions(-) diff --git a/src/controllers/controllers/controller.py b/src/controllers/controllers/controller.py index 8e24a621..ee072474 100644 --- a/src/controllers/controllers/controller.py +++ b/src/controllers/controllers/controller.py @@ -1,13 +1,14 @@ -import rclpy import math -from geometry_msgs.msg import Twist -from message_filters import Subscriber, ApproximateTimeSynchronizer +import rclpy from rclpy import Future from rclpy.node import Node + +from geometry_msgs.msg import Twist +from message_filters import Subscriber, ApproximateTimeSynchronizer from sensor_msgs.msg import LaserScan from nav_msgs.msg import Odometry -from environments.util import process_lidar, process_odom, reduce_lidar +from environments.util import process_odom, reduce_lidar class Controller(Node): def __init__(self, node_name, car_name, step_length): diff --git a/src/controllers/controllers/ftg_controller.py b/src/controllers/controllers/ftg_controller.py index ae7bbc8a..1b33eb09 100644 --- a/src/controllers/controllers/ftg_controller.py +++ b/src/controllers/controllers/ftg_controller.py @@ -1,9 +1,9 @@ -from .controller import Controller +from rclpy import Future from std_msgs.msg import Empty from environments.track_reset import track_info from environments.termination import reached_goal from environments.util import reduce_lidar, process_odom -from rclpy import Future +from .controller import Controller class FTGController(Controller): diff --git a/src/controllers/controllers/ftg_policy.py b/src/controllers/controllers/ftg_policy.py index f7fdac6b..0ca5106d 100644 --- a/src/controllers/controllers/ftg_policy.py +++ b/src/controllers/controllers/ftg_policy.py @@ -1,6 +1,6 @@ -from .controller import Controller import rclpy import numpy as np +from .controller import Controller def main(): rclpy.init() diff --git a/src/controllers/controllers/random.py b/src/controllers/controllers/random.py index 0c72d78b..718bbe41 100644 --- a/src/controllers/controllers/random.py +++ b/src/controllers/controllers/random.py @@ -1,7 +1,7 @@ -from .controller import Controller import rclpy import numpy as np import random +from .controller import Controller def main(): rclpy.init() diff --git a/src/controllers/controllers/rl_controller.py b/src/controllers/controllers/rl_controller.py index c558c044..68eca094 100644 --- a/src/controllers/controllers/rl_controller.py +++ b/src/controllers/controllers/rl_controller.py @@ -1,7 +1,7 @@ -from .controller import Controller +from rclpy import Future from std_msgs.msg import Empty +from .controller import Controller from environments.util import 
reduce_lidar, process_odom -from rclpy import Future class RLController(Controller): diff --git a/src/controllers/controllers/rl_policy.py b/src/controllers/controllers/rl_policy.py index 82a32cc4..f6488282 100644 --- a/src/controllers/controllers/rl_policy.py +++ b/src/controllers/controllers/rl_policy.py @@ -1,7 +1,7 @@ -from .rl_controller import RLController import rclpy from cares_reinforcement_learning.util.NetworkFactory import NetworkFactory from cares_reinforcement_learning.util.helpers import denormalize +from .rl_controller import RLController def main(): rclpy.init() From 4b67b18041a76f4ef6a09d00a8bc443ff88f4350 Mon Sep 17 00:00:00 2001 From: retinfai Date: Tue, 24 Oct 2023 10:37:59 +1300 Subject: [PATCH 23/38] chore: remove unused imports --- src/environments/environments/CarBeatReset.py | 2 +- src/environments/environments/CarBlockEnvironment.py | 1 - src/environments/environments/CarBlockReset.py | 2 +- src/environments/environments/CarGoalReset.py | 2 +- src/environments/environments/CarTrackReset.py | 2 +- src/environments/environments/CarWallEnvironment.py | 2 -- src/environments/environments/CarWallReset.py | 3 +-- src/environments/environments/F1tenthEnvironment.py | 2 -- 8 files changed, 5 insertions(+), 11 deletions(-) diff --git a/src/environments/environments/CarBeatReset.py b/src/environments/environments/CarBeatReset.py index b38ff404..77cf5d30 100644 --- a/src/environments/environments/CarBeatReset.py +++ b/src/environments/environments/CarBeatReset.py @@ -1,7 +1,7 @@ import sys import rclpy from rclpy.node import Node -from rclpy.callback_groups import MutuallyExclusiveCallbackGroup, ReentrantCallbackGroup +from rclpy.callback_groups import MutuallyExclusiveCallbackGroup from rclpy.executors import MultiThreadedExecutor from environment_interfaces.srv import CarBeatReset as CarBeatResetSrv diff --git a/src/environments/environments/CarBlockEnvironment.py b/src/environments/environments/CarBlockEnvironment.py index 100acbe8..b2c3e650 100644 --- a/src/environments/environments/CarBlockEnvironment.py +++ b/src/environments/environments/CarBlockEnvironment.py @@ -4,7 +4,6 @@ import numpy as np import rclpy from rclpy import Future -from sensor_msgs.msg import LaserScan from environments.F1tenthEnvironment import F1tenthEnvironment diff --git a/src/environments/environments/CarBlockReset.py b/src/environments/environments/CarBlockReset.py index f90a385d..825122c9 100644 --- a/src/environments/environments/CarBlockReset.py +++ b/src/environments/environments/CarBlockReset.py @@ -1,7 +1,7 @@ import sys import rclpy from rclpy.node import Node -from rclpy.callback_groups import MutuallyExclusiveCallbackGroup, ReentrantCallbackGroup +from rclpy.callback_groups import MutuallyExclusiveCallbackGroup from rclpy.executors import MultiThreadedExecutor import numpy as np import random diff --git a/src/environments/environments/CarGoalReset.py b/src/environments/environments/CarGoalReset.py index 85facc0d..07b3b426 100644 --- a/src/environments/environments/CarGoalReset.py +++ b/src/environments/environments/CarGoalReset.py @@ -1,7 +1,7 @@ import sys import rclpy from rclpy.node import Node -from rclpy.callback_groups import MutuallyExclusiveCallbackGroup, ReentrantCallbackGroup +from rclpy.callback_groups import MutuallyExclusiveCallbackGroup from rclpy.executors import MultiThreadedExecutor from environment_interfaces.srv import Reset diff --git a/src/environments/environments/CarTrackReset.py b/src/environments/environments/CarTrackReset.py index 0cd5f5dd..486d129a 100644 
--- a/src/environments/environments/CarTrackReset.py +++ b/src/environments/environments/CarTrackReset.py @@ -1,7 +1,7 @@ import sys import rclpy from rclpy.node import Node -from rclpy.callback_groups import MutuallyExclusiveCallbackGroup, ReentrantCallbackGroup +from rclpy.callback_groups import MutuallyExclusiveCallbackGroup from rclpy.executors import MultiThreadedExecutor from environment_interfaces.srv import Reset diff --git a/src/environments/environments/CarWallEnvironment.py b/src/environments/environments/CarWallEnvironment.py index 9d3dc028..cd285cee 100644 --- a/src/environments/environments/CarWallEnvironment.py +++ b/src/environments/environments/CarWallEnvironment.py @@ -1,10 +1,8 @@ import math import random -import numpy as np import rclpy from rclpy import Future -from sensor_msgs.msg import LaserScan from .util import process_odom, avg_reduce_lidar, generate_position from .termination import has_collided, has_flipped_over, reached_goal from environments.F1tenthEnvironment import F1tenthEnvironment diff --git a/src/environments/environments/CarWallReset.py b/src/environments/environments/CarWallReset.py index d10a7f02..48c47c90 100644 --- a/src/environments/environments/CarWallReset.py +++ b/src/environments/environments/CarWallReset.py @@ -1,7 +1,6 @@ -import sys import rclpy from rclpy.node import Node -from rclpy.callback_groups import MutuallyExclusiveCallbackGroup, ReentrantCallbackGroup +from rclpy.callback_groups import MutuallyExclusiveCallbackGroup from rclpy.executors import MultiThreadedExecutor from environment_interfaces.srv import Reset diff --git a/src/environments/environments/F1tenthEnvironment.py b/src/environments/environments/F1tenthEnvironment.py index 8b3af17d..fde64773 100644 --- a/src/environments/environments/F1tenthEnvironment.py +++ b/src/environments/environments/F1tenthEnvironment.py @@ -1,5 +1,3 @@ -import math - import numpy as np import rclpy from geometry_msgs.msg import Twist From 4d9940242ac1865f23a477a7b364958db951f93f Mon Sep 17 00:00:00 2001 From: retinfai Date: Tue, 24 Oct 2023 10:40:17 +1300 Subject: [PATCH 24/38] chore: replace train --- .../reinforcement_learning/train.py | 426 +++--------------- .../reinforcement_learning/updated_train.py | 81 ---- src/reinforcement_learning/setup.py | 2 +- 3 files changed, 54 insertions(+), 455 deletions(-) delete mode 100644 src/reinforcement_learning/reinforcement_learning/updated_train.py diff --git a/src/reinforcement_learning/reinforcement_learning/train.py b/src/reinforcement_learning/reinforcement_learning/train.py index a84b32ca..60a1b9c2 100644 --- a/src/reinforcement_learning/reinforcement_learning/train.py +++ b/src/reinforcement_learning/reinforcement_learning/train.py @@ -1,401 +1,81 @@ import random -import time +from datetime import datetime +import yaml import numpy as np import rclpy +from rclpy.parameter import Parameter import torch -from cares_reinforcement_learning.algorithm.policy import TD3, PPO + from cares_reinforcement_learning.memory import MemoryBuffer -from cares_reinforcement_learning.networks.TD3 import Actor, Critic -from cares_reinforcement_learning.util import Record, NetworkFactory -from cares_reinforcement_learning.util import helpers as hlp +from cares_reinforcement_learning.util.Record import Record +from cares_reinforcement_learning.util.NetworkFactory import NetworkFactory +import cares_reinforcement_learning.util.configurations as cfg -from environments.CarBlockEnvironment import CarBlockEnvironment -from environments.CarGoalEnvironment import 
CarGoalEnvironment -from environments.CarTrackEnvironment import CarTrackEnvironment -from environments.CarWallEnvironment import CarWallEnvironment -from environments.CarBeatEnvironment import CarBeatEnvironment +from .parse_args import parse_args +from .EnvironmentFactory import EnvironmentFactory +from .training_loops import off_policy_train, ppo_train def main(): rclpy.init() - params = get_params() - - global MAX_STEPS_TRAINING - global MAX_STEPS_EXPLORATION - global MAX_STEPS_PER_BATCH - global G - global BATCH_SIZE - global EVALUATE_EVERY_N_STEPS - global EVALUATE_FOR_M_EPISODES - global ALGORITHM - - ENVIRONMENT, \ - ALGORITHM, \ - TRACK, \ - MAX_STEPS_TRAINING, \ - MAX_STEPS_EXPLORATION, \ - GAMMA, \ - TAU, \ - G, \ - BATCH_SIZE, \ - BUFFER_SIZE, \ - SEED, \ - ACTOR_LR, \ - CRITIC_LR, \ - MAX_STEPS, \ - STEP_LENGTH, \ - REWARD_RANGE, \ - COLLISION_RANGE, \ - ACTOR_PATH, \ - CRITIC_PATH, \ - MAX_STEPS_PER_BATCH, \ - OBSERVATION_MODE, \ - EVALUATE_EVERY_N_STEPS, \ - EVALUATE_FOR_M_EPISODES = [param.value for param in params] + env_config, algorithm_config, network_config = parse_args() - if ACTOR_PATH != '' and CRITIC_PATH != '': - MAX_STEPS_EXPLORATION = 0 + # Set Seeds + torch.manual_seed(algorithm_config['seed']) + torch.cuda.manual_seed_all(algorithm_config['seed']) + np.random.seed(algorithm_config['seed']) + random.seed(algorithm_config['seed']) - torch.manual_seed(SEED) - torch.cuda.manual_seed_all(SEED) - np.random.seed(SEED) - random.seed(SEED) - print( - f'---------------------------------------------\n' - f'Environment: {ENVIRONMENT}\n' - f'Algorithm: {ALGORITHM}\n' - f'Exploration Steps: {MAX_STEPS_EXPLORATION}\n' - f'Training Steps: {MAX_STEPS_TRAINING}\n' - f'Gamma: {GAMMA}\n' - f'Tau: {TAU}\n' - f'G: {G}\n' - f'Batch Size: {BATCH_SIZE}\n' - f'Buffer Size: {BUFFER_SIZE}\n' - f'Seed: {SEED}\n' - f'Actor LR: {ACTOR_LR}\n' - f'Critic LR: {CRITIC_LR}\n' - f'Steps per Episode: {MAX_STEPS}\n' - f'Step Length: {STEP_LENGTH}\n' - f'Reward Range: {REWARD_RANGE}\n' - f'Collision Range: {COLLISION_RANGE}\n' - f'Critic Path: {CRITIC_PATH}\n' - f'Actor Path: {ACTOR_PATH}\n' - f'Max Steps per Batch: {MAX_STEPS_PER_BATCH}\n' - f'Observation Mode: {OBSERVATION_MODE}\n' - f'Evaluate Every N Steps: {EVALUATE_EVERY_N_STEPS}\n' - f'Evaluate For M Episodes: {EVALUATE_FOR_M_EPISODES}\n' - f'---------------------------------------------\n' + f'Environment Config: ------------------------------------- \n' + f'{yaml.dump(env_config, default_flow_style=False)} \n' + f'Algorithm Config: ------------------------------------- \n' + f'{yaml.dump(algorithm_config, default_flow_style=False)} \n' + f'Network Config: ------------------------------------- \n' + f'{yaml.dump(network_config, default_flow_style=False)} \n' ) - DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu') - time.sleep(3) - - match ENVIRONMENT: - case 'CarWall': - env = CarWallEnvironment('f1tenth', step_length=STEP_LENGTH, max_steps=MAX_STEPS, reward_range=REWARD_RANGE, collision_range=COLLISION_RANGE) - case 'CarBlock': - env = CarBlockEnvironment('f1tenth', step_length=STEP_LENGTH, max_steps=MAX_STEPS, reward_range=REWARD_RANGE, collision_range=COLLISION_RANGE) - case 'CarTrack': - env = CarTrackEnvironment('f1tenth', step_length=STEP_LENGTH, max_steps=MAX_STEPS, reward_range=REWARD_RANGE, collision_range=COLLISION_RANGE, track=TRACK, observation_mode=OBSERVATION_MODE) - case 'CarBeat': - env = CarBeatEnvironment('f1tenth_one', 'f1tenth_two', step_length=STEP_LENGTH, max_steps=MAX_STEPS, reward_range=REWARD_RANGE, 
collision_range=COLLISION_RANGE, track=TRACK, observation_mode=OBSERVATION_MODE) + env_factory = EnvironmentFactory() + network_factory = NetworkFactory() + + match network_config['algorithm']: + case 'PPO': + config = cfg.PPOConfig(**network_config) + case 'DDPG': + config = cfg.DDPGConfig(**network_config) + case 'SAC': + config = cfg.SACConfig(**network_config) + case 'TD3': + config = cfg.TD3Config(**network_config) case _: - env = CarGoalEnvironment('f1tenth', step_length=STEP_LENGTH, max_steps=MAX_STEPS, reward_range=REWARD_RANGE) - - network_factory_args = { - 'observation_size': env.OBSERVATION_SIZE, - 'action_num': env.ACTION_NUM, - 'actor_lr': ACTOR_LR, - 'critic_lr': CRITIC_LR, - 'gamma': GAMMA, - 'tau': TAU, - 'device': DEVICE - } - - agent_factory = NetworkFactory() - agent = agent_factory.create_network(ALGORITHM, network_factory_args) - - if ACTOR_PATH != '' and CRITIC_PATH != '': - print('Reading saved models into actor and critic') - agent.actor_net.load_state_dict(torch.load(ACTOR_PATH)) - agent.critic_net.load_state_dict(torch.load(CRITIC_PATH)) - print('Successfully Loaded models') - - networks = {'actor': agent.actor_net, 'critic': agent.critic_net} - config = { - 'environment': ENVIRONMENT, - 'algorithm': ALGORITHM, - 'max_steps_training': MAX_STEPS_TRAINING, - 'max_steps_exploration': MAX_STEPS_EXPLORATION, - 'gamma': GAMMA, - 'tau': TAU, - 'g': G, - 'batch_size': BATCH_SIZE, - 'buffer_size': BUFFER_SIZE, - 'seed': SEED, - 'actor_lr': ACTOR_LR, - 'critic_lr': CRITIC_LR, - 'max_steps': MAX_STEPS, - 'step_length': STEP_LENGTH, - 'reward_range': REWARD_RANGE, - 'collision_range': COLLISION_RANGE, - 'max_steps_per_batch': MAX_STEPS_PER_BATCH, - 'observation_mode': OBSERVATION_MODE, - 'evaluate_every_n_steps': EVALUATE_EVERY_N_STEPS, - 'evaluate_for_m_episodes': EVALUATE_FOR_M_EPISODES - } - - if (ENVIRONMENT == 'CarTrack'): - config['track'] = TRACK - - record = Record(networks=networks, checkpoint_freq=100, config=config) - - if ALGORITHM == 'PPO': - train_ppo(env, agent, record=record) - else: - train(env=env, agent=agent, record=record) - - -def train(env, agent, record: Record): - memory = MemoryBuffer() - - episode_timesteps = 0 - episode_reward = 0 - episode_num = 0 - evaluation_reward = None - - state, _ = env.reset() - - evaluate = False - - print(f'Initial State: {state}') - historical_reward = {"step": [], "episode_reward": []} - - for total_step_counter in range(int(MAX_STEPS_TRAINING)): - episode_timesteps += 1 - - if total_step_counter < MAX_STEPS_EXPLORATION: - print(f"Running Exploration Steps {total_step_counter}/{MAX_STEPS_EXPLORATION}") - action_env = np.asarray([random.uniform(env.MIN_ACTIONS[0], env.MAX_ACTIONS[0]), random.uniform(env.MIN_ACTIONS[1], env.MAX_ACTIONS[1])]) # action range the env uses [e.g. -2 , 2 for pendulum] - action = hlp.normalize(action_env, env.MAX_ACTIONS, env.MIN_ACTIONS) # algorithm range [-1, 1] - else: - action = agent.select_action_from_policy(state) # algorithm range [-1, 1] - action_env = hlp.denormalize(action, env.MAX_ACTIONS, env.MIN_ACTIONS) # mapping to env range [e.g. 
-2 , 2 for pendulum] - - next_state, reward, done, truncated, info = env.step(action_env) - memory.add(state=state, action=action, reward=reward, next_state=next_state, done=done) - - state = next_state - episode_reward += reward - - if total_step_counter >= MAX_STEPS_EXPLORATION: - for _ in range(G): - experiences = memory.sample(BATCH_SIZE) - experiences = (experiences['state'], experiences['action'], experiences['reward'], experiences['next_state'], experiences['done']) - agent.train_policy(experiences) - - - - if total_step_counter % EVALUATE_EVERY_N_STEPS == 0: - evaluate = True - - - record.log( - out=done or truncated, - Step=total_step_counter, - Episode=episode_num, - Step_Reward=reward, - Episode_Reward=episode_reward if (done or truncated) else None, - Evaluation_Reward=evaluation_reward - ) + raise Exception(f'Algorithm {network_config["algorithm"]} not implemented') - evaluation_reward = None - if done or truncated: - historical_reward["step"].append(total_step_counter) - historical_reward["episode_reward"].append(episode_reward) - - if evaluate: - evaluation_reward = evaluate_policy(env, agent, EVALUATE_FOR_M_EPISODES) - evaluate = False - - # Reset environment - state, _ = env.reset() - episode_reward = 0 - episode_timesteps = 0 - episode_num += 1 - -def train_ppo(env, agent, record): - max_steps_training = MAX_STEPS_TRAINING - max_steps_per_batch = MAX_STEPS_PER_BATCH + env = env_factory.create(env_config['environment'], env_config) + agent = network_factory.create_network(env.OBSERVATION_SIZE, env.ACTION_NUM, config=config) + memory = MemoryBuffer(algorithm_config['buffer_size'], env.OBSERVATION_SIZE, env.ACTION_NUM) - min_action_value = env.MIN_ACTIONS - max_action_value = env.MAX_ACTIONS - episode_timesteps = 0 - episode_num = 0 - episode_reward = 0 - time_step = 1 - evaluation_reward = None - evaluate = False - - memory = MemoryBuffer() - - state, _ = env.reset() - - for total_step_counter in range(int(max_steps_training)): - episode_timesteps += 1 - - action, log_prob = agent.select_action_from_policy(state) - action_env = hlp.denormalize(action, max_action_value, min_action_value) - - next_state, reward, done, truncated, _ = env.step(action_env) - memory.add(state=state, action=action, reward=reward, next_state=next_state, done=done, log_prob=log_prob) - - state = next_state - episode_reward += reward - - if time_step % max_steps_per_batch == 0: - for _ in range(G): - experience = memory.sample(BATCH_SIZE) - info = agent.train_policy(( - experience['state'], - experience['action'], - experience['reward'], - experience['next_state'], - experience['done'], - experience['log_prob'] - )) - - memory.clear() - - if total_step_counter % EVALUATE_EVERY_N_STEPS == 0: - evaluate = True - - record.log( - Steps = total_step_counter, - Episode= episode_num, - Step_reward= reward, - Episode_reward= episode_reward if (done or truncated) else None, - Evaluation_reward=evaluation_reward, - out=done or truncated, - ) - evaluation_reward = None - time_step += 1 - - if done or truncated: - print(f'Episode: {episode_num} | Reward: {episode_reward} | Steps: {time_step}') - - if evaluate: - evaluation_reward = evaluate_policy(env, agent, EVALUATE_FOR_M_EPISODES) - evaluate = False - - # Reset environment - state, _ = env.reset() - episode_reward = 0 - episode_timesteps = 0 - episode_num += 1 - -def evaluate_policy(env, agent, num_episodes): - - episode_reward_history = [] - - print('Beginning Evaluation----------------------------') - - for ep in range(num_episodes): - state, _ = 
env.reset() - - episode_timesteps = 0 - episode_reward = 0 + record = Record( + glob_log_dir='training_logs', + log_dir= f"{network_config['algorithm']}-{env_config['environment']}-{datetime.now().strftime('%y_%m_%d_%H:%M:%S')}", + algorithm=network_config['algorithm'], + task=env_config['environment'], + network=agent + ) - truncated = False - terminated = False + # TODO: Load Actor and Critic if passed. Only load if both are passed - while not truncated and not terminated: + match agent.type: + case 'policy': - if ALGORITHM == 'PPO': - action = agent.select_action_from_policy(state) + if network_config['algorithm'] == 'PPO': + ppo_train(env, agent, memory, record, algorithm_config) else: - action = agent.select_action_from_policy(state, evaluation=True) - - action = hlp.denormalize(action, env.MAX_ACTIONS, env.MIN_ACTIONS) - next_state, reward, terminated, truncated, _ = env.step(action) - - episode_reward += reward - state = next_state - - print(f'Evaluation Episode {ep + 1} Completed with a Reward of {episode_reward}') - episode_reward_history.append(episode_reward) - - avg_reward = sum(episode_reward_history) / len(episode_reward_history) - - print(f'Evaluation Completed: Avg Reward over {num_episodes} Episodes is {avg_reward} ----------------------------') - - return avg_reward - -def get_params(): - ''' - This function fetches the hyperparameters passed in through the launch files - - The hyperparameters below are defaults, to change them, you should change the train.yaml config - ''' - param_node = rclpy.create_node('params') - param_node.declare_parameters( - '', - [ - ('environment', 'CarGoal'), - ('algorithm', 'TD3'), - ('track', 'track_1'), - ('gamma', 0.95), - ('tau', 0.005), - ('g', 10), - ('batch_size', 32), - ('buffer_size', 1_000_000), - ('seed', 123), # TODO: This doesn't do anything yet - ('actor_lr', 1e-4), - ('critic_lr', 1e-3), - ('max_steps_training', 1_000_000), - ('max_steps_exploration', 1_000), - ('max_steps', 100), - ('step_length', 0.25), - ('reward_range', 0.2), - ('collision_range', 0.2), - ('actor_path', ''), - ('critic_path', ''), - ('max_steps_per_batch', 5000), - ('observation_mode', 'no_position'), - ('evaluate_every_n_steps', 10000), - ('evaluate_for_m_episodes', 5) - ] - ) + off_policy_train(env, agent, memory, record, algorithm_config) + case _: + raise Exception(f'Agent type {agent.type} not supported') - return param_node.get_parameters([ - 'environment', - 'algorithm', - 'track', - 'max_steps_training', - 'max_steps_exploration', - 'gamma', - 'tau', - 'g', - 'batch_size', - 'buffer_size', - 'seed', - 'actor_lr', - 'critic_lr', - 'max_steps', - 'step_length', - 'reward_range', - 'collision_range', - 'actor_path', - 'critic_path', - 'max_steps_per_batch', - 'observation_mode', - 'evaluate_every_n_steps', - 'evaluate_for_m_episodes' - ]) -if __name__ == '__main__': - main() diff --git a/src/reinforcement_learning/reinforcement_learning/updated_train.py b/src/reinforcement_learning/reinforcement_learning/updated_train.py deleted file mode 100644 index 60a1b9c2..00000000 --- a/src/reinforcement_learning/reinforcement_learning/updated_train.py +++ /dev/null @@ -1,81 +0,0 @@ -import random -from datetime import datetime -import yaml - -import numpy as np -import rclpy -from rclpy.parameter import Parameter -import torch - -from cares_reinforcement_learning.memory import MemoryBuffer -from cares_reinforcement_learning.util.Record import Record -from cares_reinforcement_learning.util.NetworkFactory import NetworkFactory -import 
cares_reinforcement_learning.util.configurations as cfg - -from .parse_args import parse_args -from .EnvironmentFactory import EnvironmentFactory -from .training_loops import off_policy_train, ppo_train - -def main(): - rclpy.init() - - env_config, algorithm_config, network_config = parse_args() - - # Set Seeds - torch.manual_seed(algorithm_config['seed']) - torch.cuda.manual_seed_all(algorithm_config['seed']) - np.random.seed(algorithm_config['seed']) - random.seed(algorithm_config['seed']) - - print( - f'Environment Config: ------------------------------------- \n' - f'{yaml.dump(env_config, default_flow_style=False)} \n' - f'Algorithm Config: ------------------------------------- \n' - f'{yaml.dump(algorithm_config, default_flow_style=False)} \n' - f'Network Config: ------------------------------------- \n' - f'{yaml.dump(network_config, default_flow_style=False)} \n' - ) - - env_factory = EnvironmentFactory() - network_factory = NetworkFactory() - - match network_config['algorithm']: - case 'PPO': - config = cfg.PPOConfig(**network_config) - case 'DDPG': - config = cfg.DDPGConfig(**network_config) - case 'SAC': - config = cfg.SACConfig(**network_config) - case 'TD3': - config = cfg.TD3Config(**network_config) - case _: - raise Exception(f'Algorithm {network_config["algorithm"]} not implemented') - - - env = env_factory.create(env_config['environment'], env_config) - agent = network_factory.create_network(env.OBSERVATION_SIZE, env.ACTION_NUM, config=config) - memory = MemoryBuffer(algorithm_config['buffer_size'], env.OBSERVATION_SIZE, env.ACTION_NUM) - - - record = Record( - glob_log_dir='training_logs', - log_dir= f"{network_config['algorithm']}-{env_config['environment']}-{datetime.now().strftime('%y_%m_%d_%H:%M:%S')}", - algorithm=network_config['algorithm'], - task=env_config['environment'], - network=agent - ) - - # TODO: Load Actor and Critic if passed. 
Only load if both are passed - - match agent.type: - case 'policy': - - if network_config['algorithm'] == 'PPO': - ppo_train(env, agent, memory, record, algorithm_config) - else: - off_policy_train(env, agent, memory, record, algorithm_config) - case _: - raise Exception(f'Agent type {agent.type} not supported') - - - diff --git a/src/reinforcement_learning/setup.py b/src/reinforcement_learning/setup.py index 93ed141e..ccb4e262 100644 --- a/src/reinforcement_learning/setup.py +++ b/src/reinforcement_learning/setup.py @@ -25,7 +25,7 @@ entry_points={ 'console_scripts': [ 'sanity_check = reinforcement_learning.sanity_check:main', - 'train = reinforcement_learning.updated_train:main', + 'train = reinforcement_learning.train:main', 'test = reinforcement_learning.test:main', 'greatest_gap = reinforcement_learning.greatest_gap:main', ], From 8071953568e7a7d46e247623e7e2d5fd17a98823 Mon Sep 17 00:00:00 2001 From: retinfai Date: Tue, 24 Oct 2023 11:26:07 +1300 Subject: [PATCH 25/38] refactor: update test loop to Record --- .../environments/CarBeatEnvironment.py | 2 +- .../reinforcement_learning/test.py | 207 ++++-------------- .../reinforcement_learning/train.py | 12 +- .../reinforcement_learning/training_loops.py | 36 +-- 4 files changed, 79 insertions(+), 178 deletions(-) diff --git a/src/environments/environments/CarBeatEnvironment.py b/src/environments/environments/CarBeatEnvironment.py index 53c64026..8a744fd3 100644 --- a/src/environments/environments/CarBeatEnvironment.py +++ b/src/environments/environments/CarBeatEnvironment.py @@ -29,7 +29,7 @@ def __init__(self, track='multi_track', observation_mode='lidar_only', max_goals=500, - num_lidar_points=50 + num_lidar_points=10 ): super().__init__('car_beat_environment') diff --git a/src/reinforcement_learning/reinforcement_learning/test.py b/src/reinforcement_learning/reinforcement_learning/test.py index 9adc7acf..03377fef 100644 --- a/src/reinforcement_learning/reinforcement_learning/test.py +++ b/src/reinforcement_learning/reinforcement_learning/test.py @@ -1,175 +1,64 @@ -import time +import random +from datetime import datetime +import yaml import rclpy import torch -from cares_reinforcement_learning.algorithm.policy import TD3 -from cares_reinforcement_learning.networks.TD3 import Actor, Critic -from cares_reinforcement_learning.util import Record -from cares_reinforcement_learning.util import helpers as hlp, NetworkFactory -from environments.CarBlockEnvironment import CarBlockEnvironment -from environments.CarGoalEnvironment import CarGoalEnvironment -from environments.CarTrackEnvironment import CarTrackEnvironment -from environments.CarWallEnvironment import CarWallEnvironment -from environments.CarBeatEnvironment import CarBeatEnvironment +from cares_reinforcement_learning.util.NetworkFactory import NetworkFactory +import cares_reinforcement_learning.util.configurations as cfg + +from .parse_args import parse_args +from .EnvironmentFactory import EnvironmentFactory +from .training_loops import off_policy_evaluate, ppo_evaluate def main(): rclpy.init() - params = get_params() - - global MAX_STEPS_EVALUATION - global MAX_STEPS - - ENVIRONMENT, \ - TRACK, \ - MAX_STEPS_EVALUATION, \ - MAX_STEPS, \ - STEP_LENGTH, \ - REWARD_RANGE, \ - COLLISION_RANGE, \ - ACTOR_PATH, \ - CRITIC_PATH, \ - OBSERVAION_MODE, \ - ALGORITHM = [param.value for param in params] - - if ACTOR_PATH == '' or CRITIC_PATH == '': - raise Exception('Actor or Critic path not provided') - - DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu') - - 
time.sleep(3) - - match ENVIRONMENT: - case 'CarWall': - env = CarWallEnvironment('f1tenth', step_length=STEP_LENGTH, max_steps=MAX_STEPS, reward_range=REWARD_RANGE, collision_range=COLLISION_RANGE) - case 'CarBlock': - env = CarBlockEnvironment('f1tenth', step_length=STEP_LENGTH, max_steps=MAX_STEPS, reward_range=REWARD_RANGE, collision_range=COLLISION_RANGE) - case 'CarTrack': - env = CarTrackEnvironment('f1tenth', step_length=STEP_LENGTH, max_steps=MAX_STEPS, reward_range=REWARD_RANGE, collision_range=COLLISION_RANGE, track=TRACK, observation_mode=OBSERVAION_MODE) - case 'CarBeat': - env = CarBeatEnvironment('f1tenth_one', 'f1tenth_two', step_length=STEP_LENGTH, max_steps=MAX_STEPS, reward_range=REWARD_RANGE, collision_range=COLLISION_RANGE, track=TRACK, observation_mode=OBSERVAION_MODE) - case _: - env = CarGoalEnvironment('f1tenth', step_length=STEP_LENGTH, max_steps=MAX_STEPS, reward_range=REWARD_RANGE) + env_config, algorithm_config, network_config = parse_args() print( - f'---------------------------------------------\n' - f'Environment: {ENVIRONMENT}\n' - f'Evaluation Steps: {MAX_STEPS_EVALUATION}\n' - f'Steps per Episode: {MAX_STEPS}\n' - f'Step Length: {STEP_LENGTH}\n' - f'Reward Range: {REWARD_RANGE}\n' - f'Collision Range: {COLLISION_RANGE}\n' - f'Critic Path: {CRITIC_PATH}\n' - f'Actor Path: {ACTOR_PATH}\n' - f'Observation Mode: {OBSERVAION_MODE}\n' - f'Observation size: {env.OBSERVATION_SIZE}' - f'Algorithm: {ALGORITHM}\n' - f'---------------------------------------------\n' - ) - - network_factory_args = { - 'observation_size': env.OBSERVATION_SIZE, - 'action_num': env.ACTION_NUM, - 'actor_lr': 0.11, - 'critic_lr': 0.11, - 'gamma': 0.23, - 'tau': 0.001, - 'device': DEVICE - } - - agent_factory = NetworkFactory() - agent = agent_factory.create_network(ALGORITHM, network_factory_args) - - print('Reading saved models into actor and critic') - agent.actor_net.load_state_dict(torch.load(ACTOR_PATH)) - agent.critic_net.load_state_dict(torch.load(CRITIC_PATH)) - - print('Successfully Loaded models') - - - - record = Record(checkpoint_freq=100, log_dir="multi_track_with_speed12_evaluation") - - test(env=env, agent=agent, record=record) - - -def test(env, agent, record: Record): - state, _ = env.reset() - episode_timesteps = 0 - episode_reward = 0 - episode_num = 0 - - print('Beginning Evaluation') - - for total_step_counter in range(int(MAX_STEPS_EVALUATION)): - episode_timesteps += 1 - - action = agent.select_action_from_policy(state, evaluation=True) - action_env = hlp.denormalize(action, env.MAX_ACTIONS, env.MIN_ACTIONS) - - next_state, reward, done, truncated, info = env.step(action_env) - - state = next_state - episode_reward += reward - - if done or truncated: - print(f"Total T:{total_step_counter + 1} Episode {episode_num + 1} was completed with {episode_timesteps} steps taken and a Reward= {episode_reward:.3f}") - - # Reset environment - state, _ = env.reset() - episode_reward = 0 - episode_timesteps = 0 - episode_num += 1 - - record.log( - out=done or truncated, - Step=total_step_counter, - Episode=episode_num, - Step_Reward=reward, - Episode_Reward=episode_reward if (done or truncated) else None, - ) - - -def get_params(): - """ - This function fetches the hyperparameters passed in through the launch files - - The hyperparameters below are defaults, to change them, you should change the train.yaml config - """ - - param_node = rclpy.create_node('params') - param_node.declare_parameters( - '', - [ - ('environment', 'CarTrack'), - ('track', 'track_1'), - 
('max_steps_evaluation', 1_000_000), - ('max_steps', 100), - ('step_length', 0.25), - ('reward_range', 0.2), - ('collision_range', 0.2), - ('actor_path', ''), - ('critic_path', ''), - ('observation_mode', 'full'), - ('algorithm', 'TD3') - ] + f'Environment Config: ------------------------------------- \n' + f'{yaml.dump(env_config, default_flow_style=False)} \n' + f'Algorithm Config: ------------------------------------- \n' + f'{yaml.dump(algorithm_config, default_flow_style=False)} \n' + f'Network Config: ------------------------------------- \n' + f'{yaml.dump(network_config, default_flow_style=False)} \n' ) - return param_node.get_parameters([ - 'environment', - 'track', - 'max_steps_evaluation', - 'max_steps', - 'step_length', - 'reward_range', - 'collision_range', - 'actor_path', - 'critic_path', - 'observation_mode', - 'algorithm' - ]) - + env_factory = EnvironmentFactory() + network_factory = NetworkFactory() + + match network_config['algorithm']: + case 'PPO': + config = cfg.PPOConfig(**network_config) + case 'DDPG': + config = cfg.DDPGConfig(**network_config) + case 'SAC': + config = cfg.SACConfig(**network_config) + case 'TD3': + config = cfg.TD3Config(**network_config) + case _: + raise Exception(f'Algorithm {network_config["algorithm"]} not implemented') + + env = env_factory.create(env_config['environment'], env_config) + agent = network_factory.create_network(env.OBSERVATION_SIZE, env.ACTION_NUM, config=config) + + # Load models if both paths are provided + if network_config['actor_path'] and network_config['critic_path']: + print('Reading saved models into actor and critic') + agent.actor_net.load_state_dict(torch.load(network_config['actor_path'])) + agent.critic_net.load_state_dict(torch.load(network_config['critic_path'])) + print('Successfully Loaded models') + else: + raise Exception('Both actor and critic paths must be provided') + + match network_config['algorithm']: + case 'PPO': + ppo_evaluate(env, agent, algorithm_config) + case _: + off_policy_evaluate(env, agent, algorithm_config) if __name__ == '__main__': main() diff --git a/src/reinforcement_learning/reinforcement_learning/train.py b/src/reinforcement_learning/reinforcement_learning/train.py index 60a1b9c2..ee1896a1 100644 --- a/src/reinforcement_learning/reinforcement_learning/train.py +++ b/src/reinforcement_learning/reinforcement_learning/train.py @@ -22,7 +22,7 @@ def main(): env_config, algorithm_config, network_config = parse_args() # Set Seeds - torch.manual_seed(algorithm_config['seed']) + torch.manual_seed(algorithm_config['seed']) torch.cuda.manual_seed_all(algorithm_config['seed']) np.random.seed(algorithm_config['seed']) random.seed(algorithm_config['seed']) @@ -62,9 +62,15 @@ def main(): log_dir= f"{network_config['algorithm']}-{env_config['environment']}-{datetime.now().strftime('%y_%m_%d_%H:%M:%S')}", algorithm=network_config['algorithm'], task=env_config['environment'], - network=agent + network=agent, + checkpoint_frequency=5000 ) + # TODO: update the save config once Record is updated. + # record.save_config(env_config, 'env_config') + # record.save_config(algorithm_config, 'algorithm_config') + # record.save_config(network_config, 'network_config') + # TODO: Load Actor and Critic if passed. 
Only load if both are passed match agent.type: @@ -76,6 +82,8 @@ def main(): off_policy_train(env, agent, memory, record, algorithm_config) case _: raise Exception(f'Agent type {agent.type} not supported') + + record.save() diff --git a/src/reinforcement_learning/reinforcement_learning/training_loops.py b/src/reinforcement_learning/reinforcement_learning/training_loops.py index 35e5059d..1197e2ca 100644 --- a/src/reinforcement_learning/reinforcement_learning/training_loops.py +++ b/src/reinforcement_learning/reinforcement_learning/training_loops.py @@ -77,7 +77,7 @@ def off_policy_train(env, agent, memory, record, algorithm_config): episode_timesteps = 0 episode_num += 1 -def off_policy_evaluate(env, agent, eval_episodes, record, steps_counter): +def off_policy_evaluate(env, agent, eval_episodes, record=None, steps_counter=0): episode_reward = 0 episode_timesteps = 0 @@ -99,13 +99,15 @@ def off_policy_evaluate(env, agent, eval_episodes, record, steps_counter): episode_reward += reward if done or truncated: - record.log_eval( - total_steps = steps_counter + 1, - episode = episode_num + 1, - episode_steps=episode_timesteps, - episode_reward = episode_reward, - display = True - ) + + if record: + record.log_eval( + total_steps = steps_counter + 1, + episode = episode_num + 1, + episode_steps=episode_timesteps, + episode_reward = episode_reward, + display = True + ) # Reset environment state, _ = env.reset() @@ -180,7 +182,7 @@ def ppo_train(env, agent, memory, record, algorithm_config): episode_timesteps = 0 episode_num += 1 -def ppo_evaluate(env, agent, eval_episodes, record, steps_counter): +def ppo_evaluate(env, agent, eval_episodes, record=None, steps_counter=0): episode_reward = 0 episode_timesteps = 0 @@ -202,13 +204,15 @@ def ppo_evaluate(env, agent, eval_episodes, record, steps_counter): episode_reward += reward if done or truncated: - record.log_eval( - total_steps = steps_counter + 1, - episode = episode_num + 1, - episode_steps=episode_timesteps, - episode_reward = episode_reward, - display = True - ) + + if record: + record.log_eval( + total_steps = steps_counter + 1, + episode = episode_num + 1, + episode_steps=episode_timesteps, + episode_reward = episode_reward, + display = True + ) # Reset environment state, _ = env.reset() From 9e2b3986208b7566dd3641f8c9a97a6ea294fbc9 Mon Sep 17 00:00:00 2001 From: Afereti Pama <79831813+retinfai@users.noreply.github.com> Date: Tue, 24 Oct 2023 11:34:30 +1300 Subject: [PATCH 26/38] Update README.md --- README.md | 3 --- 1 file changed, 3 deletions(-) diff --git a/README.md b/README.md index 2bc821f1..8f9faf06 100644 --- a/README.md +++ b/README.md @@ -63,9 +63,6 @@ train: reward_range: 1.0 collision_range: 0.2 observation_mode: 'no_position' - # actor_path & critic_path must exist, it can't be commented - actor_path: 'rl_logs/23_08_02_17:59:13/models/actor_checkpoint.pht' - critic_path: 'rl_logs/23_08_02_17:59:13/models/critic_checkpoint.pht' # gamma: 0.95 # tau: 0.005 # g: 5 From e3d5c23d31273b636bc9d2d05a6fb0f02291f926 Mon Sep 17 00:00:00 2001 From: retinfai Date: Wed, 25 Oct 2023 09:41:47 +1300 Subject: [PATCH 27/38] feat: integrate pydantic models into training --- .../environments/CarBeatEnvironment.py | 2 +- .../environments/configurations.py | 46 +++++++++++++++++++ .../reinforcement_learning/parse_args.py | 46 ++++++++++++++++--- .../reinforcement_learning/train.py | 28 +++-------- .../reinforcement_learning/training_loops.py | 16 +++---- 5 files changed, 101 insertions(+), 37 deletions(-) create mode 100644 
src/environments/environments/configurations.py diff --git a/src/environments/environments/CarBeatEnvironment.py b/src/environments/environments/CarBeatEnvironment.py index 8a744fd3..64011d34 100644 --- a/src/environments/environments/CarBeatEnvironment.py +++ b/src/environments/environments/CarBeatEnvironment.py @@ -201,7 +201,7 @@ def step(self, action): next_state, full_next_state = self.get_observation() reward = self.compute_reward(full_state, full_next_state) terminated = self.is_terminated(full_next_state) - truncated = self.steps_since_last_goal >= 10 + truncated = self.steps_since_last_goal >= self.MAX_STEPS_PER_GOAL info = {} return next_state, reward, terminated, truncated, info diff --git a/src/environments/environments/configurations.py b/src/environments/environments/configurations.py new file mode 100644 index 00000000..4f02faca --- /dev/null +++ b/src/environments/environments/configurations.py @@ -0,0 +1,46 @@ +from pydantic import BaseModel +from typing import Optional + +class SubscriptableClass(BaseModel): + def __getitem__(self, item): + return getattr(self, item) + +class TrainingConfig(SubscriptableClass): + g: int + batch_size: int + buffer_size: int + seed: int + max_steps_training: int + max_steps_exploration: int + max_steps_per_batch: Optional[int] + number_steps_per_evaluation: int + number_eval_episodes: int + +class EnvironmentConfig(SubscriptableClass): + environment: str + car_name: str + reward_range: float + max_steps: int + step_length: float + collision_range: Optional[float] # Doesn't apply to CarGoal + +class CarGoalEnvironmentConfig(EnvironmentConfig): + pass + +class CarBlockEnvironmentConfig(EnvironmentConfig): + pass + +class CarWallEnvironmentConfig(EnvironmentConfig): + pass + +class CarTrackEnvironmentConfig(EnvironmentConfig): + track: str + observation_mode: str + max_goals: int + +class CarBeatEnvironmentConfig(EnvironmentConfig): + ftg_car_name: str + track: str + observation_mode: str + max_goals: int + num_lidar_points: int \ No newline at end of file diff --git a/src/reinforcement_learning/reinforcement_learning/parse_args.py b/src/reinforcement_learning/reinforcement_learning/parse_args.py index 4e88be1e..165db0e1 100644 --- a/src/reinforcement_learning/reinforcement_learning/parse_args.py +++ b/src/reinforcement_learning/reinforcement_learning/parse_args.py @@ -1,6 +1,8 @@ import rclpy from rclpy.node import Node from rclpy.parameter import Parameter +from environments import configurations as cfg +from cares_reinforcement_learning.util import configurations as cares_cfg def parse_args(): param_node = __declare_params() @@ -11,6 +13,7 @@ def parse_args(): return env_params, algorithm_params, network_params + def __declare_params(): param_node = rclpy.create_node('params') param_node.declare_parameters( @@ -37,8 +40,8 @@ def __declare_params(): ('max_steps_training', 1_000_000), ('max_steps_exploration', 1_000), ('max_steps_per_batch', 5000), - ('evaluate_every_n_steps', 10000), - ('evaluate_for_m_episodes', 5), + ('number_steps_per_evaluation', 10000), + ('number_eval_episodes', 5), # Network Parameters ------------------------------- ('actor_path', ''), @@ -75,7 +78,21 @@ def __get_env_params(param_node: Node): for param in params: params_dict[param.name] = param.value - return params_dict + match params_dict['environment']: + case 'CarGoal': + config = cfg.CarGoalEnvironmentConfig(**params_dict) + case 'CarBlock': + config = cfg.CarBlockEnvironmentConfig(**params_dict) + case 'CarWall': + config = 
cfg.CarWallEnvironmentConfig(**params_dict) + case 'CarTrack': + config = cfg.CarTrackEnvironmentConfig(**params_dict) + case 'CarBeat': + config = cfg.CarBeatEnvironmentConfig(**params_dict) + case _: + raise Exception(f'Environment {params_dict["environment"]} not implemented') + + return config def __get_algorithm_params(param_node: Node): params = param_node.get_parameters([ @@ -86,8 +103,8 @@ def __get_algorithm_params(param_node: Node): 'max_steps_training', 'max_steps_exploration', 'max_steps_per_batch', - 'evaluate_every_n_steps', - 'evaluate_for_m_episodes' + 'number_steps_per_evaluation', + 'number_eval_episodes' ]) # Convert to Dictionary @@ -95,7 +112,9 @@ def __get_algorithm_params(param_node: Node): for param in params: params_dict[param.name] = param.value - return params_dict + config = cfg.TrainingConfig(**params_dict) + + return config def __get_network_params(param_node: Node): params = param_node.get_parameters([ @@ -113,4 +132,17 @@ def __get_network_params(param_node: Node): for param in params: params_dict[param.name] = param.value - return params_dict + match params_dict['algorithm']: + case 'PPO': + config = cares_cfg.PPOConfig(**params_dict) + case 'DDPG': + config = cares_cfg.DDPGConfig(**params_dict) + case 'SAC': + config = cares_cfg.SACConfig(**params_dict) + case 'TD3': + config = cares_cfg.TD3Config(**params_dict) + case _: + raise Exception(f'Algorithm {params_dict["algorithm"]} not implemented') + + return config + diff --git a/src/reinforcement_learning/reinforcement_learning/train.py b/src/reinforcement_learning/reinforcement_learning/train.py index ee1896a1..150febc5 100644 --- a/src/reinforcement_learning/reinforcement_learning/train.py +++ b/src/reinforcement_learning/reinforcement_learning/train.py @@ -29,31 +29,18 @@ def main(): print( f'Environment Config: ------------------------------------- \n' - f'{yaml.dump(env_config, default_flow_style=False)} \n' + f'{yaml.dump(dict(env_config), default_flow_style=False)} \n' f'Algorithm Config: ------------------------------------- \n' - f'{yaml.dump(algorithm_config, default_flow_style=False)} \n' + f'{yaml.dump(dict(algorithm_config), default_flow_style=False)} \n' f'Network Config: ------------------------------------- \n' - f'{yaml.dump(network_config, default_flow_style=False)} \n' + f'{yaml.dump(dict(network_config), default_flow_style=False)} \n' ) env_factory = EnvironmentFactory() network_factory = NetworkFactory() - match network_config['algorithm']: - case 'PPO': - config = cfg.PPOConfig(**network_config) - case 'DDPG': - config = cfg.DDPGConfig(**network_config) - case 'SAC': - config = cfg.SACConfig(**network_config) - case 'TD3': - config = cfg.TD3Config(**network_config) - case _: - raise Exception(f'Algorithm {network_config["algorithm"]} not implemented') - - env = env_factory.create(env_config['environment'], env_config) - agent = network_factory.create_network(env.OBSERVATION_SIZE, env.ACTION_NUM, config=config) + agent = network_factory.create_network(env.OBSERVATION_SIZE, env.ACTION_NUM, config=network_config) memory = MemoryBuffer(algorithm_config['buffer_size'], env.OBSERVATION_SIZE, env.ACTION_NUM) @@ -66,10 +53,9 @@ def main(): checkpoint_frequency=5000 ) - # TODO: update the save config once Record is updated. 
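A minimal usage sketch of the new pydantic configuration classes (not part of the patches; the field values are illustrative and the import path is assumed from the src/environments/environments/configurations.py layout above). Because SubscriptableClass implements __getitem__, the validated models can still be indexed dict-style, which is why the training loops can keep using algorithm_config['...'] access:

    from environments.configurations import TrainingConfig, CarTrackEnvironmentConfig

    # Illustrative values only; real runs take these from train.yaml via parse_args().
    training_config = TrainingConfig(
        g=1,
        batch_size=32,
        buffer_size=50_000,
        seed=123,
        max_steps_training=1_000_000,
        max_steps_exploration=1_000,
        max_steps_per_batch=5_000,
        number_steps_per_evaluation=10_000,
        number_eval_episodes=5,
    )

    env_config = CarTrackEnvironmentConfig(
        environment='CarTrack',
        car_name='f1tenth',
        reward_range=2.0,
        max_steps=100,
        step_length=0.25,
        collision_range=0.2,
        track='multi_track',
        observation_mode='lidar_only',
        max_goals=500,
    )

    # SubscriptableClass defines __getitem__, so dict-style access still works
    # alongside normal attribute access and pydantic validation.
    assert training_config['batch_size'] == training_config.batch_size
    assert env_config['track'] == 'multi_track'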
- # record.save_config(env_config, 'env_config') - # record.save_config(algorithm_config, 'algorithm_config') - # record.save_config(network_config, 'network_config') + record.save_config(env_config, 'env_config') + record.save_config(algorithm_config, 'algorithm_config') + record.save_config(network_config, 'network_config') # TODO: Load Actor and Critic if passed. Only load if both are passed diff --git a/src/reinforcement_learning/reinforcement_learning/training_loops.py b/src/reinforcement_learning/reinforcement_learning/training_loops.py index 1197e2ca..8a281e76 100644 --- a/src/reinforcement_learning/reinforcement_learning/training_loops.py +++ b/src/reinforcement_learning/reinforcement_learning/training_loops.py @@ -6,8 +6,8 @@ def off_policy_train(env, agent, memory, record, algorithm_config): max_steps_training = algorithm_config['max_steps_training'] max_steps_exploration = algorithm_config['max_steps_exploration'] - num_eps_evaluation = algorithm_config['evaluate_for_m_episodes'] - evaluate_every_n_steps = algorithm_config['evaluate_every_n_steps'] + number_eval_episodes = algorithm_config['number_eval_episodes'] + number_steps_per_evaluation = algorithm_config['number_steps_per_evaluation'] batch_size = algorithm_config['batch_size'] G = algorithm_config['g'] @@ -49,7 +49,7 @@ def off_policy_train(env, agent, memory, record, algorithm_config): )) memory.update_priorities(experience['indices'], info) - if (step_counter+1) % evaluate_every_n_steps == 0: + if (step_counter+1) % number_steps_per_evaluation == 0: evaluate = True if done or truncated: @@ -67,7 +67,7 @@ def off_policy_train(env, agent, memory, record, algorithm_config): evaluate = False env.get_logger().info(f'*************--Begin Evaluation Loop--*************') - off_policy_evaluate(env, agent, num_eps_evaluation, record, step_counter) + off_policy_evaluate(env, agent, number_eval_episodes, record, step_counter) env.get_logger().info(f'*************--End Evaluation Loop--*************') @@ -122,8 +122,8 @@ def ppo_train(env, agent, memory, record, algorithm_config): max_steps_training = algorithm_config['max_steps_training'] max_steps_per_batch = algorithm_config['max_steps_per_batch'] - num_eps_evaluation = algorithm_config['evaluate_for_m_episodes'] - evaluate_every_n_steps = algorithm_config['evaluate_every_n_steps'] + number_eval_episodes = algorithm_config['number_eval_episodes'] + number_steps_per_evaluation = algorithm_config['number_steps_per_evaluation'] episode_timesteps = 0 episode_num = 0 @@ -156,7 +156,7 @@ def ppo_train(env, agent, memory, record, algorithm_config): experience['log_prob'] )) - if (step_counter+1) % evaluate_every_n_steps == 0: + if (step_counter+1) % number_steps_per_evaluation == 0: evaluate = True if done or truncated: @@ -172,7 +172,7 @@ def ppo_train(env, agent, memory, record, algorithm_config): evaluate = False env.get_logger().info(f'*************--Begin Evaluation Loop--*************') - ppo_evaluate(env, agent, num_eps_evaluation, record, step_counter) + ppo_evaluate(env, agent, number_eval_episodes, record, step_counter) env.get_logger().info(f'*************--End Evaluation Loop--*************') From 776c992c0036b44593c9f8f508aa9d10470bf096 Mon Sep 17 00:00:00 2001 From: retinfai Date: Wed, 25 Oct 2023 10:04:15 +1300 Subject: [PATCH 28/38] feat: add parse obs to CarBeat --- .../environments/CarBeatEnvironment.py | 26 +++++++++++++++++++ src/reinforcement_learning/config/train.yaml | 7 +++-- .../reinforcement_learning/training_loops.py | 5 ++++ 3 files changed, 36 
insertions(+), 2 deletions(-) diff --git a/src/environments/environments/CarBeatEnvironment.py b/src/environments/environments/CarBeatEnvironment.py index 64011d34..b552bfe8 100644 --- a/src/environments/environments/CarBeatEnvironment.py +++ b/src/environments/environments/CarBeatEnvironment.py @@ -365,4 +365,30 @@ def update_goal_service(self, x, y): rclpy.spin_until_future_complete(self, future) return future.result() + + # function that parses the state and returns a string that can be printed to the terminal + def parse_observation(self, observation): + string = f'CarBeat Observation: \n' + + if self.OBSERVATION_MODE == 'full': + string += f'Car Position: {observation[0:2]} \n' + string += f'Car Orientation: {observation[2:6]} \n' + string += f'Car Velocity: {observation[6]} \n' + string += f'Car Angular Velocity: {observation[7]} \n' + string += f'Car Lidar: {observation[8:]} \n' + elif self.OBSERVATION_MODE == 'no_position': + string += f'Car Orientation: {observation[:4]} \n' + string += f'Car Velocity: {observation[4]} \n' + string += f'Car Angular Velocity: {observation[5]} \n' + string += f'Car Lidar: {observation[6:]} \n' + elif self.OBSERVATION_MODE == 'lidar_only': + string += f'Car Velocity: {observation[0]} \n' + string += f'Car Angular Velocity: {observation[1]} \n' + string += f'Car Lidar: {observation[2:]} \n' + else: + raise ValueError(f'Invalid observation mode: {self.OBSERVATION_MODE}') + + return string + + \ No newline at end of file diff --git a/src/reinforcement_learning/config/train.yaml b/src/reinforcement_learning/config/train.yaml index bfefbafc..67d3cc76 100644 --- a/src/reinforcement_learning/config/train.yaml +++ b/src/reinforcement_learning/config/train.yaml @@ -2,5 +2,8 @@ train: ros__parameters: environment: 'CarBeat' # CarGoal, CarWall, CarBlock, CarTrack, CarBeat track: 'multi_track' # track_1, track_2, track_3, multi_track, multi_track_testing -> only applies for CarTrack - car_name: 'New_Car' - ftg_car_name: 'New_Car_two' + car_name: 'f1tenth' + algorithm: 'TD3' + g: 1 + buffer_size: 50000 + observation_mode: 'lidar_only' diff --git a/src/reinforcement_learning/reinforcement_learning/training_loops.py b/src/reinforcement_learning/reinforcement_learning/training_loops.py index 8a281e76..e7981767 100644 --- a/src/reinforcement_learning/reinforcement_learning/training_loops.py +++ b/src/reinforcement_learning/reinforcement_learning/training_loops.py @@ -19,6 +19,11 @@ def off_policy_train(env, agent, memory, record, algorithm_config): evaluate = False state, _ = env.reset() + + obs = env.parse_observation(state) + env.get_logger().info('-----------------------------------') + env.get_logger().info('\n' + obs) + env.get_logger().info('-----------------------------------') for step_counter in range(max_steps_training): episode_timesteps += 1 From bf67d370fe7192a833e9837a43b2c139cca45b86 Mon Sep 17 00:00:00 2001 From: retinfai Date: Wed, 25 Oct 2023 10:07:24 +1300 Subject: [PATCH 29/38] feat: add parse obs to CarBlock --- src/environments/environments/CarBlockEnvironment.py | 11 +++++++++++ .../reinforcement_learning/parse_args.py | 2 +- 2 files changed, 12 insertions(+), 1 deletion(-) diff --git a/src/environments/environments/CarBlockEnvironment.py b/src/environments/environments/CarBlockEnvironment.py index b2c3e650..df12b79a 100644 --- a/src/environments/environments/CarBlockEnvironment.py +++ b/src/environments/environments/CarBlockEnvironment.py @@ -114,5 +114,16 @@ def compute_reward(self, state, next_state): reward -= 25 # TODO: find optimal value 
for this return reward + + def parse_observation(self, observation): + string = f'CarBlock Observation\n' + string += f'Position: {observation[:2]}\n' + string += f'Orientation: {observation[2:6]}\n' + string += f'Car Velocity: {observation[6]}\n' + string += f'Car Angular Velocity: {observation[7]}\n' + string += f'Lidar: {observation[8:-2]}\n' + string += f'Goal Position: {observation[-2:]}\n' + + return string diff --git a/src/reinforcement_learning/reinforcement_learning/parse_args.py b/src/reinforcement_learning/reinforcement_learning/parse_args.py index 165db0e1..e16ca118 100644 --- a/src/reinforcement_learning/reinforcement_learning/parse_args.py +++ b/src/reinforcement_learning/reinforcement_learning/parse_args.py @@ -28,7 +28,7 @@ def __declare_params(): ('step_length', 0.25), ('reward_range', 0.2), ('collision_range', 0.2), - ('observation_mode', 'no_position'), + ('observation_mode', 'lidar_only'), ('max_goals', 500), ('num_lidar_points', 10), From f4922d890046f204e5bc52dc6512cb8f23a73b35 Mon Sep 17 00:00:00 2001 From: retinfai Date: Wed, 25 Oct 2023 10:10:52 +1300 Subject: [PATCH 30/38] feat: add parse obs to CarGoal --- src/environments/environments/CarGoalEnvironment.py | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/src/environments/environments/CarGoalEnvironment.py b/src/environments/environments/CarGoalEnvironment.py index 015d255b..9a6e78ff 100644 --- a/src/environments/environments/CarGoalEnvironment.py +++ b/src/environments/environments/CarGoalEnvironment.py @@ -101,5 +101,16 @@ def compute_reward(self, state, next_state): return reward + def parse_observation(self, observation): + + string = f'CarGoal Observation\n' + string += f'Position: {observation[:2]}\n' + string += f'Orientation: {observation[2:6]}\n' + string += f'Car Velocity: {observation[6]}\n' + string += f'Car Angular Velocity: {observation[7]}\n' + string += f'Goal Position: {observation[-2:]}\n' + + return string + \ No newline at end of file From 81540907c88239f880d4ab0743c479fcbc18278e Mon Sep 17 00:00:00 2001 From: retinfai Date: Wed, 25 Oct 2023 10:17:01 +1300 Subject: [PATCH 31/38] feat: add parse obs to CarTrack --- .../environments/CarTrackEnvironment.py | 23 +++++++++++++++++++ 1 file changed, 23 insertions(+) diff --git a/src/environments/environments/CarTrackEnvironment.py b/src/environments/environments/CarTrackEnvironment.py index 340ec1ac..42e1a1dc 100644 --- a/src/environments/environments/CarTrackEnvironment.py +++ b/src/environments/environments/CarTrackEnvironment.py @@ -208,3 +208,26 @@ def sleep(self): rclpy.spin_once(self) self.timer_future = Future() + + def parse_observation(self, observation): + + string = f'CarTrack Observation\n' + + match (self.observation_mode): + case 'no_position': + string += f'Orientation: {observation[:4]}\n' + string += f'Car Velocity: {observation[4]}\n' + string += f'Car Angular Velocity: {observation[5]}\n' + string += f'Lidar: {observation[6:]}\n' + case 'lidar_only': + string += f'Car Velocity: {observation[0]}\n' + string += f'Car Angular Velocity: {observation[1]}\n' + string += f'Lidar: {observation[2:]}\n' + case _: + string += f'Position: {observation[:2]}\n' + string += f'Orientation: {observation[2:6]}\n' + string += f'Car Velocity: {observation[6]}\n' + string += f'Car Angular Velocity: {observation[7]}\n' + string += f'Lidar: {observation[8:]}\n' + + return string From 712a2489eeeb25d3945b47e4a1ca5ae6b1ed114f Mon Sep 17 00:00:00 2001 From: retinfai Date: Wed, 25 Oct 2023 10:19:56 +1300 Subject: [PATCH 32/38] feat: add 
CarWall to parse obs --- src/environments/environments/CarWallEnvironment.py | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/src/environments/environments/CarWallEnvironment.py b/src/environments/environments/CarWallEnvironment.py index cd285cee..0fc0b8cc 100644 --- a/src/environments/environments/CarWallEnvironment.py +++ b/src/environments/environments/CarWallEnvironment.py @@ -111,4 +111,14 @@ def compute_reward(self, state, next_state): return reward - + def parse_observation(self, observation): + + string = f'CarWall Observation\n' + string += f'Position: {observation[:2]}\n' + string += f'Orientation: {observation[2:6]}\n' + string += f'Car Velocity: {observation[6]}\n' + string += f'Car Angular Velocity: {observation[7]}\n' + string += f'Lidar Points: {observation[8:-2]}\n' + string += f'Goal Position: {observation[-2:]}\n' + + return string From 575e27b3f1a10cad72327f539de2c1ed9d603ffc Mon Sep 17 00:00:00 2001 From: retinfai Date: Wed, 25 Oct 2023 10:23:47 +1300 Subject: [PATCH 33/38] feat: add obs log to ppo train --- .../reinforcement_learning/training_loops.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/src/reinforcement_learning/reinforcement_learning/training_loops.py b/src/reinforcement_learning/reinforcement_learning/training_loops.py index e7981767..9f91bc93 100644 --- a/src/reinforcement_learning/reinforcement_learning/training_loops.py +++ b/src/reinforcement_learning/reinforcement_learning/training_loops.py @@ -138,6 +138,11 @@ def ppo_train(env, agent, memory, record, algorithm_config): state, _ = env.reset() + obs = env.parse_observation(state) + env.get_logger().info('-----------------------------------') + env.get_logger().info('\n' + obs) + env.get_logger().info('-----------------------------------') + for step_counter in range(max_steps_training): episode_timesteps += 1 From 3ab45e6363f6676bd03794cb09c0a2807326bf29 Mon Sep 17 00:00:00 2001 From: retinfai Date: Wed, 25 Oct 2023 10:37:01 +1300 Subject: [PATCH 34/38] docs: add comments to CarBlock and CarBeat --- .../environments/CarBeatEnvironment.py | 26 +++++++++++++++++++ .../environments/CarBlockEnvironment.py | 23 +++++++++------- 2 files changed, 39 insertions(+), 10 deletions(-) diff --git a/src/environments/environments/CarBeatEnvironment.py b/src/environments/environments/CarBeatEnvironment.py index b552bfe8..4b2a468a 100644 --- a/src/environments/environments/CarBeatEnvironment.py +++ b/src/environments/environments/CarBeatEnvironment.py @@ -19,6 +19,32 @@ class CarBeatEnvironment(Node): + """ + CarBeat Reinforcement Learning Environment: + + Task: + Agent learns to drive a track and overtake a car that is driving at a constant speed. + The second car is using the Follow The Gap algorithm. 
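The initial-observation logging block added here to ppo_train duplicates the one added to off_policy_train in an earlier patch. A small shared helper would keep the two loops in sync; a sketch, where the helper name log_initial_observation is hypothetical and not part of the patches:

    # Hypothetical helper: factor out the duplicated logging of the first parsed
    # observation after env.reset() in off_policy_train and ppo_train.
    def log_initial_observation(env, state) -> None:
        divider = '-----------------------------------'
        env.get_logger().info(divider)
        env.get_logger().info('\n' + env.parse_observation(state))
        env.get_logger().info(divider)

Both loops would then call log_initial_observation(env, state) right after state, _ = env.reset().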
+ + Observation: + Car Velocity + Car Angular Velocity + Car Lidar + + Action: + It's linear and angular velocity (Twist) + + Reward: + +2 if it comes within REWARD_RANGE units of a goal + +200 if it overtakes the Follow The Gap car + -25 if it collides with a wall + + Termination Conditions: + When the agent collides with a wall or the Follow The Gap car + + Truncation Condition: + When the number of steps surpasses MAX_GOALS + """ def __init__(self, rl_car_name, ftg_car_name, diff --git a/src/environments/environments/CarBlockEnvironment.py b/src/environments/environments/CarBlockEnvironment.py index df12b79a..db890308 100644 --- a/src/environments/environments/CarBlockEnvironment.py +++ b/src/environments/environments/CarBlockEnvironment.py @@ -14,28 +14,31 @@ class CarBlockEnvironment(F1tenthEnvironment): """ - CarWall Reinforcement Learning Environment: + CarBlock Reinforcement Learning Environment: Task: - Here the agent learns to drive the f1tenth car to a goal position + Agent learns to navigate to a goal position while avoiding obstacles that are dynamically placed at the start of each episode Observation: - It's position (x, y), orientation (w, x, y, z), lidar points (approx. ~600 rays) and the goal's position (x, y) + Car Position (x, y) + Car Orientation (x, y, z, w) + Car Velocity + Car Angular Velocity + Lidar Data Action: - It's linear and angular velocity + Its linear and angular velocity (Twist) Reward: - It's progress toward the goal plus, - 100+ if it reaches the goal plus, - -50 if it collides with the wall + Its progress towards the goal * 10 + +100 if it reaches the goal + -25 if it collides with an obstacle or flips over Termination Conditions: - When the agent is within REWARD_RANGE units or, - When the agent is within COLLISION_RANGE units + When the agent collides with a wall or reaches the goal Truncation Condition: - When the number of steps surpasses MAX_STEPS + When the number of steps surpasses MAX_steps """ def __init__(self, car_name, reward_range=0.2, max_steps=50, collision_range=0.2, step_length=0.5): From e2dbcec758fb8cc714c969827297871ca585cb6a Mon Sep 17 00:00:00 2001 From: retinfai Date: Wed, 25 Oct 2023 10:37:26 +1300 Subject: [PATCH 35/38] docs: update CarBlock comments --- src/environments/environments/CarBlockEnvironment.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/environments/environments/CarBlockEnvironment.py b/src/environments/environments/CarBlockEnvironment.py index db890308..f09b6081 100644 --- a/src/environments/environments/CarBlockEnvironment.py +++ b/src/environments/environments/CarBlockEnvironment.py @@ -38,7 +38,7 @@ class CarBlockEnvironment(F1tenthEnvironment): When the agent collides with a wall or reaches the goal Truncation Condition: - When the number of steps surpasses MAX_steps + When the number of steps surpasses MAX_STEPS """ def __init__(self, car_name, reward_range=0.2, max_steps=50, collision_range=0.2, step_length=0.5): From c42de198ad49f9a41b3fa9a12b2493b61a9a379e Mon Sep 17 00:00:00 2001 From: retinfai Date: Wed, 25 Oct 2023 10:38:55 +1300 Subject: [PATCH 36/38] docs: add env description to CarGoal --- .../environments/CarGoalEnvironment.py | 20 ++++++++++--------- 1 file changed, 11 insertions(+), 9 deletions(-) diff --git a/src/environments/environments/CarGoalEnvironment.py b/src/environments/environments/CarGoalEnvironment.py index 9a6e78ff..2967afb4 100644 --- a/src/environments/environments/CarGoalEnvironment.py +++ b/src/environments/environments/CarGoalEnvironment.py @@ -8,26 
+8,28 @@ class CarGoalEnvironment(F1tenthEnvironment): """ - CarWall Reinforcement Learning Environment: + CarGoal Reinforcement Learning Environment: Task: Here the agent learns to drive the f1tenth car to a goal position. - This happens all within a 10x10 box + This happens in an open area. Observation: - It's position (x, y), orientation (w, x, y, z), lidar points (approx. ~600 rays) and the goal's position (x, y) + Car Position (x, y) + Car Orientation (x, y, z, w) + Car Velocity + Car Angular Velocity + Goal Position (x, y) Action: - It's linear and angular velocity + Its linear and angular velocity Reward: - It's progress toward the goal plus, - 100+ if it reaches the goal plus, - -50 if it collides with the wall + Its progress toward the goal plus, + +100 if it reaches the goal plus Termination Conditions: - When the agent is within REWARD_RANGE units of the goal or, - When the agent is within COLLISION_RANGE units of a wall + When the agent is within REWARD_RANGE units of the goal Truncation Condition: When the number of steps surpasses MAX_STEPS From b4e53dd6e6213c2102f4b2199a116a0b4d0a3375 Mon Sep 17 00:00:00 2001 From: retinfai Date: Wed, 25 Oct 2023 10:41:56 +1300 Subject: [PATCH 37/38] docs: add env description to CarTrack --- .../environments/CarBeatEnvironment.py | 20 ++++++++-- .../environments/CarTrackEnvironment.py | 37 +++++++++++++++++++ 2 files changed, 54 insertions(+), 3 deletions(-) diff --git a/src/environments/environments/CarBeatEnvironment.py b/src/environments/environments/CarBeatEnvironment.py index 4b2a468a..66e270b2 100644 --- a/src/environments/environments/CarBeatEnvironment.py +++ b/src/environments/environments/CarBeatEnvironment.py @@ -27,9 +27,23 @@ class CarBeatEnvironment(Node): The second car is using the Follow The Gap algorithm. Observation: - Car Velocity - Car Angular Velocity - Car Lidar + full: + Car Position (x, y) + Car Orientation (x, y, z, w) + Car Velocity + Car Angular Velocity + Lidar Data + no_position: + Car Orientation (x, y, z, w) + Car Velocity + Car Angular Velocity + Lidar Data + lidar_only: + Car Velocity + Car Angular Velocity + Lidar Data + + No. 
of lidar points is configurable + Action: It's linear and angular velocity (Twist) diff --git a/src/environments/environments/CarTrackEnvironment.py b/src/environments/environments/CarTrackEnvironment.py index 42e1a1dc..af539bb9 100644 --- a/src/environments/environments/CarTrackEnvironment.py +++ b/src/environments/environments/CarTrackEnvironment.py @@ -11,6 +11,43 @@ class CarTrackEnvironment(F1tenthEnvironment): + """ + CarTrack Reinforcement Learning Environment: + + Task: + Agent learns to drive a track + + Observation: + full: + Car Position (x, y) + Car Orientation (x, y, z, w) + Car Velocity + Car Angular Velocity + Lidar Data + no_position: + Car Orientation (x, y, z, w) + Car Velocity + Car Angular Velocity + Lidar Data + lidar_only: + Car Velocity + Car Angular Velocity + Lidar Data + + Action: + It's linear and angular velocity (Twist) + + Reward: + +2 if it comes within REWARD_RANGE units of a goal + -25 if it collides with a wall + + Termination Conditions: + When the agent collides with a wall + + Truncation Condition: + When the number of steps surpasses MAX_GOALS + """ + def __init__(self, car_name, reward_range=0.5, From f0c3879709a8ff1ab25a6f7250e25b4221918aac Mon Sep 17 00:00:00 2001 From: retinfai Date: Wed, 25 Oct 2023 10:42:45 +1300 Subject: [PATCH 38/38] docs: update CarWall environment --- src/environments/environments/CarWallEnvironment.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/src/environments/environments/CarWallEnvironment.py b/src/environments/environments/CarWallEnvironment.py index 0fc0b8cc..32fe1020 100644 --- a/src/environments/environments/CarWallEnvironment.py +++ b/src/environments/environments/CarWallEnvironment.py @@ -18,7 +18,12 @@ class CarWallEnvironment(F1tenthEnvironment): This happens all within a 10x10 box Observation: - It's position (x, y), orientation (w, x, y, z), lidar points (approx. ~600 rays) and the goal's position (x, y) + Car Position (x, y) + Car Orientation (x, y, z, w) + Car Velocity + Car Angular Velocity + Lidar Data + Goal Position (x, y) Action: It's linear and angular velocity
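The full / no_position / lidar_only modes documented for CarBeat and CarTrack determine the length of the state vector handed to the agent: the position contributes 2 values, the orientation quaternion 4, and the linear plus angular velocities 2, followed by the lidar points (CarWall, CarGoal and CarBlock additionally append a goal position, which is not covered here). A sketch of the implied sizes, assuming the 10-point lidar default used for CarBeat; the helper name is illustrative only:

    # Illustrative helper: observation-vector length implied by the documented
    # CarBeat/CarTrack observation modes. Assumes 10 lidar points, the CarBeat default.
    def observation_size(observation_mode: str, num_lidar_points: int = 10) -> int:
        base = {
            'full': 2 + 4 + 2,       # position + orientation quaternion + velocities
            'no_position': 4 + 2,    # orientation quaternion + velocities
            'lidar_only': 2,         # linear and angular velocity only
        }
        if observation_mode not in base:
            raise ValueError(f'Invalid observation mode: {observation_mode}')
        return base[observation_mode] + num_lidar_points

    # e.g. observation_size('lidar_only') == 12: velocity, angular velocity and
    # ten lidar ranges, matching the lidar_only parse_observation breakdown.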