From b7d982c0445ced3839d46f2195cc3dad8abe41e0 Mon Sep 17 00:00:00 2001 From: retinfai Date: Fri, 6 Oct 2023 09:45:49 +1300 Subject: [PATCH 01/38] feat: configify lidar --- .../environments/CarBeatEnvironment.py | 17 ++++++++++------- src/environments/environments/util.py | 10 ++++++++++ 2 files changed, 20 insertions(+), 7 deletions(-) diff --git a/src/environments/environments/CarBeatEnvironment.py b/src/environments/environments/CarBeatEnvironment.py index 8f2f6ab1..26fbf177 100644 --- a/src/environments/environments/CarBeatEnvironment.py +++ b/src/environments/environments/CarBeatEnvironment.py @@ -13,7 +13,7 @@ import numpy as np from environment_interfaces.srv import CarBeatReset from .termination import has_collided, has_flipped_over -from .util import process_odom, reduce_lidar, get_all_goals_and_waypoints_in_multi_tracks +from .util import process_odom, reduce_lidar_n, get_all_goals_and_waypoints_in_multi_tracks from .goal_positions import goal_positions from .waypoints import waypoints @@ -28,7 +28,9 @@ def __init__(self, step_length=0.5, track='track_1', observation_mode='lidar_only', - max_goals=500): + max_goals=500, + num_lidar_points=50 + ): super().__init__('car_beat_environment') # Environment Details ---------------------------------------- @@ -40,6 +42,7 @@ def __init__(self, self.MIN_ACTIONS = np.asarray([0, -3.14]) self.MAX_STEPS_PER_GOAL = max_steps self.OBSERVATION_MODE = observation_mode + self.LIDAR_NUM = num_lidar_points self.num_spawns = 0 self.MAX_GOALS = max_goals @@ -49,7 +52,7 @@ def __init__(self, case 'no_position': self.OBSERVATION_SIZE = 6 + 10 case 'lidar_only': - self.OBSERVATION_SIZE = 2 + 10 + self.OBSERVATION_SIZE = 2 + num_lidar_points case _: raise ValueError(f'Invalid observation mode: {observation_mode}') @@ -309,8 +312,8 @@ def get_observation(self): odom_one = process_odom(odom_one) odom_two = process_odom(odom_two) - lidar_one = reduce_lidar(lidar_one) - lidar_two = reduce_lidar(lidar_two) + lidar_one = reduce_lidar_n(lidar_one, self.LIDAR_NUM) + lidar_two = reduce_lidar_n(lidar_two, self.LIDAR_NUM) match self.OBSERVATION_MODE: case 'full': @@ -349,7 +352,7 @@ def compute_reward(self, state, next_state): self.steps_since_last_goal = 0 - ftg_current_distance = math.dist(self.all_goals[(self.ftg_start_goal_index + self.ftg_goals_reached) % len(self.all_goals)], next_state[18:20]) + ftg_current_distance = math.dist(self.all_goals[(self.ftg_start_goal_index + self.ftg_goals_reached) % len(self.all_goals)], next_state[8 + self.LIDAR_NUM:8 + self.LIDAR_NUM + 2]) # Keeping track of FTG car goal number if ftg_current_distance < self.REWARD_RANGE: @@ -362,7 +365,7 @@ def compute_reward(self, state, next_state): self.ftg_goals_reached += 500 - if has_collided(next_state[8:19], self.COLLISION_RANGE) or has_flipped_over(next_state[2:6]): + if has_collided(next_state[8:8 + self.LIDAR_NUM], self.COLLISION_RANGE) or has_flipped_over(next_state[2:6]): reward -= 25 # TODO: find optimal value for this return reward diff --git a/src/environments/environments/util.py b/src/environments/environments/util.py index 63ed00ea..5bda6704 100644 --- a/src/environments/environments/util.py +++ b/src/environments/environments/util.py @@ -81,6 +81,16 @@ def reduce_lidar(lidar: LaserScan): return reduced_range +def reduce_lidar_n(lidar: LaserScan, num_points: int): + ranges = lidar.ranges + ranges = np.nan_to_num(ranges, posinf=float(10), neginf=float(0)) + ranges = list(ranges) + + index = np.round(np.linspace(0, len(ranges) - 1, num_points)).astype(int) + reduced_range = 
np.array(ranges)[index] + + return list(reduced_range) + def get_all_goals_and_waypoints_in_multi_tracks(track_name): all_car_goals = {} From b47ac3ca718d15f70f71792d922fcde0eca779ef Mon Sep 17 00:00:00 2001 From: retinfai Date: Sat, 21 Oct 2023 09:20:50 +1300 Subject: [PATCH 02/38] chore: update train yaml --- src/reinforcement_learning/config/train.yaml | 8 +- .../reinforcement_learning/greatest_gap.py | 226 ------------------ 2 files changed, 4 insertions(+), 230 deletions(-) delete mode 100644 src/reinforcement_learning/reinforcement_learning/greatest_gap.py diff --git a/src/reinforcement_learning/config/train.yaml b/src/reinforcement_learning/config/train.yaml index 9687b2d8..a7c70ddf 100644 --- a/src/reinforcement_learning/config/train.yaml +++ b/src/reinforcement_learning/config/train.yaml @@ -1,15 +1,15 @@ train: ros__parameters: - environment: 'CarTrack' # CarGoal, CarWall, CarBlock, CarTrack, CarBeat + environment: 'CarBeat' # CarGoal, CarWall, CarBlock, CarTrack, CarBeat track: 'multi_track' # track_1, track_2, track_3, multi_track, multi_track_testing -> only applies for CarTrack - algorithm: 'TD3' + algorithm: 'SAC' max_steps_exploration: 1000 max_steps_training: 1000000 reward_range: 2.0 collision_range: 0.2 observation_mode: 'lidar_only' - evaluate_every_n_steps: 2000 - evaluate_for_m_episodes: 3 + evaluate_every_n_steps: 10000 + evaluate_for_m_episodes: 5 # actor_path & critic_path must exist for retrain.sh, it can't be commented # actor_path: 'rl_logs/23_08_02_17:59:13/models/actor_checkpoint.pht' # critic_path: 'rl_logs/23_08_02_17:59:13/models/critic_checkpoint.pht' diff --git a/src/reinforcement_learning/reinforcement_learning/greatest_gap.py b/src/reinforcement_learning/reinforcement_learning/greatest_gap.py deleted file mode 100644 index 2c393864..00000000 --- a/src/reinforcement_learning/reinforcement_learning/greatest_gap.py +++ /dev/null @@ -1,226 +0,0 @@ -import rclpy -import time -import torch -import random -import numpy as np - -from cares_reinforcement_learning.algorithm.policy import TD3 -from cares_reinforcement_learning.util import helpers as hlp -from cares_reinforcement_learning.memory import MemoryBuffer -from cares_reinforcement_learning.util import Record - -from environments.CarGoalEnvironment import CarGoalEnvironment -from environments.CarWallEnvironment import CarWallEnvironment -from environments.CarBlockEnvironment import CarBlockEnvironment -from environments.CarTrackEnvironment import CarTrackEnvironment -from environments.follow_the_gap import FollowTheGapNode - -def main(): - rclpy.init() - - params = get_params() - - global MAX_STEPS_TRAINING - global MAX_STEPS_EXPLORATION - global G - global BATCH_SIZE - - ENVIRONMENT,\ - MAX_STEPS_TRAINING,\ - MAX_STEPS_EXPLORATION,\ - GAMMA,\ - TAU,\ - G,\ - BATCH_SIZE,\ - BUFFER_SIZE,\ - SEED,\ - ACTOR_LR,\ - CRITIC_LR,\ - MAX_STEPS,\ - STEP_LENGTH,\ - REWARD_RANGE,\ - COLLISION_RANGE = [param.value for param in params] - - print( - f'---------------------------------------------\n' - f'Environment: {ENVIRONMENT}\n' - f'Exploration Steps: {MAX_STEPS_EXPLORATION}\n' - f'Training Steps: {MAX_STEPS_TRAINING}\n' - f'Gamma: {GAMMA}\n' - f'Tau: {TAU}\n' - f'G: {G}\n' - f'Batch Size: {BATCH_SIZE}\n' - f'Buffer Size: {BUFFER_SIZE}\n' - f'Seed: {SEED}\n' - f'Actor LR: {ACTOR_LR}\n' - f'Critic LR: {CRITIC_LR}\n' - f'Steps per Episode: {MAX_STEPS}\n' - f'Step Length: {STEP_LENGTH}\n' - f'Reward Range: {REWARD_RANGE}\n' - f'Collision Range: {COLLISION_RANGE}\n' - f'---------------------------------------------\n' - 
) - - DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu') - time.sleep(3) - - match(ENVIRONMENT): - case 'CarWall': - env = CarWallEnvironment('f1tenth', step_length=STEP_LENGTH, max_steps=MAX_STEPS, reward_range=REWARD_RANGE, collision_range=COLLISION_RANGE) - case 'CarBlock': - env = CarBlockEnvironment('f1tenth', step_length=STEP_LENGTH, max_steps=MAX_STEPS, reward_range=REWARD_RANGE, collision_range=COLLISION_RANGE) - case 'CarTrack': - env = CarTrackEnvironment('f1tenth', step_length=STEP_LENGTH, max_steps=MAX_STEPS, reward_range=REWARD_RANGE, collision_range=COLLISION_RANGE) - case _: - env = CarGoalEnvironment('f1tenth', step_length=STEP_LENGTH, max_steps=MAX_STEPS, reward_range=REWARD_RANGE) - - config = { - 'environment': ENVIRONMENT, - 'max_steps_training':MAX_STEPS_TRAINING, - 'max_steps_exploration': MAX_STEPS_EXPLORATION, - 'gamma': GAMMA, - 'tau': TAU, - 'g': G, - 'batch_size': BATCH_SIZE, - 'buffer_size': BUFFER_SIZE, - 'seed': SEED, - 'actor_lr': ACTOR_LR, - 'critic_lr': CRITIC_LR, - 'max_steps': MAX_STEPS, - 'step_length': STEP_LENGTH, - 'reward_range': REWARD_RANGE, - 'collision_range': COLLISION_RANGE - } - record = Record(checkpoint_freq=100, config=config) - - greatestGap(env) - -def greatestGap(env): - state, _ = env.reset() - - episode_timesteps = 0 - episode_reward = 0 - episode_num = 0 - calcMethod = FollowTheGapNode() - - for total_step_counter in range(int(MAX_STEPS_TRAINING)): - episode_timesteps += 1 - # print(f"LIDAR ranges: {state[8:18]}") - action_env = calcMethod.select_action(state, env.goal_position) # Select action - #print(f"Current action:{action_env[0]}, {action_env[1]}") - action = hlp.normalize(action_env, env.MAX_ACTIONS, env.MIN_ACTIONS) - next_state, reward, done, truncated, info = env.step(action_env) - state = next_state - - if done or truncated: - # Reset environment - state, _ = env.reset() - episode_reward = 0 - episode_timesteps = 0 - episode_num += 1 - - - -def train(env, agent: TD3, record: Record): - - memory = MemoryBuffer() - - episode_timesteps = 0 - episode_reward = 0 - episode_num = 0 - - state, _ = env.reset() - - historical_reward = {"step": [], "episode_reward": []} - - - for total_step_counter in range(int(MAX_STEPS_TRAINING)): - episode_timesteps += 1 - - if total_step_counter < MAX_STEPS_EXPLORATION: - print(f"Running Exploration Steps {total_step_counter}/{MAX_STEPS_EXPLORATION}") - action_env = np.asarray([random.uniform(env.MIN_ACTIONS[0], env.MAX_ACTIONS[0]), random.uniform(env.MIN_ACTIONS[1], env.MAX_ACTIONS[1])]) # action range the env uses [e.g. -2 , 2 for pendulum] - action = hlp.normalize(action_env, env.MAX_ACTIONS, env.MIN_ACTIONS) # algorithm range [-1, 1] - else: - action = agent.select_action_from_policy(state) # algorithm range [-1, 1] - action_env = hlp.denormalize(action, env.MAX_ACTIONS, env.MIN_ACTIONS) # mapping to env range [e.g. 
-2 , 2 for pendulum] - - next_state, reward, done, truncated, info = env.step(action_env) - memory.add(state=state, action=action, reward=reward, next_state=next_state, done=done) - - state = next_state - episode_reward += reward - - if total_step_counter >= MAX_STEPS_EXPLORATION: - for _ in range(G): - experiences = memory.sample(BATCH_SIZE) - experiences = (experiences['state'], experiences['action'], experiences['reward'], experiences['next_state'], experiences['done']) - agent.train_policy(experiences) - - record.log( - out=done or truncated, - Step=total_step_counter, - Episode=episode_num, - Step_Reward=reward, - Episode_Reward=episode_reward if (done or truncated) else None, - ) - - if done or truncated: - - historical_reward["step"].append(total_step_counter) - historical_reward["episode_reward"].append(episode_reward) - - # Reset environment - state, _ = env.reset() - episode_reward = 0 - episode_timesteps = 0 - episode_num += 1 - - -def get_params(): - ''' - This function fetches the hyperparameters passed in through the launch files - - The hyperparameters below are defaults, to change them, you should change the train.yaml config - ''' - param_node = rclpy.create_node('params') - param_node.declare_parameters( - '', - [ - ('environment', 'CarGoal'), - ('gamma', 0.95), - ('tau', 0.005), - ('g', 10), - ('batch_size', 32), - ('buffer_size', 1_000_000), - ('seed', 123), #TODO: This doesn't do anything yet - ('actor_lr', 1e-4), - ('critic_lr', 1e-3), - ('max_steps_training', 1_000_000), - ('max_steps_exploration', 1_000), - ('max_steps', 10000), - ('step_length', 0.25), - ('reward_range', 0.2), - ('collision_range', 0.2) - ] - ) - - return param_node.get_parameters([ - 'environment', - 'max_steps_training', - 'max_steps_exploration', - 'gamma', - 'tau', - 'g', - 'batch_size', - 'buffer_size', - 'seed', - 'actor_lr', - 'critic_lr', - 'max_steps', - 'step_length', - 'reward_range', - 'collision_range' - ]) - -if __name__ == '__main__': - main() \ No newline at end of file From d46ffd8db231d4d56b03401ac94c3ab4113008a1 Mon Sep 17 00:00:00 2001 From: retinfai Date: Sat, 21 Oct 2023 10:31:54 +1300 Subject: [PATCH 03/38] feat: add progress to carbeat reward --- src/environments/environments/CarBeatEnvironment.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/src/environments/environments/CarBeatEnvironment.py b/src/environments/environments/CarBeatEnvironment.py index 26fbf177..3044f048 100644 --- a/src/environments/environments/CarBeatEnvironment.py +++ b/src/environments/environments/CarBeatEnvironment.py @@ -336,7 +336,10 @@ def compute_reward(self, state, next_state): goal_position = self.goal_position current_distance = math.dist(goal_position, next_state[:2]) - + prev_distance = math.dist(goal_position, state[:2]) + + reward += prev_distance - current_distance + self.steps_since_last_goal += 1 if current_distance < self.REWARD_RANGE: @@ -351,7 +354,7 @@ def compute_reward(self, state, next_state): self.update_goal_service(new_x, new_y) self.steps_since_last_goal = 0 - + ftg_current_distance = math.dist(self.all_goals[(self.ftg_start_goal_index + self.ftg_goals_reached) % len(self.all_goals)], next_state[8 + self.LIDAR_NUM:8 + self.LIDAR_NUM + 2]) # Keeping track of FTG car goal number From 8c6d56e961bc82ad99bbb3f3127828c86b4d6a42 Mon Sep 17 00:00:00 2001 From: retinfai Date: Sat, 21 Oct 2023 11:00:40 +1300 Subject: [PATCH 04/38] feat: clean carbeat --- .../environments/CarBeatEnvironment.py | 123 ++++++++---------- 1 file changed, 57 insertions(+), 66 
deletions(-) diff --git a/src/environments/environments/CarBeatEnvironment.py b/src/environments/environments/CarBeatEnvironment.py index 3044f048..5acfa4ce 100644 --- a/src/environments/environments/CarBeatEnvironment.py +++ b/src/environments/environments/CarBeatEnvironment.py @@ -238,71 +238,7 @@ def is_terminated(self, state): return has_collided(state[8:19], self.COLLISION_RANGE) \ or has_flipped_over(state[2:6]) \ or self.goals_reached >= self.MAX_GOALS - - ''' - init ai car goal count 0 - init opponent goal count diffrenct between ai goal abs to oppent goal abs - - for each goal passed - ai car goal count ++ - oppoent goal count ++ - - if ai car goal count > oppent goal count: - passed - ''' - - def call_reset_service(self, - car_x, - car_y, - car_Y, - goal_x, - goal_y, - ftg_x, - ftg_y, - ftg_Y - ): - """ - Reset the car and goal position - """ - - request = CarBeatReset.Request() - - request.gx = goal_x - request.gy = goal_y - - request.cx_one = car_x - request.cy_one = car_y - request.cyaw_one = car_Y - - request.cx_two = ftg_x - request.cy_two = ftg_y - request.cyaw_two = ftg_Y - - request.flag = "car_and_goal" - - # Publish to reset Topic to reset other nodes - empty_msg = Empty() - self.reset_pub.publish(empty_msg) - - future = self.reset_client.call_async(request) - rclpy.spin_until_future_complete(self, future) - - return future.result() - - def update_goal_service(self, x, y): - """ - Reset the goal position - """ - - request = CarBeatReset.Request() - request.gx = x - request.gy = y - request.flag = "goal_only" - - future = self.reset_client.call_async(request) - rclpy.spin_until_future_complete(self, future) - - return future.result() + def get_observation(self): @@ -354,7 +290,7 @@ def compute_reward(self, state, next_state): self.update_goal_service(new_x, new_y) self.steps_since_last_goal = 0 - + ftg_current_distance = math.dist(self.all_goals[(self.ftg_start_goal_index + self.ftg_goals_reached) % len(self.all_goals)], next_state[8 + self.LIDAR_NUM:8 + self.LIDAR_NUM + 2]) # Keeping track of FTG car goal number @@ -365,6 +301,8 @@ def compute_reward(self, state, next_state): if self.goals_reached >= (self.ftg_goals_reached + self.ftg_offset + 3): print(f'RL Car has overtaken FTG Car') reward += 200 + + # Ensure overtaking won't happen again self.ftg_goals_reached += 500 @@ -372,4 +310,57 @@ def compute_reward(self, state, next_state): reward -= 25 # TODO: find optimal value for this return reward + + def call_reset_service(self, + car_x, + car_y, + car_Y, + goal_x, + goal_y, + ftg_x, + ftg_y, + ftg_Y + ): + """ + Reset the car and goal position + """ + + request = CarBeatReset.Request() + + request.gx = goal_x + request.gy = goal_y + + request.cx_one = car_x + request.cy_one = car_y + request.cyaw_one = car_Y + + request.cx_two = ftg_x + request.cy_two = ftg_y + request.cyaw_two = ftg_Y + + request.flag = "car_and_goal" + + # Publish to reset Topic to reset other nodes + empty_msg = Empty() + self.reset_pub.publish(empty_msg) + + future = self.reset_client.call_async(request) + rclpy.spin_until_future_complete(self, future) + + return future.result() + + def update_goal_service(self, x, y): + """ + Reset the goal position + """ + + request = CarBeatReset.Request() + request.gx = x + request.gy = y + request.flag = "goal_only" + + future = self.reset_client.call_async(request) + rclpy.spin_until_future_complete(self, future) + + return future.result() \ No newline at end of file From d7b34ef667035ad47ad2296d730d05f5d556e101 Mon Sep 17 00:00:00 2001 From: retinfai Date: 
Sat, 21 Oct 2023 13:21:29 +1300 Subject: [PATCH 05/38] chore: set up updated train --- .../reinforcement_learning/updated_train.py | 347 ++++++++++++++++++ .../reinforcement_learning/util.py | 66 ++++ src/reinforcement_learning/setup.py | 2 +- 3 files changed, 414 insertions(+), 1 deletion(-) create mode 100644 src/reinforcement_learning/reinforcement_learning/updated_train.py create mode 100644 src/reinforcement_learning/reinforcement_learning/util.py diff --git a/src/reinforcement_learning/reinforcement_learning/updated_train.py b/src/reinforcement_learning/reinforcement_learning/updated_train.py new file mode 100644 index 00000000..942a6347 --- /dev/null +++ b/src/reinforcement_learning/reinforcement_learning/updated_train.py @@ -0,0 +1,347 @@ +import random +import time + +import numpy as np +import rclpy +import torch + +from cares_reinforcement_learning.memory import MemoryBuffer +from cares_reinforcement_learning.util.Record import Record +from cares_reinforcement_learning.util.NetworkFactory import NetworkFactory +from cares_reinforcement_learning.util import helpers as hlp + +from environments.CarBlockEnvironment import CarBlockEnvironment +from environments.CarGoalEnvironment import CarGoalEnvironment +from environments.CarTrackEnvironment import CarTrackEnvironment +from environments.CarWallEnvironment import CarWallEnvironment +from environments.CarBeatEnvironment import CarBeatEnvironment + +from .util import parse_args +from rclpy.parameter import Parameter +def main(): + rclpy.init() + + params: list(Parameter) = parse_args() + for param in params: + print(param) + print(param.value) + print(param.name) + + + global MAX_STEPS_TRAINING + global MAX_STEPS_EXPLORATION + global MAX_STEPS_PER_BATCH + global G + global BATCH_SIZE + global EVALUATE_EVERY_N_STEPS + global EVALUATE_FOR_M_EPISODES + global ALGORITHM + + ENVIRONMENT, \ + ALGORITHM, \ + TRACK, \ + MAX_STEPS_TRAINING, \ + MAX_STEPS_EXPLORATION, \ + GAMMA, \ + TAU, \ + G, \ + BATCH_SIZE, \ + BUFFER_SIZE, \ + SEED, \ + ACTOR_LR, \ + CRITIC_LR, \ + MAX_STEPS, \ + STEP_LENGTH, \ + REWARD_RANGE, \ + COLLISION_RANGE, \ + ACTOR_PATH, \ + CRITIC_PATH, \ + MAX_STEPS_PER_BATCH, \ + OBSERVATION_MODE, \ + EVALUATE_EVERY_N_STEPS, \ + EVALUATE_FOR_M_EPISODES = [param.value for param in params] + + if ACTOR_PATH != '' and CRITIC_PATH != '': + MAX_STEPS_EXPLORATION = 0 + + torch.manual_seed(SEED) + torch.cuda.manual_seed_all(SEED) + np.random.seed(SEED) + random.seed(SEED) + + print( + f'---------------------------------------------\n' + f'Environment: {ENVIRONMENT}\n' + f'Algorithm: {ALGORITHM}\n' + f'Exploration Steps: {MAX_STEPS_EXPLORATION}\n' + f'Training Steps: {MAX_STEPS_TRAINING}\n' + f'Gamma: {GAMMA}\n' + f'Tau: {TAU}\n' + f'G: {G}\n' + f'Batch Size: {BATCH_SIZE}\n' + f'Buffer Size: {BUFFER_SIZE}\n' + f'Seed: {SEED}\n' + f'Actor LR: {ACTOR_LR}\n' + f'Critic LR: {CRITIC_LR}\n' + f'Steps per Episode: {MAX_STEPS}\n' + f'Step Length: {STEP_LENGTH}\n' + f'Reward Range: {REWARD_RANGE}\n' + f'Collision Range: {COLLISION_RANGE}\n' + f'Critic Path: {CRITIC_PATH}\n' + f'Actor Path: {ACTOR_PATH}\n' + f'Max Steps per Batch: {MAX_STEPS_PER_BATCH}\n' + f'Observation Mode: {OBSERVATION_MODE}\n' + f'Evaluate Every N Steps: {EVALUATE_EVERY_N_STEPS}\n' + f'Evaluate For M Episodes: {EVALUATE_FOR_M_EPISODES}\n' + f'---------------------------------------------\n' + ) + + DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu') + time.sleep(3) + + match ENVIRONMENT: + case 'CarWall': + env = CarWallEnvironment('f1tenth', 
step_length=STEP_LENGTH, max_steps=MAX_STEPS, reward_range=REWARD_RANGE, collision_range=COLLISION_RANGE) + case 'CarBlock': + env = CarBlockEnvironment('f1tenth', step_length=STEP_LENGTH, max_steps=MAX_STEPS, reward_range=REWARD_RANGE, collision_range=COLLISION_RANGE) + case 'CarTrack': + env = CarTrackEnvironment('f1tenth', step_length=STEP_LENGTH, max_steps=MAX_STEPS, reward_range=REWARD_RANGE, collision_range=COLLISION_RANGE, track=TRACK, observation_mode=OBSERVATION_MODE) + case 'CarBeat': + env = CarBeatEnvironment('f1tenth_one', 'f1tenth_two', step_length=STEP_LENGTH, max_steps=MAX_STEPS, reward_range=REWARD_RANGE, collision_range=COLLISION_RANGE, track=TRACK, observation_mode=OBSERVATION_MODE) + case _: + env = CarGoalEnvironment('f1tenth', step_length=STEP_LENGTH, max_steps=MAX_STEPS, reward_range=REWARD_RANGE) + + network_factory_args = { + 'observation_size': env.OBSERVATION_SIZE, + 'action_num': env.ACTION_NUM, + 'actor_lr': ACTOR_LR, + 'critic_lr': CRITIC_LR, + 'gamma': GAMMA, + 'tau': TAU, + 'device': DEVICE + } + + agent_factory = NetworkFactory() + agent = agent_factory.create_network(ALGORITHM, network_factory_args) + + if ACTOR_PATH != '' and CRITIC_PATH != '': + print('Reading saved models into actor and critic') + agent.actor_net.load_state_dict(torch.load(ACTOR_PATH)) + agent.critic_net.load_state_dict(torch.load(CRITIC_PATH)) + print('Successfully Loaded models') + + networks = {'actor': agent.actor_net, 'critic': agent.critic_net} + config = { + 'environment': ENVIRONMENT, + 'algorithm': ALGORITHM, + 'max_steps_training': MAX_STEPS_TRAINING, + 'max_steps_exploration': MAX_STEPS_EXPLORATION, + 'gamma': GAMMA, + 'tau': TAU, + 'g': G, + 'batch_size': BATCH_SIZE, + 'buffer_size': BUFFER_SIZE, + 'seed': SEED, + 'actor_lr': ACTOR_LR, + 'critic_lr': CRITIC_LR, + 'max_steps': MAX_STEPS, + 'step_length': STEP_LENGTH, + 'reward_range': REWARD_RANGE, + 'collision_range': COLLISION_RANGE, + 'max_steps_per_batch': MAX_STEPS_PER_BATCH, + 'observation_mode': OBSERVATION_MODE, + 'evaluate_every_n_steps': EVALUATE_EVERY_N_STEPS, + 'evaluate_for_m_episodes': EVALUATE_FOR_M_EPISODES + } + + if (ENVIRONMENT == 'CarTrack'): + config['track'] = TRACK + + record = Record(networks=networks, checkpoint_freq=100, config=config) + + if ALGORITHM == 'PPO': + train_ppo(env, agent, record=record) + else: + train(env=env, agent=agent, record=record) + + +def train(env, agent, record: Record): + memory = MemoryBuffer() + + episode_timesteps = 0 + episode_reward = 0 + episode_num = 0 + evaluation_reward = None + + state, _ = env.reset() + + evaluate = False + + print(f'Initial State: {state}') + historical_reward = {"step": [], "episode_reward": []} + + for total_step_counter in range(int(MAX_STEPS_TRAINING)): + episode_timesteps += 1 + + if total_step_counter < MAX_STEPS_EXPLORATION: + print(f"Running Exploration Steps {total_step_counter}/{MAX_STEPS_EXPLORATION}") + action_env = np.asarray([random.uniform(env.MIN_ACTIONS[0], env.MAX_ACTIONS[0]), random.uniform(env.MIN_ACTIONS[1], env.MAX_ACTIONS[1])]) # action range the env uses [e.g. -2 , 2 for pendulum] + action = hlp.normalize(action_env, env.MAX_ACTIONS, env.MIN_ACTIONS) # algorithm range [-1, 1] + else: + action = agent.select_action_from_policy(state) # algorithm range [-1, 1] + action_env = hlp.denormalize(action, env.MAX_ACTIONS, env.MIN_ACTIONS) # mapping to env range [e.g. 
-2 , 2 for pendulum] + + next_state, reward, done, truncated, info = env.step(action_env) + memory.add(state=state, action=action, reward=reward, next_state=next_state, done=done) + + state = next_state + episode_reward += reward + + if total_step_counter >= MAX_STEPS_EXPLORATION: + for _ in range(G): + experiences = memory.sample(BATCH_SIZE) + experiences = (experiences['state'], experiences['action'], experiences['reward'], experiences['next_state'], experiences['done']) + agent.train_policy(experiences) + + + + if total_step_counter % EVALUATE_EVERY_N_STEPS == 0: + evaluate = True + + + record.log( + out=done or truncated, + Step=total_step_counter, + Episode=episode_num, + Step_Reward=reward, + Episode_Reward=episode_reward if (done or truncated) else None, + Evaluation_Reward=evaluation_reward + ) + + evaluation_reward = None + + if done or truncated: + historical_reward["step"].append(total_step_counter) + historical_reward["episode_reward"].append(episode_reward) + + if evaluate: + evaluation_reward = evaluate_policy(env, agent, EVALUATE_FOR_M_EPISODES) + evaluate = False + + # Reset environment + state, _ = env.reset() + episode_reward = 0 + episode_timesteps = 0 + episode_num += 1 + +def train_ppo(env, agent, record): + max_steps_training = MAX_STEPS_TRAINING + max_steps_per_batch = MAX_STEPS_PER_BATCH + + min_action_value = env.MIN_ACTIONS + max_action_value = env.MAX_ACTIONS + + episode_timesteps = 0 + episode_num = 0 + episode_reward = 0 + time_step = 1 + evaluation_reward = None + evaluate = False + + memory = MemoryBuffer() + + state, _ = env.reset() + + for total_step_counter in range(int(max_steps_training)): + episode_timesteps += 1 + + action, log_prob = agent.select_action_from_policy(state) + action_env = hlp.denormalize(action, max_action_value, min_action_value) + + next_state, reward, done, truncated, _ = env.step(action_env) + memory.add(state=state, action=action, reward=reward, next_state=next_state, done=done, log_prob=log_prob) + + state = next_state + episode_reward += reward + + if time_step % max_steps_per_batch == 0: + for _ in range(G): + experience = memory.sample(BATCH_SIZE) + info = agent.train_policy(( + experience['state'], + experience['action'], + experience['reward'], + experience['next_state'], + experience['done'], + experience['log_prob'] + )) + + memory.clear() + + if total_step_counter % EVALUATE_EVERY_N_STEPS == 0: + evaluate = True + + record.log( + Steps = total_step_counter, + Episode= episode_num, + Step_reward= reward, + Episode_reward= episode_reward if (done or truncated) else None, + Evaluation_reward=evaluation_reward, + out=done or truncated, + ) + evaluation_reward = None + time_step += 1 + + if done or truncated: + print(f'Episode: {episode_num} | Reward: {episode_reward} | Steps: {time_step}') + + if evaluate: + evaluation_reward = evaluate_policy(env, agent, EVALUATE_FOR_M_EPISODES) + evaluate = False + + # Reset environment + state, _ = env.reset() + episode_reward = 0 + episode_timesteps = 0 + episode_num += 1 + +def evaluate_policy(env, agent, num_episodes): + + episode_reward_history = [] + + print('Beginning Evaluation----------------------------') + + for ep in range(num_episodes): + state, _ = env.reset() + + episode_timesteps = 0 + episode_reward = 0 + + truncated = False + terminated = False + + while not truncated and not terminated: + + if ALGORITHM == 'PPO': + action = agent.select_action_from_policy(state) + else: + action = agent.select_action_from_policy(state, evaluation=True) + + action = hlp.denormalize(action, 
env.MAX_ACTIONS, env.MIN_ACTIONS) + next_state, reward, terminated, truncated, _ = env.step(action) + + episode_reward += reward + state = next_state + + print(f'Evaluation Episode {ep + 1} Completed with a Reward of {episode_reward}') + episode_reward_history.append(episode_reward) + + avg_reward = sum(episode_reward_history) / len(episode_reward_history) + + print(f'Evaluation Completed: Avg Reward over {num_episodes} Episodes is {avg_reward} ----------------------------') + + return avg_reward + + +if __name__ == '__main__': + main() diff --git a/src/reinforcement_learning/reinforcement_learning/util.py b/src/reinforcement_learning/reinforcement_learning/util.py new file mode 100644 index 00000000..2b4c402b --- /dev/null +++ b/src/reinforcement_learning/reinforcement_learning/util.py @@ -0,0 +1,66 @@ +import rclpy + +def parse_args(): + params = _get_params() + return params + +def _get_params(): + ''' + This function fetches the hyperparameters passed in through the launch files + - The hyperparameters below are defaults, to change them, you should change the train.yaml config + ''' + param_node = rclpy.create_node('params') + param_node.declare_parameters( + '', + [ + ('environment', 'CarGoal'), + ('algorithm', 'TD3'), + ('track', 'track_1'), + ('gamma', 0.95), + ('tau', 0.005), + ('g', 10), + ('batch_size', 32), + ('buffer_size', 1_000_000), + ('seed', 123), # TODO: This doesn't do anything yet + ('actor_lr', 1e-4), + ('critic_lr', 1e-3), + ('max_steps_training', 1_000_000), + ('max_steps_exploration', 1_000), + ('max_steps', 100), + ('step_length', 0.25), + ('reward_range', 0.2), + ('collision_range', 0.2), + ('actor_path', ''), + ('critic_path', ''), + ('max_steps_per_batch', 5000), + ('observation_mode', 'no_position'), + ('evaluate_every_n_steps', 10000), + ('evaluate_for_m_episodes', 5) + ] + ) + + return param_node.get_parameters([ + 'environment', + 'algorithm', + 'track', + 'max_steps_training', + 'max_steps_exploration', + 'gamma', + 'tau', + 'g', + 'batch_size', + 'buffer_size', + 'seed', + 'actor_lr', + 'critic_lr', + 'max_steps', + 'step_length', + 'reward_range', + 'collision_range', + 'actor_path', + 'critic_path', + 'max_steps_per_batch', + 'observation_mode', + 'evaluate_every_n_steps', + 'evaluate_for_m_episodes' + ]) \ No newline at end of file diff --git a/src/reinforcement_learning/setup.py b/src/reinforcement_learning/setup.py index ccb4e262..93ed141e 100644 --- a/src/reinforcement_learning/setup.py +++ b/src/reinforcement_learning/setup.py @@ -25,7 +25,7 @@ entry_points={ 'console_scripts': [ 'sanity_check = reinforcement_learning.sanity_check:main', - 'train = reinforcement_learning.train:main', + 'train = reinforcement_learning.updated_train:main', 'test = reinforcement_learning.test:main', 'greatest_gap = reinforcement_learning.greatest_gap:main', ], From c8123d4f47a35f66bdb9c2cad27d9db82b0fc20a Mon Sep 17 00:00:00 2001 From: retinfai Date: Sat, 21 Oct 2023 14:08:02 +1300 Subject: [PATCH 06/38] feat: add args parser --- .../reinforcement_learning/parse_args.py | 108 ++++++ .../reinforcement_learning/updated_train.py | 337 +----------------- .../reinforcement_learning/util.py | 66 ---- 3 files changed, 125 insertions(+), 386 deletions(-) create mode 100644 src/reinforcement_learning/reinforcement_learning/parse_args.py delete mode 100644 src/reinforcement_learning/reinforcement_learning/util.py diff --git a/src/reinforcement_learning/reinforcement_learning/parse_args.py b/src/reinforcement_learning/reinforcement_learning/parse_args.py new file mode 100644 
index 00000000..3fa37fb1 --- /dev/null +++ b/src/reinforcement_learning/reinforcement_learning/parse_args.py @@ -0,0 +1,108 @@ +import rclpy +from rclpy.node import Node +from rclpy.parameter import Parameter + +def parse_args(): + param_node = __declare_params() + + env_params = __get_env_params(param_node) + algorithm_params = __get_algorithm_params(param_node) + network_params = __get_network_params(param_node) + + return env_params, algorithm_params, network_params + +def __declare_params(): + param_node = rclpy.create_node('params') + param_node.declare_parameters( + '', + [ + # Environment Parameters --------------------------- + ('environment', 'CarGoal'), + ('track', 'track_1'), + ('max_steps', 100), + ('step_length', 0.25), + ('reward_range', 0.2), + ('collision_range', 0.2), + ('observation_mode', 'no_position'), + + # Algorithm Parameters ----------------------------- + ('g', 10), + ('batch_size', 32), + ('buffer_size', 1_000_000), + ('seed', 123), + ('max_steps_training', 1_000_000), + ('max_steps_exploration', 1_000), + ('max_steps_per_batch', 5000), + ('evaluate_every_n_steps', 10000), + ('evaluate_for_m_episodes', 5), + + # Network Parameters ------------------------------- + ('actor_path', ''), + ('critic_path', ''), + ('algorithm', 'TD3'), + ('gamma', 0.95), + ('tau', 0.005), + ('actor_lr', 1e-4), + ('critic_lr', 1e-3), + + ] + ) + + return param_node + +def __get_env_params(param_node: Node): + + params: list(Parameter) = param_node.get_parameters([ + 'environment', + 'track', + 'max_steps', + 'step_length', + 'reward_range', + 'collision_range', + 'observation_mode' + ]) + + # Convert to Dictionary + params_dict = {} + for param in params: + params_dict[param.name] = param.value + + return params_dict + +def __get_algorithm_params(param_node: Node): + params = param_node.get_parameters([ + 'g', + 'batch_size', + 'buffer_size', + 'seed', + 'max_steps_training', + 'max_steps_exploration', + 'max_steps_per_batch', + 'evaluate_every_n_steps', + 'evaluate_for_m_episodes' + ]) + + # Convert to Dictionary + params_dict = {} + for param in params: + params_dict[param.name] = param.value + + return params_dict + +def __get_network_params(param_node: Node): + params = param_node.get_parameters([ + 'actor_path', + 'critic_path', + 'algorithm', + 'gamma', + 'tau', + 'actor_lr', + 'critic_lr' + ]) + + # Convert to Dictionary + params_dict = {} + for param in params: + params_dict[param.name] = param.value + + return params_dict diff --git a/src/reinforcement_learning/reinforcement_learning/updated_train.py b/src/reinforcement_learning/reinforcement_learning/updated_train.py index 942a6347..3fc5dd04 100644 --- a/src/reinforcement_learning/reinforcement_learning/updated_train.py +++ b/src/reinforcement_learning/reinforcement_learning/updated_train.py @@ -1,8 +1,10 @@ import random import time +import yaml import numpy as np import rclpy +from rclpy.parameter import Parameter import torch from cares_reinforcement_learning.memory import MemoryBuffer @@ -16,332 +18,27 @@ from environments.CarWallEnvironment import CarWallEnvironment from environments.CarBeatEnvironment import CarBeatEnvironment -from .util import parse_args -from rclpy.parameter import Parameter +from .parse_args import parse_args + def main(): rclpy.init() - params: list(Parameter) = parse_args() - for param in params: - print(param) - print(param.value) - print(param.name) - - - global MAX_STEPS_TRAINING - global MAX_STEPS_EXPLORATION - global MAX_STEPS_PER_BATCH - global G - global BATCH_SIZE - global 
EVALUATE_EVERY_N_STEPS - global EVALUATE_FOR_M_EPISODES - global ALGORITHM + env_config, algorithm_config, network_config = parse_args() - ENVIRONMENT, \ - ALGORITHM, \ - TRACK, \ - MAX_STEPS_TRAINING, \ - MAX_STEPS_EXPLORATION, \ - GAMMA, \ - TAU, \ - G, \ - BATCH_SIZE, \ - BUFFER_SIZE, \ - SEED, \ - ACTOR_LR, \ - CRITIC_LR, \ - MAX_STEPS, \ - STEP_LENGTH, \ - REWARD_RANGE, \ - COLLISION_RANGE, \ - ACTOR_PATH, \ - CRITIC_PATH, \ - MAX_STEPS_PER_BATCH, \ - OBSERVATION_MODE, \ - EVALUATE_EVERY_N_STEPS, \ - EVALUATE_FOR_M_EPISODES = [param.value for param in params] + # Set Seeds + torch.manual_seed(algorithm_config['seed']) + torch.cuda.manual_seed_all(algorithm_config['seed']) + np.random.seed(algorithm_config['seed']) + random.seed(algorithm_config['seed']) - if ACTOR_PATH != '' and CRITIC_PATH != '': - MAX_STEPS_EXPLORATION = 0 + DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu') - torch.manual_seed(SEED) - torch.cuda.manual_seed_all(SEED) - np.random.seed(SEED) - random.seed(SEED) - print( - f'---------------------------------------------\n' - f'Environment: {ENVIRONMENT}\n' - f'Algorithm: {ALGORITHM}\n' - f'Exploration Steps: {MAX_STEPS_EXPLORATION}\n' - f'Training Steps: {MAX_STEPS_TRAINING}\n' - f'Gamma: {GAMMA}\n' - f'Tau: {TAU}\n' - f'G: {G}\n' - f'Batch Size: {BATCH_SIZE}\n' - f'Buffer Size: {BUFFER_SIZE}\n' - f'Seed: {SEED}\n' - f'Actor LR: {ACTOR_LR}\n' - f'Critic LR: {CRITIC_LR}\n' - f'Steps per Episode: {MAX_STEPS}\n' - f'Step Length: {STEP_LENGTH}\n' - f'Reward Range: {REWARD_RANGE}\n' - f'Collision Range: {COLLISION_RANGE}\n' - f'Critic Path: {CRITIC_PATH}\n' - f'Actor Path: {ACTOR_PATH}\n' - f'Max Steps per Batch: {MAX_STEPS_PER_BATCH}\n' - f'Observation Mode: {OBSERVATION_MODE}\n' - f'Evaluate Every N Steps: {EVALUATE_EVERY_N_STEPS}\n' - f'Evaluate For M Episodes: {EVALUATE_FOR_M_EPISODES}\n' - f'---------------------------------------------\n' + f'Environment Config: ------------------------------------- \n' + f'{yaml.dump(env_config, default_flow_style=False)} \n' + f'Algorithm Config: ------------------------------------- \n' + f'{yaml.dump(algorithm_config, default_flow_style=False)} \n' + f'Network Config: ------------------------------------- \n' + f'{yaml.dump(network_config, default_flow_style=False)} \n' ) - DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu') - time.sleep(3) - - match ENVIRONMENT: - case 'CarWall': - env = CarWallEnvironment('f1tenth', step_length=STEP_LENGTH, max_steps=MAX_STEPS, reward_range=REWARD_RANGE, collision_range=COLLISION_RANGE) - case 'CarBlock': - env = CarBlockEnvironment('f1tenth', step_length=STEP_LENGTH, max_steps=MAX_STEPS, reward_range=REWARD_RANGE, collision_range=COLLISION_RANGE) - case 'CarTrack': - env = CarTrackEnvironment('f1tenth', step_length=STEP_LENGTH, max_steps=MAX_STEPS, reward_range=REWARD_RANGE, collision_range=COLLISION_RANGE, track=TRACK, observation_mode=OBSERVATION_MODE) - case 'CarBeat': - env = CarBeatEnvironment('f1tenth_one', 'f1tenth_two', step_length=STEP_LENGTH, max_steps=MAX_STEPS, reward_range=REWARD_RANGE, collision_range=COLLISION_RANGE, track=TRACK, observation_mode=OBSERVATION_MODE) - case _: - env = CarGoalEnvironment('f1tenth', step_length=STEP_LENGTH, max_steps=MAX_STEPS, reward_range=REWARD_RANGE) - - network_factory_args = { - 'observation_size': env.OBSERVATION_SIZE, - 'action_num': env.ACTION_NUM, - 'actor_lr': ACTOR_LR, - 'critic_lr': CRITIC_LR, - 'gamma': GAMMA, - 'tau': TAU, - 'device': DEVICE - } - - agent_factory = NetworkFactory() - agent = 
agent_factory.create_network(ALGORITHM, network_factory_args) - - if ACTOR_PATH != '' and CRITIC_PATH != '': - print('Reading saved models into actor and critic') - agent.actor_net.load_state_dict(torch.load(ACTOR_PATH)) - agent.critic_net.load_state_dict(torch.load(CRITIC_PATH)) - print('Successfully Loaded models') - - networks = {'actor': agent.actor_net, 'critic': agent.critic_net} - config = { - 'environment': ENVIRONMENT, - 'algorithm': ALGORITHM, - 'max_steps_training': MAX_STEPS_TRAINING, - 'max_steps_exploration': MAX_STEPS_EXPLORATION, - 'gamma': GAMMA, - 'tau': TAU, - 'g': G, - 'batch_size': BATCH_SIZE, - 'buffer_size': BUFFER_SIZE, - 'seed': SEED, - 'actor_lr': ACTOR_LR, - 'critic_lr': CRITIC_LR, - 'max_steps': MAX_STEPS, - 'step_length': STEP_LENGTH, - 'reward_range': REWARD_RANGE, - 'collision_range': COLLISION_RANGE, - 'max_steps_per_batch': MAX_STEPS_PER_BATCH, - 'observation_mode': OBSERVATION_MODE, - 'evaluate_every_n_steps': EVALUATE_EVERY_N_STEPS, - 'evaluate_for_m_episodes': EVALUATE_FOR_M_EPISODES - } - - if (ENVIRONMENT == 'CarTrack'): - config['track'] = TRACK - - record = Record(networks=networks, checkpoint_freq=100, config=config) - - if ALGORITHM == 'PPO': - train_ppo(env, agent, record=record) - else: - train(env=env, agent=agent, record=record) - - -def train(env, agent, record: Record): - memory = MemoryBuffer() - - episode_timesteps = 0 - episode_reward = 0 - episode_num = 0 - evaluation_reward = None - - state, _ = env.reset() - - evaluate = False - - print(f'Initial State: {state}') - historical_reward = {"step": [], "episode_reward": []} - - for total_step_counter in range(int(MAX_STEPS_TRAINING)): - episode_timesteps += 1 - - if total_step_counter < MAX_STEPS_EXPLORATION: - print(f"Running Exploration Steps {total_step_counter}/{MAX_STEPS_EXPLORATION}") - action_env = np.asarray([random.uniform(env.MIN_ACTIONS[0], env.MAX_ACTIONS[0]), random.uniform(env.MIN_ACTIONS[1], env.MAX_ACTIONS[1])]) # action range the env uses [e.g. -2 , 2 for pendulum] - action = hlp.normalize(action_env, env.MAX_ACTIONS, env.MIN_ACTIONS) # algorithm range [-1, 1] - else: - action = agent.select_action_from_policy(state) # algorithm range [-1, 1] - action_env = hlp.denormalize(action, env.MAX_ACTIONS, env.MIN_ACTIONS) # mapping to env range [e.g. 
-2 , 2 for pendulum] - - next_state, reward, done, truncated, info = env.step(action_env) - memory.add(state=state, action=action, reward=reward, next_state=next_state, done=done) - - state = next_state - episode_reward += reward - - if total_step_counter >= MAX_STEPS_EXPLORATION: - for _ in range(G): - experiences = memory.sample(BATCH_SIZE) - experiences = (experiences['state'], experiences['action'], experiences['reward'], experiences['next_state'], experiences['done']) - agent.train_policy(experiences) - - - - if total_step_counter % EVALUATE_EVERY_N_STEPS == 0: - evaluate = True - - - record.log( - out=done or truncated, - Step=total_step_counter, - Episode=episode_num, - Step_Reward=reward, - Episode_Reward=episode_reward if (done or truncated) else None, - Evaluation_Reward=evaluation_reward - ) - - evaluation_reward = None - - if done or truncated: - historical_reward["step"].append(total_step_counter) - historical_reward["episode_reward"].append(episode_reward) - - if evaluate: - evaluation_reward = evaluate_policy(env, agent, EVALUATE_FOR_M_EPISODES) - evaluate = False - - # Reset environment - state, _ = env.reset() - episode_reward = 0 - episode_timesteps = 0 - episode_num += 1 - -def train_ppo(env, agent, record): - max_steps_training = MAX_STEPS_TRAINING - max_steps_per_batch = MAX_STEPS_PER_BATCH - - min_action_value = env.MIN_ACTIONS - max_action_value = env.MAX_ACTIONS - - episode_timesteps = 0 - episode_num = 0 - episode_reward = 0 - time_step = 1 - evaluation_reward = None - evaluate = False - - memory = MemoryBuffer() - - state, _ = env.reset() - - for total_step_counter in range(int(max_steps_training)): - episode_timesteps += 1 - - action, log_prob = agent.select_action_from_policy(state) - action_env = hlp.denormalize(action, max_action_value, min_action_value) - - next_state, reward, done, truncated, _ = env.step(action_env) - memory.add(state=state, action=action, reward=reward, next_state=next_state, done=done, log_prob=log_prob) - - state = next_state - episode_reward += reward - - if time_step % max_steps_per_batch == 0: - for _ in range(G): - experience = memory.sample(BATCH_SIZE) - info = agent.train_policy(( - experience['state'], - experience['action'], - experience['reward'], - experience['next_state'], - experience['done'], - experience['log_prob'] - )) - - memory.clear() - - if total_step_counter % EVALUATE_EVERY_N_STEPS == 0: - evaluate = True - - record.log( - Steps = total_step_counter, - Episode= episode_num, - Step_reward= reward, - Episode_reward= episode_reward if (done or truncated) else None, - Evaluation_reward=evaluation_reward, - out=done or truncated, - ) - evaluation_reward = None - time_step += 1 - - if done or truncated: - print(f'Episode: {episode_num} | Reward: {episode_reward} | Steps: {time_step}') - - if evaluate: - evaluation_reward = evaluate_policy(env, agent, EVALUATE_FOR_M_EPISODES) - evaluate = False - - # Reset environment - state, _ = env.reset() - episode_reward = 0 - episode_timesteps = 0 - episode_num += 1 - -def evaluate_policy(env, agent, num_episodes): - - episode_reward_history = [] - - print('Beginning Evaluation----------------------------') - - for ep in range(num_episodes): - state, _ = env.reset() - - episode_timesteps = 0 - episode_reward = 0 - - truncated = False - terminated = False - - while not truncated and not terminated: - - if ALGORITHM == 'PPO': - action = agent.select_action_from_policy(state) - else: - action = agent.select_action_from_policy(state, evaluation=True) - - action = hlp.denormalize(action, 
env.MAX_ACTIONS, env.MIN_ACTIONS) - next_state, reward, terminated, truncated, _ = env.step(action) - - episode_reward += reward - state = next_state - - print(f'Evaluation Episode {ep + 1} Completed with a Reward of {episode_reward}') - episode_reward_history.append(episode_reward) - - avg_reward = sum(episode_reward_history) / len(episode_reward_history) - - print(f'Evaluation Completed: Avg Reward over {num_episodes} Episodes is {avg_reward} ----------------------------') - - return avg_reward - - -if __name__ == '__main__': - main() diff --git a/src/reinforcement_learning/reinforcement_learning/util.py b/src/reinforcement_learning/reinforcement_learning/util.py deleted file mode 100644 index 2b4c402b..00000000 --- a/src/reinforcement_learning/reinforcement_learning/util.py +++ /dev/null @@ -1,66 +0,0 @@ -import rclpy - -def parse_args(): - params = _get_params() - return params - -def _get_params(): - ''' - This function fetches the hyperparameters passed in through the launch files - - The hyperparameters below are defaults, to change them, you should change the train.yaml config - ''' - param_node = rclpy.create_node('params') - param_node.declare_parameters( - '', - [ - ('environment', 'CarGoal'), - ('algorithm', 'TD3'), - ('track', 'track_1'), - ('gamma', 0.95), - ('tau', 0.005), - ('g', 10), - ('batch_size', 32), - ('buffer_size', 1_000_000), - ('seed', 123), # TODO: This doesn't do anything yet - ('actor_lr', 1e-4), - ('critic_lr', 1e-3), - ('max_steps_training', 1_000_000), - ('max_steps_exploration', 1_000), - ('max_steps', 100), - ('step_length', 0.25), - ('reward_range', 0.2), - ('collision_range', 0.2), - ('actor_path', ''), - ('critic_path', ''), - ('max_steps_per_batch', 5000), - ('observation_mode', 'no_position'), - ('evaluate_every_n_steps', 10000), - ('evaluate_for_m_episodes', 5) - ] - ) - - return param_node.get_parameters([ - 'environment', - 'algorithm', - 'track', - 'max_steps_training', - 'max_steps_exploration', - 'gamma', - 'tau', - 'g', - 'batch_size', - 'buffer_size', - 'seed', - 'actor_lr', - 'critic_lr', - 'max_steps', - 'step_length', - 'reward_range', - 'collision_range', - 'actor_path', - 'critic_path', - 'max_steps_per_batch', - 'observation_mode', - 'evaluate_every_n_steps', - 'evaluate_for_m_episodes' - ]) \ No newline at end of file From b2ded1ba610b700b176ca54f3621df942e714c8a Mon Sep 17 00:00:00 2001 From: retinfai Date: Sat, 21 Oct 2023 15:06:03 +1300 Subject: [PATCH 07/38] feat: add CarTrack and CarBeat to env factory --- .../environments/CarBeatEnvironment.py | 10 +-- src/environments/launch/carbeat.launch.py | 78 ++++++++++--------- src/reinforcement_learning/config/train.yaml | 25 +----- .../launch/train.launch.py | 2 + .../EnvironmentFactory.py | 44 +++++++++++ .../reinforcement_learning/parse_args.py | 12 ++- .../reinforcement_learning/updated_train.py | 10 +-- 7 files changed, 110 insertions(+), 71 deletions(-) create mode 100644 src/reinforcement_learning/reinforcement_learning/EnvironmentFactory.py diff --git a/src/environments/environments/CarBeatEnvironment.py b/src/environments/environments/CarBeatEnvironment.py index 26fbf177..5d86c316 100644 --- a/src/environments/environments/CarBeatEnvironment.py +++ b/src/environments/environments/CarBeatEnvironment.py @@ -20,13 +20,13 @@ class CarBeatEnvironment(Node): def __init__(self, - car_one_name, - car_two_name, + rl_car_name, + ftg_car_name, reward_range=1, max_steps=50, collision_range=0.2, step_length=0.5, - track='track_1', + track='multi_track', observation_mode='lidar_only', 
max_goals=500, num_lidar_points=50 @@ -34,8 +34,8 @@ def __init__(self, super().__init__('car_beat_environment') # Environment Details ---------------------------------------- - self.NAME = car_one_name - self.OTHER_CAR_NAME = car_two_name + self.NAME = rl_car_name + self.OTHER_CAR_NAME = ftg_car_name self.MAX_STEPS = max_steps self.STEP_LENGTH = step_length self.MAX_ACTIONS = np.asarray([5, 3.14]) diff --git a/src/environments/launch/carbeat.launch.py b/src/environments/launch/carbeat.launch.py index fde5b9aa..d6aa957e 100644 --- a/src/environments/launch/carbeat.launch.py +++ b/src/environments/launch/carbeat.launch.py @@ -9,9 +9,12 @@ def launch(context, *args, **kwargs): pkg_ros_gz_sim = get_package_share_directory('ros_gz_sim') pkg_environments = get_package_share_directory('environments') + pkg_f1tenth_bringup = get_package_share_directory('f1tenth_bringup') track = LaunchConfiguration('track').perform(context) - + car_one = LaunchConfiguration('car_one').perform(context) + car_two = LaunchConfiguration('car_two').perform(context) + gz_sim = IncludeLaunchDescription( launch_description_source=PythonLaunchDescriptionSource( os.path.join(pkg_ros_gz_sim, 'launch', 'gz_sim.launch.py')), @@ -20,24 +23,56 @@ def launch(context, *args, **kwargs): }.items() ) + f1tenth_one = IncludeLaunchDescription( + launch_description_source=PythonLaunchDescriptionSource( + os.path.join(pkg_f1tenth_bringup, 'simulation_bringup.launch.py')), + launch_arguments={ + 'name': car_one, + 'world': 'empty', + 'x': '0', + 'y': '0', + 'z': '5', + }.items() + ) + + f1tenth_two = IncludeLaunchDescription( + launch_description_source=PythonLaunchDescriptionSource( + os.path.join(pkg_f1tenth_bringup, 'simulation_bringup.launch.py')), + launch_arguments={ + 'name': car_two, + 'world': 'empty', + 'x': '0', + 'y': '1', + 'z': '5', + }.items() + ) + controller = Node( package='controllers', executable='ftg_policy', output='screen', parameters=[ - {'car_name': 'f1tenth_two', 'track_name': track}, + {'car_name': car_two, 'track_name': track}, ], ) - return[gz_sim, controller] + return[gz_sim, controller, f1tenth_one, f1tenth_two] def generate_launch_description(): - pkg_f1tenth_bringup = get_package_share_directory('f1tenth_bringup') - track_arg = DeclareLaunchArgument( 'track', default_value='track_1' ) + car_one = DeclareLaunchArgument( + 'car_one', + default_value='f1tenth_one' + ) + + car_two = DeclareLaunchArgument( + 'car_two', + default_value='f1tenth_two' + ) + service_bridge = Node( package='ros_gz_bridge', executable='parameter_bridge', @@ -54,48 +89,19 @@ def generate_launch_description(): ], ) - f1tenth_one = IncludeLaunchDescription( - launch_description_source=PythonLaunchDescriptionSource( - os.path.join(pkg_f1tenth_bringup, 'simulation_bringup.launch.py')), - launch_arguments={ - 'name': 'f1tenth_one', - 'world': 'empty', - 'x': '0', - 'y': '0', - 'z': '5', - }.items() - ) - - f1tenth_two = IncludeLaunchDescription( - launch_description_source=PythonLaunchDescriptionSource( - os.path.join(pkg_f1tenth_bringup, 'simulation_bringup.launch.py')), - launch_arguments={ - 'name': 'f1tenth_two', - 'world': 'empty', - 'x': '0', - 'y': '1', - 'z': '5', - }.items() - ) - - #TODO: dynamically change car name - #TODO: This doesn't work yet - #TODO: Create CarWall Reset reset = Node( package='environments', executable='CarBeatReset', output='screen', ) - - ld = LaunchDescription([ track_arg, + car_one, + car_two, OpaqueFunction(function=launch), service_bridge, reset, - f1tenth_one, - f1tenth_two, ]) return ld \ No newline 
at end of file diff --git a/src/reinforcement_learning/config/train.yaml b/src/reinforcement_learning/config/train.yaml index 9687b2d8..bfefbafc 100644 --- a/src/reinforcement_learning/config/train.yaml +++ b/src/reinforcement_learning/config/train.yaml @@ -1,25 +1,6 @@ train: ros__parameters: - environment: 'CarTrack' # CarGoal, CarWall, CarBlock, CarTrack, CarBeat + environment: 'CarBeat' # CarGoal, CarWall, CarBlock, CarTrack, CarBeat track: 'multi_track' # track_1, track_2, track_3, multi_track, multi_track_testing -> only applies for CarTrack - algorithm: 'TD3' - max_steps_exploration: 1000 - max_steps_training: 1000000 - reward_range: 2.0 - collision_range: 0.2 - observation_mode: 'lidar_only' - evaluate_every_n_steps: 2000 - evaluate_for_m_episodes: 3 - # actor_path & critic_path must exist for retrain.sh, it can't be commented - # actor_path: 'rl_logs/23_08_02_17:59:13/models/actor_checkpoint.pht' - # critic_path: 'rl_logs/23_08_02_17:59:13/models/critic_checkpoint.pht' - gamma: 0.95 - # tau: 0.005 - g: 1 - # batch_size: 32 - # buffer_size: 1000000 - actor_lr: 0.0001 - critic_lr: 0.001 - # max_steps: 10 - step_length: 0.1 - # seed: 123 + car_name: 'New_Car' + ftg_car_name: 'New_Car_two' diff --git a/src/reinforcement_learning/launch/train.launch.py b/src/reinforcement_learning/launch/train.launch.py index 09d5d3d3..cb3c0345 100644 --- a/src/reinforcement_learning/launch/train.launch.py +++ b/src/reinforcement_learning/launch/train.launch.py @@ -32,6 +32,8 @@ def generate_launch_description(): os.path.join(pkg_environments, f'{env_launch[env]}.launch.py')), launch_arguments={ 'track': TextSubstitution(text=str(config['train']['ros__parameters']['track'])), + 'car_one': TextSubstitution(text=str(config['train']['ros__parameters']['car_name']) if 'car_name' in config['train']['ros__parameters'] else 'f1tenth'), + 'car_two': TextSubstitution(text=str(config['train']['ros__parameters']['ftg_car_name']) if 'ftg_car_name' in config['train']['ros__parameters'] else 'ftg_car'), }.items() #TODO: this doesn't do anything ) diff --git a/src/reinforcement_learning/reinforcement_learning/EnvironmentFactory.py b/src/reinforcement_learning/reinforcement_learning/EnvironmentFactory.py new file mode 100644 index 00000000..f726efc3 --- /dev/null +++ b/src/reinforcement_learning/reinforcement_learning/EnvironmentFactory.py @@ -0,0 +1,44 @@ +from environments.CarBlockEnvironment import CarBlockEnvironment +from environments.CarGoalEnvironment import CarGoalEnvironment +from environments.CarTrackEnvironment import CarTrackEnvironment +from environments.CarWallEnvironment import CarWallEnvironment +from environments.CarBeatEnvironment import CarBeatEnvironment + +class EnvironmentFactory: + def __init__(self): + pass + + def create(self, name, config): + print(config) + if name == 'CarBlock': + return CarBlockEnvironment() + elif name == 'CarGoal': + return CarGoalEnvironment() + elif name == 'CarTrack': + return CarTrackEnvironment( + config['car_name'], + config['reward_range'], + config['max_steps'], + config['collision_range'], + config['step_length'], + config['track'], + config['observation_mode'], + config['max_goals'] + ) + elif name == 'CarWall': + return CarWallEnvironment() + elif name == 'CarBeat': + return CarBeatEnvironment( + config['car_name'], + config['ftg_car_name'], + config['reward_range'], + config['max_steps'], + config['collision_range'], + config['step_length'], + config['track'], + config['observation_mode'], + config['max_goals'], + config['num_lidar_points'] + ) + else: + 
raise Exception('EnvironmentFactory: Environment not found') diff --git a/src/reinforcement_learning/reinforcement_learning/parse_args.py b/src/reinforcement_learning/reinforcement_learning/parse_args.py index 3fa37fb1..4e88be1e 100644 --- a/src/reinforcement_learning/reinforcement_learning/parse_args.py +++ b/src/reinforcement_learning/reinforcement_learning/parse_args.py @@ -18,12 +18,16 @@ def __declare_params(): [ # Environment Parameters --------------------------- ('environment', 'CarGoal'), - ('track', 'track_1'), + ('car_name', 'f1tenth'), + ('ftg_car_name', 'ftg_car'), + ('track', 'multi_track'), ('max_steps', 100), ('step_length', 0.25), ('reward_range', 0.2), ('collision_range', 0.2), ('observation_mode', 'no_position'), + ('max_goals', 500), + ('num_lidar_points', 10), # Algorithm Parameters ----------------------------- ('g', 10), @@ -54,12 +58,16 @@ def __get_env_params(param_node: Node): params: list(Parameter) = param_node.get_parameters([ 'environment', + 'car_name', 'track', 'max_steps', 'step_length', 'reward_range', 'collision_range', - 'observation_mode' + 'observation_mode', + 'max_goals', + 'ftg_car_name', + 'num_lidar_points' ]) # Convert to Dictionary diff --git a/src/reinforcement_learning/reinforcement_learning/updated_train.py b/src/reinforcement_learning/reinforcement_learning/updated_train.py index 3fc5dd04..3501f79d 100644 --- a/src/reinforcement_learning/reinforcement_learning/updated_train.py +++ b/src/reinforcement_learning/reinforcement_learning/updated_train.py @@ -12,13 +12,8 @@ from cares_reinforcement_learning.util.NetworkFactory import NetworkFactory from cares_reinforcement_learning.util import helpers as hlp -from environments.CarBlockEnvironment import CarBlockEnvironment -from environments.CarGoalEnvironment import CarGoalEnvironment -from environments.CarTrackEnvironment import CarTrackEnvironment -from environments.CarWallEnvironment import CarWallEnvironment -from environments.CarBeatEnvironment import CarBeatEnvironment - from .parse_args import parse_args +from .EnvironmentFactory import EnvironmentFactory def main(): rclpy.init() @@ -42,3 +37,6 @@ def main(): f'{yaml.dump(network_config, default_flow_style=False)} \n' ) + env_factory = EnvironmentFactory() + env = env_factory.create(env_config['environment'], env_config) + From 0e9ac4fe768def262efd84f0e0a9e9d3c6da969e Mon Sep 17 00:00:00 2001 From: retinfai Date: Sat, 21 Oct 2023 15:22:15 +1300 Subject: [PATCH 08/38] feat: add CarWall to env factory --- .../reinforcement_learning/EnvironmentFactory.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/src/reinforcement_learning/reinforcement_learning/EnvironmentFactory.py b/src/reinforcement_learning/reinforcement_learning/EnvironmentFactory.py index f726efc3..4953511e 100644 --- a/src/reinforcement_learning/reinforcement_learning/EnvironmentFactory.py +++ b/src/reinforcement_learning/reinforcement_learning/EnvironmentFactory.py @@ -26,7 +26,13 @@ def create(self, name, config): config['max_goals'] ) elif name == 'CarWall': - return CarWallEnvironment() + return CarWallEnvironment( + config['car_name'], + config['reward_range'], + config['max_steps'], + config['collision_range'], + config['step_length'] + ) elif name == 'CarBeat': return CarBeatEnvironment( config['car_name'], From 86106378adfe469ca326658c935af6e13ee0c5fb Mon Sep 17 00:00:00 2001 From: retinfai Date: Sat, 21 Oct 2023 15:29:13 +1300 Subject: [PATCH 09/38] feat: add CarGoal and CarBlock to env factory --- .../reinforcement_learning/EnvironmentFactory.py 
| 15 +++++++++++++-- 1 file changed, 13 insertions(+), 2 deletions(-) diff --git a/src/reinforcement_learning/reinforcement_learning/EnvironmentFactory.py b/src/reinforcement_learning/reinforcement_learning/EnvironmentFactory.py index 4953511e..30d1f94a 100644 --- a/src/reinforcement_learning/reinforcement_learning/EnvironmentFactory.py +++ b/src/reinforcement_learning/reinforcement_learning/EnvironmentFactory.py @@ -11,9 +11,20 @@ def __init__(self): def create(self, name, config): print(config) if name == 'CarBlock': - return CarBlockEnvironment() + return CarBlockEnvironment( + config['car_name'], + config['reward_range'], + config['max_steps'], + config['collision_range'], + config['step_length'] + ) elif name == 'CarGoal': - return CarGoalEnvironment() + return CarGoalEnvironment( + config['car_name'], + config['reward_range'], + config['max_steps'], + config['step_length'] + ) elif name == 'CarTrack': return CarTrackEnvironment( config['car_name'], From 1d7c56a8d17b4121d2ffcf968f158fb7ad4a8833 Mon Sep 17 00:00:00 2001 From: retinfai Date: Tue, 24 Oct 2023 08:07:04 +1300 Subject: [PATCH 10/38] feat: integrate Network and Memory factory --- .../reinforcement_learning/updated_train.py | 27 +++++++++++++++++-- 1 file changed, 25 insertions(+), 2 deletions(-) diff --git a/src/reinforcement_learning/reinforcement_learning/updated_train.py b/src/reinforcement_learning/reinforcement_learning/updated_train.py index 3501f79d..65bbd0e4 100644 --- a/src/reinforcement_learning/reinforcement_learning/updated_train.py +++ b/src/reinforcement_learning/reinforcement_learning/updated_train.py @@ -11,6 +11,7 @@ from cares_reinforcement_learning.util.Record import Record from cares_reinforcement_learning.util.NetworkFactory import NetworkFactory from cares_reinforcement_learning.util import helpers as hlp +import cares_reinforcement_learning.util.configurations as cfg from .parse_args import parse_args from .EnvironmentFactory import EnvironmentFactory @@ -26,8 +27,6 @@ def main(): np.random.seed(algorithm_config['seed']) random.seed(algorithm_config['seed']) - DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu') - print( f'Environment Config: ------------------------------------- \n' f'{yaml.dump(env_config, default_flow_style=False)} \n' @@ -38,5 +37,29 @@ def main(): ) env_factory = EnvironmentFactory() + network_factory = NetworkFactory() + + match network_config['algorithm']: + case 'PPO': + config = cfg.PPOConfig(**network_config) + case 'DDPG': + config = cfg.DDPGConfig(**network_config) + case 'SAC': + config = cfg.SACConfig(**network_config) + case 'TD3': + config = cfg.TD3Config(**network_config) + case _: + raise Exception(f'Algorithm {network_config["algorithm"]} not implemented') + + env = env_factory.create(env_config['environment'], env_config) + agent = network_factory.create_network(env.OBSERVATION_SIZE, env.ACTION_NUM, config=config) + memory = MemoryBuffer(algorithm_config['buffer_size'], env.OBSERVATION_SIZE, env.ACTION_NUM) + + # TODO: Load Actor and Critic if passed. 
Only load if both are passed + + + + + From 28b712c656292a6691c3d9a2bc03b2f7453b30d0 Mon Sep 17 00:00:00 2001 From: retinfai Date: Tue, 24 Oct 2023 08:37:01 +1300 Subject: [PATCH 11/38] feat: add off policy training loop --- .../reinforcement_learning/updated_train.py | 89 +++++++++++++++++-- 1 file changed, 83 insertions(+), 6 deletions(-) diff --git a/src/reinforcement_learning/reinforcement_learning/updated_train.py b/src/reinforcement_learning/reinforcement_learning/updated_train.py index 65bbd0e4..fa89691f 100644 --- a/src/reinforcement_learning/reinforcement_learning/updated_train.py +++ b/src/reinforcement_learning/reinforcement_learning/updated_train.py @@ -1,5 +1,5 @@ import random -import time +from datetime import datetime import yaml import numpy as np @@ -56,10 +56,87 @@ def main(): agent = network_factory.create_network(env.OBSERVATION_SIZE, env.ACTION_NUM, config=config) memory = MemoryBuffer(algorithm_config['buffer_size'], env.OBSERVATION_SIZE, env.ACTION_NUM) - # TODO: Load Actor and Critic if passed. Only load if both are passed - - - - + record = Record( + glob_log_dir='training_logs', + log_dir= f"{network_config['algorithm']}-{env_config['environment']}-{datetime.now().strftime('%y_%m_%d_%H:%M:%S')}", + algorithm=network_config['algorithm'], + task=env_config['environment'], + network=agent + ) + # TODO: Load Actor and Critic if passed. Only load if both are passed + off_policy_train(env, agent, memory, record, algorithm_config) + + +def off_policy_train(env, agent, memory, record, algorithm_config): + + max_steps_training = algorithm_config['max_steps_training'] + max_steps_exploration = algorithm_config['max_steps_exploration'] + num_eps_evaluation = algorithm_config['evaluate_for_m_episodes'] + evaluate_every_n_steps = algorithm_config['evaluate_every_n_steps'] + + batch_size = algorithm_config['batch_size'] + G = algorithm_config['g'] + + episode_timesteps = 0 + episode_reward = 0 + episode_num = 0 + + evaluate = False + + state, _ = env.reset() + + for step_counter in range(max_steps_training): + episode_timesteps += 1 + + if step_counter < max_steps_exploration: + env.get_logger().info(f'Exploration Step #{step_counter} out of {max_steps_exploration}') + action_env = np.random.uniform(env.MIN_ACTIONS, env.MAX_ACTIONS, env.ACTION_NUM) + action = hlp.normalize(action_env, env.MAX_ACTIONS, env.MIN_ACTIONS) + else: + action = agent.select_action_from_policy(state) + action_env = hlp.denormalize(action, env.MAX_ACTIONS, env.MIN_ACTIONS) + + next_state, reward, done, truncated, info = env.step(action_env) + memory.add(state=state, action=action, reward=reward, next_state=next_state, done=done) + + state = next_state + episode_reward += reward + + if step_counter >= max_steps_exploration: + for i in range(G): + experience = memory.sample(batch_size) + info = agent.train_policy(( + experience['state'], + experience['action'], + experience['reward'], + experience['next_state'], + experience['done'] + )) + memory.update_priorities(experience['indices'], info) + + if step_counter % evaluate_every_n_steps == 0: + evaluate = True + + if done or truncated: + env.get_logger().info(f'Episode #{episode_num} completed with {episode_timesteps} steps taken and a Reward= {episode_reward:.3f}') + + record.log_train( + total_steps = step_counter + 1, + episode = episode_num + 1, + episode_steps=episode_timesteps, + episode_reward = episode_reward, + display = True + ) + + if evaluate: + evaluate = False + # off_policy_evaluate(env, agent, record, algorithm_config, network_config) + 
print('Evaluated') + + # Reset environment + state, _ = env.reset() + episode_reward = 0 + episode_timesteps = 0 + episode_num += 1 From 20e7ffef122f0d0fd84438c3ac806b62482a209d Mon Sep 17 00:00:00 2001 From: retinfai Date: Tue, 24 Oct 2023 08:48:54 +1300 Subject: [PATCH 12/38] feat: add off policy evaluation loop --- .../reinforcement_learning/updated_train.py | 44 ++++++++++++++++++- 1 file changed, 42 insertions(+), 2 deletions(-) diff --git a/src/reinforcement_learning/reinforcement_learning/updated_train.py b/src/reinforcement_learning/reinforcement_learning/updated_train.py index fa89691f..42fb6ec6 100644 --- a/src/reinforcement_learning/reinforcement_learning/updated_train.py +++ b/src/reinforcement_learning/reinforcement_learning/updated_train.py @@ -132,11 +132,51 @@ def off_policy_train(env, agent, memory, record, algorithm_config): if evaluate: evaluate = False - # off_policy_evaluate(env, agent, record, algorithm_config, network_config) - print('Evaluated') + + env.get_logger().info(f'*************--Begin Evaluation Loop--*************') + off_policy_evaluate(env, agent, num_eps_evaluation, record, step_counter) + env.get_logger().info(f'*************--End Evaluation Loop--*************') + # Reset environment state, _ = env.reset() episode_reward = 0 episode_timesteps = 0 episode_num += 1 + +def off_policy_evaluate(env, agent, eval_episodes, record, steps_counter): + + episode_reward = 0 + episode_timesteps = 0 + episode_num = 0 + + for episode_num in range(eval_episodes): + state, _ = env.reset() + done = False + truncated = False + + while not done and not truncated: + episode_timesteps += 1 + action = agent.select_action_from_policy(state, evaluation=True) + action_env = hlp.denormalize(action, env.MAX_ACTIONS, env.MIN_ACTIONS) + + next_state, reward, done, truncated, _ = env.step(action_env) + + state = next_state + episode_reward += reward + + if done or truncated: + record.log_eval( + total_steps = steps_counter + 1, + episode = episode_num + 1, + episode_steps=episode_timesteps, + episode_reward = episode_reward, + display = True + ) + + # Reset environment + state, _ = env.reset() + episode_reward = 0 + episode_timesteps = 0 + episode_num += 1 + break From efee927faecf6bc7cba2b9fec444e876b2deae07 Mon Sep 17 00:00:00 2001 From: retinfai Date: Tue, 24 Oct 2023 08:54:47 +1300 Subject: [PATCH 13/38] refactor: move training loops to module --- .../reinforcement_learning/training_loops.py | 115 ++++++++++++++++++ .../reinforcement_learning/updated_train.py | 113 +---------------- 2 files changed, 117 insertions(+), 111 deletions(-) create mode 100644 src/reinforcement_learning/reinforcement_learning/training_loops.py diff --git a/src/reinforcement_learning/reinforcement_learning/training_loops.py b/src/reinforcement_learning/reinforcement_learning/training_loops.py new file mode 100644 index 00000000..9bd02986 --- /dev/null +++ b/src/reinforcement_learning/reinforcement_learning/training_loops.py @@ -0,0 +1,115 @@ +import numpy as np + +from cares_reinforcement_learning.util import helpers as hlp + +def off_policy_train(env, agent, memory, record, algorithm_config): + + max_steps_training = algorithm_config['max_steps_training'] + max_steps_exploration = algorithm_config['max_steps_exploration'] + num_eps_evaluation = algorithm_config['evaluate_for_m_episodes'] + evaluate_every_n_steps = algorithm_config['evaluate_every_n_steps'] + + batch_size = algorithm_config['batch_size'] + G = algorithm_config['g'] + + episode_timesteps = 0 + episode_reward = 0 + episode_num = 0 + + 
evaluate = False + + state, _ = env.reset() + + for step_counter in range(max_steps_training): + episode_timesteps += 1 + + if step_counter < max_steps_exploration: + env.get_logger().info(f'Exploration Step #{step_counter} out of {max_steps_exploration}') + action_env = np.random.uniform(env.MIN_ACTIONS, env.MAX_ACTIONS, env.ACTION_NUM) + action = hlp.normalize(action_env, env.MAX_ACTIONS, env.MIN_ACTIONS) + else: + action = agent.select_action_from_policy(state) + action_env = hlp.denormalize(action, env.MAX_ACTIONS, env.MIN_ACTIONS) + + next_state, reward, done, truncated, info = env.step(action_env) + memory.add(state=state, action=action, reward=reward, next_state=next_state, done=done) + + state = next_state + episode_reward += reward + + if step_counter >= max_steps_exploration: + for i in range(G): + experience = memory.sample(batch_size) + info = agent.train_policy(( + experience['state'], + experience['action'], + experience['reward'], + experience['next_state'], + experience['done'] + )) + memory.update_priorities(experience['indices'], info) + + if step_counter % evaluate_every_n_steps == 0: + evaluate = True + + if done or truncated: + env.get_logger().info(f'Episode #{episode_num} completed with {episode_timesteps} steps taken and a Reward= {episode_reward:.3f}') + + record.log_train( + total_steps = step_counter + 1, + episode = episode_num + 1, + episode_steps=episode_timesteps, + episode_reward = episode_reward, + display = True + ) + + if evaluate: + evaluate = False + + env.get_logger().info(f'*************--Begin Evaluation Loop--*************') + off_policy_evaluate(env, agent, num_eps_evaluation, record, step_counter) + env.get_logger().info(f'*************--End Evaluation Loop--*************') + + + # Reset environment + state, _ = env.reset() + episode_reward = 0 + episode_timesteps = 0 + episode_num += 1 + +def off_policy_evaluate(env, agent, eval_episodes, record, steps_counter): + + episode_reward = 0 + episode_timesteps = 0 + episode_num = 0 + + for episode_num in range(eval_episodes): + state, _ = env.reset() + done = False + truncated = False + + while not done and not truncated: + episode_timesteps += 1 + action = agent.select_action_from_policy(state, evaluation=True) + action_env = hlp.denormalize(action, env.MAX_ACTIONS, env.MIN_ACTIONS) + + next_state, reward, done, truncated, _ = env.step(action_env) + + state = next_state + episode_reward += reward + + if done or truncated: + record.log_eval( + total_steps = steps_counter + 1, + episode = episode_num + 1, + episode_steps=episode_timesteps, + episode_reward = episode_reward, + display = True + ) + + # Reset environment + state, _ = env.reset() + episode_reward = 0 + episode_timesteps = 0 + episode_num += 1 + break \ No newline at end of file diff --git a/src/reinforcement_learning/reinforcement_learning/updated_train.py b/src/reinforcement_learning/reinforcement_learning/updated_train.py index 42fb6ec6..4c9c9d8c 100644 --- a/src/reinforcement_learning/reinforcement_learning/updated_train.py +++ b/src/reinforcement_learning/reinforcement_learning/updated_train.py @@ -15,6 +15,7 @@ from .parse_args import parse_args from .EnvironmentFactory import EnvironmentFactory +from .training_loops import off_policy_train def main(): rclpy.init() @@ -69,114 +70,4 @@ def main(): off_policy_train(env, agent, memory, record, algorithm_config) -def off_policy_train(env, agent, memory, record, algorithm_config): - - max_steps_training = algorithm_config['max_steps_training'] - max_steps_exploration = 
algorithm_config['max_steps_exploration'] - num_eps_evaluation = algorithm_config['evaluate_for_m_episodes'] - evaluate_every_n_steps = algorithm_config['evaluate_every_n_steps'] - - batch_size = algorithm_config['batch_size'] - G = algorithm_config['g'] - - episode_timesteps = 0 - episode_reward = 0 - episode_num = 0 - - evaluate = False - - state, _ = env.reset() - - for step_counter in range(max_steps_training): - episode_timesteps += 1 - - if step_counter < max_steps_exploration: - env.get_logger().info(f'Exploration Step #{step_counter} out of {max_steps_exploration}') - action_env = np.random.uniform(env.MIN_ACTIONS, env.MAX_ACTIONS, env.ACTION_NUM) - action = hlp.normalize(action_env, env.MAX_ACTIONS, env.MIN_ACTIONS) - else: - action = agent.select_action_from_policy(state) - action_env = hlp.denormalize(action, env.MAX_ACTIONS, env.MIN_ACTIONS) - - next_state, reward, done, truncated, info = env.step(action_env) - memory.add(state=state, action=action, reward=reward, next_state=next_state, done=done) - - state = next_state - episode_reward += reward - - if step_counter >= max_steps_exploration: - for i in range(G): - experience = memory.sample(batch_size) - info = agent.train_policy(( - experience['state'], - experience['action'], - experience['reward'], - experience['next_state'], - experience['done'] - )) - memory.update_priorities(experience['indices'], info) - - if step_counter % evaluate_every_n_steps == 0: - evaluate = True - - if done or truncated: - env.get_logger().info(f'Episode #{episode_num} completed with {episode_timesteps} steps taken and a Reward= {episode_reward:.3f}') - - record.log_train( - total_steps = step_counter + 1, - episode = episode_num + 1, - episode_steps=episode_timesteps, - episode_reward = episode_reward, - display = True - ) - - if evaluate: - evaluate = False - - env.get_logger().info(f'*************--Begin Evaluation Loop--*************') - off_policy_evaluate(env, agent, num_eps_evaluation, record, step_counter) - env.get_logger().info(f'*************--End Evaluation Loop--*************') - - - # Reset environment - state, _ = env.reset() - episode_reward = 0 - episode_timesteps = 0 - episode_num += 1 - -def off_policy_evaluate(env, agent, eval_episodes, record, steps_counter): - - episode_reward = 0 - episode_timesteps = 0 - episode_num = 0 - - for episode_num in range(eval_episodes): - state, _ = env.reset() - done = False - truncated = False - - while not done and not truncated: - episode_timesteps += 1 - action = agent.select_action_from_policy(state, evaluation=True) - action_env = hlp.denormalize(action, env.MAX_ACTIONS, env.MIN_ACTIONS) - - next_state, reward, done, truncated, _ = env.step(action_env) - - state = next_state - episode_reward += reward - - if done or truncated: - record.log_eval( - total_steps = steps_counter + 1, - episode = episode_num + 1, - episode_steps=episode_timesteps, - episode_reward = episode_reward, - display = True - ) - - # Reset environment - state, _ = env.reset() - episode_reward = 0 - episode_timesteps = 0 - episode_num += 1 - break + From 8b302b41d0dea600cd57bf6ad38db0797de2145c Mon Sep 17 00:00:00 2001 From: retinfai Date: Tue, 24 Oct 2023 08:56:19 +1300 Subject: [PATCH 14/38] chore: add safe guards to algorithm training loop --- .../reinforcement_learning/updated_train.py | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/src/reinforcement_learning/reinforcement_learning/updated_train.py b/src/reinforcement_learning/reinforcement_learning/updated_train.py index 4c9c9d8c..55f42a3d 
100644 --- a/src/reinforcement_learning/reinforcement_learning/updated_train.py +++ b/src/reinforcement_learning/reinforcement_learning/updated_train.py @@ -67,7 +67,14 @@ def main(): ) # TODO: Load Actor and Critic if passed. Only load if both are passed - off_policy_train(env, agent, memory, record, algorithm_config) + + match agent.type: + case 'policy': + off_policy_train(env, agent, memory, record, algorithm_config) + case 'ppo': + raise Exception('PPO Training not implemented') + case _: + raise Exception(f'Agent type {agent.type} not supported') From 33ffc134d65cf7e68033b2036ad08a4f65e2f30e Mon Sep 17 00:00:00 2001 From: retinfai Date: Tue, 24 Oct 2023 09:15:55 +1300 Subject: [PATCH 15/38] feat: add ppo train and evaluate --- .../reinforcement_learning/training_loops.py | 105 +++++++++++++++++- .../reinforcement_learning/updated_train.py | 11 +- 2 files changed, 110 insertions(+), 6 deletions(-) diff --git a/src/reinforcement_learning/reinforcement_learning/training_loops.py b/src/reinforcement_learning/reinforcement_learning/training_loops.py index 9bd02986..35e5059d 100644 --- a/src/reinforcement_learning/reinforcement_learning/training_loops.py +++ b/src/reinforcement_learning/reinforcement_learning/training_loops.py @@ -49,7 +49,7 @@ def off_policy_train(env, agent, memory, record, algorithm_config): )) memory.update_priorities(experience['indices'], info) - if step_counter % evaluate_every_n_steps == 0: + if (step_counter+1) % evaluate_every_n_steps == 0: evaluate = True if done or truncated: @@ -98,6 +98,109 @@ def off_policy_evaluate(env, agent, eval_episodes, record, steps_counter): state = next_state episode_reward += reward + if done or truncated: + record.log_eval( + total_steps = steps_counter + 1, + episode = episode_num + 1, + episode_steps=episode_timesteps, + episode_reward = episode_reward, + display = True + ) + + # Reset environment + state, _ = env.reset() + episode_reward = 0 + episode_timesteps = 0 + episode_num += 1 + break + + + +def ppo_train(env, agent, memory, record, algorithm_config): + + max_steps_training = algorithm_config['max_steps_training'] + max_steps_per_batch = algorithm_config['max_steps_per_batch'] + num_eps_evaluation = algorithm_config['evaluate_for_m_episodes'] + evaluate_every_n_steps = algorithm_config['evaluate_every_n_steps'] + + episode_timesteps = 0 + episode_num = 0 + episode_reward = 0 + + evaluate = False + + state, _ = env.reset() + + for step_counter in range(max_steps_training): + episode_timesteps += 1 + + action, log_prob = agent.select_action_from_policy(state) + action_env = hlp.denormalize(action, env.MAX_ACTIONS, env.MIN_ACTIONS) + + next_state, reward, done, truncated, info = env.step(action_env) + memory.add(state=state, action=action, reward=reward, next_state=next_state, done=done, log_prob=log_prob) + + state = next_state + episode_reward += reward + + if (step_counter+1) % max_steps_per_batch == 0: + experience = memory.flush() + info = agent.train_policy(( + experience['state'], + experience['action'], + experience['reward'], + experience['next_state'], + experience['done'], + experience['log_prob'] + )) + + if (step_counter+1) % evaluate_every_n_steps == 0: + evaluate = True + + if done or truncated: + record.log_train( + total_steps = step_counter + 1, + episode = episode_num + 1, + episode_steps=episode_timesteps, + episode_reward = episode_reward, + display = True + ) + + if evaluate: + evaluate = False + + env.get_logger().info(f'*************--Begin Evaluation Loop--*************') + ppo_evaluate(env, agent, 
num_eps_evaluation, record, step_counter) + env.get_logger().info(f'*************--End Evaluation Loop--*************') + + + # Reset environment + state, _ = env.reset() + episode_reward = 0 + episode_timesteps = 0 + episode_num += 1 + +def ppo_evaluate(env, agent, eval_episodes, record, steps_counter): + + episode_reward = 0 + episode_timesteps = 0 + episode_num = 0 + + for episode_num in range(eval_episodes): + state, _ = env.reset() + done = False + truncated = False + + while not done and not truncated: + episode_timesteps += 1 + action, _ = agent.select_action_from_policy(state) + action_env = hlp.denormalize(action, env.MAX_ACTIONS, env.MIN_ACTIONS) + + next_state, reward, done, truncated, _ = env.step(action_env) + + state = next_state + episode_reward += reward + if done or truncated: record.log_eval( total_steps = steps_counter + 1, diff --git a/src/reinforcement_learning/reinforcement_learning/updated_train.py b/src/reinforcement_learning/reinforcement_learning/updated_train.py index 55f42a3d..60a1b9c2 100644 --- a/src/reinforcement_learning/reinforcement_learning/updated_train.py +++ b/src/reinforcement_learning/reinforcement_learning/updated_train.py @@ -10,12 +10,11 @@ from cares_reinforcement_learning.memory import MemoryBuffer from cares_reinforcement_learning.util.Record import Record from cares_reinforcement_learning.util.NetworkFactory import NetworkFactory -from cares_reinforcement_learning.util import helpers as hlp import cares_reinforcement_learning.util.configurations as cfg from .parse_args import parse_args from .EnvironmentFactory import EnvironmentFactory -from .training_loops import off_policy_train +from .training_loops import off_policy_train, ppo_train def main(): rclpy.init() @@ -70,9 +69,11 @@ def main(): match agent.type: case 'policy': - off_policy_train(env, agent, memory, record, algorithm_config) - case 'ppo': - raise Exception('PPO Training not implemented') + + if network_config['algorithm'] == 'PPO': + ppo_train(env, agent, memory, record, algorithm_config) + else: + off_policy_train(env, agent, memory, record, algorithm_config) case _: raise Exception(f'Agent type {agent.type} not supported') From a88e564efe599c809268e36be067f69bd595205b Mon Sep 17 00:00:00 2001 From: retinfai Date: Tue, 24 Oct 2023 09:39:24 +1300 Subject: [PATCH 16/38] feat: make carbeat car names dynamic --- src/environment_interfaces/srv/CarBeatReset.srv | 2 ++ src/environments/environments/CarBeatEnvironment.py | 2 ++ src/environments/environments/CarBeatReset.py | 4 ++-- 3 files changed, 6 insertions(+), 2 deletions(-) diff --git a/src/environment_interfaces/srv/CarBeatReset.srv b/src/environment_interfaces/srv/CarBeatReset.srv index 6d8d5b61..202f6a44 100644 --- a/src/environment_interfaces/srv/CarBeatReset.srv +++ b/src/environment_interfaces/srv/CarBeatReset.srv @@ -1,8 +1,10 @@ float64 gx # goal x float64 gy # goal y +string car_one float64 cx_one # car_one x float64 cy_one # car_one y float64 cyaw_one # car_one yaw +string car_two float64 cx_two # car_two x float64 cy_two # car_two y float64 cyaw_two # car_two yaw diff --git a/src/environments/environments/CarBeatEnvironment.py b/src/environments/environments/CarBeatEnvironment.py index 5d86c316..09f10b4c 100644 --- a/src/environments/environments/CarBeatEnvironment.py +++ b/src/environments/environments/CarBeatEnvironment.py @@ -270,10 +270,12 @@ def call_reset_service(self, request.gx = goal_x request.gy = goal_y + request.car_one = self.NAME request.cx_one = car_x request.cy_one = car_y request.cyaw_one = car_Y + 
request.car_two = self.OTHER_CAR_NAME request.cx_two = ftg_x request.cy_two = ftg_y request.cyaw_two = ftg_Y diff --git a/src/environments/environments/CarBeatReset.py b/src/environments/environments/CarBeatReset.py index fd6ae27a..b38ff404 100644 --- a/src/environments/environments/CarBeatReset.py +++ b/src/environments/environments/CarBeatReset.py @@ -37,8 +37,8 @@ def service_callback(self, request, response): # self.get_logger().info(f'Reset Service Request Received: relocating goal to x={request.cx_one} y={request.cy_one}') goal_req = self.create_request('goal', x=request.gx, y=request.gy, z=1) - car_req_one = self.create_request('f1tenth_one', x=request.cx_one, y=request.cy_one, z=0.1, yaw=request.cyaw_one) - car_req_two = self.create_request('f1tenth_two', x=request.cx_two, y=request.cy_two, z=0.1, yaw=request.cyaw_two) + car_req_one = self.create_request(request.car_one, x=request.cx_one, y=request.cy_one, z=0.1, yaw=request.cyaw_one) + car_req_two = self.create_request(request.car_two, x=request.cx_two, y=request.cy_two, z=0.1, yaw=request.cyaw_two) while not self.set_pose_client.wait_for_service(timeout_sec=1.0): self.get_logger().info('set_pose service not available, waiting again...') From 99d62fba5a506372ddaa0e487581003244a78d32 Mon Sep 17 00:00:00 2001 From: retinfai Date: Tue, 24 Oct 2023 10:00:48 +1300 Subject: [PATCH 17/38] feat: make car name dynamic CarTrack --- src/environment_interfaces/srv/Reset.srv | 1 + .../environments/CarTrackEnvironment.py | 7 ++-- .../environments/CarTrackReset.py | 6 +--- src/environments/launch/cartrack.launch.py | 33 +++++++++++-------- .../launch/train.launch.py | 1 + 5 files changed, 26 insertions(+), 22 deletions(-) diff --git a/src/environment_interfaces/srv/Reset.srv b/src/environment_interfaces/srv/Reset.srv index f8eb3eeb..8d6e4804 100644 --- a/src/environment_interfaces/srv/Reset.srv +++ b/src/environment_interfaces/srv/Reset.srv @@ -3,6 +3,7 @@ float64 gy # goal y float64 cx # car x float64 cy # car y float64 cyaw # car yaw +string car_name string flag # flag --- bool success \ No newline at end of file diff --git a/src/environments/environments/CarTrackEnvironment.py b/src/environments/environments/CarTrackEnvironment.py index cbc1703b..340ec1ac 100644 --- a/src/environments/environments/CarTrackEnvironment.py +++ b/src/environments/environments/CarTrackEnvironment.py @@ -26,7 +26,7 @@ def __init__(self, # Environment Details ---------------------------------------- self.MAX_STEPS_PER_GOAL = max_steps self.MAX_GOALS = max_goals - + match observation_mode: case 'no_position': self.OBSERVATION_SIZE = 6 + 10 @@ -81,7 +81,7 @@ def reset(self): self.sleep() goal_x, goal_y = self.goal_position - self.call_reset_service(car_x=car_x, car_y=car_y, car_Y=car_yaw, goal_x=goal_x, goal_y=goal_y) + self.call_reset_service(car_x=car_x, car_y=car_y, car_Y=car_yaw, goal_x=goal_x, goal_y=goal_y, car_name=self.NAME) observation, _ = self.get_observation() @@ -169,12 +169,13 @@ def compute_reward(self, state, next_state): # Utility Functions -------------------------------------------- - def call_reset_service(self, car_x, car_y, car_Y, goal_x, goal_y): + def call_reset_service(self, car_x, car_y, car_Y, goal_x, goal_y, car_name): """ Reset the car and goal position """ request = Reset.Request() + request.car_name = car_name request.gx = float(goal_x) request.gy = float(goal_y) request.cx = float(car_x) diff --git a/src/environments/environments/CarTrackReset.py b/src/environments/environments/CarTrackReset.py index bbd89e7d..0cd5f5dd 100644 --- 
a/src/environments/environments/CarTrackReset.py +++ b/src/environments/environments/CarTrackReset.py @@ -30,14 +30,11 @@ def __init__(self): while not self.set_pose_client.wait_for_service(timeout_sec=1.0): self.get_logger().info('set_pose service not available, waiting again...') - def service_callback(self, request, response): - # self.get_logger().info(f'Reset Service Request Received: relocating goal to x={request.x} y={request.y}') - goal_req = self.create_request('goal', x=request.gx, y=request.gy, z=1) - car_req = self.create_request('f1tenth', x=request.cx, y=request.cy, z=0, yaw=request.cyaw) + car_req = self.create_request(request.car_name, x=request.cx, y=request.cy, z=0, yaw=request.cyaw) while not self.set_pose_client.wait_for_service(timeout_sec=1.0): self.get_logger().info('set_pose service not available, waiting again...') @@ -49,7 +46,6 @@ def service_callback(self, request, response): self.set_pose_client.call(goal_req) self.set_pose_client.call(car_req) - # self.get_logger().info('Successfully Reset') response.success = True return response diff --git a/src/environments/launch/cartrack.launch.py b/src/environments/launch/cartrack.launch.py index be709d9c..5c26a544 100644 --- a/src/environments/launch/cartrack.launch.py +++ b/src/environments/launch/cartrack.launch.py @@ -9,8 +9,10 @@ def launch(context, *args, **kwargs): pkg_ros_gz_sim = get_package_share_directory('ros_gz_sim') pkg_environments = get_package_share_directory('environments') + pkg_f1tenth_bringup = get_package_share_directory('f1tenth_bringup') track = LaunchConfiguration('track').perform(context) + car_name = LaunchConfiguration('car_name').perform(context) gz_sim = IncludeLaunchDescription( launch_description_source=PythonLaunchDescriptionSource( @@ -20,16 +22,30 @@ def launch(context, *args, **kwargs): }.items() ) - return[ gz_sim] + f1tenth = IncludeLaunchDescription( + launch_description_source=PythonLaunchDescriptionSource( + os.path.join(pkg_f1tenth_bringup, 'simulation_bringup.launch.py')), + launch_arguments={ + 'name': car_name, + 'world': 'empty' + }.items() + ) + + + return[gz_sim, f1tenth] def generate_launch_description(): - pkg_f1tenth_bringup = get_package_share_directory('f1tenth_bringup') track_arg = DeclareLaunchArgument( 'track', default_value='track_1' ) + car_name = DeclareLaunchArgument( + 'car_name', + default_value='f1tenth' + ) + service_bridge = Node( package='ros_gz_bridge', executable='parameter_bridge', @@ -46,18 +62,7 @@ def generate_launch_description(): ], ) - f1tenth = IncludeLaunchDescription( - launch_description_source=PythonLaunchDescriptionSource( - os.path.join(pkg_f1tenth_bringup, 'simulation_bringup.launch.py')), - launch_arguments={ - 'name': 'f1tenth', - 'world': 'empty' - }.items() - ) - #TODO: dynamically change car name - #TODO: This doesn't work yet - #TODO: Create CarWall Reset reset = Node( package='environments', executable='CarTrackReset', @@ -69,5 +74,5 @@ def generate_launch_description(): OpaqueFunction(function=launch), service_bridge, reset, - f1tenth, + car_name, ]) \ No newline at end of file diff --git a/src/reinforcement_learning/launch/train.launch.py b/src/reinforcement_learning/launch/train.launch.py index cb3c0345..1af7937c 100644 --- a/src/reinforcement_learning/launch/train.launch.py +++ b/src/reinforcement_learning/launch/train.launch.py @@ -32,6 +32,7 @@ def generate_launch_description(): os.path.join(pkg_environments, f'{env_launch[env]}.launch.py')), launch_arguments={ 'track': 
TextSubstitution(text=str(config['train']['ros__parameters']['track'])), + 'car_name': TextSubstitution(text=str(config['train']['ros__parameters']['car_name'])), 'car_one': TextSubstitution(text=str(config['train']['ros__parameters']['car_name']) if 'car_name' in config['train']['ros__parameters'] else 'f1tenth'), 'car_two': TextSubstitution(text=str(config['train']['ros__parameters']['ftg_car_name']) if 'ftg_car_name' in config['train']['ros__parameters'] else 'ftg_car'), }.items() #TODO: this doesn't do anything From 44e103b475a82e7b8504e22fe3c0f492e197242a Mon Sep 17 00:00:00 2001 From: retinfai Date: Tue, 24 Oct 2023 10:08:10 +1300 Subject: [PATCH 18/38] feat: make CarBlock car name dynamic --- .../environments/CarBlockEnvironment.py | 2 +- .../environments/CarBlockReset.py | 2 +- src/environments/launch/carblock.launch.py | 39 +++++++++++++------ 3 files changed, 30 insertions(+), 13 deletions(-) diff --git a/src/environments/environments/CarBlockEnvironment.py b/src/environments/environments/CarBlockEnvironment.py index 47023106..100acbe8 100644 --- a/src/environments/environments/CarBlockEnvironment.py +++ b/src/environments/environments/CarBlockEnvironment.py @@ -7,7 +7,6 @@ from sensor_msgs.msg import LaserScan from environments.F1tenthEnvironment import F1tenthEnvironment -from environments.CarWallEnvironment import CarWallEnvironment from .util import reduce_lidar, process_odom, generate_position from .termination import reached_goal, has_collided, has_flipped_over @@ -82,6 +81,7 @@ def call_reset_service(self, goal_x, goal_y): req.gx = goal_x req.gy = goal_y + req.car_name = self.NAME future = self.reset_client.call_async(req) rclpy.spin_until_future_complete(future=future, node=self) diff --git a/src/environments/environments/CarBlockReset.py b/src/environments/environments/CarBlockReset.py index 39cfdd9a..f90a385d 100644 --- a/src/environments/environments/CarBlockReset.py +++ b/src/environments/environments/CarBlockReset.py @@ -43,7 +43,7 @@ def service_callback(self, request, response): # Move the goal to new position & car back to origin goal_req = self.create_request('goal', x=request.gx, y=request.gy, z=1) - car_req = self.create_request('f1tenth') + car_req = self.create_request(request.car_name) while not self.set_pose_client.wait_for_service(timeout_sec=1.0): self.get_logger().info('set_pose service not available, waiting again...') diff --git a/src/environments/launch/carblock.launch.py b/src/environments/launch/carblock.launch.py index a7a34be2..a762cb68 100644 --- a/src/environments/launch/carblock.launch.py +++ b/src/environments/launch/carblock.launch.py @@ -3,13 +3,38 @@ from launch_ros.actions import Node from launch import LaunchDescription from launch.launch_description_sources import PythonLaunchDescriptionSource -from launch.actions import IncludeLaunchDescription +from launch.actions import IncludeLaunchDescription, DeclareLaunchArgument, OpaqueFunction +from launch.substitutions import LaunchConfiguration + +def launch(context, *args, **kwargs): + pkg_f1tenth_bringup = get_package_share_directory('f1tenth_bringup') + + car_name = LaunchConfiguration('car_name').perform(context) + + f1tenth = IncludeLaunchDescription( + launch_description_source=PythonLaunchDescriptionSource( + os.path.join(pkg_f1tenth_bringup, 'simulation_bringup.launch.py')), + launch_arguments={ + 'name': car_name, + 'world': 'empty', + 'x': '0', + 'y': '0', + 'z': '5', + }.items() + ) + + return[f1tenth] def generate_launch_description(): pkg_ros_gz_sim = 
get_package_share_directory('ros_gz_sim') pkg_f1tenth_bringup = get_package_share_directory('f1tenth_bringup') pkg_environments = get_package_share_directory('environments') + car_name = DeclareLaunchArgument( + 'car_name', + default_value='f1tenth' + ) + service_bridge = Node( package='ros_gz_bridge', executable='parameter_bridge', @@ -26,15 +51,6 @@ def generate_launch_description(): ], ) - f1tenth = IncludeLaunchDescription( - launch_description_source=PythonLaunchDescriptionSource( - os.path.join(pkg_f1tenth_bringup, 'simulation_bringup.launch.py')), - launch_arguments={ - 'name': 'f1tenth', - 'world': 'empty' - }.items() - ) - gz_sim = IncludeLaunchDescription( launch_description_source=PythonLaunchDescriptionSource( os.path.join(pkg_ros_gz_sim, 'launch', 'gz_sim.launch.py')), @@ -51,7 +67,8 @@ def generate_launch_description(): return LaunchDescription([ gz_sim, - f1tenth, + car_name, + OpaqueFunction(function=launch), service_bridge, reset, ]) \ No newline at end of file From 832fe241a3a71f45381be4b6ad779a73913c9e98 Mon Sep 17 00:00:00 2001 From: retinfai Date: Tue, 24 Oct 2023 10:12:49 +1300 Subject: [PATCH 19/38] feat: make CarWall car name dynamic --- .../environments/CarWallEnvironment.py | 3 +- src/environments/environments/CarWallReset.py | 2 +- src/environments/launch/carwall.launch.py | 42 ++++++++++++------- 3 files changed, 31 insertions(+), 16 deletions(-) diff --git a/src/environments/environments/CarWallEnvironment.py b/src/environments/environments/CarWallEnvironment.py index 61c92c7e..9d3dc028 100644 --- a/src/environments/environments/CarWallEnvironment.py +++ b/src/environments/environments/CarWallEnvironment.py @@ -81,7 +81,8 @@ def call_reset_service(self, goal_x, goal_y): req.gx = goal_x req.gy = goal_y - + req.car_name = self.NAME + future = self.reset_client.call_async(req) rclpy.spin_until_future_complete(future=future, node=self) diff --git a/src/environments/environments/CarWallReset.py b/src/environments/environments/CarWallReset.py index c4d80b81..d10a7f02 100644 --- a/src/environments/environments/CarWallReset.py +++ b/src/environments/environments/CarWallReset.py @@ -37,7 +37,7 @@ def service_callback(self, request, response): # self.get_logger().info(f'Reset Service Request Received: relocating goal to x={request.x} y={request.y}') goal_req = self.create_request('goal', x=request.gx, y=request.gy, z=1) - car_req = self.create_request('f1tenth') + car_req = self.create_request(request.car_name, x=0, y=0, z=0) while not self.set_pose_client.wait_for_service(timeout_sec=1.0): self.get_logger().info('set_pose service not available, waiting again...') diff --git a/src/environments/launch/carwall.launch.py b/src/environments/launch/carwall.launch.py index 17587eb0..a82d4383 100644 --- a/src/environments/launch/carwall.launch.py +++ b/src/environments/launch/carwall.launch.py @@ -3,13 +3,38 @@ from launch_ros.actions import Node from launch import LaunchDescription from launch.launch_description_sources import PythonLaunchDescriptionSource -from launch.actions import IncludeLaunchDescription +from launch.actions import IncludeLaunchDescription, DeclareLaunchArgument, OpaqueFunction +from launch.substitutions import LaunchConfiguration + +def launch(context, *args, **kwargs): + pkg_f1tenth_bringup = get_package_share_directory('f1tenth_bringup') + + car_name = LaunchConfiguration('car_name').perform(context) + + f1tenth = IncludeLaunchDescription( + launch_description_source=PythonLaunchDescriptionSource( + os.path.join(pkg_f1tenth_bringup, 
'simulation_bringup.launch.py')), + launch_arguments={ + 'name': car_name, + 'world': 'empty', + 'x': '0', + 'y': '0', + 'z': '5', + }.items() + ) + + return[f1tenth] def generate_launch_description(): pkg_ros_gz_sim = get_package_share_directory('ros_gz_sim') pkg_environments = get_package_share_directory('environments') pkg_f1tenth_bringup = get_package_share_directory('f1tenth_bringup') + car_name = DeclareLaunchArgument( + 'car_name', + default_value='f1tenth' + ) + service_bridge = Node( package='ros_gz_bridge', executable='parameter_bridge', @@ -26,15 +51,6 @@ def generate_launch_description(): ], ) - f1tenth = IncludeLaunchDescription( - launch_description_source=PythonLaunchDescriptionSource( - os.path.join(pkg_f1tenth_bringup, 'simulation_bringup.launch.py')), - launch_arguments={ - 'name': 'f1tenth', - 'world': 'empty' - }.items() - ) - gz_sim = IncludeLaunchDescription( launch_description_source=PythonLaunchDescriptionSource( os.path.join(pkg_ros_gz_sim, 'launch', 'gz_sim.launch.py')), @@ -43,9 +59,6 @@ def generate_launch_description(): }.items() ) - #TODO: dynamically change car name - #TODO: This doesn't work yet - #TODO: Create CarWall Reset reset = Node( package='environments', executable='CarWallReset', @@ -55,7 +68,8 @@ def generate_launch_description(): return LaunchDescription([ gz_sim, - f1tenth, + car_name, + OpaqueFunction(function=launch), service_bridge, reset, ]) \ No newline at end of file From c9081264a2461ac44b365b73fdb7e6805b238e17 Mon Sep 17 00:00:00 2001 From: retinfai Date: Tue, 24 Oct 2023 10:16:30 +1300 Subject: [PATCH 20/38] feat: make CarGoal car name dynamic --- .../environments/CarGoalEnvironment.py | 1 + src/environments/environments/CarGoalReset.py | 2 +- src/environments/launch/cargoal.launch.py | 41 +++++++++++++------ 3 files changed, 30 insertions(+), 14 deletions(-) diff --git a/src/environments/environments/CarGoalEnvironment.py b/src/environments/environments/CarGoalEnvironment.py index 495bd899..015d255b 100644 --- a/src/environments/environments/CarGoalEnvironment.py +++ b/src/environments/environments/CarGoalEnvironment.py @@ -72,6 +72,7 @@ def call_reset_service(self, goal_x, goal_y): req.gx = goal_x req.gy = goal_y + req.car_name = self.NAME future = self.reset_client.call_async(req) rclpy.spin_until_future_complete(future=future, node=self) diff --git a/src/environments/environments/CarGoalReset.py b/src/environments/environments/CarGoalReset.py index c42802ff..85facc0d 100644 --- a/src/environments/environments/CarGoalReset.py +++ b/src/environments/environments/CarGoalReset.py @@ -35,7 +35,7 @@ def __init__(self): def service_callback(self, request, response): goal_req = self.create_request('goal', x=request.gx, y=request.gy, z=1) - car_req = self.create_request('f1tenth') + car_req = self.create_request(request.car_name) while not self.set_pose_client.wait_for_service(timeout_sec=1.0): self.get_logger().info('set_pose service not available, waiting again...') diff --git a/src/environments/launch/cargoal.launch.py b/src/environments/launch/cargoal.launch.py index 07692810..efcab148 100644 --- a/src/environments/launch/cargoal.launch.py +++ b/src/environments/launch/cargoal.launch.py @@ -3,13 +3,38 @@ from launch_ros.actions import Node from launch import LaunchDescription from launch.launch_description_sources import PythonLaunchDescriptionSource -from launch.actions import IncludeLaunchDescription +from launch.actions import IncludeLaunchDescription, DeclareLaunchArgument, OpaqueFunction +from launch.substitutions import 
LaunchConfiguration + +def launch(context, *args, **kwargs): + pkg_f1tenth_bringup = get_package_share_directory('f1tenth_bringup') + + car_name = LaunchConfiguration('car_name').perform(context) + + f1tenth = IncludeLaunchDescription( + launch_description_source=PythonLaunchDescriptionSource( + os.path.join(pkg_f1tenth_bringup, 'simulation_bringup.launch.py')), + launch_arguments={ + 'name': car_name, + 'world': 'empty', + 'x': '0', + 'y': '0', + 'z': '5', + }.items() + ) + + return[f1tenth] def generate_launch_description(): pkg_ros_gz_sim = get_package_share_directory('ros_gz_sim') pkg_f1tenth_bringup = get_package_share_directory('f1tenth_bringup') pkg_environments = get_package_share_directory('environments') + car_name = DeclareLaunchArgument( + 'car_name', + default_value='f1tenth' + ) + service_bridge = Node( package='ros_gz_bridge', executable='parameter_bridge', @@ -26,15 +51,6 @@ def generate_launch_description(): ], ) - f1tenth = IncludeLaunchDescription( - launch_description_source=PythonLaunchDescriptionSource( - os.path.join(pkg_f1tenth_bringup, 'simulation_bringup.launch.py')), - launch_arguments={ - 'name': 'f1tenth', - 'world': 'empty' - }.items() - ) - gz_sim = IncludeLaunchDescription( launch_description_source=PythonLaunchDescriptionSource( os.path.join(pkg_ros_gz_sim, 'launch', 'gz_sim.launch.py')), @@ -43,8 +59,6 @@ def generate_launch_description(): }.items() ) - #TODO: dynamically change car name - #TODO: This doesn't work yet reset = Node( package='environments', executable='CarGoalReset', @@ -54,7 +68,8 @@ def generate_launch_description(): return LaunchDescription([ gz_sim, - f1tenth, + car_name, + OpaqueFunction(function=launch), service_bridge, reset, ]) \ No newline at end of file From a818bd37b707d7ce57c73db1dbb80a45c1ec7ab2 Mon Sep 17 00:00:00 2001 From: retinfai Date: Tue, 24 Oct 2023 10:32:29 +1300 Subject: [PATCH 21/38] chore: remove bash scripts --- helperScripts/0kill.sh | 5 ---- helperScripts/1manual.sh | 28 ---------------------- helperScripts/2train.sh | 30 ------------------------ helperScripts/3test.sh | 28 ---------------------- helperScripts/4openGZ.sh | 12 ---------- helperScripts/retrain.sh | 50 ---------------------------------------- 6 files changed, 153 deletions(-) delete mode 100755 helperScripts/0kill.sh delete mode 100755 helperScripts/1manual.sh delete mode 100755 helperScripts/2train.sh delete mode 100755 helperScripts/3test.sh delete mode 100755 helperScripts/4openGZ.sh delete mode 100755 helperScripts/retrain.sh diff --git a/helperScripts/0kill.sh b/helperScripts/0kill.sh deleted file mode 100755 index 2b7804ca..00000000 --- a/helperScripts/0kill.sh +++ /dev/null @@ -1,5 +0,0 @@ -#!/bin/bash - -pkill -f "ros" -pkill -f "gz" -pkill -f "ruby" diff --git a/helperScripts/1manual.sh b/helperScripts/1manual.sh deleted file mode 100755 index b6940d9a..00000000 --- a/helperScripts/1manual.sh +++ /dev/null @@ -1,28 +0,0 @@ -#!/bin/bash - -# Ctrl + C to exit the program -function cleanup() { - return -} - -# trap cleanup SIGINT SIGTERM -trap cleanup SIGINT SIGTERM - -cd ~/autonomous_f1tenth/ - -if [ -n "$1" ]; then - export GZ_PARTITION=$1 - export ROS_DOMAIN_ID=$1 -else - export GZ_PARTITION=50 - export ROS_DOMAIN_ID=50 -fi - -. 
install/setup.bash - -colcon build -ros2 launch reinforcement_learning sanity_check.launch.py & - -gz sim -g & - -wait diff --git a/helperScripts/2train.sh b/helperScripts/2train.sh deleted file mode 100755 index 382e530d..00000000 --- a/helperScripts/2train.sh +++ /dev/null @@ -1,30 +0,0 @@ -#!/bin/bash - -# Ctrl + C to exit the program -function cleanup() { - return -} - -# trap cleanup SIGINT SIGTERM -trap cleanup SIGINT SIGTERM - -cd ~/autonomous_f1tenth/ - -if [ -n "$1" ]; then - export GZ_PARTITION=$1 - export ROS_DOMAIN_ID=$1 -else - export GZ_PARTITION=100 - export ROS_DOMAIN_ID=100 -fi - -. install/setup.bash - -colcon build -ros2 launch reinforcement_learning train.launch.py & - -if [ "$1" == "y" ] || [ "$1" == "Y" ]; then - gz sim -g & -fi - -wait diff --git a/helperScripts/3test.sh b/helperScripts/3test.sh deleted file mode 100755 index d675e273..00000000 --- a/helperScripts/3test.sh +++ /dev/null @@ -1,28 +0,0 @@ -#!/bin/bash - -# Ctrl + C to exit the program -function cleanup() { - return -} - -# trap cleanup SIGINT SIGTERM -trap cleanup SIGINT SIGTERM - -cd ~/autonomous_f1tenth/ - -if [ -n "$1" ]; then - export GZ_PARTITION=$1 - export ROS_DOMAIN_ID=$1 -else - export GZ_PARTITION=150 - export ROS_DOMAIN_ID=150 -fi - -. install/setup.bash - -colcon build -ros2 launch reinforcement_learning test.launch.py & - -gz sim -g & - -wait diff --git a/helperScripts/4openGZ.sh b/helperScripts/4openGZ.sh deleted file mode 100755 index 932d40bf..00000000 --- a/helperScripts/4openGZ.sh +++ /dev/null @@ -1,12 +0,0 @@ -#!/bin/bash - -cd ~/autonomous_f1tenth/ - -if [ -z "$1" ]; then - echo "Please enter the GZ_PARTITION number" - echo "Usage: ./4openGZ.sh " - return -fi - -export GZ_PARTITION=$1 -gz sim -g diff --git a/helperScripts/retrain.sh b/helperScripts/retrain.sh deleted file mode 100755 index 5b7caf1d..00000000 --- a/helperScripts/retrain.sh +++ /dev/null @@ -1,50 +0,0 @@ -#!/bin/bash - -# Ctrl + C to exit the program -function cleanup() { - return -} - -# trap cleanup SIGINT SIGTERM -trap cleanup SIGINT SIGTERM - -cd .. - -# Time limit for training -if [ -z "$1" ]; then - echo "Please enter the time limit for training" - echo "Usage: ./retrain.sh []" - echo "Eg: 10s, 6h" - echo "Usage: ./retrain.sh 10s" - return -fi - -# Set the partition number -if [ -n "$2" ]; then - export GZ_PARTITION="$2" -fi - -. install/setup.bash - -# Rerun every $1 time -while true; do - colcon build - timeout --signal=SIGKILL "$1" ros2 launch reinforcement_learning train.launch.py - - # Get the latest folder in rl_logs - latest_folder=$(ls -lt ./rl_logs | grep '^d' | head -n 1 | awk '{print $NF}') - - # Set the new paths for actor_path and critic_path - new_actor_path="rl_logs/$latest_folder/models/actor_checkpoint.pht" - new_critic_path="rl_logs/$latest_folder/models/critic_checkpoint.pht" - - # Check if the new paths exist - if [ ! -e "$new_actor_path" ] || [ ! 
-e "$new_critic_path" ]; then - echo "Error: $new_actor_path or $new_critic_path does not exist" - return - fi - - # Use sed to update the actor_path and critic_path in the YAML file - sed -i "s#actor_path: '.*'#actor_path: '$new_actor_path'#g" src/reinforcement_learning/config/train.yaml - sed -i "s#critic_path: '.*'#critic_path: '$new_critic_path'#g" src/reinforcement_learning/config/train.yaml -done From 542bb8551c6f4ffbcc82a12ecb48595db2f89ae9 Mon Sep 17 00:00:00 2001 From: retinfai Date: Tue, 24 Oct 2023 10:35:40 +1300 Subject: [PATCH 22/38] chore: fix imports controller package --- src/controllers/controllers/controller.py | 9 +++++---- src/controllers/controllers/ftg_controller.py | 4 ++-- src/controllers/controllers/ftg_policy.py | 2 +- src/controllers/controllers/random.py | 2 +- src/controllers/controllers/rl_controller.py | 4 ++-- src/controllers/controllers/rl_policy.py | 2 +- 6 files changed, 12 insertions(+), 11 deletions(-) diff --git a/src/controllers/controllers/controller.py b/src/controllers/controllers/controller.py index 8e24a621..ee072474 100644 --- a/src/controllers/controllers/controller.py +++ b/src/controllers/controllers/controller.py @@ -1,13 +1,14 @@ -import rclpy import math -from geometry_msgs.msg import Twist -from message_filters import Subscriber, ApproximateTimeSynchronizer +import rclpy from rclpy import Future from rclpy.node import Node + +from geometry_msgs.msg import Twist +from message_filters import Subscriber, ApproximateTimeSynchronizer from sensor_msgs.msg import LaserScan from nav_msgs.msg import Odometry -from environments.util import process_lidar, process_odom, reduce_lidar +from environments.util import process_odom, reduce_lidar class Controller(Node): def __init__(self, node_name, car_name, step_length): diff --git a/src/controllers/controllers/ftg_controller.py b/src/controllers/controllers/ftg_controller.py index ae7bbc8a..1b33eb09 100644 --- a/src/controllers/controllers/ftg_controller.py +++ b/src/controllers/controllers/ftg_controller.py @@ -1,9 +1,9 @@ -from .controller import Controller +from rclpy import Future from std_msgs.msg import Empty from environments.track_reset import track_info from environments.termination import reached_goal from environments.util import reduce_lidar, process_odom -from rclpy import Future +from .controller import Controller class FTGController(Controller): diff --git a/src/controllers/controllers/ftg_policy.py b/src/controllers/controllers/ftg_policy.py index f7fdac6b..0ca5106d 100644 --- a/src/controllers/controllers/ftg_policy.py +++ b/src/controllers/controllers/ftg_policy.py @@ -1,6 +1,6 @@ -from .controller import Controller import rclpy import numpy as np +from .controller import Controller def main(): rclpy.init() diff --git a/src/controllers/controllers/random.py b/src/controllers/controllers/random.py index 0c72d78b..718bbe41 100644 --- a/src/controllers/controllers/random.py +++ b/src/controllers/controllers/random.py @@ -1,7 +1,7 @@ -from .controller import Controller import rclpy import numpy as np import random +from .controller import Controller def main(): rclpy.init() diff --git a/src/controllers/controllers/rl_controller.py b/src/controllers/controllers/rl_controller.py index c558c044..68eca094 100644 --- a/src/controllers/controllers/rl_controller.py +++ b/src/controllers/controllers/rl_controller.py @@ -1,7 +1,7 @@ -from .controller import Controller +from rclpy import Future from std_msgs.msg import Empty +from .controller import Controller from environments.util import 
reduce_lidar, process_odom -from rclpy import Future class RLController(Controller): diff --git a/src/controllers/controllers/rl_policy.py b/src/controllers/controllers/rl_policy.py index 82a32cc4..f6488282 100644 --- a/src/controllers/controllers/rl_policy.py +++ b/src/controllers/controllers/rl_policy.py @@ -1,7 +1,7 @@ -from .rl_controller import RLController import rclpy from cares_reinforcement_learning.util.NetworkFactory import NetworkFactory from cares_reinforcement_learning.util.helpers import denormalize +from .rl_controller import RLController def main(): rclpy.init() From 4b67b18041a76f4ef6a09d00a8bc443ff88f4350 Mon Sep 17 00:00:00 2001 From: retinfai Date: Tue, 24 Oct 2023 10:37:59 +1300 Subject: [PATCH 23/38] chore: remove unused imports --- src/environments/environments/CarBeatReset.py | 2 +- src/environments/environments/CarBlockEnvironment.py | 1 - src/environments/environments/CarBlockReset.py | 2 +- src/environments/environments/CarGoalReset.py | 2 +- src/environments/environments/CarTrackReset.py | 2 +- src/environments/environments/CarWallEnvironment.py | 2 -- src/environments/environments/CarWallReset.py | 3 +-- src/environments/environments/F1tenthEnvironment.py | 2 -- 8 files changed, 5 insertions(+), 11 deletions(-) diff --git a/src/environments/environments/CarBeatReset.py b/src/environments/environments/CarBeatReset.py index b38ff404..77cf5d30 100644 --- a/src/environments/environments/CarBeatReset.py +++ b/src/environments/environments/CarBeatReset.py @@ -1,7 +1,7 @@ import sys import rclpy from rclpy.node import Node -from rclpy.callback_groups import MutuallyExclusiveCallbackGroup, ReentrantCallbackGroup +from rclpy.callback_groups import MutuallyExclusiveCallbackGroup from rclpy.executors import MultiThreadedExecutor from environment_interfaces.srv import CarBeatReset as CarBeatResetSrv diff --git a/src/environments/environments/CarBlockEnvironment.py b/src/environments/environments/CarBlockEnvironment.py index 100acbe8..b2c3e650 100644 --- a/src/environments/environments/CarBlockEnvironment.py +++ b/src/environments/environments/CarBlockEnvironment.py @@ -4,7 +4,6 @@ import numpy as np import rclpy from rclpy import Future -from sensor_msgs.msg import LaserScan from environments.F1tenthEnvironment import F1tenthEnvironment diff --git a/src/environments/environments/CarBlockReset.py b/src/environments/environments/CarBlockReset.py index f90a385d..825122c9 100644 --- a/src/environments/environments/CarBlockReset.py +++ b/src/environments/environments/CarBlockReset.py @@ -1,7 +1,7 @@ import sys import rclpy from rclpy.node import Node -from rclpy.callback_groups import MutuallyExclusiveCallbackGroup, ReentrantCallbackGroup +from rclpy.callback_groups import MutuallyExclusiveCallbackGroup from rclpy.executors import MultiThreadedExecutor import numpy as np import random diff --git a/src/environments/environments/CarGoalReset.py b/src/environments/environments/CarGoalReset.py index 85facc0d..07b3b426 100644 --- a/src/environments/environments/CarGoalReset.py +++ b/src/environments/environments/CarGoalReset.py @@ -1,7 +1,7 @@ import sys import rclpy from rclpy.node import Node -from rclpy.callback_groups import MutuallyExclusiveCallbackGroup, ReentrantCallbackGroup +from rclpy.callback_groups import MutuallyExclusiveCallbackGroup from rclpy.executors import MultiThreadedExecutor from environment_interfaces.srv import Reset diff --git a/src/environments/environments/CarTrackReset.py b/src/environments/environments/CarTrackReset.py index 0cd5f5dd..486d129a 100644 
--- a/src/environments/environments/CarTrackReset.py +++ b/src/environments/environments/CarTrackReset.py @@ -1,7 +1,7 @@ import sys import rclpy from rclpy.node import Node -from rclpy.callback_groups import MutuallyExclusiveCallbackGroup, ReentrantCallbackGroup +from rclpy.callback_groups import MutuallyExclusiveCallbackGroup from rclpy.executors import MultiThreadedExecutor from environment_interfaces.srv import Reset diff --git a/src/environments/environments/CarWallEnvironment.py b/src/environments/environments/CarWallEnvironment.py index 9d3dc028..cd285cee 100644 --- a/src/environments/environments/CarWallEnvironment.py +++ b/src/environments/environments/CarWallEnvironment.py @@ -1,10 +1,8 @@ import math import random -import numpy as np import rclpy from rclpy import Future -from sensor_msgs.msg import LaserScan from .util import process_odom, avg_reduce_lidar, generate_position from .termination import has_collided, has_flipped_over, reached_goal from environments.F1tenthEnvironment import F1tenthEnvironment diff --git a/src/environments/environments/CarWallReset.py b/src/environments/environments/CarWallReset.py index d10a7f02..48c47c90 100644 --- a/src/environments/environments/CarWallReset.py +++ b/src/environments/environments/CarWallReset.py @@ -1,7 +1,6 @@ -import sys import rclpy from rclpy.node import Node -from rclpy.callback_groups import MutuallyExclusiveCallbackGroup, ReentrantCallbackGroup +from rclpy.callback_groups import MutuallyExclusiveCallbackGroup from rclpy.executors import MultiThreadedExecutor from environment_interfaces.srv import Reset diff --git a/src/environments/environments/F1tenthEnvironment.py b/src/environments/environments/F1tenthEnvironment.py index 8b3af17d..fde64773 100644 --- a/src/environments/environments/F1tenthEnvironment.py +++ b/src/environments/environments/F1tenthEnvironment.py @@ -1,5 +1,3 @@ -import math - import numpy as np import rclpy from geometry_msgs.msg import Twist From 4d9940242ac1865f23a477a7b364958db951f93f Mon Sep 17 00:00:00 2001 From: retinfai Date: Tue, 24 Oct 2023 10:40:17 +1300 Subject: [PATCH 24/38] chore: replace train --- .../reinforcement_learning/train.py | 426 +++--------------- .../reinforcement_learning/updated_train.py | 81 ---- src/reinforcement_learning/setup.py | 2 +- 3 files changed, 54 insertions(+), 455 deletions(-) delete mode 100644 src/reinforcement_learning/reinforcement_learning/updated_train.py diff --git a/src/reinforcement_learning/reinforcement_learning/train.py b/src/reinforcement_learning/reinforcement_learning/train.py index a84b32ca..60a1b9c2 100644 --- a/src/reinforcement_learning/reinforcement_learning/train.py +++ b/src/reinforcement_learning/reinforcement_learning/train.py @@ -1,401 +1,81 @@ import random -import time +from datetime import datetime +import yaml import numpy as np import rclpy +from rclpy.parameter import Parameter import torch -from cares_reinforcement_learning.algorithm.policy import TD3, PPO + from cares_reinforcement_learning.memory import MemoryBuffer -from cares_reinforcement_learning.networks.TD3 import Actor, Critic -from cares_reinforcement_learning.util import Record, NetworkFactory -from cares_reinforcement_learning.util import helpers as hlp +from cares_reinforcement_learning.util.Record import Record +from cares_reinforcement_learning.util.NetworkFactory import NetworkFactory +import cares_reinforcement_learning.util.configurations as cfg -from environments.CarBlockEnvironment import CarBlockEnvironment -from environments.CarGoalEnvironment import 
CarGoalEnvironment -from environments.CarTrackEnvironment import CarTrackEnvironment -from environments.CarWallEnvironment import CarWallEnvironment -from environments.CarBeatEnvironment import CarBeatEnvironment +from .parse_args import parse_args +from .EnvironmentFactory import EnvironmentFactory +from .training_loops import off_policy_train, ppo_train def main(): rclpy.init() - params = get_params() - - global MAX_STEPS_TRAINING - global MAX_STEPS_EXPLORATION - global MAX_STEPS_PER_BATCH - global G - global BATCH_SIZE - global EVALUATE_EVERY_N_STEPS - global EVALUATE_FOR_M_EPISODES - global ALGORITHM - - ENVIRONMENT, \ - ALGORITHM, \ - TRACK, \ - MAX_STEPS_TRAINING, \ - MAX_STEPS_EXPLORATION, \ - GAMMA, \ - TAU, \ - G, \ - BATCH_SIZE, \ - BUFFER_SIZE, \ - SEED, \ - ACTOR_LR, \ - CRITIC_LR, \ - MAX_STEPS, \ - STEP_LENGTH, \ - REWARD_RANGE, \ - COLLISION_RANGE, \ - ACTOR_PATH, \ - CRITIC_PATH, \ - MAX_STEPS_PER_BATCH, \ - OBSERVATION_MODE, \ - EVALUATE_EVERY_N_STEPS, \ - EVALUATE_FOR_M_EPISODES = [param.value for param in params] + env_config, algorithm_config, network_config = parse_args() - if ACTOR_PATH != '' and CRITIC_PATH != '': - MAX_STEPS_EXPLORATION = 0 + # Set Seeds + torch.manual_seed(algorithm_config['seed']) + torch.cuda.manual_seed_all(algorithm_config['seed']) + np.random.seed(algorithm_config['seed']) + random.seed(algorithm_config['seed']) - torch.manual_seed(SEED) - torch.cuda.manual_seed_all(SEED) - np.random.seed(SEED) - random.seed(SEED) - print( - f'---------------------------------------------\n' - f'Environment: {ENVIRONMENT}\n' - f'Algorithm: {ALGORITHM}\n' - f'Exploration Steps: {MAX_STEPS_EXPLORATION}\n' - f'Training Steps: {MAX_STEPS_TRAINING}\n' - f'Gamma: {GAMMA}\n' - f'Tau: {TAU}\n' - f'G: {G}\n' - f'Batch Size: {BATCH_SIZE}\n' - f'Buffer Size: {BUFFER_SIZE}\n' - f'Seed: {SEED}\n' - f'Actor LR: {ACTOR_LR}\n' - f'Critic LR: {CRITIC_LR}\n' - f'Steps per Episode: {MAX_STEPS}\n' - f'Step Length: {STEP_LENGTH}\n' - f'Reward Range: {REWARD_RANGE}\n' - f'Collision Range: {COLLISION_RANGE}\n' - f'Critic Path: {CRITIC_PATH}\n' - f'Actor Path: {ACTOR_PATH}\n' - f'Max Steps per Batch: {MAX_STEPS_PER_BATCH}\n' - f'Observation Mode: {OBSERVATION_MODE}\n' - f'Evaluate Every N Steps: {EVALUATE_EVERY_N_STEPS}\n' - f'Evaluate For M Episodes: {EVALUATE_FOR_M_EPISODES}\n' - f'---------------------------------------------\n' + f'Environment Config: ------------------------------------- \n' + f'{yaml.dump(env_config, default_flow_style=False)} \n' + f'Algorithm Config: ------------------------------------- \n' + f'{yaml.dump(algorithm_config, default_flow_style=False)} \n' + f'Network Config: ------------------------------------- \n' + f'{yaml.dump(network_config, default_flow_style=False)} \n' ) - DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu') - time.sleep(3) - - match ENVIRONMENT: - case 'CarWall': - env = CarWallEnvironment('f1tenth', step_length=STEP_LENGTH, max_steps=MAX_STEPS, reward_range=REWARD_RANGE, collision_range=COLLISION_RANGE) - case 'CarBlock': - env = CarBlockEnvironment('f1tenth', step_length=STEP_LENGTH, max_steps=MAX_STEPS, reward_range=REWARD_RANGE, collision_range=COLLISION_RANGE) - case 'CarTrack': - env = CarTrackEnvironment('f1tenth', step_length=STEP_LENGTH, max_steps=MAX_STEPS, reward_range=REWARD_RANGE, collision_range=COLLISION_RANGE, track=TRACK, observation_mode=OBSERVATION_MODE) - case 'CarBeat': - env = CarBeatEnvironment('f1tenth_one', 'f1tenth_two', step_length=STEP_LENGTH, max_steps=MAX_STEPS, reward_range=REWARD_RANGE, 
collision_range=COLLISION_RANGE, track=TRACK, observation_mode=OBSERVATION_MODE) + env_factory = EnvironmentFactory() + network_factory = NetworkFactory() + + match network_config['algorithm']: + case 'PPO': + config = cfg.PPOConfig(**network_config) + case 'DDPG': + config = cfg.DDPGConfig(**network_config) + case 'SAC': + config = cfg.SACConfig(**network_config) + case 'TD3': + config = cfg.TD3Config(**network_config) case _: - env = CarGoalEnvironment('f1tenth', step_length=STEP_LENGTH, max_steps=MAX_STEPS, reward_range=REWARD_RANGE) - - network_factory_args = { - 'observation_size': env.OBSERVATION_SIZE, - 'action_num': env.ACTION_NUM, - 'actor_lr': ACTOR_LR, - 'critic_lr': CRITIC_LR, - 'gamma': GAMMA, - 'tau': TAU, - 'device': DEVICE - } - - agent_factory = NetworkFactory() - agent = agent_factory.create_network(ALGORITHM, network_factory_args) - - if ACTOR_PATH != '' and CRITIC_PATH != '': - print('Reading saved models into actor and critic') - agent.actor_net.load_state_dict(torch.load(ACTOR_PATH)) - agent.critic_net.load_state_dict(torch.load(CRITIC_PATH)) - print('Successfully Loaded models') - - networks = {'actor': agent.actor_net, 'critic': agent.critic_net} - config = { - 'environment': ENVIRONMENT, - 'algorithm': ALGORITHM, - 'max_steps_training': MAX_STEPS_TRAINING, - 'max_steps_exploration': MAX_STEPS_EXPLORATION, - 'gamma': GAMMA, - 'tau': TAU, - 'g': G, - 'batch_size': BATCH_SIZE, - 'buffer_size': BUFFER_SIZE, - 'seed': SEED, - 'actor_lr': ACTOR_LR, - 'critic_lr': CRITIC_LR, - 'max_steps': MAX_STEPS, - 'step_length': STEP_LENGTH, - 'reward_range': REWARD_RANGE, - 'collision_range': COLLISION_RANGE, - 'max_steps_per_batch': MAX_STEPS_PER_BATCH, - 'observation_mode': OBSERVATION_MODE, - 'evaluate_every_n_steps': EVALUATE_EVERY_N_STEPS, - 'evaluate_for_m_episodes': EVALUATE_FOR_M_EPISODES - } - - if (ENVIRONMENT == 'CarTrack'): - config['track'] = TRACK - - record = Record(networks=networks, checkpoint_freq=100, config=config) - - if ALGORITHM == 'PPO': - train_ppo(env, agent, record=record) - else: - train(env=env, agent=agent, record=record) - - -def train(env, agent, record: Record): - memory = MemoryBuffer() - - episode_timesteps = 0 - episode_reward = 0 - episode_num = 0 - evaluation_reward = None - - state, _ = env.reset() - - evaluate = False - - print(f'Initial State: {state}') - historical_reward = {"step": [], "episode_reward": []} - - for total_step_counter in range(int(MAX_STEPS_TRAINING)): - episode_timesteps += 1 - - if total_step_counter < MAX_STEPS_EXPLORATION: - print(f"Running Exploration Steps {total_step_counter}/{MAX_STEPS_EXPLORATION}") - action_env = np.asarray([random.uniform(env.MIN_ACTIONS[0], env.MAX_ACTIONS[0]), random.uniform(env.MIN_ACTIONS[1], env.MAX_ACTIONS[1])]) # action range the env uses [e.g. -2 , 2 for pendulum] - action = hlp.normalize(action_env, env.MAX_ACTIONS, env.MIN_ACTIONS) # algorithm range [-1, 1] - else: - action = agent.select_action_from_policy(state) # algorithm range [-1, 1] - action_env = hlp.denormalize(action, env.MAX_ACTIONS, env.MIN_ACTIONS) # mapping to env range [e.g. 
-2 , 2 for pendulum] - - next_state, reward, done, truncated, info = env.step(action_env) - memory.add(state=state, action=action, reward=reward, next_state=next_state, done=done) - - state = next_state - episode_reward += reward - - if total_step_counter >= MAX_STEPS_EXPLORATION: - for _ in range(G): - experiences = memory.sample(BATCH_SIZE) - experiences = (experiences['state'], experiences['action'], experiences['reward'], experiences['next_state'], experiences['done']) - agent.train_policy(experiences) - - - - if total_step_counter % EVALUATE_EVERY_N_STEPS == 0: - evaluate = True - - - record.log( - out=done or truncated, - Step=total_step_counter, - Episode=episode_num, - Step_Reward=reward, - Episode_Reward=episode_reward if (done or truncated) else None, - Evaluation_Reward=evaluation_reward - ) + raise Exception(f'Algorithm {network_config["algorithm"]} not implemented') - evaluation_reward = None - if done or truncated: - historical_reward["step"].append(total_step_counter) - historical_reward["episode_reward"].append(episode_reward) - - if evaluate: - evaluation_reward = evaluate_policy(env, agent, EVALUATE_FOR_M_EPISODES) - evaluate = False - - # Reset environment - state, _ = env.reset() - episode_reward = 0 - episode_timesteps = 0 - episode_num += 1 - -def train_ppo(env, agent, record): - max_steps_training = MAX_STEPS_TRAINING - max_steps_per_batch = MAX_STEPS_PER_BATCH + env = env_factory.create(env_config['environment'], env_config) + agent = network_factory.create_network(env.OBSERVATION_SIZE, env.ACTION_NUM, config=config) + memory = MemoryBuffer(algorithm_config['buffer_size'], env.OBSERVATION_SIZE, env.ACTION_NUM) - min_action_value = env.MIN_ACTIONS - max_action_value = env.MAX_ACTIONS - episode_timesteps = 0 - episode_num = 0 - episode_reward = 0 - time_step = 1 - evaluation_reward = None - evaluate = False - - memory = MemoryBuffer() - - state, _ = env.reset() - - for total_step_counter in range(int(max_steps_training)): - episode_timesteps += 1 - - action, log_prob = agent.select_action_from_policy(state) - action_env = hlp.denormalize(action, max_action_value, min_action_value) - - next_state, reward, done, truncated, _ = env.step(action_env) - memory.add(state=state, action=action, reward=reward, next_state=next_state, done=done, log_prob=log_prob) - - state = next_state - episode_reward += reward - - if time_step % max_steps_per_batch == 0: - for _ in range(G): - experience = memory.sample(BATCH_SIZE) - info = agent.train_policy(( - experience['state'], - experience['action'], - experience['reward'], - experience['next_state'], - experience['done'], - experience['log_prob'] - )) - - memory.clear() - - if total_step_counter % EVALUATE_EVERY_N_STEPS == 0: - evaluate = True - - record.log( - Steps = total_step_counter, - Episode= episode_num, - Step_reward= reward, - Episode_reward= episode_reward if (done or truncated) else None, - Evaluation_reward=evaluation_reward, - out=done or truncated, - ) - evaluation_reward = None - time_step += 1 - - if done or truncated: - print(f'Episode: {episode_num} | Reward: {episode_reward} | Steps: {time_step}') - - if evaluate: - evaluation_reward = evaluate_policy(env, agent, EVALUATE_FOR_M_EPISODES) - evaluate = False - - # Reset environment - state, _ = env.reset() - episode_reward = 0 - episode_timesteps = 0 - episode_num += 1 - -def evaluate_policy(env, agent, num_episodes): - - episode_reward_history = [] - - print('Beginning Evaluation----------------------------') - - for ep in range(num_episodes): - state, _ = 
env.reset() - - episode_timesteps = 0 - episode_reward = 0 + record = Record( + glob_log_dir='training_logs', + log_dir= f"{network_config['algorithm']}-{env_config['environment']}-{datetime.now().strftime('%y_%m_%d_%H:%M:%S')}", + algorithm=network_config['algorithm'], + task=env_config['environment'], + network=agent + ) - truncated = False - terminated = False + # TODO: Load Actor and Critic if passed. Only load if both are passed - while not truncated and not terminated: + match agent.type: + case 'policy': - if ALGORITHM == 'PPO': - action = agent.select_action_from_policy(state) + if network_config['algorithm'] == 'PPO': + ppo_train(env, agent, memory, record, algorithm_config) else: - action = agent.select_action_from_policy(state, evaluation=True) - - action = hlp.denormalize(action, env.MAX_ACTIONS, env.MIN_ACTIONS) - next_state, reward, terminated, truncated, _ = env.step(action) - - episode_reward += reward - state = next_state - - print(f'Evaluation Episode {ep + 1} Completed with a Reward of {episode_reward}') - episode_reward_history.append(episode_reward) - - avg_reward = sum(episode_reward_history) / len(episode_reward_history) - - print(f'Evaluation Completed: Avg Reward over {num_episodes} Episodes is {avg_reward} ----------------------------') - - return avg_reward - -def get_params(): - ''' - This function fetches the hyperparameters passed in through the launch files - - The hyperparameters below are defaults, to change them, you should change the train.yaml config - ''' - param_node = rclpy.create_node('params') - param_node.declare_parameters( - '', - [ - ('environment', 'CarGoal'), - ('algorithm', 'TD3'), - ('track', 'track_1'), - ('gamma', 0.95), - ('tau', 0.005), - ('g', 10), - ('batch_size', 32), - ('buffer_size', 1_000_000), - ('seed', 123), # TODO: This doesn't do anything yet - ('actor_lr', 1e-4), - ('critic_lr', 1e-3), - ('max_steps_training', 1_000_000), - ('max_steps_exploration', 1_000), - ('max_steps', 100), - ('step_length', 0.25), - ('reward_range', 0.2), - ('collision_range', 0.2), - ('actor_path', ''), - ('critic_path', ''), - ('max_steps_per_batch', 5000), - ('observation_mode', 'no_position'), - ('evaluate_every_n_steps', 10000), - ('evaluate_for_m_episodes', 5) - ] - ) + off_policy_train(env, agent, memory, record, algorithm_config) + case _: + raise Exception(f'Agent type {agent.type} not supported') - return param_node.get_parameters([ - 'environment', - 'algorithm', - 'track', - 'max_steps_training', - 'max_steps_exploration', - 'gamma', - 'tau', - 'g', - 'batch_size', - 'buffer_size', - 'seed', - 'actor_lr', - 'critic_lr', - 'max_steps', - 'step_length', - 'reward_range', - 'collision_range', - 'actor_path', - 'critic_path', - 'max_steps_per_batch', - 'observation_mode', - 'evaluate_every_n_steps', - 'evaluate_for_m_episodes' - ]) -if __name__ == '__main__': - main() diff --git a/src/reinforcement_learning/reinforcement_learning/updated_train.py b/src/reinforcement_learning/reinforcement_learning/updated_train.py deleted file mode 100644 index 60a1b9c2..00000000 --- a/src/reinforcement_learning/reinforcement_learning/updated_train.py +++ /dev/null @@ -1,81 +0,0 @@ -import random -from datetime import datetime -import yaml - -import numpy as np -import rclpy -from rclpy.parameter import Parameter -import torch - -from cares_reinforcement_learning.memory import MemoryBuffer -from cares_reinforcement_learning.util.Record import Record -from cares_reinforcement_learning.util.NetworkFactory import NetworkFactory -import 
cares_reinforcement_learning.util.configurations as cfg - -from .parse_args import parse_args -from .EnvironmentFactory import EnvironmentFactory -from .training_loops import off_policy_train, ppo_train - -def main(): - rclpy.init() - - env_config, algorithm_config, network_config = parse_args() - - # Set Seeds - torch.manual_seed(algorithm_config['seed']) - torch.cuda.manual_seed_all(algorithm_config['seed']) - np.random.seed(algorithm_config['seed']) - random.seed(algorithm_config['seed']) - - print( - f'Environment Config: ------------------------------------- \n' - f'{yaml.dump(env_config, default_flow_style=False)} \n' - f'Algorithm Config: ------------------------------------- \n' - f'{yaml.dump(algorithm_config, default_flow_style=False)} \n' - f'Network Config: ------------------------------------- \n' - f'{yaml.dump(network_config, default_flow_style=False)} \n' - ) - - env_factory = EnvironmentFactory() - network_factory = NetworkFactory() - - match network_config['algorithm']: - case 'PPO': - config = cfg.PPOConfig(**network_config) - case 'DDPG': - config = cfg.DDPGConfig(**network_config) - case 'SAC': - config = cfg.SACConfig(**network_config) - case 'TD3': - config = cfg.TD3Config(**network_config) - case _: - raise Exception(f'Algorithm {network_config["algorithm"]} not implemented') - - - env = env_factory.create(env_config['environment'], env_config) - agent = network_factory.create_network(env.OBSERVATION_SIZE, env.ACTION_NUM, config=config) - memory = MemoryBuffer(algorithm_config['buffer_size'], env.OBSERVATION_SIZE, env.ACTION_NUM) - - - record = Record( - glob_log_dir='training_logs', - log_dir= f"{network_config['algorithm']}-{env_config['environment']}-{datetime.now().strftime('%y_%m_%d_%H:%M:%S')}", - algorithm=network_config['algorithm'], - task=env_config['environment'], - network=agent - ) - - # TODO: Load Actor and Critic if passed. 
Only load if both are passed - - match agent.type: - case 'policy': - - if network_config['algorithm'] == 'PPO': - ppo_train(env, agent, memory, record, algorithm_config) - else: - off_policy_train(env, agent, memory, record, algorithm_config) - case _: - raise Exception(f'Agent type {agent.type} not supported') - - - diff --git a/src/reinforcement_learning/setup.py b/src/reinforcement_learning/setup.py index 93ed141e..ccb4e262 100644 --- a/src/reinforcement_learning/setup.py +++ b/src/reinforcement_learning/setup.py @@ -25,7 +25,7 @@ entry_points={ 'console_scripts': [ 'sanity_check = reinforcement_learning.sanity_check:main', - 'train = reinforcement_learning.updated_train:main', + 'train = reinforcement_learning.train:main', 'test = reinforcement_learning.test:main', 'greatest_gap = reinforcement_learning.greatest_gap:main', ], From 8071953568e7a7d46e247623e7e2d5fd17a98823 Mon Sep 17 00:00:00 2001 From: retinfai Date: Tue, 24 Oct 2023 11:26:07 +1300 Subject: [PATCH 25/38] refactor: update test loop to Record --- .../environments/CarBeatEnvironment.py | 2 +- .../reinforcement_learning/test.py | 207 ++++-------------- .../reinforcement_learning/train.py | 12 +- .../reinforcement_learning/training_loops.py | 36 +-- 4 files changed, 79 insertions(+), 178 deletions(-) diff --git a/src/environments/environments/CarBeatEnvironment.py b/src/environments/environments/CarBeatEnvironment.py index 53c64026..8a744fd3 100644 --- a/src/environments/environments/CarBeatEnvironment.py +++ b/src/environments/environments/CarBeatEnvironment.py @@ -29,7 +29,7 @@ def __init__(self, track='multi_track', observation_mode='lidar_only', max_goals=500, - num_lidar_points=50 + num_lidar_points=10 ): super().__init__('car_beat_environment') diff --git a/src/reinforcement_learning/reinforcement_learning/test.py b/src/reinforcement_learning/reinforcement_learning/test.py index 9adc7acf..03377fef 100644 --- a/src/reinforcement_learning/reinforcement_learning/test.py +++ b/src/reinforcement_learning/reinforcement_learning/test.py @@ -1,175 +1,64 @@ -import time +import random +from datetime import datetime +import yaml import rclpy import torch -from cares_reinforcement_learning.algorithm.policy import TD3 -from cares_reinforcement_learning.networks.TD3 import Actor, Critic -from cares_reinforcement_learning.util import Record -from cares_reinforcement_learning.util import helpers as hlp, NetworkFactory -from environments.CarBlockEnvironment import CarBlockEnvironment -from environments.CarGoalEnvironment import CarGoalEnvironment -from environments.CarTrackEnvironment import CarTrackEnvironment -from environments.CarWallEnvironment import CarWallEnvironment -from environments.CarBeatEnvironment import CarBeatEnvironment +from cares_reinforcement_learning.util.NetworkFactory import NetworkFactory +import cares_reinforcement_learning.util.configurations as cfg + +from .parse_args import parse_args +from .EnvironmentFactory import EnvironmentFactory +from .training_loops import off_policy_evaluate, ppo_evaluate def main(): rclpy.init() - params = get_params() - - global MAX_STEPS_EVALUATION - global MAX_STEPS - - ENVIRONMENT, \ - TRACK, \ - MAX_STEPS_EVALUATION, \ - MAX_STEPS, \ - STEP_LENGTH, \ - REWARD_RANGE, \ - COLLISION_RANGE, \ - ACTOR_PATH, \ - CRITIC_PATH, \ - OBSERVAION_MODE, \ - ALGORITHM = [param.value for param in params] - - if ACTOR_PATH == '' or CRITIC_PATH == '': - raise Exception('Actor or Critic path not provided') - - DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu') - - 
time.sleep(3) - - match ENVIRONMENT: - case 'CarWall': - env = CarWallEnvironment('f1tenth', step_length=STEP_LENGTH, max_steps=MAX_STEPS, reward_range=REWARD_RANGE, collision_range=COLLISION_RANGE) - case 'CarBlock': - env = CarBlockEnvironment('f1tenth', step_length=STEP_LENGTH, max_steps=MAX_STEPS, reward_range=REWARD_RANGE, collision_range=COLLISION_RANGE) - case 'CarTrack': - env = CarTrackEnvironment('f1tenth', step_length=STEP_LENGTH, max_steps=MAX_STEPS, reward_range=REWARD_RANGE, collision_range=COLLISION_RANGE, track=TRACK, observation_mode=OBSERVAION_MODE) - case 'CarBeat': - env = CarBeatEnvironment('f1tenth_one', 'f1tenth_two', step_length=STEP_LENGTH, max_steps=MAX_STEPS, reward_range=REWARD_RANGE, collision_range=COLLISION_RANGE, track=TRACK, observation_mode=OBSERVAION_MODE) - case _: - env = CarGoalEnvironment('f1tenth', step_length=STEP_LENGTH, max_steps=MAX_STEPS, reward_range=REWARD_RANGE) + env_config, algorithm_config, network_config = parse_args() print( - f'---------------------------------------------\n' - f'Environment: {ENVIRONMENT}\n' - f'Evaluation Steps: {MAX_STEPS_EVALUATION}\n' - f'Steps per Episode: {MAX_STEPS}\n' - f'Step Length: {STEP_LENGTH}\n' - f'Reward Range: {REWARD_RANGE}\n' - f'Collision Range: {COLLISION_RANGE}\n' - f'Critic Path: {CRITIC_PATH}\n' - f'Actor Path: {ACTOR_PATH}\n' - f'Observation Mode: {OBSERVAION_MODE}\n' - f'Observation size: {env.OBSERVATION_SIZE}' - f'Algorithm: {ALGORITHM}\n' - f'---------------------------------------------\n' - ) - - network_factory_args = { - 'observation_size': env.OBSERVATION_SIZE, - 'action_num': env.ACTION_NUM, - 'actor_lr': 0.11, - 'critic_lr': 0.11, - 'gamma': 0.23, - 'tau': 0.001, - 'device': DEVICE - } - - agent_factory = NetworkFactory() - agent = agent_factory.create_network(ALGORITHM, network_factory_args) - - print('Reading saved models into actor and critic') - agent.actor_net.load_state_dict(torch.load(ACTOR_PATH)) - agent.critic_net.load_state_dict(torch.load(CRITIC_PATH)) - - print('Successfully Loaded models') - - - - record = Record(checkpoint_freq=100, log_dir="multi_track_with_speed12_evaluation") - - test(env=env, agent=agent, record=record) - - -def test(env, agent, record: Record): - state, _ = env.reset() - episode_timesteps = 0 - episode_reward = 0 - episode_num = 0 - - print('Beginning Evaluation') - - for total_step_counter in range(int(MAX_STEPS_EVALUATION)): - episode_timesteps += 1 - - action = agent.select_action_from_policy(state, evaluation=True) - action_env = hlp.denormalize(action, env.MAX_ACTIONS, env.MIN_ACTIONS) - - next_state, reward, done, truncated, info = env.step(action_env) - - state = next_state - episode_reward += reward - - if done or truncated: - print(f"Total T:{total_step_counter + 1} Episode {episode_num + 1} was completed with {episode_timesteps} steps taken and a Reward= {episode_reward:.3f}") - - # Reset environment - state, _ = env.reset() - episode_reward = 0 - episode_timesteps = 0 - episode_num += 1 - - record.log( - out=done or truncated, - Step=total_step_counter, - Episode=episode_num, - Step_Reward=reward, - Episode_Reward=episode_reward if (done or truncated) else None, - ) - - -def get_params(): - """ - This function fetches the hyperparameters passed in through the launch files - - The hyperparameters below are defaults, to change them, you should change the train.yaml config - """ - - param_node = rclpy.create_node('params') - param_node.declare_parameters( - '', - [ - ('environment', 'CarTrack'), - ('track', 'track_1'), - 
('max_steps_evaluation', 1_000_000), - ('max_steps', 100), - ('step_length', 0.25), - ('reward_range', 0.2), - ('collision_range', 0.2), - ('actor_path', ''), - ('critic_path', ''), - ('observation_mode', 'full'), - ('algorithm', 'TD3') - ] + f'Environment Config: ------------------------------------- \n' + f'{yaml.dump(env_config, default_flow_style=False)} \n' + f'Algorithm Config: ------------------------------------- \n' + f'{yaml.dump(algorithm_config, default_flow_style=False)} \n' + f'Network Config: ------------------------------------- \n' + f'{yaml.dump(network_config, default_flow_style=False)} \n' ) - return param_node.get_parameters([ - 'environment', - 'track', - 'max_steps_evaluation', - 'max_steps', - 'step_length', - 'reward_range', - 'collision_range', - 'actor_path', - 'critic_path', - 'observation_mode', - 'algorithm' - ]) - + env_factory = EnvironmentFactory() + network_factory = NetworkFactory() + + match network_config['algorithm']: + case 'PPO': + config = cfg.PPOConfig(**network_config) + case 'DDPG': + config = cfg.DDPGConfig(**network_config) + case 'SAC': + config = cfg.SACConfig(**network_config) + case 'TD3': + config = cfg.TD3Config(**network_config) + case _: + raise Exception(f'Algorithm {network_config["algorithm"]} not implemented') + + env = env_factory.create(env_config['environment'], env_config) + agent = network_factory.create_network(env.OBSERVATION_SIZE, env.ACTION_NUM, config=config) + + # Load models if both paths are provided + if network_config['actor_path'] and network_config['critic_path']: + print('Reading saved models into actor and critic') + agent.actor_net.load_state_dict(torch.load(network_config['actor_path'])) + agent.critic_net.load_state_dict(torch.load(network_config['critic_path'])) + print('Successfully Loaded models') + else: + raise Exception('Both actor and critic paths must be provided') + + match network_config['algorithm']: + case 'PPO': + ppo_evaluate(env, agent, algorithm_config) + case _: + off_policy_evaluate(env, agent, algorithm_config) if __name__ == '__main__': main() diff --git a/src/reinforcement_learning/reinforcement_learning/train.py b/src/reinforcement_learning/reinforcement_learning/train.py index 60a1b9c2..ee1896a1 100644 --- a/src/reinforcement_learning/reinforcement_learning/train.py +++ b/src/reinforcement_learning/reinforcement_learning/train.py @@ -22,7 +22,7 @@ def main(): env_config, algorithm_config, network_config = parse_args() # Set Seeds - torch.manual_seed(algorithm_config['seed']) + torch.manual_seed(algorithm_config['seed']) torch.cuda.manual_seed_all(algorithm_config['seed']) np.random.seed(algorithm_config['seed']) random.seed(algorithm_config['seed']) @@ -62,9 +62,15 @@ def main(): log_dir= f"{network_config['algorithm']}-{env_config['environment']}-{datetime.now().strftime('%y_%m_%d_%H:%M:%S')}", algorithm=network_config['algorithm'], task=env_config['environment'], - network=agent + network=agent, + checkpoint_frequency=5000 ) + # TODO: update the save config once Record is updated. + # record.save_config(env_config, 'env_config') + # record.save_config(algorithm_config, 'algorithm_config') + # record.save_config(network_config, 'network_config') + # TODO: Load Actor and Critic if passed. 
Only load if both are passed match agent.type: @@ -76,6 +82,8 @@ def main(): off_policy_train(env, agent, memory, record, algorithm_config) case _: raise Exception(f'Agent type {agent.type} not supported') + + record.save() diff --git a/src/reinforcement_learning/reinforcement_learning/training_loops.py b/src/reinforcement_learning/reinforcement_learning/training_loops.py index 35e5059d..1197e2ca 100644 --- a/src/reinforcement_learning/reinforcement_learning/training_loops.py +++ b/src/reinforcement_learning/reinforcement_learning/training_loops.py @@ -77,7 +77,7 @@ def off_policy_train(env, agent, memory, record, algorithm_config): episode_timesteps = 0 episode_num += 1 -def off_policy_evaluate(env, agent, eval_episodes, record, steps_counter): +def off_policy_evaluate(env, agent, eval_episodes, record=None, steps_counter=0): episode_reward = 0 episode_timesteps = 0 @@ -99,13 +99,15 @@ def off_policy_evaluate(env, agent, eval_episodes, record, steps_counter): episode_reward += reward if done or truncated: - record.log_eval( - total_steps = steps_counter + 1, - episode = episode_num + 1, - episode_steps=episode_timesteps, - episode_reward = episode_reward, - display = True - ) + + if record: + record.log_eval( + total_steps = steps_counter + 1, + episode = episode_num + 1, + episode_steps=episode_timesteps, + episode_reward = episode_reward, + display = True + ) # Reset environment state, _ = env.reset() @@ -180,7 +182,7 @@ def ppo_train(env, agent, memory, record, algorithm_config): episode_timesteps = 0 episode_num += 1 -def ppo_evaluate(env, agent, eval_episodes, record, steps_counter): +def ppo_evaluate(env, agent, eval_episodes, record=None, steps_counter=0): episode_reward = 0 episode_timesteps = 0 @@ -202,13 +204,15 @@ def ppo_evaluate(env, agent, eval_episodes, record, steps_counter): episode_reward += reward if done or truncated: - record.log_eval( - total_steps = steps_counter + 1, - episode = episode_num + 1, - episode_steps=episode_timesteps, - episode_reward = episode_reward, - display = True - ) + + if record: + record.log_eval( + total_steps = steps_counter + 1, + episode = episode_num + 1, + episode_steps=episode_timesteps, + episode_reward = episode_reward, + display = True + ) # Reset environment state, _ = env.reset() From 9e2b3986208b7566dd3641f8c9a97a6ea294fbc9 Mon Sep 17 00:00:00 2001 From: Afereti Pama <79831813+retinfai@users.noreply.github.com> Date: Tue, 24 Oct 2023 11:34:30 +1300 Subject: [PATCH 26/38] Update README.md --- README.md | 3 --- 1 file changed, 3 deletions(-) diff --git a/README.md b/README.md index 2bc821f1..8f9faf06 100644 --- a/README.md +++ b/README.md @@ -63,9 +63,6 @@ train: reward_range: 1.0 collision_range: 0.2 observation_mode: 'no_position' - # actor_path & critic_path must exist, it can't be commented - actor_path: 'rl_logs/23_08_02_17:59:13/models/actor_checkpoint.pht' - critic_path: 'rl_logs/23_08_02_17:59:13/models/critic_checkpoint.pht' # gamma: 0.95 # tau: 0.005 # g: 5 From e3d5c23d31273b636bc9d2d05a6fb0f02291f926 Mon Sep 17 00:00:00 2001 From: retinfai Date: Wed, 25 Oct 2023 09:41:47 +1300 Subject: [PATCH 27/38] feat: integrate pydantic models into training --- .../environments/CarBeatEnvironment.py | 2 +- .../environments/configurations.py | 46 +++++++++++++++++++ .../reinforcement_learning/parse_args.py | 46 ++++++++++++++++--- .../reinforcement_learning/train.py | 28 +++-------- .../reinforcement_learning/training_loops.py | 16 +++---- 5 files changed, 101 insertions(+), 37 deletions(-) create mode 100644 
src/environments/environments/configurations.py diff --git a/src/environments/environments/CarBeatEnvironment.py b/src/environments/environments/CarBeatEnvironment.py index 8a744fd3..64011d34 100644 --- a/src/environments/environments/CarBeatEnvironment.py +++ b/src/environments/environments/CarBeatEnvironment.py @@ -201,7 +201,7 @@ def step(self, action): next_state, full_next_state = self.get_observation() reward = self.compute_reward(full_state, full_next_state) terminated = self.is_terminated(full_next_state) - truncated = self.steps_since_last_goal >= 10 + truncated = self.steps_since_last_goal >= self.MAX_STEPS_PER_GOAL info = {} return next_state, reward, terminated, truncated, info diff --git a/src/environments/environments/configurations.py b/src/environments/environments/configurations.py new file mode 100644 index 00000000..4f02faca --- /dev/null +++ b/src/environments/environments/configurations.py @@ -0,0 +1,46 @@ +from pydantic import BaseModel +from typing import Optional + +class SubscriptableClass(BaseModel): + def __getitem__(self, item): + return getattr(self, item) + +class TrainingConfig(SubscriptableClass): + g: int + batch_size: int + buffer_size: int + seed: int + max_steps_training: int + max_steps_exploration: int + max_steps_per_batch: Optional[int] + number_steps_per_evaluation: int + number_eval_episodes: int + +class EnvironmentConfig(SubscriptableClass): + environment: str + car_name: str + reward_range: float + max_steps: int + step_length: float + collision_range: Optional[float] # Doesn't apply to CarGoal + +class CarGoalEnvironmentConfig(EnvironmentConfig): + pass + +class CarBlockEnvironmentConfig(EnvironmentConfig): + pass + +class CarWallEnvironmentConfig(EnvironmentConfig): + pass + +class CarTrackEnvironmentConfig(EnvironmentConfig): + track: str + observation_mode: str + max_goals: int + +class CarBeatEnvironmentConfig(EnvironmentConfig): + ftg_car_name: str + track: str + observation_mode: str + max_goals: int + num_lidar_points: int \ No newline at end of file diff --git a/src/reinforcement_learning/reinforcement_learning/parse_args.py b/src/reinforcement_learning/reinforcement_learning/parse_args.py index 4e88be1e..165db0e1 100644 --- a/src/reinforcement_learning/reinforcement_learning/parse_args.py +++ b/src/reinforcement_learning/reinforcement_learning/parse_args.py @@ -1,6 +1,8 @@ import rclpy from rclpy.node import Node from rclpy.parameter import Parameter +from environments import configurations as cfg +from cares_reinforcement_learning.util import configurations as cares_cfg def parse_args(): param_node = __declare_params() @@ -11,6 +13,7 @@ def parse_args(): return env_params, algorithm_params, network_params + def __declare_params(): param_node = rclpy.create_node('params') param_node.declare_parameters( @@ -37,8 +40,8 @@ def __declare_params(): ('max_steps_training', 1_000_000), ('max_steps_exploration', 1_000), ('max_steps_per_batch', 5000), - ('evaluate_every_n_steps', 10000), - ('evaluate_for_m_episodes', 5), + ('number_steps_per_evaluation', 10000), + ('number_eval_episodes', 5), # Network Parameters ------------------------------- ('actor_path', ''), @@ -75,7 +78,21 @@ def __get_env_params(param_node: Node): for param in params: params_dict[param.name] = param.value - return params_dict + match params_dict['environment']: + case 'CarGoal': + config = cfg.CarGoalEnvironmentConfig(**params_dict) + case 'CarBlock': + config = cfg.CarBlockEnvironmentConfig(**params_dict) + case 'CarWall': + config = 
cfg.CarWallEnvironmentConfig(**params_dict) + case 'CarTrack': + config = cfg.CarTrackEnvironmentConfig(**params_dict) + case 'CarBeat': + config = cfg.CarBeatEnvironmentConfig(**params_dict) + case _: + raise Exception(f'Environment {params_dict["environment"]} not implemented') + + return config def __get_algorithm_params(param_node: Node): params = param_node.get_parameters([ @@ -86,8 +103,8 @@ def __get_algorithm_params(param_node: Node): 'max_steps_training', 'max_steps_exploration', 'max_steps_per_batch', - 'evaluate_every_n_steps', - 'evaluate_for_m_episodes' + 'number_steps_per_evaluation', + 'number_eval_episodes' ]) # Convert to Dictionary @@ -95,7 +112,9 @@ def __get_algorithm_params(param_node: Node): for param in params: params_dict[param.name] = param.value - return params_dict + config = cfg.TrainingConfig(**params_dict) + + return config def __get_network_params(param_node: Node): params = param_node.get_parameters([ @@ -113,4 +132,17 @@ def __get_network_params(param_node: Node): for param in params: params_dict[param.name] = param.value - return params_dict + match params_dict['algorithm']: + case 'PPO': + config = cares_cfg.PPOConfig(**params_dict) + case 'DDPG': + config = cares_cfg.DDPGConfig(**params_dict) + case 'SAC': + config = cares_cfg.SACConfig(**params_dict) + case 'TD3': + config = cares_cfg.TD3Config(**params_dict) + case _: + raise Exception(f'Algorithm {params_dict["algorithm"]} not implemented') + + return config + diff --git a/src/reinforcement_learning/reinforcement_learning/train.py b/src/reinforcement_learning/reinforcement_learning/train.py index ee1896a1..150febc5 100644 --- a/src/reinforcement_learning/reinforcement_learning/train.py +++ b/src/reinforcement_learning/reinforcement_learning/train.py @@ -29,31 +29,18 @@ def main(): print( f'Environment Config: ------------------------------------- \n' - f'{yaml.dump(env_config, default_flow_style=False)} \n' + f'{yaml.dump(dict(env_config), default_flow_style=False)} \n' f'Algorithm Config: ------------------------------------- \n' - f'{yaml.dump(algorithm_config, default_flow_style=False)} \n' + f'{yaml.dump(dict(algorithm_config), default_flow_style=False)} \n' f'Network Config: ------------------------------------- \n' - f'{yaml.dump(network_config, default_flow_style=False)} \n' + f'{yaml.dump(dict(network_config), default_flow_style=False)} \n' ) env_factory = EnvironmentFactory() network_factory = NetworkFactory() - match network_config['algorithm']: - case 'PPO': - config = cfg.PPOConfig(**network_config) - case 'DDPG': - config = cfg.DDPGConfig(**network_config) - case 'SAC': - config = cfg.SACConfig(**network_config) - case 'TD3': - config = cfg.TD3Config(**network_config) - case _: - raise Exception(f'Algorithm {network_config["algorithm"]} not implemented') - - env = env_factory.create(env_config['environment'], env_config) - agent = network_factory.create_network(env.OBSERVATION_SIZE, env.ACTION_NUM, config=config) + agent = network_factory.create_network(env.OBSERVATION_SIZE, env.ACTION_NUM, config=network_config) memory = MemoryBuffer(algorithm_config['buffer_size'], env.OBSERVATION_SIZE, env.ACTION_NUM) @@ -66,10 +53,9 @@ def main(): checkpoint_frequency=5000 ) - # TODO: update the save config once Record is updated. 
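A minimal usage sketch of the new pydantic configuration classes (not part of the patches; the field values are illustrative and the import path is assumed from the src/environments/environments/configurations.py layout above). Because SubscriptableClass implements __getitem__, the validated models can still be indexed dict-style, which is why the training loops can keep using algorithm_config['...'] access:

    from environments.configurations import TrainingConfig, CarTrackEnvironmentConfig

    # Illustrative values only; real runs take these from train.yaml via parse_args().
    training_config = TrainingConfig(
        g=1,
        batch_size=32,
        buffer_size=50_000,
        seed=123,
        max_steps_training=1_000_000,
        max_steps_exploration=1_000,
        max_steps_per_batch=5_000,
        number_steps_per_evaluation=10_000,
        number_eval_episodes=5,
    )

    env_config = CarTrackEnvironmentConfig(
        environment='CarTrack',
        car_name='f1tenth',
        reward_range=2.0,
        max_steps=100,
        step_length=0.25,
        collision_range=0.2,
        track='multi_track',
        observation_mode='lidar_only',
        max_goals=500,
    )

    # SubscriptableClass defines __getitem__, so dict-style access still works
    # alongside normal attribute access and pydantic validation.
    assert training_config['batch_size'] == training_config.batch_size
    assert env_config['track'] == 'multi_track'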
- # record.save_config(env_config, 'env_config') - # record.save_config(algorithm_config, 'algorithm_config') - # record.save_config(network_config, 'network_config') + record.save_config(env_config, 'env_config') + record.save_config(algorithm_config, 'algorithm_config') + record.save_config(network_config, 'network_config') # TODO: Load Actor and Critic if passed. Only load if both are passed diff --git a/src/reinforcement_learning/reinforcement_learning/training_loops.py b/src/reinforcement_learning/reinforcement_learning/training_loops.py index 1197e2ca..8a281e76 100644 --- a/src/reinforcement_learning/reinforcement_learning/training_loops.py +++ b/src/reinforcement_learning/reinforcement_learning/training_loops.py @@ -6,8 +6,8 @@ def off_policy_train(env, agent, memory, record, algorithm_config): max_steps_training = algorithm_config['max_steps_training'] max_steps_exploration = algorithm_config['max_steps_exploration'] - num_eps_evaluation = algorithm_config['evaluate_for_m_episodes'] - evaluate_every_n_steps = algorithm_config['evaluate_every_n_steps'] + number_eval_episodes = algorithm_config['number_eval_episodes'] + number_steps_per_evaluation = algorithm_config['number_steps_per_evaluation'] batch_size = algorithm_config['batch_size'] G = algorithm_config['g'] @@ -49,7 +49,7 @@ def off_policy_train(env, agent, memory, record, algorithm_config): )) memory.update_priorities(experience['indices'], info) - if (step_counter+1) % evaluate_every_n_steps == 0: + if (step_counter+1) % number_steps_per_evaluation == 0: evaluate = True if done or truncated: @@ -67,7 +67,7 @@ def off_policy_train(env, agent, memory, record, algorithm_config): evaluate = False env.get_logger().info(f'*************--Begin Evaluation Loop--*************') - off_policy_evaluate(env, agent, num_eps_evaluation, record, step_counter) + off_policy_evaluate(env, agent, number_eval_episodes, record, step_counter) env.get_logger().info(f'*************--End Evaluation Loop--*************') @@ -122,8 +122,8 @@ def ppo_train(env, agent, memory, record, algorithm_config): max_steps_training = algorithm_config['max_steps_training'] max_steps_per_batch = algorithm_config['max_steps_per_batch'] - num_eps_evaluation = algorithm_config['evaluate_for_m_episodes'] - evaluate_every_n_steps = algorithm_config['evaluate_every_n_steps'] + number_eval_episodes = algorithm_config['number_eval_episodes'] + number_steps_per_evaluation = algorithm_config['number_steps_per_evaluation'] episode_timesteps = 0 episode_num = 0 @@ -156,7 +156,7 @@ def ppo_train(env, agent, memory, record, algorithm_config): experience['log_prob'] )) - if (step_counter+1) % evaluate_every_n_steps == 0: + if (step_counter+1) % number_steps_per_evaluation == 0: evaluate = True if done or truncated: @@ -172,7 +172,7 @@ def ppo_train(env, agent, memory, record, algorithm_config): evaluate = False env.get_logger().info(f'*************--Begin Evaluation Loop--*************') - ppo_evaluate(env, agent, num_eps_evaluation, record, step_counter) + ppo_evaluate(env, agent, number_eval_episodes, record, step_counter) env.get_logger().info(f'*************--End Evaluation Loop--*************') From 776c992c0036b44593c9f8f508aa9d10470bf096 Mon Sep 17 00:00:00 2001 From: retinfai Date: Wed, 25 Oct 2023 10:04:15 +1300 Subject: [PATCH 28/38] feat: add parse obs to CarBeat --- .../environments/CarBeatEnvironment.py | 26 +++++++++++++++++++ src/reinforcement_learning/config/train.yaml | 7 +++-- .../reinforcement_learning/training_loops.py | 5 ++++ 3 files changed, 36 
insertions(+), 2 deletions(-) diff --git a/src/environments/environments/CarBeatEnvironment.py b/src/environments/environments/CarBeatEnvironment.py index 64011d34..b552bfe8 100644 --- a/src/environments/environments/CarBeatEnvironment.py +++ b/src/environments/environments/CarBeatEnvironment.py @@ -365,4 +365,30 @@ def update_goal_service(self, x, y): rclpy.spin_until_future_complete(self, future) return future.result() + + # function that parses the state and returns a string that can be printed to the terminal + def parse_observation(self, observation): + string = f'CarBeat Observation: \n' + + if self.OBSERVATION_MODE == 'full': + string += f'Car Position: {observation[0:2]} \n' + string += f'Car Orientation: {observation[2:6]} \n' + string += f'Car Velocity: {observation[6]} \n' + string += f'Car Angular Velocity: {observation[7]} \n' + string += f'Car Lidar: {observation[8:]} \n' + elif self.OBSERVATION_MODE == 'no_position': + string += f'Car Orientation: {observation[:4]} \n' + string += f'Car Velocity: {observation[4]} \n' + string += f'Car Angular Velocity: {observation[5]} \n' + string += f'Car Lidar: {observation[6:]} \n' + elif self.OBSERVATION_MODE == 'lidar_only': + string += f'Car Velocity: {observation[0]} \n' + string += f'Car Angular Velocity: {observation[1]} \n' + string += f'Car Lidar: {observation[2:]} \n' + else: + raise ValueError(f'Invalid observation mode: {self.OBSERVATION_MODE}') + + return string + + \ No newline at end of file diff --git a/src/reinforcement_learning/config/train.yaml b/src/reinforcement_learning/config/train.yaml index bfefbafc..67d3cc76 100644 --- a/src/reinforcement_learning/config/train.yaml +++ b/src/reinforcement_learning/config/train.yaml @@ -2,5 +2,8 @@ train: ros__parameters: environment: 'CarBeat' # CarGoal, CarWall, CarBlock, CarTrack, CarBeat track: 'multi_track' # track_1, track_2, track_3, multi_track, multi_track_testing -> only applies for CarTrack - car_name: 'New_Car' - ftg_car_name: 'New_Car_two' + car_name: 'f1tenth' + algorithm: 'TD3' + g: 1 + buffer_size: 50000 + observation_mode: 'lidar_only' diff --git a/src/reinforcement_learning/reinforcement_learning/training_loops.py b/src/reinforcement_learning/reinforcement_learning/training_loops.py index 8a281e76..e7981767 100644 --- a/src/reinforcement_learning/reinforcement_learning/training_loops.py +++ b/src/reinforcement_learning/reinforcement_learning/training_loops.py @@ -19,6 +19,11 @@ def off_policy_train(env, agent, memory, record, algorithm_config): evaluate = False state, _ = env.reset() + + obs = env.parse_observation(state) + env.get_logger().info('-----------------------------------') + env.get_logger().info('\n' + obs) + env.get_logger().info('-----------------------------------') for step_counter in range(max_steps_training): episode_timesteps += 1 From bf67d370fe7192a833e9837a43b2c139cca45b86 Mon Sep 17 00:00:00 2001 From: retinfai Date: Wed, 25 Oct 2023 10:07:24 +1300 Subject: [PATCH 29/38] feat: add parse obs to CarBlock --- src/environments/environments/CarBlockEnvironment.py | 11 +++++++++++ .../reinforcement_learning/parse_args.py | 2 +- 2 files changed, 12 insertions(+), 1 deletion(-) diff --git a/src/environments/environments/CarBlockEnvironment.py b/src/environments/environments/CarBlockEnvironment.py index b2c3e650..df12b79a 100644 --- a/src/environments/environments/CarBlockEnvironment.py +++ b/src/environments/environments/CarBlockEnvironment.py @@ -114,5 +114,16 @@ def compute_reward(self, state, next_state): reward -= 25 # TODO: find optimal value 
for this return reward + + def parse_observation(self, observation): + string = f'CarBlock Observation\n' + string += f'Position: {observation[:2]}\n' + string += f'Orientation: {observation[2:6]}\n' + string += f'Car Velocity: {observation[6]}\n' + string += f'Car Angular Velocity: {observation[7]}\n' + string += f'Lidar: {observation[8:-2]}\n' + string += f'Goal Position: {observation[-2:]}\n' + + return string diff --git a/src/reinforcement_learning/reinforcement_learning/parse_args.py b/src/reinforcement_learning/reinforcement_learning/parse_args.py index 165db0e1..e16ca118 100644 --- a/src/reinforcement_learning/reinforcement_learning/parse_args.py +++ b/src/reinforcement_learning/reinforcement_learning/parse_args.py @@ -28,7 +28,7 @@ def __declare_params(): ('step_length', 0.25), ('reward_range', 0.2), ('collision_range', 0.2), - ('observation_mode', 'no_position'), + ('observation_mode', 'lidar_only'), ('max_goals', 500), ('num_lidar_points', 10), From f4922d890046f204e5bc52dc6512cb8f23a73b35 Mon Sep 17 00:00:00 2001 From: retinfai Date: Wed, 25 Oct 2023 10:10:52 +1300 Subject: [PATCH 30/38] feat: add parse obs to CarGoal --- src/environments/environments/CarGoalEnvironment.py | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/src/environments/environments/CarGoalEnvironment.py b/src/environments/environments/CarGoalEnvironment.py index 015d255b..9a6e78ff 100644 --- a/src/environments/environments/CarGoalEnvironment.py +++ b/src/environments/environments/CarGoalEnvironment.py @@ -101,5 +101,16 @@ def compute_reward(self, state, next_state): return reward + def parse_observation(self, observation): + + string = f'CarGoal Observation\n' + string += f'Position: {observation[:2]}\n' + string += f'Orientation: {observation[2:6]}\n' + string += f'Car Velocity: {observation[6]}\n' + string += f'Car Angular Velocity: {observation[7]}\n' + string += f'Goal Position: {observation[-2:]}\n' + + return string + \ No newline at end of file From 81540907c88239f880d4ab0743c479fcbc18278e Mon Sep 17 00:00:00 2001 From: retinfai Date: Wed, 25 Oct 2023 10:17:01 +1300 Subject: [PATCH 31/38] feat: add parse obs to CarTrack --- .../environments/CarTrackEnvironment.py | 23 +++++++++++++++++++ 1 file changed, 23 insertions(+) diff --git a/src/environments/environments/CarTrackEnvironment.py b/src/environments/environments/CarTrackEnvironment.py index 340ec1ac..42e1a1dc 100644 --- a/src/environments/environments/CarTrackEnvironment.py +++ b/src/environments/environments/CarTrackEnvironment.py @@ -208,3 +208,26 @@ def sleep(self): rclpy.spin_once(self) self.timer_future = Future() + + def parse_observation(self, observation): + + string = f'CarTrack Observation\n' + + match (self.observation_mode): + case 'no_position': + string += f'Orientation: {observation[:4]}\n' + string += f'Car Velocity: {observation[4]}\n' + string += f'Car Angular Velocity: {observation[5]}\n' + string += f'Lidar: {observation[6:]}\n' + case 'lidar_only': + string += f'Car Velocity: {observation[0]}\n' + string += f'Car Angular Velocity: {observation[1]}\n' + string += f'Lidar: {observation[2:]}\n' + case _: + string += f'Position: {observation[:2]}\n' + string += f'Orientation: {observation[2:6]}\n' + string += f'Car Velocity: {observation[6]}\n' + string += f'Car Angular Velocity: {observation[7]}\n' + string += f'Lidar: {observation[8:]}\n' + + return string From 712a2489eeeb25d3945b47e4a1ca5ae6b1ed114f Mon Sep 17 00:00:00 2001 From: retinfai Date: Wed, 25 Oct 2023 10:19:56 +1300 Subject: [PATCH 32/38] feat: add 
CarWall to parse obs --- src/environments/environments/CarWallEnvironment.py | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/src/environments/environments/CarWallEnvironment.py b/src/environments/environments/CarWallEnvironment.py index cd285cee..0fc0b8cc 100644 --- a/src/environments/environments/CarWallEnvironment.py +++ b/src/environments/environments/CarWallEnvironment.py @@ -111,4 +111,14 @@ def compute_reward(self, state, next_state): return reward - + def parse_observation(self, observation): + + string = f'CarWall Observation\n' + string += f'Position: {observation[:2]}\n' + string += f'Orientation: {observation[2:6]}\n' + string += f'Car Velocity: {observation[6]}\n' + string += f'Car Angular Velocity: {observation[7]}\n' + string += f'Lidar Points: {observation[8:-2]}\n' + string += f'Goal Position: {observation[-2:]}\n' + + return string From 575e27b3f1a10cad72327f539de2c1ed9d603ffc Mon Sep 17 00:00:00 2001 From: retinfai Date: Wed, 25 Oct 2023 10:23:47 +1300 Subject: [PATCH 33/38] feat: add obs log to ppo train --- .../reinforcement_learning/training_loops.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/src/reinforcement_learning/reinforcement_learning/training_loops.py b/src/reinforcement_learning/reinforcement_learning/training_loops.py index e7981767..9f91bc93 100644 --- a/src/reinforcement_learning/reinforcement_learning/training_loops.py +++ b/src/reinforcement_learning/reinforcement_learning/training_loops.py @@ -138,6 +138,11 @@ def ppo_train(env, agent, memory, record, algorithm_config): state, _ = env.reset() + obs = env.parse_observation(state) + env.get_logger().info('-----------------------------------') + env.get_logger().info('\n' + obs) + env.get_logger().info('-----------------------------------') + for step_counter in range(max_steps_training): episode_timesteps += 1 From 3ab45e6363f6676bd03794cb09c0a2807326bf29 Mon Sep 17 00:00:00 2001 From: retinfai Date: Wed, 25 Oct 2023 10:37:01 +1300 Subject: [PATCH 34/38] docs: add comments to CarBlock and CarBeat --- .../environments/CarBeatEnvironment.py | 26 +++++++++++++++++++ .../environments/CarBlockEnvironment.py | 23 +++++++++------- 2 files changed, 39 insertions(+), 10 deletions(-) diff --git a/src/environments/environments/CarBeatEnvironment.py b/src/environments/environments/CarBeatEnvironment.py index b552bfe8..4b2a468a 100644 --- a/src/environments/environments/CarBeatEnvironment.py +++ b/src/environments/environments/CarBeatEnvironment.py @@ -19,6 +19,32 @@ class CarBeatEnvironment(Node): + """ + CarBeat Reinforcement Learning Environment: + + Task: + Agent learns to drive a track and overtake a car that is driving at a constant speed. + The second car is using the Follow The Gap algorithm. 
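The initial-observation logging block added here to ppo_train duplicates the one added to off_policy_train in an earlier patch. A small shared helper would keep the two loops in sync; a sketch, where the helper name log_initial_observation is hypothetical and not part of the patches:

    # Hypothetical helper: factor out the duplicated logging of the first parsed
    # observation after env.reset() in off_policy_train and ppo_train.
    def log_initial_observation(env, state) -> None:
        divider = '-----------------------------------'
        env.get_logger().info(divider)
        env.get_logger().info('\n' + env.parse_observation(state))
        env.get_logger().info(divider)

Both loops would then call log_initial_observation(env, state) right after state, _ = env.reset().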
+ + Observation: + Car Velocity + Car Angular Velocity + Car Lidar + + Action: + It's linear and angular velocity (Twist) + + Reward: + +2 if it comes within REWARD_RANGE units of a goal + +200 if it overtakes the Follow The Gap car + -25 if it collides with a wall + + Termination Conditions: + When the agent collides with a wall or the Follow The Gap car + + Truncation Condition: + When the number of steps surpasses MAX_GOALS + """ def __init__(self, rl_car_name, ftg_car_name, diff --git a/src/environments/environments/CarBlockEnvironment.py b/src/environments/environments/CarBlockEnvironment.py index df12b79a..db890308 100644 --- a/src/environments/environments/CarBlockEnvironment.py +++ b/src/environments/environments/CarBlockEnvironment.py @@ -14,28 +14,31 @@ class CarBlockEnvironment(F1tenthEnvironment): """ - CarWall Reinforcement Learning Environment: + CarBlock Reinforcement Learning Environment: Task: - Here the agent learns to drive the f1tenth car to a goal position + Agent learns to navigate to a goal position while avoiding obstacles that are dynamically placed at the start of each episode Observation: - It's position (x, y), orientation (w, x, y, z), lidar points (approx. ~600 rays) and the goal's position (x, y) + Car Position (x, y) + Car Orientation (x, y, z, w) + Car Velocity + Car Angular Velocity + Lidar Data Action: - It's linear and angular velocity + Its linear and angular velocity (Twist) Reward: - It's progress toward the goal plus, - 100+ if it reaches the goal plus, - -50 if it collides with the wall + Its progress towards the goal * 10 + +100 if it reaches the goal + -25 if it collides with an obstacle or flips over Termination Conditions: - When the agent is within REWARD_RANGE units or, - When the agent is within COLLISION_RANGE units + When the agent collides with a wall or reaches the goal Truncation Condition: - When the number of steps surpasses MAX_STEPS + When the number of steps surpasses MAX_steps """ def __init__(self, car_name, reward_range=0.2, max_steps=50, collision_range=0.2, step_length=0.5): From e2dbcec758fb8cc714c969827297871ca585cb6a Mon Sep 17 00:00:00 2001 From: retinfai Date: Wed, 25 Oct 2023 10:37:26 +1300 Subject: [PATCH 35/38] docs: update CarBlock comments --- src/environments/environments/CarBlockEnvironment.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/environments/environments/CarBlockEnvironment.py b/src/environments/environments/CarBlockEnvironment.py index db890308..f09b6081 100644 --- a/src/environments/environments/CarBlockEnvironment.py +++ b/src/environments/environments/CarBlockEnvironment.py @@ -38,7 +38,7 @@ class CarBlockEnvironment(F1tenthEnvironment): When the agent collides with a wall or reaches the goal Truncation Condition: - When the number of steps surpasses MAX_steps + When the number of steps surpasses MAX_STEPS """ def __init__(self, car_name, reward_range=0.2, max_steps=50, collision_range=0.2, step_length=0.5): From c42de198ad49f9a41b3fa9a12b2493b61a9a379e Mon Sep 17 00:00:00 2001 From: retinfai Date: Wed, 25 Oct 2023 10:38:55 +1300 Subject: [PATCH 36/38] docs: add env description to CarGoal --- .../environments/CarGoalEnvironment.py | 20 ++++++++++--------- 1 file changed, 11 insertions(+), 9 deletions(-) diff --git a/src/environments/environments/CarGoalEnvironment.py b/src/environments/environments/CarGoalEnvironment.py index 9a6e78ff..2967afb4 100644 --- a/src/environments/environments/CarGoalEnvironment.py +++ b/src/environments/environments/CarGoalEnvironment.py @@ -8,26 
+8,28 @@ class CarGoalEnvironment(F1tenthEnvironment): """ - CarWall Reinforcement Learning Environment: + CarGoal Reinforcement Learning Environment: Task: Here the agent learns to drive the f1tenth car to a goal position. - This happens all within a 10x10 box + This happens in an open area. Observation: - It's position (x, y), orientation (w, x, y, z), lidar points (approx. ~600 rays) and the goal's position (x, y) + Car Position (x, y) + Car Orientation (x, y, z, w) + Car Velocity + Car Angular Velocity + Goal Position (x, y) Action: - It's linear and angular velocity + Its linear and angular velocity Reward: - It's progress toward the goal plus, - 100+ if it reaches the goal plus, - -50 if it collides with the wall + Its progress toward the goal plus, + +100 if it reaches the goal plus Termination Conditions: - When the agent is within REWARD_RANGE units of the goal or, - When the agent is within COLLISION_RANGE units of a wall + When the agent is within REWARD_RANGE units of the goal Truncation Condition: When the number of steps surpasses MAX_STEPS From b4e53dd6e6213c2102f4b2199a116a0b4d0a3375 Mon Sep 17 00:00:00 2001 From: retinfai Date: Wed, 25 Oct 2023 10:41:56 +1300 Subject: [PATCH 37/38] docs: add env description to CarTrack --- .../environments/CarBeatEnvironment.py | 20 ++++++++-- .../environments/CarTrackEnvironment.py | 37 +++++++++++++++++++ 2 files changed, 54 insertions(+), 3 deletions(-) diff --git a/src/environments/environments/CarBeatEnvironment.py b/src/environments/environments/CarBeatEnvironment.py index 4b2a468a..66e270b2 100644 --- a/src/environments/environments/CarBeatEnvironment.py +++ b/src/environments/environments/CarBeatEnvironment.py @@ -27,9 +27,23 @@ class CarBeatEnvironment(Node): The second car is using the Follow The Gap algorithm. Observation: - Car Velocity - Car Angular Velocity - Car Lidar + full: + Car Position (x, y) + Car Orientation (x, y, z, w) + Car Velocity + Car Angular Velocity + Lidar Data + no_position: + Car Orientation (x, y, z, w) + Car Velocity + Car Angular Velocity + Lidar Data + lidar_only: + Car Velocity + Car Angular Velocity + Lidar Data + + No. 
of lidar points is configurable + Action: It's linear and angular velocity (Twist) diff --git a/src/environments/environments/CarTrackEnvironment.py b/src/environments/environments/CarTrackEnvironment.py index 42e1a1dc..af539bb9 100644 --- a/src/environments/environments/CarTrackEnvironment.py +++ b/src/environments/environments/CarTrackEnvironment.py @@ -11,6 +11,43 @@ class CarTrackEnvironment(F1tenthEnvironment): + """ + CarTrack Reinforcement Learning Environment: + + Task: + Agent learns to drive a track + + Observation: + full: + Car Position (x, y) + Car Orientation (x, y, z, w) + Car Velocity + Car Angular Velocity + Lidar Data + no_position: + Car Orientation (x, y, z, w) + Car Velocity + Car Angular Velocity + Lidar Data + lidar_only: + Car Velocity + Car Angular Velocity + Lidar Data + + Action: + It's linear and angular velocity (Twist) + + Reward: + +2 if it comes within REWARD_RANGE units of a goal + -25 if it collides with a wall + + Termination Conditions: + When the agent collides with a wall + + Truncation Condition: + When the number of steps surpasses MAX_GOALS + """ + def __init__(self, car_name, reward_range=0.5, From f0c3879709a8ff1ab25a6f7250e25b4221918aac Mon Sep 17 00:00:00 2001 From: retinfai Date: Wed, 25 Oct 2023 10:42:45 +1300 Subject: [PATCH 38/38] docs: update CarWall environment --- src/environments/environments/CarWallEnvironment.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/src/environments/environments/CarWallEnvironment.py b/src/environments/environments/CarWallEnvironment.py index 0fc0b8cc..32fe1020 100644 --- a/src/environments/environments/CarWallEnvironment.py +++ b/src/environments/environments/CarWallEnvironment.py @@ -18,7 +18,12 @@ class CarWallEnvironment(F1tenthEnvironment): This happens all within a 10x10 box Observation: - It's position (x, y), orientation (w, x, y, z), lidar points (approx. ~600 rays) and the goal's position (x, y) + Car Position (x, y) + Car Orientation (x, y, z, w) + Car Velocity + Car Angular Velocity + Lidar Data + Goal Position (x, y) Action: It's linear and angular velocity
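The full / no_position / lidar_only modes documented for CarBeat and CarTrack determine the length of the state vector handed to the agent: the position contributes 2 values, the orientation quaternion 4, and the linear plus angular velocities 2, followed by the lidar points (CarWall, CarGoal and CarBlock additionally append a goal position, which is not covered here). A sketch of the implied sizes, assuming the 10-point lidar default used for CarBeat; the helper name is illustrative only:

    # Illustrative helper: observation-vector length implied by the documented
    # CarBeat/CarTrack observation modes. Assumes 10 lidar points, the CarBeat default.
    def observation_size(observation_mode: str, num_lidar_points: int = 10) -> int:
        base = {
            'full': 2 + 4 + 2,       # position + orientation quaternion + velocities
            'no_position': 4 + 2,    # orientation quaternion + velocities
            'lidar_only': 2,         # linear and angular velocity only
        }
        if observation_mode not in base:
            raise ValueError(f'Invalid observation mode: {observation_mode}')
        return base[observation_mode] + num_lidar_points

    # e.g. observation_size('lidar_only') == 12: velocity, angular velocity and
    # ten lidar ranges, matching the lidar_only parse_observation breakdown.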