Commit

bugfix
L-ED committed Apr 15, 2024
1 parent d127170 commit 2e50245
Showing 9 changed files with 334 additions and 21 deletions.
47 changes: 34 additions & 13 deletions gym_pybullet_drones/envs/single_agent_rl/hover/HoverFullState.py
@@ -8,6 +8,7 @@
from gym_pybullet_drones.utils.state import State
from torch import sigmoid
from copy import deepcopy
import torch

class HoverFullState(BaseRL):

@@ -20,9 +21,18 @@ def __init__(
scene_objects=[],
visualize=False,
record=False,
realtime=False
realtime=False,
max_step=2000,
seed=42,
):

self.set_seed(seed)

self.elem_num = 15  # observation vector length (was 18)

self.max_step = max_step

self.max_g = 2*9.8
self.max_ang_vel = 2 #10
self.max_radius = 1
@@ -39,7 +49,7 @@ def __init__(
if drone is None:

sensors= [
FullState(1e3),
FullState(500),
mpu6000()
]

@@ -57,18 +67,24 @@
super().__init__(client, drone, control_system, logger, scene_objects, visualize, record, realtime)


def set_seed(self, seed):
np.random.seed(seed)
torch.manual_seed(seed)  # torch.random.seed() takes no argument; manual_seed sets the torch RNG


def normalize_observation_space(self):

self.observation_space = spaces.Box(
low=-1*np.ones((1, 12)),
high=np.ones((1, 12)),
low=-1*np.ones((1, self.elem_num)),
high=np.ones((1, self.elem_num)),
dtype=np.float32
)



def preprocess_action(self, action):
return self.drone.max_rpm/(1+np.exp(-action*3))
self.last_action = action.copy()
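# Squash the raw policy output with a sigmoid so the returned motor command stays in (0, max_rpm).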
return self.drone.max_rpm/(1+np.exp(-action))


def preprocess_observation(self, observation):
@@ -81,7 +97,8 @@ def preprocess_observation(self, observation):
targ_disp = self.target_pos - pos

stats = [
pos,
pos,
ang,
a_vel,
vel,
# imu[:3],
@@ -98,7 +115,8 @@ def preprocess_observation(self, observation):
# value = value/value_norm
# stats[i] = value

return np.concatenate(stats).reshape((1, 12))
return np.concatenate(stats).reshape((1, self.elem_num))
# return np.concatenate(stats).reshape((1, 12))


def check_termination(self):
@@ -112,6 +130,13 @@ def check_termination(self):
return term


def check_truncation(self):
trunc = False
if self.step_idx > self.max_step:
trunc = True
return trunc


def create_initial_state(self):
state = super().create_initial_state()
if self.randomize:
@@ -156,13 +181,9 @@ def reward(self):
closenes_reward +=1
dir_reward=1

# dir_reward = np.log10(0.1+np.abs(dir_reward-1))
# if dir_reward < 0:
# dir_reward *= np.linalg.norm(vel)

angles_reward = 0#-np.linalg.norm(state.world.ang_vel)
angles_reward = np.exp(-np.linalg.norm(state.world.ang_vel))

reward = dir_reward + angles_reward + closenes_reward
reward = dir_reward*0.1 + angles_reward*0.1 + closenes_reward
# reward = (1-displ_normalized)#dir_reward + angles_reward + closenes_reward

return reward
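
A minimal usage sketch of the updated environment, assuming the constructor arguments added in this commit (max_step, seed), the Gymnasium-style reset/step API used elsewhere in this diff, and that the environment exposes an action_space; the random-action loop is illustrative only:

from gym_pybullet_drones.envs.single_agent_rl import HoverFullState

# max_step and seed are the new constructor arguments from this commit
env = HoverFullState(max_step=2000, seed=42)

obs, _ = env.reset()
done = False
while not done:
    action = env.action_space.sample()  # stand-in for a trained policy
    obs, reward, terminated, truncated, info = env.step(action)
    done = terminated or truncated  # truncation fires once step_idx exceeds max_step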
4 changes: 2 additions & 2 deletions gym_pybullet_drones/examples/hover/hover_learn.py
@@ -34,7 +34,7 @@ def main(test=True):
verbose=1,
tensorboard_log=savedir,
# policy_kwargs=policy_kwargs
# n_steps=6000
# n_steps=10000
)

eval_callback = EvalCallback(env, best_model_save_path=savedir,
@@ -44,7 +44,7 @@
test_only=False
# test_only=True
if not test_only:
agent.learn(1000000, callback=eval_callback)
agent.learn(2000000, callback=eval_callback)
agent.save(savepath)
env = env_class(visualize=True)
# env.randomize = False
83 changes: 83 additions & 0 deletions gym_pybullet_drones/examples/hover/hover_learn_multienv.py
@@ -0,0 +1,83 @@
from stable_baselines3 import PPO, SAC, TD3
from gym_pybullet_drones.envs.single_agent_rl import HoverIMU, HoverGPS, HoverFullState
import time
import torch
import os
from stable_baselines3.common.callbacks import EvalCallback
from stable_baselines3.common.vec_env import SubprocVecEnv

def make_env(env_class, rank):

def init_():
env = env_class(seed=rank)
env.rank = rank
env.id = rank
return env

return init_


def main(test=True):

proc_num = 4

savedir = '/home/led/robotics/engines/Bullet_sym/gym-pybullet-drones/gym_pybullet_drones/results/hover/multienv'
savepath= os.path.join(
savedir,
'best_model'
# "best_model_ppo_longlearn"
# 'best_model_ppo_nonorm_imu_BEST'
# 'best_model_ppo_nonorm'
# 'best_model_random_noize'
)

trainer = PPO

env_class = HoverFullState
vec_env = SubprocVecEnv([make_env(env_class, i) for i in range(proc_num)])
eval_env = HoverFullState()

# env.randomize = False
agent = trainer(
'MlpPolicy',
env=vec_env,
verbose=0,
tensorboard_log=savedir,
# policy_kwargs=policy_kwargs
# n_steps=10000
)

eval_callback = EvalCallback(eval_env, best_model_save_path=savedir,
log_path=savedir, eval_freq=10000,
deterministic=True, render=False)


agent.learn(2000000, callback=eval_callback)
agent.save(savepath)

env = env_class(visualize=True)
# env.randomize = False
agent = trainer.load(savepath, env=env)

state, _=env.reset()
rew = 0
while test:
action, _ = agent.predict(
state.reshape(1,-1),
deterministic=True
)
state, reward, terminated, truncated, info = env.step(action)
# print(state, reward)
msg = f"POS {state[0, :3]} VEL{state[0, 6:9]}, ACC {state[0, 12:15]}"
print(msg)
rew+=reward

time.sleep(env.timestep)
if terminated or truncated:
print(rew)
rew=0
state, _=env.reset()


if __name__ == '__main__':
main()
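
Note: each subprocess environment is constructed with its rank as the seed (env_class(seed=rank)), so the parallel workers start from different random states, while the separate eval_env falls back to the default seed of 42 added in this commit.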
71 changes: 71 additions & 0 deletions gym_pybullet_drones/examples/hover/hover_learn_multiple.py
@@ -0,0 +1,71 @@
from stable_baselines3 import PPO, SAC, TD3
from gym_pybullet_drones.envs.single_agent_rl import HoverIMU, HoverGPS, HoverFullState
import time
import torch
import os
from stable_baselines3.common.callbacks import EvalCallback


def main(test=True):

ep_size = 2000
for buffer_size in [2000, 10000]:
for batch_size in [64, 512, 1024]:
# for ep_size in [2000]:

savedir = '/home/led/robotics/engines/Bullet_sym/gym-pybullet-drones/gym_pybullet_drones/results/hover/multiple'
savepath= os.path.join(
savedir,
'model_' + str(buffer_size) + '_' + str(ep_size)
)

trainer = PPO
env_class = HoverFullState
policy_kwargs = dict(net_arch=dict(pi=[64, 64], qf=[64, 64]))

env = env_class(max_step=ep_size)
# env.randomize = False
agent = trainer(
'MlpPolicy',
env=env,
verbose=1,
tensorboard_log=savedir,
# policy_kwargs=policy_kwargs
n_steps=buffer_size,
batch_size=batch_size
)

eval_callback = EvalCallback(env, best_model_save_path=savedir,
log_path=savedir, eval_freq=10000,
deterministic=True, render=False)

test_only=False
# test_only=True
agent.learn(1000000, callback=eval_callback)
agent.save(savepath)
# env = env_class(visualize=True)
# # env.randomize = False
# agent = trainer.load(savepath, env=env)

# state, _=env.reset()
# rew = 0
# while test:
# action, _ = agent.predict(
# state.reshape(1,-1),
# deterministic=True
# )
# state, reward, terminated, truncated, info = env.step(action)
# # print(state, reward)
# msg = f"POS {state[0, :3]} VEL{state[0, 6:9]}, ACC {state[0, 12:15]}"
# print(msg)
# rew+=reward

# time.sleep(env.timestep)
# if terminated or truncated:
# print(rew)
# rew=0
# state, _=env.reset()


if __name__=='__main__':
main()
@@ -0,0 +1,59 @@
from stable_baselines3 import PPO, SAC, TD3
from gym_pybullet_drones.envs.single_agent_rl import HoverIMU, HoverGPS, HoverFullState
import time
import torch
import os
import multiprocessing as mp
from stable_baselines3.common.callbacks import EvalCallback


def run_exp(buffer_size):

batch_size = 128
# for model_scale in [1, 2]:

savedir = os.path.join(
'/home/led/robotics/engines/Bullet_sym/gym-pybullet-drones/gym_pybullet_drones/results/hover/multiple',
str(buffer_size)#, str(batch_size),
)
for i in range(10):
savepath= os.path.join(
savedir,
'model_'+str(i)
)

trainer = PPO
env_class = HoverFullState
policy_kwargs = dict(net_arch=dict(pi=[64, 64], qf=[64, 64]))

env = env_class()
# env.randomize = False
agent = trainer(
'MlpPolicy',
env=env,
verbose=1,
tensorboard_log=savedir,
# policy_kwargs=policy_kwargs
n_steps=buffer_size,
# batch_size=batch_size
)

eval_callback = EvalCallback(env, best_model_save_path=savedir,
log_path=savedir, eval_freq=20000,
deterministic=True, render=False)

test_only=False
# test_only=True
agent.learn(3000000, callback=eval_callback)
agent.save(savepath)


def main():
args = [4000, 8000, 16000]# buffer size

with mp.Pool(3) as p:
p.map(run_exp, args)


if __name__=='__main__':
main()
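
Note: this script sweeps the PPO rollout-buffer size (n_steps in {4000, 8000, 16000}) across three worker processes via multiprocessing.Pool, training ten models per setting; batch_size is defined but left commented out in the trainer call, so the library default applies.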
12 changes: 7 additions & 5 deletions gym_pybullet_drones/examples/hover/hover_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,9 @@ def main(test=True):
savedir = '/home/led/robotics/engines/Bullet_sym/gym-pybullet-drones/gym_pybullet_drones/results/hover'
savepath= os.path.join(
savedir,
'best_model'
'/home/led/robotics/engines/Bullet_sym/gym-pybullet-drones/gym_pybullet_drones/results/hover/multiple/model_2000_2000.zip'
# 'best'
# 'best_model'
# "best_model_ppo_longlearn"
# 'best_model_ppo_nonorm_imu_BEST'
# 'best_model_ppo_nonorm'
@@ -27,7 +29,7 @@
policy_kwargs = dict(net_arch=dict(pi=[64, 64], qf=[64, 64]))

env = env_class(visualize=True)
env.randomize = False
# env.randomize = False
agent = trainer.load(savepath, env=env)

state, _=env.reset()
@@ -38,10 +40,10 @@
deterministic=True
)
state, reward, terminated, truncated, info = env.step(action)
# print(state, reward)
print(reward, action)

msg = f"POS {state[0, :3]} VEL{state[0, 6:9]}, ACC {state[0, 12:15]}"
print(msg)
# msg = f"POS {state[0, :3]} VEL{state[0, 6:9]}, ACC {state[0, 12:15]}"
# print(msg)
rew+=reward

time.sleep(env.timestep)
(Diffs for the remaining changed files are not shown.)