Commit 2aaa4f3
update
maik97 committed Sep 17, 2021
1 parent 10ad212 commit 2aaa4f3
Showing 6 changed files with 70 additions and 21 deletions.
2 changes: 1 addition & 1 deletion _example_agents/ppo_seperate_models.py
@@ -125,7 +125,7 @@ def learn(self):
#self.memory.pop_array('adv')
return np.mean(losses)

def compare_with_old_policy(self, test_reward):
def test_compare_with_old_policy(self, test_reward):
if self.old_test_reward is None:
self.old_test_reward = test_reward
self.old_weights = self.actor.get_weights()
70 changes: 56 additions & 14 deletions _example_agents/ppo_single_model.py
@@ -13,31 +13,54 @@

from wacky_rl.transform import RunningMeanStd

from wacky_rl.logger import StatusPrinter


class PPO(AgentCore):


def __init__(self, env, approximate_contin=False):
def __init__(
self,
env,
epochs=10,
batch_size=64,
learning_rate=3e-5,
clipnorm=0.5,
entropy_factor = 0.0,
hidden_units = 256,
hidden_activation = 'relu',
kernel_initializer: str = 'glorot_uniform',
logger=None,
approximate_contin=False,
):
super(PPO, self).__init__()

self.epochs = epochs
self.batch_size = batch_size

self.logger = logger

self.approximate_contin = approximate_contin
self.memory = BufferMemory()
self.advantage_and_returns = GAE()

self.reward_rmstd = RunningMeanStd()

input_layer = Input(env.observation_space.shape)
hidden_layer = Dense(256, activation='relu')(input_layer)
hidden_layer = Dense(256, activation='relu')(hidden_layer)
hidden_layer = Dense(hidden_units, activation=hidden_activation, kernel_initializer=kernel_initializer)(input_layer)
hidden_layer = Dense(hidden_units, activation=hidden_activation, kernel_initializer=kernel_initializer)(hidden_layer)

action_layer = self.make_action_layer(env, approx_contin=approximate_contin)(hidden_layer)
critic_layer = Dense(1)(hidden_layer)
action_layer = self.make_action_layer(env, approx_contin=approximate_contin, kernel_initializer=kernel_initializer)(hidden_layer)
critic_layer = Dense(1, kernel_initializer=kernel_initializer)(hidden_layer)

self.model = WackyModel(inputs=input_layer, outputs=[action_layer, critic_layer])

self.actor_loss = PPOActorLoss(entropy_factor=0.0)
self.actor_loss = PPOActorLoss(entropy_factor=entropy_factor)
self.critic_loss = MeanSquaredErrorLoss()
self.optimizer = tf.keras.optimizers.Adam(3e-5, clipnorm=0.5)
if clipnorm is None:
self.optimizer = tf.keras.optimizers.Adam(learning_rate)
else:
self.optimizer = tf.keras.optimizers.Adam(learning_rate, clipnorm=clipnorm)

def act(self, inputs, act_argmax=False, save_memories=True):

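The rewritten constructor exposes the previously hard-coded hyperparameters (learning rate, gradient clipping, entropy bonus, hidden-layer width, activation and initializer, plus an optional logger) as arguments, and only passes clipnorm to Adam when it is not None. A minimal usage sketch follows, assuming the module's import path and an arbitrary logger name; the keyword defaults mirror the diff. Note that learn() further down still hard-codes range(10) and batch_size=64 rather than reading self.epochs and self.batch_size.

# Minimal usage sketch (assumed import path and logger name); defaults mirror the diff.
import gym

from wacky_rl.logger import StatusPrinter
from _example_agents.ppo_single_model import PPO  # assumed path to this example file

env = gym.make("LunarLanderContinuous-v2")
agent = PPO(
    env,
    epochs=10,               # stored as self.epochs
    batch_size=64,           # stored as self.batch_size
    learning_rate=3e-5,
    clipnorm=0.5,            # pass None to build Adam without gradient clipping
    entropy_factor=0.0,      # forwarded to PPOActorLoss
    hidden_units=256,
    hidden_activation='relu',
    logger=StatusPrinter('ppo'),
)
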
@@ -48,9 +71,16 @@ def act(self, inputs, act_argmax=False, save_memories=True):
else:
actions = dist.sample_actions()

probs = dist.calc_probs(actions)

if act_argmax:
self.logger.log_mean('argmax probs', probs)
else:
self.logger.log_mean('probs', probs)

if save_memories:
self.memory(actions, key='actions')
self.memory(dist.calc_probs(actions), key='probs')
self.memory(probs, key='probs')
self.memory(val, key='val')

return self.transform_actions(dist, actions)
@@ -65,15 +95,20 @@ def learn(self):
_ , next_value = self.model(tf.expand_dims(new_states[-1], 0))
adv, ret = self.advantage_and_returns(rewards, dones, values, next_value)

self.logger.log_mean('values', np.mean(np.append(values, next_value)))
self.logger.log_mean('adv', np.mean(adv.numpy()))
self.logger.log_mean('ret', np.mean(ret.numpy()))


for i in range(len(adv)):
self.memory(adv[i], key='adv')
self.memory(ret[i], key='ret')

a_loss_list = []
c_loss_list = []
sum_loss_list = []

for e in range(4):
for e in range(10):

for mini_batch in self.memory.mini_batches(batch_size=64, num_batches=None, shuffle_batches=False):

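learn() feeds the collected rewards, done flags and value estimates (plus a bootstrap value for the last state) into the GAE transform and stores the resulting per-step advantages and returns back in the buffer. The GAE implementation itself is not part of this diff; the following is a sketch of the standard GAE(lambda) recursion it presumably follows, with assumed discount and lambda values.

import numpy as np

def gae_advantages_and_returns(rewards, dones, values, next_value,
                               gamma=0.99, lam=0.95):
    # Sketch of the usual GAE(lambda) recursion; wacky_rl's actual GAE() is not
    # shown in this diff, and gamma/lam here are assumed values.
    values = np.append(np.asarray(values, dtype=np.float32), next_value)
    adv = np.zeros(len(rewards), dtype=np.float32)
    last_adv = 0.0
    for t in reversed(range(len(rewards))):
        non_terminal = 1.0 - float(dones[t])
        delta = rewards[t] + gamma * values[t + 1] * non_terminal - values[t]
        last_adv = delta + gamma * lam * non_terminal * last_adv
        adv[t] = last_adv
    returns = adv + values[:-1]   # critic targets, as stored under key 'ret'
    return adv, returns
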
@@ -91,13 +126,20 @@

sum_loss = a_loss + 0.5 * c_loss

self.optimizer.minimize(sum_loss, self.model.trainable_variables, tape=tape)
grad = tape.gradient(sum_loss, self.model.trainable_variables)
self.optimizer.apply_gradients(zip(grad, self.model.trainable_variables))

#self.optimizer.minimize(sum_loss, self.model.trainable_variables, tape=tape)

a_loss_list.append(a_loss.numpy())
c_loss_list.append(c_loss.numpy())
sum_loss_list.append(sum_loss.numpy())

a_loss_list.append(tf.reduce_mean(a_loss).numpy())
c_loss_list.append(tf.reduce_mean(c_loss).numpy())
self.logger.log_mean('actor_loss', np.mean(a_loss_list))
self.logger.log_mean('critic_loss', np.mean(c_loss_list))
self.logger.log_mean('sum_loss', np.mean(sum_loss_list))

self.memory.clear()
return np.mean(a_loss_list), np.mean(c_loss_list)


def train_ppo():
@@ -106,7 +148,7 @@ def train_ppo():
#env = gym.make('CartPole-v0')
env = gym.make("LunarLanderContinuous-v2")

agent = PPO(env)
agent = PPO(env, logger=StatusPrinter('test'))

trainer = Trainer(env, agent)
trainer.n_step_train(5_000_000, train_on_test=False)
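The weight update in learn() switches from optimizer.minimize(sum_loss, ..., tape=tape) to computing the gradients explicitly with tape.gradient and applying them with apply_gradients; for a plain Adam step the two are equivalent, but the explicit form exposes the gradients for inspection or post-processing. A self-contained sketch of the pattern with a toy variable and loss:

import tensorflow as tf

# Toy variable and loss, standing in for the model weights and a_loss + 0.5 * c_loss.
w = tf.Variable([1.0, -2.0])
optimizer = tf.keras.optimizers.Adam(3e-5, clipnorm=0.5)

with tf.GradientTape() as tape:
    loss = tf.reduce_mean(tf.square(w))

# Explicit form used after this commit:
grads = tape.gradient(loss, [w])
optimizer.apply_gradients(zip(grads, [w]))

# Equivalent single call used before (left commented out in the diff):
# optimizer.minimize(loss, [w], tape=tape)
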
4 changes: 2 additions & 2 deletions setup.py
@@ -6,13 +6,13 @@
setup(
name = 'wacky-rl',
packages=find_packages(),
version = '0.0.5',
version = '0.0.6',
license='MIT',
description = 'Create custom reinforcement learning agents with wacky-rl.',
author = 'Maik Schürmann',
author_email = 'maik.schuermann97@gmail.com',
url = 'https://github.com/maik97/wacky-rl',
download_url = 'https://github.com/maik97/wacky-rl/archive/refs/tags/v0.0.5.tar.gz',
download_url = 'https://github.com/maik97/wacky-rl/archive/refs/tags/v0.0.6.tar.gz',
keywords = ['rl', 'actor_critic', 'reinforcement-learning', 'ppo', 'a2c', 'sac', 'dqn', 'keras', 'tensorflow', 'python'],
install_requires=[
'tensorflow',
2 changes: 2 additions & 0 deletions wacky_rl/agents/_agent_core.py
@@ -67,6 +67,8 @@ def make_action_layer(self, env, num_bins=21, num_actions=None, approx_contin=Fa
self.out_format = float
return layers.ContinActionLayer(num_actions=num_actions, *args, **kwargs)

def compare_with_old_policy(self, test_reward):
pass

def transform_actions(self, dist, actions, lows=None, highs=None, scale=1.0):

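AgentCore now provides compare_with_old_policy as a no-op default, so the trainer (see the trainer.py hunk below) can call the hook on any agent without checking whether it is implemented; an agent that wants the behaviour overrides it. A minimal sketch of the pattern, with an illustrative subclass that is not part of the repository:

class AgentCore:
    def compare_with_old_policy(self, test_reward):
        pass  # default hook: do nothing

class MyAgent(AgentCore):
    # Illustrative subclass, not part of wacky-rl.
    def __init__(self):
        self.best_reward = None

    def compare_with_old_policy(self, test_reward):
        # hypothetical override: remember the best test reward seen so far
        if self.best_reward is None or test_reward > self.best_reward:
            self.best_reward = test_reward

AgentCore().compare_with_old_policy(42.0)  # safe no-op for agents without an override
MyAgent().compare_with_old_policy(42.0)    # overridden behaviour
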
7 changes: 5 additions & 2 deletions wacky_rl/logger.py
@@ -74,10 +74,13 @@ def log_mean(self, tag, value):
else:
self.mean_per_episode_dict[tag] = [value]

def print_status(self, episode):
def print_status(self, steps=None, episode=None):

print('\nname:', self.name)
print('Episode:', str(episode))
if not episode is None:
print('episode:', str(episode))
if not steps is None:
print('steps:', str(steps))
for key in self.mean_per_episode_dict.keys():
if len(self.mean_per_episode_dict[key]) > 1:
self.mean_per_episode_dict[key] = np.mean(self.mean_per_episode_dict[key])
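print_status now takes steps and/or episode as keyword arguments and only prints the counters that were passed. A short usage sketch, assuming an arbitrary logger name and logged values; the constructor call matches StatusPrinter('test') in train_ppo() above:

from wacky_rl.logger import StatusPrinter

logger = StatusPrinter('ppo')          # assumed name; constructed as in train_ppo()
logger.log_mean('sum reward', 123.0)   # values accumulated per episode
logger.log_mean('sum reward', 131.5)

# Only the counters that are passed get printed alongside the accumulated means.
logger.print_status(steps=10_000)
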
6 changes: 4 additions & 2 deletions wacky_rl/trainer/trainer.py
@@ -123,7 +123,8 @@ def n_step_train(
if s >= train_after:
train_after += n_steps
self.agent.learn()
self.logger.log_mean('sum reward', np.round(np.mean(episode_reward_list)))
if not self.logger is None:
self.logger.log_mean('sum reward', np.round(np.mean(episode_reward_list)))
self.agent.compare_with_old_policy(np.mean(episode_reward_list))
episode_reward_list = []

@@ -135,7 +136,8 @@ def n_step_train(
reward_list.append(r)

if done:
self.logger.log_mean('test reward', np.round(sum(reward_list), 1))
if not self.logger is None:
self.logger.log_mean('test reward', np.round(sum(reward_list), 1))
obs = self.env.reset()
reward_list = []

