From b6a52f35567bc72d896aef0bdff0d64e58b2d4bd Mon Sep 17 00:00:00 2001
From: maik97 <35041716+maik97@users.noreply.github.com>
Date: Fri, 17 Sep 2021 14:06:32 +0200
Subject: [PATCH] update

---
 _example_agents/ppo_single_model.py | 37 ++++++++++++++++++++++-------
 1 file changed, 29 insertions(+), 8 deletions(-)

diff --git a/_example_agents/ppo_single_model.py b/_example_agents/ppo_single_model.py
index 8456024..64b220e 100644
--- a/_example_agents/ppo_single_model.py
+++ b/_example_agents/ppo_single_model.py
@@ -24,12 +24,12 @@ def __init__(
         env,
         epochs=10,
         batch_size=64,
-        learning_rate=3e-45,
+        learning_rate=3e-4,
         clipnorm=0.5,
-        entropy_factor = 0.0,
-        hidden_units = 256,
-        hidden_activation = 'relu',
-        kernel_initializer: str = 'glorot_uniform',
+        entropy_factor=0.0,
+        hidden_units=64,
+        hidden_activation='relu',
+        kernel_initializer='glorot_uniform',
         logger=None,
         approximate_contin=False,
     ):
@@ -46,6 +46,8 @@ def __init__(
 
         self.reward_rmstd = RunningMeanStd()
 
+        kernel_initializer = tf.keras.initializers.Orthogonal()
+
         input_layer = Input(env.observation_space.shape)
         hidden_layer = Dense(hidden_units, activation=hidden_activation, kernel_initializer=kernel_initializer)(input_layer)
         hidden_layer = Dense(hidden_units, activation=hidden_activation, kernel_initializer=kernel_initializer)(hidden_layer)
@@ -126,8 +128,9 @@ def learn(self):
 
             sum_loss = a_loss + 0.5 * c_loss
 
-            grad = tape.gradient(sum_loss, self.model.trainable_variables)
-            self.optimizer.apply_gradients(zip(grad, self.model.trainable_variables))
+            if not tf.math.is_nan(sum_loss):
+                grad = tape.gradient(sum_loss, self.model.trainable_variables)
+                self.optimizer.apply_gradients(zip(grad, self.model.trainable_variables))
 
             #self.optimizer.minimize(sum_loss, self.model.trainable_variables, tape=tape)
 
@@ -141,6 +144,13 @@ def learn(self):
 
         self.memory.clear()
 
+    def save_model(self, path='test'):
+        self.model.save_weights(path)
+        #self.model.save(path)
+
+    def load_model(self, path='test'):
+        self.model.load_weights(path)
+
 
 
 def train_ppo():
@@ -151,7 +161,18 @@ def train_ppo():
     agent = PPO(env, logger=StatusPrinter('test'))
     trainer = Trainer(env, agent)
 
-    trainer.n_step_train(5_000_000, train_on_test=False)
+    trainer.n_step_train(5_000, train_on_test=False)
+    trainer.agent.save_model()
+    env.close()
+    del trainer
+    del agent
+    del env
+
+    env = gym.make("LunarLanderContinuous-v2")
+    agent = PPO(env, logger=StatusPrinter('test'))
+    trainer = Trainer(env, agent)
+    trainer.agent.load_model()
+    trainer.n_step_train(5_000, train_on_test=False)
     trainer.test(100)
 
 if __name__ == "__main__":
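
Usage reference for the new checkpointing methods introduced by this patch: a minimal sketch, assuming PPO, Trainer and StatusPrinter are importable as used in _example_agents/ppo_single_model.py (import path assumed), and using an invented checkpoint path 'ppo_lunar'; the methods themselves default to path='test'.

    import gym

    # Sketch only: PPO, Trainer, StatusPrinter as used in
    # _example_agents/ppo_single_model.py (import path assumed).
    env = gym.make("LunarLanderContinuous-v2")
    agent = PPO(env, logger=StatusPrinter('test'))
    trainer = Trainer(env, agent)
    trainer.n_step_train(5_000, train_on_test=False)
    agent.save_model(path='ppo_lunar')   # wraps self.model.save_weights(path)

    # Restore later: build an identically configured agent, then load the weights.
    env = gym.make("LunarLanderContinuous-v2")
    agent = PPO(env, logger=StatusPrinter('test'))
    agent.load_model(path='ppo_lunar')   # wraps self.model.load_weights(path)
    Trainer(env, agent).test(100)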