
Commit ee42b74

parametrized reward clipping
1 parent 887f598 commit ee42b74

File tree

5 files changed: +83 −69 lines changed


.idea/workspace.xml

+69 −61
Some generated files are not rendered by default.

atari.py

+11 −5
@@ -1,5 +1,6 @@
 import gym
 import argparse
+import numpy as np
 import atari_py
 from game_models.ddqn_game_model import DDQNTrainer, DDQNSolver
 from game_models.ge_game_model import GETrainer, GESolver
@@ -13,12 +14,12 @@
 class Atari:
 
     def __init__(self):
-        game_name, game_mode, render, total_step_limit, total_run_limit = self._args()
+        game_name, game_mode, render, total_step_limit, total_run_limit, clip = self._args()
         env_name = game_name + "Deterministic-v4" # Handles frame skipping (4) at every iteration
         env = MainGymWrapper.wrap(gym.make(env_name))
-        self._main_loop(self._game_model(game_mode, game_name, env.action_space.n), env, render, total_step_limit, total_run_limit)
+        self._main_loop(self._game_model(game_mode, game_name, env.action_space.n), env, render, total_step_limit, total_run_limit, clip)
 
-    def _main_loop(self, game_model, env, render, total_step_limit, total_run_limit):
+    def _main_loop(self, game_model, env, render, total_step_limit, total_run_limit, clip):
         run = 0
         total_step = 0
         while True:
@@ -42,6 +43,8 @@ def _main_loop(self, game_model, env, render, total_step_limit, total_run_limit)
 
                 action = game_model.move(current_state)
                 next_state, reward, terminal, info = env.step(action)
+                if clip:
+                    np.sign(reward)
                 score += reward
                 game_model.remember(current_state, action, reward, next_state, terminal)
                 current_state = next_state
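As written, the new clip branch evaluates np.sign(reward) but never assigns the result back, so score, replay memory and training all still see the raw reward. A minimal, self-contained sketch of the intended clipping step; the clip_reward helper below is hypothetical and not part of this commit:

import numpy as np

def clip_reward(reward, clip=True):
    # Map any raw Atari reward onto -1.0, 0.0 or +1.0 when clipping is enabled.
    # np.sign returns a new value, so it must be assigned back
    # (reward = np.sign(reward)) for the clipped value to take effect.
    return float(np.sign(reward)) if clip else reward

# A raw reward of 7 becomes 1.0 with clipping on and stays 7 with clipping off.
assert clip_reward(7) == 1.0
assert clip_reward(-3) == -1.0
assert clip_reward(7, clip=False) == 7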
@@ -60,18 +63,21 @@ def _args(self):
         parser.add_argument("-r", "--render", help="Choose if the game should be rendered. Default is 'False'.", default=False)
         parser.add_argument("-tsl", "--total_step_limit", help="Choose how many total steps (frames visible by agent) should be performed. Default is '10000000'.", default=10000000)
         parser.add_argument("-trl", "--total_run_limit", help="Choose after how many runs should we stop. Default is None (no limit).", default=None)
+        parser.add_argument("-c", "--clip", help="Choose whether we should clip rewards to (0, 1) range. Default is 'True'", default=True)
         args = parser.parse_args()
         game_mode = args.mode
         game_name = args.game
         render = args.render
         total_step_limit = args.total_step_limit
-        total_run_limit = args.run_limit
+        total_run_limit = args.total_run_limit
+        clip = args.clip
         print "Selected game: " + str(game_name)
         print "Selected mode: " + str(game_mode)
         print "Should render: " + str(render)
+        print "Should clip: " + str(clip)
         print "Total step limit: " + str(total_step_limit)
         print "Total run limit: " + str(total_run_limit)
-        return game_name, game_mode, render, total_step_limit, total_run_limit
+        return game_name, game_mode, render, total_step_limit, total_run_limit, clip
 
     def _game_model(self, game_mode,game_name, action_space):
         if game_mode == "ddqn_training":
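A note on the new --clip option: it is declared without a type, so a value passed on the command line, for example --clip False, arrives as the string "False", which is truthy in Python; default=True only applies when the flag is omitted. A hedged sketch of one common way to parse a boolean option; the str2bool helper is illustrative and not from this repository:

import argparse

def str2bool(value):
    # argparse hands values over as strings, so map the usual spellings
    # of true/false onto real booleans.
    if isinstance(value, bool):
        return value
    if value.lower() in ("yes", "true", "t", "1"):
        return True
    if value.lower() in ("no", "false", "f", "0"):
        return False
    raise argparse.ArgumentTypeError("Boolean value expected, got: " + value)

parser = argparse.ArgumentParser()
parser.add_argument("-c", "--clip", type=str2bool, default=True,
                    help="Clip rewards with np.sign. Default is True.")

print(parser.parse_args(["--clip", "false"]).clip)  # False, not the truthy string "false"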

game_models/ddqn_game_model.py

+1 −1
@@ -16,7 +16,7 @@
 
 EXPLORATION_MAX = 1.0
 EXPLORATION_MIN = 0.1
-EXPLORATION_TEST = 0.02
+EXPLORATION_TEST = 0.01
 EXPLORATION_STEPS = 850000
 EXPLORATION_DECAY = (EXPLORATION_MAX-EXPLORATION_MIN)/EXPLORATION_STEPS
 
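For context, these constants define a linear epsilon-greedy schedule, and EXPLORATION_TEST is the small fixed epsilon used when evaluating a trained solver. A hedged sketch of how such a schedule is typically computed; the epsilon_at function is illustrative, not code from the repository:

EXPLORATION_MAX = 1.0
EXPLORATION_MIN = 0.1
EXPLORATION_TEST = 0.01
EXPLORATION_STEPS = 850000
EXPLORATION_DECAY = (EXPLORATION_MAX - EXPLORATION_MIN) / EXPLORATION_STEPS

def epsilon_at(step, testing=False):
    # Small fixed exploration while testing; linear anneal from 1.0 to 0.1 while training.
    if testing:
        return EXPLORATION_TEST
    return max(EXPLORATION_MIN, EXPLORATION_MAX - EXPLORATION_DECAY * step)

assert epsilon_at(0) == EXPLORATION_MAX
assert abs(epsilon_at(EXPLORATION_STEPS) - EXPLORATION_MIN) < 1e-9
assert epsilon_at(12345, testing=True) == EXPLORATION_TEST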
gym_wrappers.py

+1 −1
@@ -191,5 +191,5 @@ def wrap(env):
         env = ProcessFrame84(env)
         env = ChannelsFirstImageShape(env)
         env = FrameStack(env, 4)
-        env = ClippedRewardsWrapper(env)
+        # env = ClippedRewardsWrapper(env)
         return env
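Commenting out ClippedRewardsWrapper in wrap() moves the clipping decision from the wrapper stack to the new --clip flag in atari.py. For reference, a reward-clipping wrapper of this kind is usually a thin gym.RewardWrapper; a sketch under that assumption, with a class name and body that are illustrative rather than the repository's own implementation:

import gym
import numpy as np

class SignClippedRewards(gym.RewardWrapper):
    # Standard Atari preprocessing: squash every reward to -1, 0 or +1 so that one
    # set of DQN hyperparameters transfers across games with very different score scales.
    def reward(self, reward):
        return np.sign(reward)

# Usage: env = SignClippedRewards(gym.make("BreakoutDeterministic-v4"))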

logger.py

+1 −1
@@ -95,7 +95,7 @@ def _save_png(self, input_path, output_path, small_batch_length, big_batch_lengt
                 batch_averages_y.append(mean(temp_values_in_batch))
                 batch_averages_x.append(len(batch_averages_y)*big_batch_length)
                 temp_values_in_batch = []
-        if batch_averages_x and batch_averages_y:
+        if len(batch_averages_x) > 1:
             plt.plot(batch_averages_x, batch_averages_y, linestyle="--", label="last " + str(big_batch_length) + " average")
 
         if len(x) > 1:
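The new guard mirrors the len(x) > 1 check on the following line: matplotlib accepts a single point, but a dashed line through one point is not visible, so the rolling-average series is only drawn once at least two batch averages exist. A small standalone sketch of that guard; the function name is hypothetical:

import matplotlib
matplotlib.use("Agg")  # headless backend, since a logger normally runs without a display
import matplotlib.pyplot as plt

def maybe_plot_average(batch_averages_x, batch_averages_y, big_batch_length):
    # Only draw the dashed rolling-average line once two averaged points exist.
    if len(batch_averages_x) > 1:
        plt.plot(batch_averages_x, batch_averages_y, linestyle="--",
                 label="last " + str(big_batch_length) + " average")

maybe_plot_average([100], [12.0], 100)             # skipped: a single point would not show
maybe_plot_average([100, 200], [12.0, 15.5], 100)  # drawn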

0 commit comments
