diff --git a/deep_q_rl/launcher.py b/deep_q_rl/launcher.py
index c136f01..e3df92a 100755
--- a/deep_q_rl/launcher.py
+++ b/deep_q_rl/launcher.py
@@ -138,6 +138,10 @@ def process_args(args, defaults, description):
                         type=bool, default=defaults.CUDNN_DETERMINISTIC,
                         help=('Whether to use deterministic backprop. ' +
                               '(default: %(default)s)'))
+    parser.add_argument('--use_double', dest="use_double",
+                        type=bool, default=defaults.USE_DOUBLE,
+                        help=('Whether to use Double DQN. ' +
+                              '(default: %(default)s)'))
 
     parameters = parser.parse_args(args)
     if parameters.experiment_prefix is None:
@@ -216,6 +220,7 @@ def launch(args, defaults, description):
                                     parameters.momentum,
                                     parameters.clip_delta,
                                     parameters.freeze_interval,
+                                    parameters.use_double,
                                     parameters.batch_size,
                                     parameters.network_type,
                                     parameters.update_rule,
diff --git a/deep_q_rl/q_network.py b/deep_q_rl/q_network.py
index 0fa360b..95fef35 100644
--- a/deep_q_rl/q_network.py
+++ b/deep_q_rl/q_network.py
@@ -28,7 +28,7 @@ class DeepQLearner:
     def __init__(self, input_width, input_height, num_actions,
                  num_frames, discount, learning_rate, rho,
                  rms_epsilon, momentum, clip_delta, freeze_interval,
-                 batch_size, network_type, update_rule,
+                 use_double, batch_size, network_type, update_rule,
                  batch_accumulator, rng, input_scale=255.0):
 
         self.input_width = input_width
@@ -43,8 +43,13 @@ def __init__(self, input_width, input_height, num_actions,
         self.momentum = momentum
         self.clip_delta = clip_delta
         self.freeze_interval = freeze_interval
+        self.use_double = use_double
         self.rng = rng
 
+        # Using Double DQN is pointless without periodic freezing
+        if self.use_double:
+            assert self.freeze_interval > 0
+
         lasagne.random.set_rng(self.rng)
 
         self.update_counter = 0
@@ -93,9 +98,16 @@ def __init__(self, input_width, input_height, num_actions,
                                                     next_states / input_scale)
             next_q_vals = theano.gradient.disconnected_grad(next_q_vals)
 
-        target = (rewards +
-                  (T.ones_like(terminals) - terminals) *
-                  self.discount * T.max(next_q_vals, axis=1, keepdims=True))
+        if self.use_double:
+            maxaction = T.argmax(q_vals, axis=1, keepdims=False)
+            temptargets = next_q_vals[T.arange(batch_size), maxaction].reshape((-1, 1))
+            target = (rewards +
+                      (T.ones_like(terminals) - terminals) *
+                      self.discount * temptargets)
+        else:
+            target = (rewards +
+                      (T.ones_like(terminals) - terminals) *
+                      self.discount * T.max(next_q_vals, axis=1, keepdims=True))
         diff = target - q_vals[T.arange(batch_size),
                                actions.reshape((-1,))].reshape((-1, 1))
 
@@ -476,7 +488,7 @@ def build_linear_network(self, input_width, input_height, output_dim,
         return l_out
 
 def main():
-    net = DeepQLearner(84, 84, 16, 4, .99, .00025, .95, .95, 10000,
+    net = DeepQLearner(84, 84, 16, 4, .99, .00025, .95, .95, 10000, False,
                        32, 'nature_cuda')
diff --git a/deep_q_rl/run_double.py b/deep_q_rl/run_double.py
new file mode 100755
index 0000000..2bce656
--- /dev/null
+++ b/deep_q_rl/run_double.py
@@ -0,0 +1,66 @@
+#! /usr/bin/env python
+"""
+Execute a training run of deep-Q-Learning with parameters that
+are consistent with:
+
+Deep Reinforcement Learning with Double Q-learning.
+arXiv preprint arXiv:1509.06461.
+
+"""
+
+import launcher
+import sys
+
+class Defaults:
+    # ----------------------
+    # Experiment Parameters
+    # ----------------------
+    STEPS_PER_EPOCH = 250000
+    EPOCHS = 200
+    STEPS_PER_TEST = 125000
+
+    # ----------------------
+    # ALE Parameters
+    # ----------------------
+    BASE_ROM_PATH = "../roms/"
+    ROM = 'breakout.bin'
+    FRAME_SKIP = 4
+    REPEAT_ACTION_PROBABILITY = 0
+
+    # ----------------------
+    # Agent/Network parameters:
+    # ----------------------
+    UPDATE_RULE = 'deepmind_rmsprop'
+    BATCH_ACCUMULATOR = 'sum'
+    LEARNING_RATE = .00025
+    DISCOUNT = .99
+    RMS_DECAY = .95 # (Rho)
+    RMS_EPSILON = .01
+    MOMENTUM = 0 # Note that the "momentum" value mentioned in the Nature
+                 # paper is not used in the same way as a traditional momentum
+                 # term. It is used to track gradient for the purpose of
+                 # estimating the standard deviation. This package uses
+                 # rho/RMS_DECAY to track both the history of the gradient
+                 # and the squared gradient.
+    CLIP_DELTA = 1.0
+    EPSILON_START = 1.0
+    EPSILON_MIN = .1
+    EPSILON_DECAY = 1000000
+    PHI_LENGTH = 4
+    UPDATE_FREQUENCY = 4
+    REPLAY_MEMORY_SIZE = 1000000
+    BATCH_SIZE = 32
+    NETWORK_TYPE = "nature_dnn"
+    FREEZE_INTERVAL = 10000
+    REPLAY_START_SIZE = 50000
+    RESIZE_METHOD = 'scale'
+    RESIZED_WIDTH = 84
+    RESIZED_HEIGHT = 84
+    DEATH_ENDS_EPISODE = 'true'
+    MAX_START_NULLOPS = 30
+    DETERMINISTIC = True
+    CUDNN_DETERMINISTIC = False
+    USE_DOUBLE = True
+
+if __name__ == "__main__":
+    launcher.launch(sys.argv[1:], Defaults, __doc__)
diff --git a/deep_q_rl/run_nature.py b/deep_q_rl/run_nature.py
index 2da46bc..8199546 100755
--- a/deep_q_rl/run_nature.py
+++ b/deep_q_rl/run_nature.py
@@ -60,6 +60,7 @@ class Defaults:
     MAX_START_NULLOPS = 30
     DETERMINISTIC = True
     CUDNN_DETERMINISTIC = False
+    USE_DOUBLE = False
 
 if __name__ == "__main__":
     launcher.launch(sys.argv[1:], Defaults, __doc__)
diff --git a/deep_q_rl/run_nips.py b/deep_q_rl/run_nips.py
index 8a6ddfc..1585f2c 100755
--- a/deep_q_rl/run_nips.py
+++ b/deep_q_rl/run_nips.py
@@ -55,6 +55,7 @@ class Defaults:
     MAX_START_NULLOPS = 0
     DETERMINISTIC = True
     CUDNN_DETERMINISTIC = False
+    USE_DOUBLE = False
 
 if __name__ == "__main__":
     launcher.launch(sys.argv[1:], Defaults, __doc__)
diff --git a/deep_q_rl/test/test_q_network.py b/deep_q_rl/test/test_q_network.py
index 82cd142..87bc10d 100644
--- a/deep_q_rl/test/test_q_network.py
+++ b/deep_q_rl/test/test_q_network.py
@@ -114,7 +114,7 @@ def test_updates_sgd_no_freeze(self):
                                      self.mdp.num_actions, 1,
                                      self.discount, self.learning_rate, 0, 0, 0, 0,
-                                     freeze_interval, 1, 'linear',
+                                     freeze_interval, False, 1, 'linear',
                                      'sgd', 'sum', 1.0)
         mdp = self.mdp
@@ -157,7 +157,7 @@ def test_convergence_sgd_no_freeze(self):
                                      self.mdp.num_actions, 1,
                                      self.discount, self.learning_rate, 0, 0, 0, 0,
-                                     freeze_interval, 1, 'linear',
+                                     freeze_interval, False, 1, 'linear',
                                      'sgd', 'sum', 1.0)
@@ -178,7 +178,7 @@ def test_convergence_random_initialization(self):
                                      self.mdp.num_actions, 1,
                                      self.discount, self.learning_rate, 0, 0, 0, 0,
-                                     freeze_interval, 1, 'linear',
+                                     freeze_interval, False, 1, 'linear',
                                      'sgd', 'sum', 1.0)
 
         # Randomize initial q-values:
@@ -203,7 +203,7 @@ def test_convergence_sgd_permanent_freeze(self):
                                      self.mdp.num_actions, 1,
                                      self.discount, self.learning_rate, 0, 0, 0, 0,
-                                     freeze_interval, 1, 'linear',
+                                     freeze_interval, False, 1, 'linear',
                                      'sgd', 'sum', 1.0)
         self.train(net, 1000)
@@ -218,7 +218,7 @@ def test_convergence_sgd_frequent_freeze(self):
                                      self.mdp.num_actions, 1,
                                      self.discount, self.learning_rate, 0, 0, 0, 0,
-                                     freeze_interval, 1, 'linear',
+                                     freeze_interval, False, 1, 'linear',
                                      'sgd', 'sum', 1.0)
         self.train(net, 1000)
@@ -233,7 +233,7 @@ def test_convergence_sgd_one_freeze(self):
                                      self.mdp.num_actions, 1,
                                      self.discount, self.learning_rate, 0, 0, 0, 0,
-                                     freeze_interval, 1, 'linear',
+                                     freeze_interval, False, 1, 'linear',
                                      'sgd', 'sum', 1.0)
         self.train(net, freeze_interval * 2)
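
Note on the target computation in the q_network.py hunk: in "Deep Reinforcement Learning with Double Q-learning" (arXiv:1509.06461) the Double DQN target is r + gamma * (1 - terminal) * Q_target(s', argmax_a Q_online(s', a)), i.e. the greedy action is selected by the online network evaluated on the next states and then scored by the frozen target network. The hunk above takes T.argmax over q_vals, which (as the later diff line indexed by the taken actions shows) are the online network's outputs for the current states. Below is a minimal sketch of the paper's formulation in the Theano/Lasagne style of DeepQLearner.__init__; it reuses next_states, batch_size, rewards, terminals and next_q_vals from that scope, and next_q_vals_online is a hypothetical name that is not part of this patch.

    # Online network evaluated on the next states, used only to pick the
    # greedy action; gradients are blocked, as is done for next_q_vals.
    next_q_vals_online = lasagne.layers.get_output(self.l_out,
                                                   next_states / input_scale)
    next_q_vals_online = theano.gradient.disconnected_grad(next_q_vals_online)

    # Double DQN: select with the online network, evaluate with the frozen
    # target network (next_q_vals comes from self.next_l_out when
    # freeze_interval > 0, which the assert in this patch guarantees).
    greedy_actions = T.argmax(next_q_vals_online, axis=1)
    double_q = next_q_vals[T.arange(batch_size),
                           greedy_actions].reshape((-1, 1))
    target = (rewards +
              (T.ones_like(terminals) - terminals) *
              self.discount * double_q)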