# -*- coding: utf-8 -*-
# td3_agent.py
# TD3 agent for RLBench tasks, built on PARL's fluid (PaddlePaddle) API.
import os

import numpy as np
import parl
from paddle import fluid
from parl import layers


class RLBenchAgent(parl.Agent):
    """TD3 agent: wraps a PARL TD3 algorithm, builds the fluid programs for
    prediction and learning, and saves/restores actor and critic parameters."""

    def __init__(self, algorithm, obs_dim, act_dim=8):
        assert isinstance(obs_dim, int)
        assert isinstance(act_dim, int)
        self.obs_dim = obs_dim
        self.act_dim = act_dim
        super(RLBenchAgent, self).__init__(algorithm)

        # Attention: at the very beginning, sync the target model completely
        # (hard copy, decay=0) so the target networks start identical to the
        # online networks.
        self.alg.sync_target(decay=0)

        # TD3 delays policy updates: the actor and the target networks are
        # updated only once every `policy_freq` critic updates.
        self.learn_it = 0
        self.policy_freq = self.alg.policy_freq

    def build_program(self):
        # Three separate fluid programs: action prediction, actor update,
        # and critic update.
        self.pred_program = fluid.Program()
        self.actor_learn_program = fluid.Program()
        self.critic_learn_program = fluid.Program()

        with fluid.program_guard(self.pred_program):
            obs = layers.data(
                name='obs', shape=[self.obs_dim], dtype='float32')
            self.pred_act = self.alg.predict(obs)

        with fluid.program_guard(self.actor_learn_program):
            obs = layers.data(
                name='obs', shape=[self.obs_dim], dtype='float32')
            self.actor_cost = self.alg.actor_learn(obs)

        with fluid.program_guard(self.critic_learn_program):
            obs = layers.data(
                name='obs', shape=[self.obs_dim], dtype='float32')
            act = layers.data(
                name='act', shape=[self.act_dim], dtype='float32')
            reward = layers.data(name='reward', shape=[], dtype='float32')
            next_obs = layers.data(
                name='next_obs', shape=[self.obs_dim], dtype='float32')
            terminal = layers.data(name='terminal', shape=[], dtype='bool')
            self.critic_cost = self.alg.critic_learn(obs, act, reward,
                                                     next_obs, terminal)

    def predict(self, obs):
        # Add a batch dimension before feeding a single observation.
        obs = np.expand_dims(obs, axis=0)
        act = self.fluid_executor.run(
            self.pred_program, feed={'obs': obs},
            fetch_list=[self.pred_act])[0]
        return act

    def learn(self, obs, act, reward, next_obs, terminal):
        self.learn_it += 1
        feed = {
            'obs': obs,
            'act': act,
            'reward': reward,
            'next_obs': next_obs,
            'terminal': terminal
        }
        # The critic is updated on every call.
        critic_cost = self.fluid_executor.run(
            self.critic_learn_program,
            feed=feed,
            fetch_list=[self.critic_cost])[0]

        # Delayed policy update (TD3): train the actor and soft-update the
        # target networks only once every `policy_freq` critic updates.
        actor_cost = None
        if self.learn_it % self.policy_freq == 0:
            actor_cost = self.fluid_executor.run(
                self.actor_learn_program,
                feed={'obs': obs},
                fetch_list=[self.actor_cost])[0]
            self.alg.sync_target()
        return actor_cost, critic_cost

    def save_actor(self, save_path):
        # Split save_path into a directory and a file name so that all actor
        # parameters are written into a single file.
        program = self.actor_learn_program
        dirname = os.sep.join(save_path.split(os.sep)[:-1])
        filename = save_path.split(os.sep)[-1]
        fluid.io.save_params(
            executor=self.fluid_executor,
            dirname=dirname,
            main_program=program,
            filename=filename)

    def save_critic(self, save_path):
        program = self.critic_learn_program
        dirname = os.sep.join(save_path.split(os.sep)[:-1])
        filename = save_path.split(os.sep)[-1]
        fluid.io.save_params(
            executor=self.fluid_executor,
            dirname=dirname,
            main_program=program,
            filename=filename)

    def restore_actor(self, save_path):
        program = self.actor_learn_program
        # A CompiledProgram cannot be passed to fluid.io.load_params directly,
        # so fall back to the underlying Program.
        if type(program) is fluid.compiler.CompiledProgram:
            program = program._init_program
        dirname = os.sep.join(save_path.split(os.sep)[:-1])
        filename = save_path.split(os.sep)[-1]
        fluid.io.load_params(
            executor=self.fluid_executor,
            dirname=dirname,
            main_program=program,
            filename=filename)

    def restore_critic(self, save_path):
        program = self.critic_learn_program
        if type(program) is fluid.compiler.CompiledProgram:
            program = program._init_program
        dirname = os.sep.join(save_path.split(os.sep)[:-1])
        filename = save_path.split(os.sep)[-1]
        fluid.io.load_params(
            executor=self.fluid_executor,
            dirname=dirname,
            main_program=program,
            filename=filename)
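

# ---------------------------------------------------------------------------
# Usage sketch (not part of the agent). A minimal, hedged example of how this
# class is typically wired up, assuming the fluid-based TD3 algorithm shipped
# with PARL 1.x (parl.algorithms.TD3) and a hypothetical `RLBenchModel`
# (an actor-critic parl.Model defined elsewhere in this repository). The
# dimensions, hyperparameters, and batch variables below are illustrative
# placeholders, not values taken from this repo.
#
#     from parl.algorithms import TD3
#     from rlbench_model import RLBenchModel   # hypothetical actor-critic model
#
#     OBS_DIM, ACT_DIM, MAX_ACTION = 40, 8, 1.0
#     model = RLBenchModel(act_dim=ACT_DIM)
#     algorithm = TD3(
#         model,
#         max_action=MAX_ACTION,
#         gamma=0.99,
#         tau=0.005,
#         actor_lr=3e-4,
#         critic_lr=3e-4)
#     agent = RLBenchAgent(algorithm, obs_dim=OBS_DIM, act_dim=ACT_DIM)
#
#     # Interaction: predict an action for one observation, then update from a
#     # minibatch sampled out of a replay buffer (buffer and env not shown).
#     action = agent.predict(obs.astype('float32'))
#     actor_cost, critic_cost = agent.learn(
#         batch_obs, batch_act, batch_reward, batch_next_obs, batch_terminal)
#
# Note that `actor_cost` is None on the calls where the delayed policy update
# is skipped; only `critic_cost` is returned on every step.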