import tensorflow as tf
import numpy as np
import gym
import os
import shutil
import sys
import time

# Fix the random seeds so runs are reproducible.
np.random.seed(1)
tf.set_random_seed(1)
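
# Overview: DDPG (deep deterministic policy gradient) with prioritized
# experience replay (SumTree-based PER) on BipedalWalker-v2, written against
# the TF1 graph API. The actor's hidden layers use a trainable "UAF"
# activation (defined in Actor._build_net); the critic uses plain ReLU.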

# Run index from the command line; used to name the output .npy files.
fold = int(sys.argv[1])

st = time.process_time()    # CPU-time stamp; total run time is saved at the end

MAX_EPISODES = 1500
LR_A = 0.0005               # initial actor learning rate
LR_C = 0.0005               # initial critic learning rate
GAMMA = 0.999               # reward discount factor
REPLACE_ITER_A = 1700       # actor target-net hard-update interval (learn() calls)
REPLACE_ITER_C = 1500       # critic target-net hard-update interval (learn() calls)
MEMORY_CAPACITY = 200000
BATCH_SIZE = 32
DISPLAY_THRESHOLD = 100
DATA_PATH = './data'
LOAD_MODEL = False
SAVE_MODEL_ITER = 100000    # checkpoint every N global steps
RENDER = False
OUTPUT_GRAPH = False
ENV_NAME = 'BipedalWalker-v2'

GLOBAL_STEP = tf.Variable(0, trainable=False)
INCREASE_GS = GLOBAL_STEP.assign(tf.add(GLOBAL_STEP, 1))
# Decay both learning rates by 3% every 10000 global steps (staircase).
LR_A = tf.train.exponential_decay(LR_A, GLOBAL_STEP, 10000, .97, staircase=True)
LR_C = tf.train.exponential_decay(LR_C, GLOBAL_STEP, 10000, .97, staircase=True)
# Hull x-position the walker must pass for an episode to count as "Achieve".
END_POINT = (200 - 10) * (14/30)
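
# With staircase decay the effective learning rate at global step s is
#   lr(s) = lr0 * 0.97 ** (s // 10000)
# e.g. after 50000 steps: 0.0005 * 0.97**5 ≈ 0.000429.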

env = gym.make(ENV_NAME)
env.seed(1)

STATE_DIM = env.observation_space.shape[0]
ACTION_DIM = env.action_space.shape[0]
ACTION_BOUND = env.action_space.high
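
# For BipedalWalker-v2 these come out to STATE_DIM = 24, ACTION_DIM = 4,
# ACTION_BOUND = [1, 1, 1, 1] (actions in [-1, 1]^4); the code reads them
# from the env rather than hard-coding them.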

# Graph inputs shared by actor and critic: state, reward, next state.
with tf.name_scope('S'):
    S = tf.placeholder(tf.float32, shape=[None, STATE_DIM], name='s')
with tf.name_scope('R'):
    R = tf.placeholder(tf.float32, [None, 1], name='r')
with tf.name_scope('S_'):
    S_ = tf.placeholder(tf.float32, shape=[None, STATE_DIM], name='s_')


class Actor(object):
    def __init__(self, sess, action_dim, action_bound, learning_rate, t_replace_iter):
        self.sess = sess
        self.a_dim = action_dim
        self.action_bound = action_bound
        self.lr = learning_rate
        self.t_replace_iter = t_replace_iter
        self.t_replace_counter = 0

        with tf.variable_scope('Actor'):
            # Online (evaluation) policy network: maps s to an action.
            self.a = self._build_net(S, scope='eval_net', trainable=True)
            # Target policy network: maps s_ to a_; updated by hard copies.
            self.a_ = self._build_net(S_, scope='target_net', trainable=False)

        self.e_params = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope='Actor/eval_net')
        self.t_params = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope='Actor/target_net')

    def _build_net(self, s, scope, trainable):
        with tf.variable_scope(scope):
            init_w = tf.random_normal_initializer(0., 0.01)
            init_b = tf.constant_initializer(0.01)

            two = tf.constant(2.0, dtype=tf.float32)

            def UAF(x):
                # Trainable "universal activation function" parameters, one
                # set per layer. stddev=0.0 makes each initializer a constant.
                UAF_A = tf.Variable(tf.random_normal([1], mean=1.1, stddev=0.0, dtype=tf.float32), dtype=tf.float32)
                UAF_B = tf.Variable(tf.random_normal([1], mean=-0.01, stddev=0.0, dtype=tf.float32), dtype=tf.float32)
                UAF_C = tf.Variable(tf.random_normal([1], mean=-0.1, stddev=0.0, dtype=tf.float32), dtype=tf.float32)
                UAF_D = tf.Variable(tf.random_normal([1], mean=-0.9, stddev=0.0, dtype=tf.float32), dtype=tf.float32)
                UAF_E = tf.Variable(tf.random_normal([1], mean=0.01, stddev=0.0, dtype=tf.float32), dtype=tf.float32)

                P1 = tf.multiply(UAF_A, (x + UAF_B)) + tf.multiply(UAF_C, tf.pow(x, two))
                P2 = tf.multiply(UAF_D, (x - UAF_B))

                # Numerically stable softplus: softplus(z) = relu(z) + log1p(exp(-|z|)).
                P3 = tf.nn.relu(P1) + tf.math.log1p(tf.exp(-tf.abs(P1)))
                P4 = tf.nn.relu(P2) + tf.math.log1p(tf.exp(-tf.abs(P2)))
                return P3 - P4 + UAF_E
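
            # Written out, since softplus(z) = relu(z) + log1p(exp(-|z|)):
            #   UAF(x) = softplus(A*(x + B) + C*x^2) - softplus(D*(x - B)) + E,
            # a trainable family that can morph between common activation
            # shapes (identity-, sigmoid-, and ReLU-like) as A..E are learned.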

            net = tf.layers.dense(s, 500, activation=UAF,
                                  kernel_initializer=init_w, bias_initializer=init_b, name='l1', trainable=trainable)
            net = tf.layers.dense(net, 200, activation=UAF,
                                  kernel_initializer=init_w, bias_initializer=init_b, name='l2', trainable=trainable)

            with tf.variable_scope('a'):
                actions = tf.layers.dense(net, self.a_dim, activation=tf.nn.tanh, kernel_initializer=init_w,
                                          bias_initializer=init_b, name='a', trainable=trainable)
                # tanh keeps actions in [-1, 1]; scale to the env's action range.
                scaled_a = tf.multiply(actions, self.action_bound, name='scaled_a')
            return scaled_a

    def learn(self, s):    # batch update of the online actor
        self.sess.run(self.train_op, feed_dict={S: s})
        # Periodically hard-copy the online weights into the target net.
        if self.t_replace_counter % self.t_replace_iter == 0:
            self.sess.run([tf.assign(t, e) for t, e in zip(self.t_params, self.e_params)])
        self.t_replace_counter += 1

    def choose_action(self, s):
        s = s[np.newaxis, :]    # single state -> batch of one
        return self.sess.run(self.a, feed_dict={S: s})[0]

    def add_grad_to_graph(self, a_grads):
        with tf.variable_scope('policy_grads'):
            # Chain rule: grad_ys injects the critic's dQ/da, so this yields
            # dQ/dtheta for the online actor parameters.
            self.policy_grads_and_vars = tf.gradients(ys=self.a, xs=self.e_params, grad_ys=a_grads)

        with tf.variable_scope('A_train'):
            # A negative learning rate turns minimization into ascent on Q.
            opt = tf.train.RMSPropOptimizer(-self.lr)
            self.train_op = opt.apply_gradients(zip(self.policy_grads_and_vars, self.e_params), global_step=GLOBAL_STEP)
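
    # The update above is the deterministic policy gradient,
    #   grad_theta J ≈ mean_batch[ grad_a Q(s, a)|a=mu(s) * grad_theta mu(s) ],
    # which tf.gradients(ys=self.a, xs=self.e_params, grad_ys=a_grads)
    # computes via the chain rule.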


class Critic(object):
    def __init__(self, sess, state_dim, action_dim, learning_rate, gamma, t_replace_iter, a, a_):
        self.sess = sess
        self.s_dim = state_dim
        self.a_dim = action_dim
        self.lr = learning_rate
        self.gamma = gamma
        self.t_replace_iter = t_replace_iter
        self.t_replace_counter = 0

        with tf.variable_scope('Critic'):
            # Online Q-network, evaluated at the actor's online action so
            # gradients can flow from Q back into the actor.
            self.a = a
            self.q = self._build_net(S, self.a, 'eval_net', trainable=True)
            # Target Q-network, evaluated at the target actor's action a_.
            self.q_ = self._build_net(S_, a_, 'target_net', trainable=False)

        self.e_params = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope='Critic/eval_net')
        self.t_params = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope='Critic/target_net')

        with tf.variable_scope('target_q'):
            # Bootstrapped TD target (no terminal masking is applied here).
            self.target_q = R + self.gamma * self.q_

        with tf.variable_scope('abs_TD'):
            self.abs_td = tf.abs(self.target_q - self.q)
        self.ISWeights = tf.placeholder(tf.float32, [None, 1], name='IS_weights')
        with tf.variable_scope('TD_error'):
            # Importance-sampling-weighted MSE for prioritized replay.
            self.loss = tf.reduce_mean(self.ISWeights * tf.squared_difference(self.target_q, self.q))

        with tf.variable_scope('C_train'):
            self.train_op = tf.train.AdamOptimizer(self.lr).minimize(self.loss, global_step=GLOBAL_STEP)

        with tf.variable_scope('a_grad'):
            # dQ/da, handed to the actor's policy-gradient update.
            self.a_grads = tf.gradients(self.q, a)[0]
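
        # In symbols: L = mean_i( w_i * (y_i - Q(s_i, a_i))^2 ), with
        # y_i = r_i + gamma * Q'(s'_i, mu'(s'_i)) and w_i the PER
        # importance-sampling weights fed in through self.ISWeights.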

    def _build_net(self, s, a, scope, trainable):
        with tf.variable_scope(scope):
            init_w = tf.random_normal_initializer(0., 0.01)
            init_b = tf.constant_initializer(0.01)

            def UAF2(x):
                # The critic simply uses ReLU.
                return tf.nn.relu(x)

            with tf.variable_scope('l1'):
                n_l1 = 700
                # First layer mixes the state and action inputs.
                w1_s = tf.get_variable('w1_s', [self.s_dim, n_l1], initializer=init_w, trainable=trainable)
                w1_a = tf.get_variable('w1_a', [self.a_dim, n_l1], initializer=init_w, trainable=trainable)
                b1 = tf.get_variable('b1', [1, n_l1], initializer=init_b, trainable=trainable)
                net = UAF2(tf.matmul(s, w1_s) + tf.matmul(a, w1_a) + b1)
            with tf.variable_scope('l2'):
                net = tf.layers.dense(net, 20, activation=UAF2, kernel_initializer=init_w,
                                      bias_initializer=init_b, name='l2', trainable=trainable)
            with tf.variable_scope('q'):
                q = tf.layers.dense(net, 1, kernel_initializer=init_w, bias_initializer=init_b, trainable=trainable)
            return q

    def learn(self, s, a, r, s_, ISW):
        # Feeding self.a overrides the actor's output with the replayed actions.
        _, abs_td = self.sess.run([self.train_op, self.abs_td],
                                  feed_dict={S: s, self.a: a, R: r, S_: s_, self.ISWeights: ISW})
        if self.t_replace_counter % self.t_replace_iter == 0:
            self.sess.run([tf.assign(t, e) for t, e in zip(self.t_params, self.e_params)])
        self.t_replace_counter += 1
        # |TD error| is returned so the replay memory can refresh priorities.
        return abs_td


class SumTree(object):
    """
    This SumTree is a modified version of the original from:
    https://github.com/jaara/AI-blog/blob/master/SumTree.py

    Stores each transition together with its priority: priorities live in a
    binary sum-tree array, the transitions themselves in a parallel data array.
    """
    data_pointer = 0

    def __init__(self, capacity):
        self.capacity = capacity
        # Full binary tree stored flat; the epsilon keeps untouched leaves
        # from having exactly zero priority.
        self.tree = np.zeros(2 * capacity - 1) + 1e-5
        # Ring buffer holding the transitions themselves.
        self.data = np.zeros(capacity, dtype=object)

    def add_new_priority(self, p, data):
        leaf_idx = self.data_pointer + self.capacity - 1

        self.data[self.data_pointer] = data
        self.update(leaf_idx, p)
        # Wrap around and overwrite the oldest entry once full.
        self.data_pointer += 1
        if self.data_pointer >= self.capacity:
            self.data_pointer = 0

    def update(self, tree_idx, p):
        change = p - self.tree[tree_idx]
        self.tree[tree_idx] = p
        self._propagate_change(tree_idx, change)

    def _propagate_change(self, tree_idx, change):
        """Add the change in priority to every parent node up to the root."""
        parent_idx = (tree_idx - 1) // 2
        self.tree[parent_idx] += change
        if parent_idx != 0:
            self._propagate_change(parent_idx, change)

    def get_leaf(self, lower_bound):
        leaf_idx = self._retrieve(lower_bound)
        data_idx = leaf_idx - self.capacity + 1
        return [leaf_idx, self.tree[leaf_idx], self.data[data_idx]]

    def _retrieve(self, lower_bound, parent_idx=0):
        """
        Tree structure and array storage:

        Tree index:
              0        -> storing priority sum
             / \
            1   2
           / \ / \
          3  4 5  6    -> storing priority for transitions

        Array type for storing:
        [0, 1, 2, 3, 4, 5, 6]
        """
        left_child_idx = 2 * parent_idx + 1
        right_child_idx = left_child_idx + 1

        # Reached a leaf.
        if left_child_idx >= len(self.tree):
            return parent_idx

        # Equal mass on both sides: break the tie at random.
        if self.tree[left_child_idx] == self.tree[right_child_idx]:
            return self._retrieve(lower_bound, np.random.choice([left_child_idx, right_child_idx]))
        if lower_bound <= self.tree[left_child_idx]:
            return self._retrieve(lower_bound, left_child_idx)
        else:
            return self._retrieve(lower_bound - self.tree[left_child_idx], right_child_idx)

    @property
    def root_priority(self):
        return self.tree[0]
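

# A minimal SumTree usage sketch (illustrative values only):
#   tree = SumTree(capacity=4)
#   tree.add_new_priority(1.0, 'transition_a')
#   tree.add_new_priority(3.0, 'transition_b')
#   # tree.root_priority is now ~4 (plus residual 1e-5 leaf mass), and
#   # tree.get_leaf(u) with u ~ Uniform(0, root_priority) returns
#   # 'transition_b' about three times as often as 'transition_a'.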


class Memory(object):
    """
    Prioritized experience replay built on the SumTree above; modified from:
    https://github.com/jaara/AI-blog/blob/master/Seaquest-DDQN-PER.py
    """
    epsilon = 0.001                      # base priority so no transition is unsampleable
    alpha = 0.6                          # priority exponent: 0 = uniform, 1 = fully greedy
    beta = 0.4                           # initial importance-sampling exponent
    beta_increment_per_sampling = 1e-5   # beta anneals toward 1
    abs_err_upper = 1                    # |TD error| is clipped to this ceiling

    def __init__(self, capacity):
        self.tree = SumTree(capacity)

    def store(self, error, transition):
        p = self._get_priority(error)
        self.tree.add_new_priority(p, transition)

    def prio_sample(self, n):
        batch_idx, batch_memory, ISWeights = [], [], []
        # Stratified sampling: split the total priority mass into n equal
        # segments and draw one transition from each.
        segment = self.tree.root_priority / n
        self.beta = np.min([1, self.beta + self.beta_increment_per_sampling])

        min_prob = np.min(self.tree.tree[-self.tree.capacity:]) / self.tree.root_priority
        maxiwi = np.power(self.tree.capacity * min_prob, -self.beta)    # normalizer for the IS weights
        for i in range(n):
            a = segment * i
            b = segment * (i + 1)
            lower_bound = np.random.uniform(a, b)
            while True:
                idx, p, data = self.tree.get_leaf(lower_bound)
                if type(data) is int:
                    # Landed on an unfilled slot (self.data is initialized
                    # with int zeros); redraw over the whole mass and retry.
                    lower_bound = np.random.uniform(0, self.tree.root_priority)
                else:
                    break
            prob = p / self.tree.root_priority
            ISWeights.append(self.tree.capacity * prob)
            batch_idx.append(idx)
            batch_memory.append(data)

        ISWeights = np.vstack(ISWeights)
        ISWeights = np.power(ISWeights, -self.beta) / maxiwi    # w_i = (N * P(i))^(-beta), normalized
        return batch_idx, np.vstack(batch_memory), ISWeights
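
    # Each transition i is thus drawn with probability P(i) = p_i / sum_k p_k,
    # where p_i = (|TD_i| + epsilon)^alpha; the per-segment stratification
    # keeps the batch spread across the whole priority range.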

    def random_sample(self, n):
        idx = np.random.randint(0, self.tree.capacity, size=n, dtype=int)
        return np.vstack(self.tree.data[idx])

    def update(self, idx, error):
        p = self._get_priority(error)
        self.tree.update(idx, p)

    def _get_priority(self, error):
        # p = (|error| + epsilon)^alpha, clipped at abs_err_upper.
        error += self.epsilon
        clipped_error = np.clip(error, 0, self.abs_err_upper)
        return np.power(clipped_error, self.alpha)
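

# PER importance-sampling correction: prioritized draws bias the replayed
# distribution, so each sample's loss is reweighted by
#   w_i = (N * P(i))^(-beta) / max_j w_j,
# with beta annealed from 0.4 toward 1 as training proceeds.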


sess = tf.Session()

# Wire actor and critic together: the critic takes the actor's online and
# target actions; the actor trains on the critic's dQ/da.
actor = Actor(sess, ACTION_DIM, ACTION_BOUND, LR_A, REPLACE_ITER_A)
critic = Critic(sess, STATE_DIM, ACTION_DIM, LR_C, GAMMA, REPLACE_ITER_C, actor.a, actor.a_)
actor.add_grad_to_graph(critic.a_grads)

M = Memory(MEMORY_CAPACITY)

saver = tf.train.Saver(max_to_keep=100)

if LOAD_MODEL:
    all_ckpt = tf.train.get_checkpoint_state('./data', 'checkpoint').all_model_checkpoint_paths
    saver.restore(sess, all_ckpt[-1])
else:
    # Fresh run: clear any previous checkpoints before initializing.
    if os.path.isdir(DATA_PATH): shutil.rmtree(DATA_PATH)
    os.mkdir(DATA_PATH)
    sess.run(tf.global_variables_initializer())

if OUTPUT_GRAPH:
    tf.summary.FileWriter('logs', graph=sess.graph)

var = 3          # stddev of the Gaussian exploration noise, decayed during training
var_min = 0.01

dataarr = []     # per-episode [running reward, final hull x-position]

for i_episode in range(MAX_EPISODES):

    s = env.reset()
    ep_r = 0
    while True:
        if RENDER:
            env.render()
        a = actor.choose_action(s)
        # Add exploration noise, then clip to the valid action range.
        a = np.clip(np.random.normal(a, var), -1, 1)
        s_, r, done, _ = env.step(a)

        # Soften the -100 falling penalty to keep TD targets in a sane range.
        if r == -100: r = -2
        ep_r += r

        transition = np.hstack((s, a, [r], s_))
        max_p = np.max(M.tree.tree[-M.tree.capacity:])
        M.store(max_p, transition)
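
        # Storing with the current max leaf priority makes a new transition
        # likely to be replayed soon, before its own |TD error| is known.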

        # Learn only after a warm-up of stored transitions.
        if GLOBAL_STEP.eval(sess) > MEMORY_CAPACITY/20:
            var = max([var*0.9999, var_min])    # decay exploration noise
            tree_idx, b_M, ISWeights = M.prio_sample(BATCH_SIZE)
            # Slice the flat transition back into (s, a, r, s_).
            b_s = b_M[:, :STATE_DIM]
            b_a = b_M[:, STATE_DIM: STATE_DIM + ACTION_DIM]
            b_r = b_M[:, -STATE_DIM - 1: -STATE_DIM]
            b_s_ = b_M[:, -STATE_DIM:]

            abs_td = critic.learn(b_s, b_a, b_r, b_s_, ISWeights)
            actor.learn(b_s)
            # Refresh the sampled transitions' priorities with fresh TD errors.
            for i in range(len(tree_idx)):
                idx = tree_idx[i]
                M.update(idx, abs_td[i])
        if GLOBAL_STEP.eval(sess) % SAVE_MODEL_ITER == 0:
            ckpt_path = os.path.join(DATA_PATH, 'DDPG.ckpt')
            save_path = saver.save(sess, ckpt_path, global_step=GLOBAL_STEP, write_meta_graph=False)
            print("\nSave Model %s\n" % save_path)

        if done:
            # Exponential moving average of the episode reward.
            if "running_r" not in globals():
                running_r = ep_r
            else:
                running_r = 0.95*running_r + 0.05*ep_r

            stats = np.array([running_r, env.unwrapped.hull.position[0]])
            dataarr.append(stats)

            result = '| Achieve ' if env.unwrapped.hull.position[0] >= END_POINT else '| -----'
            print('Episode:', i_episode,
                  result,
                  '| Running_r: %i' % int(running_r),
                  '| Epi_r: %.2f' % ep_r,
                  '| Exploration: %.3f' % var,
                  '| Pos: %i' % int(env.unwrapped.hull.position[0]),
                  '| LR_A: %.6f' % sess.run(LR_A),
                  '| LR_C: %.6f' % sess.run(LR_C),
                  )
            break

        s = s_
        sess.run(INCREASE_GS)


# Persist the elapsed CPU time and the per-episode stats for this run.
elapsed_time = time.process_time() - st

data_dir = "time_" + str(fold) + ".npy"
np.save(data_dir, np.array([elapsed_time]))

dataarr = np.asarray(dataarr)
data_dir = "stats_" + str(fold) + ".npy"
np.save(data_dir, dataarr)