[Dy2stat] Add Reinforcement learning unittest (#25445)
* add reinforcement learning model test=develop
* align backward test=develop
* add gym in paddle_build.sh test=develop
* rm pip install in script test=develop
* refine paddle_build.sh test=develop
* fix sed error in macOS test=develop
* polish code test=develop
parent 5a2d15a1d0
commit 1a5d3defb1
@@ -0,0 +1,218 @@
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import itertools
import math
import unittest

import gym
import numpy as np

import paddle.fluid as fluid
import paddle.fluid.dygraph.nn as nn
from paddle.fluid.dygraph import to_variable, Layer
from paddle.fluid.dygraph import declarative, ProgramTranslator

SEED = 2020
program_translator = ProgramTranslator()

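# Policy network for CartPole: two Linear layers map the 4-dim observation to
# a softmax over the 2 discrete actions. forward() is decorated with
# @declarative so that Dy2stat can translate it into a static graph.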
class Policy(Layer):
    def __init__(self):
        super(Policy, self).__init__()

        self.affine1 = nn.Linear(4, 128)
        self.affine2 = nn.Linear(128, 2)
        self.dropout_ratio = 0.6

        self.saved_log_probs = []
        self.rewards = []

    @declarative
    def forward(self, x):
        x = fluid.layers.reshape(x, shape=[1, 4])
        x = self.affine1(x)
        x = fluid.layers.dropout(x, self.dropout_ratio)
        x = fluid.layers.relu(x)
        action_scores = self.affine2(x)

        log_prob = fluid.layers.softmax(action_scores, axis=1)

        return log_prob


class Args(object):
    gamma = 0.99
    log_interval = 1
    train_step = 10

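# Runs a short REINFORCE-style training loop on CartPole-v0. With
# to_static=True the @declarative forward is executed as a translated static
# graph; with to_static=False it runs in imperative dygraph mode. The per-step
# losses are returned so the unittest can compare both modes.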
def train(args, place, to_static):
    program_translator.enable(to_static)

    env = gym.make('CartPole-v0')
    env.seed(SEED)

    with fluid.dygraph.guard(place):
        fluid.default_main_program().random_seed = SEED
        fluid.default_startup_program().random_seed = SEED
        local_random = np.random.RandomState(SEED)

        policy = Policy()

        eps = np.finfo(np.float32).eps.item()
        optimizer = fluid.optimizer.AdamaxOptimizer(
            learning_rate=1e-2, parameter_list=policy.parameters())

        def get_mean_and_std(values=[]):
            n = 0.
            s = 0.
            for val in values:
                s += val
                n += 1
            mean = s / n

            std = 0.
            for val in values:
                std += (val - mean) * (val - mean)
            std /= n
            std = math.sqrt(std)

            return mean, std

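        # Sample an action index from the categorical distribution `probs`
        # with the seeded RandomState (so dygraph and static runs draw the
        # same actions), and return it with a one-hot mask over the actions.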
        def sample_action(probs):
            sample = local_random.random_sample()
            idx = 0

            while idx < len(probs) and sample > probs[idx]:
                sample -= probs[idx]
                idx += 1
            mask = [0.] * len(probs)
            mask[idx] = 1.

            return idx, np.array([mask]).astype("float32")

        def choose_best_action(probs):
            idx = 0 if probs[0] > probs[1] else 1
            mask = [1., 0.] if idx == 0 else [0., 1.]

            return idx, np.array([mask]).astype("float32")

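        # Run the policy on the current state, sample an action, and keep the
        # log-probability of the chosen action by masking log(probs) with the
        # one-hot mask and summing over the action axis.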
        def select_action(state):
            state = to_variable(state)
            state.stop_gradient = True
            loss_probs = policy(state)
            probs = loss_probs.numpy()

            action, _mask = sample_action(probs[0])
            mask = to_variable(_mask)
            mask.stop_gradient = True

            loss_probs = fluid.layers.log(loss_probs)
            loss_probs = fluid.layers.elementwise_mul(loss_probs, mask)
            loss_probs = fluid.layers.reduce_sum(loss_probs, dim=-1)

            policy.saved_log_probs.append(loss_probs)
            return action, loss_probs

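        # End-of-episode update: compute discounted returns, normalize them,
        # weight each saved log-probability by its negated return to form the
        # REINFORCE loss, then backprop and take one optimizer step.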
        def finish_episode():
            R = 0
            policy_loss = []
            returns = []
            for r in policy.rewards[::-1]:
                R = r + args.gamma * R
                returns.insert(0, R)

            mean, std = get_mean_and_std(returns)

            returns = np.array(returns).astype("float32")
            returns = (returns - mean) / (std + eps)

            # calculate policy loss of each step.
            for log_prob, R in zip(policy.saved_log_probs, returns):
                log_prob_numpy = log_prob.numpy()

                R_numpy = np.ones_like(log_prob_numpy).astype("float32")
                _R = -1 * R * R_numpy
                _R = to_variable(_R)
                _R.stop_gradient = True
                cur_loss = fluid.layers.elementwise_mul(_R, log_prob)
                policy_loss.append(cur_loss)

            policy_loss = fluid.layers.concat(policy_loss)
            policy_loss = fluid.layers.reduce_sum(policy_loss)

            policy_loss.backward()
            optimizer.minimize(policy_loss)
            policy.clear_gradients()

            del policy.rewards[:]
            del policy.saved_log_probs[:]

            return returns

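        # Main loop: roll out episodes, record each step's loss for the
        # dygraph/static comparison, and stop after args.train_step episodes.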
        loss_data = []
        running_reward = 10
        for i_episode in itertools.count(1):
            state, ep_reward = env.reset(), 0
            # TODO(Aurelius84): In RL, we continuously select actions over multiple
            # steps and then accumulate the loss to apply optimization. But currently
            # all vars are shared within the same inner scope, which breaks backward.
            # I will fix it in the next PR.
            for t in range(1, 2):  # default 1000
                state = np.array(state).astype("float32")
                action, loss = select_action(state)
                state, reward, done, _ = env.step(action)

                # log loss_probs
                loss_data.append(loss.numpy()[0])

                policy.rewards.append(reward)
                ep_reward += reward

                if done:
                    break

            # sum loss and apply optimization
            returns = finish_episode()

            running_reward = 0.05 * ep_reward + (1 - 0.05) * running_reward
            if i_episode % args.log_interval == 0:
                print(
                    'Episode {}\tLast reward: {:.2f}\tAverage reward: {:.2f}\t loss_probs: {}'.
                    format(i_episode, ep_reward, running_reward,
                           loss.numpy()[0]))

            if i_episode > args.train_step:
                break

        return np.array(loss_data)

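# The test trains the same model twice, once translated to a static graph and
# once in dygraph mode, and asserts that the recorded losses match.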
class TestDeclarative(unittest.TestCase):
    def setUp(self):
        self.place = fluid.CUDAPlace(0) if fluid.is_compiled_with_cuda() \
            else fluid.CPUPlace()
        self.args = Args()

    def test_train(self):
        st_out = train(self.args, self.place, to_static=True)
        dy_out = train(self.args, self.place, to_static=False)
        self.assertTrue(
            np.allclose(st_out, dy_out),
            msg="dy_out:\n {}\n st_out:\n{}\n".format(dy_out, st_out))


if __name__ == '__main__':
    unittest.main()