[Dy2stat] Add Reinforcement learning unittest (#25445)
* add reinforcement learning model test=develop
* align backward test=develop
* add gym in paddle_build.sh test=develop
* rm pip install in script test=develop
* refine paddle_build.sh test=develop
* fix sed error in macOS test=develop
* polish code test=develop
parent 5a2d15a1d0
commit 1a5d3defb1
@@ -0,0 +1,218 @@
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import itertools
import math
import unittest

import gym
import numpy as np

import paddle.fluid as fluid
import paddle.fluid.dygraph.nn as nn
from paddle.fluid.dygraph import to_variable, Layer
from paddle.fluid.dygraph import declarative, ProgramTranslator

SEED = 2020
program_translator = ProgramTranslator()

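# Policy network for CartPole: two Linear layers map the 4-dim observation to
# a softmax over the 2 discrete actions. forward() is decorated with
# @declarative so that Dy2stat can translate it into a static graph.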
class Policy(Layer):
    def __init__(self):
        super(Policy, self).__init__()

        self.affine1 = nn.Linear(4, 128)
        self.affine2 = nn.Linear(128, 2)
        self.dropout_ratio = 0.6

        self.saved_log_probs = []
        self.rewards = []

    @declarative
    def forward(self, x):
        x = fluid.layers.reshape(x, shape=[1, 4])
        x = self.affine1(x)
        x = fluid.layers.dropout(x, self.dropout_ratio)
        x = fluid.layers.relu(x)
        action_scores = self.affine2(x)

        log_prob = fluid.layers.softmax(action_scores, axis=1)

        return log_prob


class Args(object):
    gamma = 0.99
    log_interval = 1
    train_step = 10

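# Runs a short REINFORCE-style training loop on CartPole-v0. With
# to_static=True the @declarative forward is executed as a translated static
# graph; with to_static=False it runs in imperative dygraph mode. The per-step
# losses are returned so the unittest can compare both modes.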
def train(args, place, to_static):
    program_translator.enable(to_static)

    env = gym.make('CartPole-v0')
    env.seed(SEED)

    with fluid.dygraph.guard(place):
        fluid.default_main_program().random_seed = SEED
        fluid.default_startup_program().random_seed = SEED
        local_random = np.random.RandomState(SEED)

        policy = Policy()

        eps = np.finfo(np.float32).eps.item()
        optimizer = fluid.optimizer.AdamaxOptimizer(
            learning_rate=1e-2, parameter_list=policy.parameters())

        def get_mean_and_std(values=[]):
            n = 0.
            s = 0.
            for val in values:
                s += val
                n += 1
            mean = s / n

            std = 0.
            for val in values:
                std += (val - mean) * (val - mean)
            std /= n
            std = math.sqrt(std)

            return mean, std

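        # Sample an action index from the categorical distribution `probs`
        # with the seeded RandomState (so dygraph and static runs draw the
        # same actions), and return it with a one-hot mask over the actions.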
        def sample_action(probs):
            sample = local_random.random_sample()
            idx = 0

            while idx < len(probs) and sample > probs[idx]:
                sample -= probs[idx]
                idx += 1
            mask = [0.] * len(probs)
            mask[idx] = 1.

            return idx, np.array([mask]).astype("float32")

        def choose_best_action(probs):
            idx = 0 if probs[0] > probs[1] else 1
            mask = [1., 0.] if idx == 0 else [0., 1.]

            return idx, np.array([mask]).astype("float32")

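        # Run the policy on the current state, sample an action, and keep the
        # log-probability of the chosen action by masking log(probs) with the
        # one-hot mask and summing over the action axis.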
        def select_action(state):
            state = to_variable(state)
            state.stop_gradient = True
            loss_probs = policy(state)
            probs = loss_probs.numpy()

            action, _mask = sample_action(probs[0])
            mask = to_variable(_mask)
            mask.stop_gradient = True

            loss_probs = fluid.layers.log(loss_probs)
            loss_probs = fluid.layers.elementwise_mul(loss_probs, mask)
            loss_probs = fluid.layers.reduce_sum(loss_probs, dim=-1)

            policy.saved_log_probs.append(loss_probs)
            return action, loss_probs

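        # End-of-episode update: compute discounted returns, normalize them,
        # weight each saved log-probability by its negated return to form the
        # REINFORCE loss, then backprop and take one optimizer step.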
        def finish_episode():
            R = 0
            policy_loss = []
            returns = []
            for r in policy.rewards[::-1]:
                R = r + args.gamma * R
                returns.insert(0, R)

            mean, std = get_mean_and_std(returns)

            returns = np.array(returns).astype("float32")
            returns = (returns - mean) / (std + eps)

            # calculate policy loss of each step.
            for log_prob, R in zip(policy.saved_log_probs, returns):
                log_prob_numpy = log_prob.numpy()

                R_numpy = np.ones_like(log_prob_numpy).astype("float32")
                _R = -1 * R * R_numpy
                _R = to_variable(_R)
                _R.stop_gradient = True
                cur_loss = fluid.layers.elementwise_mul(_R, log_prob)
                policy_loss.append(cur_loss)

            policy_loss = fluid.layers.concat(policy_loss)
            policy_loss = fluid.layers.reduce_sum(policy_loss)

            policy_loss.backward()
            optimizer.minimize(policy_loss)
            policy.clear_gradients()

            del policy.rewards[:]
            del policy.saved_log_probs[:]

            return returns

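        # Main loop: roll out episodes, record each step's loss for the
        # dygraph/static comparison, and stop after args.train_step episodes.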
        loss_data = []
        running_reward = 10
        for i_episode in itertools.count(1):
            state, ep_reward = env.reset(), 0
            # TODO(Aurelius84): In RL, we continuously select actions over multiple
            # steps and then accumulate the loss to apply optimization. But currently
            # all vars are shared within the same inner scope, which breaks backward.
            # I will fix it in the next PR.
            for t in range(1, 2):  # default 1000
                state = np.array(state).astype("float32")
                action, loss = select_action(state)
                state, reward, done, _ = env.step(action)

                # log loss_probs
                loss_data.append(loss.numpy()[0])

                policy.rewards.append(reward)
                ep_reward += reward

                if done:
                    break

            # sum loss and apply optimization
            returns = finish_episode()

            running_reward = 0.05 * ep_reward + (1 - 0.05) * running_reward
            if i_episode % args.log_interval == 0:
                print(
                    'Episode {}\tLast reward: {:.2f}\tAverage reward: {:.2f}\t loss_probs: {}'.
                    format(i_episode, ep_reward, running_reward,
                           loss.numpy()[0]))

            if i_episode > args.train_step:
                break

        return np.array(loss_data)

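# The test trains the same model twice, once translated to a static graph and
# once in dygraph mode, and asserts that the recorded losses match.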
class TestDeclarative(unittest.TestCase):
    def setUp(self):
        self.place = fluid.CUDAPlace(0) if fluid.is_compiled_with_cuda() \
            else fluid.CPUPlace()
        self.args = Args()

    def test_train(self):
        st_out = train(self.args, self.place, to_static=True)
        dy_out = train(self.args, self.place, to_static=False)
        self.assertTrue(
            np.allclose(st_out, dy_out),
            msg="dy_out:\n {}\n st_out:\n{}\n".format(dy_out, st_out))


if __name__ == '__main__':
    unittest.main()