Merge branch 'develop' into merge_bn

7 years ago · 16e31343d8
parent 17833d30e3 49d431d44c
commit 16e31343d8
73 changed files with 3146 additions and 482 deletions
--- a/benchmark/tensorflow/machine_translation.py
+++ b/benchmark/tensorflow/machine_translation.py
--- a/benchmark/tensorflow/mnist.py
+++ b/benchmark/tensorflow/mnist.py
@ -0,0 +1,180 @@
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import argparse
+import time
+import numpy as np
+
+import tensorflow as tf
+import paddle.v2 as paddle
+
+DTYPE = tf.float32
+
+
+def parse_args():
+    parser = argparse.ArgumentParser("mnist model benchmark.")
+    parser.add_argument(
+        '--batch_size', type=int, default=128, help='The minibatch size.')
+    parser.add_argument(
+        '--iterations', type=int, default=35, help='The number of minibatches.')
+    parser.add_argument(
+        '--pass_num', type=int, default=5, help='The number of passes.')
+    parser.add_argument(
+        '--device',
+        type=str,
+        default='GPU',
+        choices=['CPU', 'GPU'],
+        help='The device type.')
+    args = parser.parse_args()
+    return args
+
+
+def run_benchmark(args):
+    def weight_variable(dtype, shape):
+        initial = tf.truncated_normal(shape, stddev=0.1, dtype=dtype)
+        return tf.Variable(initial)
+
+    def bias_variable(dtype, shape):
+        initial = tf.constant(0.1, shape=shape, dtype=dtype)
+        return tf.Variable(initial)
+
+    device = '/cpu:0' if args.device == 'CPU' else '/device:GPU:0'
+    with tf.device(device):
+        images = tf.placeholder(DTYPE, shape=(None, 28, 28, 1))
+        labels = tf.placeholder(tf.int64, shape=(None, ))
+
+        # conv1, relu, pool1
+        conv1_weights = weight_variable(DTYPE, [5, 5, 1, 20])
+        conv1_bias = bias_variable(DTYPE, [20])
+        conv1 = tf.nn.conv2d(
+            images, conv1_weights, strides=[1, 1, 1, 1], padding="VALID")
+        relu1 = tf.nn.relu(tf.nn.bias_add(conv1, conv1_bias))
+        pool1 = tf.nn.max_pool(
+            relu1, ksize=[1, 2, 2, 1], strides=[1, 2, 2, 1], padding="VALID")
+
+        # conv2, relu, pool2
+        conv2_weights = weight_variable(DTYPE, [5, 5, 20, 50])
+        conv2_bias = bias_variable(DTYPE, [50])
+        conv2 = tf.nn.conv2d(
+            pool1, conv2_weights, strides=[1, 1, 1, 1], padding="VALID")
+        relu2 = tf.nn.relu(tf.nn.bias_add(conv2, conv2_bias))
+        pool2 = tf.nn.max_pool(
+            relu2, ksize=[1, 2, 2, 1], strides=[1, 2, 2, 1], padding="VALID")
+
+        # FC 
+        pool_shape = pool2.get_shape().as_list()
+        hidden_dim = reduce(lambda a, b: a * b, pool_shape[1:], 1)
+        reshape = tf.reshape(pool2, shape=(tf.shape(pool2)[0], hidden_dim))
+        fc_weights = weight_variable(DTYPE, [hidden_dim, 10])
+        fc_bias = bias_variable(DTYPE, [10])
+        logits = tf.matmul(reshape, fc_weights) + fc_bias
+
+        # Get prediction
+        prediction = tf.nn.softmax(logits)
+
+        # Loss 
+        one_hot_labels = tf.one_hot(labels, depth=10)
+        cost = -tf.reduce_sum(tf.log(prediction) * one_hot_labels, [1])
+        avg_cost = tf.reduce_mean(cost)
+
+        # Get accuracy
+        correct = tf.equal(tf.argmax(prediction, 1), labels)
+        accuracy = tf.reduce_mean(tf.cast(correct, tf.float32))
+
+        # metrics, g_accuracy
+        with tf.variable_scope("reset_metrics_accuracy_scope") as scope:
+            g_accuracy = tf.metrics.accuracy(
+                labels, tf.argmax(
+                    prediction, axis=1))
+            vars = tf.contrib.framework.get_variables(
+                scope, collection=tf.GraphKeys.LOCAL_VARIABLES)
+            g_accuracy_reset_op = tf.variables_initializer(vars)
+
+        # Optimizer 
+        opt = tf.train.AdamOptimizer(
+            learning_rate=0.001, beta1=0.9, beta2=0.999)
+        train_op = opt.minimize(avg_cost)
+        # train_op = tf.train.AdamOptimizer(1e-4).minimize(avg_cost)
+
+    train_reader = paddle.batch(
+        paddle.dataset.mnist.train(), batch_size=args.batch_size)
+    test_reader = paddle.batch(
+        paddle.dataset.mnist.test(), batch_size=args.batch_size)
+
+    def eval_test():
+        sess.run(g_accuracy_reset_op)
+        for batch_id, data in enumerate(test_reader()):
+            images_data = np.array(
+                map(lambda x: np.transpose(x[0].reshape([1, 28, 28]), axes=[1,2,0]), data)).astype("float32")
+            labels_data = np.array(map(lambda x: x[1], data)).astype("int64")
+
+            loss, acc, g_acc = sess.run(
+                [avg_cost, accuracy, g_accuracy],
+                feed_dict={images: images_data,
+                           labels: labels_data})
+        return g_acc[1]
+
+    config = tf.ConfigProto(
+        intra_op_parallelism_threads=1, inter_op_parallelism_threads=1)
+    config.gpu_options.allow_growth = True
+
+    with tf.Session(config=config) as sess:
+        init_g = tf.global_variables_initializer()
+        init_l = tf.local_variables_initializer()
+        sess.run(init_g)
+        sess.run(init_l)
+        for pass_id in range(args.pass_num):
+            sess.run(g_accuracy_reset_op)
+
+            pass_start = time.time()
+            for batch_id, data in enumerate(train_reader()):
+                images_data = np.array(
+                    map(lambda x: np.transpose(x[0].reshape([1, 28, 28]), axes=[1,2,0]), data)).astype("float32")
+                labels_data = np.array(map(lambda x: x[1], data)).astype(
+                    "int64")
+
+                start = time.time()
+                _, loss, acc, g_acc = sess.run(
+                    [train_op, avg_cost, accuracy, g_accuracy],
+                    feed_dict={images: images_data,
+                               labels: labels_data})
+                end = time.time()
+
+                print("pass=%d, batch=%d, loss=%f, error=%f, elapse=%f" %
+                      (pass_id, batch_id, loss, 1 - acc, (end - start) / 1000))
+
+            pass_end = time.time()
+            test_avg_acc = eval_test()
+
+            print(
+                "pass=%d, training_avg_accuracy=%f, test_avg_acc=%f, elapse=%f"
+                % (pass_id, g_acc[1], test_avg_acc,
+                   (pass_end - pass_start) / 1000))
+
+
+def print_arguments(args):
+    print('-----------  Configuration Arguments -----------')
+    for arg, value in sorted(vars(args).iteritems()):
+        print('%s: %s' % (arg, value))
+    print('------------------------------------------------')
+
+
+if __name__ == '__main__':
+    args = parse_args()
+    print_arguments(args)
+    run_benchmark(args)
--- a/benchmark/tensorflow/resnet.py
+++ b/benchmark/tensorflow/resnet.py
--- a/benchmark/tensorflow/stacked_dynamic_lstm.py
+++ b/benchmark/tensorflow/stacked_dynamic_lstm.py
@ -0,0 +1,220 @@
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import numpy as np
+import argparse
+import time
+import tensorflow as tf
+
+import paddle.v2 as paddle
+
+
+def parse_args():
+    parser = argparse.ArgumentParser("LSTM model benchmark.")
+    parser.add_argument(
+        '--batch_size',
+        type=int,
+        default=32,
+        help='The sequence number of a batch data. (default: %(default)d)')
+    parser.add_argument(
+        '--stacked_num',
+        type=int,
+        default=5,
+        help='Number of lstm layers to stack. (default: %(default)d)')
+    parser.add_argument(
+        '--embedding_dim',
+        type=int,
+        default=512,
+        help='Dimension of embedding table. (default: %(default)d)')
+    parser.add_argument(
+        '--hidden_dim',
+        type=int,
+        default=512,
+        help='Hidden size of lstm unit. (default: %(default)d)')
+    parser.add_argument(
+        '--pass_num',
+        type=int,
+        default=10,
+        help='Epoch number to train. (default: %(default)d)')
+    parser.add_argument(
+        '--learning_rate',
+        type=float,
+        default=0.0002,
+        help='Learning rate used to train. (default: %(default)f)')
+    parser.add_argument(
+        '--infer_only', action='store_true', help='If set, run forward only.')
+    args = parser.parse_args()
+    return args
+
+
+def print_arguments(args):
+    print('-----------  Configuration Arguments -----------')
+    for arg, value in sorted(vars(args).iteritems()):
+        print('%s: %s' % (arg, value))
+    print('------------------------------------------------')
+
+
+def dynamic_lstm_model(dict_size,
+                       embedding_dim,
+                       hidden_dim,
+                       stacked_num,
+                       class_num=2,
+                       is_train=True):
+    word_idx = tf.placeholder(tf.int64, shape=[None, None])
+    sequence_length = tf.placeholder(tf.int64, shape=[None, ])
+
+    embedding_weights = tf.get_variable('word_embeddings',
+                                        [dict_size, embedding_dim])
+    embedding = tf.nn.embedding_lookup(embedding_weights, word_idx)
+
+    lstm_cell = tf.nn.rnn_cell.LSTMCell(
+        num_units=hidden_dim, use_peepholes=False)
+    stacked_cell = tf.nn.rnn_cell.MultiRNNCell([lstm_cell] * stacked_num)
+
+    # final_state [LSTMTuple(c, h), LSTMTuple(c, h) ...] total stacked_num LSTMTuples
+    _, final_state = tf.nn.dynamic_rnn(
+        cell=stacked_cell,
+        inputs=embedding,
+        dtype=tf.float32,
+        sequence_length=sequence_length)
+
+    w = tf.Variable(
+        tf.truncated_normal([hidden_dim, class_num]), dtype=tf.float32)
+    bias = tf.Variable(
+        tf.constant(
+            value=0.0, shape=[class_num], dtype=tf.float32))
+    prediction = tf.matmul(final_state[-1][1], w) + bias
+
+    if not is_train:
+        return (word_idx, sequence_length), tf.nn.softmax(prediction)
+
+    label = tf.placeholder(tf.int64, shape=[None, ])
+    loss = tf.nn.softmax_cross_entropy_with_logits(
+        labels=tf.one_hot(label, 2), logits=prediction)
+    avg_loss = tf.reduce_mean(loss)
+
+    correct_count = tf.equal(tf.argmax(prediction, 1), label)
+    acc = tf.reduce_mean(tf.cast(correct_count, tf.float32))
+
+    with tf.variable_scope("reset_metrics_accuracy_scope") as scope:
+        g_acc = tf.metrics.accuracy(label, tf.argmax(prediction, axis=1))
+        vars = tf.contrib.framework.get_variables(
+            scope, collection=tf.GraphKeys.LOCAL_VARIABLES)
+        reset_op = tf.variables_initializer(vars)
+
+    return (word_idx, sequence_length, label), avg_loss, acc, g_acc, reset_op
+
+
+def padding_data(data, padding_size, value):
+    data = data + [value] * padding_size
+    return data[:padding_size]
+
+
+def train(args):
+    word_dict = paddle.dataset.imdb.word_dict()
+    dict_size = len(word_dict)
+
+    feeding_list, avg_loss, acc, g_acc, reset_op = dynamic_lstm_model(
+        dict_size, args.embedding_dim, args.hidden_dim, args.stacked_num)
+
+    adam_optimizer = tf.train.AdamOptimizer(learning_rate=args.learning_rate)
+    train_op = adam_optimizer.minimize(avg_loss)
+
+    train_reader = paddle.batch(
+        paddle.reader.shuffle(
+            paddle.dataset.imdb.train(word_dict), buf_size=25000),
+        batch_size=args.batch_size)
+
+    test_reader = paddle.batch(
+        paddle.reader.shuffle(
+            paddle.dataset.imdb.test(word_dict), buf_size=25000),
+        batch_size=args.batch_size)
+
+    def do_validation(sess):
+        sess.run(reset_op)
+        for batch_id, data in enumerate(test_reader()):
+            word_idx = map(lambda x: x[0], data)
+            sequence_length = np.array(
+                [len(seq) for seq in word_idx]).astype('int64')
+            maxlen = np.max(sequence_length)
+            word_idx = [padding_data(seq, maxlen, 0) for seq in word_idx]
+            word_idx = np.array(word_idx).astype('int64')
+            label = np.array(map(lambda x: x[1], data)).astype('int64')
+
+            _, loss, fetch_acc, fetch_g_acc = sess.run(
+                [train_op, avg_loss, acc, g_acc],
+                feed_dict={
+                    feeding_list[0]: word_idx,
+                    feeding_list[1]: sequence_length,
+                    feeding_list[2]: label
+                })
+
+        return fetch_g_acc[1]
+
+    config = tf.ConfigProto(
+        intra_op_parallelism_threads=1, inter_op_parallelism_threads=1)
+    config.gpu_options.allow_growth = True
+    with tf.Session(config=config) as sess:
+        init_g = tf.global_variables_initializer()
+        init_l = tf.local_variables_initializer()
+        sess.run(init_l)
+        sess.run(init_g)
+
+        for pass_id in xrange(args.pass_num):
+            # clear accuracy local variable 
+            sess.run(reset_op)
+            pass_start_time = time.time()
+            words_seen = 0
+
+            for batch_id, data in enumerate(train_reader()):
+                word_idx = map(lambda x: x[0], data)
+                sequence_length = np.array(
+                    [len(seq) for seq in word_idx]).astype('int64')
+                words_seen += np.sum(sequence_length)
+                maxlen = np.max(sequence_length)
+                word_idx = [padding_data(seq, maxlen, 0) for seq in word_idx]
+                word_idx = np.array(word_idx).astype('int64')
+                label = np.array(map(lambda x: x[1], data)).astype('int64')
+
+                _, loss, fetch_acc, fetch_g_acc = sess.run(
+                    [train_op, avg_loss, acc, g_acc],
+                    feed_dict={
+                        feeding_list[0]: word_idx,
+                        feeding_list[1]: sequence_length,
+                        feeding_list[2]: label
+                    })
+
+                print("pass_id=%d, batch_id=%d, loss: %f, acc: %f, avg_acc: %f"
+                      % (pass_id, batch_id, loss, fetch_acc, fetch_g_acc[1]))
+
+            pass_end_time = time.time()
+            time_consumed = pass_end_time - pass_start_time
+            words_per_sec = words_seen / time_consumed
+            test_acc = do_validation(sess)
+            print("pass_id=%d, test_acc: %f, words/s: %f, sec/pass: %f" %
+                  (pass_id, test_acc, words_per_sec, time_consumed))
+
+
+if __name__ == '__main__':
+    args = parse_args()
+    print_arguments(args)
+
+    if args.infer_only:
+        pass
+    else:
+        train(args)
--- a/benchmark/tensorflow/vgg.py
+++ b/benchmark/tensorflow/vgg.py
--- a/cmake/external/mkldnn.cmake
+++ b/cmake/external/mkldnn.cmake
@ -36,7 +36,8 @@ MESSAGE(STATUS "Set ${MKLDNN_INSTALL_DIR}/lib to runtime path")
 SET(CMAKE_INSTALL_RPATH_USE_LINK_PATH TRUE)
 SET(CMAKE_INSTALL_RPATH "${CMAKE_INSTALL_RPATH}" "${MKLDNN_INSTALL_DIR}/lib")

-INCLUDE_DIRECTORIES(${MKLDNN_INC_DIR})
+INCLUDE_DIRECTORIES(${MKLDNN_INC_DIR}) # For MKLDNN code to include internal headers.
+INCLUDE_DIRECTORIES(${THIRD_PARTY_PATH}/install) # For Paddle code to include mkldnn.h

 IF(${CBLAS_PROVIDER} STREQUAL "MKLML")
    SET(MKLDNN_DEPENDS   ${MKLML_PROJECT})
--- a/doc/fluid/design/concepts/images/parallel_executor_overview.dot
+++ b/doc/fluid/design/concepts/images/parallel_executor_overview.dot
--- a/doc/fluid/design/concepts/images/parallel_executor_overview.png
+++ b/doc/fluid/design/concepts/images/parallel_executor_overview.png
--- a/doc/fluid/design/concepts/index_cn.rst
+++ b/doc/fluid/design/concepts/index_cn.rst
@ -16,3 +16,4 @@
  block.md
  scope.md
  executor.md
+  parallel_executor.md
--- a/doc/fluid/design/concepts/index_en.rst
+++ b/doc/fluid/design/concepts/index_en.rst
@ -16,3 +16,4 @@ Core Concepts
  block.md
  scope.md
  executor.md
+  parallel_executor.md
--- a/doc/fluid/design/concepts/parallel_executor.md
+++ b/doc/fluid/design/concepts/parallel_executor.md
--- a/doc/fluid/design/muti_devices/kernel_hint_design.md
+++ b/doc/fluid/design/muti_devices/kernel_hint_design.md
@ -1,4 +1,6 @@
-# Problem
+# Kernel Hint Design
+
+## Problem
 In PaddlePaddle's [Design](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/switch_kernel.md), one Operator may have multiple kernels. Users may have some personal preference to choose a certain type of kernel for an operator, such as `force_cpu` to choose a CPU kernel, `use_cudnn` to choose a CUDNN kernel, we need to provide a way for users to do this.

 In the current design, we use KernelType to describe one kernel.
--- a/doc/fluid/design/muti_devices/kernel_selection.md
+++ b/doc/fluid/design/muti_devices/kernel_selection.md
@ -1,4 +1,6 @@
-# Background
+# Kernel Selection
+
+## Background
 Every operator has many kernels because there are multiple data types, places, data layout, library type that Fluid supports. We use the `OpKernelType ` to describe kernel types that operators can hold.

 The `OpKernelType ` is as follows:
--- a/doc/v2/build_and_install/index_en.rst
+++ b/doc/v2/build_and_install/index_en.rst
@ -1,32 +1,56 @@
-Install and Build
-=================
+install and Compile
+==========

 .. _install_steps:

-Install Steps
-++++++++
+PaddlePaddle provides various methods of installation for many different users

-You can choose either pip or Docker to complete your install:
+Focus on Deep Learning Model Development
+-----------------
+
+PaddlePaddle provides lots of packages of python wheel , that pip can install:

 .. toctree::
 	:maxdepth: 1

 	pip_install_en.rst
+
+This is the most convenient way of installation. Please choose the right installation package with machine configure and system.
+
+Follow the Bottom Frame
+----------
+
+PaddlePaddle also supports installation using Docker. Please refer to the tutorial below:
+
+.. toctree::
+	:maxdepth: 1
+
 	docker_install_en.rst

-Build from Source
-----------------
+We recommend running PaddlePaddle in Docker. This method has the following advantages：

-..  warning::
+- Does not require installation of third-party dependencies. 
+- Easy to share runtime environment. 

-    We recommend to directly install via above installation steps, you'll only need to build PaddlePaddle from source when you need a modifed binary.
+Lastly, users can also compile and install PaddlePaddle from source code. The instructions are below:

 .. toctree::
    :maxdepth: 1

-    build_from_source_en.md
+    build_from_source_en.rst
+
+.. warning::
+
+	One caveat with this approach is that developers will have to download, compile and install all third-party dependencies. Thus this process of installation is more time consuming.
+

 FAQ
-++++++++++
+-----------
+
+For any problems during installation, please refer to the page below for answers:
+
+:ref:`常见问题解答 <install_faq>`
+
+If the problem still persists, you are welcome to seek assistance from the PaddlePaddle community：

-`FAQ <http://www.paddlepaddle.org/docs/develop/documentation/zh/faq/build_and_install/index_en.html>`_
+`创建issue <https://github.com/PaddlePaddle/Paddle/issues/new>`_
--- a/paddle/.gitignore
+++ b/paddle/.gitignore
@ -1,3 +1,4 @@
+.timestamp
 *.o
 *.a
 .svn
--- a/paddle/fluid/framework/data_device_transform_test.cu
+++ b/paddle/fluid/framework/data_device_transform_test.cu
@ -105,7 +105,7 @@ static void BuildVar(const std::string& param_name,
 TEST(Operator, CPUtoGPU) {
  using namespace paddle::framework;
  using namespace paddle::platform;
-  InitDevices();
+  InitDevices(true);

  paddle::framework::Scope scope;
  paddle::platform::CPUPlace cpu_place;
--- a/paddle/fluid/framework/details/multi_devices_graph_builder.cc
+++ b/paddle/fluid/framework/details/multi_devices_graph_builder.cc
@ -59,7 +59,11 @@ std::unique_ptr<SSAGraph> MultiDevSSAGraphBuilder::Build(
  auto graph = new SSAGraph();
  SSAGraph &result = *graph;
  std::unordered_set<std::string> og_has_been_broadcast;
-  result.vars_.resize(places_.size());
+
+  // We cannot invoke resize. It is a bug of GCC 4.8
+  result.vars_ = std::vector<
+      std::unordered_map<std::string, std::vector<std::unique_ptr<VarHandle>>>>(
+      places_.size());

  bool is_forwarding = true;
  for (auto *op : program.Block(0).AllOps()) {
@ -147,15 +151,16 @@ std::unique_ptr<SSAGraph> MultiDevSSAGraphBuilder::Build(
            if (vars.empty()) {  // This device has no data. continue.
              continue;
            }
-            auto *prev_grad = &vars[vars.size() - 1];
-            op_handle->AddInput(prev_grad);
+            auto &prev_grad = vars[vars.size() - 1];
+            op_handle->AddInput(prev_grad.get());

-            auto &var = vars[vars.size()];
-            var.place_ = p;
-            var.name_ = og;
-            var.version_ = vars.size() - 1;
+            vars.emplace_back(new VarHandle);
+            auto &var = vars.back();
+            var->place_ = p;
+            var->name_ = og;
+            var->version_ = vars.size() - 1;

-            op_handle->AddOutput(&var);
+            op_handle->AddOutput(var.get());
          }
 #else
          PADDLE_ENFORCE("Not implemented");
--- a/paddle/fluid/framework/details/ssa_graph.h
+++ b/paddle/fluid/framework/details/ssa_graph.h
@ -16,6 +16,8 @@

 #include <map>
 #include <string>
+#include <vector>
+
 #include "paddle/fluid/framework/details/op_handle_base.h"
 #include "paddle/fluid/framework/details/var_handle.h"

@ -24,7 +26,9 @@ namespace framework {
 namespace details {

 struct SSAGraph {
-  std::vector<std::unordered_map<std::string, std::map<int, VarHandle>>> vars_;
+  std::vector<
+      std::unordered_map<std::string, std::vector<std::unique_ptr<VarHandle>>>>
+      vars_;
  // aux variables to represent dependency. Useful to resolve data hazard.
  std::unordered_set<std::unique_ptr<VarHandleBase>> dep_vars_;
  std::vector<std::unique_ptr<OpHandleBase>> ops_;
--- a/paddle/fluid/framework/details/ssa_graph_builder.cc
+++ b/paddle/fluid/framework/details/ssa_graph_builder.cc
@ -27,8 +27,8 @@ void SSAGraphBuilder::PolishGraphToSupportDataHazards(SSAGraph *graph) {
      auto it_old = name_pair.second.rbegin();
      ++it_old;
      for (; it_old != name_pair.second.rend(); it_new = it_old, ++it_old) {
-        auto *write_op = it_new->second.generated_op_;
-        auto &read_ops = it_old->second.pending_ops_;
+        auto *write_op = (*it_new)->generated_op_;
+        auto &read_ops = (*it_old)->pending_ops_;

        for (auto *read_op : read_ops) {
          // Manually add a dependency var from read_op to write_op;
@ -54,14 +54,15 @@ VarHandle *SSAGraphBuilder::CreateOrGetLatestVarHandle(
  auto &var_holder = var_holders[each_var_name];
  VarHandle *var = nullptr;
  if (var_holder.empty()) {
+    var_holder.emplace_back(new VarHandle);
    auto &init_var = var_holder[0];
-    init_var.place_ = place;
-    init_var.name_ = each_var_name;
-    init_var.generated_op_ = nullptr;
-    init_var.version_ = 0;
-    var = &init_var;
+    init_var->place_ = place;
+    init_var->name_ = each_var_name;
+    init_var->generated_op_ = nullptr;
+    init_var->version_ = 0;
+    var = init_var.get();
  } else {
-    var = &var_holder.rbegin()->second;
+    var = var_holder.rbegin()->get();
  }
  return var;
 }
@ -72,11 +73,12 @@ void SSAGraphBuilder::CreateOpOutput(SSAGraph *graph, OpHandleBase *op_handle,
                                     size_t place_offset) {
  auto &vars = graph->vars_[place_offset][each_var_name];
  size_t version = vars.size();
-  auto &var = vars[version];
-  var.version_ = version;
-  var.name_ = each_var_name;
-  var.place_ = place;
-  op_handle->AddOutput(&var);
+  vars.emplace_back(new VarHandle());
+  auto &var = vars.back();
+  var->version_ = version;
+  var->name_ = each_var_name;
+  var->place_ = place;
+  op_handle->AddOutput(var.get());
 }

 template <typename Callback>
@ -84,7 +86,7 @@ void IterAllVar(const SSAGraph &graph, Callback callback) {
  for (auto &each : graph.vars_) {
    for (auto &pair1 : each) {
      for (auto &pair2 : pair1.second) {
-        callback(pair2.second);
+        callback(*pair2);
      }
    }
  }
--- a/paddle/fluid/framework/details/threaded_ssa_graph_executor.cc
+++ b/paddle/fluid/framework/details/threaded_ssa_graph_executor.cc
@ -69,7 +69,7 @@ FeedFetchList ThreadedSSAGraphExecutor::Run(
  for (auto &var_map : graph_->vars_) {
    for (auto &name_pair : var_map) {
      for (auto &version_pair : name_pair.second) {
-        InsertPendingVar(version_pair.second);
+        InsertPendingVar(*version_pair);
      }
    }
  }
@ -95,7 +95,7 @@ FeedFetchList ThreadedSSAGraphExecutor::Run(
    for (auto &var_map : graph_->vars_) {
      auto it = var_map.find(fetch_var_name);
      if (it != var_map.end()) {
-        fetched_vars[fetch_var_name].push_back(&it->second.rbegin()->second);
+        fetched_vars[fetch_var_name].push_back(it->second.rbegin()->get());
      }
    }
  }
--- a/paddle/fluid/framework/init.cc
+++ b/paddle/fluid/framework/init.cc
@ -64,7 +64,7 @@ void InitP2P(int count) {
 #endif
 }

-void InitDevices() {
+void InitDevices(bool init_p2p) {
  /*Init all avaiable devices by default */

  std::vector<platform::Place> places;
@ -85,7 +85,9 @@ void InitDevices() {
  for (int i = 0; i < count; ++i) {
    places.emplace_back(platform::CUDAPlace(i));
  }
+  if (init_p2p) {
    InitP2P(count);
+  }
  platform::DeviceContextPool::Init(places);
 }

--- a/paddle/fluid/framework/init.h
+++ b/paddle/fluid/framework/init.h
@ -24,7 +24,7 @@ void InitGflags(std::vector<std::string> &argv);

 void InitGLOG(const std::string &prog_name);

-void InitDevices();
+void InitDevices(bool init_p2p);

 }  // namespace framework
 }  // namespace paddle
--- a/paddle/fluid/framework/init_test.cc
+++ b/paddle/fluid/framework/init_test.cc
@ -21,7 +21,7 @@ TEST(InitDevices, CPU) {
  using paddle::platform::DeviceContextPool;

 #ifndef PADDLE_WITH_CUDA
-  InitDevices();
+  InitDevices(true);
  DeviceContextPool& pool = DeviceContextPool::Instance();
  ASSERT_EQ(pool.size(), 1U);
 #endif
@ -33,7 +33,7 @@ TEST(InitDevices, CUDA) {

 #ifdef PADDLE_WITH_CUDA
  int count = paddle::platform::GetCUDADeviceCount();
-  InitDevices();
+  InitDevices(true);
  DeviceContextPool& pool = DeviceContextPool::Instance();
  ASSERT_EQ(pool.size(), 1U + static_cast<unsigned>(count));
 #endif
--- a/paddle/fluid/framework/lod_tensor_test.cu
+++ b/paddle/fluid/framework/lod_tensor_test.cu
@ -30,7 +30,7 @@ __global__ void test(size_t* a, int size) {
 }

 TEST(LoD, data) {
-  paddle::framework::InitDevices();
+  paddle::framework::InitDevices(true);

  paddle::framework::LoD lod{{0, 1, 2}};
  lod.push_back({0, 2, 4, 5});
@ -46,7 +46,7 @@ TEST(LoD, data) {
 }

 TEST(LoDTensor, LoDInGPU) {
-  paddle::framework::InitDevices();
+  paddle::framework::InitDevices(true);

  paddle::framework::LoDTensor lod_tensor;
  paddle::platform::CUDAPlace place(0);
--- a/paddle/fluid/framework/operator_test.cc
+++ b/paddle/fluid/framework/operator_test.cc
@ -72,7 +72,7 @@ REGISTER_OP_WITHOUT_GRADIENT(test_operator,
                             paddle::framework::OpWithoutKernelCheckerMaker);

 TEST(OperatorBase, all) {
-  paddle::framework::InitDevices();
+  paddle::framework::InitDevices(true);
  paddle::framework::proto::OpDesc op_desc;
  op_desc.set_type("test_operator");
  BuildVar("input", {"IN1"}, op_desc.add_inputs());
@ -198,7 +198,7 @@ REGISTER_OP_CPU_KERNEL(op_with_kernel,

 // test with single input
 TEST(OpKernel, all) {
-  paddle::framework::InitDevices();
+  paddle::framework::InitDevices(true);
  paddle::framework::proto::OpDesc op_desc;
  op_desc.set_type("op_with_kernel");
  BuildVar("x", {"IN1"}, op_desc.add_inputs());
@ -228,7 +228,7 @@ REGISTER_OP_CPU_KERNEL(op_multi_inputs_with_kernel,
 TEST(OpKernel, multi_inputs) {
  using namespace paddle::framework;

-  paddle::framework::InitDevices();
+  paddle::framework::InitDevices(true);
  proto::OpDesc op_desc;

  op_desc.set_type("op_multi_inputs_with_kernel");
@ -269,7 +269,7 @@ class OperatorClone : public paddle::framework::OperatorBase {
 };

 TEST(Operator, Clone) {
-  paddle::framework::InitDevices();
+  paddle::framework::InitDevices(true);
  OperatorClone a("ABC", paddle::framework::VariableNameMap{},
                  paddle::framework::VariableNameMap{},
                  paddle::framework::AttributeMap{});
--- a/Show More
+++ b/Show More