Merge branch 'develop' of https://github.com/PaddlePaddle/Paddle into dev_add_doc

7 years ago · 8a136d142f
parent 980499faf1 9169b3b802
commit 8a136d142f
42 changed files with 1946 additions and 139 deletions
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@ -61,6 +61,7 @@ option(EIGEN_USE_THREADS "Compile with multi-threaded Eigen"            OFF)
 option(WITH_ARM_FP16    "Use half precision support on armv8.2-a cpu"   OFF)
 option(WITH_FAST_BUNDLE_TEST    "Bundle tests that can be run in a single process together to reduce launch overhead"   OFF)
 option(WITH_CONTRIB     "Compile the third-party contributation"        OFF)
 option(WITH_ANAKIN      "Compile with Anakin library"                   OFF)
 option(WITH_GRPC     "Use grpc as the default rpc framework"            ${WITH_DISTRIBUTE})
 # CMAKE_BUILD_TYPE
@ -193,7 +194,10 @@ set(EXTERNAL_LIBS
 if(WITH_GPU)
    include(cuda)
    include(tensorrt)
-endif(WITH_GPU)
+    include(external/anakin)
 else()
  set(WITH_ANAKIN OFF CACHE STRING "Anakin is valid only when GPU is set." FORCE)
 endif()
 if(WITH_AMD_GPU)
    find_package(HIP)
--- a/cmake/external/anakin.cmake
+++ b/cmake/external/anakin.cmake
@ -0,0 +1,42 @@
 # Anakin support is opt-in: leave this file immediately unless WITH_ANAKIN is set.
 if (NOT WITH_ANAKIN)
   return()
 endif()
 # Directory the Anakin release archive is downloaded to and unpacked in.
 # FORCE rewrites the cache entry on every configure run.
 set(ANAKIN_INSTALL_DIR "${THIRD_PARTY_PATH}/install/anakin" CACHE PATH
   "Anakin install path." FORCE)
 # Both the header root and the library path default to the install directory.
 # These entries are not FORCEd, so a value already present in the cache wins.
 set(ANAKIN_INCLUDE "${ANAKIN_INSTALL_DIR}" CACHE STRING "root of Anakin header files")
 set(ANAKIN_LIBRARY "${ANAKIN_INSTALL_DIR}" CACHE STRING "path of Anakin library")
 # Warning flags for code that compiles against Anakin: most entries stop a
 # diagnostic from being treated as an error, and -Wno-reorder disables one outright.
 set(ANAKIN_COMPILE_EXTRA_FLAGS -Wno-error=unused-variable -Wno-error=format-extra-args -Wno-error=comment -Wno-error=format -Wno-error=switch -Wno-error=return-type -Wno-error=non-virtual-dtor -Wno-reorder -Wno-error=cpp)
 # Location of the prebuilt release archive fetched at configure time.
 set(ANAKIN_LIBRARY_URL "https://github.com/pangge/Anakin/releases/download/3.0/anakin_release_simple.tar.gz")
 # Adds root_dir and every directory nested below it to the include path.
 # Anakin currently needs this because using it requires recursively including
 # nearly all of its header directories.
 #
 # Arguments:
 #   root_dir - directory at which the recursive walk starts.
 function(fetch_include_recursively root_dir)
     if (IS_DIRECTORY ${root_dir})
         include_directories(${root_dir})
     endif()

     # Entry names are collected relative to root_dir and re-joined below.
     file(GLOB child_names RELATIVE ${root_dir} ${root_dir}/*)
     foreach(child_name IN LISTS child_names)
         set(child_path ${root_dir}/${child_name})
         if (IS_DIRECTORY ${child_path})
             fetch_include_recursively(${child_path})
         endif()
     endforeach()
 endfunction()
 # Download and unpack the prebuilt Anakin release at configure time.
 # The install directory is recreated from scratch so files left over from an
 # earlier configure run are never mixed with the fresh archive.
 message(STATUS "Download Anakin library from ${ANAKIN_LIBRARY_URL}")
 file(REMOVE_RECURSE "${ANAKIN_INSTALL_DIR}")
 file(MAKE_DIRECTORY "${ANAKIN_INSTALL_DIR}")

 # Fetch the archive and stop the configure step if the download fails;
 # continuing would only surface later as missing headers or libraries.
 execute_process(
     COMMAND wget -q "${ANAKIN_LIBRARY_URL}"
     WORKING_DIRECTORY "${ANAKIN_INSTALL_DIR}"
     RESULT_VARIABLE anakin_download_result)
 if (NOT "${anakin_download_result}" STREQUAL "0")
     message(FATAL_ERROR
         "Failed to download Anakin from ${ANAKIN_LIBRARY_URL} (wget result: ${anakin_download_result})")
 endif()

 # The archive file name is taken from the URL so the two cannot drift apart.
 get_filename_component(anakin_archive_name "${ANAKIN_LIBRARY_URL}" NAME)
 execute_process(
     COMMAND ${CMAKE_COMMAND} -E tar xzf "${anakin_archive_name}"
     WORKING_DIRECTORY "${ANAKIN_INSTALL_DIR}"
     RESULT_VARIABLE anakin_extract_result)
 if (NOT "${anakin_extract_result}" STREQUAL "0")
     message(FATAL_ERROR
         "Failed to extract ${ANAKIN_INSTALL_DIR}/${anakin_archive_name} (tar result: ${anakin_extract_result})")
 endif()

 # WITH_ANAKIN is known to be ON here: this file returns early otherwise.
 message(STATUS "Anakin for inference is enabled")
 message(STATUS "Anakin is set INCLUDE:${ANAKIN_INCLUDE} LIBRARY:${ANAKIN_LIBRARY}")
 fetch_include_recursively(${ANAKIN_INCLUDE})
 link_directories(${ANAKIN_LIBRARY})
--- a/cmake/external/openblas.cmake
+++ b/cmake/external/openblas.cmake
@ -29,6 +29,8 @@ IF(NOT ${CBLAS_FOUND})
        "${CBLAS_INSTALL_DIR}/lib/${CMAKE_STATIC_LIBRARY_PREFIX}openblas${CMAKE_STATIC_LIBRARY_SUFFIX}"
        CACHE FILEPATH "openblas library." FORCE)
    ADD_DEFINITIONS(-DPADDLE_USE_OPENBLAS)
    SET(OPENBLAS_CC "${CMAKE_C_COMPILER} -Wno-unused-but-set-variable -Wno-unused-variable")
    SET(OPENBLAS_COMMIT "v0.2.20")
--- a/doc/fluid/api/gen_doc.sh
+++ b/doc/fluid/api/gen_doc.sh
@ -1,5 +1,5 @@
 #!/bin/bash
-python gen_doc.py layers --submodules control_flow device io nn ops tensor detection > layers.rst
+python gen_doc.py layers --submodules control_flow device io nn ops tensor detection learning_rate_scheduler > layers.rst
 for module in data_feeder clip metrics executor initializer io nets optimizer param_attr profiler regularizer
 do
--- a/doc/fluid/api/layers.rst
+++ b/doc/fluid/api/layers.rst
@ -1041,3 +1041,42 @@ box_coder
 ..  autofunction:: paddle.fluid.layers.box_coder
    :noindex:
 learning_rate_scheduler
 =======================
 exponential_decay
 -----------------
 ..  autofunction:: paddle.fluid.layers.exponential_decay
    :noindex:
 natural_exp_decay
 -----------------
 ..  autofunction:: paddle.fluid.layers.natural_exp_decay
    :noindex:
 inverse_time_decay
 ------------------
 ..  autofunction:: paddle.fluid.layers.inverse_time_decay
    :noindex:
 polynomial_decay
 ----------------
 ..  autofunction:: paddle.fluid.layers.polynomial_decay
    :noindex:
 piecewise_decay
 ---------------
 ..  autofunction:: paddle.fluid.layers.piecewise_decay
    :noindex:
 noam_decay
 ----------
 ..  autofunction:: paddle.fluid.layers.noam_decay
    :noindex:
--- a/doc/survey/dynamic_graph.md
+++ b/doc/survey/dynamic_graph.md
@ -171,7 +171,7 @@ Pytorch chooses immediate evaluation. It avoids ever materializing a "forward gr
 ## What can fluid learn from them?
-TBD
+Please refer to `paddle/contrib/dynamic/`.
 # Appendix
--- a/paddle/contrib/CMakeLists.txt
+++ b/paddle/contrib/CMakeLists.txt
@ -14,3 +14,4 @@
 #
 add_subdirectory(inference)
 add_subdirectory(tape)
--- a/paddle/contrib/inference/CMakeLists.txt
+++ b/paddle/contrib/inference/CMakeLists.txt
@ -17,48 +17,9 @@ if(APPLE)
    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-error=pessimizing-move")
 endif(APPLE)
 set(ANAKIN_INCLUDE "" CACHE STRING "root of Anakin header files")
 set(ANAKIN_LIBRARY "" CACHE STRING "path of Anakin library")
 set(inference_deps paddle_inference_api paddle_fluid_api)
 # if anakin is set enable anakin api implementation
 if(ANAKIN_INCLUDE AND ANAKIN_LIBRARY)
    set(ANAKIN_FOUND ON)
 else()
    set(ANAKIN_FOUND OFF)
 endif()
 function(fetch_include_recursively root_dir) 
    if (IS_DIRECTORY ${root_dir}) 
        include_directories(${root_dir})
    endif()
    file(GLOB ALL_SUB RELATIVE ${root_dir} ${root_dir}/*)
    foreach(sub ${ALL_SUB})
        if (IS_DIRECTORY ${root_dir}/${sub})
            fetch_include_recursively(${root_dir}/${sub})
        endif()
    endforeach()
 endfunction()
 if (ANAKIN_FOUND)
    # Anakin's code style doesn't follow google c style.
    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-error=unused-variable -Wno-error=format-extra-args -Wno-error=comment -Wno-error=format -Wno-error=switch -Wno-error=return-type -Wno-error=non-virtual-dtor -Wno-reorder -Wno-error=cpp")
    message(STATUS "Anakin for inference is enabled")
    message(STATUS "Anakin is set INCLUDE:${ANAKIN_INCLUDE} LIBRARY:${ANAKIN_LIBRARY}")
    fetch_include_recursively(${ANAKIN_INCLUDE})
    link_directories(${ANAKIN_LIBRARY})
    nv_library(inference_anakin_api SHARED SRCS paddle_inference_api.cc paddle_inference_api_anakin_engine.cc)
    target_link_libraries(inference_anakin_api anakin anakin_saber_common)
    list(APPEND inference_deps inference_anakin_api)
 endif()
 function(inference_api_test TARGET_NAME)
    if (WITH_TESTING)
        set(options "")
@ -79,7 +40,7 @@ function(inference_api_test TARGET_NAME)
 endfunction(inference_api_test)
 cc_library(paddle_inference_api
-    SRCS paddle_inference_api.cc paddle_inference_api_impl.cc 
+    SRCS paddle_inference_api.cc paddle_inference_api_impl.cc
    DEPS ${FLUID_CORE_MODULES} ${GLOB_OP_LIB})
 cc_test(test_paddle_inference_api
@ -89,9 +50,17 @@ cc_test(test_paddle_inference_api
 inference_api_test(test_paddle_inference_api_impl
                    ARGS test_word2vec test_image_classification)
-if (ANAKIN_FOUND)
+if (WITH_ANAKIN)
    # Anakin has no official library releases, and its protobuf and CUDA versions do not match Paddle's, so the
    # Anakin library is not merged into our official inference library. To use the Anakin prediction API, one
    # needs to compile libinference_anakin_api.a and build it together with anakin.so.
    nv_library(inference_anakin_api SHARED SRCS paddle_inference_api.cc paddle_inference_api_anakin_engine.cc)
    target_compile_options(inference_anakin_api BEFORE PUBLIC ${ANAKIN_COMPILE_EXTRA_FLAGS})
    target_link_libraries(inference_anakin_api anakin anakin_saber_common)
    cc_test(inference_anakin_test SRCS paddle_inference_api_anakin_engine_tester.cc
-    DEPS ${inference_deps})
+                                  ARGS --model=${ANAKIN_INSTALL_DIR}/mobilenet_v2.anakin.bin
                                  DEPS inference_anakin_api)
    target_compile_options(inference_anakin_test BEFORE PUBLIC ${ANAKIN_COMPILE_EXTRA_FLAGS})
 endif()
 if(WITH_TESTING)
--- a/paddle/contrib/inference/paddle_inference_api_anakin_engine.cc
+++ b/paddle/contrib/inference/paddle_inference_api_anakin_engine.cc
@ -12,9 +12,8 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 #include <cuda.h>
 #include "paddle/contrib/inference/paddle_inference_api_anakin_engine.h"
 #include <cuda.h>
 namespace paddle {
--- a/paddle/contrib/inference/paddle_inference_api_anakin_engine.h
+++ b/paddle/contrib/inference/paddle_inference_api_anakin_engine.h
@ -19,10 +19,9 @@ limitations under the License. */
 #pragma once
 // NOTE This header file do not have namespace.
 //#include <test/framework/net/paddle_api.h>
 #include "paddle/contrib/inference/paddle_inference_api.h"
 // from anakin
 #include "framework/core/net/net.h"
 #include "saber/saber_types.h"
--- a/paddle/contrib/inference/paddle_inference_api_anakin_engine_tester.cc
+++ b/paddle/contrib/inference/paddle_inference_api_anakin_engine_tester.cc
@ -12,17 +12,19 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 #include <gflags/gflags.h>
 #include <glog/logging.h>
 #include <gtest/gtest.h>
 #include "gflags/gflags.h"
 #include "paddle/contrib/inference/paddle_inference_api.h"
 DEFINE_string(model, "", "Directory of the inference model.");
 namespace paddle {
 AnakinConfig GetConfig() {
  AnakinConfig config;
-  config.model_file = "./mobilenet_v2.anakin.bin";
+  config.model_file = FLAGS_model;
  config.device = 0;
  config.max_batch_size = 1;
  return config;
--- a/paddle/contrib/tape/CMakeLists.txt
+++ b/paddle/contrib/tape/CMakeLists.txt
@ -0,0 +1,25 @@
 # Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
 #
 # http://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
 #
 # On Apple platforms, do not treat pessimizing-move diagnostics as errors.
 if(APPLE)
     set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-error=pessimizing-move")
 endif()

 # tape_variable: the Variable wrapper used by the tape.
 cc_library(tape_variable
            SRCS variable.cc
            DEPS ${FLUID_CORE_MODULES})

 # tape: the tape itself; depends on the operator libraries and tape_variable.
 cc_library(tape
            SRCS tape.cc
            DEPS ${FLUID_CORE_MODULES} ${GLOB_OP_LIB} tape_variable)

 # Unit test for the tape library.
 cc_test(test_tape SRCS test_tape.cc DEPS tape tape_variable)
--- a/paddle/contrib/tape/README.md
+++ b/paddle/contrib/tape/README.md
--- a/paddle/contrib/tape/computation_graph.png
+++ b/paddle/contrib/tape/computation_graph.png
--- a/paddle/contrib/tape/function.h
+++ b/paddle/contrib/tape/function.h
@ -0,0 +1,131 @@
 // Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
 // You may obtain a copy of the License at
 //
 //     http://www.apache.org/licenses/LICENSE-2.0
 //
 // Unless required by applicable law or agreed to in writing, software
 // distributed under the License is distributed on an "AS IS" BASIS,
 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 // See the License for the specific language governing permissions and
 // limitations under the License.
 #pragma once
 #include <string>
 #include "paddle/contrib/tape/tape.h"
 #include "paddle/contrib/tape/variable.h"
 #include "paddle/fluid/framework/type_defs.h"
 namespace paddle {
 namespace tape {
 // Empty placeholder type; no other class in this header derives from or uses it.
 class Function {};
 class Fill {
 public:
  Fill(const std::string &initializer, const framework::AttributeMap &attrs)
      : initializer_(initializer), attrs_(attrs) {}
  void operator()(VariableHandle var) {
    get_global_tape().AddOp(initializer_, {}, {{"Out", {var}}}, attrs_);
  }
 private:
  const std::string initializer_;
  const framework::AttributeMap attrs_;
 };
 // Functor that adds a "mean" op to the global tape.
 class Mean {
  public:
   // Adds a "mean" op reading `var` through slot "X" and writing a freshly
   // created variable through slot "Out"; returns that output variable.
   VariableHandle operator()(VariableHandle var) {
     VariableHandle result(new Variable("mean"));
     const VariableHandleMap inputs = {{"X", {var}}};
     const VariableHandleMap outputs = {{"Out", {result}}};
     get_global_tape().AddOp("mean", inputs, outputs, {});
     return result;
   }
 };
 class Linear {
 public:
  Linear(int in_dim, int out_dim, const std::string &act)
      : w_(new Variable("LinearWeight")),
        b_(new Variable("LinearBias")),
        act_(act) {
    Tape init_tape;
    std::string initializer = "fill_constant";
    framework::AttributeMap attrs;
    attrs["dtype"] = paddle::framework::proto::VarType::Type::VarType_Type_FP32;
    attrs["shape"] = std::vector<int>{in_dim, out_dim};
    attrs["value"] = 1.0f;
    init_tape.AddOp(initializer, {}, {{"Out", {w_}}}, attrs);
    attrs["dtype"] = paddle::framework::proto::VarType::Type::VarType_Type_FP32;
    attrs["shape"] = std::vector<int>{out_dim};
    attrs["value"] = 1.0f;
    init_tape.AddOp(initializer, {}, {{"Out", {b_}}}, attrs);
    init_tape.Forward();
  }
  VariableHandle operator()(VariableHandle input) {
    VariableHandle pre_bias(new Variable("linear"));
    get_global_tape().AddOp("mul",
                            {{"X", {input}}, {"Y", {w_}}},
                            {{"Out", {pre_bias}}},
                            {{"x_num_col_dims", 1}, {"y_num_col_dims", 1}});
    VariableHandle pre_act(new Variable("linear"));
    get_global_tape().AddOp("elementwise_add",
                            {{"X", {pre_bias}}, {"Y", {b_}}},
                            {{"Out", {pre_act}}},
                            {{"axis", 1}});
    VariableHandle post_act(new Variable("linear"));
    get_global_tape().AddOp(
        act_, {{"X", {pre_act}}}, {{"Out", {post_act}}}, {});
    return post_act;
  }
  std::vector<VariableHandle> Params() { return {w_, b_}; }
 private:
  VariableHandle w_;
  VariableHandle b_;
  std::string act_;
 };
 class SGD {
 public:
  SGD(float learning_rate) : learning_rate_(new Variable("sgd")) {
    Tape init_tape;
    std::string initializer = "fill_constant";
    framework::AttributeMap attrs;
    attrs["dtype"] = paddle::framework::proto::VarType::Type::VarType_Type_FP32;
    attrs["shape"] = std::vector<int>{1};
    attrs["value"] = learning_rate;
    init_tape.AddOp(initializer, {}, {{"Out", {learning_rate_}}}, attrs);
    init_tape.Forward();
  }
  void operator()(VariableHandle input) {
    PADDLE_ENFORCE(get_global_tape().HasBeenBackwarded(),
                   "optimization must happen after the backward");
    Tape temp_tape;
    temp_tape.AddOp("sgd",
                    {{"Param", {input}},
                     {"LearningRate", {learning_rate_}},
                     {"Grad", {input->Grad()}}},
                    {{"ParamOut", {input}}},
                    {});
    temp_tape.Forward();
  }
 private:
  VariableHandle learning_rate_;
 };
 }
 }
--- a/paddle/contrib/tape/tape.cc
+++ b/paddle/contrib/tape/tape.cc
--- a/paddle/contrib/tape/tape.h
+++ b/paddle/contrib/tape/tape.h
@ -0,0 +1,64 @@
 // Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
 // You may obtain a copy of the License at
 //
 //     http://www.apache.org/licenses/LICENSE-2.0
 //
 // Unless required by applicable law or agreed to in writing, software
 // distributed under the License is distributed on an "AS IS" BASIS,
 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 // See the License for the specific language governing permissions and
 // limitations under the License.
 #pragma once
 #include <map>
 #include <memory>
 #include <string>
 #include <vector>
 #include "paddle/contrib/tape/variable.h"
 namespace paddle {
 namespace tape {
 using VariableHandleMap = std::map<std::string, std::vector<VariableHandle>>;
 // One tape entry: an operator type name together with the variables bound to
 // its input and output slots and its attributes. All four are stored by copy.
 struct OpHandle {
   OpHandle(const std::string &type,
            const VariableHandleMap &in_vars,
            const VariableHandleMap &out_vars,
            const framework::AttributeMap &attrs)
       : type_(type),
         inputs_(in_vars),
         outputs_(out_vars),
         attrs_(attrs) {}

   std::string type_;               // operator type name
   VariableHandleMap inputs_;       // slot name -> input variables
   VariableHandleMap outputs_;      // slot name -> output variables
   framework::AttributeMap attrs_;  // operator attributes
 };
 // Sequence of OpHandle entries plus the state needed to run them.
 // AddOp, Forward and Backward are only declared here; their definitions are
 // not part of this header.
 class Tape {
  public:
   // Adds an op of the given type with the given input/output bindings and
   // attributes. `out_vars` is taken by value.
   void AddOp(const std::string &type,
              const VariableHandleMap &in_vars,
              VariableHandleMap out_vars,
              const framework::AttributeMap &attrs);

   // NOTE(review): presumably executes the recorded ops starting at
   // current_position_ -- confirm against the definition.
   void Forward();

   // NOTE(review): presumably builds and runs the backward pass for `target`
   // and sets has_been_backwarded_ -- confirm against the definition.
   void Backward(VariableHandle target);

   // Read-only query, const-qualified so it can be used on a const Tape.
   bool HasBeenBackwarded() const { return has_been_backwarded_; }

  private:
   bool has_been_backwarded_ = false;
   size_t current_position_ = 0;

   std::vector<OpHandle> tape_;
   std::shared_ptr<Tape> backward_tape_;
 };
 Tape &get_global_tape();
 void reset_global_tape();
 }
 }
--- a/paddle/contrib/tape/test_tape.cc
+++ b/paddle/contrib/tape/test_tape.cc
@ -0,0 +1,61 @@
 // Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
 // You may obtain a copy of the License at
 //
 //     http://www.apache.org/licenses/LICENSE-2.0
 //
 // Unless required by applicable law or agreed to in writing, software
 // distributed under the License is distributed on an "AS IS" BASIS,
 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 // See the License for the specific language governing permissions and
 // limitations under the License.
 #include "gtest/gtest.h"
 #include "paddle/contrib/tape/function.h"
 using namespace paddle::tape;
 // Builds a two-layer MLP with a mean loss and runs two iterations of
 // forward-graph construction, backward and SGD updates. The test makes no
 // explicit assertions; it passes if nothing throws.
 TEST(Tape, TestMLP) {
   LOG(INFO) << "TestMLP";
   Linear linear1(3, 3, "relu");
   Linear linear2(3, 3, "relu");
   Mean mean;
   SGD sgd(0.001);

   // The input is a 3x3 FP32 tensor filled with the constant 1.0.
   paddle::framework::AttributeMap input_attrs;
   input_attrs["dtype"] =
       paddle::framework::proto::VarType::Type::VarType_Type_FP32;
   input_attrs["shape"] = std::vector<int>{3, 3};
   input_attrs["value"] = 1.0f;
   const std::string input_initializer = "fill_constant";
   Fill filler(input_initializer, input_attrs);

   for (int step = 0; step < 2; ++step) {
     // Each iteration starts from a fresh global tape.
     reset_global_tape();

     VariableHandle input(new Variable("input"));
     filler(input);

     auto loss = mean(linear2(linear1(input)));
     get_global_tape().Backward(loss);

     // Update linear1's parameters first, then linear2's.
     for (Linear *layer : {&linear1, &linear2}) {
       for (auto param : layer->Params()) {
         sgd(param);
       }
     }
   }
 }
 int main(int argc, char** argv) {
  std::vector<paddle::platform::Place> places;
  places.emplace_back(paddle::platform::CPUPlace());
  paddle::platform::DeviceContextPool::Init(places);
  testing::InitGoogleTest(&argc, argv);
  return RUN_ALL_TESTS();
 }
--- a/paddle/contrib/tape/variable.cc
+++ b/paddle/contrib/tape/variable.cc
@ -0,0 +1,33 @@
 // Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
 // You may obtain a copy of the License at
 //
 //     http://www.apache.org/licenses/LICENSE-2.0
 //
 // Unless required by applicable law or agreed to in writing, software
 // distributed under the License is distributed on an "AS IS" BASIS,
 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 // See the License for the specific language governing permissions and
 // limitations under the License.
 #include "paddle/contrib/tape/variable.h"
 namespace paddle {
 namespace tape {
 void Variable::InitializeVariable() {
  LOG(INFO) << "Initialzing " << desc_.Name() << " as " << desc_.GetType();
  framework::proto::VarType::Type var_type = desc_.GetType();
  if (var_type == framework::proto::VarType::LOD_TENSOR) {
    var_.GetMutable<framework::LoDTensor>();
  } else if (var_type == framework::proto::VarType::SELECTED_ROWS) {
    var_.GetMutable<framework::SelectedRows>();
  } else {
    PADDLE_THROW("Variable type %d is not in [LOD_TENSOR, SELECTED_ROWS]",
                 var_type);
  }
 }
 }
 }
--- a/paddle/contrib/tape/variable.h
+++ b/paddle/contrib/tape/variable.h
@ -0,0 +1,85 @@
 // Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
 // You may obtain a copy of the License at
 //
 //     http://www.apache.org/licenses/LICENSE-2.0
 //
 // Unless required by applicable law or agreed to in writing, software
 // distributed under the License is distributed on an "AS IS" BASIS,
 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 // See the License for the specific language governing permissions and
 // limitations under the License.
 #pragma once
 #include <memory>
 #include "paddle/fluid/framework/operator.h"  // framework::kGradVarSuffix
 #include "paddle/fluid/framework/program_desc.h"
 #include "paddle/fluid/framework/variable.h"
 namespace paddle {
 namespace tape {
 class Variable;
 using VariableHandle = std::shared_ptr<Variable>;
 /*
  * Combination of
  *     framework::VarDesc desc_;   (description: name and type)
  *     framework::Variable var_;   (the value itself)
  * plus a weak reference to the matching gradient variable.
  */
 class Variable {
  public:
   // Names the variable `pre_fix` followed by the next value of a counter
   // shared by all Variable instances.
   Variable(const std::string& pre_fix)
       : desc_(pre_fix + std::to_string(count())) {}

   // With is_grad == true the name is `pre_fix` + framework::kGradVarSuffix
   // and the counter is not advanced; otherwise this behaves like the
   // one-argument constructor.
   Variable(const std::string& pre_fix, bool is_grad)
       : desc_(pre_fix + (is_grad ? framework::kGradVarSuffix
                                  : std::to_string(count()))) {}

   ~Variable() { LOG(INFO) << "Deleting " << Name(); }

   // Instantiate LoDTensor/SelectedRow
   void InitializeVariable();

   // Returns the gradient variable, creating it on first use or after the
   // previous one has been destroyed. Only a weak reference is kept here, so
   // the caller's handle is what keeps the gradient alive.
   VariableHandle Grad() {
     // lock() does the liveness check and the handle acquisition in one step,
     // instead of checking expired() and then converting the weak_ptr.
     VariableHandle grad = grad_.lock();
     if (!grad) {
       grad.reset(new Variable(desc_.Name(), true));
       grad_ = grad;
     }
     return grad;
   }

   // Stochastic Gradient Descent with Momentum
   //  VariableHandle Momentum ();

   //  void init(const std::string& initializer,
   //            const framework::AttributeMap& attrs);

   // void value() {};

   const framework::VarDesc& Desc() const { return desc_; }
   framework::VarDesc* MutableDesc() { return &desc_; }

   // TODO(tonyyang-svail): No need to expose name
   std::string Name() const { return desc_.Name(); }

   framework::Variable* Var() { return &var_; }

  private:
   // Returns the current counter value and then increments it. Static because
   // it touches no instance state; the counter is shared by every Variable.
   static int count() {
     static int counter = 0;
     return counter++;
   }

   framework::VarDesc desc_;
   framework::Variable var_;

   std::weak_ptr<Variable> grad_;
 };
 }
 }
--- a/paddle/fluid/framework/executor.cc
+++ b/paddle/fluid/framework/executor.cc
@ -330,8 +330,12 @@ void Executor::RunPreparedContext(ExecutorPrepareContext* ctx, Scope* scope,
  }
  for (auto& op : ctx->ops_) {
-    VLOG(3) << place_ << " " << op->DebugStringEx(local_scope);
+    VLOG(4) << place_ << " " << op->DebugStringEx(local_scope);
    op->Run(*local_scope, place_);
    // NOTE: Please do not delete this line. It is useful because the debug
    // strings before and after op->Run() differ: after the run, the outputs
    // have the right shapes, which is useful for debugging.
    VLOG(3) << place_ << " " << op->DebugStringEx(local_scope);
    if (FLAGS_benchmark) {
      VLOG(2) << "Memory used after operator " + op->Type() + " running: "
--- a/paddle/fluid/framework/operator.cc
+++ b/paddle/fluid/framework/operator.cc
@ -69,6 +69,19 @@ static DDim GetDims(const Scope& scope, const std::string& name,
  }
 }
 // Returns the number of rows of the SelectedRows variable called `name` in
 // `scope`, or -1 when no such variable is found or it holds another type.
 static int GetRowSize(const Scope& scope, const std::string& name) {
   Variable* var = scope.FindVar(name);
   const bool holds_selected_rows =
       var != nullptr && var->IsType<SelectedRows>();
   if (!holds_selected_rows) {
     return -1;
   }
   return static_cast<int>(var->Get<SelectedRows>().rows().size());
 }
 static LoD GetLoD(const Scope& scope, const std::string& name) {
  Variable* var = scope.FindVar(name);
  auto default_lod = LoD({{}});
@ -85,6 +98,7 @@ static LoD GetLoD(const Scope& scope, const std::string& name) {
 }
 void OperatorBase::Run(const Scope& scope, const platform::Place& place) {
  VLOG(10) << "- " << DebugStringEx(&scope);
  if (platform::is_gpu_place(place)) {
 #ifndef PADDLE_WITH_CUDA
    PADDLE_THROW("Cannot run operator on place %s", place);
@ -94,6 +108,7 @@ void OperatorBase::Run(const Scope& scope, const platform::Place& place) {
 #endif
  }
  RunImpl(scope, place);
  VLOG(10) << "+ " << DebugStringEx(&scope);
 }
 bool OperatorBase::HasInputs(const std::string& name) const {
@ -153,6 +168,10 @@ std::string OperatorBase::DebugStringEx(const Scope* scope) const {
    for (size_t i = 0; i < input.second.size(); ++i) {
      ss << input.second[i];
      if (scope) {
        int row_size = GetRowSize(*scope, input.second[i]);
        if (row_size >= 0) {
          ss << "[row_size=" << row_size << "]";
        }
        ss << "[" << GetDims(*scope, input.second[i], true) << "]";
        ss << "(" << GetLoD(*scope, input.second[i]) << ")";
      }
@ -173,6 +192,10 @@ std::string OperatorBase::DebugStringEx(const Scope* scope) const {
    for (size_t i = 0; i < output.second.size(); ++i) {
      ss << output.second[i];
      if (scope) {
        int row_size = GetRowSize(*scope, output.second[i]);
        if (row_size >= 0) {
          ss << "[row_size=" << row_size << "]";
        }
        ss << "[" << GetDims(*scope, output.second[i], true) << "]";
        ss << "(" << GetLoD(*scope, output.second[i]) << ")";
      }
--- a/paddle/fluid/framework/parallel_executor.cc
+++ b/paddle/fluid/framework/parallel_executor.cc
@ -145,9 +145,9 @@ void ParallelExecutor::BCastParamsToGPUs(
    auto &dims = main_tensor.dims();
    if (paddle::platform::is_gpu_place(main_tensor.place())) {
 #ifdef PADDLE_WITH_CUDA
      std::vector<void *> buffers;
      size_t numel = main_tensor.numel();
      ncclDataType_t data_type = platform::ToNCCLDataType(main_tensor.type());
      platform::NCCLGroupGuard guard;
      for (size_t i = 0; i < member_->places_.size(); ++i) {
        auto place = member_->places_[i];
        void *buffer;
@ -159,11 +159,21 @@ void ParallelExecutor::BCastParamsToGPUs(
          t->Resize(dims);
          buffer = t->mutable_data(place, main_tensor.type());
        }
-        auto &nccl_ctx = member_->nccl_ctxs_->at(place);
+        buffers.push_back(buffer);
        platform::dynload::ncclBcast(buffer, numel, data_type, 0,
                                     nccl_ctx.comm_, nccl_ctx.stream());
      }
-      member_->nccl_ctxs_->WaitAll();
+
      PADDLE_ENFORCE_EQ(member_->places_.size(), buffers.size(),
                        "variables' buffer size to bcast NOT equal to places");
      {
        platform::NCCLGroupGuard guard;
        for (size_t i = 0; i < member_->places_.size(); ++i) {
          auto &nccl_ctx = member_->nccl_ctxs_->at(member_->places_[i]);
          platform::dynload::ncclBcast(buffers[i], numel, data_type, 0,
                                       nccl_ctx.comm_, nccl_ctx.stream());
        }
        member_->nccl_ctxs_->WaitAll();
      }
 #else
      PADDLE_THROW("Not compiled with CUDA");
 #endif
--- a/paddle/fluid/framework/scope.h
+++ b/paddle/fluid/framework/scope.h
@ -81,6 +81,9 @@ class Scope {
  // Rename variable to a new name and return the new name
  std::string Rename(const std::string& origin_name) const;
 protected:
  mutable std::unordered_map<std::string, std::unique_ptr<Variable>> vars_;
 private:
  // Call Scope::NewScope for a sub-scope.
  explicit Scope(Scope const* parent) : parent_(parent) {}
@ -93,8 +96,6 @@ class Scope {
  // Caller doesn't own the returned Variable.
  Variable* FindVarLocally(const std::string& name) const;
  mutable std::unordered_map<std::string, std::unique_ptr<Variable>> vars_;
  // Scope in `kids_` are owned by this class.
  mutable std::list<Scope*> kids_;
  Scope const* parent_{nullptr};
--- a/paddle/fluid/inference/io.cc
+++ b/paddle/fluid/inference/io.cc
@ -20,16 +20,20 @@ limitations under the License. */
 #include "paddle/fluid/framework/block_desc.h"
 #include "paddle/fluid/framework/feed_fetch_type.h"
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/operators/math/blas.h"
 #include "paddle/fluid/pybind/pybind.h"
 DEFINE_string(devices, "", "The devices to be used which is joined by comma.");
 DEFINE_bool(init_p2p, false, "Whether to init p2p.");
 DEFINE_int32(math_num_threads, 1,
             "Number of threads used to run math functions.");
 namespace paddle {
 namespace inference {
 void Init(const std::vector<std::string> argv) {
  framework::InitGflags(argv);
  operators::math::SetNumThreads(FLAGS_math_num_threads);
  // init devices
  std::vector<int> devices;
  std::string token;
--- a/Show More
+++ b/Show More