merge develop

revert-12646-feature/jit/xbyak
nhzlx 7 years ago
commit e8954a36f5

@@ -27,15 +27,6 @@ script:
# 43min timeout
paddle/scripts/paddle_docker_build.sh ${JOB}
if [ $? -eq 0 ] || [ $? -eq 142 ]; then true; else exit 1; fi;
- |
if [[ "$JOB" != "doc" ]]; then exit 0; fi;
# For document only
if [[ "$TRAVIS_PULL_REQUEST" != "false" ]]; then exit 0; fi;
if [[ "$TRAVIS_BRANCH" != "develop" && ! "$TRAVIS_BRANCH" =~ ^v|release/[[:digit:]]+\.[[:digit:]]+(\.[[:digit:]]+)?(-\S*)?$ ]]; then exit 0; fi;
export DEPLOY_DOCS_SH=https://raw.githubusercontent.com/PaddlePaddle/PaddlePaddle.org/master/scripts/deploy/deploy_docs.sh
export DOCS_DIR=`pwd`
cd ..
curl $DEPLOY_DOCS_SH | bash -s $CONTENT_DEC_PASSWD $TRAVIS_BRANCH $DOCS_DIR $DOCS_DIR/build/doc/
notifications:
email:
on_success: change
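
The branch filter in the script above is easy to misread: in `[[ ... =~ ]]` the pattern is an unanchored ERE whose alternation splits at the top level, so it accepts any branch name starting with `v` as well as full `release/X.Y[.Z][-suffix]` names. A minimal sketch re-expressing that check in Python (the function name and sample branches are mine, not part of the CI script):

```python
# Sketch: the docs-deploy branch filter from the Travis script above,
# re-expressed in Python. The alternation binds loosely, so the pattern
# matches branches starting with "v" OR full "release/X.Y[.Z][-suffix]".
import re

pattern = re.compile(r'^v|release/[0-9]+\.[0-9]+(\.[0-9]+)?(-\S*)?$')

def deploys_docs(branch):
    return branch == 'develop' or bool(pattern.search(branch))

for b in ['develop', 'v0.14.0', 'release/0.14', 'feature/foo']:
    print(b, deploys_docs(b))  # feature/foo -> False, the rest -> True
```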

@@ -200,6 +200,14 @@ include(external/snappy) # download snappy
include(external/snappystream)
include(external/threadpool)
if(WITH_GPU)
include(cuda)
include(tensorrt)
include(external/anakin)
else()
set(WITH_ANAKIN OFF CACHE STRING "Anakin is valid only when GPU is set." FORCE)
endif()
include(cudnn) # set cudnn libraries, must before configure
include(cupti)
include(configure) # add paddle env configuration
@@ -228,14 +236,6 @@ set(EXTERNAL_LIBS
${PYTHON_LIBRARIES}
)
if(WITH_GPU)
include(cuda)
include(tensorrt)
include(external/anakin)
else()
set(WITH_ANAKIN OFF CACHE STRING "Anakin is valid only when GPU is set." FORCE)
endif()
if(WITH_AMD_GPU)
find_package(HIP)
include(hip)

@@ -21,6 +21,7 @@ list(APPEND CUDNN_CHECK_LIBRARY_DIRS
${CUDNN_ROOT}/lib64
${CUDNN_ROOT}/lib
${CUDNN_ROOT}/lib/${TARGET_ARCH}-linux-gnu
${CUDNN_ROOT}/local/cuda-${CUDA_VERSION}/targets/${TARGET_ARCH}-linux/lib/
$ENV{CUDNN_ROOT}
$ENV{CUDNN_ROOT}/lib64
$ENV{CUDNN_ROOT}/lib

@@ -264,7 +264,10 @@ function(cc_test TARGET_NAME)
WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR})
if (${cc_test_SERIAL})
set_property(TEST ${TARGET_NAME} PROPERTY RUN_SERIAL 1)
set_property(TEST ${TARGET_NAME} PROPERTY ENVIRONMENT FLAGS_cpu_deterministic=true)
set_property(TEST ${TARGET_NAME} PROPERTY ENVIRONMENT FLAGS_init_allocated_mem=true)
set_property(TEST ${TARGET_NAME} PROPERTY ENVIRONMENT FLAGS_cudnn_deterministic=true)
endif()
endif()
endfunction(cc_test)
@@ -329,7 +332,10 @@ function(nv_test TARGET_NAME)
add_test(${TARGET_NAME} ${TARGET_NAME})
if (nv_test_SERIAL)
set_property(TEST ${TARGET_NAME} PROPERTY RUN_SERIAL 1)
set_property(TEST ${TARGET_NAME} PROPERTY ENVIRONMENT FLAGS_cpu_deterministic=true)
set_property(TEST ${TARGET_NAME} PROPERTY ENVIRONMENT FLAGS_init_allocated_mem=true)
set_property(TEST ${TARGET_NAME} PROPERTY ENVIRONMENT FLAGS_cudnn_deterministic=true)
endif()
endif()
endfunction(nv_test)
@@ -577,7 +583,9 @@ function(py_test TARGET_NAME)
set(multiValueArgs SRCS DEPS ARGS ENVS)
cmake_parse_arguments(py_test "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
add_test(NAME ${TARGET_NAME}
COMMAND env FLAGS_init_allocated_mem=true PYTHONPATH=${PADDLE_BINARY_DIR}/python ${py_test_ENVS}
COMMAND env FLAGS_init_allocated_mem=true FLAGS_cudnn_deterministic=true
FLAGS_cpu_deterministic=true
PYTHONPATH=${PADDLE_BINARY_DIR}/python ${py_test_ENVS}
${PYTHON_EXECUTABLE} -u ${py_test_SRCS} ${py_test_ARGS}
WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR})
endif()
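
The three hunks above inject the same determinism flags into C++, CUDA, and Python tests — via ctest `ENVIRONMENT` properties in the first two cases and the `env` command line in the third. When reproducing a flaky test by hand, the flags can be set the same way; a minimal sketch, assuming these flag names are on fluid's environment-flag whitelist at this commit:

```python
# Sketch: manually setting the determinism flags that cc_test/nv_test/py_test
# now inject. fluid reads whitelisted FLAGS_* variables from the environment
# when the core library initializes, so they must be set before the import.
import os

os.environ["FLAGS_cpu_deterministic"] = "true"    # fix CPU summation order
os.environ["FLAGS_cudnn_deterministic"] = "true"  # deterministic cuDNN algorithms
os.environ["FLAGS_init_allocated_mem"] = "true"   # poison freshly allocated memory

import paddle.fluid as fluid  # noqa: E402  (flags must precede this import)
```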

@@ -1,21 +1,27 @@
# How to use the timeline tool for performance profiling
1. Wrap the main training loop in `with profiler.profiler(...)`. After running, the code generates a profile record file under `/tmp/profile`.
1. Add `profiler.start_profiler(...)` and `profiler.stop_profiler(...)` around the main training loop. After running, the code generates a profile record file under `/tmp/profile`.
**Tip:**
Do not run too many iterations while the timeline is recording, because the number of records in the timeline grows in proportion to the number of iterations.
```python
with profiler.profiler('All', 'total', '/tmp/profile') as prof:
for pass_id in range(pass_num):
for batch_id, data in enumerate(train_reader()):
exe.run(fluid.default_main_program(),
feed=feeder.feed(data),
fetch_list=[])
for pass_id in range(pass_num):
for batch_id, data in enumerate(train_reader()):
if pass_id == 0 and batch_id == 5:
profiler.start_profiler("All")
elif pass_id == 0 and batch_id == 10:
profiler.stop_profiler("total", "/tmp/profile")
exe.run(fluid.default_main_program(),
feed=feeder.feed(data),
fetch_list=[])
...
```
1. Run `python paddle/tools/timeline.py` to process `/tmp/profile`. By default this generates a `/tmp/timeline` file; the path can be changed with command-line arguments, see [timeline.py](https://github.com/PaddlePaddle/Paddle/blob/develop/tools/timeline.py).
```bash
python Paddle/tools/timeline.py --profile_path=/tmp/profile --timeline_path=timeline
```
1. Open Chrome, visit <chrome://tracing/>, and use the `load` button to load the generated `timeline` file.

@@ -1,15 +1,17 @@
# How to use the timeline tool for profiling
1. Add `with profiler.profiler(...)` around the main training loop. After running, the code will generate a profile record file `/tmp/profile`. **Warning**: Do not run too many batches when using the profiler to record timeline information, because the profile record grows with the number of batches.
1. Add `profiler.start_profiler(...)` and `profiler.stop_profiler(...)` around the main training loop. After running, the code will generate a profile record file `/tmp/profile`. **Warning**: Do not run too many batches when using the profiler to record timeline information, because the profile record grows with the number of batches.
```python
with profiler.profiler('All', 'total', '/tmp/profile') as prof:
for pass_id in range(pass_num):
for batch_id, data in enumerate(train_reader()):
exe.run(fluid.default_main_program(),
feed=feeder.feed(data),
fetch_list=[],
use_program_cache=True)
for pass_id in range(pass_num):
for batch_id, data in enumerate(train_reader()):
if pass_id == 0 and batch_id == 5:
profiler.start_profiler("All")
elif pass_id == 0 and batch_id == 10:
profiler.stop_profiler("total", "/tmp/profile")
exe.run(fluid.default_main_program(),
feed=feeder.feed(data),
fetch_list=[])
...
```
@@ -17,6 +19,10 @@
file `/tmp/timeline` by default. You can change the path via a command-line parameter; please take a look at
[timeline.py](https://github.com/PaddlePaddle/Paddle/blob/develop/tools/timeline.py) for details.
```bash
python Paddle/tools/timeline.py --profile_path=/tmp/profile --timeline_path=timeline
```
1. Open Chrome, visit <chrome://tracing/>, and use the `load` button to load the generated `timeline` file.
![chrome tracing](./tracing.jpeg)
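
For reference, a runnable end-to-end version of the snippet above; the one-layer network and random data are placeholders for a real model, and only the profiler calls mirror the documented flow:

```python
# A runnable sketch of the start/stop profiling flow documented above.
# The tiny network and random data are invented for illustration.
import numpy
import paddle.fluid as fluid
import paddle.fluid.profiler as profiler

x = fluid.layers.data(name='x', shape=[1], dtype='float32')
loss = fluid.layers.mean(fluid.layers.fc(input=x, size=1))
fluid.optimizer.SGD(learning_rate=0.01).minimize(loss)

exe = fluid.Executor(fluid.CPUPlace())
exe.run(fluid.default_startup_program())

for batch_id in range(20):
    if batch_id == 5:
        profiler.start_profiler("All")                    # start at batch 5
    elif batch_id == 10:
        profiler.stop_profiler("total", "/tmp/profile")   # dump and stop
    data = numpy.random.random(size=(8, 1)).astype('float32')
    exe.run(fluid.default_main_program(),
            feed={'x': data},
            fetch_list=[loss])
```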

@ -17,6 +17,7 @@
#include "paddle/fluid/framework/details/container_cast.h"
#include "paddle/fluid/framework/details/reduce_and_gather.h"
#include "paddle/fluid/framework/details/variable_visitor.h"
#include "paddle/fluid/platform/profiler.h"
namespace paddle {
namespace framework {
@@ -45,6 +46,7 @@ AllReduceOpHandle::AllReduceOpHandle(ir::Node *node,
#endif
void AllReduceOpHandle::RunImpl() {
platform::RecordEvent r("all_reduce", nullptr);
if (NoDummyInputSize() == 1) {
return; // No need to all reduce when GPU count = 1;
} else {

@@ -21,6 +21,26 @@ namespace framework {
namespace details {
struct BuildStrategy {
// ParallelExecutor supports two modes of ReduceStrategy, kAllReduce and
// kReduce, for both CPU and GPU. With kAllReduce, every thread optimizes
// all of the parameters separately. With kReduce, the optimization of the
// parameters is distributed across the threads.
// For example, if a model has 100 parameters and runs with four threads,
// kAllReduce has every thread optimize all 100 parameters, while kReduce
// has every thread optimize only 25 of them.
// Note in particular that with kReduce on CPU training, all parameters
// are shared between the threads, which saves memory.
// FIXME(zcd): The results of the two modes (kAllReduce and kReduce) may
// not be equal on GPU, because floating-point summation depends on the
// order of the operands: for example, the result of `a+b+c+d` may differ
// from that of `c+a+b+d`. On GPU both kAllReduce and kReduce are
// implemented with NCCL, so their results may not match.
// On CPU, if you want to fix the summation order so that kAllReduce and
// kReduce produce identical results, set `FLAGS_cpu_deterministic=true`
// in the environment.
enum class ReduceStrategy { kAllReduce = 0, kReduce = 1 };
enum class GradientScaleStrategy {
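
A hedged sketch of choosing between the two strategies described in this comment, using the Python `BuildStrategy` API as of this commit; the toy network is a placeholder:

```python
# Sketch: wiring up the reduce strategy described above. Only the
# BuildStrategy usage reflects the subject of this comment.
import paddle.fluid as fluid

x = fluid.layers.data(name='x', shape=[4], dtype='float32')
loss = fluid.layers.mean(fluid.layers.fc(input=x, size=1))
fluid.optimizer.SGD(learning_rate=0.01).minimize(loss)

exe = fluid.Executor(fluid.CPUPlace())
exe.run(fluid.default_startup_program())

build_strategy = fluid.BuildStrategy()
# kReduce spreads parameter optimization across threads; on CPU it also
# shares parameters between threads, which saves memory (see above).
build_strategy.reduce_strategy = fluid.BuildStrategy.ReduceStrategy.Reduce
# To make kAllReduce and kReduce agree on CPU, also export
# FLAGS_cpu_deterministic=true in the environment.

pe = fluid.ParallelExecutor(use_cuda=False, loss_name=loss.name,
                            build_strategy=build_strategy)
```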

@@ -16,12 +16,18 @@
#include "paddle/fluid/framework/details/container_cast.h"
#include "paddle/fluid/framework/details/reduce_and_gather.h"
#include "paddle/fluid/framework/details/variable_visitor.h"
#include "paddle/fluid/platform/profiler.h"
DEFINE_bool(
cpu_deterministic, false,
"Whether to make the result of computation deterministic in CPU side.");
namespace paddle {
namespace framework {
namespace details {
void ReduceOpHandle::RunImpl() {
platform::RecordEvent r("reduce", nullptr);
if (places_.size() == 1) return;
// the input and output may have dummy var.
auto in_var_handles = DynamicCast<VarHandle>(inputs_);
@@ -89,11 +95,33 @@ void ReduceOpHandle::RunImpl() {
} else {
std::vector<const LoDTensor *> lod_tensors =
GetInputValues<LoDTensor>(in_var_handles, var_scopes);
if (paddle::platform::is_cpu_place(lod_tensors[0]->place())) {
this->RunAndRecordEvent([&] {
ReduceLoDTensor func(lod_tensors,
out_var->GetMutable<framework::LoDTensor>());
VisitDataType(ToDataType(lod_tensors[0]->type()), func);
// FIXME(zcd): The order of summing matters, especially when the data
// type is float or double. For example, the result of `a+b+c+d` may
// differ from the result of `c+a+b+d`, so the summing order should be
// fixed.
if (!FLAGS_cpu_deterministic) {
ReduceLoDTensor func(lod_tensors,
out_var->GetMutable<framework::LoDTensor>());
VisitDataType(ToDataType(lod_tensors[0]->type()), func);
} else {
// We sum lod_tensors into reduce_sum_trg, which happens to live in
// local_scopes_[0] here; that does not mean reduce_sum_trg must be in
// local_scopes_[0].
auto &reduce_sum_trg = *this->local_scopes_[0]
->FindVar(kLocalExecScopeName)
->Get<Scope *>()
->FindVar(out_var_handle->name_)
->GetMutable<framework::LoDTensor>();
ReduceLoDTensor func(lod_tensors, &reduce_sum_trg);
VisitDataType(ToDataType(lod_tensors[0]->type()), func);
auto trg = out_var->GetMutable<framework::LoDTensor>();
if (reduce_sum_trg.data<void>() != trg->data<void>()) {
TensorCopy(reduce_sum_trg, platform::CPUPlace(), trg);
}
}
});
} else if (paddle::platform::is_gpu_place(lod_tensors[0]->place())) {
#ifdef PADDLE_WITH_CUDA

@@ -17,6 +17,7 @@
#include <string>
#include <vector>
#include "paddle/fluid/framework/executor.h"
#include "paddle/fluid/platform/profiler.h"
namespace paddle {
namespace framework {
@@ -62,6 +63,7 @@ FeedFetchList ScopeBufferedSSAGraphExecutor::Run(
eptr = std::current_exception();
}
platform::RecordEvent e("ScopeBufferedSSAGraphExecutorAfterRun", nullptr);
drop_scope_counter_ += 1;
if (!fetch_tensors.empty() ||
drop_scope_counter_ == strategy_.num_iteration_per_drop_scope_) {

@@ -15,6 +15,7 @@
#include "paddle/fluid/framework/details/threaded_ssa_graph_executor.h"
#include "paddle/fluid/framework/details/ssa_graph_builder.h"
#include "paddle/fluid/platform/profiler.h"
namespace paddle {
namespace framework {
@@ -34,6 +35,8 @@ ThreadedSSAGraphExecutor::ThreadedSSAGraphExecutor(
FeedFetchList ThreadedSSAGraphExecutor::Run(
const std::vector<std::string> &fetch_tensors) {
std::unique_ptr<platform::RecordEvent> event(
new platform::RecordEvent("ThreadedSSAGraphExecutorPrepare", nullptr));
std::unordered_map<OpHandleBase *, size_t> pending_ops;
std::unordered_set<VarHandleBase *> pending_vars;
BlockingQueue<VarHandleBase *> ready_vars;
@@ -84,6 +87,7 @@ FeedFetchList ThreadedSSAGraphExecutor::Run(
// Clean run context
run_op_futures_.clear();
exception_holder_.Clear();
event.reset(nullptr);
// Step 3. Execution
while (!pending_vars.empty()) {

@@ -18,6 +18,7 @@ limitations under the License. */
#include "paddle/fluid/framework/data_transform.h"
#include "paddle/fluid/framework/executor.h"
#include "paddle/fluid/framework/lod_tensor.h"
#include "paddle/fluid/framework/operator.h"
#include "paddle/fluid/framework/shape_inference.h"
#include "paddle/fluid/framework/var_type.h"
@@ -57,7 +58,11 @@ static DDim GetDims(const Scope& scope, const std::string& name,
}
if (var->IsType<LoDTensor>()) {
return var->Get<LoDTensor>().dims();
const LoDTensor& tensor = var->Get<LoDTensor>();
if (UNLIKELY(!tensor.IsInitialized())) {
return DDim({-1});
}
return tensor.dims();
} else if (var->IsType<SelectedRows>()) {
if (get_actual_dim) {
return var->Get<SelectedRows>().value().dims();
@@ -74,8 +79,13 @@ static std::string GetDtype(const Scope& scope, const std::string& name) {
if (var == nullptr) {
return "";
}
if (var->IsType<LoDTensor>()) {
return DataTypeToString(ToDataType(var->Get<LoDTensor>().type()));
const LoDTensor& tensor = var->Get<LoDTensor>();
if (UNLIKELY(!tensor.IsInitialized())) {
return "";
}
return DataTypeToString(ToDataType(tensor.type()));
} else if (var->IsType<SelectedRows>()) {
return DataTypeToString(
ToDataType(var->Get<SelectedRows>().value().type()));
@@ -106,7 +116,11 @@ static LoD GetLoD(const Scope& scope, const std::string& name) {
}
if (var->IsType<LoDTensor>()) {
return var->Get<LoDTensor>().lod();
const LoDTensor& tensor = var->Get<LoDTensor>();
if (UNLIKELY(!tensor.IsInitialized())) {
return default_lod;
}
return tensor.lod();
} else {
return default_lod;
}
@@ -122,6 +136,8 @@ void OperatorBase::Run(const Scope& scope, const platform::Place& place) {
platform::SetDeviceId(dev_id);
#endif
}
platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance();
platform::RecordEvent record_event(Type(), pool.Get(place));
RunImpl(scope, place);
VLOG(10) << "+ " << DebugStringEx(&scope);
}
@@ -625,9 +641,6 @@ void OperatorWithKernel::RunImpl(const Scope& scope,
platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance();
auto* dev_ctx = pool.Get(place);
// For profiling, don't move out of this function because that will result
// in the failure of multi-GPU profiling.
platform::RecordEvent record_event(Type(), dev_ctx);
// check if op[type] has kernel registered.
auto& all_op_kernels = AllOpKernels();
auto kernels_iter = all_op_kernels.find(type_);

@@ -82,7 +82,7 @@ class Tensor {
template <typename T>
const T* data() const;
bool IsInitialized() const;
inline bool IsInitialized() const;
/**
* @brief Return a pointer to mutable memory block.

@@ -74,9 +74,10 @@ if (WITH_ANAKIN) # only needed in CI
target_link_libraries(inference_anakin_api anakin anakin_saber_common)
target_link_libraries(inference_anakin_api_shared anakin anakin_saber_common)
if (WITH_TESTING)
cc_test(inference_anakin_test SRCS api_anakin_engine_tester.cc
ARGS --model=${ANAKIN_INSTALL_DIR}/mobilenet_v2.anakin.bin
DEPS inference_anakin_api_shared)
target_compile_options(inference_anakin_test BEFORE PUBLIC ${ANAKIN_COMPILE_EXTRA_FLAGS})
# This test is unstable; disable it for now.
#cc_test(inference_anakin_test SRCS api_anakin_engine_tester.cc
#ARGS --model=${ANAKIN_INSTALL_DIR}/mobilenet_v2.anakin.bin
#DEPS inference_anakin_api_shared)
#target_compile_options(inference_anakin_test BEFORE PUBLIC ${ANAKIN_COMPILE_EXTRA_FLAGS})
endif(WITH_TESTING)
endif()

@@ -31,7 +31,6 @@ class FeedOp : public framework::OperatorBase {
const platform::Place &place) const override {
// get device context from pool
auto *dev_ctx = platform::DeviceContextPool::Instance().Get(place);
platform::RecordEvent record_event(Type(), dev_ctx);
auto feed_var_name = Input("X");
auto *feed_var = scope.FindVar(feed_var_name);

@@ -36,12 +36,6 @@ class FetchBarrierOp : public framework::OperatorBase {
void RunImpl(const framework::Scope& scope,
const platform::Place& place) const override {
std::vector<std::string> eps = Attr<std::vector<std::string>>("endpoints");
platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance();
auto& ctx = *pool.Get(place);
// For profiling
platform::RecordEvent record_event(Type(), &ctx);
distributed::RPCClient* rpc_client =
distributed::RPCClient::GetInstance<RPCCLIENT_T>();

@@ -30,9 +30,6 @@ class FetchOp : public framework::OperatorBase {
private:
void RunImpl(const framework::Scope &scope,
const platform::Place &place) const override {
platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance();
platform::RecordEvent record_event(Type(), pool.Get(place));
auto fetch_var_name = Input("X");
auto *fetch_var = scope.FindVar(fetch_var_name);
PADDLE_ENFORCE(fetch_var != nullptr,

@@ -31,9 +31,6 @@ class LoadOp : public framework::OperatorBase {
private:
void RunImpl(const framework::Scope &scope,
const platform::Place &place) const override {
auto *dev_ctx = platform::DeviceContextPool::Instance().Get(place);
platform::RecordEvent record_event(Type(), dev_ctx);
// FIXME(yuyang18): We save variable to local file now, but we should change
// it to save an output stream.
auto filename = Attr<std::string>("file_path");

@@ -32,11 +32,16 @@ class LookupTableOp : public framework::OperatorWithKernel {
auto table_dims = ctx->GetInputDim("W");
auto ids_dims = ctx->GetInputDim("Ids");
int ids_rank = ids_dims.size();
PADDLE_ENFORCE_EQ(ids_dims.size(), 2);
PADDLE_ENFORCE_EQ(ids_dims[1], 1);
PADDLE_ENFORCE_EQ(table_dims.size(), 2);
PADDLE_ENFORCE_EQ(ids_dims[ids_rank - 1], 1,
"The last dimension of the 'Ids' tensor must be 1.");
ctx->SetOutputDim("Out", {ids_dims[0], table_dims[1]});
auto output_dims =
framework::vectorize(framework::slice_ddim(ids_dims, 0, ids_rank - 1));
output_dims.push_back(table_dims[1]);
ctx->SetOutputDim("Out", framework::make_ddim(output_dims));
if (ctx->GetOutputsVarType("Out")[0] ==
framework::proto::VarType::LOD_TENSOR) {
@@ -61,8 +66,7 @@ class LookupTableOpMaker : public framework::OpProtoAndCheckerMaker {
AddInput("Ids",
"An input with type int32 or int64 "
"contains the ids to be looked up in W. "
"Ids must be a column vector with rank = 2. "
"The 2nd dimension size must be 1.");
"The last dimension size must be 1.");
AddOutput("Out", "The lookup results, which have the same type as W.");
AddAttr<bool>("is_sparse",
"(boolean, default false) "

@@ -118,28 +118,31 @@ class LookupTableGradCUDAKernel : public framework::OpKernel<T> {
auto *d_table = context.Output<SelectedRows>(framework::GradVarName("W"));
auto *ids_data = ids->data<int64_t>();
auto ids_dim = ids->dims();
int64_t ids_num = ids->numel();
auto stream = dev_ctx.stream();
// copy GPU memory to CPU pinned memory
framework::Vector<int64_t> new_rows;
new_rows.resize(ids_dim[0]);
new_rows.resize(ids_num);
auto gpu_place = boost::get<platform::CUDAPlace>(context.GetPlace());
// TODO(yuyang18): Strange code here.
memory::Copy(platform::CPUPlace(),
new_rows.CUDAMutableData(context.GetPlace()), gpu_place,
ids_data, ids_dim[0] * sizeof(int64_t), stream);
ids_data, ids_num * sizeof(int64_t), stream);
d_table->set_rows(new_rows);
auto *d_table_value = d_table->mutable_value();
d_table_value->Resize({ids_dim[0], table->dims()[1]});
d_table_value->Resize({ids_num, table->dims()[1]});
d_table_value->mutable_data<T>(context.GetPlace());
auto *d_table_data = d_table_value->data<T>();
auto *d_output_data = d_output->data<T>();
PADDLE_ENFORCE_EQ(d_table_value->dims(), d_output->dims());
auto d_output_dims = d_output->dims();
PADDLE_ENFORCE_EQ(
d_table_value->dims(),
framework::flatten_to_2d(d_output_dims, d_output_dims.size() - 1));
memory::Copy(gpu_place, d_table_data, gpu_place, d_output_data,
d_output->numel() * sizeof(T), stream);
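
The reshaping done by the gradient kernels above and below can be stated compactly in NumPy; a sketch with invented shapes, not Paddle code:

```python
# NumPy restatement of the SelectedRows gradient layout the kernels build
# now that Ids may have rank > 2; shapes are placeholders.
import numpy as np

ids = np.random.randint(0, 10000, size=(4, 16, 1))  # [batch, seq_len, 1]
d_out = np.random.rand(4, 16, 32)                   # upstream gradient

rows = ids.reshape(-1)                # ids->numel() row indices: shape (64,)
value = d_out.reshape(rows.size, -1)  # flatten_to_2d -> (64, 32)
assert value.shape == (rows.size, 32)
```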

@@ -109,17 +109,17 @@ class LookupTableGradKernel : public framework::OpKernel<T> {
auto *d_table = context.Output<SelectedRows>(framework::GradVarName("W"));
auto *ids_data = ids->data<int64_t>();
auto ids_dim = ids->dims();
int64_t ids_num = ids->numel();
framework::Vector<int64_t> new_rows;
new_rows.reserve(ids_dim[0]);
for (int64_t i = 0; i < ids_dim[0]; i++) {
new_rows.reserve(ids_num);
for (int64_t i = 0; i < ids_num; i++) {
new_rows.push_back(ids_data[i]);
}
d_table->set_rows(new_rows);
auto *d_table_value = d_table->mutable_value();
d_table_value->Resize({ids_dim[0], table_dim[1]});
d_table_value->Resize({ids_num, table_dim[1]});
d_table_value->mutable_data<T>(context.GetPlace());
d_table->set_height(table_dim[0]);
@@ -127,7 +127,10 @@ class LookupTableGradKernel : public framework::OpKernel<T> {
auto *d_output_data = d_output->data<T>();
auto *d_table_data = d_table_value->data<T>();
PADDLE_ENFORCE_EQ(d_table_value->dims(), d_output->dims());
auto d_output_dims = d_output->dims();
PADDLE_ENFORCE_EQ(
d_table_value->dims(),
framework::flatten_to_2d(d_output_dims, d_output_dims.size() - 1));
memcpy(d_table_data, d_output_data, sizeof(T) * d_output->numel());
} else {
auto *ids = context.Input<LoDTensor>("Ids");
@@ -135,10 +138,9 @@ class LookupTableGradKernel : public framework::OpKernel<T> {
auto *d_table = context.Output<LoDTensor>(framework::GradVarName("W"));
auto *ids_data = ids->data<int64_t>();
auto ids_dim = ids->dims();
int N = table_dim[0];
int D = d_output->dims()[1];
int D = table_dim[1];
auto *d_output_data = d_output->data<T>();
auto *d_table_data = d_table->mutable_data<T>(context.GetPlace());

@@ -18,7 +18,6 @@ limitations under the License. */
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/framework/threadpool.h"
#include "paddle/fluid/operators/detail/safe_ref.h"
#include "paddle/fluid/platform/profiler.h"
namespace paddle {
namespace operators {
@@ -166,8 +165,6 @@ class ParallelDoOp : public framework::OperatorBase {
workers.emplace_back(
framework::Async([program, cur_scope, place, block, place_idx] {
// Give the thread an id to distinguish parallel block with same id.
platform::RecordThread rt(static_cast<int>(place_idx) + 1);
framework::Executor executor(place);
executor.Run(*program, cur_scope, block->ID(),
false /*create_local_scope*/);
@@ -244,8 +241,6 @@ class ParallelDoGradOp : public framework::OperatorBase {
// execute
workers.emplace_back(
framework::Async([program, cur_scope, place, block, i] {
// Give the thread an id to distinguish parallel block with same id.
platform::RecordThread rt(static_cast<int>(i) + 1);
framework::Executor executor(place);
executor.Run(*program, cur_scope, block->ID(),
false /*create_local_scope*/);

@@ -15,6 +15,7 @@
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/framework/reader.h"
#include "paddle/fluid/operators/detail/safe_ref.h"
#include "paddle/fluid/platform/profiler.h"
namespace paddle {
namespace operators {
@@ -65,6 +66,12 @@ class ReadOp : public framework::OperatorBase {
.GetMutable<framework::ReaderHolder>();
std::vector<std::string> out_arg_names = Outputs("Out");
std::vector<framework::LoDTensor> ins;
// For profiling
platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance();
auto& ctx = *pool.Get(dev_place);
platform::RecordEvent record_event(Type(), &ctx);
reader->ReadNext(&ins);
if (ins.empty()) {
if (Attr<bool>("throw_eof_exp")) {

@@ -40,8 +40,6 @@ class RecvOp : public framework::OperatorBase {
platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance();
auto& ctx = *pool.Get(place);
// For profiling
platform::RecordEvent record_event(Type(), &ctx);
distributed::RPCClient* rpc_client =
distributed::RPCClient::GetInstance<RPCCLIENT_T>();

@@ -39,11 +39,6 @@ class SendBarrierOp : public framework::OperatorBase {
std::vector<std::string> eps = Attr<std::vector<std::string>>("endpoints");
bool sync_mode = Attr<bool>("sync_mode");
platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance();
auto& ctx = *pool.Get(place);
// For profiling
platform::RecordEvent record_event(Type(), &ctx);
distributed::RPCClient* rpc_client =
distributed::RPCClient::GetInstance<RPCCLIENT_T>();

Some files were not shown because too many files have changed in this diff.