Merge branch 'develop' into core_inference_example

8 years ago · 42a0603e6f
parent cd7d0f850b e57a40b8a6
commit 42a0603e6f
18 changed files with 955 additions and 67 deletions
--- a/doc/design/backward.md
+++ b/doc/design/backward.md
@ -106,9 +106,11 @@ See function `_addup_repetitive_outputs_` in `backward.py` for implementation de

 In our framework, variables can be marked as *no_gradient*, it means that the gradient of this variable is unnecessary and can be considered as zero in model training. Apparently, when all the outputs of some `grad_op` are marked as *no_gradient*, the `grad_op` itself can be skipped in backward pass. 

-But these unnecessary gradients still need to be creating and initialized by something, otherwise following `grad_op`s who take these gradients as inputs take the risk of using uninitialized memory. In our code, we employ `fill_zeros_like_op` to initialize them as all zeros. 
+Another situation is when all the gradient inputs of some `grad_op` are marked as *no_gradient*, which means all of them can be considered zero. Since `grad_op`s are in essence the propagation of gradients, all the outputs are definitely zero when all the gradient inputs are zero. Therefore the `grad_op` can also be skipped.

-This features are implemented in function `_remove_no_grad_branch_`. It checks new created `grad_op`s one-by-one, removes whose outputs are all in `no_grad_set` or inserts `fill_zeros_like_op` when its necessary. We can get the `no_grad_set` from the `_append_backward_ops_` argument `no_grad_dict` or generate it on the fly by scanning all variables' `no_gradient` attribute(True or False). 
+It should be noted that all these zero gradients still need to be created and initialized by something; otherwise, subsequent `grad_op`s that take these gradients as inputs risk using uninitialized memory. In our code, we employ `fill_zeros_like_op` to initialize them to all zeros.
+
+These features are implemented in the function `_remove_no_grad_branch_`. It checks newly created `grad_op`s one by one, removes those that can be skipped, and inserts `fill_zeros_like_op` when it is necessary. We can get the `no_grad_set` from the `_append_backward_ops_` argument `no_grad_dict` or generate it on the fly by scanning all variables' `no_gradient` attribute (True or False).

 ### Creating Backward Variables

--- a/doc/design/images/profiler.png
+++ b/doc/design/images/profiler.png
--- a/doc/design/profiler.md
+++ b/doc/design/profiler.md
--- a/paddle/framework/CMakeLists.txt
+++ b/paddle/framework/CMakeLists.txt
@ -29,7 +29,7 @@ cc_test(variable_test SRCS variable_test.cc)
 cc_library(scope SRCS scope.cc DEPS glog)
 cc_test(scope_test SRCS scope_test.cc DEPS scope)

-cc_library(data_transform SRCS data_transform.cc DEPS tensor framework_proto)
+cc_library(data_transform SRCS data_transform.cc DEPS math_function tensor framework_proto)
 cc_test(data_transform_test SRCS data_transform_test.cc DEPS data_transform device_context)

 cc_library(attribute SRCS attribute.cc DEPS framework_proto)
--- a/paddle/framework/data_transform.cc
+++ b/paddle/framework/data_transform.cc
@ -14,6 +14,7 @@ limitations under the License. */

 #include "paddle/framework/data_transform.h"
 #include "paddle/framework/lod_tensor.h"
+#include "paddle/platform/device_context.h"

 namespace paddle {
 namespace framework {
@ -23,5 +24,92 @@ DataTransformFnMap& DataTransformFnMap::Instance() {
  return data_transform_map;
 }

+auto KernelFP32 = OpKernelType(proto::DataType::FP32, platform::CPUPlace(),
+                               DataLayout::kNHWC, LibraryType::kPlain);  // source key of the data-type transform registered below
+
+auto KernelFP64 = OpKernelType(proto::DataType::FP64, platform::CPUPlace(),
+                               DataLayout::kNHWC, LibraryType::kPlain);  // destination key of the data-type transform
+
+auto KernelNHWC = OpKernelType(proto::DataType::FP64, platform::CPUPlace(),
+                               DataLayout::kNHWC, LibraryType::kPlain);  // same field values as KernelFP64; source key of the layout transform
+
+auto KernelNCHW = OpKernelType(proto::DataType::FP64, platform::CPUPlace(),
+                               DataLayout::kNCHW, LibraryType::kPlain);  // differs from KernelNHWC only in layout
+
+// Converts the element type of the Tensor held by `in` from
+// kernel_pair.first.data_type_ to kernel_pair.second.data_type_ and writes the
+// result into the Tensor held by `out`. Both kernel types must be on the same
+// class of place; an unsupported source type raises via PADDLE_THROW.
+void TransDataType(const platform::DeviceContext* ctx,
+                   const KernelTypePair& kernel_pair, const Variable& in,
+                   Variable* out) {
+  PADDLE_ENFORCE(in.IsType<Tensor>(), "Only Support Tensor transform!.");
+  PADDLE_ENFORCE(
+      platform::places_are_same_class(kernel_pair.first.place_,
+                                      kernel_pair.second.place_),
+      "TransDataType Only Support DataType transform on same place!");
+
+  auto input = in.Get<Tensor>();
+  auto* output = out->GetMutable<Tensor>();
+
+  // The converted tensor keeps the shape of the input.
+  output->Resize(input.dims());
+
+  auto to_type = kernel_pair.second.data_type_;
+  auto from_type = kernel_pair.first.data_type_;
+
+  // Dispatch on the source type; VisitDataType then dispatches on the
+  // destination type and invokes the matching CastDataType instantiation.
+  if (from_type == proto::DataType::FP32) {
+    framework::VisitDataType(to_type, CastDataType<float>(input, output, ctx));
+  } else if (from_type == proto::DataType::FP64) {
+    framework::VisitDataType(to_type,
+                             CastDataType<double>(input, output, ctx));
+  } else if (from_type == proto::DataType::INT32) {
+    framework::VisitDataType(to_type, CastDataType<int>(input, output, ctx));
+  } else if (from_type == proto::DataType::INT64) {
+    framework::VisitDataType(to_type,
+                             CastDataType<int64_t>(input, output, ctx));
+  } else if (from_type == proto::DataType::BOOL) {
+    framework::VisitDataType(to_type, CastDataType<bool>(input, output, ctx));
+  } else {
+    PADDLE_THROW("Not support type %d", from_type);
+  }
+}
+
+// Re-arranges the 4-D Tensor held by `in` from the layout of
+// kernel_pair.first to the layout of kernel_pair.second (NHWC <-> NCHW) and
+// writes the result into the Tensor held by `out`. Both kernel types must be
+// on the same class of place; any other layout pair raises via PADDLE_THROW.
+void TransDataLayout(const platform::DeviceContext* ctx,
+                     const KernelTypePair& kernel_pair, const Variable& in,
+                     Variable* out) {
+  PADDLE_ENFORCE(in.IsType<Tensor>(), "Only Support Tensor transform!.");
+  PADDLE_ENFORCE(
+      platform::places_are_same_class(kernel_pair.first.place_,
+                                      kernel_pair.second.place_),
+      "TransDataLayout only supports DataLayout transform on same place!");
+
+  auto src = in.Get<Tensor>();
+  auto* dst = out->GetMutable<Tensor>();
+  PADDLE_ENFORCE(arity(src.dims()) == 4, "Input Arity Only Support 4!");
+
+  // axis[i] names the source dimension that becomes destination dimension i.
+  // The permutation depends on the direction: the previously hard-coded
+  // {0, 2, 3, 1} is only correct for NCHW -> NHWC.
+  const auto src_layout = kernel_pair.first.data_layout_;
+  const auto dst_layout = kernel_pair.second.data_layout_;
+  std::vector<int> axis;
+  if (src_layout == DataLayout::kNHWC && dst_layout == DataLayout::kNCHW) {
+    axis = {0, 3, 1, 2};
+  } else if (src_layout == DataLayout::kNCHW &&
+             dst_layout == DataLayout::kNHWC) {
+    axis = {0, 2, 3, 1};
+  } else {
+    PADDLE_THROW("TransDataLayout only supports NHWC <-> NCHW transform!");
+  }
+
+  auto src_dim = src.dims();
+  dst->Resize(src_dim);
+  auto place = kernel_pair.second.place_;
+  // NOTE(review): presumably this copy is what gives dst a buffer of the right
+  // type and size before the transpose overwrites it -- confirm.
+  CopyFrom(src, place, *ctx, dst);
+
+  std::vector<int64_t> dst_dim(axis.size());
+  for (size_t i = 0; i < axis.size(); i++) {
+    dst_dim[i] = src_dim[axis[i]];
+  }
+
+  dst->Resize(make_ddim(dst_dim));
+
+  auto src_type = kernel_pair.first.data_type_;
+  framework::VisitDataType(src_type, CastDataLayout(src, dst, ctx, axis));
+
+  dst->set_layout(dst_layout);
+}
+
 }  // namespace framework
 }  // namespace paddle
+
+namespace f = paddle::framework;  // short alias used only by the registrations below
+REGISTER_DATA_TRANSFORM_FN(f::KernelFP32, f::KernelFP64, f::TransDataType);  // FP32 -> FP64 on CPU
+REGISTER_DATA_TRANSFORM_FN(f::KernelNHWC, f::KernelNCHW, f::TransDataLayout);  // NHWC -> NCHW on CPU
--- a/paddle/framework/data_transform.h
+++ b/paddle/framework/data_transform.h
@ -21,16 +21,20 @@ limitations under the License. */
 #include "paddle/framework/op_kernel_type.h"
 #include "paddle/framework/tensor.h"
 #include "paddle/framework/variable.h"
+#include "paddle/operators/math/math_function.h"
 #include "paddle/platform/device_context.h"
 #include "paddle/platform/macros.h"
+#include "paddle/platform/transform.h"

 namespace paddle {
 namespace framework {

-using DataTransformFn = std::function<void(const platform::DeviceContext* ctx,
-                                           const Variable& in, Variable* out)>;
 using KernelTypePair = std::pair<OpKernelType, OpKernelType>;

+using DataTransformFn =
+    std::function<void(const platform::DeviceContext*, const KernelTypePair&,
+                       const Variable&, Variable*)>;
+
 struct KernelTypePairHash {
  static void HashCombine(const OpKernelType& t, std::size_t* seed) {
    OpKernelType::Hash kernel_type_hasher;
@ -45,6 +49,65 @@ struct KernelTypePairHash {
  }
 };

+// Element-wise functor that converts one InType value to OutType with
+// static_cast.
+template <typename InType, typename OutType>
+struct CastDataTypeFunctor {
+  HOSTDEVICE inline OutType operator()(InType value) const {
+    const OutType converted = static_cast<OutType>(value);
+    return converted;
+  }
+};
+
+// Visitor that casts every element of `in` (of type InType) to OutType and
+// writes it to `out`. The OutType is supplied by the caller of the templated
+// operator(). Only CPU places are supported.
+template <typename InType>
+struct CastDataType {
+  CastDataType(const framework::Tensor& in, framework::Tensor* out,
+               const platform::DeviceContext* ctx)
+      : in_(in), out_(out), ctx_(ctx) {}
+  const framework::Tensor in_;
+  framework::Tensor* out_;
+  const platform::DeviceContext* ctx_;
+
+  template <typename OutType>
+  void operator()() {
+    auto place = ctx_->GetPlace();
+
+    auto* first = in_.data<InType>();
+    auto* last = first + in_.numel();
+    auto* dest = out_->mutable_data<OutType>(place);
+
+    if (!platform::is_cpu_place(place)) {
+      // TODO(dzhwinter): enhance CopyFrom CPU<->GPU with different data type?
+      PADDLE_THROW("Unsupport CPU <-> GPU!");
+    } else {
+      auto* cpu_ctx = static_cast<const platform::CPUDeviceContext*>(ctx_);
+      platform::Transform<platform::CPUDeviceContext> transform;
+      transform(*cpu_ctx, first, last, dest,
+                CastDataTypeFunctor<InType, OutType>());
+    }
+  }
+};
+
+// Visitor that transposes the rank-4 tensor `in` into `out` using the
+// permutation `axis`. The element type T is supplied by the caller of the
+// templated operator(). Only CPU places are supported.
+struct CastDataLayout {
+  CastDataLayout(const framework::Tensor& in, framework::Tensor* out,
+                 const platform::DeviceContext* ctx,
+                 const std::vector<int>& axis)
+      : in_(in), out_(out), ctx_(ctx), axis_(axis) {}
+  const framework::Tensor in_;
+  framework::Tensor* out_;
+  const platform::DeviceContext* ctx_;
+  const std::vector<int> axis_;
+
+  template <typename T>
+  void operator()() {
+    auto place = ctx_->GetPlace();
+    if (!platform::is_cpu_place(place)) {
+      PADDLE_THROW("Unsupport CPU <-> GPU!");
+    } else {
+      auto* cpu_ctx = static_cast<const platform::CPUDeviceContext*>(ctx_);
+      operators::math::Transpose<platform::CPUDeviceContext, T, 4> transpose;
+      transpose(*cpu_ctx, in_, out_, axis_);
+    }
+  }
+};
+
 using DataTransformMap =
    std::unordered_map<KernelTypePair, DataTransformFn, KernelTypePairHash>;

--- a/paddle/framework/data_transform_test.cc
+++ b/paddle/framework/data_transform_test.cc
@ -17,6 +17,7 @@ limitations under the License. */
 #include <gtest/gtest.h>

 #include "paddle/framework/data_transform.h"
+#include "paddle/platform/device_context.h"

 namespace paddle {
 namespace framework {
@ -36,11 +37,13 @@ std::array<proto::DataType, 2> kDataType = {

 std::array<Place, 2> kPlace = {{CPUPlace(), CUDAPlace(0)}};

-std::array<DataLayout, 2> kDataLayout = {
-    {DataLayout::kNHWC, DataLayout::kNCHW}};
+std::array<DataLayout, 2> kDataLayout = {{
+    DataLayout::kNHWC, DataLayout::kNCHW,
+}};

-std::array<LibraryType, 2> kLibraryType = {
-    {LibraryType::kPlain, LibraryType::kMKLDNN}};
+std::array<LibraryType, 2> kLibraryType = {{
+    LibraryType::kPlain, LibraryType::kMKLDNN,
+}};

 OpKernelType GenFromBit(const std::vector<bool> bits) {
  return OpKernelType(kDataType[bits[0]], kPlace[bits[1]], kDataLayout[bits[2]],
@ -54,17 +57,20 @@ auto kernel1 = GenFromBit({0, 0, 0, 1});
 auto kernel2 = GenFromBit({0, 0, 1, 0});
 auto kernel3 = GenFromBit({0, 0, 1, 1});

-void TransDataType_t(const platform::DeviceContext* ctx, const Variable& in,
+void TransDataType_t(const platform::DeviceContext* ctx,
+                     const KernelTypePair& p, const Variable& in,
                     Variable* out) {
  test_value++;
 }

-void TransDataLayout_t(const platform::DeviceContext* ctx, const Variable& in,
+void TransDataLayout_t(const platform::DeviceContext* ctx,
+                       const KernelTypePair& p, const Variable& in,
                       Variable* out) {
  test_value--;
 }

-void TransLibraryType_t(const platform::DeviceContext* ctx, const Variable& in,
+void TransLibraryType_t(const platform::DeviceContext* ctx,
+                        const KernelTypePair& p, const Variable& in,
                        Variable* out) {
  test_value += 2;
 }
@ -83,17 +89,68 @@ TEST(DataTransform, Register) {
  using namespace paddle::platform;

  auto& instance = DataTransformFnMap::Instance();
-  ASSERT_EQ(instance.Map().size(), 3UL);
-  DeviceContext* ctx = nullptr;
  paddle::framework::Variable in;
  paddle::framework::Variable out;

-  instance.Get(std::make_pair(frw::kernel0, frw::kernel1))(ctx, in, &out);
+  DeviceContext* ctx = new CPUDeviceContext();
+  auto pair0 = std::make_pair(frw::kernel0, frw::kernel1);
+  instance.Get(pair0)(ctx, pair0, in, &out);
  ASSERT_EQ(test_value, 1);

-  instance.Get(std::make_pair(frw::kernel1, frw::kernel2))(ctx, in, &out);
+  auto pair1 = std::make_pair(frw::kernel1, frw::kernel2);
+  instance.Get(pair1)(ctx, pair1, in, &out);
  ASSERT_EQ(test_value, 0);

-  instance.Get(std::make_pair(frw::kernel0, frw::kernel2))(ctx, in, &out);
+  auto pair3 = std::make_pair(frw::kernel0, frw::kernel2);
+  instance.Get(pair3)(ctx, pair3, in, &out);
  ASSERT_EQ(test_value, 2);
 }
+
+// Runs the registered layout transform on a small 4-D double tensor and checks
+// that the output reports a different layout with the same element count.
+TEST(DataTransform, Layout) {
+  using namespace paddle::framework;
+  using namespace paddle::platform;
+
+  auto& instance = DataTransformFnMap::Instance();
+  Variable in;
+  Variable out;
+  Tensor* src = in.GetMutable<Tensor>();
+  double* src_data =
+      src->mutable_data<double>(make_ddim({2, 3, 1, 2}), CPUPlace());
+  // Give the input defined contents so the transform never reads
+  // uninitialised memory.
+  for (int i = 0; i < 2 * 3 * 1 * 2; ++i) {
+    src_data[i] = i;
+  }
+  src->set_layout(DataLayout::kNHWC);
+
+  // Stack-allocated so the context is released when the test ends.
+  CPUDeviceContext cpu_ctx;
+  DeviceContext* ctx = &cpu_ctx;
+
+  {
+    auto kernel1 = GenFromBit({1, 0, 0, 0});
+    auto kernel2 = GenFromBit({1, 0, 1, 0});
+    auto pair0 = std::make_pair(kernel1, kernel2);
+    instance.Get(pair0)(ctx, pair0, in, &out);
+  }
+
+  Tensor dst = out.Get<Tensor>();
+  EXPECT_TRUE(dst.layout() != src->layout());
+  EXPECT_EQ(dst.numel(), src->numel());
+}
+
+// Runs the registered data-type transform on a 2x3 float tensor and checks
+// that every element arrives unchanged in the double output.
+TEST(DataTransform, DataType) {
+  using namespace paddle::framework;
+  using namespace paddle::platform;
+
+  auto& instance = DataTransformFnMap::Instance();
+  // Stack-allocated so the context is released when the test ends.
+  CPUDeviceContext cpu_ctx;
+  DeviceContext* ctx = &cpu_ctx;
+
+  Variable in;
+  Variable out;
+  Tensor* src = in.GetMutable<Tensor>();
+  float* ptr = src->mutable_data<float>(make_ddim({2, 3}), CPUPlace());
+  for (int i = 0; i < 6; ++i) {
+    // Floating-point division, so the six inputs are distinct values.
+    ptr[i] = i / 3.0f;
+  }
+
+  {
+    auto kernel1 = GenFromBit({0, 0, 0, 0});
+    auto kernel2 = GenFromBit({1, 0, 0, 0});
+    auto pair0 = std::make_pair(kernel1, kernel2);
+    instance.Get(pair0)(ctx, pair0, in, &out);
+  }
+  Tensor dst = out.Get<Tensor>();
+  const double* dst_ptr = dst.data<double>();
+  ASSERT_TRUE(dst_ptr != nullptr);
+  for (int i = 0; i < 6; ++i) {
+    // float -> double is exact, so the values can be compared directly.
+    EXPECT_EQ(dst_ptr[i], static_cast<double>(ptr[i]));
+  }
+}
--- a/paddle/framework/operator.cc
+++ b/paddle/framework/operator.cc
@ -461,7 +461,7 @@ void OperatorWithKernel::Run(const Scope& scope,
        dev_ctx->Wait();

        for (auto var_name : need_trans) {
-          (*trans_fun)(trans_dev_ctx, *(scope.FindVar(var_name)),
+          (*trans_fun)(trans_dev_ctx, kernel_pair, *(scope.FindVar(var_name)),
                       scope.FindVar(var_name + framework::KernelTypeToString(
                                                    expected_kernel_key)));
        }
--- a/paddle/operators/CMakeLists.txt
+++ b/paddle/operators/CMakeLists.txt
@ -186,36 +186,6 @@ endfunction()
 add_subdirectory(math)
 add_subdirectory(nccl)

-set(DEPS_OPS
-    cond_op
-    cross_entropy_op
-    recurrent_op
-    softmax_with_cross_entropy_op
-    softmax_op
-    sequence_softmax_op
-    sum_op
-    pool_op
-    maxout_op
-    unpool_op
-    pool_with_index_op
-    conv_op
-    conv_transpose_op
-    nccl_op
-    sequence_conv_op
-    sequence_pool_op
-    lod_rank_table_op
-    lod_tensor_to_array_op
-    array_to_lod_tensor_op
-    max_sequence_len_op
-    lstm_op
-    gru_op
-    adagrad_op
-    sgd_op
-    save_op
-    load_op
-    send_op
-    recv_op
-    detection_output_op)
 if(WITH_GPU)
    op_library(nccl_op DEPS nccl_common)
 else()
--- a/paddle/operators/math/math_function.cc
+++ b/paddle/operators/math/math_function.cc
@ -247,7 +247,10 @@ template struct SetConstant<platform::CPUDeviceContext, bool>;

 #define DEFINE_CPU_TRANS(RANK)                                          \
  template struct Transpose<platform::CPUDeviceContext, float, RANK>;   \
-  template struct Transpose<platform::CPUDeviceContext, double, RANK>;
+  template struct Transpose<platform::CPUDeviceContext, double, RANK>;  \
+  template struct Transpose<platform::CPUDeviceContext, int, RANK>;     \
+  template struct Transpose<platform::CPUDeviceContext, int64_t, RANK>; \
+  template struct Transpose<platform::CPUDeviceContext, bool, RANK>;

 DEFINE_CPU_TRANS(1);
 DEFINE_CPU_TRANS(2);
--- a/paddle/platform/CMakeLists.txt
+++ b/paddle/platform/CMakeLists.txt
@ -30,3 +30,6 @@ nv_test(device_context_test SRCS device_context_test.cu DEPS device_context gpu_
 nv_test(cudnn_helper_test SRCS cudnn_helper_test.cc DEPS dynload_cuda)
 nv_test(transform_test SRCS transform_test.cu DEPS paddle_memory place device_context)
 nv_test(nccl_test SRCS nccl_test.cu DEPS dynload_cuda gpu_info device_context)
+
+cc_library(profiler SRCS profiler.cc DEPS device_context)
+cc_test(profiler_test SRCS profiler_test.cc DEPS profiler)
--- a/paddle/platform/profiler.cc
+++ b/paddle/platform/profiler.cc
@ -0,0 +1,173 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/platform/profiler.h"
+
+namespace paddle {
+namespace platform {
+
+// Current profiler state; profiling starts out disabled.
+static ProfilerState g_state = ProfilerState::kDisabled;
+// Per-thread index, taken from g_next_thread_id the first time the thread
+// calls GetEventList().
+static thread_local int32_t g_thread_id;
+// Global counter used to hand out thread indices; its value equals the number
+// of threads that have created an EventList so far.
+static uint32_t g_next_thread_id = 0;
+// Guards g_all_event_lists and g_next_thread_id.
+static std::mutex g_all_event_lists_mutex;
+// The event lists of every thread that has created one.
+static std::list<std::shared_ptr<EventList>> g_all_event_lists;
+// This thread's own event list, created lazily by GetEventList().
+static thread_local std::shared_ptr<EventList> g_event_list;
+
+// Returns the current reading of a steady clock, in nanoseconds since that
+// clock's epoch. high_resolution_clock is used when it is steady, and
+// steady_clock otherwise.
+inline uint64_t GetTimeInNsec() {
+  using clock = std::conditional<std::chrono::high_resolution_clock::is_steady,
+                                 std::chrono::high_resolution_clock,
+                                 std::chrono::steady_clock>::type;
+  const auto since_epoch = clock::now().time_since_epoch();
+  const auto nanos =
+      std::chrono::duration_cast<std::chrono::nanoseconds>(since_epoch);
+  return nanos.count();
+}
+
+Event::Event(EventKind kind, std::string name, uint32_t thread_id,
+             DeviceContext* dev_ctx)
+    : kind_(kind),
+      name_(std::move(name)),
+      thread_id_(thread_id),
+      has_cuda_(false) {  // set to true below only when a CUDA event is recorded
+#ifdef PADDLE_WITH_CUDA
+  auto* cuda_dev_ctx = static_cast<const CUDADeviceContext*>(dev_ctx);  // unchecked cast; NOTE(review): presumably callers pass nullptr or a CUDA context -- confirm
+  if (cuda_dev_ctx) {  // a null context means a CPU-only event
+    PADDLE_ENFORCE(cudaGetDevice(&device_));  // remember the current device
+    PADDLE_ENFORCE(cudaEventCreate(&event_));
+    auto stream = cuda_dev_ctx->stream();
+    PADDLE_ENFORCE(cudaEventRecord(event_, stream));  // record on the context's stream
+    has_cuda_ = true;
+  }
+#endif
+  cpu_ns_ = GetTimeInNsec();  // host-side timestamp, always taken
+}
+
+// Returns the textual name of this event's kind: "mark", "push" or "pop".
+// Any other value raises via PADDLE_THROW.
+std::string Event::kind() const {
+  if (kind_ == EventKind::kMark) return "mark";
+  if (kind_ == EventKind::kPushRange) return "push";
+  if (kind_ == EventKind::kPopRange) return "pop";
+  PADDLE_THROW("Unknown EventKind.");
+}
+
+// Host-side time from this event to `e`, in microseconds.
+double Event::CpuElapsedUs(const Event& e) const {
+  const auto elapsed_ns = e.cpu_ns_ - cpu_ns_;
+  return elapsed_ns / (1000.0);
+}
+
+double Event::CudaElapsedUs(const Event& e) const {
+#ifdef PADDLE_WITH_CUDA
+  PADDLE_ENFORCE(e.has_cuda() && has_cuda());  // both events must carry a CUDA event
+  PADDLE_ENFORCE(e.device() == device());  // and both must report the same device
+  PADDLE_ENFORCE(cudaEventSynchronize(event_));  // wait for both events to complete
+  PADDLE_ENFORCE(cudaEventSynchronize(e.event()));
+  float ms;
+  PADDLE_ENFORCE(cudaEventElapsedTime(&ms, event_, e.event()));  // time from this event to e, in milliseconds
+  return ms * 1000.0;  // milliseconds -> microseconds
+#else
+  PADDLE_THROW("CUDA is not enabled");
+#endif
+}
+
+#ifdef PADDLE_WITH_CUDA
+// Calls func(i) for every CUDA device index i, making device i current first,
+// and finally switches back to the device that was current on entry.
+static void ForEachDevice(std::function<void(int)> func) {
+  auto saved_device = GetCurrentDeviceId();
+  int device_count = GetCUDADeviceCount();
+  int dev = 0;
+  while (dev < device_count) {
+    SetDeviceId(dev);
+    func(dev);
+    dev++;
+  }
+  SetDeviceId(saved_device);
+}
+#endif
+
+// Returns the calling thread's EventList. On the first call from a thread the
+// list is created, the thread gets its index, and the list is added to
+// g_all_event_lists, all under g_all_event_lists_mutex.
+inline EventList& GetEventList() {
+  if (g_event_list) return *g_event_list;
+  {
+    std::lock_guard<std::mutex> lock(g_all_event_lists_mutex);
+    g_event_list = std::make_shared<EventList>();
+    g_thread_id = g_next_thread_id++;
+    g_all_event_lists.emplace_front(g_event_list);
+  }
+  return *g_event_list;
+}
+
+// Records a kMark event named `name` in the calling thread's event list.
+// `dev_ctx` may be nullptr for a CPU-only event. The profiler state is not
+// checked here.
+void Mark(const std::string& name, DeviceContext* dev_ctx) {
+  // `name` is a const reference, so std::move could never move from it; pass
+  // it as is and let Event take its own copy.
+  GetEventList().Record(EventKind::kMark, name, g_thread_id, dev_ctx);
+}
+
+// Records a kPushRange event named `name` unless profiling is disabled.
+// `dev_ctx` is stored for the matching pop event recorded by the destructor.
+RecordEvent::RecordEvent(const std::string& name, DeviceContext* dev_ctx)
+    // Always initialise the member: the destructor reads it whenever profiling
+    // is enabled at destruction time, even if it was disabled here.
+    : dev_ctx_(dev_ctx) {
+  if (g_state == ProfilerState::kDisabled) return;
+  // `name` is a const reference, so std::move could never move from it.
+  GetEventList().Record(EventKind::kPushRange, name, g_thread_id, dev_ctx_);
+}
+
+// Records the kPopRange event (with an empty name) that closes the range,
+// unless profiling is disabled.
+RecordEvent::~RecordEvent() {
+  if (g_state != ProfilerState::kDisabled) {
+    GetEventList().Record(EventKind::kPopRange, std::string(), g_thread_id,
+                          dev_ctx_);
+  }
+}
+
+// Switches the profiler from disabled to `state` (kCPU or kCUDA) and records
+// a "_start_profiler_" mark. In kCUDA mode a few "_cuda_startup_" marks are
+// recorded on every device first.
+void EnableProfiler(ProfilerState state) {
+  // Adjacent literals are concatenated into one message; separating them with
+  // a comma would pass the second one as a format argument instead.
+  PADDLE_ENFORCE(state != ProfilerState::kDisabled,
+                 "Can't enable profiling, since the input state is "
+                 "ProfilerState::kDisabled");
+  PADDLE_ENFORCE(g_state == ProfilerState::kDisabled,
+                 "The profiling state should be disabled when calling "
+                 "EnableProfiler.");
+  g_state = state;
+#ifdef PADDLE_WITH_CUDA
+  if (g_state == ProfilerState::kCUDA) {
+    // Generate some dummy events first to reduce the startup overhead.
+    for (int i = 0; i < 5; i++) {
+      ForEachDevice([](int d) {
+        // Stack-allocated so each warm-up context is released right away
+        // instead of leaking one context per device per round.
+        CUDAPlace place(d);
+        CUDADeviceContext dev_ctx(place);
+        Mark("_cuda_startup_", &dev_ctx);
+        dev_ctx.Wait();
+      });
+    }
+  }
+#endif
+  // Mark the profiling start.
+  Mark("_start_profiler_", nullptr);
+}
+
+// Records a "_stop_profiler_" mark, disables profiling, and returns the events
+// of every registered thread list, one inner vector per list. Each list is
+// emptied by Reduce().
+std::vector<std::vector<Event>> DisableProfiler() {
+  PADDLE_ENFORCE(g_state != ProfilerState::kDisabled,
+                 "Can't disable profiling, since it's not starting.");
+  // Mark the profiling stop.
+  Mark("_stop_profiler_", nullptr);
+  g_state = ProfilerState::kDisabled;
+  std::vector<std::vector<Event>> all_events;
+  std::lock_guard<std::mutex> lock(g_all_event_lists_mutex);
+  for (const auto& event_list : g_all_event_lists) {
+    all_events.emplace_back(event_list->Reduce());
+  }
+  return all_events;
+}
+
+}  // namespace platform
+}  // namespace paddle
--- a/paddle/platform/profiler.h
+++ b/paddle/platform/profiler.h
@ -0,0 +1,114 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+#include <forward_list>
+#include <list>
+#include <mutex>
+#include <vector>
+#include "paddle/platform/device_context.h"
+
+namespace paddle {
+namespace platform {
+
+enum EventKind { kMark, kPushRange, kPopRange };
+
+class Event {
+ public:
+  // The DeviceContext is used to get the CUDA stream the event is recorded on.
+  // In CPU profiling mode, nullptr can be passed.
+  Event(EventKind kind, std::string name, uint32_t thread_id,
+        DeviceContext* dev_ctx);
+
+  std::string kind() const;  // "mark", "push" or "pop"
+  std::string name() const { return name_; }
+  bool has_cuda() const { return has_cuda_; }
+
+#ifdef PADDLE_WITH_CUDA
+  cudaEvent_t event() const { return event_; }
+  int device() const { return device_; }
+#endif
+
+  double CpuElapsedUs(const Event& e) const;  // host time from this event to e, in microseconds
+  double CudaElapsedUs(const Event& e) const;  // CUDA time from this event to e, in microseconds
+
+ private:
+  EventKind kind_;
+  std::string name_;
+  uint32_t thread_id_;
+  int64_t cpu_ns_;  // host timestamp in nanoseconds, set by the constructor
+  bool has_cuda_;   // true when event_ below was created and recorded
+#ifdef PADDLE_WITH_CUDA
+  cudaEvent_t event_ = nullptr;
+  int device_ = -1;  // stays -1 unless a CUDA event is recorded
+#endif
+};
+
+struct EventList {
+  constexpr static size_t kMB = 1024 * 1024;
+  constexpr static size_t kEventBlockSize = 16 * kMB;  // bytes budgeted per block of events
+  constexpr static size_t kEventSize = sizeof(Event);
+  constexpr static size_t kEventAlign = alignof(Event);
+  constexpr static size_t kNumBlock =  // number of Events per block (despite the name)
+      kEventBlockSize /
+      ((kEventSize + kEventAlign - 1) / kEventAlign * kEventAlign);  // Event size rounded up to its alignment
+
+  template <typename... Args>
+  void Record(Args&&... args) {  // constructs an Event in place from args
+    if (event_blocks.empty() || event_blocks.front().size() == kNumBlock) {  // no block yet, or the newest one is full
+      event_blocks.emplace_front();
+      event_blocks.front().reserve(kNumBlock);  // reserve the whole block up front
+    }
+    event_blocks.front().emplace_back(std::forward<Args>(args)...);
+  }
+
+  std::vector<Event> Reduce() {  // moves every event into one vector and empties the list
+    std::vector<Event> result;
+    for (auto& block : event_blocks) {  // iterates from the newest block to the oldest
+      result.insert(result.begin(), std::make_move_iterator(block.begin()),  // prepend, so older blocks end up first
+                    std::make_move_iterator(block.end()));
+    }
+    event_blocks.clear();
+    return result;
+  }
+
+  std::forward_list<std::vector<Event>> event_blocks;  // blocks of events, newest block at the front
+};
+
+enum ProfilerState {
+  kDisabled,  // disabled state
+  kCPU,       // CPU profiling state
+  kCUDA,      // GPU profiling state
+};
+
+void Mark(const std::string& name, DeviceContext* dev_ctx);
+
+struct RecordEvent {
+  explicit RecordEvent(const std::string& name, DeviceContext* dev_ctx);  // records a push-range event unless profiling is disabled
+
+  ~RecordEvent();  // records the matching pop-range event unless profiling is disabled
+
+  // The device context is passed to Event, which uses it to get the CUDA stream.
+  DeviceContext* dev_ctx_;
+};
+
+// Enable the profiling function.
+void EnableProfiler(ProfilerState state);
+
+// Return the event lists of all threads. Assume the returned value is called
+// event_lists; event_lists[i][j] is then the j-th Event of the i-th thread.
+std::vector<std::vector<Event>> DisableProfiler();
+
+}  // namespace platform
+}  // namespace paddle
--- a/paddle/platform/profiler_test.cc
+++ b/paddle/platform/profiler_test.cc
@ -0,0 +1,98 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/platform/profiler.h"
+#include "gtest/gtest.h"
+
+// Two CPU-only events created one after the other must report a positive
+// host-side elapsed time.
+TEST(Event, CpuElapsedTime) {
+  using paddle::platform::Event;
+  using paddle::platform::EventKind;
+
+  Event start_event(EventKind::kPushRange, "test", 0, nullptr);
+  EXPECT_FALSE(start_event.has_cuda());
+  // A little busy work between the two events.
+  for (int counter = 0; counter != 1000;) {
+    counter++;
+  }
+  Event stop_event(EventKind::kPopRange, "test", 0, nullptr);
+  const double elapsed_us = start_event.CpuElapsedUs(stop_event);
+  EXPECT_GT(elapsed_us, 0);
+}
+
+#ifdef PADDLE_WITH_CUDA
+// Two events recorded with the same CUDA context must report a positive CUDA
+// elapsed time.
+TEST(Event, CudaElapsedTime) {
+  using paddle::platform::DeviceContext;
+  using paddle::platform::CUDADeviceContext;
+  using paddle::platform::CUDAPlace;
+  using paddle::platform::Event;
+  using paddle::platform::EventKind;
+
+  // Stack-allocated so the context is released when the test ends.
+  CUDADeviceContext cuda_ctx(CUDAPlace(0));
+  DeviceContext* dev_ctx = &cuda_ctx;
+  Event start_event(EventKind::kPushRange, "test", 0, dev_ctx);
+  EXPECT_TRUE(start_event.has_cuda() == true);
+  int counter = 0;
+  while (counter != 1000) {
+    counter++;
+  }
+  Event stop_event(EventKind::kPopRange, "test", 0, dev_ctx);
+  EXPECT_GT(start_event.CudaElapsedUs(stop_event), 0);
+}
+#endif
+
+// Enables the profiler, records four scoped ranges, disables the profiler and
+// checks the returned events: every push is directly followed by its pop with
+// a positive elapsed time, and the start/stop marks each appear exactly once.
+TEST(RecordEvent, RecordEvent) {
+  using paddle::platform::DeviceContext;
+  using paddle::platform::Event;
+  using paddle::platform::EventKind;
+  using paddle::platform::RecordEvent;
+  using paddle::platform::ProfilerState;
+
+  ProfilerState state = ProfilerState::kCPU;
+  DeviceContext* dev_ctx = nullptr;
+#ifdef PADDLE_WITH_CUDA
+  using paddle::platform::CUDADeviceContext;
+  using paddle::platform::CUDAPlace;
+  state = ProfilerState::kCUDA;
+  // Stack-allocated so the context is released when the test ends.
+  CUDADeviceContext cuda_ctx(CUDAPlace(0));
+  dev_ctx = &cuda_ctx;
+#endif
+  EnableProfiler(state);
+
+  for (int i = 1; i < 5; ++i) {
+    std::string name = "op_" + std::to_string(i);
+    RecordEvent record_event(name, dev_ctx);
+    int counter = 1;
+    while (counter != i * 1000) counter++;
+  }
+  std::vector<std::vector<Event>> events = paddle::platform::DisableProfiler();
+  int cuda_startup_count = 0;
+  int start_profiler_count = 0;
+  int stop_profiler_count = 0;
+  for (size_t i = 0; i < events.size(); ++i) {
+    for (size_t j = 0; j < events[i].size(); ++j) {
+      if (events[i][j].name() == "_cuda_startup_") ++cuda_startup_count;
+      if (events[i][j].name() == "_start_profiler_") ++start_profiler_count;
+      if (events[i][j].name() == "_stop_profiler_") ++stop_profiler_count;
+      // "push"/"pop" are event kinds, not names (the names are "op_N" and the
+      // empty string), so the pairing has to be checked through kind().
+      if (events[i][j].kind() == "push") {
+        ASSERT_LT(j + 1, events[i].size());
+        EXPECT_EQ(events[i][j + 1].kind(), "pop");
+#ifdef PADDLE_WITH_CUDA
+        EXPECT_GT(events[i][j].CudaElapsedUs(events[i][j + 1]), 0);
+#else
+        EXPECT_GT(events[i][j].CpuElapsedUs(events[i][j + 1]), 0);
+#endif
+      }
+    }
+  }
+  EXPECT_EQ(cuda_startup_count % 5, 0);
+  EXPECT_EQ(start_profiler_count, 1);
+  EXPECT_EQ(stop_profiler_count, 1);
+}
--- a/python/paddle/v2/fluid/backward.py
+++ b/python/paddle/v2/fluid/backward.py
@ -57,6 +57,8 @@ def _all_in_set_(cands, s):
    """
    Test if all elements of 'cands' are in set 's'
    """
+    if len(cands) == 0:
+        return False
    for c in cands:
        if not c in s:
            return False
@ -136,12 +138,23 @@ def _remove_no_grad_branch_(op_descs, no_grad_set):
    Remove unnecessary grad ops
    A grad op can be removed in two cases:
        1. all outputs of the grad op are in 'no_grad_set'
-        2. (TODO) all grad inputs of the grad op are in 'no_grad_set'
+        2. all grad inputs of the grad op are in 'no_grad_set'
    """
+
+    def _op_can_be_removed_(op_desc, no_grad_set):
+        out_arg_names = op_desc.output_arg_names()
+        if len(out_arg_names) == 0 or _all_in_set_(out_arg_names, no_grad_set):
+            return True
+        if _all_in_set_(
+                filter(lambda name: name.find(core.grad_var_suffix()) != -1,
+                       op_desc.input_arg_names()), no_grad_set):
+            no_grad_set.union(out_arg_names)
+            return True
+        return False
+
    # Remove ops whose outputs are all in no_grad_dict
    op_descs = filter(
-        lambda op_desc: not _all_in_set_(op_desc.output_arg_names(), no_grad_set),
-        op_descs)
+        lambda op_desc: not _op_can_be_removed_(op_desc, no_grad_set), op_descs)
    # Insert fill_zeros_like_op
    to_insert = []
    for idx, op_desc in enumerate(op_descs):
@ -284,7 +297,9 @@ def append_backward(loss, parameter_list=None, no_grad_set=None):
                    block_no_grad_set.add(_append_grad_suffix_(var.name))
            no_grad_dict[block.idx] = block_no_grad_set
    elif isinstance(no_grad_set, set):
-        no_grad_dict = {0: no_grad_set}
+        no_grad_dict = {
+            0: set([_append_grad_suffix_(name) for name in no_grad_set])
+        }
    else:
        raise ValueError("'no_grad_set' should be a set or None.")

--- a/python/paddle/v2/fluid/layers/control_flow.py
+++ b/python/paddle/v2/fluid/layers/control_flow.py
@ -16,6 +16,36 @@ __all__ = [


 def split_lod_tensor(input, mask, level=0):
+    """
+    **split_lod_tensor**
+
+    This function takes in an input that contains the complete lod information,
+    and takes in a mask which is used to mask certain parts of the input.
+    The output is the true branch and the false branch with the mask applied to
+    the input at a certain level in the tensor.
+
+    Args:
+        input(tuple|list|None): The input tensor that contains complete
+                                lod information needed to construct the output.
+        mask(list): A bool column vector which masks the input.
+        level(int): The specific lod level to rank.
+
+    Returns:
+        Variable: The true branch of tensor as per the mask applied to input.
+        Variable: The false branch of tensor as per the mask applied to input.
+
+    Examples:
+        .. code-block:: python
+
+          x = layers.data(name='x', shape=[1])
+          x.persistable = True
+
+          y = layers.data(name='y', shape=[1])
+          y.persistable = True
+
+          out_true, out_false = layers.split_lod_tensor(
+                input=x, mask=y, level=0)
+    """
    helper = LayerHelper('split_lod_tensor', **locals())
    out_true = helper.create_tmp_variable(dtype=input.dtype)
    out_false = helper.create_tmp_variable(dtype=input.dtype)
@ -32,6 +62,40 @@ def split_lod_tensor(input, mask, level=0):


 def merge_lod_tensor(in_true, in_false, x, mask, level=0):
+    """
+    **merge_lod_tensor**
+
+    This function takes in an input :math:`x`, the True branch, the False
+    branch and a binary :math:`mask`. Using this information, this function
+    merges the True and False branches of the tensor into a single Output
+    at a certain lod level indicated by :math:`level`.
+
+    Args:
+        in_true(tuple|list|None): The True branch to be merged.
+        in_false(tuple|list|None): The False branch to be merged.
+        x(tuple|list|None): The input tensor that contains complete
+                            lod information needed to construct the output.
+        mask(list): A bool column vector which masks the input.
+        level(int): The specific lod level to rank.
+
+    Returns:
+        Variable: The merged output tensor.
+
+    Examples:
+        .. code-block:: python
+
+          x = layers.data(
+                      name='x', shape=[1], dtype='float32', stop_gradient=False)
+          y = layers.data(
+                name='y', shape=[1], dtype='bool', stop_gradient=False)
+
+          level = 0
+
+          out_true, out_false = layers.split_lod_tensor(
+                input=x, mask=y, level=level)
+          out = layers.merge_lod_tensor(
+                in_true=out_true, in_false=out_false, mask=y, x=x, level=level)
+    """
    helper = LayerHelper('merge_lod_tensor', **locals())
    out = helper.create_tmp_variable(dtype=in_true.dtype)
    helper.append_op(
@ -397,9 +461,50 @@ class While(object):


 def lod_rank_table(x, level=0):
-    """
-    This function creates an operator for creating a LOD_RANK_TABLE
-    using the input x.
+    """LoD Rank Table Operator. Given an input variable **x** and a level number
+    of LoD, this layer creates a LoDRankTable object. A LoDRankTable object
+    contains a list of bi-element tuples. Each tuple consists of an index and
+    a length, both of which are int type. Referring to specified level of LoD,
+    the index is the sequence index number and the length represents the
+    sequence length. Please note that the list is ranked in descending order by
+    the length. The following is an example:
+
+        .. code-block:: text
+
+            x is a LoDTensor:
+                x.lod = [[0,                2, 3],
+                         [0,             5, 6, 7]]
+                x.data = [a, b, c, d, e, f, g]
+
+            1. set level to 0:
+                Create lod rank table:
+                    lod_rank_table_obj = lod_rank_table(x, level=0)
+
+                Get:
+                    lod_rank_table_obj.items() = [(0, 2), (1, 1)]
+
+            2. set level to 1:
+                Create lod rank table:
+                    lod_rank_table_obj = lod_rank_table(x, level=1)
+
+                Get:
+                    lod_rank_table_obj.items() = [(0, 5), (1, 1), (2, 1)]
+
+    Args:
+        x (Variable): Input variable, a LoDTensor based on which to create the
+            lod rank table.
+        level (int): Specify the LoD level, on which to create the lod rank
+            table.
+
+    Returns:
+        Variable: The created LoDRankTable object.
+
+    Examples:
+        .. code-block:: python
+
+            x = fluid.layers.data(name='x', shape=[10],
+                            dtype='float32', lod_level=1)
+            out = layers.lod_rank_table(x=x, level=0)
    """
    helper = LayerHelper("lod_rank_table", **locals())
    table = helper.create_variable(
@ -414,9 +519,25 @@ def lod_rank_table(x, level=0):


 def max_sequence_len(rank_table):
-    """
-    This function creates an operator to calculate the length of
-    max seqence through input rank_table(should be a lod_rank_table)
+    """Max Sequence Len Operator. Given a LoDRankTable object, this layer
+    returns the max length of a batch of sequences. In fact, a LoDRankTable
+    object contains a list of tuples(<sequence index, sequence length>) and
+    the list is already sorted by sequence length in descending order, so the
+    operator just returns the sequence length of the first tuple element.
+
+    Args:
+        rank_table (Variable): Input variable which is a LoDRankTable object.
+
+    Returns:
+        Variable: The max length of sequence.
+
+    Examples:
+        .. code-block:: python
+
+            x = fluid.layers.data(name='x', shape=[10],
+                            dtype='float32', lod_level=1)
+            rank_table = layers.lod_rank_table(x=x, level=0)
+            max_seq_len = layers.max_sequence_len(rank_table)
    """
    helper = LayerHelper("max_seqence_len", **locals())
    res = helper.create_tmp_variable(dtype="int64")
@ -428,6 +549,30 @@ def max_sequence_len(rank_table):


 def topk(input, k):
+    """
+    **topk**
+
+    This function selects the k largest entries in the input vector and
+    outputs their values and indices as vectors. Thus topk_out[j] is the
+    j-th largest entry in input, and its index is topk_indices[j].
+
+    Args:
+        input (Variable|list): The input tensor that has all the data.
+        k (int): The number of top elements that the function will pick.
+
+    Returns:
+        Variable: The variable of type array that contains the k largest entries
+                  from input.
+        Variable: The variable of type array that contains the indices of k
+                  largest entries from input.
+
+    Examples:
+        .. code-block:: python
+
+          x = fluid.layers.data(name='x', shape=[10])
+          k = 5
+          array = fluid.layers.topk(x, k)
+    """
    helper = LayerHelper('topk', **locals())
    topk_out = helper.create_tmp_variable(dtype=input.data_type)
    topk_indices = helper.create_tmp_variable(dtype='int64')
--- a/python/paddle/v2/fluid/layers/nn.py
+++ b/python/paddle/v2/fluid/layers/nn.py
@ -426,8 +426,36 @@ def cross_entropy(input, label, **kwargs):

 def square_error_cost(input, label, **kwargs):
    """
-    This functions returns the squared error cost using the input and label.
-    The output is appending the op to do the above.
+    **Square error cost layer**
+
+    This layer accepts input predictions and target label and returns the squared error cost.
+    For predictions, :math:`X`, and target labels, :math:`Y`, the equation is:
+
+    .. math::
+
+        Out = (X - Y)^2
+
+    In the above equation:
+
+        * :math:`X`: Input predictions, a tensor.
+        * :math:`Y`: Input labels, a tensor.
+        * :math:`Out`: Output value, same shape with :math:`X`.
+
+    Args:
+       input(Variable): Input tensor, has predictions.
+       label(Variable): Label tensor, has target labels.
+
+    Returns:
+        Variable: The tensor variable storing the element-wise squared error difference \
+                  of input and label.
+
+    Examples:
+        .. code-block:: python
+
+          y = layers.data(name='y', shape=[1], dtype='float32')
+          y_predict = layers.data(name='y_predict', shape=[1], dtype='float32')
+          cost = layers.square_error_cost(input=y_predict, label=y)
+
    """
    helper = LayerHelper('square_error_cost', **kwargs)
    minus_out = helper.create_tmp_variable(dtype=input.dtype)
--- a/python/paddle/v2/fluid/layers/tensor.py
+++ b/python/paddle/v2/fluid/layers/tensor.py
@ -201,15 +201,47 @@ def fill_constant_batch_size_like(input,

 def ones(shape, dtype):
    """
-    This function performs the same function as fill_constant() declared above
-    with the constant value being 1.0.
+    **ones**
+
+    This function creates a tensor of specified *shape* and
+    *dtype*, and initializes this with 1.
+
+    It also sets *stop_gradient* to True.
+
+    Args:
+        shape(tuple|list|None): Shape of output tensor
+        dtype(np.dtype|core.DataType|str): Data type of output tensor
+
+    Returns:
+        Variable: The tensor variable storing the output
+
+    Examples:
+        .. code-block:: python
+
+          data = fluid.layers.ones(shape=[1], dtype='int64')
    """
    return fill_constant(value=1.0, **locals())


 def zeros(shape, dtype):
    """
-    This function performs the same function as fill_constant() declared above
-    with the constant value being 0.0.
+    **zeros**
+
+    This function creates a tensor of specified *shape* and
+    *dtype*, and initializes this with 0.
+
+    It also sets *stop_gradient* to True.
+
+    Args:
+        shape(tuple|list|None): Shape of output tensor
+        dtype(np.dtype|core.DataType|str): Data type of output tensor
+
+    Returns:
+        Variable: The tensor variable storing the output
+
+    Examples:
+        .. code-block:: python
+
+          data = fluid.layers.zeros(shape=[1], dtype='int64')
    """
    return fill_constant(value=0.0, **locals())