Merge branch 'develop' into expand

createGenDocLib
luotao1 7 years ago
commit b61cf7ac4f

@ -138,12 +138,6 @@ else()
set(THIRD_PARTY_BUILD_TYPE Release)
endif()
if(WITH_MKL)
option(MKL_SPLIT_GEMM "PaddlePaddle MKL gemm would split to small ones" OFF)
if (MKL_SPLIT_GEMM)
add_definitions(-DPADDLE_MKL_SPLIT_GEMM)
endif()
endif()
set(WITH_MKLML ${WITH_MKL})
if (NOT DEFINED WITH_MKLDNN)
if (WITH_MKL AND AVX2_FOUND)

@ -54,7 +54,7 @@ ExternalProject_Add(
${EXTERNAL_PROJECT_LOG_ARGS}
DEPENDS ${MKLDNN_DEPENDS}
GIT_REPOSITORY "https://github.com/01org/mkl-dnn.git"
GIT_TAG "a29d8487a63afca3d5b8c5bbdbb473cf8ccc6e51"
GIT_TAG "64e03a1939e0d526aa8e9f2e3f7dc0ad8d372944"
PREFIX ${MKLDNN_SOURCES_DIR}
UPDATE_COMMAND ""
CMAKE_ARGS -DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER}

@ -1,7 +1,7 @@
# Distributed Training with NCCL2
We design a pattern that can enable training with `ParallelExecutor` and
using [NCCL2](https://developer.nvidia.com/nccl) as it's collective
use [NCCL2](https://developer.nvidia.com/nccl) as its collective
communication library.
In `ParallelExecutor` we can use `AllReduce` or `Reduce` and `Broadcast`
@ -9,14 +9,14 @@ to do multi GPU training. And if we initialize NCCL2 communicators as
ranks in a distributed environment, we can simply run the `ParallelExecutor`
as a distributed program! The only thing that may be different than in
the single node version is that we need to broadcast the NCCL unique ID
to all the nodes, and initialize communicators using that ID, so NCCL2
will know each other as ranks.
to all the nodes and initialize communicators using that ID, so the
NCCL2 ranks can recognize each other.
To achieve this feature, we introduce a new operator: `gen_nccl_id` op,
so we are ***not*** bound to running NCCL2 with MPI; we can run it on
what ever platform you like.
whatever platform you like.
It have two running modes:
It has two running modes:
1. Generate and broadcast mode, which should be used on trainer 0;
1. Listen and fetch mode, which should be used on trainers other than 0.
@ -29,7 +29,7 @@ initialize NCCL communicator objects.
<img src="src/ncc2_design.png">
The above figure indicates the general process when training with NCCL2
distributed. Each trainer have the number of communicators equal to the
distributed. Each trainer has the number of communicators equal to the
number of GPUs, but the ranks should match the global rank numbers: here
we have 8 GPUs in total, so `nranks==8`; for each trainer, the ranks should
be 0 ~ 3 on trainer 0 and 4 ~ 7 on trainer 1.
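For readers unfamiliar with the NCCL2 bootstrap this design relies on, the sketch below shows the underlying handshake in plain C++ against the raw NCCL API: rank 0 generates the unique ID, every node obtains it, and each process joins the communicator under its global rank. The transport helpers are hypothetical placeholders; in Fluid this job is done by the `gen_nccl_id` op rather than MPI.

```cpp
// Conceptual sketch only: how the NCCL2 unique-id bootstrap works.
// broadcast_id()/receive_id() are hypothetical placeholders for whatever
// transport ships the id between nodes (in Fluid, the gen_nccl_id op).
#include <cuda_runtime.h>
#include <nccl.h>

void InitNcclComm(int global_rank, int nranks, int local_gpu) {
  ncclUniqueId id;
  if (global_rank == 0) {
    ncclGetUniqueId(&id);    // trainer 0 generates the unique id
    // broadcast_id(&id);    // send it to every other node
  } else {
    // receive_id(&id);      // fetch it from trainer 0
  }
  cudaSetDevice(local_gpu);  // one communicator per local GPU
  ncclComm_t comm;
  // All processes pass the same id and the total rank count; each joins
  // under its own global rank (0 ~ 3 on trainer 0, 4 ~ 7 on trainer 1).
  ncclCommInitRank(&comm, nranks, id, global_rank);
  // comm is now ready for ncclAllReduce / ncclBroadcast across all ranks.
}
```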

@ -36,19 +36,19 @@
<tbody>
<tr>
<td>OpProtoMake definition </td>
<td>`.cc` file; a backward op does not need to define OpProtoMake </td>
<td>.cc file; a backward op does not need to define OpProtoMake </td>
</tr>
<tr>
<td>Op definition </td>
<td> `.cc` file</td>
<td> .cc file</td>
</tr>
<tr>
<td>Kernel implementation </td>
<td> If the CPU and CUDA kernels share an implementation, it goes in the `.h` file; otherwise the CPU implementation goes in the `.cc` file and the CUDA implementation in the `.cu` file.</td>
<td> If the CPU and CUDA kernels share an implementation, it goes in the .h file; otherwise the CPU implementation goes in the .cc file and the CUDA implementation in the .cu file.</td>
</tr>
<tr>
<td>Op registration </td>
<td> Op registration goes in the `.cc` file; CPU kernel registration goes in the `.cc` file and CUDA kernel registration in the `.cu` file</td>
<td> Op registration goes in the .cc file; CPU kernel registration goes in the .cc file and CUDA kernel registration in the .cu file</td>
</tr>
</tbody>
</table>
@ -391,7 +391,7 @@ PADDLE_ENFORCE(ctx->HasInput("X"), "");
```
Problem example 2: the error message is too terse
```
PADDLE_ENFORCE(i != nullptr, "I must be set"); // What is I?
PADDLE_ENFORCE(i != nullptr, "i must be set"); // What is i?
```
2. Using developer-defined variable abbreviations in the error message makes it hard to understand! A more informative alternative is sketched below.
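A hedged illustration of the kind of message this guideline asks for; the operator and input names here are made up for the example:
```cpp
// Hypothetical example: name the input, the operator, and the expected state,
// so the message is meaningful without reading the C++ source.
PADDLE_ENFORCE(i != nullptr,
               "Input(X) of FooOp must be initialized before FooOp runs.");
```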

@ -1,12 +1,12 @@
# Distributed Training with NCCL2 and RDMA
When doing distributed multi-GPU training, network bandwith often becomes the
bottle neck. We introduce a way to use NCCL2 to do such training job to
achieve best performace.
When doing distributed multi-GPU training, network bandwidth often becomes the
bottleneck. We introduce a way to use NCCL2 for such training jobs to
achieve the best performance.
## Prepare Hardwares with RDMA and Multiple GPUs
## Prepare Hardware with RDMA and Multiple GPUs
I'm using two Linux servers each of them is installed with 8 GPUs and
I'm using two Linux servers, each installed with 8 GPUs and
one 100Gb RDMA card.
Base environment is:
@ -25,7 +25,7 @@ In general, the steps including:
1. Use docker to run tests and make sure GPUs and RDMA can work inside
the container.
I'll ommit section "Install GPU drivers" because we can find it easily
I'll omit the section "Install GPU drivers" because we can find it easily
somewhere else.
### Install RDMA drivers
@ -33,7 +33,7 @@ somewhere else.
For my case, I've got two machines with device
"Mellanox Technologies MT27700 Family [ConnectX-4]" installed. The OS was
"CentOS 7.4" and I updated the kernel to version 4.4 so that docker can
work with latest overlay2 filesystem.
work with the latest overlay2 filesystem.
***NOTE: before you start, make sure you have a way to get a console
of the server other than ssh because we may need to re-configure the
@ -45,14 +45,14 @@ network device.***
1. Run `./mlnxofedinstall --add-kernel-support` in the software package.
1. Run `/etc/init.d/openibd restart` to make everything work, note that
this operation may cause the network to go down if you are using this
RDMA device as default network device and use ssh to login the server.
RDMA device as the default network device and use ssh to log in to the server.
1. Re-configure the network interface, for example:
`ifconfig eth2 192.168.16.30/20 up`, then add routes if needed:
`ip route add default via 192.168.16.1 dev eth2`.
1. Do the same thing on the other node.
1. Use `ping` to test whether the two nodes have a basic ICMP connection.
1. Use either `udaddy` or `ib_write_bw` to test the network connection is
ready and have the desired bandwith.
ready and has the desired bandwidth.
### Prepare Docker Image to Run RDMA Programs
@ -60,7 +60,7 @@ network device.***
package in it.
1. Start a docker container and mount GPU driver libs into it (you can
skip this step if you are using nvidia-docker).
1. Mount RDMA dirvers and libs into the docker image (see below section),
1. Mount RDMA drivers and libs into the docker image (see the section below),
also `udaddy` and `ib_write_bw` if needed.
1. Mount GPU devices and RDMA devices into the container using `--device`
or just use privileged mode `--privileged`.

@ -162,6 +162,7 @@ paddle.fluid.layers.crop ArgSpec(args=['x', 'shape', 'offsets', 'name'], varargs
paddle.fluid.layers.rank_loss ArgSpec(args=['label', 'left', 'right', 'name'], varargs=None, keywords=None, defaults=(None,))
paddle.fluid.layers.prelu ArgSpec(args=['x', 'mode', 'param_attr', 'name'], varargs=None, keywords=None, defaults=(None, None))
paddle.fluid.layers.flatten ArgSpec(args=['x', 'axis', 'name'], varargs=None, keywords=None, defaults=(1, None))
paddle.fluid.layers.stack ArgSpec(args=['x', 'axis'], varargs=None, keywords=None, defaults=(0,))
paddle.fluid.layers.data ArgSpec(args=['name', 'shape', 'append_batch_size', 'dtype', 'lod_level', 'type', 'stop_gradient'], varargs=None, keywords=None, defaults=(True, 'float32', 0, VarType.LOD_TENSOR, True))
paddle.fluid.layers.open_recordio_file ArgSpec(args=['filename', 'shapes', 'lod_levels', 'dtypes', 'pass_num', 'for_parallel'], varargs=None, keywords=None, defaults=(1, True))
paddle.fluid.layers.open_files ArgSpec(args=['filenames', 'shapes', 'lod_levels', 'dtypes', 'thread_num', 'buffer_size', 'pass_num', 'is_test'], varargs=None, keywords=None, defaults=(None, None, 1, None))

@ -0,0 +1,48 @@
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <cstdint>
#include "paddle/fluid/platform/hostdevice.h"
namespace paddle {
namespace framework {
template <typename T, size_t N>
class Array {
static_assert(N > 0, "The size of array must be larger than 0");
public:
HOSTDEVICE Array() {}
HOSTDEVICE explicit Array(const T &val) {
for (size_t i = 0; i < N; ++i) data_[i] = val;
}
HOSTDEVICE const T *Get() const { return data_; }
HOSTDEVICE T *GetMutable() { return data_; }
HOSTDEVICE T &operator[](size_t index) { return data_[index]; }
HOSTDEVICE const T &operator[](size_t index) const { return data_[index]; }
HOSTDEVICE constexpr size_t size() const { return N; }
private:
T data_[N];
};
} // namespace framework
} // namespace paddle
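For reference, the new `framework::Array<T, N>` added above is a fixed-size array usable from both host and device code; a minimal host-side usage sketch, based only on the interface shown in this diff:

```cpp
#include <cstdio>
#include "paddle/fluid/framework/array.h"

void ArrayExample() {
  paddle::framework::Array<int, 3> dims(1);    // fill-constructed as {1, 1, 1}
  dims[1] = 20;                                // mutable element access
  dims[2] = 30;
  for (size_t i = 0; i < dims.size(); ++i) {   // size() is the compile-time N
    std::printf("%d ", dims[i]);
  }
  const int* raw = dims.Get();                 // read-only raw pointer
  (void)raw;
}
```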

@ -129,10 +129,6 @@ void OpProtoAndCheckerMaker::operator()(proto::OpProto* proto,
"Optimized for variable")
.SetDefault({});
AddAttr<std::vector<std::string>>(OpCreationCallstackAttrName(),
"Callstack for Op Creatation.")
.SetDefault({});
Validate();
}

@ -39,7 +39,6 @@ class OpProtoAndCheckerMaker {
public:
static const char *OpRoleAttrName() { return "op_role"; }
static const char *OpRoleVarAttrName() { return "op_role_var"; }
static const char *OpCreationCallstackAttrName() { return "op_callstack"; }
void operator()(proto::OpProto *proto, OpAttrChecker *attr_checker);

@ -11,17 +11,15 @@ distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/framework/operator.h"
#include <gflags/gflags.h>
#include <glog/logging.h>
#include <algorithm>
#include <sstream>
#include <string>
#include <vector>
#include "gflags/gflags.h"
#include "glog/logging.h"
#include "paddle/fluid/framework/data_transform.h"
#include "paddle/fluid/framework/executor.h"
#include "paddle/fluid/framework/lod_tensor.h"
#include "paddle/fluid/framework/op_proto_maker.h"
#include "paddle/fluid/framework/operator.h"
#include "paddle/fluid/framework/shape_inference.h"
#include "paddle/fluid/framework/var_type.h"
#include "paddle/fluid/platform/profiler.h"
@ -129,48 +127,19 @@ static LoD GetLoD(const Scope& scope, const std::string& name) {
}
void OperatorBase::Run(const Scope& scope, const platform::Place& place) {
try {
if (VLOG_IS_ON(4)) {
VLOG(4) << place << " " << DebugStringEx(&scope);
}
if (platform::is_gpu_place(place)) {
VLOG(4) << place << " " << DebugStringEx(&scope);
if (platform::is_gpu_place(place)) {
#ifndef PADDLE_WITH_CUDA
PADDLE_THROW("Cannot run operator on place %s", place);
PADDLE_THROW("Cannot run operator on place %s", place);
#else
auto dev_id = boost::get<platform::CUDAPlace>(place).device;
platform::SetDeviceId(dev_id);
auto dev_id = boost::get<platform::CUDAPlace>(place).device;
platform::SetDeviceId(dev_id);
#endif
}
platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance();
platform::RecordEvent record_event(Type(), pool.Get(place));
RunImpl(scope, place);
if (VLOG_IS_ON(3)) {
VLOG(3) << place << " " << DebugStringEx(&scope);
}
} catch (platform::EnforceNotMet exception) {
if (Attrs().count("sub_block") != 0) {
throw exception;
}
auto& callstack = Attr<std::vector<std::string>>(
OpProtoAndCheckerMaker::OpCreationCallstackAttrName());
if (callstack.empty()) {
throw exception;
}
std::ostringstream sout;
sout << "Invoke operator " << Type() << " error.\n";
sout << "Python Callstacks: \n";
for (auto& line : callstack) {
sout << line;
}
sout << "C++ Callstacks: \n";
sout << exception.err_str_;
exception.err_str_ = sout.str();
throw exception;
} catch (...) {
std::rethrow_exception(std::current_exception());
}
platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance();
platform::RecordEvent record_event(Type(), pool.Get(place));
RunImpl(scope, place);
VLOG(3) << place << " " << DebugStringEx(&scope);
}
bool OperatorBase::HasInputs(const std::string& name) const {
@ -198,7 +167,7 @@ const std::vector<std::string>& OperatorBase::Inputs(
}
bool OperatorBase::HasOutputs(const std::string& name) const {
if (outputs_.end() != outputs_.find(name)) {
if (outputs_.find(name) != outputs_.end()) {
return true;
} else {
return false;

@ -31,7 +31,8 @@ size_t Tensor::memory_size() const {
return holder_ == nullptr ? 0UL : holder_->size() - offset_;
}
void* Tensor::mutable_data(platform::Place place, std::type_index type) {
void* Tensor::mutable_data(platform::Place place, std::type_index type,
size_t requested_size) {
if (holder_ != nullptr) {
holder_->set_type(type);
}
@ -39,7 +40,7 @@ void* Tensor::mutable_data(platform::Place place, std::type_index type) {
"When calling this method, the Tensor's numel must be "
"equal or larger than zero. "
"Please check Tensor::Resize has been called first.");
int64_t size = numel() * SizeOfType(type);
size_t size = requested_size ? requested_size : numel() * SizeOfType(type);
/* some versions of boost::variant don't have operator!= */
if (holder_ == nullptr || !(holder_->place() == place) ||
holder_->size() < size + offset_) {
@ -68,10 +69,10 @@ void* Tensor::mutable_data(platform::Place place, std::type_index type) {
offset_);
}
void* Tensor::mutable_data(platform::Place place) {
void* Tensor::mutable_data(platform::Place place, size_t requested_size) {
PADDLE_ENFORCE(this->holder_ != nullptr,
"Cannot invoke mutable data if current hold nothing.");
return mutable_data(place, holder_->type());
return mutable_data(place, holder_->type(), requested_size);
}
Tensor& Tensor::ShareDataWith(const Tensor& src) {

@ -89,22 +89,24 @@ class Tensor {
* @note If not exist, then allocation.
*/
template <typename T>
T* mutable_data(platform::Place place);
T* mutable_data(platform::Place place, size_t requested_size = 0);
void* mutable_data(platform::Place place, std::type_index type);
void* mutable_data(platform::Place place, std::type_index type,
size_t requested_size = 0);
void* mutable_data(platform::Place place);
void* mutable_data(platform::Place place, size_t requested_size = 0);
/**
* @brief Return a pointer to mutable memory block.
*
* @param[in] dims The dimensions of the memory block.
* @param[in] place The place of the memory block.
* @param[in] dims The dimensions of the memory block.
* @param[in] place The place of the memory block.
* @param[in] requested_size The size of the block in bytes.
*
* @note If not exist, then allocation.
*/
template <typename T>
T* mutable_data(DDim dims, platform::Place place);
T* mutable_data(DDim dims, platform::Place place, size_t requested_size = 0);
/*! Return the dimensions of the memory block. */
const DDim& dims() const;

@ -46,16 +46,17 @@ inline T* Tensor::data() {
}
template <typename T>
inline T* Tensor::mutable_data(DDim dims, platform::Place place) {
inline T* Tensor::mutable_data(DDim dims, platform::Place place,
size_t requested_size) {
static_assert(std::is_pod<T>::value, "T must be POD");
Resize(dims);
return mutable_data<T>(place);
return mutable_data<T>(place, requested_size);
}
template <typename T>
inline T* Tensor::mutable_data(platform::Place place) {
inline T* Tensor::mutable_data(platform::Place place, size_t requested_size) {
static_assert(std::is_pod<T>::value, "T must be POD");
return reinterpret_cast<T*>(mutable_data(place, typeid(T)));
return reinterpret_cast<T*>(mutable_data(place, typeid(T), requested_size));
}
inline Tensor ReshapeToMatrix(const Tensor& src, int num_col_dims) {
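The `requested_size` parameter introduced above lets callers allocate more bytes than `numel() * sizeof(T)`, which the MKLDNN conv kernels below use to size buffers from the primitive's memory descriptor. A hedged host-side sketch, assuming a CPU place:

```cpp
#include "paddle/fluid/framework/tensor.h"
#include "paddle/fluid/platform/place.h"

void MutableDataExample() {
  paddle::framework::Tensor t;
  t.Resize(paddle::framework::make_ddim({4, 8}));
  paddle::platform::CPUPlace cpu;

  // Default: allocate numel() * sizeof(float) = 128 bytes.
  float* p = t.mutable_data<float>(cpu);

  // New: request a larger buffer (e.g. what an MKLDNN primitive descriptor
  // reports); the holder is only reallocated if the existing one is smaller.
  float* q = t.mutable_data<float>(cpu, /*requested_size=*/256);
  (void)p;
  (void)q;
}
```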

File diff suppressed because it is too large.

@ -0,0 +1,41 @@
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include "paddle/fluid/framework/op_registry.h"
namespace paddle {
namespace operators {
using LoDTensor = framework::LoDTensor;
using Tensor = framework::Tensor;
class AttentionLSTMOp : public framework::OperatorWithKernel {
public:
using framework::OperatorWithKernel::OperatorWithKernel;
void InferShape(framework::InferShapeContext* ctx) const override;
protected:
framework::OpKernelType GetExpectedKernelType(
const framework::ExecutionContext& ctx) const override;
};
class AttentionLSTMOpMaker : public framework::OpProtoAndCheckerMaker {
public:
void Make() override;
};
} // namespace operators
} // namespace paddle

@ -135,7 +135,7 @@ class BatchNormOpMaker : public framework::OpProtoAndCheckerMaker {
AddInput("Variance",
"The global variance (for training) "
"or estimated Variance (for testing)");
AddOutput("Y", "result after normalization");
AddOutput("Y", "result after normalization").Reuse("X");
AddOutput("MeanOut",
"Share memory with Mean. "
"Store the global mean when training")

@ -53,6 +53,18 @@ class ConvMKLDNNHandler : public platform::MKLDNNHandler {
key_ += "-BWD";
}
size_t GetDstMemorySize() const {
return conv_pd_->dst_primitive_desc().get_size();
}
size_t GetDiffWeightsMemorySize() const {
return conv_bwd_weights_pd_->diff_weights_primitive_desc().get_size();
}
size_t GetDiffSourceMemorySize() const {
return conv_bwd_data_pd_->diff_src_primitive_desc().get_size();
}
std::shared_ptr<mkldnn::memory> AcquireSrcMemoryFromWeightsPrimitive(
const std::shared_ptr<mkldnn::memory> user_memory_p,
std::vector<mkldnn::primitive>& pipeline) { // NOLINT
@ -294,7 +306,6 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
const T* input_data = input->data<T>();
const T* filter_data = filter->data<T>();
T* output_data = output->mutable_data<T>(ctx.GetPlace());
std::vector<int> src_tz = paddle::framework::vectorize2int(input->dims());
std::vector<int> weights_tz =
@ -354,6 +365,8 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
auto user_weights_memory_p = handler.AcquireWeightsMemory(
user_weights_md, to_void_cast<T>(filter_data));
T* output_data =
output->mutable_data<T>(ctx.GetPlace(), handler.GetDstMemorySize());
// create reorder primitive if the input format is not the preferred one
auto src_memory_p =
handler.AcquireSrcMemoryFromPrimitive(user_src_memory_p, pipeline);
@ -476,13 +489,6 @@ class ConvMKLDNNGradOpKernel : public paddle::framework::OpKernel<T> {
T* input_grad_data = nullptr;
T* filter_grad_data = nullptr;
if (input_grad) {
input_grad_data = input_grad->mutable_data<T>(ctx.GetPlace());
}
if (filter_grad) {
filter_grad_data = filter_grad->mutable_data<T>(ctx.GetPlace());
}
std::vector<int> src_tz = paddle::framework::vectorize2int(input->dims());
std::vector<int> weights_tz =
paddle::framework::vectorize2int(filter->dims());
@ -568,6 +574,9 @@ class ConvMKLDNNGradOpKernel : public paddle::framework::OpKernel<T> {
handler.AcquireDiffDstMemoryFromWeightsPrimitive(
user_diff_dst_memory_p, pipeline);
const size_t size = handler.GetDiffWeightsMemorySize();
filter_grad_data = filter_grad->mutable_data<T>(ctx.GetPlace(), size);
auto diff_weights_memory_p =
handler.AcquireDiffWeightsMemoryFromWeightsPrimitive(
reinterpret_cast<void*>(filter_grad_data));
@ -590,6 +599,9 @@ class ConvMKLDNNGradOpKernel : public paddle::framework::OpKernel<T> {
handler.AcquireDiffDstMemoryFromDataPrimitive(user_diff_dst_memory_p,
pipeline);
const size_t size = handler.GetDiffSourceMemorySize();
input_grad_data = input_grad->mutable_data<T>(ctx.GetPlace(), size);
auto diff_src_memory_p = handler.AcquireDiffSrcMemoryFromDataPrimitive(
reinterpret_cast<void*>(input_grad_data));

@ -13,7 +13,6 @@ See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
// #include <string>
#include "paddle/fluid/framework/op_registry.h"
namespace paddle {

@ -90,6 +90,11 @@ class Blas {
void GEMM(bool transA, bool transB, int M, int N, int K, T alpha, const T* A,
int lda, const T* B, int ldb, T beta, T* C, int ldc) const;
template <typename T>
void GEMM(CBLAS_TRANSPOSE transA, CBLAS_TRANSPOSE transB, int M, int N, int K,
T alpha, const T* A, int lda, const T* B, int ldb, T beta, T* C,
int ldc) const;
#ifdef PADDLE_WITH_MKLML
template <typename T>
T* GEMM_ALLOC(const CBLAS_IDENTIFIER id, const int M, const int N,
@ -109,6 +114,10 @@ class Blas {
void GEMM_FREE(T* data) const;
#endif
template <typename T>
void MatMul(const int M, const int N, const int K, const T* A, const T* B,
T* C) const;
template <typename T>
void MatMul(const framework::Tensor& mat_a, bool trans_a,
const framework::Tensor& mat_b, bool trans_b, T alpha,
@ -140,10 +149,19 @@ class Blas {
template <typename T>
void VCOPY(int n, const T* x, T* y) const;
template <typename T>
void VEXP(int n, const T* x, T* y) const;
template <typename T>
void GEMV(bool trans_a, int M, int N, T alpha, const T* A, const T* B, T beta,
T* C) const;
template <typename T>
T DOT(int n, const T* x, const T* y) const;
template <typename T>
void SCAL(int n, const T a, T* x) const;
template <typename T>
void BatchedGEMM(CBLAS_TRANSPOSE transA, CBLAS_TRANSPOSE transB, int M, int N,
int K, T alpha, const T* A, const T* B, T beta, T* C,
@ -215,11 +233,26 @@ class BlasT : private Blas<DeviceContext> {
Base()->template VCOPY<T>(args...);
}
template <typename... ARGS>
void VEXP(ARGS... args) const {
Base()->template VEXP<T>(args...);
}
template <typename... ARGS>
void GEMV(ARGS... args) const {
Base()->template GEMV<T>(args...);
}
template <typename... ARGS>
T DOT(ARGS... args) const {
return Base()->template DOT<T>(args...);
}
template <typename... ARGS>
void SCAL(ARGS... args) const {
Base()->template SCAL<T>(args...);
}
template <typename... ARGS>
void BatchedGEMM(ARGS... args) const {
Base()->template BatchedGEMM<T>(args...);

@ -73,6 +73,16 @@ struct CBlas<float> {
platform::dynload::cblas_sgemv(args...);
}
template <typename... ARGS>
static float DOT(ARGS... args) {
return platform::dynload::cblas_sdot(args...);
}
template <typename... ARGS>
static void SCAL(ARGS... args) {
platform::dynload::cblas_sscal(args...);
}
template <typename... ARGS>
static void GEMM_BATCH(ARGS... args) {
platform::dynload::cblas_sgemm_batch(args...);
@ -87,6 +97,11 @@ struct CBlas<float> {
static void VMUL(ARGS... args) {
platform::dynload::vsMul(args...);
}
template <typename... ARGS>
static void VEXP(ARGS... args) {
platform::dynload::vsExp(args...);
}
};
template <>
@ -138,6 +153,16 @@ struct CBlas<double> {
platform::dynload::cblas_dgemv(args...);
}
template <typename... ARGS>
static double DOT(ARGS... args) {
return platform::dynload::cblas_ddot(args...);
}
template <typename... ARGS>
static void SCAL(ARGS... args) {
platform::dynload::cblas_dscal(args...);
}
template <typename... ARGS>
static void GEMM_BATCH(ARGS... args) {
platform::dynload::cblas_dgemm_batch(args...);
@ -152,6 +177,11 @@ struct CBlas<double> {
static void VMUL(ARGS... args) {
platform::dynload::vdMul(args...);
}
template <typename... ARGS>
static void VEXP(ARGS... args) {
platform::dynload::vdExp(args...);
}
};
#else
@ -210,6 +240,9 @@ struct CBlas<platform::float16> {
PADDLE_THROW("float16 SMM_GEMM not supported on CPU");
}
static void VMUL(...) { PADDLE_THROW("float16 VMUL not supported on CPU"); }
static void VEXP(...) { PADDLE_THROW("float16 VEXP not supported on CPU"); }
static void DOT(...) { PADDLE_THROW("float16 DOT not supported on CPU"); };
static void SCAL(...) { PADDLE_THROW("float16 SCAL not supported on CPU"); };
#ifdef PADDLE_WITH_MKLML
static void GEMM_BATCH(...) {
PADDLE_THROW("float16 GEMM_BATCH not supported on CPU");
@ -217,64 +250,6 @@ struct CBlas<platform::float16> {
#endif
};
template <typename T>
inline bool UseXSMM(const int &m, const int &n, const int &k, bool transa,
bool transb, const T &alpha, const T &beta) {
#ifdef PADDLE_WITH_LIBXSMM
// Refer to https://github.com/hfp/libxsmm/blob/master/README.md
// But the threshold is custom
constexpr int LIBXSMM_THRESHOLD = 20 * 20 * 20;
if (m * n * k > LIBXSMM_THRESHOLD || transa || transb ||
std::abs<T>(alpha - static_cast<T>(1) >
std::numeric_limits<T>::epsilon()) ||
std::abs<T>(beta) > std::numeric_limits<T>::epsilon()) {
return false;
} else {
return true;
}
#endif
return false;
}
template <>
inline bool UseXSMM<platform::float16>(const int &m, const int &n, const int &k,
bool transa, bool transb,
const platform::float16 &alpha,
const platform::float16 &beta) {
return false;
}
template <typename T>
inline void GEMM_WARP(CBLAS_ORDER order, CBLAS_TRANSPOSE transA,
CBLAS_TRANSPOSE transB, int M, int N, int K, T alpha,
const T *A, int lda, const T *B, int ldb, T beta, T *C,
int ldc) {
#ifdef PADDLE_WITH_LIBXSMM
if (UseXSMM<T>(M, N, K, transA != CblasNoTrans, transB != CblasNoTrans, alpha,
beta)) {
// Note: SMM use ColMajor
const char transa = 'N';
const char transb = 'N';
CBlas<T>::SMM_GEMM(&transa, &transb, &N, &M, &K, &alpha, B, &ldb, A, &lda,
&beta, C, &ldc);
return;
}
#endif
#ifdef PADDLE_MKL_SPLIT_GEMM
constexpr int bs = 2;
if (M % bs == 0 && transA == CblasNoTrans && transB == CblasNoTrans) {
for (int off = 0; off < M; off += bs) {
CBlas<T>::GEMM(CblasRowMajor, CblasNoTrans, CblasNoTrans, bs, N, K, alpha,
A + off * lda, lda, B, ldb, beta, C + off * ldb, ldc);
}
return;
}
#endif
CBlas<T>::GEMM(CblasRowMajor, transA, transB, M, N, K, alpha, A, lda, B, ldb,
beta, C, ldc);
}
#ifdef PADDLE_WITH_MKLML
template <>
template <typename T>
@ -319,8 +294,8 @@ void Blas<platform::CPUDeviceContext>::GEMM(CBLAS_TRANSPOSE transA,
int lda = (transA == CblasNoTrans) ? K : M;
int ldb = (transB == CblasNoTrans) ? N : K;
int ldc = N;
GEMM_WARP<T>(CblasRowMajor, transA, transB, M, N, K, alpha, A, lda, B, ldb,
beta, C, ldc);
CBlas<T>::GEMM(CblasRowMajor, transA, transB, M, N, K, alpha, A, lda, B, ldb,
beta, C, ldc);
}
template <>
@ -329,9 +304,20 @@ void Blas<platform::CPUDeviceContext>::GEMM(bool transA, bool transB, int M,
int N, int K, T alpha, const T *A,
int lda, const T *B, int ldb,
T beta, T *C, int ldc) const {
GEMM_WARP<T>(CblasRowMajor, transA == false ? CblasNoTrans : CblasTrans,
transB == false ? CblasNoTrans : CblasTrans, M, N, K, alpha, A,
lda, B, ldb, beta, C, ldc);
CBlas<T>::GEMM(CblasRowMajor, transA == false ? CblasNoTrans : CblasTrans,
transB == false ? CblasNoTrans : CblasTrans, M, N, K, alpha, A,
lda, B, ldb, beta, C, ldc);
}
template <>
template <typename T>
void Blas<platform::CPUDeviceContext>::GEMM(CBLAS_TRANSPOSE transA,
CBLAS_TRANSPOSE transB, int M,
int N, int K, T alpha, const T *A,
int lda, const T *B, int ldb,
T beta, T *C, int ldc) const {
CBlas<T>::GEMM(CblasRowMajor, transA, transB, M, N, K, alpha, A, lda, B, ldb,
beta, C, ldc);
}
template <typename DeviceContext>
@ -399,6 +385,47 @@ void Blas<platform::CPUDeviceContext>::VMUL(int n, const T *x, const T *y,
#endif
}
template <>
template <typename T>
void Blas<platform::CPUDeviceContext>::VEXP(int n, const T *x, T *y) const {
#ifdef PADDLE_WITH_MKLML
CBlas<T>::VEXP(n, x, y);
#else
// try to find if openblas support vexp
for (int i = 0; i < n; ++i) {
y[i] = std::exp(x[i]);
}
#endif
}
template <>
template <typename T>
T Blas<platform::CPUDeviceContext>::DOT(int n, const T *x, const T *y) const {
#ifdef PADDLE_WITH_MKLML
return CBlas<T>::DOT(n, x, 1, y, 1);
#else
// try to find if openblas support cblas_dot
T sum = 0;
for (int i = 0; i < n; ++i) {
sum += x[i] * y[i];
}
return sum;
#endif
}
template <>
template <typename T>
void Blas<platform::CPUDeviceContext>::SCAL(int n, const T a, T *x) const {
#ifdef PADDLE_WITH_MKLML
CBlas<T>::SCAL(n, a, x, 1);
#else
// try to find if openblas support cblas_scal
for (int i = 0; i < n; ++i) {
x[i] = a * x[i];
}
#endif
}
template <>
template <typename T>
void Blas<platform::CPUDeviceContext>::GEMV(bool trans_a, int M, int N, T alpha,
@ -440,6 +467,42 @@ void Blas<platform::CPUDeviceContext>::BatchedGEMM(
#endif
}
template <typename DeviceContext>
template <typename T>
void Blas<DeviceContext>::MatMul(const int M, const int N, const int K,
const T *A, const T *B, T *C) const {
this->template GEMM<T>(CblasRowMajor, CblasNoTrans, CblasNoTrans, M, N, K,
static_cast<T>(1), A, K, B, N, static_cast<T>(0), C,
N);
}
template <>
template <typename T>
void Blas<platform::CPUDeviceContext>::MatMul(const int M, const int N,
const int K, const T *A,
const T *B, T *C) const {
#ifdef PADDLE_WITH_LIBXSMM
// Refer to https://github.com/hfp/libxsmm/blob/master/README.md
// But the threshold is custom constexpr int LIBXSMM_THRESHOLD = 20 * 20 * 20;
// Since the matrix is very small,
// so the unit of calculation is already very fast,
// and the if( M*N*K < LIBXSMM_THRESHOLD) would be overhead,
// use xsmm directly.
// Note: SMM use ColMajor
const char transa = 'N';
const char transb = 'N';
const T alpha = static_cast<T>(1);
const T beta = static_cast<T>(0);
CBlas<T>::SMM_GEMM(&transa, &transb, &N, &M, &K, &alpha, B, &N, A, &K, &beta,
C, &N);
return;
#endif
CBlas<T>::GEMM(CblasRowMajor, CblasNoTrans, CblasNoTrans, M, N, K,
static_cast<T>(1), A, K, B, N, static_cast<T>(0), C, N);
}
template <typename DeviceContext>
template <typename T>
void Blas<DeviceContext>::MatMul(const framework::Tensor &mat_a,

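Taken together, these additions give the CPU `Blas` wrapper a dot product (`DOT`), in-place scaling (`SCAL`), an element-wise exponential (`VEXP`), and a plain row-major `MatMul(M, N, K, A, B, C)`. A small sketch of calling them through `BlasT` is below; obtaining the wrapper via `math::GetBlas` is assumed here as the usual entry point:

```cpp
#include <vector>
#include "paddle/fluid/operators/math/blas.h"
#include "paddle/fluid/platform/device_context.h"

void BlasExample(const paddle::platform::CPUDeviceContext& dev_ctx) {
  namespace math = paddle::operators::math;
  namespace platform = paddle::platform;

  auto blas = math::GetBlas<platform::CPUDeviceContext, float>(dev_ctx);

  std::vector<float> a(6, 1.f), b(6, 2.f), c(4, 0.f);

  // C(2x2) = A(2x3) * B(3x2), row-major, alpha = 1, beta = 0.
  blas.MatMul(2, 2, 3, a.data(), b.data(), c.data());

  float dot = blas.DOT(4, c.data(), c.data());  // sum of squares of C
  blas.SCAL(4, 0.5f, c.data());                 // scale C in place by 0.5
  blas.VEXP(4, c.data(), c.data());             // element-wise exp of C
  (void)dot;
}
```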
@ -0,0 +1,105 @@
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <string>
#include "paddle/fluid/platform/cpu_info.h"
namespace paddle {
namespace operators {
namespace math {
#define SIGMOID_THRESHOLD_MIN -40.0
#define SIGMOID_THRESHOLD_MAX 13.0
#define EXP_MAX_INPUT 40.0
template <typename T>
inline T sigmoid(T x) {
return 1. / (1. + exp(-x));
}
template <typename T>
inline T tanh(T x) {
return 2. * sigmoid(2. * x) - 1.;
}
template <typename T, platform::jit::cpu_isa_t isa = platform::jit::isa_any>
inline void vec_identity(const int n, const T* x, T* y) {
// do nothing
return;
}
template <typename T, platform::jit::cpu_isa_t isa = platform::jit::isa_any>
inline void vec_sigmoid(const int n, const T* x, T* y) {
const T min = SIGMOID_THRESHOLD_MIN;
const T max = SIGMOID_THRESHOLD_MAX;
for (int i = 0; i < n; ++i) {
T tmp = (x[i] < min) ? min : ((x[i] > max) ? max : x[i]);
y[i] = 1.0 / (1.0 + std::exp(-tmp));
}
}
template <typename T, platform::jit::cpu_isa_t isa = platform::jit::isa_any>
inline void vec_tanh(const int n, const T* x, T* y) {
for (int i = 0; i < n; ++i) {
y[i] = tanh<T>(x[i]);
}
}
template <typename T, platform::jit::cpu_isa_t isa = platform::jit::isa_any>
inline void vec_relu(const int n, const T* x, T* y) {
for (int i = 0; i < n; ++i) {
y[i] = x[i] > 0 ? x[i] : 0;
}
}
template <>
inline void vec_relu<float, platform::jit::avx2>(const int n, const float* x,
float* y) {
// TODO(TJ): complete me
for (int i = 0; i < n; ++i) {
y[i] = x[i] > 0 ? x[i] : 0;
}
}
template <>
inline void vec_relu<float, platform::jit::avx>(const int n, const float* x,
float* y) {
// TODO(TJ): complete me
for (int i = 0; i < n; ++i) {
y[i] = x[i] > 0 ? x[i] : 0;
}
}
template <typename T, platform::jit::cpu_isa_t isa = platform::jit::isa_any>
class VecActivations {
public:
std::function<void(const int, const T*, T*)> operator()(
const std::string& type) {
if (type == "sigmoid") {
return vec_sigmoid<T, isa>;
} else if (type == "relu") {
return vec_relu<T, isa>;
} else if (type == "tanh") {
return vec_tanh<T, isa>;
} else if (type == "identity" || type == "") {
return vec_identity<T, isa>;
}
PADDLE_THROW("Not support type %s.", type);
}
};
} // namespace math
} // namespace operators
} // namespace paddle
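The `VecActivations` functor above maps an activation name to the matching vectorized routine (the AVX specializations are still plain loops marked `TODO`). A usage sketch, assuming the new header lands at `paddle/fluid/operators/math/cpu_vec.h`:

```cpp
#include <vector>
#include "paddle/fluid/operators/math/cpu_vec.h"  // assumed path of the new header

void CpuVecExample() {
  namespace math = paddle::operators::math;
  namespace jit = paddle::platform::jit;

  std::vector<float> x = {-2.f, -0.5f, 0.f, 3.f};
  std::vector<float> y(x.size());

  // Look up an activation by name; unknown names hit PADDLE_THROW.
  math::VecActivations<float, jit::isa_any> act_functor;
  auto sigmoid = act_functor("sigmoid");
  sigmoid(static_cast<int>(x.size()), x.data(), y.data());

  // The free functions can also be called directly.
  math::vec_relu<float>(static_cast<int>(x.size()), x.data(), y.data());
}
```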

@ -25,17 +25,25 @@ namespace math {
template <typename DeviceContext, typename T>
inline void FCCompute(const BlasT<DeviceContext, T>& blas, const int M,
const int N, const int K, const T* X, const T* W, T* Y,
const T* B = NULL) {
blas.GEMM(CblasNoTrans, CblasNoTrans, M, N, K, static_cast<T>(1), X, W,
static_cast<T>(0), Y);
if (B) {
const T* B = NULL, bool relu = false) {
blas.MatMul(M, N, K, X, W, Y);
if (B == NULL) {
return;
}
#ifdef PADDLE_WITH_MKLML
#pragma omp parallel for if (FLAGS_paddle_num_threads > 1)
#endif
for (int i = 0; i < M; i++) {
blas.AXPY(N, static_cast<T>(1), B, Y + i * N);
}
for (int i = 0; i < M; i++) {
blas.AXPY(N, static_cast<T>(1), B, Y + i * N);
}
if (!relu) {
return;
}
// TODO(TJ): fuse relu
LOG(FATAL) << "Not implemented!";
}
} // namespace math
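With this change `FCCompute` computes `Y = X * W` via the new `MatMul`, then broadcasts the bias row `B` across the `M` output rows with `AXPY`; the `relu` flag is accepted but its fusion is not implemented yet. A hedged call-site sketch (the `fc_compute.h` header path is assumed):

```cpp
#include <vector>
#include "paddle/fluid/operators/math/blas.h"
#include "paddle/fluid/operators/math/fc_compute.h"  // assumed path of FCCompute

void FcExample(const paddle::platform::CPUDeviceContext& dev_ctx) {
  namespace math = paddle::operators::math;
  namespace platform = paddle::platform;

  const int M = 2, N = 4, K = 3;  // batch rows, output width, input width
  std::vector<float> x(M * K, 1.f), w(K * N, 0.5f), b(N, 0.1f), y(M * N, 0.f);

  auto blas = math::GetBlas<platform::CPUDeviceContext, float>(dev_ctx);
  // y = x * w, then each of the M rows gets b added; keep relu=false until
  // the fused relu path is actually implemented (it currently LOG(FATAL)s).
  math::FCCompute<platform::CPUDeviceContext, float>(
      blas, M, N, K, x.data(), w.data(), y.data(), b.data(), /*relu=*/false);
}
```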

@ -0,0 +1,28 @@
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/operators/stack_op.h"
namespace plat = paddle::platform;
namespace ops = paddle::operators;
REGISTER_OPERATOR(stack, ops::StackOp, ops::StackOpMaker,
ops::StackGradOpDescMaker);
REGISTER_OPERATOR(stack_grad, ops::StackOpGrad);
REGISTER_OP_CPU_KERNEL(stack, ops::StackKernel<plat::CPUDeviceContext, float>,
ops::StackKernel<plat::CPUDeviceContext, double>);
REGISTER_OP_CPU_KERNEL(stack_grad,
ops::StackGradKernel<plat::CPUDeviceContext, float>,
ops::StackGradKernel<plat::CPUDeviceContext, double>);

@ -0,0 +1,25 @@
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/operators/stack_op.h"
namespace plat = paddle::platform;
namespace ops = paddle::operators;
REGISTER_OP_CUDA_KERNEL(stack, ops::StackKernel<plat::CUDADeviceContext, float>,
ops::StackKernel<plat::CUDADeviceContext, double>);
REGISTER_OP_CUDA_KERNEL(stack_grad,
ops::StackGradKernel<plat::CUDADeviceContext, float>,
ops::StackGradKernel<plat::CUDADeviceContext, double>);

File diff suppressed because it is too large.

Some files were not shown because too many files have changed in this diff.
