Merge branch 'develop' into crf

fix-typo
caoying03 8 years ago
commit 6a630f2798

@@ -127,6 +127,7 @@ include(external/warpctc) # download, build, install warpctc
 include(external/any) # download libn::any
 include(external/eigen) # download eigen3
 include(external/pybind11) # download pybind11
+include(external/nccl)
 include(cudnn) # set cudnn libraries, must before configure
 include(configure) # add paddle env configuration
@@ -159,7 +160,7 @@ set(EXTERNAL_LIBS
 if(WITH_GPU)
   list(APPEND EXTERNAL_LIBS ${CUDA_LIBRARIES} ${CUDA_rt_LIBRARY})
   if(NOT WITH_DSO)
-    list(APPEND EXTERNAL_LIBS ${CUDNN_LIBRARY} ${CUDA_CUBLAS_LIBRARIES} ${CUDA_curand_LIBRARY})
+    list(APPEND EXTERNAL_LIBS ${CUDNN_LIBRARY} ${CUDA_CUBLAS_LIBRARIES} ${CUDA_curand_LIBRARY} ${NCCL_LIBRARY})
   endif(NOT WITH_DSO)
 endif(WITH_GPU)

@@ -62,11 +62,11 @@ else()
 FIND_PACKAGE(CUDA REQUIRED)
 if(${CUDA_VERSION_MAJOR} VERSION_LESS 7)
-    message(FATAL_ERROR "Paddle need CUDA >= 7.0 to compile")
+    message(FATAL_ERROR "Paddle needs CUDA >= 7.0 to compile")
 endif()
 if(NOT CUDNN_FOUND)
-    message(FATAL_ERROR "Paddle need cudnn to compile")
+    message(FATAL_ERROR "Paddle needs cudnn to compile")
 endif()
 set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS} "-Xcompiler ${SIMD_FLAG}")

@@ -0,0 +1,50 @@
+INCLUDE(ExternalProject)
+
+SET(NCCL_SOURCE_DIR ${THIRD_PARTY_PATH}/nccl)
+
+INCLUDE_DIRECTORIES(${NCCL_SOURCE_DIR}/src/extern_nccl/src)
+
+if(WITH_DSO)
+  # If we use DSO, we do not build nccl, just download the dependencies
+  set(NCCL_BUILD_COMMAND "")
+  set(NCCL_INSTALL_COMMAND "")
+  set(NCCL_INSTALL_DIR "")
+else()
+  # otherwise, we build nccl and link it.
+  set(NCCL_BUILD_COMMAND "make -j 8")
+  set(NCCL_INSTALL_COMMAND "make install")
+  SET(NCCL_INSTALL_DIR ${THIRD_PARTY_PATH}/install/nccl)
+endif()
+
+ExternalProject_Add(
+  extern_nccl
+  ${EXTERNAL_PROJECT_LOG_ARGS}
+  GIT_REPOSITORY "https://github.com/NVIDIA/nccl.git"
+  GIT_TAG "v1.3.4-1"
+  PREFIX "${NCCL_SOURCE_DIR}"
+  UPDATE_COMMAND ""
+  CONFIGURE_COMMAND ""
+  BUILD_COMMAND "${NCCL_BUILD_COMMAND}"
+  INSTALL_COMMAND "${NCCL_INSTALL_COMMAND}"
+  INSTALL_DIR "${NCCL_INSTALL_DIR}"
+  TEST_COMMAND ""
+)
+
+if (WITH_DSO)
+  if (${CMAKE_VERSION} VERSION_LESS "3.3.0")
+    set(dummyfile ${CMAKE_CURRENT_BINARY_DIR}/lib_any_dummy.c)
+    file(WRITE ${dummyfile} "const char * dummy_any = \"${dummyfile}\";")
+    add_library(nccl STATIC ${dummyfile})
+  else()
+    add_library(nccl INTERFACE)
+  endif()
+else()
+  ADD_LIBRARY(nccl STATIC IMPORTED GLOBAL)
+  SET_PROPERTY(TARGET nccl PROPERTY IMPORTED_LOCATION
+               ${NCCL_INSTALL_DIR}/lib/libnccl.a)
+endif()
+
+add_dependencies(nccl extern_nccl)
+LIST(APPEND external_project_dependencies nccl)

@@ -174,7 +174,7 @@ decoder_inputs = paddle.layer.fc(
 1. Both clip gradients, but at different points: the former is applied when :code:`optimzier` updates the network parameters, while the latter is invoked during the backward computation of the activation function;
 2. They clip different objects: the former clips the gradients of the learnable parameters, while the latter clips the gradients propagated back to the previous layer;

 Besides the above, such problems can also be alleviated by reducing the learning rate or normalizing the data.

 5. How to make the infer interface output the prediction results of multiple layers
 -----------------------------------------------

@@ -87,11 +87,8 @@ class OpInfoMap {
     }
   }

-  template <typename Callback>
-  void IterAllInfo(Callback callback) {
-    for (auto& it : map_) {
-      callback(it.first, it.second);
-    }
+  const std::unordered_map<std::string, const OpInfo>& map() const {
+    return map_;
   }

 private:
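For context on the API change above: call sites that previously handed a callback to `IterAllInfo` now iterate the registry directly. A minimal sketch of the migration, assuming the usual `OpInfoMap::Instance()` singleton accessor:

    // Before: callback-style traversal (removed in this change).
    // OpInfoMap::Instance().IterAllInfo(
    //     [](const std::string& type, const OpInfo& info) { /* ... */ });

    // After: range-for over the map() accessor added above.
    for (auto& it : OpInfoMap::Instance().map()) {
      const std::string& op_type = it.first;  // registered operator type name
      const OpInfo& op_info = it.second;      // its registration record
      // ... use op_type / op_info ...
    }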

@@ -18,6 +18,10 @@ limitations under the License. */
 namespace paddle {
 namespace framework {

+VarDesc::VarType VarDescBind::GetType() const { return desc_.type(); }
+
+void VarDescBind::SetType(VarDesc::VarType type) { desc_.set_type(type); }
+
 void VarDescBind::SetShape(const std::vector<int64_t> &dims) {
   VectorToRepeated(dims, mutable_tensor_desc()->mutable_dims());
 }

@@ -75,9 +75,9 @@ class VarDescBind {
   int32_t GetLodLevel() const;

-  VarDesc::VarType GetType() const { return desc_.type(); }
+  VarDesc::VarType GetType() const;

-  void SetType(VarDesc::VarType type) { desc_.set_type(type); }
+  void SetType(VarDesc::VarType type);

   bool Persistable() const { return desc_.persistable(); }

@@ -126,7 +126,7 @@ void MKLDNNEltwiseActivation::resetFwd(Argument& act) {
   copyInVal_ = nullptr;
   if (act.grad && algo == algorithm::eltwise_tanh) {
     // tanh need save src input for backward
-    inVal_ = MKLDNNMatrix::create(nullptr, val_->getPrimitiveDesc());
+    inVal_ = MKLDNNMatrix::create(val_->getPrimitiveDesc());
     copyInVal_ = std::make_shared<mkldnn::reorder>(*val_, *inVal_);
     CHECK(copyInVal_) << "should not be emptry";
     pipelineFwd_.push_back(*copyInVal_);
@@ -145,7 +145,7 @@ void MKLDNNEltwiseActivation::resetBwd(Argument& act) {
   algorithm algo = getAlgo(this->getName());
   float alpha = getBwdAlpha();
   float beta = getBeta();
-  grad_ = MKLDNNMatrix::create(act.grad, val_->getPrimitiveDesc());
+  grad_ = MKLDNNMatrix::create(val_->getPrimitiveDesc(), act.grad);
   auto eng = CPUEngine::Instance().getEngine();
   auto bwdDesc = eltwise_bwd::desc(
       algo, grad_->getMemoryDesc(), val_->getMemoryDesc(), alpha, beta);
@@ -230,7 +230,7 @@ void MKLDNNActivation::resetFwd(Argument& act) {
   int ic = cnt_ / bs / ih / iw;
   CHECK_EQ(cnt_, (size_t)bs * ic * ih * iw);
   val_ = MKLDNNMatrix::create(
-      act.value, {bs, ic, ih, iw}, mkldnn::memory::format::nchw, *engine_);
+      {bs, ic, ih, iw}, mkldnn::memory::format::nchw, *engine_, act.value);
   CHECK(val_);
   val_->downSpatial();
 }

@@ -21,8 +21,8 @@ namespace paddle {
 typedef enum {
   MKLDNN_BASE = 1,   // basical info of MKLDNN
   MKLDNN_TESTS = 1,  // gtest info of MKLDNN
-  MKLDNN_SIZES = 2,  // size info of MKLDNN
-  MKLDNN_FMTS = 3,   // format info of MKLDNN
+  MKLDNN_FMTS = 2,   // format info of MKLDNN
+  MKLDNN_SIZES = 3,  // size info of MKLDNN
   MKLDNN_ALL = 4,    // show all info of MKLDNN
 } MKLDNN_LOG_LEVEL;

File diff suppressed because it is too large

@@ -48,17 +48,6 @@ protected:
   // save forward primitive_desc, which can be used backward
   std::shared_ptr<conv_fwd::primitive_desc> fwdPD_;

-  // MKLDNNMatrixPtr which should be created from CPU Device
-  MKLDNNMatrixPtr cpuInVal_;
-  MKLDNNMatrixPtr cpuInGrad_;
-  MKLDNNMatrixPtr cpuOutVal_;
-  MKLDNNMatrixPtr cpuOutGrad_;
-  // convert handle between CPU device and MKLDNN device
-  std::shared_ptr<mkldnn::reorder> cvtInVal_;
-  std::shared_ptr<mkldnn::reorder> cvtInGrad_;
-  std::shared_ptr<mkldnn::reorder> cvtOutVal_;
-  std::shared_ptr<mkldnn::reorder> cvtOutGrad_;

   // whether the weight has been init
   bool hasInitedWgt_;
@@ -94,8 +83,6 @@ public:
                 MKLDNNMatrixPtr& bias,
                 MKLDNNMatrixPtr& out) override;

-  void updateInputData() override;
-
   void updateWeights(const UpdateCallback& callback) override;

   void convertWeightsFromPaddle() override;
@@ -109,26 +96,6 @@ public:
             << ", sw: " << sw_ << ", dh: " << dh_ << ", dw: " << dw_;
   }

-  void printValueFormatFlow() override {
-    if (cpuInVal_) {
-      VLOG(MKLDNN_FMTS) << cpuInVal_->getFormat() << " >>>";
-    }
-    MKLDNNLayer::printValueFormatFlow();
-    if (cpuOutVal_) {
-      VLOG(MKLDNN_FMTS) << " >>> " << cpuOutVal_->getFormat();
-    }
-  }
-
-  void printGradFormatFlow() override {
-    if (cpuInGrad_) {
-      VLOG(MKLDNN_FMTS) << cpuInGrad_->getFormat() << " <<<";
-    }
-    MKLDNNLayer::printGradFormatFlow();
-    if (cpuOutGrad_) {
-      VLOG(MKLDNN_FMTS) << " <<< " << cpuOutGrad_->getFormat();
-    }
-  }
-
 protected:
   /**
    * load the dims settings of this conv
@@ -162,23 +129,6 @@ protected:
                          MKLDNNMatrixPtr& bias,
                          MKLDNNMatrixPtr& out);

-  /**
-   * reset MKLDNNMatrix of input value
-   */
-  void resetInValue(std::shared_ptr<conv_fwd::primitive_desc>& pd,
-                    MKLDNNMatrixPtr& in);
-  /**
-   * reset MKLDNNMatrix of weight and bias value
-   */
-  void resetWgtBiasValue(std::shared_ptr<conv_fwd::primitive_desc>& pd,
-                         MKLDNNMatrixPtr& wgt,
-                         MKLDNNMatrixPtr& bias);
-  /**
-   * reset MKLDNNMatrix of output value
-   */
-  void resetOutValue(std::shared_ptr<conv_fwd::primitive_desc>& pd,
-                     MKLDNNMatrixPtr& out);
-
   /**
    * reset the backward weight primitive descriptor.
    */
@@ -207,22 +157,6 @@ protected:
                          MKLDNNMatrixPtr& bias,
                          MKLDNNMatrixPtr& out);

-  /**
-   * reset MKLDNNMatrix of output grad
-   */
-  void resetOutGrad(std::shared_ptr<conv_bwdWgt::primitive_desc>& wgtPD,
-                    MKLDNNMatrixPtr& out);
-  /**
-   * reset MKLDNNMatrix of weight and bias grad
-   */
-  void resetWgtBiasGrad(std::shared_ptr<conv_bwdWgt::primitive_desc>& wgtPD,
-                        MKLDNNMatrixPtr& wgt,
-                        MKLDNNMatrixPtr& bias);
-  /**
-   * reset MKLDNNMatrix of input grad
-   */
-  void resetInGrad(std::shared_ptr<conv_bwdData::primitive_desc>& dataPD,
-                   MKLDNNMatrixPtr& in);
-
   /**
    * reset MKLDNNMatrix of weight value for backward data
    * since the primitive_desc would be different with wgtVal_

@@ -62,7 +62,7 @@ void MKLDNNFcLayer::convertWeightsFromPaddle() {
   CHECK(wgtVal_) << "should have been initialized";
   bool hasNoSpatial_ = ih_ == 1 && iw_ == 1;
   auto targetDim = wgtVal_->getDims();
-  auto srcFmt = hasNoSpatial_ ? memory::format::io : memory::format::ihwo;
+  auto srcFmt = hasNoSpatial_ ? format::io : format::ihwo;
   wgtVal_->reorderDataFrom(wgtVal_, srcFmt, targetDim);
   hasInitedWgt_ = true;
 }
@@ -71,7 +71,7 @@ void MKLDNNFcLayer::convertWeightsToPaddle() {
   CHECK(wgtVal_) << "should have been initialized";
   bool hasNoSpatial_ = ih_ == 1 && iw_ == 1;
   auto targetDim = wgtVal_->getDims();
-  auto dstFmt = hasNoSpatial_ ? memory::format::io : memory::format::ihwo;
+  auto dstFmt = hasNoSpatial_ ? format::io : format::ihwo;
   wgtVal_->reorderDataTo(wgtVal_, dstFmt, targetDim);
 }
@@ -100,8 +100,6 @@ void MKLDNNFcLayer::resetFwd(std::vector<primitive>& pipeline,
   resetFwdPD(fwdPD_, in, wgt, bias, out);

   resetFwdPipeline(pipeline, fwdPD_, in, wgt, bias, out);
-
-  printValueFormatFlow();
 }

 void MKLDNNFcLayer::resetBwd(std::vector<primitive>& pipeline,
@@ -119,12 +117,6 @@ void MKLDNNFcLayer::resetBwd(std::vector<primitive>& pipeline,
   resetBwdDataPD(bwdDataPD, in, out);

   resetBwdPipeline(pipeline, bwdWgtPD, bwdDataPD, in, wgt, bias, out);
-
-  printGradFormatFlow();
-}
-
-void MKLDNNFcLayer::updateInputData() {
-  inVal_->setData(getInputValue(0, CPU_DEVICE)->getData());
 }

 void MKLDNNFcLayer::updateWeights(const UpdateCallback& callback) {
@@ -139,51 +131,30 @@ void MKLDNNFcLayer::resetFwdBuffers(MKLDNNMatrixPtr& in,
                                     MKLDNNMatrixPtr& bias,
                                     MKLDNNMatrixPtr& out) {
   resetInValue(in);
-  resetWgtBiasValue(wgt, bias);
-  resetOutValue(out);
-}
-
-void MKLDNNFcLayer::resetInValue(MKLDNNMatrixPtr& in) {
-  if (inputIsOnlyMKLDNN()) {
-    const MatrixPtr& dnnIn = getInputValue(0);
-    in = std::dynamic_pointer_cast<MKLDNNMatrix>(dnnIn);
-    CHECK(in) << "Input should be MKLDNNMatrix";
-  } else {
-    CHECK_EQ(getPrev(0)->getDeviceId(), CPU_DEVICE) << "Only support CPU yet";
-    const MatrixPtr& cpuIn = getInputValue(0, CPU_DEVICE);
-    in = MKLDNNMatrix::create(
-        cpuIn, {bs_, ic_, ih_, iw_}, format::nchw, engine_);
-  }
+  CHECK(in);
   in->downSpatial();
-}
-
-void MKLDNNFcLayer::resetWgtBiasValue(MKLDNNMatrixPtr& wgt,
-                                      MKLDNNMatrixPtr& bias) {
+
+  auto outPD =
+      MKLDNNMatrix::createPrimitiveDesc({bs_, oc_}, format::nc, engine_);
+  resetOutValue(out, outPD);
+
   format wgtFmt = format::oihw;
-  if (inVal_->getFormat() == format::nChw8c) {
+  if (in->getFormat() == format::nChw8c) {
     wgtFmt = format::oIhw8i;
-  } else if (inVal_->getFormat() == format::nChw16c) {
+  } else if (in->getFormat() == format::nChw16c) {
     wgtFmt = format::oIhw16i;
   }
-  wgt = MKLDNNMatrix::create(
-      weight_->getW(), {oc_, ic_, ih_, iw_}, wgtFmt, engine_);
+  auto wgtPD =
+      MKLDNNMatrix::createPrimitiveDesc({oc_, ic_, ih_, iw_}, wgtFmt, engine_);
+  resetWithMatrix(wgt, weight_->getW(), wgtPD);
   wgt->downSpatial();
-  VLOG(MKLDNN_FMTS) << "Weight value format: " << wgt->getFormat();
-  bias = (biases_ && biases_->getW())
-             ? MKLDNNMatrix::create(biases_->getW(), {oc_}, format::x, engine_)
-             : nullptr;
-}
-
-void MKLDNNFcLayer::resetOutValue(MKLDNNMatrixPtr& out) {
-  out = MKLDNNMatrix::create(output_.value, {bs_, oc_}, format::nc, engine_);
-  if (!outputIsOnlyMKLDNN()) {
-    // fc cpu output value do not need create convert, just share data
-    getOutput(CPU_DEVICE).value->setData(out->getData());
-  }
-  output_.value = std::dynamic_pointer_cast<Matrix>(out);
+
+  if (biases_ && biases_->getW()) {
+    auto biasPD = MKLDNNMatrix::createPrimitiveDesc({oc_}, format::x, engine_);
+    resetWithMatrix(bias, biases_->getW(), biasPD);
+  } else {
+    bias = nullptr;
+  }
 }

 void MKLDNNFcLayer::resetFwdPD(std::shared_ptr<fc_fwd::primitive_desc>& pd,
@@ -219,7 +190,6 @@ void MKLDNNFcLayer::resetFwdPipeline(
   } else {
     fwd_.reset(new fc_fwd(*pd, *in, *wgt, *out));
   }
-
   pipeline.push_back(*fwd_);
 }
@@ -227,44 +197,18 @@ void MKLDNNFcLayer::resetBwdBuffers(MKLDNNMatrixPtr& in,
                                     MKLDNNMatrixPtr& wgt,
                                     MKLDNNMatrixPtr& bias,
                                     MKLDNNMatrixPtr& out) {
-  resetOutGrad(out);
-  resetWgtBiasGrad(wgt, bias);
-  resetInGrad(in);
-}
-
-void MKLDNNFcLayer::resetOutGrad(MKLDNNMatrixPtr& out) {
-  CHECK(outVal_);
-  if (outputIsOnlyMKLDNN()) {
-    MKLDNNLayer::resetOutGrad(out, outVal_->getPrimitiveDesc());
-  } else {
-    const MatrixPtr& cpuOut = getOutput(CPU_DEVICE).grad;
-    output_.grad->setData(cpuOut->getData());
-    out = MKLDNNMatrix::create(cpuOut, outVal_->getPrimitiveDesc());
-  }
-}
-
-void MKLDNNFcLayer::resetWgtBiasGrad(MKLDNNMatrixPtr& wgt,
-                                     MKLDNNMatrixPtr& bias) {
+  CHECK(inVal_ && outVal_);
+  resetOutGrad(out, outVal_->getPrimitiveDesc());
+  resetInGrad(in, inVal_->getPrimitiveDesc());
+
   CHECK(wgtVal_);
-  wgt = MKLDNNMatrix::create(weight_->getWGrad(), wgtVal_->getPrimitiveDesc());
-  bias = nullptr;
-  if (biasVal_ == nullptr) {
-    return;
-  }
-  bias =
-      MKLDNNMatrix::create(biases_->getWGrad(), biasVal_->getPrimitiveDesc());
-}
-
-void MKLDNNFcLayer::resetInGrad(MKLDNNMatrixPtr& in) {
-  in = nullptr;
-  if (inputLayers_[0]->getOutput().grad == nullptr) {
-    return;
-  }
-  CHECK(inVal_);
-  MKLDNNLayer::resetInGrad(in, inVal_->getPrimitiveDesc());
+  resetWithMatrix(wgt, weight_->getWGrad(), wgtVal_->getPrimitiveDesc());
+  if (biasVal_) {
+    resetWithMatrix(bias, biases_->getWGrad(), biasVal_->getPrimitiveDesc());
+  } else {
+    bias = nullptr;
+  }
 }

 void MKLDNNFcLayer::resetBwdWgtPD(
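The refactor above routes every buffer setup through a shared `resetWithMatrix` helper instead of per-layer `MKLDNNMatrix::create` calls. Its definition lives in one of the suppressed files, so the following is only a plausible reconstruction based on the new `create(pd, m)` signature, not the committed code:

    // Hypothetical sketch: bind a Paddle Matrix to an MKLDNNMatrix with the
    // given primitive descriptor, leaving the pointer empty when there is
    // no matrix to wrap.
    void resetWithMatrix(MKLDNNMatrixPtr& dnn,
                         const MatrixPtr& mat,
                         mkldnn::memory::primitive_desc pd) {
      dnn = nullptr;
      if (mat == nullptr) {
        return;
      }
      dnn = MKLDNNMatrix::create(pd, mat);
    }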

@@ -66,8 +66,6 @@ public:
                 MKLDNNMatrixPtr& bias,
                 MKLDNNMatrixPtr& out) override;

-  void updateInputData() override;
-
   void updateWeights(const UpdateCallback& callback) override;

   void convertWeightsFromPaddle() override;
@@ -84,9 +82,6 @@ protected:
                        MKLDNNMatrixPtr& wgt,
                        MKLDNNMatrixPtr& bias,
                        MKLDNNMatrixPtr& out);
-  void resetInValue(MKLDNNMatrixPtr& in);
-  void resetWgtBiasValue(MKLDNNMatrixPtr& wgt, MKLDNNMatrixPtr& bias);
-  void resetOutValue(MKLDNNMatrixPtr& out);
   void resetFwdPD(std::shared_ptr<fc_fwd::primitive_desc>& pd,
                   MKLDNNMatrixPtr in,
                   MKLDNNMatrixPtr wgt,
@@ -109,9 +104,6 @@ protected:
                        MKLDNNMatrixPtr& wgt,
                        MKLDNNMatrixPtr& bias,
                        MKLDNNMatrixPtr& out);
-  void resetOutGrad(MKLDNNMatrixPtr& out);
-  void resetWgtBiasGrad(MKLDNNMatrixPtr& wgt, MKLDNNMatrixPtr& bias);
-  void resetInGrad(MKLDNNMatrixPtr& in);
   void resetBwdWgtPD(std::shared_ptr<fc_bwdWgt::primitive_desc>& pd,
                      MKLDNNMatrixPtr& wgt,
                      MKLDNNMatrixPtr& bias,

File diff suppressed because it is too large

File diff suppressed because it is too large

@@ -85,8 +85,6 @@ void MKLDNNPoolLayer::resetFwd(std::vector<primitive>& pipeline,
   resetFwdPD(fwdPD_, in, out);

   resetFwdPipeline(pipeline, fwdPD_, in, out);
-
-  printValueFormatFlow();
 }

 void MKLDNNPoolLayer::resetBwd(std::vector<primitive>& pipeline,
@@ -101,65 +99,22 @@ void MKLDNNPoolLayer::resetBwd(std::vector<primitive>& pipeline,
   resetBwdPD(pd, in, out);

   resetBwdPipeline(pipeline, pd, in, out);
-
-  printGradFormatFlow();
-}
-
-void MKLDNNPoolLayer::updateInputData() {
-  inVal_->setData(getInputValue(0, CPU_DEVICE)->getData());
 }

 void MKLDNNPoolLayer::resetFwdBuffers(MKLDNNMatrixPtr& in,
                                       MKLDNNMatrixPtr& out) {
   resetInValue(in);
-  resetOutValue(out);
-}
-
-void MKLDNNPoolLayer::resetInValue(MKLDNNMatrixPtr& in) {
-  if (inputIsOnlyMKLDNN()) {
-    const MatrixPtr& dnnIn = getInputValue(0);
-    in = std::dynamic_pointer_cast<MKLDNNMatrix>(dnnIn);
-    CHECK(in) << "Input should be MKLDNNMatrix";
-  } else {
-    CHECK_EQ(getPrev(0)->getDeviceId(), CPU_DEVICE) << "Only support CPU yet";
-    const MatrixPtr& cpuIn = getInputValue(0, CPU_DEVICE);
-    in = MKLDNNMatrix::create(
-        cpuIn, {bs_, ic_, ih_, iw_}, format::nchw, engine_);
-  }
-}
-
-void MKLDNNPoolLayer::resetOutValue(MKLDNNMatrixPtr& out) {
-  CHECK(inVal_) << "Should reset input value first";
+
+  CHECK(in);
   memory::dims outDims = memory::dims{bs_, oc_, oh_, ow_};
-  out = MKLDNNMatrix::create(
-      output_.value, outDims, inVal_->getFormat(), engine_);
-
-  // create reorder if output value has cpu device and pd do not match
-  cpuOutVal_ = nullptr;
-  cvtOutVal_ = nullptr;
-  if (!outputIsOnlyMKLDNN()) {
-    const MatrixPtr& cpuOut = getOutput(CPU_DEVICE).value;
-    cpuOutVal_ = MKLDNNMatrix::create(cpuOut, outDims, format::nchw, engine_);
-    if (cpuOutVal_->getPrimitiveDesc() != out->getPrimitiveDesc()) {
-      out = MKLDNNMatrix::create(nullptr, out->getPrimitiveDesc());
-      cvtOutVal_ = MKLDNNMatrix::createReorder(out, cpuOutVal_);
-      CHECK(cvtOutVal_) << "should not be emptry";
-    } else {
-      cpuOut->setData(output_.value->getData());
-      cpuOutVal_ = out;
-    }
-    output_.value = std::dynamic_pointer_cast<Matrix>(cpuOutVal_);
-    return;
-  }
-  output_.value = std::dynamic_pointer_cast<Matrix>(outVal_);
+  auto outPD =
+      MKLDNNMatrix::createPrimitiveDesc(outDims, in->getFormat(), engine_);
+  resetOutValue(out, outPD);
 }

 void MKLDNNPoolLayer::resetFwdPD(std::shared_ptr<pool_fwd::primitive_desc>& pd,
                                  MKLDNNMatrixPtr in,
                                  MKLDNNMatrixPtr out) {
-  memory::dims inDims = memory::dims{bs_, ic_, ih_, iw_};
-  memory::dims outDims = memory::dims{bs_, oc_, oh_, ow_};
   memory::dims kernels = memory::dims{fh_, fw_};
   memory::dims strides = memory::dims{sh_, sw_};
   memory::dims padL = memory::dims{ph_, pw_};
@@ -194,58 +149,26 @@ void MKLDNNPoolLayer::resetFwdPipeline(
           ? std::make_shared<pool_fwd>(pool_fwd(*pd, *in, *out, *workspace_))
           : std::make_shared<pool_fwd>(pool_fwd(*pd, *in, *out));
   pipeline.push_back(*fwd_);
-
-  if (cvtOutVal_) {
-    pipeline.push_back(*cvtOutVal_);
-  }
 }

 void MKLDNNPoolLayer::resetBwdBuffers(MKLDNNMatrixPtr& in,
                                       MKLDNNMatrixPtr& out) {
-  resetOutGrad(out);
-
-  resetInGrad(in);
-}
-
-void MKLDNNPoolLayer::resetOutGrad(MKLDNNMatrixPtr& out) {
-  cpuOutGrad_ = nullptr;
-  cvtOutGrad_ = nullptr;
-  CHECK(outVal_);
-  if (outputIsOnlyMKLDNN()) {
-    MKLDNNLayer::resetOutGrad(out, outVal_->getPrimitiveDesc());
-  } else {
-    const MatrixPtr& cpuOut = getOutput(CPU_DEVICE).grad;
-    // always share the same grad data of CPU output
-    // then the activation can get the right grad from output_.grad
-    output_.grad->setData(cpuOut->getData());
-    cpuOutGrad_ = MKLDNNMatrix::create(
-        cpuOut, memory::dims{bs_, oc_, oh_, ow_}, format::nchw, engine_);
-    if (cpuOutGrad_->getPrimitiveDesc() != outVal_->getPrimitiveDesc()) {
-      out = MKLDNNMatrix::create(nullptr, outVal_->getPrimitiveDesc());
-      cvtOutGrad_ = MKLDNNMatrix::createReorder(cpuOutGrad_, out);
-      CHECK(cvtOutGrad_) << "should not be emptry";
-    } else {
-      out = cpuOutGrad_;
-    }
-  }
-}
-
-void MKLDNNPoolLayer::resetInGrad(MKLDNNMatrixPtr& in) {
-  in = nullptr;
-  if (inputLayers_[0]->getOutput().grad == nullptr) {
-    return;
-  }
-  CHECK(inVal_);
-  MKLDNNLayer::resetInGrad(in, inVal_->getPrimitiveDesc());
+  CHECK(inVal_ && outVal_);
+  resetOutGrad(out, outVal_->getPrimitiveDesc());
+  resetInGrad(in, inVal_->getPrimitiveDesc());
 }

 void MKLDNNPoolLayer::resetBwdPD(std::shared_ptr<pool_bwd::primitive_desc>& pd,
                                  MKLDNNMatrixPtr& in,
                                  MKLDNNMatrixPtr& out) {
-  pd = nullptr;
-  if (in == nullptr) {
-    return;
-  }
   memory::dims kernels = memory::dims{fh_, fw_};
   memory::dims strides = memory::dims{sh_, sw_};
   memory::dims padL = memory::dims{ph_, pw_};
   memory::dims padR = getPaddingR();
+  CHECK(in);
   CHECK(out);
   auto bwdDesc = pool_bwd::desc(poolAlgo_,
                                 in->getMemoryDesc(),
@@ -263,8 +186,8 @@ void MKLDNNPoolLayer::resetBwdPipeline(
     std::shared_ptr<pool_bwd::primitive_desc>& pd,
     MKLDNNMatrixPtr& in,
     MKLDNNMatrixPtr& out) {
-  if (cvtOutGrad_) {
-    pipeline.push_back(*cvtOutGrad_);
+  if (pd == nullptr) {
+    return;
   }

   bwdData_ =

@@ -38,13 +38,6 @@ protected:
   // pooling_avg or pooling_max
   mkldnn::algorithm poolAlgo_;

-  // MKLDNNMatrixPtr which should be created from CPU Device
-  MKLDNNMatrixPtr cpuOutVal_;
-  MKLDNNMatrixPtr cpuOutGrad_;
-  // convert handle between CPU device and MKLDNN device
-  std::shared_ptr<mkldnn::reorder> cvtOutVal_;
-  std::shared_ptr<mkldnn::reorder> cvtOutGrad_;
-
   // save forward primitive_desc, which can be used backward
   std::shared_ptr<pool_fwd::primitive_desc> fwdPD_;
   // according to https://github.com/01org/mkl-dnn/blob/master/tests/gtests/
@@ -74,8 +67,6 @@ public:
                 MKLDNNMatrixPtr& bias,
                 MKLDNNMatrixPtr& out) override;

-  void updateInputData() override;
-
   void printSizeInfo() override {
     MKLDNNLayer::printSizeInfo();
     VLOG(MKLDNN_SIZES) << getName() << ": fh: " << fh_ << ", fw: " << fw_
@@ -90,8 +81,6 @@ protected:
    * reset pipeline.
    */
   void resetFwdBuffers(MKLDNNMatrixPtr& in, MKLDNNMatrixPtr& out);
-  void resetInValue(MKLDNNMatrixPtr& in);
-  void resetOutValue(MKLDNNMatrixPtr& out);
   void resetFwdPD(std::shared_ptr<pool_fwd::primitive_desc>& pd,
                   MKLDNNMatrixPtr in,
                   MKLDNNMatrixPtr out);
@@ -106,8 +95,6 @@ protected:
    * reset pipeline.
    */
   void resetBwdBuffers(MKLDNNMatrixPtr& in, MKLDNNMatrixPtr& out);
-  void resetOutGrad(MKLDNNMatrixPtr& out);
-  void resetInGrad(MKLDNNMatrixPtr& in);
   void resetBwdPD(std::shared_ptr<pool_bwd::primitive_desc>& pd,
                   MKLDNNMatrixPtr& in,
                   MKLDNNMatrixPtr& out);

@@ -97,7 +97,7 @@ void MKLDNNTester::randomWgtDatas() {
     parameters_[REF][i]->randomize();
     dnnValue->copyFrom(*refValue);

-    VLOG(lvl_) << "Random weight data " << parameters_[DNN][i]->getName();
+    VLOG(MKLDNN_TESTS) << "Random weight " << parameters_[DNN][i]->getName();
     printVector(dnnValue);
   }
 }
@@ -109,7 +109,7 @@ void MKLDNNTester::randomBotDatas() {
     dataLayers_[REF][i]->getOutputValue()->randomizeUniform();
     dataLayers_[DNN][i]->getOutputValue()->copyFrom(
         *(dataLayers_[REF][i]->getOutputValue()));
-    VLOG(lvl_) << "Input " << i << " data:";
+    VLOG(MKLDNN_TESTS) << "Random Foward, InputValue " << i;
     printMatrix(dataLayers_[REF][i]->getOutputValue());
   }
 }
@@ -118,12 +118,12 @@ void MKLDNNTester::randomTopDiffs() {
   refLayer_->getOutputGrad()->randomizeUniform();
   dnnLayer_->getOutput(CPU_DEVICE)
       .grad->copyFrom(*(refLayer_->getOutputGrad()));
-  VLOG(lvl_) << "Random Backward Input, TopDiff: ";
+  VLOG(MKLDNN_TESTS) << "Random Backward, OutputGrad";
   printMatrix(refLayer_->getOutputGrad());
 }

 void MKLDNNTester::checkForward() {
-  VLOG(MKLDNN_ALL) << "Check Forward";
+  VLOG(MKLDNN_TESTS) << "Check Forward";
   printTopDatas();
   double delta =
       compareMatrix(dnnLayer_->getOutputValue(), refLayer_->getOutputValue());
@@ -131,15 +131,15 @@ void MKLDNNTester::checkForward() {
 }

 void MKLDNNTester::checkBackwardData() {
-  VLOG(MKLDNN_ALL) << "Check Backward Data";
+  VLOG(MKLDNN_TESTS) << "Check Backward Data";
   // TODO(TJ): uncomment me when batch norm ready
   // const bool isBN = dnnLayer_->getType() == "mkldnn_batch_norm";
   for (size_t i = 0; i < dataLayers_[DNN].size(); ++i) {
     const MatrixPtr& dnnDiff = dataLayers_[DNN][i]->getOutputGrad();
     const MatrixPtr& refDiff = dataLayers_[REF][i]->getOutputGrad();
-    VLOG(lvl_) << "Mkldnn Backward Output BotDiff " << i;
+    VLOG(MKLDNN_ALL) << "MKLDNN Backward Result: InputGrad " << i;
     printMatrix(dnnDiff);
-    VLOG(lvl_) << "Reference Backward Output BotDiff " << i;
+    VLOG(MKLDNN_ALL) << "Reference Backward Result: InputGrad " << i;
     printMatrix(refDiff);

     double delta = compareMatrix(dnnDiff, refDiff);
@@ -153,7 +153,7 @@ void MKLDNNTester::checkBackwardData() {
 }

 void MKLDNNTester::checkBackwardWgts() {
-  VLOG(MKLDNN_ALL) << "Check Backward Weight";
+  VLOG(MKLDNN_TESTS) << "Check Backward Weight";
   CHECK_EQ(parameters_[DNN].size(), parameters_[REF].size());
   vector<VectorPtr> dnnWgts;  // used to temply save mkldnn weights
   saveWgt(parameters_[DNN], dnnWgts);
@@ -165,9 +165,11 @@ void MKLDNNTester::checkBackwardWgts() {
   for (size_t i = 0; i < parameters_[DNN].size(); ++i) {
     const VectorPtr& dnn = parameters_[DNN][i]->getBuf(PARAMETER_VALUE);
     const VectorPtr& ref = parameters_[REF][i]->getBuf(PARAMETER_VALUE);
-    VLOG(lvl_) << "Mkldnn Output weight " << parameters_[DNN][i]->getName();
+    VLOG(MKLDNN_ALL) << "MKLDNN Result: weight value"
+                     << parameters_[DNN][i]->getName();
     printVector(dnn);
-    VLOG(lvl_) << "Reference Output weight " << parameters_[REF][i]->getName();
+    VLOG(MKLDNN_ALL) << "Reference Result: weight value "
+                     << parameters_[REF][i]->getName();
     printVector(ref);

     double delta = compareVector(dnn, ref);
@@ -240,7 +242,8 @@ void MKLDNNTester::printTopDatas() {
   }

   for (int n = 0; n < NUM; ++n) {
-    VLOG(lvl_) << testLayers_[n]->getType() << " forward output TopData: ";
+    VLOG(MKLDNN_ALL) << testLayers_[n]->getType()
+                     << " Forward Result: OutputValue";
     printMatrix(testLayers_[n]->getOutputValue());
   }
 }
@@ -252,7 +255,7 @@ void MKLDNNTester::printMatrix(const MatrixPtr& m) {
   std::ostringstream ostr;
   m->print(ostr);
-  VLOG(lvl_) << std::endl << ostr.str();
+  VLOG(MKLDNN_ALL) << std::endl << ostr.str();
 }

 void MKLDNNTester::printVector(const VectorPtr& v) {
@@ -262,7 +265,7 @@ void MKLDNNTester::printVector(const VectorPtr& v) {
   std::ostringstream ostr;
   v->print(ostr, v->getSize());
-  VLOG(lvl_) << std::endl << ostr.str();
+  VLOG(MKLDNN_ALL) << std::endl << ostr.str();
 }

 double MKLDNNTester::getDelta(const real* d1,
@@ -314,7 +317,7 @@ void MKLDNNTester::runOnce() {
   UpdateCallback updateCallback = [](Parameter* para) {
     auto& grad = para->getBuf(PARAMETER_GRADIENT);
     auto& value = para->getBuf(PARAMETER_VALUE);
-    real lr = 1e-3;
+    real lr = 1e-2;
     value->add(*grad, lr);
     grad->zeroMem();
   };
@@ -340,10 +343,9 @@ void MKLDNNTester::run(const TestConfig& dnn,
                        size_t batchSize,
                        size_t inputImgH,
                        size_t inputImgW,
+                       bool printDetails,
                        size_t iter,
-                       float epsilon,
-                       bool log,
-                       int level) {
+                       float epsilon) {
   CHECK(dnn.layerConfig.type().compare(0, 7, "mkldnn_") == 0 ||
         dnn.layerConfig.active_type().compare(0, 7, "mkldnn_") == 0)
       << "should be MKLDNN layer or MKLDNN activation";
@@ -359,10 +361,9 @@ void MKLDNNTester::run(const TestConfig& dnn,
   ih_ = inputImgH;
   iw_ = inputImgW;
+  log_ = printDetails;
   iter_ = iter;
   eps_ = epsilon;
-  log_ = log;
-  lvl_ = level;

   // Firstly test mkldnn init from PARAM_FORMAT_ORIGINAL weight
   reset(dnn, ref, batchSize);
@@ -531,9 +532,11 @@ void MKLDNNTester::getOutResult(const std::string& configPath,
 void MKLDNNTester::compareResult(DataOut& ref, DataOut& dnn, float eps) {
   CHECK_EQ(ref.outValues.size(), dnn.outValues.size());
   CHECK_EQ(ref.paraValues.size(), dnn.paraValues.size());
+  VLOG(MKLDNN_TESTS) << "compare value size: " << ref.outValues.size();
   for (size_t i = 0; i < ref.outValues.size(); i++) {
     EXPECT_LE(fabs(compareMatrix(ref.outValues[i], dnn.outValues[i])), eps);
   }
+  VLOG(MKLDNN_TESTS) << "compare param size: " << ref.outValues.size();
   for (size_t i = 0; i < ref.paraValues.size(); i++) {
     EXPECT_LE(fabs(compareVector(ref.paraValues[i], dnn.paraValues[i])), eps);
   }
@@ -544,9 +547,10 @@ void MKLDNNTester::runBranchesTest(const std::string& configPath,
                                    float eps) {
   DataIn in;
   initArgument(in, configPath, iter);
-
   DataOut outCpu, outDnn;
+  VLOG(MKLDNN_TESTS) << "runing cpu network";
   getOutResult(configPath, in, outCpu, false, iter);
+  VLOG(MKLDNN_TESTS) << "runing mkldnn network";
   getOutResult(configPath, in, outDnn, true, iter);

   compareResult(outCpu, outDnn, eps);

@@ -58,8 +58,6 @@ protected:
   size_t iter_;
   /// whether to print out the details
   bool log_;
-  /// vlog level to print the matrix details datas
-  int lvl_;
   /// epsilon
   float eps_;
   /// input image size, default 1
@@ -70,7 +68,6 @@ public:
     iter_ = iter;
     eps_ = epsilon;
     log_ = false;
-    lvl_ = MKLDNN_ALL;
   }

   ~MKLDNNTester() {}
@@ -81,10 +78,9 @@ public:
            size_t batchSize,
            size_t inputImgH = 1,
            size_t inputImgW = 1,
+           bool printDetails = false,
            size_t iter = 3,
-           float epsilon = 1e-4,
-           bool log = false,
-           int level = MKLDNN_ALL);
+           float epsilon = 1e-4);

   static void runBranchesTest(const std::string& configPath,
                               size_t iter = 3,
                               float eps = 1e-4);

@@ -51,7 +51,10 @@ def test_sparse_non_value_no_seq(setting, filename):
         yield [(i + 1) * (j + 1) for j in xrange(10)]

-@provider(input_types=[sparse_vector(30000, seq_type=SequenceType.NO_SEQUENCE)])
+@provider(input_types=[
+    sparse_float_vector(
+        30000, seq_type=SequenceType.NO_SEQUENCE)
+])
 def test_sparse_value_no_seq(setting, filename):
     for i in xrange(200):
         yield [((i + 1) * (j + 1), float(j) / float(i + 1)) for j in xrange(10)]

@@ -18,7 +18,7 @@ using namespace mkldnn;  // NOLINT
 namespace paddle {

-MKLDNNMatrixPtr MKLDNNMatrix::create(MatrixPtr m, memory::primitive_desc pd) {
+MKLDNNMatrixPtr MKLDNNMatrix::create(memory::primitive_desc pd, MatrixPtr m) {
   memory::desc md = pd.desc();
   size_t ndims = md.data.ndims;
   int* dims = md.data.dims;
@@ -41,12 +41,12 @@ MKLDNNMatrixPtr MKLDNNMatrix::create(MatrixPtr m, memory::primitive_desc pd) {
   return std::make_shared<MKLDNNMatrix>(cpuMatrix, pd);
 }

-MKLDNNMatrixPtr MKLDNNMatrix::create(MatrixPtr m,
-                                     memory::dims dims,
+MKLDNNMatrixPtr MKLDNNMatrix::create(memory::dims dims,
                                      memory::format fmt,
                                      engine& eg,
+                                     MatrixPtr m,
                                      mkldnn::memory::data_type dtype) {
-  return create(m, memory::primitive_desc(memory::desc(dims, dtype, fmt), eg));
+  return create(createPrimitiveDesc(dims, fmt, eg, dtype), m);
 }

 std::shared_ptr<reorder> MKLDNNMatrix::createReorder(const MKLDNNMatrixPtr& src,

@@ -40,24 +40,37 @@ public:
   /**
    * Create MKLDNNMatrix from a MatrixPtr and memory primitive_desc
    */
-  static MKLDNNMatrixPtr create(MatrixPtr m, mkldnn::memory::primitive_desc pd);
+  static MKLDNNMatrixPtr create(mkldnn::memory::primitive_desc pd,
+                                MatrixPtr m = nullptr);

   /**
    * Create MKLDNNMatrix from a MatrixPtr and memory details info
    */
   static MKLDNNMatrixPtr create(
-      MatrixPtr m,
       mkldnn::memory::dims dims,
       mkldnn::memory::format fmt,
       mkldnn::engine& eg,
+      MatrixPtr m = nullptr,
       mkldnn::memory::data_type dtype = mkldnn::memory::data_type::f32);

+  /**
+   * Create primitive descriptor.
+   * default with f32 dtype
+   */
+  static mkldnn::memory::primitive_desc createPrimitiveDesc(
+      const mkldnn::memory::dims dims,
+      const mkldnn::memory::format& fmt,
+      const mkldnn::engine& eg,
+      const mkldnn::memory::data_type& dtype = mkldnn::memory::data_type::f32) {
+    return mkldnn::memory::primitive_desc(memory::desc(dims, dtype, fmt), eg);
+  }
+
   /**
    * Create Memory descriptor.
    * default with any format and f32 dtype
    */
   static mkldnn::memory::desc createMemoryDesc(
-      const mkldnn::memory::dims& dims,
+      const mkldnn::memory::dims dims,
       const mkldnn::memory::format& fmt = mkldnn::memory::format::any,
       const mkldnn::memory::data_type& dtype = mkldnn::memory::data_type::f32) {
     return mkldnn::memory::desc(dims, dtype, fmt);
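With `MatrixPtr m` defaulted to `nullptr`, both creation paths now work descriptor-first: build a `primitive_desc` once, then attach a CPU matrix only when one exists. An illustrative use (the `engine`, `bs`/`oc`, and `cpuMat` names are placeholders):

    auto pd = MKLDNNMatrix::createPrimitiveDesc(
        {bs, oc}, mkldnn::memory::format::nc, engine);
    MKLDNNMatrixPtr internal = MKLDNNMatrix::create(pd);        // no existing Matrix attached
    MKLDNNMatrixPtr shared = MKLDNNMatrix::create(pd, cpuMat);  // wraps an existing MatrixPtr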

@@ -115,7 +115,8 @@ set(DEPS_OPS
     softmax_with_cross_entropy_op
     sum_op
     pool_op
-    pool_with_index_op)
+    pool_with_index_op
+    lstm_op)

 op_library(recurrent_op SRCS recurrent_op.cc rnn/recurrent_op_utils.cc
@@ -126,6 +127,7 @@ op_library(softmax_with_cross_entropy_op DEPS cross_entropy softmax)
 op_library(sum_op DEPS net_op)
 op_library(pool_op DEPS pooling)
 op_library(pool_with_index_op DEPS pooling)
+op_library(lstm_op DEPS sequence2batch lstm_compute)

 list(REMOVE_ITEM GENERAL_OPS ${DEPS_OPS})
 foreach(src ${GENERAL_OPS})

@@ -114,7 +114,7 @@ class GemmConv2DKernel : public framework::OpKernel<T> {
       // im2col
       Tensor in_slice = in_batch.Slice(g * in_step, (g + 1) * in_step);
       im2col(context.device_context(), in_slice, col, strides[0], strides[1],
-             paddings[0], paddings[1]);
+             paddings[0], paddings[0], paddings[1], paddings[1]);

       // gemm
       Tensor out_slice = out_batch.Slice(g * out_step, (g + 1) * out_step);
@@ -213,7 +213,8 @@ class GemmConvGrad2DKernel : public framework::OpKernel<T> {
         Tensor in_grad_slice =
             in_grad_batch.Slice(g * in_step, (g + 1) * in_step);
         col2im(context.device_context(), in_grad_slice, col, strides[0],
-               strides[1], paddings[0], paddings[1]);
+               strides[1], paddings[0], paddings[0], paddings[1],
+               paddings[1]);
       }
     }
   }
@@ -235,7 +236,8 @@ class GemmConvGrad2DKernel : public framework::OpKernel<T> {
             out_grad_batch.Slice(g * out_step, (g + 1) * out_step);
         Tensor in_slice = in_batch.Slice(g * in_step, (g + 1) * in_step);
         im2col(context.device_context(), in_slice, col, strides[0],
-               strides[1], paddings[0], paddings[1]);
+               strides[1], paddings[0], paddings[0], paddings[1],
+               paddings[1]);
         // gemm
         Tensor filter_grad_slice =
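The change above widens the im2col/col2im calls from two padding arguments to four, duplicating `paddings[0]` (height) and `paddings[1]` (width) so the column routines can accept independent before/after padding on each axis. The output extent then follows the usual relation; a self-contained sketch of the arithmetic (names are illustrative, not from the Paddle API):

    // Output size of a convolution along one axis with possibly
    // asymmetric padding, as implied by the expanded call.
    int ConvOutputSize(int input, int filter, int stride,
                       int pad_before, int pad_after) {
      return (input + pad_before + pad_after - filter) / stride + 1;
    }
    // The symmetric case used here: pad_before == pad_after == paddings[i].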

@@ -0,0 +1,107 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+   http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License. */
+
+#include "paddle/operators/conv2dtranspose_op.h"
+
+namespace paddle {
+namespace operators {
+
+void Conv2DTransposeOp::InferShape(framework::InferShapeContext* ctx) const {
+  PADDLE_ENFORCE(ctx->HasInput("Input"),
+                 "Input(Input) of Conv2DTransposeOp should not be null.");
+  PADDLE_ENFORCE(ctx->HasInput("Filter"),
+                 "Input(Filter) of Conv2DTransposeOp should not be null.");
+  PADDLE_ENFORCE(ctx->HasOutput("Output"),
+                 "Output(Output) of Conv2DTransposeOp should not be null.");
+
+  auto in_dims = ctx->GetInputDim("Input");
+  auto filter_dims = ctx->GetInputDim("Filter");
+  std::vector<int> strides = ctx->Attrs().Get<std::vector<int>>("strides");
+  std::vector<int> paddings = ctx->Attrs().Get<std::vector<int>>("paddings");
+
+  for (size_t i = 0; i < paddings.size(); ++i) {
+    PADDLE_ENFORCE_EQ(paddings[i], 0,
+                      "No Padding allowed in conv transpose op.");
+  }
+
+  PADDLE_ENFORCE_EQ(in_dims.size(), 4,
+                    "Conv2DTransposeOp input should be 4-D tensor.");
+  PADDLE_ENFORCE_EQ(filter_dims.size(), 4,
+                    "Conv2DTransposeOp filter should be 4-D tensor.");
+  PADDLE_ENFORCE_EQ(in_dims[1], filter_dims[0],
+                    "input and kernel input dimension should be equal.");
+
+  auto output_height = (in_dims[2] - 1) * strides[0] + filter_dims[2];
+  auto output_width = (in_dims[3] - 1) * strides[1] + filter_dims[3];
+  ctx->SetOutputDim("Output",
+                    {in_dims[0], filter_dims[1], output_height, output_width});
+}
+
+Conv2DTransposeOpMaker::Conv2DTransposeOpMaker(
+    framework::OpProto* proto, framework::OpAttrChecker* op_checker)
+    : OpProtoAndCheckerMaker(proto, op_checker) {
+  AddInput(
+      "Input",
+      "(Tensor) The input tensor of convolution transpose operator. "
+      "The format of input tensor is NCHW. Where N is batch size, C is the "
+      "number of input channels, H and W is the height and width of image.");
+  AddInput("Filter",
+           "(Tensor) The filter tensor of convolution transpose operator."
+           "The format of the filter tensor is CMHW, where C is the number of "
+           "output image channels, M is the number of input image channels, "
+           "H and W is height and width of filter. "
+           "We enforce groups number == 1 and padding == 0 in "
+           "convolution transpose Scenario.");
+  AddOutput("Output",
+            "(Tensor) The output tensor of convolution transpose operator."
+            "The format of output tensor is also NCHW.");
+  AddAttr<std::vector<int>>("strides",
+                            "strides of convolution transpose operator.")
+      .SetDefault({1, 1});
+  AddAttr<std::vector<int>>("paddings",
+                            "paddings of convolution transpose operator.")
+      .SetDefault({0, 0});
+  AddComment(R"DOC(
+The convolution transpose operation calculates the output based on the input, filter
+and strides, paddings, groups parameters. The size of each dimension of the
+parameters is checked in the infer-shape.
+)DOC");
+}
+
+void Conv2DTransposeOpGrad::InferShape(
+    framework::InferShapeContext* ctx) const {
+  auto in_dims = ctx->GetInputDim("Input");
+  auto filter_dims = ctx->GetInputDim("Filter");
+  if (ctx->HasOutput(framework::GradVarName("Input"))) {
+    ctx->SetOutputDim(framework::GradVarName("Input"), in_dims);
+  }
+  if (ctx->HasOutput(framework::GradVarName("Filter"))) {
+    ctx->SetOutputDim(framework::GradVarName("Filter"), filter_dims);
+  }
+}
+
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+REGISTER_OP(conv2dtranspose, ops::Conv2DTransposeOp,
+            ops::Conv2DTransposeOpMaker, conv2dtranspose_grad,
+            ops::Conv2DTransposeOpGrad);
+
+REGISTER_OP_CPU_KERNEL(
+    conv2dtranspose,
+    ops::GemmConv2DTransposeKernel<paddle::platform::CPUPlace, float>);
+REGISTER_OP_CPU_KERNEL(
+    conv2dtranspose_grad,
+    ops::GemmConv2DTransposeGradKernel<paddle::platform::CPUPlace, float>);
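`InferShape` above inverts the zero-padding convolution size formula: output = (input - 1) * stride + filter. A small standalone check of the arithmetic with illustrative numbers:

    #include <cassert>

    // Mirrors the height/width computation in Conv2DTransposeOp::InferShape.
    int TransposedConvSize(int in, int stride, int filter) {
      return (in - 1) * stride + filter;
    }

    int main() {
      assert(TransposedConvSize(8, 2, 3) == 17);  // (8 - 1) * 2 + 3
      assert(TransposedConvSize(5, 1, 4) == 8);   // (5 - 1) * 1 + 4
      return 0;
    }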

Some files were not shown because too many files have changed in this diff
