Merge branch 'develop' of https://github.com/PaddlePaddle/Paddle into clip_op

8 years ago · a3c3b7866e
parent 44224f4b5b 5b42d2b21b
commit a3c3b7866e
51 changed files with 2429 additions and 680 deletions
--- a/.gitignore
+++ b/.gitignore
@ -27,3 +27,4 @@ CMakeFiles
 cmake_install.cmake
 paddle/.timestamp
 python/paddlepaddle.egg-info/
+paddle/pybind/pybind.h
--- a/doc/design/refactorization.md
+++ b/doc/design/refactorization.md
--- a/paddle/cuda/include/hl_cuda_cudnn.h
+++ b/paddle/cuda/include/hl_cuda_cudnn.h
@ -22,10 +22,10 @@ limitations under the License. */
 */
 typedef enum {
  HL_POOLING_MAX = 0,
-  // average includes padded values
-  HL_POOLING_AVERAGE = 1,
  // average does not include padded values
-  HL_POOLING_AVERAGE_EXCLUDE_PADDING = 2,
+  HL_POOLING_AVERAGE = 1,
+  // average includes padded values
+  HL_POOLING_AVERAGE_INCLUDE_PADDING = 2,
  HL_POOLING_END
 } hl_pooling_mode_t;

--- a/paddle/cuda/include/hl_tensor_ops.h
+++ b/paddle/cuda/include/hl_tensor_ops.h
@ -461,7 +461,7 @@ class add<float32x4_t> {
 public:
  INLINE float32x4_t operator()(const float32x4_t a,
                                const float32x4_t b) const {
-    return vmulq_f32(a, b);
+    return vaddq_f32(a, b);
  }
 };

--- a/paddle/cuda/src/hl_cuda_cnn.cu
+++ b/paddle/cuda/src/hl_cuda_cnn.cu
@ -211,13 +211,11 @@ __global__ void KeAvgPoolForward(const int nthreads,

    int hstart = ph * strideH - padH;
    int wstart = pw * strideW - padW;
-    int hend = min(hstart + sizeY, height + padH);
-    int wend = min(wstart + sizeX, width + padW);
-    int pool_size = (hend - hstart) * (wend - wstart);
+    int hend = min(hstart + sizeY, height);
+    int wend = min(wstart + sizeX, width);
    hstart = max(hstart, 0);
    wstart = max(wstart, 0);
-    hend = min(hend, height);
-    wend = min(wend, width);
+    int pool_size = (hend - hstart) * (wend - wstart);

    real aveval = 0;
    inputData += (frameNum * channels + c) * height * width;
@ -299,12 +297,14 @@ __global__ void KeAvgPoolBackward(const int nthreads,
    outGrad += (frameNum * outStride + offsetC * pooledH * pooledW);

    for (int ph = phstart; ph < phend; ++ph) {
+      int hstart = ph * strideH - padH;
+      int hend = min(hstart + sizeY, height);
+      hstart = max(hstart, 0);
      for (int pw = pwstart; pw < pwend; ++pw) {
        // figure out the pooling size
-        int hstart = ph * strideH - padH;
        int wstart = pw * strideW - padW;
-        int hend = min(hstart + sizeY, height + padH);
-        int wend = min(wstart + sizeX, width + padW);
+        int wend = min(wstart + sizeX, width);
+        wstart = max(wstart, 0);
        int poolsize = (hend - hstart) * (wend - wstart);
        gradient += outGrad[ph * pooledW + pw] / poolsize;
      }
@ -600,16 +600,13 @@ __global__ void KeAvgPool3DForward(const int nthreads,
    int dstart = pd * strideD - padD;
    int hstart = ph * strideH - padH;
    int wstart = pw * strideW - padW;
-    int dend = min(dstart + sizeZ, depth + padD);
-    int hend = min(hstart + sizeY, height + padH);
-    int wend = min(wstart + sizeX, width + padW);
-    int pool_size = (dend - dstart) * (hend - hstart) * (wend - wstart);
+    int dend = min(dstart + sizeZ, depth);
+    int hend = min(hstart + sizeY, height);
+    int wend = min(wstart + sizeX, width);
    dstart = max(dstart, 0);
    hstart = max(hstart, 0);
    wstart = max(wstart, 0);
-    dend = min(dend, depth);
-    hend = min(hend, height);
-    wend = min(wend, width);
+    int pool_size = (dend - dstart) * (hend - hstart) * (wend - wstart);

    real aveval = 0;
    inputData += (frameNum * channels + c) * depth * height * width;
@ -712,15 +709,18 @@ __global__ void KeAvgPool3DBackward(const int nthreads,
    outGrad += (frameNum * channels + offsetC) * pooledD * pooledH * pooledW;

    for (int pd = pdstart; pd < pdend; ++pd) {
+      int dstart = pd * strideD - padD;
+      int dend = min(dstart + sizeZ, depth);
+      dstart = max(dstart, 0);
      for (int ph = phstart; ph < phend; ++ph) {
+        int hstart = ph * strideH - padH;
+        int hend = min(hstart + sizeY, height);
+        hstart = max(hstart, 0);
        for (int pw = pwstart; pw < pwend; ++pw) {
          // figure out the pooling size
-          int dstart = pd * strideD - padD;
-          int hstart = ph * strideH - padH;
          int wstart = pw * strideW - padW;
-          int dend = min(dstart + sizeZ, depth + padD);
-          int hend = min(hstart + sizeY, height + padH);
-          int wend = min(wstart + sizeX, width + padW);
+          int wend = min(wstart + sizeX, width);
+          wstart = max(wstart, 0);
          int poolsize = (dend - dstart) * (hend - hstart) * (wend - wstart);
          gradient += outGrad[(pd * pooledH + ph) * pooledW + pw] / poolsize;
        }
--- a/paddle/cuda/src/hl_cuda_cudnn.cc
+++ b/paddle/cuda/src/hl_cuda_cudnn.cc
@ -432,11 +432,11 @@ void hl_create_pooling_descriptor(hl_pooling_descriptor* pooling_desc,
      cudnn_mode = CUDNN_POOLING_MAX;
      break;
    case HL_POOLING_AVERAGE:
-      cudnn_mode = CUDNN_POOLING_AVERAGE_COUNT_INCLUDE_PADDING;
-      break;
-    case HL_POOLING_AVERAGE_EXCLUDE_PADDING:
      cudnn_mode = CUDNN_POOLING_AVERAGE_COUNT_EXCLUDE_PADDING;
      break;
+    case HL_POOLING_AVERAGE_INCLUDE_PADDING:
+      cudnn_mode = CUDNN_POOLING_AVERAGE_COUNT_INCLUDE_PADDING;
+      break;
    default:
      LOG(FATAL) << "parameter mode error";
  }
--- a/paddle/framework/operator.cc
+++ b/paddle/framework/operator.cc
@ -22,14 +22,14 @@ namespace framework {
 template <>
 Eigen::DefaultDevice& ExecutionContext::GetEigenDevice<
    platform::CPUPlace, Eigen::DefaultDevice>() const {
-  return *device_context_->get_eigen_device<Eigen::DefaultDevice>();
+  return *device_context_.get_eigen_device<Eigen::DefaultDevice>();
 }

 #ifndef PADDLE_ONLY_CPU
 template <>
 Eigen::GpuDevice&
 ExecutionContext::GetEigenDevice<platform::GPUPlace, Eigen::GpuDevice>() const {
-  return *device_context_->get_eigen_device<Eigen::GpuDevice>();
+  return *device_context_.get_eigen_device<Eigen::GpuDevice>();
 }
 #endif

--- a/paddle/framework/operator.h
+++ b/paddle/framework/operator.h
@ -366,7 +366,7 @@ struct EigenDeviceConverter<platform::GPUPlace> {
 class ExecutionContext : public InferShapeContext {
 public:
  ExecutionContext(const OperatorBase& op, const Scope& scope,
-                   const platform::DeviceContext* device_context)
+                   const platform::DeviceContext& device_context)
      : InferShapeContext(op, scope), device_context_(device_context) {}

  template <typename PlaceType,
@ -374,9 +374,9 @@ class ExecutionContext : public InferShapeContext {
                typename EigenDeviceConverter<PlaceType>::EigenDeviceType>
  DeviceType& GetEigenDevice() const;

-  platform::Place GetPlace() const { return device_context_->GetPlace(); }
+  platform::Place GetPlace() const { return device_context_.GetPlace(); }

-  const platform::DeviceContext* device_context() const {
+  const platform::DeviceContext& device_context() const {
    return device_context_;
  }

@ -401,7 +401,8 @@ class ExecutionContext : public InferShapeContext {
    return res;
  }

-  const platform::DeviceContext* device_context_;
+ private:
+  const platform::DeviceContext& device_context_;
 };

 template <>
@ -461,7 +462,7 @@ class OperatorWithKernel : public OperatorBase {
  void Run(const Scope& scope,
           const platform::DeviceContext& dev_ctx) const final {
    auto& opKernel = AllOpKernels().at(type_).at(OpKernelKey(dev_ctx));
-    opKernel->Compute(ExecutionContext(*this, scope, &dev_ctx));
+    opKernel->Compute(ExecutionContext(*this, scope, dev_ctx));
  }

  static std::unordered_map<std::string /* op_type */, OpKernelMap>&
--- a/paddle/gserver/layers/CudnnPoolLayer.cpp
+++ b/paddle/gserver/layers/CudnnPoolLayer.cpp
@ -29,9 +29,9 @@ bool CudnnPoolLayer::typeCheck(const std::string &poolType,
    if (mode) {
      *mode = HL_POOLING_AVERAGE;
    }
-  } else if (poolType == "cudnn-avg-excl-pad-pool") {
+  } else if (poolType == "cudnn-avg-incl-pad-pool") {
    if (mode) {
-      *mode = HL_POOLING_AVERAGE_EXCLUDE_PADDING;
+      *mode = HL_POOLING_AVERAGE_INCLUDE_PADDING;
    }
  } else {
    return false;
--- a/paddle/gserver/layers/DetectionOutputLayer.cpp
+++ b/paddle/gserver/layers/DetectionOutputLayer.cpp
@ -143,7 +143,7 @@ void DetectionOutputLayer::forward(PassType passType) {
    resetOutput(numKept, 7);
  } else {
    MatrixPtr outV = getOutputValue();
-    outV = NULL;
+    if (outV) outV->resize(0, 0);
    return;
  }
  MatrixPtr outV = getOutputValue();
--- a/paddle/gserver/layers/MKLDNNPoolLayer.cpp
+++ b/paddle/gserver/layers/MKLDNNPoolLayer.cpp
--- a/paddle/gserver/layers/MKLDNNPoolLayer.h
+++ b/paddle/gserver/layers/MKLDNNPoolLayer.h
@ -0,0 +1,138 @@
+/* Copyright (c) 2017 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include "MKLDNNLayer.h"
+#include "mkldnn.hpp"
+
+namespace paddle {
+typedef mkldnn::pooling_forward pool_fwd;
+typedef mkldnn::pooling_backward pool_bwd;
+
+/**
+ * @brief A subclass of MKLDNNLayer pool layer.
+ *
+ * The config file api is mkldnn_pool
+ */
+class MKLDNNPoolLayer : public MKLDNNLayer {
+protected:
+  // padding height and width
+  int ph_, pw_;
+  // stride height and width
+  int sh_, sw_;
+  // filter(kenerl) height and width
+  int fh_, fw_;
+
+  // pooling_avg or pooling_max
+  mkldnn::algorithm poolAlgo_;
+
+  // MKLDNNMatrixPtr which should be created from CPU Device
+  MKLDNNMatrixPtr cpuOutVal_;
+  MKLDNNMatrixPtr cpuOutGrad_;
+  // convert handle between CPU device and MKLDNN device
+  std::shared_ptr<mkldnn::reorder> cvtOutVal_;
+  std::shared_ptr<mkldnn::reorder> cvtOutGrad_;
+
+  // save forward primitive_desc, which can be used backward
+  std::shared_ptr<pool_fwd::primitive_desc> fwdPD_;
+  // according to https://github.com/01org/mkl-dnn/blob/master/tests/gtests/
+  // test_pooling_forward.cpp, pool need workspace for backward
+  std::shared_ptr<mkldnn::memory> workspace_;
+
+public:
+  explicit MKLDNNPoolLayer(const LayerConfig& config) : MKLDNNLayer(config) {}
+
+  ~MKLDNNPoolLayer() {}
+
+  bool init(const LayerMap& layerMap,
+            const ParameterMap& parameterMap) override;
+
+  void reshape(
+      int& bs, int& ic, int& ih, int& iw, int oc, int& oh, int& ow) override;
+
+  void resetFwd(std::vector<mkldnn::primitive>& pipeline,
+                MKLDNNMatrixPtr& in,
+                MKLDNNMatrixPtr& wgt,
+                MKLDNNMatrixPtr& bias,
+                MKLDNNMatrixPtr& out) override;
+
+  void resetBwd(std::vector<mkldnn::primitive>& pipeline,
+                MKLDNNMatrixPtr& in,
+                MKLDNNMatrixPtr& wgt,
+                MKLDNNMatrixPtr& bias,
+                MKLDNNMatrixPtr& out) override;
+
+  void updateInputData() override;
+
+  void printSizeInfo() override {
+    MKLDNNLayer::printSizeInfo();
+    VLOG(MKLDNN_SIZES) << getName() << ": fh: " << fh_ << ", fw: " << fw_
+                       << ": ph: " << ph_ << ", pw: " << pw_ << ", sh: " << sh_
+                       << ", sw: " << sw_;
+  }
+
+protected:
+  /**
+   * Forward functions: reset buffers(input, output),
+   *                    reset primitive descriptor,
+   *                    reset pipeline.
+   */
+  void resetFwdBuffers(MKLDNNMatrixPtr& in, MKLDNNMatrixPtr& out);
+  void resetInValue(MKLDNNMatrixPtr& in);
+  void resetOutValue(MKLDNNMatrixPtr& out);
+  void resetFwdPD(std::shared_ptr<pool_fwd::primitive_desc>& pd,
+                  MKLDNNMatrixPtr in,
+                  MKLDNNMatrixPtr out);
+  void resetFwdPipeline(std::vector<mkldnn::primitive>& pipeline,
+                        std::shared_ptr<pool_fwd::primitive_desc>& pd,
+                        MKLDNNMatrixPtr& in,
+                        MKLDNNMatrixPtr& out);
+
+  /**
+   * Backward functions: reset buffers(input, output),
+   *                     reset primitive descriptor,
+   *                     reset pipeline.
+   */
+  void resetBwdBuffers(MKLDNNMatrixPtr& in, MKLDNNMatrixPtr& out);
+  void resetOutGrad(MKLDNNMatrixPtr& out);
+  void resetInGrad(MKLDNNMatrixPtr& in);
+  void resetBwdPD(std::shared_ptr<pool_bwd::primitive_desc>& pd,
+                  MKLDNNMatrixPtr& in,
+                  MKLDNNMatrixPtr& out);
+  void resetBwdPipeline(std::vector<mkldnn::primitive>& pipeline,
+                        std::shared_ptr<pool_bwd::primitive_desc>& pd,
+                        MKLDNNMatrixPtr& in,
+                        MKLDNNMatrixPtr& out);
+
+  /**
+   * get padding_r according to
+   * https://github.com/01org/mkl-dnn/blob/master/tests/gtests/
+   * test_pooling_forward.cpp
+   */
+  mkldnn::memory::dims getPaddingR() const {
+    mkldnn::memory::dims padR = {ph_, pw_};
+    for (int i = 0; i < 2; ++i) {
+      if ((ih_ + ph_ + padR[0] - fh_) / sh_ + 1 < oh_) {
+        ++padR[0];
+      }
+      if ((iw_ + pw_ + padR[1] - fw_) / sw_ + 1 < ow_) {
+        ++padR[1];
+      }
+    }
+    return padR;
+  }
+};
+
+}  // namespace paddle
--- a/paddle/gserver/tests/test_MKLDNN.cpp
+++ b/paddle/gserver/tests/test_MKLDNN.cpp
@ -141,6 +141,68 @@ TEST(MKLDNNLayer, ConvLayer) {
  testConvLayer({4, 4, 16, 3, 3, 16, 3, 3, 3, 3, 1, 1, 1, 1, 1, 1});
 }

+struct testPoolDesc {
+  int bs, ch;  // input channel and output channel are the same
+  int ih, iw;
+  int oh, ow;
+  int fh, fw;
+  int ph, pw;
+  int sh, sw;
+};
+
+void testPoolLayer(const testPoolDesc& pm) {
+  const std::string compareTypes[] = {"mkldnn_pool", "pool"};
+  TestConfig cfg;
+  cfg.layerConfig.set_type(compareTypes[0]);
+  cfg.layerConfig.set_size(pm.ch * pm.oh * pm.ow);
+  cfg.inputDefs.push_back(
+      {INPUT_DATA,
+       "layer_0",
+       /* size of input layer= */ size_t(pm.ch * pm.ih * pm.iw),
+       0});
+  LayerInputConfig* input = cfg.layerConfig.add_inputs();
+  PoolConfig* pool = input->mutable_pool_conf();
+  // pool->set_pool_type(poolType);
+  pool->set_channels(pm.ch);
+  pool->set_img_size(pm.iw);
+  pool->set_img_size_y(pm.ih);
+  pool->set_output_x(pm.ow);
+  pool->set_output_y(pm.oh);
+  pool->set_size_x(pm.fw);
+  pool->set_size_y(pm.fh);
+  pool->set_padding(pm.pw);
+  pool->set_padding_y(pm.ph);
+  pool->set_stride(pm.sw);
+  pool->set_stride_y(pm.sh);
+
+  int oh = outputSize(pm.ih, pm.fh, pm.ph, pm.sh, false);
+  int ow = outputSize(pm.iw, pm.fw, pm.pw, pm.sw, false);
+  CHECK_EQ(ow, pm.ow) << "output size check failed";
+  CHECK_EQ(oh, pm.oh) << "output size check failed";
+
+  MKLDNNTester tester;
+  for (auto type : {"max-projection", "avg-projection"}) {
+    pool->set_pool_type(type);
+    TestConfig ref = cfg;
+    ref.layerConfig.set_type(compareTypes[1]);
+    for (auto bs : {pm.bs, 1}) {
+      tester.run(cfg, ref, bs, pm.ih, pm.iw);
+    }
+  }
+}
+
+TEST(MkldnnLayer, PoolLayer) {
+  /* bs, ch, ih, iw, oh, ow, fh, fw, ph, pw, sh, sw*/
+  testPoolLayer({2, 1, 4, 4, 2, 2, 3, 3, 0, 0, 2, 2});
+  testPoolLayer({10, 8, 16, 16, 8, 8, 2, 2, 0, 0, 2, 2});
+  testPoolLayer({4, 2, 5, 5, 3, 3, 3, 3, 1, 1, 2, 2});
+  testPoolLayer({8, 16, 56, 56, 28, 28, 3, 3, 0, 0, 2, 2});
+  testPoolLayer({8, 16, 14, 14, 7, 7, 3, 3, 0, 0, 2, 2});
+  testPoolLayer({4, 16, 7, 7, 1, 1, 7, 7, 0, 0, 1, 1});
+  testPoolLayer({4, 2, 5, 5, 3, 3, 5, 5, 1, 1, 1, 1});
+  testPoolLayer({2, 8, 56, 56, 29, 29, 3, 3, 1, 1, 2, 2});
+}
+
 // TODO(TJ): add branch test

 int main(int argc, char** argv) {
--- a/paddle/math/BaseMatrix.cu
+++ b/paddle/math/BaseMatrix.cu
@ -17,6 +17,7 @@ limitations under the License. */
 #include <cmath>
 #include "BaseMatrix.h"
 #include "MathFunctions.h"
+#include "NEONFunctions.h"
 #include "SIMDFunctions.h"
 #include "hl_matrix_apply.cuh"
 #include "hl_matrix_base.cuh"
@ -666,6 +667,13 @@ void BaseMatrixT<T>::relu(BaseMatrixT& b) {
  applyBinary(binary::Relu<T>(), b);
 }

+#if defined(__ARM_NEON__) || defined(__ARM_NEON)
+template <>
+void BaseMatrixT<float>::relu(BaseMatrixT& b) {
+  neon::relu(data_, b.data_, height_ * width_);
+}
+#endif
+
 DEFINE_MATRIX_BINARY_OP(ReluDerivative, a *= (b > 0.0f ? 1.0f : 0.0f));
 template <class T>
 void BaseMatrixT<T>::reluDerivative(BaseMatrixT& b) {
--- a/paddle/math/Matrix.cpp
+++ b/paddle/math/Matrix.cpp
--- a/paddle/math/NEONFunctions.cpp
+++ b/paddle/math/NEONFunctions.cpp
@ -0,0 +1,55 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#if defined(__ARM_NEON__) || defined(__ARM_NEON)
+
+#include "NEONFunctions.h"
+#include <arm_neon.h>
+
+namespace paddle {
+namespace neon {
+
+// b[i] = a[i] > 0.0f ? a[i] : 0.0f
+void relu(const float* a, float* b, int len) {
+  int offset = len % 16;
+  float32x4_t ma0, ma1, ma2, ma3;
+  float32x4_t mb0, mb1, mb2, mb3;
+
+  float32x4_t zero = vdupq_n_f32(0.f);
+  for (int k = 0; k < len / 16; k++, a += 16, b += 16) {
+    ma0 = vld1q_f32(a);
+    ma1 = vld1q_f32(a + 4);
+    ma2 = vld1q_f32(a + 8);
+    ma3 = vld1q_f32(a + 12);
+
+    mb0 = vmaxq_f32(ma0, zero);
+    mb1 = vmaxq_f32(ma1, zero);
+    mb2 = vmaxq_f32(ma2, zero);
+    mb3 = vmaxq_f32(ma3, zero);
+
+    vst1q_f32(b, mb0);
+    vst1q_f32(b + 4, mb1);
+    vst1q_f32(b + 8, mb2);
+    vst1q_f32(b + 12, mb3);
+  }
+
+  for (int i = 0; i < offset; i++) {
+    b[i] = a[i] > 0.0f ? a[i] : 0.0f;
+  }
+}
+
+}  // namespace neon
+}  // namespace paddle
+
+#endif
--- a/paddle/math/NEONFunctions.h
+++ b/paddle/math/NEONFunctions.h
@ -0,0 +1,23 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+namespace paddle {
+namespace neon {
+
+void relu(const float* a, float* b, int len);
+
+}  // namespace neon
+}  // namespace paddle
--- a/paddle/math/tests/test_matrixCompare.cpp
+++ b/paddle/math/tests/test_matrixCompare.cpp
@ -825,9 +825,8 @@ void testMaxPoolFwdBwd(int numSamples,
                       int strideW,
                       int padH,
                       int padW) {
-  int outH = 0, outW = 0;
-  outH = (imgSizeH - ksizeH + 2 * padH + strideH - 1) / strideH + 1;
-  outW = (imgSizeW - ksizeW + 2 * padW + strideW - 1) / strideW + 1;
+  int outH = outputSize(imgSizeH, ksizeH, padH, strideH, true);
+  int outW = outputSize(imgSizeW, ksizeW, padW, strideW, true);

  int inWidth = imgSizeH * imgSizeW * channels;
  MatrixPtr input = CpuMatrix::create(numSamples, inWidth, false, false);
@ -927,9 +926,8 @@ void testAvgPoolFwdBwd(int numSamples,
                       int strideW,
                       int padH,
                       int padW) {
-  int outH = 0, outW = 0;
-  outH = (imgSizeH - ksizeH + 2 * padH + strideH - 1) / strideH + 1;
-  outW = (imgSizeW - ksizeW + 2 * padW + strideW - 1) / strideW + 1;
+  int outH = outputSize(imgSizeH, ksizeH, padH, strideH, true);
+  int outW = outputSize(imgSizeW, ksizeW, padW, strideW, true);

  int inWidth = imgSizeH * imgSizeW * channels;
  MatrixPtr input = CpuMatrix::create(numSamples, inWidth, false, false);
--- a/paddle/operators/cross_entropy_op.cc
+++ b/paddle/operators/cross_entropy_op.cc
@ -0,0 +1,147 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/operators/cross_entropy_op.h"
+
+namespace paddle {
+namespace operators {
+
+using framework::LoDTensor;
+
+class CrossEntropyOp : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+
+ protected:
+  void InferShape(const framework::InferShapeContext &ctx) const override {
+    PADDLE_ENFORCE_NOT_NULL(ctx.InputVar("X"), "Input(X) must not be null.");
+    PADDLE_ENFORCE_NOT_NULL(ctx.InputVar("Label"),
+                            "Input(Label) must not be null.");
+    PADDLE_ENFORCE_NOT_NULL(ctx.OutputVar("Y"), "Output(Y) must not be null.");
+
+    auto x = ctx.Input<Tensor>("X");
+    auto label = ctx.Input<Tensor>("Label");
+    PADDLE_ENFORCE_EQ(x->dims().size(), 2, "Input(X)'s rank must be 2.");
+    PADDLE_ENFORCE_EQ(label->dims().size(), 2,
+                      "Input(Label)'s rank must be 2.");
+    // TODO(xinghai-sun): remove this check after swtiching to bool
+    PADDLE_ENFORCE(ctx.Attr<int>("soft_label") == 0 ||
+                   ctx.Attr<int>("soft_label") == 1);
+    PADDLE_ENFORCE_EQ(x->dims()[0], label->dims()[0],
+                      "The 1st dimension of Input(X) and Input(Label) must "
+                      "be equal.");
+    if (ctx.Attr<int>("soft_label") == 1) {
+      PADDLE_ENFORCE_EQ(x->dims()[1], label->dims()[1],
+                        "If Attr(soft_label) == 1, The 2nd dimension of "
+                        "Input(X) and Input(Label) must be equal.");
+    } else {
+      PADDLE_ENFORCE_EQ(label->dims()[1], 1,
+                        "If Attr(soft_label) == 0, The 2nd dimension of "
+                        "Input(Label) must be 1.");
+    }
+
+    ctx.Output<LoDTensor>("Y")->Resize({x->dims()[0], 1});
+  }
+};
+
+class CrossEntropyGradientOp : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+
+ protected:
+  void InferShape(const framework::InferShapeContext &ctx) const override {
+    PADDLE_ENFORCE_NOT_NULL(ctx.InputVar("X"), "Input(X) must not be null.");
+    PADDLE_ENFORCE_NOT_NULL(ctx.InputVar("Label"),
+                            "Input(Label) must not be null.");
+    PADDLE_ENFORCE_NOT_NULL(ctx.InputVar(framework::GradVarName("Y")),
+                            "Input(Y@GRAD) must not be null.");
+
+    auto x = ctx.Input<Tensor>("X");
+    auto label = ctx.Input<Tensor>("Label");
+    auto dy = ctx.Input<Tensor>(framework::GradVarName("Y"));
+    PADDLE_ENFORCE_EQ(x->dims().size(), 2, "Input(X)'s rank must be 2.");
+    PADDLE_ENFORCE_EQ(dy->dims().size(), 2, "Input(Y@Grad)'s rank must be 2.");
+    PADDLE_ENFORCE_EQ(label->dims().size(), 2,
+                      "Input(Label)'s rank must be 2.");
+    // TODO(xinghai-sun): remove this check after swtiching to bool
+    PADDLE_ENFORCE(ctx.Attr<int>("soft_label") == 0 ||
+                   ctx.Attr<int>("soft_label") == 1);
+    PADDLE_ENFORCE_EQ(x->dims()[0], label->dims()[0],
+                      "The 1st dimension of Input(X) and Input(Label) must "
+                      "be equal.");
+    PADDLE_ENFORCE_EQ(x->dims()[0], dy->dims()[0],
+                      "The 1st dimension of Input(X) and Input(Y@Grad) must "
+                      "be equal.");
+    PADDLE_ENFORCE_EQ(dy->dims()[1], 1,
+                      "The 2nd dimension of Input(Y@Grad) must be 1.");
+    if (ctx.Attr<int>("soft_label") == 1) {
+      PADDLE_ENFORCE_EQ(x->dims()[1], label->dims()[1],
+                        "If Attr(soft_label) == 1, The 2nd dimension of "
+                        "Input(X) and Input(Label) must be equal.");
+    } else {
+      PADDLE_ENFORCE_EQ(label->dims()[1], 1,
+                        "If Attr(soft_label) == 0, The 2nd dimension of "
+                        "Input(Label) must be 1.");
+    }
+
+    auto dx = ctx.Output<LoDTensor>(framework::GradVarName("X"));
+    dx->Resize(x->dims());
+  }
+};
+
+class CrossEntropyOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  CrossEntropyOpMaker(framework::OpProto *proto,
+                      framework::OpAttrChecker *op_checker)
+      : OpProtoAndCheckerMaker(proto, op_checker) {
+    AddInput("X", "The first input of CrossEntropyOp");
+    AddInput("Label", "The second input of CrossEntropyOp");
+    AddOutput("Y", "The output of CrossEntropyOp");
+    AddAttr<int>("soft_label", "Is soft label. Default zero.").SetDefault(0);
+
+    AddComment(R"DOC(
+CrossEntropy Operator.
+
+It supports both standard cross-entropy and soft-label cross-entropy loss
+computation.
+1) One-hot cross-entropy:
+    soft_label = 0, Label[i, 0] indicates the class index for sample i:
+
+                Y[i] = -log(X[i, Label[i]])
+
+2) Soft-label cross-entropy:
+    soft_label = 1, Label[i, j] indicates the soft label of class j
+    for sample i:
+
+                Y[i] = \sum_j{-Label[i, j] * log(X[i, j])}
+
+   Please make sure that in this case the summuation of each row of Label
+   equals one.
+
+3) One-hot cross-entropy with vecterized Input(Label):
+     As a special case of 2), when each row of Input(Label) has only one
+     non-zero element (equals 1), soft-label cross-entropy degenerates to a
+     one-hot cross-entropy with one-hot label representation.
+)DOC");
+  }
+};
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+REGISTER_OP(cross_entropy, ops::CrossEntropyOp, ops::CrossEntropyOpMaker,
+            cross_entropy_grad, ops::CrossEntropyGradientOp);
+REGISTER_OP_CPU_KERNEL(cross_entropy, ops::CrossEntropyOpKernel<float>);
+REGISTER_OP_CPU_KERNEL(cross_entropy_grad,
+                       ops::CrossEntropyGradientOpKernel<float>);
--- a/paddle/operators/cross_entropy_op.cu
+++ b/paddle/operators/cross_entropy_op.cu
@ -0,0 +1,158 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+   http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License. */
+
+#include "paddle/framework/op_registry.h"
+#include "paddle/operators/cross_entropy_op.h"
+#include "paddle/platform/assert.h"
+#include "paddle/platform/hostdevice.h"
+
+namespace paddle {
+namespace operators {
+
+template <typename T>
+__global__ void CrossEntropyKernel(T* Y, const T* X, const int* label,
+                                   const int N, const int D) {
+  // TOOD(qingqing) define CUDA_1D_KERNEL_LOOP macro in a common file.
+  // CUDA_1D_KERNEL_LOOP(i, N) {
+  for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < N;
+       i += blockDim.x * gridDim.x) {
+    PADDLE_ASSERT(label[i] >= 0 && label[i] < D);
+    Y[i] = -tolerable_value(log(X[i * D + label[i]]));
+  }
+}
+
+template <typename T>
+__global__ void SoftCrossEntropyKernel(T* Y, const T* X, const T* label,
+                                       const int N, const int D) {
+  for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < N;
+       i += blockDim.x * gridDim.x) {
+    T sum = static_cast<T>(0);
+    for (int j = 0; j < D; j++) {
+      sum += label[i * D + j] * tolerable_value(log(X[i * D + j]));
+    }
+    Y[i] = -sum;
+  }
+}
+
+// TODO(qingqing): make zero setting an common function.
+template <typename T>
+__global__ void zero(T* X, const int N) {
+  for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < N;
+       i += blockDim.x * gridDim.x) {
+    X[i] = 0.0;
+  }
+}
+
+template <typename T>
+__global__ void CrossEntropyGradientKernel(T* dX, const T* dY, const T* X,
+                                           const int* label, const int N,
+                                           const int D) {
+  // TOOD(qingqing) define CUDA_1D_KERNEL_LOOP macro in a common file.
+  // CUDA_1D_KERNEL_LOOP(i, N) {
+  for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < N;
+       i += blockDim.x * gridDim.x) {
+    int idx = i * D + label[i];
+    dX[idx] = -dY[i] / X[idx];
+  }
+}
+
+template <typename T>
+__global__ void SoftCrossEntropyGradientKernel(T* dX, const T* dY, const T* X,
+                                               const T* label, const int N,
+                                               const int D) {
+  // TOOD(qingqing): optimize for this kernel
+  for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < N;
+       i += blockDim.x * gridDim.x) {
+    for (int j = 0; j < D; ++j) {
+      int idx = i * D + j;
+      dX[idx] = -label[idx] * dY[i] / X[idx];
+    }
+  }
+}
+
+template <typename T>
+class CrossEntropyOpCUDAKernel : public framework::OpKernel {
+ public:
+  void Compute(const framework::ExecutionContext& ctx) const override {
+    PADDLE_ENFORCE(platform::is_gpu_place(ctx.GetPlace()),
+                   "It must use GPUPlace.");
+
+    auto x = ctx.Input<Tensor>("X");
+    auto y = ctx.Output<Tensor>("Y");
+    auto label = ctx.Input<Tensor>("Label");
+
+    auto* x_data = x->data<T>();
+    y->mutable_data<T>(ctx.GetPlace());
+    auto* y_data = y->data<T>();
+
+    int n = x->dims()[0];
+    int d = x->dims()[1];
+    int block = 512;
+    int grid = (n + block - 1) / block;
+    // TODO(qingqing) launch kernel on specified stream
+    // base on ExecutionContext.
+    if (ctx.Attr<int>("soft_label") == 1) {
+      auto* label_data = ctx.Input<Tensor>("Label")->data<T>();
+      SoftCrossEntropyKernel<T><<<grid, block>>>(y_data, x_data, label_data, n,
+                                                 d);
+    } else {
+      auto* label_data = ctx.Input<Tensor>("Label")->data<int>();
+      CrossEntropyKernel<T><<<grid, block>>>(y_data, x_data, label_data, n, d);
+    }
+  }
+};
+
+template <typename T>
+class CrossEntropyGradientOpCUDAKernel : public framework::OpKernel {
+ public:
+  void Compute(const framework::ExecutionContext& ctx) const override {
+    PADDLE_ENFORCE(platform::is_gpu_place(ctx.GetPlace()),
+                   "It must use GPUPlace.");
+
+    auto x = ctx.Input<Tensor>("X");
+    auto dx = ctx.Output<Tensor>(framework::GradVarName("X"));
+    auto dy = ctx.Input<Tensor>(framework::GradVarName("Y"));
+    auto label = ctx.Input<Tensor>("Label");
+
+    auto* dx_data = dx->mutable_data<T>(ctx.GetPlace());
+    auto* dy_data = dy->data<T>();
+    auto* x_data = x->data<T>();
+
+    int n = x->dims()[0];
+    int d = x->dims()[1];
+    int block = 512;
+    int grid = (n * d + block - 1) / block;
+    zero<T><<<grid, block>>>(dx_data, n * d);
+    grid = (n + block - 1) / block;
+    // TODO(qingqing): launch kernel on specified stream
+    // base on ExecutionContext.
+    if (ctx.Attr<int>("soft_label") == 1) {
+      auto* label_data = label->data<T>();
+      SoftCrossEntropyGradientKernel<T><<<grid, block>>>(
+          dx_data, dy_data, x_data, label_data, n, d);
+    } else {
+      auto* label_data = label->data<int>();
+      CrossEntropyGradientKernel<T><<<grid, block>>>(dx_data, dy_data, x_data,
+                                                     label_data, n, d);
+    }
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+REGISTER_OP_GPU_KERNEL(cross_entropy, ops::CrossEntropyOpCUDAKernel<float>);
+REGISTER_OP_GPU_KERNEL(cross_entropy_grad,
+                       ops::CrossEntropyGradientOpCUDAKernel<float>);
--- a/paddle/operators/cross_entropy_op.h
+++ b/paddle/operators/cross_entropy_op.h
@ -0,0 +1,117 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+#include "paddle/framework/op_registry.h"
+#include "paddle/platform/hostdevice.h"
+
+namespace paddle {
+namespace operators {
+
+using Tensor = framework::Tensor;
+
+template <typename T>
+HOSTDEVICE T tolerable_value(const T x) {
+  PADDLE_ASSERT(std::is_floating_point<T>::value);
+  const T kApproInf = 1e20;
+  if (x == INFINITY) {
+    return kApproInf;
+  }
+  if (x == -INFINITY) {
+    return -kApproInf;
+  }
+  return x;
+}
+
+template <typename T>
+class CrossEntropyOpKernel : public framework::OpKernel {
+ public:
+  void Compute(const framework::ExecutionContext& ctx) const override {
+    PADDLE_ENFORCE(platform::is_cpu_place(ctx.GetPlace()),
+                   "It must use CPUPlace.");
+
+    auto x = ctx.Input<Tensor>("X");
+    auto y = ctx.Output<Tensor>("Y");
+
+    auto* x_data = x->data<T>();
+    y->mutable_data<T>(ctx.GetPlace());
+    auto* y_data = y->data<T>();
+
+    int batch_size = x->dims()[0];
+    int class_num = x->dims()[1];
+
+    if (ctx.Attr<int>("soft_label") == 1) {
+      auto* label_data = ctx.Input<Tensor>("Label")->data<T>();
+      int index = 0;
+      for (int i = 0; i < batch_size; ++i) {
+        T sum = static_cast<T>(0);
+        for (int j = 0; j < class_num; ++j) {
+          sum += label_data[index] * tolerable_value(std::log(x_data[index]));
+          y_data[i] = -sum;
+          index++;
+        }
+      }
+    } else {
+      auto* label_data = ctx.Input<Tensor>("Label")->data<int>();
+      for (int i = 0; i < batch_size; ++i) {
+        int index = i * class_num + label_data[i];
+        y_data[i] = -tolerable_value(std::log(x_data[index]));
+      }
+    }
+  }
+};
+
+template <typename T>
+class CrossEntropyGradientOpKernel : public framework::OpKernel {
+ public:
+  void Compute(const framework::ExecutionContext& ctx) const override {
+    PADDLE_ENFORCE(platform::is_cpu_place(ctx.GetPlace()),
+                   "It must use CPUPlace.");
+
+    auto x = ctx.Input<Tensor>("X");
+    auto dx = ctx.Output<Tensor>(framework::GradVarName("X"));
+    auto dy = ctx.Input<Tensor>(framework::GradVarName("Y"));
+    auto label = ctx.Input<Tensor>("Label");
+
+    auto* dx_data = dx->mutable_data<T>(ctx.GetPlace());
+    auto* dy_data = dy->data<T>();
+    auto* x_data = x->data<T>();
+
+    int batch_size = x->dims()[0];
+    int class_num = x->dims()[1];
+
+    // TODO(qingqing): make zero setting an common function.
+    if (ctx.Attr<int>("soft_label") == 1) {
+      auto* label_data = ctx.Input<Tensor>("Label")->data<T>();
+      int index = 0;
+      for (int i = 0; i < batch_size; ++i) {
+        for (int j = 0; j < class_num; ++j) {
+          dx_data[index] = -label_data[index] * dy_data[i] / x_data[index];
+          index++;
+        }
+      }
+    } else {
+      auto* label_data = label->data<int>();
+      memset(dx_data, 0, sizeof(T) * batch_size * class_num);
+      for (int i = 0; i < batch_size; ++i) {
+        PADDLE_ASSERT(label_data[i] >= 0 || label_data[i] < class_num);
+        int index = i * class_num + label_data[i];
+        dx_data[index] = -dy_data[i] / x_data[index];
+      }
+    }
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
--- a/paddle/operators/dropout_op.cc
+++ b/paddle/operators/dropout_op.cc
@ -0,0 +1,113 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+   http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License. */
+
+#include "paddle/operators/dropout_op.h"
+
+namespace paddle {
+namespace operators {
+
+using framework::Tensor;
+using framework::LoDTensor;
+
+class DropoutOp : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+
+ protected:
+  void InferShape(const framework::InferShapeContext &ctx) const override {
+    PADDLE_ENFORCE_NOT_NULL(ctx.InputVar("X"), "Input(X) must not be null.");
+    PADDLE_ENFORCE_GE(ctx.Attr<float>("dropout_prob"), 0);
+    PADDLE_ENFORCE_LE(ctx.Attr<float>("dropout_prob"), 1);
+    // TODO(xinghai-sun): remove this check after swtiching to bool
+    PADDLE_ENFORCE(ctx.Attr<int>("is_training") == 0 ||
+                   ctx.Attr<int>("is_training") == 1);
+
+    auto dims = ctx.Input<Tensor>("X")->dims();
+    ctx.Output<LoDTensor>("Out")->Resize(dims);
+    if (ctx.Attr<int>("is_training") == 1) {
+      ctx.Output<LoDTensor>("Mask")->Resize(dims);
+    }
+  }
+};
+
+template <typename AttrType>
+class DropoutOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  DropoutOpMaker(framework::OpProto *proto,
+                 framework::OpAttrChecker *op_checker)
+      : OpProtoAndCheckerMaker(proto, op_checker) {
+    AddAttr<AttrType>("dropout_prob", "Probability of setting units to zero.")
+        .SetDefault(.5f);
+    // TODO(xinghai-sun): use bool for is_training after bool is supported.
+    AddAttr<int>("is_training", "Whether in training phase.").SetDefault(1);
+    AddAttr<int>("seed", "Dropout random seed.").SetDefault(0);
+    AddInput("X", "The input of dropout op.");
+    AddOutput("Out", "The output of dropout op.");
+    AddOutput("Mask", "The random sampled dropout mask.").AsIntermediate();
+
+    AddComment(R"DOC(
+Dropout Operator.
+
+"Dropout" refers to randomly dropping out units in a nerual network. It is a
+regularization technique for reducing overfitting by preventing neuron
+co-adaption during training. The dropout operator randomly set (according to
+the given dropout probability) the outputs of some units to zero, while others
+being set to their inputs.
+)DOC");
+  }
+};
+
+template <typename AttrType>
+class DropoutOpGrad : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+
+ protected:
+  void InferShape(const framework::InferShapeContext &ctx) const override {
+    PADDLE_ENFORCE_EQ(ctx.Attr<int>("is_training"), 1,
+                      "GradOp is only callable when is_training is true");
+
+    PADDLE_ENFORCE_NOT_NULL(ctx.InputVar("X"), "Input(X) must not be null.");
+    PADDLE_ENFORCE_NOT_NULL(ctx.InputVar("Mask"), "Mask must not be null.");
+    PADDLE_ENFORCE_NOT_NULL(ctx.InputVar(framework::GradVarName("Out")),
+                            "Input(Out@GRAD) must not be null.");
+
+    PADDLE_ENFORCE_GE(ctx.Attr<AttrType>("dropout_prob"), 0);
+    PADDLE_ENFORCE_LE(ctx.Attr<AttrType>("dropout_prob"), 1);
+    // TODO(xinghai-sun): remove this check after swtiching to bool
+    PADDLE_ENFORCE(ctx.Attr<int>("is_training") == 0 ||
+                   ctx.Attr<int>("is_training") == 1);
+    auto x_dims = ctx.Input<Tensor>("X")->dims();
+    auto out_dims = ctx.Input<Tensor>(framework::GradVarName("Out"))->dims();
+    PADDLE_ENFORCE_EQ(x_dims, out_dims,
+                      "Dimensions of Input(X) and Out@Grad must be the same.");
+    auto mask_dims = ctx.Input<Tensor>("Mask")->dims();
+    PADDLE_ENFORCE_EQ(x_dims, mask_dims,
+                      "Dimensions of Input(X) and Mask must be the same.");
+
+    auto *x_grad = ctx.Output<LoDTensor>(framework::GradVarName("X"));
+    x_grad->Resize(x_dims);
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+REGISTER_OP(dropout, ops::DropoutOp, ops::DropoutOpMaker<float>, dropout_grad,
+            ops::DropoutOpGrad<float>);
+REGISTER_OP_CPU_KERNEL(
+    dropout, ops::CPUDropoutKernel<paddle::platform::CPUPlace, float, float>);
+REGISTER_OP_CPU_KERNEL(
+    dropout_grad, ops::DropoutGradKernel<paddle::platform::CPUPlace, float>);
--- a/paddle/operators/dropout_op.cu
+++ b/paddle/operators/dropout_op.cu
@ -0,0 +1,86 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+   http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License. */
+
+#define EIGEN_USE_GPU
+#include <thrust/device_ptr.h>
+#include <thrust/iterator/counting_iterator.h>
+#include <thrust/random.h>
+#include <thrust/transform.h>
+#include "paddle/operators/dropout_op.h"
+
+namespace paddle {
+namespace operators {
+
+template <typename T, typename AttrType>
+struct MaskGenerator {
+  AttrType dropout_prob;
+  int seed;
+
+  __host__ __device__ MaskGenerator(AttrType dropout_prob, int seed)
+      : dropout_prob(dropout_prob), seed(seed) {}
+
+  __host__ __device__ T operator()(const unsigned int n) const {
+    thrust::minstd_rand rng;
+    rng.seed(seed);
+    thrust::uniform_real_distribution<AttrType> dist(0, 1);
+    rng.discard(n);
+    if (dist(rng) < dropout_prob) {
+      return static_cast<T>(0);
+    } else {
+      return static_cast<T>(1);
+    }
+  }
+};
+
+// It seems that Eigen::Tensor::setRandom in GPU will SEGFAULT.
+// Use std::random and thrust::random(thrust is a std library in CUDA) to
+// implement uniform random.
+template <typename Place, typename T, typename AttrType>
+class GPUDropoutKernel : public framework::OpKernel {
+ public:
+  void Compute(const framework::ExecutionContext& context) const override {
+    auto* x = context.Input<Tensor>("X");
+    auto* y = context.Output<Tensor>("Out");
+    y->mutable_data<T>(context.GetPlace());
+    AttrType dropout_prob = context.Attr<AttrType>("dropout_prob");
+
+    auto X = EigenMatrix<T>::Reshape(*x, 1);
+    auto Y = EigenMatrix<T>::Reshape(*y, 1);
+
+    auto place = context.GetEigenDevice<Place>();
+    if (context.Attr<int>("is_training") == 1) {
+      auto* mask = context.Output<Tensor>("Mask");
+      auto* mask_data = mask->mutable_data<T>(context.GetPlace());
+      int size = framework::product(mask->dims());
+      int seed = context.Attr<int>("seed");
+      thrust::counting_iterator<unsigned int> index_sequence_begin(0);
+      thrust::transform(index_sequence_begin, index_sequence_begin + size,
+                        thrust::device_ptr<T>(mask_data),
+                        MaskGenerator<T, AttrType>(dropout_prob, seed));
+      auto M = EigenMatrix<T>::Reshape(*mask, 1);
+      Y.device(place) = X * M;
+    } else {
+      Y.device(place) = X * dropout_prob;
+    }
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+REGISTER_OP_GPU_KERNEL(
+    dropout, ops::GPUDropoutKernel<paddle::platform::GPUPlace, float, float>);
+REGISTER_OP_GPU_KERNEL(
+    dropout_grad, ops::DropoutGradKernel<paddle::platform::GPUPlace, float>);
--- a/paddle/operators/dropout_op.h
+++ b/paddle/operators/dropout_op.h
@ -0,0 +1,86 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+   http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License. */
+
+#pragma once
+#include <random>
+#include "paddle/framework/eigen.h"
+#include "paddle/framework/op_registry.h"
+
+namespace paddle {
+namespace operators {
+
+using Tensor = framework::Tensor;
+template <typename T, int MajorType = Eigen::RowMajor,
+          typename IndexType = Eigen::DenseIndex>
+using EigenMatrix = framework::EigenMatrix<T, MajorType, IndexType>;
+
+template <typename Place, typename T, typename AttrType>
+class CPUDropoutKernel : public framework::OpKernel {
+ public:
+  void Compute(const framework::ExecutionContext& context) const override {
+    auto* x = context.Input<Tensor>("X");
+    auto* y = context.Output<Tensor>("Out");
+    const auto* x_data = x->data<T>();
+    auto* y_data = y->mutable_data<T>(context.GetPlace());
+    AttrType dropout_prob = context.Attr<AttrType>("dropout_prob");
+
+    if (context.Attr<int>("is_training") == 1) {
+      auto* mask = context.Output<Tensor>("Mask");
+      auto* mask_data = mask->mutable_data<T>(context.GetPlace());
+      int seed = context.Attr<int>("seed");
+      std::minstd_rand engine;
+      engine.seed(seed);
+      std::uniform_real_distribution<AttrType> dist(0, 1);
+      size_t size = framework::product(mask->dims());
+      for (size_t i = 0; i < size; ++i) {
+        if (dist(engine) < dropout_prob) {
+          mask_data[i] = 0;
+          y_data[i] = 0;
+        } else {
+          mask_data[i] = 1;
+          y_data[i] = x_data[i];
+        }
+      }
+    } else {
+      auto X = EigenMatrix<T>::Reshape(*x, 1);
+      auto Y = EigenMatrix<T>::Reshape(*y, 1);
+      auto place = context.GetEigenDevice<Place>();
+      Y.device(place) = X * dropout_prob;
+    }
+  }
+};
+
+template <typename Place, typename T>
+class DropoutGradKernel : public framework::OpKernel {
+ public:
+  void Compute(const framework::ExecutionContext& context) const override {
+    PADDLE_ENFORCE_EQ(context.Attr<int>("is_training"), 1,
+                      "GradOp is only callable when is_training is true");
+
+    auto* grad_x = context.Output<Tensor>(framework::GradVarName("X"));
+    auto* grad_y = context.Input<Tensor>(framework::GradVarName("Out"));
+    auto* mask = context.Input<Tensor>("Mask");
+    grad_x->mutable_data<T>(context.GetPlace());
+
+    auto M = EigenMatrix<T>::Reshape(*mask, 1);
+    auto dX = EigenMatrix<T>::Reshape(*grad_x, 1);
+    auto dY = EigenMatrix<T>::Reshape(*grad_y, 1);
+
+    auto place = context.GetEigenDevice<Place>();
+    dX.device(place) = dY * M;
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
--- a/paddle/operators/math/math_function.cc
+++ b/paddle/operators/math/math_function.cc
@ -19,12 +19,13 @@ namespace operators {
 namespace math {

 template <>
-void gemm<platform::CPUPlace, float>(const CBLAS_TRANSPOSE transA,
+void gemm<platform::CPUPlace, float>(const platform::DeviceContext& context,
+                                     const CBLAS_TRANSPOSE transA,
                                     const CBLAS_TRANSPOSE transB, const int M,
                                     const int N, const int K,
                                     const float alpha, const float* A,
-                                     const float* B, const float beta, float* C,
-                                     platform::DeviceContext* context) {
+                                     const float* B, const float beta,
+                                     float* C) {
  int lda = (transA == CblasNoTrans) ? K : M;
  int ldb = (transB == CblasNoTrans) ? N : K;
  int ldc = N;
@ -33,13 +34,13 @@ void gemm<platform::CPUPlace, float>(const CBLAS_TRANSPOSE transA,
 }

 template <>
-void gemm<platform::CPUPlace, double>(const CBLAS_TRANSPOSE transA,
+void gemm<platform::CPUPlace, double>(const platform::DeviceContext& context,
+                                      const CBLAS_TRANSPOSE transA,
                                      const CBLAS_TRANSPOSE transB, const int M,
                                      const int N, const int K,
                                      const double alpha, const double* A,
                                      const double* B, const double beta,
-                                      double* C,
-                                      platform::DeviceContext* context) {
+                                      double* C) {
  int lda = (transA == CblasNoTrans) ? K : M;
  int ldb = (transB == CblasNoTrans) ? N : K;
  int ldc = N;
@ -48,13 +49,10 @@ void gemm<platform::CPUPlace, double>(const CBLAS_TRANSPOSE transA,
 }

 template <>
-void matmul<platform::CPUPlace, float>(const framework::Tensor& matrix_a,
-                                       bool trans_a,
-                                       const framework::Tensor& matrix_b,
-                                       bool trans_b, float alpha,
-                                       framework::Tensor* matrix_out,
-                                       float beta,
-                                       platform::DeviceContext* context) {
+void matmul<platform::CPUPlace, float>(
+    const platform::DeviceContext& context, const framework::Tensor& matrix_a,
+    bool trans_a, const framework::Tensor& matrix_b, bool trans_b, float alpha,
+    framework::Tensor* matrix_out, float beta) {
  auto dim_a = matrix_a.dims();
  auto dim_b = matrix_b.dims();
  auto dim_out = matrix_out->dims();
@ -74,18 +72,15 @@ void matmul<platform::CPUPlace, float>(const framework::Tensor& matrix_a,
  CBLAS_TRANSPOSE transB = (trans_b == false) ? CblasNoTrans : CblasTrans;

  gemm<platform::CPUPlace, float>(
-      transA, transB, M, N, K, alpha, matrix_a.data<float>(),
-      matrix_b.data<float>(), beta, matrix_out->data<float>(), context);
+      context, transA, transB, M, N, K, alpha, matrix_a.data<float>(),
+      matrix_b.data<float>(), beta, matrix_out->data<float>());
 }

 template <>
-void matmul<platform::CPUPlace, double>(const framework::Tensor& matrix_a,
-                                        bool trans_a,
-                                        const framework::Tensor& matrix_b,
-                                        bool trans_b, double alpha,
-                                        framework::Tensor* matrix_out,
-                                        double beta,
-                                        platform::DeviceContext* context) {
+void matmul<platform::CPUPlace, double>(
+    const platform::DeviceContext& context, const framework::Tensor& matrix_a,
+    bool trans_a, const framework::Tensor& matrix_b, bool trans_b, double alpha,
+    framework::Tensor* matrix_out, double beta) {
  auto dim_a = matrix_a.dims();
  auto dim_b = matrix_b.dims();
  auto dim_out = matrix_out->dims();
@ -105,8 +100,8 @@ void matmul<platform::CPUPlace, double>(const framework::Tensor& matrix_a,
  CBLAS_TRANSPOSE transB = (trans_b == false) ? CblasNoTrans : CblasTrans;

  gemm<platform::CPUPlace, double>(
-      transA, transB, M, N, K, alpha, matrix_a.data<double>(),
-      matrix_b.data<double>(), beta, matrix_out->data<double>(), context);
+      context, transA, transB, M, N, K, alpha, matrix_a.data<double>(),
+      matrix_b.data<double>(), beta, matrix_out->data<double>());
 }

 }  // namespace math
--- a/Show More
+++ b/Show More