support rectangle padding, stride, window and input for PoolProjection (#115)

* support rectangle padding, stride, window and input for PoolProjection

* Address review comments:
1. Remove start.
2. Refine img_pool_a/b.conf for test_NetworkCompare.
3. Split the unit test.

* Modify the test in img_layers.py
qingqing01 authored 8 years ago, committed by hedaoyuan
parent 8a044d2e2d
commit 191fafe355

@@ -84,16 +84,23 @@ extern void hl_expand_feature2col(
* @param[in] width image width.
* @param[in] pooledH output image height.
* @param[in] pooledW output image width.
* @param[in] sizeX size of pooling window.
* @param[in] stride pooling stride.
* @param[in] start pooling start.
* @param[in] sizeX width of pooling window.
* @param[in] sizeY height of pooling window.
* @param[in] strideH pooling stride height.
* @param[in] strideW pooling stride width.
* @param[in] paddingH padding height.
* @param[in] paddingW padding width.
* @param[out] tgtData output data.
*
*/
extern void hl_maxpool_forward(
int frameCnt, const real* inputData, int channels,
int height, int width, int pooledH, int pooledW,
int sizeX, int stride, int start, real* tgtData);
const int frameCnt, const real* inputData,
const int channels,
const int height, const int width,
const int pooledH, const int pooledW,
const int sizeX, const int sizeY,
const int strideH, const int strideW,
const int paddingH, const int paddingW, real* tgtData);
/**
* @brief Maximum pool backward.
@@ -107,21 +114,28 @@ extern void hl_maxpool_forward(
* @param[in] width image width.
* @param[in] pooledH output image height.
* @param[in] pooledW output image width.
* @param[in] sizeX size of pooling window.
* @param[in] stride pooling stride.
* @param[in] start pooling start.
* @param[out] targetGrad output grad.
* @param[in] sizeX width of pooling window.
* @param[in] sizeY height of pooling window.
* @param[in] strideH pooling stride height.
* @param[in] strideW pooling stride width.
* @param[in] scaleA scale.
* @param[in] scaleB scale.
* @param[in] paddingH padding height.
* @param[in] paddingW padding width.
* @param[out] targetGrad output grad.
*
*/
extern void hl_maxpool_backward(
int frameCnt, const real* inputData,
const int frameCnt, const real* inputData,
const real* outData, const real* outGrad,
int channels, int height, int width,
int pooledH, int pooledW, int sizeX,
int stride, int start, real* targetGrad,
real scaleA, real scaleB);
const int channels, const int height,
const int width,
const int pooledH, const int pooledW,
const int sizeX, const int sizeY,
const int strideH, const int strideW,
const int paddingH, const int paddingW,
real scaleA, real scaleB,
real* targetGrad);
/**
* @brief Average pool forward.
@@ -133,16 +147,23 @@ extern void hl_maxpool_backward(
* @param[in] width image width.
* @param[in] pooledH output image height.
* @param[in] pooledW output image width.
* @param[in] sizeX size of pooling window.
* @param[in] stride pooling stride.
* @param[in] start pooling start.
* @param[in] sizeX width of pooling window.
* @param[in] sizeY height of pooling window.
* @param[in] strideH pooling stride height.
* @param[in] strideW pooling stride width.
* @param[in] paddingH padding height.
* @param[in] paddingW padding width.
* @param[out] tgtData output data.
*
*/
extern void hl_avgpool_forward(
int frameCnt, const real* inputData, int channels,
int height, int width, int pooledH, int pooledW,
int sizeX, int stride, int start, real* tgtData);
const int frameCnt, const real* inputData,
const int channels,
const int height, const int width,
const int pooledH, const int pooledW,
const int sizeX, const int sizeY,
const int strideH, const int strideW,
const int paddingH, const int paddingW, real* tgtData);
/**
* @brief Average pool backward.
@@ -154,20 +175,27 @@ extern void hl_avgpool_forward(
* @param[in] width image width.
* @param[in] pooledH output image height.
* @param[in] pooledW output image width.
* @param[in] sizeX size of pooling window.
* @param[in] stride pooling stride.
* @param[in] start pooling start.
* @param[out] backGrad output grad.
* @param[in] sizeX width of pooling window.
* @param[in] sizeY height of pooling window.
* @param[in] strideH pooling stride height.
* @param[in] strideW pooling stride width.
* @param[in] paddingH padding height.
* @param[in] paddingW padding width.
* @param[in] scaleA scale.
* @param[in] scaleB scale.
* @param[out] backGrad output grad.
*
*/
extern void hl_avgpool_backward(
int frameCnt, const real* outGrad,
int channels, int height, int width,
int pooledH, int pooledW, int sizeX,
int stride, int start, real* backGrad,
real scaleA, real scaleB);
const int frameCnt, const real* outGrad,
const int channels, const int height,
const int width,
const int pooledH, const int pooledW,
const int sizeX, const int sizeY,
const int strideH, const int strideW,
int paddingH, int paddingW,
real scaleA, real scaleB,
real* backGrad);
/**
* @brief Cross-map-response normalize forward.

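The coordinate mapping implied by the new rectangular parameters (sizeX/sizeY for window width/height, strideH/strideW, paddingH/paddingW) is easiest to see as a plain CPU loop. Below is a minimal single-channel sketch, assuming padded windows are clipped to the valid input region; the function name and the real typedef are illustrative, and this is not the actual CUDA kernel:

#include <algorithm>
#include <limits>

typedef float real;  // stand-in for Paddle's real type (single precision assumed)

// Illustrative reference for one channel: rectangular max pooling.
// Output cell (ph, pw) reads the window whose top-left corner is
// (ph * strideH - paddingH, pw * strideW - paddingW), clipped to the image.
void maxPoolRef(const real* in, int height, int width,
                int pooledH, int pooledW,
                int sizeX, int sizeY, int strideH, int strideW,
                int paddingH, int paddingW, real* out) {
  for (int ph = 0; ph < pooledH; ++ph) {
    for (int pw = 0; pw < pooledW; ++pw) {
      int hstart = std::max(ph * strideH - paddingH, 0);
      int wstart = std::max(pw * strideW - paddingW, 0);
      int hend = std::min(ph * strideH - paddingH + sizeY, height);
      int wend = std::min(pw * strideW - paddingW + sizeX, width);
      real maxv = -std::numeric_limits<real>::max();
      for (int h = hstart; h < hend; ++h) {
        for (int w = wstart; w < wend; ++w) {
          maxv = std::max(maxv, in[h * width + w]);
        }
      }
      out[ph * pooledW + pw] = maxv;
    }
  }
}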
@@ -38,29 +38,45 @@ inline void hl_expand_feature2col(
real* dataCol) {}
inline void hl_maxpool_forward(
int frameCnt, const real* inputData, int channels,
int height, int width, int pooledH, int pooledW,
int sizeX, int stride, int start, real* tgtData) {}
const int frameCnt, const real* inputData,
const int channels,
const int height, const int width,
const int pooledH, const int pooledW,
const int sizeX, const int sizeY,
const int strideH, const int strideW,
const int paddingH, const int paddingW, real* tgtData) {}
inline void hl_maxpool_backward(
int frameCnt, const real* inputData,
const int frameCnt, const real* inputData,
const real* outData, const real* outGrad,
int channels, int height, int width,
int pooledH, int pooledW, int sizeX,
int stride, int start, real* targetGrad,
real scaleA, real scaleB) {}
const int channels, const int height,
const int width,
const int pooledH, const int pooledW,
const int sizeX, const int sizeY,
const int strideH, const int strideW,
const int paddingH, const int paddingW,
real scaleA, real scaleB,
real* targetGrad) {}
inline void hl_avgpool_forward(
int frameCnt, const real* inputData, int channels,
int height, int width, int pooledH, int pooledW,
int sizeX, int stride, int start, real* tgtData) {}
const int frameCnt, const real* inputData,
const int channels,
const int height, const int width,
const int pooledH, const int pooledW,
const int sizeX, const int sizeY,
const int strideH, const int strideW,
const int paddingH, const int paddingW, real* tgtData) {}
inline void hl_avgpool_backward(
int frameCnt, const real* outGrad,
int channels, int height, int width,
int pooledH, int pooledW, int sizeX,
int stride, int start, real* backGrad,
real scaleA, real scaleB) {}
const int frameCnt, const real* outGrad,
const int channels, const int height,
const int width,
const int pooledH, const int pooledW,
const int sizeX, const int sizeY,
const int strideH, const int strideW,
int paddingH, int paddingW,
real scaleA, real scaleB,
real* backGrad) {}
inline void hl_CMRNorm_forward(
size_t frameCnt, const real* in, real* scale, real* out,

File diff suppressed because it is too large.

@@ -51,7 +51,6 @@ bool CudnnPoolLayer::init(const LayerMap &layerMap,
PoolLayer::init(layerMap, parameterMap);
CHECK(useGpu_) << "CudnnPoolLayer only supports GPU";
CHECK_EQ(start_, 0) << poolType_ << " does not support 'start'";
hl_create_tensor_descriptor(&inputDesc_);
hl_create_tensor_descriptor(&outputDesc_);

@@ -56,16 +56,6 @@ public:
void reshape(int batchSize);
virtual void forward(PassType passType);
virtual void backward(const UpdateCallback& callback = nullptr);
/**
* Calculate the output size according to the pooling window size.
*/
int outputSize(int imageSize, int windowSize, int padding, int stride) {
int outputSize;
outputSize =
(imageSize - windowSize + 2 * padding + stride - 1) / stride + 1;
return outputSize;
}
};
} // namespace paddle

@@ -35,7 +35,6 @@ bool PoolLayer::init(const LayerMap& layerMap,
poolType_ = conf.pool_type();
channels_ = conf.channels();
sizeX_ = conf.size_x();
start_ = conf.start();
stride_ = conf.stride();
outputX_ = conf.output_x();
imgSize_ = conf.img_size();
@@ -47,22 +46,6 @@ bool PoolLayer::init(const LayerMap& layerMap,
confPaddingY_ = conf.has_padding_y() ? conf.padding_y() : conf.padding();
outputY_ = conf.has_output_y() ? conf.output_y() : conf.output_x();
bool cudnnTypeCheck = true;
#ifndef PADDLE_ONLY_CPU
cudnnTypeCheck = !CudnnPoolLayer::typeCheck(poolType_);
#endif
if ((sizeY_ != sizeX_ || imgSizeY_ != imgSize_ || strideY_ != stride_ ||
confPaddingY_ != confPadding_ || outputY_ != outputX_) &&
cudnnTypeCheck) {
LOG(FATAL) << poolType_ << " does not support non-square "
"filter, image, stride or padding";
}
if (confPadding_ != 0 && cudnnTypeCheck) {
LOG(FATAL) << poolType_ << " does not support 'padding'";
}
return true;
}

@@ -28,7 +28,7 @@ namespace paddle {
class PoolLayer : public Layer {
protected:
size_t channels_, sizeX_, stride_, outputX_, imgSize_;
int start_, confPadding_;
int confPadding_;
size_t sizeY_;
size_t imgSizeY_;
@@ -47,6 +47,16 @@ public:
static Layer* create(const LayerConfig& config);
virtual bool init(const LayerMap& layerMap, const ParameterMap& parameterMap);
/**
* Calculate the output size according to the window size and padding size.
*/
int outputSize(int imageSize, int windowSize, int padding, int stride) {
int outputSize;
outputSize =
(imageSize - windowSize + 2 * padding + stride - 1) / stride + 1;
return outputSize;
}
};
} // namespace paddle
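As a quick check of this formula, take the values used by setPoolConfig in the layer test below: imageSize = 14, windowSize = 3, padding = 0, stride = 2 gives

outputSize = (14 - 3 + 2*0 + 2 - 1) / 2 + 1 = 12 / 2 + 1 = 7

so a 14x14 input maps to a 7x7 pooled output. The "+ stride - 1" term makes the integer division round up, so a partial window at the border still produces an output element.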

@@ -25,13 +25,15 @@ size_t PoolProjectionLayer::getSize() {
imgSizeH_ = inputLayers_[0]->getOutput().getFrameHeight();
imgSizeW_ = inputLayers_[0]->getOutput().getFrameWidth();
if (imgSizeH_ == 0) {
imgSizeH_ = imgSize_;
imgSizeH_ = imgSizeY_;
}
if (imgSizeW_ == 0) {
imgSizeW_ = imgSize_;
}
outputH_ = 1 + (imgSizeH_ - start_ - sizeX_ + stride_ - 1) / stride_;
outputW_ = 1 + (imgSizeW_ - start_ - sizeX_ + stride_ - 1) / stride_;
outputH_ = outputSize(imgSizeH_, sizeY_, confPaddingY_, strideY_);
outputW_ = outputSize(imgSizeW_, sizeX_, confPadding_, stride_);
layerSize = outputH_ * outputW_ * channels_;
getOutput().setFrameHeight(outputH_);
@@ -51,8 +53,9 @@ void MaxPoolProjectionLayer::forward(PassType passType) {
MatrixPtr outV = getOutputValue();
outV->maxPoolForward(*input, imgSizeH_, imgSizeW_, channels_, sizeX_, start_,
stride_, outputH_, outputW_);
outV->maxPoolForward(*input, imgSizeH_, imgSizeW_, channels_,
sizeX_, sizeY_, strideY_, stride_,
outputH_, outputW_, confPaddingY_, confPadding_);
}
void MaxPoolProjectionLayer::backward(const UpdateCallback& callback) {
@@ -69,7 +72,9 @@ void MaxPoolProjectionLayer::backward(const UpdateCallback& callback) {
MatrixPtr inputGrad = getInputGrad(0);
inputGrad->maxPoolBackward(*inputV, imgSizeH_, imgSizeW_, *outGrad, *outV,
sizeX_, start_, stride_, outputH_, outputW_, 1, 1);
sizeX_, sizeY_,
strideY_, stride_, outputH_, outputW_, 1, 1,
confPaddingY_, confPadding_);
}
void AvgPoolProjectionLayer::forward(PassType passType) {
@@ -84,8 +89,9 @@ void AvgPoolProjectionLayer::forward(PassType passType) {
MatrixPtr outV = getOutputValue();
outV->avgPoolForward(*input, imgSizeH_, imgSizeW_, channels_, sizeX_, start_,
stride_, outputH_, outputW_);
outV->avgPoolForward(*input, imgSizeH_, imgSizeW_, channels_,
sizeX_, sizeY_, strideY_, stride_,
outputH_, outputW_, confPaddingY_, confPadding_);
}
void AvgPoolProjectionLayer::backward(const UpdateCallback& callback) {
@@ -97,7 +103,9 @@ void AvgPoolProjectionLayer::backward(const UpdateCallback& callback) {
/* Do derivation */
MatrixPtr outputGrad = getOutputGrad();
MatrixPtr inputGrad = getInputGrad(0);
inputGrad->avgPoolBackward(*outputGrad, imgSizeH_, imgSizeW_, sizeX_, start_,
stride_, outputH_, outputW_, 1, 1);
inputGrad->avgPoolBackward(*outputGrad, imgSizeH_, imgSizeW_,
sizeX_, sizeY_, strideY_, stride_,
outputH_, outputW_, 1, 1,
confPaddingY_, confPadding_);
}
} // namespace paddle

@@ -0,0 +1,46 @@
#edit-mode: -*- python -*-
# Copyright (c) 2016 Baidu, Inc. All Rights Reserved
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from paddle.trainer_config_helpers import *
settings(batch_size=10)
data = data_layer(name="input", size=8*16*16)
conv = img_conv_layer(input=data, filter_size=1, filter_size_y=1,
num_channels=8, num_filters=8, stride=1)
maxpool = img_pool_layer(input=conv,
pool_size=3,
pool_size_y=5,
num_channels=8,
stride=1,
stride_y=2,
padding=1,
padding_y=2,
img_width=16,
pool_type=MaxPooling(),
)
avgpool = img_pool_layer(input=conv,
pool_size=3,
pool_size_y=5,
num_channels=8,
stride=1,
stride_y=2,
padding=1,
padding_y=2,
img_width=16,
pool_type=AvgPooling(),
)
outputs([maxpool, avgpool])
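Plugging these settings into the outputSize formula above (the conv output is 16x16; window 3 wide by 5 high, stride 1x2, padding 1x2):

outputW = (16 - 3 + 2*1 + 1 - 1) / 1 + 1 = 16
outputH = (16 - 5 + 2*2 + 2 - 1) / 2 + 1 = 9

so each pooling branch should emit 9x16 maps over 8 channels. The cudnn configuration below is expected to produce the same shapes, which is what the test_NetworkCompare img_pool case verifies.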

@@ -0,0 +1,44 @@
#edit-mode: -*- python -*-
# Copyright (c) 2016 Baidu, Inc. All Rights Reserved
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from paddle.trainer_config_helpers import *
settings(batch_size=10)
data = data_layer(name="input", size=8*16*16)
conv = img_conv_layer(input=data, filter_size=1, filter_size_y=1,
num_channels=8, num_filters=8, stride=1)
maxpool = img_pool_layer(input=conv,
pool_size=3,
pool_size_y=5,
num_channels=8,
stride=1,
stride_y=2,
padding=1,
padding_y=2,
pool_type=CudnnMaxPooling(),
)
avgpool = img_pool_layer(input=conv,
pool_size=3,
pool_size_y=5,
num_channels=8,
stride=1,
stride_y=2,
padding=1,
padding_y=2,
pool_type=CudnnAvgPooling(),
)
outputs([maxpool, avgpool])

@@ -791,21 +791,24 @@ void setPoolConfig(TestConfig* config, PoolConfig* pool,
(*config).biasSize = 0;
(*config).layerConfig.set_type("pool");
(*config).layerConfig.set_num_filters(16);
(*config).layerConfig.set_partial_sum(1);
(*config).layerConfig.set_shared_biases(true);
int kw = 3, kh = 3;
int pw = 0, ph = 0;
int sw = 2, sh = 2;
pool->set_pool_type(poolType);
pool->set_channels(16);
pool->set_size_x(3);
if (poolType == "cudnn-max-pool" || poolType == "cudnn-avg-pool") {
pool->set_padding(0);
} else {
pool->set_size_x(kw);
pool->set_size_y(kh);
pool->set_start(0);
}
pool->set_stride(2);
pool->set_output_x((pool->img_size() - pool->start() - pool->size_x()) /
((float)pool->stride()) +
1.5);
pool->set_padding(pw);
pool->set_padding_y(ph);
pool->set_stride(sw);
pool->set_stride_y(sh);
int ow = (pool->img_size() - kw + 2 * pw + sw - 1) / sw + 1;
int oh = (pool->img_size_y() - kh + 2 * ph + sh - 1) / sh + 1;
pool->set_output_x(ow);
pool->set_output_y(oh);
}
void testPoolLayer(const string& poolType, bool trans, bool useGpu) {
@@ -814,9 +817,10 @@ void testPoolLayer(const string& poolType, bool trans, bool useGpu) {
LayerInputConfig* input = config.layerConfig.add_inputs();
PoolConfig* pool = input->mutable_pool_conf();
setPoolConfig(&config, pool, poolType);
pool->set_img_size(14);
config.layerConfig.set_size(pool->output_x() * pool->output_x() *
pool->set_img_size_y(14);
setPoolConfig(&config, pool, poolType);
config.layerConfig.set_size(pool->output_x() * pool->output_y() *
pool->channels());
testLayerGrad(config, "pool", 100, trans, useGpu);
@@ -829,11 +833,11 @@ void testPoolLayer2(const string& poolType, bool trans, bool useGpu) {
LayerInputConfig* input = config.layerConfig.add_inputs();
PoolConfig* pool = input->mutable_pool_conf();
setPoolConfig(&config, pool, poolType);
pool->set_size_y(4);
pool->set_stride_y(3);
pool->set_img_size(10);
pool->set_img_size_y(20);
setPoolConfig(&config, pool, poolType);
pool->set_output_y((pool->img_size_y() - pool->start() - pool->size_y()) /
((float)pool->stride_y()) +
1.5);
@@ -1252,8 +1256,6 @@ TEST(Layer, MultiplexLayer) {
}
}
int main(int argc, char** argv) {
testing::InitGoogleTest(&argc, argv);
initMain(argc, argv);

@@ -116,6 +116,8 @@ void calcGradient(DataIn& in, DataOut& out, const std::string& configPath) {
gradientMachine->start(trainer.getConfig(), nullptr);
gradientMachine->forward(in.inArgs, &outArgs, PASS_TRAIN);
for (size_t i = 0; i < in.outGrads.size(); i++) {
// If all the layers in the config have no parameters and NeedGradient()
// is not set, outArgs[i].grad will be nullptr.
outArgs[i].grad->copyFrom(*in.outGrads[i]);
}
gradientMachine->backward();
@@ -225,6 +227,18 @@ TEST(Compare, concat_table) {
compareNetwork(config_file_a, config_file_b);
}
#ifndef PADDLE_ONLY_CPU
TEST(Compare, img_pool) {
std::string config_file_a = "./gserver/tests/img_pool_a.conf";
std::string config_file_b = "./gserver/tests/img_pool_b.conf";
bool useGpu = FLAGS_use_gpu;
FLAGS_use_gpu = true;
compareNetwork(config_file_a, config_file_b);
FLAGS_use_gpu = useGpu;
}
#endif
P_DEFINE_string(config_file_a, "", "config of one network to compare");
P_DEFINE_string(config_file_b, "", "config of another network to compare");
TEST(Compare, network) {

File diff suppressed because it is too large.

@@ -742,31 +742,37 @@ public:
*/
virtual void maxPoolForward(Matrix& inputMat, size_t imgSizeH,
size_t imgSizeW, size_t channels, size_t sizeX,
int start_, size_t stride, size_t outputH,
size_t outputW) {
size_t sizeY, size_t strideH, size_t strideW,
size_t outputH, size_t outputW,
size_t paddingH, size_t paddingW) {
LOG(FATAL) << "Not implemented";
}
/// Pooling backward operation.
virtual void maxPoolBackward(Matrix& image, size_t imgSizeH, size_t imgSizeW,
Matrix& outGrad, Matrix& outV, size_t sizeX,
int start, size_t stride, size_t outputH,
size_t outputW, real scaleTargets,
real scaleOutput) {
size_t sizeY, size_t strideH, size_t strideW,
size_t outputH, size_t outputW,
real scaleTargets, real scaleOutput,
size_t paddingH, size_t paddingW) {
LOG(FATAL) << "Not implemented";
}
/// Pooling forward operation; calculates the average over the sizeX * sizeY window.
virtual void avgPoolForward(Matrix& input, size_t imgSizeH, size_t imgSizeW,
size_t channels, size_t sizeX, int start,
size_t stride, size_t outputH, size_t outputW) {
size_t channels, size_t sizeX, size_t sizeY,
size_t strideH, size_t strideW,
size_t outputH, size_t outputW,
size_t paddingH, size_t paddingW) {
LOG(FATAL) << "Not implemented";
}
virtual void avgPoolBackward(Matrix& input, size_t imgSizeH, size_t imgSizeW,
size_t sizeX, int start, size_t stride,
size_t sizeX, size_t sizeY,
size_t strideH, size_t strideW,
size_t outputH, size_t outputW,
real scaleTargets, real scaleOutput) {
real scaleTargets, real scaleOutput,
size_t paddingH, size_t paddingW) {
LOG(FATAL) << "Not implemented";
}
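The average variant has the same loop structure as the max-pool sketch earlier, with the max replaced by a sum. A single-channel sketch follows; note that dividing by the clipped window area is an assumption here, and the actual kernel may instead divide by the full sizeX * sizeY window:

#include <algorithm>

typedef float real;  // stand-in for Paddle's real type (single precision assumed)

// Illustrative reference for one channel: rectangular average pooling.
void avgPoolRef(const real* in, int height, int width,
                int pooledH, int pooledW,
                int sizeX, int sizeY, int strideH, int strideW,
                int paddingH, int paddingW, real* out) {
  for (int ph = 0; ph < pooledH; ++ph) {
    for (int pw = 0; pw < pooledW; ++pw) {
      int hstart = std::max(ph * strideH - paddingH, 0);
      int wstart = std::max(pw * strideW - paddingW, 0);
      int hend = std::min(ph * strideH - paddingH + sizeY, height);
      int wend = std::min(pw * strideW - paddingW + sizeX, width);
      real sum = 0;
      for (int h = hstart; h < hend; ++h) {
        for (int w = wstart; w < wend; ++w) {
          sum += in[h * width + w];
        }
      }
      // Assumption: average over the clipped window; see caveat above.
      out[ph * pooledW + pw] = sum / ((hend - hstart) * (wend - wstart));
    }
  }
}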
@@ -1131,21 +1137,30 @@ public:
real alpha = 1.0f, real beta = 0.0f);
void maxPoolForward(Matrix& inputMat, size_t imgSizeH, size_t imgSizeW,
size_t channels, size_t sizeX, int start_, size_t stride,
size_t outputH, size_t outputW);
size_t channels, size_t sizeX, size_t sizeY,
size_t strideH, size_t strideW,
size_t outputH, size_t outputW,
size_t paddingH, size_t paddingW);
void maxPoolBackward(Matrix& image, size_t imgSizeH, size_t imgSizeW,
Matrix& outGrad, Matrix& outV, size_t sizeX, int start,
size_t stride, size_t outputH, size_t outputW,
real scaleTargets, real scaleOutput);
Matrix& outGrad, Matrix& outV, size_t sizeX,
size_t sizeY, size_t strideH, size_t strideW,
size_t outputH, size_t outputW,
real scaleTargets, real scaleOutput,
size_t paddingH, size_t paddingW);
void avgPoolForward(Matrix& input, size_t imgSizeH, size_t imgSizeW,
size_t channels, size_t sizeX, int start, size_t stride,
size_t outputH, size_t outputW);
size_t channels, size_t sizeX, size_t sizeY,
size_t strideH, size_t strideW,
size_t outputH, size_t outputW,
size_t paddingH, size_t paddingW);
void avgPoolBackward(Matrix& input, size_t imgSizeH, size_t imgSizeW,
size_t sizeX, int start, size_t stride, size_t outputH,
size_t outputW, real scaleTargets, real scaleOutput);
size_t sizeX, size_t sizeY,
size_t strideH, size_t strideW,
size_t outputH, size_t outputW,
real scaleTargets, real scaleOutput,
size_t paddingH, size_t paddingW);
void crossMapNormalFwd(Matrix& input, size_t imgSizeH, size_t imgSizeW,
Matrix& denoms, size_t channels, size_t sizeX,
@@ -1242,21 +1257,31 @@ public:
real alpha = 1.0f, real beta = 0.0f);
void maxPoolForward(Matrix& inputMat, size_t imgSizeH, size_t imgSizeW,
size_t channels, size_t sizeX, int start_, size_t stride,
size_t outputH, size_t outputW);
size_t channels, size_t sizeX, size_t sizeY,
size_t strideH, size_t strideW,
size_t outputH, size_t outputW,
size_t paddingH, size_t paddingW);
void maxPoolBackward(Matrix& image, size_t imgSizeH, size_t imgSizeW,
Matrix& outGrad, Matrix& outV, size_t sizeX, int start,
size_t stride, size_t outputH, size_t outputW,
real scaleTargets, real scaleOutput);
Matrix& outGrad, Matrix& outV,
size_t sizeX, size_t sizeY,
size_t strideH, size_t strideW,
size_t outputH, size_t outputW,
real scaleTargets, real scaleOutput,
size_t paddingH, size_t paddingW);
void avgPoolForward(Matrix& input, size_t imgSizeH, size_t imgSizeW,
size_t channels, size_t sizeX, int start, size_t stride,
size_t outputH, size_t outputW);
size_t channels, size_t sizeX, size_t sizeY,
size_t strideH, size_t strideW,
size_t outputH, size_t outputW,
size_t paddingH, size_t paddingW);
void avgPoolBackward(Matrix& input, size_t imgSizeH, size_t imgSizeW,
size_t sizeX, int start, size_t stride, size_t outputH,
size_t outputW, real scaleTargets, real scaleOutput);
size_t sizeX, size_t sizeY,
size_t strideH, size_t strideW,
size_t outputH, size_t outputW,
real scaleTargets, real scaleOutput,
size_t paddingH, size_t paddingW);
void crossMapNormalFwd(Matrix& input, size_t imgSizeH, size_t imgSizeW,
Matrix& denoms, size_t channels, size_t sizeX,

@@ -1846,6 +1846,159 @@ TEST(Matrix, classificationError) {
}
}
void testMaxPoolFwdBwd(int numSamples, int channels,
int imgSizeH, int imgSizeW,
int ksizeH, int ksizeW,
int strideH, int strideW,
int padH, int padW) {
int outH = 0, outW = 0;
outH = (imgSizeH - ksizeH + 2 * padH + strideH - 1) / strideH + 1;
outW = (imgSizeW - ksizeW + 2 * padW + strideW - 1) / strideW + 1;
int inWidth = imgSizeH * imgSizeW * channels;
MatrixPtr input = CpuMatrix::create(numSamples, inWidth, false, false);
MatrixPtr inputGpu = GpuMatrix::create(numSamples, inWidth, false, true);
int outWidth = channels * outH * outW;
MatrixPtr target = CpuMatrix::create(numSamples, outWidth, false, false);
MatrixPtr targetGpu = GpuMatrix::create(numSamples, outWidth, false, true);
input->randomizeUniform();
target->randomizeUniform();
inputGpu->copyFrom(*input);
targetGpu->copyFrom(*target);
target->maxPoolForward(*input, imgSizeH, imgSizeW,
channels, ksizeW, ksizeH,
strideH, strideW, outH, outW, padH, padW);
targetGpu->maxPoolForward(*inputGpu, imgSizeH, imgSizeW,
channels, ksizeW, ksizeH,
strideH, strideW, outH, outW, padH, padW);
MatrixPtr targetCheck = CpuMatrix::create(numSamples, outWidth, false, false);
targetCheck->copyFrom(*targetGpu);
checkMatrixEqual(target, targetCheck);
MatrixPtr inputGrad = CpuMatrix::create(numSamples, inWidth, false, false);
MatrixPtr inputGpuGrad = GpuMatrix::create(numSamples, inWidth, false, true);
MatrixPtr targetGrad = CpuMatrix::create(numSamples, outWidth, false, false);
MatrixPtr targetGpuGrad = GpuMatrix::create(numSamples, outWidth,
false, true);
inputGrad->randomizeUniform();
targetGrad->randomizeUniform();
inputGpuGrad->copyFrom(*inputGrad);
targetGpuGrad->copyFrom(*targetGrad);
inputGrad->maxPoolBackward(*input, imgSizeH, imgSizeW,
*targetGrad, *target,
ksizeW, ksizeH,
strideH, strideW,
outH, outW, 1.0, 1.0, padH, padW);
inputGpuGrad->maxPoolBackward(*inputGpu, imgSizeH, imgSizeW,
*targetGpuGrad, *targetGpu,
ksizeW, ksizeH,
strideH, strideW,
outH, outW, 1.0, 1.0, padH, padW);
MatrixPtr targetBwdCheck = CpuMatrix::create(numSamples, inWidth,
false, false);
targetBwdCheck->copyFrom(*inputGpuGrad);
checkMatrixEqual(inputGrad, targetBwdCheck);
}
void testAvgPoolFwdBwd(int numSamples, int channels,
int imgSizeH, int imgSizeW,
int ksizeH, int ksizeW,
int strideH, int strideW,
int padH, int padW) {
int outH = 0, outW = 0;
outH = (imgSizeH - ksizeH + 2 * padH + strideH - 1) / strideH + 1;
outW = (imgSizeW - ksizeW + 2 * padW + strideW - 1) / strideW + 1;
int inWidth = imgSizeH * imgSizeW * channels;
MatrixPtr input = CpuMatrix::create(numSamples, inWidth, false, false);
MatrixPtr inputGpu = GpuMatrix::create(numSamples, inWidth, false, true);
int outWidth = channels * outH * outW;
MatrixPtr target = CpuMatrix::create(numSamples, outWidth, false, false);
MatrixPtr targetGpu = GpuMatrix::create(numSamples, outWidth, false, true);
input->randomizeUniform();
target->randomizeUniform();
inputGpu->copyFrom(*input);
targetGpu->copyFrom(*target);
target->avgPoolForward(*input, imgSizeH, imgSizeW,
channels, ksizeW, ksizeH,
strideH, strideW, outH, outW, padH, padW);
targetGpu->avgPoolForward(*inputGpu, imgSizeH, imgSizeW,
channels, ksizeW, ksizeH,
strideH, strideW, outH, outW, padH, padW);
MatrixPtr targetCheck = CpuMatrix::create(numSamples, outWidth, false, false);
targetCheck->copyFrom(*targetGpu);
MatrixCheckErr(*target, *targetCheck);
MatrixPtr inputGrad = CpuMatrix::create(numSamples, inWidth, false, false);
MatrixPtr inputGpuGrad = GpuMatrix::create(numSamples, inWidth, false, true);
MatrixPtr targetGrad = CpuMatrix::create(numSamples, outWidth, false, false);
MatrixPtr targetGpuGrad = GpuMatrix::create(numSamples, outWidth,
false, true);
inputGrad->randomizeUniform();
targetGrad->randomizeUniform();
inputGpuGrad->copyFrom(*inputGrad);
targetGpuGrad->copyFrom(*targetGrad);
inputGrad->avgPoolBackward(*targetGrad, imgSizeH, imgSizeW,
ksizeW, ksizeH,
strideH, strideW,
outH, outW, 1.0, 1.0, padH, padW);
inputGpuGrad->avgPoolBackward(*targetGpuGrad, imgSizeH, imgSizeW,
ksizeW, ksizeH,
strideH, strideW,
outH, outW, 1.0, 1.0, padH, padW);
MatrixPtr targetBwdCheck = CpuMatrix::create(numSamples, inWidth,
false, false);
targetBwdCheck->copyFrom(*inputGpuGrad);
MatrixCheckErr(*inputGrad, *targetBwdCheck);
}
TEST(Matrix, PoolFwdBwd) {
for (auto numSamples : {5, 32}) {
for (auto channels : {1, 9, 32}) {
for (auto imgSizeH : {14, 28}) {
for (auto imgSizeW : {16, 30}) {
for (auto sizeX : {2, 5}) {
for (auto sizeY : {2, 5}) {
for (auto sH : {1, 2}) {
for (auto sW : {1, 2}) {
for (auto pH : {0, (sizeY - 1)/2}) {
for (auto pW : {0, (sizeX - 1)/2}) {
VLOG(3) << " numSamples=" << numSamples
<< " channels=" << channels
<< " imgSizeH=" << imgSizeH
<< " imgSizeW=" << imgSizeW
<< " sizeX=" << sizeX
<< " sizeY=" << sizeY
<< " strideH=" << sH
<< " strideW=" << sW
<< " padingH=" << pH
<< " padingW=" << pW;
testMaxPoolFwdBwd(numSamples, channels, imgSizeH,
imgSizeW, sizeX, sizeY, sH, sW, pH, pW);
testAvgPoolFwdBwd(numSamples, channels, imgSizeH,
imgSizeW, sizeX, sizeY, sH, sW, pH, pW);
}
}
}
}
}
}
}
}
}
}
}
int main(int argc, char** argv) {
testing::InitGoogleTest(&argc, argv);
initMain(argc, argv);

@@ -88,7 +88,8 @@ message PoolConfig {
required uint32 size_x = 3;
// Tell the net where in the input image to start the pooling.
required uint32 start = 4;
// start is deprecated now.
optional uint32 start = 4;
// Defines the stride size between successive pooling squares.
required uint32 stride = 5;

@@ -961,10 +961,6 @@ def parse_pool(pool, input_layer_name, pool_conf):
"['max-projection', 'avg-projection', "
"'cudnn-max-pool', 'cudnn-avg-pool']"
% pool.pool_type)
if pool.size_y or pool.stride_y or pool.img_width or pool.padding_y:
config_assert(pool.pool_type.startswith('cudnn'),
"'size_y', 'stride_y' and 'img_width' and 'padding_y'"
"can only be used for cudnn")
pool_conf.channels = pool.channels
pool_conf.size_x = pool.size_x
@@ -974,36 +970,25 @@ def parse_pool(pool, input_layer_name, pool_conf):
pool_conf.stride_y = default(pool.stride_y, pool_conf.stride)
img_pixels = g_layer_map[input_layer_name].size / pool.channels
# img_width may be removed later,
# once it can be calculated automatically.
pool_conf.img_size = default(pool.img_width, int(img_pixels ** 0.5))
pool_conf.img_size_y = img_pixels / pool_conf.img_size
config_assert(pool_conf.img_size * pool_conf.img_size_y == img_pixels,
"Incorrect input image size %d for input image pixels %d"
% (pool_conf.img_size, img_pixels))
if pool.start is not None:
config_assert(pool.padding is None,
'At most one of start and padding can be set.')
pool_conf.start = pool.start
pool_conf.padding = 0
pool_conf.output_x = int(math.ceil((pool_conf.img_size - \
pool_conf.start - pool_conf.size_x) / \
float(pool_conf.stride))) + 1
config_assert(not pool.start, "start is deprecated in pooling.")
pool_conf.output_y = int(math.ceil((pool_conf.img_size_y - \
pool_conf.start - pool_conf.size_y) / \
float(pool_conf.stride_y))) + 1
elif pool.padding is not None:
if pool.padding is not None:
pool_conf.padding = pool.padding
pool_conf.padding_y = default(pool.padding_y, pool_conf.padding)
pool_conf.start = 0
pool_conf.output_x = int(math.ceil((pool_conf.img_size + \
2*pool_conf.padding - pool_conf.size_x) / \
float(pool_conf.stride))) + 1
pool_conf.output_y = int(math.ceil((pool_conf.img_size_y + \
2*pool_conf.padding_y - pool_conf.size_y) / \
float(pool_conf.stride_y))) + 1
else:
raise ValueError('At least one of start and padding should be set.')
def parse_image(image, input_layer_name, image_conf):
image_conf.channels = image.channels
@@ -1603,7 +1588,7 @@ class PoolLayer(LayerBase):
pool_conf = self.config.inputs[input_index].pool_conf
print("output size for %s is %d*%d " % (
name, pool_conf.output_y, pool_conf.output_x))
self.set_layer_size((pool_conf.output_x ** 2) * pool_conf.channels)
self.set_layer_size((pool_conf.output_x * pool_conf.output_y) * pool_conf.channels)
@config_layer('batch_norm')
class BatchNormLayer(LayerBase):

@@ -1627,7 +1627,9 @@ def img_conv_layer(input, filter_size, num_filters,
@layer_support()
def img_pool_layer(input, pool_size, name=None,
num_channels=None, pool_type=None,
stride=1, start=None, padding=0, layer_attr=None):
stride=1, start=None, padding=0, layer_attr=None,
pool_size_y=None, stride_y=None, padding_y=None,
img_width=None):
"""
Image pooling Layer.
@@ -1635,25 +1637,34 @@ def img_pool_layer(input, pool_size, name=None,
.. _pooling: http://ufldl.stanford.edu/tutorial/supervised/Pooling/
:param padding: pooling padding
:param padding: pooling padding width.
:type padding: int
:param padding_y: pooling padding height. It's equal to padding by default.
:type padding_y: int|None
:param name: name of pooling layer
:type name: basestring.
:param input: layer's input
:type input: LayerOutput
:param pool_size: pooling size
:param pool_size: pooling window width
:type pool_size: int
:param pool_size_y: pooling window height. It's equal to pool_size by default.
:type pool_size_y: int|None
:param num_channels: number of input channel.
:type num_channels: int
:param pool_type: pooling type. MaxPooling or AveragePooling. Default is
MaxPooling.
:type pool_type: BasePoolingType
:param stride: stride of pooling.
:param stride: stride width of pooling.
:type stride: int
:param start: start position of pooling operation.
:type start: int
:param stride_y: stride height of pooling. It is equal to stride by default.
:type stride_y: int|None
:param start: start position of pooling operation. Note that it is deprecated now.
:type start: int|None
:param layer_attr: Extra Layer attribute.
:type layer_attr: ExtraLayerAttribute
:param img_width: the width of the input feature map. If it is None, the input feature
map should be square.
:type img_width: int|None
:return: LayerOutput object.
:rtype: LayerOutput
"""
@@ -1666,17 +1677,29 @@ def img_pool_layer(input, pool_size, name=None,
elif isinstance(pool_type, AvgPooling):
pool_type.name = 'avg'
type_name = pool_type.name + '-projection' \
if (isinstance(pool_type, AvgPooling) or isinstance(pool_type, MaxPooling)) \
else pool_type.name
pool_size_y = pool_size if pool_size_y is None else pool_size_y
stride_y = stride if stride_y is None else stride_y
padding_y = padding if padding_y is None else padding_y
Layer(
name=name,
type=LayerType.POOL_LAYER,
inputs=[Input(input.name,
pool=Pool(
pool_type=''.join([pool_type.name, '-projection']),
pool_type=type_name,
channels=num_channels,
size_x=pool_size,
start=start,
stride=stride,
padding=padding
padding=padding,
size_y=pool_size_y,
stride_y=stride_y,
padding_y=padding_y,
img_width=img_width
))],
**ExtraLayerAttribute.to_kwargs(layer_attr)
)

@@ -170,13 +170,13 @@ def simple_img_conv_pool(input, filter_size, num_filters, pool_size, name=None,
:type shared_bias: bool
:param conv_layer_attr: see img_conv_layer for details
:type conv_layer_attr: ExtraLayerAttribute
:param pool_stride: see img_conv_layer for details
:param pool_stride: see img_pool_layer for details
:type pool_stride: int
:param pool_start: see img_conv_layer for details
:param pool_start: see img_pool_layer for details. It is deprecated now.
:type pool_start: int
:param pool_padding: see img_conv_layer for details
:param pool_padding: see img_pool_layer for details
:type pool_padding: int
:param pool_layer_attr: see img_conv_layer for details
:param pool_layer_attr: see img_pool_layer for details
:type pool_layer_attr: ExtraLayerAttribute
:return: Layer's output
:rtype: LayerOutput
@@ -243,7 +243,7 @@ def img_conv_bn_pool(input, filter_size, num_filters, pool_size, name=None,
:param bn_layer_attr: ParameterAttribute.
:param pool_stride: see img_pool_layer's document.
:type pool_stride: int
:param pool_start: see img_pool_layer's document.
:param pool_start: see img_pool_layer's document. It is deprecated now.
:type pool_start: int
:param pool_padding: see img_pool_layer's document.
:type pool_padding: int

@@ -19,6 +19,8 @@ __all__ = [
"BasePoolingType",
"MaxPooling",
"AvgPooling",
"CudnnMaxPooling",
"CudnnAvgPooling",
"SumPooling",
"SquareRootNPooling"
]
@@ -57,6 +59,23 @@ class MaxPooling(BasePoolingType):
self.output_max_index = output_max_index
class CudnnMaxPooling(BasePoolingType):
"""
Cudnn max pooling only supports GPU. Returns the maximum value in the
pooling window.
"""
def __init__(self):
BasePoolingType.__init__(self, "cudnn-max-pool")
class CudnnAvgPooling(BasePoolingType):
"""
Cudnn average pooling only supports GPU. Returns the average value in the
pooling window.
"""
def __init__(self):
BasePoolingType.__init__(self, "cudnn-avg-pool")
class AvgPooling(BasePoolingType):
"""
Average pooling.

@@ -1,4 +1,4 @@
7e6919d17562516e9a1d9a88de1fb3b9 img_layers.protostr
86c0815275a9d5eb902e23c6a592f58a img_layers.protostr
a5d9259ff1fd7ca23d0ef090052cb1f2 last_first_seq.protostr
9c038249ec8ff719753a746cdb04c026 layer_activations.protostr
5913f87b39cee3b2701fa158270aca26 projections.protostr

@@ -7,8 +7,10 @@ settings(
img = data_layer(name='image', size=256*256)
# parse_conv in config_parse.py is not strictly accurate when filter_size
# is not square, so a square filter_size is set here.
img_conv = img_conv_layer(input=img, num_channels=1, num_filters=64,
filter_size=(32, 64), padding=(1, 0), stride=(1, 1),
filter_size=(32, 32), padding=(1, 1), stride=(1, 1),
act=LinearActivation())
img_bn = batch_norm_layer(input=img_conv, act=ReluActivation())
