Merge branch 'develop' of https://github.com/PaddlePaddle/Paddle into fix-3736

8 years ago · e9cc32820d
parent 6bef079660 c2edd2dc07
commit e9cc32820d
74 changed files with 7561 additions and 401 deletions
--- a/cmake/external/mkldnn.cmake
+++ b/cmake/external/mkldnn.cmake
@ -51,7 +51,7 @@ ExternalProject_Add(
    ${EXTERNAL_PROJECT_LOG_ARGS}
    DEPENDS             ${MKLDNN_DEPENDS}
    GIT_REPOSITORY      "https://github.com/01org/mkl-dnn.git"
-    GIT_TAG             "v0.9"
+    GIT_TAG             "v0.10"
    PREFIX              ${MKLDNN_SOURCES_DIR}
    UPDATE_COMMAND      ""
    CMAKE_ARGS          -DCMAKE_INSTALL_PREFIX=${MKLDNN_INSTALL_DIR}
--- a/cmake/external/mklml.cmake
+++ b/cmake/external/mklml.cmake
@ -28,7 +28,7 @@ INCLUDE(ExternalProject)

 SET(MKLML_PROJECT       "extern_mklml")
 SET(MKLML_VER           "mklml_lnx_2018.0.20170720")
-SET(MKLML_URL           "https://github.com/01org/mkl-dnn/releases/download/v0.9/${MKLML_VER}.tgz")
+SET(MKLML_URL           "https://github.com/01org/mkl-dnn/releases/download/v0.10/${MKLML_VER}.tgz")
 SET(MKLML_SOURCE_DIR    "${THIRD_PARTY_PATH}/mklml")
 SET(MKLML_DOWNLOAD_DIR  "${MKLML_SOURCE_DIR}/src/${MKLML_PROJECT}")
 SET(MKLML_DST_DIR       "mklml")
@ -54,7 +54,8 @@ ExternalProject_Add(
    ${EXTERNAL_PROJECT_LOG_ARGS}
    PREFIX                ${MKLML_SOURCE_DIR}
    DOWNLOAD_DIR          ${MKLML_DOWNLOAD_DIR}
-    DOWNLOAD_COMMAND      wget --no-check-certificate -qO- ${MKLML_URL} | tar xz -C ${MKLML_DOWNLOAD_DIR}
+    DOWNLOAD_COMMAND      wget --no-check-certificate ${MKLML_URL} -c -q -O ${MKLML_VER}.tgz 
+                          && tar zxf ${MKLML_VER}.tgz
    DOWNLOAD_NO_PROGRESS  1
    UPDATE_COMMAND        ""
    CMAKE_ARGS            -DCMAKE_INSTALL_PREFIX=${MKLML_INSTALL_ROOT}
--- a/doc/api/v2/config/layer.rst
+++ b/doc/api/v2/config/layer.rst
@ -419,9 +419,14 @@ multi_binary_label_cross_entropy_cost
 ..  autoclass:: paddle.v2.layer.multi_binary_label_cross_entropy_cost
    :noindex:

-huber_cost
----------
-..  autoclass:: paddle.v2.layer.huber_cost
+huber_regression_cost
+-------------------------
+..  autoclass:: paddle.v2.layer.huber_regression_cost
+    :noindex:
+
+huber_classification_cost
+-------------------------
+..  autoclass:: paddle.v2.layer.huber_classification_cost
    :noindex:

 lambda_cost
--- a/doc/getstarted/build_and_install/index_cn.rst
+++ b/doc/getstarted/build_and_install/index_cn.rst
@ -6,14 +6,12 @@
 安装流程
 ++++++++

-PaddlePaddle提供数个预编译的二进制来进行安装，包括Docker镜像，ubuntu的deb安装包等。我们推荐使用Docker镜像来部署环境，同时欢迎贡献更多的安装包。
+PaddlePaddle提供Docker镜像来部署环境。

 .. toctree::
   :maxdepth: 1
   
   docker_install_cn.rst 
-   ubuntu_install_cn.rst
-


 编译流程
--- a/doc/getstarted/build_and_install/index_en.rst
+++ b/doc/getstarted/build_and_install/index_en.rst
@ -8,14 +8,13 @@ Install PaddlePaddle
    :maxdepth: 1

    docker_install_en.rst
-    ubuntu_install_en.rst

 Build from Source
 -----------------

 ..  warning::

-    Please use :code:`deb` package or :code:`docker` image to install paddle. The building guide is used for hacking or contributing PaddlePaddle source code.
+    Please use :code:`docker` image to install paddle. The building guide is used for hacking or contributing PaddlePaddle source code.

 ..  toctree::
    :maxdepth: 1
--- a/doc/getstarted/build_and_install/ubuntu_install_cn.rst
+++ b/doc/getstarted/build_and_install/ubuntu_install_cn.rst
@ -1,71 +0,0 @@
-Ubuntu部署PaddlePaddle
-===================================
-
-PaddlePaddle提供了ubuntu 14.04 deb安装包。
-
-安装
------
-
-安装包的下载地址是\: https://github.com/PaddlePaddle/Paddle/releases
-
-它包含四个版本\:
-
-* cpu版本: 支持主流x86处理器平台, 使用了avx指令集。
-
-* cpu-noavx版本：支持主流x86处理器平台，没有使用avx指令集。
-
-* gpu版本：支持主流x86处理器平台，支持nvidia cuda平台，使用了avx指令集。
-
-* gpu-noavx版本：支持主流x86处理器平台，支持nvidia cuda平台，没有使用avx指令集。
-
-下载完相关安装包后，执行:
-
-..  code-block:: shell
-
-    sudo apt-get install gdebi
-    gdebi paddle-*-cpu.deb
-
-或者:
-
-..  code-block:: shell
-
-    dpkg -i paddle-*-cpu.deb
-    apt-get install -f
-
-
-在 :code:`dpkg -i` 的时候如果报一些依赖未找到的错误是正常的，
-在 :code:`apt-get install -f` 里会继续安装 PaddlePaddle。
-
-安装完成后，可以使用命令 :code:`paddle version` 查看安装后的paddle 版本:
-
-..  code-block:: shell
-
-    PaddlePaddle 0.8.0b1, compiled with
-        with_avx: ON
-        with_gpu: OFF
-        with_double: OFF
-        with_python: ON
-        with_rdma: OFF
-        with_timer: OFF
-        with_predict_sdk:
-
-
-可能遇到的问题
--------------
-
-libcudart.so/libcudnn.so找不到
-++++++++++++++++++++++++++++++
-
-安装完成后，运行 :code:`paddle train` 报错\:
-
-..  code-block:: shell
-
-      0831 12:36:04.151525  1085 hl_dso_loader.cc:70] Check failed: nullptr != *dso_handle For Gpu version of PaddlePaddle, it couldn't find CUDA library: libcudart.so Please make sure you already specify its path.Note: for training data on Cpu using Gpu version of PaddlePaddle,you must specify libcudart.so via LD_LIBRARY_PATH.
-
-原因是未设置cuda运行时环境变量。 如果使用GPU版本的PaddlePaddle，请安装CUDA 7.5 和CUDNN 5到本地环境中，并设置：
-
-..  code-block:: shell
-
-    export LD_LIBRARY_PATH=/usr/local/cuda/lib64:/usr/local/cuda/lib:$LD_LIBRARY_PATH
-    export PATH=/usr/local/cuda/bin:$PATH
-
--- a/doc/getstarted/build_and_install/ubuntu_install_en.rst
+++ b/doc/getstarted/build_and_install/ubuntu_install_en.rst
@ -1,25 +0,0 @@
-Debian Package installation guide
-=================================
-
-PaddlePaddle supports :code:`deb` pacakge. The installation of this :code:`deb` package is tested in ubuntu 14.04, but it should be support other debian based linux, too.
-
-There are four versions of debian package, :code:`cpu`, :code:`gpu`, :code:`cpu-noavx`, :code:`gpu-noavx`. And :code:`noavx` version is used to support CPU which does not contain :code:`AVX` instructions. The download url of :code:`deb` package is \: https://github.com/baidu/Paddle/releases/
-
-
-After downloading PaddlePaddle deb packages, you can use :code:`gdebi` install.
-
-..	code-block:: bash
-
-	gdebi paddle-*.deb
-
-If :code:`gdebi` is not installed, you can use :code:`sudo apt-get install gdebi` to install it.
-
-Or you can use following commands to install PaddlePaddle.
-
-..	code-block:: bash
-
-	dpkg -i paddle-*.deb
-	apt-get install -f
-
-And if you use GPU version deb package, you need to install CUDA toolkit and cuDNN, and set related environment variables(such as LD_LIBRARY_PATH) first. It is normal when `dpkg -i` get errors. `apt-get install -f` will continue install paddle, and install dependences. 
-
--- a/doc/howto/dev/new_op_cn.md
+++ b/doc/howto/dev/new_op_cn.md
@ -5,12 +5,13 @@
   - [定义ProtoMaker类](#定义ProtoMaker类)
   - [定义Operator类](#定义Operator类)
   - [定义OpKernel类](#定义OpKernel类)
-   - [注册类](#注册类)
+   - [注册Operator](#注册Operator)
   - [编译](#编译)
 - [绑定Python](#绑定Python)
 - [实现单元测试](#实现单元测试)
   - [前向Operator单测](#前向Operator单测)
   - [反向Operator单测](#反向Operator单测)
+   - [编译和执行](#编译和执行)


 ## 概念简介
@ -22,19 +23,17 @@
 - `framework::OperatorWithKernel`：继承自OperatorBase，Op有计算函数，称作有Kernel。
 - `class OpProtoAndCheckerMaker`：描述该Op的输入、输出、属性、注释,主要用于Python API接口生成

-依据是否包含kernel，将Op分为两种：包含Kernel的Op和不包含kernel的Op，前者Op的定义继承自`OperatorBase`，后者继承自`OperatorWithKernel`。本教程主要介绍带Kernel的Op如何写，简单总结如下：
+依据是否包含Kernel，将Op分为两种：包含Kernel的Op和不包含Kernel的Op，前者Op的定义继承自`OperatorWithKernel`，后者继承自`OperatorBase`。本教程主要介绍带Kernel的Op如何写，简单总结Op需要包含的内容如下：

-Forward Op需要包含：
-
-   - OpProtoMake定义
-   - Op定义
-   - Kernel实现
+  
+ 内容            | 定义位置         
+--------------  | :----------------------  
+OpProtoMaker定义 | `.cc`文件，Backward Op不需要定义OpProtoMaker
+Op定义           | `.cc`文件 
+Kernel实现       | CPU、GPU共享Kernel在`.h`文件，否则，CPU可以在`.cc`文件，GPU可在`.cu`文件。 
+注册Op           | Op注册在`.cc`文件；Kernel注册CPU在`.cc`文件，GPU在`.cu`文件
+     
     
-与之对应的Backward Op包含：
-
-   - Op定义
-   - Kernel实现
-
 下面以矩阵乘操作，即[MulOp](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/operators/mul_op.cc)为例来介绍如何写带Kernel的Operator。


@ -137,8 +136,9 @@ MulOp(const std::string &type, const framework::VariableNameMap &inputs,
 ```	
 	
 还需要重写`InferShape`接口。`InferShape`为const函数，不能修改Op的成员变量，参数为`const framework::InferShapeContext &ctx`，通过该参数可获取到输入输出以及属性。它的功能是：
-	 - 1). 做检查， 尽早报错：检查输入数据维度、类型等是否合法
-	 - 2). 设置输出Tensor的形状
+
+  - 1). 做检查， 尽早报错：检查输入数据维度、类型等是否合法。
+  - 2). 设置输出Tensor的形状。

 通常`OpProtoMaker`和`Op`类的定义写在`.cc`文件中，和要讲到的注册函数一起放在`.cc`中

@ -172,7 +172,7 @@ class MulKernel : public framework::OpKernel {
   
 到此前向Op实现完成，需要在`.cc`文件中注册该op和kernel。反向Op类的定义和Kernel定义与前向Op类似，这里不再重复。但注意，反向Op没有`ProtoMaker`。
   
-### 4. 注册类
+### 4. 注册Operator

 在`.cc`文件中注册前向、反向Op类，注册CPU Kernel。

@ -297,4 +297,28 @@ class TestMulOp(unittest.TestCase):
   - 调用`create_op("mul")`创建反向Op对应的前向Op。
   - 定义输入`inputs`。
   - 调用`compare_grad`函数对比CPU、GPU计算结果。
-   - 调用`check_grad`检查梯度稳定性。
+   - 调用`check_grad`检查梯度稳定性，这里采用数值法检测梯度正确性。
+      - 第一个参数`op` : 前向op。
+      - 第二个参数`inputs` : 输入词典，词典的Key和`ProtoMaker`定义保持一致。
+      - 第三个参数`set(["X", "Y"])` : 指定对输入变量`X`、`Y`做梯度检测。
+      - 第四个参数`"Out"` : 指定前向网络最终的输出目标变量`Out`
+
+
+### 编译和执行 
+
+单测完成之后，在[`python/paddle/v2/framework/tests/CMakeLists.txt`](https://github.com/PaddlePaddle/Paddle/blob/develop/python/paddle/v2/framework/tests/CMakeLists.txt)里添加编译：
+
+```
+py_test(test_mul_op SRCS test_mul_op.py)
+```
+
+编译时需要打开`WITH_TESTING`, 即 `cmake paddle_dir -DWITH_TESTING=ON`，编译成功之后执行单测命令为：
+
+```
+make test ARGS="-R test_mul_op -V"
+```
+或者:
+
+```
+ctest -R test_mul_op
+```
--- a/paddle/cuda/include/hl_cnn.h
+++ b/paddle/cuda/include/hl_cnn.h
@ -173,6 +173,96 @@ extern void hl_avgpool_backward(const int frameCnt,
                                real* backGrad,
                                const int outStride);

+/**
+ * @brief   3D max-pooling forward (prototype only; this header does not
+ *          define it).
+ *
+ * NOTE(review): every parameter description below is inferred from the
+ * parameter name alone -- confirm against the implementation.
+ *
+ * @param[in]   frameCnt                      presumably the number of samples.
+ * @param[in]   inputData                     input data.
+ * @param[in]   channels                      presumably the channel count.
+ * @param[in]   depth, height, width          presumably the input extents.
+ * @param[in]   pooledD, pooledH, pooledW     presumably the output extents.
+ * @param[in]   sizeZ, sizeY, sizeX           presumably the window extents.
+ * @param[in]   strideD, strideH, strideW     presumably the window strides.
+ * @param[in]   paddingD, paddingH, paddingW  presumably the padding sizes.
+ * @param[out]  tgtData                       output data.
+ * @param[out]  maxPoolIdxData                presumably the arg-max positions.
+ * @param[in]   tgtStride                     presumably the stride of tgtData.
+ */
+extern void hl_maxpool3D_forward(const int frameCnt,
+                                 const real* inputData,
+                                 const int channels,
+                                 const int depth,
+                                 const int height,
+                                 const int width,
+                                 const int pooledD,
+                                 const int pooledH,
+                                 const int pooledW,
+                                 const int sizeZ,
+                                 const int sizeY,
+                                 const int sizeX,
+                                 const int strideD,
+                                 const int strideH,
+                                 const int strideW,
+                                 const int paddingD,
+                                 const int paddingH,
+                                 const int paddingW,
+                                 real* tgtData,
+                                 real* maxPoolIdxData,
+                                 const int tgtStride);
+
+/**
+ * @brief   3D max-pooling backward (prototype only; this header does not
+ *          define it).
+ *
+ * NOTE(review): every parameter description below is inferred from the
+ * parameter name alone -- confirm against the implementation.
+ *
+ * @param[in]   frameCnt                      presumably the number of samples.
+ * @param[in]   outGrad                       presumably the output gradient.
+ * @param[in]   channels                      presumably the channel count.
+ * @param[in]   depth, height, width          presumably the input extents.
+ * @param[in]   pooledD, pooledH, pooledW     presumably the output extents.
+ * @param[in]   sizeZ, sizeY, sizeX           presumably the window extents.
+ * @param[in]   strideD, strideH, strideW     presumably the window strides.
+ * @param[in]   paddingD, paddingH, paddingW  presumably the padding sizes.
+ * @param[in]   scaleA, scaleB                scale factors; how they are
+ *                                            combined is not visible here.
+ * @param[out]  targetGrad                    presumably the input gradient.
+ * @param[in]   maxPoolIdxData                presumably the arg-max positions
+ *                                            recorded by the forward pass.
+ * @param[in]   outStride                     presumably the stride of outGrad.
+ */
+extern void hl_maxpool3D_backward(const int frameCnt,
+                                  const real* outGrad,
+                                  const int channels,
+                                  const int depth,
+                                  const int height,
+                                  const int width,
+                                  const int pooledD,
+                                  const int pooledH,
+                                  const int pooledW,
+                                  const int sizeZ,
+                                  const int sizeY,
+                                  const int sizeX,
+                                  const int strideD,
+                                  const int strideH,
+                                  const int strideW,
+                                  const int paddingD,
+                                  const int paddingH,
+                                  const int paddingW,
+                                  real scaleA,
+                                  real scaleB,
+                                  real* targetGrad,
+                                  real* maxPoolIdxData,
+                                  const int outStride);
+
+/**
+ * @brief   3D average-pooling forward (prototype only; this header does not
+ *          define it).
+ *
+ * NOTE(review): every parameter description below is inferred from the
+ * parameter name alone -- confirm against the implementation.
+ *
+ * @param[in]   frameCnt                      presumably the number of samples.
+ * @param[in]   inputData                     input data.
+ * @param[in]   channels                      presumably the channel count.
+ * @param[in]   depth, height, width          presumably the input extents.
+ * @param[in]   pooledD, pooledH, pooledW     presumably the output extents.
+ * @param[in]   sizeZ, sizeY, sizeX           presumably the window extents.
+ * @param[in]   strideD, strideH, strideW     presumably the window strides.
+ * @param[in]   paddingD, paddingH, paddingW  presumably the padding sizes.
+ * @param[out]  tgtData                       output data.
+ * @param[in]   tgtStride                     presumably the stride of tgtData.
+ */
+extern void hl_avgpool3D_forward(const int frameCnt,
+                                 const real* inputData,
+                                 const int channels,
+                                 const int depth,
+                                 const int height,
+                                 const int width,
+                                 const int pooledD,
+                                 const int pooledH,
+                                 const int pooledW,
+                                 const int sizeZ,
+                                 const int sizeY,
+                                 const int sizeX,
+                                 const int strideD,
+                                 const int strideH,
+                                 const int strideW,
+                                 const int paddingD,
+                                 const int paddingH,
+                                 const int paddingW,
+                                 real* tgtData,
+                                 const int tgtStride);
+
+/**
+ * @brief   3D average-pooling backward (prototype only; this header does not
+ *          define it).
+ *
+ * The padding parameters are spelled `const int` like every other 3D pooling
+ * prototype; top-level const on a by-value parameter does not change the
+ * function type, so any existing definition still matches.
+ *
+ * NOTE(review): every parameter description below is inferred from the
+ * parameter name alone -- confirm against the implementation.
+ *
+ * @param[in]   frameCnt                      presumably the number of samples.
+ * @param[in]   outGrad                       presumably the output gradient.
+ * @param[in]   channels                      presumably the channel count.
+ * @param[in]   depth, height, width          presumably the input extents.
+ * @param[in]   pooledD, pooledH, pooledW     presumably the output extents.
+ * @param[in]   sizeZ, sizeY, sizeX           presumably the window extents.
+ * @param[in]   strideD, strideH, strideW     presumably the window strides.
+ * @param[in]   paddingD, paddingH, paddingW  presumably the padding sizes.
+ * @param[in]   scaleA, scaleB                scale factors; how they are
+ *                                            combined is not visible here.
+ * @param[out]  backGrad                      presumably the input gradient.
+ * @param[in]   outStride                     presumably the stride of outGrad.
+ */
+extern void hl_avgpool3D_backward(const int frameCnt,
+                                  const real* outGrad,
+                                  const int channels,
+                                  const int depth,
+                                  const int height,
+                                  const int width,
+                                  const int pooledD,
+                                  const int pooledH,
+                                  const int pooledW,
+                                  const int sizeZ,
+                                  const int sizeY,
+                                  const int sizeX,
+                                  const int strideD,
+                                  const int strideH,
+                                  const int strideW,
+                                  const int paddingD,
+                                  const int paddingH,
+                                  const int paddingW,
+                                  real scaleA,
+                                  real scaleB,
+                                  real* backGrad,
+                                  const int outStride);
+
 /**
 * @brief   Bilinear interpolation forward.
 *
@ -275,4 +365,4 @@ extern void hl_maxout_backward(real* inGrad,
                               size_t featLen,
                               size_t groups);

-#endif /* HL_CNN_H_ */
+#endif  // HL_CNN_H_
--- a/paddle/cuda/include/hl_matrix.h
+++ b/paddle/cuda/include/hl_matrix.h
@ -224,4 +224,80 @@ extern void hl_matrix_collect_shared_bias(real* B_d,
 extern void hl_matrix_rotate(
    real* mat, real* matRot, int dimM, int dimN, bool clockWise);

+/**
+ * @brief  Matrix vol2Col: Convert 3D volume into col matrix
+ *
+ * @param[in]   dataSrc    input volume data.
+ * @param[in]   channels   channels of dataSrc.
+ * @param[in]   depth      depth of dataSrc.
+ * @param[in]   height     height of dataSrc.
+ * @param[in]   width      width of dataSrc.
+ * @param[in]   filterD    depth of filter.
+ * @param[in]   filterH    height of filter.
+ * @param[in]   filterW    width of filter.
+ * @param[in]   strideD    stride in the depth.
+ * @param[in]   strideH    stride in the height.
+ * @param[in]   strideW    stride in the width.
+ * @param[in]   paddingD   padding in the depth.
+ * @param[in]   paddingH   padding in the height.
+ * @param[in]   paddingW   padding in the width.
+ * @param[out]  dataDst    output col matrix.
+ *
+ */
+extern void hl_matrix_vol2Col(const real* dataSrc,
+                              int channels,
+                              int depth,
+                              int height,
+                              int width,
+                              int filterD,
+                              int filterH,
+                              int filterW,
+                              int strideD,
+                              int strideH,
+                              int strideW,
+                              int paddingD,
+                              int paddingH,
+                              int paddingW,
+                              real* dataDst);
+
+/**
+ * @brief  Matrix col2Vol: Convert col matrix into 3D volume
+ *
+ * Each element of dataDst becomes
+ *   alpha * (sum of the dataSrc entries that map to it) + beta * (old value).
+ *
+ * @param[in,out] dataDst  output volume; its previous contents are read and
+ *                         scaled by beta.
+ * @param[in]   channels   channels of dataDst.
+ * @param[in]   depth      depth of dataDst.
+ * @param[in]   height     height of dataDst.
+ * @param[in]   width      width of dataDst.
+ * @param[in]   filterD    depth of filter.
+ * @param[in]   filterH    height of filter.
+ * @param[in]   filterW    width of filter.
+ * @param[in]   strideD    stride in the depth.
+ * @param[in]   strideH    stride in the height.
+ * @param[in]   strideW    stride in the width.
+ * @param[in]   paddingD   padding in the depth.
+ * @param[in]   paddingH   padding in the height.
+ * @param[in]   paddingW   padding in the width.
+ * @param[in]   dataSrc    input col matrix.
+ * @param[in]   alpha      scale applied to the sum gathered from dataSrc.
+ * @param[in]   beta       scale applied to the previous value of dataDst.
+ *
+ */
+extern void hl_matrix_col2Vol(real* dataDst,
+                              int channels,
+                              int depth,
+                              int height,
+                              int width,
+                              int filterD,
+                              int filterH,
+                              int filterW,
+                              int strideD,
+                              int strideH,
+                              int strideW,
+                              int paddingD,
+                              int paddingH,
+                              int paddingW,
+                              const real* dataSrc,
+                              real alpha,
+                              real beta);
+
 #endif /* HL_MATRIX_H_ */
--- a/paddle/cuda/include/stub/hl_cnn_stub.h
+++ b/paddle/cuda/include/stub/hl_cnn_stub.h
@ -87,6 +87,96 @@ inline void hl_avgpool_backward(const int frameCnt,
                                real* backGrad,
                                const int outStride) {}

+// Stub for hl_maxpool3D_forward: takes the same parameter list as the
+// prototype and has an empty body, so calling it does nothing.
+// NOTE(review): presumably selected for builds without CUDA -- confirm.
+inline void hl_maxpool3D_forward(const int frameCnt,
+                                 const real* inputData,
+                                 const int channels,
+                                 const int depth,
+                                 const int height,
+                                 const int width,
+                                 const int pooledD,
+                                 const int pooledH,
+                                 const int pooledW,
+                                 const int sizeZ,
+                                 const int sizeY,
+                                 const int sizeX,
+                                 const int strideD,
+                                 const int strideH,
+                                 const int strideW,
+                                 const int paddingD,
+                                 const int paddingH,
+                                 const int paddingW,
+                                 real* tgtData,
+                                 real* maxPoolIdxData,
+                                 const int tgtStride) {}
+
+// Stub for hl_maxpool3D_backward: takes the same parameter list as the
+// prototype and has an empty body, so calling it does nothing.
+// NOTE(review): presumably selected for builds without CUDA -- confirm.
+inline void hl_maxpool3D_backward(const int frameCnt,
+                                  const real* outGrad,
+                                  const int channels,
+                                  const int depth,
+                                  const int height,
+                                  const int width,
+                                  const int pooledD,
+                                  const int pooledH,
+                                  const int pooledW,
+                                  const int sizeZ,
+                                  const int sizeY,
+                                  const int sizeX,
+                                  const int strideD,
+                                  const int strideH,
+                                  const int strideW,
+                                  const int paddingD,
+                                  const int paddingH,
+                                  const int paddingW,
+                                  real scaleA,
+                                  real scaleB,
+                                  real* targetGrad,
+                                  real* maxPoolIdxData,
+                                  const int outStride) {}
+
+// Stub for hl_avgpool3D_forward: takes the same parameter list as the
+// prototype and has an empty body, so calling it does nothing.
+// NOTE(review): presumably selected for builds without CUDA -- confirm.
+inline void hl_avgpool3D_forward(const int frameCnt,
+                                 const real* inputData,
+                                 const int channels,
+                                 const int depth,
+                                 const int height,
+                                 const int width,
+                                 const int pooledD,
+                                 const int pooledH,
+                                 const int pooledW,
+                                 const int sizeZ,
+                                 const int sizeY,
+                                 const int sizeX,
+                                 const int strideD,
+                                 const int strideH,
+                                 const int strideW,
+                                 const int paddingD,
+                                 const int paddingH,
+                                 const int paddingW,
+                                 real* tgtData,
+                                 const int tgtStride) {}
+
+// Stub for hl_avgpool3D_backward: accepts the pooling arguments and has an
+// empty body, so calling it does nothing.
+// NOTE(review): presumably selected for builds without CUDA -- confirm.
+inline void hl_avgpool3D_backward(const int frameCnt,
+                                  const real* outGrad,
+                                  const int channels,
+                                  const int depth,
+                                  const int height,
+                                  const int width,
+                                  const int pooledD,
+                                  const int pooledH,
+                                  const int pooledW,
+                                  const int sizeZ,
+                                  const int sizeY,
+                                  const int sizeX,
+                                  const int strideD,
+                                  const int strideH,
+                                  const int strideW,
+                                  const int paddingD,
+                                  const int paddingH,
+                                  const int paddingW,
+                                  real scaleA,
+                                  real scaleB,
+                                  real* backGrad,
+                                  const int outStride) {}
+
 inline void hl_bilinear_forward(const real* inData,
                                const size_t inImgH,
                                const size_t inImgW,
--- a/paddle/cuda/include/stub/hl_matrix_stub.h
+++ b/paddle/cuda/include/stub/hl_matrix_stub.h
@ -99,4 +99,38 @@ inline void hl_matrix_collect_shared_bias(real* B_d,
 inline void hl_matrix_rotate(
    real* mat, real* matRot, int dimM, int dimN, bool clockWise) {}

+// Stub for hl_matrix_vol2Col: same parameter list as the prototype, empty
+// body; dataDst is left untouched.
+// NOTE(review): presumably selected for builds without CUDA -- confirm.
+inline void hl_matrix_vol2Col(const real* dataSrc,
+                              int channels,
+                              int depth,
+                              int height,
+                              int width,
+                              int filterD,
+                              int filterH,
+                              int filterW,
+                              int strideD,
+                              int strideH,
+                              int strideW,
+                              int paddingD,
+                              int paddingH,
+                              int paddingW,
+                              real* dataDst) {}
+
+// Stub for hl_matrix_col2Vol: same parameter list as the prototype, empty
+// body; dataDst is left untouched.
+// NOTE(review): presumably selected for builds without CUDA -- confirm.
+inline void hl_matrix_col2Vol(real* dataDst,
+                              int channels,
+                              int depth,
+                              int height,
+                              int width,
+                              int filterD,
+                              int filterH,
+                              int filterW,
+                              int strideD,
+                              int strideH,
+                              int strideW,
+                              int paddingD,
+                              int paddingH,
+                              int paddingW,
+                              const real* dataSrc,
+                              real alpha,
+                              real beta) {}
+
 #endif  // HL_MATRIX_STUB_H_
--- a/paddle/cuda/src/hl_cuda_cnn.cu
+++ b/paddle/cuda/src/hl_cuda_cnn.cu
--- a/paddle/cuda/src/hl_cuda_matrix.cu
+++ b/paddle/cuda/src/hl_cuda_matrix.cu
@ -592,3 +592,204 @@ void hl_matrix_rotate(
      mat, matRot, dimM, dimN, clockWise);
  CHECK_SYNC("hl_matrix_rotate failed");
 }
+
+/**
+ * @brief  Kernel: unfold a 3D volume into column layout (vol2col).
+ *
+ * Each work item `index` identifies one (channel, d_out, h_out, w_out)
+ * window position and writes the filterD * filterH * filterW values of that
+ * window into dataDst, one per filter tap. Taps that fall outside the volume
+ * (i.e. in the padding) are written as 0.
+ *
+ * dataDst layout, as written here:
+ *   ((channel * filterD * filterH * filterW + tap) * depth_col + d_out)
+ *       * height_col * width_col + h_out * width_col + w_out
+ *
+ * @param[in]   num_kernels  number of work items
+ *                           (channels * depth_col * height_col * width_col).
+ * @param[in]   dataSrc      source volume, indexed as
+ *                           ((c * depth + d) * height + h) * width + w.
+ * @param[out]  dataDst      destination col buffer.
+ * @param[in]   depth, height, width              source volume extents.
+ * @param[in]   filterD, filterH, filterW         window extents.
+ * @param[in]   strideD, strideH, strideW         window strides.
+ * @param[in]   paddingD, paddingH, paddingW      padding per axis.
+ * @param[in]   depth_col, height_col, width_col  number of window positions
+ *                                                per axis.
+ */
+__global__ void keMatrixVol2Col(int num_kernels,
+                                const real* dataSrc,
+                                real* dataDst,
+                                int depth,
+                                int height,
+                                int width,
+                                int filterD,
+                                int filterH,
+                                int filterW,
+                                int strideD,
+                                int strideH,
+                                int strideW,
+                                int paddingD,
+                                int paddingH,
+                                int paddingW,
+                                int depth_col,
+                                int height_col,
+                                int width_col) {
+  // Distance in dataDst between two consecutive filter taps of one window.
+  const int col_stride = depth_col * height_col * width_col;
+
+  for (int index = blockIdx.x * blockDim.x + threadIdx.x; index < num_kernels;
+       index += blockDim.x * gridDim.x) {
+    int w_out = index % width_col;
+    int h_out = (index / width_col) % height_col;
+    int d_out = (index / width_col / height_col) % depth_col;
+    int channel_in = index / width_col / height_col / depth_col;
+    int channel_out = channel_in * filterD * filterH * filterW;
+    int w_in = w_out * strideW - paddingW;
+    int h_in = h_out * strideH - paddingH;
+    int d_in = d_out * strideD - paddingD;
+
+    // Offsets are recomputed from the untouched base pointers on every
+    // iteration. Advancing dataDst/dataSrc themselves would carry the
+    // previous iteration's offset into the next one whenever a thread
+    // processes more than one index.
+    int dst_idx =
+        ((channel_out * depth_col + d_out) * height_col + h_out) * width_col +
+        w_out;
+    // May be negative for windows that start in the padding; it is only
+    // used after the bounds check below has passed.
+    const int src_base =
+        ((channel_in * depth + d_in) * height + h_in) * width + w_in;
+
+    for (int k = 0; k < filterD; ++k) {
+      for (int i = 0; i < filterH; ++i) {
+        for (int j = 0; j < filterW; ++j) {
+          int d = d_in + k;
+          int h = h_in + i;
+          int w = w_in + j;
+          dataDst[dst_idx] = (d >= 0 && d < depth && h >= 0 && h < height &&
+                              w >= 0 && w < width)
+                                 ? dataSrc[src_base + (k * height + i) * width + j]
+                                 : 0;
+          dst_idx += col_stride;
+        }
+      }
+    }
+  }
+}
+
+// Host entry point for vol2col: derives the number of window positions per
+// axis, then launches keMatrixVol2Col with one work item per
+// (channel, window position) and checks the launch with CHECK_SYNC.
+void hl_matrix_vol2Col(const real* dataSrc,
+                       int channels,
+                       int depth,
+                       int height,
+                       int width,
+                       int filterD,
+                       int filterH,
+                       int filterW,
+                       int strideD,
+                       int strideH,
+                       int strideW,
+                       int paddingD,
+                       int paddingH,
+                       int paddingW,
+                       real* dataDst) {
+  // Window positions along each axis of the padded volume.
+  const int outDepth = (depth + 2 * paddingD - filterD) / strideD + 1;
+  const int outHeight = (height + 2 * paddingH - filterH) / strideH + 1;
+  const int outWidth = (width + 2 * paddingW - filterW) / strideW + 1;
+
+  const int workItems = channels * outDepth * outHeight * outWidth;
+  const int threadsPerBlock = 512;
+  const int blockCount = DIVUP(workItems, threadsPerBlock);
+
+  keMatrixVol2Col<<<blockCount, threadsPerBlock, 0, STREAM_DEFAULT>>>(
+      workItems,
+      dataSrc,
+      dataDst,
+      depth,
+      height,
+      width,
+      filterD,
+      filterH,
+      filterW,
+      strideD,
+      strideH,
+      strideW,
+      paddingD,
+      paddingH,
+      paddingW,
+      outDepth,
+      outHeight,
+      outWidth);
+  CHECK_SYNC("hl_matrix_vol2Col failed");
+}
+
+// Kernel: fold a col buffer back into a 3D volume (col2vol).
+//
+// Each work item `index` owns one element of dataDst. It sums every dataSrc
+// entry whose window covers that element and stores
+//   dataDst[index] = alpha * sum + beta * (previous dataDst[index]).
+// Because each work item writes only its own element, no atomics are used.
+//
+// num_kernels is the number of dataDst elements to process; depth_col,
+// height_col and width_col are the number of window positions per axis.
+__global__ void keMatrixCol2Vol(int num_kernels,
+                                real* dataDst,
+                                const real* dataSrc,
+                                int depth,
+                                int height,
+                                int width,
+                                int filterD,
+                                int filterH,
+                                int filterW,
+                                int strideD,
+                                int strideH,
+                                int strideW,
+                                int paddingD,
+                                int paddingH,
+                                int paddingW,
+                                int depth_col,
+                                int height_col,
+                                int width_col,
+                                real alpha,
+                                real beta) {
+  for (int index = blockIdx.x * blockDim.x + threadIdx.x; index < num_kernels;
+       index += blockDim.x * gridDim.x) {
+    real srcVal = 0;
+    // Previous destination value; it is read even when beta is 0.
+    real dstVal = dataDst[index];
+    // Decompose index into (c, d, h, w) and shift the spatial coordinates
+    // into the padded volume.
+    int w = index % width + paddingW;
+    int h = (index / width) % height + paddingH;
+    int d = (index / width / height) % depth + paddingD;
+    int c = index / width / height / depth;
+    // Range [start, end) of window positions per axis whose window contains
+    // this element.
+    int w_col_start = (w < filterW) ? 0 : (w - filterW) / strideW + 1;
+    int w_col_end = min(w / strideW + 1, width_col);
+    int h_col_start = (h < filterH) ? 0 : (h - filterH) / strideH + 1;
+    int h_col_end = min(h / strideH + 1, height_col);
+    int d_col_start = (d < filterD) ? 0 : (d - filterD) / strideD + 1;
+    int d_col_end = min(d / strideD + 1, depth_col);
+
+    // For window position (d_col, h_col, w_col) the matching filter tap is
+    // (d - d_col * strideD, h - h_col * strideH, w - w_col * strideW).
+    // Expanding the col index for that tap and position gives a term that
+    // does not depend on the window position (offset) plus one linear
+    // coefficient per axis (coeff_*_col), so the inner loop needs only
+    // multiply-adds.
+    int offset = (c * filterD * filterW * filterH + d * filterW * filterH +
+                  h * filterW + w) *
+                 depth_col * height_col * width_col;
+
+    int coeff_d_col =
+        (1 - strideD * filterW * filterH * depth_col) * height_col * width_col;
+    int coeff_h_col =
+        (1 - strideH * filterW * depth_col * height_col) * width_col;
+    int coeff_w_col = (1 - strideW * depth_col * height_col * width_col);
+
+    for (int d_col = d_col_start; d_col < d_col_end; ++d_col) {
+      for (int h_col = h_col_start; h_col < h_col_end; ++h_col) {
+        for (int w_col = w_col_start; w_col < w_col_end; ++w_col) {
+          srcVal += dataSrc[offset + d_col * coeff_d_col + h_col * coeff_h_col +
+                            w_col * coeff_w_col];
+        }
+      }
+    }
+    dataDst[index] = alpha * srcVal + beta * dstVal;
+  }
+}
+
+// Host entry point for col2vol: derives the number of window positions per
+// axis, then launches keMatrixCol2Vol with one work item per element of the
+// destination volume and checks the launch with CHECK_SYNC.
+void hl_matrix_col2Vol(real* dataDst,
+                       int channels,
+                       int depth,
+                       int height,
+                       int width,
+                       int filterD,
+                       int filterH,
+                       int filterW,
+                       int strideD,
+                       int strideH,
+                       int strideW,
+                       int paddingD,
+                       int paddingH,
+                       int paddingW,
+                       const real* dataSrc,
+                       real alpha,
+                       real beta) {
+  // Window positions along each axis of the padded volume.
+  const int outDepth = (depth + 2 * paddingD - filterD) / strideD + 1;
+  const int outHeight = (height + 2 * paddingH - filterH) / strideH + 1;
+  const int outWidth = (width + 2 * paddingW - filterW) / strideW + 1;
+
+  // Unlike vol2col, the work is split per destination element.
+  const int workItems = channels * depth * height * width;
+  const int threadsPerBlock = 512;
+  const int blockCount = DIVUP(workItems, threadsPerBlock);
+
+  keMatrixCol2Vol<<<blockCount, threadsPerBlock, 0, STREAM_DEFAULT>>>(
+      workItems,
+      dataDst,
+      dataSrc,
+      depth,
+      height,
+      width,
+      filterD,
+      filterH,
+      filterW,
+      strideD,
+      strideH,
+      strideW,
+      paddingD,
+      paddingH,
+      paddingW,
+      outDepth,
+      outHeight,
+      outWidth,
+      alpha,
+      beta);
+
+  CHECK_SYNC("hl_matrix_col2Vol failed");
+}
--- a/paddle/framework/tensor_impl.h
+++ b/paddle/framework/tensor_impl.h
@ -117,6 +117,8 @@ inline void Tensor::CopyFrom(const Tensor& src,
    memory::Copy(boost::get<platform::GPUPlace>(dst_place), dst_ptr,
                 boost::get<platform::GPUPlace>(src_place), src_ptr, size, 0);
  }
+  PADDLE_ENFORCE(cudaStreamSynchronize(0),
+                 "cudaStreamSynchronize failed in Tensor CopyFrom");

 #endif
 }
--- a/paddle/function/CMakeLists.txt
+++ b/paddle/function/CMakeLists.txt
@ -21,6 +21,8 @@ if(USE_NNPACK)
  endif()
 endif()

+list(APPEND cpp_files neon/NeonDepthwiseConv.cpp)
+
 add_library(paddle_function STATIC ${cpp_files} ${cu_objs})
 add_dependencies(paddle_function ${external_project_dependencies})
 add_dependencies(paddle_function paddle_proto)
@ -42,11 +44,11 @@ if(WITH_GPU)
    add_simple_unittest(RowConvOpTest)
    add_simple_unittest(BlockExpandOpTest)
    add_simple_unittest(CropOpTest)
-    add_simple_unittest(DepthwiseConvOpTest)
 endif()

 add_simple_unittest(Im2ColTest)
 add_simple_unittest(GemmConvOpTest)
+add_simple_unittest(DepthwiseConvOpTest)
 endif()

 add_style_check_target(paddle_function ${h_files})
--- a/paddle/function/DepthwiseConvOpTest.cpp
+++ b/paddle/function/DepthwiseConvOpTest.cpp
@ -34,4 +34,13 @@ TEST(DepthwiseConv, BackwardFilter) {
 }
 #endif

+#if defined(__ARM_NEON__) || defined(__ARM_NEON)
+
+// Forward-pass test that is only compiled in NEON builds (see the enclosing
+// #if). Runs the DepthwiseConvolution test helper on CPU with the
+// "GemmConv-CPU" and "NeonDepthwiseConv-CPU" functions; presumably the helper
+// compares the outputs of the two -- confirm against its definition.
+TEST(DepthwiseConv, Forward) {
+  DepthwiseConvolution<DEVICE_TYPE_CPU, DEVICE_TYPE_CPU>(
+      "GemmConv-CPU", "NeonDepthwiseConv-CPU", forward);
+}
+
+#endif
+
 }  // namespace paddle
--- a/paddle/function/Im2Col.h
+++ b/paddle/function/Im2Col.h
@ -16,6 +16,7 @@ limitations under the License. */

 #include "TensorShape.h"
 #include "TensorType.h"
+#include "neon/neon_util.h"

 namespace paddle {

@ -93,4 +94,95 @@ public:
                  int paddingWidth);
 };

+/**
+ * @brief Copy a channels x inputHeight x inputWidth image from src to dest
+ *        while surrounding every channel with zeros: paddingHeight rows above
+ *        and below, paddingWidth elements left and right of each row.
+ *
+ * Border rows are cleared with memset and image rows are copied with memcpy.
+ * Both pointers are advanced linearly, so src and dest are treated as dense
+ * buffers.
+ */
+template <class T>
+struct Padding {
+  static void run(const T* src,
+                  T* dest,
+                  int channels,
+                  int inputHeight,
+                  int inputWidth,
+                  int paddingHeight,
+                  int paddingWidth) {
+    const int paddedWidth = inputWidth + 2 * paddingWidth;
+
+    // Clears the paddingHeight full-width rows above or below a channel.
+    auto zeroBorderRows = [&]() {
+      if (paddingHeight <= 0) return;
+      const int count = paddedWidth * paddingHeight;
+      memset(dest, 0, count * sizeof(T));
+      dest += count;
+    };
+
+    // Writes the paddingWidth zero elements at one end of an image row.
+    auto zeroRowEdge = [&]() {
+      for (int left = paddingWidth; left > 0; --left) {
+        *dest = T(0);
+        ++dest;
+      }
+    };
+
+    for (int channelsLeft = channels; channelsLeft > 0; --channelsLeft) {
+      zeroBorderRows();
+
+      for (int rowsLeft = inputHeight; rowsLeft > 0; --rowsLeft) {
+        zeroRowEdge();
+
+        memcpy(dest, src, inputWidth * sizeof(T));
+        dest += inputWidth;
+        src += inputWidth;
+
+        zeroRowEdge();
+      }
+
+      zeroBorderRows();
+    }
+  }
+};
+
+#if defined(__ARM_NEON__) || defined(__ARM_NEON)
+/**
+ * @brief NEON specialisation of Padding for float.
+ *
+ * Produces the same zero-padded layout as the generic template, but each
+ * image row is copied four floats at a time with vld1q_f32 / vst1q_f32 and
+ * the remaining (inputWidth & 3) floats are copied one by one.
+ */
+template <>
+struct Padding<float> {
+  static void run(const float* src,
+                  float* dest,
+                  int channels,
+                  int inputHeight,
+                  int inputWidth,
+                  int paddingHeight,
+                  int paddingWidth) {
+    const int paddedWidth = inputWidth + 2 * paddingWidth;
+
+    // Clears the paddingHeight full-width rows above or below a channel.
+    auto zeroBorderRows = [&]() {
+      if (paddingHeight <= 0) return;
+      const int count = paddedWidth * paddingHeight;
+      memset(dest, 0, count * sizeof(float));
+      dest += count;
+    };
+
+    // Writes the paddingWidth zero elements at one end of an image row.
+    auto zeroRowEdge = [&]() {
+      for (int left = paddingWidth; left > 0; --left) {
+        *dest = float(0);
+        ++dest;
+      }
+    };
+
+    for (int channelsLeft = channels; channelsLeft > 0; --channelsLeft) {
+      zeroBorderRows();
+
+      for (int rowsLeft = inputHeight; rowsLeft > 0; --rowsLeft) {
+        zeroRowEdge();
+
+        // Vectorised part of the row: groups of four floats.
+        for (int quads = inputWidth >> 2; quads > 0; --quads) {
+          vst1q_f32(dest, vld1q_f32(src));
+          src += 4;
+          dest += 4;
+        }
+        // Scalar tail of the row.
+        for (int tail = inputWidth & 3; tail > 0; --tail) {
+          *dest = *src;
+          ++dest;
+          ++src;
+        }
+
+        zeroRowEdge();
+      }
+
+      zeroBorderRows();
+    }
+  }
+};
+
+#endif
+
 }  // namespace paddle
--- a/paddle/function/neon/NeonDepthwiseConv.cpp
+++ b/paddle/function/neon/NeonDepthwiseConv.cpp
--- a/paddle/function/neon/neon_util.h
+++ b/paddle/function/neon/neon_util.h
@ -0,0 +1,47 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#if defined(__ARM_NEON__) || defined(__ARM_NEON)
+
+#include <arm_neon.h>
+
+namespace paddle {
+
+namespace neon {
+
+/// Loads four floats starting at p with vld1q_f32 after telling the compiler,
+/// via __builtin_assume_aligned, that p is aligned to sizeof(float32x4_t)
+/// bytes. The caller is responsible for that alignment actually holding.
+inline float32x4_t vld1q_f32_aligned(const float* p) {
+  return vld1q_f32(
+      (const float*)__builtin_assume_aligned(p, sizeof(float32x4_t)));
+}
+
+#ifndef __aarch64__
+/// Fallback, compiled only when __aarch64__ is not defined, that returns the
+/// sum of the four lanes of a: the high and low halves are added lane-wise,
+/// then the two partial sums are added pairwise and lane 0 is extracted.
+inline float32_t vaddvq_f32(float32x4_t a) {
+  float32x2_t v = vadd_f32(vget_high_f32(a), vget_low_f32(a));
+  return vget_lane_f32(vpadd_f32(v, v), 0);
+}
+
+/// Fallback, compiled only when __aarch64__ is not defined: extracts lane
+/// `lane` of v and passes it as the scalar operand of vmlaq_n_f32(a, b, ...),
+/// i.e. a multiply-accumulate of b by that scalar onto a.
+/// NOTE(review): vgetq_lane_f32 normally expects a compile-time constant lane
+/// index; here it receives a function parameter -- confirm this builds at all
+/// optimisation levels on the supported toolchains.
+inline float32x4_t vmlaq_laneq_f32(float32x4_t a,
+                                   float32x4_t b,
+                                   float32x4_t v,
+                                   const int lane) {
+  return vmlaq_n_f32(a, b, vgetq_lane_f32(v, lane));
+}
+#endif
+
+}  // namespace neon
+}  // namespace paddle
+
+#endif
--- a/paddle/gserver/layers/Conv3DLayer.cpp
+++ b/paddle/gserver/layers/Conv3DLayer.cpp
@ -0,0 +1,244 @@
+/* Copyright (c) 2016 Baidu, Inc. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "Conv3DLayer.h"
+#include "paddle/utils/Logging.h"
+#include "paddle/utils/Stat.h"
+
+namespace paddle {
+
+REGISTER_LAYER(conv3d, Conv3DLayer);
+
+// Initialises the layer: runs the base-class init, records the per-input GEMM
+// dimensions M_ and K_, wraps every input parameter in a
+// (numFilters_ x filterPixels * filterChannels) Weight, and creates the bias
+// Weight when a bias parameter is configured.
+// Returns false when ConvBaseLayer::init fails, true otherwise.
+bool Conv3DLayer::init(const LayerMap &layerMap,
+                       const ParameterMap &parameterMap) {
+  if (!ConvBaseLayer::init(layerMap, parameterMap)) {
+    return false;
+  }
+
+  int inputIdx = 0;
+  for (auto &input : config_.inputs()) {
+    const ConvConfig &convConf = input.conv_conf();
+    M_.push_back(numFilters_ / convConf.groups());
+    K_.push_back(filterPixels_[inputIdx] * filterChannels_[inputIdx]);
+
+    // One weight matrix per input: a row per filter, a column per filter tap.
+    size_t rows = numFilters_;
+    size_t cols = filterPixels_[inputIdx] * filterChannels_[inputIdx];
+    CHECK_EQ(parameters_[inputIdx]->getSize(), cols * rows);
+    weights_.emplace_back(new Weight(rows, cols, parameters_[inputIdx]));
+
+    ++inputIdx;
+  }
+
+  if (!biasParameter_.get()) {
+    return true;
+  }
+
+  if (sharedBiases_) {
+    // Shared biases: one value per filter.
+    CHECK_EQ(static_cast<size_t>(numFilters_), biasParameter_->getSize());
+    biases_.reset(new Weight(1, numFilters_, biasParameter_));
+  } else {
+    // Unshared biases: one value per output element.
+    biases_.reset(new Weight(1, getSize(), biasParameter_));
+  }
+  return true;
+}
+
+// Recomputes the output depth/height/width for every input, refreshes N_
+// (output volume per input), stores the frame dimensions of the first input
+// on the output argument, and returns the size of one output sample
+// (output volume * numFilters_).
+//
+// All inputs must produce the same output size, because forward() writes every
+// input's result into the same output matrix. The size is therefore assigned
+// rather than accumulated: accumulating doubled the reported size for two
+// inputs and tripped the CHECK for three or more.
+size_t Conv3DLayer::getSize() {
+  CHECK_NE(inputLayers_.size(), 0UL);
+  outputH_.clear();
+  outputW_.clear();
+  outputD_.clear();
+  N_.clear();
+  size_t layerSize = 0;
+  for (size_t i = 0; i < inputLayers_.size(); ++i) {
+    outputW_.push_back(outputSize(
+        imgSizeW_[i], filterSize_[i], padding_[i], stride_[i], true));
+    outputH_.push_back(outputSize(
+        imgSizeH_[i], filterSizeY_[i], paddingY_[i], strideY_[i], true));
+    outputD_.push_back(outputSize(
+        imgSizeD_[i], filterSizeZ_[i], paddingZ_[i], strideZ_[i], true));
+
+    N_.push_back(outputD_[i] * outputH_[i] * outputW_[i]);
+    const size_t inputLayerSize = N_[i] * size_t(numFilters_);
+    CHECK(layerSize == 0 || inputLayerSize == layerSize);
+    layerSize = inputLayerSize;
+  }
+  getOutput().setFrameHeight(outputH_[0]);
+  getOutput().setFrameWidth(outputW_[0]);
+  getOutput().setFrameDepth(outputD_[0]);
+  return layerSize;
+}
+
+// Forward pass. For every input and every sample in the batch, the input
+// volume is expanded into colBuf_ with vol2Col and multiplied, group by group,
+// with the corresponding rows of the weight matrix; the product is written
+// into that sample's row of the output. Bias and activation are applied last.
+void Conv3DLayer::forward(PassType passType) {
+  Layer::forward(passType);
+
+  // The batch size is taken from the first input; the output is
+  // batchSize x getSize().
+  int batchSize = inputLayers_[0]->getOutputValue()->getHeight();
+  int outWidth = getSize();
+  resetOutput(batchSize, outWidth);
+
+  for (size_t i = 0; i != inputLayers_.size(); ++i) {
+    REGISTER_TIMER_INFO("FwdConv3D", getName().c_str());
+    const MatrixPtr &inMat = getInputValue(i);
+    const MatrixPtr &outMat = getOutputValue();
+    int M = M_[i];
+    int N = N_[i];
+    int K = K_[i];
+    // Column buffer holds all groups: (K * groups) rows by N columns.
+    Matrix::resizeOrCreate(colBuf_, K * groups_[i], N, false, useGpu_);
+    MatrixPtr wMat = weights_[i]->getW();
+    for (int n = 0; n < batchSize; ++n) {
+      // Expand sample n of input i into colBuf_.
+      colBuf_->vol2Col(inMat->getData() + n * inMat->getStride(),
+                       channels_[i],
+                       imgSizeD_[i],
+                       imgSizeH_[i],
+                       imgSizeW_[i],
+                       filterSizeZ_[i],
+                       filterSizeY_[i],
+                       filterSize_[i],
+                       strideZ_[i],
+                       strideY_[i],
+                       stride_[i],
+                       paddingZ_[i],
+                       paddingY_[i],
+                       padding_[i]);
+
+      // View sample n of the output as a (groups * M) x N matrix.
+      real *outData = outMat->getData() + n * outMat->getStride();
+      MatrixPtr outMatSub =
+          Matrix::create(outData, groups_[i] * M, N, false, useGpu_);
+      for (int g = 0; g < groups_[i]; g++) {
+        MatrixPtr wMatSub = wMat->subMatrix(g * M, M);
+        MatrixPtr in = colBuf_->subMatrix(g * K, K);
+        MatrixPtr out = outMatSub->subMatrix(g * M, M);
+        // The trailing (1.0, 1.0) presumably makes mul accumulate into `out`,
+        // so several inputs sum into one output -- confirm in Matrix::mul.
+        out->mul(*wMatSub, *in, 1.0, 1.0);
+      }
+    }
+  }
+  if (nullptr != this->biasParameter_) {
+    REGISTER_TIMER_INFO("FwBiasTimer", getName().c_str());
+    this->addBias();
+  }
+  forwardActivation();
+}
+
+// Backward pass: back-propagates through the activation, then updates the
+// bias gradient (when a bias gradient buffer exists) and, for every input,
+// the weight gradient and the input gradient (each only when its buffer
+// exists). incUpdate(callback) is invoked on every touched parameter.
+void Conv3DLayer::backward(const UpdateCallback &callback) {
+  backwardActivation();
+
+  if (biases_ && biases_->getWGrad()) {
+    bpropBiases();
+    biases_->getParameterPtr()->incUpdate(callback);
+  }
+
+  for (size_t i = 0; i != inputLayers_.size(); ++i) {
+    REGISTER_TIMER_INFO("BwdConv3D", getName().c_str());
+    if (weights_[i]->getWGrad()) {
+      bpropWeights(i);
+    }
+    if (getInputGrad(i)) {
+      bpropData(i);
+    }
+    // The weight parameter is notified even when no gradient was computed.
+    REGISTER_TIMER_INFO("WeightUpdate", getName().c_str());
+    weights_[i]->getParameterPtr()->incUpdate(callback);
+  }
+}
+
+// Computes the weight gradient of input i. For every sample the input volume
+// is expanded into colBuf_ with vol2Col, and for every group the product of
+// the output gradient with the transposed column buffer is passed to mul on
+// the matching rows of the weight gradient.
+void Conv3DLayer::bpropWeights(int i) {
+  int M = M_[i];
+  int N = N_[i];
+  int K = K_[i];
+  const MatrixPtr &inMat = getInputValue(i);
+  Matrix::resizeOrCreate(colBuf_, K * groups_[i], N, false, useGpu_);
+  MatrixPtr wGradMat = weights_[i]->getWGrad();
+  int batchSize = inputLayers_[0]->getOutputValue()->getHeight();
+  for (int n = 0; n < batchSize; ++n) {
+    // Expand sample n of input i into colBuf_.
+    colBuf_->vol2Col(inMat->getData() + n * inMat->getStride(),
+                     channels_[i],
+                     imgSizeD_[i],
+                     imgSizeH_[i],
+                     imgSizeW_[i],
+                     filterSizeZ_[i],
+                     filterSizeY_[i],
+                     filterSize_[i],
+                     strideZ_[i],
+                     strideY_[i],
+                     stride_[i],
+                     paddingZ_[i],
+                     paddingY_[i],
+                     padding_[i]);
+
+    // View sample n of the output gradient as a (groups * M) x N matrix.
+    real *outGradData =
+        getOutputGrad()->getData() + n * getOutputGrad()->getStride();
+    MatrixPtr outGradSub =
+        Matrix::create(outGradData, groups_[i] * M, N, false, useGpu_);
+    for (int g = 0; g < groups_[i]; ++g) {
+      MatrixPtr inMatSub = colBuf_->subMatrix(g * K, K);
+      MatrixPtr outG = outGradSub->subMatrix(g * M, M);
+      MatrixPtr wGradSub = wGradMat->subMatrix(g * M, M);
+      // (1.0, 1.0) presumably accumulates over samples -- confirm in
+      // Matrix::mul.
+      wGradSub->mul(*outG, *(inMatSub->getTranspose()), 1.0, 1.0);
+    }
+  }
+}
+
+// Computes the gradient with respect to input i. For every sample and group
+// the transposed weights are multiplied with the output gradient into
+// colBuf_, which is then folded back into the input-gradient volume with
+// col2Vol.
+void Conv3DLayer::bpropData(int i) {
+  int M = M_[i];
+  int N = N_[i];
+  int K = K_[i];
+  Matrix::resizeOrCreate(colBuf_, K * groups_[i], N, false, useGpu_);
+  MatrixPtr wMat = weights_[i]->getW();
+  int batchSize = inputLayers_[0]->getOutputValue()->getHeight();
+  for (int n = 0; n < batchSize; ++n) {
+    real *outGradData =
+        getOutputGrad()->getData() + n * getOutputGrad()->getStride();
+    real *preGradData =
+        getInputGrad(i)->getData() + n * getInputGrad(i)->getStride();
+    // View sample n of the output gradient as a (groups * M) x N matrix.
+    MatrixPtr outGradSub =
+        Matrix::create(outGradData, M * groups_[i], N, false, useGpu_);
+    for (int g = 0; g < groups_[i]; ++g) {
+      MatrixPtr wMatSub = wMat->subMatrix(g * M, M);
+      MatrixPtr outG = outGradSub->subMatrix(g * M, M);
+      MatrixPtr inGradMatSub = colBuf_->subMatrix(g * K, K);
+      // (1.0, 0.0) presumably overwrites colBuf_ instead of accumulating --
+      // confirm in Matrix::mul.
+      inGradMatSub->mul(*(wMatSub->getTranspose()), *outG, 1.0, 0.0);
+    }
+    // The trailing 1.0, 1.0 are presumably the alpha/beta scale factors, so
+    // the result is added onto the existing input gradient -- confirm in
+    // Matrix::col2Vol.
+    colBuf_->col2Vol(preGradData,
+                     channels_[i],
+                     imgSizeD_[i],
+                     imgSizeH_[i],
+                     imgSizeW_[i],
+                     filterSizeZ_[i],
+                     filterSizeY_[i],
+                     filterSize_[i],
+                     strideZ_[i],
+                     strideY_[i],
+                     stride_[i],
+                     paddingZ_[i],
+                     paddingY_[i],
+                     padding_[i],
+                     1.0,
+                     1.0);
+  }
+}
+
+// Collects the output gradient into the bias gradient, using the shared or
+// the per-element collection routine depending on sharedBiases_.
+void Conv3DLayer::bpropBiases() {
+  MatrixPtr outputGrad = getOutputGrad();
+  if (!this->sharedBiases_) {
+    biases_->getWGrad()->collectBias(*outputGrad, 1.0f);
+    return;
+  }
+  biases_->getWGrad()->collectSharedBias(*outputGrad, 1.0f);
+}
+
+// Adds the bias weights to the layer output, using the shared or the
+// per-element routine depending on sharedBiases_.
+void Conv3DLayer::addBias() {
+  MatrixPtr output = getOutputValue();
+  if (!this->sharedBiases_) {
+    output->addBias(*(biases_->getW()), 1.0f);
+    return;
+  }
+  output->addSharedBias(*(biases_->getW()), 1.0f);
+}
+
+}  // namespace paddle
--- a/paddle/gserver/layers/Conv3DLayer.h
+++ b/paddle/gserver/layers/Conv3DLayer.h
@ -0,0 +1,51 @@
+/* Copyright (c) 2016 Baidu, Inc. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+#include <vector>
+#include "ConvBaseLayer.h"
+#include "paddle/math/MathUtils.h"
+#include "paddle/math/Matrix.h"
+
+namespace paddle {
+
+/**
+ * @brief 3D convolution layer (registered as "conv3d").
+ * Each input volume is expanded into a column buffer (vol2Col) and the
+ * convolution is computed with matrix multiplications, one per group.
+ */
+class Conv3DLayer : public ConvBaseLayer {
+public:
+  explicit Conv3DLayer(const LayerConfig& config) : ConvBaseLayer(config) {}
+  ~Conv3DLayer() {}
+
+  /// Creates the per-input weights and the bias; returns false when the
+  /// base-class initialisation fails.
+  bool init(const LayerMap& layerMap, const ParameterMap& parameterMap);
+
+  void forward(PassType passType);
+  /// Adds the (shared or per-element) bias to the output.
+  void addBias();
+  void backward(const UpdateCallback& callback);
+  /// Accumulates the bias gradient from the output gradient.
+  void bpropBiases();
+  /// Computes the gradient with respect to input i.
+  void bpropData(int i);
+  /// Computes the gradient with respect to the weights of input i.
+  void bpropWeights(int i);
+  /// Recomputes the output dimensions and returns the size of one output
+  /// sample.
+  size_t getSize();
+
+protected:
+  // Figure out the dimensions for individual gemms (one entry per input).
+  IntV M_;  /// numFilters_ / groups
+  IntV N_;  /// outputD_ * outputH_ * outputW_
+  IntV K_;  /// filterChannels_ * filterSizeZ_ * filterSizeY_ * filterSize_
+  /// Scratch buffer of (K * groups) x N filled by vol2Col / read by col2Vol.
+  MatrixPtr colBuf_;
+};
+
+}  // namespace paddle
--- a/paddle/gserver/layers/ConvBaseLayer.cpp
+++ b/paddle/gserver/layers/ConvBaseLayer.cpp
@ -38,7 +38,6 @@ bool ConvBaseLayer::init(const LayerMap& layerMap,
    strideY_.push_back(conf.stride_y());
    dilationY_.push_back(conf.dilation_y());
    filterSizeY_.push_back(conf.filter_size_y());
-    filterPixels_.push_back(filterSize_.back() * filterSizeY_.back());
    channels_.push_back(conf.channels());
    imgSizeH_.push_back(conf.has_img_size_y() ? conf.img_size_y()
                                              : conf.img_size());
@ -47,31 +46,20 @@ bool ConvBaseLayer::init(const LayerMap& layerMap,
    filterChannels_.push_back(conf.filter_channels());
    outputH_.push_back(conf.has_output_y() ? conf.output_y() : conf.output_x());
    outputW_.push_back(conf.output_x());
+
+    paddingZ_.push_back(conf.padding_z());
+    strideZ_.push_back(conf.stride_z());
+    filterSizeZ_.push_back(conf.filter_size_z());
+    imgSizeD_.push_back(conf.img_size_z());
+    outputD_.push_back(conf.output_z());
+    filterPixels_.push_back(filterSize_.back() * filterSizeY_.back() *
+                            filterSizeZ_.back());
  }

  CHECK(inputLayers_.size() == parameters_.size());
-  for (size_t i = 0; i < inputLayers_.size(); i++) {
-    size_t height, width;
-    height = filterPixels_[i] * filterChannels_[i];
-    width = (!isDeconv_) ? numFilters_ : channels_[i];
-
-    // create a new weight
-    CHECK_EQ(parameters_[i]->getSize(), width * height);
-    Weight* w = new Weight(height, width, parameters_[i]);
-    weights_.emplace_back(w);
-  }

-  /* initialize the biases_ */
-  if (biasParameter_.get()) {
-    if (sharedBiases_) {
-      CHECK_EQ((size_t)numFilters_, biasParameter_->getSize());
-      biases_ =
-          std::unique_ptr<Weight>(new Weight(numFilters_, 1, biasParameter_));
-    } else {
-      biases_ =
-          std::unique_ptr<Weight>(new Weight(getSize(), 1, biasParameter_));
-    }
-  }
+  // create new weights_ in derived class
+  // create new biases_ in derived class

  // default caffe model
  caffeMode_ = true;
--- a/paddle/gserver/layers/ConvBaseLayer.h
+++ b/paddle/gserver/layers/ConvBaseLayer.h
@ -62,6 +62,13 @@ protected:
  IntV outputH_;
  /// The spatial dimensions of output feature map width.
  IntV outputW_;
+
+  IntV outputD_;
+  IntV imgSizeD_;
+  IntV filterSizeZ_;
+  IntV strideZ_;
+  IntV paddingZ_;
+
  /// Group size, refer to grouped convolution in
  /// Alex Krizhevsky's paper: when group=2, the first half of the
  /// filters are only connected to the first half of the input channels,
--- a/paddle/gserver/layers/CostLayer.cpp
+++ b/paddle/gserver/layers/CostLayer.cpp
@ -572,13 +572,8 @@ void MultiBinaryLabelCrossEntropy::backwardImp(Matrix& output,
  }
 }

-//
-// Huber loss for robust 2-classes classification
-//
-REGISTER_LAYER(huber, HuberTwoClass);
-
-bool HuberTwoClass::init(const LayerMap& layerMap,
-                         const ParameterMap& parameterMap) {
+bool HuberCost::init(const LayerMap& layerMap,
+                     const ParameterMap& parameterMap) {
  CostLayer::init(layerMap, parameterMap);
  if (useGpu_) {
    tmpCpuInput_.reserve(inputLayers_.size());
@ -589,7 +584,7 @@ bool HuberTwoClass::init(const LayerMap& layerMap,
  return true;
 }

-void HuberTwoClass::forwardImp(Matrix& output, Argument& label, Matrix& cost) {
+void HuberCost::forwardImp(Matrix& output, Argument& label, Matrix& cost) {
  if (useGpu_) {
    for (size_t i = 0; i < inputLayers_.size(); i++) {
      tmpCpuInput_[i].resizeAndCopyFrom(
@ -597,13 +592,87 @@ void HuberTwoClass::forwardImp(Matrix& output, Argument& label, Matrix& cost) {
    }
    hl_stream_synchronize(HPPL_STREAM_DEFAULT);
  }
-  forwardImpIn(output, label, cost);
 }

-void HuberTwoClass::forwardImpIn(Matrix& output,
-                                 Argument& label,
-                                 Matrix& target) {
+//
+// Huber loss for robust regression.
+//
+REGISTER_LAYER(huber_regression, HuberRegressionLoss);
+
+// Initialises the Huber regression cost: runs the HuberCost initialisation
+// and reads the threshold delta_ from the layer config.
+// Returns false when the base-class initialisation reports failure (the
+// result used to be ignored), true otherwise.
+bool HuberRegressionLoss::init(const LayerMap& layerMap,
+                               const ParameterMap& parameterMap) {
+  if (!HuberCost::init(layerMap, parameterMap)) return false;
+  delta_ = config_.delta();
+  return true;
+}
+
+// Computes the per-sample Huber regression cost and stores it in target
+// (numSamples x 1).
+//
+// With a = |label - output| for one element, the element contributes
+// a * a / 2 when a <= delta_ and delta_ * (a - delta_ / 2) otherwise; the
+// contributions of the `dim` columns of a sample are summed.
+// In GPU mode the values are read from the CPU copies held in tmpCpuInput_
+// (filled by HuberCost::forwardImp).
+void HuberRegressionLoss::forwardImp(Matrix& output,
+                                     Argument& label,
+                                     Matrix& target) {
+  HuberCost::forwardImp(output, label, target);
+  size_t numSamples = target.getHeight();
+  size_t dim = output.getWidth();
+  CHECK(label.value);
+  CHECK_EQ((*label.value).getHeight(), numSamples);
+  CHECK_EQ(output.getHeight(), numSamples);
+  CHECK_EQ(dim, (*label.value).getWidth());
+  CHECK_EQ(target.getWidth(), (size_t)1);
+
+  real* out = useGpu_ ? tmpCpuInput_[0].value->getData() : output.getData();
+  real* lbl =
+      useGpu_ ? tmpCpuInput_[1].value->getData() : (*label.value).getData();
+  std::vector<real> cost(numSamples, 0);
+  for (size_t i = 0; i < numSamples; ++i) {
+    for (size_t j = 0; j < dim; ++j) {
+      // Keep the flat index in size_t: narrowing it to int could wrap for
+      // very large matrices.
+      const size_t index = i * dim + j;
+      real a = std::abs(lbl[index] - out[index]);
+      if (a <= delta_)
+        cost[i] += a * a / 2;
+      else
+        cost[i] += delta_ * (a - delta_ / 2);
+    }
+  }
+  target.copyFrom(cost.data(), numSamples);
+}
+
+// Accumulates the gradient of the Huber regression cost with respect to the
+// output into the gradient buffer.
+//
+// With a = label - output for one element, the gradient is -a when
+// |a| <= delta_, otherwise -delta_ for a > 0 and +delta_ for a <= 0.
+// In GPU mode the computation runs on the CPU copies in tmpCpuInput_ and the
+// resulting gradient is copied back into outputG.
+void HuberRegressionLoss::backwardImp(Matrix& output,
+                                      Argument& label,
+                                      Matrix& outputG) {
+  size_t numSamples = output.getHeight();
+  size_t dim = output.getWidth();
+  real* out = useGpu_ ? tmpCpuInput_[0].value->getData() : output.getData();
+  real* lbl =
+      useGpu_ ? tmpCpuInput_[1].value->getData() : (*label.value).getData();
+  real* grad = useGpu_ ? tmpCpuInput_[0].grad->getData() : outputG.getData();
+  for (size_t i = 0; i < numSamples; ++i) {
+    for (size_t j = 0; j < dim; ++j) {
+      // Keep the flat index in size_t: narrowing it to int could wrap for
+      // very large matrices.
+      const size_t index = i * dim + j;
+      real a = lbl[index] - out[index];
+      if (std::abs(a) <= delta_)
+        grad[index] += -a;
+      else
+        grad[index] += a > 0 ? -delta_ : delta_;
+    }
+  }
+  if (useGpu_) outputG.copyFrom(grad, numSamples * dim);
+}
+
+//
+// Huber loss for robust 2-classes classification
+//
+REGISTER_LAYER(huber_classification, HuberTwoClassification);
+
+// Initialises the two-class Huber cost; it has no state of its own, so this
+// simply forwards to HuberCost::init and returns its result.
+bool HuberTwoClassification::init(const LayerMap& layerMap,
+                                  const ParameterMap& parameterMap) {
+  return HuberCost::init(layerMap, parameterMap);
+}
+
+void HuberTwoClassification::forwardImp(Matrix& output,
+                                        Argument& label,
+                                        Matrix& target) {
+  HuberCost::forwardImp(output, label, target);
  size_t numSamples = target.getHeight();
+  CHECK(label.ids);
  CHECK_EQ((*label.ids).getSize(), numSamples);
  CHECK_EQ(output.getHeight(), numSamples);
  CHECK_EQ(output.getWidth(), (size_t)1);
@ -611,47 +680,35 @@ void HuberTwoClass::forwardImpIn(Matrix& output,

  real* out = useGpu_ ? tmpCpuInput_[0].value->getData() : output.getData();
  int* lbl = useGpu_ ? tmpCpuInput_[1].ids->getData() : (*label.ids).getData();
-  std::vector<real> cost(numSamples);
+  std::vector<real> cost(numSamples, 0);
  for (size_t i = 0; i < numSamples; ++i) {
    int y = 2 * lbl[i] - 1;
-    if (out[i] * y < -1)
-      cost[i] = -4 * out[i] * y;
-    else if (out[i] * y < 1)
-      cost[i] = (1 - out[i] * y) * (1 - out[i] * y);
-    else
-      cost[i] = 0;
+    real a = out[i] * y;
+    if (a < -1)
+      cost[i] = -4 * a;
+    else if (a < 1)
+      cost[i] = (1 - a) * (1 - a);
  }
  target.copyFrom(cost.data(), numSamples);
 }

-void HuberTwoClass::backwardImp(Matrix& outputValue,
-                                Argument& label,
-                                Matrix& outputGrad) {
-  if (useGpu_) {
-    backwardImpIn(
-        *tmpCpuInput_[0].value, tmpCpuInput_[1], *tmpCpuInput_[0].grad);
-    outputGrad.copyFrom(*tmpCpuInput_[0].grad);
-  } else {
-    backwardImpIn(outputValue, label, outputGrad);
-  }
-}
-
-void HuberTwoClass::backwardImpIn(Matrix& output,
-                                  Argument& label,
-                                  Matrix& outputG) {
+void HuberTwoClassification::backwardImp(Matrix& output,
+                                         Argument& label,
+                                         Matrix& outputG) {
  size_t numSamples = output.getHeight();
-  real* out = output.getData();
-  real* grad = outputG.getData();
-  int* lbl = (*label.ids).getData();
+  real* out = useGpu_ ? tmpCpuInput_[0].value->getData() : output.getData();
+  int* lbl = useGpu_ ? tmpCpuInput_[1].ids->getData() : (*label.ids).getData();
+  real* grad = useGpu_ ? tmpCpuInput_[0].grad->getData() : outputG.getData();
  for (size_t i = 0; i < numSamples; ++i) {
    int y = 2 * lbl[i] - 1;
-    if (y * out[i] < -1)
+    real a = out[i] * y;
+    if (a < -1)
      grad[i] += -4 * y;
-    else if (y * out[i] < 1)
-      grad[i] += -2 * (1 - y * out[i]) * y;
+    else if (a < 1)
+      grad[i] += -2 * (1 - a) * y;
  }
+  if (useGpu_) outputG.copyFrom(grad, numSamples);
 }
-
 /**
 * This cost layer compute the sum of its input as loss.
 * \f[
--- a/Show More
+++ b/Show More