Merge branch 'develop' of https://github.com/PaddlePaddle/Paddle into fix-3736
commit e9cc32820d
@@ -1,25 +0,0 @@
Debian Package installation guide
=================================

PaddlePaddle supports installation from a :code:`deb` package. The installation of this :code:`deb` package is tested on Ubuntu 14.04, but it should support other Debian-based Linux distributions as well.

There are four versions of the Debian package: :code:`cpu`, :code:`gpu`, :code:`cpu-noavx`, and :code:`gpu-noavx`. The :code:`noavx` versions are for CPUs that do not support the :code:`AVX` instruction set. The :code:`deb` packages can be downloaded from: https://github.com/baidu/Paddle/releases/

After downloading a PaddlePaddle deb package, you can install it with :code:`gdebi`:

.. code-block:: bash

    gdebi paddle-*.deb

If :code:`gdebi` is not installed, you can install it with :code:`sudo apt-get install gdebi`.

Alternatively, you can use the following commands to install PaddlePaddle:

.. code-block:: bash

    dpkg -i paddle-*.deb
    apt-get install -f

If you use a GPU version of the deb package, you need to install the CUDA toolkit and cuDNN, and set the related environment variables (such as :code:`LD_LIBRARY_PATH`) first. It is normal for :code:`dpkg -i` to report missing-dependency errors; running :code:`apt-get install -f` afterwards installs the dependencies and completes the PaddlePaddle installation.
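For the GPU package, the environment setup typically looks like the following sketch. The paths below are an assumption based on a default CUDA installation under :code:`/usr/local/cuda`; adjust them to your system.

.. code-block:: bash

    # Assumed default CUDA install location; adjust to your system.
    export PATH=/usr/local/cuda/bin:$PATH
    export LD_LIBRARY_PATH=/usr/local/cuda/lib64:$LD_LIBRARY_PATH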
@@ -0,0 +1,47 @@
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */

#pragma once

#if defined(__ARM_NEON__) || defined(__ARM_NEON)

#include <arm_neon.h>

namespace paddle {

namespace neon {

// Load four floats, telling the compiler the pointer is 16-byte aligned.
inline float32x4_t vld1q_f32_aligned(const float* p) {
  return vld1q_f32(
      (const float*)__builtin_assume_aligned(p, sizeof(float32x4_t)));
}

#ifndef __aarch64__
// These intrinsics exist only on AArch64; provide equivalents for 32-bit ARM.
// Horizontal add of all four lanes of a.
inline float32_t vaddvq_f32(float32x4_t a) {
  float32x2_t v = vadd_f32(vget_high_f32(a), vget_low_f32(a));
  return vget_lane_f32(vpadd_f32(v, v), 0);
}

// Computes a + b * v[lane], broadcasting the selected lane of v.
inline float32x4_t vmlaq_laneq_f32(float32x4_t a,
                                   float32x4_t b,
                                   float32x4_t v,
                                   const int lane) {
  return vmlaq_n_f32(a, b, vgetq_lane_f32(v, lane));
}
#endif

}  // namespace neon
}  // namespace paddle

#endif
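// ---------------------------------------------------------------------------
// Usage sketch (hypothetical, not part of the patch): vaddvq_f32 and
// vmlaq_laneq_f32 are AArch64-only intrinsics, which is why the
// #ifndef __aarch64__ block above backfills them for 32-bit ARM. Assuming the
// header above is included and `data` points at 16-byte-aligned storage:
#if defined(__ARM_NEON__) || defined(__ARM_NEON)
float sumFour(const float* data) {
  using namespace paddle::neon;
  float32x4_t v = vld1q_f32_aligned(data);  // aligned 4-float load
  // Native intrinsic on AArch64; the fallback defined above on 32-bit ARM.
  return vaddvq_f32(v);
}
#endif
// ---------------------------------------------------------------------------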
@@ -0,0 +1,244 @@
/* Copyright (c) 2016 Baidu, Inc. All Rights Reserve.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */

#include "Conv3DLayer.h"
#include "paddle/utils/Logging.h"
#include "paddle/utils/Stat.h"

namespace paddle {

REGISTER_LAYER(conv3d, Conv3DLayer);

bool Conv3DLayer::init(const LayerMap &layerMap,
                       const ParameterMap &parameterMap) {
  if (!ConvBaseLayer::init(layerMap, parameterMap)) return false;
  int index = 0;
  for (auto &inputConfig : config_.inputs()) {
    const ConvConfig &conf = inputConfig.conv_conf();
    M_.push_back(numFilters_ / conf.groups());
    K_.push_back(filterPixels_[index] * filterChannels_[index]);

    // create a new weight
    size_t height, width;
    width = filterPixels_[index] * filterChannels_[index];
    height = numFilters_;
    CHECK_EQ(parameters_[index]->getSize(), width * height);
    Weight *w = new Weight(height, width, parameters_[index]);
    weights_.emplace_back(w);
    ++index;
  }
  if (biasParameter_.get()) {
    if (sharedBiases_) {
      CHECK_EQ((size_t)numFilters_, biasParameter_->getSize());
      biases_ =
          std::unique_ptr<Weight>(new Weight(1, numFilters_, biasParameter_));
    } else {
      biases_ =
          std::unique_ptr<Weight>(new Weight(1, getSize(), biasParameter_));
    }
  }
  return true;
}
size_t Conv3DLayer::getSize() {
  CHECK_NE(inputLayers_.size(), 0UL);
  outputH_.clear();
  outputW_.clear();
  outputD_.clear();
  N_.clear();
  size_t layerSize = 0;
  for (size_t i = 0; i < inputLayers_.size(); ++i) {
    outputW_.push_back(outputSize(
        imgSizeW_[i], filterSize_[i], padding_[i], stride_[i], true));
    outputH_.push_back(outputSize(
        imgSizeH_[i], filterSizeY_[i], paddingY_[i], strideY_[i], true));
    outputD_.push_back(outputSize(
        imgSizeD_[i], filterSizeZ_[i], paddingZ_[i], strideZ_[i], true));

    N_.push_back(outputD_[i] * outputH_[i] * outputW_[i]);
    CHECK(layerSize == 0 || N_[i] * size_t(numFilters_) == layerSize);
    layerSize += N_[i] * numFilters_;
  }
  getOutput().setFrameHeight(outputH_[0]);
  getOutput().setFrameWidth(outputW_[0]);
  getOutput().setFrameDepth(outputD_[0]);
  return layerSize;
}
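// ---------------------------------------------------------------------------
// Reference sketch (not part of the patch): outputSize() above comes from
// paddle/math/MathUtils.h, and the trailing `true` selects caffe-mode
// arithmetic, which we assume to be the standard convolution formula below
// (floor division).
int convOutputSizeSketch(int imageSize, int filterSize, int padding,
                         int stride) {
  return (imageSize - filterSize + 2 * padding) / stride + 1;
}
// Example: imageSize 32, filterSize 3, padding 1, stride 1 -> 32 outputs.
// ---------------------------------------------------------------------------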
void Conv3DLayer::forward(PassType passType) {
  Layer::forward(passType);

  int batchSize = inputLayers_[0]->getOutputValue()->getHeight();
  int outWidth = getSize();
  resetOutput(batchSize, outWidth);

  for (size_t i = 0; i != inputLayers_.size(); ++i) {
    REGISTER_TIMER_INFO("FwdConv3D", getName().c_str());
    const MatrixPtr &inMat = getInputValue(i);
    const MatrixPtr &outMat = getOutputValue();
    int M = M_[i];
    int N = N_[i];
    int K = K_[i];
    Matrix::resizeOrCreate(colBuf_, K * groups_[i], N, false, useGpu_);
    MatrixPtr wMat = weights_[i]->getW();
    for (int n = 0; n < batchSize; ++n) {
      // expand one input volume into the column buffer (3D im2col)
      colBuf_->vol2Col(inMat->getData() + n * inMat->getStride(),
                       channels_[i],
                       imgSizeD_[i],
                       imgSizeH_[i],
                       imgSizeW_[i],
                       filterSizeZ_[i],
                       filterSizeY_[i],
                       filterSize_[i],
                       strideZ_[i],
                       strideY_[i],
                       stride_[i],
                       paddingZ_[i],
                       paddingY_[i],
                       padding_[i]);

      real *outData = outMat->getData() + n * outMat->getStride();
      MatrixPtr outMatSub =
          Matrix::create(outData, groups_[i] * M, N, false, useGpu_);
      for (int g = 0; g < groups_[i]; g++) {
        // per-group GEMM: out[M x N] += W[M x K] * col[K x N]
        MatrixPtr wMatSub = wMat->subMatrix(g * M, M);
        MatrixPtr in = colBuf_->subMatrix(g * K, K);
        MatrixPtr out = outMatSub->subMatrix(g * M, M);
        out->mul(*wMatSub, *in, 1.0, 1.0);
      }
    }
  }
  if (nullptr != this->biasParameter_) {
    REGISTER_TIMER_INFO("FwBiasTimer", getName().c_str());
    this->addBias();
  }
  forwardActivation();
}
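// ---------------------------------------------------------------------------
// Simplified sketch (hypothetical helper, not Paddle's implementation) of the
// vol2Col expansion used in forward(): a single-group input volume
// [C, D, H, W] becomes a column matrix [C*fD*fH*fW, outD*outH*outW], so the
// convolution reduces to one GEMM with the [numFilters, C*fD*fH*fW] weights.
// The row/column layout is assumed from the shapes used above.
void vol2colSketch(const float* in, float* col,
                   int C, int D, int H, int W,      // input volume
                   int fD, int fH, int fW,          // filter size
                   int sD, int sH, int sW,          // strides
                   int pD, int pH, int pW,          // paddings
                   int outD, int outH, int outW) {  // output volume
  for (int c = 0; c < C; ++c)
    for (int kd = 0; kd < fD; ++kd)
      for (int kh = 0; kh < fH; ++kh)
        for (int kw = 0; kw < fW; ++kw) {
          int row = ((c * fD + kd) * fH + kh) * fW + kw;
          for (int od = 0; od < outD; ++od)
            for (int oh = 0; oh < outH; ++oh)
              for (int ow = 0; ow < outW; ++ow) {
                int d = od * sD - pD + kd;  // source coordinates
                int h = oh * sH - pH + kh;
                int w = ow * sW - pW + kw;
                int colIdx = (od * outH + oh) * outW + ow;
                col[row * (outD * outH * outW) + colIdx] =
                    (d < 0 || d >= D || h < 0 || h >= H || w < 0 || w >= W)
                        ? 0.0f  // zero padding
                        : in[((c * D + d) * H + h) * W + w];
              }
        }
}
// ---------------------------------------------------------------------------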
void Conv3DLayer::backward(const UpdateCallback &callback) {
  backwardActivation();

  if (biases_ && biases_->getWGrad()) {
    bpropBiases();
    biases_->getParameterPtr()->incUpdate(callback);
  }

  for (size_t i = 0; i != inputLayers_.size(); ++i) {
    REGISTER_TIMER_INFO("BwdConv3D", getName().c_str());
    if (weights_[i]->getWGrad()) {
      bpropWeights(i);
    }
    if (getInputGrad(i)) {
      bpropData(i);
    }
    REGISTER_TIMER_INFO("WeightUpdate", getName().c_str());
    weights_[i]->getParameterPtr()->incUpdate(callback);
  }
}

void Conv3DLayer::bpropWeights(int i) {
  int M = M_[i];
  int N = N_[i];
  int K = K_[i];
  const MatrixPtr &inMat = getInputValue(i);
  Matrix::resizeOrCreate(colBuf_, K * groups_[i], N, false, useGpu_);
  MatrixPtr wGradMat = weights_[i]->getWGrad();
  int batchSize = inputLayers_[0]->getOutputValue()->getHeight();
  for (int n = 0; n < batchSize; ++n) {
    colBuf_->vol2Col(inMat->getData() + n * inMat->getStride(),
                     channels_[i],
                     imgSizeD_[i],
                     imgSizeH_[i],
                     imgSizeW_[i],
                     filterSizeZ_[i],
                     filterSizeY_[i],
                     filterSize_[i],
                     strideZ_[i],
                     strideY_[i],
                     stride_[i],
                     paddingZ_[i],
                     paddingY_[i],
                     padding_[i]);

    real *outGradData =
        getOutputGrad()->getData() + n * getOutputGrad()->getStride();
    MatrixPtr outGradSub =
        Matrix::create(outGradData, groups_[i] * M, N, false, useGpu_);
    for (int g = 0; g < groups_[i]; ++g) {
      // weight gradient: dW[M x K] += dOut[M x N] * col[K x N]^T
      MatrixPtr inMatSub = colBuf_->subMatrix(g * K, K);
      MatrixPtr outG = outGradSub->subMatrix(g * M, M);
      MatrixPtr wGradSub = wGradMat->subMatrix(g * M, M);
      wGradSub->mul(*outG, *(inMatSub->getTranspose()), 1.0, 1.0);
    }
  }
}

void Conv3DLayer::bpropData(int i) {
  int M = M_[i];
  int N = N_[i];
  int K = K_[i];
  Matrix::resizeOrCreate(colBuf_, K * groups_[i], N, false, useGpu_);
  MatrixPtr wMat = weights_[i]->getW();
  int batchSize = inputLayers_[0]->getOutputValue()->getHeight();
  for (int n = 0; n < batchSize; ++n) {
    real *outGradData =
        getOutputGrad()->getData() + n * getOutputGrad()->getStride();
    real *preGradData =
        getInputGrad(i)->getData() + n * getInputGrad(i)->getStride();
    MatrixPtr outGradSub =
        Matrix::create(outGradData, M * groups_[i], N, false, useGpu_);
    for (int g = 0; g < groups_[i]; ++g) {
      // input gradient in column space: dCol[K x N] = W[M x K]^T * dOut[M x N]
      MatrixPtr wMatSub = wMat->subMatrix(g * M, M);
      MatrixPtr outG = outGradSub->subMatrix(g * M, M);
      MatrixPtr inGradMatSub = colBuf_->subMatrix(g * K, K);
      inGradMatSub->mul(*(wMatSub->getTranspose()), *outG, 1.0, 0.0);
    }
    // fold the column gradient back into the input volume (inverse of vol2Col)
    colBuf_->col2Vol(preGradData,
                     channels_[i],
                     imgSizeD_[i],
                     imgSizeH_[i],
                     imgSizeW_[i],
                     filterSizeZ_[i],
                     filterSizeY_[i],
                     filterSize_[i],
                     strideZ_[i],
                     strideY_[i],
                     stride_[i],
                     paddingZ_[i],
                     paddingY_[i],
                     padding_[i],
                     1.0,
                     1.0);
  }
}
void Conv3DLayer::bpropBiases() {
  MatrixPtr outGradMat = getOutputGrad();
  if (this->sharedBiases_) {
    biases_->getWGrad()->collectSharedBias(*outGradMat, 1.0f);
  } else {
    biases_->getWGrad()->collectBias(*outGradMat, 1.0f);
  }
}

void Conv3DLayer::addBias() {
  MatrixPtr outMat = getOutputValue();
  if (this->sharedBiases_) {
    outMat->addSharedBias(*(biases_->getW()), 1.0f);
  } else {
    outMat->addBias(*(biases_->getW()), 1.0f);
  }
}

}  // namespace paddle
@@ -0,0 +1,51 @@
/* Copyright (c) 2016 Baidu, Inc. All Rights Reserve.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */

#pragma once
#include <vector>
#include "ConvBaseLayer.h"
#include "paddle/math/MathUtils.h"
#include "paddle/math/Matrix.h"

namespace paddle {

/**
 * @brief A subclass of the convolution layer.
 * This layer expands the input with vol2Col and uses matrix multiplication
 * to compute the convolution.
 */
class Conv3DLayer : public ConvBaseLayer {
public:
  explicit Conv3DLayer(const LayerConfig& config) : ConvBaseLayer(config) {}
  ~Conv3DLayer() {}

  bool init(const LayerMap& layerMap, const ParameterMap& parameterMap);

  void forward(PassType passType);
  void addBias();
  void backward(const UpdateCallback& callback);
  void bpropBiases();
  void bpropData(int i);
  void bpropWeights(int i);
  size_t getSize();

protected:
  // Figure out the dimensions for individual GEMMs.
  IntV M_;  /// numFilters_ / filter_group_
  IntV N_;  /// outputD_ * outputH_ * outputW_
  IntV K_;  /// filterChannels_ * filterSizeZ_ * filterSizeY_ * filterSize_
  MatrixPtr colBuf_;
};

}  // namespace paddle
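// ---------------------------------------------------------------------------
// Toy illustration (hypothetical numbers, not part of the patch) of how the
// M_/N_/K_ bookkeeping above maps onto the per-group GEMM
// out[M x N] = W[M x K] * col[K x N] performed for every sample:
#include <cstdio>

int main() {
  int numFilters = 64, channels = 3, groups = 1;  // made-up configuration
  int fD = 3, fH = 3, fW = 3;                     // filter depth/height/width
  int outD = 16, outH = 16, outW = 16;            // output volume
  int M = numFilters / groups;
  int K = (channels / groups) * fD * fH * fW;  // filterChannels * filterPixels
  int N = outD * outH * outW;
  printf("per-group GEMM: [%d x %d] * [%d x %d] -> [%d x %d]\n",
         M, K, K, N, M, N);
  return 0;
}
// ---------------------------------------------------------------------------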