Merge branch 'develop' into feature/increase_cpu

8 years ago · ef84ff8657
parent c9fc7ba9f8 03fa1edc20
commit ef84ff8657
83 changed files with 2696 additions and 763 deletions
--- a/benchmark/paddle/image/vgg.py
+++ b/benchmark/paddle/image/vgg.py
@ -13,7 +13,7 @@ define_py_data_sources2(

 settings(
    batch_size=batch_size,
-    learning_rate=0.01 / batch_size,
+    learning_rate=0.001 / batch_size,
    learning_method=MomentumOptimizer(0.9),
    regularization=L2Regularization(0.0005 * batch_size))

--- a/cmake/cblas.cmake
+++ b/cmake/cblas.cmake
@ -1,17 +1,12 @@
 # Find the CBlas and lapack libraries
 #
-# It will search MKL, atlas, OpenBlas, reference-cblas in order.
+# It will search MKLML, atlas, OpenBlas, reference-cblas in order.
 #
 # If any cblas implementation found, the following variable will be set.
-#    CBLAS_PROVIDER  # one of MKL, ATLAS, OPENBLAS, REFERENCE
+#    CBLAS_PROVIDER  # one of MKLML, ATLAS, OPENBLAS, REFERENCE
 #    CBLAS_INC_DIR   # the include directory for cblas.
 #    CBLAS_LIBS      # a list of libraries should be linked by paddle.
 #                    # Each library should be full path to object file.
-#
-# User should set one of MKL_ROOT, ATLAS_ROOT, OPENBLAS_ROOT, REFERENCE_CBLAS_ROOT
-# during cmake. If none of them set, it will try to find cblas implementation in
-# system paths.
-#

 set(CBLAS_FOUND OFF)

@ -30,44 +25,6 @@ if(WITH_MKLML AND MKLML_INC_DIR AND MKLML_LIB)
  return()
 endif()

-## Then find MKL.
-set(INTEL_MKL_ROOT "/opt/intel/mkl" CACHE PATH "Folder contains intel mkl libs")
-set(MKL_ROOT $ENV{MKL_ROOT} CACHE PATH "Folder contains env MKL")
-
-set(MKL_INCLUDE_SEARCH_PATHS
-  ${MKL_ROOT}/include
-  ${INTEL_MKL_ROOT}/include)
-set(MKL_LIB_SEARCH_PATHS
-  ${MKL_ROOT}/lib
-  ${MKL_ROOT}/lib/intel64
-  ${INTEL_MKL_ROOT}/lib
-  ${INTEL_MKL_ROOT}/lib/intel64)
-
-find_path(MKL_INC_DIR mkl.h PATHS
-  ${MKL_INCLUDE_SEARCH_PATHS})
-find_path(MKL_LAPACK_INC_DIR mkl_lapacke.h PATHS
-  ${MKL_INCLUDE_SEARCH_PATHS})
-find_library(MKL_CORE_LIB NAMES mkl_core PATHS
-  ${MKL_LIB_SEARCH_PATHS})
-find_library(MKL_SEQUENTIAL_LIB NAMES mkl_sequential PATHS
-  ${MKL_LIB_SEARCH_PATHS})
-find_library(MKL_INTEL_LP64 NAMES mkl_intel_lp64 PATHS
-  ${MKL_LIB_SEARCH_PATHS})
-
-if(MKL_LAPACK_INC_DIR AND MKL_INC_DIR AND MKL_CORE_LIB AND MKL_SEQUENTIAL_LIB AND MKL_INTEL_LP64)
-  set(CBLAS_FOUND ON)
-  set(CBLAS_PROVIDER MKL)
-  set(CBLAS_INC_DIR ${MKL_INC_DIR} ${MKL_LAPACK_INC_DIR})
-  set(CBLAS_LIBRARIES ${MKL_INTEL_LP64} ${MKL_SEQUENTIAL_LIB} ${MKL_CORE_LIB})
-
-  add_definitions(-DPADDLE_USE_MKL)
-  add_definitions(-DLAPACK_FOUND)
-
-  message(STATUS "Found MKL (include: ${MKL_INC_DIR}, library: ${CBLAS_LIBRARIES})")
-  message(STATUS "Found lapack in MKL (include: ${MKL_LAPACK_INC_DIR})")
-  return()
-endif()
-
 ## Then find atlas.
 set(ATLAS_ROOT $ENV{ATLAS_ROOT} CACHE PATH "Folder contains Atlas")
 set(ATLAS_INCLUDE_SEARCH_PATHS
--- a/cmake/external/mkldnn.cmake
+++ b/cmake/external/mkldnn.cmake
@ -46,16 +46,20 @@ IF(${CBLAS_PROVIDER} STREQUAL "MKLML")
    MESSAGE(STATUS "Build MKLDNN with ${MKLDNN_MKLROOT}")
 ENDIF()

+SET(MKLDNN_CFLAG "${CMAKE_C_FLAGS} -Wno-error=strict-overflow")
+SET(MKLDNN_CXXFLAG "${CMAKE_CXX_FLAGS} -Wno-error=strict-overflow")
 ExternalProject_Add(
    ${MKLDNN_PROJECT}
    ${EXTERNAL_PROJECT_LOG_ARGS}
    DEPENDS             ${MKLDNN_DEPENDS}
    GIT_REPOSITORY      "https://github.com/01org/mkl-dnn.git"
-    GIT_TAG             "v0.10"
+    GIT_TAG             "v0.11"
    PREFIX              ${MKLDNN_SOURCES_DIR}
    UPDATE_COMMAND      ""
    CMAKE_ARGS          -DCMAKE_INSTALL_PREFIX=${MKLDNN_INSTALL_DIR}
    CMAKE_ARGS          -DMKLROOT=${MKLDNN_MKLROOT}
+    CMAKE_ARGS          -DCMAKE_C_FLAGS=${MKLDNN_CFLAG}
+    CMAKE_ARGS          -DCMAKE_CXX_FLAGS=${MKLDNN_CXXFLAG}
    CMAKE_CACHE_ARGS    -DCMAKE_INSTALL_PREFIX:PATH=${MKLDNN_INSTALL_DIR}
                        -DMKLROOT:PATH=${MKLDNN_MKLROOT}
 )
--- a/cmake/external/mklml.cmake
+++ b/cmake/external/mklml.cmake
@ -27,8 +27,8 @@ ENDIF()
 INCLUDE(ExternalProject)

 SET(MKLML_PROJECT       "extern_mklml")
-SET(MKLML_VER           "mklml_lnx_2018.0.20170720")
-SET(MKLML_URL           "https://github.com/01org/mkl-dnn/releases/download/v0.10/${MKLML_VER}.tgz")
+SET(MKLML_VER           "mklml_lnx_2018.0.1.20171007")
+SET(MKLML_URL           "https://github.com/01org/mkl-dnn/releases/download/v0.11/${MKLML_VER}.tgz")
 SET(MKLML_SOURCE_DIR    "${THIRD_PARTY_PATH}/mklml")
 SET(MKLML_DOWNLOAD_DIR  "${MKLML_SOURCE_DIR}/src/${MKLML_PROJECT}")
 SET(MKLML_DST_DIR       "mklml")
--- a/cmake/external/openblas.cmake
+++ b/cmake/external/openblas.cmake
@ -115,7 +115,7 @@ INCLUDE_DIRECTORIES(${CBLAS_INC_DIR})
 # linear algebra libraries for cc_library(xxx SRCS xxx.c DEPS cblas)
 SET(dummyfile ${CMAKE_CURRENT_BINARY_DIR}/cblas_dummy.c)
 FILE(WRITE ${dummyfile} "const char * dummy = \"${dummyfile}\";")
-IF(${CBLAS_PROVIDER} MATCHES MKL)
+IF(${CBLAS_PROVIDER} EQUAL MKLML)
    ADD_LIBRARY(cblas SHARED ${dummyfile})
 ELSE()
    ADD_LIBRARY(cblas STATIC ${dummyfile})
--- a/doc/api/v2/data.rst
+++ b/doc/api/v2/data.rst
@ -2,112 +2,9 @@
 Data Reader Interface and DataSets
 ==================================

+..  toctree::
+    :maxdepth: 1

-DataTypes
-=========
-
-..  automodule:: paddle.v2.data_type
-    :members:
-    :noindex:
-
-DataFeeder
-==========
-
-..  automodule:: paddle.v2.data_feeder
-    :members:
-    :noindex:
-
-Reader
-======
-
-..  automodule:: paddle.v2.reader
-    :members:
-    :noindex:
-
-..  automodule:: paddle.v2.reader.creator
-    :members:
-    :noindex:
-
-minibatch
-=========
-
-..  automodule:: paddle.v2.minibatch
-    :members:
-    :noindex:
-
-Dataset
-=======
-
-..  automodule:: paddle.v2.dataset
-    :members:
-    :noindex:
-
-mnist
-+++++
-
-..  automodule:: paddle.v2.dataset.mnist
-    :members:
-    :noindex:
-
-cifar
-+++++
-
-..  automodule:: paddle.v2.dataset.cifar
-    :members:
-    :noindex:
-
-conll05
-+++++++
-
-..  automodule:: paddle.v2.dataset.conll05
-    :members: get_dict,get_embedding,test
-    :noindex:
-
-imdb
-++++
-
-..  automodule:: paddle.v2.dataset.imdb
-    :members:
-    :noindex:
-
-imikolov
-++++++++
-
-..  automodule:: paddle.v2.dataset.imikolov
-    :members:
-    :noindex:
-
-movielens
-+++++++++
-
-..  automodule:: paddle.v2.dataset.movielens
-    :members:
-    :noindex:
-
-..  autoclass:: paddle.v2.dataset.movielens.MovieInfo
-    :noindex:
-    
-..  autoclass:: paddle.v2.dataset.movielens.UserInfo
-    :noindex:
-
-sentiment
-+++++++++
-
-..  automodule:: paddle.v2.dataset.sentiment
-    :members:
-    :noindex:
-
-uci_housing
-+++++++++++
-
-..  automodule:: paddle.v2.dataset.uci_housing
-    :members:
-    :noindex:
-
-wmt14
-+++++
-
-..  automodule:: paddle.v2.dataset.wmt14
-    :members:
-    :noindex:
-
+    data/data_reader.rst
+    data/image.rst
+    data/dataset.rst
--- a/doc/api/v2/data/data_reader.rst
+++ b/doc/api/v2/data/data_reader.rst
@ -0,0 +1,36 @@
+=====================
+Data Reader Interface
+=====================
+
+
+DataTypes
+=========
+
+..  automodule:: paddle.v2.data_type
+    :members:
+    :noindex:
+
+DataFeeder
+==========
+
+..  automodule:: paddle.v2.data_feeder
+    :members:
+    :noindex:
+
+Reader
+======
+
+..  automodule:: paddle.v2.reader
+    :members:
+    :noindex:
+
+..  automodule:: paddle.v2.reader.creator
+    :members:
+    :noindex:
+
+minibatch
+=========
+
+..  automodule:: paddle.v2.minibatch
+    :members:
+    :noindex:
--- a/doc/api/v2/data/dataset.rst
+++ b/doc/api/v2/data/dataset.rst
@ -0,0 +1,75 @@
+Dataset
+=======
+
+..  automodule:: paddle.v2.dataset
+    :members:
+    :noindex:
+
+mnist
+++++
+
+..  automodule:: paddle.v2.dataset.mnist
+    :members:
+    :noindex:
+
+cifar
+++++
+
+..  automodule:: paddle.v2.dataset.cifar
+    :members:
+    :noindex:
+
+conll05
+++++++
+
+..  automodule:: paddle.v2.dataset.conll05
+    :members: get_dict,get_embedding,test
+    :noindex:
+
+imdb
++++
+
+..  automodule:: paddle.v2.dataset.imdb
+    :members:
+    :noindex:
+
+imikolov
++++++++
+
+..  automodule:: paddle.v2.dataset.imikolov
+    :members:
+    :noindex:
+
+movielens
+++++++++
+
+..  automodule:: paddle.v2.dataset.movielens
+    :members:
+    :noindex:
+
+..  autoclass:: paddle.v2.dataset.movielens.MovieInfo
+    :noindex:
+    
+..  autoclass:: paddle.v2.dataset.movielens.UserInfo
+    :noindex:
+
+sentiment
+++++++++
+
+..  automodule:: paddle.v2.dataset.sentiment
+    :members:
+    :noindex:
+
+uci_housing
+++++++++++
+
+..  automodule:: paddle.v2.dataset.uci_housing
+    :members:
+    :noindex:
+
+wmt14
+++++
+
+..  automodule:: paddle.v2.dataset.wmt14
+    :members:
+    :noindex:
--- a/doc/api/v2/data/image.rst
+++ b/doc/api/v2/data/image.rst
@ -0,0 +1,5 @@
+Image Interface
+===============
+
+..  automodule:: paddle.v2.image
+    :members:
--- a/doc/design/ops/images/LOD-and-shape-changes-during-decoding.jpg
+++ b/doc/design/ops/images/LOD-and-shape-changes-during-decoding.jpg
--- a/doc/design/ops/sequence_decoder.md
+++ b/doc/design/ops/sequence_decoder.md
@ -0,0 +1,245 @@
+# Design: Sequence Decoder Generating LoDTensors
+In tasks such as machine translation and image to text, 
+a [sequence decoder](https://github.com/PaddlePaddle/book/blob/develop/08.machine_translation/README.md) is necessary to generate sequences.
+
+This documentation describes how to implement the sequence decoder as an operator.
+
+## Beam Search based Decoder
+The [beam search algorithm](https://en.wikipedia.org/wiki/Beam_search) is necessary when generating sequences, 
+it is a heuristic search algorithm that explores the paths by expanding the most promising node in a limited set.
+
+In the old version of PaddlePaddle, a C++ class `RecurrentGradientMachine` implements the general sequence decoder based on beam search, 
+due to the complexity, the implementation relays on a lot of special data structures, 
+quite trivial and hard to be customized by users.
+
+There are a lot of heuristic tricks in the sequence generation tasks, 
+so the flexibility of sequence decoder is very important to users.
+
+During PaddlePaddle's refactoring work,
+some new concept is proposed such as [LoDTensor](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/lod_tensor.md) and [TensorArray](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/tensor_array.md) that can better support sequence usage,
+and they can help to make the implementation of beam search based sequence decoder **more transparent and modular** .
+
+For example, the RNN sates, candidates IDs and probabilities of beam search can be represented as `LoDTensors`;
+the selected candidate's IDs in each time step can be stored in a `TensorArray`, and `Packed` to the sentences translated.
+
+## Changing LoD's absolute offset to relative offsets
+The current `LoDTensor` is designed to store levels of variable-length sequences,
+it stores several arrays of integers each represents a level.
+
+The integers in each level represents the begin and end (not inclusive) offset of a sequence **in the underlying tensor**, 
+let's call this format the **absolute-offset LoD** for clear.
+
+The relative-offset LoD can fast retrieve any sequence but fails to represent empty sequences, for example, a two-level LoD is as follows
+```python
+[[0, 3, 9]
+ [0, 2, 3, 3, 3, 9]]
+```
+The first level tells that there are two sequences:
+- the first's offset is `[0, 3)`
+- the second's offset is `[3, 9)`
+
+while on the second level, there are several empty sequences that both begin and end at `3`.
+It is impossible to tell how many empty second-level sequences exist in the first-level sequences.
+
+There are many scenarios that relay on empty sequence representation,
+such as machine translation or image to text, one instance has no translations or the empty candidate set for a prefix.
+
+So let's introduce another format of LoD, 
+it stores **the offsets of the lower level sequences** and is called **relative-offset** LoD.
+
+For example, to represent the same sequences of the above data
+
+```python
+[[0, 3, 6]
+ [0, 2, 3, 3, 3, 9]]
+```
+
+the first level represents that there are two sequences, 
+their offsets in the second-level LoD is `[0, 3)` and `[3, 5)`.
+
+The second level is the same with the relative offset example because the lower level is a tensor.
+It is easy to find out the second sequence in the first-level LoD has two empty sequences.
+
+The following demos are based on relative-offset LoD.
+
+## Usage in a simple machine translation model
+Let's start from a simple machine translation model that is simplified from [machine translation chapter](https://github.com/PaddlePaddle/book/tree/develop/08.machine_translation) to draw a simple blueprint of what a sequence decoder can do and how to use it.
+
+The model has an encoder that learns the semantic vector from a sequence,
+and a decoder which uses the sequence decoder to generate new sentences.
+
+**Encoder**
+```python
+import paddle as pd
+
+dict_size = 8000
+source_dict_size = dict_size
+target_dict_size = dict_size
+word_vector_dim = 128
+encoder_dim = 128
+decoder_dim = 128
+beam_size = 5
+max_length = 120
+
+# encoder
+src_word_id = pd.data(
+    name='source_language_word',
+    type=pd.data.integer_value_sequence(source_dict_dim))
+src_embedding = pd.embedding(size=source_dict_size, size=word_vector_dim)
+
+src_word_vec = pd.lookup(src_embedding, src_word_id)
+
+encoder_out_seq = pd.gru(input=src_word_vec, size=encoder_dim)
+
+encoder_ctx = pd.last_seq(encoder_out_seq)
+# encoder_ctx_proj is the learned semantic vector
+encoder_ctx_proj = pd.fc(
+    encoder_ctx, size=decoder_dim, act=pd.activation.Tanh(), bias=None)
+```
+
+**Decoder**
+
+```python
+def generate():
+    decoder = pd.while_loop()
+    with decoder.step():
+        decoder_mem = decoder.memory(init=encoder_ctx)  # mark the memory
+        generated_ids = decoder.memory() # TODO init to batch_size <s>s
+        generated_scores = decoder.memory() # TODO init to batch_size 1s or 0s
+
+        target_word = pd.lookup(trg_embedding, gendrated_ids)
+        # expand encoder_ctx's batch to fit target_word's lod
+        # for example
+        # decoder_mem.lod is
+        # [[0 1 3],
+        #  [0 1 3 6]]
+        # its tensor content is [a1 a2 a3 a4 a5]
+        # which means there are 2 sentences to translate
+        #   - the first sentence has 1 translation prefixes, the offsets are [0, 1)
+        #   - the second sentence has 2 translation prefixes, the offsets are [1, 3) and [3, 6)
+        # the target_word.lod is 
+        # [[0, 1, 6]
+        #  [0, 2, 4, 7, 9 12]]
+        # which means 2 sentences to translate, each has 1 and 5 prefixes
+        # the first prefix has 2 candidates
+        # the following has 2, 3, 2, 3 candidates
+        # the encoder_ctx_expanded's content will be
+        # [a1 a1 a2 a2 a3 a3 a3 a4 a4 a5 a5 a5]
+        encoder_ctx_expanded = pd.lod_expand(encoder_ctx, target_word)
+        decoder_input = pd.fc(
+            act=pd.activation.Linear(),
+            input=[target_word, encoder_ctx],
+            size=3 * decoder_dim)
+        gru_out, cur_mem = pd.gru_step(
+            decoder_input, mem=decoder_mem, size=decoder_dim)
+        scores = pd.fc(
+            gru_out,
+            size=trg_dic_size,
+            bias=None,
+            act=pd.activation.Softmax())
+        # K is an config
+        topk_scores, topk_ids = pd.top_k(scores, K)
+        topk_generated_scores = pd.add_scalar(topk_scores, generated_scores)
+
+        selected_ids, selected_generation_scores = decoder.beam_search(
+            topk_ids, topk_generated_scores)
+
+        # update the states
+        decoder_mem.update(cur_mem)  # tells how to update state
+        generated_ids.update(selected_ids)
+        generated_scores.update(selected_generation_scores)
+
+        decoder.output(selected_ids)
+        decoder.output(selected_generation_scores)
+
+translation_ids, translation_scores = decoder()
+```
+The `decoder.beam_search` is a operator that given the candidates and the scores of translations including the candidates,
+return the result of the beam search algorithm.
+
+In this way, users can customize anything on the inputs or outputs of beam search, for example, two ways to prune some translation prefixes
+
+1. meke the correspondind elements in `topk_generated_scores` zero or some small values, beam_search will discard this candidate.
+2. remove some specific candidate in `selected_ids`
+3. get the final `translation_ids`, remove the translation sequence in it.
+
+The implementation of sequence decoder can reuse the C++ class [RNNAlgorithm](https://github.com/Superjom/Paddle/blob/68cac3c0f8451fe62a4cdf156747d6dc0ee000b3/paddle/operators/dynamic_recurrent_op.h#L30),
+so the python syntax is quite similar to a [RNN](https://github.com/Superjom/Paddle/blob/68cac3c0f8451fe62a4cdf156747d6dc0ee000b3/doc/design/block.md#blocks-with-for-and-rnnop).
+
+Both of them are two-level `LoDTensors`
+
+- the first level represents `batch_size` of (source) sentences;
+- the second level represents the candidate ID sets for translation prefix.
+
+for example, 3 source sentences to translate, and has 2, 3, 1 candidates.
+
+Unlike an RNN, in sequence decoder, the previous state and the current state have different LoD and shape,
+a `lod_expand` operator is used to expand the LoD of the previous state to fit the current state.
+
+For example, the previous state
+
+* LoD is `[0, 1, 3][0, 2, 5, 6]`
+* content of tensor is `a1 a2 b1 b2 b3 c1`
+
+the current state stored in `encoder_ctx_expanded`
+
+* LoD is `[0, 2, 7][0 3 5 8 9 11 11]`
+* the content is 
+  - a1 a1 a1 (a1 has 3 candidates, so the state should be copied 3 times for each candidates)
+  - a2 a2
+  - b1 b1 b1
+  - b2
+  - b3 b3
+  - None (c1 has 0 candidates, so c1 is dropped)
+
+Benefit from the relative offset LoD, empty candidate set can be represented naturally.
+
+the status in each time step can be stored in `TensorArray`, and `Pack`ed to a final LoDTensor, the corresponding syntax is 
+
+```python
+decoder.output(selected_ids)
+decoder.output(selected_generation_scores)
+```
+
+the `selected_ids` is the candidate ids for the prefixes, 
+it will be `Packed` by `TensorArray` to a two-level `LoDTensor`,
+the first level represents the source sequences,
+the second level represents generated sequences.
+
+Pack the `selected_scores` will get a `LoDTensor` that stores scores of each candidate of translations.
+
+Pack the `selected_generation_scores` will get a `LoDTensor`, and each tail is the probability of the translation.
+
+## LoD and shape changes during decoding
+<p align="center">
+  <img src="./images/LOD-and-shape-changes-during-decoding.jpg"/>
+</p>
+
+According the image above, the only phrase to change LoD is beam search.
+
+## Beam search design
+The beam search algorthm will be implemented as one method of the sequence decoder, it has 3 inputs
+
+1. `topk_ids`, top K candidate ids for each prefix.
+2. `topk_scores`, the corresponding scores for `topk_ids`
+3. `generated_scores`, the score of the prefixes.
+
+All of the are LoDTensors, so that the sequence affilication is clear.
+Beam search will keep a beam for each prefix and select a smaller candidate set for each prefix.
+
+It will return three variables
+
+1. `selected_ids`, the final candidate beam search function selected for the next step.
+2. `selected_scores`, the scores for the candidates.
+3. `generated_scores`, the updated scores for each prefixes (with the new candidates appended).
+
+## Introducing the LoD-based `Pack` and `Unpack` methods in `TensorArray`
+The `selected_ids`, `selected_scores` and `generated_scores` are LoDTensors,
+and they exist in each time step,
+so it is natural to store them in arrays.
+
+Currently, PaddlePaddle has a module called `TensorArray` which can store an array of tensors,
+the results of beam search are better to store in a `TensorArray`.
+
+The `Pack` and `UnPack` in `TensorArray` are used to package tensors in the array to a `LoDTensor` or split the `LoDTensor` to an array of tensors. 
+It needs some extensions to support pack or unpack an array of `LoDTensors`.
--- a/paddle/framework/backward_test.cc
+++ b/paddle/framework/backward_test.cc
@ -21,7 +21,7 @@
 #include "paddle/framework/var_desc.h"
 #include "paddle/operators/net_op.h"

-USE_OP(fill_constant);
+USE_NO_KERNEL_OP(fill_constant);

 namespace paddle {
 namespace framework {
--- a/paddle/framework/data_type.h
+++ b/paddle/framework/data_type.h
@ -34,6 +34,21 @@ inline DataType ToDataType(std::type_index type) {
  }
 }

+inline std::type_index ToTypeIndex(DataType type) {
+  switch (type) {
+    case DataType::FP32:
+      return typeid(float);
+    case DataType::FP64:
+      return typeid(double);
+    case DataType::INT32:
+      return typeid(int);
+    case DataType::INT64:
+      return typeid(int64_t);
+    default:
+      PADDLE_THROW("Not support type %d", type);
+  }
+}
+
 template <typename Visitor>
 inline void VisitDataType(DataType type, Visitor visitor) {
  switch (type) {
--- a/paddle/framework/ddim.cc
+++ b/paddle/framework/ddim.cc
@ -79,6 +79,13 @@ DDim make_ddim(const std::vector<int64_t>& dims) {
  return result;
 }

+DDim make_ddim(const std::vector<int>& dims) {
+  std::vector<int64_t> res(dims.size());
+  std::transform(dims.begin(), dims.end(), res.begin(),
+                 [](int d) { return static_cast<int64_t>(d); });
+  return make_ddim(res);
+}
+
 /// @cond HIDDEN
 // XXX For some reason, putting this in an anonymous namespace causes errors
 class DynamicMutableIndexer : public boost::static_visitor<int64_t&> {
--- a/paddle/framework/ddim.h
+++ b/paddle/framework/ddim.h
@ -81,6 +81,8 @@ struct DDim {
 */
 DDim make_ddim(const std::vector<int64_t>& dims);

+DDim make_ddim(const std::vector<int>& dims);
+
 /**
 * \brief Make a DDim from an initializer list
 *
--- a/paddle/function/CMakeLists.txt
+++ b/paddle/function/CMakeLists.txt
@ -45,6 +45,7 @@ if(WITH_GPU)
    add_simple_unittest(BlockExpandOpTest)
    add_simple_unittest(CropOpTest)
    add_simple_unittest(SwitchOpTest)
+    add_simple_unittest(ScaleSubRegionOpTest)
 endif()

 add_simple_unittest(Im2ColTest)
--- a/paddle/function/FunctionTest.h
+++ b/paddle/function/FunctionTest.h
@ -110,6 +110,7 @@ public:
        function2_(FunctionBase::funcRegistrar_.createByType(name2)) {
    function1_->init(config);
    function2_->init(config);
+    initArgsCallback_ = nullptr;
  }

  ~Compare2Function() {}
@ -170,6 +171,10 @@ public:
                                      *seq2_));
  }

+  void registerInitCallback(std::function<void(BufferArg&, size_t)> callback) {
+    initArgsCallback_ = callback;
+  }
+
  // output need only contains shape, do not contains data.
  void addOutputs(const BufferArg& output, ArgType argType = ASSIGN_TO) {
    size_t size =
@ -340,6 +345,10 @@ protected:
        initArg(*func1Inputs_[i]);
      }

+      if (initArgsCallback_ != nullptr) {
+        initArgsCallback_(*func1Inputs_[i], i);
+      }
+
      copyArg_(*func1Inputs_[i], *func2Inputs_[i]);
    }
  }
@ -386,6 +395,7 @@ protected:
  std::shared_ptr<SequenceIdArg> seq1_;
  std::shared_ptr<SequenceIdArg> seq2_;
  test::CopyArgument<DType1, DType2> copyArg_;
+  std::function<void(BufferArg&, size_t)> initArgsCallback_;
 };

 class CpuGpuFuncCompare
--- a/paddle/function/ScaleSubRegionOp.cpp
+++ b/paddle/function/ScaleSubRegionOp.cpp
@ -0,0 +1,155 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "ScaleSubRegionOp.h"
+#include "paddle/function/TensorShape.h"
+
+namespace paddle {
+
+template <>
+void ScaleSubRegion<DEVICE_TYPE_CPU>(real* outputs,
+                                     const real* inputs,
+                                     const real* indices,
+                                     const TensorShape shape,
+                                     const FuncConfig& conf) {
+  real value = conf.get<real>("value");
+
+  int number = shape[0];
+  int channel = shape[1];
+  int height = shape[2];
+  int width = shape[3];
+
+  memcpy(outputs, inputs, number * channel * height * width * sizeof(real));
+
+  for (int n = 0; n < number; ++n) {
+    // indices start from 1
+    int offset = n * 6;
+    for (int c = indices[offset] - 1; c < indices[offset + 1]; ++c) {
+      for (int h = indices[offset + 2] - 1; h < indices[offset + 3]; ++h) {
+        for (int w = indices[offset + 4] - 1; w < indices[offset + 5]; ++w) {
+          int idx = ((n * channel + c) * height + h) * width + w;
+          outputs[idx] *= value;
+        }
+      }
+    }
+  }
+}
+
+template <>
+void ScaleSubRegionGrad<DEVICE_TYPE_CPU>(const real* inGrad,
+                                         real* outGrad,
+                                         const real* indices,
+                                         const TensorShape shape,
+                                         const FuncConfig& conf) {
+  real value = conf.get<real>("value");
+
+  int number = shape[0];
+  int channel = shape[1];
+  int height = shape[2];
+  int width = shape[3];
+
+  for (int n = 0; n < number; ++n) {
+    for (int c = 0; c < channel; ++c) {
+      for (int h = 0; h < height; ++h) {
+        for (int w = 0; w < width; ++w) {
+          int idx = ((n * channel + c) * height + h) * width + w;
+          int offset = n * 6;
+          if (c >= (indices[offset] - 1) && c <= (indices[offset + 1] - 1) &&
+              h >= (indices[offset + 2] - 1) &&
+              h <= (indices[offset + 3] - 1) &&
+              w >= (indices[offset + 4] - 1) &&
+              w <= (indices[offset + 5] - 1)) {
+            outGrad[idx] += inGrad[idx] * value;
+          } else {
+            outGrad[idx] += inGrad[idx];
+          }
+        }
+      }
+    }
+  }
+}
+
+/**
+ * \brief For each instance, ScaleSubRegion can be used to multiply a value to
+ *        a specified sub continuous region. By providing start index and end
+ *        index for C/H/W, you can specify the location and shape of the region.
+ *
+ * Argument in this Function:
+ * \param inputs    A 4-D tensor with shape [N, C, H, W], only one input.
+ * \param indices   A 2-D tensor with shape [N, 6], indicates the sub region.
+ * \param outputs   A 4-D tensor with same shape as inputs, output value.
+ */
+template <DeviceType Device>
+class ScaleSubRegionFunc : public FunctionBase {
+public:
+  void init(const FuncConfig& config) override { conf_ = config; }
+
+  void calc(const BufferArgs& inputs, const BufferArgs& outputs) override {
+    CHECK_EQ(2UL, inputs.size());
+    CHECK_EQ(1UL, outputs.size());
+    CHECK_EQ(outputs[0].getArgType(), ASSIGN_TO);
+
+    TensorShape shape = inputs[0].shape();
+
+    ScaleSubRegion<Device>(outputs[0].data<real>(),
+                           inputs[0].data<real>(),
+                           inputs[1].data<real>(),
+                           shape,
+                           conf_);
+  }
+
+private:
+  FuncConfig conf_;
+};
+
+/**
+ * \brief The backward propagation of ScaleSubRegion Function.
+ *
+ * Argument in this Function:
+ * \param inputs  A 4-D tensor with shape [N, C, H, W], output gradient.
+ * \param indices A 2-D tensor with shape [N, 6], indicates the sub region.
+ * \param outputs A 4-D tensor with shape [N, C, H, W], gradient of input value.
+ */
+
+template <DeviceType Device>
+class ScaleSubRegionGradFunc : public FunctionBase {
+public:
+  void init(const FuncConfig& config) override { conf_ = config; }
+
+  void calc(const BufferArgs& inputs, const BufferArgs& outputs) override {
+    CHECK_EQ(2UL, inputs.size());
+    CHECK_EQ(1UL, outputs.size());
+    CHECK_EQ(outputs[0].getArgType(), ADD_TO);
+
+    TensorShape shape = inputs[0].shape();
+
+    ScaleSubRegionGrad<Device>(inputs[0].data<real>(),
+                               outputs[0].data<real>(),
+                               inputs[1].data<real>(),
+                               shape,
+                               conf_);
+  }
+
+private:
+  FuncConfig conf_;
+};
+
+REGISTER_TYPED_FUNC(ScaleSubRegion, CPU, ScaleSubRegionFunc);
+REGISTER_TYPED_FUNC(ScaleSubRegionGrad, CPU, ScaleSubRegionGradFunc);
+#ifdef PADDLE_WITH_CUDA
+REGISTER_TYPED_FUNC(ScaleSubRegion, GPU, ScaleSubRegionFunc);
+REGISTER_TYPED_FUNC(ScaleSubRegionGrad, GPU, ScaleSubRegionGradFunc);
+#endif
+
+}  // namespace paddle
--- a/paddle/function/ScaleSubRegionOp.h
+++ b/paddle/function/ScaleSubRegionOp.h
@ -0,0 +1,55 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include "Function.h"
+
+namespace paddle {
+
+/**
+ * \brief Function to multiply a value to values in specified sub continuous
+ *        region. Indices must be provided to indcate the location and shape of
+ *        the region and the multiplied value is passed by configure variable.
+ *
+ *
+ * \param[out] outputs  Output value.
+ * \param[in]  inputs   Input data which contains NCHW information.
+ * \param[in]  indices  Indices data to indcate the sub region.
+ * \param[in]  shape    Tensor shape of input value.
+ * \param[in]  conf     Configure variable which contains the multiplied value.
+ */
+template <DeviceType Device>
+void ScaleSubRegion(real* outputs,
+                    const real* inputs,
+                    const real* indices,
+                    const TensorShape shape,
+                    const FuncConfig& conf);
+
+/**
+ * \brief Backward propagation function of ScaleSubRegion.
+ *
+ * \param[out] inGrad   Gradients of previous layer.
+ * \param[in]  outGrad  Output gradient.
+ * \param[in]  indices  Indices data.
+ * \param[in]  shape    The Shape of input tensor.
+ * \param[in]  conf     Configure variable.
+ */
+template <DeviceType Device>
+void ScaleSubRegionGrad(const real* inGrad,
+                        real* outGrad,
+                        const real* indices,
+                        const TensorShape shape,
+                        const FuncConfig& conf);
+}  // namespace paddle
--- a/paddle/function/ScaleSubRegionOpGpu.cu
+++ b/paddle/function/ScaleSubRegionOpGpu.cu
@ -0,0 +1,116 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "ScaleSubRegionOp.h"
+#include "hl_base.h"
+
+namespace paddle {
+
+__global__ void KeScaleSubRegion(real* outputs,
+                                 const real* inputs,
+                                 const real* indices,
+                                 real value,
+                                 int channel,
+                                 int height,
+                                 int width,
+                                 int nthreads) {
+  const int idx = threadIdx.x + blockIdx.x * blockDim.x;
+  if (idx < nthreads) {
+    const int w = idx % width;
+    const int h = (idx / width) % height;
+    const int c = (idx / width / height) % channel;
+    const int n = idx / width / height / channel;
+
+    const int offset = n * 6;
+    if (c >= (indices[offset] - 1) && c <= (indices[offset + 1] - 1) &&
+        h >= (indices[offset + 2] - 1) && h <= (indices[offset + 3] - 1) &&
+        w >= (indices[offset + 4] - 1) && w <= (indices[offset + 5] - 1)) {
+      outputs[idx] = inputs[idx] * value;
+    } else {
+      outputs[idx] = inputs[idx];
+    }
+  }
+}
+
+template <>
+void ScaleSubRegion<DEVICE_TYPE_GPU>(real* outputs,
+                                     const real* inputs,
+                                     const real* indices,
+                                     const TensorShape shape,
+                                     const FuncConfig& conf) {
+  real value = conf.get<real>("value");
+
+  int number = shape[0];
+  int channel = shape[1];
+  int height = shape[2];
+  int width = shape[3];
+
+  size_t nth = number * channel * height * width;
+  int blockSize = 1024;
+  int gridSize = (nth + blockSize - 1) / blockSize;
+
+  KeScaleSubRegion<<<gridSize, blockSize, 0, STREAM_DEFAULT>>>(
+      outputs, inputs, indices, value, channel, height, width, nth);
+  CHECK_SYNC("ScaleSubRegion");
+}
+
+__global__ void KeScaleSubRegionDiff(const real* inGrad,
+                                     real* outGrad,
+                                     const real* indices,
+                                     real value,
+                                     int channel,
+                                     int height,
+                                     int width,
+                                     int nthreads) {
+  const int idx = threadIdx.x + blockIdx.x * blockDim.x;
+  if (idx < nthreads) {
+    const int w = idx % width;
+    const int h = (idx / width) % height;
+    const int c = (idx / width / height) % channel;
+    const int n = idx / width / height / channel;
+
+    const int offset = n * 6;
+    if (c >= (indices[offset] - 1) && c <= (indices[offset + 1] - 1) &&
+        h >= (indices[offset + 2] - 1) && h <= (indices[offset + 3] - 1) &&
+        w >= (indices[offset + 4] - 1) && w <= (indices[offset + 5] - 1)) {
+      outGrad[idx] += inGrad[idx] * value;
+    } else {
+      outGrad[idx] += inGrad[idx];
+    }
+  }
+}
+
+template <>
+void ScaleSubRegionGrad<DEVICE_TYPE_GPU>(const real* inGrad,
+                                         real* outGrad,
+                                         const real* indices,
+                                         const TensorShape shape,
+                                         const FuncConfig& conf) {
+  real value = conf.get<real>("value");
+
+  int number = shape[0];
+  int channel = shape[1];
+  int height = shape[2];
+  int width = shape[3];
+
+  size_t nth = number * channel * height * width;
+  int blockSize = 1024;
+  int gridSize = (nth + blockSize - 1) / blockSize;
+
+  KeScaleSubRegionDiff<<<gridSize, blockSize, 0, STREAM_DEFAULT>>>(
+      inGrad, outGrad, indices, value, channel, height, width, nth);
+  CHECK_SYNC("ScaleSubRegionGrad");
+}
+
+}  // namespace paddle
--- a/paddle/function/ScaleSubRegionOpTest.cpp
+++ b/paddle/function/ScaleSubRegionOpTest.cpp
@ -0,0 +1,72 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include <gtest/gtest.h>
+#include "FunctionTest.h"
+
+namespace paddle {
+
+TEST(ScaleSubRegion, real) {
+  for (size_t numSamples : {5, 32}) {
+    for (size_t channels : {5, 32}) {
+      for (size_t imgSizeH : {5, 33}) {
+        for (size_t imgSizeW : {5, 32}) {
+          for (real value : {-0.5, 0.0, 0.5}) {
+            for (bool firstHalf : {false, true}) {
+              VLOG(3) << " numSamples=" << numSamples
+                      << " channels=" << channels << " imgSizeH=" << imgSizeH
+                      << " imgSizeW=" << imgSizeW;
+
+              for (bool testGrad : {false, true}) {
+                CpuGpuFuncCompare compare(
+                    testGrad ? "ScaleSubRegionGrad" : "ScaleSubRegion",
+                    FuncConfig().set<real>("value", value));
+
+                TensorShape shape{numSamples, channels, imgSizeH, imgSizeW};
+                TensorShape indicesShape{numSamples, 6};
+
+                compare.addInputs(BufferArg(VALUE_TYPE_FLOAT, shape));
+                compare.addInputs(BufferArg(VALUE_TYPE_FLOAT, indicesShape));
+
+                compare.registerInitCallback([=](BufferArg& arg, size_t index) {
+                  if (index == 1) {
+                    real* data = (real*)arg.data();
+
+                    for (size_t i = 0; i < numSamples; ++i) {
+                      size_t offset = i * 6;
+                      data[offset] = firstHalf ? 1 : channels / 2;
+                      data[offset + 1] = firstHalf ? channels / 2 : channels;
+                      data[offset + 2] = firstHalf ? 1 : imgSizeH / 2;
+                      data[offset + 3] = firstHalf ? imgSizeH / 2 : imgSizeH;
+                      data[offset + 4] = firstHalf ? 1 : imgSizeW / 2;
+                      data[offset + 5] = firstHalf ? imgSizeW / 2 : imgSizeW;
+                    }
+                  }
+                });
+
+                compare.addOutputs(
+                    BufferArg(
+                        VALUE_TYPE_FLOAT, shape, testGrad ? ADD_TO : ASSIGN_TO),
+                    testGrad ? ADD_TO : ASSIGN_TO);
+                compare.run();
+              }
+            }
+          }
+        }
+      }
+    }
+  }
+}
+
+}  // namespace paddle
--- a/paddle/gserver/layers/MKLDNNAddtoLayer.cpp
+++ b/paddle/gserver/layers/MKLDNNAddtoLayer.cpp
@ -62,16 +62,14 @@ void MKLDNNAddtoLayer::resetFwd(std::vector<primitive>& pipeline,
                                MKLDNNMatrixPtr& wgt,
                                MKLDNNMatrixPtr& bias,
                                MKLDNNMatrixPtr& out) {
-  if (biases_) {
-    LOG(FATAL) << "not implemented yet";
-  }
-  resetFwdBuffers(inVals_, out);
+  resetFwdBuffers(inVals_, bias, out);
  in = inVals_[0];

  std::shared_ptr<sum::primitive_desc> fwdPD;
-  resetFwdPD(fwdPD, inVals_, out);
+  std::shared_ptr<sum::primitive_desc> biasPD;
+  resetFwdPD(fwdPD, biasPD, inVals_, bias, out);

-  resetFwdPipeline(pipeline, fwdPD, inVals_, out);
+  resetFwdPipeline(pipeline, fwdPD, biasPD, inVals_, bias, out);
 }

 void MKLDNNAddtoLayer::resetBwd(std::vector<primitive>& pipeline,
@ -79,7 +77,7 @@ void MKLDNNAddtoLayer::resetBwd(std::vector<primitive>& pipeline,
                                MKLDNNMatrixPtr& wgt,
                                MKLDNNMatrixPtr& bias,
                                MKLDNNMatrixPtr& out) {
-  resetBwdBuffers(inGrads_, out);
+  resetBwdBuffers(inGrads_, bias, out);
  in = inGrads_[0];

  // backward only need share output grad to input grad
@ -89,6 +87,20 @@ void MKLDNNAddtoLayer::resetBwd(std::vector<primitive>& pipeline,
      inputLayers_[i]->getOutputGrad()->setData(inGrads_[i]->getData());
    }
  }
+
+  // backward bias
+  bwdBias_ = nullptr;
+  if (bias) {
+    std::vector<float> scales(bs_, 1.0);
+    std::vector<memory::primitive_desc> srcPDs(bs_, bias->getPrimitiveDesc());
+    auto biasPD = sum::primitive_desc(bias->getMemoryDesc(), scales, srcPDs);
+    std::vector<primitive::at> srcs;
+    for (size_t i = 0; i < grads_.size(); ++i) {
+      srcs.push_back(*(grads_[i]));
+    }
+    bwdBias_.reset(new sum(biasPD, srcs, *bias));
+    pipeline.push_back(*bwdBias_);
+  }
 }

 void MKLDNNAddtoLayer::updateWeights(const UpdateCallback& callback) {
@ -97,7 +109,25 @@ void MKLDNNAddtoLayer::updateWeights(const UpdateCallback& callback) {
  }
 }

+void MKLDNNAddtoLayer::prepareBias(MKLDNNMatrixPtr& bias,
+                                   const MatrixPtr& biasMat,
+                                   const MKLDNNMatrixPtr& out,
+                                   std::vector<MKLDNNMatrixPtr>& outs) {
+  auto pd = MKLDNNMatrix::createPrimitiveDesc(
+      {(int)layerSize_}, memory::format::x, engine_);
+  bias = MKLDNNMatrix::create(pd, biasMat);
+  outs.clear();
+  real* data = out->getData();
+  CHECK_EQ(bs_ * layerSize_, out->getElementCnt());
+  for (int i = 0; i < bs_; ++i) {
+    MatrixPtr tmp =
+        Matrix::create(data + i * layerSize_, 1, layerSize_, false, false);
+    outs.push_back(MKLDNNMatrix::create(bias->getPrimitiveDesc(), tmp));
+  }
+}
+
 void MKLDNNAddtoLayer::resetFwdBuffers(std::vector<MKLDNNMatrixPtr>& inputs,
+                                       MKLDNNMatrixPtr& bias,
                                       MKLDNNMatrixPtr& out) {
  inputs.resize(inputLayers_.size());
  for (size_t i = 0; i < inputs.size(); i++) {
@ -110,12 +140,20 @@ void MKLDNNAddtoLayer::resetFwdBuffers(std::vector<MKLDNNMatrixPtr>& inputs,
  }

  resetOutValue(out, inputs[0]->getPrimitiveDesc());
+
+  if (biases_ && biases_->getW()) {
+    prepareBias(bias, biases_->getW(), out, vals_);
+  } else {
+    bias = nullptr;
+  }
 }

 void MKLDNNAddtoLayer::resetFwdPD(std::shared_ptr<sum::primitive_desc>& pd,
+                                  std::shared_ptr<sum::primitive_desc>& biasPD,
                                  std::vector<MKLDNNMatrixPtr>& inputs,
+                                  MKLDNNMatrixPtr bias,
                                  MKLDNNMatrixPtr out) {
-  std::vector<double> scales(inputs.size(), 1.0);
+  std::vector<float> scales(inputs.size(), 1.0);
  std::vector<memory::primitive_desc> srcPDs;
  for (size_t i = 0; i < inputs.size(); i++) {
    srcPDs.push_back(inputs[i]->getPrimitiveDesc());
@ -123,12 +161,23 @@ void MKLDNNAddtoLayer::resetFwdPD(std::shared_ptr<sum::primitive_desc>& pd,
  CHECK(out);
  pd.reset(new sum::primitive_desc(out->getMemoryDesc(), scales, srcPDs));
  CHECK_PRIMITIVE_DESC_EQ(out, pd->dst_primitive_desc());
+
+  biasPD = nullptr;
+  if (bias) {
+    std::vector<float> scales(2, 1.0);
+    std::vector<memory::primitive_desc> srcPDs(2, bias->getPrimitiveDesc());
+    biasPD.reset(
+        new sum::primitive_desc(bias->getMemoryDesc(), scales, srcPDs));
+    CHECK_PRIMITIVE_DESC_EQ(bias, biasPD->dst_primitive_desc());
+  }
 }

 void MKLDNNAddtoLayer::resetFwdPipeline(
    std::vector<primitive>& pipeline,
    std::shared_ptr<sum::primitive_desc>& pd,
+    std::shared_ptr<sum::primitive_desc>& biasPD,
    std::vector<MKLDNNMatrixPtr>& inputs,
+    MKLDNNMatrixPtr& bias,
    MKLDNNMatrixPtr& out) {
  std::vector<primitive::at> srcs;
  for (size_t i = 0; i < inputs.size(); i++) {
@ -136,9 +185,23 @@ void MKLDNNAddtoLayer::resetFwdPipeline(
  }
  fwd_.reset(new sum(*pd, srcs, *out));
  pipeline.push_back(*fwd_);
+
+  fwdBias_.clear();
+  if (biasPD == nullptr || bias == nullptr) {
+    return;
+  }
+  fwdBias_.resize(vals_.size());
+  for (size_t i = 0; i < vals_.size(); ++i) {
+    std::vector<primitive::at> srcs;
+    srcs.push_back(*(vals_[i]));
+    srcs.push_back(*bias);
+    fwdBias_[i].reset(new sum(*biasPD, srcs, *vals_[i]));
+    pipeline.push_back(*fwdBias_[i]);
+  }
 }

 void MKLDNNAddtoLayer::resetBwdBuffers(std::vector<MKLDNNMatrixPtr>& inputs,
+                                       MKLDNNMatrixPtr& bias,
                                       MKLDNNMatrixPtr& out) {
  CHECK(outVal_);
  resetOutGrad(out, outVal_->getPrimitiveDesc());
@ -149,6 +212,12 @@ void MKLDNNAddtoLayer::resetBwdBuffers(std::vector<MKLDNNMatrixPtr>& inputs,
    resetInGrad(inputs[i], inVal_->getPrimitiveDesc(), i);
    CHECK_PRIMITIVE_DESC_EQ(inputs[i], out->getPrimitiveDesc());
  }
+
+  if (biases_ && biases_->getWGrad()) {
+    prepareBias(bias, biases_->getWGrad(), out, grads_);
+  } else {
+    bias = nullptr;
+  }
 }

 }  // namespace paddle
--- a/paddle/gserver/layers/MKLDNNAddtoLayer.h
+++ b/paddle/gserver/layers/MKLDNNAddtoLayer.h
@ -32,9 +32,15 @@ protected:
  // layer size == ic * ih * iw == oc * oh *ow, and can not be changed
  size_t layerSize_;

-  // TODO(TJ): this part has not been optimized by MKL-DNN
  std::unique_ptr<Weight> biases_;

+  // buffers for adding bias
+  std::vector<MKLDNNMatrixPtr> vals_;
+  std::vector<MKLDNNMatrixPtr> grads_;
+  // primitives for adding bias
+  std::vector<std::shared_ptr<mkldnn::primitive>> fwdBias_;
+  std::shared_ptr<mkldnn::primitive> bwdBias_;
+
 public:
  explicit MKLDNNAddtoLayer(const LayerConfig& config) : MKLDNNLayer(config) {}

@ -91,20 +97,34 @@ protected:
   *                    reset pipeline.
   */
  void resetFwdBuffers(std::vector<MKLDNNMatrixPtr>& inputs,
+                       MKLDNNMatrixPtr& bias,
                       MKLDNNMatrixPtr& out);
  void resetFwdPD(std::shared_ptr<mkldnn::sum::primitive_desc>& pd,
+                  std::shared_ptr<mkldnn::sum::primitive_desc>& biasPD,
                  std::vector<MKLDNNMatrixPtr>& inputs,
+                  MKLDNNMatrixPtr bias,
                  MKLDNNMatrixPtr out);
  void resetFwdPipeline(std::vector<mkldnn::primitive>& pipeline,
                        std::shared_ptr<mkldnn::sum::primitive_desc>& pd,
+                        std::shared_ptr<mkldnn::sum::primitive_desc>& biasPD,
                        std::vector<MKLDNNMatrixPtr>& inputs,
+                        MKLDNNMatrixPtr& bias,
                        MKLDNNMatrixPtr& out);

  /**
   * Backward functions: reset buffers(inputs, output, bias)
   */
  void resetBwdBuffers(std::vector<MKLDNNMatrixPtr>& inputs,
+                       MKLDNNMatrixPtr& bias,
                       MKLDNNMatrixPtr& out);
+
+  /**
+   * prepare for bias
+   */
+  void prepareBias(MKLDNNMatrixPtr& bias,
+                   const MatrixPtr& biasMat,
+                   const MKLDNNMatrixPtr& out,
+                   std::vector<MKLDNNMatrixPtr>& outs);
 };

 }  // namespace paddle
--- a/paddle/gserver/layers/MKLDNNBatchNormLayer.cpp
+++ b/paddle/gserver/layers/MKLDNNBatchNormLayer.cpp
@ -119,7 +119,7 @@ void MKLDNNBatchNormLayer::reshape(
    int& bs, int& ic, int& ih, int& iw, int oc, int& oh, int& ow) {
  reshapeInput(bs, ih, iw);
  oh = ih;
-  ow = ow;
+  ow = iw;
  // ic_ and oc can not be changed
  CHECK_EQ(inputElemenCnt_ / bs / ih / iw, (size_t)ic)
      << "Input channel can not be changed";
--- a/paddle/gserver/layers/MKLDNNLayer.cpp
+++ b/paddle/gserver/layers/MKLDNNLayer.cpp
@ -287,7 +287,7 @@ void MKLDNNLayer::resetMergeGrad(MKLDNNMatrixPtr& out) {
    return;
  }
  CHECK(out) << "should have reset internal ouput grad";
-  std::vector<double> scales(outputMap_.size(), 1.0);
+  std::vector<float> scales(outputMap_.size(), 1.0);
  std::vector<memory::primitive_desc> srcPDs;
  std::vector<primitive::at> srcs;
  for (auto it = outputMap_.begin(); it != outputMap_.end(); ++it) {
--- a/Show More
+++ b/Show More