Merge branch 'develop' into feature/increase_cpu

7 years ago · ef84ff8657
parent c9fc7ba9f8 03fa1edc20
commit ef84ff8657
83 changed files with 2696 additions and 763 deletions
--- a/benchmark/paddle/image/vgg.py
+++ b/benchmark/paddle/image/vgg.py
@ -13,7 +13,7 @@ define_py_data_sources2(
 settings(
    batch_size=batch_size,
-    learning_rate=0.01 / batch_size,
+    learning_rate=0.001 / batch_size,
    learning_method=MomentumOptimizer(0.9),
    regularization=L2Regularization(0.0005 * batch_size))
--- a/cmake/cblas.cmake
+++ b/cmake/cblas.cmake
@ -1,17 +1,12 @@
 # Find the CBlas and lapack libraries
 #
-# It will search MKL, atlas, OpenBlas, reference-cblas in order.
+# It will search MKLML, atlas, OpenBlas, reference-cblas in order.
 #
 # If any cblas implementation found, the following variable will be set.
-#    CBLAS_PROVIDER  # one of MKL, ATLAS, OPENBLAS, REFERENCE
+#    CBLAS_PROVIDER  # one of MKLML, ATLAS, OPENBLAS, REFERENCE
 #    CBLAS_INC_DIR   # the include directory for cblas.
 #    CBLAS_LIBRARIES # a list of libraries that should be linked by paddle.
 #                    # Each library should be full path to object file.
 #
 # User should set one of MKL_ROOT, ATLAS_ROOT, OPENBLAS_ROOT, REFERENCE_CBLAS_ROOT
 # during cmake. If none of them set, it will try to find cblas implementation in
 # system paths.
 #
 set(CBLAS_FOUND OFF)
@ -30,44 +25,6 @@ if(WITH_MKLML AND MKLML_INC_DIR AND MKLML_LIB)
  return()
 endif()
 ## Then find MKL.
 # Candidate MKL roots: a cached default install location and the MKL_ROOT
 # environment variable as read at configure time. Both are cache entries, so a
 # value already present in the cache (e.g. from -D on the command line) is kept.
 set(INTEL_MKL_ROOT "/opt/intel/mkl" CACHE PATH "Folder contains intel mkl libs")
 set(MKL_ROOT $ENV{MKL_ROOT} CACHE PATH "Folder contains env MKL")
 # Directories searched for headers; the MKL_ROOT entries are listed first.
 set(MKL_INCLUDE_SEARCH_PATHS
  ${MKL_ROOT}/include
  ${INTEL_MKL_ROOT}/include)
 # Directories searched for libraries, again with the MKL_ROOT entries first.
 set(MKL_LIB_SEARCH_PATHS
  ${MKL_ROOT}/lib
  ${MKL_ROOT}/lib/intel64
  ${INTEL_MKL_ROOT}/lib
  ${INTEL_MKL_ROOT}/lib/intel64)
 # Locate the cblas header (mkl.h) and the lapacke header (mkl_lapacke.h).
 find_path(MKL_INC_DIR mkl.h PATHS
  ${MKL_INCLUDE_SEARCH_PATHS})
 find_path(MKL_LAPACK_INC_DIR mkl_lapacke.h PATHS
  ${MKL_INCLUDE_SEARCH_PATHS})
 # Locate the three MKL libraries that are linked below.
 find_library(MKL_CORE_LIB NAMES mkl_core PATHS
  ${MKL_LIB_SEARCH_PATHS})
 find_library(MKL_SEQUENTIAL_LIB NAMES mkl_sequential PATHS
  ${MKL_LIB_SEARCH_PATHS})
 find_library(MKL_INTEL_LP64 NAMES mkl_intel_lp64 PATHS
  ${MKL_LIB_SEARCH_PATHS})
 # MKL is selected only when both headers and all three libraries were found.
 if(MKL_LAPACK_INC_DIR AND MKL_INC_DIR AND MKL_CORE_LIB AND MKL_SEQUENTIAL_LIB AND MKL_INTEL_LP64)
  set(CBLAS_FOUND ON)
  set(CBLAS_PROVIDER MKL)
  set(CBLAS_INC_DIR ${MKL_INC_DIR} ${MKL_LAPACK_INC_DIR})
  set(CBLAS_LIBRARIES ${MKL_INTEL_LP64} ${MKL_SEQUENTIAL_LIB} ${MKL_CORE_LIB})
  # Directory-scoped definitions: they apply to targets created after this point.
  add_definitions(-DPADDLE_USE_MKL)
  add_definitions(-DLAPACK_FOUND)
  message(STATUS "Found MKL (include: ${MKL_INC_DIR}, library: ${CBLAS_LIBRARIES})")
  message(STATUS "Found lapack in MKL (include: ${MKL_LAPACK_INC_DIR})")
  # Stop processing this file so the providers searched further down are skipped.
  return()
 endif()
 ## Then find atlas.
 set(ATLAS_ROOT $ENV{ATLAS_ROOT} CACHE PATH "Folder contains Atlas")
 set(ATLAS_INCLUDE_SEARCH_PATHS
--- a/cmake/external/mkldnn.cmake
+++ b/cmake/external/mkldnn.cmake
@ -46,16 +46,20 @@ IF(${CBLAS_PROVIDER} STREQUAL "MKLML")
    MESSAGE(STATUS "Build MKLDNN with ${MKLDNN_MKLROOT}")
 ENDIF()
 # Flags handed to the MKL-DNN sub-build: the parent project's C/C++ flags plus
 # -Wno-error=strict-overflow.
 SET(MKLDNN_CFLAG "${CMAKE_C_FLAGS} -Wno-error=strict-overflow")
 SET(MKLDNN_CXXFLAG "${CMAKE_CXX_FLAGS} -Wno-error=strict-overflow")
 # Fetch MKL-DNN from git at the pinned tag and build it as an external project.
 # The install prefix and MKLROOT are passed both as CMAKE_ARGS and as
 # CMAKE_CACHE_ARGS; UPDATE_COMMAND is set to an empty string.
 ExternalProject_Add(
    ${MKLDNN_PROJECT}
    ${EXTERNAL_PROJECT_LOG_ARGS}
    DEPENDS             ${MKLDNN_DEPENDS}
    GIT_REPOSITORY      "https://github.com/01org/mkl-dnn.git"
-    GIT_TAG             "v0.10"
+    GIT_TAG             "v0.11"
    PREFIX              ${MKLDNN_SOURCES_DIR}
    UPDATE_COMMAND      ""
    CMAKE_ARGS          -DCMAKE_INSTALL_PREFIX=${MKLDNN_INSTALL_DIR}
    CMAKE_ARGS          -DMKLROOT=${MKLDNN_MKLROOT}
    CMAKE_ARGS          -DCMAKE_C_FLAGS=${MKLDNN_CFLAG}
    CMAKE_ARGS          -DCMAKE_CXX_FLAGS=${MKLDNN_CXXFLAG}
    CMAKE_CACHE_ARGS    -DCMAKE_INSTALL_PREFIX:PATH=${MKLDNN_INSTALL_DIR}
                        -DMKLROOT:PATH=${MKLDNN_MKLROOT}
 )
--- a/cmake/external/mklml.cmake
+++ b/cmake/external/mklml.cmake
@ -27,8 +27,8 @@ ENDIF()
 INCLUDE(ExternalProject)
 SET(MKLML_PROJECT       "extern_mklml")
-SET(MKLML_VER           "mklml_lnx_2018.0.20170720")
+SET(MKLML_VER           "mklml_lnx_2018.0.1.20171007")
-SET(MKLML_URL           "https://github.com/01org/mkl-dnn/releases/download/v0.10/${MKLML_VER}.tgz")
+SET(MKLML_URL           "https://github.com/01org/mkl-dnn/releases/download/v0.11/${MKLML_VER}.tgz")
 SET(MKLML_SOURCE_DIR    "${THIRD_PARTY_PATH}/mklml")
 SET(MKLML_DOWNLOAD_DIR  "${MKLML_SOURCE_DIR}/src/${MKLML_PROJECT}")
 SET(MKLML_DST_DIR       "mklml")
--- a/cmake/external/openblas.cmake
+++ b/cmake/external/openblas.cmake
@ -115,7 +115,7 @@ INCLUDE_DIRECTORIES(${CBLAS_INC_DIR})
 # linear algebra libraries for cc_library(xxx SRCS xxx.c DEPS cblas)
 # A one-line dummy source is generated in the binary dir so that a real
 # `cblas` target exists for other targets to depend on.
 SET(dummyfile ${CMAKE_CURRENT_BINARY_DIR}/cblas_dummy.c)
 FILE(WRITE ${dummyfile} "const char * dummy = \"${dummyfile}\";")
-IF(${CBLAS_PROVIDER} MATCHES MKL)
+# EQUAL only compares numbers and is false for a provider name, so the string
+# test STREQUAL is required; quoting keeps the condition well-formed when
+# CBLAS_PROVIDER is empty.
+IF("${CBLAS_PROVIDER}" STREQUAL "MKLML")
    ADD_LIBRARY(cblas SHARED ${dummyfile})
 ELSE()
    ADD_LIBRARY(cblas STATIC ${dummyfile})
--- a/doc/api/v2/data.rst
+++ b/doc/api/v2/data.rst
@ -2,112 +2,9 @@
 Data Reader Interface and DataSets
 ==================================
 ..  toctree::
    :maxdepth: 1
-DataTypes
+    data/data_reader.rst
-=========
+    data/image.rst
-
+    data/dataset.rst
 ..  automodule:: paddle.v2.data_type
    :members:
    :noindex:
 DataFeeder
 ==========
 ..  automodule:: paddle.v2.data_feeder
    :members:
    :noindex:
 Reader
 ======
 ..  automodule:: paddle.v2.reader
    :members:
    :noindex:
 ..  automodule:: paddle.v2.reader.creator
    :members:
    :noindex:
 minibatch
 =========
 ..  automodule:: paddle.v2.minibatch
    :members:
    :noindex:
 Dataset
 =======
 ..  automodule:: paddle.v2.dataset
    :members:
    :noindex:
 mnist
 +++++
 ..  automodule:: paddle.v2.dataset.mnist
    :members:
    :noindex:
 cifar
 +++++
 ..  automodule:: paddle.v2.dataset.cifar
    :members:
    :noindex:
 conll05
 +++++++
 ..  automodule:: paddle.v2.dataset.conll05
    :members: get_dict,get_embedding,test
    :noindex:
 imdb
 ++++
 ..  automodule:: paddle.v2.dataset.imdb
    :members:
    :noindex:
 imikolov
 ++++++++
 ..  automodule:: paddle.v2.dataset.imikolov
    :members:
    :noindex:
 movielens
 +++++++++
 ..  automodule:: paddle.v2.dataset.movielens
    :members:
    :noindex:
 ..  autoclass:: paddle.v2.dataset.movielens.MovieInfo
    :noindex:
 ..  autoclass:: paddle.v2.dataset.movielens.UserInfo
    :noindex:
 sentiment
 +++++++++
 ..  automodule:: paddle.v2.dataset.sentiment
    :members:
    :noindex:
 uci_housing
 +++++++++++
 ..  automodule:: paddle.v2.dataset.uci_housing
    :members:
    :noindex:
 wmt14
 +++++
 ..  automodule:: paddle.v2.dataset.wmt14
    :members:
    :noindex:
--- a/doc/api/v2/data/data_reader.rst
+++ b/doc/api/v2/data/data_reader.rst
@ -0,0 +1,36 @@
 =====================
 Data Reader Interface
 =====================
 DataTypes
 =========
 ..  automodule:: paddle.v2.data_type
    :members:
    :noindex:
 DataFeeder
 ==========
 ..  automodule:: paddle.v2.data_feeder
    :members:
    :noindex:
 Reader
 ======
 ..  automodule:: paddle.v2.reader
    :members:
    :noindex:
 ..  automodule:: paddle.v2.reader.creator
    :members:
    :noindex:
 minibatch
 =========
 ..  automodule:: paddle.v2.minibatch
    :members:
    :noindex:
--- a/doc/api/v2/data/dataset.rst
+++ b/doc/api/v2/data/dataset.rst
@ -0,0 +1,75 @@
 Dataset
 =======
 ..  automodule:: paddle.v2.dataset
    :members:
    :noindex:
 mnist
 +++++
 ..  automodule:: paddle.v2.dataset.mnist
    :members:
    :noindex:
 cifar
 +++++
 ..  automodule:: paddle.v2.dataset.cifar
    :members:
    :noindex:
 conll05
 +++++++
 ..  automodule:: paddle.v2.dataset.conll05
    :members: get_dict,get_embedding,test
    :noindex:
 imdb
 ++++
 ..  automodule:: paddle.v2.dataset.imdb
    :members:
    :noindex:
 imikolov
 ++++++++
 ..  automodule:: paddle.v2.dataset.imikolov
    :members:
    :noindex:
 movielens
 +++++++++
 ..  automodule:: paddle.v2.dataset.movielens
    :members:
    :noindex:
 ..  autoclass:: paddle.v2.dataset.movielens.MovieInfo
    :noindex:
 ..  autoclass:: paddle.v2.dataset.movielens.UserInfo
    :noindex:
 sentiment
 +++++++++
 ..  automodule:: paddle.v2.dataset.sentiment
    :members:
    :noindex:
 uci_housing
 +++++++++++
 ..  automodule:: paddle.v2.dataset.uci_housing
    :members:
    :noindex:
 wmt14
 +++++
 ..  automodule:: paddle.v2.dataset.wmt14
    :members:
    :noindex:
--- a/doc/api/v2/data/image.rst
+++ b/doc/api/v2/data/image.rst
@ -0,0 +1,5 @@
 Image Interface
 ===============
 ..  automodule:: paddle.v2.image
    :members:
--- a/doc/design/ops/images/LOD-and-shape-changes-during-decoding.jpg
+++ b/doc/design/ops/images/LOD-and-shape-changes-during-decoding.jpg
--- a/doc/design/ops/sequence_decoder.md
+++ b/doc/design/ops/sequence_decoder.md
@ -0,0 +1,245 @@
 # Design: Sequence Decoder Generating LoDTensors
 In tasks such as machine translation and image to text, 
 a [sequence decoder](https://github.com/PaddlePaddle/book/blob/develop/08.machine_translation/README.md) is necessary to generate sequences.
 This documentation describes how to implement the sequence decoder as an operator.
 ## Beam Search based Decoder
 The [beam search algorithm](https://en.wikipedia.org/wiki/Beam_search) is necessary when generating sequences, 
 it is a heuristic search algorithm that explores the paths by expanding the most promising node in a limited set.
 In the old version of PaddlePaddle, a C++ class `RecurrentGradientMachine` implements the general sequence decoder based on beam search, 
 due to the complexity, the implementation relies on a lot of special data structures,
 quite trivial and hard to be customized by users.
 There are a lot of heuristic tricks in the sequence generation tasks, 
 so the flexibility of sequence decoder is very important to users.
 During PaddlePaddle's refactoring work,
 some new concepts are proposed, such as [LoDTensor](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/lod_tensor.md) and [TensorArray](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/tensor_array.md), that can better support sequence usage,
 and they can help to make the implementation of beam search based sequence decoder **more transparent and modular** .
 For example, the RNN states, candidate IDs and probabilities of beam search can be represented as `LoDTensors`;
 the selected candidate's IDs in each time step can be stored in a `TensorArray`, and `Packed` to the sentences translated.
 ## Changing LoD's absolute offset to relative offsets
 The current `LoDTensor` is designed to store levels of variable-length sequences,
 it stores several arrays of integers each represents a level.
 The integers in each level represents the begin and end (not inclusive) offset of a sequence **in the underlying tensor**, 
 let's call this format the **absolute-offset LoD** for clarity.
 The absolute-offset LoD can quickly retrieve any sequence but fails to represent empty sequences; for example, a two-level LoD is as follows
 ```python
 [[0, 3, 9]
 [0, 2, 3, 3, 3, 9]]
 ```
 The first level tells that there are two sequences:
 - the first's offset is `[0, 3)`
 - the second's offset is `[3, 9)`
 while on the second level, there are several empty sequences that both begin and end at `3`.
 It is impossible to tell how many empty second-level sequences exist in the first-level sequences.
 There are many scenarios that rely on empty sequence representation,
 such as machine translation or image to text, one instance has no translations or the empty candidate set for a prefix.
 So let's introduce another format of LoD, 
 it stores **the offsets of the lower level sequences** and is called **relative-offset** LoD.
 For example, to represent the same sequences of the above data
 ```python
 [[0, 3, 5]
 [0, 2, 3, 3, 3, 9]]
 ```
 the first level represents that there are two sequences, 
 their offsets in the second-level LoD is `[0, 3)` and `[3, 5)`.
 The second level is the same as in the absolute-offset example because the lower level is a tensor.
 It is easy to find out the second sequence in the first-level LoD has two empty sequences.
 The following demos are based on relative-offset LoD.
 ## Usage in a simple machine translation model
 Let's start from a simple machine translation model that is simplified from [machine translation chapter](https://github.com/PaddlePaddle/book/tree/develop/08.machine_translation) to draw a simple blueprint of what a sequence decoder can do and how to use it.
 The model has an encoder that learns the semantic vector from a sequence,
 and a decoder which uses the sequence decoder to generate new sentences.
 **Encoder**
 ```python
 import paddle as pd
 dict_size = 8000
 source_dict_size = dict_size
 target_dict_size = dict_size
 word_vector_dim = 128
 encoder_dim = 128
 decoder_dim = 128
 beam_size = 5
 max_length = 120
 # encoder
 src_word_id = pd.data(
    name='source_language_word',
    type=pd.data.integer_value_sequence(source_dict_size))
 src_embedding = pd.embedding(size=source_dict_size, size=word_vector_dim)
 src_word_vec = pd.lookup(src_embedding, src_word_id)
 encoder_out_seq = pd.gru(input=src_word_vec, size=encoder_dim)
 encoder_ctx = pd.last_seq(encoder_out_seq)
 # encoder_ctx_proj is the learned semantic vector
 encoder_ctx_proj = pd.fc(
    encoder_ctx, size=decoder_dim, act=pd.activation.Tanh(), bias=None)
 ```
 **Decoder**
 ```python
 def generate():
    decoder = pd.while_loop()
    with decoder.step():
        decoder_mem = decoder.memory(init=encoder_ctx)  # mark the memory
        generated_ids = decoder.memory() # TODO init to batch_size <s>s
        generated_scores = decoder.memory() # TODO init to batch_size 1s or 0s
        target_word = pd.lookup(trg_embedding, generated_ids)
        # expand encoder_ctx's batch to fit target_word's lod
        # for example
        # decoder_mem.lod is
        # [[0 1 3],
        #  [0 1 3 6]]
        # its tensor content is [a1 a2 a3 a4 a5]
        # which means there are 2 sentences to translate
        #   - the first sentence has 1 translation prefixes, the offsets are [0, 1)
        #   - the second sentence has 2 translation prefixes, the offsets are [1, 3) and [3, 6)
        # the target_word.lod is 
        # [[0, 1, 6]
        #  [0, 2, 4, 7, 9 12]]
        # which means 2 sentences to translate, each has 1 and 5 prefixes
        # the first prefix has 2 candidates
        # the following has 2, 3, 2, 3 candidates
        # the encoder_ctx_expanded's content will be
        # [a1 a1 a2 a2 a3 a3 a3 a4 a4 a5 a5 a5]
        encoder_ctx_expanded = pd.lod_expand(encoder_ctx, target_word)
        decoder_input = pd.fc(
            act=pd.activation.Linear(),
            input=[target_word, encoder_ctx],
            size=3 * decoder_dim)
        gru_out, cur_mem = pd.gru_step(
            decoder_input, mem=decoder_mem, size=decoder_dim)
        scores = pd.fc(
            gru_out,
            size=trg_dic_size,
            bias=None,
            act=pd.activation.Softmax())
        # K is an config
        topk_scores, topk_ids = pd.top_k(scores, K)
        topk_generated_scores = pd.add_scalar(topk_scores, generated_scores)
        selected_ids, selected_generation_scores = decoder.beam_search(
            topk_ids, topk_generated_scores)
        # update the states
        decoder_mem.update(cur_mem)  # tells how to update state
        generated_ids.update(selected_ids)
        generated_scores.update(selected_generation_scores)
        decoder.output(selected_ids)
        decoder.output(selected_generation_scores)
 translation_ids, translation_scores = decoder()
 ```
 The `decoder.beam_search` is an operator that, given the candidates and the scores of the translations that include those candidates,
 returns the result of the beam search algorithm.
 In this way, users can customize anything on the inputs or outputs of beam search; for example, there are three ways to prune some translation prefixes:
 1. make the corresponding elements in `topk_generated_scores` zero or some small values, so that beam_search will discard these candidates.
 2. remove some specific candidate in `selected_ids`
 3. get the final `translation_ids`, remove the translation sequence in it.
 The implementation of sequence decoder can reuse the C++ class [RNNAlgorithm](https://github.com/Superjom/Paddle/blob/68cac3c0f8451fe62a4cdf156747d6dc0ee000b3/paddle/operators/dynamic_recurrent_op.h#L30),
 so the python syntax is quite similar to a [RNN](https://github.com/Superjom/Paddle/blob/68cac3c0f8451fe62a4cdf156747d6dc0ee000b3/doc/design/block.md#blocks-with-for-and-rnnop).
 Both of them are two-level `LoDTensors`
 - the first level represents `batch_size` of (source) sentences;
 - the second level represents the candidate ID sets for translation prefix.
 for example, 3 source sentences to translate, and has 2, 3, 1 candidates.
 Unlike an RNN, in sequence decoder, the previous state and the current state have different LoD and shape,
 a `lod_expand` operator is used to expand the LoD of the previous state to fit the current state.
 For example, the previous state
 * LoD is `[0, 1, 3][0, 2, 5, 6]`
 * content of tensor is `a1 a2 b1 b2 b3 c1`
 the current state stored in `encoder_ctx_expanded`
 * LoD is `[0, 2, 7][0 3 5 8 9 11 11]`
 * the content is 
  - a1 a1 a1 (a1 has 3 candidates, so the state should be copied 3 times for each candidates)
  - a2 a2
  - b1 b1 b1
  - b2
  - b3 b3
  - None (c1 has 0 candidates, so c1 is dropped)
 Benefit from the relative offset LoD, empty candidate set can be represented naturally.
 the status in each time step can be stored in `TensorArray`, and `Pack`ed to a final LoDTensor, the corresponding syntax is 
 ```python
 decoder.output(selected_ids)
 decoder.output(selected_generation_scores)
 ```
 the `selected_ids` is the candidate ids for the prefixes, 
 it will be `Packed` by `TensorArray` to a two-level `LoDTensor`,
 the first level represents the source sequences,
 the second level represents generated sequences.
 Pack the `selected_scores` will get a `LoDTensor` that stores scores of each candidate of translations.
 Pack the `selected_generation_scores` will get a `LoDTensor`, and each tail is the probability of the translation.
 ## LoD and shape changes during decoding
 <p align="center">
  <img src="./images/LOD-and-shape-changes-during-decoding.jpg"/>
 </p>
 According to the image above, the only phase that changes the LoD is beam search.
 ## Beam search design
 The beam search algorithm will be implemented as one method of the sequence decoder, it has 3 inputs
 1. `topk_ids`, top K candidate ids for each prefix.
 2. `topk_scores`, the corresponding scores for `topk_ids`
 3. `generated_scores`, the score of the prefixes.
 All of them are LoDTensors, so that the sequence affiliation is clear.
 Beam search will keep a beam for each prefix and select a smaller candidate set for each prefix.
 It will return three variables
 1. `selected_ids`, the final candidate beam search function selected for the next step.
 2. `selected_scores`, the scores for the candidates.
 3. `generated_scores`, the updated scores for each prefix (with the new candidates appended).
 ## Introducing the LoD-based `Pack` and `Unpack` methods in `TensorArray`
 The `selected_ids`, `selected_scores` and `generated_scores` are LoDTensors,
 and they exist in each time step,
 so it is natural to store them in arrays.
 Currently, PaddlePaddle has a module called `TensorArray` which can store an array of tensors,
 the results of beam search are better to store in a `TensorArray`.
 The `Pack` and `UnPack` in `TensorArray` are used to package tensors in the array to a `LoDTensor` or split the `LoDTensor` to an array of tensors. 
 It needs some extensions to support pack or unpack an array of `LoDTensors`.
--- a/paddle/framework/backward_test.cc
+++ b/paddle/framework/backward_test.cc
@ -21,7 +21,7 @@
 #include "paddle/framework/var_desc.h"
 #include "paddle/operators/net_op.h"
-USE_OP(fill_constant);
+USE_NO_KERNEL_OP(fill_constant);
 namespace paddle {
 namespace framework {
--- a/paddle/framework/data_type.h
+++ b/paddle/framework/data_type.h
@ -34,6 +34,21 @@ inline DataType ToDataType(std::type_index type) {
  }
 }
 // Translates a DataType enum value into the std::type_index of the matching
 // C++ type (FP32 -> float, FP64 -> double, INT32 -> int, INT64 -> int64_t).
 // Any other value is reported through PADDLE_THROW.
 inline std::type_index ToTypeIndex(DataType type) {
   if (type == DataType::FP32) {
     return typeid(float);
   }
   if (type == DataType::FP64) {
     return typeid(double);
   }
   if (type == DataType::INT32) {
     return typeid(int);
   }
   if (type == DataType::INT64) {
     return typeid(int64_t);
   }
   PADDLE_THROW("Not support type %d", type);
 }
 template <typename Visitor>
 inline void VisitDataType(DataType type, Visitor visitor) {
  switch (type) {
--- a/paddle/framework/ddim.cc
+++ b/paddle/framework/ddim.cc
@ -79,6 +79,13 @@ DDim make_ddim(const std::vector<int64_t>& dims) {
  return result;
 }
 // Overload for int-typed dimension lists: every extent is converted to
 // int64_t and the result is forwarded to the std::vector<int64_t> overload.
 DDim make_ddim(const std::vector<int>& dims) {
   const std::vector<int64_t> widened(dims.begin(), dims.end());
   return make_ddim(widened);
 }
 /// @cond HIDDEN
 // XXX For some reason, putting this in an anonymous namespace causes errors
 class DynamicMutableIndexer : public boost::static_visitor<int64_t&> {
--- a/paddle/framework/ddim.h
+++ b/paddle/framework/ddim.h
@ -81,6 +81,8 @@ struct DDim {
 */
 DDim make_ddim(const std::vector<int64_t>& dims);
 /**
 * \brief Make a DDim from a std::vector of int extents
 *
 * \param dims A vector of ints giving the extent of each dimension
 */
 DDim make_ddim(const std::vector<int>& dims);
 /**
 * \brief Make a DDim from an initializer list
 *
--- a/paddle/function/CMakeLists.txt
+++ b/paddle/function/CMakeLists.txt
@ -45,6 +45,7 @@ if(WITH_GPU)
    add_simple_unittest(BlockExpandOpTest)
    add_simple_unittest(CropOpTest)
    add_simple_unittest(SwitchOpTest)
    add_simple_unittest(ScaleSubRegionOpTest)
 endif()
 add_simple_unittest(Im2ColTest)
--- a/paddle/function/FunctionTest.h
+++ b/paddle/function/FunctionTest.h
@ -110,6 +110,7 @@ public:
        function2_(FunctionBase::funcRegistrar_.createByType(name2)) {
    function1_->init(config);
    function2_->init(config);
    initArgsCallback_ = nullptr;
  }
  ~Compare2Function() {}
@ -170,6 +171,10 @@ public:
                                      *seq2_));
  }
  // Stores a user hook for customizing input arguments. When set (non-null),
  // the input-initialization loop calls it as callback(arg, index) on each
  // function-1 input before that argument is copied to the function-2 input.
  void registerInitCallback(std::function<void(BufferArg&, size_t)> callback) {
    initArgsCallback_ = callback;
  }
  // output need only contains shape, do not contains data.
  void addOutputs(const BufferArg& output, ArgType argType = ASSIGN_TO) {
    size_t size =
@ -340,6 +345,10 @@ protected:
        initArg(*func1Inputs_[i]);
      }
      if (initArgsCallback_ != nullptr) {
        initArgsCallback_(*func1Inputs_[i], i);
      }
      copyArg_(*func1Inputs_[i], *func2Inputs_[i]);
    }
  }
@ -386,6 +395,7 @@ protected:
  std::shared_ptr<SequenceIdArg> seq1_;
  std::shared_ptr<SequenceIdArg> seq2_;
  test::CopyArgument<DType1, DType2> copyArg_;
  std::function<void(BufferArg&, size_t)> initArgsCallback_;
 };
 class CpuGpuFuncCompare
--- a/paddle/function/ScaleSubRegionOp.cpp
+++ b/paddle/function/ScaleSubRegionOp.cpp
@ -0,0 +1,155 @@
 /* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
 You may obtain a copy of the License at
    http://www.apache.org/licenses/LICENSE-2.0
 Unless required by applicable law or agreed to in writing, software
 distributed under the License is distributed on an "AS IS" BASIS,
 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 #include "ScaleSubRegionOp.h"
 #include "paddle/function/TensorShape.h"
 namespace paddle {
 // CPU specialization of the forward pass. The input is first copied verbatim
 // into the output; afterwards, for every sample n, the elements inside that
 // sample's sub-region are multiplied in place by the configured "value".
 // `indices` stores six bounds per sample, laid out as
 // [c_begin, c_end, h_begin, h_end, w_begin, w_end], counted from 1.
 template <>
 void ScaleSubRegion<DEVICE_TYPE_CPU>(real* outputs,
                                      const real* inputs,
                                      const real* indices,
                                      const TensorShape shape,
                                      const FuncConfig& conf) {
   const real scale = conf.get<real>("value");
   const int number = shape[0];
   const int channel = shape[1];
   const int height = shape[2];
   const int width = shape[3];
   memcpy(outputs, inputs, number * channel * height * width * sizeof(real));
   for (int n = 0; n < number; ++n) {
     // Bounds are 1-based, hence the "- 1" on every loop start below.
     const real* bounds = indices + n * 6;
     for (int c = bounds[0] - 1; c < bounds[1]; ++c) {
       for (int h = bounds[2] - 1; h < bounds[3]; ++h) {
         // Flat offset of element (n, c, h, 0) in the NCHW buffer.
         const int rowBase = ((n * channel + c) * height + h) * width;
         for (int w = bounds[4] - 1; w < bounds[5]; ++w) {
           outputs[rowBase + w] *= scale;
         }
       }
     }
   }
 }
 // CPU specialization of the backward pass. Walks every element of the
 // [N, C, H, W] tensor and adds to outGrad either inGrad * "value" (element
 // lies inside the sample's 1-based sub-region) or inGrad unchanged (outside).
 template <>
 void ScaleSubRegionGrad<DEVICE_TYPE_CPU>(const real* inGrad,
                                          real* outGrad,
                                          const real* indices,
                                          const TensorShape shape,
                                          const FuncConfig& conf) {
   const real scale = conf.get<real>("value");
   const int number = shape[0];
   const int channel = shape[1];
   const int height = shape[2];
   const int width = shape[3];
   for (int n = 0; n < number; ++n) {
     // Six bounds per sample: [c_begin, c_end, h_begin, h_end, w_begin, w_end].
     const real* bounds = indices + n * 6;
     for (int c = 0; c < channel; ++c) {
       const bool cInside = c >= (bounds[0] - 1) && c <= (bounds[1] - 1);
       for (int h = 0; h < height; ++h) {
         const bool hInside = h >= (bounds[2] - 1) && h <= (bounds[3] - 1);
         for (int w = 0; w < width; ++w) {
           const bool wInside = w >= (bounds[4] - 1) && w <= (bounds[5] - 1);
           const int idx = ((n * channel + c) * height + h) * width + w;
           if (cInside && hInside && wInside) {
             outGrad[idx] += inGrad[idx] * scale;
           } else {
             outGrad[idx] += inGrad[idx];
           }
         }
       }
     }
   }
 }
 /**
 * \brief Forward function that multiplies a contiguous sub-region of every
 *        instance by a configured value. The region is described per
 *        instance by start and end indices along C, H and W.
 *
 * Arguments of calc():
 * \param inputs[0]   A 4-D tensor with shape [N, C, H, W], the data.
 * \param inputs[1]   A 2-D tensor with shape [N, 6], the sub-region indices.
 * \param outputs[0]  A 4-D tensor with the same shape as inputs[0]; it must be
 *                    an ASSIGN_TO argument.
 */
 template <DeviceType Device>
 class ScaleSubRegionFunc : public FunctionBase {
 public:
   // Keeps a copy of the configuration; the kernel reads "value" from it.
   void init(const FuncConfig& config) override { conf_ = config; }
   // Validates the argument counts and output mode, then runs the kernel.
   void calc(const BufferArgs& inputs, const BufferArgs& outputs) override {
     CHECK_EQ(2UL, inputs.size());
     CHECK_EQ(1UL, outputs.size());
     CHECK_EQ(outputs[0].getArgType(), ASSIGN_TO);
     const auto& source = inputs[0];
     const auto& region = inputs[1];
     const auto& target = outputs[0];
     ScaleSubRegion<Device>(target.data<real>(),
                            source.data<real>(),
                            region.data<real>(),
                            source.shape(),
                            conf_);
   }
 private:
   FuncConfig conf_;
 };
 /**
 * \brief Backward function of ScaleSubRegion.
 *
 * Arguments of calc():
 * \param inputs[0]   A 4-D tensor with shape [N, C, H, W], the output gradient.
 * \param inputs[1]   A 2-D tensor with shape [N, 6], the sub-region indices.
 * \param outputs[0]  A 4-D tensor with shape [N, C, H, W], the gradient of the
 *                    input value; it must be an ADD_TO argument.
 */
 template <DeviceType Device>
 class ScaleSubRegionGradFunc : public FunctionBase {
 public:
   // Keeps a copy of the configuration; the kernel reads "value" from it.
   void init(const FuncConfig& config) override { conf_ = config; }
   // Validates the argument counts and output mode, then runs the kernel.
   void calc(const BufferArgs& inputs, const BufferArgs& outputs) override {
     CHECK_EQ(2UL, inputs.size());
     CHECK_EQ(1UL, outputs.size());
     CHECK_EQ(outputs[0].getArgType(), ADD_TO);
     const auto& upstream = inputs[0];
     const auto& region = inputs[1];
     const auto& target = outputs[0];
     ScaleSubRegionGrad<Device>(upstream.data<real>(),
                                target.data<real>(),
                                region.data<real>(),
                                upstream.shape(),
                                conf_);
   }
 private:
   FuncConfig conf_;
 };
 // Register the forward and backward function classes for the CPU device.
 REGISTER_TYPED_FUNC(ScaleSubRegion, CPU, ScaleSubRegionFunc);
 REGISTER_TYPED_FUNC(ScaleSubRegionGrad, CPU, ScaleSubRegionGradFunc);
 // The GPU registrations are compiled only when PADDLE_WITH_CUDA is defined.
 #ifdef PADDLE_WITH_CUDA
 REGISTER_TYPED_FUNC(ScaleSubRegion, GPU, ScaleSubRegionFunc);
 REGISTER_TYPED_FUNC(ScaleSubRegionGrad, GPU, ScaleSubRegionGradFunc);
 #endif
 }  // namespace paddle
--- a/paddle/function/ScaleSubRegionOp.h
+++ b/paddle/function/ScaleSubRegionOp.h
@ -0,0 +1,55 @@
 /* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
 You may obtain a copy of the License at
    http://www.apache.org/licenses/LICENSE-2.0
 Unless required by applicable law or agreed to in writing, software
 distributed under the License is distributed on an "AS IS" BASIS,
 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 #pragma once
 #include "Function.h"
 namespace paddle {
 /**
 * \brief Function to multiply the values in a specified contiguous sub-region
 *        by a value. Indices must be provided to indicate the location and
 *        shape of the region, and the multiplier is passed through the
 *        configuration variable.
 *
 * \param[out] outputs  Output value.
 * \param[in]  inputs   Input data which contains NCHW information.
 * \param[in]  indices  Indices data to indicate the sub region.
 * \param[in]  shape    Tensor shape of input value.
 * \param[in]  conf     Configure variable which contains the multiplied value.
 */
 template <DeviceType Device>
 void ScaleSubRegion(real* outputs,
                    const real* inputs,
                    const real* indices,
                    const TensorShape shape,
                    const FuncConfig& conf);
 /**
 * \brief Backward propagation function of ScaleSubRegion.
 *
 * \param[in]  inGrad   Read-only gradient buffer (declared const); the
 *                      gradient that this function consumes.
 * \param[out] outGrad  Writable gradient buffer that receives the result.
 * \param[in]  indices  Indices data.
 * \param[in]  shape    The Shape of input tensor.
 * \param[in]  conf     Configure variable.
 */
 template <DeviceType Device>
 void ScaleSubRegionGrad(const real* inGrad,
                        real* outGrad,
                        const real* indices,
                        const TensorShape shape,
                        const FuncConfig& conf);
 }  // namespace paddle
--- a/paddle/function/ScaleSubRegionOpGpu.cu
+++ b/paddle/function/ScaleSubRegionOpGpu.cu
@ -0,0 +1,116 @@
 /* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
 You may obtain a copy of the License at
    http://www.apache.org/licenses/LICENSE-2.0
 Unless required by applicable law or agreed to in writing, software
 distributed under the License is distributed on an "AS IS" BASIS,
 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 #include "ScaleSubRegionOp.h"
 #include "hl_base.h"
 namespace paddle {
 /**
 * CUDA kernel of ScaleSubRegion: one thread per element.
 *
 * Copies inputs to outputs, multiplying by `value` every element whose
 * (channel, height, width) position lies inside its sample's sub region.
 * The flat element index is decomposed with width varying fastest, then
 * height, then channel, then sample. Each sample owns six consecutive
 * entries of `indices`, compared as 1-based inclusive bounds:
 * channel start/end, height start/end, width start/end.
 *
 * `nthreads` is the total number of elements; threads beyond it do nothing.
 */
 __global__ void KeScaleSubRegion(real* outputs,
                                 const real* inputs,
                                 const real* indices,
                                 real value,
                                 int channel,
                                 int height,
                                 int width,
                                 int nthreads) {
  const int idx = threadIdx.x + blockIdx.x * blockDim.x;
  if (idx >= nthreads) {
    return;
  }

  // Peel the coordinates off the flat index, innermost dimension first.
  const int row = idx / width;
  const int plane = row / height;
  const int w = idx % width;
  const int h = row % height;
  const int c = plane % channel;
  const int n = plane / channel;

  // Six bounds per sample; the stored bounds are 1-based, hence the "- 1".
  const real* bounds = indices + n * 6;
  const bool inChannel = c >= (bounds[0] - 1) && c <= (bounds[1] - 1);
  const bool inHeight = h >= (bounds[2] - 1) && h <= (bounds[3] - 1);
  const bool inWidth = w >= (bounds[4] - 1) && w <= (bounds[5] - 1);

  if (inChannel && inHeight && inWidth) {
    outputs[idx] = inputs[idx] * value;
  } else {
    outputs[idx] = inputs[idx];
  }
 }
 /**
 * GPU specialization of ScaleSubRegion.
 *
 * Reads the multiplier from `conf` under the key "value", takes the four
 * dimensions from `shape` in the order number, channel, height, width, and
 * launches KeScaleSubRegion on STREAM_DEFAULT with one thread per element
 * (1024 threads per block, block count rounded up).
 */
 template <>
 void ScaleSubRegion<DEVICE_TYPE_GPU>(real* outputs,
                                     const real* inputs,
                                     const real* indices,
                                     const TensorShape shape,
                                     const FuncConfig& conf) {
  real scale = conf.get<real>("value");

  int number = shape[0];
  int channel = shape[1];
  int height = shape[2];
  int width = shape[3];

  // Total element count; also the number of threads that do real work.
  size_t elemCnt = number * channel * height * width;

  // Ceil-divide so a partially filled last block is still launched.
  int threadsPerBlock = 1024;
  int blockCnt = (elemCnt + threadsPerBlock - 1) / threadsPerBlock;

  KeScaleSubRegion<<<blockCnt, threadsPerBlock, 0, STREAM_DEFAULT>>>(
      outputs, inputs, indices, scale, channel, height, width, elemCnt);
  CHECK_SYNC("ScaleSubRegion");
 }
 /**
 * CUDA kernel of ScaleSubRegionGrad: one thread per element.
 *
 * Accumulates inGrad into outGrad (outGrad += ...), multiplying inGrad by
 * `value` for every element whose (channel, height, width) position lies
 * inside its sample's sub region. The flat element index is decomposed with
 * width varying fastest, then height, then channel, then sample. Each sample
 * owns six consecutive entries of `indices`, compared as 1-based inclusive
 * bounds: channel start/end, height start/end, width start/end.
 *
 * `nthreads` is the total number of elements; threads beyond it do nothing.
 */
 __global__ void KeScaleSubRegionDiff(const real* inGrad,
                                     real* outGrad,
                                     const real* indices,
                                     real value,
                                     int channel,
                                     int height,
                                     int width,
                                     int nthreads) {
  const int idx = threadIdx.x + blockIdx.x * blockDim.x;
  if (idx >= nthreads) {
    return;
  }

  // Peel the coordinates off the flat index, innermost dimension first.
  const int row = idx / width;
  const int plane = row / height;
  const int w = idx % width;
  const int h = row % height;
  const int c = plane % channel;
  const int n = plane / channel;

  // Six bounds per sample; the stored bounds are 1-based, hence the "- 1".
  const real* bounds = indices + n * 6;
  const bool inChannel = c >= (bounds[0] - 1) && c <= (bounds[1] - 1);
  const bool inHeight = h >= (bounds[2] - 1) && h <= (bounds[3] - 1);
  const bool inWidth = w >= (bounds[4] - 1) && w <= (bounds[5] - 1);

  if (inChannel && inHeight && inWidth) {
    outGrad[idx] += inGrad[idx] * value;
  } else {
    outGrad[idx] += inGrad[idx];
  }
 }
 /**
 * GPU specialization of ScaleSubRegionGrad.
 *
 * Reads the multiplier from `conf` under the key "value", takes the four
 * dimensions from `shape` in the order number, channel, height, width, and
 * launches KeScaleSubRegionDiff on STREAM_DEFAULT with one thread per element
 * (1024 threads per block, block count rounded up). The kernel accumulates
 * into outGrad, so outGrad's existing contents are part of the result.
 */
 template <>
 void ScaleSubRegionGrad<DEVICE_TYPE_GPU>(const real* inGrad,
                                         real* outGrad,
                                         const real* indices,
                                         const TensorShape shape,
                                         const FuncConfig& conf) {
  real scale = conf.get<real>("value");

  int number = shape[0];
  int channel = shape[1];
  int height = shape[2];
  int width = shape[3];

  // Total element count; also the number of threads that do real work.
  size_t elemCnt = number * channel * height * width;

  // Ceil-divide so a partially filled last block is still launched.
  int threadsPerBlock = 1024;
  int blockCnt = (elemCnt + threadsPerBlock - 1) / threadsPerBlock;

  KeScaleSubRegionDiff<<<blockCnt, threadsPerBlock, 0, STREAM_DEFAULT>>>(
      inGrad, outGrad, indices, scale, channel, height, width, elemCnt);
  CHECK_SYNC("ScaleSubRegionGrad");
 }
 }  // namespace paddle
--- a/paddle/function/ScaleSubRegionOpTest.cpp
+++ b/paddle/function/ScaleSubRegionOpTest.cpp
@ -0,0 +1,72 @@
 /* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
 You may obtain a copy of the License at
    http://www.apache.org/licenses/LICENSE-2.0
 Unless required by applicable law or agreed to in writing, software
 distributed under the License is distributed on an "AS IS" BASIS,
 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 #include <gtest/gtest.h>
 #include "FunctionTest.h"
 namespace paddle {
 TEST(ScaleSubRegion, real) {
  for (size_t numSamples : {5, 32}) {
    for (size_t channels : {5, 32}) {
      for (size_t imgSizeH : {5, 33}) {
        for (size_t imgSizeW : {5, 32}) {
          for (real value : {-0.5, 0.0, 0.5}) {
            for (bool firstHalf : {false, true}) {
              VLOG(3) << " numSamples=" << numSamples
                      << " channels=" << channels << " imgSizeH=" << imgSizeH
                      << " imgSizeW=" << imgSizeW;
              for (bool testGrad : {false, true}) {
                CpuGpuFuncCompare compare(
                    testGrad ? "ScaleSubRegionGrad" : "ScaleSubRegion",
                    FuncConfig().set<real>("value", value));
                TensorShape shape{numSamples, channels, imgSizeH, imgSizeW};
                TensorShape indicesShape{numSamples, 6};
                compare.addInputs(BufferArg(VALUE_TYPE_FLOAT, shape));
                compare.addInputs(BufferArg(VALUE_TYPE_FLOAT, indicesShape));
                compare.registerInitCallback([=](BufferArg& arg, size_t index) {
                  if (index == 1) {
                    real* data = (real*)arg.data();
                    for (size_t i = 0; i < numSamples; ++i) {
                      size_t offset = i * 6;
                      data[offset] = firstHalf ? 1 : channels / 2;
                      data[offset + 1] = firstHalf ? channels / 2 : channels;
                      data[offset + 2] = firstHalf ? 1 : imgSizeH / 2;
                      data[offset + 3] = firstHalf ? imgSizeH / 2 : imgSizeH;
                      data[offset + 4] = firstHalf ? 1 : imgSizeW / 2;
                      data[offset + 5] = firstHalf ? imgSizeW / 2 : imgSizeW;
                    }
                  }
                });
                compare.addOutputs(
                    BufferArg(
                        VALUE_TYPE_FLOAT, shape, testGrad ? ADD_TO : ASSIGN_TO),
                    testGrad ? ADD_TO : ASSIGN_TO);
                compare.run();
              }
            }
          }
        }
      }
    }
  }
 }
 }  // namespace paddle
--- a/paddle/gserver/layers/MKLDNNAddtoLayer.cpp
+++ b/paddle/gserver/layers/MKLDNNAddtoLayer.cpp
@ -62,16 +62,14 @@ void MKLDNNAddtoLayer::resetFwd(std::vector<primitive>& pipeline,
                                MKLDNNMatrixPtr& wgt,
                                MKLDNNMatrixPtr& bias,
                                MKLDNNMatrixPtr& out) {
-  if (biases_) {
+  resetFwdBuffers(inVals_, bias, out);
    LOG(FATAL) << "not implemented yet";
  }
  resetFwdBuffers(inVals_, out);
  in = inVals_[0];
  std::shared_ptr<sum::primitive_desc> fwdPD;
-  resetFwdPD(fwdPD, inVals_, out);
+  std::shared_ptr<sum::primitive_desc> biasPD;
  resetFwdPD(fwdPD, biasPD, inVals_, bias, out);
-  resetFwdPipeline(pipeline, fwdPD, inVals_, out);
+  resetFwdPipeline(pipeline, fwdPD, biasPD, inVals_, bias, out);
 }
 void MKLDNNAddtoLayer::resetBwd(std::vector<primitive>& pipeline,
@ -79,7 +77,7 @@ void MKLDNNAddtoLayer::resetBwd(std::vector<primitive>& pipeline,
                                MKLDNNMatrixPtr& wgt,
                                MKLDNNMatrixPtr& bias,
                                MKLDNNMatrixPtr& out) {
-  resetBwdBuffers(inGrads_, out);
+  resetBwdBuffers(inGrads_, bias, out);
  in = inGrads_[0];
  // backward only need share output grad to input grad
@ -89,6 +87,20 @@ void MKLDNNAddtoLayer::resetBwd(std::vector<primitive>& pipeline,
      inputLayers_[i]->getOutputGrad()->setData(inGrads_[i]->getData());
    }
  }
  // backward bias
  bwdBias_ = nullptr;
  if (bias) {
    std::vector<float> scales(bs_, 1.0);
    std::vector<memory::primitive_desc> srcPDs(bs_, bias->getPrimitiveDesc());
    auto biasPD = sum::primitive_desc(bias->getMemoryDesc(), scales, srcPDs);
    std::vector<primitive::at> srcs;
    for (size_t i = 0; i < grads_.size(); ++i) {
      srcs.push_back(*(grads_[i]));
    }
    bwdBias_.reset(new sum(biasPD, srcs, *bias));
    pipeline.push_back(*bwdBias_);
  }
 }
 void MKLDNNAddtoLayer::updateWeights(const UpdateCallback& callback) {
@ -97,7 +109,25 @@ void MKLDNNAddtoLayer::updateWeights(const UpdateCallback& callback) {
  }
 }
 /**
 * Builds the MKL-DNN bias matrix and one per-sample view of `out`.
 *
 * `bias` is created from `biasMat` with a 1-D primitive descriptor of
 * layerSize_ elements (memory::format::x). `outs` is cleared and refilled
 * with bs_ matrices: entry i is a 1 x layerSize_ matrix created on the data
 * of `out` starting at element i * layerSize_, using the bias primitive
 * descriptor. `out` must therefore hold exactly bs_ * layerSize_ elements.
 */
 void MKLDNNAddtoLayer::prepareBias(MKLDNNMatrixPtr& bias,
                                   const MatrixPtr& biasMat,
                                   const MKLDNNMatrixPtr& out,
                                   std::vector<MKLDNNMatrixPtr>& outs) {
  auto biasDesc = MKLDNNMatrix::createPrimitiveDesc(
      {(int)layerSize_}, memory::format::x, engine_);
  bias = MKLDNNMatrix::create(biasDesc, biasMat);

  outs.clear();
  real* rowData = out->getData();
  CHECK_EQ(bs_ * layerSize_, out->getElementCnt());

  // Walk the output one sample (layerSize_ elements) at a time.
  for (int sample = 0; sample < bs_; ++sample, rowData += layerSize_) {
    MatrixPtr row = Matrix::create(rowData, 1, layerSize_, false, false);
    outs.push_back(MKLDNNMatrix::create(bias->getPrimitiveDesc(), row));
  }
 }
 void MKLDNNAddtoLayer::resetFwdBuffers(std::vector<MKLDNNMatrixPtr>& inputs,
                                       MKLDNNMatrixPtr& bias,
                                       MKLDNNMatrixPtr& out) {
  inputs.resize(inputLayers_.size());
  for (size_t i = 0; i < inputs.size(); i++) {
@ -110,12 +140,20 @@ void MKLDNNAddtoLayer::resetFwdBuffers(std::vector<MKLDNNMatrixPtr>& inputs,
  }
  resetOutValue(out, inputs[0]->getPrimitiveDesc());
  if (biases_ && biases_->getW()) {
    prepareBias(bias, biases_->getW(), out, vals_);
  } else {
    bias = nullptr;
  }
 }
 void MKLDNNAddtoLayer::resetFwdPD(std::shared_ptr<sum::primitive_desc>& pd,
                                  std::shared_ptr<sum::primitive_desc>& biasPD,
                                  std::vector<MKLDNNMatrixPtr>& inputs,
                                  MKLDNNMatrixPtr bias,
                                  MKLDNNMatrixPtr out) {
-  std::vector<double> scales(inputs.size(), 1.0);
+  std::vector<float> scales(inputs.size(), 1.0);
  std::vector<memory::primitive_desc> srcPDs;
  for (size_t i = 0; i < inputs.size(); i++) {
    srcPDs.push_back(inputs[i]->getPrimitiveDesc());
@ -123,12 +161,23 @@ void MKLDNNAddtoLayer::resetFwdPD(std::shared_ptr<sum::primitive_desc>& pd,
  CHECK(out);
  pd.reset(new sum::primitive_desc(out->getMemoryDesc(), scales, srcPDs));
  CHECK_PRIMITIVE_DESC_EQ(out, pd->dst_primitive_desc());
  biasPD = nullptr;
  if (bias) {
    std::vector<float> scales(2, 1.0);
    std::vector<memory::primitive_desc> srcPDs(2, bias->getPrimitiveDesc());
    biasPD.reset(
        new sum::primitive_desc(bias->getMemoryDesc(), scales, srcPDs));
    CHECK_PRIMITIVE_DESC_EQ(bias, biasPD->dst_primitive_desc());
  }
 }
 void MKLDNNAddtoLayer::resetFwdPipeline(
    std::vector<primitive>& pipeline,
    std::shared_ptr<sum::primitive_desc>& pd,
    std::shared_ptr<sum::primitive_desc>& biasPD,
    std::vector<MKLDNNMatrixPtr>& inputs,
    MKLDNNMatrixPtr& bias,
    MKLDNNMatrixPtr& out) {
  std::vector<primitive::at> srcs;
  for (size_t i = 0; i < inputs.size(); i++) {
@ -136,9 +185,23 @@ void MKLDNNAddtoLayer::resetFwdPipeline(
  }
  fwd_.reset(new sum(*pd, srcs, *out));
  pipeline.push_back(*fwd_);
  fwdBias_.clear();
  if (biasPD == nullptr || bias == nullptr) {
    return;
  }
  fwdBias_.resize(vals_.size());
  for (size_t i = 0; i < vals_.size(); ++i) {
    std::vector<primitive::at> srcs;
    srcs.push_back(*(vals_[i]));
    srcs.push_back(*bias);
    fwdBias_[i].reset(new sum(*biasPD, srcs, *vals_[i]));
    pipeline.push_back(*fwdBias_[i]);
  }
 }
 void MKLDNNAddtoLayer::resetBwdBuffers(std::vector<MKLDNNMatrixPtr>& inputs,
                                       MKLDNNMatrixPtr& bias,
                                       MKLDNNMatrixPtr& out) {
  CHECK(outVal_);
  resetOutGrad(out, outVal_->getPrimitiveDesc());
@ -149,6 +212,12 @@ void MKLDNNAddtoLayer::resetBwdBuffers(std::vector<MKLDNNMatrixPtr>& inputs,
    resetInGrad(inputs[i], inVal_->getPrimitiveDesc(), i);
    CHECK_PRIMITIVE_DESC_EQ(inputs[i], out->getPrimitiveDesc());
  }
  if (biases_ && biases_->getWGrad()) {
    prepareBias(bias, biases_->getWGrad(), out, grads_);
  } else {
    bias = nullptr;
  }
 }
 }  // namespace paddle
--- a/paddle/gserver/layers/MKLDNNAddtoLayer.h
+++ b/paddle/gserver/layers/MKLDNNAddtoLayer.h
@ -32,9 +32,15 @@ protected:
  // layer size == ic * ih * iw == oc * oh *ow, and can not be changed
  size_t layerSize_;
  // TODO(TJ): this part has not been optimized by MKL-DNN
  std::unique_ptr<Weight> biases_;
  // buffers for adding bias
  std::vector<MKLDNNMatrixPtr> vals_;
  std::vector<MKLDNNMatrixPtr> grads_;
  // primitives for adding bias
  std::vector<std::shared_ptr<mkldnn::primitive>> fwdBias_;
  std::shared_ptr<mkldnn::primitive> bwdBias_;
 public:
  explicit MKLDNNAddtoLayer(const LayerConfig& config) : MKLDNNLayer(config) {}
@ -91,20 +97,34 @@ protected:
   *                    reset pipeline.
   */
  void resetFwdBuffers(std::vector<MKLDNNMatrixPtr>& inputs,
                       MKLDNNMatrixPtr& bias,
                       MKLDNNMatrixPtr& out);
  void resetFwdPD(std::shared_ptr<mkldnn::sum::primitive_desc>& pd,
                  std::shared_ptr<mkldnn::sum::primitive_desc>& biasPD,
                  std::vector<MKLDNNMatrixPtr>& inputs,
                  MKLDNNMatrixPtr bias,
                  MKLDNNMatrixPtr out);
  void resetFwdPipeline(std::vector<mkldnn::primitive>& pipeline,
                        std::shared_ptr<mkldnn::sum::primitive_desc>& pd,
                        std::shared_ptr<mkldnn::sum::primitive_desc>& biasPD,
                        std::vector<MKLDNNMatrixPtr>& inputs,
                        MKLDNNMatrixPtr& bias,
                        MKLDNNMatrixPtr& out);
  /**
   * Backward functions: reset buffers(inputs, output, bias)
   */
  void resetBwdBuffers(std::vector<MKLDNNMatrixPtr>& inputs,
                       MKLDNNMatrixPtr& bias,
                       MKLDNNMatrixPtr& out);
  /**
   * prepare for bias
   */
  void prepareBias(MKLDNNMatrixPtr& bias,
                   const MatrixPtr& biasMat,
                   const MKLDNNMatrixPtr& out,
                   std::vector<MKLDNNMatrixPtr>& outs);
 };
 }  // namespace paddle
--- a/paddle/gserver/layers/MKLDNNBatchNormLayer.cpp
+++ b/paddle/gserver/layers/MKLDNNBatchNormLayer.cpp
@ -119,7 +119,7 @@ void MKLDNNBatchNormLayer::reshape(
    int& bs, int& ic, int& ih, int& iw, int oc, int& oh, int& ow) {
  reshapeInput(bs, ih, iw);
  oh = ih;
-  ow = ow;
+  ow = iw;
  // ic_ and oc can not be changed
  CHECK_EQ(inputElemenCnt_ / bs / ih / iw, (size_t)ic)
      << "Input channel can not be changed";
--- a/paddle/gserver/layers/MKLDNNLayer.cpp
+++ b/paddle/gserver/layers/MKLDNNLayer.cpp
@ -287,7 +287,7 @@ void MKLDNNLayer::resetMergeGrad(MKLDNNMatrixPtr& out) {
    return;
  }
  CHECK(out) << "should have reset internal ouput grad";
-  std::vector<double> scales(outputMap_.size(), 1.0);
+  std::vector<float> scales(outputMap_.size(), 1.0);
  std::vector<memory::primitive_desc> srcPDs;
  std::vector<primitive::at> srcs;
  for (auto it = outputMap_.begin(); it != outputMap_.end(); ++it) {
--- a/Show More
+++ b/Show More