From fa5cb7104d7a8bce56a446eb352037302f93ae70 Mon Sep 17 00:00:00 2001
From: chengduoZH
Date: Sat, 9 Sep 2017 17:40:38 +0800
Subject: [PATCH 01/18] fix switchOrderLayer

---
 python/paddle/trainer/config_parser.py         | 4 ++--
 python/paddle/trainer_config_helpers/layers.py | 1 +
 2 files changed, 3 insertions(+), 2 deletions(-)

diff --git a/python/paddle/trainer/config_parser.py b/python/paddle/trainer/config_parser.py
index 7e9112b43b..356e1d8b6f 100644
--- a/python/paddle/trainer/config_parser.py
+++ b/python/paddle/trainer/config_parser.py
@@ -3748,8 +3748,8 @@ class SwitchOrderLayer(LayerBase):
     def __init__(self, name, inputs, reshape, **xargs):
         super(SwitchOrderLayer, self).__init__(
             name, 'switch_order', 0, inputs=inputs, **xargs)
-        self.config.reshape_conf.heightAxis.extend(reshape['height'])
-        self.config.reshape_conf.widthAxis.extend(reshape['width'])
+        self.config.reshape_conf.height_axis.extend(reshape['height'])
+        self.config.reshape_conf.width_axis.extend(reshape['width'])
 
 
 # Deprecated, use a new layer specific class instead
diff --git a/python/paddle/trainer_config_helpers/layers.py b/python/paddle/trainer_config_helpers/layers.py
index dc68c213da..c103edf237 100644
--- a/python/paddle/trainer_config_helpers/layers.py
+++ b/python/paddle/trainer_config_helpers/layers.py
@@ -6460,6 +6460,7 @@ def switch_order_layer(input,
     return LayerOutput(
         name=name,
         layer_type=LayerType.SWITCH_ORDER_LAYER,
+        activation=act,
         parents=input,
         size=l.config.size)
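
[Editor's note: patch 01 (resubmitted as patches 05 and 06 below) renames the reshape_conf protobuf fields to height_axis/width_axis and threads the layer's activation through to its LayerOutput. A minimal usage sketch of the Python call path this fixes; the layer name, axis grouping, and activation are illustrative assumptions, not values taken from this series:]

    # Hedged sketch of switch_order_layer usage. `reshape` is the dict whose
    # 'height'/'width' lists SwitchOrderLayer copies into
    # reshape_conf.height_axis / reshape_conf.width_axis; `act` is the
    # activation that patch 06 now attaches to the returned LayerOutput.
    from paddle.trainer_config_helpers import *

    data = data_layer(name='image', size=2 * 3 * 4)
    switched = switch_order_layer(
        input=data,
        name='switch_order',
        reshape={'height': [0], 'width': [1, 2]},  # assumed axis grouping
        act=ReluActivation())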
From a5103770240ceebb5e98f2d99c97f9a6038818b9 Mon Sep 17 00:00:00 2001
From: wenshilei
Date: Sun, 10 Sep 2017 02:37:34 +0800
Subject: [PATCH 02/18] Fix ssd bugs.

---
 paddle/gserver/layers/DetectionOutputLayer.cpp | 1 +
 paddle/gserver/layers/DetectionUtil.cpp        | 4 +++-
 paddle/gserver/layers/DetectionUtil.h          | 2 +-
 3 files changed, 5 insertions(+), 2 deletions(-)

diff --git a/paddle/gserver/layers/DetectionOutputLayer.cpp b/paddle/gserver/layers/DetectionOutputLayer.cpp
index 8ab838e191..a1036ea866 100644
--- a/paddle/gserver/layers/DetectionOutputLayer.cpp
+++ b/paddle/gserver/layers/DetectionOutputLayer.cpp
@@ -139,6 +139,7 @@ void DetectionOutputLayer::forward(PassType passType) {
                       allDecodedBBoxes,
                       &allIndices);
 
+  numKept = numKept > 0 ? numKept : 1;
   resetOutput(numKept, 7);
   MatrixPtr outV = getOutputValue();
   getDetectionOutput(confBuffer_->getData(),
diff --git a/paddle/gserver/layers/DetectionUtil.cpp b/paddle/gserver/layers/DetectionUtil.cpp
index 3e61adc66e..92c6193035 100644
--- a/paddle/gserver/layers/DetectionUtil.cpp
+++ b/paddle/gserver/layers/DetectionUtil.cpp
@@ -469,7 +469,7 @@ size_t getDetectionIndices(
     const size_t numClasses,
     const size_t backgroundId,
     const size_t batchSize,
-    const size_t confThreshold,
+    const real confThreshold,
     const size_t nmsTopK,
     const real nmsThreshold,
     const size_t keepTopK,
@@ -536,6 +536,8 @@ void getDetectionOutput(const real* confData,
   MatrixPtr outBuffer;
   Matrix::resizeOrCreate(outBuffer, numKept, 7, false, false);
   real* bufferData = outBuffer->getData();
+  for (size_t i = 0; i < 7; i++)
+    bufferData[i] = -1;
   size_t count = 0;
   for (size_t n = 0; n < batchSize; ++n) {
     for (map<size_t, vector<size_t>>::const_iterator it = allIndices[n].begin();
diff --git a/paddle/gserver/layers/DetectionUtil.h b/paddle/gserver/layers/DetectionUtil.h
index fe4f9f075e..641ed873b4 100644
--- a/paddle/gserver/layers/DetectionUtil.h
+++ b/paddle/gserver/layers/DetectionUtil.h
@@ -275,7 +275,7 @@ size_t getDetectionIndices(
     const size_t numClasses,
     const size_t backgroundId,
     const size_t batchSize,
-    const size_t confThreshold,
+    const real confThreshold,
     const size_t nmsTopK,
     const real nmsThreshold,
     const size_t keepTopK,

From dade4272c7104653587ec516ce77a4ac6dee23da Mon Sep 17 00:00:00 2001
From: Yuan Gao
Date: Sun, 10 Sep 2017 02:41:43 +0800
Subject: [PATCH 03/18] Update DetectionUtil.cpp

---
 paddle/gserver/layers/DetectionUtil.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/paddle/gserver/layers/DetectionUtil.cpp b/paddle/gserver/layers/DetectionUtil.cpp
index 92c6193035..7c18295385 100644
--- a/paddle/gserver/layers/DetectionUtil.cpp
+++ b/paddle/gserver/layers/DetectionUtil.cpp
@@ -537,7 +537,7 @@ void getDetectionOutput(const real* confData,
   Matrix::resizeOrCreate(outBuffer, numKept, 7, false, false);
   real* bufferData = outBuffer->getData();
   for (size_t i = 0; i < 7; i++)
-      bufferData[i] = -1;
+    bufferData[i] = -1;
   size_t count = 0;
   for (size_t n = 0; n < batchSize; ++n) {
     for (map<size_t, vector<size_t>>::const_iterator it = allIndices[n].begin();

From 4343be3309cd966026fcbd16ff60dd3474c7cf14 Mon Sep 17 00:00:00 2001
From: Yuan Gao
Date: Sun, 10 Sep 2017 10:16:24 +0800
Subject: [PATCH 04/18] Update DetectionUtil.cpp

---
 paddle/gserver/layers/DetectionUtil.cpp | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/paddle/gserver/layers/DetectionUtil.cpp b/paddle/gserver/layers/DetectionUtil.cpp
index 7c18295385..070bc0bce6 100644
--- a/paddle/gserver/layers/DetectionUtil.cpp
+++ b/paddle/gserver/layers/DetectionUtil.cpp
@@ -536,8 +536,7 @@ void getDetectionOutput(const real* confData,
   MatrixPtr outBuffer;
   Matrix::resizeOrCreate(outBuffer, numKept, 7, false, false);
   real* bufferData = outBuffer->getData();
-  for (size_t i = 0; i < 7; i++)
-    bufferData[i] = -1;
+  for (size_t i = 0; i < 7; i++) bufferData[i] = -1;
   size_t count = 0;
   for (size_t n = 0; n < batchSize; ++n) {
     for (map<size_t, vector<size_t>>::const_iterator it = allIndices[n].begin();
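
[Editor's note: the substantive fix in patches 02-04 is the type of confThreshold. Declared as size_t, a fractional threshold such as 0.01 was silently truncated to 0 on the way in, so the confidence filter kept every candidate box. A tiny illustration of the truncation; the values are invented for the example:]

    # Forcing a fractional threshold through an unsigned integer parameter
    # floors it to 0, which disables confidence filtering entirely.
    conf_threshold = 0.01
    as_size_t = int(conf_threshold)  # what the old size_t parameter received
    assert as_size_t == 0
    scores = [0.005, 0.02, 0.9]
    print([s for s in scores if s > as_size_t])       # all pass (wrong)
    print([s for s in scores if s > conf_threshold])  # [0.02, 0.9] (right)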
From 5bd3fbeef8c369b2d0cb968947853314db3ffd49 Mon Sep 17 00:00:00 2001
From: wanghaoshuang
Date: Mon, 11 Sep 2017 10:23:11 +0800
Subject: [PATCH 05/18] Fix switch layer attribute name

---
 python/paddle/trainer/config_parser.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/python/paddle/trainer/config_parser.py b/python/paddle/trainer/config_parser.py
index 11dc84ae20..14a1b38079 100644
--- a/python/paddle/trainer/config_parser.py
+++ b/python/paddle/trainer/config_parser.py
@@ -3675,8 +3675,8 @@ class SwitchOrderLayer(LayerBase):
     def __init__(self, name, inputs, reshape, **xargs):
         super(SwitchOrderLayer, self).__init__(
             name, 'switch_order', 0, inputs=inputs, **xargs)
-        self.config.reshape_conf.heightAxis.extend(reshape['height'])
-        self.config.reshape_conf.widthAxis.extend(reshape['width'])
+        self.config.reshape_conf.height_axis.extend(reshape['height'])
+        self.config.reshape_conf.width_axis.extend(reshape['width'])
 
 
 # Deprecated, use a new layer specific class instead

From a251956d7dddb2302b26b7342132d2bb0d33233b Mon Sep 17 00:00:00 2001
From: wanghaoshuang
Date: Mon, 11 Sep 2017 10:37:27 +0800
Subject: [PATCH 06/18] Add activation to output of switch layer

---
 python/paddle/trainer_config_helpers/layers.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/python/paddle/trainer_config_helpers/layers.py b/python/paddle/trainer_config_helpers/layers.py
index dc68c213da..c103edf237 100644
--- a/python/paddle/trainer_config_helpers/layers.py
+++ b/python/paddle/trainer_config_helpers/layers.py
@@ -6460,6 +6460,7 @@ def switch_order_layer(input,
     return LayerOutput(
         name=name,
         layer_type=LayerType.SWITCH_ORDER_LAYER,
+        activation=act,
         parents=input,
         size=l.config.size)
From 38e7ad799cb96ff2b7430fc0c8c7d9e57470131a Mon Sep 17 00:00:00 2001
From: gaoyuan
Date: Mon, 11 Sep 2017 11:19:36 +0800
Subject: [PATCH 07/18] Set detection output to NULL

---
 paddle/gserver/layers/DetectionOutputLayer.cpp | 9 +++++++--
 paddle/gserver/layers/DetectionUtil.cpp        | 1 -
 2 files changed, 7 insertions(+), 3 deletions(-)

diff --git a/paddle/gserver/layers/DetectionOutputLayer.cpp b/paddle/gserver/layers/DetectionOutputLayer.cpp
index a1036ea866..0cf0a92bf4 100644
--- a/paddle/gserver/layers/DetectionOutputLayer.cpp
+++ b/paddle/gserver/layers/DetectionOutputLayer.cpp
@@ -139,8 +139,13 @@ void DetectionOutputLayer::forward(PassType passType) {
                       allDecodedBBoxes,
                       &allIndices);
 
-  numKept = numKept > 0 ? numKept : 1;
-  resetOutput(numKept, 7);
+  if (numKept > 0) {
+    resetOutput(numKept, 7);
+  } else {
+    MatrixPtr outV = getOutputValue();
+    outV = NULL;
+    return;
+  }
   MatrixPtr outV = getOutputValue();
   getDetectionOutput(confBuffer_->getData(),
                      numKept,
diff --git a/paddle/gserver/layers/DetectionUtil.cpp b/paddle/gserver/layers/DetectionUtil.cpp
index 070bc0bce6..d83674f45a 100644
--- a/paddle/gserver/layers/DetectionUtil.cpp
+++ b/paddle/gserver/layers/DetectionUtil.cpp
@@ -536,7 +536,6 @@ void getDetectionOutput(const real* confData,
   MatrixPtr outBuffer;
   Matrix::resizeOrCreate(outBuffer, numKept, 7, false, false);
   real* bufferData = outBuffer->getData();
-  for (size_t i = 0; i < 7; i++) bufferData[i] = -1;
   size_t count = 0;
   for (size_t n = 0; n < batchSize; ++n) {
     for (map<size_t, vector<size_t>>::const_iterator it = allIndices[n].begin();

From 1eb5e56f3844f5b4e2bac1be68e1f316e5eeddda Mon Sep 17 00:00:00 2001
From: gaoyuan
Date: Mon, 11 Sep 2017 11:56:22 +0800
Subject: [PATCH 08/18] Add comment to the PythonAPI

---
 python/paddle/trainer_config_helpers/layers.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/python/paddle/trainer_config_helpers/layers.py b/python/paddle/trainer_config_helpers/layers.py
index dc68c213da..fb1af2d8e8 100644
--- a/python/paddle/trainer_config_helpers/layers.py
+++ b/python/paddle/trainer_config_helpers/layers.py
@@ -1223,7 +1223,8 @@ def detection_output_layer(input_loc,
                            name=None):
     """
     Apply the NMS to the output of network and compute the predict bounding
-    box location.
+    box location. The output of this layer could be None if there is no valid
+    bounding box.
 
     :param name: The Layer Name.
     :type name: basestring

From 74de8c0c763f7f42e0fb0c8a23bb7b289af4c06a Mon Sep 17 00:00:00 2001
From: Luo Tao
Date: Mon, 11 Sep 2017 11:42:08 +0800
Subject: [PATCH 09/18] make docker can configure with_mkldnn and with_mklml

---
 paddle/scripts/docker/build.sh | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/paddle/scripts/docker/build.sh b/paddle/scripts/docker/build.sh
index e57f793ac4..2ac455d771 100644
--- a/paddle/scripts/docker/build.sh
+++ b/paddle/scripts/docker/build.sh
@@ -30,6 +30,8 @@ Configuring cmake in /paddle/build ...
       -DCMAKE_BUILD_TYPE=Release
       -DWITH_DOC=OFF
       -DWITH_GPU=${WITH_GPU:-OFF}
+      -DWITH_MKLDNN=${WITH_MKLDNN:-ON}
+      -DWITH_MKLML=${WITH_MKLML:-ON}
       -DWITH_AVX=${WITH_AVX:-OFF}
       -DWITH_GOLANG=${WITH_GOLANG:-ON}
       -DWITH_SWIG_PY=ON
@@ -50,6 +52,8 @@ cmake .. \
     -DCMAKE_BUILD_TYPE=Release \
     -DWITH_DOC=OFF \
     -DWITH_GPU=${WITH_GPU:-OFF} \
+    -DWITH_MKLDNN=${WITH_MKLDNN:-ON} \
+    -DWITH_MKLML=${WITH_MKLML:-ON} \
     -DWITH_AVX=${WITH_AVX:-OFF} \
     -DWITH_GOLANG=${WITH_GOLANG:-ON} \
    -DWITH_SWIG_PY=${WITH_SWIG_PY:-ON} \
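
[Editor's note: the ${WITH_MKLDNN:-ON} and ${WITH_MKLML:-ON} expansions added in patch 09 mean both options default to ON unless the caller exports the variables before invoking the build script. For readers less familiar with shell parameter expansion, the equivalent default logic expressed in Python:]

    import os

    # ${VAR:-ON}: use the environment value if set and non-empty, else "ON".
    with_mkldnn = os.environ.get("WITH_MKLDNN") or "ON"
    with_mklml = os.environ.get("WITH_MKLML") or "ON"
    print("-DWITH_MKLDNN=%s -DWITH_MKLML=%s" % (with_mkldnn, with_mklml))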
From dad5421afebbe025898ffe45eceb4e2669827ae8 Mon Sep 17 00:00:00 2001
From: Yu Yang
Date: Sun, 10 Sep 2017 21:25:40 -0700
Subject: [PATCH 10/18] Remove enforce demangle

It is buggy on some Linux systems because the unique_ptr frees the char*
while the std::string is still trying to use it.

Moreover, there is no need for Paddle to demangle symbols in its error
logs. Just use `c++filt` or other shell utilities to do this.
---
 paddle/platform/enforce.h | 19 +------------------
 1 file changed, 1 insertion(+), 18 deletions(-)

diff --git a/paddle/platform/enforce.h b/paddle/platform/enforce.h
index 81448897e9..64fcbd93b6 100644
--- a/paddle/platform/enforce.h
+++ b/paddle/platform/enforce.h
@@ -25,10 +25,6 @@ limitations under the License. */
 #include "paddle/string/printf.h"
 #include "paddle/string/to_string.h"
 
-#ifdef __GNUC__
-#include <cxxabi.h>  // for __cxa_demangle
-#endif
-
 #ifndef PADDLE_ONLY_CPU
 
 #include "paddle/platform/dynload/cublas.h"
@@ -46,19 +42,6 @@ limitations under the License. */
 namespace paddle {
 namespace platform {
 
-namespace {
-#ifdef __GNUC__
-inline std::string demangle(std::string name) {
-  int status = -4;  // some arbitrary value to eliminate the compiler warning
-  std::unique_ptr<char, void (*)(void*)> res{
-      abi::__cxa_demangle(name.c_str(), NULL, NULL, &status), std::free};
-  return (status == 0) ? res.get() : name;
-}
-#else
-inline std::string demangle(std::string name) { return name; }
-#endif
-}
-
 struct EnforceNotMet : public std::exception {
   std::exception_ptr exp_;
   std::string err_str_;
@@ -79,7 +62,7 @@ struct EnforceNotMet : public std::exception {
     Dl_info info;
     for (int i = 0; i < size; ++i) {
       if (dladdr(call_stack[i], &info)) {
-        auto demangled = demangle(info.dli_sname);
+        auto demangled = info.dli_sname;
         auto addr_offset = static_cast<char*>(call_stack[i]) -
                            static_cast<char*>(info.dli_saddr);
         sout << string::Sprintf("%-3d %*0p %s + %zd\n", i,
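
[Editor's note: after patch 10, Paddle stack traces print raw mangled symbol names, and the commit message delegates demangling to shell tools. A small post-processing sketch; the mangled frame below is a made-up example, not copied from a real log, and it assumes the binutils `c++filt` tool is on PATH:]

    import subprocess

    # Hypothetical mangled frame as it might appear in an error log.
    frame = "3  0x7f1bc0  _ZN6paddle8platform13EnforceNotMetD0Ev + 42"
    # c++filt demangles any mangled identifiers it finds on stdin.
    out = subprocess.run(["c++filt"], input=frame, capture_output=True,
                         text=True, check=True).stdout
    print(out.strip())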
From d34516fb662c2fe9727989dd2885de0b9ad9cf8f Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=E6=AD=A6=E6=AF=85?=
Date: Mon, 11 Sep 2017 13:00:44 +0800
Subject: [PATCH 11/18] Get output when training (#3978)

* get output when training

* follow comments
---
 python/paddle/v2/event.py   | 10 ++++++++--
 python/paddle/v2/trainer.py |  9 +++++++--
 2 files changed, 15 insertions(+), 4 deletions(-)

diff --git a/python/paddle/v2/event.py b/python/paddle/v2/event.py
index 7589cc9917..e66bf67d79 100644
--- a/python/paddle/v2/event.py
+++ b/python/paddle/v2/event.py
@@ -53,10 +53,13 @@ class BeginPass(object):
 class EndPass(WithMetric):
     """
     Event On One Pass Training Complete.
+    To get the output of a specific layer, add "event.gm.getLayerOutputs('predict_layer')"
+    in your event_handler callback
     """
 
-    def __init__(self, pass_id, evaluator):
+    def __init__(self, pass_id, evaluator, gm):
         self.pass_id = pass_id
+        self.gm = gm
         WithMetric.__init__(self, evaluator)
 
 
@@ -73,10 +76,13 @@ class BeginIteration(object):
 class EndIteration(WithMetric):
     """
     Event On One Batch Training Complete.
+    To get the output of a specific layer, add "event.gm.getLayerOutputs('predict_layer')"
+    in your event_handler callback
     """
 
-    def __init__(self, pass_id, batch_id, cost, evaluator):
+    def __init__(self, pass_id, batch_id, cost, evaluator, gm):
         self.pass_id = pass_id
         self.batch_id = batch_id
         self.cost = cost
+        self.gm = gm
         WithMetric.__init__(self, evaluator)

diff --git a/python/paddle/v2/trainer.py b/python/paddle/v2/trainer.py
index 0654a30104..ca95ef13bd 100644
--- a/python/paddle/v2/trainer.py
+++ b/python/paddle/v2/trainer.py
@@ -174,13 +174,18 @@ class SGD(object):
                             pass_id=pass_id,
                             batch_id=batch_id,
                             cost=cost,
-                            evaluator=batch_evaluator))
+                            evaluator=batch_evaluator,
+                            gm=self.__gradient_machine__))
                     self.__parameter_updater__.finishBatch(cost)
                     batch_evaluator.finish()
 
             self.__parameter_updater__.finishPass()
             pass_evaluator.finish()
-            event_handler(v2_event.EndPass(pass_id, evaluator=pass_evaluator))
+            event_handler(
+                v2_event.EndPass(
+                    pass_id,
+                    evaluator=pass_evaluator,
+                    gm=self.__gradient_machine__))
         self.__gradient_machine__.finish()
 
     def test(self, reader, feeding=None):
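
[Editor's note: a minimal sketch of the event-handler usage the new docstrings describe; the layer name 'predict_layer' and the surrounding network/trainer setup are assumed for illustration:]

    import paddle.v2 as paddle

    def event_handler(event):
        if isinstance(event, paddle.event.EndIteration):
            if event.batch_id % 100 == 0:
                # event.gm is the gradient machine this patch passes through.
                outs = event.gm.getLayerOutputs('predict_layer')
                print("pass %d, batch %d, cost %f" %
                      (event.pass_id, event.batch_id, event.cost))
                print(outs)

    # trainer.train(reader=..., num_passes=..., event_handler=event_handler)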
From d74fe780402747baf6bd5564b8584bf06e9fb099 Mon Sep 17 00:00:00 2001
From: tensor-tang
Date: Mon, 11 Sep 2017 13:56:07 +0800
Subject: [PATCH 12/18] refine MKLDNNMatrix, solid data handle, rename
 updateData to setData

---
 paddle/gserver/layers/MKLDNNLayer.h |  4 ++--
 paddle/math/MKLDNNMatrix.cpp        | 10 ++++-----
 paddle/math/MKLDNNMatrix.h          | 35 +++++++++++++++++++++++------
 3 files changed, 34 insertions(+), 15 deletions(-)

diff --git a/paddle/gserver/layers/MKLDNNLayer.h b/paddle/gserver/layers/MKLDNNLayer.h
index b983b833d5..ed1ad7c0bd 100644
--- a/paddle/gserver/layers/MKLDNNLayer.h
+++ b/paddle/gserver/layers/MKLDNNLayer.h
@@ -203,7 +203,7 @@ protected:
     real* iData = getInputValue(0, CPU_DEVICE)->getData();
     // update input data
     // since it might be changed if this is after data layer
-    inVal_->updateData(iData);
+    inVal_->setData(iData);
   }
 
   /**
@@ -216,7 +216,7 @@ protected:
 
     // update diff
     real* oDiff = getOutput(CPU_DEVICE).grad->getData();
-    outGrad_->updateData(oDiff);
+    outGrad_->setData(oDiff);
   }
 
   /**
diff --git a/paddle/math/MKLDNNMatrix.cpp b/paddle/math/MKLDNNMatrix.cpp
index 0a355e2644..c4063e5069 100644
--- a/paddle/math/MKLDNNMatrix.cpp
+++ b/paddle/math/MKLDNNMatrix.cpp
@@ -33,14 +33,12 @@ MKLDNNMatrixPtr MKLDNNMatrix::create(MatrixPtr m, memory::primitive_desc pd) {
     size_t width = cnts / dims[0];
     m = Matrix::create(height, width, false, false);
   }
-  CHECK(m) << " Matrix should not be empty";
+
   CpuMatrixPtr cpuMatrix = std::dynamic_pointer_cast<CpuMatrix>(m);
   CHECK(cpuMatrix) << "Only support create from CPU matrix yet";
-
-  CHECK_EQ(cnts, m->getElementCnt()) << "Count size does not match";
-  return std::make_shared<MKLDNNMatrix>(
-      m->getData(), m->getHeight(), m->getWidth(), pd);
+  CHECK_EQ(cpuMatrix->getElementCnt(), cnts) << "Count size does not match";
+  return std::make_shared<MKLDNNMatrix>(cpuMatrix, pd);
 }
 
 MKLDNNMatrixPtr MKLDNNMatrix::create(MatrixPtr m,
@@ -138,7 +136,7 @@ void MKLDNNMatrix::downSpatial() {
       mkldnn_primitive_create(&result, pd.get(), nullptr, nullptr),
       "could not create a memory primitive");
   reset(result);
-  set_data_handle(getData());
+  set_data_handle(data_);
 }
 
 }  // namespace paddle
diff --git a/paddle/math/MKLDNNMatrix.h b/paddle/math/MKLDNNMatrix.h
index e50f698b49..eef3b429e6 100644
--- a/paddle/math/MKLDNNMatrix.h
+++ b/paddle/math/MKLDNNMatrix.h
@@ -30,11 +30,10 @@ typedef std::shared_ptr<MKLDNNMatrix> MKLDNNMatrixPtr;
  */
 class MKLDNNMatrix : public CpuMatrix, public mkldnn::memory {
 public:
-  MKLDNNMatrix(real* data,
-               size_t height,
-               size_t width,
-               mkldnn::memory::primitive_desc pd)
-      : CpuMatrix(data, height, width, false), mkldnn::memory(pd, data) {}
+  MKLDNNMatrix(CpuMatrixPtr m, mkldnn::memory::primitive_desc pd)
+      : CpuMatrix(m->getData(), m->getHeight(), m->getWidth(), false),
+        mkldnn::memory(pd, m->getData()),
+        m_(m) {}
 
   ~MKLDNNMatrix() {}
 
@@ -81,11 +80,29 @@ public:
   void downSpatial();
 
   /**
-   * Update the memory data handle.
+   * Set the memory data handle.
    * Caution: This will not check the buffer size of the data,
    * it should be coverd by user.
    */
-  void updateData(void* data) { set_data_handle(data); }
+  void setData(real* data) {
+    set_data_handle(data);
+    CpuMatrix::setData(data);
+    m_.reset();
+  }
+
+  /**
+   * override Matrix::getData
+   * check data before return
+   */
+  real* getData() override {
+    CHECK_EQ((void*)data_, get_data_handle());
+    return data_;
+  }
+
+  const real* getData() const override {
+    CHECK_EQ((void*)data_, get_data_handle());
+    return data_;
+  }
 
   /**
    * Get primitive descriptor.
@@ -143,6 +160,10 @@ protected:
                       memory::format srcFmt,
                       memory::format dstFmt,
                       memory::dims dm);
+
+private:
+  // save the CpuMatrixPtr in case the buffer is released outside
+  CpuMatrixPtr m_;
};

 }  // namespace paddle
From d4c0734840420314298e4a330ddd4f10d957e8e7 Mon Sep 17 00:00:00 2001
From: tensor-tang
Date: Mon, 11 Sep 2017 14:08:13 +0800
Subject: [PATCH 13/18] remove convertOutputToOtherDevice

---
 paddle/gserver/layers/MKLDNNFcLayer.cpp | 23 ++++------------------
 paddle/gserver/layers/MKLDNNFcLayer.h   |  2 --
 paddle/gserver/layers/MKLDNNLayer.h     | 13 +++++++------
 3 files changed, 11 insertions(+), 27 deletions(-)

diff --git a/paddle/gserver/layers/MKLDNNFcLayer.cpp b/paddle/gserver/layers/MKLDNNFcLayer.cpp
index 8318c8c519..f4deb351f2 100644
--- a/paddle/gserver/layers/MKLDNNFcLayer.cpp
+++ b/paddle/gserver/layers/MKLDNNFcLayer.cpp
@@ -77,24 +77,6 @@ void MKLDNNFcLayer::convertWeightsToPaddle() {
   wgtVal_->reorderDataTo(wgtVal_, dstFmt, targetDim);
 }
 
-void MKLDNNFcLayer::convertOutputToOtherDevice() {
-  copyOutputInfoToOtherDevice();
-  // find other cpu device and reorder output to cpu device
-  int cnt = 0;
-  for (size_t i = 0; i < outputOtherDevice_.size(); i++) {
-    if (outputOtherDevice_[i].deviceId == CPU_DEVICE) {
-      // fc cpu output value do not need convert
-      // just share point
-      outputOtherDevice_[i].value = output_.value;
-      ++cnt;
-    }
-  }
-
-  if (cnt > 1) {
-    LOG(WARNING) << "should not have more than one CPU devie";
-  }
-}
-
 void MKLDNNFcLayer::reshape() {
   const Argument& input = getInput(0, getPrev(0)->getDeviceId());
   int batchSize = input.getBatchSize();
@@ -155,7 +137,10 @@ void MKLDNNFcLayer::resetFwd() {
   // change original output value to mkldnn output value
   output_.value = std::dynamic_pointer_cast<Matrix>(outVal_);
   if (!outputIsOnlyMKLDNN()) {
-    convertOutputToOtherDevice();
+    copyOutputInfoToOtherDevice();
+    // fc cpu output value do not need create convert
+    // just share point
+    getOutput(CPU_DEVICE).value->setData(output_.value->getData());
   }
 
   // create forward handle
diff --git a/paddle/gserver/layers/MKLDNNFcLayer.h b/paddle/gserver/layers/MKLDNNFcLayer.h
index e138a6faf1..e2657a8d5e 100644
--- a/paddle/gserver/layers/MKLDNNFcLayer.h
+++ b/paddle/gserver/layers/MKLDNNFcLayer.h
@@ -72,8 +72,6 @@ protected:
    * only would be called when needed
    */
   void resetBwd();
-
-  void convertOutputToOtherDevice() override;
 };
 
 }  // namespace paddle
diff --git a/paddle/gserver/layers/MKLDNNLayer.h b/paddle/gserver/layers/MKLDNNLayer.h
index ed1ad7c0bd..1a3e949fb9 100644
--- a/paddle/gserver/layers/MKLDNNLayer.h
+++ b/paddle/gserver/layers/MKLDNNLayer.h
@@ -113,12 +113,6 @@ public:
    */
   virtual void convertWeightsToPaddle() {}
 
-  /**
-   * convert MKLDNN output to other device.
-   * only support CPU device yet
-   */
-  virtual void convertOutputToOtherDevice() {}
-
   /**
    * print info about sizes
    */
@@ -155,6 +149,7 @@ protected:
    * copy base info and do not copy data value
    */
  void copyOutputInfoToOtherDevice() {
+    int cnt = 0;
     for (size_t i = 0; i < outputOtherDevice_.size(); i++) {
       outputOtherDevice_[i].setFrameHeight(output_.getFrameHeight());
       outputOtherDevice_[i].setFrameWidth(output_.getFrameWidth());
@@ -163,6 +158,12 @@ protected:
       outputOtherDevice_[i].subSequenceStartPositions =
           output_.subSequenceStartPositions;
       outputOtherDevice_[i].cpuSequenceDims = output_.cpuSequenceDims;
+      if (outputOtherDevice_[i].deviceId == CPU_DEVICE) {
+        ++cnt;
+      }
+    }
+    if (cnt > 1) {
+      LOG(WARNING) << "should not have more than one CPU device";
     }
   }
From f40d5f580de3731e071bb9cca3c98a6537955e25 Mon Sep 17 00:00:00 2001
From: tensor-tang
Date: Mon, 11 Sep 2017 14:19:10 +0800
Subject: [PATCH 14/18] remove syncOutputGrad, rename syncInputValue to
 updateInputData

---
 paddle/gserver/layers/MKLDNNFcLayer.cpp | 18 +++++++++-----
 paddle/gserver/layers/MKLDNNFcLayer.h   |  2 ++
 paddle/gserver/layers/MKLDNNLayer.h     | 32 +++++--------------------
 3 files changed, 20 insertions(+), 32 deletions(-)

diff --git a/paddle/gserver/layers/MKLDNNFcLayer.cpp b/paddle/gserver/layers/MKLDNNFcLayer.cpp
index f4deb351f2..53433cef35 100644
--- a/paddle/gserver/layers/MKLDNNFcLayer.cpp
+++ b/paddle/gserver/layers/MKLDNNFcLayer.cpp
@@ -220,13 +220,12 @@ void MKLDNNFcLayer::resetBwd() {
   pipelineBwd_.push_back(*bwdWgt_);
 
   /// backward data
-  device = inputIsOnlyMKLDNN() ? MKLDNN_DEVICE : CPU_DEVICE;
-  const MatrixPtr& in = getInputGrad(0, device);
+  const MatrixPtr& in = inputLayers_[0]->getOutput().grad;
   if (in == nullptr) {
     return;
   }
-  if (getInput(0, device).getAllCount() > 1) {
-    // TODO(TJ): use outputMaps_ ways when merge outgrad done
+  if (getInput(0, MKLDNN_DEVICE).getAllCount() > 1) {
+    // TODO(TJ): use outputMaps_ ways to get the inGrad_ when merge outgrad done
   } else {
     inGrad_ = MKLDNNMatrix::create(in, inVal_->getPrimitiveDesc());
   }
@@ -243,13 +242,21 @@ void MKLDNNFcLayer::resetBwd() {
   pipelineBwd_.push_back(*bwdData_);
 }
 
+void MKLDNNFcLayer::updateInputData() {
+  if (inputLayers_[0]->getType() != "data") {
+    return;
+  }
+  real* iData = getInputValue(0, CPU_DEVICE)->getData();
+  inVal_->setData(iData);
+}
+
 void MKLDNNFcLayer::forward(PassType passType) {
   Layer::forward(passType);
   reshape();
 
   {
     REGISTER_TIMER_INFO("mkldnn_FwdTimer", getName().c_str());
-    syncInputValue();
+    updateInputData();
 
     // just submit forward pipeline
     stream_->submit(pipelineFwd_);
@@ -271,7 +278,6 @@ void MKLDNNFcLayer::backward(const UpdateCallback& callback) {
     REGISTER_TIMER_INFO("mkldnn_bwdTimer", getName().c_str());
     resetBwd();
 
-    syncOutputGrad();
     // just sumbmit backward pipeline
     stream_->submit(pipelineBwd_);
   }
diff --git a/paddle/gserver/layers/MKLDNNFcLayer.h b/paddle/gserver/layers/MKLDNNFcLayer.h
index e2657a8d5e..4ad67a16e0 100644
--- a/paddle/gserver/layers/MKLDNNFcLayer.h
+++ b/paddle/gserver/layers/MKLDNNFcLayer.h
@@ -53,6 +53,8 @@ public:
 
   void backward(const UpdateCallback& callback) override;
 
+  void updateInputData() override;
+
 protected:
   /**
    * reshape the input image sizes
diff --git a/paddle/gserver/layers/MKLDNNLayer.h b/paddle/gserver/layers/MKLDNNLayer.h
index 1a3e949fb9..543364edce 100644
--- a/paddle/gserver/layers/MKLDNNLayer.h
+++ b/paddle/gserver/layers/MKLDNNLayer.h
@@ -113,6 +113,12 @@ public:
   */
   virtual void convertWeightsToPaddle() {}
 
+  /**
+   * Update input value data when input layer is "data" type.
+   * Since the input value data address might be changed.
+   */
+  virtual void updateInputData() {}
+
   /**
    * print info about sizes
    */
@@ -194,32 +200,6 @@ protected:
     return outputOtherDevice_.size() == 0;
   }
 
-  /**
-   * Sync input value data
-   */
-  void syncInputValue() {
-    if (inputIsOnlyMKLDNN()) {
-      return;
-    }
-    real* iData = getInputValue(0, CPU_DEVICE)->getData();
-    // update input data
-    // since it might be changed if this is after data layer
-    inVal_->setData(iData);
-  }
-
-  /**
-   * Sync output grad data
-   */
-  void syncOutputGrad() {
-    if (outputIsOnlyMKLDNN()) {
-      return;
-    }
-
-    // update diff
-    real* oDiff = getOutput(CPU_DEVICE).grad->getData();
-    outGrad_->setData(oDiff);
-  }
-
   /**
    * Set deviceId of this layer.
   */
From 7bd517129ae50850979c7a7c6cc8fce22e2131a8 Mon Sep 17 00:00:00 2001
From: Yancey
Date: Mon, 11 Sep 2017 15:21:14 +0800
Subject: [PATCH 15/18] Add Concat operator with CPU kernel (#3775)

add concat op with CPU kernel
---
 paddle/operators/concat_op.cc                 | 79 +++++++++++++++++++
 paddle/operators/concat_op.cu                 | 19 +++++
 paddle/operators/concat_op.h                  | 64 +++++++++++++++
 paddle/pybind/pybind.cc                       |  1 +
 python/paddle/v2/framework/op.py              |  1 -
 .../paddle/v2/framework/tests/CMakeLists.txt  |  1 +
 .../v2/framework/tests/gradient_checker.py    |  5 +-
 .../paddle/v2/framework/tests/op_test_util.py | 32 +++++---
 .../v2/framework/tests/test_concat_op.py      | 22 ++++++
 9 files changed, 211 insertions(+), 13 deletions(-)
 create mode 100644 paddle/operators/concat_op.cc
 create mode 100644 paddle/operators/concat_op.cu
 create mode 100644 paddle/operators/concat_op.h
 create mode 100644 python/paddle/v2/framework/tests/test_concat_op.py

diff --git a/paddle/operators/concat_op.cc b/paddle/operators/concat_op.cc
new file mode 100644
index 0000000000..0ebefbab26
--- /dev/null
+++ b/paddle/operators/concat_op.cc
@@ -0,0 +1,79 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/operators/concat_op.h"
+#include <vector>
+
+namespace paddle {
+namespace operators {
+using framework::Tensor;
+
+class ConcatOp : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+
+ protected:
+  void InferShape(const framework::InferShapeContext &ctx) const override {
+    auto ins = ctx.MultiInput<Tensor>("X");
+    auto *out = ctx.Output<Tensor>("Out");
+    size_t axis = static_cast<size_t>(ctx.Attr<int>("axis"));
+    size_t n = ins.size();
+
+    PADDLE_ENFORCE_GT(n, 1, "Input tensors count should be > 1.");
+
+    auto out_dims = ins[0]->dims();
+    size_t in_zero_dims_size = out_dims.size();
+    for (size_t i = 1; i < n; i++) {
+      for (size_t j = 0; j < in_zero_dims_size; j++) {
+        if (j == axis) {
+          out_dims[axis] += ins[i]->dims()[j];
+          continue;
+        }
+        PADDLE_ENFORCE_EQ(out_dims[j], ins[i]->dims()[j],
+                          "Input tensors should have the same "
+                          "elements except the specified axis.")
+      }
+    }
+    out->Resize(out_dims);
+  }
+};
+
+class ConcatOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  ConcatOpMaker(framework::OpProto *proto, framework::OpAttrChecker *op_checker)
+      : OpProtoAndCheckerMaker(proto, op_checker) {
+    AddInput("X", "the input tensors of concat operator.").AsDuplicable();
+    AddOutput("Out", "the output tensor of concat operator.");
+    AddComment(R"DOC(
+            Join the input tensors along with the axis.
+            Examples:
+              Input[0] = [[1,2],[3,4]]
+              Input[1] = [[5,6]]
+              axis = 0
+              Output = [[1,2],
+                        [3,4],
+                        [5,6]]
+        )DOC");
+    AddAttr<int>("axis", "The axis which the inputs will be joined with.")
+        .SetDefault(0);
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+REGISTER_OP_WITHOUT_GRADIENT(concat, ops::ConcatOp, ops::ConcatOpMaker)
+REGISTER_OP_CPU_KERNEL(concat,
+                       ops::ConcatKernel<paddle::platform::CPUPlace, float>)
diff --git a/paddle/operators/concat_op.cu b/paddle/operators/concat_op.cu
new file mode 100644
index 0000000000..38fee7473d
--- /dev/null
+++ b/paddle/operators/concat_op.cu
@@ -0,0 +1,19 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#define EIGEN_USE_GPU
+#include "paddle/operators/concat_op.h"
+
+namespace ops = paddle::operators;
+// TODO(Yancey1989) Add GPU kernel
diff --git a/paddle/operators/concat_op.h b/paddle/operators/concat_op.h
new file mode 100644
index 0000000000..f977054fdf
--- /dev/null
+++ b/paddle/operators/concat_op.h
@@ -0,0 +1,64 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include <vector>
+
+#include "paddle/framework/op_registry.h"
+
+namespace paddle {
+namespace operators {
+
+template <typename Place, typename T>
+class ConcatKernel : public framework::OpKernel {
+ public:
+  void Compute(const framework::ExecutionContext& ctx) const override {
+    auto ins = ctx.MultiInput<framework::Tensor>("X");
+    auto* out = ctx.Output<framework::Tensor>("Out");
+    int64_t axis = static_cast<int64_t>(ctx.Attr<int>("axis"));
+    size_t n = ins.size();
+    size_t output_axis_dim = 0;
+    size_t before = 1, after = 1;
+    for (size_t i = 0; i < n; i++) {
+      output_axis_dim += ins[i]->dims()[axis];
+    }
+    auto& input_zero = ins[0];
+    for (int64_t i = 0; i < input_zero->dims().size(); i++) {
+      if (i == axis) {
+        continue;
+      }
+      if (i < axis) {
+        before *= input_zero->dims()[i];
+      } else {
+        after *= input_zero->dims()[i];
+      }
+    }
+    size_t output_offset = 0;
+    for (size_t i = 0; i < n; i++) {
+      auto& in = ins[i];
+      auto axis_dim = in->dims()[axis];
+      for (size_t j = 0; j < before; j++) {
+        size_t len = axis_dim * after * sizeof(T);
+        const T* src = in->data<T>() + axis_dim * after * j;
+        T* out_data = out->mutable_data<T>(platform::CPUPlace());
+        T* dest = out_data + output_offset + output_axis_dim * after * j;
+        memcpy(dest, src, len);
+      }
+      output_offset += axis_dim * after;
+    }
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/pybind/pybind.cc b/paddle/pybind/pybind.cc
index db701a2a30..227b75aff8 100644
--- a/paddle/pybind/pybind.cc
+++ b/paddle/pybind/pybind.cc
@@ -49,6 +49,7 @@ USE_OP(minus);
 USE_OP(cos_sim);
 USE_CPU_ONLY_OP(gather);
 USE_CPU_ONLY_OP(scatter);
+USE_CPU_ONLY_OP(concat);
 USE_OP(top_k);
 USE_OP(squared_l2_distance);
 USE_OP(sum);
diff --git a/python/paddle/v2/framework/op.py b/python/paddle/v2/framework/op.py
index 4e91924a50..9e665adad2 100644
--- a/python/paddle/v2/framework/op.py
+++ b/python/paddle/v2/framework/op.py
@@ -43,7 +43,6 @@ class OpDescCreationMethod(object):
         if len(args) != 0:
             raise ValueError("Only keyword arguments are supported.")
         op_desc = framework_pb2.OpDesc()
-
         for input_parameter in self.__op_proto__.inputs:
             input_arguments = kwargs.get(input_parameter.name, [])
             if is_str(input_arguments):
diff --git a/python/paddle/v2/framework/tests/CMakeLists.txt b/python/paddle/v2/framework/tests/CMakeLists.txt
index 2117fdf0d5..2f6be105b6 100644
--- a/python/paddle/v2/framework/tests/CMakeLists.txt
+++ b/python/paddle/v2/framework/tests/CMakeLists.txt
@@ -35,4 +35,5 @@ py_test(test_lookup_table SRCS test_lookup_table.py)
 py_test(test_scale_and_identity_op SRCS test_scale_and_identity_op.py)
 py_test(test_sum_op SRCS test_sum_op.py)
 py_test(mnist SRCS mnist.py)
+py_test(test_concat_op SRCS test_concat_op.py)
 py_test(test_squared_l2_distance_op SRCS test_squared_l2_distance_op.py)
diff --git a/python/paddle/v2/framework/tests/gradient_checker.py b/python/paddle/v2/framework/tests/gradient_checker.py
index fdb06b7988..51a98284bd 100644
--- a/python/paddle/v2/framework/tests/gradient_checker.py
+++ b/python/paddle/v2/framework/tests/gradient_checker.py
@@ -11,11 +11,10 @@ __all__ = ['get_numeric_gradient']
 def create_op(op_type):
     # TODO need to set attrs
     kwargs = dict()
-    for in_name in Operator.get_op_input_names(op_type):
+    for in_name, _ in Operator.get_op_input_names(op_type):
         kwargs[in_name] = in_name
-    for out_name in Operator.get_op_output_names(op_type):
+    for out_name, _ in Operator.get_op_output_names(op_type):
         kwargs[out_name] = out_name
-
     return Operator(op_type, **kwargs)
diff --git a/python/paddle/v2/framework/tests/op_test_util.py b/python/paddle/v2/framework/tests/op_test_util.py
index 370f27eaf6..54fe5da440 100644
--- a/python/paddle/v2/framework/tests/op_test_util.py
+++ b/python/paddle/v2/framework/tests/op_test_util.py
@@ -27,17 +27,30 @@ class OpTestMeta(type):
                 places.append(core.GPUPlace(0))
 
             for place in places:
-                for in_name in Operator.get_op_input_names(self.type):
-                    if hasattr(self, "inputs") and in_name in self.inputs:
-                        kwargs[in_name] = in_name
-                        var = scope.new_var(in_name).get_tensor()
-                        arr = self.inputs[in_name]
-                        var.set_dims(arr.shape)
-                        var.set(arr, place)
+                for ins in Operator.get_op_input_names(self.type):
+                    in_name = ins[0]
+                    in_dup = ins[1]
+                    if hasattr(self, 'inputs') and in_name in self.inputs:
+                        kwargs[in_name] = []
+                        if in_dup:
+                            arrays = self.inputs[in_name]
+                            for index, arr in enumerate(arrays):
+                                var = scope.new_var(in_name + str(index))
+                                tensor = var.get_tensor()
+                                tensor.set_dims(arr.shape)
+                                tensor.set(arr, place)
+                                kwargs[in_name].append(in_name + str(index))
+                        else:
+                            kwargs[in_name] = in_name
+                            var = scope.new_var(in_name).get_tensor()
+                            arr = self.inputs[in_name]
+                            var.set_dims(arr.shape)
+                            var.set(arr, place)
                     else:
                         kwargs[in_name] = "@EMPTY@"
 
-                for out_name in Operator.get_op_output_names(self.type):
+                for out_name, out_dup in Operator.get_op_output_names(
+                        self.type):
                     if not hasattr(self, "outputs"):
                         raise ValueError(
                             "The test op must set self.outputs dict.")
@@ -60,7 +73,8 @@ class OpTestMeta(type):
                 ctx = core.DeviceContext.create(place)
                 op.run(scope, ctx)
 
-                for out_name in Operator.get_op_output_names(self.type):
+                for out_name, out_dup in Operator.get_op_output_names(
+                        self.type):
                    actual = numpy.array(scope.find_var(out_name).get_tensor())
                     expect = self.outputs[out_name]
                     self.assertTrue(
diff --git a/python/paddle/v2/framework/tests/test_concat_op.py b/python/paddle/v2/framework/tests/test_concat_op.py
new file mode 100644
index 0000000000..6bd4c30974
--- /dev/null
+++ b/python/paddle/v2/framework/tests/test_concat_op.py
@@ -0,0 +1,22 @@
+import unittest
+import numpy as np
+from gradient_checker import GradientChecker, create_op
+from op_test_util import OpTestMeta
+
+
+class TestConcatOp(unittest.TestCase):
+    __metaclass__ = OpTestMeta
+
+    def setUp(self):
+        self.type = "concat"
+        x0 = np.random.random((2, 3, 2, 5)).astype('float32')
+        x1 = np.random.random((2, 3, 3, 5)).astype('float32')
+        x2 = np.random.random((2, 3, 4, 5)).astype('float32')
+        axis = 2
+        self.inputs = {'X': [x0, x1, x2]}
+        self.attrs = {'axis': axis}
+        self.outputs = {'Out': np.concatenate((x0, x1, x2), axis=axis)}
+
+
+if __name__ == '__main__':
+    unittest.main()
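
[Editor's note: the CPU kernel above copies blockwise: `before` is the product of the dimensions left of the concat axis, `after` the product of those right of it, and each input contributes an `axis_dim * after` slice to every one of the `before` output blocks. A numpy sketch, with shapes borrowed from the new test, checking that this layout matches np.concatenate:]

    import numpy as np

    x0 = np.random.random((2, 3, 2, 5)).astype('float32')
    x1 = np.random.random((2, 3, 3, 5)).astype('float32')
    axis, ins = 2, [x0, x1]

    before = int(np.prod(x0.shape[:axis]))     # outer blocks
    after = int(np.prod(x0.shape[axis + 1:]))  # elements per axis step
    out_axis_dim = sum(x.shape[axis] for x in ins)
    out = np.empty(before * out_axis_dim * after, dtype='float32')

    offset = 0
    for x in ins:
        axis_dim = x.shape[axis]
        flat = x.reshape(before, axis_dim * after)
        for j in range(before):  # mirrors the kernel's memcpy loop
            dst = offset + out_axis_dim * after * j
            out[dst:dst + axis_dim * after] = flat[j]
        offset += axis_dim * after

    expected = np.concatenate(ins, axis=axis)
    assert np.allclose(out.reshape(expected.shape), expected)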
From 9d46f443fede2edba2e8041b2b30d9513852820b Mon Sep 17 00:00:00 2001
From: qijun
Date: Mon, 11 Sep 2017 16:43:17 +0800
Subject: [PATCH 16/18] fix attr bug in op_test and ensure order in duplicate
 inputs/outputs

---
 python/paddle/v2/framework/tests/op_test.py  | 34 ++++++++++++-------
 .../paddle/v2/framework/tests/test_sum_op.py |  2 +-
 2 files changed, 22 insertions(+), 14 deletions(-)

diff --git a/python/paddle/v2/framework/tests/op_test.py b/python/paddle/v2/framework/tests/op_test.py
index 3a6a5dca4c..1daa6fa277 100644
--- a/python/paddle/v2/framework/tests/op_test.py
+++ b/python/paddle/v2/framework/tests/op_test.py
@@ -9,7 +9,7 @@ def grad_var_name(var_name):
     return var_name + "@GRAD"
 
 
-def create_op(scope, op_type, inputs, outputs, attrs=None):
+def create_op(scope, op_type, inputs, outputs, attrs):
     kwargs = dict()
 
     for in_name, in_dup in Operator.get_op_inputs(op_type):
@@ -17,7 +17,7 @@ def create_op(scope, op_type, inputs, outputs, attrs):
             kwargs[in_name] = []
             if in_dup:
                 sub_in = inputs[in_name]
-                for sub_in_name in sub_in:
+                for sub_in_name, arr in sub_in:
                     var = scope.new_var(sub_in_name)
                     kwargs[in_name].append(sub_in_name)
             else:
@@ -29,15 +29,16 @@ def create_op(scope, op_type, inputs, outputs, attrs=None):
             kwargs[out_name] = []
             if out_dup:
                 sub_in = outputs[out_name]
-                for sun_in_name in sub_in:
-                    var = scope.new_var(sun_in_name)
-                    kwargs[out_name].append(sun_in_name)
+                for sub_in_name, arr in sub_in:
+                    var = scope.new_var(sub_in_name)
+                    kwargs[out_name].append(sub_in_name)
             else:
                 var = scope.new_var(out_name)
                 kwargs[out_name].append(out_name)
 
     for attr_name in Operator.get_op_attr_names(op_type):
-        kwargs[attr_name] = attrs[attr_name]
+        if attr_name in attrs:
+            kwargs[attr_name] = attrs[attr_name]
 
     return Operator(op_type, **kwargs)
 
@@ -46,10 +47,9 @@ def set_input(scope, op, inputs, place):
         if in_name in inputs:
             if in_dup:
                 sub_in = inputs[in_name]
-                for sub_in_name in sub_in:
+                for sub_in_name, arr in sub_in:
                     var = scope.find_var(sub_in_name)
                     tensor = var.get_tensor()
-                    arr = sub_in[sub_in_name]
                     tensor.set_dims(arr.shape)
                     tensor.set(arr, place)
             else:
@@ -65,7 +65,7 @@ def set_output_grad(scope, op, outputs, place):
         if out_name in outputs:
             if out_dup:
                 sub_out = outputs[out_name]
-                for sub_out_name in sub_out:
+                for sub_out_name, arr in sub_out:
                     out_tensor = scope.find_var(sub_out_name).get_tensor()
                     grad_tensor = scope.new_var(grad_var_name(
                         sub_out_name)).get_tensor()
@@ -110,7 +110,7 @@ def get_numeric_gradient(scope,
     # we use a for loop to compute the gradient of every element.
     for i in xrange(tensor_size):
         if in_place:
-            set_input(op, inputs, core.CPUPlace())
+            set_input(scope, op, inputs, core.CPUPlace())
 
         # get one input element throw it's index i.
         origin = tensor_to_check.get_float_element(i)
@@ -120,7 +120,7 @@ def get_numeric_gradient(scope,
         y_pos = get_output()
 
         if in_place:
-            set_input(op, inputs, core.CPUPlace())
+            set_input(scope, op, inputs, core.CPUPlace())
 
         x_neg = origin - delta
         tensor_to_check.set_float_element(i, x_neg)
@@ -168,7 +168,11 @@ def get_gradient(scope, op, inputs, outputs, grad_name, place,
 class OpTest(unittest.TestCase):
     def check_output_with_place(self, place):
         self.scope = core.Scope()
-        self.op = create_op(self.scope, self.op_type, self.inputs, self.outputs)
+        op_inputs = self.inputs if hasattr(self, "inputs") else dict()
+        op_outputs = self.outputs if hasattr(self, "outputs") else dict()
+        op_attrs = self.attrs if hasattr(self, "attrs") else dict()
+        self.op = create_op(self.scope, self.op_type, op_inputs, op_outputs,
+                            op_attrs)
         if isinstance(place, core.GPUPlace) and not self.op.support_gpu():
             return
         set_input(self.scope, self.op, self.inputs, place)
@@ -227,7 +231,11 @@ class OpTest(unittest.TestCase):
                    in_place=False,
                    max_relative_error=0.005):
         self.scope = core.Scope()
-        self.op = create_op(self.scope, self.op_type, self.inputs, self.outputs)
+        op_inputs = self.inputs if hasattr(self, "inputs") else dict()
+        op_outputs = self.outputs if hasattr(self, "outputs") else dict()
+        op_attrs = self.attrs if hasattr(self, "attrs") else dict()
+        self.op = create_op(self.scope, self.op_type, op_inputs, op_outputs,
+                            op_attrs)
         if no_grad_set is None:
             no_grad_set = set()
diff --git a/python/paddle/v2/framework/tests/test_sum_op.py b/python/paddle/v2/framework/tests/test_sum_op.py
index 66417d70e8..2ad1cc2610 100644
--- a/python/paddle/v2/framework/tests/test_sum_op.py
+++ b/python/paddle/v2/framework/tests/test_sum_op.py
@@ -9,7 +9,7 @@ class TestSumOp(OpTest):
         x0 = np.random.random((3, 4)).astype('float32')
         x1 = np.random.random((3, 4)).astype('float32')
         x2 = np.random.random((3, 4)).astype('float32')
-        self.inputs = {"X": {"x0": x0, "x1": x1, "x2": x2}}
+        self.inputs = {"X": [("x0", x0), ("x1", x1), ("x2", x2)]}
         y = x0 + x1 + x2
         self.outputs = {'Out': y}
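
[Editor's note: the point of switching duplicable inputs from a dict to a list of (name, array) pairs is ordering: on Python 2 a dict gives no iteration-order guarantee, so "x0"/"x1"/"x2" could be bound to the operator's input slots in the wrong order. A short illustration of the convention this patch establishes:]

    import numpy as np

    x0 = np.random.random((3, 4)).astype('float32')
    x1 = np.random.random((3, 4)).astype('float32')

    # Old style: dict iteration order was unspecified on Python 2.
    inputs_unordered = {"X": {"x0": x0, "x1": x1}}

    # New style: an explicit sequence fixes the binding order.
    inputs_ordered = {"X": [("x0", x0), ("x1", x1)]}
    for sub_in_name, arr in inputs_ordered["X"]:
        print(sub_in_name, arr.shape)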
From 76a70d10db35a812654d3d2b2351b1ce867d511d Mon Sep 17 00:00:00 2001
From: Yancey1989
Date: Mon, 11 Sep 2017 17:27:49 +0800
Subject: [PATCH 17/18] fix unit test error

---
 python/paddle/v2/framework/tests/gradient_checker.py |  4 ++--
 python/paddle/v2/framework/tests/op_test_util.py     | 10 +++-------
 2 files changed, 5 insertions(+), 9 deletions(-)

diff --git a/python/paddle/v2/framework/tests/gradient_checker.py b/python/paddle/v2/framework/tests/gradient_checker.py
index 51a98284bd..ed838b5979 100644
--- a/python/paddle/v2/framework/tests/gradient_checker.py
+++ b/python/paddle/v2/framework/tests/gradient_checker.py
@@ -11,9 +11,9 @@ __all__ = ['get_numeric_gradient']
 def create_op(op_type):
     # TODO need to set attrs
     kwargs = dict()
-    for in_name, _ in Operator.get_op_input_names(op_type):
+    for in_name in Operator.get_op_input_names(op_type):
         kwargs[in_name] = in_name
-    for out_name, _ in Operator.get_op_output_names(op_type):
+    for out_name in Operator.get_op_output_names(op_type):
         kwargs[out_name] = out_name
     return Operator(op_type, **kwargs)
 
diff --git a/python/paddle/v2/framework/tests/op_test_util.py b/python/paddle/v2/framework/tests/op_test_util.py
index 54fe5da440..99a114e45f 100644
--- a/python/paddle/v2/framework/tests/op_test_util.py
+++ b/python/paddle/v2/framework/tests/op_test_util.py
@@ -27,9 +27,7 @@ class OpTestMeta(type):
                 places.append(core.GPUPlace(0))
 
             for place in places:
-                for ins in Operator.get_op_input_names(self.type):
-                    in_name = ins[0]
-                    in_dup = ins[1]
+                for in_name, in_dup in Operator.get_op_inputs(self.type):
                     if hasattr(self, 'inputs') and in_name in self.inputs:
                         kwargs[in_name] = []
                         if in_dup:
@@ -49,8 +47,7 @@ class OpTestMeta(type):
                 else:
                     kwargs[in_name] = "@EMPTY@"
 
-                for out_name, out_dup in Operator.get_op_output_names(
-                        self.type):
+                for out_name, out_dup in Operator.get_op_outputs(self.type):
                     if not hasattr(self, "outputs"):
                         raise ValueError(
                             "The test op must set self.outputs dict.")
@@ -73,8 +70,7 @@ class OpTestMeta(type):
                 ctx = core.DeviceContext.create(place)
                 op.run(scope, ctx)
 
-                for out_name, out_dup in Operator.get_op_output_names(
-                        self.type):
+                for out_name, out_dup in Operator.get_op_outputs(self.type):
                     actual = numpy.array(scope.find_var(out_name).get_tensor())
                     expect = self.outputs[out_name]
                     self.assertTrue(
From 477b23c3f5c123b446cec48321105e1a471c1212 Mon Sep 17 00:00:00 2001
From: qijun
Date: Mon, 11 Sep 2017 18:37:19 +0800
Subject: [PATCH 18/18] follow comments

---
 python/paddle/v2/framework/tests/op_test.py | 18 ++++++++----------
 1 file changed, 8 insertions(+), 10 deletions(-)

diff --git a/python/paddle/v2/framework/tests/op_test.py b/python/paddle/v2/framework/tests/op_test.py
index 1daa6fa277..489358ba85 100644
--- a/python/paddle/v2/framework/tests/op_test.py
+++ b/python/paddle/v2/framework/tests/op_test.py
@@ -17,7 +17,7 @@ def create_op(scope, op_type, inputs, outputs, attrs):
             kwargs[in_name] = []
             if in_dup:
                 sub_in = inputs[in_name]
-                for sub_in_name, arr in sub_in:
+                for sub_in_name, _ in sub_in:
                     var = scope.new_var(sub_in_name)
                     kwargs[in_name].append(sub_in_name)
             else:
@@ -29,7 +29,7 @@ def create_op(scope, op_type, inputs, outputs, attrs):
             kwargs[out_name] = []
             if out_dup:
                 sub_in = outputs[out_name]
-                for sub_in_name, arr in sub_in:
+                for sub_in_name, _ in sub_in:
                     var = scope.new_var(sub_in_name)
                     kwargs[out_name].append(sub_in_name)
             else:
@@ -47,11 +47,11 @@ def set_input(scope, op, inputs, place):
         if in_name in inputs:
             if in_dup:
                 sub_in = inputs[in_name]
-                for sub_in_name, arr in sub_in:
+                for sub_in_name, sub_in_array in sub_in:
                     var = scope.find_var(sub_in_name)
                     tensor = var.get_tensor()
-                    tensor.set_dims(arr.shape)
-                    tensor.set(arr, place)
+                    tensor.set_dims(sub_in_array.shape)
+                    tensor.set(sub_in_array, place)
             else:
                 var = scope.find_var(in_name)
                 tensor = var.get_tensor()
@@ -65,7 +65,7 @@ def set_output_grad(scope, op, outputs, place):
         if out_name in outputs:
             if out_dup:
                 sub_out = outputs[out_name]
-                for sub_out_name, arr in sub_out:
+                for sub_out_name, sub_out_grad in sub_out:
                     out_tensor = scope.find_var(sub_out_name).get_tensor()
                     grad_tensor = scope.new_var(grad_var_name(
                         sub_out_name)).get_tensor()
@@ -169,9 +169,8 @@ class OpTest(unittest.TestCase):
     def check_output_with_place(self, place):
         self.scope = core.Scope()
         op_inputs = self.inputs if hasattr(self, "inputs") else dict()
-        op_outputs = self.outputs if hasattr(self, "outputs") else dict()
         op_attrs = self.attrs if hasattr(self, "attrs") else dict()
-        self.op = create_op(self.scope, self.op_type, op_inputs, op_outputs,
+        self.op = create_op(self.scope, self.op_type, op_inputs, self.outputs,
                             op_attrs)
         if isinstance(place, core.GPUPlace) and not self.op.support_gpu():
             return
@@ -231,9 +230,8 @@ class OpTest(unittest.TestCase):
                    max_relative_error=0.005):
         self.scope = core.Scope()
         op_inputs = self.inputs if hasattr(self, "inputs") else dict()
-        op_outputs = self.outputs if hasattr(self, "outputs") else dict()
        op_attrs = self.attrs if hasattr(self, "attrs") else dict()
-        self.op = create_op(self.scope, self.op_type, op_inputs, op_outputs,
+        self.op = create_op(self.scope, self.op_type, op_inputs, self.outputs,
                             op_attrs)
         if no_grad_set is None:
             no_grad_set = set()
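
[Editor's note: taken together, patches 16 and 18 mean a test only needs to define self.attrs when the operator actually declares attributes, while outputs are always required. A minimal sketch of a resulting OpTest case, reusing the concat operator from patch 15; the class and method names are invented for illustration:]

    import unittest
    import numpy as np
    import paddle.v2.framework.core as core
    from op_test import OpTest


    class TestConcatWithAttr(OpTest):
        def setUp(self):
            self.op_type = "concat"
            x0 = np.random.random((2, 4)).astype('float32')
            x1 = np.random.random((3, 4)).astype('float32')
            # Duplicable input as an ordered list of (name, array) pairs.
            self.inputs = {'X': [('x0', x0), ('x1', x1)]}
            self.attrs = {'axis': 0}  # omit entirely for attr-less operators
            self.outputs = {'Out': np.concatenate((x0, x1), axis=0)}

        def test_output(self):
            self.check_output_with_place(core.CPUPlace())


    if __name__ == '__main__':
        unittest.main()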