From 2dccdc3ccf01e6c660ac2276188297388bcb6780 Mon Sep 17 00:00:00 2001
From: tensor-tang <jian.j.tang@intel.com>
Date: Fri, 27 Oct 2017 10:22:27 +0800
Subject: [PATCH 1/4] update benchmark data on VGG19

---
 benchmark/IntelOptimizedPaddle.md | 48 +++++++++++++++++++++++++++++++
 1 file changed, 48 insertions(+)
 create mode 100644 benchmark/IntelOptimizedPaddle.md

diff --git a/benchmark/IntelOptimizedPaddle.md b/benchmark/IntelOptimizedPaddle.md
new file mode 100644
index 0000000000..f2744c075d
--- /dev/null
+++ b/benchmark/IntelOptimizedPaddle.md
@@ -0,0 +1,48 @@
+# Benchmark
+
+Machine:
+
+- Server
+ 	- Intel(R) Xeon(R) Gold 6148M CPU @ 2.40GHz, 2 Sockets, 20 Cores per socket
+- Laptop
+ 	- DELL XPS15-9560-R1745: i7-7700HQ 8G 256GSSD
+ 	- i5 MacBook Pro (Retina, 13-inch, Early 2015)
+- Desktop
+ 	- i7-6700k
+
+System: CentOS 7.3.1611
+
+PaddlePaddle: commit cfa86a3f70cb5f2517a802f32f2c88d48ab4e0e0
+
+- MKL-DNN tag v0.10
+- MKLML 2018.0.20170720
+- OpenBLAS v0.2.20
+	 
+On each machine, we will test and compare the performance of training on single node using MKL-DNN / MKLML / OpenBLAS respectively.
+
+## Benchmark Model
+
+### Server
+Test on batch size 64, 128, 256 on Intel(R) Xeon(R) Gold 6148M CPU @ 2.40GHz
+
+Input image size - 3 * 224 * 224, Time: images/second
+
+- VGG-19
+
+| BatchSize    | 64    | 128  | 256     |
+|--------------|-------| -----| --------|
+| OpenBLAS     | 7.86  | 9.02  | 10.62  | 
+| MKLML        | 11.80 | 13.43 | 16.21  |
+| MKL-DNN      | 29.07 | 30.40 | 31.06  |
+
+
+chart on batch size 128
+TBD
+
+ - ResNet
+ - GoogLeNet
+
+### Laptop
+TBD
+### Desktop
+TBD

From 56f6e231c6fb4cf2af5f11e7d7b0fe53deef4044 Mon Sep 17 00:00:00 2001
From: tensor-tang <jian.j.tang@intel.com>
Date: Mon, 30 Oct 2017 15:41:00 +0800
Subject: [PATCH 2/4] refine mkldnntester, support comparing values near zero

---
 paddle/gserver/tests/MKLDNNTester.cpp | 28 ++++++++++++++++-----------
 paddle/gserver/tests/MKLDNNTester.h   | 10 +++++-----
 paddle/gserver/tests/test_MKLDNN.cpp  |  3 +--
 3 files changed, 23 insertions(+), 18 deletions(-)

diff --git a/paddle/gserver/tests/MKLDNNTester.cpp b/paddle/gserver/tests/MKLDNNTester.cpp
index 73b7e8857f..c345a16221 100644
--- a/paddle/gserver/tests/MKLDNNTester.cpp
+++ b/paddle/gserver/tests/MKLDNNTester.cpp
@@ -273,31 +273,37 @@ void MKLDNNTester::printVector(const VectorPtr& v) {
   VLOG(MKLDNN_ALL) << std::endl << ostr.str();
 }
 
-double MKLDNNTester::getDelta(const real* d1,
-                              const real* d2,
+double MKLDNNTester::getDelta(const real* refer,
+                              const real* value,
                               size_t len,
                               const float failRate,
                               const float thres) {
   double delta = 0, sum = 0;
   int failCnt = 0;
   const double eps = 1e-5;
-  double maxOut = 0;
+  double maxRatio = 0;
   for (size_t i = 0; i < len; ++i) {
-    double ref = fabs(d2[i]);
-    double diff = fabs(d1[i] - d2[i]);
+    double ref = fabs(refer[i]);
+    double val = fabs(value[i]);
+    double diff = fabs(refer[i] - value[i]);
     delta += diff;
     sum += ref;
-    if (ref > eps && fabs(d1[i]) > eps && diff / ref > thres) {
-      maxOut = std::max(maxOut, diff / ref);
+    if (ref < eps && val < eps) {  // both values are very small
+      continue;
+    }
+    double ratio = diff / ref;
+    if (ratio > thres) {
+      maxRatio = std::max(maxRatio, ratio);
       failCnt++;
     }
   }
-  EXPECT_TRUE(std::isnormal(sum));
   EXPECT_FALSE(std::isinf(sum));
+  EXPECT_FALSE(std::isnan(sum));
   EXPECT_FALSE(std::isnan(delta));
   VLOG(MKLDNN_ALL) << "reference avg data: " << sum / len
                    << ", delta: " << delta / sum << ", failCnt:" << failCnt;
-  return (failCnt / (float)len) > failRate ? maxOut : delta / sum;
+  double res = sum > eps ? delta / sum : eps;
+  return (failCnt / (float)len) > failRate ? maxRatio : res;
 }
 
 double MKLDNNTester::compareMatrix(const MatrixPtr& m1, const MatrixPtr& m2) {
@@ -543,12 +549,12 @@ void MKLDNNTester::getOutResult(const std::string& configPath,
 void MKLDNNTester::compareResult(DataOut& ref, DataOut& dnn, float eps) {
   CHECK_EQ(ref.outValues.size(), dnn.outValues.size());
   CHECK_EQ(ref.paraValues.size(), dnn.paraValues.size());
-  VLOG(MKLDNN_TESTS) << "compare value size: " << ref.outValues.size();
   for (size_t i = 0; i < ref.outValues.size(); i++) {
+    VLOG(MKLDNN_TESTS) << "compare value index: " << i;
     EXPECT_LE(fabs(compareMatrix(ref.outValues[i], dnn.outValues[i])), eps);
   }
-  VLOG(MKLDNN_TESTS) << "compare param size: " << ref.outValues.size();
   for (size_t i = 0; i < ref.paraValues.size(); i++) {
+    VLOG(MKLDNN_TESTS) << "compare param index: " << i;
     EXPECT_LE(fabs(compareVector(ref.paraValues[i], dnn.paraValues[i])), eps);
   }
 }
diff --git a/paddle/gserver/tests/MKLDNNTester.h b/paddle/gserver/tests/MKLDNNTester.h
index 19d8848f74..a99715cff0 100644
--- a/paddle/gserver/tests/MKLDNNTester.h
+++ b/paddle/gserver/tests/MKLDNNTester.h
@@ -128,13 +128,13 @@ private:
 
   /**
    * Get delta percent
-   * if many(>failRate) wrong(abs(dnn-ref)/abs(ref)>thres) points return the
-   * max(diff/ref)
-   * else return sum(abs(a-b)) / sum(abs(b))
+   * If more than failRate of the points are wrong, i.e. have
+   * abs(val-ref)/abs(ref) > thres, return max(diff/ref);
+   * otherwise return sum(abs(diff)) / sum(abs(ref)).
    * The return value should be smaller than eps when passing.
    */
-  static double getDelta(const real* d1,
-                         const real* d2,
+  static double getDelta(const real* refer,
+                         const real* value,
                          size_t len,
                          const float failRate = 1e-3,
                          const float thres = 0.1);
diff --git a/paddle/gserver/tests/test_MKLDNN.cpp b/paddle/gserver/tests/test_MKLDNN.cpp
index 85d4f437c2..b99192ca0f 100644
--- a/paddle/gserver/tests/test_MKLDNN.cpp
+++ b/paddle/gserver/tests/test_MKLDNN.cpp
@@ -234,8 +234,7 @@ static void getMKLDNNBatchNormConfig(TestConfig& cfg,
   cfg.inputDefs.push_back({INPUT_DATA, "layer_2_moving_var", 1, size_t(pm.ic)});
   cfg.inputDefs.back().isStatic = true;
   LayerInputConfig* input = cfg.layerConfig.add_inputs();
-  // TODO(TJ): uncomment me when refine and support comparing all zeroes vector
-  // cfg.layerConfig.set_active_type("relu");
+  cfg.layerConfig.set_active_type("relu");
   cfg.layerConfig.add_inputs();
   cfg.layerConfig.add_inputs();
   ImageConfig* img_conf = input->mutable_image_conf();

From 3eb42bfd6f3affbe856d731046a5e4e63c6c42da Mon Sep 17 00:00:00 2001
From: tensor-tang <jian.j.tang@intel.com>
Date: Mon, 30 Oct 2017 21:32:05 +0800
Subject: [PATCH 3/4] move test_CompareMKLDNNandCPU to test_MKLDNN and remove
 unused code

---
 paddle/gserver/tests/MKLDNNTester.cpp         |  22 +--
 paddle/gserver/tests/MKLDNNTester.h           |  10 +-
 paddle/gserver/tests/mkldnn_branch_net.conf   | 142 ++++++++++++++++++
 paddle/gserver/tests/mkldnn_branches_fc.conf  |  58 -------
 .../gserver/tests/mkldnn_branches_pool.conf   |  60 --------
 ...nches_conv.conf => mkldnn_simple_net.conf} |  48 +++---
 paddle/gserver/tests/test_MKLDNN.cpp          |   8 +-
 paddle/math/MKLDNNMatrix.h                    |   5 +
 paddle/trainer/tests/CMakeLists.txt           |  16 --
 .../sample_trainer_config_branch_net.conf     | 133 ----------------
 .../sample_trainer_config_simple_net.conf     |  68 ---------
 paddle/trainer/tests/test_CompareTwoNets.cpp  |  11 --
 12 files changed, 197 insertions(+), 384 deletions(-)
 create mode 100644 paddle/gserver/tests/mkldnn_branch_net.conf
 delete mode 100644 paddle/gserver/tests/mkldnn_branches_fc.conf
 delete mode 100644 paddle/gserver/tests/mkldnn_branches_pool.conf
 rename paddle/gserver/tests/{mkldnn_branches_conv.conf => mkldnn_simple_net.conf} (64%)
 delete mode 100644 paddle/trainer/tests/sample_trainer_config_branch_net.conf
 delete mode 100644 paddle/trainer/tests/sample_trainer_config_simple_net.conf

diff --git a/paddle/gserver/tests/MKLDNNTester.cpp b/paddle/gserver/tests/MKLDNNTester.cpp
index c345a16221..7670cb88fb 100644
--- a/paddle/gserver/tests/MKLDNNTester.cpp
+++ b/paddle/gserver/tests/MKLDNNTester.cpp
@@ -521,12 +521,16 @@ void MKLDNNTester::getOutResult(const std::string& configPath,
     gradientMachine->forward(in.inArgs[i], &outArgs, PASS_TRAIN);
     // save forward result
     for (size_t k = 0; k < outArgs.size(); k++) {
-      MatrixPtr value = Matrix::create(outArgs[k].value->getHeight(),
-                                       outArgs[k].value->getWidth(),
-                                       false,
-                                       false);
-      value->copyFrom(*outArgs[k].value);
-      out.outValues.push_back(value);
+      const MatrixPtr& src = outArgs[k].value;
+      MatrixPtr dst =
+          Matrix::create(src->getHeight(), src->getWidth(), false, false);
+      if (typeid(*src) == typeid(MKLDNNMatrix)) {
+        MKLDNNMatrixPtr dnnSrc = std::dynamic_pointer_cast<MKLDNNMatrix>(src);
+        dnnSrc->copyTo(*dst);
+      } else {
+        dst->copyFrom(*src);
+      }
+      out.outValues.push_back(dst);
     }
 
     // random backward input
@@ -559,9 +563,9 @@ void MKLDNNTester::compareResult(DataOut& ref, DataOut& dnn, float eps) {
   }
 }
 
-void MKLDNNTester::runBranchesTest(const std::string& configPath,
-                                   size_t iter,
-                                   float eps) {
+void MKLDNNTester::runNetTest(const std::string& configPath,
+                              size_t iter,
+                              float eps) {
   DataIn in;
   initArgument(in, configPath, iter);
   DataOut outCpu, outDnn;
diff --git a/paddle/gserver/tests/MKLDNNTester.h b/paddle/gserver/tests/MKLDNNTester.h
index a99715cff0..ca55a45bc7 100644
--- a/paddle/gserver/tests/MKLDNNTester.h
+++ b/paddle/gserver/tests/MKLDNNTester.h
@@ -85,17 +85,17 @@ public:
            bool printDetails = false,
            size_t iter = 3,
            float epsilon = 1e-4);
-  static void runBranchesTest(const std::string& configPath,
-                              size_t iter = 3,
-                              float eps = 1e-4);
+  static void runNetTest(const std::string& configPath,
+                         size_t iter = 2,
+                         float eps = 1e-4);
   static void initArgument(DataIn& data,
                            const std::string& configPath,
-                           size_t iter = 3);
+                           size_t iter = 2);
   static void getOutResult(const std::string& configPath,
                            DataIn& in,
                            DataOut& out,
                            bool use_mkldnn,
-                           size_t iter = 3);
+                           size_t iter = 2);
 
 private:
   void reset(const TestConfig& dnn, const TestConfig& ref, size_t batchSize);
diff --git a/paddle/gserver/tests/mkldnn_branch_net.conf b/paddle/gserver/tests/mkldnn_branch_net.conf
new file mode 100644
index 0000000000..8d5146abb0
--- /dev/null
+++ b/paddle/gserver/tests/mkldnn_branch_net.conf
@@ -0,0 +1,142 @@
+# Copyright (c) 2017 PaddlePaddle Authors. All Rights Reserved
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from paddle.trainer_config_helpers import *
+
+settings(batch_size=16)
+channels = get_config_arg("channels", int, 2)
+
+def two_conv(input, group_name):
+  out1 = img_conv_layer(input=input,
+              name=group_name+'_conv1_',
+              filter_size=1,
+              num_filters=channels,
+              padding=0,
+              shared_biases=True,
+              act=ReluActivation())
+
+  out2 = img_conv_layer(input=input,
+              name=group_name+'_conv2_',
+              filter_size=3,
+              num_filters=channels,
+              padding=1,
+              shared_biases=True,
+              act=ReluActivation())
+  return out1, out2
+
+def two_conv_bn(input, group_name):
+  out1, out2 = two_conv(input, group_name)
+  out1 = batch_norm_layer(input=out1,
+              name=group_name+'_bn1_',
+              use_global_stats=False,
+              act=ReluActivation())
+
+  out2 = batch_norm_layer(input=out2,
+              name=group_name+'_bn2_',
+              use_global_stats=False,
+              act=ReluActivation())
+  return out1, out2
+
+def two_conv_pool(input, group_name):
+  out1, out2 = two_conv(input, group_name)
+  out1 = img_pool_layer(input=out1,
+              name=group_name+'_pool1_',
+              pool_size=3,
+              stride=2,
+              padding=0,
+              pool_type=MaxPooling())
+
+  out2 = img_pool_layer(input=out2,
+              name=group_name+'_pool2_',
+              pool_size=5,
+              stride=2,
+              padding=1,
+              pool_type=MaxPooling())
+  return out1, out2
+
+def two_fc(input, group_name):
+  out1 = fc_layer(input=input,
+            name=group_name+'_fc1_',
+            size=channels,
+            bias_attr=False,
+            act=LinearActivation())
+
+  out2 = fc_layer(input=input,
+            name=group_name+'_fc2_',
+            size=channels,
+            bias_attr=False,
+            act=LinearActivation())
+  return out1, out2
+
+data = data_layer(name ="input", size=channels*16*16)
+
+tmp = img_conv_layer(input=data,
+            num_channels=channels,
+            filter_size=3,
+            num_filters=channels,
+            padding=1,
+            shared_biases=True,
+            act=ReluActivation())
+
+a1, a2 = two_conv(tmp, 'conv_branch')
+tmp = addto_layer(input=[a1, a2],
+            act=ReluActivation(),
+            bias_attr=False)
+
+tmp = img_pool_layer(input=tmp,
+            pool_size=3,
+            stride=2,
+            padding=1,
+            pool_type=AvgPooling())
+
+b1, b2 = two_conv_pool(tmp, 'pool_branch')
+tmp = concat_layer(input=[b1, b2])
+
+tmp = img_pool_layer(input=tmp,
+            num_channels=channels*2,
+            pool_size=3,
+            stride=2,
+            padding=1,
+            pool_type=MaxPooling())
+
+tmp = img_conv_layer(input=tmp,
+            filter_size=3,
+            num_filters=channels,
+            padding=1,
+            stride=2,
+            shared_biases=True,
+            act=LinearActivation(),
+            bias_attr=False)
+
+tmp = batch_norm_layer(input=tmp,
+            use_global_stats=False,
+            act=ReluActivation())
+
+c1, c2 = two_conv_bn(tmp, 'bn_branch')
+tmp = addto_layer(input=[c1, c2],
+            act=ReluActivation(),
+            bias_attr=False)
+
+tmp = fc_layer(input=tmp, size=channels,
+            bias_attr=True,
+            act=ReluActivation())
+
+d1, d2 = two_fc(tmp, 'fc_branch')
+tmp = addto_layer(input=[d1, d2])
+
+out = fc_layer(input=tmp, size=10,
+            bias_attr=True,
+            act=SoftmaxActivation())
+
+outputs(out)
diff --git a/paddle/gserver/tests/mkldnn_branches_fc.conf b/paddle/gserver/tests/mkldnn_branches_fc.conf
deleted file mode 100644
index fb85425c2b..0000000000
--- a/paddle/gserver/tests/mkldnn_branches_fc.conf
+++ /dev/null
@@ -1,58 +0,0 @@
-# Copyright (c) 2017 PaddlePaddle Authors. All Rights Reserved
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from paddle.trainer_config_helpers import *
-
-settings(batch_size=16)
-channels = get_config_arg("channels", int, 2)
-
-def two_fc(input, group_name):
-  out1 = fc_layer(input=input,
-            name=group_name+'_fc1',
-            size=channels,
-            bias_attr=False,
-            act=LinearActivation())
-
-  out2 = fc_layer(input=input,
-            name=group_name+'_fc2',
-            size=channels,
-            bias_attr=False,
-            act=LinearActivation())
-  return out1, out2
-
-data = data_layer(name ="input", size=channels*16*16)
-
-conv = img_conv_layer(input=data,
-            num_channels=channels,
-            filter_size=3,
-            num_filters=channels,
-            padding=1,
-            shared_biases=True,
-            act=LinearActivation())
-
-pool = img_pool_layer(input=conv,
-            pool_size=3,
-            stride=2,
-            padding=1,
-            pool_type=AvgPooling())
-
-a1, a2 = two_fc(input=pool, group_name='a')
-
-concat = concat_layer(input=[a1, a2])
-
-b1, b2 = two_fc(input=pool, group_name='b')
-
-addto = addto_layer(input=[b1, b2])
-
-outputs([concat, addto])
diff --git a/paddle/gserver/tests/mkldnn_branches_pool.conf b/paddle/gserver/tests/mkldnn_branches_pool.conf
deleted file mode 100644
index ca17c74752..0000000000
--- a/paddle/gserver/tests/mkldnn_branches_pool.conf
+++ /dev/null
@@ -1,60 +0,0 @@
-# Copyright (c) 2017 PaddlePaddle Authors. All Rights Reserved
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from paddle.trainer_config_helpers import *
-
-settings(batch_size=16)
-channels = get_config_arg("channels", int, 2)
-
-def two_pool(input, group_name):
-  out1 = img_pool_layer(input=input,
-            name=group_name+'_pool1',
-            pool_size=3,
-            stride=2,
-            padding=0,
-            pool_type=MaxPooling())
-
-  out2 = img_pool_layer(input=input,
-            name=group_name+'_pool2',
-            pool_size=5,
-            stride=2,
-            padding=1,
-            pool_type=MaxPooling())
-  return out1, out2
-
-data = data_layer(name ="input", size=channels*16*16)
-
-conv = img_conv_layer(input=data,
-            num_channels=channels,
-            filter_size=3,
-            num_filters=channels,
-            padding=1,
-            shared_biases=True,
-            act=LinearActivation())
-
-pool = img_pool_layer(input=conv,
-            pool_size=3,
-            stride=1,
-            padding=1,
-            pool_type=AvgPooling())
-
-a1, a2 = two_pool(input=pool, group_name='a')
-
-concat = concat_layer(input=[a1, a2])
-
-b1, b2 = two_pool(input=pool, group_name='b')
-
-addto = addto_layer(input=[b1, b2])
-
-outputs([concat, addto])
diff --git a/paddle/gserver/tests/mkldnn_branches_conv.conf b/paddle/gserver/tests/mkldnn_simple_net.conf
similarity index 64%
rename from paddle/gserver/tests/mkldnn_branches_conv.conf
rename to paddle/gserver/tests/mkldnn_simple_net.conf
index 2628509db4..8bbe91e56d 100644
--- a/paddle/gserver/tests/mkldnn_branches_conv.conf
+++ b/paddle/gserver/tests/mkldnn_simple_net.conf
@@ -17,40 +17,48 @@ from paddle.trainer_config_helpers import *
 settings(batch_size=16)
 channels = get_config_arg("channels", int, 2)
 
-def two_conv(input, group_name):
-  out1 = img_conv_layer(input=input,
-            name=group_name+'_conv1',
-            filter_size=1,
-            num_filters=channels,
-            padding=0,
-            shared_biases=True,
-            act=ReluActivation())
+data = data_layer(name ="input", size=channels*16*16)
 
-  out2 = img_conv_layer(input=input,
-            name=group_name+'_conv2',
+tmp = img_conv_layer(input=data,
+            num_channels=channels,
             filter_size=3,
             num_filters=channels,
             padding=1,
             shared_biases=True,
             act=ReluActivation())
-  return out1, out2
 
-data = data_layer(name ="input", size=channels*16*16)
+tmp = img_pool_layer(input=tmp,
+            pool_size=3,
+            stride=1,
+            padding=0,
+            pool_type=AvgPooling())
 
-conv = img_conv_layer(input=data,
-            num_channels=channels,
+tmp = img_conv_layer(input=tmp,
             filter_size=3,
             num_filters=channels,
             padding=1,
             shared_biases=True,
-            act=ReluActivation())
+            act=LinearActivation(),
+            bias_attr=False)
 
-a1, a2 = two_conv(input=conv, group_name='a')
+tmp = batch_norm_layer(input=tmp,
+            use_global_stats=False,
+            act=ReluActivation())
 
-concat = concat_layer(input=[a1, a2])
+tmp = img_pool_layer(input=tmp,
+            pool_size=3,
+            stride=2,
+            padding=1,
+            pool_type=MaxPooling())
 
-b1, b2 = two_conv(input=conv, group_name='b')
+tmp = fc_layer(input=tmp,
+            size=channels,
+            bias_attr=False,
+            act=ReluActivation())
 
-addto = addto_layer(input=[b1, b2])
+out = fc_layer(input=tmp,
+            size=10,
+            bias_attr=True,
+            act=SoftmaxActivation())
 
-outputs([concat, addto])
+outputs(out)
diff --git a/paddle/gserver/tests/test_MKLDNN.cpp b/paddle/gserver/tests/test_MKLDNN.cpp
index b99192ca0f..d60b0f04a1 100644
--- a/paddle/gserver/tests/test_MKLDNN.cpp
+++ b/paddle/gserver/tests/test_MKLDNN.cpp
@@ -308,15 +308,15 @@ TEST(MKLDNNActivation, Activations) {
 }
 
 DECLARE_string(config_args);
-TEST(MKLDNNLayer, branches) {
-  std::vector<std::string> cases = {"conv", "pool", "fc"};
+TEST(MKLDNNNet, net) {
+  std::vector<std::string> cases = {"simple", "branch"};
   for (auto name : cases) {
-    std::string config = "./gserver/tests/mkldnn_branches_" + name + ".conf";
+    std::string config = "./gserver/tests/mkldnn_" + name + "_net.conf";
     for (auto channels : {2, 32}) {
       std::ostringstream oss;
       oss << "channels=" << channels;
       FLAGS_config_args = oss.str();
-      MKLDNNTester::runBranchesTest(config);
+      MKLDNNTester::runNetTest(config);
     }
   }
 }
diff --git a/paddle/math/MKLDNNMatrix.h b/paddle/math/MKLDNNMatrix.h
index 5f5b819017..54cfefe23b 100644
--- a/paddle/math/MKLDNNMatrix.h
+++ b/paddle/math/MKLDNNMatrix.h
@@ -102,6 +102,11 @@ public:
     m_->copyFrom(src);
   }
 
+  void copyTo(Matrix& dst) {
+    // TODO(TJ): reorder data if this format is not nchw or x
+    dst.copyFrom(*m_);
+  }
+
 public:
   /**
    * Reorder this MKLDNNMatrix from other format.
diff --git a/paddle/trainer/tests/CMakeLists.txt b/paddle/trainer/tests/CMakeLists.txt
index 5ebbb99c94..f01ad4142d 100644
--- a/paddle/trainer/tests/CMakeLists.txt
+++ b/paddle/trainer/tests/CMakeLists.txt
@@ -37,22 +37,6 @@ add_test(NAME test_CompareTwoNets
             --config_file_a=trainer/tests/sample_trainer_config_qb_rnn.conf --config_file_b=trainer/tests/sample_trainer_config_rnn.conf
     WORKING_DIRECTORY ${PADDLE_SOURCE_DIR}/paddle/)
 
-################ test_CompareMKLDNNandCPU ######################
-if(WITH_MKLDNN)
-  macro(gen_command VAR_NAME CONFIG_FILE)
-    set(${VAR_NAME} "${PADDLE_SOURCE_DIR}/paddle/.set_python_path.sh" "-d" "${PADDLE_SOURCE_DIR}/python/"
-                    "${CMAKE_CURRENT_BINARY_DIR}/test_CompareMKLDNNandCPU --use_gpu=False"
-                    "--config_file_a=trainer/tests/${CONFIG_FILE} --use_mkldnn_a=True"
-                    "--config_file_b=trainer/tests/${CONFIG_FILE} --use_mkldnn_b=False"
-                    "WORKING_DIRECTORY" "${PADDLE_SOURCE_DIR}/paddle/")
-  endmacro()
-  add_unittest_without_exec(test_CompareMKLDNNandCPU test_CompareTwoNets.cpp)
-  gen_command(compare_simple_net "sample_trainer_config_simple_net.conf")
-  gen_command(compare_branch_net "sample_trainer_config_branch_net.conf")
-  add_test(NAME test_CompareMKLDNNandCPU_simple_net COMMAND ${compare_simple_net})
-  add_test(NAME test_CompareMKLDNNandCPU_branch_net COMMAND ${compare_branch_net})
-endif()
-
 ############### test_CompareTwoOpts ###################
 add_unittest_without_exec(test_CompareTwoOpts
     test_CompareTwoOpts.cpp)
diff --git a/paddle/trainer/tests/sample_trainer_config_branch_net.conf b/paddle/trainer/tests/sample_trainer_config_branch_net.conf
deleted file mode 100644
index 3d8fb77a11..0000000000
--- a/paddle/trainer/tests/sample_trainer_config_branch_net.conf
+++ /dev/null
@@ -1,133 +0,0 @@
-# Copyright (c) 2017 PaddlePaddle Authors. All Rights Reserved
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from paddle.trainer_config_helpers import *
-
-################################### Data Configuration ###################################
-TrainData(ProtoData(files = "trainer/tests/mnist.list"))
-################################### Algorithm Configuration ###################################
-settings(batch_size = 128,
-         learning_method = MomentumOptimizer(momentum=0.5, sparse=False))
-################################### Network Configuration ###################################
-data = data_layer(name ="input", size=784)
-
-tmp = img_conv_layer(input=data,
-            num_channels=1,
-            filter_size=3,
-            num_filters=32,
-            padding=1,
-            shared_biases=True,
-            act=ReluActivation())
-
-a1 = img_conv_layer(input=tmp,
-            filter_size=1,
-            num_filters=32,
-            padding=0,
-            shared_biases=True,
-            act=ReluActivation())
-
-a2 = img_conv_layer(input=tmp,
-            filter_size=3,
-            num_filters=32,
-            padding=1,
-            shared_biases=True,
-            act=ReluActivation())
-
-tmp = addto_layer(input=[a1, a2],
-            act=ReluActivation(),
-            bias_attr=False)
-
-tmp = img_pool_layer(input=tmp,
-            pool_size=3,
-            stride=2,
-            padding=1,
-            pool_type=AvgPooling())
-
-b1 = img_conv_layer(input=tmp,
-            filter_size=3,
-            num_filters=32,
-            padding=1,
-            shared_biases=True,
-            act=ReluActivation())
-
-b1 = img_pool_layer(input=b1,
-            pool_size=3,
-            stride=2,
-            padding=0,
-            pool_type=MaxPooling())
-
-b2 = img_conv_layer(input=tmp,
-            filter_size=3,
-            num_filters=64,
-            padding=1,
-            shared_biases=True,
-            act=ReluActivation())
-
-b2 = img_pool_layer(input=b2,
-            pool_size=5,
-            stride=2,
-            padding=1,
-            pool_type=MaxPooling())
-
-tmp = concat_layer(input=[b1, b2])
-
-tmp = img_pool_layer(input=tmp,
-            num_channels=96,
-            pool_size=3,
-            stride=2,
-            padding=1,
-            pool_type=MaxPooling())
-
-tmp = img_conv_layer(input=tmp,
-            filter_size=3,
-            num_filters=32,
-            padding=1,
-            shared_biases=True,
-            act=LinearActivation(),
-            bias_attr=False)
-
-tmp = batch_norm_layer(input=tmp,
-            use_global_stats=False,
-            act=ReluActivation())
-
-c1 = img_conv_layer(input=tmp,
-            filter_size=1,
-            num_filters=32,
-            padding=0,
-            shared_biases=True,
-            act=ReluActivation())
-
-c2 = img_conv_layer(input=tmp,
-            filter_size=3,
-            num_filters=32,
-            padding=1,
-            shared_biases=True,
-            act=ReluActivation())
-
-tmp = addto_layer(input=[c1, c2],
-            act=ReluActivation(),
-            bias_attr=False)
-
-tmp = fc_layer(input=tmp, size=64,
-            bias_attr=False,
-            act=TanhActivation())
-
-output = fc_layer(input=tmp, size=10,
-            bias_attr=True,
-            act=SoftmaxActivation())
-
-lbl = data_layer(name ="label", size=10)
-
-cost = classification_cost(input=output, label=lbl)
-outputs(cost)
diff --git a/paddle/trainer/tests/sample_trainer_config_simple_net.conf b/paddle/trainer/tests/sample_trainer_config_simple_net.conf
deleted file mode 100644
index c615b5622b..0000000000
--- a/paddle/trainer/tests/sample_trainer_config_simple_net.conf
+++ /dev/null
@@ -1,68 +0,0 @@
-# Copyright (c) 2017 PaddlePaddle Authors. All Rights Reserved
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from paddle.trainer_config_helpers import *
-
-################################### Data Configuration ###################################
-TrainData(ProtoData(files = "trainer/tests/mnist.list"))
-################################### Algorithm Configuration ###################################
-settings(batch_size = 128,
-         learning_method = MomentumOptimizer(momentum=0.5, sparse=False))
-################################### Network Configuration ###################################
-data = data_layer(name ="input", size=784)
-
-tmp = img_conv_layer(input=data,
-            num_channels=1,
-            filter_size=3,
-            num_filters=32,
-            padding=1,
-            shared_biases=True,
-            act=ReluActivation())
-
-tmp = img_pool_layer(input=tmp,
-            pool_size=3,
-            stride=2,
-            padding=1,
-            pool_type=AvgPooling())
-            
-tmp = img_conv_layer(input=tmp,
-            filter_size=3,
-            num_filters=32,
-            padding=1,
-            shared_biases=True,
-            act=LinearActivation(),
-            bias_attr=False)
-
-tmp = batch_norm_layer(input=tmp,
-            use_global_stats=False,
-            act=ReluActivation())
-
-tmp = img_pool_layer(input=tmp,
-            pool_size=3,
-            stride=2,
-            padding=1,
-            pool_type=MaxPooling())
-            
-tmp = fc_layer(input=tmp, size=64,
-               bias_attr=True,
-               act=ReluActivation())
-
-output = fc_layer(input=tmp, size=10,
-                  bias_attr=True,
-                  act=SoftmaxActivation())
-
-lbl = data_layer(name ="label", size=10)
-
-cost = classification_cost(input=output, label=lbl)
-outputs(cost)
diff --git a/paddle/trainer/tests/test_CompareTwoNets.cpp b/paddle/trainer/tests/test_CompareTwoNets.cpp
index 307645d2c3..94f65e545d 100644
--- a/paddle/trainer/tests/test_CompareTwoNets.cpp
+++ b/paddle/trainer/tests/test_CompareTwoNets.cpp
@@ -26,15 +26,12 @@ DECLARE_int32(gpu_id);
 
 DECLARE_bool(local);
 DECLARE_bool(use_gpu);
-DECLARE_bool(use_mkldnn);
 
 DECLARE_string(config);
 DECLARE_string(nics);
 
 DEFINE_string(config_file_a, "", "config of one network to compare");
 DEFINE_string(config_file_b, "", "config of another network to compare");
-DEFINE_bool(use_mkldnn_a, false, "whether to use mkldnn to run config_file_a");
-DEFINE_bool(use_mkldnn_b, false, "whether to use mkldnn to run config_file_b");
 DEFINE_bool(need_high_accuracy,
             false,
             "whether need to run in double accuracy");
@@ -131,12 +128,6 @@ void compareGradient(ComData& comDataA, ComData& comDataB) {
                 matA.getWidth());
   }
 
-  if (FLAGS_use_mkldnn_a || FLAGS_use_mkldnn_b) {
-    // some format of mkldnn parameter is different with cpu
-    // test_MKLDNN will check the parameters
-    return;
-  }
-
   vector<ParameterPtr>& parametersA = comDataA.parameters;
   vector<ParameterPtr>& parametersB = comDataB.parameters;
 
@@ -176,12 +167,10 @@ void compareGradient(ComData& comDataA, ComData& comDataB) {
 
 TEST(Trainer, create) {
   ComData dataA;
-  FLAGS_use_mkldnn = FLAGS_use_mkldnn_a;
   calcGradient(dataA, FLAGS_config_file_a);
   LOG(INFO) << "\n\nforwardBackward of Network A is finished\n\n";
 
   ComData dataB;
-  FLAGS_use_mkldnn = FLAGS_use_mkldnn_b;
   calcGradient(dataB, FLAGS_config_file_b);
   LOG(INFO) << "\n\nforwardBackward of the Network B is finished\n\n";
 

From 5bd188651740ac577f9cdc97b54137474031f122 Mon Sep 17 00:00:00 2001
From: Tao Luo <luotao02@baidu.com>
Date: Wed, 1 Nov 2017 21:56:26 +0800
Subject: [PATCH 4/4] update the VGG benchmark on CentOs6.3 and Intel 6148

---
 benchmark/IntelOptimizedPaddle.md | 84 +++++++++++++++----------------
 1 file changed, 42 insertions(+), 42 deletions(-)

diff --git a/benchmark/IntelOptimizedPaddle.md b/benchmark/IntelOptimizedPaddle.md
index f2744c075d..1bf9ea9df0 100644
--- a/benchmark/IntelOptimizedPaddle.md
+++ b/benchmark/IntelOptimizedPaddle.md
@@ -1,48 +1,48 @@
-# Benchmark
-
-Machine:
-
+# Benchmark
+
+Machine:
+
 - Server
- 	- Intel(R) Xeon(R) Gold 6148M CPU @ 2.40GHz, 2 Sockets, 20 Cores per socket
+ 	- Intel(R) Xeon(R) Gold 6148 CPU @ 2.40GHz, 2 Sockets, 20 Cores per socket
 - Laptop
  	- DELL XPS15-9560-R1745: i7-7700HQ 8G 256GSSD
- 	- i5 MacBook Pro (Retina, 13-inch, Early 2015)
-- Desktop
- 	- i7-6700k
-
-System: CentOS 7.3.1611
-
-PaddlePaddle: commit cfa86a3f70cb5f2517a802f32f2c88d48ab4e0e0
-
+ 	- i5 MacBook Pro (Retina, 13-inch, Early 2015)
+- Desktop
+ 	- i7-6700k
+
+System: CentOS release 6.3 (Final), Docker 1.12.1.
+
+PaddlePaddle: paddlepaddle/paddle:latest (TODO: will rerun after 0.11.0)
+
 - MKL-DNN tag v0.10
 - MKLML 2018.0.20170720
-- OpenBLAS v0.2.20
-	 
-On each machine, we will test and compare the performance of training on single node using MKL-DNN / MKLML / OpenBLAS respectively.
-
-## Benchmark Model
-
-### Server
-Test on batch size 64, 128, 256 on Intel(R) Xeon(R) Gold 6148M CPU @ 2.40GHz
-
-Input image size - 3 * 224 * 224, Time: images/second
-
-- VGG-19
-
-| BatchSize    | 64    | 128  | 256     |
-|--------------|-------| -----| --------|
-| OpenBLAS     | 7.86  | 9.02  | 10.62  | 
-| MKLML        | 11.80 | 13.43 | 16.21  |
-| MKL-DNN      | 29.07 | 30.40 | 31.06  |
-
-
-chart on batch size 128
-TBD
-
+- OpenBLAS v0.2.20
+	 
+On each machine, we will test and compare the performance of training on a single node using MKL-DNN / MKLML / OpenBLAS respectively.
+
+## Benchmark Model
+
+### Server
+Test on batch size 64, 128, 256 on Intel(R) Xeon(R) Gold 6148 CPU @ 2.40GHz
+
+Input image size - 3 * 224 * 224, Throughput: images/second
+
+- VGG-19
+
+| BatchSize    | 64    | 128  | 256     |
+|--------------|-------| -----| --------|
+| OpenBLAS     | 7.82  | 8.62  | 10.34  | 
+| MKLML        | 11.02 | 12.86 | 15.33  |
+| MKL-DNN      | 27.69 | 28.8 | 29.27  |
+
+
+chart on batch size 128
+TBD
+
  - ResNet
- - GoogLeNet
-
-### Laptop
-TBD
-### Desktop
-TBD
+ - GoogLeNet
+
+### Laptop
+TBD
+### Desktop
+TBD