From 42febfa928af0a066727b9d6b1d2dbe15bf10886 Mon Sep 17 00:00:00 2001 From: Luo Tao Date: Mon, 23 Apr 2018 21:19:25 +0800 Subject: [PATCH 01/52] tensorrt convert init --- .../fluid/inference/tensorrt/CMakeLists.txt | 2 + .../inference/tensorrt/convert/CMakeLists.txt | 2 + .../inference/tensorrt/convert/convert.cc | 51 +++++++++++++++ .../inference/tensorrt/convert/convert.h | 64 +++++++++++++++++++ .../tensorrt/convert/convert_test.cc | 38 +++++++++++ 5 files changed, 157 insertions(+) create mode 100644 paddle/fluid/inference/tensorrt/convert/CMakeLists.txt create mode 100644 paddle/fluid/inference/tensorrt/convert/convert.cc create mode 100644 paddle/fluid/inference/tensorrt/convert/convert.h create mode 100644 paddle/fluid/inference/tensorrt/convert/convert_test.cc diff --git a/paddle/fluid/inference/tensorrt/CMakeLists.txt b/paddle/fluid/inference/tensorrt/CMakeLists.txt index e39c0daac7..37f038f1fb 100644 --- a/paddle/fluid/inference/tensorrt/CMakeLists.txt +++ b/paddle/fluid/inference/tensorrt/CMakeLists.txt @@ -1 +1,3 @@ nv_test(test_tensorrt SRCS test_tensorrt.cc DEPS dynload_cuda device_context dynamic_loader) +cc_library(tensorrt DEPS tensorrt_convert) +add_subdirectory(convert) diff --git a/paddle/fluid/inference/tensorrt/convert/CMakeLists.txt b/paddle/fluid/inference/tensorrt/convert/CMakeLists.txt new file mode 100644 index 0000000000..c35d61ef05 --- /dev/null +++ b/paddle/fluid/inference/tensorrt/convert/CMakeLists.txt @@ -0,0 +1,2 @@ +nv_library(tensorrt_convert SRCS convert.cc DEPS dynload_cuda) +nv_test(tensorrt_convert_test SRCS convert_test.cc DEPS tensorrt paddle_fluid) diff --git a/paddle/fluid/inference/tensorrt/convert/convert.cc b/paddle/fluid/inference/tensorrt/convert/convert.cc new file mode 100644 index 0000000000..be813cf93a --- /dev/null +++ b/paddle/fluid/inference/tensorrt/convert/convert.cc @@ -0,0 +1,51 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/
+
+#include "paddle/fluid/inference/tensorrt/convert/convert.h"
+
+namespace paddle {
+namespace inference {
+namespace tensorrt {
+
+void TensorRTConverter::ConvertOp(const framework::OpDesc& op) {
+  std::string type = op.Type();
+  PADDLE_ENFORCE(op_registry_.count(type),
+                 "No converter registered for op: %s", type);
+  std::function<void(const framework::OpDesc&)> op_converter =
+      op_registry_.at(type);
+  op_converter(op);
+}
+
+void TensorRTConverter::ConvertBlock(const framework::BlockDesc& block) {
+  for (auto op : block.AllOps()) {
+    ConvertOp(*op);
+  }
+}
+
+void TensorRTConverter::RegisterOpConverters() {
+  op_registry_["mul"] = ConvertMul;
+  op_registry_["conv2d"] = ConvertConv2D;
+}
+
+void TensorRTConverter::ConvertMul(const framework::OpDesc& op) {
+  LOG(INFO) << "convert a fluid mul op to tensorrt fc layer without bias";
+}
+
+void TensorRTConverter::ConvertConv2D(const framework::OpDesc& op) {
+  LOG(INFO) << "convert a fluid Conv2d op to tensorrt conv layer without bias";
+}
+
+}  // namespace tensorrt
+}  // namespace inference
+}  // namespace paddle
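For reference, the lookup-and-dispatch pattern that ConvertOp implements above boils down to a string-keyed map of callables. A minimal standalone sketch of that pattern (illustrative names only, not part of the patch):

    #include <functional>
    #include <iostream>
    #include <stdexcept>
    #include <string>
    #include <unordered_map>

    // Registry keyed by op type, mirroring TensorRTConverter::op_registry_.
    using OpConverterFn = std::function<void(const std::string&)>;

    int main() {
      std::unordered_map<std::string, OpConverterFn> registry;
      registry["mul"] = [](const std::string& op) {
        std::cout << "convert " << op << " to a tensorrt fc layer\n";
      };
      // Mirrors ConvertOp: fail loudly if no converter was registered.
      auto it = registry.find("mul");
      if (it == registry.end()) throw std::runtime_error("no converter");
      it->second("mul");
      return 0;
    }
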
diff --git a/paddle/fluid/inference/tensorrt/convert/convert.h b/paddle/fluid/inference/tensorrt/convert/convert.h
new file mode 100644
index 0000000000..a029152031
--- /dev/null
+++ b/paddle/fluid/inference/tensorrt/convert/convert.h
@@ -0,0 +1,64 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include <NvInfer.h>
+#include <functional>
+#include <string>
+#include <unordered_map>
+
+#include "paddle/fluid/framework/block_desc.h"
+#include "paddle/fluid/framework/scope.h"
+
+namespace paddle {
+namespace inference {
+namespace tensorrt {
+
+class TensorRTConverter {
+ public:
+  explicit TensorRTConverter(const framework::Scope& scope) : scope_(scope) {
+    this->RegisterOpConverters();
+  }
+
+  // convert fluid op to tensorrt layer
+  void ConvertOp(const framework::OpDesc& op);
+
+  // convert fluid block to tensorrt network
+  void ConvertBlock(const framework::BlockDesc& block);
+
+ private:
+  // convert op registry, whose key is the fluid op type, and value is the
+  // corresponding tensorrt convert function
+  std::unordered_map<std::string,
+                     std::function<void(const framework::OpDesc&)>>
+      op_registry_;
+  // fluid inference scope
+  const framework::Scope& scope_;
+  // tensorrt input/output tensor list, whose key is the fluid variable name,
+  // and value is the pointer to the corresponding tensorrt tensor
+  std::unordered_map<std::string, nvinfer1::ITensor*> tr_tensors_;
+
+  // register different op converters
+  void RegisterOpConverters();
+
+  // convert a fluid Mul op to tensorrt fc layer without bias
+  static void ConvertMul(const framework::OpDesc& op);
+
+  // convert a fluid Conv2d op to tensorrt conv layer without bias
+  static void ConvertConv2D(const framework::OpDesc& op);
+};
+
+}  // namespace tensorrt
+}  // namespace inference
+}  // namespace paddle
diff --git a/paddle/fluid/inference/tensorrt/convert/convert_test.cc b/paddle/fluid/inference/tensorrt/convert/convert_test.cc
new file mode 100644
index 0000000000..dd1526b783
--- /dev/null
+++ b/paddle/fluid/inference/tensorrt/convert/convert_test.cc
@@ -0,0 +1,38 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/fluid/inference/tensorrt/convert/convert.h"
+#include <gtest/gtest.h>
+#include "paddle/fluid/framework/program_desc.h"
+
+namespace paddle {
+namespace inference {
+namespace tensorrt {
+
+TEST(tensorrt, ConvertBlock) {
+  framework::ProgramDesc prog;
+  auto* block = prog.MutableBlock(0);
+  auto* mul_op = block->AppendOp();
+  mul_op->SetType("mul");
+  auto* conv2d_op = block->AppendOp();
+  conv2d_op->SetType("conv2d");
+
+  framework::Scope scope;
+  TensorRTConverter converter(scope);
+  converter.ConvertBlock(*block);
+}
+
+}  // namespace tensorrt
+}  // namespace inference
+}  // namespace paddle

From d599de5c41ca312158874dffb2373fcc116d5b52 Mon Sep 17 00:00:00 2001
From: Luo Tao
Date: Wed, 25 Apr 2018 20:10:28 +0800
Subject: [PATCH 02/52] auto register op converters

---
 .../inference/tensorrt/convert/CMakeLists.txt |  2 +-
 .../inference/tensorrt/convert/convert.cc     | 30 +++--------
 .../inference/tensorrt/convert/convert.h      | 53 +++++++++++--------
 .../tensorrt/convert/convert_conv2d.h         | 36 +++++++++++++
 .../inference/tensorrt/convert/convert_mul.h  | 35 ++++++++++++
 .../{convert_test.cc => test_convert.cc}      |  5 +-
 6 files changed, 112 insertions(+), 49 deletions(-)
 create mode 100644 paddle/fluid/inference/tensorrt/convert/convert_conv2d.h
 create mode 100644 paddle/fluid/inference/tensorrt/convert/convert_mul.h
 rename paddle/fluid/inference/tensorrt/convert/{convert_test.cc => test_convert.cc} (94%)

diff --git a/paddle/fluid/inference/tensorrt/convert/CMakeLists.txt b/paddle/fluid/inference/tensorrt/convert/CMakeLists.txt
index c35d61ef05..cd51fd609c 100644
--- a/paddle/fluid/inference/tensorrt/convert/CMakeLists.txt
+++ b/paddle/fluid/inference/tensorrt/convert/CMakeLists.txt
@@ -1,2 +1,2 @@
 nv_library(tensorrt_convert SRCS convert.cc DEPS dynload_cuda)
-nv_test(tensorrt_convert_test SRCS convert_test.cc DEPS tensorrt paddle_fluid)
+nv_test(test_tensorrt_convert SRCS test_convert.cc DEPS tensorrt paddle_fluid)
diff --git a/paddle/fluid/inference/tensorrt/convert/convert.cc b/paddle/fluid/inference/tensorrt/convert/convert.cc
index be813cf93a..bf6f1cd2c1 100644
--- a/paddle/fluid/inference/tensorrt/convert/convert.cc
+++ b/paddle/fluid/inference/tensorrt/convert/convert.cc
@@ -13,39 +13,23 @@ See the License for the specific language governing permissions and
 limitations under the License.
*/ #include "paddle/fluid/inference/tensorrt/convert/convert.h" +#include "paddle/fluid/inference/tensorrt/convert/convert_conv2d.h" +#include "paddle/fluid/inference/tensorrt/convert/convert_mul.h" namespace paddle { namespace inference { namespace tensorrt { -void TensorRTConverter::ConvertOp(const framework::OpDesc& op) { - std::string type = op.Type(); - PADDLE_ENFORCE(op_registry_.count(type), "No converter registered for op: %s", - type); - std::function op_converter = - op_registry_.at(type); - op_converter(op); -} - void TensorRTConverter::ConvertBlock(const framework::BlockDesc& block) { for (auto op : block.AllOps()) { - ConvertOp(*op); + std::string type = op->Type(); + PADDLE_ENFORCE(GetOpConverter().count(type), + "No converter registered for op: %s", type); + auto op_converter = GetOpConverter()[type]; + op_converter->Convert(*op); } } -void TensorRTConverter::RegisterOpConverters() { - op_registry_["mul"] = ConvertMul; - op_registry_["conv2d"] = ConvertConv2D; -} - -void TensorRTConverter::ConvertMul(const framework::OpDesc& op) { - LOG(INFO) << "convert a fluid mul op to tensorrt fc layer without bias"; -} - -void TensorRTConverter::ConvertConv2D(const framework::OpDesc& op) { - LOG(INFO) << "convert a fluid Conv2d op to tensorrt conv layer without bias"; -} - } // namespace tensorrt } // namespace inference } // namespace paddle diff --git a/paddle/fluid/inference/tensorrt/convert/convert.h b/paddle/fluid/inference/tensorrt/convert/convert.h index a029152031..4f95233057 100644 --- a/paddle/fluid/inference/tensorrt/convert/convert.h +++ b/paddle/fluid/inference/tensorrt/convert/convert.h @@ -26,37 +26,46 @@ namespace paddle { namespace inference { namespace tensorrt { -class TensorRTConverter { +class ConverterBase { public: - explicit TensorRTConverter(const framework::Scope& scope) : scope_(scope) { - this->RegisterOpConverters(); - } + ConverterBase() {} - // convert fluid op to tensorrt layer - void ConvertOp(const framework::OpDesc& op); - - // convert fluid block to tensorrt network - void ConvertBlock(const framework::BlockDesc& block); - - private: - // convert op registry, whose key is the fluid op type, and value is the - // convert tensorrt function name - std::unordered_map> - op_registry_; // fluid inference scope - const framework::Scope& scope_; + framework::Scope* scope_; // tensorrt input/output tensor list, whose key is the fluid variable name, // and value is the pointer position of tensorrt tensor std::unordered_map tr_tensors_; +}; - // register different op converters - void RegisterOpConverters(); +class OpConverter : public ConverterBase { + public: + OpConverter() {} + virtual ~OpConverter() {} - // convert a fluid Mul op to tensorrt fc layer without bias - static void ConvertMul(const framework::OpDesc& op); + // convert fluid op to tensorrt layer + virtual void Convert(const framework::OpDesc& op) = 0; +}; + +static std::unordered_map& GetOpConverter() { + static std::unordered_map register_op_converter; + return register_op_converter; +} - // convert a fluid Conv2d op to tensorrt conv layer without bias - static void ConvertConv2D(const framework::OpDesc& op); +#define REGISTER_TRT_OP_CONVETER(op_type, convert_class) \ + class convert_class##Register { \ + public: \ + convert_class##Register() { \ + GetOpConverter()[#op_type] = new convert_class; \ + } \ + }; \ + convert_class##Register convert_class##reg; + +class TensorRTConverter : public ConverterBase { + public: + TensorRTConverter() {} + + // convert fluid block to tensorrt network + void 
ConvertBlock(const framework::BlockDesc& block); }; } // namespace tensorrt diff --git a/paddle/fluid/inference/tensorrt/convert/convert_conv2d.h b/paddle/fluid/inference/tensorrt/convert/convert_conv2d.h new file mode 100644 index 0000000000..34622f92a4 --- /dev/null +++ b/paddle/fluid/inference/tensorrt/convert/convert_conv2d.h @@ -0,0 +1,36 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once +#include "paddle/fluid/inference/tensorrt/convert/convert.h" + +namespace paddle { +namespace inference { +namespace tensorrt { + +class Conv2dOpConverter : public OpConverter { + public: + Conv2dOpConverter() {} + void Convert(const framework::OpDesc& op); +}; + +void Conv2dOpConverter::Convert(const framework::OpDesc& op) { + LOG(INFO) << "convert a fluid conv2d op to tensorrt conv layer without bias"; +} + +REGISTER_TRT_OP_CONVETER(conv2d, Conv2dOpConverter); + +} // namespace tensorrt +} // namespace inference +} // namespace paddle diff --git a/paddle/fluid/inference/tensorrt/convert/convert_mul.h b/paddle/fluid/inference/tensorrt/convert/convert_mul.h new file mode 100644 index 0000000000..a626300cf3 --- /dev/null +++ b/paddle/fluid/inference/tensorrt/convert/convert_mul.h @@ -0,0 +1,35 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once +#include "paddle/fluid/inference/tensorrt/convert/convert.h" + +namespace paddle { +namespace inference { +namespace tensorrt { + +class MulOpConverter : public OpConverter { + public: + MulOpConverter() {} + void Convert(const framework::OpDesc& op); +}; + +REGISTER_TRT_OP_CONVETER(mul, MulOpConverter); +void MulOpConverter::Convert(const framework::OpDesc& op) { + LOG(INFO) << "convert a fluid mul op to tensorrt fc layer without bias"; +} + +} // namespace tensorrt +} // namespace inference +} // namespace paddle diff --git a/paddle/fluid/inference/tensorrt/convert/convert_test.cc b/paddle/fluid/inference/tensorrt/convert/test_convert.cc similarity index 94% rename from paddle/fluid/inference/tensorrt/convert/convert_test.cc rename to paddle/fluid/inference/tensorrt/convert/test_convert.cc index dd1526b783..d761b4eb7f 100644 --- a/paddle/fluid/inference/tensorrt/convert/convert_test.cc +++ b/paddle/fluid/inference/tensorrt/convert/test_convert.cc @@ -12,9 +12,9 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
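The REGISTER_TRT_OP_CONVETER macro used in convert_mul.h and convert_conv2d.h above works by defining a helper class whose constructor inserts a converter into the global registry, plus a file-scope instance of that class, so registration runs during static initialization, before main(). A minimal standalone sketch of the idiom (illustrative names, not part of the patch):

    #include <iostream>
    #include <string>
    #include <unordered_map>

    struct Converter {
      virtual ~Converter() {}
      virtual void Convert() = 0;
    };

    // A function-local static sidesteps the static initialization order
    // problem, as GetOpConverter() does above.
    std::unordered_map<std::string, Converter*>& Registry() {
      static std::unordered_map<std::string, Converter*> registry;
      return registry;
    }

    struct MulConverter : Converter {
      void Convert() override { std::cout << "convert mul\n"; }
    };

    // What the macro expands to: a registrar class plus a global instance.
    struct MulConverterRegister {
      MulConverterRegister() { Registry()["mul"] = new MulConverter; }
    };
    MulConverterRegister mul_converter_reg;

    int main() { Registry().at("mul")->Convert(); }
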
*/ -#include "paddle/fluid/inference/tensorrt/convert/convert.h" #include #include "paddle/fluid/framework/program_desc.h" +#include "paddle/fluid/inference/tensorrt/convert/convert.h" namespace paddle { namespace inference { @@ -28,8 +28,7 @@ TEST(tensorrt, ConvertBlock) { auto* conv2d_op = block->AppendOp(); conv2d_op->SetType("conv2d"); - framework::Scope scope; - TensorRTConverter converter(scope); + TensorRTConverter converter; converter.ConvertBlock(*block); } From c4e3010b14cfbc3847466843ee58e49792e31b27 Mon Sep 17 00:00:00 2001 From: Luo Tao Date: Wed, 25 Apr 2018 22:30:57 +0800 Subject: [PATCH 03/52] use template to do registry --- .../inference/tensorrt/convert/CMakeLists.txt | 2 +- .../{convert_conv2d.h => conv2d_op.cc} | 9 +--- .../inference/tensorrt/convert/convert.cc | 8 +--- .../inference/tensorrt/convert/convert.h | 46 +++++++++---------- .../convert/{convert_mul.h => mul_op.cc} | 8 +--- 5 files changed, 26 insertions(+), 47 deletions(-) rename paddle/fluid/inference/tensorrt/convert/{convert_conv2d.h => conv2d_op.cc} (87%) rename paddle/fluid/inference/tensorrt/convert/{convert_mul.h => mul_op.cc} (87%) diff --git a/paddle/fluid/inference/tensorrt/convert/CMakeLists.txt b/paddle/fluid/inference/tensorrt/convert/CMakeLists.txt index cd51fd609c..c4b8514c1c 100644 --- a/paddle/fluid/inference/tensorrt/convert/CMakeLists.txt +++ b/paddle/fluid/inference/tensorrt/convert/CMakeLists.txt @@ -1,2 +1,2 @@ -nv_library(tensorrt_convert SRCS convert.cc DEPS dynload_cuda) +nv_library(tensorrt_convert SRCS convert.cc mul_op.cc conv2d_op.cc DEPS dynload_cuda) nv_test(test_tensorrt_convert SRCS test_convert.cc DEPS tensorrt paddle_fluid) diff --git a/paddle/fluid/inference/tensorrt/convert/convert_conv2d.h b/paddle/fluid/inference/tensorrt/convert/conv2d_op.cc similarity index 87% rename from paddle/fluid/inference/tensorrt/convert/convert_conv2d.h rename to paddle/fluid/inference/tensorrt/convert/conv2d_op.cc index 34622f92a4..1201a7696a 100644 --- a/paddle/fluid/inference/tensorrt/convert/convert_conv2d.h +++ b/paddle/fluid/inference/tensorrt/convert/conv2d_op.cc @@ -12,25 +12,18 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#pragma once #include "paddle/fluid/inference/tensorrt/convert/convert.h" namespace paddle { namespace inference { namespace tensorrt { -class Conv2dOpConverter : public OpConverter { - public: - Conv2dOpConverter() {} - void Convert(const framework::OpDesc& op); -}; +REGISTER_TRT_OP_CONVETER(conv2d, Conv2dOpConverter); void Conv2dOpConverter::Convert(const framework::OpDesc& op) { LOG(INFO) << "convert a fluid conv2d op to tensorrt conv layer without bias"; } -REGISTER_TRT_OP_CONVETER(conv2d, Conv2dOpConverter); - } // namespace tensorrt } // namespace inference } // namespace paddle diff --git a/paddle/fluid/inference/tensorrt/convert/convert.cc b/paddle/fluid/inference/tensorrt/convert/convert.cc index bf6f1cd2c1..78a72b1a8b 100644 --- a/paddle/fluid/inference/tensorrt/convert/convert.cc +++ b/paddle/fluid/inference/tensorrt/convert/convert.cc @@ -13,8 +13,6 @@ See the License for the specific language governing permissions and limitations under the License. 
*/ #include "paddle/fluid/inference/tensorrt/convert/convert.h" -#include "paddle/fluid/inference/tensorrt/convert/convert_conv2d.h" -#include "paddle/fluid/inference/tensorrt/convert/convert_mul.h" namespace paddle { namespace inference { @@ -23,10 +21,8 @@ namespace tensorrt { void TensorRTConverter::ConvertBlock(const framework::BlockDesc& block) { for (auto op : block.AllOps()) { std::string type = op->Type(); - PADDLE_ENFORCE(GetOpConverter().count(type), - "No converter registered for op: %s", type); - auto op_converter = GetOpConverter()[type]; - op_converter->Convert(*op); + OpConverter op_converter; + op_converter.Convert(*op); } } diff --git a/paddle/fluid/inference/tensorrt/convert/convert.h b/paddle/fluid/inference/tensorrt/convert/convert.h index 4f95233057..953086ace9 100644 --- a/paddle/fluid/inference/tensorrt/convert/convert.h +++ b/paddle/fluid/inference/tensorrt/convert/convert.h @@ -26,9 +26,21 @@ namespace paddle { namespace inference { namespace tensorrt { -class ConverterBase { +class OpConverter { public: - ConverterBase() {} + OpConverter() {} + + void Convert(const framework::OpDesc& op) { + std::string type = op.Type(); + OpConverter& op_converter = this->register_op_converter_[type]; + op_converter.Convert(op); + } + + template + static void Register(const std::string key) { + register_op_converter_[key] = T(); + } + static std::unordered_map register_op_converter_; // fluid inference scope framework::Scope* scope_; @@ -37,30 +49,14 @@ class ConverterBase { std::unordered_map tr_tensors_; }; -class OpConverter : public ConverterBase { - public: - OpConverter() {} - virtual ~OpConverter() {} - - // convert fluid op to tensorrt layer - virtual void Convert(const framework::OpDesc& op) = 0; -}; - -static std::unordered_map& GetOpConverter() { - static std::unordered_map register_op_converter; - return register_op_converter; -} - -#define REGISTER_TRT_OP_CONVETER(op_type, convert_class) \ - class convert_class##Register { \ - public: \ - convert_class##Register() { \ - GetOpConverter()[#op_type] = new convert_class; \ - } \ - }; \ - convert_class##Register convert_class##reg; +#define REGISTER_TRT_OP_CONVETER(op_type, convert_class) \ + class convert_class : public OpConverter { \ + public: \ + convert_class() { OpConverter::Register(#op_type); } \ + void Convert(const framework::OpDesc& op); \ + } -class TensorRTConverter : public ConverterBase { +class TensorRTConverter { public: TensorRTConverter() {} diff --git a/paddle/fluid/inference/tensorrt/convert/convert_mul.h b/paddle/fluid/inference/tensorrt/convert/mul_op.cc similarity index 87% rename from paddle/fluid/inference/tensorrt/convert/convert_mul.h rename to paddle/fluid/inference/tensorrt/convert/mul_op.cc index a626300cf3..0ce5eb7302 100644 --- a/paddle/fluid/inference/tensorrt/convert/convert_mul.h +++ b/paddle/fluid/inference/tensorrt/convert/mul_op.cc @@ -12,20 +12,14 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
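For clarity, REGISTER_TRT_OP_CONVETER(mul, MulOpConverter) under the templated-registry design above expands to a class declaration whose constructor registers the type, with the converter class itself as the template argument to Register (template argument restored here; the surrounding extraction dropped it):

    class MulOpConverter : public OpConverter {
     public:
      MulOpConverter() { OpConverter::Register<MulOpConverter>("mul"); }
      void Convert(const framework::OpDesc& op);
    };
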
*/ -#pragma once #include "paddle/fluid/inference/tensorrt/convert/convert.h" namespace paddle { namespace inference { namespace tensorrt { -class MulOpConverter : public OpConverter { - public: - MulOpConverter() {} - void Convert(const framework::OpDesc& op); -}; - REGISTER_TRT_OP_CONVETER(mul, MulOpConverter); + void MulOpConverter::Convert(const framework::OpDesc& op) { LOG(INFO) << "convert a fluid mul op to tensorrt fc layer without bias"; } From 6f6f3304238898d0e48541b325afcbed49bb1a98 Mon Sep 17 00:00:00 2001 From: Luo Tao Date: Fri, 27 Apr 2018 11:30:02 +0800 Subject: [PATCH 04/52] update the register method --- .../fluid/inference/tensorrt/CMakeLists.txt | 1 - .../inference/tensorrt/convert/CMakeLists.txt | 4 +- .../inference/tensorrt/convert/conv2d_op.cc | 17 ++-- .../inference/tensorrt/convert/convert.cc | 31 ------- .../inference/tensorrt/convert/convert.h | 69 -------------- .../inference/tensorrt/convert/mul_op.cc | 16 ++-- .../inference/tensorrt/convert/op_converter.h | 89 +++++++++++++++++++ .../{test_convert.cc => test_op_converter.cc} | 6 +- 8 files changed, 115 insertions(+), 118 deletions(-) delete mode 100644 paddle/fluid/inference/tensorrt/convert/convert.cc delete mode 100644 paddle/fluid/inference/tensorrt/convert/convert.h create mode 100644 paddle/fluid/inference/tensorrt/convert/op_converter.h rename paddle/fluid/inference/tensorrt/convert/{test_convert.cc => test_op_converter.cc} (88%) diff --git a/paddle/fluid/inference/tensorrt/CMakeLists.txt b/paddle/fluid/inference/tensorrt/CMakeLists.txt index ad850055a5..8dd95293e7 100644 --- a/paddle/fluid/inference/tensorrt/CMakeLists.txt +++ b/paddle/fluid/inference/tensorrt/CMakeLists.txt @@ -1,4 +1,3 @@ nv_test(test_tensorrt SRCS test_tensorrt.cc DEPS dynload_cuda device_context dynamic_loader) nv_test(test_tensorrt_engine SRCS test_engine.cc engine.cc DEPS dynload_cuda) -cc_library(tensorrt DEPS tensorrt_convert) add_subdirectory(convert) diff --git a/paddle/fluid/inference/tensorrt/convert/CMakeLists.txt b/paddle/fluid/inference/tensorrt/convert/CMakeLists.txt index c4b8514c1c..19fffa71cc 100644 --- a/paddle/fluid/inference/tensorrt/convert/CMakeLists.txt +++ b/paddle/fluid/inference/tensorrt/convert/CMakeLists.txt @@ -1,2 +1,2 @@ -nv_library(tensorrt_convert SRCS convert.cc mul_op.cc conv2d_op.cc DEPS dynload_cuda) -nv_test(test_tensorrt_convert SRCS test_convert.cc DEPS tensorrt paddle_fluid) +file(GLOB TENSORRT_OPS RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}" "*_op.cc") +nv_test(test_tensorrt_op_converter SRCS test_op_converter.cc ${TENSORRT_OPS} DEPS ${FLUID_CORE_MODULES}) diff --git a/paddle/fluid/inference/tensorrt/convert/conv2d_op.cc b/paddle/fluid/inference/tensorrt/convert/conv2d_op.cc index 1201a7696a..431500b90e 100644 --- a/paddle/fluid/inference/tensorrt/convert/conv2d_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/conv2d_op.cc @@ -4,7 +4,7 @@ Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 +http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, @@ -12,17 +12,22 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ -#include "paddle/fluid/inference/tensorrt/convert/convert.h" +#include "paddle/fluid/inference/tensorrt/convert/op_converter.h" namespace paddle { namespace inference { namespace tensorrt { -REGISTER_TRT_OP_CONVETER(conv2d, Conv2dOpConverter); +class Conv2dOpConverter : public OpConverter { + public: + Conv2dOpConverter() {} + void operator()(const framework::OpDesc& op) override { + LOG(INFO) + << "convert a fluid conv2d op to tensorrt conv layer without bias"; + } +}; -void Conv2dOpConverter::Convert(const framework::OpDesc& op) { - LOG(INFO) << "convert a fluid conv2d op to tensorrt conv layer without bias"; -} +REGISTER_TRT_OP_CONVERTER(conv2d, Conv2dOpConverter); } // namespace tensorrt } // namespace inference diff --git a/paddle/fluid/inference/tensorrt/convert/convert.cc b/paddle/fluid/inference/tensorrt/convert/convert.cc deleted file mode 100644 index 78a72b1a8b..0000000000 --- a/paddle/fluid/inference/tensorrt/convert/convert.cc +++ /dev/null @@ -1,31 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "paddle/fluid/inference/tensorrt/convert/convert.h" - -namespace paddle { -namespace inference { -namespace tensorrt { - -void TensorRTConverter::ConvertBlock(const framework::BlockDesc& block) { - for (auto op : block.AllOps()) { - std::string type = op->Type(); - OpConverter op_converter; - op_converter.Convert(*op); - } -} - -} // namespace tensorrt -} // namespace inference -} // namespace paddle diff --git a/paddle/fluid/inference/tensorrt/convert/convert.h b/paddle/fluid/inference/tensorrt/convert/convert.h deleted file mode 100644 index 953086ace9..0000000000 --- a/paddle/fluid/inference/tensorrt/convert/convert.h +++ /dev/null @@ -1,69 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#pragma once - -#include -#include -#include -#include - -#include "paddle/fluid/framework/block_desc.h" -#include "paddle/fluid/framework/scope.h" - -namespace paddle { -namespace inference { -namespace tensorrt { - -class OpConverter { - public: - OpConverter() {} - - void Convert(const framework::OpDesc& op) { - std::string type = op.Type(); - OpConverter& op_converter = this->register_op_converter_[type]; - op_converter.Convert(op); - } - - template - static void Register(const std::string key) { - register_op_converter_[key] = T(); - } - static std::unordered_map register_op_converter_; - - // fluid inference scope - framework::Scope* scope_; - // tensorrt input/output tensor list, whose key is the fluid variable name, - // and value is the pointer position of tensorrt tensor - std::unordered_map tr_tensors_; -}; - -#define REGISTER_TRT_OP_CONVETER(op_type, convert_class) \ - class convert_class : public OpConverter { \ - public: \ - convert_class() { OpConverter::Register(#op_type); } \ - void Convert(const framework::OpDesc& op); \ - } - -class TensorRTConverter { - public: - TensorRTConverter() {} - - // convert fluid block to tensorrt network - void ConvertBlock(const framework::BlockDesc& block); -}; - -} // namespace tensorrt -} // namespace inference -} // namespace paddle diff --git a/paddle/fluid/inference/tensorrt/convert/mul_op.cc b/paddle/fluid/inference/tensorrt/convert/mul_op.cc index 0ce5eb7302..f9834ab156 100644 --- a/paddle/fluid/inference/tensorrt/convert/mul_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/mul_op.cc @@ -4,7 +4,7 @@ Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 +http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, @@ -12,17 +12,21 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include "paddle/fluid/inference/tensorrt/convert/convert.h" +#include "paddle/fluid/inference/tensorrt/convert/op_converter.h" namespace paddle { namespace inference { namespace tensorrt { -REGISTER_TRT_OP_CONVETER(mul, MulOpConverter); +class MulOpConverter : public OpConverter { + public: + MulOpConverter() {} + void operator()(const framework::OpDesc& op) override { + LOG(INFO) << "convert a fluid mul op to tensorrt fc layer without bias"; + } +}; -void MulOpConverter::Convert(const framework::OpDesc& op) { - LOG(INFO) << "convert a fluid mul op to tensorrt fc layer without bias"; -} +REGISTER_TRT_OP_CONVERTER(mul, MulOpConverter); } // namespace tensorrt } // namespace inference diff --git a/paddle/fluid/inference/tensorrt/convert/op_converter.h b/paddle/fluid/inference/tensorrt/convert/op_converter.h new file mode 100644 index 0000000000..22a4812ce7 --- /dev/null +++ b/paddle/fluid/inference/tensorrt/convert/op_converter.h @@ -0,0 +1,89 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include <string>
+#include <unordered_map>
+
+#include "paddle/fluid/framework/block_desc.h"
+#include "paddle/fluid/framework/scope.h"
+#include "paddle/fluid/inference/tensorrt/engine.h"
+
+namespace paddle {
+namespace inference {
+namespace tensorrt {
+
+/*
+ * Convert Op from Fluid to TensorRT Engine.
+ */
+class OpConverter {
+ public:
+  OpConverter() {}
+
+  virtual void operator()(const framework::OpDesc& op) {}
+
+  void Execute(const framework::OpDesc& op) {
+    std::string type = op.Type();
+    auto it = converters_.find(type);
+    PADDLE_ENFORCE(it != converters_.end(), "no OpConverter for optype [%s]",
+                   type);
+    (*it->second)(op);
+  }
+
+  static OpConverter& Global() {
+    static auto* x = new OpConverter;
+    return *x;
+  }
+
+  template <typename T>
+  void Register(const std::string& key) {
+    converters_[key] = new T;
+  }
+
+  virtual ~OpConverter() {}
+
+ private:
+  // registered op converter map, whose key is the fluid op type, and value is
+  // the pointer to the corresponding OpConverter instance.
+  std::unordered_map<std::string, OpConverter*> converters_;
+
+  // fluid inference scope
+  framework::Scope* scope_;
+  // tensorrt input/output tensor map, whose key is the fluid variable name,
+  // and value is the pointer to the corresponding tensorrt tensor
+  std::unordered_map<std::string, nvinfer1::ITensor*> tr_tensors_;
+};
+
+#define REGISTER_TRT_OP_CONVERTER(op_type__, Converter__)      \
+  struct trt_##op_type__##_converter {                         \
+    trt_##op_type__##_converter() {                            \
+      OpConverter::Global().Register<Converter__>(#op_type__); \
+    }                                                          \
+  };                                                           \
+  trt_##op_type__##_converter trt_##op_type__##_converter__;
+
+class BlockConverter {
+ public:
+  BlockConverter() {}
+
+  // convert fluid block to tensorrt network
+  void ConvertBlock(const framework::BlockDesc& block) {
+    for (auto op : block.AllOps()) {
+      OpConverter::Global().Execute(*op);
+    }
+  }
+};
+
+}  // namespace tensorrt
+}  // namespace inference
+}  // namespace paddle
diff --git a/paddle/fluid/inference/tensorrt/convert/test_convert.cc b/paddle/fluid/inference/tensorrt/convert/test_op_converter.cc
similarity index 88%
rename from paddle/fluid/inference/tensorrt/convert/test_convert.cc
rename to paddle/fluid/inference/tensorrt/convert/test_op_converter.cc
index d761b4eb7f..43be2af68a 100644
--- a/paddle/fluid/inference/tensorrt/convert/test_convert.cc
+++ b/paddle/fluid/inference/tensorrt/convert/test_op_converter.cc
@@ -14,13 +14,13 @@
 limitations under the License.
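Expanded for the mul op, the REGISTER_TRT_OP_CONVERTER macro above produces a registrar struct plus a file-scope instance; the constructor runs during static initialization and adds the converter to the Global() singleton:

    struct trt_mul_converter {
      trt_mul_converter() {
        OpConverter::Global().Register<MulOpConverter>("mul");
      }
    };
    trt_mul_converter trt_mul_converter__;
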
*/ #include #include "paddle/fluid/framework/program_desc.h" -#include "paddle/fluid/inference/tensorrt/convert/convert.h" +#include "paddle/fluid/inference/tensorrt/convert/op_converter.h" namespace paddle { namespace inference { namespace tensorrt { -TEST(tensorrt, ConvertBlock) { +TEST(BlockConverter, ConvertBlock) { framework::ProgramDesc prog; auto* block = prog.MutableBlock(0); auto* mul_op = block->AppendOp(); @@ -28,7 +28,7 @@ TEST(tensorrt, ConvertBlock) { auto* conv2d_op = block->AppendOp(); conv2d_op->SetType("conv2d"); - TensorRTConverter converter; + BlockConverter converter; converter.ConvertBlock(*block); } From 3948b58b6e1637c7009f56ebc67e3d17577764ed Mon Sep 17 00:00:00 2001 From: Yu Yang Date: Sat, 28 Apr 2018 11:04:56 +0800 Subject: [PATCH 05/52] Add unittest of cross entropy. It is not stable on CUDA --- .../tests/unittests/test_cross_entropy_op.py | 56 +++++++++++++++++++ 1 file changed, 56 insertions(+) diff --git a/python/paddle/fluid/tests/unittests/test_cross_entropy_op.py b/python/paddle/fluid/tests/unittests/test_cross_entropy_op.py index c5b9e92d69..c8e5bd1a8f 100644 --- a/python/paddle/fluid/tests/unittests/test_cross_entropy_op.py +++ b/python/paddle/fluid/tests/unittests/test_cross_entropy_op.py @@ -15,6 +15,7 @@ import unittest import numpy as np from op_test import OpTest, randomize_probability +import paddle.fluid as fluid class TestCrossEntropyOp1(OpTest): @@ -105,5 +106,60 @@ class TestCrossEntropyOp3(OpTest): ["X"], "Y", max_relative_error=0.05, numeric_grad_delta=0.001) +class TestCrossEntropyStable(unittest.TestCase): + def main(self, place): + if isinstance( + place, + fluid.CUDAPlace) and not fluid.core.is_compiled_with_cuda(): + return + + class DataRandom(object): + def __init__(self): + self.random = np.random.RandomState(seed=1) + + def next(self): + return { + 'input': self.random.uniform( + low=-1, high=1, size=(64, 200)).astype('float32'), + 'label': self.random.uniform( + low=0, high=10000, size=(64, 1)).astype('int64'), + } + + losses = [] + for _ in xrange(2): + startup = fluid.Program() + startup.random_seed = 1 + main = fluid.Program() + scope = fluid.core.Scope() + with fluid.scope_guard(scope): + with fluid.program_guard(main, startup): + img = fluid.layers.data('input', shape=[200]) + label = fluid.layers.data('label', shape=[1], dtype='int64') + prediction = fluid.layers.fc(input=img, + size=10000, + act='softmax') + xe = fluid.layers.cross_entropy( + input=prediction, label=label) + loss = fluid.layers.mean(xe) + adam = fluid.optimizer.Adam() + adam.minimize(loss) + + exe = fluid.Executor(place) + exe.run(startup) + data = DataRandom() + for i in xrange(1000): + exe.run(feed=next(data)) + losses.append( + exe.run(feed=next(data), fetch_list=[loss])[0]) + print losses + self.assertAlmostEqual(losses[0][0], losses[1][0]) + + def test_cpu(self): + self.main(fluid.CPUPlace()) + + def test_cuda(self): + self.main(fluid.CUDAPlace(0)) + + if __name__ == "__main__": unittest.main() From 76174ec0e9e81ebb049663dc9abf534a241dc143 Mon Sep 17 00:00:00 2001 From: Yu Yang Date: Sat, 28 Apr 2018 12:49:38 +0800 Subject: [PATCH 06/52] Clean cross entropy and add sync in executor --- paddle/fluid/framework/executor.cc | 3 + paddle/fluid/operators/cross_entropy_op.cc | 10 +- paddle/fluid/operators/cross_entropy_op.cu | 99 +-------------- paddle/fluid/operators/cross_entropy_op.h | 117 ++++++++++++------ .../tests/unittests/test_cross_entropy_op.py | 55 -------- 5 files changed, 92 insertions(+), 192 deletions(-) diff --git 
a/paddle/fluid/framework/executor.cc b/paddle/fluid/framework/executor.cc index 766bf0ab0c..b719568c65 100644 --- a/paddle/fluid/framework/executor.cc +++ b/paddle/fluid/framework/executor.cc @@ -348,6 +348,9 @@ void Executor::RunPreparedContext(ExecutorPrepareContext* ctx, Scope* scope, } } } + + platform::DeviceContextPool::Instance().Get(place_)->Wait(); + if (create_vars && create_local_scope) { scope->DeleteScope(local_scope); } diff --git a/paddle/fluid/operators/cross_entropy_op.cc b/paddle/fluid/operators/cross_entropy_op.cc index 0e0622e290..2b2a9dc831 100644 --- a/paddle/fluid/operators/cross_entropy_op.cc +++ b/paddle/fluid/operators/cross_entropy_op.cc @@ -164,11 +164,13 @@ or not. But the output only shares the LoD information with input X. } // namespace paddle namespace ops = paddle::operators; +using CPUCtx = paddle::platform::CPUDeviceContext; + REGISTER_OPERATOR(cross_entropy, ops::CrossEntropyOp, ops::CrossEntropyOpMaker, paddle::framework::DefaultGradOpDescMaker); REGISTER_OPERATOR(cross_entropy_grad, ops::CrossEntropyGradientOp); -REGISTER_OP_CPU_KERNEL(cross_entropy, ops::CrossEntropyOpKernel, - ops::CrossEntropyOpKernel); +REGISTER_OP_CPU_KERNEL(cross_entropy, ops::CrossEntropyOpKernel, + ops::CrossEntropyOpKernel); REGISTER_OP_CPU_KERNEL(cross_entropy_grad, - ops::CrossEntropyGradientOpKernel, - ops::CrossEntropyGradientOpKernel); + ops::CrossEntropyGradientOpKernel, + ops::CrossEntropyGradientOpKernel); diff --git a/paddle/fluid/operators/cross_entropy_op.cu b/paddle/fluid/operators/cross_entropy_op.cu index 6449149d4b..30dbd5bd3d 100644 --- a/paddle/fluid/operators/cross_entropy_op.cu +++ b/paddle/fluid/operators/cross_entropy_op.cu @@ -14,98 +14,11 @@ limitations under the License. */ #include "paddle/fluid/operators/cross_entropy_op.h" -namespace paddle { -namespace operators { - -namespace { - -template -__global__ void CrossEntropyGradientKernel(T* dX, const T* dY, const T* X, - const int64_t* label, const int N, - const int D) { - for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < N; - i += blockDim.x * gridDim.x) { - int idx = i * D + label[i]; - dX[idx] = -dY[i] / X[idx]; - } -} - -template -__global__ void SoftCrossEntropyGradientKernel(T* dX, const T* dY, const T* X, - const T* label, const int N, - const int D) { - int ids = blockIdx.x * blockDim.x + threadIdx.x; - if (ids < N * D) { - int row_ids = ids / D; - dX[ids] = -label[ids] * dY[row_ids] / X[ids]; - } -} -} // namespace - -template -class CrossEntropyOpCUDAKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - PADDLE_ENFORCE(platform::is_gpu_place(ctx.GetPlace()), - "This kernel only runs on GPU device."); - const Tensor* x = ctx.Input("X"); - const Tensor* label = ctx.Input("Label"); - Tensor* y = ctx.Output("Y"); - y->mutable_data(ctx.GetPlace()); - - math::CrossEntropyFunctor()( - ctx.template device_context(), y, x, label, - ctx.Attr("soft_label")); - } -}; - -template -class CrossEntropyGradientOpCUDAKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - PADDLE_ENFORCE(platform::is_gpu_place(ctx.GetPlace()), - "This kernel only runs on GPU device."); - - const Tensor* x = ctx.Input("X"); - const Tensor* label = ctx.Input("Label"); - Tensor* dx = ctx.Output(framework::GradVarName("X")); - dx->mutable_data(ctx.GetPlace()); - - const T* dy_data = - ctx.Input(framework::GradVarName("Y"))->data(); - T* dx_data = dx->mutable_data(ctx.GetPlace()); - const T* 
x_data = x->data(); - - int64_t batch_size = x->dims()[0]; - int64_t class_num = x->dims()[1]; - - int block = 512; - int grid = (batch_size * class_num + block - 1) / block; - - auto& dev_ctx = ctx.template device_context(); - auto stream = dev_ctx.stream(); - - if (ctx.Attr("soft_label")) { - auto* label_data = label->data(); - SoftCrossEntropyGradientKernel<<>>( - dx_data, dy_data, x_data, label_data, batch_size, class_num); - } else { - math::SetConstant functor; - functor(dev_ctx, dx, 0); - auto* label_data = label->data(); - grid = (batch_size + block - 1) / block; - CrossEntropyGradientKernel<<>>( - dx_data, dy_data, x_data, label_data, batch_size, class_num); - } - } -}; - -} // namespace operators -} // namespace paddle - namespace ops = paddle::operators; -REGISTER_OP_CUDA_KERNEL(cross_entropy, ops::CrossEntropyOpCUDAKernel, - ops::CrossEntropyOpCUDAKernel); +using CUDACtx = paddle::platform::CUDADeviceContext; +REGISTER_OP_CUDA_KERNEL(cross_entropy, + ops::CrossEntropyOpKernel, + ops::CrossEntropyOpKernel); REGISTER_OP_CUDA_KERNEL(cross_entropy_grad, - ops::CrossEntropyGradientOpCUDAKernel, - ops::CrossEntropyGradientOpCUDAKernel); + ops::CrossEntropyGradientOpKernel, + ops::CrossEntropyGradientOpKernel); diff --git a/paddle/fluid/operators/cross_entropy_op.h b/paddle/fluid/operators/cross_entropy_op.h index 6da3a24dc8..822a83712d 100644 --- a/paddle/fluid/operators/cross_entropy_op.h +++ b/paddle/fluid/operators/cross_entropy_op.h @@ -17,69 +17,106 @@ limitations under the License. */ #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/operators/math/cross_entropy.h" #include "paddle/fluid/operators/math/math_function.h" +#include "paddle/fluid/platform/for_range.h" namespace paddle { namespace operators { using Tensor = framework::Tensor; -template -using EigenMatrix = framework::EigenMatrix; -template +template class CrossEntropyOpKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { - PADDLE_ENFORCE(platform::is_cpu_place(ctx.GetPlace()), - "This kernel only runs on CPU."); - const Tensor* x = ctx.Input("X"); - const Tensor* labels = ctx.Input("Label"); - Tensor* y = ctx.Output("Y"); + auto* x = ctx.Input("X"); + auto* labels = ctx.Input("Label"); + auto* y = ctx.Output("Y"); y->mutable_data(ctx.GetPlace()); - math::CrossEntropyFunctor()( - ctx.template device_context(), y, x, labels, + math::CrossEntropyFunctor()( + ctx.template device_context(), y, x, labels, ctx.Attr("soft_label")); } }; template +class XeSoftlabelGradFunctor { + public: + XeSoftlabelGradFunctor(T* dx, + const T* dy, // NOLINT + const T* x, // NOLINT + const T* label, // NOLINT + size_t num_classes) + : dx_(dx), dy_(dy), x_(x), label_(label), num_classes_(num_classes) {} + + HOSTDEVICE void operator()(size_t i) { + auto row_ids = i / num_classes_; + dx_[i] = -label_[i] * dy_[row_ids] / x_[i]; + } + + private: + T* dx_; + const T* dy_; + const T* x_; + const T* label_; + size_t num_classes_; +}; + +template +class XeGradFunctor { + public: + XeGradFunctor(T* dx, + const T* dy, // NOLINT + const T* x, // NOLINT + const int64_t* label, // NOLINT + size_t num_classes) + : dx_(dx), dy_(dy), x_(x), label_(label), num_classes_(num_classes) {} + + HOSTDEVICE void operator()(size_t label_id) { + auto x_is_true_offset = label_id * num_classes_ + label_[label_id]; + for (size_t x_offset = label_id * num_classes_; + x_offset < (label_id + 1) * num_classes_; ++x_offset) { + dx_[x_offset] = x_offset != x_is_true_offset + ? 
static_cast(0) + : -dy_[label_id] / x_[x_offset]; + } + } + + private: + T* dx_; + const T* dy_; + const T* x_; + const int64_t* label_; + size_t num_classes_; +}; + +template class CrossEntropyGradientOpKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { - PADDLE_ENFORCE(platform::is_cpu_place(ctx.GetPlace()), - "This kernel only runs on CPU."); - const Tensor* x = ctx.Input("X"); - const Tensor* dy = ctx.Input(framework::GradVarName("Y")); - const Tensor* label = ctx.Input("Label"); - Tensor* dx = ctx.Output(framework::GradVarName("X")); - T* dx_data = dx->mutable_data(ctx.GetPlace()); + auto* x = ctx.Input("X"); + auto* dy = ctx.Input(framework::GradVarName("Y")); + auto* label = ctx.Input("Label"); + auto* dx = ctx.Output(framework::GradVarName("X")); + auto* dx_data = dx->mutable_data(ctx.GetPlace()); int64_t class_num = x->dims()[1]; if (ctx.Attr("soft_label")) { - auto x_mat = EigenMatrix::From(*x); - auto dy_mat = EigenMatrix::From(*dy); - auto lbl_mat = EigenMatrix::From(*label); - auto dx_mat = EigenMatrix::From(*dx); - - dx_mat.device(*ctx.template device_context() - .eigen_device()) = - -(lbl_mat * - dy_mat.broadcast(Eigen::DSizes(1, class_num)) / x_mat); + XeSoftlabelGradFunctor functor(dx_data, dy->data(), x->data(), + label->data(), + static_cast(class_num)); + platform::ForRange for_range( + ctx.template device_context(), + static_cast(dx->numel())); + for_range(functor); } else { - int64_t batch_size = x->dims()[0]; - const T* dy_data = dy->data(); - const T* x_data = x->data(); - const int64_t* label_data = label->data(); - - math::SetConstant functor; - functor(ctx.template device_context(), dx, 0); - - for (int64_t i = 0; i < batch_size; ++i) { - PADDLE_ASSERT(label_data[i] >= 0 || label_data[i] < class_num); - int64_t index = i * class_num + label_data[i]; - dx_data[index] = math::TolerableValue()(-dy_data[i] / x_data[index]); - } + XeGradFunctor functor(dx_data, dy->data(), x->data(), + label->data(), + static_cast(class_num)); + platform::ForRange for_range( + ctx.template device_context(), + static_cast(dy->numel())); + for_range(functor); } } }; diff --git a/python/paddle/fluid/tests/unittests/test_cross_entropy_op.py b/python/paddle/fluid/tests/unittests/test_cross_entropy_op.py index c8e5bd1a8f..25dde7b334 100644 --- a/python/paddle/fluid/tests/unittests/test_cross_entropy_op.py +++ b/python/paddle/fluid/tests/unittests/test_cross_entropy_op.py @@ -106,60 +106,5 @@ class TestCrossEntropyOp3(OpTest): ["X"], "Y", max_relative_error=0.05, numeric_grad_delta=0.001) -class TestCrossEntropyStable(unittest.TestCase): - def main(self, place): - if isinstance( - place, - fluid.CUDAPlace) and not fluid.core.is_compiled_with_cuda(): - return - - class DataRandom(object): - def __init__(self): - self.random = np.random.RandomState(seed=1) - - def next(self): - return { - 'input': self.random.uniform( - low=-1, high=1, size=(64, 200)).astype('float32'), - 'label': self.random.uniform( - low=0, high=10000, size=(64, 1)).astype('int64'), - } - - losses = [] - for _ in xrange(2): - startup = fluid.Program() - startup.random_seed = 1 - main = fluid.Program() - scope = fluid.core.Scope() - with fluid.scope_guard(scope): - with fluid.program_guard(main, startup): - img = fluid.layers.data('input', shape=[200]) - label = fluid.layers.data('label', shape=[1], dtype='int64') - prediction = fluid.layers.fc(input=img, - size=10000, - act='softmax') - xe = fluid.layers.cross_entropy( - input=prediction, label=label) - loss = 
fluid.layers.mean(xe) - adam = fluid.optimizer.Adam() - adam.minimize(loss) - - exe = fluid.Executor(place) - exe.run(startup) - data = DataRandom() - for i in xrange(1000): - exe.run(feed=next(data)) - losses.append( - exe.run(feed=next(data), fetch_list=[loss])[0]) - print losses - self.assertAlmostEqual(losses[0][0], losses[1][0]) - - def test_cpu(self): - self.main(fluid.CPUPlace()) - - def test_cuda(self): - self.main(fluid.CUDAPlace(0)) - - if __name__ == "__main__": unittest.main() From 6c184104873f2e6137434c148f51a9f8f94b6ada Mon Sep 17 00:00:00 2001 From: Yu Yang Date: Sat, 28 Apr 2018 12:55:53 +0800 Subject: [PATCH 07/52] Revert code to develop --- paddle/fluid/framework/executor.cc | 3 --- python/paddle/fluid/tests/unittests/test_cross_entropy_op.py | 1 - 2 files changed, 4 deletions(-) diff --git a/paddle/fluid/framework/executor.cc b/paddle/fluid/framework/executor.cc index b719568c65..766bf0ab0c 100644 --- a/paddle/fluid/framework/executor.cc +++ b/paddle/fluid/framework/executor.cc @@ -348,9 +348,6 @@ void Executor::RunPreparedContext(ExecutorPrepareContext* ctx, Scope* scope, } } } - - platform::DeviceContextPool::Instance().Get(place_)->Wait(); - if (create_vars && create_local_scope) { scope->DeleteScope(local_scope); } diff --git a/python/paddle/fluid/tests/unittests/test_cross_entropy_op.py b/python/paddle/fluid/tests/unittests/test_cross_entropy_op.py index 25dde7b334..c5b9e92d69 100644 --- a/python/paddle/fluid/tests/unittests/test_cross_entropy_op.py +++ b/python/paddle/fluid/tests/unittests/test_cross_entropy_op.py @@ -15,7 +15,6 @@ import unittest import numpy as np from op_test import OpTest, randomize_probability -import paddle.fluid as fluid class TestCrossEntropyOp1(OpTest): From c888e01660ff1258352a537521d0c725d091e6df Mon Sep 17 00:00:00 2001 From: Yu Yang Date: Sat, 28 Apr 2018 17:21:29 +0800 Subject: [PATCH 08/52] Refactor GEMM in blas --- .../operators/bilinear_tensor_product_op.h | 23 ++- paddle/fluid/operators/gru_unit_op.h | 52 +++--- paddle/fluid/operators/math/blas_impl.cu.h | 145 ++++++++++++++++ paddle/fluid/operators/math/blas_impl.h | 68 ++++++++ paddle/fluid/operators/math/gru_compute.cc | 50 +++--- paddle/fluid/operators/math/gru_compute.cu | 51 +++--- paddle/fluid/operators/math/math_function.cc | 82 +-------- paddle/fluid/operators/math/math_function.cu | 163 +----------------- paddle/fluid/operators/math/math_function.h | 53 +++++- paddle/fluid/operators/math/matmul.h | 5 +- 10 files changed, 357 insertions(+), 335 deletions(-) create mode 100644 paddle/fluid/operators/math/blas_impl.cu.h create mode 100644 paddle/fluid/operators/math/blas_impl.h diff --git a/paddle/fluid/operators/bilinear_tensor_product_op.h b/paddle/fluid/operators/bilinear_tensor_product_op.h index ca80e6085c..7191711a73 100644 --- a/paddle/fluid/operators/bilinear_tensor_product_op.h +++ b/paddle/fluid/operators/bilinear_tensor_product_op.h @@ -61,9 +61,9 @@ class BilinearTensorProductKernel : public framework::OpKernel { auto output_col_vec = output_mat.chip(i, 1); Tensor weight_mat = weight->Slice(i, i + 1).Resize(framework::make_ddim({x_dim, y_dim})); - math::gemm(dev_ctx, CblasNoTrans, CblasNoTrans, - batch_size, y_dim, x_dim, 1, x->data(), - weight_mat.data(), 0, left_mul.data()); + math::GetBlas(dev_ctx).GEMM( + CblasNoTrans, CblasNoTrans, batch_size, y_dim, x_dim, 1, x->data(), + weight_mat.data(), 0, left_mul.data()); output_col_vec.device(place) = (left_mul_mat * y_mat).sum(Eigen::DSizes(1)); } @@ -125,6 +125,8 @@ class BilinearTensorProductGradKernel : public 
framework::OpKernel { set_zero(dev_ctx, d_y, static_cast(0)); } + auto blas = math::GetBlas(ctx); + // Caculate the Output(X@Grad) and Output(Y@Grad). if (d_x || d_y) { Eigen::DSizes bcast_for_x(1, y_dim); @@ -138,18 +140,16 @@ class BilinearTensorProductGradKernel : public framework::OpKernel { output_vec.reshape(Eigen::DSizes(batch_size, 1)) .broadcast(bcast_for_x) * y_mat; - math::gemm( - dev_ctx, CblasNoTrans, CblasTrans, batch_size, x_dim, y_dim, 1, - y_scale.data(), weight_i.data(), 1, d_x->data()); + blas.GEMM(CblasNoTrans, CblasTrans, batch_size, x_dim, y_dim, 1, + y_scale.data(), weight_i.data(), 1, d_x->data()); } if (d_y) { x_scale_mat.device(place) = output_vec.reshape(Eigen::DSizes(batch_size, 1)) .broadcast(bcast_for_y) * x_mat; - math::gemm( - dev_ctx, CblasNoTrans, CblasNoTrans, batch_size, y_dim, x_dim, 1, - x_scale.data(), weight_i.data(), 1, d_y->data()); + blas.GEMM(CblasNoTrans, CblasNoTrans, batch_size, y_dim, x_dim, 1, + x_scale.data(), weight_i.data(), 1, d_y->data()); } } } @@ -166,9 +166,8 @@ class BilinearTensorProductGradKernel : public framework::OpKernel { output_vec.reshape(Eigen::DSizes(batch_size, 1)) .broadcast(bcast_for_weight) * x_mat; - math::gemm(dev_ctx, CblasTrans, CblasNoTrans, x_dim, - y_dim, batch_size, 1, x_scale.data(), - y->data(), 0, d_weight_i.data()); + blas.GEMM(CblasTrans, CblasNoTrans, x_dim, y_dim, batch_size, 1, + x_scale.data(), y->data(), 0, d_weight_i.data()); } } diff --git a/paddle/fluid/operators/gru_unit_op.h b/paddle/fluid/operators/gru_unit_op.h index 15d91ca305..49e657a272 100644 --- a/paddle/fluid/operators/gru_unit_op.h +++ b/paddle/fluid/operators/gru_unit_op.h @@ -87,10 +87,10 @@ class GRUUnitKernel : public framework::OpKernel { const T* weight_data = weight->data(); T* gate_data = gate->data(); T* reset_hidden_prev_data = reset_hidden_prev->data(); - math::gemm( - context.template device_context(), false, false, - batch_size, 2 * frame_size, frame_size, 1, hidden_prev_data, frame_size, - weight_data, frame_size * 2, 1, gate_data, frame_size * 3); + auto blas = math::GetBlas(context); + blas.GEMM(false, false, batch_size, 2 * frame_size, frame_size, 1, + hidden_prev_data, frame_size, weight_data, frame_size * 2, 1, + gate_data, frame_size * 3); // calculate activited gate Eigen::array extents({{batch_size, frame_size}}); @@ -103,11 +103,10 @@ class GRUUnitKernel : public framework::OpKernel { g.slice(r_offsets, extents), g.slice(r_offsets, extents)); auto r = g.slice(r_offsets, extents); // reset gate r_h_p.device(place) = r * h_p; // reset previous hidden state - math::gemm( - context.template device_context(), false, false, - batch_size, frame_size, frame_size, 1, reset_hidden_prev_data, - frame_size, weight_data + frame_size * frame_size * 2, frame_size, 1, - gate_data + frame_size * 2, frame_size * 3); + blas.GEMM(false, false, batch_size, frame_size, frame_size, 1, + reset_hidden_prev_data, frame_size, + weight_data + frame_size * frame_size * 2, frame_size, 1, + gate_data + frame_size * 2, frame_size * 3); Eigen::array c_offsets({{0, frame_size * 2}}); ActCompute(context.Attr("activation"), place, @@ -188,11 +187,11 @@ class GRUUnitGradKernel : public framework::OpKernel { ActGradCompute(context.Attr("activation"), place, c, c, d_g.slice(c_offsets, extents), d_h * u); // backward for reset_hidden_prev - math::gemm( - context.template device_context(), false, true, - batch_size, frame_size, frame_size, 1, gate_grad_data + frame_size * 2, - frame_size * 3, weight_data + frame_size * frame_size * 2, frame_size, - 0, 
reset_hidden_prev_grad_data, frame_size); + auto blas = math::GetBlas(context); + blas.GEMM(false, true, batch_size, frame_size, frame_size, 1, + gate_grad_data + frame_size * 2, frame_size * 3, + weight_data + frame_size * frame_size * 2, frame_size, 0, + reset_hidden_prev_grad_data, frame_size); // backward for unactivated reset gate ActGradCompute(context.Attr("gate_activation"), place, r, r, d_g.slice(r_offsets, extents), d_r_h_p * h_p); @@ -200,18 +199,15 @@ class GRUUnitGradKernel : public framework::OpKernel { if (weight_grad) { T* weight_grad_data = weight_grad->mutable_data(context.GetPlace()); // backward for state_weight - math::gemm( - context.template device_context(), true, false, - frame_size, frame_size, batch_size, 1, reset_hidden_prev_data, - frame_size, gate_grad_data + frame_size * 2, frame_size * 3, 0, - weight_grad_data + frame_size * frame_size * 2, frame_size); + blas.GEMM(true, false, frame_size, frame_size, batch_size, 1, + reset_hidden_prev_data, frame_size, + gate_grad_data + frame_size * 2, frame_size * 3, 0, + weight_grad_data + frame_size * frame_size * 2, frame_size); // backward for update_gate_weight and reset_gate_weight - math::gemm( - context.template device_context(), true, false, - frame_size, frame_size * 2, batch_size, 1, hidden_prev_data, - frame_size, gate_grad_data, frame_size * 3, 0, weight_grad_data, - frame_size * 2); + blas.GEMM(true, false, frame_size, frame_size * 2, batch_size, 1, + hidden_prev_data, frame_size, gate_grad_data, frame_size * 3, 0, + weight_grad_data, frame_size * 2); } // backward for hidden_prev if (hidden_prev_grad) { @@ -219,11 +215,9 @@ class GRUUnitGradKernel : public framework::OpKernel { hidden_prev_grad->mutable_data(context.GetPlace()); auto d_h_p = EigenMatrix::From(*hidden_prev_grad); d_h_p.device(place) = d_r_h_p * r + d_h * (u.constant(T(1)) - u); - math::gemm( - context.template device_context(), false, true, - batch_size, frame_size, frame_size * 2, 1, gate_grad_data, - frame_size * 3, weight_data, frame_size * 2, 1, hidden_prev_grad_data, - frame_size); + blas.GEMM(false, true, batch_size, frame_size, frame_size * 2, 1, + gate_grad_data, frame_size * 3, weight_data, frame_size * 2, 1, + hidden_prev_grad_data, frame_size); } // backward for input if (input_grad) { diff --git a/paddle/fluid/operators/math/blas_impl.cu.h b/paddle/fluid/operators/math/blas_impl.cu.h new file mode 100644 index 0000000000..b7bd8f1d04 --- /dev/null +++ b/paddle/fluid/operators/math/blas_impl.cu.h @@ -0,0 +1,145 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "paddle/fluid/operators/math/math_function.h" +#include "paddle/fluid/platform/dynload/cublas.h" + +namespace paddle { +namespace operators { +namespace math { + +template +struct CUBlas; + +template <> +struct CUBlas { + template + static void GEMM(ARGS... 
args) { + PADDLE_ENFORCE(platform::dynload::cublasSgemm(args...)); + } +}; + +template <> +struct CUBlas { + template + static void GEMM(ARGS... args) { + PADDLE_ENFORCE(platform::dynload::cublasDgemm(args...)); + } +}; + +template <> +struct CUBlas { + template + static void GEMM(ARGS... args) { + PADDLE_ENFORCE(platform::dynload::cublasHgemm(args...)); + } +}; + +template <> +template +void Blas::GEMM(const CBLAS_TRANSPOSE transA, + const CBLAS_TRANSPOSE transB, + const int M, const int N, + const int K, const T alpha, + const T *A, const T *B, + const T beta, T *C) const { + // Note that cublas follows fortran order, so the order is different from + // the cblas convention. + int lda = (transA == CblasNoTrans) ? K : M; + int ldb = (transB == CblasNoTrans) ? N : K; + cublasOperation_t cuTransA = + (transA == CblasNoTrans) ? CUBLAS_OP_N : CUBLAS_OP_T; + cublasOperation_t cuTransB = + (transB == CblasNoTrans) ? CUBLAS_OP_N : CUBLAS_OP_T; + + CUBlas::GEMM(context_.cublas_handle(), cuTransB, cuTransA, N, M, K, &alpha, + B, ldb, A, lda, &beta, C, N); +} + +template <> +template <> +inline void Blas::GEMM( + const CBLAS_TRANSPOSE transA, const CBLAS_TRANSPOSE transB, const int M, + const int N, const int K, const platform::float16 alpha, + const platform::float16 *A, const platform::float16 *B, + const platform::float16 beta, platform::float16 *C) const { + // Note that cublas follows fortran order, so the order is different from + // the cblas convention. + int lda = (transA == CblasNoTrans) ? K : M; + int ldb = (transB == CblasNoTrans) ? N : K; + cublasOperation_t cuTransA = + (transA == CblasNoTrans) ? CUBLAS_OP_N : CUBLAS_OP_T; + cublasOperation_t cuTransB = + (transB == CblasNoTrans) ? CUBLAS_OP_N : CUBLAS_OP_T; + + // TODO(kexinzhao): add processing code for compute capability < 53 case + PADDLE_ENFORCE_GE(context_.GetComputeCapability(), 53, + "cublas fp16 gemm requires GPU compute capability >= 53"); + +#if CUDA_VERSION >= 8000 + float h_alpha = static_cast(alpha); + float h_beta = static_cast(beta); + + cublasGemmAlgo_t algo = CUBLAS_GEMM_DFALT; +#if CUDA_VERSION >= 9000 + if (context_.GetComputeCapability() >= 70) { + PADDLE_ENFORCE(platform::dynload::cublasSetMathMode( + context_.cublas_handle(), CUBLAS_TENSOR_OP_MATH)); + algo = CUBLAS_GEMM_DFALT_TENSOR_OP; + } else { + PADDLE_ENFORCE(platform::dynload::cublasSetMathMode( + context_.cublas_handle(), CUBLAS_DEFAULT_MATH)); + } +#endif // CUDA_VERSION >= 9000 + + // cublasHgemm does true FP16 computation which is slow for non-Volta + // GPUs. So use cublasGemmEx instead which does pseudo FP16 computation: + // input/output in fp16, computation in fp32, which can also be accelerated + // using tensor cores in volta GPUs.
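// Why the operand order is swapped in the calls above and below: cuBLAS
// assumes column-major storage, while this wrapper exposes a row-major
// (cblas-style) interface. A row-major M x N matrix and a column-major
// N x M matrix share one memory layout, so instead of computing C = A * B
// the wrapper asks cuBLAS for C^T = B^T * A^T, i.e. passes B first, A
// second, and exchanges M and N. Illustrative 2 x 2 values (not from this
// patch): A = [[1,2],[3,4]], B = [[5,6],[7,8]] gives C = [[19,22],[43,50]];
// C's row-major buffer {19,22,43,50}, read column-major, is exactly C^T,
// which is what the swapped call produces.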
+ PADDLE_ENFORCE(platform::dynload::cublasGemmEx( + context_.cublas_handle(), cuTransB, cuTransA, N, M, K, &h_alpha, B, + CUDA_R_16F, ldb, A, CUDA_R_16F, lda, &h_beta, C, CUDA_R_16F, N, + CUDA_R_32F, algo)); +#else + // CUDA 7.5 does not support cublasGemmEx, hence we fall back to use hgemm + const half h_alpha = static_cast(alpha); + const half h_beta = static_cast(beta); + const half *h_A = reinterpret_cast(A); + const half *h_B = reinterpret_cast(B); + half *h_C = reinterpret_cast(C); + + CUBlas::GEMM(context_.cublas_handle(), cuTransB, cuTransA, N, M, + K, &h_alpha, h_B, ldb, h_A, lda, &h_beta, h_C, N); +#endif // CUDA_VERSION >= 8000 +} + +template <> +template +void Blas::GEMM( + const bool transA, const bool transB, const int M, const int N, const int K, + const T alpha, const T *A, const int lda, const T *B, const int ldb, + const T beta, T *C, const int ldc) const { + // Note that cublas follows fortran order, so the order is different from + // the cblas convention. + cublasOperation_t cuTransA = transA == false ? CUBLAS_OP_N : CUBLAS_OP_T; + cublasOperation_t cuTransB = transB == false ? CUBLAS_OP_N : CUBLAS_OP_T; + CUBlas::GEMM(context_.cublas_handle(), cuTransB, cuTransA, N, M, K, &alpha, + B, ldb, A, lda, &beta, C, ldc); +} + +} // namespace math +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/math/blas_impl.h b/paddle/fluid/operators/math/blas_impl.h new file mode 100644 index 0000000000..4934afd8bb --- /dev/null +++ b/paddle/fluid/operators/math/blas_impl.h @@ -0,0 +1,68 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +#pragma once + +#include "paddle/fluid/operators/math/math_function.h" + +namespace paddle { +namespace operators { +namespace math { + +template +struct CBlas; + +template <> +struct CBlas { + static constexpr auto GEMM = cblas_sgemm; +}; + +template <> +struct CBlas { + static constexpr auto GEMM = cblas_dgemm; +}; + +template <> +struct CBlas { + void GEMM(...) { PADDLE_THROW("float16 GEMM not supported on CPU"); } +}; + +template <> +template +void Blas::GEMM(const CBLAS_TRANSPOSE transA, + const CBLAS_TRANSPOSE transB, + const int M, const int N, + const int K, const T alpha, + const T *A, const T *B, + const T beta, T *C) const { + int lda = (transA == CblasNoTrans) ? K : M; + int ldb = (transB == CblasNoTrans) ? N : K; + int ldc = N; + CBlas::GEMM(CblasRowMajor, transA, transB, M, N, K, alpha, A, lda, B, ldb, + beta, C, ldc); +} + +template <> +template +void Blas::GEMM( + const bool transA, const bool transB, const int M, const int N, const int K, + const T alpha, const T *A, const int lda, const T *B, const int ldb, + const T beta, T *C, const int ldc) const { + CBlas::GEMM(CblasRowMajor, transA == false ? CblasNoTrans : CblasTrans, + transB == false ?
CblasNoTrans : CblasTrans, M, N, K, alpha, A, + lda, B, ldb, beta, C, ldc); +} + +} // namespace math +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/math/gru_compute.cc b/paddle/fluid/operators/math/gru_compute.cc index 3f044b7751..d786250271 100644 --- a/paddle/fluid/operators/math/gru_compute.cc +++ b/paddle/fluid/operators/math/gru_compute.cc @@ -25,21 +25,21 @@ struct GRUUnitFunctor { const detail::ActivationType active_node, const detail::ActivationType active_gate) { #ifndef __NVCC__ + auto blas = math::GetBlas(context); if (value.prev_out_value) { - math::gemm( - context, false, false, batch_size, frame_size * 2, frame_size, 1, - value.prev_out_value, frame_size, value.gate_weight, frame_size * 2, - 1, value.gate_value, frame_size * 3); + blas.GEMM(false, false, batch_size, frame_size * 2, frame_size, 1, + value.prev_out_value, frame_size, value.gate_weight, + frame_size * 2, 1, value.gate_value, frame_size * 3); } detail::forward_reset_output(detail::forward::gru_resetOutput(), value, frame_size, batch_size, active_gate); if (value.prev_out_value) { - math::gemm( - context, false, false, batch_size, frame_size, frame_size, 1, - value.reset_output_value, frame_size, value.state_weight, frame_size, - 1, value.gate_value + frame_size * 2, frame_size * 3); + blas.GEMM(false, false, batch_size, frame_size, frame_size, 1, + value.reset_output_value, frame_size, value.state_weight, + frame_size, 1, value.gate_value + frame_size * 2, + frame_size * 3); } detail::forward_final_output(detail::forward::gru_finalOutput(), value, @@ -58,36 +58,32 @@ struct GRUUnitGradFunctor { #ifndef __NVCC__ detail::backward_state_grad(detail::backward::gru_stateGrad(), value, grad, frame_size, batch_size, active_node); - + auto blas = math::GetBlas(context); if (value.prev_out_value && grad.prev_out_grad) { - math::gemm( - context, false, true, batch_size, frame_size, frame_size, 1, - grad.gate_grad + frame_size * 2, frame_size * 3, value.state_weight, - frame_size, 0, grad.reset_output_grad, frame_size); + blas.GEMM(false, true, batch_size, frame_size, frame_size, 1, + grad.gate_grad + frame_size * 2, frame_size * 3, + value.state_weight, frame_size, 0, grad.reset_output_grad, + frame_size); if (grad.state_weight_grad) { - math::gemm( - context, true, false, frame_size, frame_size, batch_size, 1, - value.reset_output_value, frame_size, - grad.gate_grad + frame_size * 2, frame_size * 3, 1, - grad.state_weight_grad, frame_size); + blas.GEMM(true, false, frame_size, frame_size, batch_size, 1, + value.reset_output_value, frame_size, + grad.gate_grad + frame_size * 2, frame_size * 3, 1, + grad.state_weight_grad, frame_size); } } detail::backward_reset_grad(detail::backward::gru_resetGrad(), value, grad, frame_size, batch_size, active_gate); - if (grad.prev_out_grad && value.prev_out_value) { - math::gemm( - context, false, true, batch_size, frame_size, frame_size * 2, 1, - grad.gate_grad, frame_size * 3, value.gate_weight, frame_size * 2, 1, - grad.prev_out_grad, frame_size); + blas.GEMM(false, true, batch_size, frame_size, frame_size * 2, 1, + grad.gate_grad, frame_size * 3, value.gate_weight, + frame_size * 2, 1, grad.prev_out_grad, frame_size); if (grad.gate_weight_grad) { - math::gemm( - context, true, false, frame_size, frame_size * 2, batch_size, 1, - value.prev_out_value, frame_size, grad.gate_grad, frame_size * 3, 1, - grad.gate_weight_grad, frame_size * 2); + blas.GEMM(true, false, frame_size, frame_size * 2, batch_size, 1, + value.prev_out_value, frame_size, 
grad.gate_grad, + frame_size * 3, 1, grad.gate_weight_grad, frame_size * 2); } } #endif diff --git a/paddle/fluid/operators/math/gru_compute.cu b/paddle/fluid/operators/math/gru_compute.cu index 27caf3383d..f26bec4109 100644 --- a/paddle/fluid/operators/math/gru_compute.cu +++ b/paddle/fluid/operators/math/gru_compute.cu @@ -9,6 +9,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ +#include #include "paddle/fluid/operators/math/detail/gru_gpu_kernel.h" #include "paddle/fluid/operators/math/detail/gru_kernel.h" #include "paddle/fluid/operators/math/gru_compute.h" @@ -36,12 +37,11 @@ struct GRUUnitFunctor { threads = dim3(32, 32); grid = dim3((frame_size + 32 - 1) / 32, (batch_size + 32 - 1) / 32); } - + auto blas = math::GetBlas(context); if (value.prev_out_value) { - math::gemm( - context, false, false, batch_size, frame_size * 2, frame_size, 1, - value.prev_out_value, frame_size, value.gate_weight, frame_size * 2, - 1, value.gate_value, frame_size * 3); + blas.GEMM(false, false, batch_size, frame_size * 2, frame_size, 1, + value.prev_out_value, frame_size, value.gate_weight, + frame_size * 2, 1, value.gate_value, frame_size * 3); } if (batch_size == 1) { @@ -61,10 +61,10 @@ struct GRUUnitFunctor { } if (value.prev_out_value) { - math::gemm( - context, false, false, batch_size, frame_size, frame_size, 1, - value.reset_output_value, frame_size, value.state_weight, frame_size, - 1, value.gate_value + frame_size * 2, frame_size * 3); + blas.GEMM(false, false, batch_size, frame_size, frame_size, 1, + value.reset_output_value, frame_size, value.state_weight, + frame_size, 1, value.gate_value + frame_size * 2, + frame_size * 3); } if (batch_size == 1) { @@ -121,18 +121,19 @@ struct GRUUnitGradFunctor { grad.output_grad, frame_size, batch_size, active_node); } + auto blas = math::GetBlas(context); + if (value.prev_out_value && grad.prev_out_grad) { - math::gemm( - context, false, true, batch_size, frame_size, frame_size, 1, - grad.gate_grad + frame_size * 2, frame_size * 3, value.state_weight, - frame_size, 0, grad.reset_output_grad, frame_size); + blas.GEMM(false, true, batch_size, frame_size, frame_size, 1, + grad.gate_grad + frame_size * 2, frame_size * 3, + value.state_weight, frame_size, 0, grad.reset_output_grad, + frame_size); if (grad.state_weight_grad) { - math::gemm( - context, true, false, frame_size, frame_size, batch_size, 1, - value.reset_output_value, frame_size, - grad.gate_grad + frame_size * 2, frame_size * 3, 1, - grad.state_weight_grad, frame_size); + blas.GEMM(true, false, frame_size, frame_size, batch_size, 1, + value.reset_output_value, frame_size, + grad.gate_grad + frame_size * 2, frame_size * 3, 1, + grad.state_weight_grad, frame_size); } } @@ -153,16 +154,14 @@ struct GRUUnitGradFunctor { } if (grad.prev_out_grad && value.prev_out_value) { - math::gemm( - context, false, true, batch_size, frame_size, frame_size * 2, 1, - grad.gate_grad, frame_size * 3, value.gate_weight, frame_size * 2, 1, - grad.prev_out_grad, frame_size); + blas.GEMM(false, true, batch_size, frame_size, frame_size * 2, 1, + grad.gate_grad, frame_size * 3, value.gate_weight, + frame_size * 2, 1, grad.prev_out_grad, frame_size); if (grad.gate_weight_grad) { - math::gemm( - context, true, false, frame_size, frame_size * 2, batch_size, 1, - value.prev_out_value, frame_size, grad.gate_grad, frame_size * 3, 1, - grad.gate_weight_grad, frame_size * 2); + blas.GEMM(true, false, 
frame_size, frame_size * 2, batch_size, 1, + value.prev_out_value, frame_size, grad.gate_grad, + frame_size * 3, 1, grad.gate_weight_grad, frame_size * 2); } } } diff --git a/paddle/fluid/operators/math/math_function.cc b/paddle/fluid/operators/math/math_function.cc index b5ae41c8f9..b63676f961 100644 --- a/paddle/fluid/operators/math/math_function.cc +++ b/paddle/fluid/operators/math/math_function.cc @@ -24,72 +24,6 @@ namespace math { using float16 = paddle::platform::float16; -template <> -void gemm( - const platform::CPUDeviceContext& context, const CBLAS_TRANSPOSE transA, - const CBLAS_TRANSPOSE transB, const int M, const int N, const int K, - const float16 alpha, const float16* A, const float16* B, const float16 beta, - float16* C) { - PADDLE_THROW("float16 GEMM not supported on CPU"); -} - -template <> -void gemm( - const platform::CPUDeviceContext& context, const CBLAS_TRANSPOSE transA, - const CBLAS_TRANSPOSE transB, const int M, const int N, const int K, - const float alpha, const float* A, const float* B, const float beta, - float* C) { - int lda = (transA == CblasNoTrans) ? K : M; - int ldb = (transB == CblasNoTrans) ? N : K; - int ldc = N; - cblas_sgemm(CblasRowMajor, transA, transB, M, N, K, alpha, A, lda, B, ldb, - beta, C, ldc); -} - -template <> -void gemm( - const platform::CPUDeviceContext& context, const CBLAS_TRANSPOSE transA, - const CBLAS_TRANSPOSE transB, const int M, const int N, const int K, - const double alpha, const double* A, const double* B, const double beta, - double* C) { - int lda = (transA == CblasNoTrans) ? K : M; - int ldb = (transB == CblasNoTrans) ? N : K; - int ldc = N; - cblas_dgemm(CblasRowMajor, transA, transB, M, N, K, alpha, A, lda, B, ldb, - beta, C, ldc); -} - -template <> -void gemm( - const platform::CPUDeviceContext& context, const bool transA, - const bool transB, const int M, const int N, const int K, - const float16 alpha, const float16* A, const int lda, const float16* B, - const int ldb, const float16 beta, float16* C, const int ldc) { - PADDLE_THROW("float16 GEMM not supported on CPU"); -} - -template <> -void gemm( - const platform::CPUDeviceContext& context, const bool transA, - const bool transB, const int M, const int N, const int K, const float alpha, - const float* A, const int lda, const float* B, const int ldb, - const float beta, float* C, const int ldc) { - cblas_sgemm(CblasRowMajor, transA == false ? CblasNoTrans : CblasTrans, - transB == false ? CblasNoTrans : CblasTrans, M, N, K, alpha, A, - lda, B, ldb, beta, C, ldc); -} - -template <> -void gemm( - const platform::CPUDeviceContext& context, const bool transA, - const bool transB, const int M, const int N, const int K, - const double alpha, const double* A, const int lda, const double* B, - const int ldb, const double beta, double* C, const int ldc) { - cblas_dgemm(CblasRowMajor, transA == false ? CblasNoTrans : CblasTrans, - transB == false ? CblasNoTrans : CblasTrans, M, N, K, alpha, A, - lda, B, ldb, beta, C, ldc); -} - template <> void matmul( const platform::CPUDeviceContext& context, @@ -123,8 +57,8 @@ void matmul( CBLAS_TRANSPOSE transA = (trans_a == false) ? CblasNoTrans : CblasTrans; CBLAS_TRANSPOSE transB = (trans_b == false) ? CblasNoTrans : CblasTrans; - gemm( - context, transA, transB, M, N, K, alpha, matrix_a.data(), + Blas(context).GEMM( + transA, transB, M, N, K, alpha, matrix_a.data(), matrix_b.data(), beta, matrix_out->data()); } @@ -152,8 +86,8 @@ void matmul( CBLAS_TRANSPOSE transA = (trans_a == false) ? 
CblasNoTrans : CblasTrans; CBLAS_TRANSPOSE transB = (trans_b == false) ? CblasNoTrans : CblasTrans; - gemm( - context, transA, transB, M, N, K, alpha, matrix_a.data(), + Blas(context).GEMM( + transA, transB, M, N, K, alpha, matrix_a.data(), matrix_b.data(), beta, matrix_out->data()); } @@ -230,8 +164,8 @@ void batched_gemm( const float* Ak = &A[k * strideA]; const float* Bk = &B[k * strideB]; float* Ck = &C[k * M * N]; - gemm(context, transA, transB, M, N, K, - alpha, Ak, Bk, beta, Ck); + Blas(context).GEMM(transA, transB, M, N, K, + alpha, Ak, Bk, beta, Ck); } } @@ -246,8 +180,8 @@ void batched_gemm( const double* Ak = &A[k * strideA]; const double* Bk = &B[k * strideB]; double* Ck = &C[k * M * N]; - gemm(context, transA, transB, M, N, K, - alpha, Ak, Bk, beta, Ck); + Blas(context).GEMM(transA, transB, M, N, K, + alpha, Ak, Bk, beta, Ck); } } #endif diff --git a/paddle/fluid/operators/math/math_function.cu b/paddle/fluid/operators/math/math_function.cu index 2aa819625e..7bf816ac19 100644 --- a/paddle/fluid/operators/math/math_function.cu +++ b/paddle/fluid/operators/math/math_function.cu @@ -25,157 +25,6 @@ namespace math { using float16 = paddle::platform::float16; -template <> -void gemm( - const platform::CUDADeviceContext& context, const CBLAS_TRANSPOSE transA, - const CBLAS_TRANSPOSE transB, const int M, const int N, const int K, - const float16 alpha, const float16* A, const float16* B, const float16 beta, - float16* C) { - // Note that cublas follows fortran order, so the order is different from - // the cblas convention. - int lda = (transA == CblasNoTrans) ? K : M; - int ldb = (transB == CblasNoTrans) ? N : K; - cublasOperation_t cuTransA = - (transA == CblasNoTrans) ? CUBLAS_OP_N : CUBLAS_OP_T; - cublasOperation_t cuTransB = - (transB == CblasNoTrans) ? CUBLAS_OP_N : CUBLAS_OP_T; - - // TODO(kexinzhao): add processing code for compute capability < 53 case - PADDLE_ENFORCE_GE(context.GetComputeCapability(), 53, - "cublas fp16 gemm requires GPU compute capability >= 53"); - -#if CUDA_VERSION >= 8000 - float h_alpha = static_cast(alpha); - float h_beta = static_cast(beta); - - cublasGemmAlgo_t algo = CUBLAS_GEMM_DFALT; -#if CUDA_VERSION >= 9000 - if (context.GetComputeCapability() >= 70) { - PADDLE_ENFORCE(platform::dynload::cublasSetMathMode(context.cublas_handle(), - CUBLAS_TENSOR_OP_MATH)); - algo = CUBLAS_GEMM_DFALT_TENSOR_OP; - } else { - PADDLE_ENFORCE(platform::dynload::cublasSetMathMode(context.cublas_handle(), - CUBLAS_DEFAULT_MATH)); - } -#endif // CUDA_VERSION >= 9000 - - // cublasHgemm does true FP16 computation which is slow for non-Volta - // GPUs. So use cublasGemmEx instead which does pesudo FP16 computation: - // input/output in fp16, computation in fp32, which can also be accelerated - // using tensor cores in volta GPUs. 
- PADDLE_ENFORCE(platform::dynload::cublasGemmEx( - context.cublas_handle(), cuTransB, cuTransA, N, M, K, &h_alpha, B, - CUDA_R_16F, ldb, A, CUDA_R_16F, lda, &h_beta, C, CUDA_R_16F, N, - CUDA_R_32F, algo)); -#else - // CUDA 7.5 does not support cublasGemmEx, hence we fall back to use hgemm - const half h_alpha = static_cast(alpha); - const half h_beta = static_cast(beta); - const half* h_A = reinterpret_cast(A); - const half* h_B = reinterpret_cast(B); - half* h_C = reinterpret_cast(C); - - PADDLE_ENFORCE(platform::dynload::cublasHgemm( - context.cublas_handle(), cuTransB, cuTransA, N, M, K, &h_alpha, h_B, ldb, - h_A, lda, &h_beta, h_C, N)); -#endif // CUDA_VERSION >= 8000 -} - -template <> -void gemm( - const platform::CUDADeviceContext& context, const CBLAS_TRANSPOSE transA, - const CBLAS_TRANSPOSE transB, const int M, const int N, const int K, - const float alpha, const float* A, const float* B, const float beta, - float* C) { - // Note that cublas follows fortran order, so the order is different from - // the cblas convention. - int lda = (transA == CblasNoTrans) ? K : M; - int ldb = (transB == CblasNoTrans) ? N : K; - cublasOperation_t cuTransA = - (transA == CblasNoTrans) ? CUBLAS_OP_N : CUBLAS_OP_T; - cublasOperation_t cuTransB = - (transB == CblasNoTrans) ? CUBLAS_OP_N : CUBLAS_OP_T; - - PADDLE_ENFORCE(platform::dynload::cublasSgemm( - context.cublas_handle(), cuTransB, cuTransA, N, M, K, &alpha, B, ldb, A, - lda, &beta, C, N)); -} - -template <> -void gemm( - const platform::CUDADeviceContext& context, const CBLAS_TRANSPOSE transA, - const CBLAS_TRANSPOSE transB, const int M, const int N, const int K, - const double alpha, const double* A, const double* B, const double beta, - double* C) { - // Note that cublas follows fortran order, so the order is different from - // the cblas convention. - int lda = (transA == CblasNoTrans) ? K : M; - int ldb = (transB == CblasNoTrans) ? N : K; - cublasOperation_t cuTransA = - (transA == CblasNoTrans) ? CUBLAS_OP_N : CUBLAS_OP_T; - cublasOperation_t cuTransB = - (transB == CblasNoTrans) ? CUBLAS_OP_N : CUBLAS_OP_T; - PADDLE_ENFORCE(platform::dynload::cublasDgemm( - context.cublas_handle(), cuTransB, cuTransA, N, M, K, &alpha, B, ldb, A, - lda, &beta, C, N)); -} - -template <> -void gemm( - const platform::CUDADeviceContext& context, const bool transA, - const bool transB, const int M, const int N, const int K, - const float16 alpha, const float16* A, const int lda, const float16* B, - const int ldb, const float16 beta, float16* C, const int ldc) { - // Note that cublas follows fortran order, so the order is different from - // the cblas convention. - cublasOperation_t cuTransA = transA == false ? CUBLAS_OP_N : CUBLAS_OP_T; - cublasOperation_t cuTransB = transB == false ? 
CUBLAS_OP_N : CUBLAS_OP_T; - - const half h_alpha = static_cast(alpha); - const half h_beta = static_cast(beta); - const half* h_A = reinterpret_cast(A); - const half* h_B = reinterpret_cast(B); - half* h_C = reinterpret_cast(C); - - // TODO(kexinzhao): add processing code for compute capability < 53 case - PADDLE_ENFORCE_GE(context.GetComputeCapability(), 53, - "cublas Hgemm requires GPU compute capability >= 53"); - PADDLE_ENFORCE(platform::dynload::cublasHgemm( - context.cublas_handle(), cuTransB, cuTransA, N, M, K, &h_alpha, h_B, ldb, - h_A, lda, &h_beta, h_C, ldc)); -} - -template <> -void gemm( - const platform::CUDADeviceContext& context, const bool transA, - const bool transB, const int M, const int N, const int K, const float alpha, - const float* A, const int lda, const float* B, const int ldb, - const float beta, float* C, const int ldc) { - // Note that cublas follows fortran order, so the order is different from - // the cblas convention. - cublasOperation_t cuTransA = transA == false ? CUBLAS_OP_N : CUBLAS_OP_T; - cublasOperation_t cuTransB = transB == false ? CUBLAS_OP_N : CUBLAS_OP_T; - PADDLE_ENFORCE(platform::dynload::cublasSgemm( - context.cublas_handle(), cuTransB, cuTransA, N, M, K, &alpha, B, ldb, A, - lda, &beta, C, ldc)); -} - -template <> -void gemm( - const platform::CUDADeviceContext& context, const bool transA, - const bool transB, const int M, const int N, const int K, - const double alpha, const double* A, const int lda, const double* B, - const int ldb, const double beta, double* C, const int ldc) { - // Note that cublas follows fortran order, so the order is different from - // the cblas convention. - cublasOperation_t cuTransA = transA == false ? CUBLAS_OP_N : CUBLAS_OP_T; - cublasOperation_t cuTransB = transB == false ? CUBLAS_OP_N : CUBLAS_OP_T; - PADDLE_ENFORCE(platform::dynload::cublasDgemm( - context.cublas_handle(), cuTransB, cuTransA, N, M, K, &alpha, B, ldb, A, - lda, &beta, C, ldc)); -} - template <> void matmul( const platform::CUDADeviceContext& context, @@ -200,8 +49,8 @@ void matmul( CBLAS_TRANSPOSE transA = (trans_a == false) ? CblasNoTrans : CblasTrans; CBLAS_TRANSPOSE transB = (trans_b == false) ? CblasNoTrans : CblasTrans; - gemm( - context, transA, transB, M, N, K, alpha, matrix_a.data(), + Blas(context).GEMM( + transA, transB, M, N, K, alpha, matrix_a.data(), matrix_b.data(), beta, matrix_out->data()); } @@ -229,8 +78,8 @@ void matmul( CBLAS_TRANSPOSE transA = (trans_a == false) ? CblasNoTrans : CblasTrans; CBLAS_TRANSPOSE transB = (trans_b == false) ? CblasNoTrans : CblasTrans; - gemm( - context, transA, transB, M, N, K, alpha, matrix_a.data(), + Blas(context).GEMM( + transA, transB, M, N, K, alpha, matrix_a.data(), matrix_b.data(), beta, matrix_out->data()); } @@ -258,8 +107,8 @@ void matmul( CBLAS_TRANSPOSE transA = (trans_a == false) ? CblasNoTrans : CblasTrans; CBLAS_TRANSPOSE transB = (trans_b == false) ? 
CblasNoTrans : CblasTrans; - gemm( - context, transA, transB, M, N, K, alpha, matrix_a.data(), + Blas(context).GEMM( + transA, transB, M, N, K, alpha, matrix_a.data(), matrix_b.data(), beta, matrix_out->data()); } diff --git a/paddle/fluid/operators/math/math_function.h b/paddle/fluid/operators/math/math_function.h index cdd0297472..9950c09ea6 100644 --- a/paddle/fluid/operators/math/math_function.h +++ b/paddle/fluid/operators/math/math_function.h @@ -42,6 +42,7 @@ int LAPACKE_dgetri(int matrix_layout, int n, double* a, int lda, #include #include "paddle/fluid/framework/eigen.h" +#include "paddle/fluid/framework/operator.h" #include "paddle/fluid/framework/tensor.h" #include "paddle/fluid/framework/tensor_util.h" #include "paddle/fluid/platform/device_context.h" @@ -56,17 +57,48 @@ namespace math { // Then matrixA: M * K, matrixB: K * N, matrixC : M * N // For more detailed info, please refer to // http://www.netlib.org/lapack/explore-html/d4/de2/sgemm_8f.html + +template +class Blas { + public: + explicit Blas(const DeviceContext& context) : context_(context) {} + + template + void GEMM(const CBLAS_TRANSPOSE transA, const CBLAS_TRANSPOSE transB, + const int M, const int N, const int K, const T alpha, const T* A, + const T* B, const T beta, T* C) const; + + template + void GEMM(const bool transA, const bool transB, const int M, const int N, + const int K, const T alpha, const T* A, const int lda, const T* B, + const int ldb, const T beta, T* C, const int ldc) const; + + private: + const DeviceContext& context_; +}; + template -void gemm(const DeviceContext& context, const CBLAS_TRANSPOSE transA, - const CBLAS_TRANSPOSE transB, const int M, const int N, const int K, - const T alpha, const T* A, const T* B, const T beta, T* C); +class BlasT : private Blas { + public: + using Blas::Blas; + + template + void GEMM(ARGS... 
args) const { + static_cast*>(this)->template GEMM(args...); + } +}; -// gemm wrapper with stride args for matrix uncontinuous in memory template -void gemm(const DeviceContext& context, const bool transA, const bool transB, - const int M, const int N, const int K, const T alpha, const T* A, - const int lda, const T* B, const int ldb, const T beta, T* C, - const int ldc); +inline BlasT GetBlas( + const framework::ExecutionContext& exe_ctx) { + return BlasT( + exe_ctx.template device_context()); +} + +template +inline BlasT GetBlas(const DeviceContext& dev_ctx) { + return BlasT(dev_ctx); +} // matrix multiply with continuous memory template @@ -137,3 +169,8 @@ struct RowwiseMean { } // namespace math } // namespace operators } // namespace paddle + +#include "paddle/fluid/operators/math/blas_impl.h" +#ifdef PADDLE_WITH_CUDA +#include "paddle/fluid/operators/math/blas_impl.cu.h" +#endif diff --git a/paddle/fluid/operators/math/matmul.h b/paddle/fluid/operators/math/matmul.h index 0006c5062f..67efd1be53 100644 --- a/paddle/fluid/operators/math/matmul.h +++ b/paddle/fluid/operators/math/matmul.h @@ -131,8 +131,9 @@ class MatMulFunctor { if (!batchCount) { // regular matrix multiplication - gemm(context, transA, transB, M, N, kA, alpha, - a.data(), b.data(), beta, out->data()); + Blas(context).GEMM(transA, transB, M, N, kA, alpha, + a.data(), b.data(), beta, + out->data()); } else { // batched matrix multiplication batched_gemm( From ef48f3c7665b1142fc04dbfeb6aee04ebf4c2c45 Mon Sep 17 00:00:00 2001 From: typhoonzero Date: Sat, 28 Apr 2018 17:43:30 +0800 Subject: [PATCH 09/52] wip --- paddle/fluid/operators/detail/grpc_server.cc | 6 ++++++ paddle/fluid/operators/detail/grpc_server.h | 7 ++++++- paddle/fluid/operators/listen_and_serv_op.cc | 19 ++++++++++++++++++- paddle/fluid/operators/listen_and_serv_op.h | 6 ++++++ 4 files changed, 36 insertions(+), 2 deletions(-) diff --git a/paddle/fluid/operators/detail/grpc_server.cc b/paddle/fluid/operators/detail/grpc_server.cc index 95f4738b4f..92819ff958 100644 --- a/paddle/fluid/operators/detail/grpc_server.cc +++ b/paddle/fluid/operators/detail/grpc_server.cc @@ -241,6 +241,12 @@ void AsyncGRPCServer::RunSyncUpdate() { t_prefetch_.reset(new std::thread( std::bind(&AsyncGRPCServer::HandleRequest, this, cq_prefetch_.get(), "cq_prefetch", prefetch_register))); + + { + std::lock_guard lock(this->mutex_ready_); + ready_ = 1; + } + condition_ready_.notify_all(); // wait server server_->Wait(); t_send_->join(); diff --git a/paddle/fluid/operators/detail/grpc_server.h b/paddle/fluid/operators/detail/grpc_server.h index 99b87b8c6c..d7c06fc181 100644 --- a/paddle/fluid/operators/detail/grpc_server.h +++ b/paddle/fluid/operators/detail/grpc_server.h @@ -45,8 +45,9 @@ class RequestBase; class AsyncGRPCServer final { public: explicit AsyncGRPCServer(const std::string &address, bool sync_mode) - : address_(address), sync_mode_(sync_mode) {} + : address_(address), sync_mode_(sync_mode), ready_(0) {} + bool WaitServerReady(); void RunSyncUpdate(); // functions to sync server barrier status. 
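Patches 09, 10, and 16 in this series together replace the fixed sleep(5) in listen_and_serv with a condition-variable readiness handshake between the server thread and its callers. A consolidated sketch of the handshake's final shape, using only the members introduced in these diffs (mutex_ready_, condition_ready_, ready_) and eliding the gRPC setup:

    // Server thread: publish readiness once the gRPC server is listening.
    void AsyncGRPCServer::RunSyncUpdate() {
      // ... build the ::grpc::ServerBuilder, register services, start server_ ...
      {
        std::lock_guard<std::mutex> lock(mutex_ready_);
        ready_ = 1;
      }
      condition_ready_.notify_all();  // wake threads blocked in WaitServerReady
      server_->Wait();
    }

    // Callers: block until the flag is set. The predicate guards against both
    // spurious wakeups and a notify_all that fires before the wait begins.
    void AsyncGRPCServer::WaitServerReady() {
      std::unique_lock<std::mutex> lock(mutex_ready_);
      condition_ready_.wait(lock, [=] { return this->ready_ == 1; });
    }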
@@ -118,6 +119,10 @@ class AsyncGRPCServer final { framework::ProgramDesc *program_; framework::Executor *executor_; int selected_port_; + + std::mutext mutex_ready_; + std::condition_variable condition_ready_; + int ready_; }; }; // namespace detail diff --git a/paddle/fluid/operators/listen_and_serv_op.cc b/paddle/fluid/operators/listen_and_serv_op.cc index 57cff680ab..0a4b6a08e5 100644 --- a/paddle/fluid/operators/listen_and_serv_op.cc +++ b/paddle/fluid/operators/listen_and_serv_op.cc @@ -265,6 +265,23 @@ void ListenAndServOp::RunAsyncLoop(framework::Executor *executor, } // while(true) } +void ListenAndServOp::StartServerThread() { + server_thread_.reset(new std::thread( + std::bind(&ListenAndServOp::ServerThreadEntry, this, rpc_service_))); +} + +void ListenAndServOp::ServerThreadEntry( + std::shared_ptr service) { + service->RunSyncUpdate(); + VLOG(4) << "RunServer thread end"; + + { + std::lock_guard lock(this->barrier_mutex_); + barrier_cond_step_ = cond; + } + barrier_condition_.notify_all(); +} + void ListenAndServOp::RunImpl(const framework::Scope &scope, const platform::Place &dev_place) const { platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance(); @@ -298,7 +315,7 @@ void ListenAndServOp::RunImpl(const framework::Scope &scope, // start the server listening after all member initialized. server_thread_.reset(new std::thread(RunServer, rpc_service_)); VLOG(3) << "wait server thread to become ready..."; - sleep(5); + // Write to a file of server selected port for python use. SavePort(rpc_service_); if (sync_mode) { diff --git a/paddle/fluid/operators/listen_and_serv_op.h b/paddle/fluid/operators/listen_and_serv_op.h index 3cc0f30477..c85569acdc 100644 --- a/paddle/fluid/operators/listen_and_serv_op.h +++ b/paddle/fluid/operators/listen_and_serv_op.h @@ -51,6 +51,10 @@ class ListenAndServOp : public framework::OperatorBase { framework::Scope* recv_scope, framework::BlockDesc* prefetch_block) const; + void StartServerThread(); + + void ServerThreadEntry(); + void Stop() override; void RunImpl(const framework::Scope& scope, @@ -59,6 +63,8 @@ class ListenAndServOp : public framework::OperatorBase { protected: mutable std::shared_ptr rpc_service_; mutable std::shared_ptr server_thread_; + std::mutext server_ready_mutex_; + std::condition_variable server_ready_; }; } // namespace operators From 008f6df9b2b150f7cd85d457645f4405fd95d4b1 Mon Sep 17 00:00:00 2001 From: typhoonzero Date: Mon, 30 Apr 2018 08:06:30 +0800 Subject: [PATCH 10/52] update --- paddle/fluid/operators/detail/grpc_server.cc | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/paddle/fluid/operators/detail/grpc_server.cc b/paddle/fluid/operators/detail/grpc_server.cc index 92819ff958..ee3b3e3ccb 100644 --- a/paddle/fluid/operators/detail/grpc_server.cc +++ b/paddle/fluid/operators/detail/grpc_server.cc @@ -208,6 +208,11 @@ void AsyncGRPCServer::WaitClientGet(int count) { } } +bool AsyncGRPCServer::WaitServerReady() { + std::unique_lock lock(this->mutex_ready_); + condition_ready_.wait(lock, [&] { return this->ready_ == 1; }); +} + void AsyncGRPCServer::RunSyncUpdate() { ::grpc::ServerBuilder builder; builder.AddListeningPort(address_, ::grpc::InsecureServerCredentials(), From 3eef539a4238baca093cabab84561c71ef6e039b Mon Sep 17 00:00:00 2001 From: Helin Wang Date: Mon, 30 Apr 2018 16:23:00 -0700 Subject: [PATCH 11/52] add word2vec test for the new API --- .../book/word2vec/no_test_word2vec_new_api.py | 146 ++++++++++++++++++ 1 file changed, 146 insertions(+) create mode 100644 
python/paddle/fluid/tests/book/word2vec/no_test_word2vec_new_api.py diff --git a/python/paddle/fluid/tests/book/word2vec/no_test_word2vec_new_api.py b/python/paddle/fluid/tests/book/word2vec/no_test_word2vec_new_api.py new file mode 100644 index 0000000000..1e31824aa1 --- /dev/null +++ b/python/paddle/fluid/tests/book/word2vec/no_test_word2vec_new_api.py @@ -0,0 +1,146 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import paddle +import paddle.fluid as fluid +import numpy as np +import math +import sys +from functools import partial + +PASS_NUM = 100 +EMBED_SIZE = 32 +HIDDEN_SIZE = 256 +N = 5 +BATCH_SIZE = 32 + + +def create_random_lodtensor(lod, place, low, high): + # The range of data elements is [low, high] + data = np.random.random_integers(low, high, [lod[-1], 1]).astype("int64") + res = fluid.LoDTensor() + res.set(data, place) + res.set_lod([lod]) + return res + + +word_dict = paddle.dataset.imikolov.build_dict() +dict_size = len(word_dict) + + +def inference_network(is_sparse): + first_word = fluid.layers.data(name='firstw', shape=[1], dtype='int64') + second_word = fluid.layers.data(name='secondw', shape=[1], dtype='int64') + third_word = fluid.layers.data(name='thirdw', shape=[1], dtype='int64') + forth_word = fluid.layers.data(name='forthw', shape=[1], dtype='int64') + + embed_first = fluid.layers.embedding( + input=first_word, + size=[dict_size, EMBED_SIZE], + dtype='float32', + is_sparse=is_sparse, + param_attr='shared_w') + embed_second = fluid.layers.embedding( + input=second_word, + size=[dict_size, EMBED_SIZE], + dtype='float32', + is_sparse=is_sparse, + param_attr='shared_w') + embed_third = fluid.layers.embedding( + input=third_word, + size=[dict_size, EMBED_SIZE], + dtype='float32', + is_sparse=is_sparse, + param_attr='shared_w') + embed_forth = fluid.layers.embedding( + input=forth_word, + size=[dict_size, EMBED_SIZE], + dtype='float32', + is_sparse=is_sparse, + param_attr='shared_w') + + concat_embed = fluid.layers.concat( + input=[embed_first, embed_second, embed_third, embed_forth], axis=1) + hidden1 = fluid.layers.fc(input=concat_embed, + size=HIDDEN_SIZE, + act='sigmoid') + predict_word = fluid.layers.fc(input=hidden1, size=dict_size, act='softmax') + return predict_word + + +def train_network(): + next_word = fluid.layers.data(name='nextw', shape=[1], dtype='int64') + predict_word = inference_network() + cost = fluid.layers.cross_entropy(input=predict_word, label=next_word) + avg_cost = fluid.layers.mean(cost) + return avg_cost + + +def train(use_cuda, is_sparse, save_path): + train_reader = paddle.batch( + paddle.dataset.imikolov.train(word_dict, N), BATCH_SIZE) + + place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace() + + def event_handler(event): + if isinstance(event, fluid.EndPass): + avg_cost = trainer.test(reader=paddle.dataset.imikolov.test( + word_dict, N)) + + if avg_cost < 5.0: + trainer.params.save(save_path) + return + if math.isnan(avg_cost): + sys.exit("got NaN loss, training 
failed.") + + trainer = fluid.Trainer( + partial(inference_network, is_sparse), + optimizer=fluid.optimizer.SGD(learning_rate=0.001), + place=place, + event_handler=event_handler) + trainer.train(train_reader, 100) + + +def infer(use_cuda, save_path): + params = fluid.Params(save_path) + place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace() + inferencer = fluid.Inferencer(inference_network, params, place=place) + + lod = [0, 1] + first_word = create_random_lodtensor(lod, place, low=0, high=dict_size - 1) + second_word = create_random_lodtensor(lod, place, low=0, high=dict_size - 1) + third_word = create_random_lodtensor(lod, place, low=0, high=dict_size - 1) + fourth_word = create_random_lodtensor(lod, place, low=0, high=dict_size - 1) + result = inferencer.infer({ + 'firstw': first_word, + 'secondw': second_word, + 'thirdw': third_word, + 'forthw': fourth_word + }) + print(result) + + +def main(use_cuda, is_sparse): + if use_cuda and not fluid.core.is_compiled_with_cuda(): + return + + save_path = "word2vec.inference.model" + train(use_cuda, is_sparse, save_path) + infer(use_cuda, save_path) + + +if __name__ == '__main__': + for use_cuda in (False, True): + for is_sparse in (False, True): + main(use_cuda=use_cuda, is_sparse=is_sparse) From a785a837b97c8790c34d16e140df1c4d92b7cf90 Mon Sep 17 00:00:00 2001 From: Helin Wang Date: Tue, 1 May 2018 16:50:38 -0700 Subject: [PATCH 12/52] update the example with the latest API --- .../tests/book/word2vec/no_test_word2vec_new_api.py | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/python/paddle/fluid/tests/book/word2vec/no_test_word2vec_new_api.py b/python/paddle/fluid/tests/book/word2vec/no_test_word2vec_new_api.py index 1e31824aa1..272db7b573 100644 --- a/python/paddle/fluid/tests/book/word2vec/no_test_word2vec_new_api.py +++ b/python/paddle/fluid/tests/book/word2vec/no_test_word2vec_new_api.py @@ -94,7 +94,7 @@ def train(use_cuda, is_sparse, save_path): place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace() def event_handler(event): - if isinstance(event, fluid.EndPass): + if isinstance(event, fluid.Event.END_EPOCH): avg_cost = trainer.test(reader=paddle.dataset.imikolov.test( word_dict, N)) @@ -106,10 +106,9 @@ def train(use_cuda, is_sparse, save_path): trainer = fluid.Trainer( partial(inference_network, is_sparse), - optimizer=fluid.optimizer.SGD(learning_rate=0.001), - place=place, - event_handler=event_handler) - trainer.train(train_reader, 100) + fluid.optimizer.SGD(learning_rate=0.001), + place=place) + trainer.train(train_reader, 100, event_handler) def infer(use_cuda, save_path): From 49dedfad17a9cb80d98247fdbfddda50d33e2381 Mon Sep 17 00:00:00 2001 From: Yu Yang Date: Wed, 2 May 2018 11:46:08 +0800 Subject: [PATCH 13/52] Polish code and tests --- paddle/fluid/operators/math/blas_impl.cu.h | 17 ++++- .../operators/math/math_function_test.cc | 17 +++-- .../operators/math/math_function_test.cu | 62 ++++++++++--------- 3 files changed, 59 insertions(+), 37 deletions(-) diff --git a/paddle/fluid/operators/math/blas_impl.cu.h b/paddle/fluid/operators/math/blas_impl.cu.h index b7bd8f1d04..86e4946991 100644 --- a/paddle/fluid/operators/math/blas_impl.cu.h +++ b/paddle/fluid/operators/math/blas_impl.cu.h @@ -42,9 +42,20 @@ struct CUBlas { template <> struct CUBlas { - template - static void GEMM(ARGS... 
args) { - PADDLE_ENFORCE(platform::dynload::cublasHgemm(args...)); + using float16 = platform::float16; + + static void GEMM(cublasHandle_t handle, cublasOperation_t transa, + cublasOperation_t transb, int m, int n, int k, + const float16 *alpha, const float16 *A, int lda, + const float16 *B, int ldb, const float16 *beta, float16 *C, + int ldc) { + PADDLE_ENFORCE( + platform::dynload::cublasHgemm(handle, transa, transb, m, n, k, + reinterpret_cast(alpha), + reinterpret_cast(A), lda, + reinterpret_cast(B), ldb, + reinterpret_cast(beta), + reinterpret_cast<__half *>(C), ldc)); } }; diff --git a/paddle/fluid/operators/math/math_function_test.cc b/paddle/fluid/operators/math/math_function_test.cc index 25a9d0111e..6d11dc8c76 100644 --- a/paddle/fluid/operators/math/math_function_test.cc +++ b/paddle/fluid/operators/math/math_function_test.cc @@ -14,6 +14,13 @@ #include "paddle/fluid/operators/math/math_function.h" #include "gtest/gtest.h" +template +inline paddle::operators::math::BlasT +GetBlas(const paddle::platform::CPUDeviceContext& context) { + return paddle::operators::math::GetBlas(context); +} + TEST(math_function, gemm_notrans_cblas) { paddle::framework::Tensor input1; paddle::framework::Tensor input2; @@ -34,9 +41,8 @@ TEST(math_function, gemm_notrans_cblas) { memcpy(input3_ptr, arr3, 8 * sizeof(float)); paddle::platform::CPUDeviceContext context(*cpu_place); - paddle::operators::math::gemm( - context, false, false, m, n, k, 1, input1_ptr, 3, input2_ptr + 1, 4, 1, - input3_ptr + 1, 4); + GetBlas(context).GEMM(false, false, m, n, k, 1, input1_ptr, 3, + input2_ptr + 1, 4, 1, input3_ptr + 1, 4); EXPECT_EQ(input3_ptr[0], 0); EXPECT_EQ(input3_ptr[1], 24); @@ -68,9 +74,8 @@ TEST(math_function, gemm_trans_clbas) { memcpy(input3_ptr, arr3, 8 * sizeof(float)); paddle::platform::CPUDeviceContext context(*cpu_place); - paddle::operators::math::gemm( - context, false, true, m, n, k, 1, input1_ptr, 3, input2_ptr + 3, 3, 1, - input3_ptr + 1, 4); + GetBlas(context).GEMM(false, true, m, n, k, 1, input1_ptr, 3, + input2_ptr + 3, 3, 1, input3_ptr + 1, 4); EXPECT_EQ(input3_ptr[0], 0); EXPECT_EQ(input3_ptr[1], 24); diff --git a/paddle/fluid/operators/math/math_function_test.cu b/paddle/fluid/operators/math/math_function_test.cu index 7986326e96..22484e1c1a 100644 --- a/paddle/fluid/operators/math/math_function_test.cu +++ b/paddle/fluid/operators/math/math_function_test.cu @@ -13,6 +13,7 @@ // limitations under the License. 
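The rewritten float16 specialization above drops the variadic forwarding used for float and double because cublasHgemm takes const __half* arguments while Paddle's callers hold platform::float16*; a parameter pack forwards the pointer types unchanged, so the cast to __half has to happen somewhere, and the fixed signature centralizes it inside the wrapper. A minimal sketch of that bridge, assuming (as the patch does) that the two 16-bit types are layout-compatible; the AsHalf helper is illustrative, not part of the patch:

    #include <cuda_fp16.h>
    #include "paddle/fluid/platform/float16.h"

    // Both types wrap the same 16-bit pattern, so reinterpreting the pointer
    // is well-defined for passing Paddle buffers straight to cuBLAS.
    static_assert(sizeof(paddle::platform::float16) == sizeof(__half),
                  "float16 must be bit-compatible with CUDA's __half");

    inline const __half *AsHalf(const paddle::platform::float16 *p) {
      return reinterpret_cast<const __half *>(p);
    }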
#include "gtest/gtest.h" #include "paddle/fluid/operators/math/math_function.h" +#include "paddle/fluid/platform/device_context.h" void fill_fp16_data(paddle::platform::float16* in_ptr, size_t size, const std::vector& data) { @@ -23,8 +24,8 @@ void fill_fp16_data(paddle::platform::float16* in_ptr, size_t size, } TEST(math_function, notrans_mul_trans_fp32) { - using namespace paddle::framework; - using namespace paddle::platform; + using namespace paddle::framework; // NOLINT + using namespace paddle::platform; // NOLINT Tensor input1; Tensor input1_gpu; @@ -59,8 +60,8 @@ TEST(math_function, notrans_mul_trans_fp32) { } TEST(math_function, notrans_mul_trans_fp16) { - using namespace paddle::framework; - using namespace paddle::platform; + using namespace paddle::framework; // NOLINT + using namespace paddle::platform; // NOLINT Tensor input1; Tensor input1_gpu; @@ -100,8 +101,8 @@ TEST(math_function, notrans_mul_trans_fp16) { } TEST(math_function, trans_mul_notrans_fp32) { - using namespace paddle::framework; - using namespace paddle::platform; + using namespace paddle::framework; // NOLINT + using namespace paddle::platform; // NOLINT Tensor input1; Tensor input1_gpu; @@ -141,8 +142,8 @@ TEST(math_function, trans_mul_notrans_fp32) { } TEST(math_function, trans_mul_notrans_fp16) { - using namespace paddle::framework; - using namespace paddle::platform; + using namespace paddle::framework; // NOLINT + using namespace paddle::platform; // NOLINT Tensor input1; Tensor input1_gpu; @@ -186,9 +187,16 @@ TEST(math_function, trans_mul_notrans_fp16) { EXPECT_EQ(static_cast(out_ptr[8]), 29); } +template +inline paddle::operators::math::BlasT +GetBlas(const paddle::platform::CUDADeviceContext& context) { + return paddle::operators::math::GetBlas(context); +} + TEST(math_function, gemm_notrans_cublas_fp32) { - using namespace paddle::framework; - using namespace paddle::platform; + using namespace paddle::framework; // NOLINT + using namespace paddle::platform; // NOLINT Tensor input1; Tensor input2; @@ -221,8 +229,8 @@ TEST(math_function, gemm_notrans_cublas_fp32) { float* b = input2_gpu.data(); float* c = input3_gpu.mutable_data(gpu_place); - paddle::operators::math::gemm( - context, false, false, m, n, k, 1, a, 3, b + 1, 4, 1, c + 1, 4); + GetBlas(context).GEMM(false, false, m, n, k, 1, a, 3, b + 1, 4, 1, + c + 1, 4); TensorCopySync(input3_gpu, cpu_place, &input3); @@ -244,8 +252,8 @@ TEST(math_function, gemm_notrans_cublas_fp32) { } TEST(math_function, gemm_notrans_cublas_fp16) { - using namespace paddle::framework; - using namespace paddle::platform; + using namespace paddle::framework; // NOLINT + using namespace paddle::platform; // NOLINT Tensor input1; Tensor input2; @@ -281,9 +289,8 @@ TEST(math_function, gemm_notrans_cublas_fp16) { float16* b = input2_gpu.data(); float16* c = input3_gpu.mutable_data(gpu_place); - paddle::operators::math::gemm( - context, false, false, m, n, k, float16(1), a, 3, b + 1, 4, float16(1), - c + 1, 4); + GetBlas(context).GEMM(false, false, m, n, k, float16(1), a, 3, b + 1, + 4, float16(1), c + 1, 4); TensorCopySync(input3_gpu, cpu_place, &input3); @@ -305,8 +312,8 @@ TEST(math_function, gemm_notrans_cublas_fp16) { } TEST(math_function, gemm_trans_cublas_fp32) { - using namespace paddle::framework; - using namespace paddle::platform; + using namespace paddle::framework; // NOLINT + using namespace paddle::platform; // NOLINT Tensor input1; Tensor input2; @@ -339,8 +346,8 @@ TEST(math_function, gemm_trans_cublas_fp32) { float* b = input2_gpu.data(); float* c = 
input3_gpu.mutable_data(gpu_place); - paddle::operators::math::gemm( - context, false, true, m, n, k, 1, a, 3, b + 3, 3, 1, c + 1, 4); + GetBlas(context).GEMM(false, true, m, n, k, 1, a, 3, b + 3, 3, 1, + c + 1, 4); TensorCopySync(input3_gpu, cpu_place, &input3); @@ -356,8 +363,8 @@ TEST(math_function, gemm_trans_cublas_fp32) { } TEST(math_function, gemm_trans_cublas_fp16) { - using namespace paddle::framework; - using namespace paddle::platform; + using namespace paddle::framework; // NOLINT + using namespace paddle::platform; // NOLINT Tensor input1; Tensor input2; @@ -393,9 +400,8 @@ TEST(math_function, gemm_trans_cublas_fp16) { float16* b = input2_gpu.data(); float16* c = input3_gpu.mutable_data(gpu_place); - paddle::operators::math::gemm( - context, false, true, m, n, k, float16(1), a, 3, b + 3, 3, float16(1), - c + 1, 4); + GetBlas(context).GEMM(false, true, m, n, k, float16(1), a, 3, b + 3, + 3, float16(1), c + 1, 4); TensorCopySync(input3_gpu, cpu_place, &input3); @@ -412,8 +418,8 @@ TEST(math_function, gemm_trans_cublas_fp16) { template void GemvTest(int m, int n, bool trans) { - using namespace paddle::framework; - using namespace paddle::platform; + using namespace paddle::framework; // NOLINT + using namespace paddle::platform; // NOLINT Tensor mat_a; Tensor vec_b; From 60d6348e69c4b19910e303ebd91acf5a48e53161 Mon Sep 17 00:00:00 2001 From: Yu Yang Date: Wed, 2 May 2018 12:00:44 +0800 Subject: [PATCH 14/52] Revert develop --- paddle/fluid/operators/math/pooling.cu | 82 ++++++++++---------------- 1 file changed, 30 insertions(+), 52 deletions(-) diff --git a/paddle/fluid/operators/math/pooling.cu b/paddle/fluid/operators/math/pooling.cu index 32348e908a..267f8c409d 100644 --- a/paddle/fluid/operators/math/pooling.cu +++ b/paddle/fluid/operators/math/pooling.cu @@ -12,8 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ -#include -#include #include "paddle/fluid/operators/math/pooling.h" #include "paddle/fluid/platform/cuda_primitives.h" @@ -22,7 +20,7 @@ namespace operators { namespace math { template -__global__ void KernelPool2D(const int nthreads, const T* input_data, // NOLINT +__global__ void KernelPool2D(const int nthreads, const T* input_data, const int channels, const int input_height, const int input_width, const int output_height, const int output_width, const int ksize_height, @@ -60,8 +58,8 @@ __global__ void KernelPool2D(const int nthreads, const T* input_data, // NOLINT template __global__ void KernelPool2DGrad( - const int nthreads, const T* input_data, const T* output_data, // NOLINT - const T* output_grad, const int channels, const int input_height, // NOLINT + const int nthreads, const T* input_data, const T* output_data, + const T* output_grad, const int channels, const int input_height, const int input_width, const int output_height, const int output_width, const int ksize_height, const int ksize_width, const int stride_height, const int stride_width, const int padding_height, const int padding_width, @@ -108,8 +106,8 @@ __global__ void KernelPool2DGrad( template __global__ void KernelMaxPool2DGrad( - const int nthreads, const T* input_data, const T* output_data, // NOLINT - const T* output_grad, const int channels, const int input_height, // NOLINT + const int nthreads, const T* input_data, const T* output_data, + const T* output_grad, const int channels, const int input_height, const int input_width, const int output_height, const int output_width, const int ksize_height, const int ksize_width, const int stride_height, const int stride_width, const int padding_height, const int padding_width, @@ -160,10 +158,8 @@ template class Pool2dFunctor { public: void operator()(const platform::CUDADeviceContext& context, - const framework::Tensor& input, - std::vector& ksize, // NOLINT - std::vector& strides, // NOLINT - std::vector& paddings, // NOLINT + const framework::Tensor& input, std::vector& ksize, + std::vector& strides, std::vector& paddings, PoolProcess pool_process, framework::Tensor* output) { const int batch_size = input.dims()[0]; const int input_channels = input.dims()[1]; @@ -205,10 +201,8 @@ class Pool2dGradFunctor { void operator()(const platform::CUDADeviceContext& context, const framework::Tensor& input, const framework::Tensor& output, - const framework::Tensor& output_grad, - std::vector& ksize, // NOLINT - std::vector& strides, // NOLINT - std::vector& paddings, // NOLINT + const framework::Tensor& output_grad, std::vector& ksize, + std::vector& strides, std::vector& paddings, PoolProcess pool_process, framework::Tensor* input_grad) { const int batch_size = input.dims()[0]; const int input_channels = input.dims()[1]; @@ -252,10 +246,8 @@ class MaxPool2dGradFunctor { void operator()(const platform::CUDADeviceContext& context, const framework::Tensor& input, const framework::Tensor& output, - const framework::Tensor& output_grad, - std::vector& ksize, // NOLINT - std::vector& strides, // NOLINT - std::vector& paddings, // NOLINT + const framework::Tensor& output_grad, std::vector& ksize, + std::vector& strides, std::vector& paddings, framework::Tensor* input_grad) { const int batch_size = input.dims()[0]; const int input_channels = input.dims()[1]; @@ -314,7 +306,7 @@ template class Pool2dGradFunctor; template -__global__ void KernelPool3D(const int nthreads, const T* input_data, // NOLINT +__global__ void KernelPool3D(const int nthreads, const T* input_data, const int 
channels, const int input_depth, const int input_height, const int input_width, const int output_depth, const int output_height, @@ -360,8 +352,8 @@ __global__ void KernelPool3D(const int nthreads, const T* input_data, // NOLINT template __global__ void KernelPool3DGrad( - const int nthreads, const T* input_data, const T* output_data, // NOLINT - const T* output_grad, const int channels, const int input_depth, // NOLINT + const int nthreads, const T* input_data, const T* output_data, + const T* output_grad, const int channels, const int input_depth, const int input_height, const int input_width, const int output_depth, const int output_height, const int output_width, const int ksize_depth, const int ksize_height, const int ksize_width, const int stride_depth, @@ -424,8 +416,8 @@ __global__ void KernelPool3DGrad( template __global__ void KernelMaxPool3DGrad( - const int nthreads, const T* input_data, const T* output_data, // NOLINT - const T* output_grad, const int channels, const int input_depth, // NOLINT + const int nthreads, const T* input_data, const T* output_data, + const T* output_grad, const int channels, const int input_depth, const int input_height, const int input_width, const int output_depth, const int output_height, const int output_width, const int ksize_depth, const int ksize_height, const int ksize_width, const int stride_depth, @@ -482,10 +474,8 @@ template class Pool3dFunctor { public: void operator()(const platform::CUDADeviceContext& context, - const framework::Tensor& input, - std::vector& ksize, // NOLINT - std::vector& strides, // NOLINT - std::vector& paddings, // NOLINT + const framework::Tensor& input, std::vector& ksize, + std::vector& strides, std::vector& paddings, PoolProcess pool_process, framework::Tensor* output) { const int batch_size = input.dims()[0]; const int input_channels = input.dims()[1]; @@ -535,10 +525,8 @@ class Pool3dGradFunctor { void operator()(const platform::CUDADeviceContext& context, const framework::Tensor& input, const framework::Tensor& output, - const framework::Tensor& output_grad, - std::vector& ksize, // NOLINT - std::vector& strides, // NOLINT - std::vector& paddings, // NOLINT + const framework::Tensor& output_grad, std::vector& ksize, + std::vector& strides, std::vector& paddings, PoolProcess pool_process, framework::Tensor* input_grad) { const int batch_size = input.dims()[0]; const int input_channels = input.dims()[1]; @@ -590,10 +578,8 @@ class MaxPool3dGradFunctor { void operator()(const platform::CUDADeviceContext& context, const framework::Tensor& input, const framework::Tensor& output, - const framework::Tensor& output_grad, - std::vector& ksize, // NOLINT - std::vector& strides, // NOLINT - std::vector& paddings, // NOLINT + const framework::Tensor& output_grad, std::vector& ksize, + std::vector& strides, std::vector& paddings, framework::Tensor* input_grad) { const int batch_size = input.dims()[0]; const int input_channels = input.dims()[1]; @@ -750,10 +736,8 @@ template class MaxPool2dWithIndexFunctor { public: void operator()(const platform::CUDADeviceContext& context, - const framework::Tensor& input, - std::vector& ksize, // NOLINT - std::vector& strides, // NOLINT - std::vector& paddings, // NOLINT + const framework::Tensor& input, std::vector& ksize, + std::vector& strides, std::vector& paddings, framework::Tensor* output, framework::Tensor* mask) { const int batch_size = input.dims()[0]; const int input_channels = input.dims()[1]; @@ -795,10 +779,8 @@ class MaxPool2dWithIndexGradFunctor { public: void 
operator()(const platform::CUDADeviceContext& context, const framework::Tensor& output_grad, - const framework::Tensor& mask, - std::vector& ksize, // NOLINT - std::vector& strides, // NOLINT - std::vector& paddings, // NOLINT + const framework::Tensor& mask, std::vector& ksize, + std::vector& strides, std::vector& paddings, framework::Tensor* input_grad) { const int batch_size = input_grad->dims()[0]; const int input_channels = input_grad->dims()[1]; @@ -955,10 +937,8 @@ template class MaxPool3dWithIndexFunctor { public: void operator()(const platform::CUDADeviceContext& context, - const framework::Tensor& input, - std::vector& ksize, // NOLINT - std::vector& strides, // NOLINT - std::vector& paddings, // NOLINT + const framework::Tensor& input, std::vector& ksize, + std::vector& strides, std::vector& paddings, framework::Tensor* output, framework::Tensor* mask) { const int batch_size = input.dims()[0]; const int input_channels = input.dims()[1]; @@ -1007,10 +987,8 @@ class MaxPool3dWithIndexGradFunctor { public: void operator()(const platform::CUDADeviceContext& context, const framework::Tensor& output_grad, - const framework::Tensor& mask, - std::vector& ksize, // NOLINT - std::vector& strides, // NOLINT - std::vector& paddings, // NOLINT + const framework::Tensor& mask, std::vector& ksize, + std::vector& strides, std::vector& paddings, framework::Tensor* input_grad) { const int batch_size = input_grad->dims()[0]; const int input_channels = input_grad->dims()[1]; From 4db43c6c9f9962d163efd0afcb13e4cf10acfe45 Mon Sep 17 00:00:00 2001 From: Yu Yang Date: Wed, 2 May 2018 14:47:02 +0800 Subject: [PATCH 15/52] Naive implement cblas --- paddle/fluid/operators/math/blas_impl.h | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/paddle/fluid/operators/math/blas_impl.h b/paddle/fluid/operators/math/blas_impl.h index 4934afd8bb..f6d6669765 100644 --- a/paddle/fluid/operators/math/blas_impl.h +++ b/paddle/fluid/operators/math/blas_impl.h @@ -24,17 +24,23 @@ struct CBlas; template <> struct CBlas { - static constexpr auto GEMM = cblas_sgemm; + template + static void GEMM(ARGS... args) { + cblas_sgemm(args...); + } }; template <> struct CBlas { - static constexpr auto GEMM = cblas_dgemm; + template + static void GEMM(ARGS... args) { + cblas_dgemm(args...); + } }; template <> struct CBlas { - void GEMM(...) { PADDLE_THROW("float16 GEMM not supported on CPU"); } + static void GEMM(...) 
{ PADDLE_THROW("float16 GEMM not supported on CPU"); } }; template <> From e7ac709b4bf1b1ef63bc13e63d7122e8bdbf07d9 Mon Sep 17 00:00:00 2001 From: typhoonzero Date: Wed, 2 May 2018 15:37:11 +0800 Subject: [PATCH 16/52] done --- paddle/fluid/operators/detail/grpc_server.cc | 4 +- paddle/fluid/operators/detail/grpc_server.h | 4 +- paddle/fluid/operators/listen_and_serv_op.cc | 47 ++++++++------------ paddle/fluid/operators/listen_and_serv_op.h | 16 ++++--- paddle/fluid/operators/send_recv_op_test.cc | 13 +++++- 5 files changed, 44 insertions(+), 40 deletions(-) diff --git a/paddle/fluid/operators/detail/grpc_server.cc b/paddle/fluid/operators/detail/grpc_server.cc index ee3b3e3ccb..bb9c93480d 100644 --- a/paddle/fluid/operators/detail/grpc_server.cc +++ b/paddle/fluid/operators/detail/grpc_server.cc @@ -208,9 +208,9 @@ void AsyncGRPCServer::WaitClientGet(int count) { } } -bool AsyncGRPCServer::WaitServerReady() { +void AsyncGRPCServer::WaitServerReady() { std::unique_lock lock(this->mutex_ready_); - condition_ready_.wait(lock, [&] { return this->ready_ == 1; }); + condition_ready_.wait(lock, [=] { return this->ready_ == 1; }); } void AsyncGRPCServer::RunSyncUpdate() { diff --git a/paddle/fluid/operators/detail/grpc_server.h b/paddle/fluid/operators/detail/grpc_server.h index d7c06fc181..7f9cae21cc 100644 --- a/paddle/fluid/operators/detail/grpc_server.h +++ b/paddle/fluid/operators/detail/grpc_server.h @@ -47,7 +47,7 @@ class AsyncGRPCServer final { explicit AsyncGRPCServer(const std::string &address, bool sync_mode) : address_(address), sync_mode_(sync_mode), ready_(0) {} - bool WaitServerReady(); + void WaitServerReady(); void RunSyncUpdate(); // functions to sync server barrier status. @@ -120,7 +120,7 @@ class AsyncGRPCServer final { framework::Executor *executor_; int selected_port_; - std::mutext mutex_ready_; + std::mutex mutex_ready_; std::condition_variable condition_ready_; int ready_; }; diff --git a/paddle/fluid/operators/listen_and_serv_op.cc b/paddle/fluid/operators/listen_and_serv_op.cc index 0a4b6a08e5..350c9c8563 100644 --- a/paddle/fluid/operators/listen_and_serv_op.cc +++ b/paddle/fluid/operators/listen_and_serv_op.cc @@ -80,12 +80,7 @@ static void ParallelExecuteBlocks( for (size_t i = 0; i < fs.size(); ++i) fs[i].wait(); } -static void SavePort(std::shared_ptr rpc_service) { - std::ofstream port_file; - port_file.open("/tmp/paddle.selected_port"); - port_file << rpc_service->GetSelectedPort(); - port_file.close(); -} +std::atomic_int ListenAndServOp::selected_port_{0}; ListenAndServOp::ListenAndServOp(const std::string &type, const framework::VariableNameMap &inputs, @@ -93,15 +88,27 @@ ListenAndServOp::ListenAndServOp(const std::string &type, const framework::AttributeMap &attrs) : OperatorBase(type, inputs, outputs, attrs) {} -int ListenAndServOp::GetSelectedPort() const { - return rpc_service_->GetSelectedPort(); -} - void ListenAndServOp::Stop() { rpc_service_->Push(LISTEN_TERMINATE_MESSAGE); server_thread_->join(); } +void ListenAndServOp::SavePort(const std::string &file_path) const { + // NOTE: default write file to /tmp/paddle.selected_port + selected_port_ = rpc_service_->GetSelectedPort(); + + std::ofstream port_file; + port_file.open(file_path); + port_file << selected_port_.load(); + port_file.close(); + VLOG(4) << "selected port written to " << file_path; +} + +void ListenAndServOp::WaitServerReady() { + while (selected_port_.load() == 0) { + } +} + void ListenAndServOp::RunSyncLoop(framework::Executor *executor, framework::ProgramDesc *program, 
framework::Scope *recv_scope, @@ -265,23 +272,6 @@ void ListenAndServOp::RunAsyncLoop(framework::Executor *executor, } // while(true) } -void ListenAndServOp::StartServerThread() { - server_thread_.reset(new std::thread( - std::bind(&ListenAndServOp::ServerThreadEntry, this, rpc_service_))); -} - -void ListenAndServOp::ServerThreadEntry( - std::shared_ptr service) { - service->RunSyncUpdate(); - VLOG(4) << "RunServer thread end"; - - { - std::lock_guard lock(this->barrier_mutex_); - barrier_cond_step_ = cond; - } - barrier_condition_.notify_all(); -} - void ListenAndServOp::RunImpl(const framework::Scope &scope, const platform::Place &dev_place) const { platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance(); @@ -315,9 +305,10 @@ void ListenAndServOp::RunImpl(const framework::Scope &scope, // start the server listening after all member initialized. server_thread_.reset(new std::thread(RunServer, rpc_service_)); VLOG(3) << "wait server thread to become ready..."; + rpc_service_->WaitServerReady(); // Write to a file of server selected port for python use. - SavePort(rpc_service_); + SavePort(); if (sync_mode) { RunSyncLoop(&executor, program, &recv_scope, prefetch_block); } else { diff --git a/paddle/fluid/operators/listen_and_serv_op.h b/paddle/fluid/operators/listen_and_serv_op.h index c85569acdc..87c0df2a8a 100644 --- a/paddle/fluid/operators/listen_and_serv_op.h +++ b/paddle/fluid/operators/listen_and_serv_op.h @@ -15,6 +15,7 @@ limitations under the License. */ #pragma once #include +#include #include #include @@ -39,8 +40,6 @@ class ListenAndServOp : public framework::OperatorBase { const framework::VariableNameMap& outputs, const framework::AttributeMap& attrs); - int GetSelectedPort() const; - void RunSyncLoop(framework::Executor* executor, framework::ProgramDesc* program, framework::Scope* recv_scope, @@ -51,20 +50,25 @@ class ListenAndServOp : public framework::OperatorBase { framework::Scope* recv_scope, framework::BlockDesc* prefetch_block) const; - void StartServerThread(); + void SavePort( + const std::string& file_path = "/tmp/paddle.selected_port") const; + + void WaitServerReady(); - void ServerThreadEntry(); + int GetSelectedPort() { return selected_port_; } void Stop() override; void RunImpl(const framework::Scope& scope, const platform::Place& dev_place) const override; + static void ResetPort() { selected_port_ = 0; } + protected: mutable std::shared_ptr rpc_service_; mutable std::shared_ptr server_thread_; - std::mutext server_ready_mutex_; - std::condition_variable server_ready_; + // FIXME(wuyi): it's static so that the operator can be cloned. + static std::atomic_int selected_port_; }; } // namespace operators diff --git a/paddle/fluid/operators/send_recv_op_test.cc b/paddle/fluid/operators/send_recv_op_test.cc index d2e1f3cb2f..a0b5a390db 100644 --- a/paddle/fluid/operators/send_recv_op_test.cc +++ b/paddle/fluid/operators/send_recv_op_test.cc @@ -116,6 +116,7 @@ void AddOp(const std::string &type, const f::VariableNameMap &inputs, void StartServerNet(bool is_sparse) { f::Scope scope; p::CPUPlace place; + VLOG(4) << "before init tensor"; if (is_sparse) { InitSelectedRowsInScope(place, &scope); } else { @@ -129,6 +130,7 @@ void StartServerNet(bool is_sparse) { auto *prefetch_block = program.AppendBlock(root_block); // X for server side tensors, RX for received tensors, must be of same shape. 
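SavePort above writes the selected port to a plain text file precisely so that code outside the C++ server can discover a listener that was started on port 0. The Python side of this series reads that file back; a minimal sketch of the reading end (the polling loop is an illustrative addition, not code from the patch):

    import time

    def read_selected_port(path="/tmp/paddle.selected_port"):
        # Poll until the server process has written its chosen port.
        while True:
            try:
                with open(path) as f:
                    return int(f.readlines()[0])
            except (IOError, ValueError):
                time.sleep(0.1)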
AddOp("sum", {{"X", {"x0", "x1"}}}, {{"Out", {"Out"}}}, {}, optimize_block); + VLOG(4) << "before attr"; f::AttributeMap attrs; attrs.insert({"endpoint", std::string("127.0.0.1:0")}); @@ -139,15 +141,19 @@ void StartServerNet(bool is_sparse) { attrs.insert({"PrefetchBlock", prefetch_block}); attrs.insert({"grad_to_block_id", std::vector({""})}); attrs.insert({"sync_mode", true}); + VLOG(4) << "before init op"; listen_and_serv_op = f::OpRegistry::CreateOp("listen_and_serv", {{"X", {"x1"}}}, {}, attrs); + VLOG(4) << "before run op"; listen_and_serv_op->Run(scope, place); LOG(INFO) << "server exit"; } TEST(SendRecvOp, CPUDense) { std::thread server_thread(StartServerNet, false); - sleep(5); // wait server to start + // wait server to start + static_cast(listen_and_serv_op.get()) + ->WaitServerReady(); // local net f::Scope scope; p::CPUPlace place; @@ -181,11 +187,13 @@ TEST(SendRecvOp, CPUDense) { listen_and_serv_op->Stop(); server_thread.join(); listen_and_serv_op.reset(nullptr); + paddle::operators::ListenAndServOp::ResetPort(); } TEST(SendRecvOp, CPUSparse) { std::thread server_thread(StartServerNet, true); - sleep(3); // wait server to start + static_cast(listen_and_serv_op.get()) + ->WaitServerReady(); // local net f::Scope scope; p::CPUPlace place; @@ -226,4 +234,5 @@ TEST(SendRecvOp, CPUSparse) { listen_and_serv_op->Stop(); server_thread.join(); listen_and_serv_op.reset(); + paddle::operators::ListenAndServOp::ResetPort(); } From 3d846fc3f28f136b52a0dd39fc608ac5bc9b0ad4 Mon Sep 17 00:00:00 2001 From: wanghaoshuang Date: Wed, 2 May 2018 15:45:08 +0800 Subject: [PATCH 17/52] Make Variable support for future.division. --- python/paddle/fluid/layers/math_op_patch.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/python/paddle/fluid/layers/math_op_patch.py b/python/paddle/fluid/layers/math_op_patch.py index 08a0184c2c..1754061c4b 100644 --- a/python/paddle/fluid/layers/math_op_patch.py +++ b/python/paddle/fluid/layers/math_op_patch.py @@ -169,7 +169,9 @@ def monkey_patch_variable(): # a*b == b*a. 
Do not need to reverse explicitly ("__rmul__", "elementwise_mul", False), ("__div__", "elementwise_div", False), + ("__truediv__", "elementwise_div", False), ("__rdiv__", "elementwise_div", True), + ("__rtruediv__", "elementwise_div", True), ("__pow__", "elementwise_pow", False), ("__rpow__", "elementwise_pow", True), # for logical compare From da960ada49694b09e2f74cfeec4774c394ce066f Mon Sep 17 00:00:00 2001 From: Yancey1989 Date: Wed, 2 May 2018 16:14:25 +0800 Subject: [PATCH 18/52] redefine distribute transpiler api --- benchmark/cluster/vgg16/vgg16_fluid.py | 2 -- python/paddle/fluid/distribute_transpiler.py | 31 ++++++++++++++----- .../fluid/tests/book/test_fit_a_line.py | 7 +---- .../tests/book/test_image_classification.py | 7 +---- .../tests/book/test_label_semantic_roles.py | 7 +---- .../tests/book/test_machine_translation.py | 7 +---- .../fluid/tests/book/test_recognize_digits.py | 7 +---- .../tests/book/test_recommender_system.py | 7 +---- .../tests/book/test_understand_sentiment.py | 7 +---- .../paddle/fluid/tests/book/test_word2vec.py | 7 +---- 10 files changed, 31 insertions(+), 58 deletions(-) diff --git a/benchmark/cluster/vgg16/vgg16_fluid.py b/benchmark/cluster/vgg16/vgg16_fluid.py index 8b29227cfa..7e0b887544 100644 --- a/benchmark/cluster/vgg16/vgg16_fluid.py +++ b/benchmark/cluster/vgg16/vgg16_fluid.py @@ -239,8 +239,6 @@ def main(): t = fluid.DistributeTranspiler() t.transpile( - optimize_ops, - params_grads, trainer_id=args.task_index, pservers=args.ps_hosts, trainers=trainers) diff --git a/python/paddle/fluid/distribute_transpiler.py b/python/paddle/fluid/distribute_transpiler.py index e63411782a..079d90f585 100644 --- a/python/paddle/fluid/distribute_transpiler.py +++ b/python/paddle/fluid/distribute_transpiler.py @@ -137,8 +137,6 @@ def split_dense_variable(var_list, class DistributeTranspiler: def transpile(self, - optimize_ops, - params_grads, trainer_id, program=None, pservers="127.0.0.1:6174", @@ -169,11 +167,6 @@ class DistributeTranspiler: 4. append ops that should run on current server instance. 5. add listen_and_serv op - :param optimize_ops: op list of optimization, should be the - return value of Optimizer.minimize - :type optimize_ops: list - :param params_grads: list of tuple(weight, gradient) - :type params_grads: list :param trainer_id: one unique id for each trainer in a job. :type trainer_id: int :param program: program to transpile, default is default_main_program @@ -194,7 +187,6 @@ class DistributeTranspiler: program = default_main_program() self.origin_program = program self.trainer_num = trainers - self.optimize_ops = optimize_ops self.sync_mode = sync_mode # TODO(typhoonzero): currently trainer_id is fetched from cluster system # like Kubernetes, we should port this to use etcd later when developing @@ -202,6 +194,7 @@ class DistributeTranspiler: self.trainer_id = trainer_id pserver_endpoints = pservers.split(",") self.pserver_endpoints = pserver_endpoints + self.optimize_ops, params_grads = self._get_optimize_pass() # process lookup_table_op # 1. 
check all lookup_table_op is distributed @@ -1147,3 +1140,25 @@ class DistributeTranspiler: # we only need to append op for once break return lr_ops + + def _get_optimize_pass(self): + block = self.origin_program.global_block() + opt_ops = [] + params_grads = [] + for op in block.ops: + if self._is_opt_op(op): + opt_ops.append(op) + params_grads.append((self.origin_program.global_block().var( + op.input("Param")[0]), + self.origin_program.global_block().var( + op.input("Grad")[0]))) + elif op.type == "scale": + # for adam optimize op + for in_name in op.input_arg_names: + if in_name.startswith("beta1_pow_acc") or \ + in_name.startswith("beta2_pow_acc"): + opt_ops.append(op) + break + else: + pass + return opt_ops, params_grads diff --git a/python/paddle/fluid/tests/book/test_fit_a_line.py b/python/paddle/fluid/tests/book/test_fit_a_line.py index 6dfc2997ae..ecb34699af 100644 --- a/python/paddle/fluid/tests/book/test_fit_a_line.py +++ b/python/paddle/fluid/tests/book/test_fit_a_line.py @@ -80,12 +80,7 @@ def train(use_cuda, save_dirname, is_local): trainer_id = int(os.getenv("PADDLE_INIT_TRAINER_ID")) training_role = os.getenv("TRAINING_ROLE", "TRAINER") t = fluid.DistributeTranspiler() - t.transpile( - optimize_ops, - params_grads, - trainer_id, - pservers=pserver_endpoints, - trainers=trainers) + t.transpile(trainer_id, pservers=pserver_endpoints, trainers=trainers) if training_role == "PSERVER": pserver_prog = t.get_pserver_program(current_endpoint) pserver_startup = t.get_startup_program(current_endpoint, diff --git a/python/paddle/fluid/tests/book/test_image_classification.py b/python/paddle/fluid/tests/book/test_image_classification.py index 09f994c370..8ff4f6d47a 100644 --- a/python/paddle/fluid/tests/book/test_image_classification.py +++ b/python/paddle/fluid/tests/book/test_image_classification.py @@ -189,12 +189,7 @@ def train(net_type, use_cuda, save_dirname, is_local): trainer_id = int(os.getenv("PADDLE_INIT_TRAINER_ID")) training_role = os.getenv("TRAINING_ROLE", "TRAINER") t = fluid.DistributeTranspiler() - t.transpile( - optimize_ops, - params_grads, - trainer_id, - pservers=pserver_endpoints, - trainers=trainers) + t.transpile(trainer_id, pservers=pserver_endpoints, trainers=trainers) if training_role == "PSERVER": pserver_prog = t.get_pserver_program(current_endpoint) pserver_startup = t.get_startup_program(current_endpoint, diff --git a/python/paddle/fluid/tests/book/test_label_semantic_roles.py b/python/paddle/fluid/tests/book/test_label_semantic_roles.py index d9cd76952e..50ef29c457 100644 --- a/python/paddle/fluid/tests/book/test_label_semantic_roles.py +++ b/python/paddle/fluid/tests/book/test_label_semantic_roles.py @@ -259,12 +259,7 @@ def train(use_cuda, save_dirname=None, is_local=True): trainer_id = int(os.getenv("PADDLE_INIT_TRAINER_ID")) training_role = os.getenv("TRAINING_ROLE", "TRAINER") t = fluid.DistributeTranspiler() - t.transpile( - optimize_ops, - params_grads, - trainer_id, - pservers=pserver_endpoints, - trainers=trainers) + t.transpile(trainer_id, pservers=pserver_endpoints, trainers=trainers) if training_role == "PSERVER": pserver_prog = t.get_pserver_program(current_endpoint) pserver_startup = t.get_startup_program(current_endpoint, diff --git a/python/paddle/fluid/tests/book/test_machine_translation.py b/python/paddle/fluid/tests/book/test_machine_translation.py index 830d78df8b..46c6b9c29a 100644 --- a/python/paddle/fluid/tests/book/test_machine_translation.py +++ b/python/paddle/fluid/tests/book/test_machine_translation.py @@ -231,12 +231,7 @@ 
def train_main(use_cuda, is_sparse, is_local=True): trainer_id = int(os.getenv("PADDLE_INIT_TRAINER_ID")) training_role = os.getenv("TRAINING_ROLE", "TRAINER") t = fluid.DistributeTranspiler() - t.transpile( - optimize_ops, - params_grads, - trainer_id, - pservers=pserver_endpoints, - trainers=trainers) + t.transpile(trainer_id, pservers=pserver_endpoints, trainers=trainers) if training_role == "PSERVER": pserver_prog = t.get_pserver_program(current_endpoint) pserver_startup = t.get_startup_program(current_endpoint, diff --git a/python/paddle/fluid/tests/book/test_recognize_digits.py b/python/paddle/fluid/tests/book/test_recognize_digits.py index 5ec6890c1b..c115aa4d7d 100644 --- a/python/paddle/fluid/tests/book/test_recognize_digits.py +++ b/python/paddle/fluid/tests/book/test_recognize_digits.py @@ -162,12 +162,7 @@ def train(nn_type, trainer_id = int(os.getenv("PADDLE_INIT_TRAINER_ID")) training_role = os.getenv("TRAINING_ROLE", "TRAINER") t = fluid.DistributeTranspiler() - t.transpile( - optimize_ops, - params_grads, - trainer_id, - pservers=pserver_endpoints, - trainers=trainers) + t.transpile(trainer_id, pservers=pserver_endpoints, trainers=trainers) if training_role == "PSERVER": pserver_prog = t.get_pserver_program(current_endpoint) pserver_startup = t.get_startup_program(current_endpoint, diff --git a/python/paddle/fluid/tests/book/test_recommender_system.py b/python/paddle/fluid/tests/book/test_recommender_system.py index 2172c275b8..d022dedbff 100644 --- a/python/paddle/fluid/tests/book/test_recommender_system.py +++ b/python/paddle/fluid/tests/book/test_recommender_system.py @@ -261,12 +261,7 @@ def train(use_cuda, save_dirname, is_local=True): trainer_id = int(os.getenv("PADDLE_INIT_TRAINER_ID")) training_role = os.getenv("TRAINING_ROLE", "TRAINER") t = fluid.DistributeTranspiler() - t.transpile( - optimize_ops, - params_grads, - trainer_id, - pservers=pserver_endpoints, - trainers=trainers) + t.transpile(trainer_id, pservers=pserver_endpoints, trainers=trainers) if training_role == "PSERVER": pserver_prog = t.get_pserver_program(current_endpoint) pserver_startup = t.get_startup_program(current_endpoint, diff --git a/python/paddle/fluid/tests/book/test_understand_sentiment.py b/python/paddle/fluid/tests/book/test_understand_sentiment.py index dedd153778..241778e303 100644 --- a/python/paddle/fluid/tests/book/test_understand_sentiment.py +++ b/python/paddle/fluid/tests/book/test_understand_sentiment.py @@ -213,12 +213,7 @@ def train(word_dict, trainer_id = int(os.getenv("PADDLE_INIT_TRAINER_ID")) training_role = os.getenv("TRAINING_ROLE", "TRAINER") t = fluid.DistributeTranspiler() - t.transpile( - optimize_ops, - params_grads, - trainer_id, - pservers=pserver_endpoints, - trainers=trainers) + t.transpile(trainer_id, pservers=pserver_endpoints, trainers=trainers) if training_role == "PSERVER": pserver_prog = t.get_pserver_program(current_endpoint) pserver_startup = t.get_startup_program(current_endpoint, diff --git a/python/paddle/fluid/tests/book/test_word2vec.py b/python/paddle/fluid/tests/book/test_word2vec.py index 8929779de9..6dec0f6857 100644 --- a/python/paddle/fluid/tests/book/test_word2vec.py +++ b/python/paddle/fluid/tests/book/test_word2vec.py @@ -145,12 +145,7 @@ def train(use_cuda, is_sparse, is_parallel, save_dirname, is_local=True): trainer_id = int(os.getenv("PADDLE_INIT_TRAINER_ID")) training_role = os.getenv("TRAINING_ROLE", "TRAINER") t = fluid.DistributeTranspiler() - t.transpile( - optimize_ops, - params_grads, - trainer_id, - pservers=pserver_endpoints, - 
trainers=trainers) + t.transpile(trainer_id, pservers=pserver_endpoints, trainers=trainers) if training_role == "PSERVER": pserver_prog = t.get_pserver_program(current_endpoint) pserver_startup = t.get_startup_program(current_endpoint, From caa4027d9dadb99af28084565b7d3f4c8b17e8d5 Mon Sep 17 00:00:00 2001 From: Yu Yang Date: Wed, 2 May 2018 17:12:45 +0800 Subject: [PATCH 19/52] Follow comments --- paddle/fluid/operators/math/blas_impl.cu.h | 11 +++-------- 1 file changed, 3 insertions(+), 8 deletions(-) diff --git a/paddle/fluid/operators/math/blas_impl.cu.h b/paddle/fluid/operators/math/blas_impl.cu.h index 86e4946991..89935829ab 100644 --- a/paddle/fluid/operators/math/blas_impl.cu.h +++ b/paddle/fluid/operators/math/blas_impl.cu.h @@ -126,14 +126,9 @@ inline void Blas::GEMM( CUDA_R_32F, algo)); #else // CUDA 7.5 does not support cublasGemmEx, hence we fall back to use hgemm - const half h_alpha = static_cast(alpha); - const half h_beta = static_cast(beta); - const half *h_A = reinterpret_cast(A); - const half *h_B = reinterpret_cast(B); - half *h_C = reinterpret_cast(C); - - CUBlas(context_.cublas_handle(), cuTransB, cuTransA, N, M, - K, &h_alpha, h_B, ldb, h_A, lda, &h_beta, h_C, N); + CUBlas::GEMM(context_.cublas_handle(), cuTransB, cuTransA, + N, M, K, &h_alpha, h_B, ldb, h_A, lda, + &h_beta, h_C, N); #endif // CUDA_VERSION >= 8000 } From 1bb579a3f5f8b077faa32ce13ae34617f6d04e3d Mon Sep 17 00:00:00 2001 From: Yu Yang Date: Wed, 2 May 2018 17:33:26 +0800 Subject: [PATCH 20/52] A naive trainer implementation --- python/paddle/fluid/__init__.py | 3 +- python/paddle/fluid/layers/io.py | 6 +- python/paddle/fluid/optimizer.py | 3 +- .../book/word2vec/no_test_word2vec_new_api.py | 12 +- python/paddle/fluid/trainer.py | 188 ++++++++++++++++-- 5 files changed, 185 insertions(+), 27 deletions(-) diff --git a/python/paddle/fluid/__init__.py b/python/paddle/fluid/__init__.py index 1e6482e3c1..bd325bd257 100644 --- a/python/paddle/fluid/__init__.py +++ b/python/paddle/fluid/__init__.py @@ -21,8 +21,7 @@ import executor from executor import * import trainer -from trainer import Trainer -from trainer import Event +from trainer import * import inferencer from inferencer import Inferencer diff --git a/python/paddle/fluid/layers/io.py b/python/paddle/fluid/layers/io.py index cc71c2136a..a5570b653e 100644 --- a/python/paddle/fluid/layers/io.py +++ b/python/paddle/fluid/layers/io.py @@ -50,8 +50,6 @@ def data(name, dtype(int|float): The type of data : float32, float_16, int etc type(VarType): The output type. By default it is LOD_TENSOR. lod_level(int): The LoD Level. 0 means the input data is not a sequence. - main_program(Program): Name of the main program that calls this - startup_program(Program): Name of the startup program stop_gradient(bool): A boolean that mentions whether gradient should flow. 
Returns: @@ -74,13 +72,15 @@ def data(name, if append_batch_size: shape = [-1] + shape # append batch size as -1 - return helper.create_global_variable( + data_var = helper.create_global_variable( name=name, shape=shape, dtype=dtype, type=type, stop_gradient=stop_gradient, lod_level=lod_level) + data_var.is_data = True + return data_var class BlockGuardServ(BlockGuard): diff --git a/python/paddle/fluid/optimizer.py b/python/paddle/fluid/optimizer.py index 9ae43b3e93..0a314ddfd7 100644 --- a/python/paddle/fluid/optimizer.py +++ b/python/paddle/fluid/optimizer.py @@ -28,7 +28,8 @@ from contextlib import contextmanager __all__ = [ 'SGD', 'Momentum', 'Adagrad', 'Adam', 'Adamax', 'DecayedAdagrad', 'SGDOptimizer', 'MomentumOptimizer', 'AdagradOptimizer', 'AdamOptimizer', - 'AdamaxOptimizer', 'DecayedAdagradOptimizer', 'Adadelta', 'ModelAverage' + 'AdamaxOptimizer', 'DecayedAdagradOptimizer', 'Adadelta', 'ModelAverage', + 'Optimizer' ] diff --git a/python/paddle/fluid/tests/book/word2vec/no_test_word2vec_new_api.py b/python/paddle/fluid/tests/book/word2vec/no_test_word2vec_new_api.py index 272db7b573..30939cae29 100644 --- a/python/paddle/fluid/tests/book/word2vec/no_test_word2vec_new_api.py +++ b/python/paddle/fluid/tests/book/word2vec/no_test_word2vec_new_api.py @@ -79,9 +79,9 @@ def inference_network(is_sparse): return predict_word -def train_network(): +def train_network(is_sparse): next_word = fluid.layers.data(name='nextw', shape=[1], dtype='int64') - predict_word = inference_network() + predict_word = inference_network(is_sparse) cost = fluid.layers.cross_entropy(input=predict_word, label=next_word) avg_cost = fluid.layers.mean(cost) return avg_cost @@ -94,7 +94,8 @@ def train(use_cuda, is_sparse, save_path): place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace() def event_handler(event): - if isinstance(event, fluid.Event.END_EPOCH): + print type(event) + if isinstance(event, fluid.EndEpochEvent): avg_cost = trainer.test(reader=paddle.dataset.imikolov.test( word_dict, N)) @@ -105,10 +106,11 @@ def train(use_cuda, is_sparse, save_path): sys.exit("got NaN loss, training failed.") trainer = fluid.Trainer( - partial(inference_network, is_sparse), + partial(train_network, is_sparse), fluid.optimizer.SGD(learning_rate=0.001), place=place) - trainer.train(train_reader, 100, event_handler) + trainer.train( + reader=train_reader, num_epochs=100, event_handler=event_handler) def infer(use_cuda, save_path): diff --git a/python/paddle/fluid/trainer.py b/python/paddle/fluid/trainer.py index aeda676502..2362da370a 100644 --- a/python/paddle/fluid/trainer.py +++ b/python/paddle/fluid/trainer.py @@ -12,44 +12,200 @@ # See the License for the specific language governing permissions and # limitations under the License. +import core +import framework +import executor +import data_feeder +import contextlib + +# optimizer is same as the parameter of Trainer.__init__. 
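The `is_data` flag set on the variable above is what lets the new trainer recover feed variables without an explicit feed order. A sketch of that lookup, mirroring the loop that appears in trainer.py further down (`program` is assumed to be the built fluid Program):

    feed_vars = [
        var for var in program.global_block().vars.itervalues()
        if hasattr(var, 'is_data') and var.is_data
    ]
    feeder = fluid.DataFeeder(feed_list=feed_vars, place=fluid.CPUPlace())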
Rename it to opt_module +import optimizer as opt_module + __all__ = [ 'Trainer', + 'BeginEpochEvent', + 'EndEpochEvent', + 'BeginStepEvent', + 'EndStepEvent', ] -class Event(object): - BEGIN_EPOCH = 0 - END_EPOCH = 1 - BEGIN_STEP = 2 - END_STEP = 3 +class BeginEpochEvent(object): + def __init__(self, epoch_id): + self.epoch = epoch_id + + +class EndEpochEvent(object): + def __init__(self, epoch_id): + self.epoch = epoch_id - def __init__(self): - self.step = 0 - self.epoch = 0 - self.type = Event.BEGIN_EPOCH + +class BeginStepEvent(object): + def __init__(self, epoch_id, step_id): + self.epoch = epoch_id + self.step = step_id + + +class EndStepEvent(object): + def __init__(self, epoch_id, step_id): + self.epoch = epoch_id + self.step = step_id class Trainer(object): + """ + + Args: + network_func(callable): A function which will return loss. The loss must be a scalar. + optimizer(optimizer.Optimizer): The optimizer should be an instance of Optimizer + params: + place: The device place of this trainer. + """ + def __init__(self, network_func, optimizer, params=None, place=None): # 1. we need to generate a framework.Program by calling # network_func. Reference: fluid.program_guard in # test_word2vec.py + self.scope = self._get_scope_from_params(params) + + self.startup_program = framework.Program() + self.train_program = framework.Program() + + with framework.program_guard(self.train_program, self.startup_program): + loss = network_func() + if not isinstance(optimizer, opt_module.Optimizer): + raise TypeError( + "The optimizer should be an instance of Optimizer") + + optimizer.minimize(loss) + + self.place = Trainer._check_and_get_place(place) # 2. move the default_main_program to self.program and run the # default_startup program on an empty core.Scope() + # Run startup program + if params is None: + exe = executor.Executor(place) + exe.run(self.startup_program, scope=self.scope) # 3. call self.params.add_vars with the initialized scope, it # will add the new vars of the initialized scope into # self.params. - self.network_func = network_func - self.optimizer = optimizer - self.params = params - self.place = place + # TODO(yuyang): This depends on parameters implementation. + # TODO(helin): support distributed training - def train(self, reader, num_epochs, event_handler): - pass + def train(self, + num_epochs, + event_handler, + reader=None, + parallel=False, + feed_order=None): + """ + Train the model. + + Args: + num_epochs: The number of epochs. An epoch will process all data in reader + event_handler: The event handler. A function with type (ev:Event)->void + reader: + parallel: True to use multiple CPUs or GPUs + feed_order: Feeding order of reader. None will follow the defining + order in the program + + Returns: + + """ + if parallel: + raise NotImplementedError( + "Parallel Executor version of trainer is not implemented") + + self._train_by_executor(num_epochs, event_handler, reader, feed_order) def test(self, reader): pass + + def _get_scope_from_params(self, params): + """ + Get Scope from parameter object. + Args: + params(Parameter|None): The parameter object instance. Could be None. + + Returns: New scope if params is None. Or params.scope() + NOTE: This method is WIP. Not fully implemented.
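Taken together, the pieces above give this first cut of the trainer the following usage shape (a hedged sketch: `my_reader` stands for any paddle batch reader, and the program function must return a scalar loss):

    import paddle.fluid as fluid

    def linear_program():  # illustrative; any callable returning a scalar loss works
        x = fluid.layers.data(name='x', shape=[13], dtype='float32')
        y = fluid.layers.data(name='y', shape=[1], dtype='float32')
        pred = fluid.layers.fc(input=x, size=1)
        return fluid.layers.mean(fluid.layers.square_error_cost(input=pred, label=y))

    def handler(event):
        if isinstance(event, fluid.EndEpochEvent):
            print("epoch %d finished" % event.epoch)

    trainer = fluid.Trainer(linear_program, fluid.optimizer.SGD(learning_rate=0.01))
    trainer.train(num_epochs=1, event_handler=handler, reader=my_reader)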
+ """ + if params is None: + return core.Scope() # new scope when params is None + else: + raise NotImplementedError("Not implemented right now.") + + @staticmethod + def _check_and_get_place(place): + """ + Check the type of place or get the default place + Args: + place(None|core.CUDAPlace|core.CPUPlace): the place that trainer will be executed on. + + Raises: + TypeError if the type mismatched. + + Returns: + the original place if it is not None. + if fluid is compiled with CUDA, returns CUDAPlace(0) by default. + Otherwise returns CPUPlace by default. + """ + if place is None: + if core.is_compiled_with_cuda(): + return core.CUDAPlace(0) + else: + return core.CPUPlace() + else: + if not isinstance(place, core.CUDAPlace) and not isinstance( + place, core.CPUPlace): + raise TypeError("Place should be either CUDAPlace or CPUPlace") + return place + + @contextlib.contextmanager + def _prog_and_scope_guard(self): + with framework.program_guard( + main_program=self.train_program, + startup_program=self.startup_program): + with executor.scope_guard(self.scope): + yield + + def _train_by_executor(self, num_epochs, event_handler, reader, feed_order): + """ + Train by Executor and single device. + + Args: + num_epochs: + event_handler: + reader: + feed_order: + + Returns: + + """ + with self._prog_and_scope_guard(): + exe = executor.Executor(self.place) + if feed_order is None: + feed_var_list = [ + var + for var in self.train_program.global_block( + ).vars.itervalues() + if hasattr(var, 'is_data') and var.is_data + ] + else: + feed_var_list = [ + self.train_program.global_block().var(var_name) + for var_name in feed_order + ] + + feeder = data_feeder.DataFeeder( + feed_list=feed_var_list, place=self.place) + for epoch_id in range(num_epochs): + event_handler(BeginEpochEvent(epoch_id)) + for step_id, data in enumerate(reader()): + event_handler(BeginStepEvent(epoch_id, step_id)) + exe.run(feed=feeder.feed(data), fetch_list=[]) + event_handler(EndStepEvent(epoch_id, step_id)) + event_handler(EndEpochEvent(epoch_id)) From 5e151b2c83f70900a47431d27aa33687b407ddd4 Mon Sep 17 00:00:00 2001 From: Yu Yang Date: Wed, 2 May 2018 17:47:11 +0800 Subject: [PATCH 21/52] Follow comment --- paddle/fluid/operators/cross_entropy_op.h | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/paddle/fluid/operators/cross_entropy_op.h b/paddle/fluid/operators/cross_entropy_op.h index 822a83712d..19a2aec92b 100644 --- a/paddle/fluid/operators/cross_entropy_op.h +++ b/paddle/fluid/operators/cross_entropy_op.h @@ -72,13 +72,13 @@ class XeGradFunctor { size_t num_classes) : dx_(dx), dy_(dy), x_(x), label_(label), num_classes_(num_classes) {} - HOSTDEVICE void operator()(size_t label_id) { - auto x_is_true_offset = label_id * num_classes_ + label_[label_id]; - for (size_t x_offset = label_id * num_classes_; - x_offset < (label_id + 1) * num_classes_; ++x_offset) { + HOSTDEVICE void operator()(size_t sample_id) { + auto x_is_true_offset = sample_id * num_classes_ + label_[sample_id]; + for (size_t x_offset = sample_id * num_classes_; + x_offset < (sample_id + 1) * num_classes_; ++x_offset) { dx_[x_offset] = x_offset != x_is_true_offset ? 
static_cast(0) - : -dy_[label_id] / x_[x_offset]; + : -dy_[sample_id] / x_[x_offset]; } } From b8f7fa97b6f2f8787c9fced40004a3cb45795a05 Mon Sep 17 00:00:00 2001 From: chengduoZH Date: Wed, 2 May 2018 20:13:59 +0800 Subject: [PATCH 22/52] replace __shfl with __shfl_sync --- paddle/cuda/src/hl_top_k.cu | 9 +++++---- paddle/fluid/operators/top_k_op.cu | 7 ++++++- paddle/fluid/platform/cuda_primitives.h | 7 +++++++ 3 files changed, 18 insertions(+), 5 deletions(-) diff --git a/paddle/cuda/src/hl_top_k.cu b/paddle/cuda/src/hl_top_k.cu index 59ba552f56..4a737d5ba7 100644 --- a/paddle/cuda/src/hl_top_k.cu +++ b/paddle/cuda/src/hl_top_k.cu @@ -12,9 +12,9 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include "hl_base.h" -#include "hl_sparse.ph" -#include "hl_top_k.h" +#include "paddle/cuda/include/hl_base.h" +#include "paddle/cuda/include/hl_sparse.ph" +#include "paddle/cuda/include/hl_top_k.h" #include "paddle/utils/Logging.h" // using namespace hppl; @@ -244,8 +244,9 @@ __device__ __forceinline__ void blockReduce(Pair* shTopK, if (--beamSize == 0) break; __syncthreads(); + // temporary solution unsigned mask = 0u; - // CREATE_SHFL_MASK(mask, tid < len); + CREATE_SHFL_MASK(mask, true); if (tid == maxId[0]) { if (beam < maxLength) { diff --git a/paddle/fluid/operators/top_k_op.cu b/paddle/fluid/operators/top_k_op.cu index d7f4d383ce..a2e3973fe8 100644 --- a/paddle/fluid/operators/top_k_op.cu +++ b/paddle/fluid/operators/top_k_op.cu @@ -15,6 +15,7 @@ limitations under the License. */ #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/operators/top_k_op.h" #include "paddle/fluid/platform/assert.h" +#include "paddle/fluid/platform/cuda_primitives.h" namespace paddle { namespace operators { @@ -235,8 +236,12 @@ __device__ __forceinline__ void BlockReduce(Pair* sh_topk, int* maxid, sh_topk[tid] = topk[*beam]; } } + // temporary solution + unsigned mask = 0u; + CREATE_SHFL_MASK(mask, true); + if (maxid[0] / 32 == warp) { - if (__shfl(*beam, (maxid[0]) % 32, 32) == MaxLength) break; + if (__shfl_sync(mask, *beam, (maxid[0]) % 32, 32) == MaxLength) break; } } } diff --git a/paddle/fluid/platform/cuda_primitives.h b/paddle/fluid/platform/cuda_primitives.h index 866ff30a8b..0f6e6159b6 100644 --- a/paddle/fluid/platform/cuda_primitives.h +++ b/paddle/fluid/platform/cuda_primitives.h @@ -72,6 +72,13 @@ template __forceinline__ __device__ T __shfl_down_sync(unsigned, T val, int delta) { return __shfl_down(val, delta); } + +template +__forceinline__ __device__ T __shfl_sync(unsigned, T val, int src_line, + int width) { + return __shfl(val, src_line, width); +} + #define CREATE_SHFL_MASK(mask, predicate) mask = 0u; #else #define FULL_WARP_MASK 0xFFFFFFFF From 6422c0e4f6814536ba7772e431858c84840d417b Mon Sep 17 00:00:00 2001 From: "yi.wu" Date: Wed, 2 May 2018 23:12:18 +0800 Subject: [PATCH 23/52] update by comment --- paddle/fluid/operators/listen_and_serv_op.cc | 5 ++++- paddle/fluid/operators/send_recv_op_test.cc | 11 +++++------ .../paddle/fluid/tests/unittests/test_dist_train.py | 2 +- 3 files changed, 10 insertions(+), 8 deletions(-) diff --git a/paddle/fluid/operators/listen_and_serv_op.cc b/paddle/fluid/operators/listen_and_serv_op.cc index 350c9c8563..038a2aa1f1 100644 --- a/paddle/fluid/operators/listen_and_serv_op.cc +++ b/paddle/fluid/operators/listen_and_serv_op.cc @@ -308,7 +308,10 @@ void ListenAndServOp::RunImpl(const framework::Scope &scope, 
rpc_service_->WaitServerReady(); // Write to a file of server selected port for python use. - SavePort(); + std::string file_path = + string::Sprintf("/tmp/paddle.%d.selected_port", + static_cast(::getpid())); + SavePort(file_path); if (sync_mode) { RunSyncLoop(&executor, program, &recv_scope, prefetch_block); } else { diff --git a/paddle/fluid/operators/send_recv_op_test.cc b/paddle/fluid/operators/send_recv_op_test.cc index 0d495d8d15..eb51f301bf 100644 --- a/paddle/fluid/operators/send_recv_op_test.cc +++ b/paddle/fluid/operators/send_recv_op_test.cc @@ -198,8 +198,11 @@ TEST(SendRecvOp, CPUSparse) { std::thread server_thread(StartServerNet, true, &initialized); while (!initialized) { } - static_cast(listen_and_serv_op.get()) - ->WaitServerReady(); + auto *listen_and_serv_op_ptr = + static_cast( + listen_and_serv_op.get()); + ASSERT_TRUE(listen_and_serv_op_ptr != nullptr); + listen_and_serv_op_ptr->WaitServerReady(); // local net f::Scope scope; @@ -208,10 +211,6 @@ TEST(SendRecvOp, CPUSparse) { InitSelectedRowsInScope(place, &scope); scope.Var("RPC_CLIENT_VAR"); f::AttributeMap attrs; - auto *listen_and_serv_op_ptr = - static_cast( - listen_and_serv_op.get()); - ASSERT_TRUE(listen_and_serv_op_ptr != nullptr); selected_port = listen_and_serv_op_ptr->GetSelectedPort(); std::string endpoint = paddle::string::Sprintf("127.0.0.1:%d", selected_port); attrs.insert({"endpoints", std::vector({endpoint})}); diff --git a/python/paddle/fluid/tests/unittests/test_dist_train.py b/python/paddle/fluid/tests/unittests/test_dist_train.py index c7fdd06f10..77e9a8f7e7 100644 --- a/python/paddle/fluid/tests/unittests/test_dist_train.py +++ b/python/paddle/fluid/tests/unittests/test_dist_train.py @@ -34,7 +34,7 @@ class TestSendOp(unittest.TestCase): p.start() time.sleep(10) - with open("/tmp/paddle.selected_port", "r") as fn: + with open("/tmp/paddle.%d.selected_port" % p.pid, "r") as fn: selected_port = int(fn.readlines()[0]) self.init_client(place, selected_port) From 753ea15d26576103e2592b500a2d443246408f54 Mon Sep 17 00:00:00 2001 From: Lei Wang Date: Tue, 1 May 2018 19:11:06 -0700 Subject: [PATCH 24/52] Build: add cicheck task. --- paddle/scripts/paddle_build.sh | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/paddle/scripts/paddle_build.sh b/paddle/scripts/paddle_build.sh index 53455fd860..1595cc9e8a 100755 --- a/paddle/scripts/paddle_build.sh +++ b/paddle/scripts/paddle_build.sh @@ -40,6 +40,7 @@ function print_usage() { ${BLUE}capi${NONE}: generate paddle CAPI package ${BLUE}fluid_inference_lib${NONE}: deploy fluid inference library ${BLUE}check_style${NONE}: run code style check + ${BLUE}cicheck${NONE}: run CI tasks " } @@ -453,6 +454,8 @@ function gen_capi_package() { } function gen_fluid_inference_lib() { + mkdir -p ${PADDLE_ROOT}/build + cd ${PADDLE_ROOT}/build if [ ${WITH_C_API:-OFF} == "OFF" ] ; then cat < Date: Wed, 2 May 2018 15:41:45 -0700 Subject: [PATCH 25/52] improve trainer API - The trainer and inferencer will load params from disk if param_path argument is not None in their constructor. - Remove params.py, we will expose core.Scope to the user if needed (e.g., for GAN). Currently we will not expose it, unless we clearly know doing so can support GAN. - Add `save_params` to Trainer (a TODO item). 
- rename "network" to "program" --- python/paddle/fluid/__init__.py | 6 ++- python/paddle/fluid/inferencer.py | 8 +++- python/paddle/fluid/params.py | 39 ------------------ .../book/word2vec/no_test_word2vec_new_api.py | 20 +++++----- python/paddle/fluid/trainer.py | 40 +++++++------------ 5 files changed, 36 insertions(+), 77 deletions(-) delete mode 100644 python/paddle/fluid/params.py diff --git a/python/paddle/fluid/__init__.py b/python/paddle/fluid/__init__.py index bd325bd257..0f197aab41 100644 --- a/python/paddle/fluid/__init__.py +++ b/python/paddle/fluid/__init__.py @@ -21,7 +21,11 @@ import executor from executor import * import trainer -from trainer import * +from trainer import Trainer +from trainer import BeginEpochEvent +from trainer import EndEpochEvent +from trainer import BeginStepEvent +from trainer import EndStepEvent import inferencer from inferencer import Inferencer diff --git a/python/paddle/fluid/inferencer.py b/python/paddle/fluid/inferencer.py index 3ea50bf196..58e027695a 100644 --- a/python/paddle/fluid/inferencer.py +++ b/python/paddle/fluid/inferencer.py @@ -12,18 +12,22 @@ # See the License for the specific language governing permissions and # limitations under the License. +import core + __all__ = ['Inferencer', ] class Inferencer(object): - def __init__(self, network_func, params, place=None): + def __init__(self, network_func, param_path=None, place=None): # 1. we need to generate a framework.Program by calling # network_func. Reference: fluid.program_guard in test_word2vec.py # 2. move the default_main_program to self.program. # 3. run the default_startup program. - self.params = params + + # 4. load params from param_path into scope + self.scope = core.Scope() self.place = place def infer(self, inputs): diff --git a/python/paddle/fluid/params.py b/python/paddle/fluid/params.py deleted file mode 100644 index a5d257e53a..0000000000 --- a/python/paddle/fluid/params.py +++ /dev/null @@ -1,39 +0,0 @@ -# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from . import core - -__all__ = ['Params', ] - - -class Params(object): - def __init__(self, path=None): - self.scope = core.Scope() - - if path: - self._load(path) - - def _load(self, path): - # reference: load_persistables in io.py - pass - - def save(self, path): - # reference: save_persistables in io.py - pass - - def add_params(self, scope): - # take the keys from the scope, - # if not already exists in self.scope, - # add the key and value into self.scope. 
- pass diff --git a/python/paddle/fluid/tests/book/word2vec/no_test_word2vec_new_api.py b/python/paddle/fluid/tests/book/word2vec/no_test_word2vec_new_api.py index 30939cae29..35e163dc9d 100644 --- a/python/paddle/fluid/tests/book/word2vec/no_test_word2vec_new_api.py +++ b/python/paddle/fluid/tests/book/word2vec/no_test_word2vec_new_api.py @@ -39,7 +39,7 @@ word_dict = paddle.dataset.imikolov.build_dict() dict_size = len(word_dict) -def inference_network(is_sparse): +def inference_program(is_sparse): first_word = fluid.layers.data(name='firstw', shape=[1], dtype='int64') second_word = fluid.layers.data(name='secondw', shape=[1], dtype='int64') third_word = fluid.layers.data(name='thirdw', shape=[1], dtype='int64') @@ -79,9 +79,9 @@ def inference_network(is_sparse): return predict_word -def train_network(is_sparse): +def train_program(is_sparse): next_word = fluid.layers.data(name='nextw', shape=[1], dtype='int64') - predict_word = inference_network(is_sparse) + predict_word = inference_program(is_sparse) cost = fluid.layers.cross_entropy(input=predict_word, label=next_word) avg_cost = fluid.layers.mean(cost) return avg_cost @@ -100,23 +100,25 @@ def train(use_cuda, is_sparse, save_path): word_dict, N)) if avg_cost < 5.0: - trainer.params.save(save_path) + trainer.save_params(save_path) return if math.isnan(avg_cost): sys.exit("got NaN loss, training failed.") trainer = fluid.Trainer( - partial(train_network, is_sparse), + partial(train_program, is_sparse), fluid.optimizer.SGD(learning_rate=0.001), place=place) trainer.train( reader=train_reader, num_epochs=100, event_handler=event_handler) -def infer(use_cuda, save_path): - params = fluid.Params(save_path) +def infer(use_cuda, is_sparse, save_path): place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace() - inferencer = fluid.Inferencer(inference_network, params, place=place) + inferencer = fluid.Inferencer( + partial(inference_program, is_sparse), + param_path=save_path, + place=place) lod = [0, 1] first_word = create_random_lodtensor(lod, place, low=0, high=dict_size - 1) @@ -138,7 +140,7 @@ def main(use_cuda, is_sparse): save_path = "word2vec.inference.model" train(use_cuda, is_sparse, save_path) - infer(use_cuda, save_path) + infer(use_cuda, is_sparse, save_path) if __name__ == '__main__': diff --git a/python/paddle/fluid/trainer.py b/python/paddle/fluid/trainer.py index 2362da370a..0aada3deb0 100644 --- a/python/paddle/fluid/trainer.py +++ b/python/paddle/fluid/trainer.py @@ -56,23 +56,22 @@ class Trainer(object): """ Args: - network_func(callable): A function which will return loss. The loss must be a scalar. + program_func(callable): A function which will return loss. The loss must be a scalar. optimizer(optimizer.Optimizer): The optimizer should be an instance of Optimizer - params: place: The device place of this trainer. """ - def __init__(self, network_func, optimizer, params=None, place=None): + def __init__(self, program_func, optimizer, param_path=None, place=None): # 1. we need to generate a framework.Program by calling - # network_func. Reference: fluid.program_guard in + # program_func.
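The parameter handling introduced in this patch closes the save/load loop; in use it looks roughly like this (a sketch following the word2vec test above; `partial` comes from functools and the save path is illustrative):

    from functools import partial

    trainer.save_params("word2vec.inference.model")  # once training has converged

    inferencer = fluid.Inferencer(
        partial(inference_program, is_sparse),
        param_path="word2vec.inference.model",
        place=fluid.CPUPlace())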
Reference: fluid.program_guard in # test_word2vec.py - self.scope = self._get_scope_from_params(params) + self.scope = core.Scope() self.startup_program = framework.Program() self.train_program = framework.Program() with framework.program_guard(self.train_program, self.startup_program): - loss = network_func() + loss = program_func() if not isinstance(optimizer, opt_module.Optimizer): raise TypeError( "The optimizer should be an instance of Optimizer") @@ -84,14 +83,13 @@ class Trainer(object): # 2. move the default_main_program to self.program and run the # default_startup program on an empty core.Scope() # Run startup program - if params is None: - exe = executor.Executor(place) - exe.run(self.startup_program, scope=self.scope) + exe = executor.Executor(place) + exe.run(self.startup_program, scope=self.scope) - # 3. call self.params.add_vars with the initialized scope, it - # will add the new vars of the initialized scope into - # self.params. - # TODO(yuyang): This depends on parameters implementation. + if param_path: + # load params from param_path into scope + # TODO(yuyang): This depends on parameters implementation. + pass # TODO(helin): support distributed training @@ -124,19 +122,9 @@ class Trainer(object): def test(self, reader): pass - def _get_scope_from_params(self, params): - """ - Get Scope from parameter object. - Args: - params(Parameter|None): The parameter object instance. Could be None. - - Returns: New scope if params is None. Or params.scope() - NOTE: This method is WIP. Not fully implemented. - """ - if params is None: - return core.Scope() # new scope when params is None - else: - raise NotImplementedError("Not implemented right now.") + def save_params(self, param_path): + # reference: save_persistables in io.py + pass @staticmethod def _check_and_get_place(place): From 0fca8a14ef94d061686a7c0100c6c5cf156093bf Mon Sep 17 00:00:00 2001 From: Helin Wang Date: Wed, 2 May 2018 17:09:40 -0700 Subject: [PATCH 26/52] Fix fluid/__init__.py --- python/paddle/fluid/__init__.py | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/python/paddle/fluid/__init__.py b/python/paddle/fluid/__init__.py index 0f197aab41..dcf4e2a8e0 100644 --- a/python/paddle/fluid/__init__.py +++ b/python/paddle/fluid/__init__.py @@ -30,9 +30,6 @@ from trainer import EndStepEvent import inferencer from inferencer import Inferencer -import params -from params import Params - import io import evaluator import initializer @@ -61,7 +58,7 @@ from parallel_executor import ParallelExecutor Tensor = LoDTensor __all__ = framework.__all__ + executor.__all__ + concurrency.__all__ +\ - trainer.__all__ + inferencer.__all__ + params.__all__ + [ + trainer.__all__ + inferencer.__all__ + [ 'io', 'initializer', 'layers', From 4fbde42cdf2a10c9dc69f36ce911ca3bdadf22dd Mon Sep 17 00:00:00 2001 From: chengduo Date: Thu, 3 May 2018 09:28:35 +0800 Subject: [PATCH 27/52] Fix __shfl_down_sync_ of cross_entropy (#10345) * fix __shfl_down_sync_ of cross_entropy * use reduceSum * "fix ci" --- .../fluid/operators/elementwise_op_function.h | 42 +---------- paddle/fluid/operators/math/cross_entropy.cu | 65 +++------------- paddle/fluid/operators/row_conv_op.cu | 2 +- paddle/fluid/platform/cuda_device_function.h | 74 +++++++++++++++++++ paddle/fluid/platform/cuda_primitives.h | 13 ---- 5 files changed, 88 insertions(+), 108 deletions(-) create mode 100644 paddle/fluid/platform/cuda_device_function.h diff --git a/paddle/fluid/operators/elementwise_op_function.h b/paddle/fluid/operators/elementwise_op_function.h index 
953aedc850..8b052611f8 100644 --- a/paddle/fluid/operators/elementwise_op_function.h +++ b/paddle/fluid/operators/elementwise_op_function.h @@ -22,6 +22,7 @@ limitations under the License. */ #ifdef __NVCC__ #include #include +#include "paddle/fluid/platform/cuda_device_function.h" #include "paddle/fluid/platform/cuda_primitives.h" constexpr int ELEMWISE_MAX_BLOCK_DIM = 1024; #endif @@ -336,43 +337,6 @@ static void ElemwiseGradBroadcast1CPU(const T* x, const T* y, const T* out, } #ifdef __NVCC__ - -template -__device__ T reduceSum(T val, int tid, int len) { - // NOTE(zcd): The warp size should be taken from the - // parameters of the GPU but not specified as 32 simply. - // To make the reduceSum more efficiently, - // I use Warp-Level Parallelism and assume the Warp size - // is 32 which may be different for different GPU, - // but most card's warp size is 32. - const int warpSize = 32; - __shared__ T shm[warpSize]; - unsigned mask = 0u; - CREATE_SHFL_MASK(mask, tid < len); - - for (int offset = warpSize / 2; offset > 0; offset /= 2) - val += platform::__shfl_down_sync(mask, val, offset); - - if (tid < warpSize) shm[tid] = 0; - - __syncthreads(); - - if (tid % warpSize == 0) { - shm[tid / warpSize] = val; - } - __syncthreads(); - - CREATE_SHFL_MASK(mask, tid < warpSize); - - if (tid < warpSize) { - val = shm[tid]; - for (int offset = warpSize / 2; offset > 0; offset /= 2) - val += platform::__shfl_down_sync(mask, val, offset); - } - - return val; -} - template static __global__ void ElemwiseGradBroadcast1CUDAKernel( const T* x, const T* y, const T* out, const T* dout, int h, int w, @@ -395,7 +359,7 @@ static __global__ void ElemwiseGradBroadcast1CUDAKernel( if (dy) { h = h > ELEMWISE_MAX_BLOCK_DIM ? ELEMWISE_MAX_BLOCK_DIM : h; - val = reduceSum(val, tid, h); + val = paddle::platform::reduceSum(val, tid, h); if (threadIdx.x == 0) { dy[j] = val; } @@ -472,7 +436,7 @@ static __global__ void ElemwiseGradBroadcast2CUDAKernel( if (dy) { int h = pre * post; h = h > ELEMWISE_MAX_BLOCK_DIM ? ELEMWISE_MAX_BLOCK_DIM : h; - val = reduceSum(val, tid, h); + val = paddle::platform::reduceSum(val, tid, h); if (threadIdx.x == 0) { dy[j] = val; } diff --git a/paddle/fluid/operators/math/cross_entropy.cu b/paddle/fluid/operators/math/cross_entropy.cu index 6d2ba2bd0d..0de58d5fdd 100644 --- a/paddle/fluid/operators/math/cross_entropy.cu +++ b/paddle/fluid/operators/math/cross_entropy.cu @@ -13,6 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. 
*/ #include "paddle/fluid/operators/math/cross_entropy.h" +#include "paddle/fluid/platform/cuda_device_function.h" #include "paddle/fluid/platform/cuda_primitives.h" namespace paddle { @@ -30,66 +31,22 @@ __global__ void CrossEntropyKernel(T* Y, const T* X, const int64_t* label, } } -template <typename T> -__device__ __forceinline__ T sum_single_warp(T val) { - val += platform::__shfl_down_sync(0, val, 16); - val += platform::__shfl_down_sync(0, val, 8); - val += platform::__shfl_down_sync(0, val, 4); - val += platform::__shfl_down_sync(0, val, 2); - val += platform::__shfl_down_sync(0, val, 1); - return val; -} - -// CUDA do not support dynamic arrary in template -// https://stackoverflow.com/questions/20497209 -template <typename T> -struct SharedMemory { - // Ensure that we won't compile any un-specialized types - __device__ T* GetPointer() { return NULL; } -}; - -template <> -struct SharedMemory<float> { - __device__ float* GetPointer() { - extern __shared__ float s_float[]; - return s_float; - } -}; - -template <> -struct SharedMemory<double> { - __device__ double* GetPointer() { - extern __shared__ double s_double[]; - return s_double; - } -}; - template <typename T> __global__ void SoftCrossEntropyKernel(T* Y, const T* X, const T* label, const int class_num) { int tid = threadIdx.x; - SharedMemory<T> d_sum_shared; - T* d_sum = d_sum_shared.GetPointer(); - d_sum[tid] = 0; + T val = 0; - int cur_idx = tid; - int next_idx = blockIdx.x * class_num + tid; - while (cur_idx < class_num) { - d_sum[tid] += - math::TolerableValue<T>()(std::log(X[next_idx])) * label[next_idx]; - next_idx += blockDim.x; - cur_idx += blockDim.x; + int idx = blockIdx.x * class_num + tid; + int end = blockIdx.x * class_num + class_num; + for (; idx < end; idx += blockDim.x) { + val += math::TolerableValue<T>()(std::log(X[idx])) * label[idx]; } - __syncthreads(); + val = paddle::platform::reduceSum(val, tid, blockDim.x); + if (threadIdx.x == 0) { + Y[blockIdx.x] = -val; } - - for (unsigned int stride = blockDim.x >> 1; stride >= 32; stride >>= 1) { - if (tid < stride) d_sum[tid] += d_sum[tid + stride]; - __syncthreads(); - } - - T val = d_sum[tid]; - val = sum_single_warp<T>(val); - if (tid == 0) Y[blockIdx.x] = -val; } } // namespace @@ -113,9 +70,7 @@ class CrossEntropyFunctor { ? 512 : pow(2, static_cast<int>(std::log2(class_num))); - SoftCrossEntropyKernel<T><<< - batch_size, block, block * sizeof(T), - reinterpret_cast<const platform::CUDADeviceContext&>(ctx).stream()>>>( + SoftCrossEntropyKernel<T><<<batch_size, block, 0, ctx.stream()>>>( loss_data, prob_data, label_data, class_num); } else { const int64_t* label_data = labels->data(); diff --git a/paddle/fluid/operators/row_conv_op.cu b/paddle/fluid/operators/row_conv_op.cu index dd8e62aca4..79d08cf3d1 100644 --- a/paddle/fluid/operators/row_conv_op.cu +++ b/paddle/fluid/operators/row_conv_op.cu @@ -14,7 +14,7 @@ limitations under the License. */ #include "paddle/fluid/operators/math/math_function.h" #include "paddle/fluid/operators/row_conv_op.h" -#include "paddle/fluid/platform/cuda_primitives.h" +#include "paddle/fluid/platform/cuda_device_function.h" namespace paddle { namespace operators { diff --git a/paddle/fluid/platform/cuda_device_function.h b/paddle/fluid/platform/cuda_device_function.h new file mode 100644 index 0000000000..7cfeaab35b --- /dev/null +++ b/paddle/fluid/platform/cuda_device_function.h @@ -0,0 +1,74 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License.
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once +#include <cuda.h> + +namespace paddle { +namespace platform { + +// __shfl_down and __shfl have been deprecated as of CUDA 9.0. +#if CUDA_VERSION < 9000 +template <typename T> +__forceinline__ __device__ T __shfl_down_sync(unsigned, T val, int delta) { + return __shfl_down(val, delta); +} + +template <typename T> +__forceinline__ __device__ T __shfl_sync(unsigned, T val, int src_line, + int width) { + return __shfl(val, src_line, width); +} +#define CREATE_SHFL_MASK(mask, predicate) mask = 0u; +#else +#define FULL_WARP_MASK 0xFFFFFFFF +#define CREATE_SHFL_MASK(mask, predicate) \ + mask = __ballot_sync(FULL_WARP_MASK, (predicate)) +#endif + +template <typename T> +__device__ T reduceSum(T val, int tid, int len) { + // NOTE(zcd): The warp size should be taken from the + // parameters of the GPU but not specified as 32 simply. + // To make the reduceSum more efficiently, + // I use Warp-Level Parallelism and assume the Warp size + // is 32 which may be different for different GPU, + // but most card's warp size is 32. + const int warpSize = 32; + __shared__ T shm[warpSize]; + unsigned mask = 0u; + CREATE_SHFL_MASK(mask, tid < len); + + for (int offset = warpSize / 2; offset > 0; offset /= 2) + val += platform::__shfl_down_sync(mask, val, offset); + + if (tid < warpSize) shm[tid] = 0; + + if (tid % warpSize == 0) { + shm[tid / warpSize] = val; + } + __syncthreads(); + + CREATE_SHFL_MASK(mask, tid < warpSize); + + if (tid < warpSize) { + val = shm[tid]; + for (int offset = warpSize / 2; offset > 0; offset /= 2) + val += platform::__shfl_down_sync(mask, val, offset); + } + return val; +} + +} // namespace platform +} // namespace paddle diff --git a/paddle/fluid/platform/cuda_primitives.h b/paddle/fluid/platform/cuda_primitives.h index 866ff30a8b..8758af0804 100644 --- a/paddle/fluid/platform/cuda_primitives.h +++ b/paddle/fluid/platform/cuda_primitives.h @@ -66,18 +66,5 @@ CUDA_ATOMIC_WRAPPER(Add, double) { } #endif -// __shfl_down has been deprecated as of CUDA 9.0. -#if CUDA_VERSION < 9000 -template <typename T> -__forceinline__ __device__ T __shfl_down_sync(unsigned, T val, int delta) { - return __shfl_down(val, delta); -} -#define CREATE_SHFL_MASK(mask, predicate) mask = 0u; -#else -#define FULL_WARP_MASK 0xFFFFFFFF -#define CREATE_SHFL_MASK(mask, predicate) \ - mask = __ballot_sync(FULL_WARP_MASK, (predicate)) -#endif - } // namespace platform } // namespace paddle From 4a497b826da9f133ef873be5316f0cf12d280f55 Mon Sep 17 00:00:00 2001 From: Tomasz Patejko Date: Thu, 3 May 2018 03:32:25 +0200 Subject: [PATCH 28/52] MKLDNN implementation of batch normalization (#9904) * Initial implementation of forward pass for MKLDNN batch norm * Added attributes for MKLDNN batch norm * MKLDNN batch norm forward pass passes unittest.
Started working on backward

* Backward pass for MKLDNN batch norm added
* MKLDNN batch norm: scoring added to forward pass
* MKLDNN batch norm: bias as input added; handling AnyLayout when kernel is looked up
* MKLDNN batch norm: python unit tests added; mkldnn tests removed
* MKLDNN batch norm: changes required by cpplint
* MKLDNN batch norm: refactoring the operator
* MKLDNN batch norm: saved variance inverted in backward pass for correct execution of MKLDNN unit tests
* MKLDNN batch norm: refactoring, function for static/const cast to void* added
* MKLDNN batch norm: remove AnyLayout from batch norm
* MKLDNN batch norm: only NCHW format is supported. Unittests refactored
* MKLDNN batch norm: use_mkldnn added to attributes
* MKLDNN batch norm: AnyLayout removed from unittest
* MKLDNN batch norm: added CUDNN defines to batch norm
* MKLDNN batch norm: undefined data_format variable corrected
* MKLDNN batch norm: use_cudnn added, use of setUp method for configuring attributes
* MKLDNN batch norm: added use_cudnn attribute to batch norm operator
* MKLDNN batch norm: correcting batch norm unit tests for MKLDNN
* MKLDNN batch norm: MKLDNN tests moved to another file; reverting changes for saved variance not being inverted
* Change default layout to NCHW
* MKLDNN batch norm: init_kernel_type method added to unit tests
* MKLDNN batch norm: style changes
* MKLDNN batch norm: unit tests refactored
* MKLDNN batch norm: added use_mkldnn attribute to batch norm python interface
---
 .../fluid/operators/batch_norm_mkldnn_op.cc   | 325 ++++++++++++++++++
 paddle/fluid/operators/batch_norm_op.cc       |  35 +-
 python/paddle/fluid/layers/nn.py              |  10 +-
 .../unittests/test_batch_norm_mkldnn_op.py    |  56 +++
 .../tests/unittests/test_batch_norm_op.py     |  49 ++-
 5 files changed, 458 insertions(+), 17 deletions(-)
 create mode 100644 paddle/fluid/operators/batch_norm_mkldnn_op.cc
 create mode 100644 python/paddle/fluid/tests/unittests/test_batch_norm_mkldnn_op.py

diff --git a/paddle/fluid/operators/batch_norm_mkldnn_op.cc b/paddle/fluid/operators/batch_norm_mkldnn_op.cc
new file mode 100644
index 0000000000..0e4a56d4a4
--- /dev/null
+++ b/paddle/fluid/operators/batch_norm_mkldnn_op.cc
@@ -0,0 +1,325 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
*/ + +#include "mkldnn.hpp" +#include "paddle/fluid/operators/batch_norm_op.h" +#include "paddle/fluid/platform/mkldnn_helper.h" + +namespace paddle { +namespace operators { + +using Tensor = framework::Tensor; +using paddle::platform::MKLDNNDeviceContext; +using paddle::platform::MKLDNNMemDesc; +using mkldnn::memory; + +template +using EigenArrayMap = + Eigen::Map>; +template +using ConstEigenArrayMap = + Eigen::Map>; +template +using EigenVectorArrayMap = Eigen::Map>; +template +using ConstEigenVectorArrayMap = + Eigen::Map>; + +namespace { +template +struct bn_type_traits { + using op_type = T; + using op_desc = typename op_type::desc; + using op_prim = typename op_type::primitive_desc; +}; + +template +void copy_to_weights(T scale_begin, T scale_end, T shift_begin, T shift_end, + Container *c) { + auto it = std::begin(*c); + + std::copy(scale_begin, scale_end, std::inserter(*c, it)); + std::copy( + shift_begin, shift_end, + std::inserter(*c, std::next(it, std::distance(scale_begin, scale_end)))); +} + +template +void run_batch_norm_op(Args &&... args) { + Op batch_norm_op{args...}; + + std::vector pipeline; + pipeline.push_back(batch_norm_op); + mkldnn::stream(mkldnn::stream::kind::eager).submit(pipeline).wait(); +} + +template +inline void *cast_const_to_void(const T *t) { + return static_cast(const_cast(t)); +} +} // namespace + +template +class BatchNormMKLDNNOpKernel : public paddle::framework::OpKernel { + public: + void Compute(const framework::ExecutionContext &ctx) const override { + auto data_layout_str = ctx.Attr("data_layout"); + auto data_layout = framework::StringToDataLayout(data_layout_str); + PADDLE_ENFORCE(data_layout == framework::DataLayout::kNCHW, + "MKLDNN batch normalization handles only NCHW data layout"); + + const float epsilon = ctx.Attr("epsilon"); + const float momentum = ctx.Attr("momentum"); + const bool is_test = ctx.Attr("is_test"); + + const auto *x = ctx.Input("X"); + const auto *mean = ctx.Input("Mean"); + const auto *variance = ctx.Input("Variance"); + + auto &dev_ctx = ctx.template device_context(); + auto mkldnn_engine = dev_ctx.GetEngine(); + + auto *y = ctx.Output("Y"); + auto *mean_out = ctx.Output("MeanOut"); + auto *variance_out = ctx.Output("VarianceOut"); + auto *batch_mean = ctx.Output("SavedMean"); + auto *batch_variance = ctx.Output("SavedVariance"); + + const auto *scale = ctx.Input("Scale"); + const auto *shift = ctx.Input("Bias"); + + y->mutable_data(ctx.GetPlace()); + mean_out->mutable_data(ctx.GetPlace()); + variance_out->mutable_data(ctx.GetPlace()); + + if (!is_test) { + batch_mean->mutable_data(ctx.GetPlace()); + batch_variance->mutable_data(ctx.GetPlace()); + } + + auto propagation = is_test == true ? 
mkldnn::prop_kind::forward_scoring + : mkldnn::prop_kind::forward_training; + + auto dims = paddle::framework::vectorize2int(x->dims()); + + auto src_md = + MKLDNNMemDesc(dims, memory::data_type::f32, memory::format::nchw); + auto dst_md = + MKLDNNMemDesc(dims, memory::data_type::f32, memory::format::nchw); + + auto src_pd = mkldnn::memory::primitive_desc{src_md, mkldnn_engine}; + auto dst_pd = mkldnn::memory::primitive_desc{dst_md, mkldnn_engine}; + + auto src = mkldnn::memory{src_pd, cast_const_to_void(x->data())}; + auto dst = mkldnn::memory{dst_pd, y->data()}; + + unsigned flags = mkldnn::use_scale_shift; + if (is_test) flags |= mkldnn::use_global_stats; + + using bn_fwd_types = bn_type_traits; + auto batch_norm_fwd_desc = + bn_fwd_types::op_desc{propagation, src_md, epsilon, flags}; + auto batch_norm_fwd_pd = + bn_fwd_types::op_prim{batch_norm_fwd_desc, mkldnn_engine}; + + const unsigned int ic = dims[1]; + + // MKLDNN requires a single piece of memory for scale and shift/bias data + const size_t scaleshift_size = 2 * ic; + std::vector scaleshift_data; + scaleshift_data.reserve(scaleshift_size); + + copy_to_weights(scale->data(), scale->data() + ic, shift->data(), + shift->data() + ic, &scaleshift_data); + + auto scaleshift_memory = mkldnn::memory{ + batch_norm_fwd_pd.weights_primitive_desc(), scaleshift_data.data()}; + + if (is_test) { + auto mean_memory = mkldnn::memory{batch_norm_fwd_pd.mean_primitive_desc(), + cast_const_to_void(mean->data())}; + + auto variance_memory = + mkldnn::memory{batch_norm_fwd_pd.variance_primitive_desc(), + cast_const_to_void(variance->data())}; + + run_batch_norm_op( + batch_norm_fwd_pd, src, (const mkldnn::primitive::at &)mean_memory, + (const mkldnn::primitive::at &)variance_memory, scaleshift_memory, + dst); + } else { + auto mean_memory = + mkldnn::memory{batch_norm_fwd_pd.mean_primitive_desc(), + cast_const_to_void(batch_mean->data())}; + + auto variance_memory = + mkldnn::memory{batch_norm_fwd_pd.variance_primitive_desc(), + cast_const_to_void(batch_variance->data())}; + + run_batch_norm_op(batch_norm_fwd_pd, src, + scaleshift_memory, dst, + mean_memory, variance_memory); + } + + if (!is_test) { + const unsigned int in = dims[0]; + const unsigned int sample_size = x->numel() / in / ic; + + // saved_xx is use just in this batch of data + EigenVectorArrayMap saved_mean_e( + batch_mean->mutable_data(ctx.GetPlace()), ic); + EigenVectorArrayMap saved_variance_e( + batch_variance->mutable_data(ctx.GetPlace()), ic); + saved_mean_e.setZero(); + saved_variance_e.setZero(); + + const unsigned int x_arr_size = in * ic; + ConstEigenArrayMap x_arr(x->data(), sample_size, x_arr_size); + for (unsigned int nc = 0; nc < x_arr_size; ++nc) { + saved_mean_e(nc % ic) += x_arr.col(nc).sum(); + } + saved_mean_e /= in * sample_size; + for (unsigned int nc = 0; nc < x_arr_size; ++nc) { + saved_variance_e(nc % ic) += + (x_arr.col(nc) - saved_mean_e(nc % ic)).matrix().squaredNorm(); + } + saved_variance_e /= in * sample_size; + + ConstEigenVectorArrayMap mean_arr{mean->data(), ic}; + ConstEigenVectorArrayMap variance_arr{variance->data(), ic}; + + EigenVectorArrayMap running_mean_arr( + mean_out->mutable_data(ctx.GetPlace()), ic); + EigenVectorArrayMap running_var_arr( + variance_out->mutable_data(ctx.GetPlace()), ic); + + auto one_minus_momentum = 1. 
- momentum; + running_mean_arr = + mean_arr * momentum + saved_mean_e * one_minus_momentum; + running_var_arr = + variance_arr * momentum + saved_variance_e * one_minus_momentum; + } + } +}; + +template +class BatchNormMKLDNNGradOpKernel : public paddle::framework::OpKernel { + public: + void Compute(const paddle::framework::ExecutionContext &ctx) const override { + auto data_layout_str = ctx.Attr("data_layout"); + auto data_layout = framework::StringToDataLayout(data_layout_str); + PADDLE_ENFORCE(data_layout == framework::DataLayout::kNCHW, + "MKLDNN batch normalization handles only NCHW data layout"); + + auto &dev_ctx = ctx.template device_context(); + auto mkldnn_engine = dev_ctx.GetEngine(); + + const float epsilon = ctx.Attr("epsilon"); + + const auto *x = ctx.Input("X"); + const auto *scale = ctx.Input("Scale"); + const auto *shift = ctx.Input("Bias"); + const auto *batch_mean = ctx.Input("SavedMean"); + const auto *batch_variance = ctx.Input("SavedVariance"); + + const auto *diff_y = ctx.Input(framework::GradVarName("Y")); + auto *diff_x = ctx.Output(framework::GradVarName("X")); + auto *diff_scale = ctx.Output(framework::GradVarName("Scale")); + auto *diff_shift = ctx.Output(framework::GradVarName("Bias")); + + diff_x->mutable_data(ctx.GetPlace()); + diff_scale->mutable_data(ctx.GetPlace()); + diff_shift->mutable_data(ctx.GetPlace()); + + auto dims = paddle::framework::vectorize2int(x->dims()); + unsigned flags = mkldnn::use_scale_shift | !mkldnn::use_global_stats; + + auto src_md = + MKLDNNMemDesc(dims, memory::data_type::f32, memory::format::nchw); + auto dst_md = + MKLDNNMemDesc(dims, memory::data_type::f32, memory::format::nchw); + auto diff_src_md = + MKLDNNMemDesc(dims, memory::data_type::f32, memory::format::nchw); + auto diff_dst_md = + MKLDNNMemDesc(dims, memory::data_type::f32, memory::format::nchw); + + using bn_bwd_types = bn_type_traits; + using bn_fwd_types = bn_type_traits; + + auto batch_norm_fwd_desc = bn_fwd_types::op_desc{ + mkldnn::prop_kind::forward_training, src_md, epsilon, flags}; + auto batch_norm_fwd_pd = + bn_fwd_types::op_prim{batch_norm_fwd_desc, mkldnn_engine}; + + auto batch_norm_bwd_desc = bn_bwd_types::op_desc{ + mkldnn::prop_kind::backward, diff_dst_md, dst_md, epsilon, flags}; + auto batch_norm_bwd_pd = bn_bwd_types::op_prim{ + batch_norm_bwd_desc, mkldnn_engine, batch_norm_fwd_pd}; + + auto src = mkldnn::memory{{src_md, mkldnn_engine}, + cast_const_to_void(x->data())}; + + auto mean = mkldnn::memory{batch_norm_bwd_pd.mean_primitive_desc(), + cast_const_to_void(batch_mean->data())}; + + auto variance = + mkldnn::memory{batch_norm_bwd_pd.variance_primitive_desc(), + cast_const_to_void(batch_variance->data())}; + + auto diff_dst = mkldnn::memory{{diff_dst_md, mkldnn_engine}, + cast_const_to_void(diff_y->data())}; + + const unsigned int ic = dims[1]; + + const size_t scaleshift_size = 2 * ic; + + std::vector scaleshift_data; + scaleshift_data.reserve(scaleshift_size); + copy_to_weights(scale->data(), scale->data() + ic, shift->data(), + shift->data() + ic, &scaleshift_data); + + auto scaleshift_memory = mkldnn::memory{ + batch_norm_bwd_pd.weights_primitive_desc(), scaleshift_data.data()}; + + std::vector diff_scaleshift_data; + diff_scaleshift_data.reserve(scaleshift_size); + copy_to_weights(diff_scale->data(), diff_scale->data() + ic, + diff_shift->data(), diff_shift->data() + ic, + &diff_scaleshift_data); + + auto diff_scaleshift_memory = + mkldnn::memory{batch_norm_bwd_pd.diff_weights_primitive_desc(), + diff_scaleshift_data.data()}; + + auto 
diff_src = mkldnn::memory{{diff_src_md, mkldnn_engine}, + static_cast(diff_x->data())}; + + run_batch_norm_op( + batch_norm_bwd_pd, src, mean, variance, diff_dst, scaleshift_memory, + diff_src, diff_scaleshift_memory); + + auto it = std::begin(diff_scaleshift_data); + std::copy(it, std::next(it, ic), diff_scale->data()); + std::copy(std::next(it, ic), std::end(diff_scaleshift_data), + diff_shift->data()); + } +}; +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OP_KERNEL(batch_norm, MKLDNN, paddle::platform::CPUPlace, + ops::BatchNormMKLDNNOpKernel); +REGISTER_OP_KERNEL(batch_norm_grad, MKLDNN, paddle::platform::CPUPlace, + ops::BatchNormMKLDNNGradOpKernel); diff --git a/paddle/fluid/operators/batch_norm_op.cc b/paddle/fluid/operators/batch_norm_op.cc index f8b2505ccf..b4bd40d031 100644 --- a/paddle/fluid/operators/batch_norm_op.cc +++ b/paddle/fluid/operators/batch_norm_op.cc @@ -15,6 +15,9 @@ limitations under the License. */ #include "paddle/fluid/operators/batch_norm_op.h" #include #include "paddle/fluid/framework/data_layout.h" +#ifdef PADDLE_WITH_MKLDNN +#include "paddle/fluid/platform/mkldnn_helper.h" +#endif namespace paddle { namespace operators { @@ -106,7 +109,18 @@ class BatchNormOp : public framework::OperatorWithKernel { PADDLE_ENFORCE_EQ(bn_param_type, framework::ToDataType( ctx.Input("Variance")->type()), "Variance input should be of float type"); - return framework::OpKernelType(input_data_type, ctx.GetPlace()); + + framework::LibraryType library_{framework::LibraryType::kPlain}; +#ifdef PADDLE_WITH_MKLDNN + if (library_ == framework::LibraryType::kPlain && + platform::CanMKLDNNBeUsed(ctx)) { + library_ = framework::LibraryType::kMKLDNN; + } +#endif + // TODO(pzelazko-intel): enable MKLDNN layout when it's ready + framework::DataLayout layout = framework::DataLayout::kAnyLayout; + return framework::OpKernelType(input_data_type, ctx.GetPlace(), layout, + library_); } }; @@ -151,6 +165,9 @@ class BatchNormOpMaker : public framework::OpProtoAndCheckerMaker { "Variance of the current mini batch, " "will apply to output when training") .AsIntermediate(); + AddAttr("use_mkldnn", + "(bool, default false) Only used in mkldnn kernel") + .SetDefault(false); AddComment(R"DOC( Batch Normalization. 
@@ -349,8 +366,19 @@ class BatchNormGradOp : public framework::OperatorWithKernel { if (t == nullptr) { PADDLE_THROW("can't find Y@GRAD"); } - return framework::OpKernelType(framework::ToDataType(t->type()), - ctx.GetPlace()); + + framework::LibraryType library_{framework::LibraryType::kPlain}; +#ifdef PADDLE_WITH_MKLDNN + if (library_ == framework::LibraryType::kPlain && + platform::CanMKLDNNBeUsed(ctx)) { + library_ = framework::LibraryType::kMKLDNN; + } +#endif + // TODO(pzelazko-intel): enable MKLDNN layout when it's ready + framework::DataLayout layout = framework::DataLayout::kAnyLayout; + return framework::OpKernelType( + framework::ToDataType(ctx.Input("X")->type()), ctx.GetPlace(), + layout, library_); } }; @@ -474,6 +502,7 @@ class BatchNormGradMaker : public framework::SingleGradOpDescMaker { op->SetInput(framework::GradVarName("Y"), OutputGrad("Y")); op->SetInput("Scale", Input("Scale")); + op->SetInput("Bias", Input("Bias")); op->SetInput("SavedMean", Output("SavedMean")); op->SetInput("SavedVariance", Output("SavedVariance")); diff --git a/python/paddle/fluid/layers/nn.py b/python/paddle/fluid/layers/nn.py index 7f16bf2a0c..93e8d0bf29 100644 --- a/python/paddle/fluid/layers/nn.py +++ b/python/paddle/fluid/layers/nn.py @@ -1496,6 +1496,7 @@ def batch_norm(input, bias_attr=None, data_layout='NCHW', in_place=False, + use_mkldnn=False, name=None, moving_mean_name=None, moving_variance_name=None, @@ -1574,9 +1575,12 @@ def batch_norm(input, "SavedMean": saved_mean, "SavedVariance": saved_variance }, - attrs={"momentum": momentum, - "epsilon": epsilon, - "is_test": is_test}) + attrs={ + "momentum": momentum, + "epsilon": epsilon, + "is_test": is_test, + "use_mkldnn": use_mkldnn + }) return helper.append_activation(batch_norm_out) diff --git a/python/paddle/fluid/tests/unittests/test_batch_norm_mkldnn_op.py b/python/paddle/fluid/tests/unittests/test_batch_norm_mkldnn_op.py new file mode 100644 index 0000000000..f6097d4b84 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_batch_norm_mkldnn_op.py @@ -0,0 +1,56 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import unittest +import numpy as np +import paddle.fluid.core as core +from paddle.fluid.op import Operator +import paddle.fluid as fluid +from op_test import OpTest +from paddle.fluid.framework import grad_var_name +from test_batch_norm_op import TestBatchNormOpInference, TestBatchNormOpTraining, _reference_training, _reference_grad + + +class TestMKLDNNBatchNormOpTraining(TestBatchNormOpTraining): + def init_kernel_type(self): + self.use_mkldnn = True + self.data_formats = ["NCHW"] + + def ref_forward_backward(self, x, y_grad, scale, bias, mean, variance, + epsilon, momentum, shape, data_layout): + # run forward + y, saved_mean, saved_variance = _reference_training( + x, scale, bias, epsilon, data_layout) + mean_out = saved_mean * (1. - momentum) + momentum * mean + variance_out = saved_variance * (1. 
- momentum) + momentum * variance + # run backward + x_grad, scale_grad, bias_grad = _reference_grad( + x, y_grad, scale, saved_mean, saved_variance, epsilon, data_layout) + + return y, mean_out, variance_out, saved_mean, saved_variance, x_grad, scale_grad, bias_grad + + +class TestMKLDNNBatchNormOpInference(TestBatchNormOpInference): + def init_kernel_type(self): + self.use_mkldnn = True + + def test_check_output(self): + place = core.CPUPlace() + data_format = "NCHW" + + self.check_with_place(place, data_format, self.dtype, [2, 3, 4, 5]) + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_batch_norm_op.py b/python/paddle/fluid/tests/unittests/test_batch_norm_op.py index a0e78a4607..4216d83653 100644 --- a/python/paddle/fluid/tests/unittests/test_batch_norm_op.py +++ b/python/paddle/fluid/tests/unittests/test_batch_norm_op.py @@ -158,6 +158,8 @@ def set_output_grad(scope, outputs, place, feed_dict=None): class TestBatchNormOpInference(unittest.TestCase): def setUp(self): self.dtype = np.float32 + self.use_mkldnn = False + self.init_kernel_type() def __assert_close(self, tensor, np_array, msg, atol=1e-4): self.assertTrue(np.allclose(np.array(tensor), np_array, atol=atol), msg) @@ -230,6 +232,7 @@ class TestBatchNormOpInference(unittest.TestCase): # attrs is_test=True, data_layout=data_layout, + use_mkldnn=self.use_mkldnn, epsilon=epsilon) batch_norm_op.run(scope, place) @@ -254,10 +257,15 @@ class TestBatchNormOpInference(unittest.TestCase): [2, 3, 4, 5]) self.check_with_place(place, data_format, self.dtype, [2, 3]) + def init_kernel_type(self): + pass + class TestFP16BatchNormOpInference(TestBatchNormOpInference): def setUp(self): self.dtype = np.float16 + self.use_mkldnn = False + self.init_kernel_type() def test_check_output(self): places = [] @@ -274,9 +282,28 @@ class TestFP16BatchNormOpInference(TestBatchNormOpInference): class TestBatchNormOpTraining(unittest.TestCase): + def setUp(self): + self.use_mkldnn = False + self.data_formats = ["NCHW", "NHWC"] + self.init_kernel_type() + def __assert_close(self, tensor, np_array, msg, atol=1e-4): np.allclose(np.array(tensor), np_array, atol=atol) + def ref_forward_backward(self, x, y_grad, scale, bias, mean, variance, + epsilon, momentum, shape, data_layout): + # run forward + y, saved_mean, var_ref = _reference_training(x, scale, bias, epsilon, + data_layout) + mean_out = saved_mean * (1. - momentum) + momentum * mean + variance_out = var_ref * (1. - momentum) + momentum * variance + saved_variance = 1. / np.sqrt(var_ref + epsilon) + # run backward + x_grad, scale_grad, bias_grad = _reference_grad( + x, y_grad, scale, saved_mean, var_ref, epsilon, data_layout) + + return y, mean_out, variance_out, saved_mean, saved_variance, x_grad, scale_grad, bias_grad + def test_forward_backward(self): def test_with_place(place, data_layout, shape): # attr @@ -295,16 +322,11 @@ class TestBatchNormOpTraining(unittest.TestCase): mean = np.zeros(scale_shape).astype(np.float32) variance = np.ones(scale_shape).astype(np.float32) - # run forward - y, saved_mean, var_ref = _reference_training(x, scale, bias, - epsilon, data_layout) - mean_out = saved_mean * (1. - momentum) + momentum * mean - variance_out = var_ref * (1. - momentum) + momentum * variance - saved_variance = 1. 
/ np.sqrt(var_ref + epsilon) - # run backward y_grad = np.random.random_sample(shape).astype(np.float32) - x_grad, scale_grad, bias_grad = _reference_grad( - x, y_grad, scale, saved_mean, var_ref, epsilon, data_layout) + + y, mean_out, variance_out, saved_mean, saved_variance, x_grad, scale_grad, bias_grad = self.ref_forward_backward( + x, y_grad, scale, bias, mean, variance, epsilon, momentum, + shape, data_layout) var_dict = locals() var_dict['y@GRAD'] = y_grad @@ -344,7 +366,8 @@ class TestBatchNormOpTraining(unittest.TestCase): "momentum": momentum, "epsilon": epsilon, "is_test": False, - "data_layout": data_layout + "data_layout": data_layout, + "use_mkldnn": self.use_mkldnn }) block.create_var(name='y@GRAD', dtype='float32', shape=y.shape) @@ -387,13 +410,17 @@ class TestBatchNormOpTraining(unittest.TestCase): print "op test forward passed: ", str(place), data_layout places = [core.CPUPlace()] + if core.is_compiled_with_cuda() and core.op_support_gpu("batch_norm"): places.append(core.CUDAPlace(0)) for place in places: - for data_format in ["NCHW", "NHWC"]: + for data_format in self.data_formats: test_with_place(place, data_format, [2, 3, 4, 5]) + def init_kernel_type(self): + pass + if __name__ == '__main__': unittest.main() From 6084af47ef4358ab2b54636aa41d976f2ea34056 Mon Sep 17 00:00:00 2001 From: Yiqun Liu Date: Thu, 3 May 2018 10:28:16 +0800 Subject: [PATCH 29/52] Fix the bug when a input variable of op is dispensable. (#10268) * Fix the bug when a input variable of op is dispensable. * Add HasInputs/Outputs interfaces to OperatorBase. * Remove the unreferenced header file. --- paddle/capi/Matrix.cpp | 2 +- paddle/fluid/framework/operator.cc | 41 +++++++++++++++++++++++++++--- paddle/fluid/framework/operator.h | 2 ++ paddle/fluid/platform/profiler.h | 1 - 4 files changed, 40 insertions(+), 6 deletions(-) diff --git a/paddle/capi/Matrix.cpp b/paddle/capi/Matrix.cpp index 24b0020636..733d49cacf 100644 --- a/paddle/capi/Matrix.cpp +++ b/paddle/capi/Matrix.cpp @@ -108,7 +108,7 @@ paddle_error paddle_matrix_get_row(paddle_matrix mat, paddle_error paddle_matrix_get_shape(paddle_matrix mat, uint64_t* height, uint64_t* width) { - if (mat == nullptr) return kPD_NULLPTR; + if (mat == nullptr || cast(mat)->mat == nullptr) return kPD_NULLPTR; if (height != nullptr) { *height = cast(mat)->mat->getHeight(); } diff --git a/paddle/fluid/framework/operator.cc b/paddle/fluid/framework/operator.cc index 32576423a6..d70f26026c 100644 --- a/paddle/fluid/framework/operator.cc +++ b/paddle/fluid/framework/operator.cc @@ -93,6 +93,14 @@ void OperatorBase::Run(const Scope& scope, const platform::Place& place) { RunImpl(scope, place); } +bool OperatorBase::HasInputs(const std::string& name) const { + if (inputs_.find(name) != inputs_.end()) { + return true; + } else { + return false; + } +} + std::string OperatorBase::Input(const std::string& name) const { auto& ins = Inputs(name); PADDLE_ENFORCE_LE(ins.size(), 1UL, @@ -109,6 +117,14 @@ const std::vector& OperatorBase::Inputs( return it->second; } +bool OperatorBase::HasOutputs(const std::string& name) const { + if (outputs_.find(name) != outputs_.end()) { + return true; + } else { + return false; + } +} + std::string OperatorBase::Output(const std::string& name) const { auto& outs = Outputs(name); PADDLE_ENFORCE_LE(outs.size(), 1UL, @@ -220,13 +236,18 @@ void OperatorBase::CheckAllInputOutputSet() const { if (op_info == nullptr || op_info->proto_ == nullptr) return; for (auto& in : op_info->Proto().inputs()) { - PADDLE_ENFORCE(inputs_.find(in.name()) != 
inputs_.end(), - "Type %s's input %s is not set", Type(), in.name()); + if (!in.dispensable()) { + PADDLE_ENFORCE(inputs_.find(in.name()) != inputs_.end(), + "Operator %s's input, %s, is not set", Type(), in.name()); + } } for (auto& out : op_info->Proto().outputs()) { - PADDLE_ENFORCE(outputs_.find(out.name()) != outputs_.end(), - "Type %s's output %s is not set", Type(), out.name()); + if (!out.dispensable()) { + PADDLE_ENFORCE(outputs_.find(out.name()) != outputs_.end(), + "Operator %s's output, %s, is not set", Type(), + out.name()); + } } } @@ -332,6 +353,9 @@ class RuntimeInferShapeContext : public InferShapeContext { : op_(op), scope_(scope) {} bool HasInput(const std::string& name) const override { + if (!op_.HasInputs(name)) { + return false; + } auto& ins = Inputs(name); size_t length = ins.size(); if (length == 0) { @@ -345,6 +369,9 @@ class RuntimeInferShapeContext : public InferShapeContext { } bool HasOutput(const std::string& name) const override { + if (!op_.HasOutputs(name)) { + return false; + } auto& outs = Outputs(name); size_t length = outs.size(); if (length == 0) { @@ -358,6 +385,9 @@ class RuntimeInferShapeContext : public InferShapeContext { } bool HasInputs(const std::string& name) const override { + if (!op_.HasInputs(name)) { + return false; + } auto inputs = op_.Inputs(name); if (inputs.empty()) { return false; @@ -371,6 +401,9 @@ class RuntimeInferShapeContext : public InferShapeContext { } bool HasOutputs(const std::string& name) const override { + if (!op_.HasOutputs(name)) { + return false; + } auto outputs = op_.Outputs(name); if (outputs.empty()) { return false; diff --git a/paddle/fluid/framework/operator.h b/paddle/fluid/framework/operator.h index 826cc57b72..d373c48b1a 100644 --- a/paddle/fluid/framework/operator.h +++ b/paddle/fluid/framework/operator.h @@ -105,6 +105,7 @@ class OperatorBase { const VariableNameMap& Inputs() const { return inputs_; } const VariableNameMap& Outputs() const { return outputs_; } + bool HasInputs(const std::string& name) const; //! Get a input with argument's name described in `op_proto` std::string Input(const std::string& name) const; //! Get a input which has multiple variables. @@ -112,6 +113,7 @@ class OperatorBase { //! Get all inputs variable names std::vector InputVars() const; + bool HasOutputs(const std::string& name) const; //! Get a output with argument's name described in `op_proto` std::string Output(const std::string& name) const; //! Get an output which has multiple variables. diff --git a/paddle/fluid/platform/profiler.h b/paddle/fluid/platform/profiler.h index b07427c8f6..428d9ebcea 100644 --- a/paddle/fluid/platform/profiler.h +++ b/paddle/fluid/platform/profiler.h @@ -18,7 +18,6 @@ limitations under the License. 
*/ #include #include #include "paddle/fluid/platform/device_context.h" -#include "paddle/fluid/platform/profiler.pb.h" namespace paddle { namespace platform { From 6a5bf0376c0fb26b56755ad33542a348a19e55ec Mon Sep 17 00:00:00 2001 From: Shan Yi <35982308+shanyi15@users.noreply.github.com> Date: Thu, 3 May 2018 10:53:12 +0800 Subject: [PATCH 30/52] fix toctree in multi_cluster/index_en.rst --- doc/v2/howto/cluster/multi_cluster/index_en.rst | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/doc/v2/howto/cluster/multi_cluster/index_en.rst b/doc/v2/howto/cluster/multi_cluster/index_en.rst index b69bd5b2db..9bc1eb2e37 100644 --- a/doc/v2/howto/cluster/multi_cluster/index_en.rst +++ b/doc/v2/howto/cluster/multi_cluster/index_en.rst @@ -8,28 +8,28 @@ The user's cluster environment is not the same. To facilitate everyone's deploym .. toctree:: :maxdepth: 1 - k8s_cn.md - k8s_distributed_cn.md + k8s_en.md + k8s_distributed_en.md `OpenMPI `_ is a mature high-performance parallel computing framework, which is widely used in the field of HPC. The following guide describes how to use OpenMPI to build PaddlePaddle's cluster training task: .. toctree:: :maxdepth: 1 - openmpi_cn.md + openmpi_en.md `Fabric `_ is a convenient tool for program deployment and management. We provide a way to deploy and manage with Fabric. If you want to know more about it, please read the following guidelines: .. toctree:: :maxdepth: 1 - fabric_cn.md + fabric_en.md We also support the deployment of PaddlePaddle on AWS. Learn more about: .. toctree:: :maxdepth: 1 - k8s_aws_cn.md + k8s_aws_en.md -The examples can be found under `cluster_train_v2 `_ . \ No newline at end of file +The examples can be found under `cluster_train_v2 `_ . From e9737d600f44b13810c91c497a2ce42d96efddfe Mon Sep 17 00:00:00 2001 From: Yancey1989 Date: Thu, 3 May 2018 11:03:07 +0800 Subject: [PATCH 31/52] add a private function to find adam opt pass --- python/paddle/fluid/distribute_transpiler.py | 28 +++++++++++--------- 1 file changed, 16 insertions(+), 12 deletions(-) diff --git a/python/paddle/fluid/distribute_transpiler.py b/python/paddle/fluid/distribute_transpiler.py index 079d90f585..c180e7b210 100644 --- a/python/paddle/fluid/distribute_transpiler.py +++ b/python/paddle/fluid/distribute_transpiler.py @@ -401,11 +401,8 @@ class DistributeTranspiler: # HACK: optimization global ops only used to scale beta1 and beta2 # replace it with dependency engine. for op in self.optimize_ops: - if op.type == "scale": - for in_name in op.input_arg_names: - if in_name.startswith("beta1_pow_acc") or \ - in_name.startswith("beta2_pow_acc"): - global_ops.append(op) + if self._is_adam_connected_op(op): + global_ops.append(op) def __append_optimize_op__(op, block, grad_to_block_id): if self._is_opt_op(op): @@ -1152,13 +1149,20 @@ class DistributeTranspiler: op.input("Param")[0]), self.origin_program.global_block().var( op.input("Grad")[0]))) - elif op.type == "scale": - # for adam optimize op - for in_name in op.input_arg_names: - if in_name.startswith("beta1_pow_acc") or \ - in_name.startswith("beta2_pow_acc"): - opt_ops.append(op) - break + elif self._is_adam_connected_op(op): + opt_ops.append(op) else: pass return opt_ops, params_grads + + def _is_adam_connected_op(self, op): + """ + A hack function to determinate whether the input operator + is connected to optimize operator. 
+ """ + if op.type == "scale": + for in_name in op.input_arg_names: + if in_name.startswith("beta1_pow_acc") or \ + in_name.startswith("beta2_pow_acc"): + return True + return False From f63ff90b03b444ff7562bf72fca6877ad7b068a2 Mon Sep 17 00:00:00 2001 From: dzhwinter Date: Thu, 3 May 2018 11:13:30 +0800 Subject: [PATCH 32/52] Fix/fp64 (#10346) * "fix double type error" * "fix ci" * "softmax fp64" * "fix momentum" * "fix ci" --- paddle/fluid/operators/momentum_op.cc | 8 ++++++++ paddle/fluid/operators/scale_op.cc | 10 ++++------ paddle/fluid/operators/softmax_op.cc | 6 ++++-- paddle/fluid/operators/softmax_op.cu.cc | 6 ++++-- paddle/fluid/operators/top_k_op.cc | 3 ++- paddle/fluid/operators/top_k_op.cu | 3 ++- 6 files changed, 24 insertions(+), 12 deletions(-) diff --git a/paddle/fluid/operators/momentum_op.cc b/paddle/fluid/operators/momentum_op.cc index 6c70970e15..f13ec53905 100644 --- a/paddle/fluid/operators/momentum_op.cc +++ b/paddle/fluid/operators/momentum_op.cc @@ -17,6 +17,8 @@ limitations under the License. */ namespace paddle { namespace operators { +using Tensor = framework::Tensor; + class MomentumOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; @@ -50,6 +52,12 @@ class MomentumOp : public framework::OperatorWithKernel { ctx->SetOutputDim("ParamOut", param_dim); ctx->SetOutputDim("VelocityOut", param_dim); } + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext &ctx) const override { + auto input_data_type = + framework::ToDataType(ctx.Input("Param")->type()); + return framework::OpKernelType(input_data_type, ctx.GetPlace()); + } }; class MomentumOpMaker : public framework::OpProtoAndCheckerMaker { diff --git a/paddle/fluid/operators/scale_op.cc b/paddle/fluid/operators/scale_op.cc index 1e938638c9..7dcf33c989 100644 --- a/paddle/fluid/operators/scale_op.cc +++ b/paddle/fluid/operators/scale_op.cc @@ -35,7 +35,6 @@ class ScaleOp : public framework::OperatorWithKernel { } }; -template class ScaleOpMaker : public framework::OpProtoAndCheckerMaker { public: ScaleOpMaker(OpProto *proto, OpAttrChecker *op_checker) @@ -47,9 +46,9 @@ Scale operator $$Out = scale*X$$ )DOC"); - AddAttr("scale", - "(float, default 1.0)" - "The scaling factor of the scale operator.") + AddAttr("scale", + "(float, default 1.0)" + "The scaling factor of the scale operator.") .SetDefault(1.0); } }; @@ -73,8 +72,7 @@ class ScaleGradMaker : public framework::SingleGradOpDescMaker { namespace ops = paddle::operators; -REGISTER_OPERATOR(scale, ops::ScaleOp, ops::ScaleOpMaker, - ops::ScaleGradMaker); +REGISTER_OPERATOR(scale, ops::ScaleOp, ops::ScaleOpMaker, ops::ScaleGradMaker); REGISTER_OP_CPU_KERNEL( scale, ops::ScaleKernel, ops::ScaleKernel, diff --git a/paddle/fluid/operators/softmax_op.cc b/paddle/fluid/operators/softmax_op.cc index 2741ba95bc..aa7b192e32 100644 --- a/paddle/fluid/operators/softmax_op.cc +++ b/paddle/fluid/operators/softmax_op.cc @@ -164,7 +164,9 @@ REGISTER_OPERATOR(softmax, ops::SoftmaxOp, ops::SoftmaxOpMaker, paddle::framework::DefaultGradOpDescMaker); REGISTER_OPERATOR(softmax_grad, ops::SoftmaxOpGrad); REGISTER_OP_CPU_KERNEL( - softmax, ops::SoftmaxKernel); + softmax, ops::SoftmaxKernel, + ops::SoftmaxKernel); REGISTER_OP_CPU_KERNEL( softmax_grad, - ops::SoftmaxGradKernel); + ops::SoftmaxGradKernel, + ops::SoftmaxGradKernel); diff --git a/paddle/fluid/operators/softmax_op.cu.cc b/paddle/fluid/operators/softmax_op.cu.cc index 0c1f7cef7a..5fb4f011d9 100644 --- 
a/paddle/fluid/operators/softmax_op.cu.cc +++ b/paddle/fluid/operators/softmax_op.cu.cc @@ -19,6 +19,8 @@ namespace ops = paddle::operators; namespace plat = paddle::platform; REGISTER_OP_CUDA_KERNEL( softmax, ops::SoftmaxKernel, + ops::SoftmaxKernel, ops::SoftmaxKernel); -REGISTER_OP_CUDA_KERNEL(softmax_grad, - ops::SoftmaxGradKernel); +REGISTER_OP_CUDA_KERNEL( + softmax_grad, ops::SoftmaxGradKernel, + ops::SoftmaxGradKernel); diff --git a/paddle/fluid/operators/top_k_op.cc b/paddle/fluid/operators/top_k_op.cc index 2e4e8caed5..942a5de3f9 100644 --- a/paddle/fluid/operators/top_k_op.cc +++ b/paddle/fluid/operators/top_k_op.cc @@ -75,4 +75,5 @@ namespace ops = paddle::operators; REGISTER_OPERATOR(top_k, ops::TopkOp, ops::TopkOpMaker, paddle::framework::EmptyGradOpMaker); REGISTER_OP_CPU_KERNEL(top_k, - ops::TopkKernel); + ops::TopkKernel, + ops::TopkKernel); diff --git a/paddle/fluid/operators/top_k_op.cu b/paddle/fluid/operators/top_k_op.cu index d7f4d383ce..2ea9fd1d29 100644 --- a/paddle/fluid/operators/top_k_op.cu +++ b/paddle/fluid/operators/top_k_op.cu @@ -318,4 +318,5 @@ class TopkOpCUDAKernel : public framework::OpKernel { } // namespace operators } // namespace paddle -REGISTER_OP_CUDA_KERNEL(top_k, paddle::operators::TopkOpCUDAKernel); +REGISTER_OP_CUDA_KERNEL(top_k, paddle::operators::TopkOpCUDAKernel, + paddle::operators::TopkOpCUDAKernel); From e97c1a8ca04bdbfe8906e74f9433afad58fa2d7f Mon Sep 17 00:00:00 2001 From: chengduoZH Date: Thu, 3 May 2018 12:58:32 +0800 Subject: [PATCH 33/52] fix __shfl --- paddle/fluid/platform/cuda_device_function.h | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/paddle/fluid/platform/cuda_device_function.h b/paddle/fluid/platform/cuda_device_function.h index 7cfeaab35b..2405f33d4f 100644 --- a/paddle/fluid/platform/cuda_device_function.h +++ b/paddle/fluid/platform/cuda_device_function.h @@ -35,6 +35,16 @@ __forceinline__ __device__ T __shfl_sync(unsigned, T val, int src_line, #define FULL_WARP_MASK 0xFFFFFFFF #define CREATE_SHFL_MASK(mask, predicate) \ mask = __ballot_sync(FULL_WARP_MASK, (predicate)) +template +__forceinline__ __device__ T __shfl_down_sync(unsigned mask, T val, int delta) { + return __shfl_down_sync(mask, val, delta); +} + +template +__forceinline__ __device__ T __shfl_sync(unsigned mask, T val, int src_line, + int width) { + return __shfl_sync(mask, val, src_line, width); +} #endif template From 9ab8faaf76057c21be368adb0e23999b3acc5028 Mon Sep 17 00:00:00 2001 From: xzl Date: Thu, 3 May 2018 13:05:01 +0800 Subject: [PATCH 34/52] fix pool with mask layer bug --- paddle/math/Matrix.cpp | 28 +++++++++++----------------- 1 file changed, 11 insertions(+), 17 deletions(-) diff --git a/paddle/math/Matrix.cpp b/paddle/math/Matrix.cpp index 0e84cb3739..bcd6dfe1fd 100644 --- a/paddle/math/Matrix.cpp +++ b/paddle/math/Matrix.cpp @@ -2157,26 +2157,20 @@ void CpuMatrix::maxPoolForward(Matrix& inputMat, int wend = wstart + sizeX; wstart = wstart < 0 ? 0 : wstart; wend = wend < (int)imgSizeW ? wend : (int)imgSizeW; - if (maskData == NULL) { - real tmp = -(real)FLT_MAX; - for (int h = hstart; h < hend; ++h) { - for (int w = wstart; w < wend; ++w) { - tmp = tmp < inputData[h * imgSizeW + w] - ? 
inputData[h * imgSizeW + w] - : tmp; - } - } - outData[ph * outputW + pw] = tmp; - } else { - for (int h = hstart; h < hend; ++h) { - for (int w = wstart; w < wend; ++w) { - if (outData[ph * outputW + pw] < inputData[h * imgSizeW + w]) { - outData[ph * outputW + pw] = inputData[h * imgSizeW + w]; - maskData[ph * outputW + pw] = h * imgSizeW + w; - } + + real maxval = -(real)FLT_MAX; + int max_index = -1; + for (int h = hstart; h < hend; ++h) { + for (int w = wstart; w < wend; ++w) { + if (maxval < inputData[h * imgSizeW + w]) { + maxval = inputData[h * imgSizeW + w]; + max_index = h * imgSizeW + w; } } } + + outData[ph * outputW + pw] = maxval; + if (maskData != NULL) maskData[ph * outputW + pw] = max_index; } } // compute offset From bc8160350b9375ce4a96b6e3927acd2fd9e74c9b Mon Sep 17 00:00:00 2001 From: Yu Yang Date: Thu, 3 May 2018 13:41:19 +0800 Subject: [PATCH 35/52] Fix compile --- paddle/fluid/operators/math/math_function_test.cu | 14 ++++++-------- 1 file changed, 6 insertions(+), 8 deletions(-) diff --git a/paddle/fluid/operators/math/math_function_test.cu b/paddle/fluid/operators/math/math_function_test.cu index eb6b77f958..3d03981b9f 100644 --- a/paddle/fluid/operators/math/math_function_test.cu +++ b/paddle/fluid/operators/math/math_function_test.cu @@ -279,8 +279,9 @@ TEST(math_function, gemm_notrans_cublas_fp16) { paddle::platform::float16* c = input3_gpu.mutable_data(gpu_place); - GetBlas(context).GEMM(false, false, m, n, k, float16(1), a, 3, b + 1, - 4, float16(1), c + 1, 4); + GetBlas(context).GEMM( + false, false, m, n, k, static_cast(1), a, 3, + b + 1, 4, static_cast(1), c + 1, 4); paddle::framework::TensorCopySync(input3_gpu, cpu_place, &input3); @@ -388,12 +389,9 @@ TEST(math_function, gemm_trans_cublas_fp16) { paddle::platform::float16* c = input3_gpu.mutable_data(gpu_place); - GetBlas(context).GEMM(false, true, m, n, k, float16(1), a, 3, b + 3, - 3, float16(1), c + 1, 4); - paddle::operators::math::gemm( - context, false, true, m, n, k, paddle::platform::float16(1), a, 3, b + 3, - 3, paddle::platform::float16(1), c + 1, 4); + GetBlas(context).GEMM( + false, true, m, n, k, static_cast(1), a, 3, + b + 3, 3, static_cast(1), c + 1, 4); paddle::framework::TensorCopySync(input3_gpu, cpu_place, &input3); From 3000e9946f334452c6448d88ce0cf17595410479 Mon Sep 17 00:00:00 2001 From: Abhinav Arora Date: Wed, 2 May 2018 22:46:15 -0700 Subject: [PATCH 36/52] Write the Understand Sentiment book example with stacked LSTM using new API (#10355) * Add understand apiu with stacked lstm for new API * Complete exam --- ...otest_understand_sentiment_stacked_lstm.py | 140 ++++++++++++++++++ 1 file changed, 140 insertions(+) create mode 100644 python/paddle/fluid/tests/book/understand_sentiment/notest_understand_sentiment_stacked_lstm.py diff --git a/python/paddle/fluid/tests/book/understand_sentiment/notest_understand_sentiment_stacked_lstm.py b/python/paddle/fluid/tests/book/understand_sentiment/notest_understand_sentiment_stacked_lstm.py new file mode 100644 index 0000000000..9948e5c023 --- /dev/null +++ b/python/paddle/fluid/tests/book/understand_sentiment/notest_understand_sentiment_stacked_lstm.py @@ -0,0 +1,140 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import print_function + +import paddle +import paddle.fluid as fluid +from functools import partial + +CLASS_DIM = 2 +EMB_DIM = 128 +HID_DIM = 512 +STACKED_NUM = 3 + + +def stacked_lstm_net(data, input_dim, class_dim, emb_dim, hid_dim, stacked_num): + assert stacked_num % 2 == 1 + + emb = fluid.layers.embedding( + input=data, size=[input_dim, emb_dim], is_sparse=True) + + fc1 = fluid.layers.fc(input=emb, size=hid_dim) + lstm1, cell1 = fluid.layers.dynamic_lstm(input=fc1, size=hid_dim) + + inputs = [fc1, lstm1] + + for i in range(2, stacked_num + 1): + fc = fluid.layers.fc(input=inputs, size=hid_dim) + lstm, cell = fluid.layers.dynamic_lstm( + input=fc, size=hid_dim, is_reverse=(i % 2) == 0) + inputs = [fc, lstm] + + fc_last = fluid.layers.sequence_pool(input=inputs[0], pool_type='max') + lstm_last = fluid.layers.sequence_pool(input=inputs[1], pool_type='max') + + prediction = fluid.layers.fc(input=[fc_last, lstm_last], + size=class_dim, + act='softmax') + return prediction + + +def inference_network(word_dict): + data = fluid.layers.data( + name="words", shape=[1], dtype="int64", lod_level=1) + + dict_dim = len(word_dict) + net = stacked_lstm_net(data, dict_dim, CLASS_DIM, EMB_DIM, HID_DIM, + STACKED_NUM) + return net + + +def train_network(word_dict): + prediction = inference_network(word_dict) + label = fluid.layers.data(name="label", shape=[1], dtype="int64") + cost = fluid.layers.cross_entropy(input=prediction, label=label) + avg_cost = fluid.layers.mean(cost) + accuracy = fluid.layers.accuracy(input=prediction, label=label) + return avg_cost, accuracy + + +def train(use_cuda, save_path): + BATCH_SIZE = 128 + EPOCH_NUM = 5 + + word_dict = paddle.dataset.imdb.word_dict() + + train_data = paddle.batch( + paddle.reader.shuffle( + paddle.dataset.imdb.train(word_dict), buf_size=1000), + batch_size=BATCH_SIZE) + + test_data = paddle.batch( + paddle.dataset.imdb.test(word_dict), batch_size=BATCH_SIZE) + + def event_handler(event): + if isinstance(event, fluid.EndIteration): + if (event.batch_id % 10) == 0: + avg_cost, accuracy = trainer.test(reader=test_data) + + print('BatchID {1:04}, Loss {2:2.2}, Acc {3:2.2}'.format( + event.batch_id + 1, avg_cost, accuracy)) + + if accuracy > 0.01: # Low threshold for speeding up CI + trainer.params.save(save_path) + return + + place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace() + trainer = fluid.Trainer( + partial(train_network, word_dict), + optimizer=fluid.optimizer.Adagrad(learning_rate=0.002), + place=place, + event_handler=event_handler) + + trainer.train(train_data, EPOCH_NUM, event_handler=event_handler) + + +def infer(use_cuda, save_path): + params = fluid.Params(save_path) + place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace() + word_dict = paddle.dataset.imdb.word_dict() + inferencer = fluid.Inferencer( + partial(inference_network, word_dict), params, place=place) + + def create_random_lodtensor(lod, place, low, high): + data = np.random.random_integers(low, high, + [lod[-1], 1]).astype("int64") + res = fluid.LoDTensor() + res.set(data, place) + res.set_lod([lod]) + return res + + lod = [0, 4, 10] + 
tensor_words = create_random_lodtensor( + lod, place, low=0, high=len(word_dict) - 1) + results = inferencer.infer({'words': tensor_words}) + print("infer results: ", results) + + +def main(use_cuda): + if use_cuda and not fluid.core.is_compiled_with_cuda(): + return + save_path = "understand_sentiment_stacked_lstm.inference.model" + train(use_cuda, save_path) + infer(use_cuda, save_path) + + +if __name__ == '__main__': + for use_cuda in (False, True): + main(use_cuda=use_cuda) From 62fed4cbb33275d1fc4b02f1617b8b8efddd4b00 Mon Sep 17 00:00:00 2001 From: chengduo Date: Thu, 3 May 2018 15:12:20 +0800 Subject: [PATCH 37/52] fix __shfl_down (#10362) --- paddle/cuda/include/hl_base.h | 5 ++++ paddle/fluid/operators/row_conv_op.cu | 12 +++++++-- paddle/function/RowConvOpGpu.cu | 35 +++++++++++++++------------ 3 files changed, 35 insertions(+), 17 deletions(-) diff --git a/paddle/cuda/include/hl_base.h b/paddle/cuda/include/hl_base.h index 402302a5bf..77f5d82dbe 100644 --- a/paddle/cuda/include/hl_base.h +++ b/paddle/cuda/include/hl_base.h @@ -229,6 +229,11 @@ extern __thread cudaStream_t default_stream; // __shfl has been deprecated as of CUDA 9.0. #if CUDA_VERSION < 9000 +template +__forceinline__ __device__ T __shfl_down_sync(unsigned, T val, int delta) { + return __shfl_down(val, delta); +} + template __forceinline__ __device__ T __shfl_sync(unsigned, T val, int src_line, int width) { diff --git a/paddle/fluid/operators/row_conv_op.cu b/paddle/fluid/operators/row_conv_op.cu index 79d08cf3d1..082f761d37 100644 --- a/paddle/fluid/operators/row_conv_op.cu +++ b/paddle/fluid/operators/row_conv_op.cu @@ -189,6 +189,10 @@ __global__ void RowConvGradFilterImproved(const T *in, const T *dout, } __syncthreads(); + // NOTE(zcd): temporary solution + unsigned mask = 0u; + CREATE_SHFL_MASK(mask, true); + for (int i = 0; i < num_sequence; i++) { int start = static_cast(batch_indices[i]); int end = static_cast(batch_indices[i + 1]); @@ -220,7 +224,7 @@ __global__ void RowConvGradFilterImproved(const T *in, const T *dout, for (int offset = 16; offset > 0; offset = offset / 2) { // blockDim.x is 32. - val += platform::__shfl_down_sync(0, val, offset); + val += platform::__shfl_down_sync(mask, val, offset); } __syncthreads(); @@ -251,6 +255,10 @@ __global__ void RowConvGradFilter(const T *in, const T *dout, int num_sequence, T *sh_in = mem; T *sh_dout = &mem[block_x * block_y]; + // NOTE(zcd): temporary solution + unsigned mask = 0u; + CREATE_SHFL_MASK(mask, true); + for (int i = 0; i < num_sequence; i++) { int start = static_cast(batch_indices[i]); int end = static_cast(batch_indices[i + 1]); @@ -276,7 +284,7 @@ __global__ void RowConvGradFilter(const T *in, const T *dout, int num_sequence, for (int offset = 16; offset > 0; offset = offset / 2) { // blockDim.x is 32. - val += platform::__shfl_down_sync(0, val, offset); + val += platform::__shfl_down_sync(mask, val, offset); } __syncthreads(); diff --git a/paddle/function/RowConvOpGpu.cu b/paddle/function/RowConvOpGpu.cu index 9d8a6d80bb..f820ee9a97 100644 --- a/paddle/function/RowConvOpGpu.cu +++ b/paddle/function/RowConvOpGpu.cu @@ -12,8 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/

-#include "RowConvOp.h"
-#include "hl_base.h"
+#include "paddle/cuda/include/hl_base.h"
+#include "paddle/function/RowConvOp.h"

 namespace paddle {

@@ -94,7 +94,7 @@ __global__ void KeRowConv2(real* y,
 }

 template <>
-void RowConv(GpuMatrix& out,
+void RowConv(GpuMatrix& out,  // NOLINT
              const GpuMatrix& in,
              const GpuMatrix& filter,
              const GpuIVector& seq) {
@@ -144,6 +144,10 @@ __global__ void KeRowConvBwWeight(real* dw,
   }
   __syncthreads();

+  // NOTE(zcd): temporary solution
+  unsigned mask = 0u;
+  CREATE_SHFL_MASK(mask, true);
+
   for (int i = 0; i < numSeq; ++i) {
     const int start = starts[i];
     const int end = starts[i + 1];
@@ -170,11 +174,10 @@ __global__ void KeRowConvBwWeight(real* dw,
         real val = sh_x[tidy][tidx] * sh_dy[tidy][tidx + context - 1 - t];
         __syncthreads();
         // warp size and blockDim.x is 32.
-        val += __shfl_down(val, 16);
-        val += __shfl_down(val, 8);
-        val += __shfl_down(val, 4);
-        val += __shfl_down(val, 2);
-        val += __shfl_down(val, 1);
+
+        for (int offset = 16; offset > 0; offset /= 2)
+          val += __shfl_down_sync(mask, val, offset);
+
         __syncthreads();
         if (tidx == 0) {
           sh_dw[t][tidy] += val;
@@ -205,6 +208,10 @@ __global__ void KeRowConvBwWeight2(real* dw,
   __shared__ real sh_x[BLOCK_H][BLOCK_W];
   __shared__ real sh_dy[BLOCK_H][BLOCK_W];

+  // NOTE(zcd): temporary solution
+  unsigned mask = 0u;
+  CREATE_SHFL_MASK(mask, true);
+
   for (int i = 0; i < numSeq; ++i) {
     const int start = starts[i];
     const int end = starts[i + 1];
@@ -230,11 +237,9 @@ __global__ void KeRowConvBwWeight2(real* dw,
       real val = sh_x[tidy][tidx] * sh_dy[tidy][tidx];
       __syncthreads();
       // warp size and blockDim.x is 32.
-      val += __shfl_down(val, 16);
-      val += __shfl_down(val, 8);
-      val += __shfl_down(val, 4);
-      val += __shfl_down(val, 2);
-      val += __shfl_down(val, 1);
+      for (int offset = 16; offset > 0; offset /= 2)
+        val += __shfl_down_sync(mask, val, offset);
+
       __syncthreads();

       if (tidx == 0 && (gidx + tidy) < width) {
@@ -323,8 +328,8 @@ template <>
 void RowConvGrad(const GpuMatrix& outG,
                  const GpuMatrix& in,
                  const GpuMatrix& filter,
-                 GpuMatrix& inG,
-                 GpuMatrix& filterG,
+                 GpuMatrix& inG,      // NOLINT
+                 GpuMatrix& filterG,  // NOLINT
                  const GpuIVector& seq) {
   const size_t numSeq = seq.getSize() - 1;
   const size_t contextLength = filter.getHeight();

From bf59d622d0c39aa982f75e7bf34e13326dfde92b Mon Sep 17 00:00:00 2001
From: Xin Pan
Date: Thu, 3 May 2018 15:21:00 +0800
Subject: [PATCH 38/52] brief survey of dist training

---
 .../dist_train/distributed_traing_review.md   | 50 +++++++++++++++++++
 1 file changed, 50 insertions(+)
 create mode 100644 doc/fluid/design/dist_train/distributed_traing_review.md

diff --git a/doc/fluid/design/dist_train/distributed_traing_review.md b/doc/fluid/design/dist_train/distributed_traing_review.md
new file mode 100644
index 0000000000..032452c615
--- /dev/null
+++ b/doc/fluid/design/dist_train/distributed_traing_review.md
@@ -0,0 +1,50 @@
+# Parallelism, Asynchronous, Synchronous, Codistillation
+
+
+[TOC]
+
+For valuable models, it’s worth using more hardware resources to reduce the training time and improve the final model quality. This doc discusses various solutions, their empirical results, and some recent research.
+
+# Model Parallelism
+In some situations, larger and more complex models can improve the model quality. Sometimes, such models cannot fit in one device. Sometimes, parts of the model can be executed in parallel to improve speed. Model Parallelism addresses these issues by partitioning a single model and placing the shards on several devices for execution.
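+
+As a minimal illustration of placing model shards on different devices, the sketch below uses the TensorFlow 1.x tf.device() API discussed in the next paragraph; the layer sizes and device names here are illustrative assumptions, not a recommended partition:
+
+```python
+import tensorflow as tf  # assumes TensorFlow 1.x graph mode
+
+x = tf.placeholder(tf.float32, shape=[None, 784])
+with tf.device('/gpu:0'):  # first shard of the model lives on GPU 0
+    hidden = tf.layers.dense(x, 1024, activation=tf.nn.relu)
+with tf.device('/gpu:1'):  # second shard lives on GPU 1
+    logits = tf.layers.dense(hidden, 10)
+```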
+
+A common way of model parallelism is to partition the logic of “gradient application” to parameter servers, while leaving the forward and backward computation at training servers.
+
+More flexible model parallelism is challenging. For example, multi-level-single-direction LSTM can be partitioned by layers, while such a solution is not helpful for bi-directional LSTM. Different models can have quite different ways of partitioning and the benefits also depend on the underlying hardware. The framework needs to provide flexible APIs for users to define customized partition schemes. For example, in TensorFlow, users can use tf.device() to specify the device placement. In MxNet, mx.AttrScope(ctx_group='dev1') does similar things. Recent research proposes to automatically find the optimal partition scheme with Reinforcement Learning, which is essentially a solution-space search algorithm that can cost a lot of extra hardware resources.
+
+# Data Parallelism
+Data Parallelism runs the same model on multiple devices, each taking in a partition of the input batch. It’s more commonly used for a few reasons. It generally applies to common SGD mini-batch training. Compared with model parallelism, which requires users to carefully partition their model and tune for good performance, data parallelism usually involves no more than calling an extra API and the speedup is more predictable.
+
+# Asynchronous Training
+In asynchronous training, it usually involves a set of trainers and a set of parameter servers. The parameter servers collectively hold a single copy of sharedsharded parameters. While the trainers each holds a unique copy of model and trains the model independently. Each trainer pulls parameters from parameter servers and sends gradients to the parameter servers independently. Similarly the parameter servers applies the gradients to parameters as soon as the gradients are received and sends parameters whenever they are requested.
+
+In theory, asynchronous training is neither safe nor stable. Each trainer is very likely using a stale copy of parameters, and stale gradients are likely to be applied to the parameters. However, in practice, especially for large-scale nonconvex optimization, it is effective [1]. Compared with the synchronous solution, which will be discussed later, asynchronous distributed training is easier to implement and scales to a few dozen workers without losing much performance due to network communication or other overhead. Besides, asynchronous training can make progress even in case of random trainer failure in the cluster.
+
+Many production models, such as [3], are trained with distributed asynchronous solutions due to their scalability and effectiveness in practice. However, asynchronous training has its limitations. Usually, it’s not as stable as synchronous training. A warm-up phase is sometimes needed. The learning rate is usually smaller compared with synchronous training and decay is also often needed. Normally, asynchronous training doesn’t scale beyond 100 trainers. In other words, when putting more trainers beyond that, the model cannot converge faster.
+
+# Synchronous Training
+Unlike asynchronous training, synchronous training requires step barriers. Parameter servers need to wait for gradients from all trainers before they are applied to parameters and trainers will always pull the latest parameters.
+
+An obvious advantage of synchronous training is that the behavior is more clearly defined. Usually, it more stable than asynchronous training.
Learning rate can be set larger and for some vision tasks, the final accuracy can be slightly higher. (In my practical experience, for some models, it can actually be worse).
+
+Synchronous training usually faces scalability and performance issues, if not carefully implemented or deployed. In [2], native synchronous training can be 20%~40% slower than asynchronous training. A common trick to avoid slowness, discussed in [1] and [2], is to have backups. N+M replicas are scheduled while only the first N are needed for the training step to proceed.
+
+Similar to asynchronous training, the benefit of synchronous training diminishes quickly. Depending on the models, increasing the number of trainers (effectively batch size) beyond a point won’t deliver faster convergence time or better final model quality.
+
+# Codistillation
+Codistillation is a technique that tries to scale the training further. A few training instances (each of which can itself be distributed) are performed during the same period. Each training instance has extra losses that come from the predictions of other training instances (like teacher and student). The training process converges faster and usually converges to a better model quality. [4]
+
+
+# Reference
+
+[1] Jeffrey Dean, Greg Corrado, Rajat Monga, Kai Chen, Matthieu Devin, Mark Mao, Andrew Senior, Paul Tucker, Ke Yang, Quoc V Le, et al. Large scale distributed deep networks.
+
+[2] Jianmin Chen, Rajat Monga, Samy Bengio, and Rafal Jozefowicz. Revisiting distributed synchronous SGD.
+
+[3] Yonghui Wu, Mike Schuster, Zhifeng Chen, Quoc V Le, Mohammad Norouzi, Wolfgang Macherey, Maxim Krikun, Yuan Cao, Qin Gao, Klaus Macherey, et al. Google’s neural machine translation system: Bridging the gap between human and machine translation.
+
+[4] Large scale distributed neural network training through online distillation.
+
+
+
+

From 24b2d14e4c5a41d1aa262f935a86593d4d8bed5e Mon Sep 17 00:00:00 2001
From: Xin Pan
Date: Thu, 3 May 2018 15:23:15 +0800
Subject: [PATCH 39/52] fix

---
 doc/fluid/design/dist_train/distributed_traing_review.md | 2 --
 1 file changed, 2 deletions(-)

diff --git a/doc/fluid/design/dist_train/distributed_traing_review.md b/doc/fluid/design/dist_train/distributed_traing_review.md
index 032452c615..a4604705a8 100644
--- a/doc/fluid/design/dist_train/distributed_traing_review.md
+++ b/doc/fluid/design/dist_train/distributed_traing_review.md
@@ -1,8 +1,6 @@
 # Parallelism, Asynchronous, Synchronous, Codistillation

-[TOC]
-
 For valuable models, it’s worth using more hardware resources to reduce the training time and improve the final model quality. This doc discusses various solutions, their empirical results, and some recent research.

 # Model Parallelism

From 0595f23ec3af975cfa8a911b9bc37d6c6b232dba Mon Sep 17 00:00:00 2001
From: Shan Yi <35982308+shanyi15@users.noreply.github.com>
Date: Thu, 3 May 2018 15:57:34 +0800
Subject: [PATCH 40/52] fix dead links in README.md (#10359)

* update README.md
* change to .html
* use v2 api
---
 README.md | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/README.md b/README.md
index d06375a444..a3b13fe79c 100644
--- a/README.md
+++ b/README.md
@@ -75,19 +75,19 @@ We provide [English](http://www.paddlepaddle.org/docs/develop/documentation/en/g
   You might want to start from this online interactive book that can run in a Jupyter Notebook.
-- [Distributed Training](http://www.paddlepaddle.org/docs/develop/documentation/en/howto/usage/cluster/cluster_train_en.html) +- [Distributed Training](http://www.paddlepaddle.org/docs/develop/documentation/en/howto/cluster/index_en.html) You can run distributed training jobs on MPI clusters. -- [Distributed Training on Kubernetes](http://www.paddlepaddle.org/docs/develop/documentation/en/howto/usage/cluster/k8s_en.html) +- [Distributed Training on Kubernetes](http://www.paddlepaddle.org/docs/develop/documentation/en/howto/cluster/multi_cluster/k8s_en.html) You can also run distributed training jobs on Kubernetes clusters. -- [Python API](http://www.paddlepaddle.org/docs/develop/documentation/en/api/index_en.html) +- [Python API](http://www.paddlepaddle.org/docs/develop/api/en/overview.html) Our new API enables much shorter programs. -- [How to Contribute](http://www.paddlepaddle.org/docs/develop/documentation/en/howto/dev/contribute_to_paddle_en.html) +- [How to Contribute](http://www.paddlepaddle.org/docs/develop/documentation/fluid/en/dev/contribute_to_paddle_en.html) We appreciate your contributions! From 41452582962a7ad57945c8d5b21fb9ebd95752b4 Mon Sep 17 00:00:00 2001 From: Yancey1989 Date: Thu, 3 May 2018 17:03:24 +0800 Subject: [PATCH 41/52] fix delete_ops --- python/paddle/fluid/framework.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/python/paddle/fluid/framework.py b/python/paddle/fluid/framework.py index 2cdf010926..c9a48ea838 100644 --- a/python/paddle/fluid/framework.py +++ b/python/paddle/fluid/framework.py @@ -854,11 +854,10 @@ class Block(object): try: start = list(self.ops).index(ops[0]) end = list(self.ops).index(ops[-1]) + [self.remove_op(start) for _ in xrange(end - start + 1)] except Exception, e: raise e - self.desc.remove_op(start, end + 1) - def slice_ops(self, start, end): return self.ops[start:end] From 387e2ccdbf68ac9e7f6f1a2280f9c299e1d39259 Mon Sep 17 00:00:00 2001 From: Xin Pan Date: Thu, 3 May 2018 17:05:29 +0800 Subject: [PATCH 42/52] Update distributed_traing_review.md --- doc/fluid/design/dist_train/distributed_traing_review.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/doc/fluid/design/dist_train/distributed_traing_review.md b/doc/fluid/design/dist_train/distributed_traing_review.md index a4604705a8..74066a3c2b 100644 --- a/doc/fluid/design/dist_train/distributed_traing_review.md +++ b/doc/fluid/design/dist_train/distributed_traing_review.md @@ -14,7 +14,7 @@ More flexible model parallelism is challenging. For example, multi-level-single- Data Parallelism runs the same model on multiple devices, each taking in a partition of the input batch. It’s more commonly used for a few reasons. It generally applies to common SGD mini-batch training. Compared with model parallelism, which requires users to carefully partition their model and tune for good performance, data parallelism usually involves no more than calling an extra API and speed up is more predictable. # Asynchronous Training -In asynchronous training, it usually involves a set of trainers and a set of parameter servers. The parameter servers collectively hold a single copy of sharedsharded parameters. While the trainers each holds a unique copy of model and trains the model independently. Each trainer pulls parameters from parameter servers and sends gradients to the parameter servers independently. 
Similarly the parameter servers applies the gradients to parameters as soon as the gradients are received and sends parameters whenever they are requested. +In asynchronous training, it usually involves a set of trainers and a set of parameter servers. The parameter servers collectively hold a single copy of shared parameters. While the trainers each holds a unique copy of model and trains the model independently. Each trainer pulls parameters from parameter servers and sends gradients to the parameter servers independently. Similarly the parameter servers applies the gradients to parameters as soon as the gradients are received and sends parameters whenever they are requested. In theory, asynchronous training is not safe and unstable. Each trainer is very likely using stale copy of parameters and parameters are also likely to apply stale gradients. However, in practice, especially for large-scale nonconvex optimization, it is effective [1]. Compared with synchronous solution, which will be discussed later, asynchronous distributed training is easier to implement and scales to a few dozen workers without losing much performance due to network communication or other overhead. Besides, asynchronous training can make progress even in case of random trainer failure in the cluster. @@ -23,7 +23,7 @@ Many production models, such as [3], are trained with distributed asynchronous s # Synchronous Training Unlike asynchronous training, synchronous training requires step barriers. Parameter servers needs to wait for gradients from all trainers before they are applied to parameters and trainers will always pull the latest parameters. -An obvious advantage of synchronous training is that the behavior is more clearly defined. Usually, it more stable than asynchronous training. Learning rate can be set larger and for some vision tasks, the final accuracy can be slightly higher. (In my practical experience, for some models, it can actually be worse). +An obvious advantage of synchronous training is that the behavior is more clearly defined. Usually, it's more stable than asynchronous training. Learning rate can be set larger and for some vision tasks, the final accuracy can be slightly higher. (In my practical experience, for some models, it can actually be worse). Synchronous training usually faces scalability and performance issues, if not carefully implemented or deployed. In [2], native synchronous training can be 20%~40% slower than asynchronous training. A common trick to avoid slowness, discussed in [1] and [2], is to have backups. N+M replicas are scheduled while only the first N is needed for the training step the proceed. 
From ea522dabc9b20497945d157ce61844d5faadf3aa Mon Sep 17 00:00:00 2001 From: Yancey1989 Date: Thu, 3 May 2018 17:36:30 +0800 Subject: [PATCH 43/52] refine delete ops --- python/paddle/fluid/distribute_transpiler.py | 15 +++++++++++---- python/paddle/fluid/framework.py | 10 ---------- 2 files changed, 11 insertions(+), 14 deletions(-) diff --git a/python/paddle/fluid/distribute_transpiler.py b/python/paddle/fluid/distribute_transpiler.py index c180e7b210..ee17b11c8b 100644 --- a/python/paddle/fluid/distribute_transpiler.py +++ b/python/paddle/fluid/distribute_transpiler.py @@ -317,8 +317,7 @@ class DistributeTranspiler: def get_trainer_program(self): # remove optimize ops and add a send op to main_program - self.origin_program.global_block().delete_ops(self.optimize_ops) - self.origin_program.sync_with_cpp() + self.delete_ops(self.origin_program.global_block(), self.optimize_ops) # FIXME(typhoonzero): serialize once will fix error occurs when clone. self.origin_program.__str__() return self.origin_program @@ -602,8 +601,7 @@ class DistributeTranspiler: attrs={"axis": 0}) # delete lookup_table_op - program.global_block().delete_ops([op]) - program.sync_with_cpp() + self.delete_ops(program.global_block(), [op]) # break for loop break @@ -1166,3 +1164,12 @@ class DistributeTranspiler: in_name.startswith("beta2_pow_acc"): return True return False + + def delete_ops(self, block, ops): + try: + start = list(block.ops).index(ops[0]) + end = list(block.ops).index(ops[-1]) + [block.remove_op(start) for _ in xrange(end - start + 1)] + except Exception, e: + raise e + block.program.sync_with_cpp() diff --git a/python/paddle/fluid/framework.py b/python/paddle/fluid/framework.py index c9a48ea838..ce9b880aeb 100644 --- a/python/paddle/fluid/framework.py +++ b/python/paddle/fluid/framework.py @@ -848,16 +848,6 @@ class Block(object): self.desc.remove_op(index, index + 1) del self.ops[index] - def delete_ops(self, ops): - # remove from cpp - # FIXME(typhoonzero): remove only the first occurrence. 
- try: - start = list(self.ops).index(ops[0]) - end = list(self.ops).index(ops[-1]) - [self.remove_op(start) for _ in xrange(end - start + 1)] - except Exception, e: - raise e - def slice_ops(self, start, end): return self.ops[start:end] From beb1245560b26fd198c3bdd7063334ad933f2d89 Mon Sep 17 00:00:00 2001 From: Luo Tao Date: Thu, 3 May 2018 20:43:14 +0800 Subject: [PATCH 44/52] add relu converter and unit-test --- .../fluid/inference/tensorrt/CMakeLists.txt | 1 + .../inference/tensorrt/convert/CMakeLists.txt | 5 +- .../tensorrt/convert/activation_op.cc | 40 ++++++++ .../inference/tensorrt/convert/op_converter.h | 40 ++++---- .../tensorrt/convert/test_activation_op.cc | 94 +++++++++++++++++++ .../tensorrt/convert/test_op_converter.cc | 4 +- paddle/fluid/inference/tensorrt/engine.cc | 30 +++++- paddle/fluid/inference/tensorrt/engine.h | 8 ++ .../fluid/inference/tensorrt/test_engine.cc | 1 - 9 files changed, 197 insertions(+), 26 deletions(-) create mode 100644 paddle/fluid/inference/tensorrt/convert/activation_op.cc create mode 100644 paddle/fluid/inference/tensorrt/convert/test_activation_op.cc diff --git a/paddle/fluid/inference/tensorrt/CMakeLists.txt b/paddle/fluid/inference/tensorrt/CMakeLists.txt index 8dd95293e7..288789d6e4 100644 --- a/paddle/fluid/inference/tensorrt/CMakeLists.txt +++ b/paddle/fluid/inference/tensorrt/CMakeLists.txt @@ -1,3 +1,4 @@ nv_test(test_tensorrt SRCS test_tensorrt.cc DEPS dynload_cuda device_context dynamic_loader) nv_test(test_tensorrt_engine SRCS test_engine.cc engine.cc DEPS dynload_cuda) +set(ENGINE_FILE ${CMAKE_CURRENT_SOURCE_DIR}/engine.cc) add_subdirectory(convert) diff --git a/paddle/fluid/inference/tensorrt/convert/CMakeLists.txt b/paddle/fluid/inference/tensorrt/convert/CMakeLists.txt index 19fffa71cc..572e29515f 100644 --- a/paddle/fluid/inference/tensorrt/convert/CMakeLists.txt +++ b/paddle/fluid/inference/tensorrt/convert/CMakeLists.txt @@ -1,2 +1,3 @@ -file(GLOB TENSORRT_OPS RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}" "*_op.cc") -nv_test(test_tensorrt_op_converter SRCS test_op_converter.cc ${TENSORRT_OPS} DEPS ${FLUID_CORE_MODULES}) +nv_test(test_tensorrt_op_converter SRCS test_op_converter.cc mul_op.cc conv2d_op.cc DEPS ${FLUID_CORE_MODULES}) +nv_test(test_tensorrt_activation_op SRCS test_activation_op.cc ${ENGINE_FILE} activation_op.cc + DEPS ${FLUID_CORE_MODULES} activation_op) diff --git a/paddle/fluid/inference/tensorrt/convert/activation_op.cc b/paddle/fluid/inference/tensorrt/convert/activation_op.cc new file mode 100644 index 0000000000..543784289c --- /dev/null +++ b/paddle/fluid/inference/tensorrt/convert/activation_op.cc @@ -0,0 +1,40 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/
+
+#include "paddle/fluid/inference/tensorrt/convert/op_converter.h"
+
+namespace paddle {
+namespace inference {
+namespace tensorrt {
+
+class ReluOpConverter : public OpConverter {
+ public:
+  ReluOpConverter() {}
+  void operator()(const framework::OpDesc& op) override {
+    LOG(INFO) << "convert a fluid relu op to tensorrt activation layer whose "
+                 "type is Relu";
+    const nvinfer1::ITensor* input_tensor =
+        engine_->GetITensor(op.Input("X")[0]);
+    nvinfer1::IActivationLayer* layer = TRT_ENGINE_ADD_LAYER(
+        engine_, Activation, *const_cast<nvinfer1::ITensor*>(input_tensor),
+        nvinfer1::ActivationType::kRELU);
+    engine_->SetITensor(op.Output("Out")[0], layer->getOutput(0));
+  }
+};
+
+REGISTER_TRT_OP_CONVERTER(relu, ReluOpConverter);
+
+}  // namespace tensorrt
+}  // namespace inference
+}  // namespace paddle
diff --git a/paddle/fluid/inference/tensorrt/convert/op_converter.h b/paddle/fluid/inference/tensorrt/convert/op_converter.h
index 22a4812ce7..f8ca219bb8 100644
--- a/paddle/fluid/inference/tensorrt/convert/op_converter.h
+++ b/paddle/fluid/inference/tensorrt/convert/op_converter.h
@@ -30,13 +30,14 @@ namespace tensorrt {
 class OpConverter {
  public:
   OpConverter() {}
-  virtual void operator()(const framework::OpDesc& op) {}
-  void Execute(const framework::OpDesc& op) {
+
+  void Execute(const framework::OpDesc& op, TensorRTEngine* engine) {
     std::string type = op.Type();
     auto it = converters_.find(type);
     PADDLE_ENFORCE(it != converters_.end(), "no OpConverter for optype [%s]",
                    type);
+    it->second->SetEngine(engine);
     (*it->second)(op);
   }
@@ -50,18 +51,31 @@ class OpConverter {
     converters_[key] = new T;
   }
 
+  // convert fluid op to tensorrt layer
+  void ConvertOp(const framework::OpDesc& op, TensorRTEngine* engine) {
+    OpConverter::Global().Execute(op, engine);
+  }
+
+  // convert fluid block to tensorrt network
+  void ConvertBlock(const framework::BlockDesc& block, TensorRTEngine* engine) {
+    for (auto op : block.AllOps()) {
+      OpConverter::Global().Execute(*op, engine);
+    }
+  }
+
+  void SetEngine(TensorRTEngine* engine) { engine_ = engine; }
+
   virtual ~OpConverter() {}
 
+  // TensorRT engine
+  TensorRTEngine* engine_{nullptr};
+
  private:
   // registered op converter map, whose key is the fluid op type, and value is
   // the pointer position of corresponding OpConverter class.
   std::unordered_map<std::string, OpConverter*> converters_;
-  // fluid inference scope
-  framework::Scope* scope_;
-  // tensorrt input/output tensor map, whose key is the fluid variable name,
-  // and value is the pointer position of tensorrt tensor
-  std::unordered_map<std::string, nvinfer1::ITensor*> tr_tensors_;
+  framework::Scope* scope_{nullptr};
 };
 
 #define REGISTER_TRT_OP_CONVERTER(op_type__, Converter__) \
@@ -72,18 +86,6 @@ class OpConverter {
   };                                                      \
   trt_##op_type__##_converter trt_##op_type__##_converter__;
 
-class BlockConverter {
- public:
-  BlockConverter() {}
-
-  // convert fluid block to tensorrt network
-  void ConvertBlock(const framework::BlockDesc& block) {
-    for (auto op : block.AllOps()) {
-      OpConverter::Global().Execute(*op);
-    }
-  }
-};
-
 }  // namespace tensorrt
 }  // namespace inference
 }  // namespace paddle
diff --git a/paddle/fluid/inference/tensorrt/convert/test_activation_op.cc b/paddle/fluid/inference/tensorrt/convert/test_activation_op.cc
new file mode 100644
index 0000000000..0f390bee1f
--- /dev/null
+++ b/paddle/fluid/inference/tensorrt/convert/test_activation_op.cc
@@ -0,0 +1,94 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+   http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include <gtest/gtest.h>
+#include "paddle/fluid/framework/lod_tensor.h"
+#include "paddle/fluid/framework/op_registry.h"
+#include "paddle/fluid/framework/program_desc.h"
+#include "paddle/fluid/inference/tensorrt/convert/op_converter.h"
+#include "paddle/fluid/platform/device_context.h"
+#include "paddle/fluid/platform/place.h"
+
+USE_OP(relu);
+
+namespace paddle {
+namespace inference {
+namespace tensorrt {
+
+void compare(float input, float expect) {
+  framework::Scope scope;
+  platform::CUDAPlace place;
+  platform::CUDADeviceContext ctx(place);
+
+  // init fluid op and variable
+  auto x_var = scope.Var("X");
+  auto x_tensor = x_var->GetMutable<framework::LoDTensor>();
+  x_tensor->Resize({1, 1});
+  std::vector<float> init;
+  init.push_back(input);
+  framework::TensorFromVector(init, ctx, x_tensor);
+
+  auto out_var = scope.Var("Out");
+  auto out_tensor = out_var->GetMutable<framework::LoDTensor>();
+  out_tensor->Resize({1, 1});
+  out_tensor->mutable_data<float>(place);
+
+  framework::OpDesc op_desc;
+  op_desc.SetType("relu");
+  op_desc.SetInput("X", {"X"});
+  op_desc.SetOutput("Out", {"Out"});
+
+  auto relu_op = framework::OpRegistry::CreateOp(op_desc);
+
+  // run fluid op
+  relu_op->Run(scope, place);
+  std::vector<float> out1;
+  framework::TensorToVector(*out_tensor, ctx, &out1);
+
+  // init tensorrt op
+  cudaStream_t stream;
+  ASSERT_EQ(0, cudaStreamCreate(&stream));
+  TensorRTEngine* engine = new TensorRTEngine(1, 1 << 10, &stream);
+  engine->InitNetwork();
+  engine->DeclareInput("X", nvinfer1::DataType::kFLOAT,
+                       nvinfer1::DimsCHW{1, 1, 1});
+
+  OpConverter op_converter;
+  op_converter.ConvertOp(op_desc, engine);
+
+  engine->DeclareOutput("Out");
+  engine->FreezeNetwork();
+  engine->SetInputFromCPU("X", &input, 1 * sizeof(float));
+
+  // run tensorrt op
+  engine->Execute(1);
+
+  float out2;
+  engine->GetOutputInCPU("Out", &out2, 1 * sizeof(float));
+
+  ASSERT_EQ(out1[0], out2);
+  ASSERT_EQ(out1[0], expect);
+
+  delete engine;
+  cudaStreamDestroy(stream);
+}
+
+TEST(OpConverter, ConvertRelu) {
+  compare(1, 1);   // relu(1) = 1
+  compare(-5, 0);  // relu(-5) = 0
+}
+
+}  // namespace tensorrt
+}  // namespace inference
+}  // namespace paddle
diff --git a/paddle/fluid/inference/tensorrt/convert/test_op_converter.cc b/paddle/fluid/inference/tensorrt/convert/test_op_converter.cc
index 43be2af68a..5c5ac10394 100644
--- a/paddle/fluid/inference/tensorrt/convert/test_op_converter.cc
+++ b/paddle/fluid/inference/tensorrt/convert/test_op_converter.cc
@@ -28,8 +28,8 @@ TEST(BlockConverter, ConvertBlock) {
   auto* conv2d_op = block->AppendOp();
   conv2d_op->SetType("conv2d");
 
-  BlockConverter converter;
-  converter.ConvertBlock(*block);
+  OpConverter converter;
+  converter.ConvertBlock(*block, nullptr /*TensorRTEngine*/);
 }
 
 }  // namespace tensorrt
diff --git a/paddle/fluid/inference/tensorrt/engine.cc b/paddle/fluid/inference/tensorrt/engine.cc
index 03a25f8e8b..df123a5907 100644
--- a/paddle/fluid/inference/tensorrt/engine.cc
+++ b/paddle/fluid/inference/tensorrt/engine.cc
@@ -80,8 +80,8 @@ nvinfer1::ITensor* TensorRTEngine::DeclareInput(const
std::string& name,
   PADDLE_ENFORCE(infer_network_ != nullptr, "should initnetwork first");
   auto* input = infer_network_->addInput(name.c_str(), dtype, dim);
   PADDLE_ENFORCE(input, "infer network add input %s failed", name);
-  buffer_sizes_[name] = kDataTypeSize[static_cast<int>(dtype)] * AccumDims(dim);
+  TensorRTEngine::SetITensor(name, input);
   return input;
 }
@@ -99,6 +99,19 @@ void TensorRTEngine::DeclareOutput(const nvinfer1::ILayer* layer, int offset,
   buffer_sizes_[name] = 0;
 }
 
+void TensorRTEngine::DeclareOutput(const std::string& name) {
+  PADDLE_ENFORCE_EQ(0, buffer_sizes_.count(name), "duplicate output name %s",
+                    name);
+
+  auto* output = TensorRTEngine::GetITensor(name);
+  PADDLE_ENFORCE(output != nullptr);
+  output->setName(name.c_str());
+  infer_network_->markOutput(*output);
+  // output buffers' size can only be decided later, set zero here to mark this
+  // and will reset later.
+  buffer_sizes_[name] = 0;
+}
+
 void* TensorRTEngine::GetOutputInGPU(const std::string& name) {
   return buffer(name);
 }
@@ -110,7 +123,6 @@ void TensorRTEngine::GetOutputInCPU(const std::string& name, void* dst,
   PADDLE_ENFORCE(it != buffer_sizes_.end());
   PADDLE_ENFORCE_GT(it->second, 0);
   PADDLE_ENFORCE_GE(max_size, it->second);
-
   PADDLE_ENFORCE_EQ(0, cudaMemcpyAsync(dst, buffer(name), it->second,
                                        cudaMemcpyDeviceToHost, *stream_));
 }
@@ -126,10 +138,24 @@ void*& TensorRTEngine::buffer(const std::string& name) {
 void TensorRTEngine::SetInputFromCPU(const std::string& name, void* data,
                                      size_t size) {
   void* buf = buffer(name);
+  cudaMemcpyAsync(buf, data, size, cudaMemcpyHostToDevice, *stream_);
   PADDLE_ENFORCE_EQ(
       0, cudaMemcpyAsync(buf, data, size, cudaMemcpyHostToDevice, *stream_));
 }
 
+void TensorRTEngine::SetITensor(const std::string& name,
+                                nvinfer1::ITensor* tensor) {
+  PADDLE_ENFORCE(tensor != nullptr);
+  PADDLE_ENFORCE_EQ(0, itensor_map_.count(name), "duplicate itensor name %s",
+                    name);
+  itensor_map_[name] = tensor;
+}
+
+nvinfer1::ITensor* TensorRTEngine::GetITensor(const std::string& name) {
+  PADDLE_ENFORCE(itensor_map_.count(name), "no itensor %s", name);
+  return itensor_map_[name];
+}
+
 }  // namespace tensorrt
 }  // namespace inference
 }  // namespace paddle
diff --git a/paddle/fluid/inference/tensorrt/engine.h b/paddle/fluid/inference/tensorrt/engine.h
index 82d8c3df4e..eeb807ab59 100644
--- a/paddle/fluid/inference/tensorrt/engine.h
+++ b/paddle/fluid/inference/tensorrt/engine.h
@@ -80,6 +80,8 @@ class TensorRTEngine : public EngineBase {
   // name.
   void DeclareOutput(const nvinfer1::ILayer* layer, int offset,
                      const std::string& name);
+  // Set the itensor_map_[name] as the network's output, and set its name.
+  void DeclareOutput(const std::string& name);
 
   // GPU memory address for an ITensor with specific name. One can operate on
   // these memory directly for acceleration, for example, output the converted
@@ -98,6 +100,10 @@ class TensorRTEngine : public EngineBase {
   // LOW EFFICENCY! Get output to CPU, this will trigger a memory copy from GPU
   // to CPU.
   void GetOutputInCPU(const std::string& name, void* dst, size_t max_size);
+  // Fill an ITensor into map itensor_map_.
+  void SetITensor(const std::string& name, nvinfer1::ITensor* tensor);
+  // Get an ITensor called name.
+  nvinfer1::ITensor* GetITensor(const std::string& name);
 
   nvinfer1::ICudaEngine* engine() { return infer_engine_.get(); }
   nvinfer1::INetworkDefinition* network() { return infer_network_.get(); }
@@ -113,6 +119,8 @@ class TensorRTEngine : public EngineBase {
   std::vector<void*> buffers_;
   // max data size for the buffers.
std::unordered_map<std::string, size_t> buffer_sizes_;
+  std::unordered_map<std::string, nvinfer1::ITensor*>
+      itensor_map_;
 
   // TensorRT related internal members
   template
diff --git a/paddle/fluid/inference/tensorrt/test_engine.cc b/paddle/fluid/inference/tensorrt/test_engine.cc
index c6e1c71cdc..a08b78f930 100644
--- a/paddle/fluid/inference/tensorrt/test_engine.cc
+++ b/paddle/fluid/inference/tensorrt/test_engine.cc
@@ -70,7 +70,6 @@ TEST_F(TensorRTEngineTest, add_layer) {
   engine_->Execute(1);
 
   LOG(INFO) << "to get output";
-  // void* y_v =
   float y_cpu;
   engine_->GetOutputInCPU("y", &y_cpu, sizeof(float));
 

From f428e82d252465ce0f904a6ce257f5e3f271792f Mon Sep 17 00:00:00 2001
From: Abhinav Arora 
Date: Thu, 3 May 2018 09:51:13 -0700
Subject: [PATCH 45/52] Prediction should be a part of inference_network in new API (#10356)

---
 .../notest_image_classification_resnet.py | 7 +++----
 .../notest_image_classification_vgg.py    | 7 +++----
 2 files changed, 6 insertions(+), 8 deletions(-)

diff --git a/python/paddle/fluid/tests/book/image_classification/notest_image_classification_resnet.py b/python/paddle/fluid/tests/book/image_classification/notest_image_classification_resnet.py
index 5cbfdef91a..17db38797c 100644
--- a/python/paddle/fluid/tests/book/image_classification/notest_image_classification_resnet.py
+++ b/python/paddle/fluid/tests/book/image_classification/notest_image_classification_resnet.py
@@ -64,15 +64,14 @@ def resnet_cifar10(input, depth=32):
     res3 = layer_warp(basicblock, res2, 32, 64, n, 2)
     pool = fluid.layers.pool2d(
         input=res3, pool_size=8, pool_type='avg', pool_stride=1)
-    return pool
+    predict = fluid.layers.fc(input=pool, size=10, act='softmax')
+    return predict
 
 
 def inference_network():
-    classdim = 10
     data_shape = [3, 32, 32]
     images = fluid.layers.data(name='pixel', shape=data_shape, dtype='float32')
-    net = resnet_cifar10(images, 32)
-    predict = fluid.layers.fc(input=net, size=classdim, act='softmax')
+    predict = resnet_cifar10(images, 32)
     return predict
 
 
diff --git a/python/paddle/fluid/tests/book/image_classification/notest_image_classification_vgg.py b/python/paddle/fluid/tests/book/image_classification/notest_image_classification_vgg.py
index 8a6a5ff61a..e83afeed2f 100644
--- a/python/paddle/fluid/tests/book/image_classification/notest_image_classification_vgg.py
+++ b/python/paddle/fluid/tests/book/image_classification/notest_image_classification_vgg.py
@@ -43,15 +43,14 @@ def vgg16_bn_drop(input):
     bn = fluid.layers.batch_norm(input=fc1, act='relu')
     drop2 = fluid.layers.dropout(x=bn, dropout_prob=0.5)
     fc2 = fluid.layers.fc(input=drop2, size=4096, act=None)
-    return fc2
+    predict = fluid.layers.fc(input=fc2, size=10, act='softmax')
+    return predict
 
 
 def inference_network():
-    classdim = 10
     data_shape = [3, 32, 32]
     images = fluid.layers.data(name='pixel', shape=data_shape, dtype='float32')
-    net = vgg16_bn_drop(images)
-    predict = fluid.layers.fc(input=net, size=classdim, act='softmax')
+    predict = vgg16_bn_drop(images)
    return predict
 

From 7a860694225507286485ba13e96ae6fd4fcf2622 Mon Sep 17 00:00:00 2001
From: Kexin Zhao 
Date: Thu, 3 May 2018 11:45:38 -0700
Subject: [PATCH 46/52] Add float16 demo code and put float16 work in contrib/float16 folder (#10331)

* add test float16 inference accuracy example

* complete the test

* clean code

* add argument parse and refine tests

* add shell script

* add float16 benchmark code

* refine code

* prepare for contrib/float16

* put things in contrib float16 folder

* update benchmark result

* further update benchmark report

* add float16 inference report

* update report
---
 contrib/float16/.gitignore                   |   1 +
 contrib/float16/float16_benchmark.md         |  97 +++++
 contrib/float16/float16_inference_demo.py    | 362 ++++++++++++++++++
 contrib/float16/float16_inference_report.md  | 163 ++++++++
 contrib/float16/float16_transpiler.py        | 256 +++++++++++++
 contrib/float16/run_float16_demo.sh          | 117 ++++++
 .../test_inference_image_classification.cc   |  49 ++-
 python/paddle/fluid/inference_transpiler.py  | 208 +---------
 .../tests/book/test_image_classification.py  |  20 -
 9 files changed, 1030 insertions(+), 243 deletions(-)
 create mode 100644 contrib/float16/.gitignore
 create mode 100644 contrib/float16/float16_benchmark.md
 create mode 100644 contrib/float16/float16_inference_demo.py
 create mode 100644 contrib/float16/float16_inference_report.md
 create mode 100644 contrib/float16/float16_transpiler.py
 create mode 100755 contrib/float16/run_float16_demo.sh

diff --git a/contrib/float16/.gitignore b/contrib/float16/.gitignore
new file mode 100644
index 0000000000..dd28d354f4
--- /dev/null
+++ b/contrib/float16/.gitignore
@@ -0,0 +1 @@
+*.inference.model
diff --git a/contrib/float16/float16_benchmark.md b/contrib/float16/float16_benchmark.md
new file mode 100644
index 0000000000..b51d6bde92
--- /dev/null
+++ b/contrib/float16/float16_benchmark.md
@@ -0,0 +1,97 @@
+# float16 benchmark
+
+## Description
+We want to compare the inference performance of float16 vs. float32 on the "image_classification" example on an Nvidia Tesla V100 GPU, where we can enable tensor core computation for float16 mode. We test Vgg16 and Resnet50 on the imagenet data set, and Vgg16 and Resnet32 on the cifar10 data set. For completeness, we also add the inference benchmark of Vgg16 and Resnet50 on the imagenet data set tested on an Nvidia GeForce GTX 1080 Ti GPU.
+
+For more details about tensor cores, please refer to https://devblogs.nvidia.com/programming-tensor-cores-cuda-9/
+
+## Test environment
+- GPU: single Nvidia Tesla V100 or single Nvidia GeForce GTX 1080 Ti
+- CUDNN: 7.1.1
+- CUDA: 9.0
+- Code: https://github.com/PaddlePaddle/Paddle/pull/10331 (Tensor core is enabled in float16 mode)
+
+## Benchmark on V100
+All times are in ms (millisecond) averaged over 1000 iterations tested on a single Nvidia V100 GPU with respect to different mini-batch (mb) sizes.
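+
+The numbers below were collected with the demo code referenced above. For orientation, a minimal sketch of how such per-batch timings can be taken is shown here; it is illustrative only and not part of the benchmark scripts, and `exe`, `inference_program`, `feed_target_names`, `tensor_img`, and `fetch_targets` are assumed to be set up as in the demo:
+
+```python
+import time
+
+def time_inference(exe, inference_program, feed_target_names, tensor_img,
+                   fetch_targets, iters=1000):
+    # Warm up once so one-time setup cost is not counted.
+    exe.run(inference_program,
+            feed={feed_target_names[0]: tensor_img},
+            fetch_list=fetch_targets)
+    start = time.time()
+    for _ in range(iters):
+        exe.run(inference_program,
+                feed={feed_target_names[0]: tensor_img},
+                fetch_list=fetch_targets)
+    return (time.time() - start) * 1000.0 / iters  # average ms per batch
+```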
+
+### Vgg16 on imagenet (flowers data set: image.shape = [3, 224, 224]):
+
+Total inference time for one batch:
+
+|       | mb=1  | mb=2 | mb=4  | mb=8  | mb=16 | mb=32 | mb=64  |
+|-------|------:|-----:|------:|------:|------:|------:|-------:|
+|float32| 14.01 | 9.70 | 22.99 | 28.26 | 53.87 | 84.42 | 178.95 |
+|float16| 3.32  | 4.11 | 5.88  | 9.41  | 16.54 | 30.47 | 60.23  |
+|Speedup| 4.22  | 2.36 | 3.91  | 3.00  | 3.26  | 2.77  | 2.97   |
+
+Total time spent on conv op for one batch:
+
+|       | mb=1  | mb=2 | mb=4  | mb=8  | mb=16 | mb=32 | mb=64  |
+|-------|------:|-----:|------:|------:|------:|------:|-------:|
+|float32| 11.95 | 6.96 | 18.65 | 21.42 | 41.35 | 60.58 | 130.11 |
+|float16| 1.78  | 2.10 | 2.93  | 4.55  | 7.99  | 14.63 | 28.67  |
+|Speedup| 6.71  | 3.31 | 6.37  | 4.71  | 5.18  | 4.14  | 4.54   |
+
+
+### Resnet50 on imagenet (flowers data set: image.shape = [3, 224, 224]):
+
+Total inference time for one batch:
+
+|       | mb=1 | mb=2 | mb=4 | mb=8  | mb=16 | mb=32 | mb=64 | mb=128 |
+|-------|-----:|-----:|-----:|------:|------:|------:|------:|-------:|
+|float32| 7.03 | 7.41 | 9.16 | 12.55 | 21.13 | 38.27 | 67.93 | 127.02 |
+|float16| 6.13 | 6.32 | 6.24 | 7.40  | 10.90 | 18.18 | 33.20 | 64.52  |
+|Speedup| 1.15 | 1.17 | 1.47 | 1.70  | 1.94  | 2.11  | 2.05  | 1.97   |
+
+Total time spent on conv op for one batch:
+
+|       | mb=1 | mb=2 | mb=4 | mb=8 | mb=16 | mb=32 | mb=64 | mb=128 |
+|-------|-----:|-----:|-----:|-----:|------:|------:|------:|-------:|
+|float32| 5.43 | 5.46 | 6.50 | 8.36 | 13.80 | 24.45 | 41.21 | 73.44  |
+|float16| 4.19 | 4.30 | 3.96 | 4.21 | 5.63  | 8.77  | 15.24 | 28.40  |
+|Speedup| 1.30 | 1.27 | 1.64 | 1.99 | 2.45  | 2.79  | 2.70  | 2.59   |
+
+
+### Vgg16 on cifar10 (image.shape = [3, 32, 32]):
+
+Total inference time for one batch:
+
+|       | mb=1 | mb=2 | mb=4 | mb=8 | mb=16 | mb=32 | mb=64 | mb=128 | mb=256 | mb=512 |
+|-------|-----:|-----:|-----:|-----:|------:|------:|------:|-------:|-------:|-------:|
+|float32| 3.13 | 3.17 | 3.19 | 3.58 | 3.98  | 6.23  | 8.42  | 13.44  | 24.19  | 44.97  |
+|float16| 2.72 | 2.77 | 2.76 | 2.88 | 2.96  | 3.24  | 4.01  | 5.78   | 9.65   | 17.37  |
+|Speedup| 1.15 | 1.14 | 1.16 | 1.24 | 1.34  | 1.92  | 2.10  | 2.33   | 2.51   | 2.59   |
+
+
+### Resnet32 on cifar10 (image.shape = [3, 32, 32]):
+
+Total inference time for one batch:
+
+|       | mb=1 | mb=2 | mb=4 | mb=8 | mb=16 | mb=32 | mb=64 | mb=128 | mb=256 | mb=512 |
+|-------|-----:|-----:|-----:|-----:|------:|------:|------:|-------:|-------:|-------:|
+|float32| 3.11 | 3.14 | 2.99 | 3.04 | 3.10  | 3.28  | 4.47  | 6.86   | 11.63  | 21.16  |
+|float16| 3.70 | 3.81 | 3.75 | 3.83 | 3.77  | 3.97  | 3.92  | 4.15   | 6.41   | 11.02  |
+|Speedup|      |      |      |      |       |       | 1.14  | 1.65   | 1.81   | 1.92   |
+
+
+## Benchmark on 1080 Ti
+All times are in ms (millisecond) averaged over 1000 iterations tested on a single Nvidia GeForce GTX 1080 Ti GPU with respect to different mini-batch (mb) sizes.
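+
+In all of the tables above and below, the Speedup row is simply the float32 time divided by the float16 time for the same mini-batch size; blank cells mean float16 was not faster there. For example, for Vgg16 on imagenet at mb=1 on V100, 14.01 / 3.32 ≈ 4.22.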
+ +### Vgg16 on imagenet (flowers data set: image.shape = [3, 224, 224]): +Total inference time for one batch: + +| | mb=1 | mb=2 | mb=4 | mb=8 | mb=16 | mb=32 | +|-------|-----: |-----: |-----: |-----: |------: |-------:| +|float32| 5.60 | 9.38 | 15.86 | 29.79 | 57.60 | 117.73 | +|float16| 4.99 | 7.79 | 13.47 | 26.02 | 52.30 | 102.34 | +|Speedup| 1.12 | 1.20  | 1.18 | 1.15 | 1.10  | 1.15 | + + +### Resnet50 on imagenet (flowers data set: image.shape = [3, 224, 224]): +Total inference time for one batch: + +| | mb=1 | mb=2 | mb=4 | mb=8 | mb=16 | mb=32 | mb=64 | +|-------|-----: |-----: |-----: |-----: |------: |-------:|-------:| +|float32| 5.63 | 6.23 | 8.85 | 14.71 | 26.07 | 52.86 | 108.95 | +|float16| 5.89 | 6.44 | 7.94 | 12.57 | 22.03 | 45.06 | 92.68 | +|Speedup| |  | 1.12  | 1.17 | 1.18  | 1.17 | 1.18 | diff --git a/contrib/float16/float16_inference_demo.py b/contrib/float16/float16_inference_demo.py new file mode 100644 index 0000000000..063227d5d2 --- /dev/null +++ b/contrib/float16/float16_inference_demo.py @@ -0,0 +1,362 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import print_function +from float16_transpiler import Float16Transpiler + +import argparse +import paddle +import paddle.fluid as fluid +import contextlib +import math +import sys +import numpy as np +import os + +parser = argparse.ArgumentParser( + 'Float16 inference accuracy test and benchmark.') +parser.add_argument( + '--train_batch_size', type=int, default=16, help="Batch size for training.") +parser.add_argument( + '--inf_batch_size', type=int, default=32, help="Batch size for inference.") +parser.add_argument( + '--repeat', type=int, default=1, help="How many times to run the test.") +parser.add_argument( + '--data_set', + type=str, + default='cifar10', + choices=['cifar10', 'imagenet'], + help="Optional dataset for benchmark.") +parser.add_argument( + '--model', + type=str, + default='vgg', + choices=['vgg', 'resnet'], + help="Optional model for benchmark.") +parser.add_argument( + '--threshold', + type=float, + default=0.005, + help='Save inference model when test accuracy reach this threshold.') +parser.add_argument('--learning_rate', type=float, default=0.001) +args = parser.parse_args() + + +def conv_bn_layer(input, ch_out, filter_size, stride, padding, act='relu'): + conv1 = fluid.layers.conv2d( + input=input, + filter_size=filter_size, + num_filters=ch_out, + stride=stride, + padding=padding, + act=None, + bias_attr=False) + return fluid.layers.batch_norm(input=conv1, act=act) + + +def shortcut(input, ch_out, stride): + ch_in = input.shape[1] + if ch_in != ch_out: + return conv_bn_layer(input, ch_out, 1, stride, 0, None) + else: + return input + + +def basicblock(input, ch_out, stride): + short = shortcut(input, ch_out, stride) + conv1 = conv_bn_layer(input, ch_out, 3, stride, 1) + conv2 = conv_bn_layer(conv1, ch_out, 3, 1, 1, act=None) + return fluid.layers.elementwise_add(x=short, y=conv2, act='relu') + + +def 
bottleneck(input, ch_out, stride):
+    short = shortcut(input, ch_out * 4, stride)
+    conv1 = conv_bn_layer(input, ch_out, 1, stride, 0)
+    conv2 = conv_bn_layer(conv1, ch_out, 3, 1, 1)
+    conv3 = conv_bn_layer(conv2, ch_out * 4, 1, 1, 0, act=None)
+    return fluid.layers.elementwise_add(x=short, y=conv3, act='relu')
+
+
+def layer_warp(block_func, input, ch_out, count, stride):
+    res_out = block_func(input, ch_out, stride)
+    for i in range(1, count):
+        res_out = block_func(res_out, ch_out, 1)
+    return res_out
+
+
+def resnet_imagenet(input, depth=50):
+    cfg = {
+        18: ([2, 2, 2, 1], basicblock),
+        34: ([3, 4, 6, 3], basicblock),
+        50: ([3, 4, 6, 3], bottleneck),
+        101: ([3, 4, 23, 3], bottleneck),
+        152: ([3, 8, 36, 3], bottleneck)
+    }
+    stages, block_func = cfg[depth]
+    conv1 = conv_bn_layer(input, ch_out=64, filter_size=7, stride=2, padding=3)
+    pool1 = fluid.layers.pool2d(
+        input=conv1, pool_type='avg', pool_size=3, pool_stride=2)
+    res1 = layer_warp(block_func, pool1, 64, stages[0], 1)
+    res2 = layer_warp(block_func, res1, 128, stages[1], 2)
+    res3 = layer_warp(block_func, res2, 256, stages[2], 2)
+    res4 = layer_warp(block_func, res3, 512, stages[3], 2)
+    pool2 = fluid.layers.pool2d(
+        input=res4,
+        pool_size=7,
+        pool_type='avg',
+        pool_stride=1,
+        global_pooling=True)
+    return pool2
+
+
+def resnet_cifar10(input, depth=32):
+    assert (depth - 2) % 6 == 0
+
+    n = (depth - 2) // 6
+
+    conv1 = conv_bn_layer(
+        input=input, ch_out=16, filter_size=3, stride=1, padding=1)
+    res1 = layer_warp(basicblock, conv1, 16, n, 1)
+    res2 = layer_warp(basicblock, res1, 32, n, 2)
+    res3 = layer_warp(basicblock, res2, 64, n, 2)
+    pool = fluid.layers.pool2d(
+        input=res3, pool_size=8, pool_type='avg', pool_stride=1)
+    return pool
+
+
+def vgg16(input):
+    def conv_block(input, num_filter, groups, dropouts):
+        return fluid.nets.img_conv_group(
+            input=input,
+            pool_size=2,
+            pool_stride=2,
+            conv_num_filter=[num_filter] * groups,
+            conv_filter_size=3,
+            conv_act='relu',
+            conv_with_batchnorm=True,
+            conv_batchnorm_drop_rate=dropouts,
+            pool_type='max')
+
+    conv1 = conv_block(input, 64, 2, [0.3, 0])
+    conv2 = conv_block(conv1, 128, 2, [0.4, 0])
+    conv3 = conv_block(conv2, 256, 3, [0.4, 0.4, 0])
+    conv4 = conv_block(conv3, 512, 3, [0.4, 0.4, 0])
+    conv5 = conv_block(conv4, 512, 3, [0.4, 0.4, 0])
+
+    drop = fluid.layers.dropout(x=conv5, dropout_prob=0.5)
+    fc1 = fluid.layers.fc(input=drop, size=4096, act=None)
+    bn = fluid.layers.batch_norm(input=fc1, act='relu')
+    drop2 = fluid.layers.dropout(x=bn, dropout_prob=0.5)
+    fc2 = fluid.layers.fc(input=drop2, size=4096, act=None)
+    return fc2
+
+
+def train(place, save_dirname):
+    if args.data_set == "cifar10":
+        class_dim = 10
+        data_shape = [3, 32, 32]
+    elif args.data_set == "imagenet":
+        class_dim = 102
+        data_shape = [3, 224, 224]
+    else:
+        raise ValueError("%s dataset is not supported" % args.data_set)
+
+    images = fluid.layers.data(name='pixel', shape=data_shape, dtype='float32')
+    label = fluid.layers.data(name='label', shape=[1], dtype='int64')
+
+    if args.model == "vgg":
+        print("train vgg")
+        net = vgg16(images)
+    elif args.model == "resnet":
+        print("train resnet")
+        if args.data_set == "cifar10":
+            net = resnet_cifar10(images)
+        elif args.data_set == "imagenet":
+            net = resnet_imagenet(images)
+        else:
+            raise ValueError("%s dataset is not supported" % args.data_set)
+    else:
+        raise ValueError("%s network is not supported" % args.model)
+
+    predict = fluid.layers.fc(input=net, size=class_dim, act='softmax')
+    cost = fluid.layers.cross_entropy(input=predict,
label=label) + avg_cost = fluid.layers.mean(x=cost) + acc = fluid.layers.accuracy(input=predict, label=label) + + #Test program + test_program = fluid.default_main_program().clone(for_test=True) + optimizer = fluid.optimizer.Adam(learning_rate=args.learning_rate) + optimizer.minimize(avg_cost) + + BATCH_SIZE = args.train_batch_size + PASS_NUM = 100 + + train_reader = paddle.batch( + paddle.reader.shuffle( + paddle.dataset.flowers.train() + if args.data_set == 'imagenet' else paddle.dataset.cifar.train10(), + buf_size=128 * 10), + batch_size=args.train_batch_size) + + test_reader = paddle.batch( + paddle.dataset.flowers.test() + if args.data_set == 'imagenet' else paddle.dataset.cifar.test10(), + batch_size=args.inf_batch_size) + + exe = fluid.Executor(place) + feeder = fluid.DataFeeder(place=place, feed_list=[images, label]) + + exe.run(fluid.default_startup_program()) + main_program = fluid.default_main_program() + + for pass_id in range(PASS_NUM): + for batch_id, data in enumerate(train_reader()): + train_image = np.array( + map(lambda x: x[0].reshape(data_shape), data)).astype("float32") + train_label = np.array(map(lambda x: x[1], data)).astype("int64") + train_label = train_label.reshape([-1, 1]) + + exe.run(main_program, + feed={'pixel': train_image, + 'label': train_label}) + + if (batch_id % 100) == 0: + acc_list = [] + avg_loss_list = [] + for tid, test_data in enumerate(test_reader()): + test_image = np.array( + map(lambda x: x[0].reshape(data_shape), + test_data)).astype("float32") + test_label = np.array(map(lambda x: x[1], + test_data)).astype("int64") + test_label = test_label.reshape([-1, 1]) + + loss_t, acc_t = exe.run( + program=test_program, + feed={"pixel": test_image, + "label": test_label}, + fetch_list=[avg_cost, acc]) + if math.isnan(float(loss_t)): + sys.exit("got NaN loss, training failed.") + acc_list.append(float(acc_t)) + avg_loss_list.append(float(loss_t)) + + acc_value = np.array(acc_list).mean() + avg_loss_value = np.array(avg_loss_list).mean() + + print( + 'PassID {0:1}, BatchID {1:04}, Test Loss {2:2.2}, Accuracy {3:2.2}'. + format(pass_id, batch_id + 1, + float(avg_loss_value), float(acc_value))) + + if acc_value > args.threshold: + print( + 'Save inference model with test accuracy of {0} at {1}'. 
+                    format(float(acc_value), save_dirname))
+                fluid.io.save_inference_model(save_dirname, ["pixel"],
+                                              [predict], exe)
+                return
+
+
+def test_accuracy(executor, inference_program, feed_target_names,
+                  fetch_targets):
+    if args.data_set == "cifar10":
+        data_shape = [3, 32, 32]
+    elif args.data_set == "imagenet":
+        data_shape = [3, 224, 224]
+    else:
+        raise ValueError("%s dataset is not supported" % args.data_set)
+
+    test_reader = paddle.batch(
+        paddle.dataset.cifar.test10()
+        if args.data_set == "cifar10" else paddle.dataset.flowers.test(),
+        batch_size=args.inf_batch_size)
+
+    test_num = 0
+    correct_num = 0
+
+    for test_data in test_reader():
+        test_image = np.array(
+            map(lambda x: x[0].reshape(data_shape), test_data)).astype(
+                "float32")
+        test_label = np.array(map(lambda x: x[1], test_data)).astype("int64")
+        test_label = test_label.reshape([-1, 1])
+
+        results = executor.run(program=inference_program,
+                               feed={feed_target_names[0]: test_image},
+                               fetch_list=fetch_targets)
+
+        prediction = np.argmax(results[0], axis=1).reshape([-1, 1])
+        correct_num += np.sum(prediction == test_label)
+        test_num += test_label.size
+
+    print("{0} out of {1} predictions are correct.".format(correct_num,
+                                                           test_num))
+    print("Test accuracy is {0}.".format(float(correct_num) / float(test_num)))
+
+
+def infer(place, save_dirname):
+    exe = fluid.Executor(place)
+    inference_scope = fluid.core.Scope()
+
+    with fluid.scope_guard(inference_scope):
+        # Use fluid.io.load_inference_model to obtain the inference program desc,
+        # the feed_target_names (the names of variables that will be fed
+        # data using feed operators), and the fetch_targets (variables that
+        # we want to obtain data from using fetch operators).
+        print("Load inference model from {0}".format(save_dirname))
+        [inference_program, feed_target_names,
+         fetch_targets] = fluid.io.load_inference_model(save_dirname, exe)
+
+        print("The test set accuracy of inference in float mode is:")
+        test_accuracy(exe, inference_program, feed_target_names, fetch_targets)
+
+        float16_inference_program = inference_program.clone()
+        t = Float16Transpiler()
+        t.transpile(float16_inference_program, place)
+
+        print("The test set accuracy of inference in float16 mode is:")
+        test_accuracy(exe, float16_inference_program, feed_target_names,
+                      fetch_targets)
+
+        fp16_save_dirname = "float16_" + save_dirname
+        fluid.io.save_inference_model(fp16_save_dirname, feed_target_names,
+                                      fetch_targets, exe,
+                                      float16_inference_program)
+
+
+@contextlib.contextmanager
+def scope_prog_guard():
+    prog = fluid.Program()
+    startup_prog = fluid.Program()
+    scope = fluid.core.Scope()
+    with fluid.scope_guard(scope):
+        with fluid.program_guard(prog, startup_prog):
+            yield
+
+
+if __name__ == "__main__":
+    if not fluid.core.is_compiled_with_cuda():
+        raise Exception("This test requires CUDA GPUs!")
+
+    place = fluid.CUDAPlace(0)
+    if not fluid.core.is_float16_supported(place):
+        raise Exception(
+            "This test requires compute capability of CUDA GPU >= 5.3!")
+
+    for i in range(args.repeat):
+        with scope_prog_guard():
+            save_dirname = "image_classification_" + args.data_set + "_" + args.model + ".inference.model"
+            train(place, save_dirname)
+            infer(place, save_dirname)
diff --git a/contrib/float16/float16_inference_report.md b/contrib/float16/float16_inference_report.md
new file mode 100644
index 0000000000..67623a4d8d
--- /dev/null
+++ b/contrib/float16/float16_inference_report.md
@@ -0,0 +1,163 @@
+## Introduction
+Working with deep neural networks (DNN) is a two-stage process.
First we train the DNN using labeled examples of inputs and desired outputs to obtain the model parameters (weights), then we deploy the DNN along with the trained weights to run inference on unknown inputs. Typically, these weights are in float data type, and hence we run inference in float mode using them. This post discusses how to use the low-precision float16 data type to represent these trained weights and run inference in float16 mode, as well as the advantages of float16 inference over its float counterpart, demonstrated with some experiment results.
+
+## What is float16?
+float16 (or FP16) is a half-precision floating-point format that uses 16 bits in memory to represent a value. The advantage over the 32-bit single-precision floating-point format (commonly known as the float data type) is that it requires half the storage and bandwidth at the expense of precision and range. Fortunately, DNN inference has a high tolerance for the loss of precision and range incurred by using float16 to represent the weights, and the inference accuracy will only be minimally affected in most cases. This gives us the opportunity to use the float16 data type to speed up inference.
+
+Interested readers can refer to our [design doc](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/fluid/design/data_type/float16.md) and [code](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/platform/float16.h) for more details on how we implement the float16 data type.
+
+## Why float16?
+The trend in today's deep learning community is to use bigger and deeper models. This translates to a larger memory footprint, higher computation demands, and as a result higher energy consumption on computing devices. The advantages of float16 over float are correspondingly three-fold (point 1 is illustrated by the small example after this list):
+
+1. We only need half the memory size to load the same model using float16 representations. Moreover, most of the intermediate results generated during float16 inference are also of float16 data type. This makes the whole memory footprint of float16 inference roughly half of its float counterpart, which is especially useful when deploying inference on mobile devices with limited available memory. Also, given the same available memory, the maximum batch size for float16 inference is about twice that for float inference.
+
+2. Because float16 occupies less memory than float, in theory hardware devices can achieve much higher floating point operations per second (FLOPS) for float16 data than for float data. Right now, an outstanding example of hardware devices that actually deliver such advantages is Nvidia's latest Volta architecture GPUs, including Tesla V100 and Titan V. Moreover, float16 takes less time to read from or write to memory, and hence float16 can make inference more efficient, especially in memory-bound applications where performance is largely determined by how fast it is to read and write data.
+
+3. From the energy efficiency perspective, the energy needed to read, write, and compute float16 data is much less than that of its float counterpart, which can significantly reduce the battery power consumption on mobile devices or the total cost of ownership (TCO) of data centers.
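+
+As a quick, self-contained illustration of the storage claim in point 1 (this uses plain NumPy and is independent of Fluid):
+
+```python
+import numpy as np
+
+weights = np.random.rand(1024, 1024).astype(np.float32)
+half = weights.astype(np.float16)
+
+print(weights.nbytes)  # 4194304 bytes (4 MB)
+print(half.nbytes)     # 2097152 bytes (2 MB), half the storage
+print(np.abs(weights - half.astype(np.float32)).max())  # small rounding error
+```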
+
+## Fluid implementation of float16 inference
+### Overview
+Fluid uses a [Program](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/fluid/design/modules/python_api.md#program) instead of a computation graph to describe a neural network model and the optimization procedure. A Fluid program is a Python wrapper around a protobuf message called [ProgramDesc](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/fluid/design/concepts/program.md). Similar to programming languages, the basic structure of a Fluid program is some nested [blocks](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/fluid/design/modules/python_api.md#block), where each block consists of some [variable](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/fluid/design/modules/python_api.md#variable) definitions and a sequence of [operators](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/fluid/design/modules/python_api.md#operator). An [executor](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/fluid/design/concepts/executor.md) will run a given program by sequentially executing the operators in the entrance block.
+
+### Basic requirement
+When an operator is run by an executor, it uses a kernel to perform computations on tensors contained in the input variables, and then writes the results to the tensors in the output variables. Each operator has multiple kernels for different combinations of data types, devices, and library types. The operator will select the appropriate kernel to run based on, among other things, the data type of the input tensors. By default, every Fluid operator has a kernel for the float data type that takes float inputs and generates float outputs.
+
+This means that if we provide float input to the first operator in a program, then each operator will use its float kernel to compute a float output and send it as input to the next operator, triggering that operator's float kernel in turn. This chain effect makes the program run in float mode and gives us a final output of float data type.
+
+The same principle applies if we want a program to run in float16 mode. We provide an input variable of float16 data type to the first operator, and every subsequent operator will invoke its float16 kernel until we get the final output in float16 data type. So the preliminary requirement for float16 inference is to add float16 kernels to the operators needed in a specific kind of neural network. Our current focus is on Convolutional Neural Networks (CNN), and hence we have added float16 kernels to the following operators: convolution, pooling, GEMM, elementwise addition, batch norm, dropout, various activations including relu and tanh, and softmax.
+
+### float16 transpiler
+Furthermore, we need a float16 transpiler to enable the following usage code:
+
+```python
+# Get the float32 inference program and load the associated float32 weights
+[inference_program, feed_target_names,
+ fetch_targets] = fluid.io.load_inference_model(save_dirname, exe)
+
+# Prepare the float input data
+batch_size = 1
+tensor_img = numpy.random.rand(batch_size, 3, 32, 32).astype(numpy.float32)
+
+# Running inference_program in float mode
+float_results = exe.run(inference_program,
+                        feed={feed_target_names[0]: tensor_img},
+                        fetch_list=fetch_targets)
+
+# Use float16 transpiler to speedup
+float16_inference_program = inference_program.clone()
+t = Float16Transpiler()
+t.transpile(float16_inference_program, GPUPlace)
+
+# Running float16_inference_program in float16 mode using the same input data
+float16_results = exe.run(float16_inference_program,
+                          feed={feed_target_names[0]: tensor_img},
+                          fetch_list=fetch_targets)
+
+# Do some tests to verify the correctness of float16 inference
+...
+np.testing.assert_almost_equal(float_results, float16_results, ...)
+...
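+# Note: in both runs above we feed the same float32 input and get float32
+# results back; the transpiler inserts cast ops at the feed/fetch boundaries,
+# so the float16 program keeps the float32 feed/fetch interface (explained below).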
+
+# Save the float16 inference program and float16 weights for future deployment
+fluid.io.save_inference_model(fp16_save_dirname, feed_target_names,
+                              fetch_targets, exe,
+                              float16_inference_program)
+```
+
+In this scenario, we already have a float32 inference program and some associated float32 weights that can do float32 inference. We can easily use the `transpile` method of the `Float16Transpiler` class to make certain modifications to the existing program and weights so that we have a new float16 program and the associated float16 weights.
+
+We can then run various inference experiments in float16 mode and save the float16 program and weights on disk for future deployment. To enhance code usability, we maintain a consistent API so that users can use the same float32 input data to run the inference program in either float32 or float16 mode and obtain output data of float32 data type in both cases. This requires us to add some cast operators in the program to convert between float16 tensors and float32 tensors.
+
+The float16 transpiler is implemented to fulfill the requirements mentioned above. The details of the float16 transpiler can be found [here](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/fluid/design/data_type/float16.md#float16-inference).
+
+### Experiment results
+We provide demo codes that can be used to reproduce the experiment results by doing:
+```bash
+git clone https://github.com/PaddlePaddle/Paddle.git
+cd Paddle
+# This line will generate a paddle development docker image with cuda 8 and cudnn 7
+# If you want to test on cuda 9 instead, change the line 5 in Paddle/Dockerfile
+# from `FROM nvidia/cuda:8.0-cudnn7-devel-ubuntu16.04`
+# to `FROM nvidia/cuda:9.0-cudnn7-devel-ubuntu16.04` and similarly for other configurations
+nvidia-docker build -t paddle:float16 .
+# After running this, different results will be written to different log files in Paddle/contrib/float16/
+nvidia-docker run -it -v $PWD:/paddle paddle:float16 /paddle/contrib/float16/run_float16_demo.sh
+```
+
+#### Correctness
+As mentioned before, DNN inference has been found to be tolerant of the loss of precision and range incurred by float16, and we want to see how good this tolerance is.
+
+We train a resnet32 model using the cifar10 data set, save it when test set accuracy is above 60%, and then test the inference accuracy on the 10000 examples of the cifar10 test set in float16 and float32 mode, respectively.
+
+We repeat the test ten times and get the following results:
+
+|        | float16 | float32 |
+|--------|--------:|--------:|
+| # 1    | 62.75%  | 62.72%  |
+| # 2    | 61.27%  | 61.28%  |
+| # 3    | 62.24%  | 62.23%  |
+| # 4    | 64.16%  | 64.17%  |
+| # 5    | 60.75%  | 60.77%  |
+| # 6    | 63.25%  | 63.24%  |
+| # 7    | 62.15%  | 62.13%  |
+| # 8    | 62.05%  | 62.02%  |
+| # 9    | 65.19%  | 65.20%  |
+| #10    | 62.53%  | 62.48%  |
+| average| 62.63%  | 62.62%  |
+
+We can see that the accuracy of float16 inference is very close to that of float32 inference in every experiment (within 0.05% difference) and is overall 0.01% better than its float32 counterpart averaged over 10 tests.
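+
+The per-run accuracies above come from running the same saved model through the two programs. For reference, a compressed sketch of the comparison loop, adapted from the demo's `test_accuracy` function (the names `test_reader`, `data_shape`, `exe`, `feed_target_names`, and `fetch_targets` follow that script), looks like:
+
+```python
+# Sketch adapted from float16_inference_demo.py: count matching predictions.
+correct, total = 0, 0
+for batch in test_reader():
+    image = np.array([x[0].reshape(data_shape) for x in batch]).astype("float32")
+    label = np.array([x[1] for x in batch]).astype("int64").reshape([-1, 1])
+    probs = exe.run(program=float16_inference_program,
+                    feed={feed_target_names[0]: image},
+                    fetch_list=fetch_targets)
+    correct += np.sum(np.argmax(probs[0], axis=1).reshape([-1, 1]) == label)
+    total += label.size
+print(float(correct) / float(total))
+```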
+
+#### Performance benchmark
+Currently, Fluid inference in float16 mode is only supported on Nvidia GPU devices. There is no motivation to support float16 inference on non-ARM CPUs because float16 is not natively supported there, and float16 calculation will only be slower than its float counterpart.
+
+Nvidia started to support its native float16 data type (which has the same internal memory representation as the Fluid float16 class) on CUDA 7.5. Moreover, float16 speedups on common computationally intensive tasks, including GEMM (general matrix-matrix multiplication) and convolution, are supported since cuBLAS 7.5 and cuDNN 5.0.
+
+Recently, the introduction of [tensor cores](https://devblogs.nvidia.com/programming-tensor-cores-cuda-9/) in the Volta architecture GPUs and the support of tensor core computation in CUDA 9.0 and cuDNN 7 make float16 truly superior to float in certain deep learning applications.
+
+We thus benchmark the float16 inference performance on a single Nvidia Tesla V100 GPU (Volta architecture, with tensor cores) and compare it with its float32 counterpart. All the following results are in ms (millisecond) averaged over 1000 mini-batches with respect to different mini-batch (mb) sizes.
+
+Average inference time for one mini-batch on the Vgg16 model tested on the imagenet data set:
+
+| total | mb=1  | mb=2 | mb=4  | mb=8  | mb=16 | mb=32 | mb=64  |
+|-------|------:|-----:|------:|------:|------:|------:|-------:|
+|float32| 14.01 | 9.70 | 22.99 | 28.26 | 53.87 | 84.42 | 178.95 |
+|float16| 3.32  | 4.11 | 5.88  | 9.41  | 16.54 | 30.47 | 60.23  |
+|Speedup| 4.22  | 2.36 | 3.91  | 3.00  | 3.26  | 2.77  | 2.97   |
+
+We can see that float16 inference provides a 2x ~ 4x speedup across batch sizes.
+
+Convolution is usually the computational bottleneck of a CNN, so we also check the average time spent on the Fluid convolution operators for one mini-batch as follows:
+
+|conv op| mb=1  | mb=2 | mb=4  | mb=8  | mb=16 | mb=32 | mb=64  |
+|-------|------:|-----:|------:|------:|------:|------:|-------:|
+|float32| 11.95 | 6.96 | 18.65 | 21.42 | 41.35 | 60.58 | 130.11 |
+|float16| 1.78  | 2.10 | 2.93  | 4.55  | 7.99  | 14.63 | 28.67  |
+|Speedup| 6.71  | 3.31 | 6.37  | 4.71  | 5.18  | 4.14  | 4.54   |
+
+The Fluid convolution operator uses cuDNN 7 to implement its kernel, and we can see that with the help of tensor cores, float16 convolution is significantly faster than its float32 counterpart, which makes the overall float16 inference performance much better.
+
+Similarly, we also list the benchmark results of the Resnet50 model tested on the imagenet data set:
+
+| total | mb=1 | mb=2 | mb=4 | mb=8  | mb=16 | mb=32 | mb=64 | mb=128 |
+|-------|-----:|-----:|-----:|------:|------:|------:|------:|-------:|
+|float32| 7.03 | 7.41 | 9.16 | 12.55 | 21.13 | 38.27 | 67.93 | 127.02 |
+|float16| 6.13 | 6.32 | 6.24 | 7.40  | 10.90 | 18.18 | 33.20 | 64.52  |
+|Speedup| 1.15 | 1.17 | 1.47 | 1.70  | 1.94  | 2.11  | 2.05  | 1.97   |
+
+|conv op| mb=1 | mb=2 | mb=4 | mb=8 | mb=16 | mb=32 | mb=64 | mb=128 |
+|-------|-----:|-----:|-----:|-----:|------:|------:|------:|-------:|
+|float32| 5.43 | 5.46 | 6.50 | 8.36 | 13.80 | 24.45 | 41.21 | 73.44  |
+|float16| 4.19 | 4.30 | 3.96 | 4.21 | 5.63  | 8.77  | 15.24 | 28.40  |
+|Speedup| 1.30 | 1.27 | 1.64 | 1.99 | 2.45  | 2.79  | 2.70  | 2.59   |
+
+We find that the speedup provided by float16 inference starts relatively small at 1.15x for batch size 1 and gradually increases to about 2x for larger batch sizes. A similar trend can be found for the time spent on the convolution operator. Note that right now tensor cores will only be utilized in the convolution operation when certain dimensional requirements are met for the input data and filter. The speedup from float16 inference for Resnet50 is smaller than its Vgg16 counterpart, partially because the convolution operations in Resnet are much simpler than those in Vgg, and this makes the tensor cores less utilized in Resnet than in Vgg.
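+
+(As a rule of thumb based on Nvidia's public tensor core guidance for cuDNN 7, and not something enforced or exposed by Fluid itself, the float16 convolution tensor core paths are typically engaged only when the relevant channel counts are multiples of 8; this is one concrete form the dimensional requirements mentioned above can take. A trivial sanity check of that assumption might look like:)
+
+```python
+def may_use_tensor_cores(in_channels, out_channels):
+    # Rule-of-thumb check (assumption from Nvidia's cuDNN guidance, not a
+    # Fluid API): fp16 conv tensor core paths want channel counts that are
+    # multiples of 8.
+    return in_channels % 8 == 0 and out_channels % 8 == 0
+```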
We also did the same benchmark on an Nvidia GeForce GTX 1080 Ti GPU, which does not support tensor cores. The results show that for Vgg16, float16 inference provides a consistent small speedup (around 1.15x) for all mini-batch sizes, while for Resnet50, float16 inference is slower than its float32 counterpart at small batch sizes (mb = 1 and 2) and then delivers around a 1.15x speedup for all larger batch sizes. By comparing the benchmarks on the 1080 Ti and the V100, we find that tensor cores, which are specialized for float16 computations, are a critical component of high-performance float16 inference.

Please refer to [here](https://github.com/PaddlePaddle/Paddle/blob/develop/contrib/float16/float16_benchmark.md) for comprehensive benchmark results.

### Summary
1. Fluid is now able to run inference in float16 mode via a float16 transpiler. We currently support CNN programs, including Vgg and Resnet, running in float16 inference mode.
2. The accuracy of float16 inference is verified to be almost identical to its float32 counterpart, at least on CNNs.
3. Float16 inference provides a significant speedup on the large and computationally intensive Vgg16 network on the imagenet data set. For the much smaller and simpler Resnet50, the speedup provided by float16 inference is less significant than on Vgg16 but still favorable, especially for large batch sizes.
4. We cannot achieve this superior float16 inference performance without the help of the newly introduced tensor cores on Nvidia Volta architecture GPUs.
diff --git a/contrib/float16/float16_transpiler.py b/contrib/float16/float16_transpiler.py
new file mode 100644
index 0000000000..91ba101edb
--- /dev/null
+++ b/contrib/float16/float16_transpiler.py
@@ -0,0 +1,256 @@
+# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import numpy as np
+import paddle.fluid.core as core
+from paddle.fluid.framework import Program
+from paddle.fluid.executor import global_scope
+
+
+class Float16Transpiler:
+    def transpile(self, program, place, scope=None):
+        '''
+        Transpile the program desc and cast the weights to float16 data type to
+        enable float16 inference.
+
+        Since each operator in a program desc automatically chooses the right
+        compute kernel to run based on the data type of its input tensors, we
+        don't actually need to change the program desc to run in float16 mode.
+
+        However, in this way, users who are used to feeding and fetching tensors
+        of float32 data type when running typical inference may find it confusing
+        and difficult to run inference in float16 mode, as they need to convert
+        input data to float16 dtype and then convert the results back to float32
+        dtype to match the rest of the code.
+
+        So this function appends cast ops to the program desc where necessary so
+        that users are able to run inference in float16 mode while providing input
+        tensors (feed_holder) of float data type and obtaining output tensors
+        (fetch_holder) of float data type.
+
+        Moreover, it is desired that when we have the scope and program desc to run
+        inference in float32 mode, we can use a single API to do the necessary
+        modifications and then the user can run float16 inference on the fly. To make
+        this happen, this function also creates new parameters in the scope to hold the
+        converted float16 weights and changes the operators in the program desc to use
+        these new parameters.
+
+        :param program: program to transpile
+        :type program: Program
+        :param place: inference place
+        :type place: Place
+        :param scope: inference scope
+        :type scope: Scope
+        '''
+        if not isinstance(program, Program):
+            raise TypeError("program should be of Program type")
+        if not isinstance(place, core.CPUPlace) and not isinstance(
+                place, core.CUDAPlace):
+            raise TypeError("place should be of CPUPlace/CUDAPlace type")
+        if scope is None:
+            scope = global_scope()
+        if not isinstance(scope, core.Scope):
+            raise TypeError("scope should be of Scope type or None")
+
+        self.scope = scope
+        self.place = place
+        self.block = program.block(0)
+        self.input_map = {}  # store the input names that should be adjusted
+
+        self._modify_feed_fetch()
+        self._convert_param_to_float16()
+        self._adjust_input(skip=True)
+        self._remove_unused_var()
+
+        # TODO(luotao): use clone() method to flush the program.desc in force,
+        # since some large program.desc will not be flushed immediately.
+        # And a better solution will be considered later.
+        program = program.clone()
+
+    # ====================== private transpiler functions =====================
+    def _adjust_input(self, skip=False):
+        '''
+        Change the input variable names in operators.
+
+        When we are in the process of modifying a program desc, we usually
+        replace some variables with some other variables, where we create
+        a dictionary input_map to record the one-to-one correspondence
+        between each old variable and the new one.
+
+        After that, this function will search all the operators that use the
+        old variables and change the info in each op to use the new variables. There
+        may be some exceptions to this rule when we are using the float16 transpiler
+        and insert cast ops to cast a float32 variable to a float16 one. After we
+        insert the cast op to cast var_1 to var_1_fp16, we don't want to change
+        the input of the cast op to var_1_fp16 after using this function.
+        '''
+        skip_ops = {"cast"}
+        for i in range(len(self.block.ops)):
+            current_op = self.block.ops[i]
+            if skip and current_op.type in skip_ops:
+                continue
+            for input_arg in current_op.input_arg_names:
+                if input_arg in self.input_map:
+                    current_op.rename_input(input_arg,
+                                            self.input_map[input_arg])
+
+    def _remove_unused_var(self):
+        '''
+        Remove unused variables in the program.
+        '''
+        args = []
+        for i in range(len(self.block.ops)):
+            current_op = self.block.ops[i]
+            args += current_op.input_arg_names
+            args += current_op.output_arg_names
+        args = list(set(args))  # unique the input and output arguments
+
+        for var in self.block.vars.keys():
+            if var not in args:
+                self.block.remove_var(var)
+
+    def _modify_feed_fetch(self):
+        '''
+        Modify feed fetch op/vars for float16 inference.
+
+        For each feed op:
+        feed_op->feed_target_var
+
+        Change it to:
+        feed_op->feed_target_var->cast_op(from other dtype to float16)->tmp_var
+
+        For each fetch op:
+        fetch_target_var->fetch_op
+
+        Change it to:
+        tmp_var->cast_op(from float16 to other dtype)->fetch_target_var->fetch_op
+
+        :return: None
+        '''
+
+        def find_op(var):
+            # It is possible that var.op is not up to date after some
+            # modifications to the program desc. Here we force it to be up to date.
+            var.op = None
+            for op in self.block.ops:
+                if var.name in op.output_arg_names:
+                    var.op = op
+                    break
+
+            if var.op is None:
+                raise ValueError("The target variable must have an "
+                                 "associated operator that generates it.")
+
+        i = 0
+        while i < len(self.block.ops):
+            cur_op = self.block.ops[i]
+            if cur_op.type == "feed":
+                var_name = cur_op.output("Out")[0]
+                tmp_var_name = var_name + ".fp16"
+                var = self.block.vars[var_name]
+                tmp_var = self.block.create_var(
+                    name=tmp_var_name.encode('ascii'),
+                    type=var.type,
+                    dtype=core.VarDesc.VarType.FP16,
+                    shape=var.shape,
+                    persistable=var.persistable)
+                self.block.insert_op(
+                    i + 1,
+                    type="cast",
+                    inputs={"X": var},
+                    outputs={"Out": tmp_var},
+                    attrs={
+                        'in_dtype': int(var.dtype),
+                        'out_dtype': int(tmp_var.dtype)
+                    })
+                self.input_map[var_name] = tmp_var_name
+                i = i + 1
+            elif cur_op.type == "fetch":
+                var_name = cur_op.input("X")[0]
+                tmp_var_name = var_name + ".fp16"
+                var = self.block.vars[var_name]
+                tmp_var = self.block.create_var(
+                    name=tmp_var_name.encode('ascii'),
+                    type=var.type,
+                    dtype=core.VarDesc.VarType.FP16,
+                    shape=var.shape,
+                    persistable=var.persistable)
+                find_op(var)
+                var.op.rename_output(var_name, tmp_var_name)
+                self.block.insert_op(
+                    i,
+                    type="cast",
+                    inputs={"X": tmp_var},
+                    outputs={"Out": var},
+                    attrs={
+                        'in_dtype': int(tmp_var.dtype),
+                        'out_dtype': int(var.dtype)
+                    })
+                i = i + 1
+            i = i + 1
+
+    def _convert_param_to_float16(self):
+        def _get_no_fp16_conversion_var_names():
+            '''
+            Get the set of input variable names that shouldn't be converted to float16.
+
+            When we want to run inference in float16 mode, most parameters need to be
+            first converted to float16. However, there are some parameters that
+            shouldn't be converted to float16 because the corresponding operator
+            requires float32 parameters even in float16 mode (when the input data is
+            of float16 data type). Currently, the only operator that has this exclusion
+            is the batch norm op.
+
+            :return: set of input variable names
+            :type var_names: set
+            '''
+            op_names = {'batch_norm'}
+            var_names = []
+            for op in self.block.ops:
+                if op.type in op_names:
+                    var_names += op.input_arg_names
+            return set(var_names)
+
+        def _should_be_converted(var):
+            return var.persistable and \
+                var.name not in self.no_conversion_vars and \
+                var.type != core.VarDesc.VarType.FEED_MINIBATCH and \
+                var.type != core.VarDesc.VarType.FETCH_LIST
+
+        self.no_conversion_vars = _get_no_fp16_conversion_var_names()
+        conversion_var_list = filter(_should_be_converted,
+                                     self.block.vars.values())
+        for var in conversion_var_list:
+            fp16_var_name = var.name + ".fp16"
+            fp16_var = self.block.create_parameter(
+                name=fp16_var_name.encode('ascii'),
+                type=var.type,
+                dtype=core.VarDesc.VarType.FP16,
+                shape=var.shape)
+
+            # cast the data in the tensor of the original var to float16
+            # data type and store it in the tensor of the new float16 var
+            self.scope.var(fp16_var_name)
+            fp16_tensor = self.scope.find_var(fp16_var_name).get_tensor()
+            tensor = np.array(self.scope.find_var(var.name).get_tensor())
+            # After the old tensor data is converted to np.float16, view(np.uint16)
+            # is used so that the internal memory of the numpy array will be
+            # reinterpreted to be of np.uint16 data type, which is bound to the fluid
+            # float16 data type with the help of pybind in tensor_py.h.
+            fp16_tensor.set(
+                tensor.astype(np.float16).view(np.uint16), self.place)
+
+            # the old var will be replaced by the fp16 var in the program desc
+            self.input_map[var.name] = fp16_var_name
+            self.block.remove_var(var.name)
diff --git a/contrib/float16/run_float16_demo.sh b/contrib/float16/run_float16_demo.sh
new file mode 100755
index 0000000000..d8a34ee67b
--- /dev/null
+++ b/contrib/float16/run_float16_demo.sh
@@ -0,0 +1,117 @@
+#!/bin/bash
+
+BUILD_PATH=/paddle/fp16_build
+WHEEL_PATH=$BUILD_PATH/python/dist
+INFER_PATH=$BUILD_PATH/paddle/fluid/inference/tests/book
+DEMO_PATH=/paddle/contrib/float16
+
+# Use the single most powerful CUDA GPU on your machine
+export CUDA_VISIBLE_DEVICES=0
+
+# Build the PaddlePaddle Fluid wheel package and install it.
+mkdir -p $BUILD_PATH && cd $BUILD_PATH
+cmake .. -DWITH_AVX=OFF \
+         -DWITH_MKL=OFF \
+         -DWITH_GPU=ON \
+         -DWITH_TESTING=ON \
+         -DWITH_TIMER=ON \
+         -DWITH_PROFILER=ON \
+         -DWITH_FLUID_ONLY=ON
+make -j `nproc`
+pip install -U "$WHEEL_PATH/$(ls $WHEEL_PATH)"
+
+cd $DEMO_PATH
+# Clear previous log results
+rm -f *.log
+
+# Test the float16 inference accuracy of resnet32 on the cifar10 data set
+stdbuf -oL python float16_inference_demo.py \
+       --data_set=cifar10 \
+       --model=resnet \
+       --threshold=0.6 \
+       --repeat=10 \
+       2>&1 | tee -a float16_inference_accuracy.log
+
+# Sleep to cool down the GPU for consistent benchmarking
+sleep 2m
+
+# benchmarking parameters
+REPEAT=1000
+MAXIMUM_BATCH_SIZE=512
+
+for ((batch_size = 1; batch_size <= MAXIMUM_BATCH_SIZE; batch_size *= 2));
+do
+
+  # Test inference benchmark of vgg16 on imagenet
+  stdbuf -oL python float16_inference_demo.py \
+         --data_set=imagenet \
+         --model=vgg \
+         --threshold=0.001 \
+         --repeat=1
+
+  $INFER_PATH/test_inference_image_classification_vgg \
+      --data_set=imagenet \
+      --dirname=$DEMO_PATH/image_classification_imagenet_vgg.inference.model \
+      --fp16_dirname=$DEMO_PATH/float16_image_classification_imagenet_vgg.inference.model \
+      --repeat=$REPEAT \
+      --batch_size=$batch_size \
+      --skip_cpu=true \
+      2>&1 | tee -a imagenet_vgg16_benchmark.log
+
+  sleep 2m
+
+  # Test inference benchmark of resnet50 on imagenet
+  stdbuf -oL python float16_inference_demo.py \
+         --data_set=imagenet \
+         --model=resnet \
+         --threshold=0.001 \
+         --repeat=1
+
+  $INFER_PATH/test_inference_image_classification_resnet \
+      --data_set=imagenet \
+      --dirname=$DEMO_PATH/image_classification_imagenet_resnet.inference.model \
+      --fp16_dirname=$DEMO_PATH/float16_image_classification_imagenet_resnet.inference.model \
+      --repeat=$REPEAT \
+      --batch_size=$batch_size \
+      --skip_cpu=true \
+      2>&1 | tee -a imagenet_resnet50_benchmark.log
+
+  sleep 2m
+
+  # Test inference benchmark of vgg16 on cifar10
+  stdbuf -oL python float16_inference_demo.py \
+         --data_set=cifar10 \
+         --model=vgg \
+         --threshold=0.001 \
+         --repeat=1
+
+  $INFER_PATH/test_inference_image_classification_vgg \
+      --data_set=cifar10 \
+      --dirname=$DEMO_PATH/image_classification_cifar10_vgg.inference.model \
+      --fp16_dirname=$DEMO_PATH/float16_image_classification_cifar10_vgg.inference.model \
+      --repeat=$REPEAT \
+      --batch_size=$batch_size \
+      --skip_cpu=true \
+      2>&1 | tee -a cifar10_vgg16_benchmark.log
+
+  sleep 1m
+
+  # Test inference benchmark of resnet32 on cifar10
+  stdbuf -oL python float16_inference_demo.py \
+         --data_set=cifar10 \
+         --model=resnet \
+         --threshold=0.001 \
+         --repeat=1
+
+  $INFER_PATH/test_inference_image_classification_resnet \
+      --data_set=cifar10 \
+      --dirname=$DEMO_PATH/image_classification_cifar10_resnet.inference.model \
--fp16_dirname=$DEMO_PATH/float16_image_classification_cifar10_resnet.inference.model \ + --repeat=$REPEAT \ + --batch_size=$batch_size \ + --skip_cpu=true \ + 2>&1 | tee -a cifar10_resnet32_benchmark.log + + sleep 1m + +done diff --git a/paddle/fluid/inference/tests/book/test_inference_image_classification.cc b/paddle/fluid/inference/tests/book/test_inference_image_classification.cc index 1a685b9e2e..c4fd1e298b 100644 --- a/paddle/fluid/inference/tests/book/test_inference_image_classification.cc +++ b/paddle/fluid/inference/tests/book/test_inference_image_classification.cc @@ -16,9 +16,12 @@ limitations under the License. */ #include "gtest/gtest.h" #include "paddle/fluid/inference/tests/test_helper.h" +DEFINE_string(data_set, "cifar10", "Data set to test"); DEFINE_string(dirname, "", "Directory of the inference model."); +DEFINE_string(fp16_dirname, "", "Directory of the float16 inference model."); DEFINE_int32(batch_size, 1, "Batch size of input data"); DEFINE_int32(repeat, 1, "Running the inference program repeat times"); +DEFINE_bool(skip_cpu, false, "Skip the cpu test"); TEST(inference, image_classification) { if (FLAGS_dirname.empty() || FLAGS_batch_size < 1 || FLAGS_repeat < 1) { @@ -35,20 +38,31 @@ TEST(inference, image_classification) { paddle::framework::LoDTensor input; // Use normilized image pixels as input data, // which should be in the range [0.0, 1.0]. - SetupTensor(&input, {FLAGS_batch_size, 3, 32, 32}, - static_cast(0), static_cast(1)); + if (FLAGS_data_set == "cifar10") { + SetupTensor(&input, {FLAGS_batch_size, 3, 32, 32}, + static_cast(0), static_cast(1)); + } else if (FLAGS_data_set == "imagenet") { + SetupTensor(&input, {FLAGS_batch_size, 3, 224, 224}, + static_cast(0), static_cast(1)); + } else { + LOG(FATAL) << "Only cifar10 or imagenet is supported."; + } + std::vector cpu_feeds; cpu_feeds.push_back(&input); paddle::framework::LoDTensor output1; - std::vector cpu_fetchs1; - cpu_fetchs1.push_back(&output1); - - // Run inference on CPU - LOG(INFO) << "--- CPU Runs: ---"; - TestInference( - dirname, cpu_feeds, cpu_fetchs1, FLAGS_repeat); - LOG(INFO) << output1.dims(); + if (!FLAGS_skip_cpu) { + std::vector cpu_fetchs1; + cpu_fetchs1.push_back(&output1); + + // Run inference on CPU + LOG(INFO) << "--- CPU Runs: ---"; + LOG(INFO) << "Batch size is " << FLAGS_batch_size; + TestInference( + dirname, cpu_feeds, cpu_fetchs1, FLAGS_repeat); + LOG(INFO) << output1.dims(); + } #ifdef PADDLE_WITH_CUDA paddle::framework::LoDTensor output2; @@ -57,24 +71,27 @@ TEST(inference, image_classification) { // Run inference on CUDA GPU LOG(INFO) << "--- GPU Runs: ---"; + LOG(INFO) << "Batch size is " << FLAGS_batch_size; TestInference( dirname, cpu_feeds, cpu_fetchs2, FLAGS_repeat); LOG(INFO) << output2.dims(); - CheckError(output1, output2); + if (!FLAGS_skip_cpu) { + CheckError(output1, output2); + } // float16 inference requires cuda GPUs with >= 5.3 compute capability - if (paddle::platform::GetCUDAComputeCapability(0) >= 53) { + if (!FLAGS_fp16_dirname.empty() && + paddle::platform::GetCUDAComputeCapability(0) >= 53) { paddle::framework::LoDTensor output3; std::vector cpu_fetchs3; cpu_fetchs3.push_back(&output3); LOG(INFO) << "--- GPU Runs in float16 mode: ---"; - std::string fp16_dirname = dirname; - fp16_dirname.replace(fp16_dirname.find("book/"), - std::string("book/").size(), "book/float16_"); + LOG(INFO) << "Batch size is " << FLAGS_batch_size; + TestInference( - fp16_dirname, cpu_feeds, cpu_fetchs3, FLAGS_repeat); + FLAGS_fp16_dirname, cpu_feeds, cpu_fetchs3, FLAGS_repeat); 
CheckError(output2, output3); } diff --git a/python/paddle/fluid/inference_transpiler.py b/python/paddle/fluid/inference_transpiler.py index f4ad717b9e..39b01610f9 100644 --- a/python/paddle/fluid/inference_transpiler.py +++ b/python/paddle/fluid/inference_transpiler.py @@ -121,60 +121,7 @@ class InferenceTranspiler: # And a better solution will be considered later. program = program.clone() - def float16_transpile(self, program, place, scope=None): - ''' - Transpile the program desc and cast the weights to float16 data type to - enable float16 inference. - - Since the operator in a program desc will automatically choose the - right compute kernel to run based on the data type of the input tensor. - We actually don't need to change the program desc to run in float16 mode. - - However, in this way, users who are used to feeding and fetching tensors - of float32 data type when running typical inference may find it confusing - and difficult to run inference in float16 mode as they need to convert - input data to float16 dtype and then convert the results back to float32 - dtype to match the rest of code. - - So this function appends cast ops to the program desc where necessary so - that users are able to run inference in float16 mode while providing input - tensor (feed_holder) of float data type and obtaining output tensor - (fetch_holder) of float data type. - - Moreover, it is desired that when we have the scope and program desc to run - inference in float32 mode, we can use a single API to do the necessary - modification and then user can run float16 inference on the fly. To make - this happen, this function also create new parameters in the scope to have the - converted float16 weights and change the operators in program desc to use - these new parameters. - - :param program: program to transpile - :type program: Program - :param place: inference place - :type place: Place - :param scope: inference scope - :type scope: Scope - ''' - if scope is None: - scope = global_scope() - - self.scope = scope - self.place = place - self.block = program.block(0) - self.input_map = {} # store the input names should be adjusted - - self._modify_feed_fetch() - self._convert_param_to_float16() - self._adjust_input(skip=True) - self._remove_unused_var() - - # TODO(luotao): use clone() method to flush the program.desc in force, - # since some large program.desc will not be flushed immediately. - # And a better solution will be considered later. - program = program.clone() - # ====================== private transpiler functions ===================== - def _insert_bias_op(self, index, current_op, bn_op): ''' Construct elementwise_add operator for adding bias @@ -269,27 +216,9 @@ class InferenceTranspiler: # collect the renamed input self.input_map[bn_op.output("Y")[0]] = bias_op.output("Out")[0] - def _adjust_input(self, skip=False): - ''' - Change the input variable name in operators. - - When we are in the process of modifying a program desc, we usually - replace some variables with some other variables, where we create - a dictionary input_map to record the one-to-one correspondence - between each old variable and the new one. - - After that, this function will search all the operators that use the - old variables and change the info in op to use the new variables. There - maybe some exceptions to this rule when we are using the float16 transpiler - and insert cast ops to cast float32 variable to float16 one. 
After we - insert the cast op to cast var_1 to var_1_fp16, we don't want to change - the input of cast op to var_1_fp16 after using this function. - ''' - skip_ops = {"cast"} + def _adjust_input(self): for i in range(len(self.block.ops)): current_op = self.block.ops[i] - if skip and current_op.type in skip_ops: - continue for input_arg in current_op.input_arg_names: if input_arg in self.input_map: current_op.rename_input(input_arg, @@ -309,138 +238,3 @@ class InferenceTranspiler: for var in self.block.vars.keys(): if var not in args: self.block.remove_var(var) - - def _modify_feed_fetch(self): - ''' - Modify feed fetch op/vars for float16 inference. - - For each feed op: - feed_op->feed_target_var - - Change it to: - feed_op->feed_target_var->cast_op(from other dtype to float16)->tmp_var - - For each fetch op: - fetch_target_var->fetch_op - - Change it to: - tmp_var->cast_op(from float16 to other dtype)->fetch_target_var->fetch_op - - :return: None - ''' - - def find_op(var): - # It is possible that var.op is not up to date after some - # modifications to program desc. Here we force to make it up to date. - var.op = None - for op in self.block.ops: - if var.name in op.output_arg_names: - var.op = op - break - - if var.op is None: - raise ValueError("The target variable must have an " - "associated operator that generates it.") - - i = 0 - while i < len(self.block.ops): - cur_op = self.block.ops[i] - if cur_op.type == "feed": - var_name = cur_op.output("Out")[0] - tmp_var_name = var_name + ".fp16" - var = self.block.vars[var_name] - tmp_var = self.block.create_var( - name=tmp_var_name.encode('ascii'), - type=var.type, - dtype=core.VarDesc.VarType.FP16, - shape=var.shape, - persistable=var.persistable) - self.block.insert_op( - i + 1, - type="cast", - inputs={"X": var}, - outputs={"Out": tmp_var}, - attrs={ - 'in_dtype': int(var.dtype), - 'out_dtype': int(tmp_var.dtype) - }) - self.input_map[var_name] = tmp_var_name - i = i + 1 - elif cur_op.type == "fetch": - var_name = cur_op.input("X")[0] - tmp_var_name = var_name + ".fp16" - var = self.block.vars[var_name] - tmp_var = self.block.create_var( - name=tmp_var_name.encode('ascii'), - type=var.type, - dtype=core.VarDesc.VarType.FP16, - shape=var.shape, - persistable=var.persistable) - find_op(var) - var.op.rename_output(var_name, tmp_var_name) - self.block.insert_op( - i, - type="cast", - inputs={"X": tmp_var}, - outputs={"Out": var}, - attrs={ - 'in_dtype': int(tmp_var.dtype), - 'out_dtype': int(var.dtype) - }) - i = i + 1 - i = i + 1 - - def _convert_param_to_float16(self): - def _get_no_fp16_conversion_var_names(): - ''' - Get the set of input variable names that shouldn't be converted to float16. - - When we want to run inference in float16 mode, most parameters need to be - firstly converted to float16. However, there are some parameters that - shouldn't be converted to float16 because the corresponding operator - requires float32 parameters even in float16 mode (when the input data is - of float16 data type). Currently, the only operator that has this exclusion - is the batch norm op. 
- - :return: set of input variable names - :type var_names: set - ''' - op_names = {'batch_norm'} - var_names = [] - for op in self.block.ops: - if op.type in op_names: - var_names += op.input_arg_names - return set(var_names) - - def _should_be_converted(var): - return var.persistable and \ - var.name not in self.no_conversion_vars and \ - var.type != core.VarDesc.VarType.FEED_MINIBATCH and \ - var.type != core.VarDesc.VarType.FETCH_LIST - - self.no_conversion_vars = _get_no_fp16_conversion_var_names() - conversion_var_list = filter(_should_be_converted, - self.block.vars.values()) - for var in conversion_var_list: - fp16_var_name = var.name + ".fp16" - fp16_var = self.block.create_parameter( - name=fp16_var_name.encode('ascii'), - type=var.type, - dtype=core.VarDesc.VarType.FP16, - shape=var.shape) - - # cast the data in the tensor of the original var to float16 - # data type and store it in the tensor of the new float16 var - self.scope.var(fp16_var_name) - fp16_tensor = self.scope.find_var(fp16_var_name).get_tensor() - tensor = np.array(self.scope.find_var(var.name).get_tensor()) - # After the old tensor data is converted to np.float16, view(np.uint16) - # is used so that the internal memory of the numpy array will be - # reinterpreted to be of np.uint16 data type, which is binded to fluid - # float16 data type via the help of pybind in tensor_py.h. - fp16_tensor.set( - tensor.astype(np.float16).view(np.uint16), self.place) - - # old var will be replaced by the fp16 var in program desc - self.input_map[var.name] = fp16_var_name - self.block.remove_var(var.name) diff --git a/python/paddle/fluid/tests/book/test_image_classification.py b/python/paddle/fluid/tests/book/test_image_classification.py index 8ff4f6d47a..dbcdb5766e 100644 --- a/python/paddle/fluid/tests/book/test_image_classification.py +++ b/python/paddle/fluid/tests/book/test_image_classification.py @@ -247,26 +247,6 @@ def infer(use_cuda, save_dirname=None): fetch_targets, exe, inference_transpiler_program) - if use_cuda and fluid.core.is_float16_supported(place): - # Use float16_transpiler to speedup - fp16_transpiler_program = inference_transpiler_program.clone() - t.float16_transpile(fp16_transpiler_program, place) - - fp16_results = exe.run(fp16_transpiler_program, - feed={feed_target_names[0]: tensor_img}, - fetch_list=fetch_targets) - - assert len(results[0]) == len(fp16_results[0]) - for i in range(len(results[0])): - np.testing.assert_almost_equal( - results[0][i], fp16_results[0][i], decimal=2) - - print("float16 infer results: ", fp16_results[0]) - - fluid.io.save_inference_model("float16_" + save_dirname, - feed_target_names, fetch_targets, exe, - fp16_transpiler_program) - def main(net_type, use_cuda, is_local=True): if use_cuda and not fluid.core.is_compiled_with_cuda(): From 8cc91bc02552c0ef5d5773463d4566e5a6e3d3db Mon Sep 17 00:00:00 2001 From: Kexin Zhao Date: Thu, 3 May 2018 17:41:14 -0700 Subject: [PATCH 47/52] initial commit (#10387) --- python/paddle/fluid/layer_helper.py | 4 ++-- python/paddle/fluid/layers/nn.py | 1 + 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/python/paddle/fluid/layer_helper.py b/python/paddle/fluid/layer_helper.py index 62933b5125..86efd1ff51 100644 --- a/python/paddle/fluid/layer_helper.py +++ b/python/paddle/fluid/layer_helper.py @@ -400,11 +400,11 @@ class LayerHelper(object): if isinstance(act, basestring): act = {'type': act} + if 'use_cudnn' in self.kwargs and self.kwargs.get('use_cudnn'): + act['use_cudnn'] = self.kwargs.get('use_cudnn') if 'use_mkldnn' in 
self.kwargs: act['use_mkldnn'] = self.kwargs.get('use_mkldnn') act_type = act.pop('type') - if 'use_mkldnn' in self.kwargs: - act['use_mkldnn'] = self.kwargs.get('use_mkldnn') tmp = input_var # NOTE(dzhwinter): some activation support inplace compution. if not core.IsInplace(act_type): diff --git a/python/paddle/fluid/layers/nn.py b/python/paddle/fluid/layers/nn.py index 93e8d0bf29..1786be22fd 100644 --- a/python/paddle/fluid/layers/nn.py +++ b/python/paddle/fluid/layers/nn.py @@ -88,6 +88,7 @@ def fc(input, num_flatten_dims=1, param_attr=None, bias_attr=None, + use_cudnn=False, use_mkldnn=False, act=None, is_test=False, From ccc594e4c41c5687b9cfb8a6e4922a3ffe13d982 Mon Sep 17 00:00:00 2001 From: Kexin Zhao Date: Thu, 3 May 2018 20:10:34 -0700 Subject: [PATCH 48/52] need to copy LoD info (#10392) --- paddle/fluid/operators/save_op.cc | 2 ++ 1 file changed, 2 insertions(+) diff --git a/paddle/fluid/operators/save_op.cc b/paddle/fluid/operators/save_op.cc index f45d07ed90..dcc1b9ec20 100644 --- a/paddle/fluid/operators/save_op.cc +++ b/paddle/fluid/operators/save_op.cc @@ -106,6 +106,8 @@ class SaveOp : public framework::OperatorBase { auto out_kernel_type = framework::OpKernelType(out_dtype, place); framework::LoDTensor out; framework::TransDataType(in_kernel_type, out_kernel_type, tensor, &out); + // copy LoD info to the new tensor + out.set_lod(tensor.lod()); framework::SerializeToStream(fout, out, dev_ctx); } else { framework::SerializeToStream(fout, tensor, dev_ctx); From 20fa8480769362776f3b1df0853206cfc7a00483 Mon Sep 17 00:00:00 2001 From: Lei Wang Date: Thu, 3 May 2018 14:53:12 -0700 Subject: [PATCH 49/52] Travis: using ccache for docker build. --- .travis.yml | 22 +--------------------- paddle/scripts/paddle_docker_build.sh | 4 ++++ 2 files changed, 5 insertions(+), 21 deletions(-) diff --git a/.travis.yml b/.travis.yml index fe4eb2d157..3391e2c3ca 100644 --- a/.travis.yml +++ b/.travis.yml @@ -16,34 +16,14 @@ env: - JOB=check_style - JOB=build_android addons: - apt: - packages: - - gcc-4.8 - - g++-4.8 - - git - - build-essential - - python - - python-pip - - python2.7-dev - - python-wheel - - libboost-dev - - curl - - swig - - graphviz - - clang-format-3.8 - - automake - - libtool - - ccache ssh_known_hosts: 13.229.163.131 before_install: - - sudo pip install -r $TRAVIS_BUILD_DIR/python/requirements.txt - - sudo pip install wheel sphinx==1.5.6 recommonmark sphinx-rtd-theme==0.1.9 virtualenv pre-commit - | function timeout() { perl -e 'alarm shift; exec @ARGV' "$@"; } script: - | # 43min timeout - if [[ "$JOB" != "doc" ]]; then timeout 2580 paddle/scripts/paddle_docker_build.sh ${JOB}; else paddle/scripts/paddle_build.sh ${JOB}; fi; + paddle/scripts/paddle_docker_build.sh ${JOB} if [ $? -eq 0 ] || [ $? -eq 142 ]; then true; else exit 1; fi; - | if [[ "$JOB" != "doc" ]]; then exit 0; fi; diff --git a/paddle/scripts/paddle_docker_build.sh b/paddle/scripts/paddle_docker_build.sh index 311eb57601..ddae9f19a2 100755 --- a/paddle/scripts/paddle_docker_build.sh +++ b/paddle/scripts/paddle_docker_build.sh @@ -56,11 +56,15 @@ EOL if ! [ -x "$(command -v ${DOCKER_CMD})" ]; then DOCKER_CMD="docker" fi + if [ ! 
-d "${HOME}/.ccache" ]; then + mkdir ${HOME}/.ccache + fi set -x ${DOCKER_CMD} run -it \ --name $CONTAINER_ID \ ${DOCKER_ENV} \ -v $PADDLE_ROOT:/paddle \ + -v ${HOME}/.ccache:/root/.ccache \ -w /paddle \ $IMG \ paddle/scripts/paddle_build.sh $@ From c9f55dfafc4f0b9706d1227cd80221b60d125df7 Mon Sep 17 00:00:00 2001 From: Abhinav Arora Date: Thu, 3 May 2018 22:59:40 -0700 Subject: [PATCH 50/52] Fix CPPLint issues in /math/detail/gru_kernel.h (#10390) * Fix CPPLint issyes in gru_kernel.h * Fix CPPLint issyes in gru_kernel.h * Fix Compile error --- .../operators/math/detail/gru_cpu_kernel.h | 40 ++--- .../operators/math/detail/gru_gpu_kernel.h | 20 +-- .../fluid/operators/math/detail/gru_kernel.h | 139 +++++++++--------- 3 files changed, 101 insertions(+), 98 deletions(-) diff --git a/paddle/fluid/operators/math/detail/gru_cpu_kernel.h b/paddle/fluid/operators/math/detail/gru_cpu_kernel.h index 26e6adafdf..b6f4ab9377 100644 --- a/paddle/fluid/operators/math/detail/gru_cpu_kernel.h +++ b/paddle/fluid/operators/math/detail/gru_cpu_kernel.h @@ -43,8 +43,8 @@ void hl_naive_gru_forward_reset_output(OpResetOutput op_reset_output, r_prev_out = prev_output_value[i]; } - op_reset_output(r_value_update_gate, r_value_reset_gate, r_prev_out, - r_value_reset_output, active_gate); + op_reset_output(&r_value_update_gate, &r_value_reset_gate, &r_prev_out, + &r_value_reset_output, active_gate); update_gate[i] = r_value_update_gate; reset_gate[i] = r_value_reset_gate; @@ -71,8 +71,8 @@ void hl_naive_gru_forward_final_output(OpFinalOutput op_final_output, r_prev_out = prev_output_value[i]; } - op_final_output(r_value_update_gate, r_value_frame_state, r_prev_out, - r_output, active_node); + op_final_output(&r_value_update_gate, &r_value_frame_state, &r_prev_out, + &r_output, active_node); frame_state[i] = r_value_frame_state; output_value[i] = r_output; @@ -99,8 +99,8 @@ void hl_avx_gru_forward_reset_output(OpResetOutput op_reset_output, r_prev_out = (reinterpret_cast<__m256 *>(prev_output_value))[i]; } - op_reset_output(r_value_update_gate, r_value_reset_gate, r_prev_out, - r_value_reset_output, active_gate); + op_reset_output(&r_value_update_gate, &r_value_reset_gate, &r_prev_out, + &r_value_reset_output, active_gate); update_gate[i] = r_value_update_gate; reset_gate[i] = r_value_reset_gate; @@ -129,8 +129,8 @@ void hl_avx_gru_forward_final_output(OpFinalOutput op_final_output, r_prev_out = (reinterpret_cast<__m256 *>(prev_output_value))[i]; } - op_final_output(r_value_update_gate, r_value_frame_state, r_prev_out, - r_output, active_node); + op_final_output(&r_value_update_gate, &r_value_frame_state, &r_prev_out, + &r_output, active_node); frame_state[i] = r_value_frame_state; (reinterpret_cast<__m256 *>(output_value))[i] = r_output; @@ -213,9 +213,9 @@ void hl_naive_gru_backward_state_grad(OpStateGrad op_state_grad, T *gate_value, r_prev_out_grad = prev_out_grad[i]; } - op_state_grad(r_update_gate_value, r_update_gate_grad, r_frame_state_value, - r_frame_state_grad, r_prev_out_value, r_prev_out_grad, - r_out_grad, active_node); + op_state_grad(&r_update_gate_value, &r_update_gate_grad, + &r_frame_state_value, &r_frame_state_grad, &r_prev_out_value, + &r_prev_out_grad, &r_out_grad, active_node); update_gate_grad[i] = r_update_gate_grad; frame_state_grad[i] = r_frame_state_grad; @@ -258,9 +258,9 @@ void hl_naive_gru_backward_reset_grad(OpResetGrad op_reset_grad, T *gate_value, r_prev_out_grad = prev_out_grad[i]; } - op_reset_grad(r_update_gate_value, r_update_gate_grad, r_reset_gate_value, - r_reset_gate_grad, 
r_prev_out_value, r_prev_out_grad, - r_reset_output_grad, active_gate); + op_reset_grad(&r_update_gate_value, &r_update_gate_grad, + &r_reset_gate_value, &r_reset_gate_grad, &r_prev_out_value, + &r_prev_out_grad, &r_reset_output_grad, active_gate); update_gate_grad[i] = r_update_gate_grad; reset_gate_grad[i] = r_reset_gate_grad; @@ -302,9 +302,9 @@ void hl_avx_gru_backward_state_grad(OpStateGrad op_state_grad, T *gate_value, r_prev_out_grad = (reinterpret_cast<__m256 *>(prev_out_grad))[i]; } - op_state_grad(r_update_gate_value, r_update_gate_grad, r_frame_state_value, - r_frame_state_grad, r_prev_out_value, r_prev_out_grad, - r_out_grad, active_node); + op_state_grad(&r_update_gate_value, &r_update_gate_grad, + &r_frame_state_value, &r_frame_state_grad, &r_prev_out_value, + &r_prev_out_grad, &r_out_grad, active_node); update_gate_grad[i] = r_update_gate_grad; frame_state_grad[i] = r_frame_state_grad; @@ -350,9 +350,9 @@ void hl_avx_gru_backward_reset_grad(OpResetGrad op_reset_grad, T *gate_value, r_prev_out_grad = (reinterpret_cast<__m256 *>(prev_out_grad))[i]; } - op_reset_grad(r_update_gate_value, r_update_gate_grad, r_reset_gate_value, - r_reset_gate_grad, r_prev_out_value, r_prev_out_grad, - r_reset_output_grad, active_gate); + op_reset_grad(&r_update_gate_value, &r_update_gate_grad, + &r_reset_gate_value, &r_reset_gate_grad, &r_prev_out_value, + &r_prev_out_grad, &r_reset_output_grad, active_gate); update_gate_grad[i] = r_update_gate_grad; reset_gate_grad[i] = r_reset_gate_grad; diff --git a/paddle/fluid/operators/math/detail/gru_gpu_kernel.h b/paddle/fluid/operators/math/detail/gru_gpu_kernel.h index da25a7d213..813d69f6ab 100644 --- a/paddle/fluid/operators/math/detail/gru_gpu_kernel.h +++ b/paddle/fluid/operators/math/detail/gru_gpu_kernel.h @@ -55,8 +55,8 @@ __global__ void KeGruForwardResetOutput(OpResetOutput op_reset_output, r_prev_out = prev_output_value[frame_idx]; } - op_reset_output(r_value_update_gate, r_value_reset_gate, r_prev_out, - r_value_reset_output, active_gate); + op_reset_output(&r_value_update_gate, &r_value_reset_gate, &r_prev_out, + &r_value_reset_output, active_gate); gate_value[frame_idx + frame_size * 0] = r_value_update_gate; gate_value[frame_idx + frame_size * 1] = r_value_reset_gate; @@ -93,8 +93,8 @@ __global__ void KeGruForwardFinalOutput(OpFinalOutput op_final_output, r_prev_out = prev_output_value[frame_idx]; } - op_final_output(r_value_update_gate, r_value_frame_state, r_prev_out, - r_output, active_node); + op_final_output(&r_value_update_gate, &r_value_frame_state, &r_prev_out, + &r_output, active_node); gate_value[frame_idx + frame_size * 2] = r_value_frame_state; output_value[frame_idx] = r_output; @@ -137,9 +137,9 @@ __global__ void KeGruBackwardStateGrad(OpStateGrad op_state_grad, T *gate_value, r_prev_out_grad = prev_out_grad[frame_idx]; } - op_state_grad(r_update_gate_value, r_update_gate_grad, r_frame_state_value, - r_frame_state_grad, r_prev_out_value, r_prev_out_grad, - r_out_grad, active_node); + op_state_grad(&r_update_gate_value, &r_update_gate_grad, &r_frame_state_value, + &r_frame_state_grad, &r_prev_out_value, &r_prev_out_grad, + &r_out_grad, active_node); gate_grad[frame_idx + frame_size * 0] = r_update_gate_grad; gate_grad[frame_idx + frame_size * 2] = r_frame_state_grad; @@ -185,9 +185,9 @@ __global__ void KeGruBackwardResetGrad(OpResetGrad op_reset_grad, T *gate_value, r_reset_output_grad = reset_output_grad[frame_idx]; } - op_reset_grad(r_update_gate_value, r_update_gate_grad, r_reset_gate_value, - r_reset_gate_grad, 
r_prev_out_value, r_prev_out_grad, - r_reset_output_grad, active_gate); + op_reset_grad(&r_update_gate_value, &r_update_gate_grad, &r_reset_gate_value, + &r_reset_gate_grad, &r_prev_out_value, &r_prev_out_grad, + &r_reset_output_grad, active_gate); gate_grad[frame_idx + frame_size * 0] = r_update_gate_grad; gate_grad[frame_idx + frame_size * 1] = r_reset_gate_grad; diff --git a/paddle/fluid/operators/math/detail/gru_kernel.h b/paddle/fluid/operators/math/detail/gru_kernel.h index 991f2e758c..f6d192358b 100644 --- a/paddle/fluid/operators/math/detail/gru_kernel.h +++ b/paddle/fluid/operators/math/detail/gru_kernel.h @@ -12,11 +12,11 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ +#pragma once +#include #include "paddle/fluid/operators/math/detail/activation_functions.h" #include "paddle/fluid/platform/hostdevice.h" -#include - // TODO(guosheng): refine code style in gru_kernel namespace paddle { namespace operators { @@ -28,25 +28,25 @@ namespace forward { template class gru_resetOutput { public: - HOSTDEVICE void operator()(T &value_update_gate, T &value_reset_gate, - T &prev_out, T &value_reset_output, + HOSTDEVICE void operator()(T *value_update_gate, T *value_reset_gate, + T *prev_out, T *value_reset_output, ActivationType act_gate) { - value_update_gate = activation(value_update_gate, act_gate); - value_reset_gate = activation(value_reset_gate, act_gate); - value_reset_output = prev_out * value_reset_gate; + *value_update_gate = activation(*value_update_gate, act_gate); + *value_reset_gate = activation(*value_reset_gate, act_gate); + *value_reset_output = (*prev_out) * (*value_reset_gate); } #ifndef __NVCC__ #ifndef __AVX__ static const bool avx = false; #else static const bool avx = true; - HOSTDEVICE void operator()(__m256 &value_update_gate, - __m256 &value_reset_gate, __m256 &prev_out, - __m256 &value_reset_output, + HOSTDEVICE void operator()(__m256 *value_update_gate, + __m256 *value_reset_gate, __m256 *prev_out, + __m256 *value_reset_output, ActivationType act_gate) { - value_update_gate = activation(value_update_gate, act_gate); - value_reset_gate = activation(value_reset_gate, act_gate); - value_reset_output = _mm256_mul_ps(prev_out, value_reset_gate); + *value_update_gate = activation(*value_update_gate, act_gate); + *value_reset_gate = activation(*value_reset_gate, act_gate); + *value_reset_output = _mm256_mul_ps(*prev_out, *value_reset_gate); } #endif #endif @@ -55,25 +55,25 @@ class gru_resetOutput { template class gru_finalOutput { public: - HOSTDEVICE void operator()(T &value_update_gate, T &value_frame_state, - T &prev_out, T &value_output, + HOSTDEVICE void operator()(T *value_update_gate, T *value_frame_state, + T *prev_out, T *value_output, ActivationType act_input) { - value_frame_state = activation(value_frame_state, act_input); - value_output = prev_out - (value_update_gate * prev_out) + - (value_update_gate * value_frame_state); + *value_frame_state = activation(*value_frame_state, act_input); + *value_output = *prev_out - ((*value_update_gate) * (*prev_out)) + + ((*value_update_gate) * (*value_frame_state)); } #ifndef __NVCC__ #ifndef __AVX__ static const bool avx = false; #else static const bool avx = true; - HOSTDEVICE void operator()(__m256 &value_update_gate, - __m256 &value_frame_state, __m256 &prev_out, - __m256 &value_output, ActivationType act_input) { - value_frame_state = activation(value_frame_state, act_input); - 
value_output = _mm256_add_ps( - _mm256_sub_ps(prev_out, _mm256_mul_ps(value_update_gate, prev_out)), - _mm256_mul_ps(value_update_gate, value_frame_state)); + HOSTDEVICE void operator()(__m256 *value_update_gate, + __m256 *value_frame_state, __m256 *prev_out, + __m256 *value_output, ActivationType act_input) { + *value_frame_state = activation(*value_frame_state, act_input); + *value_output = _mm256_add_ps( + _mm256_sub_ps(*prev_out, _mm256_mul_ps(*value_update_gate, *prev_out)), + _mm256_mul_ps(*value_update_gate, *value_frame_state)); } #endif #endif @@ -85,37 +85,38 @@ namespace backward { template class gru_stateGrad { public: - HOSTDEVICE void operator()(T &value_update_gate, T &grad_update_gate, - T &value_frame_state, T &grad_frame_state, - T &value_prev_out, T &grad_prev_out, - T &grad_output, ActivationType act_input) { - grad_update_gate = (grad_output * value_frame_state); - grad_update_gate -= (grad_output * value_prev_out); - grad_prev_out -= (grad_output * value_update_gate); - grad_prev_out += grad_output; - grad_frame_state = activation(grad_output * value_update_gate, - value_frame_state, act_input); + HOSTDEVICE void operator()(T *value_update_gate, T *grad_update_gate, + T *value_frame_state, T *grad_frame_state, + T *value_prev_out, T *grad_prev_out, + T *grad_output, ActivationType act_input) { + *grad_update_gate = (*grad_output * (*value_frame_state)); + *grad_update_gate -= (*grad_output * (*value_prev_out)); + *grad_prev_out -= (*grad_output * (*value_update_gate)); + *grad_prev_out += *grad_output; + *grad_frame_state = activation(*grad_output * (*value_update_gate), + *value_frame_state, act_input); } #ifndef __NVCC__ #ifndef __AVX__ static const bool avx = false; #else static const bool avx = true; - HOSTDEVICE void operator()(__m256 &value_update_gate, - __m256 &grad_update_gate, - __m256 &value_frame_state, - __m256 &grad_frame_state, __m256 &value_prev_out, - __m256 &grad_prev_out, __m256 &grad_output, + HOSTDEVICE void operator()(__m256 *value_update_gate, + __m256 *grad_update_gate, + __m256 *value_frame_state, + __m256 *grad_frame_state, __m256 *value_prev_out, + __m256 *grad_prev_out, __m256 *grad_output, ActivationType act_input) { - grad_update_gate = _mm256_mul_ps(grad_output, value_frame_state); - grad_update_gate = _mm256_sub_ps( - grad_update_gate, _mm256_mul_ps(grad_output, value_prev_out)); - grad_prev_out = _mm256_add_ps( - _mm256_sub_ps(grad_prev_out, - _mm256_mul_ps(grad_output, value_update_gate)), - grad_output); - grad_frame_state = activation(_mm256_mul_ps(grad_output, value_update_gate), - value_frame_state, act_input); + *grad_update_gate = _mm256_mul_ps(*grad_output, *value_frame_state); + *grad_update_gate = _mm256_sub_ps( + *grad_update_gate, _mm256_mul_ps(*grad_output, *value_prev_out)); + *grad_prev_out = _mm256_add_ps( + _mm256_sub_ps(*grad_prev_out, + _mm256_mul_ps(*grad_output, *value_update_gate)), + *grad_output); + *grad_frame_state = + activation(_mm256_mul_ps(*grad_output, *value_update_gate), + *value_frame_state, act_input); } #endif #endif @@ -124,32 +125,34 @@ class gru_stateGrad { template class gru_resetGrad { public: - HOSTDEVICE void operator()(T &value_update_gate, T &grad_update_gate, - T &value_reset_gate, T &grad_reset_gate, - T &value_prev_out, T &grad_prev_out, - T &grad_reset_output, ActivationType act_gate) { - grad_reset_gate = (grad_reset_output * value_prev_out); - grad_prev_out += (grad_reset_output * value_reset_gate); - grad_update_gate = - activation(grad_update_gate, value_update_gate, act_gate); - 
grad_reset_gate = activation(grad_reset_gate, value_reset_gate, act_gate); + HOSTDEVICE void operator()(T *value_update_gate, T *grad_update_gate, + T *value_reset_gate, T *grad_reset_gate, + T *value_prev_out, T *grad_prev_out, + T *grad_reset_output, ActivationType act_gate) { + *grad_reset_gate = (*grad_reset_output * (*value_prev_out)); + *grad_prev_out += (*grad_reset_output * (*value_reset_gate)); + *grad_update_gate = + activation(*grad_update_gate, *value_update_gate, act_gate); + *grad_reset_gate = + activation(*grad_reset_gate, *value_reset_gate, act_gate); } #ifndef __NVCC__ #ifndef __AVX__ static const bool avx = false; #else static const bool avx = true; - HOSTDEVICE void operator()(__m256 &value_update_gate, - __m256 &grad_update_gate, __m256 &value_reset_gate, - __m256 &grad_reset_gate, __m256 &value_prev_out, - __m256 &grad_prev_out, __m256 &grad_reset_output, + HOSTDEVICE void operator()(__m256 *value_update_gate, + __m256 *grad_update_gate, __m256 *value_reset_gate, + __m256 *grad_reset_gate, __m256 *value_prev_out, + __m256 *grad_prev_out, __m256 *grad_reset_output, ActivationType act_gate) { - grad_reset_gate = _mm256_mul_ps(grad_reset_output, value_prev_out); - grad_prev_out = _mm256_add_ps( - grad_prev_out, _mm256_mul_ps(grad_reset_output, value_reset_gate)); - grad_update_gate = - activation(grad_update_gate, value_update_gate, act_gate); - grad_reset_gate = activation(grad_reset_gate, value_reset_gate, act_gate); + *grad_reset_gate = _mm256_mul_ps(*grad_reset_output, *value_prev_out); + *grad_prev_out = _mm256_add_ps( + *grad_prev_out, _mm256_mul_ps(*grad_reset_output, *value_reset_gate)); + *grad_update_gate = + activation(*grad_update_gate, *value_update_gate, act_gate); + *grad_reset_gate = + activation(*grad_reset_gate, *value_reset_gate, act_gate); } #endif #endif From ddf61672131f0243fb568d1e9b083d8bbe3a9794 Mon Sep 17 00:00:00 2001 From: Yi Wang Date: Thu, 3 May 2018 23:36:48 -0700 Subject: [PATCH 51/52] Correct filename (#10384) --- ...notest_rnn_encoder_decoer.py => notest_rnn_encoder_decoder.py} | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename python/paddle/fluid/tests/book/{notest_rnn_encoder_decoer.py => notest_rnn_encoder_decoder.py} (100%) diff --git a/python/paddle/fluid/tests/book/notest_rnn_encoder_decoer.py b/python/paddle/fluid/tests/book/notest_rnn_encoder_decoder.py similarity index 100% rename from python/paddle/fluid/tests/book/notest_rnn_encoder_decoer.py rename to python/paddle/fluid/tests/book/notest_rnn_encoder_decoder.py From 3bb99c4f6608f009a9d8a9a276876fb01986176b Mon Sep 17 00:00:00 2001 From: Qingsheng Li Date: Fri, 4 May 2018 14:37:58 +0800 Subject: [PATCH 52/52] Added auto transform to beam_search_decode_op (#10286) * Added auto transform to beam_search_decode_op * Added some comment * Added unittest for beam_search_decode_op on GPU --- .../fluid/operators/beam_search_decode_op.cc | 58 ++++++++++++++++--- .../unittests/test_beam_search_decode_op.py | 12 +++- 2 files changed, 60 insertions(+), 10 deletions(-) diff --git a/paddle/fluid/operators/beam_search_decode_op.cc b/paddle/fluid/operators/beam_search_decode_op.cc index 4a8dfd4b54..68fb988afd 100644 --- a/paddle/fluid/operators/beam_search_decode_op.cc +++ b/paddle/fluid/operators/beam_search_decode_op.cc @@ -23,16 +23,54 @@ struct BeamSearchDecodeFunctor { BeamSearchDecodeFunctor(const LoDTensorArray& step_ids, const LoDTensorArray& step_scores, LoDTensor* id_tensor, LoDTensor* score_tensor) - : step_ids_(step_ids), - step_scores_(step_scores), + : 
step_ids_origin_(step_ids), + step_scores_origin_(step_scores), id_tensor_(id_tensor), - score_tensor_(score_tensor) {} + score_tensor_(score_tensor) { + tensor_on_gpu_ = false; + // First make a copy of GPU data on CPU + if (platform::is_gpu_place(step_ids_origin_[0].place())) { + tensor_on_gpu_ = true; + platform::DeviceContextPool& pool = + platform::DeviceContextPool::Instance(); + auto* dev_ctx = pool.Get(step_ids_origin_[0].place()); + // Copy all tensors in the input tensor array + for (auto& step_id : step_ids_origin_) { + framework::LoDTensor out; + dev_ctx->Wait(); + framework::TensorCopy(step_id, platform::CPUPlace(), *dev_ctx, &out); + dev_ctx->Wait(); + + out.set_lod(step_id.lod()); + step_ids_.push_back(out); + } + } + if (platform::is_gpu_place(step_scores_origin_[0].place())) { + tensor_on_gpu_ = true; + platform::DeviceContextPool& pool = + platform::DeviceContextPool::Instance(); + auto* dev_ctx = pool.Get(step_scores_origin_[0].place()); + // Copy all tensors in the input tensor array + for (auto& step_score : step_scores_origin_) { + framework::LoDTensor out; + dev_ctx->Wait(); + framework::TensorCopy(step_score, platform::CPUPlace(), *dev_ctx, &out); + dev_ctx->Wait(); + + out.set_lod(step_score.lod()); + step_scores_.push_back(out); + } + } + } template void operator()() const; - const LoDTensorArray& step_ids_; - const LoDTensorArray& step_scores_; + bool tensor_on_gpu_; + const LoDTensorArray& step_ids_origin_; + const LoDTensorArray& step_scores_origin_; + LoDTensorArray step_ids_ = LoDTensorArray(); + LoDTensorArray step_scores_ = LoDTensorArray(); LoDTensor* id_tensor_; LoDTensor* score_tensor_; }; @@ -40,8 +78,14 @@ struct BeamSearchDecodeFunctor { template void BeamSearchDecodeFunctor::operator()() const { BeamSearchDecoder beam_search_decoder; - beam_search_decoder.PackAllSteps(step_ids_, step_scores_, id_tensor_, - score_tensor_); + // Check if the tensor is on GPU. 
If so, use the CPU copy instead + if (tensor_on_gpu_) { + beam_search_decoder.PackAllSteps(step_ids_, step_scores_, id_tensor_, + score_tensor_); + } else { + beam_search_decoder.PackAllSteps(step_ids_origin_, step_scores_origin_, + id_tensor_, score_tensor_); + } } template <> diff --git a/python/paddle/fluid/tests/unittests/test_beam_search_decode_op.py b/python/paddle/fluid/tests/unittests/test_beam_search_decode_op.py index 4ee00605e2..7976dd7c3f 100644 --- a/python/paddle/fluid/tests/unittests/test_beam_search_decode_op.py +++ b/python/paddle/fluid/tests/unittests/test_beam_search_decode_op.py @@ -22,12 +22,12 @@ from paddle.fluid.op import Operator class TestBeamSearchDecodeOp(unittest.TestCase): def setUp(self): self.scope = core.Scope() - self.cpu_place = core.CPUPlace() + self.place = core.CPUPlace() def append_lod_tensor(self, tensor_array, lod, data): lod_tensor = core.LoDTensor() lod_tensor.set_lod(lod) - lod_tensor.set(data, self.cpu_place) + lod_tensor.set(data, self.place) tensor_array.append(lod_tensor) def test_get_set(self): @@ -71,7 +71,7 @@ class TestBeamSearchDecodeOp(unittest.TestCase): SentenceIds="sentence_ids", SentenceScores="sentence_scores") - beam_search_decode_op.run(self.scope, self.cpu_place) + beam_search_decode_op.run(self.scope, self.place) expected_lod = [[0, 4, 8], [0, 1, 3, 6, 9, 10, 13, 16, 19]] self.assertEqual(sentence_ids.lod(), expected_lod) @@ -84,5 +84,11 @@ class TestBeamSearchDecodeOp(unittest.TestCase): np.array_equal(np.array(sentence_scores), expected_data)) +class TestBeamSearchDecodeOpGPU(TestBeamSearchDecodeOp): + def setUp(self): + self.scope = core.Scope() + self.place = core.CUDAPlace(0) + + if __name__ == '__main__': unittest.main()
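As a usage note, the GPU variant of the beam search decode test added above can be run on its own; a minimal sketch, assuming a CUDA-enabled build of Paddle and that the test file is importable from the current directory:

```python
import unittest

from test_beam_search_decode_op import TestBeamSearchDecodeOpGPU

# Load and run only the GPU variant, which exercises the automatic
# GPU-to-CPU tensor copy added in this patch
suite = unittest.defaultTestLoader.loadTestsFromTestCase(TestBeamSearchDecodeOpGPU)
unittest.TextTestRunner(verbosity=2).run(suite)
```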