From 188581801682fd799698f6f170ce1d4b4951ccba Mon Sep 17 00:00:00 2001
From: Liu Yiqun <liuyiqun01@baidu.com>
Date: Wed, 21 Mar 2018 02:41:26 +0000
Subject: [PATCH 01/14] Add multi-thread inference example.

---
 .../tests/book/test_inference_fit_a_line.cc   | 66 ++++++++++++++++
 .../tests/test_multi_thread_helper.h          | 78 +++++++++++++++++++
 2 files changed, 144 insertions(+)
 create mode 100644 paddle/fluid/inference/tests/test_multi_thread_helper.h
diff --git a/paddle/fluid/inference/tests/book/test_inference_fit_a_line.cc b/paddle/fluid/inference/tests/book/test_inference_fit_a_line.cc
index 9ab808efec..e8224be2d4 100644
--- a/paddle/fluid/inference/tests/book/test_inference_fit_a_line.cc
+++ b/paddle/fluid/inference/tests/book/test_inference_fit_a_line.cc
@@ -12,6 +12,7 @@ limitations under the License. */
 #include <gtest/gtest.h>
 #include "gflags/gflags.h"
 #include "paddle/fluid/inference/tests/test_helper.h"
+#include "paddle/fluid/inference/tests/test_multi_thread_helper.h"
 
 DEFINE_string(dirname, "", "Directory of the inference model.");
 
@@ -40,6 +41,7 @@ TEST(inference, fit_a_line) {
   cpu_fetchs1.push_back(&output1);
 
   // Run inference on CPU
+  LOG(INFO) << "--- CPU Runs: ---";
   TestInference<paddle::platform::CPUPlace>(dirname, cpu_feeds, cpu_fetchs1);
   LOG(INFO) << output1.dims();
 
@@ -49,9 +51,73 @@ TEST(inference, fit_a_line) {
   cpu_fetchs2.push_back(&output2);
 
   // Run inference on CUDA GPU
+  LOG(INFO) << "--- CPU Runs: ---";
   TestInference<paddle::platform::CUDAPlace>(dirname, cpu_feeds, cpu_fetchs2);
   LOG(INFO) << output2.dims();
 
   CheckError<float>(output1, output2);
 #endif
 }
+
+TEST(multi_thread_inference, fit_a_line) {
+  if (FLAGS_dirname.empty()) {
+    LOG(FATAL) << "Usage: ./example --dirname=path/to/your/model";
+  }
+
+  LOG(INFO) << "FLAGS_dirname: " << FLAGS_dirname << std::endl;
+  std::string dirname = FLAGS_dirname;
+
+  // 0. Call `paddle::framework::InitDevices()` initialize all the devices
+  // In unittests, this is done in paddle/testing/paddle_gtest_main.cc
+
+  int num_threads = 2;
+
+  std::vector<std::vector<paddle::framework::LoDTensor*>> cpu_feeds;
+  cpu_feeds.resize(num_threads);
+  for (int i = 0; i < num_threads; ++i) {
+    auto* input = new paddle::framework::LoDTensor();
+    // The second dim of the input tensor should be 13
+    // The input data should be >= 0
+    int64_t batch_size = 10;
+    SetupTensor<float>(*input,
+                       {batch_size, 13},
+                       static_cast<float>(0),
+                       static_cast<float>(10));
+    cpu_feeds[i].push_back(input);
+  }
+
+  std::vector<std::vector<paddle::framework::LoDTensor*>> cpu_fetchs1;
+  cpu_fetchs1.resize(num_threads);
+  for (int i = 0; i < num_threads; ++i) {
+    auto* output = new paddle::framework::LoDTensor();
+    cpu_fetchs1[i].push_back(output);
+  }
+
+  // Run inference on CPU
+  LOG(INFO) << "--- CPU Runs (Multi Thread): ---";
+  TestMultiThreadInference<paddle::platform::CPUPlace>(
+      dirname, cpu_feeds, cpu_fetchs1, num_threads);
+
+#ifdef PADDLE_WITH_CUDA
+  std::vector<std::vector<paddle::framework::LoDTensor*>> cpu_fetchs2;
+  cpu_fetchs2.resize(num_threads);
+  for (int i = 0; i < num_threads; ++i) {
+    auto* output = new paddle::framework::LoDTensor();
+    cpu_fetchs2[i].push_back(output);
+  }
+
+  // Run inference on CUDA GPU
+  LOG(INFO) << "--- GPU Runs (Multi Thread): ---";
+  TestMultiThreadInference<paddle::platform::CUDAPlace>(
+      dirname, cpu_feeds, cpu_fetchs2, num_threads);
+
+  for (int i = 0; i < num_threads; ++i) {
+    delete cpu_fetchs2[i][0];
+  }
+#endif
+
+  for (int i = 0; i < num_threads; ++i) {
+    delete cpu_feeds[i][0];
+    delete cpu_fetchs1[i][0];
+  }
+}
diff --git a/paddle/fluid/inference/tests/test_multi_thread_helper.h b/paddle/fluid/inference/tests/test_multi_thread_helper.h
new file mode 100644
index 0000000000..54e203833b
--- /dev/null
+++ b/paddle/fluid/inference/tests/test_multi_thread_helper.h
@@ -0,0 +1,78 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include <thread>
+#include "paddle/fluid/framework/lod_tensor.h"
+#include "paddle/fluid/inference/io.h"
+
+void ThreadedRunInference(
+    std::unique_ptr<paddle::framework::ProgramDesc>& inference_program,
+    paddle::framework::Executor& executor,
+    paddle::framework::Scope* scope,
+    const std::vector<paddle::framework::LoDTensor*>& cpu_feeds,
+    std::vector<paddle::framework::LoDTensor*>& cpu_fetchs) {
+  // 3. Get the feed_target_names and fetch_target_names
+  const std::vector<std::string>& feed_target_names =
+      inference_program->GetFeedTargetNames();
+  const std::vector<std::string>& fetch_target_names =
+      inference_program->GetFetchTargetNames();
+
+  // 4. Prepare inputs: set up maps for feed targets
+  std::map<std::string, const paddle::framework::LoDTensor*> feed_targets;
+  for (size_t i = 0; i < feed_target_names.size(); ++i) {
+    // Please make sure that cpu_feeds[i] is right for feed_target_names[i]
+    feed_targets[feed_target_names[i]] = cpu_feeds[i];
+  }
+
+  // 5. Define Tensor to get the outputs: set up maps for fetch targets
+  std::map<std::string, paddle::framework::LoDTensor*> fetch_targets;
+  for (size_t i = 0; i < fetch_target_names.size(); ++i) {
+    fetch_targets[fetch_target_names[i]] = cpu_fetchs[i];
+  }
+
+  // 6. Run the inference program
+  executor.Run(*inference_program, scope, feed_targets, fetch_targets);
+}
+
+template <typename Place>
+void TestMultiThreadInference(
+    const std::string& dirname,
+    const std::vector<std::vector<paddle::framework::LoDTensor*>>& cpu_feeds,
+    std::vector<std::vector<paddle::framework::LoDTensor*>>& cpu_fetchs,
+    const int num_threads) {
+  // 1. Define place, executor, scope
+  auto place = Place();
+  auto executor = paddle::framework::Executor(place);
+  auto* scope = new paddle::framework::Scope();
+
+  // 2. Initialize the inference_program and load parameters
+  std::unique_ptr<paddle::framework::ProgramDesc> inference_program =
+      paddle::inference::Load(executor, *scope, dirname);
+
+  std::vector<std::thread*> threads;
+  for (int i = 0; i < num_threads; ++i) {
+    threads.push_back(new std::thread(ThreadedRunInference,
+                                      std::ref(inference_program),
+                                      std::ref(executor),
+                                      scope,
+                                      std::ref(cpu_feeds[i]),
+                                      std::ref(cpu_fetchs[i])));
+  }
+  for (int i = 0; i < num_threads; ++i) {
+    threads[i]->join();
+    delete threads[i];
+  }
+
+  delete scope;
+}

From 9cba062252d12456a0025256180ee130f784fd8d Mon Sep 17 00:00:00 2001
From: Liu Yiqun <liuyiqun01@baidu.com>
Date: Wed, 4 Apr 2018 02:16:49 +0000
Subject: [PATCH 02/14] Add interface to change the feed/fetch_holder_name.

---
 paddle/fluid/framework/block_desc.h           |  3 ++
 paddle/fluid/framework/program_desc.cc        | 46 ++++++++++++++--
 paddle/fluid/framework/program_desc.h         |  4 ++
 .../test_inference_image_classification.cc    |  2 +-
 .../test_inference_label_semantic_roles.cc    | 16 +++---
 .../book/test_inference_recognize_digits.cc   |  2 +-
 .../book/test_inference_recommender_system.cc | 14 ++---
 .../test_inference_rnn_encoder_decoder.cc     |  4 +-
 .../test_inference_understand_sentiment.cc    |  2 +-
 .../tests/book/test_inference_word2vec.cc     |  8 +--
 paddle/fluid/inference/tests/test_helper.h    | 53 ++++++++++---------
 .../tests/test_multi_thread_helper.h          | 24 +++++++--
 12 files changed, 123 insertions(+), 55 deletions(-)

diff --git a/paddle/fluid/framework/block_desc.h b/paddle/fluid/framework/block_desc.h
index 468423e0e8..873969b2a8 100644
--- a/paddle/fluid/framework/block_desc.h
+++ b/paddle/fluid/framework/block_desc.h
@@ -17,6 +17,7 @@ limitations under the License. */
 #include <deque>
 #include <memory>
 #include <set>
+#include <string>
 #include <unordered_map>
 #include <vector>
 
@@ -96,6 +97,8 @@ class BlockDesc {
    */
   void RemoveOp(size_t s, size_t e);
 
+  void RemoveVar(const std::string &name) { vars_.erase(name); }
+
   std::vector<OpDesc *> AllOps() const;
 
   size_t OpSize() const { return ops_.size(); }
diff --git a/paddle/fluid/framework/program_desc.cc b/paddle/fluid/framework/program_desc.cc
index 049731c721..77d17fbbcc 100644
--- a/paddle/fluid/framework/program_desc.cc
+++ b/paddle/fluid/framework/program_desc.cc
@@ -85,9 +85,9 @@ ProgramDesc::ProgramDesc(const std::string &binary_str) {
 }
 
 const std::vector<std::string> ProgramDesc::GetFeedTargetNames() {
-  BlockDesc *global_block = blocks_[0].get();
+  auto &global_block = Block(0);
   std::vector<std::string> feed_target_names;
-  for (auto *op : global_block->AllOps()) {
+  for (auto *op : global_block.AllOps()) {
     if (op->Type() == kFeedOpType) {
       feed_target_names.insert(feed_target_names.begin(), op->Output("Out")[0]);
     }
@@ -96,9 +96,9 @@ const std::vector<std::string> ProgramDesc::GetFeedTargetNames() {
 }
 
 const std::vector<std::string> ProgramDesc::GetFetchTargetNames() {
-  BlockDesc *global_block = blocks_[0].get();
+  auto &global_block = Block(0);
   std::vector<std::string> fetch_target_names;
-  for (auto *op : global_block->AllOps()) {
+  for (auto *op : global_block.AllOps()) {
     if (op->Type() == kFetchOpType) {
       fetch_target_names.push_back(op->Input("X")[0]);
     }
@@ -106,5 +106,43 @@ const std::vector<std::string> ProgramDesc::GetFetchTargetNames() {
   return fetch_target_names;
 }
 
+void ProgramDesc::SetFeedHolderName(const std::string &feed_holder_name) {
+  auto *global_block = MutableBlock(0);
+  int index = 0;
+  for (auto *op : global_block->AllOps()) {
+    if (op->Type() == kFeedOpType) {
+      // Unify the input's name of all feed_ops to feed_holder_name
+      global_block->RemoveVar(op->Input("X")[0]);
+      op->SetInput("X", {feed_holder_name});
+      op->SetAttr("col", {index});
+      op->CheckAttrs();
+      index++;
+    }
+  }
+
+  auto *feed_holder = global_block->Var(feed_holder_name);
+  feed_holder->SetType(proto::VarType::FEED_MINIBATCH);
+  feed_holder->SetPersistable(true);
+}
+
+void ProgramDesc::SetFetchHolderName(const std::string &fetch_holder_name) {
+  auto *global_block = MutableBlock(0);
+  int index = 0;
+  for (auto *op : global_block->AllOps()) {
+    if (op->Type() == kFetchOpType) {
+      // Unify the output's name of all fetch_ops to fetch_holder_name
+      global_block->RemoveVar(op->Output("Out")[0]);
+      op->SetOutput("Out", {fetch_holder_name});
+      op->SetAttr("col", {index});
+      op->CheckAttrs();
+      index++;
+    }
+  }
+
+  auto *fetch_holder = global_block->Var(fetch_holder_name);
+  fetch_holder->SetType(proto::VarType::FETCH_LIST);
+  fetch_holder->SetPersistable(true);
+}
+
 }  // namespace framework
 }  // namespace paddle
diff --git a/paddle/fluid/framework/program_desc.h b/paddle/fluid/framework/program_desc.h
index 538a037211..fe29a4ae58 100644
--- a/paddle/fluid/framework/program_desc.h
+++ b/paddle/fluid/framework/program_desc.h
@@ -15,6 +15,7 @@ limitations under the License. */
 #pragma once
 
 #include <memory>
+#include <string>
 #include <vector>
 #include "paddle/fluid/framework/block_desc.h"
 #include "paddle/fluid/framework/framework.pb.h"
@@ -55,6 +56,9 @@ class ProgramDesc {
   const std::vector<std::string> GetFeedTargetNames();
   const std::vector<std::string> GetFetchTargetNames();
 
+  void SetFeedHolderName(const std::string &feed_holder_name);
+  void SetFetchHolderName(const std::string &fetch_holder_name);
+
  private:
   proto::ProgramDesc desc_;
 
diff --git a/paddle/fluid/inference/tests/book/test_inference_image_classification.cc b/paddle/fluid/inference/tests/book/test_inference_image_classification.cc
index e9a27171f1..76605ef80a 100644
--- a/paddle/fluid/inference/tests/book/test_inference_image_classification.cc
+++ b/paddle/fluid/inference/tests/book/test_inference_image_classification.cc
@@ -35,7 +35,7 @@ TEST(inference, image_classification) {
   paddle::framework::LoDTensor input;
   // Use normilized image pixels as input data,
   // which should be in the range [0.0, 1.0].
-  SetupTensor<float>(input,
+  SetupTensor<float>(&input,
                      {FLAGS_batch_size, 3, 32, 32},
                      static_cast<float>(0),
                      static_cast<float>(1));
diff --git a/paddle/fluid/inference/tests/book/test_inference_label_semantic_roles.cc b/paddle/fluid/inference/tests/book/test_inference_label_semantic_roles.cc
index 1849240166..6d2feb4ac4 100644
--- a/paddle/fluid/inference/tests/book/test_inference_label_semantic_roles.cc
+++ b/paddle/fluid/inference/tests/book/test_inference_label_semantic_roles.cc
@@ -36,35 +36,35 @@ TEST(inference, label_semantic_roles) {
   int64_t predicate_dict_len = 3162;
   int64_t mark_dict_len = 2;
 
-  SetupLoDTensor(word,
+  SetupLoDTensor(&word,
                  lod,
                  static_cast<int64_t>(0),
                  static_cast<int64_t>(word_dict_len - 1));
-  SetupLoDTensor(predicate,
+  SetupLoDTensor(&predicate,
                  lod,
                  static_cast<int64_t>(0),
                  static_cast<int64_t>(predicate_dict_len - 1));
-  SetupLoDTensor(ctx_n2,
+  SetupLoDTensor(&ctx_n2,
                  lod,
                  static_cast<int64_t>(0),
                  static_cast<int64_t>(word_dict_len - 1));
-  SetupLoDTensor(ctx_n1,
+  SetupLoDTensor(&ctx_n1,
                  lod,
                  static_cast<int64_t>(0),
                  static_cast<int64_t>(word_dict_len - 1));
-  SetupLoDTensor(ctx_0,
+  SetupLoDTensor(&ctx_0,
                  lod,
                  static_cast<int64_t>(0),
                  static_cast<int64_t>(word_dict_len - 1));
-  SetupLoDTensor(ctx_p1,
+  SetupLoDTensor(&ctx_p1,
                  lod,
                  static_cast<int64_t>(0),
                  static_cast<int64_t>(word_dict_len - 1));
-  SetupLoDTensor(ctx_p2,
+  SetupLoDTensor(&ctx_p2,
                  lod,
                  static_cast<int64_t>(0),
                  static_cast<int64_t>(word_dict_len - 1));
-  SetupLoDTensor(mark,
+  SetupLoDTensor(&mark,
                  lod,
                  static_cast<int64_t>(0),
                  static_cast<int64_t>(mark_dict_len - 1));
diff --git a/paddle/fluid/inference/tests/book/test_inference_recognize_digits.cc b/paddle/fluid/inference/tests/book/test_inference_recognize_digits.cc
index 1fb0f9e777..2f8775d2cb 100644
--- a/paddle/fluid/inference/tests/book/test_inference_recognize_digits.cc
+++ b/paddle/fluid/inference/tests/book/test_inference_recognize_digits.cc
@@ -35,7 +35,7 @@ TEST(inference, recognize_digits) {
   paddle::framework::LoDTensor input;
   // Use normilized image pixels as input data,
   // which should be in the range [-1.0, 1.0].
-  SetupTensor<float>(input,
+  SetupTensor<float>(&input,
                      {FLAGS_batch_size, 1, 28, 28},
                      static_cast<float>(-1),
                      static_cast<float>(1));
diff --git a/paddle/fluid/inference/tests/book/test_inference_recommender_system.cc b/paddle/fluid/inference/tests/book/test_inference_recommender_system.cc
index b42a33c9a9..5e538852dd 100644
--- a/paddle/fluid/inference/tests/book/test_inference_recommender_system.cc
+++ b/paddle/fluid/inference/tests/book/test_inference_recommender_system.cc
@@ -36,25 +36,25 @@ TEST(inference, recommender_system) {
 
   // Use the first data from paddle.dataset.movielens.test() as input
   std::vector<int64_t> user_id_data = {1};
-  SetupTensor<int64_t>(user_id, {batch_size, 1}, user_id_data);
+  SetupTensor<int64_t>(&user_id, {batch_size, 1}, user_id_data);
 
   std::vector<int64_t> gender_id_data = {1};
-  SetupTensor<int64_t>(gender_id, {batch_size, 1}, gender_id_data);
+  SetupTensor<int64_t>(&gender_id, {batch_size, 1}, gender_id_data);
 
   std::vector<int64_t> age_id_data = {0};
-  SetupTensor<int64_t>(age_id, {batch_size, 1}, age_id_data);
+  SetupTensor<int64_t>(&age_id, {batch_size, 1}, age_id_data);
 
   std::vector<int64_t> job_id_data = {10};
-  SetupTensor<int64_t>(job_id, {batch_size, 1}, job_id_data);
+  SetupTensor<int64_t>(&job_id, {batch_size, 1}, job_id_data);
 
   std::vector<int64_t> movie_id_data = {783};
-  SetupTensor<int64_t>(movie_id, {batch_size, 1}, movie_id_data);
+  SetupTensor<int64_t>(&movie_id, {batch_size, 1}, movie_id_data);
 
   std::vector<int64_t> category_id_data = {10, 8, 9};
-  SetupLoDTensor<int64_t>(category_id, {3, 1}, {{0, 3}}, category_id_data);
+  SetupLoDTensor<int64_t>(&category_id, {3, 1}, {{0, 3}}, category_id_data);
 
   std::vector<int64_t> movie_title_data = {1069, 4140, 2923, 710, 988};
-  SetupLoDTensor<int64_t>(movie_title, {5, 1}, {{0, 5}}, movie_title_data);
+  SetupLoDTensor<int64_t>(&movie_title, {5, 1}, {{0, 5}}, movie_title_data);
 
   std::vector<paddle::framework::LoDTensor*> cpu_feeds;
   cpu_feeds.push_back(&user_id);
diff --git a/paddle/fluid/inference/tests/book/test_inference_rnn_encoder_decoder.cc b/paddle/fluid/inference/tests/book/test_inference_rnn_encoder_decoder.cc
index a0523905bd..85672bb49c 100644
--- a/paddle/fluid/inference/tests/book/test_inference_rnn_encoder_decoder.cc
+++ b/paddle/fluid/inference/tests/book/test_inference_rnn_encoder_decoder.cc
@@ -33,9 +33,9 @@ TEST(inference, rnn_encoder_decoder) {
   paddle::framework::LoD lod{{0, 4, 10}};
 
   SetupLoDTensor(
-      word_data, lod, static_cast<int64_t>(0), static_cast<int64_t>(1));
+      &word_data, lod, static_cast<int64_t>(0), static_cast<int64_t>(1));
   SetupLoDTensor(
-      trg_word, lod, static_cast<int64_t>(0), static_cast<int64_t>(1));
+      &trg_word, lod, static_cast<int64_t>(0), static_cast<int64_t>(1));
 
   std::vector<paddle::framework::LoDTensor*> cpu_feeds;
   cpu_feeds.push_back(&word_data);
diff --git a/paddle/fluid/inference/tests/book/test_inference_understand_sentiment.cc b/paddle/fluid/inference/tests/book/test_inference_understand_sentiment.cc
index 824b3274eb..e615738468 100644
--- a/paddle/fluid/inference/tests/book/test_inference_understand_sentiment.cc
+++ b/paddle/fluid/inference/tests/book/test_inference_understand_sentiment.cc
@@ -33,7 +33,7 @@ TEST(inference, understand_sentiment) {
   paddle::framework::LoD lod{{0, 4, 10}};
   int64_t word_dict_len = 5147;
 
-  SetupLoDTensor(words,
+  SetupLoDTensor(&words,
                  lod,
                  static_cast<int64_t>(0),
                  static_cast<int64_t>(word_dict_len - 1));
diff --git a/paddle/fluid/inference/tests/book/test_inference_word2vec.cc b/paddle/fluid/inference/tests/book/test_inference_word2vec.cc
index 1481760c52..1178589fea 100644
--- a/paddle/fluid/inference/tests/book/test_inference_word2vec.cc
+++ b/paddle/fluid/inference/tests/book/test_inference_word2vec.cc
@@ -33,10 +33,10 @@ TEST(inference, word2vec) {
   paddle::framework::LoD lod{{0, 1}};
   int64_t dict_size = 2073;  // The size of dictionary
 
-  SetupLoDTensor(first_word, lod, static_cast<int64_t>(0), dict_size - 1);
-  SetupLoDTensor(second_word, lod, static_cast<int64_t>(0), dict_size - 1);
-  SetupLoDTensor(third_word, lod, static_cast<int64_t>(0), dict_size - 1);
-  SetupLoDTensor(fourth_word, lod, static_cast<int64_t>(0), dict_size - 1);
+  SetupLoDTensor(&first_word, lod, static_cast<int64_t>(0), dict_size - 1);
+  SetupLoDTensor(&second_word, lod, static_cast<int64_t>(0), dict_size - 1);
+  SetupLoDTensor(&third_word, lod, static_cast<int64_t>(0), dict_size - 1);
+  SetupLoDTensor(&fourth_word, lod, static_cast<int64_t>(0), dict_size - 1);
 
   std::vector<paddle::framework::LoDTensor*> cpu_feeds;
   cpu_feeds.push_back(&first_word);
diff --git a/paddle/fluid/inference/tests/test_helper.h b/paddle/fluid/inference/tests/test_helper.h
index dce541c097..95c526352c 100644
--- a/paddle/fluid/inference/tests/test_helper.h
+++ b/paddle/fluid/inference/tests/test_helper.h
@@ -12,58 +12,63 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
+#pragma once
+
 #include <time.h>
+#include <map>
+#include <string>
+#include <vector>
 #include "paddle/fluid/framework/lod_tensor.h"
 #include "paddle/fluid/inference/io.h"
 #include "paddle/fluid/platform/profiler.h"
 
 template <typename T>
-void SetupTensor(paddle::framework::LoDTensor& input,
+void SetupTensor(paddle::framework::LoDTensor* input,
                  paddle::framework::DDim dims,
-                 T lower,
-                 T upper) {
-  srand(time(0));
-  T* input_ptr = input.mutable_data<T>(dims, paddle::platform::CPUPlace());
-  for (int i = 0; i < input.numel(); ++i) {
-    input_ptr[i] =
-        (static_cast<T>(rand()) / static_cast<T>(RAND_MAX)) * (upper - lower) +
-        lower;
+                 const T lower,
+                 const T upper) {
+  T* input_ptr = input->mutable_data<T>(dims, paddle::platform::CPUPlace());
+  unsigned int seed = time(NULL);
+  for (int i = 0; i < input->numel(); ++i) {
+    input_ptr[i] = (static_cast<T>(rand_r(&seed)) / static_cast<T>(RAND_MAX)) *
+                       (upper - lower) +
+                   lower;
   }
 }
 
 template <typename T>
-void SetupTensor(paddle::framework::LoDTensor& input,
+void SetupTensor(paddle::framework::LoDTensor* input,
                  paddle::framework::DDim dims,
-                 std::vector<T>& data) {
+                 const std::vector<T>& data) {
   CHECK_EQ(paddle::framework::product(dims), static_cast<int64_t>(data.size()));
-  T* input_ptr = input.mutable_data<T>(dims, paddle::platform::CPUPlace());
-  memcpy(input_ptr, data.data(), input.numel() * sizeof(T));
+  T* input_ptr = input->mutable_data<T>(dims, paddle::platform::CPUPlace());
+  memcpy(input_ptr, data.data(), input->numel() * sizeof(T));
 }
 
 template <typename T>
-void SetupLoDTensor(paddle::framework::LoDTensor& input,
-                    paddle::framework::LoD& lod,
-                    T lower,
-                    T upper) {
-  input.set_lod(lod);
+void SetupLoDTensor(paddle::framework::LoDTensor* input,
+                    const paddle::framework::LoD& lod,
+                    const T lower,
+                    const T upper) {
+  input->set_lod(lod);
   int dim = lod[0][lod[0].size() - 1];
   SetupTensor<T>(input, {dim, 1}, lower, upper);
 }
 
 template <typename T>
-void SetupLoDTensor(paddle::framework::LoDTensor& input,
+void SetupLoDTensor(paddle::framework::LoDTensor* input,
                     paddle::framework::DDim dims,
                     paddle::framework::LoD lod,
-                    std::vector<T>& data) {
+                    const std::vector<T>& data) {
   const size_t level = lod.size() - 1;
   CHECK_EQ(dims[0], static_cast<int64_t>((lod[level]).back()));
-  input.set_lod(lod);
+  input->set_lod(lod);
   SetupTensor<T>(input, dims, data);
 }
 
 template <typename T>
-void CheckError(paddle::framework::LoDTensor& output1,
-                paddle::framework::LoDTensor& output2) {
+void CheckError(const paddle::framework::LoDTensor& output1,
+                const paddle::framework::LoDTensor& output2) {
   // Check lod information
   EXPECT_EQ(output1.lod(), output2.lod());
 
@@ -91,7 +96,7 @@ void CheckError(paddle::framework::LoDTensor& output1,
 template <typename Place>
 void TestInference(const std::string& dirname,
                    const std::vector<paddle::framework::LoDTensor*>& cpu_feeds,
-                   std::vector<paddle::framework::LoDTensor*>& cpu_fetchs,
+                   const std::vector<paddle::framework::LoDTensor*>& cpu_fetchs,
                    const int repeat = 1,
                    const bool is_combined = false) {
   // 1. Define place, executor, scope
diff --git a/paddle/fluid/inference/tests/test_multi_thread_helper.h b/paddle/fluid/inference/tests/test_multi_thread_helper.h
index 54e203833b..1ae2e1e0cb 100644
--- a/paddle/fluid/inference/tests/test_multi_thread_helper.h
+++ b/paddle/fluid/inference/tests/test_multi_thread_helper.h
@@ -12,6 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
+#pragma once
+
 #include <thread>
 #include "paddle/fluid/framework/lod_tensor.h"
 #include "paddle/fluid/inference/io.h"
@@ -20,13 +22,23 @@ void ThreadedRunInference(
     std::unique_ptr<paddle::framework::ProgramDesc>& inference_program,
     paddle::framework::Executor& executor,
     paddle::framework::Scope* scope,
+    const int thread_id,
     const std::vector<paddle::framework::LoDTensor*>& cpu_feeds,
     std::vector<paddle::framework::LoDTensor*>& cpu_fetchs) {
+  auto copy_program = std::unique_ptr<paddle::framework::ProgramDesc>(
+      new paddle::framework::ProgramDesc(*inference_program));
+
+  std::string feed_holder_name = "feed_" + paddle::string::to_string(thread_id);
+  std::string fetch_holder_name =
+      "fetch_" + paddle::string::to_string(thread_id);
+  copy_program->SetFeedHolderName(feed_holder_name);
+  copy_program->SetFetchHolderName(fetch_holder_name);
+
   // 3. Get the feed_target_names and fetch_target_names
   const std::vector<std::string>& feed_target_names =
-      inference_program->GetFeedTargetNames();
+      copy_program->GetFeedTargetNames();
   const std::vector<std::string>& fetch_target_names =
-      inference_program->GetFetchTargetNames();
+      copy_program->GetFetchTargetNames();
 
   // 4. Prepare inputs: set up maps for feed targets
   std::map<std::string, const paddle::framework::LoDTensor*> feed_targets;
@@ -42,7 +54,12 @@ void ThreadedRunInference(
   }
 
   // 6. Run the inference program
-  executor.Run(*inference_program, scope, feed_targets, fetch_targets);
+  executor.Run(*copy_program,
+               scope,
+               feed_targets,
+               fetch_targets,
+               feed_holder_name,
+               fetch_holder_name);
 }
 
 template <typename Place>
@@ -66,6 +83,7 @@ void TestMultiThreadInference(
                                       std::ref(inference_program),
                                       std::ref(executor),
                                       scope,
+                                      i,
                                       std::ref(cpu_feeds[i]),
                                       std::ref(cpu_fetchs[i])));
   }

From 27f553b37716ce3074cc75747e93f55fbccc68bb Mon Sep 17 00:00:00 2001
From: Liu Yiqun <liuyiqun01@baidu.com>
Date: Wed, 4 Apr 2018 06:12:56 +0000
Subject: [PATCH 03/14] Add the check of CPU results and GPU results in
 multi-thread unittest.

---
 .../inference/tests/book/test_inference_fit_a_line.cc | 11 +++++------
 paddle/fluid/inference/tests/test_helper.h            |  3 ++-
 2 files changed, 7 insertions(+), 7 deletions(-)

diff --git a/paddle/fluid/inference/tests/book/test_inference_fit_a_line.cc b/paddle/fluid/inference/tests/book/test_inference_fit_a_line.cc
index e8224be2d4..4769707780 100644
--- a/paddle/fluid/inference/tests/book/test_inference_fit_a_line.cc
+++ b/paddle/fluid/inference/tests/book/test_inference_fit_a_line.cc
@@ -32,7 +32,7 @@ TEST(inference, fit_a_line) {
   // The input data should be >= 0
   int64_t batch_size = 10;
   SetupTensor<float>(
-      input, {batch_size, 13}, static_cast<float>(0), static_cast<float>(10));
+      &input, {batch_size, 13}, static_cast<float>(0), static_cast<float>(10));
   std::vector<paddle::framework::LoDTensor*> cpu_feeds;
   cpu_feeds.push_back(&input);
 
@@ -51,7 +51,7 @@ TEST(inference, fit_a_line) {
   cpu_fetchs2.push_back(&output2);
 
   // Run inference on CUDA GPU
-  LOG(INFO) << "--- CPU Runs: ---";
+  LOG(INFO) << "--- GPU Runs: ---";
   TestInference<paddle::platform::CUDAPlace>(dirname, cpu_feeds, cpu_fetchs2);
   LOG(INFO) << output2.dims();
 
@@ -79,10 +79,8 @@ TEST(multi_thread_inference, fit_a_line) {
     // The second dim of the input tensor should be 13
     // The input data should be >= 0
     int64_t batch_size = 10;
-    SetupTensor<float>(*input,
-                       {batch_size, 13},
-                       static_cast<float>(0),
-                       static_cast<float>(10));
+    SetupTensor<float>(
+        input, {batch_size, 13}, static_cast<float>(0), static_cast<float>(10));
     cpu_feeds[i].push_back(input);
   }
 
@@ -112,6 +110,7 @@ TEST(multi_thread_inference, fit_a_line) {
       dirname, cpu_feeds, cpu_fetchs2, num_threads);
 
   for (int i = 0; i < num_threads; ++i) {
+    CheckError<float>(*cpu_fetchs1[i][0], *cpu_fetchs2[i][0]);
     delete cpu_fetchs2[i][0];
   }
 #endif
diff --git a/paddle/fluid/inference/tests/test_helper.h b/paddle/fluid/inference/tests/test_helper.h
index 95c526352c..a472ee68c0 100644
--- a/paddle/fluid/inference/tests/test_helper.h
+++ b/paddle/fluid/inference/tests/test_helper.h
@@ -15,6 +15,7 @@ limitations under the License. */
 #pragma once
 
 #include <time.h>
+#include <cstdint>
 #include <map>
 #include <string>
 #include <vector>
@@ -28,7 +29,7 @@ void SetupTensor(paddle::framework::LoDTensor* input,
                  const T lower,
                  const T upper) {
   T* input_ptr = input->mutable_data<T>(dims, paddle::platform::CPUPlace());
-  unsigned int seed = time(NULL);
+  unsigned int seed = reinterpret_cast<std::uintptr_t>(input);
   for (int i = 0; i < input->numel(); ++i) {
     input_ptr[i] = (static_cast<T>(rand_r(&seed)) / static_cast<T>(RAND_MAX)) *
                        (upper - lower) +

From b138d29c3891a5ddcdd283c74881bec13d58957b Mon Sep 17 00:00:00 2001
From: Xin Pan <panxin.grad@gmail.com>
Date: Fri, 6 Apr 2018 20:28:51 -0700
Subject: [PATCH 04/14] Avoid running init_p2p every time

---
 paddle/fluid/framework/data_device_transform_test.cu | 2 +-
 paddle/fluid/framework/init.cc                       | 6 ++++--
 paddle/fluid/framework/init.h                        | 2 +-
 paddle/fluid/framework/init_test.cc                  | 4 ++--
 paddle/fluid/framework/lod_tensor_test.cu            | 4 ++--
 paddle/fluid/framework/operator_test.cc              | 8 ++++----
 paddle/fluid/pybind/pybind.cc                        | 3 ++-
 paddle/testing/paddle_gtest_main.cc                  | 2 +-
 python/paddle/fluid/__init__.py                      | 7 ++++++-
 9 files changed, 23 insertions(+), 15 deletions(-)

diff --git a/paddle/fluid/framework/data_device_transform_test.cu b/paddle/fluid/framework/data_device_transform_test.cu
index e896a06162..a66525303d 100644
--- a/paddle/fluid/framework/data_device_transform_test.cu
+++ b/paddle/fluid/framework/data_device_transform_test.cu
@@ -105,7 +105,7 @@ static void BuildVar(const std::string& param_name,
 TEST(Operator, CPUtoGPU) {
   using namespace paddle::framework;
   using namespace paddle::platform;
-  InitDevices();
+  InitDevices(true);
 
   paddle::framework::Scope scope;
   paddle::platform::CPUPlace cpu_place;
diff --git a/paddle/fluid/framework/init.cc b/paddle/fluid/framework/init.cc
index 3c0d93642a..75c557fa42 100644
--- a/paddle/fluid/framework/init.cc
+++ b/paddle/fluid/framework/init.cc
@@ -64,7 +64,7 @@ void InitP2P(int count) {
 #endif
 }
 
-void InitDevices() {
+void InitDevices(bool init_p2p) {
   /*Init all avaiable devices by default */
 
   std::vector<platform::Place> places;
@@ -85,7 +85,9 @@ void InitDevices() {
   for (int i = 0; i < count; ++i) {
     places.emplace_back(platform::CUDAPlace(i));
   }
-  InitP2P(count);
+  if (init_p2p) {
+    InitP2P(count);
+  }
   platform::DeviceContextPool::Init(places);
 }
 
diff --git a/paddle/fluid/framework/init.h b/paddle/fluid/framework/init.h
index 7d86d15811..fae98a60b5 100644
--- a/paddle/fluid/framework/init.h
+++ b/paddle/fluid/framework/init.h
@@ -24,7 +24,7 @@ void InitGflags(std::vector<std::string> &argv);
 
 void InitGLOG(const std::string &prog_name);
 
-void InitDevices();
+void InitDevices(bool init_p2p);
 
 }  // namespace framework
 }  // namespace paddle
diff --git a/paddle/fluid/framework/init_test.cc b/paddle/fluid/framework/init_test.cc
index 2a03f0afe6..928e2d14ab 100644
--- a/paddle/fluid/framework/init_test.cc
+++ b/paddle/fluid/framework/init_test.cc
@@ -21,7 +21,7 @@ TEST(InitDevices, CPU) {
   using paddle::platform::DeviceContextPool;
 
 #ifndef PADDLE_WITH_CUDA
-  InitDevices();
+  InitDevices(true);
   DeviceContextPool& pool = DeviceContextPool::Instance();
   ASSERT_EQ(pool.size(), 1U);
 #endif
@@ -33,7 +33,7 @@ TEST(InitDevices, CUDA) {
 
 #ifdef PADDLE_WITH_CUDA
   int count = paddle::platform::GetCUDADeviceCount();
-  InitDevices();
+  InitDevices(true);
   DeviceContextPool& pool = DeviceContextPool::Instance();
   ASSERT_EQ(pool.size(), 1U + static_cast<unsigned>(count));
 #endif
diff --git a/paddle/fluid/framework/lod_tensor_test.cu b/paddle/fluid/framework/lod_tensor_test.cu
index be65da5ba2..e3efbe4c46 100644
--- a/paddle/fluid/framework/lod_tensor_test.cu
+++ b/paddle/fluid/framework/lod_tensor_test.cu
@@ -30,7 +30,7 @@ __global__ void test(size_t* a, int size) {
 }
 
 TEST(LoD, data) {
-  paddle::framework::InitDevices();
+  paddle::framework::InitDevices(true);
 
   paddle::framework::LoD lod{{0, 1, 2}};
   lod.push_back({0, 2, 4, 5});
@@ -46,7 +46,7 @@ TEST(LoD, data) {
 }
 
 TEST(LoDTensor, LoDInGPU) {
-  paddle::framework::InitDevices();
+  paddle::framework::InitDevices(true);
 
   paddle::framework::LoDTensor lod_tensor;
   paddle::platform::CUDAPlace place(0);
diff --git a/paddle/fluid/framework/operator_test.cc b/paddle/fluid/framework/operator_test.cc
index 44ca4d7ca5..25f622b725 100644
--- a/paddle/fluid/framework/operator_test.cc
+++ b/paddle/fluid/framework/operator_test.cc
@@ -72,7 +72,7 @@ REGISTER_OP_WITHOUT_GRADIENT(test_operator,
                              paddle::framework::OpWithoutKernelCheckerMaker);
 
 TEST(OperatorBase, all) {
-  paddle::framework::InitDevices();
+  paddle::framework::InitDevices(true);
   paddle::framework::proto::OpDesc op_desc;
   op_desc.set_type("test_operator");
   BuildVar("input", {"IN1"}, op_desc.add_inputs());
@@ -198,7 +198,7 @@ REGISTER_OP_CPU_KERNEL(op_with_kernel,
 
 // test with single input
 TEST(OpKernel, all) {
-  paddle::framework::InitDevices();
+  paddle::framework::InitDevices(true);
   paddle::framework::proto::OpDesc op_desc;
   op_desc.set_type("op_with_kernel");
   BuildVar("x", {"IN1"}, op_desc.add_inputs());
@@ -228,7 +228,7 @@ REGISTER_OP_CPU_KERNEL(op_multi_inputs_with_kernel,
 TEST(OpKernel, multi_inputs) {
   using namespace paddle::framework;
 
-  paddle::framework::InitDevices();
+  paddle::framework::InitDevices(true);
   proto::OpDesc op_desc;
 
   op_desc.set_type("op_multi_inputs_with_kernel");
@@ -269,7 +269,7 @@ class OperatorClone : public paddle::framework::OperatorBase {
 };
 
 TEST(Operator, Clone) {
-  paddle::framework::InitDevices();
+  paddle::framework::InitDevices(true);
   OperatorClone a("ABC", paddle::framework::VariableNameMap{},
                   paddle::framework::VariableNameMap{},
                   paddle::framework::AttributeMap{});
diff --git a/paddle/fluid/pybind/pybind.cc b/paddle/fluid/pybind/pybind.cc
index b0a3f06a88..9712da4b72 100644
--- a/paddle/fluid/pybind/pybind.cc
+++ b/paddle/fluid/pybind/pybind.cc
@@ -423,7 +423,8 @@ All parameter, weight, gradient are variables in Paddle.
 
   m.def("init_gflags", framework::InitGflags);
   m.def("init_glog", framework::InitGLOG);
-  m.def("init_devices", &framework::InitDevices);
+  m.def("init_devices",
+        [](bool init_p2p) { framework::InitDevices(init_p2p); });
 
   m.def("is_compiled_with_cuda", IsCompiledWithCUDA);
 #ifdef PADDLE_WITH_CUDA
diff --git a/paddle/testing/paddle_gtest_main.cc b/paddle/testing/paddle_gtest_main.cc
index 0fea6a8079..586ec48477 100644
--- a/paddle/testing/paddle_gtest_main.cc
+++ b/paddle/testing/paddle_gtest_main.cc
@@ -41,6 +41,6 @@ int main(int argc, char** argv) {
   paddle::memory::Used(paddle::platform::CUDAPlace(0));
 #endif
 
-  paddle::framework::InitDevices();
+  paddle::framework::InitDevices(true);
   return RUN_ALL_TESTS();
 }
diff --git a/python/paddle/fluid/__init__.py b/python/paddle/fluid/__init__.py
index 5ea4d977f4..682f45c7db 100644
--- a/python/paddle/fluid/__init__.py
+++ b/python/paddle/fluid/__init__.py
@@ -84,6 +84,8 @@ def __bootstrap__():
     import core
     import os
 
+    in_test = 'unittest' in sys.modules
+
     try:
         num_threads = int(os.getenv('OMP_NUM_THREADS', '1'))
     except ValueError:
@@ -108,8 +110,11 @@ def __bootstrap__():
     core.init_gflags([sys.argv[0]] +
                      ["--tryfromenv=" + ",".join(read_env_flags)])
     core.init_glog(sys.argv[0])
-    core.init_devices()
+    # don't init_p2p when in unittest to save time.
+    core.init_devices(not in_test)
 
 
+# TODO(panyx0718): Avoid doing complex initialization logic in __init__.py.
+# Consider paddle.init(args) or paddle.main(args)
 layers.monkey_patch_variable()
 __bootstrap__()

From 90f3a421c72b2c2cf5c3a38a23f40bb6d8a43aa3 Mon Sep 17 00:00:00 2001
From: Liu Yiqun <liuyiqun01@baidu.com>
Date: Sun, 8 Apr 2018 08:13:09 +0000
Subject: [PATCH 05/14] Change the argument's type from reference to pointer.

---
 .../fluid/inference/tests/test_multi_thread_helper.h | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/paddle/fluid/inference/tests/test_multi_thread_helper.h b/paddle/fluid/inference/tests/test_multi_thread_helper.h
index 4e798de475..405e9edb4a 100644
--- a/paddle/fluid/inference/tests/test_multi_thread_helper.h
+++ b/paddle/fluid/inference/tests/test_multi_thread_helper.h
@@ -23,8 +23,8 @@ limitations under the License. */
 
 void ThreadedRunInference(
     const std::unique_ptr<paddle::framework::ProgramDesc>& inference_program,
-    const paddle::framework::Executor& executor,
-    paddle::framework::Scope* scope, const int thread_id,
+    paddle::framework::Executor* executor, paddle::framework::Scope* scope,
+    const int thread_id,
     const std::vector<paddle::framework::LoDTensor*>& cpu_feeds,
     const std::vector<paddle::framework::LoDTensor*>& cpu_fetchs) {
   auto copy_program = std::unique_ptr<paddle::framework::ProgramDesc>(
@@ -56,8 +56,8 @@ void ThreadedRunInference(
   }
 
   // 6. Run the inference program
-  executor.Run(*copy_program, scope, feed_targets, fetch_targets,
-               feed_holder_name, fetch_holder_name);
+  executor->Run(*copy_program, scope, feed_targets, fetch_targets,
+                feed_holder_name, fetch_holder_name);
 }
 
 template <typename Place>
@@ -78,8 +78,8 @@ void TestMultiThreadInference(
   std::vector<std::thread*> threads;
   for (int i = 0; i < num_threads; ++i) {
     threads.push_back(new std::thread(
-        ThreadedRunInference, std::ref(inference_program), std::ref(executor),
-        scope, i, std::ref(cpu_feeds[i]), std::ref(cpu_fetchs[i])));
+        ThreadedRunInference, std::ref(inference_program), &executor, scope, i,
+        std::ref(cpu_feeds[i]), std::ref(cpu_fetchs[i])));
   }
   for (int i = 0; i < num_threads; ++i) {
     threads[i]->join();

From 5c83de7b3c1f08199cc1a38507f806913e215266 Mon Sep 17 00:00:00 2001
From: Shan Yi <35982308+shanyi15@users.noreply.github.com>
Date: Mon, 9 Apr 2018 15:49:57 +0800
Subject: [PATCH 06/14] Delete index_en.rst

---
 doc/v2/build_and_install/index_en.rst | 32 ---------------------------
 1 file changed, 32 deletions(-)
 delete mode 100644 doc/v2/build_and_install/index_en.rst

diff --git a/doc/v2/build_and_install/index_en.rst b/doc/v2/build_and_install/index_en.rst
deleted file mode 100644
index 7e0ca5bcbd..0000000000
--- a/doc/v2/build_and_install/index_en.rst
+++ /dev/null
@@ -1,32 +0,0 @@
-Install and Build
-=================
-
-.. _install_steps:
-
-Install Steps
-++++++++
-
-You can choose either pip or Docker to complete your install:
-
-.. toctree::
-   :maxdepth: 1
-
-   pip_install_en.rst
-   docker_install_en.rst
-
-Build from Source
------------------
-
-..  warning::
-
-    We recommend to directly install via above installation steps, you'll only need to build PaddlePaddle from source when you need a modifed binary.
-
-..  toctree::
-    :maxdepth: 1
-
-    build_from_source_en.md
-
-FAQ
-++++++++++
-
-`FAQ <http://www.paddlepaddle.org/docs/develop/documentation/zh/faq/build_and_install/index_en.html>`_

From 5f360750839db4469bc9ba8c7f379d810f9df2b6 Mon Sep 17 00:00:00 2001
From: Shan Yi <35982308+shanyi15@users.noreply.github.com>
Date: Mon, 9 Apr 2018 15:52:06 +0800
Subject: [PATCH 07/14] add index_en.rst

---
 doc/v2/build_and_install/index_en.rst | 56 +++++++++++++++++++++++++++
 1 file changed, 56 insertions(+)
 create mode 100644 doc/v2/build_and_install/index_en.rst

diff --git a/doc/v2/build_and_install/index_en.rst b/doc/v2/build_and_install/index_en.rst
new file mode 100644
index 0000000000..04aa4693ac
--- /dev/null
+++ b/doc/v2/build_and_install/index_en.rst
@@ -0,0 +1,56 @@
+install and compile
+==========
+
+.. _install_steps:
+
+PaddlePaddle provides various ways of installation for many different users
+
+focus on deep learning model development
+-----------------
+
+PaddlePaddle provides lots of packages of python wheel , that pip can install:
+
+.. toctree::
+	:maxdepth: 1
+
+	pip_install_cn.rst
+
+this is the convenient way to install it , please choose the right installation package with mochine configure and system。
+
+follow the bottom frame
+----------
+
+PaddlePaddle provides the installation ways of Docker, please follow the tutorial:
+
+.. toctree::
+	:maxdepth: 1
+
+	docker_install_cn.rst
+
+we recommend running PaddlePaddle in docker , this way has more superiority ：
+
+- don't need the third dependent of installing 
+- easy to shared runtime environment and the problem recurrented
+
+we provides compile and install method of PaddlePaddle from resouce code ， for users with customized binary file：
+
+.. toctree::
+    :maxdepth: 1
+
+    build_from_source_cn.rst
+
+.. warning::
+
+	what need to be attation to , this way of installation involves to download、 compile and install the third depentent , The whole process of installing need more time。
+
+
+FAQ
+-----------
+
+if you have any problem on the process of installation , please trying the bottom page to find the answer：
+
+:ref:`常见问题解答 <install_faq>`
+
+if the problem hasn't been solved , so welcome to come the paddlepaddle community to feedback：
+
+`创建issue <https://github.com/PaddlePaddle/Paddle/issues/new>`_

From 044da8c4b74e5546422bda03abb3fca4e53861f8 Mon Sep 17 00:00:00 2001
From: weixing02 <564445201@qq.com>
Date: Mon, 9 Apr 2018 15:56:26 +0800
Subject: [PATCH 08/14] Add title for kernel_hint_design.md &
 kernel_selection.md

---
 doc/fluid/design/muti_devices/kernel_hint_design.md | 4 +++-
 doc/fluid/design/muti_devices/kernel_selection.md   | 4 +++-
 2 files changed, 6 insertions(+), 2 deletions(-)

diff --git a/doc/fluid/design/muti_devices/kernel_hint_design.md b/doc/fluid/design/muti_devices/kernel_hint_design.md
index 728c8f0b96..58e44b6416 100644
--- a/doc/fluid/design/muti_devices/kernel_hint_design.md
+++ b/doc/fluid/design/muti_devices/kernel_hint_design.md
@@ -1,4 +1,6 @@
-# Problem
+# Kernel Hint Design
+
+## Problem
 In PaddlePaddle's [Design](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/switch_kernel.md), one Operator may have multiple kernels. Users may have some personal preference to choose a certain type of kernel for an operator, such as `force_cpu` to choose a CPU kernel, `use_cudnn` to choose a CUDNN kernel, we need to provide a way for users to do this.
 
 In the current design, we use KernelType to describe one kernel.
diff --git a/doc/fluid/design/muti_devices/kernel_selection.md b/doc/fluid/design/muti_devices/kernel_selection.md
index 39ea2b0009..967317d5d2 100644
--- a/doc/fluid/design/muti_devices/kernel_selection.md
+++ b/doc/fluid/design/muti_devices/kernel_selection.md
@@ -1,4 +1,6 @@
-# Background
+# Kernel Selection
+
+## Background
 Every operator has many kernels because there are multiple data types, places, data layout, library type that Fluid supports. We use the `OpKernelType ` to describe kernel types that operators can hold.
 
 The `OpKernelType ` is as follows:

From e24172eb54ae7aee604940c206f13777d01d18c7 Mon Sep 17 00:00:00 2001
From: Liu Yiqun <liuyiqun01@baidu.com>
Date: Mon, 9 Apr 2018 08:35:52 +0000
Subject: [PATCH 09/14] Simplify the inference unittest of fit a line and add
 some comment.

---
 paddle/fluid/framework/program_desc.h         |  14 ++
 .../tests/book/test_inference_fit_a_line.cc   | 143 +++++++-----------
 .../tests/test_multi_thread_helper.h          |   2 +-
 3 files changed, 70 insertions(+), 89 deletions(-)

diff --git a/paddle/fluid/framework/program_desc.h b/paddle/fluid/framework/program_desc.h
index fe29a4ae58..4288081be7 100644
--- a/paddle/fluid/framework/program_desc.h
+++ b/paddle/fluid/framework/program_desc.h
@@ -53,10 +53,24 @@ class ProgramDesc {
 
   proto::ProgramDesc *Proto();
 
+  // The output variable of feed_op is referenced as feed_target.
+  // This function is used to collect the output variable's name of all
+  // feed_ops.
   const std::vector<std::string> GetFeedTargetNames();
+
+  // The input variable of fetch_op is referenced as fetch_target.
+  // This function is used to collect the input variable's name of all
+  // fetch_ops.
   const std::vector<std::string> GetFetchTargetNames();
 
+  // The input variable of feed_op that holds input Tensor provided by users is
+  // referenced as feed_holder.
+  // This function is used to change or unify the feed_holder variables' name.
   void SetFeedHolderName(const std::string &feed_holder_name);
+
+  // The output variable of fetch_op that holds output Tensor needed by users is
+  // referenced as fetch_holder.
+  // This function is used to change or unify the fetch_holder variables' name.
   void SetFetchHolderName(const std::string &fetch_holder_name);
 
  private:
diff --git a/paddle/fluid/inference/tests/book/test_inference_fit_a_line.cc b/paddle/fluid/inference/tests/book/test_inference_fit_a_line.cc
index 7ad7278706..2c5b66a329 100644
--- a/paddle/fluid/inference/tests/book/test_inference_fit_a_line.cc
+++ b/paddle/fluid/inference/tests/book/test_inference_fit_a_line.cc
@@ -27,96 +27,63 @@ TEST(inference, fit_a_line) {
   // 0. Call `paddle::framework::InitDevices()` initialize all the devices
   // In unittests, this is done in paddle/testing/paddle_gtest_main.cc
 
-  paddle::framework::LoDTensor input;
-  // The second dim of the input tensor should be 13
-  // The input data should be >= 0
-  int64_t batch_size = 10;
-  SetupTensor<float>(&input, {batch_size, 13}, static_cast<float>(0),
-                     static_cast<float>(10));
-  std::vector<paddle::framework::LoDTensor*> cpu_feeds;
-  cpu_feeds.push_back(&input);
-
-  paddle::framework::LoDTensor output1;
-  std::vector<paddle::framework::LoDTensor*> cpu_fetchs1;
-  cpu_fetchs1.push_back(&output1);
-
-  // Run inference on CPU
-  LOG(INFO) << "--- CPU Runs: ---";
-  TestInference<paddle::platform::CPUPlace>(dirname, cpu_feeds, cpu_fetchs1);
-  LOG(INFO) << output1.dims();
+  for (int num_threads : {1, 2}) {
+    std::vector<std::vector<paddle::framework::LoDTensor*>> cpu_feeds;
+    cpu_feeds.resize(num_threads);
+    for (int i = 0; i < num_threads; ++i) {
+      auto* input = new paddle::framework::LoDTensor();
+      // The second dim of the input tensor should be 13
+      // The input data should be >= 0
+      int64_t batch_size = 10;
+      SetupTensor<float>(input, {batch_size, 13}, static_cast<float>(0),
+                         static_cast<float>(10));
+      cpu_feeds[i].push_back(input);
+    }
+
+    std::vector<std::vector<paddle::framework::LoDTensor*>> cpu_fetchs1;
+    cpu_fetchs1.resize(num_threads);
+    for (int i = 0; i < num_threads; ++i) {
+      auto* output = new paddle::framework::LoDTensor();
+      cpu_fetchs1[i].push_back(output);
+    }
+
+    // Run inference on CPU
+    LOG(INFO) << "--- CPU Runs (num_threads: " << num_threads << "): ---";
+    if (num_threads == 1) {
+      TestInference<paddle::platform::CPUPlace>(dirname, cpu_feeds[0],
+                                                cpu_fetchs1[0]);
+    } else {
+      TestMultiThreadInference<paddle::platform::CPUPlace>(
+          dirname, cpu_feeds, cpu_fetchs1, num_threads);
+    }
 
 #ifdef PADDLE_WITH_CUDA
-  paddle::framework::LoDTensor output2;
-  std::vector<paddle::framework::LoDTensor*> cpu_fetchs2;
-  cpu_fetchs2.push_back(&output2);
-
-  // Run inference on CUDA GPU
-  LOG(INFO) << "--- GPU Runs: ---";
-  TestInference<paddle::platform::CUDAPlace>(dirname, cpu_feeds, cpu_fetchs2);
-  LOG(INFO) << output2.dims();
-
-  CheckError<float>(output1, output2);
+    std::vector<std::vector<paddle::framework::LoDTensor*>> cpu_fetchs2;
+    cpu_fetchs2.resize(num_threads);
+    for (int i = 0; i < num_threads; ++i) {
+      auto* output = new paddle::framework::LoDTensor();
+      cpu_fetchs2[i].push_back(output);
+    }
+
+    // Run inference on CUDA GPU
+    LOG(INFO) << "--- GPU Runs (num_threads: " << num_threads << "): ---";
+    if (num_threads == 1) {
+      TestInference<paddle::platform::CUDAPlace>(dirname, cpu_feeds[0],
+                                                 cpu_fetchs2[0]);
+    } else {
+      TestMultiThreadInference<paddle::platform::CUDAPlace>(
+          dirname, cpu_feeds, cpu_fetchs2, num_threads);
+    }
+
+    for (int i = 0; i < num_threads; ++i) {
+      CheckError<float>(*cpu_fetchs1[i][0], *cpu_fetchs2[i][0]);
+      delete cpu_fetchs2[i][0];
+    }
 #endif
-}
-
-TEST(multi_thread_inference, fit_a_line) {
-  if (FLAGS_dirname.empty()) {
-    LOG(FATAL) << "Usage: ./example --dirname=path/to/your/model";
-  }
-
-  LOG(INFO) << "FLAGS_dirname: " << FLAGS_dirname << std::endl;
-  std::string dirname = FLAGS_dirname;
-
-  // 0. Call `paddle::framework::InitDevices()` initialize all the devices
-  // In unittests, this is done in paddle/testing/paddle_gtest_main.cc
 
-  int num_threads = 2;
-
-  std::vector<std::vector<paddle::framework::LoDTensor*>> cpu_feeds;
-  cpu_feeds.resize(num_threads);
-  for (int i = 0; i < num_threads; ++i) {
-    auto* input = new paddle::framework::LoDTensor();
-    // The second dim of the input tensor should be 13
-    // The input data should be >= 0
-    int64_t batch_size = 10;
-    SetupTensor<float>(input, {batch_size, 13}, static_cast<float>(0),
-                       static_cast<float>(10));
-    cpu_feeds[i].push_back(input);
-  }
-
-  std::vector<std::vector<paddle::framework::LoDTensor*>> cpu_fetchs1;
-  cpu_fetchs1.resize(num_threads);
-  for (int i = 0; i < num_threads; ++i) {
-    auto* output = new paddle::framework::LoDTensor();
-    cpu_fetchs1[i].push_back(output);
-  }
-
-  // Run inference on CPU
-  LOG(INFO) << "--- CPU Runs (Multi Thread): ---";
-  TestMultiThreadInference<paddle::platform::CPUPlace>(
-      dirname, cpu_feeds, cpu_fetchs1, num_threads);
-
-#ifdef PADDLE_WITH_CUDA
-  std::vector<std::vector<paddle::framework::LoDTensor*>> cpu_fetchs2;
-  cpu_fetchs2.resize(num_threads);
-  for (int i = 0; i < num_threads; ++i) {
-    auto* output = new paddle::framework::LoDTensor();
-    cpu_fetchs2[i].push_back(output);
-  }
-
-  // Run inference on CUDA GPU
-  LOG(INFO) << "--- GPU Runs (Multi Thread): ---";
-  TestMultiThreadInference<paddle::platform::CUDAPlace>(
-      dirname, cpu_feeds, cpu_fetchs2, num_threads);
-
-  for (int i = 0; i < num_threads; ++i) {
-    CheckError<float>(*cpu_fetchs1[i][0], *cpu_fetchs2[i][0]);
-    delete cpu_fetchs2[i][0];
-  }
-#endif
-
-  for (int i = 0; i < num_threads; ++i) {
-    delete cpu_feeds[i][0];
-    delete cpu_fetchs1[i][0];
-  }
+    for (int i = 0; i < num_threads; ++i) {
+      delete cpu_feeds[i][0];
+      delete cpu_fetchs1[i][0];
+    }
+  }  // num_threads-loop
 }
diff --git a/paddle/fluid/inference/tests/test_multi_thread_helper.h b/paddle/fluid/inference/tests/test_multi_thread_helper.h
index 405e9edb4a..56745f115d 100644
--- a/paddle/fluid/inference/tests/test_multi_thread_helper.h
+++ b/paddle/fluid/inference/tests/test_multi_thread_helper.h
@@ -56,7 +56,7 @@ void ThreadedRunInference(
   }
 
   // 6. Run the inference program
-  executor->Run(*copy_program, scope, feed_targets, fetch_targets,
+  executor->Run(*copy_program, scope, feed_targets, fetch_targets, true,
                 feed_holder_name, fetch_holder_name);
 }
 

From 720f6196ea0e62d95111ee0ceb2f9f7f92626fe1 Mon Sep 17 00:00:00 2001
From: Liu Yiqun <liuyiqun01@baidu.com>
Date: Mon, 9 Apr 2018 08:46:03 +0000
Subject: [PATCH 10/14] Change the seed and make it not fixed for multi-threads
 cases.

---
 paddle/fluid/inference/tests/test_helper.h | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/paddle/fluid/inference/tests/test_helper.h b/paddle/fluid/inference/tests/test_helper.h
index aae34ceda0..064e400f0c 100644
--- a/paddle/fluid/inference/tests/test_helper.h
+++ b/paddle/fluid/inference/tests/test_helper.h
@@ -25,7 +25,8 @@ limitations under the License. */
 template <typename T>
 void SetupTensor(paddle::framework::LoDTensor* input,
                  paddle::framework::DDim dims, T lower, T upper) {
-  std::mt19937 rng(100);  // An arbitrarily chosen but fixed seed.
+  static unsigned int seed = 100;
+  std::mt19937 rng(seed++);
   std::uniform_real_distribution<double> uniform_dist(0, 1);
 
   T* input_ptr = input->mutable_data<T>(dims, paddle::platform::CPUPlace());

From a513c283688e6e8a4f2b6c5671a964d449166a1c Mon Sep 17 00:00:00 2001
From: Abhinav Arora <aroraabhinav@baidu.com>
Date: Mon, 9 Apr 2018 11:07:19 -0700
Subject: [PATCH 11/14] Fix build and install document

---
 doc/v2/build_and_install/index_en.rst | 32 +++++++++++++--------------
 1 file changed, 16 insertions(+), 16 deletions(-)

diff --git a/doc/v2/build_and_install/index_en.rst b/doc/v2/build_and_install/index_en.rst
index 04aa4693ac..5b3de0f8c3 100644
--- a/doc/v2/build_and_install/index_en.rst
+++ b/doc/v2/build_and_install/index_en.rst
@@ -1,11 +1,11 @@
-install and compile
+Install and Compile
 ==========
 
 .. _install_steps:
 
-PaddlePaddle provides various ways of installation for many different users
+PaddlePaddle provides various methods of installation for many different users
 
-focus on deep learning model development
+Focus on Deep Learning Model Development
 -----------------
 
 PaddlePaddle provides lots of packages of python wheel , that pip can install:
@@ -13,44 +13,44 @@ PaddlePaddle provides lots of packages of python wheel , that pip can install:
 .. toctree::
 	:maxdepth: 1
 
-	pip_install_cn.rst
+	pip_install_en.rst
 
-this is the convenient way to install it , please choose the right installation package with mochine configure and system。
+This is the most convenient way of installation. Please choose the installation package that matches your machine configuration and operating system.
 
-follow the bottom frame
+Focus on the Underlying Framework
 ----------
 
-PaddlePaddle provides the installation ways of Docker, please follow the tutorial:
+PaddlePaddle also supports installation using Docker. Please refer to the tutorial below:
 
 .. toctree::
 	:maxdepth: 1
 
-	docker_install_cn.rst
+	docker_install_en.rst
 
-we recommend running PaddlePaddle in docker , this way has more superiority ：
+We recommend running PaddlePaddle in Docker. This method has the following advantages:
 
-- don't need the third dependent of installing 
-- easy to shared runtime environment and the problem recurrented
+- Does not require installation of third-party dependencies.
+- Easy to share runtime environment.
 
-we provides compile and install method of PaddlePaddle from resouce code ， for users with customized binary file：
+Lastly, users can also compile and install PaddlePaddle from source code. The instructions are below:
 
 .. toctree::
     :maxdepth: 1
 
-    build_from_source_cn.rst
+    build_from_source_en.rst
 
 .. warning::
 
-	what need to be attation to , this way of installation involves to download、 compile and install the third depentent , The whole process of installing need more time。
+	One caveat with this approach is that developers will have to download, compile and install all third-party dependencies. Thus this process of installation is more time consuming.
 
 
 FAQ
 -----------
 
-if you have any problem on the process of installation , please trying the bottom page to find the answer：
+For any problems during installation, please refer to the page below for answers:
 
 :ref:`常见问题解答 <install_faq>`
 
-if the problem hasn't been solved , so welcome to come the paddlepaddle community to feedback：
+If the problem still persists, you are welcome to seek assistance from the PaddlePaddle community:
 
 `创建issue <https://github.com/PaddlePaddle/Paddle/issues/new>`_

From add367c3f4b77103cda50b9359492f34a366477c Mon Sep 17 00:00:00 2001
From: qingqing01 <dangqingqing@baidu.com>
Date: Tue, 10 Apr 2018 02:25:26 +0800
Subject: [PATCH 12/14] Code cleanup in the profiler code. (#9782)

---
 paddle/fluid/platform/profiler.cc      | 194 +++++++++++++++----------
 paddle/fluid/platform/profiler.h       |  62 +-------
 paddle/fluid/platform/profiler_test.cc |  17 ++-
 3 files changed, 133 insertions(+), 140 deletions(-)

diff --git a/paddle/fluid/platform/profiler.cc b/paddle/fluid/platform/profiler.cc
index b25206ff35..412cdda286 100644
--- a/paddle/fluid/platform/profiler.cc
+++ b/paddle/fluid/platform/profiler.cc
@@ -15,8 +15,11 @@ limitations under the License. */
 #include "paddle/fluid/platform/profiler.h"
 #include <sys/time.h>
 #include <time.h>
+#include <algorithm>
 #include <iomanip>
 #include <map>
+#include <mutex>  // NOLINT
+#include <string>
 #ifdef PADDLE_WITH_CUDA
 #include <cuda.h>
 #endif  // PADDLE_WITH_CUDA
@@ -28,10 +31,10 @@ limitations under the License. */
 namespace paddle {
 namespace platform {
 
+struct EventList;
+
 // The profiler state, the initial value is ProfilerState::kDisabled
 static ProfilerState g_state = ProfilerState::kDisabled;
-// To record which timer the profiler used, CUDA or CPU.
-static std::string g_profiler_place = "";
 // The thread local event list only can be accessed by the specific thread
 // The thread index of each thread
 static thread_local int32_t g_thread_id;
@@ -45,6 +48,39 @@ static std::list<std::shared_ptr<EventList>> g_all_event_lists;
 // The thread local event list only can be accessed by the specific thread
 static thread_local std::shared_ptr<EventList> g_event_list;
 
+struct EventList {
+  constexpr static size_t kMB = 1024 * 1024;
+  constexpr static size_t kEventBlockSize = 16 * kMB;
+  constexpr static size_t kEventSize = sizeof(Event);
+  constexpr static size_t kEventAlign = alignof(Event);
+  constexpr static size_t kNumBlock =
+      kEventBlockSize /
+      ((kEventSize + kEventAlign - 1) / kEventAlign * kEventAlign);
+
+  template <typename... Args>
+  void Record(Args&&... args) {
+    if (event_blocks.empty() || event_blocks.front().size() == kNumBlock) {
+      event_blocks.emplace_front();
+      event_blocks.front().reserve(kNumBlock);
+    }
+    event_blocks.front().emplace_back(std::forward<Args>(args)...);
+  }
+
+  std::vector<Event> Reduce() {
+    std::vector<Event> result;
+    for (auto& block : event_blocks) {
+      result.insert(result.begin(), std::make_move_iterator(block.begin()),
+                    std::make_move_iterator(block.end()));
+    }
+    event_blocks.clear();
+    return result;
+  }
+
+  void Clear() { event_blocks.clear(); }
+
+  std::forward_list<std::vector<Event>> event_blocks;
+};
+
 inline uint64_t GetTimeInNsec() {
   using clock = std::conditional<std::chrono::high_resolution_clock::is_steady,
                                  std::chrono::high_resolution_clock,
@@ -60,9 +96,9 @@ inline uint64_t PosixInNsec() {
   return 1000 * (static_cast<uint64_t>(tv.tv_sec) * 1000000 + tv.tv_usec);
 }
 
-Event::Event(EventKind kind, std::string name, uint32_t thread_id,
+Event::Event(EventType type, std::string name, uint32_t thread_id,
              const DeviceContext* dev_ctx)
-    : kind_(kind), name_(name), thread_id_(thread_id), has_cuda_(false) {
+    : type_(type), name_(name), thread_id_(thread_id), has_cuda_(false) {
 #ifdef PADDLE_WITH_CUDA
   has_cuda_ = dev_ctx ? platform::is_gpu_place(dev_ctx->GetPlace()) : false;
   if (has_cuda_) {
@@ -76,17 +112,7 @@ Event::Event(EventKind kind, std::string name, uint32_t thread_id,
   cpu_ns_ = GetTimeInNsec();
 }
 
-std::string Event::kind() const {
-  switch (kind_) {
-    case EventKind::kMark:
-      return "mark";
-    case EventKind::kPushRange:
-      return "push";
-    case EventKind::kPopRange:
-      return "pop";
-  }
-  PADDLE_THROW("Unknown EventKind.");
-}
+const EventType& Event::type() const { return type_; }
 
 double Event::CpuElapsedMs(const Event& e) const {
   return (e.cpu_ns_ - cpu_ns_) / (1000000.0);
@@ -129,15 +155,15 @@ inline EventList& GetEventList() {
 }
 
 void Mark(const std::string& name, const DeviceContext* dev_ctx) {
-  GetEventList().Record(EventKind::kMark, name, g_thread_id, dev_ctx);
+  GetEventList().Record(EventType::kMark, name, g_thread_id, dev_ctx);
 }
 
 void PushEvent(const std::string& name, const DeviceContext* dev_ctx) {
-  GetEventList().Record(EventKind::kPushRange, name, g_thread_id, dev_ctx);
+  GetEventList().Record(EventType::kPushRange, name, g_thread_id, dev_ctx);
 }
 
 void PopEvent(const std::string& name, const DeviceContext* dev_ctx) {
-  GetEventList().Record(EventKind::kPopRange, name, g_thread_id, dev_ctx);
+  GetEventList().Record(EventType::kPopRange, name, g_thread_id, dev_ctx);
 }
 
 RecordEvent::RecordEvent(const std::string& name, const DeviceContext* dev_ctx)
@@ -197,12 +223,7 @@ void EnableProfiler(ProfilerState state) {
                  "The profiling state should be disabled when calling ",
                  "EnableProfiler.");
   g_state = state;
-  if (g_state == ProfilerState::kCUDA) {
-    g_profiler_place = "CUDA";
-  } else if (g_state == ProfilerState::kCPU) {
-    g_profiler_place = "CPU";
-  } else {
-    g_profiler_place = "All";
+  if (g_state == ProfilerState::kAll) {
     GetDeviceTracer()->Enable();
   }
 #ifdef PADDLE_WITH_CUDA
@@ -240,27 +261,63 @@ std::vector<std::vector<Event>> GetAllEvents() {
   return result;
 }
 
-void DisableProfiler(EventSortingKey sorted_key,
-                     const std::string& profile_path) {
-  PADDLE_ENFORCE(g_state != ProfilerState::kDisabled,
-                 "Can't disable profiling, since it's not starting.");
-  // Mark the profiling stop.
-  Mark("_stop_profiler_", nullptr);
-  g_state = ProfilerState::kDisabled;
+// One row of the profiling report: the per-event statistics that get printed.
+struct EventItem {
+  std::string name;   // printed in the "Event" column
+  int calls;          // printed in the "Calls" column
+  double total_time;  // "Total" column; the report header states the unit is ms
+  double min_time;    // "Min." column
+  double max_time;    // "Max." column
+  double ave_time;    // "Ave." column
+};
+
+// Print the profiling report (header and per-event table) to std::cout
+void PrintProfiler(const std::vector<std::vector<EventItem>>& events_table,
+                   const std::string& sorted_domain, const size_t name_width,
+                   const size_t data_width) {
+  // Output header information
+  std::cout << "\n------------------------->"
+            << "     Profiling Report     "
+            << "<-------------------------\n\n";
+  std::string place;
+  if (g_state == ProfilerState::kCPU) {
+    place = "CPU";
+  } else if (g_state == ProfilerState::kCUDA) {
+    place = "CUDA";
+  } else if (g_state == ProfilerState::kAll) {
+    place = "All";
+  } else {
+    PADDLE_THROW("Invalid profiler state");
+  }
 
-  std::vector<std::vector<Event>> all_events = GetAllEvents();
-  ParseEvents(all_events, sorted_key);
-  ResetProfiler();
-  DeviceTracer* tracer = GetDeviceTracer();
-  if (g_profiler_place == "All" && tracer && tracer->IsEnabled()) {
-    tracer->Disable();
-    tracer->GenProfile(profile_path);
+  std::cout << "Place: " << place << std::endl;
+  std::cout << "Time unit: ms" << std::endl;
+  std::cout << "Sorted by " << sorted_domain
+            << " in descending order in the same thread\n\n";
+  // Output events table
+  std::cout.setf(std::ios::left);
+  std::cout << std::setw(name_width) << "Event" << std::setw(data_width)
+            << "Calls" << std::setw(data_width) << "Total"
+            << std::setw(data_width) << "Min." << std::setw(data_width)
+            << "Max." << std::setw(data_width) << "Ave." << std::endl;
+  for (size_t i = 0; i < events_table.size(); ++i) {
+    for (size_t j = 0; j < events_table[i].size(); ++j) {
+      const EventItem& event_item = events_table[i][j];
+      std::cout << std::setw(name_width) << event_item.name
+                << std::setw(data_width) << event_item.calls
+                << std::setw(data_width) << event_item.total_time
+                << std::setw(data_width) << event_item.min_time
+                << std::setw(data_width) << event_item.max_time
+                << std::setw(data_width) << event_item.ave_time << std::endl;
+    }
   }
+  std::cout << std::endl;
 }
 
-void ParseEvents(std::vector<std::vector<Event>>& events,
-                 EventSortingKey sorted_by) {
-  if (g_profiler_place == "") return;
+// Parse the event list and output the profiling report
+void ParseEvents(const std::vector<std::vector<Event>>& events,
+                 EventSortingKey sorted_by = EventSortingKey::kDefault) {
+  if (g_state == ProfilerState::kDisabled) return;
 
   std::string sorted_domain;
   std::function<bool(const EventItem&, const EventItem&)> sorted_func;
@@ -307,9 +364,9 @@ void ParseEvents(std::vector<std::vector<Event>>& events,
     std::unordered_map<std::string, int> event_idx;
 
     for (size_t j = 0; j < events[i].size(); j++) {
-      if (events[i][j].kind() == "push") {
+      if (events[i][j].type() == EventType::kPushRange) {
         pushed_events.push_back(events[i][j]);
-      } else if (events[i][j].kind() == "pop") {
+      } else if (events[i][j].type() == EventType::kPopRange) {
         std::list<Event>::reverse_iterator rit = pushed_events.rbegin();
         while (rit != pushed_events.rend() &&
                rit->name() != events[i][j].name()) {
@@ -317,10 +374,10 @@ void ParseEvents(std::vector<std::vector<Event>>& events,
         }
 
         if (rit != pushed_events.rend()) {
-          double event_time =
-              (g_profiler_place == "CUDA" || g_profiler_place == "All")
-                  ? rit->CudaElapsedMs(events[i][j])
-                  : rit->CpuElapsedMs(events[i][j]);
+          double event_time = (g_state == ProfilerState::kCUDA ||
+                               g_state == ProfilerState::kAll)
+                                  ? rit->CudaElapsedMs(events[i][j])
+                                  : rit->CpuElapsedMs(events[i][j]);
 
           std::string event_name =
               "thread" + std::to_string(rit->thread_id()) + "::" + rit->name();
@@ -376,35 +433,22 @@ void ParseEvents(std::vector<std::vector<Event>>& events,
   PrintProfiler(events_table, sorted_domain, max_name_width + 4, 12);
 }
 
-void PrintProfiler(std::vector<std::vector<EventItem>>& events_table,
-                   std::string& sorted_domain, const size_t name_width,
-                   const size_t data_width) {
-  // Output header information
-  std::cout << "\n------------------------->"
-            << "     Profiling Report     "
-            << "<-------------------------\n\n";
-  std::cout << "Place: " << g_profiler_place << std::endl;
-  std::cout << "Time unit: ms" << std::endl;
-  std::cout << "Sorted by " << sorted_domain
-            << " in descending order in the same thread\n\n";
-  // Output events table
-  std::cout.setf(std::ios::left);
-  std::cout << std::setw(name_width) << "Event" << std::setw(data_width)
-            << "Calls" << std::setw(data_width) << "Total"
-            << std::setw(data_width) << "Min." << std::setw(data_width)
-            << "Max." << std::setw(data_width) << "Ave." << std::endl;
-  for (size_t i = 0; i < events_table.size(); ++i) {
-    for (size_t j = 0; j < events_table[i].size(); ++j) {
-      EventItem& event_item = events_table[i][j];
-      std::cout << std::setw(name_width) << event_item.name
-                << std::setw(data_width) << event_item.calls
-                << std::setw(data_width) << event_item.total_time
-                << std::setw(data_width) << event_item.min_time
-                << std::setw(data_width) << event_item.max_time
-                << std::setw(data_width) << event_item.ave_time << std::endl;
-    }
+void DisableProfiler(EventSortingKey sorted_key,
+                     const std::string& profile_path) {
+  PADDLE_ENFORCE(g_state != ProfilerState::kDisabled,
+                 "Can't disable profiling, since it's not starting.");
+  // Mark the profiling stop.
+  Mark("_stop_profiler_", nullptr);
+
+  std::vector<std::vector<Event>> all_events = GetAllEvents();
+  ParseEvents(all_events, sorted_key);
+  ResetProfiler();
+  DeviceTracer* tracer = GetDeviceTracer();
+  if (g_state == ProfilerState::kAll && tracer && tracer->IsEnabled()) {
+    tracer->Disable();
+    tracer->GenProfile(profile_path);
   }
-  std::cout << std::endl;
+  g_state = ProfilerState::kDisabled;
 }
 
 }  // namespace platform
diff --git a/paddle/fluid/platform/profiler.h b/paddle/fluid/platform/profiler.h
index de9a5cc20d..b07427c8f6 100644
--- a/paddle/fluid/platform/profiler.h
+++ b/paddle/fluid/platform/profiler.h
@@ -15,7 +15,7 @@ limitations under the License. */
 #pragma once
 #include <forward_list>
 #include <list>
-#include <mutex>
+#include <string>
 #include <vector>
 #include "paddle/fluid/platform/device_context.h"
 #include "paddle/fluid/platform/profiler.pb.h"
@@ -23,16 +23,16 @@ limitations under the License. */
 namespace paddle {
 namespace platform {
 
-enum EventKind { kMark, kPushRange, kPopRange };
+enum EventType { kMark, kPushRange, kPopRange };
 
 class Event {
  public:
   // The DeviceContext is used to get the cuda stream.
   // If CPU profiling mode, can pass nullptr.
-  Event(EventKind kind, std::string name, uint32_t thread_id,
+  Event(EventType type, std::string name, uint32_t thread_id,
         const DeviceContext* dev_ctx);
 
-  std::string kind() const;
+  const EventType& type() const;
   std::string name() const { return name_; }
   uint32_t thread_id() const { return thread_id_; }
   bool has_cuda() const { return has_cuda_; }
@@ -46,7 +46,7 @@ class Event {
   double CudaElapsedMs(const Event& e) const;
 
  private:
-  EventKind kind_;
+  EventType type_;
   std::string name_;
   uint32_t thread_id_;
   int64_t cpu_ns_;
@@ -57,39 +57,6 @@ class Event {
 #endif
 };
 
-struct EventList {
-  constexpr static size_t kMB = 1024 * 1024;
-  constexpr static size_t kEventBlockSize = 16 * kMB;
-  constexpr static size_t kEventSize = sizeof(Event);
-  constexpr static size_t kEventAlign = alignof(Event);
-  constexpr static size_t kNumBlock =
-      kEventBlockSize /
-      ((kEventSize + kEventAlign - 1) / kEventAlign * kEventAlign);
-
-  template <typename... Args>
-  void Record(Args&&... args) {
-    if (event_blocks.empty() || event_blocks.front().size() == kNumBlock) {
-      event_blocks.emplace_front();
-      event_blocks.front().reserve(kNumBlock);
-    }
-    event_blocks.front().emplace_back(std::forward<Args>(args)...);
-  }
-
-  std::vector<Event> Reduce() {
-    std::vector<Event> result;
-    for (auto& block : event_blocks) {
-      result.insert(result.begin(), std::make_move_iterator(block.begin()),
-                    std::make_move_iterator(block.end()));
-    }
-    event_blocks.clear();
-    return result;
-  }
-
-  void Clear() { event_blocks.clear(); }
-
-  std::forward_list<std::vector<Event>> event_blocks;
-};
-
 enum ProfilerState {
   kDisabled,  // disabled state
   kCPU,       // CPU profiling state
@@ -136,16 +103,6 @@ struct RecordThread {
 // event_lists, event_lists[i][j] represents the j-th Event of i-th thread.
 std::vector<std::vector<Event>> GetAllEvents();
 
-// The information of each event given in the profiling report
-struct EventItem {
-  std::string name;
-  int calls;
-  double total_time;
-  double min_time;
-  double max_time;
-  double ave_time;
-};
-
 // Candidate keys to sort the profiling report
 enum EventSortingKey { kDefault, kCalls, kTotal, kMin, kMax, kAve };
 
@@ -158,14 +115,5 @@ void ResetProfiler();
 void DisableProfiler(EventSortingKey sorted_key,
                      const std::string& profile_path);
 
-// Parse the event list and output the profiling report
-void ParseEvents(std::vector<std::vector<Event>>&,
-                 EventSortingKey sorted_by = EventSortingKey::kDefault);
-
-// Print results
-void PrintProfiler(std::vector<std::vector<EventItem>>& events_table,
-                   std::string& sorted_domain, const size_t name_width,
-                   const size_t data_width);
-
 }  // namespace platform
 }  // namespace paddle
diff --git a/paddle/fluid/platform/profiler_test.cc b/paddle/fluid/platform/profiler_test.cc
index 45cc271bb8..61f467814b 100644
--- a/paddle/fluid/platform/profiler_test.cc
+++ b/paddle/fluid/platform/profiler_test.cc
@@ -13,22 +13,23 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "paddle/fluid/platform/profiler.h"
+#include <string>
 #ifdef PADDLE_WITH_CUDA
-#include "cuda_runtime.h"
+#include <cuda_runtime.h>
 #endif
 #include "gtest/gtest.h"
 
 TEST(Event, CpuElapsedTime) {
   using paddle::platform::Event;
-  using paddle::platform::EventKind;
+  using paddle::platform::EventType;
 
-  Event start_event(EventKind::kPushRange, "test", 0, nullptr);
+  Event start_event(EventType::kPushRange, "test", 0, nullptr);
   EXPECT_TRUE(start_event.has_cuda() == false);
   int counter = 0;
   while (counter != 1000) {
     counter++;
   }
-  Event stop_event(EventKind::kPopRange, "test", 0, nullptr);
+  Event stop_event(EventType::kPopRange, "test", 0, nullptr);
   EXPECT_GT(start_event.CpuElapsedMs(stop_event), 0);
 }
 
@@ -38,16 +39,16 @@ TEST(Event, CudaElapsedTime) {
   using paddle::platform::CUDADeviceContext;
   using paddle::platform::CUDAPlace;
   using paddle::platform::Event;
-  using paddle::platform::EventKind;
+  using paddle::platform::EventType;
 
   DeviceContext* dev_ctx = new CUDADeviceContext(CUDAPlace(0));
-  Event start_event(EventKind::kPushRange, "test", 0, dev_ctx);
+  Event start_event(EventType::kPushRange, "test", 0, dev_ctx);
   EXPECT_TRUE(start_event.has_cuda() == true);
   int counter = 0;
   while (counter != 1000) {
     counter++;
   }
-  Event stop_event(EventKind::kPopRange, "test", 0, dev_ctx);
+  Event stop_event(EventType::kPopRange, "test", 0, dev_ctx);
   EXPECT_GT(start_event.CudaElapsedMs(stop_event), 0);
 }
 #endif
@@ -55,7 +56,7 @@ TEST(Event, CudaElapsedTime) {
 TEST(RecordEvent, RecordEvent) {
   using paddle::platform::DeviceContext;
   using paddle::platform::Event;
-  using paddle::platform::EventKind;
+  using paddle::platform::EventType;
   using paddle::platform::RecordEvent;
   using paddle::platform::ProfilerState;
   using paddle::platform::EventSortingKey;

From 8dbd9c394e9cb97926320e113b112431ef509ec5 Mon Sep 17 00:00:00 2001
From: Yi Wang <yi.wang.2005@gmail.com>
Date: Mon, 9 Apr 2018 15:07:35 -0700
Subject: [PATCH 13/14] Fix part of the cpplint errors in fluid/platform
 (#9802)

---
 cmake/external/mkldnn.cmake                  |  3 ++-
 paddle/fluid/platform/device_context.cc      |  6 +++++-
 paddle/fluid/platform/device_context.h       |  3 ++-
 paddle/fluid/platform/device_context_test.cu |  5 +++--
 paddle/fluid/platform/device_tracer.cc       | 14 +++++++++-----
 paddle/fluid/platform/device_tracer.h        |  4 +++-
 paddle/fluid/platform/mkldnn_helper.h        |  4 ++--
 7 files changed, 26 insertions(+), 13 deletions(-)

diff --git a/cmake/external/mkldnn.cmake b/cmake/external/mkldnn.cmake
index a25cff5fc5..5759e5c489 100644
--- a/cmake/external/mkldnn.cmake
+++ b/cmake/external/mkldnn.cmake
@@ -36,7 +36,8 @@ MESSAGE(STATUS "Set ${MKLDNN_INSTALL_DIR}/lib to runtime path")
 SET(CMAKE_INSTALL_RPATH_USE_LINK_PATH TRUE)
 SET(CMAKE_INSTALL_RPATH "${CMAKE_INSTALL_RPATH}" "${MKLDNN_INSTALL_DIR}/lib")
 
-INCLUDE_DIRECTORIES(${MKLDNN_INC_DIR})
+INCLUDE_DIRECTORIES(${MKLDNN_INC_DIR}) # For MKLDNN code to include internal headers.
+INCLUDE_DIRECTORIES(${THIRD_PARTY_PATH}/install) # For Paddle code to include mkldnn.h
 
 IF(${CBLAS_PROVIDER} STREQUAL "MKLML")
     SET(MKLDNN_DEPENDS   ${MKLML_PROJECT})
diff --git a/paddle/fluid/platform/device_context.cc b/paddle/fluid/platform/device_context.cc
index feb4f36700..f03165fae5 100644
--- a/paddle/fluid/platform/device_context.cc
+++ b/paddle/fluid/platform/device_context.cc
@@ -8,10 +8,14 @@ distributed under the License is distributed on an "AS IS" BASIS,
 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
-
 #include "paddle/fluid/platform/device_context.h"
+
+#include <string>
 #include <unordered_set>
+#include <vector>
+
 #include "paddle/fluid/memory/memory.h"
+
 namespace paddle {
 namespace platform {
 
diff --git a/paddle/fluid/platform/device_context.h b/paddle/fluid/platform/device_context.h
index 6b796d92d0..b175583379 100644
--- a/paddle/fluid/platform/device_context.h
+++ b/paddle/fluid/platform/device_context.h
@@ -8,11 +8,12 @@ distributed under the License is distributed on an "AS IS" BASIS,
 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
-
 #pragma once
 
 #include <memory>
+#include <string>
 #include <unordered_map>
+#include <vector>
 
 #ifdef PADDLE_WITH_CUDA
 #include "paddle/fluid/platform/dynload/cublas.h"
diff --git a/paddle/fluid/platform/device_context_test.cu b/paddle/fluid/platform/device_context_test.cu
index 9d8d07362c..fa806aba6d 100644
--- a/paddle/fluid/platform/device_context_test.cu
+++ b/paddle/fluid/platform/device_context_test.cu
@@ -11,11 +11,12 @@ distributed under the License is distributed on an "AS IS" BASIS,
 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
-
-#include "gtest/gtest.h"
 #include "paddle/fluid/platform/device_context.h"
 
+#include <vector>
+
 #include "glog/logging.h"
+#include "gtest/gtest.h"
 
 TEST(Device, Init) {
   using paddle::platform::DeviceContext;
diff --git a/paddle/fluid/platform/device_tracer.cc b/paddle/fluid/platform/device_tracer.cc
index 3b4437f576..c9e1063168 100644
--- a/paddle/fluid/platform/device_tracer.cc
+++ b/paddle/fluid/platform/device_tracer.cc
@@ -11,15 +11,19 @@ distributed under the License is distributed on an "AS IS" BASIS,
 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
-
 #include "paddle/fluid/platform/device_tracer.h"
-#include <google/protobuf/text_format.h>
+
+#include <deque>
 #include <fstream>
 #include <map>
-#include <mutex>
+#include <mutex>  // NOLINT
 #include <numeric>
-#include <thread>
+#include <string>
+#include <thread>  // NOLINT
+#include <vector>
+
 #include "glog/logging.h"
+#include "google/protobuf/text_format.h"
 #include "paddle/fluid/framework/block_desc.h"
 #include "paddle/fluid/string/printf.h"
 
@@ -123,7 +127,7 @@ void DisableActivity() {
 
 void CUPTIAPI bufferRequested(uint8_t **buffer, size_t *size,
                               size_t *maxNumRecords) {
-  uint8_t *buf = (uint8_t *)malloc(kBufSize + kAlignSize);
+  uint8_t *buf = reinterpret_cast<uint8_t *>(malloc(kBufSize + kAlignSize));
   *size = kBufSize;
   *buffer = ALIGN_BUFFER(buf, kAlignSize);
   *maxNumRecords = 0;
diff --git a/paddle/fluid/platform/device_tracer.h b/paddle/fluid/platform/device_tracer.h
index deb3d23f78..0375c7439c 100644
--- a/paddle/fluid/platform/device_tracer.h
+++ b/paddle/fluid/platform/device_tracer.h
@@ -11,8 +11,10 @@ distributed under the License is distributed on an "AS IS" BASIS,
 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
-
 #pragma once
+
+#include <string>
+
 #include "paddle/fluid/platform/dynload/cupti.h"
 #include "paddle/fluid/platform/profiler.pb.h"
 
diff --git a/paddle/fluid/platform/mkldnn_helper.h b/paddle/fluid/platform/mkldnn_helper.h
index 90b78142b8..de8056237f 100644
--- a/paddle/fluid/platform/mkldnn_helper.h
+++ b/paddle/fluid/platform/mkldnn_helper.h
@@ -11,11 +11,11 @@ distributed under the License is distributed on an "AS IS" BASIS,
 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
-
 #pragma once
 
-#include <mkldnn.hpp>
+#include <vector>
 
+#include "mkldnn/include/mkldnn.hpp"
 #include "paddle/fluid/framework/operator.h"
 
 namespace paddle {

From 0f38bb4593e9ffe39f1d8e3f9a7ceb9becc60098 Mon Sep 17 00:00:00 2001
From: Kexin Zhao <kexin.zhao.paddle@gmail.com>
Date: Mon, 9 Apr 2018 19:38:06 -0700
Subject: [PATCH 14/14] add fp16 support to activation op (#9769)

---
 paddle/fluid/operators/activation_op.cc       |  11 -
 paddle/fluid/operators/activation_op.cu       |  41 +-
 paddle/fluid/operators/activation_op.h        |  20 +-
 paddle/fluid/platform/float16.h               |  40 +
 .../tests/unittests/test_activation_op.py     | 780 +++++++++++++++---
 5 files changed, 742 insertions(+), 150 deletions(-)

diff --git a/paddle/fluid/operators/activation_op.cc b/paddle/fluid/operators/activation_op.cc
index a6d9ce0f04..b261144f3d 100644
--- a/paddle/fluid/operators/activation_op.cc
+++ b/paddle/fluid/operators/activation_op.cc
@@ -662,14 +662,3 @@ REGISTER_OP(swish, ops::ActivationOp, ops::SwishOpMaker, swish_grad,
                                 ops::grad_functor<double>>);
 
 FOR_EACH_KERNEL_FUNCTOR(REGISTER_ACTIVATION_CPU_KERNEL);
-
-REGISTER_OP_CPU_KERNEL(relu,
-                       ops::ActivationKernel<paddle::platform::CPUDeviceContext,
-                                             ops::ReluFunctor<float>>,
-                       ops::ActivationKernel<paddle::platform::CPUDeviceContext,
-                                             ops::ReluFunctor<double>>);
-REGISTER_OP_CPU_KERNEL(
-    relu_grad, ops::ActivationGradKernel<paddle::platform::CPUDeviceContext,
-                                         ops::ReluGradFunctor<float>>,
-    ops::ActivationGradKernel<paddle::platform::CPUDeviceContext,
-                              ops::ReluGradFunctor<double>>);
diff --git a/paddle/fluid/operators/activation_op.cu b/paddle/fluid/operators/activation_op.cu
index 7709a551dc..4f745553c1 100644
--- a/paddle/fluid/operators/activation_op.cu
+++ b/paddle/fluid/operators/activation_op.cu
@@ -1,11 +1,8 @@
 /* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
 You may obtain a copy of the License at
-
     http://www.apache.org/licenses/LICENSE-2.0
-
 Unless required by applicable law or agreed to in writing, software
 distributed under the License is distributed on an "AS IS" BASIS,
 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@@ -17,31 +14,19 @@ limitations under the License. */
 #include "paddle/fluid/platform/float16.h"
 
 namespace ops = paddle::operators;
-
-#define REGISTER_ACTIVATION_CUDA_KERNEL(act_type, functor, grad_functor)   \
-  REGISTER_OP_CUDA_KERNEL(                                                 \
-      act_type, ops::ActivationKernel<paddle::platform::CUDADeviceContext, \
-                                      ops::functor<float>>,                \
-      ops::ActivationKernel<paddle::platform::CUDADeviceContext,           \
-                            ops::functor<double>>);                        \
-  REGISTER_OP_CUDA_KERNEL(                                                 \
-      act_type##_grad,                                                     \
-      ops::ActivationGradKernel<paddle::platform::CUDADeviceContext,       \
-                                ops::grad_functor<float>>,                 \
-      ops::ActivationGradKernel<paddle::platform::CUDADeviceContext,       \
+namespace plat = paddle::platform;
+
+#define REGISTER_ACTIVATION_CUDA_KERNEL(act_type, functor, grad_functor)    \
+  REGISTER_OP_CUDA_KERNEL(                                                  \
+      act_type,                                                             \
+      ops::ActivationKernel<plat::CUDADeviceContext, ops::functor<float>>,  \
+      ops::ActivationKernel<plat::CUDADeviceContext, ops::functor<double>>, \
+      ops::ActivationKernel<plat::CUDADeviceContext,                        \
+                            ops::functor<plat::float16>>);                  \
+  REGISTER_OP_CUDA_KERNEL(                                                  \
+      act_type##_grad, ops::ActivationGradKernel<plat::CUDADeviceContext,   \
+                                                 ops::grad_functor<float>>, \
+      ops::ActivationGradKernel<plat::CUDADeviceContext,                    \
                                 ops::grad_functor<double>>);
 
 FOR_EACH_KERNEL_FUNCTOR(REGISTER_ACTIVATION_CUDA_KERNEL);
-
-REGISTER_OP_CUDA_KERNEL(
-    relu, ops::ActivationKernel<paddle::platform::CUDADeviceContext,
-                                ops::ReluFunctor<float>>,
-    ops::ActivationKernel<paddle::platform::CUDADeviceContext,
-                          ops::ReluFunctor<double>>,
-    ops::ActivationKernel<paddle::platform::CUDADeviceContext,
-                          ops::ReluFunctor<paddle::platform::float16>>);
-REGISTER_OP_CUDA_KERNEL(
-    relu_grad, ops::ActivationGradKernel<paddle::platform::CUDADeviceContext,
-                                         ops::ReluGradFunctor<float>>,
-    ops::ActivationGradKernel<paddle::platform::CUDADeviceContext,
-                              ops::ReluGradFunctor<double>>);
diff --git a/paddle/fluid/operators/activation_op.h b/paddle/fluid/operators/activation_op.h
index c4efbcd3f9..43856780bf 100644
--- a/paddle/fluid/operators/activation_op.h
+++ b/paddle/fluid/operators/activation_op.h
@@ -1,11 +1,8 @@
 /* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
 You may obtain a copy of the License at
-
     http://www.apache.org/licenses/LICENSE-2.0
-
 Unless required by applicable law or agreed to in writing, software
 distributed under the License is distributed on an "AS IS" BASIS,
 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@@ -15,9 +12,11 @@ limitations under the License. */
 #pragma once
 #include <utility>
 #include <vector>
+
 #include "paddle/fluid/framework/eigen.h"
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/operators/detail/safe_ref.h"
+#include "paddle/fluid/platform/float16.h"
 
 #ifdef PADDLE_WITH_MKLDNN
 #include "paddle/fluid/platform/mkldnn_helper.h"
@@ -338,11 +337,25 @@ struct Sine {
   HOSTDEVICE T operator()(const T& val) const { return sin(val); }
 };
 
+template <>  // float16: cast input to float, call sin, convert back
+struct Sine<platform::float16> {
+  HOSTDEVICE platform::float16 operator()(const platform::float16& val) const {
+    return platform::float16(sin(static_cast<float>(val)));
+  }
+};
+
 template <typename T>
 struct Cosine {
   HOSTDEVICE T operator()(const T& val) const { return cos(val); }
 };
 
+template <>  // float16: cast input to float, call cos, convert back
+struct Cosine<platform::float16> {
+  HOSTDEVICE platform::float16 operator()(const platform::float16& val) const {
+    return platform::float16(cos(static_cast<float>(val)));
+  }
+};
+
 // cosine'(x) = -sin(x)
 template <typename T>
 struct CosGradFunctor : public BaseActivationFunctor<T> {
@@ -826,6 +839,7 @@ struct SwishGradFunctor : public BaseActivationFunctor<T> {
   __macro(sigmoid, SigmoidFunctor, SigmoidGradFunctor);              \
   __macro(logsigmoid, LogSigmoidFunctor, LogSigmoidGradFunctor);     \
   __macro(exp, ExpFunctor, ExpGradFunctor);                          \
+  __macro(relu, ReluFunctor, ReluGradFunctor);                       \
   __macro(tanh, TanhFunctor, TanhGradFunctor);                       \
   __macro(softshrink, SoftShrinkFunctor, SoftShrinkGradFunctor);     \
   __macro(sqrt, SqrtFunctor, SqrtGradFunctor);                       \
diff --git a/paddle/fluid/platform/float16.h b/paddle/fluid/platform/float16.h
index e77f768bf9..673e1bcae4 100644
--- a/paddle/fluid/platform/float16.h
+++ b/paddle/fluid/platform/float16.h
@@ -1003,6 +1003,46 @@ HOSTDEVICE inline float16 exp(const float16& a) {
   return float16(::expf(static_cast<float>(a)));
 }
 
+template <>  // float16 log: widen to float, call logf, convert back
+HOSTDEVICE inline float16 log(const float16& a) {
+  return float16(::logf(static_cast<float>(a)));
+}
+
+template <>  // float16 tanh: widen to float, call tanhf, convert back
+HOSTDEVICE inline float16 tanh(const float16& a) {
+  return float16(::tanhf(static_cast<float>(a)));
+}
+
+template <>  // float16 sqrt: widen to float, call sqrtf, convert back
+HOSTDEVICE inline float16 sqrt(const float16& a) {
+  return float16(::sqrtf(static_cast<float>(a)));
+}
+
+template <>  // float16 ceil: widen to float, call ceilf, convert back
+HOSTDEVICE inline float16 ceil(const float16& a) {
+  return float16(::ceilf(static_cast<float>(a)));
+}
+
+template <>  // float16 floor: widen to float, call floorf, convert back
+HOSTDEVICE inline float16 floor(const float16& a) {
+  return float16(::floorf(static_cast<float>(a)));
+}
+
+template <>  // float16 round: widen to float, call roundf, convert back
+HOSTDEVICE inline float16 round(const float16& a) {
+  return float16(::roundf(static_cast<float>(a)));
+}
+
+template <>  // float16 pow: widen both args to float, call powf, convert back
+HOSTDEVICE inline float16 pow(const float16& a, const float16& b) {
+  return float16(::powf(static_cast<float>(a), static_cast<float>(b)));
+}
+
+template <>  // float16 abs: widen to float, call fabsf, convert back
+HOSTDEVICE inline float16 abs(const float16& a) {
+  return float16(::fabsf(static_cast<float>(a)));
+}
+
 }  // namespace numext
 
 }  // namespace Eigen
diff --git a/python/paddle/fluid/tests/unittests/test_activation_op.py b/python/paddle/fluid/tests/unittests/test_activation_op.py
index c5b53902bc..57d4a50e91 100644
--- a/python/paddle/fluid/tests/unittests/test_activation_op.py
+++ b/python/paddle/fluid/tests/unittests/test_activation_op.py
@@ -22,221 +22,504 @@ from scipy.special import expit
 class TestExp(OpTest):
     def setUp(self):
         self.op_type = "exp"
-        self.inputs = {
-            'X': np.random.uniform(0.1, 1, [11, 17]).astype("float32")
-        }
-        self.outputs = {'Out': np.exp(self.inputs['X'])}
+        self.dtype = np.float32
+        self.init_dtype()
+
+        x = np.random.uniform(0.1, 1, [11, 17]).astype(self.dtype)
+        out = np.exp(x)
+
+        self.inputs = {'X': OpTest.np_dtype_to_fluid_dtype(x)}
+        self.outputs = {'Out': out}
 
     def test_check_output(self):
         self.check_output()
 
     def test_check_grad(self):
+        if self.dtype == np.float16:
+            return
         self.check_grad(['X'], 'Out', max_relative_error=0.007)
 
+    def init_dtype(self):
+        pass
+
+
+class TestFP16Exp(TestExp):
+    def init_dtype(self):
+        self.dtype = np.float16
+
+    def test_check_output(self):
+        if core.is_compiled_with_cuda():
+            place = core.CUDAPlace(0)
+            if core.is_float16_supported(place):
+                self.check_output_with_place(place, atol=1e-3)
+
 
 class TestSigmoid(OpTest):
     def setUp(self):
         self.op_type = "sigmoid"
-        self.inputs = {
-            'X': np.random.uniform(0.1, 1, [11, 17]).astype("float32")
-        }
-        self.outputs = {'Out': 1 / (1 + np.exp(-self.inputs['X']))}
+        self.dtype = np.float32
+        self.init_dtype()
+
+        x = np.random.uniform(-1, 1, [11, 17]).astype(self.dtype)
+        out = 1 / (1 + np.exp(-x))
+
+        self.inputs = {'X': OpTest.np_dtype_to_fluid_dtype(x)}
+        self.outputs = {'Out': out}
 
     def test_check_output(self):
         self.check_output()
 
     def test_check_grad(self):
-        self.check_grad(['X'], 'Out', max_relative_error=0.008)
+        if self.dtype == np.float16:
+            return
+        self.check_grad(['X'], 'Out', max_relative_error=0.01)
+
+    def init_dtype(self):
+        pass
+
+
+class TestFP16Sigmoid(TestSigmoid):
+    def init_dtype(self):
+        self.dtype = np.float16
+
+    def test_check_output(self):
+        if core.is_compiled_with_cuda():
+            place = core.CUDAPlace(0)
+            if core.is_float16_supported(place):
+                self.check_output_with_place(place, atol=1e-3)
 
 
 class TestLogSigmoid(OpTest):
     def setUp(self):
         self.op_type = "logsigmoid"
-        self.inputs = {
-            'X': np.random.uniform(-1, 1, [11, 17]).astype("float32")
-        }
-        self.outputs = {'Out': np.log(1 / (1 + np.exp(-self.inputs['X'])))}
+        self.dtype = np.float32
+        self.init_dtype()
+
+        x = np.random.uniform(-1, 1, [11, 17]).astype(self.dtype)
+        out = np.log(1 / (1 + np.exp(-x)))
+
+        self.inputs = {'X': OpTest.np_dtype_to_fluid_dtype(x)}
+        self.outputs = {'Out': out}
 
     def test_check_output(self):
         self.check_output()
 
     def test_check_grad(self):
+        if self.dtype == np.float16:
+            return
         self.check_grad(['X'], 'Out', max_relative_error=0.008)
 
+    def init_dtype(self):
+        pass
+
+
+class TestFP16LogSigmoid(TestLogSigmoid):
+    def init_dtype(self):
+        self.dtype = np.float16
+
+    def test_check_output(self):
+        if core.is_compiled_with_cuda():
+            place = core.CUDAPlace(0)
+            if core.is_float16_supported(place):
+                self.check_output_with_place(place, atol=1e-3)
+
 
 class TestTanh(OpTest):
     def setUp(self):
         self.op_type = "tanh"
-        self.inputs = {
-            'X': np.random.uniform(0.1, 1, [11, 17]).astype("float32")
-        }
-        self.outputs = {'Out': np.tanh(self.inputs['X'])}
+        self.dtype = np.float32
+        self.init_dtype()
+
+        x = np.random.uniform(0.1, 1, [11, 17]).astype(self.dtype)
+        out = np.tanh(x)
+
+        self.inputs = {'X': OpTest.np_dtype_to_fluid_dtype(x)}
+        self.outputs = {'Out': out}
 
     def test_check_output(self):
         self.check_output()
 
     def test_check_grad(self):
+        if self.dtype == np.float16:
+            return
         self.check_grad(['X'], 'Out', max_relative_error=0.007)
 
+    def init_dtype(self):
+        pass
+
+
+class TestFP16Tanh(TestTanh):
+    def init_dtype(self):
+        self.dtype = np.float16
+
+    def test_check_output(self):
+        if core.is_compiled_with_cuda():
+            place = core.CUDAPlace(0)
+            if core.is_float16_supported(place):
+                self.check_output_with_place(place, atol=1e-3)
+
 
 class TestTanhShrink(OpTest):
     def setUp(self):
         self.op_type = "tanh_shrink"
-        self.inputs = {
-            'X': np.random.uniform(0.1, 1, [10, 17]).astype("float32")
-        }
-        self.outputs = {'Out': self.inputs['X'] - np.tanh(self.inputs['X'])}
+        self.dtype = np.float32
+        self.init_dtype()
+
+        x = np.random.uniform(0.1, 1, [10, 17]).astype(self.dtype)
+        out = x - np.tanh(x)
+
+        self.inputs = {'X': OpTest.np_dtype_to_fluid_dtype(x)}
+        self.outputs = {'Out': out}
 
     def test_check_output(self):
         self.check_output()
 
     def test_check_grad(self):
+        if self.dtype == np.float16:
+            return
         self.check_grad(['X'], 'Out', max_relative_error=0.008)
 
+    def init_dtype(self):
+        pass
+
+
+class TestFP16TanhShrink(TestTanhShrink):
+    def init_dtype(self):
+        self.dtype = np.float16
+
+    def test_check_output(self):
+        if core.is_compiled_with_cuda():
+            place = core.CUDAPlace(0)
+            if core.is_float16_supported(place):
+                self.check_output_with_place(place, atol=1e-3)
+
 
 class TestHardShrink(OpTest):
     def setUp(self):
         self.op_type = "hard_shrink"
-        x = np.random.uniform(-1, 1, [4, 4]).astype("float32")
+        self.dtype = np.float32
+        self.init_dtype()
+
         threshold = 0.5
+        x = np.random.uniform(-1, 1, [4, 4]).astype(self.dtype)
+        out = np.copy(x)
+        out[(out >= -threshold) & (out <= threshold)] = 0
 
-        self.inputs = {'X': x}
         self.attrs = {'lambda': threshold}
-
-        t = np.copy(x)
-        t[(t >= -threshold) & (t <= threshold)] = 0
-        self.outputs = {'Out': t}
+        self.inputs = {'X': OpTest.np_dtype_to_fluid_dtype(x)}
+        self.outputs = {'Out': out}
 
     def test_check_output(self):
         self.check_output()
 
     def test_check_grad(self):
+        if self.dtype == np.float16:
+            return
         self.check_grad(['X'], 'Out', max_relative_error=0.005)
 
+    def init_dtype(self):
+        pass
+
+
+class TestFP16HardShrink(TestHardShrink):
+    def init_dtype(self):
+        self.dtype = np.float16
+
+    def test_check_output(self):
+        if core.is_compiled_with_cuda():
+            place = core.CUDAPlace(0)
+            if core.is_float16_supported(place):
+                self.check_output_with_place(place, atol=1e-3)
+
 
 class TestSoftShrink(OpTest):
     def setUp(self):
         self.op_type = "softshrink"
+        self.dtype = np.float32
+        self.init_dtype()
+
         lambda_val = 0.1
+        x = np.random.uniform(0.25, 10, [4, 4]).astype(self.dtype)
+        out = np.copy(x)
+        out = (out < -lambda_val) * (out + lambda_val) + (out > lambda_val) * (
+            out - lambda_val)
+
         self.attrs = {'lambda': lambda_val}
-        self.inputs = {
-            'X': np.random.uniform(0.25, 10, [4, 4]).astype("float32")
-        }
-        y = np.copy(self.inputs['X'])
-        y = (y < -lambda_val) * (y + lambda_val) + (y > lambda_val) * (
-            y - lambda_val)
-        self.outputs = {'Out': y}
+        self.inputs = {'X': OpTest.np_dtype_to_fluid_dtype(x)}
+        self.outputs = {'Out': out}
 
     def test_check_output(self):
         self.check_output()
 
     def test_check_grad(self):
+        if self.dtype == np.float16:
+            return
         self.check_grad(['X'], 'Out', max_relative_error=0.007)
 
+    def init_dtype(self):
+        pass
+
+
+class TestFP16SoftShrink(TestSoftShrink):
+    def init_dtype(self):
+        self.dtype = np.float16
+
+    def test_check_output(self):
+        if core.is_compiled_with_cuda():
+            place = core.CUDAPlace(0)
+            if core.is_float16_supported(place):
+                self.check_output_with_place(place, atol=1e-3)
+
 
 class TestSqrt(OpTest):
     def setUp(self):
         self.op_type = "sqrt"
-        self.inputs = {
-            'X': np.random.uniform(0.1, 1, [11, 17]).astype("float32")
-        }
-        self.outputs = {'Out': np.sqrt(self.inputs['X'])}
+        self.dtype = np.float32
+        self.init_dtype()
+
+        x = np.random.uniform(0.1, 1, [11, 17]).astype(self.dtype)
+        out = np.sqrt(x)
+
+        self.inputs = {'X': OpTest.np_dtype_to_fluid_dtype(x)}
+        self.outputs = {'Out': out}
 
     def test_check_output(self):
         self.check_output()
 
     def test_check_grad(self):
+        if self.dtype == np.float16:
+            return
         self.check_grad(['X'], 'Out', max_relative_error=0.007)
 
+    def init_dtype(self):
+        pass
+
+
+class TestFP16Sqrt(TestSqrt):
+    def init_dtype(self):
+        self.dtype = np.float16
+
+    def test_check_output(self):
+        if core.is_compiled_with_cuda():
+            place = core.CUDAPlace(0)
+            if core.is_float16_supported(place):
+                self.check_output_with_place(place, atol=1e-3)
+
 
 class TestAbs(OpTest):
     def setUp(self):
         self.op_type = "abs"
-        x = np.random.uniform(-1, 1, [4, 4]).astype("float32")
+        self.dtype = np.float32
+        self.init_dtype()
+
+        x = np.random.uniform(-1, 1, [4, 4]).astype(self.dtype)
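-        # Because we set delta = 0.005 in caculating numeric gradient,
+        # Because we set delta = 0.005 in calculating numeric gradient,
         # if x is too small, such as 0.002, x_neg will be -0.003
-        # x_pos will be 0.007, so the numeric gradient is unaccurate.
+        # x_pos will be 0.007, so the numeric gradient is inaccurate.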
         # we should avoid this
         x[np.abs(x) < 0.005] = 0.02
-        self.inputs = {'X': x}
-        self.outputs = {'Out': np.abs(self.inputs['X'])}
+        out = np.abs(x)
+
+        self.inputs = {'X': OpTest.np_dtype_to_fluid_dtype(x)}
+        self.outputs = {'Out': out}
 
     def test_check_output(self):
         self.check_output()
 
     def test_check_grad(self):
+        if self.dtype == np.float16:
+            return
         self.check_grad(['X'], 'Out', max_relative_error=0.007)
 
+    def init_dtype(self):
+        pass
+
+
+class TestFP16Abs(TestAbs):
+    def init_dtype(self):
+        self.dtype = np.float16
+
+    def test_check_output(self):
+        if core.is_compiled_with_cuda():
+            place = core.CUDAPlace(0)
+            if core.is_float16_supported(place):
+                self.check_output_with_place(place, atol=1e-3)
+
 
 class TestCeil(OpTest):
     def setUp(self):
         self.op_type = "ceil"
-        x = np.random.uniform(-1, 1, [4, 4]).astype("float32")
-        self.inputs = {'X': x}
-        self.outputs = {'Out': np.ceil(self.inputs['X'])}
+        self.dtype = np.float32
+        self.init_dtype()
+
+        x = np.random.uniform(-1, 1, [4, 4]).astype(self.dtype)
+        out = np.ceil(x)
+
+        self.inputs = {'X': OpTest.np_dtype_to_fluid_dtype(x)}
+        self.outputs = {'Out': out}
 
     def test_check_output(self):
         self.check_output()
 
     def test_check_grad(self):
+        if self.dtype == np.float16:
+            return
         self.check_grad(['X'], 'Out', max_relative_error=0.007)
 
+    def init_dtype(self):
+        pass
+
+
+class TestFP16Ceil(TestCeil):
+    def init_dtype(self):
+        self.dtype = np.float16
+
+    def test_check_output(self):
+        if core.is_compiled_with_cuda():
+            place = core.CUDAPlace(0)
+            if core.is_float16_supported(place):
+                self.check_output_with_place(place, atol=1e-3)
+
 
 class TestFloor(OpTest):
     def setUp(self):
         self.op_type = "floor"
-        x = np.random.uniform(-1, 1, [4, 4]).astype("float32")
-        self.inputs = {'X': x}
-        self.outputs = {'Out': np.floor(self.inputs['X'])}
+        self.dtype = np.float32
+        self.init_dtype()
+
+        x = np.random.uniform(-1, 1, [4, 4]).astype(self.dtype)
+        out = np.floor(x)
+
+        self.inputs = {'X': OpTest.np_dtype_to_fluid_dtype(x)}
+        self.outputs = {'Out': out}
 
     def test_check_output(self):
         self.check_output()
 
     def test_check_grad(self):
+        if self.dtype == np.float16:
+            return
         self.check_grad(['X'], 'Out', max_relative_error=0.007)
 
+    def init_dtype(self):
+        pass
+
+
+class TestFP16Floor(TestFloor):
+    def init_dtype(self):
+        self.dtype = np.float16
+
+    def test_check_output(self):
+        if core.is_compiled_with_cuda():
+            place = core.CUDAPlace(0)
+            if core.is_float16_supported(place):
+                self.check_output_with_place(place, atol=1e-3)
+
 
 class TestCos(OpTest):
     def setUp(self):
         self.op_type = "cos"
-        x = np.random.uniform(-1, 1, [4, 4]).astype("float32")
-        self.inputs = {'X': x}
-        self.outputs = {'Out': np.cos(self.inputs['X'])}
+        self.dtype = np.float32
+        self.init_dtype()
+
+        x = np.random.uniform(-1, 1, [4, 4]).astype(self.dtype)
+        out = np.cos(x)
+
+        self.inputs = {'X': OpTest.np_dtype_to_fluid_dtype(x)}
+        self.outputs = {'Out': out}
 
     def test_check_output(self):
         self.check_output()
 
     def test_check_grad(self):
+        if self.dtype == np.float16:
+            return
         self.check_grad(['X'], 'Out', max_relative_error=0.007)
 
+    def init_dtype(self):
+        pass
+
+
+class TestFP16Cos(TestCos):
+    def init_dtype(self):
+        self.dtype = np.float16
+
+    def test_check_output(self):
+        if core.is_compiled_with_cuda():
+            place = core.CUDAPlace(0)
+            if core.is_float16_supported(place):
+                self.check_output_with_place(place, atol=1e-3)
+
 
 class TestSin(OpTest):
     def setUp(self):
         self.op_type = "sin"
-        x = np.random.uniform(-1, 1, [4, 4]).astype("float32")
-        self.inputs = {'X': x}
-        self.outputs = {'Out': np.sin(self.inputs['X'])}
+        self.dtype = np.float32
+        self.init_dtype()
+
+        x = np.random.uniform(-1, 1, [4, 4]).astype(self.dtype)
+        out = np.sin(x)
+
+        self.inputs = {'X': OpTest.np_dtype_to_fluid_dtype(x)}
+        self.outputs = {'Out': out}
 
     def test_check_output(self):
         self.check_output()
 
     def test_check_grad(self):
+        if self.dtype == np.float16:
+            return
         self.check_grad(['X'], 'Out', max_relative_error=0.007)
 
+    def init_dtype(self):
+        pass
+
+
+class TestFP16Sin(TestSin):
+    def init_dtype(self):
+        self.dtype = np.float16
+
+    def test_check_output(self):
+        if core.is_compiled_with_cuda():
+            place = core.CUDAPlace(0)
+            if core.is_float16_supported(place):
+                self.check_output_with_place(place, atol=1e-3)
+
 
 class TestRound(OpTest):
     def setUp(self):
         self.op_type = "round"
-        x = np.random.uniform(-1, 1, [4, 4]).astype("float32")
-        self.inputs = {'X': x}
-        self.outputs = {'Out': np.round(self.inputs['X'])}
+        self.dtype = np.float32
+        self.init_dtype()
+
+        x = np.random.uniform(-1, 1, [4, 4]).astype(self.dtype)
+        out = np.round(x)
+
+        self.inputs = {'X': OpTest.np_dtype_to_fluid_dtype(x)}
+        self.outputs = {'Out': out}
 
     def test_check_output(self):
         self.check_output()
 
     def test_check_grad(self):
+        if self.dtype == np.float16:
+            return
         self.check_grad(['X'], 'Out', max_relative_error=0.007)
 
+    def init_dtype(self):
+        pass
+
+
+class TestFP16Round(TestRound):
+    def init_dtype(self):
+        self.dtype = np.float16
+
+    def test_check_output(self):
+        if core.is_compiled_with_cuda():
+            place = core.CUDAPlace(0)
+            if core.is_float16_supported(place):
+                self.check_output_with_place(place, atol=1e-3)
+
 
 class TestRelu(OpTest):
     def setUp(self):
@@ -278,222 +561,463 @@ class TestFP16Relu(TestRelu):
 class TestBRelu(OpTest):
     def setUp(self):
         self.op_type = "brelu"
-        x = np.random.uniform(-1, 1, [4, 4]).astype("float32")
+        self.dtype = np.float32
+        self.init_dtype()
+
+        x = np.random.uniform(-1, 1, [4, 4]).astype(self.dtype)
         t_min = 1.0
         t_max = 4.0
         # The same with TestAbs
         x[np.abs(x - t_min) < 0.005] = t_min + 0.02
         x[np.abs(x - t_max) < 0.005] = t_max + 0.02
-
-        self.inputs = {'X': x}
-        self.attrs = {'t_min': t_min, 't_max': t_max}
         t = np.copy(x)
         t[t < t_min] = t_min
         t[t > t_max] = t_max
+
+        self.inputs = {'X': OpTest.np_dtype_to_fluid_dtype(x)}
+        self.attrs = {'t_min': t_min, 't_max': t_max}
         self.outputs = {'Out': t}
 
     def test_check_output(self):
         self.check_output()
 
     def test_check_grad(self):
+        if self.dtype == np.float16:
+            return
         self.check_grad(['X'], 'Out', max_relative_error=0.02)
 
+    def init_dtype(self):
+        pass
+
+
+class TestFP16BRelu(TestBRelu):
+    def init_dtype(self):
+        self.dtype = np.float16
+
+    def test_check_output(self):
+        if core.is_compiled_with_cuda():
+            place = core.CUDAPlace(0)
+            if core.is_float16_supported(place):
+                self.check_output_with_place(place, atol=1e-3)
+
 
 class TestRelu6(OpTest):
     def setUp(self):
         self.op_type = "relu6"
-        x = np.random.uniform(-1, 1, [4, 10]).astype("float32")
+        self.dtype = np.float32
+        self.init_dtype()
+
+        x = np.random.uniform(-1, 1, [4, 10]).astype(self.dtype)
         threshold = 6.0
         # The same with TestAbs
         x[np.abs(x) < 0.005] = 0.02
         x[np.abs(x - threshold) < 0.005] = threshold + 0.02
+        out = np.minimum(np.maximum(x, 0), threshold)
 
-        self.inputs = {'X': x}
+        self.inputs = {'X': OpTest.np_dtype_to_fluid_dtype(x)}
         self.attrs = {'threshold': threshold}
-        self.outputs = {
-            'Out': np.minimum(np.maximum(self.inputs['X'], 0), threshold)
-        }
+        self.outputs = {'Out': out}
 
     def test_check_output(self):
         self.check_output()
 
     def test_check_grad(self):
+        if self.dtype == np.float16:
+            return
         self.check_grad(['X'], 'Out', max_relative_error=0.02)
 
+    def init_dtype(self):
+        pass
+
+
+class TestFP16Relu6(TestRelu6):
+    def init_dtype(self):
+        self.dtype = np.float16
+
+    def test_check_output(self):
+        if core.is_compiled_with_cuda():
+            place = core.CUDAPlace(0)
+            if core.is_float16_supported(place):
+                self.check_output_with_place(place, atol=1e-3)
+
 
 class TestSoftRelu(OpTest):
     def setUp(self):
         self.op_type = "soft_relu"
-        x = np.random.uniform(-3, 3, [4, 4]).astype("float32")
+        self.dtype = np.float32
+        self.init_dtype()
+
+        x = np.random.uniform(-3, 3, [4, 4]).astype(self.dtype)
         threshold = 2.0
         # The same reason with TestAbs
         x[np.abs(x - threshold) < 0.005] = threshold + 0.02
         x[np.abs(x + threshold) < 0.005] = -threshold + 0.02
-        self.inputs = {'X': x}
-        self.attrs = {'threshold': threshold}
         t = np.copy(x)
         t[t < -threshold] = -threshold
         t[t > threshold] = threshold
-        self.outputs = {'Out': np.log((np.exp(t) + 1))}
+        out = np.log((np.exp(t) + 1))
+
+        self.inputs = {'X': OpTest.np_dtype_to_fluid_dtype(x)}
+        self.attrs = {'threshold': threshold}
+        self.outputs = {'Out': out}
 
     def test_check_output(self):
         self.check_output()
 
     def test_check_grad(self):
+        if self.dtype == np.float16:
+            return
         self.check_grad(['X'], 'Out', max_relative_error=0.02)
 
+    def init_dtype(self):
+        pass
+
+
+class TestFP16SoftRelu(TestSoftRelu):
+    def init_dtype(self):
+        self.dtype = np.float16
+
+    def test_check_output(self):
+        if core.is_compiled_with_cuda():
+            place = core.CUDAPlace(0)
+            if core.is_float16_supported(place):
+                self.check_output_with_place(place, atol=1e-3)
+
 
 class TestELU(OpTest):
     def setUp(self):
         self.op_type = "elu"
-        x = np.random.uniform(-3, 3, [4, 4]).astype("float32")
+        self.dtype = np.float32
+        self.init_dtype()
+
+        x = np.random.uniform(-3, 3, [4, 4]).astype(self.dtype)
         alpha = 1.
+        out = np.maximum(0, x) + np.minimum(0, alpha * (np.exp(x) - 1))
         # Note: unlike other Relu extensions, point 0 on standard ELU function (i.e. alpha = 1)
         # is differentiable, so we can skip modifications like x[np.abs(x) < 0.005] = 0.02 here
-        self.inputs = {'X': x}
+        self.inputs = {'X': OpTest.np_dtype_to_fluid_dtype(x)}
         self.attrs = {'alpha': alpha}
-        self.outputs = {
-            'Out': np.maximum(0, x) + np.minimum(0, alpha * (np.exp(x) - 1))
-        }
+        self.outputs = {'Out': out}
 
     def test_check_output(self):
         self.check_output()
 
     def test_check_grad(self):
+        if self.dtype == np.float16:
+            return
         self.check_grad(['X'], 'Out', max_relative_error=0.02)
 
+    def init_dtype(self):
+        pass
+
+
+class TestFP16ELU(TestELU):
+    def init_dtype(self):
+        self.dtype = np.float16
+
+    def test_check_output(self):
+        if core.is_compiled_with_cuda():
+            place = core.CUDAPlace(0)
+            if core.is_float16_supported(place):
+                self.check_output_with_place(place, atol=1e-3)
+
 
 class TestReciprocal(OpTest):
     def setUp(self):
         self.op_type = "reciprocal"
-        self.inputs = {'X': np.random.uniform(1, 2, [11, 17]).astype("float32")}
-        self.outputs = {'Out': np.reciprocal(self.inputs['X'])}
+        self.dtype = np.float32
+        self.init_dtype()
+
+        x = np.random.uniform(1, 2, [11, 17]).astype(self.dtype)
+        out = np.reciprocal(x)
+
+        self.inputs = {'X': OpTest.np_dtype_to_fluid_dtype(x)}
+        self.outputs = {'Out': out}
 
     def test_check_output(self):
         self.check_output()
 
     def test_check_grad(self):
+        if self.dtype == np.float16:
+            return
         self.check_grad(['X'], 'Out', max_relative_error=0.01)
 
+    def init_dtype(self):
+        pass
+
+
+class TestFP16Reciprocal(TestReciprocal):
+    def init_dtype(self):
+        self.dtype = np.float16
+
+    def test_check_output(self):
+        if core.is_compiled_with_cuda():
+            place = core.CUDAPlace(0)
+            if core.is_float16_supported(place):
+                self.check_output_with_place(place, atol=1e-3)
+
 
 class TestLog(OpTest):
     def setUp(self):
         self.op_type = "log"
-        self.inputs = {
-            'X': np.random.uniform(0.1, 1, [11, 17]).astype("float32")
-        }
-        self.outputs = {'Out': np.log(self.inputs['X'])}
+        self.dtype = np.float32
+        self.init_dtype()
+
+        x = np.random.uniform(0.1, 1, [11, 17]).astype(self.dtype)
+        out = np.log(x)
+
+        self.inputs = {'X': OpTest.np_dtype_to_fluid_dtype(x)}
+        self.outputs = {'Out': out}
 
     def test_check_output(self):
         self.check_output()
 
     def test_check_grad(self):
+        if self.dtype == np.float16:
+            return
         self.check_grad(['X'], 'Out', max_relative_error=0.007)
 
+    def init_dtype(self):
+        pass
+
+
+class TestFP16Log(TestLog):
+    def init_dtype(self):
+        self.dtype = np.float16
+
+    def test_check_output(self):
+        if core.is_compiled_with_cuda():
+            place = core.CUDAPlace(0)
+            if core.is_float16_supported(place):
+                self.check_output_with_place(place, atol=1e-3)
+
 
 class TestSquare(OpTest):
     def setUp(self):
         self.op_type = "square"
-        self.inputs = {
-            'X': np.random.uniform(0.1, 1, [11, 17]).astype("float32")
-        }
-        self.outputs = {'Out': np.square(self.inputs['X'])}
+        self.dtype = np.float32
+        self.init_dtype()
+
+        x = np.random.uniform(0.1, 1, [11, 17]).astype(self.dtype)
+        out = np.square(x)
+
+        self.inputs = {'X': OpTest.np_dtype_to_fluid_dtype(x)}
+        self.outputs = {'Out': out}
 
     def test_check_output(self):
         self.check_output()
 
     def test_check_grad(self):
+        if self.dtype == np.float16:
+            return
         self.check_grad(['X'], 'Out', max_relative_error=0.007)
 
+    def init_dtype(self):
+        pass
+
+
+class TestFP16Square(TestSquare):
+    def init_dtype(self):
+        self.dtype = np.float16
+
+    def test_check_output(self):
+        if core.is_compiled_with_cuda():
+            place = core.CUDAPlace(0)
+            if core.is_float16_supported(place):
+                self.check_output_with_place(place, atol=1e-3)
+
 
 class TestPow(OpTest):
     def setUp(self):
         self.op_type = "pow"
-        self.inputs = {'X': np.random.uniform(1, 2, [11, 17]).astype("float32")}
+        self.dtype = np.float32
+        self.init_dtype()
+
+        x = np.random.uniform(1, 2, [11, 17]).astype(self.dtype)
+        out = np.power(x, 3)
+
+        self.inputs = {'X': OpTest.np_dtype_to_fluid_dtype(x)}
         self.attrs = {'factor': 3.0}
-        self.outputs = {'Out': np.power(self.inputs['X'], 3)}
+        self.outputs = {'Out': out}
 
     def test_check_output(self):
         self.check_output()
 
     def test_check_grad(self):
+        if self.dtype == np.float16:
+            return
         self.check_grad(['X'], 'Out', max_relative_error=0.02)
 
+    def init_dtype(self):
+        pass
+
+
+class TestFP16Pow(TestPow):
+    def init_dtype(self):
+        self.dtype = np.float16
+
+    def test_check_output(self):
+        if core.is_compiled_with_cuda():
+            place = core.CUDAPlace(0)
+            if core.is_float16_supported(place):
+                self.check_output_with_place(place, atol=5e-2)
+
 
 class TestSTanh(OpTest):
     def setUp(self):
         self.op_type = "stanh"
-        self.inputs = {
-            'X': np.random.uniform(0.1, 1, [11, 17]).astype("float32")
-        }
+        self.dtype = np.float32
+        self.init_dtype()
+
+        x = np.random.uniform(0.1, 1, [11, 17]).astype(self.dtype)
         scale_a = 2.0 / 3.0
         scale_b = 1.7159
+        out = scale_b * np.tanh(x * scale_a)
+
+        self.inputs = {'X': OpTest.np_dtype_to_fluid_dtype(x)}
         self.attrs = {'scale_a': scale_a, 'scale_b': scale_b}
-        self.outputs = {'Out': scale_b * np.tanh(self.inputs['X'] * scale_a)}
+        self.outputs = {'Out': out}
 
     def test_check_output(self):
         self.check_output()
 
     def test_check_grad(self):
+        if self.dtype == np.float16:
+            return
         self.check_grad(['X'], 'Out', max_relative_error=0.007)
 
+    def init_dtype(self):
+        pass
+
+
+class TestFP16STanh(TestSTanh):
+    def init_dtype(self):
+        self.dtype = np.float16
+
+    def test_check_output(self):
+        if core.is_compiled_with_cuda():
+            place = core.CUDAPlace(0)
+            if core.is_float16_supported(place):
+                self.check_output_with_place(place, atol=1e-3)
+
 
 class TestSoftplus(OpTest):
     def setUp(self):
         self.op_type = "softplus"
-        self.inputs = {
-            'X': np.random.uniform(-1, 1, [11, 17]).astype("float64")
-        }
-        self.outputs = {'Out': np.log(1 + np.exp(self.inputs['X']))}
+        self.dtype = np.float64
+        self.init_dtype()
+
+        x = np.random.uniform(-1, 1, [11, 17]).astype(self.dtype)
+        out = np.log(1 + np.exp(x))
+
+        self.inputs = {'X': OpTest.np_dtype_to_fluid_dtype(x)}
+        self.outputs = {'Out': out}
 
     def test_check_output(self):
         self.check_output()
 
     def test_check_grad(self):
+        if self.dtype == np.float16:
+            return
         self.check_grad(['X'], 'Out', max_relative_error=0.007)
 
+    def init_dtype(self):
+        pass
+
+
+class TestFP16Softplus(TestSoftplus):
+    def init_dtype(self):
+        self.dtype = np.float16
+
+    def test_check_output(self):
+        if core.is_compiled_with_cuda():
+            place = core.CUDAPlace(0)
+            if core.is_float16_supported(place):
+                self.check_output_with_place(place, atol=1e-3)
+
 
 class TestSoftsign(OpTest):
     def setUp(self):
         self.op_type = "softsign"
-        self.inputs = {
-            'X': np.random.uniform(-1, 1, [11, 17]).astype("float32")
-        }
-        self.outputs = {
-            'Out': np.divide(self.inputs['X'], 1 + np.abs(self.inputs['X']))
-        }
+        self.dtype = np.float32
+        self.init_dtype()
+
+        x = np.random.uniform(-1, 1, [11, 17]).astype(self.dtype)
+        out = np.divide(x, 1 + np.abs(x))
+
+        self.inputs = {'X': OpTest.np_dtype_to_fluid_dtype(x)}
+        self.outputs = {'Out': out}
 
     def test_check_output(self):
         self.check_output()
 
     def test_check_grad(self):
+        if self.dtype == np.float16:
+            return
         self.check_grad(['X'], 'Out', max_relative_error=0.007)
 
+    def init_dtype(self):
+        pass
+
+
+class TestFP16Softsign(TestSoftsign):
+    def init_dtype(self):
+        self.dtype = np.float16
+
+    def test_check_output(self):
+        if core.is_compiled_with_cuda():
+            place = core.CUDAPlace(0)
+            if core.is_float16_supported(place):
+                self.check_output_with_place(place, atol=1e-3)
+
 
 class TestThresholdedRelu(OpTest):
     def setUp(self):
         self.op_type = "thresholded_relu"
+        self.dtype = np.float32
+        self.init_dtype()
+
         threshold = 0.25
         self.relative_error = 0.005
-        X = np.random.uniform(-1, 1, [11, 17]).astype("float32")
+        X = np.random.uniform(-1, 1, [11, 17]).astype(self.dtype)
 
         # Same reason as TestAbs
         X[np.abs(X - threshold) < self.relative_error] = threshold + 0.2
+        out = (X > threshold) * X
 
-        self.inputs = {'X': X}
+        self.inputs = {'X': OpTest.np_dtype_to_fluid_dtype(X)}
         self.attrs = {'threshold': threshold}
-        self.outputs = {'Out': (X > threshold) * X}
+        self.outputs = {'Out': out}
 
     def test_check_output(self):
         self.check_output()
 
     def test_check_grad(self):
+        if self.dtype == np.float16:
+            return
         self.check_grad(['X'], 'Out', max_relative_error=self.relative_error)
 
+    def init_dtype(self):
+        pass
+
+
+class TestFP16ThresholdedRelu(TestThresholdedRelu):
+    def init_dtype(self):
+        self.dtype = np.float16
+
+    def test_check_output(self):
+        if core.is_compiled_with_cuda():
+            place = core.CUDAPlace(0)
+            if core.is_float16_supported(place):
+                self.check_output_with_place(place, atol=1e-3)
+
 
 class TestHardSigmoid(OpTest):
     def setUp(self):
         self.op_type = "hard_sigmoid"
+        self.dtype = np.float32  # default dtype; init_dtype() may replace it
+        self.init_dtype()  # NOTE(review): X below is still cast to "float32", not self.dtype - confirm
+
         self.relative_error = 0.002
 
         X = np.random.uniform(-5, 5, [2, 2]).astype("float32")
@@ -502,7 +1026,6 @@ class TestHardSigmoid(OpTest):
         lower_threshold = -offset / slope
         upper_threshold = (1 - offset) / slope
 
-        self.inputs = {'X': X}
         # Same reason as TestAbs
         X[np.abs(X - lower_threshold) < self.relative_error] = \
             lower_threshold + 0.2
@@ -510,29 +1033,70 @@ class TestHardSigmoid(OpTest):
             upper_threshold - 0.2
 
         temp = X * slope + offset
-        self.outputs = {'Out': np.maximum(0.0, np.minimum(1.0, temp))}
+        out = np.maximum(0.0, np.minimum(1.0, temp))  # temp clamped to [0.0, 1.0]
+
+        self.inputs = {'X': OpTest.np_dtype_to_fluid_dtype(X)}  # now set after X is adjusted above
+        self.outputs = {'Out': out}
 
     def test_check_output(self):
         self.check_output()
 
     def test_check_grad(self):
+        if self.dtype == np.float16:  # check_grad is not called for float16
+            return
         self.check_grad(['X'], 'Out', max_relative_error=0.002)
 
+    def init_dtype(self):  # no-op: keeps the np.float32 assigned in setUp
+        pass
+
+
+class TestFP16HardSigmoid(TestHardSigmoid):  # float16 variant of the parent test
+    def init_dtype(self):
+        self.dtype = np.float16  # makes the inherited test_check_grad() return early
+
+    def test_check_output(self):
+        if core.is_compiled_with_cuda():  # otherwise this test does nothing
+            place = core.CUDAPlace(0)
+            if core.is_float16_supported(place):  # also a silent no-op when False
+                self.check_output_with_place(place, atol=1e-3)
+
 
 class TestSwish(OpTest):
     def setUp(self):
         self.op_type = "swish"
-        X = np.random.uniform(0.1, 1, [11, 17]).astype("float32")
-        self.inputs = {'X': X}
-        self.attrs = {'beta': 2.3}
-        self.outputs = {'Out': X * expit(self.attrs['beta'] * X)}
+        self.dtype = np.float32  # default dtype; init_dtype() may replace it
+        self.init_dtype()  # hook that subclasses override to pick another dtype
+
+        X = np.random.uniform(0.1, 1, [11, 17]).astype(self.dtype)  # input follows self.dtype
+        beta = 2.3  # same value that is passed as the 'beta' attribute below
+        out = X * expit(beta * X)  # computed once, stored under 'Out' below
+
+        self.inputs = {'X': OpTest.np_dtype_to_fluid_dtype(X)}
+        self.attrs = {'beta': beta}
+        self.outputs = {'Out': out}
 
     def test_check_output(self):
         self.check_output()
 
     def test_check_grad(self):
+        if self.dtype == np.float16:  # check_grad is not called for float16
+            return
         self.check_grad(['X'], 'Out', max_relative_error=0.008)
 
+    def init_dtype(self):  # no-op: keeps the np.float32 assigned in setUp
+        pass
+
+
+class TestFP16Swish(TestSwish):  # float16 variant of the parent test
+    def init_dtype(self):
+        self.dtype = np.float16  # read by the inherited setUp() when casting X
+
+    def test_check_output(self):
+        if core.is_compiled_with_cuda():  # otherwise this test does nothing
+            place = core.CUDAPlace(0)
+            if core.is_float16_supported(place):  # also a silent no-op when False
+                self.check_output_with_place(place, atol=1e-3)
+
 
 #--------------------test MKLDNN--------------------
 class TestMKLDNNReluDim2(TestRelu):