From a53e8a8da6a96e559c0ca38367024f2c5b04c021 Mon Sep 17 00:00:00 2001
From: Brian Liu
Date: Sat, 9 Jun 2018 09:23:14 +0800
Subject: [PATCH 1/3] Update MKLDNN integration framework to support Paddle
 multi-instances

Make all blob info saved in the global device context thread-based.
Meanwhile, save the thread id in thread-local storage in ParallelDo.

---
 paddle/fluid/platform/device_context.cc | 65 +++++++++++++++++++------
 paddle/fluid/platform/device_context.h  | 10 +++-
 2 files changed, 58 insertions(+), 17 deletions(-)

diff --git a/paddle/fluid/platform/device_context.cc b/paddle/fluid/platform/device_context.cc
index 7d1cf57253..690ba55279 100644
--- a/paddle/fluid/platform/device_context.cc
+++ b/paddle/fluid/platform/device_context.cc
@@ -25,6 +25,14 @@ namespace platform {
 
 DeviceContextPool* DeviceContextPool::pool = nullptr;
 
+namespace {
+// Current thread's id.
+thread_local int cur_thread_id = 0;
+}
+
+void set_cur_thread_id(int tid) { cur_thread_id = tid; }
+int get_cur_thread_id(void) { return cur_thread_id; }
+
 platform::DeviceContext* DeviceContextPool::Get(const platform::Place& place) {
   auto it = device_contexts_.find(place);
   if (it == device_contexts_.end()) {
@@ -296,38 +304,65 @@ Place CUDAPinnedDeviceContext::GetPlace() const { return place_; }
 #ifdef PADDLE_WITH_MKLDNN
 
 MKLDNNDeviceContext::MKLDNNDeviceContext(CPUPlace place)
-    : CPUDeviceContext(place), engine_(mkldnn::engine::cpu, 0), p_blobs_() {
-  p_blobs_.reset(new std::unordered_map<std::string, std::shared_ptr<void>>());
+    : CPUDeviceContext(place), engine_(mkldnn::engine::cpu, 0), p_blobmap_() {
+  p_blobmap_.reset(new BlobMap());
+  p_mutex_.reset(new std::mutex());
 }
 
 void MKLDNNDeviceContext::SetBlob(const std::string& name,
                                   std::shared_ptr<void> data) const {
-  std::unordered_map<std::string, std::shared_ptr<void>>* p;
-  p = p_blobs_.get();
+  BlobMap* pMap = p_blobmap_.get();
+  std::shared_ptr<KeyBlob> pBlob = nullptr;
+
+  int tid = platform::get_cur_thread_id();
 
-  auto it = p->find(name);
+  std::lock_guard<std::mutex> lock(*p_mutex_.get());
 
-  if (it == p->end()) {
-    (*p)[name] = data;  // create new blob
+  // Find KeyBlob for current thread
+  auto map_it = pMap->find(tid);
+
+  if (map_it == pMap->end()) {
+    // 1st time to set blob in current thread
+    pBlob = std::shared_ptr<KeyBlob>(new KeyBlob());
+    (*pMap)[tid] = pBlob;
   } else {
-    it->second = data;  // set data to existing blob
+    pBlob = map_it->second;
   }
 
+  // Find Key in found (or newly created) KeyBlob
+  auto key_it = pBlob->find(name);
+
+  if (key_it == pBlob->end()) {
+    (*pBlob)[name] = data;  // create new blob
+  } else {
+    key_it->second = data;  // set data to existing blob
+  }
+
+  // lock will be automatically released when out of scope
   return;
 }
 
 std::shared_ptr<void> MKLDNNDeviceContext::GetBlob(
     const std::string& name) const {
-  std::unordered_map<std::string, std::shared_ptr<void>>* p;
-  p = p_blobs_.get();
+  BlobMap* pMap = p_blobmap_.get();
+  std::shared_ptr<KeyBlob> pBlob = nullptr;
 
-  auto it = p->find(name);
+  int tid = platform::get_cur_thread_id();
 
-  if (it != p->end()) {
-    return it->second;
-  }
+  std::lock_guard<std::mutex> lock(*p_mutex_.get());
+
+  // Find KeyBlob for current thread firstly
+  auto map_it = pMap->find(tid);
+  if (map_it == pMap->end()) return nullptr;
+  pBlob = map_it->second;
+
+  // Find Blob via name
+  auto key_it = pBlob->find(name);
+
+  if (key_it == pBlob->end()) return nullptr;
 
-  return nullptr;
+  // lock will be automatically released when out of scope
+  return key_it->second;
 }
 
 #endif
diff --git a/paddle/fluid/platform/device_context.h b/paddle/fluid/platform/device_context.h
index 999bbe00f1..1527c9f324 100644
--- a/paddle/fluid/platform/device_context.h
+++ b/paddle/fluid/platform/device_context.h
@@ -39,6 +39,12 @@ limitations under the License. */
 namespace paddle {
 namespace platform {
 
+using KeyBlob = std::unordered_map<std::string, std::shared_ptr<void>>;
+using BlobMap = std::unordered_map<int, std::shared_ptr<KeyBlob>>;
+
+void set_cur_thread_id(int);
+int get_cur_thread_id(void);
+
 class DeviceContext {
  public:
   virtual ~DeviceContext() {}
@@ -191,8 +197,8 @@ class MKLDNNDeviceContext : public CPUDeviceContext {
 
  private:
   mkldnn::engine engine_;
-  std::shared_ptr<std::unordered_map<std::string, std::shared_ptr<void>>>
-      p_blobs_;
+  std::shared_ptr<BlobMap> p_blobmap_;
+  std::shared_ptr<std::mutex> p_mutex_;
 };
 
 #endif
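For illustration, the change above keys every cached MKLDNN blob first by a thread id held in thread-local storage and only then by name, so predictor instances running in parallel cannot overwrite each other's entries. Below is a minimal, self-contained sketch of that pattern; ThreadBlobCache and the demo in main() are hypothetical names that only mirror the BlobMap/KeyBlob layout of the patch and are not part of the Paddle API.

// Illustrative sketch only: a cache keyed by thread id, then by blob name,
// mirroring the BlobMap (per thread) / KeyBlob (per name) layout of the patch.
#include <iostream>
#include <memory>
#include <mutex>
#include <string>
#include <thread>
#include <unordered_map>
#include <vector>

namespace {
// Mimics platform::set_cur_thread_id / get_cur_thread_id.
thread_local int cur_thread_id = 0;
}  // namespace

class ThreadBlobCache {
 public:
  void SetBlob(const std::string& name, std::shared_ptr<void> data) {
    std::lock_guard<std::mutex> lock(mutex_);
    blobs_[cur_thread_id][name] = data;  // creates this thread's map on demand
  }

  std::shared_ptr<void> GetBlob(const std::string& name) {
    std::lock_guard<std::mutex> lock(mutex_);
    auto tit = blobs_.find(cur_thread_id);
    if (tit == blobs_.end()) return nullptr;  // this thread never set anything
    auto bit = tit->second.find(name);
    return bit == tit->second.end() ? nullptr : bit->second;
  }

 private:
  std::mutex mutex_;
  // thread id -> (blob name -> blob), i.e. a BlobMap of KeyBlobs.
  std::unordered_map<int,
                     std::unordered_map<std::string, std::shared_ptr<void>>>
      blobs_;
};

int main() {
  ThreadBlobCache cache;
  std::vector<std::thread> workers;
  for (int tid = 0; tid < 2; ++tid) {
    workers.emplace_back([&cache, tid] {
      cur_thread_id = tid + 1;  // same convention the test helper uses below
      cache.SetBlob("conv_pd", std::make_shared<int>(tid));
      // Each thread only ever sees the blob it stored itself.
      auto blob = std::static_pointer_cast<int>(cache.GetBlob("conv_pd"));
      std::cout << "thread " << cur_thread_id << " -> " << *blob << "\n";
    });
  }
  for (auto& w : workers) w.join();
  return 0;
}

Because cur_thread_id defaults to 0, single-threaded code that never calls set_cur_thread_id keeps working unchanged.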
From 741cb33bd97dcb121d866acf18458f95527f3a11 Mon Sep 17 00:00:00 2001
From: Sylwester Fraczek
Date: Tue, 16 Oct 2018 14:52:45 +0200
Subject: [PATCH 2/3] test multithreading

---
 paddle/fluid/inference/api/helper.h              | 3 ++-
 paddle/fluid/inference/tests/api/tester_helper.h | 1 +
 2 files changed, 3 insertions(+), 1 deletion(-)

diff --git a/paddle/fluid/inference/api/helper.h b/paddle/fluid/inference/api/helper.h
index 24f59cf43a..e46dc13269 100644
--- a/paddle/fluid/inference/api/helper.h
+++ b/paddle/fluid/inference/api/helper.h
@@ -160,7 +160,8 @@ static void PrintTime(int batch_size, int repeat, int num_threads, int tid,
                       double latency, int epoch = 1) {
   LOG(INFO) << "====== batch_size: " << batch_size << ", repeat: " << repeat
             << ", threads: " << num_threads << ", thread id: " << tid
-            << ", latency: " << latency << "ms ======";
+            << ", latency: " << latency << "ms, fps: " << 1 / (latency / 1000.f)
+            << " ======";
   if (epoch > 1) {
     int samples = batch_size * epoch;
     LOG(INFO) << "====== sample number: " << samples
diff --git a/paddle/fluid/inference/tests/api/tester_helper.h b/paddle/fluid/inference/tests/api/tester_helper.h
index 5589b58b06..42072895fc 100644
--- a/paddle/fluid/inference/tests/api/tester_helper.h
+++ b/paddle/fluid/inference/tests/api/tester_helper.h
@@ -139,6 +139,7 @@ void TestMultiThreadPrediction(
   }
   for (int tid = 0; tid < num_threads; ++tid) {
     threads.emplace_back([&, tid]() {
+      platform::set_cur_thread_id(static_cast<int>(tid) + 1);
       // Each thread should have local inputs and outputs.
       // The inputs of each thread are all the same.
       std::vector<std::vector<PaddleTensor>> inputs_tid = inputs;
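For reference, the fps value added to PrintTime is simply the reciprocal of the per-iteration latency converted from milliseconds to seconds: a latency of 20 ms, for example, is reported as fps: 50, since 1 / (20 / 1000) = 50.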
From 2098b42584f0d6c588d2ec62f6b37a4dc8916e68 Mon Sep 17 00:00:00 2001
From: Sylwester Fraczek
Date: Wed, 24 Oct 2018 10:26:07 +0200
Subject: [PATCH 3/3] review fixes (Teamcity fails)

test=develop

---
 paddle/fluid/inference/tests/api/tester_helper.h |  2 ++
 paddle/fluid/platform/device_context.cc          | 16 ++++++++--------
 paddle/fluid/platform/device_context.h           | 12 ++++++------
 3 files changed, 16 insertions(+), 14 deletions(-)

diff --git a/paddle/fluid/inference/tests/api/tester_helper.h b/paddle/fluid/inference/tests/api/tester_helper.h
index 42072895fc..19c3f532d5 100644
--- a/paddle/fluid/inference/tests/api/tester_helper.h
+++ b/paddle/fluid/inference/tests/api/tester_helper.h
@@ -139,7 +139,9 @@ void TestMultiThreadPrediction(
   }
   for (int tid = 0; tid < num_threads; ++tid) {
     threads.emplace_back([&, tid]() {
+#ifdef PADDLE_WITH_MKLDNN
       platform::set_cur_thread_id(static_cast<int>(tid) + 1);
+#endif
       // Each thread should have local inputs and outputs.
       // The inputs of each thread are all the same.
       std::vector<std::vector<PaddleTensor>> inputs_tid = inputs;
diff --git a/paddle/fluid/platform/device_context.cc b/paddle/fluid/platform/device_context.cc
index 690ba55279..b0de636de4 100644
--- a/paddle/fluid/platform/device_context.cc
+++ b/paddle/fluid/platform/device_context.cc
@@ -25,14 +25,6 @@ namespace platform {
 
 DeviceContextPool* DeviceContextPool::pool = nullptr;
 
-namespace {
-// Current thread's id.
-thread_local int cur_thread_id = 0;
-}
-
-void set_cur_thread_id(int tid) { cur_thread_id = tid; }
-int get_cur_thread_id(void) { return cur_thread_id; }
-
 platform::DeviceContext* DeviceContextPool::Get(const platform::Place& place) {
   auto it = device_contexts_.find(place);
   if (it == device_contexts_.end()) {
@@ -309,6 +301,14 @@ MKLDNNDeviceContext::MKLDNNDeviceContext(CPUPlace place)
   p_mutex_.reset(new std::mutex());
 }
 
+namespace {
+// Current thread's id.
+thread_local int cur_thread_id = 0;
+}
+
+void set_cur_thread_id(int tid) { cur_thread_id = tid; }
+int get_cur_thread_id(void) { return cur_thread_id; }
+
 void MKLDNNDeviceContext::SetBlob(const std::string& name,
                                   std::shared_ptr<void> data) const {
   BlobMap* pMap = p_blobmap_.get();
diff --git a/paddle/fluid/platform/device_context.h b/paddle/fluid/platform/device_context.h
index 1527c9f324..942e13a724 100644
--- a/paddle/fluid/platform/device_context.h
+++ b/paddle/fluid/platform/device_context.h
@@ -39,12 +39,6 @@ limitations under the License. */
 namespace paddle {
 namespace platform {
 
-using KeyBlob = std::unordered_map<std::string, std::shared_ptr<void>>;
-using BlobMap = std::unordered_map<int, std::shared_ptr<KeyBlob>>;
-
-void set_cur_thread_id(int);
-int get_cur_thread_id(void);
-
 class DeviceContext {
  public:
   virtual ~DeviceContext() {}
@@ -182,6 +176,12 @@ struct DefaultDeviceContextType {
 #endif
 
 #ifdef PADDLE_WITH_MKLDNN
+using KeyBlob = std::unordered_map<std::string, std::shared_ptr<void>>;
+using BlobMap = std::unordered_map<int, std::shared_ptr<KeyBlob>>;
+
+void set_cur_thread_id(int);
+int get_cur_thread_id(void);
+
 class MKLDNNDeviceContext : public CPUDeviceContext {
  public:
   explicit MKLDNNDeviceContext(CPUPlace place);
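A note on the guard added in this last patch: after the declarations move inside the MKLDNN-only block, set_cur_thread_id and get_cur_thread_id exist only when PADDLE_WITH_MKLDNN is defined, so the call in tester_helper.h has to be wrapped in the same #ifdef or a build without MKLDNN would fail to compile.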