From 76d8b14bceeb7f2292b617bb19c33dbcfd6dc8f6 Mon Sep 17 00:00:00 2001
From: Xin Pan
Date: Thu, 3 May 2018 05:30:50 -0700
Subject: [PATCH 01/13] Add timeline support for distributed training

---
 benchmark/cluster/vgg16/vgg16_fluid.py        | 28 ++++--
 cmake/external/grpc.cmake                     |  2 +-
 paddle/fluid/operators/detail/send_recv.proto |  4 +
 .../operators/detail/sendrecvop_utils.cc      |  8 ++
 .../operators/detail/variable_response.cc     | 22 ++++-
 paddle/fluid/operators/listen_and_serv_op.cc  |  8 +-
 paddle/fluid/platform/profiler.cc             | 35 ++++++--
 paddle/fluid/platform/profiler.h              |  8 ++
 tools/timeline.py                             | 90 +++++++++++--------
 9 files changed, 149 insertions(+), 56 deletions(-)

diff --git a/benchmark/cluster/vgg16/vgg16_fluid.py b/benchmark/cluster/vgg16/vgg16_fluid.py
index 6c7d2c1036..05b5f3977c 100644
--- a/benchmark/cluster/vgg16/vgg16_fluid.py
+++ b/benchmark/cluster/vgg16/vgg16_fluid.py
@@ -80,6 +80,8 @@ parser.add_argument(
     type=str,
     default="",
     help="Comma-separated list of hostname:port pairs")
+parser.add_argument(
+    "--profile", action='store_true', help="If set, profile a few steps.")

 # Flags for defining the tf.train.Server
 parser.add_argument(
@@ -183,8 +185,8 @@ def main():
             start_time = time.time()
             num_samples = 0
             train_pass_acc.reset()
-            for batch_id, data in enumerate(train_reader()):
-                ts = time.time()
+
+            def run_step(batch_id, data):
                 img_data = np.array(
                     map(lambda x: x[0].reshape(data_shape), data)).astype(
                         "float32")
@@ -196,14 +198,28 @@ def main():
                     feed={"pixel": img_data,
                           "label": y_data},
                     fetch_list=[avg_cost, batch_acc, batch_size])
+                return loss, acc, b_size
+
+            if args.profile and args.task_index == 0:
+                # warmup.
+                for batch_id, data in enumerate(train_reader()):
+                    if batch_id > 5: break
+                    run_step(batch_id, data)
+                with profiler.profiler('All', 'total', '/tmp/profile_vgg'):
+                    for batch_id, data in enumerate(train_reader()):
+                        if batch_id > 5: break
+                        run_step(batch_id, data)
+
+            for batch_id, data in enumerate(train_reader()):
+                ts = time.time()
+                loss, acc, b_size = run_step(batch_id, data)
                 iters += 1
                 num_samples += len(data)
                 train_pass_acc.add(value=acc, weight=b_size)
                 print(
-                    "Task:%d Pass = %d, Iters = %d, Loss = %f, Accuracy = %f, "
-                    "Speed = %.2f img/s " % (args.task_index, pass_id, iters,
-                                             loss, acc,
-                                             len(data) / (time.time() - ts))
+                    "Pass = %d, Iters = %d, Loss = %f, Accuracy = %f, "
+                    "Speed = %.2f img/s" % (pass_id, iters, loss, acc,
+                                            len(data) / (time.time() - ts))
                 )  # The accuracy is the accumulation of batches, but not the current batch.

             pass_elapsed = time.time() - start_time
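The --profile branch above is the standard warmup-then-measure pattern: run a few batches first so one-time costs (memory pool growth, kernel autotuning) stay out of the trace, then profile the same number of batches. Distilled to a standalone sketch, with train_reader and run_step standing in for the functions defined in this script:

    import paddle.fluid.profiler as profiler

    def run_batches(reader, step_fn, limit=5):
        # Feed at most `limit` batches through `step_fn`.
        for batch_id, data in enumerate(reader()):
            if batch_id > limit:
                break
            step_fn(batch_id, data)

    run_batches(train_reader, run_step)  # warmup, not recorded
    with profiler.profiler('All', 'total', '/tmp/profile_vgg'):
        run_batches(train_reader, run_step)  # recorded; dumped on exit

Only the trainer with task_index 0 profiles, which keeps exactly one trainer broadcasting its profiling state to the PS processes (see the sendrecvop change below).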
diff --git a/cmake/external/grpc.cmake b/cmake/external/grpc.cmake
index e90948782b..ef520b1287 100644
--- a/cmake/external/grpc.cmake
+++ b/cmake/external/grpc.cmake
@@ -33,7 +33,7 @@ ExternalProject_Add(
     extern_grpc
     DEPENDS protobuf zlib
     GIT_REPOSITORY "https://github.com/grpc/grpc.git"
-    GIT_TAG "v1.10.x"
+    GIT_TAG "v1.8.x"
     PREFIX ${GRPC_SOURCES_DIR}
     UPDATE_COMMAND ""
     CONFIGURE_COMMAND ""
diff --git a/paddle/fluid/operators/detail/send_recv.proto b/paddle/fluid/operators/detail/send_recv.proto
index 02bb2b9ceb..fffa9ae7a4 100644
--- a/paddle/fluid/operators/detail/send_recv.proto
+++ b/paddle/fluid/operators/detail/send_recv.proto
@@ -69,6 +69,10 @@ message VariableMessage {
   bytes rows = 9;
   // Look up table block execution output variable name.
   string out_varname = 10;
+  // If true, the ps server starts profiling. It stops profiling and
+  // generates a profile to /tmp/profile_ps_* when the flag switches
+  // from true to false.
+  bool profile = 11;
 }

 message VoidMessage {}
diff --git a/paddle/fluid/operators/detail/sendrecvop_utils.cc b/paddle/fluid/operators/detail/sendrecvop_utils.cc
index 766bcf1ac5..d68cf467f7 100644
--- a/paddle/fluid/operators/detail/sendrecvop_utils.cc
+++ b/paddle/fluid/operators/detail/sendrecvop_utils.cc
@@ -23,6 +23,7 @@ limitations under the License. */
 #include "paddle/fluid/operators/detail/bytebuffer_stream.h"
 #include "paddle/fluid/operators/detail/proto_encoder_helper.h"
 #include "paddle/fluid/operators/detail/variable_response.h"
+#include "paddle/fluid/platform/profiler.h"

 namespace paddle {
 namespace operators {
@@ -45,6 +46,13 @@ void SerializeToByteBuffer(const std::string& name, framework::Variable* var,
   void* payload = nullptr;
   size_t payload_size;
   ProtoEncodeHelper e(static_cast<char*>(buf), 1024);
+  // Note: normally the profiler is enabled in 1 trainer, hence only
+  // 1 trainer returns true for ShouldSendProfileState(). It tells PS
+  // servers the trainer's profiling state so that PS can follow the
+  // trainer.
+  if (platform::ShouldSendProfileState()) {
+    e.WriteBool(VarMsg::kProfileFieldNumber, platform::IsProfileEnabled());
+  }
   e.WriteString(VarMsg::kVarnameFieldNumber, name);
   if (var->IsType<framework::LoDTensor>()) {
     e.WriteUint64(VarMsg::kTypeFieldNumber, 0);
diff --git a/paddle/fluid/operators/detail/variable_response.cc b/paddle/fluid/operators/detail/variable_response.cc
index fbef8d02a4..335491e95d 100644
--- a/paddle/fluid/operators/detail/variable_response.cc
+++ b/paddle/fluid/operators/detail/variable_response.cc
@@ -17,6 +17,7 @@
 #include <string>
 #include <utility>
 #include <vector>
+#include "paddle/fluid/platform/profiler.h"

 #include "paddle/fluid/operators/detail/send_recv.pb.h"
 #include "paddle/fluid/operators/detail/sendrecvop_utils.h"
@@ -427,7 +428,26 @@ int VariableResponse::Parse(Source* source) {
         meta_.set_out_varname(temp);
         break;
       }
-
+      case sendrecv::VariableMessage::kProfileFieldNumber: {
+        bool profiling;
+        if (!input.ReadRaw(reinterpret_cast<void*>(&profiling), 1)) {
+          return tag;
+        }
+        meta_.set_profile(profiling);
+        int64_t lisner_id = platform::ListenerId();
+        if (lisner_id <= 0) {
+          break;
+        }
+        if (profiling && !platform::IsProfileEnabled()) {
+          platform::EnableProfiler(platform::ProfilerState::kCPU);
+        } else if (!profiling && platform::IsProfileEnabled()) {
+          // TODO(panyx0718): Should we allow to customize file dir.
+          platform::DisableProfiler(
+              platform::EventSortingKey::kDefault,
+              string::Sprintf("/tmp/profile_ps_%lld", lisner_id));
+        }
+        break;
+      }
       default: {
         // Unknown tag, return unknown error.
         return -1;
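The two hunks above are the whole trainer/PS handshake: the trainer piggybacks its profiler state on every variable it sends, and the PS mirrors that state, dumping its own trace when the flag flips back to false. A minimal Python mirror of the PS-side logic, just to make the state machine explicit (the names are illustrative, not Paddle APIs):

    _profiling = False

    def on_profile_flag(flag, listener_id):
        global _profiling
        if listener_id <= 0:
            return  # not a PS; only processes with a listener id react
        if flag and not _profiling:
            _profiling = True    # EnableProfiler(kCPU) in the C++ code
        elif not flag and _profiling:
            _profiling = False   # DisableProfiler(...) writes /tmp/profile_ps_<id>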
diff --git a/paddle/fluid/operators/listen_and_serv_op.cc b/paddle/fluid/operators/listen_and_serv_op.cc
index 59b9451155..470a567e8b 100644
--- a/paddle/fluid/operators/listen_and_serv_op.cc
+++ b/paddle/fluid/operators/listen_and_serv_op.cc
@@ -18,6 +18,7 @@ limitations under the License. */
 #include <ostream>

 #include "paddle/fluid/operators/listen_and_serv_op.h"
+#include "paddle/fluid/platform/profiler.h"

 namespace paddle {
 namespace operators {
@@ -294,6 +295,8 @@ void ListenAndServOp::RunAsyncLoop(framework::Executor *executor,

 void ListenAndServOp::RunImpl(const framework::Scope &scope,
                               const platform::Place &dev_place) const {
+  // Mark this process as a PS; it decides whether to profile by listening to the trainer.
+  platform::SetProfileLisener();
   platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance();
   auto &dev_ctx = *pool.Get(dev_place);
   framework::Scope &recv_scope = scope.NewScope();
@@ -328,9 +331,8 @@ void ListenAndServOp::RunImpl(const framework::Scope &scope,
   rpc_service_->WaitServerReady();

   // Write to a file of server selected port for python use.
-  std::string file_path =
-      string::Sprintf("/tmp/paddle.%d.selected_port",
-                      static_cast<int>(::getpid()));
+  std::string file_path = string::Sprintf("/tmp/paddle.%d.selected_port",
+                                          static_cast<int>(::getpid()));
   SavePort(file_path);
   if (sync_mode) {
     RunSyncLoop(&executor, program, &recv_scope, prefetch_block);
diff --git a/paddle/fluid/platform/profiler.cc b/paddle/fluid/platform/profiler.cc
index 412cdda286..ac16e4cd59 100644
--- a/paddle/fluid/platform/profiler.cc
+++ b/paddle/fluid/platform/profiler.cc
@@ -13,12 +13,15 @@ See the License for the specific language governing permissions and
 limitations under the License. */

 #include "paddle/fluid/platform/profiler.h"
+
 #include <iomanip>
 #include <map>
 #include <mutex>  // NOLINT
+#include <limits>
+#include <random>
 #include <string>
 #ifdef PADDLE_WITH_CUDA
@@ -33,6 +36,9 @@ namespace platform {

 struct EventList;

+static int64_t profiler_lister_id = 0;
+static bool should_send_profile_state = false;
+
 // The profiler state, the initial value is ProfilerState::kDisabled
 static ProfilerState g_state = ProfilerState::kDisabled;
 // The thread local event list only can be accessed by the specific thread
@@ -219,13 +225,12 @@ void EnableProfiler(ProfilerState state) {
   PADDLE_ENFORCE(state != ProfilerState::kDisabled,
                  "Can't enable profiling, since the input state is ",
                  "ProfilerState::kDisabled");
-  PADDLE_ENFORCE(g_state == ProfilerState::kDisabled,
-                 "The profiling state should be disabled when calling ",
-                 "EnableProfiler.");
-  g_state = state;
-  if (g_state == ProfilerState::kAll) {
-    GetDeviceTracer()->Enable();
+  if (state == g_state) {
+    return;
   }
+  g_state = state;
+  should_send_profile_state = true;
+  GetDeviceTracer()->Enable();
 #ifdef PADDLE_WITH_CUDA
   if (g_state == ProfilerState::kCUDA) {
     // Generate some dummy events first to reduce the startup overhead.
@@ -435,8 +440,7 @@ void ParseEvents(const std::vector<std::vector<Event>>& events,

 void DisableProfiler(EventSortingKey sorted_key,
                      const std::string& profile_path) {
-  PADDLE_ENFORCE(g_state != ProfilerState::kDisabled,
-                 "Can't disable profiling, since it's not starting.");
+  if (g_state == ProfilerState::kDisabled) return;
   // Mark the profiling stop.
   Mark("_stop_profiler_", nullptr);

@@ -444,12 +448,25 @@ void DisableProfiler(EventSortingKey sorted_key,
   ParseEvents(all_events, sorted_key);
   ResetProfiler();
   DeviceTracer* tracer = GetDeviceTracer();
-  if (g_state == ProfilerState::kAll && tracer && tracer->IsEnabled()) {
+  if (tracer->IsEnabled()) {
     tracer->Disable();
     tracer->GenProfile(profile_path);
   }
   g_state = ProfilerState::kDisabled;
+  should_send_profile_state = true;
+}
+
+bool IsProfileEnabled() { return g_state != ProfilerState::kDisabled; }
+
+bool ShouldSendProfileState() { return should_send_profile_state; }
+
+void SetProfileLisener() {
+  std::mt19937 rng;
+  rng.seed(std::random_device()());
+  std::uniform_int_distribution<int64_t> dist6(
+      1, std::numeric_limits<int64_t>::max());
+  profiler_lister_id = dist6(rng);
 }
+
+int64_t ListenerId() { return profiler_lister_id; }

 }  // namespace platform
 }  // namespace paddle
diff --git a/paddle/fluid/platform/profiler.h b/paddle/fluid/platform/profiler.h
index 428d9ebcea..c8b8c258a8 100644
--- a/paddle/fluid/platform/profiler.h
+++ b/paddle/fluid/platform/profiler.h
@@ -114,5 +114,13 @@ void ResetProfiler();
 void DisableProfiler(EventSortingKey sorted_key,
                      const std::string& profile_path);

+// Test if the profiler is currently enabled.
+bool IsProfileEnabled();
+// Whether the trainer should send profiling state to PS.
+bool ShouldSendProfileState();
+// Mark the current process as a PS by assigning it a listener id.
+void SetProfileLisener();
+int64_t ListenerId();
+
 }  // namespace platform
 }  // namespace paddle
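SetProfileLisener gives each PS a random 64-bit id, so concurrent PS processes on one host dump to distinct /tmp/profile_ps_<id> files rather than clobbering each other. The C++ above amounts to this one-liner (Python rendering, for intuition only):

    import random

    listener_id = random.SystemRandom().randint(1, 2**63 - 1)
    profile_path = '/tmp/profile_ps_%d' % listener_id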
diff --git a/tools/timeline.py b/tools/timeline.py
index f4083c824e..8cd6353d46 100644
--- a/tools/timeline.py
+++ b/tools/timeline.py
@@ -22,7 +22,11 @@ import paddle.fluid.proto.profiler.profiler_pb2 as profiler_pb2

 parser = argparse.ArgumentParser(description=__doc__)
 parser.add_argument(
-    '--profile_path', type=str, default='', help='Input profile file name.')
+    '--profile_path',
+    type=str,
+    default='',
+    help='Input profile file name. If there are multiple files, the format '
+    'should be trainer1=file1,trainer2=file2,ps=file3')
 parser.add_argument(
     '--timeline_path', type=str, default='', help='Output timeline file name.')
 args = parser.parse_args()
@@ -108,8 +112,8 @@ class _ChromeTraceFormatter(object):


 class Timeline(object):
-    def __init__(self, profile_pb):
-        self._profile_pb = profile_pb
+    def __init__(self, profile_dict):
+        self._profile_dict = profile_dict
         self._pid = 0
         self._devices = dict()
         self._chrome_trace = _ChromeTraceFormatter()
@@ -120,35 +124,37 @@ class Timeline(object):
         return cur_pid

     def _allocate_pids(self):
-        for event in self._profile_pb.events:
-            if event.type == profiler_pb2.Event.CPU:
-                if (event.device_id, "CPU") not in self._devices:
-                    pid = self._allocate_pid()
-                    self._devices[(event.device_id, "CPU")] = pid
-                    self._chrome_trace.emit_pid("cpu:block:%d" %
-                                                (event.device_id), pid)
-            elif event.type == profiler_pb2.Event.GPUKernel:
-                if (event.device_id, "GPUKernel") not in self._devices:
-                    pid = self._allocate_pid()
-                    self._devices[(event.device_id, "GPUKernel")] = pid
-                    self._chrome_trace.emit_pid("gpu:%d" % (event.device_id),
-                                                pid)
+        for k, profile_pb in self._profile_dict.iteritems():
+            for event in profile_pb.events:
+                if event.type == profiler_pb2.Event.CPU:
+                    if (k, event.device_id, "CPU") not in self._devices:
+                        pid = self._allocate_pid()
+                        self._devices[(k, event.device_id, "CPU")] = pid
+                        self._chrome_trace.emit_pid("%s:cpu:block:%d" %
+                                                    (k, event.device_id), pid)
+                elif event.type == profiler_pb2.Event.GPUKernel:
+                    if (k, event.device_id, "GPUKernel") not in self._devices:
+                        pid = self._allocate_pid()
+                        self._devices[(k, event.device_id, "GPUKernel")] = pid
+                        self._chrome_trace.emit_pid("%s:gpu:%d" %
+                                                    (k, event.device_id), pid)

     def _allocate_events(self):
-        for event in self._profile_pb.events:
-            if event.type == profiler_pb2.Event.CPU:
-                type = "CPU"
-            elif event.type == profiler_pb2.Event.GPUKernel:
-                type = "GPUKernel"
-            pid = self._devices[(event.device_id, type)]
-            args = {'name': event.name}
-            if event.memcopy.bytes > 0:
-                args = {'mem_bytes': event.memcopy.bytes}
-            # TODO(panyx0718): Chrome tracing only handles ms. However,
-            # some ops take microseconds. Hence, we keep the ns here.
-            self._chrome_trace.emit_region(
-                event.start_ns, (event.end_ns - event.start_ns) / 1.0, pid,
-                event.sub_device_id, 'Op', event.name, args)
+        for k, profile_pb in self._profile_dict.iteritems():
+            for event in profile_pb.events:
+                if event.type == profiler_pb2.Event.CPU:
+                    type = "CPU"
+                elif event.type == profiler_pb2.Event.GPUKernel:
+                    type = "GPUKernel"
+                pid = self._devices[(k, event.device_id, type)]
+                args = {'name': event.name}
+                if event.memcopy.bytes > 0:
+                    args = {'mem_bytes': event.memcopy.bytes}
+                # TODO(panyx0718): Chrome tracing only handles ms. However,
+                # some ops take microseconds. Hence, we keep the ns here.
+                self._chrome_trace.emit_region(
+                    event.start_ns, (event.end_ns - event.start_ns) / 1.0,
+                    pid, event.sub_device_id, 'Op', event.name, args)

     def generate_chrome_trace(self):
         self._allocate_pids()
@@ -163,11 +169,23 @@ timeline_path = '/tmp/timeline'
 if args.timeline_path:
     timeline_path = args.timeline_path

-with open(profile_path, 'r') as f:
-    profile_s = f.read()
-    profile_pb = profiler_pb2.Profile()
-    profile_pb.ParseFromString(profile_s)
-
-tl = Timeline(profile_pb)
+profile_paths = profile_path.split(',')
+profile_dict = dict()
+if len(profile_paths) == 1:
+    with open(profile_path, 'r') as f:
+        profile_s = f.read()
+        profile_pb = profiler_pb2.Profile()
+        profile_pb.ParseFromString(profile_s)
+    profile_dict['trainer'] = profile_pb
+else:
+    for profile_path in profile_paths:
+        k, v = profile_path.split('=')
+        with open(v, 'r') as f:
+            profile_s = f.read()
+            profile_pb = profiler_pb2.Profile()
+            profile_pb.ParseFromString(profile_s)
+        profile_dict[k] = profile_pb
+
+tl = Timeline(profile_dict)
 with open(timeline_path, 'w') as f:
     f.write(tl.generate_chrome_trace())
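End-to-end, producing a combined timeline is now two steps: run the trainer with --profile (the new proto flag flips the PS profilers on and off in lockstep), then merge the dumps with the updated tool. A sketch with hypothetical file names (the trainer writes /tmp/profile_vgg per the script above; each PS writes /tmp/profile_ps_<listener_id>):

    import subprocess

    subprocess.check_call([
        'python', 'tools/timeline.py',
        '--profile_path',
        'trainer0=/tmp/profile_vgg,ps0=/tmp/profile_ps_4242',
        '--timeline_path', '/tmp/timeline.json',
    ])
    # Open chrome://tracing and load /tmp/timeline.json; each key in
    # profile_path becomes its own process row in the trace.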
From 9927413991bd16e4fd16eaf30531885097457553 Mon Sep 17 00:00:00 2001
From: Xin Pan
Date: Fri, 4 May 2018 10:14:09 +0800
Subject: [PATCH 02/13] remove version change

---
 cmake/external/grpc.cmake | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/cmake/external/grpc.cmake b/cmake/external/grpc.cmake
index ef520b1287..e90948782b 100644
--- a/cmake/external/grpc.cmake
+++ b/cmake/external/grpc.cmake
@@ -33,7 +33,7 @@ ExternalProject_Add(
     extern_grpc
     DEPENDS protobuf zlib
     GIT_REPOSITORY "https://github.com/grpc/grpc.git"
-    GIT_TAG "v1.8.x"
+    GIT_TAG "v1.10.x"
     PREFIX ${GRPC_SOURCES_DIR}
     UPDATE_COMMAND ""
     CONFIGURE_COMMAND ""

From 5a9f17f02b37ed369c37d44a516faadc66b6d15a Mon Sep 17 00:00:00 2001
From: Xin Pan
Date: Fri, 4 May 2018 15:51:43 +0800
Subject: [PATCH 03/13] clean up

---
 paddle/fluid/operators/listen_and_serv_op.cc | 2 +-
 paddle/fluid/platform/profiler.cc            | 2 +-
 paddle/fluid/platform/profiler.h             | 2 +-
 3 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/paddle/fluid/operators/listen_and_serv_op.cc b/paddle/fluid/operators/listen_and_serv_op.cc
index 470a567e8b..8acbf82025 100644
--- a/paddle/fluid/operators/listen_and_serv_op.cc
+++ b/paddle/fluid/operators/listen_and_serv_op.cc
@@ -296,7 +296,7 @@ void ListenAndServOp::RunAsyncLoop(framework::Executor *executor,
 void ListenAndServOp::RunImpl(const framework::Scope &scope,
                               const platform::Place &dev_place) const {
   // Mark this process as a PS; it decides whether to profile by listening to the trainer.
-  platform::SetProfileLisener();
+  platform::SetProfileListener();
   platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance();
   auto &dev_ctx = *pool.Get(dev_place);
   framework::Scope &recv_scope = scope.NewScope();
diff --git a/paddle/fluid/platform/profiler.cc b/paddle/fluid/platform/profiler.cc
index ac16e4cd59..cfddd8e871 100644
--- a/paddle/fluid/platform/profiler.cc
+++ b/paddle/fluid/platform/profiler.cc
@@ -459,7 +459,7 @@ void DisableProfiler(EventSortingKey sorted_key,
 bool IsProfileEnabled() { return g_state != ProfilerState::kDisabled; }
 bool ShouldSendProfileState() { return should_send_profile_state; }

-void SetProfileLisener() {
+void SetProfileListener() {
   std::mt19937 rng;
   rng.seed(std::random_device()());
   std::uniform_int_distribution<int64_t> dist6(
diff --git a/paddle/fluid/platform/profiler.h b/paddle/fluid/platform/profiler.h
index c8b8c258a8..61b98143e4 100644
--- a/paddle/fluid/platform/profiler.h
+++ b/paddle/fluid/platform/profiler.h
@@ -119,7 +119,7 @@ bool IsProfileEnabled();
 // Whether the trainer should send profiling state to PS.
 bool ShouldSendProfileState();
 // Mark the current process as a PS by assigning it a listener id.
-void SetProfileLisener();
+void SetProfileListener();
 int64_t ListenerId();

 }  // namespace platform
 }  // namespace paddle

From cdd52f3a30c70f98044ad3dd7a86cd27b5c6071d Mon Sep 17 00:00:00 2001
From: Xin Pan
Date: Sat, 5 May 2018 20:58:04 +0800
Subject: [PATCH 04/13] Add comment to explain how to run inference test

---
 paddle/fluid/inference/tests/book/CMakeLists.txt | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/paddle/fluid/inference/tests/book/CMakeLists.txt b/paddle/fluid/inference/tests/book/CMakeLists.txt
index 97d9f03f88..ec5ca4a70f 100644
--- a/paddle/fluid/inference/tests/book/CMakeLists.txt
+++ b/paddle/fluid/inference/tests/book/CMakeLists.txt
@@ -24,6 +24,10 @@ function(inference_test TARGET_NAME)
   endforeach()
 endfunction(inference_test)

+####################
+# Inference tests here depend on fluid/tests/book.
+# Users need to run tests in fluid/tests/book first to generate the saved model.
+####################
 # This unittest is buggy!
 #inference_test(fit_a_line)
 inference_test(image_classification ARGS vgg resnet)

From cd54a31cc88fa1d4feb5215f0c36e3a84971e972 Mon Sep 17 00:00:00 2001
From: qiaolongfei
Date: Mon, 7 May 2018 11:04:56 +0800
Subject: [PATCH 05/13] fix fluid Metric

---
 python/paddle/fluid/metrics.py | 25 +++++++++++++------------
 1 file changed, 13 insertions(+), 12 deletions(-)

diff --git a/python/paddle/fluid/metrics.py b/python/paddle/fluid/metrics.py
index c618b02a76..1301b6f961 100644
--- a/python/paddle/fluid/metrics.py
+++ b/python/paddle/fluid/metrics.py
@@ -251,7 +251,7 @@ class EditDistance(MetricBase):
         self.instance_error += seq_num - seq_right_count
         self.total_distance += total_distance

-    def eval():
+    def eval(self):
         if self.seq_num == 0:
             raise ValueError(
                 "There is no data in EditDistance Metric. Please check layers.edit_distance output has been added to EditDistance."
@@ -340,8 +340,8 @@ class Auc(MetricBase):
             raise ValueError("The 'predictions' must be a numpy ndarray.")

         kepsilon = 1e-7  # to account for floating point imprecisions
-        thresholds = [(i + 1) * 1.0 / (num_thresholds - 1)
-                      for i in range(num_thresholds - 2)]
+        thresholds = [(i + 1) * 1.0 / (self._num_thresholds - 1)
+                      for i in range(self._num_thresholds - 2)]
         thresholds = [0.0 - kepsilon] + thresholds + [1.0 + kepsilon]

         # calculate TP, FN, TN, FP count
@@ -358,19 +358,20 @@ class Auc(MetricBase):
                         fp += 1
                     else:
                         tn += 1
-            tp_list[idx_thresh] += tp
-            fn_list[idx_thresh] += fn
-            tn_list[idx_thresh] += tn
-            fp_list[idx_thresh] += fp
+            self.tp_list[idx_thresh] += tp
+            self.fn_list[idx_thresh] += fn
+            self.tn_list[idx_thresh] += tn
+            self.fp_list[idx_thresh] += fp

     def eval(self):
         epsilon = self._epsilon
         num_thresholds = self._num_thresholds
-        tpr = (tp_list.astype("float32") + epsilon) / (
-            tp_list + fn_list + epsilon)
-        fpr = fp_list.astype("float32") / (fp_list + tn_list + epsilon)
-        rec = (tp_list.astype("float32") + epsilon) / (
-            tp_list + fp_list + epsilon)
+        tpr = (self.tp_list.astype("float32") + epsilon) / (
+            self.tp_list + self.fn_list + epsilon)
+        fpr = self.fp_list.astype("float32") / (
+            self.fp_list + self.tn_list + epsilon)
+        rec = (self.tp_list.astype("float32") + epsilon) / (
+            self.tp_list + self.fp_list + epsilon)

         x = fpr[:num_thresholds - 1] - fpr[1:]
         y = (tpr[:num_thresholds - 1] + tpr[1:]) / 2.0
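All of the Auc and EditDistance fixes are of the same family: eval() missing self, or bare locals (tp_list, num_thresholds) where attributes (self.tp_list, self._num_thresholds) were meant, so they only surface at runtime. A smoke test along these lines would have caught them (a rough sketch; check the update() signatures against the class definitions before relying on it):

    import numpy as np
    import paddle.fluid.metrics as metrics

    auc = metrics.Auc(name='auc')
    preds = np.array([[0.3, 0.7], [0.9, 0.1]])
    labels = np.array([[1], [0]])
    auc.update(preds, labels)
    print(auc.eval())  # NameError on bare tp_list before this patch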
From 171d3e861c51240940c5e33dd213d286cfb790a3 Mon Sep 17 00:00:00 2001
From: qiaolongfei
Date: Mon, 7 May 2018 11:10:12 +0800
Subject: [PATCH 06/13] fix CompositeMetric

---
 python/paddle/fluid/metrics.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/python/paddle/fluid/metrics.py b/python/paddle/fluid/metrics.py
index 1301b6f961..7f9e958a8e 100644
--- a/python/paddle/fluid/metrics.py
+++ b/python/paddle/fluid/metrics.py
@@ -116,7 +116,7 @@ class CompositeMetric(MetricBase):
         super(CompositeMetric, self).__init__(name, kwargs)
         self._metrics = []

-    def add_metric(self, metric):
+    def update(self, metric):
         if not isinstance(metric, MetricBase):
             raise ValueError("SubMetric should inherit from MetricBase.")
         self._metrics.append(metric)
@@ -280,6 +280,7 @@ class DetectionMAP(MetricBase):
         super(DetectionMAP, self).__init__(name)
         # the current map value
         self.value = .0
+        self.weight = .0

     def update(self, value, weight):
         if not _is_number_or_matrix_(value):

From d1ea74d3b99e227f89ae5f9a4130a57abdfaa283 Mon Sep 17 00:00:00 2001
From: Xin Pan
Date: Mon, 7 May 2018 13:07:48 +0800
Subject: [PATCH 07/13] follow comments

---
 paddle/fluid/operators/detail/variable_response.cc | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/paddle/fluid/operators/detail/variable_response.cc b/paddle/fluid/operators/detail/variable_response.cc
index 335491e95d..f4a374d56d 100644
--- a/paddle/fluid/operators/detail/variable_response.cc
+++ b/paddle/fluid/operators/detail/variable_response.cc
@@ -434,8 +434,8 @@ int VariableResponse::Parse(Source* source) {
           return tag;
         }
         meta_.set_profile(profiling);
-        int64_t lisner_id = platform::ListenerId();
-        if (lisner_id <= 0) {
+        int64_t listener_id = platform::ListenerId();
+        if (listener_id <= 0) {
           break;
         }
         if (profiling && !platform::IsProfileEnabled()) {
@@ -444,7 +444,7 @@ int VariableResponse::Parse(Source* source) {
           // TODO(panyx0718): Should we allow to customize file dir.
           platform::DisableProfiler(
               platform::EventSortingKey::kDefault,
-              string::Sprintf("/tmp/profile_ps_%lld", lisner_id));
+              string::Sprintf("/tmp/profile_ps_%lld", listener_id));
         }
         break;
       }

From 7f37060879a8e10ee92028f92bef7346afb86a13 Mon Sep 17 00:00:00 2001
From: qiaolongfei
Date: Mon, 7 May 2018 13:11:35 +0800
Subject: [PATCH 08/13] revert CompositeMetric::add_metric

---
 python/paddle/fluid/metrics.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/python/paddle/fluid/metrics.py b/python/paddle/fluid/metrics.py
index 7f9e958a8e..bb9c6fdc60 100644
--- a/python/paddle/fluid/metrics.py
+++ b/python/paddle/fluid/metrics.py
@@ -116,7 +116,7 @@ class CompositeMetric(MetricBase):
         super(CompositeMetric, self).__init__(name, kwargs)
         self._metrics = []

-    def update(self, metric):
+    def add_metric(self, metric):
         if not isinstance(metric, MetricBase):
             raise ValueError("SubMetric should inherit from MetricBase.")
         self._metrics.append(metric)

From 9fccf46270cee6a60b0ab0a0939764dcf6f2199f Mon Sep 17 00:00:00 2001
From: Xin Pan
Date: Mon, 7 May 2018 13:11:45 +0800
Subject: [PATCH 09/13] reword comments

---
 paddle/fluid/inference/tests/book/CMakeLists.txt | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/paddle/fluid/inference/tests/book/CMakeLists.txt b/paddle/fluid/inference/tests/book/CMakeLists.txt
index ec5ca4a70f..cc179a8625 100644
--- a/paddle/fluid/inference/tests/book/CMakeLists.txt
+++ b/paddle/fluid/inference/tests/book/CMakeLists.txt
@@ -25,8 +25,9 @@ function(inference_test TARGET_NAME)
 endfunction(inference_test)

 ####################
-# Inference tests here depend on fluid/tests/book.
-# Users need to run tests in fluid/tests/book first to generate the saved model.
+# Inference tests here depend on fluid/tests/book. If users want to run
+# an individual test with ctest, they need to run the tests in
+# fluid/tests/book first to generate the saved model.
 ####################
 # This unittest is buggy!
 #inference_test(fit_a_line)
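The reworded comment is about test ordering: the C++ inference tests load models that the Python book tests save to disk. To run a single inference test through ctest, generate its model first; for example (script names are hypothetical, substitute the actual book test file):

    import subprocess

    # Save the model that test_inference_image_classification expects,
    # then the ctest target can be run on its own.
    subprocess.check_call(
        ['python', 'test_image_classification.py'],
        cwd='python/paddle/fluid/tests/book')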
From 2a2c83b9e6b1b818edc3a0d67cc21225922e290c Mon Sep 17 00:00:00 2001
From: Yan Chunwei
Date: Mon, 7 May 2018 14:35:20 +0800
Subject: [PATCH 10/13] feature/convert tensorrt io (#10440)

* init
* add ut
* split singleton from base class
* add singleton

---
 .../fluid/inference/tensorrt/CMakeLists.txt   |  1 +
 .../fluid/inference/tensorrt/io_converter.cc  | 57 +++++++++++++++
 .../fluid/inference/tensorrt/io_converter.h   | 66 +++++++++++++++++
 .../inference/tensorrt/test_io_converter.cc   | 53 ++++++++++++++
 paddle/fluid/inference/utils/singleton.h      | 73 +++++++++++++++++++
 5 files changed, 250 insertions(+)
 create mode 100644 paddle/fluid/inference/tensorrt/io_converter.cc
 create mode 100644 paddle/fluid/inference/tensorrt/io_converter.h
 create mode 100644 paddle/fluid/inference/tensorrt/test_io_converter.cc
 create mode 100644 paddle/fluid/inference/utils/singleton.h

diff --git a/paddle/fluid/inference/tensorrt/CMakeLists.txt b/paddle/fluid/inference/tensorrt/CMakeLists.txt
index 288789d6e4..c8b656394b 100644
--- a/paddle/fluid/inference/tensorrt/CMakeLists.txt
+++ b/paddle/fluid/inference/tensorrt/CMakeLists.txt
@@ -1,4 +1,5 @@
 nv_test(test_tensorrt SRCS test_tensorrt.cc DEPS dynload_cuda device_context dynamic_loader)
 nv_test(test_tensorrt_engine SRCS test_engine.cc engine.cc DEPS dynload_cuda)
+nv_test(test_io_converter SRCS test_io_converter.cc io_converter.cc DEPS dynload_cuda dynamic_loader lod_tensor)
 set(ENGINE_FILE ${CMAKE_CURRENT_SOURCE_DIR}/engine.cc)
 add_subdirectory(convert)
diff --git a/paddle/fluid/inference/tensorrt/io_converter.cc b/paddle/fluid/inference/tensorrt/io_converter.cc
new file mode 100644
index 0000000000..2baac96c26
--- /dev/null
+++ b/paddle/fluid/inference/tensorrt/io_converter.cc
@@ -0,0 +1,57 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/fluid/inference/tensorrt/io_converter.h"
+#include <cuda.h>
+#include "paddle/fluid/platform/enforce.h"
+
+namespace paddle {
+namespace inference {
+namespace tensorrt {
+
+using platform::is_gpu_place;
+using platform::is_cpu_place;
+
+class DefaultInputConverter : public EngineInputConverter {
+ public:
+  DefaultInputConverter() {}
+  // NOTE `out` is GPU memory.
+  virtual void operator()(const LoDTensor& in, void* out,
+                          size_t max_size) override {
+    PADDLE_ENFORCE(out != nullptr);
+    PADDLE_ENFORCE_LE(in.memory_size(), max_size);
+    const auto& place = in.place();
+    if (is_cpu_place(place)) {
+      PADDLE_ENFORCE(stream_ != nullptr);
+      PADDLE_ENFORCE_EQ(
+          0, cudaMemcpyAsync(out, in.data<float>(), in.memory_size(),
+                             cudaMemcpyHostToDevice, *stream_));
+    } else if (is_gpu_place(place)) {
+      PADDLE_ENFORCE_EQ(
+          0, cudaMemcpyAsync(out, in.data<float>(), in.memory_size(),
+                             cudaMemcpyDeviceToDevice, *stream_));
+    } else {
+      PADDLE_THROW("Unknown device for converter");
+    }
+    cudaStreamSynchronize(*stream_);
+  }
+};
+
+REGISTER_TENSORRT_INPUT_CONVERTER(mul, DefaultInputConverter);
+
+}  // namespace tensorrt
+}  // namespace inference
+}  // namespace paddle
diff --git a/paddle/fluid/inference/tensorrt/io_converter.h b/paddle/fluid/inference/tensorrt/io_converter.h
new file mode 100644
index 0000000000..6ea61cbbac
--- /dev/null
+++ b/paddle/fluid/inference/tensorrt/io_converter.h
@@ -0,0 +1,66 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include <string>
+#include "paddle/fluid/framework/lod_tensor.h"
+#include "paddle/fluid/inference/utils/singleton.h"
+
+namespace paddle {
+namespace inference {
+namespace tensorrt {
+
+using framework::LoDTensor;
+
+/*
+ * Convert Input from Fluid to an Engine.
+ * TensorRT's ITensor follows row major, NCHW. Fluid is also row major, so in
+ * most cases we just need to copy the data.
+ */
+class EngineInputConverter {
+ public:
+  EngineInputConverter() {}
+
+  virtual void operator()(const LoDTensor& in, void* out, size_t max_size) {}
+
+  void SetStream(cudaStream_t* stream) { stream_ = stream; }
+
+  static void Run(const std::string& in_op_type, const LoDTensor& in,
+                  void* out, size_t max_size, cudaStream_t* stream) {
+    PADDLE_ENFORCE(stream != nullptr);
+    auto* converter = Registry<EngineInputConverter>::Lookup(in_op_type);
+    PADDLE_ENFORCE_NOT_NULL(converter);
+    converter->SetStream(stream);
+    (*converter)(in, out, max_size);
+  }
+
+  virtual ~EngineInputConverter() {}
+
+ protected:
+  cudaStream_t* stream_{nullptr};
+};
+
+}  // namespace tensorrt
+}  // namespace inference
+}  // namespace paddle
+
+#define REGISTER_TENSORRT_INPUT_CONVERTER(in_op_type__, Converter__)   \
+  struct trt_input_##in_op_type__##_converter {                        \
+    trt_input_##in_op_type__##_converter() {                           \
+      ::paddle::inference::Registry<EngineInputConverter>::Register<   \
+          Converter__>(#in_op_type__);                                 \
+    }                                                                  \
+  };                                                                   \
+  trt_input_##in_op_type__##_converter trt_input_##in_op_type__##_converter__;
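REGISTER_TENSORRT_INPUT_CONVERTER is the usual static-registration trick: defining a file-scope dummy struct whose constructor runs at load time inserts the converter into the Registry, so EngineInputConverter::Run can look converters up by op type without a central list. The same pattern in Python, for illustration only:

    _converters = {}

    def register_converter(op_type):
        def deco(cls):
            assert op_type not in _converters, op_type
            _converters[op_type] = cls()   # mirrors Registry::Register
            return cls
        return deco

    @register_converter('mul')
    class DefaultInputConverter(object):
        def __call__(self, tensor, out, max_size):
            pass  # host-to-device copy in the real converter

    converter = _converters['mul']         # mirrors Registry::Lookup
    converter(None, None, 0)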
diff --git a/paddle/fluid/inference/tensorrt/test_io_converter.cc b/paddle/fluid/inference/tensorrt/test_io_converter.cc
new file mode 100644
index 0000000000..365e936686
--- /dev/null
+++ b/paddle/fluid/inference/tensorrt/test_io_converter.cc
@@ -0,0 +1,53 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/fluid/framework/lod_tensor.h"
+#include "paddle/fluid/inference/tensorrt/io_converter.h"
+
+#include <gtest/gtest.h>
+
+namespace paddle {
+namespace inference {
+namespace tensorrt {
+
+class EngineInputConverterTester : public ::testing::Test {
+ public:
+  void SetUp() override { tensor.Resize({10, 10}); }
+
+  framework::LoDTensor tensor;
+};
+
+TEST_F(EngineInputConverterTester, DefaultCPU) {
+  void* buffer;
+  tensor.mutable_data<float>(platform::CPUPlace());
+  ASSERT_EQ(cudaMalloc(&buffer, tensor.memory_size()), 0);
+
+  cudaStream_t stream;
+  EngineInputConverter::Run("mul", tensor, buffer, tensor.memory_size(),
+                            &stream);
+}
+
+TEST_F(EngineInputConverterTester, DefaultGPU) {
+  void* buffer;
+  tensor.mutable_data<float>(platform::CUDAPlace());
+  ASSERT_EQ(cudaMalloc(&buffer, tensor.memory_size()), 0);
+
+  cudaStream_t stream;
+  EngineInputConverter::Run("mul", tensor, buffer, tensor.memory_size(),
+                            &stream);
+}
+
+}  // namespace tensorrt
+}  // namespace inference
+}  // namespace paddle
diff --git a/paddle/fluid/inference/utils/singleton.h b/paddle/fluid/inference/utils/singleton.h
new file mode 100644
index 0000000000..f05921067c
--- /dev/null
+++ b/paddle/fluid/inference/utils/singleton.h
@@ -0,0 +1,73 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include <string>
+#include <unordered_map>
+#include "paddle/fluid/platform/enforce.h"
+
+namespace paddle {
+namespace inference {
+
+// NOTE not thread-safe.
+template <typename T>
+struct Singleton {
+  static T& Global() {
+    static T* x = new T;
+    return *x;
+  }
+
+  Singleton() = delete;
+  Singleton& operator=(const Singleton&) = delete;
+};
+
+/*
+ * A registry for any type.
+ * NOTE not thread-safe.
+ */ +template +struct Registry { + static Registry& Global() { + static auto* x = new Registry; + return *x; + } + + template + static void Register(const std::string& name) { + PADDLE_ENFORCE_EQ(items_.count(name), 0); + items_[name] = new ItemChild; + } + + static ItemParent* Lookup(const std::string& name) { + auto it = items_.find(name); + if (it == items_.end()) return nullptr; + return it->second; + } + + ~Registry() { + for (auto& item : items_) { + delete item.second; + } + } + + private: + Registry() = default; + static std::unordered_map items_; +}; + +template +std::unordered_map Registry::items_; + +} // namespace inference +} // namespace paddle From bb3247e33973ca02d900421e7f823214f4b0a067 Mon Sep 17 00:00:00 2001 From: Yancey Date: Mon, 7 May 2018 15:14:08 +0800 Subject: [PATCH 11/13] fix traner.py import error (#10442) --- python/paddle/fluid/trainer.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/paddle/fluid/trainer.py b/python/paddle/fluid/trainer.py index 8252592c8c..a9fa2359e0 100644 --- a/python/paddle/fluid/trainer.py +++ b/python/paddle/fluid/trainer.py @@ -22,7 +22,7 @@ import io # optimizer is same as the parameter of Trainer.__init__. Rename it to opt_module import optimizer as opt_module -import distribute_transpiler +from transpiler import distribute_transpiler __all__ = [ 'Trainer', From 5b06944857e74f9b1388e081d4502bfd8c002832 Mon Sep 17 00:00:00 2001 From: Yancey Date: Mon, 7 May 2018 18:55:39 +0800 Subject: [PATCH 12/13] fix trainer import error on ce (#10448) * fix trainer import error on ce * fix setup.py.in --- python/paddle/fluid/__init__.py | 1 + python/paddle/fluid/trainer.py | 1 + python/setup.py.in | 3 ++- 3 files changed, 4 insertions(+), 1 deletion(-) diff --git a/python/paddle/fluid/__init__.py b/python/paddle/fluid/__init__.py index 37d3689467..c8a435748d 100644 --- a/python/paddle/fluid/__init__.py +++ b/python/paddle/fluid/__init__.py @@ -60,6 +60,7 @@ __all__ = framework.__all__ + executor.__all__ + concurrency.__all__ +\ 'io', 'initializer', 'layers', + 'transpiler' 'nets', 'optimizer', 'learning_rate_decay', diff --git a/python/paddle/fluid/trainer.py b/python/paddle/fluid/trainer.py index a9fa2359e0..1cbecd69e5 100644 --- a/python/paddle/fluid/trainer.py +++ b/python/paddle/fluid/trainer.py @@ -19,6 +19,7 @@ import executor import data_feeder import contextlib import io +import transpiler # optimizer is same as the parameter of Trainer.__init__. Rename it to opt_module import optimizer as opt_module diff --git a/python/setup.py.in b/python/setup.py.in index a811b509a9..c42601d335 100644 --- a/python/setup.py.in +++ b/python/setup.py.in @@ -68,7 +68,8 @@ packages=['paddle', 'paddle.fluid', 'paddle.fluid.proto', 'paddle.fluid.proto.profiler', - 'paddle.fluid.layers'] + 'paddle.fluid.layers', + 'paddle.fluid.transpiler'] if '${WITH_FLUID_ONLY}'== 'OFF': packages+=['paddle.proto', From f43b71b242467d665c134262c2b7167cef622757 Mon Sep 17 00:00:00 2001 From: whs Date: Mon, 7 May 2018 19:20:38 +0800 Subject: [PATCH 13/13] Fix clone function of Program to avoid memory leak. (#10358) * Fix clone function of Program to avoid memory leak. * Fix inference_optimize function of framework.py. * Reuse inference_optimize in framework.py. * Add comments. 
From f43b71b242467d665c134262c2b7167cef622757 Mon Sep 17 00:00:00 2001
From: whs
Date: Mon, 7 May 2018 19:20:38 +0800
Subject: [PATCH 13/13] Fix clone function of Program to avoid memory leak.
 (#10358)

* Fix clone function of Program to avoid memory leak.
* Fix inference_optimize function of framework.py.
* Reuse inference_optimize in framework.py.
* Add comments.

---
 python/paddle/fluid/framework.py | 21 +++++++++++++++------
 1 file changed, 15 insertions(+), 6 deletions(-)

diff --git a/python/paddle/fluid/framework.py b/python/paddle/fluid/framework.py
index ce9b880aeb..d7eda619c3 100644
--- a/python/paddle/fluid/framework.py
+++ b/python/paddle/fluid/framework.py
@@ -1042,13 +1042,14 @@ class Program(object):

         Returns(Program): The cloned Program object.
         """
-        p = Program()
         if for_test:
-            p.desc = core.inference_optimize(self.desc)
+            p = self.inference_optimize()
         else:
+            p = Program()
             p.desc = core.ProgramDesc(self.desc)
-        p.blocks = [Block(p, i) for i in xrange(self.desc.num_blocks())]
-        p.sync_with_cpp()
+            p.blocks = [Block(p, i) for i in xrange(self.desc.num_blocks())]
+            p.sync_with_cpp()
+
         p.copy_param_info_from(self)
         return p

@@ -1061,7 +1062,7 @@ class Program(object):
             if isinstance(t, Variable):
                 # After transpiler processing, the op that output this
                 # variable may have been changed, so t.op is not reliable
-                # and we need to find the current op that generates this
+                # and we need to find the current op that generates this
                 # variable here.
                 t.op = None
                 global_block = self.global_block()
@@ -1087,8 +1088,16 @@ class Program(object):
         return res

     def inference_optimize(self):
+        # This is an alternative implementation, used until
+        # core.inference_optimize is fixed.
         res = Program()
-        res.desc = core.inference_optimize(self.desc)
+        res.desc = core.ProgramDesc(self.desc)
+        for i in xrange(res.desc.num_blocks()):
+            block = res.desc.block(i)
+            for j in xrange(block.op_size()):
+                op = block.op(j)
+                if op.has_attr('is_test'):
+                    op.set_attr('is_test', True)
         res.blocks = [Block(res, i) for i in xrange(res.desc.num_blocks())]
         res.sync_with_cpp()
         return res
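For users, the visible contract of patch 13 is unchanged: clone(for_test=True) still returns a forward-only program, but it now builds it in Python by setting is_test=True on ops that carry the attribute instead of routing through the leaky core.inference_optimize binding, and copy_param_info_from runs for both branches so parameter attributes survive cloning. Typical usage (minimal sketch):

    import paddle.fluid as fluid

    main = fluid.default_main_program()
    # ... build the network and its loss on `main` ...
    # Clone before the optimizer adds backward ops, so the test
    # program stays forward-only:
    test_program = main.clone(for_test=True)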