Multi-thread HandleRequest

Experiment on VGG with the flowers dataset: 2 trainers, 1 PS.
    More trainers should yield a larger speedup.

    After:
    Pass = 0, Iters = 327, Speed = (7.52) img/s
    Before:
    Pass = 0, Iters = 385, Speed = (6.77) img/s
shanyi15-patch-3
Xin Pan 7 years ago
parent ebefdbe372
commit b4dd4c048d
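The heart of the change is to drain each gRPC completion queue with several worker threads instead of one; the new t_sends_/t_gets_ thread vectors in grpc_server.h below hold those threads (the suppressed large diff is presumably the grpc_server.cc implementation). The following is a minimal standalone sketch of that pattern, with a trivial dispatch body; Drain and StartWorkers are illustrative names, not the actual PaddlePaddle API:

#include <grpc++/grpc++.h>

#include <memory>
#include <thread>
#include <vector>

// Each worker blocks on Next() and handles whatever completion event it
// wins; running several workers per queue lets requests overlap.
void Drain(::grpc::ServerCompletionQueue* cq) {
  void* tag = nullptr;
  bool ok = false;
  while (cq->Next(&tag, &ok)) {
    if (!ok) continue;
    // Dispatch on `tag` here (a RequestBase* in the real server).
  }
}

void StartWorkers(::grpc::ServerCompletionQueue* cq, int num_threads,
                  std::vector<std::unique_ptr<std::thread>>* workers) {
  for (int i = 0; i < num_threads; ++i) {
    workers->emplace_back(new std::thread(Drain, cq));
  }
}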

@@ -38,7 +38,7 @@ def str2bool(v):
 parser = argparse.ArgumentParser(description=__doc__)
 parser.add_argument(
-    '--batch_size', type=int, default=128, help="Batch size for training.")
+    '--batch_size', type=int, default=16, help="Batch size for training.")
 parser.add_argument(
     '--learning_rate',
     type=float,
@@ -61,7 +61,7 @@ parser.add_argument(
 parser.add_argument(
     '--data_set',
     type=str,
-    default='cifar10',
+    default='flowers',
     choices=['cifar10', 'flowers'],
     help='Optional dataset for benchmark.')
 parser.add_argument(
@@ -200,26 +200,30 @@ def main():
                 fetch_list=[avg_cost, batch_acc, batch_size])
             return loss, acc, b_size

-        if args.profile and args.task_index == 0:
-            # warmup.
-            for batch_id, data in enumerate(train_reader()):
-                if batch_id > 5: break
-                run_step(batch_id, data)
-            with profiler.profiler('All', 'total', '/tmp/profile_vgg'):
+        if args.profile:
+            with profiler.profiler('All', 'total',
+                                   '/tmp/profile_vgg_%d' % args.task_index):
                 for batch_id, data in enumerate(train_reader()):
-                    if batch_id > 5: break
+                    if batch_id > 4: break
                     run_step(batch_id, data)

+        total_time = 0.0
+        count = 0
         for batch_id, data in enumerate(train_reader()):
             ts = time.time()
             loss, acc, b_size = run_step(batch_id, data)
             iters += 1
             num_samples += len(data)
             train_pass_acc.add(value=acc, weight=b_size)
+            duration = time.time() - ts
+            total_time += duration
+            count += len(data)
             print(
                 "Pass = %d, Iters = %d, Loss = %f, Accuracy = %f, "
-                "Speed = %.2f img/s" % (pass_id, iters, loss, acc,
-                                        len(data) / (time.time() - ts))
+                "Speed = %.2f (%.2f) img/s" % (pass_id, iters, loss, acc,
+                                               len(data) / duration,
+                                               count / total_time)
             )  # The accuracy is the accumulation of batches, but not the current batch.
         pass_elapsed = time.time() - start_time

@@ -33,7 +33,7 @@ ExternalProject_Add(
     extern_grpc
     DEPENDS protobuf zlib
     GIT_REPOSITORY "https://github.com/grpc/grpc.git"
-    GIT_TAG "v1.10.x"
+    GIT_TAG "v1.8.x"
     PREFIX ${GRPC_SOURCES_DIR}
     UPDATE_COMMAND ""
     CONFIGURE_COMMAND ""

@@ -350,12 +350,9 @@ void Executor::RunPreparedContext(ExecutorPrepareContext* ctx, Scope* scope,
       }
     }
   }
-  platform::DeviceContextPool::Instance().Get(place_)->Wait();
+  // platform::DeviceContextPool::Instance().Get(place_)->Wait();
   if (create_vars && create_local_scope) {
     scope->DeleteScope(local_scope);
-  } else {
-    // Delete the local scopes created in operators.
-    scope->DropKids();
   }
   if (FLAGS_benchmark) {
     VLOG(2) << "-------------------------------------------------------";

@@ -19,6 +19,7 @@ limitations under the License. */
 #include <limits>

 #include "paddle/fluid/framework/threadpool.h"
+#include "paddle/fluid/platform/profiler.h"

 namespace paddle {
 namespace operators {
@@ -196,9 +197,14 @@ bool RPCClient::Wait() {
   const size_t kReqCnt = req_count_;
   bool a[kReqCnt];
   std::vector<std::future<void>> waits(req_count_);
+  std::mutex mu;

   for (int i = 0; i < req_count_; i++) {
-    waits[i] = framework::AsyncIO([i, &a, this] { a[i] = Proceed(); });
+    waits[i] = framework::AsyncIO([i, &a, &mu, this] {
+      bool ret = Proceed();
+      std::lock_guard<std::mutex> l(mu);
+      a[i] = ret;
+    });
   }

   for (int i = 0; i < req_count_; i++) {
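The mutex added above serializes the result writes now issued from multiple AsyncIO threads. A minimal sketch of the same guarded-write pattern, with std::async standing in for framework::AsyncIO and a constant standing in for Proceed():

#include <future>
#include <mutex>
#include <vector>

int main() {
  const int req_count = 4;
  std::vector<char> a(req_count, 0);  // result flags, like `bool a[kReqCnt]`
  std::mutex mu;
  std::vector<std::future<void>> waits;
  waits.reserve(req_count);
  for (int i = 0; i < req_count; ++i) {
    waits.push_back(std::async(std::launch::async, [i, &a, &mu] {
      bool ret = true;  // stand-in for the real RPC completion, Proceed()
      std::lock_guard<std::mutex> l(mu);  // guard the shared flag array
      a[i] = ret;
    }));
  }
  for (auto& w : waits) w.wait();  // join all in-flight requests
  return 0;
}

An array of std::atomic<bool> would be a lock-free alternative; the mutex keeps the change small.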

File diff suppressed because it is too large.

@@ -17,6 +17,7 @@ limitations under the License. */
 #include <string>
 #include <thread>  // NOLINT
 #include <utility>
+#include <vector>

 #include "grpc++/grpc++.h"
 #include "paddle/fluid/framework/blocking_queue.h"
@@ -30,6 +31,7 @@ limitations under the License. */
 #include "paddle/fluid/operators/detail/send_recv.grpc.pb.h"
 #include "paddle/fluid/operators/detail/send_recv.pb.h"
 #include "paddle/fluid/operators/detail/sendrecvop_utils.h"
+#include "paddle/fluid/platform/profiler.h"

 namespace paddle {
 namespace operators {
@@ -82,19 +84,25 @@ class AsyncGRPCServer final {
  protected:
   void HandleRequest(::grpc::ServerCompletionQueue *cq,
                      const std::string &cq_name,
-                     std::function<void()> TryToRegisterNewOne);
-  void TryToRegisterNewSendOne();
-  void TryToRegisterNewGetOne();
-  void TryToRegisterNewPrefetchOne();
+                     std::function<void(int)> TryToRegisterNewOne);
+  void TryToRegisterNewSendOne(int i);
+  void TryToRegisterNewGetOne(int i);
+  void TryToRegisterNewPrefetchOne(int i);
   void ShutdownQueue();

  private:
+  static const int kSendReqsBufSize = 100;
+  static const int kGetReqsBufSize = 100;
+
   std::mutex cq_mutex_;
   volatile bool is_shut_down_ = false;

   std::unique_ptr<::grpc::ServerCompletionQueue> cq_send_;
   std::unique_ptr<::grpc::ServerCompletionQueue> cq_get_;
   std::unique_ptr<::grpc::ServerCompletionQueue> cq_prefetch_;

+  RequestBase *send_reqs_[kSendReqsBufSize];
+  RequestBase *get_reqs_[kGetReqsBufSize];
+
   GrpcService::AsyncService service_;
   std::unique_ptr<::grpc::Server> server_;
@@ -113,8 +121,9 @@ class AsyncGRPCServer final {
   mutable int barrier_cond_step_;
   std::condition_variable barrier_condition_;

-  std::unique_ptr<std::thread> t_send_;
-  std::unique_ptr<std::thread> t_get_;
+  std::vector<std::unique_ptr<std::thread>> t_sends_;
+  std::vector<std::unique_ptr<std::thread>> t_gets_;
   std::unique_ptr<std::thread> t_prefetch_;

   std::unique_ptr<framework::ExecutorPrepareContext> prefetch_ctx_;

@@ -25,6 +25,8 @@
 #include <grpc++/support/byte_buffer.h>

 #include "paddle/fluid/operators/detail/variable_response.h"
+#include "paddle/fluid/platform/profiler.h"
+
 // NOTE: This method was originally created by tensorflow
 // (https://github.com/tensorflow/tensorflow/) we borrow this
 // method and did some modifications so that we can parse gRPC

@@ -73,7 +73,7 @@ message VariableMessage {
   // If true, the ps server will start profiling, the ps
   // server stops profiling and generates a profile to /tmp/profile_ps_*
   // when profile switches from true to false.
-  bool profile = 11;
+  int64 profile = 11;
 }

 message VoidMessage {}

@@ -122,7 +122,13 @@ void SerializeToByteBuffer(const std::string& name, framework::Variable* var,
   // 1 trainer returns true for ShouldSendProfileState(). It tells PS
   // servers the trainer's profiling state so that PS can follow the
   // trainer.
-  request.set_profile(platform::IsProfileEnabled());
+  if (platform::ShouldSendProfileState()) {
+    if (platform::IsProfileEnabled()) {
+      request.set_profile(1);
+    } else {
+      request.set_profile(2);
+    }
+  }
   if (!out_name.empty()) {
     request.set_out_varname(out_name);
   }
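Together with the proto change above, the int64 profile field now carries a three-way signal instead of a flag. A hypothetical helper (the name and enum are mine, not the commit's) spells out the encoding:

#include <cstdint>

// 0 (proto3 default, field left unset) = leave profiling alone,
// 1 = enable the profiler on the PS, 2 = disable it and dump a profile.
enum class ProfileSignal : std::int64_t { kNoop = 0, kEnable = 1, kDisable = 2 };

std::int64_t EncodeProfileSignal(bool should_send_state, bool profiler_on) {
  if (!should_send_state) {
    return static_cast<std::int64_t>(ProfileSignal::kNoop);
  }
  return static_cast<std::int64_t>(
      profiler_on ? ProfileSignal::kEnable : ProfileSignal::kDisable);
}

Because proto3 scalar fields default to 0 and are not serialized when unset, trainers other than trainer 0 implicitly send the no-op value.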

@@ -449,8 +449,8 @@ int VariableResponse::Parse(Source* source) {
         break;
       }
       case sendrecv::VariableMessage::kProfileFieldNumber: {
-        bool profiling;
-        if (!input.ReadRaw(reinterpret_cast<void*>(&profiling), 1)) {
+        uint64_t profiling = 0;
+        if (!input.ReadVarint64(&profiling)) {
           return tag;
         }
         meta_.set_profile(profiling);
@@ -458,9 +458,9 @@ int VariableResponse::Parse(Source* source) {
         if (listener_id <= 0) {
           break;
         }
-        if (profiling && !platform::IsProfileEnabled()) {
+        if (profiling == 1 && !platform::IsProfileEnabled()) {
           platform::EnableProfiler(platform::ProfilerState::kCPU);
-        } else if (!profiling && platform::IsProfileEnabled()) {
+        } else if (profiling == 2 && platform::IsProfileEnabled()) {
           // TODO(panyx0718): Should we allow to customize file dir.
           platform::DisableProfiler(
               platform::EventSortingKey::kDefault,
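The read side matches the new wire format: proto3 encodes an int64 field as a varint, so the old one-byte ReadRaw read no longer applies. A minimal sketch using protobuf's CodedInputStream (the free function is illustrative):

#include <google/protobuf/io/coded_stream.h>

#include <cstdint>

// Reads a varint-encoded field value, as the hunk above now does;
// returns false on a truncated or malformed varint.
bool ReadProfileField(google::protobuf::io::CodedInputStream* input,
                      std::uint64_t* profiling) {
  return input->ReadVarint64(profiling);
}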

@@ -245,7 +245,6 @@ class DeviceTracerImpl : public DeviceTracer {
   void Enable() {
     std::lock_guard<std::mutex> l(trace_mu_);
     if (enabled_) {
-      fprintf(stderr, "DeviceTracer already enabled\n");
       return;
     }
     EnableActivity();
