integrated HALF_ASYNC to communicator (#21869)

* add half_async in the communicator * fix DistributedStrategy
6 years ago · 82bc814a57
parent 1e932eccfa
commit 82bc814a57
30 changed files with 1029 additions and 562 deletions
--- a/paddle/fluid/framework/CMakeLists.txt
+++ b/paddle/fluid/framework/CMakeLists.txt
@ -192,7 +192,7 @@ if(WITH_DISTRIBUTE)
  data_feed.cc device_worker.cc hogwild_worker.cc downpour_worker.cc
  pull_dense_worker.cc section_worker.cc device_worker_factory.cc data_set.cc DEPS op_registry
  device_context scope framework_proto trainer_desc_proto glog fs shell fleet_wrapper lodtensor_printer
-  lod_rank_table feed_fetch_method sendrecvop_rpc collective_helper ${GLOB_DISTRIBUTE_DEPS}
+  lod_rank_table feed_fetch_method sendrecvop_rpc communicator collective_helper ${GLOB_DISTRIBUTE_DEPS}
  graph_to_program_pass variable_helper data_feed_proto ${NGRAPH_EXE_DEPS} timer)
 set(DISTRIBUTE_COMPILE_FLAGS "-Wno-non-virtual-dtor -Wno-error=non-virtual-dtor -Wno-error=delete-non-virtual-dtor")
 set_source_files_properties(executor.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})
--- a/paddle/fluid/framework/details/async_ssa_graph_executor.cc
+++ b/paddle/fluid/framework/details/async_ssa_graph_executor.cc
@ -48,7 +48,7 @@ void ProcessGraph(std::vector<ir::Graph *> graphs, Scope *scope) {
  using RpcCtxMap = operators::distributed::RpcCtxMap;
  VLOG(3) << "ProcessGraph";
  RpcCtxMap send_varname_to_ctx;
-  RpcCtxMap recv_varname_to_ctx;
+
  for (auto &node : graphs[0]->Nodes()) {
    VLOG(3) << "node name " << node->Name();
    if (node && node->IsOp()) {
@ -74,30 +74,19 @@ void ProcessGraph(std::vector<ir::Graph *> graphs, Scope *scope) {
            merge_add, use_send_handler);
        VLOG(3) << "find and init an send op: "
                << send_varname_to_ctx[send_var_name];
-      } else if (node->Name() == "recv") {
-        auto recv_var_name = node->Op()->Output("Out")[0];
-        auto recv_varnames = boost::get<std::vector<std::string>>(
-            node->Op()->GetNullableAttr("recv_varnames"));
-        auto epmap = boost::get<std::vector<std::string>>(
-            node->Op()->GetNullableAttr("epmap"));
-        auto trainer_id =
-            boost::get<int>(node->Op()->GetNullableAttr("trainer_id"));
-        recv_varname_to_ctx[recv_var_name] = operators::distributed::RpcContext(
-            recv_var_name, recv_varnames, epmap, {}, trainer_id);
-        VLOG(3) << "find and remove an recv op: "
-                << recv_varname_to_ctx[recv_var_name];
      }
    }
  }

  // init communicator here
  if (send_varname_to_ctx.size() > 0) {
-    VLOG(3) << "this is distribute mode, will use communicator";
-
-    auto *instance = operators::distributed::Communicator::InitInstance<
-        operators::distributed::AsyncCommunicator>(send_varname_to_ctx,
-                                                   recv_varname_to_ctx, scope);
-    if (!instance->IsRunning()) instance->Start();
+    auto *instance = operators::distributed::Communicator::GetInstance();
+    auto initialized = instance ? true : false;
+    PADDLE_ENFORCE_EQ(initialized, true,
+                      platform::errors::InvalidArgument(
+                          "Communicator is not Initialized, you may use "
+                          "FleetAPI(https://github.com/PaddlePaddle/Fleet/tree/"
+                          "develop/markdown_doc/transpiler)"));
  }
 #endif
 }
--- a/paddle/fluid/framework/device_worker.h
+++ b/paddle/fluid/framework/device_worker.h
@ -179,6 +179,7 @@ class HogwildWorker : public CPUWorkerBase {
  void CreateThreadScope(const ProgramDesc& program);
  std::vector<std::string> op_names_;
  std::vector<OperatorBase*> ops_;
+  bool thread_barrier_;
  // Scope* thread_scope_;
  HogwildWorkerParameter param_;
  std::vector<std::string> skip_ops_;
--- a/paddle/fluid/framework/hogwild_worker.cc
+++ b/paddle/fluid/framework/hogwild_worker.cc
@ -15,6 +15,7 @@ limitations under the License. */
 #include "paddle/fluid/framework/data_type.h"
 #include "paddle/fluid/framework/device_worker.h"
 #include "paddle/fluid/framework/device_worker_factory.h"
+#include "paddle/fluid/operators/distributed/distributed.h"
 #include "paddle/fluid/platform/cpu_helper.h"
 #include "paddle/fluid/platform/lodtensor_printer.h"

@ -29,6 +30,7 @@ void HogwildWorker::Initialize(const TrainerDesc &desc) {
    skip_ops_[i] = param_.skip_ops(i);
  }
  use_cvm_ = desc.use_cvm();
+  thread_barrier_ = desc.thread_barrier();
 }

 void HogwildWorker::CreateThreadOperators(const ProgramDesc &program) {
@ -158,6 +160,12 @@ void HogwildWorker::TrainFilesWithProfiler() {
    thread_scope_->DropKids();
    timeline.Start();
  }
+#ifdef PADDLE_WITH_DISTRIBUTE
+  if (thread_barrier_) {
+    operators::distributed::Communicator::GetInstance()
+        ->BarrierTriggerDecrement();
+  }
+#endif
 }

 void HogwildWorker::TrainFiles() {
@ -183,6 +191,12 @@ void HogwildWorker::TrainFiles() {
    PrintFetchVars();
    thread_scope_->DropKids();
  }
+#ifdef PADDLE_WITH_DISTRIBUTE
+  if (thread_barrier_) {
+    operators::distributed::Communicator::GetInstance()
+        ->BarrierTriggerDecrement();
+  }
+#endif
 }

 void HogwildWorker::PrintFetchVars() {
--- a/paddle/fluid/framework/multi_trainer.cc
+++ b/paddle/fluid/framework/multi_trainer.cc
@ -17,6 +17,7 @@ limitations under the License. */
 #include "paddle/fluid/framework/data_feed_factory.h"
 #include "paddle/fluid/framework/device_worker_factory.h"
 #include "paddle/fluid/framework/trainer.h"
+#include "paddle/fluid/operators/distributed/distributed.h"

 namespace paddle {
 namespace framework {
@ -38,6 +39,14 @@ void MultiTrainer::Initialize(const TrainerDesc& trainer_desc,
  thread_num_ = readers.size();
  VLOG(3) << "worker thread num: " << thread_num_;
  workers_.resize(thread_num_);
+
+#ifdef PADDLE_WITH_DISTRIBUTE
+  if (trainer_desc.thread_barrier()) {
+    operators::distributed::Communicator::GetInstance()->BarrierTriggerReset(
+        thread_num_);
+  }
+#endif
+
  for (int i = 0; i < thread_num_; ++i) {
    workers_[i] = DeviceWorkerFactory::CreateDeviceWorker(
        trainer_desc.device_worker_name());
--- a/paddle/fluid/framework/trainer_desc.proto
+++ b/paddle/fluid/framework/trainer_desc.proto
@ -47,6 +47,7 @@ message TrainerDesc {
  // adjust ins weight
  optional AdjustInsWeightConfig adjust_ins_weight_config = 20;
  optional bool no_cvm = 21 [ default = false ];
+  optional bool thread_barrier = 22;

  // device worker parameters
  optional HogwildWorkerParameter hogwild_param = 101;
--- a/paddle/fluid/operators/distributed/communicator.cc
+++ b/paddle/fluid/operators/distributed/communicator.cc
--- a/paddle/fluid/operators/distributed/communicator.h
+++ b/paddle/fluid/operators/distributed/communicator.h
--- a/paddle/fluid/operators/distributed/distributed.h
+++ b/paddle/fluid/operators/distributed/distributed.h
@ -17,6 +17,7 @@
 #ifdef PADDLE_WITH_DISTRIBUTE

 #ifdef PADDLE_WITH_GRPC
+#include "paddle/fluid/operators/distributed/communicator.h"

 #include "paddle/fluid/operators/distributed/grpc/grpc_client.h"
 #include "paddle/fluid/operators/distributed/grpc/grpc_server.h"
--- a/paddle/fluid/operators/distributed_ops/send_barrier_op.cc
+++ b/paddle/fluid/operators/distributed_ops/send_barrier_op.cc
@ -36,6 +36,13 @@ class SendBarrierOp : public framework::OperatorBase {

  void RunImpl(const framework::Scope& scope,
               const platform::Place& place) const override {
+    auto is_half_async = Attr<bool>("half_async");
+
+    if (is_half_async) {
+      distributed::Communicator::GetInstance()->Barrier();
+      return;
+    }
+
    std::vector<std::string> eps = Attr<std::vector<std::string>>("endpoints");

    distributed::RPCClient* rpc_client =
@ -76,6 +83,12 @@ the Parameter Server would knew all variables have been sent.
                                      "(string vector, default 127.0.0.1:6164)"
                                      "Server endpoints to send variables to.")
        .SetDefault({"127.0.0.1:6164"});
+    AddAttr<bool>(
+        "half_async",
+        "(bool, default false)"
+        "half_async=True is for half_async mode, this will send signal "
+        "to HalfAsyncCommunicator Instance")
+        .SetDefault(false);
  }
 };

--- a/paddle/fluid/operators/distributed_ops/send_op.cc
+++ b/paddle/fluid/operators/distributed_ops/send_op.cc
@ -48,12 +48,7 @@ class SendOp : public framework::OperatorBase {
    auto use_send_handler = Attr<bool>("use_send_handler");

    if (send_varnames.size() > 0) {
-      if (ins.size() > 1) {
-        distributed::Communicator::GetInstance()->Send(ins, send_varnames,
-                                                       scope);
-      } else {
-        distributed::Communicator::GetInstance()->Send(ins[0], scope);
-      }
+      distributed::Communicator::GetInstance()->Send(ins, send_varnames, scope);
    } else {
      platform::DeviceContextPool& pool =
          platform::DeviceContextPool::Instance();
--- a/paddle/fluid/pybind/communicator_py.cc
+++ b/paddle/fluid/pybind/communicator_py.cc
@ -27,10 +27,11 @@ limitations under the License. */
 namespace py = pybind11;

 using paddle::framework::ProgramDesc;
-using paddle::operators::distributed::Communicator;
+using paddle::framework::Scope;
 using paddle::operators::distributed::AsyncCommunicator;
+using paddle::operators::distributed::Communicator;
 using paddle::operators::distributed::GeoSgdCommunicator;
-using paddle::framework::Scope;
+using paddle::operators::distributed::HalfAsyncCommunicator;

 namespace paddle {
 namespace pybind {
@ -39,29 +40,27 @@ void BindCommunicator(py::module* m) {
  // Communicator is already used by nccl, change to DistCommunicator
  py::class_<Communicator, std::shared_ptr<Communicator>>(*m,
                                                          "DistCommunicator")
-      .def(py::init([](const ProgramDesc& program, Scope* param_scope,
-                       std::map<std::string, int>& env_flags) {
-        VLOG(0) << "using communicator";
-        Communicator::InitInstance<AsyncCommunicator>(program, param_scope,
-                                                      env_flags);
-        return Communicator::GetInstantcePtr();
-      }))
-      .def(py::init([](
-          const ProgramDesc& program, Scope* training_scope,
-          std::map<std::string,
-                   std::map<std::string, std::vector<std::string>>>& vars_info,
-          int& trainers, int& geo_need_push_nums,
-          std::map<std::string, int>& env_flags) {
-        VLOG(0) << "using geo sgd communicator";
-        Communicator::InitInstance<GeoSgdCommunicator>(
-            program, training_scope, vars_info, trainers, geo_need_push_nums,
-            env_flags);
+      .def(py::init([](const std::string& mode, const ProgramDesc& program,
+                       Scope* param_scope,
+                       std::map<std::string, std::string>& envs) {
+        if (mode == "HALF_ASYNC") {
+          Communicator::InitInstance<HalfAsyncCommunicator>(program,
+                                                            param_scope, envs);
+        } else if (mode == "ASYNC") {
+          Communicator::InitInstance<AsyncCommunicator>(program, param_scope,
+                                                        envs);
+        } else if (mode == "GEO") {
+          Communicator::InitInstance<GeoSgdCommunicator>(program, param_scope,
+                                                         envs);
+        } else {
+          PADDLE_THROW(platform::errors::InvalidArgument(
+              "unsuported communicator MODE"));
+        }
        return Communicator::GetInstantcePtr();
      }))
      .def("stop", &Communicator::Stop)
      .def("start", &Communicator::Start)
      .def("is_running", &Communicator::IsRunning);
 }
-
 }  // namespace pybind
 }  // namespace paddle
--- a/python/paddle/fluid/init.py
+++ b/python/paddle/fluid/init.py
@ -199,17 +199,6 @@ def __bootstrap__():

        read_env_flags.append('worker_update_interval_secs')

-        # env for communicator
-        read_env_flags.append('communicator_independent_recv_thread')
-        read_env_flags.append('communicator_send_queue_size')
-        read_env_flags.append('communicator_min_send_grad_num_before_recv')
-        read_env_flags.append('communicator_thread_pool_size')
-        read_env_flags.append('communicator_max_merge_var_num')
-        read_env_flags.append('communicator_merge_sparse_bucket')
-        read_env_flags.append('communicator_fake_rpc')
-        read_env_flags.append('communicator_send_wait_times')
-        read_env_flags.append('communicator_merge_sparse_grad')
-        read_env_flags.append('communicator_is_sgd_optimizer')
        if core.is_compiled_with_brpc():
            read_env_flags.append('max_body_size')
            #set brpc max body size
--- a/python/paddle/fluid/communicator.py
+++ b/python/paddle/fluid/communicator.py
@ -19,17 +19,13 @@ It's a wrapper of a cpp class Communicator and should be used inside fleet API.
 """
 from . import core
 from .framework import Program
+from .transpiler.distribute_transpiler import DistributedMode

 __all__ = ['Communicator']


 class Communicator(object):
-    def __init__(self,
-                 program,
-                 vars_info=None,
-                 trainers=None,
-                 geo_sgd_need_push_nums=None,
-                 env_flags=None):
+    def __init__(self, program, mode, kwargs=None, envs={}):
        """
        Communicator is used for async distribute training in distribute_transpiler mode.
        It's a wrapper of a cpp class Communicator and should be used inside fleet API.
@ -56,20 +52,37 @@ class Communicator(object):
        for op in program.block(0).ops:
            if op.type == "recv":
                op._set_attr('do_not_run', True)
-        # Todo: Add check
-        if env_flags is None:
-            env_flags = {}
-
-        if vars_info and trainers and geo_sgd_need_push_nums:
-            # for geo sgd
-            self.communicator_ = core.DistCommunicator(
-                program.desc,
-                global_scope(), vars_info, trainers, geo_sgd_need_push_nums,
-                env_flags)
-        else:
-            self.communicator_ = core.DistCommunicator(program.desc,
-                                                       global_scope(),
-                                                       env_flags)
+
+        if mode == DistributedMode.GEO:
+            push_vars = kwargs["push_vars"]
+            push_var_names = []
+
+            for k, vs in push_vars.items():
+                varnames = "&".join(vs["var_names"])
+                sections = "&".join([str(v) for v in vs["sections"]])
+                endpoints = "&".join(vs["epmap"])
+                is_sparse = "1" if vs["is_sparse"] else "0"
+
+                push_var_names.append(k)
+                envs[k] = "#".join([varnames, sections, endpoints, is_sparse])
+
+            envs["geo_trainer_nums"] = str(kwargs["trainers"])
+            envs["geo_need_push_nums"] = str(kwargs["push_nums"])
+            envs["geo_send_varnames"] = '#'.join(push_var_names)
+
+        mode_str = None
+
+        if mode == DistributedMode.SYNC:
+            mode_str = "SYNC"
+        elif mode == DistributedMode.ASYNC:
+            mode_str = "ASYNC"
+        elif mode == DistributedMode.HALF_ASYNC:
+            mode_str = "HALF_ASYNC"
+        elif mode == DistributedMode.GEO:
+            mode_str = "GEO"
+
+        self.communicator_ = core.DistCommunicator(mode_str, program.desc,
+                                                   global_scope(), envs)

    def start(self):
        """
--- a/python/paddle/fluid/executor.py
+++ b/python/paddle/fluid/executor.py
@ -963,6 +963,7 @@ class Executor(object):
                    program._pipeline_opt)
            else:
                trainer = TrainerFactory()._create_trainer(program._fleet_opt)
+                trainer._set_thread_barrier(program._is_distributed)
            trainer._set_program(program)
        else:
            if program._pipeline_opt:
--- a/python/paddle/fluid/incubate/fleet/parameter_server/distribute_transpiler/init.py
+++ b/python/paddle/fluid/incubate/fleet/parameter_server/distribute_transpiler/init.py
@ -17,7 +17,6 @@ import warnings
 """
 Convert the fluid program to distributed data-parallelism programs.
 """
-from .distributed_strategy import *
 import paddle.fluid.io as io
 from paddle.fluid.communicator import Communicator
 from paddle.fluid.framework import default_main_program
@ -27,8 +26,11 @@ from paddle.fluid.compiler import CompiledProgram
 from paddle.fluid.executor import Executor
 from paddle.fluid.parallel_executor import ParallelExecutor
 from paddle.fluid.optimizer import Optimizer
+
+from paddle.fluid.incubate.fleet.parameter_server.distribute_transpiler.distributed_strategy import TrainerRuntimeConfig, DistributedStrategy, SyncStrategy, AsyncStrategy, HalfAsyncStrategy, GeoStrategy, StrategyFactory
+
 from paddle.fluid.transpiler.distribute_transpiler import DistributeTranspiler as OriginTranspiler
-from paddle.fluid.transpiler.distribute_transpiler import DistributeTranspilerConfig, ServerRuntimeConfig
+from paddle.fluid.transpiler.distribute_transpiler import DistributeTranspilerConfig, ServerRuntimeConfig, DistributedMode

 from paddle.fluid.incubate.fleet.base.fleet_base import DistributedOptimizer
 from paddle.fluid.incubate.fleet.base.fleet_base import Fleet
@ -70,25 +72,39 @@ class DistributedTranspiler(Fleet):
        program_config = self._transpile_config.get_program_config()
        trainer_communicator_config = self._transpile_config.get_trainer_runtime_config(
        )
+
+        if isinstance(self._transpile_config, SyncStrategy):
+            return
+
        print(trainer_communicator_config)

-        need_communicator_flag = False
        if isinstance(self._transpile_config, GeoStrategy):
-            need_communicator_flag = True
+            kwargs = {}
+            kwargs["push_vars"] = self.vars_info
+            kwargs["trainers"] = fleet.worker_num()
+            kwargs["push_nums"] = self._transpile_config.get_program_config(
+            ).geo_sgd_need_push_nums
+
            self._communicator = Communicator(
-                self.main_program, self.vars_info,
-                fleet.worker_num(), program_config.geo_sgd_need_push_nums,
+                self.main_program, DistributedMode.GEO, kwargs,
                trainer_communicator_config.get_communicator_flags())
+
        elif isinstance(self._transpile_config, AsyncStrategy):
-            need_communicator_flag = True
            self._communicator = Communicator(
-                self.main_program,
-                env_flags=trainer_communicator_config.get_communicator_flags())
-        if need_communicator_flag:
-            if not self._communicator.is_running():
-                self._communicator.start()
-            else:
-                warnings.warn("communicator has been initialized, skip")
+                self.main_program, DistributedMode.ASYNC, None,
+                trainer_communicator_config.get_communicator_flags())
+
+        elif isinstance(self._transpile_config, HalfAsyncStrategy):
+            self._communicator = Communicator(
+                self.main_program, DistributedMode.HALF_ASYNC, None,
+                trainer_communicator_config.get_communicator_flags())
+        else:
+            raise TypeError("Training MODE do not supported")
+
+        if not self._communicator.is_running():
+            self._communicator.start()
+        else:
+            warnings.warn("communicator has been initialized, skip")

    def init_server(self, model_dir=None):
        """
@ -139,12 +155,12 @@ class DistributedTranspiler(Fleet):
        Returns:
            None
        """
-        if isinstance(self._transpile_config, GeoStrategy) or isinstance(
-                self._transpile_config, AsyncStrategy):
+
+        if not isinstance(self._transpile_config, SyncStrategy):
            self._communicator.stop()
-        self._executor.close()
        if isinstance(self._role_maker, MPISymetricRoleMaker):
            self._role_maker._finalize()
+        self._executor.close()

    def distributed_optimizer(self, optimizer, strategy=None):
        """
@ -250,14 +266,22 @@ class DistributedTranspiler(Fleet):
        io.save_persistables(executor, dirname, main_program, None)

    def _transpile(self, config):
-        if isinstance(config, DistributeTranspilerConfig):
-            self._transpile_config = DistributedStrategy()
-            self._transpile_config.set_program_config(config)
-        elif isinstance(config, DistributedStrategy):
+        if isinstance(config, DistributedStrategy):
            self._transpile_config = config
+        elif isinstance(config, DistributeTranspilerConfig):
+            if config.sync_mode:
+                self._transpile_config = SyncStrategy()
+            elif config.geo_sgd_mode:
+                self._transpile_config = GeoStrategy(
+                    config.geo_sgd_need_push_nums)
+            elif config.runtime_split_send_recv and config.half_async:
+                self._transpile_config = HalfAsyncStrategy()
+            else:
+                self._transpile_config = AsyncStrategy()
+            self._transpile_config.set_program_config(config)
        else:
            raise TypeError(
-                "config must be an instance of DistributeTranspilerConfig or DistributedStrategy"
+                "config must be an instance of DistributeTranspilerConfig, SyncStrategy, HalfAsyncStrategy, AsyncStrategy or GeoStratey."
            )

        program_config = self._transpile_config.get_program_config()
@ -327,14 +351,12 @@ class TranspilerOptimizer(DistributedOptimizer):
        super(TranspilerOptimizer, self).__init__(optimizer, strategy)

        if strategy:
-            if isinstance(strategy, DistributedStrategy):
+            if isinstance(strategy, DistributeTranspilerConfig) or isinstance(
+                    strategy, DistributedStrategy):
                self._strategy = strategy
-            elif isinstance(strategy, DistributeTranspilerConfig):
-                self._strategy = DistributedStrategy()
-                self._strategy.set_program_config(strategy)
            else:
                raise TypeError(
-                    "In {} mode, strategy must be an instance of DistributeTranspilerConfig or DistributedStrategy".
+                    "In {} mode, strategy must be an instance of DistributeTranspilerConfig, SyncStrategy, HalfAsyncStrategy, AsyncStrategy, or GeoStrategy".
                    format(fleet._mode))
        else:
            self._strategy = DistributedStrategy()
--- a/python/paddle/fluid/incubate/fleet/parameter_server/distribute_transpiler/distributed_strategy.py
+++ b/python/paddle/fluid/incubate/fleet/parameter_server/distribute_transpiler/distributed_strategy.py
@ -24,49 +24,51 @@ from paddle.fluid.transpiler.distribute_transpiler import DistributeTranspilerCo

 class TrainerRuntimeConfig(object):
    def __init__(self):
-        self.max_merge_var_num = int(
-            os.getenv("FLAGS_communicator_max_merge_var_num", "20"))
-        self.send_queue_size = int(
-            os.getenv("FLAGS_communicator_send_queue_size", "20"))
-        self.independent_recv_thread = int(
-            os.getenv("FLAGS_communicator_independent_recv_thread", "1"))
-        self.min_send_grad_num_before_recv = int(
-            os.getenv("FLAGS_communicator_min_send_grad_num_before_recv", "20"))
-        self.thread_pool_size = int(
-            os.getenv("FLAGS_communicator_thread_pool_size", "5"))
-        self.send_wait_times = int(
-            os.getenv("FLAGS_communicator_send_wait_times", "5"))
-        self.fake_rpc = int(os.getenv("FLAGS_communicator_fake_rpc", "0"))
-        self.merge_sparse_grad = int(
-            os.getenv("FLAGS_communicator_merge_sparse_grad", "1"))
-        self.is_sgd_optimizer = int(
-            os.getenv("FLAGS_communicator_is_sgd_optimizer", "1"))
+        self.max_merge_var_num = os.getenv(
+            "FLAGS_communicator_max_merge_var_num", "20")
+        self.send_queue_size = os.getenv("FLAGS_communicator_send_queue_size",
+                                         "20")
+        self.independent_recv_thread = os.getenv(
+            "FLAGS_communicator_independent_recv_thread", "1")
+        self.min_send_grad_num_before_recv = os.getenv(
+            "FLAGS_communicator_min_send_grad_num_before_recv", "20")
+        self.thread_pool_size = os.getenv("FLAGS_communicator_thread_pool_size",
+                                          "5")
+        self.send_wait_times = os.getenv("FLAGS_communicator_send_wait_times",
+                                         "5")
+        self.fake_rpc = os.getenv("FLAGS_communicator_fake_rpc", "0")
+        self.merge_sparse_grad = os.getenv(
+            "FLAGS_communicator_merge_sparse_grad", "1")
+        self.is_sgd_optimizer = os.getenv("FLAGS_communicator_is_sgd_optimizer",
+                                          "1")

        # not used 
-        self._rpc_deadline = int(os.getenv("FLAGS_rpc_deadline", "180000"))
-        self._rpc_retry_times = int(os.getenv("FLAGS_rpc_retry_times", "3"))
+        self._rpc_deadline = os.getenv("FLAGS_rpc_deadline", "180000")
+        self._rpc_retry_times = os.getenv("FLAGS_rpc_retry_times", "3")

    def get_communicator_flags(self):
        _communicator_flags = dict()
-        _communicator_flags["max_merge_var_num"] = self.max_merge_var_num
-        _communicator_flags["send_queue_size"] = self.send_queue_size
        _communicator_flags[
-            "independent_recv_thread"] = self.independent_recv_thread
+            "communicator_max_merge_var_num"] = self.max_merge_var_num
        _communicator_flags[
-            "min_send_grad_num_before_recv"] = self.min_send_grad_num_before_recv
-        _communicator_flags["thread_pool_size"] = self.thread_pool_size
-        _communicator_flags["send_wait_times"] = self.send_wait_times
-        _communicator_flags["fake_rpc"] = self.fake_rpc
-        _communicator_flags["merge_sparse_grad"] = self.merge_sparse_grad
-        _communicator_flags["is_sgd_optimizer"] = self.is_sgd_optimizer
+            "communicator_send_queue_size"] = self.send_queue_size
+        _communicator_flags[
+            "communicator_independent_recv_thread"] = self.independent_recv_thread
+        _communicator_flags[
+            "communicator_min_send_grad_num_before_recv"] = self.min_send_grad_num_before_recv
+        _communicator_flags[
+            "communicator_thread_pool_size"] = self.thread_pool_size
+        _communicator_flags[
+            "communicator_send_wait_times"] = self.send_wait_times
+        _communicator_flags[
+            "communicator_is_sgd_optimizer"] = self.is_sgd_optimizer
        return _communicator_flags

    def __repr__(self):
        _str = "please check that TrainerRuntimeConfig is as expected:\n"
        _communicator_flags = self.get_communicator_flags()
        for key in _communicator_flags:
-            _str += "communicator_{}: {}\n".format(key,
-                                                   _communicator_flags[key])
+            _str += "{}: {}\n".format(key, _communicator_flags[key])
        return _str


@ -193,8 +195,9 @@ class HalfAsyncStrategy(DistributedStrategy):
    def __init__(self):
        super(HalfAsyncStrategy, self).__init__()
        self._program_config.sync_mode = False
-        self._program_config.runtime_split_send_recv = False
-        self._build_strategy.async_mode = False
+        self._program_config.runtime_split_send_recv = True
+        self._build_strategy.async_mode = True
+        self._program_config.half_async = True


 class GeoStrategy(DistributedStrategy):
@ -202,9 +205,9 @@ class GeoStrategy(DistributedStrategy):
        super(GeoStrategy, self).__init__()
        self._program_config.sync_mode = False
        self._program_config.runtime_split_send_recv = True
+        self._build_strategy.async_mode = True
        self._program_config.geo_sgd_mode = True
        self._program_config.geo_sgd_need_push_nums = update_frequency
-        self._build_strategy.async_mode = True


 class StrategyFactory(object):
--- a/python/paddle/fluid/tests/CMakeLists.txt
+++ b/python/paddle/fluid/tests/CMakeLists.txt
@ -1,10 +1,6 @@
 file(GLOB TEST_OPS RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}" "test_*.py")
 string(REPLACE ".py" "" TEST_OPS "${TEST_OPS}")

-if(NOT WITH_DISTRIBUTE)
-  list(REMOVE_ITEM TEST_OPS test_communicator)
-endif(NOT WITH_DISTRIBUTE)
-
 foreach(src ${TEST_OPS})
  py_test(${src} SRCS ${src}.py)
 endforeach()
--- a/python/paddle/fluid/tests/unittests/CMakeLists.txt
+++ b/python/paddle/fluid/tests/unittests/CMakeLists.txt
@ -20,6 +20,10 @@ list(APPEND MIXED_DIST_TEST_OPS test_transpiler_ops)
 list(APPEND MIXED_DIST_TEST_OPS test_lookup_remote_table_op)
 list(APPEND MIXED_DIST_TEST_OPS test_launch)
 list(APPEND MIXED_DIST_TEST_OPS test_launch_ps)
+list(APPEND MIXED_DIST_TEST_OPS test_communicator_async)
+list(APPEND MIXED_DIST_TEST_OPS test_communicator_geo)
+list(APPEND MIXED_DIST_TEST_OPS test_communicator_half_async)
+list(APPEND MIXED_DIST_TEST_OPS test_fleet_api_input)
 foreach(TEST_OP ${MIXED_DIST_TEST_OPS})
  list(REMOVE_ITEM TEST_OPS ${TEST_OP})
 endforeach()
@ -269,6 +273,9 @@ if(WITH_DISTRIBUTE)
    py_test_modules(test_nce_remote_table_op MODULES test_nce_remote_table_op ENVS ${dist_ENVS})
    py_test_modules(test_recv_save_op MODULES test_recv_save_op ENVS ${dist_ENVS})
    py_test_modules(test_transpiler_ops MODULES test_transpiler_ops ENVS ${dist_ENVS})
+    py_test_modules(test_communicator_async MODULES test_communicator_async ENVS ${dist_ENVS})
+    py_test_modules(test_communicator_geo MODULES test_communicator_geo ENVS ${dist_ENVS})
+    py_test_modules(test_communicator_half_async MODULES test_communicator_half_async ENVS ${dist_ENVS} FLAGS_communicator_send_queue_size=1 FLAGS_communicator_max_merge_var_num=1)
    if(WITH_DGC)
        # if with dgc, test all dgc tests.
        # NOTE. dist dgc tests is already in DIST_TEST_OPS
--- a/python/paddle/fluid/tests/unittests/dist_fleet_ctr.py
+++ b/python/paddle/fluid/tests/unittests/dist_fleet_ctr.py
@ -110,8 +110,10 @@ class TestDistCTR2x2(FleetDistRunnerBase):

        predict = fluid.layers.fc(input=merge_layer, size=2, act='softmax')
        acc = fluid.layers.accuracy(input=predict, label=label)
+
        auc_var, batch_auc_var, auc_states = fluid.layers.auc(input=predict,
                                                              label=label)
+
        cost = fluid.layers.cross_entropy(input=predict, label=label)
        avg_cost = fluid.layers.mean(x=cost)

@ -242,11 +244,13 @@ class TestDistCTR2x2(FleetDistRunnerBase):
                debug=False)
            pass_time = time.time() - pass_start

-        model_dir = tempfile.mkdtemp()
-        fleet.save_inference_model(
-            exe, model_dir, [feed.name for feed in self.feeds], self.avg_cost)
-        self.check_model_right(model_dir)
-        shutil.rmtree(model_dir)
+        if os.getenv("SAVE_MODEL") == "1":
+            model_dir = tempfile.mkdtemp()
+            fleet.save_inference_model(exe, model_dir,
+                                       [feed.name for feed in self.feeds],
+                                       self.avg_cost)
+            self.check_model_right(model_dir)
+            shutil.rmtree(model_dir)
        fleet.stop_worker()


--- a/python/paddle/fluid/tests/unittests/test_communicator_async.py
+++ b/python/paddle/fluid/tests/unittests/test_communicator_async.py
@ -16,7 +16,10 @@ from __future__ import print_function

 import unittest
 import time
+import threading
+import numpy

+import paddle
 import paddle.fluid as fluid
 from paddle.fluid.communicator import Communicator

@ -35,7 +38,7 @@ class TestCommunicator(unittest.TestCase):
        avg_cost = fluid.layers.mean(cost)
        return avg_cost

-    def test_communicator_init_and_start(self):
+    def test_communicator_async(self):
        role = role_maker.UserDefinedRoleMaker(
            current_id=0,
            role=role_maker.Role.WORKER,
@ -48,23 +51,15 @@ class TestCommunicator(unittest.TestCase):
        optimizer = fluid.optimizer.SGD(0.01)

        strategy = DistributeTranspilerConfig()
-        strategy.sync_mode = True
+        strategy.sync_mode = False
+        strategy.runtime_split_send_recv = True
        strategy.wait_port = False
        optimizer = fleet.distributed_optimizer(optimizer, strategy)
        optimizer.minimize(avg_cost)

-        comm = Communicator(fleet.main_program)
-        comm.start()
+        fleet.init_worker()
        time.sleep(10)
-        comm.stop()
-
-
-class TestCommunicator2(unittest.TestCase):
-    def test_communicator_init_and_start(self):
-        prog = fluid.Program()
-        comm = Communicator(prog)
-        comm.start()
-        comm.stop()
+        fleet.stop_worker()


 if __name__ == '__main__':
--- a/python/paddle/fluid/tests/unittests/test_communicator_geo.py
+++ b/python/paddle/fluid/tests/unittests/test_communicator_geo.py
@ -0,0 +1,83 @@
+#   Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import print_function
+
+import unittest
+import time
+import threading
+import numpy
+
+import paddle
+import paddle.fluid as fluid
+from paddle.fluid.communicator import Communicator
+from paddle.fluid.transpiler.distribute_transpiler import DistributedMode
+
+import paddle.fluid.incubate.fleet.base.role_maker as role_maker
+from paddle.fluid.incubate.fleet.parameter_server.distribute_transpiler import fleet
+from paddle.fluid.transpiler.distribute_transpiler import DistributeTranspilerConfig
+
+
+class TestCommunicator(unittest.TestCase):
+    def net(self):
+        x = fluid.layers.data(name='x', shape=[13], dtype='float32')
+        y_predict = fluid.layers.fc(input=x, size=1, act=None)
+        y = fluid.layers.data(name='y', shape=[1], dtype='float32')
+
+        cost = fluid.layers.square_error_cost(input=y_predict, label=y)
+        avg_cost = fluid.layers.mean(cost)
+        return avg_cost
+
+    def test_communicator_geo(self):
+        role = role_maker.UserDefinedRoleMaker(
+            current_id=0,
+            role=role_maker.Role.WORKER,
+            worker_num=2,
+            server_endpoints=["127.0.0.1:6001", "127.0.0.1:6002"])
+
+        fleet.init(role)
+        avg_cost = self.net()
+
+        optimizer = fluid.optimizer.SGD(0.01)
+
+        strategy = DistributeTranspilerConfig()
+        strategy.sync_mode = False
+        strategy.runtime_split_send_recv = True
+        strategy.geo_sgd_mode = True
+        strategy.wait_port = False
+        optimizer = fleet.distributed_optimizer(optimizer, strategy)
+        optimizer.minimize(avg_cost)
+
+        fleet.init_worker()
+        time.sleep(10)
+        fleet.stop_worker()
+
+
+# class TestCommunicatorGEO(unittest.TestCase):
+#     def test_communicator_init_and_start(self):
+#         prog = fluid.Program()
+
+#         envs = {}
+#         envs["communicator_thread_pool_size"] = "5"
+#         envs["communicator_send_wait_times"] = "5"
+
+#         kwargs = {}
+#         kwargs["push_vars"] = {}
+#         kwargs["trainers"] = 10
+#         kwargs["push_nums"] = 10
+
+#         comm = Communicator(prog, DistributedMode.GEO, kwargs, envs)
+
+if __name__ == '__main__':
+    unittest.main()
--- a/python/paddle/fluid/tests/unittests/test_communicator_half_async.py
+++ b/python/paddle/fluid/tests/unittests/test_communicator_half_async.py
@ -0,0 +1,177 @@
+#   Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import print_function
+
+import os
+import sys
+import time
+import threading
+import subprocess
+import unittest
+import numpy
+
+import paddle
+import paddle.fluid as fluid
+from paddle.fluid.communicator import Communicator
+
+import paddle.fluid.incubate.fleet.base.role_maker as role_maker
+from paddle.fluid.transpiler.distribute_transpiler import DistributedMode
+from paddle.fluid.transpiler.distribute_transpiler import DistributeTranspilerConfig
+from paddle.fluid.incubate.fleet.parameter_server.distribute_transpiler import fleet
+
+
+class TestCommunicatorHalfAsyncEnd2End(unittest.TestCase):
+    def net(self):
+        x = fluid.layers.data(name='x', shape=[13], dtype='float32')
+        y_predict = fluid.layers.fc(input=x, size=1, act=None)
+        y = fluid.layers.data(name='y', shape=[1], dtype='float32')
+
+        cost = fluid.layers.square_error_cost(input=y_predict, label=y)
+        avg_cost = fluid.layers.mean(cost)
+        return avg_cost, x, y
+
+    def fake_reader(self):
+        def reader():
+            for i in range(10000):
+                x = numpy.random.random((1, 13)).astype('float32')
+                y = numpy.random.randint(0, 2, (1, 1)).astype('int64')
+                yield x, y
+
+        return reader
+
+    def run_pserver(self, role, strategy):
+        fleet.init(role)
+        avg_cost, x, y = self.net()
+        optimizer = fluid.optimizer.SGD(0.01)
+        optimizer = fleet.distributed_optimizer(optimizer, strategy)
+        optimizer.minimize(avg_cost)
+
+        fleet.init_server()
+        fleet.run_server()
+
+    def run_trainer(self, role, strategy):
+        place = fluid.core.CPUPlace()
+        exe = fluid.Executor(place)
+
+        fleet.init(role)
+        avg_cost, x, y = self.net()
+        optimizer = fluid.optimizer.SGD(0.01)
+        optimizer = fleet.distributed_optimizer(optimizer, strategy)
+        optimizer.minimize(avg_cost)
+
+        exe.run(fleet.startup_program)
+        fleet.init_worker()
+
+        train_reader = paddle.batch(self.fake_reader(), batch_size=24)
+        feeder = fluid.DataFeeder(place=place, feed_list=[x, y])
+
+        for batch_id, data in enumerate(train_reader()):
+            exe.run(fleet.main_program, feed=feeder.feed(data), fetch_list=[])
+
+        fleet.stop_worker()
+
+    def run_ut(self):
+        strategy = DistributeTranspilerConfig()
+        strategy.sync_mode = False
+        strategy.runtime_split_send_recv = True
+        strategy.half_async = True
+
+        training_role = os.getenv("TRAINING_ROLE", "TRAINER")
+
+        role = role_maker.UserDefinedRoleMaker(
+            current_id=0,
+            role=role_maker.Role.WORKER
+            if training_role == "TRAINER" else role_maker.Role.SERVER,
+            worker_num=2,
+            server_endpoints=["127.0.0.1:6002"])
+
+        if training_role == "TRAINER":
+            self.run_trainer(role, strategy)
+        else:
+            self.run_pserver(role, strategy)
+
+    def test_communicator(self):
+        run_server_cmd = """
+from __future__ import print_function
+
+import sys
+import os
+
+import time
+import threading
+import subprocess
+import unittest
+import numpy
+
+import paddle
+import paddle.fluid as fluid
+from paddle.fluid.communicator import Communicator
+from paddle.fluid.communicator import DistributedMode
+
+import paddle.fluid.incubate.fleet.base.role_maker as role_maker
+from test_communicator_half_async import TestCommunicatorHalfAsyncEnd2End
+from paddle.fluid.transpiler.distribute_transpiler import DistributeTranspilerConfig
+from paddle.fluid.incubate.fleet.parameter_server.distribute_transpiler import fleet
+
+
+class RunServer(TestCommunicatorHalfAsyncEnd2End):
+    def runTest(self):
+        pass
+
+os.environ["TRAINING_ROLE"] = "PSERVER"
+half_run_server = RunServer()
+half_run_server.run_ut()
+"""
+
+        server_file = "run_server_for_communicator_haflaysnc.py"
+        with open(server_file, "w") as wb:
+            wb.write(run_server_cmd)
+        os.environ["TRAINING_ROLE"] = "PSERVER"
+        _python = sys.executable
+
+        ps_cmd = "{} {}".format(_python, server_file)
+        ps_proc = subprocess.Popen(
+            ps_cmd.strip().split(" "),
+            stdout=subprocess.PIPE,
+            stderr=subprocess.PIPE)
+
+        os.environ["TRAINING_ROLE"] = "TRAINER"
+        os.environ["FLAGS_communicator_send_queue_size"] = "1"
+        os.environ["FLAGS_communicator_max_merge_var_num"] = "1"
+
+        self.run_ut()
+        ps_proc.kill()
+
+        if os.path.exists(server_file):
+            os.remove(server_file)
+
+
+# class TestCommunicatorHalfAsync2(unittest.TestCase):
+#     def test_communicator_init_and_start(self):
+#         prog = fluid.Program()
+
+#         envs = {}
+#         envs["communicator_send_queue_size"] = "12"
+#         envs["communicator_max_merge_var_num"] = "12"
+#         envs["communicator_thread_pool_size"] = "5"
+#         envs["communicator_send_wait_times"] = "5"
+
+#         comm = Communicator(prog, DistributedMode.HALF_ASYNC, None, envs)
+#         comm.start()
+#         time.sleep(10)
+#         comm.stop()
+
+if __name__ == '__main__':
+    unittest.main()
--- a/python/paddle/fluid/tests/unittests/test_dist_ctr.py
+++ b/python/paddle/fluid/tests/unittests/test_dist_ctr.py
@ -32,7 +32,6 @@ class TestDistCTR2x2(TestDistBase):
            "dist_ctr.py", delta=1e-2, check_error_log=True, log_name=flag_name)


-@unittest.skip(reason="Skip unstable ci")
 class TestDistCTRWithL2Decay2x2(TestDistBase):
    def _setup_config(self):
        self._sync_mode = True
@ -48,6 +47,7 @@ class TestDistCTRWithL2Decay2x2(TestDistBase):
            log_name=flag_name)


+@unittest.skip(reason="Skip unstable ci")
 class TestDistCTR2x2_ASYNC(TestDistBase):
    def _setup_config(self):
        self._sync_mode = False
@ -69,6 +69,7 @@ class TestDistCTR2x2_ASYNC(TestDistBase):
            log_name=flag_name)


+@unittest.skip(reason="Skip unstable ci")
 class TestDistCTR2x2_ASYNCWithLRDecay2x2(TestDistBase):
    def _setup_config(self):
        self._sync_mode = False
@ -91,6 +92,7 @@ class TestDistCTR2x2_ASYNCWithLRDecay2x2(TestDistBase):
            log_name=flag_name)


+@unittest.skip(reason="Skip unstable ci")
 class TestDistCTR2x2_ASYNC2(TestDistBase):
    def _setup_config(self):
        self._sync_mode = False
--- a/python/paddle/fluid/tests/unittests/test_dist_fleet_base.py
+++ b/python/paddle/fluid/tests/unittests/test_dist_fleet_base.py
@ -53,7 +53,22 @@ class FleetDistRunnerBase(object):
        do training : exe run program
    """

-    def generate_strategy(self, args):
+    def build_role(self, args):
+        if args.role.upper() == "PSERVER":
+            role = role_maker.UserDefinedRoleMaker(
+                current_id=args.current_id,
+                role=role_maker.Role.SERVER,
+                worker_num=args.trainers,
+                server_endpoints=args.endpoints.split(","))
+        else:
+            role = role_maker.UserDefinedRoleMaker(
+                current_id=args.current_id,
+                role=role_maker.Role.WORKER,
+                worker_num=args.trainers,
+                server_endpoints=args.endpoints.split(","))
+        return role
+
+    def build_strategy(self, args):
        self.strategy = None
        if args.mode == "async":
            self.strategy = StrategyFactory.create_async_strategy()
@ -66,22 +81,7 @@ class FleetDistRunnerBase(object):
                args.geo_sgd_need_push_nums)
        return self.strategy

-    def run_pserver(self, args):
-        if args.role.upper() != "PSERVER":
-            raise ValueError("args role must be PSERVER")
-
-        role = role_maker.UserDefinedRoleMaker(
-            current_id=args.current_id,
-            role=role_maker.Role.SERVER,
-            worker_num=args.trainers,
-            server_endpoints=args.endpoints.split(","))
-
-        fleet.init(role)
-
-        strategy = self.generate_strategy(args)
-
-        avg_cost = self.net()
-
+    def build_optimizer(self, avg_cost, strategy):
        use_grad_clip = int(os.getenv('GRAD_CLIP', 0))
        if use_grad_clip:
            # 1: clip_by_value; 2: clip_by_norm; 3:clip_by_global_norm
@ -99,70 +99,33 @@ class FleetDistRunnerBase(object):
        optimizer = fleet.distributed_optimizer(optimizer, strategy)
        optimizer.minimize(avg_cost)

+    def run_pserver(self, args):
+        fleet.init(self.build_role(args))
+        strategy = self.build_strategy(args)
+        avg_cost = self.net()
+        self.build_optimizer(avg_cost, strategy)
+
        fleet.init_server()
        fleet.run_server()

    def run_dataset_trainer(self, args):
-        if args.role.upper() != "TRAINER":
-            raise ValueError("args role must be TRAINER")
-
-        role = role_maker.UserDefinedRoleMaker(
-            current_id=args.current_id,
-            role=role_maker.Role.WORKER,
-            worker_num=args.trainers,
-            server_endpoints=args.endpoints.split(","))
-
-        fleet.init(role)
-
-        strategy = self.generate_strategy(args)
-
+        fleet.init(self.build_role(args))
+        strategy = self.build_strategy(args)
        avg_cost = self.net()
-
-        use_grad_clip = int(os.getenv('GRAD_CLIP', 0))
-        if use_grad_clip:
-            # 1: clip_by_value; 2: clip_by_norm; 3:clip_by_global_norm
-            if use_grad_clip == 1:
-                fluid.clip.set_gradient_clip(
-                    clip=fluid.clip.GradientClipByValue(2.0))
-            elif use_grad_clip == 2:
-                fluid.clip.set_gradient_clip(
-                    clip=fluid.clip.GradientClipByNorm(2.0))
-            elif use_grad_clip == 3:
-                fluid.clip.set_gradient_clip(
-                    clip=fluid.clip.GradientClipByGlobalNorm(2.0))
-
-        optimizer = fluid.optimizer.SGD(LEARNING_RATE)
-        optimizer = fleet.distributed_optimizer(optimizer, strategy)
-        optimizer.minimize(avg_cost)
-
+        self.build_optimizer(avg_cost, strategy)
        out = self.do_dataset_training(fleet)

    def run_pyreader_trainer(self, args):
-        if args.role.upper() != "TRAINER":
-            raise ValueError("args role must be TRAINER")
-
-        role = role_maker.UserDefinedRoleMaker(
-            current_id=args.current_id,
-            role=role_maker.Role.WORKER,
-            worker_num=args.trainers,
-            server_endpoints=args.endpoints.split(","))
-
-        fleet.init(role)
-
-        strategy = self.generate_strategy(args)
-
+        fleet.init(self.build_role(args))
+        strategy = self.build_strategy(args)
        avg_cost = self.net()
-
        self.reader = fluid.io.PyReader(
            feed_list=self.feeds,
            capacity=64,
            iterable=False,
            use_double_buffer=False)

-        optimizer = fluid.optimizer.SGD(LEARNING_RATE)
-        optimizer = fleet.distributed_optimizer(optimizer, strategy)
-        optimizer.minimize(avg_cost)
-
+        self.build_optimizer(avg_cost, strategy)
        out = self.do_pyreader_training(fleet)

    def net(self, batch_size=4, lr=0.01):
@ -263,7 +226,7 @@ class TestFleetBase(unittest.TestCase):
        return tr0_proc, tr1_proc, tr0_pipe, tr1_pipe

    def _run_cluster(self, model, envs):
-        env = {'CPU_NUM': '1', 'GRAD_CLIP': str(self._grad_clip_mode)}
+        env = {'GRAD_CLIP': str(self._grad_clip_mode)}
        env.update(envs)

        python_path = self._python_interp
@ -307,29 +270,6 @@ class TestFleetBase(unittest.TestCase):

        ps0.terminate()
        ps1.terminate()
-        '''
-        with open("/tmp/tr0_out.log", "wb+") as wn:
-            wn.write(tr0_out)
-        with open("/tmp/tr1_out.log", "wb+") as wn:
-            wn.write(tr1_out)
-        # print server log
-        '''
-
-        # print server log
-        '''
-        with open("/tmp/ps0_err.log", "r") as fn:
-            sys.stderr.write("ps0 stderr: %s\n" % fn.read())
-        with open("/tmp/ps1_err.log", "r") as fn:
-            sys.stderr.write("ps1 stderr: %s\n" % fn.read())
-        '''
-
-        # print log
-        '''
-        with open("/tmp/tr0_err.log", "r") as fn:
-            sys.stderr.write('trainer 0 stderr: %s\n' % fn.read())
-        with open("/tmp/tr1_err.log", "r") as fn:
-            sys.stderr.write('trainer 1 stderr: %s\n' % fn.read())
-        '''

        return 0, 0

--- a/Show More
+++ b/Show More