add fl_listen_and_serv &fl_transpiler,test=develop (#19091)
Add the fl_listen_and_serv op for federated learning; fl_distribute_transpiler adds this op to the pserver program. The op just listens on the endpoint and performs sum & scale (a minimal sketch of that aggregation step follows the commit metadata below).
parent 5368b36512
commit 539c870753
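The "sum & scale" mentioned above is the usual federated-averaging step on the pserver side: the updates received from the trainers are summed and then scaled by 1/trainers. The snippet below is a minimal NumPy sketch of that idea only; it is illustrative, does not use the fl_listen_and_serv op or any PaddlePaddle RPC API, and the helper name `sum_and_scale` is hypothetical.

```python
# Minimal NumPy sketch of the "sum & scale" aggregation the pserver is
# described as performing once all trainer updates have arrived.
# Illustrative only; the op itself does this server-side on RPC-received tensors.
import numpy as np


def sum_and_scale(trainer_updates):  # hypothetical helper, not a Paddle API
    """Average a list of same-shaped parameter updates from the trainers."""
    total = np.zeros_like(trainer_updates[0])
    for update in trainer_updates:  # sum the update from every trainer
        total += update
    return total / len(trainer_updates)  # scale by 1 / number of trainers


# Example with two trainers (matching the two-trainer unit test below):
updates = [np.array([0.2, -0.1]), np.array([0.4, 0.3])]
print(sum_and_scale(updates))  # -> [0.3, 0.1]
```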
@@ -0,0 +1,91 @@
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */

#pragma once

#include <stdint.h>
#include <atomic>
#include <memory>
#include <set>
#include <string>
#include <unordered_map>
#include <utility>
#include <vector>

#include "paddle/fluid/framework/executor.h"
#include "paddle/fluid/framework/lod_tensor.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/framework/threadpool.h"
#include "paddle/fluid/operators/distributed/request_handler.h"
#include "paddle/fluid/operators/distributed/rpc_server.h"
#include "paddle/fluid/platform/device_context.h"

namespace paddle {
namespace operators {

constexpr char kOptimizeBlocks[] = "optimize_blocks";

void FlRunServer(std::shared_ptr<distributed::RPCServer> service);

template <class TKey, class TValue>
class DoubleFindMap : public std::unordered_map<TKey, TValue> {
 public:
  typename std::unordered_map<TKey, TValue>::iterator find_value(TValue v) {
    return std::find_if(this->begin(), this->end(),
                        [&v](const std::pair<const std::string, int> p) {
                          return p.second == v;
                        });
  }
};

class FlListenAndServOp : public framework::OperatorBase {
 public:
  FlListenAndServOp(const std::string& type,
                    const framework::VariableNameMap& inputs,
                    const framework::VariableNameMap& outputs,
                    const framework::AttributeMap& attrs);
  virtual ~FlListenAndServOp();

  void RunSyncLoop(framework::Executor* executor,
                   framework::ProgramDesc* program,
                   framework::Scope* recv_scope,
                   platform::DeviceContext* dev_ctx) const;

  void SavePort() const;

  int GetSelectedPort() { return rpc_service_->GetSelectedPort(); }

  void RunImpl(const framework::Scope& scope,
               const platform::Place& dev_place) const override;

 protected:
  mutable std::shared_ptr<distributed::RPCServer> rpc_service_;
  mutable std::shared_ptr<distributed::RequestHandler> request_send_handler_;
  mutable std::shared_ptr<distributed::RequestHandler> request_get_handler_;

  mutable std::shared_ptr<std::thread> server_thread_;
  mutable std::vector<std::string> sparse_vars_;
  mutable std::vector<std::string> dense_vars_;
};

class FlSignalHandler {
 public:
  static void StopAndExit(int signal_num);

 private:
  DISABLE_COPY_AND_ASSIGN(FlSignalHandler);
};

}  // namespace operators
}  // namespace paddle
@@ -0,0 +1,176 @@
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from __future__ import print_function

import paddle
import paddle.fluid as fluid
from paddle.fluid import Program
import os
import signal
import subprocess
import time
import unittest
from multiprocessing import Process
from op_test import OpTest
import numpy
import urllib
import sys


def run_trainer(use_cuda, sync_mode, ip, port, trainers, trainer_id):
    x = fluid.layers.data(name='x', shape=[1], dtype='float32')
    y_predict = fluid.layers.fc(input=x, size=1, act=None)
    y = fluid.layers.data(name='y', shape=[1], dtype='float32')
    # loss function
    cost = fluid.layers.square_error_cost(input=y_predict, label=y)
    avg_cost = fluid.layers.mean(cost)
    # optimizer
    sgd_optimizer = fluid.optimizer.SGD(learning_rate=0.001)
    sgd_optimizer.minimize(avg_cost)
    with open("trainer_recv_program.dms", "rb") as f:
        trainer_recv_program_desc_str = f.read()
    with open("trainer_main_program.dms", "rb") as f:
        trainer_main_program_desc_str = f.read()
    with open("trainer_send_program.dms", "rb") as f:
        trainer_send_program_desc_str = f.read()
    recv_program = Program.parse_from_string(trainer_recv_program_desc_str)
    main_program = Program.parse_from_string(trainer_main_program_desc_str)
    send_program = Program.parse_from_string(trainer_send_program_desc_str)

    trainer_startup_program = fluid.default_startup_program()
    place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
    exe = fluid.Executor(place)

    exe.run(trainer_startup_program)
    for i in range(5):
        exe.run(recv_program)
        exe.run(main_program,
                feed={
                    "x": numpy.array([1, 2]).astype('float32').reshape(2, 1),
                    "y": numpy.array([2, 3]).astype('float32').reshape(2, 1)
                })
        exe.run(send_program)


def run_pserver(use_cuda, sync_mode, ip, port, trainers, trainer_id):
    x = fluid.layers.data(name='x', shape=[1], dtype='float32')
    y_predict = fluid.layers.fc(input=x, size=1, act=None)
    y = fluid.layers.data(name='y', shape=[1], dtype='float32')
    # loss function
    cost = fluid.layers.square_error_cost(input=y_predict, label=y)
    avg_cost = fluid.layers.mean(cost)
    # optimizer
    sgd_optimizer = fluid.optimizer.SGD(learning_rate=0.001)
    sgd_optimizer.minimize(avg_cost)
    with open("pserver_startup_program.dms", "rb") as f:
        pserver_startup_program_desc_str = f.read()
    with open("pserver_main_program.dms", "rb") as f:
        pserver_main_program_desc_str = f.read()

    startup_program = Program.parse_from_string(
        pserver_startup_program_desc_str)
    main_program = Program.parse_from_string(pserver_main_program_desc_str)

    place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
    exe = fluid.Executor(place)
    exe.run(startup_program)
    exe.run(main_program)


class TestFlListenAndServOp(OpTest):
    def setUp(self):
        self.ps_timeout = 5
        self.ip = "127.0.0.1"
        self.port = "6000"
        self.trainers = 2
        self.trainer_id = 0

    def _start_pserver(self, use_cuda, sync_mode, pserver_func):
        p = Process(
            target=pserver_func,
            args=(use_cuda, sync_mode, self.ip, self.port, self.trainers,
                  self.trainer_id))
        p.daemon = True
        p.start()
        return p

    def _start_trainer0(self, use_cuda, sync_mode, pserver_func):
        p = Process(
            target=pserver_func,
            args=(use_cuda, sync_mode, self.ip, self.port, self.trainers, 0))
        p.daemon = True
        p.start()
        return p

    def _start_trainer1(self, use_cuda, sync_mode, pserver_func):
        p = Process(
            target=pserver_func,
            args=(use_cuda, sync_mode, self.ip, self.port, self.trainers, 1))
        p.daemon = True
        p.start()
        return p

    def _wait_ps_ready(self, pid):
        start_left_time = self.ps_timeout
        sleep_time = 0.5
        while True:
            assert start_left_time >= 0, "wait ps ready failed"
            time.sleep(sleep_time)
            try:
                os.stat("/tmp/paddle.%d.port" % pid)
                return
            except os.error:
                start_left_time -= sleep_time

    def test_rpc_interfaces(self):
        # TODO(Yancey1989): need to make sure the rpc interface correctly.
        pass

    def test_handle_signal_in_serv_op(self):
        # run pserver on CPU in sync mode
        if sys.platform == 'win32' or sys.platform == 'sys.platform':
            pass
        else:
            print(sys.platform)
            cmd = "wget --no-check-certificate https://paddlefl.bj.bcebos.com/test_fl_listen_and_serv/pserver_startup_program.dms"
            os.system(cmd)
            cmd = "wget --no-check-certificate https://paddlefl.bj.bcebos.com/test_fl_listen_and_serv/pserver_main_program.dms"
            os.system(cmd)
            cmd = "wget --no-check-certificate https://paddlefl.bj.bcebos.com/test_fl_listen_and_serv/trainer_recv_program.dms"
            os.system(cmd)
            cmd = "wget --no-check-certificate https://paddlefl.bj.bcebos.com/test_fl_listen_and_serv/trainer_main_program.dms"
            os.system(cmd)
            cmd = "wget --no-check-certificate https://paddlefl.bj.bcebos.com/test_fl_listen_and_serv/trainer_send_program.dms"
            os.system(cmd)
            p1 = self._start_pserver(False, True, run_pserver)
            self._wait_ps_ready(p1.pid)
            time.sleep(5)
            t1 = self._start_trainer0(False, True, run_trainer)
            time.sleep(2)
            t2 = self._start_trainer1(False, True, run_trainer)
            # raise SIGTERM to pserver
            time.sleep(2)
            cmd_del = "rm trainer*dms* pserver*dms*"
            os.system(cmd_del)
            os.kill(p1.pid, signal.SIGINT)
            p1.join()
            os.kill(t1.pid, signal.SIGINT)
            t1.join()
            os.kill(t2.pid, signal.SIGINT)
            t2.join()


if __name__ == '__main__':
    unittest.main()