From e1290c4fd7facfa9abfbb6e710ab3fa5f4ed3d10 Mon Sep 17 00:00:00 2001 From: wanghaoshuang Date: Wed, 28 Mar 2018 23:09:32 +0800 Subject: [PATCH 01/29] Make Average Model support for 'moving mean' and 'moving variance' of batch_normal op --- python/paddle/fluid/optimizer.py | 28 ++++++++++++++++++++++------ 1 file changed, 22 insertions(+), 6 deletions(-) diff --git a/python/paddle/fluid/optimizer.py b/python/paddle/fluid/optimizer.py index 180575c35d..d21320f705 100644 --- a/python/paddle/fluid/optimizer.py +++ b/python/paddle/fluid/optimizer.py @@ -850,23 +850,39 @@ class ModelAverage(Optimizer): self.min_average_window = min_average_window self.max_average_window = max_average_window self.params_grads = params_grads + + # append 'moving mean' and 'moving variance' to self.params_grads + pattern = re.compile(r"batch_norm_\d+\.w_[1,2]") + for param in framework.default_main_program().global_block( + ).all_parameters(): + if pattern.match(param.name) is not None: + self.params_grads.append((param, None)) + # create a tmp gradient variable to backup parameter value + # for parameter whose grad is None + for i, param_grad in enumerate(self.params_grads): + param, grad = param_grad + if grad is None: + grad = param.block.create_var( + name=unique_name.generate(".".join([param.name, 'tmp'])), + dtype=param.dtype, + persistable=False, + stop_gradient=stop_gradient) + self.params_grads[i] = (param, grad) + for param, grad in self.params_grads: - if grad is not None: - self._append_average_accumulate_op(param) + self._append_average_accumulate_op(param) self.apply_program = Program() block = self.apply_program.global_block() with program_guard(main_program=self.apply_program): for param_grad in self.params_grads: - if param_grad[1] is not None: - self._add_average_apply_op(block, param_grad) + self._add_average_apply_op(block, param_grad) self.restore_program = Program() block = self.restore_program.global_block() with program_guard(main_program=self.restore_program): for param_grad in self.params_grads: - if param_grad[1] is not None: - self._add_average_restore_op(block, param_grad) + self._add_average_restore_op(block, param_grad) def _add_average_apply_op(self, block, param_grad): param = block.clone_variable(param_grad[0]) From 9708b21f191b3ff606651dfaeb7cf65dfd250881 Mon Sep 17 00:00:00 2001 From: wanghaoshuang Date: Mon, 2 Apr 2018 10:51:31 +0800 Subject: [PATCH 02/29] Refine average model option 1. Add attr 'average' into ParamAttr. 2. Make 'params_grads' optional for AverageModel. 3. Add option 'average_mean' and 'average_variance' for batch_normal. --- python/paddle/fluid/framework.py | 4 +++- python/paddle/fluid/layers/nn.py | 12 +++++++++--- python/paddle/fluid/optimizer.py | 28 ++++++++++++---------------- python/paddle/fluid/param_attr.py | 9 ++++++--- 4 files changed, 30 insertions(+), 23 deletions(-) diff --git a/python/paddle/fluid/framework.py b/python/paddle/fluid/framework.py index 3e78788f47..92c299a4b6 100644 --- a/python/paddle/fluid/framework.py +++ b/python/paddle/fluid/framework.py @@ -1137,6 +1137,8 @@ class Parameter(Variable): self.gradient_clip_attr = kwargs.get('gradient_clip_attr', None) + self.average = kwargs.get('average', True) + def __str__(self): return self.to_string(True) @@ -1157,7 +1159,7 @@ class Parameter(Variable): if with_details: res_str = Variable.to_string(self, throw_on_error, True) additional_attr = ("trainable", "optimize_attr", "regularizer", - "gradient_clip_attr") + "gradient_clip_attr", "average") for attr_name in additional_attr: res_str += "%s: %s\n" % (attr_name, str(getattr(self, attr_name))) diff --git a/python/paddle/fluid/layers/nn.py b/python/paddle/fluid/layers/nn.py index 0332556f62..3265ff733b 100644 --- a/python/paddle/fluid/layers/nn.py +++ b/python/paddle/fluid/layers/nn.py @@ -1486,7 +1486,9 @@ def batch_norm(input, in_place=False, name=None, moving_mean_name=None, - moving_variance_name=None): + moving_variance_name=None, + average_mean=True, + average_variance=True): """ This function helps create an operator to implement the BatchNorm layer using the configurations from the input parameters. @@ -1517,7 +1519,10 @@ def batch_norm(input, mean = helper.create_parameter( attr=ParamAttr( - name=moving_mean_name, initializer=Constant(0.0), trainable=False), + name=moving_mean_name, + initializer=Constant(0.0), + trainable=False, + average=average_variance), shape=param_shape, dtype=input.dtype) mean.stop_gradient = True @@ -1526,7 +1531,8 @@ def batch_norm(input, attr=ParamAttr( name=moving_variance_name, initializer=Constant(1.0), - trainable=False), + trainable=False, + average=average_mean), shape=param_shape, dtype=input.dtype) variance.stop_gradient = True diff --git a/python/paddle/fluid/optimizer.py b/python/paddle/fluid/optimizer.py index d21320f705..560257a356 100644 --- a/python/paddle/fluid/optimizer.py +++ b/python/paddle/fluid/optimizer.py @@ -11,7 +11,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. - +import re from collections import defaultdict from paddle.fluid.framework import Program import framework @@ -818,8 +818,8 @@ class ModelAverage(Optimizer): min_average_window, max_average_window and current update times. Args: - params_grads: A list of parameter-grad variable pairs. average_window_rate: The rate of average window. + params_grads: A list of parameter-grad variable pairs. min_average_window: The minimum size of average window. max_average_window: The maximum size of average window. @@ -840,8 +840,8 @@ class ModelAverage(Optimizer): """ def __init__(self, - params_grads, - average_window_rate, + average_window_rate=0.15, + params_grads=None, min_average_window=10000, max_average_window=10000, **kwargs): @@ -849,25 +849,21 @@ class ModelAverage(Optimizer): self.average_window = average_window_rate self.min_average_window = min_average_window self.max_average_window = max_average_window - self.params_grads = params_grads - # append 'moving mean' and 'moving variance' to self.params_grads - pattern = re.compile(r"batch_norm_\d+\.w_[1,2]") + self.params_grads = [] if params_grads is None else params_grads + params = {} + for param, grad in self.params_grads: + params[param.name] = (param, grad) for param in framework.default_main_program().global_block( ).all_parameters(): - if pattern.match(param.name) is not None: - self.params_grads.append((param, None)) - # create a tmp gradient variable to backup parameter value - # for parameter whose grad is None - for i, param_grad in enumerate(self.params_grads): - param, grad = param_grad - if grad is None: + if param.name not in params and param.average: grad = param.block.create_var( name=unique_name.generate(".".join([param.name, 'tmp'])), dtype=param.dtype, persistable=False, - stop_gradient=stop_gradient) - self.params_grads[i] = (param, grad) + stop_gradient=True) + params[param.name] = (param, grad) + self.params_grads = params.values() for param, grad in self.params_grads: self._append_average_accumulate_op(param) diff --git a/python/paddle/fluid/param_attr.py b/python/paddle/fluid/param_attr.py index 255cd21043..74b968f8ee 100644 --- a/python/paddle/fluid/param_attr.py +++ b/python/paddle/fluid/param_attr.py @@ -28,13 +28,15 @@ class ParamAttr(object): learning_rate=1.0, regularizer=None, trainable=True, - gradient_clip=None): + gradient_clip=None, + average=True): self.name = name self.initializer = initializer self.learning_rate = learning_rate self.regularizer = regularizer self.trainable = trainable self.gradient_clip = gradient_clip + self.average = average def set_default_initializer(self, initializer): if initializer is None: @@ -80,7 +82,8 @@ class ParamAttr(object): }, 'regularizer': self.regularizer, 'trainable': self.trainable, - 'gradient_clip_attr': self.gradient_clip + 'gradient_clip_attr': self.gradient_clip, + 'average': self.average } if with_initializer: kwargs['initializer'] = self.initializer @@ -90,7 +93,7 @@ class ParamAttr(object): class WeightNormParamAttr(ParamAttr): """ Used for weight normalization. Any field in ParamAttr can also be set here. - Besides, an extra field dim can be set to indicate the dimension except + Besides, an extra field dim can be set to indicate the dimension except which to normalize. """ # List to record the parameters reparameterized by weight normalization. From f43be75b82582ec5f81c2ceba45eb14128638478 Mon Sep 17 00:00:00 2001 From: typhoonzero Date: Mon, 2 Apr 2018 20:25:11 +0800 Subject: [PATCH 03/29] multi stream thread pool --- paddle/fluid/framework/threadpool.cc | 15 +++++++++++++++ paddle/fluid/framework/threadpool.h | 16 ++++++++++++++++ paddle/fluid/operators/detail/grpc_client.cc | 12 +++++++----- 3 files changed, 38 insertions(+), 5 deletions(-) diff --git a/paddle/fluid/framework/threadpool.cc b/paddle/fluid/framework/threadpool.cc index 9854d618d2..0a8377cc47 100644 --- a/paddle/fluid/framework/threadpool.cc +++ b/paddle/fluid/framework/threadpool.cc @@ -91,5 +91,20 @@ void ThreadPool::TaskLoop() { } } +std::unique_ptr MultiStreamThreadPool::io_threadpool_(nullptr); +std::once_flag MultiStreamThreadPool::io_init_flag_; + +MultiStreamThreadPool* MultiStreamThreadPool::GetInstanceIO() { + std::call_once(io_init_flag_, &MultiStreamThreadPool::InitIO); + return static_cast(io_threadpool_.get()); +} + +void MultiStreamThreadPool::InitIO() { + if (io_threadpool_.get() == nullptr) { + // TODO(typhoonzero1986): make this configurable + io_threadpool_.reset(new ThreadPool(100)); + } +} + } // namespace framework } // namespace paddle diff --git a/paddle/fluid/framework/threadpool.h b/paddle/fluid/framework/threadpool.h index f9dce7105e..5d437594ab 100644 --- a/paddle/fluid/framework/threadpool.h +++ b/paddle/fluid/framework/threadpool.h @@ -135,6 +135,17 @@ class ThreadPool { std::condition_variable completed_; }; +class MultiStreamThreadPool : ThreadPool { + public: + static MultiStreamThreadPool* GetInstanceIO(); + static void InitIO(); + + private: + // NOTE: threadpool in base will be inhereted here. + static std::unique_ptr io_threadpool_; + static std::once_flag io_init_flag_; +}; + // Run a function asynchronously. // NOTE: The function must return void. If the function need to return a value, // you can use lambda to capture a value pointer. @@ -143,5 +154,10 @@ std::future Async(Callback callback) { return ThreadPool::GetInstance()->Run(callback); } +template +std::future AsyncIO(Callback callback) { + return MultiStreamThreadPool::GetInstanceIO()->Run(callback); +} + } // namespace framework } // namespace paddle diff --git a/paddle/fluid/operators/detail/grpc_client.cc b/paddle/fluid/operators/detail/grpc_client.cc index d79ba6d291..3f96ce3718 100644 --- a/paddle/fluid/operators/detail/grpc_client.cc +++ b/paddle/fluid/operators/detail/grpc_client.cc @@ -33,7 +33,8 @@ bool RPCClient::AsyncSendVariable(const std::string& ep, const framework::Scope* p_scope = &scope; const auto ch = GetChannel(ep_val); - framework::Async([var_name_val, p_ctx, ep_val, p_scope, time_out, ch, this] { + framework::AsyncIO([var_name_val, p_ctx, ep_val, p_scope, time_out, ch, + this] { auto* var = p_scope->FindVar(var_name_val); ::grpc::ByteBuffer req; @@ -88,7 +89,8 @@ bool RPCClient::AsyncGetVariable(const std::string& ep, const framework::Scope* p_scope = &scope; const auto ch = GetChannel(ep_val); - framework::Async([var_name_val, ep_val, p_scope, p_ctx, time_out, ch, this] { + framework::AsyncIO([var_name_val, ep_val, p_scope, p_ctx, time_out, ch, + this] { // prepare input sendrecv::VariableMessage req; req.set_varname(var_name_val); @@ -131,8 +133,8 @@ bool RPCClient::AsyncPrefetchVariable(const std::string& ep, const framework::Scope* p_scope = &scope; const auto ch = GetChannel(ep_val); - framework::Async([in_var_name_val, out_var_name_val, ep_val, p_scope, p_ctx, - time_out, ch, this] { + framework::AsyncIO([in_var_name_val, out_var_name_val, ep_val, p_scope, p_ctx, + time_out, ch, this] { auto* var = p_scope->FindVar(in_var_name_val); ::grpc::ByteBuffer req; @@ -195,7 +197,7 @@ bool RPCClient::Wait() { std::vector> waits(req_count_); for (int i = 0; i < req_count_; i++) { - waits[i] = framework::Async([i, &a, this] { a[i] = Proceed(); }); + waits[i] = framework::AsyncIO([i, &a, this] { a[i] = Proceed(); }); } for (int i = 0; i < req_count_; i++) { From b851c0739f29eebfb9d63db026c847733fa8d252 Mon Sep 17 00:00:00 2001 From: typhoonzero Date: Tue, 3 Apr 2018 10:02:34 +0800 Subject: [PATCH 04/29] update compile --- paddle/fluid/framework/threadpool.h | 32 ++++++++++---------- paddle/fluid/operators/detail/grpc_client.cc | 12 +++----- 2 files changed, 21 insertions(+), 23 deletions(-) diff --git a/paddle/fluid/framework/threadpool.h b/paddle/fluid/framework/threadpool.h index 5d437594ab..0a60488d9f 100644 --- a/paddle/fluid/framework/threadpool.h +++ b/paddle/fluid/framework/threadpool.h @@ -28,6 +28,22 @@ limitations under the License. */ namespace paddle { namespace framework { +struct ExceptionHandler { + mutable std::future> future_; + explicit ExceptionHandler( + std::future>&& f) + : future_(std::move(f)) {} + void operator()() const { + auto ex = this->future_.get(); + if (ex != nullptr) { + LOG(FATAL) << "The exception is thrown inside the thread pool. You " + "should use RunAndGetException to handle the exception.\n" + "The default exception handler is LOG(FATAL)." + << ex->what(); + } + } +}; + // ThreadPool maintains a queue of tasks, and runs them using a fixed // number of threads. class ThreadPool { @@ -87,22 +103,6 @@ class ThreadPool { void Wait(); private: - struct ExceptionHandler { - mutable std::future> future_; - explicit ExceptionHandler( - std::future>&& f) - : future_(std::move(f)) {} - void operator()() const { - auto ex = this->future_.get(); - if (ex != nullptr) { - LOG(FATAL) << "The exception is thrown inside the thread pool. You " - "should use RunAndGetException to handle the exception.\n" - "The default exception handler is LOG(FATAL)." - << ex->what(); - } - } - }; - DISABLE_COPY_AND_ASSIGN(ThreadPool); // If the task queue is empty and avaialbe is equal to the number of diff --git a/paddle/fluid/operators/detail/grpc_client.cc b/paddle/fluid/operators/detail/grpc_client.cc index 3f96ce3718..d79ba6d291 100644 --- a/paddle/fluid/operators/detail/grpc_client.cc +++ b/paddle/fluid/operators/detail/grpc_client.cc @@ -33,8 +33,7 @@ bool RPCClient::AsyncSendVariable(const std::string& ep, const framework::Scope* p_scope = &scope; const auto ch = GetChannel(ep_val); - framework::AsyncIO([var_name_val, p_ctx, ep_val, p_scope, time_out, ch, - this] { + framework::Async([var_name_val, p_ctx, ep_val, p_scope, time_out, ch, this] { auto* var = p_scope->FindVar(var_name_val); ::grpc::ByteBuffer req; @@ -89,8 +88,7 @@ bool RPCClient::AsyncGetVariable(const std::string& ep, const framework::Scope* p_scope = &scope; const auto ch = GetChannel(ep_val); - framework::AsyncIO([var_name_val, ep_val, p_scope, p_ctx, time_out, ch, - this] { + framework::Async([var_name_val, ep_val, p_scope, p_ctx, time_out, ch, this] { // prepare input sendrecv::VariableMessage req; req.set_varname(var_name_val); @@ -133,8 +131,8 @@ bool RPCClient::AsyncPrefetchVariable(const std::string& ep, const framework::Scope* p_scope = &scope; const auto ch = GetChannel(ep_val); - framework::AsyncIO([in_var_name_val, out_var_name_val, ep_val, p_scope, p_ctx, - time_out, ch, this] { + framework::Async([in_var_name_val, out_var_name_val, ep_val, p_scope, p_ctx, + time_out, ch, this] { auto* var = p_scope->FindVar(in_var_name_val); ::grpc::ByteBuffer req; @@ -197,7 +195,7 @@ bool RPCClient::Wait() { std::vector> waits(req_count_); for (int i = 0; i < req_count_; i++) { - waits[i] = framework::AsyncIO([i, &a, this] { a[i] = Proceed(); }); + waits[i] = framework::Async([i, &a, this] { a[i] = Proceed(); }); } for (int i = 0; i < req_count_; i++) { From fbd3604cad8fdb3ad7fa2f6717395b1c40e6ecaf Mon Sep 17 00:00:00 2001 From: Liu Yiqun Date: Tue, 3 Apr 2018 05:31:52 +0000 Subject: [PATCH 05/29] Split Executor.Run to Executor.Prepare and Executor.RunPreparedContext for inference. --- paddle/fluid/framework/executor.cc | 94 ++++++++++++------- paddle/fluid/framework/executor.h | 7 ++ .../test_inference_image_classification.cc | 4 +- paddle/fluid/inference/tests/test_helper.h | 20 +++- 4 files changed, 85 insertions(+), 40 deletions(-) diff --git a/paddle/fluid/framework/executor.cc b/paddle/fluid/framework/executor.cc index 64c06687b6..009d0fbeb8 100644 --- a/paddle/fluid/framework/executor.cc +++ b/paddle/fluid/framework/executor.cc @@ -129,13 +129,15 @@ static bool has_feed_operators( feed_count, feed_targets.size(), "The number of feed operators should match 'feed_targets'"); - // When feed operator are present, so should be feed_holder - auto var = block.FindVar(feed_holder_name); - PADDLE_ENFORCE_NOT_NULL(var, "Block should already have a '%s' variable", - feed_holder_name); - PADDLE_ENFORCE_EQ(var->GetType(), proto::VarType::FEED_MINIBATCH, - "'%s' variable should be 'FEED_MINIBATCH' type", - feed_holder_name); + if (!feed_holder_name.empty()) { + // When feed operator are present, so should be feed_holder + auto var = block.FindVar(feed_holder_name); + PADDLE_ENFORCE_NOT_NULL(var, "Block should already have a '%s' variable", + feed_holder_name); + PADDLE_ENFORCE_EQ(var->GetType(), proto::VarType::FEED_MINIBATCH, + "'%s' variable should be 'FEED_MINIBATCH' type", + feed_holder_name); + } } return feed_count > 0; @@ -169,13 +171,15 @@ static bool has_fetch_operators( fetch_count, fetch_targets.size(), "The number of fetch operators should match 'fetch_targets'"); - // When fetch operator are present, so should be fetch_holder - auto var = block.FindVar(fetch_holder_name); - PADDLE_ENFORCE_NOT_NULL(var, "Block should already have a '%s' variable", - fetch_holder_name); - PADDLE_ENFORCE_EQ(var->GetType(), proto::VarType::FETCH_LIST, - "'%s' variable should be 'FETCH_LIST' type", - fetch_holder_name); + if (!fetch_holder_name.empty()) { + // When fetch operator are present, so should be fetch_holder + auto var = block.FindVar(fetch_holder_name); + PADDLE_ENFORCE_NOT_NULL(var, "Block should already have a '%s' variable", + fetch_holder_name); + PADDLE_ENFORCE_EQ(var->GetType(), proto::VarType::FETCH_LIST, + "'%s' variable should be 'FETCH_LIST' type", + fetch_holder_name); + } } return fetch_count > 0; @@ -222,16 +226,6 @@ void Executor::Run(const ProgramDesc& program, Scope* scope, } } - // map the data of feed_targets to feed_holder - for (auto* op : global_block->AllOps()) { - if (op->Type() == kFeedOpType) { - std::string feed_target_name = op->Output("Out")[0]; - int idx = boost::get(op->GetAttr("col")); - SetFeedVariable(scope, *feed_targets[feed_target_name], feed_holder_name, - idx); - } - } - if (!has_fetch_ops) { // create fetch_holder variable auto* fetch_holder = global_block->Var(fetch_holder_name); @@ -255,17 +249,9 @@ void Executor::Run(const ProgramDesc& program, Scope* scope, } } - Run(*copy_program, scope, 0, create_vars, create_vars); - - // obtain the data of fetch_targets from fetch_holder - for (auto* op : global_block->AllOps()) { - if (op->Type() == kFetchOpType) { - std::string fetch_target_name = op->Input("X")[0]; - int idx = boost::get(op->GetAttr("col")); - *fetch_targets[fetch_target_name] = - GetFetchVariable(*scope, fetch_holder_name, idx); - } - } + auto ctx = Prepare(*copy_program, 0); + RunPreparedContext(ctx.get(), scope, feed_targets, fetch_targets, + feed_holder_name, fetch_holder_name, create_vars); } std::unique_ptr Executor::Prepare( @@ -343,5 +329,43 @@ void Executor::RunPreparedContext(ExecutorPrepareContext* ctx, Scope* scope, } } +void Executor::RunPreparedContext( + ExecutorPrepareContext* ctx, Scope* scope, + std::map& feed_targets, + std::map& fetch_targets, + const std::string& feed_holder_name, const std::string& fetch_holder_name, + bool create_vars) { + auto& global_block = ctx->prog_.Block(ctx->block_id_); + + // map the data of feed_targets to feed_holder + for (auto* op : global_block.AllOps()) { + if (op->Type() == kFeedOpType) { + std::string feed_target_name = op->Output("Out")[0]; + PADDLE_ENFORCE(feed_targets.find(feed_target_name) != feed_targets.end(), + "Variable %s is not feeded."); + + int idx = boost::get(op->GetAttr("col")); + SetFeedVariable(scope, *feed_targets[feed_target_name], feed_holder_name, + idx); + } + } + + RunPreparedContext(ctx, scope, create_vars, create_vars); + + // obtain the data of fetch_targets from fetch_holder + for (auto* op : global_block.AllOps()) { + if (op->Type() == kFetchOpType) { + std::string fetch_target_name = op->Input("X")[0]; + PADDLE_ENFORCE( + fetch_targets.find(fetch_target_name) != fetch_targets.end(), + "Variable %s is not fetched."); + + int idx = boost::get(op->GetAttr("col")); + *fetch_targets[fetch_target_name] = + GetFetchVariable(*scope, fetch_holder_name, idx); + } + } +} + } // namespace framework } // namespace paddle diff --git a/paddle/fluid/framework/executor.h b/paddle/fluid/framework/executor.h index 7173c51c95..b0e64d5de0 100644 --- a/paddle/fluid/framework/executor.h +++ b/paddle/fluid/framework/executor.h @@ -65,6 +65,13 @@ class Executor { bool create_local_scope = true, bool create_vars = true); + void RunPreparedContext(ExecutorPrepareContext* ctx, Scope* scope, + std::map& feed_targets, + std::map& fetch_targets, + const std::string& feed_holder_name = "feed", + const std::string& fetch_holder_name = "fetch", + bool create_vars = true); + private: const platform::Place place_; }; diff --git a/paddle/fluid/inference/tests/book/test_inference_image_classification.cc b/paddle/fluid/inference/tests/book/test_inference_image_classification.cc index e9a27171f1..9126efb8c2 100644 --- a/paddle/fluid/inference/tests/book/test_inference_image_classification.cc +++ b/paddle/fluid/inference/tests/book/test_inference_image_classification.cc @@ -48,7 +48,7 @@ TEST(inference, image_classification) { // Run inference on CPU LOG(INFO) << "--- CPU Runs: ---"; - TestInference( + TestInference( dirname, cpu_feeds, cpu_fetchs1, FLAGS_repeat); LOG(INFO) << output1.dims(); @@ -59,7 +59,7 @@ TEST(inference, image_classification) { // Run inference on CUDA GPU LOG(INFO) << "--- GPU Runs: ---"; - TestInference( + TestInference( dirname, cpu_feeds, cpu_fetchs2, FLAGS_repeat); LOG(INFO) << output2.dims(); diff --git a/paddle/fluid/inference/tests/test_helper.h b/paddle/fluid/inference/tests/test_helper.h index dce541c097..d559cc7d03 100644 --- a/paddle/fluid/inference/tests/test_helper.h +++ b/paddle/fluid/inference/tests/test_helper.h @@ -88,7 +88,7 @@ void CheckError(paddle::framework::LoDTensor& output1, EXPECT_EQ(count, 0U) << "There are " << count << " different elements."; } -template +template void TestInference(const std::string& dirname, const std::vector& cpu_feeds, std::vector& cpu_fetchs, @@ -170,7 +170,14 @@ void TestInference(const std::string& dirname, // 6. Run the inference program { // Ignore the profiling results of the first run - executor.Run(*inference_program, scope, feed_targets, fetch_targets); + std::unique_ptr ctx; + if (PrepareContext) { + ctx = executor.Prepare(*inference_program, 0); + executor.RunPreparedContext( + ctx.get(), scope, feed_targets, fetch_targets); + } else { + executor.Run(*inference_program, scope, feed_targets, fetch_targets); + } // Enable the profiler paddle::platform::EnableProfiler(state); @@ -181,7 +188,14 @@ void TestInference(const std::string& dirname, "run_inference", paddle::platform::DeviceContextPool::Instance().Get(place)); - executor.Run(*inference_program, scope, feed_targets, fetch_targets); + if (PrepareContext) { + // Note: if you changed the inference_program, you need to call + // executor.Prepare() again to get a new ExecutorPrepareContext. + executor.RunPreparedContext( + ctx.get(), scope, feed_targets, fetch_targets); + } else { + executor.Run(*inference_program, scope, feed_targets, fetch_targets); + } } // Disable the profiler and print the timing information From 2e40660e7a81962a56d89bdd1e2a86d9f78cab35 Mon Sep 17 00:00:00 2001 From: wanghaoshuang Date: Wed, 4 Apr 2018 18:13:45 +0800 Subject: [PATCH 06/29] Fix some issues. --- python/paddle/fluid/framework.py | 4 ++-- python/paddle/fluid/layers/nn.py | 20 +++++++++++--------- python/paddle/fluid/optimizer.py | 4 ++-- python/paddle/fluid/param_attr.py | 6 +++--- 4 files changed, 18 insertions(+), 16 deletions(-) diff --git a/python/paddle/fluid/framework.py b/python/paddle/fluid/framework.py index 370a477932..6120d66c12 100644 --- a/python/paddle/fluid/framework.py +++ b/python/paddle/fluid/framework.py @@ -1155,7 +1155,7 @@ class Parameter(Variable): self.gradient_clip_attr = kwargs.get('gradient_clip_attr', None) - self.average = kwargs.get('average', True) + self.do_model_average = kwargs.get('do_model_average', None) def __str__(self): return self.to_string(True) @@ -1177,7 +1177,7 @@ class Parameter(Variable): if with_details: res_str = Variable.to_string(self, throw_on_error, True) additional_attr = ("trainable", "optimize_attr", "regularizer", - "gradient_clip_attr", "average") + "gradient_clip_attr", "do_model_average") for attr_name in additional_attr: res_str += "%s: %s\n" % (attr_name, str(getattr(self, attr_name))) diff --git a/python/paddle/fluid/layers/nn.py b/python/paddle/fluid/layers/nn.py index e5ae10636d..37ce738275 100644 --- a/python/paddle/fluid/layers/nn.py +++ b/python/paddle/fluid/layers/nn.py @@ -1489,8 +1489,7 @@ def batch_norm(input, name=None, moving_mean_name=None, moving_variance_name=None, - average_mean=True, - average_variance=True): + do_model_average_for_mean_and_var=False): """ This function helps create an operator to implement the BatchNorm layer using the configurations from the input parameters. @@ -1519,12 +1518,15 @@ def batch_norm(input, bias = helper.create_parameter( attr=helper.bias_attr, shape=param_shape, dtype=dtype, is_bias=True) + if do_model_average_for_mean_and_var: + do_model_average_for_mean_and_var = None + mean = helper.create_parameter( attr=ParamAttr( name=moving_mean_name, initializer=Constant(0.0), trainable=False, - average=average_variance), + do_model_average=do_model_average_for_mean_and_var), shape=param_shape, dtype=input.dtype) mean.stop_gradient = True @@ -1534,7 +1536,7 @@ def batch_norm(input, name=moving_variance_name, initializer=Constant(1.0), trainable=False, - average=average_mean), + do_model_average=do_model_average_for_mean_and_var), shape=param_shape, dtype=input.dtype) variance.stop_gradient = True @@ -3352,14 +3354,14 @@ def reshape(x, shape, actual_shape=None, act=None, inplace=True, name=None): Here are some examples to explain it. 1. Given a 3-D tensor x with a shape [2, 4, 6], and the target shape - is [6, 8], the reshape operator will transform x into a 2-D tensor with + is [6, 8], the reshape operator will transform x into a 2-D tensor with shape [6, 8] and leaving x's data unchanged. 2. Given a 3-D tensor x with a shape [2, 4, 6], and the target shape specified is [2, 3, -1, 2], the reshape operator will transform x into a 4-D tensor with shape [2, 3, 4, 2] and leaving x's data unchanged. In this - case, one dimension of the target shape is set to -1, the value of this - dimension is inferred from the total element number of x and remaining + case, one dimension of the target shape is set to -1, the value of this + dimension is inferred from the total element number of x and remaining dimensions. 3. Given a 3-D tensor x with a shape [2, 4, 6], and the target shape @@ -3593,7 +3595,7 @@ def lrn(input, n=5, k=1.0, alpha=1e-4, beta=0.75, name=None): def pad(x, paddings, pad_value=0., name=None): """ Pads a tensor with a constant value given by :attr:`pad_value`, and the - padded width is specified by :attr:`paddings`. + padded width is specified by :attr:`paddings`. Specifically, the number of values padded before the contents of :attr:`x` in dimension :attr:`i` is indicated by :attr:`paddings[i]`, and the number @@ -3621,7 +3623,7 @@ def pad(x, paddings, pad_value=0., name=None): x (Variable): The input tensor variable. paddings (list): A list of integers. Its elements specify the padded width before and after for each dimension in turn. - The length of :attr:paddings must be + The length of :attr:paddings must be :math:`rank(x) \\times 2`. pad_value (float): The constant value used to pad. name(str|None): A name for this layer(optional). If set None, the layer diff --git a/python/paddle/fluid/optimizer.py b/python/paddle/fluid/optimizer.py index 560257a356..1917b7d044 100644 --- a/python/paddle/fluid/optimizer.py +++ b/python/paddle/fluid/optimizer.py @@ -840,7 +840,7 @@ class ModelAverage(Optimizer): """ def __init__(self, - average_window_rate=0.15, + average_window_rate, params_grads=None, min_average_window=10000, max_average_window=10000, @@ -856,7 +856,7 @@ class ModelAverage(Optimizer): params[param.name] = (param, grad) for param in framework.default_main_program().global_block( ).all_parameters(): - if param.name not in params and param.average: + if param.name not in params and param.do_model_average != False: grad = param.block.create_var( name=unique_name.generate(".".join([param.name, 'tmp'])), dtype=param.dtype, diff --git a/python/paddle/fluid/param_attr.py b/python/paddle/fluid/param_attr.py index 74b968f8ee..1c6970441b 100644 --- a/python/paddle/fluid/param_attr.py +++ b/python/paddle/fluid/param_attr.py @@ -29,14 +29,14 @@ class ParamAttr(object): regularizer=None, trainable=True, gradient_clip=None, - average=True): + do_model_average=None): self.name = name self.initializer = initializer self.learning_rate = learning_rate self.regularizer = regularizer self.trainable = trainable self.gradient_clip = gradient_clip - self.average = average + self.model_average = do_model_average def set_default_initializer(self, initializer): if initializer is None: @@ -83,7 +83,7 @@ class ParamAttr(object): 'regularizer': self.regularizer, 'trainable': self.trainable, 'gradient_clip_attr': self.gradient_clip, - 'average': self.average + 'model_average': self.model_average } if with_initializer: kwargs['initializer'] = self.initializer From a9e826ed495bcd5a5b625d4ce364c8c42d0d0b7d Mon Sep 17 00:00:00 2001 From: Liu Yiqun Date: Sun, 8 Apr 2018 06:32:30 +0000 Subject: [PATCH 07/29] Add the check of has_feed/fetch_operators back. --- paddle/fluid/framework/executor.cc | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/paddle/fluid/framework/executor.cc b/paddle/fluid/framework/executor.cc index 8a0ab118d0..3edaede8d6 100644 --- a/paddle/fluid/framework/executor.cc +++ b/paddle/fluid/framework/executor.cc @@ -352,13 +352,17 @@ void Executor::RunPreparedContext( bool create_vars) { auto& global_block = ctx->prog_.Block(ctx->block_id_); + PADDLE_ENFORCE( + has_feed_operators(global_block, feed_targets, feed_holder_name), + "Program in ExecutorPrepareContext should has feed_ops."); + PADDLE_ENFORCE( + has_fetch_operators(global_block, fetch_targets, fetch_holder_name), + "Program in the prepared context should has fetch_ops."); + // map the data of feed_targets to feed_holder for (auto* op : global_block.AllOps()) { if (op->Type() == kFeedOpType) { std::string feed_target_name = op->Output("Out")[0]; - PADDLE_ENFORCE(feed_targets.find(feed_target_name) != feed_targets.end(), - "Variable %s is not feeded."); - int idx = boost::get(op->GetAttr("col")); SetFeedVariable(scope, *feed_targets[feed_target_name], feed_holder_name, idx); @@ -371,10 +375,6 @@ void Executor::RunPreparedContext( for (auto* op : global_block.AllOps()) { if (op->Type() == kFetchOpType) { std::string fetch_target_name = op->Input("X")[0]; - PADDLE_ENFORCE( - fetch_targets.find(fetch_target_name) != fetch_targets.end(), - "Variable %s is not fetched."); - int idx = boost::get(op->GetAttr("col")); *fetch_targets[fetch_target_name] = GetFetchVariable(*scope, fetch_holder_name, idx); From 9fe938cb2aefcbced1e60fa459c943fa2ea245e6 Mon Sep 17 00:00:00 2001 From: jshower Date: Tue, 10 Apr 2018 03:48:26 +0000 Subject: [PATCH 08/29] Changing network configuration, avoid nan --- .../fluid/tests/book/test_label_semantic_roles.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/python/paddle/fluid/tests/book/test_label_semantic_roles.py b/python/paddle/fluid/tests/book/test_label_semantic_roles.py index c0a6df831a..5fc64ea958 100644 --- a/python/paddle/fluid/tests/book/test_label_semantic_roles.py +++ b/python/paddle/fluid/tests/book/test_label_semantic_roles.py @@ -77,7 +77,7 @@ def db_lstm(word, predicate, ctx_n2, ctx_n1, ctx_0, ctx_p1, ctx_p2, mark, emb_layers.append(mark_embedding) hidden_0_layers = [ - fluid.layers.fc(input=emb, size=hidden_dim) for emb in emb_layers + fluid.layers.fc(input=emb, size=hidden_dim, act='tanh') for emb in emb_layers ] hidden_0 = fluid.layers.sums(input=hidden_0_layers) @@ -94,8 +94,8 @@ def db_lstm(word, predicate, ctx_n2, ctx_n1, ctx_0, ctx_p1, ctx_p2, mark, for i in range(1, depth): mix_hidden = fluid.layers.sums(input=[ - fluid.layers.fc(input=input_tmp[0], size=hidden_dim), - fluid.layers.fc(input=input_tmp[1], size=hidden_dim) + fluid.layers.fc(input=input_tmp[0], size=hidden_dim, act='tanh'), + fluid.layers.fc(input=input_tmp[1], size=hidden_dim, act='tanh') ]) lstm = fluid.layers.dynamic_lstm( @@ -109,8 +109,8 @@ def db_lstm(word, predicate, ctx_n2, ctx_n1, ctx_0, ctx_p1, ctx_p2, mark, input_tmp = [mix_hidden, lstm] feature_out = fluid.layers.sums(input=[ - fluid.layers.fc(input=input_tmp[0], size=label_dict_len), - fluid.layers.fc(input=input_tmp[1], size=label_dict_len) + fluid.layers.fc(input=input_tmp[0], size=label_dict_len, act='tanh'), + fluid.layers.fc(input=input_tmp[1], size=label_dict_len, act='tanh') ]) return feature_out @@ -171,7 +171,7 @@ def train(use_cuda, save_dirname=None, is_local=True): # check other optimizers and check why out will be NAN sgd_optimizer = fluid.optimizer.SGD( learning_rate=fluid.layers.exponential_decay( - learning_rate=0.0001, + learning_rate=0.01, decay_steps=100000, decay_rate=0.5, staircase=True)) From d9a52223852a92d532ff2522cb648758511abe26 Mon Sep 17 00:00:00 2001 From: jshower Date: Tue, 10 Apr 2018 04:57:30 +0000 Subject: [PATCH 09/29] code style --- .../tests/book/test_label_semantic_roles.py | 67 ++++++++++--------- 1 file changed, 34 insertions(+), 33 deletions(-) diff --git a/python/paddle/fluid/tests/book/test_label_semantic_roles.py b/python/paddle/fluid/tests/book/test_label_semantic_roles.py index 5fc64ea958..4f5d30ac00 100644 --- a/python/paddle/fluid/tests/book/test_label_semantic_roles.py +++ b/python/paddle/fluid/tests/book/test_label_semantic_roles.py @@ -70,14 +70,15 @@ def db_lstm(word, predicate, ctx_n2, ctx_n1, ctx_0, ctx_p1, ctx_p2, mark, fluid.layers.embedding( size=[word_dict_len, word_dim], input=x, - param_attr=fluid.ParamAttr( - name=embedding_name, trainable=False)) for x in word_input + param_attr=fluid.ParamAttr(name=embedding_name, trainable=False)) + for x in word_input ] emb_layers.append(predicate_embedding) emb_layers.append(mark_embedding) hidden_0_layers = [ - fluid.layers.fc(input=emb, size=hidden_dim, act='tanh') for emb in emb_layers + fluid.layers.fc(input=emb, size=hidden_dim, act='tanh') + for emb in emb_layers ] hidden_0 = fluid.layers.sums(input=hidden_0_layers) @@ -163,8 +164,7 @@ def train(use_cuda, save_dirname=None, is_local=True): crf_cost = fluid.layers.linear_chain_crf( input=feature_out, label=target, - param_attr=fluid.ParamAttr( - name='crfw', learning_rate=mix_hidden_lr)) + param_attr=fluid.ParamAttr(name='crfw', learning_rate=mix_hidden_lr)) avg_cost = fluid.layers.mean(crf_cost) # TODO(qiao) @@ -189,8 +189,7 @@ def train(use_cuda, save_dirname=None, is_local=True): num_chunk_types=int(math.ceil((label_dict_len - 1) / 2.0))) train_data = paddle.batch( - paddle.reader.shuffle( - paddle.dataset.conll05.test(), buf_size=8192), + paddle.reader.shuffle(paddle.dataset.conll05.test(), buf_size=8192), batch_size=BATCH_SIZE) place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace() @@ -223,24 +222,25 @@ def train(use_cuda, save_dirname=None, is_local=True): exe) if batch_id % 10 == 0: - print("avg_cost:" + str(cost) + " precision:" + str( - precision) + " recall:" + str(recall) + " f1_score:" + - str(f1_score) + " pass_precision:" + str( - pass_precision) + " pass_recall:" + str( - pass_recall) + " pass_f1_score:" + str( - pass_f1_score)) + print( + "avg_cost:" + str(cost) + " precision:" + + str(precision) + " recall:" + str(recall) + + " f1_score:" + str(f1_score) + " pass_precision:" + str( + pass_precision) + " pass_recall:" + str(pass_recall) + + " pass_f1_score:" + str(pass_f1_score)) if batch_id != 0: - print("second per batch: " + str((time.time( - ) - start_time) / batch_id)) + print("second per batch: " + str( + (time.time() - start_time) / batch_id)) # Set the threshold low to speed up the CI test if float(pass_precision) > 0.05: if save_dirname is not None: # TODO(liuyiqun): Change the target to crf_decode - fluid.io.save_inference_model(save_dirname, [ - 'word_data', 'verb_data', 'ctx_n2_data', - 'ctx_n1_data', 'ctx_0_data', 'ctx_p1_data', - 'ctx_p2_data', 'mark_data' - ], [feature_out], exe) + fluid.io.save_inference_model( + save_dirname, [ + 'word_data', 'verb_data', 'ctx_n2_data', + 'ctx_n1_data', 'ctx_0_data', 'ctx_p1_data', + 'ctx_p2_data', 'mark_data' + ], [feature_out], exe) return batch_id = batch_id + 1 @@ -320,19 +320,20 @@ def infer(use_cuda, save_dirname=None): assert feed_target_names[6] == 'ctx_p2_data' assert feed_target_names[7] == 'mark_data' - results = exe.run(inference_program, - feed={ - feed_target_names[0]: word, - feed_target_names[1]: pred, - feed_target_names[2]: ctx_n2, - feed_target_names[3]: ctx_n1, - feed_target_names[4]: ctx_0, - feed_target_names[5]: ctx_p1, - feed_target_names[6]: ctx_p2, - feed_target_names[7]: mark - }, - fetch_list=fetch_targets, - return_numpy=False) + results = exe.run( + inference_program, + feed={ + feed_target_names[0]: word, + feed_target_names[1]: pred, + feed_target_names[2]: ctx_n2, + feed_target_names[3]: ctx_n1, + feed_target_names[4]: ctx_0, + feed_target_names[5]: ctx_p1, + feed_target_names[6]: ctx_p2, + feed_target_names[7]: mark + }, + fetch_list=fetch_targets, + return_numpy=False) print(results[0].lod()) np_data = np.array(results[0]) print("Inference Shape: ", np_data.shape) From 7c1434dd73d367932e98ae569093183d33b7e5fb Mon Sep 17 00:00:00 2001 From: jshower Date: Tue, 10 Apr 2018 07:36:15 +0000 Subject: [PATCH 10/29] code style --- .../tests/book/test_label_semantic_roles.py | 64 +++++++++---------- 1 file changed, 32 insertions(+), 32 deletions(-) diff --git a/python/paddle/fluid/tests/book/test_label_semantic_roles.py b/python/paddle/fluid/tests/book/test_label_semantic_roles.py index 4f5d30ac00..ace2e39ba4 100644 --- a/python/paddle/fluid/tests/book/test_label_semantic_roles.py +++ b/python/paddle/fluid/tests/book/test_label_semantic_roles.py @@ -70,8 +70,8 @@ def db_lstm(word, predicate, ctx_n2, ctx_n1, ctx_0, ctx_p1, ctx_p2, mark, fluid.layers.embedding( size=[word_dict_len, word_dim], input=x, - param_attr=fluid.ParamAttr(name=embedding_name, trainable=False)) - for x in word_input + param_attr=fluid.ParamAttr( + name=embedding_name, trainable=False)) for x in word_input ] emb_layers.append(predicate_embedding) emb_layers.append(mark_embedding) @@ -164,7 +164,8 @@ def train(use_cuda, save_dirname=None, is_local=True): crf_cost = fluid.layers.linear_chain_crf( input=feature_out, label=target, - param_attr=fluid.ParamAttr(name='crfw', learning_rate=mix_hidden_lr)) + param_attr=fluid.ParamAttr( + name='crfw', learning_rate=mix_hidden_lr)) avg_cost = fluid.layers.mean(crf_cost) # TODO(qiao) @@ -189,7 +190,8 @@ def train(use_cuda, save_dirname=None, is_local=True): num_chunk_types=int(math.ceil((label_dict_len - 1) / 2.0))) train_data = paddle.batch( - paddle.reader.shuffle(paddle.dataset.conll05.test(), buf_size=8192), + paddle.reader.shuffle( + paddle.dataset.conll05.test(), buf_size=8192), batch_size=BATCH_SIZE) place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace() @@ -222,25 +224,24 @@ def train(use_cuda, save_dirname=None, is_local=True): exe) if batch_id % 10 == 0: - print( - "avg_cost:" + str(cost) + " precision:" + - str(precision) + " recall:" + str(recall) + - " f1_score:" + str(f1_score) + " pass_precision:" + str( - pass_precision) + " pass_recall:" + str(pass_recall) - + " pass_f1_score:" + str(pass_f1_score)) + print("avg_cost:" + str(cost) + " precision:" + str( + precision) + " recall:" + str(recall) + " f1_score:" + + str(f1_score) + " pass_precision:" + str( + pass_precision) + " pass_recall:" + str( + pass_recall) + " pass_f1_score:" + str( + pass_f1_score)) if batch_id != 0: - print("second per batch: " + str( - (time.time() - start_time) / batch_id)) + print("second per batch: " + str((time.time( + ) - start_time) / batch_id)) # Set the threshold low to speed up the CI test if float(pass_precision) > 0.05: if save_dirname is not None: # TODO(liuyiqun): Change the target to crf_decode - fluid.io.save_inference_model( - save_dirname, [ - 'word_data', 'verb_data', 'ctx_n2_data', - 'ctx_n1_data', 'ctx_0_data', 'ctx_p1_data', - 'ctx_p2_data', 'mark_data' - ], [feature_out], exe) + fluid.io.save_inference_model(save_dirname, [ + 'word_data', 'verb_data', 'ctx_n2_data', + 'ctx_n1_data', 'ctx_0_data', 'ctx_p1_data', + 'ctx_p2_data', 'mark_data' + ], [feature_out], exe) return batch_id = batch_id + 1 @@ -320,20 +321,19 @@ def infer(use_cuda, save_dirname=None): assert feed_target_names[6] == 'ctx_p2_data' assert feed_target_names[7] == 'mark_data' - results = exe.run( - inference_program, - feed={ - feed_target_names[0]: word, - feed_target_names[1]: pred, - feed_target_names[2]: ctx_n2, - feed_target_names[3]: ctx_n1, - feed_target_names[4]: ctx_0, - feed_target_names[5]: ctx_p1, - feed_target_names[6]: ctx_p2, - feed_target_names[7]: mark - }, - fetch_list=fetch_targets, - return_numpy=False) + results = exe.run(inference_program, + feed={ + feed_target_names[0]: word, + feed_target_names[1]: pred, + feed_target_names[2]: ctx_n2, + feed_target_names[3]: ctx_n1, + feed_target_names[4]: ctx_0, + feed_target_names[5]: ctx_p1, + feed_target_names[6]: ctx_p2, + feed_target_names[7]: mark + }, + fetch_list=fetch_targets, + return_numpy=False) print(results[0].lod()) np_data = np.array(results[0]) print("Inference Shape: ", np_data.shape) From ad6ddf533cfb1542283f741cddb78835fb3b8658 Mon Sep 17 00:00:00 2001 From: jshower Date: Tue, 10 Apr 2018 09:23:11 +0000 Subject: [PATCH 11/29] for ci --- python/paddle/fluid/tests/book/test_label_semantic_roles.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/python/paddle/fluid/tests/book/test_label_semantic_roles.py b/python/paddle/fluid/tests/book/test_label_semantic_roles.py index ace2e39ba4..4d8bca4d24 100644 --- a/python/paddle/fluid/tests/book/test_label_semantic_roles.py +++ b/python/paddle/fluid/tests/book/test_label_semantic_roles.py @@ -37,7 +37,7 @@ depth = 8 mix_hidden_lr = 1e-3 IS_SPARSE = True -PASS_NUM = 10 +PASS_NUM = 100 BATCH_SIZE = 10 embedding_name = 'emb' @@ -234,7 +234,7 @@ def train(use_cuda, save_dirname=None, is_local=True): print("second per batch: " + str((time.time( ) - start_time) / batch_id)) # Set the threshold low to speed up the CI test - if float(pass_precision) > 0.05: + if float(pass_precision) > 0.01: if save_dirname is not None: # TODO(liuyiqun): Change the target to crf_decode fluid.io.save_inference_model(save_dirname, [ From a7c6bf771c493cc9031975ceabcb126ef9ed1188 Mon Sep 17 00:00:00 2001 From: wanghaoshuang Date: Wed, 11 Apr 2018 09:53:56 +0800 Subject: [PATCH 12/29] Change do_model_average_for_mean_and_var to boolean in batch_normal. --- python/paddle/fluid/layers/nn.py | 3 --- python/paddle/fluid/optimizer.py | 3 ++- 2 files changed, 2 insertions(+), 4 deletions(-) diff --git a/python/paddle/fluid/layers/nn.py b/python/paddle/fluid/layers/nn.py index 37ce738275..56c37f05cc 100644 --- a/python/paddle/fluid/layers/nn.py +++ b/python/paddle/fluid/layers/nn.py @@ -1518,9 +1518,6 @@ def batch_norm(input, bias = helper.create_parameter( attr=helper.bias_attr, shape=param_shape, dtype=dtype, is_bias=True) - if do_model_average_for_mean_and_var: - do_model_average_for_mean_and_var = None - mean = helper.create_parameter( attr=ParamAttr( name=moving_mean_name, diff --git a/python/paddle/fluid/optimizer.py b/python/paddle/fluid/optimizer.py index 1917b7d044..36503cac6d 100644 --- a/python/paddle/fluid/optimizer.py +++ b/python/paddle/fluid/optimizer.py @@ -853,7 +853,8 @@ class ModelAverage(Optimizer): self.params_grads = [] if params_grads is None else params_grads params = {} for param, grad in self.params_grads: - params[param.name] = (param, grad) + if param.do_model_average != False: + params[param.name] = (param, grad) for param in framework.default_main_program().global_block( ).all_parameters(): if param.name not in params and param.do_model_average != False: From 52987902c98378432ba9e3fc54307e19e87aaca3 Mon Sep 17 00:00:00 2001 From: Yu Yang Date: Wed, 11 Apr 2018 16:52:16 +0800 Subject: [PATCH 13/29] Polish reshape op --- paddle/fluid/operators/reshape_op.h | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/paddle/fluid/operators/reshape_op.h b/paddle/fluid/operators/reshape_op.h index 807e5ad951..9abc78421a 100644 --- a/paddle/fluid/operators/reshape_op.h +++ b/paddle/fluid/operators/reshape_op.h @@ -60,7 +60,7 @@ class ReshapeOp : public framework::OperatorWithKernel { static framework::DDim ValidateShape(const std::vector shape, const framework::DDim &in_dims) { const int64_t in_size = framework::product(in_dims); - // only one dimension canbe set to -1, whose size will be automatically + // only one dimension can be set to -1, whose size will be automatically // infered. const int64_t unk_dim_val = -1; const int64_t copy_dim_val = 0; @@ -119,13 +119,15 @@ class ReshapeKernel : public framework::OpKernel { auto *shape_tensor = ctx.Input("Shape"); framework::DDim out_dims = out->dims(); + if (shape_tensor) { auto *shape_data = shape_tensor->data(); + framework::Tensor cpu_shape_tensor; if (platform::is_gpu_place(ctx.GetPlace())) { - framework::Tensor cpu_shape_tensor; TensorCopy(*shape_tensor, platform::CPUPlace(), ctx.device_context(), &cpu_shape_tensor); shape_data = cpu_shape_tensor.data(); + ctx.device_context().Wait(); } auto shape = std::vector(shape_data, shape_data + shape_tensor->numel()); From 70500398b63cf8a80a6113ada9e06aa5e98a541e Mon Sep 17 00:00:00 2001 From: typhoonzero Date: Thu, 12 Apr 2018 09:54:33 +0800 Subject: [PATCH 14/29] wip --- paddle/fluid/operators/detail/grpc_client.cc | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/paddle/fluid/operators/detail/grpc_client.cc b/paddle/fluid/operators/detail/grpc_client.cc index 8bbfd1f159..b546aa1d2f 100644 --- a/paddle/fluid/operators/detail/grpc_client.cc +++ b/paddle/fluid/operators/detail/grpc_client.cc @@ -35,7 +35,8 @@ bool RPCClient::AsyncSendVariable(const std::string& ep, const framework::Scope* p_scope = &scope; const auto ch = GetChannel(ep_val); - framework::Async([var_name_val, p_ctx, ep_val, p_scope, time_out, ch, this] { + framework::AsyncIO([var_name_val, p_ctx, ep_val, p_scope, time_out, ch, + this] { auto* var = p_scope->FindVar(var_name_val); ::grpc::ByteBuffer req; @@ -90,7 +91,8 @@ bool RPCClient::AsyncGetVariable(const std::string& ep, const framework::Scope* p_scope = &scope; const auto ch = GetChannel(ep_val); - framework::Async([var_name_val, ep_val, p_scope, p_ctx, time_out, ch, this] { + framework::AsyncIO([var_name_val, ep_val, p_scope, p_ctx, time_out, ch, + this] { // prepare input sendrecv::VariableMessage req; req.set_varname(var_name_val); @@ -133,8 +135,8 @@ bool RPCClient::AsyncPrefetchVariable(const std::string& ep, const framework::Scope* p_scope = &scope; const auto ch = GetChannel(ep_val); - framework::Async([in_var_name_val, out_var_name_val, ep_val, p_scope, p_ctx, - time_out, ch, this] { + framework::AsyncIO([in_var_name_val, out_var_name_val, ep_val, p_scope, p_ctx, + time_out, ch, this] { auto* var = p_scope->FindVar(in_var_name_val); ::grpc::ByteBuffer req; @@ -197,7 +199,7 @@ bool RPCClient::Wait() { std::vector> waits(req_count_); for (int i = 0; i < req_count_; i++) { - waits[i] = framework::Async([i, &a, this] { a[i] = Proceed(); }); + waits[i] = framework::AsyncIO([i, &a, this] { a[i] = Proceed(); }); } for (int i = 0; i < req_count_; i++) { From d798e3258d475140a532804d7cd4980aa38475cd Mon Sep 17 00:00:00 2001 From: typhoonzero Date: Thu, 12 Apr 2018 09:57:38 +0800 Subject: [PATCH 15/29] update grpc version --- cmake/external/grpc.cmake | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/cmake/external/grpc.cmake b/cmake/external/grpc.cmake index 0853b98181..aa24915947 100644 --- a/cmake/external/grpc.cmake +++ b/cmake/external/grpc.cmake @@ -24,16 +24,16 @@ SET(GRPC_INSTALL_DIR ${THIRD_PARTY_PATH}/install/grpc) SET(GRPC_INCLUDE_DIR "${GRPC_INSTALL_DIR}/include/" CACHE PATH "grpc include directory." FORCE) SET(GRPC_CPP_PLUGIN "${GRPC_INSTALL_DIR}/bin/grpc_cpp_plugin" CACHE FILEPATH "GRPC_CPP_PLUGIN" FORCE) IF(APPLE) - SET(BUILD_CMD make -n HAS_SYSTEM_PROTOBUF=false -s -j8 static grpc_cpp_plugin | sed "s/-Werror//g" | sh) + SET(BUILD_CMD make -n HAS_SYSTEM_PROTOBUF=false -s -j static grpc_cpp_plugin | sed "s/-Werror//g" | sh) ELSE() - SET(BUILD_CMD make HAS_SYSTEM_PROTOBUF=false -s -j8 static grpc_cpp_plugin) + SET(BUILD_CMD make HAS_SYSTEM_PROTOBUF=false -s -j static grpc_cpp_plugin) ENDIF() ExternalProject_Add( extern_grpc DEPENDS protobuf zlib GIT_REPOSITORY "https://github.com/grpc/grpc.git" - GIT_TAG "v1.8.x" + GIT_TAG "v1.11.x" PREFIX ${GRPC_SOURCES_DIR} UPDATE_COMMAND "" CONFIGURE_COMMAND "" From 0532bc4078f59e44967df2ebca4e2aa0bd28ea36 Mon Sep 17 00:00:00 2001 From: Yang Yang Date: Thu, 12 Apr 2018 11:43:46 +0800 Subject: [PATCH 16/29] init --- Dockerfile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Dockerfile b/Dockerfile index fbec88c796..7856d3bbc4 100644 --- a/Dockerfile +++ b/Dockerfile @@ -1,6 +1,6 @@ # A image for building paddle binaries # Use cuda devel base image for both cpu and gpu environment -FROM nvidia/cuda:8.0-cudnn5-devel-ubuntu16.04 +FROM nvidia/cuda:8.0-cudnn7-devel-ubuntu16.04 MAINTAINER PaddlePaddle Authors ARG UBUNTU_MIRROR From e90e7ab237723ddab75be247d5f29780968924f4 Mon Sep 17 00:00:00 2001 From: Yiqun Liu Date: Thu, 12 Apr 2018 11:45:43 +0800 Subject: [PATCH 17/29] Remove the use of ARCHIVE_START/END (#9844) * Add USE_OP of all operators and kernels and remove ARCHIVE_START/END in CMakeLists.txt of inference unittests. * Remove ARCHIVE_START/END when linking inference shared library. * Disable some fluid related cmake operations for cross-compiling. --- cmake/cblas.cmake | 34 +++++++++++-------- cmake/external/snappy.cmake | 16 ++++----- cmake/external/snappystream.cmake | 14 ++++---- cmake/generic.cmake | 15 ++------ cmake/inference_lib.cmake | 32 +++++++++++++++++ paddle/CMakeLists.txt | 2 +- paddle/fluid/CMakeLists.txt | 3 +- paddle/fluid/inference/CMakeLists.txt | 4 +-- paddle/fluid/inference/io.cc | 6 ++++ paddle/fluid/inference/io.h | 3 ++ .../fluid/inference/tests/book/CMakeLists.txt | 2 +- 11 files changed, 83 insertions(+), 48 deletions(-) diff --git a/cmake/cblas.cmake b/cmake/cblas.cmake index 6320b17520..52a22c1fbf 100644 --- a/cmake/cblas.cmake +++ b/cmake/cblas.cmake @@ -62,29 +62,33 @@ endif() ## Then find the reference-cblas. www.netlib.org/blas/ - - set(REFERENCE_CBLAS_ROOT $ENV{REFERENCE_CBLAS_ROOT} CACHE PATH "Folder contains reference-cblas") -set(REFERENCE_CBLAS_INCLUDE_SEARCH_PATHS - ${REFERENCE_CBLAS_ROOT}/include - /usr/include - /usr/include/cblas -) - -set(REFERENCE_CBLAS_LIB_SEARCH_PATHS - ${REFERENCE_CBLAS_ROOT}/lib - /usr/lib - /usr/lib/blas/reference/ - /usr/lib/reference/ -) +if(NOT CMAKE_CROSSCOMPILING) + set(REFERENCE_CBLAS_INCLUDE_SEARCH_PATHS + ${REFERENCE_CBLAS_ROOT}/include + /usr/include + /usr/include/cblas + ) + + set(REFERENCE_CBLAS_LIB_SEARCH_PATHS + ${REFERENCE_CBLAS_ROOT}/lib + /usr/lib + /usr/lib/blas/reference/ + /usr/lib/reference/ + ) +else() + # Diable the finding of reference cblas under host's system path + set(REFERENCE_CBLAS_INCLUDE_SEARCH_PATHS ${REFERENCE_CBLAS_ROOT}/include) + set(REFERENCE_CBLAS_LIB_SEARCH_PATHS ${REFERENCE_CBLAS_ROOT}/lib) +endif() find_path(REFERENCE_CBLAS_INCLUDE_DIR NAMES cblas.h PATHS ${REFERENCE_CBLAS_INCLUDE_SEARCH_PATHS}) find_library(REFERENCE_CBLAS_LIBRARY NAMES cblas PATHS ${REFERENCE_CBLAS_LIB_SEARCH_PATHS}) -if (REFERENCE_CBLAS_INCLUDE_DIR AND REFERENCE_CBLAS_LIBRARY) +if(REFERENCE_CBLAS_INCLUDE_DIR AND REFERENCE_CBLAS_LIBRARY) set(CBLAS_FOUND ON) set(CBLAS_PROVIDER REFERENCE) set(CBLAS_INC_DIR ${REFERENCE_CBLAS_INCLUDE_DIR}) diff --git a/cmake/external/snappy.cmake b/cmake/external/snappy.cmake index 71f54c425d..80282329c6 100644 --- a/cmake/external/snappy.cmake +++ b/cmake/external/snappy.cmake @@ -11,19 +11,20 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -# -IF(MOBILE_INFERENCE) +if(MOBILE_INFERENCE OR RPI) return() -ENDIF() +endif() include (ExternalProject) # NOTE: snappy is needed when linking with recordio -SET(SNAPPY_SOURCES_DIR ${THIRD_PARTY_PATH}/snappy) -SET(SNAPPY_INSTALL_DIR ${THIRD_PARTY_PATH}/install/snappy) -SET(SNAPPY_INCLUDE_DIR "${SNAPPY_INSTALL_DIR}/include/" CACHE PATH "snappy include directory." FORCE) +set(SNAPPY_SOURCES_DIR ${THIRD_PARTY_PATH}/snappy) +set(SNAPPY_INSTALL_DIR ${THIRD_PARTY_PATH}/install/snappy) +set(SNAPPY_INCLUDE_DIR "${SNAPPY_INSTALL_DIR}/include" CACHE PATH "snappy include directory." FORCE) + +set(SNAPPY_LIBRARIES "${SNAPPY_INSTALL_DIR}/lib/libsnappy.a") ExternalProject_Add( extern_snappy @@ -51,8 +52,7 @@ ExternalProject_Add( ) add_library(snappy STATIC IMPORTED GLOBAL) -set_property(TARGET snappy PROPERTY IMPORTED_LOCATION - "${SNAPPY_INSTALL_DIR}/lib/libsnappy.a") +set_property(TARGET snappy PROPERTY IMPORTED_LOCATION ${SNAPPY_LIBRARIES}) include_directories(${SNAPPY_INCLUDE_DIR}) add_dependencies(snappy extern_snappy) diff --git a/cmake/external/snappystream.cmake b/cmake/external/snappystream.cmake index 8f7a3bf8ee..20a9643082 100644 --- a/cmake/external/snappystream.cmake +++ b/cmake/external/snappystream.cmake @@ -11,9 +11,8 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -# -IF(MOBILE_INFERENCE) +IF(MOBILE_INFERENCE OR RPI) return() ENDIF() @@ -21,9 +20,11 @@ include (ExternalProject) # NOTE: snappy is needed when linking with recordio -SET(SNAPPYSTREAM_SOURCES_DIR ${THIRD_PARTY_PATH}/snappy_stream) -SET(SNAPPYSTREAM_INSTALL_DIR ${THIRD_PARTY_PATH}/install/snappy_stream) -SET(SNAPPYSTREAM_INCLUDE_DIR "${SNAPPYSTREAM_INSTALL_DIR}/include/" CACHE PATH "snappy stream include directory." FORCE) +set(SNAPPYSTREAM_SOURCES_DIR ${THIRD_PARTY_PATH}/snappy_stream) +set(SNAPPYSTREAM_INSTALL_DIR ${THIRD_PARTY_PATH}/install/snappy_stream) +set(SNAPPYSTREAM_INCLUDE_DIR "${SNAPPYSTREAM_INSTALL_DIR}/include" CACHE PATH "snappy stream include directory." FORCE) + +set(SNAPPYSTREAM_LIBRARIES "${SNAPPYSTREAM_INSTALL_DIR}/lib/libsnappystream.a") ExternalProject_Add( extern_snappystream @@ -51,8 +52,7 @@ ExternalProject_Add( ) add_library(snappystream STATIC IMPORTED GLOBAL) -set_property(TARGET snappystream PROPERTY IMPORTED_LOCATION - "${SNAPPYSTREAM_INSTALL_DIR}/lib/libsnappystream.a") +set_property(TARGET snappystream PROPERTY IMPORTED_LOCATION ${SNAPPYSTREAM_LIBRARIES}) include_directories(${SNAPPYSTREAM_INCLUDE_DIR}) # For snappysteam to include its own headers. include_directories(${THIRD_PARTY_PATH}/install) # For Paddle to include snappy stream headers. diff --git a/cmake/generic.cmake b/cmake/generic.cmake index c4c9f77df8..1d3e2ade6d 100644 --- a/cmake/generic.cmake +++ b/cmake/generic.cmake @@ -195,14 +195,7 @@ function(cc_library TARGET_NAME) list(REMOVE_ITEM cc_library_DEPS warpctc) add_dependencies(${TARGET_NAME} warpctc) endif() - if("${cc_library_DEPS}" MATCHES "ARCHIVE_START") - # Support linking flags: --whole-archive (Linux) / -force_load (MacOS). - # WARNING: Please don't use ARCHIVE_START&ARCHIVE_END if TARGET_NAME will be linked by other libraries. - target_circle_link_libraries(${TARGET_NAME} ${cc_library_DEPS}) - list(REMOVE_ITEM cc_library_DEPS ARCHIVE_START ARCHIVE_END) - else() - target_link_libraries(${TARGET_NAME} ${cc_library_DEPS}) - endif() + target_link_libraries(${TARGET_NAME} ${cc_library_DEPS}) add_dependencies(${TARGET_NAME} ${cc_library_DEPS}) endif() @@ -243,11 +236,7 @@ function(cc_test TARGET_NAME) set(multiValueArgs SRCS DEPS ARGS) cmake_parse_arguments(cc_test "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN}) add_executable(${TARGET_NAME} ${cc_test_SRCS}) - # Support linking flags: --whole-archive (Linux) / -force_load (MacOS) - target_circle_link_libraries(${TARGET_NAME} ${cc_test_DEPS} paddle_gtest_main memory gtest gflags glog) - if("${cc_test_DEPS}" MATCHES "ARCHIVE_START") - list(REMOVE_ITEM cc_test_DEPS ARCHIVE_START ARCHIVE_END) - endif() + target_link_libraries(${TARGET_NAME} ${cc_test_DEPS} paddle_gtest_main memory gtest gflags glog) add_dependencies(${TARGET_NAME} ${cc_test_DEPS} paddle_gtest_main memory gtest gflags glog) add_test(NAME ${TARGET_NAME} COMMAND ${TARGET_NAME} ${cc_test_ARGS} diff --git a/cmake/inference_lib.cmake b/cmake/inference_lib.cmake index 0323cd9698..cc75801982 100644 --- a/cmake/inference_lib.cmake +++ b/cmake/inference_lib.cmake @@ -1,7 +1,22 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + set_property(GLOBAL PROPERTY FLUID_MODULES "") # find all fluid modules is used for paddle fluid static library function(find_fluid_modules TARGET_NAME) get_filename_component(__target_path ${TARGET_NAME} ABSOLUTE) + string(REGEX REPLACE "^${PADDLE_SOURCE_DIR}/" "" __target_path ${__target_path}) string(FIND "${__target_path}" "fluid" pos) if(pos GREATER 1) get_property(fluid_modules GLOBAL PROPERTY FLUID_MODULES) @@ -77,6 +92,23 @@ elseif (WITH_MKLML) ) endif() +if(NOT MOBILE_INFERENCE AND NOT RPI) + set(dst_dir "${CMAKE_INSTALL_PREFIX}/third_party/install/snappy") + copy(snappy_lib + SRCS ${SNAPPY_INCLUDE_DIR} ${SNAPPY_LIBRARIES} + DSTS ${dst_dir} ${dst_dir}/lib) + + set(dst_dir "${CMAKE_INSTALL_PREFIX}/third_party/install/snappystream") + copy(snappystream_lib + SRCS ${SNAPPYSTREAM_INCLUDE_DIR} ${SNAPPYSTREAM_LIBRARIES} + DSTS ${dst_dir} ${dst_dir}/lib) + + set(dst_dir "${CMAKE_INSTALL_PREFIX}/third_party/install/zlib") + copy(zlib_lib + SRCS ${ZLIB_INCLUDE_DIR} ${ZLIB_LIBRARIES} + DSTS ${dst_dir} ${dst_dir}/lib) +endif() + # paddle fluid module set(src_dir "${PADDLE_SOURCE_DIR}/paddle/fluid") set(dst_dir "${CMAKE_INSTALL_PREFIX}/paddle/fluid") diff --git a/paddle/CMakeLists.txt b/paddle/CMakeLists.txt index c44f8a8a8e..8b1ca5e165 100644 --- a/paddle/CMakeLists.txt +++ b/paddle/CMakeLists.txt @@ -24,6 +24,6 @@ if(NOT WITH_FLUID_ONLY) endif() add_subdirectory(testing) -if(NOT MOBILE_INFERENCE AND NOT ANDROID AND NOT IOS) +if(NOT MOBILE_INFERENCE AND NOT RPI) add_subdirectory(fluid) endif() diff --git a/paddle/fluid/CMakeLists.txt b/paddle/fluid/CMakeLists.txt index d725763b01..d274d96c29 100644 --- a/paddle/fluid/CMakeLists.txt +++ b/paddle/fluid/CMakeLists.txt @@ -3,6 +3,7 @@ add_subdirectory(platform) add_subdirectory(framework) add_subdirectory(operators) add_subdirectory(pybind) -add_subdirectory(inference) add_subdirectory(string) add_subdirectory(recordio) +# NOTE: please add subdirectory inference at last. +add_subdirectory(inference) diff --git a/paddle/fluid/inference/CMakeLists.txt b/paddle/fluid/inference/CMakeLists.txt index f417f62f3f..e53bcf2384 100644 --- a/paddle/fluid/inference/CMakeLists.txt +++ b/paddle/fluid/inference/CMakeLists.txt @@ -1,4 +1,4 @@ -set(FLUID_CORE_MODULES proto_desc memory lod_tensor executor prune init) +set(FLUID_CORE_MODULES proto_desc memory lod_tensor executor init) cc_library(paddle_fluid_api SRCS io.cc @@ -11,7 +11,7 @@ cc_library(paddle_fluid DEPS ${fluid_modules}) # Create shared library cc_library(paddle_fluid_shared SHARED SRCS io.cc - DEPS ARCHIVE_START ${GLOB_OP_LIB} ${FLUID_CORE_MODULES} ARCHIVE_END) + DEPS ${fluid_modules}) set_target_properties(paddle_fluid_shared PROPERTIES OUTPUT_NAME paddle_fluid) if(NOT APPLE) # TODO(liuyiqun): Temporarily disable the link flag because it is not support on Mac. diff --git a/paddle/fluid/inference/io.cc b/paddle/fluid/inference/io.cc index a5b62ef322..a29d457b6f 100644 --- a/paddle/fluid/inference/io.cc +++ b/paddle/fluid/inference/io.cc @@ -17,10 +17,16 @@ limitations under the License. */ #include #include "paddle/fluid/framework/block_desc.h" #include "paddle/fluid/framework/feed_fetch_type.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/pybind/pybind.h" namespace paddle { namespace inference { +// Temporarilly add this function for exposing framework::InitDevices() when +// linking the inference shared library. +void Init(bool init_p2p) { framework::InitDevices(init_p2p); } + void ReadBinaryFile(const std::string& filename, std::string& contents) { std::ifstream fin(filename, std::ios::in | std::ios::binary); PADDLE_ENFORCE(static_cast(fin), "Cannot open file %s", filename); diff --git a/paddle/fluid/inference/io.h b/paddle/fluid/inference/io.h index d07d315b93..756c936b33 100644 --- a/paddle/fluid/inference/io.h +++ b/paddle/fluid/inference/io.h @@ -18,12 +18,15 @@ limitations under the License. */ #include #include #include "paddle/fluid/framework/executor.h" +#include "paddle/fluid/framework/init.h" #include "paddle/fluid/framework/program_desc.h" #include "paddle/fluid/framework/scope.h" namespace paddle { namespace inference { +void Init(bool init_p2p); + void LoadPersistables(framework::Executor& executor, framework::Scope& scope, const framework::ProgramDesc& main_program, const std::string& dirname, diff --git a/paddle/fluid/inference/tests/book/CMakeLists.txt b/paddle/fluid/inference/tests/book/CMakeLists.txt index 86e36f3f65..97d9f03f88 100644 --- a/paddle/fluid/inference/tests/book/CMakeLists.txt +++ b/paddle/fluid/inference/tests/book/CMakeLists.txt @@ -17,7 +17,7 @@ function(inference_test TARGET_NAME) string(REGEX REPLACE "^_$" "" arg "${arg}") cc_test(test_inference_${TARGET_NAME}${arg} SRCS test_inference_${TARGET_NAME}.cc - DEPS ARCHIVE_START paddle_fluid ARCHIVE_END + DEPS paddle_fluid ARGS --dirname=${PYTHON_TESTS_DIR}/book/${TARGET_NAME}${arg}.inference.model) set_tests_properties(test_inference_${TARGET_NAME}${arg} PROPERTIES DEPENDS test_${TARGET_NAME}) From 1204d9f3d1b76de8d3fce594634134bcfb653c8e Mon Sep 17 00:00:00 2001 From: Dang Qingqing Date: Thu, 12 Apr 2018 13:12:05 +0800 Subject: [PATCH 18/29] Refine batch_norm_op. --- paddle/fluid/operators/batch_norm_op.cu.cc | 27 ++++++++++++---------- 1 file changed, 15 insertions(+), 12 deletions(-) diff --git a/paddle/fluid/operators/batch_norm_op.cu.cc b/paddle/fluid/operators/batch_norm_op.cu.cc index eecb58e11e..cb1927bc0f 100644 --- a/paddle/fluid/operators/batch_norm_op.cu.cc +++ b/paddle/fluid/operators/batch_norm_op.cu.cc @@ -114,23 +114,11 @@ class BatchNormKernel const auto *bias = ctx.Input("Bias"); auto *y = ctx.Output("Y"); - auto *mean_out = ctx.Output("MeanOut"); - auto *variance_out = ctx.Output("VarianceOut"); - auto *saved_mean = ctx.Output("SavedMean"); - auto *saved_variance = ctx.Output("SavedVariance"); // alloc memory y->mutable_data(ctx.GetPlace()); - mean_out->mutable_data>(ctx.GetPlace()); - variance_out->mutable_data>(ctx.GetPlace()); - saved_mean->mutable_data>(ctx.GetPlace()); - saved_variance->mutable_data>(ctx.GetPlace()); auto &dev_ctx = ctx.template device_context(); - math::SetConstant> - functor; - functor(dev_ctx, saved_mean, static_cast>(0)); - functor(dev_ctx, saved_variance, static_cast>(0)); auto handle = dev_ctx.cudnn_handle(); @@ -159,6 +147,21 @@ class BatchNormKernel // Run training mode. // obtain running mean and running inv var, and see if we need to // initialize them. + + auto *mean_out = ctx.Output("MeanOut"); + auto *variance_out = ctx.Output("VarianceOut"); + mean_out->mutable_data>(ctx.GetPlace()); + variance_out->mutable_data>(ctx.GetPlace()); + + auto *saved_mean = ctx.Output("SavedMean"); + auto *saved_variance = ctx.Output("SavedVariance"); + saved_mean->mutable_data>(ctx.GetPlace()); + saved_variance->mutable_data>(ctx.GetPlace()); + math::SetConstant> + functor; + functor(dev_ctx, saved_mean, static_cast>(0)); + functor(dev_ctx, saved_variance, static_cast>(0)); + double this_factor = 1. - momentum; CUDNN_ENFORCE(platform::dynload::cudnnBatchNormalizationForwardTraining( From ad73b331c757a0a0d795d9aa99a86b077f144357 Mon Sep 17 00:00:00 2001 From: Yu Yang Date: Thu, 12 Apr 2018 13:30:04 +0800 Subject: [PATCH 19/29] Eagerly drop local scope in iteration (#9838) * Eagerly drop local scope in iteration * Correct create var * Fix typo * Debug --- .../details/computation_op_handle.cc | 4 +- .../framework/details/fetch_op_handle.cc | 8 +++- .../fluid/framework/details/op_handle_base.h | 2 + .../framework/details/ssa_graph_executor.h | 4 +- .../details/threaded_ssa_graph_executor.cc | 30 ------------ .../details/threaded_ssa_graph_executor.h | 3 -- paddle/fluid/framework/parallel_executor.cc | 47 +++++++++++++++---- 7 files changed, 54 insertions(+), 44 deletions(-) diff --git a/paddle/fluid/framework/details/computation_op_handle.cc b/paddle/fluid/framework/details/computation_op_handle.cc index 7a1b40c0b6..e3f8bbb72f 100644 --- a/paddle/fluid/framework/details/computation_op_handle.cc +++ b/paddle/fluid/framework/details/computation_op_handle.cc @@ -14,6 +14,8 @@ #include "paddle/fluid/framework/details/computation_op_handle.h" +#include + namespace paddle { namespace framework { namespace details { @@ -33,7 +35,7 @@ void ComputationOpHandle::RunImpl() { } } - op_->Run(*scope_->FindVar("@TMP_SCOPE@")->Get(), place_); + op_->Run(*scope_->FindVar(kLocalExecScopeName)->Get(), place_); } std::string ComputationOpHandle::Name() const { return op_->Type(); } diff --git a/paddle/fluid/framework/details/fetch_op_handle.cc b/paddle/fluid/framework/details/fetch_op_handle.cc index 9180903b86..e3e7c55d15 100644 --- a/paddle/fluid/framework/details/fetch_op_handle.cc +++ b/paddle/fluid/framework/details/fetch_op_handle.cc @@ -14,6 +14,9 @@ #include "paddle/fluid/framework/details/fetch_op_handle.h" +#include +#include + namespace paddle { namespace framework { namespace details { @@ -57,7 +60,10 @@ void FetchOpHandle::RunImpl() { for (size_t i = 0; i < scopes.size(); ++i) { auto &scope = scopes[i]; - auto &t = scope->FindVar(var_name)->Get(); + auto &t = scope->FindVar(kLocalExecScopeName) + ->Get() + ->FindVar(var_name) + ->Get(); if (platform::is_gpu_place(var->place_)) { #ifdef PADDLE_WITH_CUDA TensorCopy(t, cpu, *dev_ctxes_[t.place()], &tensors_[i]); diff --git a/paddle/fluid/framework/details/op_handle_base.h b/paddle/fluid/framework/details/op_handle_base.h index d7a541ac4b..fbdb54ba8d 100644 --- a/paddle/fluid/framework/details/op_handle_base.h +++ b/paddle/fluid/framework/details/op_handle_base.h @@ -24,6 +24,8 @@ namespace paddle { namespace framework { namespace details { +constexpr char kLocalExecScopeName[] = "@LCOAL_SCOPE@"; + class OpHandleBase { private: DISABLE_COPY_AND_ASSIGN(OpHandleBase); diff --git a/paddle/fluid/framework/details/ssa_graph_executor.h b/paddle/fluid/framework/details/ssa_graph_executor.h index 3b818b1a45..a8833b7388 100644 --- a/paddle/fluid/framework/details/ssa_graph_executor.h +++ b/paddle/fluid/framework/details/ssa_graph_executor.h @@ -15,13 +15,15 @@ #pragma once #include +#include +#include + #include "paddle/fluid/framework/details/ssa_graph.h" #include "paddle/fluid/framework/feed_fetch_type.h" namespace paddle { namespace framework { namespace details { - class SSAGraphExecutor { DISABLE_COPY_AND_ASSIGN(SSAGraphExecutor); diff --git a/paddle/fluid/framework/details/threaded_ssa_graph_executor.cc b/paddle/fluid/framework/details/threaded_ssa_graph_executor.cc index 62af4c1d79..1ce69ab02b 100644 --- a/paddle/fluid/framework/details/threaded_ssa_graph_executor.cc +++ b/paddle/fluid/framework/details/threaded_ssa_graph_executor.cc @@ -136,12 +136,6 @@ FeedFetchList ThreadedSSAGraphExecutor::Run( ready_ops.clear(); }; - // Create local scopes. - for (auto &scope : local_scopes_) { - auto &local_scope = scope->NewScope(); - *scope->Var("@TMP_SCOPE@")->GetMutable() = &local_scope; - } - // Step 3. Execution while (!pending_vars.empty() || !ready_ops.empty() || !delayed_ops.empty()) { // 1. Run All Ready ops @@ -189,34 +183,10 @@ FeedFetchList ThreadedSSAGraphExecutor::Run( PADDLE_ENFORCE(ready_ops.empty()); PADDLE_ENFORCE(delayed_ops.empty()); PADDLE_ENFORCE(blocked_by_delayed_ops.empty()); - ++computation_count_; - - auto sync_computation = [&] { - computation_count_ = 0; - // Wait All computational streams - for (auto p : this->places_) { - platform::DeviceContextPool::Instance().Get(p)->Wait(); - } - for (auto &scope : local_scopes_) { - scope->DropKids(); - } - }; // Wait FetchOps. if (!fetch_ops.empty()) { fetch_ops.clear(); - sync_computation(); - } - - if (computation_count_ == max_async_computation) { - sync_computation(); - } - - // NOTE: the temp scope can be dropped lazily if needed. - // Drop tmp scopes; - for (auto &scope : local_scopes_) { - auto &kid = *scope->Var("@TMP_SCOPE@")->GetMutable(); - kid = nullptr; } return fetch_data; diff --git a/paddle/fluid/framework/details/threaded_ssa_graph_executor.h b/paddle/fluid/framework/details/threaded_ssa_graph_executor.h index 79cfc26b46..bb5e837b13 100644 --- a/paddle/fluid/framework/details/threaded_ssa_graph_executor.h +++ b/paddle/fluid/framework/details/threaded_ssa_graph_executor.h @@ -99,9 +99,6 @@ class ThreadedSSAGraphExecutor : public SSAGraphExecutor { std::unique_ptr exception_; std::atomic running_ops_; bool allow_op_delay_; - - size_t computation_count_{0}; - size_t max_async_computation{100}; }; } // namespace details diff --git a/paddle/fluid/framework/parallel_executor.cc b/paddle/fluid/framework/parallel_executor.cc index 20dcc080b6..c1486b527d 100644 --- a/paddle/fluid/framework/parallel_executor.cc +++ b/paddle/fluid/framework/parallel_executor.cc @@ -15,6 +15,7 @@ limitations under the License. */ #include "paddle/fluid/framework/parallel_executor.h" #include +#include #include #ifdef PADDLE_WITH_CUDA @@ -41,6 +42,8 @@ class ParallelExecutorPrivate { #ifdef PADDLE_WITH_CUDA std::unique_ptr nccl_ctxs_; #endif + + std::vector> var_types_; }; std::vector &ParallelExecutor::GetLocalScopes() { @@ -97,14 +100,9 @@ ParallelExecutor::ParallelExecutor( allow_op_delay)); // Step 3. Create vars in each scope; - for (auto *scope : member_->local_scopes_) { - for (auto *var : main_program.Block(0).AllVars()) { - if (scope->FindVar(var->Name()) != nullptr) { - continue; - } - - InitializeVariable(scope->Var(var->Name()), var->GetType()); - } + for (auto *var : main_program.Block(0).AllVars()) { + member_->var_types_.emplace_back(var->Name(), var->GetType(), + var->Persistable()); } } @@ -163,9 +161,42 @@ void ParallelExecutor::Run( const std::unordered_map &feed_tensors) { platform::RecordBlock b(0); SplitTensorToPlaces(feed_tensors); + + // Create local scopes. + for (auto &scope : member_->local_scopes_) { + Scope &local_scope = scope->NewScope(); + *scope->Var(details::kLocalExecScopeName)->GetMutable() = + &local_scope; + + for (auto &name_type_pair : member_->var_types_) { + if (scope->FindVar(std::get<0>(name_type_pair)) != nullptr) { + continue; + } + + if (std::get<2>(name_type_pair)) { // Persistable + InitializeVariable(scope->Var(std::get<0>(name_type_pair)), + std::get<1>(name_type_pair)); + } else { + InitializeVariable(scope->Var(std::get<0>(name_type_pair)), + std::get<1>(name_type_pair)); + } + } + } + auto fetch_data = member_->executor_->Run(fetch_tensors); *member_->global_scope_->Var(fetched_var_name)->GetMutable() = fetch_data; + + // Wait All computational streams + for (auto p : member_->places_) { + platform::DeviceContextPool::Instance().Get(p)->Wait(); + } + for (auto &scope : member_->local_scopes_) { + auto &local_scope = + *scope->Var(details::kLocalExecScopeName)->GetMutable(); + scope->DeleteScope(local_scope); + local_scope = nullptr; + } } void ParallelExecutor::SplitTensorToPlaces( From 339be6254ea5e3432e4cbe44f35609bb45662e12 Mon Sep 17 00:00:00 2001 From: Liu Yiqun Date: Thu, 12 Apr 2018 05:58:26 +0000 Subject: [PATCH 20/29] Refine the order of arguments. --- paddle/fluid/framework/executor.cc | 5 ++--- paddle/fluid/framework/executor.h | 4 ++-- paddle/fluid/inference/tests/test_helper.h | 6 +++--- 3 files changed, 7 insertions(+), 8 deletions(-) diff --git a/paddle/fluid/framework/executor.cc b/paddle/fluid/framework/executor.cc index 910012927b..34bba77f40 100644 --- a/paddle/fluid/framework/executor.cc +++ b/paddle/fluid/framework/executor.cc @@ -359,9 +359,8 @@ void Executor::RunPreparedContext(ExecutorPrepareContext* ctx, Scope* scope, void Executor::RunPreparedContext( ExecutorPrepareContext* ctx, Scope* scope, std::map& feed_targets, - std::map& fetch_targets, - const std::string& feed_holder_name, const std::string& fetch_holder_name, - bool create_vars) { + std::map& fetch_targets, bool create_vars, + const std::string& feed_holder_name, const std::string& fetch_holder_name) { auto& global_block = ctx->prog_.Block(ctx->block_id_); PADDLE_ENFORCE( diff --git a/paddle/fluid/framework/executor.h b/paddle/fluid/framework/executor.h index cbd70d9544..8b3ea01542 100644 --- a/paddle/fluid/framework/executor.h +++ b/paddle/fluid/framework/executor.h @@ -73,9 +73,9 @@ class Executor { void RunPreparedContext(ExecutorPrepareContext* ctx, Scope* scope, std::map& feed_targets, std::map& fetch_targets, + bool create_vars = true, const std::string& feed_holder_name = "feed", - const std::string& fetch_holder_name = "fetch", - bool create_vars = true); + const std::string& fetch_holder_name = "fetch"); private: const platform::Place place_; diff --git a/paddle/fluid/inference/tests/test_helper.h b/paddle/fluid/inference/tests/test_helper.h index 09fe344ec7..9875e43860 100644 --- a/paddle/fluid/inference/tests/test_helper.h +++ b/paddle/fluid/inference/tests/test_helper.h @@ -178,8 +178,8 @@ void TestInference(const std::string& dirname, std::unique_ptr ctx; if (PrepareContext) { ctx = executor.Prepare(*inference_program, 0); - executor.RunPreparedContext(ctx.get(), scope, feed_targets, - fetch_targets); + executor.RunPreparedContext(ctx.get(), scope, feed_targets, fetch_targets, + CreateVars); } else { executor.Run(*inference_program, scope, feed_targets, fetch_targets, CreateVars); @@ -198,7 +198,7 @@ void TestInference(const std::string& dirname, // Note: if you changed the inference_program, you need to call // executor.Prepare() again to get a new ExecutorPrepareContext. executor.RunPreparedContext(ctx.get(), scope, feed_targets, - fetch_targets); + fetch_targets, CreateVars); } else { executor.Run(*inference_program, scope, feed_targets, fetch_targets, CreateVars); From 26cfc634b9f4dc02b051b49f54e33b57938e5ff2 Mon Sep 17 00:00:00 2001 From: typhoonzero Date: Thu, 12 Apr 2018 14:48:26 +0800 Subject: [PATCH 21/29] multi stream thread pool --- paddle/fluid/framework/threadpool.cc | 10 +++++++--- paddle/fluid/framework/threadpool.h | 10 +++++----- paddle/fluid/operators/detail/grpc_server.cc | 2 +- .../paddle/fluid/tests/book/test_recognize_digits.py | 1 - 4 files changed, 13 insertions(+), 10 deletions(-) diff --git a/paddle/fluid/framework/threadpool.cc b/paddle/fluid/framework/threadpool.cc index 0a8377cc47..109c2c745c 100644 --- a/paddle/fluid/framework/threadpool.cc +++ b/paddle/fluid/framework/threadpool.cc @@ -14,8 +14,12 @@ #include "paddle/fluid/framework/threadpool.h" +#include "gflags/gflags.h" #include "paddle/fluid/platform/enforce.h" +DEFINE_int32(io_threadpool_size, 100, + "number of threads used for doing IO, default 100"); + namespace paddle { namespace framework { @@ -94,15 +98,15 @@ void ThreadPool::TaskLoop() { std::unique_ptr MultiStreamThreadPool::io_threadpool_(nullptr); std::once_flag MultiStreamThreadPool::io_init_flag_; -MultiStreamThreadPool* MultiStreamThreadPool::GetInstanceIO() { +ThreadPool* MultiStreamThreadPool::GetInstanceIO() { std::call_once(io_init_flag_, &MultiStreamThreadPool::InitIO); - return static_cast(io_threadpool_.get()); + return io_threadpool_.get(); } void MultiStreamThreadPool::InitIO() { if (io_threadpool_.get() == nullptr) { // TODO(typhoonzero1986): make this configurable - io_threadpool_.reset(new ThreadPool(100)); + io_threadpool_.reset(new ThreadPool(FLAGS_io_threadpool_size)); } } diff --git a/paddle/fluid/framework/threadpool.h b/paddle/fluid/framework/threadpool.h index 0a60488d9f..1cc058834c 100644 --- a/paddle/fluid/framework/threadpool.h +++ b/paddle/fluid/framework/threadpool.h @@ -14,12 +14,12 @@ limitations under the License. */ #pragma once -#include +#include // NOLINT #include -#include -#include +#include // NOLINT +#include // NOLINT #include -#include +#include // NOLINT #include #include "glog/logging.h" #include "paddle/fluid/platform/enforce.h" @@ -137,7 +137,7 @@ class ThreadPool { class MultiStreamThreadPool : ThreadPool { public: - static MultiStreamThreadPool* GetInstanceIO(); + static ThreadPool* GetInstanceIO(); static void InitIO(); private: diff --git a/paddle/fluid/operators/detail/grpc_server.cc b/paddle/fluid/operators/detail/grpc_server.cc index d5fc163bc2..36dad5dd43 100644 --- a/paddle/fluid/operators/detail/grpc_server.cc +++ b/paddle/fluid/operators/detail/grpc_server.cc @@ -216,10 +216,10 @@ void AsyncGRPCServer::RunSyncUpdate() { std::function prefetch_register = std::bind(&AsyncGRPCServer::TryToRegisterNewPrefetchOne, this); + // TODO(wuyi): Run these "HandleRequest" in thread pool t_send_.reset( new std::thread(std::bind(&AsyncGRPCServer::HandleRequest, this, cq_send_.get(), "cq_send", send_register))); - t_get_.reset( new std::thread(std::bind(&AsyncGRPCServer::HandleRequest, this, cq_get_.get(), "cq_get", get_register))); diff --git a/python/paddle/fluid/tests/book/test_recognize_digits.py b/python/paddle/fluid/tests/book/test_recognize_digits.py index e4997b4069..5ec6890c1b 100644 --- a/python/paddle/fluid/tests/book/test_recognize_digits.py +++ b/python/paddle/fluid/tests/book/test_recognize_digits.py @@ -157,7 +157,6 @@ def train(nn_type, for ip in pserver_ips.split(","): eplist.append(':'.join([ip, port])) pserver_endpoints = ",".join(eplist) # ip:port,ip:port... - pserver_endpoints = os.getenv("PSERVERS") trainers = int(os.getenv("TRAINERS")) current_endpoint = os.getenv("POD_IP") + ":" + port trainer_id = int(os.getenv("PADDLE_INIT_TRAINER_ID")) From 4c55a6022a0a758295177371fc67c6800658b286 Mon Sep 17 00:00:00 2001 From: Qiao Longfei Date: Thu, 12 Apr 2018 15:26:35 +0800 Subject: [PATCH 22/29] Dist transpiler support prefetch (#9714) * init * add some check * add dist transpile logic * add insert op for block * init change get_pserver_program * optimize code * fix a bug * can run now * start to do table split * start to process table gradient * complete pserver part * can send_vars now * revert cpplint * fix a bug * optimize code * move dist test to models * revert the interface of distribute_transpiler.transpile * fix prefetch_block * optimize trainspiler code * add comment to sum_op * add warning log * fix comment * fix test_send_recv * fix test_send_recv * fix train with no distributed table * optimize GetDims --- paddle/fluid/framework/block_desc.h | 2 +- paddle/fluid/framework/operator.cc | 13 +- paddle/fluid/operators/concat_op.cc | 6 +- paddle/fluid/operators/detail/grpc_server.cc | 1 + paddle/fluid/operators/listen_and_serv_op.cc | 45 ++- paddle/fluid/operators/listen_and_serv_op.h | 2 + paddle/fluid/operators/lookup_table_op.cc | 3 + paddle/fluid/operators/prefetch_op.cc | 8 +- paddle/fluid/operators/send_recv_op_test.cc | 26 +- paddle/fluid/operators/send_vars_op.cc | 4 +- paddle/fluid/operators/sgd_op.cc | 4 +- paddle/fluid/operators/split_ids_op.cc | 14 +- paddle/fluid/operators/split_ids_op.h | 70 ++-- paddle/fluid/operators/sum_op.cc | 7 +- python/paddle/fluid/distribute_transpiler.py | 343 +++++++++++++++++-- python/paddle/fluid/layers/nn.py | 8 +- 16 files changed, 450 insertions(+), 106 deletions(-) diff --git a/paddle/fluid/framework/block_desc.h b/paddle/fluid/framework/block_desc.h index 873969b2a8..eef19c4f09 100644 --- a/paddle/fluid/framework/block_desc.h +++ b/paddle/fluid/framework/block_desc.h @@ -92,7 +92,7 @@ class BlockDesc { /* * Remove Op and its input/output variables. - * Note that for either input or ouput variable, if it is also an input or + * Note that for either input or output variable, if it is also an input or * output variable of other ops, we should remain it. */ void RemoveOp(size_t s, size_t e); diff --git a/paddle/fluid/framework/operator.cc b/paddle/fluid/framework/operator.cc index a3b4a8c082..f97bd08274 100644 --- a/paddle/fluid/framework/operator.cc +++ b/paddle/fluid/framework/operator.cc @@ -46,7 +46,8 @@ proto::VarType::Type GetDataTypeOfVar(const Variable* var) { } } -static DDim GetDims(const Scope& scope, const std::string& name) { +static DDim GetDims(const Scope& scope, const std::string& name, + bool get_actual_dim = false) { Variable* var = scope.FindVar(name); if (var == nullptr) { return DDim({-1}); @@ -55,7 +56,11 @@ static DDim GetDims(const Scope& scope, const std::string& name) { if (var->IsType()) { return var->Get().dims(); } else if (var->IsType()) { - return var->Get().GetCompleteDims(); + if (get_actual_dim) { + return var->Get().value().dims(); + } else { + return var->Get().GetCompleteDims(); + } } else { return DDim({-1}); } @@ -129,7 +134,7 @@ std::string OperatorBase::DebugStringEx(const Scope* scope) const { for (size_t i = 0; i < input.second.size(); ++i) { ss << input.second[i]; if (scope) { - ss << "[" << GetDims(*scope, input.second[i]) << "]"; + ss << "[" << GetDims(*scope, input.second[i], true) << "]"; ss << "(" << GetLoD(*scope, input.second[i]) << ")"; } if (i != input.second.size() - 1) { @@ -149,7 +154,7 @@ std::string OperatorBase::DebugStringEx(const Scope* scope) const { for (size_t i = 0; i < output.second.size(); ++i) { ss << output.second[i]; if (scope) { - ss << "[" << GetDims(*scope, output.second[i]) << "]"; + ss << "[" << GetDims(*scope, output.second[i], true) << "]"; ss << "(" << GetLoD(*scope, output.second[i]) << ")"; } if (i != output.second.size() - 1) { diff --git a/paddle/fluid/operators/concat_op.cc b/paddle/fluid/operators/concat_op.cc index d65a7b3467..4a36b03cb6 100644 --- a/paddle/fluid/operators/concat_op.cc +++ b/paddle/fluid/operators/concat_op.cc @@ -13,6 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/operators/concat_op.h" + #include #include @@ -34,7 +35,10 @@ class ConcatOp : public framework::OperatorWithKernel { size_t axis = static_cast(ctx->Attrs().Get("axis")); const size_t n = ins.size(); - PADDLE_ENFORCE_GT(n, 1, "Input tensors count should > 1."); + PADDLE_ENFORCE_GT(n, 0, "Input tensors count should > 0."); + if (n == 1) { + VLOG(3) << "Warning: concat op have only one input, may waste memory"; + } auto out_dims = ins[0]; size_t in_zero_dims_size = out_dims.size(); diff --git a/paddle/fluid/operators/detail/grpc_server.cc b/paddle/fluid/operators/detail/grpc_server.cc index d5fc163bc2..0b582a08bc 100644 --- a/paddle/fluid/operators/detail/grpc_server.cc +++ b/paddle/fluid/operators/detail/grpc_server.cc @@ -161,6 +161,7 @@ class RequestPrefetch final : public RequestBase { ::grpc::ByteBuffer reply; std::string var_name = request_->OutVarname(); + VLOG(3) << "prefetch var " << var_name; auto var_desc = program_->Block(0).FindVar(var_name); framework::Scope* local_scope = &scope_->NewScope(); auto* var = local_scope->FindVar(var_name); diff --git a/paddle/fluid/operators/listen_and_serv_op.cc b/paddle/fluid/operators/listen_and_serv_op.cc index 9188f2d989..5d293665f0 100644 --- a/paddle/fluid/operators/listen_and_serv_op.cc +++ b/paddle/fluid/operators/listen_and_serv_op.cc @@ -13,7 +13,8 @@ See the License for the specific language governing permissions and limitations under the License. */ #include -#include +#include // NOLINT +#include #include "paddle/fluid/operators/listen_and_serv_op.h" @@ -88,8 +89,9 @@ void ListenAndServOp::RunImpl(const framework::Scope &scope, auto ins = Inputs("X"); auto fan_in = Attr("Fanin"); - auto *block = Attr(kOptimizeBlock); - auto *program = block->Program(); + auto *optimize_block = Attr(kOptimizeBlock); + auto *prefetch_block = Attr(kPrefetchBlock); + auto *program = optimize_block->Program(); size_t num_blocks = program->Size(); PADDLE_ENFORCE_GE(num_blocks, 2, "server program should have at least 2 blocks"); @@ -97,18 +99,25 @@ void ListenAndServOp::RunImpl(const framework::Scope &scope, framework::Executor executor(dev_place); std::vector block_list; for (size_t blkid = 1; blkid < num_blocks; ++blkid) { - block_list.push_back(blkid); + if (blkid != prefetch_block->ID()) { + block_list.push_back(blkid); + } } - auto prepared = executor.Prepare(*program, block_list); + auto optimize_prepared = executor.Prepare(*program, block_list); // Insert placeholder for block0 which holds current op itself. - prepared.insert(prepared.begin(), - std::shared_ptr(nullptr)); + optimize_prepared.insert( + optimize_prepared.begin(), + std::shared_ptr(nullptr)); rpc_service_->SetScope(&recv_scope); rpc_service_->SetDevCtx(&dev_ctx); // TODO(qiao) set proper fields for table lookup and update rpc_service_->SetExecutor(&executor); - rpc_service_->SetPrefetchBlkdId(0); + VLOG(3) << "prefetch block id is " << prefetch_block->ID(); + auto prefetch_prepared = executor.Prepare(*program, prefetch_block->ID()); + rpc_service_->SetPrefetchBlkdId(prefetch_block->ID()); + rpc_service_->SetPrefetchPreparedCtx(prefetch_prepared.get()); + prefetch_prepared.release(); rpc_service_->SetProgram(program); // start the server listening after all member initialized. server_thread_.reset(new std::thread(RunServer, rpc_service_)); @@ -166,16 +175,18 @@ void ListenAndServOp::RunImpl(const framework::Scope &scope, parallel_blkids.push_back(1); double ts = detail::GetTimestamp(); for (size_t blkid = 2; blkid < num_blocks; ++blkid) { - if (program->Block(blkid).Parent() != last_parent_blkid) { - ParallelExecuteBlocks(parallel_blkids, &executor, prepared, program, - &recv_scope); - parallel_blkids.clear(); - last_parent_blkid = program->Block(blkid).Parent(); + if (blkid != prefetch_block->ID()) { + if (program->Block(blkid).Parent() != last_parent_blkid) { + ParallelExecuteBlocks(parallel_blkids, &executor, optimize_prepared, + program, &recv_scope); + parallel_blkids.clear(); + last_parent_blkid = program->Block(blkid).Parent(); + } + parallel_blkids.push_back(blkid); } - parallel_blkids.push_back(blkid); } - ParallelExecuteBlocks(parallel_blkids, &executor, prepared, program, - &recv_scope); + ParallelExecuteBlocks(parallel_blkids, &executor, optimize_prepared, + program, &recv_scope); VLOG(2) << "run all blocks spent " << detail::GetTimestamp() - ts << "(ms)"; // Reset the received sparse variables, the sum operator would not @@ -211,6 +222,8 @@ from send_op and send back variables to recv_op. .AddCustomChecker([](const std::string &ip) { return !ip.empty(); }); AddAttr(kOptimizeBlock, "BlockID to run on server side."); + AddAttr(kPrefetchBlock, + "prefetch block to run on server side."); AddAttr("Fanin", "How many clients send to this server.") .SetDefault(1); } diff --git a/paddle/fluid/operators/listen_and_serv_op.h b/paddle/fluid/operators/listen_and_serv_op.h index 0da87afc96..759b2a462b 100644 --- a/paddle/fluid/operators/listen_and_serv_op.h +++ b/paddle/fluid/operators/listen_and_serv_op.h @@ -16,6 +16,7 @@ limitations under the License. */ #include #include +#include #include "paddle/fluid/framework/executor.h" #include "paddle/fluid/framework/lod_tensor.h" @@ -27,6 +28,7 @@ namespace paddle { namespace operators { constexpr char kOptimizeBlock[] = "OptimizeBlock"; +constexpr char kPrefetchBlock[] = "PrefetchBlock"; void RunServer(std::shared_ptr service); diff --git a/paddle/fluid/operators/lookup_table_op.cc b/paddle/fluid/operators/lookup_table_op.cc index bf33be3106..5e59bd1b17 100644 --- a/paddle/fluid/operators/lookup_table_op.cc +++ b/paddle/fluid/operators/lookup_table_op.cc @@ -78,6 +78,9 @@ class LookupTableOpMaker : public framework::OpProtoAndCheckerMaker { "(boolean, default false) " "Sparse update.") .SetDefault(false); + AddAttr("is_distributed", + "(boolean, default false) distributed lookup table.") + .SetDefault(false); AddAttr("padding_idx", "(int64, default -1) " "If the value is -1, it makes no effect to lookup. " diff --git a/paddle/fluid/operators/prefetch_op.cc b/paddle/fluid/operators/prefetch_op.cc index 09ab7da663..f9ae01ab5d 100644 --- a/paddle/fluid/operators/prefetch_op.cc +++ b/paddle/fluid/operators/prefetch_op.cc @@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include +#include // NOLINT #include #include "paddle/fluid/framework/data_type.h" @@ -50,8 +50,8 @@ class PrefetchOp : public framework::OperatorBase { for (size_t i = 0; i < ins.size(); i++) { if (NeedSend(scope, ins[i])) { - VLOG(3) << "sending " << ins[i] << " to " << epmap[i] << "to get " - << outs[i] << "back"; + VLOG(3) << "sending " << ins[i] << " to " << epmap[i] << " to get " + << outs[i] << " back"; rpc_client->AsyncPrefetchVariable(epmap[i], ctx, scope, ins[i], outs[i]); } else { @@ -71,7 +71,7 @@ class PrefetchOpMaker : public framework::OpProtoAndCheckerMaker { "(RPCClient) The RPC client object which will be" "initialized at most once."); AddOutput("Out", - "(SelectedRows) result " + "(LoDTensor) result " "to be fetched from parameter server") .AsDuplicable(); AddAttr>( diff --git a/paddle/fluid/operators/send_recv_op_test.cc b/paddle/fluid/operators/send_recv_op_test.cc index 542bc3fde2..3bf5d57809 100644 --- a/paddle/fluid/operators/send_recv_op_test.cc +++ b/paddle/fluid/operators/send_recv_op_test.cc @@ -14,7 +14,7 @@ limitations under the License. */ #include #include -#include +#include // NOLINT #include "gtest/gtest.h" #include "paddle/fluid/framework/op_registry.h" @@ -37,11 +37,11 @@ namespace m = paddle::operators::math; std::unique_ptr listen_and_serv_op; int selected_port; -void InitTensorsInScope(f::Scope &scope, p::CPUPlace &place) { +void InitTensorsInScope(const p::CPUPlace &place, f::Scope *scope) { p::CPUDeviceContext ctx(place); for (int i = 0; i < 2; ++i) { auto var_name = paddle::string::Sprintf("x%d", i); - auto var = scope.Var(var_name); + auto var = scope->Var(var_name); auto tensor = var->GetMutable(); tensor->Resize({10, 10}); float *expect = tensor->mutable_data(place); @@ -50,20 +50,20 @@ void InitTensorsInScope(f::Scope &scope, p::CPUPlace &place) { } } - auto out_var = scope.Var("Out"); + auto out_var = scope->Var("Out"); auto out_tensor = out_var->GetMutable(); out_tensor->Resize({10, 10}); out_tensor->mutable_data(place); // allocate } -void InitSelectedRowsInScope(f::Scope &scope, p::CPUPlace &place) { +void InitSelectedRowsInScope(const p::CPUPlace &place, f::Scope *scope) { p::CPUDeviceContext ctx(place); int64_t height = 10; int64_t row_numel = 10; m::SetConstant set_one; // init x0 std::vector rows0{0, 4, 7}; - auto x0_var = scope.Var("x0"); + auto x0_var = scope->Var("x0"); auto x0 = x0_var->GetMutable(); x0->set_rows(rows0); x0->set_height(height); @@ -74,7 +74,7 @@ void InitSelectedRowsInScope(f::Scope &scope, p::CPUPlace &place) { // init x1 std::vector rows1{2, 9}; - auto x1_var = scope.Var("x1"); + auto x1_var = scope->Var("x1"); auto x1 = x1_var->GetMutable(); x1->set_rows(rows1); x1->set_height(height); @@ -83,7 +83,7 @@ void InitSelectedRowsInScope(f::Scope &scope, p::CPUPlace &place) { f::make_ddim({static_cast(rows1.size()), row_numel}), place); set_one(ctx, x1_value, 1.0); - auto out_var = scope.Var("Out"); + auto out_var = scope->Var("Out"); auto out = out_var->GetMutable(); auto out_value = out->mutable_value(); out->set_height(height); @@ -117,15 +117,16 @@ void StartServerNet(bool is_sparse) { f::Scope scope; p::CPUPlace place; if (is_sparse) { - InitSelectedRowsInScope(scope, place); + InitSelectedRowsInScope(place, &scope); } else { - InitTensorsInScope(scope, place); + InitTensorsInScope(place, &scope); } // sub program run in listen_and_serv_op, for simple test we use sum f::ProgramDesc program; const auto &root_block = program.Block(0); auto *optimize_block = program.AppendBlock(root_block); + auto *prefetch_block = program.AppendBlock(root_block); // X for server side tensors, RX for received tensers, must be of same shape. AddOp("sum", {{"X", {"x0", "x1"}}}, {{"Out", {"Out"}}}, {}, optimize_block); @@ -135,6 +136,7 @@ void StartServerNet(bool is_sparse) { attrs.insert({"ParamList", std::vector({"Out"})}); attrs.insert({"GradList", std::vector({"x1"})}); attrs.insert({"OptimizeBlock", optimize_block}); + attrs.insert({"PrefetchBlock", prefetch_block}); listen_and_serv_op = f::OpRegistry::CreateOp("listen_and_serv", {{"X", {"x1"}}}, {}, attrs); LOG(INFO) << "selected port before run " << selected_port; @@ -148,7 +150,7 @@ TEST(SendRecvOp, CPUDense) { // local net f::Scope scope; p::CPUPlace place; - InitTensorsInScope(scope, place); + InitTensorsInScope(place, &scope); // create rpc client var scope.Var("RPC_CLIENT_VAR"); @@ -191,7 +193,7 @@ TEST(SendRecvOp, CPUSparse) { f::Scope scope; p::CPUPlace place; p::CPUDeviceContext ctx(place); - InitSelectedRowsInScope(scope, place); + InitSelectedRowsInScope(place, &scope); scope.Var("RPC_CLIENT_VAR"); f::AttributeMap attrs; selected_port = static_cast( diff --git a/paddle/fluid/operators/send_vars_op.cc b/paddle/fluid/operators/send_vars_op.cc index 2cbd9e2394..56b3713d6a 100644 --- a/paddle/fluid/operators/send_vars_op.cc +++ b/paddle/fluid/operators/send_vars_op.cc @@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include +#include // NOLINT #include #include "paddle/fluid/framework/data_type.h" @@ -36,7 +36,7 @@ class SendVarsOp : public framework::OperatorBase { auto ins = Inputs("X"); std::vector epmap = Attr>("epmap"); - int sync_send = Attr("sync_sent"); + int sync_send = Attr("sync_send"); platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance(); auto& ctx = *pool.Get(place); diff --git a/paddle/fluid/operators/sgd_op.cc b/paddle/fluid/operators/sgd_op.cc index 074fa9e00f..06cb0550ad 100644 --- a/paddle/fluid/operators/sgd_op.cc +++ b/paddle/fluid/operators/sgd_op.cc @@ -35,8 +35,8 @@ class SGDOp : public framework::OperatorWithKernel { PADDLE_ENFORCE_EQ(framework::product(lr_dims), 1, "Learning rate should have 1 element"); auto param_dim = ctx->GetInputDim("Param"); - // TODO(qijun): check dimensions of Param and Grad at complie - // and run time. + // TODO(qijun): check dimensions of Param and Grad at compile + // and runtime. ctx->SetOutputDim("ParamOut", param_dim); } diff --git a/paddle/fluid/operators/split_ids_op.cc b/paddle/fluid/operators/split_ids_op.cc index a54f8a2878..a53cbc8ac5 100644 --- a/paddle/fluid/operators/split_ids_op.cc +++ b/paddle/fluid/operators/split_ids_op.cc @@ -48,11 +48,11 @@ class SplitIdsOp : public framework::OperatorWithKernel { PADDLE_ENFORCE(ctx->HasOutputs("Out"), "SplitIdsOp must has output Out."); auto ids_var_type = ctx->GetInputsVarType("Ids").front(); - PADDLE_ENFORCE_EQ(ids_var_type, framework::proto::VarType::LOD_TENSOR); - auto ids_dims = ctx->GetInputDim("Ids"); - PADDLE_ENFORCE_EQ(ids_dims.size(), 2); - PADDLE_ENFORCE_EQ(ids_dims[1], 1); + if (ids_var_type == framework::proto::VarType::LOD_TENSOR) { + PADDLE_ENFORCE_EQ(ids_dims.size(), 2); + PADDLE_ENFORCE_EQ(ids_dims[1], 1); + } } }; @@ -60,8 +60,9 @@ class SplitIdsOpInferVarType : public framework::VarTypeInference { public: void operator()(const framework::OpDesc &op_desc, framework::BlockDesc *block) const override { + auto *input_var = block->Var(op_desc.Input("Ids")[0]); for (auto &out_var : op_desc.Output("Out")) { - block->Var(out_var)->SetType(framework::proto::VarType::LOD_TENSOR); + block->Var(out_var)->SetType(input_var->GetType()); } } }; @@ -73,4 +74,5 @@ namespace ops = paddle::operators; REGISTER_OPERATOR(split_ids, ops::SplitIdsOp, ops::SplitIdsOpMaker, ops::SplitIdsOpInferVarType); REGISTER_OP_CPU_KERNEL( - split_ids, ops::SplitIdsOpKernel); + split_ids, ops::SplitIdsOpKernel, + ops::SplitIdsOpKernel); diff --git a/paddle/fluid/operators/split_ids_op.h b/paddle/fluid/operators/split_ids_op.h index d36ed398eb..ba1e903dbb 100644 --- a/paddle/fluid/operators/split_ids_op.h +++ b/paddle/fluid/operators/split_ids_op.h @@ -24,35 +24,63 @@ namespace operators { template class SplitIdsOpKernel : public framework::OpKernel { public: - void Compute(const framework::ExecutionContext& ctx) const override { + void Compute(const framework::ExecutionContext &ctx) const override { auto place = ctx.GetPlace(); if (!platform::is_cpu_place(place)) { PADDLE_THROW("SplitIds do not support GPU kernel"); } - auto& ids_dims = ctx.Input("Ids")->dims(); - const T* ids = ctx.Input("Ids")->data(); - auto outs = ctx.MultiOutput("Out"); - const size_t shard_num = outs.size(); + const auto *ids_var = ctx.InputVar("Ids"); + if (ids_var->IsType()) { + const auto &ids_dims = ctx.Input("Ids")->dims(); + const T *ids = ctx.Input("Ids")->data(); + auto outs = ctx.MultiOutput("Out"); + const size_t shard_num = outs.size(); - std::vector> out_ids; - out_ids.resize(outs.size()); + std::vector> out_ids; + out_ids.resize(outs.size()); - // split id by their shard_num. - for (int i = 0; i < ids_dims[0]; ++i) { - T id = ids[i]; - size_t shard_id = static_cast(id) % shard_num; - out_ids[shard_id].push_back(id); - } + // split id by their shard_num. + for (int i = 0; i < ids_dims[0]; ++i) { + T id = ids[i]; + size_t shard_id = static_cast(id) % shard_num; + out_ids[shard_id].push_back(id); + } + + // create tensor for each shard and send to parameter server + for (size_t i = 0; i < out_ids.size(); ++i) { + auto *shard_t = outs[i]; + std::vector ids = out_ids[i]; + auto *shard_data = shard_t->mutable_data( + framework::make_ddim({static_cast(ids.size()), 1}), place); + for (size_t i = 0; i < ids.size(); ++i) { + shard_data[i] = ids[i]; + } + } + } else if (ids_var->IsType()) { + const auto *ids_selected_rows = ctx.Input("Ids"); + auto &ids_dims = ids_selected_rows->value().dims(); + PADDLE_ENFORCE_EQ(ids_dims[0], ids_selected_rows->rows().size(), ""); + const T *ids = ids_selected_rows->value().data(); + const auto &ids_rows = ids_selected_rows->rows(); + auto outs = ctx.MultiOutput("Out"); + const size_t shard_num = outs.size(); + // get rows for outputs + for (auto &id : ids_rows) { + size_t shard_id = static_cast(id) % shard_num; + outs[shard_id]->mutable_rows()->push_back(id); + } - // create tensor for each shard and send to parameter server - for (size_t i = 0; i < out_ids.size(); ++i) { - auto* shard_t = outs[i]; - std::vector ids = out_ids[i]; - auto* shard_data = shard_t->mutable_data( - framework::make_ddim({static_cast(ids.size()), 1}), place); - for (size_t i = 0; i < ids.size(); ++i) { - shard_data[i] = ids[i]; + int64_t row_width = ids_dims[1]; + for (auto &out : outs) { + out->set_height(ids_selected_rows->height()); + framework::DDim ddim = framework::make_ddim( + {static_cast(out->rows().size()), row_width}); + T *output = out->mutable_value()->mutable_data(ddim, place); + for (size_t i = 0; i < ddim[0]; ++i) { + memcpy(output + i * row_width, ids + out->rows()[i] * row_width, + row_width * sizeof(T)); + } } } } diff --git a/paddle/fluid/operators/sum_op.cc b/paddle/fluid/operators/sum_op.cc index 9061e137bd..108f26fafe 100644 --- a/paddle/fluid/operators/sum_op.cc +++ b/paddle/fluid/operators/sum_op.cc @@ -10,9 +10,11 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/operators/sum_op.h" + #include #include #include + #include "paddle/fluid/framework/var_type_inference.h" #include "paddle/fluid/operators/detail/safe_ref.h" @@ -37,7 +39,10 @@ class SumOp : public framework::OperatorWithKernel { auto x_dims = ctx->GetInputsDim("X"); size_t N = x_dims.size(); - PADDLE_ENFORCE_GT(N, 1, "Input tensors count should > 1."); + PADDLE_ENFORCE_GT(N, 0, "Input tensors count should > 0."); + if (N == 1) { + VLOG(3) << "Warning: sum have only one input, may waste memory"; + } framework::DDim in_dim({0}); for (auto& x_dim : x_dims) { diff --git a/python/paddle/fluid/distribute_transpiler.py b/python/paddle/fluid/distribute_transpiler.py index e18ace844e..b0522b49f4 100644 --- a/python/paddle/fluid/distribute_transpiler.py +++ b/python/paddle/fluid/distribute_transpiler.py @@ -13,14 +13,17 @@ # limitations under the License. from __future__ import print_function -import framework -from framework import Program, default_main_program, default_startup_program, Parameter, Variable -import optimizer -from layer_helper import LayerHelper -import distributed_splitter as splitter + import math + +import distributed_splitter as splitter +import framework +from framework import Program, default_main_program, Variable from . import core -import debuger + +LOOKUP_TABLE_TYPE = "lookup_table" +LOOKUP_TABLE_GRAD_TYPE = "lookup_table_grad" +RPC_CLIENT_VAR_NAME = "RPC_CLIENT_VAR" class VarBlock: @@ -35,9 +38,9 @@ class VarBlock: class UnionFind(object): - """ Union-find data struct. + """ Union-find data structure. - Union-find is a data struct that keeps track of a set of elements partitioned + Union-find is a data structure that keeps track of a set of elements partitioned into a number of disjoint (non-overlapping) subsets. Reference: @@ -185,19 +188,66 @@ class DistributeTranspiler: assert (callable(split_method)) if program is None: program = default_main_program() - self.program = program - self.trainers = trainers + self.origin_program = program + self.trainer_num = trainers self.optimize_ops = optimize_ops # TODO(typhoonzero): currently trainer_id is fetched from cluster system # like Kubernetes, we should port this to use etcd later when developing # fluid distributed training with fault-tolerance. self.trainer_id = trainer_id pserver_endpoints = pservers.split(",") + self.pserver_endpoints = pserver_endpoints + + # process lookup_table_op + # 1. check all lookup_table_op is distributed + # 2. check all lookup_table_op share the same table. + distributed_lookup_table_ops = [] + # support only one distributed_lookup_table now + self.table_name = None + for op in program.global_block().ops: + if op.type == LOOKUP_TABLE_TYPE: + if op.attrs['is_distributed'] is True: + if self.table_name is None: + self.table_name = op.input("W")[0] + if self.table_name != op.input("W")[0]: + raise RuntimeError("all distributed lookup_table_ops" + " should have only one table") + distributed_lookup_table_ops.append(op) + else: + if self.table_name is not None: + assert op.input("W")[0] != self.table_name + + self.has_distributed_lookup_table = len( + distributed_lookup_table_ops) > 0 # step1: For large parameters and gradients, split them into smaller # blocks. param_list = [pg[0] for pg in params_grads] grad_list = [pg[1] for pg in params_grads] + + if self.has_distributed_lookup_table: + param_list = [ + param for param in param_list if param.name != self.table_name + ] + grad_list = [ + grad for grad in grad_list + if grad.name != framework.grad_var_name(self.table_name) + ] + self.table_param_grad = [ + param_grad for param_grad in params_grads + if param_grad[0].name == self.table_name + ][0] + table_grad_var = self.table_param_grad[1] + self.table_grad_list = [ + program.global_block().create_var( + name="%s.trainer_%d.pserver_%d" % + (table_grad_var.name, trainer_id, index), + type=table_grad_var.type, + shape=table_grad_var.shape, + dtype=table_grad_var.dtype) + for index in range(len(self.pserver_endpoints)) + ] + grad_blocks = split_dense_variable(grad_list, len(pserver_endpoints)) param_blocks = split_dense_variable(param_list, len(pserver_endpoints)) # step2: Create new vars for the parameters and gradients blocks and @@ -229,7 +279,7 @@ class DistributeTranspiler: self.param_grad_ep_mapping[ep]["grads"].append(grad) rpc_client_var = program.global_block().create_var( - name="RPC_CLIENT_VAR", + name=RPC_CLIENT_VAR_NAME, persistable=True, type=core.VarDesc.VarType.RAW) @@ -252,13 +302,19 @@ class DistributeTranspiler: outputs={"Out": [orig_param]}, attrs={"axis": 0}) + if self.has_distributed_lookup_table: + self._replace_lookup_table_op_with_prefetch(program, rpc_client_var, + eplist) + self._split_table_grad_and_add_send_vars(program, rpc_client_var, + pserver_endpoints) + def get_trainer_program(self): # remove optimize ops and add a send op to main_program - self.program.global_block().delete_ops(self.optimize_ops) - self.program.sync_with_cpp() + self.origin_program.global_block().delete_ops(self.optimize_ops) + self.origin_program.sync_with_cpp() # FIXME(typhoonzero): serialize once will fix error occurs when clone. - self.program.__str__() - return self.program + self.origin_program.__str__() + return self.origin_program def get_pserver_program(self, endpoint): """ @@ -294,8 +350,8 @@ class DistributeTranspiler: type=v.type, dtype=v.dtype, shape=v.shape) - if self.trainers > 1: - for trainer_id in xrange(self.trainers): + if self.trainer_num > 1: + for trainer_id in xrange(self.trainer_num): var = pserver_program.global_block().create_var( name="%s.trainer_%d" % (orig_var_name, trainer_id), persistable=False, @@ -309,7 +365,7 @@ class DistributeTranspiler: # step3 optimize_block = pserver_program.create_block(0) # step 4 - # Create a union-find data struct from optimize ops, + # Create a union-find data structure from optimize ops, # If two ops are connected, we could add these two ops # into one set. ufind = self._create_ufind(self.optimize_ops) @@ -384,6 +440,23 @@ class DistributeTranspiler: # __append_optimize_op__(glb_op, optimize_block) # break + # process distributed lookup_table + prefetch_block = None + if self.has_distributed_lookup_table: + pserver_index = self.pserver_endpoints.index(endpoint) + self._create_table_optimize_block(pserver_index, pserver_program, + append_block) + prefetch_block = self._create_prefetch_block( + pserver_index, pserver_program, optimize_block) + + # NOTE: if has_distributed_lookup_table is False, then prefetch_block will + # not be executed, so it's safe to use optimize_block to hold the place + if self.has_distributed_lookup_table: + assert prefetch_block is not None + else: + assert prefetch_block is None + prefetch_block = pserver_program.global_block() + # step5 append the listen_and_serv op pserver_program.global_block().append_op( type="listen_and_serv", @@ -392,8 +465,10 @@ class DistributeTranspiler: attrs={ "OptimizeBlock": optimize_block, "endpoint": endpoint, - "Fanin": self.trainers + "Fanin": self.trainer_num, + "PrefetchBlock": prefetch_block }) + pserver_program.sync_with_cpp() return pserver_program @@ -451,6 +526,197 @@ class DistributeTranspiler: attrs=op.attrs) return s_prog + # transpiler function for dis lookup_table + def _replace_lookup_table_op_with_prefetch(self, program, rpc_client_var, + eplist): + # 1. replace lookup_table_op with split_ids_op -> prefetch_op -> sum_op + self.prefetch_input_vars = None + self.prefetch_output_vars = None + + continue_search_lookup_table_op = True + while continue_search_lookup_table_op: + continue_search_lookup_table_op = False + all_ops = program.global_block().ops + for op in all_ops: + if op.type == LOOKUP_TABLE_TYPE: + continue_search_lookup_table_op = True + + op_index = list(all_ops).index(op) + ids_name = op.input("Ids") + out_name = op.output("Out") + + if self.prefetch_input_vars is None: + ids_var = program.global_block().vars[ids_name[0]] + self.prefetch_input_vars = self.create_splited_vars( + source_var=ids_var, + block=program.global_block(), + tag="_prefetch_in_") + if self.prefetch_output_vars is None: + out_var = program.global_block().vars[out_name[0]] + self.prefetch_output_vars = self.create_splited_vars( + source_var=out_var, + block=program.global_block(), + tag="_prefetch_out_") + + # insert split_ids_op + program.global_block().insert_op( + index=op_index, + type="split_ids", + inputs={ + 'Ids': [ + program.global_block().vars[varname] + for varname in ids_name + ] + }, + outputs={"Out": self.prefetch_input_vars}) + + # insert prefetch_op + program.global_block().insert_op( + index=op_index + 1, + type="prefetch", + inputs={'X': self.prefetch_input_vars}, + outputs={ + "Out": self.prefetch_output_vars, + "RPCClient": rpc_client_var + }, + attrs={"epmap": eplist}) + + # insert concat_op + program.global_block().insert_op( + index=op_index + 2, + type="concat", + inputs={'X': self.prefetch_output_vars}, + outputs={ + "Out": [ + program.global_block().vars[varname] + for varname in out_name + ] + }, + attrs={"axis": 0}) + + # delete lookup_table_op + program.global_block().delete_ops([op]) + program.sync_with_cpp() + # break for loop + break + + def _split_table_grad_and_add_send_vars(self, program, rpc_client_var, + pserver_endpoints): + # 2. add split_ids_op and send_vars_op to send gradient to pservers + # there should only be one table_name + all_ops = program.global_block().ops + table_grad_name = framework.grad_var_name(self.table_name) + for op in all_ops: + if table_grad_name in op.output_arg_names: + op_index = list(all_ops).index(op) + # insert split_ids_op + program.global_block().insert_op( + index=op_index + 1, + type="split_ids", + inputs={ + 'Ids': [program.global_block().vars[table_grad_name]] + }, + outputs={"Out": self.table_grad_list}) + program.global_block().insert_op( + index=op_index + 2, + type="send_vars", + inputs={'X': self.table_grad_list}, + outputs={"RPCClient": rpc_client_var}, + attrs={"sync_send": True, + "epmap": pserver_endpoints}) + break + + def _create_prefetch_block(self, pserver_index, pserver_program, + optimize_block): + # STEP: create prefetch block + table_var = pserver_program.global_block().vars[self.table_name] + prefetch_block = pserver_program.create_block(optimize_block.idx) + trainer_ids = self.prefetch_input_vars[pserver_index] + pserver_ids = pserver_program.global_block().create_var( + name=trainer_ids.name, + type=trainer_ids.type, + shape=trainer_ids.shape, + dtype=trainer_ids.dtype) + trainer_out = self.prefetch_output_vars[pserver_index] + pserver_out = pserver_program.global_block().create_var( + name=trainer_out.name, + type=trainer_out.type, + shape=trainer_out.shape, + dtype=trainer_out.dtype) + prefetch_block.append_op( + type=LOOKUP_TABLE_TYPE, + inputs={'Ids': pserver_ids, + "W": table_var}, + outputs={"Out": pserver_out}, + attrs={ + "is_sparse": True, # has no effect on lookup_table op + "is_distributed": True, + "padding_idx": -1 + }) + return prefetch_block + + def _create_table_optimize_block(self, pserver_index, pserver_program, + append_block): + def _clone_var(block, var, persistable=True): + assert isinstance(var, Variable) + return block.create_var( + name=var.name, + shape=var.shape, + dtype=var.dtype, + type=var.type, + persistable=persistable) + + # STEP: create table optimize block + # create table param and grad var in pserver program + param_var = _clone_var( + pserver_program.global_block(), + self.origin_program.global_block().vars[self.table_name]) + grad_var = _clone_var( + pserver_program.global_block(), + self.origin_program.global_block().vars[framework.grad_var_name( + self.table_name)], + persistable=False) + + # create grad vars in pserver program + table_grad_var = self.table_param_grad[1] + table_grad_list = [ + pserver_program.global_block().create_var( + name="%s.trainer_%d.pserver_%d" % + (table_grad_var.name, index, pserver_index), + type=table_grad_var.type, + shape=table_grad_var.shape, + dtype=table_grad_var.dtype) for index in range(self.trainer_num) + ] + + # create table optimize block in pserver program + table_opt_op = [ + op for op in self.optimize_ops + if op.input("Param")[0] == self.table_name + ][0] + table_opt_block = pserver_program.create_block(append_block.idx) + # only support sgd now + assert table_opt_op.type == "sgd" + + # append sum op for table_grad_list + table_opt_block.append_op( + type="sum", + inputs={"X": table_grad_list}, + outputs={"Out": [grad_var]}) + + lr_var = pserver_program.global_block().vars[table_opt_op.input( + "LearningRate")[0]] + inputs = { + "Param": [param_var], + "Grad": [grad_var], + "LearningRate": [lr_var] + } + outputs = {"ParamOut": [param_var]} + table_opt_block.append_op( + type=table_opt_op.type, + inputs=inputs, + outputs=outputs, + attrs=table_opt_op.attrs) + # ====================== private transpiler functions ===================== def _create_vars_from_blocklist(self, program, @@ -512,7 +778,17 @@ class DistributeTranspiler: program.global_block().sync_with_cpp() return var_mapping - def _clone_var(self, block, var): + def create_splited_vars(self, source_var, block, tag): + return [ + block.create_var( + name=str(source_var.name + tag + str(index)), + type=source_var.type, + shape=source_var.shape, + dtype=source_var.dtype) + for index in range(len(self.pserver_endpoints)) + ] + + def _clone_var(self, block, var, persistable=True): assert isinstance(var, Variable) return block.create_var( name=var.name, @@ -520,12 +796,12 @@ class DistributeTranspiler: dtype=var.dtype, type=var.type, lod_level=var.lod_level, - persistable=True) + persistable=persistable) def _append_split_op(self, program, gradblocks): # Split variables that need to be split and append respective ops add_suffix = False - if self.trainers > 1: + if self.trainer_num > 1: add_suffix = True var_mapping = self._create_vars_from_blocklist( program, gradblocks, add_trainer_suffix=add_suffix) @@ -616,9 +892,9 @@ class DistributeTranspiler: return merged_var = \ pserver_block.vars[self._orig_varname(grad_block.name)] - if self.trainers > 1: + if self.trainer_num > 1: vars2merge = [] - for i in xrange(self.trainers): + for i in xrange(self.trainer_num): per_trainer_name = "%s.trainer_%d" % \ (self._orig_varname(grad_block.name), i) vars2merge.append(pserver_block.vars[per_trainer_name]) @@ -633,7 +909,7 @@ class DistributeTranspiler: type="scale", inputs={"X": merged_var}, outputs={"Out": merged_var}, - attrs={"scale": 1.0 / float(self.trainers)}) + attrs={"scale": 1.0 / float(self.trainer_num)}) new_inputs[key] = merged_var elif key == "Param": # param is already created on global program @@ -669,7 +945,7 @@ class DistributeTranspiler: new_shape = None if key in ["Param", "Grad", "LearningRate"]: continue - var = self.program.global_block().vars[opt_op.input(key)[0]] + var = self.origin_program.global_block().vars[opt_op.input(key)[0]] # update accumulator variable shape param_shape = new_inputs["Param"].shape new_shape = self._get_optimizer_input_shape(opt_op.type, key, @@ -682,8 +958,8 @@ class DistributeTranspiler: new_inputs[key] = tmpvar # change output's ParamOut variable - outputs = self._get_output_map_from_op(self.program.global_block().vars, - opt_op) + outputs = self._get_output_map_from_op( + self.origin_program.global_block().vars, opt_op) outputs["ParamOut"] = new_inputs["Param"] optimize_block.append_op( @@ -695,8 +971,8 @@ class DistributeTranspiler: def _append_pserver_non_opt_ops(self, optimize_block, opt_op): program = optimize_block.program # Append the ops for parameters that do not need to be optimized/updated - inputs = self._get_input_map_from_op(self.program.global_block().vars, - opt_op) + inputs = self._get_input_map_from_op( + self.origin_program.global_block().vars, opt_op) for varlist in inputs.itervalues(): if not isinstance(varlist, list): varlist = [varlist] @@ -709,8 +985,8 @@ class DistributeTranspiler: dtype=var.dtype, shape=var.shape) - outputs = self._get_output_map_from_op(self.program.global_block().vars, - opt_op) + outputs = self._get_output_map_from_op( + self.origin_program.global_block().vars, opt_op) for varlist in outputs.itervalues(): if not isinstance(varlist, list): @@ -783,7 +1059,6 @@ class DistributeTranspiler: if same_or_split_var(n, param) and n != param: return True return False - return False def _get_input_map_from_op(self, varmap, op): """Returns a dict from op input name to the vars in varmap.""" @@ -821,7 +1096,7 @@ class DistributeTranspiler: find_ops = [] # find ops which output is lr var - block = self.program.global_block() + block = self.origin_program.global_block() for op in block.ops: if set(op.output_arg_names) & lr_vars: find_ops.append(op) diff --git a/python/paddle/fluid/layers/nn.py b/python/paddle/fluid/layers/nn.py index 7ca4ed9a7b..5c2c2dd7ab 100644 --- a/python/paddle/fluid/layers/nn.py +++ b/python/paddle/fluid/layers/nn.py @@ -218,6 +218,7 @@ def fc(input, def embedding(input, size, is_sparse=False, + is_distributed=False, padding_idx=None, param_attr=None, dtype='float32'): @@ -268,8 +269,11 @@ def embedding(input, inputs={'Ids': input, 'W': w}, outputs={'Out': tmp}, - attrs={'is_sparse': is_sparse, - 'padding_idx': padding_idx}) + attrs={ + 'is_sparse': is_sparse, + 'is_distributed': is_distributed, + 'padding_idx': padding_idx + }) return tmp From 449bdde58accc9beb94d56c8ef33c0bde4c007b7 Mon Sep 17 00:00:00 2001 From: Liu Yiqun Date: Thu, 12 Apr 2018 06:15:24 +0000 Subject: [PATCH 23/29] Correct some typos. --- cmake/cblas.cmake | 2 +- paddle/fluid/framework/executor.cc | 19 +++++++++++-------- paddle/fluid/framework/executor.h | 3 +++ paddle/fluid/inference/io.cc | 2 +- paddle/fluid/inference/tests/test_helper.h | 2 +- 5 files changed, 17 insertions(+), 11 deletions(-) diff --git a/cmake/cblas.cmake b/cmake/cblas.cmake index 52a22c1fbf..e3b9d94215 100644 --- a/cmake/cblas.cmake +++ b/cmake/cblas.cmake @@ -78,7 +78,7 @@ if(NOT CMAKE_CROSSCOMPILING) /usr/lib/reference/ ) else() - # Diable the finding of reference cblas under host's system path + # Disable the finding of reference cblas under host's system path set(REFERENCE_CBLAS_INCLUDE_SEARCH_PATHS ${REFERENCE_CBLAS_ROOT}/include) set(REFERENCE_CBLAS_LIB_SEARCH_PATHS ${REFERENCE_CBLAS_ROOT}/lib) endif() diff --git a/paddle/fluid/framework/executor.cc b/paddle/fluid/framework/executor.cc index 34bba77f40..513e720fd0 100644 --- a/paddle/fluid/framework/executor.cc +++ b/paddle/fluid/framework/executor.cc @@ -83,8 +83,8 @@ static void CheckTensorNANOrInf(const std::string& name, if (tensor.memory_size() == 0) { return; } - if (tensor.type().hash_code() != typeid(float).hash_code() && - tensor.type().hash_code() != typeid(double).hash_code()) { + if (tensor.type().hash_code() != typeid(float).hash_code() && // NOLINT + tensor.type().hash_code() != typeid(double).hash_code()) { // NOLINT return; } PADDLE_ENFORCE(!framework::TensorContainsInf(tensor), @@ -145,12 +145,13 @@ void Executor::Run(const ProgramDesc& pdesc, Scope* scope, int block_id, // Return true if the block has feed operators and holder of matching info. static bool has_feed_operators( const BlockDesc& block, - std::map& feed_targets, + const std::map& feed_targets, const std::string& feed_holder_name) { size_t feed_count = 0; for (auto* op : block.AllOps()) { if (op->Type() == kFeedOpType) { feed_count++; + // The input variable's name of feed_op should be feed_holder_name. PADDLE_ENFORCE_EQ(op->Input("X")[0], feed_holder_name, "Input to feed op should be '%s'", feed_holder_name); std::string feed_target_name = op->Output("Out")[0]; @@ -167,7 +168,7 @@ static bool has_feed_operators( "The number of feed operators should match 'feed_targets'"); if (!feed_holder_name.empty()) { - // When feed operator are present, so should be feed_holder + // When feed operator are present, so should be feed_holder. auto var = block.FindVar(feed_holder_name); PADDLE_ENFORCE_NOT_NULL(var, "Block should already have a '%s' variable", feed_holder_name); @@ -187,12 +188,14 @@ static bool has_feed_operators( // and fetch_holder_name. Raise exception when any mismatch is found. // Return true if the block has fetch operators and holder of matching info. static bool has_fetch_operators( - const BlockDesc& block, std::map& fetch_targets, + const BlockDesc& block, + const std::map& fetch_targets, const std::string& fetch_holder_name) { size_t fetch_count = 0; for (auto* op : block.AllOps()) { if (op->Type() == kFetchOpType) { fetch_count++; + // The output variable's name of fetch_op should be fetch_holder_name. PADDLE_ENFORCE_EQ(op->Output("Out")[0], fetch_holder_name, "Output of fetch op should be '%s'", fetch_holder_name); std::string fetch_target_name = op->Input("X")[0]; @@ -209,7 +212,7 @@ static bool has_fetch_operators( "The number of fetch operators should match 'fetch_targets'"); if (!fetch_holder_name.empty()) { - // When fetch operator are present, so should be fetch_holder + // When fetch operator are present, so should be fetch_holder. auto var = block.FindVar(fetch_holder_name); PADDLE_ENFORCE_NOT_NULL(var, "Block should already have a '%s' variable", fetch_holder_name); @@ -287,8 +290,8 @@ void Executor::Run(const ProgramDesc& program, Scope* scope, } auto ctx = Prepare(*copy_program, 0); - RunPreparedContext(ctx.get(), scope, feed_targets, fetch_targets, - feed_holder_name, fetch_holder_name, create_vars); + RunPreparedContext(ctx.get(), scope, feed_targets, fetch_targets, create_vars, + feed_holder_name, fetch_holder_name); } std::unique_ptr Executor::Prepare( diff --git a/paddle/fluid/framework/executor.h b/paddle/fluid/framework/executor.h index 8b3ea01542..43defdacf2 100644 --- a/paddle/fluid/framework/executor.h +++ b/paddle/fluid/framework/executor.h @@ -14,6 +14,9 @@ limitations under the License. */ #pragma once +#include +#include +#include #include "paddle/fluid/framework/op_info.h" #include "paddle/fluid/framework/program_desc.h" #include "paddle/fluid/framework/scope.h" diff --git a/paddle/fluid/inference/io.cc b/paddle/fluid/inference/io.cc index a29d457b6f..3b58019db6 100644 --- a/paddle/fluid/inference/io.cc +++ b/paddle/fluid/inference/io.cc @@ -23,7 +23,7 @@ limitations under the License. */ namespace paddle { namespace inference { -// Temporarilly add this function for exposing framework::InitDevices() when +// Temporarily add this function for exposing framework::InitDevices() when // linking the inference shared library. void Init(bool init_p2p) { framework::InitDevices(init_p2p); } diff --git a/paddle/fluid/inference/tests/test_helper.h b/paddle/fluid/inference/tests/test_helper.h index 9875e43860..c3a8d0889c 100644 --- a/paddle/fluid/inference/tests/test_helper.h +++ b/paddle/fluid/inference/tests/test_helper.h @@ -195,7 +195,7 @@ void TestInference(const std::string& dirname, paddle::platform::DeviceContextPool::Instance().Get(place)); if (PrepareContext) { - // Note: if you changed the inference_program, you need to call + // Note: if you change the inference_program, you need to call // executor.Prepare() again to get a new ExecutorPrepareContext. executor.RunPreparedContext(ctx.get(), scope, feed_targets, fetch_targets, CreateVars); From d24b5e060f738139feab99b1c4a97042bce1982f Mon Sep 17 00:00:00 2001 From: mozga-intel Date: Thu, 12 Apr 2018 14:33:38 +0200 Subject: [PATCH 24/29] The fully connected: the operator is removed when the MKLDNN flag is OFF --- paddle/fluid/operators/CMakeLists.txt | 8 ++++++++ python/paddle/fluid/tests/unittests/CMakeLists.txt | 6 ++++++ 2 files changed, 14 insertions(+) diff --git a/paddle/fluid/operators/CMakeLists.txt b/paddle/fluid/operators/CMakeLists.txt index 3c8696b508..7d6781c2c3 100644 --- a/paddle/fluid/operators/CMakeLists.txt +++ b/paddle/fluid/operators/CMakeLists.txt @@ -245,9 +245,17 @@ op_library(channel_send_op DEPS concurrency) op_library(channel_recv_op DEPS concurrency) list(REMOVE_ITEM GENERAL_OPS ${DEPS_OPS}) + +# The fully connected layer is deleted when the WITH_MKLDNN flag is OFF +# Because the fully connected layer has only one MKLDNN's operator +if(NOT WITH_MKLDNN) + list(REMOVE_ITEM GENERAL_OPS fc_op) +endif(NOT WITH_MKLDNN) + foreach(src ${GENERAL_OPS}) op_library(${src}) endforeach() + file(APPEND ${pybind_file} "USE_OP(less_than);\nUSE_OP(logical_and);\nUSE_NO_KERNEL_OP(read_from_array);\n") add_subdirectory(reader) diff --git a/python/paddle/fluid/tests/unittests/CMakeLists.txt b/python/paddle/fluid/tests/unittests/CMakeLists.txt index f10ef9b634..3bd24c98a2 100644 --- a/python/paddle/fluid/tests/unittests/CMakeLists.txt +++ b/python/paddle/fluid/tests/unittests/CMakeLists.txt @@ -1,6 +1,12 @@ file(GLOB TEST_OPS RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}" "test_*.py") string(REPLACE ".py" "" TEST_OPS "${TEST_OPS}") +# The fully connected test is removed whe the WITH_MKLDNN flag is OFF +# Because the fully connected layer has only one kernel (MKLDNN) +if(NOT WITH_MKLDNN) + list(REMOVE_ITEM TEST_OPS test_fc_op) +endif(NOT WITH_MKLDNN) + if(NOT WITH_DISTRIBUTE) list(REMOVE_ITEM TEST_OPS test_recv_op) endif(NOT WITH_DISTRIBUTE) From 617e790a596ccd3f2eb940fcfe76803c01ee6cc8 Mon Sep 17 00:00:00 2001 From: Kexin Zhao Date: Thu, 12 Apr 2018 11:48:17 -0700 Subject: [PATCH 25/29] fix cuda 7.5 compile error (#9885) --- paddle/fluid/operators/math/math_function.cu | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/paddle/fluid/operators/math/math_function.cu b/paddle/fluid/operators/math/math_function.cu index e53183603f..c28047e6e9 100644 --- a/paddle/fluid/operators/math/math_function.cu +++ b/paddle/fluid/operators/math/math_function.cu @@ -288,9 +288,14 @@ void batched_gemm( // TODO(kexinzhao): add processing code for compute capability < 53 case PADDLE_ENFORCE_GE(context.GetComputeCapability(), 53, "cublas Hgemm requires GPU compute capability >= 53"); + +#if CUDA_VERSION >= 8000 PADDLE_ENFORCE(platform::dynload::cublasHgemmStridedBatched( context.cublas_handle(), cuTransB, cuTransA, N, M, K, &h_alpha, h_B, ldb, strideB, h_A, lda, strideA, &h_beta, h_C, ldc, strideC, batchCount)); +#else + PADDLE_ENFORCE(false, "HgemmStridedBatched is not supported on cuda <= 7.5"); +#endif } template <> @@ -310,9 +315,13 @@ void batched_gemm( (transB == CblasNoTrans) ? CUBLAS_OP_N : CUBLAS_OP_T; const int strideC = M * N; +#if CUDA_VERSION >= 8000 PADDLE_ENFORCE(platform::dynload::cublasSgemmStridedBatched( context.cublas_handle(), cuTransB, cuTransA, N, M, K, &alpha, B, ldb, strideB, A, lda, strideA, &beta, C, ldc, strideC, batchCount)); +#else + PADDLE_ENFORCE(false, "SgemmStridedBatched is not supported on cuda <= 7.5"); +#endif } template <> @@ -332,9 +341,13 @@ void batched_gemm( (transB == CblasNoTrans) ? CUBLAS_OP_N : CUBLAS_OP_T; const int strideC = M * N; +#if CUDA_VERSION >= 8000 PADDLE_ENFORCE(platform::dynload::cublasDgemmStridedBatched( context.cublas_handle(), cuTransB, cuTransA, N, M, K, &alpha, B, ldb, strideB, A, lda, strideA, &beta, C, ldc, strideC, batchCount)); +#else + PADDLE_ENFORCE(false, "DgemmStridedBatched is not supported on cuda <= 7.5"); +#endif } template <> From 59234b7287980ef0fec0a064f524e6c25697b7c7 Mon Sep 17 00:00:00 2001 From: redrayqll Date: Fri, 13 Apr 2018 03:25:44 +0800 Subject: [PATCH 26/29] =?UTF-8?q?modify=20=E2=80=9Cif-then-else=E2=80=9D?= =?UTF-8?q?=20md=20path=20(#9876)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- doc/fluid/design/motivation/fluid.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/fluid/design/motivation/fluid.md b/doc/fluid/design/motivation/fluid.md index 5e147f8263..4b7696cc1b 100644 --- a/doc/fluid/design/motivation/fluid.md +++ b/doc/fluid/design/motivation/fluid.md @@ -119,7 +119,7 @@ An actual Fluid example is described [here](https://github.com/PaddlePaddle/Pad From the example, the Fluid programs look very similar to their PyTorch equivalent programs, except that Fluid's loop structure, wrapped with Python's `with` statement, could run much faster than just a Python loop. -We have more examples of the [`if-then-else`](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/if_else_op.md) structure of Fluid. +We have more examples of the [`if-then-else`](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/fluid/design/execution/if_else_op.md) structure of Fluid. ## Turing Completeness From 3794027d7fbb4d6636534c78452aad589db66361 Mon Sep 17 00:00:00 2001 From: Abhinav Arora Date: Thu, 12 Apr 2018 15:45:07 -0700 Subject: [PATCH 27/29] Fix warnings in sgd_op.h --- paddle/fluid/operators/sgd_op.h | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/paddle/fluid/operators/sgd_op.h b/paddle/fluid/operators/sgd_op.h index 8d2bdf7590..cfc8793e1e 100644 --- a/paddle/fluid/operators/sgd_op.h +++ b/paddle/fluid/operators/sgd_op.h @@ -65,7 +65,8 @@ class SGDOpKernel : public framework::OpKernel { auto &grad_rows = grad->rows(); size_t grad_row_numel = grad_value.numel() / grad_rows.size(); - PADDLE_ENFORCE_EQ(grad_row_numel, param_out->numel() / grad_height); + PADDLE_ENFORCE_EQ(static_cast(grad_row_numel), + param_out->numel() / grad_height); auto *grad_data = grad_value.data(); auto *out_data = param_out->data(); @@ -73,7 +74,7 @@ class SGDOpKernel : public framework::OpKernel { for (size_t i = 0; i < grad_rows.size(); i++) { PADDLE_ENFORCE(grad_rows[i] < grad_height, "Input rows index should less than height"); - for (int64_t j = 0; j < grad_row_numel; j++) { + for (size_t j = 0; j < grad_row_numel; j++) { out_data[grad_rows[i] * grad_row_numel + j] -= lr[0] * grad_data[i * grad_row_numel + j]; } @@ -107,7 +108,7 @@ class SGDOpKernel : public framework::OpKernel { PADDLE_ENFORCE(grad.rows()[i] < grad.height(), "Input rows index should less than height"); int64_t id_index = param.index(grad.rows()[i]); - for (int64_t j = 0; j < grad_row_width; j++) { + for (size_t j = 0; j < grad_row_width; j++) { out_data[id_index * grad_row_width + j] -= lr[0] * grad_data[i * grad_row_width + j]; } From c241959e489053259274edb2614381d7058463a4 Mon Sep 17 00:00:00 2001 From: Abhinav Arora Date: Thu, 12 Apr 2018 16:45:40 -0700 Subject: [PATCH 28/29] Fix CPPLint errors in operators (#9828) * Fix CPPLint errors in operators * Fix prior box op * Fix Prior Box op * Fix top_k_op.cu * Fix pool mkmldnn * Fix pool mkmldnn --- paddle/fluid/operators/pad_op.h | 2 + paddle/fluid/operators/pool_mkldnn_op.cc | 12 ++- paddle/fluid/operators/pool_op.h | 2 + paddle/fluid/operators/pool_with_index_op.h | 1 + paddle/fluid/operators/prelu_op.cc | 1 - paddle/fluid/operators/prior_box_op.cc | 2 +- paddle/fluid/operators/prior_box_op.cu | 2 +- paddle/fluid/operators/prior_box_op.h | 18 +++-- paddle/fluid/operators/rank_loss_op.cc | 1 + paddle/fluid/operators/recv_op.cc | 2 +- paddle/fluid/operators/roi_pool_op.h | 2 + paddle/fluid/operators/strided_memcpy.h | 4 +- paddle/fluid/operators/top_k_op.cu | 83 +++++++++++---------- 13 files changed, 73 insertions(+), 59 deletions(-) diff --git a/paddle/fluid/operators/pad_op.h b/paddle/fluid/operators/pad_op.h index a36abe3789..c93c096575 100644 --- a/paddle/fluid/operators/pad_op.h +++ b/paddle/fluid/operators/pad_op.h @@ -14,6 +14,8 @@ limitations under the License. */ #pragma once +#include +#include #include "paddle/fluid/framework/eigen.h" #include "paddle/fluid/framework/op_registry.h" diff --git a/paddle/fluid/operators/pool_mkldnn_op.cc b/paddle/fluid/operators/pool_mkldnn_op.cc index c88578570c..63eaaedcd5 100644 --- a/paddle/fluid/operators/pool_mkldnn_op.cc +++ b/paddle/fluid/operators/pool_mkldnn_op.cc @@ -83,9 +83,11 @@ class PoolMKLDNNOpKernel : public paddle::framework::OpKernel { dev_ctx.SetBlob(key_pool_workspace_memory, workspace_memory); auto src_memory = - mkldnn::memory({src_md, mkldnn_engine}, (void*)input_data); + mkldnn::memory({src_md, mkldnn_engine}, + static_cast(const_cast(input_data))); auto dst_memory = - mkldnn::memory({dst_md, mkldnn_engine}, (void*)output_data); + mkldnn::memory({dst_md, mkldnn_engine}, + static_cast(const_cast(output_data))); auto pool_prim = mkldnn::pooling_forward(*pool_pd, src_memory, dst_memory, *workspace_memory); @@ -195,9 +197,11 @@ class PoolMKLDNNGradOpKernel : public paddle::framework::OpKernel { pool_bwd_desc, mkldnn_engine, *pool_pd); auto diff_src_memory = - mkldnn::memory({diff_src_md, mkldnn_engine}, (void*)in_x_grad_data); + mkldnn::memory({diff_src_md, mkldnn_engine}, + static_cast(const_cast(in_x_grad_data))); auto diff_dst_memory = - mkldnn::memory({diff_dst_md, mkldnn_engine}, (void*)out_grad_data); + mkldnn::memory({diff_dst_md, mkldnn_engine}, + static_cast(const_cast(out_grad_data))); auto bwd_prim = mkldnn::pooling_backward( pool_bwd_pd, diff_dst_memory, *workspace_memory, diff_src_memory); diff --git a/paddle/fluid/operators/pool_op.h b/paddle/fluid/operators/pool_op.h index 2fec50ef25..a48127ea69 100644 --- a/paddle/fluid/operators/pool_op.h +++ b/paddle/fluid/operators/pool_op.h @@ -14,6 +14,8 @@ limitations under the License. */ #pragma once +#include +#include #include "paddle/fluid/framework/eigen.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/operators/math/math_function.h" diff --git a/paddle/fluid/operators/pool_with_index_op.h b/paddle/fluid/operators/pool_with_index_op.h index 83e7bd138a..b55fa76eae 100644 --- a/paddle/fluid/operators/pool_with_index_op.h +++ b/paddle/fluid/operators/pool_with_index_op.h @@ -14,6 +14,7 @@ limitations under the License. */ #pragma once +#include #include "paddle/fluid/framework/eigen.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/operators/math/math_function.h" diff --git a/paddle/fluid/operators/prelu_op.cc b/paddle/fluid/operators/prelu_op.cc index 7fb45bd19d..8eaa12a4a6 100644 --- a/paddle/fluid/operators/prelu_op.cc +++ b/paddle/fluid/operators/prelu_op.cc @@ -13,7 +13,6 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/operators/prelu_op.h" - #include namespace paddle { diff --git a/paddle/fluid/operators/prior_box_op.cc b/paddle/fluid/operators/prior_box_op.cc index 82e54139c8..058b13eeb8 100644 --- a/paddle/fluid/operators/prior_box_op.cc +++ b/paddle/fluid/operators/prior_box_op.cc @@ -45,7 +45,7 @@ class PriorBoxOp : public framework::OperatorWithKernel { bool flip = ctx->Attrs().Get("flip"); std::vector aspect_ratios_vec; - ExpandAspectRatios(aspect_ratios, flip, aspect_ratios_vec); + ExpandAspectRatios(aspect_ratios, flip, &aspect_ratios_vec); size_t num_priors = aspect_ratios_vec.size() * min_sizes.size(); if (max_sizes.size() > 0) { diff --git a/paddle/fluid/operators/prior_box_op.cu b/paddle/fluid/operators/prior_box_op.cu index 76bf2b3b7d..0ea8909296 100644 --- a/paddle/fluid/operators/prior_box_op.cu +++ b/paddle/fluid/operators/prior_box_op.cu @@ -96,7 +96,7 @@ class PriorBoxOpCUDAKernel : public framework::OpKernel { auto clip = ctx.Attr("clip"); std::vector aspect_ratios; - ExpandAspectRatios(input_aspect_ratio, flip, aspect_ratios); + ExpandAspectRatios(input_aspect_ratio, flip, &aspect_ratios); T step_w = static_cast(ctx.Attr("step_w")); T step_h = static_cast(ctx.Attr("step_h")); diff --git a/paddle/fluid/operators/prior_box_op.h b/paddle/fluid/operators/prior_box_op.h index 1e4a12aac1..1c62fd8d2c 100644 --- a/paddle/fluid/operators/prior_box_op.h +++ b/paddle/fluid/operators/prior_box_op.h @@ -13,6 +13,8 @@ See the License for the specific language governing permissions and limitations under the License. */ #pragma once +#include +#include #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/operators/math/math_function.h" #include "paddle/fluid/platform/transform.h" @@ -22,23 +24,23 @@ namespace operators { inline void ExpandAspectRatios(const std::vector& input_aspect_ratior, bool flip, - std::vector& output_aspect_ratior) { + std::vector* output_aspect_ratior) { constexpr float epsilon = 1e-6; - output_aspect_ratior.clear(); - output_aspect_ratior.push_back(1.0f); + output_aspect_ratior->clear(); + output_aspect_ratior->push_back(1.0f); for (size_t i = 0; i < input_aspect_ratior.size(); ++i) { float ar = input_aspect_ratior[i]; bool already_exist = false; - for (size_t j = 0; j < output_aspect_ratior.size(); ++j) { - if (fabs(ar - output_aspect_ratior[j]) < epsilon) { + for (size_t j = 0; j < output_aspect_ratior->size(); ++j) { + if (fabs(ar - output_aspect_ratior->at(j)) < epsilon) { already_exist = true; break; } } if (!already_exist) { - output_aspect_ratior.push_back(ar); + output_aspect_ratior->push_back(ar); if (flip) { - output_aspect_ratior.push_back(1.0f / ar); + output_aspect_ratior->push_back(1.0f / ar); } } } @@ -68,7 +70,7 @@ class PriorBoxOpKernel : public framework::OpKernel { auto clip = ctx.Attr("clip"); std::vector aspect_ratios; - ExpandAspectRatios(input_aspect_ratio, flip, aspect_ratios); + ExpandAspectRatios(input_aspect_ratio, flip, &aspect_ratios); T step_w = static_cast(ctx.Attr("step_w")); T step_h = static_cast(ctx.Attr("step_h")); diff --git a/paddle/fluid/operators/rank_loss_op.cc b/paddle/fluid/operators/rank_loss_op.cc index 767eef5686..a1127f11a7 100644 --- a/paddle/fluid/operators/rank_loss_op.cc +++ b/paddle/fluid/operators/rank_loss_op.cc @@ -13,6 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/operators/rank_loss_op.h" +#include namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/recv_op.cc b/paddle/fluid/operators/recv_op.cc index 083c1fae5e..a4dcf704a6 100644 --- a/paddle/fluid/operators/recv_op.cc +++ b/paddle/fluid/operators/recv_op.cc @@ -12,6 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ +#include // NOLINT #include #include "paddle/fluid/framework/data_type.h" @@ -19,7 +20,6 @@ limitations under the License. */ #include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/framework/op_registry.h" -#include #include "paddle/fluid/operators/detail/grpc_client.h" namespace paddle { diff --git a/paddle/fluid/operators/roi_pool_op.h b/paddle/fluid/operators/roi_pool_op.h index f38c5a3c0c..54e0749031 100644 --- a/paddle/fluid/operators/roi_pool_op.h +++ b/paddle/fluid/operators/roi_pool_op.h @@ -13,6 +13,8 @@ See the License for the specific language governing permissions and limitations under the License. */ #pragma once +#include +#include #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/operators/math/math_function.h" diff --git a/paddle/fluid/operators/strided_memcpy.h b/paddle/fluid/operators/strided_memcpy.h index 22c1db82e9..7a10218e15 100644 --- a/paddle/fluid/operators/strided_memcpy.h +++ b/paddle/fluid/operators/strided_memcpy.h @@ -37,8 +37,8 @@ inline void StridedMemcpy(const platform::DeviceContext& dev_ctx, const T* src, const framework::DDim& src_stride, const framework::DDim& dst_dim, const framework::DDim& dst_stride, T* dst) { - using namespace detail; - StridedCopyDimVisitor func(dev_ctx, src, src_stride, dst_stride, dst); + paddle::operators::detail::StridedCopyDimVisitor func( + dev_ctx, src, src_stride, dst_stride, dst); boost::apply_visitor(func, dst_dim); } diff --git a/paddle/fluid/operators/top_k_op.cu b/paddle/fluid/operators/top_k_op.cu index bfd26c2f22..d7f4d383ce 100644 --- a/paddle/fluid/operators/top_k_op.cu +++ b/paddle/fluid/operators/top_k_op.cu @@ -13,6 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/operators/top_k_op.h" #include "paddle/fluid/platform/assert.h" namespace paddle { @@ -133,71 +134,71 @@ __device__ __forceinline__ void GetTopK(Pair topk[], const T* val, int* col, } template -__device__ __forceinline__ void ThreadGetTopK(Pair topk[], int& beam, +__device__ __forceinline__ void ThreadGetTopK(Pair topk[], int* beam, int beam_size, const T* src, - bool& firstStep, bool& is_empty, - Pair& max, int dim, + bool* firstStep, bool* is_empty, + Pair* max, int dim, const int tid) { - if (beam > 0) { - int length = beam < beam_size ? beam : beam_size; - if (firstStep) { - firstStep = false; + if (*beam > 0) { + int length = (*beam) < beam_size ? *beam : beam_size; + if (*firstStep) { + *firstStep = false; GetTopK(topk, src, tid, dim, length); } else { for (int k = 0; k < MaxLength; k++) { - if (k < MaxLength - beam) { - topk[k] = topk[k + beam]; + if (k < MaxLength - (*beam)) { + topk[k] = topk[k + *beam]; } else { topk[k].set(-INFINITY, -1); } } - if (!is_empty) { - GetTopK(topk + MaxLength - beam, src, tid, dim, max, + if (!(*is_empty)) { + GetTopK(topk + MaxLength - *beam, src, tid, dim, *max, length); } } - max = topk[MaxLength - 1]; - if (max.v == -1) is_empty = true; - beam = 0; + *max = topk[MaxLength - 1]; + if ((*max).v == -1) *is_empty = true; + *beam = 0; } } template -__device__ __forceinline__ void ThreadGetTopK(Pair topk[], int& beam, +__device__ __forceinline__ void ThreadGetTopK(Pair topk[], int* beam, int beam_size, const T* val, - int* col, bool& firstStep, - bool& is_empty, Pair& max, + int* col, bool* firstStep, + bool* is_empty, Pair* max, int dim, const int tid) { - if (beam > 0) { - int length = beam < beam_size ? beam : beam_size; - if (firstStep) { - firstStep = false; + if (*beam > 0) { + int length = (*beam) < beam_size ? *beam : beam_size; + if (*firstStep) { + *firstStep = false; GetTopK(topk, val, col, tid, dim, length); } else { for (int k = 0; k < MaxLength; k++) { - if (k < MaxLength - beam) { - topk[k] = topk[k + beam]; + if (k < MaxLength - *beam) { + topk[k] = topk[k + *beam]; } else { topk[k].set(-INFINITY, -1); } } - if (!is_empty) { - GetTopK(topk + MaxLength - beam, val, col, tid, dim, max, + if (!(*is_empty)) { + GetTopK(topk + MaxLength - *beam, val, col, tid, dim, max, length); } } - max = topk[MaxLength - 1]; - if (max.v == -1) is_empty = true; - beam = 0; + *max = topk[MaxLength - 1]; + if ((*max).v == -1) *is_empty = true; + *beam = 0; } } template __device__ __forceinline__ void BlockReduce(Pair* sh_topk, int* maxid, Pair topk[], T** topVal, - int64_t** topIds, int& beam, int& k, + int64_t** topIds, int* beam, int* k, const int tid, const int warp) { while (true) { __syncthreads(); @@ -225,17 +226,17 @@ __device__ __forceinline__ void BlockReduce(Pair* sh_topk, int* maxid, (*topVal)++; (*topIds)++; } - if (tid == maxid[0]) beam++; - if (--k == 0) break; + if (tid == maxid[0]) (*beam)++; + if (--(*k) == 0) break; __syncthreads(); if (tid == maxid[0]) { - if (beam < MaxLength) { - sh_topk[tid] = topk[beam]; + if (*beam < MaxLength) { + sh_topk[tid] = topk[*beam]; } } if (maxid[0] / 32 == warp) { - if (__shfl(beam, (maxid[0]) % 32, 32) == MaxLength) break; + if (__shfl(*beam, (maxid[0]) % 32, 32) == MaxLength) break; } } } @@ -268,13 +269,13 @@ __global__ void KeMatrixTopK(T* output, int output_stride, int64_t* indices, topk[k].set(-INFINITY, -1); } while (k) { - ThreadGetTopK(topk, beam, k, - src + blockIdx.x * lds, firststep, - is_empty, max, dim, tid); + ThreadGetTopK(topk, &beam, k, + src + blockIdx.x * lds, &firststep, + &is_empty, &max, dim, tid); sh_topk[tid] = topk[0]; BlockReduce(sh_topk, maxid, topk, &output, - &indices, beam, k, tid, warp); + &indices, &beam, &k, tid, warp); } } @@ -308,9 +309,9 @@ class TopkOpCUDAKernel : public framework::OpKernel { KeMatrixTopK<<< grid, threads, 0, reinterpret_cast( ctx.device_context()) - .stream()>>>(output_data, output->dims()[1], - indices_data, input_data, - input_width, input_width, int(k)); + .stream()>>>( + output_data, output->dims()[1], indices_data, input_data, input_width, + input_width, static_cast(k)); } }; From a08bf76f74cbdd4db4a773a4557b4ad6551ce679 Mon Sep 17 00:00:00 2001 From: typhoonzero Date: Fri, 13 Apr 2018 13:52:39 +0800 Subject: [PATCH 29/29] refine name --- paddle/fluid/framework/threadpool.cc | 10 +++++----- paddle/fluid/framework/threadpool.h | 4 ++-- 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/paddle/fluid/framework/threadpool.cc b/paddle/fluid/framework/threadpool.cc index 109c2c745c..f26f212d4d 100644 --- a/paddle/fluid/framework/threadpool.cc +++ b/paddle/fluid/framework/threadpool.cc @@ -95,15 +95,15 @@ void ThreadPool::TaskLoop() { } } -std::unique_ptr MultiStreamThreadPool::io_threadpool_(nullptr); -std::once_flag MultiStreamThreadPool::io_init_flag_; +std::unique_ptr ThreadPoolIO::io_threadpool_(nullptr); +std::once_flag ThreadPoolIO::io_init_flag_; -ThreadPool* MultiStreamThreadPool::GetInstanceIO() { - std::call_once(io_init_flag_, &MultiStreamThreadPool::InitIO); +ThreadPool* ThreadPoolIO::GetInstanceIO() { + std::call_once(io_init_flag_, &ThreadPoolIO::InitIO); return io_threadpool_.get(); } -void MultiStreamThreadPool::InitIO() { +void ThreadPoolIO::InitIO() { if (io_threadpool_.get() == nullptr) { // TODO(typhoonzero1986): make this configurable io_threadpool_.reset(new ThreadPool(FLAGS_io_threadpool_size)); diff --git a/paddle/fluid/framework/threadpool.h b/paddle/fluid/framework/threadpool.h index 1cc058834c..94111ee335 100644 --- a/paddle/fluid/framework/threadpool.h +++ b/paddle/fluid/framework/threadpool.h @@ -135,7 +135,7 @@ class ThreadPool { std::condition_variable completed_; }; -class MultiStreamThreadPool : ThreadPool { +class ThreadPoolIO : ThreadPool { public: static ThreadPool* GetInstanceIO(); static void InitIO(); @@ -156,7 +156,7 @@ std::future Async(Callback callback) { template std::future AsyncIO(Callback callback) { - return MultiStreamThreadPool::GetInstanceIO()->Run(callback); + return ThreadPoolIO::GetInstanceIO()->Run(callback); } } // namespace framework