From 76d8b14bceeb7f2292b617bb19c33dbcfd6dc8f6 Mon Sep 17 00:00:00 2001
From: Xin Pan
Date: Thu, 3 May 2018 05:30:50 -0700
Subject: [PATCH 01/13] Add timeline support for distributed training

---
 benchmark/cluster/vgg16/vgg16_fluid.py        | 28 ++++--
 cmake/external/grpc.cmake                     |  2 +-
 paddle/fluid/operators/detail/send_recv.proto |  4 +
 .../operators/detail/sendrecvop_utils.cc      |  8 ++
 .../operators/detail/variable_response.cc     | 22 ++++-
 paddle/fluid/operators/listen_and_serv_op.cc  |  8 +-
 paddle/fluid/platform/profiler.cc             | 35 ++++++--
 paddle/fluid/platform/profiler.h              |  8 ++
 tools/timeline.py                             | 90 +++++++++++--------
 9 files changed, 149 insertions(+), 56 deletions(-)

diff --git a/benchmark/cluster/vgg16/vgg16_fluid.py b/benchmark/cluster/vgg16/vgg16_fluid.py
index 6c7d2c1036..05b5f3977c 100644
--- a/benchmark/cluster/vgg16/vgg16_fluid.py
+++ b/benchmark/cluster/vgg16/vgg16_fluid.py
@@ -80,6 +80,8 @@ parser.add_argument(
     type=str,
     default="",
     help="Comma-separated list of hostname:port pairs")
+parser.add_argument(
+    "--profile", action='store_true', help="If set, profile a few steps.")

 # Flags for defining the tf.train.Server
 parser.add_argument(
@@ -183,8 +185,8 @@ def main():
             start_time = time.time()
             num_samples = 0
             train_pass_acc.reset()
-            for batch_id, data in enumerate(train_reader()):
-                ts = time.time()
+
+            def run_step(batch_id, data):
                 img_data = np.array(
                     map(lambda x: x[0].reshape(data_shape), data)).astype(
                         "float32")
@@ -196,14 +198,28 @@ def main():
                     feed={"pixel": img_data,
                           "label": y_data},
                     fetch_list=[avg_cost, batch_acc, batch_size])
+                return loss, acc, b_size
+
+            if args.profile and args.task_index == 0:
+                # warmup.
+                for batch_id, data in enumerate(train_reader()):
+                    if batch_id > 5: break
+                    run_step(batch_id, data)
+                with profiler.profiler('All', 'total', '/tmp/profile_vgg'):
+                    for batch_id, data in enumerate(train_reader()):
+                        if batch_id > 5: break
+                        run_step(batch_id, data)
+
+            for batch_id, data in enumerate(train_reader()):
+                ts = time.time()
+                loss, acc, b_size = run_step(batch_id, data)
                 iters += 1
                 num_samples += len(data)
                 train_pass_acc.add(value=acc, weight=b_size)
                 print(
-                    "Task:%d Pass = %d, Iters = %d, Loss = %f, Accuracy = %f, "
-                    "Speed = %.2f img/s " % (args.task_index, pass_id, iters,
-                                             loss, acc,
-                                             len(data) / (time.time() - ts))
+                    "Pass = %d, Iters = %d, Loss = %f, Accuracy = %f, "
+                    "Speed = %.2f img/s" % (pass_id, iters, loss, acc,
+                                            len(data) / (time.time() - ts))
                 )  # The accuracy is the accumulation of batches, but not the current batch.

             pass_elapsed = time.time() - start_time
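The --profile branch above is the standard warmup-then-measure pattern: run a few batches first so one-time costs (memory pool growth, kernel autotuning) stay out of the trace, then profile the same number of batches. Distilled to a standalone sketch, with train_reader and run_step standing in for the functions defined in this script:

    import paddle.fluid.profiler as profiler

    def run_batches(reader, step_fn, limit=5):
        # Feed at most `limit` batches through `step_fn`.
        for batch_id, data in enumerate(reader()):
            if batch_id > limit:
                break
            step_fn(batch_id, data)

    run_batches(train_reader, run_step)  # warmup, not recorded
    with profiler.profiler('All', 'total', '/tmp/profile_vgg'):
        run_batches(train_reader, run_step)  # recorded; dumped on exit

Only the trainer with task_index 0 profiles, which keeps exactly one trainer broadcasting its profiling state to the PS processes (see the sendrecvop change below).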
diff --git a/cmake/external/grpc.cmake b/cmake/external/grpc.cmake
index e90948782b..ef520b1287 100644
--- a/cmake/external/grpc.cmake
+++ b/cmake/external/grpc.cmake
@@ -33,7 +33,7 @@ ExternalProject_Add(
     extern_grpc
     DEPENDS protobuf zlib
     GIT_REPOSITORY "https://github.com/grpc/grpc.git"
-    GIT_TAG "v1.10.x"
+    GIT_TAG "v1.8.x"
     PREFIX ${GRPC_SOURCES_DIR}
     UPDATE_COMMAND ""
     CONFIGURE_COMMAND ""
diff --git a/paddle/fluid/operators/detail/send_recv.proto b/paddle/fluid/operators/detail/send_recv.proto
index 02bb2b9ceb..fffa9ae7a4 100644
--- a/paddle/fluid/operators/detail/send_recv.proto
+++ b/paddle/fluid/operators/detail/send_recv.proto
@@ -69,6 +69,10 @@ message VariableMessage {
   bytes rows = 9;
   // Look up table block execution output variable name.
   string out_varname = 10;
+  // If true, the ps server starts profiling. It stops profiling and
+  // generates a profile to /tmp/profile_ps_* when the flag switches
+  // from true to false.
+  bool profile = 11;
 }

 message VoidMessage {}
diff --git a/paddle/fluid/operators/detail/sendrecvop_utils.cc b/paddle/fluid/operators/detail/sendrecvop_utils.cc
index 766bcf1ac5..d68cf467f7 100644
--- a/paddle/fluid/operators/detail/sendrecvop_utils.cc
+++ b/paddle/fluid/operators/detail/sendrecvop_utils.cc
@@ -23,6 +23,7 @@ limitations under the License. */
 #include "paddle/fluid/operators/detail/bytebuffer_stream.h"
 #include "paddle/fluid/operators/detail/proto_encoder_helper.h"
 #include "paddle/fluid/operators/detail/variable_response.h"
+#include "paddle/fluid/platform/profiler.h"

 namespace paddle {
 namespace operators {
@@ -45,6 +46,13 @@ void SerializeToByteBuffer(const std::string& name, framework::Variable* var,
   void* payload = nullptr;
   size_t payload_size;
   ProtoEncodeHelper e(static_cast<char*>(buf), 1024);
+  // Note: normally the profiler is enabled in 1 trainer, hence only
+  // 1 trainer returns true for ShouldSendProfileState(). It tells PS
+  // servers the trainer's profiling state so that PS can follow the
+  // trainer.
+  if (platform::ShouldSendProfileState()) {
+    e.WriteBool(VarMsg::kProfileFieldNumber, platform::IsProfileEnabled());
+  }
   e.WriteString(VarMsg::kVarnameFieldNumber, name);
   if (var->IsType<framework::LoDTensor>()) {
     e.WriteUint64(VarMsg::kTypeFieldNumber, 0);
diff --git a/paddle/fluid/operators/detail/variable_response.cc b/paddle/fluid/operators/detail/variable_response.cc
index fbef8d02a4..335491e95d 100644
--- a/paddle/fluid/operators/detail/variable_response.cc
+++ b/paddle/fluid/operators/detail/variable_response.cc
@@ -17,6 +17,7 @@
 #include <string>
 #include <utility>
 #include <vector>
+#include "paddle/fluid/platform/profiler.h"

 #include "paddle/fluid/operators/detail/send_recv.pb.h"
 #include "paddle/fluid/operators/detail/sendrecvop_utils.h"
@@ -427,7 +428,26 @@ int VariableResponse::Parse(Source* source) {
         meta_.set_out_varname(temp);
         break;
       }
-
+      case sendrecv::VariableMessage::kProfileFieldNumber: {
+        bool profiling;
+        if (!input.ReadRaw(reinterpret_cast<void*>(&profiling), 1)) {
+          return tag;
+        }
+        meta_.set_profile(profiling);
+        int64_t lisner_id = platform::ListenerId();
+        if (lisner_id <= 0) {
+          break;
+        }
+        if (profiling && !platform::IsProfileEnabled()) {
+          platform::EnableProfiler(platform::ProfilerState::kCPU);
+        } else if (!profiling && platform::IsProfileEnabled()) {
+          // TODO(panyx0718): Should we allow to customize file dir.
+          platform::DisableProfiler(
+              platform::EventSortingKey::kDefault,
+              string::Sprintf("/tmp/profile_ps_%lld", lisner_id));
+        }
+        break;
+      }
       default: {
         // Unknown tag, return unknown error.
         return -1;
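The two hunks above are the whole trainer/PS handshake: the trainer piggybacks its profiler state on every variable it sends, and the PS mirrors that state, dumping its own trace when the flag flips back to false. A minimal Python mirror of the PS-side logic, just to make the state machine explicit (the names are illustrative, not Paddle APIs):

    _profiling = False

    def on_profile_flag(flag, listener_id):
        global _profiling
        if listener_id <= 0:
            return  # not a PS; only processes with a listener id react
        if flag and not _profiling:
            _profiling = True    # EnableProfiler(kCPU) in the C++ code
        elif not flag and _profiling:
            _profiling = False   # DisableProfiler(...) writes /tmp/profile_ps_<id>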
diff --git a/paddle/fluid/operators/listen_and_serv_op.cc b/paddle/fluid/operators/listen_and_serv_op.cc
index 59b9451155..470a567e8b 100644
--- a/paddle/fluid/operators/listen_and_serv_op.cc
+++ b/paddle/fluid/operators/listen_and_serv_op.cc
@@ -18,6 +18,7 @@ limitations under the License. */
 #include <ostream>

 #include "paddle/fluid/operators/listen_and_serv_op.h"
+#include "paddle/fluid/platform/profiler.h"

 namespace paddle {
 namespace operators {
@@ -294,6 +295,8 @@ void ListenAndServOp::RunAsyncLoop(framework::Executor *executor,

 void ListenAndServOp::RunImpl(const framework::Scope &scope,
                               const platform::Place &dev_place) const {
+  // Mark this process as a PS; it decides whether to profile by listening to the trainer.
+  platform::SetProfileLisener();
   platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance();
   auto &dev_ctx = *pool.Get(dev_place);
   framework::Scope &recv_scope = scope.NewScope();
@@ -328,9 +331,8 @@ void ListenAndServOp::RunImpl(const framework::Scope &scope,
   rpc_service_->WaitServerReady();

   // Write to a file of server selected port for python use.
-  std::string file_path =
-      string::Sprintf("/tmp/paddle.%d.selected_port",
-                      static_cast<int>(::getpid()));
+  std::string file_path = string::Sprintf("/tmp/paddle.%d.selected_port",
+                                          static_cast<int>(::getpid()));
   SavePort(file_path);
   if (sync_mode) {
     RunSyncLoop(&executor, program, &recv_scope, prefetch_block);
diff --git a/paddle/fluid/platform/profiler.cc b/paddle/fluid/platform/profiler.cc
index 412cdda286..ac16e4cd59 100644
--- a/paddle/fluid/platform/profiler.cc
+++ b/paddle/fluid/platform/profiler.cc
@@ -13,12 +13,15 @@ See the License for the specific language governing permissions and
 limitations under the License. */

 #include "paddle/fluid/platform/profiler.h"
+
 #include <iomanip>
 #include <map>
 #include <mutex>  // NOLINT
+#include <limits>
+#include <random>
 #include <string>
 #ifdef PADDLE_WITH_CUDA
@@ -33,6 +36,9 @@ namespace platform {

 struct EventList;

+static int64_t profiler_lister_id = 0;
+static bool should_send_profile_state = false;
+
 // The profiler state, the initial value is ProfilerState::kDisabled
 static ProfilerState g_state = ProfilerState::kDisabled;
 // The thread local event list only can be accessed by the specific thread
@@ -219,13 +225,12 @@ void EnableProfiler(ProfilerState state) {
   PADDLE_ENFORCE(state != ProfilerState::kDisabled,
                  "Can't enable profiling, since the input state is ",
                  "ProfilerState::kDisabled");
-  PADDLE_ENFORCE(g_state == ProfilerState::kDisabled,
-                 "The profiling state should be disabled when calling ",
-                 "EnableProfiler.");
-  g_state = state;
-  if (g_state == ProfilerState::kAll) {
-    GetDeviceTracer()->Enable();
+  if (state == g_state) {
+    return;
   }
+  g_state = state;
+  should_send_profile_state = true;
+  GetDeviceTracer()->Enable();
 #ifdef PADDLE_WITH_CUDA
   if (g_state == ProfilerState::kCUDA) {
     // Generate some dummy events first to reduce the startup overhead.
@@ -435,8 +440,7 @@ void ParseEvents(const std::vector<std::vector<Event>>& events,

 void DisableProfiler(EventSortingKey sorted_key,
                      const std::string& profile_path) {
-  PADDLE_ENFORCE(g_state != ProfilerState::kDisabled,
-                 "Can't disable profiling, since it's not starting.");
+  if (g_state == ProfilerState::kDisabled) return;
   // Mark the profiling stop.
   Mark("_stop_profiler_", nullptr);

@@ -444,12 +448,25 @@ void DisableProfiler(EventSortingKey sorted_key,
   ParseEvents(all_events, sorted_key);
   ResetProfiler();
   DeviceTracer* tracer = GetDeviceTracer();
-  if (g_state == ProfilerState::kAll && tracer && tracer->IsEnabled()) {
+  if (tracer->IsEnabled()) {
     tracer->Disable();
     tracer->GenProfile(profile_path);
   }
   g_state = ProfilerState::kDisabled;
+  should_send_profile_state = true;
+}
+
+bool IsProfileEnabled() { return g_state != ProfilerState::kDisabled; }
+
+bool ShouldSendProfileState() { return should_send_profile_state; }
+
+void SetProfileLisener() {
+  std::mt19937 rng;
+  rng.seed(std::random_device()());
+  std::uniform_int_distribution<int64_t> dist6(
+      1, std::numeric_limits<int64_t>::max());
+  profiler_lister_id = dist6(rng);
 }
+
+int64_t ListenerId() { return profiler_lister_id; }

 }  // namespace platform
 }  // namespace paddle
diff --git a/paddle/fluid/platform/profiler.h b/paddle/fluid/platform/profiler.h
index 428d9ebcea..c8b8c258a8 100644
--- a/paddle/fluid/platform/profiler.h
+++ b/paddle/fluid/platform/profiler.h
@@ -114,5 +114,13 @@ void ResetProfiler();
 void DisableProfiler(EventSortingKey sorted_key,
                      const std::string& profile_path);

+// Test if the profiler is currently enabled.
+bool IsProfileEnabled();
+// Whether the trainer should send profiling state to PS.
+bool ShouldSendProfileState();
+// Mark the current process as a PS by assigning it a listener id.
+void SetProfileLisener();
+int64_t ListenerId();
+
 }  // namespace platform
 }  // namespace paddle
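SetProfileLisener gives each PS a random 64-bit id, so concurrent PS processes on one host dump to distinct /tmp/profile_ps_<id> files rather than clobbering each other. The C++ above amounts to this one-liner (Python rendering, for intuition only):

    import random

    listener_id = random.SystemRandom().randint(1, 2**63 - 1)
    profile_path = '/tmp/profile_ps_%d' % listener_id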
diff --git a/tools/timeline.py b/tools/timeline.py
index f4083c824e..8cd6353d46 100644
--- a/tools/timeline.py
+++ b/tools/timeline.py
@@ -22,7 +22,11 @@ import paddle.fluid.proto.profiler.profiler_pb2 as profiler_pb2

 parser = argparse.ArgumentParser(description=__doc__)
 parser.add_argument(
-    '--profile_path', type=str, default='', help='Input profile file name.')
+    '--profile_path',
+    type=str,
+    default='',
+    help='Input profile file name. If there are multiple files, the format '
+    'should be trainer1=file1,trainer2=file2,ps=file3')
 parser.add_argument(
     '--timeline_path', type=str, default='', help='Output timeline file name.')
 args = parser.parse_args()
@@ -108,8 +112,8 @@ class _ChromeTraceFormatter(object):


 class Timeline(object):
-    def __init__(self, profile_pb):
-        self._profile_pb = profile_pb
+    def __init__(self, profile_dict):
+        self._profile_dict = profile_dict
         self._pid = 0
         self._devices = dict()
         self._chrome_trace = _ChromeTraceFormatter()
@@ -120,35 +124,37 @@ class Timeline(object):
         return cur_pid

     def _allocate_pids(self):
-        for event in self._profile_pb.events:
-            if event.type == profiler_pb2.Event.CPU:
-                if (event.device_id, "CPU") not in self._devices:
-                    pid = self._allocate_pid()
-                    self._devices[(event.device_id, "CPU")] = pid
-                    self._chrome_trace.emit_pid("cpu:block:%d" %
-                                                (event.device_id), pid)
-            elif event.type == profiler_pb2.Event.GPUKernel:
-                if (event.device_id, "GPUKernel") not in self._devices:
-                    pid = self._allocate_pid()
-                    self._devices[(event.device_id, "GPUKernel")] = pid
-                    self._chrome_trace.emit_pid("gpu:%d" % (event.device_id),
-                                                pid)
+        for k, profile_pb in self._profile_dict.iteritems():
+            for event in profile_pb.events:
+                if event.type == profiler_pb2.Event.CPU:
+                    if (k, event.device_id, "CPU") not in self._devices:
+                        pid = self._allocate_pid()
+                        self._devices[(k, event.device_id, "CPU")] = pid
+                        self._chrome_trace.emit_pid("%s:cpu:block:%d" %
+                                                    (k, event.device_id), pid)
+                elif event.type == profiler_pb2.Event.GPUKernel:
+                    if (k, event.device_id, "GPUKernel") not in self._devices:
+                        pid = self._allocate_pid()
+                        self._devices[(k, event.device_id, "GPUKernel")] = pid
+                        self._chrome_trace.emit_pid("%s:gpu:%d" %
+                                                    (k, event.device_id), pid)

     def _allocate_events(self):
-        for event in self._profile_pb.events:
-            if event.type == profiler_pb2.Event.CPU:
-                type = "CPU"
-            elif event.type == profiler_pb2.Event.GPUKernel:
-                type = "GPUKernel"
-            pid = self._devices[(event.device_id, type)]
-            args = {'name': event.name}
-            if event.memcopy.bytes > 0:
-                args = {'mem_bytes': event.memcopy.bytes}
-            # TODO(panyx0718): Chrome tracing only handles ms. However,
-            # some ops take microseconds. Hence, we keep the ns here.
-            self._chrome_trace.emit_region(
-                event.start_ns, (event.end_ns - event.start_ns) / 1.0, pid,
-                event.sub_device_id, 'Op', event.name, args)
+        for k, profile_pb in self._profile_dict.iteritems():
+            for event in profile_pb.events:
+                if event.type == profiler_pb2.Event.CPU:
+                    type = "CPU"
+                elif event.type == profiler_pb2.Event.GPUKernel:
+                    type = "GPUKernel"
+                pid = self._devices[(k, event.device_id, type)]
+                args = {'name': event.name}
+                if event.memcopy.bytes > 0:
+                    args = {'mem_bytes': event.memcopy.bytes}
+                # TODO(panyx0718): Chrome tracing only handles ms. However,
+                # some ops take microseconds. Hence, we keep the ns here.
+                self._chrome_trace.emit_region(
+                    event.start_ns, (event.end_ns - event.start_ns) / 1.0,
+                    pid, event.sub_device_id, 'Op', event.name, args)

     def generate_chrome_trace(self):
         self._allocate_pids()
@@ -163,11 +169,23 @@ timeline_path = '/tmp/timeline'
 if args.timeline_path:
     timeline_path = args.timeline_path

-with open(profile_path, 'r') as f:
-    profile_s = f.read()
-    profile_pb = profiler_pb2.Profile()
-    profile_pb.ParseFromString(profile_s)
-
-tl = Timeline(profile_pb)
+profile_paths = profile_path.split(',')
+profile_dict = dict()
+if len(profile_paths) == 1:
+    with open(profile_path, 'r') as f:
+        profile_s = f.read()
+        profile_pb = profiler_pb2.Profile()
+        profile_pb.ParseFromString(profile_s)
+    profile_dict['trainer'] = profile_pb
+else:
+    for profile_path in profile_paths:
+        k, v = profile_path.split('=')
+        with open(v, 'r') as f:
+            profile_s = f.read()
+            profile_pb = profiler_pb2.Profile()
+            profile_pb.ParseFromString(profile_s)
+        profile_dict[k] = profile_pb
+
+tl = Timeline(profile_dict)
 with open(timeline_path, 'w') as f:
     f.write(tl.generate_chrome_trace())
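End-to-end, producing a combined timeline is now two steps: run the trainer with --profile (the new proto flag flips the PS profilers on and off in lockstep), then merge the dumps with the updated tool. A sketch with hypothetical file names (the trainer writes /tmp/profile_vgg per the script above; each PS writes /tmp/profile_ps_<listener_id>):

    import subprocess

    subprocess.check_call([
        'python', 'tools/timeline.py',
        '--profile_path',
        'trainer0=/tmp/profile_vgg,ps0=/tmp/profile_ps_4242',
        '--timeline_path', '/tmp/timeline.json',
    ])
    # Open chrome://tracing and load /tmp/timeline.json; each key in
    # profile_path becomes its own process row in the trace.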
From 9927413991bd16e4fd16eaf30531885097457553 Mon Sep 17 00:00:00 2001
From: Xin Pan
Date: Fri, 4 May 2018 10:14:09 +0800
Subject: [PATCH 02/13] remove version change

---
 cmake/external/grpc.cmake | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/cmake/external/grpc.cmake b/cmake/external/grpc.cmake
index ef520b1287..e90948782b 100644
--- a/cmake/external/grpc.cmake
+++ b/cmake/external/grpc.cmake
@@ -33,7 +33,7 @@ ExternalProject_Add(
     extern_grpc
     DEPENDS protobuf zlib
     GIT_REPOSITORY "https://github.com/grpc/grpc.git"
-    GIT_TAG "v1.8.x"
+    GIT_TAG "v1.10.x"
     PREFIX ${GRPC_SOURCES_DIR}
     UPDATE_COMMAND ""
     CONFIGURE_COMMAND ""

From 5a9f17f02b37ed369c37d44a516faadc66b6d15a Mon Sep 17 00:00:00 2001
From: Xin Pan
Date: Fri, 4 May 2018 15:51:43 +0800
Subject: [PATCH 03/13] clean up

---
 paddle/fluid/operators/listen_and_serv_op.cc | 2 +-
 paddle/fluid/platform/profiler.cc            | 2 +-
 paddle/fluid/platform/profiler.h             | 2 +-
 3 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/paddle/fluid/operators/listen_and_serv_op.cc b/paddle/fluid/operators/listen_and_serv_op.cc
index 470a567e8b..8acbf82025 100644
--- a/paddle/fluid/operators/listen_and_serv_op.cc
+++ b/paddle/fluid/operators/listen_and_serv_op.cc
@@ -296,7 +296,7 @@ void ListenAndServOp::RunAsyncLoop(framework::Executor *executor,
 void ListenAndServOp::RunImpl(const framework::Scope &scope,
                               const platform::Place &dev_place) const {
   // Mark this process as a PS; it decides whether to profile by listening to the trainer.
-  platform::SetProfileLisener();
+  platform::SetProfileListener();
   platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance();
   auto &dev_ctx = *pool.Get(dev_place);
   framework::Scope &recv_scope = scope.NewScope();
diff --git a/paddle/fluid/platform/profiler.cc b/paddle/fluid/platform/profiler.cc
index ac16e4cd59..cfddd8e871 100644
--- a/paddle/fluid/platform/profiler.cc
+++ b/paddle/fluid/platform/profiler.cc
@@ -459,7 +459,7 @@ void DisableProfiler(EventSortingKey sorted_key,
 bool IsProfileEnabled() { return g_state != ProfilerState::kDisabled; }
 bool ShouldSendProfileState() { return should_send_profile_state; }

-void SetProfileLisener() {
+void SetProfileListener() {
   std::mt19937 rng;
   rng.seed(std::random_device()());
   std::uniform_int_distribution<int64_t> dist6(
diff --git a/paddle/fluid/platform/profiler.h b/paddle/fluid/platform/profiler.h
index c8b8c258a8..61b98143e4 100644
--- a/paddle/fluid/platform/profiler.h
+++ b/paddle/fluid/platform/profiler.h
@@ -119,7 +119,7 @@ bool IsProfileEnabled();
 // Whether the trainer should send profiling state to PS.
 bool ShouldSendProfileState();
 // Mark the current process as a PS by assigning it a listener id.
-void SetProfileLisener();
+void SetProfileListener();
 int64_t ListenerId();

 }  // namespace platform
 }  // namespace paddle

From cdd52f3a30c70f98044ad3dd7a86cd27b5c6071d Mon Sep 17 00:00:00 2001
From: Xin Pan
Date: Sat, 5 May 2018 20:58:04 +0800
Subject: [PATCH 04/13] Add comment to explain how to run inference test

---
 paddle/fluid/inference/tests/book/CMakeLists.txt | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/paddle/fluid/inference/tests/book/CMakeLists.txt b/paddle/fluid/inference/tests/book/CMakeLists.txt
index 97d9f03f88..ec5ca4a70f 100644
--- a/paddle/fluid/inference/tests/book/CMakeLists.txt
+++ b/paddle/fluid/inference/tests/book/CMakeLists.txt
@@ -24,6 +24,10 @@ function(inference_test TARGET_NAME)
   endforeach()
 endfunction(inference_test)

+####################
+# Inference tests here depend on fluid/tests/book.
+# Users need to run tests in fluid/tests/book first to generate the saved model.
+####################
 # This unittest is buggy!
 #inference_test(fit_a_line)
 inference_test(image_classification ARGS vgg resnet)

From cd54a31cc88fa1d4feb5215f0c36e3a84971e972 Mon Sep 17 00:00:00 2001
From: qiaolongfei
Date: Mon, 7 May 2018 11:04:56 +0800
Subject: [PATCH 05/13] fix fluid Metric

---
 python/paddle/fluid/metrics.py | 25 +++++++++++++------------
 1 file changed, 13 insertions(+), 12 deletions(-)

diff --git a/python/paddle/fluid/metrics.py b/python/paddle/fluid/metrics.py
index c618b02a76..1301b6f961 100644
--- a/python/paddle/fluid/metrics.py
+++ b/python/paddle/fluid/metrics.py
@@ -251,7 +251,7 @@ class EditDistance(MetricBase):
         self.instance_error += seq_num - seq_right_count
         self.total_distance += total_distance

-    def eval():
+    def eval(self):
         if self.seq_num == 0:
             raise ValueError(
                 "There is no data in EditDistance Metric. Please check layers.edit_distance output has been added to EditDistance."
@@ -340,8 +340,8 @@ class Auc(MetricBase):
             raise ValueError("The 'predictions' must be a numpy ndarray.")

         kepsilon = 1e-7  # to account for floating point imprecisions
-        thresholds = [(i + 1) * 1.0 / (num_thresholds - 1)
-                      for i in range(num_thresholds - 2)]
+        thresholds = [(i + 1) * 1.0 / (self._num_thresholds - 1)
+                      for i in range(self._num_thresholds - 2)]
         thresholds = [0.0 - kepsilon] + thresholds + [1.0 + kepsilon]

         # calculate TP, FN, TN, FP count
@@ -358,19 +358,20 @@ class Auc(MetricBase):
                         fp += 1
                     else:
                         tn += 1
-            tp_list[idx_thresh] += tp
-            fn_list[idx_thresh] += fn
-            tn_list[idx_thresh] += tn
-            fp_list[idx_thresh] += fp
+            self.tp_list[idx_thresh] += tp
+            self.fn_list[idx_thresh] += fn
+            self.tn_list[idx_thresh] += tn
+            self.fp_list[idx_thresh] += fp

     def eval(self):
         epsilon = self._epsilon
         num_thresholds = self._num_thresholds
-        tpr = (tp_list.astype("float32") + epsilon) / (
-            tp_list + fn_list + epsilon)
-        fpr = fp_list.astype("float32") / (fp_list + tn_list + epsilon)
-        rec = (tp_list.astype("float32") + epsilon) / (
-            tp_list + fp_list + epsilon)
+        tpr = (self.tp_list.astype("float32") + epsilon) / (
+            self.tp_list + self.fn_list + epsilon)
+        fpr = self.fp_list.astype("float32") / (
+            self.fp_list + self.tn_list + epsilon)
+        rec = (self.tp_list.astype("float32") + epsilon) / (
+            self.tp_list + self.fp_list + epsilon)

         x = fpr[:num_thresholds - 1] - fpr[1:]
         y = (tpr[:num_thresholds - 1] + tpr[1:]) / 2.0
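All of the Auc and EditDistance fixes are of the same family: eval() missing self, or bare locals (tp_list, num_thresholds) where attributes (self.tp_list, self._num_thresholds) were meant, so they only surface at runtime. A smoke test along these lines would have caught them (a rough sketch; check the update() signatures against the class definitions before relying on it):

    import numpy as np
    import paddle.fluid.metrics as metrics

    auc = metrics.Auc(name='auc')
    preds = np.array([[0.3, 0.7], [0.9, 0.1]])
    labels = np.array([[1], [0]])
    auc.update(preds, labels)
    print(auc.eval())  # NameError on bare tp_list before this patch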
From 171d3e861c51240940c5e33dd213d286cfb790a3 Mon Sep 17 00:00:00 2001
From: qiaolongfei
Date: Mon, 7 May 2018 11:10:12 +0800
Subject: [PATCH 06/13] fix CompositeMetric

---
 python/paddle/fluid/metrics.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/python/paddle/fluid/metrics.py b/python/paddle/fluid/metrics.py
index 1301b6f961..7f9e958a8e 100644
--- a/python/paddle/fluid/metrics.py
+++ b/python/paddle/fluid/metrics.py
@@ -116,7 +116,7 @@ class CompositeMetric(MetricBase):
         super(CompositeMetric, self).__init__(name, kwargs)
         self._metrics = []

-    def add_metric(self, metric):
+    def update(self, metric):
         if not isinstance(metric, MetricBase):
             raise ValueError("SubMetric should inherit from MetricBase.")
         self._metrics.append(metric)
@@ -280,6 +280,7 @@ class DetectionMAP(MetricBase):
         super(DetectionMAP, self).__init__(name)
         # the current map value
         self.value = .0
+        self.weight = .0

     def update(self, value, weight):
         if not _is_number_or_matrix_(value):

From d1ea74d3b99e227f89ae5f9a4130a57abdfaa283 Mon Sep 17 00:00:00 2001
From: Xin Pan
Date: Mon, 7 May 2018 13:07:48 +0800
Subject: [PATCH 07/13] follow comments

---
 paddle/fluid/operators/detail/variable_response.cc | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/paddle/fluid/operators/detail/variable_response.cc b/paddle/fluid/operators/detail/variable_response.cc
index 335491e95d..f4a374d56d 100644
--- a/paddle/fluid/operators/detail/variable_response.cc
+++ b/paddle/fluid/operators/detail/variable_response.cc
@@ -434,8 +434,8 @@ int VariableResponse::Parse(Source* source) {
           return tag;
         }
         meta_.set_profile(profiling);
-        int64_t lisner_id = platform::ListenerId();
-        if (lisner_id <= 0) {
+        int64_t listener_id = platform::ListenerId();
+        if (listener_id <= 0) {
           break;
         }
         if (profiling && !platform::IsProfileEnabled()) {
@@ -444,7 +444,7 @@ int VariableResponse::Parse(Source* source) {
           // TODO(panyx0718): Should we allow to customize file dir.
           platform::DisableProfiler(
               platform::EventSortingKey::kDefault,
-              string::Sprintf("/tmp/profile_ps_%lld", lisner_id));
+              string::Sprintf("/tmp/profile_ps_%lld", listener_id));
         }
         break;
       }

From 7f37060879a8e10ee92028f92bef7346afb86a13 Mon Sep 17 00:00:00 2001
From: qiaolongfei
Date: Mon, 7 May 2018 13:11:35 +0800
Subject: [PATCH 08/13] revert CompositeMetric::add_metric

---
 python/paddle/fluid/metrics.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/python/paddle/fluid/metrics.py b/python/paddle/fluid/metrics.py
index 7f9e958a8e..bb9c6fdc60 100644
--- a/python/paddle/fluid/metrics.py
+++ b/python/paddle/fluid/metrics.py
@@ -116,7 +116,7 @@ class CompositeMetric(MetricBase):
         super(CompositeMetric, self).__init__(name, kwargs)
         self._metrics = []

-    def update(self, metric):
+    def add_metric(self, metric):
         if not isinstance(metric, MetricBase):
             raise ValueError("SubMetric should inherit from MetricBase.")
         self._metrics.append(metric)

From 9fccf46270cee6a60b0ab0a0939764dcf6f2199f Mon Sep 17 00:00:00 2001
From: Xin Pan
Date: Mon, 7 May 2018 13:11:45 +0800
Subject: [PATCH 09/13] reword comments

---
 paddle/fluid/inference/tests/book/CMakeLists.txt | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/paddle/fluid/inference/tests/book/CMakeLists.txt b/paddle/fluid/inference/tests/book/CMakeLists.txt
index ec5ca4a70f..cc179a8625 100644
--- a/paddle/fluid/inference/tests/book/CMakeLists.txt
+++ b/paddle/fluid/inference/tests/book/CMakeLists.txt
@@ -25,8 +25,9 @@ function(inference_test TARGET_NAME)
 endfunction(inference_test)

 ####################
-# Inference tests here depend on fluid/tests/book.
-# Users need to run tests in fluid/tests/book first to generate the saved model.
+# Inference tests here depend on fluid/tests/book. If users want to run
+# an individual test with ctest, they need to run the tests in
+# fluid/tests/book first to generate the saved model.
 ####################
 # This unittest is buggy!
 #inference_test(fit_a_line)
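The reworded comment is about test ordering: the C++ inference tests load models that the Python book tests save to disk. To run a single inference test through ctest, generate its model first; for example (script names are hypothetical, substitute the actual book test file):

    import subprocess

    # Save the model that test_inference_image_classification expects,
    # then the ctest target can be run on its own.
    subprocess.check_call(
        ['python', 'test_image_classification.py'],
        cwd='python/paddle/fluid/tests/book')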
From 2a2c83b9e6b1b818edc3a0d67cc21225922e290c Mon Sep 17 00:00:00 2001
From: Yan Chunwei
Date: Mon, 7 May 2018 14:35:20 +0800
Subject: [PATCH 10/13] feature/convert tensorrt io (#10440)

* init
* add ut
* split singleton from base class
* add singleton

---
 .../fluid/inference/tensorrt/CMakeLists.txt   |  1 +
 .../fluid/inference/tensorrt/io_converter.cc  | 57 +++++++++++++++
 .../fluid/inference/tensorrt/io_converter.h   | 66 +++++++++++++++++
 .../inference/tensorrt/test_io_converter.cc   | 53 ++++++++++++++
 paddle/fluid/inference/utils/singleton.h      | 73 +++++++++++++++++++
 5 files changed, 250 insertions(+)
 create mode 100644 paddle/fluid/inference/tensorrt/io_converter.cc
 create mode 100644 paddle/fluid/inference/tensorrt/io_converter.h
 create mode 100644 paddle/fluid/inference/tensorrt/test_io_converter.cc
 create mode 100644 paddle/fluid/inference/utils/singleton.h

diff --git a/paddle/fluid/inference/tensorrt/CMakeLists.txt b/paddle/fluid/inference/tensorrt/CMakeLists.txt
index 288789d6e4..c8b656394b 100644
--- a/paddle/fluid/inference/tensorrt/CMakeLists.txt
+++ b/paddle/fluid/inference/tensorrt/CMakeLists.txt
@@ -1,4 +1,5 @@
 nv_test(test_tensorrt SRCS test_tensorrt.cc DEPS dynload_cuda device_context dynamic_loader)
 nv_test(test_tensorrt_engine SRCS test_engine.cc engine.cc DEPS dynload_cuda)
+nv_test(test_io_converter SRCS test_io_converter.cc io_converter.cc DEPS dynload_cuda dynamic_loader lod_tensor)
 set(ENGINE_FILE ${CMAKE_CURRENT_SOURCE_DIR}/engine.cc)
 add_subdirectory(convert)
diff --git a/paddle/fluid/inference/tensorrt/io_converter.cc b/paddle/fluid/inference/tensorrt/io_converter.cc
new file mode 100644
index 0000000000..2baac96c26
--- /dev/null
+++ b/paddle/fluid/inference/tensorrt/io_converter.cc
@@ -0,0 +1,57 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/fluid/inference/tensorrt/io_converter.h"
+#include <cuda.h>
+#include "paddle/fluid/platform/enforce.h"
+
+namespace paddle {
+namespace inference {
+namespace tensorrt {
+
+using platform::is_gpu_place;
+using platform::is_cpu_place;
+
+class DefaultInputConverter : public EngineInputConverter {
+ public:
+  DefaultInputConverter() {}
+  // NOTE `out` is GPU memory.
+  virtual void operator()(const LoDTensor& in, void* out,
+                          size_t max_size) override {
+    PADDLE_ENFORCE(out != nullptr);
+    PADDLE_ENFORCE_LE(in.memory_size(), max_size);
+    const auto& place = in.place();
+    if (is_cpu_place(place)) {
+      PADDLE_ENFORCE(stream_ != nullptr);
+      PADDLE_ENFORCE_EQ(
+          0, cudaMemcpyAsync(out, in.data<float>(), in.memory_size(),
+                             cudaMemcpyHostToDevice, *stream_));
+    } else if (is_gpu_place(place)) {
+      PADDLE_ENFORCE_EQ(
+          0, cudaMemcpyAsync(out, in.data<float>(), in.memory_size(),
+                             cudaMemcpyDeviceToDevice, *stream_));
+    } else {
+      PADDLE_THROW("Unknown device for converter");
+    }
+    cudaStreamSynchronize(*stream_);
+  }
+};
+
+REGISTER_TENSORRT_INPUT_CONVERTER(mul, DefaultInputConverter);
+
+}  // namespace tensorrt
+}  // namespace inference
+}  // namespace paddle
diff --git a/paddle/fluid/inference/tensorrt/io_converter.h b/paddle/fluid/inference/tensorrt/io_converter.h
new file mode 100644
index 0000000000..6ea61cbbac
--- /dev/null
+++ b/paddle/fluid/inference/tensorrt/io_converter.h
@@ -0,0 +1,66 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include <string>
+#include "paddle/fluid/framework/lod_tensor.h"
+#include "paddle/fluid/inference/utils/singleton.h"
+
+namespace paddle {
+namespace inference {
+namespace tensorrt {
+
+using framework::LoDTensor;
+
+/*
+ * Convert Input from Fluid to an Engine.
+ * TensorRT's ITensor follows row major, NCHW. Fluid is also row major, so in
+ * most cases we just need to copy the data.
+ */
+class EngineInputConverter {
+ public:
+  EngineInputConverter() {}
+
+  virtual void operator()(const LoDTensor& in, void* out, size_t max_size) {}
+
+  void SetStream(cudaStream_t* stream) { stream_ = stream; }
+
+  static void Run(const std::string& in_op_type, const LoDTensor& in,
+                  void* out, size_t max_size, cudaStream_t* stream) {
+    PADDLE_ENFORCE(stream != nullptr);
+    auto* converter = Registry<EngineInputConverter>::Lookup(in_op_type);
+    PADDLE_ENFORCE_NOT_NULL(converter);
+    converter->SetStream(stream);
+    (*converter)(in, out, max_size);
+  }
+
+  virtual ~EngineInputConverter() {}
+
+ protected:
+  cudaStream_t* stream_{nullptr};
+};
+
+}  // namespace tensorrt
+}  // namespace inference
+}  // namespace paddle
+
+#define REGISTER_TENSORRT_INPUT_CONVERTER(in_op_type__, Converter__)   \
+  struct trt_input_##in_op_type__##_converter {                        \
+    trt_input_##in_op_type__##_converter() {                           \
+      ::paddle::inference::Registry<EngineInputConverter>::Register<   \
+          Converter__>(#in_op_type__);                                 \
+    }                                                                  \
+  };                                                                   \
+  trt_input_##in_op_type__##_converter trt_input_##in_op_type__##_converter__;
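REGISTER_TENSORRT_INPUT_CONVERTER is the usual static-registration trick: defining a file-scope dummy struct whose constructor runs at load time inserts the converter into the Registry, so EngineInputConverter::Run can look converters up by op type without a central list. The same pattern in Python, for illustration only:

    _converters = {}

    def register_converter(op_type):
        def deco(cls):
            assert op_type not in _converters, op_type
            _converters[op_type] = cls()   # mirrors Registry::Register
            return cls
        return deco

    @register_converter('mul')
    class DefaultInputConverter(object):
        def __call__(self, tensor, out, max_size):
            pass  # host-to-device copy in the real converter

    converter = _converters['mul']         # mirrors Registry::Lookup
    converter(None, None, 0)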
diff --git a/paddle/fluid/inference/tensorrt/test_io_converter.cc b/paddle/fluid/inference/tensorrt/test_io_converter.cc
new file mode 100644
index 0000000000..365e936686
--- /dev/null
+++ b/paddle/fluid/inference/tensorrt/test_io_converter.cc
@@ -0,0 +1,53 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/fluid/framework/lod_tensor.h"
+#include "paddle/fluid/inference/tensorrt/io_converter.h"
+
+#include <gtest/gtest.h>
+
+namespace paddle {
+namespace inference {
+namespace tensorrt {
+
+class EngineInputConverterTester : public ::testing::Test {
+ public:
+  void SetUp() override { tensor.Resize({10, 10}); }
+
+  framework::LoDTensor tensor;
+};
+
+TEST_F(EngineInputConverterTester, DefaultCPU) {
+  void* buffer;
+  tensor.mutable_data<float>(platform::CPUPlace());
+  ASSERT_EQ(cudaMalloc(&buffer, tensor.memory_size()), 0);
+
+  cudaStream_t stream;
+  EngineInputConverter::Run("mul", tensor, buffer, tensor.memory_size(),
+                            &stream);
+}
+
+TEST_F(EngineInputConverterTester, DefaultGPU) {
+  void* buffer;
+  tensor.mutable_data<float>(platform::CUDAPlace());
+  ASSERT_EQ(cudaMalloc(&buffer, tensor.memory_size()), 0);
+
+  cudaStream_t stream;
+  EngineInputConverter::Run("mul", tensor, buffer, tensor.memory_size(),
+                            &stream);
+}
+
+}  // namespace tensorrt
+}  // namespace inference
+}  // namespace paddle
diff --git a/paddle/fluid/inference/utils/singleton.h b/paddle/fluid/inference/utils/singleton.h
new file mode 100644
index 0000000000..f05921067c
--- /dev/null
+++ b/paddle/fluid/inference/utils/singleton.h
@@ -0,0 +1,73 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include <string>
+#include <unordered_map>
+#include "paddle/fluid/platform/enforce.h"
+
+namespace paddle {
+namespace inference {
+
+// NOTE not thread-safe.
+template <typename T>
+struct Singleton {
+  static T& Global() {
+    static T* x = new T;
+    return *x;
+  }
+
+  Singleton() = delete;
+  Singleton& operator=(const Singleton&) = delete;
+};
+
+/*
+ * A registry for any type.
+ * NOTE not thread-safe.
+ */ +template +struct Registry { + static Registry& Global() { + static auto* x = new Registry; + return *x; + } + + template + static void Register(const std::string& name) { + PADDLE_ENFORCE_EQ(items_.count(name), 0); + items_[name] = new ItemChild; + } + + static ItemParent* Lookup(const std::string& name) { + auto it = items_.find(name); + if (it == items_.end()) return nullptr; + return it->second; + } + + ~Registry() { + for (auto& item : items_) { + delete item.second; + } + } + + private: + Registry() = default; + static std::unordered_map items_; +}; + +template +std::unordered_map Registry::items_; + +} // namespace inference +} // namespace paddle From bb3247e33973ca02d900421e7f823214f4b0a067 Mon Sep 17 00:00:00 2001 From: Yancey Date: Mon, 7 May 2018 15:14:08 +0800 Subject: [PATCH 11/13] fix traner.py import error (#10442) --- python/paddle/fluid/trainer.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/paddle/fluid/trainer.py b/python/paddle/fluid/trainer.py index 8252592c8c..a9fa2359e0 100644 --- a/python/paddle/fluid/trainer.py +++ b/python/paddle/fluid/trainer.py @@ -22,7 +22,7 @@ import io # optimizer is same as the parameter of Trainer.__init__. Rename it to opt_module import optimizer as opt_module -import distribute_transpiler +from transpiler import distribute_transpiler __all__ = [ 'Trainer', From 5b06944857e74f9b1388e081d4502bfd8c002832 Mon Sep 17 00:00:00 2001 From: Yancey Date: Mon, 7 May 2018 18:55:39 +0800 Subject: [PATCH 12/13] fix trainer import error on ce (#10448) * fix trainer import error on ce * fix setup.py.in --- python/paddle/fluid/__init__.py | 1 + python/paddle/fluid/trainer.py | 1 + python/setup.py.in | 3 ++- 3 files changed, 4 insertions(+), 1 deletion(-) diff --git a/python/paddle/fluid/__init__.py b/python/paddle/fluid/__init__.py index 37d3689467..c8a435748d 100644 --- a/python/paddle/fluid/__init__.py +++ b/python/paddle/fluid/__init__.py @@ -60,6 +60,7 @@ __all__ = framework.__all__ + executor.__all__ + concurrency.__all__ +\ 'io', 'initializer', 'layers', + 'transpiler' 'nets', 'optimizer', 'learning_rate_decay', diff --git a/python/paddle/fluid/trainer.py b/python/paddle/fluid/trainer.py index a9fa2359e0..1cbecd69e5 100644 --- a/python/paddle/fluid/trainer.py +++ b/python/paddle/fluid/trainer.py @@ -19,6 +19,7 @@ import executor import data_feeder import contextlib import io +import transpiler # optimizer is same as the parameter of Trainer.__init__. Rename it to opt_module import optimizer as opt_module diff --git a/python/setup.py.in b/python/setup.py.in index a811b509a9..c42601d335 100644 --- a/python/setup.py.in +++ b/python/setup.py.in @@ -68,7 +68,8 @@ packages=['paddle', 'paddle.fluid', 'paddle.fluid.proto', 'paddle.fluid.proto.profiler', - 'paddle.fluid.layers'] + 'paddle.fluid.layers', + 'paddle.fluid.transpiler'] if '${WITH_FLUID_ONLY}'== 'OFF': packages+=['paddle.proto', From f43b71b242467d665c134262c2b7167cef622757 Mon Sep 17 00:00:00 2001 From: whs Date: Mon, 7 May 2018 19:20:38 +0800 Subject: [PATCH 13/13] Fix clone function of Program to avoid memory leak. (#10358) * Fix clone function of Program to avoid memory leak. * Fix inference_optimize function of framework.py. * Reuse inference_optimize in framework.py. * Add comments. 
From f43b71b242467d665c134262c2b7167cef622757 Mon Sep 17 00:00:00 2001
From: whs
Date: Mon, 7 May 2018 19:20:38 +0800
Subject: [PATCH 13/13] Fix clone function of Program to avoid memory leak.
 (#10358)

* Fix clone function of Program to avoid memory leak.
* Fix inference_optimize function of framework.py.
* Reuse inference_optimize in framework.py.
* Add comments.

---
 python/paddle/fluid/framework.py | 21 +++++++++++++++------
 1 file changed, 15 insertions(+), 6 deletions(-)

diff --git a/python/paddle/fluid/framework.py b/python/paddle/fluid/framework.py
index ce9b880aeb..d7eda619c3 100644
--- a/python/paddle/fluid/framework.py
+++ b/python/paddle/fluid/framework.py
@@ -1042,13 +1042,14 @@ class Program(object):

         Returns(Program): The cloned Program object.
         """
-        p = Program()
         if for_test:
-            p.desc = core.inference_optimize(self.desc)
+            p = self.inference_optimize()
         else:
+            p = Program()
             p.desc = core.ProgramDesc(self.desc)
-        p.blocks = [Block(p, i) for i in xrange(self.desc.num_blocks())]
-        p.sync_with_cpp()
+            p.blocks = [Block(p, i) for i in xrange(self.desc.num_blocks())]
+            p.sync_with_cpp()
+
         p.copy_param_info_from(self)
         return p

@@ -1061,7 +1062,7 @@ class Program(object):
             if isinstance(t, Variable):
                 # After transpiler processing, the op that output this
                 # variable may have been changed, so t.op is not reliable
-                # and we need to find the current op that generates this
+                # and we need to find the current op that generates this
                 # variable here.
                 t.op = None
                 global_block = self.global_block()
@@ -1087,8 +1088,16 @@ class Program(object):
         return res

     def inference_optimize(self):
+        # This is an alternative implementation, used until
+        # core.inference_optimize is fixed.
         res = Program()
-        res.desc = core.inference_optimize(self.desc)
+        res.desc = core.ProgramDesc(self.desc)
+        for i in xrange(res.desc.num_blocks()):
+            block = res.desc.block(i)
+            for j in xrange(block.op_size()):
+                op = block.op(j)
+                if op.has_attr('is_test'):
+                    op.set_attr('is_test', True)
         res.blocks = [Block(res, i) for i in xrange(res.desc.num_blocks())]
         res.sync_with_cpp()
         return res
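For users, the visible contract of patch 13 is unchanged: clone(for_test=True) still returns a forward-only program, but it now builds it in Python by setting is_test=True on ops that carry the attribute instead of routing through the leaky core.inference_optimize binding, and copy_param_info_from runs for both branches so parameter attributes survive cloning. Typical usage (minimal sketch):

    import paddle.fluid as fluid

    main = fluid.default_main_program()
    # ... build the network and its loss on `main` ...
    # Clone before the optimizer adds backward ops, so the test
    # program stays forward-only:
    test_program = main.clone(for_test=True)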