From ca535d18ab808785cc969c8c8f96413536cd7926 Mon Sep 17 00:00:00 2001
From: sweetsky0901
Date: Fri, 8 Dec 2017 12:43:39 +0800
Subject: [PATCH 001/181] add detection_output code only

---
 paddle/operators/detection_output_op.cc    |  91 +++++++
 paddle/operators/detection_output_op.cu.cc |  21 ++
 paddle/operators/detection_output_op.h     | 114 ++++++++
 paddle/operators/math/detection_util.h     | 292 +++++++++++++++++++++
 4 files changed, 518 insertions(+)
 create mode 100644 paddle/operators/detection_output_op.cc
 create mode 100644 paddle/operators/detection_output_op.cu.cc
 create mode 100644 paddle/operators/detection_output_op.h
 create mode 100644 paddle/operators/math/detection_util.h

diff --git a/paddle/operators/detection_output_op.cc b/paddle/operators/detection_output_op.cc
new file mode 100644
index 0000000000..c018795fd4
--- /dev/null
+++ b/paddle/operators/detection_output_op.cc
@@ -0,0 +1,91 @@
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */

#include "paddle/operators/detection_output_op.h"
namespace paddle {
namespace operators {

class Detection_output_OpMaker : public framework::OpProtoAndCheckerMaker {
 public:
  Detection_output_OpMaker(framework::OpProto* proto,
                           framework::OpAttrChecker* op_checker)
      : OpProtoAndCheckerMaker(proto, op_checker) {
    AddInput(
        "Loc",
        "(Tensor) The input tensor of detection_output operator. "
        "The format of input tensor is NCHW. Where N is batch size, C is the "
        "number of channels, H and W is the height and width of feature.");
    AddInput(
        "Conf",
        "(Tensor) The input tensor of detection_output operator. "
        "The format of input tensor is NCHW. Where N is batch size, C is the "
        "number of channels, H and W is the height and width of feature.");
    AddInput(
        "PriorBox",
        "(Tensor) The input tensor of detection_output operator. "
        "The format of input tensor is NCHW. Where N is batch size, C is the "
        "number of channels, H and W is the height and width of feature.");
    AddOutput("Out",
              "(Tensor) The output tensor of detection_output operator."
              "N * M."
              "M = C * H * W");
    AddAttr("background_label_id", "(int), multi level pooling");
    AddAttr("num_classes", "(int), multi level pooling");
    AddAttr("nms_threshold", "(int), multi level pooling");
    AddAttr("confidence_threshold", "(int), multi level pooling");
    AddAttr("top_k", "(int), multi level pooling");
    AddAttr("nms_top_k", "(int), multi level pooling");
    AddComment(R"DOC(
        "Does spatial pyramid pooling on the input image by taking the max,
        etc.
        within regions so that the result vector of different sized
        images are of the same size
        Input shape: $(N, C_{in}, H_{in}, W_{in})$
        Output shape: $(H_{out}, W_{out})$
        Where
          $$
          H_{out} = N \\
          W_{out} = (((4^pyramid_height) - 1) / (4 - 1))$ * C_{in}
          $$
        )DOC");
  }
};

class Detection_output_Op : public framework::OperatorWithKernel {
 public:
  using framework::OperatorWithKernel::OperatorWithKernel;
  void InferShape(framework::InferShapeContext* ctx) const override {
    PADDLE_ENFORCE(ctx->HasInput("X"),
                   "Input(X) of Detection_output_Op"
                   "should not be null.");
    PADDLE_ENFORCE(ctx->HasOutput("Out"),
                   "Output(Out) of Detection_output_Op should not be null.");
    auto in_x_dims = ctx->GetInputDim("X");
    int pyramid_height = ctx->Attrs().Get("pyramid_height");
    PADDLE_ENFORCE(in_x_dims.size() == 4,
                   "Detection_output_ing intput must be of 4-dimensional.");
    int outlen = ((std::pow(4, pyramid_height) - 1) / (4 - 1)) * in_x_dims[1];
    std::vector output_shape({in_x_dims[0], outlen});
    ctx->SetOutputDim("Out", framework::make_ddim(output_shape));
  }
};
}  // namespace operators
}  // namespace paddle

namespace ops = paddle::operators;
REGISTER_OP_WITHOUT_GRADIENT(detection_output, ops::Detection_output_Op,
                             ops::Detection_output_OpMaker);
REGISTER_OP_CPU_KERNEL(
    detection_output,
    ops::Detection_output_Kernel,
    ops::Detection_output_Kernel);
diff --git a/paddle/operators/detection_output_op.cu.cc b/paddle/operators/detection_output_op.cu.cc
new file mode 100644
index 0000000000..8edcfc0be3
--- /dev/null
+++ b/paddle/operators/detection_output_op.cu.cc
@@ -0,0 +1,21 @@
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */

#include "paddle/operators/detection_output_op.h"

namespace ops = paddle::operators;
REGISTER_OP_GPU_KERNEL(
    detection_output,
    ops::Detection_output_Kernel,
    ops::Detection_output_Kernel);
diff --git a/paddle/operators/detection_output_op.h b/paddle/operators/detection_output_op.h
new file mode 100644
index 0000000000..184b864974
--- /dev/null
+++ b/paddle/operators/detection_output_op.h
@@ -0,0 +1,114 @@
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/ + +#pragma once +#include "paddle/framework/op_registry.h" +#include "paddle/framework/tensor.h" +#include "paddle/operators/math/detection_util.h" +#include "paddle/operators/math/math_function.h" +#include "paddle/operators/math/softmax.h" + +namespace paddle { +namespace operators { +template +class Detection_output_Kernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& context) const override { + const framework::Tensor* in_loc = context.Input("Loc"); + const framework::Tensor* in_conf = context.Input("Conf"); + const framework::Tensor* in_priorbox = + context.Input("PriorBox"); + auto* out = context.Output("Out"); + int num_classes = context.template Attr("num_classes"); + int top_k = context.template Attr("top_k"); + int nms_top_k = context.template Attr("nms_top_k"); + int background_label_id = context.template Attr("background_label_id"); + float nms_threshold = context.template Attr("nms_threshold"); + float confidence_threshold = + context.template Attr("confidence_threshold"); + + int input_num = in_loc->dims()[0]; + int batch_size = in_loc->dims()[1]; + int loc_sum_size = in_loc->numel(); + int conf_sum_size = in_conf->numel(); + std::vector loc_shape_vec({1, loc_sum_size}); + std::vector conf_shape_vec( + {conf_sum_size / num_classes, num_classes}); + framework::DDim loc_shape(framework::make_ddim(loc_shape_vec)); + framework::DDim conf_shape(framework::make_ddim(conf_shape_vec)); + framework::Tensor loc_tensor; + framework::Tensor conf_tensor; + loc_tensor.mutable_data(loc_shape, context.GetPlace()); + conf_tensor.mutable_data(conf_shape, context.GetPlace()); + + // KNCHW ==> NHWC + for (int i = 0; i < input_num; ++i) { + math::appendWithPermute(*in_loc, &loc_tensor); + math::appendWithPermute(*in_conf, &conf_tensor); + } + // softmax + math::SoftmaxFunctor()(context.device_context(), &conf_tensor, + &conf_tensor); + // get decode bboxes + size_t num_priors = in_priorbox->numel() / 8; + std::vector>> all_decoded_bboxes; + for (size_t n = 0; n < batch_size; ++n) { + std::vector> decoded_bboxes; + for (size_t i = 0; i < num_priors; ++i) { + size_t prior_offset = i * 8; + size_t loc_pred_offset = n * num_priors * 4 + i * 4; + std::vector> prior_bbox_vec; + math::getBBoxFromPriorData(in_priorbox->data() + prior_offset, 1, + prior_bbox_vec); + std::vector> prior_bbox_var; + math::getBBoxVarFromPriorData(in_priorbox->data() + prior_offset, + 1, prior_bbox_var); + std::vector loc_pred_data; + for (size_t j = 0; j < 4; ++j) + loc_pred_data.push_back( + *(loc_tensor.data() + loc_pred_offset + j)); + math::BBox bbox = math::decodeBBoxWithVar( + prior_bbox_vec[0], prior_bbox_var[0], loc_pred_data); + decoded_bboxes.push_back(bbox); + } + all_decoded_bboxes.push_back(decoded_bboxes); + } + + std::vector>> all_indices; + int num_kept = math::getDetectionIndices( + conf_tensor.data(), num_priors, num_classes, background_label_id, + batch_size, confidence_threshold, nms_top_k, nms_threshold, top_k, + all_decoded_bboxes, &all_indices); + + framework::Tensor out_tmp; + if (num_kept <= 0) { + std::vector out_shape_vec({0, 0}); + framework::DDim out_shape(framework::make_ddim(out_shape_vec)); + out->Resize(out_shape); + return; + } + std::vector out_shape_vec({num_kept, 7}); + framework::DDim out_shape(framework::make_ddim(out_shape_vec)); + out_tmp.mutable_data(out_shape, context.GetPlace()); + + T* out_data = out_tmp.data(); + math::getDetectionOutput(conf_tensor.data(), num_kept, num_priors, + num_classes, batch_size, all_indices, + 
all_decoded_bboxes, out_data); + out->mutable_data(out_shape, context.GetPlace()); + out->ShareDataWith(out_tmp); + } +}; +} // namespace operators +} // namespace paddle diff --git a/paddle/operators/math/detection_util.h b/paddle/operators/math/detection_util.h new file mode 100644 index 0000000000..265fa07701 --- /dev/null +++ b/paddle/operators/math/detection_util.h @@ -0,0 +1,292 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ +#pragma once +#include "paddle/framework/selected_rows.h" +#include "paddle/platform/device_context.h" + +namespace paddle { +namespace operators { +namespace math { + +template +struct BBox { + BBox(T x_min, T y_min, T x_max, T y_max) + : x_min(x_min), + y_min(y_min), + x_max(x_max), + y_max(y_max), + is_difficult(false) {} + + BBox() {} + + T get_width() const { return x_max - x_min; } + + T get_height() const { return y_max - y_min; } + + T get_center_x() const { return (x_min + x_max) / 2; } + + T get_center_y() const { return (y_min + y_max) / 2; } + + T get_area() const { return get_width() * get_height(); } + + // coordinate of bounding box + T x_min; + T y_min; + T x_max; + T y_max; + // whether difficult object (e.g. object with heavy occlusion is difficult) + bool is_difficult; +}; +// KNCHW ==> NHWC +template +int appendWithPermute(const framework::Tensor& input, + framework::Tensor* output) { + const int input_nums = input.dims()[0]; + const int batch_size = input.dims()[1]; + const int channels = input.dims()[2]; + const int height = input.dims()[3]; + const int weight = input.dims()[4]; + int image_size = height * weight; + int offset = 0; + for (int p = 0; p < input_nums; ++p) { + int in_p_offset = p * batch_size * channels * image_size; + for (int n = 0; n < batch_size; ++n) { + int in_n_offset = n * channels * image_size; + int out_n_offset = n * input.numel() / batch_size + offset; + int in_stride = image_size; + int out_stride = channels; + const T* in_data = input.data() + in_p_offset + in_n_offset; + T* out_data = output->data() + out_n_offset; + for (int i = 0; i < channels; ++i) { + for (int c = 0; c < image_size; ++c) { + out_data[out_stride * c + i] = in_data[i * in_stride + c]; + } + } + } + offset += image_size * channels; + } + return 0; +} +template +void getBBoxFromPriorData(const T* prior_data, const size_t num_bboxes, + std::vector>& bbox_vec) { + size_t out_offset = bbox_vec.size(); + bbox_vec.resize(bbox_vec.size() + num_bboxes); + for (size_t i = 0; i < num_bboxes; ++i) { + BBox bbox; + bbox.x_min = *(prior_data + i * 8); + bbox.y_min = *(prior_data + i * 8 + 1); + bbox.x_max = *(prior_data + i * 8 + 2); + bbox.y_max = *(prior_data + i * 8 + 3); + bbox_vec[out_offset + i] = bbox; + } +} +template +void getBBoxVarFromPriorData(const T* prior_data, const size_t num, + std::vector>& var_vec) { + size_t out_offset = var_vec.size(); + var_vec.resize(var_vec.size() + num); + for (size_t i = 0; i < num; ++i) { + std::vector var; + var.push_back(*(prior_data + i * 8 + 4)); + 
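    // editor's note on the layout assumed here: each prior box occupies 8
    // consecutive values in prior_data -- the 4 corner coordinates read by
    // getBBoxFromPriorData above (offsets 0..3) followed by the 4 encoded
    // variances gathered in this loop (offsets 4..7), which is also why
    // callers compute num_priors as numel() / 8.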
var.push_back(*(prior_data + i * 8 + 5)); + var.push_back(*(prior_data + i * 8 + 6)); + var.push_back(*(prior_data + i * 8 + 7)); + var_vec[out_offset + i] = var; + } +} +template +BBox decodeBBoxWithVar(BBox& prior_bbox, + const std::vector& prior_bbox_var, + const std::vector& loc_pred_data) { + T prior_bbox_width = prior_bbox.get_width(); + T prior_bbox_height = prior_bbox.get_height(); + T prior_bbox_center_x = prior_bbox.get_center_x(); + T prior_bbox_center_y = prior_bbox.get_center_y(); + + T decoded_bbox_center_x = + prior_bbox_var[0] * loc_pred_data[0] * prior_bbox_width + + prior_bbox_center_x; + T decoded_bbox_center_y = + prior_bbox_var[1] * loc_pred_data[1] * prior_bbox_height + + prior_bbox_center_y; + T decoded_bbox_width = + std::exp(prior_bbox_var[2] * loc_pred_data[2]) * prior_bbox_width; + T decoded_bbox_height = + std::exp(prior_bbox_var[3] * loc_pred_data[3]) * prior_bbox_height; + + BBox decoded_bbox; + decoded_bbox.x_min = decoded_bbox_center_x - decoded_bbox_width / 2; + decoded_bbox.y_min = decoded_bbox_center_y - decoded_bbox_height / 2; + decoded_bbox.x_max = decoded_bbox_center_x + decoded_bbox_width / 2; + decoded_bbox.y_max = decoded_bbox_center_y + decoded_bbox_height / 2; + + return decoded_bbox; +} +template +bool sortScorePairDescend(const std::pair& pair1, + const std::pair& pair2) { + return pair1.first > pair2.first; +} +template +bool sortScorePairDescend(const std::pair>& pair1, + const std::pair>& pair2); +template +T jaccardOverlap(const BBox& bbox1, const BBox& bbox2) { + if (bbox2.x_min > bbox1.x_max || bbox2.x_max < bbox1.x_min || + bbox2.y_min > bbox1.y_max || bbox2.y_max < bbox1.y_min) { + return 0.0; + } else { + T inter_x_min = std::max(bbox1.x_min, bbox2.x_min); + T inter_y_min = std::max(bbox1.y_min, bbox2.y_min); + T interX_max = std::min(bbox1.x_max, bbox2.x_max); + T interY_max = std::min(bbox1.y_max, bbox2.y_max); + + T inter_width = interX_max - inter_x_min; + T inter_height = interY_max - inter_y_min; + T inter_area = inter_width * inter_height; + + T bbox_area1 = bbox1.get_area(); + T bbox_area2 = bbox2.get_area(); + + return inter_area / (bbox_area1 + bbox_area2 - inter_area); + } +} + +template +void applyNMSFast(const std::vector>& bboxes, const T* conf_score_data, + size_t class_idx, size_t top_k, T conf_threshold, + T nms_threshold, size_t num_priors, size_t num_classes, + std::vector* indices) { + std::vector> scores; + for (size_t i = 0; i < num_priors; ++i) { + size_t conf_offset = i * num_classes + class_idx; + if (conf_score_data[conf_offset] > conf_threshold) + scores.push_back(std::make_pair(conf_score_data[conf_offset], i)); + } + std::stable_sort(scores.begin(), scores.end(), + sortScorePairDescend); + if (top_k > 0 && top_k < scores.size()) scores.resize(top_k); + while (scores.size() > 0) { + const size_t idx = scores.front().second; + bool keep = true; + for (size_t i = 0; i < indices->size(); ++i) { + if (keep) { + const size_t saved_idx = (*indices)[i]; + T overlap = jaccardOverlap(bboxes[idx], bboxes[saved_idx]); + keep = overlap <= nms_threshold; + } else { + break; + } + } + if (keep) indices->push_back(idx); + scores.erase(scores.begin()); + } +} +template +int getDetectionIndices( + const T* conf_data, const size_t num_priors, const size_t num_classes, + const size_t background_label_id, const size_t batch_size, + const T conf_threshold, const size_t nms_top_k, const T nms_threshold, + const size_t top_k, + const std::vector>>& all_decoded_bboxes, + std::vector>>* all_detection_indices) { + int total_keep_num 
= 0; + for (size_t n = 0; n < batch_size; ++n) { + const std::vector>& decoded_bboxes = all_decoded_bboxes[n]; + size_t num_detected = 0; + std::map> indices; + size_t conf_offset = n * num_priors * num_classes; + for (size_t c = 0; c < num_classes; ++c) { + if (c == background_label_id) continue; + applyNMSFast(decoded_bboxes, conf_data + conf_offset, c, nms_top_k, + conf_threshold, nms_threshold, num_priors, num_classes, + &(indices[c])); + num_detected += indices[c].size(); + } + if (top_k > 0 && num_detected > top_k) { + // std::vector> score_index_pairs; + std::vector>> score_index_pairs; + for (size_t c = 0; c < num_classes; ++c) { + const std::vector& label_indices = indices[c]; + for (size_t i = 0; i < label_indices.size(); ++i) { + size_t idx = label_indices[i]; + score_index_pairs.push_back( + std::make_pair((conf_data + conf_offset)[idx * num_classes + c], + std::make_pair(c, idx))); + } + } + std::sort(score_index_pairs.begin(), score_index_pairs.end(), + sortScorePairDescend>); + score_index_pairs.resize(top_k); + std::map> new_indices; + for (size_t i = 0; i < score_index_pairs.size(); ++i) { + size_t label = score_index_pairs[i].second.first; + size_t idx = score_index_pairs[i].second.second; + new_indices[label].push_back(idx); + } + all_detection_indices->push_back(new_indices); + total_keep_num += top_k; + } else { + all_detection_indices->push_back(indices); + total_keep_num += num_detected; + } + } + return total_keep_num; +} +template +BBox clipBBox(const BBox& bbox) { + T one = static_cast(1.0); + T zero = static_cast(0.0); + BBox clipped_bbox; + clipped_bbox.x_min = std::max(std::min(bbox.x_min, one), zero); + clipped_bbox.y_min = std::max(std::min(bbox.y_min, one), zero); + clipped_bbox.x_max = std::max(std::min(bbox.x_max, one), zero); + clipped_bbox.y_max = std::max(std::min(bbox.y_max, one), zero); + return clipped_bbox; +} +template +void getDetectionOutput( + const T* conf_data, const size_t num_kept, const size_t num_priors, + const size_t num_classes, const size_t batch_size, + const std::vector>>& all_indices, + const std::vector>>& all_decoded_bboxes, T* out_data) { + size_t count = 0; + for (size_t n = 0; n < batch_size; ++n) { + for (std::map>::const_iterator it = + all_indices[n].begin(); + it != all_indices[n].end(); ++it) { + size_t label = it->first; + const std::vector& indices = it->second; + const std::vector>& decoded_bboxes = all_decoded_bboxes[n]; + for (size_t i = 0; i < indices.size(); ++i) { + size_t idx = indices[i]; + size_t conf_offset = n * num_priors * num_classes + idx * num_classes; + out_data[count * 7] = n; + out_data[count * 7 + 1] = label; + out_data[count * 7 + 2] = (conf_data + conf_offset)[label]; + BBox clipped_bbox = clipBBox(decoded_bboxes[idx]); + out_data[count * 7 + 3] = clipped_bbox.x_min; + out_data[count * 7 + 4] = clipped_bbox.y_min; + out_data[count * 7 + 5] = clipped_bbox.x_max; + out_data[count * 7 + 6] = clipped_bbox.y_max; + ++count; + } + } + } + // out.copyFrom(out_data, num_kept * 7); +} +} // namespace math +} // namespace operators +} // namespace paddle From fe177b629207aca199ac32d6856455aa68c78c42 Mon Sep 17 00:00:00 2001 From: sweetsky0901 Date: Sat, 9 Dec 2017 22:25:01 +0800 Subject: [PATCH 002/181] test detection_output cpu and gpu ok, but doc will be modify --- paddle/operators/detection_output_op.cc | 15 +-- paddle/operators/detection_output_op.h | 95 ++++++++++++++----- paddle/operators/math/detection_util.h | 22 ++--- .../fluid/tests/test_detection_output_op.py | 55 +++++++++++ 4 files changed, 145 
insertions(+), 42 deletions(-) create mode 100644 python/paddle/v2/fluid/tests/test_detection_output_op.py diff --git a/paddle/operators/detection_output_op.cc b/paddle/operators/detection_output_op.cc index c018795fd4..a04d6e5758 100644 --- a/paddle/operators/detection_output_op.cc +++ b/paddle/operators/detection_output_op.cc @@ -65,17 +65,18 @@ class Detection_output_Op : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; void InferShape(framework::InferShapeContext* ctx) const override { - PADDLE_ENFORCE(ctx->HasInput("X"), + PADDLE_ENFORCE(ctx->HasInput("Loc"), + "Input(X) of Detection_output_Op" + "should not be null."); + PADDLE_ENFORCE(ctx->HasInput("Conf"), + "Input(X) of Detection_output_Op" + "should not be null."); + PADDLE_ENFORCE(ctx->HasInput("PriorBox"), "Input(X) of Detection_output_Op" "should not be null."); PADDLE_ENFORCE(ctx->HasOutput("Out"), "Output(Out) of Detection_output_Op should not be null."); - auto in_x_dims = ctx->GetInputDim("X"); - int pyramid_height = ctx->Attrs().Get("pyramid_height"); - PADDLE_ENFORCE(in_x_dims.size() == 4, - "Detection_output_ing intput must be of 4-dimensional."); - int outlen = ((std::pow(4, pyramid_height) - 1) / (4 - 1)) * in_x_dims[1]; - std::vector output_shape({in_x_dims[0], outlen}); + std::vector output_shape({1, 7}); ctx->SetOutputDim("Out", framework::make_ddim(output_shape)); } }; diff --git a/paddle/operators/detection_output_op.h b/paddle/operators/detection_output_op.h index 184b864974..d03452ff8d 100644 --- a/paddle/operators/detection_output_op.h +++ b/paddle/operators/detection_output_op.h @@ -40,6 +40,9 @@ class Detection_output_Kernel : public framework::OpKernel { int input_num = in_loc->dims()[0]; int batch_size = in_loc->dims()[1]; + int channels = in_loc->dims()[2]; + int height = in_loc->dims()[3]; + int weight = in_loc->dims()[4]; int loc_sum_size = in_loc->numel(); int conf_sum_size = in_conf->numel(); std::vector loc_shape_vec({1, loc_sum_size}); @@ -49,17 +52,62 @@ class Detection_output_Kernel : public framework::OpKernel { framework::DDim conf_shape(framework::make_ddim(conf_shape_vec)); framework::Tensor loc_tensor; framework::Tensor conf_tensor; + loc_tensor.Resize(loc_shape); + conf_tensor.Resize(conf_shape); loc_tensor.mutable_data(loc_shape, context.GetPlace()); conf_tensor.mutable_data(conf_shape, context.GetPlace()); + framework::Tensor loc_cpu; + framework::Tensor conf_cpu; + framework::Tensor priorbox_cpu; + const T* in_loc_data = in_loc->data(); + const T* in_conf_data = in_conf->data(); + T* loc_data; + T* conf_data; + const T* priorbox_data = in_priorbox->data(); - // KNCHW ==> NHWC + if (platform::is_gpu_place(context.GetPlace())) { + loc_cpu.mutable_data(in_loc->dims(), platform::CPUPlace()); + framework::CopyFrom(*in_loc, platform::CPUPlace(), + context.device_context(), &loc_cpu); + in_loc_data = loc_cpu.data(); + conf_cpu.mutable_data(in_conf->dims(), platform::CPUPlace()); + framework::CopyFrom(*in_conf, platform::CPUPlace(), + context.device_context(), &conf_cpu); + in_conf_data = conf_cpu.data(); + priorbox_cpu.mutable_data(in_priorbox->dims(), platform::CPUPlace()); + framework::CopyFrom(*in_priorbox, platform::CPUPlace(), + context.device_context(), &priorbox_cpu); + priorbox_data = priorbox_cpu.data(); + loc_tensor.mutable_data(loc_shape, platform::CPUPlace()); + conf_tensor.mutable_data(conf_shape, platform::CPUPlace()); + } + T* loc_tensor_data = loc_tensor.data(); + T* conf_tensor_data = conf_tensor.data(); for (int i = 0; i < 
input_num; ++i) { - math::appendWithPermute(*in_loc, &loc_tensor); - math::appendWithPermute(*in_conf, &conf_tensor); + math::appendWithPermute(in_loc_data, input_num, batch_size, channels, + height, weight, loc_tensor_data); + math::appendWithPermute(in_conf_data, input_num, batch_size, channels, + height, weight, conf_tensor_data); + } + loc_data = loc_tensor.data(); + if (platform::is_gpu_place(context.GetPlace())) { + framework::Tensor conf_gpu; + conf_gpu.Resize(conf_shape); + conf_gpu.mutable_data(conf_shape, context.GetPlace()); + framework::CopyFrom(conf_tensor, platform::GPUPlace(), + context.device_context(), &conf_gpu); + // softmax + math::SoftmaxFunctor()(context.device_context(), &conf_gpu, + &conf_gpu); + conf_tensor.mutable_data(conf_gpu.dims(), platform::CPUPlace()); + framework::CopyFrom(conf_gpu, platform::CPUPlace(), + context.device_context(), &conf_tensor); + } else { + // softmax + math::SoftmaxFunctor()(context.device_context(), &conf_tensor, + &conf_tensor); } - // softmax - math::SoftmaxFunctor()(context.device_context(), &conf_tensor, - &conf_tensor); + conf_data = conf_tensor.data(); // get decode bboxes size_t num_priors = in_priorbox->numel() / 8; std::vector>> all_decoded_bboxes; @@ -69,29 +117,26 @@ class Detection_output_Kernel : public framework::OpKernel { size_t prior_offset = i * 8; size_t loc_pred_offset = n * num_priors * 4 + i * 4; std::vector> prior_bbox_vec; - math::getBBoxFromPriorData(in_priorbox->data() + prior_offset, 1, + math::getBBoxFromPriorData(priorbox_data + prior_offset, 1, prior_bbox_vec); std::vector> prior_bbox_var; - math::getBBoxVarFromPriorData(in_priorbox->data() + prior_offset, - 1, prior_bbox_var); + math::getBBoxVarFromPriorData(priorbox_data + prior_offset, 1, + prior_bbox_var); std::vector loc_pred_data; for (size_t j = 0; j < 4; ++j) - loc_pred_data.push_back( - *(loc_tensor.data() + loc_pred_offset + j)); + loc_pred_data.push_back(*(loc_data + loc_pred_offset + j)); math::BBox bbox = math::decodeBBoxWithVar( prior_bbox_vec[0], prior_bbox_var[0], loc_pred_data); decoded_bboxes.push_back(bbox); } all_decoded_bboxes.push_back(decoded_bboxes); } - std::vector>> all_indices; int num_kept = math::getDetectionIndices( - conf_tensor.data(), num_priors, num_classes, background_label_id, - batch_size, confidence_threshold, nms_top_k, nms_threshold, top_k, + conf_data, num_priors, num_classes, background_label_id, batch_size, + confidence_threshold, nms_top_k, nms_threshold, top_k, all_decoded_bboxes, &all_indices); - framework::Tensor out_tmp; if (num_kept <= 0) { std::vector out_shape_vec({0, 0}); framework::DDim out_shape(framework::make_ddim(out_shape_vec)); @@ -100,14 +145,20 @@ class Detection_output_Kernel : public framework::OpKernel { } std::vector out_shape_vec({num_kept, 7}); framework::DDim out_shape(framework::make_ddim(out_shape_vec)); - out_tmp.mutable_data(out_shape, context.GetPlace()); - - T* out_data = out_tmp.data(); - math::getDetectionOutput(conf_tensor.data(), num_kept, num_priors, - num_classes, batch_size, all_indices, - all_decoded_bboxes, out_data); out->mutable_data(out_shape, context.GetPlace()); - out->ShareDataWith(out_tmp); + framework::Tensor out_cpu; + T* out_data = out->data(); + if (platform::is_gpu_place(context.GetPlace())) { + out_cpu.mutable_data(out->dims(), platform::CPUPlace()); + out_data = out_cpu.data(); + } + math::getDetectionOutput(conf_data, num_kept, num_priors, num_classes, + batch_size, all_indices, all_decoded_bboxes, + out_data); + if (platform::is_gpu_place(context.GetPlace())) 
{ + framework::CopyFrom(out_cpu, platform::GPUPlace(), + context.device_context(), out); + } } }; } // namespace operators diff --git a/paddle/operators/math/detection_util.h b/paddle/operators/math/detection_util.h index 265fa07701..12d9ca9da8 100644 --- a/paddle/operators/math/detection_util.h +++ b/paddle/operators/math/detection_util.h @@ -50,27 +50,23 @@ struct BBox { }; // KNCHW ==> NHWC template -int appendWithPermute(const framework::Tensor& input, - framework::Tensor* output) { - const int input_nums = input.dims()[0]; - const int batch_size = input.dims()[1]; - const int channels = input.dims()[2]; - const int height = input.dims()[3]; - const int weight = input.dims()[4]; +int appendWithPermute(const T* input_data, int input_nums, int batch_size, + int channels, int height, int weight, T* output_data) { int image_size = height * weight; + int numel = input_nums * batch_size * channels * height * weight; int offset = 0; for (int p = 0; p < input_nums; ++p) { int in_p_offset = p * batch_size * channels * image_size; for (int n = 0; n < batch_size; ++n) { int in_n_offset = n * channels * image_size; - int out_n_offset = n * input.numel() / batch_size + offset; + int out_n_offset = n * numel / batch_size + offset; int in_stride = image_size; int out_stride = channels; - const T* in_data = input.data() + in_p_offset + in_n_offset; - T* out_data = output->data() + out_n_offset; - for (int i = 0; i < channels; ++i) { - for (int c = 0; c < image_size; ++c) { - out_data[out_stride * c + i] = in_data[i * in_stride + c]; + const T* in_data = input_data + in_p_offset + in_n_offset; + T* out_data = output_data + out_n_offset; + for (int c = 0; c < channels; ++c) { + for (int i = 0; i < image_size; ++i) { + out_data[out_stride * i + c] = in_data[c * in_stride + i]; } } } diff --git a/python/paddle/v2/fluid/tests/test_detection_output_op.py b/python/paddle/v2/fluid/tests/test_detection_output_op.py new file mode 100644 index 0000000000..56cd5dde9f --- /dev/null +++ b/python/paddle/v2/fluid/tests/test_detection_output_op.py @@ -0,0 +1,55 @@ +import unittest +import numpy as np +from op_test import OpTest + + +class TestUnpoolOp(OpTest): + def setUp(self): + self.op_type = "detection_output" + self.init_test_case() + + #loc = np.zeros((1, 4, 4, 1, 1)) + #conf = np.zero((1, 4, 2, 1, 1)) + + loc = np.array([[[[[0.1]], [[0.1]], [[0.1]], [[0.1]]], + [[[0.1]], [[0.1]], [[0.1]], [[0.1]]], + [[[0.1]], [[0.1]], [[0.1]], [[0.1]]], + [[[0.1]], [[0.1]], [[0.1]], [[0.1]]]]]) + conf = np.array([[[[[0.1]], [[0.9]]], [[[0.2]], [[0.8]]]], + [[[[0.3]], [[0.7]]], [[[0.4]], [[0.6]]]]]) + priorbox = np.array([0.1, 0.1, 0.5, 0.5, 0.1, 0.1, 0.2, 0.2,\ + 0.2, 0.2, 0.6, 0.6, 0.1, 0.1, 0.2, 0.2,\ + 0.3, 0.3, 0.7, 0.7, 0.1, 0.1, 0.2, 0.2,\ + 0.4, 0.4, 0.8, 0.8, 0.1, 0.1, 0.2, 0.2]) + + output = np.array([0, 1, 0.68997443, 0.099959746, 0.099959746,\ + 0.50804031, 0.50804031]) + self.inputs = { + 'Loc': loc.astype('float32'), + 'Conf': conf.astype('float32'), + 'PriorBox': priorbox.astype('float32') + } + self.attrs = { + 'num_classes': self.num_classes, + 'top_k': self.top_k, + 'nms_top_k': self.nms_top_k, + 'background_label_id': self.background_label_id, + 'nms_threshold': self.nms_threshold, + 'confidence_threshold': self.confidence_threshold, + } + self.outputs = {'Out': output.astype('float32')} + + def test_check_output(self): + self.check_output() + + def init_test_case(self): + self.num_classes = 2 + self.top_k = 10 + self.nms_top_k = 20 + self.background_label_id = 0 + self.nms_threshold = 0.01 + 
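        # editor's note: both thresholds are deliberately tiny -- every
        # softmaxed score clears confidence_threshold, while almost any
        # overlap exceeds nms_threshold, so only the best-scoring box is
        # expected to survive NMS.
        # Hand-derived check of the expected output row above (assuming the
        # decoding in detection_util.h): for the prior (0.1, 0.1, 0.5, 0.5)
        # with variances (0.1, 0.1, 0.2, 0.2) and all four loc predictions
        # equal to 0.1,
        #   center = 0.1 * 0.1 * 0.4 + 0.3 = 0.304
        #   size   = exp(0.2 * 0.1) * 0.4 ~= 0.40808
        # which gives the box (0.09995975, 0.09995975, 0.50804031, 0.50804031),
        # and softmax([0.1, 0.9]) ~= (0.31003, 0.68997) gives the confidence.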
self.confidence_threshold = 0.01 + + +if __name__ == '__main__': + unittest.main() From 65b641bf660a8ecbcb831dc0b35a1e58bc15174a Mon Sep 17 00:00:00 2001 From: sweetsky0901 Date: Mon, 11 Dec 2017 22:56:41 +0800 Subject: [PATCH 003/181] add detection_output op --- paddle/operators/detection_output_op.cc | 63 +++++------ paddle/operators/detection_output_op.h | 102 +++++++++--------- paddle/operators/math/detection_util.h | 70 +++++++----- .../fluid/tests/test_detection_output_op.py | 24 +++-- 4 files changed, 133 insertions(+), 126 deletions(-) diff --git a/paddle/operators/detection_output_op.cc b/paddle/operators/detection_output_op.cc index a04d6e5758..ced9caf992 100644 --- a/paddle/operators/detection_output_op.cc +++ b/paddle/operators/detection_output_op.cc @@ -21,42 +21,37 @@ class Detection_output_OpMaker : public framework::OpProtoAndCheckerMaker { Detection_output_OpMaker(framework::OpProto* proto, framework::OpAttrChecker* op_checker) : OpProtoAndCheckerMaker(proto, op_checker) { - AddInput( - "Loc", - "(Tensor) The input tensor of detection_output operator. " - "The format of input tensor is NCHW. Where N is batch size, C is the " - "number of channels, H and W is the height and width of feature."); - AddInput( - "Conf", - "(Tensor) The input tensor of detection_output operator. " - "The format of input tensor is NCHW. Where N is batch size, C is the " - "number of channels, H and W is the height and width of feature."); - AddInput( - "PriorBox", - "(Tensor) The input tensor of detection_output operator. " - "The format of input tensor is NCHW. Where N is batch size, C is the " - "number of channels, H and W is the height and width of feature."); + AddInput("Loc", + "(Tensor) The input tensor of detection_output operator. " + "The format of input tensor is kNCHW. Where K is priorbox point " + "numbers," + "N is How many boxes are there on each point, " + "C is 4, H and W both are 1."); + AddInput("Conf", + "(Tensor) The input tensor of detection_output operator. " + "The format of input tensor is kNCHW. Where K is priorbox point " + "numbers," + "N is How many boxes are there on each point, " + "C is the number of classes, H and W both are 1."); + AddInput("PriorBox", + "(Tensor) The input tensor of detection_output operator. " + "The format of input tensor is the position and variance " + "of the boxes"); AddOutput("Out", - "(Tensor) The output tensor of detection_output operator." - "N * M." - "M = C * H * W"); - AddAttr("background_label_id", "(int), multi level pooling"); - AddAttr("num_classes", "(int), multi level pooling"); - AddAttr("nms_threshold", "(int), multi level pooling"); - AddAttr("confidence_threshold", "(int), multi level pooling"); - AddAttr("top_k", "(int), multi level pooling"); - AddAttr("nms_top_k", "(int), multi level pooling"); + "(Tensor) The output tensor of detection_output operator."); + AddAttr("background_label_id", + "(int), the attr of detection_output operator"); + AddAttr("num_classes", + "(int), the attr of detection_output operator"); + AddAttr("nms_threshold", + "(float), the attr of detection_output operator"); + AddAttr("confidence_threshold", + "(float), the attr of detection_output operator"); + AddAttr("top_k", "(int), the attr of detection_output operator"); + AddAttr("nms_top_k", "(int), the attr of detection_output operator"); AddComment(R"DOC( - "Does spatial pyramid pooling on the input image by taking the max, - etc. 
within regions so that the result vector of different sized - images are of the same size - Input shape: $(N, C_{in}, H_{in}, W_{in})$ - Output shape: $(H_{out}, W_{out})$ - Where - $$ - H_{out} = N \\ - W_{out} = (((4^pyramid_height) - 1) / (4 - 1))$ * C_{in} - $$ + detection output for SSD(single shot multibox detector) + )DOC"); } }; diff --git a/paddle/operators/detection_output_op.h b/paddle/operators/detection_output_op.h index d03452ff8d..508e3d6939 100644 --- a/paddle/operators/detection_output_op.h +++ b/paddle/operators/detection_output_op.h @@ -18,10 +18,34 @@ limitations under the License. */ #include "paddle/operators/math/detection_util.h" #include "paddle/operators/math/math_function.h" #include "paddle/operators/math/softmax.h" - +#include "paddle/operators/strided_memcpy.h" namespace paddle { namespace operators { template +void transpose_fun(const platform::DeviceContext& context, + const framework::Tensor& src, framework::Tensor* dst) { + int input_nums = src.dims()[0]; + int offset = 0; + for (int j = 0; j < input_nums; ++j) { + framework::Tensor in_p_tensor = src.Slice(j, j + 1); + std::vector shape_vec( + {in_p_tensor.dims()[0], in_p_tensor.dims()[1], in_p_tensor.dims()[3], + in_p_tensor.dims()[4], in_p_tensor.dims()[2]}); + framework::DDim shape(framework::make_ddim(shape_vec)); + framework::Tensor in_p_tensor_transpose; + in_p_tensor_transpose.mutable_data(shape, context.GetPlace()); + std::vector shape_axis({0, 1, 3, 4, 2}); + math::Transpose trans5; + trans5(context, in_p_tensor, &in_p_tensor_transpose, shape_axis); + auto dst_stride = framework::stride(dst->dims()); + auto src_stride = framework::stride(in_p_tensor_transpose.dims()); + StridedMemcpy(context, in_p_tensor_transpose.data(), src_stride, + in_p_tensor_transpose.dims(), dst_stride, + dst->data() + offset); + offset += in_p_tensor_transpose.dims()[4] * src_stride[4]; + } +} +template class Detection_output_Kernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& context) const override { @@ -37,77 +61,51 @@ class Detection_output_Kernel : public framework::OpKernel { float nms_threshold = context.template Attr("nms_threshold"); float confidence_threshold = context.template Attr("confidence_threshold"); - - int input_num = in_loc->dims()[0]; - int batch_size = in_loc->dims()[1]; - int channels = in_loc->dims()[2]; - int height = in_loc->dims()[3]; - int weight = in_loc->dims()[4]; - int loc_sum_size = in_loc->numel(); + int batch_size = in_conf->dims()[1]; int conf_sum_size = in_conf->numel(); - std::vector loc_shape_vec({1, loc_sum_size}); - std::vector conf_shape_vec( + // for softmax + std::vector conf_shape_softmax_vec( {conf_sum_size / num_classes, num_classes}); + framework::DDim conf_shape_softmax( + framework::make_ddim(conf_shape_softmax_vec)); + // for knchw => nhwc + std::vector loc_shape_vec({1, in_loc->dims()[1], in_loc->dims()[3], + in_loc->dims()[4], in_loc->dims()[2]}); + std::vector conf_shape_vec({1, in_conf->dims()[1], + in_conf->dims()[3], in_conf->dims()[4], + in_conf->dims()[2]}); framework::DDim loc_shape(framework::make_ddim(loc_shape_vec)); framework::DDim conf_shape(framework::make_ddim(conf_shape_vec)); framework::Tensor loc_tensor; framework::Tensor conf_tensor; - loc_tensor.Resize(loc_shape); - conf_tensor.Resize(conf_shape); loc_tensor.mutable_data(loc_shape, context.GetPlace()); conf_tensor.mutable_data(conf_shape, context.GetPlace()); + // for cpu framework::Tensor loc_cpu; framework::Tensor conf_cpu; framework::Tensor priorbox_cpu; - 
const T* in_loc_data = in_loc->data(); - const T* in_conf_data = in_conf->data(); - T* loc_data; - T* conf_data; const T* priorbox_data = in_priorbox->data(); - + transpose_fun(context.device_context(), *in_loc, &loc_tensor); + transpose_fun(context.device_context(), *in_conf, &conf_tensor); + conf_tensor.Resize(conf_shape_softmax); + math::SoftmaxFunctor()(context.device_context(), &conf_tensor, + &conf_tensor); + T* loc_data = loc_tensor.data(); + T* conf_data = conf_tensor.data(); if (platform::is_gpu_place(context.GetPlace())) { - loc_cpu.mutable_data(in_loc->dims(), platform::CPUPlace()); - framework::CopyFrom(*in_loc, platform::CPUPlace(), + loc_cpu.mutable_data(loc_tensor.dims(), platform::CPUPlace()); + framework::CopyFrom(loc_tensor, platform::CPUPlace(), context.device_context(), &loc_cpu); - in_loc_data = loc_cpu.data(); - conf_cpu.mutable_data(in_conf->dims(), platform::CPUPlace()); - framework::CopyFrom(*in_conf, platform::CPUPlace(), + loc_data = loc_cpu.data(); + conf_cpu.mutable_data(conf_tensor.dims(), platform::CPUPlace()); + framework::CopyFrom(conf_tensor, platform::CPUPlace(), context.device_context(), &conf_cpu); - in_conf_data = conf_cpu.data(); + conf_data = conf_cpu.data(); priorbox_cpu.mutable_data(in_priorbox->dims(), platform::CPUPlace()); framework::CopyFrom(*in_priorbox, platform::CPUPlace(), context.device_context(), &priorbox_cpu); priorbox_data = priorbox_cpu.data(); - loc_tensor.mutable_data(loc_shape, platform::CPUPlace()); - conf_tensor.mutable_data(conf_shape, platform::CPUPlace()); - } - T* loc_tensor_data = loc_tensor.data(); - T* conf_tensor_data = conf_tensor.data(); - for (int i = 0; i < input_num; ++i) { - math::appendWithPermute(in_loc_data, input_num, batch_size, channels, - height, weight, loc_tensor_data); - math::appendWithPermute(in_conf_data, input_num, batch_size, channels, - height, weight, conf_tensor_data); - } - loc_data = loc_tensor.data(); - if (platform::is_gpu_place(context.GetPlace())) { - framework::Tensor conf_gpu; - conf_gpu.Resize(conf_shape); - conf_gpu.mutable_data(conf_shape, context.GetPlace()); - framework::CopyFrom(conf_tensor, platform::GPUPlace(), - context.device_context(), &conf_gpu); - // softmax - math::SoftmaxFunctor()(context.device_context(), &conf_gpu, - &conf_gpu); - conf_tensor.mutable_data(conf_gpu.dims(), platform::CPUPlace()); - framework::CopyFrom(conf_gpu, platform::CPUPlace(), - context.device_context(), &conf_tensor); - } else { - // softmax - math::SoftmaxFunctor()(context.device_context(), &conf_tensor, - &conf_tensor); } - conf_data = conf_tensor.data(); // get decode bboxes size_t num_priors = in_priorbox->numel() / 8; std::vector>> all_decoded_bboxes; diff --git a/paddle/operators/math/detection_util.h b/paddle/operators/math/detection_util.h index 12d9ca9da8..b671f7b517 100644 --- a/paddle/operators/math/detection_util.h +++ b/paddle/operators/math/detection_util.h @@ -12,13 +12,13 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ #pragma once +#include #include "paddle/framework/selected_rows.h" #include "paddle/platform/device_context.h" namespace paddle { namespace operators { namespace math { - template struct BBox { BBox(T x_min, T y_min, T x_max, T y_max) @@ -49,31 +49,47 @@ struct BBox { bool is_difficult; }; // KNCHW ==> NHWC +// template template -int appendWithPermute(const T* input_data, int input_nums, int batch_size, - int channels, int height, int weight, T* output_data) { - int image_size = height * weight; - int numel = input_nums * batch_size * channels * height * weight; - int offset = 0; - for (int p = 0; p < input_nums; ++p) { - int in_p_offset = p * batch_size * channels * image_size; - for (int n = 0; n < batch_size; ++n) { - int in_n_offset = n * channels * image_size; - int out_n_offset = n * numel / batch_size + offset; - int in_stride = image_size; - int out_stride = channels; - const T* in_data = input_data + in_p_offset + in_n_offset; - T* out_data = output_data + out_n_offset; - for (int c = 0; c < channels; ++c) { - for (int i = 0; i < image_size; ++i) { - out_data[out_stride * i + c] = in_data[c * in_stride + i]; - } - } - } - offset += image_size * channels; - } - return 0; -} +void getBBoxFromPriorData(const T* prior_data, const size_t num_bboxes, + std::vector>& bbox_vec); +template +void getBBoxVarFromPriorData(const T* prior_data, const size_t num, + std::vector>& var_vec); +template +BBox decodeBBoxWithVar(BBox& prior_bbox, + const std::vector& prior_bbox_var, + const std::vector& loc_pred_data); +template +bool sortScorePairDescend(const std::pair& pair1, + const std::pair& pair2); +template +bool sortScorePairDescend(const std::pair>& pair1, + const std::pair>& pair2); +template +T jaccardOverlap(const BBox& bbox1, const BBox& bbox2); + +template +void applyNMSFast(const std::vector>& bboxes, const T* conf_score_data, + size_t class_idx, size_t top_k, T conf_threshold, + T nms_threshold, size_t num_priors, size_t num_classes, + std::vector* indices); +template +int getDetectionIndices( + const T* conf_data, const size_t num_priors, const size_t num_classes, + const size_t background_label_id, const size_t batch_size, + const T conf_threshold, const size_t nms_top_k, const T nms_threshold, + const size_t top_k, + const std::vector>>& all_decoded_bboxes, + std::vector>>* all_detection_indices); +template +BBox clipBBox(const BBox& bbox); +template +void getDetectionOutput( + const T* conf_data, const size_t num_kept, const size_t num_priors, + const size_t num_classes, const size_t batch_size, + const std::vector>>& all_indices, + const std::vector>>& all_decoded_bboxes, T* out_data); template void getBBoxFromPriorData(const T* prior_data, const size_t num_bboxes, std::vector>& bbox_vec) { @@ -136,9 +152,6 @@ bool sortScorePairDescend(const std::pair& pair1, return pair1.first > pair2.first; } template -bool sortScorePairDescend(const std::pair>& pair1, - const std::pair>& pair2); -template T jaccardOverlap(const BBox& bbox1, const BBox& bbox2) { if (bbox2.x_min > bbox1.x_max || bbox2.x_max < bbox1.x_min || bbox2.y_min > bbox1.y_max || bbox2.y_max < bbox1.y_min) { @@ -281,7 +294,6 @@ void getDetectionOutput( } } } - // out.copyFrom(out_data, num_kept * 7); } } // namespace math } // namespace operators diff --git a/python/paddle/v2/fluid/tests/test_detection_output_op.py b/python/paddle/v2/fluid/tests/test_detection_output_op.py index 56cd5dde9f..080a9743b0 100644 --- a/python/paddle/v2/fluid/tests/test_detection_output_op.py +++ 
b/python/paddle/v2/fluid/tests/test_detection_output_op.py @@ -8,22 +8,24 @@ class TestUnpoolOp(OpTest): self.op_type = "detection_output" self.init_test_case() - #loc = np.zeros((1, 4, 4, 1, 1)) - #conf = np.zero((1, 4, 2, 1, 1)) + #loc.shape ((1, 4, 4, 1, 1)) + #conf.shape ((1, 4, 2, 1, 1)) loc = np.array([[[[[0.1]], [[0.1]], [[0.1]], [[0.1]]], [[[0.1]], [[0.1]], [[0.1]], [[0.1]]], [[[0.1]], [[0.1]], [[0.1]], [[0.1]]], [[[0.1]], [[0.1]], [[0.1]], [[0.1]]]]]) - conf = np.array([[[[[0.1]], [[0.9]]], [[[0.2]], [[0.8]]]], - [[[[0.3]], [[0.7]]], [[[0.4]], [[0.6]]]]]) - priorbox = np.array([0.1, 0.1, 0.5, 0.5, 0.1, 0.1, 0.2, 0.2,\ - 0.2, 0.2, 0.6, 0.6, 0.1, 0.1, 0.2, 0.2,\ - 0.3, 0.3, 0.7, 0.7, 0.1, 0.1, 0.2, 0.2,\ - 0.4, 0.4, 0.8, 0.8, 0.1, 0.1, 0.2, 0.2]) - - output = np.array([0, 1, 0.68997443, 0.099959746, 0.099959746,\ - 0.50804031, 0.50804031]) + conf = np.array([[[[[0.1]], [[0.9]]], [[[0.2]], [[0.8]]], + [[[0.3]], [[0.7]]], [[[0.4]], [[0.6]]]]]) + priorbox = np.array([ + 0.1, 0.1, 0.5, 0.5, 0.1, 0.1, 0.2, 0.2, 0.2, 0.2, 0.6, 0.6, 0.1, + 0.1, 0.2, 0.2, 0.3, 0.3, 0.7, 0.7, 0.1, 0.1, 0.2, 0.2, 0.4, 0.4, + 0.8, 0.8, 0.1, 0.1, 0.2, 0.2 + ]) + + output = np.array([ + 0, 1, 0.68997443, 0.099959746, 0.099959746, 0.50804031, 0.50804031 + ]) self.inputs = { 'Loc': loc.astype('float32'), 'Conf': conf.astype('float32'), From c65d2fc356d1626ffcd1c64e2ab0729ec0bf89c8 Mon Sep 17 00:00:00 2001 From: sweetsky0901 Date: Mon, 11 Dec 2017 23:17:33 +0800 Subject: [PATCH 004/181] add inline --- paddle/operators/detection_output_op.h | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/paddle/operators/detection_output_op.h b/paddle/operators/detection_output_op.h index 508e3d6939..74a609d0a4 100644 --- a/paddle/operators/detection_output_op.h +++ b/paddle/operators/detection_output_op.h @@ -22,8 +22,9 @@ limitations under the License. 
*/ namespace paddle { namespace operators { template -void transpose_fun(const platform::DeviceContext& context, - const framework::Tensor& src, framework::Tensor* dst) { +inline void transpose_fun(const platform::DeviceContext& context, + const framework::Tensor& src, + framework::Tensor* dst) { int input_nums = src.dims()[0]; int offset = 0; for (int j = 0; j < input_nums; ++j) { From 5fe4d7fb6b03164b11ba6074da848c241177adf9 Mon Sep 17 00:00:00 2001 From: sweetsky0901 Date: Mon, 11 Dec 2017 23:21:26 +0800 Subject: [PATCH 005/181] modify a bug *input_nums --- paddle/operators/detection_output_op.h | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/paddle/operators/detection_output_op.h b/paddle/operators/detection_output_op.h index 74a609d0a4..510d82251d 100644 --- a/paddle/operators/detection_output_op.h +++ b/paddle/operators/detection_output_op.h @@ -71,10 +71,11 @@ class Detection_output_Kernel : public framework::OpKernel { framework::make_ddim(conf_shape_softmax_vec)); // for knchw => nhwc std::vector loc_shape_vec({1, in_loc->dims()[1], in_loc->dims()[3], - in_loc->dims()[4], in_loc->dims()[2]}); - std::vector conf_shape_vec({1, in_conf->dims()[1], - in_conf->dims()[3], in_conf->dims()[4], - in_conf->dims()[2]}); + in_loc->dims()[4], + in_loc->dims()[2] * in_loc->dims()[0]}); + std::vector conf_shape_vec( + {1, in_conf->dims()[1], in_conf->dims()[3], in_conf->dims()[4], + in_conf->dims()[2] * in_conf->dims()[0]}); framework::DDim loc_shape(framework::make_ddim(loc_shape_vec)); framework::DDim conf_shape(framework::make_ddim(conf_shape_vec)); framework::Tensor loc_tensor; From b34df5f12c6d92d7785edc2dfc32f899db2d7745 Mon Sep 17 00:00:00 2001 From: sweetsky0901 Date: Tue, 12 Dec 2017 15:31:41 +0800 Subject: [PATCH 006/181] add some doc --- paddle/operators/detection_output_op.cc | 27 ++++++++++++++----------- 1 file changed, 15 insertions(+), 12 deletions(-) diff --git a/paddle/operators/detection_output_op.cc b/paddle/operators/detection_output_op.cc index ced9caf992..2bf0ef4414 100644 --- a/paddle/operators/detection_output_op.cc +++ b/paddle/operators/detection_output_op.cc @@ -22,36 +22,39 @@ class Detection_output_OpMaker : public framework::OpProtoAndCheckerMaker { framework::OpAttrChecker* op_checker) : OpProtoAndCheckerMaker(proto, op_checker) { AddInput("Loc", - "(Tensor) The input tensor of detection_output operator. " + "(Tensor) The input tensor of detection_output operator." + "The input predict locations" "The format of input tensor is kNCHW. Where K is priorbox point " "numbers," "N is How many boxes are there on each point, " "C is 4, H and W both are 1."); AddInput("Conf", - "(Tensor) The input tensor of detection_output operator. " + "(Tensor) The input tensor of detection_output operator." + "The input priorbox confidence." "The format of input tensor is kNCHW. Where K is priorbox point " "numbers," "N is How many boxes are there on each point, " "C is the number of classes, H and W both are 1."); AddInput("PriorBox", - "(Tensor) The input tensor of detection_output operator. " + "(Tensor) The input tensor of detection_output operator." 
"The format of input tensor is the position and variance " "of the boxes"); AddOutput("Out", "(Tensor) The output tensor of detection_output operator."); - AddAttr("background_label_id", - "(int), the attr of detection_output operator"); - AddAttr("num_classes", - "(int), the attr of detection_output operator"); + AddAttr("background_label_id", "(int), The background class index."); + AddAttr("num_classes", "(int), The number of the classification."); AddAttr("nms_threshold", - "(float), the attr of detection_output operator"); + "(float), The Non-maximum suppression threshold."); AddAttr("confidence_threshold", - "(float), the attr of detection_output operator"); - AddAttr("top_k", "(int), the attr of detection_output operator"); - AddAttr("nms_top_k", "(int), the attr of detection_output operator"); + "(float), The classification confidence threshold."); + AddAttr("top_k", "(int), The bbox number kept of the layer’s output."); + AddAttr("nms_top_k", + "(int), The bbox number kept of the NMS’s output."); AddComment(R"DOC( detection output for SSD(single shot multibox detector) - + Apply the NMS to the output of network and compute the predict + bounding box location. The output’s shape of this layer could + be zero if there is no valid bounding box. )DOC"); } }; From a3addcdc59e1dffdbe429942a3543c44b2526fba Mon Sep 17 00:00:00 2001 From: sweetsky0901 Date: Tue, 12 Dec 2017 21:09:38 +0800 Subject: [PATCH 007/181] modify for some update in trunk --- paddle/operators/CMakeLists.txt | 4 +++- paddle/operators/detection_output_op.cc | 4 ++-- paddle/operators/detection_output_op.cu.cc | 6 +++--- paddle/operators/detection_output_op.h | 24 ++++++++++++---------- 4 files changed, 21 insertions(+), 17 deletions(-) diff --git a/paddle/operators/CMakeLists.txt b/paddle/operators/CMakeLists.txt index 5aaaf99332..68346001b1 100644 --- a/paddle/operators/CMakeLists.txt +++ b/paddle/operators/CMakeLists.txt @@ -210,7 +210,8 @@ set(DEPS_OPS save_op load_op send_op - recv_op) + recv_op + detection_output_op) if(WITH_DISTRIBUTE) add_subdirectory(detail) @@ -233,6 +234,7 @@ op_library(cond_op SRCS cond_op.cc DEPS framework_proto tensor operator net_op) op_library(cross_entropy_op DEPS cross_entropy) op_library(softmax_with_cross_entropy_op DEPS cross_entropy softmax) op_library(softmax_op DEPS softmax) +op_library(detection_output_op DEPS softmax) op_library(sequence_softmax_op DEPS softmax) op_library(sum_op DEPS selected_rows_functor) op_library(sgd_op DEPS selected_rows_functor) diff --git a/paddle/operators/detection_output_op.cc b/paddle/operators/detection_output_op.cc index 2bf0ef4414..109cf7d4c7 100644 --- a/paddle/operators/detection_output_op.cc +++ b/paddle/operators/detection_output_op.cc @@ -86,5 +86,5 @@ REGISTER_OP_WITHOUT_GRADIENT(detection_output, ops::Detection_output_Op, ops::Detection_output_OpMaker); REGISTER_OP_CPU_KERNEL( detection_output, - ops::Detection_output_Kernel, - ops::Detection_output_Kernel); + ops::Detection_output_Kernel, + ops::Detection_output_Kernel); diff --git a/paddle/operators/detection_output_op.cu.cc b/paddle/operators/detection_output_op.cu.cc index 8edcfc0be3..e65b2afd21 100644 --- a/paddle/operators/detection_output_op.cu.cc +++ b/paddle/operators/detection_output_op.cu.cc @@ -15,7 +15,7 @@ limitations under the License. 
*/ #include "paddle/operators/detection_output_op.h" namespace ops = paddle::operators; -REGISTER_OP_GPU_KERNEL( +REGISTER_OP_CUDA_KERNEL( detection_output, - ops::Detection_output_Kernel, - ops::Detection_output_Kernel); + ops::Detection_output_Kernel, + ops::Detection_output_Kernel); diff --git a/paddle/operators/detection_output_op.h b/paddle/operators/detection_output_op.h index 510d82251d..733ec3b0ed 100644 --- a/paddle/operators/detection_output_op.h +++ b/paddle/operators/detection_output_op.h @@ -21,8 +21,8 @@ limitations under the License. */ #include "paddle/operators/strided_memcpy.h" namespace paddle { namespace operators { -template -inline void transpose_fun(const platform::DeviceContext& context, +template +inline void transpose_fun(const framework::ExecutionContext& context, const framework::Tensor& src, framework::Tensor* dst) { int input_nums = src.dims()[0]; @@ -36,17 +36,18 @@ inline void transpose_fun(const platform::DeviceContext& context, framework::Tensor in_p_tensor_transpose; in_p_tensor_transpose.mutable_data(shape, context.GetPlace()); std::vector shape_axis({0, 1, 3, 4, 2}); - math::Transpose trans5; - trans5(context, in_p_tensor, &in_p_tensor_transpose, shape_axis); + math::Transpose trans5; + trans5(context.template device_context(), in_p_tensor, + &in_p_tensor_transpose, shape_axis); auto dst_stride = framework::stride(dst->dims()); auto src_stride = framework::stride(in_p_tensor_transpose.dims()); - StridedMemcpy(context, in_p_tensor_transpose.data(), src_stride, - in_p_tensor_transpose.dims(), dst_stride, + StridedMemcpy(context.device_context(), in_p_tensor_transpose.data(), + src_stride, in_p_tensor_transpose.dims(), dst_stride, dst->data() + offset); offset += in_p_tensor_transpose.dims()[4] * src_stride[4]; } } -template +template class Detection_output_Kernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& context) const override { @@ -87,11 +88,12 @@ class Detection_output_Kernel : public framework::OpKernel { framework::Tensor conf_cpu; framework::Tensor priorbox_cpu; const T* priorbox_data = in_priorbox->data(); - transpose_fun(context.device_context(), *in_loc, &loc_tensor); - transpose_fun(context.device_context(), *in_conf, &conf_tensor); + transpose_fun(context, *in_loc, &loc_tensor); + transpose_fun(context, *in_conf, &conf_tensor); conf_tensor.Resize(conf_shape_softmax); - math::SoftmaxFunctor()(context.device_context(), &conf_tensor, - &conf_tensor); + math::SoftmaxFunctor()( + context.template device_context(), &conf_tensor, + &conf_tensor); T* loc_data = loc_tensor.data(); T* conf_data = conf_tensor.data(); if (platform::is_gpu_place(context.GetPlace())) { From ffd4e8c1722cd2a03a3731ffa966da4ba7844262 Mon Sep 17 00:00:00 2001 From: sweetsky0901 Date: Fri, 15 Dec 2017 07:13:37 +0800 Subject: [PATCH 008/181] modify xx_y to xxY --- paddle/operators/detection_output_op.cc | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/paddle/operators/detection_output_op.cc b/paddle/operators/detection_output_op.cc index 109cf7d4c7..ae807d2810 100644 --- a/paddle/operators/detection_output_op.cc +++ b/paddle/operators/detection_output_op.cc @@ -16,10 +16,10 @@ limitations under the License. 
*/ namespace paddle { namespace operators { -class Detection_output_OpMaker : public framework::OpProtoAndCheckerMaker { +class DetectionOutputOpMaker : public framework::OpProtoAndCheckerMaker { public: - Detection_output_OpMaker(framework::OpProto* proto, - framework::OpAttrChecker* op_checker) + DetectionOutputOpMaker(framework::OpProto* proto, + framework::OpAttrChecker* op_checker) : OpProtoAndCheckerMaker(proto, op_checker) { AddInput("Loc", "(Tensor) The input tensor of detection_output operator." @@ -59,21 +59,21 @@ class Detection_output_OpMaker : public framework::OpProtoAndCheckerMaker { } }; -class Detection_output_Op : public framework::OperatorWithKernel { +class DetectionOutputOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; void InferShape(framework::InferShapeContext* ctx) const override { PADDLE_ENFORCE(ctx->HasInput("Loc"), - "Input(X) of Detection_output_Op" + "Input(X) of DetectionOutputOp" "should not be null."); PADDLE_ENFORCE(ctx->HasInput("Conf"), - "Input(X) of Detection_output_Op" + "Input(X) of DetectionOutputOp" "should not be null."); PADDLE_ENFORCE(ctx->HasInput("PriorBox"), - "Input(X) of Detection_output_Op" + "Input(X) of DetectionOutputOp" "should not be null."); PADDLE_ENFORCE(ctx->HasOutput("Out"), - "Output(Out) of Detection_output_Op should not be null."); + "Output(Out) of DetectionOutputOp should not be null."); std::vector output_shape({1, 7}); ctx->SetOutputDim("Out", framework::make_ddim(output_shape)); } @@ -82,8 +82,8 @@ class Detection_output_Op : public framework::OperatorWithKernel { } // namespace paddle namespace ops = paddle::operators; -REGISTER_OP_WITHOUT_GRADIENT(detection_output, ops::Detection_output_Op, - ops::Detection_output_OpMaker); +REGISTER_OP_WITHOUT_GRADIENT(detection_output, ops::DetectionOutputOp, + ops::DetectionOutputOpMaker); REGISTER_OP_CPU_KERNEL( detection_output, ops::Detection_output_Kernel, From f2e76008d37d5a0203fdd42edf4fa1ac2907c400 Mon Sep 17 00:00:00 2001 From: fengjiayi Date: Fri, 15 Dec 2017 15:42:26 +0800 Subject: [PATCH 009/181] update --- python/paddle/v2/fluid/backward.py | 19 +++++++++++++++++++ python/paddle/v2/fluid/framework.py | 6 ++++-- 2 files changed, 23 insertions(+), 2 deletions(-) diff --git a/python/paddle/v2/fluid/backward.py b/python/paddle/v2/fluid/backward.py index f188582178..3a128b8e61 100644 --- a/python/paddle/v2/fluid/backward.py +++ b/python/paddle/v2/fluid/backward.py @@ -1,8 +1,27 @@ from paddle.v2.fluid import framework as framework +from . 
import core __all__ = ['append_backward_ops'] +def backward_impl(block, target_block, no_grad_set, grad_to_var, callback): + grad_op_descs = [] + program = block.program + for each_op in block.ops: + grad_sub_block_list = [] + if each_op.has_attr("sub_block"): + sub_block_idx = each_op.block_attr("sub_block") + sub_block = program.block(sub_block_idx) + grad_sub_block = program.create_block(parent_idx=sub_block_idx) + backward_impl(sub_block, grad_sub_block, no_grad_set, grad_to_var, + callback) + grad_sub_block_list.append(grad_sub_block) + grad_op_desc = core.get_grad_op_desc(each_op.desc, + no_grad_set[block.idx], + grad_to_var, grad_sub_block_list) + grad_op_descs.append(grad_op_desc) + + def append_backward_ops(loss, parameter_list=None, no_grad_set=None): """ Create and add gradient Operators in BlockDesc to compute diff --git a/python/paddle/v2/fluid/framework.py b/python/paddle/v2/fluid/framework.py index bf0cd275b6..244a963936 100644 --- a/python/paddle/v2/fluid/framework.py +++ b/python/paddle/v2/fluid/framework.py @@ -806,9 +806,11 @@ class Program(object): self.sync_with_cpp() return param_to_grad_info - def create_block(self): + def create_block(self, parent_idx=None): new_block_idx = len(self.blocks) - self.desc.append_block(self.current_block().desc) + parent = self.current_block() if parent_idx is None else self.block( + parent_idx) + self.desc.append_block(parent.desc) self.current_block_idx = new_block_idx self.blocks.append(Block(self, self.current_block_idx)) return self.current_block() From 784740d8bee8e9127270edd1288289e9d9c864b8 Mon Sep 17 00:00:00 2001 From: chengduoZH Date: Mon, 11 Dec 2017 18:29:05 +0800 Subject: [PATCH 010/181] refine cos-sim-op --- paddle/operators/cos_sim_op.h | 189 +++++++++++++-------- paddle/operators/elementwise_op_function.h | 55 ++++++ 2 files changed, 170 insertions(+), 74 deletions(-) diff --git a/paddle/operators/cos_sim_op.h b/paddle/operators/cos_sim_op.h index fecb5a79b2..3a7e67506d 100644 --- a/paddle/operators/cos_sim_op.h +++ b/paddle/operators/cos_sim_op.h @@ -15,6 +15,7 @@ #pragma once #include "paddle/framework/eigen.h" #include "paddle/framework/op_registry.h" +#include "paddle/operators/elementwise_add_op.h" namespace paddle { namespace operators { @@ -27,6 +28,28 @@ template using EigenVector = framework::EigenVector; +template +void Function_forward(T* out, T* x_norm, T* y_norm, + ElementIterator& x, + ElementIterator& y, int row, int col) { + for (int i = 0; i < row; ++i) { + T xx = 0; + T yy = 0; + T xy = 0; + for (int j = 0; j < col; ++j) { + xy += (*x) * (*y); + xx += (*x) * (*x); + yy += (*y) * (*y); + ++y; + ++x; + } + x_norm[i] = sqrt(xx); + y_norm[i] = sqrt(yy); + + out[i] = xy / (x_norm[i] * y_norm[i]); + } +} + template class CosSimKernel : public framework::OpKernel { public: @@ -41,32 +64,63 @@ class CosSimKernel : public framework::OpKernel { out_x_norm->mutable_data(context.GetPlace()); out_y_norm->mutable_data(context.GetPlace()); - // convert Tensor to Eigen Tensor int rows_x = in_x->dims()[0]; int rows_y = in_y->dims()[0]; - auto x = EigenMatrix::Reshape(*in_x, 1); - auto y = EigenMatrix::Reshape(*in_y, 1); - auto z = EigenVector::Flatten(*out_z); - auto x_norm = EigenVector::Flatten(*out_x_norm); - auto y_norm = EigenVector::Flatten(*out_y_norm); - - // compute - auto& place = - *context.template device_context().eigen_device(); - auto row_along = Eigen::array({{1}}); - x_norm.device(place) = x.square().sum(row_along).sqrt(); - y_norm.device(place) = y.square().sum(row_along).sqrt(); - if (rows_x == 
rows_y) { - auto xy = (x * y).sum(Eigen::array({{1}})); - z.device(place) = xy / x_norm / y_norm; - } else { - Eigen::DSizes bcast(rows_x, 1); - auto xy = (x * y.broadcast(bcast)).sum(row_along); - z.device(place) = xy / x_norm / y_norm.broadcast(bcast); - } + + int cols = framework::product(in_x->dims()) / rows_x; + auto x_iter = ElementIterator(in_x->data(), rows_x, + cols, rows_x, cols); + auto y_iter = ElementIterator(in_y->data(), rows_y, + cols, rows_x, cols); + + Function_forward(out_z->data(), out_x_norm->data(), + out_y_norm->data(), x_iter, y_iter, rows_x, cols); + // + // // convert Tensor to Eigen Tensor + //// int rows_x = in_x->dims()[0]; + //// int rows_y = in_y->dims()[0]; + // auto x = EigenMatrix::Reshape(*in_x, 1); + // auto y = EigenMatrix::Reshape(*in_y, 1); + // auto z = EigenVector::Flatten(*out_z); + // auto x_norm = EigenVector::Flatten(*out_x_norm); + // auto y_norm = EigenVector::Flatten(*out_y_norm); + // + // // compute + // auto& place = + // *context.template device_context().eigen_device(); + // auto row_along = Eigen::array({{1}}); + // x_norm.device(place) = x.square().sum(row_along).sqrt(); + // y_norm.device(place) = y.square().sum(row_along).sqrt(); + // if (rows_x == rows_y) { + // auto xy = (x * y).sum(Eigen::array({{1}})); + // z.device(place) = xy / x_norm / y_norm; + // } else { + // Eigen::DSizes bcast(rows_x, 1); + // auto xy = (x * y.broadcast(bcast)).sum(row_along); + // z.device(place) = xy / x_norm / y_norm.broadcast(bcast); + // } } }; +template +void Function_element(T* result, ElementIterator dz, + ElementIterator y, + ElementIterator x_norm, + ElementIterator y_norm, + ElementIterator z, + ElementIterator x, int num, int block) { + for (int i = 0; i < num; ++i) { + result[i % block] += (*dz) * ((*y) / ((*x_norm) * (*y_norm)) - + (*z) * (*x) / ((*x_norm) * (*x_norm))); + ++dz; + ++y; + ++x_norm; + ++y_norm; + ++z; + ++x; + } +} + template class CosSimGradKernel : public framework::OpKernel { public: @@ -81,63 +135,50 @@ class CosSimGradKernel : public framework::OpKernel { auto* out_grad_y = context.Output(framework::GradVarName("Y")); auto* in_grad_z = context.Input(framework::GradVarName("Out")); - // convert Tensor to Eigen Tensor - auto x = EigenMatrix::Reshape(*in_x, 1); - auto y = EigenMatrix::Reshape(*in_y, 1); - auto z = EigenMatrix::Reshape(*in_z, 1); - auto x_norm = EigenMatrix::Reshape(*in_x_norm, 1); - auto y_norm = EigenMatrix::Reshape(*in_y_norm, 1); - auto dz = EigenMatrix::Reshape(*in_grad_z, 1); - // compute gradident int rows_x = in_x->dims()[0]; int rows_y = in_y->dims()[0]; int cols = framework::product(in_x->dims()) / rows_x; - Eigen::DSizes bcast_cols(1, cols); - auto z_bcast = z.broadcast(bcast_cols); - auto dz_bcast = dz.broadcast(bcast_cols); - auto x_snorm_bcast = x_norm.square().eval().broadcast(bcast_cols); - auto& place = - *context.template device_context().eigen_device(); - if (rows_x == rows_y) { - auto y_snorm_bcast = y_norm.square().eval().broadcast(bcast_cols); - auto norm_prod_bcast = (x_norm * y_norm).eval().broadcast(bcast_cols); - // compute dx - if (out_grad_x) { - out_grad_x->mutable_data(context.GetPlace()); - auto dx = EigenMatrix::Reshape(*out_grad_x, 1); - auto grad = y / norm_prod_bcast - z_bcast * x / x_snorm_bcast; - dx.device(place) = dz_bcast * grad; - } - // compute dy - if (out_grad_y) { - out_grad_y->mutable_data(context.GetPlace()); - auto dy = EigenMatrix::Reshape(*out_grad_y, 1); - auto grad = x / norm_prod_bcast - z_bcast * y / y_snorm_bcast; - dy.device(place) = dz_bcast * grad; - } 
- } else { - Eigen::DSizes bcast_rows(rows_x, 1); - Eigen::DSizes bcast_rows_cols(rows_x, cols); - auto y_bcast = y.broadcast(bcast_rows); - auto y_snorm_bcast = y_norm.square().eval().broadcast(bcast_rows_cols); - auto norm_prod_bcast = (x_norm * y_norm.eval().broadcast(bcast_rows)) - .eval() - .broadcast(bcast_cols); - // compute dx - if (out_grad_x) { - out_grad_x->mutable_data(context.GetPlace()); - auto dx = EigenMatrix::Reshape(*out_grad_x, 1); - auto grad = y_bcast / norm_prod_bcast - z_bcast * x / x_snorm_bcast; - dx.device(place) = dz_bcast * grad; - } - // compute dy - if (out_grad_y) { - out_grad_y->mutable_data(context.GetPlace()); - auto dy = EigenVector::Flatten(*out_grad_y); - auto grad = x / norm_prod_bcast - z_bcast * y_bcast / y_snorm_bcast; - dy.device(place) = (dz_bcast * grad).sum(Eigen::array({{0}})); - } + + ////////////////////////////// + // ## + auto x_iter = ElementIterator(in_x->data(), rows_x, + cols, rows_x, cols); + auto y_iter = ElementIterator(in_y->data(), rows_y, + cols, rows_x, cols); + auto z_iter = ElementIterator(in_z->data(), rows_x, 1, + rows_x, cols); + auto dz_iter = ElementIterator(in_grad_z->data(), + rows_x, 1, rows_x, cols); + auto x_norm_iter = ElementIterator( + in_x_norm->data(), rows_x, 1, rows_x, cols); + auto y_norm_iter = ElementIterator( + in_y_norm->data(), rows_y, 1, rows_x, cols); + // ## + ////////////////////////////// + // compute dx + if (out_grad_x) { + out_grad_x->mutable_data(context.GetPlace()); + + ////////////////////////////// + // ## + Function_element(out_grad_x->data(), dz_iter, y_iter, x_norm_iter, + y_norm_iter, z_iter, x_iter, rows_x * cols, + rows_x * cols); + // ## + ////////////////////////////// + } + // compute dy + if (out_grad_y) { + out_grad_y->mutable_data(context.GetPlace()); + + ////////////////////////////// + // ## + Function_element(out_grad_y->data(), dz_iter, x_iter, y_norm_iter, + x_norm_iter, z_iter, y_iter, rows_x * cols, + rows_y * cols); + // ## + ////////////////////////////// } } }; diff --git a/paddle/operators/elementwise_op_function.h b/paddle/operators/elementwise_op_function.h index 7ebfc7df8c..33b7d06467 100644 --- a/paddle/operators/elementwise_op_function.h +++ b/paddle/operators/elementwise_op_function.h @@ -131,6 +131,61 @@ class MidWiseTransformIterator { int post_; }; +template +class ElementIterator; + +// Fixed(zcd) : Only support 2D +template +class ElementIterator { + public: + ElementIterator(const T* ptr, int t_m, int t_n, int m, int n) + : ptr_(ptr), + index_(0), + i_(0), + j_(0), + t_m_(t_m), + t_n_(t_n), + m_(m), + n_(n) {} + + ElementIterator& operator++() { + ++j_; + + if ((j_ == n_)) { + j_ = 0; + ++i_; + } + int t_i = (t_m_ == 1) ? 0 : i_; + int t_j = (t_n_ == 1) ? 
0 : j_; + index_ = t_i * t_n_ + t_j; + + return *this; + } + + bool operator==( + const ElementIterator& rhs) const { + return (ptr_ + index_) == &(*rhs); + } + + bool operator!=( + const ElementIterator& rhs) const { + return (ptr_ + index_) != &(*rhs); + } + + const T& operator*() { return ptr_[index_]; } + + private: + // t_m_ == m_ || t_n_ == n_ || (t_m_ == 1 && t_m_ == 1) + const T* ptr_; + int index_; + int i_; + int j_; + int64_t t_m_; + int64_t t_n_; + int64_t m_; + int64_t n_; +}; + #ifdef __NVCC__ template class RowwiseTransformIterator From b3ea677a2b7c052793e242bc8a699cea34257201 Mon Sep 17 00:00:00 2001 From: fengjiayi Date: Mon, 18 Dec 2017 11:25:18 +0800 Subject: [PATCH 011/181] update --- paddle/pybind/protobuf.cc | 1 + paddle/pybind/pybind.cc | 2 +- python/paddle/v2/fluid/backward.py | 15 +++++++++++++++ 3 files changed, 17 insertions(+), 1 deletion(-) diff --git a/paddle/pybind/protobuf.cc b/paddle/pybind/protobuf.cc index 6c8f06cccb..f67aa4a81e 100644 --- a/paddle/pybind/protobuf.cc +++ b/paddle/pybind/protobuf.cc @@ -243,6 +243,7 @@ void BindOpDesc(py::module &m) { .def("set_input", &OpDescBind::SetInput) .def("output", &OpDescBind::Output) .def("output_names", &OpDescBind::OutputNames) + .def("output_arg_names", &OpDescBind::OutputArgumentNames) .def("set_output", &OpDescBind::SetOutput) .def("has_attr", &OpDescBind::HasAttr) .def("attr_type", &OpDescBind::GetAttrType) diff --git a/paddle/pybind/pybind.cc b/paddle/pybind/pybind.cc index 1faf24bcb8..cd4887d63b 100644 --- a/paddle/pybind/pybind.cc +++ b/paddle/pybind/pybind.cc @@ -282,7 +282,7 @@ All parameter, weight, gradient are variables in Paddle. } return ret_values; }); - m.def("get_grad_op_descs", + m.def("get_grad_op_desc", [](const OpDescBind &op_desc, const std::unordered_set &no_grad_set, std::unordered_map &grad_to_var, diff --git a/python/paddle/v2/fluid/backward.py b/python/paddle/v2/fluid/backward.py index 3a128b8e61..1756f1a7af 100644 --- a/python/paddle/v2/fluid/backward.py +++ b/python/paddle/v2/fluid/backward.py @@ -1,5 +1,6 @@ from paddle.v2.fluid import framework as framework from . import core +import collections __all__ = ['append_backward_ops'] @@ -20,6 +21,20 @@ def backward_impl(block, target_block, no_grad_set, grad_to_var, callback): no_grad_set[block.idx], grad_to_var, grad_sub_block_list) grad_op_descs.append(grad_op_desc) + # grad_op_descs = [[op1_g1, op1_g2], [op2_g], ...] + # flatten grad_op_descs + grad_op_descs = [op for sublist in grad_op_descs for op in sublist] # ????? + + output_vars = collections.defaultdict(list) + for pos, op_desc in enumerate(grad_op_descs): + for var_name in op_desc.output_arg_names(): + output_vars[var_name].append(pos) + for var_name, poses in output_vars.iteritems(): + if len(poses) == 1: + continue + renamed_list = [] + for pos in reversed(sorted(poses)): + new_name = var_name + "@RENAMED@" + len(renamed_list) def append_backward_ops(loss, parameter_list=None, no_grad_set=None): From 7c63eaa5c7d98f6154c9d7cbb33b7bf33d1eb235 Mon Sep 17 00:00:00 2001 From: qingqing01 Date: Mon, 18 Dec 2017 14:34:54 +0800 Subject: [PATCH 012/181] Add profiler design documentation. 
---
 doc/design/images/profiler.png | Bin 0 -> 51116 bytes
 doc/design/profiler.md         |  95 +++++++++++++++++++++++++++++++++
 2 files changed, 95 insertions(+)
 create mode 100644 doc/design/images/profiler.png
 create mode 100644 doc/design/profiler.md

diff --git a/doc/design/images/profiler.png b/doc/design/images/profiler.png
new file mode 100644
index 0000000000000000000000000000000000000000..d57b71ca88aaba5d05584a6219d84214e285a1e1
GIT binary patch
[51116 bytes of binary image data for doc/design/images/profiler.png omitted]

diff --git a/doc/design/profiler.md b/doc/design/profiler.md
new file mode 100644
--- /dev/null
+++ b/doc/design/profiler.md
@@ -0,0 +1,95 @@
+
+### Event
+
+In the above workflow, a pair of events are needed before and after the piece of code to collect time. So each event has a flag to mark whether it is a starting or an ending event. There are three kinds of events:
+
+```c++
+enum EventKind { kMark,
+                 kPushRange,
+                 kPopRange};
+```
+- kMark: only a mark.
+- kPushRange: mark the starting event for a time range.
+- kPopRange: mark the ending event for a time range.
+
+For the CPU code, the events only need to record the current time. For the CUDA code, the [event management functions of CUDA](http://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART__EVENT.html#group__CUDART__EVENT) are used. For many pieces of code, event lists are used to record each piece.
+```c++
+class Event {
+ public:
+  // The DeviceContext is used to get the current CUDA stream.
+  Event(EventKind kind, std::string name, uint32_t thread_id,
+        const platform::DeviceContext* dev_ctx = nullptr);
+  double CpuElapsedUs(const Event& e) const;
+  double CudaElapsedUs(const Event& e) const;
+
+ private:
+  EventKind kind_;
+  std::string name_;
+  uint32_t thread_id_;
+  int64_t cpu_ns_;
+#ifdef PADDLE_WITH_CUDA
+  cudaEvent_t event_ = nullptr;
+  int device_ = -1;
+#endif
+};
+
+struct EventList {
+  std::forward_list<std::vector<Event>> event_blocks;
+};
+```
+
+As mentioned above, there is no need to record the timeline when the profiler is disabled. So there is a global state to enable or disable the profiler.
+
+```c++
+enum ProfilerState {
+  kDisabled,
+  kCPU,
+  kCUDA
+};
+ProfilerState kState;
+```
+- kDisabled: the disabled state.
+- kCPU: profiling for CPU code.
+- kCUDA: profiling for GPU code.
+
+A pair of starting and ending events are pushed to the event lists in the constructor and destructor of `RecordEvent`. So the timeline is recorded for the code in the lifecycle of an object of `RecordEvent`.
+
+```c++
+struct RecordEvent {
+  explicit RecordEvent(const std::string name,
+                       platform::DeviceContext* dev_ctx = nullptr) {
+    if (kState == ProfilerState::kDisabled) return;
+    // push the starting event to the event lists.
+  }
+  ~RecordEvent() {
+    if (kState == ProfilerState::kDisabled) return;
+    // push the ending event to the event lists.
+  }
+};
+```
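+
+A minimal usage sketch (illustrative only; the operator name and the `dev_ctx` variable are assumptions, not part of the design):
+
+```c++
+void SomeOperator::Run(/* ... */) {
+  // The constructor pushes the starting event; when `record` goes out of
+  // scope, the destructor pushes the matching ending event.
+  RecordEvent record("some_operator_run", dev_ctx);
+  // ... the code to be timed ...
+}
+```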
From 6bb4a6fd4246413af8943cf042c621bdd226678c Mon Sep 17 00:00:00 2001
From: fengjiayi
Date: Tue, 19 Dec 2017 16:51:40 +0800
Subject: [PATCH 013/181] update

---
 paddle/pybind/protobuf.cc          | 16 +++++--
 paddle/pybind/pybind.cc            |  1 +
 python/paddle/v2/fluid/backward.py | 69 +++++++++++++++++++++++++-----
 3 files changed, 72 insertions(+), 14 deletions(-)

diff --git a/paddle/pybind/protobuf.cc b/paddle/pybind/protobuf.cc
index f67aa4a81e..bb9872f9f7 100644
--- a/paddle/pybind/protobuf.cc
+++ b/paddle/pybind/protobuf.cc
@@ -236,15 +236,25 @@ void BindOpDesc(py::module &m) {
       .value("BLOCK", AttrType::BLOCK);

   py::class_<OpDescBind> op_desc(m, "OpDesc", "");
-  op_desc.def("type", &OpDescBind::Type)
+  op_desc
+      .def("__init__",
+           [](OpDescBind &self, const std::string &type,
+              const VariableNameMap &inputs, const VariableNameMap &outputs,
+              const AttributeMap &attrs) {
+             new (&self) OpDescBind(type, inputs, outputs, attrs);
+           })
+      .def("type", &OpDescBind::Type)
       .def("set_type", &OpDescBind::SetType)
       .def("input", &OpDescBind::Input)
       .def("input_names", &OpDescBind::InputNames)
-      .def("set_input", &OpDescBind::SetInput)
       .def("output", &OpDescBind::Output)
       .def("output_names", &OpDescBind::OutputNames)
-      .def("output_arg_names", &OpDescBind::OutputArgumentNames)
+      .def("set_input", &OpDescBind::SetInput)
       .def("set_output", &OpDescBind::SetOutput)
+      .def("input_arg_names", &OpDescBind::InputArgumentNames)
+      .def("output_arg_names", &OpDescBind::OutputArgumentNames)
+      .def("rename_input", &OpDescBind::RenameInput)
+      .def("rename_output", &OpDescBind::RenameOutput)
       .def("has_attr", &OpDescBind::HasAttr)
       .def("attr_type", &OpDescBind::GetAttrType)
       .def("attr_names", &OpDescBind::AttrNames)
diff --git a/paddle/pybind/pybind.cc b/paddle/pybind/pybind.cc
index cd4887d63b..8311f8827b 100644
--- a/paddle/pybind/pybind.cc
+++ b/paddle/pybind/pybind.cc
@@ -314,6 +314,7 @@ All parameter, weight, gradient are variables in Paddle.
InferenceOptimize(*(origin.Proto()), &pruned_desc); return new ProgramDescBind(pruned_desc); }); + m.def("get_empty_var_name", []() { return framework::kEmptyVarName; }); m.def_submodule( "var_names", "The module will return special predefined variable name in Paddle") diff --git a/python/paddle/v2/fluid/backward.py b/python/paddle/v2/fluid/backward.py index 1756f1a7af..a399a9712d 100644 --- a/python/paddle/v2/fluid/backward.py +++ b/python/paddle/v2/fluid/backward.py @@ -5,8 +5,19 @@ import collections __all__ = ['append_backward_ops'] -def backward_impl(block, target_block, no_grad_set, grad_to_var, callback): +def rename_arg(op_desc_list, old_name, new_name, begin_idx=None, end_idx=None): + if begin_idx is None: + begin_idx = 0 + if end_idx is None: + end_idx = len(op_desc_list) + for i in range(begin_idx, end_idx): + op_desc_list[i].rename_input(old_name, new_name) + op_desc_list[i].rename_output(old_name, new_name) + + +def backward_impl(block, target_block, no_grad_set, callback=None): grad_op_descs = [] + grad_to_var = {} program = block.program for each_op in block.ops: grad_sub_block_list = [] @@ -14,8 +25,7 @@ def backward_impl(block, target_block, no_grad_set, grad_to_var, callback): sub_block_idx = each_op.block_attr("sub_block") sub_block = program.block(sub_block_idx) grad_sub_block = program.create_block(parent_idx=sub_block_idx) - backward_impl(sub_block, grad_sub_block, no_grad_set, grad_to_var, - callback) + backward_impl(sub_block, grad_sub_block, no_grad_set, callback) grad_sub_block_list.append(grad_sub_block) grad_op_desc = core.get_grad_op_desc(each_op.desc, no_grad_set[block.idx], @@ -25,16 +35,53 @@ def backward_impl(block, target_block, no_grad_set, grad_to_var, callback): # flatten grad_op_descs grad_op_descs = [op for sublist in grad_op_descs for op in sublist] # ????? 
- output_vars = collections.defaultdict(list) + pending_sum_ops = [] + var_rename_count = collections.defaultdict(int) + var_inputs = collections.defaultdict(list) for pos, op_desc in enumerate(grad_op_descs): + for var_name in op_desc.input_arg_names(): + if len(var_inputs[var_name]) > 1: + pending_sum_ops.append((core.OpDesc( + type="sum_op", + inputs=var_inputs[var_name], + output=[var_name], + attrs={}), pos)) + var_inputs[var_name] = [var_name] for var_name in op_desc.output_arg_names(): - output_vars[var_name].append(pos) - for var_name, poses in output_vars.iteritems(): - if len(poses) == 1: - continue - renamed_list = [] - for pos in reversed(sorted(poses)): - new_name = var_name + "@RENAMED@" + len(renamed_list) + if len(var_inputs[var_name]) == 0: + # it's the first time we get the variable + var_inputs[var_name] = var_name + else: + if len(var_inputs[var_name] == 1): + new_name = var_name + "@RENAME@" + \ + str(var_rename_count[var_name]) + var_rename_count[var_name] = var_rename_count[var_name] + 1 + # rename original var_name + var_inputs[var_name][0] = new_name + rename_arg(grad_op_descs, var_name, new_name, 0, pos) + rename_arg(pending_sum_ops, var_name, new_name) + + new_name = var_name + "@RENAME@" + \ + str(var_rename_count[var_name]) + var_rename_count[var_name] = var_rename_count[var_name] + 1 + op_desc.rename_output(var_name, new_name) + var_inputs[var_name].append(new_name) + for var_name, inputs in var_inputs.iteritems(): + if len(inputs) > 1: + pending_sum_ops.append((core.OpDesc( + type="sum_op", inputs=inputs, outputs=var_name, attrs={}), + len(grad_op_descs))) + # 根据append的顺序可以看出pending_sum_ops一定是根据sum_op的插入位置排序的 + for p in reversed(pending_sum_ops): + grad_op_descs.insert(p[1], p[0]) + # create new gradient variables in the target block + for op_desc in grad_op_descs: + for grad_var_name in op_desc.output_arg_names(): + if target_block.has_var( + grad_var_name) or grad_var_name == core.get_empty_var_name( + ): + continue + target_block.var(grad_var_name) def append_backward_ops(loss, parameter_list=None, no_grad_set=None): From 590e6111f164b559230273496c90ed1879b2dc47 Mon Sep 17 00:00:00 2001 From: fengjiayi Date: Tue, 19 Dec 2017 17:46:13 +0800 Subject: [PATCH 014/181] update --- paddle/pybind/protobuf.cc | 1 + python/paddle/v2/fluid/backward.py | 26 +++++++++++++++++++++----- 2 files changed, 22 insertions(+), 5 deletions(-) diff --git a/paddle/pybind/protobuf.cc b/paddle/pybind/protobuf.cc index bb9872f9f7..d05eb94644 100644 --- a/paddle/pybind/protobuf.cc +++ b/paddle/pybind/protobuf.cc @@ -157,6 +157,7 @@ void BindBlockDesc(py::module &m) { .def_property_readonly("parent", &BlockDescBind::Parent) .def("append_op", &BlockDescBind::AppendOp, py::return_value_policy::reference) + .def("append_allocated_op", &BlockDescBind::AppendAllocatedOp) .def("prepend_op", &BlockDescBind::PrependOp, py::return_value_policy::reference) .def("var", diff --git a/python/paddle/v2/fluid/backward.py b/python/paddle/v2/fluid/backward.py index a399a9712d..5eb7794948 100644 --- a/python/paddle/v2/fluid/backward.py +++ b/python/paddle/v2/fluid/backward.py @@ -15,7 +15,11 @@ def rename_arg(op_desc_list, old_name, new_name, begin_idx=None, end_idx=None): op_desc_list[i].rename_output(old_name, new_name) -def backward_impl(block, target_block, no_grad_set, callback=None): +def backward_impl(block, + target_block, + no_grad_set, + grad_info_map, + callback=None): grad_op_descs = [] grad_to_var = {} program = block.program @@ -25,7 +29,8 @@ def backward_impl(block, target_block, 
no_grad_set, callback=None): sub_block_idx = each_op.block_attr("sub_block") sub_block = program.block(sub_block_idx) grad_sub_block = program.create_block(parent_idx=sub_block_idx) - backward_impl(sub_block, grad_sub_block, no_grad_set, callback) + backward_impl(sub_block, grad_sub_block, no_grad_set, grad_info_map, + callback) grad_sub_block_list.append(grad_sub_block) grad_op_desc = core.get_grad_op_desc(each_op.desc, no_grad_set[block.idx], @@ -71,17 +76,28 @@ def backward_impl(block, target_block, no_grad_set, callback=None): pending_sum_ops.append((core.OpDesc( type="sum_op", inputs=inputs, outputs=var_name, attrs={}), len(grad_op_descs))) + # TODO: remove op in no grad set + # 根据append的顺序可以看出pending_sum_ops一定是根据sum_op的插入位置排序的 for p in reversed(pending_sum_ops): grad_op_descs.insert(p[1], p[0]) - # create new gradient variables in the target block + # create new gradient variables in the target block desc for op_desc in grad_op_descs: for grad_var_name in op_desc.output_arg_names(): - if target_block.has_var( + if target_block.desc.has_var( grad_var_name) or grad_var_name == core.get_empty_var_name( ): continue - target_block.var(grad_var_name) + target_block.desc.var(grad_var_name) + if not grad_to_var.has_key(grad_var_name): + continue + grad_info_map[grad_to_var[grad_var_name]] = (grad_var_name, + target_block) + # insert backward operators to target_block + for op_desc in grad_op_descs: + target_block.desc.append_allocated_op(op_desc) + + target_block.sync_with_cpp() def append_backward_ops(loss, parameter_list=None, no_grad_set=None): From 624e3e52089e7577ada38e061d0f3f299b1fac6d Mon Sep 17 00:00:00 2001 From: tensor-tang Date: Tue, 19 Dec 2017 20:41:46 +0800 Subject: [PATCH 015/181] add MKL Packed RecurrentLayer --- paddle/gserver/CMakeLists.txt | 10 + paddle/gserver/layers/MKLPackedGemm.h | 94 ++++++ .../layers/MKLPackedRecurrentLayer.cpp | 311 ++++++++++++++++++ .../gserver/layers/MKLPackedRecurrentLayer.h | 131 ++++++++ 4 files changed, 546 insertions(+) create mode 100644 paddle/gserver/layers/MKLPackedGemm.h create mode 100644 paddle/gserver/layers/MKLPackedRecurrentLayer.cpp create mode 100644 paddle/gserver/layers/MKLPackedRecurrentLayer.h diff --git a/paddle/gserver/CMakeLists.txt b/paddle/gserver/CMakeLists.txt index 41ead3c5ec..3d6ced713f 100644 --- a/paddle/gserver/CMakeLists.txt +++ b/paddle/gserver/CMakeLists.txt @@ -34,6 +34,16 @@ else() message(STATUS "Compile with MKLDNNLayers and MKLDNNActivations") endif() +if(NOT WITH_MKLML) + file(GLOB_RECURSE MKL_HEADER RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}" "MKLPacked*.h") + file(GLOB_RECURSE MKL_SOURCES RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}" "MKLPacked*.cpp") + list(REMOVE_ITEM GSERVER_HEADER ${MKL_HEADER}) + list(REMOVE_ITEM GSERVER_SOURCES ${MKL_SOURCES}) + message(STATUS "Skip compiling with MKLPackedLayers") +else() + message(STATUS "Compile with MKLPackedLayers") +endif() + if(NOT WITH_GPU) list(REMOVE_ITEM GSERVER_HEADER layers/CudnnConvBaseLayer.h diff --git a/paddle/gserver/layers/MKLPackedGemm.h b/paddle/gserver/layers/MKLPackedGemm.h new file mode 100644 index 0000000000..3c4c62eeb8 --- /dev/null +++ b/paddle/gserver/layers/MKLPackedGemm.h @@ -0,0 +1,94 @@ +/* Copyright (c) 2017 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include "paddle/math/MathFunctions.h" +#include "paddle/math/Matrix.h" + +namespace paddle { + +class MKLPackedGemm { +protected: + real* weightPacked_; + real* weightTPacked_; + size_t weightHeight_; + size_t weightWidth_; + +public: + MKLPackedGemm(MatrixPtr weight) { + weightHeight_ = weight->getHeight(); + weightWidth_ = weight->getWidth(); + weightPacked_ = + cblas_sgemm_alloc(CblasBMatrix, 1, weightWidth_, weightHeight_); + weightTPacked_ = + cblas_sgemm_alloc(CblasBMatrix, 1, weightWidth_, weightHeight_); + cblas_sgemm_pack(CblasRowMajor, + CblasBMatrix, + CblasNoTrans, + 1, + weightWidth_, + weightHeight_, + 1.0, + weight->getData(), + weightWidth_, + weightPacked_); + cblas_sgemm_pack(CblasRowMajor, + CblasBMatrix, + CblasTrans, + 1, + weightWidth_, + weightHeight_, + 1.0, + weight->getData(), + weightWidth_, + weightTPacked_); + } + void compute(MatrixPtr batch2, MatrixPtr batch1, bool transW = false) { + if (transW) { + cblas_sgemm_compute(CblasRowMajor, + CblasNoTrans, + CblasPacked, + batch2->getHeight(), + weightWidth_, + weightHeight_, + batch1->getData(), + weightHeight_, + weightTPacked_, + weightWidth_, + 1, + batch2->getData(), + weightWidth_); + } else { + cblas_sgemm_compute(CblasRowMajor, + CblasNoTrans, + CblasPacked, + batch2->getHeight(), + weightWidth_, + weightHeight_, + batch1->getData(), + weightHeight_, + weightPacked_, + weightWidth_, + 1, + batch2->getData(), + weightWidth_); + } + } + ~MKLPackedGemm() { + cblas_sgemm_free(weightPacked_); + cblas_sgemm_free(weightTPacked_); + } +}; +} // namespace paddle diff --git a/paddle/gserver/layers/MKLPackedRecurrentLayer.cpp b/paddle/gserver/layers/MKLPackedRecurrentLayer.cpp new file mode 100644 index 0000000000..6f455af91e --- /dev/null +++ b/paddle/gserver/layers/MKLPackedRecurrentLayer.cpp @@ -0,0 +1,311 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
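[Editor's note] The `MKLPackedGemm` helper above pays the packing cost once per weight update (in `cblas_sgemm_pack`) and reuses the packed buffers in every `cblas_sgemm_compute` call that follows. numpy exposes no packed GEMM, so the sketch below only mirrors the amortization idea; the class and all sizes are illustrative, not part of Paddle or MKL.

```python
import numpy as np

rng = np.random.default_rng(0)
hidden = 64
W = rng.standard_normal((hidden, hidden), dtype=np.float32)

class PackedWeight:
    def __init__(self, w):
        # done once per weight update, amortized over many time steps;
        # here "packing" is just materializing contiguous W and W^T copies
        self.w = np.ascontiguousarray(w)
        self.wt = np.ascontiguousarray(w.T)

    def compute(self, out, x, trans_w=False):
        # out += x * W (or x * W^T), mirroring MKLPackedGemm::compute
        out += x @ (self.wt if trans_w else self.w)

packed = PackedWeight(W)
steps = [rng.standard_normal((8, hidden), dtype=np.float32) for _ in range(5)]
out = np.zeros((8, hidden), dtype=np.float32)
for x in steps:          # the packed weight is reused on every step
    packed.compute(out, x)
print(out.shape, float(out.sum()))
```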
*/ + +#include "MKLPackedRecurrentLayer.h" + +namespace paddle { + +REGISTER_LAYER(mkl_packed_recurrent, MKLPackedRecurrentLayer); + +bool MKLPackedRecurrentLayer::init(const LayerMap& layerMap, + const ParameterMap& parameterMap) { + if (!Layer::init(layerMap, parameterMap)) return false; + CHECK_EQ(1U, inputLayers_.size()); + CHECK_EQ(1U, parameters_.size()); + CHECK_EQ(getSize() * getSize(), parameters_[0]->getSize()); + weight_.reset(new Weight(getSize(), getSize(), parameters_[0])); + if (biasParameter_.get() != NULL) { + bias_.reset(new Weight(1, getSize(), biasParameter_)); + } + reversed_ = config_.reversed(); + + sgemm_packed_.reset(new MKLPackedGemm(weight_->getW())); + + return true; +} + +void MKLPackedRecurrentLayer::resetState() { + CHECK(!reversed_) << "state is not allowed for reversed recurrent layer"; + Matrix::resizeOrCreate( + prevOutput_, 1, getSize(), /* trans= */ false, useGpu_); + prevOutput_->zeroMem(); +} + +void MKLPackedRecurrentLayer::setState(LayerStatePtr state) { + CHECK(state->value.size() == 1) << "one matrix is expected for RNN state"; + prevOutput_->copyFrom(*(state->value[0])); +} + +LayerStatePtr MKLPackedRecurrentLayer::getState() { + LayerStatePtr res = std::make_shared(); + res->value.push_back(prevOutput_->clone(0, 0, useGpu_)); + res->value[0]->copyFrom(*prevOutput_); + return res; +} + +void MKLPackedRecurrentLayer::forward(PassType passType) { + REGISTER_TIMER_INFO("RecurrentFwTimer", getName().c_str()); + Layer::forward(passType); + const Argument& input = getInput(0); + CHECK(input.sequenceStartPositions); + int batchSize = input.getBatchSize(); + size_t numSequences = input.getNumSequences(); + resetOutput(batchSize, getSize()); + CHECK_EQ(getSize(), input.value->getWidth()); + const int* starts = input.sequenceStartPositions->getData(false); + CHECK_EQ(starts[numSequences], batchSize); + + output_.value->assign(*input.value); + if (bias_) { + output_.value->addBias(*bias_->getW(), 1); + } + if (!FLAGS_rnn_use_batch) { + forwardSequence(batchSize, numSequences, starts); + } else { + forwardBatch(batchSize, numSequences, starts); + } +} + +void MKLPackedRecurrentLayer::forwardSequence(int batchSize, + size_t numSequences, + const int* starts) { + REGISTER_TIMER_INFO("RecurrentFwSequence", getName().c_str()); + + frameOutput_.reserve(batchSize); + for (int i = frameOutput_.size(); i < batchSize; ++i) { + Argument arg; + arg.value = Matrix::create(nullptr, + /* height= */ 1, + getSize(), + /* trans= */ false, + useGpu_); + arg.grad = Matrix::create(nullptr, + /* height= */ 1, + getSize(), + /* trans= */ false, + useGpu_); + frameOutput_.push_back(arg); + } + + for (int i = 0; i < batchSize; ++i) { + frameOutput_[i].value->setData(output_.value->getData() + i * getSize()); + } + + for (size_t i = 0; i < numSequences; ++i) { + forwardOneSequence(starts[i], starts[i + 1] - starts[i]); + } +} + +void MKLPackedRecurrentLayer::forwardOneSequence(int start, int length) { + if (!reversed_) { + if (prevOutput_) { + frameOutput_[start].value->mul(*prevOutput_, *weight_->getW(), 1, 1); + } + activation_->forward(frameOutput_[start]).check(); + + for (int i = 1; i < length; ++i) { + frameOutput_[start + i].value->mul( + *frameOutput_[start + i - 1].value, *weight_->getW(), 1, 1); + activation_->forward(frameOutput_[start + i]).check(); + } + if (prevOutput_) { + prevOutput_->assign(*frameOutput_[start + length - 1].value); + } + } else { + activation_->forward(frameOutput_[start + length - 1]).check(); + for (int i = length - 2; i >= 0; --i) { + 
frameOutput_[start + i].value->mul( + *frameOutput_[start + i + 1].value, *weight_->getW(), 1, 1); + activation_->forward(frameOutput_[start + i]).check(); + } + } +} + +void MKLPackedRecurrentLayer::backward(const UpdateCallback& callback) { + REGISTER_TIMER_INFO("RecurrentBwTimer", getName().c_str()); + const Argument& input = getInput(0); + CHECK(input.sequenceStartPositions); + int batchSize = input.getBatchSize(); + const int* starts = input.sequenceStartPositions->getData(false); + size_t numSequences = input.getNumSequences(); + + if (!FLAGS_rnn_use_batch) { + backwardSequence(batchSize, numSequences, starts); + } else { + backwardBatch(batchSize, numSequences, starts); + } + + if (input.grad) { + input.grad->add(*output_.grad); + } + + if (bias_ && bias_->getWGrad()) { + bias_->getWGrad()->collectBias(*output_.grad, 1); + bias_->getParameterPtr()->incUpdate(callback); + } + + weight_->getParameterPtr()->incUpdate(callback); + sgemm_packed_.reset(new MKLPackedGemm(weight_->getW())); +} + +void MKLPackedRecurrentLayer::backwardSequence(int batchSize, + size_t numSequences, + const int* starts) { + REGISTER_TIMER_INFO("RecurrentBwSequence", getName().c_str()); + for (int i = 0; i < batchSize; ++i) { + frameOutput_[i].grad->setData(output_.grad->getData() + i * getSize()); + } + + for (size_t i = 0; i < numSequences; ++i) { + backwardOneSequence(starts[i], starts[i + 1] - starts[i]); + } +} + +void MKLPackedRecurrentLayer::backwardOneSequence(int start, int length) { + MatrixPtr weightT = weight_->getW()->getTranspose(); + if (!reversed_) { + for (int i = length - 1; i > 0; --i) { + activation_->backward(frameOutput_[start + i]).check(); + frameOutput_[start + i - 1].grad->mul( + *frameOutput_[start + i].grad, *weightT, 1, 1); + } + activation_->backward(frameOutput_[start]).check(); + if (weight_->getWGrad()) { + weight_->getWGrad()->mul( + *output_.value->subMatrix(start, length - 1)->getTranspose(), + *output_.grad->subMatrix(start + 1, length - 1), + 1, + 1); + } + } else { + for (int i = 0; i < length - 1; ++i) { + activation_->backward(frameOutput_[start + i]).check(); + frameOutput_[start + i + 1].grad->mul( + *frameOutput_[start + i].grad, *weightT, 1, 1); + } + activation_->backward(frameOutput_[start + length - 1]).check(); + if (weight_->getWGrad()) { + weight_->getWGrad()->mul( + *output_.value->subMatrix(start + 1, length - 1)->getTranspose(), + *output_.grad->subMatrix(start, length - 1), + 1, + 1); + } + } +} + +void MKLPackedRecurrentLayer::forwardBatch(int batchSize, + size_t numSequences, + const int* starts) { + if (!batchValue_) { + batchValue_.reset(new SequenceToBatch(useGpu_)); + } + + batchValue_->resizeOrCreateBatch(batchSize, numSequences, starts, reversed_); + + batchValue_->copyFromSeq(*output_.value); + + { + REGISTER_TIMER_INFO("RecurrentFwBatch", getName().c_str()); + /* forward one batch */ + for (size_t n = 0; n < batchValue_->getNumBatch(); n++) { + MatrixPtr batch2 = batchValue_->getBatchValue(n); + + if (n != 0) { + MatrixPtr batch1 = + batchValue_->getBatchValue(n - 1, batch2->getHeight()); + + // batch2->mul(*batch1, *weight_->getW(), 1, 1); + sgemm_packed_->compute(batch2, batch1); + } + +#pragma omp parallel for collapse(2) + for (size_t i = 0; i < batch2->getHeight(); i++) { + for (size_t j = 0; j < batch2->getWidth(); j++) { + *(batch2->getData() + i * batch2->getWidth() + j) = + *(batch2->getData() + i * batch2->getWidth() + j) > 0 + ? 
*(batch2->getData() + i * batch2->getWidth() + j) + : 0; + } + } + } + } + + batchValue_->copyBackSeq(*output_.value); +} + +void MKLPackedRecurrentLayer::backwardBatch(int batchSize, + size_t numSequences, + const int* starts) { + if (!batchGrad_) { + batchGrad_.reset(new SequenceToBatch(useGpu_)); + } + batchGrad_->shareIndexWith(*batchValue_); + + size_t numBatch = batchGrad_->getNumBatch(); + bool backwardByBatch = numBatch < numSequences; + + batchGrad_->copyFromSeq(*output_.grad); + { + REGISTER_TIMER_INFO("RecurrentBwData", getName().c_str()); + /* backward one batch */ + for (int n = (int)numBatch - 1; n >= 0; n--) { + MatrixPtr batch2 = batchGrad_->getBatchValue(n); + MatrixPtr batch1 = batchValue_->getBatchValue(n, batch2->getHeight()); + + Argument arg; + arg.value = batch1; + arg.grad = batch2; + activation_->backward(arg).check(); + + if (n != 0) { + batch1 = batchGrad_->getBatchValue(n - 1, batch2->getHeight()); + // batch1->mul(*batch2, *weightT, 1, 1); + sgemm_packed_->compute(batch1, batch2, true); + } + + if (backwardByBatch && weight_->getWGrad()) { + if (n != 0) { + /* backward weight */ + batch1 = batchValue_->getBatchValue(n - 1, batch2->getHeight()); + weight_->getWGrad()->mul(*batch1->getTranspose(), *batch2, 1, 1); + } + } + } + } + + batchGrad_->copyBackSeq(*output_.grad); + + if (!backwardByBatch && weight_->getWGrad()) { + REGISTER_TIMER_INFO("RecurrentBwWeight", getName().c_str()); + for (size_t seq = 0; seq < numSequences; ++seq) { + int len = starts[seq + 1] - starts[seq]; + if (!reversed_) { + weight_->getWGrad()->mul( + *output_.value->subMatrix(starts[seq], len - 1)->getTranspose(), + *output_.grad->subMatrix(starts[seq] + 1, len - 1), + 1, + 1); + } else { + weight_->getWGrad()->mul( + *output_.value->subMatrix(starts[seq] + 1, len - 1)->getTranspose(), + *output_.grad->subMatrix(starts[seq], len - 1), + 1, + 1); + } + } + } +} + +} // namespace paddle diff --git a/paddle/gserver/layers/MKLPackedRecurrentLayer.h b/paddle/gserver/layers/MKLPackedRecurrentLayer.h new file mode 100644 index 0000000000..719137f2db --- /dev/null +++ b/paddle/gserver/layers/MKLPackedRecurrentLayer.h @@ -0,0 +1,131 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include +#include "Layer.h" +#include "MKLPackedGemm.h" +#include "SequenceToBatch.h" +#include "paddle/utils/Stat.h" + +DECLARE_bool(rnn_use_batch); + +namespace paddle { + +/** + * @brief MKLPackedRecurrentLayer takes 1 input layer. The output size is the + * same with + * input layer. + * For each sequence [start, end] it performs the following computation: + * \f[ + * out_{i} = act(in_{i}) \ \ \text{for} \ i = start \\ + * out_{i} = act(in_{i} + out_{i-1} * W) \ \ \text{for} \ start < i <= end + * + * \f] + * If reversed is true, the order is reversed: + * \f[ + * out_{i} = act(in_{i}) \ \ \text{for} \ i = end \\ + * out_{i} = act(in_{i} + out_{i+1} * W) \ \ \text{for} \ start <= i < end + * \f] + * There are two methods to calculate rnn. 
One way is to compute rnn one + * sequence by one sequence. The other way is to reorganize the input + * into batches, then compute rnn one batch by one batch. Users can select + * them by rnn_use_batch flag. + */ + +class MKLPackedRecurrentLayer : public Layer { +public: + explicit MKLPackedRecurrentLayer(const LayerConfig& config) : Layer(config) {} + + bool init(const LayerMap& layerMap, + const ParameterMap& parameterMap) override; + + void forward(PassType passType) override; + + void backward(const UpdateCallback& callback) override; + + void resetState() override; + + void setState(LayerStatePtr state) override; + + LayerStatePtr getState() override; + +protected: + /** + * @brief If user do not set --rnn_use_batch=true, it will + * compute rnn forward one sequence by one sequence in default. + * @param batchSize Total words number of all samples in this batch. + * @param numSequences The sample number. + * @param starts Each start position of each samples. + */ + void forwardSequence(int batchSize, size_t numSequences, const int* starts); + /** + * @brief Compute rnn forward by one sequence. + * @param start The start position of this sequence (or sample). + * @param length The length of this sequence (or sample), namely the words + * number of this sequence. + */ + void forwardOneSequence(int start, int length); + /** + * @brief Compute rnn backward one sequence by onesequence. + * @param batchSize Total words number of all samples in this batch. + * @param numSequences The sample number. + * @param starts Each start position of each samples. + */ + void backwardSequence(int batchSize, size_t numSequences, const int* starts); + /** + * @brief Compute rnn backward by one sequence. + * @param start The start position of this sequence (or sample). + * @param length The length of this sequence (or sample), namely the words + * number of this sequence. + */ + void backwardOneSequence(int start, int length); + + /** + * @brief Reorganize input into batches and compute rnn forward batch + * by batch. It will convert batch shape to sequence after finishing forward. + * The batch info can refer to SequenceToBatch class. + * @param batchSize Total words number of all samples in this batch. + * @param numSequences The sample number. + * @param starts Each start position of each samples. + */ + void forwardBatch(int batchSize, size_t numSequences, const int* starts); + + /** + * @brief Reorganize input into batches and compute rnn forward batch + * by batch. + * @param batchSize Total words number of all samples in this batch. + * @param numSequences The sample number. + * @param starts Each start position of each samples. + */ + void backwardBatch(int batchSize, size_t numSequences, const int* starts); + +protected: + std::unique_ptr weight_; + std::unique_ptr bias_; + + /// frameOutput_[i] is used to hold the i-th sample of output_ + std::vector frameOutput_; + MatrixPtr prevOutput_; + /// Whether compute rnn by reverse. + bool reversed_; + /// If compute batch by batch, batchValue_ will be used to save the + /// reorganized input value. + std::unique_ptr batchValue_; + /// If compute batch by batch, batchGrad_ will be used to save the + /// gradient with respect to reorganized input value. 
+ std::unique_ptr batchGrad_; + + std::unique_ptr sgemm_packed_; +}; +} From 2e101df7c656d524c92b6c31711a8bbcaf7ce09f Mon Sep 17 00:00:00 2001 From: tensor-tang Date: Tue, 19 Dec 2017 21:40:34 +0800 Subject: [PATCH 016/181] enable gtest for MKLPackedRecurrentLayer --- paddle/gserver/tests/test_RecurrentLayer.cpp | 165 ++++++++++++++++++- 1 file changed, 159 insertions(+), 6 deletions(-) diff --git a/paddle/gserver/tests/test_RecurrentLayer.cpp b/paddle/gserver/tests/test_RecurrentLayer.cpp index 16ab0e6aec..1f31158579 100644 --- a/paddle/gserver/tests/test_RecurrentLayer.cpp +++ b/paddle/gserver/tests/test_RecurrentLayer.cpp @@ -420,12 +420,165 @@ TEST(Layer, LstmLayer) { } } +#ifdef PADDLE_WITH_MKLML + +LayerPtr initMKLPackedLayer(LayerConfig layerConfig, + bool reversed, + int layerSize, + LayerPtr dataLayer, + ParameterPtr para, + ParameterPtr bias = nullptr) { + LayerMap layerMap; + ParameterMap parameterMap; + layerMap[dataLayer->getName()] = dataLayer; + parameterMap[para->getName()] = para; + if (bias) { + parameterMap[bias->getName()] = bias; + layerConfig.set_bias_parameter_name("bias_0"); + } + + layerConfig.set_size(layerSize); + layerConfig.set_reversed(reversed); + layerConfig.add_inputs(); + LayerInputConfig& input = *(layerConfig.mutable_inputs(0)); + input.set_input_layer_name("layer_0"); + input.set_input_parameter_name("para_0"); + + LayerPtr testLayer = Layer::create(layerConfig); + layerMap[testLayer->getName()] = testLayer; + + testLayer->init(layerMap, parameterMap); + testLayer->setNeedGradient(true); + + return testLayer; +} + +void checkMKLPackedLayer(LayerPtr testLayer1, LayerPtr testLayer2) { + const VectorPtr& weightGrad = + (testLayer1->getParameters()[0])->getBuf(PARAMETER_GRADIENT); + const MatrixPtr& inputGrad = testLayer1->getPrev(0)->getOutputGrad(); + CpuVector wgt_grad1(weightGrad->getSize()); + CpuVector wgt_grad2(weightGrad->getSize()); + CpuMatrix input_grad1(inputGrad->getHeight(), inputGrad->getWidth()); + CpuMatrix input_grad2(inputGrad->getHeight(), inputGrad->getWidth()); + + CpuMatrix outputGrad(inputGrad->getHeight(), inputGrad->getWidth()); + outputGrad.randomizeUniform(); + + for (int i = 0; i < 2; i++) { + FLAGS_rnn_use_batch = true; + + testLayer1->forward(PASS_GC); + + testLayer1->getOutputGrad()->copyFrom(outputGrad); + + weightGrad->zero(); + inputGrad->zero(); + + testLayer1->backward(nullptr); + + wgt_grad1.copyFrom(*weightGrad); + input_grad1.copyFrom(*inputGrad); + + FLAGS_rnn_use_batch = true; + + testLayer2->forward(PASS_GC); + testLayer2->getOutputGrad()->copyFrom(outputGrad); + + weightGrad->zero(); + inputGrad->zero(); + + testLayer2->backward(nullptr); + + wgt_grad2.copyFrom(*weightGrad); + input_grad2.copyFrom(*inputGrad); + + checkError(*testLayer1->getOutputValue(), *testLayer2->getOutputValue()); + + checkError(wgt_grad1, wgt_grad2); + checkError(input_grad1, input_grad2); + } + + for (int i = 0; i < 2; i++) { + CpuMatrix outputValue(testLayer2->getOutputValue()->getHeight(), + testLayer2->getOutputValue()->getWidth()); + + FLAGS_rnn_use_batch = true; + + testLayer2->forward(PASS_GC); + outputValue.copyFrom(*testLayer2->getOutputValue()); + + testLayer2->getOutputGrad()->copyFrom(outputGrad); + + weightGrad->zero(); + inputGrad->zero(); + + testLayer2->backward(nullptr); + + wgt_grad1.copyFrom(*weightGrad); + input_grad1.copyFrom(*inputGrad); + + FLAGS_rnn_use_batch = false; + + testLayer2->getOutputValue()->zero(); + + testLayer2->forward(PASS_GC); + testLayer2->getOutputGrad()->copyFrom(outputGrad); + + 
weightGrad->zero(); + inputGrad->zero(); + + testLayer2->backward(nullptr); + + wgt_grad2.copyFrom(*weightGrad); + input_grad2.copyFrom(*inputGrad); + + checkError(outputValue, *testLayer2->getOutputValue()); + checkError(wgt_grad1, wgt_grad2); + checkError(input_grad1, input_grad2); + } +} + +TEST(MKLPackedLayer, RecurrentLayer) { + LayerConfig layerConfig1; + LayerConfig layerConfig2; + + layerConfig1.set_name("paddle-rnn"); + layerConfig1.set_type("recurrent"); + layerConfig1.set_active_type("relu"); + + layerConfig2.set_name("mkl-packed-rnn"); + layerConfig2.set_type("mkl_packed_recurrent"); + layerConfig2.set_active_type("relu"); + + for (auto layerSize : {32, 64, 128, 256, 512}) { + for (auto batchSize : {1, 5, 100, 500}) { + for (auto reversed : {true, false}) { + LOG(INFO) << " layerSize=" << layerSize << " batchSize=" << batchSize + << " reversed=" << reversed; + + LayerPtr dataLayer = + creatDataLayer("layer_0", batchSize, layerSize, false); + ParameterPtr para = + creatParameter("para_0", 0, layerSize * layerSize, false); + + LayerPtr testLayer1 = initMKLPackedLayer( + layerConfig1, reversed, layerSize, dataLayer, para); + LayerPtr testLayer2 = initMKLPackedLayer( + layerConfig2, reversed, layerSize, dataLayer, para); + + checkMKLPackedLayer(testLayer1, testLayer2); + } + } + } +} +#endif + int main(int argc, char** argv) { - if (version::isWithGpu()) { - testing::InitGoogleTest(&argc, argv); - initMain(argc, argv); - return RUN_ALL_TESTS(); - } else { - return 0; + testing::InitGoogleTest(&argc, argv); + initMain(argc, argv); + if (!version::isWithGpu()) { + testing::GTEST_FLAG(filter) = "-Layer.*"; } + return RUN_ALL_TESTS(); } From 0f8aad2934f88540249da6d7e5c8e8ceeafd60ec Mon Sep 17 00:00:00 2001 From: tensor-tang Date: Tue, 19 Dec 2017 23:06:54 +0800 Subject: [PATCH 017/181] fix compile error --- paddle/gserver/layers/MKLPackedGemm.h | 3 ++- paddle/gserver/layers/MKLPackedRecurrentLayer.h | 5 ++++- 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/paddle/gserver/layers/MKLPackedGemm.h b/paddle/gserver/layers/MKLPackedGemm.h index 3c4c62eeb8..91e2515e32 100644 --- a/paddle/gserver/layers/MKLPackedGemm.h +++ b/paddle/gserver/layers/MKLPackedGemm.h @@ -27,7 +27,7 @@ protected: size_t weightWidth_; public: - MKLPackedGemm(MatrixPtr weight) { + explicit MKLPackedGemm(MatrixPtr weight) { weightHeight_ = weight->getHeight(); weightWidth_ = weight->getWidth(); weightPacked_ = @@ -91,4 +91,5 @@ public: cblas_sgemm_free(weightTPacked_); } }; + } // namespace paddle diff --git a/paddle/gserver/layers/MKLPackedRecurrentLayer.h b/paddle/gserver/layers/MKLPackedRecurrentLayer.h index 719137f2db..b8727e0ff3 100644 --- a/paddle/gserver/layers/MKLPackedRecurrentLayer.h +++ b/paddle/gserver/layers/MKLPackedRecurrentLayer.h @@ -12,6 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
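[Editor's note] The test above validates the MKL path by running the same layer with both implementations and both `rnn_use_batch` modes on identical data, then comparing outputs and gradients. A numpy toy of the same pattern, with made-up sizes, checks that the sequence-by-sequence and batch-by-batch forward paths of out_i = relu(in_i + out_{i-1} * W) agree:

```python
import numpy as np

rng = np.random.default_rng(1)
hidden, starts = 16, [0, 4, 9, 12]          # 3 sequences of length 4, 5, 3
x = rng.standard_normal((starts[-1], hidden), dtype=np.float32)
W = (0.1 * rng.standard_normal((hidden, hidden))).astype(np.float32)
relu = lambda m: np.maximum(m, 0)

def forward_by_sequence(x):
    out = np.empty_like(x)
    for s, e in zip(starts[:-1], starts[1:]):
        out[s] = relu(x[s])
        for i in range(s + 1, e):
            out[i] = relu(x[i] + out[i - 1] @ W)
    return out

def forward_by_batch(x):
    out = x.copy()
    lengths = np.diff(starts)
    for t in range(lengths.max()):
        # all sequences that still have a step t advance together
        rows = [s + t for s, l in zip(starts[:-1], lengths) if t < l]
        if t > 0:
            out[rows] += out[[r - 1 for r in rows]] @ W
        out[rows] = relu(out[rows])
    return out

assert np.allclose(forward_by_sequence(x), forward_by_batch(x), atol=1e-5)
print("sequence and batch paths agree")
```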
*/ +#pragma once + #include #include "Layer.h" #include "MKLPackedGemm.h" @@ -128,4 +130,5 @@ protected: std::unique_ptr sgemm_packed_; }; -} + +} // namespace paddle From b95834dc0c43cedd27124e18a13345712dcf9d47 Mon Sep 17 00:00:00 2001 From: tensor-tang Date: Wed, 20 Dec 2017 09:05:41 +0800 Subject: [PATCH 018/181] disable use_gpu when test mkl recurrent layer comparing with cpu --- paddle/gserver/tests/test_RecurrentLayer.cpp | 2 ++ 1 file changed, 2 insertions(+) diff --git a/paddle/gserver/tests/test_RecurrentLayer.cpp b/paddle/gserver/tests/test_RecurrentLayer.cpp index 1f31158579..44d84dd8be 100644 --- a/paddle/gserver/tests/test_RecurrentLayer.cpp +++ b/paddle/gserver/tests/test_RecurrentLayer.cpp @@ -551,6 +551,8 @@ TEST(MKLPackedLayer, RecurrentLayer) { layerConfig2.set_type("mkl_packed_recurrent"); layerConfig2.set_active_type("relu"); + FLAGS_use_gpu = false; + for (auto layerSize : {32, 64, 128, 256, 512}) { for (auto batchSize : {1, 5, 100, 500}) { for (auto reversed : {true, false}) { From 61a7df2e310c9fc0e98610cb5513e8634e075447 Mon Sep 17 00:00:00 2001 From: fengjiayi Date: Wed, 20 Dec 2017 16:55:27 +0800 Subject: [PATCH 019/181] update --- paddle/pybind/protobuf.cc | 5 +++- python/paddle/v2/fluid/backward.py | 43 +++++++++++++++++++++--------- 2 files changed, 35 insertions(+), 13 deletions(-) diff --git a/paddle/pybind/protobuf.cc b/paddle/pybind/protobuf.cc index d05eb94644..21b91b3825 100644 --- a/paddle/pybind/protobuf.cc +++ b/paddle/pybind/protobuf.cc @@ -157,7 +157,10 @@ void BindBlockDesc(py::module &m) { .def_property_readonly("parent", &BlockDescBind::Parent) .def("append_op", &BlockDescBind::AppendOp, py::return_value_policy::reference) - .def("append_allocated_op", &BlockDescBind::AppendAllocatedOp) + .def("append_allocated_op", + [](BlockDescBind &self, OpDescBind *op_desc) { + self.AppendAllocatedOp(std::unique_ptr(op_desc)); + }) .def("prepend_op", &BlockDescBind::PrependOp, py::return_value_policy::reference) .def("var", diff --git a/python/paddle/v2/fluid/backward.py b/python/paddle/v2/fluid/backward.py index 5eb7794948..0600223732 100644 --- a/python/paddle/v2/fluid/backward.py +++ b/python/paddle/v2/fluid/backward.py @@ -1,6 +1,7 @@ from paddle.v2.fluid import framework as framework from . 
import core import collections +import pdb __all__ = ['append_backward_ops'] @@ -15,7 +16,8 @@ def rename_arg(op_desc_list, old_name, new_name, begin_idx=None, end_idx=None): op_desc_list[i].rename_output(old_name, new_name) -def backward_impl(block, +def backward_impl(target, + block, target_block, no_grad_set, grad_info_map, @@ -29,8 +31,8 @@ def backward_impl(block, sub_block_idx = each_op.block_attr("sub_block") sub_block = program.block(sub_block_idx) grad_sub_block = program.create_block(parent_idx=sub_block_idx) - backward_impl(sub_block, grad_sub_block, no_grad_set, grad_info_map, - callback) + backward_impl(target, sub_block, grad_sub_block, no_grad_set, + grad_info_map, callback) grad_sub_block_list.append(grad_sub_block) grad_op_desc = core.get_grad_op_desc(each_op.desc, no_grad_set[block.idx], @@ -46,6 +48,7 @@ def backward_impl(block, for pos, op_desc in enumerate(grad_op_descs): for var_name in op_desc.input_arg_names(): if len(var_inputs[var_name]) > 1: + pdb.set_trace() pending_sum_ops.append((core.OpDesc( type="sum_op", inputs=var_inputs[var_name], @@ -55,7 +58,7 @@ def backward_impl(block, for var_name in op_desc.output_arg_names(): if len(var_inputs[var_name]) == 0: # it's the first time we get the variable - var_inputs[var_name] = var_name + var_inputs[var_name] = [var_name] else: if len(var_inputs[var_name] == 1): new_name = var_name + "@RENAME@" + \ @@ -73,8 +76,9 @@ def backward_impl(block, var_inputs[var_name].append(new_name) for var_name, inputs in var_inputs.iteritems(): if len(inputs) > 1: - pending_sum_ops.append((core.OpDesc( - type="sum_op", inputs=inputs, outputs=var_name, attrs={}), + pdb.set_trace() + pending_sum_ops.append((core.OpDesc("sum_op", {"X": inputs}, + {"Out": var_name}, {}), len(grad_op_descs))) # TODO: remove op in no grad set @@ -84,6 +88,7 @@ def backward_impl(block, # create new gradient variables in the target block desc for op_desc in grad_op_descs: for grad_var_name in op_desc.output_arg_names(): + grad_var_name = grad_var_name.encode("ascii") if target_block.desc.has_var( grad_var_name) or grad_var_name == core.get_empty_var_name( ): @@ -93,6 +98,16 @@ def backward_impl(block, continue grad_info_map[grad_to_var[grad_var_name]] = (grad_var_name, target_block) + if target_block.idx == 0: + grad_target_name = (target.name + "@GRAD") + target_block.desc.var(grad_target_name) + grad_op_descs.insert( + 0, + core.OpDesc(u"fill_constant", {}, { + u"Out": [unicode(grad_target_name, "ascii")] + }, {u"shape": (1), + u"value": 1.0, + u"dtype": core.DataType.FP32})) # insert backward operators to target_block for op_desc in grad_op_descs: target_block.desc.append_allocated_op(op_desc) @@ -118,18 +133,22 @@ def append_backward_ops(loss, parameter_list=None, no_grad_set=None): assert isinstance(loss, framework.Variable) if no_grad_set is None: + no_grad_set = dict() program = loss.block.program assert isinstance(program, framework.Program) - no_grad_set = list() for block in program.blocks: assert isinstance(block, framework.Block) + block_no_grad_set = set() for var in block.vars.itervalues(): assert isinstance(var, framework.Variable) if var.stop_gradient: - no_grad_set.append(var.name) - no_grad_set = set(no_grad_set) + block_no_grad_set.add(var.name) + no_grad_set[block.idx] = block_no_grad_set - param_grad_map = loss.block.program.append_backward(loss, no_grad_set) + grad_info_map = dict() + root_block = loss.block.program.block(0) + backward_impl(loss, root_block, root_block, no_grad_set, grad_info_map) + pdb.set_trace() if parameter_list is not 
None: parameters = parameter_list else: @@ -137,9 +156,9 @@ def append_backward_ops(loss, parameter_list=None, no_grad_set=None): parameters = [param.name for param in params] params_and_grads = [] for param in parameters: - if param not in param_grad_map: + if param not in grad_info_map: raise ValueError("param %s is not in map" % param) - grad_info = param_grad_map[param] + grad_info = grad_info_map[param] grad_block = loss.block.program.block(grad_info[1]) if not grad_block.has_var(grad_info[0]): raise ValueError("grad block[{0}] did not have grad var {1}".format( From 278ac7be5ceca8c157a581a864619ab310a79e9c Mon Sep 17 00:00:00 2001 From: fengjiayi Date: Wed, 20 Dec 2017 20:49:27 +0800 Subject: [PATCH 020/181] Compelete basic framework --- paddle/pybind/protobuf.cc | 8 +--- paddle/pybind/pybind.cc | 4 +- python/paddle/v2/fluid/backward.py | 68 ++++++++++++++++++++---------- 3 files changed, 49 insertions(+), 31 deletions(-) diff --git a/paddle/pybind/protobuf.cc b/paddle/pybind/protobuf.cc index 21b91b3825..da686d0b18 100644 --- a/paddle/pybind/protobuf.cc +++ b/paddle/pybind/protobuf.cc @@ -240,13 +240,7 @@ void BindOpDesc(py::module &m) { .value("BLOCK", AttrType::BLOCK); py::class_ op_desc(m, "OpDesc", ""); - op_desc - .def("__init__", - [](OpDescBind &self, const std::string &type, - const VariableNameMap &inputs, const VariableNameMap &outputs, - const AttributeMap &attrs) { - new (&self) OpDescBind(type, inputs, outputs, attrs); - }) + op_desc.def("__init__", [](OpDescBind &self) { new (&self) OpDescBind(); }) .def("type", &OpDescBind::Type) .def("set_type", &OpDescBind::SetType) .def("input", &OpDescBind::Input) diff --git a/paddle/pybind/pybind.cc b/paddle/pybind/pybind.cc index 8311f8827b..d84d5efbcf 100644 --- a/paddle/pybind/pybind.cc +++ b/paddle/pybind/pybind.cc @@ -285,8 +285,8 @@ All parameter, weight, gradient are variables in Paddle. m.def("get_grad_op_desc", [](const OpDescBind &op_desc, const std::unordered_set &no_grad_set, - std::unordered_map &grad_to_var, const std::vector &grad_sub_block) { + std::unordered_map grad_to_var; std::vector> grad_op_descs = framework::OpInfoMap::Instance() .Get(op_desc.Type()) @@ -297,7 +297,7 @@ All parameter, weight, gradient are variables in Paddle. 
grad_op_descs.begin(), grad_op_descs.end(), grad_op_desc_ptrs.begin(), [](std::unique_ptr &p) { return p.release(); }); - return grad_op_desc_ptrs; + return std::make_pair(grad_op_desc_ptrs, grad_to_var); }); m.def("prune", [](const ProgramDescBind &origin, const std::vector> &targets) { diff --git a/python/paddle/v2/fluid/backward.py b/python/paddle/v2/fluid/backward.py index 0600223732..b24e124e1e 100644 --- a/python/paddle/v2/fluid/backward.py +++ b/python/paddle/v2/fluid/backward.py @@ -6,7 +6,8 @@ import pdb __all__ = ['append_backward_ops'] -def rename_arg(op_desc_list, old_name, new_name, begin_idx=None, end_idx=None): +def _rename_arg_(op_desc_list, old_name, new_name, begin_idx=None, + end_idx=None): if begin_idx is None: begin_idx = 0 if end_idx is None: @@ -16,6 +17,21 @@ def rename_arg(op_desc_list, old_name, new_name, begin_idx=None, end_idx=None): op_desc_list[i].rename_output(old_name, new_name) +def _create_op_desc_(op_type, inputs, outputs, attrs): + op_desc = core.OpDesc() + op_desc.set_type(op_type) + for para, args in inputs.iteritems(): + op_desc.set_input(para, args) + for para, args in outputs.iteritems(): + op_desc.set_output(para, args) + for name, val in attrs.iteritems(): + if isinstance(val, framework.Block): + op_desc.set_block_attr(name, val.desc) + else: + op_desc.set_attr(name, val) + return op_desc + + def backward_impl(target, block, target_block, @@ -23,9 +39,9 @@ def backward_impl(target, grad_info_map, callback=None): grad_op_descs = [] - grad_to_var = {} + grad_to_var = dict() program = block.program - for each_op in block.ops: + for each_op in reversed(block.ops): grad_sub_block_list = [] if each_op.has_attr("sub_block"): sub_block_idx = each_op.block_attr("sub_block") @@ -34,10 +50,10 @@ def backward_impl(target, backward_impl(target, sub_block, grad_sub_block, no_grad_set, grad_info_map, callback) grad_sub_block_list.append(grad_sub_block) - grad_op_desc = core.get_grad_op_desc(each_op.desc, - no_grad_set[block.idx], - grad_to_var, grad_sub_block_list) + grad_op_desc, op_grad_to_var = core.get_grad_op_desc( + each_op.desc, no_grad_set[block.idx], grad_sub_block_list) grad_op_descs.append(grad_op_desc) + grad_to_var = dict(grad_to_var, **op_grad_to_var) # grad_op_descs = [[op1_g1, op1_g2], [op2_g], ...] # flatten grad_op_descs grad_op_descs = [op for sublist in grad_op_descs for op in sublist] # ????? 
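[Editor's note] Two design points in this pass follow directly from the chain rule: the backward pass is seeded by writing d(target)/d(target) = 1 into the target's gradient variable, and a variable consumed by several ops receives one gradient contribution per use, merged by an inserted sum op. A plain numpy check of both facts, with arbitrary values:

```python
import numpy as np

x = np.array(3.0)
loss = lambda x: x * x + 2.0 * x        # x is consumed by two ops

g_loss = 1.0                            # d(loss)/d(loss), the seeded value
g_x_from_mul = g_loss * 2.0 * x         # contribution through x * x
g_x_from_add = g_loss * 2.0             # contribution through 2 * x
g_x = g_x_from_mul + g_x_from_add       # what the inserted sum op computes

eps = 1e-6                              # numeric check of the summed gradient
numeric = (loss(x + eps) - loss(x - eps)) / (2 * eps)
assert np.isclose(g_x, numeric)
print(g_x, numeric)
```

This is why the code later in this patch inserts a `fill_constant` op with `value = 1.0` at position 0 of `grad_op_descs`.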
@@ -48,11 +64,10 @@ def backward_impl(target, for pos, op_desc in enumerate(grad_op_descs): for var_name in op_desc.input_arg_names(): if len(var_inputs[var_name]) > 1: - pdb.set_trace() - pending_sum_ops.append((core.OpDesc( - type="sum_op", + pending_sum_ops.append((_create_op_desc_( + op_type="sum_op", inputs=var_inputs[var_name], - output=[var_name], + outputs=[var_name], attrs={}), pos)) var_inputs[var_name] = [var_name] for var_name in op_desc.output_arg_names(): @@ -66,8 +81,8 @@ def backward_impl(target, var_rename_count[var_name] = var_rename_count[var_name] + 1 # rename original var_name var_inputs[var_name][0] = new_name - rename_arg(grad_op_descs, var_name, new_name, 0, pos) - rename_arg(pending_sum_ops, var_name, new_name) + _rename_arg_(grad_op_descs, var_name, new_name, 0, pos) + _rename_arg_(pending_sum_ops, var_name, new_name) new_name = var_name + "@RENAME@" + \ str(var_rename_count[var_name]) @@ -76,10 +91,11 @@ def backward_impl(target, var_inputs[var_name].append(new_name) for var_name, inputs in var_inputs.iteritems(): if len(inputs) > 1: - pdb.set_trace() - pending_sum_ops.append((core.OpDesc("sum_op", {"X": inputs}, - {"Out": var_name}, {}), - len(grad_op_descs))) + pending_sum_ops.append((_create_op_desc_( + op_type="sum_op", + inputs={"X": inputs}, + outputs={"Out": var_name}, + attrs={}), len(grad_op_descs))) # TODO: remove op in no grad set # 根据append的顺序可以看出pending_sum_ops一定是根据sum_op的插入位置排序的 @@ -103,15 +119,22 @@ def backward_impl(target, target_block.desc.var(grad_target_name) grad_op_descs.insert( 0, - core.OpDesc(u"fill_constant", {}, { - u"Out": [unicode(grad_target_name, "ascii")] - }, {u"shape": (1), - u"value": 1.0, - u"dtype": core.DataType.FP32})) + _create_op_desc_( + op_type="fill_constant", + inputs={}, + outputs={"Out": [grad_target_name]}, + attrs={ + "shape": [1], + "value": 1.0, + "dtype": core.DataType.FP32 + })) # insert backward operators to target_block for op_desc in grad_op_descs: + op_desc.infer_var_type(target_block.desc) + op_desc.infer_shape(target_block.desc) target_block.desc.append_allocated_op(op_desc) + pdb.set_trace() target_block.sync_with_cpp() @@ -147,6 +170,7 @@ def append_backward_ops(loss, parameter_list=None, no_grad_set=None): grad_info_map = dict() root_block = loss.block.program.block(0) + pdb.set_trace() backward_impl(loss, root_block, root_block, no_grad_set, grad_info_map) pdb.set_trace() if parameter_list is not None: @@ -159,7 +183,7 @@ def append_backward_ops(loss, parameter_list=None, no_grad_set=None): if param not in grad_info_map: raise ValueError("param %s is not in map" % param) grad_info = grad_info_map[param] - grad_block = loss.block.program.block(grad_info[1]) + grad_block = grad_info[1] if not grad_block.has_var(grad_info[0]): raise ValueError("grad block[{0}] did not have grad var {1}".format( grad_info[1], grad_info[0])) From 1a0fc5d8dcab7e3e28c0e3463e8b97e0d90b28b2 Mon Sep 17 00:00:00 2001 From: fengjiayi Date: Thu, 21 Dec 2017 15:43:47 +0800 Subject: [PATCH 021/181] Add the simple support of no_grad_set --- paddle/pybind/pybind.cc | 3 +- python/paddle/v2/fluid/backward.py | 71 +++++++++++++++++++++--------- 2 files changed, 51 insertions(+), 23 deletions(-) diff --git a/paddle/pybind/pybind.cc b/paddle/pybind/pybind.cc index d84d5efbcf..b453dfbf89 100644 --- a/paddle/pybind/pybind.cc +++ b/paddle/pybind/pybind.cc @@ -314,7 +314,8 @@ All parameter, weight, gradient are variables in Paddle. 
InferenceOptimize(*(origin.Proto()), &pruned_desc); return new ProgramDescBind(pruned_desc); }); - m.def("get_empty_var_name", []() { return framework::kEmptyVarName; }); + m.def("empty_var_name", []() { return framework::kEmptyVarName; }); + m.def("grad_var_suffix", []() { return framework::kGradVarSuffix; }); m.def_submodule( "var_names", "The module will return special predefined variable name in Paddle") diff --git a/python/paddle/v2/fluid/backward.py b/python/paddle/v2/fluid/backward.py index b24e124e1e..df2761d802 100644 --- a/python/paddle/v2/fluid/backward.py +++ b/python/paddle/v2/fluid/backward.py @@ -32,12 +32,27 @@ def _create_op_desc_(op_type, inputs, outputs, attrs): return op_desc -def backward_impl(target, - block, - target_block, - no_grad_set, - grad_info_map, - callback=None): +def _is_all_in_set_(cands, s): + for c in cands: + if not c in s: + return False + return True + + +def _strip_grad_suffix_(name): + return name[:name.find(core.grad_var_suffix())] + + +def _append_grad_suffix_(name): + return name + core.grad_var_suffix() + + +def _backward_impl_(target, + block, + target_block, + no_grad_set, + grad_info_map, + callback=None): grad_op_descs = [] grad_to_var = dict() program = block.program @@ -47,8 +62,8 @@ def backward_impl(target, sub_block_idx = each_op.block_attr("sub_block") sub_block = program.block(sub_block_idx) grad_sub_block = program.create_block(parent_idx=sub_block_idx) - backward_impl(target, sub_block, grad_sub_block, no_grad_set, - grad_info_map, callback) + _backward_impl_(target, sub_block, grad_sub_block, no_grad_set, + grad_info_map, callback) grad_sub_block_list.append(grad_sub_block) grad_op_desc, op_grad_to_var = core.get_grad_op_desc( each_op.desc, no_grad_set[block.idx], grad_sub_block_list) @@ -61,14 +76,14 @@ def backward_impl(target, pending_sum_ops = [] var_rename_count = collections.defaultdict(int) var_inputs = collections.defaultdict(list) - for pos, op_desc in enumerate(grad_op_descs): + for idx, op_desc in enumerate(grad_op_descs): for var_name in op_desc.input_arg_names(): if len(var_inputs[var_name]) > 1: pending_sum_ops.append((_create_op_desc_( op_type="sum_op", inputs=var_inputs[var_name], outputs=[var_name], - attrs={}), pos)) + attrs={}), idx)) var_inputs[var_name] = [var_name] for var_name in op_desc.output_arg_names(): if len(var_inputs[var_name]) == 0: @@ -81,7 +96,7 @@ def backward_impl(target, var_rename_count[var_name] = var_rename_count[var_name] + 1 # rename original var_name var_inputs[var_name][0] = new_name - _rename_arg_(grad_op_descs, var_name, new_name, 0, pos) + _rename_arg_(grad_op_descs, var_name, new_name, 0, idx) _rename_arg_(pending_sum_ops, var_name, new_name) new_name = var_name + "@RENAME@" + \ @@ -96,18 +111,31 @@ def backward_impl(target, inputs={"X": inputs}, outputs={"Out": var_name}, attrs={}), len(grad_op_descs))) - # TODO: remove op in no grad set - # 根据append的顺序可以看出pending_sum_ops一定是根据sum_op的插入位置排序的 for p in reversed(pending_sum_ops): grad_op_descs.insert(p[1], p[0]) + # Remove ops whose outputs are all in no_grad_set + grad_op_descs = filter( + lambda op_desc: not _is_all_in_set_(op_desc.output_arg_names(), no_grad_set[block.idx]), + grad_op_descs) + # Insert fill_zeros_like_op + to_insert = [] + for idx, op_desc in enumerate(grad_op_descs): + for arg in op_desc.input_arg_names(): + if arg in no_grad_set[block.idx]: + to_insert.append((arg, idx)) + for ele in reversed(to_insert): + arg = ele[0] + fill_zeros_like_op = _create_op_desc_( + "fill_zeros_like", {"X": [_strip_grad_suffix_(arg)]}, 
{"Y": [arg]}, + {}) + grad_op_descs.insert(ele[1], fill_zeros_like_op) # create new gradient variables in the target block desc for op_desc in grad_op_descs: for grad_var_name in op_desc.output_arg_names(): grad_var_name = grad_var_name.encode("ascii") if target_block.desc.has_var( - grad_var_name) or grad_var_name == core.get_empty_var_name( - ): + grad_var_name) or grad_var_name == core.empty_var_name(): continue target_block.desc.var(grad_var_name) if not grad_to_var.has_key(grad_var_name): @@ -115,8 +143,8 @@ def backward_impl(target, grad_info_map[grad_to_var[grad_var_name]] = (grad_var_name, target_block) if target_block.idx == 0: - grad_target_name = (target.name + "@GRAD") - target_block.desc.var(grad_target_name) + grad_target_name = _append_grad_suffix_(target.name) + target_block.desc.var(grad_target_name.encode("ascii")) grad_op_descs.insert( 0, _create_op_desc_( @@ -134,7 +162,6 @@ def backward_impl(target, op_desc.infer_shape(target_block.desc) target_block.desc.append_allocated_op(op_desc) - pdb.set_trace() target_block.sync_with_cpp() @@ -165,14 +192,14 @@ def append_backward_ops(loss, parameter_list=None, no_grad_set=None): for var in block.vars.itervalues(): assert isinstance(var, framework.Variable) if var.stop_gradient: - block_no_grad_set.add(var.name) + block_no_grad_set.add(_append_grad_suffix_(var.name)) no_grad_set[block.idx] = block_no_grad_set grad_info_map = dict() root_block = loss.block.program.block(0) - pdb.set_trace() - backward_impl(loss, root_block, root_block, no_grad_set, grad_info_map) - pdb.set_trace() + + _backward_impl_(loss, root_block, root_block, no_grad_set, grad_info_map) + if parameter_list is not None: parameters = parameter_list else: From e902c36cdf5bf2b2c05a41de6f30b9b7c84071b8 Mon Sep 17 00:00:00 2001 From: chengduoZH Date: Thu, 21 Dec 2017 12:42:48 +0800 Subject: [PATCH 022/181] add conv2d_python doc --- python/paddle/v2/fluid/layers/nn.py | 66 ++++++++++++++++++++++++++--- 1 file changed, 61 insertions(+), 5 deletions(-) diff --git a/python/paddle/v2/fluid/layers/nn.py b/python/paddle/v2/fluid/layers/nn.py index 1db63fbfe8..f49a958a0f 100644 --- a/python/paddle/v2/fluid/layers/nn.py +++ b/python/paddle/v2/fluid/layers/nn.py @@ -481,11 +481,67 @@ def conv2d(input, act=None, name=None): """ - This function creates the op for a 2-dimensional Convolution. - This is performed using the parameters of filters(size, dimensionality etc) - , stride and other configurations for a Convolution operation. - This funciton can also append an activation on top of the - conv-2d output, if mentioned in the input parameters. + **Convlution2D Layer** + + The convolution2D layer calculates the output based on the input, filter + and strides, paddings, dilations, groups parameters. Input(Input) and Output(Output) + are in NCHW format. Where N is batch size, C is the number of channels, H is the height + of the feature, and W is the width of the feature. + The details of convolution layer, please refer UFLDL's `convolution, + `_ . + If bias_attr and activation type are provided, bias is added to the output of the convolution, + and the corresponding activation function is applied to the final result. + For each input :math:`X`, the equation is: + + .. math:: + + Out = \sigma (W\ast X + b) + + In the above equation: + + * :math:`X`: Input value, a tensor with NCHW format. + * :math:`W`: Filter value, a tensor with MCHW format. + * :math:`b`: Bias, . + * :math:\sigma : Activation function. 
+ * :math:`Out`: Output value, the shape of :math:`Out` and :math:`X` may be different. + + Example: + + Input: + Input shape: $(N, C_{in}, H_{in}, W_{in})$ + Filter shape: $(C_{out}, C_{in}, H_f, W_f)$ + Output: + Output shape: $(N, C_{out}, H_{out}, W_{out})$ + Where + $$ + H_{out}= \\frac{(H_{in} + 2 * paddings[0] - (dilations[0] * (H_f - 1) + 1))}{strides[0]}+ 1 \\ + W_{out}= \\frac{(W_{in} + 2 * paddings[1] - (dilations[1] * (W_f - 1) + 1))}{strides[1]}+ 1 + $$ + + All the input variables are passed in as local variables to the LayerHelper + constructor. + + Args: + input(Variable): Input tensors. The format of input tensor is NCHW. + num_filters(int): Number of filters + filter_size(list/int): Filter size of Conv2d Layer + stride(list/int, optional): Strides(h_s, w_s) of Conv2d Layer. Default: 1 + padding(list/int, optional): Paddings(h_pad, w_pad) of Conv2d Layer. Default: 0 + groups(int, optional): The groups number of the Conv2d Layer. Default: 1 + param_attr(ParamAttr): The parameters to the Conv2d Layer. Default: None + bias_attr(ParamAttr): Bias parameter for the Conv2d layer. Default: None + act(str): Activation type. Default: None + name(str): Name/alias of the function + + Returns: + Variable: The tensor variable storing the convolution and \ + non-linearity activation result. + + Examples: + .. code-block:: python + + data = fluid.layers.data(name='data', shape=[3,32, 32], dtype='float32') + conv2d = fluid.layers.conv2d(input=data, num_filters=2, filter_size=3, act="relu") """ if stride is None: From 0b080a42da85d67d7a900a9b23bdcea6cfcbc01c Mon Sep 17 00:00:00 2001 From: tensor-tang Date: Fri, 22 Dec 2017 13:36:59 +0800 Subject: [PATCH 023/181] add recurrent layer header --- paddle/gserver/layers/RecurrentLayer.cpp | 106 +----------------- paddle/gserver/layers/RecurrentLayer.h | 130 +++++++++++++++++++++++ 2 files changed, 131 insertions(+), 105 deletions(-) create mode 100644 paddle/gserver/layers/RecurrentLayer.h diff --git a/paddle/gserver/layers/RecurrentLayer.cpp b/paddle/gserver/layers/RecurrentLayer.cpp index e4c2b483d2..285b11b5a0 100644 --- a/paddle/gserver/layers/RecurrentLayer.cpp +++ b/paddle/gserver/layers/RecurrentLayer.cpp @@ -12,6 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ +#include "RecurrentLayer.h" #include #include "Layer.h" #include "SequenceToBatch.h" @@ -21,110 +22,6 @@ DEFINE_bool(rnn_use_batch, false, "Using the batch method for calculation."); namespace paddle { -/** - * @brief RecurrentLayer takes 1 input layer. The output size is the same with - * input layer. - * For each sequence [start, end] it performs the following computation: - * \f[ - * out_{i} = act(in_{i}) \ \ \text{for} \ i = start \\ - * out_{i} = act(in_{i} + out_{i-1} * W) \ \ \text{for} \ start < i <= end - * - * \f] - * If reversed is true, the order is reversed: - * \f[ - * out_{i} = act(in_{i}) \ \ \text{for} \ i = end \\ - * out_{i} = act(in_{i} + out_{i+1} * W) \ \ \text{for} \ start <= i < end - * \f] - * There are two methods to calculate rnn. One way is to compute rnn one - * sequence by one sequence. The other way is to reorganize the input - * into batches, then compute rnn one batch by one batch. Users can select - * them by rnn_use_batch flag. 
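[Editor's note] The output-shape formula in the conv2d docstring above can be checked with a small standalone helper (the function name is illustrative, not part of the fluid API):

```python
# H_out = (H_in + 2*padding - (dilation*(H_f - 1) + 1)) // stride + 1,
# and likewise for W_out, as stated in the docstring.
def conv2d_out_dim(in_dim, filter_dim, padding, stride, dilation=1):
    return (in_dim + 2 * padding - (dilation * (filter_dim - 1) + 1)) // stride + 1

# the docstring example: 32x32 input, 3x3 filter, default stride 1, padding 0
print(conv2d_out_dim(32, 3, padding=0, stride=1))  # -> 30
```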
- */ - -class RecurrentLayer : public Layer { -public: - explicit RecurrentLayer(const LayerConfig& config) : Layer(config) {} - - bool init(const LayerMap& layerMap, - const ParameterMap& parameterMap) override; - - void forward(PassType passType) override; - - void backward(const UpdateCallback& callback) override; - - void resetState() override; - - void setState(LayerStatePtr state) override; - - LayerStatePtr getState() override; - -protected: - /** - * @brief If user do not set --rnn_use_batch=true, it will - * compute rnn forward one sequence by one sequence in default. - * @param batchSize Total words number of all samples in this batch. - * @param numSequences The sample number. - * @param starts Each start position of each samples. - */ - void forwardSequence(int batchSize, size_t numSequences, const int* starts); - /** - * @brief Compute rnn forward by one sequence. - * @param start The start position of this sequence (or sample). - * @param length The length of this sequence (or sample), namely the words - * number of this sequence. - */ - void forwardOneSequence(int start, int length); - /** - * @brief Compute rnn backward one sequence by onesequence. - * @param batchSize Total words number of all samples in this batch. - * @param numSequences The sample number. - * @param starts Each start position of each samples. - */ - void backwardSequence(int batchSize, size_t numSequences, const int* starts); - /** - * @brief Compute rnn backward by one sequence. - * @param start The start position of this sequence (or sample). - * @param length The length of this sequence (or sample), namely the words - * number of this sequence. - */ - void backwardOneSequence(int start, int length); - - /** - * @brief Reorganize input into batches and compute rnn forward batch - * by batch. It will convert batch shape to sequence after finishing forward. - * The batch info can refer to SequenceToBatch class. - * @param batchSize Total words number of all samples in this batch. - * @param numSequences The sample number. - * @param starts Each start position of each samples. - */ - void forwardBatch(int batchSize, size_t numSequences, const int* starts); - - /** - * @brief Reorganize input into batches and compute rnn forward batch - * by batch. - * @param batchSize Total words number of all samples in this batch. - * @param numSequences The sample number. - * @param starts Each start position of each samples. - */ - void backwardBatch(int batchSize, size_t numSequences, const int* starts); - -protected: - std::unique_ptr weight_; - std::unique_ptr bias_; - - /// frameOutput_[i] is used to hold the i-th sample of output_ - std::vector frameOutput_; - MatrixPtr prevOutput_; - /// Whether compute rnn by reverse. - bool reversed_; - /// If compute batch by batch, batchValue_ will be used to save the - /// reorganized input value. - std::unique_ptr batchValue_; - /// If compute batch by batch, batchGrad_ will be used to save the - /// gradient with respect to reorganized input value. 
- std::unique_ptr batchGrad_; -}; - REGISTER_LAYER(recurrent, RecurrentLayer); bool RecurrentLayer::init(const LayerMap& layerMap, @@ -260,7 +157,6 @@ void RecurrentLayer::backward(const UpdateCallback& callback) { bias_->getWGrad()->collectBias(*output_.grad, 1); bias_->getParameterPtr()->incUpdate(callback); } - weight_->getParameterPtr()->incUpdate(callback); } diff --git a/paddle/gserver/layers/RecurrentLayer.h b/paddle/gserver/layers/RecurrentLayer.h new file mode 100644 index 0000000000..f40dbe150f --- /dev/null +++ b/paddle/gserver/layers/RecurrentLayer.h @@ -0,0 +1,130 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ +#pragma once +#include +#include "Layer.h" +#include "SequenceToBatch.h" +#include "paddle/utils/Stat.h" + +namespace paddle { + +/** + * @brief RecurrentLayer takes 1 input layer. The output size is the same with + * input layer. + * For each sequence [start, end] it performs the following computation: + * \f[ + * out_{i} = act(in_{i}) \ \ \text{for} \ i = start \\ + * out_{i} = act(in_{i} + out_{i-1} * W) \ \ \text{for} \ start < i <= end + * + * \f] + * If reversed is true, the order is reversed: + * \f[ + * out_{i} = act(in_{i}) \ \ \text{for} \ i = end \\ + * out_{i} = act(in_{i} + out_{i+1} * W) \ \ \text{for} \ start <= i < end + * \f] + * There are two methods to calculate rnn. One way is to compute rnn one + * sequence by one sequence. The other way is to reorganize the input + * into batches, then compute rnn one batch by one batch. Users can select + * them by rnn_use_batch flag. + */ + +class RecurrentLayer : public Layer { +public: + explicit RecurrentLayer(const LayerConfig& config) : Layer(config) {} + + bool init(const LayerMap& layerMap, + const ParameterMap& parameterMap) override; + + void forward(PassType passType) override; + + void backward(const UpdateCallback& callback) override; + + void resetState() override; + + void setState(LayerStatePtr state) override; + + LayerStatePtr getState() override; + +protected: + /** + * @brief If user do not set --rnn_use_batch=true, it will + * compute rnn forward one sequence by one sequence in default. + * @param batchSize Total words number of all samples in this batch. + * @param numSequences The sample number. + * @param starts Each start position of each samples. + */ + void forwardSequence(int batchSize, size_t numSequences, const int* starts); + /** + * @brief Compute rnn forward by one sequence. + * @param start The start position of this sequence (or sample). + * @param length The length of this sequence (or sample), namely the words + * number of this sequence. + */ + void forwardOneSequence(int start, int length); + /** + * @brief Compute rnn backward one sequence by onesequence. + * @param batchSize Total words number of all samples in this batch. + * @param numSequences The sample number. + * @param starts Each start position of each samples. 
+ */ + void backwardSequence(int batchSize, size_t numSequences, const int* starts); + /** + * @brief Compute rnn backward by one sequence. + * @param start The start position of this sequence (or sample). + * @param length The length of this sequence (or sample), namely the words + * number of this sequence. + */ + void backwardOneSequence(int start, int length); + + /** + * @brief Reorganize input into batches and compute rnn forward batch + * by batch. It will convert batch shape to sequence after finishing forward. + * The batch info can refer to SequenceToBatch class. + * @param batchSize Total words number of all samples in this batch. + * @param numSequences The sample number. + * @param starts Each start position of each samples. + */ + virtual void forwardBatch(int batchSize, + size_t numSequences, + const int* starts); + + /** + * @brief Reorganize input into batches and compute rnn forward batch + * by batch. + * @param batchSize Total words number of all samples in this batch. + * @param numSequences The sample number. + * @param starts Each start position of each samples. + */ + virtual void backwardBatch(int batchSize, + size_t numSequences, + const int* starts); + +protected: + std::unique_ptr weight_; + std::unique_ptr bias_; + + /// frameOutput_[i] is used to hold the i-th sample of output_ + std::vector frameOutput_; + MatrixPtr prevOutput_; + /// Whether compute rnn by reverse. + bool reversed_; + /// If compute batch by batch, batchValue_ will be used to save the + /// reorganized input value. + std::unique_ptr batchValue_; + /// If compute batch by batch, batchGrad_ will be used to save the + /// gradient with respect to reorganized input value. + std::unique_ptr batchGrad_; +}; + +} // namespace paddle From 82091035514c0ddeae2c18ff5f523a2647d59948 Mon Sep 17 00:00:00 2001 From: tensor-tang Date: Fri, 22 Dec 2017 13:43:25 +0800 Subject: [PATCH 024/181] follow comments and refine code --- paddle/gserver/layers/MKLPackedGemm.h | 95 --------- .../layers/MKLPackedRecurrentLayer.cpp | 191 ++---------------- .../gserver/layers/MKLPackedRecurrentLayer.h | 87 ++------ paddle/gserver/layers/MKLPackedWeight.h | 100 +++++++++ 4 files changed, 125 insertions(+), 348 deletions(-) delete mode 100644 paddle/gserver/layers/MKLPackedGemm.h create mode 100644 paddle/gserver/layers/MKLPackedWeight.h diff --git a/paddle/gserver/layers/MKLPackedGemm.h b/paddle/gserver/layers/MKLPackedGemm.h deleted file mode 100644 index 91e2515e32..0000000000 --- a/paddle/gserver/layers/MKLPackedGemm.h +++ /dev/null @@ -1,95 +0,0 @@ -/* Copyright (c) 2017 PaddlePaddle Authors. All Rights Reserve. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#pragma once - -#include "paddle/math/MathFunctions.h" -#include "paddle/math/Matrix.h" - -namespace paddle { - -class MKLPackedGemm { -protected: - real* weightPacked_; - real* weightTPacked_; - size_t weightHeight_; - size_t weightWidth_; - -public: - explicit MKLPackedGemm(MatrixPtr weight) { - weightHeight_ = weight->getHeight(); - weightWidth_ = weight->getWidth(); - weightPacked_ = - cblas_sgemm_alloc(CblasBMatrix, 1, weightWidth_, weightHeight_); - weightTPacked_ = - cblas_sgemm_alloc(CblasBMatrix, 1, weightWidth_, weightHeight_); - cblas_sgemm_pack(CblasRowMajor, - CblasBMatrix, - CblasNoTrans, - 1, - weightWidth_, - weightHeight_, - 1.0, - weight->getData(), - weightWidth_, - weightPacked_); - cblas_sgemm_pack(CblasRowMajor, - CblasBMatrix, - CblasTrans, - 1, - weightWidth_, - weightHeight_, - 1.0, - weight->getData(), - weightWidth_, - weightTPacked_); - } - void compute(MatrixPtr batch2, MatrixPtr batch1, bool transW = false) { - if (transW) { - cblas_sgemm_compute(CblasRowMajor, - CblasNoTrans, - CblasPacked, - batch2->getHeight(), - weightWidth_, - weightHeight_, - batch1->getData(), - weightHeight_, - weightTPacked_, - weightWidth_, - 1, - batch2->getData(), - weightWidth_); - } else { - cblas_sgemm_compute(CblasRowMajor, - CblasNoTrans, - CblasPacked, - batch2->getHeight(), - weightWidth_, - weightHeight_, - batch1->getData(), - weightHeight_, - weightPacked_, - weightWidth_, - 1, - batch2->getData(), - weightWidth_); - } - } - ~MKLPackedGemm() { - cblas_sgemm_free(weightPacked_); - cblas_sgemm_free(weightTPacked_); - } -}; - -} // namespace paddle diff --git a/paddle/gserver/layers/MKLPackedRecurrentLayer.cpp b/paddle/gserver/layers/MKLPackedRecurrentLayer.cpp index 6f455af91e..bd3c4ceb5e 100644 --- a/paddle/gserver/layers/MKLPackedRecurrentLayer.cpp +++ b/paddle/gserver/layers/MKLPackedRecurrentLayer.cpp @@ -20,188 +20,21 @@ REGISTER_LAYER(mkl_packed_recurrent, MKLPackedRecurrentLayer); bool MKLPackedRecurrentLayer::init(const LayerMap& layerMap, const ParameterMap& parameterMap) { - if (!Layer::init(layerMap, parameterMap)) return false; - CHECK_EQ(1U, inputLayers_.size()); - CHECK_EQ(1U, parameters_.size()); - CHECK_EQ(getSize() * getSize(), parameters_[0]->getSize()); - weight_.reset(new Weight(getSize(), getSize(), parameters_[0])); - if (biasParameter_.get() != NULL) { - bias_.reset(new Weight(1, getSize(), biasParameter_)); + if (!RecurrentLayer::init(layerMap, parameterMap)) return false; + packed_weight_.reset(new MKLPackedWeight(weight_->getW())); + packed_weight_->pack(); + if (needGradient_) { + packed_weightT_.reset(new MKLPackedWeight(weight_->getW(), true)); + packed_weightT_->pack(); } - reversed_ = config_.reversed(); - - sgemm_packed_.reset(new MKLPackedGemm(weight_->getW())); - return true; } -void MKLPackedRecurrentLayer::resetState() { - CHECK(!reversed_) << "state is not allowed for reversed recurrent layer"; - Matrix::resizeOrCreate( - prevOutput_, 1, getSize(), /* trans= */ false, useGpu_); - prevOutput_->zeroMem(); -} - -void MKLPackedRecurrentLayer::setState(LayerStatePtr state) { - CHECK(state->value.size() == 1) << "one matrix is expected for RNN state"; - prevOutput_->copyFrom(*(state->value[0])); -} - -LayerStatePtr MKLPackedRecurrentLayer::getState() { - LayerStatePtr res = std::make_shared(); - res->value.push_back(prevOutput_->clone(0, 0, useGpu_)); - res->value[0]->copyFrom(*prevOutput_); - return res; -} - -void MKLPackedRecurrentLayer::forward(PassType passType) { - REGISTER_TIMER_INFO("RecurrentFwTimer", getName().c_str()); - 
Layer::forward(passType); - const Argument& input = getInput(0); - CHECK(input.sequenceStartPositions); - int batchSize = input.getBatchSize(); - size_t numSequences = input.getNumSequences(); - resetOutput(batchSize, getSize()); - CHECK_EQ(getSize(), input.value->getWidth()); - const int* starts = input.sequenceStartPositions->getData(false); - CHECK_EQ(starts[numSequences], batchSize); - - output_.value->assign(*input.value); - if (bias_) { - output_.value->addBias(*bias_->getW(), 1); - } - if (!FLAGS_rnn_use_batch) { - forwardSequence(batchSize, numSequences, starts); - } else { - forwardBatch(batchSize, numSequences, starts); - } -} - -void MKLPackedRecurrentLayer::forwardSequence(int batchSize, - size_t numSequences, - const int* starts) { - REGISTER_TIMER_INFO("RecurrentFwSequence", getName().c_str()); - - frameOutput_.reserve(batchSize); - for (int i = frameOutput_.size(); i < batchSize; ++i) { - Argument arg; - arg.value = Matrix::create(nullptr, - /* height= */ 1, - getSize(), - /* trans= */ false, - useGpu_); - arg.grad = Matrix::create(nullptr, - /* height= */ 1, - getSize(), - /* trans= */ false, - useGpu_); - frameOutput_.push_back(arg); - } - - for (int i = 0; i < batchSize; ++i) { - frameOutput_[i].value->setData(output_.value->getData() + i * getSize()); - } - - for (size_t i = 0; i < numSequences; ++i) { - forwardOneSequence(starts[i], starts[i + 1] - starts[i]); - } -} - -void MKLPackedRecurrentLayer::forwardOneSequence(int start, int length) { - if (!reversed_) { - if (prevOutput_) { - frameOutput_[start].value->mul(*prevOutput_, *weight_->getW(), 1, 1); - } - activation_->forward(frameOutput_[start]).check(); - - for (int i = 1; i < length; ++i) { - frameOutput_[start + i].value->mul( - *frameOutput_[start + i - 1].value, *weight_->getW(), 1, 1); - activation_->forward(frameOutput_[start + i]).check(); - } - if (prevOutput_) { - prevOutput_->assign(*frameOutput_[start + length - 1].value); - } - } else { - activation_->forward(frameOutput_[start + length - 1]).check(); - for (int i = length - 2; i >= 0; --i) { - frameOutput_[start + i].value->mul( - *frameOutput_[start + i + 1].value, *weight_->getW(), 1, 1); - activation_->forward(frameOutput_[start + i]).check(); - } - } -} - void MKLPackedRecurrentLayer::backward(const UpdateCallback& callback) { - REGISTER_TIMER_INFO("RecurrentBwTimer", getName().c_str()); - const Argument& input = getInput(0); - CHECK(input.sequenceStartPositions); - int batchSize = input.getBatchSize(); - const int* starts = input.sequenceStartPositions->getData(false); - size_t numSequences = input.getNumSequences(); - - if (!FLAGS_rnn_use_batch) { - backwardSequence(batchSize, numSequences, starts); - } else { - backwardBatch(batchSize, numSequences, starts); - } - - if (input.grad) { - input.grad->add(*output_.grad); - } - - if (bias_ && bias_->getWGrad()) { - bias_->getWGrad()->collectBias(*output_.grad, 1); - bias_->getParameterPtr()->incUpdate(callback); - } - - weight_->getParameterPtr()->incUpdate(callback); - sgemm_packed_.reset(new MKLPackedGemm(weight_->getW())); -} - -void MKLPackedRecurrentLayer::backwardSequence(int batchSize, - size_t numSequences, - const int* starts) { - REGISTER_TIMER_INFO("RecurrentBwSequence", getName().c_str()); - for (int i = 0; i < batchSize; ++i) { - frameOutput_[i].grad->setData(output_.grad->getData() + i * getSize()); - } - - for (size_t i = 0; i < numSequences; ++i) { - backwardOneSequence(starts[i], starts[i + 1] - starts[i]); - } -} - -void MKLPackedRecurrentLayer::backwardOneSequence(int start, int 
length) { - MatrixPtr weightT = weight_->getW()->getTranspose(); - if (!reversed_) { - for (int i = length - 1; i > 0; --i) { - activation_->backward(frameOutput_[start + i]).check(); - frameOutput_[start + i - 1].grad->mul( - *frameOutput_[start + i].grad, *weightT, 1, 1); - } - activation_->backward(frameOutput_[start]).check(); - if (weight_->getWGrad()) { - weight_->getWGrad()->mul( - *output_.value->subMatrix(start, length - 1)->getTranspose(), - *output_.grad->subMatrix(start + 1, length - 1), - 1, - 1); - } - } else { - for (int i = 0; i < length - 1; ++i) { - activation_->backward(frameOutput_[start + i]).check(); - frameOutput_[start + i + 1].grad->mul( - *frameOutput_[start + i].grad, *weightT, 1, 1); - } - activation_->backward(frameOutput_[start + length - 1]).check(); - if (weight_->getWGrad()) { - weight_->getWGrad()->mul( - *output_.value->subMatrix(start + 1, length - 1)->getTranspose(), - *output_.grad->subMatrix(start, length - 1), - 1, - 1); - } + RecurrentLayer::backward(callback); + packed_weight_->pack(); + if (needGradient_) { + packed_weightT_->pack(); } } @@ -227,7 +60,7 @@ void MKLPackedRecurrentLayer::forwardBatch(int batchSize, batchValue_->getBatchValue(n - 1, batch2->getHeight()); // batch2->mul(*batch1, *weight_->getW(), 1, 1); - sgemm_packed_->compute(batch2, batch1); + packed_weight_->compute(batch2, batch1); } #pragma omp parallel for collapse(2) @@ -272,7 +105,7 @@ void MKLPackedRecurrentLayer::backwardBatch(int batchSize, if (n != 0) { batch1 = batchGrad_->getBatchValue(n - 1, batch2->getHeight()); // batch1->mul(*batch2, *weightT, 1, 1); - sgemm_packed_->compute(batch1, batch2, true); + packed_weightT_->compute(batch1, batch2); } if (backwardByBatch && weight_->getWGrad()) { diff --git a/paddle/gserver/layers/MKLPackedRecurrentLayer.h b/paddle/gserver/layers/MKLPackedRecurrentLayer.h index b8727e0ff3..ba6487b11e 100644 --- a/paddle/gserver/layers/MKLPackedRecurrentLayer.h +++ b/paddle/gserver/layers/MKLPackedRecurrentLayer.h @@ -16,7 +16,8 @@ limitations under the License. */ #include #include "Layer.h" -#include "MKLPackedGemm.h" +#include "MKLPackedWeight.h" +#include "RecurrentLayer.h" #include "SequenceToBatch.h" #include "paddle/utils/Stat.h" @@ -45,90 +46,28 @@ namespace paddle { * them by rnn_use_batch flag. */ -class MKLPackedRecurrentLayer : public Layer { +class MKLPackedRecurrentLayer : public RecurrentLayer { public: - explicit MKLPackedRecurrentLayer(const LayerConfig& config) : Layer(config) {} + explicit MKLPackedRecurrentLayer(const LayerConfig& config) + : RecurrentLayer(config) {} bool init(const LayerMap& layerMap, const ParameterMap& parameterMap) override; - void forward(PassType passType) override; - void backward(const UpdateCallback& callback) override; - void resetState() override; - - void setState(LayerStatePtr state) override; - - LayerStatePtr getState() override; - protected: - /** - * @brief If user do not set --rnn_use_batch=true, it will - * compute rnn forward one sequence by one sequence in default. - * @param batchSize Total words number of all samples in this batch. - * @param numSequences The sample number. - * @param starts Each start position of each samples. - */ - void forwardSequence(int batchSize, size_t numSequences, const int* starts); - /** - * @brief Compute rnn forward by one sequence. - * @param start The start position of this sequence (or sample). - * @param length The length of this sequence (or sample), namely the words - * number of this sequence. 
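In all of these methods, `starts` is an offsets array in the CSR style: sample i occupies rows `starts[i]` to `starts[i+1]` of the flattened input, and `starts[numSequences]` equals `batchSize`, the total number of time steps. The batch method regroups step n of every still-active sequence into one matrix, so each recurrence step becomes a single large GEMM instead of many small ones. A rough NumPy sketch of that regrouping (a simplification of what SequenceToBatch does; it ignores the length-sorting the real class performs):

import numpy as np

starts = [0, 3, 5, 9]                                 # three sequences: lengths 3, 2, 4
x = np.arange(9 * 2, dtype='float32').reshape(9, 2)   # 9 time steps in total
lengths = np.diff(starts)

for n in range(lengths.max()):
    # the n-th step of every sequence that is long enough to have one
    rows = [starts[i] + n for i in range(len(lengths)) if lengths[i] > n]
    step = x[rows]                                    # one GEMM per step over all sequences
    print((n, rows, step.shape))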
- */ - void forwardOneSequence(int start, int length); - /** - * @brief Compute rnn backward one sequence by onesequence. - * @param batchSize Total words number of all samples in this batch. - * @param numSequences The sample number. - * @param starts Each start position of each samples. - */ - void backwardSequence(int batchSize, size_t numSequences, const int* starts); - /** - * @brief Compute rnn backward by one sequence. - * @param start The start position of this sequence (or sample). - * @param length The length of this sequence (or sample), namely the words - * number of this sequence. - */ - void backwardOneSequence(int start, int length); + void forwardBatch(int batchSize, + size_t numSequences, + const int* starts) override; - /** - * @brief Reorganize input into batches and compute rnn forward batch - * by batch. It will convert batch shape to sequence after finishing forward. - * The batch info can refer to SequenceToBatch class. - * @param batchSize Total words number of all samples in this batch. - * @param numSequences The sample number. - * @param starts Each start position of each samples. - */ - void forwardBatch(int batchSize, size_t numSequences, const int* starts); - - /** - * @brief Reorganize input into batches and compute rnn forward batch - * by batch. - * @param batchSize Total words number of all samples in this batch. - * @param numSequences The sample number. - * @param starts Each start position of each samples. - */ - void backwardBatch(int batchSize, size_t numSequences, const int* starts); + void backwardBatch(int batchSize, + size_t numSequences, + const int* starts) override; protected: - std::unique_ptr weight_; - std::unique_ptr bias_; - - /// frameOutput_[i] is used to hold the i-th sample of output_ - std::vector frameOutput_; - MatrixPtr prevOutput_; - /// Whether compute rnn by reverse. - bool reversed_; - /// If compute batch by batch, batchValue_ will be used to save the - /// reorganized input value. - std::unique_ptr batchValue_; - /// If compute batch by batch, batchGrad_ will be used to save the - /// gradient with respect to reorganized input value. - std::unique_ptr batchGrad_; - - std::unique_ptr sgemm_packed_; + std::unique_ptr packed_weight_; + std::unique_ptr packed_weightT_; }; } // namespace paddle diff --git a/paddle/gserver/layers/MKLPackedWeight.h b/paddle/gserver/layers/MKLPackedWeight.h new file mode 100644 index 0000000000..a8dcfd561b --- /dev/null +++ b/paddle/gserver/layers/MKLPackedWeight.h @@ -0,0 +1,100 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
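MKLPackedWeight, defined in the new header below, keeps a single packed buffer per instance: the transW flag decides at pack time whether the weight or its transpose is packed, which is why the recurrent layer holds one instance for the forward product and a second one for the backward product. In plain matrix terms, compute(dst, src) accumulates the following (a NumPy stand-in under the same shape conventions; the class name here is illustrative, not part of the patch):

import numpy as np

class PackedWeightSketch(object):
    """Pack once, multiply many times -- the shape of the MKL API below."""
    def __init__(self, W, trans=False):
        # stands in for cblas_sgemm_pack with CblasTrans/CblasNoTrans
        self.packed = np.ascontiguousarray(W.T if trans else W)
    def compute(self, dst, src):
        # stands in for cblas_sgemm_compute with beta = 1
        dst += src.dot(self.packed)

W = np.random.randn(4, 4).astype('float32')
fwd, bwd = PackedWeightSketch(W), PackedWeightSketch(W, trans=True)
g = np.zeros((2, 4), dtype='float32')
bwd.compute(g, np.ones((2, 4), dtype='float32'))
assert np.allclose(g, np.ones((2, 4)).dot(W.T))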
*/ + +#pragma once + +#include "paddle/math/MathFunctions.h" +#include "paddle/parameter/Parameter.h" +#include "paddle/parameter/Weight.h" + +namespace paddle { + +class MKLPackedWeight { +protected: + real *weight_; + real *packedWeight_; + size_t height_; + size_t width_; + bool transW_; + +public: + MKLPackedWeight(MatrixPtr weight, bool transW = false) { + packedWeight_ = nullptr; + weight_ = weight->getData(); + height_ = weight->getHeight(); + width_ = weight->getWidth(); + transW_ = transW; + } + + ~MKLPackedWeight() { free_(); } + + void pack() { pack_(weight_); } + + void compute(MatrixPtr dst, MatrixPtr src) { + cblas_sgemm_compute(CblasRowMajor, + CblasNoTrans, + CblasPacked, + src->getHeight(), + transW_ ? height_ : width_, + transW_ ? width_ : height_, + src->getData(), + src->getWidth(), + packedWeight_, + width_, + 1.0, + dst->getData(), + dst->getWidth()); + } + + void compute(size_t M, real *A, size_t lda, real *C, size_t ldc) { + cblas_sgemm_compute(CblasRowMajor, + CblasNoTrans, + CblasPacked, + M, + width_, + height_, + A, + lda, + packedWeight_, + width_, + 1.0, + C, + ldc); + } + +protected: + void pack_(real *src) { + if (!packedWeight_) { + packedWeight_ = cblas_sgemm_alloc(CblasBMatrix, 1, width_, height_); + } + cblas_sgemm_pack(CblasRowMajor, + CblasBMatrix, + transW_ ? CblasTrans : CblasNoTrans, + 1, + transW_ ? height_ : width_, + transW_ ? width_ : height_, + 1.0, + src, + width_, + packedWeight_); + } + + void free_() { + if (packedWeight_) { + cblas_sgemm_free(packedWeight_); + } + } +}; + +} // namespace paddle From 0596cd8826ddf94c53fd2d834a189be2b829a595 Mon Sep 17 00:00:00 2001 From: tensor-tang Date: Fri, 22 Dec 2017 14:45:07 +0800 Subject: [PATCH 025/181] refine test recurrent layer --- paddle/gserver/tests/test_RecurrentLayer.cpp | 119 ++++++++----------- 1 file changed, 52 insertions(+), 67 deletions(-) diff --git a/paddle/gserver/tests/test_RecurrentLayer.cpp b/paddle/gserver/tests/test_RecurrentLayer.cpp index 44d84dd8be..0e13084333 100644 --- a/paddle/gserver/tests/test_RecurrentLayer.cpp +++ b/paddle/gserver/tests/test_RecurrentLayer.cpp @@ -222,6 +222,7 @@ TEST(Layer, RecurrentLayer) { #define protected public #include "paddle/gserver/layers/GatedRecurrentLayer.h" #include "paddle/gserver/layers/LstmLayer.h" +#include "paddle/gserver/layers/RecurrentLayer.h" template class TestRecurrentLayer { public: @@ -422,6 +423,8 @@ TEST(Layer, LstmLayer) { #ifdef PADDLE_WITH_MKLML +#include "paddle/gserver/layers/MKLPackedRecurrentLayer.h" + LayerPtr initMKLPackedLayer(LayerConfig layerConfig, bool reversed, int layerSize, @@ -453,7 +456,31 @@ LayerPtr initMKLPackedLayer(LayerConfig layerConfig, return testLayer; } -void checkMKLPackedLayer(LayerPtr testLayer1, LayerPtr testLayer2) { +void checkMKLPackedLayer(LayerConfig layerConfig1, + LayerConfig layerConfig2, + bool reversed, + int layerSize, + int batchSize, + bool useBatch1, + bool useBatch2) { + LayerPtr dataLayer; + ParameterPtr para, bias; + + if (layerConfig1.type() == "recurrent") { + dataLayer = creatDataLayer("layer_0", batchSize, layerSize, false); + para = creatParameter("para_0", 0, layerSize * layerSize, false); + bias = nullptr; + } else if (layerConfig1.type() == "gated_recurrent") { + dataLayer = creatDataLayer("layer_0", batchSize, layerSize * 3, false); + para = creatParameter("para_0", 0, layerSize * layerSize * 3, false); + bias = creatParameterBias("bias_0", 1, layerSize * 3, false); + } + + LayerPtr testLayer1 = initMKLPackedLayer( + layerConfig1, reversed, layerSize, dataLayer, 
para, bias); + LayerPtr testLayer2 = initMKLPackedLayer( + layerConfig2, reversed, layerSize, dataLayer, para, bias); + const VectorPtr& weightGrad = (testLayer1->getParameters()[0])->getBuf(PARAMETER_GRADIENT); const MatrixPtr& inputGrad = testLayer1->getPrev(0)->getOutputGrad(); @@ -462,78 +489,34 @@ void checkMKLPackedLayer(LayerPtr testLayer1, LayerPtr testLayer2) { CpuMatrix input_grad1(inputGrad->getHeight(), inputGrad->getWidth()); CpuMatrix input_grad2(inputGrad->getHeight(), inputGrad->getWidth()); - CpuMatrix outputGrad(inputGrad->getHeight(), inputGrad->getWidth()); - outputGrad.randomizeUniform(); - for (int i = 0; i < 2; i++) { - FLAGS_rnn_use_batch = true; + FLAGS_rnn_use_batch = useBatch1; testLayer1->forward(PASS_GC); - testLayer1->getOutputGrad()->copyFrom(outputGrad); - - weightGrad->zero(); - inputGrad->zero(); - - testLayer1->backward(nullptr); - - wgt_grad1.copyFrom(*weightGrad); - input_grad1.copyFrom(*inputGrad); - - FLAGS_rnn_use_batch = true; - - testLayer2->forward(PASS_GC); - testLayer2->getOutputGrad()->copyFrom(outputGrad); - - weightGrad->zero(); - inputGrad->zero(); - - testLayer2->backward(nullptr); - - wgt_grad2.copyFrom(*weightGrad); - input_grad2.copyFrom(*inputGrad); - - checkError(*testLayer1->getOutputValue(), *testLayer2->getOutputValue()); - - checkError(wgt_grad1, wgt_grad2); - checkError(input_grad1, input_grad2); - } - - for (int i = 0; i < 2; i++) { - CpuMatrix outputValue(testLayer2->getOutputValue()->getHeight(), - testLayer2->getOutputValue()->getWidth()); - - FLAGS_rnn_use_batch = true; - + FLAGS_rnn_use_batch = useBatch2; testLayer2->forward(PASS_GC); - outputValue.copyFrom(*testLayer2->getOutputValue()); - testLayer2->getOutputGrad()->copyFrom(outputGrad); + testLayer1->getOutputGrad()->randomizeUniform(); + testLayer2->getOutputGrad()->copyFrom(*testLayer1->getOutputGrad()); weightGrad->zero(); inputGrad->zero(); - - testLayer2->backward(nullptr); + FLAGS_rnn_use_batch = useBatch1; + testLayer1->backward(nullptr); wgt_grad1.copyFrom(*weightGrad); input_grad1.copyFrom(*inputGrad); - FLAGS_rnn_use_batch = false; - - testLayer2->getOutputValue()->zero(); - - testLayer2->forward(PASS_GC); - testLayer2->getOutputGrad()->copyFrom(outputGrad); - weightGrad->zero(); inputGrad->zero(); - + FLAGS_rnn_use_batch = useBatch2; testLayer2->backward(nullptr); wgt_grad2.copyFrom(*weightGrad); input_grad2.copyFrom(*inputGrad); - checkError(outputValue, *testLayer2->getOutputValue()); + checkError(*testLayer1->getOutputValue(), *testLayer2->getOutputValue()); checkError(wgt_grad1, wgt_grad2); checkError(input_grad1, input_grad2); } @@ -556,20 +539,22 @@ TEST(MKLPackedLayer, RecurrentLayer) { for (auto layerSize : {32, 64, 128, 256, 512}) { for (auto batchSize : {1, 5, 100, 500}) { for (auto reversed : {true, false}) { - LOG(INFO) << " layerSize=" << layerSize << " batchSize=" << batchSize - << " reversed=" << reversed; - - LayerPtr dataLayer = - creatDataLayer("layer_0", batchSize, layerSize, false); - ParameterPtr para = - creatParameter("para_0", 0, layerSize * layerSize, false); - - LayerPtr testLayer1 = initMKLPackedLayer( - layerConfig1, reversed, layerSize, dataLayer, para); - LayerPtr testLayer2 = initMKLPackedLayer( - layerConfig2, reversed, layerSize, dataLayer, para); - - checkMKLPackedLayer(testLayer1, testLayer2); + for (auto paddle_use_batch : {true, false}) { + for (auto MKLPacked_use_batch : {true, false}) { + LOG(INFO) << " layerSize=" << layerSize + << " batchSize=" << batchSize << " reversed=" << reversed + << " paddle_use_batch=" << 
paddle_use_batch + << " MKLPacked_use_batch=" << MKLPacked_use_batch; + + checkMKLPackedLayer(layerConfig1, + layerConfig2, + reversed, + layerSize, + batchSize, + paddle_use_batch, + MKLPacked_use_batch); + } + } } } } From b1025cf50a1f5c2da07ffd4656f0983c20cdf4ea Mon Sep 17 00:00:00 2001 From: sweetsky0901 Date: Fri, 22 Dec 2017 15:02:05 +0800 Subject: [PATCH 026/181] add norm_op for ssd(cross channel norm) --- paddle/operators/norm_op.cc | 106 ++++++++++++ paddle/operators/norm_op.cu | 24 +++ paddle/operators/norm_op.h | 162 +++++++++++++++++++ python/paddle/v2/fluid/tests/test_norm_op.py | 57 +++++++ 4 files changed, 349 insertions(+) create mode 100644 paddle/operators/norm_op.cc create mode 100644 paddle/operators/norm_op.cu create mode 100644 paddle/operators/norm_op.h create mode 100644 python/paddle/v2/fluid/tests/test_norm_op.py diff --git a/paddle/operators/norm_op.cc b/paddle/operators/norm_op.cc new file mode 100644 index 0000000000..3835da630d --- /dev/null +++ b/paddle/operators/norm_op.cc @@ -0,0 +1,106 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +Indicesou may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/operators/norm_op.h" +namespace paddle { +namespace operators { + +class NormOpMaker : public framework::OpProtoAndCheckerMaker { + public: + NormOpMaker(framework::OpProto* proto, framework::OpAttrChecker* op_checker) + : OpProtoAndCheckerMaker(proto, op_checker) { + AddInput( + "X", + "(Tensor) The input tensor of norm operator. " + "The format of input tensor is NCHW. Where N is batch size, C is the " + "number of channels, H and W is the height and width of feature."); + AddInput("Scale", + "(Tensor) The input tensor of norm operator. " + "The format of input tensor is C * 1."); + AddAttr("epsilon", + "(float, default 1e-10) Constant " + "for numerical stability.") + .SetDefault(1.0e-10f); + AddOutput("Out", + "(Tensor) The output tensor of norm operator." + "N * M." 
+ "M = C * H * W"); + AddComment(R"DOC( + "Input shape: $(N, C, H, W)$ + Sclae shape: $(C, 1)$ + Output shape: $(N, C, H, W)$ + Where + forward + $$ + [\frac {x_{1}}{\sqrt{\sum{x_{i}^{2}}}} \frac {x_{2}}{\sqrt{\sum{x_{i}^{2}}}} \frac {x_{3}}{\sqrt{\sum{x_{i}^{2}}}} \cdot \cdot \cdot \frac {x_{n}}{\sqrt{\sum{x_{i}^{2}}}}] + $$ + backward + $$ + \frac{\frac{\mathrm{d}L }{\mathrm{d}y_{1}} - \frac {x_{1}\sum {\frac{\mathrm{d} L}{\mathrm{d} y_{j}}}x_{j}}{\sum x_{j}^{2}} }{\sqrt{\sum{x_{j}^{2}}}} + $$ + )DOC"); + } +}; + +class NormOp : public framework::OperatorWithKernel { + protected: + framework::OpKernelType GetKernelType( + const framework::ExecutionContext& ctx) const override { + return framework::OpKernelType( + framework::ToDataType(ctx.Input("X")->type()), + ctx.device_context()); + } + + public: + using framework::OperatorWithKernel::OperatorWithKernel; + void InferShape(framework::InferShapeContext* ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("X"), + "Input(X) of NormOp" + "should not be null."); + PADDLE_ENFORCE(ctx->HasOutput("Out"), + "Output(Out) of NormOp should not be null."); + auto in_x_dims = ctx->GetInputDim("X"); + ctx->SetOutputDim("Out", in_x_dims); + } +}; + +class NormOpGrad : public framework::OperatorWithKernel { + protected: + framework::OpKernelType GetKernelType( + const framework::ExecutionContext& ctx) const override { + return framework::OpKernelType( + framework::ToDataType(ctx.Input("X")->type()), + ctx.device_context()); + } + + public: + using framework::OperatorWithKernel::OperatorWithKernel; + void InferShape(framework::InferShapeContext* ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) must not be null."); + PADDLE_ENFORCE(ctx->HasOutput(framework::GradVarName("X")), + "Input(X@GRAD) should not be null."); + ctx->SetOutputDim(framework::GradVarName("X"), ctx->GetInputDim("X")); + } +}; +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OP(norm, ops::NormOp, ops::NormOpMaker, norm_grad, ops::NormOpGrad); +REGISTER_OP_CPU_KERNEL( + norm, ops::NormKernel, + ops::NormKernel); +REGISTER_OP_CPU_KERNEL( + norm_grad, ops::NormGradKernel, + ops::NormGradKernel); diff --git a/paddle/operators/norm_op.cu b/paddle/operators/norm_op.cu new file mode 100644 index 0000000000..7d84aaa732 --- /dev/null +++ b/paddle/operators/norm_op.cu @@ -0,0 +1,24 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +Indicesou may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ +#define EIGEN_USE_GPU + +#include "paddle/operators/norm_op.h" + +namespace ops = paddle::operators; +REGISTER_OP_CUDA_KERNEL( + norm, ops::NormKernel, + ops::NormKernel); +REGISTER_OP_CUDA_KERNEL( + norm_grad, ops::NormGradKernel, + ops::NormGradKernel); diff --git a/paddle/operators/norm_op.h b/paddle/operators/norm_op.h new file mode 100644 index 0000000000..d3dcf48341 --- /dev/null +++ b/paddle/operators/norm_op.h @@ -0,0 +1,162 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. 
+ +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +Indicesou may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once +#include "paddle/framework/op_registry.h" +#include "paddle/operators/math/math_function.h" + +namespace paddle { +namespace operators { + +template +using EigenVector = framework::EigenVector; +template +using EigenMatrix = framework::EigenMatrix; + +template +class NormKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& context) const override { + const framework::Tensor* in_x = context.Input("X"); + const framework::Tensor* scale = context.Input("Scale"); + auto* out = context.Output("Out"); + T epsilon = context.Attr("epsilon"); + out->mutable_data(context.GetPlace()); + int batch_size = in_x->dims()[0]; + int channels = in_x->dims()[1]; + int height = in_x->dims()[2]; + int width = in_x->dims()[3]; + int fea_len = height * width; + auto* place = + context.template device_context().eigen_device(); + auto x = EigenMatrix::From( + *in_x, framework::make_ddim({batch_size, fea_len * channels})); + // get square + framework::Tensor x_square; + x_square.mutable_data(in_x->dims(), context.GetPlace()); + auto x_square_eigen = EigenMatrix::From( + x_square, framework::make_ddim({batch_size, fea_len * channels})); + x_square_eigen.device(*place) = x.square(); + auto scale_eigen = EigenVector::Flatten(*scale); + for (int n = 0; n < batch_size; ++n) { + framework::Tensor in_x_batch = in_x->Slice(n, n + 1); + auto in_x_batch_eigen = EigenMatrix::From( + in_x_batch, framework::make_ddim({channels, fea_len})); + framework::Tensor x_square_batch = x_square.Slice(n, n + 1); + auto x_square_batch_eigen = EigenMatrix::From( + x_square_batch, framework::make_ddim({channels, fea_len})); + framework::Tensor out_batch = out->Slice(n, n + 1); + auto out_batch_eigen = EigenMatrix::From( + out_batch, framework::make_ddim({channels, fea_len})); + framework::Tensor tmp_tensor; + tmp_tensor.mutable_data(framework::make_ddim({1, fea_len}), + context.GetPlace()); + auto tmp = EigenVector::Flatten(tmp_tensor); + // get colsum and sqrt , inverse + auto dim = Eigen::array({{0}}); + tmp.device(*place) = x_square_batch_eigen.sum(dim); + tmp.device(*place) = (tmp + epsilon).sqrt().inverse(); + Eigen::array broadcast_dim_col; + broadcast_dim_col[1] = 1; + broadcast_dim_col[0] = channels; + out_batch_eigen.device(*place) = + in_x_batch_eigen * (tmp.broadcast(broadcast_dim_col)); + Eigen::array broadcast_dim_row; + broadcast_dim_row[1] = fea_len; + broadcast_dim_row[0] = 1; + out_batch_eigen.device(*place) = + out_batch_eigen * (scale_eigen.broadcast(broadcast_dim_row)); + } + } +}; +template +class NormGradKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& context) const override { + const framework::Tensor* in_x = context.Input("X"); + const framework::Tensor* scale = context.Input("Scale"); + const framework::Tensor* out_grad = + context.Input(framework::GradVarName("Out")); + T epsilon = context.Attr("epsilon"); + framework::Tensor* in_x_grad = + 
context.Output(framework::GradVarName("X")); + in_x_grad->mutable_data(context.GetPlace()); + int batch_size = in_x->dims()[0]; + int channels = in_x->dims()[1]; + int height = in_x->dims()[2]; + int width = in_x->dims()[3]; + int fea_len = height * width; + auto* place = + context.template device_context().eigen_device(); + + auto scale_eigen = EigenVector::Flatten(*scale); + auto x = EigenMatrix::From( + *in_x, framework::make_ddim({batch_size, fea_len * channels})); + // get square + framework::Tensor x_square; + x_square.mutable_data(in_x->dims(), context.GetPlace()); + auto x_square_eigen = EigenMatrix::From( + x_square, framework::make_ddim({batch_size, fea_len * channels})); + x_square_eigen.device(*place) = x.square(); + + for (int n = 0; n < batch_size; ++n) { + framework::Tensor in_x_batch = in_x->Slice(n, n + 1); + auto in_x_batch_eigen = EigenMatrix::From( + in_x_batch, framework::make_ddim({channels, fea_len})); + framework::Tensor in_g_batch = in_x_grad->Slice(n, n + 1); + auto in_g_batch_eigen = EigenMatrix::From( + in_g_batch, framework::make_ddim({channels, fea_len})); + framework::Tensor x_square_batch = x_square.Slice(n, n + 1); + auto x_square_batch_eigen = EigenMatrix::From( + x_square_batch, framework::make_ddim({channels, fea_len})); + framework::Tensor outg_batch = out_grad->Slice(n, n + 1); + auto outg_batch_eigen = EigenMatrix::From( + outg_batch, framework::make_ddim({channels, fea_len})); + + framework::Tensor tmp_tensor; + tmp_tensor.mutable_data(framework::make_ddim({1, fea_len}), + context.GetPlace()); + auto tmp_eigen = EigenVector::Flatten(tmp_tensor); + auto dim = Eigen::array({{0}}); + tmp_eigen.device(*place) = (in_x_batch_eigen * outg_batch_eigen).sum(dim); + framework::Tensor norm_tmp_tensor; + norm_tmp_tensor.mutable_data(framework::make_ddim({1, fea_len}), + context.GetPlace()); + auto norm_tmp_eigen = EigenVector::Flatten(norm_tmp_tensor); + norm_tmp_eigen.device(*place) = + (x_square_batch_eigen.sum(dim) + epsilon).sqrt(); + Eigen::array broadcast_dim_col; + broadcast_dim_col[1] = 1; + broadcast_dim_col[0] = channels; + in_g_batch_eigen.device(*place) = + in_x_batch_eigen * tmp_eigen.broadcast(broadcast_dim_col); + in_g_batch_eigen.device(*place) = + in_g_batch_eigen / + (norm_tmp_eigen * norm_tmp_eigen).broadcast(broadcast_dim_col); + in_g_batch_eigen.device(*place) = outg_batch_eigen - in_g_batch_eigen; + // outg_batch_eigen + (in_g_batch_eigen * -1); + in_g_batch_eigen.device(*place) = + in_g_batch_eigen / norm_tmp_eigen.broadcast(broadcast_dim_col); + Eigen::array broadcast_dim_row; + broadcast_dim_row[1] = fea_len; + broadcast_dim_row[0] = 1; + in_g_batch_eigen.device(*place) = + in_g_batch_eigen * (scale_eigen.broadcast(broadcast_dim_row)); + } + } +}; +} // namespace operators +} // namespace paddle diff --git a/python/paddle/v2/fluid/tests/test_norm_op.py b/python/paddle/v2/fluid/tests/test_norm_op.py new file mode 100644 index 0000000000..23e6841b91 --- /dev/null +++ b/python/paddle/v2/fluid/tests/test_norm_op.py @@ -0,0 +1,57 @@ +import unittest +import numpy as np +from op_test import OpTest + + +def norm(input, scale, epsilon): + s0, s1, s2, s3 = input.shape + x_square = input * input + for i in xrange(s0): + input_batch = input[i:i + 1, :, :, :] + input_batch = input_batch.reshape(s1, s2 * s3) + x_square_batch = x_square[i:i + 1, :, :, :] + x_square_batch = x_square_batch.reshape(s1, s2 * s3) + square_colsum = x_square_batch.sum(axis=0) + epsilon + tmp = pow(square_colsum, 0.5) + tmp = np.reciprocal(tmp) + tmp_tile = np.tile(tmp, s1) + 
tmp_tile = tmp_tile.reshape(s1, s2 * s3) + scale_tile = np.tile(scale, (1, s2 * s3)) + scale_tile = scale_tile.reshape(s1, s2 * s3) + out_batch = input_batch * tmp_tile * scale_tile + out_batch = out_batch.reshape(1, s1, s2, s3) + if i == 0: + out = out_batch + else: + out = np.concatenate((out, out_batch), 0) + out.reshape(s0, s1, s2, s3) + return out + + +class TestNormOp(OpTest): + def setUp(self): + self.op_type = "norm" + self.init_test_case() + input = np.random.random(self.shape).astype("float32") + scale = np.array([10, 10, 10]) + self.inputs = { + 'X': input.astype('float32'), + 'Scale': scale.astype('float32') + } + self.attrs = {'epsilon': self.epsilon} + output = norm(input, scale, self.epsilon) + self.outputs = {'Out': output.astype('float32')} + + def test_check_output(self): + self.check_output() + + def test_check_grad(self): + self.check_grad(['X'], 'Out') + + def init_test_case(self): + self.shape = [1, 3, 2, 2] + self.epsilon = 1e-6 + + +if __name__ == '__main__': + unittest.main() From d8b13dee5e656bdaa88652e1c8d7ef1cb41d373a Mon Sep 17 00:00:00 2001 From: sweetsky0901 Date: Fri, 22 Dec 2017 15:10:28 +0800 Subject: [PATCH 027/181] add norm_op for ssd(cross channel norm) --- paddle/operators/norm_op.cc | 3 +++ 1 file changed, 3 insertions(+) diff --git a/paddle/operators/norm_op.cc b/paddle/operators/norm_op.cc index 3835da630d..d23805da86 100644 --- a/paddle/operators/norm_op.cc +++ b/paddle/operators/norm_op.cc @@ -68,6 +68,9 @@ class NormOp : public framework::OperatorWithKernel { PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) of NormOp" "should not be null."); + PADDLE_ENFORCE(ctx->HasInput("Scale"), + "Input(Scale) of NormOp" + "should not be null."); PADDLE_ENFORCE(ctx->HasOutput("Out"), "Output(Out) of NormOp should not be null."); auto in_x_dims = ctx->GetInputDim("X"); From 8a7c309d5f74c668e857fdcc8da223f5768fc521 Mon Sep 17 00:00:00 2001 From: sweetsky0901 Date: Fri, 22 Dec 2017 15:46:18 +0800 Subject: [PATCH 028/181] modify for update from trunk --- paddle/operators/norm_op.cc | 2 +- python/paddle/v2/fluid/tests/test_norm_op.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/paddle/operators/norm_op.cc b/paddle/operators/norm_op.cc index d23805da86..990a1504ea 100644 --- a/paddle/operators/norm_op.cc +++ b/paddle/operators/norm_op.cc @@ -18,7 +18,7 @@ namespace operators { class NormOpMaker : public framework::OpProtoAndCheckerMaker { public: - NormOpMaker(framework::OpProto* proto, framework::OpAttrChecker* op_checker) + NormOpMaker(OpProto* proto, OpAttrChecker* op_checker) : OpProtoAndCheckerMaker(proto, op_checker) { AddInput( "X", diff --git a/python/paddle/v2/fluid/tests/test_norm_op.py b/python/paddle/v2/fluid/tests/test_norm_op.py index 23e6841b91..7d56320489 100644 --- a/python/paddle/v2/fluid/tests/test_norm_op.py +++ b/python/paddle/v2/fluid/tests/test_norm_op.py @@ -49,7 +49,7 @@ class TestNormOp(OpTest): self.check_grad(['X'], 'Out') def init_test_case(self): - self.shape = [1, 3, 2, 2] + self.shape = [2, 3, 2, 2] self.epsilon = 1e-6 From dcc51da4a74b6429435af7d48ba5b885ec51c24e Mon Sep 17 00:00:00 2001 From: fengjiayi Date: Fri, 22 Dec 2017 15:53:56 +0800 Subject: [PATCH 029/181] update pybind --- paddle/pybind/protobuf.cc | 18 ++++++++++++++++-- 1 file changed, 16 insertions(+), 2 deletions(-) diff --git a/paddle/pybind/protobuf.cc b/paddle/pybind/protobuf.cc index da686d0b18..e5cf2435b3 100644 --- a/paddle/pybind/protobuf.cc +++ b/paddle/pybind/protobuf.cc @@ -174,12 +174,23 @@ void BindBlockDesc(py::module &m) { std::string 
name = byte_name; return self.HasVar(name); }) + .def("has_var_recursive", + [](BlockDescBind &self, py::bytes byte_name) { + std::string name = byte_name; + return self.HasVarRecursive(name); + }) .def("find_var", [](BlockDescBind &self, py::bytes byte_name) { std::string name = byte_name; return self.FindVar(name); }, py::return_value_policy::reference) + .def("find_var_recursive", + [](BlockDescBind &self, py::bytes byte_name) { + std::string name = byte_name; + return self.FindVarRecursive(name); + }, + py::return_value_policy::reference) .def("all_vars", &BlockDescBind::AllVars, py::return_value_policy::reference) .def("op_size", &BlockDescBind::OpSize) @@ -208,7 +219,8 @@ void BindVarDsec(py::module &m) { .def("set_shape", &VarDescBind::SetShape) .def("set_dtype", &VarDescBind::SetDataType) .def("shape", &VarDescBind::Shape, py::return_value_policy::reference) - .def("dtype", &VarDescBind::GetDataType) + .def("dtype", &VarDescBind::GetDataType, + py::return_value_policy::reference) .def("lod_level", &VarDescBind::GetLodLevel) .def("set_lod_level", &VarDescBind::SetLoDLevel) .def("type", &VarDescBind::GetType) @@ -240,7 +252,9 @@ void BindOpDesc(py::module &m) { .value("BLOCK", AttrType::BLOCK); py::class_ op_desc(m, "OpDesc", ""); - op_desc.def("__init__", [](OpDescBind &self) { new (&self) OpDescBind(); }) + op_desc + .def("__init__", [](OpDescBind &self) { new (&self) OpDescBind(); }, + py::return_value_policy::reference) .def("type", &OpDescBind::Type) .def("set_type", &OpDescBind::SetType) .def("input", &OpDescBind::Input) From 4360615850a01a32747b7a4e4d8f99f0ff8c6252 Mon Sep 17 00:00:00 2001 From: tensor-tang Date: Fri, 22 Dec 2017 16:34:24 +0800 Subject: [PATCH 030/181] fix compile error --- paddle/gserver/layers/MKLPackedWeight.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/paddle/gserver/layers/MKLPackedWeight.h b/paddle/gserver/layers/MKLPackedWeight.h index a8dcfd561b..cc8a336154 100644 --- a/paddle/gserver/layers/MKLPackedWeight.h +++ b/paddle/gserver/layers/MKLPackedWeight.h @@ -29,7 +29,7 @@ protected: bool transW_; public: - MKLPackedWeight(MatrixPtr weight, bool transW = false) { + explicit MKLPackedWeight(MatrixPtr weight, bool transW = false) { packedWeight_ = nullptr; weight_ = weight->getData(); height_ = weight->getHeight(); From edba405d3625ead27a7234576f139c99a55600fb Mon Sep 17 00:00:00 2001 From: fengjiayi Date: Fri, 22 Dec 2017 17:07:34 +0800 Subject: [PATCH 031/181] Pass test_dyn_rnn.py --- paddle/framework/op_desc.cc | 8 +++++++ paddle/framework/op_desc.h | 2 ++ paddle/framework/var_desc.cc | 2 +- paddle/pybind/protobuf.cc | 1 + python/paddle/v2/fluid/backward.py | 35 ++++++++++++++++++++++-------- 5 files changed, 38 insertions(+), 10 deletions(-) diff --git a/paddle/framework/op_desc.cc b/paddle/framework/op_desc.cc index 7ba1e3e4e3..ef7d654079 100644 --- a/paddle/framework/op_desc.cc +++ b/paddle/framework/op_desc.cc @@ -90,6 +90,14 @@ OpDescBind::OpDescBind(const std::string &type, const VariableNameMap &inputs, need_update_ = true; } +void OpDescBind::CopyFrom(const OpDescBind &op_desc) { + desc_.set_type(op_desc.Type()); + inputs_ = op_desc.inputs_; + outputs_ = op_desc.outputs_; + attrs_ = op_desc.attrs_; + need_update_ = true; +} + OpDescBind::OpDescBind(const OpDesc &desc, ProgramDescBind *prog) : desc_(desc), need_update_(false) { // restore inputs_ diff --git a/paddle/framework/op_desc.h b/paddle/framework/op_desc.h index da032319af..8ad1d52401 100644 --- a/paddle/framework/op_desc.h +++ b/paddle/framework/op_desc.h @@ -35,6 
+35,8 @@ class OpDescBind { OpDescBind(const OpDesc &desc, ProgramDescBind *prog); + void CopyFrom(const OpDescBind &op_desc); + OpDesc *Proto(); std::string Type() const { return desc_.type(); } diff --git a/paddle/framework/var_desc.cc b/paddle/framework/var_desc.cc index 0babec29f6..08c361a364 100644 --- a/paddle/framework/var_desc.cc +++ b/paddle/framework/var_desc.cc @@ -72,7 +72,7 @@ const TensorDesc &VarDescBind::tensor_desc() const { case VarDesc::LOD_TENSOR_ARRAY: return desc_.tensor_array().tensor(); default: - PADDLE_THROW("Unexpected branch."); + PADDLE_THROW("The type of var '", this->Name(), "' is unsupported."); } } diff --git a/paddle/pybind/protobuf.cc b/paddle/pybind/protobuf.cc index e5cf2435b3..282b9ed641 100644 --- a/paddle/pybind/protobuf.cc +++ b/paddle/pybind/protobuf.cc @@ -255,6 +255,7 @@ void BindOpDesc(py::module &m) { op_desc .def("__init__", [](OpDescBind &self) { new (&self) OpDescBind(); }, py::return_value_policy::reference) + .def("copy_from", &OpDescBind::CopyFrom) .def("type", &OpDescBind::Type) .def("set_type", &OpDescBind::SetType) .def("input", &OpDescBind::Input) diff --git a/python/paddle/v2/fluid/backward.py b/python/paddle/v2/fluid/backward.py index df2761d802..416d2ae785 100644 --- a/python/paddle/v2/fluid/backward.py +++ b/python/paddle/v2/fluid/backward.py @@ -32,6 +32,16 @@ def _create_op_desc_(op_type, inputs, outputs, attrs): return op_desc +def _infer_var_data_type_(var_name, block): + grad_var = block.desc.find_var(var_name.encode("ascii")) + fwd_name = _strip_grad_suffix_(var_name.encode("ascii")) + if block.desc.has_var_recursive(fwd_name): + fwd_var = block.desc.find_var_recursive(fwd_name.encode("ascii")) + grad_var.set_dtype(fwd_var.dtype()) + else: + grad_var.set_dtype(core.DataType.FP32) + + def _is_all_in_set_(cands, s): for c in cands: if not c in s: @@ -64,7 +74,7 @@ def _backward_impl_(target, grad_sub_block = program.create_block(parent_idx=sub_block_idx) _backward_impl_(target, sub_block, grad_sub_block, no_grad_set, grad_info_map, callback) - grad_sub_block_list.append(grad_sub_block) + grad_sub_block_list.append(grad_sub_block.desc) grad_op_desc, op_grad_to_var = core.get_grad_op_desc( each_op.desc, no_grad_set[block.idx], grad_sub_block_list) grad_op_descs.append(grad_op_desc) @@ -80,17 +90,18 @@ def _backward_impl_(target, for var_name in op_desc.input_arg_names(): if len(var_inputs[var_name]) > 1: pending_sum_ops.append((_create_op_desc_( - op_type="sum_op", - inputs=var_inputs[var_name], - outputs=[var_name], + op_type="sum", + inputs={"X": var_inputs[var_name]}, + outputs={"Out": [var_name]}, attrs={}), idx)) var_inputs[var_name] = [var_name] for var_name in op_desc.output_arg_names(): - if len(var_inputs[var_name]) == 0: + if var_name == core.empty_var_name() or len(var_inputs[ + var_name]) == 0: # it's the first time we get the variable var_inputs[var_name] = [var_name] else: - if len(var_inputs[var_name] == 1): + if len(var_inputs[var_name]) == 1: new_name = var_name + "@RENAME@" + \ str(var_rename_count[var_name]) var_rename_count[var_name] = var_rename_count[var_name] + 1 @@ -107,7 +118,7 @@ def _backward_impl_(target, for var_name, inputs in var_inputs.iteritems(): if len(inputs) > 1: pending_sum_ops.append((_create_op_desc_( - op_type="sum_op", + op_type="sum", inputs={"X": inputs}, outputs={"Out": var_name}, attrs={}), len(grad_op_descs))) @@ -131,13 +142,15 @@ def _backward_impl_(target, {}) grad_op_descs.insert(ele[1], fill_zeros_like_op) # create new gradient variables in the target block desc + new_vars = 
set() for op_desc in grad_op_descs: for grad_var_name in op_desc.output_arg_names(): grad_var_name = grad_var_name.encode("ascii") - if target_block.desc.has_var( + if target_block.desc.has_var_recursive( grad_var_name) or grad_var_name == core.empty_var_name(): continue target_block.desc.var(grad_var_name) + new_vars.add(grad_var_name) if not grad_to_var.has_key(grad_var_name): continue grad_info_map[grad_to_var[grad_var_name]] = (grad_var_name, @@ -160,7 +173,11 @@ def _backward_impl_(target, for op_desc in grad_op_descs: op_desc.infer_var_type(target_block.desc) op_desc.infer_shape(target_block.desc) - target_block.desc.append_allocated_op(op_desc) + for arg in op_desc.output_arg_names(): + if arg in new_vars: + _infer_var_data_type_(arg, target_block) + new_op_desc = target_block.desc.append_op() + new_op_desc.copy_from(op_desc) target_block.sync_with_cpp() From e811c865677c80a5fce13d4bac7178bf0fa20d7b Mon Sep 17 00:00:00 2001 From: sweetsky0901 Date: Fri, 22 Dec 2017 18:36:40 +0800 Subject: [PATCH 032/181] for epsilon dataType --- paddle/operators/norm_op.cc | 10 ++++++---- paddle/operators/norm_op.h | 8 ++++---- 2 files changed, 10 insertions(+), 8 deletions(-) diff --git a/paddle/operators/norm_op.cc b/paddle/operators/norm_op.cc index 990a1504ea..1d9b55d887 100644 --- a/paddle/operators/norm_op.cc +++ b/paddle/operators/norm_op.cc @@ -16,6 +16,7 @@ limitations under the License. */ namespace paddle { namespace operators { +template class NormOpMaker : public framework::OpProtoAndCheckerMaker { public: NormOpMaker(OpProto* proto, OpAttrChecker* op_checker) @@ -28,9 +29,9 @@ class NormOpMaker : public framework::OpProtoAndCheckerMaker { AddInput("Scale", "(Tensor) The input tensor of norm operator. " "The format of input tensor is C * 1."); - AddAttr("epsilon", - "(float, default 1e-10) Constant " - "for numerical stability.") + AddAttr("epsilon", + "(float, default 1e-10) Constant " + "for numerical stability.") .SetDefault(1.0e-10f); AddOutput("Out", "(Tensor) The output tensor of norm operator." 
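For reference while reading these norm_op changes: the operator L2-normalizes each spatial position across channels and then applies the per-channel scale, with epsilon added under the square root exactly as in the kernel's `(tmp + epsilon).sqrt().inverse()`. A vectorized NumPy sketch of the forward pass (equivalent to the per-batch loop in test_norm_op.py):

import numpy as np

def cross_channel_norm(x, scale, epsilon=1e-10):
    # x: (N, C, H, W); scale: (C,).  Normalize over C at every (n, h, w).
    denom = np.sqrt((x * x).sum(axis=1, keepdims=True) + epsilon)
    return x / denom * scale.reshape(1, -1, 1, 1)

x = np.random.rand(2, 3, 2, 2).astype('float32')
out = cross_channel_norm(x, np.array([10, 10, 10], dtype='float32'))
# with unit scale, every channel vector ends up with (almost) unit L2 norm
unit = cross_channel_norm(x, np.ones(3, dtype='float32'))
assert np.allclose((unit * unit).sum(axis=1), 1.0, atol=1e-4)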
@@ -100,7 +101,8 @@ class NormOpGrad : public framework::OperatorWithKernel {
 }  // namespace paddle
 
 namespace ops = paddle::operators;
-REGISTER_OP(norm, ops::NormOp, ops::NormOpMaker, norm_grad, ops::NormOpGrad);
+REGISTER_OP(norm, ops::NormOp, ops::NormOpMaker, norm_grad,
+            ops::NormOpGrad);
 REGISTER_OP_CPU_KERNEL(
     norm, ops::NormKernel,
-    ops::NormKernel);
+    ops::NormKernel);
 REGISTER_OP_CPU_KERNEL(
     norm_grad, ops::NormGradKernel,
-    ops::NormGradKernel);
+    ops::NormGradKernel);
diff --git a/paddle/operators/norm_op.h b/paddle/operators/norm_op.h
index d3dcf48341..b22df373af 100644
--- a/paddle/operators/norm_op.h
+++ b/paddle/operators/norm_op.h
@@ -26,14 +26,14 @@ template
 using EigenMatrix = framework::EigenMatrix;
 
-template
+template
 class NormKernel : public framework::OpKernel {
  public:
   void Compute(const framework::ExecutionContext& context) const override {
     const framework::Tensor* in_x = context.Input("X");
     const framework::Tensor* scale = context.Input("Scale");
     auto* out = context.Output("Out");
-    T epsilon = context.Attr("epsilon");
+    auto epsilon = static_cast(context.Attr("epsilon"));
     out->mutable_data(context.GetPlace());
     int batch_size = in_x->dims()[0];
@@ -82,7 +82,7 @@ class NormKernel : public framework::OpKernel {
   }
 };
-template
+template
 class NormGradKernel : public framework::OpKernel {
  public:
   void Compute(const framework::ExecutionContext& context) const override {
@@ -90,7 +90,7 @@ class NormGradKernel : public framework::OpKernel {
     const framework::Tensor* scale = context.Input("Scale");
     const framework::Tensor* out_grad =
         context.Input(framework::GradVarName("Out"));
-    T epsilon = context.Attr("epsilon");
+    auto epsilon = static_cast(context.Attr("epsilon"));
     framework::Tensor* in_x_grad =
         context.Output(framework::GradVarName("X"));
     in_x_grad->mutable_data(context.GetPlace());

From 1d936f1dfaa884c830723d1eb4a77ef6c1171294 Mon Sep 17 00:00:00 2001
From: chengduoZH
Date: Sat, 23 Dec 2017 17:01:30 +0800
Subject: [PATCH 033/181] refine

---
 python/paddle/v2/fluid/layers/nn.py | 31 +++++++++++++++++----------
 1 file changed, 20 insertions(+), 11 deletions(-)

diff --git a/python/paddle/v2/fluid/layers/nn.py b/python/paddle/v2/fluid/layers/nn.py
index f49a958a0f..1240b2576f 100644
--- a/python/paddle/v2/fluid/layers/nn.py
+++ b/python/paddle/v2/fluid/layers/nn.py
@@ -489,34 +489,40 @@ def conv2d(input,
     of the feature, and W is the width of the feature. The details of convolution layer,
     please refer UFLDL's `convolution,
     `_ .
-    If bias_attr and activation type are provided, bias is added to the output of the convolution,
+    If bias attribute and activation type are provided, bias is added to the output of the convolution,
     and the corresponding activation function is applied to the final result.
+
     For each input :math:`X`, the equation is:
+
     .. math::
 
-        Out = \sigma (W\ast X + b)
+        Out = \\sigma (W \\ast X + b)
 
-    In the above equation:
+    In the above equation:
 
     * :math:`X`: Input value, a tensor with NCHW format.
     * :math:`W`: Filter value, a tensor with MCHW format.
+    * :math:`\\ast`: Convolution operation.
-    * :math:`b`: Bias, .
-    * :math:\sigma : Activation function.
+    * :math:`b`: Bias value, a 2-D tensor with shape [M, 1].
+    * :math:`\\sigma`: Activation function.
     * :math:`Out`: Output value, the shape of :math:`Out` and :math:`X` may be different.
 
     Example:
-        Input:
+        - Input:
+
           Input shape: $(N, C_{in}, H_{in}, W_{in})$
+
           Filter shape: $(C_{out}, C_{in}, H_f, W_f)$
-        Output:
+
+        - Output:
           Output shape: $(N, C_{out}, H_{out}, W_{out})$
+
         Where
-        $$
-        H_{out}= \\frac{(H_{in} + 2 * paddings[0] - (dilations[0] * (H_f - 1) + 1))}{strides[0]}+ 1 \\
+    ..
math:: + + H_{out}= \\frac{(H_{in} + 2 * paddings[0] - (dilations[0] * (H_f - 1) + 1))}{strides[0]}+ 1 + W_{out}= \\frac{(W_{in} + 2 * paddings[1] - (dilations[1] * (W_f - 1) + 1))}{strides[1]}+ 1 - $$ + All the input variables are passed in as local variables to the LayerHelper constructor. @@ -537,10 +543,13 @@ def conv2d(input, Variable: The tensor variable storing the convolution and \ non-linearity activation result. + Raises: + ValueError: If the shapes of input, filter_size, stride, padding and groups mismatch. + Examples: .. code-block:: python - data = fluid.layers.data(name='data', shape=[3,32, 32], dtype='float32') + data = fluid.layers.data(name='data', shape=[3, 32, 32], dtype='float32') conv2d = fluid.layers.conv2d(input=data, num_filters=2, filter_size=3, act="relu") """ From 4d59b5ace577b39ba62895828476d9990767fc76 Mon Sep 17 00:00:00 2001 From: fengjiayi Date: Mon, 25 Dec 2017 11:54:32 +0800 Subject: [PATCH 034/181] pass test_understand_sentiment_lstm.py --- python/paddle/v2/fluid/backward.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/python/paddle/v2/fluid/backward.py b/python/paddle/v2/fluid/backward.py index 416d2ae785..b12767b3bb 100644 --- a/python/paddle/v2/fluid/backward.py +++ b/python/paddle/v2/fluid/backward.py @@ -71,7 +71,9 @@ def _backward_impl_(target, if each_op.has_attr("sub_block"): sub_block_idx = each_op.block_attr("sub_block") sub_block = program.block(sub_block_idx) + original_block_idx = program.current_block_idx grad_sub_block = program.create_block(parent_idx=sub_block_idx) + program.current_block_idx = original_block_idx _backward_impl_(target, sub_block, grad_sub_block, no_grad_set, grad_info_map, callback) grad_sub_block_list.append(grad_sub_block.desc) @@ -120,9 +122,9 @@ def _backward_impl_(target, pending_sum_ops.append((_create_op_desc_( op_type="sum", inputs={"X": inputs}, - outputs={"Out": var_name}, + outputs={"Out": [var_name]}, attrs={}), len(grad_op_descs))) - # 根据append的顺序可以看出pending_sum_ops一定是根据sum_op的插入位置排序的 + # sum_op descs are sorted according to their insert position for p in reversed(pending_sum_ops): grad_op_descs.insert(p[1], p[0]) # Remove ops whose outputs are all in no_grad_set From e12d1a1ce04275e4d91788a0482f3c0ebcfab609 Mon Sep 17 00:00:00 2001 From: sweetsky0901 Date: Mon, 25 Dec 2017 16:48:54 +0800 Subject: [PATCH 035/181] for esp data type --- paddle/operators/norm_op.cc | 20 ++------------------ paddle/operators/norm_op.cu | 4 ++-- 2 files changed, 4 insertions(+), 20 deletions(-) diff --git a/paddle/operators/norm_op.cc b/paddle/operators/norm_op.cc index 1d9b55d887..b198b76cd4 100644 --- a/paddle/operators/norm_op.cc +++ b/paddle/operators/norm_op.cc @@ -55,14 +55,6 @@ class NormOpMaker : public framework::OpProtoAndCheckerMaker { }; class NormOp : public framework::OperatorWithKernel { - protected: - framework::OpKernelType GetKernelType( - const framework::ExecutionContext& ctx) const override { - return framework::OpKernelType( - framework::ToDataType(ctx.Input("X")->type()), - ctx.device_context()); - } - public: using framework::OperatorWithKernel::OperatorWithKernel; void InferShape(framework::InferShapeContext* ctx) const override { @@ -80,14 +72,6 @@ class NormOp : public framework::OperatorWithKernel { }; class NormOpGrad : public framework::OperatorWithKernel { - protected: - framework::OpKernelType GetKernelType( - const framework::ExecutionContext& ctx) const override { - return framework::OpKernelType( - framework::ToDataType(ctx.Input("X")->type()), - ctx.device_context()); 
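As a quick sanity check of the conv2d output-shape formula documented above, the arithmetic in plain Python (values taken from the docstring example; dilation defaults to 1):

def conv2d_out_dim(in_dim, filter_dim, padding=0, stride=1, dilation=1):
    return (in_dim + 2 * padding - (dilation * (filter_dim - 1) + 1)) // stride + 1

print(conv2d_out_dim(32, 3))             # 30: 32x32 input, 3x3 filter, no padding
print(conv2d_out_dim(32, 3, padding=1))  # 32: padding of 1 keeps the spatial size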
- } - public: using framework::OperatorWithKernel::OperatorWithKernel; void InferShape(framework::InferShapeContext* ctx) const override { @@ -105,7 +89,7 @@ REGISTER_OP(norm, ops::NormOp, ops::NormOpMaker, norm_grad, ops::NormOpGrad); REGISTER_OP_CPU_KERNEL( norm, ops::NormKernel, - ops::NormKernel); + ops::NormKernel); REGISTER_OP_CPU_KERNEL( norm_grad, ops::NormGradKernel, - ops::NormGradKernel); + ops::NormGradKernel); diff --git a/paddle/operators/norm_op.cu b/paddle/operators/norm_op.cu index 7d84aaa732..2941c89b93 100644 --- a/paddle/operators/norm_op.cu +++ b/paddle/operators/norm_op.cu @@ -18,7 +18,7 @@ limitations under the License. */ namespace ops = paddle::operators; REGISTER_OP_CUDA_KERNEL( norm, ops::NormKernel, - ops::NormKernel); + ops::NormKernel); REGISTER_OP_CUDA_KERNEL( norm_grad, ops::NormGradKernel, - ops::NormGradKernel); + ops::NormGradKernel); From 5383f3c445ff23fd8a0bcea9e000fbb82dadd637 Mon Sep 17 00:00:00 2001 From: fengjiayi Date: Mon, 25 Dec 2017 17:22:31 +0800 Subject: [PATCH 036/181] pass test_machine_translation.py --- doc/design/optimizer.md | 2 +- python/paddle/v2/fluid/backward.py | 87 +++++++++++-------- python/paddle/v2/fluid/optimizer.py | 6 +- python/paddle/v2/fluid/tests/op_test.py | 4 +- .../fluid/tests/test_array_read_write_op.py | 4 +- .../v2/fluid/tests/test_conditional_block.py | 4 +- .../fluid/tests/test_lod_tensor_array_ops.py | 4 +- .../paddle/v2/fluid/tests/test_optimizer.py | 14 +-- .../v2/fluid/tests/test_recurrent_op.py | 4 +- .../paddle/v2/fluid/tests/test_regularizer.py | 6 +- .../fluid/tests/test_rnn_memory_helper_op.py | 2 +- .../v2/fluid/tests/test_shrink_rnn_memory.py | 4 +- .../test_split_and_merge_lod_tensor_op.py | 4 +- python/paddle/v2/fluid/tests/test_while_op.py | 4 +- 14 files changed, 80 insertions(+), 69 deletions(-) diff --git a/doc/design/optimizer.md b/doc/design/optimizer.md index 202b4b6510..691081c268 100644 --- a/doc/design/optimizer.md +++ b/doc/design/optimizer.md @@ -79,7 +79,7 @@ class Optimizer(object): def minimize(self, loss, parameter_list): """Add operations to minimize `loss` by updating `parameter_list`. - This method combines interface `append_backward_ops()` and + This method combines interface `append_backward()` and `create_optimization_pass()` into one. """ params_grads = self.create_backward_pass(loss, parameter_list) diff --git a/python/paddle/v2/fluid/backward.py b/python/paddle/v2/fluid/backward.py index b12767b3bb..382d057be4 100644 --- a/python/paddle/v2/fluid/backward.py +++ b/python/paddle/v2/fluid/backward.py @@ -3,7 +3,7 @@ from . 
import core import collections import pdb -__all__ = ['append_backward_ops'] +__all__ = ['append_backward'] def _rename_arg_(op_desc_list, old_name, new_name, begin_idx=None, @@ -57,12 +57,11 @@ def _append_grad_suffix_(name): return name + core.grad_var_suffix() -def _backward_impl_(target, - block, - target_block, - no_grad_set, - grad_info_map, - callback=None): +def _append_backward_ops_(target, + block, + target_block, + no_grad_set, + callback=None): grad_op_descs = [] grad_to_var = dict() program = block.program @@ -71,11 +70,10 @@ def _backward_impl_(target, if each_op.has_attr("sub_block"): sub_block_idx = each_op.block_attr("sub_block") sub_block = program.block(sub_block_idx) - original_block_idx = program.current_block_idx grad_sub_block = program.create_block(parent_idx=sub_block_idx) - program.current_block_idx = original_block_idx - _backward_impl_(target, sub_block, grad_sub_block, no_grad_set, - grad_info_map, callback) + sub_grad_to_var = _append_backward_ops_( + target, sub_block, grad_sub_block, no_grad_set, callback) + grad_to_var = dict(grad_to_var, **sub_grad_to_var) grad_sub_block_list.append(grad_sub_block.desc) grad_op_desc, op_grad_to_var = core.get_grad_op_desc( each_op.desc, no_grad_set[block.idx], grad_sub_block_list) @@ -143,20 +141,7 @@ def _backward_impl_(target, "fill_zeros_like", {"X": [_strip_grad_suffix_(arg)]}, {"Y": [arg]}, {}) grad_op_descs.insert(ele[1], fill_zeros_like_op) - # create new gradient variables in the target block desc - new_vars = set() - for op_desc in grad_op_descs: - for grad_var_name in op_desc.output_arg_names(): - grad_var_name = grad_var_name.encode("ascii") - if target_block.desc.has_var_recursive( - grad_var_name) or grad_var_name == core.empty_var_name(): - continue - target_block.desc.var(grad_var_name) - new_vars.add(grad_var_name) - if not grad_to_var.has_key(grad_var_name): - continue - grad_info_map[grad_to_var[grad_var_name]] = (grad_var_name, - target_block) + if target_block.idx == 0: grad_target_name = _append_grad_suffix_(target.name) target_block.desc.var(grad_target_name.encode("ascii")) @@ -171,20 +156,40 @@ def _backward_impl_(target, "value": 1.0, "dtype": core.DataType.FP32 })) - # insert backward operators to target_block for op_desc in grad_op_descs: - op_desc.infer_var_type(target_block.desc) - op_desc.infer_shape(target_block.desc) - for arg in op_desc.output_arg_names(): - if arg in new_vars: - _infer_var_data_type_(arg, target_block) new_op_desc = target_block.desc.append_op() new_op_desc.copy_from(op_desc) - target_block.sync_with_cpp() + return grad_to_var + + +def _append_backward_vars_(block, start_op_idx, grad_to_var, grad_info_map): + for op_idx in range(start_op_idx, block.desc.op_size()): + op_desc = block.desc.op(op_idx) + if op_desc.has_attr("sub_block"): + sub_block = block.program.block(op_desc.block_attr("sub_block")) + _append_backward_vars_(sub_block, 0, grad_to_var, grad_info_map) + new_vars = set() + # create new gradient variables + for grad_var_name in op_desc.output_arg_names(): + grad_var_name = grad_var_name.encode("ascii") + if block.desc.has_var_recursive( + grad_var_name) or grad_var_name == core.empty_var_name(): + continue + block.desc.var(grad_var_name) + new_vars.add(grad_var_name) + if not grad_to_var.has_key(grad_var_name): + continue + grad_info_map[grad_to_var[grad_var_name]] = (grad_var_name, block) + # infer_shape and infer_type + op_desc.infer_var_type(block.desc) + op_desc.infer_shape(block.desc) + for arg in op_desc.output_arg_names(): + if arg in new_vars: + 
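# infer the data type only for gradient variables created in this
+                    # pass; variables that already existed keep their original dtype
+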
_infer_var_data_type_(arg, block) -def append_backward_ops(loss, parameter_list=None, no_grad_set=None): +def append_backward(loss, parameter_list=None, no_grad_set=None): """ Create and add gradient Operators in BlockDesc to compute gradients of `loss` for parameters in parameter_list @@ -201,9 +206,9 @@ def append_backward_ops(loss, parameter_list=None, no_grad_set=None): """ assert isinstance(loss, framework.Variable) + program = loss.block.program if no_grad_set is None: no_grad_set = dict() - program = loss.block.program assert isinstance(program, framework.Program) for block in program.blocks: assert isinstance(block, framework.Block) @@ -215,14 +220,20 @@ def append_backward_ops(loss, parameter_list=None, no_grad_set=None): no_grad_set[block.idx] = block_no_grad_set grad_info_map = dict() - root_block = loss.block.program.block(0) + root_block = program.block(0) - _backward_impl_(loss, root_block, root_block, no_grad_set, grad_info_map) + fwd_op_num = root_block.desc.op_size() + current_block_idx = program.current_block_idx + grad_to_var = _append_backward_ops_(loss, root_block, root_block, + no_grad_set) + _append_backward_vars_(root_block, fwd_op_num, grad_to_var, grad_info_map) + program.current_block_idx = current_block_idx + program.sync_with_cpp() if parameter_list is not None: parameters = parameter_list else: - params = loss.block.program.global_block().all_parameters() + params = program.global_block().all_parameters() parameters = [param.name for param in params] params_and_grads = [] for param in parameters: @@ -234,7 +245,7 @@ def append_backward_ops(loss, parameter_list=None, no_grad_set=None): raise ValueError("grad block[{0}] did not have grad var {1}".format( grad_info[1], grad_info[0])) # Get the param var from the global block - param_var = loss.block.program.global_block().var(param) + param_var = program.global_block().var(param) grad_var = grad_block.var(grad_info[0]) if loss.block.has_var(grad_info[0]): params_and_grads.append((param_var, grad_var)) diff --git a/python/paddle/v2/fluid/optimizer.py b/python/paddle/v2/fluid/optimizer.py index bbdfab2df9..e1830a7bc7 100644 --- a/python/paddle/v2/fluid/optimizer.py +++ b/python/paddle/v2/fluid/optimizer.py @@ -1,7 +1,7 @@ from collections import defaultdict import framework -from backward import append_backward_ops +from backward import append_backward from framework import unique_name from initializer import Constant from layer_helper import LayerHelper @@ -195,10 +195,10 @@ class Optimizer(object): no_grad_set=None): """Add operations to minimize `loss` by updating `parameter_list`. - This method combines interface `append_backward_ops()` and + This method combines interface `append_backward()` and `create_optimization_pass()` into one. 
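+        Examples (an illustrative sketch only; assumes a scalar `loss`
+        variable was already built in the default program):
+            .. code-block:: python
+
+                sgd_optimizer = fluid.optimizer.SGDOptimizer(learning_rate=0.01)
+                sgd_optimizer.minimize(loss)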
""" - params_grads = append_backward_ops(loss, parameter_list, no_grad_set) + params_grads = append_backward(loss, parameter_list, no_grad_set) # Add regularization if any params_grads = append_regularization_ops(params_grads, self.regularization) diff --git a/python/paddle/v2/fluid/tests/op_test.py b/python/paddle/v2/fluid/tests/op_test.py index e83c4a0622..e4c9b0218d 100644 --- a/python/paddle/v2/fluid/tests/op_test.py +++ b/python/paddle/v2/fluid/tests/op_test.py @@ -4,7 +4,7 @@ import random import itertools import paddle.v2.fluid.core as core import collections -from paddle.v2.fluid.backward import append_backward_ops +from paddle.v2.fluid.backward import append_backward from paddle.v2.fluid.op import Operator from paddle.v2.fluid.executor import Executor from paddle.v2.fluid.framework import Program, OpProtoHolder @@ -493,7 +493,7 @@ class OpTest(unittest.TestCase): op_loss.desc.infer_var_type(block.desc) op_loss.desc.infer_shape(block.desc) - param_grad_list = append_backward_ops( + param_grad_list = append_backward( loss=loss, parameter_list=input_to_check, no_grad_set=no_grad_set) feed_dict = { diff --git a/python/paddle/v2/fluid/tests/test_array_read_write_op.py b/python/paddle/v2/fluid/tests/test_array_read_write_op.py index f6120aedec..01321de8ea 100644 --- a/python/paddle/v2/fluid/tests/test_array_read_write_op.py +++ b/python/paddle/v2/fluid/tests/test_array_read_write_op.py @@ -2,7 +2,7 @@ import unittest import paddle.v2.fluid.core as core import paddle.v2.fluid.layers as layers from paddle.v2.fluid.executor import Executor -from paddle.v2.fluid.backward import append_backward_ops +from paddle.v2.fluid.backward import append_backward from paddle.v2.fluid.framework import default_main_program import numpy @@ -64,7 +64,7 @@ class TestArrayReadWrite(unittest.TestCase): total_sum = layers.sums(input=[a_sum, x_sum]) total_sum_scaled = layers.scale(x=total_sum, scale=1 / 6.0) - append_backward_ops(total_sum_scaled) + append_backward(total_sum_scaled) g_vars = map(default_main_program().global_block().var, [each_x.name + "@GRAD" for each_x in x]) diff --git a/python/paddle/v2/fluid/tests/test_conditional_block.py b/python/paddle/v2/fluid/tests/test_conditional_block.py index 2b9d8f351a..7d815123f3 100644 --- a/python/paddle/v2/fluid/tests/test_conditional_block.py +++ b/python/paddle/v2/fluid/tests/test_conditional_block.py @@ -3,7 +3,7 @@ import paddle.v2.fluid.layers as layers import paddle.v2.fluid.core as core from paddle.v2.fluid.framework import default_startup_program, default_main_program from paddle.v2.fluid.executor import Executor -from paddle.v2.fluid.backward import append_backward_ops +from paddle.v2.fluid.backward import append_backward import numpy @@ -26,7 +26,7 @@ class ConditionalBlock(unittest.TestCase): outs = exe.run(feed={'X': x}, fetch_list=[out])[0] print outs loss = layers.mean(x=out) - append_backward_ops(loss=loss) + append_backward(loss=loss) outs = exe.run( feed={'X': x}, fetch_list=[ diff --git a/python/paddle/v2/fluid/tests/test_lod_tensor_array_ops.py b/python/paddle/v2/fluid/tests/test_lod_tensor_array_ops.py index 0a916a55bc..ede1948937 100644 --- a/python/paddle/v2/fluid/tests/test_lod_tensor_array_ops.py +++ b/python/paddle/v2/fluid/tests/test_lod_tensor_array_ops.py @@ -4,7 +4,7 @@ import numpy import paddle.v2.fluid.layers as layers from paddle.v2.fluid.framework import Program from paddle.v2.fluid.executor import Executor -from paddle.v2.fluid.backward import append_backward_ops +from paddle.v2.fluid.backward import append_backward class 
TestCPULoDTensorArrayOps(unittest.TestCase): @@ -172,7 +172,7 @@ class TestCPULoDTensorArrayOpGrad(unittest.TestCase): mean = layers.mean(x=result, main_program=program) - append_backward_ops(mean) + append_backward(mean) tensor = core.LoDTensor() tensor.set(numpy.arange(10).reshape(10, 1).astype('float32'), place) diff --git a/python/paddle/v2/fluid/tests/test_optimizer.py b/python/paddle/v2/fluid/tests/test_optimizer.py index 2459dfd664..3d40d63bd0 100644 --- a/python/paddle/v2/fluid/tests/test_optimizer.py +++ b/python/paddle/v2/fluid/tests/test_optimizer.py @@ -2,7 +2,7 @@ import unittest import paddle.v2.fluid.framework as framework import paddle.v2.fluid.optimizer as optimizer -from paddle.v2.fluid.backward import append_backward_ops +from paddle.v2.fluid.backward import append_backward class TestOptimizer(unittest.TestCase): @@ -102,7 +102,7 @@ class TestMomentumOptimizer(unittest.TestCase): dtype="float32", shape=[1], lod_level=0, name="mean.out") block.append_op( type="mean", inputs={"X": mul_out}, outputs={"Out": mean_out}) - params_grads = append_backward_ops(mean_out) + params_grads = append_backward(mean_out) self.assertEqual(len(params_grads), 1) self.assertEqual(len(momentum_optimizer.get_accumulators()), 0) opts = momentum_optimizer.create_optimization_pass( @@ -151,7 +151,7 @@ class TestMomentumOptimizer(unittest.TestCase): learning_rate = 0.01 momentum_optimizer = self.MockMomentum( learning_rate=learning_rate, momentum=0.2, use_nesterov=True) - params_grads = append_backward_ops(mean_out) + params_grads = append_backward(mean_out) self.assertEqual(len(params_grads), 1) self.assertEqual(len(momentum_optimizer.get_accumulators()), 0) opts = momentum_optimizer.create_optimization_pass( @@ -209,7 +209,7 @@ class TestAdagradOptimizer(unittest.TestCase): learning_rate = 0.01 adagrad_optimizer = self.MockAdagrad( learning_rate=learning_rate, epsilon=1.0e-6) - params_grads = append_backward_ops(mean_out) + params_grads = append_backward(mean_out) self.assertEqual(len(params_grads), 1) self.assertEqual(len(adagrad_optimizer.get_accumulators()), 0) opts = adagrad_optimizer.create_optimization_pass(params_grads, mul_out, @@ -269,7 +269,7 @@ class TestAdamOptimizer(unittest.TestCase): learning_rate = 0.01 adam_optimizer = self.MockAdam( learning_rate=learning_rate, beta1=0.9, beta2=0.999) - params_grads = append_backward_ops(mean_out) + params_grads = append_backward(mean_out) self.assertEqual(len(params_grads), 1) self.assertEqual(len(adam_optimizer.get_accumulators()), 0) opts = adam_optimizer.create_optimization_pass(params_grads, mul_out, @@ -331,7 +331,7 @@ class TestAdamaxOptimizer(unittest.TestCase): learning_rate = 0.01 adamax_optimizer = self.MockAdamax( learning_rate=learning_rate, beta1=0.9, beta2=0.999) - params_grads = append_backward_ops(mean_out) + params_grads = append_backward(mean_out) self.assertEqual(len(params_grads), 1) self.assertEqual(len(adamax_optimizer.get_accumulators()), 0) opts = adamax_optimizer.create_optimization_pass(params_grads, mul_out, @@ -390,7 +390,7 @@ class TestDecayedAdagradOptimizer(unittest.TestCase): learning_rate = 0.01 decayed_adagrad_optimizer = self.MockDecayedAdagrad( learning_rate=learning_rate, decay=0.95, epsilon=1.0e-6) - params_grads = append_backward_ops(mean_out) + params_grads = append_backward(mean_out) self.assertEqual(len(params_grads), 1) self.assertEqual(len(decayed_adagrad_optimizer.get_accumulators()), 0) opts = decayed_adagrad_optimizer.create_optimization_pass( diff --git 
a/python/paddle/v2/fluid/tests/test_recurrent_op.py b/python/paddle/v2/fluid/tests/test_recurrent_op.py index 694ff0d8dd..609287bbce 100644 --- a/python/paddle/v2/fluid/tests/test_recurrent_op.py +++ b/python/paddle/v2/fluid/tests/test_recurrent_op.py @@ -3,7 +3,7 @@ import unittest import paddle.v2.fluid.layers as layers from paddle.v2.fluid.framework import Program from paddle.v2.fluid.executor import Executor -from paddle.v2.fluid.backward import append_backward_ops +from paddle.v2.fluid.backward import append_backward import numpy as np import paddle.v2.fluid.core as core @@ -177,7 +177,7 @@ class RecurrentOpTest1(unittest.TestCase): def test_backward(self): self.check_forward() - append_backward_ops(self.output) + append_backward(self.output) ana_grad = [np.array(x) for x in self.backward()] diff --git a/python/paddle/v2/fluid/tests/test_regularizer.py b/python/paddle/v2/fluid/tests/test_regularizer.py index 24baf55e90..890c881a12 100644 --- a/python/paddle/v2/fluid/tests/test_regularizer.py +++ b/python/paddle/v2/fluid/tests/test_regularizer.py @@ -3,7 +3,7 @@ import unittest import paddle.v2.fluid.framework as framework import paddle.v2.fluid.optimizer as optimizer import paddle.v2.fluid.regularizer as regularizer -from paddle.v2.fluid.backward import append_backward_ops +from paddle.v2.fluid.backward import append_backward class TestL2DecayRegularizer(unittest.TestCase): @@ -33,7 +33,7 @@ class TestL2DecayRegularizer(unittest.TestCase): dtype="float32", shape=[1], lod_level=0, name="mean.out") block.append_op( type="mean", inputs={"X": mul_out}, outputs={"Out": mean_out}) - params_grads = append_backward_ops(mean_out) + params_grads = append_backward(mean_out) self.assertEqual(len(params_grads), 1) count_ops = len(block.ops) params_grads = optimizer.append_regularization_ops(params_grads) @@ -70,7 +70,7 @@ class TestL1DecayRegularizer(unittest.TestCase): dtype="float32", shape=[1], lod_level=0, name="mean.out") block.append_op( type="mean", inputs={"X": mul_out}, outputs={"Out": mean_out}) - params_grads = append_backward_ops(mean_out) + params_grads = append_backward(mean_out) self.assertEqual(len(params_grads), 1) count_ops = len(block.ops) params_grads = optimizer.append_regularization_ops(params_grads) diff --git a/python/paddle/v2/fluid/tests/test_rnn_memory_helper_op.py b/python/paddle/v2/fluid/tests/test_rnn_memory_helper_op.py index 9999165ed5..d1bb20f37a 100644 --- a/python/paddle/v2/fluid/tests/test_rnn_memory_helper_op.py +++ b/python/paddle/v2/fluid/tests/test_rnn_memory_helper_op.py @@ -2,7 +2,7 @@ import unittest from paddle.v2.fluid.framework import Program from paddle.v2.fluid.executor import Executor -from paddle.v2.fluid.backward import append_backward_ops +from paddle.v2.fluid.backward import append_backward import numpy as np import paddle.v2.fluid.core as core diff --git a/python/paddle/v2/fluid/tests/test_shrink_rnn_memory.py b/python/paddle/v2/fluid/tests/test_shrink_rnn_memory.py index 86db4c64b4..be1588fc2d 100644 --- a/python/paddle/v2/fluid/tests/test_shrink_rnn_memory.py +++ b/python/paddle/v2/fluid/tests/test_shrink_rnn_memory.py @@ -2,7 +2,7 @@ import unittest import paddle.v2.fluid.core as core from paddle.v2.fluid.executor import Executor import paddle.v2.fluid.layers as layers -from paddle.v2.fluid.backward import append_backward_ops +from paddle.v2.fluid.backward import append_backward from paddle.v2.fluid.framework import default_main_program import numpy @@ -35,7 +35,7 @@ class TestShrinkRNNMemory(unittest.TestCase): 
self.assertTrue(numpy.allclose(tensor_np[0:1], outs[2])) mem3_mean = layers.mean(x=mem3) - append_backward_ops(loss=mem3_mean) + append_backward(loss=mem3_mean) x_grad = exe.run( feed={'x': tensor}, fetch_list=[main_program.global_block().var('x@GRAD')])[0] diff --git a/python/paddle/v2/fluid/tests/test_split_and_merge_lod_tensor_op.py b/python/paddle/v2/fluid/tests/test_split_and_merge_lod_tensor_op.py index f5da4e408f..f3c634e8f1 100644 --- a/python/paddle/v2/fluid/tests/test_split_and_merge_lod_tensor_op.py +++ b/python/paddle/v2/fluid/tests/test_split_and_merge_lod_tensor_op.py @@ -4,7 +4,7 @@ import numpy as np import paddle.v2.fluid.layers as layers from paddle.v2.fluid.framework import Program from paddle.v2.fluid.executor import Executor -from paddle.v2.fluid.backward import append_backward_ops +from paddle.v2.fluid.backward import append_backward class TestCPULoDTensorArrayOps(unittest.TestCase): @@ -150,7 +150,7 @@ class TestCPUSplitMergeLoDTensorGrad(unittest.TestCase): main_program=program) mean = layers.mean(x=out, main_program=program) - append_backward_ops(mean) + append_backward(mean) tensor = core.LoDTensor() tensor.set(np.arange(10).reshape(10, 1).astype('float32'), place) diff --git a/python/paddle/v2/fluid/tests/test_while_op.py b/python/paddle/v2/fluid/tests/test_while_op.py index 033b03a495..7c5593cc5e 100644 --- a/python/paddle/v2/fluid/tests/test_while_op.py +++ b/python/paddle/v2/fluid/tests/test_while_op.py @@ -2,7 +2,7 @@ import unittest import paddle.v2.fluid.layers as layers from paddle.v2.fluid.executor import Executor import paddle.v2.fluid.core as core -from paddle.v2.fluid.backward import append_backward_ops +from paddle.v2.fluid.backward import append_backward import numpy @@ -46,7 +46,7 @@ class TestWhileOp(unittest.TestCase): sum_result = layers.array_read(array=mem_array, i=i) loss = layers.mean(x=sum_result) - append_backward_ops(loss) + append_backward(loss) cpu = core.CPUPlace() exe = Executor(cpu) From 77fffc60c6de0ffce97c0610d293179e306e079f Mon Sep 17 00:00:00 2001 From: fengjiayi Date: Mon, 25 Dec 2017 17:50:52 +0800 Subject: [PATCH 037/181] fix a bug --- python/paddle/v2/fluid/backward.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/python/paddle/v2/fluid/backward.py b/python/paddle/v2/fluid/backward.py index 382d057be4..0e3c8762fd 100644 --- a/python/paddle/v2/fluid/backward.py +++ b/python/paddle/v2/fluid/backward.py @@ -13,8 +13,11 @@ def _rename_arg_(op_desc_list, old_name, new_name, begin_idx=None, if end_idx is None: end_idx = len(op_desc_list) for i in range(begin_idx, end_idx): - op_desc_list[i].rename_input(old_name, new_name) - op_desc_list[i].rename_output(old_name, new_name) + op_desc = op_desc_list[i] + if isinstance(op_desc, tuple): + op_desc = op_desc[0] + op_desc.rename_input(old_name, new_name) + op_desc.rename_output(old_name, new_name) def _create_op_desc_(op_type, inputs, outputs, attrs): From bad3d4b661fca8ae74ab45ff980590e4709a71a9 Mon Sep 17 00:00:00 2001 From: Yang Yu Date: Mon, 25 Dec 2017 19:21:13 +0800 Subject: [PATCH 038/181] Grad Check For RNN --- .../operators/tensor_array_read_write_op.cc | 11 + paddle/operators/while_op.cc | 15 +- .../fluid/tests/test_dynrnn_gradient_check.py | 215 ++++++++++++++++++ 3 files changed, 240 insertions(+), 1 deletion(-) create mode 100644 python/paddle/v2/fluid/tests/test_dynrnn_gradient_check.py diff --git a/paddle/operators/tensor_array_read_write_op.cc b/paddle/operators/tensor_array_read_write_op.cc index 2ee9bf700c..59a4dac940 100644 --- 
a/paddle/operators/tensor_array_read_write_op.cc +++ b/paddle/operators/tensor_array_read_write_op.cc @@ -136,6 +136,17 @@ class ReadFromArrayOp : public ArrayOp { auto &dev_ctx = *pool.Borrow(place); framework::CopyFrom(x_array[offset], place, dev_ctx, out_tensor); out_tensor->set_lod(x_array[offset].lod()); + if (Input("X") == "dynamic_rnn_0_output_array_fc_0.tmp_0_0@GRAD") { + VLOG(10) << "Offset = " << offset; + if (x_array[offset].numel() != 0) { + auto d = x_array[offset].dims(); + std::ostringstream sout; + for (int64_t i = 0; i < d[0]; ++i) { + sout << x_array[offset].data()[0 * d[1]] << ", "; + } + VLOG(10) << "Grad = " << sout.str(); + } + } } else { VLOG(10) << "offset " << offset << " >= " << x_array.size(); } diff --git a/paddle/operators/while_op.cc b/paddle/operators/while_op.cc index 11ee96faad..d7c34297cd 100644 --- a/paddle/operators/while_op.cc +++ b/paddle/operators/while_op.cc @@ -129,6 +129,9 @@ class WhileGradOp : public framework::OperatorBase { auto &og_inside = detail::Ref(cur_scope.Var(inside_og_name), "Cannot find inside gradient %s", inside_og_name); + + VLOG(10) << "OG " << outside_og_name << " Type is " + << og_outside.Type().name(); if (og_outside.Type().hash_code() == typeid(framework::LoDTensor).hash_code()) { auto &outside_tensor = og_outside.Get(); @@ -145,7 +148,6 @@ class WhileGradOp : public framework::OperatorBase { inside_array.resize(outside_array.size()); for (size_t j = 0; j < inside_array.size(); ++j) { - VLOG(10) << j << " " << outside_array[j].numel(); if (outside_array[j].numel() != 0) { inside_array[j].set_lod(outside_array[j].lod()); inside_array[j].ShareDataWith(outside_array[j]); @@ -198,6 +200,17 @@ class WhileGradOp : public framework::OperatorBase { auto sum_op = framework::OpRegistry::CreateOp( "sum", {{"X", {pg_names[param_id], new_inside_name}}}, {{"Out", {pg_names[param_id]}}}, framework::AttributeMap{}); + + VLOG(10) << "Accumulate the gradient of " << pg_names[param_id]; + + if (pg_names[param_id] == "W@GRAD") { + auto &w_g = detail::Ref(cur_scope.FindVar(new_inside_name)) + .Get(); + VLOG(10) << "W_G is" << w_g.data()[0]; + } else { + VLOG(10) << pg_names[param_id]; + } + sum_op->Run(cur_scope, dev_place); cur_scope.Rename(new_inside_name, inside_grad_name); } diff --git a/python/paddle/v2/fluid/tests/test_dynrnn_gradient_check.py b/python/paddle/v2/fluid/tests/test_dynrnn_gradient_check.py new file mode 100644 index 0000000000..99b9285466 --- /dev/null +++ b/python/paddle/v2/fluid/tests/test_dynrnn_gradient_check.py @@ -0,0 +1,215 @@ +import numpy +import random +import collections +import paddle.v2.fluid as fluid +import unittest +import copy + + +class Memory(object): + def __init__(self, shape, dtype='float32'): + self.ex = numpy.zeros(shape=shape, dtype=dtype) + self.cur = None + + def update(self, val): + assert val.shape == self.ex.shape + assert val.dtype == self.ex.dtype + self.cur = val + + def ex(self): + return self.ex + + def next(self): + self.ex = self.cur + self.cur = None + + def __next__(self): + self.next() + + def reset(self): + self.ex = numpy.zeros(shape=self.ex.shape, dtype=self.ex.dtype) + self.cur = None + + +class Output(object): + def __init__(self): + self.outs = [] + + def next_sequence(self): + self.outs.append([]) + + def out(self, val): + self.outs[-1].append(val) + + def last(self): + return self.outs[-1][-1] + + +class BaseRNN(object): + def __init__(self, ins, mems, params, outs, num_seq=5, max_seq_len=15): + self.num_seq = num_seq + self.inputs = collections.defaultdict(list) + + for _ in 
xrange(num_seq): + seq_len = random.randint(1, max_seq_len - 1) + for iname in ins: + ishape = ins[iname].get('shape', None) + idtype = ins[iname].get('dtype', 'float32') + lst = [] + for _ in xrange(seq_len): + lst.append(numpy.random.random(size=ishape).astype(idtype)) + self.inputs[iname].append(lst) + + self.mems = dict() + for mname in mems: + mshape = mems[mname].get('shape', None) + mdtype = mems[mname].get('dtype', 'float32') + self.mems[mname] = Memory(shape=mshape, dtype=mdtype) + + self.params = dict() + for pname in params: + pshape = params[pname].get('shape', None) + pdtype = params[pname].get('dtype', 'float32') + self.params[pname] = numpy.random.random(size=pshape).astype(pdtype) + + self.outputs = dict() + + for oname in outs: + self.outputs[oname] = Output() + + def step(self, **kwargs): + pass + + def exe(self): + retv = dict() + for out in self.outputs: + retv[out] = [] + + for seq_id in xrange(self.num_seq): + for mname in self.mems: + self.mems[mname].reset() + for out in self.outputs: + self.outputs[out].next_sequence() + + iname0 = self.inputs.keys()[0] + seq_len = len(self.inputs[iname0][seq_id]) + + for step_id in xrange(seq_len): + xargs = dict() + + for iname in self.inputs: + xargs[iname] = self.inputs[iname][seq_id][step_id] + + for mname in self.mems: + xargs[mname] = self.mems[mname] + + for pname in self.params: + xargs[pname] = self.params[pname] + + for out in self.outputs: + xargs[out] = self.outputs[out] + + self.step(**xargs) + + for mname in self.mems: + next(self.mems[mname]) + + for out in self.outputs: + retv[out].append(self.outputs[out].last()) + + for out in retv: + retv[out] = numpy.array(retv[out]) + return retv + + def to_feed(self, place): + feed_dict = dict() + + for iname in self.inputs: + lod = [0] + np_flatten = [] + for seq_id in xrange(len(self.inputs[iname])): + seq_len = len(self.inputs[iname][seq_id]) + lod.append(lod[-1] + seq_len) + np_flatten.extend(self.inputs[iname][seq_id]) + + t = fluid.Tensor() + t.set(numpy.array(np_flatten), place) + t.set_lod([lod]) + feed_dict[iname] = t + + for pname in self.params: + feed_dict[pname] = self.params[pname] + return feed_dict + + def get_numeric_gradient_of_param(self, param_name, delta=0.01): + p = self.params[param_name] + g = numpy.zeros(shape=p.shape, dtype=p.dtype) + + for p_it, g_it in numpy.nditer([p, g], op_flags=['readwrite']): + o = float(p_it) + p_it[...] = o + delta + pos = self._exe_mean_out_() + p_it[...] = o - delta + neg = self._exe_mean_out_() + p_it[...] 
= o + g[:] = (pos - neg) / (delta * 2) + return g + + def _exe_mean_out_(self): + outs = self.exe() + return numpy.array([o.mean() for o in outs.itervalues()]).mean() + + +class SimpleMul(BaseRNN): + def __init__(self): + super(SimpleMul, self).__init__({ + 'X': { + 'shape': [32] + } + }, {}, {'W': { + 'shape': [32, 10] + }}, ['Out']) + + def step(self, X, W, Out): + Out.out(numpy.matmul(X, W)) + + +class TestSimpleMul(unittest.TestCase): + def setUp(self): + self.python_impl = SimpleMul() + + def test_forward(self): + program = fluid.Program() + startup_program = fluid.Program() + with fluid.program_guard(program, startup_program): + dat = fluid.layers.data(name='X', shape=[32], lod_level=1) + + rnn = fluid.layers.DynamicRNN() + with rnn.block(): + d = rnn.step_input(dat) + o = fluid.layers.fc(input=d, + param_attr='W', + bias_attr=False, + size=10, + act=None) + rnn.output(o) + + out = rnn() + out = fluid.layers.sequence_pool(out, pool_type='last') + loss = fluid.layers.mean(x=out) + fluid.backward.append_backward_ops(loss) + + cpu = fluid.CPUPlace() + exe = fluid.Executor(cpu) + out, w_g = exe.run(program, + feed=self.python_impl.to_feed(cpu), + fetch_list=[out, "W@GRAD"]) + out_by_python = self.python_impl.exe()['Out'] + self.assertTrue(numpy.allclose(out, out_by_python)) + w_g_num = self.python_impl.get_numeric_gradient_of_param("W") + print w_g_num[0][0] + print w_g_num - w_g + + +if __name__ == '__main__': + unittest.main() From bcf0b56f6a0d649e1f7e71df485d2d0f2a278a77 Mon Sep 17 00:00:00 2001 From: chengduoZH Date: Sat, 23 Dec 2017 17:59:43 +0800 Subject: [PATCH 039/181] refine iterator --- paddle/operators/cos_sim_op.h | 335 ++++++++++++++------- paddle/operators/elementwise_op_function.h | 55 ---- 2 files changed, 229 insertions(+), 161 deletions(-) diff --git a/paddle/operators/cos_sim_op.h b/paddle/operators/cos_sim_op.h index 3a7e67506d..e96592ab28 100644 --- a/paddle/operators/cos_sim_op.h +++ b/paddle/operators/cos_sim_op.h @@ -15,7 +15,7 @@ #pragma once #include "paddle/framework/eigen.h" #include "paddle/framework/op_registry.h" -#include "paddle/operators/elementwise_add_op.h" +#include "paddle/operators/elementwise_op_function.h" namespace paddle { namespace operators { @@ -28,27 +28,73 @@ template using EigenVector = framework::EigenVector; -template -void Function_forward(T* out, T* x_norm, T* y_norm, - ElementIterator& x, - ElementIterator& y, int row, int col) { - for (int i = 0; i < row; ++i) { - T xx = 0; +template +static void ForEachZip(IT1 begin1, IT1 last1, IT2 begin2, Callback callback) { + // This method could be implemented in CUDA + for (; begin1 < last1; ++begin1, ++begin2) { + callback(*begin1, *begin2); + } +} + +template +struct CosSimFunctor { + CosSimFunctor(const T* x, const T* y, T* x_norm, T* y_norm, T* z, int cols) + : x_norm_(x_norm), + y_norm_(y_norm), + x_(x), + y_(y), + z_(z), + cols_(static_cast(cols)) {} + + inline void operator()(T& x_norm, T& y_norm) const { + size_t x_offset = &x_norm - x_norm_; + size_t y_offset = &y_norm - y_norm_; + + auto* x = x_ + cols_ * x_offset; + + T xx = 0, xy = 0; T yy = 0; - T xy = 0; - for (int j = 0; j < col; ++j) { - xy += (*x) * (*y); - xx += (*x) * (*x); - yy += (*y) * (*y); - ++y; - ++x; + if (same_row) { + auto* y = y_ + cols_ * y_offset; + for (size_t i = 0; i < cols_; ++i) { + xx += x[i] * x[i]; + yy += y[i] * y[i]; + xy += x[i] * y[i]; + } + xx = sqrt(xx); + yy = sqrt(yy); + x_norm_[x_offset] = xx; + y_norm_[y_offset] = yy; + z_[x_offset] = xy / (xx * yy); + } else { + auto* y = y_; + // if (yy == 
-1) { + // yy = 0; + // for (size_t i = 0; i < cols_; ++i) { + // yy += y[i] * y[i]; + // } + // y_norm[0] = sqrt(yy); + // } + for (size_t i = 0; i < cols_; ++i) { + xx += x[i] * x[i]; + yy += y[i] * y[i]; // only need + xy += x[i] * y[i]; + } + xx = sqrt(xx); + yy = sqrt(yy); + x_norm_[x_offset] = xx; + y_norm_[0] = yy; + z_[x_offset] = xy / (xx * yy); } - x_norm[i] = sqrt(xx); - y_norm[i] = sqrt(yy); - - out[i] = xy / (x_norm[i] * y_norm[i]); } -} + + T* x_norm_; + T* y_norm_; + const T* x_; + const T* y_; + T* z_; + const size_t cols_; +}; template class CosSimKernel : public framework::OpKernel { @@ -68,58 +114,140 @@ class CosSimKernel : public framework::OpKernel { int rows_y = in_y->dims()[0]; int cols = framework::product(in_x->dims()) / rows_x; - auto x_iter = ElementIterator(in_x->data(), rows_x, - cols, rows_x, cols); - auto y_iter = ElementIterator(in_y->data(), rows_y, - cols, rows_x, cols); - - Function_forward(out_z->data(), out_x_norm->data(), - out_y_norm->data(), x_iter, y_iter, rows_x, cols); - // - // // convert Tensor to Eigen Tensor - //// int rows_x = in_x->dims()[0]; - //// int rows_y = in_y->dims()[0]; - // auto x = EigenMatrix::Reshape(*in_x, 1); - // auto y = EigenMatrix::Reshape(*in_y, 1); - // auto z = EigenVector::Flatten(*out_z); - // auto x_norm = EigenVector::Flatten(*out_x_norm); - // auto y_norm = EigenVector::Flatten(*out_y_norm); - // - // // compute - // auto& place = - // *context.template device_context().eigen_device(); - // auto row_along = Eigen::array({{1}}); - // x_norm.device(place) = x.square().sum(row_along).sqrt(); - // y_norm.device(place) = y.square().sum(row_along).sqrt(); - // if (rows_x == rows_y) { - // auto xy = (x * y).sum(Eigen::array({{1}})); - // z.device(place) = xy / x_norm / y_norm; - // } else { - // Eigen::DSizes bcast(rows_x, 1); - // auto xy = (x * y.broadcast(bcast)).sum(row_along); - // z.device(place) = xy / x_norm / y_norm.broadcast(bcast); - // } + + if (rows_x == rows_y) { + CosSimFunctor functor( + in_x->data(), in_y->data(), out_x_norm->data(), + out_y_norm->data(), out_z->data(), cols); + ForEachZip(out_x_norm->data(), out_x_norm->data() + rows_x, + out_y_norm->data(), functor); + } else { + CosSimFunctor functor( + in_x->data(), in_y->data(), out_x_norm->data(), + out_y_norm->data(), out_z->data(), cols); + ForEachZip(out_x_norm->data(), out_x_norm->data() + rows_x, + out_y_norm->data(), functor); + } } }; -template -void Function_element(T* result, ElementIterator dz, - ElementIterator y, - ElementIterator x_norm, - ElementIterator y_norm, - ElementIterator z, - ElementIterator x, int num, int block) { - for (int i = 0; i < num; ++i) { - result[i % block] += (*dz) * ((*y) / ((*x_norm) * (*y_norm)) - - (*z) * (*x) / ((*x_norm) * (*x_norm))); - ++dz; - ++y; - ++x_norm; - ++y_norm; - ++z; - ++x; +template +struct CosSimGradFunctor { + CosSimGradFunctor(const T* x_norm, const T* y_norm, const T* x, const T* y, + const T* z, const T* dz, T* dx, int cols) + : x_norm_(x_norm), + y_norm_(y_norm), + x_(x), + y_(y), + z_(z), + dz_(dz), + dx_(dx), + cols_(static_cast(cols)) {} + + void operator()(const T& x_norm, const T& y_norm) const { + size_t x_offset = &x_norm - x_norm_; + size_t y_offset = &y_norm - y_norm_; + + auto x_norm_square = x_norm_[x_offset] * x_norm_[x_offset]; + // auto y_norm_square = y_norm_[y_offset] * y_norm_[y_offset]; + auto xy_norm_prod = x_norm_[x_offset] * y_norm_[y_offset]; + auto dz = dz_[x_offset]; + + auto* dx = dx_ + cols_ * x_offset; + auto* x = x_ + cols_ * x_offset; + auto* y = y_ + cols_ 
* y_offset; + auto z = z_[x_offset]; + + for (size_t i = 0; i < cols_; ++i) { + dx[i] = dz * (y[i] / xy_norm_prod - z * x[i] / x_norm_square); + } } -} + + const T* x_norm_; + const T* y_norm_; + const T* x_; + const T* y_; + const T* z_; + const T* dz_; + T* dx_; + const size_t cols_; +}; + +template +struct CosSimDxFunctor { + CosSimDxFunctor(const T* x_norm, const T* y_norm, const T* x, const T* y, + const T* z, const T* dz, T* dx, int cols) + : x_norm_(x_norm), + y_norm_(y_norm), + x_(x), + y_(y), + z_(z), + dz_(dz), + dx_(dx), + cols_(static_cast(cols)) {} + + void operator()(const T& x_norm, const T& y_norm) const { + size_t x_offset = &x_norm - x_norm_; + + auto x_norm_square = x_norm_[x_offset] * x_norm_[x_offset]; + auto xy_norm_prod = x_norm_[x_offset] * y_norm_[0]; + auto dz = dz_[x_offset]; + auto z = z_[x_offset]; + + auto* dx = dx_ + cols_ * x_offset; + auto* x = x_ + cols_ * x_offset; + + for (size_t i = 0; i < cols_; ++i) { + dx[i] = dz * (y_[i] / xy_norm_prod - z * x[i] / x_norm_square); + } + } + + const T* x_norm_; + const T* y_norm_; + const T* x_; + const T* y_; + const T* z_; + const T* dz_; + T* dx_; + const size_t cols_; +}; + +template +struct CosSimDyFunctor { + CosSimDyFunctor(const T* x_norm, const T* y_norm, const T* x, const T* y, + const T* z, const T* dz, T* dy, int cols) + : x_norm_(x_norm), + y_norm_(y_norm), + x_(x), + y_(y), + z_(z), + dz_(dz), + dy_(dy), + cols_(static_cast(cols)) {} + + void operator()(const T& x_norm, const T& y_norm) const { + size_t x_offset = &x_norm - x_norm_; + + auto y_norm_square = y_norm_[0] * y_norm_[0]; + auto xy_norm_prod = x_norm_[x_offset] * y_norm_[0]; + auto dz = dz_[x_offset]; + auto z = z_[x_offset]; + auto* x = x_ + cols_ * x_offset; + + for (size_t i = 0; i < cols_; ++i) { + dy_[i] += dz * (x[i] / xy_norm_prod - z * y_[i] / y_norm_square); + } + } + + const T* x_norm_; + const T* y_norm_; + const T* x_; + const T* y_; + const T* z_; + const T* dz_; + T* dy_; + const size_t cols_; +}; template class CosSimGradKernel : public framework::OpKernel { @@ -140,45 +268,40 @@ class CosSimGradKernel : public framework::OpKernel { int rows_y = in_y->dims()[0]; int cols = framework::product(in_x->dims()) / rows_x; - ////////////////////////////// - // ## - auto x_iter = ElementIterator(in_x->data(), rows_x, - cols, rows_x, cols); - auto y_iter = ElementIterator(in_y->data(), rows_y, - cols, rows_x, cols); - auto z_iter = ElementIterator(in_z->data(), rows_x, 1, - rows_x, cols); - auto dz_iter = ElementIterator(in_grad_z->data(), - rows_x, 1, rows_x, cols); - auto x_norm_iter = ElementIterator( - in_x_norm->data(), rows_x, 1, rows_x, cols); - auto y_norm_iter = ElementIterator( - in_y_norm->data(), rows_y, 1, rows_x, cols); - // ## - ////////////////////////////// - // compute dx - if (out_grad_x) { - out_grad_x->mutable_data(context.GetPlace()); - - ////////////////////////////// - // ## - Function_element(out_grad_x->data(), dz_iter, y_iter, x_norm_iter, - y_norm_iter, z_iter, x_iter, rows_x * cols, - rows_x * cols); - // ## - ////////////////////////////// - } - // compute dy - if (out_grad_y) { - out_grad_y->mutable_data(context.GetPlace()); - - ////////////////////////////// - // ## - Function_element(out_grad_y->data(), dz_iter, x_iter, y_norm_iter, - x_norm_iter, z_iter, y_iter, rows_x * cols, - rows_y * cols); - // ## - ////////////////////////////// + if (rows_x == rows_y) { + if (out_grad_x) { + CosSimGradFunctor functor( + in_x_norm->data(), in_y_norm->data(), in_x->data(), + in_y->data(), in_z->data(), 
in_grad_z->data(), + out_grad_x->mutable_data(context.GetPlace()), cols); + ForEachZip(in_x_norm->data(), in_x_norm->data() + rows_x, + in_y_norm->data(), functor); + } + if (out_grad_y) { + CosSimGradFunctor functor( + in_y_norm->data(), in_x_norm->data(), in_y->data(), + in_x->data(), in_z->data(), in_grad_z->data(), + out_grad_y->mutable_data(context.GetPlace()), cols); + ForEachZip(in_y_norm->data(), in_y_norm->data() + rows_x, + in_x_norm->data(), functor); + } + } else { + if (out_grad_x) { + CosSimDxFunctor functor( + in_x_norm->data(), in_y_norm->data(), in_x->data(), + in_y->data(), in_z->data(), in_grad_z->data(), + out_grad_x->mutable_data(context.GetPlace()), cols); + ForEachZip(in_x_norm->data(), in_x_norm->data() + rows_x, + in_y_norm->data(), functor); + } + if (out_grad_y) { + CosSimDyFunctor functor( + in_x_norm->data(), in_y_norm->data(), in_x->data(), + in_y->data(), in_z->data(), in_grad_z->data(), + out_grad_y->mutable_data(context.GetPlace()), cols); + ForEachZip(in_x_norm->data(), in_x_norm->data() + rows_x, + in_y_norm->data(), functor); + } } } }; diff --git a/paddle/operators/elementwise_op_function.h b/paddle/operators/elementwise_op_function.h index 33b7d06467..7ebfc7df8c 100644 --- a/paddle/operators/elementwise_op_function.h +++ b/paddle/operators/elementwise_op_function.h @@ -131,61 +131,6 @@ class MidWiseTransformIterator { int post_; }; -template -class ElementIterator; - -// Fixed(zcd) : Only support 2D -template -class ElementIterator { - public: - ElementIterator(const T* ptr, int t_m, int t_n, int m, int n) - : ptr_(ptr), - index_(0), - i_(0), - j_(0), - t_m_(t_m), - t_n_(t_n), - m_(m), - n_(n) {} - - ElementIterator& operator++() { - ++j_; - - if ((j_ == n_)) { - j_ = 0; - ++i_; - } - int t_i = (t_m_ == 1) ? 0 : i_; - int t_j = (t_n_ == 1) ? 
0 : j_; - index_ = t_i * t_n_ + t_j; - - return *this; - } - - bool operator==( - const ElementIterator& rhs) const { - return (ptr_ + index_) == &(*rhs); - } - - bool operator!=( - const ElementIterator& rhs) const { - return (ptr_ + index_) != &(*rhs); - } - - const T& operator*() { return ptr_[index_]; } - - private: - // t_m_ == m_ || t_n_ == n_ || (t_m_ == 1 && t_m_ == 1) - const T* ptr_; - int index_; - int i_; - int j_; - int64_t t_m_; - int64_t t_n_; - int64_t m_; - int64_t n_; -}; - #ifdef __NVCC__ template class RowwiseTransformIterator From 85b98070f79c4f08c0dd2a9df0d5cfb814395d90 Mon Sep 17 00:00:00 2001 From: fengjiayi Date: Mon, 25 Dec 2017 20:30:53 +0800 Subject: [PATCH 040/181] fix a bug of inplace --- python/paddle/v2/fluid/backward.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/python/paddle/v2/fluid/backward.py b/python/paddle/v2/fluid/backward.py index 0e3c8762fd..db1b8fa240 100644 --- a/python/paddle/v2/fluid/backward.py +++ b/python/paddle/v2/fluid/backward.py @@ -99,6 +99,9 @@ def _append_backward_ops_(target, attrs={}), idx)) var_inputs[var_name] = [var_name] for var_name in op_desc.output_arg_names(): + if var_name in op_desc.input_arg_names(): + # in place operator + continue if var_name == core.empty_var_name() or len(var_inputs[ var_name]) == 0: # it's the first time we get the variable @@ -221,6 +224,9 @@ def append_backward(loss, parameter_list=None, no_grad_set=None): if var.stop_gradient: block_no_grad_set.add(_append_grad_suffix_(var.name)) no_grad_set[block.idx] = block_no_grad_set + else: + # FIX ME + no_grad_set = {0: no_grad_set} grad_info_map = dict() root_block = program.block(0) From 49df2a784be8dabda85f82620ff4601ce113d332 Mon Sep 17 00:00:00 2001 From: chengduoZH Date: Mon, 25 Dec 2017 20:17:27 +0800 Subject: [PATCH 041/181] refine gradient function --- paddle/operators/cos_sim_op.h | 101 +++++++++++----------------------- 1 file changed, 33 insertions(+), 68 deletions(-) diff --git a/paddle/operators/cos_sim_op.h b/paddle/operators/cos_sim_op.h index e96592ab28..cd5c703c30 100644 --- a/paddle/operators/cos_sim_op.h +++ b/paddle/operators/cos_sim_op.h @@ -13,7 +13,6 @@ limitations under the License. */ #pragma once -#include "paddle/framework/eigen.h" #include "paddle/framework/op_registry.h" #include "paddle/operators/elementwise_op_function.h" @@ -21,16 +20,9 @@ namespace paddle { namespace operators { using Tensor = framework::Tensor; -template -using EigenMatrix = framework::EigenMatrix; -template -using EigenVector = framework::EigenVector; template static void ForEachZip(IT1 begin1, IT1 last1, IT2 begin2, Callback callback) { - // This method could be implemented in CUDA for (; begin1 < last1; ++begin1, ++begin2) { callback(*begin1, *begin2); } @@ -66,15 +58,8 @@ struct CosSimFunctor { x_norm_[x_offset] = xx; y_norm_[y_offset] = yy; z_[x_offset] = xy / (xx * yy); - } else { + } else { // This can be wrote in a better way. 
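+      // In this broadcast branch y_ holds a single row that is shared by
+      // every row of x_, so yy (the squared norm of y) is recomputed for
+      // each row of x_; computing y's norm once and reusing it across rows
+      // would be one way to write this better.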
auto* y = y_; - // if (yy == -1) { - // yy = 0; - // for (size_t i = 0; i < cols_; ++i) { - // yy += y[i] * y[i]; - // } - // y_norm[0] = sqrt(yy); - // } for (size_t i = 0; i < cols_; ++i) { xx += x[i] * x[i]; yy += y[i] * y[i]; // only need @@ -144,22 +129,25 @@ struct CosSimGradFunctor { dx_(dx), cols_(static_cast(cols)) {} - void operator()(const T& x_norm, const T& y_norm) const { + inline void operator()(const T& x_norm, const T& y_norm) const { size_t x_offset = &x_norm - x_norm_; size_t y_offset = &y_norm - y_norm_; auto x_norm_square = x_norm_[x_offset] * x_norm_[x_offset]; - // auto y_norm_square = y_norm_[y_offset] * y_norm_[y_offset]; auto xy_norm_prod = x_norm_[x_offset] * y_norm_[y_offset]; auto dz = dz_[x_offset]; + auto z = z_[x_offset]; auto* dx = dx_ + cols_ * x_offset; auto* x = x_ + cols_ * x_offset; + auto* y = y_ + cols_ * y_offset; - auto z = z_[x_offset]; + auto reciprocal_xy_norm_prod = 1 / xy_norm_prod; + auto reciprocal_x_norm_square = 1 / x_norm_square; for (size_t i = 0; i < cols_; ++i) { - dx[i] = dz * (y[i] / xy_norm_prod - z * x[i] / x_norm_square); + dx[i] = dz * (y[i] * reciprocal_xy_norm_prod - + z * x[i] * reciprocal_x_norm_square); } } @@ -173,10 +161,10 @@ struct CosSimGradFunctor { const size_t cols_; }; -template +template struct CosSimDxFunctor { CosSimDxFunctor(const T* x_norm, const T* y_norm, const T* x, const T* y, - const T* z, const T* dz, T* dx, int cols) + const T* z, const T* dz, T* dx, T* dy, int cols) : x_norm_(x_norm), y_norm_(y_norm), x_(x), @@ -184,58 +172,34 @@ struct CosSimDxFunctor { z_(z), dz_(dz), dx_(dx), - cols_(static_cast(cols)) {} - - void operator()(const T& x_norm, const T& y_norm) const { - size_t x_offset = &x_norm - x_norm_; - - auto x_norm_square = x_norm_[x_offset] * x_norm_[x_offset]; - auto xy_norm_prod = x_norm_[x_offset] * y_norm_[0]; - auto dz = dz_[x_offset]; - auto z = z_[x_offset]; - - auto* dx = dx_ + cols_ * x_offset; - auto* x = x_ + cols_ * x_offset; - - for (size_t i = 0; i < cols_; ++i) { - dx[i] = dz * (y_[i] / xy_norm_prod - z * x[i] / x_norm_square); - } - } - - const T* x_norm_; - const T* y_norm_; - const T* x_; - const T* y_; - const T* z_; - const T* dz_; - T* dx_; - const size_t cols_; -}; - -template -struct CosSimDyFunctor { - CosSimDyFunctor(const T* x_norm, const T* y_norm, const T* x, const T* y, - const T* z, const T* dz, T* dy, int cols) - : x_norm_(x_norm), - y_norm_(y_norm), - x_(x), - y_(y), - z_(z), - dz_(dz), dy_(dy), cols_(static_cast(cols)) {} - void operator()(const T& x_norm, const T& y_norm) const { + inline void operator()(const T& x_norm, const T& y_norm) const { size_t x_offset = &x_norm - x_norm_; - auto y_norm_square = y_norm_[0] * y_norm_[0]; auto xy_norm_prod = x_norm_[x_offset] * y_norm_[0]; auto dz = dz_[x_offset]; auto z = z_[x_offset]; auto* x = x_ + cols_ * x_offset; + auto reciprocal_xy_norm_prod = 1 / xy_norm_prod; - for (size_t i = 0; i < cols_; ++i) { - dy_[i] += dz * (x[i] / xy_norm_prod - z * y_[i] / y_norm_square); + if (Dx) { + auto x_norm_square = x_norm_[x_offset] * x_norm_[x_offset]; + auto* dx = dx_ + cols_ * x_offset; + auto* x = x_ + cols_ * x_offset; + auto reciprocal_x_norm_square = 1 / x_norm_square; + for (size_t i = 0; i < cols_; ++i) { + dx[i] = dz * (y_[i] * reciprocal_xy_norm_prod - + z * x[i] * reciprocal_x_norm_square); + } + } else { + auto y_norm_square = y_norm_[0] * y_norm_[0]; + auto reciprocal_y_norm_square = 1 / y_norm_square; + for (size_t i = 0; i < cols_; ++i) { + dy_[i] += dz * (x[i] * reciprocal_xy_norm_prod - + z * y_[i] * 
reciprocal_y_norm_square); + } } } @@ -245,6 +209,7 @@ struct CosSimDyFunctor { const T* y_; const T* z_; const T* dz_; + T* dx_; T* dy_; const size_t cols_; }; @@ -287,17 +252,17 @@ class CosSimGradKernel : public framework::OpKernel { } } else { if (out_grad_x) { - CosSimDxFunctor functor( + CosSimDxFunctor functor( in_x_norm->data(), in_y_norm->data(), in_x->data(), in_y->data(), in_z->data(), in_grad_z->data(), - out_grad_x->mutable_data(context.GetPlace()), cols); + out_grad_x->mutable_data(context.GetPlace()), nullptr, cols); ForEachZip(in_x_norm->data(), in_x_norm->data() + rows_x, in_y_norm->data(), functor); } if (out_grad_y) { - CosSimDyFunctor functor( + CosSimDxFunctor functor( in_x_norm->data(), in_y_norm->data(), in_x->data(), - in_y->data(), in_z->data(), in_grad_z->data(), + in_y->data(), in_z->data(), in_grad_z->data(), nullptr, out_grad_y->mutable_data(context.GetPlace()), cols); ForEachZip(in_x_norm->data(), in_x_norm->data() + rows_x, in_y_norm->data(), functor); From a1e1ae3ff71467b9ce2a0a34363674da55534f05 Mon Sep 17 00:00:00 2001 From: chengduoZH Date: Mon, 25 Dec 2017 23:11:49 +0800 Subject: [PATCH 042/181] refine drop_out_op --- paddle/operators/dropout_op.cc | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/paddle/operators/dropout_op.cc b/paddle/operators/dropout_op.cc index c4bee44e3e..82e2140808 100644 --- a/paddle/operators/dropout_op.cc +++ b/paddle/operators/dropout_op.cc @@ -25,8 +25,6 @@ class DropoutOp : public framework::OperatorWithKernel { void InferShape(framework::InferShapeContext* ctx) const override { PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) must not be null."); - PADDLE_ENFORCE_GE(ctx->Attrs().Get("dropout_prob"), 0); - PADDLE_ENFORCE_LE(ctx->Attrs().Get("dropout_prob"), 1); auto x_dims = ctx->GetInputDim("X"); ctx->SetOutputDim("Out", x_dims); @@ -47,7 +45,11 @@ class DropoutOpMaker : public framework::OpProtoAndCheckerMaker { AddOutput("Mask", "The random sampled dropout mask.").AsIntermediate(); AddAttr("dropout_prob", "Probability of setting units to zero.") - .SetDefault(.5f); + .SetDefault(.5f) + .AddCustomChecker([](const float& drop_p) { + PADDLE_ENFORCE(drop_p > 0.0f && drop_p < 1.0f, + "'dropout_prob' must be between 0 and 1."); + }); AddAttr("is_test", "True if in test phase.").SetDefault(false); AddAttr("seed", "Dropout random seed.").SetDefault(0); @@ -78,8 +80,6 @@ class DropoutOpGrad : public framework::OperatorWithKernel { PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Out")), "Input(Out@GRAD) must not be null."); - PADDLE_ENFORCE_GE(ctx->Attrs().Get("dropout_prob"), 0); - PADDLE_ENFORCE_LE(ctx->Attrs().Get("dropout_prob"), 1); auto x_dims = ctx->GetInputDim("X"); auto out_dims = ctx->GetInputDim(framework::GradVarName("Out")); PADDLE_ENFORCE_EQ(x_dims, out_dims, From 52119d62a7acb39f044bab8e1ea80d216c3cf647 Mon Sep 17 00:00:00 2001 From: chengduoZH Date: Mon, 25 Dec 2017 23:32:50 +0800 Subject: [PATCH 043/181] refine --- paddle/operators/dropout_op.cc | 4 ++-- paddle/operators/dropout_op.cu | 5 ++--- 2 files changed, 4 insertions(+), 5 deletions(-) diff --git a/paddle/operators/dropout_op.cc b/paddle/operators/dropout_op.cc index 82e2140808..fe72aa56ef 100644 --- a/paddle/operators/dropout_op.cc +++ b/paddle/operators/dropout_op.cc @@ -47,8 +47,8 @@ class DropoutOpMaker : public framework::OpProtoAndCheckerMaker { AddAttr("dropout_prob", "Probability of setting units to zero.") .SetDefault(.5f) .AddCustomChecker([](const float& drop_p) { - PADDLE_ENFORCE(drop_p > 0.0f && drop_p < 1.0f, - 
"'dropout_prob' must be between 0 and 1."); + PADDLE_ENFORCE(drop_p >= 0.0f && drop_p <= 1.0f, + "'dropout_prob' must be between 0.0 and 1.0."); }); AddAttr("is_test", "True if in test phase.").SetDefault(false); AddAttr("seed", "Dropout random seed.").SetDefault(0); diff --git a/paddle/operators/dropout_op.cu b/paddle/operators/dropout_op.cu index c31d2195e9..12e8e989e3 100644 --- a/paddle/operators/dropout_op.cu +++ b/paddle/operators/dropout_op.cu @@ -30,16 +30,15 @@ struct MaskGenerator { __host__ __device__ MaskGenerator(AttrType dropout_prob, int seed) : dropout_prob(dropout_prob), seed(seed) {} - __host__ __device__ T operator()(const unsigned int n) const { + inline __host__ __device__ T operator()(const unsigned int n) const { thrust::minstd_rand rng; rng.seed(seed); thrust::uniform_real_distribution dist(0, 1); rng.discard(n); if (dist(rng) < dropout_prob) { return static_cast(0); - } else { - return static_cast(1); } + return static_cast(1); } }; From 6812e4f43af0910948e6492b2e3f67ffb2b03ac7 Mon Sep 17 00:00:00 2001 From: fengjiayi Date: Tue, 26 Dec 2017 11:15:33 +0800 Subject: [PATCH 044/181] pass test_inference_model_io.py --- python/paddle/v2/fluid/backward.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/paddle/v2/fluid/backward.py b/python/paddle/v2/fluid/backward.py index db1b8fa240..2254652e8f 100644 --- a/python/paddle/v2/fluid/backward.py +++ b/python/paddle/v2/fluid/backward.py @@ -150,7 +150,7 @@ def _append_backward_ops_(target, if target_block.idx == 0: grad_target_name = _append_grad_suffix_(target.name) - target_block.desc.var(grad_target_name.encode("ascii")) + # target_block.desc.var(grad_target_name.encode("ascii")) grad_op_descs.insert( 0, _create_op_desc_( From 67e47e693cfb32dad0c1834f177c31ac7556438e Mon Sep 17 00:00:00 2001 From: chengduoZH Date: Tue, 26 Dec 2017 11:30:15 +0800 Subject: [PATCH 045/181] refine batch_norm --- paddle/operators/batch_norm_op.cc | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/paddle/operators/batch_norm_op.cc b/paddle/operators/batch_norm_op.cc index 49cb0fa4d9..98db28ddee 100644 --- a/paddle/operators/batch_norm_op.cc +++ b/paddle/operators/batch_norm_op.cc @@ -50,10 +50,6 @@ class BatchNormOp : public framework::OperatorWithKernel { PADDLE_ENFORCE(ctx->HasOutput("SavedMean"), ""); PADDLE_ENFORCE(ctx->HasOutput("SavedVariance"), ""); - const float epsilon = ctx->Attrs().Get("epsilon"); - PADDLE_ENFORCE_GE(epsilon, 0.0, "epsilon should be larger than 0"); - PADDLE_ENFORCE_LE(epsilon, 0.001, "epsilon should not be too large"); - // make sure Mean/MeanOut and Variance/VarianceOut share memory in Python PADDLE_ENFORCE_EQ(ctx->Inputs("Mean")[0], ctx->Outputs("MeanOut")[0], "Mean and MeanOut should share the same memory"); @@ -91,7 +87,12 @@ class BatchNormOpMaker : public framework::OpProtoAndCheckerMaker { : OpProtoAndCheckerMaker(proto, op_checker) { AddAttr("is_test", "").SetDefault(false); AddAttr("momentum", "").SetDefault(0.9); - AddAttr("epsilon", "").SetDefault(1e-5); + AddAttr("epsilon", "") + .SetDefault(1e-5) + .AddCustomChecker([](const float &epsilon) { + PADDLE_ENFORCE(epsilon >= 0.0f && epsilon <= 0.001f, + "'epsilon' should be between 0.0 and 0.001."); + }); AddAttr("data_layout", "").SetDefault("NCHW"); AddInput("X", "The input tensor"); AddInput("Scale", From 4450a312a9228d0237b794d05a75c6de71b3aa55 Mon Sep 17 00:00:00 2001 From: Yang Yu Date: Tue, 26 Dec 2017 13:13:01 +0800 Subject: [PATCH 046/181] Polish Unittest --- 
python/paddle/v2/fluid/tests/decorators.py | 27 +++++++ .../fluid/tests/test_dynrnn_gradient_check.py | 80 +++++++++---------- 2 files changed, 67 insertions(+), 40 deletions(-) create mode 100644 python/paddle/v2/fluid/tests/decorators.py diff --git a/python/paddle/v2/fluid/tests/decorators.py b/python/paddle/v2/fluid/tests/decorators.py new file mode 100644 index 0000000000..d3dcf3562d --- /dev/null +++ b/python/paddle/v2/fluid/tests/decorators.py @@ -0,0 +1,27 @@ +import paddle.v2.fluid as fluid + +__all__ = ['many_times', 'prog_scope'] + + +def many_times(times): + def __impl__(fn): + def __fn__(*args, **kwargs): + for _ in range(times): + fn(*args, **kwargs) + + return __fn__ + + return __impl__ + + +def prog_scope(): + def __impl__(fn): + def __fn__(*args, **kwargs): + prog = fluid.Program() + startup_prog = fluid.Program() + with fluid.program_guard(prog, startup_prog): + fn(*args, **kwargs) + + return __fn__ + + return __impl__ diff --git a/python/paddle/v2/fluid/tests/test_dynrnn_gradient_check.py b/python/paddle/v2/fluid/tests/test_dynrnn_gradient_check.py index 99b9285466..3018588c3a 100644 --- a/python/paddle/v2/fluid/tests/test_dynrnn_gradient_check.py +++ b/python/paddle/v2/fluid/tests/test_dynrnn_gradient_check.py @@ -3,7 +3,7 @@ import random import collections import paddle.v2.fluid as fluid import unittest -import copy +from decorators import * class Memory(object): @@ -78,7 +78,7 @@ class BaseRNN(object): self.outputs[oname] = Output() def step(self, **kwargs): - pass + raise NotImplementedError() def exe(self): retv = dict() @@ -141,18 +141,22 @@ class BaseRNN(object): feed_dict[pname] = self.params[pname] return feed_dict - def get_numeric_gradient_of_param(self, param_name, delta=0.01): + def get_numeric_gradient_of_param(self, param_name, delta=0.001): + if len(p.shape) != 2: + raise ValueError("Not support get numeric gradient of an parameter," + " which is not matrix") p = self.params[param_name] g = numpy.zeros(shape=p.shape, dtype=p.dtype) - for p_it, g_it in numpy.nditer([p, g], op_flags=['readwrite']): - o = float(p_it) - p_it[...] = o + delta - pos = self._exe_mean_out_() - p_it[...] = o - delta - neg = self._exe_mean_out_() - p_it[...] 
= o - g[:] = (pos - neg) / (delta * 2) + for i in xrange(p.shape[0]): + for j in xrange(p.shape[1]): + o = p[i][j] + p[i][j] += delta + pos = self._exe_mean_out_() + p[i][j] -= 2 * delta + neg = self._exe_mean_out_() + p[i][j] = o + g[i][j] = (pos - neg) / (delta * 2) return g def _exe_mean_out_(self): @@ -175,40 +179,36 @@ class SimpleMul(BaseRNN): class TestSimpleMul(unittest.TestCase): - def setUp(self): - self.python_impl = SimpleMul() - - def test_forward(self): - program = fluid.Program() - startup_program = fluid.Program() - with fluid.program_guard(program, startup_program): - dat = fluid.layers.data(name='X', shape=[32], lod_level=1) - - rnn = fluid.layers.DynamicRNN() - with rnn.block(): - d = rnn.step_input(dat) - o = fluid.layers.fc(input=d, - param_attr='W', - bias_attr=False, - size=10, - act=None) - rnn.output(o) - - out = rnn() - out = fluid.layers.sequence_pool(out, pool_type='last') - loss = fluid.layers.mean(x=out) - fluid.backward.append_backward_ops(loss) + # Test many times in local to ensure the random seed cannot breaks CI + # @many_times(10) + @prog_scope() + def test_forward_backward(self): + python_impl = SimpleMul() + dat = fluid.layers.data(name='X', shape=[32], lod_level=1) + + rnn = fluid.layers.DynamicRNN() + with rnn.block(): + d = rnn.step_input(dat) + o = fluid.layers.fc(input=d, + param_attr='W', + bias_attr=False, + size=10, + act=None) + rnn.output(o) + + out = rnn() + out = fluid.layers.sequence_pool(out, pool_type='last') + loss = fluid.layers.mean(x=out) + fluid.backward.append_backward_ops(loss) cpu = fluid.CPUPlace() exe = fluid.Executor(cpu) - out, w_g = exe.run(program, - feed=self.python_impl.to_feed(cpu), + out, w_g = exe.run(feed=python_impl.to_feed(cpu), fetch_list=[out, "W@GRAD"]) - out_by_python = self.python_impl.exe()['Out'] + out_by_python = python_impl.exe()['Out'] self.assertTrue(numpy.allclose(out, out_by_python)) - w_g_num = self.python_impl.get_numeric_gradient_of_param("W") - print w_g_num[0][0] - print w_g_num - w_g + w_g_num = python_impl.get_numeric_gradient_of_param("W") + self.assertTrue(numpy.allclose(w_g_num, w_g, rtol=0.05)) if __name__ == '__main__': From e566b94fba2a3f5c48629841cbace40af8464fa3 Mon Sep 17 00:00:00 2001 From: Yang Yu Date: Tue, 26 Dec 2017 13:14:27 +0800 Subject: [PATCH 047/181] Revert C++ changes --- paddle/operators/tensor_array_read_write_op.cc | 11 ----------- paddle/operators/while_op.cc | 15 +-------------- 2 files changed, 1 insertion(+), 25 deletions(-) diff --git a/paddle/operators/tensor_array_read_write_op.cc b/paddle/operators/tensor_array_read_write_op.cc index 59a4dac940..2ee9bf700c 100644 --- a/paddle/operators/tensor_array_read_write_op.cc +++ b/paddle/operators/tensor_array_read_write_op.cc @@ -136,17 +136,6 @@ class ReadFromArrayOp : public ArrayOp { auto &dev_ctx = *pool.Borrow(place); framework::CopyFrom(x_array[offset], place, dev_ctx, out_tensor); out_tensor->set_lod(x_array[offset].lod()); - if (Input("X") == "dynamic_rnn_0_output_array_fc_0.tmp_0_0@GRAD") { - VLOG(10) << "Offset = " << offset; - if (x_array[offset].numel() != 0) { - auto d = x_array[offset].dims(); - std::ostringstream sout; - for (int64_t i = 0; i < d[0]; ++i) { - sout << x_array[offset].data()[0 * d[1]] << ", "; - } - VLOG(10) << "Grad = " << sout.str(); - } - } } else { VLOG(10) << "offset " << offset << " >= " << x_array.size(); } diff --git a/paddle/operators/while_op.cc b/paddle/operators/while_op.cc index d7c34297cd..11ee96faad 100644 --- a/paddle/operators/while_op.cc +++ b/paddle/operators/while_op.cc @@ 
-129,9 +129,6 @@ class WhileGradOp : public framework::OperatorBase { auto &og_inside = detail::Ref(cur_scope.Var(inside_og_name), "Cannot find inside gradient %s", inside_og_name); - - VLOG(10) << "OG " << outside_og_name << " Type is " - << og_outside.Type().name(); if (og_outside.Type().hash_code() == typeid(framework::LoDTensor).hash_code()) { auto &outside_tensor = og_outside.Get(); @@ -148,6 +145,7 @@ class WhileGradOp : public framework::OperatorBase { inside_array.resize(outside_array.size()); for (size_t j = 0; j < inside_array.size(); ++j) { + VLOG(10) << j << " " << outside_array[j].numel(); if (outside_array[j].numel() != 0) { inside_array[j].set_lod(outside_array[j].lod()); inside_array[j].ShareDataWith(outside_array[j]); @@ -200,17 +198,6 @@ class WhileGradOp : public framework::OperatorBase { auto sum_op = framework::OpRegistry::CreateOp( "sum", {{"X", {pg_names[param_id], new_inside_name}}}, {{"Out", {pg_names[param_id]}}}, framework::AttributeMap{}); - - VLOG(10) << "Accumulate the gradient of " << pg_names[param_id]; - - if (pg_names[param_id] == "W@GRAD") { - auto &w_g = detail::Ref(cur_scope.FindVar(new_inside_name)) - .Get(); - VLOG(10) << "W_G is" << w_g.data()[0]; - } else { - VLOG(10) << pg_names[param_id]; - } - sum_op->Run(cur_scope, dev_place); cur_scope.Rename(new_inside_name, inside_grad_name); } From 2bfa9796bf4615e0898b33b7b97bb3ca0db013d5 Mon Sep 17 00:00:00 2001 From: Yang Yu Date: Tue, 26 Dec 2017 13:19:57 +0800 Subject: [PATCH 048/181] Fix check --- python/paddle/v2/fluid/tests/test_dynrnn_gradient_check.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/paddle/v2/fluid/tests/test_dynrnn_gradient_check.py b/python/paddle/v2/fluid/tests/test_dynrnn_gradient_check.py index 3018588c3a..d0b805882f 100644 --- a/python/paddle/v2/fluid/tests/test_dynrnn_gradient_check.py +++ b/python/paddle/v2/fluid/tests/test_dynrnn_gradient_check.py @@ -142,10 +142,10 @@ class BaseRNN(object): return feed_dict def get_numeric_gradient_of_param(self, param_name, delta=0.001): + p = self.params[param_name] if len(p.shape) != 2: raise ValueError("Not support get numeric gradient of an parameter," " which is not matrix") - p = self.params[param_name] g = numpy.zeros(shape=p.shape, dtype=p.dtype) for i in xrange(p.shape[0]): From 32313994ba0091676616435db7b8d3487d4cb41b Mon Sep 17 00:00:00 2001 From: Yang Yu Date: Tue, 26 Dec 2017 13:33:42 +0800 Subject: [PATCH 049/181] Add forward test with mem --- .../fluid/tests/test_dynrnn_gradient_check.py | 62 +++++++++++++++++++ 1 file changed, 62 insertions(+) diff --git a/python/paddle/v2/fluid/tests/test_dynrnn_gradient_check.py b/python/paddle/v2/fluid/tests/test_dynrnn_gradient_check.py index d0b805882f..ef7d5ca9f5 100644 --- a/python/paddle/v2/fluid/tests/test_dynrnn_gradient_check.py +++ b/python/paddle/v2/fluid/tests/test_dynrnn_gradient_check.py @@ -211,5 +211,67 @@ class TestSimpleMul(unittest.TestCase): self.assertTrue(numpy.allclose(w_g_num, w_g, rtol=0.05)) +class TestSimpleMulWithMemory(unittest.TestCase): + DATA_WIDTH = 32 + HIDDEN_WIDTH = 10 + DATA_NAME = 'X' + PARAM_NAME = 'W' + + class SimpleMulWithMemory(BaseRNN): + def __init__(self): + super(TestSimpleMulWithMemory.SimpleMulWithMemory, self).__init__({ + TestSimpleMulWithMemory.DATA_NAME: { + 'shape': [TestSimpleMulWithMemory.DATA_WIDTH] + } + }, {'Mem': { + 'shape': [TestSimpleMulWithMemory.HIDDEN_WIDTH] + }}, { + TestSimpleMulWithMemory.PARAM_NAME: { + 'shape': [ + TestSimpleMulWithMemory.DATA_WIDTH, + TestSimpleMulWithMemory.HIDDEN_WIDTH + ] + 
} + }, ['Out']) + + def step(self, X, Mem, W, Out): + o = numpy.matmul(X, W) + assert isinstance(Mem, Memory) + o += Mem.ex + Mem.update(o) + assert isinstance(Out, Output) + Out.out(o) + + @prog_scope() + def test_forward_backward(self): + py_rnn = TestSimpleMulWithMemory.SimpleMulWithMemory() + + data = fluid.layers.data( + name=self.DATA_NAME, shape=[self.DATA_WIDTH], lod_level=1) + rnn = fluid.layers.DynamicRNN() + with rnn.block(): + d = rnn.step_input(data) + mem = rnn.memory(value=0.0, shape=[self.HIDDEN_WIDTH]) + hidden = fluid.layers.fc(input=d, + size=self.HIDDEN_WIDTH, + param_attr=self.PARAM_NAME, + bias_attr=False, + act=None) + o = fluid.layers.elementwise_add(x=hidden, y=mem) + rnn.update_memory(mem, o) + rnn.output(o) + + out = rnn() + last = fluid.layers.sequence_pool(input=out, pool_type='last') + + cpu = fluid.CPUPlace() + exe = fluid.Executor(cpu) + + last_np, = exe.run(feed=py_rnn.to_feed(cpu), fetch_list=[last]) + last_by_py, = py_rnn.exe().values() + + self.assertTrue(numpy.allclose(last_np, last_by_py)) + + if __name__ == '__main__': unittest.main() From 2a36e8ad76e624de8a051bbe1af2b7e7691c3280 Mon Sep 17 00:00:00 2001 From: Yang Yu Date: Tue, 26 Dec 2017 13:38:32 +0800 Subject: [PATCH 050/181] Make as const name --- .../fluid/tests/test_dynrnn_gradient_check.py | 47 +++++++++++-------- 1 file changed, 28 insertions(+), 19 deletions(-) diff --git a/python/paddle/v2/fluid/tests/test_dynrnn_gradient_check.py b/python/paddle/v2/fluid/tests/test_dynrnn_gradient_check.py index ef7d5ca9f5..837666b76e 100644 --- a/python/paddle/v2/fluid/tests/test_dynrnn_gradient_check.py +++ b/python/paddle/v2/fluid/tests/test_dynrnn_gradient_check.py @@ -164,35 +164,44 @@ class BaseRNN(object): return numpy.array([o.mean() for o in outs.itervalues()]).mean() -class SimpleMul(BaseRNN): - def __init__(self): - super(SimpleMul, self).__init__({ - 'X': { - 'shape': [32] - } - }, {}, {'W': { - 'shape': [32, 10] - }}, ['Out']) +class TestSimpleMul(unittest.TestCase): + DATA_NAME = 'X' + DATA_WIDTH = 32 + PARAM_NAME = 'W' + HIDDEN_WIDTH = 10 + OUT_NAME = 'Out' - def step(self, X, W, Out): - Out.out(numpy.matmul(X, W)) + class SimpleMul(BaseRNN): + def __init__(self): + base = TestSimpleMul + super(base.SimpleMul, self).__init__({ + base.DATA_NAME: { + 'shape': [base.DATA_WIDTH] + } + }, {}, { + base.PARAM_NAME: { + 'shape': [base.DATA_WIDTH, base.HIDDEN_WIDTH] + } + }, [base.OUT_NAME]) + def step(self, X, W, Out): + Out.out(numpy.matmul(X, W)) -class TestSimpleMul(unittest.TestCase): # Test many times in local to ensure the random seed cannot breaks CI # @many_times(10) @prog_scope() def test_forward_backward(self): - python_impl = SimpleMul() - dat = fluid.layers.data(name='X', shape=[32], lod_level=1) + python_impl = TestSimpleMul.SimpleMul() + dat = fluid.layers.data( + name=self.DATA_NAME, shape=[self.DATA_WIDTH], lod_level=1) rnn = fluid.layers.DynamicRNN() with rnn.block(): d = rnn.step_input(dat) o = fluid.layers.fc(input=d, - param_attr='W', + param_attr=self.PARAM_NAME, bias_attr=False, - size=10, + size=self.HIDDEN_WIDTH, act=None) rnn.output(o) @@ -204,10 +213,10 @@ class TestSimpleMul(unittest.TestCase): cpu = fluid.CPUPlace() exe = fluid.Executor(cpu) out, w_g = exe.run(feed=python_impl.to_feed(cpu), - fetch_list=[out, "W@GRAD"]) - out_by_python = python_impl.exe()['Out'] + fetch_list=[out, self.PARAM_NAME + "@GRAD"]) + out_by_python = python_impl.exe()[self.OUT_NAME] self.assertTrue(numpy.allclose(out, out_by_python)) - w_g_num = python_impl.get_numeric_gradient_of_param("W") + 
w_g_num = python_impl.get_numeric_gradient_of_param(self.PARAM_NAME) self.assertTrue(numpy.allclose(w_g_num, w_g, rtol=0.05)) From 938717ba2b34eb87d25eb451cec5f328c0977148 Mon Sep 17 00:00:00 2001 From: Yang Yu Date: Tue, 26 Dec 2017 14:37:26 +0800 Subject: [PATCH 051/181] Stash --- paddle/framework/executor.cc | 8 ++ .../fluid/tests/test_dynrnn_gradient_check.py | 79 +++++++++++++++++-- 2 files changed, 79 insertions(+), 8 deletions(-) diff --git a/paddle/framework/executor.cc b/paddle/framework/executor.cc index 997773c168..a07e8e0b1b 100644 --- a/paddle/framework/executor.cc +++ b/paddle/framework/executor.cc @@ -66,6 +66,14 @@ void Executor::Run(const ProgramDesc& pdesc, Scope* scope, int block_id, PADDLE_ENFORCE_LT(static_cast(block_id), pdesc.Size()); auto& block = pdesc.Block(block_id); + if (VLOG_IS_ON(100)) { + std::ostringstream sout; + for (auto& name : scope->GetAllNames(false)) { + sout << name << ", "; + } + VLOG(100) << "Scope has variable " << sout.str(); + } + Scope* local_scope = scope; if (create_vars) { if (create_local_scope) { diff --git a/python/paddle/v2/fluid/tests/test_dynrnn_gradient_check.py b/python/paddle/v2/fluid/tests/test_dynrnn_gradient_check.py index 837666b76e..22bb2b1cdf 100644 --- a/python/paddle/v2/fluid/tests/test_dynrnn_gradient_check.py +++ b/python/paddle/v2/fluid/tests/test_dynrnn_gradient_check.py @@ -159,6 +159,39 @@ class BaseRNN(object): g[i][j] = (pos - neg) / (delta * 2) return g + def get_numeric_gradient_of_input(self, + input_name, + delta=0.001, + return_one_tensor=True): + ipt = self.inputs[input_name] + grad = [] + + for seq in ipt: + seq_grad = [] + for item in seq: + item_grad = numpy.zeros(shape=item.shape, dtype=item.dtype) + if len(item.shape) != 1: + raise ValueError("Not support") + + for i in xrange(len(item)): + o = item[i] + item[i] += delta + pos = self._exe_mean_out_() + item[i] -= 2 * delta + neg = self._exe_mean_out_() + item[i] = o + item_grad[i] = (pos - neg) / (delta * 2) + seq_grad.append(item_grad) + grad.append(seq_grad) + + if not return_one_tensor: + return grad + + for i in xrange(len(grad)): + grad[i] = numpy.concatenate(grad[i]) + grad = numpy.concatenate(grad) + return grad + def _exe_mean_out_(self): outs = self.exe() return numpy.array([o.mean() for o in outs.itervalues()]).mean() @@ -191,9 +224,10 @@ class TestSimpleMul(unittest.TestCase): # @many_times(10) @prog_scope() def test_forward_backward(self): - python_impl = TestSimpleMul.SimpleMul() + py_rnn = TestSimpleMul.SimpleMul() dat = fluid.layers.data( name=self.DATA_NAME, shape=[self.DATA_WIDTH], lod_level=1) + dat.stop_gradient = False rnn = fluid.layers.DynamicRNN() with rnn.block(): @@ -212,17 +246,26 @@ class TestSimpleMul(unittest.TestCase): cpu = fluid.CPUPlace() exe = fluid.Executor(cpu) - out, w_g = exe.run(feed=python_impl.to_feed(cpu), - fetch_list=[out, self.PARAM_NAME + "@GRAD"]) - out_by_python = python_impl.exe()[self.OUT_NAME] + out, w_g, i_g = map(numpy.array, + exe.run(feed=py_rnn.to_feed(cpu), + fetch_list=[ + out, self.PARAM_NAME + "@GRAD", + self.DATA_NAME + "@GRAD" + ], + return_numpy=False)) + out_by_python = py_rnn.exe()[self.OUT_NAME] self.assertTrue(numpy.allclose(out, out_by_python)) - w_g_num = python_impl.get_numeric_gradient_of_param(self.PARAM_NAME) + w_g_num = py_rnn.get_numeric_gradient_of_param(self.PARAM_NAME) self.assertTrue(numpy.allclose(w_g_num, w_g, rtol=0.05)) + i_g_num = py_rnn.get_numeric_gradient_of_input( + input_name=self.DATA_NAME) + i_g_num = i_g_num.reshape(i_g.shape) + 
self.assertTrue(numpy.allclose(i_g_num, i_g, rtol=0.05)) class TestSimpleMulWithMemory(unittest.TestCase): DATA_WIDTH = 32 - HIDDEN_WIDTH = 10 + HIDDEN_WIDTH = 20 DATA_NAME = 'X' PARAM_NAME = 'W' @@ -251,12 +294,14 @@ class TestSimpleMulWithMemory(unittest.TestCase): assert isinstance(Out, Output) Out.out(o) + # @many_times(10) @prog_scope() def test_forward_backward(self): py_rnn = TestSimpleMulWithMemory.SimpleMulWithMemory() data = fluid.layers.data( name=self.DATA_NAME, shape=[self.DATA_WIDTH], lod_level=1) + data.stop_gradient = False rnn = fluid.layers.DynamicRNN() with rnn.block(): d = rnn.step_input(data) @@ -272,14 +317,32 @@ class TestSimpleMulWithMemory(unittest.TestCase): out = rnn() last = fluid.layers.sequence_pool(input=out, pool_type='last') + loss = fluid.layers.mean(x=last) + fluid.backward.append_backward_ops(loss) cpu = fluid.CPUPlace() exe = fluid.Executor(cpu) - - last_np, = exe.run(feed=py_rnn.to_feed(cpu), fetch_list=[last]) + feed = py_rnn.to_feed(cpu) + for _ in xrange(2): + last_np, w_g, i_g = map(numpy.array, + exe.run(feed=feed, + fetch_list=[ + last, self.PARAM_NAME + "@GRAD", + self.DATA_NAME + "@GRAD" + ], + return_numpy=False)) last_by_py, = py_rnn.exe().values() self.assertTrue(numpy.allclose(last_np, last_by_py)) + w_g_num = py_rnn.get_numeric_gradient_of_param(self.PARAM_NAME) + print w_g[0], w_g_num[0] + self.assertTrue(numpy.allclose(w_g_num, w_g, rtol=0.1)) + i_g_num = py_rnn.get_numeric_gradient_of_input(self.DATA_NAME) + i_g_num = i_g_num.reshape(i_g.shape) + + # Since this RNN has many float add. The number could be not stable. + # rtol = 0.1 + self.assertTrue(numpy.allclose(i_g_num, i_g, rtol=0.1)) if __name__ == '__main__': From 6cc4bd536f1c9862bca6e3104cab4b3daf843e1e Mon Sep 17 00:00:00 2001 From: typhoonzero Date: Tue, 26 Dec 2017 14:37:47 +0800 Subject: [PATCH 052/181] wip --- paddle/operators/adam_op.h | 120 ++++++++++++++++-- python/paddle/v2/fluid/tests/test_adam_op.py | 125 +++++++++++++++++++ 2 files changed, 232 insertions(+), 13 deletions(-) diff --git a/paddle/operators/adam_op.h b/paddle/operators/adam_op.h index c4e2c8bb88..aa58c4f990 100644 --- a/paddle/operators/adam_op.h +++ b/paddle/operators/adam_op.h @@ -79,6 +79,71 @@ struct AdamFunctor { } }; +template +struct SparseAdamFunctor { + T beta1_; + T beta2_; + T epsilon_; + + const T* beta1_pow_; + const T* beta2_pow_; + const T* moment1_; + T* moment1_out_; + const T* moment2_; + T* moment2_out_; + const T* lr_; + const T* grad_; + const T* param_; + T* param_out_; + + const int64_t* rows_; + int64_t row_numel_; + int64_t height_; + + SparseAdamFunctor(T beta1, T beta2, T epsilon, const T* beta1_pow, + const T* beta2_pow, const T* mom1, T* mom1_out, + const T* mom2, T* mom2_out, const T* lr, const T* grad, + const T* param, T* param_out, const int64_t* rows, + int64_t row_numel, int64_t height) + : beta1_(beta1), + beta2_(beta2), + epsilon_(epsilon), + beta1_pow_(beta1_pow), + beta2_pow_(beta2_pow), + moment1_(mom1), + moment1_out_(mom1_out), + moment2_(mom2), + moment2_out_(mom2_out), + lr_(lr), + grad_(grad), + param_(param), + param_out_(param_out), + rows_(rows), + row_numel_(row_numel), + height_(height) {} + + inline HOSTDEVICE void operator()(size_t i) const { + for (int64_t j = 0; j < row_numel_; ++j) { + T g = grad_[i * row_numel_ + j]; + T mom1 = moment1_[rows_[i] * row_numel_ + j]; + T mom2 = moment2_[rows_[i] * row_numel_ + j]; + T lr = *lr_; + T beta1_pow = *beta1_pow_; + T beta2_pow = *beta2_pow_; + T p = param_[rows_[i] * row_numel_ + j]; + + lr *= sqrt(1 - 
beta2_pow) / (1 - beta1_pow); + mom1 = beta1_ * mom1 + (1 - beta1_) * g; + mom2 = beta2_ * mom2 + (1 - beta2_) * g * g; + p -= lr * (mom1 / (sqrt(mom2) + epsilon_)); + // FIXME(typhoonzero): row id may be duplicate + moment1_out_[rows_[i] * row_numel_ + j] = mom1; + moment2_out_[rows_[i] * row_numel_ + j] = mom2; + param_out_[rows_[i] * row_numel_ + j] = p; + } // for col id + } +}; + template class AdamOpKernel : public framework::OpKernel { public: @@ -90,7 +155,8 @@ class AdamOpKernel : public framework::OpKernel { T beta2 = static_cast(ctx.Attr("beta2")); T epsilon = static_cast(ctx.Attr("epsilon")); auto& param = Ref(ctx.Input("Param"), "Must set Param"); - auto& grad = Ref(ctx.Input("Grad"), "Must set Grad"); + // auto& grad = Ref(ctx.Input("Grad"), "Must set Grad"); + auto* grad_var = ctx.InputVar("Grad"); auto& mom1 = Ref(ctx.Input("Moment1"), "Must set Moment1"); auto& mom2 = Ref(ctx.Input("Moment2"), "Must set Moment2"); auto& lr = @@ -108,18 +174,46 @@ class AdamOpKernel : public framework::OpKernel { auto& mom2_out = Ref(ctx.Output("Moment2Out"), "Must set Moment1Out"); - AdamFunctor functor(beta1, beta2, epsilon, beta1_pow.template data(), - beta2_pow.template data(), - mom1.template data(), - mom1_out.template mutable_data(ctx.GetPlace()), - mom2.template data(), - mom2_out.template mutable_data(ctx.GetPlace()), - lr.template data(), grad.template data(), - param.template data(), - param_out.template mutable_data(ctx.GetPlace())); - platform::ForRange for_range( - static_cast(ctx.device_context()), param.numel()); - for_range(functor); + if (grad_var->IsType()) { + auto& grad = Ref(ctx.Input("Grad"), "Must set Grad"); + AdamFunctor functor( + beta1, beta2, epsilon, beta1_pow.template data(), + beta2_pow.template data(), mom1.template data(), + mom1_out.template mutable_data(ctx.GetPlace()), + mom2.template data(), + mom2_out.template mutable_data(ctx.GetPlace()), + lr.template data(), grad.template data(), + param.template data(), + param_out.template mutable_data(ctx.GetPlace())); + platform::ForRange for_range( + static_cast(ctx.device_context()), + param.numel()); + for_range(functor); + } else if (grad_var->IsType()) { + auto& grad = + Ref(ctx.Input("Grad"), "Must set Grad"); + auto& grad_tensor = grad.value(); + const T* grad_data = grad_tensor.template data(); + auto* rows = grad.rows().data(); + auto height = grad.height(); + auto row_numel = grad_tensor.numel() / height; + + SparseAdamFunctor functor( + beta1, beta2, epsilon, beta1_pow.template data(), + beta2_pow.template data(), mom1.template data(), + mom1_out.template mutable_data(ctx.GetPlace()), + mom2.template data(), + mom2_out.template mutable_data(ctx.GetPlace()), + lr.template data(), grad_data, param.template data(), + param_out.template mutable_data(ctx.GetPlace()), rows, row_numel, + height); + platform::ForRange for_range( + static_cast(ctx.device_context()), + grad.rows().size()); + for_range(functor); + } else { + PADDLE_THROW("Variable type not supported by adam_op"); + } } }; diff --git a/python/paddle/v2/fluid/tests/test_adam_op.py b/python/paddle/v2/fluid/tests/test_adam_op.py index a0d6655d4c..a66fd33102 100644 --- a/python/paddle/v2/fluid/tests/test_adam_op.py +++ b/python/paddle/v2/fluid/tests/test_adam_op.py @@ -176,5 +176,130 @@ def adam_step(inputs, attributes): return param_out, moment1_out, moment2_out +def adam_step_sparse(inputs, attributes, height, rows, row_numel, np_grad): + ''' + Simulate one step of the adam optimizer + :param inputs: dict of inputs + :param attributes: dict of 
attributes + :return tuple: tuple of output param, moment1, moment2, + beta1 power accumulator and beta2 power accumulator + ''' + param = inputs['Param'] + # grad = inputs['Grad'] + moment1 = inputs['Moment1'] + moment2 = inputs['Moment2'] + lr = inputs['LearningRate'] + beta1_pow = inputs['Beta1Pow'] + beta2_pow = inputs['Beta2Pow'] + + beta1 = attributes['beta1'] + beta2 = attributes['beta2'] + epsilon = attributes['epsilon'] + + moment1_out = np.array([height, row_numel]) + moment2_out = np.array([height, row_numel]) + param_out = np.array([height, row_numel]) + + for idx, row_id in enumerate(rows): + moment1_out[row_id] = beta1 * moment1[row_id] + (1 - beta1 + ) * np_grad[idx] + moment2_out[row_id] = beta2 * moment2[row_id] + ( + 1 - beta2) * np.square(np_grad[idx]) + lr_t = lr * np.sqrt(1 - beta2_pow) / (1 - beta1_pow) + param_out[row_id] = param[row_id] - lr_t * (moment1_out / ( + np.sqrt(moment2_out) + epsilon)) + return param_out, moment1_out, moment2_out + + +class TestSparseAdamOp(unittest.TestCase): + def setup(self, scope, place): + beta1 = 0.78 + beta2 = 0.836 + epsilon = 1e-4 + + height = 10 + rows = [0, 4, 7] + row_numel = 12 + self.dense_inputs = { + "Param": np.full((height, row_numel), 5.0).astype("float32"), + "Moment1": np.full((height, row_numel), 5.0).astype("float32"), + "Moment2": np.full((height, row_numel), 5.0).astype("float32"), + 'Beta1Pow': np.array([0.9**10]).astype("float32"), + 'Beta2Pow': np.array([0.999**10]).astype("float32"), + "LearningRate": np.full((1), 2.0).astype("float32") + } + self.attrs = {'epsilon': epsilon, 'beta1': beta1, 'beta2': beta2} + + grad_selected_rows = scope.var('Grad').get_selected_rows() + grad_selected_rows.set_height(height) + grad_selected_rows.set_rows(rows) + np_array = np.ones((len(rows), row_numel)).astype("float32") + np_array[0, 0] = 2.0 + np_array[2, 8] = 4.0 + + grad_tensor = grad_selected_rows.get_tensor() + grad_tensor.set(np_array, place) + + self.sparse_inputs = ["Grad"] + + param_out, mom1, mom2 = adam_step_sparse( + self.dense_inputs, self.attrs, height, rows, row_numel, np_array) + self.outputs = { + "Param": param_out, + "Moment1Out": mom1, + "Moment2Out": mom2 + } + + def check_with_place(self, place): + scope = core.Scope() + self.setup(scope, place) + + op_args = dict() + for key, np_array in self.dense_inputs.iteritems(): + var = scope.var(key).get_tensor() + var.set(np_array, place) + op_args[key] = key + for s in self.sparse_inputs: + op_args[s] = s + for k in self.attrs: + op_args[k] = self.attrs[k] + + # create and run sgd operator + sgd_op = Operator("adam", **op_args) + sgd_op.run(scope, place) + + for key, np_array in self.outputs.iteritems(): + out_var = scope.var(key).get_tensor() + actual = np.array(out_var) + actual.reshape([actual.size()]) + np_array.reshape([np_array.size()]) + i = 0 + while i < actual.size(): + self.assertAlmostEqual(actual[i], np_array[i]) + i += 1 + + # # rows[0] = 0, 5.0 - 2.0 * 2.0 + # self.assertAlmostEqual(1.0, result_array[rows[0], 0]) + # # rows[0] = 0, 5.0 - 2.0 * 1.0 + # self.assertAlmostEqual(3.0, result_array[rows[0], 2]) + # # 5.0 - 2.0 * 0.0 + # self.assertAlmostEqual(5.0, result_array[1, 0]) + # # rows[1] = 4, 5.0 - 2.0 * 1.0 + # self.assertAlmostEqual(3.0, result_array[rows[1], 10]) + # # 5.0 - 2.0 * 0.0 + # self.assertAlmostEqual(5.0, result_array[5, 8]) + # # rows[2] = 7, 5.0 - 2.0 * 1.0 + # self.assertAlmostEqual(3.0, result_array[rows[2], 1]) + # # rows[2] = 7, 5.0 - 2.0 * 4.0 + # self.assertAlmostEqual(-3.0, result_array[rows[2], 8]) + + def 
test_sparse_sgd(self):
+        places = [core.CPUPlace()]
+        if core.is_compile_gpu():
+            places.append(core.CUDAPlace(0))
+        for place in places:
+            self.check_with_place(place)
+
+
 if __name__ == "__main__":
     unittest.main()

From b05c253cacd2d984e786a566fcef3725efeb2497 Mon Sep 17 00:00:00 2001
From: fengjiayi
Date: Tue, 26 Dec 2017 14:42:06 +0800
Subject: [PATCH 053/181] fix an error

---
 python/paddle/v2/fluid/backward.py | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/python/paddle/v2/fluid/backward.py b/python/paddle/v2/fluid/backward.py
index 2254652e8f..9bc3e73f59 100644
--- a/python/paddle/v2/fluid/backward.py
+++ b/python/paddle/v2/fluid/backward.py
@@ -53,7 +53,8 @@ def _is_all_in_set_(cands, s):
 
 
 def _strip_grad_suffix_(name):
-    return name[:name.find(core.grad_var_suffix())]
+    pos = name.find(core.grad_var_suffix())
+    return name[:pos] if pos != -1 else name
 
 
 def _append_grad_suffix_(name):
@@ -139,7 +140,7 @@ def _append_backward_ops_(target,
     to_insert = []
     for idx, op_desc in enumerate(grad_op_descs):
         for arg in op_desc.input_arg_names():
-            if arg in no_grad_set[block.idx]:
+            if core.grad_var_suffix() in arg and arg in no_grad_set[block.idx]:
                 to_insert.append((arg, idx))
     for ele in reversed(to_insert):
         arg = ele[0]

From dbf1d75f57c465696c82c618d593c4470e6d44ea Mon Sep 17 00:00:00 2001
From: hedaoyuan
Date: Tue, 26 Dec 2017 15:27:42 +0800
Subject: [PATCH 054/181] Add a GemmConvMobileFunction.

---
 paddle/function/GemmConvOp.cpp | 152 +++++++++++++++++++++++++++++++++
 1 file changed, 152 insertions(+)

diff --git a/paddle/function/GemmConvOp.cpp b/paddle/function/GemmConvOp.cpp
index de7b70e271..08eb6a5490 100644
--- a/paddle/function/GemmConvOp.cpp
+++ b/paddle/function/GemmConvOp.cpp
@@ -134,6 +134,154 @@ public:
   }
 };
 
+/*
+ * \brief Forward calculation of convolution, optimized for mobile.
+ */
+template
+class GemmConvMobileFunction : public ConvFunctionBase {
+public:
+  void init(const FuncConfig& config) override {
+    ConvFunctionBase::init(config);
+  }
+
+  void check(const BufferArgs& inputs, const BufferArgs& outputs) override {
+    const TensorShape& input = inputs[0].shape();
+    const TensorShape& filter = inputs[1].shape();
+    const TensorShape& output = outputs[0].shape();
+    checkShape(input, filter, output);
+  }
+
+  void calc(const BufferArgs& inputs, const BufferArgs& outputs) override {
+    CHECK_EQ(numInputs_, inputs.size());
+    CHECK_EQ(numOutputs_, outputs.size());
+    check(inputs, outputs);
+    // TODO(hedaoyuan): Need to define some index macros,
+    // to avoid using 0 and 1.
+    const TensorShape& input = inputs[0].shape();
+    const TensorShape& filter = inputs[1].shape();
+    const TensorShape& output = outputs[0].shape();
+
+    real beta;
+    if (outputs[0].getArgType() == ADD_TO) {
+      beta = 1.0;
+    } else {
+      beta = 0.0;
+    }
+
+    size_t batchSize = input[0];
+    size_t inputChannels = input[1];
+    size_t inputHeight = input[2];
+    size_t inputWidth = input[3];
+    size_t filterHeight = getFilterHeight(filter);
+    size_t filterWidth = getFilterWidth(filter);
+    size_t outputChannels = output[1];
+    size_t outputHeight = output[2];
+    size_t outputWidth = output[3];
+
+    real* inputData = inputs[0].data();
+    real* filterData = inputs[1].data();
+    real* outputData = outputs[0].data();
+    bool needIm2col = isNeedIm2col(filter);
+
+    TensorShape imShape =
+        TensorShape({inputChannels / groups_, inputHeight, inputWidth});
+
+    TensorShape colShape;
+    real* colData = NULL;
+
+    size_t colHeight = inputChannels / groups_ * filterHeight * filterWidth;
+    size_t colWidth = outputHeight * outputWidth;
+    // Max col matrix height 256, Max col matrix width 2048
+    size_t stepColHeight = std::min(colHeight, (size_t)256);
+    size_t stepColWidth = std::min(colWidth, (size_t)2048);
+
+    if (needIm2col) {
+      colShape = TensorShape({inputChannels / groups_,
+                              filterHeight,
+                              filterWidth,
+                              outputHeight,
+                              outputWidth});
+
+      resizeBuffer(stepColHeight * stepColWidth * sizeof(real));
+      colData = reinterpret_cast(memory_->getBuf());
+    }
+
+    Im2ColFunctor im2col;
+    GemmFunctor gemm;
+    size_t inputOffset = imShape.getElements();
+    size_t outputOffset =
+        (outputChannels / groups_) * outputHeight * outputWidth;
+    size_t filterOffset = filter.getElements() / groups_;
+
+    int nStride = colWidth;
+    int kStride = colHeight;
+    for (size_t i = 0; i < batchSize; i++) {
+      for (size_t g = 0; g < groups_; g++) {
+        if (needIm2col) {
+          real beta_ = beta;
+          for (size_t colHeightStart = 0; colHeightStart < colHeight;
+               colHeightStart += stepColHeight) {
+            for (size_t colWidthStart = 0; colWidthStart < colWidth;
+                 colWidthStart += stepColWidth) {
+              int N = std::min(colWidth - colWidthStart, stepColWidth);
+              int K = std::min(colHeight - colHeightStart, stepColHeight);
+              // im2col
+              im2col(inputData + g * inputOffset,
+                     imShape,
+                     colData,
+                     colShape,
+                     strideH(),
+                     strideW(),
+                     paddingH(),
+                     paddingW(),
+                     colHeightStart,
+                     K,
+                     colWidthStart,
+                     N);
+
+              // gemm
+              int M = outputChannels / groups_;
+              gemm(CblasNoTrans,
+                   CblasNoTrans,
+                   M,
+                   N,
+                   K,
+                   1.0f,
+                   filterData + g * filterOffset + colHeightStart,
+                   kStride,
+                   colData,
+                   N,
+                   beta_,
+                   outputData + g * outputOffset + colWidthStart,
+                   nStride);
+            }
+            beta_ = 1.0;
+          }
+        } else {
+          int M = outputChannels / groups_;
+          int N = outputHeight * outputWidth;
+          int K = inputChannels / groups_ * filterHeight * filterWidth;
+          gemm(CblasNoTrans,
+               CblasNoTrans,
+               M,
+               N,
+               K,
+               1.0f,
+               filterData + g * filterOffset,
+               K,
+               inputData + g * inputOffset,
+               N,
+               beta,
+               outputData + g * outputOffset,
+               N);
+        }
+      }
+      inputData += inputChannels * inputHeight * inputWidth;
+      outputData += outputChannels * outputHeight * outputWidth;
+    }
+  }
+};
+
 /*
  * \brief Backward input calculation of convolution.
*/ @@ -348,7 +496,11 @@ public: } }; +#ifdef PADDLE_MOBILE_INFERENCE +REGISTER_TYPED_FUNC(GemmConv, CPU, GemmConvMobileFunction); +#else REGISTER_TYPED_FUNC(GemmConv, CPU, GemmConvFunction); +#endif REGISTER_TYPED_FUNC(GemmConvGradInput, CPU, GemmConvGradInputFunction); REGISTER_TYPED_FUNC(GemmConvGradFilter, CPU, GemmConvGradFilterFunction); #ifdef PADDLE_WITH_CUDA From d775895e939eb9e4ce4378e349a76d56bd4af72d Mon Sep 17 00:00:00 2001 From: hedaoyuan Date: Tue, 26 Dec 2017 15:43:30 +0800 Subject: [PATCH 055/181] Add Im2ColMobileFunctor. --- paddle/function/GemmConvOp.cpp | 56 +++++++++++++++++----------------- paddle/function/Im2Col.h | 48 +++++++++++++++++++++++++++++ 2 files changed, 76 insertions(+), 28 deletions(-) diff --git a/paddle/function/GemmConvOp.cpp b/paddle/function/GemmConvOp.cpp index 08eb6a5490..75a5b4fe84 100644 --- a/paddle/function/GemmConvOp.cpp +++ b/paddle/function/GemmConvOp.cpp @@ -206,8 +206,7 @@ public: colData = reinterpret_cast(memory_->getBuf()); } - Im2ColFunctor im2col; - GemmFunctor gemm; + Im2ColMobileFunctor im2col; size_t inputOffset = imShape.getElements(); size_t outputOffset = (outputChannels / groups_) * outputHeight * outputWidth; @@ -241,19 +240,20 @@ public: // gemm int M = outputChannels / groups_; - gemm(CblasNoTrans, - CblasNoTrans, - M, - N, - K, - 1.0f, - filterData + g * filterOffset + colHeightStart, - kStride, - colData, - N, - beta_, - outputData + g * outputOffset + colWidthStart, - nStride); + BlasGemm::compute( + false, + false, + M, + N, + K, + 1.0f, + filterData + g * filterOffset + colHeightStart, + kStride, + colData, + N, + beta_, + outputData + g * outputOffset + colWidthStart, + nStride); } beta_ = 1.0; } @@ -261,19 +261,19 @@ public: int M = outputChannels / groups_; int N = outputHeight * outputWidth; int K = inputChannels / groups_ * filterHeight * filterWidth; - gemm(CblasNoTrans, - CblasNoTrans, - M, - N, - K, - 1.0f, - filterData + g * filterOffset, - K, - inputData + g * inputOffset, - N, - beta, - outputData + g * outputOffset, - N); + BlasGemm::compute(false, + false, + M, + N, + K, + 1.0f, + filterData + g * filterOffset, + K, + inputData + g * inputOffset, + N, + beta, + outputData + g * outputOffset, + N); } } inputData += inputChannels * inputHeight * inputWidth; diff --git a/paddle/function/Im2Col.h b/paddle/function/Im2Col.h index 0c37fc9724..f43ca465a2 100644 --- a/paddle/function/Im2Col.h +++ b/paddle/function/Im2Col.h @@ -98,4 +98,52 @@ public: int dilationWidth = 1); }; +template +class Im2ColMobileFunctor { +public: + void operator()(const T* imData, + const TensorShape& imShape, + T* colData, + const TensorShape& colShape, + int strideHeight, + int strideWidth, + int paddingHeight, + int paddingWidth, + int colHeightStart, + int colHeightSize, + int colWidthStart, + int colWidthSize) { + int inputHeight = imShape[1]; + int inputWidth = imShape[2]; + int filterHeight = colShape[1]; + int filterWidth = colShape[2]; + int outputWidth = colShape[4]; + + for (int colh = 0; colh < colHeightSize; colh++) { + int wOffset = (colHeightStart + colh) % filterWidth; + int hOffset = ((colHeightStart + colh) / filterWidth) % filterHeight; + int c_im = (colHeightStart + colh) / filterWidth / filterHeight; + + for (int colw = 0; colw < colWidthSize; colw++) { + int h = (colWidthStart + colw) / outputWidth; + int w = (colWidthStart + colw) % outputWidth; + + int imRowIdx = h * strideHeight + hOffset; + int imColIdx = w * strideWidth + wOffset; + if ((imRowIdx - paddingHeight) < 0 || + (imRowIdx - paddingHeight) >= 
inputHeight || + (imColIdx - paddingWidth) < 0 || + (imColIdx - paddingWidth) >= inputWidth) { + colData[colh * colWidthSize + colw] = T(0); + } else { + imRowIdx += c_im * inputHeight - paddingHeight; + imColIdx -= paddingWidth; + colData[colh * colWidthSize + colw] = + imData[imRowIdx * inputWidth + imColIdx]; + } + } + } + } +}; + } // namespace paddle From 19547943bac716d73354fcdb33c6d909b65308b3 Mon Sep 17 00:00:00 2001 From: hedaoyuan Date: Tue, 26 Dec 2017 15:59:11 +0800 Subject: [PATCH 056/181] Add test for Im2ColMobileFunctor. --- paddle/function/Im2ColTest.cpp | 80 ++++++++++++++++++++++++++++++++++ 1 file changed, 80 insertions(+) diff --git a/paddle/function/Im2ColTest.cpp b/paddle/function/Im2ColTest.cpp index 1f085538d8..0dc58696f7 100644 --- a/paddle/function/Im2ColTest.cpp +++ b/paddle/function/Im2ColTest.cpp @@ -138,4 +138,84 @@ TEST(Im2ColFunctor, GPU) { TestIm2ColFunctor(); } #endif +template +void TestIm2ColMobileFunctor() { + for (size_t channels : {1, 5, 32}) { + for (size_t inputHeight : {5, 33, 100}) { + for (size_t inputWidth : {5, 32, 96}) { + for (size_t filterHeight : {1, 5}) { + for (size_t filterWidth : {3, 7}) { + for (size_t stride : {1, 2}) { + for (size_t padding : {0, 1}) { + for (size_t dilation : {1 /*, 3*/}) { + size_t filterSizeH = (filterHeight - 1) * dilation + 1; + size_t filterSizeW = (filterWidth - 1) * dilation + 1; + if (inputHeight + 2 * padding < filterSizeH || + inputWidth + 2 * padding < filterSizeW) + break; + if (padding >= filterSizeH || padding >= filterSizeW) break; + size_t outputHeight = + (inputHeight - filterSizeH + 2 * padding) / stride + 1; + size_t outputWidth = + (inputWidth - filterSizeW + 2 * padding) / stride + 1; + + TensorShape imShape = + TensorShape({channels, inputHeight, inputWidth}); + TensorShape colShape1 = TensorShape({channels, + filterHeight, + filterWidth, + outputHeight, + outputWidth}); + + size_t height = channels * filterHeight * filterWidth; + size_t width = outputHeight * outputWidth; + VectorPtr input1 = + Vector::create(imShape.getElements(), false); + VectorPtr input2 = + Vector::create(imShape.getElements(), false); + MatrixPtr output1 = + Matrix::create(height, width, false, false); + MatrixPtr output2 = + Matrix::create(height, width, false, false); + input1->uniform(0.001, 1); + input2->copyFrom(*input1); + + Im2ColFunctor im2Col1; + Im2ColMobileFunctor im2Col2; + im2Col1(input1->getData(), + imShape, + output1->getData(), + colShape1, + stride, + stride, + padding, + padding, + dilation, + dilation); + im2Col2(input2->getData(), + imShape, + output2->getData(), + colShape1, + stride, + stride, + padding, + padding, + 0, + height, + 0, + width); + + autotest::TensorCheckEqual(*output1, *output2); + } + } + } + } + } + } + } + } +} + +TEST(Im2ColFunctor, Mobile) { TestIm2ColMobileFunctor(); } + } // namespace paddle From a850dec991d7d6d28f2669a959b3198a7a796ce9 Mon Sep 17 00:00:00 2001 From: hedaoyuan Date: Tue, 26 Dec 2017 16:07:09 +0800 Subject: [PATCH 057/181] Add dilation. 
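
Dilation spreads the filter taps `dilation` input pixels apart, so the effective kernel extent grows to (filterSize - 1) * dilation + 1, which is what the updated test computes for filterSizeH/filterSizeW. A small Python sketch of the source-index mapping the functor applies after this change (the helper name is illustrative, not part of the patch):

    def im2col_source(h, w, h_off, w_off, stride, dilation, padding):
        # Input coordinates read for output position (h, w) and filter
        # offset (h_off, w_off); reads outside the input are zero-filled.
        row = h * stride + h_off * dilation - padding
        col = w * stride + w_off * dilation - padding
        return row, col
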
--- paddle/function/GemmConvOp.cpp | 2 ++ paddle/function/Im2Col.h | 6 ++++-- paddle/function/Im2ColTest.cpp | 4 +++- 3 files changed, 9 insertions(+), 3 deletions(-) diff --git a/paddle/function/GemmConvOp.cpp b/paddle/function/GemmConvOp.cpp index 75a5b4fe84..acf1415ebf 100644 --- a/paddle/function/GemmConvOp.cpp +++ b/paddle/function/GemmConvOp.cpp @@ -233,6 +233,8 @@ public: strideW(), paddingH(), paddingW(), + dilationH(), + dilationW(), colHeightStart, K, colWidthStart, diff --git a/paddle/function/Im2Col.h b/paddle/function/Im2Col.h index f43ca465a2..1053e4fd23 100644 --- a/paddle/function/Im2Col.h +++ b/paddle/function/Im2Col.h @@ -109,6 +109,8 @@ public: int strideWidth, int paddingHeight, int paddingWidth, + int dilationHeight, + int dilationWidth, int colHeightStart, int colHeightSize, int colWidthStart, @@ -128,8 +130,8 @@ public: int h = (colWidthStart + colw) / outputWidth; int w = (colWidthStart + colw) % outputWidth; - int imRowIdx = h * strideHeight + hOffset; - int imColIdx = w * strideWidth + wOffset; + int imRowIdx = h * strideHeight + hOffset * dilationHeight; + int imColIdx = w * strideWidth + wOffset * dilationWidth; if ((imRowIdx - paddingHeight) < 0 || (imRowIdx - paddingHeight) >= inputHeight || (imColIdx - paddingWidth) < 0 || diff --git a/paddle/function/Im2ColTest.cpp b/paddle/function/Im2ColTest.cpp index 0dc58696f7..c573469168 100644 --- a/paddle/function/Im2ColTest.cpp +++ b/paddle/function/Im2ColTest.cpp @@ -147,7 +147,7 @@ void TestIm2ColMobileFunctor() { for (size_t filterWidth : {3, 7}) { for (size_t stride : {1, 2}) { for (size_t padding : {0, 1}) { - for (size_t dilation : {1 /*, 3*/}) { + for (size_t dilation : {1, 3}) { size_t filterSizeH = (filterHeight - 1) * dilation + 1; size_t filterSizeW = (filterWidth - 1) * dilation + 1; if (inputHeight + 2 * padding < filterSizeH || @@ -200,6 +200,8 @@ void TestIm2ColMobileFunctor() { stride, padding, padding, + dilation, + dilation, 0, height, 0, From fc7d0ca1cd64cca9109756e693145ceea9c60c3b Mon Sep 17 00:00:00 2001 From: fengjiayi Date: Tue, 26 Dec 2017 16:09:01 +0800 Subject: [PATCH 058/181] fix errors --- python/paddle/v2/fluid/backward.py | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/python/paddle/v2/fluid/backward.py b/python/paddle/v2/fluid/backward.py index 9bc3e73f59..e05750c5bd 100644 --- a/python/paddle/v2/fluid/backward.py +++ b/python/paddle/v2/fluid/backward.py @@ -158,11 +158,9 @@ def _append_backward_ops_(target, op_type="fill_constant", inputs={}, outputs={"Out": [grad_target_name]}, - attrs={ - "shape": [1], - "value": 1.0, - "dtype": core.DataType.FP32 - })) + attrs={"shape": [1], + "value": 1.0, + "dtype": target.dtype})) for op_desc in grad_op_descs: new_op_desc = target_block.desc.append_op() new_op_desc.copy_from(op_desc) From f453b7137f8ed5a10ff47901401a796338d6e504 Mon Sep 17 00:00:00 2001 From: hedaoyuan Date: Tue, 26 Dec 2017 16:10:15 +0800 Subject: [PATCH 059/181] Refine code. 
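
This change keeps the mobile-only convolution behind PADDLE_MOBILE_INFERENCE rather than freeing the im2col buffer after each call. The point of that path is that it never materializes the full K x N im2col matrix: it walks stepColHeight x stepColWidth tiles and accumulates partial GEMMs. A rough numpy model of the blocking (the tile caps mirror the 256/2048 limits in the C++ code; `col` is taken fully materialized here only for clarity, whereas the real code builds each tile on the fly with Im2ColMobileFunctor):

    import numpy

    def tiled_conv_gemm(filt, col, tile_k=256, tile_n=2048):
        # filt: (M, K) filter matrix; col: (K, N) im2col matrix.
        M, K = filt.shape
        _, N = col.shape
        out = numpy.zeros((M, N), dtype=filt.dtype)
        for k0 in range(0, K, tile_k):  # colHeightStart loop
            k1 = min(k0 + tile_k, K)
            for n0 in range(0, N, tile_n):  # colWidthStart loop
                n1 = min(n0 + tile_n, N)
                out[:, n0:n1] += filt[:, k0:k1].dot(col[k0:k1, n0:n1])
        return out

Each output tile is visited once per K-tile, so only the first visit may overwrite (beta = 0) or add into existing output (beta = 1 for ADD_TO); all later visits accumulate, which is what the beta_ = 1.0 flip in calc() implements.
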
--- paddle/function/GemmConvOp.cpp | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/paddle/function/GemmConvOp.cpp b/paddle/function/GemmConvOp.cpp index acf1415ebf..25cc3df667 100644 --- a/paddle/function/GemmConvOp.cpp +++ b/paddle/function/GemmConvOp.cpp @@ -126,14 +126,11 @@ public: inputData += inputChannels * inputHeight * inputWidth; outputData += outputChannels * outputHeight * outputWidth; } -#ifdef PADDLE_MOBILE_INFERENCE - if (Device == DEVICE_TYPE_CPU) { - memory_.reset(); - } -#endif } }; +#ifdef PADDLE_MOBILE_INFERENCE + /* * \brief Forward calculation of convolution, optimized for mobile. */ @@ -284,6 +281,8 @@ public: } }; +#endif + /* * \brief Backward input calculation of convolution. */ From 82a22d3258b7024e64cd4045c5bbf32aa99f070f Mon Sep 17 00:00:00 2001 From: Yang Yu Date: Tue, 26 Dec 2017 17:06:23 +0800 Subject: [PATCH 060/181] Update code --- paddle/framework/executor.cc | 8 ------ paddle/framework/tensor_impl.h | 8 ++++++ paddle/operators/sum_op.h | 2 ++ python/paddle/v2/fluid/executor.py | 25 ++++++++++++++++--- .../tests/book/test_label_semantic_roles.py | 2 +- python/paddle/v2/fluid/tests/decorators.py | 6 +++-- .../fluid/tests/test_dynrnn_gradient_check.py | 20 +++++++-------- 7 files changed, 46 insertions(+), 25 deletions(-) diff --git a/paddle/framework/executor.cc b/paddle/framework/executor.cc index a07e8e0b1b..997773c168 100644 --- a/paddle/framework/executor.cc +++ b/paddle/framework/executor.cc @@ -66,14 +66,6 @@ void Executor::Run(const ProgramDesc& pdesc, Scope* scope, int block_id, PADDLE_ENFORCE_LT(static_cast(block_id), pdesc.Size()); auto& block = pdesc.Block(block_id); - if (VLOG_IS_ON(100)) { - std::ostringstream sout; - for (auto& name : scope->GetAllNames(false)) { - sout << name << ", "; - } - VLOG(100) << "Scope has variable " << sout.str(); - } - Scope* local_scope = scope; if (create_vars) { if (create_local_scope) { diff --git a/paddle/framework/tensor_impl.h b/paddle/framework/tensor_impl.h index 6c6f298edc..46ea3b881d 100644 --- a/paddle/framework/tensor_impl.h +++ b/paddle/framework/tensor_impl.h @@ -134,6 +134,14 @@ inline void* Tensor::mutable_data(platform::Place place, std::type_index type) { #endif offset_ = 0; } + + if (typeid(float).hash_code() == type.hash_code()) { + auto buf = reinterpret_cast( + reinterpret_cast(holder_->ptr()) + offset_); + for (int64_t i = 0; i < this->numel(); ++i) { + buf[i] = NAN; + } + } return reinterpret_cast(reinterpret_cast(holder_->ptr()) + offset_); } diff --git a/paddle/operators/sum_op.h b/paddle/operators/sum_op.h index eaa36aa1ae..cbde9976dc 100644 --- a/paddle/operators/sum_op.h +++ b/paddle/operators/sum_op.h @@ -107,10 +107,12 @@ class SumKernel : public framework::OpKernel { out_array.resize(i + 1); } if (out_array[i].numel() == 0) { + VLOG(10) << context.op().Output("Out") << " just copy"; framework::CopyFrom(in_array[i], in_array[i].place(), context.device_context(), &out_array[i]); out_array[i].set_lod(in_array[i].lod()); } else { + VLOG(10) << context.op().Output("Out") << " merged"; PADDLE_ENFORCE(out_array[i].lod() == in_array[i].lod()); auto in = EigenVector::Flatten(in_array[i]); auto result = EigenVector::Flatten(out_array[i]); diff --git a/python/paddle/v2/fluid/executor.py b/python/paddle/v2/fluid/executor.py index 2c91afb363..1d6c594b41 100644 --- a/python/paddle/v2/fluid/executor.py +++ b/python/paddle/v2/fluid/executor.py @@ -1,12 +1,31 @@ import numpy as np +import contextlib +from framework import Program, default_main_program from . 
import core -from framework import Program, default_main_program, Parameter, Variable -__all__ = ['Executor', 'g_scope'] +__all__ = ['Executor', 'global_scope', 'scope_guard', 'switch_scope'] g_scope = core.Scope() +def global_scope(): + return g_scope + + +def switch_scope(scope): + global g_scope + ex = g_scope + g_scope = scope + return ex + + +@contextlib.contextmanager +def scope_guard(scope): + ex = switch_scope(scope) + yield + switch_scope(ex) + + def as_numpy(tensor): if isinstance(tensor, list): return [as_numpy(t) for t in tensor] @@ -117,7 +136,7 @@ class Executor(object): raise TypeError() if scope is None: - scope = g_scope + scope = global_scope() program = program.clone() global_block = program.global_block() diff --git a/python/paddle/v2/fluid/tests/book/test_label_semantic_roles.py b/python/paddle/v2/fluid/tests/book/test_label_semantic_roles.py index c3591a613a..8acd470c5e 100644 --- a/python/paddle/v2/fluid/tests/book/test_label_semantic_roles.py +++ b/python/paddle/v2/fluid/tests/book/test_label_semantic_roles.py @@ -170,7 +170,7 @@ def main(): exe.run(fluid.default_startup_program()) - embedding_param = fluid.g_scope.find_var(embedding_name).get_tensor() + embedding_param = fluid.global_scope().find_var(embedding_name).get_tensor() embedding_param.set( load_parameter(conll05.get_embedding(), word_dict_len, word_dim), place) diff --git a/python/paddle/v2/fluid/tests/decorators.py b/python/paddle/v2/fluid/tests/decorators.py index d3dcf3562d..154619b0e9 100644 --- a/python/paddle/v2/fluid/tests/decorators.py +++ b/python/paddle/v2/fluid/tests/decorators.py @@ -19,8 +19,10 @@ def prog_scope(): def __fn__(*args, **kwargs): prog = fluid.Program() startup_prog = fluid.Program() - with fluid.program_guard(prog, startup_prog): - fn(*args, **kwargs) + scope = fluid.core.Scope() + with fluid.scope_guard(scope): + with fluid.program_guard(prog, startup_prog): + fn(*args, **kwargs) return __fn__ diff --git a/python/paddle/v2/fluid/tests/test_dynrnn_gradient_check.py b/python/paddle/v2/fluid/tests/test_dynrnn_gradient_check.py index 22bb2b1cdf..7f61b966fd 100644 --- a/python/paddle/v2/fluid/tests/test_dynrnn_gradient_check.py +++ b/python/paddle/v2/fluid/tests/test_dynrnn_gradient_check.py @@ -298,7 +298,6 @@ class TestSimpleMulWithMemory(unittest.TestCase): @prog_scope() def test_forward_backward(self): py_rnn = TestSimpleMulWithMemory.SimpleMulWithMemory() - data = fluid.layers.data( name=self.DATA_NAME, shape=[self.DATA_WIDTH], lod_level=1) data.stop_gradient = False @@ -323,19 +322,18 @@ class TestSimpleMulWithMemory(unittest.TestCase): cpu = fluid.CPUPlace() exe = fluid.Executor(cpu) feed = py_rnn.to_feed(cpu) - for _ in xrange(2): - last_np, w_g, i_g = map(numpy.array, - exe.run(feed=feed, - fetch_list=[ - last, self.PARAM_NAME + "@GRAD", - self.DATA_NAME + "@GRAD" - ], - return_numpy=False)) + last_np, w_g, i_g = map(numpy.array, + exe.run(feed=feed, + fetch_list=[ + last, self.PARAM_NAME + "@GRAD", + self.DATA_NAME + "@GRAD" + ], + return_numpy=False)) last_by_py, = py_rnn.exe().values() - self.assertTrue(numpy.allclose(last_np, last_by_py)) w_g_num = py_rnn.get_numeric_gradient_of_param(self.PARAM_NAME) - print w_g[0], w_g_num[0] + # print w_g_num[0], w_g[0] + self.assertTrue(numpy.allclose(w_g_num, w_g, rtol=0.1)) i_g_num = py_rnn.get_numeric_gradient_of_input(self.DATA_NAME) i_g_num = i_g_num.reshape(i_g.shape) From 8728885031be996588520373cf3eec8fab0efee3 Mon Sep 17 00:00:00 2001 From: Yang Yu Date: Tue, 26 Dec 2017 17:09:57 +0800 Subject: [PATCH 061/181] Revert debug 
code --- paddle/framework/tensor_impl.h | 8 -------- paddle/operators/sum_op.h | 2 -- 2 files changed, 10 deletions(-) diff --git a/paddle/framework/tensor_impl.h b/paddle/framework/tensor_impl.h index 46ea3b881d..6c6f298edc 100644 --- a/paddle/framework/tensor_impl.h +++ b/paddle/framework/tensor_impl.h @@ -134,14 +134,6 @@ inline void* Tensor::mutable_data(platform::Place place, std::type_index type) { #endif offset_ = 0; } - - if (typeid(float).hash_code() == type.hash_code()) { - auto buf = reinterpret_cast( - reinterpret_cast(holder_->ptr()) + offset_); - for (int64_t i = 0; i < this->numel(); ++i) { - buf[i] = NAN; - } - } return reinterpret_cast(reinterpret_cast(holder_->ptr()) + offset_); } diff --git a/paddle/operators/sum_op.h b/paddle/operators/sum_op.h index cbde9976dc..eaa36aa1ae 100644 --- a/paddle/operators/sum_op.h +++ b/paddle/operators/sum_op.h @@ -107,12 +107,10 @@ class SumKernel : public framework::OpKernel { out_array.resize(i + 1); } if (out_array[i].numel() == 0) { - VLOG(10) << context.op().Output("Out") << " just copy"; framework::CopyFrom(in_array[i], in_array[i].place(), context.device_context(), &out_array[i]); out_array[i].set_lod(in_array[i].lod()); } else { - VLOG(10) << context.op().Output("Out") << " merged"; PADDLE_ENFORCE(out_array[i].lod() == in_array[i].lod()); auto in = EigenVector::Flatten(in_array[i]); auto result = EigenVector::Flatten(out_array[i]); From 5b9dbbb9831229b71ce4ddcb94264622b8b398c2 Mon Sep 17 00:00:00 2001 From: fengjiayi Date: Tue, 26 Dec 2017 21:10:47 +0800 Subject: [PATCH 062/181] code clean --- python/paddle/v2/fluid/backward.py | 181 +++++++++++++++-------------- 1 file changed, 92 insertions(+), 89 deletions(-) diff --git a/python/paddle/v2/fluid/backward.py b/python/paddle/v2/fluid/backward.py index e05750c5bd..b90949838e 100644 --- a/python/paddle/v2/fluid/backward.py +++ b/python/paddle/v2/fluid/backward.py @@ -1,7 +1,6 @@ from paddle.v2.fluid import framework as framework from . import core import collections -import pdb __all__ = ['append_backward'] @@ -45,7 +44,7 @@ def _infer_var_data_type_(var_name, block): grad_var.set_dtype(core.DataType.FP32) -def _is_all_in_set_(cands, s): +def _all_in_set_(cands, s): for c in cands: if not c in s: return False @@ -61,112 +60,114 @@ def _append_grad_suffix_(name): return name + core.grad_var_suffix() -def _append_backward_ops_(target, - block, - target_block, - no_grad_set, - callback=None): - grad_op_descs = [] - grad_to_var = dict() - program = block.program - for each_op in reversed(block.ops): - grad_sub_block_list = [] - if each_op.has_attr("sub_block"): - sub_block_idx = each_op.block_attr("sub_block") - sub_block = program.block(sub_block_idx) - grad_sub_block = program.create_block(parent_idx=sub_block_idx) - sub_grad_to_var = _append_backward_ops_( - target, sub_block, grad_sub_block, no_grad_set, callback) - grad_to_var = dict(grad_to_var, **sub_grad_to_var) - grad_sub_block_list.append(grad_sub_block.desc) - grad_op_desc, op_grad_to_var = core.get_grad_op_desc( - each_op.desc, no_grad_set[block.idx], grad_sub_block_list) - grad_op_descs.append(grad_op_desc) - grad_to_var = dict(grad_to_var, **op_grad_to_var) - # grad_op_descs = [[op1_g1, op1_g2], [op2_g], ...] - # flatten grad_op_descs - grad_op_descs = [op for sublist in grad_op_descs for op in sublist] # ????? - +def _addup_repetitive_outputs_(op_descs): + # In backward part, an variable my be the output of more than one ops. + # In this case, the variable should be the accumulation of all the outputs. 
+ # We adopt adding `sum_op`s to implement the accumulate. pending_sum_ops = [] var_rename_count = collections.defaultdict(int) - var_inputs = collections.defaultdict(list) - for idx, op_desc in enumerate(grad_op_descs): + renamed_vars = collections.defaultdict(list) + for idx, op_desc in enumerate(op_descs): for var_name in op_desc.input_arg_names(): - if len(var_inputs[var_name]) > 1: - pending_sum_ops.append((_create_op_desc_( - op_type="sum", - inputs={"X": var_inputs[var_name]}, - outputs={"Out": [var_name]}, - attrs={}), idx)) - var_inputs[var_name] = [var_name] + if len(renamed_vars[var_name]) > 1: + pending_sum_ops.append( + (_create_op_desc_("sum", {"X": renamed_vars[var_name]}, + {"Out": [var_name]}, {}), idx)) + renamed_vars[var_name] = [var_name] for var_name in op_desc.output_arg_names(): - if var_name in op_desc.input_arg_names(): - # in place operator + if var_name == core.empty_var_name( + ) or var_name in op_desc.input_arg_names(): + # empty variable or inplace op continue - if var_name == core.empty_var_name() or len(var_inputs[ - var_name]) == 0: + if len(renamed_vars[var_name]) == 0: # it's the first time we get the variable - var_inputs[var_name] = [var_name] + renamed_vars[var_name] = [var_name] else: - if len(var_inputs[var_name]) == 1: + if len(renamed_vars[var_name]) == 1: new_name = var_name + "@RENAME@" + \ str(var_rename_count[var_name]) - var_rename_count[var_name] = var_rename_count[var_name] + 1 + var_rename_count[var_name] += 1 # rename original var_name - var_inputs[var_name][0] = new_name - _rename_arg_(grad_op_descs, var_name, new_name, 0, idx) + renamed_vars[var_name][0] = new_name + _rename_arg_(op_descs, var_name, new_name, 0, idx) _rename_arg_(pending_sum_ops, var_name, new_name) new_name = var_name + "@RENAME@" + \ str(var_rename_count[var_name]) - var_rename_count[var_name] = var_rename_count[var_name] + 1 + var_rename_count[var_name] += 1 op_desc.rename_output(var_name, new_name) - var_inputs[var_name].append(new_name) - for var_name, inputs in var_inputs.iteritems(): + renamed_vars[var_name].append(new_name) + for var_name, inputs in renamed_vars.iteritems(): if len(inputs) > 1: pending_sum_ops.append((_create_op_desc_( - op_type="sum", - inputs={"X": inputs}, - outputs={"Out": [var_name]}, - attrs={}), len(grad_op_descs))) + "sum", {"X": inputs}, {"Out": [var_name]}, {}), len(op_descs))) # sum_op descs are sorted according to their insert position for p in reversed(pending_sum_ops): - grad_op_descs.insert(p[1], p[0]) - # Remove ops whose outputs are all in no_grad_set - grad_op_descs = filter( - lambda op_desc: not _is_all_in_set_(op_desc.output_arg_names(), no_grad_set[block.idx]), - grad_op_descs) + op_descs.insert(p[1], p[0]) + + return op_descs + + +def _remove_no_grad_branch_(op_descs, no_grad_set): + # Remove ops whose outputs are all in no_grad_dict + op_descs = filter( + lambda op_desc: not _all_in_set_(op_desc.output_arg_names(), no_grad_set), + op_descs) # Insert fill_zeros_like_op to_insert = [] - for idx, op_desc in enumerate(grad_op_descs): + for idx, op_desc in enumerate(op_descs): for arg in op_desc.input_arg_names(): - if core.grad_var_suffix() in arg and arg in no_grad_set[block.idx]: - to_insert.append((arg, idx)) - for ele in reversed(to_insert): - arg = ele[0] - fill_zeros_like_op = _create_op_desc_( - "fill_zeros_like", {"X": [_strip_grad_suffix_(arg)]}, {"Y": [arg]}, - {}) - grad_op_descs.insert(ele[1], fill_zeros_like_op) + if core.grad_var_suffix() in arg and arg in no_grad_set: + 
to_insert.append((_create_op_desc_("fill_zeros_like", { + "X": [_strip_grad_suffix_(arg)] + }, {"Y": [arg]}, {}), idx)) + + map(lambda p: op_descs.insert(p[1], p[0]), reversed(to_insert)) + + return op_descs + + +def _append_backward_ops_(target, + block, + target_block, + no_grad_dict, + grad_to_var, + callback=None): + grad_op_descs = [] + program = block.program + for op in reversed(block.ops): + grad_sub_block_list = [] + # If the op has its own sub-block, deal with the sub-block first + if op.has_attr("sub_block"): + sub_block = program.block(op.block_attr("sub_block")) + grad_sub_block = program.create_block(parent_idx=sub_block.idx) + _append_backward_ops_(target, sub_block, grad_sub_block, + no_grad_dict, grad_to_var, callback) + grad_sub_block_list.append(grad_sub_block.desc) + + grad_op_desc, op_grad_to_var = core.get_grad_op_desc( + op.desc, no_grad_dict[block.idx], grad_sub_block_list) + grad_op_descs.extend(grad_op_desc) + grad_to_var.update(op_grad_to_var) + + grad_op_descs = _addup_repetitive_outputs_(grad_op_descs) + + grad_op_descs = _remove_no_grad_branch_(grad_op_descs, + no_grad_dict[block.idx]) if target_block.idx == 0: - grad_target_name = _append_grad_suffix_(target.name) - # target_block.desc.var(grad_target_name.encode("ascii")) grad_op_descs.insert( 0, - _create_op_desc_( - op_type="fill_constant", - inputs={}, - outputs={"Out": [grad_target_name]}, - attrs={"shape": [1], - "value": 1.0, - "dtype": target.dtype})) + _create_op_desc_("fill_constant", {}, { + "Out": [_append_grad_suffix_(target.name)] + }, {"shape": [1], + "value": 1.0, + "dtype": target.dtype})) + # append op_desc in grad_op_descs to target_block for op_desc in grad_op_descs: new_op_desc = target_block.desc.append_op() new_op_desc.copy_from(op_desc) - return grad_to_var - def _append_backward_vars_(block, start_op_idx, grad_to_var, grad_info_map): for op_idx in range(start_op_idx, block.desc.op_size()): @@ -194,15 +195,15 @@ def _append_backward_vars_(block, start_op_idx, grad_to_var, grad_info_map): _infer_var_data_type_(arg, block) -def append_backward(loss, parameter_list=None, no_grad_set=None): +def append_backward(loss, parameter_list=None, no_grad_dict=None): """ Create and add gradient Operators in BlockDesc to compute gradients of `loss` for parameters in parameter_list :param loss: an variable generated by cost function. :type loss: Variable - :param no_grad_set: variable that should not create gradient - :type no_grad_set: set + :param no_grad_dict: variable that should not create gradient + :type no_grad_dict: set :param parameter_list: parameters that need to compute gradient and update to optimize the lost. 
:type: list @@ -212,8 +213,8 @@ def append_backward(loss, parameter_list=None, no_grad_set=None): assert isinstance(loss, framework.Variable) program = loss.block.program - if no_grad_set is None: - no_grad_set = dict() + if no_grad_dict is None: + no_grad_dict = dict() assert isinstance(program, framework.Program) for block in program.blocks: assert isinstance(block, framework.Block) @@ -222,19 +223,21 @@ def append_backward(loss, parameter_list=None, no_grad_set=None): assert isinstance(var, framework.Variable) if var.stop_gradient: block_no_grad_set.add(_append_grad_suffix_(var.name)) - no_grad_set[block.idx] = block_no_grad_set - else: - # FIX ME - no_grad_set = {0: no_grad_set} + no_grad_dict[block.idx] = block_no_grad_set + elif isinstance(no_grad_dict, set): + no_grad_dict = {0: no_grad_dict} grad_info_map = dict() root_block = program.block(0) fwd_op_num = root_block.desc.op_size() current_block_idx = program.current_block_idx - grad_to_var = _append_backward_ops_(loss, root_block, root_block, - no_grad_set) + grad_to_var = dict() + + _append_backward_ops_(loss, root_block, root_block, no_grad_dict, + grad_to_var) _append_backward_vars_(root_block, fwd_op_num, grad_to_var, grad_info_map) + program.current_block_idx = current_block_idx program.sync_with_cpp() From 5361911c689e1368adc4c8b0c86ea44c310796dc Mon Sep 17 00:00:00 2001 From: typhoonzero Date: Tue, 26 Dec 2017 21:23:08 +0800 Subject: [PATCH 063/181] adam support sparse --- paddle/operators/adam_op.h | 13 ++--- python/paddle/v2/fluid/tests/test_adam_op.py | 58 +++++++++----------- 2 files changed, 32 insertions(+), 39 deletions(-) diff --git a/paddle/operators/adam_op.h b/paddle/operators/adam_op.h index aa58c4f990..5facd0112f 100644 --- a/paddle/operators/adam_op.h +++ b/paddle/operators/adam_op.h @@ -98,13 +98,12 @@ struct SparseAdamFunctor { const int64_t* rows_; int64_t row_numel_; - int64_t height_; SparseAdamFunctor(T beta1, T beta2, T epsilon, const T* beta1_pow, const T* beta2_pow, const T* mom1, T* mom1_out, const T* mom2, T* mom2_out, const T* lr, const T* grad, const T* param, T* param_out, const int64_t* rows, - int64_t row_numel, int64_t height) + int64_t row_numel) : beta1_(beta1), beta2_(beta2), epsilon_(epsilon), @@ -119,8 +118,7 @@ struct SparseAdamFunctor { param_(param), param_out_(param_out), rows_(rows), - row_numel_(row_numel), - height_(height) {} + row_numel_(row_numel) {} inline HOSTDEVICE void operator()(size_t i) const { for (int64_t j = 0; j < row_numel_; ++j) { @@ -136,6 +134,7 @@ struct SparseAdamFunctor { mom1 = beta1_ * mom1 + (1 - beta1_) * g; mom2 = beta2_ * mom2 + (1 - beta2_) * g * g; p -= lr * (mom1 / (sqrt(mom2) + epsilon_)); + // IMPORTANT: // FIXME(typhoonzero): row id may be duplicate moment1_out_[rows_[i] * row_numel_ + j] = mom1; moment2_out_[rows_[i] * row_numel_ + j] = mom2; @@ -195,8 +194,7 @@ class AdamOpKernel : public framework::OpKernel { auto& grad_tensor = grad.value(); const T* grad_data = grad_tensor.template data(); auto* rows = grad.rows().data(); - auto height = grad.height(); - auto row_numel = grad_tensor.numel() / height; + auto row_numel = grad_tensor.numel() / grad.rows().size(); SparseAdamFunctor functor( beta1, beta2, epsilon, beta1_pow.template data(), @@ -205,8 +203,7 @@ class AdamOpKernel : public framework::OpKernel { mom2.template data(), mom2_out.template mutable_data(ctx.GetPlace()), lr.template data(), grad_data, param.template data(), - param_out.template mutable_data(ctx.GetPlace()), rows, row_numel, - height); + param_out.template 
mutable_data(ctx.GetPlace()), rows, row_numel); platform::ForRange for_range( static_cast(ctx.device_context()), grad.rows().size()); diff --git a/python/paddle/v2/fluid/tests/test_adam_op.py b/python/paddle/v2/fluid/tests/test_adam_op.py index a66fd33102..996fcfe49d 100644 --- a/python/paddle/v2/fluid/tests/test_adam_op.py +++ b/python/paddle/v2/fluid/tests/test_adam_op.py @@ -1,6 +1,8 @@ import unittest import numpy as np from op_test import OpTest +from paddle.v2.fluid import core +from paddle.v2.fluid.op import Operator class TestAdamOp1(OpTest): @@ -196,9 +198,9 @@ def adam_step_sparse(inputs, attributes, height, rows, row_numel, np_grad): beta2 = attributes['beta2'] epsilon = attributes['epsilon'] - moment1_out = np.array([height, row_numel]) - moment2_out = np.array([height, row_numel]) - param_out = np.array([height, row_numel]) + moment1_out = np.zeros(shape=[height, row_numel]) + moment2_out = np.zeros(shape=[height, row_numel]) + param_out = np.zeros(shape=[height, row_numel]) for idx, row_id in enumerate(rows): moment1_out[row_id] = beta1 * moment1[row_id] + (1 - beta1 @@ -206,8 +208,8 @@ def adam_step_sparse(inputs, attributes, height, rows, row_numel, np_grad): moment2_out[row_id] = beta2 * moment2[row_id] + ( 1 - beta2) * np.square(np_grad[idx]) lr_t = lr * np.sqrt(1 - beta2_pow) / (1 - beta1_pow) - param_out[row_id] = param[row_id] - lr_t * (moment1_out / ( - np.sqrt(moment2_out) + epsilon)) + param_out[row_id] = param[row_id] - lr_t * (moment1_out[row_id] / ( + np.sqrt(moment2_out[row_id]) + epsilon)) return param_out, moment1_out, moment2_out @@ -219,13 +221,15 @@ class TestSparseAdamOp(unittest.TestCase): height = 10 rows = [0, 4, 7] + self.rows = rows row_numel = 12 + self.row_numel = row_numel self.dense_inputs = { "Param": np.full((height, row_numel), 5.0).astype("float32"), "Moment1": np.full((height, row_numel), 5.0).astype("float32"), "Moment2": np.full((height, row_numel), 5.0).astype("float32"), - 'Beta1Pow': np.array([0.9**10]).astype("float32"), - 'Beta2Pow': np.array([0.999**10]).astype("float32"), + 'Beta1Pow': np.array([beta1**10]).astype("float32"), + 'Beta2Pow': np.array([beta2**10]).astype("float32"), "LearningRate": np.full((1), 2.0).astype("float32") } self.attrs = {'epsilon': epsilon, 'beta1': beta1, 'beta2': beta2} @@ -245,7 +249,7 @@ class TestSparseAdamOp(unittest.TestCase): param_out, mom1, mom2 = adam_step_sparse( self.dense_inputs, self.attrs, height, rows, row_numel, np_array) self.outputs = { - "Param": param_out, + "ParamOut": param_out, "Moment1Out": mom1, "Moment2Out": mom2 } @@ -261,37 +265,29 @@ class TestSparseAdamOp(unittest.TestCase): op_args[key] = key for s in self.sparse_inputs: op_args[s] = s + for s in self.outputs: + var = scope.var(s).get_tensor() + var.set(self.outputs[s], place) + op_args[s] = s for k in self.attrs: op_args[k] = self.attrs[k] # create and run sgd operator - sgd_op = Operator("adam", **op_args) - sgd_op.run(scope, place) + adam_op = Operator("adam", **op_args) + adam_op.run(scope, place) for key, np_array in self.outputs.iteritems(): out_var = scope.var(key).get_tensor() actual = np.array(out_var) - actual.reshape([actual.size()]) - np_array.reshape([np_array.size()]) - i = 0 - while i < actual.size(): - self.assertAlmostEqual(actual[i], np_array[i]) - i += 1 - - # # rows[0] = 0, 5.0 - 2.0 * 2.0 - # self.assertAlmostEqual(1.0, result_array[rows[0], 0]) - # # rows[0] = 0, 5.0 - 2.0 * 1.0 - # self.assertAlmostEqual(3.0, result_array[rows[0], 2]) - # # 5.0 - 2.0 * 0.0 - # self.assertAlmostEqual(5.0, result_array[1, 
0]) - # # rows[1] = 4, 5.0 - 2.0 * 1.0 - # self.assertAlmostEqual(3.0, result_array[rows[1], 10]) - # # 5.0 - 2.0 * 0.0 - # self.assertAlmostEqual(5.0, result_array[5, 8]) - # # rows[2] = 7, 5.0 - 2.0 * 1.0 - # self.assertAlmostEqual(3.0, result_array[rows[2], 1]) - # # rows[2] = 7, 5.0 - 2.0 * 4.0 - # self.assertAlmostEqual(-3.0, result_array[rows[2], 8]) + actual = actual.reshape([actual.size]) + np_array = np_array.reshape([np_array.size]) + for idx, row_id in enumerate(self.rows): + j = 0 + while j < self.row_numel: + pos = row_id * self.row_numel + j + print (actual[pos] - np_array[pos]) / actual[pos] + self.assertLess((actual[pos] - np_array[pos]) / actual[pos], 0.00001) + j += 1 def test_sparse_sgd(self): places = [core.CPUPlace()] From 32d881beabdfb7130072bb624bc29fa6c6b30904 Mon Sep 17 00:00:00 2001 From: qingqing01 Date: Tue, 26 Dec 2017 05:46:23 -0800 Subject: [PATCH 064/181] Optimize the rowwise add function. --- paddle/operators/math/math_function.cc | 32 ++++++++++++++++++++++ paddle/operators/math/math_function.cu | 27 ++++++++++++++++++ paddle/operators/math/math_function_impl.h | 19 ------------- 3 files changed, 59 insertions(+), 19 deletions(-) diff --git a/paddle/operators/math/math_function.cc b/paddle/operators/math/math_function.cc index 2b35e4532a..1a4829c49f 100644 --- a/paddle/operators/math/math_function.cc +++ b/paddle/operators/math/math_function.cc @@ -302,8 +302,40 @@ void set_constant(const platform::DeviceContext& context, #endif } +template +struct RowwiseAdd { + void operator()(const platform::CPUDeviceContext& context, + const framework::Tensor& input, + const framework::Tensor& vector, framework::Tensor* output) { + auto in_dims = input.dims(); + auto size = input.numel() / in_dims[0]; + PADDLE_ENFORCE_EQ(vector.numel(), size); + PADDLE_ENFORCE_EQ(output->dims(), in_dims); + + // auto in = framework::EigenMatrix::From(input); + // auto vec = framework::EigenVector::Flatten(vector); + // auto out = framework::EigenMatrix::From(*output); + // for (int64_t i = 0; i < in_dims[0]; ++i) { + // out.chip(i, 0) = in.chip(i, 0) + vec; + // } + + auto* in = input.data(); + auto* vec = vector.data(); + auto* out = output->data(); + + int64_t h = in_dims[0]; + int64_t w = in_dims[1]; + for (int64_t i = 0; i < h; ++i) { + for (int64_t j = 0; j < w; ++j) { + out[i * w + j] = in[i * w + j] + vec[j]; + } + } + } +}; + template struct RowwiseAdd; template struct RowwiseAdd; + template struct ColwiseSum; template struct ColwiseSum; diff --git a/paddle/operators/math/math_function.cu b/paddle/operators/math/math_function.cu index 927838a094..36e6cc8914 100644 --- a/paddle/operators/math/math_function.cu +++ b/paddle/operators/math/math_function.cu @@ -273,6 +273,33 @@ void set_constant_with_place( TensorSetConstantGPU(context, tensor, value)); } +template +__global__ void RowwiseAddKernel(const T* a, const T* b, T* c, int64_t height, + int64_t width) { + int64_t num = height * width; + for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < num; + i += blockDim.x * gridDim.x) { + int h = i / width; + int w = i % width; + int idx = h * width + w; + c[idx] = a[idx] + b[w]; + } +} + +template +struct RowwiseAdd { + void operator()(const platform::CUDADeviceContext& context, + const framework::Tensor& input, + const framework::Tensor& vector, framework::Tensor* output) { + auto in_dims = input.dims(); + int blocks = 512; + int grids = (input.numel() + blocks - 1) / blocks; + RowwiseAddKernel<<>>( + input.data(), vector.data(), output->data(), in_dims[0], + in_dims[1]); + } +}; 
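+
+// Note: RowwiseAddKernel above uses a grid-stride loop, so the launch of
+// ceil(numel / 512) blocks of 512 threads covers an input of any size; each
+// thread computes c[idx] = a[idx] + b[w] with w = idx % width, i.e. the bias
+// vector b is broadcast across every row of the height x width matrix.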
+ template struct RowwiseAdd; template struct RowwiseAdd; template struct ColwiseSum; diff --git a/paddle/operators/math/math_function_impl.h b/paddle/operators/math/math_function_impl.h index ddd798dace..de591626df 100644 --- a/paddle/operators/math/math_function_impl.h +++ b/paddle/operators/math/math_function_impl.h @@ -45,25 +45,6 @@ void Transpose::operator()( eigen_out.device(*dev) = eigen_in.shuffle(permute); } -template -void RowwiseAdd::operator()(const DeviceContext& context, - const framework::Tensor& input, - const framework::Tensor& vector, - framework::Tensor* output) { - auto in_dims = input.dims(); - auto size = input.numel() / in_dims[0]; - PADDLE_ENFORCE_EQ(vector.numel(), size); - PADDLE_ENFORCE_EQ(output->dims(), in_dims); - - auto in = framework::EigenMatrix::From(input); - auto vec = framework::EigenMatrix::From(vector); - auto out = framework::EigenMatrix::From(*output); - Eigen::array shape({{1, static_cast(size)}}); - Eigen::array bcast({{static_cast(in_dims[0]), 1}}); - out.device(*context.eigen_device()) = - in + vec.reshape(shape).broadcast(bcast); -} - template void ColwiseSum::operator()(const DeviceContext& context, const framework::Tensor& input, From 41372ded20f18d8367b76aee792b6499117a3ce6 Mon Sep 17 00:00:00 2001 From: qingqing01 Date: Tue, 26 Dec 2017 06:03:45 -0800 Subject: [PATCH 065/181] Resume CPU implementation. --- paddle/operators/math/math_function.cc | 23 ++++++----------------- 1 file changed, 6 insertions(+), 17 deletions(-) diff --git a/paddle/operators/math/math_function.cc b/paddle/operators/math/math_function.cc index 1a4829c49f..d4f12f0a10 100644 --- a/paddle/operators/math/math_function.cc +++ b/paddle/operators/math/math_function.cc @@ -312,23 +312,12 @@ struct RowwiseAdd { PADDLE_ENFORCE_EQ(vector.numel(), size); PADDLE_ENFORCE_EQ(output->dims(), in_dims); - // auto in = framework::EigenMatrix::From(input); - // auto vec = framework::EigenVector::Flatten(vector); - // auto out = framework::EigenMatrix::From(*output); - // for (int64_t i = 0; i < in_dims[0]; ++i) { - // out.chip(i, 0) = in.chip(i, 0) + vec; - // } - - auto* in = input.data(); - auto* vec = vector.data(); - auto* out = output->data(); - - int64_t h = in_dims[0]; - int64_t w = in_dims[1]; - for (int64_t i = 0; i < h; ++i) { - for (int64_t j = 0; j < w; ++j) { - out[i * w + j] = in[i * w + j] + vec[j]; - } + auto in = framework::EigenMatrix::From(input); + auto vec = framework::EigenVector::Flatten(vector); + auto out = framework::EigenMatrix::From(*output); + + for (int64_t i = 0; i < in_dims[0]; ++i) { + out.chip(i, 0) = in.chip(i, 0) + vec; } } }; From c532fdab29f17d3aa7edc7902d9a5a94346660b4 Mon Sep 17 00:00:00 2001 From: fengjiayi Date: Tue, 26 Dec 2017 23:44:02 +0800 Subject: [PATCH 066/181] fix errors --- python/paddle/v2/fluid/backward.py | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/python/paddle/v2/fluid/backward.py b/python/paddle/v2/fluid/backward.py index b90949838e..6966cc7580 100644 --- a/python/paddle/v2/fluid/backward.py +++ b/python/paddle/v2/fluid/backward.py @@ -195,7 +195,7 @@ def _append_backward_vars_(block, start_op_idx, grad_to_var, grad_info_map): _infer_var_data_type_(arg, block) -def append_backward(loss, parameter_list=None, no_grad_dict=None): +def append_backward(loss, parameter_list=None, no_grad_set=None): """ Create and add gradient Operators in BlockDesc to compute gradients of `loss` for parameters in parameter_list @@ -213,8 +213,8 @@ def append_backward(loss, parameter_list=None, no_grad_set=None):
assert isinstance(loss, framework.Variable) program = loss.block.program - if no_grad_dict is None: - no_grad_dict = dict() + no_grad_dict = dict() + if no_grad_set is None: assert isinstance(program, framework.Program) for block in program.blocks: assert isinstance(block, framework.Block) @@ -224,8 +224,10 @@ def append_backward(loss, parameter_list=None, no_grad_set=None): if var.stop_gradient: block_no_grad_set.add(_append_grad_suffix_(var.name)) no_grad_dict[block.idx] = block_no_grad_set - elif isinstance(no_grad_dict, set): - no_grad_dict = {0: no_grad_dict} + elif isinstance(no_grad_set, set): + no_grad_dict = {0: no_grad_set} + else: + raise ValueError("'no_grad_set' should be a set or None.") grad_info_map = dict() root_block = program.block(0) From 14b87dbfebe661d7cb98f2df9592923bc334c66b Mon Sep 17 00:00:00 2001 From: fengjiayi Date: Wed, 27 Dec 2017 01:00:41 +0800 Subject: [PATCH 067/181] update reorder_lod_tensor_op_test --- python/paddle/v2/fluid/tests/test_reorder_lod_tensor.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/paddle/v2/fluid/tests/test_reorder_lod_tensor.py b/python/paddle/v2/fluid/tests/test_reorder_lod_tensor.py index 8f5774835e..7c136f6360 100644 --- a/python/paddle/v2/fluid/tests/test_reorder_lod_tensor.py +++ b/python/paddle/v2/fluid/tests/test_reorder_lod_tensor.py @@ -12,7 +12,7 @@ class TestReorderLoDTensor(unittest.TestCase): new_dat = fluid.layers.reorder_lod_tensor_by_rank( x=dat, rank_table=table) loss = fluid.layers.mean(x=new_dat) - fluid.backward.append_backward_ops(loss=loss) + fluid.backward.append_backward(loss=loss) cpu = fluid.CPUPlace() exe = fluid.Executor(cpu) From dd21ae6c1ee3b681bfd069760448fead207964ee Mon Sep 17 00:00:00 2001 From: typhoonzero Date: Wed, 27 Dec 2017 09:58:51 +0800 Subject: [PATCH 068/181] update --- python/paddle/v2/fluid/tests/test_adam_op.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/python/paddle/v2/fluid/tests/test_adam_op.py b/python/paddle/v2/fluid/tests/test_adam_op.py index 996fcfe49d..3758ca457e 100644 --- a/python/paddle/v2/fluid/tests/test_adam_op.py +++ b/python/paddle/v2/fluid/tests/test_adam_op.py @@ -285,8 +285,9 @@ class TestSparseAdamOp(unittest.TestCase): j = 0 while j < self.row_numel: pos = row_id * self.row_numel + j - print (actual[pos] - np_array[pos]) / actual[pos] - self.assertLess((actual[pos] - np_array[pos]) / actual[pos], 0.00001) + print(actual[pos] - np_array[pos]) / actual[pos] + self.assertLess((actual[pos] - np_array[pos]) / actual[pos], + 0.00001) j += 1 def test_sparse_sgd(self): From e5777d062bd916b44d12ee5b4e28c8cbef32524d Mon Sep 17 00:00:00 2001 From: typhoonzero Date: Wed, 27 Dec 2017 10:19:41 +0800 Subject: [PATCH 069/181] fix build link rt --- paddle/pybind/CMakeLists.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/paddle/pybind/CMakeLists.txt b/paddle/pybind/CMakeLists.txt index 6afed7eec7..ced75cbfd8 100644 --- a/paddle/pybind/CMakeLists.txt +++ b/paddle/pybind/CMakeLists.txt @@ -3,6 +3,7 @@ if(WITH_PYTHON) SRCS pybind.cc exception.cc protobuf.cc const_value.cc DEPS pybind python backward proto_desc paddle_memory executor prune init ${GLOB_OP_LIB}) + target_link_libraries(paddle_pybind rt) endif(WITH_PYTHON) if(WITH_DOC) From fd2bf55016e6de50bbc436476050f1c442cb654c Mon Sep 17 00:00:00 2001 From: Yang Yu Date: Wed, 27 Dec 2017 10:29:29 +0800 Subject: [PATCH 070/181] Rename API of DeviceContext Use the usual names.
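The rename maps Create() -> Init(), the static accessor Get() -> Instance(), and Borrow() -> Get(). A minimal usage sketch of the renamed API (the Example() wrapper and the CPU-only place list are illustrative, not part of this patch):

    #include <vector>
    #include "paddle/platform/device_context.h"

    void Example() {
      namespace platform = paddle::platform;
      // Init() builds the singleton pool once at startup; in practice
      // framework::InitDevices() does this for you.
      std::vector<platform::Place> places;
      places.emplace_back(platform::CPUPlace());
      platform::DeviceContextPool::Init(places);
      // Instance() returns the pool; Get() returns the context for one place.
      platform::DeviceContextPool& pool =
          platform::DeviceContextPool::Instance();
      const platform::DeviceContext* dev_ctx = pool.Get(platform::CPUPlace());
      (void)dev_ctx;  // hand the context to CopyFrom(), kernels, etc.
    }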
--- paddle/framework/init.cc | 2 +- paddle/framework/operator.cc | 4 +-- paddle/operators/array_operator.h | 4 +-- paddle/operators/array_to_lod_tensor_op.cc | 5 ++-- paddle/operators/assign_op.cc | 4 +-- paddle/operators/cond_op.cc | 4 +-- paddle/operators/feed_op.cc | 4 +-- paddle/operators/fetch_op.cc | 4 +-- paddle/operators/fill_constant_op.cc | 4 +-- paddle/operators/fill_op.cc | 5 ++-- paddle/operators/load_op.cc | 4 +-- paddle/operators/lod_tensor_to_array_op.cc | 5 ++-- paddle/operators/merge_lod_tensor_op.cc | 4 +-- paddle/operators/recurrent_op.cc | 9 +++--- .../reorder_lod_tensor_by_rank_op.cc | 4 +-- paddle/operators/save_op.cc | 4 +-- paddle/operators/shrink_rnn_memory_op.cc | 4 +-- paddle/operators/split_lod_tensor_op.cc | 4 +-- .../operators/tensor_array_read_write_op.cc | 10 ++++--- paddle/platform/device_context.cc | 20 +------------ paddle/platform/device_context.h | 12 ++------ paddle/platform/device_context_test.cu | 29 +++++-------------- paddle/platform/nccl_test.cu | 2 +- 23 files changed, 59 insertions(+), 92 deletions(-) diff --git a/paddle/framework/init.cc b/paddle/framework/init.cc index d6601090d5..682cff168d 100644 --- a/paddle/framework/init.cc +++ b/paddle/framework/init.cc @@ -71,7 +71,7 @@ bool InitDevices(const std::vector &devices) { places.emplace_back(platform::CPUPlace()); LOG(WARNING) << "Not specified CPU device, create CPU by Default."; } - platform::DeviceContextPool::Create(places); + platform::DeviceContextPool::Init(places); return true; } diff --git a/paddle/framework/operator.cc b/paddle/framework/operator.cc index 66840a2e03..307730de2e 100644 --- a/paddle/framework/operator.cc +++ b/paddle/framework/operator.cc @@ -387,8 +387,8 @@ void OperatorWithKernel::Run(const Scope& scope, const platform::Place& place) const { RuntimeInferShapeContext infer_shape_ctx(*this, scope); this->InferShape(&infer_shape_ctx); - platform::DeviceContextPool& pool = platform::DeviceContextPool::Get(); - auto dev_ctx = pool.Borrow(place); + platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance(); + auto dev_ctx = pool.Get(place); // check if op[type] has kernel registered. 
auto& all_op_kernels = AllOpKernels(); diff --git a/paddle/operators/array_operator.h b/paddle/operators/array_operator.h index 060ffac827..e0eef5d9f9 100644 --- a/paddle/operators/array_operator.h +++ b/paddle/operators/array_operator.h @@ -35,8 +35,8 @@ class ArrayOp : public framework::OperatorBase { PADDLE_ENFORCE_EQ(i_tensor.numel(), 1); // get device context from pool - platform::DeviceContextPool &pool = platform::DeviceContextPool::Get(); - auto &dev_ctx = *pool.Borrow(place); + platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance(); + auto &dev_ctx = *pool.Get(place); size_t offset; if (platform::is_gpu_place(i_tensor.place())) { diff --git a/paddle/operators/array_to_lod_tensor_op.cc b/paddle/operators/array_to_lod_tensor_op.cc index 0aa04c268b..49366fee8d 100644 --- a/paddle/operators/array_to_lod_tensor_op.cc +++ b/paddle/operators/array_to_lod_tensor_op.cc @@ -106,8 +106,9 @@ class ArrayToLoDTensorOp : public framework::OperatorBase { } auto slice = out->Slice(out_offset, out_offset + len); - platform::DeviceContextPool &pool = platform::DeviceContextPool::Get(); - auto &dev_ctx = *pool.Borrow(place); + platform::DeviceContextPool &pool = + platform::DeviceContextPool::Instance(); + auto &dev_ctx = *pool.Get(place); framework::CopyFrom(x[x_idx].Slice(start_offset, end_offset), place, dev_ctx, &slice); diff --git a/paddle/operators/assign_op.cc b/paddle/operators/assign_op.cc index 0560040509..7d77be3be1 100644 --- a/paddle/operators/assign_op.cc +++ b/paddle/operators/assign_op.cc @@ -82,8 +82,8 @@ class AssignOp : public framework::OperatorBase { out != nullptr, "The Output(Out) should not be null if the Input(X) is set."); - platform::DeviceContextPool &pool = platform::DeviceContextPool::Get(); - auto &dev_ctx = *pool.Borrow(place); + platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance(); + auto &dev_ctx = *pool.Get(place); framework::VisitVarType(*x, AssignFunctor(out, dev_ctx)); } diff --git a/paddle/operators/cond_op.cc b/paddle/operators/cond_op.cc index 455fbd8ca3..e333002bfd 100644 --- a/paddle/operators/cond_op.cc +++ b/paddle/operators/cond_op.cc @@ -195,8 +195,8 @@ void CondOp::MergeDataFromSubnet(const framework::Scope& scope, void CondOp::Run(const Scope& scope, const platform::Place& place) const { // get device context from pool - platform::DeviceContextPool& pool = platform::DeviceContextPool::Get(); - auto& dev_ctx = *pool.Borrow(place); + platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance(); + auto& dev_ctx = *pool.Get(place); PrepareDataForSubnet(scope, dev_ctx); std::vector& sub_scopes = GetSubScopes(scope); diff --git a/paddle/operators/feed_op.cc b/paddle/operators/feed_op.cc index cecbb7226a..48da52c3b6 100644 --- a/paddle/operators/feed_op.cc +++ b/paddle/operators/feed_op.cc @@ -49,8 +49,8 @@ class FeedOp : public framework::OperatorBase { auto *out_item = out_var->GetMutable(); // get device context from pool - platform::DeviceContextPool &pool = platform::DeviceContextPool::Get(); - auto &dev_ctx = *pool.Borrow(place); + platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance(); + auto &dev_ctx = *pool.Get(place); framework::CopyFrom(feed_item, place, dev_ctx, out_item); out_item->set_lod(feed_item.lod()); diff --git a/paddle/operators/fetch_op.cc b/paddle/operators/fetch_op.cc index fa20a06540..387d1e0a74 100644 --- a/paddle/operators/fetch_op.cc +++ b/paddle/operators/fetch_op.cc @@ -52,8 +52,8 @@ class FetchOp : public framework::OperatorBase { // FIXME(yuyang18): 
Should we assume the fetch operator always generate // CPU outputs? - platform::DeviceContextPool &pool = platform::DeviceContextPool::Get(); - auto &dev_ctx = *pool.Borrow(place); + platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance(); + auto &dev_ctx = *pool.Get(place); CopyFrom(src_item, platform::CPUPlace(), dev_ctx, &dst_item); dev_ctx.Wait(); diff --git a/paddle/operators/fill_constant_op.cc b/paddle/operators/fill_constant_op.cc index fe0706c4a9..dcd43a30c8 100644 --- a/paddle/operators/fill_constant_op.cc +++ b/paddle/operators/fill_constant_op.cc @@ -49,8 +49,8 @@ class FillConstantOp : public framework::OperatorBase { out.mutable_data(dev_place, framework::ToTypeIndex(data_type)); } - platform::DeviceContextPool &pool = platform::DeviceContextPool::Get(); - auto &dev_ctx = *pool.Borrow(dev_place); + platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance(); + auto &dev_ctx = *pool.Get(dev_place); math::set_constant(dev_ctx, &out, value); } }; diff --git a/paddle/operators/fill_op.cc b/paddle/operators/fill_op.cc index 57b4ec6938..084ba1db62 100644 --- a/paddle/operators/fill_op.cc +++ b/paddle/operators/fill_op.cc @@ -69,8 +69,9 @@ class FillOp : public framework::OperatorBase { if (!force_cpu && platform::is_gpu_place(place)) { // Copy tensor to out - platform::DeviceContextPool &pool = platform::DeviceContextPool::Get(); - auto &dev_ctx = *pool.Borrow(place); + platform::DeviceContextPool &pool = + platform::DeviceContextPool::Instance(); + auto &dev_ctx = *pool.Get(place); framework::CopyFrom(tensor, place, dev_ctx, &out); } } diff --git a/paddle/operators/load_op.cc b/paddle/operators/load_op.cc index 5425375c1f..65f021d919 100644 --- a/paddle/operators/load_op.cc +++ b/paddle/operators/load_op.cc @@ -40,8 +40,8 @@ class LoadOp : public framework::OperatorBase { auto *tensor = out_var->GetMutable(); framework::DeserializeFromStream(fin, tensor); - platform::DeviceContextPool &pool = platform::DeviceContextPool::Get(); - auto &dev_ctx = *pool.Borrow(place); + platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance(); + auto &dev_ctx = *pool.Get(place); if (platform::is_gpu_place(place)) { // copy CPU to GPU diff --git a/paddle/operators/lod_tensor_to_array_op.cc b/paddle/operators/lod_tensor_to_array_op.cc index ed99915bb7..8d164b4abc 100644 --- a/paddle/operators/lod_tensor_to_array_op.cc +++ b/paddle/operators/lod_tensor_to_array_op.cc @@ -88,8 +88,9 @@ class LoDTensorToArrayOp : public framework::OperatorBase { auto slice = out[i].Slice(static_cast(offset), static_cast(offset + len)); - platform::DeviceContextPool &pool = platform::DeviceContextPool::Get(); - auto &dev_ctx = *pool.Borrow(place); + platform::DeviceContextPool &pool = + platform::DeviceContextPool::Instance(); + auto &dev_ctx = *pool.Get(place); framework::CopyFrom(x.Slice(static_cast(each_range.begin), static_cast(each_range.end)), diff --git a/paddle/operators/merge_lod_tensor_op.cc b/paddle/operators/merge_lod_tensor_op.cc index 2287f34791..3f999e404f 100644 --- a/paddle/operators/merge_lod_tensor_op.cc +++ b/paddle/operators/merge_lod_tensor_op.cc @@ -30,8 +30,8 @@ class MergeLoDTensorOp : public framework::OperatorBase { void Run(const framework::Scope &scope, const platform::Place &dev_place) const override { // get device context from pool - platform::DeviceContextPool &pool = platform::DeviceContextPool::Get(); - auto &dev_ctx = *pool.Borrow(dev_place); + platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance(); + auto 
&dev_ctx = *pool.Get(dev_place); auto &x = scope.FindVar(Input("X"))->Get(); auto &mask = scope.FindVar(Input("Mask"))->Get(); diff --git a/paddle/operators/recurrent_op.cc b/paddle/operators/recurrent_op.cc index 71769e67c7..056fa46949 100644 --- a/paddle/operators/recurrent_op.cc +++ b/paddle/operators/recurrent_op.cc @@ -272,8 +272,9 @@ class RecurrentOp : public RecurrentBase { false /*create_local_scope*/); // get device context from pool - platform::DeviceContextPool &pool = platform::DeviceContextPool::Get(); - auto &dev_ctx = *pool.Borrow(place); + platform::DeviceContextPool &pool = + platform::DeviceContextPool::Instance(); + auto &dev_ctx = *pool.Get(place); // Copy inside::output -> outside::output // outside::output[seq_offset: seq_offset + 1] = inside::output @@ -326,8 +327,8 @@ class RecurrentGradOp : public RecurrentBase { auto *program = block->Program(); // get device context from pool - platform::DeviceContextPool &pool = platform::DeviceContextPool::Get(); - auto &dev_ctx = *pool.Borrow(place); + platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance(); + auto &dev_ctx = *pool.Get(place); for (size_t step_id = 0; step_id < seq_len; ++step_id) { size_t seq_offset = reverse ? step_id : seq_len - step_id - 1; diff --git a/paddle/operators/reorder_lod_tensor_by_rank_op.cc b/paddle/operators/reorder_lod_tensor_by_rank_op.cc index 1063388e25..8d652ff806 100644 --- a/paddle/operators/reorder_lod_tensor_by_rank_op.cc +++ b/paddle/operators/reorder_lod_tensor_by_rank_op.cc @@ -131,8 +131,8 @@ class ReorderLoDTensorByRankTableBase : public framework::OperatorBase { auto x_sliced = x.Slice(x_offset, x_offset + len); auto out_sliced = out->Slice(out_offset, out_offset + len); - platform::DeviceContextPool &pool = platform::DeviceContextPool::Get(); - auto &dev_ctx = *pool.Borrow(place); + platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance(); + auto &dev_ctx = *pool.Get(place); framework::CopyFrom(x_sliced, out_sliced.place(), dev_ctx, &out_sliced); out_offset += len; return out_offset; diff --git a/paddle/operators/save_op.cc b/paddle/operators/save_op.cc index d045a8b5b8..4b1cbe8883 100644 --- a/paddle/operators/save_op.cc +++ b/paddle/operators/save_op.cc @@ -91,8 +91,8 @@ class SaveOp : public framework::OperatorBase { auto &tensor = var->Get(); // get device context from pool - platform::DeviceContextPool &pool = platform::DeviceContextPool::Get(); - auto &dev_ctx = *pool.Borrow(place); + platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance(); + auto &dev_ctx = *pool.Get(place); framework::SerializeToStream(fout, tensor, dev_ctx); } diff --git a/paddle/operators/shrink_rnn_memory_op.cc b/paddle/operators/shrink_rnn_memory_op.cc index e8a4773547..e5ef0740b6 100644 --- a/paddle/operators/shrink_rnn_memory_op.cc +++ b/paddle/operators/shrink_rnn_memory_op.cc @@ -106,8 +106,8 @@ class ShrinkRNNMemoryGradOp : public ArrayOp { dx_tensor.mutable_data(x_tensor.place(), x_tensor.type()); // get device context from pool - platform::DeviceContextPool &pool = platform::DeviceContextPool::Get(); - auto &dev_ctx = *pool.Borrow(place); + platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance(); + auto &dev_ctx = *pool.Get(place); if (dout_var == nullptr) { // dx_tensor fill zero math::set_constant(dev_ctx, &dx_tensor, 0.0f); diff --git a/paddle/operators/split_lod_tensor_op.cc b/paddle/operators/split_lod_tensor_op.cc index 89826ca6ee..2d8787d740 100644 --- a/paddle/operators/split_lod_tensor_op.cc +++ 
b/paddle/operators/split_lod_tensor_op.cc @@ -45,8 +45,8 @@ class SplitLoDTensorOp : public framework::OperatorBase { auto &x_lod = x.lod(); auto &mask_dim = mask.dims(); - platform::DeviceContextPool &pool = platform::DeviceContextPool::Get(); - auto &dev_ctx = *pool.Borrow(dev_place); + platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance(); + auto &dev_ctx = *pool.Get(dev_place); std::unique_ptr cpu_mask{new framework::LoDTensor()}; if (platform::is_cpu_place(mask.place())) { diff --git a/paddle/operators/tensor_array_read_write_op.cc b/paddle/operators/tensor_array_read_write_op.cc index 9529aab573..53e38ec703 100644 --- a/paddle/operators/tensor_array_read_write_op.cc +++ b/paddle/operators/tensor_array_read_write_op.cc @@ -40,8 +40,9 @@ class WriteToArrayOp : public ArrayOp { if (x_tensor.memory_size() > 0) { auto *out_tensor = &out->at(offset); - platform::DeviceContextPool &pool = platform::DeviceContextPool::Get(); - auto &dev_ctx = *pool.Borrow(place); + platform::DeviceContextPool &pool = + platform::DeviceContextPool::Instance(); + auto &dev_ctx = *pool.Get(place); CopyFrom(x_tensor, place, dev_ctx, out_tensor); out_tensor->set_lod(x_tensor.lod()); @@ -132,8 +133,9 @@ class ReadFromArrayOp : public ArrayOp { auto *out_tensor = out->GetMutable(); size_t offset = GetOffset(scope, place); if (offset < x_array.size()) { - platform::DeviceContextPool &pool = platform::DeviceContextPool::Get(); - auto &dev_ctx = *pool.Borrow(place); + platform::DeviceContextPool &pool = + platform::DeviceContextPool::Instance(); + auto &dev_ctx = *pool.Get(place); framework::CopyFrom(x_array[offset], place, dev_ctx, out_tensor); out_tensor->set_lod(x_array[offset].lod()); } else { diff --git a/paddle/platform/device_context.cc b/paddle/platform/device_context.cc index e450ef32a4..ea07f2e002 100644 --- a/paddle/platform/device_context.cc +++ b/paddle/platform/device_context.cc @@ -17,7 +17,7 @@ namespace platform { DeviceContextPool* DeviceContextPool::pool = nullptr; -const platform::DeviceContext* DeviceContextPool::Borrow( +const platform::DeviceContext* DeviceContextPool::Get( const platform::Place& place) { auto it = device_contexts_.find(place); if (it == device_contexts_.end()) { @@ -28,24 +28,6 @@ const platform::DeviceContext* DeviceContextPool::Borrow( return it->second; } -std::vector DeviceContextPool::Borrow( - const std::vector& places) { - PADDLE_ENFORCE_GT(places.size(), 0); - PADDLE_ENFORCE_LE(places.size(), device_contexts_.size()); - std::vector borrowed_contexts; - for (auto& place : places) { - auto it = device_contexts_.find(place); - if (it != device_contexts_.end()) { - borrowed_contexts.emplace_back(it->second); - } else { - PADDLE_THROW( - "'Place' is not supported, Please re-compile with WITH_GPU " - "option"); - } - } - return borrowed_contexts; -} - DeviceContextPool::DeviceContextPool( const std::vector& places) { PADDLE_ENFORCE_GT(places.size(), 0); diff --git a/paddle/platform/device_context.h b/paddle/platform/device_context.h index 8ba12e1657..dfef2c16d8 100644 --- a/paddle/platform/device_context.h +++ b/paddle/platform/device_context.h @@ -109,13 +109,13 @@ class DeviceContextPool { public: explicit DeviceContextPool(const std::vector& places); - static DeviceContextPool& Get() { + static DeviceContextPool& Instance() { PADDLE_ENFORCE_NOT_NULL(pool, "Need to Create DeviceContextPool first!"); return *pool; } /*! 
\brief Create should only be called by Init function */ - static DeviceContextPool& Create(const std::vector& places) { + static DeviceContextPool& Init(const std::vector& places) { if (pool == nullptr) { pool = new DeviceContextPool(places); } @@ -123,13 +123,7 @@ class DeviceContextPool { } /*! \brief Return handle of single device context. */ - const platform::DeviceContext* Borrow(const platform::Place& place); - - /*! \brief Return handle of multi-device context. */ - std::vector Borrow( - const std::vector& places); - - ~DeviceContextPool() {} + const platform::DeviceContext* Get(const platform::Place& place); private: static DeviceContextPool* pool; diff --git a/paddle/platform/device_context_test.cu b/paddle/platform/device_context_test.cu index 91011bf71c..ca10cf3463 100644 --- a/paddle/platform/device_context_test.cu +++ b/paddle/platform/device_context_test.cu @@ -71,35 +71,20 @@ TEST(Device, DeviceContextPool) { using paddle::platform::CPUPlace; using paddle::platform::CUDAPlace; - DeviceContextPool& pool = DeviceContextPool::Get(); - auto cpu_dev_ctx1 = pool.Borrow(CPUPlace()); - auto cpu_dev_ctx2 = pool.Borrow(CPUPlace()); - EXPECT_TRUE(cpu_dev_ctx2 == cpu_dev_ctx1); + DeviceContextPool& pool = DeviceContextPool::Instance(); + auto cpu_dev_ctx1 = pool.Get(CPUPlace()); + auto cpu_dev_ctx2 = pool.Get(CPUPlace()); + ASSERT_EQ(cpu_dev_ctx2, cpu_dev_ctx1); std::vector gpu_places; int count = paddle::platform::GetCUDADeviceCount(); for (int i = 0; i < count; ++i) { - gpu_places.emplace_back(CUDAPlace(i)); - } - auto dev_ctxs = pool.Borrow(gpu_places); - for (size_t i = 0; i < dev_ctxs.size(); ++i) { - auto* dev_ctx = static_cast(dev_ctxs[i]); - - // check same as CUDAPlace(i) - CUDAPlace place = boost::get(dev_ctx->GetPlace()); - EXPECT_EQ(place.GetDeviceId(), static_cast(i)); + auto dev_ctx = pool.Get(CUDAPlace(i)); + ASSERT_NE(dev_ctx, nullptr); } } int main(int argc, char** argv) { - int dev_count = paddle::platform::GetCUDADeviceCount(); - if (dev_count <= 1) { - LOG(WARNING) << "Cannot test multi-gpu DeviceContextPool, because the CUDA " "device count is " << dev_count; - return 0; - } - std::vector places; places.emplace_back(paddle::platform::CPUPlace()); @@ -109,7 +94,7 @@ int main(int argc, char** argv) { } VLOG(0) << " DeviceCount " << count; - paddle::platform::DeviceContextPool::Create(places); + paddle::platform::DeviceContextPool::Init(places); testing::InitGoogleTest(&argc, argv); return RUN_ALL_TESTS(); diff --git a/paddle/platform/nccl_test.cu b/paddle/platform/nccl_test.cu index 8f815863a7..ef6d845874 100644 --- a/paddle/platform/nccl_test.cu +++ b/paddle/platform/nccl_test.cu @@ -144,7 +144,7 @@ int main(int argc, char** argv) { } VLOG(0) << " DeviceCount " << count; - paddle::platform::DeviceContextPool::Create(places); + paddle::platform::DeviceContextPool::Init(places); testing::InitGoogleTest(&argc, argv); return RUN_ALL_TESTS();
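Note that the multi-device Borrow(const std::vector&) overload is removed without a direct replacement; callers now query the pool once per place, as the updated device_context_test.cu does. A sketch of that per-place pattern (AllCudaContexts is an illustrative helper, and the gpu_info.h include for GetCUDADeviceCount() is assumed from this tree's layout):

    #include <vector>
    #include "paddle/platform/device_context.h"
    #include "paddle/platform/gpu_info.h"

    // Gather one context per visible CUDA device by asking the pool per place.
    std::vector<const paddle::platform::DeviceContext*> AllCudaContexts() {
      std::vector<const paddle::platform::DeviceContext*> dev_ctxs;
      int count = paddle::platform::GetCUDADeviceCount();
      for (int i = 0; i < count; ++i) {
        dev_ctxs.push_back(paddle::platform::DeviceContextPool::Instance().Get(
            paddle::platform::CUDAPlace(i)));
      }
      return dev_ctxs;
    }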
b/paddle/operators/split_lod_tensor_op.cc @@ -45,8 +45,8 @@ class SplitLoDTensorOp : public framework::OperatorBase { auto &x_lod = x.lod(); auto &mask_dim = mask.dims(); - platform::DeviceContextPool &pool = platform::DeviceContextPool::Get(); - auto &dev_ctx = *pool.Borrow(dev_place); + platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance(); + auto &dev_ctx = *pool.Get(dev_place); std::unique_ptr cpu_mask{new framework::LoDTensor()}; if (platform::is_cpu_place(mask.place())) { diff --git a/paddle/operators/tensor_array_read_write_op.cc b/paddle/operators/tensor_array_read_write_op.cc index 9529aab573..53e38ec703 100644 --- a/paddle/operators/tensor_array_read_write_op.cc +++ b/paddle/operators/tensor_array_read_write_op.cc @@ -40,8 +40,9 @@ class WriteToArrayOp : public ArrayOp { if (x_tensor.memory_size() > 0) { auto *out_tensor = &out->at(offset); - platform::DeviceContextPool &pool = platform::DeviceContextPool::Get(); - auto &dev_ctx = *pool.Borrow(place); + platform::DeviceContextPool &pool = + platform::DeviceContextPool::Instance(); + auto &dev_ctx = *pool.Get(place); CopyFrom(x_tensor, place, dev_ctx, out_tensor); out_tensor->set_lod(x_tensor.lod()); @@ -132,8 +133,9 @@ class ReadFromArrayOp : public ArrayOp { auto *out_tensor = out->GetMutable(); size_t offset = GetOffset(scope, place); if (offset < x_array.size()) { - platform::DeviceContextPool &pool = platform::DeviceContextPool::Get(); - auto &dev_ctx = *pool.Borrow(place); + platform::DeviceContextPool &pool = + platform::DeviceContextPool::Instance(); + auto &dev_ctx = *pool.Get(place); framework::CopyFrom(x_array[offset], place, dev_ctx, out_tensor); out_tensor->set_lod(x_array[offset].lod()); } else { diff --git a/paddle/platform/device_context.cc b/paddle/platform/device_context.cc index e450ef32a4..ea07f2e002 100644 --- a/paddle/platform/device_context.cc +++ b/paddle/platform/device_context.cc @@ -17,7 +17,7 @@ namespace platform { DeviceContextPool* DeviceContextPool::pool = nullptr; -const platform::DeviceContext* DeviceContextPool::Borrow( +const platform::DeviceContext* DeviceContextPool::Get( const platform::Place& place) { auto it = device_contexts_.find(place); if (it == device_contexts_.end()) { @@ -28,24 +28,6 @@ const platform::DeviceContext* DeviceContextPool::Borrow( return it->second; } -std::vector DeviceContextPool::Borrow( - const std::vector& places) { - PADDLE_ENFORCE_GT(places.size(), 0); - PADDLE_ENFORCE_LE(places.size(), device_contexts_.size()); - std::vector borrowed_contexts; - for (auto& place : places) { - auto it = device_contexts_.find(place); - if (it != device_contexts_.end()) { - borrowed_contexts.emplace_back(it->second); - } else { - PADDLE_THROW( - "'Place' is not supported, Please re-compile with WITH_GPU " - "option"); - } - } - return borrowed_contexts; -} - DeviceContextPool::DeviceContextPool( const std::vector& places) { PADDLE_ENFORCE_GT(places.size(), 0); diff --git a/paddle/platform/device_context.h b/paddle/platform/device_context.h index 8ba12e1657..dfef2c16d8 100644 --- a/paddle/platform/device_context.h +++ b/paddle/platform/device_context.h @@ -109,13 +109,13 @@ class DeviceContextPool { public: explicit DeviceContextPool(const std::vector& places); - static DeviceContextPool& Get() { + static DeviceContextPool& Instance() { PADDLE_ENFORCE_NOT_NULL(pool, "Need to Create DeviceContextPool first!"); return *pool; } /*! 
\brief Create should only called by Init function */ - static DeviceContextPool& Create(const std::vector& places) { + static DeviceContextPool& Init(const std::vector& places) { if (pool == nullptr) { pool = new DeviceContextPool(places); } @@ -123,13 +123,7 @@ class DeviceContextPool { } /*! \brief Return handle of single device context. */ - const platform::DeviceContext* Borrow(const platform::Place& place); - - /*! \brief Return handle of multi-device context. */ - std::vector Borrow( - const std::vector& places); - - ~DeviceContextPool() {} + const platform::DeviceContext* Get(const platform::Place& place); private: static DeviceContextPool* pool; diff --git a/paddle/platform/device_context_test.cu b/paddle/platform/device_context_test.cu index 91011bf71c..ca10cf3463 100644 --- a/paddle/platform/device_context_test.cu +++ b/paddle/platform/device_context_test.cu @@ -71,35 +71,20 @@ TEST(Device, DeviceContextPool) { using paddle::platform::CPUPlace; using paddle::platform::CUDAPlace; - DeviceContextPool& pool = DeviceContextPool::Get(); - auto cpu_dev_ctx1 = pool.Borrow(CPUPlace()); - auto cpu_dev_ctx2 = pool.Borrow(CPUPlace()); - EXPECT_TRUE(cpu_dev_ctx2 == cpu_dev_ctx1); + DeviceContextPool& pool = DeviceContextPool::Instance(); + auto cpu_dev_ctx1 = pool.Get(CPUPlace()); + auto cpu_dev_ctx2 = pool.Get(CPUPlace()); + ASSERT_EQ(cpu_dev_ctx2, cpu_dev_ctx1); std::vector gpu_places; int count = paddle::platform::GetCUDADeviceCount(); for (int i = 0; i < count; ++i) { - gpu_places.emplace_back(CUDAPlace(i)); - } - auto dev_ctxs = pool.Borrow(gpu_places); - for (size_t i = 0; i < dev_ctxs.size(); ++i) { - auto* dev_ctx = static_cast(dev_ctxs[i]); - - // check same as CUDAPlace(i) - CUDAPlace place = boost::get(dev_ctx->GetPlace()); - EXPECT_EQ(place.GetDeviceId(), static_cast(i)); + auto dev_ctx = pool.Get(CUDAPlace(i)); + ASSERT_NE(dev_ctx, nullptr); } } int main(int argc, char** argv) { - int dev_count = paddle::platform::GetCUDADeviceCount(); - if (dev_count <= 1) { - LOG(WARNING) << "Cannot test multi-gpu DeviceContextPool, because the CUDA " - "device count is " - << dev_count; - return 0; - } - std::vector places; places.emplace_back(paddle::platform::CPUPlace()); @@ -109,7 +94,7 @@ int main(int argc, char** argv) { } VLOG(0) << " DeviceCount " << count; - paddle::platform::DeviceContextPool::Create(places); + paddle::platform::DeviceContextPool::Init(places); testing::InitGoogleTest(&argc, argv); return RUN_ALL_TESTS(); diff --git a/paddle/platform/nccl_test.cu b/paddle/platform/nccl_test.cu index 8f815863a7..ef6d845874 100644 --- a/paddle/platform/nccl_test.cu +++ b/paddle/platform/nccl_test.cu @@ -144,7 +144,7 @@ int main(int argc, char** argv) { } VLOG(0) << " DeviceCount " << count; - paddle::platform::DeviceContextPool::Create(places); + paddle::platform::DeviceContextPool::Init(places); testing::InitGoogleTest(&argc, argv); return RUN_ALL_TESTS(); From 42062c38b17f0a8ba3431bcb043e78b87440e6ad Mon Sep 17 00:00:00 2001 From: Yang Yu Date: Wed, 27 Dec 2017 11:12:18 +0800 Subject: [PATCH 073/181] Fix compile --- paddle/operators/beam_search_decode_op.cc | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/paddle/operators/beam_search_decode_op.cc b/paddle/operators/beam_search_decode_op.cc index 52c28e7f53..72e05607b0 100644 --- a/paddle/operators/beam_search_decode_op.cc +++ b/paddle/operators/beam_search_decode_op.cc @@ -57,8 +57,8 @@ class BeamSearchDecodeOp : public framework::OperatorBase { : OperatorBase(type, inputs, outputs, attrs) {} void Run(const 
framework::Scope& scope,
           const platform::Place& dev_place) const override {
-    platform::DeviceContextPool& pool = platform::DeviceContextPool::Get();
-    auto& dev_ctx = *pool.Borrow(dev_place);
+    platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance();
+    auto& dev_ctx = *pool.Get(dev_place);

     framework::ExecutionContext ctx(*this, scope, dev_ctx);

From b711870c4ae2803374a5d5d86f011aa819055b7c Mon Sep 17 00:00:00 2001
From: Yang Yu
Date: Wed, 27 Dec 2017 13:24:40 +0800
Subject: [PATCH 074/181] Fix compile

---
 paddle/gserver/layers/MKLDNNLRNLayer.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/paddle/gserver/layers/MKLDNNLRNLayer.cpp b/paddle/gserver/layers/MKLDNNLRNLayer.cpp
index 741984bb68..ac217f1363 100644
--- a/paddle/gserver/layers/MKLDNNLRNLayer.cpp
+++ b/paddle/gserver/layers/MKLDNNLRNLayer.cpp
@@ -29,7 +29,7 @@ bool MKLDNNLRNLayer::init(const LayerMap& layerMap,
   }

   /* the size of inputs for norm-layer is 1 */
-  CHECK_EQ(config_.inputs_size(), 1UL);
+  CHECK_EQ(config_.inputs_size(), 1);
   const NormConfig& conf = config_.inputs(0).norm_conf();
   localSize_ = conf.size();
   alpha_ = conf.scale();

From 1cb963594736e01c3eab05a88bbf8cbd6d958b1a Mon Sep 17 00:00:00 2001
From: typhoonzero
Date: Wed, 27 Dec 2017 14:09:14 +0800
Subject: [PATCH 075/181] fix dist train transpiler bugs

---
 paddle/operators/send_op.cc                  |  3 +++
 .../paddle/v2/fluid/distribute_transpiler.py |  8 +++---
 python/paddle/v2/fluid/framework.py          |  2 +-
 .../notest_recognize_digits_conv_dist.py     | 26 ++++++++++++-------
 4 files changed, 26 insertions(+), 13 deletions(-)
 rename python/paddle/v2/fluid/tests/{book => book_distribute}/notest_recognize_digits_conv_dist.py (76%)

diff --git a/paddle/operators/send_op.cc b/paddle/operators/send_op.cc
index 6e82938683..317db0867e 100644
--- a/paddle/operators/send_op.cc
+++ b/paddle/operators/send_op.cc
@@ -49,14 +49,17 @@ class SendOp : public framework::OperatorBase {
     std::vector<std::string> epmap = Attr<std::vector<std::string>>("epmap");
     // TODO(typhoonzero): use async calls to send multiple variable asyncly.
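The hunks above all converge on the same two-step pattern: fetch the process-wide singleton with Instance(), then resolve the per-place context with Get(). A minimal sketch of that call pattern, assuming the paddle/platform headers touched in this series; the helper name RunOnPlace is illustrative and not part of any patch:

#include "paddle/platform/device_context.h"

// Illustrative helper (not from the patch): the post-rename lookup pattern.
void RunOnPlace(const paddle::platform::Place& place) {
  // Instance() hands back the process-wide pool, which must have been
  // built once at startup via DeviceContextPool::Init(places).
  paddle::platform::DeviceContextPool& pool =
      paddle::platform::DeviceContextPool::Instance();
  // Get() looks up the context for a single place; the old multi-place
  // Borrow() overload is removed, so callers now loop over places themselves.
  const paddle::platform::DeviceContext& dev_ctx = *pool.Get(place);
  dev_ctx.Wait();  // e.g. block until queued work on that device drains
}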
for (size_t i = 0; i < ins.size(); ++i) { + VLOG(3) << "sending " << ins[i]; bool ret = client_map_[epmap[i]]->SendVariable(scope, ins[i]); if (!ret) { LOG(ERROR) << "send variable error: " << ins[i]; } } + VLOG(3) << "waiting batch "; // TODO(typhoonzero): support async optimization client_map_[epmap[0]]->Wait(); for (size_t i = 0; i < outs.size(); ++i) { + VLOG(3) << "getting " << outs[i]; bool ret = client_map_[epmap[i]]->GetVariable(scope, outs[i]); if (!ret) { LOG(ERROR) << "GetVariable error: " << outs[i]; diff --git a/python/paddle/v2/fluid/distribute_transpiler.py b/python/paddle/v2/fluid/distribute_transpiler.py index 111937f59c..49ece7b725 100644 --- a/python/paddle/v2/fluid/distribute_transpiler.py +++ b/python/paddle/v2/fluid/distribute_transpiler.py @@ -95,7 +95,9 @@ class DistributeTranspiler: """ if program is None: program = default_main_program() + self.program = program self.trainers = trainers + self.optimize_ops = optimize_ops self._optimize_distributed( optimize_ops, program, @@ -156,9 +158,10 @@ class DistributeTranspiler: attrs={"endpoints": pserver_endpoints, "epmap": epmap}) - def get_trainer_program(optimize_ops, program): + def get_trainer_program(self): # remove optimize ops and add a send op to main_program - program.global_block().delete_ops(optimize_ops) + self.program.global_block().delete_ops(self.optimize_ops) + return self.program def _create_var_for_trainers(self, block, var, trainers): var_list = [] @@ -210,7 +213,6 @@ class DistributeTranspiler: if opt_op.inputs.has_key("Grad"): if opt_op.inputs["Grad"].name in grad_var_names: - print "appending ", opt_op.type, opt_op.inputs optimize_sub_program.global_block().append_op( type=opt_op.type, inputs=opt_op.inputs, diff --git a/python/paddle/v2/fluid/framework.py b/python/paddle/v2/fluid/framework.py index add854306e..dbdf9a043c 100644 --- a/python/paddle/v2/fluid/framework.py +++ b/python/paddle/v2/fluid/framework.py @@ -663,7 +663,7 @@ class Block(object): end = list(self.ops).index(ops[-1]) except Exception, e: raise e - self.desc.remove_op(start, end) + self.desc.remove_op(start, end + 1) def prepend_op(self, *args, **kwargs): op_desc = self.desc.prepend_op() diff --git a/python/paddle/v2/fluid/tests/book/notest_recognize_digits_conv_dist.py b/python/paddle/v2/fluid/tests/book_distribute/notest_recognize_digits_conv_dist.py similarity index 76% rename from python/paddle/v2/fluid/tests/book/notest_recognize_digits_conv_dist.py rename to python/paddle/v2/fluid/tests/book_distribute/notest_recognize_digits_conv_dist.py index 2680502efb..20b4a8b34c 100644 --- a/python/paddle/v2/fluid/tests/book/notest_recognize_digits_conv_dist.py +++ b/python/paddle/v2/fluid/tests/book_distribute/notest_recognize_digits_conv_dist.py @@ -38,35 +38,43 @@ train_reader = paddle.batch( place = fluid.CPUPlace() exe = fluid.Executor(place) + t = fluid.DistributeTranspiler() +# all parameter server endpoints list for spliting parameters pserver_endpoints = os.getenv("PSERVERS") +# server endpoint for current node +current_endpoint = os.getenv("SERVER_ENDPOINT") +# run as trainer or parameter server training_role = os.getenv("TRAINING_ROLE", "TRAINER") # get the training role: trainer/pserver -t.transpile(optimize_ops, params_grads, pservers=pserver_endpoints, trainers=1) +t.transpile(optimize_ops, params_grads, pservers=pserver_endpoints, trainers=2) if training_role == "PSERVER": - pserver_prog = t.get_pserver_program(pserver_endpoints, optimize_ops) + if not current_endpoint: + print("need env SERVER_ENDPOINT") + exit(1) + 
pserver_prog = t.get_pserver_program(current_endpoint, optimize_ops) exe.run(fluid.default_startup_program()) exe.run(pserver_prog) elif training_role == "TRAINER": + trainer_prog = t.get_trainer_program() feeder = fluid.DataFeeder(feed_list=[images, label], place=place) exe.run(fluid.default_startup_program()) for pass_id in range(PASS_NUM): accuracy.reset(exe) + batch_id = 0 for data in train_reader(): - loss, acc = exe.run(fluid.default_main_program(), + loss, acc = exe.run(trainer_prog, feed=feeder.feed(data), fetch_list=[avg_cost] + accuracy.metrics) pass_acc = accuracy.eval(exe) - # print loss, acc - if loss < 10.0 and pass_acc > 0.9: - # if avg cost less than 10.0 and accuracy is larger than 0.9, we think our code is good. - exit(0) + if batch_id % 100 == 0: + print("batch_id %d, loss: %f, acc: %f" % + (batch_id, loss, pass_acc)) + batch_id += 1 pass_acc = accuracy.eval(exe) print("pass_id=" + str(pass_id) + " pass_acc=" + str(pass_acc)) else: print("environment var TRAINER_ROLE should be TRAINER os PSERVER") - -exit(1) From 3e703e914a827646e0338e9282343a148dbba2d9 Mon Sep 17 00:00:00 2001 From: typhoonzero Date: Wed, 27 Dec 2017 14:12:51 +0800 Subject: [PATCH 076/181] remove log --- paddle/operators/send_op.cc | 3 --- 1 file changed, 3 deletions(-) diff --git a/paddle/operators/send_op.cc b/paddle/operators/send_op.cc index 317db0867e..6e82938683 100644 --- a/paddle/operators/send_op.cc +++ b/paddle/operators/send_op.cc @@ -49,17 +49,14 @@ class SendOp : public framework::OperatorBase { std::vector epmap = Attr>("epmap"); // TODO(typhoonzero): use async calls to send multiple variable asyncly. for (size_t i = 0; i < ins.size(); ++i) { - VLOG(3) << "sending " << ins[i]; bool ret = client_map_[epmap[i]]->SendVariable(scope, ins[i]); if (!ret) { LOG(ERROR) << "send variable error: " << ins[i]; } } - VLOG(3) << "waiting batch "; // TODO(typhoonzero): support async optimization client_map_[epmap[0]]->Wait(); for (size_t i = 0; i < outs.size(); ++i) { - VLOG(3) << "getting " << outs[i]; bool ret = client_map_[epmap[i]]->GetVariable(scope, outs[i]); if (!ret) { LOG(ERROR) << "GetVariable error: " << outs[i]; From 15309fde2c50a485fd120f749661ea16a6c75232 Mon Sep 17 00:00:00 2001 From: Yang Yu Date: Wed, 27 Dec 2017 14:45:04 +0800 Subject: [PATCH 077/181] Add API for HasNAN HasInf --- paddle/framework/tensor_util.h | 96 ++++++++++++++++++++++++++++++++ paddle/platform/device_context.h | 20 +++++++ paddle/platform/place.h | 28 +++++++++- 3 files changed, 143 insertions(+), 1 deletion(-) diff --git a/paddle/framework/tensor_util.h b/paddle/framework/tensor_util.h index ea4e4f22ea..5c7822814c 100644 --- a/paddle/framework/tensor_util.h +++ b/paddle/framework/tensor_util.h @@ -13,7 +13,10 @@ See the License for the specific language governing permissions and limitations under the License. 
*/
 #pragma once
+#include "paddle/framework/data_type.h"
+#include "paddle/framework/eigen.h"
 #include "paddle/framework/tensor.h"
+#include "paddle/platform/device_context.h"

 namespace paddle {
 namespace framework {
@@ -205,5 +208,98 @@ inline void CopyToVector(const Tensor& src, std::vector<T>* dst) {
            src_ptr, size);
 }

+template <typename Predicate, typename DevCtx>
+struct AnyDTypeVisitor {
+  Predicate predicate_;
+  const Tensor& tensor_;
+  const DevCtx& ctx_;
+  Tensor* out_;
+
+  AnyDTypeVisitor(Predicate predicate, const Tensor& tensor, const DevCtx& ctx,
+                  Tensor* out)
+      : predicate_(predicate), tensor_(tensor), ctx_(ctx), out_(out) {}
+
+  template <typename T>
+  void operator()() const {
+    auto t = EigenVector<T>::Flatten(tensor_);
+    auto o = EigenScalar<bool>::From(*out_);
+    o.device(*ctx_.eigen_device()) = predicate_(t).any();
+  }
+};
+
+template <typename Predicate, typename DevCtx>
+inline void AnyImpl(Predicate predicate, const framework::Tensor& tensor,
+                    const DevCtx& ctx, framework::Tensor* out) {
+  VisitDataType(ToDataType(tensor.type()), AnyDTypeVisitor<Predicate, DevCtx>(
+                                               predicate, tensor, ctx, out));
+}
+
+template <typename Predicate>
+struct AnyVisitor : public boost::static_visitor<bool> {
+  const framework::Tensor& tensor_;
+  Predicate predicate_;
+
+  AnyVisitor(const framework::Tensor& tensor, Predicate predicate)
+      : tensor_(tensor), predicate_(std::move(predicate)) {}
+
+  template <typename Place>
+  bool operator()(const Place& place) const {
+    framework::Tensor out;
+    out.Resize({1});
+    out.mutable_data<bool>(place);
+    auto* ctx = platform::DeviceContextPool::Instance().GetByPlace(place);
+    AnyImpl(predicate_, tensor_, *ctx, &out);
+    return this->GetResult(out, place);
+  }
+
+  bool GetResult(const framework::Tensor& out,
+                 const platform::CUDAPlace& gpu) const {
+    platform::CPUPlace cpu;
+    framework::Tensor tmp;
+    tmp.Resize({1});
+    tmp.mutable_data<bool>(cpu);
+    platform::DeviceContextPool::Instance().Get(gpu)->Wait();
+    CopyFrom(out, cpu, &tmp);
+    platform::DeviceContextPool::Instance().Get(gpu)->Wait();
+    return GetResult(tmp, cpu);
+  }
+
+  bool GetResult(const framework::Tensor& out,
+                 const platform::CPUPlace& cpu) const {
+    return *out.data<bool>();
+  }
+};
+
+template <typename Predicate>
+inline bool Any(const framework::Tensor& tensor, Predicate predicate) {
+  AnyVisitor<Predicate> visitor(tensor, predicate);
+  auto place = tensor.place();
+  return platform::VisitPlace(place, visitor);
+}
+
+struct HasNanPredicate {
+  template <typename T>
+  auto operator()(T eigen_vec) const -> decltype(std::declval<T>().isnan()) {
+    return eigen_vec.isnan();
+  }
+};
+
+inline bool HasNan(const framework::Tensor& tensor) {
+  HasNanPredicate predicate;
+  return Any(tensor, predicate);
+}
+
+struct HasInfPredicate {
+  template <typename T>
+  auto operator()(T eigen_vec) const -> decltype(std::declval<T>().isinf()) {
+    return eigen_vec.isinf();
+  }
+};
+
+inline bool HasInf(const framework::Tensor& tensor) {
+  HasInfPredicate predicate;
+  return Any(tensor, predicate);
+}
+
 } // namespace framework
 } // namespace paddle

diff --git a/paddle/platform/device_context.h b/paddle/platform/device_context.h
index dfef2c16d8..fd441d27f9 100644
--- a/paddle/platform/device_context.h
+++ b/paddle/platform/device_context.h
@@ -52,6 +52,14 @@ class CPUDeviceContext : public DeviceContext {
   std::unique_ptr<Eigen::DefaultDevice> eigen_device_;
 };

+template <typename Place>
+struct DefaultDeviceContextType;
+
+template <>
+struct DefaultDeviceContextType<platform::CPUPlace> {
+  using TYPE = CPUDeviceContext;
+};
+
 #ifdef PADDLE_WITH_CUDA

 class EigenCudaStreamDevice;

@@ -90,6 +98,11 @@ class CUDADeviceContext : public DeviceContext {
   cublasHandle_t cublas_handle_;
 };

+template <>
+struct DefaultDeviceContextType<platform::CUDAPlace> {
+  using T = CUDADeviceContext;
+};
+
 class CUDNNDeviceContext : public
CUDADeviceContext { public: explicit CUDNNDeviceContext(CUDAPlace place); @@ -125,6 +138,13 @@ class DeviceContextPool { /*! \brief Return handle of single device context. */ const platform::DeviceContext* Get(const platform::Place& place); + template + const typename DefaultDeviceContextType::TYPE* GetByPlace( + const Place& place) { + return reinterpret_cast< + const typename DefaultDeviceContextType::TYPE*>(Get(place)); + } + private: static DeviceContextPool* pool; constexpr static int LEFT_SHIFT = 8; diff --git a/paddle/platform/place.h b/paddle/platform/place.h index d25eaa689f..76b5c502cc 100644 --- a/paddle/platform/place.h +++ b/paddle/platform/place.h @@ -15,7 +15,7 @@ limitations under the License. */ #pragma once #include - +#include "paddle/platform/enforce.h" #include "paddle/platform/variant.h" namespace paddle { @@ -64,5 +64,31 @@ bool places_are_same_class(const Place &, const Place &); std::ostream &operator<<(std::ostream &, const Place &); +template +struct PlaceVisitorWrapper + : public boost::static_visitor { + const Visitor &visitor_; + explicit PlaceVisitorWrapper(const Visitor &visitor) : visitor_(visitor) {} + + typename Visitor::result_type operator()(const CPUPlace &cpu) const { + return visitor_(cpu); + } + + typename Visitor::result_type operator()(const CUDAPlace &cuda) const { +#ifdef PADDLE_WITH_CUDA + return visitor_(cuda); +#else + PADDLE_THROW("Paddle is not compiled with CUDA. Cannot visit cuda device"); + return typename Visitor::result_type(); +#endif + } +}; + +template +typename Visitor::result_type VisitPlace(const Place &place, + const Visitor &visitor) { + return boost::apply_visitor(PlaceVisitorWrapper(visitor), place); +} + } // namespace platform } // namespace paddle From 4518252e572c53ff0b1e8ac4149537bb400b80b5 Mon Sep 17 00:00:00 2001 From: Yang Yu Date: Wed, 27 Dec 2017 14:47:08 +0800 Subject: [PATCH 078/181] Fix compile --- paddle/operators/nccl_op_test.cu.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/paddle/operators/nccl_op_test.cu.cc b/paddle/operators/nccl_op_test.cu.cc index 34a6e1a58d..6546096069 100644 --- a/paddle/operators/nccl_op_test.cu.cc +++ b/paddle/operators/nccl_op_test.cu.cc @@ -305,7 +305,7 @@ int main(int argc, char **argv) { } VLOG(0) << " DeviceCount " << count; - paddle::platform::DeviceContextPool::Create(places); + paddle::platform::DeviceContextPool::Init(places); testing::InitGoogleTest(&argc, argv); From b67969622e711475115cf8e624f6fdb7f4c34359 Mon Sep 17 00:00:00 2001 From: Luo Tao Date: Wed, 27 Dec 2017 14:52:14 +0800 Subject: [PATCH 079/181] refine CMakeLists.txt when add op need DEPS --- paddle/operators/CMakeLists.txt | 79 ++++++++++----------------------- 1 file changed, 24 insertions(+), 55 deletions(-) diff --git a/paddle/operators/CMakeLists.txt b/paddle/operators/CMakeLists.txt index 5aaaf99332..038ad859db 100644 --- a/paddle/operators/CMakeLists.txt +++ b/paddle/operators/CMakeLists.txt @@ -1,5 +1,6 @@ file(GLOB GENERAL_OPS RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}" "*_op.cc") string(REPLACE ".cc" "" GENERAL_OPS "${GENERAL_OPS}") +set(DEPS_OPS "") set(pybind_file ${PADDLE_SOURCE_DIR}/paddle/pybind/pybind.h) file(WRITE ${pybind_file} "// Generated by the paddle/operator/CMakeLists.txt. 
DO NOT EDIT!\n\n") function(op_library TARGET) @@ -48,6 +49,11 @@ function(op_library TARGET) message(FATAL_ERROR "The op library ${TARGET} should contains at least one .cc file") endif() + list(LENGTH op_library_DEPS op_library_DEPS_len) + if (${op_library_DEPS_len} GREATER 0) + set(DEPS_OPS ${TARGET} ${DEPS_OPS} PARENT_SCOPE) + endif() + if (WITH_GPU) nv_library(${TARGET} SRCS ${cc_srcs} ${cu_cc_srcs} ${cu_srcs} DEPS ${op_library_DEPS} ${op_common_deps}) @@ -181,55 +187,20 @@ endfunction() add_subdirectory(math) add_subdirectory(nccl) -set(DEPS_OPS - cond_op - cross_entropy_op - recurrent_op - softmax_with_cross_entropy_op - softmax_op - sequence_softmax_op - sum_op - pool_op - maxout_op - unpool_op - pool_with_index_op - conv_op - conv_transpose_op - nccl_op - sequence_conv_op - sequence_pool_op - lod_rank_table_op - lod_tensor_to_array_op - array_to_lod_tensor_op - max_sequence_len_op - lstm_op - tensor_array_read_write_op - gru_op - adagrad_op - sgd_op - save_op - load_op - send_op - recv_op) - if(WITH_DISTRIBUTE) -add_subdirectory(detail) -op_library(send_op SRCS send_op.cc DEPS sendrecvop_grpc grpc++_unsecure grpc_unsecure gpr cares zlib_target protobuf) -set_source_files_properties( - send_op.cc - PROPERTIES - COMPILE_FLAGS "-Wno-non-virtual-dtor -Wno-error=non-virtual-dtor -Wno-error=delete-non-virtual-dtor") - -op_library(recv_op SRCS recv_op.cc DEPS sendrecvop_grpc grpc++_unsecure grpc_unsecure gpr cares zlib_target protobuf) -set_source_files_properties( - recv_op.cc - PROPERTIES - COMPILE_FLAGS "-Wno-non-virtual-dtor -Wno-error=non-virtual-dtor -Wno-error=delete-non-virtual-dtor") - -cc_test(test_send_recv SRCS send_recv_op_test.cc DEPS send_op recv_op sum_op executor) + add_subdirectory(detail) + set(DISTRIBUTE_DEPS sendrecvop_grpc grpc++_unsecure grpc_unsecure gpr cares zlib_target protobuf) + set(DISTRIBUTE_COMPILE_FLAGS "-Wno-non-virtual-dtor -Wno-error=non-virtual-dtor -Wno-error=delete-non-virtual-dtor") + op_library(send_op DEPS ${DISTRIBUTE_DEPS}) + set_source_files_properties(send_op.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) + op_library(recv_op DEPS ${DISTRIBUTE_DEPS}) + set_source_files_properties(recv_op.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) + cc_test(test_send_recv SRCS send_recv_op_test.cc DEPS send_op recv_op sum_op executor) +else() + set(DEPS_OPS ${DEPS_OPS} send_op recv_op) endif() -op_library(cond_op SRCS cond_op.cc DEPS framework_proto tensor operator net_op) +op_library(cond_op DEPS framework_proto tensor net_op) op_library(cross_entropy_op DEPS cross_entropy) op_library(softmax_with_cross_entropy_op DEPS cross_entropy softmax) op_library(softmax_op DEPS softmax) @@ -242,20 +213,19 @@ op_library(pool_op DEPS pooling) op_library(maxout_op DEPS maxouting) op_library(unpool_op DEPS unpooling) op_library(pool_with_index_op DEPS pooling) -op_library(lod_rank_table_op SRCS lod_rank_table_op.cc DEPS lod_rank_table) -op_library(lod_tensor_to_array_op SRCS lod_tensor_to_array_op.cc DEPS lod_rank_table_op) -op_library(array_to_lod_tensor_op SRCS array_to_lod_tensor_op.cc DEPS lod_rank_table_op) -op_library(max_sequence_len_op SRCS max_sequence_len_op.cc DEPS lod_rank_table) -op_library(tensor_array_read_write_op SRCS tensor_array_read_write_op.cc) +op_library(lod_rank_table_op DEPS lod_rank_table) +op_library(lod_tensor_to_array_op DEPS lod_rank_table_op) +op_library(array_to_lod_tensor_op DEPS lod_rank_table_op) +op_library(max_sequence_len_op DEPS lod_rank_table) if(WITH_GPU) -op_library(nccl_op DEPS nccl_common) + 
op_library(nccl_op DEPS nccl_common) endif() op_library(sequence_conv_op DEPS context_project) op_library(sequence_pool_op DEPS sequence_pooling) op_library(lstm_op DEPS sequence2batch lstm_compute) op_library(conv_transpose_op DEPS vol2col) op_library(gru_op DEPS sequence2batch gru_compute) -op_library(recurrent_op SRCS recurrent_op.cc DEPS executor) +op_library(recurrent_op DEPS executor) # FIXME(typhoonzero): save/load depends lodtensor serialization functions op_library(save_op DEPS lod_tensor) @@ -269,13 +239,12 @@ endforeach() set(GLOB_OP_LIB ${OP_LIBRARY} CACHE INTERNAL "Global OP library") - cc_test(gather_test SRCS gather_test.cc DEPS tensor) cc_test(net_op_test SRCS net_op_test.cc DEPS net_op) cc_test(scatter_test SRCS scatter_test.cc DEPS tensor) cc_test(beam_search_decode_op_test SRCS beam_search_decode_op_test.cc DEPS lod_tensor) cc_test(strided_memcpy_test SRCS strided_memcpy_test.cc DEPS tensor paddle_memory) if(WITH_GPU) - cc_test(nccl_op_test SRCS nccl_op_test.cu.cc DEPS nccl_op gpu_info device_context) + cc_test(nccl_op_test SRCS nccl_op_test.cu.cc DEPS nccl_op gpu_info device_context) endif() cc_test(save_load_op_test SRCS save_load_op_test.cc DEPS save_op load_op) From 3d282ec407a518ece37adb1b9ee5da57429a9904 Mon Sep 17 00:00:00 2001 From: Yang Yu Date: Wed, 27 Dec 2017 15:10:48 +0800 Subject: [PATCH 080/181] Add is_nan/is_inf --- paddle/framework/tensor_util.h | 12 +++++++----- paddle/framework/tensor_util_test.cc | 24 ++++++++++++++++++++++++ 2 files changed, 31 insertions(+), 5 deletions(-) diff --git a/paddle/framework/tensor_util.h b/paddle/framework/tensor_util.h index 5c7822814c..7d786ad614 100644 --- a/paddle/framework/tensor_util.h +++ b/paddle/framework/tensor_util.h @@ -277,21 +277,23 @@ inline bool Any(const framework::Tensor& tensor, Predicate predicate) { return platform::VisitPlace(place, visitor); } -struct HasNanPredicate { +struct HasNANPredicate { template - auto operator()(T eigen_vec) const -> decltype(std::declval().isnan()) { + auto operator()(const T& eigen_vec) const + -> decltype(std::declval().isnan()) { return eigen_vec.isnan(); } }; -inline bool HasNan(const framework::Tensor& tensor) { - HasNanPredicate predicate; +inline bool HasNAN(const framework::Tensor& tensor) { + HasNANPredicate predicate; return Any(tensor, predicate); } struct HasInfPredicate { template - auto operator()(T eigen_vec) const -> decltype(std::declval().isinf()) { + auto operator()(const T& eigen_vec) const + -> decltype(std::declval().isinf()) { return eigen_vec.isinf(); } }; diff --git a/paddle/framework/tensor_util_test.cc b/paddle/framework/tensor_util_test.cc index f388c19f28..01dfd4deb9 100644 --- a/paddle/framework/tensor_util_test.cc +++ b/paddle/framework/tensor_util_test.cc @@ -13,6 +13,7 @@ #include "paddle/framework/tensor_util.h" #include +#include #include namespace paddle { @@ -230,5 +231,28 @@ TEST(CopyToVector, Tensor) { #endif } +TEST(IsNAN, CPU) { + using namespace paddle::framework; + using namespace paddle::platform; + Tensor src; + float* buf = src.mutable_data({3}, CPUPlace()); + buf[0] = 0.0; + buf[1] = NAN; + buf[2] = 0.0; + + ASSERT_TRUE(HasNAN(src)); +} + +TEST(IsInf, CPU) { + using namespace paddle::framework; + using namespace paddle::platform; + Tensor src; + double* buf = src.mutable_data({3}, CPUPlace()); + buf[0] = 1.0; + buf[1] = INFINITY; + buf[2] = 0.0; + ASSERT_TRUE(HasInf(src)); +} + } // namespace framework } // namespace paddle From a5291f9ce2466326588792a2e58a5f777c5fc51e Mon Sep 17 00:00:00 2001 From: Yang Yu Date: Wed, 27 Dec 
2017 15:11:17 +0800 Subject: [PATCH 081/181] Fix compile --- paddle/pybind/tensor_py.h | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/paddle/pybind/tensor_py.h b/paddle/pybind/tensor_py.h index 67244d8260..64e981e4e8 100644 --- a/paddle/pybind/tensor_py.h +++ b/paddle/pybind/tensor_py.h @@ -63,9 +63,10 @@ struct CastToPyBufferImpl { auto *dst_ptr = static_cast(dst_tensor.mutable_data( tensor.dims(), platform::CPUPlace())); - platform::DeviceContextPool &pool = platform::DeviceContextPool::Get(); + platform::DeviceContextPool &pool = + platform::DeviceContextPool::Instance; auto dev_ctx = static_cast( - pool.Borrow(tensor.place())); + pool.Get(tensor.place())); paddle::platform::GpuMemcpyAsync( dst_ptr, src_ptr, sizeof(CUR_TYPE) * tensor.numel(), @@ -137,9 +138,9 @@ void PyCUDATensorSetFromArray( self.Resize(framework::make_ddim(dims)); auto *dst = self.mutable_data(place); - platform::DeviceContextPool &pool = platform::DeviceContextPool::Get(); + platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance(); auto dev_ctx = - static_cast(pool.Borrow(place)); + static_cast(pool.Get(place)); paddle::platform::GpuMemcpyAsync(dst, array.data(), sizeof(T) * array.size(), cudaMemcpyHostToDevice, dev_ctx->stream()); } From c67c54a8e7671aa473d6f478a32403a87bdaddc0 Mon Sep 17 00:00:00 2001 From: Yibing Liu Date: Wed, 27 Dec 2017 07:17:37 +0000 Subject: [PATCH 082/181] Polish the doc of cross_entropy --- python/paddle/v2/fluid/layers/nn.py | 55 ++++++++++++++++++++++++++++- 1 file changed, 54 insertions(+), 1 deletion(-) diff --git a/python/paddle/v2/fluid/layers/nn.py b/python/paddle/v2/fluid/layers/nn.py index 2a462ee6cb..b11fd07e7e 100644 --- a/python/paddle/v2/fluid/layers/nn.py +++ b/python/paddle/v2/fluid/layers/nn.py @@ -270,6 +270,7 @@ def gru_unit(input, attr=helper.param_attr, shape=[size, 3 * size], dtype=dtype) # create bias + if bias is None: bias_size = [1, 3 * size] bias = helper.create_parameter( @@ -358,7 +359,59 @@ def cos_sim(X, Y, **kwargs): def cross_entropy(input, label, **kwargs): """ - This function computes cross_entropy using the input and label. + **Cross Entropy Layer** + + This layer computes the cross entropy between `input` and `label`. It supports + both standard cross-entropy and soft-label cross-entropy loss computation. + + 1) One-hot cross-entropy: + `soft_label = false`, `Label[i, 0]` indicates the class index for sample i: + + .. math:: + + Y[i] = -\log(X[i, Label[i]]) + + 2) Soft-label cross-entropy: + `soft_label = true`, `Label[i, j]` indicates the soft label of class j + for sample i: + + .. math:: + + Y[i] = \sum_j{-Label[i, j] * log(X[i, j])} + + Please make sure that in this case the summuation of each row of `label` + equals one. + + 3) One-hot cross-entropy with vecterized `label`: + As a special case of 2), when each row of 'label' has only one + non-zero element (equals 1), soft-label cross-entropy degenerates to a + one-hot cross-entropy with one-hot label representation. + + Args: + input (Variable|list): a 2-D tensor with shape N x D, where N is the + batch size and D is the number of classes. This input is a probability + computed by the previous operator, which is almost always the result + of a softmax operator. + label (Variable|list): the ground truth which is a 2-D tensor. When + `soft_label` is set to `false`, `label` is a tensor with shape + [N x 1]. When `soft_label` is set to `true`, `label` is a + tensor with shape [N x K]. 
+ soft_label (bool, via `**kwargs`): a flag indicating whether to interpretate + the given labels as soft labels, default `false`. + + Returns: + A 2-D tensor with shape [N x 1], the cross entropy loss. + + Raises: + `ValueError`: 1) If the 1st dimension of `input` and `label` are not equal; 2) If + `soft_label == true`, and the 2nd dimension of `input` and `label` are not + equal; 3) If `soft_label == false`, and the 2nd dimension of `label` is not 1. + + Examples: + .. code-block:: python + + predict = fluid.layers.fc(input=net, size=classdim, act='softmax') + cost = fluid.layers.cross_entropy(input=predict, label=label) """ helper = LayerHelper('cross_entropy', **kwargs) out = helper.create_tmp_variable(dtype=input.dtype) From 35c1683e803462d1ae78c49a8c8fb392ff6e2d32 Mon Sep 17 00:00:00 2001 From: dzhwinter Date: Wed, 27 Dec 2017 15:23:14 +0800 Subject: [PATCH 083/181] "refine kernel registrar" (#6998) * "refine kernel registrar" * "refine registrar with multikey" * "fix register" * "refine multikernel register" * "fix CI" * "fix CI" * "fix registry" * "swtich GPU to CUDA" * "add register macro test case" * "fix CI" --- paddle/framework/CMakeLists.txt | 2 +- paddle/framework/library_type.h | 27 +++++++- paddle/framework/op_kernel_type_test.cc | 2 +- paddle/framework/op_registry.h | 16 ++--- paddle/framework/op_registry_test.cc | 82 +++++++++++++++++++++++++ paddle/operators/conv_cudnn_op.cu.cc | 4 ++ 6 files changed, 122 insertions(+), 11 deletions(-) diff --git a/paddle/framework/CMakeLists.txt b/paddle/framework/CMakeLists.txt index 7436e8c228..738684795d 100644 --- a/paddle/framework/CMakeLists.txt +++ b/paddle/framework/CMakeLists.txt @@ -37,7 +37,7 @@ cc_test(operator_test SRCS operator_test.cc DEPS operator op_registry init) cc_library(proto_desc SRCS var_desc.cc op_desc.cc block_desc.cc program_desc.cc DEPS shape_inference op_info operator glog) cc_library(op_registry SRCS op_registry.cc DEPS op_proto_maker op_info operator glog proto_desc) -cc_test(op_registry_test SRCS op_registry_test.cc DEPS op_registry) +nv_test(op_registry_test SRCS op_registry_test.cc DEPS op_registry) py_proto_compile(framework_py_proto SRCS framework.proto) # Generate an empty __init__.py to make framework_py_proto as a valid python module. diff --git a/paddle/framework/library_type.h b/paddle/framework/library_type.h index 6baae6c2bb..7707799cae 100644 --- a/paddle/framework/library_type.h +++ b/paddle/framework/library_type.h @@ -20,7 +20,11 @@ namespace framework { // For more details about the design of LibraryType, Please refer to // https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/operator_kernel_type.md#library -enum class LibraryType { kPlain = 0, kMKLDNN = 1, kCUDNN = 2 }; +enum class LibraryType { + kPlain = 0, + kMKLDNN = 1, + kCUDNN = 2, +}; inline std::string LibraryTypeToString(const LibraryType& library_type) { switch (library_type) { @@ -31,7 +35,26 @@ inline std::string LibraryTypeToString(const LibraryType& library_type) { case LibraryType::kCUDNN: return "CUDNN"; default: - PADDLE_THROW("unknown LibraryType %d", library_type); + PADDLE_THROW("unknown LibraryType %d", static_cast(library_type)); + } +} + +inline LibraryType StringToLibraryType(const char* ctype) { + std::string s(ctype); + if (s == std::string("PLAIN")) { + return LibraryType::kPlain; + } else if (s == std::string("MKLDNN")) { + return LibraryType::kMKLDNN; + } else if (s == std::string("CUDNN")) { + return LibraryType::kCUDNN; + // To be compatible with register macro. 
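StringToLibraryType exists because the REGISTER_OP_KERNEL macro below stringizes its DEVICE_TYPE token, so the plain device names must alias kPlain. A hedged sketch of the OpKernelType key a registrar derives from that token; the free function MakeKernelKey is illustrative only, while the type and call names follow the diffs in this patch:

#include <typeindex>
#include "paddle/framework/op_registry.h"

// Illustrative only: mirrors what OpKernelRegistrarFunctor does with the
// stringized DEVICE_TYPE argument of REGISTER_OP_KERNEL.
template <typename PlaceType, typename T>
paddle::framework::OpKernelType MakeKernelKey(const char* library_type) {
  using namespace paddle::framework;  // NOLINT
  return OpKernelType(ToDataType(std::type_index(typeid(T))), PlaceType(),
                      DataLayout::kAnyLayout,
                      StringToLibraryType(library_type));
}

Under this scheme, MakeKernelKey<platform::CUDAPlace, float>("CUDA") and MakeKernelKey<platform::CUDAPlace, float>("CUDNN") differ only in the library field, which is how conv2d further down can keep a plain CUDA kernel and a CUDNN kernel registered side by side.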
+ // CPU, CUDA, PLAIN are same library type. + } else if (s == std::string("CPU")) { + return LibraryType::kPlain; + } else if (s == std::string("CUDA")) { + return LibraryType::kPlain; + } else { + PADDLE_THROW("Unknown LibraryType %s", s.c_str()); } } diff --git a/paddle/framework/op_kernel_type_test.cc b/paddle/framework/op_kernel_type_test.cc index 8753d7cc37..dd04840500 100644 --- a/paddle/framework/op_kernel_type_test.cc +++ b/paddle/framework/op_kernel_type_test.cc @@ -48,4 +48,4 @@ TEST(OpKernelType, Hash) { OpKernelType::Hash hasher; ASSERT_NE(hasher(op_kernel_type_1), hasher(op_kernel_type_2)); -} \ No newline at end of file +} diff --git a/paddle/framework/op_registry.h b/paddle/framework/op_registry.h index 9bb2a3b5c2..bdaa259181 100644 --- a/paddle/framework/op_registry.h +++ b/paddle/framework/op_registry.h @@ -79,30 +79,31 @@ struct OpKernelRegistrarFunctor { using KERNEL_TYPE = typename std::tuple_element>::type; - void operator()(const char* op_type) const { + void operator()(const char* op_type, const char* library_type) const { using T = typename KERNEL_TYPE::ELEMENT_TYPE; - OpKernelType key(ToDataType(std::type_index(typeid(T))), PlaceType()); + OpKernelType key(ToDataType(std::type_index(typeid(T))), PlaceType(), + DataLayout::kAnyLayout, StringToLibraryType(library_type)); OperatorWithKernel::AllOpKernels()[op_type][key].reset(new KERNEL_TYPE); constexpr auto size = std::tuple_size>::value; OpKernelRegistrarFunctor func; - func(op_type); + func(op_type, library_type); } }; template struct OpKernelRegistrarFunctor { - void operator()(const char* op_type) const {} + void operator()(const char* op_type, const char* library_type) const {} }; // User can register many kernel in one place. The data type could be different. template class OpKernelRegistrar : public Registrar { public: - explicit OpKernelRegistrar(const char* op_type) { + explicit OpKernelRegistrar(const char* op_type, const char* library_type) { OpKernelRegistrarFunctor func; - func(op_type); + func(op_type, library_type); } }; @@ -181,7 +182,8 @@ class OpKernelRegistrar : public Registrar { __reg_op_kernel_##op_type##_##DEVICE_TYPE##__, \ "REGISTER_OP_KERNEL must be called in global namespace"); \ static ::paddle::framework::OpKernelRegistrar \ - __op_kernel_registrar_##op_type##_##DEVICE_TYPE##__(#op_type); \ + __op_kernel_registrar_##op_type##_##DEVICE_TYPE##__(#op_type, \ + #DEVICE_TYPE); \ int TouchOpKernelRegistrar_##op_type##_##DEVICE_TYPE() { \ __op_kernel_registrar_##op_type##_##DEVICE_TYPE##__.Touch(); \ return 0; \ diff --git a/paddle/framework/op_registry_test.cc b/paddle/framework/op_registry_test.cc index 4cdf6e0865..cef530c6e6 100644 --- a/paddle/framework/op_registry_test.cc +++ b/paddle/framework/op_registry_test.cc @@ -1,3 +1,17 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
*/ + #include "paddle/framework/op_registry.h" #include @@ -182,3 +196,71 @@ TEST(OperatorRegistrar, Test) { using namespace paddle::framework; OperatorRegistrar reg("cos"); } + +namespace paddle { +namespace framework { + +class OpKernelTestMaker : public OpProtoAndCheckerMaker { + public: + OpKernelTestMaker(OpProto* proto, OpAttrChecker* op_checker) + : OpProtoAndCheckerMaker(proto, op_checker) { + AddComment("NoGradOp, same input output. no Grad"); + } +}; + +class OpWithKernelTest : public OperatorWithKernel { + public: + using OperatorWithKernel::OperatorWithKernel; + + protected: + void InferShape(InferShapeContext* ctx) const override {} + + framework::OpKernelType GetActualKernelType( + const framework::ExecutionContext& ctx) const override { + return framework::OpKernelType(proto::DataType::FP32, ctx.device_context()); + } +}; + +template +class OpKernelTest : public paddle::framework::OpKernel { + public: + void Compute(const paddle::framework::ExecutionContext& ctx) const {} +}; + +} // namespace framework +} // namespace paddle + +REGISTER_OP_WITHOUT_GRADIENT(op_with_kernel, + paddle::framework::OpWithKernelTest, + paddle::framework::OpKernelTestMaker); +REGISTER_OP_CPU_KERNEL( + op_with_kernel, + paddle::framework::OpKernelTest); + +REGISTER_OP_CUDA_KERNEL(op_with_kernel, + paddle::framework::OpKernelTest< + paddle::platform::CUDADeviceContext, float>); + +TEST(OperatorRegistrar, CPU) { + paddle::framework::proto::OpDesc op_desc; + paddle::platform::CPUPlace cpu_place; + paddle::framework::Scope scope; + + op_desc.set_type("op_with_kernel"); + auto op = paddle::framework::OpRegistry::CreateOp(op_desc); + + op->Run(scope, cpu_place); +} + +#ifdef PADDLE_WITH_CUDA +TEST(OperatorRegistrar, CUDA) { + paddle::framework::proto::OpDesc op_desc; + paddle::platform::CUDAPlace cuda_place(0); + paddle::framework::Scope scope; + + op_desc.set_type("op_with_kernel"); + auto op = paddle::framework::OpRegistry::CreateOp(op_desc); + + op->Run(scope, cuda_place); +} +#endif diff --git a/paddle/operators/conv_cudnn_op.cu.cc b/paddle/operators/conv_cudnn_op.cu.cc index 08ff0db086..0aa7dd48ca 100644 --- a/paddle/operators/conv_cudnn_op.cu.cc +++ b/paddle/operators/conv_cudnn_op.cu.cc @@ -315,6 +315,10 @@ class CudnnConvGradOpKernel : public framework::OpKernel { } // namespace operators } // namespace paddle +REGISTER_OP_KERNEL(conv2d, CUDNN, paddle::platform::CUDAPlace, + paddle::operators::CudnnConvOpKernel, + paddle::operators::CudnnConvOpKernel); + REGISTER_OP_CUDA_KERNEL(conv2d_cudnn, paddle::operators::CudnnConvOpKernel, paddle::operators::CudnnConvOpKernel); From 4177e805453253b472d1cbaea826020cdb3caff9 Mon Sep 17 00:00:00 2001 From: Yibing Liu Date: Wed, 27 Dec 2017 07:43:41 +0000 Subject: [PATCH 084/181] Add line feed character in the doc of cross_entropy --- python/paddle/v2/fluid/layers/nn.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/python/paddle/v2/fluid/layers/nn.py b/python/paddle/v2/fluid/layers/nn.py index b11fd07e7e..6e7145966f 100644 --- a/python/paddle/v2/fluid/layers/nn.py +++ b/python/paddle/v2/fluid/layers/nn.py @@ -403,8 +403,8 @@ def cross_entropy(input, label, **kwargs): A 2-D tensor with shape [N x 1], the cross entropy loss. 
Raises: - `ValueError`: 1) If the 1st dimension of `input` and `label` are not equal; 2) If - `soft_label == true`, and the 2nd dimension of `input` and `label` are not + `ValueError`: 1) If the 1st dimension of `input` and `label` are not equal; 2) If \ + `soft_label == true`, and the 2nd dimension of `input` and `label` are not \ equal; 3) If `soft_label == false`, and the 2nd dimension of `label` is not 1. Examples: From 3ae781eb2bc139a946b7f195183e31304af49822 Mon Sep 17 00:00:00 2001 From: Yang Yu Date: Wed, 27 Dec 2017 15:45:13 +0800 Subject: [PATCH 085/181] Executor check nan --- paddle/framework/executor.cc | 33 +++++++++++++++++++++++++++------ 1 file changed, 27 insertions(+), 6 deletions(-) diff --git a/paddle/framework/executor.cc b/paddle/framework/executor.cc index 997773c168..9ee2ddb7c3 100644 --- a/paddle/framework/executor.cc +++ b/paddle/framework/executor.cc @@ -14,18 +14,17 @@ limitations under the License. */ #include "paddle/framework/executor.h" -#include -#include -#include #include -#include +#include "gflags/gflags.h" #include "paddle/framework/feed_fetch_type.h" #include "paddle/framework/lod_rank_table.h" -#include "paddle/framework/lod_tensor.h" #include "paddle/framework/lod_tensor_array.h" #include "paddle/framework/op_registry.h" -#include "paddle/framework/scope.h" + +DEFINE_bool(check_nan_inf, false, + "Checking whether operator produce NAN/INF or not. It will be " + "extremely slow so please use this flag wisely."); namespace paddle { namespace framework { @@ -58,6 +57,19 @@ static void CreateTensor(Variable* var, proto::VarDesc::VarType var_type) { } } +static void CheckTensorNANOrInf(const std::string& name, + const framework::Tensor& tensor) { + if (tensor.type().hash_code() != typeid(float).hash_code() && + tensor.type().hash_code() != typeid(double).hash_code()) { + return; + } + if (tensor.memory_size() == 0) { + return; + } + PADDLE_ENFORCE(!framework::HasInf(tensor), "Tensor %s has Inf", name); + PADDLE_ENFORCE(!framework::HasNAN(tensor), "Tensor %s has NAN", name); +} + void Executor::Run(const ProgramDesc& pdesc, Scope* scope, int block_id, bool create_local_scope, bool create_vars) { // TODO(tonyyang-svail): @@ -101,6 +113,15 @@ void Executor::Run(const ProgramDesc& pdesc, Scope* scope, int block_id, auto op = paddle::framework::OpRegistry::CreateOp(*op_desc); VLOG(3) << op->DebugString(); op->Run(*local_scope, place_); + if (FLAGS_check_nan_inf) { + for (auto& vname : op->OutputVars(true)) { + auto* var = local_scope->FindVar(vname); + if (var == nullptr) continue; + if (var->IsType()) { + CheckTensorNANOrInf(vname, var->Get()); + } + } + } } if (create_local_scope) { scope->DeleteScope(local_scope); From 16a84328c6947f224534cbd5e3218714adfb9e9b Mon Sep 17 00:00:00 2001 From: Yang Yu Date: Wed, 27 Dec 2017 15:47:18 +0800 Subject: [PATCH 086/181] Fix compile --- paddle/pybind/tensor_py.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/paddle/pybind/tensor_py.h b/paddle/pybind/tensor_py.h index 64e981e4e8..4d5e73e2c2 100644 --- a/paddle/pybind/tensor_py.h +++ b/paddle/pybind/tensor_py.h @@ -64,7 +64,7 @@ struct CastToPyBufferImpl { tensor.dims(), platform::CPUPlace())); platform::DeviceContextPool &pool = - platform::DeviceContextPool::Instance; + platform::DeviceContextPool::Instance(); auto dev_ctx = static_cast( pool.Get(tensor.place())); From 5162c41a9209da9daf5c440396ac3fbd516f16e7 Mon Sep 17 00:00:00 2001 From: Yang Yu Date: Wed, 27 Dec 2017 16:02:28 +0800 Subject: [PATCH 087/181] Add gflags --- 
python/paddle/v2/fluid/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/paddle/v2/fluid/__init__.py b/python/paddle/v2/fluid/__init__.py index c72b573069..225b41c504 100644 --- a/python/paddle/v2/fluid/__init__.py +++ b/python/paddle/v2/fluid/__init__.py @@ -36,7 +36,7 @@ def __read_gflags_from_env__(): """ import sys import core - read_env_flags = ['use_pinned_memory'] + read_env_flags = ['use_pinned_memory', 'check_nan_inf'] if core.is_compile_gpu(): read_env_flags.append('fraction_of_gpu_memory_to_use') core.init_gflags([sys.argv[0]] + From b654e6f7c8151a733a20ce75439a529664770c50 Mon Sep 17 00:00:00 2001 From: Luo Tao Date: Wed, 27 Dec 2017 17:06:28 +0800 Subject: [PATCH 088/181] fix nccl cmake error in ONLY_CPU mode --- paddle/operators/CMakeLists.txt | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/paddle/operators/CMakeLists.txt b/paddle/operators/CMakeLists.txt index 038ad859db..3e686b1c41 100644 --- a/paddle/operators/CMakeLists.txt +++ b/paddle/operators/CMakeLists.txt @@ -187,6 +187,12 @@ endfunction() add_subdirectory(math) add_subdirectory(nccl) +if(WITH_GPU) + op_library(nccl_op DEPS nccl_common) +else() + set(DEPS_OPS ${DEPS_OPS} nccl_op) +endif() + if(WITH_DISTRIBUTE) add_subdirectory(detail) set(DISTRIBUTE_DEPS sendrecvop_grpc grpc++_unsecure grpc_unsecure gpr cares zlib_target protobuf) @@ -217,16 +223,12 @@ op_library(lod_rank_table_op DEPS lod_rank_table) op_library(lod_tensor_to_array_op DEPS lod_rank_table_op) op_library(array_to_lod_tensor_op DEPS lod_rank_table_op) op_library(max_sequence_len_op DEPS lod_rank_table) -if(WITH_GPU) - op_library(nccl_op DEPS nccl_common) -endif() op_library(sequence_conv_op DEPS context_project) op_library(sequence_pool_op DEPS sequence_pooling) op_library(lstm_op DEPS sequence2batch lstm_compute) op_library(conv_transpose_op DEPS vol2col) op_library(gru_op DEPS sequence2batch gru_compute) op_library(recurrent_op DEPS executor) - # FIXME(typhoonzero): save/load depends lodtensor serialization functions op_library(save_op DEPS lod_tensor) op_library(load_op DEPS lod_tensor) From e2c2652fc046150f45bfe8ac481b3bbcbd1d0c5a Mon Sep 17 00:00:00 2001 From: Yibing Liu Date: Wed, 27 Dec 2017 09:13:23 +0000 Subject: [PATCH 089/181] amend comments in cross_entropy_op --- paddle/operators/cross_entropy_op.cc | 6 +++--- python/paddle/v2/fluid/layers/nn.py | 26 +++++++++++++------------- 2 files changed, 16 insertions(+), 16 deletions(-) diff --git a/paddle/operators/cross_entropy_op.cc b/paddle/operators/cross_entropy_op.cc index a9c5c7046f..fe39cb481a 100644 --- a/paddle/operators/cross_entropy_op.cc +++ b/paddle/operators/cross_entropy_op.cc @@ -114,15 +114,15 @@ class CrossEntropyOpMaker : public framework::OpProtoAndCheckerMaker { CrossEntropyOpMaker(OpProto* proto, OpAttrChecker* op_checker) : OpProtoAndCheckerMaker(proto, op_checker) { AddInput("X", - "(Tensor, default Tensor), a 2-D tensor with shape N x D, " - "where N is the batch size and D is the number of classes. " + "(Tensor, default Tensor), a 2-D tensor with shape [N x D]," + " where N is the batch size and D is the number of classes. " "This input is a probability computed by the previous operator, " "which is almost always the result of a softmax operator."); AddInput("Label", "(Tensor), the ground truth which is a 2-D tensor. When " "soft_label is set to false, Label is a Tensor with shape " "[N x 1]. 
When soft_label is set to true, Label is a " - "Tensor with shape [N x K]."); + "Tensor with shape [N x D]."); AddOutput("Y", "(Tensor, default Tensor), a 2-D tensor with shape " "[N x 1]. The cross entropy loss."); diff --git a/python/paddle/v2/fluid/layers/nn.py b/python/paddle/v2/fluid/layers/nn.py index 6e7145966f..26180c38c8 100644 --- a/python/paddle/v2/fluid/layers/nn.py +++ b/python/paddle/v2/fluid/layers/nn.py @@ -365,47 +365,47 @@ def cross_entropy(input, label, **kwargs): both standard cross-entropy and soft-label cross-entropy loss computation. 1) One-hot cross-entropy: - `soft_label = false`, `Label[i, 0]` indicates the class index for sample i: + `soft_label = False`, `Label[i, 0]` indicates the class index for sample i: .. math:: Y[i] = -\log(X[i, Label[i]]) 2) Soft-label cross-entropy: - `soft_label = true`, `Label[i, j]` indicates the soft label of class j + `soft_label = True`, `Label[i, j]` indicates the soft label of class j for sample i: .. math:: Y[i] = \sum_j{-Label[i, j] * log(X[i, j])} - Please make sure that in this case the summuation of each row of `label` + Please make sure that in this case the summation of each row of `label` equals one. 3) One-hot cross-entropy with vectorized `label`: As a special case of 2), when each row of 'label' has only one - non-zero element (equals 1), soft-label cross-entropy degenerates to a - one-hot cross-entropy with one-hot label representation. + non-zero element which is equal to 1, soft-label cross-entropy degenerates + to a one-hot cross-entropy with one-hot label representation. Args: - input (Variable|list): a 2-D tensor with shape N x D, where N is the + input (Variable|list): a 2-D tensor with shape [N x D], where N is the batch size and D is the number of classes. This input is a probability computed by the previous operator, which is almost always the result of a softmax operator. label (Variable|list): the ground truth which is a 2-D tensor. When - `soft_label` is set to `false`, `label` is a tensor with shape - [N x 1]. When `soft_label` is set to `true`, `label` is a - tensor with shape [N x K]. + `soft_label` is set to `False`, `label` is a tensor with shape + [N x 1]. When `soft_label` is set to `True`, `label` is a + tensor with shape [N x D]. soft_label (bool, via `**kwargs`): a flag indicating whether to interpret - the given labels as soft labels, default `false`. + the given labels as soft labels, default `False`. Returns: A 2-D tensor with shape [N x 1], the cross entropy loss. Raises: - `ValueError`: 1) If the 1st dimension of `input` and `label` are not equal; 2) If \ - `soft_label == true`, and the 2nd dimension of `input` and `label` are not \ - equal; 3) If `soft_label == false`, and the 2nd dimension of `label` is not 1. + `ValueError`: 1) the 1st dimension of `input` and `label` are not equal; 2) when \ + `soft_label == True`, and the 2nd dimension of `input` and `label` are not \ equal; 3) when `soft_label == False`, and the 2nd dimension of `label` is not 1. Examples: ..
code-block:: python From 2f76932d7cee1aa0d32ff2ad6b195b9de678635a Mon Sep 17 00:00:00 2001 From: fengjiayi Date: Wed, 27 Dec 2017 17:22:49 +0800 Subject: [PATCH 090/181] enhance DataFeeder --- python/paddle/v2/fluid/data_feeder.py | 8 ++++++-- python/paddle/v2/fluid/io.py | 2 +- 2 files changed, 7 insertions(+), 3 deletions(-) diff --git a/python/paddle/v2/fluid/data_feeder.py b/python/paddle/v2/fluid/data_feeder.py index 30a542af21..24036c3e75 100644 --- a/python/paddle/v2/fluid/data_feeder.py +++ b/python/paddle/v2/fluid/data_feeder.py @@ -3,7 +3,7 @@ import core import numpy import six.moves as six -from framework import Variable +from framework import Variable, default_main_program __all__ = ['DataFeeder'] @@ -53,12 +53,16 @@ class DataToLoDTensorConverter(object): class DataFeeder(object): - def __init__(self, feed_list, place): + def __init__(self, feed_list, place, program=None): self.feed_dtypes = [] self.feed_names = [] self.feed_shapes = [] self.feed_lod_level = [] + if program is None: + program = default_main_program() for each_var in feed_list: + if isinstance(each_var, basestring): + each_var = program.block(0).var(each_var) if not isinstance(each_var, Variable): raise TypeError("Feed list should contain a list of variable") self.feed_dtypes.append(each_var.dtype) diff --git a/python/paddle/v2/fluid/io.py b/python/paddle/v2/fluid/io.py index 69a732fc45..c47ce82aba 100644 --- a/python/paddle/v2/fluid/io.py +++ b/python/paddle/v2/fluid/io.py @@ -188,7 +188,7 @@ def save_inference_model(dirname, raise ValueError("'feed_var_names' should be a list of str.") if isinstance(target_vars, Variable): - feeded_var_names = [feeded_var_names] + target_vars = [target_vars] else: if not (bool(target_vars) and all( isinstance(var, Variable) for var in target_vars)): From 19367389c0f2245669e1d05afaa9e6cdd19022a0 Mon Sep 17 00:00:00 2001 From: qingqing01 Date: Tue, 26 Dec 2017 18:59:53 -0800 Subject: [PATCH 091/181] Update the CUDA kernel. 
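The change below folds RowwiseAddKernel's per-element div/mod pair into a multiply by a precomputed reciprocal. A NumPy sketch of the semantics the kernel has to preserve, handy as a cross-check (the function name is illustrative only, not part of the patch):

    import numpy as np

    def rowwise_add_ref(a, b):
        # a: [height, width] input, b: [width] vector; the kernel's flattened
        # form is c[i] = a[i] + b[i % width] for i in range(height * width).
        assert a.ndim == 2 and b.shape == (a.shape[1],)
        return a + b[np.newaxis, :]

The updated kernel recovers the row index as h = i * (1.0 / width) truncated to int and the column as w = i - h * width, trading an integer division for a float multiply.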
--- paddle/operators/math/math_function.cu | 20 +++++++++++--------- 1 file changed, 11 insertions(+), 9 deletions(-) diff --git a/paddle/operators/math/math_function.cu b/paddle/operators/math/math_function.cu index 36e6cc8914..d47a7f818d 100644 --- a/paddle/operators/math/math_function.cu +++ b/paddle/operators/math/math_function.cu @@ -274,15 +274,14 @@ void set_constant_with_place( } template -__global__ void RowwiseAddKernel(const T* a, const T* b, T* c, int64_t height, - int64_t width) { - int64_t num = height * width; +__global__ void RowwiseAddKernel(const T* a, const T* b, T* c, int width, + int num) { + T tmp = 1.0 / width; for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < num; i += blockDim.x * gridDim.x) { - int h = i / width; - int w = i % width; - int idx = h * width + w; - c[idx] = a[idx] + b[w]; + int h = i * tmp; + int w = i - h * width; + c[i] = a[i] + b[w]; } } @@ -292,11 +291,14 @@ struct RowwiseAdd { const framework::Tensor& input, const framework::Tensor& vector, framework::Tensor* output) { auto in_dims = input.dims(); + auto size = input.numel() / in_dims[0]; + PADDLE_ENFORCE_EQ(vector.numel(), size); + PADDLE_ENFORCE_EQ(output->dims(), in_dims); int blocks = 512; int grids = (input.numel() + blocks - 1) / blocks; RowwiseAddKernel<<>>( - input.data(), vector.data(), output->data(), in_dims[0], - in_dims[1]); + input.data(), vector.data(), output->data(), + static_cast(in_dims[1]), static_cast(input.numel())); } }; From 7ac00dd684b025a8b1ea6a34a4cdf39ce7fd792e Mon Sep 17 00:00:00 2001 From: chengduoZH Date: Wed, 27 Dec 2017 15:23:49 +0800 Subject: [PATCH 092/181] refine --- paddle/operators/cos_sim_op.cc | 38 +++++++++ paddle/operators/cos_sim_op.cu | 45 +++++++++++ paddle/operators/cos_sim_op.h | 137 ++++++++++++++------------------- 3 files changed, 142 insertions(+), 78 deletions(-) diff --git a/paddle/operators/cos_sim_op.cc b/paddle/operators/cos_sim_op.cc index 440c427cba..ab9cf745e3 100644 --- a/paddle/operators/cos_sim_op.cc +++ b/paddle/operators/cos_sim_op.cc @@ -149,6 +149,44 @@ class CosSimOpGrad : public framework::OperatorWithKernel { } }; +template +struct CosSimDyFunctor { + CosSimDyFunctor(const T* x_norm, const T* y_norm, const T* x, const T* y, + const T* z, const T* dz, T* dy, int cols) + : x_norm_(x_norm), + y_norm_(y_norm), + x_(x), + y_(y), + z_(z), + dz_(dz), + dy_(dy), + cols_(static_cast(cols)) {} + + inline void operator()(size_t offset) const { + auto xy_norm_prod = x_norm_[offset] * y_norm_[0]; + auto dz = dz_[offset]; + auto z = z_[offset]; + auto* x = x_ + cols_ * offset; + auto reciprocal_xy_norm_prod = 1 / xy_norm_prod; + + auto y_norm_square = y_norm_[0] * y_norm_[0]; + auto reciprocal_y_norm_square = 1 / y_norm_square; + for (size_t i = 0; i < cols_; ++i) { + dy_[i] += dz * (x[i] * reciprocal_xy_norm_prod - + z * y_[i] * reciprocal_y_norm_square); + } + } + + const T* x_norm_; + const T* y_norm_; + const T* x_; + const T* y_; + const T* z_; + const T* dz_; + T* dy_; + const size_t cols_; +}; + } // namespace operators } // namespace paddle diff --git a/paddle/operators/cos_sim_op.cu b/paddle/operators/cos_sim_op.cu index 1cb01f5945..eacac68bac 100644 --- a/paddle/operators/cos_sim_op.cu +++ b/paddle/operators/cos_sim_op.cu @@ -15,6 +15,51 @@ #define EIGEN_USE_GPU #include "paddle/operators/cos_sim_op.h" +namespace paddle { +namespace operators { + +template +struct CosSimDyFunctor { + CosSimDyFunctor(const T* x_norm, const T* y_norm, const T* x, const T* y, + const T* z, const T* dz, T* dy, int cols) + : x_norm_(x_norm), + 
y_norm_(y_norm), + x_(x), + y_(y), + z_(z), + dz_(dz), + dy_(dy), + cols_(static_cast<size_t>(cols)) {} + + inline void operator()(size_t offset) const { + auto xy_norm_prod = x_norm_[offset] * y_norm_[0]; + auto dz = dz_[offset]; + auto z = z_[offset]; + auto* x = x_ + cols_ * offset; + auto reciprocal_xy_norm_prod = 1 / xy_norm_prod; + + auto y_norm_square = y_norm_[0] * y_norm_[0]; + auto reciprocal_y_norm_square = 1 / y_norm_square; + for (size_t i = 0; i < cols_; ++i) { + T dy = dz * (x[i] * reciprocal_xy_norm_prod - + z * y_[i] * reciprocal_y_norm_square); + paddle::paddleAtomicAdd(dy_ + i, dy); + } + } + + const T* x_norm_; + const T* y_norm_; + const T* x_; + const T* y_; + const T* z_; + const T* dz_; + T* dy_; + const size_t cols_; +}; + +} // namespace operators +} // namespace paddle + namespace ops = paddle::operators; REGISTER_OP_CUDA_KERNEL( cos_sim, ops::CosSimKernel<paddle::platform::CUDADeviceContext, float>); diff --git a/paddle/operators/cos_sim_op.h b/paddle/operators/cos_sim_op.h index cd5c703c30..8b2a06a41b 100644 --- a/paddle/operators/cos_sim_op.h +++ b/paddle/operators/cos_sim_op.h @@ -21,10 +21,17 @@ namespace operators { using Tensor = framework::Tensor; -template <typename IT1, typename IT2, typename Callback> -static void ForEachZip(IT1 begin1, IT1 last1, IT2 begin2, Callback callback) { - for (; begin1 < last1; ++begin1, ++begin2) { - callback(*begin1, *begin2); +template <typename T> +struct CosSimDyFunctor { + CosSimDyFunctor(const T* x_norm, const T* y_norm, const T* x, const T* y, + const T* z, const T* dz, T* dy, int cols); + inline void operator()(size_t) const; +}; + +template <typename Callback> +static void ForEachZip(size_t num, Callback callback) { + for (size_t i = 0; i < num; ++i) { + callback(i); } } @@ -38,16 +45,11 @@ struct CosSimFunctor { z_(z), cols_(static_cast<size_t>(cols)) {} - inline void operator()(T& x_norm, T& y_norm) const { - size_t x_offset = &x_norm - x_norm_; - size_t y_offset = &y_norm - y_norm_; - - auto* x = x_ + cols_ * x_offset; - - T xx = 0, xy = 0; - T yy = 0; + inline HOSTDEVICE void operator()(size_t offset) const { + auto* x = x_ + cols_ * offset; + T xx = 0, xy = 0, yy = 0; if (same_row) { - auto* y = y_ + cols_ * y_offset; + auto* y = y_ + cols_ * offset; for (size_t i = 0; i < cols_; ++i) { xx += x[i] * x[i]; yy += y[i] * y[i]; @@ -55,21 +57,20 @@ struct CosSimFunctor { } xx = sqrt(xx); yy = sqrt(yy); - x_norm_[x_offset] = xx; - y_norm_[y_offset] = yy; - z_[x_offset] = xy / (xx * yy); + y_norm_[offset] = yy; + x_norm_[offset] = xx; + z_[offset] = xy / (xx * yy); } else { // This can be written in a better way.
- auto* y = y_; for (size_t i = 0; i < cols_; ++i) { xx += x[i] * x[i]; - yy += y[i] * y[i]; // only need - xy += x[i] * y[i]; + yy += y_[i] * y_[i]; // only need + xy += x[i] * y_[i]; } xx = sqrt(xx); yy = sqrt(yy); - x_norm_[x_offset] = xx; y_norm_[0] = yy; - z_[x_offset] = xy / (xx * yy); + x_norm_[offset] = xx; + z_[offset] = xy / (xx * yy); } } @@ -104,14 +105,12 @@ class CosSimKernel : public framework::OpKernel { CosSimFunctor functor( in_x->data(), in_y->data(), out_x_norm->data(), out_y_norm->data(), out_z->data(), cols); - ForEachZip(out_x_norm->data(), out_x_norm->data() + rows_x, - out_y_norm->data(), functor); + ForEachZip(rows_x, functor); } else { CosSimFunctor functor( in_x->data(), in_y->data(), out_x_norm->data(), out_y_norm->data(), out_z->data(), cols); - ForEachZip(out_x_norm->data(), out_x_norm->data() + rows_x, - out_y_norm->data(), functor); + ForEachZip(rows_x, functor); } } }; @@ -129,19 +128,15 @@ struct CosSimGradFunctor { dx_(dx), cols_(static_cast(cols)) {} - inline void operator()(const T& x_norm, const T& y_norm) const { - size_t x_offset = &x_norm - x_norm_; - size_t y_offset = &y_norm - y_norm_; + inline HOSTDEVICE void operator()(size_t offset) const { + auto x_norm_square = x_norm_[offset] * x_norm_[offset]; + auto xy_norm_prod = x_norm_[offset] * y_norm_[offset]; + auto dz = dz_[offset]; + auto z = z_[offset]; - auto x_norm_square = x_norm_[x_offset] * x_norm_[x_offset]; - auto xy_norm_prod = x_norm_[x_offset] * y_norm_[y_offset]; - auto dz = dz_[x_offset]; - auto z = z_[x_offset]; - - auto* dx = dx_ + cols_ * x_offset; - auto* x = x_ + cols_ * x_offset; - - auto* y = y_ + cols_ * y_offset; + auto* dx = dx_ + cols_ * offset; + auto* x = x_ + cols_ * offset; + auto* y = y_ + cols_ * offset; auto reciprocal_xy_norm_prod = 1 / xy_norm_prod; auto reciprocal_x_norm_square = 1 / x_norm_square; @@ -161,10 +156,10 @@ struct CosSimGradFunctor { const size_t cols_; }; -template +template struct CosSimDxFunctor { CosSimDxFunctor(const T* x_norm, const T* y_norm, const T* x, const T* y, - const T* z, const T* dz, T* dx, T* dy, int cols) + const T* z, const T* dz, T* dx, int cols) : x_norm_(x_norm), y_norm_(y_norm), x_(x), @@ -172,37 +167,23 @@ struct CosSimDxFunctor { z_(z), dz_(dz), dx_(dx), - dy_(dy), cols_(static_cast(cols)) {} - inline void operator()(const T& x_norm, const T& y_norm) const { - size_t x_offset = &x_norm - x_norm_; - - auto xy_norm_prod = x_norm_[x_offset] * y_norm_[0]; - auto dz = dz_[x_offset]; - auto z = z_[x_offset]; - auto* x = x_ + cols_ * x_offset; + inline HOSTDEVICE void operator()(size_t offset) const { + auto xy_norm_prod = x_norm_[offset] * y_norm_[0]; + auto dz = dz_[offset]; + auto z = z_[offset]; + auto* x = x_ + cols_ * offset; auto reciprocal_xy_norm_prod = 1 / xy_norm_prod; + auto x_norm_square = x_norm_[offset] * x_norm_[offset]; + auto* dx = dx_ + cols_ * offset; + auto reciprocal_x_norm_square = 1 / x_norm_square; - if (Dx) { - auto x_norm_square = x_norm_[x_offset] * x_norm_[x_offset]; - auto* dx = dx_ + cols_ * x_offset; - auto* x = x_ + cols_ * x_offset; - auto reciprocal_x_norm_square = 1 / x_norm_square; - for (size_t i = 0; i < cols_; ++i) { - dx[i] = dz * (y_[i] * reciprocal_xy_norm_prod - - z * x[i] * reciprocal_x_norm_square); - } - } else { - auto y_norm_square = y_norm_[0] * y_norm_[0]; - auto reciprocal_y_norm_square = 1 / y_norm_square; - for (size_t i = 0; i < cols_; ++i) { - dy_[i] += dz * (x[i] * reciprocal_xy_norm_prod - - z * y_[i] * reciprocal_y_norm_square); - } + for (size_t i = 0; i < cols_; ++i) { + 
dx[i] = dz * (y_[i] * reciprocal_xy_norm_prod - + z * x[i] * reciprocal_x_norm_square); } } - const T* x_norm_; const T* y_norm_; const T* x_; @@ -210,7 +191,6 @@ struct CosSimDxFunctor { const T* z_; const T* dz_; T* dx_; - T* dy_; const size_t cols_; }; @@ -239,33 +219,34 @@ class CosSimGradKernel : public framework::OpKernel<T> { in_x_norm->data<T>(), in_y_norm->data<T>(), in_x->data<T>(), in_y->data<T>(), in_z->data<T>(), in_grad_z->data<T>(), out_grad_x->mutable_data<T>(context.GetPlace()), cols); - ForEachZip(in_x_norm->data<T>(), in_x_norm->data<T>() + rows_x, - in_y_norm->data<T>(), functor); + ForEachZip(rows_x, functor); } if (out_grad_y) { CosSimGradFunctor<T> functor( in_y_norm->data<T>(), in_x_norm->data<T>(), in_y->data<T>(), in_x->data<T>(), in_z->data<T>(), in_grad_z->data<T>(), out_grad_y->mutable_data<T>(context.GetPlace()), cols); - ForEachZip(in_y_norm->data<T>(), in_y_norm->data<T>() + rows_x, - in_x_norm->data<T>(), functor); + ForEachZip(rows_x, functor); } } else { if (out_grad_x) { - CosSimDxFunctor<T, true> functor( + CosSimDxFunctor<T> functor( in_x_norm->data<T>(), in_y_norm->data<T>(), in_x->data<T>(), in_y->data<T>(), in_z->data<T>(), in_grad_z->data<T>(), - out_grad_x->mutable_data<T>(context.GetPlace()), nullptr, cols); + out_grad_x->mutable_data<T>(context.GetPlace()), cols); - ForEachZip(in_x_norm->data<T>(), in_x_norm->data<T>() + rows_x, - in_y_norm->data<T>(), functor); + ForEachZip(rows_x, functor); } if (out_grad_y) { - CosSimDxFunctor<T, false> functor( + out_grad_y->mutable_data<T>(context.GetPlace()); + math::SetConstant<DeviceContext, T> set_zero; + auto& dev_ctx = context.template device_context<DeviceContext>(); + set_zero(dev_ctx, out_grad_y, static_cast<T>(0)); + + CosSimDyFunctor<T> functor( in_x_norm->data<T>(), in_y_norm->data<T>(), in_x->data<T>(), - in_y->data<T>(), in_z->data<T>(), in_grad_z->data<T>(), nullptr, - out_grad_y->mutable_data<T>(context.GetPlace()), cols); + in_y->data<T>(), in_z->data<T>(), in_grad_z->data<T>(), + out_grad_y->data<T>(), cols); - ForEachZip(in_x_norm->data<T>(), in_x_norm->data<T>() + rows_x, - in_y_norm->data<T>(), functor); + ForEachZip(rows_x, functor); } } } From a04f30e7cf777964221f26eef2cf4a837c80f622 Mon Sep 17 00:00:00 2001 From: fengjiayi Date: Wed, 27 Dec 2017 17:51:59 +0800 Subject: [PATCH 093/181] move ENFORCE position --- paddle/operators/conv_op.cc | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/paddle/operators/conv_op.cc b/paddle/operators/conv_op.cc index ab52a41b53..e65a5dce52 100644 --- a/paddle/operators/conv_op.cc +++ b/paddle/operators/conv_op.cc @@ -31,8 +31,6 @@ void ConvOp::InferShape(framework::InferShapeContext* ctx) const { std::vector<int> paddings = ctx->Attrs().Get<std::vector<int>>("paddings"); int groups = ctx->Attrs().Get<int>("groups"); std::vector<int> dilations = ctx->Attrs().Get<std::vector<int>>("dilations"); - int input_channels = in_dims[1]; - int output_channels = filter_dims[0]; PADDLE_ENFORCE(in_dims.size() == 4 || in_dims.size() == 5, "Conv input should be 4-D or 5-D tensor."); @@ -45,9 +43,13 @@ void ConvOp::InferShape(framework::InferShapeContext* ctx) const { PADDLE_ENFORCE_EQ( paddings.size(), strides.size(), "Conv paddings dimension and Conv strides dimension should be the same."); + + int input_channels = in_dims[1]; PADDLE_ENFORCE_EQ(input_channels, filter_dims[1] * groups, "The number of input channels should be equal to filter " "channels * groups."); + + int output_channels = filter_dims[0]; PADDLE_ENFORCE_EQ( output_channels % groups, 0, "The number of output channels should be divisible by groups."); From e883e985d6dcc764d6180905246a818987ad45d1 Mon Sep 17 00:00:00 2001 From: Yibing Liu Date: Wed, 27 Dec 2017 11:04:56 +0000 Subject: [PATCH 094/181] Remove duplicated reshape_op's
doc on html --- doc/api/v2/fluid/layers.rst | 6 ------ 1 file changed, 6 deletions(-) diff --git a/doc/api/v2/fluid/layers.rst b/doc/api/v2/fluid/layers.rst index 939731c0f3..004ee2d8c8 100644 --- a/doc/api/v2/fluid/layers.rst +++ b/doc/api/v2/fluid/layers.rst @@ -68,12 +68,6 @@ scale :noindex: -reshape ---------- -.. autofunction:: paddle.v2.fluid.layers.reshape - :noindex: - - transpose --------- .. autofunction:: paddle.v2.fluid.layers.transpose From b222ddcab7a6b24e4ea5780f00bde8eb15d512cb Mon Sep 17 00:00:00 2001 From: Yibing Liu Date: Wed, 27 Dec 2017 11:06:16 +0000 Subject: [PATCH 095/181] Add sub_seq's doc on html --- doc/api/v2/config/layer.rst | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/doc/api/v2/config/layer.rst b/doc/api/v2/config/layer.rst index d81481ca81..ddf0b055a9 100644 --- a/doc/api/v2/config/layer.rst +++ b/doc/api/v2/config/layer.rst @@ -252,6 +252,11 @@ first_seq .. autoclass:: paddle.v2.layer.first_seq :noindex: +sub_seq +--------- +.. autoclass:: paddle.v2.layer.sub_seq + :noindex: + concat ------ .. autoclass:: paddle.v2.layer.concat From 7aed7eb5398fe3ebe447203fb05d173ac6642d13 Mon Sep 17 00:00:00 2001 From: QI JUN Date: Wed, 27 Dec 2017 20:11:43 +0800 Subject: [PATCH 096/181] cache memory in local scope (#7058) * add KernelTypeToString interface * cache memory in local scope * fix typo * refine trans logic --- paddle/framework/data_transform.h | 14 +++--- paddle/framework/op_kernel_type.h | 8 ++++ paddle/framework/op_kernel_type_test.cc | 4 +- paddle/framework/operator.cc | 64 +++++++++++++++---------- 4 files changed, 55 insertions(+), 35 deletions(-) diff --git a/paddle/framework/data_transform.h b/paddle/framework/data_transform.h index 73f894a3e2..2191dd3783 100644 --- a/paddle/framework/data_transform.h +++ b/paddle/framework/data_transform.h @@ -27,7 +27,7 @@ limitations under the License. 
*/ namespace paddle { namespace framework { -using DataTransformFN = +using DataTransformFn = std::function ctx, const Variable& in, Variable* out)>; using KernelTypePair = std::pair; @@ -47,7 +47,7 @@ struct KernelTypePairHash { }; using DataTransformMap = - std::unordered_map; + std::unordered_map; class DataTransformFnMap { public: @@ -58,25 +58,25 @@ class DataTransformFnMap { } void Insert(const OpKernelType& left, const OpKernelType& right, - const DataTransformFN& data_tranform_fn) { + const DataTransformFn& data_tranform_fn) { Insert(std::make_pair(left, right), data_tranform_fn); } void Insert(const KernelTypePair& kernel_type_pair, - const DataTransformFN& data_tranform_fn) { + const DataTransformFn& data_tranform_fn) { PADDLE_ENFORCE(!Has(kernel_type_pair), "KernelTypePair %s has been registered", ""); map_.insert({kernel_type_pair, data_tranform_fn}); } - const DataTransformFN& Get(const KernelTypePair& key_pair) const { + const DataTransformFn& Get(const KernelTypePair& key_pair) const { auto data_transformer = GetNullable(key_pair); PADDLE_ENFORCE_NOT_NULL(data_transformer, - "DataTransformFN should not be NULL"); + "DataTransformFn should not be NULL"); return *data_transformer; } - const DataTransformFN* GetNullable(const KernelTypePair& key_pair) const { + const DataTransformFn* GetNullable(const KernelTypePair& key_pair) const { auto it = map_.find(key_pair); if (it == map_.end()) { return nullptr; diff --git a/paddle/framework/op_kernel_type.h b/paddle/framework/op_kernel_type.h index 97b542e345..b06002096f 100644 --- a/paddle/framework/op_kernel_type.h +++ b/paddle/framework/op_kernel_type.h @@ -68,6 +68,8 @@ struct OpKernelType { data_type_ == o.data_type_ && data_layout_ == o.data_layout_ && library_type_ == o.library_type_; } + + bool operator!=(const OpKernelType& o) const { return !(*this == o); } }; inline std::ostream& operator<<(std::ostream& os, @@ -78,5 +80,11 @@ inline std::ostream& operator<<(std::ostream& os, return os; } +inline std::string KernelTypeToString(const OpKernelType& kernel_key) { + std::ostringstream stream; + stream << kernel_key; + return stream.str(); +} + } // namespace framework } // namespace paddle diff --git a/paddle/framework/op_kernel_type_test.cc b/paddle/framework/op_kernel_type_test.cc index dd04840500..649afeee8a 100644 --- a/paddle/framework/op_kernel_type_test.cc +++ b/paddle/framework/op_kernel_type_test.cc @@ -26,10 +26,8 @@ TEST(OpKernelType, ToString) { OpKernelType op_kernel_type(DataType::FP32, CPUPlace(), DataLayout::kNCHW, LibraryType::kCUDNN); - std::ostringstream stream; - stream << op_kernel_type; ASSERT_EQ( - stream.str(), + paddle::framework::KernelTypeToString(op_kernel_type), "data_type[5]:data_layout[NCHW]:place[CPUPlace]:library_type[CUDNN]"); } diff --git a/paddle/framework/operator.cc b/paddle/framework/operator.cc index 886f73e7b8..f48512b5c6 100644 --- a/paddle/framework/operator.cc +++ b/paddle/framework/operator.cc @@ -413,37 +413,51 @@ void OperatorWithKernel::Run(const Scope& scope, } if (actual_kernel_key == expected_kernel_key) { - kernel_iter->second->Compute(ctx); + PADDLE_ENFORCE_EQ(actual_kernel_key.place_, expected_kernel_key.place_, + "Currently, model parallelism is only supported between " + "CPU and other devices. 
For example, multi-GPU model " + "parallelism will fail."); } else { - Scope& op_scope = scope.NewScope(); - auto input_vars = this->InputVars(); - for (auto var_name : input_vars) { - op_scope.Var(var_name); - } - - // TODO(qijun) get appropriate DeviceContext from DeviceContext pool - platform::DeviceContext* trans_dev_ctx = nullptr; - std::vector<platform::DeviceContext*> trans_dev_ctx_vec{trans_dev_ctx}; + const DataTransformFn* trans_fun = + DataTransformFnMap::Instance().GetNullable( + std::make_pair(actual_kernel_key, expected_kernel_key)); + if (trans_fun) { + auto input_vars = this->InputVars(); + // TODO(qijun) filter the input vars that do not need to be transformed + + // filter vars that have been transformed + std::vector<std::string> need_trans; + for (auto var_name : input_vars) { + auto var_name_trans = + var_name + framework::KernelTypeToString(expected_kernel_key); + if (!scope.FindVar(var_name_trans)) { + const_cast<Scope&>(scope).Var(var_name_trans); + need_trans.push_back(var_name); + } + } - // TODO(qijun) get appropriate DataTransformFN from global map - framework::DataTransformFN trans_fun = nullptr; + if (!need_trans.empty()) { + // TODO(qijun) get appropriate DeviceContext from DeviceContext pool + platform::DeviceContext* trans_dev_ctx = nullptr; + std::vector<platform::DeviceContext*> trans_dev_ctx_vec{trans_dev_ctx}; - // Wait for transform starting - dev_ctx->Wait(); + // Wait for transform starting + dev_ctx->Wait(); - for (auto var_name : input_vars) { - trans_fun(trans_dev_ctx_vec, *(scope.FindVar(var_name)), - op_scope.FindVar(var_name)); - } - // Wait for data transform finishing - for (auto ctx : trans_dev_ctx_vec) { - ctx->Wait(); + for (auto var_name : need_trans) { + (*trans_fun)(trans_dev_ctx_vec, *(scope.FindVar(var_name)), + scope.FindVar(var_name + framework::KernelTypeToString( + expected_kernel_key))); + } + // Wait for data transform finishing + for (auto ctx : trans_dev_ctx_vec) { + ctx->Wait(); + } + } } - - // Create a new ExecutionContext - ExecutionContext op_ctx(*this, op_scope, *dev_ctx); - kernel_iter->second->Compute(op_ctx); } + + kernel_iter->second->Compute(ctx); } OpKernelType OperatorWithKernel::GetActualKernelType( From 3d2b2d408f9010ca8c5eda80642d5b9431936f00 Mon Sep 17 00:00:00 2001 From: chengduoZH Date: Wed, 27 Dec 2017 18:43:49 +0800 Subject: [PATCH 097/181] refine doc --- python/paddle/v2/fluid/layers/nn.py | 60 +++++++++++++++-------------- 1 file changed, 32 insertions(+), 28 deletions(-) diff --git a/python/paddle/v2/fluid/layers/nn.py b/python/paddle/v2/fluid/layers/nn.py index 1240b2576f..a51275282c 100644 --- a/python/paddle/v2/fluid/layers/nn.py +++ b/python/paddle/v2/fluid/layers/nn.py @@ -478,8 +478,7 @@ def conv2d(input, groups=None, param_attr=None, bias_attr=None, - act=None, - name=None): + act=None): """ **Convolution2D Layer** The convolution2D layer calculates the output based on the input, filter and strides, paddings, dilations, groups parameters. Input(Input) and Output(Output) are in NCHW format. Where N is batch size, C is the number of channels, H is the height of the feature, and W is the width of the feature. The details of convolution layer, please refer UFLDL's convolution, . If bias attribution and activation type are provided, bias is added to the output of the convolution, and the corresponding activation function is applied to the final result. For each input :math:`X`, the equation is: .. math:: Out = \sigma (W \\ast X + b) - In the above equation: + In the above equation: * :math:`X`: Input value, a tensor with NCHW format. * :math:`W`: Filter value, a tensor with MCHW format. - * :math: \\ast : Convolution operation. + * :math:`\\ast`: Convolution operation. * :math:`b`: Bias value, a 2-D tensor with shape [M, 1]. - * :math: \\sigma : Activation function. + * :math:`\\sigma`: Activation function. * :math:`Out`: Output value, the shape of :math:`Out` and :math:`X` may be different.
Example: - - Input: - Input shape: $(N, C_{in}, H_{in}, W_{in})$ + Input: + Input shape: $(N, C_{in}, H_{in}, W_{in})$ - Filter shape: $(C_{out}, C_{in}, H_f, W_f)$ + Filter shape: $(C_{out}, C_{in}, H_f, W_f)$ - - Output: - Output shape: $(N, C_{out}, H_{out}, W_{out})$ + Output: + Output shape: $(N, C_{out}, H_{out}, W_{out})$ Where - .. math:: - - H_{out}= \\frac{(H_{in} + 2 * paddings[0] - (dilations[0] * (H_f - 1) + 1))}{strides[0]}+ 1 - - W_{out}= \\frac{(W_{in} + 2 * paddings[1] - (dilations[1] * (W_f - 1) + 1))}{strides[1]}+ 1 + .. math:: - - All the input variables are passed in as local variables to the LayerHelper - constructor. + H_{out}&= \\frac{(H_{in} + 2 * paddings[0] - (dilations[0] * (H_f - 1) + 1))}{strides[0]} + 1 \\\\ + W_{out}&= \\frac{(W_{in} + 2 * paddings[1] - (dilations[1] * (W_f - 1) + 1))}{strides[1]} + 1 Args: - input(Variable): Input tensors. The format of input tensor is NCHW. - num_filters(int): Number of filters - filter_size(list/int): Filter size of Conv2d Layer - stride(list/int, optional): Strides(h_s, w_s) of Conv2d Layer. Default: 1 - padding(list/int, optional): Paddings(h_pad, w_pad) of Conv2d Layer. Default: 0 - groups(int, optional): The groups number of the Conv2d Layer. Default: 1 - param_attr(ParamAttr): The parameters to the Conv2d Layer. Default: None - bias_attr(ParamAttr): Bias parameter for the Conv2d layer. Default: None - act(str): Activation type. Default: None - name(str): Name/alias of the function + input(Variable): The input image with [N, C, H, W] format. + num_filters(int): The number of filters. It is the same as the number + of channels of the output image. + filter_size(int|tuple|None): The filter size. If filter_size is a tuple, + it must contain two integers, (filter_size_H, filter_size_W). + Otherwise, the filter will be a square. + stride(int|tuple): The stride size. If stride is a tuple, it must + contain two integers, (stride_H, stride_W). Otherwise, the + stride_H = stride_W = stride. Default: stride = 1. + padding(int|tuple): The padding size. If padding is a tuple, it must + contain two integers, (padding_H, padding_W). Otherwise, the + padding_H = padding_W = padding. Default: padding = 0. + groups(int): The groups number of the Conv2d Layer. According to grouped + convolution in Alex Krizhevsky's Deep CNN paper: when group=2, + the first half of the filters is only connected to the first half + of the input channels, while the second half of the filters is only + connected to the second half of the input channels. Default: groups=1 + param_attr(ParamAttr): The parameters to the Conv2d Layer. Default: None + bias_attr(ParamAttr): Bias parameter for the Conv2d layer. Default: None + act(str): Activation type.
Default: None Returns: Variable: The tensor variable storing the convolution and \ From d48a0e4eae939f3615fabc9f86f11670fcfad6e3 Mon Sep 17 00:00:00 2001 From: typhoonzero Date: Wed, 27 Dec 2017 21:04:51 +0800 Subject: [PATCH 098/181] WIP: adding generic scatter functors --- .../operators/math/selected_rows_functor.cc | 47 +++++++++++++ .../operators/math/selected_rows_functor.cu | 67 +++++++++++++++++++ paddle/operators/math/selected_rows_functor.h | 47 +++++++++++++ 3 files changed, 161 insertions(+) diff --git a/paddle/operators/math/selected_rows_functor.cc b/paddle/operators/math/selected_rows_functor.cc index ab758d1e7f..21418ba4b0 100644 --- a/paddle/operators/math/selected_rows_functor.cc +++ b/paddle/operators/math/selected_rows_functor.cc @@ -179,6 +179,53 @@ template struct SelectedRowsAddToTensor; template struct SelectedRowsAddToTensor; template struct SelectedRowsAddToTensor; +// This is a separate namespace for manipulating SelectedRows typed +// data, like merging duplicated rows, adding two SelectedRows, etc. +// +// Another group of functors is called "scatter updates", which means +// using SelectedRows to update a dense tensor with different Ops, like +// add or mul. +namespace scatter { + +size_t FindPos(const std::vector<int64_t>& rows, int64_t value) { + return std::find(rows.begin(), rows.end(), value) - rows.begin(); +} + +template <typename T> +struct MergeAdd<platform::CPUDeviceContext, T> { + void operator()(const platform::CPUDeviceContext& context, + const framework::SelectedRows& input, + framework::SelectedRows* out) { + auto input_rows = input.rows(); + std::set<int64_t> row_set(input_rows.begin(), input_rows.end()); + std::vector<int64_t> merge_rows(row_set.begin(), row_set.end()); + + auto input_width = input.value().dims()[1]; + // std::unique_ptr<framework::SelectedRows> out{ + // new framework::SelectedRows()}; + out->set_rows(merge_rows); + out->set_height(input.height()); + out->mutable_value()->mutable_data<T>( + framework::make_ddim( + {static_cast<int64_t>(merge_rows.size()), input_width}), + context.GetPlace()); + + math::SetConstant<platform::CPUDeviceContext, T> constant_functor; + constant_functor(context, out->mutable_value(), 0.0); + + auto* out_data = out->mutable_value()->data<T>(); + auto* input_data = input.value().data<T>(); + + for (size_t i = 0; i < input_rows.size(); i++) { + size_t out_i = FindPos(merge_rows, input_rows[i]); + for (int64_t j = 0; j < input_width; j++) { + out_data[out_i * input_width + j] += input_data[i * input_width + j]; + } + } + } +}; + +} // namespace scatter } // namespace math } // namespace operators } // namespace paddle diff --git a/paddle/operators/math/selected_rows_functor.cu b/paddle/operators/math/selected_rows_functor.cu index 9fddd97a36..b2c0fe7bc3 100644 --- a/paddle/operators/math/selected_rows_functor.cu +++ b/paddle/operators/math/selected_rows_functor.cu @@ -222,6 +222,73 @@ template struct SelectedRowsAddToTensor; template struct SelectedRowsAddToTensor; template struct SelectedRowsAddToTensor; template struct SelectedRowsAddToTensor; + +namespace scatter { + +template <typename T, int block_size> +__global__ void MergeAddKernel(const T* input, const int64_t* input_rows, + T* out, const int64_t* out_rows, + size_t out_rows_size, int64_t row_numel) { + const int ty = blockIdx.y; + int tid = threadIdx.x; + __shared__ size_t out_idx; + + if (tid == 0) { + for (size_t i = 0; i < out_rows_size; i++) { + if (input_rows[ty] == out_rows[i]) { + out_idx = i; + } + } + } + + __syncthreads(); + + input += ty * row_numel; + out += out_idx * row_numel; + for (int index = tid; index < row_numel; index += block_size) { + paddle::platform::CudaAtomicAdd(out + index, input[index]); + }
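+ // grid.y assigns one thread block per input row: thread 0 scans out_rows to + // resolve the merged destination row (out_idx, shared across the block), and + // the block-strided loop above then accumulates this input row into it with + // atomic adds, so duplicated input rows can safely update the same merged row.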
+} + +template <typename T> +struct MergeAdd<platform::GPUDeviceContext, T> { + void operator()(const platform::GPUDeviceContext& context, + const framework::SelectedRows& input, + framework::SelectedRows* out) { + auto input_rows = input.rows(); + std::set<int64_t> row_set(input_rows.begin(), input_rows.end()); + std::vector<int64_t> merge_rows(row_set.begin(), row_set.end()); + + auto input_width = input.value().dims()[1]; + // std::unique_ptr<framework::SelectedRows> out{ + // new framework::SelectedRows()}; + out->set_rows(merge_rows); + out->set_height(input.height()); + out->mutable_value()->mutable_data<T>( + framework::make_ddim( + {static_cast<int64_t>(merge_rows.size()), input_width}), + context.GetPlace()); + + math::SetConstant<platform::GPUDeviceContext, T> constant_functor; + constant_functor(context, out->mutable_value(), 0.0); + + auto* out_data = out->mutable_value()->data<T>(); + auto* input_data = input.value().data<T>(); + + const int block_size = 256; + dim3 threads(block_size, 1); + dim3 grid1(1, input_rows.size()); + + MergeAddKernel< + T, 256><<<grid1, threads, 0, + reinterpret_cast<const platform::CUDADeviceContext&>(context) + .stream()>>>(input_data, input.rows().data(), out_data, + out->rows().data(), out->rows().size(), + input_width); + } +}; + +} // namespace scatter } // namespace math } // namespace operators } // namespace paddle diff --git a/paddle/operators/math/selected_rows_functor.h b/paddle/operators/math/selected_rows_functor.h index 1149075abf..8adfca77f6 100644 --- a/paddle/operators/math/selected_rows_functor.h +++ b/paddle/operators/math/selected_rows_functor.h @@ -52,6 +52,53 @@ struct SelectedRowsAddToTensor { framework::Tensor* input2); }; +namespace scatter { +// functors for manipulating SelectedRows data + +template <typename DeviceContext, typename T> +struct MergeAdd { + // unary functor, merge by adding duplicated rows in + // the input SelectedRows object. + void operator()(const DeviceContext& context, + const framework::SelectedRows& input, + framework::SelectedRows* out); +}; + +template <typename DeviceContext, typename T> +struct Add { + void operator()(const DeviceContext& context, + const framework::SelectedRows& input1, + const framework::SelectedRows& input2, + framework::SelectedRows* out) { + out->set_rows(input1->rows()); + out->set_height(input1->height()); + out->mutable_value()->mutable_data<T>(input1->value().dims(), + context.GetPlace()); + auto e_out = framework::EigenVector<T>::Flatten(*(out->mutable_value())); + auto e_in1 = framework::EigenVector<T>::Flatten(input1->value()); + auto e_in2 = framework::EigenVector<T>::Flatten(input2->value()); + e_out.device(*context.eigen_device()) = e_in1 + e_in2; + } +}; + +template <typename DeviceContext, typename T> +struct Mul { + void operator()(const DeviceContext& context, + const framework::SelectedRows& input1, + const framework::SelectedRows& input2, + framework::SelectedRows* out) { + out->set_rows(input1->rows()); + out->set_height(input1->height()); + out->mutable_value()->mutable_data<T>(input1->value().dims(), + context.GetPlace()); + auto e_out = framework::EigenVector<T>::Flatten(*(out->mutable_value())); + auto e_in1 = framework::EigenVector<T>::Flatten(input1->value()); + auto e_in2 = framework::EigenVector<T>::Flatten(input2->value()); + e_out.device(*context.eigen_device()) = e_in1 * e_in2; + } +}; + +} // namespace scatter } // namespace math } // namespace operators } // namespace paddle From 74b122889cbce2aa3add92784d0b4a621abfdf45 Mon Sep 17 00:00:00 2001 From: typhoonzero Date: Wed, 27 Dec 2017 21:08:40 +0800 Subject: [PATCH 099/181] wip --- paddle/operators/math/selected_rows_functor.h | 21 ++++++++++--------- 1 file changed, 11 insertions(+), 10 deletions(-) diff --git a/paddle/operators/math/selected_rows_functor.h b/paddle/operators/math/selected_rows_functor.h index 8adfca77f6..eecd5e5362
100644 --- a/paddle/operators/math/selected_rows_functor.h +++ b/paddle/operators/math/selected_rows_functor.h @@ -12,6 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ #pragma once +#include "paddle/framework/eigen.h" #include "paddle/framework/selected_rows.h" #include "paddle/platform/device_context.h" @@ -70,13 +71,13 @@ struct Add { const framework::SelectedRows& input1, const framework::SelectedRows& input2, framework::SelectedRows* out) { - out->set_rows(input1->rows()); - out->set_height(input1->height()); - out->mutable_value()->mutable_data(input1->value().dims(), + out->set_rows(input1.rows()); + out->set_height(input1.height()); + out->mutable_value()->mutable_data(input1.value().dims(), context.GetPlace()); auto e_out = framework::EigenVector::Flatten(*(out->mutable_value())); - auto e_in1 = framework::EigenVector::Flatten(input1->value()); - auto e_in2 = framework::EigenVector::Flatten(input2->value()); + auto e_in1 = framework::EigenVector::Flatten(input1.value()); + auto e_in2 = framework::EigenVector::Flatten(input2.value()); e_out.device(*context.eigen_device()) = e_in1 + e_in2; } }; @@ -87,13 +88,13 @@ struct Mul { const framework::SelectedRows& input1, const framework::SelectedRows& input2, framework::SelectedRows* out) { - out->set_rows(input1->rows()); - out->set_height(input1->height()); - out->mutable_value()->mutable_data(input1->value().dims(), + out->set_rows(input1.rows()); + out->set_height(input1.height()); + out->mutable_value()->mutable_data(input1.value().dims(), context.GetPlace()); auto e_out = framework::EigenVector::Flatten(*(out->mutable_value())); - auto e_in1 = framework::EigenVector::Flatten(input1->value()); - auto e_in2 = framework::EigenVector::Flatten(input2->value()); + auto e_in1 = framework::EigenVector::Flatten(input1.value()); + auto e_in2 = framework::EigenVector::Flatten(input2.value()); e_out.device(*context.eigen_device()) = e_in1 * e_in2; } }; From 5347c8d76322d4288fbc2ad4e0e4ef77fabf558f Mon Sep 17 00:00:00 2001 From: gongweibao Date: Wed, 27 Dec 2017 21:13:48 +0800 Subject: [PATCH 100/181] Fix bugs (#7060) * fix bugs --- paddle/framework/executor.cc | 2 +- paddle/operators/send_op.cc | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/paddle/framework/executor.cc b/paddle/framework/executor.cc index 997773c168..31749743a5 100644 --- a/paddle/framework/executor.cc +++ b/paddle/framework/executor.cc @@ -102,7 +102,7 @@ void Executor::Run(const ProgramDesc& pdesc, Scope* scope, int block_id, VLOG(3) << op->DebugString(); op->Run(*local_scope, place_); } - if (create_local_scope) { + if (create_vars && create_local_scope) { scope->DeleteScope(local_scope); } } diff --git a/paddle/operators/send_op.cc b/paddle/operators/send_op.cc index 6e82938683..95c207221a 100644 --- a/paddle/operators/send_op.cc +++ b/paddle/operators/send_op.cc @@ -79,7 +79,7 @@ class SendOpMaker : public framework::OpProtoAndCheckerMaker { AddComment(R"DOC( Recv operator -This operator will recv tensor from send_op +This operator will send tensor to recv_op. 
)DOC"); AddAttr>("endpoints", "(string vector, default 127.0.0.1:6164)" From c31cbae510344492d34b81e6840c69424493037c Mon Sep 17 00:00:00 2001 From: dzhwinter Date: Wed, 27 Dec 2017 22:07:57 +0800 Subject: [PATCH 101/181] Fix/transform (#7079) * "fix data transform" * "split into next PR" --- paddle/framework/data_transform.cc | 1 + paddle/framework/data_transform_test.cc | 76 ++++++++++++++++--------- paddle/framework/tensor.h | 2 +- 3 files changed, 51 insertions(+), 28 deletions(-) diff --git a/paddle/framework/data_transform.cc b/paddle/framework/data_transform.cc index 35f16025a9..376268888e 100644 --- a/paddle/framework/data_transform.cc +++ b/paddle/framework/data_transform.cc @@ -13,6 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/framework/data_transform.h" +#include "paddle/framework/lod_tensor.h" namespace paddle { namespace framework { diff --git a/paddle/framework/data_transform_test.cc b/paddle/framework/data_transform_test.cc index f93a47eeb5..0825673546 100644 --- a/paddle/framework/data_transform_test.cc +++ b/paddle/framework/data_transform_test.cc @@ -11,36 +11,63 @@ distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ +#include +#include -#include "paddle/framework/data_transform.h" #include +#include "paddle/framework/data_transform.h" + namespace paddle { namespace framework { - using namespace platform; +/** + * @brief cross validation of different kernel type transform + * We use four bit map represent different combination. + * If the field has multiple possible value, only choose two of them. + * For DataType, only test the FP32(float), FP64(double). + * e.g. 
0000 -> FP32, CPUPlace, kNHWC, kPlain + * 1111 -> FP64, GPUPlace, kNCHW, kMKLDNN + */ + +std::array<proto::DataType, 2> kDataType = {proto::DataType::FP32, + proto::DataType::FP64}; + +std::array<Place, 2> kPlace = {CPUPlace(), CUDAPlace(0)}; + +std::array<DataLayout, 2> kDataLayout = { + DataLayout::kNHWC, DataLayout::kNCHW, +}; + +std::array<LibraryType, 2> kLibraryType = { + LibraryType::kPlain, LibraryType::kMKLDNN, +}; + +OpKernelType GenFromBit(const std::vector<bool> bits) { + return OpKernelType(kDataType[bits[0]], kPlace[bits[1]], kDataLayout[bits[2]], + kLibraryType[bits[3]]); +} + int test_value = 0; -OpKernelType kernel_type_1(proto::DataType::FP32, CPUPlace(), DataLayout::kNCHW, - LibraryType::kCUDNN); -OpKernelType kernel_type_2(proto::DataType::FP32, CUDAPlace(0), - DataLayout::kNCHW, LibraryType::kCUDNN); -OpKernelType kernel_type_3(proto::DataType::FP16, CUDAPlace(0), - DataLayout::kNCHW, LibraryType::kCUDNN); +auto kernel0 = GenFromBit({0, 0, 0, 0}); +auto kernel1 = GenFromBit({0, 0, 0, 1}); +auto kernel2 = GenFromBit({0, 0, 1, 0}); +auto kernel3 = GenFromBit({0, 0, 1, 1}); -void type1_to_type2(std::vector<platform::DeviceContext*> ctx, - const Variable& in, Variable* out) { +void TransDataType_t(std::vector<platform::DeviceContext*> ctx, + const Variable& in, Variable* out) { test_value++; } -void type2_to_type3(std::vector<platform::DeviceContext*> ctx, - const Variable& in, Variable* out) { +void TransDataLayout_t(std::vector<platform::DeviceContext*> ctx, - const Variable& in, Variable* out) { +void TransLibraryType_t(std::vector<platform::DeviceContext*> ctx, + const Variable& in, Variable* out) { test_value += 2; } @@ -49,30 +76,25 @@ void type1_to_type3(std::vector<platform::DeviceContext*> ctx, namespace frw = paddle::framework; -REGISTER_DATA_TRANSFORM_FN(frw::kernel_type_1, frw::kernel_type_2, - frw::type1_to_type2); -REGISTER_DATA_TRANSFORM_FN(frw::kernel_type_2, frw::kernel_type_3, - frw::type2_to_type3); -REGISTER_DATA_TRANSFORM_FN(frw::kernel_type_1, frw::kernel_type_3, - frw::type1_to_type3); +REGISTER_DATA_TRANSFORM_FN(frw::kernel0, frw::kernel1, frw::TransDataType_t); +REGISTER_DATA_TRANSFORM_FN(frw::kernel1, frw::kernel2, frw::TransDataLayout_t); +REGISTER_DATA_TRANSFORM_FN(frw::kernel0, frw::kernel2, frw::TransLibraryType_t); TEST(DataTransform, Register) { using namespace paddle::framework; using namespace paddle::platform; auto& instance = DataTransformFnMap::Instance(); - ASSERT_EQ(instance.Map().size(), 3UL); std::vector<platform::DeviceContext*> ctx; paddle::framework::Variable in; paddle::framework::Variable out; - instance.Get(std::make_pair(frw::kernel_type_1, frw::kernel_type_2))(ctx, in, - &out); + instance.Get(std::make_pair(frw::kernel0, frw::kernel1))(ctx, in, &out); ASSERT_EQ(test_value, 1); - instance.Get(std::make_pair(frw::kernel_type_2, frw::kernel_type_3))(ctx, in, - &out); + + instance.Get(std::make_pair(frw::kernel1, frw::kernel2))(ctx, in, &out); ASSERT_EQ(test_value, 0); - instance.Get(std::make_pair(frw::kernel_type_1, frw::kernel_type_3))(ctx, in, - &out); + + instance.Get(std::make_pair(frw::kernel0, frw::kernel2))(ctx, in, &out); ASSERT_EQ(test_value, 2); } diff --git a/paddle/framework/tensor.h b/paddle/framework/tensor.h index b9f6884f7c..341a6949be 100644 --- a/paddle/framework/tensor.h +++ b/paddle/framework/tensor.h @@ -178,7 +178,7 @@ class Tensor { DDim dims_; /** - * @brief the layout of memory block, default is NCHW. + * @brief the layout of memory block, default is NHWC.
* * @note the memory allocation order, describe how weight/data is stored * For example, in 4-D Tensor(rank=4), there are three commonly From 15e8c80ee0cba5c26d881b955afc66e59aaccbdb Mon Sep 17 00:00:00 2001 From: Yu Yang Date: Wed, 27 Dec 2017 22:53:58 +0800 Subject: [PATCH 102/181] Rename API of DeviceContext (#7055) * Rename API of DeviceContext Make them as usual names. * Rename API of DeviceContext Make them as usual names. * Fix compile * Fix compile * Fix compile * Fix compile * Fix compile --- paddle/framework/init.cc | 2 +- paddle/framework/operator.cc | 4 +-- paddle/gserver/layers/MKLDNNLRNLayer.cpp | 2 +- paddle/operators/array_operator.h | 4 +-- paddle/operators/array_to_lod_tensor_op.cc | 5 ++-- paddle/operators/assign_op.cc | 4 +-- paddle/operators/beam_search_decode_op.cc | 4 +-- paddle/operators/cond_op.cc | 4 +-- paddle/operators/feed_op.cc | 4 +-- paddle/operators/fetch_op.cc | 4 +-- paddle/operators/fill_constant_op.cc | 4 +-- paddle/operators/fill_op.cc | 5 ++-- paddle/operators/load_op.cc | 4 +-- paddle/operators/lod_tensor_to_array_op.cc | 5 ++-- paddle/operators/merge_lod_tensor_op.cc | 4 +-- paddle/operators/nccl_op_test.cu.cc | 2 +- paddle/operators/recurrent_op.cc | 9 +++--- .../reorder_lod_tensor_by_rank_op.cc | 4 +-- paddle/operators/save_op.cc | 4 +-- paddle/operators/shrink_rnn_memory_op.cc | 4 +-- paddle/operators/split_lod_tensor_op.cc | 4 +-- .../operators/tensor_array_read_write_op.cc | 10 ++++--- paddle/platform/device_context.cc | 20 +------------ paddle/platform/device_context.h | 12 ++------ paddle/platform/device_context_test.cu | 29 +++++-------------- paddle/platform/nccl_test.cu | 2 +- paddle/pybind/tensor_py.h | 9 +++--- 27 files changed, 68 insertions(+), 100 deletions(-) diff --git a/paddle/framework/init.cc b/paddle/framework/init.cc index d6601090d5..682cff168d 100644 --- a/paddle/framework/init.cc +++ b/paddle/framework/init.cc @@ -71,7 +71,7 @@ bool InitDevices(const std::vector &devices) { places.emplace_back(platform::CPUPlace()); LOG(WARNING) << "Not specified CPU device, create CPU by Default."; } - platform::DeviceContextPool::Create(places); + platform::DeviceContextPool::Init(places); return true; } diff --git a/paddle/framework/operator.cc b/paddle/framework/operator.cc index f48512b5c6..c0be11294c 100644 --- a/paddle/framework/operator.cc +++ b/paddle/framework/operator.cc @@ -388,8 +388,8 @@ void OperatorWithKernel::Run(const Scope& scope, const platform::Place& place) const { RuntimeInferShapeContext infer_shape_ctx(*this, scope); this->InferShape(&infer_shape_ctx); - platform::DeviceContextPool& pool = platform::DeviceContextPool::Get(); - auto dev_ctx = pool.Borrow(place); + platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance(); + auto dev_ctx = pool.Get(place); // check if op[type] has kernel registered. 
auto& all_op_kernels = AllOpKernels(); diff --git a/paddle/gserver/layers/MKLDNNLRNLayer.cpp b/paddle/gserver/layers/MKLDNNLRNLayer.cpp index 741984bb68..ac217f1363 100644 --- a/paddle/gserver/layers/MKLDNNLRNLayer.cpp +++ b/paddle/gserver/layers/MKLDNNLRNLayer.cpp @@ -29,7 +29,7 @@ bool MKLDNNLRNLayer::init(const LayerMap& layerMap, } /* the size of inputs for norm-layer is 1 */ - CHECK_EQ(config_.inputs_size(), 1UL); + CHECK_EQ(config_.inputs_size(), 1); const NormConfig& conf = config_.inputs(0).norm_conf(); localSize_ = conf.size(); alpha_ = conf.scale(); diff --git a/paddle/operators/array_operator.h b/paddle/operators/array_operator.h index 060ffac827..e0eef5d9f9 100644 --- a/paddle/operators/array_operator.h +++ b/paddle/operators/array_operator.h @@ -35,8 +35,8 @@ class ArrayOp : public framework::OperatorBase { PADDLE_ENFORCE_EQ(i_tensor.numel(), 1); // get device context from pool - platform::DeviceContextPool &pool = platform::DeviceContextPool::Get(); - auto &dev_ctx = *pool.Borrow(place); + platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance(); + auto &dev_ctx = *pool.Get(place); size_t offset; if (platform::is_gpu_place(i_tensor.place())) { diff --git a/paddle/operators/array_to_lod_tensor_op.cc b/paddle/operators/array_to_lod_tensor_op.cc index 0aa04c268b..49366fee8d 100644 --- a/paddle/operators/array_to_lod_tensor_op.cc +++ b/paddle/operators/array_to_lod_tensor_op.cc @@ -106,8 +106,9 @@ class ArrayToLoDTensorOp : public framework::OperatorBase { } auto slice = out->Slice(out_offset, out_offset + len); - platform::DeviceContextPool &pool = platform::DeviceContextPool::Get(); - auto &dev_ctx = *pool.Borrow(place); + platform::DeviceContextPool &pool = + platform::DeviceContextPool::Instance(); + auto &dev_ctx = *pool.Get(place); framework::CopyFrom(x[x_idx].Slice(start_offset, end_offset), place, dev_ctx, &slice); diff --git a/paddle/operators/assign_op.cc b/paddle/operators/assign_op.cc index 0560040509..7d77be3be1 100644 --- a/paddle/operators/assign_op.cc +++ b/paddle/operators/assign_op.cc @@ -82,8 +82,8 @@ class AssignOp : public framework::OperatorBase { out != nullptr, "The Output(Out) should not be null if the Input(X) is set."); - platform::DeviceContextPool &pool = platform::DeviceContextPool::Get(); - auto &dev_ctx = *pool.Borrow(place); + platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance(); + auto &dev_ctx = *pool.Get(place); framework::VisitVarType(*x, AssignFunctor(out, dev_ctx)); } diff --git a/paddle/operators/beam_search_decode_op.cc b/paddle/operators/beam_search_decode_op.cc index 52c28e7f53..72e05607b0 100644 --- a/paddle/operators/beam_search_decode_op.cc +++ b/paddle/operators/beam_search_decode_op.cc @@ -57,8 +57,8 @@ class BeamSearchDecodeOp : public framework::OperatorBase { : OperatorBase(type, inputs, outputs, attrs) {} void Run(const framework::Scope& scope, const platform::Place& dev_place) const override { - platform::DeviceContextPool& pool = platform::DeviceContextPool::Get(); - auto& dev_ctx = *pool.Borrow(dev_place); + platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance(); + auto& dev_ctx = *pool.Get(dev_place); framework::ExecutionContext ctx(*this, scope, dev_ctx); diff --git a/paddle/operators/cond_op.cc b/paddle/operators/cond_op.cc index 455fbd8ca3..e333002bfd 100644 --- a/paddle/operators/cond_op.cc +++ b/paddle/operators/cond_op.cc @@ -195,8 +195,8 @@ void CondOp::MergeDataFromSubnet(const framework::Scope& scope, void CondOp::Run(const Scope& scope, const 
platform::Place& place) const { // get device context from pool - platform::DeviceContextPool& pool = platform::DeviceContextPool::Get(); - auto& dev_ctx = *pool.Borrow(place); + platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance(); + auto& dev_ctx = *pool.Get(place); PrepareDataForSubnet(scope, dev_ctx); std::vector& sub_scopes = GetSubScopes(scope); diff --git a/paddle/operators/feed_op.cc b/paddle/operators/feed_op.cc index cecbb7226a..48da52c3b6 100644 --- a/paddle/operators/feed_op.cc +++ b/paddle/operators/feed_op.cc @@ -49,8 +49,8 @@ class FeedOp : public framework::OperatorBase { auto *out_item = out_var->GetMutable(); // get device context from pool - platform::DeviceContextPool &pool = platform::DeviceContextPool::Get(); - auto &dev_ctx = *pool.Borrow(place); + platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance(); + auto &dev_ctx = *pool.Get(place); framework::CopyFrom(feed_item, place, dev_ctx, out_item); out_item->set_lod(feed_item.lod()); diff --git a/paddle/operators/fetch_op.cc b/paddle/operators/fetch_op.cc index fa20a06540..387d1e0a74 100644 --- a/paddle/operators/fetch_op.cc +++ b/paddle/operators/fetch_op.cc @@ -52,8 +52,8 @@ class FetchOp : public framework::OperatorBase { // FIXME(yuyang18): Should we assume the fetch operator always generate // CPU outputs? - platform::DeviceContextPool &pool = platform::DeviceContextPool::Get(); - auto &dev_ctx = *pool.Borrow(place); + platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance(); + auto &dev_ctx = *pool.Get(place); CopyFrom(src_item, platform::CPUPlace(), dev_ctx, &dst_item); dev_ctx.Wait(); diff --git a/paddle/operators/fill_constant_op.cc b/paddle/operators/fill_constant_op.cc index fe0706c4a9..dcd43a30c8 100644 --- a/paddle/operators/fill_constant_op.cc +++ b/paddle/operators/fill_constant_op.cc @@ -49,8 +49,8 @@ class FillConstantOp : public framework::OperatorBase { out.mutable_data(dev_place, framework::ToTypeIndex(data_type)); } - platform::DeviceContextPool &pool = platform::DeviceContextPool::Get(); - auto &dev_ctx = *pool.Borrow(dev_place); + platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance(); + auto &dev_ctx = *pool.Get(dev_place); math::set_constant(dev_ctx, &out, value); } }; diff --git a/paddle/operators/fill_op.cc b/paddle/operators/fill_op.cc index 57b4ec6938..084ba1db62 100644 --- a/paddle/operators/fill_op.cc +++ b/paddle/operators/fill_op.cc @@ -69,8 +69,9 @@ class FillOp : public framework::OperatorBase { if (!force_cpu && platform::is_gpu_place(place)) { // Copy tensor to out - platform::DeviceContextPool &pool = platform::DeviceContextPool::Get(); - auto &dev_ctx = *pool.Borrow(place); + platform::DeviceContextPool &pool = + platform::DeviceContextPool::Instance(); + auto &dev_ctx = *pool.Get(place); framework::CopyFrom(tensor, place, dev_ctx, &out); } } diff --git a/paddle/operators/load_op.cc b/paddle/operators/load_op.cc index 5425375c1f..65f021d919 100644 --- a/paddle/operators/load_op.cc +++ b/paddle/operators/load_op.cc @@ -40,8 +40,8 @@ class LoadOp : public framework::OperatorBase { auto *tensor = out_var->GetMutable(); framework::DeserializeFromStream(fin, tensor); - platform::DeviceContextPool &pool = platform::DeviceContextPool::Get(); - auto &dev_ctx = *pool.Borrow(place); + platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance(); + auto &dev_ctx = *pool.Get(place); if (platform::is_gpu_place(place)) { // copy CPU to GPU diff --git a/paddle/operators/lod_tensor_to_array_op.cc 
b/paddle/operators/lod_tensor_to_array_op.cc index ed99915bb7..8d164b4abc 100644 --- a/paddle/operators/lod_tensor_to_array_op.cc +++ b/paddle/operators/lod_tensor_to_array_op.cc @@ -88,8 +88,9 @@ class LoDTensorToArrayOp : public framework::OperatorBase { auto slice = out[i].Slice(static_cast(offset), static_cast(offset + len)); - platform::DeviceContextPool &pool = platform::DeviceContextPool::Get(); - auto &dev_ctx = *pool.Borrow(place); + platform::DeviceContextPool &pool = + platform::DeviceContextPool::Instance(); + auto &dev_ctx = *pool.Get(place); framework::CopyFrom(x.Slice(static_cast(each_range.begin), static_cast(each_range.end)), diff --git a/paddle/operators/merge_lod_tensor_op.cc b/paddle/operators/merge_lod_tensor_op.cc index 2287f34791..3f999e404f 100644 --- a/paddle/operators/merge_lod_tensor_op.cc +++ b/paddle/operators/merge_lod_tensor_op.cc @@ -30,8 +30,8 @@ class MergeLoDTensorOp : public framework::OperatorBase { void Run(const framework::Scope &scope, const platform::Place &dev_place) const override { // get device context from pool - platform::DeviceContextPool &pool = platform::DeviceContextPool::Get(); - auto &dev_ctx = *pool.Borrow(dev_place); + platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance(); + auto &dev_ctx = *pool.Get(dev_place); auto &x = scope.FindVar(Input("X"))->Get(); auto &mask = scope.FindVar(Input("Mask"))->Get(); diff --git a/paddle/operators/nccl_op_test.cu.cc b/paddle/operators/nccl_op_test.cu.cc index 34a6e1a58d..6546096069 100644 --- a/paddle/operators/nccl_op_test.cu.cc +++ b/paddle/operators/nccl_op_test.cu.cc @@ -305,7 +305,7 @@ int main(int argc, char **argv) { } VLOG(0) << " DeviceCount " << count; - paddle::platform::DeviceContextPool::Create(places); + paddle::platform::DeviceContextPool::Init(places); testing::InitGoogleTest(&argc, argv); diff --git a/paddle/operators/recurrent_op.cc b/paddle/operators/recurrent_op.cc index 71769e67c7..056fa46949 100644 --- a/paddle/operators/recurrent_op.cc +++ b/paddle/operators/recurrent_op.cc @@ -272,8 +272,9 @@ class RecurrentOp : public RecurrentBase { false /*create_local_scope*/); // get device context from pool - platform::DeviceContextPool &pool = platform::DeviceContextPool::Get(); - auto &dev_ctx = *pool.Borrow(place); + platform::DeviceContextPool &pool = + platform::DeviceContextPool::Instance(); + auto &dev_ctx = *pool.Get(place); // Copy inside::output -> outside::output // outside::output[seq_offset: seq_offset + 1] = inside::output @@ -326,8 +327,8 @@ class RecurrentGradOp : public RecurrentBase { auto *program = block->Program(); // get device context from pool - platform::DeviceContextPool &pool = platform::DeviceContextPool::Get(); - auto &dev_ctx = *pool.Borrow(place); + platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance(); + auto &dev_ctx = *pool.Get(place); for (size_t step_id = 0; step_id < seq_len; ++step_id) { size_t seq_offset = reverse ? 
step_id : seq_len - step_id - 1; diff --git a/paddle/operators/reorder_lod_tensor_by_rank_op.cc b/paddle/operators/reorder_lod_tensor_by_rank_op.cc index 1063388e25..8d652ff806 100644 --- a/paddle/operators/reorder_lod_tensor_by_rank_op.cc +++ b/paddle/operators/reorder_lod_tensor_by_rank_op.cc @@ -131,8 +131,8 @@ class ReorderLoDTensorByRankTableBase : public framework::OperatorBase { auto x_sliced = x.Slice(x_offset, x_offset + len); auto out_sliced = out->Slice(out_offset, out_offset + len); - platform::DeviceContextPool &pool = platform::DeviceContextPool::Get(); - auto &dev_ctx = *pool.Borrow(place); + platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance(); + auto &dev_ctx = *pool.Get(place); framework::CopyFrom(x_sliced, out_sliced.place(), dev_ctx, &out_sliced); out_offset += len; return out_offset; diff --git a/paddle/operators/save_op.cc b/paddle/operators/save_op.cc index d045a8b5b8..4b1cbe8883 100644 --- a/paddle/operators/save_op.cc +++ b/paddle/operators/save_op.cc @@ -91,8 +91,8 @@ class SaveOp : public framework::OperatorBase { auto &tensor = var->Get(); // get device context from pool - platform::DeviceContextPool &pool = platform::DeviceContextPool::Get(); - auto &dev_ctx = *pool.Borrow(place); + platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance(); + auto &dev_ctx = *pool.Get(place); framework::SerializeToStream(fout, tensor, dev_ctx); } diff --git a/paddle/operators/shrink_rnn_memory_op.cc b/paddle/operators/shrink_rnn_memory_op.cc index e8a4773547..e5ef0740b6 100644 --- a/paddle/operators/shrink_rnn_memory_op.cc +++ b/paddle/operators/shrink_rnn_memory_op.cc @@ -106,8 +106,8 @@ class ShrinkRNNMemoryGradOp : public ArrayOp { dx_tensor.mutable_data(x_tensor.place(), x_tensor.type()); // get device context from pool - platform::DeviceContextPool &pool = platform::DeviceContextPool::Get(); - auto &dev_ctx = *pool.Borrow(place); + platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance(); + auto &dev_ctx = *pool.Get(place); if (dout_var == nullptr) { // dx_tensor fill zero math::set_constant(dev_ctx, &dx_tensor, 0.0f); diff --git a/paddle/operators/split_lod_tensor_op.cc b/paddle/operators/split_lod_tensor_op.cc index 89826ca6ee..2d8787d740 100644 --- a/paddle/operators/split_lod_tensor_op.cc +++ b/paddle/operators/split_lod_tensor_op.cc @@ -45,8 +45,8 @@ class SplitLoDTensorOp : public framework::OperatorBase { auto &x_lod = x.lod(); auto &mask_dim = mask.dims(); - platform::DeviceContextPool &pool = platform::DeviceContextPool::Get(); - auto &dev_ctx = *pool.Borrow(dev_place); + platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance(); + auto &dev_ctx = *pool.Get(dev_place); std::unique_ptr cpu_mask{new framework::LoDTensor()}; if (platform::is_cpu_place(mask.place())) { diff --git a/paddle/operators/tensor_array_read_write_op.cc b/paddle/operators/tensor_array_read_write_op.cc index 9529aab573..53e38ec703 100644 --- a/paddle/operators/tensor_array_read_write_op.cc +++ b/paddle/operators/tensor_array_read_write_op.cc @@ -40,8 +40,9 @@ class WriteToArrayOp : public ArrayOp { if (x_tensor.memory_size() > 0) { auto *out_tensor = &out->at(offset); - platform::DeviceContextPool &pool = platform::DeviceContextPool::Get(); - auto &dev_ctx = *pool.Borrow(place); + platform::DeviceContextPool &pool = + platform::DeviceContextPool::Instance(); + auto &dev_ctx = *pool.Get(place); CopyFrom(x_tensor, place, dev_ctx, out_tensor); out_tensor->set_lod(x_tensor.lod()); @@ -132,8 +133,9 @@ class ReadFromArrayOp 
: public ArrayOp { auto *out_tensor = out->GetMutable(); size_t offset = GetOffset(scope, place); if (offset < x_array.size()) { - platform::DeviceContextPool &pool = platform::DeviceContextPool::Get(); - auto &dev_ctx = *pool.Borrow(place); + platform::DeviceContextPool &pool = + platform::DeviceContextPool::Instance(); + auto &dev_ctx = *pool.Get(place); framework::CopyFrom(x_array[offset], place, dev_ctx, out_tensor); out_tensor->set_lod(x_array[offset].lod()); } else { diff --git a/paddle/platform/device_context.cc b/paddle/platform/device_context.cc index e450ef32a4..ea07f2e002 100644 --- a/paddle/platform/device_context.cc +++ b/paddle/platform/device_context.cc @@ -17,7 +17,7 @@ namespace platform { DeviceContextPool* DeviceContextPool::pool = nullptr; -const platform::DeviceContext* DeviceContextPool::Borrow( +const platform::DeviceContext* DeviceContextPool::Get( const platform::Place& place) { auto it = device_contexts_.find(place); if (it == device_contexts_.end()) { @@ -28,24 +28,6 @@ const platform::DeviceContext* DeviceContextPool::Borrow( return it->second; } -std::vector DeviceContextPool::Borrow( - const std::vector& places) { - PADDLE_ENFORCE_GT(places.size(), 0); - PADDLE_ENFORCE_LE(places.size(), device_contexts_.size()); - std::vector borrowed_contexts; - for (auto& place : places) { - auto it = device_contexts_.find(place); - if (it != device_contexts_.end()) { - borrowed_contexts.emplace_back(it->second); - } else { - PADDLE_THROW( - "'Place' is not supported, Please re-compile with WITH_GPU " - "option"); - } - } - return borrowed_contexts; -} - DeviceContextPool::DeviceContextPool( const std::vector& places) { PADDLE_ENFORCE_GT(places.size(), 0); diff --git a/paddle/platform/device_context.h b/paddle/platform/device_context.h index 8ba12e1657..dfef2c16d8 100644 --- a/paddle/platform/device_context.h +++ b/paddle/platform/device_context.h @@ -109,13 +109,13 @@ class DeviceContextPool { public: explicit DeviceContextPool(const std::vector& places); - static DeviceContextPool& Get() { + static DeviceContextPool& Instance() { PADDLE_ENFORCE_NOT_NULL(pool, "Need to Create DeviceContextPool first!"); return *pool; } /*! \brief Create should only called by Init function */ - static DeviceContextPool& Create(const std::vector& places) { + static DeviceContextPool& Init(const std::vector& places) { if (pool == nullptr) { pool = new DeviceContextPool(places); } @@ -123,13 +123,7 @@ class DeviceContextPool { } /*! \brief Return handle of single device context. */ - const platform::DeviceContext* Borrow(const platform::Place& place); - - /*! \brief Return handle of multi-device context. 
*/ - std::vector Borrow( - const std::vector& places); - - ~DeviceContextPool() {} + const platform::DeviceContext* Get(const platform::Place& place); private: static DeviceContextPool* pool; diff --git a/paddle/platform/device_context_test.cu b/paddle/platform/device_context_test.cu index 91011bf71c..ca10cf3463 100644 --- a/paddle/platform/device_context_test.cu +++ b/paddle/platform/device_context_test.cu @@ -71,35 +71,20 @@ TEST(Device, DeviceContextPool) { using paddle::platform::CPUPlace; using paddle::platform::CUDAPlace; - DeviceContextPool& pool = DeviceContextPool::Get(); - auto cpu_dev_ctx1 = pool.Borrow(CPUPlace()); - auto cpu_dev_ctx2 = pool.Borrow(CPUPlace()); - EXPECT_TRUE(cpu_dev_ctx2 == cpu_dev_ctx1); + DeviceContextPool& pool = DeviceContextPool::Instance(); + auto cpu_dev_ctx1 = pool.Get(CPUPlace()); + auto cpu_dev_ctx2 = pool.Get(CPUPlace()); + ASSERT_EQ(cpu_dev_ctx2, cpu_dev_ctx1); std::vector gpu_places; int count = paddle::platform::GetCUDADeviceCount(); for (int i = 0; i < count; ++i) { - gpu_places.emplace_back(CUDAPlace(i)); - } - auto dev_ctxs = pool.Borrow(gpu_places); - for (size_t i = 0; i < dev_ctxs.size(); ++i) { - auto* dev_ctx = static_cast(dev_ctxs[i]); - - // check same as CUDAPlace(i) - CUDAPlace place = boost::get(dev_ctx->GetPlace()); - EXPECT_EQ(place.GetDeviceId(), static_cast(i)); + auto dev_ctx = pool.Get(CUDAPlace(i)); + ASSERT_NE(dev_ctx, nullptr); } } int main(int argc, char** argv) { - int dev_count = paddle::platform::GetCUDADeviceCount(); - if (dev_count <= 1) { - LOG(WARNING) << "Cannot test multi-gpu DeviceContextPool, because the CUDA " - "device count is " - << dev_count; - return 0; - } - std::vector places; places.emplace_back(paddle::platform::CPUPlace()); @@ -109,7 +94,7 @@ int main(int argc, char** argv) { } VLOG(0) << " DeviceCount " << count; - paddle::platform::DeviceContextPool::Create(places); + paddle::platform::DeviceContextPool::Init(places); testing::InitGoogleTest(&argc, argv); return RUN_ALL_TESTS(); diff --git a/paddle/platform/nccl_test.cu b/paddle/platform/nccl_test.cu index 8f815863a7..ef6d845874 100644 --- a/paddle/platform/nccl_test.cu +++ b/paddle/platform/nccl_test.cu @@ -144,7 +144,7 @@ int main(int argc, char** argv) { } VLOG(0) << " DeviceCount " << count; - paddle::platform::DeviceContextPool::Create(places); + paddle::platform::DeviceContextPool::Init(places); testing::InitGoogleTest(&argc, argv); return RUN_ALL_TESTS(); diff --git a/paddle/pybind/tensor_py.h b/paddle/pybind/tensor_py.h index 67244d8260..4d5e73e2c2 100644 --- a/paddle/pybind/tensor_py.h +++ b/paddle/pybind/tensor_py.h @@ -63,9 +63,10 @@ struct CastToPyBufferImpl { auto *dst_ptr = static_cast(dst_tensor.mutable_data( tensor.dims(), platform::CPUPlace())); - platform::DeviceContextPool &pool = platform::DeviceContextPool::Get(); + platform::DeviceContextPool &pool = + platform::DeviceContextPool::Instance(); auto dev_ctx = static_cast( - pool.Borrow(tensor.place())); + pool.Get(tensor.place())); paddle::platform::GpuMemcpyAsync( dst_ptr, src_ptr, sizeof(CUR_TYPE) * tensor.numel(), @@ -137,9 +138,9 @@ void PyCUDATensorSetFromArray( self.Resize(framework::make_ddim(dims)); auto *dst = self.mutable_data(place); - platform::DeviceContextPool &pool = platform::DeviceContextPool::Get(); + platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance(); auto dev_ctx = - static_cast(pool.Borrow(place)); + static_cast(pool.Get(place)); paddle::platform::GpuMemcpyAsync(dst, array.data(), sizeof(T) * array.size(), cudaMemcpyHostToDevice, 
dev_ctx->stream()); } From ba4322f4dda9e7fb91eab0245883c0b16c707ab4 Mon Sep 17 00:00:00 2001 From: Qiao Longfei Date: Thu, 28 Dec 2017 10:12:36 +0800 Subject: [PATCH 103/181] fix clang error (#7084) --- paddle/framework/data_transform_test.cc | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) diff --git a/paddle/framework/data_transform_test.cc b/paddle/framework/data_transform_test.cc index 0825673546..4e2141ecd2 100644 --- a/paddle/framework/data_transform_test.cc +++ b/paddle/framework/data_transform_test.cc @@ -31,18 +31,16 @@ using namespace platform; * 1111 -> FP64, GPUPlace, kNCHW, kMKLDNN */ -std::array kDataType = {proto::DataType::FP32, - proto::DataType::FP64}; +std::array kDataType = { + {proto::DataType::FP32, proto::DataType::FP64}}; -std::array kPlace = {CPUPlace(), CUDAPlace(0)}; +std::array kPlace = {{CPUPlace(), CUDAPlace(0)}}; std::array kDataLayout = { - DataLayout::kNHWC, DataLayout::kNCHW, -}; + {DataLayout::kNHWC, DataLayout::kNCHW}}; std::array kLibraryType = { - LibraryType::kPlain, LibraryType::kMKLDNN, -}; + {LibraryType::kPlain, LibraryType::kMKLDNN}}; OpKernelType GenFromBit(const std::vector bits) { return OpKernelType(kDataType[bits[0]], kPlace[bits[1]], kDataLayout[bits[2]], From f5c2d175ae105e8938e8343068eff31db5745c19 Mon Sep 17 00:00:00 2001 From: Yang Yu Date: Thu, 28 Dec 2017 10:25:18 +0800 Subject: [PATCH 104/181] Refine --- paddle/framework/executor.cc | 9 +++++---- paddle/framework/tensor_impl.h | 13 +++++++++++-- paddle/framework/variable.h | 1 + paddle/operators/fill_constant_op.cc | 1 + paddle/operators/shrink_rnn_memory_op.cc | 5 +++-- paddle/operators/sum_op.h | 4 +--- paddle/operators/tensor_array_read_write_op.cc | 2 +- paddle/operators/while_op.cc | 13 +++++++++++++ 8 files changed, 36 insertions(+), 12 deletions(-) diff --git a/paddle/framework/executor.cc b/paddle/framework/executor.cc index 9ee2ddb7c3..fe9a42ace0 100644 --- a/paddle/framework/executor.cc +++ b/paddle/framework/executor.cc @@ -59,15 +59,16 @@ static void CreateTensor(Variable* var, proto::VarDesc::VarType var_type) { static void CheckTensorNANOrInf(const std::string& name, const framework::Tensor& tensor) { - if (tensor.type().hash_code() != typeid(float).hash_code() && - tensor.type().hash_code() != typeid(double).hash_code()) { + if (tensor.memory_size() == 0) { return; } - if (tensor.memory_size() == 0) { + if (tensor.type().hash_code() != typeid(float).hash_code() && + tensor.type().hash_code() != typeid(double).hash_code()) { return; } PADDLE_ENFORCE(!framework::HasInf(tensor), "Tensor %s has Inf", name); - PADDLE_ENFORCE(!framework::HasNAN(tensor), "Tensor %s has NAN", name); + PADDLE_ENFORCE(!framework::HasNAN(tensor), "Tensor %s has NAN, %p", name, + &tensor); } void Executor::Run(const ProgramDesc& pdesc, Scope* scope, int block_id, diff --git a/paddle/framework/tensor_impl.h b/paddle/framework/tensor_impl.h index 6c6f298edc..0161ed8c47 100644 --- a/paddle/framework/tensor_impl.h +++ b/paddle/framework/tensor_impl.h @@ -134,8 +134,17 @@ inline void* Tensor::mutable_data(platform::Place place, std::type_index type) { #endif offset_ = 0; } - return reinterpret_cast(reinterpret_cast(holder_->ptr()) + - offset_); + void* buf = reinterpret_cast( + reinterpret_cast(holder_->ptr()) + offset_); + if (type.hash_code() == typeid(float).hash_code() || + type.hash_code() == typeid(double).hash_code()) { + float* tmp = (float*)(buf); + for (int64_t i = 0; i < numel(); ++i) { + tmp[i] = NAN; + } + } + + return buf; } inline void* 
Tensor::mutable_data(platform::Place place) { diff --git a/paddle/framework/variable.h b/paddle/framework/variable.h index e5a94759f9..3720393601 100644 --- a/paddle/framework/variable.h +++ b/paddle/framework/variable.h @@ -35,6 +35,7 @@ class Variable { template T* GetMutable() { if (!IsType()) { + VLOG(10) << "Resetting " << *this->name_; holder_.reset(new PlaceholderImpl(new T())); } return static_cast(holder_->Ptr()); diff --git a/paddle/operators/fill_constant_op.cc b/paddle/operators/fill_constant_op.cc index dcd43a30c8..196c380c73 100644 --- a/paddle/operators/fill_constant_op.cc +++ b/paddle/operators/fill_constant_op.cc @@ -51,6 +51,7 @@ class FillConstantOp : public framework::OperatorBase { platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance(); auto &dev_ctx = *pool.Get(dev_place); + VLOG(10) << "FillConstant to " << &out; math::set_constant(dev_ctx, &out, value); } }; diff --git a/paddle/operators/shrink_rnn_memory_op.cc b/paddle/operators/shrink_rnn_memory_op.cc index e5ef0740b6..9ef473e726 100644 --- a/paddle/operators/shrink_rnn_memory_op.cc +++ b/paddle/operators/shrink_rnn_memory_op.cc @@ -116,9 +116,10 @@ class ShrinkRNNMemoryGradOp : public ArrayOp { auto height = dout_tensor.dims()[0]; auto slice = dx_tensor.Slice(0, static_cast(height)); framework::CopyFrom(dout_tensor, dout_tensor.place(), dev_ctx, &slice); - if (dx_tensor.dims()[0] < height) { + VLOG(10) << dx_tensor.dims()[0] << ", " << height; + if (dx_tensor.dims()[0] > height) { auto rest_tensor = dx_tensor.Slice( - static_cast(height), static_cast(dout_tensor.dims()[0])); + static_cast(height), static_cast(dx_tensor.dims()[0])); math::set_constant(dev_ctx, &rest_tensor, 0.0f); } } diff --git a/paddle/operators/sum_op.h b/paddle/operators/sum_op.h index eaa36aa1ae..d1277d3edd 100644 --- a/paddle/operators/sum_op.h +++ b/paddle/operators/sum_op.h @@ -38,11 +38,9 @@ class SumKernel : public framework::OpKernel { if (out_var->IsType()) { auto *out = context.Output("Out"); - out->mutable_data(context.GetPlace()); - auto result = EigenVector::Flatten(*out); - if (!in_place) { + out->mutable_data(context.GetPlace()); math::SetConstant constant_functor; constant_functor(context.template device_context(), out, 0.0); diff --git a/paddle/operators/tensor_array_read_write_op.cc b/paddle/operators/tensor_array_read_write_op.cc index 53e38ec703..d5ff3e3fce 100644 --- a/paddle/operators/tensor_array_read_write_op.cc +++ b/paddle/operators/tensor_array_read_write_op.cc @@ -130,9 +130,9 @@ class ReadFromArrayOp : public ArrayOp { auto &x_array = x->Get(); auto *out = scope.FindVar(Output("Out")); PADDLE_ENFORCE(out != nullptr, "Out must be set"); - auto *out_tensor = out->GetMutable(); size_t offset = GetOffset(scope, place); if (offset < x_array.size()) { + auto *out_tensor = out->GetMutable(); platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance(); auto &dev_ctx = *pool.Get(place); diff --git a/paddle/operators/while_op.cc b/paddle/operators/while_op.cc index 728ef60794..322270c829 100644 --- a/paddle/operators/while_op.cc +++ b/paddle/operators/while_op.cc @@ -194,14 +194,27 @@ class WhileGradOp : public framework::OperatorBase { } } + auto check_var_no_nan = [](const framework::Scope &scope, + const std::string &var_name) { + auto *var = scope.FindVar(var_name); + if (var->IsType()) { + VLOG(10) << "Checking " << var_name; + PADDLE_ENFORCE(!framework::HasNAN(var->Get()), + "%s has NAN", var_name); + } + }; + check_var_no_nan(cur_scope, inside_grad_name); auto new_inside_name = 
cur_scope.Rename(inside_grad_name); + check_var_no_nan(cur_scope, new_inside_name); auto sum_op = framework::OpRegistry::CreateOp( "sum", {{"X", {pg_names[param_id], new_inside_name}}}, {{"Out", {pg_names[param_id]}}}, framework::AttributeMap{}); sum_op->Run(cur_scope, dev_place); + check_var_no_nan(cur_scope, pg_names[param_id]); cur_scope.Rename(new_inside_name, inside_grad_name); } } + VLOG(1) << "Complete WhileOpGrad"; } }; From 003917d881fec0192e97bae19abb41599c6b0083 Mon Sep 17 00:00:00 2001 From: Yang Yu Date: Thu, 28 Dec 2017 10:34:04 +0800 Subject: [PATCH 105/181] Fix compile --- paddle/platform/device_context.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/paddle/platform/device_context.h b/paddle/platform/device_context.h index fd441d27f9..2b366e6383 100644 --- a/paddle/platform/device_context.h +++ b/paddle/platform/device_context.h @@ -100,7 +100,7 @@ class CUDADeviceContext : public DeviceContext { template <> struct DefaultDeviceContextType { - using T = CUDADeviceContext; + using TYPE = CUDADeviceContext; }; class CUDNNDeviceContext : public CUDADeviceContext { From de26ae416cce48705c930fcbeecd4e556e57a420 Mon Sep 17 00:00:00 2001 From: chengduoZH Date: Wed, 27 Dec 2017 23:51:23 +0800 Subject: [PATCH 106/181] add gpu code --- paddle/operators/cos_sim_op.cc | 50 +++++++++----------------- paddle/operators/cos_sim_op.cu | 66 +++++++++++++++++----------------- paddle/operators/cos_sim_op.h | 20 +++++------ 3 files changed, 59 insertions(+), 77 deletions(-) diff --git a/paddle/operators/cos_sim_op.cc b/paddle/operators/cos_sim_op.cc index 80e0780030..77492e60f2 100644 --- a/paddle/operators/cos_sim_op.cc +++ b/paddle/operators/cos_sim_op.cc @@ -151,42 +151,26 @@ class CosSimOpGrad : public framework::OperatorWithKernel { template struct CosSimDyFunctor { - CosSimDyFunctor(const T* x_norm, const T* y_norm, const T* x, const T* y, - const T* z, const T* dz, T* dy, int cols) - : x_norm_(x_norm), - y_norm_(y_norm), - x_(x), - y_(y), - z_(z), - dz_(dz), - dy_(dy), - cols_(static_cast(cols)) {} - - inline HOSTDEVICE void operator()(size_t offset) const { - auto xy_norm_prod = x_norm_[offset] * y_norm_[0]; - auto dz = dz_[offset]; - auto z = z_[offset]; - auto* x = x_ + cols_ * offset; - auto reciprocal_xy_norm_prod = 1 / xy_norm_prod; - - auto y_norm_square = y_norm_[0] * y_norm_[0]; - auto reciprocal_y_norm_square = 1 / y_norm_square; - for (size_t i = 0; i < cols_; ++i) { - dy_[i] += dz * (x[i] * reciprocal_xy_norm_prod - - z * y_[i] * reciprocal_y_norm_square); + inline void operator()(const platform::CPUDeviceContext& ctx, const T* x_norm, + const T* y_norm, const T* x, const T* y, const T* z, + const T* dz, const size_t rows, const size_t cols, + T* dy) const { + for (size_t offset = 0; offset < rows; ++offset) { + auto xy_norm_prod = x_norm[offset] * y_norm[0]; + auto dz_data = dz[offset]; + auto z_data = z[offset]; + auto* x_data = x + cols * offset; + auto reciprocal_xy_norm_prod = 1 / xy_norm_prod; + + auto y_norm_square = y_norm[0] * y_norm[0]; + auto reciprocal_y_norm_square = 1 / y_norm_square; + for (size_t i = 0; i < cols; ++i) { + dy[i] += dz_data * (x_data[i] * reciprocal_xy_norm_prod - + z_data * y[i] * reciprocal_y_norm_square); + } } } - - const T* x_norm_; - const T* y_norm_; - const T* x_; - const T* y_; - const T* z_; - const T* dz_; - T* dy_; - const size_t cols_; }; - } // namespace operators } // namespace paddle diff --git a/paddle/operators/cos_sim_op.cu b/paddle/operators/cos_sim_op.cu index 88f49c1b14..42194d7a05 100644 --- 
a/paddle/operators/cos_sim_op.cu +++ b/paddle/operators/cos_sim_op.cu @@ -20,45 +20,45 @@ namespace paddle { namespace operators { template -struct CosSimDyFunctor { - CosSimDyFunctor(const T* x_norm, const T* y_norm, const T* x, const T* y, - const T* z, const T* dz, T* dy, int cols) - : x_norm_(x_norm), - y_norm_(y_norm), - x_(x), - y_(y), - z_(z), - dz_(dz), - dy_(dy), - cols_(static_cast(cols)) {} - - inline HOSTDEVICE void operator()(size_t offset) const { - auto xy_norm_prod = x_norm_[offset] * y_norm_[0]; - auto dz = dz_[offset]; - auto z = z_[offset]; - auto* x = x_ + cols_ * offset; - auto reciprocal_xy_norm_prod = 1 / xy_norm_prod; +__global__ void CosSimDyKernel(const T* x_norm, const T* y_norm, const T* x, + const T* y, const T* z, const T* dz, + const size_t rows, const size_t cols, T* dy) { + int grid_size = blockDim.x * gridDim.x; + T y_norm_data = y_norm[0]; + for (int offset = blockIdx.x * blockDim.x + threadIdx.x; offset < rows; + offset += grid_size) { + T xy_norm_prod = x_norm[offset] * y_norm_data; + T dz_data = dz[offset]; + T z_data = z[offset]; + const T* x_data = x + cols * offset; + T reciprocal_xy_norm_prod = 1 / xy_norm_prod; - auto y_norm_square = y_norm_[0] * y_norm_[0]; - auto reciprocal_y_norm_square = 1 / y_norm_square; - for (size_t i = 0; i < cols_; ++i) { - T dy = dz * (x[i] * reciprocal_xy_norm_prod - - z * y_[i] * reciprocal_y_norm_square); - // platform::CudaAtomicAdd(dy_ + i, dy); - dy_[i] += dy; + T y_norm_square = y_norm_data * y_norm_data; + T reciprocal_y_norm_square = 1 / y_norm_square; + for (size_t i = 0; i < cols; ++i) { + T dy_data = dz_data * (x_data[i] * reciprocal_xy_norm_prod - + z_data * y[i] * reciprocal_y_norm_square); + platform::CudaAtomicAdd(dy + i, dy_data); } } +} - const T* x_norm_; - const T* y_norm_; - const T* x_; - const T* y_; - const T* z_; - const T* dz_; - T* dy_; - const size_t cols_; +template +struct CosSimDyFunctor { + inline void operator()(const platform::CUDADeviceContext& ctx, + const T* x_norm, const T* y_norm, const T* x, + const T* y, const T* z, const T* dz, const size_t rows, + const size_t cols, T* dy) const { + const int block_size = 512; + dim3 threads(block_size, 1); + dim3 grid(1, (rows + block_size - 1) / block_size); + CosSimDyKernel<<>>( + x_norm, y_norm, x, y, z, dz, rows, cols, dy); + } }; +template struct CosSimDyFunctor; + } // namespace operators } // namespace paddle diff --git a/paddle/operators/cos_sim_op.h b/paddle/operators/cos_sim_op.h index bb7c893a29..a913e576f9 100644 --- a/paddle/operators/cos_sim_op.h +++ b/paddle/operators/cos_sim_op.h @@ -193,9 +193,10 @@ struct CosSimDxFunctor { template struct CosSimDyFunctor { - CosSimDyFunctor(const T* x_norm, const T* y_norm, const T* x, const T* y, - const T* z, const T* dz, T* dy, int cols); - inline HOSTDEVICE void operator()(size_t) const; + inline void operator()(const DeviceContext& ctx, const T* x_norm, + const T* y_norm, const T* x, const T* y, const T* z, + const T* dz, const size_t rows, const size_t cols, + T* dy) const; }; template @@ -255,14 +256,11 @@ class CosSimGradKernel : public framework::OpKernel { auto& dev_ctx = context.template device_context(); set_zero(dev_ctx, out_grad_y, static_cast(0)); - CosSimDyFunctor functor( - in_x_norm->data(), in_y_norm->data(), in_x->data(), - in_y->data(), in_z->data(), in_grad_z->data(), - out_grad_y->data(), cols); - platform::ForRange for_range( - static_cast(context.device_context()), - rows_x); - for_range(functor); + CosSimDyFunctor functor; + functor(dev_ctx, in_x_norm->data(), 
in_y_norm->data(), + in_x->data(), in_y->data(), in_z->data(), + in_grad_z->data(), static_cast(rows_x), + static_cast(cols), out_grad_y->data()); } } } From 878d2e919c5c15fabc659ed544da3b867272f0d2 Mon Sep 17 00:00:00 2001 From: Yang Yu Date: Thu, 28 Dec 2017 10:52:41 +0800 Subject: [PATCH 107/181] Fix compile --- paddle/framework/CMakeLists.txt | 6 +- paddle/framework/tensor_util.cc | 115 ++++++++++++++++++++++++++++++++ paddle/framework/tensor_util.cu | 1 + paddle/framework/tensor_util.h | 96 +------------------------- 4 files changed, 123 insertions(+), 95 deletions(-) create mode 100644 paddle/framework/tensor_util.cc create mode 120000 paddle/framework/tensor_util.cu diff --git a/paddle/framework/CMakeLists.txt b/paddle/framework/CMakeLists.txt index 738684795d..f72f49bc5e 100644 --- a/paddle/framework/CMakeLists.txt +++ b/paddle/framework/CMakeLists.txt @@ -5,7 +5,11 @@ cc_library(ddim SRCS ddim.cc DEPS eigen3) cc_test(ddim_test SRCS ddim_test.cc DEPS ddim) nv_test(dim_test SRCS dim_test.cu DEPS ddim) -cc_library(tensor SRCS tensor.cc DEPS ddim place paddle_memory device_context) +if (WITH_GPU) + nv_binary(tensor SRCS tensor.cc tensor_util.cu DEPS ddim place paddle_memory device_context) +else() + cc_library(tensor SRCS tensor.cc tensor_util.cc DEPS ddim place paddle_memory device_context) +endif () cc_test(tensor_test SRCS tensor_test.cc DEPS tensor) cc_test(tensor_util_test SRCS tensor_util_test.cc DEPS tensor) diff --git a/paddle/framework/tensor_util.cc b/paddle/framework/tensor_util.cc new file mode 100644 index 0000000000..293c65a065 --- /dev/null +++ b/paddle/framework/tensor_util.cc @@ -0,0 +1,115 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
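One note on the CUDA kernel above: every row accumulates into the same shared dy vector, so a plain read-modify-write from concurrent threads would drop updates. A minimal sketch of the race-free pattern it uses — a grid-stride loop over rows plus atomic adds, with plain ::atomicAdd standing in for paddle's CudaAtomicAdd wrapper:

// Each thread walks a strided subset of rows; all threads add into the
// single shared dy vector, so the accumulation must be atomic.
__global__ void AccumulateRows(const float* contrib, int rows, int cols,
                               float* dy) {
  int stride = blockDim.x * gridDim.x;
  for (int r = blockIdx.x * blockDim.x + threadIdx.x; r < rows; r += stride) {
    for (int i = 0; i < cols; ++i) {
      atomicAdd(&dy[i], contrib[r * cols + i]);
    }
  }
}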
*/ + +#include "paddle/framework/tensor_util.h" + +namespace paddle { +namespace framework { +template +struct AnyDTypeVisitor { + Predicate predicate_; + const Tensor& tensor_; + const DevCtx& ctx_; + Tensor* out_; + + AnyDTypeVisitor(Predicate predicate, const Tensor& tensor, const DevCtx& ctx, + Tensor* out) + : predicate_(predicate), tensor_(tensor), ctx_(ctx), out_(out) {} + + template + void operator()() const { + auto t = EigenVector::Flatten(tensor_); + auto o = EigenScalar::From(*out_); + o.device(*ctx_.eigen_device()) = predicate_(t).any(); + } +}; + +template +inline void AnyImpl(Predicate predicate, const framework::Tensor& tensor, + const DevCtx& ctx, framework::Tensor* out) { + VisitDataType(ToDataType(tensor.type()), AnyDTypeVisitor( + predicate, tensor, ctx, out)); +} + +template +struct AnyVisitor : public boost::static_visitor { + const framework::Tensor& tensor_; + Predicate predicate_; + + AnyVisitor(const framework::Tensor& tensor, Predicate predicate) + : tensor_(tensor), predicate_(std::move(predicate)) {} + + template + bool operator()(const Place& place) const { + framework::Tensor out; + out.Resize({1}); + out.mutable_data(place); + auto* ctx = platform::DeviceContextPool::Instance().GetByPlace(place); + AnyImpl(predicate_, tensor_, *ctx, &out); + return this->GetResult(out, place); + } + + bool GetResult(const framework::Tensor& out, + const platform::CUDAPlace& gpu) const { + platform::CPUPlace cpu; + framework::Tensor tmp; + tmp.Resize({1}); + tmp.mutable_data(cpu); + platform::DeviceContextPool::Instance().Get(gpu)->Wait(); + CopyFrom(out, cpu, &tmp); + platform::DeviceContextPool::Instance().Get(gpu)->Wait(); + return GetResult(tmp, cpu); + } + + bool GetResult(const framework::Tensor& out, + const platform::CPUPlace& cpu) const { + return *out.data(); + } +}; + +template +inline bool Any(const framework::Tensor& tensor, Predicate predicate) { + AnyVisitor visitor(tensor, predicate); + auto place = tensor.place(); + return platform::VisitPlace(place, visitor); +} + +struct HasNANPredicate { + template + auto operator()(const T& eigen_vec) const + -> decltype(std::declval().isnan()) { + return eigen_vec.isnan(); + } +}; + +bool HasNAN(const framework::Tensor& tensor) { + HasNANPredicate predicate; + return Any(tensor, predicate); +} + +struct HasInfPredicate { + template + auto operator()(const T& eigen_vec) const + -> decltype(std::declval().isinf()) { + return eigen_vec.isinf(); + } +}; + +bool HasInf(const framework::Tensor& tensor) { + HasInfPredicate predicate; + return Any(tensor, predicate); +} + +} // namespace framework +} // namespace paddle diff --git a/paddle/framework/tensor_util.cu b/paddle/framework/tensor_util.cu new file mode 120000 index 0000000000..b00e6e59d9 --- /dev/null +++ b/paddle/framework/tensor_util.cu @@ -0,0 +1 @@ +./tensor_util.cc \ No newline at end of file diff --git a/paddle/framework/tensor_util.h b/paddle/framework/tensor_util.h index 7d786ad614..e71d8e5672 100644 --- a/paddle/framework/tensor_util.h +++ b/paddle/framework/tensor_util.h @@ -208,100 +208,8 @@ inline void CopyToVector(const Tensor& src, std::vector* dst) { src_ptr, size); } -template -struct AnyDTypeVisitor { - Predicate predicate_; - const Tensor& tensor_; - const DevCtx& ctx_; - Tensor* out_; - - AnyDTypeVisitor(Predicate predicate, const Tensor& tensor, const DevCtx& ctx, - Tensor* out) - : predicate_(predicate), tensor_(tensor), ctx_(ctx), out_(out) {} - - template - void operator()() const { - auto t = EigenVector::Flatten(tensor_); - auto o = 
EigenScalar::From(*out_); - o.device(*ctx_.eigen_device()) = predicate_(t).any(); - } -}; - -template -inline void AnyImpl(Predicate predicate, const framework::Tensor& tensor, - const DevCtx& ctx, framework::Tensor* out) { - VisitDataType(ToDataType(tensor.type()), AnyDTypeVisitor( - predicate, tensor, ctx, out)); -} - -template -struct AnyVisitor : public boost::static_visitor { - const framework::Tensor& tensor_; - Predicate predicate_; - - AnyVisitor(const framework::Tensor& tensor, Predicate predicate) - : tensor_(tensor), predicate_(std::move(predicate)) {} - - template - bool operator()(const Place& place) const { - framework::Tensor out; - out.Resize({1}); - out.mutable_data(place); - auto* ctx = platform::DeviceContextPool::Instance().GetByPlace(place); - AnyImpl(predicate_, tensor_, *ctx, &out); - return this->GetResult(out, place); - } - - bool GetResult(const framework::Tensor& out, - const platform::CUDAPlace& gpu) const { - platform::CPUPlace cpu; - framework::Tensor tmp; - tmp.Resize({1}); - tmp.mutable_data(cpu); - platform::DeviceContextPool::Instance().Get(gpu)->Wait(); - CopyFrom(out, cpu, &tmp); - platform::DeviceContextPool::Instance().Get(gpu)->Wait(); - return GetResult(tmp, cpu); - } - - bool GetResult(const framework::Tensor& out, - const platform::CPUPlace& cpu) const { - return *out.data(); - } -}; - -template -inline bool Any(const framework::Tensor& tensor, Predicate predicate) { - AnyVisitor visitor(tensor, predicate); - auto place = tensor.place(); - return platform::VisitPlace(place, visitor); -} - -struct HasNANPredicate { - template - auto operator()(const T& eigen_vec) const - -> decltype(std::declval().isnan()) { - return eigen_vec.isnan(); - } -}; - -inline bool HasNAN(const framework::Tensor& tensor) { - HasNANPredicate predicate; - return Any(tensor, predicate); -} - -struct HasInfPredicate { - template - auto operator()(const T& eigen_vec) const - -> decltype(std::declval().isinf()) { - return eigen_vec.isinf(); - } -}; - -inline bool HasInf(const framework::Tensor& tensor) { - HasInfPredicate predicate; - return Any(tensor, predicate); -} +extern bool HasNAN(const framework::Tensor& tensor); +extern bool HasInf(const framework::Tensor& tensor); } // namespace framework } // namespace paddle From a9a44e017c4b38cd7105365dd1ee3916fe3889ce Mon Sep 17 00:00:00 2001 From: Yang Yu Date: Thu, 28 Dec 2017 10:53:39 +0800 Subject: [PATCH 108/181] Fix compile --- paddle/framework/CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/paddle/framework/CMakeLists.txt b/paddle/framework/CMakeLists.txt index f72f49bc5e..2af10a996c 100644 --- a/paddle/framework/CMakeLists.txt +++ b/paddle/framework/CMakeLists.txt @@ -6,7 +6,7 @@ cc_test(ddim_test SRCS ddim_test.cc DEPS ddim) nv_test(dim_test SRCS dim_test.cu DEPS ddim) if (WITH_GPU) - nv_binary(tensor SRCS tensor.cc tensor_util.cu DEPS ddim place paddle_memory device_context) + nv_library(tensor SRCS tensor.cc tensor_util.cu DEPS ddim place paddle_memory device_context) else() cc_library(tensor SRCS tensor.cc tensor_util.cc DEPS ddim place paddle_memory device_context) endif () From 717e125213db7bf5af62ed1fc857d1ee14b8660c Mon Sep 17 00:00:00 2001 From: Luo Tao Date: Thu, 28 Dec 2017 11:11:18 +0800 Subject: [PATCH 109/181] fix some warning --- paddle/platform/for_range.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/paddle/platform/for_range.h b/paddle/platform/for_range.h index 5427aa2823..694a66d9ac 100644 --- a/paddle/platform/for_range.h +++ 
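The tensor_util change above follows a standard split: the Eigen visitor templates live in a single source file (tensor_util.cc, with tensor_util.cu created as a symlink so nvcc compiles the identical source in the WITH_GPU build), and the header keeps only plain declarations. A stand-alone sketch of that declaration/definition split, using a simplified Tensor stand-in rather than the real framework::Tensor:

#include <cmath>
#include <vector>

// Header side: a plain declaration, so includers never see the visitor
// machinery. (This Tensor is a stand-in for illustration only.)
struct Tensor {
  std::vector<float> data;
};
extern bool HasNAN(const Tensor& tensor);

// Implementation side: one definition in the .cc translation unit (and,
// via the symlink, the .cu one).
bool HasNAN(const Tensor& tensor) {
  for (float v : tensor.data) {
    if (std::isnan(v)) return true;
  }
  return false;
}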
b/paddle/platform/for_range.h @@ -62,7 +62,7 @@ struct ForRange { template inline void operator()(Function func) const { - constexpr size_t num_threads = 1024; + constexpr int num_threads = 1024; int block_size = limit_ <= num_threads ? limit_ : num_threads; int grid_size = (limit_ + num_threads - 1) / num_threads; From 73aadb6679a7a398e8f10e1b114d1e23bf767acb Mon Sep 17 00:00:00 2001 From: Yancey Date: Thu, 28 Dec 2017 12:14:52 +0800 Subject: [PATCH 110/181] install python tk (#7095) --- paddle/scripts/docker/build.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/paddle/scripts/docker/build.sh b/paddle/scripts/docker/build.sh index e43b9c218a..92039ec6b0 100644 --- a/paddle/scripts/docker/build.sh +++ b/paddle/scripts/docker/build.sh @@ -178,7 +178,7 @@ EOF # run paddle version to install python packages first RUN apt-get update &&\ ${NCCL_DEPS}\ - apt-get install -y wget python-pip dmidecode && pip install -U pip && \ + apt-get install -y wget python-pip dmidecode python-tk && pip install -U pip && \ pip install /*.whl; apt-get install -f -y && \ apt-get clean -y && \ rm -f /*.whl && \ From 1a685144bb06f1064d23f8d8d9d5e597a4ae66c3 Mon Sep 17 00:00:00 2001 From: sweetsky0901 Date: Thu, 28 Dec 2017 12:29:17 +0800 Subject: [PATCH 111/181] for xxYY to xx_yy --- paddle/operators/detection_output_op.cc | 7 +- paddle/operators/detection_output_op.cu.cc | 4 +- paddle/operators/detection_output_op.h | 22 +++--- paddle/operators/math/detection_util.h | 80 +++++++++++----------- 4 files changed, 56 insertions(+), 57 deletions(-) diff --git a/paddle/operators/detection_output_op.cc b/paddle/operators/detection_output_op.cc index ae807d2810..ea44cd3267 100644 --- a/paddle/operators/detection_output_op.cc +++ b/paddle/operators/detection_output_op.cc @@ -18,8 +18,7 @@ namespace operators { class DetectionOutputOpMaker : public framework::OpProtoAndCheckerMaker { public: - DetectionOutputOpMaker(framework::OpProto* proto, - framework::OpAttrChecker* op_checker) + DetectionOutputOpMaker(OpProto* proto, OpAttrChecker* op_checker) : OpProtoAndCheckerMaker(proto, op_checker) { AddInput("Loc", "(Tensor) The input tensor of detection_output operator." @@ -86,5 +85,5 @@ REGISTER_OP_WITHOUT_GRADIENT(detection_output, ops::DetectionOutputOp, ops::DetectionOutputOpMaker); REGISTER_OP_CPU_KERNEL( detection_output, - ops::Detection_output_Kernel, - ops::Detection_output_Kernel); + ops::DetectionOutputKernel, + ops::DetectionOutputKernel); diff --git a/paddle/operators/detection_output_op.cu.cc b/paddle/operators/detection_output_op.cu.cc index e65b2afd21..4a6560e049 100644 --- a/paddle/operators/detection_output_op.cu.cc +++ b/paddle/operators/detection_output_op.cu.cc @@ -17,5 +17,5 @@ limitations under the License. 
*/ namespace ops = paddle::operators; REGISTER_OP_CUDA_KERNEL( detection_output, - ops::Detection_output_Kernel, - ops::Detection_output_Kernel); + ops::DetectionOutputKernel, + ops::DetectionOutputKernel); diff --git a/paddle/operators/detection_output_op.h b/paddle/operators/detection_output_op.h index 733ec3b0ed..c0a4e6a3a2 100644 --- a/paddle/operators/detection_output_op.h +++ b/paddle/operators/detection_output_op.h @@ -48,7 +48,7 @@ inline void transpose_fun(const framework::ExecutionContext& context, } } template -class Detection_output_Kernel : public framework::OpKernel { +class DetectionOutputKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& context) const override { const framework::Tensor* in_loc = context.Input("Loc"); @@ -119,22 +119,22 @@ class Detection_output_Kernel : public framework::OpKernel { size_t prior_offset = i * 8; size_t loc_pred_offset = n * num_priors * 4 + i * 4; std::vector> prior_bbox_vec; - math::getBBoxFromPriorData(priorbox_data + prior_offset, 1, - prior_bbox_vec); + math::get_bbox_from_priorData(priorbox_data + prior_offset, 1, + prior_bbox_vec); std::vector> prior_bbox_var; - math::getBBoxVarFromPriorData(priorbox_data + prior_offset, 1, - prior_bbox_var); + math::get_bbox_var_from_prior_data(priorbox_data + prior_offset, 1, + prior_bbox_var); std::vector loc_pred_data; for (size_t j = 0; j < 4; ++j) loc_pred_data.push_back(*(loc_data + loc_pred_offset + j)); - math::BBox bbox = math::decodeBBoxWithVar( + math::BBox bbox = math::decode_bbox_with_var( prior_bbox_vec[0], prior_bbox_var[0], loc_pred_data); decoded_bboxes.push_back(bbox); } all_decoded_bboxes.push_back(decoded_bboxes); } std::vector>> all_indices; - int num_kept = math::getDetectionIndices( + int num_kept = math::get_detection_indices( conf_data, num_priors, num_classes, background_label_id, batch_size, confidence_threshold, nms_top_k, nms_threshold, top_k, all_decoded_bboxes, &all_indices); @@ -154,11 +154,11 @@ class Detection_output_Kernel : public framework::OpKernel { out_cpu.mutable_data(out->dims(), platform::CPUPlace()); out_data = out_cpu.data(); } - math::getDetectionOutput(conf_data, num_kept, num_priors, num_classes, - batch_size, all_indices, all_decoded_bboxes, - out_data); + math::get_detection_output(conf_data, num_kept, num_priors, num_classes, + batch_size, all_indices, all_decoded_bboxes, + out_data); if (platform::is_gpu_place(context.GetPlace())) { - framework::CopyFrom(out_cpu, platform::GPUPlace(), + framework::CopyFrom(out_cpu, platform::CUDAPlace(), context.device_context(), out); } } diff --git a/paddle/operators/math/detection_util.h b/paddle/operators/math/detection_util.h index b671f7b517..d1ae0e6343 100644 --- a/paddle/operators/math/detection_util.h +++ b/paddle/operators/math/detection_util.h @@ -51,31 +51,31 @@ struct BBox { // KNCHW ==> NHWC // template template -void getBBoxFromPriorData(const T* prior_data, const size_t num_bboxes, - std::vector>& bbox_vec); +void get_bbox_from_priorData(const T* prior_data, const size_t num_bboxes, + std::vector>& bbox_vec); template -void getBBoxVarFromPriorData(const T* prior_data, const size_t num, - std::vector>& var_vec); +void get_bbox_var_from_prior_data(const T* prior_data, const size_t num, + std::vector>& var_vec); template -BBox decodeBBoxWithVar(BBox& prior_bbox, - const std::vector& prior_bbox_var, - const std::vector& loc_pred_data); +BBox decode_bbox_with_var(BBox& prior_bbox, + const std::vector& prior_bbox_var, + const std::vector& loc_pred_data); template 
-bool sortScorePairDescend(const std::pair& pair1, - const std::pair& pair2); +bool sort_score_pair_descend(const std::pair& pair1, + const std::pair& pair2); template -bool sortScorePairDescend(const std::pair>& pair1, - const std::pair>& pair2); +bool sort_score_pair_descend(const std::pair>& pair1, + const std::pair>& pair2); template -T jaccardOverlap(const BBox& bbox1, const BBox& bbox2); +T jaccard_overlap(const BBox& bbox1, const BBox& bbox2); template -void applyNMSFast(const std::vector>& bboxes, const T* conf_score_data, - size_t class_idx, size_t top_k, T conf_threshold, - T nms_threshold, size_t num_priors, size_t num_classes, - std::vector* indices); +void apply_nms_fast(const std::vector>& bboxes, + const T* conf_score_data, size_t class_idx, size_t top_k, + T conf_threshold, T nms_threshold, size_t num_priors, + size_t num_classes, std::vector* indices); template -int getDetectionIndices( +int get_detection_indices( const T* conf_data, const size_t num_priors, const size_t num_classes, const size_t background_label_id, const size_t batch_size, const T conf_threshold, const size_t nms_top_k, const T nms_threshold, @@ -85,14 +85,14 @@ int getDetectionIndices( template BBox clipBBox(const BBox& bbox); template -void getDetectionOutput( +void get_detection_output( const T* conf_data, const size_t num_kept, const size_t num_priors, const size_t num_classes, const size_t batch_size, const std::vector>>& all_indices, const std::vector>>& all_decoded_bboxes, T* out_data); template -void getBBoxFromPriorData(const T* prior_data, const size_t num_bboxes, - std::vector>& bbox_vec) { +void get_bbox_from_priorData(const T* prior_data, const size_t num_bboxes, + std::vector>& bbox_vec) { size_t out_offset = bbox_vec.size(); bbox_vec.resize(bbox_vec.size() + num_bboxes); for (size_t i = 0; i < num_bboxes; ++i) { @@ -105,8 +105,8 @@ void getBBoxFromPriorData(const T* prior_data, const size_t num_bboxes, } } template -void getBBoxVarFromPriorData(const T* prior_data, const size_t num, - std::vector>& var_vec) { +void get_bbox_var_from_prior_data(const T* prior_data, const size_t num, + std::vector>& var_vec) { size_t out_offset = var_vec.size(); var_vec.resize(var_vec.size() + num); for (size_t i = 0; i < num; ++i) { @@ -119,9 +119,9 @@ void getBBoxVarFromPriorData(const T* prior_data, const size_t num, } } template -BBox decodeBBoxWithVar(BBox& prior_bbox, - const std::vector& prior_bbox_var, - const std::vector& loc_pred_data) { +BBox decode_bbox_with_var(BBox& prior_bbox, + const std::vector& prior_bbox_var, + const std::vector& loc_pred_data) { T prior_bbox_width = prior_bbox.get_width(); T prior_bbox_height = prior_bbox.get_height(); T prior_bbox_center_x = prior_bbox.get_center_x(); @@ -147,12 +147,12 @@ BBox decodeBBoxWithVar(BBox& prior_bbox, return decoded_bbox; } template -bool sortScorePairDescend(const std::pair& pair1, - const std::pair& pair2) { +bool sort_score_pair_descend(const std::pair& pair1, + const std::pair& pair2) { return pair1.first > pair2.first; } template -T jaccardOverlap(const BBox& bbox1, const BBox& bbox2) { +T jaccard_overlap(const BBox& bbox1, const BBox& bbox2) { if (bbox2.x_min > bbox1.x_max || bbox2.x_max < bbox1.x_min || bbox2.y_min > bbox1.y_max || bbox2.y_max < bbox1.y_min) { return 0.0; @@ -174,10 +174,10 @@ T jaccardOverlap(const BBox& bbox1, const BBox& bbox2) { } template -void applyNMSFast(const std::vector>& bboxes, const T* conf_score_data, - size_t class_idx, size_t top_k, T conf_threshold, - T nms_threshold, size_t num_priors, size_t 
num_classes, - std::vector* indices) { +void apply_nms_fast(const std::vector>& bboxes, + const T* conf_score_data, size_t class_idx, size_t top_k, + T conf_threshold, T nms_threshold, size_t num_priors, + size_t num_classes, std::vector* indices) { std::vector> scores; for (size_t i = 0; i < num_priors; ++i) { size_t conf_offset = i * num_classes + class_idx; @@ -185,7 +185,7 @@ void applyNMSFast(const std::vector>& bboxes, const T* conf_score_data, scores.push_back(std::make_pair(conf_score_data[conf_offset], i)); } std::stable_sort(scores.begin(), scores.end(), - sortScorePairDescend); + sort_score_pair_descend); if (top_k > 0 && top_k < scores.size()) scores.resize(top_k); while (scores.size() > 0) { const size_t idx = scores.front().second; @@ -193,7 +193,7 @@ void applyNMSFast(const std::vector>& bboxes, const T* conf_score_data, for (size_t i = 0; i < indices->size(); ++i) { if (keep) { const size_t saved_idx = (*indices)[i]; - T overlap = jaccardOverlap(bboxes[idx], bboxes[saved_idx]); + T overlap = jaccard_overlap(bboxes[idx], bboxes[saved_idx]); keep = overlap <= nms_threshold; } else { break; @@ -204,7 +204,7 @@ void applyNMSFast(const std::vector>& bboxes, const T* conf_score_data, } } template -int getDetectionIndices( +int get_detection_indices( const T* conf_data, const size_t num_priors, const size_t num_classes, const size_t background_label_id, const size_t batch_size, const T conf_threshold, const size_t nms_top_k, const T nms_threshold, @@ -219,9 +219,9 @@ int getDetectionIndices( size_t conf_offset = n * num_priors * num_classes; for (size_t c = 0; c < num_classes; ++c) { if (c == background_label_id) continue; - applyNMSFast(decoded_bboxes, conf_data + conf_offset, c, nms_top_k, - conf_threshold, nms_threshold, num_priors, num_classes, - &(indices[c])); + apply_nms_fast(decoded_bboxes, conf_data + conf_offset, c, nms_top_k, + conf_threshold, nms_threshold, num_priors, num_classes, + &(indices[c])); num_detected += indices[c].size(); } if (top_k > 0 && num_detected > top_k) { @@ -237,7 +237,7 @@ int getDetectionIndices( } } std::sort(score_index_pairs.begin(), score_index_pairs.end(), - sortScorePairDescend>); + sort_score_pair_descend>); score_index_pairs.resize(top_k); std::map> new_indices; for (size_t i = 0; i < score_index_pairs.size(); ++i) { @@ -266,7 +266,7 @@ BBox clipBBox(const BBox& bbox) { return clipped_bbox; } template -void getDetectionOutput( +void get_detection_output( const T* conf_data, const size_t num_kept, const size_t num_priors, const size_t num_classes, const size_t batch_size, const std::vector>>& all_indices, From 96bc335216f418a8682e49f75ddaf50eedb71704 Mon Sep 17 00:00:00 2001 From: Yang Yu Date: Thu, 28 Dec 2017 12:49:02 +0800 Subject: [PATCH 112/181] Update --- paddle/framework/variable.h | 1 - paddle/operators/sum_op.h | 6 ++++-- python/paddle/v2/fluid/tests/test_dynrnn_gradient_check.py | 5 +++-- 3 files changed, 7 insertions(+), 5 deletions(-) diff --git a/paddle/framework/variable.h b/paddle/framework/variable.h index 3720393601..e5a94759f9 100644 --- a/paddle/framework/variable.h +++ b/paddle/framework/variable.h @@ -35,7 +35,6 @@ class Variable { template T* GetMutable() { if (!IsType()) { - VLOG(10) << "Resetting " << *this->name_; holder_.reset(new PlaceholderImpl(new T())); } return static_cast(holder_->Ptr()); diff --git a/paddle/operators/sum_op.h b/paddle/operators/sum_op.h index d1277d3edd..552b48f608 100644 --- a/paddle/operators/sum_op.h +++ b/paddle/operators/sum_op.h @@ -37,10 +37,12 @@ class SumKernel : public 
framework::OpKernel { bool in_place = out_var == in_vars[0]; if (out_var->IsType()) { - auto *out = context.Output("Out"); - auto result = EigenVector::Flatten(*out); + auto *out = context.Output("Out"); if (!in_place) { out->mutable_data(context.GetPlace()); + } + auto result = EigenVector::Flatten(*out); + if (!in_place) { math::SetConstant constant_functor; constant_functor(context.template device_context(), out, 0.0); diff --git a/python/paddle/v2/fluid/tests/test_dynrnn_gradient_check.py b/python/paddle/v2/fluid/tests/test_dynrnn_gradient_check.py index 7f61b966fd..238fd1a8cb 100644 --- a/python/paddle/v2/fluid/tests/test_dynrnn_gradient_check.py +++ b/python/paddle/v2/fluid/tests/test_dynrnn_gradient_check.py @@ -242,7 +242,7 @@ class TestSimpleMul(unittest.TestCase): out = rnn() out = fluid.layers.sequence_pool(out, pool_type='last') loss = fluid.layers.mean(x=out) - fluid.backward.append_backward_ops(loss) + fluid.backward.append_backward(loss) cpu = fluid.CPUPlace() exe = fluid.Executor(cpu) @@ -317,7 +317,7 @@ class TestSimpleMulWithMemory(unittest.TestCase): out = rnn() last = fluid.layers.sequence_pool(input=out, pool_type='last') loss = fluid.layers.mean(x=last) - fluid.backward.append_backward_ops(loss) + fluid.backward.append_backward(loss) cpu = fluid.CPUPlace() exe = fluid.Executor(cpu) @@ -330,6 +330,7 @@ class TestSimpleMulWithMemory(unittest.TestCase): ], return_numpy=False)) last_by_py, = py_rnn.exe().values() + print w_g[0] self.assertTrue(numpy.allclose(last_np, last_by_py)) w_g_num = py_rnn.get_numeric_gradient_of_param(self.PARAM_NAME) # print w_g_num[0], w_g[0] From 8bd759007a29aaece5e85b90362d8c886a8a7e5b Mon Sep 17 00:00:00 2001 From: chengduoZH Date: Thu, 28 Dec 2017 12:53:00 +0800 Subject: [PATCH 113/181] refine CosSimDyFunctor --- paddle/operators/cos_sim_op.cu | 2 -- paddle/operators/cos_sim_op.h | 2 +- 2 files changed, 1 insertion(+), 3 deletions(-) diff --git a/paddle/operators/cos_sim_op.cu b/paddle/operators/cos_sim_op.cu index 42194d7a05..86dc04995a 100644 --- a/paddle/operators/cos_sim_op.cu +++ b/paddle/operators/cos_sim_op.cu @@ -57,8 +57,6 @@ struct CosSimDyFunctor { } }; -template struct CosSimDyFunctor; - } // namespace operators } // namespace paddle diff --git a/paddle/operators/cos_sim_op.h b/paddle/operators/cos_sim_op.h index a913e576f9..7641ca15f1 100644 --- a/paddle/operators/cos_sim_op.h +++ b/paddle/operators/cos_sim_op.h @@ -56,7 +56,7 @@ struct CosSimFunctor { tep_x = x[i]; tep_y = y_[i]; xx += tep_x * tep_x; - yy += tep_y * tep_y; // only need + yy += tep_y * tep_y; xy += tep_x * tep_y; } xx = sqrt(xx); From 2cdef424d9cb379887cb826ae514b01e3ff7d569 Mon Sep 17 00:00:00 2001 From: Yancey Date: Thu, 28 Dec 2017 13:31:50 +0800 Subject: [PATCH 114/181] Implement selectedrows serialize and deserialize (#7042) * implement selectedrows serialize and deserialize * make serialize/deserialize as global function * recover send_imp.cc * delete unused brackets * fix compile error * serialize version in LodTensor and SelecetedRows * fix ci * fix ci --- paddle/framework/CMakeLists.txt | 2 +- paddle/framework/lod_tensor.cc | 112 +++++------------------- paddle/framework/lod_tensor_test.cc | 14 +++ paddle/framework/selected_rows.cc | 55 +++++++++++- paddle/framework/selected_rows.h | 9 ++ paddle/framework/selected_rows_test.cc | 14 +++ paddle/framework/tensor_test.cc | 114 +++++++++++++------------ paddle/framework/tensor_util.h | 100 ++++++++++++++++++++++ paddle/framework/tensor_util_test.cc | 50 +++++++++++ paddle/operators/load_op.cc | 2 +- 
paddle/operators/math/CMakeLists.txt | 8 +- 11 files changed, 326 insertions(+), 154 deletions(-) diff --git a/paddle/framework/CMakeLists.txt b/paddle/framework/CMakeLists.txt index 738684795d..8bfa41715f 100644 --- a/paddle/framework/CMakeLists.txt +++ b/paddle/framework/CMakeLists.txt @@ -5,7 +5,7 @@ cc_library(ddim SRCS ddim.cc DEPS eigen3) cc_test(ddim_test SRCS ddim_test.cc DEPS ddim) nv_test(dim_test SRCS dim_test.cu DEPS ddim) -cc_library(tensor SRCS tensor.cc DEPS ddim place paddle_memory device_context) +cc_library(tensor SRCS tensor.cc DEPS ddim place paddle_memory device_context framework_proto) cc_test(tensor_test SRCS tensor_test.cc DEPS tensor) cc_test(tensor_util_test SRCS tensor_util_test.cc DEPS tensor) diff --git a/paddle/framework/lod_tensor.cc b/paddle/framework/lod_tensor.cc index f8a3be9a82..7b6dc09bdb 100644 --- a/paddle/framework/lod_tensor.cc +++ b/paddle/framework/lod_tensor.cc @@ -189,62 +189,16 @@ void AppendLoD(LoD *lod, const LoD &lod_length) { void SerializeToStream(std::ostream &os, const LoDTensor &tensor, const platform::DeviceContext &dev_ctx) { - // TODO(typhoonzero): serialize to ostream { // the 1st field, uint32_t version + { // the 1st field, uint32_t version for LoDTensor constexpr uint32_t version = 0; os.write(reinterpret_cast(&version), sizeof(version)); } - { // the 2nd field, tensor description - // int32_t size - // void* protobuf message - proto::TensorDesc desc; - desc.set_data_type(framework::ToDataType(tensor.type())); - auto dims = framework::vectorize(tensor.dims()); - auto *pb_dims = desc.mutable_dims(); - pb_dims->Resize(static_cast(dims.size()), 0); - std::copy(dims.begin(), dims.end(), pb_dims->begin()); - int32_t size = desc.ByteSize(); - os.write(reinterpret_cast(&size), sizeof(size)); - auto out = desc.SerializeAsString(); - os.write(out.data(), size); - } - { // the 3rd field, tensor data - uint64_t size = tensor.memory_size(); - auto *data_ptr = tensor.data(); - PADDLE_ENFORCE(size < std::numeric_limits::max(), - "Index overflow when writing tensor"); - if (platform::is_gpu_place(tensor.place())) { -#ifdef PADDLE_WITH_CUDA - constexpr size_t kBufSize = 1024 * 1024 * 64; // 64MB - std::unique_ptr buf(new char[kBufSize]); - auto &gpu_dev_ctx = - static_cast(dev_ctx); - platform::CPUPlace cpu; - uintptr_t data = reinterpret_cast(data_ptr); - while (size != 0) { - size_t size_to_write = std::min(kBufSize, static_cast(size)); - memory::Copy(cpu, buf.get(), - boost::get(tensor.place()), - reinterpret_cast(data), size_to_write, - gpu_dev_ctx.stream()); - gpu_dev_ctx.Wait(); - os.write(buf.get(), size_to_write); - data += size_to_write; - size -= size_to_write; - } -#else - PADDLE_THROW("Unexpected branch"); -#endif - } else { - os.write(static_cast(data_ptr), - static_cast(size)); - } - } - { // the 4th field, lod information - // uint64_t lod_level - // uint64_t lod_level_1 size in byte. - // int* lod_level_1 data - // ... + { + // the 2nd field, LoD information + // uint64_t lod_level + // uint64_t lod_level_1 size in byte. + // int* lod_level_1 data + // ...
auto lod = tensor.lod(); uint64_t size = lod.size(); os.write(reinterpret_cast(&size), sizeof(size)); @@ -256,49 +210,19 @@ void SerializeToStream(std::ostream &os, const LoDTensor &tensor, static_cast(size)); } } + // the 3rd field, Tensor + SerializeToStream(os, static_cast(tensor), dev_ctx); } void DeserializeFromStream(std::istream &is, LoDTensor *tensor) { - uint32_t version; - is.read(reinterpret_cast(&version), sizeof(version)); - PADDLE_ENFORCE_EQ(version, 0U, "Only version 0 is supported"); - proto::TensorDesc desc; - { // int32_t size - // proto buffer - int32_t size; - is.read(reinterpret_cast(&size), sizeof(size)); - std::unique_ptr buf(new char[size]); - is.read(reinterpret_cast(buf.get()), size); - PADDLE_ENFORCE(desc.ParseFromArray(buf.get(), size), - "Cannot parse tensor desc"); - } - { // read tensor - std::vector dims; - dims.reserve(static_cast(desc.dims().size())); - std::copy(desc.dims().begin(), desc.dims().end(), std::back_inserter(dims)); - tensor->Resize(framework::make_ddim(dims)); - - void *buf; - platform::Place cpu = platform::CPUPlace(); - switch (desc.data_type()) { - case proto::FP32: - buf = tensor->mutable_data(cpu); - break; - case proto::FP64: - buf = tensor->mutable_data(cpu); - break; - case proto::INT32: - buf = tensor->mutable_data(cpu); - break; - case proto::INT64: - buf = tensor->mutable_data(cpu); - break; - default: - PADDLE_THROW("DataType %d not supported", desc.data_type()); - } - is.read(static_cast(buf), tensor->memory_size()); - } - { // read lod + { + // the 1st field, uint32_t version for LoDTensor + uint32_t version; + is.read(reinterpret_cast(&version), sizeof(version)); + PADDLE_ENFORCE_EQ(version, 0U, "Only version 0 is supported"); + } + { + // the 2nd field, LoD information uint64_t lod_level; is.read(reinterpret_cast(&lod_level), sizeof(lod_level)); auto &lod = *tensor->mutable_lod(); @@ -312,6 +236,8 @@ void DeserializeFromStream(std::istream &is, LoDTensor *tensor) { lod[i] = tmp; } } + // the 3rd field, Tensor + DeserializeFromStream(is, static_cast(tensor)); } } // namespace framework diff --git a/paddle/framework/lod_tensor_test.cc b/paddle/framework/lod_tensor_test.cc index 02d84b6823..0747c8db53 100644 --- a/paddle/framework/lod_tensor_test.cc +++ b/paddle/framework/lod_tensor_test.cc @@ -126,6 +126,20 @@ TEST_F(LoDTensorTester, ShrinkInLevel) { EXPECT_NE(t1.data(), lod_tensor_.data()); } +TEST_F(LoDTensorTester, SerializeAndDeserialize) { + LoDTensor dst_tensor; + platform::CPUDeviceContext cpu_ctx((platform::CPUPlace())); + std::ostringstream oss; + SerializeToStream(oss, lod_tensor_, cpu_ctx); + std::istringstream iss(oss.str()); + DeserializeFromStream(iss, &dst_tensor); + float* dst_ptr = dst_tensor.mutable_data(platform::CPUPlace()); + for (int i = 0; i < kLodTensorSize; ++i) { + EXPECT_EQ(dst_ptr[i], i); + } + EXPECT_EQ(dst_tensor.lod(), lod_tensor_.lod()); +} + TEST(LodExpand, test) { LoD lod{{0, 2}}; LoDTensor tensor;
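Putting the two functions together, the version-0 stream layout for a LoDTensor is: a uint32 version, a uint64 level count, then per level a uint64 byte size followed by the raw size_t offsets, and finally the Tensor record written by the Tensor-level overload. A stand-alone sketch of a reader for just the LoD prefix (illustrative only, not the library function):

#include <cstddef>
#include <cstdint>
#include <istream>
#include <vector>

// Reads the version-0 LoDTensor prefix described above; the Tensor record
// that follows is left in the stream for the Tensor-level reader.
std::vector<std::vector<size_t>> ReadLoDPrefix(std::istream& is) {
  uint32_t version = 0;
  is.read(reinterpret_cast<char*>(&version), sizeof(version));
  uint64_t lod_level = 0;
  is.read(reinterpret_cast<char*>(&lod_level), sizeof(lod_level));
  std::vector<std::vector<size_t>> lod(lod_level);
  for (auto& level : lod) {
    uint64_t bytes = 0;
    is.read(reinterpret_cast<char*>(&bytes), sizeof(bytes));
    level.resize(bytes / sizeof(size_t));
    is.read(reinterpret_cast<char*>(level.data()),
            static_cast<std::streamsize>(bytes));
  }
  return lod;
}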
diff --git a/paddle/framework/selected_rows.cc b/paddle/framework/selected_rows.cc index c74459c9dd..82adfa7123 100644 --- a/paddle/framework/selected_rows.cc +++ b/paddle/framework/selected_rows.cc @@ -12,5 +12,58 @@ limitations under the License. */ #include "paddle/framework/selected_rows.h" namespace paddle { -namespace framework {} // namespace framework +namespace framework { +void SerializeToStream(std::ostream& os, const SelectedRows& selected_rows, + const platform::DeviceContext& dev_ctx) { + { // the 1st field, uint32_t version + constexpr uint32_t version = 0; + os.write(reinterpret_cast(&version), sizeof(version)); + } + { + // the 2nd field, rows information + auto& rows = selected_rows.rows(); + uint64_t size = rows.size(); + os.write(reinterpret_cast(&size), sizeof(size)); + for (uint64_t i = 0; i < size; ++i) { + os.write(reinterpret_cast(&rows[i]), sizeof(rows[i])); + } + } + { + // the 3rd field, the height of SelectedRows + int64_t height = selected_rows.height(); + os.write(reinterpret_cast(&height), sizeof(height)); + } + // the 4th field, Tensor data + SerializeToStream(os, selected_rows.value(), dev_ctx); +} + +void DeserializeFromStream(std::istream& is, SelectedRows* selected_rows) { + auto tensor = *selected_rows->mutable_value(); + { + // the 1st field, uint32_t version for SelectedRows + uint32_t version; + is.read(reinterpret_cast(&version), sizeof(version)); + PADDLE_ENFORCE_EQ(version, 0U, "Only version 0 is supported"); + } + { + // the 2nd field, rows information + uint64_t size; + is.read(reinterpret_cast(&size), sizeof(size)); + auto& rows = *selected_rows->mutable_rows(); + rows.resize(size); + for (uint64_t i = 0; i < size; ++i) { + is.read(reinterpret_cast(&rows[i]), sizeof(int64_t)); + } + } + { + // the 3rd field, the height of the SelectedRows + int64_t height; + is.read(reinterpret_cast(&height), sizeof(int64_t)); + selected_rows->set_height(height); + } + // the 4th field, tensor which contains the data + DeserializeFromStream(is, &tensor); +} + +} // namespace framework } // namespace paddle diff --git a/paddle/framework/selected_rows.h b/paddle/framework/selected_rows.h index 0332b91323..699e392688 100644 --- a/paddle/framework/selected_rows.h +++ b/paddle/framework/selected_rows.h @@ -59,5 +59,14 @@ class SelectedRows { int64_t height_; }; +/* + * Serialize/Deserialize SelectedRows to std::ostream + * You can pass ofstream or ostringstream to serialize to file + * or to an in-memory string. GPU tensor will be copied to CPU.
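As that header comment says, any standard stream works; a usage sketch of the in-memory round trip, mirroring the unit test that follows (rows_in is an assumed, already-populated SelectedRows):

#include <sstream>
// Assumes the headers from this patch and a populated SelectedRows rows_in.
std::ostringstream oss;
paddle::platform::CPUDeviceContext cpu_ctx((paddle::platform::CPUPlace()));
paddle::framework::SerializeToStream(oss, rows_in, cpu_ctx);

std::istringstream iss(oss.str());
paddle::framework::SelectedRows rows_out;
paddle::framework::DeserializeFromStream(iss, &rows_out);
// rows_out now carries the same rows(), height() and value() tensor.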
+ */
+void SerializeToStream(std::ostream& os, const SelectedRows& selected_rows,
+                       const platform::DeviceContext& dev_ctx);
+void DeserializeFromStream(std::istream& is, SelectedRows* selected_rows);
+
 }  // namespace framework
 }  // namespace paddle
diff --git a/paddle/framework/selected_rows_test.cc b/paddle/framework/selected_rows_test.cc
index 4ee13a65d7..75487c4010 100644
--- a/paddle/framework/selected_rows_test.cc
+++ b/paddle/framework/selected_rows_test.cc
@@ -43,5 +43,19 @@ TEST_F(SelectedRowsTester, complete_dims) {
   ASSERT_EQ(selected_rows_->GetCompleteDims(), make_ddim({10, 100}));
 }

+TEST_F(SelectedRowsTester, SerializeAndDeserialize) {
+  SelectedRows dst_tensor;
+  platform::CPUDeviceContext cpu_ctx(place_);
+  std::ostringstream oss;
+
+  SerializeToStream(oss, *selected_rows_, cpu_ctx);
+
+  std::istringstream iss(oss.str());
+  DeserializeFromStream(iss, &dst_tensor);
+
+  ASSERT_EQ(selected_rows_->rows(), dst_tensor.rows());
+  ASSERT_EQ(selected_rows_->height(), dst_tensor.height());
+}
+
 }  // namespace framework
 }  // namespace paddle
diff --git a/paddle/framework/tensor_test.cc b/paddle/framework/tensor_test.cc
index ca76a9fcb9..a1b4a03289 100644
--- a/paddle/framework/tensor_test.cc
+++ b/paddle/framework/tensor_test.cc
@@ -15,12 +15,13 @@
 #include <gtest/gtest.h>
 #include <string>

+namespace framework = paddle::framework;
+namespace platform = paddle::platform;
+
 TEST(Tensor, Dims) {
-  using namespace paddle::framework;
-  using namespace paddle::platform;
-  Tensor tt;
+  framework::Tensor tt;
   tt.Resize({2, 3, 4});
-  DDim dims = tt.dims();
+  framework::DDim dims = tt.dims();
   ASSERT_EQ(arity(dims), 3);
   for (int i = 0; i < 3; ++i) {
     EXPECT_EQ(i + 2, dims[i]);
@@ -28,12 +29,12 @@ }

 TEST(Tensor, DataAssert) {
-  paddle::framework::Tensor src_tensor;
+  framework::Tensor src_tensor;

   bool caught = false;
   try {
     src_tensor.data<float>();
-  } catch (paddle::platform::EnforceNotMet err) {
+  } catch (platform::EnforceNotMet err) {
     caught = true;
     std::string msg =
         "holder_ should not be null\nTensor holds no memory. Call "
@@ -50,61 +51,65 @@ TEST(Tensor, DataAssert) {
    because Memory::Alloc() and Memory::Free() have not been ready.
*/
 TEST(Tensor, MutableData) {
-  using namespace paddle::framework;
-  using namespace paddle::platform;
   {
-    Tensor src_tensor;
+    framework::Tensor src_tensor;
     float* p1 = nullptr;
     float* p2 = nullptr;
     // initialization
-    p1 = src_tensor.mutable_data<float>(make_ddim({1, 2, 3}), CPUPlace());
+    p1 = src_tensor.mutable_data<float>(framework::make_ddim({1, 2, 3}),
+                                        platform::CPUPlace());
     EXPECT_NE(p1, nullptr);
     // set src_tensor a new dim with a larger size
     // memory is supposed to be re-allocated
-    p2 = src_tensor.mutable_data<float>(make_ddim({3, 4}), CPUPlace());
+    p2 = src_tensor.mutable_data<float>(framework::make_ddim({3, 4}),
+                                        platform::CPUPlace());
     EXPECT_NE(p2, nullptr);
     EXPECT_NE(p1, p2);
     // set src_tensor a new dim with the same size
     // memory block is supposed to be unchanged
-    p1 = src_tensor.mutable_data<float>(make_ddim({2, 2, 3}), CPUPlace());
+    p1 = src_tensor.mutable_data<float>(framework::make_ddim({2, 2, 3}),
+                                        platform::CPUPlace());
     EXPECT_EQ(p1, p2);
     // set src_tensor a new dim with a smaller size
     // memory block is supposed to be unchanged
-    p2 = src_tensor.mutable_data<float>(make_ddim({2, 2}), CPUPlace());
+    p2 = src_tensor.mutable_data<float>(framework::make_ddim({2, 2}),
+                                        platform::CPUPlace());
     EXPECT_EQ(p1, p2);
   }

 #ifdef PADDLE_WITH_CUDA
   {
-    Tensor src_tensor;
+    framework::Tensor src_tensor;
     float* p1 = nullptr;
     float* p2 = nullptr;
     // initialization
-    p1 = src_tensor.mutable_data<float>(make_ddim({1, 2, 3}), CUDAPlace());
+    p1 = src_tensor.mutable_data<float>(framework::make_ddim({1, 2, 3}),
+                                        platform::CUDAPlace());
     EXPECT_NE(p1, nullptr);
     // set src_tensor a new dim with a larger size
     // memory is supposed to be re-allocated
-    p2 = src_tensor.mutable_data<float>(make_ddim({3, 4}), CUDAPlace());
+    p2 = src_tensor.mutable_data<float>(framework::make_ddim({3, 4}),
+                                        platform::CUDAPlace());
     EXPECT_NE(p2, nullptr);
     EXPECT_NE(p1, p2);
     // set src_tensor a new dim with the same size
     // memory block is supposed to be unchanged
-    p1 = src_tensor.mutable_data<float>(make_ddim({2, 2, 3}), CUDAPlace());
+    p1 = src_tensor.mutable_data<float>(framework::make_ddim({2, 2, 3}),
+                                        platform::CUDAPlace());
     EXPECT_EQ(p1, p2);
     // set src_tensor a new dim with a smaller size
     // memory block is supposed to be unchanged
-    p2 = src_tensor.mutable_data<float>(make_ddim({2, 2}), CUDAPlace());
+    p2 = src_tensor.mutable_data<float>(framework::make_ddim({2, 2}),
+                                        platform::CUDAPlace());
     EXPECT_EQ(p1, p2);
   }
 #endif
 }

 TEST(Tensor, ShareDataWith) {
-  using namespace paddle::framework;
-  using namespace paddle::platform;
   {
-    Tensor src_tensor;
-    Tensor dst_tensor;
+    framework::Tensor src_tensor;
+    framework::Tensor dst_tensor;
     // Try to share data from an uninitialized tensor
     bool caught = false;
     try {
@@ -121,16 +126,18 @@ }
     ASSERT_TRUE(caught);

-    src_tensor.mutable_data<int>(make_ddim({2, 3, 4}), CPUPlace());
+    src_tensor.mutable_data<int>(framework::make_ddim({2, 3, 4}),
+                                 platform::CPUPlace());
     dst_tensor.ShareDataWith(src_tensor);
     ASSERT_EQ(src_tensor.data<int>(), dst_tensor.data<int>());
   }

 #ifdef PADDLE_WITH_CUDA
   {
-    Tensor src_tensor;
-    Tensor dst_tensor;
-    src_tensor.mutable_data<int>(make_ddim({2, 3, 4}), CUDAPlace());
+    framework::Tensor src_tensor;
+    framework::Tensor dst_tensor;
+    src_tensor.mutable_data<int>(framework::make_ddim({2, 3, 4}),
+                                 platform::CUDAPlace());
     dst_tensor.ShareDataWith(src_tensor);
     ASSERT_EQ(src_tensor.data<int>(), dst_tensor.data<int>());
   }
@@ -138,13 +145,12 @@ }

 TEST(Tensor, Slice) {
-  using namespace paddle::framework;
-  using namespace paddle::platform;
   {
-    Tensor src_tensor;
-    src_tensor.mutable_data(make_ddim({5, 3, 4}), CPUPlace());
-    Tensor slice_tensor =
src_tensor.Slice(1, 3); - DDim slice_dims = slice_tensor.dims(); + framework::Tensor src_tensor; + src_tensor.mutable_data(framework::make_ddim({5, 3, 4}), + platform::CPUPlace()); + framework::Tensor slice_tensor = src_tensor.Slice(1, 3); + framework::DDim slice_dims = slice_tensor.dims(); ASSERT_EQ(arity(slice_dims), 3); EXPECT_EQ(slice_dims[0], 2); EXPECT_EQ(slice_dims[1], 3); @@ -153,11 +159,12 @@ TEST(Tensor, Slice) { uintptr_t src_data_address = reinterpret_cast(src_tensor.data()); uintptr_t src_mutable_data_address = reinterpret_cast( - src_tensor.mutable_data(src_tensor.dims(), CPUPlace())); + src_tensor.mutable_data(src_tensor.dims(), platform::CPUPlace())); uintptr_t slice_data_address = reinterpret_cast(slice_tensor.data()); - uintptr_t slice_mutable_data_address = reinterpret_cast( - slice_tensor.mutable_data(slice_tensor.dims(), CPUPlace())); + uintptr_t slice_mutable_data_address = + reinterpret_cast(slice_tensor.mutable_data( + slice_tensor.dims(), platform::CPUPlace())); EXPECT_EQ(src_data_address, src_mutable_data_address); EXPECT_EQ(slice_data_address, slice_mutable_data_address); EXPECT_EQ(src_data_address + 3 * 4 * 1 * sizeof(int), slice_data_address); @@ -165,22 +172,25 @@ TEST(Tensor, Slice) { #ifdef PADDLE_WITH_CUDA { - Tensor src_tensor; - src_tensor.mutable_data(make_ddim({6, 9}), CUDAPlace()); - Tensor slice_tensor = src_tensor.Slice(2, 6); - DDim slice_dims = slice_tensor.dims(); + framework::Tensor src_tensor; + src_tensor.mutable_data(framework::make_ddim({6, 9}), + platform::CUDAPlace()); + framework::Tensor slice_tensor = src_tensor.Slice(2, 6); + framework::DDim slice_dims = slice_tensor.dims(); ASSERT_EQ(arity(slice_dims), 2); EXPECT_EQ(slice_dims[0], 4); EXPECT_EQ(slice_dims[1], 9); uintptr_t src_data_address = reinterpret_cast(src_tensor.data()); - uintptr_t src_mutable_data_address = reinterpret_cast( - src_tensor.mutable_data(src_tensor.dims(), CUDAPlace())); + uintptr_t src_mutable_data_address = + reinterpret_cast(src_tensor.mutable_data( + src_tensor.dims(), platform::CUDAPlace())); uintptr_t slice_data_address = reinterpret_cast(slice_tensor.data()); - uintptr_t slice_mutable_data_address = reinterpret_cast( - slice_tensor.mutable_data(slice_tensor.dims(), CUDAPlace())); + uintptr_t slice_mutable_data_address = + reinterpret_cast(slice_tensor.mutable_data( + slice_tensor.dims(), platform::CUDAPlace())); EXPECT_EQ(src_data_address, src_mutable_data_address); EXPECT_EQ(slice_data_address, slice_mutable_data_address); EXPECT_EQ(src_data_address + 9 * 2 * sizeof(double), slice_data_address); @@ -189,23 +199,19 @@ TEST(Tensor, Slice) { } TEST(Tensor, ReshapeToMatrix) { - using namespace paddle::framework; - using namespace paddle::platform; - Tensor src; - int* src_ptr = src.mutable_data({2, 3, 4, 9}, CPUPlace()); + framework::Tensor src; + int* src_ptr = src.mutable_data({2, 3, 4, 9}, platform::CPUPlace()); for (int i = 0; i < 2 * 3 * 4 * 9; ++i) { src_ptr[i] = i; } - Tensor res = ReshapeToMatrix(src, 2); + framework::Tensor res = framework::ReshapeToMatrix(src, 2); ASSERT_EQ(res.dims()[0], 2 * 3); ASSERT_EQ(res.dims()[1], 4 * 9); } TEST(Tensor, Layout) { - using namespace paddle::framework; - using namespace paddle::platform; - Tensor src; - ASSERT_EQ(src.layout(), DataLayout::kNHWC); - src.set_layout(DataLayout::kAnyLayout); - ASSERT_EQ(src.layout(), DataLayout::kAnyLayout); + framework::Tensor src; + ASSERT_EQ(src.layout(), framework::DataLayout::kNHWC); + src.set_layout(framework::DataLayout::kAnyLayout); + ASSERT_EQ(src.layout(), 
framework::DataLayout::kAnyLayout); } diff --git a/paddle/framework/tensor_util.h b/paddle/framework/tensor_util.h index ea4e4f22ea..108006911a 100644 --- a/paddle/framework/tensor_util.h +++ b/paddle/framework/tensor_util.h @@ -13,6 +13,8 @@ See the License for the specific language governing permissions and limitations under the License. */ #pragma once +#include "paddle/framework/data_type.h" +#include "paddle/framework/framework.pb.h" #include "paddle/framework/tensor.h" namespace paddle { @@ -205,5 +207,103 @@ inline void CopyToVector(const Tensor& src, std::vector* dst) { src_ptr, size); } +inline void SerializeToStream(std::ostream& os, const Tensor& tensor, + const platform::DeviceContext& dev_ctx) { + // TODO(typhoonzero): serialize to ostream + { // the 1st field, uint32_t version + constexpr uint32_t version = 0; + os.write(reinterpret_cast(&version), sizeof(version)); + } + { // the 2nd field, tensor description + // int32_t size + // void* protobuf message + proto::TensorDesc desc; + desc.set_data_type(framework::ToDataType(tensor.type())); + auto dims = framework::vectorize(tensor.dims()); + auto* pb_dims = desc.mutable_dims(); + pb_dims->Resize(static_cast(dims.size()), 0); + std::copy(dims.begin(), dims.end(), pb_dims->begin()); + int32_t size = desc.ByteSize(); + os.write(reinterpret_cast(&size), sizeof(size)); + auto out = desc.SerializeAsString(); + os.write(out.data(), size); + } + { // the 3rd field, tensor data + uint64_t size = tensor.memory_size(); + auto* data_ptr = tensor.data(); + PADDLE_ENFORCE(size < std::numeric_limits::max(), + "Index overflow when writing tensor"); + if (platform::is_gpu_place(tensor.place())) { +#ifdef PADDLE_WITH_CUDA + constexpr size_t kBufSize = 1024 * 1024 * 64; // 64MB + std::unique_ptr buf(new char[kBufSize]); + auto& gpu_dev_ctx = + static_cast(dev_ctx); + platform::CPUPlace cpu; + uintptr_t data = reinterpret_cast(data_ptr); + while (size != 0) { + size_t size_to_write = std::min(kBufSize, static_cast(size)); + memory::Copy(cpu, buf.get(), + boost::get(tensor.place()), + reinterpret_cast(data), size_to_write, + gpu_dev_ctx.stream()); + gpu_dev_ctx.Wait(); + os.write(buf.get(), size_to_write); + data += size_to_write; + size -= size_to_write; + } +#else + PADDLE_THROW("Unexpected branch"); +#endif + } else { + os.write(static_cast(data_ptr), + static_cast(size)); + } + } +} + +inline void DeserializeFromStream(std::istream& is, Tensor* tensor) { + uint32_t version; + is.read(reinterpret_cast(&version), sizeof(version)); + PADDLE_ENFORCE_EQ(version, 0U, "Only version 0 is supported"); + proto::TensorDesc desc; + { // int32_t size + // proto buffer + int32_t size; + is.read(reinterpret_cast(&size), sizeof(size)); + std::unique_ptr buf(new char[size]); + is.read(reinterpret_cast(buf.get()), size); + PADDLE_ENFORCE(desc.ParseFromArray(buf.get(), size), + "Cannot parse tensor desc"); + } + { // read tensor + std::vector dims; + dims.reserve(static_cast(desc.dims().size())); + std::copy(desc.dims().begin(), desc.dims().end(), std::back_inserter(dims)); + tensor->Resize(framework::make_ddim(dims)); + + void* buf; + platform::Place cpu = platform::CPUPlace(); + // TODO(Yancey1989): use VisiterDataType instead of DataType switch + switch (desc.data_type()) { + case proto::FP32: + buf = tensor->mutable_data(cpu); + break; + case proto::FP64: + buf = tensor->mutable_data(cpu); + break; + case proto::INT32: + buf = tensor->mutable_data(cpu); + break; + case proto::INT64: + buf = tensor->mutable_data(cpu); + break; + default: + 
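+        // Only FP32/FP64/INT32/INT64 are ever written by SerializeToStream
+        // above, so any other data type in the stream indicates corruption.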
PADDLE_THROW("DataType %d not supported", desc.data_type()); + } + is.read(static_cast(buf), tensor->memory_size()); + } +} + } // namespace framework } // namespace paddle diff --git a/paddle/framework/tensor_util_test.cc b/paddle/framework/tensor_util_test.cc index f388c19f28..1281e9c2a4 100644 --- a/paddle/framework/tensor_util_test.cc +++ b/paddle/framework/tensor_util_test.cc @@ -230,5 +230,55 @@ TEST(CopyToVector, Tensor) { #endif } +TEST(Tensor, SerializeAndDeserialize) { + framework::Tensor src_tensor; + int array[6] = {1, 2, 3, 4, 5, 6}; + src_tensor.Resize({2, 3}); + int* src_ptr = src_tensor.mutable_data(platform::CPUPlace()); + for (int i = 0; i < 6; ++i) { + src_ptr[i] = array[i]; + } + { + framework::Tensor dst_tensor; + auto place = new platform::CPUPlace(); + platform::CPUDeviceContext cpu_ctx(*place); + std::ostringstream oss; + SerializeToStream(oss, src_tensor, cpu_ctx); + + std::istringstream iss(oss.str()); + DeserializeFromStream(iss, &dst_tensor); + int* dst_ptr = dst_tensor.mutable_data(platform::CPUPlace()); + for (int i = 0; i < 5; ++i) { + ASSERT_EQ(dst_ptr[i], array[i]); + } + delete place; + } +#ifdef PADDLE_WITH_CUDA + { + Tensor gpu_tensor; + gpu_tensor.Resize({2, 3}); + Tensor dst_tensor; + + auto gpu_place = new platform::CUDAPlace(); + platform::CUDADeviceContext gpu_ctx(*gpu_place); + + CopyFrom(src_tensor, *gpu_place, gpu_ctx, &gpu_tensor); + + std::ostringstream oss; + SerializeToStream(oss, gpu_tensor, gpu_ctx); + + std::istringstream iss(oss.str()); + DeserializeFromStream(iss, &dst_tensor); + + int* dst_ptr = dst_tensor.mutable_data(platform::CPUPlace()); + for (int i = 0; i < 6; ++i) { + ASSERT_EQ(dst_ptr[i], array[i]); + } + + delete gpu_place; + } +#endif +} + } // namespace framework } // namespace paddle diff --git a/paddle/operators/load_op.cc b/paddle/operators/load_op.cc index 65f021d919..08b972a233 100644 --- a/paddle/operators/load_op.cc +++ b/paddle/operators/load_op.cc @@ -38,7 +38,7 @@ class LoadOp : public framework::OperatorBase { out_var_name); auto *tensor = out_var->GetMutable(); - framework::DeserializeFromStream(fin, tensor); + DeserializeFromStream(fin, tensor); platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance(); auto &dev_ctx = *pool.Get(place); diff --git a/paddle/operators/math/CMakeLists.txt b/paddle/operators/math/CMakeLists.txt index bf47879f77..b97faec4ed 100644 --- a/paddle/operators/math/CMakeLists.txt +++ b/paddle/operators/math/CMakeLists.txt @@ -9,9 +9,9 @@ if(WITH_GPU) nv_library(cross_entropy SRCS cross_entropy.cc cross_entropy.cu DEPS device_context) nv_library(pooling SRCS pooling.cc pooling.cu DEPS device_context) nv_library(sequence_pooling SRCS sequence_pooling.cc sequence_pooling.cu DEPS device_context math_function) - nv_library(vol2col SRCS vol2col.cc vol2col.cu DEPS device_context) + nv_library(vol2col SRCS vol2col.cc vol2col.cu DEPS device_context tensor) nv_library(context_project SRCS context_project.cc context_project.cu DEPS device_context math_function) - nv_library(sequence2batch SRCS sequence2batch.cc sequence2batch.cu DEPS device_context) + nv_library(sequence2batch SRCS sequence2batch.cc sequence2batch.cu DEPS device_context tensor) nv_library(lstm_compute SRCS lstm_compute.cc lstm_compute.cu DEPS device_context activation_functions) nv_library(maxouting SRCS maxouting.cc maxouting.cu DEPS device_context) nv_library(unpooling SRCS unpooling.cc unpooling.cu DEPS device_context) @@ -23,9 +23,9 @@ else() cc_library(cross_entropy SRCS cross_entropy.cc DEPS device_context) 
cc_library(pooling SRCS pooling.cc DEPS device_context) cc_library(sequence_pooling SRCS sequence_pooling.cc DEPS device_context math_function) - cc_library(vol2col SRCS vol2col.cc DEPS device_context) + cc_library(vol2col SRCS vol2col.cc DEPS device_context tensor) cc_library(context_project SRCS context_project.cc DEPS device_context math_function) - cc_library(sequence2batch SRCS sequence2batch.cc DEPS device_context) + cc_library(sequence2batch SRCS sequence2batch.cc DEPS device_context tensor) cc_library(lstm_compute SRCS lstm_compute.cc DEPS device_context activation_functions) cc_library(maxouting SRCS maxouting.cc DEPS device_context) cc_library(unpooling SRCS unpooling.cc DEPS device_context) From dd2bbf3a14fec5623609bf84377350e5812342f0 Mon Sep 17 00:00:00 2001 From: tensor-tang Date: Thu, 28 Dec 2017 13:51:17 +0800 Subject: [PATCH 115/181] update md5 of flowers dataset --- python/paddle/v2/dataset/flowers.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/paddle/v2/dataset/flowers.py b/python/paddle/v2/dataset/flowers.py index 634388094c..7bdddeaabe 100644 --- a/python/paddle/v2/dataset/flowers.py +++ b/python/paddle/v2/dataset/flowers.py @@ -44,7 +44,7 @@ __all__ = ['train', 'test', 'valid'] DATA_URL = 'http://www.robots.ox.ac.uk/~vgg/data/flowers/102/102flowers.tgz' LABEL_URL = 'http://www.robots.ox.ac.uk/~vgg/data/flowers/102/imagelabels.mat' SETID_URL = 'http://www.robots.ox.ac.uk/~vgg/data/flowers/102/setid.mat' -DATA_MD5 = '52808999861908f626f3c1f4e79d11fa' +DATA_MD5 = '33bfc11892f1e405ca193ae9a9f2a118' LABEL_MD5 = 'e0620be6f572b9609742df49c70aed4d' SETID_MD5 = 'a5357ecc9cb78c4bef273ce3793fc85c' # In official 'readme', tstid is the flag of test data From b7c4b58d3d041d4afe4da3d7f8b7d7366e8dce8d Mon Sep 17 00:00:00 2001 From: hedaoyuan Date: Thu, 28 Dec 2017 14:51:32 +0800 Subject: [PATCH 116/181] Follow comments. 
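The changes below standardize on C++-style casts in the im2col path, narrow
the mobile im2col test grid, and release the temporary buffer once the
convolution finishes. A minimal sketch of the cast pattern being adopted
(the value 256 matches the col-matrix cap in GemmConvOp.cpp; the wrapper
function name is illustrative only):

    #include <algorithm>
    #include <cstddef>

    size_t CapColHeight(size_t col_height) {
      // Prefer static_cast<size_t>(256) over the C-style (size_t)256.
      return std::min(col_height, static_cast<size_t>(256));
    }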
--- paddle/function/GemmConvOp.cpp | 6 ++++-- paddle/function/Im2Col.h | 2 +- paddle/function/Im2ColTest.cpp | 14 +++++++------- 3 files changed, 12 insertions(+), 10 deletions(-) diff --git a/paddle/function/GemmConvOp.cpp b/paddle/function/GemmConvOp.cpp index 25cc3df667..cbdbf5335d 100644 --- a/paddle/function/GemmConvOp.cpp +++ b/paddle/function/GemmConvOp.cpp @@ -189,8 +189,8 @@ public: size_t colHeight = inputChannels / groups_ * filterHeight * filterWidth; size_t colWidth = outputHeight * outputWidth; // Max col matrix height 256, Max col matrix width 1024 - size_t stepColHeight = std::min(colHeight, (size_t)256); - size_t stepColWidth = std::min(colWidth, (size_t)2048); + size_t stepColHeight = std::min(colHeight, static_cast(256)); + size_t stepColWidth = std::min(colWidth, static_cast(2048)); if (needIm2col) { colShape = TensorShape({inputChannels / groups_, @@ -278,6 +278,8 @@ public: inputData += inputChannels * inputHeight * inputWidth; outputData += outputChannels * outputHeight * outputWidth; } + + memory_.reset(); } }; diff --git a/paddle/function/Im2Col.h b/paddle/function/Im2Col.h index 1053e4fd23..36a9bcf84e 100644 --- a/paddle/function/Im2Col.h +++ b/paddle/function/Im2Col.h @@ -136,7 +136,7 @@ public: (imRowIdx - paddingHeight) >= inputHeight || (imColIdx - paddingWidth) < 0 || (imColIdx - paddingWidth) >= inputWidth) { - colData[colh * colWidthSize + colw] = T(0); + colData[colh * colWidthSize + colw] = static_cast(0); } else { imRowIdx += c_im * inputHeight - paddingHeight; imColIdx -= paddingWidth; diff --git a/paddle/function/Im2ColTest.cpp b/paddle/function/Im2ColTest.cpp index c573469168..3ba866dcdd 100644 --- a/paddle/function/Im2ColTest.cpp +++ b/paddle/function/Im2ColTest.cpp @@ -140,13 +140,13 @@ TEST(Im2ColFunctor, GPU) { TestIm2ColFunctor(); } template void TestIm2ColMobileFunctor() { - for (size_t channels : {1, 5, 32}) { - for (size_t inputHeight : {5, 33, 100}) { - for (size_t inputWidth : {5, 32, 96}) { - for (size_t filterHeight : {1, 5}) { - for (size_t filterWidth : {3, 7}) { - for (size_t stride : {1, 2}) { - for (size_t padding : {0, 1}) { + for (size_t channels : {32}) { + for (size_t inputHeight : {33, 100}) { + for (size_t inputWidth : {32, 96}) { + for (size_t filterHeight : {5}) { + for (size_t filterWidth : {7}) { + for (size_t stride : {2}) { + for (size_t padding : {1}) { for (size_t dilation : {1, 3}) { size_t filterSizeH = (filterHeight - 1) * dilation + 1; size_t filterSizeW = (filterWidth - 1) * dilation + 1; From 95aec835e6ee59099581212796e82d3f2957e5f7 Mon Sep 17 00:00:00 2001 From: sweetsky0901 Date: Thu, 28 Dec 2017 16:10:28 +0800 Subject: [PATCH 117/181] modify fun name --- paddle/operators/detection_output_op.h | 18 +++--- paddle/operators/math/detection_util.h | 80 +++++++++++++------------- 2 files changed, 49 insertions(+), 49 deletions(-) diff --git a/paddle/operators/detection_output_op.h b/paddle/operators/detection_output_op.h index c0a4e6a3a2..cd6417087a 100644 --- a/paddle/operators/detection_output_op.h +++ b/paddle/operators/detection_output_op.h @@ -119,22 +119,22 @@ class DetectionOutputKernel : public framework::OpKernel { size_t prior_offset = i * 8; size_t loc_pred_offset = n * num_priors * 4 + i * 4; std::vector> prior_bbox_vec; - math::get_bbox_from_priorData(priorbox_data + prior_offset, 1, - prior_bbox_vec); + math::GetBBoxFromPriorData(priorbox_data + prior_offset, 1, + prior_bbox_vec); std::vector> prior_bbox_var; - math::get_bbox_var_from_prior_data(priorbox_data + prior_offset, 1, - prior_bbox_var); + 
math::GetBBoxVarFromPriorData(priorbox_data + prior_offset, 1, + prior_bbox_var); std::vector loc_pred_data; for (size_t j = 0; j < 4; ++j) loc_pred_data.push_back(*(loc_data + loc_pred_offset + j)); - math::BBox bbox = math::decode_bbox_with_var( + math::BBox bbox = math::DecodeBBoxWithVar( prior_bbox_vec[0], prior_bbox_var[0], loc_pred_data); decoded_bboxes.push_back(bbox); } all_decoded_bboxes.push_back(decoded_bboxes); } std::vector>> all_indices; - int num_kept = math::get_detection_indices( + int num_kept = math::GetDetectionIndices( conf_data, num_priors, num_classes, background_label_id, batch_size, confidence_threshold, nms_top_k, nms_threshold, top_k, all_decoded_bboxes, &all_indices); @@ -154,9 +154,9 @@ class DetectionOutputKernel : public framework::OpKernel { out_cpu.mutable_data(out->dims(), platform::CPUPlace()); out_data = out_cpu.data(); } - math::get_detection_output(conf_data, num_kept, num_priors, num_classes, - batch_size, all_indices, all_decoded_bboxes, - out_data); + math::GetDetectionOutput(conf_data, num_kept, num_priors, num_classes, + batch_size, all_indices, all_decoded_bboxes, + out_data); if (platform::is_gpu_place(context.GetPlace())) { framework::CopyFrom(out_cpu, platform::CUDAPlace(), context.device_context(), out); diff --git a/paddle/operators/math/detection_util.h b/paddle/operators/math/detection_util.h index d1ae0e6343..e3a3ef2bad 100644 --- a/paddle/operators/math/detection_util.h +++ b/paddle/operators/math/detection_util.h @@ -51,31 +51,31 @@ struct BBox { // KNCHW ==> NHWC // template template -void get_bbox_from_priorData(const T* prior_data, const size_t num_bboxes, - std::vector>& bbox_vec); +void GetBBoxFromPriorData(const T* prior_data, const size_t num_bboxes, + std::vector>& bbox_vec); template -void get_bbox_var_from_prior_data(const T* prior_data, const size_t num, - std::vector>& var_vec); +void GetBBoxVarFromPriorData(const T* prior_data, const size_t num, + std::vector>& var_vec); template -BBox decode_bbox_with_var(BBox& prior_bbox, - const std::vector& prior_bbox_var, - const std::vector& loc_pred_data); +BBox DecodeBBoxWithVar(BBox& prior_bbox, + const std::vector& prior_bbox_var, + const std::vector& loc_pred_data); template -bool sort_score_pair_descend(const std::pair& pair1, - const std::pair& pair2); +bool SortScorePairDescend(const std::pair& pair1, + const std::pair& pair2); template -bool sort_score_pair_descend(const std::pair>& pair1, - const std::pair>& pair2); +bool SortScorePairDescend(const std::pair>& pair1, + const std::pair>& pair2); template T jaccard_overlap(const BBox& bbox1, const BBox& bbox2); template -void apply_nms_fast(const std::vector>& bboxes, - const T* conf_score_data, size_t class_idx, size_t top_k, - T conf_threshold, T nms_threshold, size_t num_priors, - size_t num_classes, std::vector* indices); +void ApplyNmsFast(const std::vector>& bboxes, const T* conf_score_data, + size_t class_idx, size_t top_k, T conf_threshold, + T nms_threshold, size_t num_priors, size_t num_classes, + std::vector* indices); template -int get_detection_indices( +int GetDetectionIndices( const T* conf_data, const size_t num_priors, const size_t num_classes, const size_t background_label_id, const size_t batch_size, const T conf_threshold, const size_t nms_top_k, const T nms_threshold, @@ -83,16 +83,16 @@ int get_detection_indices( const std::vector>>& all_decoded_bboxes, std::vector>>* all_detection_indices); template -BBox clipBBox(const BBox& bbox); +BBox ClipBBox(const BBox& bbox); template -void get_detection_output( 
+void GetDetectionOutput( const T* conf_data, const size_t num_kept, const size_t num_priors, const size_t num_classes, const size_t batch_size, const std::vector>>& all_indices, const std::vector>>& all_decoded_bboxes, T* out_data); template -void get_bbox_from_priorData(const T* prior_data, const size_t num_bboxes, - std::vector>& bbox_vec) { +void GetBBoxFromPriorData(const T* prior_data, const size_t num_bboxes, + std::vector>& bbox_vec) { size_t out_offset = bbox_vec.size(); bbox_vec.resize(bbox_vec.size() + num_bboxes); for (size_t i = 0; i < num_bboxes; ++i) { @@ -105,8 +105,8 @@ void get_bbox_from_priorData(const T* prior_data, const size_t num_bboxes, } } template -void get_bbox_var_from_prior_data(const T* prior_data, const size_t num, - std::vector>& var_vec) { +void GetBBoxVarFromPriorData(const T* prior_data, const size_t num, + std::vector>& var_vec) { size_t out_offset = var_vec.size(); var_vec.resize(var_vec.size() + num); for (size_t i = 0; i < num; ++i) { @@ -119,9 +119,9 @@ void get_bbox_var_from_prior_data(const T* prior_data, const size_t num, } } template -BBox decode_bbox_with_var(BBox& prior_bbox, - const std::vector& prior_bbox_var, - const std::vector& loc_pred_data) { +BBox DecodeBBoxWithVar(BBox& prior_bbox, + const std::vector& prior_bbox_var, + const std::vector& loc_pred_data) { T prior_bbox_width = prior_bbox.get_width(); T prior_bbox_height = prior_bbox.get_height(); T prior_bbox_center_x = prior_bbox.get_center_x(); @@ -147,8 +147,8 @@ BBox decode_bbox_with_var(BBox& prior_bbox, return decoded_bbox; } template -bool sort_score_pair_descend(const std::pair& pair1, - const std::pair& pair2) { +bool SortScorePairDescend(const std::pair& pair1, + const std::pair& pair2) { return pair1.first > pair2.first; } template @@ -174,10 +174,10 @@ T jaccard_overlap(const BBox& bbox1, const BBox& bbox2) { } template -void apply_nms_fast(const std::vector>& bboxes, - const T* conf_score_data, size_t class_idx, size_t top_k, - T conf_threshold, T nms_threshold, size_t num_priors, - size_t num_classes, std::vector* indices) { +void ApplyNmsFast(const std::vector>& bboxes, const T* conf_score_data, + size_t class_idx, size_t top_k, T conf_threshold, + T nms_threshold, size_t num_priors, size_t num_classes, + std::vector* indices) { std::vector> scores; for (size_t i = 0; i < num_priors; ++i) { size_t conf_offset = i * num_classes + class_idx; @@ -185,7 +185,7 @@ void apply_nms_fast(const std::vector>& bboxes, scores.push_back(std::make_pair(conf_score_data[conf_offset], i)); } std::stable_sort(scores.begin(), scores.end(), - sort_score_pair_descend); + SortScorePairDescend); if (top_k > 0 && top_k < scores.size()) scores.resize(top_k); while (scores.size() > 0) { const size_t idx = scores.front().second; @@ -204,7 +204,7 @@ void apply_nms_fast(const std::vector>& bboxes, } } template -int get_detection_indices( +int GetDetectionIndices( const T* conf_data, const size_t num_priors, const size_t num_classes, const size_t background_label_id, const size_t batch_size, const T conf_threshold, const size_t nms_top_k, const T nms_threshold, @@ -219,9 +219,9 @@ int get_detection_indices( size_t conf_offset = n * num_priors * num_classes; for (size_t c = 0; c < num_classes; ++c) { if (c == background_label_id) continue; - apply_nms_fast(decoded_bboxes, conf_data + conf_offset, c, nms_top_k, - conf_threshold, nms_threshold, num_priors, num_classes, - &(indices[c])); + ApplyNmsFast(decoded_bboxes, conf_data + conf_offset, c, nms_top_k, + conf_threshold, nms_threshold, num_priors, 
num_classes, + &(indices[c])); num_detected += indices[c].size(); } if (top_k > 0 && num_detected > top_k) { @@ -237,7 +237,7 @@ int get_detection_indices( } } std::sort(score_index_pairs.begin(), score_index_pairs.end(), - sort_score_pair_descend>); + SortScorePairDescend>); score_index_pairs.resize(top_k); std::map> new_indices; for (size_t i = 0; i < score_index_pairs.size(); ++i) { @@ -255,7 +255,7 @@ int get_detection_indices( return total_keep_num; } template -BBox clipBBox(const BBox& bbox) { +BBox ClipBBox(const BBox& bbox) { T one = static_cast(1.0); T zero = static_cast(0.0); BBox clipped_bbox; @@ -266,7 +266,7 @@ BBox clipBBox(const BBox& bbox) { return clipped_bbox; } template -void get_detection_output( +void GetDetectionOutput( const T* conf_data, const size_t num_kept, const size_t num_priors, const size_t num_classes, const size_t batch_size, const std::vector>>& all_indices, @@ -285,7 +285,7 @@ void get_detection_output( out_data[count * 7] = n; out_data[count * 7 + 1] = label; out_data[count * 7 + 2] = (conf_data + conf_offset)[label]; - BBox clipped_bbox = clipBBox(decoded_bboxes[idx]); + BBox clipped_bbox = ClipBBox(decoded_bboxes[idx]); out_data[count * 7 + 3] = clipped_bbox.x_min; out_data[count * 7 + 4] = clipped_bbox.y_min; out_data[count * 7 + 5] = clipped_bbox.x_max; From d2cb28413e5728b819cc93ec68a5c81d62f75007 Mon Sep 17 00:00:00 2001 From: Yang Yu Date: Thu, 28 Dec 2017 16:58:39 +0800 Subject: [PATCH 118/181] Fix ALL RNN error --- paddle/operators/while_op.cc | 12 +++++++++++- paddle/pybind/tensor_py.h | 8 ++++---- .../v2/fluid/tests/test_dynrnn_gradient_check.py | 6 ++---- 3 files changed, 17 insertions(+), 9 deletions(-) diff --git a/paddle/operators/while_op.cc b/paddle/operators/while_op.cc index 322270c829..341c163aa1 100644 --- a/paddle/operators/while_op.cc +++ b/paddle/operators/while_op.cc @@ -12,6 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ +#include #include #include "paddle/framework/executor.h" #include "paddle/framework/lod_tensor_array.h" @@ -201,6 +202,15 @@ class WhileGradOp : public framework::OperatorBase { VLOG(10) << "Checking " << var_name; PADDLE_ENFORCE(!framework::HasNAN(var->Get()), "%s has NAN", var_name); + if (var->Get().type() == + typeid(float)) { // NOLINT + auto &tensor = var->Get(); + auto *buf = tensor.data(); + for (int64_t i = 0; i < tensor.numel(); ++i) { + PADDLE_ENFORCE(!std::isnan(buf[i])); + } + VLOG(10) << buf[0]; + } } }; check_var_no_nan(cur_scope, inside_grad_name); @@ -210,7 +220,7 @@ class WhileGradOp : public framework::OperatorBase { "sum", {{"X", {pg_names[param_id], new_inside_name}}}, {{"Out", {pg_names[param_id]}}}, framework::AttributeMap{}); sum_op->Run(cur_scope, dev_place); - check_var_no_nan(cur_scope, pg_names[param_id]); + check_var_no_nan(scope, pg_names[param_id]); cur_scope.Rename(new_inside_name, inside_grad_name); } } diff --git a/paddle/pybind/tensor_py.h b/paddle/pybind/tensor_py.h index 4d5e73e2c2..6b4290972b 100644 --- a/paddle/pybind/tensor_py.h +++ b/paddle/pybind/tensor_py.h @@ -77,10 +77,10 @@ struct CastToPyBufferImpl { } else if (paddle::platform::is_cpu_place(tensor.place())) { dst_tensor = tensor; } - return py::buffer_info( - dst_tensor.mutable_data(dst_tensor.place()), - sizeof(CUR_TYPE), py::format_descriptor::format(), - (size_t)framework::arity(dst_tensor.dims()), dims_outside, strides); + return py::buffer_info(dst_tensor.data(), sizeof(CUR_TYPE), + py::format_descriptor::format(), + (size_t)framework::arity(dst_tensor.dims()), + dims_outside, strides); } else { constexpr bool less = I + 1 < std::tuple_size>::value; return CastToPyBufferImpl()(tensor); diff --git a/python/paddle/v2/fluid/tests/test_dynrnn_gradient_check.py b/python/paddle/v2/fluid/tests/test_dynrnn_gradient_check.py index 238fd1a8cb..6569ccb9e6 100644 --- a/python/paddle/v2/fluid/tests/test_dynrnn_gradient_check.py +++ b/python/paddle/v2/fluid/tests/test_dynrnn_gradient_check.py @@ -294,7 +294,7 @@ class TestSimpleMulWithMemory(unittest.TestCase): assert isinstance(Out, Output) Out.out(o) - # @many_times(10) + @many_times(10) @prog_scope() def test_forward_backward(self): py_rnn = TestSimpleMulWithMemory.SimpleMulWithMemory() @@ -330,10 +330,8 @@ class TestSimpleMulWithMemory(unittest.TestCase): ], return_numpy=False)) last_by_py, = py_rnn.exe().values() - print w_g[0] - self.assertTrue(numpy.allclose(last_np, last_by_py)) w_g_num = py_rnn.get_numeric_gradient_of_param(self.PARAM_NAME) - # print w_g_num[0], w_g[0] + self.assertTrue(numpy.allclose(last_np, last_by_py)) self.assertTrue(numpy.allclose(w_g_num, w_g, rtol=0.1)) i_g_num = py_rnn.get_numeric_gradient_of_input(self.DATA_NAME) From 5022ee63597c0ac52a9b5344f81546f6c26b2dc7 Mon Sep 17 00:00:00 2001 From: Yancey Date: Thu, 28 Dec 2017 17:09:11 +0800 Subject: [PATCH 119/181] ThreadPool::Run interface return std::future (#7099) * Run interface return future * delete unused comments --- paddle/framework/threadpool.h | 19 +++++++++++++------ paddle/framework/threadpool_test.cc | 19 ++++++++++++------- 2 files changed, 25 insertions(+), 13 deletions(-) diff --git a/paddle/framework/threadpool.h b/paddle/framework/threadpool.h index 5f6b2d458f..bcd8190755 100644 --- a/paddle/framework/threadpool.h +++ b/paddle/framework/threadpool.h @@ -16,6 +16,7 @@ limitations under the License. */ #include #include +#include #include #include #include @@ -25,10 +26,11 @@ limitations under the License. 
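A minimal usage sketch for the future-returning Run() declared below
(illustrative only; the lambda body and task count are placeholders):

    auto* pool = paddle::framework::ThreadPool::GetInstance();
    std::vector<std::future<void>> futures;
    for (int i = 0; i < 4; ++i) {
      futures.push_back(pool->Run([] { /* some independent work */ }));
    }
    for (auto& f : futures) {
      f.wait();  // blocks until the corresponding task has executed
    }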
*/ namespace paddle { namespace framework { -typedef std::function Task; - class ThreadPool { public: + typedef std::packaged_task Task; + typedef std::function Fun; + /** * @brief Get a instance of threadpool, the thread number will * be specified as the number of hardware thread contexts @@ -61,13 +63,18 @@ class ThreadPool { /** * @brief Push a function to the queue, and will be scheduled and * executed if a thread is available. - * @param[in] Task will be pushed to the task queue. + * @param[in] Task, will be pushed to the task queue. + * @return std::future, we could wait for the task finished by + * f.wait(). */ - void Run(const Task& fn) { + std::future Run(const Fun& fn) { std::unique_lock lock(mutex_); - tasks_.push(fn); + Task task(std::bind(fn)); + std::future f = task.get_future(); + tasks_.push(std::move(task)); lock.unlock(); scheduled_.notify_one(); + return f; } /** @@ -110,7 +117,7 @@ class ThreadPool { break; } // pop a task from the task queue - auto task = tasks_.front(); + auto task = std::move(tasks_.front()); tasks_.pop(); --available_; diff --git a/paddle/framework/threadpool_test.cc b/paddle/framework/threadpool_test.cc index 012d92a5ed..50b6238cd8 100644 --- a/paddle/framework/threadpool_test.cc +++ b/paddle/framework/threadpool_test.cc @@ -20,16 +20,21 @@ limitations under the License. */ namespace framework = paddle::framework; void do_sum(framework::ThreadPool* pool, std::atomic& sum, int cnt) { + std::vector> fs; for (int i = 0; i < cnt; ++i) { - pool->Run([&sum]() { sum.fetch_add(1); }); + auto f = pool->Run([&sum]() { sum.fetch_add(1); }); + fs.push_back(std::move(f)); + } + for (auto& f : fs) { + f.wait(); } } TEST(ThreadPool, ConcurrentInit) { framework::ThreadPool* pool; - int concurrent_cnt = 50; + int n = 50; std::vector threads; - for (int i = 0; i < concurrent_cnt; ++i) { + for (int i = 0; i < n; ++i) { std::thread t([&pool]() { pool = framework::ThreadPool::GetInstance(); }); threads.push_back(std::move(t)); } @@ -38,13 +43,13 @@ TEST(ThreadPool, ConcurrentInit) { } } -TEST(ThreadPool, ConcurrentStart) { +TEST(ThreadPool, ConcurrentRun) { framework::ThreadPool* pool = framework::ThreadPool::GetInstance(); std::atomic sum(0); std::vector threads; - int concurrent_cnt = 50; + int n = 50; // sum = (n * (n + 1)) / 2 - for (int i = 1; i <= concurrent_cnt; ++i) { + for (int i = 1; i <= n; ++i) { std::thread t(do_sum, pool, std::ref(sum), i); threads.push_back(std::move(t)); } @@ -52,5 +57,5 @@ TEST(ThreadPool, ConcurrentStart) { t.join(); } pool->Wait(); - EXPECT_EQ(sum, ((concurrent_cnt + 1) * concurrent_cnt) / 2); + EXPECT_EQ(sum, ((n + 1) * n) / 2); } From f74dff97ea625a557146fe8c11313e5242c7cd0a Mon Sep 17 00:00:00 2001 From: guosheng Date: Thu, 28 Dec 2017 17:36:16 +0800 Subject: [PATCH 120/181] Refine the activation type in the GRU operator related --- paddle/operators/gru_op.h | 19 +++++++---- paddle/operators/math/detail/gru_cpu_kernel.h | 34 +++++++++---------- paddle/operators/math/detail/gru_gpu_kernel.h | 10 +++--- paddle/operators/math/detail/gru_kernel.h | 17 +++++----- paddle/operators/math/gru_compute.cc | 12 +++---- paddle/operators/math/gru_compute.cu | 10 +++--- paddle/operators/math/gru_compute.h | 21 ++++++------ 7 files changed, 60 insertions(+), 63 deletions(-) diff --git a/paddle/operators/gru_op.h b/paddle/operators/gru_op.h index c6228864d7..d773521259 100644 --- a/paddle/operators/gru_op.h +++ b/paddle/operators/gru_op.h @@ -14,6 +14,7 @@ limitations under the License. 
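The math::detail::GetActivationType() helper used below presumably maps the
op's string attributes onto the ActivationType enum, roughly along these
lines (a sketch only; the real implementation lives in
math/detail/activation_functions.h, and the enumerator names here are
assumptions):

    ActivationType GetActivationType(const std::string &type) {
      if (type == "sigmoid") return ActivationType::kSigmoid;
      if (type == "relu") return ActivationType::kReLU;
      if (type == "tanh") return ActivationType::kTanh;
      if (type == "identity" || type == "") return ActivationType::kIdentity;
      PADDLE_THROW("Not support activation type %s.", type);
    }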
*/ #pragma once +#include "paddle/operators/math/detail/activation_functions.h" #include "paddle/operators/math/gru_compute.h" #include "paddle/operators/math/math_function.h" #include "paddle/operators/math/sequence2batch.h" @@ -70,7 +71,7 @@ class GRUKernel : public framework::OpKernel { } int frame_size = hidden_dims[1]; - math::hl_gru_value gru_value; + math::GRUMetaValue gru_value; gru_value.gate_weight = const_cast(weight_data); gru_value.state_weight = const_cast(weight_data + 2 * frame_size * frame_size); @@ -102,8 +103,10 @@ class GRUKernel : public framework::OpKernel { gru_value.reset_output_value = reset_hidden_prev_t.data(); math::GRUUnitFunctor::compute( dev_ctx, gru_value, frame_size, cur_batch_size, - math::ActiveType(context.Attr("activation")), - math::ActiveType(context.Attr("gate_activation"))); + math::detail::GetActivationType( + context.Attr("activation")), + math::detail::GetActivationType( + context.Attr("gate_activation"))); gru_value.prev_out_value = gru_value.output_value; } @@ -170,12 +173,12 @@ class GRUGradKernel : public framework::OpKernel { batch_hidden_grad.set_lod(batch_hidden->lod()); to_batch(dev_ctx, *hidden_grad, batch_hidden_grad, false, is_reverse); - math::hl_gru_value gru_value; + math::GRUMetaValue gru_value; gru_value.gate_weight = const_cast(weight_data); gru_value.state_weight = const_cast(weight_data + 2 * frame_size * frame_size); - math::hl_gru_grad gru_grad; + math::GRUMetaGrad gru_grad; if (weight_grad) { gru_grad.gate_weight_grad = weight_grad->mutable_data(context.GetPlace()); @@ -220,8 +223,10 @@ class GRUGradKernel : public framework::OpKernel { math::GRUUnitGradFunctor::compute( dev_ctx, gru_value, gru_grad, frame_size, cur_batch_size, - math::ActiveType(context.Attr("activation")), - math::ActiveType(context.Attr("gate_activation"))); + math::detail::GetActivationType( + context.Attr("activation")), + math::detail::GetActivationType( + context.Attr("gate_activation"))); } if (input_grad) { input_grad->mutable_data(context.GetPlace()); diff --git a/paddle/operators/math/detail/gru_cpu_kernel.h b/paddle/operators/math/detail/gru_cpu_kernel.h index 4c67dec9cb..a61b232f42 100644 --- a/paddle/operators/math/detail/gru_cpu_kernel.h +++ b/paddle/operators/math/detail/gru_cpu_kernel.h @@ -28,7 +28,7 @@ template void hl_naive_gru_forward_reset_output(OpResetOutput op_reset_output, T *gate_value, T *reset_output_value, T *prev_output_value, int frame_size, - activation_mode_t active_gate) { + ActivationType active_gate) { T r_value_update_gate; T r_value_reset_gate; T r_value_reset_output; @@ -56,7 +56,7 @@ template void hl_naive_gru_forward_final_output(OpFinalOutput op_final_output, T *gate_value, T *prev_output_value, T *output_value, int frame_size, - activation_mode_t active_node) { + ActivationType active_node) { T r_value_update_gate; T r_value_frame_state; T r_prev_out = 0; @@ -83,7 +83,7 @@ template void hl_avx_gru_forward_reset_output(OpResetOutput op_reset_output, T *gate_value, T *reset_output_value, T *prev_output_value, int frame_size, - activation_mode_t active_gate) { + ActivationType active_gate) { #ifdef __AVX__ __m256 r_value_update_gate; __m256 r_value_reset_gate; @@ -113,7 +113,7 @@ template void hl_avx_gru_forward_final_output(OpFinalOutput op_final_output, T *gate_value, T *prev_output_value, T *output_value, int frame_size, - activation_mode_t active_node) { + ActivationType active_node) { #ifdef __AVX__ __m256 r_value_update_gate; __m256 r_value_frame_state; @@ -140,9 +140,8 @@ void 
hl_avx_gru_forward_final_output(OpFinalOutput op_final_output, template inline void forward_reset_output(OpResetOutput op_reset_output, - hl_gru_value value, int frame_size, - int batch_size, - activation_mode_t active_gate) { + GRUMetaValue value, int frame_size, + int batch_size, ActivationType active_gate) { for (int b = 0; b < batch_size; b++) { if (OpResetOutput::avx && !(frame_size & (8 - 1)) && (sizeof(T) == 4)) { hl_avx_gru_forward_reset_output( @@ -164,9 +163,8 @@ inline void forward_reset_output(OpResetOutput op_reset_output, template inline void forward_final_output(OpFinalOutput op_final_output, - hl_gru_value value, int frame_size, - int batch_size, - activation_mode_t active_node) { + GRUMetaValue value, int frame_size, + int batch_size, ActivationType active_node) { for (int b = 0; b < batch_size; b++) { if (OpFinalOutput::avx && !(frame_size & (8 - 1)) && (sizeof(T) == 4)) { hl_avx_gru_forward_final_output(op_final_output, value.gate_value, @@ -191,7 +189,7 @@ void hl_naive_gru_backward_state_grad(OpStateGrad op_state_grad, T *gate_value, T *gate_grad, T *prev_out_value, T *prev_out_grad, T *output_grad, int frame_size, - activation_mode_t active_node) { + ActivationType active_node) { T r_update_gate_value; T r_update_gate_grad; T r_frame_state_value; @@ -232,7 +230,7 @@ void hl_naive_gru_backward_reset_grad(OpResetGrad op_reset_grad, T *gate_value, T *gate_grad, T *prev_out_value, T *prev_out_grad, T *reset_output_grad, int frame_size, - activation_mode_t active_gate) { + ActivationType active_gate) { T r_update_gate_value; T r_update_gate_grad; T r_reset_gate_value; @@ -277,7 +275,7 @@ void hl_avx_gru_backward_state_grad(OpStateGrad op_state_grad, T *gate_value, T *gate_grad, T *prev_out_value, T *prev_out_grad, T *output_grad, int frame_size, - activation_mode_t active_node) { + ActivationType active_node) { #ifdef __AVX__ __m256 r_update_gate_value; __m256 r_update_gate_grad; @@ -320,7 +318,7 @@ void hl_avx_gru_backward_reset_grad(OpResetGrad op_reset_grad, T *gate_value, T *gate_grad, T *prev_out_value, T *prev_out_grad, T *reset_output_grad, int frame_size, - activation_mode_t active_gate) { + ActivationType active_gate) { #ifdef __AVX__ __m256 r_update_gate_value; __m256 r_update_gate_grad; @@ -364,9 +362,9 @@ void hl_avx_gru_backward_reset_grad(OpResetGrad op_reset_grad, T *gate_value, template inline void backward_state_grad(OpStateGrad op_state_grad, - hl_gru_value value, hl_gru_grad grad, + GRUMetaValue value, GRUMetaGrad grad, int frame_size, int batch_size, - activation_mode_t active_node) { + ActivationType active_node) { for (int b = 0; b < batch_size; b++) { if (OpStateGrad::avx && !(frame_size & (8 - 1)) && (sizeof(T) == 4)) { hl_avx_gru_backward_state_grad( @@ -393,9 +391,9 @@ inline void backward_state_grad(OpStateGrad op_state_grad, template inline void backward_reset_grad(OpResetGrad op_reset_grad, - hl_gru_value value, hl_gru_grad grad, + GRUMetaValue value, GRUMetaGrad grad, int frame_size, int batch_size, - activation_mode_t active_gate) { + ActivationType active_gate) { for (int b = 0; b < batch_size; b++) { if (OpResetGrad::avx && !(frame_size & (8 - 1)) && (sizeof(T) == 4)) { hl_avx_gru_backward_reset_grad( diff --git a/paddle/operators/math/detail/gru_gpu_kernel.h b/paddle/operators/math/detail/gru_gpu_kernel.h index d2edcb7f25..1783d46096 100644 --- a/paddle/operators/math/detail/gru_gpu_kernel.h +++ b/paddle/operators/math/detail/gru_gpu_kernel.h @@ -19,8 +19,6 @@ limitations under the License. 
*/ #include "paddle/platform/cuda_helper.h" #include "paddle/platform/device_context.h" -#include - namespace paddle { namespace operators { namespace math { @@ -35,7 +33,7 @@ __global__ void KeGruForwardResetOutput(OpResetOutput op_reset_output, T *gate_value, T *reset_output_value, T *prev_output_value, int frame_size, int batch_size, - activation_mode_t active_gate) { + ActivationType active_gate) { const int frame_idx = blockIdx.x * blockDim.x + threadIdx.x; if (frame_idx >= frame_size) return; @@ -74,7 +72,7 @@ __global__ void KeGruForwardFinalOutput(OpFinalOutput op_final_output, T *gate_value, T *prev_output_value, T *output_value, int frame_size, int batch_size, - activation_mode_t active_node) { + ActivationType active_node) { const int frame_idx = blockIdx.x * blockDim.x + threadIdx.x; if (frame_idx >= frame_size) return; int batch_idx = 0; @@ -111,7 +109,7 @@ __global__ void KeGruBackwardStateGrad(OpStateGrad op_state_grad, T *gate_value, T *gate_grad, T *prev_out_value, T *prev_out_grad, T *output_grad, int frame_size, int batch_size, - activation_mode_t active_node) { + ActivationType active_node) { const int frame_idx = blockIdx.x * blockDim.x + threadIdx.x; if (frame_idx >= frame_size) return; int batch_idx = 0; @@ -159,7 +157,7 @@ __global__ void KeGruBackwardResetGrad(OpResetGrad op_reset_grad, T *gate_value, T *gate_grad, T *prev_out_value, T *prev_out_grad, T *reset_output_grad, int frame_size, int batch_size, - activation_mode_t active_gate) { + ActivationType active_gate) { const int frame_idx = blockIdx.x * blockDim.x + threadIdx.x; if (frame_idx >= frame_size) return; int batch_idx = 0; diff --git a/paddle/operators/math/detail/gru_kernel.h b/paddle/operators/math/detail/gru_kernel.h index acd84be01d..4d8245cb5d 100644 --- a/paddle/operators/math/detail/gru_kernel.h +++ b/paddle/operators/math/detail/gru_kernel.h @@ -30,7 +30,7 @@ class gru_resetOutput { public: HOSTDEVICE void operator()(T &value_update_gate, T &value_reset_gate, T &prev_out, T &value_reset_output, - activation_mode_t act_gate) { + ActivationType act_gate) { value_update_gate = activation(value_update_gate, act_gate); value_reset_gate = activation(value_reset_gate, act_gate); value_reset_output = prev_out * value_reset_gate; @@ -43,7 +43,7 @@ class gru_resetOutput { HOSTDEVICE void operator()(__m256 &value_update_gate, __m256 &value_reset_gate, __m256 &prev_out, __m256 &value_reset_output, - activation_mode_t act_gate) { + ActivationType act_gate) { value_update_gate = activation(value_update_gate, act_gate); value_reset_gate = activation(value_reset_gate, act_gate); value_reset_output = _mm256_mul_ps(prev_out, value_reset_gate); @@ -57,7 +57,7 @@ class gru_finalOutput { public: HOSTDEVICE void operator()(T &value_update_gate, T &value_frame_state, T &prev_out, T &value_output, - activation_mode_t act_input) { + ActivationType act_input) { value_frame_state = activation(value_frame_state, act_input); value_output = prev_out - (value_update_gate * prev_out) + (value_update_gate * value_frame_state); @@ -69,8 +69,7 @@ class gru_finalOutput { static const bool avx = true; HOSTDEVICE void operator()(__m256 &value_update_gate, __m256 &value_frame_state, __m256 &prev_out, - __m256 &value_output, - activation_mode_t act_input) { + __m256 &value_output, ActivationType act_input) { value_frame_state = activation(value_frame_state, act_input); value_output = _mm256_add_ps( _mm256_sub_ps(prev_out, _mm256_mul_ps(value_update_gate, prev_out)), @@ -89,7 +88,7 @@ class gru_stateGrad { HOSTDEVICE void operator()(T 
&value_update_gate, T &grad_update_gate, T &value_frame_state, T &grad_frame_state, T &value_prev_out, T &grad_prev_out, - T &grad_output, activation_mode_t act_input) { + T &grad_output, ActivationType act_input) { grad_update_gate = (grad_output * value_frame_state); grad_update_gate -= (grad_output * value_prev_out); grad_prev_out -= (grad_output * value_update_gate); @@ -107,7 +106,7 @@ class gru_stateGrad { __m256 &value_frame_state, __m256 &grad_frame_state, __m256 &value_prev_out, __m256 &grad_prev_out, __m256 &grad_output, - activation_mode_t act_input) { + ActivationType act_input) { grad_update_gate = _mm256_mul_ps(grad_output, value_frame_state); grad_update_gate = _mm256_sub_ps( grad_update_gate, _mm256_mul_ps(grad_output, value_prev_out)); @@ -128,7 +127,7 @@ class gru_resetGrad { HOSTDEVICE void operator()(T &value_update_gate, T &grad_update_gate, T &value_reset_gate, T &grad_reset_gate, T &value_prev_out, T &grad_prev_out, - T &grad_reset_output, activation_mode_t act_gate) { + T &grad_reset_output, ActivationType act_gate) { grad_reset_gate = (grad_reset_output * value_prev_out); grad_prev_out += (grad_reset_output * value_reset_gate); grad_update_gate = @@ -144,7 +143,7 @@ class gru_resetGrad { __m256 &grad_update_gate, __m256 &value_reset_gate, __m256 &grad_reset_gate, __m256 &value_prev_out, __m256 &grad_prev_out, __m256 &grad_reset_output, - activation_mode_t act_gate) { + ActivationType act_gate) { grad_reset_gate = _mm256_mul_ps(grad_reset_output, value_prev_out); grad_prev_out = _mm256_add_ps( grad_prev_out, _mm256_mul_ps(grad_reset_output, value_reset_gate)); diff --git a/paddle/operators/math/gru_compute.cc b/paddle/operators/math/gru_compute.cc index d570c68cd4..101ab85962 100644 --- a/paddle/operators/math/gru_compute.cc +++ b/paddle/operators/math/gru_compute.cc @@ -21,9 +21,9 @@ namespace math { template struct GRUUnitFunctor { static void compute(const platform::CPUDeviceContext &context, - hl_gru_value value, int frame_size, int batch_size, - activation_mode_t active_node, - activation_mode_t active_gate) { + GRUMetaValue value, int frame_size, int batch_size, + const detail::ActivationType active_node, + const detail::ActivationType active_gate) { #ifndef __NVCC__ if (value.prev_out_value) { math::gemm( @@ -51,10 +51,10 @@ struct GRUUnitFunctor { template struct GRUUnitGradFunctor { static void compute(const platform::CPUDeviceContext &context, - hl_gru_value value, hl_gru_grad grad, + GRUMetaValue value, GRUMetaGrad grad, int frame_size, int batch_size, - activation_mode_t active_node, - activation_mode_t active_gate) { + const detail::ActivationType active_node, + const detail::ActivationType active_gate) { #ifndef __NVCC__ detail::backward_state_grad(detail::backward::gru_stateGrad(), value, grad, frame_size, batch_size, active_node); diff --git a/paddle/operators/math/gru_compute.cu b/paddle/operators/math/gru_compute.cu index dd518cd1e4..aab3e2309b 100644 --- a/paddle/operators/math/gru_compute.cu +++ b/paddle/operators/math/gru_compute.cu @@ -21,9 +21,8 @@ namespace math { template struct GRUUnitFunctor { static void compute(const platform::CUDADeviceContext &context, - hl_gru_value value, int frame_size, int batch_size, - activation_mode_t active_node, - activation_mode_t active_gate) { + GRUMetaValue value, int frame_size, int batch_size, + ActivationType active_node, ActivationType active_gate) { auto stream = context.stream(); dim3 threads; dim3 grid; @@ -88,10 +87,9 @@ struct GRUUnitFunctor { template struct GRUUnitGradFunctor { static void 
compute(const platform::CUDADeviceContext &context, - hl_gru_value value, hl_gru_grad grad, + GRUMetaValue value, GRUMetaGrad grad, int frame_size, int batch_size, - activation_mode_t active_node, - activation_mode_t active_gate) { + ActivationType active_node, ActivationType active_gate) { auto stream = context.stream(); dim3 threads; dim3 grid; diff --git a/paddle/operators/math/gru_compute.h b/paddle/operators/math/gru_compute.h index ca1343cb2c..bf69147b50 100644 --- a/paddle/operators/math/gru_compute.h +++ b/paddle/operators/math/gru_compute.h @@ -11,7 +11,7 @@ limitations under the License. */ #pragma once -#include "paddle/operators/math/lstm_compute.h" +#include "paddle/operators/math/detail/activation_functions.h" #include "paddle/platform/device_context.h" #include "paddle/platform/enforce.h" @@ -19,9 +19,8 @@ namespace paddle { namespace operators { namespace math { -// TODO(guosheng): refine code style in gru_compute template -struct hl_gru_value { +struct GRUMetaValue { T *gate_weight; T *state_weight; T *gate_value; @@ -31,7 +30,7 @@ struct hl_gru_value { }; template -struct hl_gru_grad { +struct GRUMetaGrad { T *gate_weight_grad; T *state_weight_grad; T *gate_grad; @@ -42,18 +41,18 @@ struct hl_gru_grad { template struct GRUUnitFunctor { - static void compute(const DeviceContext &context, hl_gru_value value, + static void compute(const DeviceContext &context, GRUMetaValue value, int frame_size, int batch_size, - activation_mode_t active_node, - activation_mode_t active_gate); + const detail::ActivationType active_node, + const detail::ActivationType active_gate); }; template struct GRUUnitGradFunctor { - static void compute(const DeviceContext &context, hl_gru_value value, - hl_gru_grad grad, int frame_size, int batch_size, - activation_mode_t active_node, - activation_mode_t active_gate); + static void compute(const DeviceContext &context, GRUMetaValue value, + GRUMetaGrad grad, int frame_size, int batch_size, + const detail::ActivationType active_node, + const detail::ActivationType active_gate); }; } // namespace math From 3158b4b37a7743239030a331de56f9c227d14adf Mon Sep 17 00:00:00 2001 From: Yang Yu Date: Thu, 28 Dec 2017 17:50:29 +0800 Subject: [PATCH 121/181] Update tensor_util --- paddle/framework/CMakeLists.txt | 6 ++- paddle/framework/tensor_util.cc | 10 +++-- paddle/framework/tensor_util.h | 3 ++ paddle/framework/tensor_util_test.cu | 57 ++++++++++++++++++++++++++++ 4 files changed, 72 insertions(+), 4 deletions(-) create mode 100644 paddle/framework/tensor_util_test.cu diff --git a/paddle/framework/CMakeLists.txt b/paddle/framework/CMakeLists.txt index 2af10a996c..46dce7d1d2 100644 --- a/paddle/framework/CMakeLists.txt +++ b/paddle/framework/CMakeLists.txt @@ -12,7 +12,11 @@ else() endif () cc_test(tensor_test SRCS tensor_test.cc DEPS tensor) -cc_test(tensor_util_test SRCS tensor_util_test.cc DEPS tensor) +if (WITH_GPU) + nv_test(tensor_util_test SRCS tensor_util_test.cc tensor_util_test.cu DEPS tensor) +else() + cc_test(tensor_util_test SRCS tensor_util_test.cc DEPS tensor) +endif() cc_test(eigen_test SRCS eigen_test.cc DEPS tensor) diff --git a/paddle/framework/tensor_util.cc b/paddle/framework/tensor_util.cc index 293c65a065..7efc649d0b 100644 --- a/paddle/framework/tensor_util.cc +++ b/paddle/framework/tensor_util.cc @@ -31,6 +31,7 @@ struct AnyDTypeVisitor { void operator()() const { auto t = EigenVector::Flatten(tensor_); auto o = EigenScalar::From(*out_); + // return any of predicate_(t) is true. 
o.device(*ctx_.eigen_device()) = predicate_(t).any();
   }
 };

@@ -66,9 +67,10 @@ struct AnyVisitor : public boost::static_visitor<bool> {
     framework::Tensor tmp;
     tmp.Resize({1});
     tmp.mutable_data<bool>(cpu);
-    platform::DeviceContextPool::Instance().Get(gpu)->Wait();
-    CopyFrom(out, cpu, &tmp);
-    platform::DeviceContextPool::Instance().Get(gpu)->Wait();
+    auto gpuctx = platform::DeviceContextPool::Instance().Get(gpu);
+    gpuctx->Wait();
+    CopyFrom(out, cpu, *gpuctx, &tmp);
+    gpuctx->Wait();
     return GetResult(tmp, cpu);
   }

@@ -89,6 +91,7 @@ struct HasNANPredicate {
   template <typename T>
   auto operator()(const T& eigen_vec) const
       -> decltype(std::declval<T>().isnan()) {
+    // Cast eigen_vector to a vector of bool: true if the element is NaN.
     return eigen_vec.isnan();
   }
 };
@@ -102,6 +105,7 @@ struct HasInfPredicate {
   template <typename T>
   auto operator()(const T& eigen_vec) const
       -> decltype(std::declval<T>().isinf()) {
+    // Cast eigen_vector to a vector of bool: true if the element is Inf.
     return eigen_vec.isinf();
   }
 };
diff --git a/paddle/framework/tensor_util.h b/paddle/framework/tensor_util.h
index e71d8e5672..784170dae3 100644
--- a/paddle/framework/tensor_util.h
+++ b/paddle/framework/tensor_util.h
@@ -208,7 +208,10 @@ inline void CopyToVector(const Tensor& src, std::vector<T>* dst) {
                    src_ptr, size);
 }

+// Returns true if a tensor contains NAN, i.e., Not A Number.
 extern bool HasNAN(const framework::Tensor& tensor);
+
+// Returns true if a tensor contains Inf, i.e., Infinity.
 extern bool HasInf(const framework::Tensor& tensor);

 }  // namespace framework
diff --git a/paddle/framework/tensor_util_test.cu b/paddle/framework/tensor_util_test.cu
new file mode 100644
index 0000000000..ebd35fdf6c
--- /dev/null
+++ b/paddle/framework/tensor_util_test.cu
@@ -0,0 +1,57 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
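The GPU tests below have a natural CPU-side counterpart, sketched here for
reference (illustrative only; it exercises the same HasNAN API without a
device context):

    paddle::framework::Tensor t;
    float* buf = t.mutable_data<float>({3}, paddle::platform::CPUPlace());
    buf[0] = 0.0;
    buf[1] = NAN;  // from <cmath>
    buf[2] = 0.5;
    ASSERT_TRUE(paddle::framework::HasNAN(t));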
*/ + +#include "gtest/gtest.h" +#include "paddle/framework/tensor_util.h" +#include "paddle/platform/device_context.h" +#include "paddle/platform/place.h" + +namespace paddle { +namespace framework { + +static __global__ void FillNAN(float* buf) { + buf[0] = 0.0; + buf[1] = 0.1; + buf[2] = NAN; +} +static __global__ void FillInf(float* buf) { + buf[0] = 0.0; + buf[1] = INFINITY; + buf[2] = 0.5; +} + +TEST(HasNAN, GPU) { + Tensor tensor; + platform::CUDAPlace gpu(0); + auto& pool = platform::DeviceContextPool::Instance(); + auto* cuda_ctx = pool.GetByPlace(gpu); + float* buf = tensor.mutable_data({3}, gpu); + FillNAN<<<1, 1, 0, cuda_ctx->stream()>>>(buf); + cuda_ctx->Wait(); + ASSERT_TRUE(HasNAN(tensor)); +} + +TEST(HasInf, GPU) { + Tensor tensor; + platform::CUDAPlace gpu(0); + auto& pool = platform::DeviceContextPool::Instance(); + auto* cuda_ctx = pool.GetByPlace(gpu); + float* buf = tensor.mutable_data({3}, gpu); + FillInf<<<1, 1, 0, cuda_ctx->stream()>>>(buf); + cuda_ctx->Wait(); + ASSERT_TRUE(HasInf(tensor)); +} + +} // namespace framework +} // namespace paddle From bb0427add03ce29b8013511f9cebf509e9de3585 Mon Sep 17 00:00:00 2001 From: fengjiayi Date: Thu, 28 Dec 2017 17:57:17 +0800 Subject: [PATCH 122/181] Add comments for functions in backward.py --- python/paddle/v2/fluid/backward.py | 77 ++++++++++++++++++++++++++---- 1 file changed, 67 insertions(+), 10 deletions(-) diff --git a/python/paddle/v2/fluid/backward.py b/python/paddle/v2/fluid/backward.py index 6966cc7580..b3c1bab298 100644 --- a/python/paddle/v2/fluid/backward.py +++ b/python/paddle/v2/fluid/backward.py @@ -5,14 +5,17 @@ import collections __all__ = ['append_backward'] -def _rename_arg_(op_desc_list, old_name, new_name, begin_idx=None, - end_idx=None): +def _rename_arg_(op_descs, old_name, new_name, begin_idx=None, end_idx=None): + """ + Traverse all ops in op_descs[begin_idx : end_idx], + if any op has inputs/outputs named "old_name", rename it as 'new_name' + """ if begin_idx is None: begin_idx = 0 if end_idx is None: - end_idx = len(op_desc_list) + end_idx = len(op_descs) for i in range(begin_idx, end_idx): - op_desc = op_desc_list[i] + op_desc = op_descs[i] if isinstance(op_desc, tuple): op_desc = op_desc[0] op_desc.rename_input(old_name, new_name) @@ -20,6 +23,9 @@ def _rename_arg_(op_desc_list, old_name, new_name, begin_idx=None, def _create_op_desc_(op_type, inputs, outputs, attrs): + """ + Create a C++ OpDesc object with specified inputs, outputs and attributes. + """ op_desc = core.OpDesc() op_desc.set_type(op_type) for para, args in inputs.iteritems(): @@ -34,9 +40,12 @@ def _create_op_desc_(op_type, inputs, outputs, attrs): return op_desc -def _infer_var_data_type_(var_name, block): - grad_var = block.desc.find_var(var_name.encode("ascii")) - fwd_name = _strip_grad_suffix_(var_name.encode("ascii")) +def _infer_var_data_type_(grad_var_name, block): + """ + Infer the data type of given grad variable + """ + grad_var = block.desc.find_var(grad_var_name.encode("ascii")) + fwd_name = _strip_grad_suffix_(grad_var_name.encode("ascii")) if block.desc.has_var_recursive(fwd_name): fwd_var = block.desc.find_var_recursive(fwd_name.encode("ascii")) grad_var.set_dtype(fwd_var.dtype()) @@ -45,6 +54,9 @@ def _infer_var_data_type_(var_name, block): def _all_in_set_(cands, s): + """ + Test if all elements of 'cands' are in set 's' + """ for c in cands: if not c in s: return False @@ -52,18 +64,29 @@ def _all_in_set_(cands, s): def _strip_grad_suffix_(name): + """ + Strip the grad suffix from the given varibale name + e.g. 
x@GRAD ==> x
+        y@GRAD@RENAME@1 ==> y
+    """
     pos = name.find(core.grad_var_suffix())
     return name[:pos] if pos != -1 else name
 
 
 def _append_grad_suffix_(name):
+    """
+    Append the grad suffix to the given variable name
+    e.g. x ==> x@GRAD
+    """
     return name + core.grad_var_suffix()
 
 
 def _addup_repetitive_outputs_(op_descs):
-    # In backward part, an variable my be the output of more than one ops.
-    # In this case, the variable should be the accumulation of all the outputs.
-    # We adopt adding `sum_op`s to implement the accumulate.
+    """
+    In backward part, a variable may be the output of more than one op.
+    In this case, the variable should be the accumulation of all the outputs.
+    `sum_op`s are added to implement the accumulation.
+    """
     pending_sum_ops = []
     var_rename_count = collections.defaultdict(int)
     renamed_vars = collections.defaultdict(list)
@@ -109,6 +132,12 @@ def _addup_repetitive_outputs_(op_descs):
 
 
 def _remove_no_grad_branch_(op_descs, no_grad_set):
+    """
+    Remove unnecessary grad ops
+    A grad op can be removed in two cases:
+    1. all outputs of the grad op are in 'no_grad_set'
+    2. (TODO) all grad inputs of the grad op are in 'no_grad_set'
+    """
     # Remove ops whose outputs are all in no_grad_dict
     op_descs = filter(
         lambda op_desc: not _all_in_set_(op_desc.output_arg_names(), no_grad_set),
@@ -133,6 +162,20 @@ def _append_backward_ops_(target,
                           no_grad_dict,
                           grad_to_var,
                           callback=None):
+    """
+    Create all grad ops, and insert them into the given block
+
+    Args:
+        target(Variable): the target variable of forward pass
+        block(Block): the block where forward ops are
+        target_block(Block): the block which is going to hold new generated grad ops
+        no_grad_dict(dict):
+            key(int) block index
+            val(set) a set of variable names. These variables have no gradient
+        grad_to_var(dict)(output argument):
+            key(str): grad variable name
+            val(str): corresponding forward variable name
+    """
     grad_op_descs = []
     program = block.program
     for op in reversed(block.ops):
@@ -170,6 +213,20 @@ def _append_backward_ops_(target,
 
 
 def _append_backward_vars_(block, start_op_idx, grad_to_var, grad_info_map):
+    """
+    Create new variables required by the backward pass.
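+    Typically these are the gradient variables that the grad ops in
+    block.ops[start_op_idx :] write to but that have not been created yet.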
+ + Args: + block(Block): the block where new variables will be created + start_op_idx(int): Only variables required by ops in block.ops[start_op_idx : ] will be created + grad_to_var(dict): + key(str): grad variable name + val(str): corresponding forward variable name + In most cases, this dict is generated by _append_backward_ops_() + grad_info_map(dict)(output argument): + key(str): forward variable name + val(tuple): a tuple of (str, int), str is the corresponding grad name, int is the block index + """ for op_idx in range(start_op_idx, block.desc.op_size()): op_desc = block.desc.op(op_idx) if op_desc.has_attr("sub_block"): From 23b53c48df461b11a2a39929e30c661fbc407aee Mon Sep 17 00:00:00 2001 From: guosheng Date: Thu, 28 Dec 2017 18:09:52 +0800 Subject: [PATCH 123/181] Delete the old activation type for LSTM and GRU operator --- paddle/operators/math/gru_compute.cu | 6 ++++-- paddle/operators/math/lstm_compute.h | 22 ---------------------- 2 files changed, 4 insertions(+), 24 deletions(-) diff --git a/paddle/operators/math/gru_compute.cu b/paddle/operators/math/gru_compute.cu index aab3e2309b..d5a0e630ea 100644 --- a/paddle/operators/math/gru_compute.cu +++ b/paddle/operators/math/gru_compute.cu @@ -22,7 +22,8 @@ template struct GRUUnitFunctor { static void compute(const platform::CUDADeviceContext &context, GRUMetaValue value, int frame_size, int batch_size, - ActivationType active_node, ActivationType active_gate) { + const detail::ActivationType active_node, + const detail::ActivationType active_gate) { auto stream = context.stream(); dim3 threads; dim3 grid; @@ -89,7 +90,8 @@ struct GRUUnitGradFunctor { static void compute(const platform::CUDADeviceContext &context, GRUMetaValue value, GRUMetaGrad grad, int frame_size, int batch_size, - ActivationType active_node, ActivationType active_gate) { + const detail::ActivationType active_node, + const detail::ActivationType active_gate) { auto stream = context.stream(); dim3 threads; dim3 grid; diff --git a/paddle/operators/math/lstm_compute.h b/paddle/operators/math/lstm_compute.h index 954762f922..e1ad6b64d2 100644 --- a/paddle/operators/math/lstm_compute.h +++ b/paddle/operators/math/lstm_compute.h @@ -22,14 +22,6 @@ namespace paddle { namespace operators { namespace math { -typedef enum { - HL_ACTIVATION_SIGMOID = 0, - HL_ACTIVATION_RELU = 1, - HL_ACTIVATION_TANH = 2, - HL_ACTIVATION_LINEAR = 3, - HL_ACTIVATION_END -} activation_mode_t; - template struct LstmMetaValue { T *gate_value; @@ -54,20 +46,6 @@ struct LstmMetaGrad { T *check_og_grad; }; -inline activation_mode_t ActiveType(const std::string &type) { - if (type == "sigmoid") { - return HL_ACTIVATION_SIGMOID; - } else if (type == "relu") { - return HL_ACTIVATION_RELU; - } else if (type == "tanh") { - return HL_ACTIVATION_TANH; - } else if (type == "linear" || type == "identity" || type == "") { - return HL_ACTIVATION_LINEAR; - } else { - PADDLE_THROW("Do not support activation type."); - } -} - template class LstmUnitFunctor { public: From cf9e09b115bae0ad9cbb2ad3594f0f10f30a813b Mon Sep 17 00:00:00 2001 From: tensor-tang Date: Thu, 28 Dec 2017 21:51:47 +0800 Subject: [PATCH 124/181] set openblas env to avoid threads conflicts --- benchmark/paddle/image/run_openblas_train.sh | 1 + 1 file changed, 1 insertion(+) diff --git a/benchmark/paddle/image/run_openblas_train.sh b/benchmark/paddle/image/run_openblas_train.sh index e9df83fee2..d82c8384e0 100755 --- a/benchmark/paddle/image/run_openblas_train.sh +++ b/benchmark/paddle/image/run_openblas_train.sh @@ -2,6 +2,7 @@ set -e 
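+# Rationale (assumed): OpenBLAS keeps its own worker-thread pool, so capping
+# OPENBLAS_NUM_THREADS at 1 per trainer avoids oversubscribing the CPU when
+# trainer_count already uses all cores.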
function train() { unset OMP_NUM_THREADS MKL_NUM_THREADS OMP_DYNAMIC KMP_AFFINITY + export OPENBLAS_NUM_THREADS=1 topology=$1 layer_num=$2 bs=$3 From 33b5382efc8f3e58eda8bae24559f22d6485824c Mon Sep 17 00:00:00 2001 From: tensor-tang Date: Thu, 28 Dec 2017 22:15:59 +0800 Subject: [PATCH 125/181] auto set openblas env --- paddle/scripts/submit_local.sh.in | 23 ++++++++++++++--------- 1 file changed, 14 insertions(+), 9 deletions(-) diff --git a/paddle/scripts/submit_local.sh.in b/paddle/scripts/submit_local.sh.in index a94bc01b35..8a352b0078 100755 --- a/paddle/scripts/submit_local.sh.in +++ b/paddle/scripts/submit_local.sh.in @@ -71,9 +71,7 @@ function threads_config() { # auto set OMP_NUM_THREADS and MKL_NUM_THREADS # according to trainer_count and total processors # only when MKL enabled - if [ "@WITH_MKL@" == "OFF" ]; then - return 0 - fi + # auto set OPENBLAS_NUM_THREADS when do not use MKL processors=`grep "processor" /proc/cpuinfo|sort -u|wc -l` trainers=`grep -Eo 'trainer_count.[0-9]+' <<< "$@" |grep -Eo '[0-9]+'|xargs` if [ -z $trainers ]; then @@ -83,12 +81,19 @@ function threads_config() { if [ $threads -eq 0 ]; then threads=1 fi - if [ -z "$OMP_NUM_THREADS" ]; then - export OMP_NUM_THREADS=$threads - fi - if [ -z "$MKL_NUM_THREADS" ]; then - export MKL_NUM_THREADS=$threads + if [ "@WITH_MKL@" == "ON" ]; then + if [ -z "$OMP_NUM_THREADS" ]; then + export OMP_NUM_THREADS=$threads + fi + if [ -z "$MKL_NUM_THREADS" ]; then + export MKL_NUM_THREADS=$threads + fi + else + if [ -z "$OPENBLAS_NUM_THREADS" ]; then + export OPENBLAS_NUM_THREADS=$threads + fi fi + } PADDLE_CONF_HOME="$HOME/.config/paddle" @@ -150,7 +155,7 @@ fi case "$1" in "train") threads_config $@ - # echo $OMP_NUM_THREADS $MKL_NUM_THREADS + # echo $OMP_NUM_THREADS $MKL_NUM_THREADS $OPENBLAS_NUM_THREADS ${DEBUGGER} $PADDLE_BIN_PATH/paddle_trainer ${@:2} ;; "merge_model") From 641b4c0fe6db944ffe47a3dbd8a88c7a966c41f1 Mon Sep 17 00:00:00 2001 From: typhoonzero Date: Fri, 29 Dec 2017 10:49:28 +0800 Subject: [PATCH 126/181] wip --- paddle/operators/adagrad_op.cc | 44 ++------ paddle/operators/adagrad_op.cu | 48 ++------ paddle/operators/adam_op.h | 17 ++- .../operators/math/selected_rows_functor.cc | 90 +++++++++++++-- .../operators/math/selected_rows_functor.cu | 106 ++++++++++++++++-- paddle/operators/math/selected_rows_functor.h | 74 ++++++++---- python/paddle/v2/fluid/tests/test_adam_op.py | 1 - 7 files changed, 251 insertions(+), 129 deletions(-) diff --git a/paddle/operators/adagrad_op.cc b/paddle/operators/adagrad_op.cc index 052c793a01..c83318a272 100644 --- a/paddle/operators/adagrad_op.cc +++ b/paddle/operators/adagrad_op.cc @@ -105,48 +105,18 @@ struct SparseAdagradFunctor { const framework::Tensor& learning_rate, T epsilon, framework::Tensor* moment, framework::Tensor* param) { // 1. 
g_m.rows = set(g.rows) - auto grad_rows = grad.rows(); - std::set row_set(grad_rows.begin(), grad_rows.end()); - std::vector merge_rows(row_set.begin(), row_set.end()); - auto grad_width = grad.value().dims()[1]; - std::unique_ptr grad_merge{ - new framework::SelectedRows()}; - grad_merge->set_rows(merge_rows); - grad_merge->set_height(grad.height()); - grad_merge->mutable_value()->mutable_data( - framework::make_ddim( - {static_cast(merge_rows.size()), grad_width}), - context.GetPlace()); - - math::SetConstant constant_functor; - constant_functor(context, grad_merge->mutable_value(), 0.0); - - auto* grad_merge_data = grad_merge->mutable_value()->data(); - auto* grad_data = grad.value().data(); - - for (size_t i = 0; i < grad_rows.size(); i++) { - size_t grad_merge_i = FindPos(merge_rows, grad_rows[i]); - for (int64_t j = 0; j < grad_width; j++) { - grad_merge_data[grad_merge_i * grad_width + j] += - grad_data[i * grad_width + j]; - } - } + math::scatter::MergeAdd merge_func; + auto grad_merge = merge_func(context, grad); + auto& merge_rows = grad_merge.rows(); + auto* grad_merge_data = grad_merge.mutable_value()->template data(); // 2. m += g_m * g_m - std::unique_ptr grad_square{ - new framework::SelectedRows()}; - grad_square->set_rows(grad_merge->rows()); - grad_square->set_height(grad_merge->height()); - grad_square->mutable_value()->mutable_data(grad_merge->value().dims(), - context.GetPlace()); - auto gs = - framework::EigenVector::Flatten(*(grad_square->mutable_value())); - auto gm = framework::EigenVector::Flatten(grad_merge->value()); - gs.device(*context.eigen_device()) = gm * gm; + math::scatter::Mul sqare_func; + auto grad_square = sqare_func(context, grad_merge, grad_merge); math::SelectedRowsAddToTensor functor; - functor(context, *grad_square, moment); + functor(context, grad_square, moment); // 3. update parameter auto* lr = learning_rate.data(); diff --git a/paddle/operators/adagrad_op.cu b/paddle/operators/adagrad_op.cu index 585b2d9289..86b3dd860d 100644 --- a/paddle/operators/adagrad_op.cu +++ b/paddle/operators/adagrad_op.cu @@ -78,51 +78,17 @@ struct SparseAdagradFunctor { const framework::Tensor& learning_rate, T epsilon, framework::Tensor* moment, framework::Tensor* param) { // 1. g_m.rows = set(g.rows) - auto grad_rows = grad.rows(); - std::set row_set(grad_rows.begin(), grad_rows.end()); - std::vector merge_rows(row_set.begin(), row_set.end()); - auto grad_width = grad.value().dims()[1]; - std::unique_ptr grad_merge{ - new framework::SelectedRows()}; - grad_merge->set_rows(merge_rows); - grad_merge->set_height(grad.height()); - grad_merge->mutable_value()->mutable_data( - framework::make_ddim( - {static_cast(merge_rows.size()), grad_width}), - context.GetPlace()); - - math::SetConstant constant_functor; - constant_functor(context, grad_merge->mutable_value(), 0.0); - - auto* grad_merge_data = grad_merge->mutable_value()->data(); - auto* grad_data = grad.value().data(); - - const int block_size = 256; - dim3 threads(block_size, 1); - dim3 grid1(1, grad_rows.size()); - - MergeGradKernel< - T, 256><<(context) - .stream()>>>(grad_data, grad.rows().data(), - grad_merge_data, grad_merge->rows().data(), - grad_merge->rows().size(), grad_width); - + math::scatter::MergeAdd merge_func; + auto grad_merge = merge_func(context, grad); + auto* grad_merge_data = grad_merge.mutable_value()->template data(); + auto& merge_rows = grad_merge.rows; // 2. 
m += g_m * g_m - std::unique_ptr grad_square{ - new framework::SelectedRows()}; - grad_square->set_rows(grad_merge->rows()); - grad_square->set_height(grad_merge->height()); - grad_square->mutable_value()->mutable_data(grad_merge->value().dims(), - context.GetPlace()); - auto gs = - framework::EigenVector::Flatten(*(grad_square->mutable_value())); - auto gm = framework::EigenVector::Flatten(grad_merge->value()); - gs.device(*context.eigen_device()) = gm * gm; + math::scatter::Mul sqare_func; + auto grad_square = sqare_func(context, grad_merge, grad_merge); math::SelectedRowsAddToTensor functor; - functor(context, *grad_square, moment); + functor(context, grad_square, moment); // 3. update parameter auto* lr = learning_rate.data(); diff --git a/paddle/operators/adam_op.h b/paddle/operators/adam_op.h index 5facd0112f..3c4148ccc0 100644 --- a/paddle/operators/adam_op.h +++ b/paddle/operators/adam_op.h @@ -16,11 +16,14 @@ limitations under the License. */ #include // for sqrt in CPU and CUDA #include "paddle/framework/op_registry.h" #include "paddle/operators/detail/safe_ref.h" +#include "paddle/operators/math/selected_rows_functor.h" #include "paddle/platform/for_range.h" namespace paddle { namespace operators { +namespace scatter = paddle::operators::math::scatter; + template struct AdamFunctor { T beta1_; @@ -134,8 +137,6 @@ struct SparseAdamFunctor { mom1 = beta1_ * mom1 + (1 - beta1_) * g; mom2 = beta2_ * mom2 + (1 - beta2_) * g * g; p -= lr * (mom1 / (sqrt(mom2) + epsilon_)); - // IMPORTANT: - // FIXME(typhoonzero): row id may be duplicate moment1_out_[rows_[i] * row_numel_ + j] = mom1; moment2_out_[rows_[i] * row_numel_ + j] = mom2; param_out_[rows_[i] * row_numel_ + j] = p; @@ -191,10 +192,14 @@ class AdamOpKernel : public framework::OpKernel { } else if (grad_var->IsType()) { auto& grad = Ref(ctx.Input("Grad"), "Must set Grad"); - auto& grad_tensor = grad.value(); + // merge duplicated rows if any. + scatter::MergeAdd merge_func; + auto grad_merge = + merge_func(ctx.template device_context(), grad); + auto& grad_tensor = grad_merge.value(); const T* grad_data = grad_tensor.template data(); - auto* rows = grad.rows().data(); - auto row_numel = grad_tensor.numel() / grad.rows().size(); + auto* rows = grad_merge.rows().data(); + auto row_numel = grad_tensor.numel() / grad_merge.rows().size(); SparseAdamFunctor functor( beta1, beta2, epsilon, beta1_pow.template data(), @@ -206,7 +211,7 @@ class AdamOpKernel : public framework::OpKernel { param_out.template mutable_data(ctx.GetPlace()), rows, row_numel); platform::ForRange for_range( static_cast(ctx.device_context()), - grad.rows().size()); + grad_merge.rows().size()); for_range(functor); } else { PADDLE_THROW("Variable type not supported by adam_op"); diff --git a/paddle/operators/math/selected_rows_functor.cc b/paddle/operators/math/selected_rows_functor.cc index 21418ba4b0..c9f3c10c61 100644 --- a/paddle/operators/math/selected_rows_functor.cc +++ b/paddle/operators/math/selected_rows_functor.cc @@ -12,8 +12,10 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ -#include "paddle/operators/math/selected_rows_functor.h" +#include + #include "paddle/operators/math/math_function.h" +#include "paddle/operators/math/selected_rows_functor.h" namespace paddle { namespace operators { @@ -193,27 +195,25 @@ size_t FindPos(const std::vector& rows, int64_t value) { template struct MergeAdd { - void operator()(const platform::CPUDeviceContext& context, - const framework::SelectedRows& input, - framework::SelectedRows* out) { + framework::SelectedRows operator()(const platform::CPUDeviceContext& context, + const framework::SelectedRows& input) { + framework::SelectedRows out; auto input_rows = input.rows(); std::set row_set(input_rows.begin(), input_rows.end()); std::vector merge_rows(row_set.begin(), row_set.end()); auto input_width = input.value().dims()[1]; - // std::unique_ptr out{ - // new framework::SelectedRows()}; - out->set_rows(merge_rows); - out->set_height(input.height()); - out->mutable_value()->mutable_data( + out.set_rows(merge_rows); + out.set_height(input.height()); + out.mutable_value()->mutable_data( framework::make_ddim( {static_cast(merge_rows.size()), input_width}), context.GetPlace()); math::SetConstant constant_functor; - constant_functor(context, out->mutable_value(), 0.0); + constant_functor(context, out.mutable_value(), 0.0); - auto* out_data = out->mutable_value()->data(); + auto* out_data = out.mutable_value()->data(); auto* input_data = input.value().data(); for (size_t i = 0; i < input_rows.size(); i++) { @@ -222,6 +222,74 @@ struct MergeAdd { out_data[out_i * input_width + j] += input_data[i * input_width + j]; } } + return out; + } +}; + +template struct MergeAdd; +template struct MergeAdd; +template struct MergeAdd; +template struct MergeAdd; + +template +struct UpdateToTensor { + framework::Tensor operator()(const platform::CPUDeviceContext& context, + const ScatterOps& op, + const framework::SelectedRows& input1, + framework::Tensor* input2) { + auto in1_height = input1.height(); + auto in2_dims = input2->dims(); + PADDLE_ENFORCE_EQ(in1_height, in2_dims[0]); + + auto& in1_value = input1.value(); + auto& in1_rows = input1.rows(); + + int64_t in1_row_numel = in1_value.numel() / in1_rows.size(); + PADDLE_ENFORCE_EQ(in1_row_numel, input2->numel() / in1_height); + + auto* in1_data = in1_value.data(); + auto* input2_data = input2->data(); + + // FIXME(typhoonzero): use macro fix the below messy code. 
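+    // A macro along these lines could collapse the element-wise cases below
+    // (hypothetical sketch, reusing the INLINE_FOR2 helper from the header):
+    //   #define SCATTER_ELEMWISE(OPEQ)                       \
+    //     INLINE_FOR2(in1_rows.size(), in1_row_numel)        \
+    //     input2_data[in1_rows[i] * in1_row_numel + j] OPEQ  \
+    //         in1_data[i * in1_row_numel + j];
+    // e.g. SCATTER_ELEMWISE(=) for ASSIGN and SCATTER_ELEMWISE(+=) for ADD;
+    // SUBBY and DIVBY reverse the operands, so they keep their own cases.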
+ switch (op) { + case ScatterOps::ASSIGN: + INLINE_FOR2(in1_rows.size(), in1_row_numel) + input2_data[in1_rows[i] * in1_row_numel + j] = + in1_data[i * in1_row_numel + j]; + break; + case ScatterOps::ADD: + INLINE_FOR2(in1_rows.size(), in1_row_numel) + input2_data[in1_rows[i] * in1_row_numel + j] += + in1_data[i * in1_row_numel + j]; + break; + case ScatterOps::SUB: + INLINE_FOR2(in1_rows.size(), in1_row_numel) + input2_data[in1_rows[i] * in1_row_numel + j] -= + in1_data[i * in1_row_numel + j]; + break; + case ScatterOps::SUBBY: + INLINE_FOR2(in1_rows.size(), in1_row_numel) + input2_data[in1_rows[i] * in1_row_numel + j] = + in1_data[i * in1_row_numel + j] - + input2_data[in1_rows[i] * in1_row_numel + j]; + break; + case ScatterOps::MUL: + INLINE_FOR2(in1_rows.size(), in1_row_numel) + input2_data[in1_rows[i] * in1_row_numel + j] *= + in1_data[i * in1_row_numel + j]; + break; + case ScatterOps::DIV: + INLINE_FOR2(in1_rows.size(), in1_row_numel) + input2_data[in1_rows[i] * in1_row_numel + j] /= + in1_data[i * in1_row_numel + j]; + break; + case ScatterOps::DIVBY: + INLINE_FOR2(in1_rows.size(), in1_row_numel) + input2_data[in1_rows[i] * in1_row_numel + j] = + in1_data[i * in1_row_numel + j] / + input2_data[in1_rows[i] * in1_row_numel + j]; + break; + } } }; diff --git a/paddle/operators/math/selected_rows_functor.cu b/paddle/operators/math/selected_rows_functor.cu index b2c0fe7bc3..48413403db 100644 --- a/paddle/operators/math/selected_rows_functor.cu +++ b/paddle/operators/math/selected_rows_functor.cu @@ -252,27 +252,26 @@ __global__ void MergeAddKernel(const T* input, const int64_t* input_rows, template struct MergeAdd { - void operator()(const platform::GPUDeviceContext& context, - const framework::SelectedRows& input, - framework::SelectedRows* out) { + framework::SelectedRows operator()(const platform::GPUDeviceContext& context, + const framework::SelectedRows& input) { + framework::SelectedRows out; auto input_rows = input.rows(); std::set row_set(input_rows.begin(), input_rows.end()); std::vector merge_rows(row_set.begin(), row_set.end()); auto input_width = input.value().dims()[1]; - // std::unique_ptr out{ - // new framework::SelectedRows()}; - out->set_rows(merge_rows); - out->set_height(input.height()); - out->mutable_value()->mutable_data( + + out.set_rows(merge_rows); + out.set_height(input.height()); + out.mutable_value()->mutable_data( framework::make_ddim( {static_cast(merge_rows.size()), input_width}), context.GetPlace()); math::SetConstant constant_functor; - constant_functor(context, out->mutable_value(), 0.0); + constant_functor(context, out.mutable_value(), 0.0); - auto* out_data = out->mutable_value()->data(); + auto* out_data = out.mutable_value()->data(); auto* input_data = input.value().data(); const int block_size = 256; @@ -283,11 +282,96 @@ struct MergeAdd { T, 256><<(context) .stream()>>>(input_data, input.rows().data(), out_data, - out->rows().data(), out->rows().size(), + out.rows().data(), out.rows().size(), input_width); + return out; } }; +template struct MergeAdd; +template struct MergeAdd; +template struct MergeAdd; +template struct MergeAdd; + +template +__global__ void UpdateToTensorKernel(const T* selected_rows, + const int64_t* rows, const ScatterOps& op, + T* tensor_out, int64_t row_numel) { + const int ty = blockIdx.y; + int tid = threadIdx.x; + + selected_rows += ty * row_numel; + tensor_out += rows[ty] * row_numel; + // FIXME(typhoonzero): use macro fix the below messy code. 
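+  // Note: `op` is a kernel argument, hence uniform across every thread of
+  // the launch, so this switch causes no warp divergence; all threads in a
+  // warp take the same case.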
+ switch (op) { + case ScatterOps::ASSIGN: + for (int index = tid; index < row_numel; index += block_size) { + tensor_out[index] = selected_rows[index]; + } + break; + case ScatterOps::ADD: + for (int index = tid; index < row_numel; index += block_size) { + tensor_out[index] += selected_rows[index]; + } + break; + case ScatterOps::SUB: + for (int index = tid; index < row_numel; index += block_size) { + tensor_out[index] -= selected_rows[index]; + } + break; + case ScatterOps::SUBBY: + for (int index = tid; index < row_numel; index += block_size) { + tensor_out[index] = selected_rows[index] - tensor_out[index]; + } + break; + case ScatterOps::MUL: + for (int index = tid; index < row_numel; index += block_size) { + tensor_out[index] *= selected_rows[index]; + } + break; + case ScatterOps::DIV: + for (int index = tid; index < row_numel; index += block_size) { + tensor_out[index] /= selected_rows[index]; + } + break; + case ScatterOps::DIVBY: + for (int index = tid; index < row_numel; index += block_size) { + tensor_out[index] = selected_rows[index] / tensor_out[index]; + } + break; + } +} + +template +struct UpdateToTensor { + framework::Tensor operator()(const platform::GPUDeviceContext& context, + const ScatterOps& op, + const framework::SelectedRows& input1, + framework::Tensor* input2) { + // NOTE: Use SelectedRowsAddToTensor for better performance + // no additional MergeAdd called. + auto merged_in1 = MergeAdd()(context, input1); + + auto in1_height = merged_in1.height(); + auto in2_dims = input2->dims(); + PADDLE_ENFORCE_EQ(in1_height, in2_dims[0]); + + auto& in1_value = merged_in1.value(); + auto& in1_rows = merged_in1.rows(); + + int64_t in1_row_numel = in1_value.numel() / in1_rows.size(); + PADDLE_ENFORCE_EQ(in1_row_numel, input2->numel() / in1_height); + + auto* in1_data = in1_value.data(); + auto* input2_data = input2->data(); + + dim3 threads(PADDLE_CUDA_NUM_THREADS, 1); + dim3 grid(1, in1_rows.size()); + UpdateToTensorKernel< + T, PADDLE_CUDA_NUM_THREADS><<>>( + in1_data, in1_rows.data(), op, in2_data, in1_row_numel); + } +}; } // namespace scatter } // namespace math } // namespace operators diff --git a/paddle/operators/math/selected_rows_functor.h b/paddle/operators/math/selected_rows_functor.h index eecd5e5362..d4bef72980 100644 --- a/paddle/operators/math/selected_rows_functor.h +++ b/paddle/operators/math/selected_rows_functor.h @@ -16,6 +16,10 @@ limitations under the License. */ #include "paddle/framework/selected_rows.h" #include "paddle/platform/device_context.h" +#define INLINE_FOR2(sizei, sizej) \ + for (int64_t i = 0; i < sizei; i++) \ + for (int64_t j = 0; j < sizej; j++) + namespace paddle { namespace operators { namespace math { @@ -55,50 +59,76 @@ struct SelectedRowsAddToTensor { namespace scatter { // functors for manuplating SelectedRows data - template struct MergeAdd { // unary functor, merge by adding duplicated rows in // the input SelectedRows object. 
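  // For example, input rows [0, 4, 0] with values [[1.0], [2.0], [3.0]]
  // merge into rows [0, 4] with values [[4.0], [2.0]]: duplicates of row 0
  // are summed, and std::set leaves the surviving row ids sorted.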
- void operator()(const DeviceContext& context, - const framework::SelectedRows& input, - framework::SelectedRows* out); + framework::SelectedRows operator()(const DeviceContext& context, + const framework::SelectedRows& input); }; template struct Add { - void operator()(const DeviceContext& context, - const framework::SelectedRows& input1, - const framework::SelectedRows& input2, - framework::SelectedRows* out) { - out->set_rows(input1.rows()); - out->set_height(input1.height()); - out->mutable_value()->mutable_data(input1.value().dims(), - context.GetPlace()); - auto e_out = framework::EigenVector::Flatten(*(out->mutable_value())); + framework::SelectedRows operator()(const DeviceContext& context, + const framework::SelectedRows& input1, + const framework::SelectedRows& input2) { + framework::SelectedRows out; + out.set_rows(input1.rows()); + out.set_height(input1.height()); + out.mutable_value()->mutable_data(input1.value().dims(), + context.GetPlace()); + auto e_out = framework::EigenVector::Flatten(*(out.mutable_value())); auto e_in1 = framework::EigenVector::Flatten(input1.value()); auto e_in2 = framework::EigenVector::Flatten(input2.value()); e_out.device(*context.eigen_device()) = e_in1 + e_in2; + return out; } }; template struct Mul { - void operator()(const DeviceContext& context, - const framework::SelectedRows& input1, - const framework::SelectedRows& input2, - framework::SelectedRows* out) { - out->set_rows(input1.rows()); - out->set_height(input1.height()); - out->mutable_value()->mutable_data(input1.value().dims(), - context.GetPlace()); - auto e_out = framework::EigenVector::Flatten(*(out->mutable_value())); + // multiply two SelectedRows + framework::SelectedRows operator()(const DeviceContext& context, + const framework::SelectedRows& input1, + const framework::SelectedRows& input2) { + framework::SelectedRows out; + out.set_rows(input1.rows()); + out.set_height(input1.height()); + out.mutable_value()->mutable_data(input1.value().dims(), + context.GetPlace()); + auto e_out = framework::EigenVector::Flatten(*(out.mutable_value())); auto e_in1 = framework::EigenVector::Flatten(input1.value()); auto e_in2 = framework::EigenVector::Flatten(input2.value()); e_out.device(*context.eigen_device()) = e_in1 * e_in2; + return out; + } + // multiply scalar to SelectedRows + framework::SelectedRows operator()(const DeviceContext& context, + const framework::SelectedRows& input1, + const T input2) { + framework::SelectedRows out; + out.set_rows(input1.rows()); + out.set_height(input1.height()); + out.mutable_value()->mutable_data(input1.value().dims(), + context.GetPlace()); + auto e_out = framework::EigenVector::Flatten(*(out.mutable_value())); + auto e_in1 = framework::EigenVector::Flatten(input1.value()); + e_out.device(*context.eigen_device()) = input2 * e_in1; + return out; } }; +enum class ScatterOps { ASSIGN, ADD, SUB, SUBBY, MUL, DIV, DIVBY }; + +// out = seleted_rows_in / tensor +template +struct UpdateToTensor { + framework::Tensor operator()(const DeviceContext& context, + const ScatterOps& op, + const framework::SelectedRows& input1, + framework::Tensor* input2); +}; + } // namespace scatter } // namespace math } // namespace operators diff --git a/python/paddle/v2/fluid/tests/test_adam_op.py b/python/paddle/v2/fluid/tests/test_adam_op.py index 3758ca457e..7dbc2fa085 100644 --- a/python/paddle/v2/fluid/tests/test_adam_op.py +++ b/python/paddle/v2/fluid/tests/test_adam_op.py @@ -285,7 +285,6 @@ class TestSparseAdamOp(unittest.TestCase): j = 0 while j < self.row_numel: pos = 
row_id * self.row_numel + j - print(actual[pos] - np_array[pos]) / actual[pos] self.assertLess((actual[pos] - np_array[pos]) / actual[pos], 0.00001) j += 1 From d630d3921452b3f92dd358caaf03fa7d33942627 Mon Sep 17 00:00:00 2001 From: tensor-tang Date: Fri, 29 Dec 2017 12:18:23 +0800 Subject: [PATCH 127/181] auto set openblas env when inference and remove unused env for openblas --- benchmark/paddle/image/run_openblas_infer.sh | 16 ++++++++++------ benchmark/paddle/image/run_openblas_train.sh | 1 - 2 files changed, 10 insertions(+), 7 deletions(-) diff --git a/benchmark/paddle/image/run_openblas_infer.sh b/benchmark/paddle/image/run_openblas_infer.sh index da034f3b9d..71a49231a5 100755 --- a/benchmark/paddle/image/run_openblas_infer.sh +++ b/benchmark/paddle/image/run_openblas_infer.sh @@ -8,15 +8,19 @@ function clock_to_seconds() { } function infer() { - unset OMP_NUM_THREADS MKL_NUM_THREADS OMP_DYNAMIC KMP_AFFINITY topology=$1 layer_num=$2 bs=$3 - thread=`nproc` - if [ $thread -gt $bs ]; then - thread=$bs + trainers=`nproc` + if [ $trainers -gt $bs ]; then + trainers=$bs fi - log="logs/infer-${topology}-${layer_num}-${thread}openblas-${bs}.log" + log="logs/infer-${topology}-${layer_num}-${trainers}openblas-${bs}.log" + threads=$((`nproc` / trainers)) + if [ $threads -eq 0 ]; then + threads=1 + fi + export OPENBLAS_NUM_THREADS=$threads models_in="models/${topology}-${layer_num}/pass-00000/" if [ ! -d $models_in ]; then @@ -28,7 +32,7 @@ function infer() { --config="${topology}.py" \ --use_mkldnn=False \ --use_gpu=False \ - --trainer_count=$thread \ + --trainer_count=$trainers \ --log_period=$log_period \ --config_args="batch_size=${bs},layer_num=${layer_num},is_infer=True,num_samples=256" \ --init_model_path=$models_in \ diff --git a/benchmark/paddle/image/run_openblas_train.sh b/benchmark/paddle/image/run_openblas_train.sh index d82c8384e0..935cff6f2c 100755 --- a/benchmark/paddle/image/run_openblas_train.sh +++ b/benchmark/paddle/image/run_openblas_train.sh @@ -1,7 +1,6 @@ set -e function train() { - unset OMP_NUM_THREADS MKL_NUM_THREADS OMP_DYNAMIC KMP_AFFINITY export OPENBLAS_NUM_THREADS=1 topology=$1 layer_num=$2 From 5139e6c740f9829234de3cc4ed5a3fcd56e2331c Mon Sep 17 00:00:00 2001 From: Yang Yu Date: Fri, 29 Dec 2017 12:57:57 +0800 Subject: [PATCH 128/181] Follow comments --- paddle/framework/executor.cc | 6 +++--- paddle/framework/tensor_util.h | 4 ++-- paddle/framework/tensor_util_test.cc | 4 ++-- 3 files changed, 7 insertions(+), 7 deletions(-) diff --git a/paddle/framework/executor.cc b/paddle/framework/executor.cc index de4d3395eb..bf1f0471cc 100644 --- a/paddle/framework/executor.cc +++ b/paddle/framework/executor.cc @@ -59,11 +59,11 @@ static void CreateTensor(Variable* var, proto::VarDesc::VarType var_type) { static void CheckTensorNANOrInf(const std::string& name, const framework::Tensor& tensor) { - if (tensor.type().hash_code() != typeid(float).hash_code() && - tensor.type().hash_code() != typeid(double).hash_code()) { + if (tensor.memory_size() == 0) { return; } - if (tensor.memory_size() == 0) { + if (tensor.type().hash_code() != typeid(float).hash_code() && + tensor.type().hash_code() != typeid(double).hash_code()) { return; } PADDLE_ENFORCE(!framework::HasInf(tensor), "Tensor %s has Inf", name); diff --git a/paddle/framework/tensor_util.h b/paddle/framework/tensor_util.h index a86fab2925..6a21f8db1e 100644 --- a/paddle/framework/tensor_util.h +++ b/paddle/framework/tensor_util.h @@ -210,10 +210,10 @@ inline void CopyToVector(const Tensor& src, std::vector* dst) { } // 
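+// (Both predicates are backed by Eigen reductions in tensor_util.cc: HasNAN
+// uses isnan(), i.e. IEEE-754 values that compare unequal to themselves,
+// and HasInf uses isinf().)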
Returns true if a tensor contains NAN, i.e., Not A Number. -extern bool HasNAN(const framework::Tensor& tensor); +bool HasNAN(const framework::Tensor& tensor); // Returns true if a tensor contains Inf, i.e., Infinity. -extern bool HasInf(const framework::Tensor& tensor); +bool HasInf(const framework::Tensor& tensor); inline void SerializeToStream(std::ostream& os, const Tensor& tensor, const platform::DeviceContext& dev_ctx) { diff --git a/paddle/framework/tensor_util_test.cc b/paddle/framework/tensor_util_test.cc index f00ce79548..0dc5166fca 100644 --- a/paddle/framework/tensor_util_test.cc +++ b/paddle/framework/tensor_util_test.cc @@ -231,7 +231,7 @@ TEST(CopyToVector, Tensor) { #endif } -TEST(IsNAN, CPU) { +TEST(HasNAN, CPU) { using namespace paddle::framework; using namespace paddle::platform; Tensor src; @@ -243,7 +243,7 @@ TEST(IsNAN, CPU) { ASSERT_TRUE(HasNAN(src)); } -TEST(IsInf, CPU) { +TEST(HasInf, CPU) { using namespace paddle::framework; using namespace paddle::platform; Tensor src; From 0fd4a04abdc6f411ebb77d7a389108e951223c7e Mon Sep 17 00:00:00 2001 From: Yang Yu Date: Fri, 29 Dec 2017 13:10:53 +0800 Subject: [PATCH 129/181] Remove debug codes --- paddle/framework/tensor_impl.h | 13 ++----------- paddle/operators/fill_constant_op.cc | 1 - paddle/operators/shrink_rnn_memory_op.cc | 1 - paddle/operators/while_op.cc | 23 ----------------------- 4 files changed, 2 insertions(+), 36 deletions(-) diff --git a/paddle/framework/tensor_impl.h b/paddle/framework/tensor_impl.h index 0161ed8c47..6c6f298edc 100644 --- a/paddle/framework/tensor_impl.h +++ b/paddle/framework/tensor_impl.h @@ -134,17 +134,8 @@ inline void* Tensor::mutable_data(platform::Place place, std::type_index type) { #endif offset_ = 0; } - void* buf = reinterpret_cast( - reinterpret_cast(holder_->ptr()) + offset_); - if (type.hash_code() == typeid(float).hash_code() || - type.hash_code() == typeid(double).hash_code()) { - float* tmp = (float*)(buf); - for (int64_t i = 0; i < numel(); ++i) { - tmp[i] = NAN; - } - } - - return buf; + return reinterpret_cast(reinterpret_cast(holder_->ptr()) + + offset_); } inline void* Tensor::mutable_data(platform::Place place) { diff --git a/paddle/operators/fill_constant_op.cc b/paddle/operators/fill_constant_op.cc index 196c380c73..dcd43a30c8 100644 --- a/paddle/operators/fill_constant_op.cc +++ b/paddle/operators/fill_constant_op.cc @@ -51,7 +51,6 @@ class FillConstantOp : public framework::OperatorBase { platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance(); auto &dev_ctx = *pool.Get(dev_place); - VLOG(10) << "FillConstant to " << &out; math::set_constant(dev_ctx, &out, value); } }; diff --git a/paddle/operators/shrink_rnn_memory_op.cc b/paddle/operators/shrink_rnn_memory_op.cc index 9ef473e726..b37269b471 100644 --- a/paddle/operators/shrink_rnn_memory_op.cc +++ b/paddle/operators/shrink_rnn_memory_op.cc @@ -116,7 +116,6 @@ class ShrinkRNNMemoryGradOp : public ArrayOp { auto height = dout_tensor.dims()[0]; auto slice = dx_tensor.Slice(0, static_cast(height)); framework::CopyFrom(dout_tensor, dout_tensor.place(), dev_ctx, &slice); - VLOG(10) << dx_tensor.dims()[0] << ", " << height; if (dx_tensor.dims()[0] > height) { auto rest_tensor = dx_tensor.Slice( static_cast(height), static_cast(dx_tensor.dims()[0])); diff --git a/paddle/operators/while_op.cc b/paddle/operators/while_op.cc index 341c163aa1..728ef60794 100644 --- a/paddle/operators/while_op.cc +++ b/paddle/operators/while_op.cc @@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either 
express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include #include #include "paddle/framework/executor.h" #include "paddle/framework/lod_tensor_array.h" @@ -195,36 +194,14 @@ class WhileGradOp : public framework::OperatorBase { } } - auto check_var_no_nan = [](const framework::Scope &scope, - const std::string &var_name) { - auto *var = scope.FindVar(var_name); - if (var->IsType()) { - VLOG(10) << "Checking " << var_name; - PADDLE_ENFORCE(!framework::HasNAN(var->Get()), - "%s has NAN", var_name); - if (var->Get().type() == - typeid(float)) { // NOLINT - auto &tensor = var->Get(); - auto *buf = tensor.data(); - for (int64_t i = 0; i < tensor.numel(); ++i) { - PADDLE_ENFORCE(!std::isnan(buf[i])); - } - VLOG(10) << buf[0]; - } - } - }; - check_var_no_nan(cur_scope, inside_grad_name); auto new_inside_name = cur_scope.Rename(inside_grad_name); - check_var_no_nan(cur_scope, new_inside_name); auto sum_op = framework::OpRegistry::CreateOp( "sum", {{"X", {pg_names[param_id], new_inside_name}}}, {{"Out", {pg_names[param_id]}}}, framework::AttributeMap{}); sum_op->Run(cur_scope, dev_place); - check_var_no_nan(scope, pg_names[param_id]); cur_scope.Rename(new_inside_name, inside_grad_name); } } - VLOG(1) << "Complete WhileOpGrad"; } }; From fcd84c15303cac9573432a6ce4516c2d643064e8 Mon Sep 17 00:00:00 2001 From: Yang Yu Date: Fri, 29 Dec 2017 13:14:31 +0800 Subject: [PATCH 130/181] Comment debug code --- python/paddle/v2/fluid/tests/test_dynrnn_gradient_check.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/python/paddle/v2/fluid/tests/test_dynrnn_gradient_check.py b/python/paddle/v2/fluid/tests/test_dynrnn_gradient_check.py index 6569ccb9e6..c02c59284e 100644 --- a/python/paddle/v2/fluid/tests/test_dynrnn_gradient_check.py +++ b/python/paddle/v2/fluid/tests/test_dynrnn_gradient_check.py @@ -294,7 +294,8 @@ class TestSimpleMulWithMemory(unittest.TestCase): assert isinstance(Out, Output) Out.out(o) - @many_times(10) + # many_times used locally for debug. Make sure the calculation is stable. + # @many_times(10) @prog_scope() def test_forward_backward(self): py_rnn = TestSimpleMulWithMemory.SimpleMulWithMemory() From 1039c1e3b7b391963fe2e4f1dba22d3358104a98 Mon Sep 17 00:00:00 2001 From: typhoonzero Date: Fri, 29 Dec 2017 13:51:41 +0800 Subject: [PATCH 131/181] scatter optimizers --- paddle/operators/adagrad_op.cu | 10 +++-- .../operators/math/selected_rows_functor.cc | 7 ++-- .../operators/math/selected_rows_functor.cu | 38 ++++++++++--------- paddle/operators/math/selected_rows_functor.h | 7 ++-- 4 files changed, 32 insertions(+), 30 deletions(-) diff --git a/paddle/operators/adagrad_op.cu b/paddle/operators/adagrad_op.cu index 86b3dd860d..fed2e29367 100644 --- a/paddle/operators/adagrad_op.cu +++ b/paddle/operators/adagrad_op.cu @@ -79,12 +79,12 @@ struct SparseAdagradFunctor { framework::Tensor* moment, framework::Tensor* param) { // 1. g_m.rows = set(g.rows) auto grad_width = grad.value().dims()[1]; - math::scatter::MergeAdd merge_func; + math::scatter::MergeAdd merge_func; auto grad_merge = merge_func(context, grad); auto* grad_merge_data = grad_merge.mutable_value()->template data(); - auto& merge_rows = grad_merge.rows; + auto& merge_rows = grad_merge.rows(); // 2. 
m += g_m * g_m - math::scatter::Mul sqare_func; + math::scatter::Mul sqare_func; auto grad_square = sqare_func(context, grad_merge, grad_merge); math::SelectedRowsAddToTensor functor; @@ -95,11 +95,13 @@ struct SparseAdagradFunctor { auto* param_data = param->data(); auto* moment_data = moment->data(); + const int block_size = 256; + dim3 threads(block_size, 1); dim3 grid2(1, merge_rows.size()); SparseAdagradFunctorKernel< T, 256><<(context) - .stream()>>>(grad_merge_data, grad_merge->rows().data(), + .stream()>>>(grad_merge_data, grad_merge.rows().data(), lr, param_data, moment_data, grad_width, epsilon); } diff --git a/paddle/operators/math/selected_rows_functor.cc b/paddle/operators/math/selected_rows_functor.cc index c9f3c10c61..8a1ebb58c2 100644 --- a/paddle/operators/math/selected_rows_functor.cc +++ b/paddle/operators/math/selected_rows_functor.cc @@ -233,10 +233,9 @@ template struct MergeAdd; template struct UpdateToTensor { - framework::Tensor operator()(const platform::CPUDeviceContext& context, - const ScatterOps& op, - const framework::SelectedRows& input1, - framework::Tensor* input2) { + void operator()(const platform::CPUDeviceContext& context, + const ScatterOps& op, const framework::SelectedRows& input1, + framework::Tensor* input2) { auto in1_height = input1.height(); auto in2_dims = input2->dims(); PADDLE_ENFORCE_EQ(in1_height, in2_dims[0]); diff --git a/paddle/operators/math/selected_rows_functor.cu b/paddle/operators/math/selected_rows_functor.cu index 48413403db..0ee456f9bc 100644 --- a/paddle/operators/math/selected_rows_functor.cu +++ b/paddle/operators/math/selected_rows_functor.cu @@ -12,6 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ +#include + #include "paddle/operators/math/math_function.h" #include "paddle/operators/math/selected_rows_functor.h" #include "paddle/platform/cuda_helper.h" @@ -251,8 +253,8 @@ __global__ void MergeAddKernel(const T* input, const int64_t* input_rows, } template -struct MergeAdd { - framework::SelectedRows operator()(const platform::GPUDeviceContext& context, +struct MergeAdd { + framework::SelectedRows operator()(const platform::CUDADeviceContext& context, const framework::SelectedRows& input) { framework::SelectedRows out; auto input_rows = input.rows(); @@ -288,10 +290,10 @@ struct MergeAdd { } }; -template struct MergeAdd; -template struct MergeAdd; -template struct MergeAdd; -template struct MergeAdd; +template struct MergeAdd; +template struct MergeAdd; +template struct MergeAdd; +template struct MergeAdd; template __global__ void UpdateToTensorKernel(const T* selected_rows, @@ -343,14 +345,14 @@ __global__ void UpdateToTensorKernel(const T* selected_rows, } template -struct UpdateToTensor { - framework::Tensor operator()(const platform::GPUDeviceContext& context, - const ScatterOps& op, - const framework::SelectedRows& input1, - framework::Tensor* input2) { +struct UpdateToTensor { + void operator()(const platform::CUDADeviceContext& context, + const ScatterOps& op, const framework::SelectedRows& input1, + framework::Tensor* input2) { // NOTE: Use SelectedRowsAddToTensor for better performance // no additional MergeAdd called. 
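    // (The fast path hinted at above would apply to the plain ADD case,
    // where SelectedRowsAddToTensor scatter-adds without merging first; the
    // generic path below merges duplicated rows, then updates element-wise.)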
- auto merged_in1 = MergeAdd()(context, input1); + MergeAdd merge_func; + auto merged_in1 = merge_func(context, input1); auto in1_height = merged_in1.height(); auto in2_dims = input2->dims(); @@ -362,14 +364,14 @@ struct UpdateToTensor { int64_t in1_row_numel = in1_value.numel() / in1_rows.size(); PADDLE_ENFORCE_EQ(in1_row_numel, input2->numel() / in1_height); - auto* in1_data = in1_value.data(); - auto* input2_data = input2->data(); + auto* in1_data = in1_value.template data(); + auto* in2_data = input2->data(); - dim3 threads(PADDLE_CUDA_NUM_THREADS, 1); + dim3 threads(platform::PADDLE_CUDA_NUM_THREADS, 1); dim3 grid(1, in1_rows.size()); - UpdateToTensorKernel< - T, PADDLE_CUDA_NUM_THREADS><<>>( - in1_data, in1_rows.data(), op, in2_data, in1_row_numel); + UpdateToTensorKernel<<< + grid, threads, 0, context.stream()>>>(in1_data, in1_rows.data(), op, + in2_data, in1_row_numel); } }; } // namespace scatter diff --git a/paddle/operators/math/selected_rows_functor.h b/paddle/operators/math/selected_rows_functor.h index d4bef72980..09d4631905 100644 --- a/paddle/operators/math/selected_rows_functor.h +++ b/paddle/operators/math/selected_rows_functor.h @@ -123,10 +123,9 @@ enum class ScatterOps { ASSIGN, ADD, SUB, SUBBY, MUL, DIV, DIVBY }; // out = seleted_rows_in / tensor template struct UpdateToTensor { - framework::Tensor operator()(const DeviceContext& context, - const ScatterOps& op, - const framework::SelectedRows& input1, - framework::Tensor* input2); + void operator()(const DeviceContext& context, const ScatterOps& op, + const framework::SelectedRows& input1, + framework::Tensor* input2); }; } // namespace scatter From 903d5609c61046cfa37280af5506ca21e350b852 Mon Sep 17 00:00:00 2001 From: typhoonzero Date: Fri, 29 Dec 2017 14:11:37 +0800 Subject: [PATCH 132/181] follow comment1 --- paddle/operators/adam_op.h | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/paddle/operators/adam_op.h b/paddle/operators/adam_op.h index 3c4148ccc0..9cc34bdded 100644 --- a/paddle/operators/adam_op.h +++ b/paddle/operators/adam_op.h @@ -124,19 +124,20 @@ struct SparseAdamFunctor { row_numel_(row_numel) {} inline HOSTDEVICE void operator()(size_t i) const { + T beta1_pow = *beta1_pow_; + T beta2_pow = *beta2_pow_; for (int64_t j = 0; j < row_numel_; ++j) { T g = grad_[i * row_numel_ + j]; T mom1 = moment1_[rows_[i] * row_numel_ + j]; T mom2 = moment2_[rows_[i] * row_numel_ + j]; T lr = *lr_; - T beta1_pow = *beta1_pow_; - T beta2_pow = *beta2_pow_; T p = param_[rows_[i] * row_numel_ + j]; lr *= sqrt(1 - beta2_pow) / (1 - beta1_pow); mom1 = beta1_ * mom1 + (1 - beta1_) * g; mom2 = beta2_ * mom2 + (1 - beta2_) * g * g; p -= lr * (mom1 / (sqrt(mom2) + epsilon_)); + moment1_out_[rows_[i] * row_numel_ + j] = mom1; moment2_out_[rows_[i] * row_numel_ + j] = mom2; param_out_[rows_[i] * row_numel_ + j] = p; From d25f382d0b8c095008e1f5694e7aaf6f7fa7c075 Mon Sep 17 00:00:00 2001 From: Yang Yu Date: Fri, 29 Dec 2017 14:52:40 +0800 Subject: [PATCH 133/181] Remove debug codes --- paddle/framework/executor.cc | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/paddle/framework/executor.cc b/paddle/framework/executor.cc index d465f88888..bf1f0471cc 100644 --- a/paddle/framework/executor.cc +++ b/paddle/framework/executor.cc @@ -67,8 +67,7 @@ static void CheckTensorNANOrInf(const std::string& name, return; } PADDLE_ENFORCE(!framework::HasInf(tensor), "Tensor %s has Inf", name); - PADDLE_ENFORCE(!framework::HasNAN(tensor), "Tensor %s has NAN, %p", name, - &tensor); + 
PADDLE_ENFORCE(!framework::HasNAN(tensor), "Tensor %s has NAN", name); } void Executor::Run(const ProgramDesc& pdesc, Scope* scope, int block_id, From 4a11fdb4ef698bb757ad310b53592c0968893b95 Mon Sep 17 00:00:00 2001 From: chengduoZH Date: Fri, 29 Dec 2017 15:07:07 +0800 Subject: [PATCH 134/181] follow comments --- paddle/operators/cos_sim_op.cc | 10 +++---- paddle/operators/cos_sim_op.cu | 12 ++++----- paddle/operators/cos_sim_op.h | 48 +++++++++++++++++----------------- 3 files changed, 35 insertions(+), 35 deletions(-) diff --git a/paddle/operators/cos_sim_op.cc b/paddle/operators/cos_sim_op.cc index 77492e60f2..d4f3ca5e32 100644 --- a/paddle/operators/cos_sim_op.cc +++ b/paddle/operators/cos_sim_op.cc @@ -155,11 +155,11 @@ struct CosSimDyFunctor { const T* y_norm, const T* x, const T* y, const T* z, const T* dz, const size_t rows, const size_t cols, T* dy) const { - for (size_t offset = 0; offset < rows; ++offset) { - auto xy_norm_prod = x_norm[offset] * y_norm[0]; - auto dz_data = dz[offset]; - auto z_data = z[offset]; - auto* x_data = x + cols * offset; + for (size_t row_id = 0; row_id < rows; ++row_id) { + auto xy_norm_prod = x_norm[row_id] * y_norm[0]; + auto dz_data = dz[row_id]; + auto z_data = z[row_id]; + auto* x_data = x + cols * row_id; auto reciprocal_xy_norm_prod = 1 / xy_norm_prod; auto y_norm_square = y_norm[0] * y_norm[0]; diff --git a/paddle/operators/cos_sim_op.cu b/paddle/operators/cos_sim_op.cu index 86dc04995a..891436c948 100644 --- a/paddle/operators/cos_sim_op.cu +++ b/paddle/operators/cos_sim_op.cu @@ -25,12 +25,12 @@ __global__ void CosSimDyKernel(const T* x_norm, const T* y_norm, const T* x, const size_t rows, const size_t cols, T* dy) { int grid_size = blockDim.x * gridDim.x; T y_norm_data = y_norm[0]; - for (int offset = blockIdx.x * blockDim.x + threadIdx.x; offset < rows; - offset += grid_size) { - T xy_norm_prod = x_norm[offset] * y_norm_data; - T dz_data = dz[offset]; - T z_data = z[offset]; - const T* x_data = x + cols * offset; + for (int row_id = blockIdx.x * blockDim.x + threadIdx.x; row_id < rows; + row_id += grid_size) { + T xy_norm_prod = x_norm[row_id] * y_norm_data; + T dz_data = dz[row_id]; + T z_data = z[row_id]; + const T* x_data = x + cols * row_id; T reciprocal_xy_norm_prod = 1 / xy_norm_prod; T y_norm_square = y_norm_data * y_norm_data; diff --git a/paddle/operators/cos_sim_op.h b/paddle/operators/cos_sim_op.h index 7641ca15f1..160edb0b56 100644 --- a/paddle/operators/cos_sim_op.h +++ b/paddle/operators/cos_sim_op.h @@ -32,11 +32,11 @@ struct CosSimFunctor { z_(z), cols_(static_cast(cols)) {} - inline HOSTDEVICE void operator()(size_t offset) const { - auto* x = x_ + cols_ * offset; + inline HOSTDEVICE void operator()(size_t row_id) const { + auto* x = x_ + cols_ * row_id; T xx = 0, xy = 0, yy = 0; if (same_row) { - auto* y = y_ + cols_ * offset; + auto* y = y_ + cols_ * row_id; T tep_x, tep_y; for (size_t i = 0; i < cols_; ++i) { tep_x = x[i]; @@ -47,9 +47,9 @@ struct CosSimFunctor { } xx = sqrt(xx); yy = sqrt(yy); - y_norm_[offset] = yy; - x_norm_[offset] = xx; - z_[offset] = xy / (xx * yy); + y_norm_[row_id] = yy; + x_norm_[row_id] = xx; + z_[row_id] = xy / (xx * yy); } else { // This can be wrote in a better way. 
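      // (one option: hoist the yy accumulation out of the per-row functor,
      // since y has only a single row on this branch)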
T tep_x, tep_y; for (size_t i = 0; i < cols_; ++i) { @@ -61,9 +61,9 @@ struct CosSimFunctor { } xx = sqrt(xx); yy = sqrt(yy); - if (offset == 0) y_norm_[0] = yy; - x_norm_[offset] = xx; - z_[offset] = xy / (xx * yy); + if (row_id == 0) y_norm_[0] = yy; + x_norm_[row_id] = xx; + z_[row_id] = xy / (xx * yy); } } @@ -125,15 +125,15 @@ struct CosSimGradFunctor { dx_(dx), cols_(static_cast(cols)) {} - inline HOSTDEVICE void operator()(size_t offset) const { - auto x_norm_square = x_norm_[offset] * x_norm_[offset]; - auto xy_norm_prod = x_norm_[offset] * y_norm_[offset]; - auto dz = dz_[offset]; - auto z = z_[offset]; + inline HOSTDEVICE void operator()(size_t row_id) const { + auto x_norm_square = x_norm_[row_id] * x_norm_[row_id]; + auto xy_norm_prod = x_norm_[row_id] * y_norm_[row_id]; + auto dz = dz_[row_id]; + auto z = z_[row_id]; - auto* dx = dx_ + cols_ * offset; - auto* x = x_ + cols_ * offset; - auto* y = y_ + cols_ * offset; + auto* dx = dx_ + cols_ * row_id; + auto* x = x_ + cols_ * row_id; + auto* y = y_ + cols_ * row_id; auto reciprocal_xy_norm_prod = 1 / xy_norm_prod; auto reciprocal_x_norm_square = 1 / x_norm_square; @@ -166,14 +166,14 @@ struct CosSimDxFunctor { dx_(dx), cols_(static_cast(cols)) {} - inline HOSTDEVICE void operator()(size_t offset) const { - auto xy_norm_prod = x_norm_[offset] * y_norm_[0]; - auto dz = dz_[offset]; - auto z = z_[offset]; - auto* x = x_ + cols_ * offset; + inline HOSTDEVICE void operator()(size_t row_id) const { + auto xy_norm_prod = x_norm_[row_id] * y_norm_[0]; + auto dz = dz_[row_id]; + auto z = z_[row_id]; + auto* x = x_ + cols_ * row_id; auto reciprocal_xy_norm_prod = 1 / xy_norm_prod; - auto x_norm_square = x_norm_[offset] * x_norm_[offset]; - auto* dx = dx_ + cols_ * offset; + auto x_norm_square = x_norm_[row_id] * x_norm_[row_id]; + auto* dx = dx_ + cols_ * row_id; auto reciprocal_x_norm_square = 1 / x_norm_square; for (size_t i = 0; i < cols_; ++i) { From e188f0c16041f560ce7efe3a763a9dc164a06f28 Mon Sep 17 00:00:00 2001 From: Luo Tao Date: Fri, 29 Dec 2017 15:36:43 +0800 Subject: [PATCH 135/181] add paddle version of pip install --- doc/getstarted/build_and_install/pip_install_cn.rst | 4 ++-- doc/getstarted/build_and_install/pip_install_en.rst | 4 ++-- doc/getstarted/index_cn.rst | 4 ++-- doc/getstarted/index_en.rst | 4 ++-- 4 files changed, 8 insertions(+), 8 deletions(-) diff --git a/doc/getstarted/build_and_install/pip_install_cn.rst b/doc/getstarted/build_and_install/pip_install_cn.rst index a4587f82a9..0c741e936b 100644 --- a/doc/getstarted/build_and_install/pip_install_cn.rst +++ b/doc/getstarted/build_and_install/pip_install_cn.rst @@ -11,14 +11,14 @@ PaddlePaddle可以使用常用的Python包管理工具 ------------------------------ -执行下面的命令即可在当前机器上安装PaddlePaddle的运行时环境,并自动下载安装依赖软件。 +执行下面的命令即可在当前机器上安装PaddlePaddle的运行时环境,并自动下载安装依赖软件,版本为cpu_avx_openblas。 .. code-block:: bash pip install paddlepaddle -如果需要安装支持GPU的版本,需要执行: +如果需要安装支持GPU的版本(cuda7.5_cudnn5_avx_openblas),需要执行: .. code-block:: bash diff --git a/doc/getstarted/build_and_install/pip_install_en.rst b/doc/getstarted/build_and_install/pip_install_en.rst index 55e31560a0..285ed09805 100644 --- a/doc/getstarted/build_and_install/pip_install_en.rst +++ b/doc/getstarted/build_and_install/pip_install_en.rst @@ -12,14 +12,14 @@ Install Using pip ------------------------------ Run the following command to install PaddlePaddle on the current -machine, it will also download requirements. +machine, it will also download requirements, the version is cpu_avx_openblas. .. 
code-block:: bash pip install paddlepaddle -If you wish to install GPU version, just run: +If you wish to install GPU version (cuda7.5_cudnn5_avx_openblas), just run: .. code-block:: bash diff --git a/doc/getstarted/index_cn.rst b/doc/getstarted/index_cn.rst index a9087be6f3..9f6ee25987 100644 --- a/doc/getstarted/index_cn.rst +++ b/doc/getstarted/index_cn.rst @@ -7,13 +7,13 @@ ++++++++ PaddlePaddle支持使用pip快速安装,目前支持CentOS 6以上, Ubuntu 14.04以及MacOS 10.12,并安装有Python2.7。 -执行下面的命令完成快速安装: +执行下面的命令完成快速安装,版本为cpu_avx_openblas: .. code-block:: bash pip install paddlepaddle -如果需要安装支持GPU的版本,需要执行: +如果需要安装支持GPU的版本(cuda7.5_cudnn5_avx_openblas),需要执行: .. code-block:: bash diff --git a/doc/getstarted/index_en.rst b/doc/getstarted/index_en.rst index d14e3f5c0c..063d9d880c 100644 --- a/doc/getstarted/index_en.rst +++ b/doc/getstarted/index_en.rst @@ -8,13 +8,13 @@ Quick Install You can use pip to install PaddlePaddle with a single command, supports CentOS 6 above, Ubuntu 14.04 above or MacOS 10.12, with Python 2.7 installed. -Simply run the following command to install: +Simply run the following command to install, the version is cpu_avx_openblas: .. code-block:: bash pip install paddlepaddle -If you need to install GPU version, run: +If you need to install GPU version (cuda7.5_cudnn5_avx_openblas), run: .. code-block:: bash From c144261d40ab7c5d24e29c03155310a53d79909e Mon Sep 17 00:00:00 2001 From: Luo Tao Date: Fri, 29 Dec 2017 16:34:08 +0800 Subject: [PATCH 136/181] add paddle version of docker --- doc/getstarted/build_and_install/docker_install_cn.rst | 8 ++++---- doc/getstarted/build_and_install/docker_install_en.rst | 6 +++--- 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/doc/getstarted/build_and_install/docker_install_cn.rst b/doc/getstarted/build_and_install/docker_install_cn.rst index fa1b6a3727..bae42593dd 100644 --- a/doc/getstarted/build_and_install/docker_install_cn.rst +++ b/doc/getstarted/build_and_install/docker_install_cn.rst @@ -15,7 +15,7 @@ 获取PaddlePaddle的Docker镜像 ------------------------------ -执行下面的命令获取最新的PaddlePaddle Docker镜像 +执行下面的命令获取最新的PaddlePaddle Docker镜像,版本为cpu_avx_mkl: .. code-block:: bash @@ -27,7 +27,7 @@ docker pull docker.paddlepaddle.org/paddle -下载GPU版本的Docker镜像: +下载GPU版本(cuda8.0_cudnn5_avx_mkl)的Docker镜像: .. code-block:: bash @@ -54,7 +54,7 @@ .. _docker_run: 在Docker中执行PaddlePaddle训练程序 ------------------------------- +---------------------------------- 假设您已经在当前目录(比如在/home/work)编写了一个PaddlePaddle的程序 :code:`train.py` (可以参考 `PaddlePaddleBook `_ @@ -82,7 +82,7 @@ .. _docker_run_book: 使用Docker启动PaddlePaddle Book教程 ------------------------------- +----------------------------------- 使用Docker可以快速在本地启动一个包含了PaddlePaddle官方Book教程的Jupyter Notebook,可以通过网页浏览。 PaddlePaddle Book是为用户和开发者制作的一个交互式的Jupyter Notebook。 diff --git a/doc/getstarted/build_and_install/docker_install_en.rst b/doc/getstarted/build_and_install/docker_install_en.rst index 06012bf65e..56a7c68e4d 100644 --- a/doc/getstarted/build_and_install/docker_install_en.rst +++ b/doc/getstarted/build_and_install/docker_install_en.rst @@ -16,7 +16,7 @@ After you've read above tutorials you may proceed the following steps. Pull PaddlePaddle Docker Image ------------------------------ -Run the following command to download the latest Docker images: +Run the following command to download the latest Docker images, the version is cpu_avx_mkl: .. 
code-block:: bash @@ -28,7 +28,7 @@ For users in China, we provide a faster mirror: docker pull docker.paddlepaddle.org/paddle -Download GPU version images: +Download GPU version (cuda8.0_cudnn5_avx_mkl) images: .. code-block:: bash @@ -58,7 +58,7 @@ and run: .. _docker_run: Launch your training program in Docker ------------------------------- +-------------------------------------- Assume that you have already written a PaddlePaddle program named :code:`train.py` under directory :code:`/home/work` (refer to From 5036cf03872a1a1b68cd974e21193ae82f5da071 Mon Sep 17 00:00:00 2001 From: QI JUN Date: Fri, 29 Dec 2017 16:43:10 +0800 Subject: [PATCH 137/181] add helper function to get appropriate DeviceContext (#7066) * add helper function to get appropriate DeviceContext --- paddle/framework/data_transform.h | 5 ++-- paddle/framework/data_transform_test.cc | 15 ++++++------ paddle/framework/operator.cc | 32 ++++++++++++++++++------- 3 files changed, 33 insertions(+), 19 deletions(-) diff --git a/paddle/framework/data_transform.h b/paddle/framework/data_transform.h index 2191dd3783..bd6d301c12 100644 --- a/paddle/framework/data_transform.h +++ b/paddle/framework/data_transform.h @@ -27,9 +27,8 @@ limitations under the License. */ namespace paddle { namespace framework { -using DataTransformFn = - std::function ctx, - const Variable& in, Variable* out)>; +using DataTransformFn = std::function; using KernelTypePair = std::pair; struct KernelTypePairHash { diff --git a/paddle/framework/data_transform_test.cc b/paddle/framework/data_transform_test.cc index 4e2141ecd2..5f05e881fa 100644 --- a/paddle/framework/data_transform_test.cc +++ b/paddle/framework/data_transform_test.cc @@ -54,18 +54,18 @@ auto kernel1 = GenFromBit({0, 0, 0, 1}); auto kernel2 = GenFromBit({0, 0, 1, 0}); auto kernel3 = GenFromBit({0, 0, 1, 1}); -void TransDataType_t(std::vector ctx, - const Variable& in, Variable* out) { +void TransDataType_t(const platform::DeviceContext* ctx, const Variable& in, + Variable* out) { test_value++; } -void TransDataLayout_t(std::vector ctx, - const Variable& in, Variable* out) { +void TransDataLayout_t(const platform::DeviceContext* ctx, const Variable& in, + Variable* out) { test_value--; } -void TransLibraryType_t(std::vector ctx, - const Variable& in, Variable* out) { +void TransLibraryType_t(const platform::DeviceContext* ctx, const Variable& in, + Variable* out) { test_value += 2; } @@ -83,7 +83,8 @@ TEST(DataTransform, Register) { using namespace paddle::platform; auto& instance = DataTransformFnMap::Instance(); - std::vector ctx; + ASSERT_EQ(instance.Map().size(), 3UL); + DeviceContext* ctx = nullptr; paddle::framework::Variable in; paddle::framework::Variable out; diff --git a/paddle/framework/operator.cc b/paddle/framework/operator.cc index c0be11294c..a3ce96c409 100644 --- a/paddle/framework/operator.cc +++ b/paddle/framework/operator.cc @@ -384,6 +384,24 @@ class RuntimeInferShapeContext : public InferShapeContext { const Scope& scope_; }; +const platform::DeviceContext* GetDeviceContext( + framework::KernelTypePair& kernel_pair) { + auto& actual_kernel_key = kernel_pair.first; + auto& expected_kernel_key = kernel_pair.second; + platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance(); + + if (platform::is_gpu_place(actual_kernel_key.place_) && + platform::is_cpu_place(expected_kernel_key.place_)) { + return pool.Get(actual_kernel_key.place_); + } else if (platform::is_cpu_place(actual_kernel_key.place_) && + platform::is_gpu_place(expected_kernel_key.place_)) { + 
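+    // actual data sits on a CPU place while the expected kernel runs on
+    // CUDA: return the CUDA place's context, which the caller uses to run
+    // and then wait on the data transform.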
return pool.Get(expected_kernel_key.place_); + } else { + PADDLE_THROW( + "Currently, model parallelism is only supported between CPU and CUDA"); + } +} + void OperatorWithKernel::Run(const Scope& scope, const platform::Place& place) const { RuntimeInferShapeContext infer_shape_ctx(*this, scope); @@ -418,9 +436,9 @@ void OperatorWithKernel::Run(const Scope& scope, "CPU and other devices. For example, multi-GPU model " "parallelism will failed."); } else { + auto kernel_pair = std::make_pair(actual_kernel_key, expected_kernel_key); const DataTransformFn* trans_fun = - DataTransformFnMap::Instance().GetNullable( - std::make_pair(actual_kernel_key, expected_kernel_key)); + DataTransformFnMap::Instance().GetNullable(kernel_pair); if (trans_fun) { auto input_vars = this->InputVars(); // TODO(qijun) filter the input vars that do not need to be transformed @@ -437,22 +455,18 @@ void OperatorWithKernel::Run(const Scope& scope, } if (!need_trans.empty()) { - // TODO(qijun) get appropriate DeviceContext from DeviceContext pool - platform::DeviceContext* trans_dev_ctx = nullptr; - std::vector trans_dev_ctx_vec{trans_dev_ctx}; + auto trans_dev_ctx = GetDeviceContext(kernel_pair); // Wait for transform starting dev_ctx->Wait(); for (auto var_name : need_trans) { - (*trans_fun)(trans_dev_ctx_vec, *(scope.FindVar(var_name)), + (*trans_fun)(trans_dev_ctx, *(scope.FindVar(var_name)), scope.FindVar(var_name + framework::KernelTypeToString( expected_kernel_key))); } // Wait for data transform finishing - for (auto ctx : trans_dev_ctx_vec) { - ctx->Wait(); - } + trans_dev_ctx->Wait(); } } } From 24cf2fcd90a8409da2e5e38118c73eb4af13121f Mon Sep 17 00:00:00 2001 From: chengduoZH Date: Fri, 29 Dec 2017 15:16:49 +0800 Subject: [PATCH 138/181] move cos_sim_functor to math --- paddle/operators/CMakeLists.txt | 4 +- paddle/operators/cos_sim_op.cc | 22 --- paddle/operators/cos_sim_op.cu | 45 ------ paddle/operators/cos_sim_op.h | 153 +-------------------- paddle/operators/math/CMakeLists.txt | 2 + paddle/operators/math/cos_sim_functor.cc | 48 +++++++ paddle/operators/math/cos_sim_functor.cu | 64 +++++++++ paddle/operators/math/cos_sim_functor.h | 166 +++++++++++++++++++++++ 8 files changed, 290 insertions(+), 214 deletions(-) create mode 100644 paddle/operators/math/cos_sim_functor.cc create mode 100644 paddle/operators/math/cos_sim_functor.cu create mode 100644 paddle/operators/math/cos_sim_functor.h diff --git a/paddle/operators/CMakeLists.txt b/paddle/operators/CMakeLists.txt index 5aaaf99332..c6da04b5b4 100644 --- a/paddle/operators/CMakeLists.txt +++ b/paddle/operators/CMakeLists.txt @@ -210,7 +210,8 @@ set(DEPS_OPS save_op load_op send_op - recv_op) + recv_op + cos_sim_op) if(WITH_DISTRIBUTE) add_subdirectory(detail) @@ -256,6 +257,7 @@ op_library(lstm_op DEPS sequence2batch lstm_compute) op_library(conv_transpose_op DEPS vol2col) op_library(gru_op DEPS sequence2batch gru_compute) op_library(recurrent_op SRCS recurrent_op.cc DEPS executor) +op_library(cos_sim_op DEPS cos_sim_functor) # FIXME(typhoonzero): save/load depends lodtensor serialization functions op_library(save_op DEPS lod_tensor) diff --git a/paddle/operators/cos_sim_op.cc b/paddle/operators/cos_sim_op.cc index d4f3ca5e32..9019a1edb3 100644 --- a/paddle/operators/cos_sim_op.cc +++ b/paddle/operators/cos_sim_op.cc @@ -149,28 +149,6 @@ class CosSimOpGrad : public framework::OperatorWithKernel { } }; -template -struct CosSimDyFunctor { - inline void operator()(const platform::CPUDeviceContext& ctx, const T* x_norm, - const T* y_norm, const T* x, const 
T* y, const T* z, - const T* dz, const size_t rows, const size_t cols, - T* dy) const { - for (size_t row_id = 0; row_id < rows; ++row_id) { - auto xy_norm_prod = x_norm[row_id] * y_norm[0]; - auto dz_data = dz[row_id]; - auto z_data = z[row_id]; - auto* x_data = x + cols * row_id; - auto reciprocal_xy_norm_prod = 1 / xy_norm_prod; - - auto y_norm_square = y_norm[0] * y_norm[0]; - auto reciprocal_y_norm_square = 1 / y_norm_square; - for (size_t i = 0; i < cols; ++i) { - dy[i] += dz_data * (x_data[i] * reciprocal_xy_norm_prod - - z_data * y[i] * reciprocal_y_norm_square); - } - } - } -}; } // namespace operators } // namespace paddle diff --git a/paddle/operators/cos_sim_op.cu b/paddle/operators/cos_sim_op.cu index 891436c948..9e5d1b6e4f 100644 --- a/paddle/operators/cos_sim_op.cu +++ b/paddle/operators/cos_sim_op.cu @@ -14,51 +14,6 @@ limitations under the License. */ #define EIGEN_USE_GPU #include "paddle/operators/cos_sim_op.h" -#include "paddle/platform/cuda_helper.h" - -namespace paddle { -namespace operators { - -template -__global__ void CosSimDyKernel(const T* x_norm, const T* y_norm, const T* x, - const T* y, const T* z, const T* dz, - const size_t rows, const size_t cols, T* dy) { - int grid_size = blockDim.x * gridDim.x; - T y_norm_data = y_norm[0]; - for (int row_id = blockIdx.x * blockDim.x + threadIdx.x; row_id < rows; - row_id += grid_size) { - T xy_norm_prod = x_norm[row_id] * y_norm_data; - T dz_data = dz[row_id]; - T z_data = z[row_id]; - const T* x_data = x + cols * row_id; - T reciprocal_xy_norm_prod = 1 / xy_norm_prod; - - T y_norm_square = y_norm_data * y_norm_data; - T reciprocal_y_norm_square = 1 / y_norm_square; - for (size_t i = 0; i < cols; ++i) { - T dy_data = dz_data * (x_data[i] * reciprocal_xy_norm_prod - - z_data * y[i] * reciprocal_y_norm_square); - platform::CudaAtomicAdd(dy + i, dy_data); - } - } -} - -template -struct CosSimDyFunctor { - inline void operator()(const platform::CUDADeviceContext& ctx, - const T* x_norm, const T* y_norm, const T* x, - const T* y, const T* z, const T* dz, const size_t rows, - const size_t cols, T* dy) const { - const int block_size = 512; - dim3 threads(block_size, 1); - dim3 grid(1, (rows + block_size - 1) / block_size); - CosSimDyKernel<<>>( - x_norm, y_norm, x, y, z, dz, rows, cols, dy); - } -}; - -} // namespace operators -} // namespace paddle namespace ops = paddle::operators; REGISTER_OP_CUDA_KERNEL( diff --git a/paddle/operators/cos_sim_op.h b/paddle/operators/cos_sim_op.h index 160edb0b56..eadcca55f9 100644 --- a/paddle/operators/cos_sim_op.h +++ b/paddle/operators/cos_sim_op.h @@ -14,6 +14,7 @@ limitations under the License. 
*/ #pragma once #include "paddle/framework/op_registry.h" +#include "paddle/operators/math/cos_sim_functor.h" #include "paddle/operators/math/math_function.h" #include "paddle/platform/for_range.h" @@ -22,59 +23,6 @@ namespace operators { using Tensor = framework::Tensor; -template -struct CosSimFunctor { - CosSimFunctor(const T* x, const T* y, T* x_norm, T* y_norm, T* z, int cols) - : x_norm_(x_norm), - y_norm_(y_norm), - x_(x), - y_(y), - z_(z), - cols_(static_cast(cols)) {} - - inline HOSTDEVICE void operator()(size_t row_id) const { - auto* x = x_ + cols_ * row_id; - T xx = 0, xy = 0, yy = 0; - if (same_row) { - auto* y = y_ + cols_ * row_id; - T tep_x, tep_y; - for (size_t i = 0; i < cols_; ++i) { - tep_x = x[i]; - tep_y = y[i]; - xx += tep_x * tep_x; - yy += tep_y * tep_y; - xy += tep_x * tep_y; - } - xx = sqrt(xx); - yy = sqrt(yy); - y_norm_[row_id] = yy; - x_norm_[row_id] = xx; - z_[row_id] = xy / (xx * yy); - } else { // This can be wrote in a better way. - T tep_x, tep_y; - for (size_t i = 0; i < cols_; ++i) { - tep_x = x[i]; - tep_y = y_[i]; - xx += tep_x * tep_x; - yy += tep_y * tep_y; - xy += tep_x * tep_y; - } - xx = sqrt(xx); - yy = sqrt(yy); - if (row_id == 0) y_norm_[0] = yy; - x_norm_[row_id] = xx; - z_[row_id] = xy / (xx * yy); - } - } - - T* x_norm_; - T* y_norm_; - const T* x_; - const T* y_; - T* z_; - const size_t cols_; -}; - template class CosSimKernel : public framework::OpKernel { public: @@ -95,14 +43,14 @@ class CosSimKernel : public framework::OpKernel { int cols = framework::product(in_x->dims()) / rows_x; if (rows_x == rows_y) { - CosSimFunctor functor( + math::CosSimFunctor functor( in_x->data(), in_y->data(), out_x_norm->data(), out_y_norm->data(), out_z->data(), cols); platform::ForRange for_range( static_cast(context.device_context()), rows_x); for_range(functor); } else { - CosSimFunctor functor( + math::CosSimFunctor functor( in_x->data(), in_y->data(), out_x_norm->data(), out_y_norm->data(), out_z->data(), cols); platform::ForRange for_range( @@ -112,93 +60,6 @@ class CosSimKernel : public framework::OpKernel { } }; -template -struct CosSimGradFunctor { - CosSimGradFunctor(const T* x_norm, const T* y_norm, const T* x, const T* y, - const T* z, const T* dz, T* dx, int cols) - : x_norm_(x_norm), - y_norm_(y_norm), - x_(x), - y_(y), - z_(z), - dz_(dz), - dx_(dx), - cols_(static_cast(cols)) {} - - inline HOSTDEVICE void operator()(size_t row_id) const { - auto x_norm_square = x_norm_[row_id] * x_norm_[row_id]; - auto xy_norm_prod = x_norm_[row_id] * y_norm_[row_id]; - auto dz = dz_[row_id]; - auto z = z_[row_id]; - - auto* dx = dx_ + cols_ * row_id; - auto* x = x_ + cols_ * row_id; - auto* y = y_ + cols_ * row_id; - - auto reciprocal_xy_norm_prod = 1 / xy_norm_prod; - auto reciprocal_x_norm_square = 1 / x_norm_square; - for (size_t i = 0; i < cols_; ++i) { - dx[i] = dz * (y[i] * reciprocal_xy_norm_prod - - z * x[i] * reciprocal_x_norm_square); - } - } - - const T* x_norm_; - const T* y_norm_; - const T* x_; - const T* y_; - const T* z_; - const T* dz_; - T* dx_; - const size_t cols_; -}; - -template -struct CosSimDxFunctor { - CosSimDxFunctor(const T* x_norm, const T* y_norm, const T* x, const T* y, - const T* z, const T* dz, T* dx, int cols) - : x_norm_(x_norm), - y_norm_(y_norm), - x_(x), - y_(y), - z_(z), - dz_(dz), - dx_(dx), - cols_(static_cast(cols)) {} - - inline HOSTDEVICE void operator()(size_t row_id) const { - auto xy_norm_prod = x_norm_[row_id] * y_norm_[0]; - auto dz = dz_[row_id]; - auto z = z_[row_id]; - auto* x = x_ + cols_ * row_id; - 
auto reciprocal_xy_norm_prod = 1 / xy_norm_prod; - auto x_norm_square = x_norm_[row_id] * x_norm_[row_id]; - auto* dx = dx_ + cols_ * row_id; - auto reciprocal_x_norm_square = 1 / x_norm_square; - - for (size_t i = 0; i < cols_; ++i) { - dx[i] = dz * (y_[i] * reciprocal_xy_norm_prod - - z * x[i] * reciprocal_x_norm_square); - } - } - const T* x_norm_; - const T* y_norm_; - const T* x_; - const T* y_; - const T* z_; - const T* dz_; - T* dx_; - const size_t cols_; -}; - -template -struct CosSimDyFunctor { - inline void operator()(const DeviceContext& ctx, const T* x_norm, - const T* y_norm, const T* x, const T* y, const T* z, - const T* dz, const size_t rows, const size_t cols, - T* dy) const; -}; - template class CosSimGradKernel : public framework::OpKernel { public: @@ -220,7 +81,7 @@ class CosSimGradKernel : public framework::OpKernel { if (rows_x == rows_y) { if (out_grad_x) { - CosSimGradFunctor functor( + math::CosSimGradFunctor functor( in_x_norm->data(), in_y_norm->data(), in_x->data(), in_y->data(), in_z->data(), in_grad_z->data(), out_grad_x->mutable_data(context.GetPlace()), cols); @@ -230,7 +91,7 @@ class CosSimGradKernel : public framework::OpKernel { for_range(functor); } if (out_grad_y) { - CosSimGradFunctor functor( + math::CosSimGradFunctor functor( in_y_norm->data(), in_x_norm->data(), in_y->data(), in_x->data(), in_z->data(), in_grad_z->data(), out_grad_y->mutable_data(context.GetPlace()), cols); @@ -241,7 +102,7 @@ class CosSimGradKernel : public framework::OpKernel { } } else { if (out_grad_x) { - CosSimDxFunctor functor( + math::CosSimDxFunctor functor( in_x_norm->data(), in_y_norm->data(), in_x->data(), in_y->data(), in_z->data(), in_grad_z->data(), out_grad_x->mutable_data(context.GetPlace()), cols); @@ -256,7 +117,7 @@ class CosSimGradKernel : public framework::OpKernel { auto& dev_ctx = context.template device_context(); set_zero(dev_ctx, out_grad_y, static_cast(0)); - CosSimDyFunctor functor; + math::CosSimDyFunctor functor; functor(dev_ctx, in_x_norm->data(), in_y_norm->data(), in_x->data(), in_y->data(), in_z->data(), in_grad_z->data(), static_cast(rows_x), diff --git a/paddle/operators/math/CMakeLists.txt b/paddle/operators/math/CMakeLists.txt index bf47879f77..830ae53cbe 100644 --- a/paddle/operators/math/CMakeLists.txt +++ b/paddle/operators/math/CMakeLists.txt @@ -16,6 +16,7 @@ if(WITH_GPU) nv_library(maxouting SRCS maxouting.cc maxouting.cu DEPS device_context) nv_library(unpooling SRCS unpooling.cc unpooling.cu DEPS device_context) nv_library(gru_compute SRCS gru_compute.cc gru_compute.cu DEPS device_context activation_functions math_function) + nv_library(cos_sim_functor SRCS cos_sim_functor.cc cos_sim_functor.cu DEPS device_context) else() cc_library(math_function SRCS math_function.cc im2col.cc DEPS cblas device_context framework_proto) cc_library(selected_rows_functor SRCS selected_rows_functor.cc DEPS selected_rows math_function) @@ -30,6 +31,7 @@ else() cc_library(maxouting SRCS maxouting.cc DEPS device_context) cc_library(unpooling SRCS unpooling.cc DEPS device_context) cc_library(gru_compute SRCS gru_compute.cc DEPS device_context activation_functions math_function) + cc_library(cos_sim_functor SRCS cos_sim_functor.cc DEPS device_context) endif() cc_test(math_function_test SRCS math_function_test.cc DEPS math_function tensor) diff --git a/paddle/operators/math/cos_sim_functor.cc b/paddle/operators/math/cos_sim_functor.cc new file mode 100644 index 0000000000..f52a82b108 --- /dev/null +++ b/paddle/operators/math/cos_sim_functor.cc @@ -0,0 +1,48 @@ 
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/operators/math/cos_sim_functor.h" + +namespace paddle { +namespace operators { +namespace math { + +template +struct CosSimDyFunctor { + void operator()(const platform::CPUDeviceContext& ctx, const T* x_norm, + const T* y_norm, const T* x, const T* y, const T* z, + const T* dz, const size_t rows, const size_t cols, + T* dy) const { + for (size_t row_id = 0; row_id < rows; ++row_id) { + auto xy_norm_prod = x_norm[row_id] * y_norm[0]; + auto dz_data = dz[row_id]; + auto z_data = z[row_id]; + auto* x_data = x + cols * row_id; + auto reciprocal_xy_norm_prod = 1 / xy_norm_prod; + + auto y_norm_square = y_norm[0] * y_norm[0]; + auto reciprocal_y_norm_square = 1 / y_norm_square; + for (size_t i = 0; i < cols; ++i) { + dy[i] += dz_data * (x_data[i] * reciprocal_xy_norm_prod - + z_data * y[i] * reciprocal_y_norm_square); + } + } + } +}; + +template class CosSimDyFunctor; +template class CosSimDyFunctor; +} // namespace math +} // namespace operators +} // namespace paddle diff --git a/paddle/operators/math/cos_sim_functor.cu b/paddle/operators/math/cos_sim_functor.cu new file mode 100644 index 0000000000..fb19a8b38a --- /dev/null +++ b/paddle/operators/math/cos_sim_functor.cu @@ -0,0 +1,64 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/operators/math/cos_sim_functor.h" +#include "paddle/platform/cuda_helper.h" + +namespace paddle { +namespace operators { +namespace math { + +template +__global__ void CosSimDyKernel(const T* x_norm, const T* y_norm, const T* x, + const T* y, const T* z, const T* dz, + const size_t rows, const size_t cols, T* dy) { + int grid_size = blockDim.x * gridDim.x; + T y_norm_data = y_norm[0]; + for (int row_id = blockIdx.x * blockDim.x + threadIdx.x; row_id < rows; + row_id += grid_size) { + T xy_norm_prod = x_norm[row_id] * y_norm_data; + T dz_data = dz[row_id]; + T z_data = z[row_id]; + const T* x_data = x + cols * row_id; + T reciprocal_xy_norm_prod = 1 / xy_norm_prod; + + T y_norm_square = y_norm_data * y_norm_data; + T reciprocal_y_norm_square = 1 / y_norm_square; + for (size_t i = 0; i < cols; ++i) { + T dy_data = dz_data * (x_data[i] * reciprocal_xy_norm_prod - + z_data * y[i] * reciprocal_y_norm_square); + platform::CudaAtomicAdd(dy + i, dy_data); + } + } +} + +template +struct CosSimDyFunctor { + void operator()(const platform::CUDADeviceContext& ctx, const T* x_norm, + const T* y_norm, const T* x, const T* y, const T* z, + const T* dz, const size_t rows, const size_t cols, + T* dy) const { + const int block_size = 512; + dim3 threads(block_size, 1); + dim3 grid(1, (rows + block_size - 1) / block_size); + CosSimDyKernel<<>>( + x_norm, y_norm, x, y, z, dz, rows, cols, dy); + } +}; + +template class CosSimDyFunctor; +template class CosSimDyFunctor; +} // namespace math +} // namespace operators +} // namespace paddle diff --git a/paddle/operators/math/cos_sim_functor.h b/paddle/operators/math/cos_sim_functor.h new file mode 100644 index 0000000000..aae8ab5b7a --- /dev/null +++ b/paddle/operators/math/cos_sim_functor.h @@ -0,0 +1,166 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once +#include +#include +#include "paddle/platform/device_context.h" +#include "paddle/platform/hostdevice.h" + +namespace paddle { +namespace operators { +namespace math { + +template +struct CosSimFunctor { + CosSimFunctor(const T* x, const T* y, T* x_norm, T* y_norm, T* z, int cols) + : x_norm_(x_norm), + y_norm_(y_norm), + x_(x), + y_(y), + z_(z), + cols_(static_cast(cols)) {} + + inline HOSTDEVICE void operator()(size_t row_id) const { + auto* x = x_ + cols_ * row_id; + T xx = 0, xy = 0, yy = 0; + if (same_row) { + auto* y = y_ + cols_ * row_id; + T tep_x, tep_y; + for (size_t i = 0; i < cols_; ++i) { + tep_x = x[i]; + tep_y = y[i]; + xx += tep_x * tep_x; + yy += tep_y * tep_y; + xy += tep_x * tep_y; + } + xx = sqrt(xx); + yy = sqrt(yy); + y_norm_[row_id] = yy; + x_norm_[row_id] = xx; + z_[row_id] = xy / (xx * yy); + } else { // This can be wrote in a better way. 
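+      // Broadcast case: y_ holds a single row shared by every row of x,
+      // so it is indexed without a per-row offset below.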
+ T tep_x, tep_y; + for (size_t i = 0; i < cols_; ++i) { + tep_x = x[i]; + tep_y = y_[i]; + xx += tep_x * tep_x; + yy += tep_y * tep_y; + xy += tep_x * tep_y; + } + xx = sqrt(xx); + yy = sqrt(yy); + if (row_id == 0) y_norm_[0] = yy; + x_norm_[row_id] = xx; + z_[row_id] = xy / (xx * yy); + } + } + + T* x_norm_; + T* y_norm_; + const T* x_; + const T* y_; + T* z_; + const size_t cols_; +}; + +template +struct CosSimGradFunctor { + CosSimGradFunctor(const T* x_norm, const T* y_norm, const T* x, const T* y, + const T* z, const T* dz, T* dx, int cols) + : x_norm_(x_norm), + y_norm_(y_norm), + x_(x), + y_(y), + z_(z), + dz_(dz), + dx_(dx), + cols_(static_cast(cols)) {} + + inline HOSTDEVICE void operator()(size_t row_id) const { + auto x_norm_square = x_norm_[row_id] * x_norm_[row_id]; + auto xy_norm_prod = x_norm_[row_id] * y_norm_[row_id]; + auto dz = dz_[row_id]; + auto z = z_[row_id]; + + auto* dx = dx_ + cols_ * row_id; + auto* x = x_ + cols_ * row_id; + auto* y = y_ + cols_ * row_id; + + auto reciprocal_xy_norm_prod = 1 / xy_norm_prod; + auto reciprocal_x_norm_square = 1 / x_norm_square; + for (size_t i = 0; i < cols_; ++i) { + dx[i] = dz * (y[i] * reciprocal_xy_norm_prod - + z * x[i] * reciprocal_x_norm_square); + } + } + + const T* x_norm_; + const T* y_norm_; + const T* x_; + const T* y_; + const T* z_; + const T* dz_; + T* dx_; + const size_t cols_; +}; + +template +struct CosSimDxFunctor { + CosSimDxFunctor(const T* x_norm, const T* y_norm, const T* x, const T* y, + const T* z, const T* dz, T* dx, int cols) + : x_norm_(x_norm), + y_norm_(y_norm), + x_(x), + y_(y), + z_(z), + dz_(dz), + dx_(dx), + cols_(static_cast(cols)) {} + + inline HOSTDEVICE void operator()(size_t row_id) const { + auto xy_norm_prod = x_norm_[row_id] * y_norm_[0]; + auto dz = dz_[row_id]; + auto z = z_[row_id]; + auto* x = x_ + cols_ * row_id; + auto reciprocal_xy_norm_prod = 1 / xy_norm_prod; + auto x_norm_square = x_norm_[row_id] * x_norm_[row_id]; + auto* dx = dx_ + cols_ * row_id; + auto reciprocal_x_norm_square = 1 / x_norm_square; + + for (size_t i = 0; i < cols_; ++i) { + dx[i] = dz * (y_[i] * reciprocal_xy_norm_prod - + z * x[i] * reciprocal_x_norm_square); + } + } + const T* x_norm_; + const T* y_norm_; + const T* x_; + const T* y_; + const T* z_; + const T* dz_; + T* dx_; + const size_t cols_; +}; + +template +struct CosSimDyFunctor { + void operator()(const DeviceContext& ctx, const T* x_norm, const T* y_norm, + const T* x, const T* y, const T* z, const T* dz, + const size_t rows, const size_t cols, T* dy) const; +}; + +} // namespace math +} // namespace operators +} // namespace paddle From d14ca1c39f16b3744cd42e27d86a21a1f5020e37 Mon Sep 17 00:00:00 2001 From: tensor-tang Date: Fri, 29 Dec 2017 17:30:28 +0800 Subject: [PATCH 139/181] fix inference crash of alexnet benchmark --- benchmark/paddle/image/alexnet.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/benchmark/paddle/image/alexnet.py b/benchmark/paddle/image/alexnet.py index 77d130ae34..cad6051f14 100644 --- a/benchmark/paddle/image/alexnet.py +++ b/benchmark/paddle/image/alexnet.py @@ -19,7 +19,11 @@ args = { 'num_samples': num_samples } define_py_data_sources2( - "train.list", None, module="provider", obj="process", args=args) + "train.list" if not is_infer else None, + "test.list" if is_infer else None, + module="provider", + obj="process", + args=args) settings( batch_size=batch_size, From 747741a9d5c446bb20af288d3549920f453f5a76 Mon Sep 17 00:00:00 2001 From: tensor-tang Date: Fri, 29 Dec 2017 17:56:52 +0800 
Subject: [PATCH 140/181] update alexnet inference benchmark data on MKL --- benchmark/IntelOptimizedPaddle.md | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/benchmark/IntelOptimizedPaddle.md b/benchmark/IntelOptimizedPaddle.md index 6cc9598947..084d3237d9 100644 --- a/benchmark/IntelOptimizedPaddle.md +++ b/benchmark/IntelOptimizedPaddle.md @@ -93,6 +93,15 @@ Test on batch size 1, 2, 4, 8, 16 on Intel(R) Xeon(R) Gold 6148 CPU @ 2.40GHz | MKLML | 22.74 | 41.56 | 81.22 | 133.47 | 210.53 | | MKL-DNN | 175.10 | 272.92 | 450.70 | 512.00 | 600.94 | +- Alexnet + +| BatchSize | 1 | 2 | 4 | 8 | 16 | +|-----------|--------|--------|--------|--------|--------| +| OpenBLAS | | | | | | +| MKLML | 21.32 | 36.55 | 73.06 | 131.15 | 192.77 | +| MKL-DNN | 442.91 | 656.41 | 719.10 | 847.68 | 850.51 | + +chart TBD ### Laptop TBD From 0a5fbb06508731aa55ffda3e4a68a9fabff2a72a Mon Sep 17 00:00:00 2001 From: dangqingqing Date: Fri, 29 Dec 2017 18:04:03 +0800 Subject: [PATCH 141/181] Refine code struct. --- paddle/platform/device_context.h | 12 --- paddle/platform/profiler.cc | 149 +++++++++++++++++++++++++------ paddle/platform/profiler.h | 131 +++++---------------------- paddle/platform/profiler_test.cc | 12 +-- 4 files changed, 154 insertions(+), 150 deletions(-) diff --git a/paddle/platform/device_context.h b/paddle/platform/device_context.h index 07e197ba0b..2b366e6383 100644 --- a/paddle/platform/device_context.h +++ b/paddle/platform/device_context.h @@ -115,18 +115,6 @@ class CUDNNDeviceContext : public CUDADeviceContext { cudnnHandle_t cudnn_handle_; }; -class DeviceGuard { - public: - explicit DeviceGuard(int device) { - original_device_ = platform::GetCurrentDeviceId(); - platform::SetDeviceId(device); - } - ~DeviceGuard() { platform::SetDeviceId(original_device_); } - - private: - int original_device_; -}; - #endif /*! \brief device context pool singleton */ diff --git a/paddle/platform/profiler.cc b/paddle/platform/profiler.cc index 40b34b732c..4e89e5c600 100644 --- a/paddle/platform/profiler.cc +++ b/paddle/platform/profiler.cc @@ -17,34 +17,133 @@ limitations under the License. */ namespace paddle { namespace platform { -ProfilerState kState = ProfilerState::kDisabled; -uint32_t kNextThreadId = 0; -std::mutex kAllEventListsMutex; -std::list> kAllEventLists; -thread_local std::shared_ptr kEventList; -thread_local int32_t kThreadId; +// The profiler state, the initial value is ProfilerState::kDisabled +static ProfilerState g_state = ProfilerState::kDisabled; +// The thread local event list only can be accessed by the specific thread +// The thread index of each thread +static thread_local int32_t g_thread_id; +// The g_next_thread_id is a global counter for threads, by the g_thread_id and +// g_next_thread_id, we can know how many threads have created EventList. 
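+// (Each thread lazily creates its own EventList the first time it records
+// an event; see GetEventList() below.)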
+static uint32_t g_next_thread_id = 0; +// The global mutex +static std::mutex g_all_event_lists_mutex; +// The total event lists of all threads +static std::list> g_all_event_lists; +// The thread local event list only can be accessed by the specific thread +static thread_local std::shared_ptr g_event_list; + +inline uint64_t GetTimeInNsec() { + using clock = std::conditional::type; + return std::chrono::duration_cast( + clock::now().time_since_epoch()) + .count(); +} + +Event::Event(EventKind kind, std::string name, uint32_t thread_id, + DeviceContext* dev_ctx) + : kind_(kind), + name_(std::move(name)), + thread_id_(thread_id), + has_cuda_(false) { +#ifdef PADDLE_WITH_CUDA + auto* cuda_dev_ctx = static_cast(dev_ctx); + if (cuda_dev_ctx) { + PADDLE_ENFORCE(cudaGetDevice(&device_)); + PADDLE_ENFORCE(cudaEventCreate(&event_)); + auto stream = cuda_dev_ctx->stream(); + PADDLE_ENFORCE(cudaEventRecord(event_, stream)); + has_cuda_ = true; + } +#endif + cpu_ns_ = GetTimeInNsec(); +} + +std::string Event::kind() const { + switch (kind_) { + case EventKind::kMark: + return "mark"; + case EventKind::kPushRange: + return "push"; + case EventKind::kPopRange: + return "pop"; + } + PADDLE_THROW("Unknown EventKind."); +} + +double Event::CpuElapsedUs(const Event& e) const { + return (e.cpu_ns_ - cpu_ns_) / (1000.0); +} + +double Event::CudaElapsedUs(const Event& e) const { +#ifdef PADDLE_WITH_CUDA + PADDLE_ENFORCE(e.has_cuda() && has_cuda()); + PADDLE_ENFORCE(e.device() == device()); + PADDLE_ENFORCE(cudaEventSynchronize(event_)); + PADDLE_ENFORCE(cudaEventSynchronize(e.event())); + float ms; + PADDLE_ENFORCE(cudaEventElapsedTime(&ms, event_, e.event())); + return ms * 1000.0; +#else + PADDLE_THROW("CUDA is not enabled"); +#endif +} + +#ifdef PADDLE_WITH_CUDA +static void ForEachDevice(std::function func) { + auto original_device = GetCurrentDeviceId(); + int count = GetCUDADeviceCount(); + for (int i = 0; i < count; i++) { + SetDeviceId(i); + func(i); + } + SetDeviceId(original_device); +} +#endif + +inline EventList& GetEventList() { + if (!g_event_list) { + std::lock_guard guard(g_all_event_lists_mutex); + g_event_list = std::make_shared(); + g_thread_id = g_next_thread_id++; + g_all_event_lists.emplace_front(g_event_list); + } + return *g_event_list; +} + +void Mark(const std::string& name, DeviceContext* dev_ctx) { + GetEventList().Record(EventKind::kMark, std::move(name), g_thread_id, + dev_ctx); +} + +RecordEvent::RecordEvent(const std::string& name, DeviceContext* dev_ctx) { + if (g_state == ProfilerState::kDisabled) return; + dev_ctx_ = dev_ctx; + GetEventList().Record(EventKind::kPushRange, std::move(name), g_thread_id, + dev_ctx_); +} + +RecordEvent::~RecordEvent() { + if (g_state == ProfilerState::kDisabled) return; + GetEventList().Record(EventKind::kPopRange, std::string(), g_thread_id, + dev_ctx_); +} void EnableProfiler(ProfilerState state) { PADDLE_ENFORCE(state != ProfilerState::kDisabled, "Can't enbale profling, since the input state is ", "ProfilerState::kDisabled"); - PADDLE_ENFORCE(kState == ProfilerState::kDisabled, + PADDLE_ENFORCE(g_state == ProfilerState::kDisabled, "The profiling state should be disabled when calling ", "EnableProfiler."); - kState = state; + g_state = state; #ifdef PADDLE_WITH_CUDA - auto ForEachDevice = [](std::function op) { - int count = GetCUDADeviceCount(); - for (int i = 0; i < count; i++) { - DeviceGuard dev_guard(i); - op(i); - } - }; - if (kState == ProfilerState::kCUDA) { + if (g_state == ProfilerState::kCUDA) { // Generate some dummy evenets first 
to reduce the startup overhead. for (int i = 0; i < 5; i++) { ForEachDevice([](int d) { - DeviceContext* dev_ctx = new CUDADeviceContext(GPUPlace(d)); + DeviceContext* dev_ctx = new CUDADeviceContext(CUDAPlace(d)); Mark("_cuda_startup_", dev_ctx); dev_ctx->Wait(); }); @@ -52,20 +151,20 @@ void EnableProfiler(ProfilerState state) { } #endif // Mark the profiling start. - Mark("_start_profiler_"); + Mark("_start_profiler_", nullptr); } std::vector> DisableProfiler() { - PADDLE_ENFORCE(kState != ProfilerState::kDisabled, + PADDLE_ENFORCE(g_state != ProfilerState::kDisabled, "Can't disable profiling, since it's not starting."); // Mark the profiling stop. - Mark("_stop_profiler_"); - kState = ProfilerState::kDisabled; + Mark("_stop_profiler_", nullptr); + g_state = ProfilerState::kDisabled; std::vector> result; - std::lock_guard guard(kAllEventListsMutex); - for (auto it = kAllEventLists.begin(); it != kAllEventLists.end(); ++it) { - auto& list = *it; - result.emplace_back(list->Reduce()); + std::lock_guard guard(g_all_event_lists_mutex); + for (auto it = g_all_event_lists.begin(); it != g_all_event_lists.end(); + ++it) { + result.emplace_back((*it)->Reduce()); } return result; } diff --git a/paddle/platform/profiler.h b/paddle/platform/profiler.h index 2242635024..47104ea9d0 100644 --- a/paddle/platform/profiler.h +++ b/paddle/platform/profiler.h @@ -24,76 +24,24 @@ namespace platform { enum EventKind { kMark, kPushRange, kPopRange }; -inline uint64_t GetTimeInNsec() { - // using std::chrono; - using clock = std::conditional::type; - return std::chrono::duration_cast( - clock::now().time_since_epoch()) - .count(); -} - class Event { public: - // the DeviceContext is used to get the cuda stream. + // The DeviceContext is used to get the cuda stream. + // If CPU profiling mode, can pass nullptr. 
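+  // A minimal usage sketch (illustrative; profiler_test.cc does the same,
+  // passing nullptr for the CPU-only case):
+  //   Event start(EventKind::kPushRange, "test", 0, dev_ctx);
+  //   ... code being timed ...
+  //   Event stop(EventKind::kPopRange, "test", 0, dev_ctx);
+  //   double us = start.CpuElapsedUs(stop);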
Event(EventKind kind, std::string name, uint32_t thread_id, - const platform::DeviceContext* dev_ctx = nullptr) - : kind_(kind), name_(std::move(name)), thread_id_(thread_id) { - has_cuda_ = false; -#ifdef PADDLE_WITH_CUDA - auto* cuda_dev_ctx = - static_cast(dev_ctx); - if (cuda_dev_ctx) { - PADDLE_ENFORCE(cudaGetDevice(&device_)); - PADDLE_ENFORCE(cudaEventCreate(&event_)); - auto stream = cuda_dev_ctx->stream(); - PADDLE_ENFORCE(cudaEventRecord(event_, stream)); - has_cuda_ = true; - } -#endif - cpu_ns_ = GetTimeInNsec(); - } - - std::string kind() const { - switch (kind_) { - case EventKind::kMark: - return "mark"; - case EventKind::kPushRange: - return "push"; - case EventKind::kPopRange: - return "pop"; - } - PADDLE_THROW("Unknown EventKind."); - } + DeviceContext* dev_ctx); + std::string kind() const; std::string name() const { return name_; } - bool has_cuda() const { return has_cuda_; } #ifdef PADDLE_WITH_CUDA cudaEvent_t event() const { return event_; } - int device() const { return device_; } #endif - double CpuElapsedUs(const Event& e) const { - return (e.cpu_ns_ - cpu_ns_) / (1000.0); - } - - double CudaElapsedUs(const Event& e) const { -#ifdef PADDLE_WITH_CUDA - PADDLE_ENFORCE(e.has_cuda() && has_cuda()); - PADDLE_ENFORCE(e.device() == device()); - PADDLE_ENFORCE(cudaEventSynchronize(event_)); - PADDLE_ENFORCE(cudaEventSynchronize(e.event())); - float ms; - PADDLE_ENFORCE(cudaEventElapsedTime(&ms, event_, e.event())); - return ms * 1000.0; -#else - PADDLE_THROW("CUDA is not enabled"); -#endif - } + double CpuElapsedUs(const Event& e) const; + double CudaElapsedUs(const Event& e) const; private: EventKind kind_; @@ -108,11 +56,11 @@ class Event { }; struct EventList { - constexpr static std::size_t kMB = 1024 * 1024; - constexpr static std::size_t kEventBlockSize = 16 * kMB; - constexpr static std::size_t kEventSize = sizeof(Event); - constexpr static std::size_t kEventAlign = alignof(Event); - constexpr static std::size_t kNumBlock = + constexpr static size_t kMB = 1024 * 1024; + constexpr static size_t kEventBlockSize = 16 * kMB; + constexpr static size_t kEventSize = sizeof(Event); + constexpr static size_t kEventAlign = alignof(Event); + constexpr static size_t kNumBlock = kEventBlockSize / ((kEventSize + kEventAlign - 1) / kEventAlign * kEventAlign); @@ -139,58 +87,27 @@ struct EventList { }; enum ProfilerState { - kDisabled, - kCPU, - kCUDA, + kDisabled, // disabled state + kCPU, // CPU profiling state + kCUDA, // GPU profiling state }; -// The profiler state, the initial value is ProfilerState::kDisabled -extern ProfilerState kState; -// The global mutex -extern std::mutex kAllEventListsMutex; -// The total event lists of all threads -extern std::list> kAllEventLists; -// The thread local event list only can be accessed by the specific thread -extern thread_local std::shared_ptr kEventList; -// The thread index of each thread -extern thread_local int32_t kThreadId; -// The kNextThreadId is a global counter for threads, by the kThreadId and -// kNextThreadId, we can know how many threads have created EventList. 
-extern uint32_t kNextThreadId; - -inline EventList& GetEventList() { - if (!kEventList) { - std::lock_guard guard(kAllEventListsMutex); - kEventList = std::make_shared(); - kThreadId = kNextThreadId++; - kAllEventLists.emplace_front(kEventList); - } - return *kEventList; -} - -inline void Mark(const std::string name, - const platform::DeviceContext* dev_ctx = nullptr) { - GetEventList().Record(EventKind::kMark, std::move(name), kThreadId, dev_ctx); -} +void Mark(const std::string& name, DeviceContext* dev_ctx); struct RecordEvent { - explicit RecordEvent(const std::string name, - platform::DeviceContext* dev_ctx = nullptr) { - if (kState == ProfilerState::kDisabled) return; - dev_ctx_ = dev_ctx; - GetEventList().Record(EventKind::kPushRange, std::move(name), kThreadId, - dev_ctx_); - } + explicit RecordEvent(const std::string& name, DeviceContext* dev_ctx); - ~RecordEvent() { - if (kState == ProfilerState::kDisabled) return; - GetEventList().Record(EventKind::kPopRange, std::string(), kThreadId, - dev_ctx_); - } - platform::DeviceContext* dev_ctx_; + ~RecordEvent(); + + // The device context is used by Event to get the current cuda stream. + DeviceContext* dev_ctx_; }; +// Enable the profiling function. void EnableProfiler(ProfilerState state); + +// Return the event list of all threads. Asummed the returned value calls +// event_lists, event_lists[i][j] represents the j-th Event of i-th thread. std::vector> DisableProfiler(); } // namespace platform diff --git a/paddle/platform/profiler_test.cc b/paddle/platform/profiler_test.cc index 5bd0a9d859..47cf7be146 100644 --- a/paddle/platform/profiler_test.cc +++ b/paddle/platform/profiler_test.cc @@ -19,13 +19,13 @@ TEST(Event, CpuElapsedTime) { using paddle::platform::Event; using paddle::platform::EventKind; - Event start_event(EventKind::kPushRange, "test", 0); + Event start_event(EventKind::kPushRange, "test", 0, nullptr); EXPECT_TRUE(start_event.has_cuda() == false); int counter = 0; while (counter != 1000) { counter++; } - Event stop_event(EventKind::kPopRange, "test", 0); + Event stop_event(EventKind::kPopRange, "test", 0, nullptr); EXPECT_GT(start_event.CpuElapsedUs(stop_event), 0); } @@ -33,11 +33,11 @@ TEST(Event, CpuElapsedTime) { TEST(Event, CudaElapsedTime) { using paddle::platform::DeviceContext; using paddle::platform::CUDADeviceContext; - using paddle::platform::GPUPlace; + using paddle::platform::CUDAPlace; using paddle::platform::Event; using paddle::platform::EventKind; - DeviceContext* dev_ctx = new CUDADeviceContext(GPUPlace(0)); + DeviceContext* dev_ctx = new CUDADeviceContext(CUDAPlace(0)); Event start_event(EventKind::kPushRange, "test", 0, dev_ctx); EXPECT_TRUE(start_event.has_cuda() == true); int counter = 0; @@ -60,10 +60,10 @@ TEST(RecordEvent, RecordEvent) { DeviceContext* dev_ctx = nullptr; #ifdef PADDLE_WITH_CUDA using paddle::platform::CUDADeviceContext; - using paddle::platform::GPUPlace; + using paddle::platform::CUDAPlace; state = ProfilerState::kCUDA; dev_ctx = - new paddle::platform::CUDADeviceContext(paddle::platform::GPUPlace(0)); + new paddle::platform::CUDADeviceContext(paddle::platform::CUDAPlace(0)); #endif EnableProfiler(state); From a6ff5240f519380257f206fbc9c7f720fff4badc Mon Sep 17 00:00:00 2001 From: guosheng Date: Fri, 29 Dec 2017 20:41:07 +0800 Subject: [PATCH 142/181] Refine the activation type of GRUOp by following comments --- paddle/operators/gru_op.h | 22 ++++++++++++---------- 1 file changed, 12 insertions(+), 10 deletions(-) diff --git a/paddle/operators/gru_op.h 
b/paddle/operators/gru_op.h index d773521259..b1957fb9ce 100644 --- a/paddle/operators/gru_op.h +++ b/paddle/operators/gru_op.h @@ -90,6 +90,10 @@ class GRUKernel : public framework::OpKernel { } auto batch_starts = batch_gate->lod()[0]; size_t num_batch = batch_starts.size() - 1; + auto active_node = math::detail::GetActivationType( + context.Attr("activation")); + auto active_gate = math::detail::GetActivationType( + context.Attr("gate_activation")); for (size_t n = 0; n < num_batch; n++) { int bstart = static_cast(batch_starts[n]); int bend = static_cast(batch_starts[n + 1]); @@ -102,11 +106,8 @@ class GRUKernel : public framework::OpKernel { gru_value.gate_value = gate_t.data(); gru_value.reset_output_value = reset_hidden_prev_t.data(); math::GRUUnitFunctor::compute( - dev_ctx, gru_value, frame_size, cur_batch_size, - math::detail::GetActivationType( - context.Attr("activation")), - math::detail::GetActivationType( - context.Attr("gate_activation"))); + dev_ctx, gru_value, frame_size, cur_batch_size, active_node, + active_gate); gru_value.prev_out_value = gru_value.output_value; } @@ -192,6 +193,10 @@ class GRUGradKernel : public framework::OpKernel { auto batch_starts = batch_hidden_grad.lod()[0]; size_t num_batch = batch_starts.size() - 1; + auto active_node = math::detail::GetActivationType( + context.Attr("activation")); + auto active_gate = math::detail::GetActivationType( + context.Attr("gate_activation")); for (int n = static_cast(num_batch) - 1; n >= 0; n--) { int bstart = static_cast(batch_starts[n]); int bend = static_cast(batch_starts[n + 1]); @@ -222,11 +227,8 @@ class GRUGradKernel : public framework::OpKernel { } math::GRUUnitGradFunctor::compute( - dev_ctx, gru_value, gru_grad, frame_size, cur_batch_size, - math::detail::GetActivationType( - context.Attr("activation")), - math::detail::GetActivationType( - context.Attr("gate_activation"))); + dev_ctx, gru_value, gru_grad, frame_size, cur_batch_size, active_node, + active_gate); } if (input_grad) { input_grad->mutable_data(context.GetPlace()); From 621663174c44957851296c5f548165bbe9f429ae Mon Sep 17 00:00:00 2001 From: Tao Luo Date: Mon, 1 Jan 2018 18:49:42 +0800 Subject: [PATCH 143/181] fix library not found for -lrt on MAC (#7119) --- paddle/pybind/CMakeLists.txt | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/paddle/pybind/CMakeLists.txt b/paddle/pybind/CMakeLists.txt index ced75cbfd8..7b37430707 100644 --- a/paddle/pybind/CMakeLists.txt +++ b/paddle/pybind/CMakeLists.txt @@ -3,7 +3,9 @@ if(WITH_PYTHON) SRCS pybind.cc exception.cc protobuf.cc const_value.cc DEPS pybind python backward proto_desc paddle_memory executor prune init ${GLOB_OP_LIB}) - target_link_libraries(paddle_pybind rt) + if(NOT APPLE AND NOT ANDROID) + target_link_libraries(paddle_pybind rt) + endif(NOT APPLE AND NOT ANDROID) endif(WITH_PYTHON) if(WITH_DOC) From 8543ad64635294c0dc52cd6701f076cd3d5981a2 Mon Sep 17 00:00:00 2001 From: fengjiayi Date: Mon, 1 Jan 2018 20:21:41 +0800 Subject: [PATCH 144/181] update backward doc --- paddle/framework/backward.md | 183 +++++++++++++++++++---------- python/paddle/v2/fluid/backward.py | 26 ++-- 2 files changed, 136 insertions(+), 73 deletions(-) diff --git a/paddle/framework/backward.md b/paddle/framework/backward.md index ac60be5724..acc95e99c4 100644 --- a/paddle/framework/backward.md +++ b/paddle/framework/backward.md @@ -1,100 +1,161 @@ -# Operator/expression 's Backward +# Backward Building ## Motivation -In Neural Network, most models are solved by the backpropagation algorithm(known 
as **BP**) at present. Technically, BP calculates the gradient of the loss function, then propagates it back through the networks following the chain rule. Hence we need a module that chains the gradient operators/expressions together to construct the backward pass. Every forward network needs a backward network to construct the full computation graph. The operator/expression's backward pass will be generated with respect to the forward pass. +In Neural Network, most models are solved by the backpropagation algorithm(known as **BP**) at present. Technically, BP calculates the gradient of the loss function, then propagates it back through the networks following the chain rule. However, when configuring the model structure, users do not need to definate the backward part. So a mechanism is required by the framework which is able to complete the model's backward part automatically acoording to the given forward part. -## Implementation - -In this design doc, we exported only one API for generating the backward pass. - -```c++ -std::unique_ptr Backward(const OperatorBase& forwardOp, - const std::unordered_set& no_grad_vars); -``` +When implementing a certain `op`, the developer is also asked to implement its backward version, called `grad_op`. A `grad_op` takes gradients of its corresponding `op`'s outputs, and calculate gradients of the `op`'s inputs. During the building of a model's backward part, the framework creates each forward `op`'s `grad_op`, and then string them together in reverse order of forward part. In this way, gradients spread from the end to the beginning of the model, in other word, from the loss to parameters. -The implementation behind it can be divided into two parts, **Backward Operator Creating** and **Backward Operator Building**. +## Challenges -### Backward Operator Registry +The motivation of backward building is obvious. However, to implement it correctly is not so easy. In the **Fluid** design, a deep learning model is described by `Program`, `Block`, `Op` and `Variable`. The `Block` itself can be nested. It means that the `op`s and `variable`s are scattered across different blocks rather than all be gathered in a single graph. Our backward building algorithm shall visit blocks in recursive order and be able to insert `grad_op`s and new created `variable`s into right place. -A backward network is built up with several backward operators. Backward operators take forward operators' inputs, outputs, and output gradients and then calculate its input gradients. +## Usage -| | forward operator | backward operator -| ---------------------- | ---------------- |------------------------- | -| **Operator::inputs_** | Inputs | Inputs, Outputs, OutputGradients | -| **Operator::outputs_** | Outputs | InputGradients | +Although the whole algorithm is comprised of many functions, only one is exposed as API: - In most cases, there is a one-to-one relation between the forward and backward operators. These relations are recorded by a global hash map(`OpInfoMap`). To follow the philosophy of minimum core and to make operators pluggable, the registry mechanism is introduced. +```python +def append_backward(loss, parameter_list=None, no_grad_set=None): + """ + Append backward part to main_program -For example, we have `mul_op`, and we can register its information and corresponding backward operator by the following macro: + Args: + loss(Variable): The variable generated by cost function. + parameter_list(list): Parameters that need to be updated by optimizer. 
+ If None, it means all parameters need to be updated. -```cpp -REGISTER_OP(mul, MulOp, MulOpMaker, mul_grad, MulOpGrad); + no_grad_set(set): Variables that have no gradients in Block 0. + If None, the set will be generated inside the function and + contains all variables with `step_gradient=True` from all blocks. + + Return: + (list[Variable]): list of (parameters, gradients) pair. + """ ``` -`mul` is the operator's type. `MulOp` and `MulOpMaker` are the operator class and the operator maker class respectively. +By invoking this API, the framework appends backward part for the program where the `loss` is. It takes three arguments. `loss` means the final loss value. It must be a scalar and is usually the output of the loss layer. It is also where the gradient generated and backpropagation starts. `parameter_list` marks all parameters needs updating. If it's `None`, all parameter will be updated by optimizers. `no_grad_set` marks variables without gradient. if all outputs of some `grad_op` are in `no_grad_set`, the `grad_op` will not be run. -`mul_grad` is the type of backward operator, and `MulOpGrad` is its class name. +This API will be invoked automatically before optimizer building. +As a result, in most cases users do not need to invoke the API by themselves to append backward part. -### Backward Opeartor Creating - -Given a certain forward operator, we can get its corresponding backward operator by calling: +## Implementation -```cpp -OperatorBase* bwd_op = BuildGradOp(const OperatorBase* fwd_op); +The implementation of backward building algorithm is in `backward.py` file. The whole algorithm can be divided to two independent parts: creating of `grad_op`s and creating of new variables. + +### Creating `grad_op`s + +The creating of `grad_op`s is implemented by: + +```python +def _append_backward_ops_(target, + block, + target_block, + no_grad_dict, + grad_to_var): + """ + Create all grad ops, and insert them into given block + + Args: + target(Variable): the target variable of forward pass + block(Block): the block where forward ops are + target_block(Block): the block which is going to hold new generated grad ops + no_grad_dict(dict): + key(int) block index + val(set) a set of varibale names. These varibales have no gradient + grad_to_var(dict)(output argument): + key(str): grad variable name + val(str): corresponding forward variable name + """ ``` -The function `BuildGradOp` will sequentially execute following processes: - -1. Get the `type_` of given forward operator, and then get the corresponding backward operator's type by looking up the `OpInfoMap`. - -2. Build two maps named `inputs` and `outputs` to temporarily store backward operator's inputs and outputs. Copy forward operator's `inputs_` and `outputs_` to map `inputs`, except these, are not necessary for gradient computing. +Given a `block`, the function will traverses all `op`s in this block in reverse order, gets corresponding `grad_op` from the C++ core via `core.get_grad_op_desc()`, then append it to `target_block`. -3. Add forward inputs' gradient variables into map `output`, adding forward outputs' gradient variables into map `input`. +However, some specific `op`(e.g. `while_op`, `if_else_op`) can hold its own sub-block. For these sub-blocks contains `op`s as well, the `grad_op` creating should be recursive. -4. Building backward operator with `inputs`, `outputs` and forward operator's attributes. +During the reverse traversal, we check each `op` whether it has an attribute named `sub_block`. 
If so, it means there is a sub-block and we need to deal with it first. After creating a new block whose father is the one in `op`'s attribute, we invoke `_append_backward_ops_()` recursively, assigning the new block to the parameter `target_block` and the one in `op`'s attribute to `block`. The *pseudo-code* shows this process:

```
******* pseudo-code ********
for op in reversed(block.ops):
    if op has an attribute named 'sub_block':
        Get the sub-block(`s_block`) from op's attribute.
        Create a new block(`grad_s_block`), whose father is `s_block`.
        Invoke _append_backward_ops_(), with `block=s_block` and `target_block=grad_s_block`

    Invoke `core.get_grad_op_desc()` to get op's grad_op.
    Insert name correspondences between variables and their gradients of the grad_op into grad_to_var.
    Assign grad_s_block to grad_op as its 'sub_block' attribute.
    Append grad_op to current target_block.
```

The first invocation of `_append_backward_ops_()` is initiated by `append_backward()`, in which the parameters `block` and `target_block` are both assigned the root block (the block with index 0).

### Corner Cases of `grad_op` Creating

In the previous section, we showed the regular process of `grad_op` creating. However, in some corner cases the regular algorithm is not enough to get the correct result, and additional handling is required. These additional processes run after the above-mentioned algorithm and make some special adjustments to its output `grad_op`s.

#### Shared Variables

If a variable is read by more than one `op` in the forward pass, its gradient is likely to be written by more than one `grad_op` in the following backward pass. To make the gradient result the sum of all `grad_op`s' outputs instead of that of the last one to run, we assign each output a temporary variable and then add a `sum_op` to add them up.

For debugging convenience, if the final gradient name is `w@GRAD`, its corresponding temporary variables will be named `w@GRAD@RENAME@0`, `w@GRAD@RENAME@1`...
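The following toy sketch illustrates the renaming scheme just described. It is illustrative only: the helper name `addup_repetitive_outputs` and the list-of-output-lists representation are assumptions made for the example, not the real `_addup_repetitive_outputs_` code in `backward.py`.

```python
# A toy model of the renaming scheme described above. The helper name and
# the list-of-output-lists representation are illustrative assumptions;
# the real implementation is `_addup_repetitive_outputs_` in backward.py.
def addup_repetitive_outputs(grad_op_output_lists, var_name):
    renamed = []
    for idx, outputs in enumerate(grad_op_output_lists):
        for i, name in enumerate(outputs):
            if name == var_name:
                # give each grad_op its own temporary output
                outputs[i] = "%s@RENAME@%d" % (var_name, idx)
                renamed.append(outputs[i])
    return renamed

# Two grad_ops both writing `w@GRAD` get distinct temporary outputs:
print(addup_repetitive_outputs([["w@GRAD"], ["w@GRAD"]], "w@GRAD"))
# -> ['w@GRAD@RENAME@0', 'w@GRAD@RENAME@1']
```

A `sum_op` taking all the `@RENAME@` variables as inputs and writing `w@GRAD` as its output then produces the summed gradient.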
[figures: Figure 1. Sharing variables in operators. / Figure 2. Replace sharing variable's gradient with `Add` operator.]
+### Creating Backward Variables -​ Because the framework finds variables according to their names, we need to rename the output links. We add an integer suffix to represent its position in the clockwise direction. +Up to now, we have completed all creating and adjusting jobs of `grad_op`s. However, backward variables have not been created. Now they are only represented by `grad_op`'s input and output arguments. The backward variable creating job will be done by: -5. Part of the Gradient is Zero. +```python +def _append_backward_vars_(block, + start_op_idx, + grad_to_var, + grad_info_map): + """ + Create new variables required by backward pass. - In the whole graph, there is some case of that one operator's gradient is not needed, but its input's gradient is a dependency link of other operator, we need to fill a same shape gradient matrix in the position. In our implementation, we insert a special `fillZeroLike` operator. + Args: + block(Block): the block where new variables will be created + start_op_idx(int): Only variables required by ops in block.ops[start_op_idx : ] will be created + grad_to_var(dict): + key(str): grad variable name + val(str): corresponding forward variable name + In most cases, this dict is generated by _append_backward_ops_() + grad_info_map(dict)(output argument): + key(str): forward variable name + val(tuple): a tuple of (str, int), str is the corresponding grad name, int is the block index + """ +``` +Given a `block`, this function traverses all the `grad_op`s in it(The argument `start_op_idx` indicates where the grad_op sequence starts.) and creates all the uncreated outputs. The *pseudo-code* shows this process: -Follow these rules above, then collect the sub graph `OutputGradients`/`InputGradients` as the NetOp's and return it. +``` +for op in block.ops[start_op_idx : ]: + + if op has an attribute named 'sub_block': + Get the sub-block(`s_block`) from op's attribute. + Invoke _append_backward_vars_(), with `block=s_block` + + for var_name in op.all_output_names(): + if block.has_var_recursive(var_name) or var_name is the name of empty variable: + continue + create a new variable named 'var_name' in block + if grad_to_var.has_key(var_name): + set grad_info_map[grad_to_var[var_name]] as a tuple of (var_name. block) + + do op's var type inference + do op's shape inference +``` diff --git a/python/paddle/v2/fluid/backward.py b/python/paddle/v2/fluid/backward.py index b3c1bab298..f11c83f59c 100644 --- a/python/paddle/v2/fluid/backward.py +++ b/python/paddle/v2/fluid/backward.py @@ -176,6 +176,7 @@ def _append_backward_ops_(target, key(str): grad variable name val(str): corresponding forward variable name """ + # grad_op_descs holds created grad_op, and will be appended to target_block grad_op_descs = [] program = block.program for op in reversed(block.ops): @@ -188,6 +189,7 @@ def _append_backward_ops_(target, no_grad_dict, grad_to_var, callback) grad_sub_block_list.append(grad_sub_block.desc) + # Getting op's corresponding grad_op grad_op_desc, op_grad_to_var = core.get_grad_op_desc( op.desc, no_grad_dict[block.idx], grad_sub_block_list) grad_op_descs.extend(grad_op_desc) @@ -254,18 +256,18 @@ def _append_backward_vars_(block, start_op_idx, grad_to_var, grad_info_map): def append_backward(loss, parameter_list=None, no_grad_set=None): """ - Create and add gradient Operators in BlockDesc to compute - gradients of `loss` for parameters in parameter_list - - :param loss: an variable generated by cost function. 
- :type loss: Variable - :param no_grad_dict: variable that should not create gradient - :type no_grad_dict: set - :param parameter_list: parameters that need to compute gradient and - update to optimize the lost. - :type: list - :return: list of (parameters, gradients) pair. - :rtype: list[Variable] + Append backward part to main_program + + Args: + loss(Variable): The variable generated by cost function. + parameter_list(list): Parameters that need to be updated by optimizer. + If None, it means all parameters need to be updated. + no_grad_set(set): Variables that have no gradients in Block 0. + If None, the set will be generated inside the function and + contains all variables with `step_gradient=True` from all blocks. + + Return: + (list[Variable]): list of (parameters, gradients) pair. """ assert isinstance(loss, framework.Variable) From a0e70cb1990a8143060e7b156de06391d962a850 Mon Sep 17 00:00:00 2001 From: fengjiayi Date: Mon, 1 Jan 2018 20:22:51 +0800 Subject: [PATCH 145/181] move backward doc postion --- {paddle/framework => doc}/backward.md | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename {paddle/framework => doc}/backward.md (100%) diff --git a/paddle/framework/backward.md b/doc/backward.md similarity index 100% rename from paddle/framework/backward.md rename to doc/backward.md From d52fd00a66c6278eb718d378ea67226cdb1633b6 Mon Sep 17 00:00:00 2001 From: sweetsky0901 Date: Mon, 1 Jan 2018 22:11:17 +0800 Subject: [PATCH 146/181] int to size_t --- paddle/operators/detection_output_op.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/paddle/operators/detection_output_op.h b/paddle/operators/detection_output_op.h index cd6417087a..f8abd5b640 100644 --- a/paddle/operators/detection_output_op.h +++ b/paddle/operators/detection_output_op.h @@ -63,7 +63,7 @@ class DetectionOutputKernel : public framework::OpKernel { float nms_threshold = context.template Attr("nms_threshold"); float confidence_threshold = context.template Attr("confidence_threshold"); - int batch_size = in_conf->dims()[1]; + size_t batch_size = in_conf->dims()[1]; int conf_sum_size = in_conf->numel(); // for softmax std::vector conf_shape_softmax_vec( From deacfa9eb9c7e8cd55dd16a5b25424c7d9d04b9e Mon Sep 17 00:00:00 2001 From: fengjiayi Date: Tue, 2 Jan 2018 01:04:32 +0800 Subject: [PATCH 147/181] fix typo --- doc/{ => design}/backward.md | 29 ++++++++---------- .../design}/images/duplicate_op.graffle | Bin .../design}/images/duplicate_op.png | Bin .../design}/images/duplicate_op2.graffle | Bin .../design}/images/duplicate_op2.png | Bin 5 files changed, 12 insertions(+), 17 deletions(-) rename doc/{ => design}/backward.md (68%) rename {paddle/framework => doc/design}/images/duplicate_op.graffle (100%) rename {paddle/framework => doc/design}/images/duplicate_op.png (100%) rename {paddle/framework => doc/design}/images/duplicate_op2.graffle (100%) rename {paddle/framework => doc/design}/images/duplicate_op2.png (100%) diff --git a/doc/backward.md b/doc/design/backward.md similarity index 68% rename from doc/backward.md rename to doc/design/backward.md index acc95e99c4..85f45b5c74 100644 --- a/doc/backward.md +++ b/doc/design/backward.md @@ -2,13 +2,13 @@ ## Motivation -In Neural Network, most models are solved by the backpropagation algorithm(known as **BP**) at present. Technically, BP calculates the gradient of the loss function, then propagates it back through the networks following the chain rule. However, when configuring the model structure, users do not need to definate the backward part. 
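[Editor's illustration, not part of the patch series: a minimal usage sketch of the `append_backward` interface documented in the patch above. The layer calls and the `fluid.backward` import path are assumptions based on the v2 fluid API of this period.]

```python
import paddle.v2.fluid as fluid

# A tiny forward network that ends in a scalar loss.
x = fluid.layers.data(name='x', shape=[13], dtype='float32')
y = fluid.layers.data(name='y', shape=[1], dtype='float32')
y_predict = fluid.layers.fc(input=x, size=1)
cost = fluid.layers.square_error_cost(input=y_predict, label=y)
avg_cost = fluid.layers.mean(x=cost)

# Append the backward part; a list of (parameter, gradient) pairs is
# returned. Optimizers invoke this internally before optimization, so
# most users never need to call it themselves.
params_grads = fluid.backward.append_backward(loss=avg_cost)
```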
From a0e70cb1990a8143060e7b156de06391d962a850 Mon Sep 17 00:00:00 2001
From: fengjiayi
Date: Mon, 1 Jan 2018 20:22:51 +0800
Subject: [PATCH 145/181] move backward doc position

---
 {paddle/framework => doc}/backward.md | 0
 1 file changed, 0 insertions(+), 0 deletions(-)
 rename {paddle/framework => doc}/backward.md (100%)

diff --git a/paddle/framework/backward.md b/doc/backward.md
similarity index 100%
rename from paddle/framework/backward.md
rename to doc/backward.md

From d52fd00a66c6278eb718d378ea67226cdb1633b6 Mon Sep 17 00:00:00 2001
From: sweetsky0901
Date: Mon, 1 Jan 2018 22:11:17 +0800
Subject: [PATCH 146/181] int to size_t

---
 paddle/operators/detection_output_op.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/paddle/operators/detection_output_op.h b/paddle/operators/detection_output_op.h
index cd6417087a..f8abd5b640 100644
--- a/paddle/operators/detection_output_op.h
+++ b/paddle/operators/detection_output_op.h
@@ -63,7 +63,7 @@ class DetectionOutputKernel : public framework::OpKernel {
     float nms_threshold = context.template Attr("nms_threshold");
     float confidence_threshold =
         context.template Attr("confidence_threshold");
-    int batch_size = in_conf->dims()[1];
+    size_t batch_size = in_conf->dims()[1];
     int conf_sum_size = in_conf->numel();
     // for softmax
     std::vector conf_shape_softmax_vec(

From deacfa9eb9c7e8cd55dd16a5b25424c7d9d04b9e Mon Sep 17 00:00:00 2001
From: fengjiayi
Date: Tue, 2 Jan 2018 01:04:32 +0800
Subject: [PATCH 147/181] fix typo

---
 doc/{ => design}/backward.md             | 29 ++++++++----------
 .../design}/images/duplicate_op.graffle  | Bin
 .../design}/images/duplicate_op.png      | Bin
 .../design}/images/duplicate_op2.graffle | Bin
 .../design}/images/duplicate_op2.png     | Bin
 5 files changed, 12 insertions(+), 17 deletions(-)
 rename doc/{ => design}/backward.md (68%)
 rename {paddle/framework => doc/design}/images/duplicate_op.graffle (100%)
 rename {paddle/framework => doc/design}/images/duplicate_op.png (100%)
 rename {paddle/framework => doc/design}/images/duplicate_op2.graffle (100%)
 rename {paddle/framework => doc/design}/images/duplicate_op2.png (100%)

diff --git a/doc/backward.md b/doc/design/backward.md
similarity index 68%
rename from doc/backward.md
rename to doc/design/backward.md
index acc95e99c4..85f45b5c74 100644
--- a/doc/backward.md
+++ b/doc/design/backward.md
@@ -2,13 +2,13 @@

 ## Motivation

-In Neural Network, most models are solved by the backpropagation algorithm(known as **BP**) at present. Technically, BP calculates the gradient of the loss function, then propagates it back through the networks following the chain rule. However, when configuring the model structure, users do not need to definate the backward part. So a mechanism is required by the framework which is able to complete the model's backward part automatically acoording to the given forward part.
+In Neural Network, most models are solved by the backpropagation algorithm(known as **BP**) at present. Technically, BP calculates the gradient of the loss function, then propagates it back through the networks following the chain rule. However, when configuring the model structure, users do not need to define the backward part. So a mechanism is required by the framework which can complete the model's backward part automatically according to the given forward part.

-When implementing a certain `op`, the developer is also asked to implement its backward version, called `grad_op`. A `grad_op` takes gradients of its corresponding `op`'s outputs, and calculate gradients of the `op`'s inputs. During the building of a model's backward part, the framework creates each forward `op`'s `grad_op`, and then string them together in reverse order of forward part. In this way, gradients spread from the end to the beginning of the model, in other word, from the loss to parameters.
+When implementing a specific `op`, the developer is also asked to implement its backward version, called `grad_op`. A `grad_op` takes gradients of its corresponding `op`'s outputs, and calculate gradients of the `op`'s inputs. During the building of a model's backward part, the framework creates each forward `op`'s `grad_op`, and then string them together in reverse order of forwarding part. In this way, gradients spread from the end to the beginning of the model, in another word, from the loss to parameters.

 ## Challenges

-The motivation of backward building is obvious. However, to implement it correctly is not so easy. In the **Fluid** design, a deep learning model is described by `Program`, `Block`, `Op` and `Variable`. The `Block` itself can be nested. It means that the `op`s and `variable`s are scattered across different blocks rather than all be gathered in a single graph. Our backward building algorithm shall visit blocks in recursive order and be able to insert `grad_op`s and new created `variable`s into right place.
+The motivation of backward building is apparent. However, implementation it correctly is not so easy. In the **Fluid** design, a deep learning model is described by `Program`, `Block`, `Op` and `Variable`. The `Block` itself can be nested. It means that the `op`s and `variable`s are scattered across different blocks rather than all be gathered in a single graph. Our backward building algorithm shall visit blocks in recursive order and be able to insert `grad_op`s and new created `variable`s into the right place.

 ## Usage

@@ -20,8 +20,8 @@ def append_backward(loss, parameter_list=None, no_grad_set=None):
     Append backward part to main_program

     Args:
-        loss(Variable): The variable generated by cost function.
-        parameter_list(list): Parameters that need to be updated by optimizer.
+        loss(Variable): The variable generated by the cost function.
+        parameter_list(list): Parameters that need to be updated by optimizers.
             If None, it means all parameters need to be updated.

         no_grad_set(set): Variables that have no gradients in Block 0.
@@ -33,14 +33,14 @@ def append_backward(loss, parameter_list=None, no_grad_set=None):
     """
 ```

-By invoking this API, the framework appends backward part for the program where the `loss` is. It takes three arguments. `loss` means the final loss value. It must be a scalar and is usually the output of the loss layer. It is also where the gradient generated and backpropagation starts. `parameter_list` marks all parameters needs updating. If it's `None`, all parameter will be updated by optimizers. `no_grad_set` marks variables without gradient. if all outputs of some `grad_op` are in `no_grad_set`, the `grad_op` will not be run.
+By invoking this API, the framework appends backward part of the program where the `loss` is. It takes three arguments. `loss` means the final loss value. It must be a scalar and is usually the output of the loss layer. It is also where the gradient generated and backpropagation starts. `parameter_list` marks all parameters needs updating. If it's `None`, all parameter will be updated by optimizers. `no_grad_set` marks variables without gradient. if all outputs of some `grad_op` are in `no_grad_set`, the `grad_op` will not be run.

 This API will be invoked automatically before optimizer building.
-As a result, in most cases users do not need to invoke the API by themselves to append backward part.
+As a result, in most cases, users do not need to invoke the API by themselves to append backward part.

 ## Implementation

-The implementation of backward building algorithm is in `backward.py` file. The whole algorithm can be divided to two independent parts: creating of `grad_op`s and creating of new variables.
+The implementation of backward building algorithm is in `backward.py` file. The whole algorithm can be divided into two independent parts: creating of `grad_op`s and creating new variables.

 ### Creating `grad_op`s

@@ -92,24 +92,19 @@ The first invoking of `_append_backward_ops_()` is initiated by `append_backward`

 ### Corner Cases of `grad_op` Creating

-In the previous section, we show the regular process of `grad_op` creating. However, in some corner cases, regular algorithm is not enough to get the correct result and appending handling is required. These addtional processes run after the above-mentioned algorithm and do some special adjusts on its output `grad_op`s.
+In the previous section, we show the regular process of `grad_op` creating. However, in some corner cases, the conventional algorithm is not enough to get the correct result and appending handling is required. These additional processes run after the algorithm mentioned above and do some special adjusts on its output `grad_op`s.

 #### Shared Variables

-If a variable is readed by more than one `op` in the forward pass, its gradient is likey to be written by more than one `grad_op`s in the following backward pass. To make the gradient result being the sum of all `grad_op`s' outputs instead of the last running one, we assign each output with a temporary variables, and then add a `sum_op` to add them up.
+If a variable is read by more than one `op` in the forward pass, its gradient is likely to be written by more than one `grad_op`s in the next backward pass. To make the gradient result being the sum of all `grad_op`s' outputs instead of the last running one, we assign each output with a temporary variable and then add a `sum_op` to add them up.

-For the debug convinience, if the final gradient name is `w@GRAD`, it's corresponding temporary variables will be named as `w@GRAD@RENAME@0`, `w@GRAD@RENAME@1`...
-
-
-
+For the debug convenience, if the final gradient name is `w@GRAD`, it's corresponding temporary variables will be named as `w@GRAD@RENAME@0`, `w@GRAD@RENAME@1`...

 See function `_addup_repetitive_outputs_` in `backward.py` for implementation details.

 #### No Gradient Variables

-In our framework, variables can be marked as *no_gradient*, it means that the gradient of this variable is unnecessary and can be considered as zero in model training. Obviously, when all the outputs of some `grad_op` is marked as *no_gradient*, the `grad_op` itself can be skipped in backward pass.
+In our framework, variables can be marked as *no_gradient*, it means that the gradient of this variable is unnecessary and can be considered as zero in model training. Apparently, when all the outputs of some `grad_op` are marked as *no_gradient*, the `grad_op` itself can be skipped in backward pass.

 But these unnecessary gradients still need to be creating and initialized by something, otherwise following `grad_op`s who take these gradients as inputs take the risk of using uninitialized memory. In our code, we employ `fill_zeros_like_op` to initialize them as all zeros.

diff --git a/paddle/framework/images/duplicate_op.graffle b/doc/design/images/duplicate_op.graffle
similarity index 100%
rename from paddle/framework/images/duplicate_op.graffle
rename to doc/design/images/duplicate_op.graffle
diff --git a/paddle/framework/images/duplicate_op.png b/doc/design/images/duplicate_op.png
similarity index 100%
rename from paddle/framework/images/duplicate_op.png
rename to doc/design/images/duplicate_op.png
diff --git a/paddle/framework/images/duplicate_op2.graffle b/doc/design/images/duplicate_op2.graffle
similarity index 100%
rename from paddle/framework/images/duplicate_op2.graffle
rename to doc/design/images/duplicate_op2.graffle
diff --git a/paddle/framework/images/duplicate_op2.png b/doc/design/images/duplicate_op2.png
similarity index 100%
rename from paddle/framework/images/duplicate_op2.png
rename to doc/design/images/duplicate_op2.png
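[Editor's illustration, not part of any patch: the shared-variable rule in the backward.md text above, shown on a concrete op list in the document's pseudo-code style. The op names are invented; the renaming convention follows the `w@GRAD@RENAME@` scheme the document describes.]

```
# Forward pass: parameter `w` is read by two ops.
fc_0: (x, w) -> out0
fc_1: (out0, w) -> out1

# A naive backward pass would let both grad ops write `w@GRAD`, so the
# later write would overwrite the earlier one. `_addup_repetitive_outputs_`
# renames the duplicated outputs and appends a `sum_op`:
fc_1_grad: ... -> w@GRAD@RENAME@0
fc_0_grad: ... -> w@GRAD@RENAME@1
sum_op: (w@GRAD@RENAME@0, w@GRAD@RENAME@1) -> w@GRAD
```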
From 46a69e995f6e0e0ac450a25bfe2216ed4932bfb2 Mon Sep 17 00:00:00 2001
From: fengjiayi
Date: Tue, 2 Jan 2018 01:08:27 +0800
Subject: [PATCH 148/181] fix typo

---
 doc/design/backward.md | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/doc/design/backward.md b/doc/design/backward.md
index 85f45b5c74..35f03692bb 100644
--- a/doc/design/backward.md
+++ b/doc/design/backward.md
@@ -40,7 +40,7 @@ As a result, in most cases, users do not need to invoke the API by themselves to

 ## Implementation

-The implementation of backward building algorithm is in `backward.py` file. The whole algorithm can be divided into two independent parts: creating of `grad_op`s and creating new variables.
+The implementation of backward building algorithm is in `backward.py` file. The whole algorithm can be divided into two independent parts: creating `grad_op`s and creating new variables.

 ### Creating `grad_op`s

@@ -108,7 +108,7 @@ In our framework, variables can be marked as *no_gradient*, it means that the gr

 But these unnecessary gradients still need to be creating and initialized by something, otherwise following `grad_op`s who take these gradients as inputs take the risk of using uninitialized memory. In our code, we employ `fill_zeros_like_op` to initialize them as all zeros.

-This features are implemented in function `_remove_no_grad_branch_`. It checks new created `grad_op`'s one-by-one, removes whose outputs are all in `no_grad_set` or inserts `fill_zeros_like_op` when its necessary. We can get the `no_grad_set` from the `_append_backward_ops_` argument `no_grad_dict` or generate it on fly by scanning all variables' `no_gradient` attribute(True or False).
+This features are implemented in function `_remove_no_grad_branch_`. It checks new created `grad_op`s one-by-one, removes whose outputs are all in `no_grad_set` or inserts `fill_zeros_like_op` when its necessary. We can get the `no_grad_set` from the `_append_backward_ops_` argument `no_grad_dict` or generate it on the fly by scanning all variables' `no_gradient` attribute(True or False).

 ### Creating Backward Variables

From 2a5c6a4435157868e337109a77cbed7320a3e7a3 Mon Sep 17 00:00:00 2001
From: qingqing01
Date: Fri, 29 Dec 2017 13:44:40 +0800
Subject: [PATCH 149/181] Fix the typo error and add more comments.

---
 doc/design/profiler.md | 42 ++++++++++++++++++++++--------------------
 1 file changed, 22 insertions(+), 20 deletions(-)

diff --git a/doc/design/profiler.md b/doc/design/profiler.md
index 3b95bf0065..b20b5efdc1 100644
--- a/doc/design/profiler.md
+++ b/doc/design/profiler.md
@@ -1,44 +1,46 @@
 ## Introduction

-There are many performance analysis tools for [different programming languages and different software frameworks](https://en.wikipedia.org/wiki/List_of_performance_analysis_tools). For most popular deep learning frameworks, they use several programming languages and adapt to heterogeneous platforms. Similar to most of the deep learning frameworks, PaddlePaddle also uses C++, CUDA and Python as the basic programming languages to adapt to run on CPU and GPU devices. The [`nvprof` tools](http://docs.nvidia.com/cuda/profiler-users-guide/index.html#nvprof-overview) is usually used to analyse the CUDA program. We have [a document](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/howto/optimization/cpu_profiling.md) to profile CPU and Python program by [yep](https://pypi.python.org/pypi/yep) and [Google's perftools](https://github.com/google/pprof) to profile only the CPU and Python program. But for [PaddlePaddle fluid](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/fluid.md), the operator is the basic computing unit. The developers usually want to collect the time of each operator and locate bottlenecks. The `nvprof` usually collect the timeline of CUDA-related activities on both CPU and GPU, including kernel execution, memory transfers, memory set and CUDA API calls and events or metrics for CUDA kernels. And the `yep` and `Google's perftools` can't collect the timeline for CUDA program. All these tools can't collect time in the operator level. So we design this profiling tool.

 ## Architecture

-The work flow for most task is as follows. Each operator will run many times in the all iterations. So the profiler must collect the total time of each operator during the iteration. For more, sometimes, the developers want to collect more detailed time span inside the operator or record time span for elsewhere, this requires that the profiler must support to record the nested time span. And in order to speed training, all the deep learning framework supports parallel computing, including multi-threads on CPU and multi-GPUs. So the profiler must enable to collect the timeline for each thread. In addition, the profiler also occupies certain resources. It must can be easily to enable or disable by the developers. At last, the profiler should show a human-readable report.
+The work flow for most task is as follows. Each operator will run many times in the all iterations. So the profiler must collect the total time of each operator during the iteration. For more, sometimes, the developers may want to collect more detailed time span inside the operator or record time span for elsewhere, this requires that the profiler must support to record the nested time span. And in order to speedup training, all the deep learning frameworks support parallel computing, including multiple threads on CPU and multiple GPUs. So the profiler must be able to collect the timeline for each thread. In addition, the profiler also occupies certain resources. It must can be easily to be enabled or disabled by the developers. At last, the profiler should present a human-readable report.

 ```python
 for i in xrange(M):  # M is the iteration number
-  for op in operator_lists: # The `operator_lists` is the all operators in the network graph.
+  for op in operator_lists: # The `operator_lists` contains all the operators in the network.
     op.run();
 ```

-In a summary, the proflier should have follow features:
+In summary, the proflier should have following features:

- records time span in loop.
- supports nested time span.
- supports multiple threads/multiple GPUs.
- supports to be enabled and disabled by users.

-But how to record the time for the mixed C++ and CUDA program? There many C++ interfaces to get the current calendar time in host program. But for GPU, the CUDA kernels may be executed concurrently if they are in different streams (http://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#streams) and the CUDA kernels is asynchronous with the host program if there is no the synchronous aftern the CUDA kernels. The CUDA provides [event](http://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#events) to monitor the device's perform accurate timing. Inspired by PyTorch and CUDA event, we also design and apply the events to record the timeline. Then summary and show statistics based on these events.
+But how to record the time for the mixed C++ and CUDA program? There many C++ APIs to get the current calendar time in host program. But for GPU, the CUDA kernels may be executed concurrently if they are in different [streams](http://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#streams) and the CUDA kernels is asynchronous with the host program if there is no the synchronous aftern the CUDA kernels. CUDA provides [event](http://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#events) to monitor the device and perform accurate timing. Inspired by PyTorch and CUDA event, we also design and apply the events to record the timeline. Then summarize and present statistics based on these events.

-The overall flow is shown as following figure.
+The overall flow is shown as the following figure.

 ### Event

-In above work flow, a pair of events are needed before and aftern the piece of code to collect time. So the event has a flag to mark it is starting event or ending event. There three kinds of event:
+In above work flow, a pair of events are needed before and after the piece of code to collect time. So the event has a flag to mark whether it is a starting event or an ending event. Except this two kinds of event, sometime, a only marker with a text message is needed, for example, a marker to specify the profiling start or end. There are three kinds of event:

 ```c++
-enum EventKind { kMark,
+enum EventKind {
+  kMark,
   kPushRange,
   kPopRange};
 ```

-- kMark: only a mark.
+- kMark: only a marker without time range.
 - kPushRange: mark the starting event for time range.
-- kPopRange: mark the ending event for the time range.
+- kPopRange: mark the ending event for time range.
+
+For the CPU code, the events only need to record the current time. For the CUDA code, the [event management functions of CUDA](http://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART__EVENT.html#group__CUDART__EVENT) are used. For many pieces of code, an event lists are used to record each piece.
-For the CPU code, the events only need to record the current time. For the CUDA code, the [event management functions of CUDA](http://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART__EVENT.html#group__CUDART__EVENT) are used. For many pieces of code, a event lists are used to record each piece.

 ```c++
 class Event {
  public:
   ...
 };
@@ -64,7 +66,7 @@ struct EventList {
 };
 ```

-As mentioned above, there is no need to record the timeline when disabling the profiler. So there is a global state to enable or distable the profiler.
+As mentioned above, there is no need to record the timeline when disabling the profiler. So there is a global state to enable or disable the profiler.

 ```c++
 enum ProfilerState {
   kDisabled,
   kCPU,
   kCUDA
 };
-ProfilerState kState;
+ProfilerState g_state;
 ```

-- kDisabled: the disabled state.
-- kCPU: profiling for CPU code.
-- kCUDA: profiling for GPU code.
+- kDisabled: the disabled state.
+- kCPU: CPU profiling state.
+- kCUDA: GPU profiling state.

 A pair of starting and ending events are pushed to event lists in constructor and destructor of `RecordEvent`. So the timeline is recorded for the code in the lifecycle of an object of `RecordEvent`.
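[Editor's illustration of the `RecordEvent` RAII pattern described at the end of the patch above. This is a self-contained sketch rather than code from the series; the `GetEventList()` helper and the `Record` signature are assumptions consistent with the design in profiler.md.]

```c++
#include <string>

enum EventKind { kMark, kPushRange, kPopRange };
enum ProfilerState { kDisabled, kCPU, kCUDA };
ProfilerState g_state = kCPU;

// Stand-in for the per-thread event list described in profiler.md.
struct EventList {
  void Record(EventKind kind, const std::string& name) { /* store event */ }
};
EventList& GetEventList() {
  static EventList list;
  return list;
}

// The RAII pattern: the constructor pushes a kPushRange event and the
// destructor pushes a kPopRange event, so the object's lifetime is
// exactly the measured span.
struct RecordEvent {
  explicit RecordEvent(const std::string& name) {
    if (g_state == kDisabled) return;
    GetEventList().Record(kPushRange, name);
  }
  ~RecordEvent() {
    if (g_state == kDisabled) return;
    GetEventList().Record(kPopRange, "pop");
  }
};

void SomeOperatorRun() {
  RecordEvent record_event("some_op");  // span starts here
  // ... the operator's computation would run here ...
}  // span ends when record_event is destroyed
```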
From 105ee86d14200253b77a06f9607bf6d19936c2f6 Mon Sep 17 00:00:00 2001
From: QI JUN
Date: Tue, 2 Jan 2018 11:07:29 +0800
Subject: [PATCH 150/181] fix compile (#7125)

---
 paddle/operators/math/cos_sim_functor.cc | 4 ++--
 paddle/operators/math/cos_sim_functor.cu | 4 ++--
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/paddle/operators/math/cos_sim_functor.cc b/paddle/operators/math/cos_sim_functor.cc
index f52a82b108..6af9f0fcd9 100644
--- a/paddle/operators/math/cos_sim_functor.cc
+++ b/paddle/operators/math/cos_sim_functor.cc
@@ -41,8 +41,8 @@ struct CosSimDyFunctor {
   }
 };

-template class CosSimDyFunctor;
-template class CosSimDyFunctor;
+template struct CosSimDyFunctor;
+template struct CosSimDyFunctor;
 }  // namespace math
 }  // namespace operators
 }  // namespace paddle

diff --git a/paddle/operators/math/cos_sim_functor.cu b/paddle/operators/math/cos_sim_functor.cu
index fb19a8b38a..6eb0a4ea4c 100644
--- a/paddle/operators/math/cos_sim_functor.cu
+++ b/paddle/operators/math/cos_sim_functor.cu
@@ -57,8 +57,8 @@ struct CosSimDyFunctor {
   }
 };

-template class CosSimDyFunctor;
-template class CosSimDyFunctor;
+template struct CosSimDyFunctor;
+template struct CosSimDyFunctor;
 }  // namespace math
 }  // namespace operators
 }  // namespace paddle

From 0df22907070f16a599c7b77fa4e2c444a1684da6 Mon Sep 17 00:00:00 2001
From: sweetsky0901
Date: Tue, 2 Jan 2018 12:42:03 +0800
Subject: [PATCH 151/181] for makelist update

---
 paddle/operators/CMakeLists.txt | 1 -
 1 file changed, 1 deletion(-)

diff --git a/paddle/operators/CMakeLists.txt b/paddle/operators/CMakeLists.txt
index 1386146b01..0c47a71f73 100644
--- a/paddle/operators/CMakeLists.txt
+++ b/paddle/operators/CMakeLists.txt
@@ -208,7 +208,6 @@ set(DEPS_OPS
   array_to_lod_tensor_op
   max_sequence_len_op
   lstm_op
-  tensor_array_read_write_op
   gru_op
   adagrad_op
   sgd_op
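[Editor's illustration of the rule behind PATCH 150, which swaps `template class` for `template struct` in the explicit instantiations: the class-key of an explicit instantiation should match the declaration, and a mismatch triggers -Wmismatched-tags on some compilers, failing builds that run with -Werror. The `Functor` name below is invented for the example.]

```c++
template <typename T>
struct Functor {
  T operator()(T x) const { return x * x; }
};

// Matches the `struct` class-key of the declaration above. Writing
// `template class Functor<float>;` instead is exactly what PATCH 150
// removes: it warns (or errors under -Werror) on compilers that check
// tag consistency between declaration and instantiation.
template struct Functor<float>;
template struct Functor<double>;
```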
From 10cd6eb67a7177bbf95300c0c8512650d27e57e5 Mon Sep 17 00:00:00 2001
From: yangyaming
Date: Tue, 2 Jan 2018 12:51:25 +0800
Subject: [PATCH 152/181] Add doc for lod_rank_table.

---
 python/paddle/v2/fluid/layers/control_flow.py | 37 +++++++++++++++++--
 1 file changed, 34 insertions(+), 3 deletions(-)

diff --git a/python/paddle/v2/fluid/layers/control_flow.py b/python/paddle/v2/fluid/layers/control_flow.py
index 22a37c22c3..48f1ffa668 100644
--- a/python/paddle/v2/fluid/layers/control_flow.py
+++ b/python/paddle/v2/fluid/layers/control_flow.py
@@ -397,9 +397,40 @@ class While(object):

 def lod_rank_table(x, level=0):
-    """
-    This function creates an operator for creating a LOD_RANK_TABLE
-    using the input x.
+    """LoD Rank Table Operator. Given an input variable `x` and a LoD level,
+    this layer creates a LodRankTable object. A LoDRankTable object contains a
+    list of bi-element tuples and each tuple consists of an index and a length.
+    For given level's LoD information, the index is the sequence position and
+    the length representes the sequence length. Please note that the list is
+    ranked in descending order by the length. The following is an example:
+
+    .. code-block:: text
+
+        x is a LoDTensor:
+            x.lod = [[0, 1, 2, 3],
+                     [0, 5, 6, 7]]
+            x.data = [a, b, c, d, e, f, g]
+
+        Create lod rank table:
+            lod_rank_table_obj = lod_rank_table(x, level=1)
+
+        Get:
+            lod_rank_table_obj.items() = [(0, 5), (1, 1), (2, 1)]
+
+    Args:
+        x (Variable): Input variable, a LoDTensor based which to create the lod
+            rank table.
+        level (int): Specify the LoD level.
+
+    Returns:
+        Variable: The created LoDRankTable object.
+
+    Examples:
+        .. code-block:: python

+            x = fluid.layers.data(name='x', shape=[10],
+                                  dtype='float32', lod_level=1)
+            out = layers.lod_rank_table(x=x, level=0)
     """
     helper = LayerHelper("lod_rank_table", **locals())
     table = helper.create_variable(

From 57bc564d12d5910f3f03d52ac9616b9e72ed4de2 Mon Sep 17 00:00:00 2001
From: yangyaming
Date: Tue, 2 Jan 2018 14:01:17 +0800
Subject: [PATCH 153/181] Polish doc for lod_rank_table.

---
 python/paddle/v2/fluid/layers/control_flow.py | 31 ++++++++++++-------
 1 file changed, 20 insertions(+), 11 deletions(-)

diff --git a/python/paddle/v2/fluid/layers/control_flow.py b/python/paddle/v2/fluid/layers/control_flow.py
index 48f1ffa668..458ced460a 100644
--- a/python/paddle/v2/fluid/layers/control_flow.py
+++ b/python/paddle/v2/fluid/layers/control_flow.py
@@ -397,25 +397,34 @@ class While(object):

 def lod_rank_table(x, level=0):
-    """LoD Rank Table Operator. Given an input variable `x` and a LoD level,
-    this layer creates a LodRankTable object. A LoDRankTable object contains a
-    list of bi-element tuples and each tuple consists of an index and a length.
-    For given level's LoD information, the index is the sequence position and
-    the length representes the sequence length. Please note that the list is
-    ranked in descending order by the length. The following is an example:
+    """LoD Rank Table Operator. Given an input variable **x** and a level number
+    of LoD, this layer creates a LodRankTable object. A LoDRankTable object
+    contains a list of bi-element tuples. Each tuple consists of an index and
+    a length, both of which are int type. Reffering to specified level of LoD,
+    the index is the sequence index number and the length representes the
+    sequence length. Please note that the list is ranked in descending order by
+    the length. The following is an example:

     .. code-block:: text

         x is a LoDTensor:
-            x.lod = [[0, 1, 2, 3],
+            x.lod = [[0, 2, 3],
                      [0, 5, 6, 7]]
             x.data = [a, b, c, d, e, f, g]

-        Create lod rank table:
-            lod_rank_table_obj = lod_rank_table(x, level=1)
+        1. set level to 0:
+            Create lod rank table:
+                lod_rank_table_obj = lod_rank_table(x, level=0)

-        Get:
-            lod_rank_table_obj.items() = [(0, 5), (1, 1), (2, 1)]
+            Get:
+                lod_rank_table_obj.items() = [(0, 2), (1, 1)]
+
+        2. set level to 1:
+            Create lod rank table:
+                lod_rank_table_obj = lod_rank_table(x, level=1)
+
+            Get:
+                lod_rank_table_obj.items() = [(0, 5), (1, 1), (2, 1)]

     Args:
         x (Variable): Input variable, a LoDTensor based which to create the lod

From 0c5202cbb562b7d070c807d38de5b54db06833b4 Mon Sep 17 00:00:00 2001
From: Yang Yu
Date: Tue, 2 Jan 2018 14:42:27 +0800
Subject: [PATCH 154/181] Tiny enhance of while_op

---
 paddle/operators/while_op.cc | 42 ++++++++++++++++++------------------
 1 file changed, 21 insertions(+), 21 deletions(-)

diff --git a/paddle/operators/while_op.cc b/paddle/operators/while_op.cc
index 728ef60794..65d827e0e0 100644
--- a/paddle/operators/while_op.cc
+++ b/paddle/operators/while_op.cc
@@ -25,12 +25,12 @@ namespace operators {
 using StepScopeVar = std::vector;
 using LoDTensor = framework::LoDTensor;

-constexpr char kStepBlock[] = "sub_block";
-constexpr char kCondition[] = "Condition";
-constexpr char kStepScopes[] = "StepScopes";
-constexpr char kParameters[] = "X";
-constexpr char kParamGrads[] = "X@GRAD";
-constexpr char kOutputs[] = "Out";
+static constexpr char kStepBlock[] = "sub_block";
+static constexpr char kCondition[] = "Condition";
+static constexpr char kStepScopes[] = "StepScopes";
+static constexpr char kX[] = "X";
+static constexpr char kXGRAD[] = "X@GRAD";
+static constexpr char kOutputs[] = "Out";

 class WhileOp : public framework::OperatorBase {
  public:
@@ -67,7 +67,7 @@ class WhileOpMaker : public framework::OpProtoAndCheckerMaker {
  public:
   WhileOpMaker(OpProto *proto, OpAttrChecker *op_checker)
       : OpProtoAndCheckerMaker(proto, op_checker) {
-    AddInput(kParameters,
+    AddInput(kX,
              "A set of variables, which are required by operators inside the "
              "block of While Op.")
         .AsDuplicable();
@@ -158,8 +158,8 @@ class WhileGradOp : public framework::OperatorBase {

       executor.Run(*program, *cur_scope_iter, block->ID(), false);

-      auto &pg_names = Outputs(kParamGrads);
-      auto &p_names = Inputs(kParameters);
+      auto &pg_names = Outputs(kXGRAD);
+      auto &p_names = Inputs(kX);
       PADDLE_ENFORCE_EQ(pg_names.size(), p_names.size());
       for (size_t param_id = 0; param_id < pg_names.size(); ++param_id) {
         if (pg_names[param_id] == framework::kEmptyVarName) {
@@ -213,11 +213,11 @@ class WhileGradOpDescMaker : public framework::SingleGradOpDescMaker {
   std::unique_ptr Apply() const override {
     auto *grad = new framework::OpDesc();
     grad->SetType("while_grad");
-    grad->SetInput(kParameters, Input(kParameters));
+    grad->SetInput(kX, Input(kX));

     // Not all of IGs will be generated by inner gradient operators of while op.
     // Ignore IGs that is not generated by the inside block.
-    auto igs = InputGrad(kParameters, /*do not drop empty gradient*/ false);
+    auto igs = InputGrad(kX, /*do not drop empty gradient*/ false);
     std::unordered_set all_outs;
     for (size_t i = 0; i < grad_block_[0]->OpSize(); ++i) {
       for (auto &oname : grad_block_[0]->Op(i)->OutputArgumentNames()) {
@@ -231,7 +231,7 @@ class WhileGradOpDescMaker : public framework::SingleGradOpDescMaker {
       }
     }

-    grad->SetOutput(framework::GradVarName(kParameters), igs);
+    grad->SetOutput(framework::GradVarName(kX), igs);

     grad->SetInput(kOutputs, Output(kOutputs));

@@ -240,7 +240,7 @@ class WhileGradOpDescMaker : public framework::SingleGradOpDescMaker {
     std::unordered_set block_ins;
     auto *fwd_block = this->grad_block_[0]->ParentBlock();
     {
-      for (auto &p : Input(kParameters)) {
+      for (auto &p : Input(kX)) {
         block_ins.insert(p);
       }
       for (auto &o : Output(kOutputs)) {
@@ -288,8 +288,8 @@ class WhileGradOpVarTypeInference : public framework::VarTypeInference {
  public:
   void operator()(const framework::OpDesc &op_desc,
                   framework::BlockDesc *block) const override {
-    auto p_names = op_desc.Input(kParameters);
-    auto pg_names = op_desc.Output(framework::GradVarName(kParameters));
+    auto p_names = op_desc.Input(kX);
+    auto pg_names = op_desc.Output(framework::GradVarName(kX));

     for (size_t i = 0; i < p_names.size(); ++i) {
       auto &p_var = detail::Ref(block->FindVarRecursive(p_names[i]));
@@ -307,21 +307,21 @@ class WhileGradOpShapeInference : public framework::InferShapeBase {
  public:
   void operator()(framework::InferShapeContext *ctx) const override {
-    ctx->HasInputs(kParameters);
-    ctx->HasOutputs(framework::GradVarName(kParameters));
+    ctx->HasInputs(kX);
+    ctx->HasOutputs(framework::GradVarName(kX));
     ctx->HasInputs(kOutputs);
     ctx->HasInputs(framework::GradVarName(kOutputs));

-    auto p_names = ctx->Inputs(kParameters);
-    auto pg_names = ctx->Outputs(kParamGrads);
-    auto var_types = ctx->GetInputsVarType(kParameters);
+    auto p_names = ctx->Inputs(kX);
+    auto pg_names = ctx->Outputs(kXGRAD);
+    auto var_types = ctx->GetInputsVarType(kX);
     std::vector names_to_set;
     std::vector dims_to_set;
     for (size_t i = 0; i < p_names.size(); ++i) {
       if (pg_names[i] == framework::kEmptyVarName) {
         continue;
       }
-      auto dims = ctx->GetInputsElementDim(kParameters, i);
+      auto dims = ctx->GetInputsElementDim(kX, i);
       if (var_types[i] == framework::proto::VarDesc::LOD_TENSOR) {
         names_to_set.push_back(pg_names[i]);
         dims_to_set.push_back(dims);

From a5200b89ac6b60b6e2f5e5a3eb374502e1285772 Mon Sep 17 00:00:00 2001
From: yangyaming
Date: Tue, 2 Jan 2018 14:55:18 +0800
Subject: [PATCH 155/181] Add doc for max_sequence_len.

---
 python/paddle/v2/fluid/layers/control_flow.py | 22 ++++++++++++++++---
 1 file changed, 19 insertions(+), 3 deletions(-)

diff --git a/python/paddle/v2/fluid/layers/control_flow.py b/python/paddle/v2/fluid/layers/control_flow.py
index 22a37c22c3..0f8295d177 100644
--- a/python/paddle/v2/fluid/layers/control_flow.py
+++ b/python/paddle/v2/fluid/layers/control_flow.py
@@ -414,9 +414,25 @@ def lod_rank_table(x, level=0):

 def max_sequence_len(rank_table):
-    """
-    This function creates an operator to calculate the length of
-    max seqence through input rank_table(should be a lod_rank_table)
+    """Max Sequence Len Operator. Given a LoDRankTable object, this layer
+    returns the max length of batch of sequences. In fact, a LoDRankTable object
+    contains a list of tuples () and the list
+    is already sorted by sequence length in descending order, so the operator
+    just returns the sequence length of the first tuple element.
+
+    Args:
+        rank_table (Variable): Input variable which is a LoDRankTable object.
+
+    Returns:
+        Variable: the max length of sequence.
+
+    Examples:
+        .. code-block:: python
+
+            x = fluid.layers.data(name='x', shape=[10],
+                                  dtype='float32', lod_level=1)
+            rank_table = layers.lod_rank_table(x=x, level=0)
+            max_seq_len = layers.max_sequence_len(rank_table)
     """
     helper = LayerHelper("max_seqence_len", **locals())
     res = helper.create_tmp_variable(dtype="int64")

From 554f6967127fec6f6847802333e988565c726fbe Mon Sep 17 00:00:00 2001
From: sweetsky0901
Date: Tue, 2 Jan 2018 15:03:10 +0800
Subject: [PATCH 156/181] for del DEPS

---
 paddle/operators/CMakeLists.txt | 30 ------------------------------
 1 file changed, 30 deletions(-)

diff --git a/paddle/operators/CMakeLists.txt b/paddle/operators/CMakeLists.txt
index bfcc70b31d..9f603474de 100644
--- a/paddle/operators/CMakeLists.txt
+++ b/paddle/operators/CMakeLists.txt
@@ -186,36 +186,6 @@ endfunction()
 add_subdirectory(math)
 add_subdirectory(nccl)

-set(DEPS_OPS
-  cond_op
-  cross_entropy_op
-  recurrent_op
-  softmax_with_cross_entropy_op
-  softmax_op
-  sequence_softmax_op
-  sum_op
-  pool_op
-  maxout_op
-  unpool_op
-  pool_with_index_op
-  conv_op
-  conv_transpose_op
-  nccl_op
-  sequence_conv_op
-  sequence_pool_op
-  lod_rank_table_op
-  lod_tensor_to_array_op
-  array_to_lod_tensor_op
-  max_sequence_len_op
-  lstm_op
-  gru_op
-  adagrad_op
-  sgd_op
-  save_op
-  load_op
-  send_op
-  recv_op
-  detection_output_op)
 if(WITH_GPU)
   op_library(nccl_op DEPS nccl_common)
 else()

From 0d4fdce07f55957d2ade921dfb382c3f5ee790e8 Mon Sep 17 00:00:00 2001
From: yangyaming
Date: Tue, 2 Jan 2018 15:04:01 +0800
Subject: [PATCH 157/181] Minor refinement.

---
 python/paddle/v2/fluid/layers/control_flow.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/python/paddle/v2/fluid/layers/control_flow.py b/python/paddle/v2/fluid/layers/control_flow.py
index 458ced460a..08c52390e9 100644
--- a/python/paddle/v2/fluid/layers/control_flow.py
+++ b/python/paddle/v2/fluid/layers/control_flow.py
@@ -429,7 +429,8 @@ def lod_rank_table(x, level=0):
     Args:
         x (Variable): Input variable, a LoDTensor based which to create the lod
             rank table.
-        level (int): Specify the LoD level.
+        level (int): Specify the LoD level, on which to create the lod rank
+            table.

     Returns:
         Variable: The created LoDRankTable object.

From 783f9eade49aef3ae2c4e89404b1da4ce49fb6f5 Mon Sep 17 00:00:00 2001
From: sweetsky0901
Date: Tue, 2 Jan 2018 15:32:32 +0800
Subject: [PATCH 158/181] del using in .h

---
 paddle/operators/norm_op.h | 81 ++++++++++++++++++++++----------------
 1 file changed, 47 insertions(+), 34 deletions(-)

diff --git a/paddle/operators/norm_op.h b/paddle/operators/norm_op.h
index b22df373af..7bee48919e 100644
--- a/paddle/operators/norm_op.h
+++ b/paddle/operators/norm_op.h
@@ -19,13 +19,6 @@ limitations under the License.
 */

 namespace paddle {
 namespace operators {

-template
-using EigenVector = framework::EigenVector;
-template
-using EigenMatrix = framework::EigenMatrix;
-
 template
 class NormKernel : public framework::OpKernel {
  public:
@@ -42,29 +35,37 @@ class NormKernel : public framework::OpKernel {
     int fea_len = height * width;
     auto* place =
         context.template device_context().eigen_device();
-    auto x = EigenMatrix::From(
-        *in_x, framework::make_ddim({batch_size, fea_len * channels}));
+    auto x =
+        framework::EigenMatrix::From(
+            *in_x, framework::make_ddim({batch_size, fea_len * channels}));
     // get square
     framework::Tensor x_square;
     x_square.mutable_data(in_x->dims(), context.GetPlace());
-    auto x_square_eigen = EigenMatrix::From(
-        x_square, framework::make_ddim({batch_size, fea_len * channels}));
+    auto x_square_eigen =
+        framework::EigenMatrix::From(
+            x_square, framework::make_ddim({batch_size, fea_len * channels}));
     x_square_eigen.device(*place) = x.square();
-    auto scale_eigen = EigenVector::Flatten(*scale);
+    auto scale_eigen =
+        framework::EigenVector::Flatten(
+            *scale);
     for (int n = 0; n < batch_size; ++n) {
       framework::Tensor in_x_batch = in_x->Slice(n, n + 1);
-      auto in_x_batch_eigen = EigenMatrix::From(
-          in_x_batch, framework::make_ddim({channels, fea_len}));
+      auto in_x_batch_eigen =
+          framework::EigenMatrix::From(
+              in_x_batch, framework::make_ddim({channels, fea_len}));
       framework::Tensor x_square_batch = x_square.Slice(n, n + 1);
-      auto x_square_batch_eigen = EigenMatrix::From(
-          x_square_batch, framework::make_ddim({channels, fea_len}));
+      auto x_square_batch_eigen =
+          framework::EigenMatrix::From(
+              x_square_batch, framework::make_ddim({channels, fea_len}));
       framework::Tensor out_batch = out->Slice(n, n + 1);
-      auto out_batch_eigen = EigenMatrix::From(
-          out_batch, framework::make_ddim({channels, fea_len}));
+      auto out_batch_eigen =
+          framework::EigenMatrix::From(
+              out_batch, framework::make_ddim({channels, fea_len}));
       framework::Tensor tmp_tensor;
       tmp_tensor.mutable_data(framework::make_ddim({1, fea_len}),
                               context.GetPlace());
-      auto tmp = EigenVector::Flatten(tmp_tensor);
+      auto tmp = framework::EigenVector::Flatten(tmp_tensor);
       // get colsum and sqrt , inverse
       auto dim = Eigen::array({{0}});
       tmp.device(*place) = x_square_batch_eigen.sum(dim);
@@ -102,40 +103,52 @@ class NormGradKernel : public framework::OpKernel {
     auto* place =
         context.template device_context().eigen_device();

-    auto scale_eigen = EigenVector::Flatten(*scale);
-    auto x = EigenMatrix::From(
-        *in_x, framework::make_ddim({batch_size, fea_len * channels}));
+    auto scale_eigen =
+        framework::EigenVector::Flatten(
+            *scale);
+    auto x =
+        framework::EigenMatrix::From(
+            *in_x, framework::make_ddim({batch_size, fea_len * channels}));
     // get square
     framework::Tensor x_square;
     x_square.mutable_data(in_x->dims(), context.GetPlace());
-    auto x_square_eigen = EigenMatrix::From(
-        x_square, framework::make_ddim({batch_size, fea_len * channels}));
+    auto x_square_eigen =
+        framework::EigenMatrix::From(
+            x_square, framework::make_ddim({batch_size, fea_len * channels}));
     x_square_eigen.device(*place) = x.square();

     for (int n = 0; n < batch_size; ++n) {
       framework::Tensor in_x_batch = in_x->Slice(n, n + 1);
-      auto in_x_batch_eigen = EigenMatrix::From(
-          in_x_batch, framework::make_ddim({channels, fea_len}));
+      auto in_x_batch_eigen =
+          framework::EigenMatrix::From(
+              in_x_batch, framework::make_ddim({channels, fea_len}));
       framework::Tensor in_g_batch = in_x_grad->Slice(n, n + 1);
-      auto in_g_batch_eigen = EigenMatrix::From(
-          in_g_batch, framework::make_ddim({channels, fea_len}));
+      auto in_g_batch_eigen =
+          framework::EigenMatrix::From(
+              in_g_batch, framework::make_ddim({channels, fea_len}));
       framework::Tensor x_square_batch = x_square.Slice(n, n + 1);
-      auto x_square_batch_eigen = EigenMatrix::From(
-          x_square_batch, framework::make_ddim({channels, fea_len}));
+      auto x_square_batch_eigen =
+          framework::EigenMatrix::From(
+              x_square_batch, framework::make_ddim({channels, fea_len}));
       framework::Tensor outg_batch = out_grad->Slice(n, n + 1);
-      auto outg_batch_eigen = EigenMatrix::From(
-          outg_batch, framework::make_ddim({channels, fea_len}));
+      auto outg_batch_eigen =
+          framework::EigenMatrix::From(
+              outg_batch, framework::make_ddim({channels, fea_len}));

       framework::Tensor tmp_tensor;
       tmp_tensor.mutable_data(framework::make_ddim({1, fea_len}),
                               context.GetPlace());
-      auto tmp_eigen = EigenVector::Flatten(tmp_tensor);
+      auto tmp_eigen =
+          framework::EigenVector::Flatten(tmp_tensor);
       auto dim = Eigen::array({{0}});
       tmp_eigen.device(*place) = (in_x_batch_eigen * outg_batch_eigen).sum(dim);
       framework::Tensor norm_tmp_tensor;
       norm_tmp_tensor.mutable_data(framework::make_ddim({1, fea_len}),
                                    context.GetPlace());
-      auto norm_tmp_eigen = EigenVector::Flatten(norm_tmp_tensor);
+      auto norm_tmp_eigen =
+          framework::EigenVector::Flatten(norm_tmp_tensor);
       norm_tmp_eigen.device(*place) =
           (x_square_batch_eigen.sum(dim) + epsilon).sqrt();
       Eigen::array broadcast_dim_col;
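[Editor's illustration: the Eigen bookkeeping in the kernel above is easier to follow next to a plain statement of what it computes. The numpy sketch below assumes the op performs cross-channel L2 normalization with a per-channel scale, as the channel slicing and the column-sum/sqrt/inverse sequence suggest; the `scale` shape and epsilon placement are assumptions, not code from the patch.]

```python
import numpy as np

def norm_forward(x, scale, epsilon=1e-10):
    """x: (batch, channels, height, width); scale: one value per channel."""
    n, c, h, w = x.shape
    x2 = np.square(x)                              # "get square"
    col_sum = x2.reshape(n, c, h * w).sum(axis=1)  # column sum over channels
    inv_norm = 1.0 / np.sqrt(col_sum + epsilon)    # sqrt, then inverse
    out = x * inv_norm.reshape(n, 1, h, w)         # broadcast across channels
    return out * scale.reshape(1, c, 1, 1)         # per-channel scale
```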
*/ #include "paddle/framework/data_transform.h" #include "paddle/framework/lod_tensor.h" +#include "paddle/platform/device_context.h" namespace paddle { namespace framework { @@ -23,5 +24,83 @@ DataTransformFnMap& DataTransformFnMap::Instance() { return data_transform_map; } +auto KernelFP32 = OpKernelType(proto::DataType::FP32, platform::CPUPlace(), + DataLayout::kNHWC, LibraryType::kPlain); + +auto KernelFP64 = OpKernelType(proto::DataType::FP64, platform::CPUPlace(), + DataLayout::kNHWC, LibraryType::kPlain); + +auto KernelNHWC = OpKernelType(proto::DataType::FP64, platform::CPUPlace(), + DataLayout::kNHWC, LibraryType::kPlain); + +auto KernelNCHW = OpKernelType(proto::DataType::FP64, platform::CPUPlace(), + DataLayout::kNCHW, LibraryType::kPlain); + +void TransDataType(const platform::DeviceContext* ctx, + const KernelTypePair& kernel_pair, const Variable& in, + Variable* out) { + PADDLE_ENFORCE(in.IsType(), "Only Support Tensor transform!."); + PADDLE_ENFORCE( + platform::places_are_same_class(kernel_pair.first.place_, + kernel_pair.second.place_), + "TransDataType Only Support DataType transform on same place!"); + + auto src = in.Get(); + auto* dst = out->GetMutable(); + + auto dims = src.dims(); + dst->Resize(dims); + auto dst_type = kernel_pair.second.data_type_; + auto src_type = kernel_pair.first.data_type_; + + switch (src_type) { + case proto::DataType::FP32: + framework::VisitDataType(dst_type, CastDataType(src, dst, ctx)); + break; + case proto::DataType::FP64: + framework::VisitDataType(dst_type, CastDataType(src, dst, ctx)); + break; + case proto::DataType::INT32: + framework::VisitDataType(dst_type, CastDataType(src, dst, ctx)); + break; + case proto::DataType::INT64: + framework::VisitDataType(dst_type, CastDataType(src, dst, ctx)); + break; + case proto::DataType::BOOL: + framework::VisitDataType(dst_type, CastDataType(src, dst, ctx)); + break; + default: + PADDLE_THROW("Not support type %d", src_type); + } +} + +void TransDataLayout(const platform::DeviceContext* ctx, + const KernelTypePair& kernel_pair, const Variable& in, + Variable* out) { + PADDLE_ENFORCE(in.IsType(), "Only Support Tensor transform!."); + PADDLE_ENFORCE( + platform::places_are_same_class(kernel_pair.first.place_, + kernel_pair.second.place_), + "TransDataType Only Support DataType transform on same place!"); + + auto src = in.Get(); + auto* dst = out->GetMutable(); + PADDLE_ENFORCE(arity(src.dims()) == 4, "Input Arity Only Suppport 4!"); + + dst->Resize(src.dims()); + auto place = kernel_pair.second.place_; + CopyFrom(src, place, *ctx, dst); + const std::vector axis = {0, 2, 3, 1}; + + auto src_type = kernel_pair.first.data_type_; + framework::VisitDataType(src_type, CastDataLayout(src, dst, ctx, axis)); + + dst->set_layout(kernel_pair.second.data_layout_); +} + } // namespace framework } // namespace paddle + +namespace f = paddle::framework; +REGISTER_DATA_TRANSFORM_FN(f::KernelFP32, f::KernelFP64, f::TransDataType); +REGISTER_DATA_TRANSFORM_FN(f::KernelNHWC, f::KernelNCHW, f::TransDataLayout); diff --git a/paddle/framework/data_transform.h b/paddle/framework/data_transform.h index bd6d301c12..9abb3c99bf 100644 --- a/paddle/framework/data_transform.h +++ b/paddle/framework/data_transform.h @@ -21,16 +21,20 @@ limitations under the License. 
*/ #include "paddle/framework/op_kernel_type.h" #include "paddle/framework/tensor.h" #include "paddle/framework/variable.h" +#include "paddle/operators/math/math_function.h" #include "paddle/platform/device_context.h" #include "paddle/platform/macros.h" +#include "paddle/platform/transform.h" namespace paddle { namespace framework { -using DataTransformFn = std::function; using KernelTypePair = std::pair; +using DataTransformFn = + std::function; + struct KernelTypePairHash { static void HashCombine(const OpKernelType& t, std::size_t* seed) { OpKernelType::Hash kernel_type_hasher; @@ -45,6 +49,65 @@ struct KernelTypePairHash { } }; +template +struct CastDataTypeFunctor { + HOSTDEVICE inline OutType operator()(InType in) const { + return static_cast(in); + } +}; + +template +struct CastDataType { + CastDataType(const framework::Tensor& in, framework::Tensor* out, + const platform::DeviceContext* ctx) + : in_(in), out_(out), ctx_(ctx) {} + const framework::Tensor in_; + framework::Tensor* out_; + const platform::DeviceContext* ctx_; + + template + void operator()() { + auto place = ctx_->GetPlace(); + + auto* in_begin = in_.data(); + auto numel = in_.numel(); + auto* in_end = in_begin + numel; + auto* out_begin = out_->mutable_data(place); + if (platform::is_cpu_place(place)) { + platform::Transform trans; + auto* context = static_cast(ctx_); + trans(*context, in_begin, in_end, out_begin, + CastDataTypeFunctor()); + } else { + // TODO(dzhwinter): enhance CopyFrom CPU<->GPU with different data type? + PADDLE_THROW("Unsupport CPU <-> GPU!"); + } + } +}; + +struct CastDataLayout { + CastDataLayout(const framework::Tensor& in, framework::Tensor* out, + const platform::DeviceContext* ctx, + const std::vector& axis) + : in_(in), out_(out), ctx_(ctx), axis_(axis) {} + const framework::Tensor in_; + framework::Tensor* out_; + const platform::DeviceContext* ctx_; + const std::vector axis_; + + template + void operator()() { + auto place = ctx_->GetPlace(); + if (platform::is_cpu_place(place)) { + operators::math::Transpose trans4; + auto* context = static_cast(ctx_); + trans4(*context, in_, out_, axis_); + } else { + PADDLE_THROW("Unsupport CPU <-> GPU!"); + } + } +}; + using DataTransformMap = std::unordered_map; diff --git a/paddle/framework/data_transform_test.cc b/paddle/framework/data_transform_test.cc index 5f05e881fa..5b01c8434b 100644 --- a/paddle/framework/data_transform_test.cc +++ b/paddle/framework/data_transform_test.cc @@ -17,6 +17,7 @@ limitations under the License. 
*/ #include #include "paddle/framework/data_transform.h" +#include "paddle/platform/device_context.h" namespace paddle { namespace framework { @@ -31,16 +32,18 @@ using namespace platform; * 1111 -> FP64, GPUPlace, kNCHW, kMKLDNN */ -std::array kDataType = { - {proto::DataType::FP32, proto::DataType::FP64}}; +std::array kDataType = {proto::DataType::FP32, + proto::DataType::FP64}; -std::array kPlace = {{CPUPlace(), CUDAPlace(0)}}; +std::array kPlace = {CPUPlace(), CUDAPlace(0)}; std::array kDataLayout = { - {DataLayout::kNHWC, DataLayout::kNCHW}}; + DataLayout::kNHWC, DataLayout::kNCHW, +}; std::array kLibraryType = { - {LibraryType::kPlain, LibraryType::kMKLDNN}}; + LibraryType::kPlain, LibraryType::kMKLDNN, +}; OpKernelType GenFromBit(const std::vector bits) { return OpKernelType(kDataType[bits[0]], kPlace[bits[1]], kDataLayout[bits[2]], @@ -54,17 +57,20 @@ auto kernel1 = GenFromBit({0, 0, 0, 1}); auto kernel2 = GenFromBit({0, 0, 1, 0}); auto kernel3 = GenFromBit({0, 0, 1, 1}); -void TransDataType_t(const platform::DeviceContext* ctx, const Variable& in, +void TransDataType_t(const platform::DeviceContext* ctx, + const KernelTypePair& p, const Variable& in, Variable* out) { test_value++; } -void TransDataLayout_t(const platform::DeviceContext* ctx, const Variable& in, +void TransDataLayout_t(const platform::DeviceContext* ctx, + const KernelTypePair& p, const Variable& in, Variable* out) { test_value--; } -void TransLibraryType_t(const platform::DeviceContext* ctx, const Variable& in, +void TransLibraryType_t(const platform::DeviceContext* ctx, + const KernelTypePair& p, const Variable& in, Variable* out) { test_value += 2; } @@ -83,17 +89,68 @@ TEST(DataTransform, Register) { using namespace paddle::platform; auto& instance = DataTransformFnMap::Instance(); - ASSERT_EQ(instance.Map().size(), 3UL); - DeviceContext* ctx = nullptr; paddle::framework::Variable in; paddle::framework::Variable out; - instance.Get(std::make_pair(frw::kernel0, frw::kernel1))(ctx, in, &out); + DeviceContext* ctx = new CPUDeviceContext(); + auto pair0 = std::make_pair(frw::kernel0, frw::kernel1); + instance.Get(pair0)(ctx, pair0, in, &out); ASSERT_EQ(test_value, 1); - instance.Get(std::make_pair(frw::kernel1, frw::kernel2))(ctx, in, &out); + auto pair1 = std::make_pair(frw::kernel1, frw::kernel2); + instance.Get(pair1)(ctx, pair1, in, &out); ASSERT_EQ(test_value, 0); - instance.Get(std::make_pair(frw::kernel0, frw::kernel2))(ctx, in, &out); + auto pair3 = std::make_pair(frw::kernel0, frw::kernel2); + instance.Get(pair3)(ctx, pair3, in, &out); ASSERT_EQ(test_value, 2); } + +TEST(DataTransform, Layout) { + using namespace paddle::framework; + using namespace paddle::platform; + + auto& instance = DataTransformFnMap::Instance(); + Variable in; + Variable out; + Tensor* src = in.GetMutable(); + src->mutable_data(make_ddim({2, 3, 1, 2}), CPUPlace()); + src->set_layout(DataLayout::kNHWC); + + DeviceContext* ctx = new CPUDeviceContext(); + + { + auto kernel1 = GenFromBit({1, 0, 0, 0}); + auto kernel2 = GenFromBit({1, 0, 1, 0}); + auto pair0 = std::make_pair(kernel1, kernel2); + instance.Get(pair0)(ctx, pair0, in, &out); + } + + Tensor dst = out.Get(); + EXPECT_TRUE(dst.layout() != src->layout()); +} + +TEST(DataTransform, DataType) { + using namespace paddle::framework; + using namespace paddle::platform; + + auto& instance = DataTransformFnMap::Instance(); + DeviceContext* ctx = new CPUDeviceContext(); + + Variable in; + Variable out; + Tensor* src = in.GetMutable(); + float* ptr = src->mutable_data(make_ddim({2, 3}), 
CPUPlace()); + for (int i = 0; i < 6; ++i) { + ptr[i] = i / 3; + } + + { + auto kernel1 = GenFromBit({0, 0, 0, 0}); + auto kernel2 = GenFromBit({1, 0, 0, 0}); + auto pair0 = std::make_pair(kernel1, kernel2); + instance.Get(pair0)(ctx, pair0, in, &out); + } + Tensor dst = out.Get(); + EXPECT_TRUE(dst.data() != nullptr); +} diff --git a/paddle/framework/operator.cc b/paddle/framework/operator.cc index a3ce96c409..fc7091f1c8 100644 --- a/paddle/framework/operator.cc +++ b/paddle/framework/operator.cc @@ -461,7 +461,7 @@ void OperatorWithKernel::Run(const Scope& scope, dev_ctx->Wait(); for (auto var_name : need_trans) { - (*trans_fun)(trans_dev_ctx, *(scope.FindVar(var_name)), + (*trans_fun)(trans_dev_ctx, kernel_pair, *(scope.FindVar(var_name)), scope.FindVar(var_name + framework::KernelTypeToString( expected_kernel_key))); } diff --git a/paddle/operators/math/math_function.cc b/paddle/operators/math/math_function.cc index d4f12f0a10..dcf4b85e1a 100644 --- a/paddle/operators/math/math_function.cc +++ b/paddle/operators/math/math_function.cc @@ -245,9 +245,12 @@ template struct SetConstant; template struct SetConstant; template struct SetConstant; -#define DEFINE_CPU_TRANS(RANK) \ - template struct Transpose; \ - template struct Transpose; +#define DEFINE_CPU_TRANS(RANK) \ + template struct Transpose; \ + template struct Transpose; \ + template struct Transpose; \ + template struct Transpose; \ + template struct Transpose; DEFINE_CPU_TRANS(1); DEFINE_CPU_TRANS(2); From 7be57de9434053e7aa2e7b1d78da62ee1cb41ba7 Mon Sep 17 00:00:00 2001 From: fengjiayi Date: Tue, 2 Jan 2018 16:55:51 +0800 Subject: [PATCH 160/181] enhance no_grad_var handling --- python/paddle/v2/fluid/backward.py | 16 ++++++++++++++-- 1 file changed, 14 insertions(+), 2 deletions(-) diff --git a/python/paddle/v2/fluid/backward.py b/python/paddle/v2/fluid/backward.py index f11c83f59c..43e9abc354 100644 --- a/python/paddle/v2/fluid/backward.py +++ b/python/paddle/v2/fluid/backward.py @@ -57,6 +57,8 @@ def _all_in_set_(cands, s): """ Test if all elements of 'cands' are in set 's' """ + if len(cands) == 0: + return False for c in cands: if not c in s: return False @@ -138,10 +140,20 @@ def _remove_no_grad_branch_(op_descs, no_grad_set): 1. all outputs of the grad op are in 'no_grad_set' 2. 
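[Editor's illustration: PATCH 159 keys transforms by a (source kernel type, destination kernel type) pair. The sketch below shows how another conversion could be plugged into the same registry, modeled on the registrations at the bottom of data_transform.cc; the kernel-type pair and the function body are illustrative assumptions, not code from the patch.]

```c++
// Hypothetical registration: FP32 NCHW -> FP32 NHWC on CPU.
auto KernelNCHWFP32 = OpKernelType(proto::DataType::FP32, platform::CPUPlace(),
                                   DataLayout::kNCHW, LibraryType::kPlain);
auto KernelNHWCFP32 = OpKernelType(proto::DataType::FP32, platform::CPUPlace(),
                                   DataLayout::kNHWC, LibraryType::kPlain);

// Any function with the DataTransformFn signature can be registered: it
// reads the tensor held by `in`, converts it according to `kernel_pair`,
// and writes the result to `out`.
void TransNCHWToNHWC(const platform::DeviceContext* ctx,
                     const KernelTypePair& kernel_pair, const Variable& in,
                     Variable* out) {
  // ... permute axes {0, 2, 3, 1}, mirroring what TransDataLayout does ...
}

REGISTER_DATA_TRANSFORM_FN(KernelNCHWFP32, KernelNHWCFP32, TransNCHWToNHWC);
```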
(TODO) all grad inputs of the grad op are in 'no_grad_set' """ + + def _op_can_be_removed_(op_desc, no_grad_set): + if _all_in_set_(op_desc.output_arg_names(), no_grad_set): + return True + if _all_in_set_( + filter(lambda name: name.find(core.grad_var_suffix()) != -1, + op_desc.input_arg_names()), no_grad_set): + no_grad_set.union(op_desc.output_arg_names()) + return True + return False + # Remove ops whose outputs are all in no_grad_dict op_descs = filter( - lambda op_desc: not _all_in_set_(op_desc.output_arg_names(), no_grad_set), - op_descs) + lambda op_desc: not _op_can_be_removed_(op_desc, no_grad_set), op_descs) # Insert fill_zeros_like_op to_insert = [] for idx, op_desc in enumerate(op_descs): From 8d4a607fb35a6eb9b5eacf9999f955bde911e2ad Mon Sep 17 00:00:00 2001 From: fengjiayi Date: Tue, 2 Jan 2018 17:30:40 +0800 Subject: [PATCH 161/181] update backward doc --- doc/design/backward.md | 6 ++++-- python/paddle/v2/fluid/backward.py | 2 +- 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/doc/design/backward.md b/doc/design/backward.md index 35f03692bb..20fda7a98f 100644 --- a/doc/design/backward.md +++ b/doc/design/backward.md @@ -106,9 +106,11 @@ See function `_addup_repetitive_outputs_` in `backward.py` for implementation de In our framework, variables can be marked as *no_gradient*, it means that the gradient of this variable is unnecessary and can be considered as zero in model training. Apparently, when all the outputs of some `grad_op` are marked as *no_gradient*, the `grad_op` itself can be skipped in backward pass. -But these unnecessary gradients still need to be creating and initialized by something, otherwise following `grad_op`s who take these gradients as inputs take the risk of using uninitialized memory. In our code, we employ `fill_zeros_like_op` to initialize them as all zeros. +Another situation is when all the gradient inputs of some `grad_op` are marked as *no_gradient*, which means all of them can be considered as zeros. Since `grad_op`s are in essence the propagation of gradients, all the outputs are definitely zeros when all gradient inputs are zeros. Therefore, the `grad_op` can also be skipped. -This features are implemented in function `_remove_no_grad_branch_`. It checks new created `grad_op`s one-by-one, removes whose outputs are all in `no_grad_set` or inserts `fill_zeros_like_op` when its necessary. We can get the `no_grad_set` from the `_append_backward_ops_` argument `no_grad_dict` or generate it on the fly by scanning all variables' `no_gradient` attribute(True or False). +It should be noted that all these zero gradients still need to be created and initialized by something, otherwise the following `grad_op`s that take these gradients as inputs risk using uninitialized memory. In our code, we employ `fill_zeros_like_op` to initialize them as all zeros. + +These features are implemented in the function `_remove_no_grad_branch_`. It checks newly created `grad_op`s one by one, removes those that can be skipped, and inserts `fill_zeros_like_op` where necessary. We can get the `no_grad_set` from the `_append_backward_ops_` argument `no_grad_dict` or generate it on the fly by scanning all variables' `no_gradient` attribute (True or False).
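To make the skip rules above concrete, here is a minimal sketch of the two conditions. It is not code from these patches; `GRAD_SUFFIX` is an assumed stand-in for `core.grad_var_suffix()`, and plain Python collections stand in for the framework's op descriptions.

```python
# Minimal sketch (assumptions noted above) of the two skip conditions.
GRAD_SUFFIX = "@GRAD"  # assumed stand-in for core.grad_var_suffix()

def op_can_be_removed(op_outputs, op_inputs, no_grad_set):
    # Case 1: every output of the grad op is marked no_gradient.
    if op_outputs and all(name in no_grad_set for name in op_outputs):
        return True
    # Case 2: every gradient input is marked no_gradient; the outputs
    # then carry only zeros, so mark them no_gradient as well.
    grad_inputs = [n for n in op_inputs if GRAD_SUFFIX in n]
    if grad_inputs and all(n in no_grad_set for n in grad_inputs):
        no_grad_set.update(op_outputs)
        return True
    return False
```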
### Creating Backward Variables diff --git a/python/paddle/v2/fluid/backward.py b/python/paddle/v2/fluid/backward.py index 43e9abc354..a1be768daa 100644 --- a/python/paddle/v2/fluid/backward.py +++ b/python/paddle/v2/fluid/backward.py @@ -138,7 +138,7 @@ def _remove_no_grad_branch_(op_descs, no_grad_set): Remove unnecessary grad ops A grad op can be removed in two cases: 1. all outputs of the grad op are in 'no_grad_set' - 2. (TODO) all grad inputs of the grad op are in 'no_grad_set' + 2. all grad inputs of the grad op are in 'no_grad_set' """ def _op_can_be_removed_(op_desc, no_grad_set): From 1bcf7e23bdf23ad8a96cf75a42a37f1e45fea89b Mon Sep 17 00:00:00 2001 From: yangyaming Date: Tue, 2 Jan 2018 19:29:05 +0800 Subject: [PATCH 162/181] Minor refinement. --- python/paddle/v2/fluid/layers/control_flow.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/python/paddle/v2/fluid/layers/control_flow.py b/python/paddle/v2/fluid/layers/control_flow.py index 0f8295d177..114d46b5f8 100644 --- a/python/paddle/v2/fluid/layers/control_flow.py +++ b/python/paddle/v2/fluid/layers/control_flow.py @@ -415,16 +415,16 @@ def lod_rank_table(x, level=0): def max_sequence_len(rank_table): """Max Sequence Len Operator. Given a LoDRankTable object, this layer - returns the max length of batch of sequences. In fact, a LoDRankTable object - contains a list of tuples (<sequence index, sequence length>) and the list - is already sorted by sequence length in descending order, so the operator - just returns the sequence length of the first tuple element. + returns the max length of a batch of sequences. In fact, a LoDRankTable + object contains a list of tuples (<sequence index, sequence length>) and + the list is already sorted by sequence length in descending order, so the + operator just returns the sequence length of the first tuple element. Args: rank_table (Variable): Input variable which is a LoDRankTable object. Returns: - Variable: the max length of sequence. + Variable: The max length of sequence. Examples: ..
code-block:: python From 33e75201e9d3c14945bbe556267b8bae069de327 Mon Sep 17 00:00:00 2001 From: fengjiayi Date: Tue, 2 Jan 2018 20:00:00 +0800 Subject: [PATCH 163/181] fix bugs --- python/paddle/v2/fluid/backward.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/python/paddle/v2/fluid/backward.py b/python/paddle/v2/fluid/backward.py index a1be768daa..ac60bf5436 100644 --- a/python/paddle/v2/fluid/backward.py +++ b/python/paddle/v2/fluid/backward.py @@ -142,12 +142,13 @@ def _remove_no_grad_branch_(op_descs, no_grad_set): """ def _op_can_be_removed_(op_desc, no_grad_set): - if _all_in_set_(op_desc.output_arg_names(), no_grad_set): + out_arg_names = op_desc.output_arg_names() + if len(out_arg_names) == 0 or _all_in_set_(out_arg_names, no_grad_set): return True if _all_in_set_( filter(lambda name: name.find(core.grad_var_suffix()) != -1, op_desc.input_arg_names()), no_grad_set): - no_grad_set.union(op_desc.output_arg_names()) + no_grad_set.union(out_arg_names) return True return False @@ -296,7 +297,9 @@ def append_backward(loss, parameter_list=None, no_grad_set=None): block_no_grad_set.add(_append_grad_suffix_(var.name)) no_grad_dict[block.idx] = block_no_grad_set elif isinstance(no_grad_set, set): - no_grad_dict = {0: no_grad_set} + no_grad_dict = { + 0: set([_append_grad_suffix_(name) for name in no_grad_set]) + } else: raise ValueError("'no_grad_set' should be a set or None.") From fba6a10dd99edf6110280754555af78889f19dd3 Mon Sep 17 00:00:00 2001 From: QI JUN Date: Tue, 2 Jan 2018 21:00:09 +0800 Subject: [PATCH 164/181] fix bug in TransDataLayout (#7137) --- paddle/framework/data_transform.cc | 11 ++++++++++- paddle/framework/data_transform_test.cc | 14 +++++++------- 2 files changed, 17 insertions(+), 8 deletions(-) diff --git a/paddle/framework/data_transform.cc b/paddle/framework/data_transform.cc index 58780e3863..9d6a842442 100644 --- a/paddle/framework/data_transform.cc +++ b/paddle/framework/data_transform.cc @@ -87,11 +87,20 @@ void TransDataLayout(const platform::DeviceContext* ctx, auto* dst = out->GetMutable(); PADDLE_ENFORCE(arity(src.dims()) == 4, "Input Arity Only Suppport 4!"); - dst->Resize(src.dims()); + auto src_dim = src.dims(); + dst->Resize(src_dim); auto place = kernel_pair.second.place_; CopyFrom(src, place, *ctx, dst); const std::vector axis = {0, 2, 3, 1}; + std::vector dst_dim; + dst_dim.resize(axis.size()); + for (size_t i = 0; i < axis.size(); i++) { + dst_dim[i] = src_dim[axis[i]]; + } + + dst->Resize(make_ddim(dst_dim)); + auto src_type = kernel_pair.first.data_type_; framework::VisitDataType(src_type, CastDataLayout(src, dst, ctx, axis)); diff --git a/paddle/framework/data_transform_test.cc b/paddle/framework/data_transform_test.cc index 5b01c8434b..8665b6248f 100644 --- a/paddle/framework/data_transform_test.cc +++ b/paddle/framework/data_transform_test.cc @@ -32,18 +32,18 @@ using namespace platform; * 1111 -> FP64, GPUPlace, kNCHW, kMKLDNN */ -std::array kDataType = {proto::DataType::FP32, - proto::DataType::FP64}; +std::array kDataType = { + {proto::DataType::FP32, proto::DataType::FP64}}; -std::array kPlace = {CPUPlace(), CUDAPlace(0)}; +std::array kPlace = {{CPUPlace(), CUDAPlace(0)}}; -std::array kDataLayout = { +std::array kDataLayout = {{ DataLayout::kNHWC, DataLayout::kNCHW, -}; +}}; -std::array kLibraryType = { +std::array kLibraryType = {{ LibraryType::kPlain, LibraryType::kMKLDNN, -}; +}}; OpKernelType GenFromBit(const std::vector bits) { return OpKernelType(kDataType[bits[0]], kPlace[bits[1]], kDataLayout[bits[2]], 
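As an aside on the data-transform tests above: the four-bit comments such as `1111 -> FP64, GPUPlace, kNCHW, kMKLDNN` simply index the `kDataType`, `kPlace`, `kDataLayout` and `kLibraryType` arrays. A minimal sketch of `GenFromBit`'s lookup, with illustrative string stand-ins rather than the real C++ types:

```python
# Sketch of the bits -> (data type, place, layout, library) mapping.
DATA_TYPE = ["FP32", "FP64"]
PLACE = ["CPUPlace", "CUDAPlace(0)"]
LAYOUT = ["kNHWC", "kNCHW"]
LIBRARY = ["kPlain", "kMKLDNN"]

def gen_from_bit(bits):
    # bits = [data_type, place, layout, library], each 0 or 1
    return (DATA_TYPE[bits[0]], PLACE[bits[1]],
            LAYOUT[bits[2]], LIBRARY[bits[3]])

assert gen_from_bit([1, 1, 1, 1]) == ("FP64", "CUDAPlace(0)", "kNCHW", "kMKLDNN")
```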
From f3851fe58dbae3d5d6a450af76b97fb49aa4f4ba Mon Sep 17 00:00:00 2001 From: Luo Tao Date: Tue, 2 Jan 2018 21:18:26 +0800 Subject: [PATCH 165/181] auto pybind when *_op.cc contains several operators --- paddle/operators/CMakeLists.txt | 83 +++++---------------------------- 1 file changed, 11 insertions(+), 72 deletions(-) diff --git a/paddle/operators/CMakeLists.txt b/paddle/operators/CMakeLists.txt index 9f603474de..467963f666 100644 --- a/paddle/operators/CMakeLists.txt +++ b/paddle/operators/CMakeLists.txt @@ -71,74 +71,11 @@ function(op_library TARGET) file(APPEND ${pybind_file} "USE_OP(less_than);\nUSE_OP(equal);\n") endif() - # conv_op contains several operators - if ("${TARGET}" STREQUAL "conv_op") - set(pybind_flag 1) - # It's enough to just adding one operator to pybind - file(APPEND ${pybind_file} "USE_OP(conv2d);\n") - endif() - - # conv_cudnn_op contains several operators - if ("${TARGET}" STREQUAL "conv_cudnn_op") - set(pybind_flag 1) - # It's enough to just adding one operator to pybind - file(APPEND ${pybind_file} "USE_OP(conv2d_cudnn);\n") - endif() - - # pool_op contains several operators - if ("${TARGET}" STREQUAL "pool_op") - set(pybind_flag 1) - # It's enough to just adding one operator to pybind - file(APPEND ${pybind_file} "USE_OP(pool2d);\n") - endif() - - # pool_cudnn_op contains several operators - if ("${TARGET}" STREQUAL "pool_cudnn_op") - set(pybind_flag 1) - # It's enough to just adding one operator to pybind - file(APPEND ${pybind_file} "USE_OP(pool2d_cudnn);\n") - endif() - if ("${TARGET}" STREQUAL "logical_op") set(pybind_flag 1) file(APPEND ${pybind_file} "USE_OP(logical_and);\n") endif() - # pool_with_index_op contains several operators - if ("${TARGET}" STREQUAL "pool_with_index_op") - set(pybind_flag 1) - # It's enough to just adding one operator to pybind - file(APPEND ${pybind_file} "USE_OP(max_pool2d_with_index);\n") - endif() - - # conv_transpose_op contains several operators - if ("${TARGET}" STREQUAL "conv_transpose_op") - set(pybind_flag 1) - # It's enough to just adding one operator to pybind - file(APPEND ${pybind_file} "USE_OP(conv2d_transpose);\n") - endif() - - # conv_transpose_cudnn_op contains two operators - if ("${TARGET}" STREQUAL "conv_transpose_cudnn_op") - set(pybind_flag 1) - # It's enough to just adding one operator to pybind - file(APPEND ${pybind_file} "USE_OP(conv2d_transpose_cudnn);\n") - endif() - - # save_restore_op contains several operators - if ("${TARGET}" STREQUAL "save_restore_op") - set(pybind_flag 1) - # It's enough to just adding one operator to pybind - file(APPEND ${pybind_file} "USE_NO_KERNEL_OP(save);\n") - endif() - - # activation_op contains several operators - if ("${TARGET}" STREQUAL "activation_op") - set(pybind_flag 1) - # It's enough to just adding one operator to pybind - file(APPEND ${pybind_file} "USE_OP(sigmoid);\n") - endif() - # nccl_op contains several operators if ("${TARGET}" STREQUAL "nccl_op") set(pybind_flag 1) @@ -146,21 +83,24 @@ function(op_library TARGET) file(APPEND ${pybind_file} "USE_CUDA_ONLY_OP(ncclAllReduce);\n") endif() - # reduce_op contains several operators - if ("${TARGET}" STREQUAL "reduce_op") - set(pybind_flag 1) - # It's enough to just adding one operator to pybind - file(APPEND ${pybind_file} "USE_OP(reduce_sum);\n") - endif() - if ("${TARGET}" STREQUAL "tensor_array_read_write_op") set(pybind_flag 1) file(APPEND ${pybind_file} "USE_NO_KERNEL_OP(read_from_array);\nUSE_NO_KERNEL_OP(write_to_array);\n") endif() + file(READ ${TARGET}.cc TARGET_CONTENT) + # It's enough to just 
adding one operator to pybind + string(REGEX MATCH "REGISTER_OP\\(.*REGISTER_OP\\(" multi_register "${TARGET_CONTENT}") + string(REGEX MATCH "REGISTER_OP\\([a-z0-9_]*," one_register "${multi_register}") + if (one_register STREQUAL "") + string(REPLACE "_op" "" TARGET "${TARGET}") + else () + string(REPLACE "REGISTER_OP(" "" TARGET "${one_register}") + string(REPLACE "," "" TARGET "${TARGET}") + endif() + # pybind USE_NO_KERNEL_OP # HACK: if REGISTER_OP_CPU_KERNEL presents the operator must have kernel - file(READ ${TARGET}.cc TARGET_CONTENT) string(REGEX MATCH "REGISTER_OP_CPU_KERNEL" regex_result "${TARGET_CONTENT}") string(REPLACE "_op" "" TARGET "${TARGET}") if (${pybind_flag} EQUAL 0 AND regex_result STREQUAL "") @@ -171,7 +111,6 @@ function(op_library TARGET) # pybind USE_CPU_ONLY_OP list(LENGTH cu_srcs cu_srcs_len) list(LENGTH cu_cc_srcs cu_cc_srcs_len) - if (${pybind_flag} EQUAL 0 AND ${cu_srcs_len} EQUAL 0 AND ${cu_cc_srcs_len} EQUAL 0) file(APPEND ${pybind_file} "USE_CPU_ONLY_OP(${TARGET});\n") set(pybind_flag 1) From e4e95beedc3cabd73e3d37faf6c6d95c96f955df Mon Sep 17 00:00:00 2001 From: Luo Tao Date: Tue, 2 Jan 2018 21:21:42 +0800 Subject: [PATCH 166/181] manually pybind some specific operators --- paddle/operators/CMakeLists.txt | 35 +++++++++------------------------ 1 file changed, 9 insertions(+), 26 deletions(-) diff --git a/paddle/operators/CMakeLists.txt b/paddle/operators/CMakeLists.txt index 467963f666..df737ed9b0 100644 --- a/paddle/operators/CMakeLists.txt +++ b/paddle/operators/CMakeLists.txt @@ -61,32 +61,12 @@ function(op_library TARGET) ${op_common_deps}) endif() - # net_op doesn't need pybind - if ("${TARGET}" STREQUAL "net_op") - set(pybind_flag 1) - endif() - - if ("${TARGET}" STREQUAL "compare_op") - set(pybind_flag 1) - file(APPEND ${pybind_file} "USE_OP(less_than);\nUSE_OP(equal);\n") - endif() - - if ("${TARGET}" STREQUAL "logical_op") - set(pybind_flag 1) - file(APPEND ${pybind_file} "USE_OP(logical_and);\n") - endif() - - # nccl_op contains several operators - if ("${TARGET}" STREQUAL "nccl_op") - set(pybind_flag 1) - # It's enough to just adding one operator to pybind - file(APPEND ${pybind_file} "USE_CUDA_ONLY_OP(ncclAllReduce);\n") - endif() - - if ("${TARGET}" STREQUAL "tensor_array_read_write_op") - set(pybind_flag 1) - file(APPEND ${pybind_file} "USE_NO_KERNEL_OP(read_from_array);\nUSE_NO_KERNEL_OP(write_to_array);\n") - endif() + # net_op doesn't need pybind, others will be pybind manually + foreach(manual_pybind_op "net_op" "compare_op" "logical_op" "nccl_op" "tensor_array_read_write_op") + if ("${TARGET}" STREQUAL "${manual_pybind_op}") + set(pybind_flag 1) + endif() + endforeach() file(READ ${TARGET}.cc TARGET_CONTENT) # It's enough to just adding one operator to pybind @@ -127,6 +107,7 @@ add_subdirectory(nccl) if(WITH_GPU) op_library(nccl_op DEPS nccl_common) + file(APPEND ${pybind_file} "USE_CUDA_ONLY_OP(ncclAllReduce);\n") else() set(DEPS_OPS ${DEPS_OPS} nccl_op) endif() @@ -177,6 +158,8 @@ list(REMOVE_ITEM GENERAL_OPS ${DEPS_OPS}) foreach(src ${GENERAL_OPS}) op_library(${src}) endforeach() +file(APPEND ${pybind_file} "USE_OP(less_than);\nUSE_OP(logical_and);\nUSE_NO_KERNEL_OP(read_from_array);\n") + set(GLOB_OP_LIB ${OP_LIBRARY} CACHE INTERNAL "Global OP library") From f3812825d06c2e9fb2311ea3890f70fc2dcf0836 Mon Sep 17 00:00:00 2001 From: kavyasrinet Date: Tue, 2 Jan 2018 13:43:47 -0800 Subject: [PATCH 167/181] Added documentation for topk (#6861) --- python/paddle/v2/fluid/layers/control_flow.py | 24 +++++++++++++++++++ 1 file changed, 24 
insertions(+) diff --git a/python/paddle/v2/fluid/layers/control_flow.py b/python/paddle/v2/fluid/layers/control_flow.py index a055cea1bf..588114a275 100644 --- a/python/paddle/v2/fluid/layers/control_flow.py +++ b/python/paddle/v2/fluid/layers/control_flow.py @@ -485,6 +485,30 @@ def max_sequence_len(rank_table): def topk(input, k): + """ + **topk** + + This function performs the operation that selects the k entries in the input + vector and outputs their values and indices as vectors. Thus topk_out[j] is + the j-th largest entry in input, and its index is topk_indices[j] + + Args: + input (Variable|list): The input tensor that has all the data. + k (int): The number of top elements that the function will pick. + + Returns: + Variable: The variable of type array that contains the k largest entries + from input. + Variable: The variable of type array that contains the indices of k + largest entries from input. + + Examples: + .. code-block:: python + + x = fluid.layers.data(name='x', shape=[10]) + k = 5 + array = fluid.layers.topk(x, k) + """ helper = LayerHelper('topk', **locals()) topk_out = helper.create_tmp_variable(dtype=input.data_type) topk_indices = helper.create_tmp_variable(dtype='int64') From e9a60e4c8e7f73b3b1e33cec4fd2d855055cd1eb Mon Sep 17 00:00:00 2001 From: Abhinav Arora Date: Tue, 2 Jan 2018 14:45:42 -0800 Subject: [PATCH 168/181] Adding API docs for ones and zeros methods (#7150) --- python/paddle/v2/fluid/layers/tensor.py | 40 ++++++++++++++++++++++--- 1 file changed, 36 insertions(+), 4 deletions(-) diff --git a/python/paddle/v2/fluid/layers/tensor.py b/python/paddle/v2/fluid/layers/tensor.py index e5820d24cd..9ce25a9e08 100644 --- a/python/paddle/v2/fluid/layers/tensor.py +++ b/python/paddle/v2/fluid/layers/tensor.py @@ -201,15 +201,47 @@ def fill_constant_batch_size_like(input, def ones(shape, dtype): """ - This function performs the same function as fill_constant() declared above - with the constant value being 1.0. + **ones** + + This function creates a tensor of specified *shape* and + *dtype*, and initializes this with 1. + + It also sets *stop_gradient* to True. + + Args: + shape(tuple|list|None): Shape of output tensor + dtype(np.dtype|core.DataType|str): Data type of output tensor + + Returns: + Variable: The tensor variable storing the output + + Examples: + .. code-block:: python + + data = fluid.layers.ones(shape=[1], dtype='int64') """ return fill_constant(value=1.0, **locals()) def zeros(shape, dtype): """ - This function performs the same function as fill_constant() declared above - with the constant value being 0.0. + **zeros** + + This function creates a tensor of specified *shape* and + *dtype*, and initializes this with 0. + + It also sets *stop_gradient* to True. + + Args: + shape(tuple|list|None): Shape of output tensor + dtype(np.dtype|core.DataType|str): Data type of output tensor + + Returns: + Variable: The tensor variable storing the output + + Examples: + .. 
code-block:: python + + data = fluid.layers.zeros(shape=[1], dtype='int64') """ return fill_constant(value=0.0, **locals()) From 27fea24fd15a3d878df78786374820e78d83c045 Mon Sep 17 00:00:00 2001 From: kavyasrinet Date: Tue, 2 Jan 2018 14:46:13 -0800 Subject: [PATCH 169/181] Adding document for fluid split_lod_tensor and merge_lod_tensor (#6859) * Adding document for fluid split_lod_tensor * Adding document for fluid merge_lod_tensor --- python/paddle/v2/fluid/layers/control_flow.py | 64 +++++++++++++++++++ 1 file changed, 64 insertions(+) diff --git a/python/paddle/v2/fluid/layers/control_flow.py b/python/paddle/v2/fluid/layers/control_flow.py index 588114a275..acc22bef98 100644 --- a/python/paddle/v2/fluid/layers/control_flow.py +++ b/python/paddle/v2/fluid/layers/control_flow.py @@ -16,6 +16,36 @@ __all__ = [ def split_lod_tensor(input, mask, level=0): + """ + **split_lod_tensor** + + This function takes in an input that contains the complete lod information, + and takes in a mask which is used to mask certain parts of the input. + The output is the true branch and the false branch with the mask applied to + the input at a certain level in the tensor. + + Args: + input(tuple|list|None): The input tensor that contains complete + lod information needed to construct the output. + mask(list): A bool column vector which masks the input. + level(int): The specific lod level to rank. + + Returns: + Variable: The true branch of tensor as per the mask applied to input. + Variable: The false branch of tensor as per the mask applied to input. + + Examples: + .. code-block:: python + + x = layers.data(name='x', shape=[1]) + x.persistable = True + + y = layers.data(name='y', shape=[1]) + y.persistable = True + + out_true, out_false = layers.split_lod_tensor( + input=x, mask=y, level=level) + """ helper = LayerHelper('split_lod_tensor', **locals()) out_true = helper.create_tmp_variable(dtype=input.dtype) out_false = helper.create_tmp_variable(dtype=input.dtype) @@ -32,6 +62,40 @@ def split_lod_tensor(input, mask, level=0): def merge_lod_tensor(in_true, in_false, x, mask, level=0): + """ + **merge_lod_tensor** + + This function takes in an input :math:`x`, the True branch, the False + branch and a binary :math:`mask`. Using this information, this function + merges the True and False branches of the tensor into a single output + at a certain lod level indicated by :math:`level`. + + Args: + in_true(tuple|list|None): The True branch to be merged. + in_false(tuple|list|None): The False branch to be merged. + x(tuple|list|None): The input tensor that contains complete + lod information needed to construct the output. + mask(list): A bool column vector which masks the input. + level(int): The specific lod level to rank. + + Returns: + Variable: The merged output tensor. + + Examples: + ..
code-block:: python + + x = layers.data( + name='x', shape=[1], dtype='float32', stop_gradient=False) + y = layers.data( + name='y', shape=[1], dtype='bool', stop_gradient=False) + + level = 0 + + out_true, out_false = layers.split_lod_tensor( + input=x, mask=y, level=level) + out = layers.merge_lod_tensor( + in_true=out_true, in_false=out_false, mask=y, x=x, level=level) + """ helper = LayerHelper('merge_lod_tensor', **locals()) out = helper.create_tmp_variable(dtype=in_true.dtype) helper.append_op( From 87f46ebb368929feae76b7d909944b317d7dad92 Mon Sep 17 00:00:00 2001 From: Siddharth Goyal Date: Tue, 2 Jan 2018 14:46:49 -0800 Subject: [PATCH 170/181] Add squared error layers doc (#6862) --- python/paddle/v2/fluid/layers/nn.py | 32 +++++++++++++++++++++++++++-- 1 file changed, 30 insertions(+), 2 deletions(-) diff --git a/python/paddle/v2/fluid/layers/nn.py b/python/paddle/v2/fluid/layers/nn.py index 55b35ad543..55d8bf8a8a 100644 --- a/python/paddle/v2/fluid/layers/nn.py +++ b/python/paddle/v2/fluid/layers/nn.py @@ -426,8 +426,36 @@ def cross_entropy(input, label, **kwargs): def square_error_cost(input, label, **kwargs): """ - This functions returns the squared error cost using the input and label. - The output is appending the op to do the above. + **Square error cost layer** + + This layer accepts input predictions and target label and returns the squared error cost. + For predictions, :math:`X`, and target labels, :math:`Y`, the equation is: + + .. math:: + + Out = (X - Y)^2 + + In the above equation: + + * :math:`X`: Input predictions, a tensor. + * :math:`Y`: Input labels, a tensor. + * :math:`Out`: Output value, same shape with :math:`X`. + + Args: + input(Variable): Input tensor, has predictions. + label(Variable): Label tensor, has target labels. + + Returns: + Variable: The tensor variable storing the element-wise squared error difference \ + of input and label. + + Examples: + .. 
code-block:: python + + y = layers.data(name='y', shape=[1], dtype='float32') + y_predict = layers.data(name='y_predict', shape=[1], dtype='float32') + cost = layers.square_error_cost(input=y_predict, label=y) + """ helper = LayerHelper('square_error_cost', **kwargs) minus_out = helper.create_tmp_variable(dtype=input.dtype) From df2b054b13d19d467afa51aafdf1871569c6fa56 Mon Sep 17 00:00:00 2001 From: tensor-tang Date: Wed, 3 Jan 2018 11:37:55 +0800 Subject: [PATCH 171/181] follow comments refine code --- .../layers/MKLPackedRecurrentLayer.cpp | 64 ++++++++----------- .../gserver/layers/MKLPackedRecurrentLayer.h | 29 ++------- paddle/gserver/layers/MKLPackedWeight.h | 20 +----- paddle/gserver/layers/RecurrentLayer.cpp | 4 -- 4 files changed, 36 insertions(+), 81 deletions(-) diff --git a/paddle/gserver/layers/MKLPackedRecurrentLayer.cpp b/paddle/gserver/layers/MKLPackedRecurrentLayer.cpp index bd3c4ceb5e..b4a6413048 100644 --- a/paddle/gserver/layers/MKLPackedRecurrentLayer.cpp +++ b/paddle/gserver/layers/MKLPackedRecurrentLayer.cpp @@ -53,28 +53,19 @@ void MKLPackedRecurrentLayer::forwardBatch(int batchSize, REGISTER_TIMER_INFO("RecurrentFwBatch", getName().c_str()); /* forward one batch */ for (size_t n = 0; n < batchValue_->getNumBatch(); n++) { - MatrixPtr batch2 = batchValue_->getBatchValue(n); + MatrixPtr batchValue = batchValue_->getBatchValue(n); if (n != 0) { - MatrixPtr batch1 = - batchValue_->getBatchValue(n - 1, batch2->getHeight()); + MatrixPtr preBatchValue = + batchValue_->getBatchValue(n - 1, batchValue->getHeight()); - // batch2->mul(*batch1, *weight_->getW(), 1, 1); - packed_weight_->compute(batch2, batch1); - } - -#pragma omp parallel for collapse(2) - for (size_t i = 0; i < batch2->getHeight(); i++) { - for (size_t j = 0; j < batch2->getWidth(); j++) { - *(batch2->getData() + i * batch2->getWidth() + j) = - *(batch2->getData() + i * batch2->getWidth() + j) > 0 - ? 
*(batch2->getData() + i * batch2->getWidth() + j) - : 0; - } + packed_weight_->compute(batchValue, preBatchValue); } + Argument arg; + arg.value = batchValue; + activation_->forward(arg).check(); } } - batchValue_->copyBackSeq(*output_.value); } @@ -94,25 +85,27 @@ void MKLPackedRecurrentLayer::backwardBatch(int batchSize, REGISTER_TIMER_INFO("RecurrentBwData", getName().c_str()); /* backward one batch */ for (int n = (int)numBatch - 1; n >= 0; n--) { - MatrixPtr batch2 = batchGrad_->getBatchValue(n); - MatrixPtr batch1 = batchValue_->getBatchValue(n, batch2->getHeight()); + MatrixPtr batchGrad = batchGrad_->getBatchValue(n); + MatrixPtr batchValue = + batchValue_->getBatchValue(n, batchGrad->getHeight()); Argument arg; - arg.value = batch1; - arg.grad = batch2; + arg.value = batchValue; + arg.grad = batchGrad; activation_->backward(arg).check(); if (n != 0) { - batch1 = batchGrad_->getBatchValue(n - 1, batch2->getHeight()); - // batch1->mul(*batch2, *weightT, 1, 1); - packed_weightT_->compute(batch1, batch2); + batchValue = batchGrad_->getBatchValue(n - 1, batchGrad->getHeight()); + packed_weightT_->compute(batchValue, batchGrad); } if (backwardByBatch && weight_->getWGrad()) { if (n != 0) { /* backward weight */ - batch1 = batchValue_->getBatchValue(n - 1, batch2->getHeight()); - weight_->getWGrad()->mul(*batch1->getTranspose(), *batch2, 1, 1); + batchValue = + batchValue_->getBatchValue(n - 1, batchGrad->getHeight()); + weight_->getWGrad()->mul( + *batchValue->getTranspose(), *batchGrad, 1, 1); } } } @@ -124,19 +117,14 @@ void MKLPackedRecurrentLayer::backwardBatch(int batchSize, REGISTER_TIMER_INFO("RecurrentBwWeight", getName().c_str()); for (size_t seq = 0; seq < numSequences; ++seq) { int len = starts[seq + 1] - starts[seq]; - if (!reversed_) { - weight_->getWGrad()->mul( - *output_.value->subMatrix(starts[seq], len - 1)->getTranspose(), - *output_.grad->subMatrix(starts[seq] + 1, len - 1), - 1, - 1); - } else { - weight_->getWGrad()->mul( - *output_.value->subMatrix(starts[seq] + 1, len - 1)->getTranspose(), - *output_.grad->subMatrix(starts[seq], len - 1), - 1, - 1); - } + weight_->getWGrad()->mul( + *output_.value + ->subMatrix(reversed_ ? starts[seq] + 1 : starts[seq], len - 1) + ->getTranspose(), + *output_.grad->subMatrix(reversed_ ? starts[seq] : starts[seq] + 1, + len - 1), + 1, + 1); } } } diff --git a/paddle/gserver/layers/MKLPackedRecurrentLayer.h b/paddle/gserver/layers/MKLPackedRecurrentLayer.h index ba6487b11e..19874d538e 100644 --- a/paddle/gserver/layers/MKLPackedRecurrentLayer.h +++ b/paddle/gserver/layers/MKLPackedRecurrentLayer.h @@ -14,36 +14,18 @@ limitations under the License. */ #pragma once -#include -#include "Layer.h" #include "MKLPackedWeight.h" #include "RecurrentLayer.h" -#include "SequenceToBatch.h" -#include "paddle/utils/Stat.h" DECLARE_bool(rnn_use_batch); namespace paddle { /** - * @brief MKLPackedRecurrentLayer takes 1 input layer. The output size is the - * same with - * input layer. - * For each sequence [start, end] it performs the following computation: - * \f[ - * out_{i} = act(in_{i}) \ \ \text{for} \ i = start \\ - * out_{i} = act(in_{i} + out_{i-1} * W) \ \ \text{for} \ start < i <= end - * - * \f] - * If reversed is true, the order is reversed: - * \f[ - * out_{i} = act(in_{i}) \ \ \text{for} \ i = end \\ - * out_{i} = act(in_{i} + out_{i+1} * W) \ \ \text{for} \ start <= i < end - * \f] - * There are two methods to calculate rnn. One way is to compute rnn one - * sequence by one sequence. 
The other way is to reorganize the input - * into batches, then compute rnn one batch by one batch. Users can select - * them by rnn_use_batch flag. + * @brief MKLPackedRecurrentLayer is same with RecurrentLayer but is optimized + * with MKL cblas packed gemm. + * More details: + * https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/mkl/mkl_packed.md */ class MKLPackedRecurrentLayer : public RecurrentLayer { @@ -66,7 +48,10 @@ protected: const int* starts) override; protected: + /// packed_weight_ is contains same data with + /// RecurrentLayer::weight_ but is packed std::unique_ptr packed_weight_; + /// packed_weightT_ is the transposition matrix of packed_weight_ std::unique_ptr packed_weightT_; }; diff --git a/paddle/gserver/layers/MKLPackedWeight.h b/paddle/gserver/layers/MKLPackedWeight.h index cc8a336154..f77aa4dbbf 100644 --- a/paddle/gserver/layers/MKLPackedWeight.h +++ b/paddle/gserver/layers/MKLPackedWeight.h @@ -22,7 +22,9 @@ namespace paddle { class MKLPackedWeight { protected: + /// The pointor of weight real *weight_; + /// The pointor of cblas packed gemm to weight real *packedWeight_; size_t height_; size_t width_; @@ -41,7 +43,7 @@ public: void pack() { pack_(weight_); } - void compute(MatrixPtr dst, MatrixPtr src) { + void compute(MatrixPtr dst, const MatrixPtr src) { cblas_sgemm_compute(CblasRowMajor, CblasNoTrans, CblasPacked, @@ -57,22 +59,6 @@ public: dst->getWidth()); } - void compute(size_t M, real *A, size_t lda, real *C, size_t ldc) { - cblas_sgemm_compute(CblasRowMajor, - CblasNoTrans, - CblasPacked, - M, - width_, - height_, - A, - lda, - packedWeight_, - width_, - 1.0, - C, - ldc); - } - protected: void pack_(real *src) { if (!packedWeight_) { diff --git a/paddle/gserver/layers/RecurrentLayer.cpp b/paddle/gserver/layers/RecurrentLayer.cpp index 285b11b5a0..6bd42c06ca 100644 --- a/paddle/gserver/layers/RecurrentLayer.cpp +++ b/paddle/gserver/layers/RecurrentLayer.cpp @@ -13,10 +13,6 @@ See the License for the specific language governing permissions and limitations under the License. 
*/ #include "RecurrentLayer.h" -#include -#include "Layer.h" -#include "SequenceToBatch.h" -#include "paddle/utils/Stat.h" DEFINE_bool(rnn_use_batch, false, "Using the batch method for calculation."); From 89cb3a249cc5ecbf3955ea37e67655ec431142e3 Mon Sep 17 00:00:00 2001 From: tensor-tang Date: Wed, 3 Jan 2018 14:02:01 +0800 Subject: [PATCH 172/181] follow comments, refine comment and function name --- paddle/gserver/layers/MKLPackedRecurrentLayer.cpp | 4 ++-- paddle/gserver/layers/MKLPackedRecurrentLayer.h | 6 +++--- paddle/gserver/layers/MKLPackedWeight.h | 6 +++--- 3 files changed, 8 insertions(+), 8 deletions(-) diff --git a/paddle/gserver/layers/MKLPackedRecurrentLayer.cpp b/paddle/gserver/layers/MKLPackedRecurrentLayer.cpp index b4a6413048..dd75555fae 100644 --- a/paddle/gserver/layers/MKLPackedRecurrentLayer.cpp +++ b/paddle/gserver/layers/MKLPackedRecurrentLayer.cpp @@ -59,7 +59,7 @@ void MKLPackedRecurrentLayer::forwardBatch(int batchSize, MatrixPtr preBatchValue = batchValue_->getBatchValue(n - 1, batchValue->getHeight()); - packed_weight_->compute(batchValue, preBatchValue); + packed_weight_->gemm_compute(preBatchValue, batchValue); } Argument arg; arg.value = batchValue; @@ -96,7 +96,7 @@ void MKLPackedRecurrentLayer::backwardBatch(int batchSize, if (n != 0) { batchValue = batchGrad_->getBatchValue(n - 1, batchGrad->getHeight()); - packed_weightT_->compute(batchValue, batchGrad); + packed_weightT_->gemm_compute(batchGrad, batchValue); } if (backwardByBatch && weight_->getWGrad()) { diff --git a/paddle/gserver/layers/MKLPackedRecurrentLayer.h b/paddle/gserver/layers/MKLPackedRecurrentLayer.h index 19874d538e..bded523a8f 100644 --- a/paddle/gserver/layers/MKLPackedRecurrentLayer.h +++ b/paddle/gserver/layers/MKLPackedRecurrentLayer.h @@ -22,8 +22,8 @@ DECLARE_bool(rnn_use_batch); namespace paddle { /** - * @brief MKLPackedRecurrentLayer is same with RecurrentLayer but is optimized - * with MKL cblas packed gemm. + * @brief MKLPackedRecurrentLayer is almost the same with RecurrentLayer + * but is optimized with MKL cblas packed gemm. * More details: * https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/mkl/mkl_packed.md */ @@ -48,7 +48,7 @@ protected: const int* starts) override; protected: - /// packed_weight_ is contains same data with + /// packed_weight_ contains same data with /// RecurrentLayer::weight_ but is packed std::unique_ptr packed_weight_; /// packed_weightT_ is the transposition matrix of packed_weight_ diff --git a/paddle/gserver/layers/MKLPackedWeight.h b/paddle/gserver/layers/MKLPackedWeight.h index f77aa4dbbf..15d5093beb 100644 --- a/paddle/gserver/layers/MKLPackedWeight.h +++ b/paddle/gserver/layers/MKLPackedWeight.h @@ -22,9 +22,9 @@ namespace paddle { class MKLPackedWeight { protected: - /// The pointor of weight + /// The pointer of weight real *weight_; - /// The pointor of cblas packed gemm to weight + /// The pointer of cblas packed gemm to weight real *packedWeight_; size_t height_; size_t width_; @@ -43,7 +43,7 @@ public: void pack() { pack_(weight_); } - void compute(MatrixPtr dst, const MatrixPtr src) { + void gemm_compute(const MatrixPtr src, MatrixPtr dst) { cblas_sgemm_compute(CblasRowMajor, CblasNoTrans, CblasPacked, From f0e797e5b70bf098b407f0ef4983b2bd8f853609 Mon Sep 17 00:00:00 2001 From: yangyaming Date: Wed, 3 Jan 2018 14:12:15 +0800 Subject: [PATCH 173/181] Doc fix and enhancement for lstm_unit python wrapper. 
--- python/paddle/v2/fluid/layers/nn.py | 126 +++++++++++++++------------- 1 file changed, 66 insertions(+), 60 deletions(-) diff --git a/python/paddle/v2/fluid/layers/nn.py b/python/paddle/v2/fluid/layers/nn.py index 55d8bf8a8a..1a2019d1f2 100644 --- a/python/paddle/v2/fluid/layers/nn.py +++ b/python/paddle/v2/fluid/layers/nn.py @@ -151,7 +151,7 @@ def embedding(input, size, is_sparse=False, param_attr=None, dtype='float32'): Args: input(Variable): Input to the function - size(tuple|list|None): Shape of the look up table parameter + size(tuple|list|None): Shape of the look up table parameter is_sparse(bool): Boolean flag that specifying whether the input is sparse param_attr(ParamAttr): Parameters for this layer dtype(np.dtype|core.DataType|str): The type of data : float32, float_16, int etc @@ -366,9 +366,9 @@ def cross_entropy(input, label, **kwargs): 1) One-hot cross-entropy: `soft_label = False`, `Label[i, 0]` indicates the class index for sample i: - + .. math:: - + Y[i] = -\log(X[i, Label[i]]) 2) Soft-label cross-entropy: @@ -386,15 +386,15 @@ def cross_entropy(input, label, **kwargs): As a special case of 2), when each row of 'label' has only one non-zero element which is equal to 1, soft-label cross-entropy degenerates to a one-hot cross-entropy with one-hot label representation. - + Args: - input (Variable|list): a 2-D tensor with shape [N x D], where N is the - batch size and D is the number of classes. This input is a probability + input (Variable|list): a 2-D tensor with shape [N x D], where N is the + batch size and D is the number of classes. This input is a probability computed by the previous operator, which is almost always the result of a softmax operator. - label (Variable|list): the ground truth which is a 2-D tensor. When - `soft_label` is set to `False`, `label` is a tensor with shape - [N x 1]. When `soft_label` is set to `True`, `label` is a + label (Variable|list): the ground truth which is a 2-D tensor. When + `soft_label` is set to `False`, `label` is a tensor with shape + [N x 1]. When `soft_label` is set to `True`, `label` is a tensor with shape [N x D]. soft_label (bool, via `**kwargs`): a flag indicating whether to interpretate the given labels as soft labels, default `False`. @@ -403,7 +403,7 @@ def cross_entropy(input, label, **kwargs): A 2-D tensor with shape [N x 1], the cross entropy loss. Raises: - `ValueError`: 1) the 1st dimension of `input` and `label` are not equal; 2) when \ + `ValueError`: 1) the 1st dimension of `input` and `label` are not equal; 2) when \ `soft_label == True`, and the 2nd dimension of `input` and `label` are not \ equal; 3) when `soft_label == False`, and the 2nd dimension of `label` is not 1. @@ -727,9 +727,9 @@ def conv2d(input, def sequence_pool(input, pool_type, **kwargs): """ - This function add the operator for sequence pooling. - It pools features of all time-steps of each instance, and is applied - on top of the input using pool_type mentioned in the parameters. + This function add the operator for sequence pooling. + It pools features of all time-steps of each instance, and is applied + on top of the input using pool_type mentioned in the parameters. It supports four pool_type: @@ -758,7 +758,7 @@ def sequence_pool(input, pool_type, **kwargs): Args: input(variable): The input variable which is a LoDTensor. - pool_type (string): The pooling type of sequence_pool. + pool_type (string): The pooling type of sequence_pool. It supports average, sum, sqrt and max. 
Returns: @@ -768,7 +768,7 @@ def sequence_pool(input, pool_type, **kwargs): .. code-block:: python - x = fluid.layers.data(name='x', shape=[7, 1], + x = fluid.layers.data(name='x', shape=[7, 1], dtype='float32', lod_level=1) avg_x = fluid.layers.sequence_pool(input=x, pool_type='average') sum_x = fluid.layers.sequence_pool(input=x, pool_type='sum') @@ -816,7 +816,7 @@ def sequence_first_step(input, **kwargs): .. code-block:: python - x = fluid.layers.data(name='x', shape=[7, 1], + x = fluid.layers.data(name='x', shape=[7, 1], dtype='float32', lod_level=1) x_first_step = fluid.layers.sequence_first_step(input=x) """ @@ -849,7 +849,7 @@ def sequence_last_step(input, **kwargs): .. code-block:: python - x = fluid.layers.data(name='x', shape=[7, 1], + x = fluid.layers.data(name='x', shape=[7, 1], dtype='float32', lod_level=1) x_last_step = fluid.layers.sequence_last_step(input=x) """ @@ -1168,25 +1168,26 @@ def lstm_unit(x_t, .. math:: - i_t & = \sigma(W_{x_i}x_{t} + W_{h_i}h_{t-1} + W_{c_i}c_{t-1} + b_i) + i_t & = \sigma(W_{x_i}x_{t} + W_{h_i}h_{t-1} + b_i) - f_t & = \sigma(W_{x_f}x_{t} + W_{h_f}h_{t-1} + W_{c_f}c_{t-1} + b_f) + f_t & = \sigma(W_{x_f}x_{t} + W_{h_f}h_{t-1} + b_f) - c_t & = f_tc_{t-1} + i_t tanh (W_{x_c}x_t+W_{h_c}h_{t-1} + b_c) + c_t & = f_tc_{t-1} + i_t tanh (W_{x_c}x_t + W_{h_c}h_{t-1} + b_c) - o_t & = \sigma(W_{x_o}x_{t} + W_{h_o}h_{t-1} + W_{c_o}c_t + b_o) + o_t & = \sigma(W_{x_o}x_{t} + W_{h_o}h_{t-1} + b_o) h_t & = o_t tanh(c_t) - The inputs of lstm unit includes :math:`x_t`, :math:`h_{t-1}` and - :math:`c_{t-1}`. The implementation separates the linear transformation - and non-linear transformation apart. Here, we take :math:`i_t` as an - example. The linear transformation is applied by calling a `fc` layer and - the equation is: + The inputs of lstm unit include :math:`x_t`, :math:`h_{t-1}` and + :math:`c_{t-1}`. The 2nd dimensions of :math:`h_{t-1}` and :math:`c_{t-1}` + should be same. The implementation separates the linear transformation and + non-linear transformation apart. Here, we take :math:`i_t` as an example. + The linear transformation is applied by calling a `fc` layer and the + equation is: .. math:: - L_{i_t} = W_{x_i}x_{t} + W_{h_i}h_{t-1} + W_{c_i}c_{t-1} + b_i + L_{i_t} = W_{x_i}x_{t} + W_{h_i}h_{t-1} + b_i The non-linear transformation is applied by calling `lstm_unit_op` and the equation is: @@ -1213,14 +1214,15 @@ def lstm_unit(x_t, Raises: ValueError: The ranks of **x_t**, **hidden_t_prev** and **cell_t_prev**\ not be 2 or the 1st dimensions of **x_t**, **hidden_t_prev** \ - and **cell_t_prev** not be the same. + and **cell_t_prev** not be the same or the 2nd dimensions of \ + **hidden_t_prev** and **cell_t_prev** not be the same. Examples: .. 
code-block:: python x_t = fluid.layers.fc(input=x_t_data, size=10) - prev_hidden = fluid.layers.fc(input=prev_hidden_data, size=20) + prev_hidden = fluid.layers.fc(input=prev_hidden_data, size=30) prev_cell = fluid.layers.fc(input=prev_cell_data, size=30) hidden_value, cell_value = fluid.layers.lstm_unit(x_t=x_t, hidden_t_prev=prev_hidden, @@ -1239,7 +1241,11 @@ def lstm_unit(x_t, if x_t.shape[0] != hidden_t_prev.shape[0] or x_t.shape[ 0] != cell_t_prev.shape[0]: - raise ValueError("The 1s dimension of x_t, hidden_t_prev and " + raise ValueError("The 1s dimensions of x_t, hidden_t_prev and " + "cell_t_prev must be the same.") + + if hidden_t_prev.shape[1] != cell_t_prev.shape[1]: + raise ValueError("The 2nd dimensions of hidden_t_prev and " "cell_t_prev must be the same.") if bias_attr is None: @@ -1268,17 +1274,17 @@ def lstm_unit(x_t, def reduce_sum(input, dim=None, keep_dim=False): """ - Computes the sum of tensor elements over the given dimension. + Computes the sum of tensor elements over the given dimension. Args: input (Variable): The input variable which is a Tensor or LoDTensor. - dim (int|None): The dimension along which the sum is performed. If - :attr:`None`, sum all elements of :attr:`input` and return a - Tensor variable with a single element, otherwise must be in the - range :math:`[-rank(input), rank(input))`. If :math:`dim < 0`, + dim (int|None): The dimension along which the sum is performed. If + :attr:`None`, sum all elements of :attr:`input` and return a + Tensor variable with a single element, otherwise must be in the + range :math:`[-rank(input), rank(input))`. If :math:`dim < 0`, the dimension to reduce is :math:`rank + dim`. - keep_dim (bool): Whether to reserve the reduced dimension in the - output Tensor. The result tensor will have one fewer dimension + keep_dim (bool): Whether to reserve the reduced dimension in the + output Tensor. The result tensor will have one fewer dimension than the :attr:`input` unless :attr:`keep_dim` is true. Returns: @@ -1312,17 +1318,17 @@ def reduce_sum(input, dim=None, keep_dim=False): def reduce_mean(input, dim=None, keep_dim=False): """ - Computes the mean of tensor elements over the given dimension. + Computes the mean of tensor elements over the given dimension. Args: input (Variable): The input variable which is a Tensor or LoDTensor. - dim (int|None): The dimension along which the mean is computed. If - :attr:`None`, compute the mean over all elements of :attr:`input` - and return a Tensor variable with a single element, otherwise - must be in the range :math:`[-rank(input), rank(input))`. If + dim (int|None): The dimension along which the mean is computed. If + :attr:`None`, compute the mean over all elements of :attr:`input` + and return a Tensor variable with a single element, otherwise + must be in the range :math:`[-rank(input), rank(input))`. If :math:`dim < 0`, the dimension to reduce is :math:`rank + dim`. - keep_dim (bool): Whether to reserve the reduced dimension in the - output Tensor. The result tensor will have one fewer dimension + keep_dim (bool): Whether to reserve the reduced dimension in the + output Tensor. The result tensor will have one fewer dimension than the :attr:`input` unless :attr:`keep_dim` is true. Returns: @@ -1356,22 +1362,22 @@ def reduce_mean(input, dim=None, keep_dim=False): def reduce_max(input, dim=None, keep_dim=False): """ - Computes the maximum of tensor elements over the given dimension. + Computes the maximum of tensor elements over the given dimension. 
Args: input (Variable): The input variable which is a Tensor or LoDTensor. - dim (int|None): The dimension along which the maximum is computed. - If :attr:`None`, compute the maximum over all elements of - :attr:`input` and return a Tensor variable with a single element, - otherwise must be in the range :math:`[-rank(input), rank(input))`. + dim (int|None): The dimension along which the maximum is computed. + If :attr:`None`, compute the maximum over all elements of + :attr:`input` and return a Tensor variable with a single element, + otherwise must be in the range :math:`[-rank(input), rank(input))`. If :math:`dim < 0`, the dimension to reduce is :math:`rank + dim`. - keep_dim (bool): Whether to reserve the reduced dimension in the - output Tensor. The result tensor will have one fewer dimension + keep_dim (bool): Whether to reserve the reduced dimension in the + output Tensor. The result tensor will have one fewer dimension than the :attr:`input` unless :attr:`keep_dim` is true. Returns: Variable: The reduced Tensor variable. - + Examples: .. code-block:: python @@ -1400,22 +1406,22 @@ def reduce_max(input, dim=None, keep_dim=False): def reduce_min(input, dim=None, keep_dim=False): """ - Computes the minimum of tensor elements over the given dimension. + Computes the minimum of tensor elements over the given dimension. Args: input (Variable): The input variable which is a Tensor or LoDTensor. - dim (int|None): The dimension along which the minimum is computed. - If :attr:`None`, compute the minimum over all elements of - :attr:`input` and return a Tensor variable with a single element, - otherwise must be in the range :math:`[-rank(input), rank(input))`. + dim (int|None): The dimension along which the minimum is computed. + If :attr:`None`, compute the minimum over all elements of + :attr:`input` and return a Tensor variable with a single element, + otherwise must be in the range :math:`[-rank(input), rank(input))`. If :math:`dim < 0`, the dimension to reduce is :math:`rank + dim`. - keep_dim (bool): Whether to reserve the reduced dimension in the - output Tensor. The result tensor will have one fewer dimension + keep_dim (bool): Whether to reserve the reduced dimension in the + output Tensor. The result tensor will have one fewer dimension than the :attr:`input` unless :attr:`keep_dim` is true. Returns: Variable: The reduced Tensor variable. - + Examples: .. code-block:: python From d6ec9630473712bf0a61b121030369b63a9996b8 Mon Sep 17 00:00:00 2001 From: yangyaming Date: Wed, 3 Jan 2018 14:20:33 +0800 Subject: [PATCH 174/181] Minor correction. --- python/paddle/v2/fluid/layers/nn.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/paddle/v2/fluid/layers/nn.py b/python/paddle/v2/fluid/layers/nn.py index 1a2019d1f2..09b71cc371 100644 --- a/python/paddle/v2/fluid/layers/nn.py +++ b/python/paddle/v2/fluid/layers/nn.py @@ -1241,7 +1241,7 @@ def lstm_unit(x_t, if x_t.shape[0] != hidden_t_prev.shape[0] or x_t.shape[ 0] != cell_t_prev.shape[0]: - raise ValueError("The 1s dimensions of x_t, hidden_t_prev and " + raise ValueError("The 1st dimensions of x_t, hidden_t_prev and " "cell_t_prev must be the same.") if hidden_t_prev.shape[1] != cell_t_prev.shape[1]: From c0f6f492bcc86dcb2a5702332915852734884b9a Mon Sep 17 00:00:00 2001 From: yangyaming Date: Wed, 3 Jan 2018 14:24:31 +0800 Subject: [PATCH 175/181] Add shape info for arguments. 
--- python/paddle/v2/fluid/layers/nn.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/python/paddle/v2/fluid/layers/nn.py b/python/paddle/v2/fluid/layers/nn.py index 09b71cc371..5442cce494 100644 --- a/python/paddle/v2/fluid/layers/nn.py +++ b/python/paddle/v2/fluid/layers/nn.py @@ -1199,9 +1199,9 @@ def lstm_unit(x_t, This layer has two outputs including :math:`h_t` and :math:`o_t`. Args: - x_t (Variable): The input value of current step. - hidden_t_prev (Variable): The hidden value of lstm unit. - cell_t_prev (Variable): The cell value of lstm unit. + x_t (Variable): The input value of current step, a 2-D tensor. + hidden_t_prev (Variable): The hidden value of lstm unit, a 2-D tensor. + cell_t_prev (Variable): The cell value of lstm unit, a 2-D tensor. forget_bias (float): The forget bias of lstm unit. param_attr (ParamAttr): The attributes of parameter weights, used to set initializer, name etc. From 5974c1b76e400da3b6f3e1dd8884fb006d48cc59 Mon Sep 17 00:00:00 2001 From: Luo Tao Date: Wed, 3 Jan 2018 15:09:24 +0800 Subject: [PATCH 176/181] refine comments in CMakelists.txt of operator --- paddle/operators/CMakeLists.txt | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/paddle/operators/CMakeLists.txt b/paddle/operators/CMakeLists.txt index df737ed9b0..a0b61640e5 100644 --- a/paddle/operators/CMakeLists.txt +++ b/paddle/operators/CMakeLists.txt @@ -61,7 +61,7 @@ function(op_library TARGET) ${op_common_deps}) endif() - # net_op doesn't need pybind, others will be pybind manually + # Define operators that don't need pybind here. foreach(manual_pybind_op "net_op" "compare_op" "logical_op" "nccl_op" "tensor_array_read_write_op") if ("${TARGET}" STREQUAL "${manual_pybind_op}") set(pybind_flag 1) @@ -69,7 +69,8 @@ function(op_library TARGET) endforeach() file(READ ${TARGET}.cc TARGET_CONTENT) - # It's enough to just adding one operator to pybind + # It's enough to just adding one operator to pybind. + # And for detail pybind information, please see paddle/pybind/pybind.h. string(REGEX MATCH "REGISTER_OP\\(.*REGISTER_OP\\(" multi_register "${TARGET_CONTENT}") string(REGEX MATCH "REGISTER_OP\\([a-z0-9_]*," one_register "${multi_register}") if (one_register STREQUAL "") From 60fecce43db68281112a91198d85a79a972f03f9 Mon Sep 17 00:00:00 2001 From: yangyaming Date: Wed, 3 Jan 2018 15:20:00 +0800 Subject: [PATCH 177/181] Fix unit test for lstm_unit. --- python/paddle/v2/fluid/layers/nn.py | 9 ++++++--- python/paddle/v2/fluid/tests/test_layers.py | 4 ++-- 2 files changed, 8 insertions(+), 5 deletions(-) diff --git a/python/paddle/v2/fluid/layers/nn.py b/python/paddle/v2/fluid/layers/nn.py index 5442cce494..1c1c09dd28 100644 --- a/python/paddle/v2/fluid/layers/nn.py +++ b/python/paddle/v2/fluid/layers/nn.py @@ -1199,9 +1199,12 @@ def lstm_unit(x_t, This layer has two outputs including :math:`h_t` and :math:`o_t`. Args: - x_t (Variable): The input value of current step, a 2-D tensor. - hidden_t_prev (Variable): The hidden value of lstm unit, a 2-D tensor. - cell_t_prev (Variable): The cell value of lstm unit, a 2-D tensor. + x_t (Variable): The input value of current step, a 2-D tensor with shape + M x N, M for batch size and N for input size. + hidden_t_prev (Variable): The hidden value of lstm unit, a 2-D tensor + with shape M x S, M for batch size and S for size of lstm unit. + cell_t_prev (Variable): The cell value of lstm unit, a 2-D tensor with + shape M x S, M for batch size and S for size of lstm unit. 
forget_bias (float): The forget bias of lstm unit. param_attr (ParamAttr): The attributes of parameter weights, used to set initializer, name etc. diff --git a/python/paddle/v2/fluid/tests/test_layers.py b/python/paddle/v2/fluid/tests/test_layers.py index 9d2dcca56d..77f0f11f1b 100644 --- a/python/paddle/v2/fluid/tests/test_layers.py +++ b/python/paddle/v2/fluid/tests/test_layers.py @@ -177,8 +177,8 @@ class TestBook(unittest.TestCase): name='x_t_data', shape=[10, 10], dtype='float32') x_t = layers.fc(input=x_t_data, size=10) prev_hidden_data = layers.data( - name='prev_hidden_data', shape=[10, 20], dtype='float32') - prev_hidden = layers.fc(input=prev_hidden_data, size=20) + name='prev_hidden_data', shape=[10, 30], dtype='float32') + prev_hidden = layers.fc(input=prev_hidden_data, size=30) prev_cell_data = layers.data( name='prev_cell', shape=[10, 30], dtype='float32') prev_cell = layers.fc(input=prev_cell_data, size=30) From 907e6d04de0c5ccc41b84952e5cc18d1f1a85531 Mon Sep 17 00:00:00 2001 From: QI JUN Date: Wed, 3 Jan 2018 17:57:33 +0800 Subject: [PATCH 178/181] Fix bug in SetAttrDescVisitor (#7165) * fix bug in SetAttrDescVisitor * add comments --- paddle/framework/op_desc.cc | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/paddle/framework/op_desc.cc b/paddle/framework/op_desc.cc index 781bbb4c19..3e58e6442e 100644 --- a/paddle/framework/op_desc.cc +++ b/paddle/framework/op_desc.cc @@ -260,7 +260,13 @@ struct SetAttrDescVisitor : public boost::static_visitor { void operator()(int v) const { attr_->set_i(v); } void operator()(float v) const { attr_->set_f(v); } void operator()(const std::string &v) const { attr_->set_s(v); } - void operator()(bool b) const { attr_->set_b(b); } + + // Please refer to https://github.com/PaddlePaddle/Paddle/issues/7162 + template <typename T, + typename = typename std::enable_if<std::is_same<bool, T>::value>::type> + void operator()(T b) const { + attr_->set_b(b); + } void operator()(const std::vector &v) const { VectorToRepeated(v, attr_->mutable_ints()); @@ -274,9 +280,7 @@ struct SetAttrDescVisitor : public boost::static_visitor { void operator()(const std::vector &v) const { VectorToRepeated(v, attr_->mutable_bools()); } - void operator()(proto::BlockDesc *desc) const { - attr_->set_block_idx(desc->idx()); - } + void operator()(BlockDesc *desc) const { attr_->set_block_idx(desc->ID()); } void operator()(boost::blank) const { PADDLE_THROW("Unexpected branch"); } }; From 2d2b633282523c494a99e02da092c87da0c87dc0 Mon Sep 17 00:00:00 2001 From: Luo Tao Date: Wed, 3 Jan 2018 19:53:22 +0800 Subject: [PATCH 179/181] add more comments in CMakelists.txt of operator --- paddle/framework/op_registry.h | 4 ++-- paddle/operators/CMakeLists.txt | 5 +++-- 2 files changed, 5 insertions(+), 4 deletions(-) diff --git a/paddle/framework/op_registry.h b/paddle/framework/op_registry.h index bdaa259181..d75c0233e8 100644 --- a/paddle/framework/op_registry.h +++ b/paddle/framework/op_registry.h @@ -37,8 +37,8 @@ class Registrar { public: // In our design, various kinds of classes, e.g., operators and kernels, // have their corresponding registry and registrar. The action of - // registration is in the constructor of a global registrar variable, which, - // however, are not used in the code that calls package framework, and would + // registration is in the constructor of a global registrar variable, which + // are not used in the code that calls package framework, and would
To avoid such // removal, we add Touch to all registrar classes and make USE_OP macros to // call this method. So, as long as the callee code calls USE_OP, the global diff --git a/paddle/operators/CMakeLists.txt b/paddle/operators/CMakeLists.txt index a0b61640e5..77b52eb176 100644 --- a/paddle/operators/CMakeLists.txt +++ b/paddle/operators/CMakeLists.txt @@ -68,9 +68,10 @@ function(op_library TARGET) endif() endforeach() + # The registration of USE_OP, please refer to paddle/framework/op_registry.h. + # Note that it's enough to just adding one operator to pybind in a *_op.cc file. + # And for detail pybind information, please see generated paddle/pybind/pybind.h. file(READ ${TARGET}.cc TARGET_CONTENT) - # It's enough to just adding one operator to pybind. - # And for detail pybind information, please see paddle/pybind/pybind.h. string(REGEX MATCH "REGISTER_OP\\(.*REGISTER_OP\\(" multi_register "${TARGET_CONTENT}") string(REGEX MATCH "REGISTER_OP\\([a-z0-9_]*," one_register "${multi_register}") if (one_register STREQUAL "") From 19541468b6a99b57a3ef130fba841fac721b75c8 Mon Sep 17 00:00:00 2001 From: dzhwinter Date: Wed, 3 Jan 2018 22:04:35 +0800 Subject: [PATCH 180/181] "fix frigled test gradient of rnn" (#7166) * "fix frigled test gradient of rnn" * "fix based on comments" --- paddle/gserver/tests/test_LayerGrad.cpp | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/paddle/gserver/tests/test_LayerGrad.cpp b/paddle/gserver/tests/test_LayerGrad.cpp index a2f07937b8..ba83667ebc 100644 --- a/paddle/gserver/tests/test_LayerGrad.cpp +++ b/paddle/gserver/tests/test_LayerGrad.cpp @@ -1472,7 +1472,8 @@ TEST(Layer, RecurrentLayer) { for (auto reversed : {false, true}) { config.layerConfig.set_reversed(reversed); config.testState = !reversed; - testLayerGrad(config, "recurrent", 50, /* trans= */ false, useGpu); + testLayerGrad( + config, "recurrent", 50, /* trans= */ false, useGpu, false, 1.0); } } } @@ -1494,7 +1495,8 @@ TEST(Layer, LstmLayer) { for (auto reversed : {false, true}) { config.layerConfig.set_reversed(reversed); config.testState = !reversed; - testLayerGrad(config, "lstmemory", 100, /* trans= */ false, useGpu); + testLayerGrad( + config, "lstmemory", 100, /* trans= */ false, useGpu, false, 0.02); } } for (auto useGpu : {true}) { From a893f156527942d9172d51ab6662748b7000d5bc Mon Sep 17 00:00:00 2001 From: dzhwinter Date: Thu, 4 Jan 2018 13:30:51 +0800 Subject: [PATCH 181/181] fix layout transform (#7149) * "fix typo" * "fix based on comments" * "follow gogle style" * "fix based on comemnts" --- paddle/framework/data_transform.cc | 36 +++++++++++++++++++------ paddle/framework/data_transform.h | 8 +++--- paddle/framework/data_transform_test.cc | 16 +++++++++-- 3 files changed, 47 insertions(+), 13 deletions(-) diff --git a/paddle/framework/data_transform.cc b/paddle/framework/data_transform.cc index 9d6a842442..ac6e40a3ae 100644 --- a/paddle/framework/data_transform.cc +++ b/paddle/framework/data_transform.cc @@ -11,6 +11,7 @@ distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
From a893f156527942d9172d51ab6662748b7000d5bc Mon Sep 17 00:00:00 2001
From: dzhwinter
Date: Thu, 4 Jan 2018 13:30:51 +0800
Subject: [PATCH 181/181] fix layout transform (#7149)

* "fix typo"

* "fix based on comments"

* "follow gogle style"

* "fix based on comemnts"
---
 paddle/framework/data_transform.cc      | 36 +++++++++++++++++++------
 paddle/framework/data_transform.h       |  8 +++---
 paddle/framework/data_transform_test.cc | 16 +++++++++--
 3 files changed, 47 insertions(+), 13 deletions(-)

diff --git a/paddle/framework/data_transform.cc b/paddle/framework/data_transform.cc
index 9d6a842442..ac6e40a3ae 100644
--- a/paddle/framework/data_transform.cc
+++ b/paddle/framework/data_transform.cc
@@ -11,6 +11,7 @@ distributed under the License is distributed on an "AS IS" BASIS,
 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
+#include <functional>
 
 #include "paddle/framework/data_transform.h"
 #include "paddle/framework/lod_tensor.h"
@@ -74,26 +75,28 @@ void TransDataType(const platform::DeviceContext* ctx,
   }
 }
 
-void TransDataLayout(const platform::DeviceContext* ctx,
+void TransDataLayout(const std::vector<int>& axis,
+                     const platform::DeviceContext* ctx,
                      const KernelTypePair& kernel_pair, const Variable& in,
                      Variable* out) {
-  PADDLE_ENFORCE(in.IsType<Tensor>(), "Only Support Tensor transform!.");
+  PADDLE_ENFORCE(in.IsType<Tensor>(), "Only support Tensor transform!.");
   PADDLE_ENFORCE(
       platform::places_are_same_class(kernel_pair.first.place_,
                                       kernel_pair.second.place_),
-      "TransDataType Only Support DataType transform on same place!");
+      "TransDataLayout only support DataLayout transform on same place!");
+  PADDLE_ENFORCE(kernel_pair.first.data_type_ == kernel_pair.second.data_type_,
+                 "TransDataLayout only support Datatype are same!");
 
   auto src = in.Get<Tensor>();
   auto* dst = out->GetMutable<Tensor>();
   PADDLE_ENFORCE(arity(src.dims()) == 4, "Input Arity Only Suppport 4!");
 
-  auto src_dim = src.dims();
-  dst->Resize(src_dim);
   auto place = kernel_pair.second.place_;
   CopyFrom(src, place, *ctx, dst);
-  const std::vector<int> axis = {0, 2, 3, 1};
 
+  auto src_dim = src.dims();
   std::vector<int64_t> dst_dim;
+
   dst_dim.resize(axis.size());
   for (size_t i = 0; i < axis.size(); i++) {
     dst_dim[i] = src_dim[axis[i]];
@@ -102,7 +105,7 @@ void TransDataLayout(const platform::DeviceContext* ctx,
   dst->Resize(make_ddim(dst_dim));
 
   auto src_type = kernel_pair.first.data_type_;
-  framework::VisitDataType(src_type, CastDataLayout(src, dst, ctx, axis));
+  framework::VisitDataType(src_type, CastDataLayout(ctx, axis, src, dst));
 
   dst->set_layout(kernel_pair.second.data_layout_);
 }
@@ -111,5 +114,22 @@
 }  // namespace paddle
 
 namespace f = paddle::framework;
+
+namespace {
+std::vector<int> NHWC2NCHW = {0, 3, 1, 2};
+std::vector<int> NCHW2NHWC = {0, 2, 3, 1};
+}
+
 REGISTER_DATA_TRANSFORM_FN(f::KernelFP32, f::KernelFP64, f::TransDataType);
-REGISTER_DATA_TRANSFORM_FN(f::KernelNHWC, f::KernelNCHW, f::TransDataLayout);
+REGISTER_DATA_TRANSFORM_FN(f::KernelNHWC, f::KernelNCHW,
+                           std::bind(f::TransDataLayout, NHWC2NCHW,
+                                     std::placeholders::_1,
+                                     std::placeholders::_2,
+                                     std::placeholders::_3,
+                                     std::placeholders::_4));
+REGISTER_DATA_TRANSFORM_FN(f::KernelNCHW, f::KernelNHWC,
+                           std::bind(f::TransDataLayout, NCHW2NHWC,
+                                     std::placeholders::_1,
+                                     std::placeholders::_2,
+                                     std::placeholders::_3,
+                                     std::placeholders::_4));

diff --git a/paddle/framework/data_transform.h b/paddle/framework/data_transform.h
index 9abb3c99bf..56ebc80f43 100644
--- a/paddle/framework/data_transform.h
+++ b/paddle/framework/data_transform.h
@@ -73,6 +73,7 @@ struct CastDataType {
     auto numel = in_.numel();
     auto* in_end = in_begin + numel;
     auto* out_begin = out_->mutable_data<OutType>(place);
+
     if (platform::is_cpu_place(place)) {
       platform::Transform<platform::CPUDeviceContext> trans;
       auto* context = static_cast<const platform::CPUDeviceContext*>(ctx_);
@@ -86,9 +87,9 @@ struct CastDataType {
 };
 
 struct CastDataLayout {
-  CastDataLayout(const framework::Tensor& in, framework::Tensor* out,
-                 const platform::DeviceContext* ctx,
-                 const std::vector<int>& axis)
+  CastDataLayout(const platform::DeviceContext* ctx,
+                 const std::vector<int>& axis, const framework::Tensor& in,
+                 framework::Tensor* out)
       : in_(in), out_(out), ctx_(ctx), axis_(axis) {}
   const framework::Tensor in_;
   framework::Tensor* out_;
@@ -98,6 +99,7 @@ struct CastDataLayout {
   template <typename T>
   void operator()() {
     auto place = ctx_->GetPlace();
+
    if (platform::is_cpu_place(place)) {
       operators::math::Transpose<platform::CPUDeviceContext, T, 4> trans4;
       auto* context = static_cast<const platform::CPUDeviceContext*>(ctx_);
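The reordered CastDataLayout above ultimately hands the element shuffling to operators::math::Transpose, while TransDataLayout's loop (dst_dim[i] = src_dim[axis[i]]) applies the same permutation to the extents. As a reference for what an NHWC-to-NCHW transpose means for the data itself, here is a naive standalone version; the helper name is invented, and the quadruple loop stands in for the optimized kernel.

    #include <cassert>
    #include <vector>

    // Copy element (n, h, w, c) of an NHWC buffer into slot (n, c, h, w)
    // of an NCHW buffer.
    std::vector<float> NhwcToNchw(const std::vector<float>& src, int N, int H,
                                  int W, int C) {
      std::vector<float> dst(src.size());
      for (int n = 0; n < N; ++n)
        for (int h = 0; h < H; ++h)
          for (int w = 0; w < W; ++w)
            for (int c = 0; c < C; ++c)
              dst[((n * C + c) * H + h) * W + w] =
                  src[((n * H + h) * W + w) * C + c];
      return dst;
    }

    int main() {
      // One image, 2x2 spatial, 2 channels, interleaved NHWC order.
      std::vector<float> nhwc = {1, 10, 2, 20, 3, 30, 4, 40};
      std::vector<float> nchw = NhwcToNchw(nhwc, 1, 2, 2, 2);
      // Channel-0 plane first, then the channel-1 plane.
      assert((nchw == std::vector<float>{1, 2, 3, 4, 10, 20, 30, 40}));
      return 0;
    }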
diff --git a/paddle/framework/data_transform_test.cc b/paddle/framework/data_transform_test.cc
index 8665b6248f..edd305fd17 100644
--- a/paddle/framework/data_transform_test.cc
+++ b/paddle/framework/data_transform_test.cc
@@ -106,7 +106,7 @@ TEST(DataTransform, Register) {
   ASSERT_EQ(test_value, 2);
 }
 
-TEST(DataTransform, Layout) {
+TEST(DataTransform, DataLayout) {
   using namespace paddle::framework;
   using namespace paddle::platform;
 
@@ -127,7 +127,19 @@
   }
 
   Tensor dst = out.Get<Tensor>();
-  EXPECT_TRUE(dst.layout() != src->layout());
+
+  EXPECT_TRUE(dst.layout() == DataLayout::kNCHW);
+  EXPECT_TRUE(dst.dims() == make_ddim({2, 2, 3, 1}));
+
+  {
+    auto kernel1 = GenFromBit({1, 0, 1, 0});
+    auto kernel2 = GenFromBit({1, 0, 0, 0});
+    auto pair0 = std::make_pair(kernel1, kernel2);
+    instance.Get(pair0)(ctx, pair0, out, &in);
+  }
+
+  EXPECT_TRUE(src->layout() == DataLayout::kNHWC);
+  EXPECT_TRUE(src->dims() == make_ddim({2, 3, 1, 2}));
 }
 
 TEST(DataTransform, DataType) {
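The second half of the test round-trips the tensor: under the dst_dim[i] = src_dim[axis[i]] rule, NCHW2NHWC = {0, 2, 3, 1} is the inverse permutation of NHWC2NCHW = {0, 3, 1, 2}, so applying both restores the original dims ({2, 3, 1, 2} in the test). A small sketch checking that, using the same std::bind partial application the registration above uses to pre-bind the axis argument; Permute is an invented stand-in, not the registry function.

    #include <cassert>
    #include <functional>
    #include <vector>

    using Dims = std::vector<int>;

    // Same index rule as TransDataLayout: out[i] = in[axis[i]].
    Dims Permute(const Dims& axis, const Dims& in) {
      Dims out(axis.size());
      for (size_t i = 0; i < axis.size(); ++i) out[i] = in[axis[i]];
      return out;
    }

    int main() {
      const Dims NHWC2NCHW = {0, 3, 1, 2};
      const Dims NCHW2NHWC = {0, 2, 3, 1};

      // Pre-bind the axis argument, mirroring how the registration adapts
      // the five-argument TransDataLayout to a four-argument slot.
      std::function<Dims(const Dims&)> to_nchw =
          std::bind(Permute, NHWC2NCHW, std::placeholders::_1);
      std::function<Dims(const Dims&)> to_nhwc =
          std::bind(Permute, NCHW2NHWC, std::placeholders::_1);

      Dims nhwc = {2, 3, 1, 2};   // the test's NHWC shape
      Dims nchw = to_nchw(nhwc);  // becomes {2, 2, 3, 1}
      assert(nchw == (Dims{2, 2, 3, 1}));
      assert(to_nhwc(nchw) == nhwc);  // round trip is the identity
      return 0;
    }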